Overview
Submit package home:Aloysius:branches:Essentials / x265 to package Essentials / x265
x265.changes
Changed
x
1
2
-------------------------------------------------------------------
3
+Tue Oct 9 20:03:53 UTC 2018 - aloisio@gmx.com
4
+
5
+- Update to version 2.9
6
+ New features:
7
+ * Support for chunked encoding
8
+ + :option:`--chunk-start and --chunk-end`
9
+ + Frames preceding first frame of chunk in display order
10
+ will be encoded, however, they will be discarded in the
11
+ bitstream.
12
+ + Frames following last frame of the chunk in display order
13
+ will be used in taking lookahead decisions, but, they will
14
+ not be encoded.
15
+ + This feature can be enabled only in closed GOP structures.
16
+ Default disabled.
17
+ * Support for HDR10+ version 1 SEI messages.
18
+ Encoder enhancements:
19
+ * Create API function for allocating and freeing
20
+ x265_analysis_data.
21
+ * CEA 608/708 support: Read SEI messages from text file and
22
+ encode it using userSEI message.
23
+ Bug fixes:
24
+ * Disable noise reduction when vbv is enabled.
25
+ * Support minLuma and maxLuma values changed by the
26
+ commandline.
27
+ version 2.8
28
+ New features:
29
+ * :option:`--asm avx512` used to enable AVX-512 in x265.
30
+ Default disabled.
31
+ + For 4K main10 high-quality encoding, we are seeing good
32
+ gains; for other resolutions and presets, we don't
33
+ recommend using this setting for now.
34
+ * :option:`--dynamic-refine` dynamically switches between
35
+ different inter refine levels. Default disabled.
36
+ + It is recommended to use :option:`--refine-intra 4` with
37
+ dynamic refinement for a better trade-off between encode
38
+ efficiency and performance than using static refinement.
39
+ * :option:`--single-sei`
40
+ + Encode SEI messages in a single NAL unit instead of
41
+ multiple NAL units. Default disabled.
42
+ * :option:`--max-ausize-factor` controls the maximum AU size
43
+ defined in HEVC specification.
44
+ + It represents the percentage of maximum AU size used.
45
+ Default is 1.
46
+ * VMAF (Video Multi-Method Assessment Fusion)
47
+ + Added VMAF support for objective quality measurement of a
48
+ video sequence.
49
+ + Enable cmake option ENABLE_LIBVMAF to report per frame and
50
+ aggregate VMAF score. The frame level VMAF score does not
51
+ include temporal scores.
52
+ + This is supported only on linux for now.
53
+ Encoder enhancements:
54
+ * Introduced refine-intra level 4 to improve quality.
55
+ * Support for HLG-graded content and pic_struct in SEI message.
56
+ Bug Fixes:
57
+ * Fix 32 bit build error (using CMAKE GUI) in Linux.
58
+ * Fix 32 bit build error for asm primitives.
59
+ * Fix build error on mac OS.
60
+ * Fix VBV Lookahead in analysis load to achieve target bitrate.
61
+
62
+- Added x265-fix_enable512.patch
63
+
64
+-------------------------------------------------------------------
65
Fri May 4 22:21:57 UTC 2018 - zaitor@opensuse.org
66
67
- Build with nasm >= 2.13 for openSUSE Leap 42.3 and SLE-12, since
68
x265.spec
Changed
83
1
2
# based on the spec file from https://build.opensuse.org/package/view_file/home:Simmphonie/libx265/
3
4
Name: x265
5
-%define soname 151
6
+%define soname 165
7
%define libname lib%{name}
8
%define libsoname %{libname}-%{soname}
9
-Version: 2.7
10
+Version: 2.9
11
Release: 0
12
License: GPL-2.0+
13
Summary: A free h265/HEVC encoder - encoder binary
14
15
Source0: https://bitbucket.org/multicoreware/x265/downloads/%{name}_%{version}.tar.gz
16
Patch0: arm.patch
17
Patch1: x265.pkgconfig.patch
18
+Patch2: x265-fix_enable512.patch
19
BuildRequires: gcc
20
BuildRequires: gcc-c++
21
BuildRequires: cmake >= 2.8.8
22
BuildRequires: pkg-config
23
BuildRequires: nasm >= 2.13
24
-%if 0%{?suse_version} > 1310
25
%ifarch x86_64
26
BuildRequires: libnuma-devel >= 2.0.9
27
%endif
28
-%endif
29
-BuildRoot: %{_tmppath}/%{name}-%{version}-build
30
31
%description
32
x265 is a free library for encoding next-generation H265/HEVC video
33
34
35
%description -n %{libname}-devel
36
x265 is a free library for encoding next-generation H265/HEVC video
37
-streams.
38
+streams.
39
40
%prep
41
%setup -q -n %{name}_%{version}
42
%patch0 -p1
43
%patch1 -p1
44
+%patch2 -p1
45
46
sed -i -e "s/0.0/%{soname}.0/g" source/cmake/version.cmake
47
48
49
%build
50
-%if 0%{?suse_version} < 1330
51
+%if 0%{?suse_version} < 1500
52
cd source
53
%else
54
%define __builddir ./source/build
55
56
make %{?_smp_mflags}
57
58
%install
59
-%if 0%{?suse_version} < 1330
60
+%if 0%{?suse_version} < 1500
61
cd source
62
%endif
63
%cmake_install
64
65
%postun -n %{libsoname} -p /sbin/ldconfig
66
67
%files -n %{libsoname}
68
-%defattr(0644,root,root)
69
%{_libdir}/%{libname}.so.%{soname}*
70
71
-%files
72
-%defattr(0755,root,root)
73
+%files
74
%{_bindir}/%{name}
75
76
%files -n %{libname}-devel
77
-%defattr(0644,root,root)
78
+%license COPYING
79
+%doc readme.rst
80
%{_includedir}/%{name}.h
81
%{_includedir}/%{name}_config.h
82
%{_libdir}/pkgconfig/%{name}.pc
83
x265-fix_enable512.patch
Added
27
1
2
+--- a/source/common/cpu.cpp
3
++++ b/source/common/cpu.cpp
4
+@@ -110,6 +110,11 @@ const cpu_name_t cpu_names[] =
5
+ { "", 0 },
6
+ };
7
+
8
++bool detect512()
9
++{
10
++ return(enable512);
11
++}
12
++
13
+ #if X265_ARCH_X86
14
+
15
+ extern "C" {
16
+@@ -123,10 +128,6 @@ uint64_t PFX(cpu_xgetbv)(int xcr);
17
+ #pragma warning(disable: 4309) // truncation of constant value
18
+ #endif
19
+
20
+-bool detect512()
21
+-{
22
+- return(enable512);
23
+-}
24
+ uint32_t cpu_detect(bool benableavx512 )
25
+ {
26
+
27
x265_2.7.tar.gz/.hg_archival.txt -> x265_2.9.tar.gz/.hg_archival.txt
Changed
8
1
2
repo: 09fe40627f03a0f9c3e6ac78b22ac93da23f9fdf
3
-node: e41a9bf2bac4a7af2bec2bbadf91e63752d320ef
4
+node: f9681d731f2e56c2ca185cec10daece5939bee07
5
branch: stable
6
-tag: 2.7
7
+tag: 2.9
8
x265_2.7.tar.gz/.hgtags -> x265_2.9.tar.gz/.hgtags
Changed
7
1
2
e7a4dd48293b7956d4a20df257d23904cc78e376 2.4
3
64b2d0bf45a52511e57a6b7299160b961ca3d51c 2.5
4
0e9ea76945c89962cd46cee6537586e2054b2935 2.6
5
+e41a9bf2bac4a7af2bec2bbadf91e63752d320ef 2.7
6
+a158a3a029663133455268e2a63ae6b0af2df720 2.8
7
x265_2.7.tar.gz/doc/reST/api.rst -> x265_2.9.tar.gz/doc/reST/api.rst
Changed
51
1
2
* returns negative on error, 0 access unit were output.*/
3
int x265_set_analysis_data(x265_encoder *encoder, x265_analysis_data *analysis_data, int poc, uint32_t cuBytes);
4
5
+**x265_alloc_analysis_data()** may be used to allocate memory for the x265_analysis_data::
6
+
7
+ /* x265_alloc_analysis_data:
8
+ * Allocate memory for the x265_analysis_data object's internal structures. */
9
+ void x265_alloc_analysis_data(x265_param *param, x265_analysis_data* analysis);
10
+
11
+**x265_free_analysis_data()** may be used to free memory for the x265_analysis_data::
12
+
13
+ /* x265_free_analysis_data:
14
+ * Free the allocated memory for x265_analysis_data object's internal structures. */
15
+ void x265_free_analysis_data(x265_param *param, x265_analysis_data* analysis);
16
+
17
Pictures
18
========
19
20
21
* release library static allocations, reset configured CTU size */
22
void x265_cleanup(void);
23
24
+VMAF (Video Multi-Method Assessment Fusion)
25
+===========================================
26
+
27
+If you set the ENABLE_LIBVMAF cmake option to ON, then x265 will report per frame
28
+and aggregate VMAF score for the given input and dump the scores in csv file.
29
+The user also needs to specify the :option:`--recon` in command line to get the VMAF scores.
30
+
31
+ /* x265_calculate_vmafScore:
32
+ * returns VMAF score for the input video.
33
+ * This api must be called only after encoding was done. */
34
+ double x265_calculate_vmafscore(x265_param*, x265_vmaf_data*);
35
+
36
+ /* x265_calculate_vmaf_framelevelscore:
37
+ * returns VMAF score for each frame in a given input video. The frame level VMAF score does not include temporal scores. */
38
+ double x265_calculate_vmaf_framelevelscore(x265_vmaf_framedata*);
39
+
40
+.. Note::
41
42
+ When setting ENABLE_LIBVMAF cmake option to ON, it is recommended to
43
+ also set ENABLE_SHARED to OFF to prevent build problems.
44
+ We only need the static library from these builds.
45
+
46
+ Binaries built with Windows will not have VMAF support.
47
+
48
Multi-library Interface
49
=======================
50
51
x265_2.7.tar.gz/doc/reST/cli.rst -> x265_2.9.tar.gz/doc/reST/cli.rst
Changed
625
1
2
2. unable to open encoder
3
3. unable to generate stream headers
4
4. encoder abort
5
-
6
+
7
Logging/Statistic Options
8
=========================
9
10
11
**BufferFill** Bits available for the next frame. Includes bits carried
12
over from the current frame.
13
14
+ **BufferFillFinal** Buffer bits available after removing the frame out of CPB.
15
+
16
**Latency** Latency in terms of number of frames between when the frame
17
was given in and when the frame is given out.
18
19
20
21
.. option:: --csv-log-level <integer>
22
23
- Controls the level of detail (and size) of --csv log files
24
-
25
- 0. summary **(default)**
26
- 1. frame level logging
27
- 2. frame level logging with performance statistics
28
+ Controls the level of detail (and size) of --csv log files
29
+
30
+ 0. summary **(default)**
31
+ 1. frame level logging
32
+ 2. frame level logging with performance statistics
33
34
.. option:: --ssim, --no-ssim
35
36
37
"*" - same as default
38
"none" - no thread pools are created, only frame parallelism possible
39
"-" - same as "none"
40
- "10" - allocate one pool, using up to 10 cores on node 0
41
+ "10" - allocate one pool, using up to 10 cores on all available nodes
42
"-,+" - allocate one pool, using all cores on node 1
43
"+,-,+" - allocate one pool, using only cores on nodes 0 and 2
44
"+,-,+,-" - allocate one pool, using only cores on nodes 0 and 2
45
46
47
**CLI ONLY**
48
49
+.. option:: --chunk-start <integer>
50
+
51
+ First frame of the chunk. Frames preceding this in display order will
52
+ be encoded, however, they will be discarded in the bitstream. This
53
+ feature can be enabled only in closed GOP structures.
54
+ Default 0 (disabled).
55
+
56
+.. option:: --chunk-end <integer>
57
+
58
+ Last frame of the chunk. Frames following this in display order will be
59
+ used in taking lookahead decisions, but, they will not be encoded.
60
+ This feature can be enabled only in closed GOP structures.
61
+ Default 0 (disabled).
62
+
63
Profile, Level, Tier
64
====================
65
66
67
encoding options, the encoder will attempt to modify/set the right
68
encode specifications. If the encoder is unable to do so, this option
69
will be turned OFF. Highly experimental.
70
-
71
+
72
Default: disabled
73
-
74
+
75
.. note::
76
77
:option:`--profile`, :option:`--level-idc`, and
78
79
Default 3.
80
81
.. option:: --limit-modes, --no-limit-modes
82
-
83
+
84
When enabled, limit-modes will limit modes analyzed for each CU using cost
85
metrics from the 4 sub-CUs. When multiple inter modes like :option:`--rect`
86
and/or :option:`--amp` are enabled, this feature will use motion cost
87
88
89
Default: enabled, disabled for :option:`--tune grain`
90
91
+.. option:: --splitrd-skip, --no-splitrd-skip
92
+
93
+ Enable skipping split RD analysis when sum of split CU rdCost larger than one
94
+ split CU rdCost for Intra CU. Default disabled.
95
+
96
.. option:: --fast-intra, --no-fast-intra
97
98
Perform an initial scan of every fifth intra angular mode, then
99
100
101
Note that --analysis-reuse-level must be paired with analysis-reuse-mode.
102
103
- +--------------+------------------------------------------+
104
- | Level | Description |
105
- +==============+==========================================+
106
- | 1 | Lookahead information |
107
- +--------------+------------------------------------------+
108
- | 2 to 4 | Level 1 + intra/inter modes, ref's |
109
- +--------------+------------------------------------------+
110
- | 5,6 and 9 | Level 2 + rect-amp |
111
- +--------------+------------------------------------------+
112
- | 7 | Level 5 + AVC size CU refinement |
113
- +--------------+------------------------------------------+
114
- | 8 | Level 5 + AVC size Full CU analysis-info |
115
- +--------------+------------------------------------------+
116
- | 10 | Level 5 + Full CU analysis-info |
117
- +--------------+------------------------------------------+
118
+ +--------------+------------------------------------------+
119
+ | Level | Description |
120
+ +==============+==========================================+
121
+ | 1 | Lookahead information |
122
+ +--------------+------------------------------------------+
123
+ | 2 to 4 | Level 1 + intra/inter modes, ref's |
124
+ +--------------+------------------------------------------+
125
+ | 5 and 6 | Level 2 + rect-amp |
126
+ +--------------+------------------------------------------+
127
+ | 7 | Level 5 + AVC size CU refinement |
128
+ +--------------+------------------------------------------+
129
+ | 8 and 9 | Level 5 + AVC size Full CU analysis-info |
130
+ +--------------+------------------------------------------+
131
+ | 10 | Level 5 + Full CU analysis-info |
132
+ +--------------+------------------------------------------+
133
134
.. option:: --refine-mv-type <string>
135
136
- Reuse MV information received through API call. Currently receives information for AVC size and the accepted
137
- string input is "avc". Default is disabled.
138
+ Reuse MV information received through API call. Currently receives information for AVC size and the accepted
139
+ string input is "avc". Default is disabled.
140
141
.. option:: --scale-factor
142
143
- Factor by which input video is scaled down for analysis save mode.
144
- This option should be coupled with analysis-reuse-mode option, --analysis-reuse-level 10.
145
- The ctu size of load should be double the size of save. Default 0.
146
+ Factor by which input video is scaled down for analysis save mode.
147
+ This option should be coupled with analysis-reuse-mode option,
148
+ --analysis-reuse-level 10. The ctu size of load can either be the
149
+ same as that of save or double the size of save. Default 0.
150
+
151
+.. option:: --refine-intra <0..4>
152
153
-.. option:: --refine-intra <0..3>
154
-
155
Enables refinement of intra blocks in current encode.
156
157
Level 0 - Forces both mode and depth from the save encode.
158
159
160
Level 3 - Perform analysis of intra modes for depth reused from first encode.
161
162
- Default 0.
163
+ Level 4 - Does not reuse any analysis information - redo analysis for the intra block.
164
165
+ Default 0.
166
+
167
.. option:: --refine-inter <0..3>
168
169
Enables refinement of inter blocks in current encode.
170
171
172
Default 0.
173
174
+.. option:: --dynamic-refine, --no-dynamic-refine
175
+
176
+ Dynamically switches :option:`--refine-inter` levels 0-3 based on the content and
177
+ the encoder settings. It is recommended to use :option:`--refine-intra` 4 with dynamic
178
+ refinement. Default disabled.
179
+
180
.. option:: --refine-mv
181
182
Enables refinement of motion vector for scaled video. Evaluates the best
183
motion vector by searching the surrounding eight integer and subpel pixel
184
- positions.
185
+ positions.
186
187
Options which affect the transform unit quad-tree, sometimes referred to
188
as the residual quad-tree (RQT).
189
190
quad-tree begins at the same depth of the coded tree unit, but if the
191
maximum TU size is smaller than the CU size then transform QT begins
192
at the depth of the max-tu-size. Default: 32.
193
-
194
+
195
.. option:: --dynamic-rd <0..4>
196
-
197
+
198
Increases the RD level at points where quality drops due to VBV rate
199
control enforcement. The number of CUs for which the RD is reconfigured
200
is determined based on the strength. Strength 1 gives the best FPS,
201
202
203
.. option:: --ssim-rd, --no-ssim-rd
204
205
- Enable/Disable SSIM RDO. SSIM is a better perceptual quality assessment
206
- method as compared to MSE. SSIM based RDO calculation is based on residual
207
- divisive normalization scheme. This normalization is consistent with the
208
- luminance and contrast masking effect of Human Visual System. It is used
209
- for mode selection during analysis of CTUs and can achieve significant
210
- gain in terms of objective quality metrics SSIM and PSNR. It only has effect
211
- on presets which use RDO-based mode decisions (:option:`--rd` 3 and above).
212
+ Enable/Disable SSIM RDO. SSIM is a better perceptual quality assessment
213
+ method as compared to MSE. SSIM based RDO calculation is based on residual
214
+ divisive normalization scheme. This normalization is consistent with the
215
+ luminance and contrast masking effect of Human Visual System. It is used
216
+ for mode selection during analysis of CTUs and can achieve significant
217
+ gain in terms of objective quality metrics SSIM and PSNR. It only has effect
218
+ on presets which use RDO-based mode decisions (:option:`--rd` 3 and above).
219
220
Temporal / motion search options
221
================================
222
223
224
.. option:: --analyze-src-pics, --no-analyze-src-pics
225
226
- Enalbe motion estimation with source frame pixels, in this mode,
227
- motion estimation can be computed independently. Default disabled.
228
+ Enable motion estimation with source frame pixels, in this mode,
229
+ motion estimation can be computed independently. Default disabled.
230
231
Spatial/intra options
232
=====================
233
234
235
.. option:: --ctu-info <0, 1, 2, 4, 6>
236
237
- This value enables receiving CTU information asynchronously and determine reaction to the CTU information. Default 0.
238
- 1: force the partitions if CTU information is present.
239
- 2: functionality of (1) and reduce qp if CTU information has changed.
240
- 4: functionality of (1) and force Inter modes when CTU Information has changed, merge/skip otherwise.
241
- This option should be enabled only when planning to invoke the API function x265_encoder_ctu_info to copy ctu-info asynchronously.
242
- If enabled without calling the API function, the encoder will wait indefinitely.
243
+ This value enables receiving CTU information asynchronously and determine reaction to the CTU information. Default 0.
244
+ 1: force the partitions if CTU information is present.
245
+ 2: functionality of (1) and reduce qp if CTU information has changed.
246
+ 4: functionality of (1) and force Inter modes when CTU Information has changed, merge/skip otherwise.
247
+ This option should be enabled only when planning to invoke the API function x265_encoder_ctu_info to copy ctu-info asynchronously.
248
+ If enabled without calling the API function, the encoder will wait indefinitely.
249
250
.. option:: --intra-refresh
251
252
253
Default 20
254
255
**Range of values:** Between the maximum consecutive bframe count (:option:`--bframes`) and 250
256
+
257
.. option:: --gop-lookahead <integer>
258
259
- Number of frames for GOP boundary decision lookahead. If a scenecut frame is found
260
- within this from the gop boundary set by `--keyint`, the GOP will be extented until such a point,
261
- otherwise the GOP will be terminated as set by `--keyint`. Default 0.
262
+ Number of frames for GOP boundary decision lookahead. If a scenecut frame is found
263
+ within this from the gop boundary set by `--keyint`, the GOP will be extended until such a point,
264
+ otherwise the GOP will be terminated as set by `--keyint`. Default 0.
265
266
- **Range of values:** Between 0 and (`--rc-lookahead` - mini-GOP length)
267
+ **Range of values:** Between 0 and (`--rc-lookahead` - mini-GOP length)
268
269
- It is recommended to have `--gop-lookahaed` less than `--min-keyint` as scenecuts beyond
270
- `--min-keyint` are already being coded as keyframes.
271
+ It is recommended to have `--gop-lookahead` less than `--min-keyint` as scenecuts beyond
272
+ `--min-keyint` are already being coded as keyframes.
273
274
.. option:: --lookahead-slices <0..16>
275
276
277
on systems with many threads.
278
279
The encoder may internally lower the number of slices or disable
280
- slicing to ensure each slice codes at least 10 16x16 rows of lowres
281
- blocks to minimize the impact on quality. For example, for 720p and
282
- 1080p videos, the number of slices is capped to 4 and 6, respectively.
283
- For resolutions lesser than 720p, slicing is auto-disabled.
284
-
285
- If slices are used in lookahead, they are logged in the list of tools
286
- as *lslices*
287
+ slicing to ensure each slice codes at least 10 16x16 rows of lowres
288
+ blocks to minimize the impact on quality. For example, for 720p and
289
+ 1080p videos, the number of slices is capped to 4 and 6, respectively.
290
+ For resolutions lesser than 720p, slicing is auto-disabled.
291
+
292
+ If slices are used in lookahead, they are logged in the list of tools
293
+ as *lslices*
294
295
**Values:** 0 - disabled. 1 is the same as 0. Max 16.
296
- Default: 8 for ultrafast, superfast, faster, fast, medium
297
- 4 for slow, slower
298
- disabled for veryslow, slower
299
-
300
+ Default: 8 for ultrafast, superfast, faster, fast, medium
301
+ 4 for slow, slower
302
+ disabled for veryslow, slower
303
+
304
.. option:: --lookahead-threads <integer>
305
306
- Use multiple worker threads dedicated to doing only lookahead instead of sharing
307
- the worker threads with frame Encoders. A dedicated lookahead threadpool is created with the
308
- specified number of worker threads. This can range from 0 upto half the
309
- hardware threads available for encoding. Using too many threads for lookahead can starve
310
- resources for frame Encoder and can harm performance. Default is 0 - disabled, Lookahead
311
+ Use multiple worker threads dedicated to doing only lookahead instead of sharing
312
+ the worker threads with frame Encoders. A dedicated lookahead threadpool is created with the
313
+ specified number of worker threads. This can range from 0 upto half the
314
+ hardware threads available for encoding. Using too many threads for lookahead can starve
315
+ resources for frame Encoder and can harm performance. Default is 0 - disabled, Lookahead
316
shares worker threads with other FrameEncoders .
317
318
**Values:** 0 - disabled(default). Max - Half of available hardware threads.
319
-
320
+
321
.. option:: --b-adapt <integer>
322
323
Set the level of effort in determining B frame placement.
324
325
.. option:: --b-pyramid, --no-b-pyramid
326
327
Use B-frames as references, when possible. Default enabled
328
-
329
+
330
.. option:: --force-flush <integer>
331
332
Force the encoder to flush frames. Default is 0.
333
-
334
+
335
Values:
336
0 - flush the encoder only when all the input pictures are over.
337
1 - flush all the frames even when the input is not over.
338
339
any given frame (ensuring a max QP). This is dangerous when CRF is
340
used in combination with VBV as it may result in buffer underruns.
341
Default disabled
342
-
343
+
344
.. option:: --crf-min <0..51.0>
345
346
Specify an lower limit to the rate factor which may be assigned to
347
348
Default 0.9
349
350
**Range of values:** fractional: 0 - 1.0, or kbits: 2 .. bufsize
351
-
352
+
353
.. option:: --vbv-end <float>
354
355
Final buffer emptiness. The portion of the decode buffer that must be
356
357
can specify the starting and ending state of the VBV buffer so that VBV
358
compliance can be maintained when chunks are independently encoded and
359
stitched together.
360
-
361
+
362
.. option:: --vbv-end-fr-adj <float>
363
364
Frame from which qp has to be adjusted to achieve final decode buffer
365
366
367
.. option:: --multi-pass-opt-analysis, --no-multi-pass-opt-analysis
368
369
- Enable/Disable multipass analysis refinement along with multipass ratecontrol. Based on
370
- the information stored in pass 1, in subsequent passes analysis data is refined
371
- and also redundant steps are skipped.
372
- In pass 1 analysis information like motion vector, depth, reference and prediction
373
- modes of the final best CTU partition is stored for each CTU.
374
- Multipass analysis refinement cannot be enabled when 'analysis-save/analysis-load' option
375
- is enabled and both will be disabled when enabled together. This feature requires 'pmode/pme'
376
- to be disabled and hence pmode/pme will be disabled when enabled at the same time.
377
+ Enable/Disable multipass analysis refinement along with multipass ratecontrol. Based on
378
+ the information stored in pass 1, in subsequent passes analysis data is refined
379
+ and also redundant steps are skipped.
380
+ In pass 1 analysis information like motion vector, depth, reference and prediction
381
+ modes of the final best CTU partition is stored for each CTU.
382
+ Multipass analysis refinement cannot be enabled when 'analysis-save/analysis-load' option
383
+ is enabled and both will be disabled when enabled together. This feature requires 'pmode/pme'
384
+ to be disabled and hence pmode/pme will be disabled when enabled at the same time.
385
386
- Default: disabled.
387
+ Default: disabled.
388
389
.. option:: --multi-pass-opt-distortion, --no-multi-pass-opt-distortion
390
391
- Enable/Disable multipass refinement of qp based on distortion data along with multipass
392
- ratecontrol. In pass 1 distortion of best CTU partition is stored. CTUs with high
393
- distortion get lower(negative)qp offsets and vice-versa for low distortion CTUs in pass 2.
394
- This helps to improve the subjective quality.
395
- Multipass refinement of qp cannot be enabled when 'analysis-save/analysis-load' option
396
- is enabled and both will be disabled when enabled together. 'multi-pass-opt-distortion'
397
- requires 'pmode/pme' to be disabled and hence pmode/pme will be disabled when enabled along with it.
398
+ Enable/Disable multipass refinement of qp based on distortion data along with multipass
399
+ ratecontrol. In pass 1 distortion of best CTU partition is stored. CTUs with high
400
+ distortion get lower(negative)qp offsets and vice-versa for low distortion CTUs in pass 2.
401
+ This helps to improve the subjective quality.
402
+ Multipass refinement of qp cannot be enabled when 'analysis-save/analysis-load' option
403
+ is enabled and both will be disabled when enabled together. 'multi-pass-opt-distortion'
404
+ requires 'pmode/pme' to be disabled and hence pmode/pme will be disabled when enabled along with it.
405
406
- Default: disabled.
407
+ Default: disabled.
408
409
.. option:: --strict-cbr, --no-strict-cbr
410
-
411
+
412
Enables stricter conditions to control bitrate deviance from the
413
target bitrate in ABR mode. Bit rate adherence is prioritised
414
over quality. Rate tolerance is reduced to 50%. Default disabled.
415
416
encoded frames to control QP. strict-cbr allows the encoder to be
417
more aggressive in hitting the target bitrate even for short segment
418
videos.
419
-
420
+
421
.. option:: --cbqpoffs <integer>
422
423
Offset of Cb chroma QP from the luma QP selected by rate control.
424
425
426
qComp sets the quantizer curve compression factor. It weights the
427
frame quantizer based on the complexity of residual (measured by
428
- lookahead). Default value is 0.6. Increasing it to 1 will
429
- effectively generate CQP
430
+ lookahead). Its value must be between 0.5 and 1.0. Default value is
431
+ 0.6. Increasing it to 1.0 will effectively generate CQP.
432
433
.. option:: --qpstep <integer>
434
435
- The maximum single adjustment in QP allowed to rate control. Default
436
- 4
437
-
438
+ The maximum single adjustment in QP allowed to rate control. Default 4
439
+
440
.. option:: --qpmin <integer>
441
442
sets a hard lower limit on QP allowed to ratecontrol. Default 0
443
444
.. option:: --qpmax <integer>
445
446
sets a hard upper limit on QP allowed to ratecontrol. Default 69
447
-
448
+
449
.. option:: --rc-grain, --no-rc-grain
450
451
- Enables a specialised ratecontrol algorithm for film grain content. This
452
- parameter strictly minimises QP fluctuations within and across frames
453
- and removes pulsing of grain. Default disabled.
454
- Enabled when :option:'--tune' grain is applied. It is highly recommended
455
- that this option is used through the tune grain feature where a combination
456
- of param options are used to improve visual quality.
457
-
458
+ Enables a specialised ratecontrol algorithm for film grain content. This
459
+ parameter strictly minimises QP fluctuations within and across frames
460
+ and removes pulsing of grain. Default disabled.
461
+ Enabled when :option:'--tune' grain is applied. It is highly recommended
462
+ that this option is used through the tune grain feature where a combination
463
+ of param options are used to improve visual quality.
464
+
465
.. option:: --const-vbv, --no-const-vbv
466
467
- Enables VBV algorithm to be consistent across runs. Default disabled.
468
- Enabled when :option:'--tune' grain is applied.
469
-
470
+ Enables VBV algorithm to be consistent across runs. Default disabled.
471
+ Enabled when :option:'--tune' grain is applied.
472
+
473
.. option:: --qblur <float>
474
475
Temporally blur quants. Default 0.5
476
477
HEVC specifies a default set of scaling lists which may be enabled
478
without requiring them to be signaled in the SPS. Those scaling
479
lists can be enabled via :option:`--scaling-list` *default*.
480
-
481
+
482
All other strings indicate a filename containing custom scaling
483
lists in the HM format. The encode will abort if the file is not
484
- parsed correctly. Custom lists must be signaled in the SPS
485
+ parsed correctly. Custom lists must be signaled in the SPS. A sample
486
+ scaling list file is available in `the downloads page <https://bitbucket.org/multicoreware/x265/downloads/reference_scalinglist.txt>`_
487
488
.. option:: --lambda-file <filename>
489
490
Specify a text file containing values for x265_lambda_tab and
491
x265_lambda2_tab. Each table requires MAX_MAX_QP+1 (70) float
492
values.
493
-
494
+
495
The text file syntax is simple. Comma is considered to be
496
white-space. All white-space is ignored. Lines must be less than 2k
497
bytes in length. Content following hash (#) characters are ignored.
498
499
vectors and splits) and less on residual. This feature is intended
500
for experimentation.
501
502
+.. option:: --max-ausize-factor <float>
503
+
504
+ It controls the maximum AU size defined in specification. It represents
505
+ the percentage of maximum AU size used. Default is 1. Range is 0.5 to 1.
506
+
507
Loop filters
508
============
509
510
511
7. smpte240m
512
8. film
513
9. bt2020
514
- 10. smpte428
515
- 11. smpte431
516
- 12. smpte432
517
+ 10. smpte428
518
+ 11. smpte431
519
+ 12. smpte432
520
521
.. option:: --transfer <integer|string>
522
523
524
8. YCgCo
525
9. bt2020nc
526
10. bt2020c
527
- 11. smpte2085
528
- 12. chroma-derived-nc
529
- 13. chroma-derived-c
530
- 14. ictcp
531
+ 11. smpte2085
532
+ 12. chroma-derived-nc
533
+ 13. chroma-derived-c
534
+ 14. ictcp
535
536
.. option:: --chromaloc <0..5>
537
538
539
automatically when :option:`--master-display` or :option:`--max-cll` is
540
specified. Useful when there is a desire to signal 0 values for max-cll
541
and max-fall. Default disabled.
542
-
543
+
544
.. option:: --hdr-opt, --no-hdr-opt
545
546
Add luma and chroma offsets for HDR/WCG content.
547
Input video should be 10 bit 4:2:0. Applicable for HDR content. It is recommended
548
that AQ-mode be enabled along with this feature. Default disabled.
549
-
550
+
551
.. option:: --dhdr10-info <filename>
552
553
Inserts tone mapping information as an SEI message. It takes as input,
554
555
Maximum luma value allowed for input pictures. Any values above max-luma
556
are clipped. No default.
557
558
+.. option:: --nalu-file <filename>
559
+
560
+ Text file containing userSEI in POC order : <POC><space><PREFIX><space><NAL UNIT TYPE>/<SEI TYPE><space><SEI Payload>
561
+ Parse the input file specified and inserts SEI messages into the bitstream.
562
+ Currently, we support only PREFIX SEI messages. This is an "application-only" feature.
563
+
564
+.. option:: --atc-sei <integer>
565
+
566
+ Emit the alternative transfer characteristics SEI message where the integer
567
+ is the preferred transfer characteristics. Required for HLG (Hybrid Log Gamma)
568
+ signalling. Not signalled by default.
569
+
570
+.. option:: --pic-struct <integer>
571
+
572
+ Set the picture structure and emits it in the picture timing SEI message.
573
+ Values in the range 0..12. See D.3.3 of the HEVC spec. for a detailed explanation.
574
+ Required for HLG (Hybrid Log Gamma) signalling. Not signalled by default.
575
+
576
Bitstream options
577
=================
578
579
580
581
.. option:: --log2-max-poc-lsb <integer>
582
583
- Maximum of the picture order count. Default 8
584
+ Maximum of the picture order count. Default 8
585
586
.. option:: --vui-timing-info, --no-vui-timing-info
587
588
589
590
Only effective at RD levels 5 and 6
591
592
+.. option:: --idr-recovery-sei, --no-idr-recovery-sei
593
+ Emit RecoveryPoint info as sei in bitstream for each IDR frame. Default disabled.
594
+
595
+.. option:: --single-sei, --no-single-sei
596
+ Emit SEI messages in a single NAL unit instead of multiple NALs. Default disabled.
597
+ When HRD SEI is enabled the HM decoder will throw a warning.
598
+
599
DCT Approximations
600
==================
601
602
.. option:: --lowpass-dct
603
604
- If enabled, x265 will use low-pass subband dct approximation instead of the
605
- standard dct for 16x16 and 32x32 blocks. This approximation is less computational
606
- intensive but it generates truncated coefficient matrixes for the transformed block.
607
- Empirical analysis shows marginal loss in compression and performance gains up to 10%,
608
- paticularly at moderate bit-rates.
609
+ If enabled, x265 will use low-pass subband dct approximation instead of the
610
+ standard dct for 16x16 and 32x32 blocks. This approximation is less computationally
611
+ intensive but it generates truncated coefficient matrices for the transformed block.
612
+ Empirical analysis shows marginal loss in compression and performance gains up to 10%,
613
+ particularly at moderate bit-rates.
614
615
- This approximation should be considered for platforms with performance and time
616
- constrains.
617
+ This approximation should be considered for platforms with performance and time
618
+ constraints.
619
620
- Default disabled. **Experimental feature**
621
+ Default disabled. **Experimental feature**
622
623
Debugging options
624
=================
625
x265_2.7.tar.gz/doc/reST/presets.rst -> x265_2.9.tar.gz/doc/reST/presets.rst
Changed
13
1
2
that strictly minimises QP fluctuations across frames, while still allowing
3
the encoder to hit bitrate targets and VBV buffer limits (with a slightly
4
higher margin of error than normal). It is highly recommended that this
5
-algorithm is used only through the :option:`--tune` *grain* feature.
6
+algorithm is used only through the :option:`--tune` *grain* feature.
7
+Overriding the `--tune` *grain* settings might result in grain strobing, especially
8
+when enabling features like :option:`--aq-mode` and :option:`--cutree` that modify
9
+per-block QPs within a given frame.
10
11
Fast Decode
12
~~~~~~~~~~~
13
x265_2.7.tar.gz/doc/reST/releasenotes.rst -> x265_2.9.tar.gz/doc/reST/releasenotes.rst
Changed
71
1
2
Release Notes
3
*************
4
5
+Version 2.9
6
+===========
7
+
8
+Release date - 05/10/2018
9
+
10
+New features
11
+-------------
12
+1. Support for chunked encoding
13
+
14
+ :option:`--chunk-start` and :option:`--chunk-end`
15
+ Frames preceding first frame of chunk in display order will be encoded, however, they will be discarded in the bitstream.
16
+ Frames following last frame of the chunk in display order will be used in taking lookahead decisions, but, they will not be encoded.
17
+ This feature can be enabled only in closed GOP structures. Default disabled.
18
+
19
+2. Support for HDR10+ version 1 SEI messages.
20
+
21
+Encoder enhancements
22
+--------------------
23
+1. Create API function for allocating and freeing x265_analysis_data.
24
+2. CEA 608/708 support: Read SEI messages from text file and encode it using userSEI message.
25
+
26
+Bug fixes
27
+---------
28
+1. Disable noise reduction when vbv is enabled.
29
+2. Support minLuma and maxLuma values changed by the commandline.
30
+
31
+Version 2.8
32
+===========
33
+
34
+Release date - 21/05/2018
35
+
36
+New features
37
+-------------
38
+1. :option:`--asm avx512` used to enable AVX-512 in x265. Default disabled.
39
+ For 4K main10 high-quality encoding, we are seeing good gains; for other resolutions and presets, we don't recommend using this setting for now.
40
+
41
+2. :option:`--dynamic-refine` dynamically switches between different inter refine levels. Default disabled.
42
+ It is recommended to use :option:`--refine-intra 4` with dynamic refinement for a better trade-off between encode efficiency and performance than using static refinement.
43
+
44
+3. :option:`--single-sei`
45
+ Encode SEI messages in a single NAL unit instead of multiple NAL units. Default disabled.
46
+
47
+4. :option:`--max-ausize-factor` controls the maximum AU size defined in HEVC specification.
48
+ It represents the percentage of maximum AU size used. Default is 1.
49
+
50
+5. VMAF (Video Multi-Method Assessment Fusion)
51
+ Added VMAF support for objective quality measurement of a video sequence.
52
+ Enable cmake option ENABLE_LIBVMAF to report per frame and aggregate VMAF score. The frame level VMAF score does not include temporal scores.
53
+ This is supported only on linux for now.
54
+
55
+Encoder enhancements
56
+--------------------
57
+1. Introduced refine-intra level 4 to improve quality.
58
+2. Support for HLG-graded content and pic_struct in SEI message.
59
+
60
+Bug Fixes
61
+---------
62
+1. Fix 32 bit build error (using CMAKE GUI) in Linux.
63
+2. Fix 32 bit build error for asm primitives.
64
+3. Fix build error on mac OS.
65
+4. Fix VBV Lookahead in analysis load to achieve target bitrate.
66
+
67
+
68
Version 2.7
69
===========
70
71
x265_2.7.tar.gz/source/CMakeLists.txt -> x265_2.9.tar.gz/source/CMakeLists.txt
Changed
57
1
2
option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
3
mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
4
# X265_BUILD must be incremented each time the public API is changed
5
-set(X265_BUILD 151)
6
+set(X265_BUILD 165)
7
configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
8
"${PROJECT_BINARY_DIR}/x265.def")
9
configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
10
11
if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
12
set(X86 1)
13
add_definitions(-DX265_ARCH_X86=1)
14
- if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
15
+ if(CMAKE_CXX_FLAGS STREQUAL "-m32")
16
+ message(STATUS "Detected x86 target processor")
17
+ elseif("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
18
set(X64 1)
19
add_definitions(-DX86_64=1)
20
message(STATUS "Detected x86_64 target processor")
21
- else()
22
- message(STATUS "Detected x86 target processor")
23
endif()
24
elseif(POWERMATCH GREATER "-1")
25
message(STATUS "Detected POWER target processor")
26
27
if(NO_ATOMICS)
28
add_definitions(-DNO_ATOMICS=1)
29
endif(NO_ATOMICS)
30
+ find_library(VMAF vmaf)
31
+ option(ENABLE_LIBVMAF "Enable VMAF" OFF)
32
+ if(ENABLE_LIBVMAF)
33
+ add_definitions(-DENABLE_LIBVMAF)
34
+ endif()
35
endif(UNIX)
36
37
if(X64 AND NOT WIN32)
38
39
if(EXTRA_LIB)
40
target_link_libraries(x265-static ${EXTRA_LIB})
41
endif()
42
+if(ENABLE_LIBVMAF)
43
+ target_link_libraries(x265-static ${VMAF})
44
+endif()
45
install(TARGETS x265-static
46
LIBRARY DESTINATION ${LIB_INSTALL_DIR}
47
ARCHIVE DESTINATION ${LIB_INSTALL_DIR})
48
49
ARCHIVE DESTINATION ${LIB_INSTALL_DIR})
50
endif()
51
install(FILES x265.h "${PROJECT_BINARY_DIR}/x265_config.h" DESTINATION include)
52
-if(WIN32)
53
+if((WIN32 AND ENABLE_CLI) OR (WIN32 AND ENABLE_SHARED))
54
if(MSVC_IDE)
55
install(FILES "${PROJECT_BINARY_DIR}/Debug/x265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS Debug)
56
install(FILES "${PROJECT_BINARY_DIR}/RelWithDebInfo/x265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS RelWithDebInfo)
57
x265_2.7.tar.gz/source/common/common.cpp -> x265_2.9.tar.gz/source/common/common.cpp
Changed
10
1
2
#endif
3
}
4
5
-#define X265_ALIGNBYTES 32
6
+#define X265_ALIGNBYTES 64
7
8
#if _WIN32
9
#if defined(__MINGW32__) && !defined(__MINGW64_VERSION_MAJOR)
10
x265_2.7.tar.gz/source/common/common.h -> x265_2.9.tar.gz/source/common/common.h
Changed
26
1
2
#define ALIGN_VAR_8(T, var) T var __attribute__((aligned(8)))
3
#define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16)))
4
#define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32)))
5
+#define ALIGN_VAR_64(T, var) T var __attribute__((aligned(64)))
6
#if defined(__MINGW32__)
7
#define fseeko fseeko64
8
#define ftello ftello64
9
10
#define ALIGN_VAR_8(T, var) __declspec(align(8)) T var
11
#define ALIGN_VAR_16(T, var) __declspec(align(16)) T var
12
#define ALIGN_VAR_32(T, var) __declspec(align(32)) T var
13
+#define ALIGN_VAR_64(T, var) __declspec(align(64)) T var
14
#define fseeko _fseeki64
15
#define ftello _ftelli64
16
#endif // if defined(__GNUC__)
17
18
#define START_CODE_OVERHEAD 3
19
#define FILLER_OVERHEAD (NAL_TYPE_OVERHEAD + START_CODE_OVERHEAD + 1)
20
21
+#define MAX_NUM_DYN_REFINE (NUM_CU_DEPTH * X265_REFINE_INTER_LEVELS)
22
+
23
namespace X265_NS {
24
25
enum { SAO_NUM_OFFSET = 4 };
26
x265_2.7.tar.gz/source/common/cpu.cpp -> x265_2.9.tar.gz/source/common/cpu.cpp
Changed
200
1
2
#endif // if X265_ARCH_ARM
3
4
namespace X265_NS {
5
+static bool enable512 = false;
6
const cpu_name_t cpu_names[] =
7
{
8
#if X265_ARCH_X86
9
-#define MMX2 X265_CPU_MMX | X265_CPU_MMX2 | X265_CPU_CMOV
10
+#define MMX2 X265_CPU_MMX | X265_CPU_MMX2
11
{ "MMX2", MMX2 },
12
{ "MMXEXT", MMX2 },
13
{ "SSE", MMX2 | X265_CPU_SSE },
14
15
{ "BMI2", AVX | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 },
16
#define AVX2 AVX | X265_CPU_FMA3 | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 | X265_CPU_AVX2
17
{ "AVX2", AVX2},
18
+ { "AVX512", AVX2 | X265_CPU_AVX512 },
19
#undef AVX2
20
#undef AVX
21
#undef SSE2
22
#undef MMX2
23
{ "Cache32", X265_CPU_CACHELINE_32 },
24
{ "Cache64", X265_CPU_CACHELINE_64 },
25
- { "SlowCTZ", X265_CPU_SLOW_CTZ },
26
{ "SlowAtom", X265_CPU_SLOW_ATOM },
27
{ "SlowPshufb", X265_CPU_SLOW_PSHUFB },
28
{ "SlowPalignr", X265_CPU_SLOW_PALIGNR },
29
30
/* cpu-a.asm */
31
int PFX(cpu_cpuid_test)(void);
32
void PFX(cpu_cpuid)(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
33
-void PFX(cpu_xgetbv)(uint32_t op, uint32_t *eax, uint32_t *edx);
34
+uint64_t PFX(cpu_xgetbv)(int xcr);
35
}
36
37
#if defined(_MSC_VER)
38
#pragma warning(disable: 4309) // truncation of constant value
39
#endif
40
41
-uint32_t cpu_detect(void)
42
+bool detect512()
43
+{
44
+ return(enable512);
45
+}
46
+uint32_t cpu_detect(bool benableavx512 )
47
{
48
- uint32_t cpu = 0;
49
50
+ uint32_t cpu = 0;
51
uint32_t eax, ebx, ecx, edx;
52
uint32_t vendor[4] = { 0 };
53
uint32_t max_extended_cap, max_basic_cap;
54
+ uint64_t xcr0 = 0;
55
56
#if !X86_64
57
if (!PFX(cpu_cpuid_test)())
58
return 0;
59
#endif
60
61
- PFX(cpu_cpuid)(0, &eax, vendor + 0, vendor + 2, vendor + 1);
62
- max_basic_cap = eax;
63
+ PFX(cpu_cpuid)(0, &max_basic_cap, vendor + 0, vendor + 2, vendor + 1);
64
if (max_basic_cap == 0)
65
return 0;
66
67
68
return cpu;
69
if (edx & 0x02000000)
70
cpu |= X265_CPU_MMX2 | X265_CPU_SSE;
71
- if (edx & 0x00008000)
72
- cpu |= X265_CPU_CMOV;
73
- else
74
- return cpu;
75
if (edx & 0x04000000)
76
cpu |= X265_CPU_SSE2;
77
if (ecx & 0x00000001)
78
cpu |= X265_CPU_SSE3;
79
if (ecx & 0x00000200)
80
- cpu |= X265_CPU_SSSE3;
81
+ cpu |= X265_CPU_SSSE3 | X265_CPU_SSE2_IS_FAST;
82
if (ecx & 0x00080000)
83
cpu |= X265_CPU_SSE4;
84
if (ecx & 0x00100000)
85
cpu |= X265_CPU_SSE42;
86
- /* Check OXSAVE and AVX bits */
87
- if ((ecx & 0x18000000) == 0x18000000)
88
+
89
+ if (ecx & 0x08000000) /* XGETBV supported and XSAVE enabled by OS */
90
{
91
/* Check for OS support */
92
- PFX(cpu_xgetbv)(0, &eax, &edx);
93
- if ((eax & 0x6) == 0x6)
94
+ xcr0 = PFX(cpu_xgetbv)(0);
95
+ if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
96
{
97
+ if (ecx & 0x10000000)
98
cpu |= X265_CPU_AVX;
99
if (ecx & 0x00001000)
100
cpu |= X265_CPU_FMA3;
101
102
{
103
PFX(cpu_cpuid)(7, &eax, &ebx, &ecx, &edx);
104
/* AVX2 requires OS support, but BMI1/2 don't. */
105
- if ((cpu & X265_CPU_AVX) && (ebx & 0x00000020))
106
- cpu |= X265_CPU_AVX2;
107
if (ebx & 0x00000008)
108
- {
109
cpu |= X265_CPU_BMI1;
110
- if (ebx & 0x00000100)
111
- cpu |= X265_CPU_BMI2;
112
+ if (ebx & 0x00000100)
113
+ cpu |= X265_CPU_BMI2;
114
+
115
+ if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
116
+ {
117
+ if (ebx & 0x00000020)
118
+ cpu |= X265_CPU_AVX2;
119
+ if (benableavx512)
120
+ {
121
+ if ((xcr0 & 0xE0) == 0xE0) /* OPMASK/ZMM state */
122
+ {
123
+ if ((ebx & 0xD0030000) == 0xD0030000)
124
+ {
125
+ cpu |= X265_CPU_AVX512;
126
+ enable512 = true;
127
+ }
128
+ }
129
+ }
130
}
131
}
132
133
- if (cpu & X265_CPU_SSSE3)
134
- cpu |= X265_CPU_SSE2_IS_FAST;
135
-
136
PFX(cpu_cpuid)(0x80000000, &eax, &ebx, &ecx, &edx);
137
max_extended_cap = eax;
138
139
140
{
141
if (edx & 0x00400000)
142
cpu |= X265_CPU_MMX2;
143
- if (!(cpu & X265_CPU_LZCNT))
144
- cpu |= X265_CPU_SLOW_CTZ;
145
if ((cpu & X265_CPU_SSE2) && !(cpu & X265_CPU_SSE2_IS_FAST))
146
cpu |= X265_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
147
}
148
149
int model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
150
if (family == 6)
151
{
152
- /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
153
- * theoretically support sse2, but it's significantly slower than mmx for
154
- * almost all of x264's functions, so let's just pretend they don't. */
155
- if (model == 9 || model == 13 || model == 14)
156
- {
157
- cpu &= ~(X265_CPU_SSE2 | X265_CPU_SSE3);
158
- X265_CHECK(!(cpu & (X265_CPU_SSSE3 | X265_CPU_SSE4)), "unexpected CPU ID %d\n", cpu);
159
- }
160
/* Detect Atom CPU */
161
- else if (model == 28)
162
+ if (model == 28)
163
{
164
cpu |= X265_CPU_SLOW_ATOM;
165
- cpu |= X265_CPU_SLOW_CTZ;
166
cpu |= X265_CPU_SLOW_PSHUFB;
167
}
168
169
170
int PFX(cpu_fast_neon_mrc_test)(void);
171
}
172
173
-uint32_t cpu_detect(void)
174
+uint32_t cpu_detect(bool benableavx512)
175
{
176
int flags = 0;
177
178
179
180
#elif X265_ARCH_POWER8
181
182
-uint32_t cpu_detect(void)
183
+uint32_t cpu_detect(bool benableavx512)
184
{
185
#if HAVE_ALTIVEC
186
return X265_CPU_ALTIVEC;
187
188
189
#else // if X265_ARCH_POWER8
190
191
-uint32_t cpu_detect(void)
192
+uint32_t cpu_detect(bool benableavx512)
193
{
194
return 0;
195
}
196
197
#endif // if X265_ARCH_X86
198
}
199
+
200
x265_2.7.tar.gz/source/common/cpu.h -> x265_2.9.tar.gz/source/common/cpu.h
Changed
19
1
2
#define X265_CPU_H
3
4
#include "common.h"
5
-
6
/* All assembly functions are prefixed with X265_NS (macro expanded) */
7
#define PFX3(prefix, name) prefix ## _ ## name
8
#define PFX2(prefix, name) PFX3(prefix, name)
9
10
#endif
11
12
namespace X265_NS {
13
-uint32_t cpu_detect(void);
14
+uint32_t cpu_detect(bool);
15
+bool detect512();
16
17
struct cpu_name_t
18
{
19
x265_2.7.tar.gz/source/common/cudata.cpp -> x265_2.9.tar.gz/source/common/cudata.cpp
Changed
29
1
2
dir |= (1 << list);
3
candMvField[count][list].mv = colmv;
4
candMvField[count][list].refIdx = refIdx;
5
- if (m_encData->m_param->scaleFactor && m_encData->m_param->analysisSave && m_log2CUSize[0] < 4)
6
- {
7
- MV dist(MAX_MV, MAX_MV);
8
- candMvField[count][list].mv = dist;
9
- }
10
}
11
}
12
13
14
15
int curRefPOC = m_slice->m_refPOCList[picList][refIdx];
16
int curPOC = m_slice->m_poc;
17
-
18
- if (m_encData->m_param->scaleFactor && m_encData->m_param->analysisSave && (m_log2CUSize[0] < 4))
19
- {
20
- MV dist(MAX_MV, MAX_MV);
21
- pmv[numMvc++] = amvpCand[num++] = dist;
22
- }
23
- else
24
- pmv[numMvc++] = amvpCand[num++] = scaleMvByPOCDist(neighbours[MD_COLLOCATED].mv[picList], curPOC, curRefPOC, colPOC, colRefPOC);
25
+ pmv[numMvc++] = amvpCand[num++] = scaleMvByPOCDist(neighbours[MD_COLLOCATED].mv[picList], curPOC, curRefPOC, colPOC, colRefPOC);
26
}
27
}
28
29
x265_2.7.tar.gz/source/common/cudata.h -> x265_2.9.tar.gz/source/common/cudata.h
Changed
27
1
2
uint64_t m_fAc_den[3];
3
uint64_t m_fDc_den[3];
4
5
+ /* Feature values per CTU for dynamic refinement */
6
+ uint64_t* m_collectCURd;
7
+ uint32_t* m_collectCUVariance;
8
+ uint32_t* m_collectCUCount;
9
+
10
CUData();
11
12
void initialize(const CUDataMemPool& dataPool, uint32_t depth, const x265_param& param, int instance);
13
14
coeff_t* trCoeffMemBlock;
15
MV* mvMemBlock;
16
sse_t* distortionMemBlock;
17
+ uint64_t* dynRefineRdBlock;
18
+ uint32_t* dynRefCntBlock;
19
+ uint32_t* dynRefVarBlock;
20
21
- CUDataMemPool() { charMemBlock = NULL; trCoeffMemBlock = NULL; mvMemBlock = NULL; distortionMemBlock = NULL; }
22
+ CUDataMemPool() { charMemBlock = NULL; trCoeffMemBlock = NULL; mvMemBlock = NULL; distortionMemBlock = NULL;
23
+ dynRefineRdBlock = NULL; dynRefCntBlock = NULL; dynRefVarBlock = NULL;}
24
25
bool create(uint32_t depth, uint32_t csp, uint32_t numInstances, const x265_param& param)
26
{
27
x265_2.7.tar.gz/source/common/dct.cpp -> x265_2.9.tar.gz/source/common/dct.cpp
Changed
130
1
2
sum += sbacGetEntropyBits(mstate, firstC2Flag);
3
}
4
}
5
-
6
return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
7
}
8
+template<int log2TrSize>
9
+static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
10
+{
11
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
12
+ const int scaleBits = SCALE_BITS - 2 * transformShift;
13
+ const uint32_t trSize = 1 << log2TrSize;
14
+
15
+ for (int y = 0; y < MLS_CG_SIZE; y++)
16
+ {
17
+ for (int x = 0; x < MLS_CG_SIZE; x++)
18
+ {
19
+ int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
20
+ costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
21
+ *totalUncodedCost += costUncoded[blkPos + x];
22
+ *totalRdCost += costUncoded[blkPos + x];
23
+ }
24
+ blkPos += trSize;
25
+ }
26
+}
27
+template<int log2TrSize>
28
+static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
29
+{
30
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
31
+ const int scaleBits = SCALE_BITS - 2 * transformShift;
32
+ const uint32_t trSize = 1 << log2TrSize;
33
+ int max = X265_MAX(0, (2 * transformShift + 1));
34
+
35
+ for (int y = 0; y < MLS_CG_SIZE; y++)
36
+ {
37
+ for (int x = 0; x < MLS_CG_SIZE; x++)
38
+ {
39
+ int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
40
+ int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
41
+
42
+ costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
43
+
44
+ /* when no residual coefficient is coded, predicted coef == recon coef */
45
+ costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
46
+
47
+ *totalUncodedCost += costUncoded[blkPos + x];
48
+ *totalRdCost += costUncoded[blkPos + x];
49
+ }
50
+ blkPos += trSize;
51
+ }
52
+}
53
+template<int log2TrSize>
54
+static void psyRdoQuant_c_1(int16_t *m_resiDctCoeff, /*int16_t *m_fencDctCoeff, */ int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, /* int64_t *psyScale,*/ uint32_t blkPos)
55
+{
56
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
57
+ const int scaleBits = SCALE_BITS - 2 * transformShift;
58
+ const uint32_t trSize = 1 << log2TrSize;
59
+
60
+ for (int y = 0; y < MLS_CG_SIZE; y++)
61
+ {
62
+ for (int x = 0; x < MLS_CG_SIZE; x++)
63
+ {
64
+ int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
65
+ costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
66
+ *totalUncodedCost += costUncoded[blkPos + x];
67
+ *totalRdCost += costUncoded[blkPos + x];
68
+ }
69
+ blkPos += trSize;
70
+ }
71
+}
72
+template<int log2TrSize>
73
+static void psyRdoQuant_c_2(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
74
+{
75
+ const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
76
+
77
+ const uint32_t trSize = 1 << log2TrSize;
78
+ int max = X265_MAX(0, (2 * transformShift + 1));
79
+
80
+ for (int y = 0; y < MLS_CG_SIZE; y++)
81
+ {
82
+ for (int x = 0; x < MLS_CG_SIZE; x++)
83
+ {
84
+ int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
85
+ int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
86
+ costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
87
+ *totalUncodedCost += costUncoded[blkPos + x];
88
+ *totalRdCost += costUncoded[blkPos + x];
89
+ }
90
+ blkPos += trSize;
91
+ }
92
+}
93
94
namespace X265_NS {
95
// x265 private namespace
96
-
97
void setupDCTPrimitives_c(EncoderPrimitives& p)
98
{
99
p.dequant_scaling = dequant_scaling_c;
100
p.dequant_normal = dequant_normal_c;
101
p.quant = quant_c;
102
p.nquant = nquant_c;
103
+ p.cu[BLOCK_4x4].nonPsyRdoQuant = nonPsyRdoQuant_c<2>;
104
+ p.cu[BLOCK_8x8].nonPsyRdoQuant = nonPsyRdoQuant_c<3>;
105
+ p.cu[BLOCK_16x16].nonPsyRdoQuant = nonPsyRdoQuant_c<4>;
106
+ p.cu[BLOCK_32x32].nonPsyRdoQuant = nonPsyRdoQuant_c<5>;
107
+ p.cu[BLOCK_4x4].psyRdoQuant = psyRdoQuant_c<2>;
108
+ p.cu[BLOCK_8x8].psyRdoQuant = psyRdoQuant_c<3>;
109
+ p.cu[BLOCK_16x16].psyRdoQuant = psyRdoQuant_c<4>;
110
+ p.cu[BLOCK_32x32].psyRdoQuant = psyRdoQuant_c<5>;
111
p.dst4x4 = dst4_c;
112
p.cu[BLOCK_4x4].dct = dct4_c;
113
p.cu[BLOCK_8x8].dct = dct8_c;
114
115
p.cu[BLOCK_8x8].copy_cnt = copy_count<8>;
116
p.cu[BLOCK_16x16].copy_cnt = copy_count<16>;
117
p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
118
-
119
+ p.cu[BLOCK_4x4].psyRdoQuant_1p = psyRdoQuant_c_1<2>;
120
+ p.cu[BLOCK_4x4].psyRdoQuant_2p = psyRdoQuant_c_2<2>;
121
+ p.cu[BLOCK_8x8].psyRdoQuant_1p = psyRdoQuant_c_1<3>;
122
+ p.cu[BLOCK_8x8].psyRdoQuant_2p = psyRdoQuant_c_2<3>;
123
+ p.cu[BLOCK_16x16].psyRdoQuant_1p = psyRdoQuant_c_1<4>;
124
+ p.cu[BLOCK_16x16].psyRdoQuant_2p = psyRdoQuant_c_2<4>;
125
+ p.cu[BLOCK_32x32].psyRdoQuant_1p = psyRdoQuant_c_1<5>;
126
+ p.cu[BLOCK_32x32].psyRdoQuant_2p = psyRdoQuant_c_2<5>;
127
p.scanPosLast = scanPosLast_c;
128
p.findPosFirstLast = findPosFirstLast_c;
129
p.costCoeffNxN = costCoeffNxN_c;
130
x265_2.7.tar.gz/source/common/frame.cpp -> x265_2.9.tar.gz/source/common/frame.cpp
Changed
56
1
2
m_addOnDepth = NULL;
3
m_addOnCtuInfo = NULL;
4
m_addOnPrevChange = NULL;
5
+ m_classifyFrame = false;
6
}
7
8
bool Frame::create(x265_param *param, float* quantOffsets)
9
10
m_analysisData.wt = NULL;
11
m_analysisData.intraData = NULL;
12
m_analysisData.interData = NULL;
13
- m_analysis2Pass.analysisFramedata = NULL;
14
+ m_analysisData.distortionData = NULL;
15
}
16
17
- if (m_fencPic->create(param, !!m_param->bCopyPicToFrame) && m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode || !!param->bAQMotion, param->rc.qgSize))
18
+ if (param->bDynamicRefine)
19
+ {
20
+ int size = m_param->maxCUDepth * X265_REFINE_INTER_LEVELS;
21
+ CHECKED_MALLOC_ZERO(m_classifyRd, uint64_t, size);
22
+ CHECKED_MALLOC_ZERO(m_classifyVariance, uint64_t, size);
23
+ CHECKED_MALLOC_ZERO(m_classifyCount, uint32_t, size);
24
+ }
25
+
26
+ if (m_fencPic->create(param, !!m_param->bCopyPicToFrame) && m_lowres.create(param, m_fencPic, param->rc.qgSize))
27
{
28
X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized");
29
m_numRows = (m_fencPic->m_picHeight + param->maxCUSize - 1) / param->maxCUSize;
30
31
32
if (quantOffsets)
33
{
34
- int32_t cuCount;
35
- if (param->rc.qgSize == 8)
36
- cuCount = m_lowres.maxBlocksInRowFullRes * m_lowres.maxBlocksInColFullRes;
37
- else
38
- cuCount = m_lowres.maxBlocksInRow * m_lowres.maxBlocksInCol;
39
+ int32_t cuCount = (param->rc.qgSize == 8) ? m_lowres.maxBlocksInRowFullRes * m_lowres.maxBlocksInColFullRes :
40
+ m_lowres.maxBlocksInRow * m_lowres.maxBlocksInCol;
41
m_quantOffsets = new float[cuCount];
42
}
43
return true;
44
45
}
46
m_lowres.destroy();
47
X265_FREE(m_rcData);
48
+
49
+ if (m_param->bDynamicRefine)
50
+ {
51
+ X265_FREE_ZERO(m_classifyRd);
52
+ X265_FREE_ZERO(m_classifyVariance);
53
+ X265_FREE_ZERO(m_classifyCount);
54
+ }
55
}
56
x265_2.7.tar.gz/source/common/frame.h -> x265_2.9.tar.gz/source/common/frame.h
Changed
24
1
2
Frame* m_prev;
3
x265_param* m_param; // Points to the latest param set for the frame.
4
x265_analysis_data m_analysisData;
5
- x265_analysis_2Pass m_analysis2Pass;
6
RcStats* m_rcData;
7
8
Event m_copyMVType;
9
10
uint8_t** m_addOnDepth;
11
uint8_t** m_addOnCtuInfo;
12
int** m_addOnPrevChange;
13
+
14
+ /* Average feature values of frames being considered for classification */
15
+ uint64_t* m_classifyRd;
16
+ uint64_t* m_classifyVariance;
17
+ uint32_t* m_classifyCount;
18
+
19
+ bool m_classifyFrame;
20
+
21
Frame();
22
23
bool create(x265_param *param, float* quantOffsets);
24
x265_2.7.tar.gz/source/common/framedata.cpp -> x265_2.9.tar.gz/source/common/framedata.cpp
Changed
53
1
2
if (param.rc.bStatWrite)
3
m_spsrps = const_cast<RPS*>(sps.spsrps);
4
bool isallocated = m_cuMemPool.create(0, param.internalCsp, sps.numCUsInFrame, param);
5
+ if (m_param->bDynamicRefine)
6
+ {
7
+ CHECKED_MALLOC_ZERO(m_cuMemPool.dynRefineRdBlock, uint64_t, MAX_NUM_DYN_REFINE * sps.numCUsInFrame);
8
+ CHECKED_MALLOC_ZERO(m_cuMemPool.dynRefCntBlock, uint32_t, MAX_NUM_DYN_REFINE * sps.numCUsInFrame);
9
+ CHECKED_MALLOC_ZERO(m_cuMemPool.dynRefVarBlock, uint32_t, MAX_NUM_DYN_REFINE * sps.numCUsInFrame);
10
+ }
11
if (isallocated)
12
+ {
13
for (uint32_t ctuAddr = 0; ctuAddr < sps.numCUsInFrame; ctuAddr++)
14
+ {
15
+ if (m_param->bDynamicRefine)
16
+ {
17
+ m_picCTU[ctuAddr].m_collectCURd = m_cuMemPool.dynRefineRdBlock + (ctuAddr * MAX_NUM_DYN_REFINE);
18
+ m_picCTU[ctuAddr].m_collectCUVariance = m_cuMemPool.dynRefVarBlock + (ctuAddr * MAX_NUM_DYN_REFINE);
19
+ m_picCTU[ctuAddr].m_collectCUCount = m_cuMemPool.dynRefCntBlock + (ctuAddr * MAX_NUM_DYN_REFINE);
20
+ }
21
m_picCTU[ctuAddr].initialize(m_cuMemPool, 0, param, ctuAddr);
22
+ }
23
+ }
24
else
25
return false;
26
CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame);
27
28
{
29
memset(m_cuStat, 0, sps.numCUsInFrame * sizeof(*m_cuStat));
30
memset(m_rowStat, 0, sps.numCuInHeight * sizeof(*m_rowStat));
31
+ if (m_param->bDynamicRefine)
32
+ {
33
+ memset(m_picCTU->m_collectCURd, 0, MAX_NUM_DYN_REFINE * sizeof(uint64_t));
34
+ memset(m_picCTU->m_collectCUVariance, 0, MAX_NUM_DYN_REFINE * sizeof(uint32_t));
35
+ memset(m_picCTU->m_collectCUCount, 0, MAX_NUM_DYN_REFINE * sizeof(uint32_t));
36
+ }
37
}
38
39
void FrameData::destroy()
40
41
42
m_cuMemPool.destroy();
43
44
+ if (m_param->bDynamicRefine)
45
+ {
46
+ X265_FREE(m_cuMemPool.dynRefineRdBlock);
47
+ X265_FREE(m_cuMemPool.dynRefCntBlock);
48
+ X265_FREE(m_cuMemPool.dynRefVarBlock);
49
+ }
50
X265_FREE(m_cuStat);
51
X265_FREE(m_rowStat);
52
for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
53
x265_2.7.tar.gz/source/common/framedata.h -> x265_2.9.tar.gz/source/common/framedata.h
Changed
61
1
2
uint64_t cntInterPu[NUM_CU_DEPTH][INTER_MODES - 1];
3
uint64_t cntMergePu[NUM_CU_DEPTH][INTER_MODES - 1];
4
5
+ /* Feature values per row for dynamic refinement */
6
+ uint64_t rowRdDyn[MAX_NUM_DYN_REFINE];
7
+ uint32_t rowVarDyn[MAX_NUM_DYN_REFINE];
8
+ uint32_t rowCntDyn[MAX_NUM_DYN_REFINE];
9
+
10
FrameStats()
11
{
12
memset(this, 0, sizeof(FrameStats));
13
14
inline CUData* getPicCTU(uint32_t ctuAddr) { return &m_picCTU[ctuAddr]; }
15
};
16
17
-/* Stores intra analysis data for a single frame. This struct needs better packing */
18
-struct analysis_intra_data
19
-{
20
- uint8_t* depth;
21
- uint8_t* modes;
22
- char* partSizes;
23
- uint8_t* chromaModes;
24
-};
25
-
26
-/* Stores inter analysis data for a single frame */
27
-struct analysis_inter_data
28
-{
29
- int32_t* ref;
30
- uint8_t* depth;
31
- uint8_t* modes;
32
- uint8_t* partSize;
33
- uint8_t* mergeFlag;
34
- uint8_t* interDir;
35
- uint8_t* mvpIdx[2];
36
- int8_t* refIdx[2];
37
- MV* mv[2];
38
- int64_t* sadCost;
39
-};
40
-
41
-struct analysis2PassFrameData
42
-{
43
- uint8_t* depth;
44
- MV* m_mv[2];
45
- int* mvpIdx[2];
46
- int32_t* ref[2];
47
- uint8_t* modes;
48
- sse_t* distortion;
49
- sse_t* ctuDistortion;
50
- double* scaledDistortion;
51
- double averageDistortion;
52
- double sdDistortion;
53
- uint32_t highDistortionCtuCount;
54
- uint32_t lowDistortionCtuCount;
55
- double* offset;
56
- double* threshold;
57
-};
58
-
59
}
60
#endif // ifndef X265_FRAMEDATA_H
61
x265_2.7.tar.gz/source/common/ipfilter.cpp -> x265_2.9.tar.gz/source/common/ipfilter.cpp
Changed
41
1
2
p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \
3
p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \
4
p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
5
- p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
6
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_c<W, H>;\
7
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_c<W, H>;
8
9
#define CHROMA_422(W, H) \
10
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
11
12
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \
13
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \
14
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
15
- p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
16
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_c<W, H>;\
17
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_c<W, H>;
18
19
#define CHROMA_444(W, H) \
20
p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
21
22
p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \
23
p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \
24
p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
25
- p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
26
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_c<W, H>;\
27
+ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_c<W, H>;
28
29
#define LUMA(W, H) \
30
p.pu[LUMA_ ## W ## x ## H].luma_hpp = interp_horiz_pp_c<8, W, H>; \
31
32
p.pu[LUMA_ ## W ## x ## H].luma_vsp = interp_vert_sp_c<8, W, H>; \
33
p.pu[LUMA_ ## W ## x ## H].luma_vss = interp_vert_ss_c<8, W, H>; \
34
p.pu[LUMA_ ## W ## x ## H].luma_hvpp = interp_hv_pp_c<8, W, H>; \
35
- p.pu[LUMA_ ## W ## x ## H].convert_p2s = filterPixelToShort_c<W, H>;
36
+ p.pu[LUMA_ ## W ## x ## H].convert_p2s[NONALIGNED] = filterPixelToShort_c<W, H>;\
37
+ p.pu[LUMA_ ## W ## x ## H].convert_p2s[ALIGNED] = filterPixelToShort_c<W, H>;
38
39
void setupFilterPrimitives_c(EncoderPrimitives& p)
40
{
41
x265_2.7.tar.gz/source/common/lowres.cpp -> x265_2.9.tar.gz/source/common/lowres.cpp
Changed
66
1
2
3
using namespace X265_NS;
4
5
-bool Lowres::create(PicYuv *origPic, int _bframes, bool bAQEnabled, uint32_t qgSize)
6
+bool Lowres::create(x265_param* param, PicYuv *origPic, uint32_t qgSize)
7
{
8
isLowres = true;
9
- bframes = _bframes;
10
+ bframes = param->bframes;
11
width = origPic->m_picWidth / 2;
12
lines = origPic->m_picHeight / 2;
13
lumaStride = width + 2 * origPic->m_lumaMarginX;
14
15
maxBlocksInRowFullRes = maxBlocksInRow * 2;
16
maxBlocksInColFullRes = maxBlocksInCol * 2;
17
int cuCount = maxBlocksInRow * maxBlocksInCol;
18
- int cuCountFullRes;
19
- if (qgSize == 8)
20
- cuCountFullRes = maxBlocksInRowFullRes * maxBlocksInColFullRes;
21
- else
22
- cuCountFullRes = cuCount;
23
+ int cuCountFullRes = (qgSize > 8) ? cuCount : cuCount << 2;
24
25
/* rounding the width to multiple of lowres CU size */
26
width = maxBlocksInRow * X265_LOWRES_CU_SIZE;
27
28
29
size_t planesize = lumaStride * (lines + 2 * origPic->m_lumaMarginY);
30
size_t padoffset = lumaStride * origPic->m_lumaMarginY + origPic->m_lumaMarginX;
31
- if (bAQEnabled)
32
+ if (!!param->rc.aqMode)
33
{
34
CHECKED_MALLOC_ZERO(qpAqOffset, double, cuCountFullRes);
35
- CHECKED_MALLOC_ZERO(qpAqMotionOffset, double, cuCountFullRes);
36
CHECKED_MALLOC_ZERO(invQscaleFactor, int, cuCountFullRes);
37
CHECKED_MALLOC_ZERO(qpCuTreeOffset, double, cuCountFullRes);
38
- CHECKED_MALLOC_ZERO(blockVariance, uint32_t, cuCountFullRes);
39
if (qgSize == 8)
40
CHECKED_MALLOC_ZERO(invQscaleFactor8x8, int, cuCount);
41
}
42
+ if (origPic->m_param->bAQMotion)
43
+ CHECKED_MALLOC_ZERO(qpAqMotionOffset, double, cuCountFullRes);
44
+ if (origPic->m_param->bDynamicRefine)
45
+ CHECKED_MALLOC_ZERO(blockVariance, uint32_t, cuCountFullRes);
46
CHECKED_MALLOC(propagateCost, uint16_t, cuCount);
47
48
/* allocate lowres buffers */
49
50
X265_FREE(lowresMvCosts[1][i]);
51
}
52
X265_FREE(qpAqOffset);
53
- X265_FREE(qpAqMotionOffset);
54
X265_FREE(invQscaleFactor);
55
X265_FREE(qpCuTreeOffset);
56
X265_FREE(propagateCost);
57
- X265_FREE(blockVariance);
58
X265_FREE(invQscaleFactor8x8);
59
+ X265_FREE(qpAqMotionOffset);
60
+ X265_FREE(blockVariance);
61
}
62
-
63
// (re) initialize lowres state
64
void Lowres::init(PicYuv *origPic, int poc)
65
{
66
x265_2.7.tar.gz/source/common/lowres.h -> x265_2.9.tar.gz/source/common/lowres.h
Changed
35
1
2
int qmvy = qmv.y + (qmv.y & 1);
3
int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
4
pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
5
- primitives.pu[LUMA_8x8].pixelavg_pp(buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);
6
+ primitives.pu[LUMA_8x8].pixelavg_pp[(outstride % 64 == 0) && (lumaStride % 64 == 0)](buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);
7
return buf;
8
}
9
else
10
11
int qmvy = qmv.y + (qmv.y & 1);
12
int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
13
pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
14
- primitives.pu[LUMA_8x8].pixelavg_pp(subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32);
15
+ primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32);
16
return comp(fenc, FENC_STRIDE, subpelbuf, 8);
17
}
18
else
19
20
uint32_t* blockVariance;
21
uint64_t wp_ssd[3]; // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
22
uint64_t wp_sum[3];
23
- uint64_t frameVariance;
24
25
/* cutree intermediate data */
26
uint16_t* propagateCost;
27
double weightedCostDelta[X265_BFRAME_MAX + 2];
28
ReferencePlanes weightedRef[X265_BFRAME_MAX + 2];
29
-
30
- bool create(PicYuv *origPic, int _bframes, bool bAqEnabled, uint32_t qgSize);
31
+ bool create(x265_param* param, PicYuv *origPic, uint32_t qgSize);
32
void destroy();
33
void init(PicYuv *origPic, int poc);
34
};
35
x265_2.7.tar.gz/source/common/param.cpp -> x265_2.9.tar.gz/source/common/param.cpp
Changed
224
1
2
memset(param, 0, sizeof(x265_param));
3
4
/* Applying default values to all elements in the param structure */
5
- param->cpuid = X265_NS::cpu_detect();
6
+ param->cpuid = X265_NS::cpu_detect(false);
7
param->bEnableWavefront = 1;
8
param->frameNumThreads = 0;
9
10
11
param->bEmitHRDSEI = 0;
12
param->bEmitInfoSEI = 1;
13
param->bEmitHDRSEI = 0;
14
+ param->bEmitIDRRecoverySEI = 0;
15
16
/* CU definitions */
17
param->maxCUSize = 64;
18
19
param->lookaheadThreads = 0;
20
param->scenecutBias = 5.0;
21
param->radl = 0;
22
+ param->chunkStart = 0;
23
+ param->chunkEnd = 0;
24
+
25
/* Intra Coding Tools */
26
param->bEnableConstrainedIntra = 0;
27
param->bEnableStrongIntraSmoothing = 1;
28
29
param->bEnableSAO = 1;
30
param->bSaoNonDeblocked = 0;
31
param->bLimitSAO = 0;
32
+
33
/* Coding Quality */
34
param->cbQpOffset = 0;
35
param->crQpOffset = 0;
36
37
param->scaleFactor = 0;
38
param->intraRefine = 0;
39
param->interRefine = 0;
40
+ param->bDynamicRefine = 0;
41
param->mvRefine = 0;
42
param->bUseAnalysisFile = 1;
43
param->csvfpt = NULL;
44
param->forceFlush = 0;
45
param->bDisableLookahead = 0;
46
param->bCopyPicToFrame = 1;
47
+ param->maxAUSizeFactor = 1;
48
+ param->naluFile = NULL;
49
50
/* DCT Approximations */
51
param->bLowPassDct = 0;
52
param->bMVType = 0;
53
+ param->bSingleSeiNal = 0;
54
+
55
+ /* SEI messages */
56
+ param->preferredTransferCharacteristics = -1;
57
+ param->pictureStructure = -1;
58
}
59
60
int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
61
62
if (0) ;
63
OPT("asm")
64
{
65
+#if X265_ARCH_X86
66
+ if (!strcasecmp(value, "avx512"))
67
+ {
68
+ p->cpuid = X265_NS::cpu_detect(true);
69
+ if (!(p->cpuid & X265_CPU_AVX512))
70
+ x265_log(p, X265_LOG_WARNING, "AVX512 is not supported\n");
71
+ }
72
+ else
73
+ {
74
+ if (bValueWasNull)
75
+ p->cpuid = atobool(value);
76
+ else
77
+ p->cpuid = parseCpuName(value, bError, false);
78
+ }
79
+#else
80
if (bValueWasNull)
81
p->cpuid = atobool(value);
82
else
83
- p->cpuid = parseCpuName(value, bError);
84
+ p->cpuid = parseCpuName(value, bError, false);
85
+#endif
86
}
87
OPT("fps")
88
{
89
90
OPT("limit-sao") p->bLimitSAO = atobool(value);
91
OPT("dhdr10-info") p->toneMapFile = strdup(value);
92
OPT("dhdr10-opt") p->bDhdr10opt = atobool(value);
93
+ OPT("idr-recovery-sei") p->bEmitIDRRecoverySEI = atobool(value);
94
OPT("const-vbv") p->rc.bEnableConstVbv = atobool(value);
95
OPT("ctu-info") p->bCTUInfo = atoi(value);
96
OPT("scale-factor") p->scaleFactor = atoi(value);
97
98
OPT("refine-mv")p->mvRefine = atobool(value);
99
OPT("force-flush")p->forceFlush = atoi(value);
100
OPT("splitrd-skip") p->bEnableSplitRdSkip = atobool(value);
101
- OPT("lowpass-dct") p->bLowPassDct = atobool(value);
102
+ OPT("lowpass-dct") p->bLowPassDct = atobool(value);
103
OPT("vbv-end") p->vbvBufferEnd = atof(value);
104
OPT("vbv-end-fr-adj") p->vbvEndFrameAdjust = atof(value);
105
OPT("copy-pic") p->bCopyPicToFrame = atobool(value);
106
107
{
108
bError = true;
109
}
110
- }
111
+ }
112
OPT("gop-lookahead") p->gopLookahead = atoi(value);
113
OPT("analysis-save") p->analysisSave = strdup(value);
114
OPT("analysis-load") p->analysisLoad = strdup(value);
115
OPT("radl") p->radl = atoi(value);
116
+ OPT("max-ausize-factor") p->maxAUSizeFactor = atof(value);
117
+ OPT("dynamic-refine") p->bDynamicRefine = atobool(value);
118
+ OPT("single-sei") p->bSingleSeiNal = atobool(value);
119
+ OPT("atc-sei") p->preferredTransferCharacteristics = atoi(value);
120
+ OPT("pic-struct") p->pictureStructure = atoi(value);
121
+ OPT("chunk-start") p->chunkStart = atoi(value);
122
+ OPT("chunk-end") p->chunkEnd = atoi(value);
123
+ OPT("nalu-file") p->naluFile = strdup(value);
124
else
125
return X265_PARAM_BAD_NAME;
126
}
127
128
* false || no - disabled
129
* integer bitmap value
130
* comma separated list of SIMD names, eg: SSE4.1,XOP */
131
-int parseCpuName(const char* value, bool& bError)
132
+int parseCpuName(const char* value, bool& bError, bool bEnableavx512)
133
{
134
if (!value)
135
{
136
137
if (isdigit(value[0]))
138
cpu = x265_atoi(value, bError);
139
else
140
- cpu = !strcmp(value, "auto") || x265_atobool(value, bError) ? X265_NS::cpu_detect() : 0;
141
+ cpu = !strcmp(value, "auto") || x265_atobool(value, bError) ? X265_NS::cpu_detect(bEnableavx512) : 0;
142
143
if (bError)
144
{
145
146
"Supported values for bCTUInfo are 0, 1, 2, 4, 6");
147
CHECK(param->interRefine > 3 || param->interRefine < 0,
148
"Invalid refine-inter value, refine-inter levels 0 to 3 supported");
149
- CHECK(param->intraRefine > 3 || param->intraRefine < 0,
150
+ CHECK(param->intraRefine > 4 || param->intraRefine < 0,
151
"Invalid refine-intra value, refine-intra levels 0 to 3 supported");
152
+ CHECK(param->maxAUSizeFactor < 0.5 || param->maxAUSizeFactor > 1.0,
153
+ "Supported factor for controlling max AU size is from 0.5 to 1");
154
#if !X86_64
155
CHECK(param->searchMethod == X265_SEA && (param->sourceWidth > 840 || param->sourceHeight > 480),
156
"SEA motion search does not support resolutions greater than 480p in 32 bit build");
157
158
if (param->masteringDisplayColorVolume || param->maxFALL || param->maxCLL)
159
param->bEmitHDRSEI = 1;
160
161
+ bool isSingleSEI = (param->bRepeatHeaders
162
+ || param->bEmitHRDSEI
163
+ || param->bEmitInfoSEI
164
+ || param->bEmitHDRSEI
165
+ || param->bEmitIDRRecoverySEI
166
+ || !!param->interlaceMode
167
+ || param->preferredTransferCharacteristics > 1
168
+ || param->toneMapFile
169
+ || param->naluFile);
170
+
171
+ if (!isSingleSEI && param->bSingleSeiNal)
172
+ {
173
+ param->bSingleSeiNal = 0;
174
+ x265_log(param, X265_LOG_WARNING, "None of the SEI messages are enabled. Disabling Single SEI NAL\n");
175
+ }
176
return check_failed;
177
}
178
179
180
TOOLVAL(param->bCTUInfo, "ctu-info=%d");
181
if (param->bMVType == AVC_INFO)
182
TOOLOPT(param->bMVType, "refine-mv-type=avc");
183
+ TOOLOPT(param->bDynamicRefine, "dynamic-refine");
184
if (param->maxSlices > 1)
185
TOOLVAL(param->maxSlices, "slices=%d");
186
if (param->bEnableLoopFilter)
187
188
TOOLOPT(!param->bSaoNonDeblocked && param->bEnableSAO, "sao");
189
TOOLOPT(param->rc.bStatWrite, "stats-write");
190
TOOLOPT(param->rc.bStatRead, "stats-read");
191
+ TOOLOPT(param->bSingleSeiNal, "single-sei");
192
#if ENABLE_HDR10_PLUS
193
TOOLOPT(param->toneMapFile != NULL, "dhdr10-info");
194
#endif
195
196
s += sprintf(s, " input-res=%dx%d", p->sourceWidth - padx, p->sourceHeight - pady);
197
s += sprintf(s, " interlace=%d", p->interlaceMode);
198
s += sprintf(s, " total-frames=%d", p->totalFrames);
199
+ if (p->chunkStart)
200
+ s += sprintf(s, " chunk-start=%d", p->chunkStart);
201
+ if (p->chunkEnd)
202
+ s += sprintf(s, " chunk-end=%d", p->chunkEnd);
203
s += sprintf(s, " level-idc=%d", p->levelIdc);
204
s += sprintf(s, " high-tier=%d", p->bHighTier);
205
s += sprintf(s, " uhd-bd=%d", p->uhdBluray);
206
207
BOOL(p->bEmitHDRSEI, "hdr");
208
BOOL(p->bHDROpt, "hdr-opt");
209
BOOL(p->bDhdr10opt, "dhdr10-opt");
210
+ BOOL(p->bEmitIDRRecoverySEI, "idr-recovery-sei");
211
if (p->analysisSave)
212
s += sprintf(s, " analysis-save");
213
if (p->analysisLoad)
214
215
BOOL(p->bLowPassDct, "lowpass-dct");
216
s += sprintf(s, " refine-mv-type=%d", p->bMVType);
217
s += sprintf(s, " copy-pic=%d", p->bCopyPicToFrame);
218
+ s += sprintf(s, " max-ausize-factor=%.1f", p->maxAUSizeFactor);
219
+ BOOL(p->bDynamicRefine, "dynamic-refine");
220
+ BOOL(p->bSingleSeiNal, "single-sei");
221
#undef BOOL
222
return buf;
223
}
224
x265_2.7.tar.gz/source/common/param.h -> x265_2.9.tar.gz/source/common/param.h
Changed
10
1
2
char* x265_param2string(x265_param *param, int padx, int pady);
3
int x265_atoi(const char *str, bool& bError);
4
double x265_atof(const char *str, bool& bError);
5
-int parseCpuName(const char *value, bool& bError);
6
+int parseCpuName(const char *value, bool& bError, bool bEnableavx512);
7
void setParamAspectRatio(x265_param *p, int width, int height);
8
void getParamAspectRatio(x265_param *p, int& width, int& height);
9
bool parseLambdaFile(x265_param *param);
10
x265_2.7.tar.gz/source/common/picyuv.cpp -> x265_2.9.tar.gz/source/common/picyuv.cpp
Changed
21
1
2
pixel *uPic = m_picOrg[1];
3
pixel *vPic = m_picOrg[2];
4
5
+ if(param.minLuma != 0 || param.maxLuma != PIXEL_MAX)
6
+ {
7
+ for (int r = 0; r < height; r++)
8
+ {
9
+ for (int c = 0; c < width; c++)
10
+ {
11
+ yPic[c] = X265_MIN(yPic[c], (pixel)param.maxLuma);
12
+ yPic[c] = X265_MAX(yPic[c], (pixel)param.minLuma);
13
+ }
14
+ yPic += m_stride;
15
+ }
16
+ }
17
+ yPic = m_picOrg[0];
18
if (param.csvLogLevel >= 2 || param.maxCLL || param.maxFALL)
19
{
20
for (int r = 0; r < height; r++)
21
x265_2.7.tar.gz/source/common/picyuv.h -> x265_2.9.tar.gz/source/common/picyuv.h
Changed
9
1
2
pixel m_maxChromaVLevel;
3
pixel m_minChromaVLevel;
4
double m_avgChromaVLevel;
5
+ double m_vmafScore;
6
x265_param *m_param;
7
8
PicYuv();
9
x265_2.7.tar.gz/source/common/pixel.cpp -> x265_2.9.tar.gz/source/common/pixel.cpp
Changed
102
1
2
static void cuTreeFix8Pack(uint16_t *dst, double *src, int count)
3
{
4
for (int i = 0; i < count; i++)
5
- dst[i] = (uint16_t)(src[i] * 256.0);
6
+ dst[i] = (uint16_t)(int16_t)(src[i] * 256.0);
7
}
8
9
static void cuTreeFix8Unpack(double *dst, uint16_t *src, int count)
10
11
{
12
#define LUMA_PU(W, H) \
13
p.pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
14
- p.pu[LUMA_ ## W ## x ## H].addAvg = addAvg<W, H>; \
15
+ p.pu[LUMA_ ## W ## x ## H].addAvg[NONALIGNED] = addAvg<W, H>; \
16
+ p.pu[LUMA_ ## W ## x ## H].addAvg[ALIGNED] = addAvg<W, H>; \
17
p.pu[LUMA_ ## W ## x ## H].sad = sad<W, H>; \
18
p.pu[LUMA_ ## W ## x ## H].sad_x3 = sad_x3<W, H>; \
19
p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4<W, H>; \
20
- p.pu[LUMA_ ## W ## x ## H].pixelavg_pp = pixelavg_pp<W, H>;
21
-
22
+ p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[NONALIGNED] = pixelavg_pp<W, H>; \
23
+ p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[ALIGNED] = pixelavg_pp<W, H>;
24
#define LUMA_CU(W, H) \
25
p.cu[BLOCK_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
26
- p.cu[BLOCK_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>; \
27
+ p.cu[BLOCK_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_c<W, H>; \
28
+ p.cu[BLOCK_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_c<W, H>; \
29
p.cu[BLOCK_ ## W ## x ## H].copy_sp = blockcopy_sp_c<W, H>; \
30
p.cu[BLOCK_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
31
p.cu[BLOCK_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
32
- p.cu[BLOCK_ ## W ## x ## H].blockfill_s = blockfill_s_c<W>; \
33
+ p.cu[BLOCK_ ## W ## x ## H].blockfill_s[NONALIGNED] = blockfill_s_c<W>; \
34
+ p.cu[BLOCK_ ## W ## x ## H].blockfill_s[ALIGNED] = blockfill_s_c<W>; \
35
p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl<W>; \
36
p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shr = cpy2Dto1D_shr<W>; \
37
- p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl = cpy1Dto2D_shl<W>; \
38
+ p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl<W>; \
39
+ p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl<W>; \
40
p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shr = cpy1Dto2D_shr<W>; \
41
p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp = psyCost_pp<BLOCK_ ## W ## x ## H>; \
42
p.cu[BLOCK_ ## W ## x ## H].transpose = transpose<W>; \
43
- p.cu[BLOCK_ ## W ## x ## H].ssd_s = pixel_ssd_s_c<W>; \
44
+ p.cu[BLOCK_ ## W ## x ## H].ssd_s[NONALIGNED] = pixel_ssd_s_c<W>; \
45
+ p.cu[BLOCK_ ## W ## x ## H].ssd_s[ALIGNED] = pixel_ssd_s_c<W>; \
46
p.cu[BLOCK_ ## W ## x ## H].var = pixel_var<W>; \
47
- p.cu[BLOCK_ ## W ## x ## H].calcresidual = getResidual<W>; \
48
+ p.cu[BLOCK_ ## W ## x ## H].calcresidual[NONALIGNED] = getResidual<W>; \
49
+ p.cu[BLOCK_ ## W ## x ## H].calcresidual[ALIGNED] = getResidual<W>; \
50
p.cu[BLOCK_ ## W ## x ## H].sse_pp = sse<W, H, pixel, pixel>; \
51
p.cu[BLOCK_ ## W ## x ## H].sse_ss = sse<W, H, int16_t, int16_t>;
52
53
54
p.cu[BLOCK_64x64].sa8d = sa8d16<64, 64>;
55
56
#define CHROMA_PU_420(W, H) \
57
- p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg = addAvg<W, H>; \
58
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg[NONALIGNED] = addAvg<W, H>; \
59
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg[ALIGNED] = addAvg<W, H>; \
60
p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
61
62
CHROMA_PU_420(2, 2);
63
64
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
65
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
66
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
67
- p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>;
68
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_c<W, H>; \
69
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_c<W, H>;
70
71
CHROMA_CU_420(2, 2)
72
CHROMA_CU_420(4, 4)
73
74
p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = sa8d16<32, 32>;
75
76
#define CHROMA_PU_422(W, H) \
77
- p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg = addAvg<W, H>; \
78
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg[NONALIGNED] = addAvg<W, H>; \
79
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg[ALIGNED] = addAvg<W, H>; \
80
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
81
82
CHROMA_PU_422(2, 4);
83
84
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
85
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
86
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
87
- p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>;
88
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_c<W, H>; \
89
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_c<W, H>;
90
91
CHROMA_CU_422(2, 4)
92
CHROMA_CU_422(4, 8)
93
94
p.weight_pp = weight_pp_c;
95
p.weight_sp = weight_sp_c;
96
97
- p.scale1D_128to64 = scale1D_128to64;
98
+ p.scale1D_128to64[NONALIGNED] = p.scale1D_128to64[ALIGNED] = scale1D_128to64;
99
p.scale2D_64to32 = scale2D_64to32;
100
p.frameInitLowres = frame_init_lowres_core;
101
p.ssim_4x4x2_core = ssim_4x4x2_core;
102
x265_2.7.tar.gz/source/common/predict.cpp -> x265_2.9.tar.gz/source/common/predict.cpp
Changed
72
1
2
MV mv0 = cu.m_mv[0][pu.puAbsPartIdx];
3
cu.clipMv(mv0);
4
5
- if (cu.m_slice->m_pps->bUseWeightPred && wp0->bPresentFlag)
6
+ if (cu.m_slice->m_pps->bUseWeightPred && wp0->wtPresent)
7
{
8
for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
9
{
10
11
pwp0 = refIdx0 >= 0 ? cu.m_slice->m_weightPredTable[0][refIdx0] : NULL;
12
pwp1 = refIdx1 >= 0 ? cu.m_slice->m_weightPredTable[1][refIdx1] : NULL;
13
14
- if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
15
+ if (pwp0 && pwp1 && (pwp0->wtPresent || pwp1->wtPresent))
16
{
17
/* biprediction weighting */
18
for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
19
20
predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
21
}
22
23
- if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
24
+ if (pwp0 && pwp1 && (pwp0->wtPresent || pwp1->wtPresent))
25
addWeightBi(pu, predYuv, m_predShortYuv[0], m_predShortYuv[1], wv0, wv1, bLuma, bChroma);
26
else
27
predYuv.addAvg(m_predShortYuv[0], m_predShortYuv[1], pu.puAbsPartIdx, pu.width, pu.height, bLuma, bChroma);
28
29
MV mv0 = cu.m_mv[0][pu.puAbsPartIdx];
30
cu.clipMv(mv0);
31
32
- if (pwp0 && pwp0->bPresentFlag)
33
+ if (pwp0 && pwp0->wtPresent)
34
{
35
ShortYuv& shortYuv = m_predShortYuv[0];
36
37
38
/* uniprediction to L1 */
39
X265_CHECK(refIdx1 >= 0, "refidx1 was not positive\n");
40
41
- if (pwp1 && pwp1->bPresentFlag)
42
+ if (pwp1 && pwp1->wtPresent)
43
{
44
ShortYuv& shortYuv = m_predShortYuv[0];
45
46
47
int yFrac = mv.y & 3;
48
49
if (!(yFrac | xFrac))
50
- primitives.pu[partEnum].convert_p2s(src, srcStride, dst, dstStride);
51
+ {
52
+ bool srcbufferAlignCheck = (refPic.m_cuOffsetY[pu.ctuAddr] + refPic.m_buOffsetY[pu.cuAbsPartIdx + pu.puAbsPartIdx] + srcOffset) % 64 == 0;
53
+ bool dstbufferAlignCheck = (dstSYuv.getAddrOffset(pu.puAbsPartIdx, dstSYuv.m_size) % 64) == 0;
54
+ primitives.pu[partEnum].convert_p2s[srcStride % 64 == 0 && dstStride % 64 == 0 && srcbufferAlignCheck && dstbufferAlignCheck](src, srcStride, dst, dstStride);
55
+ }
56
else if (!yFrac)
57
primitives.pu[partEnum].luma_hps(src, srcStride, dst, dstStride, xFrac, 0);
58
else if (!xFrac)
59
60
61
if (!(yFrac | xFrac))
62
{
63
- primitives.chroma[m_csp].pu[partEnum].p2s(refCb, refStride, dstCb, dstStride);
64
- primitives.chroma[m_csp].pu[partEnum].p2s(refCr, refStride, dstCr, dstStride);
65
+ bool srcbufferAlignCheckC = (refPic.m_cuOffsetC[pu.ctuAddr] + refPic.m_buOffsetC[pu.cuAbsPartIdx + pu.puAbsPartIdx] + refOffset) % 64 == 0;
66
+ bool dstbufferAlignCheckC = dstSYuv.getChromaAddrOffset(pu.puAbsPartIdx) % 64 == 0;
67
+ primitives.chroma[m_csp].pu[partEnum].p2s[refStride % 64 == 0 && dstStride % 64 == 0 && srcbufferAlignCheckC && dstbufferAlignCheckC](refCb, refStride, dstCb, dstStride);
68
+ primitives.chroma[m_csp].pu[partEnum].p2s[refStride % 64 == 0 && dstStride % 64 == 0 && srcbufferAlignCheckC && dstbufferAlignCheckC](refCr, refStride, dstCr, dstStride);
69
}
70
else if (!yFrac)
71
{
72
x265_2.7.tar.gz/source/common/primitives.cpp -> x265_2.9.tar.gz/source/common/primitives.cpp
Changed
25
1
2
for (int i = 0; i < NUM_PU_SIZES; i++)
3
{
4
p.chroma[X265_CSP_I444].pu[i].copy_pp = p.pu[i].copy_pp;
5
- p.chroma[X265_CSP_I444].pu[i].addAvg = p.pu[i].addAvg;
6
+ p.chroma[X265_CSP_I444].pu[i].addAvg[NONALIGNED] = p.pu[i].addAvg[NONALIGNED];
7
+ p.chroma[X265_CSP_I444].pu[i].addAvg[ALIGNED] = p.pu[i].addAvg[ALIGNED];
8
p.chroma[X265_CSP_I444].pu[i].satd = p.pu[i].satd;
9
- p.chroma[X265_CSP_I444].pu[i].p2s = p.pu[i].convert_p2s;
10
+ p.chroma[X265_CSP_I444].pu[i].p2s[NONALIGNED] = p.pu[i].convert_p2s[NONALIGNED];
11
+ p.chroma[X265_CSP_I444].pu[i].p2s[ALIGNED] = p.pu[i].convert_p2s[ALIGNED];
12
}
13
14
for (int i = 0; i < NUM_CU_SIZES; i++)
15
16
p.chroma[X265_CSP_I444].cu[i].sa8d = p.cu[i].sa8d;
17
p.chroma[X265_CSP_I444].cu[i].sse_pp = p.cu[i].sse_pp;
18
p.chroma[X265_CSP_I444].cu[i].sub_ps = p.cu[i].sub_ps;
19
- p.chroma[X265_CSP_I444].cu[i].add_ps = p.cu[i].add_ps;
20
+ p.chroma[X265_CSP_I444].cu[i].add_ps[NONALIGNED] = p.cu[i].add_ps[NONALIGNED];
21
+ p.chroma[X265_CSP_I444].cu[i].add_ps[ALIGNED] = p.cu[i].add_ps[ALIGNED];
22
p.chroma[X265_CSP_I444].cu[i].copy_ps = p.cu[i].copy_ps;
23
p.chroma[X265_CSP_I444].cu[i].copy_sp = p.cu[i].copy_sp;
24
p.chroma[X265_CSP_I444].cu[i].copy_ss = p.cu[i].copy_ss;
25
x265_2.7.tar.gz/source/common/primitives.h -> x265_2.9.tar.gz/source/common/primitives.h
Changed
117
1
2
NUM_CU_SIZES
3
};
4
5
+enum AlignPrimitive
6
+{
7
+ NONALIGNED,
8
+ ALIGNED,
9
+ NUM_ALIGNMENT_TYPES
10
+};
11
+
12
enum { NUM_TR_SIZE = 4 }; // TU are 4x4, 8x8, 16x16, and 32x32
13
14
15
16
17
typedef void (*integralv_t)(uint32_t *sum, intptr_t stride);
18
typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);
19
-
20
+typedef void(*nonPsyRdoQuant_t)(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
21
+typedef void(*psyRdoQuant_t)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);
22
+typedef void(*psyRdoQuant_t1)(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost,uint32_t blkPos);
23
+typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);
24
/* Function pointers to optimized encoder primitives. Each pointer can reference
25
* either an assembly routine, a SIMD intrinsic primitive, or a C function */
26
struct EncoderPrimitives
27
28
filter_sp_t luma_vsp;
29
filter_ss_t luma_vss;
30
filter_hv_pp_t luma_hvpp; // combines hps + vsp
31
-
32
- pixelavg_pp_t pixelavg_pp; // quick bidir using pixels (borrowed from x264)
33
- addAvg_t addAvg; // bidir motion compensation, uses 16bit values
34
-
35
+ pixelavg_pp_t pixelavg_pp[NUM_ALIGNMENT_TYPES]; // quick bidir using pixels (borrowed from x264)
36
+ addAvg_t addAvg[NUM_ALIGNMENT_TYPES]; // bidir motion compensation, uses 16bit values
37
copy_pp_t copy_pp;
38
- filter_p2s_t convert_p2s;
39
+ filter_p2s_t convert_p2s[NUM_ALIGNMENT_TYPES];
40
}
41
pu[NUM_PU_SIZES];
42
43
44
dct_t standard_dct; // original dct function, used by lowpass_dct
45
dct_t lowpass_dct; // lowpass dct approximation
46
47
- calcresidual_t calcresidual;
48
+ calcresidual_t calcresidual[NUM_ALIGNMENT_TYPES];
49
pixel_sub_ps_t sub_ps;
50
- pixel_add_ps_t add_ps;
51
- blockfill_s_t blockfill_s; // block fill, for DC transforms
52
+ pixel_add_ps_t add_ps[NUM_ALIGNMENT_TYPES];
53
+ blockfill_s_t blockfill_s[NUM_ALIGNMENT_TYPES]; // block fill, for DC transforms
54
copy_cnt_t copy_cnt; // copy coeff while counting non-zero
55
count_nonzero_t count_nonzero;
56
cpy2Dto1D_shl_t cpy2Dto1D_shl;
57
cpy2Dto1D_shr_t cpy2Dto1D_shr;
58
- cpy1Dto2D_shl_t cpy1Dto2D_shl;
59
+ cpy1Dto2D_shl_t cpy1Dto2D_shl[NUM_ALIGNMENT_TYPES];
60
cpy1Dto2D_shr_t cpy1Dto2D_shr;
61
-
62
copy_sp_t copy_sp;
63
copy_ps_t copy_ps;
64
copy_ss_t copy_ss;
65
66
pixel_sse_t sse_pp; // Sum of Square Error (pixel, pixel) fenc alignment not assumed
67
pixel_sse_ss_t sse_ss; // Sum of Square Error (short, short) fenc alignment not assumed
68
pixelcmp_t psy_cost_pp; // difference in AC energy between two pixel blocks
69
- pixel_ssd_s_t ssd_s; // Sum of Square Error (residual coeff to self)
70
+ pixel_ssd_s_t ssd_s[NUM_ALIGNMENT_TYPES]; // Sum of Square Error (residual coeff to self)
71
pixelcmp_t sa8d; // Sum of Transformed Differences (8x8 Hadamard), uses satd for 4x4 intra TU
72
-
73
transpose_t transpose; // transpose pixel block; for use with intra all-angs
74
intra_allangs_t intra_pred_allangs;
75
intra_filter_t intra_filter;
76
intra_pred_t intra_pred[NUM_INTRA_MODE];
77
+ nonPsyRdoQuant_t nonPsyRdoQuant;
78
+ psyRdoQuant_t psyRdoQuant;
79
+ psyRdoQuant_t1 psyRdoQuant_1p;
80
+ psyRdoQuant_t2 psyRdoQuant_2p;
81
}
82
cu[NUM_CU_SIZES];
83
-
84
/* These remaining primitives work on either fixed block sizes or take
85
* block dimensions as arguments and thus do not belong in either the PU or
86
* the CU arrays */
87
88
dequant_scaling_t dequant_scaling;
89
dequant_normal_t dequant_normal;
90
denoiseDct_t denoiseDct;
91
- scale1D_t scale1D_128to64;
92
+ scale1D_t scale1D_128to64[NUM_ALIGNMENT_TYPES];
93
scale2D_t scale2D_64to32;
94
95
ssim_4x4x2_core_t ssim_4x4x2_core;
96
97
filter_ss_t filter_vss;
98
filter_pp_t filter_hpp;
99
filter_hps_t filter_hps;
100
- addAvg_t addAvg;
101
+ addAvg_t addAvg[NUM_ALIGNMENT_TYPES];
102
copy_pp_t copy_pp;
103
- filter_p2s_t p2s;
104
+ filter_p2s_t p2s[NUM_ALIGNMENT_TYPES];
105
106
}
107
pu[NUM_PU_SIZES];
108
109
pixelcmp_t sa8d; // if chroma CU is not multiple of 8x8, will use satd
110
pixel_sse_t sse_pp;
111
pixel_sub_ps_t sub_ps;
112
- pixel_add_ps_t add_ps;
113
+ pixel_add_ps_t add_ps[NUM_ALIGNMENT_TYPES];
114
115
copy_ps_t copy_ps;
116
copy_sp_t copy_sp;
117
x265_2.7.tar.gz/source/common/quant.cpp -> x265_2.9.tar.gz/source/common/quant.cpp
Changed
163
1
2
uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
3
{
4
const uint32_t sizeIdx = log2TrSize - 2;
5
-
6
if (cu.m_tqBypass[0])
7
{
8
- primitives.cu[sizeIdx].cpy1Dto2D_shl(residual, coeff, resiStride, 0);
9
+ primitives.cu[sizeIdx].cpy1Dto2D_shl[resiStride % 64 == 0](residual, coeff, resiStride, 0);
10
return;
11
}
12
-
13
// Values need to pass as input parameter in dequant
14
int rem = m_qpParam[ttype].rem;
15
int per = m_qpParam[ttype].per;
16
17
if (transformShift > 0)
18
primitives.cu[sizeIdx].cpy1Dto2D_shr(residual, m_resiDctCoeff, resiStride, transformShift);
19
else
20
- primitives.cu[sizeIdx].cpy1Dto2D_shl(residual, m_resiDctCoeff, resiStride, -transformShift);
21
+ primitives.cu[sizeIdx].cpy1Dto2D_shl[resiStride % 64 == 0](residual, m_resiDctCoeff, resiStride, -transformShift);
22
#endif
23
}
24
else
25
26
const int add_2nd = 1 << (shift_2nd - 1);
27
28
int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd;
29
- primitives.cu[sizeIdx].blockfill_s(residual, resiStride, (int16_t)dc_val);
30
+ primitives.cu[sizeIdx].blockfill_s[resiStride % 64 == 0](residual, resiStride, (int16_t)dc_val);
31
return;
32
}
33
34
35
X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n");
36
if (!numSig)
37
return 0;
38
-
39
const uint32_t trSize = 1 << log2TrSize;
40
int64_t lambda2 = m_qpParam[ttype].lambda2;
41
- const int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);
42
-
43
+ int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);
44
/* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4)
45
* scale applied that must be removed during unquant. Note that in real dequant there is clipping
46
* at several stages. We skip the clipping for simplicity when measuring RD cost */
47
48
for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
49
{
50
X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
51
-
52
uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
53
uint32_t blkPos = codeParams.scan[scanPosBase];
54
-
55
- // TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA
56
- for (int y = 0; y < MLS_CG_SIZE; y++)
57
+ bool enable512 = detect512();
58
+ if (enable512)
59
+ primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
60
+ else
61
{
62
- for (int x = 0; x < MLS_CG_SIZE; x++)
63
- {
64
- int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
65
- int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
66
-
67
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
68
-
69
- /* when no residual coefficient is coded, predicted coef == recon coef */
70
- costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
71
-
72
- totalUncodedCost += costUncoded[blkPos + x];
73
- totalRdCost += costUncoded[blkPos + x];
74
- }
75
- blkPos += trSize;
76
+ primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost,blkPos);
77
+ primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
78
}
79
}
80
}
81
82
for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
83
{
84
X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
85
-
86
uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
87
uint32_t blkPos = codeParams.scan[scanPosBase];
88
-
89
- for (int y = 0; y < MLS_CG_SIZE; y++)
90
- {
91
- for (int x = 0; x < MLS_CG_SIZE; x++)
92
- {
93
- int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
94
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
95
-
96
- totalUncodedCost += costUncoded[blkPos + x];
97
- totalRdCost += costUncoded[blkPos + x];
98
- }
99
- blkPos += trSize;
100
- }
101
+ primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
102
}
103
}
104
-
105
static const uint8_t table_cnt[5][SCAN_SET_SIZE] =
106
{
107
// patternSigCtx = 0
108
109
// TODO: does we need zero-coeff cost?
110
const uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
111
uint32_t blkPos = codeParams.scan[scanPosBase];
112
-
113
if (usePsyMask)
114
{
115
- // TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA
116
+ bool enable512 = detect512();
117
+
118
+ if (enable512)
119
+ primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
120
+ else
121
+ {
122
+ primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
123
+ primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
124
+ }
125
+ blkPos = codeParams.scan[scanPosBase];
126
for (int y = 0; y < MLS_CG_SIZE; y++)
127
{
128
for (int x = 0; x < MLS_CG_SIZE; x++)
129
{
130
- int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
131
- int predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
132
-
133
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
134
-
135
- /* when no residual coefficient is coded, predicted coef == recon coef */
136
- costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
137
-
138
- totalUncodedCost += costUncoded[blkPos + x];
139
- totalRdCost += costUncoded[blkPos + x];
140
-
141
const uint32_t scanPosOffset = y * MLS_CG_SIZE + x;
142
const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
143
X265_CHECK(trSize > 4, "trSize check failure\n");
144
145
else
146
{
147
// non-psy path
148
+ primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
149
+ blkPos = codeParams.scan[scanPosBase];
150
for (int y = 0; y < MLS_CG_SIZE; y++)
151
{
152
for (int x = 0; x < MLS_CG_SIZE; x++)
153
{
154
- int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
155
- costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
156
-
157
- totalUncodedCost += costUncoded[blkPos + x];
158
- totalRdCost += costUncoded[blkPos + x];
159
-
160
const uint32_t scanPosOffset = y * MLS_CG_SIZE + x;
161
const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
162
X265_CHECK(trSize > 4, "trSize check failure\n");
163
x265_2.7.tar.gz/source/common/slice.cpp -> x265_2.9.tar.gz/source/common/slice.cpp
Changed
10
1
2
for (int yuv = 0; yuv < 3; yuv++)
3
{
4
WeightParam& wp = m_weightPredTable[l][i][yuv];
5
- wp.bPresentFlag = false;
6
+ wp.wtPresent = 0;
7
wp.log2WeightDenom = 0;
8
wp.inputWeight = 1;
9
wp.inputOffset = 0;
10
x265_2.7.tar.gz/source/common/slice.h -> x265_2.9.tar.gz/source/common/slice.h
Changed
37
1
2
uint32_t log2WeightDenom;
3
int inputWeight;
4
int inputOffset;
5
- bool bPresentFlag;
6
+ int wtPresent;
7
8
/* makes a non-h265 weight (i.e. fix7), into an h265 weight */
9
void setFromWeightAndOffset(int w, int o, int denom, bool bNormalize)
10
11
(w).inputWeight = (s); \
12
(w).log2WeightDenom = (d); \
13
(w).inputOffset = (o); \
14
- (w).bPresentFlag = (b); \
15
+ (w).wtPresent = (b); \
16
}
17
18
class Slice
19
20
bool getRapPicFlag() const
21
{
22
return m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL
23
+ || m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_N_LP
24
|| m_nalUnitType == NAL_UNIT_CODED_SLICE_CRA;
25
}
26
-
27
bool getIdrPicFlag() const
28
{
29
- return m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL;
30
+ return m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL
31
+ || m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_N_LP;
32
}
33
-
34
bool isIRAP() const { return m_nalUnitType >= 16 && m_nalUnitType <= 23; }
35
36
bool isIntra() const { return m_sliceType == I_SLICE; }
37
x265_2.7.tar.gz/source/common/x86/asm-primitives.cpp -> x265_2.9.tar.gz/source/common/x86/asm-primitives.cpp
Changed
2936
1
2
p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d = PFX(pixel_sa8d_8x16_ ## cpu); \
3
p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = PFX(pixel_sa8d_16x32_ ## cpu); \
4
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = PFX(pixel_sa8d_32x64_ ## cpu)
5
-
6
#define PIXEL_AVG(cpu) \
7
- p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_ ## cpu); \
8
- p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_ ## cpu); \
9
- p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_ ## cpu); \
10
- p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_ ## cpu); \
11
- p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_ ## cpu); \
12
- p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_ ## cpu); \
13
- p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_ ## cpu); \
14
- p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_ ## cpu); \
15
- p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_ ## cpu); \
16
- p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_ ## cpu); \
17
- p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_24x32_ ## cpu); \
18
- p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_ ## cpu); \
19
- p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_ ## cpu); \
20
- p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_ ## cpu); \
21
- p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_ ## cpu); \
22
- p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_ ## cpu); \
23
- p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_ ## cpu); \
24
- p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_ ## cpu); \
25
- p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_8x32_ ## cpu); \
26
- p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_ ## cpu); \
27
- p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_ ## cpu); \
28
- p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_ ## cpu);
29
-
30
+ p.pu[LUMA_64x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x64_ ## cpu); \
31
+ p.pu[LUMA_64x48].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x48_ ## cpu); \
32
+ p.pu[LUMA_64x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x32_ ## cpu); \
33
+ p.pu[LUMA_64x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x16_ ## cpu); \
34
+ p.pu[LUMA_48x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_48x64_ ## cpu); \
35
+ p.pu[LUMA_32x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x64_ ## cpu); \
36
+ p.pu[LUMA_32x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x32_ ## cpu); \
37
+ p.pu[LUMA_32x24].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x24_ ## cpu); \
38
+ p.pu[LUMA_32x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x16_ ## cpu); \
39
+ p.pu[LUMA_32x8].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x8_ ## cpu); \
40
+ p.pu[LUMA_24x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_24x32_ ## cpu); \
41
+ p.pu[LUMA_16x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x64_ ## cpu); \
42
+ p.pu[LUMA_16x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x32_ ## cpu); \
43
+ p.pu[LUMA_16x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x16_ ## cpu); \
44
+ p.pu[LUMA_16x12].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x12_ ## cpu); \
45
+ p.pu[LUMA_16x8].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x8_ ## cpu); \
46
+ p.pu[LUMA_16x4].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x4_ ## cpu); \
47
+ p.pu[LUMA_12x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_12x16_ ## cpu); \
48
+ p.pu[LUMA_8x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_8x32_ ## cpu); \
49
+ p.pu[LUMA_8x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_8x16_ ## cpu); \
50
+ p.pu[LUMA_8x8].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_8x8_ ## cpu); \
51
+ p.pu[LUMA_8x4].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_8x4_ ## cpu); \
52
+ p.pu[LUMA_64x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_64x64_ ## cpu); \
53
+ p.pu[LUMA_64x48].pixelavg_pp[ALIGNED] = PFX(pixel_avg_64x48_ ## cpu); \
54
+ p.pu[LUMA_64x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_64x32_ ## cpu); \
55
+ p.pu[LUMA_64x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_64x16_ ## cpu); \
56
+ p.pu[LUMA_48x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_48x64_ ## cpu); \
57
+ p.pu[LUMA_32x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_32x64_ ## cpu); \
58
+ p.pu[LUMA_32x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_32x32_ ## cpu); \
59
+ p.pu[LUMA_32x24].pixelavg_pp[ALIGNED] = PFX(pixel_avg_32x24_ ## cpu); \
60
+ p.pu[LUMA_32x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_32x16_ ## cpu); \
61
+ p.pu[LUMA_32x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_32x8_ ## cpu); \
62
+ p.pu[LUMA_24x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_24x32_ ## cpu); \
63
+ p.pu[LUMA_16x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x64_ ## cpu); \
64
+ p.pu[LUMA_16x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x32_ ## cpu); \
65
+ p.pu[LUMA_16x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x16_ ## cpu); \
66
+ p.pu[LUMA_16x12].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x12_ ## cpu); \
67
+ p.pu[LUMA_16x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x8_ ## cpu); \
68
+ p.pu[LUMA_16x4].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x4_ ## cpu); \
69
+ p.pu[LUMA_12x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_12x16_ ## cpu); \
70
+ p.pu[LUMA_8x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_8x32_ ## cpu); \
71
+ p.pu[LUMA_8x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_8x16_ ## cpu); \
72
+ p.pu[LUMA_8x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_8x8_ ## cpu); \
73
+ p.pu[LUMA_8x4].pixelavg_pp[ALIGNED] = PFX(pixel_avg_8x4_ ## cpu);
74
#define PIXEL_AVG_W4(cpu) \
75
- p.pu[LUMA_4x4].pixelavg_pp = PFX(pixel_avg_4x4_ ## cpu); \
76
- p.pu[LUMA_4x8].pixelavg_pp = PFX(pixel_avg_4x8_ ## cpu); \
77
- p.pu[LUMA_4x16].pixelavg_pp = PFX(pixel_avg_4x16_ ## cpu);
78
-
79
+ p.pu[LUMA_4x4].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_4x4_ ## cpu); \
80
+ p.pu[LUMA_4x8].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_4x8_ ## cpu); \
81
+ p.pu[LUMA_4x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_4x16_ ## cpu); \
82
+ p.pu[LUMA_4x4].pixelavg_pp[ALIGNED] = PFX(pixel_avg_4x4_ ## cpu); \
83
+ p.pu[LUMA_4x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_4x8_ ## cpu); \
84
+ p.pu[LUMA_4x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_4x16_ ## cpu);
85
#define CHROMA_420_FILTERS(cpu) \
86
ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
87
ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, cpu); \
88
89
90
#define LUMA_PIXELSUB(cpu) \
91
p.cu[BLOCK_4x4].sub_ps = PFX(pixel_sub_ps_4x4_ ## cpu); \
92
- p.cu[BLOCK_4x4].add_ps = PFX(pixel_add_ps_4x4_ ## cpu); \
93
+ p.cu[BLOCK_4x4].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x4_ ## cpu); \
94
+ p.cu[BLOCK_4x4].add_ps[ALIGNED] = PFX(pixel_add_ps_4x4_ ## cpu); \
95
ALL_LUMA_CU(sub_ps, pixel_sub_ps, cpu); \
96
- ALL_LUMA_CU(add_ps, pixel_add_ps, cpu);
97
+ ALL_LUMA_CU(add_ps[NONALIGNED], pixel_add_ps, cpu); \
98
+ ALL_LUMA_CU(add_ps[ALIGNED], pixel_add_ps, cpu);
99
100
#define CHROMA_420_PIXELSUB_PS(cpu) \
101
ALL_CHROMA_420_CU(sub_ps, pixel_sub_ps, cpu); \
102
- ALL_CHROMA_420_CU(add_ps, pixel_add_ps, cpu);
103
+ ALL_CHROMA_420_CU(add_ps[NONALIGNED], pixel_add_ps, cpu); \
104
+ ALL_CHROMA_420_CU(add_ps[ALIGNED], pixel_add_ps, cpu);
105
106
#define CHROMA_422_PIXELSUB_PS(cpu) \
107
ALL_CHROMA_422_CU(sub_ps, pixel_sub_ps, cpu); \
108
- ALL_CHROMA_422_CU(add_ps, pixel_add_ps, cpu);
109
+ ALL_CHROMA_422_CU(add_ps[NONALIGNED], pixel_add_ps, cpu); \
110
+ ALL_CHROMA_422_CU(add_ps[ALIGNED], pixel_add_ps, cpu);
111
112
#define LUMA_VAR(cpu) ALL_LUMA_CU(var, pixel_var, cpu)
113
114
-#define LUMA_ADDAVG(cpu) ALL_LUMA_PU(addAvg, addAvg, cpu); p.pu[LUMA_4x4].addAvg = PFX(addAvg_4x4_ ## cpu)
115
-#define CHROMA_420_ADDAVG(cpu) ALL_CHROMA_420_PU(addAvg, addAvg, cpu);
116
-#define CHROMA_422_ADDAVG(cpu) ALL_CHROMA_422_PU(addAvg, addAvg, cpu);
117
+#define LUMA_ADDAVG(cpu) ALL_LUMA_PU(addAvg[NONALIGNED], addAvg, cpu); \
118
+ p.pu[LUMA_4x4].addAvg[NONALIGNED] = PFX(addAvg_4x4_ ## cpu); \
119
+ ALL_LUMA_PU(addAvg[ALIGNED], addAvg, cpu); \
120
+ p.pu[LUMA_4x4].addAvg[ALIGNED] = PFX(addAvg_4x4_ ## cpu)
121
+#define CHROMA_420_ADDAVG(cpu) ALL_CHROMA_420_PU(addAvg[NONALIGNED], addAvg, cpu); \
122
+ ALL_CHROMA_420_PU(addAvg[ALIGNED], addAvg, cpu)
123
+#define CHROMA_422_ADDAVG(cpu) ALL_CHROMA_422_PU(addAvg[NONALIGNED], addAvg, cpu); \
124
+ ALL_CHROMA_422_PU(addAvg[ALIGNED], addAvg, cpu)
125
126
#define SETUP_INTRA_ANG_COMMON(mode, fno, cpu) \
127
p.cu[BLOCK_4x4].intra_pred[mode] = PFX(intra_pred_ang4_ ## fno ## _ ## cpu); \
128
129
ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
130
ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, cpu);
131
132
+#define ASSIGN2(func, fname) \
133
+ func[ALIGNED] = PFX(fname); \
134
+ func[NONALIGNED] = PFX(fname)
135
+
136
namespace X265_NS {
137
// private x265 namespace
138
139
140
141
void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main10
142
{
143
-#if !defined(X86_64)
144
-#error "Unsupported build configuration (32bit x86 and HIGH_BIT_DEPTH), you must configure ENABLE_ASSEMBLY=OFF"
145
-#endif
146
-
147
#if X86_64
148
p.scanPosLast = PFX(scanPosLast_x64);
149
#endif
150
151
CHROMA_422_VERT_FILTERS(_sse2);
152
CHROMA_444_VERT_FILTERS(sse2);
153
154
+#if X86_64
155
ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
156
p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_sse2);
157
ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
158
p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_sse2);
159
ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, sse2);
160
ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, sse2);
161
+#endif
162
163
p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_sse2);
164
p.ssim_end_4 = PFX(pixel_ssim_end4_sse2);
165
- PIXEL_AVG(sse2);
166
+ ASSIGN2(p.pu[LUMA_64x64].pixelavg_pp, pixel_avg_64x64_sse2);
167
+ ASSIGN2(p.pu[LUMA_64x48].pixelavg_pp, pixel_avg_64x48_sse2);
168
+ ASSIGN2(p.pu[LUMA_64x32].pixelavg_pp, pixel_avg_64x32_sse2);
169
+ ASSIGN2(p.pu[LUMA_64x16].pixelavg_pp, pixel_avg_64x16_sse2);
170
+ ASSIGN2(p.pu[LUMA_48x64].pixelavg_pp, pixel_avg_48x64_sse2);
171
+ ASSIGN2(p.pu[LUMA_32x64].pixelavg_pp, pixel_avg_32x64_sse2);
172
+ ASSIGN2(p.pu[LUMA_32x32].pixelavg_pp, pixel_avg_32x32_sse2);
173
+ ASSIGN2(p.pu[LUMA_32x24].pixelavg_pp, pixel_avg_32x24_sse2);
174
+ ASSIGN2(p.pu[LUMA_32x16].pixelavg_pp, pixel_avg_32x16_sse2);
175
+ ASSIGN2(p.pu[LUMA_32x8].pixelavg_pp, pixel_avg_32x8_sse2);
176
+ ASSIGN2(p.pu[LUMA_24x32].pixelavg_pp, pixel_avg_24x32_sse2);
177
+ ASSIGN2(p.pu[LUMA_16x64].pixelavg_pp, pixel_avg_16x64_sse2);
178
+ ASSIGN2(p.pu[LUMA_16x32].pixelavg_pp, pixel_avg_16x32_sse2);
179
+ ASSIGN2(p.pu[LUMA_16x16].pixelavg_pp, pixel_avg_16x16_sse2);
180
+ ASSIGN2(p.pu[LUMA_16x12].pixelavg_pp, pixel_avg_16x12_sse2);
181
+ ASSIGN2(p.pu[LUMA_16x8].pixelavg_pp, pixel_avg_16x8_sse2);
182
+ ASSIGN2(p.pu[LUMA_16x4].pixelavg_pp, pixel_avg_16x4_sse2);
183
+ ASSIGN2(p.pu[LUMA_12x16].pixelavg_pp, pixel_avg_12x16_sse2);
184
+#if X86_64
185
+ ASSIGN2(p.pu[LUMA_8x32].pixelavg_pp, pixel_avg_8x32_sse2);
186
+ ASSIGN2(p.pu[LUMA_8x16].pixelavg_pp, pixel_avg_8x16_sse2);
187
+ ASSIGN2(p.pu[LUMA_8x8].pixelavg_pp, pixel_avg_8x8_sse2);
188
+ ASSIGN2(p.pu[LUMA_8x4].pixelavg_pp, pixel_avg_8x4_sse2);
189
+#endif
190
PIXEL_AVG_W4(mmx2);
191
LUMA_VAR(sse2);
192
193
194
- ALL_LUMA_TU(blockfill_s, blockfill_s, sse2);
195
+ ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, sse2);
196
+ ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, sse2);
197
ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2);
198
- ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, sse2);
199
+ ALL_LUMA_TU_S(cpy1Dto2D_shl[ALIGNED], cpy1Dto2D_shl_, sse2);
200
+ ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, sse2);
201
ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2);
202
ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2);
203
- ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2);
204
- ALL_LUMA_TU_S(calcresidual, getResidual, sse2);
205
+#if X86_64
206
+ ASSIGN2(p.cu[BLOCK_4x4].ssd_s,pixel_ssd_s_4_sse2 );
207
+ ASSIGN2(p.cu[BLOCK_8x8].ssd_s,pixel_ssd_s_8_sse2);
208
+ ASSIGN2(p.cu[BLOCK_16x16].ssd_s,pixel_ssd_s_16_sse2);
209
+ ASSIGN2(p.cu[BLOCK_32x32].ssd_s,pixel_ssd_s_32_sse2 );
210
+#endif
211
+ ALL_LUMA_TU_S(calcresidual[ALIGNED], getResidual, sse2);
212
+ ALL_LUMA_TU_S(calcresidual[NONALIGNED], getResidual, sse2);
213
ALL_LUMA_TU_S(transpose, transpose, sse2);
214
215
p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse2);
216
p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse2);
217
p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse2);
218
+#if X86_64
219
p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_sse2);
220
ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2);
221
-
222
+#endif
223
p.cu[BLOCK_4x4].intra_pred[2] = PFX(intra_pred_ang4_2_sse2);
224
p.cu[BLOCK_4x4].intra_pred[3] = PFX(intra_pred_ang4_3_sse2);
225
p.cu[BLOCK_4x4].intra_pred[4] = PFX(intra_pred_ang4_4_sse2);
226
227
p.cu[BLOCK_4x4].intra_pred[23] = PFX(intra_pred_ang4_23_sse2);
228
p.cu[BLOCK_4x4].intra_pred[24] = PFX(intra_pred_ang4_24_sse2);
229
p.cu[BLOCK_4x4].intra_pred[25] = PFX(intra_pred_ang4_25_sse2);
230
+#if X86_64
231
p.cu[BLOCK_4x4].intra_pred[26] = PFX(intra_pred_ang4_26_sse2);
232
+#endif
233
p.cu[BLOCK_4x4].intra_pred[27] = PFX(intra_pred_ang4_27_sse2);
234
p.cu[BLOCK_4x4].intra_pred[28] = PFX(intra_pred_ang4_28_sse2);
235
p.cu[BLOCK_4x4].intra_pred[29] = PFX(intra_pred_ang4_29_sse2);
236
237
p.cu[BLOCK_4x4].intra_pred[32] = PFX(intra_pred_ang4_32_sse2);
238
p.cu[BLOCK_4x4].intra_pred[33] = PFX(intra_pred_ang4_33_sse2);
239
240
+#if X86_64 && X265_DEPTH <= 10
241
+ p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
242
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_sse2);
243
p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_4x8_mmx2);
244
p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_8x16_sse2);
245
p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_sse2);
246
-#if X265_DEPTH <= 10
247
- p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
248
- ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
249
+
250
+ p.cu[BLOCK_8x8].sse_ss = PFX(pixel_ssd_ss_8x8_sse2);
251
+ p.cu[BLOCK_16x16].sse_ss = PFX(pixel_ssd_ss_16x16_sse2);
252
+ p.cu[BLOCK_32x32].sse_ss = PFX(pixel_ssd_ss_32x32_sse2);
253
+ p.cu[BLOCK_64x64].sse_ss = PFX(pixel_ssd_ss_64x64_sse2);
254
#endif
255
p.cu[BLOCK_4x4].dct = PFX(dct4_sse2);
256
p.cu[BLOCK_8x8].dct = PFX(dct8_sse2);
257
p.cu[BLOCK_4x4].idct = PFX(idct4_sse2);
258
+#if X86_64
259
p.cu[BLOCK_8x8].idct = PFX(idct8_sse2);
260
-
261
+#endif
262
p.idst4x4 = PFX(idst4_sse2);
263
p.dst4x4 = PFX(dst4_sse2);
264
265
266
//p.planecopy_sp = PFX(downShift_16_sse2);
267
p.planecopy_sp_shl = PFX(upShift_16_sse2);
268
269
- ALL_CHROMA_420_PU(p2s, filterPixelToShort, sse2);
270
- ALL_CHROMA_422_PU(p2s, filterPixelToShort, sse2);
271
- ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
272
- ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
273
+ ALL_CHROMA_420_PU(p2s[ALIGNED], filterPixelToShort, sse2);
274
+ ALL_CHROMA_422_PU(p2s[ALIGNED], filterPixelToShort, sse2);
275
+ ALL_CHROMA_444_PU(p2s[ALIGNED], filterPixelToShort, sse2);
276
+ ALL_LUMA_PU(convert_p2s[ALIGNED], filterPixelToShort, sse2);
277
+ ALL_CHROMA_420_PU(p2s[NONALIGNED], filterPixelToShort, sse2);
278
+ ALL_CHROMA_422_PU(p2s[NONALIGNED], filterPixelToShort, sse2);
279
+ ALL_CHROMA_444_PU(p2s[NONALIGNED], filterPixelToShort, sse2);
280
+ ALL_LUMA_PU(convert_p2s[NONALIGNED], filterPixelToShort, sse2);
281
ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
282
p.propagateCost = PFX(mbtree_propagate_cost_sse2);
283
}
284
if (cpuMask & X265_CPU_SSE3)
285
{
286
+#if X86_64
287
ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
288
ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
289
ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
290
ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, sse3);
291
ALL_CHROMA_422_PU(filter_hps, interp_4tap_horiz_ps, sse3);
292
ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, sse3);
293
+#endif
294
}
295
if (cpuMask & X265_CPU_SSSE3)
296
{
297
- p.scale1D_128to64 = PFX(scale1D_128to64_ssse3);
298
+ ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
299
p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
300
301
// p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_ssse3); this one is broken
302
303
304
p.frameInitLowres = PFX(frame_init_lowres_core_ssse3);
305
306
- ALL_LUMA_PU(convert_p2s, filterPixelToShort, ssse3);
307
+ ALL_LUMA_PU(convert_p2s[ALIGNED], filterPixelToShort, ssse3);
308
+ ALL_LUMA_PU(convert_p2s[NONALIGNED], filterPixelToShort, ssse3);
309
+
310
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].p2s, filterPixelToShort_4x4_ssse3);
311
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].p2s, filterPixelToShort_4x8_ssse3);
312
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].p2s, filterPixelToShort_4x16_ssse3);
313
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].p2s, filterPixelToShort_8x4_ssse3);
314
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].p2s, filterPixelToShort_8x8_ssse3);
315
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].p2s, filterPixelToShort_8x16_ssse3);
316
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].p2s, filterPixelToShort_8x32_ssse3);
317
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].p2s, filterPixelToShort_16x4_ssse3);
318
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s, filterPixelToShort_16x8_ssse3);
319
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s, filterPixelToShort_16x12_ssse3);
320
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s, filterPixelToShort_16x16_ssse3);
321
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s, filterPixelToShort_16x32_ssse3);
322
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s, filterPixelToShort_32x8_ssse3);
323
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s, filterPixelToShort_32x16_ssse3);
324
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s, filterPixelToShort_32x24_ssse3);
325
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s, filterPixelToShort_32x32_ssse3);
326
+
327
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].p2s, filterPixelToShort_4x4_ssse3);
328
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].p2s, filterPixelToShort_4x8_ssse3);
329
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].p2s, filterPixelToShort_4x16_ssse3);
330
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].p2s, filterPixelToShort_4x32_ssse3);
331
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].p2s, filterPixelToShort_8x4_ssse3);
332
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].p2s, filterPixelToShort_8x8_ssse3);
333
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].p2s, filterPixelToShort_8x12_ssse3);
334
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].p2s, filterPixelToShort_8x16_ssse3);
335
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].p2s, filterPixelToShort_8x32_ssse3);
336
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].p2s, filterPixelToShort_8x64_ssse3);
337
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].p2s, filterPixelToShort_12x32_ssse3);
338
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s, filterPixelToShort_16x8_ssse3);
339
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s, filterPixelToShort_16x16_ssse3);
340
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s, filterPixelToShort_16x24_ssse3);
341
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s, filterPixelToShort_16x32_ssse3);
342
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s, filterPixelToShort_16x64_ssse3);
343
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s, filterPixelToShort_24x64_ssse3);
344
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s, filterPixelToShort_32x16_ssse3);
345
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s, filterPixelToShort_32x32_ssse3);
346
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s, filterPixelToShort_32x48_ssse3);
347
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s, filterPixelToShort_32x64_ssse3);
348
+
349
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].p2s, filterPixelToShort_4x2_ssse3);
350
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s, filterPixelToShort_8x2_ssse3);
351
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s, filterPixelToShort_8x6_ssse3);
352
353
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].p2s = PFX(filterPixelToShort_4x4_ssse3);
354
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].p2s = PFX(filterPixelToShort_4x8_ssse3);
355
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].p2s = PFX(filterPixelToShort_4x16_ssse3);
356
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].p2s = PFX(filterPixelToShort_8x4_ssse3);
357
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].p2s = PFX(filterPixelToShort_8x8_ssse3);
358
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].p2s = PFX(filterPixelToShort_8x16_ssse3);
359
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].p2s = PFX(filterPixelToShort_8x32_ssse3);
360
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].p2s = PFX(filterPixelToShort_16x4_ssse3);
361
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s = PFX(filterPixelToShort_16x8_ssse3);
362
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s = PFX(filterPixelToShort_16x12_ssse3);
363
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s = PFX(filterPixelToShort_16x16_ssse3);
364
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s = PFX(filterPixelToShort_16x32_ssse3);
365
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_ssse3);
366
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_ssse3);
367
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_ssse3);
368
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_ssse3);
369
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].p2s = PFX(filterPixelToShort_4x4_ssse3);
370
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].p2s = PFX(filterPixelToShort_4x8_ssse3);
371
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].p2s = PFX(filterPixelToShort_4x16_ssse3);
372
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].p2s = PFX(filterPixelToShort_4x32_ssse3);
373
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].p2s = PFX(filterPixelToShort_8x4_ssse3);
374
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].p2s = PFX(filterPixelToShort_8x8_ssse3);
375
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].p2s = PFX(filterPixelToShort_8x12_ssse3);
376
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].p2s = PFX(filterPixelToShort_8x16_ssse3);
377
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].p2s = PFX(filterPixelToShort_8x32_ssse3);
378
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].p2s = PFX(filterPixelToShort_8x64_ssse3);
379
- p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].p2s = PFX(filterPixelToShort_12x32_ssse3);
380
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s = PFX(filterPixelToShort_16x8_ssse3);
381
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s = PFX(filterPixelToShort_16x16_ssse3);
382
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s = PFX(filterPixelToShort_16x24_ssse3);
383
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s = PFX(filterPixelToShort_16x32_ssse3);
384
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s = PFX(filterPixelToShort_16x64_ssse3);
385
- p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s = PFX(filterPixelToShort_24x64_ssse3);
386
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_ssse3);
387
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_ssse3);
388
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_ssse3);
389
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_ssse3);
390
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].p2s = PFX(filterPixelToShort_4x2_ssse3);
391
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s = PFX(filterPixelToShort_8x2_ssse3);
392
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s = PFX(filterPixelToShort_8x6_ssse3);
393
p.findPosFirstLast = PFX(findPosFirstLast_ssse3);
394
p.fix8Unpack = PFX(cutree_fix8_unpack_ssse3);
395
p.fix8Pack = PFX(cutree_fix8_pack_ssse3);
396
}
397
if (cpuMask & X265_CPU_SSE4)
398
{
399
+#if X86_64
400
p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
401
p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4);
402
p.pelFilterChroma[0] = PFX(pelFilterChroma_V_sse4);
403
p.pelFilterChroma[1] = PFX(pelFilterChroma_H_sse4);
404
-
405
p.saoCuOrgE0 = PFX(saoCuOrgE0_sse4);
406
+#endif
407
p.saoCuOrgE1 = PFX(saoCuOrgE1_sse4);
408
p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_sse4);
409
p.saoCuOrgE2[0] = PFX(saoCuOrgE2_sse4);
410
411
CHROMA_422_ADDAVG(sse4);
412
413
LUMA_FILTERS(sse4);
414
+
415
+#if X86_64
416
+ p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_sse4);
417
+ p.pu[LUMA_4x8].luma_hpp = PFX(interp_8tap_horiz_pp_4x8_sse4);
418
+ p.pu[LUMA_4x16].luma_hpp = PFX(interp_8tap_horiz_pp_4x16_sse4);
419
+ p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_sse4);
420
+ p.pu[LUMA_4x8].luma_hps = PFX(interp_8tap_horiz_ps_4x8_sse4);
421
+ p.pu[LUMA_4x16].luma_hps = PFX(interp_8tap_horiz_ps_4x16_sse4);
422
+#endif
423
+
424
+ p.pu[LUMA_8x8].luma_hpp = PFX(interp_8tap_horiz_pp_8x8_sse4);
425
+ p.pu[LUMA_16x16].luma_hpp = PFX(interp_8tap_horiz_pp_16x16_sse4);
426
+ p.pu[LUMA_32x32].luma_hpp = PFX(interp_8tap_horiz_pp_32x32_sse4);
427
+ p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_sse4);
428
+ p.pu[LUMA_8x4].luma_hpp = PFX(interp_8tap_horiz_pp_8x4_sse4);
429
+
430
+ p.pu[LUMA_16x8].luma_hpp = PFX(interp_8tap_horiz_pp_16x8_sse4);
431
+ p.pu[LUMA_8x16].luma_hpp = PFX(interp_8tap_horiz_pp_8x16_sse4);
432
+ p.pu[LUMA_16x32].luma_hpp = PFX(interp_8tap_horiz_pp_16x32_sse4);
433
+ p.pu[LUMA_32x16].luma_hpp = PFX(interp_8tap_horiz_pp_32x16_sse4);
434
+ p.pu[LUMA_64x32].luma_hpp = PFX(interp_8tap_horiz_pp_64x32_sse4);
435
+ p.pu[LUMA_32x64].luma_hpp = PFX(interp_8tap_horiz_pp_32x64_sse4);
436
+ p.pu[LUMA_16x12].luma_hpp = PFX(interp_8tap_horiz_pp_16x12_sse4);
437
+ p.pu[LUMA_12x16].luma_hpp = PFX(interp_8tap_horiz_pp_12x16_sse4);
438
+ p.pu[LUMA_16x4].luma_hpp = PFX(interp_8tap_horiz_pp_16x4_sse4);
439
+
440
+ p.pu[LUMA_32x24].luma_hpp = PFX(interp_8tap_horiz_pp_32x24_sse4);
441
+ p.pu[LUMA_24x32].luma_hpp = PFX(interp_8tap_horiz_pp_24x32_sse4);
442
+ p.pu[LUMA_32x8].luma_hpp = PFX(interp_8tap_horiz_pp_32x8_sse4);
443
+ p.pu[LUMA_8x32].luma_hpp = PFX(interp_8tap_horiz_pp_8x32_sse4);
444
+ p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_sse4);
445
+ p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_sse4);
446
+ p.pu[LUMA_64x16].luma_hpp = PFX(interp_8tap_horiz_pp_64x16_sse4);
447
+ p.pu[LUMA_16x64].luma_hpp = PFX(interp_8tap_horiz_pp_16x64_sse4);
448
+
449
+ p.pu[LUMA_8x8].luma_hps = PFX(interp_8tap_horiz_ps_8x8_sse4);
450
+ p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_sse4);
451
+ p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_sse4);
452
+ p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_sse4);
453
+ p.pu[LUMA_8x4].luma_hps = PFX(interp_8tap_horiz_ps_8x4_sse4);
454
+ p.pu[LUMA_16x8].luma_hps = PFX(interp_8tap_horiz_ps_16x8_sse4);
455
+ p.pu[LUMA_8x16].luma_hps = PFX(interp_8tap_horiz_ps_8x16_sse4);
456
+ p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_sse4);
457
+ p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_sse4);
458
+ p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_sse4);
459
+ p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_sse4);
460
+ p.pu[LUMA_16x12].luma_hps = PFX(interp_8tap_horiz_ps_16x12_sse4);
461
+ p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_sse4);
462
+ p.pu[LUMA_16x4].luma_hps = PFX(interp_8tap_horiz_ps_16x4_sse4);
463
+ p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_sse4);
464
+ p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_sse4);
465
+ p.pu[LUMA_32x8].luma_hps = PFX(interp_8tap_horiz_ps_32x8_sse4);
466
+ p.pu[LUMA_8x32].luma_hps = PFX(interp_8tap_horiz_ps_8x32_sse4);
467
+ p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_sse4);
468
+ p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_sse4);
469
+ p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_sse4);
470
+ p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_sse4);
471
+
472
+ ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, sse4); p.pu[LUMA_4x4].luma_vpp = PFX(interp_8tap_vert_pp_4x4_sse4);
473
+ ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, sse4); p.pu[LUMA_4x4].luma_vps = PFX(interp_8tap_vert_ps_4x4_sse4);
474
+ ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, sse4); p.pu[LUMA_4x4].luma_vsp = PFX(interp_8tap_vert_sp_4x4_sse4);
475
+ ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu); p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;
476
CHROMA_420_HORIZ_FILTERS(sse4);
477
CHROMA_420_VERT_FILTERS_SSE4(_sse4);
478
CHROMA_422_HORIZ_FILTERS(_sse4);
479
480
481
// TODO: check POPCNT flag!
482
ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
483
-#if X265_DEPTH <= 10
484
+#if X86_64 && X265_DEPTH <= 10
485
ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
486
#endif
487
488
- p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s = PFX(filterPixelToShort_2x4_sse4);
489
- p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
490
- p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].p2s = PFX(filterPixelToShort_6x8_sse4);
491
- p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
492
- p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s = PFX(filterPixelToShort_2x16_sse4);
493
- p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
494
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s[NONALIGNED] = PFX(filterPixelToShort_2x4_sse4);
495
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s[NONALIGNED] = PFX(filterPixelToShort_2x8_sse4);
496
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].p2s[NONALIGNED] = PFX(filterPixelToShort_6x8_sse4);
497
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s[NONALIGNED] = PFX(filterPixelToShort_2x8_sse4);
498
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s[NONALIGNED] = PFX(filterPixelToShort_2x16_sse4);
499
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s[NONALIGNED] = PFX(filterPixelToShort_6x16_sse4);
500
p.costCoeffRemain = PFX(costCoeffRemain_sse4);
501
#if X86_64
502
p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
503
504
p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
505
#endif
506
}
507
+#if X86_64
508
if (cpuMask & X265_CPU_AVX)
509
{
510
// p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_avx); fails tests
511
512
p.cu[BLOCK_32x32].intra_pred[32] = PFX(intra_pred_ang32_32_avx2);
513
p.cu[BLOCK_32x32].intra_pred[33] = PFX(intra_pred_ang32_33_avx2);
514
p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_2_avx2);
515
-
516
- p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_avx2);
517
- p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2);
518
- p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2);
519
- p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_avx2);
520
- p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx2);
521
- p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx2);
522
- p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx2);
523
- p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_24x32_avx2);
524
- p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_avx2);
525
- p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_avx2);
526
- p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_avx2);
527
- p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_avx2);
528
- p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_avx2);
529
- p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx2);
530
- p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx2);
531
- p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx2);
532
- p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx2);
533
- p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx2);
534
-
535
- p.pu[LUMA_8x4].addAvg = PFX(addAvg_8x4_avx2);
536
- p.pu[LUMA_8x8].addAvg = PFX(addAvg_8x8_avx2);
537
- p.pu[LUMA_8x16].addAvg = PFX(addAvg_8x16_avx2);
538
- p.pu[LUMA_8x32].addAvg = PFX(addAvg_8x32_avx2);
539
- p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_avx2);
540
- p.pu[LUMA_16x4].addAvg = PFX(addAvg_16x4_avx2);
541
- p.pu[LUMA_16x8].addAvg = PFX(addAvg_16x8_avx2);
542
- p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_avx2);
543
- p.pu[LUMA_16x16].addAvg = PFX(addAvg_16x16_avx2);
544
- p.pu[LUMA_16x32].addAvg = PFX(addAvg_16x32_avx2);
545
- p.pu[LUMA_16x64].addAvg = PFX(addAvg_16x64_avx2);
546
- p.pu[LUMA_24x32].addAvg = PFX(addAvg_24x32_avx2);
547
- p.pu[LUMA_32x8].addAvg = PFX(addAvg_32x8_avx2);
548
- p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_avx2);
549
- p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_avx2);
550
- p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_avx2);
551
- p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_avx2);
552
- p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_avx2);
553
- p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_avx2);
554
- p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx2);
555
- p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx2);
556
- p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx2);
557
-
558
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg = PFX(addAvg_8x2_avx2);
559
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg = PFX(addAvg_8x4_avx2);
560
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg = PFX(addAvg_8x6_avx2);
561
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg = PFX(addAvg_8x8_avx2);
562
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg = PFX(addAvg_8x16_avx2);
563
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg = PFX(addAvg_8x32_avx2);
564
- p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = PFX(addAvg_12x16_avx2);
565
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg = PFX(addAvg_16x4_avx2);
566
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg = PFX(addAvg_16x8_avx2);
567
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = PFX(addAvg_16x12_avx2);
568
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg = PFX(addAvg_16x16_avx2);
569
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg = PFX(addAvg_16x32_avx2);
570
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg = PFX(addAvg_32x8_avx2);
571
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_avx2);
572
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_avx2);
573
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_avx2);
574
-
575
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg = PFX(addAvg_8x16_avx2);
576
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg = PFX(addAvg_16x32_avx2);
577
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_avx2);
578
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg = PFX(addAvg_8x8_avx2);
579
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg = PFX(addAvg_16x16_avx2);
580
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg = PFX(addAvg_8x32_avx2);
581
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_avx2);
582
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg = PFX(addAvg_16x64_avx2);
583
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg = PFX(addAvg_8x12_avx2);
584
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg = PFX(addAvg_8x4_avx2);
585
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg = PFX(addAvg_16x24_avx2);
586
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg = PFX(addAvg_16x8_avx2);
587
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg = PFX(addAvg_8x64_avx2);
588
- p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg = PFX(addAvg_24x64_avx2);
589
- p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg = PFX(addAvg_12x32_avx2);
590
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_avx2);
591
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_avx2);
592
+ ASSIGN2(p.pu[LUMA_12x16].pixelavg_pp, pixel_avg_12x16_avx2);
593
+ ASSIGN2(p.pu[LUMA_16x4].pixelavg_pp, pixel_avg_16x4_avx2);
594
+ ASSIGN2(p.pu[LUMA_16x8].pixelavg_pp, pixel_avg_16x8_avx2);
595
+ ASSIGN2(p.pu[LUMA_16x12].pixelavg_pp, pixel_avg_16x12_avx2);
596
+ ASSIGN2(p.pu[LUMA_16x16].pixelavg_pp, pixel_avg_16x16_avx2);
597
+ ASSIGN2(p.pu[LUMA_16x32].pixelavg_pp, pixel_avg_16x32_avx2);
598
+ ASSIGN2(p.pu[LUMA_16x64].pixelavg_pp, pixel_avg_16x64_avx2);
599
+ ASSIGN2(p.pu[LUMA_24x32].pixelavg_pp, pixel_avg_24x32_avx2);
600
+ ASSIGN2(p.pu[LUMA_32x8].pixelavg_pp, pixel_avg_32x8_avx2);
601
+ ASSIGN2(p.pu[LUMA_32x16].pixelavg_pp, pixel_avg_32x16_avx2);
602
+ ASSIGN2(p.pu[LUMA_32x24].pixelavg_pp, pixel_avg_32x24_avx2);
603
+ ASSIGN2(p.pu[LUMA_32x32].pixelavg_pp, pixel_avg_32x32_avx2);
604
+ ASSIGN2(p.pu[LUMA_32x64].pixelavg_pp, pixel_avg_32x64_avx2);
605
+ ASSIGN2(p.pu[LUMA_64x16].pixelavg_pp, pixel_avg_64x16_avx2);
606
+ ASSIGN2(p.pu[LUMA_64x32].pixelavg_pp, pixel_avg_64x32_avx2);
607
+ ASSIGN2(p.pu[LUMA_64x48].pixelavg_pp, pixel_avg_64x48_avx2);
608
+ ASSIGN2(p.pu[LUMA_64x64].pixelavg_pp, pixel_avg_64x64_avx2);
609
+ ASSIGN2(p.pu[LUMA_48x64].pixelavg_pp, pixel_avg_48x64_avx2);
610
+ ASSIGN2(p.pu[LUMA_8x4].addAvg, addAvg_8x4_avx2);
611
+ ASSIGN2(p.pu[LUMA_8x8].addAvg, addAvg_8x8_avx2);
612
+ ASSIGN2(p.pu[LUMA_8x16].addAvg, addAvg_8x16_avx2);
613
+ ASSIGN2(p.pu[LUMA_8x32].addAvg, addAvg_8x32_avx2);
614
+ ASSIGN2(p.pu[LUMA_12x16].addAvg, addAvg_12x16_avx2);
615
+ ASSIGN2(p.pu[LUMA_16x4].addAvg, addAvg_16x4_avx2);
616
+ ASSIGN2(p.pu[LUMA_16x8].addAvg, addAvg_16x8_avx2);
617
+ ASSIGN2(p.pu[LUMA_16x12].addAvg, addAvg_16x12_avx2);
618
+ ASSIGN2(p.pu[LUMA_16x16].addAvg, addAvg_16x16_avx2);
619
+ ASSIGN2(p.pu[LUMA_16x32].addAvg, addAvg_16x32_avx2);
620
+ ASSIGN2(p.pu[LUMA_16x64].addAvg, addAvg_16x64_avx2);
621
+ ASSIGN2(p.pu[LUMA_24x32].addAvg, addAvg_24x32_avx2);
622
+ ASSIGN2(p.pu[LUMA_32x8].addAvg, addAvg_32x8_avx2);
623
+ ASSIGN2(p.pu[LUMA_32x16].addAvg, addAvg_32x16_avx2);
624
+ ASSIGN2(p.pu[LUMA_32x24].addAvg, addAvg_32x24_avx2);
625
+ ASSIGN2(p.pu[LUMA_32x32].addAvg, addAvg_32x32_avx2);
626
+ ASSIGN2(p.pu[LUMA_32x64].addAvg, addAvg_32x64_avx2);
627
+ ASSIGN2(p.pu[LUMA_48x64].addAvg, addAvg_48x64_avx2);
628
+ ASSIGN2(p.pu[LUMA_64x16].addAvg, addAvg_64x16_avx2);
629
+ ASSIGN2(p.pu[LUMA_64x32].addAvg, addAvg_64x32_avx2);
630
+ ASSIGN2(p.pu[LUMA_64x48].addAvg, addAvg_64x48_avx2);
631
+ ASSIGN2(p.pu[LUMA_64x64].addAvg, addAvg_64x64_avx2);
632
+
633
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg, addAvg_8x2_avx2);
634
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg, addAvg_8x4_avx2);
635
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg, addAvg_8x6_avx2);
636
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg, addAvg_8x8_avx2);
637
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg, addAvg_8x16_avx2);
638
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg, addAvg_8x32_avx2);
639
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg, addAvg_12x16_avx2);
640
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg, addAvg_16x4_avx2);
641
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg, addAvg_16x8_avx2);
642
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg, addAvg_16x12_avx2);
643
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg, addAvg_16x16_avx2);
644
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg, addAvg_16x32_avx2);
645
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg, addAvg_32x8_avx2);
646
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg, addAvg_32x16_avx2);
647
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg, addAvg_32x24_avx2);
648
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg, addAvg_32x32_avx2);
649
+
650
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg, addAvg_8x16_avx2);
651
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg, addAvg_16x32_avx2);
652
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg, addAvg_32x64_avx2);
653
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg, addAvg_8x8_avx2);
654
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg,addAvg_16x16_avx2);
655
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg, addAvg_8x32_avx2);
656
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg, addAvg_32x32_avx2);
657
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg, addAvg_16x64_avx2);
658
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg, addAvg_8x12_avx2);
659
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg, addAvg_8x4_avx2);
660
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg, addAvg_16x24_avx2);
661
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg, addAvg_16x8_avx2);
662
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg, addAvg_8x64_avx2);
663
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg, addAvg_24x64_avx2);
664
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg, addAvg_12x32_avx2);
665
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg, addAvg_32x16_avx2);
666
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg, addAvg_32x48_avx2);
667
668
p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_avx2);
669
p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
670
671
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_avx2);
672
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx2);
673
674
- p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
675
- p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
676
-
677
+ ASSIGN2( p.cu[BLOCK_16x16].ssd_s,pixel_ssd_s_16_avx2);
678
+ ASSIGN2( p.cu[BLOCK_32x32].ssd_s,pixel_ssd_s_32_avx2);
679
p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_16x16_avx2);
680
p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_32x32_avx2);
681
p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_64x64_avx2);
682
683
p.idst4x4 = PFX(idst4_avx2);
684
p.denoiseDct = PFX(denoise_dct_avx2);
685
686
- p.scale1D_128to64 = PFX(scale1D_128to64_avx2);
687
+ ASSIGN2(p.scale1D_128to64, scale1D_128to64_avx2);
688
p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
689
690
p.weight_pp = PFX(weight_pp_avx2);
691
692
p.sign = PFX(calSign_avx2);
693
p.planecopy_cp = PFX(upShift_8_avx2);
694
695
- p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);
696
- p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx2);
697
-
698
- p.cu[BLOCK_16x16].blockfill_s = PFX(blockfill_s_16x16_avx2);
699
- p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_avx2);
700
+ ASSIGN2(p.cu[BLOCK_16x16].calcresidual, getResidual16_avx2);
701
+ ASSIGN2(p.cu[BLOCK_32x32].calcresidual, getResidual32_avx2);
702
703
+ ASSIGN2(p.cu[BLOCK_16x16].blockfill_s, blockfill_s_16x16_avx2);
704
+ ASSIGN2(p.cu[BLOCK_32x32].blockfill_s, blockfill_s_32x32_avx2);
705
ALL_LUMA_TU(count_nonzero, count_nonzero, avx2);
706
- ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, avx2);
707
+ ALL_LUMA_TU_S(cpy1Dto2D_shl[ALIGNED], cpy1Dto2D_shl_, avx2);
708
+ ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, avx2);
709
ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2);
710
-
711
p.cu[BLOCK_8x8].copy_cnt = PFX(copy_cnt_8_avx2);
712
p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx2);
713
p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx2);
714
715
ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, avx2);
716
p.pu[LUMA_4x4].luma_vsp = PFX(interp_8tap_vert_sp_4x4_avx2); // since ALL_LUMA_PU didn't declare 4x4 size, calling separately luma_vsp function to use
717
718
- p.cu[BLOCK_16x16].add_ps = PFX(pixel_add_ps_16x16_avx2);
719
- p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
720
- p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx2);
721
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps = PFX(pixel_add_ps_16x16_avx2);
722
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
723
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps = PFX(pixel_add_ps_16x32_avx2);
724
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx2);
725
+ ASSIGN2(p.cu[BLOCK_16x16].add_ps, pixel_add_ps_16x16_avx2);
726
+ ASSIGN2(p.cu[BLOCK_32x32].add_ps, pixel_add_ps_32x32_avx2);
727
+ ASSIGN2(p.cu[BLOCK_64x64].add_ps, pixel_add_ps_64x64_avx2);
728
+ ASSIGN2(p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps, pixel_add_ps_16x16_avx2);
729
+ ASSIGN2(p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps, pixel_add_ps_32x32_avx2);
730
+ ASSIGN2(p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps, pixel_add_ps_16x32_avx2);
731
+ ASSIGN2(p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps, pixel_add_ps_32x64_avx2);
732
733
p.cu[BLOCK_16x16].sub_ps = PFX(pixel_sub_ps_16x16_avx2);
734
p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx2);
735
736
p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx2);
737
p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx2);
738
739
- p.pu[LUMA_16x4].convert_p2s = PFX(filterPixelToShort_16x4_avx2);
740
- p.pu[LUMA_16x8].convert_p2s = PFX(filterPixelToShort_16x8_avx2);
741
- p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_avx2);
742
- p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_avx2);
743
- p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_avx2);
744
- p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_avx2);
745
- p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_avx2);
746
- p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_avx2);
747
- p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx2);
748
- p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx2);
749
- p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx2);
750
- p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_avx2);
751
- p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx2);
752
- p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx2);
753
- p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx2);
754
- p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_avx2);
755
- p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_avx2);
756
-
757
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].p2s = PFX(filterPixelToShort_16x4_avx2);
758
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s = PFX(filterPixelToShort_16x8_avx2);
759
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s = PFX(filterPixelToShort_16x12_avx2);
760
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s = PFX(filterPixelToShort_16x16_avx2);
761
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s = PFX(filterPixelToShort_16x32_avx2);
762
- p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].p2s = PFX(filterPixelToShort_24x32_avx2);
763
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx2);
764
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx2);
765
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx2);
766
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_avx2);
767
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s = PFX(filterPixelToShort_16x8_avx2);
768
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s = PFX(filterPixelToShort_16x16_avx2);
769
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s = PFX(filterPixelToShort_16x24_avx2);
770
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s = PFX(filterPixelToShort_16x32_avx2);
771
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s = PFX(filterPixelToShort_16x64_avx2);
772
- p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s = PFX(filterPixelToShort_24x64_avx2);
773
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_avx2);
774
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx2);
775
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx2);
776
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx2);
777
+ ASSIGN2(p.pu[LUMA_16x4].convert_p2s, filterPixelToShort_16x4_avx2);
778
+ ASSIGN2(p.pu[LUMA_16x8].convert_p2s, filterPixelToShort_16x8_avx2);
779
+ ASSIGN2(p.pu[LUMA_16x12].convert_p2s, filterPixelToShort_16x12_avx2);
780
+ ASSIGN2(p.pu[LUMA_16x16].convert_p2s, filterPixelToShort_16x16_avx2);
781
+ ASSIGN2(p.pu[LUMA_16x32].convert_p2s, filterPixelToShort_16x32_avx2);
782
+ ASSIGN2(p.pu[LUMA_16x64].convert_p2s, filterPixelToShort_16x64_avx2);
783
+ ASSIGN2(p.pu[LUMA_32x8].convert_p2s, filterPixelToShort_32x8_avx2);
784
+ ASSIGN2(p.pu[LUMA_32x16].convert_p2s, filterPixelToShort_32x16_avx2);
785
+ ASSIGN2(p.pu[LUMA_32x24].convert_p2s, filterPixelToShort_32x24_avx2);
786
+ ASSIGN2(p.pu[LUMA_32x32].convert_p2s, filterPixelToShort_32x32_avx2);
787
+ ASSIGN2(p.pu[LUMA_32x64].convert_p2s, filterPixelToShort_32x64_avx2);
788
+ ASSIGN2(p.pu[LUMA_64x16].convert_p2s, filterPixelToShort_64x16_avx2);
789
+ ASSIGN2(p.pu[LUMA_64x32].convert_p2s, filterPixelToShort_64x32_avx2);
790
+ ASSIGN2(p.pu[LUMA_64x48].convert_p2s, filterPixelToShort_64x48_avx2);
791
+ ASSIGN2(p.pu[LUMA_64x64].convert_p2s, filterPixelToShort_64x64_avx2);
792
+ ASSIGN2(p.pu[LUMA_24x32].convert_p2s, filterPixelToShort_24x32_avx2);
793
+ ASSIGN2(p.pu[LUMA_48x64].convert_p2s, filterPixelToShort_48x64_avx2);
794
+
795
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].p2s, filterPixelToShort_16x4_avx2);
796
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s, filterPixelToShort_16x8_avx2);
797
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s, filterPixelToShort_16x12_avx2);
798
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s, filterPixelToShort_16x16_avx2);
799
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s, filterPixelToShort_16x32_avx2);
800
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].p2s, filterPixelToShort_24x32_avx2);
801
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s, filterPixelToShort_32x8_avx2);
802
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s, filterPixelToShort_32x16_avx2);
803
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s, filterPixelToShort_32x24_avx2);
804
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s, filterPixelToShort_32x32_avx2);
805
+
806
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s, filterPixelToShort_16x8_avx2);
807
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s, filterPixelToShort_16x16_avx2);
808
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s, filterPixelToShort_16x24_avx2);
809
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s, filterPixelToShort_16x32_avx2);
810
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s, filterPixelToShort_16x64_avx2);
811
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s, filterPixelToShort_24x64_avx2);
812
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s, filterPixelToShort_32x16_avx2);
813
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s, filterPixelToShort_32x32_avx2);
814
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s, filterPixelToShort_32x48_avx2);
815
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s, filterPixelToShort_32x64_avx2);
816
817
p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_avx2);
818
p.pu[LUMA_4x8].luma_hps = PFX(interp_8tap_horiz_ps_4x8_avx2);
819
820
p.integral_inith[INTEGRAL_8] = PFX(integral8h_avx2);
821
p.integral_inith[INTEGRAL_12] = PFX(integral12h_avx2);
822
p.integral_inith[INTEGRAL_16] = PFX(integral16h_avx2);
823
+ p.cu[BLOCK_4x4].nonPsyRdoQuant = PFX(nonPsyRdoQuant4_avx2);
824
+ p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx2);
825
+ p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx2);
826
+ p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx2);
827
+ p.cu[BLOCK_4x4].psyRdoQuant_1p = PFX(psyRdoQuant_1p4_avx2);
828
+ p.cu[BLOCK_8x8].psyRdoQuant_1p = PFX(psyRdoQuant_1p8_avx2);
829
+ p.cu[BLOCK_16x16].psyRdoQuant_1p = PFX(psyRdoQuant_1p16_avx2);
830
+ p.cu[BLOCK_32x32].psyRdoQuant_1p = PFX(psyRdoQuant_1p32_avx2);
831
832
/* TODO: This kernel needs to be modified to work with HIGH_BIT_DEPTH only
833
p.planeClipAndMax = PFX(planeClipAndMax_avx2); */
834
835
p.costCoeffNxN = PFX(costCoeffNxN_avx2_bmi2);
836
}
837
}
838
+ if (cpuMask & X265_CPU_AVX512)
839
+ {
840
+ p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512);
841
+ p.cu[BLOCK_32x32].calcresidual[NONALIGNED] = PFX(getResidual32_avx512);
842
+ p.cu[BLOCK_32x32].calcresidual[ALIGNED] = PFX(getResidual_aligned32_avx512);
843
+ p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx512);
844
+ p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
845
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
846
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx512);
847
+
848
+ p.cu[BLOCK_64x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_64x64_avx512);
849
+ p.cu[BLOCK_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_avx512);
850
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_avx512);
851
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x64_avx512);
852
+
853
+ p.cu[BLOCK_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_aligned_32x32_avx512);
854
+ p.cu[BLOCK_64x64].add_ps[ALIGNED] = PFX(pixel_add_ps_aligned_64x64_avx512);
855
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_aligned_32x32_avx512);
856
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[ALIGNED] = PFX(pixel_add_ps_aligned_32x64_avx512);
857
+
858
+ // 64 X N
859
+ p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
860
+ p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx512);
861
+ p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx512);
862
+ p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx512);
863
+ p.pu[LUMA_64x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x16_avx512);
864
+ p.cu[BLOCK_64x64].copy_ps = (copy_ps_t)PFX(blockcopy_ss_64x64_avx512);
865
+ p.cu[BLOCK_64x64].copy_sp = (copy_sp_t)PFX(blockcopy_ss_64x64_avx512);
866
+
867
+ // 32 X N
868
+ p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
869
+ p.pu[LUMA_32x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x64_avx512);
870
+ p.pu[LUMA_32x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x32_avx512);
871
+ p.pu[LUMA_32x24].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x24_avx512);
872
+ p.pu[LUMA_32x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x16_avx512);
873
+ p.pu[LUMA_32x8].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x8_avx512);
874
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x8_avx512);
875
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x16_avx512);
876
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x24_avx512);
877
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x32_avx512);
878
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x16_avx512);
879
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x32_avx512);
880
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x48_avx512);
881
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x64_avx512);
882
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
883
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_avx512);
884
+ p.cu[BLOCK_32x32].copy_ps = (copy_ps_t)PFX(blockcopy_ss_32x32_avx512);
885
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ps = (copy_ps_t)PFX(blockcopy_ss_32x32_avx512);
886
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ps = (copy_ps_t)PFX(blockcopy_ss_32x64_avx512);
887
+ p.cu[BLOCK_32x32].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x32_avx512);
888
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x32_avx512);
889
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x64_avx512);
890
+
891
+ p.pu[LUMA_64x16].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x16_avx512);
892
+ p.pu[LUMA_64x32].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x32_avx512);
893
+ p.pu[LUMA_64x48].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x48_avx512);
894
+ p.pu[LUMA_64x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x64_avx512);
895
+ p.pu[LUMA_32x8].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x8_avx512);
896
+ p.pu[LUMA_32x16].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x16_avx512);
897
+ p.pu[LUMA_32x24].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x24_avx512);
898
+ p.pu[LUMA_32x32].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x32_avx512);
899
+ p.pu[LUMA_32x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x64_avx512);
900
+ p.pu[LUMA_48x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_48x64_avx512);
901
+
902
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s[ALIGNED] = PFX(filterPixelToShort_2x4_sse4);
903
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s[ALIGNED] = PFX(filterPixelToShort_2x8_sse4);
904
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].p2s[ALIGNED] = PFX(filterPixelToShort_6x8_sse4);
905
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s[NONALIGNED] = PFX(filterPixelToShort_32x8_avx512);
906
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s[NONALIGNED] = PFX(filterPixelToShort_32x16_avx512);
907
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s[NONALIGNED] = PFX(filterPixelToShort_32x24_avx512);
908
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s[NONALIGNED] = PFX(filterPixelToShort_32x32_avx512);
909
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s[NONALIGNED] = PFX(filterPixelToShort_32x16_avx512);
910
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s[NONALIGNED] = PFX(filterPixelToShort_32x32_avx512);
911
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s[NONALIGNED] = PFX(filterPixelToShort_32x48_avx512);
912
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s[NONALIGNED] = PFX(filterPixelToShort_32x64_avx512);
913
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s[ALIGNED] = PFX(filterPixelToShort_2x8_sse4);
914
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s[ALIGNED] = PFX(filterPixelToShort_2x16_sse4);
915
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s[ALIGNED] = PFX(filterPixelToShort_6x16_sse4);
916
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s[NONALIGNED] = PFX(filterPixelToShort_32x8_avx512);
917
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s[NONALIGNED] = PFX(filterPixelToShort_32x16_avx512);
918
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s[NONALIGNED] = PFX(filterPixelToShort_32x24_avx512);
919
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s[NONALIGNED] = PFX(filterPixelToShort_32x32_avx512);
920
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].p2s[NONALIGNED] = PFX(filterPixelToShort_32x64_avx512);
921
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].p2s[NONALIGNED] = PFX(filterPixelToShort_64x16_avx512);
922
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s[NONALIGNED] = PFX(filterPixelToShort_64x32_avx512);
923
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s[NONALIGNED] = PFX(filterPixelToShort_64x48_avx512);
924
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s[NONALIGNED] = PFX(filterPixelToShort_64x64_avx512);
925
+
926
+ p.pu[LUMA_64x16].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x16_avx512);
927
+ p.pu[LUMA_64x32].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x32_avx512);
928
+ p.pu[LUMA_64x48].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x48_avx512);
929
+ p.pu[LUMA_64x64].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x64_avx512);
930
+ p.pu[LUMA_32x8].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x8_avx512);
931
+ p.pu[LUMA_32x16].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x16_avx512);
932
+ p.pu[LUMA_32x24].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x24_avx512);
933
+ p.pu[LUMA_32x32].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x32_avx512);
934
+ p.pu[LUMA_32x64].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x64_avx512);
935
+ p.pu[LUMA_48x64].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_48x64_avx512);
936
+
937
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x8_avx512);
938
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x16_avx512);
939
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x24_avx512);
940
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x32_avx512);
941
+
942
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x16_avx512);
943
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x32_avx512);
944
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x48_avx512);
945
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x64_avx512);
946
+
947
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x8_avx512);
948
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x16_avx512);
949
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x24_avx512);
950
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x32_avx512);
951
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x64_avx512);
952
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x16_avx512);
953
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x32_avx512);
954
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x48_avx512);
955
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x64_avx512);
956
+ p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32_avx512);
957
+ p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_aligned_32_avx512);
958
+ p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16_avx512);
959
+ p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_aligned_16_avx512);
960
+ p.pu[LUMA_16x32].sad = PFX(pixel_sad_16x32_avx512);
961
+ p.pu[LUMA_16x64].sad = PFX(pixel_sad_16x64_avx512);
962
+ p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512);
963
+ p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512);
964
+ p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512);
965
+ p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512);
966
+ p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512);
967
+ p.pu[LUMA_48x64].sad = PFX(pixel_sad_48x64_avx512);
968
+ p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512);
969
+ p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512);
970
+ p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512);
971
+ p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512);
972
+
973
+ p.pu[LUMA_64x16].addAvg[NONALIGNED] = PFX(addAvg_64x16_avx512);
974
+ p.pu[LUMA_64x32].addAvg[NONALIGNED] = PFX(addAvg_64x32_avx512);
975
+ p.pu[LUMA_64x48].addAvg[NONALIGNED] = PFX(addAvg_64x48_avx512);
976
+ p.pu[LUMA_64x64].addAvg[NONALIGNED] = PFX(addAvg_64x64_avx512);
977
+ p.pu[LUMA_32x8].addAvg[NONALIGNED] = PFX(addAvg_32x8_avx512);
978
+ p.pu[LUMA_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_avx512);
979
+ p.pu[LUMA_32x24].addAvg[NONALIGNED] = PFX(addAvg_32x24_avx512);
980
+ p.pu[LUMA_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_avx512);
981
+ p.pu[LUMA_32x64].addAvg[NONALIGNED] = PFX(addAvg_32x64_avx512);
982
+ p.pu[LUMA_16x4].addAvg[NONALIGNED] = PFX(addAvg_16x4_avx512);
983
+ p.pu[LUMA_16x8].addAvg[NONALIGNED] = PFX(addAvg_16x8_avx512);
984
+ p.pu[LUMA_16x12].addAvg[NONALIGNED] = PFX(addAvg_16x12_avx512);
985
+ p.pu[LUMA_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_avx512);
986
+ p.pu[LUMA_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_avx512);
987
+ p.pu[LUMA_16x64].addAvg[NONALIGNED] = PFX(addAvg_16x64_avx512);
988
+ p.pu[LUMA_48x64].addAvg[NONALIGNED] = PFX(addAvg_48x64_avx512);
989
+
990
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg[NONALIGNED] = PFX(addAvg_32x8_avx512);
991
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_avx512);
992
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg[NONALIGNED] = PFX(addAvg_32x24_avx512);
993
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_avx512);
994
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg[NONALIGNED] = PFX(addAvg_16x4_avx512);
995
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg[NONALIGNED] = PFX(addAvg_16x8_avx512);
996
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg[NONALIGNED] = PFX(addAvg_16x12_avx512);
997
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_avx512);
998
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_avx512);
999
+
1000
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_avx512);
1001
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_avx512);
1002
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg[NONALIGNED] = PFX(addAvg_32x48_avx512);
1003
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg[NONALIGNED] = PFX(addAvg_32x64_avx512);
1004
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_avx512);
1005
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_avx512);
1006
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg[NONALIGNED] = PFX(addAvg_16x64_avx512);
1007
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg[NONALIGNED] = PFX(addAvg_16x24_avx512);
1008
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg[NONALIGNED] = PFX(addAvg_16x8_avx512);
1009
+ p.pu[LUMA_32x8].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x8_avx512);
1010
+ p.pu[LUMA_32x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x16_avx512);
1011
+ p.pu[LUMA_32x24].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x24_avx512);
1012
+ p.pu[LUMA_32x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x32_avx512);
1013
+ p.pu[LUMA_32x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x64_avx512);
1014
+ p.pu[LUMA_64x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x16_avx512);
1015
+ p.pu[LUMA_64x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x32_avx512);
1016
+ p.pu[LUMA_64x48].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x48_avx512);
1017
+ p.pu[LUMA_64x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x64_avx512);
1018
+ p.pu[LUMA_48x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_48x64_avx512);
1019
+
1020
+ p.pu[LUMA_32x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_32x8_avx512);
1021
+ p.pu[LUMA_32x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_32x16_avx512);
1022
+ p.pu[LUMA_32x24].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_32x24_avx512);
1023
+ p.pu[LUMA_32x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_32x32_avx512);
1024
+ p.pu[LUMA_32x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_32x64_avx512);
1025
+ p.pu[LUMA_48x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_48x64_avx512);
1026
+ p.pu[LUMA_64x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_64x16_avx512);
1027
+ p.pu[LUMA_64x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_64x32_avx512);
1028
+ p.pu[LUMA_64x48].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_64x48_avx512);
1029
+ p.pu[LUMA_64x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_64x64_avx512);
1030
+ p.pu[LUMA_16x8].sad_x3 = PFX(pixel_sad_x3_16x8_avx512);
1031
+ p.pu[LUMA_16x12].sad_x3 = PFX(pixel_sad_x3_16x12_avx512);
1032
+ p.pu[LUMA_16x16].sad_x3 = PFX(pixel_sad_x3_16x16_avx512);
1033
+ p.pu[LUMA_16x32].sad_x3 = PFX(pixel_sad_x3_16x32_avx512);
1034
+ p.pu[LUMA_16x64].sad_x3 = PFX(pixel_sad_x3_16x64_avx512);
1035
+ p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx512);
1036
+ p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx512);
1037
+ p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512);
1038
+ p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512);
1039
+ p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512);
1040
+ //p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx512);
1041
+ p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_avx512);
1042
+ p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx512);
1043
+ p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512);
1044
+ p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx512);
1045
+
1046
+ p.pu[LUMA_16x8].sad_x4 = PFX(pixel_sad_x4_16x8_avx512);
1047
+ p.pu[LUMA_16x12].sad_x4 = PFX(pixel_sad_x4_16x12_avx512);
1048
+ p.pu[LUMA_16x16].sad_x4 = PFX(pixel_sad_x4_16x16_avx512);
1049
+ p.pu[LUMA_16x32].sad_x4 = PFX(pixel_sad_x4_16x32_avx512);
1050
+ p.pu[LUMA_16x64].sad_x4 = PFX(pixel_sad_x4_16x64_avx512);
1051
+ p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx512);
1052
+ p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512);
1053
+ p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512);
1054
+ p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512);
1055
+ p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512);
1056
+ //p.pu[LUMA_48x64].sad_x4 = PFX(pixel_sad_x4_48x64_avx512);
1057
+ p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx512);
1058
+ p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512);
1059
+ p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512);
1060
+ p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx512);
1061
+ p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512);
1062
+ p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
1063
+ p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32_avx512);
1064
+ p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_aligned_32_avx512);
1065
+ p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16_avx512);
1066
+ p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32_avx512);
1067
+
1068
+ p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16_avx512);
1069
+ p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx512);
1070
+
1071
+ p.weight_pp = PFX(weight_pp_avx512);
1072
+ p.weight_sp = PFX(weight_sp_avx512);
1073
+ p.dequant_normal = PFX(dequant_normal_avx512);
1074
+ p.dequant_scaling = PFX(dequant_scaling_avx512);
1075
+ p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
1076
+ p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
1077
+
1078
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_hpp = PFX(interp_4tap_horiz_pp_8x4_avx512);
1079
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hpp = PFX(interp_4tap_horiz_pp_8x8_avx512);
1080
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_hpp = PFX(interp_4tap_horiz_pp_8x16_avx512);
1081
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_hpp = PFX(interp_4tap_horiz_pp_8x32_avx512);
1082
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx512);
1083
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512);
1084
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx512);
1085
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512);
1086
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512);
1087
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512);
1088
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
1089
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512);
1090
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
1091
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_hpp = PFX(interp_4tap_horiz_pp_24x32_avx512);
1092
+
1093
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_hpp = PFX(interp_4tap_horiz_pp_8x4_avx512);
1094
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_hpp = PFX(interp_4tap_horiz_pp_8x8_avx512);
1095
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_hpp = PFX(interp_4tap_horiz_pp_8x12_avx512);
1096
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_hpp = PFX(interp_4tap_horiz_pp_8x16_avx512);
1097
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_hpp = PFX(interp_4tap_horiz_pp_8x32_avx512);
1098
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_hpp = PFX(interp_4tap_horiz_pp_8x64_avx512);
1099
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512);
1100
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512);
1101
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hpp = PFX(interp_4tap_horiz_pp_16x24_avx512);
1102
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512);
1103
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_avx512);
1104
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
1105
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
1106
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512);
1107
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512);
1108
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_hpp = PFX(interp_4tap_horiz_pp_24x64_avx512);
1109
+
1110
+ p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_hpp = PFX(interp_4tap_horiz_pp_8x4_avx512);
1111
+ p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_hpp = PFX(interp_4tap_horiz_pp_8x8_avx512);
1112
+ p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_hpp = PFX(interp_4tap_horiz_pp_8x16_avx512);
1113
+ p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_hpp = PFX(interp_4tap_horiz_pp_8x32_avx512);
1114
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx512);
1115
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512);
1116
+ p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx512);
1117
+ p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512);
1118
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512);
1119
+ p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_avx512);
1120
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512);
1121
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
1122
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512);
1123
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
1124
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512);
1125
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = PFX(interp_4tap_horiz_pp_64x16_avx512);
1126
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512);
1127
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = PFX(interp_4tap_horiz_pp_64x48_avx512);
1128
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512);
1129
+ p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = PFX(interp_4tap_horiz_pp_48x64_avx512);
1130
+ p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_hpp = PFX(interp_4tap_horiz_pp_24x32_avx512);
1131
+
1132
+ p.pu[LUMA_16x4].addAvg[ALIGNED] = PFX(addAvg_aligned_16x4_avx512);
1133
+ p.pu[LUMA_16x8].addAvg[ALIGNED] = PFX(addAvg_aligned_16x8_avx512);
1134
+ p.pu[LUMA_16x12].addAvg[ALIGNED] = PFX(addAvg_aligned_16x12_avx512);
1135
+ p.pu[LUMA_16x16].addAvg[ALIGNED] = PFX(addAvg_aligned_16x16_avx512);
1136
+ p.pu[LUMA_16x32].addAvg[ALIGNED] = PFX(addAvg_aligned_16x32_avx512);
1137
+ p.pu[LUMA_16x64].addAvg[ALIGNED] = PFX(addAvg_aligned_16x64_avx512);
1138
+ p.pu[LUMA_48x64].addAvg[ALIGNED] = PFX(addAvg_aligned_48x64_avx512);
1139
+ p.pu[LUMA_32x8].addAvg[ALIGNED] = PFX(addAvg_aligned_32x8_avx512);
1140
+ p.pu[LUMA_32x16].addAvg[ALIGNED] = PFX(addAvg_aligned_32x16_avx512);
1141
+ p.pu[LUMA_32x24].addAvg[ALIGNED] = PFX(addAvg_aligned_32x24_avx512);
1142
+ p.pu[LUMA_32x32].addAvg[ALIGNED] = PFX(addAvg_aligned_32x32_avx512);
1143
+ p.pu[LUMA_32x64].addAvg[ALIGNED] = PFX(addAvg_aligned_32x64_avx512);
1144
+ p.pu[LUMA_64x16].addAvg[ALIGNED] = PFX(addAvg_aligned_64x16_avx512);
1145
+ p.pu[LUMA_64x32].addAvg[ALIGNED] = PFX(addAvg_aligned_64x32_avx512);
1146
+ p.pu[LUMA_64x48].addAvg[ALIGNED] = PFX(addAvg_aligned_64x48_avx512);
1147
+ p.pu[LUMA_64x64].addAvg[ALIGNED] = PFX(addAvg_aligned_64x64_avx512);
1148
+
1149
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg[ALIGNED] = PFX(addAvg_aligned_16x4_avx512);
1150
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg[ALIGNED] = PFX(addAvg_aligned_16x8_avx512);
1151
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg[ALIGNED] = PFX(addAvg_aligned_16x12_avx512);
1152
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg[ALIGNED] = PFX(addAvg_aligned_16x16_avx512);
1153
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg[ALIGNED] = PFX(addAvg_aligned_16x32_avx512);
1154
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg[ALIGNED] = PFX(addAvg_aligned_32x8_avx512);
1155
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg[ALIGNED] = PFX(addAvg_aligned_32x16_avx512);
1156
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg[ALIGNED] = PFX(addAvg_aligned_32x24_avx512);
1157
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg[ALIGNED] = PFX(addAvg_aligned_32x32_avx512);
1158
+
1159
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg[ALIGNED] = PFX(addAvg_aligned_16x32_avx512);
1160
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg[ALIGNED] = PFX(addAvg_aligned_16x16_avx512);
1161
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg[ALIGNED] = PFX(addAvg_aligned_16x64_avx512);
1162
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg[ALIGNED] = PFX(addAvg_aligned_16x24_avx512);
1163
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg[ALIGNED] = PFX(addAvg_aligned_16x8_avx512);
1164
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg[ALIGNED] = PFX(addAvg_aligned_32x16_avx512);
1165
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg[ALIGNED] = PFX(addAvg_aligned_32x32_avx512);
1166
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg[ALIGNED] = PFX(addAvg_aligned_32x48_avx512);
1167
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg[ALIGNED] = PFX(addAvg_aligned_32x64_avx512);
1168
+ p.cu[BLOCK_32x32].blockfill_s[NONALIGNED] = PFX(blockfill_s_32x32_avx512);
1169
+ p.cu[BLOCK_32x32].blockfill_s[ALIGNED] = PFX(blockfill_s_aligned_32x32_avx512);
1170
+ p.pu[LUMA_8x4].luma_hpp = PFX(interp_8tap_horiz_pp_8x4_avx512);
1171
+ p.pu[LUMA_8x8].luma_hpp = PFX(interp_8tap_horiz_pp_8x8_avx512);
1172
+ p.pu[LUMA_8x16].luma_hpp = PFX(interp_8tap_horiz_pp_8x16_avx512);
1173
+ p.pu[LUMA_8x32].luma_hpp = PFX(interp_8tap_horiz_pp_8x32_avx512);
1174
+ p.pu[LUMA_16x4].luma_hpp = PFX(interp_8tap_horiz_pp_16x4_avx512);
1175
+ p.pu[LUMA_16x8].luma_hpp = PFX(interp_8tap_horiz_pp_16x8_avx512);
1176
+ p.pu[LUMA_16x12].luma_hpp = PFX(interp_8tap_horiz_pp_16x12_avx512);
1177
+ p.pu[LUMA_16x16].luma_hpp = PFX(interp_8tap_horiz_pp_16x16_avx512);
1178
+ p.pu[LUMA_16x32].luma_hpp = PFX(interp_8tap_horiz_pp_16x32_avx512);
1179
+ p.pu[LUMA_16x64].luma_hpp = PFX(interp_8tap_horiz_pp_16x64_avx512);
1180
+ p.pu[LUMA_24x32].luma_hpp = PFX(interp_8tap_horiz_pp_24x32_avx512);
1181
+ p.pu[LUMA_32x8].luma_hpp = PFX(interp_8tap_horiz_pp_32x8_avx512);
1182
+ p.pu[LUMA_32x16].luma_hpp = PFX(interp_8tap_horiz_pp_32x16_avx512);
1183
+ p.pu[LUMA_32x24].luma_hpp = PFX(interp_8tap_horiz_pp_32x24_avx512);
1184
+ p.pu[LUMA_32x32].luma_hpp = PFX(interp_8tap_horiz_pp_32x32_avx512);
1185
+ p.pu[LUMA_32x64].luma_hpp = PFX(interp_8tap_horiz_pp_32x64_avx512);
1186
+ p.pu[LUMA_64x16].luma_hpp = PFX(interp_8tap_horiz_pp_64x16_avx512);
1187
+ p.pu[LUMA_64x32].luma_hpp = PFX(interp_8tap_horiz_pp_64x32_avx512);
1188
+ p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_avx512);
1189
+ p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_avx512);
1190
+ p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx512);
1191
+
1192
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx512);
1193
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = PFX(interp_4tap_vert_pp_64x32_avx512);
1194
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vpp = PFX(interp_4tap_vert_pp_64x48_avx512);
1195
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vpp = PFX(interp_4tap_vert_pp_64x64_avx512);
1196
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vps = PFX(interp_4tap_vert_ps_64x16_avx512);
1197
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = PFX(interp_4tap_vert_ps_64x32_avx512);
1198
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = PFX(interp_4tap_vert_ps_64x48_avx512);
1199
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vps = PFX(interp_4tap_vert_ps_64x64_avx512);
1200
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vsp = PFX(interp_4tap_vert_sp_64x16_avx512);
1201
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vsp = PFX(interp_4tap_vert_sp_64x32_avx512);
1202
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = PFX(interp_4tap_vert_sp_64x48_avx512);
1203
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx512);
1204
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vss = PFX(interp_4tap_vert_ss_64x16_avx512);
1205
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vss = PFX(interp_4tap_vert_ss_64x32_avx512);
1206
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vss = PFX(interp_4tap_vert_ss_64x48_avx512);
1207
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vss = PFX(interp_4tap_vert_ss_64x64_avx512);
1208
+
1209
+ p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vpp = PFX(interp_4tap_vert_pp_48x64_avx512);
1210
+ p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vps = PFX(interp_4tap_vert_ps_48x64_avx512);
1211
+ p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp = PFX(interp_4tap_vert_sp_48x64_avx512);
1212
+ p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vss = PFX(interp_4tap_vert_ss_48x64_avx512);
1213
+
1214
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx512);
1215
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512);
1216
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512);
1217
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512);
1218
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx512);
1219
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx512);
1220
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512);
1221
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx512);
1222
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512);
1223
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx512);
1224
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vss = PFX(interp_4tap_vert_ss_32x8_avx512);
1225
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512);
1226
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vss = PFX(interp_4tap_vert_ss_32x24_avx512);
1227
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512);
1228
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vss = PFX(interp_4tap_vert_ss_32x64_avx512);
1229
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vsp = PFX(interp_4tap_vert_sp_32x8_avx512);
1230
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx512);
1231
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vsp = PFX(interp_4tap_vert_sp_32x24_avx512);
1232
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx512);
1233
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_avx512);
1234
+
1235
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx512);
1236
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512);
1237
+ p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx512);
1238
+ p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512);
1239
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512);
1240
+ p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_avx512);
1241
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vps = PFX(interp_4tap_vert_ps_16x4_avx512);
1242
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx512);
1243
+ p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vps = PFX(interp_4tap_vert_ps_16x12_avx512);
1244
+ p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx512);
1245
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx512);
1246
+ p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vps = PFX(interp_4tap_vert_ps_16x64_avx512);
1247
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vss = PFX(interp_4tap_vert_ss_16x4_avx512);
1248
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vss = PFX(interp_4tap_vert_ss_16x8_avx512);
1249
+ p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vss = PFX(interp_4tap_vert_ss_16x12_avx512);
1250
+ p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx512);
1251
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx512);
1252
+ p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vss = PFX(interp_4tap_vert_ss_16x64_avx512);
1253
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vsp = PFX(interp_4tap_vert_sp_16x4_avx512);
1254
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_avx512);
1255
+ p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vsp = PFX(interp_4tap_vert_sp_16x12_avx512);
1256
+ p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_avx512);
1257
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_avx512);
1258
+ p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vsp = PFX(interp_4tap_vert_sp_16x64_avx512);
1259
+
1260
+ p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vpp = PFX(interp_4tap_vert_pp_8x8_avx512);
1261
+ p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_avx512);
1262
+ p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_avx512);
1263
+ p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vps = PFX(interp_4tap_vert_ps_8x8_avx512);
1264
+ p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vps = PFX(interp_4tap_vert_ps_8x16_avx512);
1265
+ p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vps = PFX(interp_4tap_vert_ps_8x32_avx512);
1266
+ p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx512);
1267
+ p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx512);
1268
+ p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx512);
1269
+ p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vsp = PFX(interp_4tap_vert_sp_8x8_avx512);
1270
+ p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vsp = PFX(interp_4tap_vert_sp_8x16_avx512);
1271
+ p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vsp = PFX(interp_4tap_vert_sp_8x32_avx512);
1272
+
1273
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512);
1274
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512);
1275
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = PFX(interp_4tap_vert_pp_32x48_avx512);
1276
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx512);
1277
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512);
1278
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512);
1279
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vps = PFX(interp_4tap_vert_ps_32x48_avx512);
1280
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx512);
1281
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512);
1282
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512);
1283
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vss = PFX(interp_4tap_vert_ss_32x48_avx512);
1284
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vss = PFX(interp_4tap_vert_ss_32x64_avx512);
1285
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx512);
1286
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx512);
1287
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vsp = PFX(interp_4tap_vert_sp_32x48_avx512);
1288
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_avx512);
1289
+
1290
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512);
1291
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512);
1292
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = PFX(interp_4tap_vert_pp_16x24_avx512);
1293
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512);
1294
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_avx512);
1295
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx512);
1296
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx512);
1297
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vps = PFX(interp_4tap_vert_ps_16x24_avx512);
1298
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx512);
1299
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vps = PFX(interp_4tap_vert_ps_16x64_avx512);
1300
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vss = PFX(interp_4tap_vert_ss_16x8_avx512);
1301
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx512);
1302
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vss = PFX(interp_4tap_vert_ss_16x24_avx512);
1303
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx512);
1304
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vss = PFX(interp_4tap_vert_ss_16x64_avx512);
1305
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_avx512);
1306
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_avx512);
1307
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vsp = PFX(interp_4tap_vert_sp_16x24_avx512);
1308
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_avx512);
1309
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vsp = PFX(interp_4tap_vert_sp_16x64_avx512);
1310
+
1311
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vpp = PFX(interp_4tap_vert_pp_8x8_avx512);
1312
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_avx512);
1313
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_avx512);
1314
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vpp = PFX(interp_4tap_vert_pp_8x64_avx512);
1315
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vps = PFX(interp_4tap_vert_ps_8x8_avx512);
1316
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vps = PFX(interp_4tap_vert_ps_8x16_avx512);
1317
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vps = PFX(interp_4tap_vert_ps_8x32_avx512);
1318
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vps = PFX(interp_4tap_vert_ps_8x64_avx512);
1319
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx512);
1320
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx512);
1321
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx512);
1322
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vss = PFX(interp_4tap_vert_ss_8x64_avx512);
1323
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vsp = PFX(interp_4tap_vert_sp_8x8_avx512);
1324
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vsp = PFX(interp_4tap_vert_sp_8x16_avx512);
1325
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vsp = PFX(interp_4tap_vert_sp_8x32_avx512);
1326
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vsp = PFX(interp_4tap_vert_sp_8x64_avx512);
1327
+
1328
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx512);
1329
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512);
1330
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512);
1331
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512);
1332
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx512);
1333
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512);
1334
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx512);
1335
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512);
1336
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vss = PFX(interp_4tap_vert_ss_32x8_avx512);
1337
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512);
1338
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vss = PFX(interp_4tap_vert_ss_32x24_avx512);
1339
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512);
1340
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vsp = PFX(interp_4tap_vert_sp_32x8_avx512);
1341
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx512);
1342
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vsp = PFX(interp_4tap_vert_sp_32x24_avx512);
1343
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx512);
1344
+
1345
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx512);
1346
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512);
1347
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx512);
1348
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512);
1349
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512);
1350
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vps = PFX(interp_4tap_vert_ps_16x4_avx512);
1351
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx512);
1352
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vps = PFX(interp_4tap_vert_ps_16x12_avx512);
1353
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx512);
1354
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx512);
1355
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vss = PFX(interp_4tap_vert_ss_16x4_avx512);
1356
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vss = PFX(interp_4tap_vert_ss_16x8_avx512);
1357
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vss = PFX(interp_4tap_vert_ss_16x12_avx512);
1358
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx512);
1359
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx512);
1360
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vsp = PFX(interp_4tap_vert_sp_16x4_avx512);
1361
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_avx512);
1362
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vsp = PFX(interp_4tap_vert_sp_16x12_avx512);
1363
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_avx512);
1364
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_avx512);
1365
+
1366
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vpp = PFX(interp_4tap_vert_pp_8x8_avx512);
1367
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_avx512);
1368
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_avx512);
1369
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vps = PFX(interp_4tap_vert_ps_8x8_avx512);
1370
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vps = PFX(interp_4tap_vert_ps_8x16_avx512);
1371
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vps = PFX(interp_4tap_vert_ps_8x32_avx512);
1372
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx512);
1373
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx512);
1374
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx512);
1375
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vsp = PFX(interp_4tap_vert_sp_8x8_avx512);
1376
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vsp = PFX(interp_4tap_vert_sp_8x16_avx512);
1377
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vsp = PFX(interp_4tap_vert_sp_8x32_avx512);
1378
+
1379
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vpp = PFX(interp_4tap_vert_pp_24x32_avx512);
1380
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vpp = PFX(interp_4tap_vert_pp_24x64_avx512);
1381
+ p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vpp = PFX(interp_4tap_vert_pp_24x32_avx512);
1382
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vps = PFX(interp_4tap_vert_ps_24x32_avx512);
1383
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vps = PFX(interp_4tap_vert_ps_24x64_avx512);
1384
+ p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vps = PFX(interp_4tap_vert_ps_24x32_avx512);
1385
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vss = PFX(interp_4tap_vert_ss_24x32_avx512);
1386
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vss = PFX(interp_4tap_vert_ss_24x64_avx512);
1387
+ p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vss = PFX(interp_4tap_vert_ss_24x32_avx512);
1388
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vsp = PFX(interp_4tap_vert_sp_24x32_avx512);
1389
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vsp = PFX(interp_4tap_vert_sp_24x64_avx512);
1390
+ p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vsp = PFX(interp_4tap_vert_sp_24x32_avx512);
1391
+
1392
+ p.pu[LUMA_8x8].luma_vss = PFX(interp_8tap_vert_ss_8x8_avx512);
1393
+ p.pu[LUMA_8x16].luma_vss = PFX(interp_8tap_vert_ss_8x16_avx512);
1394
+ p.pu[LUMA_8x32].luma_vss = PFX(interp_8tap_vert_ss_8x32_avx512);
1395
+ p.pu[LUMA_16x4].luma_vss = PFX(interp_8tap_vert_ss_16x4_avx512);
1396
+ p.pu[LUMA_16x8].luma_vss = PFX(interp_8tap_vert_ss_16x8_avx512);
1397
+ p.pu[LUMA_16x12].luma_vss = PFX(interp_8tap_vert_ss_16x12_avx512);
1398
+ p.pu[LUMA_16x16].luma_vss = PFX(interp_8tap_vert_ss_16x16_avx512);
1399
+ p.pu[LUMA_16x32].luma_vss = PFX(interp_8tap_vert_ss_16x32_avx512);
1400
+ p.pu[LUMA_16x64].luma_vss = PFX(interp_8tap_vert_ss_16x64_avx512);
1401
+ p.pu[LUMA_24x32].luma_vss = PFX(interp_8tap_vert_ss_24x32_avx512);
1402
+ p.pu[LUMA_32x8].luma_vss = PFX(interp_8tap_vert_ss_32x8_avx512);
1403
+ p.pu[LUMA_32x16].luma_vss = PFX(interp_8tap_vert_ss_32x16_avx512);
1404
+ p.pu[LUMA_32x32].luma_vss = PFX(interp_8tap_vert_ss_32x32_avx512);
1405
+ p.pu[LUMA_32x24].luma_vss = PFX(interp_8tap_vert_ss_32x24_avx512);
1406
+ p.pu[LUMA_32x64].luma_vss = PFX(interp_8tap_vert_ss_32x64_avx512);
1407
+ p.pu[LUMA_64x16].luma_vss = PFX(interp_8tap_vert_ss_64x16_avx512);
1408
+ p.pu[LUMA_64x32].luma_vss = PFX(interp_8tap_vert_ss_64x32_avx512);
1409
+ p.pu[LUMA_64x48].luma_vss = PFX(interp_8tap_vert_ss_64x48_avx512);
1410
+ p.pu[LUMA_64x64].luma_vss = PFX(interp_8tap_vert_ss_64x64_avx512);
1411
+ p.pu[LUMA_48x64].luma_vss = PFX(interp_8tap_vert_ss_48x64_avx512);
1412
+
1413
+ p.pu[LUMA_8x8].luma_vsp = PFX(interp_8tap_vert_sp_8x8_avx512);
1414
+ p.pu[LUMA_8x16].luma_vsp = PFX(interp_8tap_vert_sp_8x16_avx512);
1415
+ p.pu[LUMA_8x32].luma_vsp = PFX(interp_8tap_vert_sp_8x32_avx512);
1416
+ p.pu[LUMA_16x4].luma_vsp = PFX(interp_8tap_vert_sp_16x4_avx512);
1417
+ p.pu[LUMA_16x8].luma_vsp = PFX(interp_8tap_vert_sp_16x8_avx512);
1418
+ p.pu[LUMA_16x12].luma_vsp = PFX(interp_8tap_vert_sp_16x12_avx512);
1419
+ p.pu[LUMA_16x16].luma_vsp = PFX(interp_8tap_vert_sp_16x16_avx512);
1420
+ p.pu[LUMA_16x32].luma_vsp = PFX(interp_8tap_vert_sp_16x32_avx512);
1421
+ p.pu[LUMA_16x64].luma_vsp = PFX(interp_8tap_vert_sp_16x64_avx512);
1422
+ p.pu[LUMA_24x32].luma_vsp = PFX(interp_8tap_vert_sp_24x32_avx512);
1423
+ p.pu[LUMA_32x8].luma_vsp = PFX(interp_8tap_vert_sp_32x8_avx512);
1424
+ p.pu[LUMA_32x16].luma_vsp = PFX(interp_8tap_vert_sp_32x16_avx512);
1425
+ p.pu[LUMA_32x32].luma_vsp = PFX(interp_8tap_vert_sp_32x32_avx512);
1426
+ p.pu[LUMA_32x24].luma_vsp = PFX(interp_8tap_vert_sp_32x24_avx512);
1427
+ p.pu[LUMA_32x64].luma_vsp = PFX(interp_8tap_vert_sp_32x64_avx512);
1428
+ p.pu[LUMA_64x16].luma_vsp = PFX(interp_8tap_vert_sp_64x16_avx512);
1429
+ p.pu[LUMA_64x32].luma_vsp = PFX(interp_8tap_vert_sp_64x32_avx512);
1430
+ p.pu[LUMA_64x48].luma_vsp = PFX(interp_8tap_vert_sp_64x48_avx512);
1431
+ p.pu[LUMA_64x64].luma_vsp = PFX(interp_8tap_vert_sp_64x64_avx512);
1432
+ p.pu[LUMA_48x64].luma_vsp = PFX(interp_8tap_vert_sp_48x64_avx512);
1433
+
1434
+ p.pu[LUMA_16x4].luma_vpp = PFX(interp_8tap_vert_pp_16x4_avx512);
1435
+ p.pu[LUMA_16x8].luma_vpp = PFX(interp_8tap_vert_pp_16x8_avx512);
1436
+ p.pu[LUMA_16x12].luma_vpp = PFX(interp_8tap_vert_pp_16x12_avx512);
1437
+ p.pu[LUMA_16x16].luma_vpp = PFX(interp_8tap_vert_pp_16x16_avx512);
1438
+ p.pu[LUMA_16x32].luma_vpp = PFX(interp_8tap_vert_pp_16x32_avx512);
1439
+ p.pu[LUMA_16x64].luma_vpp = PFX(interp_8tap_vert_pp_16x64_avx512);
1440
+ p.pu[LUMA_24x32].luma_vpp = PFX(interp_8tap_vert_pp_24x32_avx512);
1441
+ p.pu[LUMA_32x8].luma_vpp = PFX(interp_8tap_vert_pp_32x8_avx512);
1442
+ p.pu[LUMA_32x16].luma_vpp = PFX(interp_8tap_vert_pp_32x16_avx512);
1443
+ p.pu[LUMA_32x32].luma_vpp = PFX(interp_8tap_vert_pp_32x32_avx512);
1444
+ p.pu[LUMA_32x24].luma_vpp = PFX(interp_8tap_vert_pp_32x24_avx512);
1445
+ p.pu[LUMA_32x64].luma_vpp = PFX(interp_8tap_vert_pp_32x64_avx512);
1446
+ p.pu[LUMA_48x64].luma_vpp = PFX(interp_8tap_vert_pp_48x64_avx512);
1447
+ p.pu[LUMA_64x16].luma_vpp = PFX(interp_8tap_vert_pp_64x16_avx512);
1448
+ p.pu[LUMA_64x32].luma_vpp = PFX(interp_8tap_vert_pp_64x32_avx512);
1449
+ p.pu[LUMA_64x48].luma_vpp = PFX(interp_8tap_vert_pp_64x48_avx512);
1450
+ p.pu[LUMA_64x64].luma_vpp = PFX(interp_8tap_vert_pp_64x64_avx512);
1451
+
1452
+ p.pu[LUMA_16x4].luma_vps = PFX(interp_8tap_vert_ps_16x4_avx512);
1453
+ p.pu[LUMA_16x8].luma_vps = PFX(interp_8tap_vert_ps_16x8_avx512);
1454
+ p.pu[LUMA_16x12].luma_vps = PFX(interp_8tap_vert_ps_16x12_avx512);
1455
+ p.pu[LUMA_16x16].luma_vps = PFX(interp_8tap_vert_ps_16x16_avx512);
1456
+ p.pu[LUMA_16x32].luma_vps = PFX(interp_8tap_vert_ps_16x32_avx512);
1457
+ p.pu[LUMA_16x64].luma_vps = PFX(interp_8tap_vert_ps_16x64_avx512);
1458
+ p.pu[LUMA_24x32].luma_vps = PFX(interp_8tap_vert_ps_24x32_avx512);
1459
+ p.pu[LUMA_32x8].luma_vps = PFX(interp_8tap_vert_ps_32x8_avx512);
1460
+ p.pu[LUMA_32x16].luma_vps = PFX(interp_8tap_vert_ps_32x16_avx512);
1461
+ p.pu[LUMA_32x32].luma_vps = PFX(interp_8tap_vert_ps_32x32_avx512);
1462
+ p.pu[LUMA_32x24].luma_vps = PFX(interp_8tap_vert_ps_32x24_avx512);
1463
+ p.pu[LUMA_32x64].luma_vps = PFX(interp_8tap_vert_ps_32x64_avx512);
1464
+ p.pu[LUMA_48x64].luma_vps = PFX(interp_8tap_vert_ps_48x64_avx512);
1465
+ p.pu[LUMA_64x16].luma_vps = PFX(interp_8tap_vert_ps_64x16_avx512);
1466
+ p.pu[LUMA_64x32].luma_vps = PFX(interp_8tap_vert_ps_64x32_avx512);
1467
+ p.pu[LUMA_64x48].luma_vps = PFX(interp_8tap_vert_ps_64x48_avx512);
1468
+ p.pu[LUMA_64x64].luma_vps = PFX(interp_8tap_vert_ps_64x64_avx512);
1469
+
1470
+ p.cu[BLOCK_8x8].dct = PFX(dct8_avx512);
1471
+ /* TODO: Currently these kernels performance are similar to AVX2 version, we need a to improve them further to ebable
1472
+ * it. Probably a Vtune analysis will help here.
1473
+
1474
+ * p.cu[BLOCK_16x16].dct = PFX(dct16_avx512);
1475
+ * p.cu[BLOCK_32x32].dct = PFX(dct32_avx512); */
1476
+
1477
+ p.cu[BLOCK_8x8].idct = PFX(idct8_avx512);
1478
+ p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
1479
+ p.cu[BLOCK_32x32].idct = PFX(idct32_avx512);
1480
+ p.quant = PFX(quant_avx512);
1481
+ p.nquant = PFX(nquant_avx512);
1482
+ p.denoiseDct = PFX(denoise_dct_avx512);
1483
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
1484
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
1485
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512);
1486
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512);
1487
+
1488
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hps = PFX(interp_4tap_horiz_ps_32x64_avx512);
1489
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
1490
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hps = PFX(interp_4tap_horiz_ps_32x48_avx512);
1491
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
1492
+
1493
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
1494
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
1495
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hps = PFX(interp_4tap_horiz_ps_32x64_avx512);
1496
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512);
1497
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512);
1498
+
1499
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hps = PFX(interp_4tap_horiz_ps_64x64_avx512);
1500
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hps = PFX(interp_4tap_horiz_ps_64x48_avx512);
1501
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hps = PFX(interp_4tap_horiz_ps_64x32_avx512);
1502
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hps = PFX(interp_4tap_horiz_ps_64x16_avx512);
1503
+
1504
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx512);
1505
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx512);
1506
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx512);
1507
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hps = PFX(interp_4tap_horiz_ps_16x12_avx512);
1508
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hps = PFX(interp_4tap_horiz_ps_16x4_avx512);
1509
+
1510
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx512);
1511
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx512);
1512
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_hps = PFX(interp_4tap_horiz_ps_16x64_avx512);
1513
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hps = PFX(interp_4tap_horiz_ps_16x24_avx512);
1514
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx512);
1515
+
1516
+ p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx512);
1517
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx512);
1518
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx512);
1519
+ p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hps = PFX(interp_4tap_horiz_ps_16x12_avx512);
1520
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hps = PFX(interp_4tap_horiz_ps_16x4_avx512);
1521
+ p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hps = PFX(interp_4tap_horiz_ps_16x64_avx512);
1522
+
1523
+ p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hps = PFX(interp_4tap_horiz_ps_48x64_avx512);
1524
+
1525
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hps = PFX(interp_4tap_horiz_ps_8x8_avx512);
1526
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_hps = PFX(interp_4tap_horiz_ps_8x4_avx512);
1527
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_hps = PFX(interp_4tap_horiz_ps_8x16_avx512);
1528
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_hps = PFX(interp_4tap_horiz_ps_8x32_avx512);
1529
+
1530
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_hps = PFX(interp_4tap_horiz_ps_8x8_avx512);
1531
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_hps = PFX(interp_4tap_horiz_ps_8x16_avx512);
1532
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_hps = PFX(interp_4tap_horiz_ps_8x32_avx512);
1533
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_hps = PFX(interp_4tap_horiz_ps_8x12_avx512);
1534
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_hps = PFX(interp_4tap_horiz_ps_8x64_avx512);
1535
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_hps = PFX(interp_4tap_horiz_ps_8x4_avx512);
1536
+
1537
+ p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_hps = PFX(interp_4tap_horiz_ps_8x8_avx512);
1538
+ p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_hps = PFX(interp_4tap_horiz_ps_8x4_avx512);
1539
+ p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_hps = PFX(interp_4tap_horiz_ps_8x16_avx512);
1540
+ p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_hps = PFX(interp_4tap_horiz_ps_8x32_avx512);
1541
+
1542
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_hps = PFX(interp_4tap_horiz_ps_24x32_avx512);
1543
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_hps = PFX(interp_4tap_horiz_ps_24x64_avx512);
1544
+ p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_hps = PFX(interp_4tap_horiz_ps_24x32_avx512);
1545
+
1546
+ //Luma_hps_32xN
1547
+ p.pu[LUMA_32x8].luma_hps = PFX(interp_8tap_horiz_ps_32x8_avx512);
1548
+ p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_avx512);
1549
+ p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_avx512);
1550
+ p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_avx512);
1551
+ p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_avx512);
1552
+ //Luma_hps_64xN
1553
+ p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_avx512);
1554
+ p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_avx512);
1555
+ p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_avx512);
1556
+ p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_avx512);
1557
+ //Luma_hps_16xN
1558
+ p.pu[LUMA_16x4].luma_hps = PFX(interp_8tap_horiz_ps_16x4_avx512);
1559
+ p.pu[LUMA_16x8].luma_hps = PFX(interp_8tap_horiz_ps_16x8_avx512);
1560
+ p.pu[LUMA_16x12].luma_hps = PFX(interp_8tap_horiz_ps_16x12_avx512);
1561
+ p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_avx512);
1562
+ p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_avx512);
1563
+ p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_avx512);
1564
+ //Luma_hps_48x64
1565
+ p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx512);
1566
+ //Luma_hps_24x32
1567
+ p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_avx512);
1568
+ //Luma_hps_8xN
1569
+ p.pu[LUMA_8x4].luma_hps = PFX(interp_8tap_horiz_ps_8x4_avx512);
1570
+ p.pu[LUMA_8x8].luma_hps = PFX(interp_8tap_horiz_ps_8x8_avx512);
1571
+ p.pu[LUMA_8x16].luma_hps = PFX(interp_8tap_horiz_ps_8x16_avx512);
1572
+ p.pu[LUMA_8x32].luma_hps = PFX(interp_8tap_horiz_ps_8x32_avx512);
1573
+ p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_avx512);
1574
+ p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx512);
1575
+ p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_avx512);
1576
+ p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_avx512);
1577
+ p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx512);
1578
+ p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx512);
1579
+ p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx512);
1580
+ p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx512);
1581
+ p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx512);
1582
+ p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_avx512);
1583
+ p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_avx512);
1584
+ p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx512);
1585
+ p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx512);
1586
+ p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_avx512);
1587
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = PFX(pixel_satd_16x32_avx512);
1588
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_avx512);
1589
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_avx512);
1590
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_avx512);
1591
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_avx512);
1592
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_avx512);
1593
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_avx512);
1594
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = PFX(pixel_satd_16x64_avx512);
1595
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = PFX(pixel_satd_16x32_avx512);
1596
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_avx512);
1597
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_avx512);
1598
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_avx512);
1599
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512);
1600
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
1601
+
1602
+ p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx512);
1603
+ p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx512);
1604
+ p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_2_avx512);
1605
+ p.cu[BLOCK_32x32].intra_pred[9] = PFX(intra_pred_ang32_9_avx512);
1606
+ p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx512);
1607
+ p.cu[BLOCK_32x32].intra_pred[11] = PFX(intra_pred_ang32_11_avx512);
1608
+ p.cu[BLOCK_32x32].intra_pred[18] = PFX(intra_pred_ang32_18_avx512);
1609
+ p.cu[BLOCK_32x32].intra_pred[25] = PFX(intra_pred_ang32_25_avx512);
1610
+ p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx512);
1611
+ p.cu[BLOCK_32x32].intra_pred[27] = PFX(intra_pred_ang32_27_avx512);
1612
+ p.cu[BLOCK_32x32].intra_pred[5] = PFX(intra_pred_ang32_5_avx512);
1613
+ p.cu[BLOCK_32x32].intra_pred[31] = PFX(intra_pred_ang32_31_avx512);
1614
+ p.cu[BLOCK_32x32].intra_pred[32] = PFX(intra_pred_ang32_32_avx512);
1615
+ p.cu[BLOCK_32x32].intra_pred[4] = PFX(intra_pred_ang32_4_avx512);
1616
+ p.cu[BLOCK_32x32].intra_pred[30] = PFX(intra_pred_ang32_30_avx512);
1617
+ p.cu[BLOCK_32x32].intra_pred[6] = PFX(intra_pred_ang32_6_avx512);
1618
+ p.cu[BLOCK_32x32].intra_pred[29] = PFX(intra_pred_ang32_29_avx512);
1619
+ p.cu[BLOCK_32x32].intra_pred[7] = PFX(intra_pred_ang32_7_avx512);
1620
+ p.cu[BLOCK_32x32].intra_pred[8] = PFX(intra_pred_ang32_8_avx512);
1621
+ p.cu[BLOCK_32x32].intra_pred[28] = PFX(intra_pred_ang32_28_avx512);
1622
+ p.cu[BLOCK_16x16].intra_pred[9] = PFX(intra_pred_ang16_9_avx512);
1623
+ p.cu[BLOCK_16x16].intra_pred[11] = PFX(intra_pred_ang16_11_avx512);
1624
+ p.cu[BLOCK_16x16].intra_pred[25] = PFX(intra_pred_ang16_25_avx512);
1625
+ p.cu[BLOCK_16x16].intra_pred[27] = PFX(intra_pred_ang16_27_avx512);
1626
+ p.cu[BLOCK_16x16].intra_pred[8] = PFX(intra_pred_ang16_8_avx512);
1627
+ p.cu[BLOCK_16x16].intra_pred[28] = PFX(intra_pred_ang16_28_avx512);
1628
+ p.cu[BLOCK_16x16].intra_pred[5] = PFX(intra_pred_ang16_5_avx512);
1629
+ p.cu[BLOCK_16x16].intra_pred[31] = PFX(intra_pred_ang16_31_avx512);
1630
+ p.cu[BLOCK_16x16].intra_pred[4] = PFX(intra_pred_ang16_4_avx512);
1631
+ p.cu[BLOCK_16x16].intra_pred[32] = PFX(intra_pred_ang16_32_avx512);
1632
+ p.cu[BLOCK_16x16].intra_pred[6] = PFX(intra_pred_ang16_6_avx512);
1633
+ p.cu[BLOCK_16x16].intra_pred[30] = PFX(intra_pred_ang16_30_avx512);
1634
+ p.cu[BLOCK_16x16].intra_pred[7] = PFX(intra_pred_ang16_7_avx512);
1635
+ p.cu[BLOCK_16x16].intra_pred[29] = PFX(intra_pred_ang16_29_avx512);
1636
+ p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x64>;
1637
+ p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x48>;
1638
+ p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x32>;
1639
+ p.pu[LUMA_64x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x16>;
1640
+ p.pu[LUMA_32x8].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x8>;
1641
+ p.pu[LUMA_32x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x16>;
1642
+ p.pu[LUMA_32x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x32>;
1643
+ p.pu[LUMA_32x24].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x24>;
1644
+ p.pu[LUMA_32x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x64>;
1645
+ p.pu[LUMA_16x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x4>;
1646
+ p.pu[LUMA_16x8].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x8>;
1647
+ p.pu[LUMA_16x12].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x12>;
1648
+ p.pu[LUMA_16x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x16>;
1649
+ p.pu[LUMA_16x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x32>;
1650
+ p.pu[LUMA_16x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x64>;
1651
+ p.pu[LUMA_48x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_48x64>;
1652
+
1653
+ p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx512);
1654
+ p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx512);
1655
+ p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx512);
1656
+
1657
+ p.cu[BLOCK_4x4].nonPsyRdoQuant = PFX(nonPsyRdoQuant4_avx512);
1658
+ p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx512);
1659
+ p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx512);
1660
+ p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx512);
1661
+ p.cu[BLOCK_4x4].psyRdoQuant = PFX(psyRdoQuant4_avx512);
1662
+ p.cu[BLOCK_8x8].psyRdoQuant = PFX(psyRdoQuant8_avx512);
1663
+ p.cu[BLOCK_16x16].psyRdoQuant = PFX(psyRdoQuant16_avx512);
1664
+ p.cu[BLOCK_32x32].psyRdoQuant = PFX(psyRdoQuant32_avx512);
1665
+
1666
+ p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_32x32_avx512);
1667
+ p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_64x64_avx512);
1668
+ p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx512);
1669
+ p.cu[BLOCK_64x64].sse_pp = PFX(pixel_ssd_64x64_avx512);
1670
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x32_avx512);
1671
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x64_avx512);
1672
+ p.planecopy_sp_shl = PFX(upShift_16_avx512);
1673
+
1674
+ }
1675
+#endif
1676
}
1677
#else // if HIGH_BIT_DEPTH
1678
1679
1680
//p.frameInitLowres = PFX(frame_init_lowres_core_mmx2);
1681
p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
1682
1683
- ALL_LUMA_TU(blockfill_s, blockfill_s, sse2);
1684
+ ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, sse2);
1685
+ ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, sse2);
1686
ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2);
1687
ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2);
1688
- ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, sse2);
1689
+ ALL_LUMA_TU_S(cpy1Dto2D_shl[ALIGNED], cpy1Dto2D_shl_, sse2);
1690
+ ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, sse2);
1691
ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2);
1692
- ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2);
1693
-
1694
+ ALL_LUMA_TU_S(ssd_s[NONALIGNED], pixel_ssd_s_, sse2);
1695
ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse2);
1696
ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2);
1697
-
1698
p.cu[BLOCK_4x4].intra_pred[2] = PFX(intra_pred_ang4_2_sse2);
1699
p.cu[BLOCK_4x4].intra_pred[3] = PFX(intra_pred_ang4_3_sse2);
1700
p.cu[BLOCK_4x4].intra_pred[4] = PFX(intra_pred_ang4_4_sse2);
1701
1702
p.cu[BLOCK_4x4].intra_pred[33] = PFX(intra_pred_ang4_33_sse2);
1703
1704
p.cu[BLOCK_4x4].intra_pred_allangs = PFX(all_angs_pred_4x4_sse2);
1705
-
1706
- p.cu[BLOCK_4x4].calcresidual = PFX(getResidual4_sse2);
1707
- p.cu[BLOCK_8x8].calcresidual = PFX(getResidual8_sse2);
1708
+ ASSIGN2(p.cu[BLOCK_4x4].calcresidual, getResidual4_sse2);
1709
+ ASSIGN2(p.cu[BLOCK_8x8].calcresidual, getResidual8_sse2);
1710
1711
ALL_LUMA_TU_S(transpose, transpose, sse2);
1712
p.cu[BLOCK_64x64].transpose = PFX(transpose64_sse2);
1713
1714
p.dst4x4 = PFX(dst4_sse2);
1715
1716
p.planecopy_sp = PFX(downShift_16_sse2);
1717
- ALL_CHROMA_420_PU(p2s, filterPixelToShort, sse2);
1718
- ALL_CHROMA_422_PU(p2s, filterPixelToShort, sse2);
1719
- ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
1720
- ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
1721
+ ALL_CHROMA_420_PU(p2s[NONALIGNED], filterPixelToShort, sse2);
1722
+ ALL_CHROMA_422_PU(p2s[NONALIGNED], filterPixelToShort, sse2);
1723
+ ALL_CHROMA_444_PU(p2s[NONALIGNED], filterPixelToShort, sse2);
1724
+ ALL_CHROMA_420_PU(p2s[ALIGNED], filterPixelToShort, sse2);
1725
+ ALL_CHROMA_422_PU(p2s[ALIGNED], filterPixelToShort, sse2);
1726
+ ALL_CHROMA_444_PU(p2s[ALIGNED], filterPixelToShort, sse2);
1727
+ ALL_LUMA_PU(convert_p2s[NONALIGNED], filterPixelToShort, sse2);
1728
+ ALL_LUMA_PU(convert_p2s[ALIGNED], filterPixelToShort, sse2);
1729
ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
1730
p.propagateCost = PFX(mbtree_propagate_cost_sse2);
1731
}
1732
1733
p.pu[LUMA_8x8].luma_hvpp = PFX(interp_8tap_hv_pp_8x8_ssse3);
1734
1735
p.frameInitLowres = PFX(frame_init_lowres_core_ssse3);
1736
- p.scale1D_128to64 = PFX(scale1D_128to64_ssse3);
1737
+ ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
1738
p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
1739
1740
- p.pu[LUMA_8x4].convert_p2s = PFX(filterPixelToShort_8x4_ssse3);
1741
- p.pu[LUMA_8x8].convert_p2s = PFX(filterPixelToShort_8x8_ssse3);
1742
- p.pu[LUMA_8x16].convert_p2s = PFX(filterPixelToShort_8x16_ssse3);
1743
- p.pu[LUMA_8x32].convert_p2s = PFX(filterPixelToShort_8x32_ssse3);
1744
- p.pu[LUMA_16x4].convert_p2s = PFX(filterPixelToShort_16x4_ssse3);
1745
- p.pu[LUMA_16x8].convert_p2s = PFX(filterPixelToShort_16x8_ssse3);
1746
- p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_ssse3);
1747
- p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_ssse3);
1748
- p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_ssse3);
1749
- p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_ssse3);
1750
- p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_ssse3);
1751
- p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_ssse3);
1752
- p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_ssse3);
1753
- p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_ssse3);
1754
- p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_ssse3);
1755
- p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_ssse3);
1756
- p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_ssse3);
1757
- p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_ssse3);
1758
- p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_ssse3);
1759
- p.pu[LUMA_12x16].convert_p2s = PFX(filterPixelToShort_12x16_ssse3);
1760
- p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_ssse3);
1761
- p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_ssse3);
1762
-
1763
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s = PFX(filterPixelToShort_8x2_ssse3);
1764
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].p2s = PFX(filterPixelToShort_8x4_ssse3);
1765
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s = PFX(filterPixelToShort_8x6_ssse3);
1766
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].p2s = PFX(filterPixelToShort_8x8_ssse3);
1767
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].p2s = PFX(filterPixelToShort_8x16_ssse3);
1768
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].p2s = PFX(filterPixelToShort_8x32_ssse3);
1769
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].p2s = PFX(filterPixelToShort_16x4_ssse3);
1770
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s = PFX(filterPixelToShort_16x8_ssse3);
1771
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s = PFX(filterPixelToShort_16x12_ssse3);
1772
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s = PFX(filterPixelToShort_16x16_ssse3);
1773
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s = PFX(filterPixelToShort_16x32_ssse3);
1774
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_ssse3);
1775
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_ssse3);
1776
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_ssse3);
1777
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_ssse3);
1778
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].p2s = PFX(filterPixelToShort_8x4_ssse3);
1779
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].p2s = PFX(filterPixelToShort_8x8_ssse3);
1780
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].p2s = PFX(filterPixelToShort_8x12_ssse3);
1781
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].p2s = PFX(filterPixelToShort_8x16_ssse3);
1782
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].p2s = PFX(filterPixelToShort_8x32_ssse3);
1783
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].p2s = PFX(filterPixelToShort_8x64_ssse3);
1784
- p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].p2s = PFX(filterPixelToShort_12x32_ssse3);
1785
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s = PFX(filterPixelToShort_16x8_ssse3);
1786
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s = PFX(filterPixelToShort_16x16_ssse3);
1787
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s = PFX(filterPixelToShort_16x24_ssse3);
1788
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s = PFX(filterPixelToShort_16x32_ssse3);
1789
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s = PFX(filterPixelToShort_16x64_ssse3);
1790
- p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s = PFX(filterPixelToShort_24x64_ssse3);
1791
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_ssse3);
1792
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_ssse3);
1793
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_ssse3);
1794
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_ssse3);
1795
+ ASSIGN2(p.pu[LUMA_8x4].convert_p2s, filterPixelToShort_8x4_ssse3);
1796
+ ASSIGN2(p.pu[LUMA_8x8].convert_p2s, filterPixelToShort_8x8_ssse3);
1797
+ ASSIGN2(p.pu[LUMA_8x16].convert_p2s, filterPixelToShort_8x16_ssse3);
1798
+ ASSIGN2(p.pu[LUMA_8x32].convert_p2s, filterPixelToShort_8x32_ssse3);
1799
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s, filterPixelToShort_8x2_ssse3);
1800
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].p2s, filterPixelToShort_8x4_ssse3);
1801
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s, filterPixelToShort_8x6_ssse3);
1802
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].p2s, filterPixelToShort_8x8_ssse3);
1803
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].p2s, filterPixelToShort_8x16_ssse3);
1804
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].p2s, filterPixelToShort_8x32_ssse3);
1805
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].p2s, filterPixelToShort_8x4_ssse3);
1806
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].p2s, filterPixelToShort_8x8_ssse3);
1807
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].p2s, filterPixelToShort_8x12_ssse3);
1808
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].p2s, filterPixelToShort_8x16_ssse3);
1809
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].p2s, filterPixelToShort_8x32_ssse3);
1810
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].p2s, filterPixelToShort_8x64_ssse3);
1811
+
1812
+ ASSIGN2(p.pu[LUMA_16x4].convert_p2s, filterPixelToShort_16x4_ssse3);
1813
+ ASSIGN2(p.pu[LUMA_16x8].convert_p2s, filterPixelToShort_16x8_ssse3);
1814
+ ASSIGN2(p.pu[LUMA_16x12].convert_p2s, filterPixelToShort_16x12_ssse3);
1815
+ ASSIGN2(p.pu[LUMA_16x16].convert_p2s, filterPixelToShort_16x16_ssse3);
1816
+ ASSIGN2(p.pu[LUMA_16x32].convert_p2s, filterPixelToShort_16x32_ssse3);
1817
+ ASSIGN2(p.pu[LUMA_16x64].convert_p2s, filterPixelToShort_16x64_ssse3);
1818
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].p2s, filterPixelToShort_16x4_ssse3);
1819
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s, filterPixelToShort_16x8_ssse3);
1820
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s, filterPixelToShort_16x12_ssse3);
1821
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s, filterPixelToShort_16x16_ssse3);
1822
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s, filterPixelToShort_16x32_ssse3);
1823
+
1824
+ ASSIGN2(p.pu[LUMA_32x8].convert_p2s, filterPixelToShort_32x8_ssse3);
1825
+ ASSIGN2(p.pu[LUMA_32x16].convert_p2s, filterPixelToShort_32x16_ssse3);
1826
+ ASSIGN2(p.pu[LUMA_32x24].convert_p2s, filterPixelToShort_32x24_ssse3);
1827
+ ASSIGN2(p.pu[LUMA_32x32].convert_p2s, filterPixelToShort_32x32_ssse3);
1828
+ ASSIGN2(p.pu[LUMA_32x64].convert_p2s, filterPixelToShort_32x64_ssse3);
1829
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s, filterPixelToShort_32x8_ssse3);
1830
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s, filterPixelToShort_32x16_ssse3);
1831
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s, filterPixelToShort_32x24_ssse3);
1832
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s, filterPixelToShort_32x32_ssse3);
1833
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s, filterPixelToShort_32x16_ssse3);
1834
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s, filterPixelToShort_32x32_ssse3);
1835
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s, filterPixelToShort_32x48_ssse3);
1836
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s, filterPixelToShort_32x64_ssse3);
1837
+
1838
+ ASSIGN2(p.pu[LUMA_64x16].convert_p2s, filterPixelToShort_64x16_ssse3);
1839
+ ASSIGN2(p.pu[LUMA_64x32].convert_p2s, filterPixelToShort_64x32_ssse3);
1840
+ ASSIGN2(p.pu[LUMA_64x48].convert_p2s, filterPixelToShort_64x48_ssse3);
1841
+ ASSIGN2(p.pu[LUMA_64x64].convert_p2s, filterPixelToShort_64x64_ssse3);
1842
+ ASSIGN2(p.pu[LUMA_12x16].convert_p2s, filterPixelToShort_12x16_ssse3);
1843
+ ASSIGN2(p.pu[LUMA_24x32].convert_p2s, filterPixelToShort_24x32_ssse3);
1844
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s, filterPixelToShort_24x64_ssse3);
1845
+ ASSIGN2(p.pu[LUMA_48x64].convert_p2s, filterPixelToShort_48x64_ssse3);
1846
+
1847
p.findPosFirstLast = PFX(findPosFirstLast_ssse3);
1848
p.fix8Unpack = PFX(cutree_fix8_unpack_ssse3);
1849
p.fix8Pack = PFX(cutree_fix8_pack_ssse3);
1850
1851
CHROMA_420_CU_BLOCKCOPY(ps, sse4);
1852
CHROMA_422_CU_BLOCKCOPY(ps, sse4);
1853
1854
- p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_sse4);
1855
- p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_sse4);
1856
+ ASSIGN2(p.cu[BLOCK_16x16].calcresidual, getResidual16_sse4);
1857
+ ASSIGN2(p.cu[BLOCK_32x32].calcresidual, getResidual32_sse4);
1858
p.cu[BLOCK_8x8].dct = PFX(dct8_sse4);
1859
p.denoiseDct = PFX(denoise_dct_sse4);
1860
p.quant = PFX(quant_sse4);
1861
1862
1863
p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_sse4);
1864
1865
- p.pu[LUMA_4x4].convert_p2s = PFX(filterPixelToShort_4x4_sse4);
1866
- p.pu[LUMA_4x8].convert_p2s = PFX(filterPixelToShort_4x8_sse4);
1867
- p.pu[LUMA_4x16].convert_p2s = PFX(filterPixelToShort_4x16_sse4);
1868
-
1869
- p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s = PFX(filterPixelToShort_2x4_sse4);
1870
- p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
1871
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].p2s = PFX(filterPixelToShort_4x2_sse4);
1872
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].p2s = PFX(filterPixelToShort_4x4_sse4);
1873
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].p2s = PFX(filterPixelToShort_4x8_sse4);
1874
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].p2s = PFX(filterPixelToShort_4x16_sse4);
1875
- p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].p2s = PFX(filterPixelToShort_6x8_sse4);
1876
- p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
1877
- p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s = PFX(filterPixelToShort_2x16_sse4);
1878
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].p2s = PFX(filterPixelToShort_4x4_sse4);
1879
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].p2s = PFX(filterPixelToShort_4x8_sse4);
1880
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].p2s = PFX(filterPixelToShort_4x16_sse4);
1881
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].p2s = PFX(filterPixelToShort_4x32_sse4);
1882
- p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
1883
+ ASSIGN2(p.pu[LUMA_4x4].convert_p2s, filterPixelToShort_4x4_sse4);
1884
+ ASSIGN2(p.pu[LUMA_4x8].convert_p2s, filterPixelToShort_4x8_sse4);
1885
+ ASSIGN2(p.pu[LUMA_4x16].convert_p2s, filterPixelToShort_4x16_sse4);
1886
+
1887
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s, filterPixelToShort_2x4_sse4);
1888
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s, filterPixelToShort_2x8_sse4);
1889
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].p2s, filterPixelToShort_4x2_sse4);
1890
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].p2s, filterPixelToShort_4x4_sse4);
1891
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].p2s, filterPixelToShort_4x8_sse4);
1892
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].p2s, filterPixelToShort_4x16_sse4);
1893
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].p2s, filterPixelToShort_6x8_sse4);
1894
+
1895
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s, filterPixelToShort_2x8_sse4);
1896
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s, filterPixelToShort_2x16_sse4);
1897
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].p2s, filterPixelToShort_4x4_sse4);
1898
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].p2s, filterPixelToShort_4x8_sse4);
1899
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].p2s, filterPixelToShort_4x16_sse4);
1900
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].p2s, filterPixelToShort_4x32_sse4);
1901
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s, filterPixelToShort_6x16_sse4);
1902
1903
#if X86_64
1904
p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
1905
1906
p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx2);
1907
p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx2);
1908
1909
- p.pu[LUMA_8x4].addAvg = PFX(addAvg_8x4_avx2);
1910
- p.pu[LUMA_8x8].addAvg = PFX(addAvg_8x8_avx2);
1911
- p.pu[LUMA_8x16].addAvg = PFX(addAvg_8x16_avx2);
1912
- p.pu[LUMA_8x32].addAvg = PFX(addAvg_8x32_avx2);
1913
-
1914
- p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_avx2);
1915
-
1916
- p.pu[LUMA_16x4].addAvg = PFX(addAvg_16x4_avx2);
1917
- p.pu[LUMA_16x8].addAvg = PFX(addAvg_16x8_avx2);
1918
- p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_avx2);
1919
- p.pu[LUMA_16x16].addAvg = PFX(addAvg_16x16_avx2);
1920
- p.pu[LUMA_16x32].addAvg = PFX(addAvg_16x32_avx2);
1921
- p.pu[LUMA_16x64].addAvg = PFX(addAvg_16x64_avx2);
1922
-
1923
- p.pu[LUMA_24x32].addAvg = PFX(addAvg_24x32_avx2);
1924
-
1925
- p.pu[LUMA_32x8].addAvg = PFX(addAvg_32x8_avx2);
1926
- p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_avx2);
1927
- p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_avx2);
1928
- p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_avx2);
1929
- p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_avx2);
1930
-
1931
- p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_avx2);
1932
-
1933
- p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_avx2);
1934
- p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx2);
1935
- p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx2);
1936
- p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx2);
1937
-
1938
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg = PFX(addAvg_8x2_avx2);
1939
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg = PFX(addAvg_8x4_avx2);
1940
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg = PFX(addAvg_8x6_avx2);
1941
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg = PFX(addAvg_8x8_avx2);
1942
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg = PFX(addAvg_8x16_avx2);
1943
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg = PFX(addAvg_8x32_avx2);
1944
- p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = PFX(addAvg_12x16_avx2);
1945
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg = PFX(addAvg_16x4_avx2);
1946
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg = PFX(addAvg_16x8_avx2);
1947
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = PFX(addAvg_16x12_avx2);
1948
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg = PFX(addAvg_16x16_avx2);
1949
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg = PFX(addAvg_16x32_avx2);
1950
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg = PFX(addAvg_32x8_avx2);
1951
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_avx2);
1952
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_avx2);
1953
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_avx2);
1954
-
1955
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg = PFX(addAvg_8x4_avx2);
1956
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg = PFX(addAvg_8x8_avx2);
1957
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg = PFX(addAvg_8x12_avx2);
1958
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg = PFX(addAvg_8x16_avx2);
1959
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg = PFX(addAvg_8x32_avx2);
1960
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg = PFX(addAvg_8x64_avx2);
1961
- p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg = PFX(addAvg_12x32_avx2);
1962
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg = PFX(addAvg_16x8_avx2);
1963
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg = PFX(addAvg_16x16_avx2);
1964
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg = PFX(addAvg_16x24_avx2);
1965
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg = PFX(addAvg_16x32_avx2);
1966
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg = PFX(addAvg_16x64_avx2);
1967
- p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg = PFX(addAvg_24x64_avx2);
1968
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_avx2);
1969
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_avx2);
1970
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_avx2);
1971
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_avx2);
1972
+ ASSIGN2(p.pu[LUMA_8x4].addAvg, addAvg_8x4_avx2);
1973
+ ASSIGN2(p.pu[LUMA_8x8].addAvg, addAvg_8x8_avx2);
1974
+ ASSIGN2(p.pu[LUMA_8x16].addAvg, addAvg_8x16_avx2);
1975
+ ASSIGN2(p.pu[LUMA_8x32].addAvg, addAvg_8x32_avx2);
1976
+ ASSIGN2(p.pu[LUMA_12x16].addAvg, addAvg_12x16_avx2);
1977
+ ASSIGN2(p.pu[LUMA_16x4].addAvg, addAvg_16x4_avx2);
1978
+ ASSIGN2(p.pu[LUMA_16x8].addAvg, addAvg_16x8_avx2);
1979
+ ASSIGN2(p.pu[LUMA_16x12].addAvg, addAvg_16x12_avx2);
1980
+ ASSIGN2(p.pu[LUMA_16x16].addAvg, addAvg_16x16_avx2);
1981
+ ASSIGN2(p.pu[LUMA_16x32].addAvg, addAvg_16x32_avx2);
1982
+ ASSIGN2(p.pu[LUMA_16x64].addAvg, addAvg_16x64_avx2);
1983
+ ASSIGN2(p.pu[LUMA_24x32].addAvg, addAvg_24x32_avx2);
1984
+ ASSIGN2(p.pu[LUMA_32x8].addAvg, addAvg_32x8_avx2);
1985
+ ASSIGN2(p.pu[LUMA_32x16].addAvg, addAvg_32x16_avx2);
1986
+ ASSIGN2(p.pu[LUMA_32x24].addAvg, addAvg_32x24_avx2);
1987
+ ASSIGN2(p.pu[LUMA_32x32].addAvg, addAvg_32x32_avx2);
1988
+ ASSIGN2(p.pu[LUMA_32x64].addAvg, addAvg_32x64_avx2);
1989
+ ASSIGN2(p.pu[LUMA_48x64].addAvg, addAvg_48x64_avx2);
1990
+ ASSIGN2(p.pu[LUMA_64x16].addAvg, addAvg_64x16_avx2);
1991
+ ASSIGN2(p.pu[LUMA_64x32].addAvg, addAvg_64x32_avx2);
1992
+ ASSIGN2(p.pu[LUMA_64x48].addAvg, addAvg_64x48_avx2);
1993
+ ASSIGN2(p.pu[LUMA_64x64].addAvg, addAvg_64x64_avx2);
1994
+
1995
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg, addAvg_8x2_avx2);
1996
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg, addAvg_8x4_avx2);
1997
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg, addAvg_8x6_avx2);
1998
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg, addAvg_8x8_avx2);
1999
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg, addAvg_8x16_avx2);
2000
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg, addAvg_8x32_avx2);
2001
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg, addAvg_12x16_avx2);
2002
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg, addAvg_16x4_avx2);
2003
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg, addAvg_16x8_avx2);
2004
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg, addAvg_16x12_avx2);
2005
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg, addAvg_16x16_avx2);
2006
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg, addAvg_16x32_avx2);
2007
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg, addAvg_32x8_avx2);
2008
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg, addAvg_32x16_avx2);
2009
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg, addAvg_32x24_avx2);
2010
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg, addAvg_32x32_avx2);
2011
+
2012
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg, addAvg_8x4_avx2);
2013
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg, addAvg_8x8_avx2);
2014
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg, addAvg_8x12_avx2);
2015
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg, addAvg_8x16_avx2);
2016
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg, addAvg_8x32_avx2);
2017
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg, addAvg_8x64_avx2);
2018
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg, addAvg_12x32_avx2);
2019
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg, addAvg_16x8_avx2);
2020
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg, addAvg_16x16_avx2);
2021
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg, addAvg_16x24_avx2);
2022
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg, addAvg_16x32_avx2);
2023
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg, addAvg_16x64_avx2);
2024
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg, addAvg_24x64_avx2);
2025
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg, addAvg_32x16_avx2);
2026
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg, addAvg_32x32_avx2);
2027
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg, addAvg_32x48_avx2);
2028
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg, addAvg_32x64_avx2);
2029
2030
p.cu[BLOCK_8x8].sa8d = PFX(pixel_sa8d_8x8_avx2);
2031
p.cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_avx2);
2032
2033
p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = PFX(pixel_sa8d_16x16_avx2);
2034
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = PFX(pixel_sa8d_32x32_avx2);
2035
2036
- p.cu[BLOCK_16x16].add_ps = PFX(pixel_add_ps_16x16_avx2);
2037
- p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
2038
- p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx2);
2039
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps = PFX(pixel_add_ps_16x16_avx2);
2040
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
2041
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps = PFX(pixel_add_ps_16x32_avx2);
2042
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx2);
2043
+ ASSIGN2(p.cu[BLOCK_16x16].add_ps, pixel_add_ps_16x16_avx2);
2044
+ ASSIGN2(p.cu[BLOCK_32x32].add_ps, pixel_add_ps_32x32_avx2);
2045
+ ASSIGN2(p.cu[BLOCK_64x64].add_ps, pixel_add_ps_64x64_avx2);
2046
+ ASSIGN2(p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps, pixel_add_ps_16x16_avx2);
2047
+ ASSIGN2(p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps, pixel_add_ps_32x32_avx2);
2048
+ ASSIGN2(p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps, pixel_add_ps_16x32_avx2);
2049
+ ASSIGN2(p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps, pixel_add_ps_32x64_avx2);
2050
2051
p.cu[BLOCK_16x16].sub_ps = PFX(pixel_sub_ps_16x16_avx2);
2052
p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx2);
2053
2054
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx2);
2055
p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sub_ps = PFX(pixel_sub_ps_16x32_avx2);
2056
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx2);
2057
-
2058
- p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2);
2059
- p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2);
2060
- p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_avx2);
2061
- p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx2);
2062
- p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx2);
2063
- p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx2);
2064
-
2065
- p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_avx2);
2066
- p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_avx2);
2067
- p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_avx2);
2068
- p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_avx2);
2069
- p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_avx2);
2070
- p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx2);
2071
- p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx2);
2072
- p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx2);
2073
- p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx2);
2074
- p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx2);
2075
-
2076
+ ASSIGN2(p.pu[LUMA_16x4].pixelavg_pp, pixel_avg_16x4_avx2);
2077
+ ASSIGN2(p.pu[LUMA_16x8].pixelavg_pp, pixel_avg_16x8_avx2);
2078
+ ASSIGN2(p.pu[LUMA_16x12].pixelavg_pp, pixel_avg_16x12_avx2);
2079
+ ASSIGN2(p.pu[LUMA_16x16].pixelavg_pp, pixel_avg_16x16_avx2);
2080
+ ASSIGN2(p.pu[LUMA_16x32].pixelavg_pp, pixel_avg_16x32_avx2);
2081
+ ASSIGN2(p.pu[LUMA_16x64].pixelavg_pp, pixel_avg_16x64_avx2);
2082
+
2083
+ ASSIGN2(p.pu[LUMA_32x64].pixelavg_pp, pixel_avg_32x64_avx2);
2084
+ ASSIGN2(p.pu[LUMA_32x32].pixelavg_pp, pixel_avg_32x32_avx2);
2085
+ ASSIGN2(p.pu[LUMA_32x24].pixelavg_pp, pixel_avg_32x24_avx2);
2086
+ ASSIGN2(p.pu[LUMA_32x16].pixelavg_pp, pixel_avg_32x16_avx2);
2087
+ ASSIGN2(p.pu[LUMA_32x8].pixelavg_pp, pixel_avg_32x8_avx2);
2088
+ ASSIGN2(p.pu[LUMA_48x64].pixelavg_pp, pixel_avg_48x64_avx2);
2089
+ ASSIGN2(p.pu[LUMA_64x64].pixelavg_pp, pixel_avg_64x64_avx2);
2090
+ ASSIGN2(p.pu[LUMA_64x48].pixelavg_pp, pixel_avg_64x48_avx2);
2091
+ ASSIGN2(p.pu[LUMA_64x32].pixelavg_pp, pixel_avg_64x32_avx2);
2092
+ ASSIGN2(p.pu[LUMA_64x16].pixelavg_pp, pixel_avg_64x16_avx2);
2093
p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx2);
2094
p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_avx2);
2095
p.pu[LUMA_8x16].satd = PFX(pixel_satd_8x16_avx2);
2096
2097
p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
2098
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
2099
2100
- p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
2101
- p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
2102
-
2103
+ ASSIGN2(p.cu[BLOCK_16x16].ssd_s, pixel_ssd_s_16_avx2);
2104
+ ASSIGN2(p.cu[BLOCK_32x32].ssd_s, pixel_ssd_s_32_avx2);
2105
p.cu[BLOCK_8x8].copy_cnt = PFX(copy_cnt_8_avx2);
2106
p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx2);
2107
p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx2);
2108
-
2109
- p.cu[BLOCK_16x16].blockfill_s = PFX(blockfill_s_16x16_avx2);
2110
- p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_avx2);
2111
-
2112
- ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, avx2);
2113
+ ASSIGN2(p.cu[BLOCK_16x16].blockfill_s, blockfill_s_16x16_avx2);
2114
+ ALL_LUMA_TU_S(cpy1Dto2D_shl[ALIGNED], cpy1Dto2D_shl_, avx2);
2115
+ ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, avx2);
2116
ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2);
2117
-
2118
p.cu[BLOCK_8x8].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_8_avx2);
2119
p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx2);
2120
p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx2);
2121
2122
p.dequant_normal = PFX(dequant_normal_avx2);
2123
p.dequant_scaling = PFX(dequant_scaling_avx2);
2124
2125
- p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);
2126
- p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx2);
2127
+ ASSIGN2(p.cu[BLOCK_16x16].calcresidual, getResidual16_avx2);
2128
+ ASSIGN2(p.cu[BLOCK_32x32].calcresidual, getResidual32_avx2);
2129
2130
- p.scale1D_128to64 = PFX(scale1D_128to64_avx2);
2131
+ ASSIGN2(p.scale1D_128to64, scale1D_128to64_avx2);
2132
p.weight_pp = PFX(weight_pp_avx2);
2133
p.weight_sp = PFX(weight_sp_avx2);
2134
2135
2136
ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);
2137
p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;
2138
2139
- p.pu[LUMA_16x4].convert_p2s = PFX(filterPixelToShort_16x4_avx2);
2140
- p.pu[LUMA_16x8].convert_p2s = PFX(filterPixelToShort_16x8_avx2);
2141
- p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_avx2);
2142
- p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_avx2);
2143
- p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_avx2);
2144
- p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_avx2);
2145
- p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_avx2);
2146
- p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_avx2);
2147
- p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx2);
2148
- p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx2);
2149
- p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx2);
2150
- p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_avx2);
2151
- p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx2);
2152
- p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx2);
2153
- p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx2);
2154
- p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_avx2);
2155
- p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_avx2);
2156
-
2157
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].p2s = PFX(filterPixelToShort_16x4_avx2);
2158
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s = PFX(filterPixelToShort_16x8_avx2);
2159
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s = PFX(filterPixelToShort_16x12_avx2);
2160
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s = PFX(filterPixelToShort_16x16_avx2);
2161
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s = PFX(filterPixelToShort_16x32_avx2);
2162
- p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].p2s = PFX(filterPixelToShort_24x32_avx2);
2163
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx2);
2164
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx2);
2165
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx2);
2166
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_avx2);
2167
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s = PFX(filterPixelToShort_16x8_avx2);
2168
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s = PFX(filterPixelToShort_16x16_avx2);
2169
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s = PFX(filterPixelToShort_16x24_avx2);
2170
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s = PFX(filterPixelToShort_16x32_avx2);
2171
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s = PFX(filterPixelToShort_16x64_avx2);
2172
- p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s = PFX(filterPixelToShort_24x64_avx2);
2173
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_avx2);
2174
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx2);
2175
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx2);
2176
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx2);
2177
+ ASSIGN2(p.pu[LUMA_16x4].convert_p2s, filterPixelToShort_16x4_avx2);
2178
+ ASSIGN2(p.pu[LUMA_16x8].convert_p2s, filterPixelToShort_16x8_avx2);
2179
+ ASSIGN2(p.pu[LUMA_16x12].convert_p2s, filterPixelToShort_16x12_avx2);
2180
+ ASSIGN2(p.pu[LUMA_16x16].convert_p2s, filterPixelToShort_16x16_avx2);
2181
+ ASSIGN2(p.pu[LUMA_16x32].convert_p2s, filterPixelToShort_16x32_avx2);
2182
+ ASSIGN2(p.pu[LUMA_16x64].convert_p2s, filterPixelToShort_16x64_avx2);
2183
+ ASSIGN2(p.pu[LUMA_32x8].convert_p2s, filterPixelToShort_32x8_avx2);
2184
+ ASSIGN2(p.pu[LUMA_32x16].convert_p2s, filterPixelToShort_32x16_avx2);
2185
+ ASSIGN2(p.pu[LUMA_32x24].convert_p2s, filterPixelToShort_32x24_avx2);
2186
+ ASSIGN2(p.pu[LUMA_32x32].convert_p2s, filterPixelToShort_32x32_avx2);
2187
+ ASSIGN2(p.pu[LUMA_32x64].convert_p2s, filterPixelToShort_32x64_avx2);
2188
+ ASSIGN2(p.pu[LUMA_64x16].convert_p2s, filterPixelToShort_64x16_avx2);
2189
+ ASSIGN2(p.pu[LUMA_64x32].convert_p2s, filterPixelToShort_64x32_avx2);
2190
+ ASSIGN2(p.pu[LUMA_64x48].convert_p2s, filterPixelToShort_64x48_avx2);
2191
+ ASSIGN2(p.pu[LUMA_64x64].convert_p2s, filterPixelToShort_64x64_avx2);
2192
+ ASSIGN2(p.pu[LUMA_48x64].convert_p2s, filterPixelToShort_48x64_avx2);
2193
+ ASSIGN2(p.pu[LUMA_24x32].convert_p2s, filterPixelToShort_24x32_avx2);
2194
+
2195
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].p2s, filterPixelToShort_16x4_avx2);
2196
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s, filterPixelToShort_16x8_avx2);
2197
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s, filterPixelToShort_16x12_avx2);
2198
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s, filterPixelToShort_16x16_avx2);
2199
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s, filterPixelToShort_16x32_avx2);
2200
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].p2s, filterPixelToShort_24x32_avx2);
2201
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s, filterPixelToShort_32x8_avx2);
2202
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s, filterPixelToShort_32x16_avx2);
2203
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s, filterPixelToShort_32x24_avx2);
2204
+ ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s, filterPixelToShort_32x32_avx2);
2205
+
2206
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s, filterPixelToShort_16x8_avx2);
2207
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s, filterPixelToShort_16x16_avx2);
2208
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s, filterPixelToShort_16x24_avx2);
2209
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s, filterPixelToShort_16x32_avx2);
2210
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s, filterPixelToShort_16x64_avx2);
2211
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s, filterPixelToShort_24x64_avx2);
2212
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s, filterPixelToShort_32x16_avx2);
2213
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s, filterPixelToShort_32x32_avx2);
2214
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s, filterPixelToShort_32x48_avx2);
2215
+ ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s, filterPixelToShort_32x64_avx2);
2216
2217
//i422 for chroma_hpp
2218
p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_hpp = PFX(interp_4tap_horiz_pp_12x32_avx2);
2219
2220
p.integral_inith[INTEGRAL_16] = PFX(integral16h_avx2);
2221
p.integral_inith[INTEGRAL_24] = PFX(integral24h_avx2);
2222
p.integral_inith[INTEGRAL_32] = PFX(integral32h_avx2);
2223
+ p.cu[BLOCK_4x4].nonPsyRdoQuant = PFX(nonPsyRdoQuant4_avx2);
2224
+ p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx2);
2225
+ p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx2);
2226
+ p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx2);
2227
+ p.cu[BLOCK_4x4].psyRdoQuant_1p = PFX(psyRdoQuant_1p4_avx2);
2228
+ p.cu[BLOCK_8x8].psyRdoQuant_1p = PFX(psyRdoQuant_1p8_avx2);
2229
+ p.cu[BLOCK_16x16].psyRdoQuant_1p = PFX(psyRdoQuant_1p16_avx2);
2230
+ p.cu[BLOCK_32x32].psyRdoQuant_1p = PFX(psyRdoQuant_1p32_avx2);
2231
+
2232
+ }
2233
+ if (cpuMask & X265_CPU_AVX512)
2234
+ {
2235
+ p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512);
2236
+ // p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512);
2237
+ p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512);
2238
+ p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512);
2239
+ //p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512);
2240
+ p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512);
2241
+ p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512);
2242
+ p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512);
2243
+ p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512);
2244
+
2245
+ p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx512);
2246
+ p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx512);
2247
+ p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512);
2248
+ p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512);
2249
+ p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512);
2250
+ p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_avx512);
2251
+ p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx512);
2252
+ p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512);
2253
+ p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx512);
2254
+ p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx512);
2255
+
2256
+ p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512);
2257
+ p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512);
2258
+ p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512);
2259
+ p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512);
2260
+ p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx512);
2261
+ p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx512);
2262
+ p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512);
2263
+ p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512);
2264
+ p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx512);
2265
+ p.pu[LUMA_48x64].sad_x4 = PFX(pixel_sad_x4_48x64_avx512);
2266
+
2267
+ p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_avx512);
2268
+ p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_avx512);
2269
+ p.pu[LUMA_4x16].satd = PFX(pixel_satd_4x16_avx512);
2270
+ p.pu[LUMA_8x4].satd = PFX(pixel_satd_8x4_avx512);
2271
+ p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx512);
2272
+ p.pu[LUMA_8x16].satd = PFX(pixel_satd_8x16_avx512);
2273
+ p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_avx512);
2274
+ p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx512);
2275
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx512);
2276
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_avx512);
2277
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_avx512);
2278
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_avx512);
2279
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_avx512);
2280
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_avx512);
2281
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_avx512);
2282
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_avx512);
2283
+
2284
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = PFX(pixel_satd_4x4_avx512);
2285
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx512);
2286
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_avx512);
2287
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_avx512);
2288
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = PFX(pixel_satd_8x8_avx512);
2289
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = PFX(pixel_satd_8x16_avx512);
2290
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_avx512);
2291
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_avx512);
2292
+
2293
+ p.cu[BLOCK_8x8].sa8d = PFX(pixel_sa8d_8x8_avx512);
2294
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx512);
2295
+
2296
+ p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx512);
2297
+ p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512);
2298
+ p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx512);
2299
+ p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_avx512);
2300
+ ASSIGN2(p.pu[LUMA_16x64].pixelavg_pp, pixel_avg_16x64_avx512);
2301
+ ASSIGN2(p.pu[LUMA_16x32].pixelavg_pp, pixel_avg_16x32_avx512);
2302
+ ASSIGN2(p.pu[LUMA_16x16].pixelavg_pp, pixel_avg_16x16_avx512);
2303
+ ASSIGN2(p.pu[LUMA_16x12].pixelavg_pp, pixel_avg_16x12_avx512);
2304
+ ASSIGN2(p.pu[LUMA_16x8].pixelavg_pp, pixel_avg_16x8_avx512);
2305
+ ASSIGN2(p.pu[LUMA_16x4].pixelavg_pp, pixel_avg_16x4_avx512);
2306
+ ASSIGN2(p.pu[LUMA_8x32].pixelavg_pp, pixel_avg_8x32_avx512);
2307
+ ASSIGN2(p.pu[LUMA_8x16].pixelavg_pp, pixel_avg_8x16_avx512);
2308
+ ASSIGN2(p.pu[LUMA_8x8].pixelavg_pp, pixel_avg_8x8_avx512);
2309
+ //p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx512);
2310
+ p.pu[LUMA_4x4].sad = PFX(pixel_sad_4x4_avx512);
2311
+ p.pu[LUMA_4x8].sad = PFX(pixel_sad_4x8_avx512);
2312
+ p.pu[LUMA_4x16].sad = PFX(pixel_sad_4x16_avx512);
2313
+ p.pu[LUMA_8x4].sad = PFX(pixel_sad_8x4_avx512);
2314
+ p.pu[LUMA_8x8].sad = PFX(pixel_sad_8x8_avx512);
2315
+ // p.pu[LUMA_8x16].sad = PFX(pixel_sad_8x16_avx512);
2316
+ p.pu[LUMA_16x8].sad = PFX(pixel_sad_16x8_avx512);
2317
+ p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx512);
2318
+
2319
+ p.pu[LUMA_64x64].copy_pp = PFX(blockcopy_pp_64x64_avx512);
2320
+ p.pu[LUMA_64x32].copy_pp = PFX(blockcopy_pp_64x32_avx512);
2321
+ p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_avx512);
2322
+ p.pu[LUMA_64x16].copy_pp = PFX(blockcopy_pp_64x16_avx512);
2323
+ p.pu[LUMA_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx512);
2324
+ p.pu[LUMA_32x24].copy_pp = PFX(blockcopy_pp_32x24_avx512);
2325
+ p.pu[LUMA_32x32].copy_pp = PFX(blockcopy_pp_32x32_avx512);
2326
+ p.pu[LUMA_32x64].copy_pp = PFX(blockcopy_pp_32x64_avx512);
2327
+
2328
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx512);
2329
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = PFX(blockcopy_pp_32x24_avx512);
2330
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = PFX(blockcopy_pp_32x32_avx512);
2331
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx512);
2332
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = PFX(blockcopy_pp_32x32_avx512);
2333
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = PFX(blockcopy_pp_32x48_avx512);
2334
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = PFX(blockcopy_pp_32x64_avx512);
2335
+
2336
+ p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512);
2337
+ p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_avx512);
2338
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = PFX(blockcopy_sp_32x32_avx512);
2339
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = PFX(blockcopy_sp_32x64_avx512);
2340
+
2341
+ p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
2342
+ p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
2343
+ p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx512);
2344
+ p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx512);
2345
+
2346
+ p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_avx512);
2347
+ p.scale1D_128to64[ALIGNED] = PFX(scale1D_128to64_aligned_avx512);
2348
+
2349
+ p.pu[LUMA_64x16].addAvg[NONALIGNED] = PFX(addAvg_64x16_avx512);
2350
+ p.pu[LUMA_64x32].addAvg[NONALIGNED] = PFX(addAvg_64x32_avx512);
2351
+ p.pu[LUMA_64x48].addAvg[NONALIGNED] = PFX(addAvg_64x48_avx512);
2352
+ p.pu[LUMA_64x64].addAvg[NONALIGNED] = PFX(addAvg_64x64_avx512);
2353
+ p.pu[LUMA_32x8].addAvg[NONALIGNED] = PFX(addAvg_32x8_avx512);
2354
+ p.pu[LUMA_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_avx512);
2355
+ p.pu[LUMA_32x24].addAvg[NONALIGNED] = PFX(addAvg_32x24_avx512);
2356
+ p.pu[LUMA_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_avx512);
2357
+ p.pu[LUMA_32x64].addAvg[NONALIGNED] = PFX(addAvg_32x64_avx512);
2358
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg[NONALIGNED] = PFX(addAvg_32x8_avx512);
2359
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_avx512);
2360
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg[NONALIGNED] = PFX(addAvg_32x24_avx512);
2361
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_avx512);
2362
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_avx512);
2363
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg[NONALIGNED] = PFX(addAvg_32x48_avx512);
2364
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg[NONALIGNED] = PFX(addAvg_32x64_avx512);
2365
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_avx512);
2366
+
2367
+ p.pu[LUMA_32x8].addAvg[ALIGNED] = PFX(addAvg_aligned_32x8_avx512);
2368
+ p.pu[LUMA_32x16].addAvg[ALIGNED] = PFX(addAvg_aligned_32x16_avx512);
2369
+ p.pu[LUMA_32x24].addAvg[ALIGNED] = PFX(addAvg_aligned_32x24_avx512);
2370
+ p.pu[LUMA_32x32].addAvg[ALIGNED] = PFX(addAvg_aligned_32x32_avx512);
2371
+ p.pu[LUMA_32x64].addAvg[ALIGNED] = PFX(addAvg_aligned_32x64_avx512);
2372
+ p.pu[LUMA_64x16].addAvg[ALIGNED] = PFX(addAvg_aligned_64x16_avx512);
2373
+ p.pu[LUMA_64x32].addAvg[ALIGNED] = PFX(addAvg_aligned_64x32_avx512);
2374
+ p.pu[LUMA_64x48].addAvg[ALIGNED] = PFX(addAvg_aligned_64x48_avx512);
2375
+ p.pu[LUMA_64x64].addAvg[ALIGNED] = PFX(addAvg_aligned_64x64_avx512);
2376
+
2377
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg[ALIGNED] = PFX(addAvg_aligned_32x8_avx512);
2378
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg[ALIGNED] = PFX(addAvg_aligned_32x16_avx512);
2379
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg[ALIGNED] = PFX(addAvg_aligned_32x24_avx512);
2380
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg[ALIGNED] = PFX(addAvg_aligned_32x32_avx512);
2381
+
2382
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg[ALIGNED] = PFX(addAvg_aligned_32x16_avx512);
2383
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg[ALIGNED] = PFX(addAvg_aligned_32x48_avx512);
2384
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg[ALIGNED] = PFX(addAvg_aligned_32x64_avx512);
2385
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg[ALIGNED] = PFX(addAvg_aligned_32x32_avx512);
2386
+
2387
+ p.cu[BLOCK_32x32].blockfill_s[NONALIGNED] = PFX(blockfill_s_32x32_avx512);
2388
+ p.cu[BLOCK_32x32].blockfill_s[ALIGNED] = PFX(blockfill_s_aligned_32x32_avx512);
2389
+
2390
+ p.cu[BLOCK_64x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_64x64_avx512);
2391
+ p.cu[BLOCK_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_avx512);
2392
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_avx512);
2393
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x64_avx512);
2394
+
2395
+ p.cu[BLOCK_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_aligned_32x32_avx512);
2396
+ p.cu[BLOCK_64x64].add_ps[ALIGNED] = PFX(pixel_add_ps_aligned_64x64_avx512);
2397
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_aligned_32x32_avx512);
2398
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[ALIGNED] = PFX(pixel_add_ps_aligned_32x64_avx512);
2399
+
2400
+ p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx512);
2401
+ p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
2402
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
2403
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx512);
2404
+
2405
+ p.pu[LUMA_64x16].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x16_avx512);
2406
+ p.pu[LUMA_64x32].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x32_avx512);
2407
+ p.pu[LUMA_64x48].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x48_avx512);
2408
+ p.pu[LUMA_64x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x64_avx512);
2409
+ p.pu[LUMA_32x8].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x8_avx2);
2410
+ p.pu[LUMA_32x16].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x16_avx512);
2411
+ p.pu[LUMA_32x24].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x24_avx512);
2412
+ p.pu[LUMA_32x32].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x32_avx512);
2413
+ p.pu[LUMA_32x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x64_avx512);
2414
+ p.pu[LUMA_48x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_48x64_avx512);
2415
+
2416
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s[NONALIGNED] = PFX(filterPixelToShort_32x8_avx512);
2417
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s[NONALIGNED] = PFX(filterPixelToShort_32x16_avx512);
2418
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s[NONALIGNED] = PFX(filterPixelToShort_32x24_avx512);
2419
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s[NONALIGNED] = PFX(filterPixelToShort_32x32_avx512);
2420
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s[NONALIGNED] = PFX(filterPixelToShort_32x16_avx512);
2421
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s[NONALIGNED] = PFX(filterPixelToShort_32x32_avx512);
2422
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s[NONALIGNED] = PFX(filterPixelToShort_32x48_avx512);
2423
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s[NONALIGNED] = PFX(filterPixelToShort_32x64_avx512);
2424
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s[NONALIGNED] = PFX(filterPixelToShort_32x8_avx2);
2425
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s[NONALIGNED] = PFX(filterPixelToShort_32x16_avx512);
2426
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s[NONALIGNED] = PFX(filterPixelToShort_32x24_avx512);
2427
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s[NONALIGNED] = PFX(filterPixelToShort_32x32_avx512);
2428
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].p2s[NONALIGNED] = PFX(filterPixelToShort_32x64_avx512);
2429
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].p2s[NONALIGNED] = PFX(filterPixelToShort_64x16_avx512);
2430
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s[NONALIGNED] = PFX(filterPixelToShort_64x32_avx512);
2431
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s[NONALIGNED] = PFX(filterPixelToShort_64x48_avx512);
2432
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s[NONALIGNED] = PFX(filterPixelToShort_64x64_avx512);
2433
+
2434
+ p.pu[LUMA_64x16].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x16_avx512);
2435
+ p.pu[LUMA_64x32].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x32_avx512);
2436
+ p.pu[LUMA_64x48].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x48_avx512);
2437
+ p.pu[LUMA_64x64].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x64_avx512);
2438
+ p.pu[LUMA_32x8].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x8_avx512);
2439
+ p.pu[LUMA_32x16].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x16_avx512);
2440
+ p.pu[LUMA_32x24].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x24_avx512);
2441
+ p.pu[LUMA_32x32].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x32_avx512);
2442
+ p.pu[LUMA_32x64].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x64_avx512);
2443
+ p.pu[LUMA_48x64].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_48x64_avx512);
2444
+
2445
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x8_avx512);
2446
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x16_avx512);
2447
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x24_avx512);
2448
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x32_avx512);
2449
+
2450
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x16_avx512);
2451
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x32_avx512);
2452
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x48_avx512);
2453
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x64_avx512);
2454
+
2455
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x8_avx512);
2456
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x16_avx512);
2457
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x24_avx512);
2458
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x32_avx512);
2459
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x64_avx512);
2460
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x16_avx512);
2461
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x32_avx512);
2462
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x48_avx512);
2463
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x64_avx512);
2464
+
2465
+ p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
2466
+ p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
2467
+ p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_16x16_avx512);
2468
+ p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32_avx512);
2469
+ p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_32_avx512);
2470
+ p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16_avx512);
2471
+ p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_aligned_16_avx512);
2472
+ p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
2473
+ p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
2474
+ p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_avx512);
2475
+ p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
2476
+
2477
+ p.cu[BLOCK_32x32].calcresidual[NONALIGNED] = PFX(getResidual32_avx512);
2478
+ p.cu[BLOCK_32x32].calcresidual[ALIGNED] = PFX(getResidual_aligned32_avx512);
2479
+ p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512);
2480
+ p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
2481
+ p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32_avx512);
2482
+ p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_aligned_32_avx512);
2483
+ p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16_avx512);
2484
+ p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32_avx512);
2485
+
2486
+ p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16_avx512);
2487
+ p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx512);
2488
+
2489
+ p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
2490
+ p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
2491
+
2492
+ p.dequant_normal = PFX(dequant_normal_avx512);
2493
+ p.dequant_scaling = PFX(dequant_scaling_avx512);
2494
+ //i444 chroma_hpp
2495
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512);
2496
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512);
2497
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = PFX(interp_4tap_horiz_pp_64x48_avx512);
2498
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = PFX(interp_4tap_horiz_pp_64x16_avx512);
2499
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
2500
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512);
2501
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512);
2502
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512);
2503
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
2504
+ p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512);
2505
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512);
2506
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512);
2507
+ p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx512);
2508
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx512);
2509
+ p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_avx512);
2510
+ p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = PFX(interp_4tap_horiz_pp_48x64_avx512);
2511
+
2512
+ //i422 chroma_hpp
2513
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512);
2514
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512);
2515
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512);
2516
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_avx512);
2517
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hpp = PFX(interp_4tap_horiz_pp_16x24_avx512);
2518
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
2519
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
2520
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512);
2521
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512);
2522
+
2523
+ //i420 chroma_hpp
2524
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx512);
2525
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512);
2526
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx512);
2527
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512);
2528
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512);
2529
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
2530
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
2531
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512);
2532
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512);
2533
+
2534
+ p.weight_pp = PFX(weight_pp_avx512);
2535
+ p.weight_sp = PFX(weight_sp_avx512);
2536
+
2537
+ //i444 chroma_hps
2538
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hps = PFX(interp_4tap_horiz_ps_64x64_avx512);
2539
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hps = PFX(interp_4tap_horiz_ps_64x32_avx512);
2540
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hps = PFX(interp_4tap_horiz_ps_64x48_avx512);
2541
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hps = PFX(interp_4tap_horiz_ps_64x16_avx512);
2542
+
2543
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
2544
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
2545
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hps = PFX(interp_4tap_horiz_ps_32x64_avx512);
2546
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512);
2547
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512);
2548
+
2549
+ //i422 chroma_hps
2550
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
2551
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
2552
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hps = PFX(interp_4tap_horiz_ps_32x64_avx512);
2553
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hps = PFX(interp_4tap_horiz_ps_32x48_avx512);
2554
+
2555
+ //i420 chroma_hps
2556
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
2557
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
2558
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512);
2559
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512);
2560
+
2561
+ p.pu[LUMA_16x4].luma_hpp = PFX(interp_8tap_horiz_pp_16x4_avx512);
2562
+ p.pu[LUMA_16x8].luma_hpp = PFX(interp_8tap_horiz_pp_16x8_avx512);
2563
+ p.pu[LUMA_16x12].luma_hpp = PFX(interp_8tap_horiz_pp_16x12_avx512);
2564
+ p.pu[LUMA_16x16].luma_hpp = PFX(interp_8tap_horiz_pp_16x16_avx512);
2565
+ p.pu[LUMA_16x32].luma_hpp = PFX(interp_8tap_horiz_pp_16x32_avx512);
2566
+ p.pu[LUMA_16x64].luma_hpp = PFX(interp_8tap_horiz_pp_16x64_avx512);
2567
+ p.pu[LUMA_32x8].luma_hpp = PFX(interp_8tap_horiz_pp_32x8_avx512);
2568
+ p.pu[LUMA_32x16].luma_hpp = PFX(interp_8tap_horiz_pp_32x16_avx512);
2569
+ p.pu[LUMA_32x24].luma_hpp = PFX(interp_8tap_horiz_pp_32x24_avx512);
2570
+ p.pu[LUMA_32x32].luma_hpp = PFX(interp_8tap_horiz_pp_32x32_avx512);
2571
+ p.pu[LUMA_32x64].luma_hpp = PFX(interp_8tap_horiz_pp_32x64_avx512);
2572
+ p.pu[LUMA_64x16].luma_hpp = PFX(interp_8tap_horiz_pp_64x16_avx512);
2573
+ p.pu[LUMA_64x32].luma_hpp = PFX(interp_8tap_horiz_pp_64x32_avx512);
2574
+ p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_avx512);
2575
+ p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_avx512);
2576
+ p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx512);
2577
+ ASSIGN2(p.pu[LUMA_64x16].pixelavg_pp, pixel_avg_64x16_avx512);
2578
+ ASSIGN2(p.pu[LUMA_64x32].pixelavg_pp, pixel_avg_64x32_avx512);
2579
+ ASSIGN2(p.pu[LUMA_64x48].pixelavg_pp, pixel_avg_64x48_avx512);
2580
+ ASSIGN2(p.pu[LUMA_64x64].pixelavg_pp, pixel_avg_64x64_avx512);
2581
+ //luma hps
2582
+ p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_avx512);
2583
+ p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_avx512);
2584
+ p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_avx512);
2585
+ p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_avx512);
2586
+
2587
+ p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_avx512);
2588
+ p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_avx512);
2589
+ p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_avx512);
2590
+ p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_avx512);
2591
+ p.pu[LUMA_32x8].luma_hps = PFX(interp_8tap_horiz_ps_32x8_avx512);
2592
+
2593
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx512);
2594
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hps = PFX(interp_4tap_horiz_ps_16x12_avx512);
2595
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx512);
2596
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hps = PFX(interp_4tap_horiz_ps_16x4_avx512);
2597
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx512);
2598
+
2599
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx512);
2600
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx512);
2601
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx512);
2602
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_hps = PFX(interp_4tap_horiz_ps_16x64_avx512);
2603
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hps = PFX(interp_4tap_horiz_ps_16x24_avx512);
2604
+
2605
+ p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx512);
2606
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx512);
2607
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx512);
2608
+ p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hps = PFX(interp_4tap_horiz_ps_16x12_avx512);
2609
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hps = PFX(interp_4tap_horiz_ps_16x4_avx512);
2610
+ p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hps = PFX(interp_4tap_horiz_ps_16x64_avx512);
2611
+
2612
+ p.pu[LUMA_16x8].luma_hps = PFX(interp_8tap_horiz_ps_16x8_avx512);
2613
+ p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_avx512);
2614
+ p.pu[LUMA_16x12].luma_hps = PFX(interp_8tap_horiz_ps_16x12_avx512);
2615
+ p.pu[LUMA_16x4].luma_hps = PFX(interp_8tap_horiz_ps_16x4_avx512);
2616
+ p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_avx512);
2617
+ p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_avx512);
2618
+
2619
+ p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx512);
2620
+ p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hps = PFX(interp_4tap_horiz_ps_48x64_avx512);
2621
+
2622
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx512);
2623
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512);
2624
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx512);
2625
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512);
2626
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512);
2627
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx512);
2628
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512);
2629
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512);
2630
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512);
2631
+
2632
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vss = PFX(interp_4tap_vert_ss_8x4_avx512);
2633
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx512);
2634
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx512);
2635
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx512);
2636
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vss = PFX(interp_4tap_vert_ss_16x4_avx512);
2637
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vss = PFX(interp_4tap_vert_ss_16x8_avx512);
2638
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vss = PFX(interp_4tap_vert_ss_16x12_avx512);
2639
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx512);
2640
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx512);
2641
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vss = PFX(interp_4tap_vert_ss_24x32_avx512);
2642
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vss = PFX(interp_4tap_vert_ss_32x8_avx512);
2643
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512);
2644
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vss = PFX(interp_4tap_vert_ss_32x24_avx512);
2645
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512);
2646
+
2647
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vsp = PFX(interp_4tap_vert_sp_16x4_avx512);
2648
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_avx512);
2649
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vsp = PFX(interp_4tap_vert_sp_16x12_avx512);
2650
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_avx512);
2651
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_avx512);
2652
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vsp = PFX(interp_4tap_vert_sp_32x8_avx512);
2653
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx512);
2654
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vsp = PFX(interp_4tap_vert_sp_32x24_avx512);
2655
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx512);
2656
+
2657
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512);
2658
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512);
2659
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = PFX(interp_4tap_vert_pp_16x24_avx512);
2660
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512);
2661
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_avx512);
2662
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = PFX(interp_4tap_vert_pp_16x24_avx512);
2663
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512);
2664
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512);
2665
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = PFX(interp_4tap_vert_pp_32x48_avx512);
2666
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx512);
2667
+
2668
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vss = PFX(interp_4tap_vert_ss_8x4_avx512);
2669
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx512);
2670
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vss = PFX(interp_4tap_vert_ss_8x12_avx512);
2671
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx512);
2672
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx512);
2673
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vss = PFX(interp_4tap_vert_ss_8x64_avx512);
2674
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vss = PFX(interp_4tap_vert_ss_16x8_avx512);
2675
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx512);
2676
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vss = PFX(interp_4tap_vert_ss_16x24_avx512);
2677
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx512);
2678
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vss = PFX(interp_4tap_vert_ss_16x64_avx512);
2679
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vss = PFX(interp_4tap_vert_ss_24x64_avx512);
2680
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512);
2681
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512);
2682
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vss = PFX(interp_4tap_vert_ss_32x48_avx512);
2683
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vss = PFX(interp_4tap_vert_ss_32x64_avx512);
2684
+
2685
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_avx512);
2686
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_avx512);
2687
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vsp = PFX(interp_4tap_vert_sp_16x24_avx512);
2688
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_avx512);
2689
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vsp = PFX(interp_4tap_vert_sp_16x64_avx512);
2690
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx512);
2691
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx512);
2692
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vsp = PFX(interp_4tap_vert_sp_32x48_avx512);
2693
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_avx512);
2694
+
2695
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx512);
2696
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512);
2697
+ p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx512);
2698
+ p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512);
2699
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512);
2700
+ p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_avx512);
2701
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx512);
2702
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512);
2703
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512);
2704
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512);
2705
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx512);
2706
+ p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vpp = PFX(interp_4tap_vert_pp_48x64_avx512);
2707
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vpp = PFX(interp_4tap_vert_pp_64x64_avx512);
2708
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vpp = PFX(interp_4tap_vert_pp_64x48_avx512);
2709
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = PFX(interp_4tap_vert_pp_64x32_avx512);
2710
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx512);
2711
+
2712
+ p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vss = PFX(interp_4tap_vert_ss_8x4_avx512);
2713
+ p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx512);
2714
+ p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx512);
2715
+ p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx512);
2716
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vss = PFX(interp_4tap_vert_ss_16x4_avx512);
2717
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vss = PFX(interp_4tap_vert_ss_16x8_avx512);
2718
+ p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vss = PFX(interp_4tap_vert_ss_16x12_avx512);
2719
+ p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx512);
2720
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx512);
2721
+ p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vss = PFX(interp_4tap_vert_ss_16x64_avx512);
2722
+ p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vss = PFX(interp_4tap_vert_ss_24x32_avx512);
2723
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vss = PFX(interp_4tap_vert_ss_32x8_avx512);
2724
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512);
2725
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vss = PFX(interp_4tap_vert_ss_32x24_avx512);
2726
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512);
2727
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vss = PFX(interp_4tap_vert_ss_32x64_avx512);
2728
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vss = PFX(interp_4tap_vert_ss_64x64_avx512);
2729
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vss = PFX(interp_4tap_vert_ss_64x48_avx512);
2730
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vss = PFX(interp_4tap_vert_ss_64x32_avx512);
2731
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vss = PFX(interp_4tap_vert_ss_64x16_avx512);
2732
+ p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vss = PFX(interp_4tap_vert_ss_48x64_avx512);
2733
+
2734
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vsp = PFX(interp_4tap_vert_sp_16x4_avx512);
2735
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_avx512);
2736
+ p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vsp = PFX(interp_4tap_vert_sp_16x12_avx512);
2737
+ p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_avx512);
2738
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_avx512);
2739
+ p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vsp = PFX(interp_4tap_vert_sp_16x64_avx512);
2740
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vsp = PFX(interp_4tap_vert_sp_32x8_avx512);
2741
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx512);
2742
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vsp = PFX(interp_4tap_vert_sp_32x24_avx512);
2743
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx512);
2744
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_avx512);
2745
+ p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp = PFX(interp_4tap_vert_sp_48x64_avx512);
2746
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx512);
2747
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = PFX(interp_4tap_vert_sp_64x48_avx512);
2748
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vsp = PFX(interp_4tap_vert_sp_64x32_avx512);
2749
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vsp = PFX(interp_4tap_vert_sp_64x16_avx512);
2750
+
2751
+ p.pu[LUMA_8x8].luma_vss = PFX(interp_8tap_vert_ss_8x8_avx512);
2752
+ p.pu[LUMA_8x16].luma_vss = PFX(interp_8tap_vert_ss_8x16_avx512);
2753
+ p.pu[LUMA_8x32].luma_vss = PFX(interp_8tap_vert_ss_8x32_avx512);
2754
+ p.pu[LUMA_16x4].luma_vss = PFX(interp_8tap_vert_ss_16x4_avx512);
2755
+ p.pu[LUMA_16x8].luma_vss = PFX(interp_8tap_vert_ss_16x8_avx512);
2756
+ p.pu[LUMA_16x12].luma_vss = PFX(interp_8tap_vert_ss_16x12_avx512);
2757
+ p.pu[LUMA_16x16].luma_vss = PFX(interp_8tap_vert_ss_16x16_avx512);
2758
+ p.pu[LUMA_16x32].luma_vss = PFX(interp_8tap_vert_ss_16x32_avx512);
2759
+ p.pu[LUMA_16x64].luma_vss = PFX(interp_8tap_vert_ss_16x64_avx512);
2760
+ p.pu[LUMA_24x32].luma_vss = PFX(interp_8tap_vert_ss_24x32_avx512);
2761
+ p.pu[LUMA_32x64].luma_vss = PFX(interp_8tap_vert_ss_32x64_avx512);
2762
+ p.pu[LUMA_32x32].luma_vss = PFX(interp_8tap_vert_ss_32x32_avx512);
2763
+ p.pu[LUMA_32x24].luma_vss = PFX(interp_8tap_vert_ss_32x24_avx512);
2764
+ p.pu[LUMA_32x16].luma_vss = PFX(interp_8tap_vert_ss_32x16_avx512);
2765
+ p.pu[LUMA_32x8].luma_vss = PFX(interp_8tap_vert_ss_32x8_avx512);
2766
+ p.pu[LUMA_48x64].luma_vss = PFX(interp_8tap_vert_ss_48x64_avx512);
2767
+ p.pu[LUMA_64x64].luma_vss = PFX(interp_8tap_vert_ss_64x64_avx512);
2768
+ p.pu[LUMA_64x48].luma_vss = PFX(interp_8tap_vert_ss_64x48_avx512);
2769
+ p.pu[LUMA_64x32].luma_vss = PFX(interp_8tap_vert_ss_64x32_avx512);
2770
+ p.pu[LUMA_64x16].luma_vss = PFX(interp_8tap_vert_ss_64x16_avx512);
2771
+
2772
+ p.pu[LUMA_16x64].luma_vpp = PFX(interp_8tap_vert_pp_16x64_avx512);
2773
+ p.pu[LUMA_16x32].luma_vpp = PFX(interp_8tap_vert_pp_16x32_avx512);
2774
+ p.pu[LUMA_16x16].luma_vpp = PFX(interp_8tap_vert_pp_16x16_avx512);
2775
+ p.pu[LUMA_16x8].luma_vpp = PFX(interp_8tap_vert_pp_16x8_avx512);
2776
+ p.pu[LUMA_32x64].luma_vpp = PFX(interp_8tap_vert_pp_32x64_avx512);
2777
+ p.pu[LUMA_32x32].luma_vpp = PFX(interp_8tap_vert_pp_32x32_avx512);
2778
+ p.pu[LUMA_32x24].luma_vpp = PFX(interp_8tap_vert_pp_32x24_avx512);
2779
+ p.pu[LUMA_32x16].luma_vpp = PFX(interp_8tap_vert_pp_32x16_avx512);
2780
+ p.pu[LUMA_32x8].luma_vpp = PFX(interp_8tap_vert_pp_32x8_avx512);
2781
+ p.pu[LUMA_48x64].luma_vpp = PFX(interp_8tap_vert_pp_48x64_avx512);
2782
+ p.pu[LUMA_64x64].luma_vpp = PFX(interp_8tap_vert_pp_64x64_avx512);
2783
+ p.pu[LUMA_64x48].luma_vpp = PFX(interp_8tap_vert_pp_64x48_avx512);
2784
+ p.pu[LUMA_64x32].luma_vpp = PFX(interp_8tap_vert_pp_64x32_avx512);
2785
+ p.pu[LUMA_64x16].luma_vpp = PFX(interp_8tap_vert_pp_64x16_avx512);
2786
+ p.pu[LUMA_16x4].luma_vsp = PFX(interp_8tap_vert_sp_16x4_avx512);
2787
+ p.pu[LUMA_16x8].luma_vsp = PFX(interp_8tap_vert_sp_16x8_avx512);
2788
+ p.pu[LUMA_16x12].luma_vsp = PFX(interp_8tap_vert_sp_16x12_avx512);
2789
+ p.pu[LUMA_16x16].luma_vsp = PFX(interp_8tap_vert_sp_16x16_avx512);
2790
+ p.pu[LUMA_16x32].luma_vsp = PFX(interp_8tap_vert_sp_16x32_avx512);
2791
+ p.pu[LUMA_16x64].luma_vsp = PFX(interp_8tap_vert_sp_16x64_avx512);
2792
+ p.pu[LUMA_32x64].luma_vsp = PFX(interp_8tap_vert_sp_32x64_avx512);
2793
+ p.pu[LUMA_32x32].luma_vsp = PFX(interp_8tap_vert_sp_32x32_avx512);
2794
+ p.pu[LUMA_32x24].luma_vsp = PFX(interp_8tap_vert_sp_32x24_avx512);
2795
+ p.pu[LUMA_32x16].luma_vsp = PFX(interp_8tap_vert_sp_32x16_avx512);
2796
+ p.pu[LUMA_32x8].luma_vsp = PFX(interp_8tap_vert_sp_32x8_avx512);
2797
+ p.pu[LUMA_48x64].luma_vsp = PFX(interp_8tap_vert_sp_48x64_avx512);
2798
+ p.pu[LUMA_64x64].luma_vsp = PFX(interp_8tap_vert_sp_64x64_avx512);
2799
+ p.pu[LUMA_64x48].luma_vsp = PFX(interp_8tap_vert_sp_64x48_avx512);
2800
+ p.pu[LUMA_64x32].luma_vsp = PFX(interp_8tap_vert_sp_64x32_avx512);
2801
+ p.pu[LUMA_64x16].luma_vsp = PFX(interp_8tap_vert_sp_64x16_avx512);
2802
+
2803
+ p.cu[BLOCK_8x8].dct = PFX(dct8_avx512);
2804
+ /* TODO: Currently these kernels performance are similar to AVX2 version, we need a to improve them further to ebable
2805
+ * it. Probably a Vtune analysis will help here.
2806
+
2807
+ * p.cu[BLOCK_16x16].dct = PFX(dct16_avx512);
2808
+ * p.cu[BLOCK_32x32].dct = PFX(dct32_avx512); */
2809
+
2810
+ p.cu[BLOCK_8x8].idct = PFX(idct8_avx512);
2811
+ p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
2812
+ p.cu[BLOCK_32x32].idct = PFX(idct32_avx512);
2813
+ p.quant = PFX(quant_avx512);
2814
+ p.nquant = PFX(nquant_avx512);
2815
+ p.denoiseDct = PFX(denoise_dct_avx512);
2816
+
2817
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vps = PFX(interp_4tap_vert_ps_64x64_avx512);
2818
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = PFX(interp_4tap_vert_ps_64x48_avx512);
2819
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = PFX(interp_4tap_vert_ps_64x32_avx512);
2820
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vps = PFX(interp_4tap_vert_ps_64x16_avx512);
2821
+
2822
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512);
2823
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx512);
2824
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512);
2825
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx512);
2826
+
2827
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512);
2828
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512);
2829
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx512);
2830
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vps = PFX(interp_4tap_vert_ps_32x48_avx512);
2831
+
2832
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512);
2833
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512);
2834
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx512);
2835
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx512);
2836
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx512);
2837
+
2838
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vps = PFX(interp_4tap_vert_ps_16x4_avx512);
2839
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx512);
2840
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vps = PFX(interp_4tap_vert_ps_16x12_avx512);
2841
+ //p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx512);
2842
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx512);
2843
+
2844
+ /*p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx512);
2845
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx512);
2846
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx512);
2847
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vps = PFX(interp_4tap_vert_ps_16x64_avx512);
2848
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vps = PFX(interp_4tap_vert_ps_16x24_avx512);*/
2849
+
2850
+ //p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx512);
2851
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx512);
2852
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx512);
2853
+ //p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vps = PFX(interp_4tap_vert_ps_16x12_avx512);
2854
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vps = PFX(interp_4tap_vert_ps_16x4_avx512);
2855
+ p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vps = PFX(interp_4tap_vert_ps_16x64_avx512);
2856
+ p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx512);
2857
+ p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx512);
2858
+ p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx512);
2859
+
2860
+ p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vps = PFX(interp_4tap_vert_ps_48x64_avx512);
2861
+
2862
+ p.pu[LUMA_64x16].luma_vps = PFX(interp_8tap_vert_ps_64x16_avx512);
2863
+ p.pu[LUMA_64x32].luma_vps = PFX(interp_8tap_vert_ps_64x32_avx512);
2864
+ p.pu[LUMA_64x48].luma_vps = PFX(interp_8tap_vert_ps_64x48_avx512);
2865
+ p.pu[LUMA_64x64].luma_vps = PFX(interp_8tap_vert_ps_64x64_avx512);
2866
+
2867
+ p.pu[LUMA_32x8].luma_vps = PFX(interp_8tap_vert_ps_32x8_avx512);
2868
+ p.pu[LUMA_32x16].luma_vps = PFX(interp_8tap_vert_ps_32x16_avx512);
2869
+ p.pu[LUMA_32x32].luma_vps = PFX(interp_8tap_vert_ps_32x32_avx512);
2870
+ p.pu[LUMA_32x24].luma_vps = PFX(interp_8tap_vert_ps_32x24_avx512);
2871
+ p.pu[LUMA_32x64].luma_vps = PFX(interp_8tap_vert_ps_32x64_avx512);
2872
+
2873
+ p.pu[LUMA_16x8].luma_vps = PFX(interp_8tap_vert_ps_16x8_avx512);
2874
+ p.pu[LUMA_16x16].luma_vps = PFX(interp_8tap_vert_ps_16x16_avx512);
2875
+ p.pu[LUMA_16x32].luma_vps = PFX(interp_8tap_vert_ps_16x32_avx512);
2876
+ //p.pu[LUMA_16x64].luma_vps = PFX(interp_8tap_vert_ps_16x64_avx512);
2877
+ p.pu[LUMA_48x64].luma_vps = PFX(interp_8tap_vert_ps_48x64_avx512);
2878
+
2879
+ p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x64>;
2880
+ p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x48>;
2881
+ p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x32>;
2882
+ p.pu[LUMA_64x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x16>;
2883
+ p.pu[LUMA_32x8].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x8>;
2884
+ p.pu[LUMA_32x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x16>;
2885
+ p.pu[LUMA_32x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x32>;
2886
+ p.pu[LUMA_32x24].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x24>;
2887
+ p.pu[LUMA_32x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x64>;
2888
+ p.pu[LUMA_16x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x4>;
2889
+ p.pu[LUMA_16x8].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x8>;
2890
+ p.pu[LUMA_16x12].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x12>;
2891
+ p.pu[LUMA_16x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x16>;
2892
+ p.pu[LUMA_16x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x32>;
2893
+ p.pu[LUMA_16x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x64>;
2894
+ p.pu[LUMA_48x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_48x64>;
2895
+
2896
+ p.cu[BLOCK_4x4].nonPsyRdoQuant = PFX(nonPsyRdoQuant4_avx512);
2897
+ p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx512);
2898
+ p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx512);
2899
+ p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx512);
2900
+ p.cu[BLOCK_4x4].psyRdoQuant = PFX(psyRdoQuant4_avx512);
2901
+ p.cu[BLOCK_8x8].psyRdoQuant = PFX(psyRdoQuant8_avx512);
2902
+ p.cu[BLOCK_16x16].psyRdoQuant = PFX(psyRdoQuant16_avx512);
2903
+ p.cu[BLOCK_32x32].psyRdoQuant = PFX(psyRdoQuant32_avx512);
2904
+ p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx512);
2905
+ p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx512);
2906
+ p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx512);
2907
+ p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx512);
2908
+ p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx512);
2909
+ p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_avx512);
2910
+ p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_avx512);
2911
+ p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx512);
2912
+ p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx512);
2913
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_avx512);
2914
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_avx512);
2915
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_avx512);
2916
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_avx512);
2917
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_avx512);
2918
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_avx512);
2919
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512);
2920
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
2921
+ p.planecopy_sp_shl = PFX(upShift_16_avx512);
2922
+ p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16x16_avx512);
2923
+ p.cu[BLOCK_32x32].count_nonzero = PFX(count_nonzero_32x32_avx512);
2924
2925
}
2926
#endif
2927
2928
// CPU dispatcher function
2929
void PFX(intel_cpu_indicator_init)(void)
2930
{
2931
- uint32_t cpu = x265::cpu_detect();
2932
+ uint32_t cpu = x265::cpu_detect(false);
2933
2934
if (cpu & X265_CPU_AVX)
2935
__intel_cpu_indicator = 0x20000;
2936
x265_2.7.tar.gz/source/common/x86/blockcopy8.asm -> x265_2.9.tar.gz/source/common/x86/blockcopy8.asm
Changed
1157
1
2
%include "x86inc.asm"
3
%include "x86util.asm"
4
5
-SECTION_RODATA 32
6
+SECTION_RODATA 64
7
+
8
+ALIGN 64
9
+const shuf1_avx512, dq 0, 2, 4, 6, 1, 3, 5, 7
10
11
cextern pb_4
12
cextern pb_1
13
14
BLOCKCOPY_PP_W64_H4_avx 64, 48
15
BLOCKCOPY_PP_W64_H4_avx 64, 64
16
17
+;----------------------------------------------------------------------------------------------
18
+; blockcopy_pp avx512 code start
19
+;----------------------------------------------------------------------------------------------
20
+%macro PROCESS_BLOCKCOPY_PP_64X4_avx512 0
21
+movu m0, [r2]
22
+movu m1, [r2 + r3]
23
+movu m2, [r2 + 2 * r3]
24
+movu m3, [r2 + r4]
25
+
26
+movu [r0] , m0
27
+movu [r0 + r1] , m1
28
+movu [r0 + 2 * r1] , m2
29
+movu [r0 + r5] , m3
30
+%endmacro
31
+
32
+%macro PROCESS_BLOCKCOPY_PP_32X4_avx512 0
33
+movu ym0, [r2]
34
+vinserti32x8 m0, [r2 + r3], 1
35
+movu ym1, [r2 + 2 * r3]
36
+vinserti32x8 m1, [r2 + r4], 1
37
+
38
+movu [r0] , ym0
39
+vextracti32x8 [r0 + r1] , m0, 1
40
+movu [r0 + 2 * r1] , ym1
41
+vextracti32x8 [r0 + r5] , m1, 1
42
+%endmacro
43
+
44
+;----------------------------------------------------------------------------------------------
45
+; void blockcopy_pp_64x%1(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
46
+;----------------------------------------------------------------------------------------------
47
+%macro BLOCKCOPY_PP_W64_H4_avx512 1
48
+INIT_ZMM avx512
49
+cglobal blockcopy_pp_64x%1, 4, 6, 4
50
+lea r4, [3 * r3]
51
+lea r5, [3 * r1]
52
+
53
+%rep %1/4 - 1
54
+PROCESS_BLOCKCOPY_PP_64X4_avx512
55
+lea r2, [r2 + 4 * r3]
56
+lea r0, [r0 + 4 * r1]
57
+%endrep
58
+
59
+PROCESS_BLOCKCOPY_PP_64X4_avx512
60
+RET
61
+%endmacro
62
+
63
+BLOCKCOPY_PP_W64_H4_avx512 16
64
+BLOCKCOPY_PP_W64_H4_avx512 32
65
+BLOCKCOPY_PP_W64_H4_avx512 48
66
+BLOCKCOPY_PP_W64_H4_avx512 64
67
+
68
+%macro BLOCKCOPY_PP_W32_H4_avx512 1
69
+INIT_ZMM avx512
70
+cglobal blockcopy_pp_32x%1, 4, 6, 2
71
+ lea r4, [3 * r3]
72
+ lea r5, [3 * r1]
73
+
74
+%rep %1/4 - 1
75
+ PROCESS_BLOCKCOPY_PP_32X4_avx512
76
+ lea r2, [r2 + 4 * r3]
77
+ lea r0, [r0 + 4 * r1]
78
+%endrep
79
+ PROCESS_BLOCKCOPY_PP_32X4_avx512
80
+ RET
81
+%endmacro
82
+
83
+BLOCKCOPY_PP_W32_H4_avx512 8
84
+BLOCKCOPY_PP_W32_H4_avx512 16
85
+BLOCKCOPY_PP_W32_H4_avx512 24
86
+BLOCKCOPY_PP_W32_H4_avx512 32
87
+BLOCKCOPY_PP_W32_H4_avx512 48
88
+BLOCKCOPY_PP_W32_H4_avx512 64
89
+;----------------------------------------------------------------------------------------------
90
+; blockcopy_pp avx512 code end
91
+;----------------------------------------------------------------------------------------------
92
+
93
;-----------------------------------------------------------------------------
94
; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
95
;-----------------------------------------------------------------------------
96
97
98
BLOCKCOPY_SP_W64_H4_avx2 64, 64
99
100
+%macro PROCESS_BLOCKCOPY_SP_64x4_AVX512 0
101
+ movu m0, [r2]
102
+ movu m1, [r2 + 64]
103
+ movu m2, [r2 + r3]
104
+ movu m3, [r2 + r3 + 64]
105
+
106
+ packuswb m0, m1
107
+ packuswb m2, m3
108
+ vpermq m0, m4, m0
109
+ vpermq m2, m4, m2
110
+ movu [r0], m0
111
+ movu [r0 + r1], m2
112
+
113
+ movu m0, [r2 + 2 * r3]
114
+ movu m1, [r2 + 2 * r3 + 64]
115
+ movu m2, [r2 + r4]
116
+ movu m3, [r2 + r4 + 64]
117
+
118
+ packuswb m0, m1
119
+ packuswb m2, m3
120
+ vpermq m0, m4, m0
121
+ vpermq m2, m4, m2
122
+ movu [r0 + 2 * r1], m0
123
+ movu [r0 + r5], m2
124
+%endmacro
125
+
126
+%macro PROCESS_BLOCKCOPY_SP_32x4_AVX512 0
127
+ movu m0, [r2]
128
+ movu m1, [r2 + r3]
129
+ movu m2, [r2 + 2 * r3]
130
+ movu m3, [r2 + r4]
131
+
132
+ packuswb m0, m1
133
+ packuswb m2, m3
134
+ vpermq m0, m4, m0
135
+ vpermq m2, m4, m2
136
+ movu [r0], ym0
137
+ vextracti32x8 [r0 + r1], m0, 1
138
+ movu [r0 + 2 * r1], ym2
139
+ vextracti32x8 [r0 + r5], m2, 1
140
+%endmacro
141
+
142
+;-----------------------------------------------------------------------------
143
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
144
+;-----------------------------------------------------------------------------
145
+INIT_ZMM avx512
146
+cglobal blockcopy_sp_64x64, 4, 6, 5
147
+ mova m4, [shuf1_avx512]
148
+ add r3, r3
149
+ lea r4, [3 * r3]
150
+ lea r5, [3 * r1]
151
+
152
+%rep 15
153
+ PROCESS_BLOCKCOPY_SP_64x4_AVX512
154
+ lea r0, [r0 + 4 * r1]
155
+ lea r2, [r2 + 4 * r3]
156
+%endrep
157
+ PROCESS_BLOCKCOPY_SP_64x4_AVX512
158
+ RET
159
+
160
+%macro BLOCKCOPY_SP_32xN_AVX512 1
161
+INIT_ZMM avx512
162
+cglobal blockcopy_sp_32x%1, 4, 6, 5
163
+ mova m4, [shuf1_avx512]
164
+ add r3, r3
165
+ lea r4, [3 * r3]
166
+ lea r5, [3 * r1]
167
+
168
+%rep %1/4 - 1
169
+ PROCESS_BLOCKCOPY_SP_32x4_AVX512
170
+ lea r0, [r0 + 4 * r1]
171
+ lea r2, [r2 + 4 * r3]
172
+%endrep
173
+ PROCESS_BLOCKCOPY_SP_32x4_AVX512
174
+ RET
175
+%endmacro
176
+
177
+BLOCKCOPY_SP_32xN_AVX512 32
178
+BLOCKCOPY_SP_32xN_AVX512 64
179
+
180
;-----------------------------------------------------------------------------
181
; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val)
182
;-----------------------------------------------------------------------------
183
184
movu [r0 + r3 + 32], m0
185
RET
186
187
+;--------------------------------------------------------------------
188
+; void blockfill_s_32x32(int16_t* dst, intptr_t dstride, int16_t val)
189
+;--------------------------------------------------------------------
190
+INIT_ZMM avx512
191
+cglobal blockfill_s_32x32, 3, 4, 1
192
+add r1, r1
193
+lea r3, [3 * r1]
194
+movd xm0, r2d
195
+vpbroadcastw m0, xm0
196
+
197
+%rep 8
198
+movu [r0], m0
199
+movu [r0 + r1], m0
200
+movu [r0 + 2 * r1], m0
201
+movu [r0 + r3], m0
202
+lea r0, [r0 + 4 * r1]
203
+%endrep
204
+RET
205
+
206
+;--------------------------------------------------------------------
207
+; void blockfill_s_aligned_32x32(int16_t* dst, intptr_t dstride, int16_t val)
208
+;--------------------------------------------------------------------
209
+INIT_ZMM avx512
210
+cglobal blockfill_s_aligned_32x32, 3, 4, 1
211
+add r1, r1
212
+lea r3, [3 * r1]
213
+movd xm0, r2d
214
+vpbroadcastw m0, xm0
215
+
216
+%rep 8
217
+mova [r0], m0
218
+mova [r0 + r1], m0
219
+mova [r0 + 2 * r1], m0
220
+mova [r0 + r3], m0
221
+lea r0, [r0 + 4 * r1]
222
+%endrep
223
+RET
224
;-----------------------------------------------------------------------------
225
; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
226
;-----------------------------------------------------------------------------
227
228
BLOCKCOPY_PS_W32_H4_avx2 32, 32
229
BLOCKCOPY_PS_W32_H4_avx2 32, 64
230
231
+%macro PROCESS_BLOCKCOPY_PS_32x8_AVX512 0
232
+ pmovzxbw m0, [r2]
233
+ pmovzxbw m1, [r2 + r3]
234
+ pmovzxbw m2, [r2 + r3 * 2]
235
+ pmovzxbw m3, [r2 + r4]
236
+
237
+ movu [r0], m0
238
+ movu [r0 + r1], m1
239
+ movu [r0 + r1 * 2], m2
240
+ movu [r0 + r5], m3
241
+
242
+ lea r0, [r0 + 4 * r1]
243
+ lea r2, [r2 + 4 * r3]
244
+
245
+ pmovzxbw m0, [r2]
246
+ pmovzxbw m1, [r2 + r3]
247
+ pmovzxbw m2, [r2 + r3 * 2]
248
+ pmovzxbw m3, [r2 + r4]
249
+
250
+ movu [r0], m0
251
+ movu [r0 + r1], m1
252
+ movu [r0 + r1 * 2], m2
253
+ movu [r0 + r5], m3
254
+%endmacro
255
+
256
+INIT_ZMM avx512
257
+cglobal blockcopy_ps_32x32, 4, 6, 4
258
+ add r1, r1
259
+ lea r4, [3 * r3]
260
+ lea r5, [3 * r1]
261
+
262
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
263
+ lea r0, [r0 + 4 * r1]
264
+ lea r2, [r2 + 4 * r3]
265
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
266
+ lea r0, [r0 + 4 * r1]
267
+ lea r2, [r2 + 4 * r3]
268
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
269
+ lea r0, [r0 + 4 * r1]
270
+ lea r2, [r2 + 4 * r3]
271
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
272
+ RET
273
+
274
+INIT_ZMM avx512
275
+cglobal blockcopy_ps_32x64, 4, 6, 4
276
+ add r1, r1
277
+ lea r4, [3 * r3]
278
+ lea r5, [3 * r1]
279
+
280
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
281
+ lea r0, [r0 + 4 * r1]
282
+ lea r2, [r2 + 4 * r3]
283
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
284
+ lea r0, [r0 + 4 * r1]
285
+ lea r2, [r2 + 4 * r3]
286
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
287
+ lea r0, [r0 + 4 * r1]
288
+ lea r2, [r2 + 4 * r3]
289
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
290
+ lea r0, [r0 + 4 * r1]
291
+ lea r2, [r2 + 4 * r3]
292
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
293
+ lea r0, [r0 + 4 * r1]
294
+ lea r2, [r2 + 4 * r3]
295
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
296
+ lea r0, [r0 + 4 * r1]
297
+ lea r2, [r2 + 4 * r3]
298
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
299
+ lea r0, [r0 + 4 * r1]
300
+ lea r2, [r2 + 4 * r3]
301
+ PROCESS_BLOCKCOPY_PS_32x8_AVX512
302
+ RET
303
+
304
;-----------------------------------------------------------------------------
305
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
306
;-----------------------------------------------------------------------------
307
308
jnz .loop
309
RET
310
311
+%macro PROCESS_BLOCKCOPY_PS_64x8_AVX512 0
312
+ pmovzxbw m0, [r2]
313
+ pmovzxbw m1, [r2 + 32]
314
+ pmovzxbw m2, [r2 + r3]
315
+ pmovzxbw m3, [r2 + r3 + 32]
316
+ movu [r0], m0
317
+ movu [r0 + 64], m1
318
+ movu [r0 + r1], m2
319
+ movu [r0 + r1 + 64], m3
320
+
321
+ pmovzxbw m0, [r2 + r3 * 2]
322
+ pmovzxbw m1, [r2 + r3 * 2 + 32]
323
+ pmovzxbw m2, [r2 + r4]
324
+ pmovzxbw m3, [r2 + r4 + 32]
325
+ movu [r0 + r1 * 2], m0
326
+ movu [r0 + r1 * 2 + 64], m1
327
+ movu [r0 + r5], m2
328
+ movu [r0 + r5 + 64], m3
329
+
330
+ lea r0, [r0 + 4 * r1]
331
+ lea r2, [r2 + 4 * r3]
332
+
333
+ pmovzxbw m0, [r2]
334
+ pmovzxbw m1, [r2 + 32]
335
+ pmovzxbw m2, [r2 + r3]
336
+ pmovzxbw m3, [r2 + r3 + 32]
337
+ movu [r0], m0
338
+ movu [r0 + 64], m1
339
+ movu [r0 + r1], m2
340
+ movu [r0 + r1 + 64], m3
341
+
342
+ pmovzxbw m0, [r2 + r3 * 2]
343
+ pmovzxbw m1, [r2 + r3 * 2 + 32]
344
+ pmovzxbw m2, [r2 + r4]
345
+ pmovzxbw m3, [r2 + r4 + 32]
346
+ movu [r0 + r1 * 2], m0
347
+ movu [r0 + r1 * 2 + 64], m1
348
+ movu [r0 + r5], m2
349
+ movu [r0 + r5 + 64], m3
350
+%endmacro
351
+;-----------------------------------------------------------------------------
352
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
353
+;-----------------------------------------------------------------------------
354
+INIT_ZMM avx512
355
+cglobal blockcopy_ps_64x64, 4, 6, 4
356
+ add r1, r1
357
+ lea r4, [3 * r3]
358
+ lea r5, [3 * r1]
359
+
360
+ PROCESS_BLOCKCOPY_PS_64x8_AVX512
361
+ lea r0, [r0 + 4 * r1]
362
+ lea r2, [r2 + 4 * r3]
363
+ PROCESS_BLOCKCOPY_PS_64x8_AVX512
364
+ lea r0, [r0 + 4 * r1]
365
+ lea r2, [r2 + 4 * r3]
366
+ PROCESS_BLOCKCOPY_PS_64x8_AVX512
367
+ lea r0, [r0 + 4 * r1]
368
+ lea r2, [r2 + 4 * r3]
369
+ PROCESS_BLOCKCOPY_PS_64x8_AVX512
370
+ lea r0, [r0 + 4 * r1]
371
+ lea r2, [r2 + 4 * r3]
372
+ PROCESS_BLOCKCOPY_PS_64x8_AVX512
373
+ lea r0, [r0 + 4 * r1]
374
+ lea r2, [r2 + 4 * r3]
375
+ PROCESS_BLOCKCOPY_PS_64x8_AVX512
376
+ lea r0, [r0 + 4 * r1]
377
+ lea r2, [r2 + 4 * r3]
378
+ PROCESS_BLOCKCOPY_PS_64x8_AVX512
379
+ lea r0, [r0 + 4 * r1]
380
+ lea r2, [r2 + 4 * r3]
381
+ PROCESS_BLOCKCOPY_PS_64x8_AVX512
382
+ RET
383
+
384
;-----------------------------------------------------------------------------
385
; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
386
;-----------------------------------------------------------------------------
387
388
BLOCKCOPY_SS_W32_H4_avx 32, 48
389
BLOCKCOPY_SS_W32_H4_avx 32, 64
390
391
+%macro PROCESS_BLOCKCOPY_SS_W32_H8_avx512 0
392
+ movu m0, [r2]
393
+ movu m1, [r2 + r3]
394
+ movu m2, [r2 + 2 * r3]
395
+ movu m3, [r2 + r6]
396
+ lea r2, [r2 + 4 * r3]
397
+
398
+ movu [r0], m0
399
+ movu [r0 + r1], m1
400
+ movu [r0 + 2 * r1], m2
401
+ movu [r0 + r5], m3
402
+ lea r0, [r0 + 4 * r1]
403
+
404
+ movu m0, [r2]
405
+ movu m1, [r2 + r3]
406
+ movu m2, [r2 + 2 * r3]
407
+ movu m3, [r2 + r6]
408
+ lea r2, [r2 + 4 * r3]
409
+
410
+ movu [r0], m0
411
+ movu [r0 + r1], m1
412
+ movu [r0 + 2 * r1], m2
413
+ movu [r0 + r5], m3
414
+ lea r0, [r0 + 4 * r1]
415
+%endmacro
416
+
417
+%macro PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 0
418
+ movu m0, [r2]
419
+ movu m1, [r2 + r3]
420
+ movu m2, [r2 + 2 * r3]
421
+ movu m3, [r2 + r6]
422
+ lea r2, [r2 + 4 * r3]
423
+
424
+ movu [r0], m0
425
+ movu [r0 + r1], m1
426
+ movu [r0 + 2 * r1], m2
427
+ movu [r0 + r5], m3
428
+ lea r0, [r0 + 4 * r1]
429
+
430
+ movu m0, [r2]
431
+ movu m1, [r2 + r3]
432
+ movu m2, [r2 + 2 * r3]
433
+ movu m3, [r2 + r6]
434
+
435
+ movu [r0], m0
436
+ movu [r0 + r1], m1
437
+ movu [r0 + 2 * r1], m2
438
+ movu [r0 + r5], m3
439
+%endmacro
440
+
441
+;-----------------------------------------------------------------------------
442
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
443
+;-----------------------------------------------------------------------------
444
+INIT_ZMM avx512
445
+cglobal blockcopy_ss_32x8, 4, 7, 4
446
+
447
+ add r1, r1
448
+ add r3, r3
449
+ lea r5, [3 * r1]
450
+ lea r6, [3 * r3]
451
+
452
+ PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512
453
+ RET
454
+
455
+INIT_ZMM avx512
456
+cglobal blockcopy_ss_32x16, 4, 7, 4
457
+
458
+ add r1, r1
459
+ add r3, r3
460
+ lea r5, [3 * r1]
461
+ lea r6, [3 * r3]
462
+
463
+ PROCESS_BLOCKCOPY_SS_W32_H8_avx512
464
+ PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512
465
+ RET
466
+
467
+INIT_ZMM avx512
468
+cglobal blockcopy_ss_32x24, 4, 7, 4
469
+
470
+ add r1, r1
471
+ add r3, r3
472
+ lea r5, [3 * r1]
473
+ lea r6, [3 * r3]
474
+
475
+ PROCESS_BLOCKCOPY_SS_W32_H8_avx512
476
+ PROCESS_BLOCKCOPY_SS_W32_H8_avx512
477
+ PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512
478
+ RET
479
+
480
+INIT_ZMM avx512
481
+cglobal blockcopy_ss_32x32, 4, 7, 4
482
+
483
+ add r1, r1
484
+ add r3, r3
485
+ lea r5, [3 * r1]
486
+ lea r6, [3 * r3]
487
+
488
+ PROCESS_BLOCKCOPY_SS_W32_H8_avx512
489
+ PROCESS_BLOCKCOPY_SS_W32_H8_avx512
490
+ PROCESS_BLOCKCOPY_SS_W32_H8_avx512
491
+ PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512
492
+ RET
493
+
494
+INIT_ZMM avx512
495
+cglobal blockcopy_ss_32x48, 4, 7, 4
496
+
497
+ add r1, r1
498
+ add r3, r3
499
+ lea r5, [3 * r1]
500
+ lea r6, [3 * r3]
501
+
502
+ PROCESS_BLOCKCOPY_SS_W32_H8_avx512
503
+ PROCESS_BLOCKCOPY_SS_W32_H8_avx512
504
+ PROCESS_BLOCKCOPY_SS_W32_H8_avx512
505
+ PROCESS_BLOCKCOPY_SS_W32_H8_avx512
506
+ PROCESS_BLOCKCOPY_SS_W32_H8_avx512
507
+ PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512
508
+ RET
509
+
510
+INIT_ZMM avx512
511
+cglobal blockcopy_ss_32x64, 4, 7, 4
512
+
513
+ add r1, r1
514
+ add r3, r3
515
+ lea r5, [3 * r1]
516
+ lea r6, [3 * r3]
517
+
518
+ PROCESS_BLOCKCOPY_SS_W32_H8_avx512
519
+ PROCESS_BLOCKCOPY_SS_W32_H8_avx512
520
+ PROCESS_BLOCKCOPY_SS_W32_H8_avx512
521
+ PROCESS_BLOCKCOPY_SS_W32_H8_avx512
522
+ PROCESS_BLOCKCOPY_SS_W32_H8_avx512
523
+ PROCESS_BLOCKCOPY_SS_W32_H8_avx512
524
+ PROCESS_BLOCKCOPY_SS_W32_H8_avx512
525
+ PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512
526
+ RET
527
+
528
;-----------------------------------------------------------------------------
529
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
530
;-----------------------------------------------------------------------------
531
532
BLOCKCOPY_SS_W64_H4_avx 64, 48
533
BLOCKCOPY_SS_W64_H4_avx 64, 64
534
535
+%macro PROCESS_BLOCKCOPY_SS_W64_H8_avx512 0
536
+ movu m0, [r2]
537
+ movu m1, [r2 + mmsize]
538
+ movu m2, [r2 + r3]
539
+ movu m3, [r2 + r3 + mmsize]
540
+
541
+ movu [r0], m0
542
+ movu [r0 + mmsize], m1
543
+ movu [r0 + r1], m2
544
+ movu [r0 + r1 + mmsize], m3
545
+
546
+ movu m0, [r2 + 2 * r3]
547
+ movu m1, [r2 + 2 * r3 + mmsize]
548
+ movu m2, [r2 + r6]
549
+ movu m3, [r2 + r6 + mmsize]
550
+ lea r2, [r2 + 4 * r3]
551
+
552
+ movu [r0 + 2 * r1], m0
553
+ movu [r0 + 2 * r1 + mmsize], m1
554
+ movu [r0 + r5], m2
555
+ movu [r0 + r5 + mmsize], m3
556
+ lea r0, [r0 + 4 * r1]
557
+
558
+ movu m0, [r2]
559
+ movu m1, [r2 + mmsize]
560
+ movu m2, [r2 + r3]
561
+ movu m3, [r2 + r3 + mmsize]
562
+
563
+ movu [r0], m0
564
+ movu [r0 + mmsize], m1
565
+ movu [r0 + r1], m2
566
+ movu [r0 + r1 + mmsize], m3
567
+
568
+ movu m0, [r2 + 2 * r3]
569
+ movu m1, [r2 + 2 * r3 + mmsize]
570
+ movu m2, [r2 + r6]
571
+ movu m3, [r2 + r6 + mmsize]
572
+ lea r2, [r2 + 4 * r3]
573
+
574
+ movu [r0 + 2 * r1], m0
575
+ movu [r0 + 2 * r1 + mmsize], m1
576
+ movu [r0 + r5], m2
577
+ movu [r0 + r5 + mmsize], m3
578
+ lea r0, [r0 + 4 * r1]
579
+%endmacro
580
+
581
+%macro PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 0
582
+ movu m0, [r2]
583
+ movu m1, [r2 + mmsize]
584
+ movu m2, [r2 + r3]
585
+ movu m3, [r2 + r3 + mmsize]
586
+
587
+ movu [r0], m0
588
+ movu [r0 + mmsize], m1
589
+ movu [r0 + r1], m2
590
+ movu [r0 + r1 + mmsize], m3
591
+
592
+ movu m0, [r2 + 2 * r3]
593
+ movu m1, [r2 + 2 * r3 + mmsize]
594
+ movu m2, [r2 + r6]
595
+ movu m3, [r2 + r6 + mmsize]
596
+ lea r2, [r2 + 4 * r3]
597
+
598
+ movu [r0 + 2 * r1], m0
599
+ movu [r0 + 2 * r1 + mmsize], m1
600
+ movu [r0 + r5], m2
601
+ movu [r0 + r5 + mmsize], m3
602
+ lea r0, [r0 + 4 * r1]
603
+
604
+ movu m0, [r2]
605
+ movu m1, [r2 + mmsize]
606
+ movu m2, [r2 + r3]
607
+ movu m3, [r2 + r3 + mmsize]
608
+
609
+ movu [r0], m0
610
+ movu [r0 + mmsize], m1
611
+ movu [r0 + r1], m2
612
+ movu [r0 + r1 + mmsize], m3
613
+
614
+ movu m0, [r2 + 2 * r3]
615
+ movu m1, [r2 + 2 * r3 + mmsize]
616
+ movu m2, [r2 + r6]
617
+ movu m3, [r2 + r6 + mmsize]
618
+
619
+ movu [r0 + 2 * r1], m0
620
+ movu [r0 + 2 * r1 + mmsize], m1
621
+ movu [r0 + r5], m2
622
+ movu [r0 + r5 + mmsize], m3
623
+%endmacro
624
+
625
+;-----------------------------------------------------------------------------
626
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
627
+;-----------------------------------------------------------------------------
628
+INIT_ZMM avx512
629
+cglobal blockcopy_ss_64x16, 4, 7, 4
630
+ add r1, r1
631
+ add r3, r3
632
+ lea r5, [3 * r1]
633
+ lea r6, [3 * r3]
634
+
635
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512
636
+ PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512
637
+ RET
638
+
639
+INIT_ZMM avx512
640
+cglobal blockcopy_ss_64x32, 4, 7, 4
641
+ add r1, r1
642
+ add r3, r3
643
+ lea r5, [3 * r1]
644
+ lea r6, [3 * r3]
645
+
646
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512
647
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512
648
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512
649
+ PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512
650
+ RET
651
+
652
+INIT_ZMM avx512
653
+cglobal blockcopy_ss_64x48, 4, 7, 4
654
+ add r1, r1
655
+ add r3, r3
656
+ lea r5, [3 * r1]
657
+ lea r6, [3 * r3]
658
+
659
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512
660
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512
661
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512
662
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512
663
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512
664
+ PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512
665
+ RET
666
+
667
+INIT_ZMM avx512
668
+cglobal blockcopy_ss_64x64, 4, 7, 4
669
+ add r1, r1
670
+ add r3, r3
671
+ lea r5, [3 * r1]
672
+ lea r6, [3 * r3]
673
+
674
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512
675
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512
676
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512
677
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512
678
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512
679
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512
680
+ PROCESS_BLOCKCOPY_SS_W64_H8_avx512
681
+ PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512
682
+ RET
683
;--------------------------------------------------------------------------------------
684
; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
685
;--------------------------------------------------------------------------------------
686
687
jnz .loop
688
RET
689
690
+INIT_ZMM avx512
691
+cglobal cpy2Dto1D_shr_16, 4, 5, 4
692
+ shl r2d, 1
693
+ movd xm0, r3d
694
+ pcmpeqw ymm1, ymm1
695
+ psllw ym1, ymm1, xm0
696
+ psraw ym1, 1
697
+ vinserti32x8 m1, ym1, 1
698
+ lea r3, [r2 * 3]
699
+ mov r4d, 2
700
+
701
+.loop:
702
+ ; Row 0-1
703
+ movu ym2, [r1]
704
+ vinserti32x8 m2, [r1 + r2], 1
705
+ psubw m2, m1
706
+ psraw m2, xm0
707
+ movu [r0], m2
708
+
709
+ ; Row 2-3
710
+ movu ym2, [r1 + 2 * r2]
711
+ vinserti32x8 m2, [r1 + r3], 1
712
+ psubw m2, m1
713
+ psraw m2, xm0
714
+ movu [r0 + mmsize], m2
715
+
716
+ lea r1, [r1 + 4 * r2]
717
+ ; Row 4-5
718
+
719
+ movu ym2, [r1]
720
+ vinserti32x8 m2, [r1 + r2], 1
721
+ psubw m2, m1
722
+ psraw m2, xm0
723
+ movu [r0 + 2 * mmsize], m2
724
+
725
+ ; Row 6-7
726
+ movu ym2, [r1 + 2 * r2]
727
+ vinserti32x8 m2, [r1 + r3], 1
728
+ psubw m2, m1
729
+ psraw m2, xm0
730
+ movu [r0 + 3 * mmsize], m2
731
+
732
+ add r0, 4 * mmsize
733
+ lea r1, [r1 + 4 * r2]
734
+ dec r4d
735
+ jnz .loop
736
+ RET
737
738
;--------------------------------------------------------------------------------------
739
; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
740
741
jnz .loop
742
RET
743
744
+INIT_ZMM avx512
745
+cglobal cpy2Dto1D_shr_32, 4, 5, 4
746
+ shl r2d, 1
747
+ movd xm0, r3d
748
+ pcmpeqw ymm1, ymm1
749
+ psllw ym1, ymm1, xm0
750
+ psraw ym1, 1
751
+ vinserti32x8 m1, ym1, 1
752
+ lea r3, [r2 * 3]
753
+ mov r4d, 8
754
+
755
+.loop:
756
+ ; Row 0
757
+ movu m2, [r1]
758
+ psubw m2, m1
759
+ psraw m2, xm0
760
+ movu [r0], m2
761
+
762
+ ; Row 1
763
+ movu m2, [r1 + r2]
764
+ psubw m2, m1
765
+ psraw m2, xm0
766
+ movu [r0 + mmsize], m2
767
+
768
+ ; Row 2
769
+ movu m2, [r1 + 2 * r2]
770
+ psubw m2, m1
771
+ psraw m2, xm0
772
+ movu [r0 + 2 * mmsize], m2
773
+
774
+ ; Row 3
775
+ movu m2, [r1 + r3]
776
+ psubw m2, m1
777
+ psraw m2, xm0
778
+ movu [r0 + 3 * mmsize], m2
779
+
780
+ add r0, 4 * mmsize
781
+ lea r1, [r1 + 4 * r2]
782
+ dec r4d
783
+ jnz .loop
784
+ RET
785
+
786
;--------------------------------------------------------------------------------------
787
; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
788
;--------------------------------------------------------------------------------------
789
790
jnz .loop
791
RET
792
793
+;--------------------------------------------------------------------------------------
794
+; cpy_1Dto2D_shl avx512 code start
795
+;--------------------------------------------------------------------------------------
796
+%macro PROCESS_CPY1Dto2D_SHL_32x8_AVX512 0
797
+ movu m1, [r1 + 0 * mmsize]
798
+ movu m2, [r1 + 1 * mmsize]
799
+ movu m3, [r1 + 2 * mmsize]
800
+ movu m4, [r1 + 3 * mmsize]
801
+ psllw m1, xm0
802
+ psllw m2, xm0
803
+ psllw m3, xm0
804
+ psllw m4, xm0
805
+ movu [r0], m1
806
+ movu [r0 + r2], m2
807
+ movu [r0 + 2 * r2], m3
808
+ movu [r0 + r3], m4
809
+
810
+ add r1, 4 * mmsize
811
+ lea r0, [r0 + r2 * 4]
812
+
813
+ movu m1, [r1 + 0 * mmsize]
814
+ movu m2, [r1 + 1 * mmsize]
815
+ movu m3, [r1 + 2 * mmsize]
816
+ movu m4, [r1 + 3 * mmsize]
817
+ psllw m1, xm0
818
+ psllw m2, xm0
819
+ psllw m3, xm0
820
+ psllw m4, xm0
821
+ movu [r0], m1
822
+ movu [r0 + r2], m2
823
+ movu [r0 + 2 * r2], m3
824
+ movu [r0 + r3], m4
825
+%endmacro
826
+;--------------------------------------------------------------------------------------
827
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
828
+;--------------------------------------------------------------------------------------
829
+INIT_ZMM avx512
830
+cglobal cpy1Dto2D_shl_32, 4, 4, 5
831
+ add r2d, r2d
832
+ movd xm0, r3d
833
+ lea r3, [3 * r2]
834
+%rep 3
835
+ PROCESS_CPY1Dto2D_SHL_32x8_AVX512
836
+ add r1, 4 * mmsize
837
+ lea r0, [r0 + r2 * 4]
838
+%endrep
839
+ PROCESS_CPY1Dto2D_SHL_32x8_AVX512
840
+ RET
841
842
+%macro PROCESS_CPY1Dto2D_SHL_ALIGNED_32x8_AVX512 0
843
+ mova m1, [r1 + 0 * mmsize]
844
+ mova m2, [r1 + 1 * mmsize]
845
+ mova m3, [r1 + 2 * mmsize]
846
+ mova m4, [r1 + 3 * mmsize]
847
+ psllw m1, xm0
848
+ psllw m2, xm0
849
+ psllw m3, xm0
850
+ psllw m4, xm0
851
+ mova [r0], m1
852
+ mova [r0 + r2], m2
853
+ mova [r0 + 2 * r2], m3
854
+ mova [r0 + r3], m4
855
+
856
+ add r1, 4 * mmsize
857
+ lea r0, [r0 + r2 * 4]
858
+
859
+ mova m1, [r1 + 0 * mmsize]
860
+ mova m2, [r1 + 1 * mmsize]
861
+ mova m3, [r1 + 2 * mmsize]
862
+ mova m4, [r1 + 3 * mmsize]
863
+ psllw m1, xm0
864
+ psllw m2, xm0
865
+ psllw m3, xm0
866
+ psllw m4, xm0
867
+ mova [r0], m1
868
+ mova [r0 + r2], m2
869
+ mova [r0 + 2 * r2], m3
870
+ mova [r0 + r3], m4
871
+%endmacro
872
+;--------------------------------------------------------------------------------------
873
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
874
+;--------------------------------------------------------------------------------------
875
+INIT_ZMM avx512
876
+cglobal cpy1Dto2D_shl_aligned_32, 4, 4, 5
877
+ add r2d, r2d
878
+ movd xm0, r3d
879
+ lea r3, [3 * r2]
880
+%rep 3
881
+ PROCESS_CPY1Dto2D_SHL_ALIGNED_32x8_AVX512
882
+ add r1, 4 * mmsize
883
+ lea r0, [r0 + r2 * 4]
884
+%endrep
885
+ PROCESS_CPY1Dto2D_SHL_ALIGNED_32x8_AVX512
886
+ RET
887
+;--------------------------------------------------------------------------------------
888
+; copy_cnt avx512 code end
889
+;--------------------------------------------------------------------------------------
890
;--------------------------------------------------------------------------------------
891
; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
892
;--------------------------------------------------------------------------------------
893
894
movd eax, xm4
895
RET
896
897
+;--------------------------------------------------------------------------------------
898
+; copy_cnt avx512 code start
899
+;--------------------------------------------------------------------------------------
900
+%macro PROCESS_COPY_CNT_32x4_AVX512 0
901
+ movu m0, [r1]
902
+ movu m1, [r1 + r2]
903
+ movu [r0], m0
904
+ movu [r0 + mmsize], m1
905
+ packsswb m0, m1
906
+ pminub m0, m3
907
+
908
+ movu m1, [r1 + 2 * r2]
909
+ movu m2, [r1 + r3]
910
+ movu [r0 + 2 * mmsize], m1
911
+ movu [r0 + 3 * mmsize], m2
912
+ packsswb m1, m2
913
+ pminub m1, m3
914
+
915
+ paddb m0, m1
916
+ paddb m4, m0
917
+%endmacro
918
+
919
+%macro PROCESS_COPY_CNT_16x4_AVX512 0
920
+ movu ym0, [r1]
921
+ vinserti32x8 m0, [r1 + r2], 1
922
+ movu ym1, [r1 + 2 * r2]
923
+ vinserti32x8 m1, [r1 + r3], 1
924
+ movu [r0], m0
925
+ movu [r0 + mmsize], m1
926
+ packsswb m0, m1
927
+ pminub m0, m3
928
+ paddb m4, m0
929
+%endmacro
930
+
931
+%macro PROCESS_COPY_CNT_END_AVX512 0
932
+ pxor m0, m0
933
+ vextracti32x8 ym1, m4, 1
934
+ paddb ym4, ym1
935
+ vextracti32x4 xm1, ym4, 1
936
+ paddb xm4, xm1
937
+ psadbw xm4, xm0
938
+ movhlps xm1, xm4
939
+ paddd xm4, xm1
940
+ movd eax, xm4
941
+%endmacro
942
+
943
+;--------------------------------------------------------------------------------------
944
+; uint32_t copy_cnt(int32_t* dst, const int16_t* src, intptr_t stride);
945
+;--------------------------------------------------------------------------------------
946
+INIT_ZMM avx512
947
+cglobal copy_cnt_32, 3, 4, 5
948
+ add r2d, r2d
949
+ lea r3, [3 * r2]
950
+
951
+ vbroadcasti32x8 m3, [pb_1]
952
+ pxor m4, m4
953
+
954
+%rep 7
955
+ PROCESS_COPY_CNT_32x4_AVX512
956
+ add r0, 4 * mmsize
957
+ lea r1, [r1 + 4 * r2]
958
+%endrep
959
+ PROCESS_COPY_CNT_32x4_AVX512
960
+ PROCESS_COPY_CNT_END_AVX512
961
+ RET
962
+
963
+INIT_ZMM avx512
964
+cglobal copy_cnt_16, 3, 4, 5
965
+ add r2d, r2d
966
+ lea r3, [3 * r2]
967
+
968
+ vbroadcasti32x8 m3, [pb_1]
969
+ pxor m4, m4
970
971
+%rep 3
972
+ PROCESS_COPY_CNT_16x4_AVX512
973
+ add r0, 2 * mmsize
974
+ lea r1, [r1 + 4 * r2]
975
+%endrep
976
+ PROCESS_COPY_CNT_16x4_AVX512
977
+ PROCESS_COPY_CNT_END_AVX512
978
+ RET
979
+;--------------------------------------------------------------------------------------
980
+; copy_cnt avx512 code end
981
+;--------------------------------------------------------------------------------------
982
;--------------------------------------------------------------------------------------
983
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
984
;--------------------------------------------------------------------------------------
985
986
RET
987
988
;--------------------------------------------------------------------------------------
989
+; cpy2Dto1D_shl avx512 code start
990
+;--------------------------------------------------------------------------------------
991
+%macro PROCESS_CPY2Dto1D_SHL_16x8_AVX512 0
992
+ movu m1, [r1]
993
+ vinserti32x8 m1, [r1 + r2], 1
994
+ movu m2, [r1 + 2 * r2]
995
+ vinserti32x8 m2, [r1 + r3], 1
996
+
997
+ psllw m1, xm0
998
+ psllw m2, xm0
999
+ movu [r0], m1
1000
+ movu [r0 + mmsize], m2
1001
+
1002
+ add r0, 2 * mmsize
1003
+ lea r1, [r1 + r2 * 4]
1004
+
1005
+ movu m1, [r1]
1006
+ vinserti32x8 m1, [r1 + r2], 1
1007
+ movu m2, [r1 + 2 * r2]
1008
+ vinserti32x8 m2, [r1 + r3], 1
1009
+
1010
+ psllw m1, xm0
1011
+ psllw m2, xm0
1012
+ movu [r0], m1
1013
+ movu [r0 + mmsize], m2
1014
+%endmacro
1015
+
1016
+%macro PROCESS_CPY2Dto1D_SHL_32x8_AVX512 0
1017
+ movu m1, [r1]
1018
+ movu m2, [r1 + r2]
1019
+ movu m3, [r1 + 2 * r2]
1020
+ movu m4, [r1 + r3]
1021
+
1022
+ psllw m1, xm0
1023
+ psllw m2, xm0
1024
+ psllw m3, xm0
1025
+ psllw m4, xm0
1026
+ movu [r0], m1
1027
+ movu [r0 + mmsize], m2
1028
+ movu [r0 + 2 * mmsize], m3
1029
+ movu [r0 + 3 * mmsize], m4
1030
+
1031
+ add r0, 4 * mmsize
1032
+ lea r1, [r1 + r2 * 4]
1033
+
1034
+ movu m1, [r1]
1035
+ movu m2, [r1 + r2]
1036
+ movu m3, [r1 + 2 * r2]
1037
+ movu m4, [r1 + r3]
1038
+
1039
+ psllw m1, xm0
1040
+ psllw m2, xm0
1041
+ psllw m3, xm0
1042
+ psllw m4, xm0
1043
+ movu [r0], m1
1044
+ movu [r0 + mmsize], m2
1045
+ movu [r0 + 2 * mmsize], m3
1046
+ movu [r0 + 3 * mmsize], m4
1047
+%endmacro
1048
+
1049
+;--------------------------------------------------------------------------------------
1050
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
1051
+;--------------------------------------------------------------------------------------
1052
+INIT_ZMM avx512
1053
+cglobal cpy2Dto1D_shl_32, 4, 4, 5
1054
+ add r2d, r2d
1055
+ movd xm0, r3d
1056
+ lea r3, [3 * r2]
1057
+
1058
+ PROCESS_CPY2Dto1D_SHL_32x8_AVX512
1059
+ add r0, 4 * mmsize
1060
+ lea r1, [r1 + r2 * 4]
1061
+ PROCESS_CPY2Dto1D_SHL_32x8_AVX512
1062
+ add r0, 4 * mmsize
1063
+ lea r1, [r1 + r2 * 4]
1064
+ PROCESS_CPY2Dto1D_SHL_32x8_AVX512
1065
+ add r0, 4 * mmsize
1066
+ lea r1, [r1 + r2 * 4]
1067
+ PROCESS_CPY2Dto1D_SHL_32x8_AVX512
1068
+ RET
1069
+
1070
+INIT_ZMM avx512
1071
+cglobal cpy2Dto1D_shl_16, 4, 4, 3
1072
+ add r2d, r2d
1073
+ movd xm0, r3d
1074
+ lea r3, [3 * r2]
1075
+
1076
+ PROCESS_CPY2Dto1D_SHL_16x8_AVX512
1077
+ add r0, 2 * mmsize
1078
+ lea r1, [r1 + r2 * 4]
1079
+ PROCESS_CPY2Dto1D_SHL_16x8_AVX512
1080
+ RET
1081
+;--------------------------------------------------------------------------------------
1082
+; cpy2Dto1D_shl avx512 code end
1083
+;--------------------------------------------------------------------------------------
1084
+;--------------------------------------------------------------------------------------
1085
; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
1086
;--------------------------------------------------------------------------------------
1087
INIT_XMM sse2
1088
1089
jnz .loop
1090
RET
1091
1092
+INIT_ZMM avx512
1093
+cglobal cpy1Dto2D_shr_16, 3, 5, 4
1094
+ shl r2d, 1
1095
+ movd xm0, r3m
1096
+ pcmpeqw xmm1, xmm1
1097
+ psllw xm1, xmm1, xm0
1098
+ psraw xm1, 1
1099
+ vpbroadcastw m1, xm1
1100
+ mov r3d, 4
1101
+ lea r4, [r2 * 3]
1102
+
1103
+.loop:
1104
+ ; Row 0-1
1105
+ movu m2, [r1]
1106
+ psubw m2, m1
1107
+ psraw m2, xm0
1108
+ movu [r0], ym2
1109
+ vextracti32x8 [r0 + r2], m2, 1
1110
+
1111
+ ; Row 2-3
1112
+ movu m2, [r1 + mmsize]
1113
+ psubw m2, m1
1114
+ psraw m2, xm0
1115
+ movu [r0 + r2 * 2], ym2
1116
+ vextracti32x8 [r0 + r4], m2, 1
1117
+
1118
+ add r1, 2 * mmsize
1119
+ lea r0, [r0 + r2 * 4]
1120
+ dec r3d
1121
+ jnz .loop
1122
+ RET
1123
1124
;--------------------------------------------------------------------------------------
1125
; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
1126
1127
dec r3d
1128
jnz .loop
1129
RET
1130
+
1131
+INIT_ZMM avx512
1132
+cglobal cpy1Dto2D_shr_32, 3, 4, 6
1133
+ shl r2d, 1
1134
+ movd xm0, r3m
1135
+ pcmpeqw xmm1, xmm1
1136
+ psllw xm1, xmm1, xm0
1137
+ psraw xm1, 1
1138
+ vpbroadcastw m1, xm1
1139
+ mov r3d, 16
1140
+
1141
+.loop:
1142
+ ; Row 0-1
1143
+ movu m2, [r1]
1144
+ movu m3, [r1 + mmsize]
1145
+ psubw m2, m1
1146
+ psubw m3, m1
1147
+ psraw m2, xm0
1148
+ psraw m3, xm0
1149
+ movu [r0], m2
1150
+ movu [r0 + r2], m3
1151
+
1152
+ add r1, 2 * mmsize
1153
+ lea r0, [r0 + r2 * 2]
1154
+ dec r3d
1155
+ jnz .loop
1156
+ RET
1157
x265_2.7.tar.gz/source/common/x86/blockcopy8.h -> x265_2.9.tar.gz/source/common/x86/blockcopy8.h
Changed
51
1
2
FUNCDEF_TU_S(void, cpy2Dto1D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
3
FUNCDEF_TU_S(void, cpy2Dto1D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
4
FUNCDEF_TU_S(void, cpy2Dto1D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
5
+FUNCDEF_TU_S(void, cpy2Dto1D_shl, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
6
7
FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
8
FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
9
FUNCDEF_TU_S(void, cpy2Dto1D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
10
+FUNCDEF_TU_S(void, cpy2Dto1D_shr, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
11
12
FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
13
FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
14
FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
15
-
16
+FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
17
+FUNCDEF_TU_S(void, cpy1Dto2D_shl_aligned, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
18
FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
19
FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
20
FUNCDEF_TU_S(void, cpy1Dto2D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
21
+FUNCDEF_TU_S(void, cpy1Dto2D_shr, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
22
23
FUNCDEF_TU_S(uint32_t, copy_cnt, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride);
24
FUNCDEF_TU_S(uint32_t, copy_cnt, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride);
25
FUNCDEF_TU_S(uint32_t, copy_cnt, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride);
26
+FUNCDEF_TU_S(uint32_t, copy_cnt, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride);
27
28
FUNCDEF_TU(void, blockfill_s, sse2, int16_t* dst, intptr_t dstride, int16_t val);
29
FUNCDEF_TU(void, blockfill_s, avx2, int16_t* dst, intptr_t dstride, int16_t val);
30
+FUNCDEF_TU(void, blockfill_s, avx512, int16_t* dst, intptr_t dstride, int16_t val);
31
+FUNCDEF_TU(void, blockfill_s_aligned, avx512, int16_t* dst, intptr_t dstride, int16_t val);
32
33
FUNCDEF_CHROMA_PU(void, blockcopy_ss, sse2, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
34
FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
35
+FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx512, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
36
37
FUNCDEF_CHROMA_PU(void, blockcopy_pp, sse2, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
38
FUNCDEF_CHROMA_PU(void, blockcopy_pp, avx, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
39
+FUNCDEF_CHROMA_PU(void, blockcopy_pp, avx512, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
40
41
FUNCDEF_PU(void, blockcopy_sp, sse2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
42
FUNCDEF_PU(void, blockcopy_sp, sse4, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
43
FUNCDEF_PU(void, blockcopy_sp, avx2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
44
+FUNCDEF_PU(void, blockcopy_sp, avx512, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
45
FUNCDEF_PU(void, blockcopy_ps, sse2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
46
FUNCDEF_PU(void, blockcopy_ps, sse4, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
47
FUNCDEF_PU(void, blockcopy_ps, avx2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
48
+FUNCDEF_PU(void, blockcopy_ps, avx512, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
49
50
#endif // ifndef X265_I386_PIXEL_H
51
x265_2.7.tar.gz/source/common/x86/const-a.asm -> x265_2.9.tar.gz/source/common/x86/const-a.asm
Changed
10
1
2
3
%include "x86inc.asm"
4
5
-SECTION_RODATA 32
6
+SECTION_RODATA 64
7
8
;; 8-bit constants
9
10
x265_2.7.tar.gz/source/common/x86/cpu-a.asm -> x265_2.9.tar.gz/source/common/x86/cpu-a.asm
Changed
46
1
2
RET
3
4
;-----------------------------------------------------------------------------
5
-; void cpu_xgetbv( int op, int *eax, int *edx )
6
+; uint64_t cpu_xgetbv( int xcr )
7
;-----------------------------------------------------------------------------
8
-cglobal cpu_xgetbv, 3,7
9
- push r2
10
- push r1
11
- mov ecx, r0d
12
+cglobal cpu_xgetbv
13
+ movifnidn ecx, r0m
14
xgetbv
15
- pop r4
16
- mov [r4], eax
17
- pop r4
18
- mov [r4], edx
19
- RET
20
+%if ARCH_X86_64
21
+ shl rdx, 32
22
+ or rax, rdx
23
+%endif
24
+ ret
25
26
%if ARCH_X86_64
27
28
29
%if WIN64
30
sub rsp, 32 ; shadow space
31
%endif
32
- and rsp, ~31
33
+ and rsp, ~(STACK_ALIGNMENT - 1)
34
mov rax, r0
35
mov r0, r1
36
mov r1, r2
37
38
push ebp
39
mov ebp, esp
40
sub esp, 12
41
- and esp, ~31
42
+ and esp, ~(STACK_ALIGNMENT - 1)
43
mov ecx, [ebp+8]
44
mov edx, [ebp+12]
45
mov [esp], edx
46
x265_2.7.tar.gz/source/common/x86/dct8.asm -> x265_2.9.tar.gz/source/common/x86/dct8.asm
Changed
4080
1
2
3
%include "x86inc.asm"
4
%include "x86util.asm"
5
-SECTION_RODATA 32
6
+SECTION_RODATA 64
7
+
8
+tab_dct32: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
9
+ dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90
10
+ dw 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90, -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90
11
+ dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90
12
+ dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89, 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
13
+ dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88
14
+ dw 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87, -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87
15
+ dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85
16
+ dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
17
+ dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82
18
+ dw 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80, -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80
19
+ dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78
20
+ dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75, 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
21
+ dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73
22
+ dw 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70, -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70
23
+ dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67
24
+ dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
25
+ dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61
26
+ dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57, -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57
27
+ dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54
28
+ dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50, 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
29
+ dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78, -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46
30
+ dw 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43, -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43
31
+ dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38
32
+ dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
33
+ dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31
34
+ dw 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25, -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25
35
+ dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22
36
+ dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18, 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
37
+ dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13
38
+ dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9, -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9
39
+ dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4
40
+tab_dct16: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
41
+ dw 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90
42
+ dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
43
+ dw 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87
44
+ dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
45
+ dw 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80
46
+ dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
47
+ dw 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70
48
+ dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
49
+ dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57
50
+ dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
51
+ dw 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43
52
+ dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
53
+ dw 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25
54
+ dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
55
+ dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9
56
+
57
+dct16_shuf_AVX512: dq 0, 1, 8, 9, 4, 5, 12, 13
58
+dct16_shuf1_AVX512: dq 2, 3, 10, 11, 6, 7, 14, 15
59
+dct16_shuf3_AVX512: dq 0, 1, 4, 5, 8, 9, 12, 13
60
+dct16_shuf4_AVX512: dq 2, 3, 6, 7, 10, 11, 14, 15
61
+dct16_shuf2_AVX512: dd 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30
62
+
63
+dct8_shuf5_AVX512: dq 0, 2, 4, 6, 1, 3, 5, 7
64
+dct8_shuf6_AVX512: dq 0, 2, 4, 6, 1, 3, 5, 7
65
+dct8_shuf8_AVX512: dd 0, 2, 8, 10, 4, 6, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
66
+dct8_shuf4_AVX512: times 2 dd 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
67
+dct16_shuf7_AVX512: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
68
+dct16_shuf9_AVX512: dd 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
69
+
70
+dct32_shuf_AVX512: dd 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20 , 21, 24, 25, 28, 29
71
+dct32_shuf4_AVX512: times 2 dd 0, 4, 8, 12, 0, 4, 8, 12
72
+dct32_shuf5_AVX512: dd 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0
73
+dct32_shuf6_AVX512: dd 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0
74
+dct32_shuf7_AVX512: dd 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1
75
+dct32_shuf8_AVX512: dd -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
76
+dct16_shuf5_AVX512: dw 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
77
+dct16_shuf6_AVX512: dw 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
78
+dct16_shuf8_AVX512: dw 20, 0, 4, 2, 28, 8, 6, 10, 22, 16, 12, 18, 30, 24, 14, 26
79
+
80
+dct8_shuf7_AVX512: dw 0, 2, 16, 18, 8, 10, 24, 26, 4, 6, 20, 22, 12, 14, 28, 30
81
+dct8_shuf9_AVX512: times 2 dw 0, 8, 16, 24, 4, 12, 20, 28
82
+dct32_shuf1_AVX512: dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
83
+dct32_shuf2_AVX512: dw 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, 15, 14, 13, 12, 11, 10, 9, 8, 31, 30, 29, 28, 27, 26, 25, 24
84
+dct32_shuf3_AVX512: times 2 dw 0, 8, 16, 24, 2, 10, 18, 26
85
+
86
+dct8_shuf: times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
87
+dct8_shuf_AVX512: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11
88
+
89
tab_dct8: dw 64, 64, 64, 64, 64, 64, 64, 64
90
dw 89, 75, 50, 18, -18, -50, -75, -89
91
dw 83, 36, -36, -83, -83, -36, 36, 83
92
93
dw 36, -83, 83, -36, -36, 83, -83, 36
94
dw 18, -50, 75, -89, 89, -75, 50, -18
95
96
-dct8_shuf: times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
97
+tab_dct8_avx512: dw 64, 64, 64, 64, 89, 75, 50, 18
98
+ dw 83, 36, -36, -83, 75, -18, -89, -50
99
+ dw 64, -64, -64, 64, 50, -89, 18, 75
100
+ dw 36, -83, 83, -36, 18, -50, 75, -89
101
102
tab_dct16_1: dw 64, 64, 64, 64, 64, 64, 64, 64
103
dw 90, 87, 80, 70, 57, 43, 25, 9
104
105
dw 18, -50, 75, -89, 89, -75, 50, -18
106
dw 9, -25, 43, -57, 70, -80, 87, -90
107
108
-
109
tab_dct16_2: dw 64, 64, 64, 64, 64, 64, 64, 64
110
dw -9, -25, -43, -57, -70, -80, -87, -90
111
dw -89, -75, -50, -18, 18, 50, 75, 89
112
113
times 4 dw 50, -89, 18, 75
114
times 4 dw 18, -50, 75, -89
115
116
+avx512_idct8_1: times 8 dw 64, 83, 64, 36
117
+ times 8 dw 64, 36, -64, -83
118
+ times 8 dw 64, -36, -64, 83
119
+ times 8 dw 64, -83, 64, -36
120
+
121
+avx512_idct8_2: times 8 dw 89, 75, 50, 18
122
+ times 8 dw 75, -18, -89, -50
123
+ times 8 dw 50, -89, 18, 75
124
+ times 8 dw 18, -50, 75, -89
125
+
126
+avx512_idct8_3: dw 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36
127
+ dw 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83
128
+ dw 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83
129
+ dw -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36
130
+ dw 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89
131
+ dw 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75
132
+ dw 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50
133
+ dw -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89
134
+
135
idct8_shuf1: dd 0, 2, 4, 6, 1, 3, 5, 7
136
137
const idct8_shuf2, times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
138
139
idct8_shuf3: times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
140
141
+
142
+idct8_avx512_shuf3: times 4 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
143
+
144
tab_idct16_1: dw 90, 87, 80, 70, 57, 43, 25, 9
145
dw 87, 57, 9, -43, -80, -90, -70, -25
146
dw 80, 9, -70, -87, -25, 57, 90, 43
147
148
idct16_shuff: dd 0, 4, 2, 6, 1, 5, 3, 7
149
150
idct16_shuff1: dd 2, 6, 0, 4, 3, 7, 1, 5
151
+idct16_shuff2: dw 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
152
+idct16_shuff3: dw 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31
153
+idct16_shuff4: dd 0, 8, 2, 10, 4, 12, 6, 14
154
+idct16_shuff5: dd 1, 9, 3, 11, 5, 13, 7, 15
155
+
156
+
157
+tab_AVX512_idct16_1: dw 90, 87, 80, 70, 57, 43, 25, 9, 90, 87, 80, 70, 57, 43, 25, 9, 80, 9, -70, -87, -25, 57, 90, 43, 80, 9, -70, -87, -25, 57, 90, 43
158
+ dw 87, 57, 9, -43, -80, -90, -70, -25, 87, 57, 9, -43, -80, -90, -70, -25, 70, -43, -87, 9, 90, 25, -80, -57, 70, -43, -87, 9, 90, 25, -80, -57
159
+ dw 57, -80, -25, 90, -9, -87, 43, 70, 57, -80, -25, 90, -9, -87, 43, 70, 25, -70, 90, -80, 43, 9, -57, 87, 25, -70, 90, -80, 43, 9, -57, 87
160
+ dw 43, -90, 57, 25, -87, 70, 9, -80, 43, -90, 57, 25, -87, 70, 9, -80, 9, -25, 43, -57, 70, -80, 87, -90, 9, -25, 43, -57, 70, -80, 87, -90
161
+
162
+tab_AVX512_idct16_2: dw 64, 89, 83, 75, 64, 50, 36, 18, 64, 89, 83, 75, 64, 50, 36, 18, 64, 50, -36, -89, -64, 18, 83, 75, 64, 50, -36, -89, -64, 18, 83, 75
163
+ dw 64, 75, 36, -18, -64, -89, -83, -50, 64, 75, 36, -18, -64, -89, -83, -50, 64, 18, -83, -50, 64, 75, -36, -89, 64, 18, -83, -50, 64, 75, -36, -89
164
+ dw 64, -18, -83, 50, 64, -75, -36, 89, 64, -18, -83, 50, 64, -75, -36, 89, 64, -75, 36, 18, -64, 89, -83, 50, 64, -75, 36, 18, -64, 89, -83, 50
165
+ dw 64, -50, -36, 89, -64, -18, 83, -75, 64, -50, -36, 89, -64, -18, 83, -75, 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18
166
+
167
+idct16_AVX512_shuff: dd 0, 4, 2, 6, 1, 5, 3, 7, 8, 12, 10, 14, 9, 13, 11, 15
168
+
169
+idct16_AVX512_shuff1: dd 2, 6, 0, 4, 3, 7, 1, 5, 10, 14, 8, 12, 11, 15, 9, 13
170
+
171
+idct16_AVX512_shuff2: dq 0, 1, 8, 9, 4, 5, 12, 13
172
+idct16_AVX512_shuff3: dq 2, 3, 10, 11, 6, 7, 14, 15
173
+idct16_AVX512_shuff4: dq 4, 5, 12, 13, 0, 1, 8, 9
174
+idct16_AVX512_shuff5: dq 6, 7, 14, 15, 2, 3, 10, 11
175
+idct16_AVX512_shuff6: times 4 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
176
177
tab_idct32_1: dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
178
dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
179
180
dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25
181
dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9
182
183
+
184
+tab_idct32_AVX512_1: dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 90 ,90 ,88 ,85, 82, 78, 73, 67, 90, 82, 67, 46, 22, -4, -31, -54, 90, 82, 67, 46, 22, -4, -31, -54
185
+ dw 61, 54, 46, 38, 31, 22, 13, 4, 61, 54, 46, 38, 31, 22, 13, 4, -73, -85, -90, -88, -78, -61, -38, -13, -73, -85, -90, -88, -78, -61, -38, -13
186
+ dw 88, 67, 31, -13, -54, -82, -90, -78, 88, 67, 31, -13, -54, -82, -90, -78, 85, 46, -13, -67, -90, -73, -22, 38, 85, 46, -13, -67, -90, -73, -22, 38
187
+ dw -46, -4, 38, 73, 90, 85, 61, 22, -46, -4, 38, 73, 90, 85, 61, 22, 82, 88, 54, -4, -61, -90, -78, -31, 82, 88, 54, -4, -61, -90, -78, -31
188
+ dw 82, 22, -54, -90, -61, 13, 78, 85, 82, 22, -54, -90, -61, 13, 78, 85, 78, -4, -82, -73, 13, 85, 67, -22, 78, -4, -82, -73, 13, 85, 67, -22
189
+ dw 31, -46, -90, -67, 4, 73, 88, 38, 31, -46, -90, -67, 4, 73, 88, 38, -88, -61, 31, 90, 54, -38, -90, -46, -88, -61, 31, 90, 54, -38, -90, -46
190
+ dw 73, -31, -90, -22, 78, 67, -38, -90, 73, -31, -90, -22, 78, 67, -38, -90, 67, -54, -78, 38, 85, -22, -90, 4, 67, -54, -78, 38, 85, -22, -90, 4
191
+ dw -13, 82, 61, -46, -88, -4, 85, 54, -13, 82, 61, -46, -88, -4, 85, 54, 90, 13, -88, -31, 82, 46, -73, -61, 90, 13, -88, -31, 82, 46, -73, -61
192
+
193
+tab_idct32_AVX512_5: dw 4, -13, 22, -31, 38, -46, 54, -61, 4, -13, 22, -31, 38, -46, 54, -61, 13, -38, 61, -78, 88, -90, 85, -73, 13, -38, 61, -78, 88, -90, 85, -73
194
+ dw 67, -73, 78, -82, 85, -88, 90, -90, 67, -73, 78, -82, 85, -88, 90, -90, 54, -31, 4, 22, -46, 67, -82, 90, 54, -31, 4, 22, -46, 67, -82, 90
195
+ dw 22, -61, 85, -90, 73, -38, -4, 46, 22, -61, 85, -90, 73, -38, -4, 46, 31, -78, 90, -61, 4, 54, -88, 82, 31, -78, 90, -61, 4, 54, -88, 82
196
+ dw -78, 90, -82, 54, -13, -31, 67, -88, -78, 90, -82, 54, -13, -31, 67, -88, -38, -22, 73, -90, 67, -13, -46, 85, -38, -22, 73, -90, 67, -13, -46, 85
197
+ dw 38, -88, 73, -4, -67, 90, -46, -31, 38, -88, 73, -4, -67, 90, -46, -31, 46, -90, 38, 54, -90, 31, 61, -88, 46, -90, 38, 54, -90, 31, 61, -88
198
+ dw 85, -78, 13, 61, -90, 54, 22, -82, 85, -78, 13, 61, -90, 54, 22, -82, 22, 67, -85, 13, 73, -82, 4, 78, 22, 67, -85, 13, 73, -82, 4, 78
199
+ dw 54, -85, -4, 88, -46, -61, 82, 13, 54, -85, -4, 88, -46, -61, 82, 13, 61, -73, -46, 82, 31, -88, -13, 90, 61, -73, -46, 82, 31, -88, -13, 90
200
+ dw -90, 38, 67, -78, -22, 90, -31, -73, -90, 38, 67, -78, -22, 90, -31, -73, -4, -90, 22, 85, -38, -78, 54, 67, -4, -90, 22, 85, -38, -78, 54, 67
201
+
202
+
203
+tab_idct32_AVX512_2: dw 64, 89, 83, 75, 64, 50, 36, 18, 64, 89, 83, 75, 64, 50, 36, 18, 64, 75, 36, -18, -64, -89, -83, -50, 64, 75, 36, -18, -64, -89, -83, -50
204
+ dw 64, 50, -36, -89, -64, 18, 83, 75, 64, 50, -36, -89, -64, 18, 83, 75, 64, 18, -83, -50, 64, 75, -36, -89, 64, 18, -83, -50, 64, 75, -36, -89
205
+ dw 64, -18, -83, 50, 64, -75, -36, 89, 64, -18, -83, 50, 64, -75, -36, 89, 64, -50, -36, 89, -64, -18, 83, -75, 64, -50, -36, 89, -64, -18, 83, -75
206
+ dw 64, -75, 36, 18, -64, 89, -83, 50, 64, -75, 36, 18, -64, 89, -83, 50, 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18
207
+
208
+tab_idct32_AVX512_3: dw 90, 87, 80, 70, 57, 43, 25, 9, 90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25, 87, 57, 9, -43, -80, -90, -70, -25
209
+ dw 80, 9, -70, -87, -25, 57, 90, 43, 80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57, 70, -43, -87, 9, 90, 25, -80, -57
210
+ dw 57, -80, -25, 90, -9, -87, 43, 70, 57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80, 43, -90, 57, 25, -87, 70, 9, -80
211
+ dw 25, -70, 90, -80, 43, 9, -57, 87, 25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90, 9, -25, 43, -57, 70, -80, 87, -90
212
+
213
+tab_idct32_AVX512_4: dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
214
+ dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
215
+ dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
216
+ dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
217
+ dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38
218
+ dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
219
+ dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
220
+ dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61
221
+ dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
222
+ dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
223
+ dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78, 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78
224
+ dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
225
+ dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
226
+ dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
227
+ dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90
228
+ dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
229
+
230
+tab_idct32_AVX512_6: dw 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9, 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9
231
+ dw 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25, 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25
232
+ dw 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43, 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43
233
+ dw 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57, 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57
234
+ dw 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70, 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70
235
+ dw 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80, 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80
236
+ dw 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87, 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87
237
+ dw 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90, 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90
238
+ dw 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90, 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90
239
+ dw 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87, 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87
240
+ dw 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80, 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80
241
+ dw 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70, 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70
242
+ dw 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57, 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57
243
+ dw 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43, 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43
244
+ dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25, 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25
245
+ dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9, 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9
246
+
247
+
248
avx2_dct4: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
249
dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83
250
251
252
253
tab_idct8_2: times 1 dw 89, 75, 50, 18, 75, -18, -89, -50
254
times 1 dw 50, -89, 18, 75, 18, -50, 75, -89
255
-
256
pb_idct8odd: db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15
257
258
+;Scale bits table for rdoQuant
259
+tab_nonpsyRdo8 : dq 5, 7, 9, 11
260
+tab_nonpsyRdo10: dq 9, 11, 13, 15
261
+tab_nonpsyRdo12: dq 13, 15, 17, 19
262
+
263
SECTION .text
264
cextern pd_1
265
cextern pd_2
266
267
%define DST4_ROUND 16
268
%define DCT8_SHIFT1 6
269
%define DCT8_ROUND1 32
270
+ %define RDO_MAX_4 3
271
+ %define RDO_MAX_8 1
272
+ %define RDO_MAX_16 0
273
+ %define RDO_MAX_32 0
274
%elif BIT_DEPTH == 10
275
%define DCT4_SHIFT 3
276
%define DCT4_ROUND 4
277
278
%define DST4_ROUND 4
279
%define DCT8_SHIFT1 4
280
%define DCT8_ROUND1 8
281
+ %define RDO_MAX_4 7
282
+ %define RDO_MAX_8 5
283
+ %define RDO_MAX_16 3
284
+ %define RDO_MAX_32 1
285
%elif BIT_DEPTH == 8
286
%define DCT4_SHIFT 1
287
%define DCT4_ROUND 1
288
289
%define DST4_ROUND 1
290
%define DCT8_SHIFT1 2
291
%define DCT8_ROUND1 2
292
+ %define RDO_MAX_4 11
293
+ %define RDO_MAX_8 9
294
+ %define RDO_MAX_16 7
295
+ %define RDO_MAX_32 5
296
%else
297
%error Unsupported BIT_DEPTH!
298
%endif
299
300
dec r3d
301
jnz .loop
302
RET
303
+%if ARCH_X86_64 == 1
304
+INIT_ZMM avx512
305
+cglobal denoise_dct, 4, 4, 22
306
+ pxor m16, m16
307
+ sub r3d, 16
308
+ je .coeff16
309
+ add r3d, 16
310
+ shr r3d, 5
311
+ jmp .loop
312
+
313
+.coeff16:
314
+ movu ym19, [r0]
315
+ pabsw ym17, ym19
316
+ movu m2, [r1]
317
+ pmovsxwd m18, ym17
318
+ paddd m2, m18
319
+ movu [r1], m2
320
+ movu ym3, [r2]
321
+ psubusw ym17, ym3
322
+ pcmpgtw ym18, ym17, ym16
323
+ pand ym17, ym18
324
+ psignw ym17, ym19
325
+ movu [r0], ym17
326
+ RET
327
+
328
+.loop:
329
+ movu m21, [r0]
330
+ pabsw m17, m21
331
+ movu m2, [r1]
332
+ pmovsxwd m4, ym17
333
+ paddd m2, m4
334
+ movu [r1], m2
335
+ vextracti64x4 ym4, m17, 1
336
+
337
+ movu m2, [r1 + mmsize]
338
+ pmovsxwd m3, ym4
339
+ paddd m2, m3
340
+ movu [r1 + mmsize], m2
341
+ movu m3, [r2]
342
+ psubusw m17, m3
343
+
344
+ vextracti64x4 ym20, m17, 1
345
+ pcmpgtw ym18, ym17, ym16
346
+ pcmpgtw ym19, ym20, ym16
347
+ vinserti64x4 m18, m18, ym19, 1
348
+
349
+ pand m17, m18
350
+ vextracti64x4 ym19, m17, 1
351
+ vextracti64x4 ym20, m21, 1
352
+ psignw ym17, ym21
353
+ psignw ym19, ym20
354
+ vinserti64x4 m17, m17, ym19, 1
355
+
356
+ movu [r0], m17
357
+ add r0, mmsize
358
+ add r1, mmsize * 2
359
+ add r2, mmsize
360
+ dec r3d
361
+ jnz .loop
362
+ RET
363
+%endif ; ARCH_X86_64 == 1
364
365
%if ARCH_X86_64 == 1
366
%macro DCT8_PASS_1 4
367
368
movu [r1 + 96], m10
369
RET
370
371
+
372
+%macro DCT8_AVX512_PASS_1 4
373
+ vpmaddwd m%2, m3, m%1
374
+ vpsrlq m8, m%2, 32
375
+ vpaddd m%2, m8
376
+ vpaddd m%2, m5
377
+ vpsrad m%2, DCT8_SHIFT1
378
+
379
+ vpmaddwd m%4, m2, m%3
380
+ vpsrlq m8, m%4, 32
381
+ vpaddd m%4, m8
382
+ vpaddd m%4, m5
383
+ vpsrad m%4, DCT8_SHIFT1
384
+
385
+ vpackssdw m%2, m%4
386
+ vpermw m%2, m1, m%2
387
+%endmacro
388
+
389
+%macro DCT8_AVX512_PASS_2 4
390
+ vpmaddwd m0, m9, m%1
391
+ vpmaddwd m1, m10, m%1
392
+ vpsrldq m2, m0, 8
393
+ vpsrldq m3, m1, 8
394
+ vpaddd m0, m2
395
+ vpaddd m1, m3
396
+ vpsrlq m2, m0, 32
397
+ vpsrlq m3, m1, 32
398
+ vpaddd m0, m2
399
+ vpaddd m1, m3
400
+ vpaddd m0, m5
401
+ vpsrad m0, DCT8_SHIFT2
402
+ vpaddd m1, m5
403
+ vpsrad m1, DCT8_SHIFT2
404
+ vpackssdw m0, m1
405
+ vpermw m0, m19, m0
406
+
407
+ vpmaddwd m1, m9, m%2
408
+ vpmaddwd m2, m10, m%2
409
+ vpsrldq m3, m1, 8
410
+ vpsrldq m4, m2, 8
411
+ vpaddd m1, m3
412
+ vpaddd m2, m4
413
+ vpsrlq m3, m1, 32
414
+ vpsrlq m4, m2, 32
415
+ vpaddd m1, m3
416
+ vpaddd m2, m4
417
+ vpaddd m1, m5
418
+ vpsrad m1, DCT8_SHIFT2
419
+ vpaddd m2, m5
420
+ vpsrad m2, DCT8_SHIFT2
421
+ vpackssdw m1, m2
422
+ vpermw m1, m19, m1
423
+ vinserti128 ym0, ym0, xm1, 1
424
+
425
+ vpmaddwd m1, m9, m%3
426
+ vpmaddwd m2, m10, m%3
427
+ vpsrldq m3, m1, 8
428
+ vpsrldq m4, m2, 8
429
+ vpaddd m1, m3
430
+ vpaddd m2, m4
431
+ vpsrlq m3, m1, 32
432
+ vpsrlq m4, m2, 32
433
+ vpaddd m1, m3
434
+ vpaddd m2, m4
435
+ vpaddd m1, m5
436
+ vpsrad m1, DCT8_SHIFT2
437
+ vpaddd m2, m5
438
+ vpsrad m2, DCT8_SHIFT2
439
+ vpackssdw m1, m2
440
+ vpermw m1, m19, m1
441
+
442
+ vpmaddwd m2, m9, m%4
443
+ vpmaddwd m3, m10, m%4
444
+ vpsrldq m4, m2, 8
445
+ vpsrldq m6, m3, 8
446
+ vpaddd m2, m4
447
+ vpaddd m3, m6
448
+ vpsrlq m4, m2, 32
449
+ vpsrlq m6, m3, 32
450
+ vpaddd m2, m4
451
+ vpaddd m3, m6
452
+ vpaddd m2, m5
453
+ vpsrad m2, DCT8_SHIFT2
454
+ vpaddd m3, m5
455
+ vpsrad m3, DCT8_SHIFT2
456
+ vpackssdw m2, m3
457
+ vpermw m2, m19, m2
458
+
459
+ vinserti128 ym1, ym1, xm2, 1
460
+ vinserti64x4 m0, m0, ym1, 1
461
+%endmacro
462
+
463
+INIT_ZMM avx512
464
+cglobal dct8, 3, 7, 24
465
+
466
+ vbroadcasti32x4 m5, [pd_ %+ DCT8_ROUND1]
467
+ vbroadcasti32x8 m4, [dct8_shuf]
468
+ vbroadcasti32x4 m19, [dct8_shuf9_AVX512]
469
+
470
+ add r2d, r2d
471
+ lea r3, [r2 * 3]
472
+ lea r4, [r0 + r2 * 4]
473
+ lea r5, [tab_dct8]
474
+ lea r6, [tab_dct8_avx512]
475
+
476
+ ;pass1
477
+ mova xm0, [r0]
478
+ vinserti128 ym0, ym0, [r4], 1
479
+ mova xm1, [r0 + r2]
480
+ vinserti128 ym1, ym1, [r4 + r2], 1
481
+ mova xm2, [r0 + r2 * 2]
482
+ vinserti128 ym2, ym2, [r4 + r2 * 2], 1
483
+ mova xm3, [r0 + r3]
484
+ vinserti128 ym3, ym3, [r4 + r3], 1
485
+
486
+ vinserti64x4 m0, m0, ym2, 1
487
+ vinserti64x4 m1, m1, ym3, 1
488
+
489
+ vpunpcklqdq m2, m0, m1
490
+ vpunpckhqdq m0, m1
491
+
492
+ vpshufb m0, m4
493
+ vpaddw m3, m2, m0
494
+ vpsubw m2, m0
495
+
496
+ vbroadcasti32x8 m1, [dct8_shuf7_AVX512]
497
+
498
+ ; Load all the coefficients together for better caching
499
+ vpbroadcastq m20, [r6 + 0 * 8]
500
+ vpbroadcastq m21, [r6 + 1 * 8]
501
+ vpbroadcastq m22, [r6 + 2 * 8]
502
+ vpbroadcastq m23, [r6 + 3 * 8]
503
+ vpbroadcastq m7, [r6 + 4 * 8]
504
+ vpbroadcastq m12, [r6 + 5 * 8]
505
+ vpbroadcastq m14, [r6 + 6 * 8]
506
+ vpbroadcastq m16, [r6 + 7 * 8]
507
+
508
+ DCT8_AVX512_PASS_1 20, 9, 21, 10
509
+ DCT8_AVX512_PASS_1 22, 11, 23, 10
510
+ DCT8_AVX512_PASS_1 7, 13, 12, 10
511
+ DCT8_AVX512_PASS_1 14, 15, 16, 10
512
+
513
+ ;pass2
514
+ vbroadcasti32x4 m5, [pd_ %+ DCT8_ROUND2]
515
+
516
+ vinserti64x4 m9, m9, ym11, 1
517
+ vinserti64x4 m10, m13, ym15, 1
518
+
519
+ ;Load all the coefficients together for better caching and reuse common coefficients from PASS 1
520
+ vbroadcasti32x4 m21, [r5 + 1 * 16]
521
+ vbroadcasti32x4 m22, [r5 + 2 * 16]
522
+ vbroadcasti32x4 m23, [r5 + 3 * 16]
523
+ vbroadcasti32x4 m12, [r5 + 5 * 16]
524
+ vbroadcasti32x4 m14, [r5 + 6 * 16]
525
+ vbroadcasti32x4 m16, [r5 + 7 * 16]
526
+
527
+ DCT8_AVX512_PASS_2 20, 21, 22, 23
528
+ movu [r1], m0
529
+ DCT8_AVX512_PASS_2 7, 12, 14, 16
530
+ movu [r1 + 64], m0
531
+ RET
532
+
533
%macro DCT16_PASS_1_E 2
534
vpbroadcastq m7, [r7 + %1]
535
536
537
dec r4d
538
jnz .pass2
539
RET
540
+%macro DCT16_avx512_PASS_1_O 4
541
+ vbroadcasti32x4 m1, [r5 + %1]
542
+
543
+ pmaddwd m3, m6, m1
544
+ vpsrldq m11, m3, 8
545
+ vpaddd m3, m11
546
+
547
+ pmaddwd m11, m8, m1
548
+ vpsrldq m12, m11, 8
549
+ vpaddd m11, m12
550
+
551
+ vpunpcklqdq m12, m3, m11
552
+ vpsrldq m11, m12, 4
553
+ vpaddd m11, m12
554
+
555
+ pmaddwd m3, m10, m1
556
+ vpsrldq m12, m3, 8
557
+ vpaddd m3, m12
558
+
559
+ pmaddwd m12, m2, m1
560
+ vpsrldq m13, m12, 8
561
+ vpaddd m12, m13
562
+
563
+ vpunpcklqdq m13, m3, m12
564
+ vpsrldq m12, m13, 4
565
+ vpaddd m12, m13
566
+
567
+ mova m%3, m26
568
+ vpermi2d m%3, m11, m12
569
+ paddd m%3, m0
570
+ psrad m%3, DCT_SHIFT
571
+
572
+ ; next row start
573
+ vbroadcasti32x4 m1, [r5 + %2]
574
+
575
+ pmaddwd m3, m6, m1
576
+ vpsrldq m11, m3, 8
577
+ vpaddd m3, m11
578
+
579
+ pmaddwd m11, m8, m1
580
+ vpsrldq m12, m11, 8
581
+ vpaddd m11, m12
582
+
583
+ vpunpcklqdq m12, m3, m11
584
+ vpsrldq m11, m12, 4
585
+ vpaddd m11, m12
586
+
587
+ pmaddwd m3, m10, m1
588
+ vpsrldq m12, m3, 8
589
+ vpaddd m3, m12
590
+
591
+ pmaddwd m12, m2, m1
592
+ vpsrldq m13, m12, 8
593
+ vpaddd m12, m13
594
+
595
+ vpunpcklqdq m13, m3, m12
596
+ vpsrldq m12, m13, 4
597
+ vpaddd m12, m13
598
+
599
+ mova m%4, m26
600
+ vpermi2d m%4, m11, m12
601
+ paddd m%4, m0
602
+ psrad m%4, DCT_SHIFT
603
+ ;next row end
604
+
605
+ packssdw m%3, m%4
606
+ vpermw m%4, m25, m%3
607
+%endmacro
608
+
609
+%macro DCT16_AVX512_PASS_1_LOOP 0
610
+ vbroadcasti32x8 m1, [dct16_shuf1]
611
+ mova m2, [dct16_shuf3_AVX512]
612
+ mova m3, [dct16_shuf4_AVX512]
613
+
614
+ movu ym4, [r0]
615
+ movu ym5, [r0 + r2]
616
+ vinserti64x4 m4, m4, ym5, 1
617
+
618
+ movu ym5, [r0 + 2 * r2]
619
+ movu ym6, [r0 + r3]
620
+ vinserti64x4 m5, m5, ym6, 1
621
+
622
+ mova m6, m2
623
+ mova m7, m3
624
+ vpermi2q m6, m4, m5
625
+ vpermi2q m7, m4, m5
626
+
627
+ movu ym4, [r4]
628
+ movu ym5, [r4 + r2]
629
+ vinserti64x4 m4, m4, ym5, 1
630
+
631
+ movu ym5, [r4 + 2 * r2]
632
+ movu ym8, [r4 + r3]
633
+ vinserti64x4 m5, m5, ym8, 1
634
+
635
+ mova m8, m2
636
+ mova m9, m3
637
+ vpermi2q m8, m4, m5
638
+ vpermi2q m9, m4, m5
639
+
640
+ vpshufb m7, m1
641
+ vpshufb m9, m1
642
+
643
+ paddw m4, m6, m7
644
+ psubw m6, m7
645
+
646
+ paddw m5, m8, m9
647
+ psubw m8, m9
648
+
649
+ lea r0, [r0 + 8 * r2]
650
+ lea r4, [r0 + r2 * 4]
651
+
652
+ movu ym7, [r0]
653
+ movu ym9, [r0 + r2]
654
+ vinserti64x4 m7, m7, ym9, 1
655
+
656
+ movu ym9, [r0 + 2 * r2]
657
+ movu ym10, [r0 + r3]
658
+ vinserti64x4 m9, m9, ym10, 1
659
+
660
+ mova m10, m2
661
+ mova m11, m3
662
+ vpermi2q m10, m7, m9
663
+ vpermi2q m11, m7, m9
664
+
665
+ vpshufb m11, m1
666
+ paddw m7, m10, m11
667
+ psubw m10, m11
668
+
669
+ movu ym9, [r4]
670
+ movu ym11, [r4 + r2]
671
+ vinserti64x4 m9, m9, ym11, 1
672
+
673
+ movu ym11, [r4 + 2 * r2]
674
+ movu ym12, [r4 + r3]
675
+ vinserti64x4 m11, m11, ym12, 1
676
+
677
+ vpermi2q m2, m9, m11
678
+ vpermi2q m3, m9, m11
679
+
680
+ vpshufb m3, m1
681
+ paddw m9, m2, m3
682
+ psubw m2, m3
683
+%endmacro
684
+
685
+%macro DCT16_avx512_PASS_1_E 4
686
+ vpbroadcastq m1, [r5 + %1]
687
+
688
+ pmaddwd m19, m11, m1
689
+ vpsrldq m12, m19, 4
690
+ vpaddd m12, m19
691
+
692
+ pmaddwd m19, m13, m1
693
+ vpsrldq m18, m19, 4
694
+ vpaddd m18, m19
695
+
696
+ mova m%2, m27
697
+ vpermi2d m%2, m12, m18
698
+ paddd m%2, m0
699
+ psrad m%2, DCT_SHIFT
700
+
701
+ ; 2nd row
702
+ vpbroadcastq m1, [r5 + %3]
703
+
704
+ pmaddwd m19, m11, m1
705
+ vpsrldq m12, m19, 4
706
+ vpaddd m12, m19
707
+
708
+ pmaddwd m19, m13, m1
709
+ vpsrldq m18, m19, 4
710
+ vpaddd m18, m19
711
+
712
+ mova m%4, m27
713
+ vpermi2d m%4, m12, m18
714
+ paddd m%4, m0
715
+ psrad m%4, DCT_SHIFT
716
+
717
+ packssdw m%2, m%4
718
+ vpermw m%4, m25, m%2
719
+%endmacro
720
+
721
+%macro DCT16_PASS2_AVX512 10
722
+ vpmaddwd m5, m%2, m%1
723
+ vpsrldq m6, m5, 8
724
+ vpaddd m5, m6
725
+ vpsrldq m6, m5, 4
726
+ vpaddd m5, m6
727
+
728
+ vpmaddwd m6, m%3, m%1
729
+ vpsrldq m7, m6, 8
730
+ vpaddd m6, m7
731
+ vpsrldq m7, m6, 4
732
+ vpaddd m6, m7
733
+ vpunpckldq m7, m5, m6
734
+
735
+ vpmaddwd m5, m%4, m%1
736
+ vpsrldq m6, m5, 8
737
+ vpaddd m5, m6
738
+ vpsrldq m6, m5, 4
739
+ vpaddd m5, m6
740
+
741
+ vpmaddwd m6, m%5, m%1
742
+ vpsrldq m8, m6, 8
743
+ vpaddd m6, m8
744
+ vpsrldq m8, m6, 4
745
+ vpaddd m6, m8
746
+ vpunpckldq m8, m5, m6
747
+
748
+ vpunpcklqdq m5, m7, m8
749
+ vpermd m5, m2, m5
750
+ vpsrldq m6, m5, 4
751
+ vpaddd m5, m6
752
+
753
+ vpmaddwd m6, m%6, m%1
754
+ vpsrldq m7, m6, 8
755
+ vpaddd m6, m7
756
+ vpsrldq m7, m6, 4
757
+ vpaddd m6, m7
758
+
759
+ vpmaddwd m7, m%7, m%1
760
+ vpsrldq m8, m7, 8
761
+ vpaddd m7, m8
762
+ vpsrldq m8, m7, 4
763
+ vpaddd m7, m8
764
+ vpunpckldq m8, m6, m7
765
+
766
+ vpmaddwd m6, m%8, m%1
767
+ vpsrldq m7, m6, 8
768
+ vpaddd m6, m7
769
+ vpsrldq m7, m6, 4
770
+ vpaddd m6, m7
771
+
772
+ vpmaddwd m7, m%9, m%1
773
+ vpsrldq m4, m7, 8
774
+ vpaddd m7, m4
775
+ vpsrldq m4, m7, 4
776
+ vpaddd m7, m4
777
+ vpunpckldq m4, m6, m7
778
+
779
+ vpunpcklqdq m6, m8, m4
780
+ vpermd m6, m2, m6
781
+ vpsrldq m7, m6, 4
782
+ vpaddd m6, m7
783
+
784
+ paddd m5, m0
785
+ psrad m5, DCT_SHIFT2
786
+ paddd m6, m0
787
+ psrad m6, DCT_SHIFT2
788
+
789
+ packssdw m5, m6
790
+ vpermw m%10, m3, m5
791
+%endmacro
792
+
793
+INIT_ZMM avx512
794
+cglobal dct16, 3, 6, 29
795
+
796
+%if BIT_DEPTH == 12
797
+ %define DCT_SHIFT 7
798
+ vbroadcasti32x4 m0, [pd_64]
799
+%elif BIT_DEPTH == 10
800
+ %define DCT_SHIFT 5
801
+ vbroadcasti32x4 m0, [pd_16]
802
+%elif BIT_DEPTH == 8
803
+ %define DCT_SHIFT 3
804
+ vbroadcasti32x4 m0, [pd_4]
805
+%else
806
+ %error Unsupported BIT_DEPTH!
807
+%endif
808
+%define DCT_SHIFT2 10
809
+
810
+ add r2d, r2d
811
+ lea r3, [r2 * 3]
812
+ lea r4, [r0 + r2 * 4]
813
+ lea r5, [tab_dct16_1 + 8 * 16]
814
+
815
+ ;Load reusable table once to save memory movements
816
+ mova m25, [dct16_shuf5_AVX512]
817
+ mova m26, [dct16_shuf2_AVX512]
818
+ mova m27, [dct16_shuf7_AVX512]
819
+ vbroadcasti32x8 m28, [dct16_shuf6_AVX512]
820
+
821
+ DCT16_AVX512_PASS_1_LOOP
822
+ DCT16_avx512_PASS_1_O -7 * 16, -5 * 16, 15, 14 ;row 1, 3
823
+ DCT16_avx512_PASS_1_O -3 * 16, -1 * 16, 16, 15 ;row 5, 7
824
+ DCT16_avx512_PASS_1_O 1 * 16, 3 * 16, 17, 16 ;row 9, 11
825
+ DCT16_avx512_PASS_1_O 5 * 16, 7 * 16, 18, 17 ;row 13, 15
826
+
827
+ vbroadcasti32x8 m1, [dct16_shuf2]
828
+ pshufb m4, m1
829
+ pshufb m5, m1
830
+ pshufb m7, m1
831
+ pshufb m9, m1
832
+
833
+ vpsrldq m3, m4, 2
834
+ vpsubw m11, m4, m3
835
+ vpsrldq m6, m5, 2
836
+ vpsubw m12, m5, m6
837
+ vpsrldq m8, m7, 2
838
+ vpsubw m13, m7, m8
839
+ vpsrldq m10, m9, 2
840
+ vpsubw m18, m9, m10
841
+
842
+ vpermw m11, m28, m11
843
+ vpermw m12, m28, m12
844
+ vinserti64x4 m11, m11, ym12, 1
845
+
846
+ vpermw m13, m28, m13
847
+ vpermw m18, m28, m18
848
+ vinserti64x4 m13, m13, ym18, 1
849
+
850
+ DCT16_avx512_PASS_1_E -6 * 16, 21, -2 * 16, 20 ; row 2, 6
851
+ DCT16_avx512_PASS_1_E 2 * 16, 22, 6 * 16, 21 ; row 10, 14
852
+
853
+ vpaddw m11, m4, m3
854
+ vpaddw m12, m5, m6
855
+ vpaddw m13, m7, m8
856
+ vpaddw m18, m9, m10
857
+
858
+ vpermw m11, m28, m11
859
+ vpermw m12, m28, m12
860
+ vinserti64x4 m11, m11, ym12, 1
861
+
862
+ vpermw m13, m28, m13
863
+ vpermw m18, m28, m18
864
+ vinserti64x4 m13, m13, ym18, 1
865
+
866
+ DCT16_avx512_PASS_1_E -8 * 16, 23, 0 * 16, 22 ; row 0, 8
867
+ DCT16_avx512_PASS_1_E -4 * 16, 24, 4 * 16, 23 ; row 4, 12
868
+
869
+ ;PASS2
870
+ vbroadcasti128 m0, [pd_512]
871
+
872
+ lea r5, [tab_dct16]
873
+ mova m2, [dct16_shuf9_AVX512]
874
+ vbroadcasti32x8 m3, [dct16_shuf8_AVX512]
875
+
876
+ vbroadcasti32x8 m1, [r5 + 0 * 32]
877
+ DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
878
+ vbroadcasti32x8 m1, [r5 + 1 * 32]
879
+ DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
880
+ vinserti64x4 m9, m9, ym10, 1
881
+ movu [r1 + 0 * 64], m9
882
+
883
+ vbroadcasti32x8 m1, [r5 + 2 * 32]
884
+ DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
885
+ vbroadcasti32x8 m1, [r5 + 3 * 32]
886
+ DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
887
+ vinserti64x4 m9, m9, ym10, 1
888
+ movu [r1 + 1 * 64], m9
889
+
890
+ vbroadcasti32x8 m1, [r5 + 4 * 32]
891
+ DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
892
+ vbroadcasti32x8 m1, [r5 + 5 * 32]
893
+ DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
894
+ vinserti64x4 m9, m9, ym10, 1
895
+ movu [r1 + 2 * 64], m9
896
+
897
+ vbroadcasti32x8 m1, [r5 + 6 * 32]
898
+ DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
899
+ vbroadcasti32x8 m1, [r5 + 7 * 32]
900
+ DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
901
+ vinserti64x4 m9, m9, ym10, 1
902
+ movu [r1 + 3 * 64], m9
903
+
904
+ vbroadcasti32x8 m1, [r5 + 8 * 32]
905
+ DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
906
+ vbroadcasti32x8 m1, [r5 + 9 * 32]
907
+ DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
908
+ vinserti64x4 m9, m9, ym10, 1
909
+ movu [r1 + 4 * 64], m9
910
+
911
+ vbroadcasti32x8 m1, [r5 + 10 * 32]
912
+ DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
913
+ vbroadcasti32x8 m1, [r5 + 11 * 32]
914
+ DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
915
+ vinserti64x4 m9, m9, ym10, 1
916
+ movu [r1 + 5 * 64], m9
917
+
918
+ vbroadcasti32x8 m1, [r5 + 12 * 32]
919
+ DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
920
+ vbroadcasti32x8 m1, [r5 + 13 * 32]
921
+ DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
922
+ vinserti64x4 m9, m9, ym10, 1
923
+ movu [r1 + 6 * 64], m9
924
+
925
+ vbroadcasti32x8 m1, [r5 + 14 * 32]
926
+ DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
927
+ vbroadcasti32x8 m1, [r5 + 15 * 32]
928
+ DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
929
+ vinserti64x4 m9, m9, ym10, 1
930
+ movu [r1 + 7 * 64], m9
931
+ RET
932
933
%macro DCT32_PASS_1 4
934
vbroadcasti128 m8, [r7 + %1]
935
-
936
pmaddwd m11, m%3, m8
937
pmaddwd m12, m%4, m8
938
phaddd m11, m12
939
940
jnz .pass2
941
RET
942
943
+
944
+%macro DCT32_avx512_LOOP 4
945
+ movu m1, [r0]
946
+ movu m2, [r0 + r2]
947
+
948
+ vinserti64x4 m3, m1, ym2, 1 ; row 0l, 1l
949
+ vextracti64x4 ym4, m1, 1
950
+ vinserti64x4 m2, m2, ym4, 0 ; row 0h, 1h
951
+ vpermw m2, m31, m2
952
+
953
+ psubw m%1, m3, m2 ; O
954
+ paddw m3, m2 ; E
955
+ mova [r9 + %3 * 64], m3
956
+
957
+ movu m1, [r0 + 2 * r2]
958
+ movu m5, [r0 + r3]
959
+
960
+ vinserti64x4 m6, m1, ym5, 1 ; row 2l, 3l
961
+ vextracti64x4 ym7, m1, 1
962
+ vinserti64x4 m5, m5, ym7, 0 ; row 2h, 3h
963
+ vpermw m5, m31, m5
964
+
965
+ psubw m%2, m6, m5 ; O
966
+ paddw m6, m5 ; E
967
+ mova [r9 + %4 * 64], m6
968
+%endmacro
969
+
970
+%macro DCT32_avx512_PASS_1_O 3
971
+ pmaddwd m10, m%2, m9
972
+ vpsrldq m11, m10, 8
973
+ vpaddd m10, m11
974
+
975
+ pmaddwd m11, m%3, m9
976
+ vpsrldq m12, m11, 8
977
+ vpaddd m11, m12
978
+
979
+ mova m12, m8
980
+ vpermi2d m12, m10, m11
981
+ vpsrldq m10, m12, 8
982
+ vpaddd m12, m10
983
+ vpsrldq m10, m12, 4
984
+ vpaddd m12, m10
985
+
986
+ vpaddd m12, m0
987
+ vpsrad m12, DCT_SHIFT
988
+ vpackssdw m12, m12
989
+ vpermw m12, m30, m12
990
+ movq [r5 + %1], xm12
991
+%endmacro
992
+
993
+%macro DCT32_avx512_PASS_1_ROW_O 0
994
+ vbroadcasti32x8 m9, [r7 + 1 * 32]
995
+
996
+ DCT32_avx512_LOOP 13, 14, 0, 1
997
+ DCT32_avx512_PASS_1_O 1 * 64 + 0 * 8, 13, 14
998
+
999
+ lea r0, [r0 + 4 * r2]
1000
+ DCT32_avx512_LOOP 15, 16, 2, 3
1001
+ DCT32_avx512_PASS_1_O 1 * 64 + 1 * 8, 15, 16
1002
+
1003
+ lea r0, [r0 + 4 * r2]
1004
+ DCT32_avx512_LOOP 17, 18, 4, 5
1005
+ DCT32_avx512_PASS_1_O 1 * 64 + 2 * 8, 17, 18
1006
+
1007
+ lea r0, [r0 + 4 * r2]
1008
+ DCT32_avx512_LOOP 19, 20, 6, 7
1009
+ DCT32_avx512_PASS_1_O 1 * 64 + 3 * 8, 19, 20
1010
+
1011
+ lea r0, [r0 + 4 * r2]
1012
+ DCT32_avx512_LOOP 21, 22, 8, 9
1013
+ DCT32_avx512_PASS_1_O 1 * 64 + 4 * 8, 21, 22
1014
+
1015
+ lea r0, [r0 + 4 * r2]
1016
+ DCT32_avx512_LOOP 23, 24, 10, 11
1017
+ DCT32_avx512_PASS_1_O 1 * 64 + 5 * 8, 23, 24
1018
+
1019
+ lea r0, [r0 + 4 * r2]
1020
+ DCT32_avx512_LOOP 25, 26, 12, 13
1021
+ DCT32_avx512_PASS_1_O 1 * 64 + 6 * 8, 25, 26
1022
+
1023
+ lea r0, [r0 + 4 * r2]
1024
+ DCT32_avx512_LOOP 27, 28, 14, 15
1025
+ DCT32_avx512_PASS_1_O 1 * 64 + 7 * 8, 27, 28
1026
+%endmacro
1027
+
1028
+%macro DCT32_avx512_PASS_1_ROW_O_1_7 1
1029
+ vbroadcasti32x8 m9, [r7 + %1 * 32]
1030
+
1031
+ DCT32_avx512_PASS_1_O %1 * 64 + 0 * 8, 13, 14
1032
+ DCT32_avx512_PASS_1_O %1 * 64 + 1 * 8, 15, 16
1033
+ DCT32_avx512_PASS_1_O %1 * 64 + 2 * 8, 17, 18
1034
+ DCT32_avx512_PASS_1_O %1 * 64 + 3 * 8, 19, 20
1035
+ DCT32_avx512_PASS_1_O %1 * 64 + 4 * 8, 21, 22
1036
+ DCT32_avx512_PASS_1_O %1 * 64 + 5 * 8, 23, 24
1037
+ DCT32_avx512_PASS_1_O %1 * 64 + 6 * 8, 25, 26
1038
+ DCT32_avx512_PASS_1_O %1 * 64 + 7 * 8, 27, 28
1039
+%endmacro
1040
+
1041
+%macro DCT32_avx512_LOOP_EO 4
1042
+ mova m4, [rsp + 32 * mmsize + %3 * 64]
1043
+ vpermw m4, m8, m4
1044
+ vextracti64x4 ym5, m4, 1
1045
+
1046
+ mova m6, [rsp + 32 * mmsize + %4 * 64]
1047
+ vpermw m6, m8, m6
1048
+ vextracti64x4 ym7, m6, 1
1049
+
1050
+ vinserti64x4 m4, m4, ym6, 1
1051
+ vinserti64x4 m5, m5, ym7, 1
1052
+
1053
+ psubw m%1, m4, m5 ; EO
1054
+ paddw m%2, m4, m5 ; EE
1055
+%endmacro
1056
+
1057
+%macro DCT32_avx512_PASS_1_ROW_EO 2
1058
+ pmaddwd m29, m%2, m12
1059
+ vpsrldq m30, m29, 8
1060
+ vpaddd m30, m29
1061
+ vpsrldq m29, m30, 4
1062
+ vpaddd m29, m30
1063
+
1064
+ vpaddd m29, m0
1065
+ vpsrad m29, DCT_SHIFT
1066
+ vpackssdw m29, m29
1067
+
1068
+ vpermw m29, m11, m29
1069
+ movq [r5 + %1], xm29
1070
+%endmacro
1071
+
1072
+%macro DCT32_avx512_PASS_1_ROW_EO_0 0
1073
+
1074
+ mova m8, [dct32_shuf2_AVX512]
1075
+ vbroadcasti32x4 m12, [r7 + 2 * 32]
1076
+
1077
+ DCT32_avx512_LOOP_EO 13, 14, 0, 1
1078
+ DCT32_avx512_PASS_1_ROW_EO 2 * 64 + 0 * 8, 13
1079
+
1080
+ lea r9, [r9 + 4 * r2]
1081
+ DCT32_avx512_LOOP_EO 15, 16, 2, 3
1082
+ DCT32_avx512_PASS_1_ROW_EO 2 * 64 + 1 * 8, 15
1083
+
1084
+ lea r9, [r9 + 4 * r2]
1085
+ DCT32_avx512_LOOP_EO 17, 18, 4, 5
1086
+ DCT32_avx512_PASS_1_ROW_EO 2 * 64 + 2 * 8, 17
1087
+
1088
+ lea r9, [r9 + 4 * r2]
1089
+ DCT32_avx512_LOOP_EO 19, 20, 6, 7
1090
+ DCT32_avx512_PASS_1_ROW_EO 2 * 64 + 3 * 8, 19
1091
+
1092
+ lea r9, [r9 + 4 * r2]
1093
+ DCT32_avx512_LOOP_EO 21, 22, 8, 9
1094
+ DCT32_avx512_PASS_1_ROW_EO 2 * 64 + 4 * 8, 21
1095
+
1096
+ lea r9, [r9 + 4 * r2]
1097
+ DCT32_avx512_LOOP_EO 23, 24, 10, 11
1098
+ DCT32_avx512_PASS_1_ROW_EO 2 * 64 + 5 * 8, 23
1099
+
1100
+ lea r9, [r9 + 4 * r2]
1101
+ DCT32_avx512_LOOP_EO 25, 26, 12, 13
1102
+ DCT32_avx512_PASS_1_ROW_EO 2 * 64 + 6 * 8, 25
1103
+
1104
+ lea r9, [r9 + 4 * r2]
1105
+ DCT32_avx512_LOOP_EO 27, 28, 14, 15
1106
+ DCT32_avx512_PASS_1_ROW_EO 2 * 64 + 7 * 8, 27
1107
+
1108
+%endmacro
1109
+
1110
+%macro DCT32_avx512_PASS_1_ROW_EO_1_7 1
1111
+
1112
+ vbroadcasti32x4 m12, [r7 + %1 * 32]
1113
+
1114
+ DCT32_avx512_PASS_1_ROW_EO %1 * 64 + 0 * 8, 13
1115
+ DCT32_avx512_PASS_1_ROW_EO %1 * 64 + 1 * 8, 15
1116
+ DCT32_avx512_PASS_1_ROW_EO %1 * 64 + 2 * 8, 17
1117
+ DCT32_avx512_PASS_1_ROW_EO %1 * 64 + 3 * 8, 19
1118
+ DCT32_avx512_PASS_1_ROW_EO %1 * 64 + 4 * 8, 21
1119
+ DCT32_avx512_PASS_1_ROW_EO %1 * 64 + 5 * 8, 23
1120
+ DCT32_avx512_PASS_1_ROW_EO %1 * 64 + 6 * 8, 25
1121
+ DCT32_avx512_PASS_1_ROW_EO %1 * 64 + 7 * 8, 27
1122
+
1123
+%endmacro
1124
+
1125
+%macro DCT32_avx512_LOOP_EEO 0
1126
+ vpunpcklqdq m2, m14, m16
1127
+ vpunpckhqdq m14, m16
1128
+ vpshufb m14, m31
1129
+
1130
+ vpaddw m16, m2, m14 ; EEE
1131
+ vpsubw m2, m14 ; EE0
1132
+
1133
+ vpunpcklqdq m3, m18, m20
1134
+ vpunpckhqdq m18, m20
1135
+ vpshufb m18, m31
1136
+
1137
+ vpaddw m20, m3, m18 ; EEE
1138
+ vpsubw m3, m18 ; EE0
1139
+
1140
+ vpunpcklqdq m4, m22, m24
1141
+ vpunpckhqdq m22, m24
1142
+ vpshufb m22, m31
1143
+
1144
+ vpaddw m24, m4, m22 ; EEE
1145
+ vpsubw m4, m22 ; EE0
1146
+
1147
+ vpunpcklqdq m5, m26, m28
1148
+ vpunpckhqdq m26, m28
1149
+ vpshufb m26, m31
1150
+
1151
+ vpaddw m28, m5, m26 ; EEE
1152
+ vpsubw m5, m26 ; EE0
1153
+%endmacro
1154
+
1155
+%macro DCT32_avx512_PASS_1_ROW_EEO 2
1156
+ pmaddwd m30, m%2, m1
1157
+ vpsrldq m29, m30, 4
1158
+ vpaddd m29, m30
1159
+
1160
+ vpaddd m29, m0
1161
+ vpsrad m29, DCT_SHIFT
1162
+ vpackssdw m29, m29
1163
+
1164
+ vpermw m29, m27, m29
1165
+ movu [r5 + %1], xm29
1166
+%endmacro
1167
+
1168
+%macro DCT32_avx512_PASS_1_ROW_EEO_1_4 1
1169
+
1170
+vpbroadcastq m1, [r7 + %1 * 32]
1171
+DCT32_avx512_PASS_1_ROW_EEO %1 * 64 + 0 * 16, 2
1172
+DCT32_avx512_PASS_1_ROW_EEO %1 * 64 + 1 * 16, 3
1173
+DCT32_avx512_PASS_1_ROW_EEO %1 * 64 + 2 * 16, 4
1174
+DCT32_avx512_PASS_1_ROW_EEO %1 * 64 + 3 * 16, 5
1175
+
1176
+%endmacro
1177
+
1178
+%macro DCT32_avx512_PASS_1_ROW_EEEO_1_4 1
1179
+
1180
+vpbroadcastq m1, [r7 + %1 * 32]
1181
+DCT32_avx512_PASS_1_ROW_EEO %1 * 64 + 0 * 16, 16
1182
+DCT32_avx512_PASS_1_ROW_EEO %1 * 64 + 1 * 16, 20
1183
+DCT32_avx512_PASS_1_ROW_EEO %1 * 64 + 2 * 16, 24
1184
+DCT32_avx512_PASS_1_ROW_EEO %1 * 64 + 3 * 16, 28
1185
+
1186
+%endmacro
1187
+
1188
+%macro DCT32_avx512_PASS2_OPT 5
1189
+ pmaddwd m9, m1, m%1
1190
+ vpsrldq m10, m9, 8
1191
+ vpaddd m9, m10
1192
+
1193
+ pmaddwd m10, m1, m%2
1194
+ vpsrldq m11, m10, 8
1195
+ vpaddd m10, m11
1196
+
1197
+ pmaddwd m11, m1, m%3
1198
+ vpsrldq m12, m11, 8
1199
+ vpaddd m11, m12
1200
+
1201
+ pmaddwd m12, m1, m%4
1202
+ vpsrldq m13, m12, 8
1203
+ vpaddd m12, m13
1204
+
1205
+ vpsrldq m13, m9, 4
1206
+ vpaddd m9, m13
1207
+ vpsrldq m13, m10, 4
1208
+ vpaddd m10, m13
1209
+ vpsrldq m13, m11, 4
1210
+ vpaddd m11, m13
1211
+ vpsrldq m13, m12, 4
1212
+ vpaddd m12, m13
1213
+
1214
+ vpermd m9, m31, m9
1215
+ vpermd m10, m31, m10
1216
+ vpermd m11, m31, m11
1217
+ vpermd m12, m31, m12
1218
+
1219
+ vpandd m9, m27
1220
+ vpandd m10, m30
1221
+ vpandd m11, m29
1222
+ vpandd m12, m28
1223
+
1224
+ vpaddd m9, m10
1225
+ vpaddd m11, m12
1226
+ vpaddd m9, m11
1227
+
1228
+ vpsrldq m10, m9, 8
1229
+ vpaddd m9, m10
1230
+ vpsrldq m10, m9, 4
1231
+ vpaddd m9, m10
1232
+
1233
+ vpermd m9, m31, m9
1234
+ vpaddd m9, m0
1235
+ vpsrad m9, DCT_SHIFT2
1236
+ vpackssdw m9, m9
1237
+ movq [r1 + %5], xm9
1238
+
1239
+%endmacro
1240
+
1241
+%macro DCT32_avx512_PASS2 5
1242
+
1243
+ mova m9, [r5 + %1]
1244
+ mova m10, [r5 + %2]
1245
+ mova m11, [r5 + %3]
1246
+ mova m12, [r5 + %4]
1247
+
1248
+ pmaddwd m9, m1, m9
1249
+ vpsrldq m13, m9, 8
1250
+ vpaddd m9, m13
1251
+
1252
+ pmaddwd m10, m1, m10
1253
+ vpsrldq m13, m10, 8
1254
+ vpaddd m10, m13
1255
+
1256
+ pmaddwd m11, m1, m11
1257
+ vpsrldq m13, m11, 8
1258
+ vpaddd m11, m13
1259
+
1260
+ pmaddwd m12, m1, m12
1261
+ vpsrldq m13, m12, 8
1262
+ vpaddd m12, m13
1263
+
1264
+ vpsrldq m13, m9, 4
1265
+ vpaddd m9, m13
1266
+ vpsrldq m13, m10, 4
1267
+ vpaddd m10, m13
1268
+ vpsrldq m13, m11, 4
1269
+ vpaddd m11, m13
1270
+ vpsrldq m13, m12, 4
1271
+ vpaddd m12, m13
1272
+
1273
+ vpermd m9, m31, m9
1274
+ vpermd m10, m31, m10
1275
+ vpermd m11, m31, m11
1276
+ vpermd m12, m31, m12
1277
+
1278
+ vpandd m9, m27
1279
+ vpandd m10, m30
1280
+ vpandd m11, m29
1281
+ vpandd m12, m28
1282
+
1283
+ vpaddd m9, m10
1284
+ vpaddd m11, m12
1285
+ vpaddd m9, m11
1286
+
1287
+ vpsrldq m10, m9, 8
1288
+ vpaddd m9, m10
1289
+ vpsrldq m10, m9, 4
1290
+ vpaddd m9, m10
1291
+
1292
+ vpermd m9, m31, m9
1293
+ vpaddd m9, m0
1294
+ vpsrad m9, DCT_SHIFT2
1295
+ vpackssdw m9, m9
1296
+ movq [r1 + %5], xm9
1297
+
1298
+%endmacro
1299
+
1300
+%macro DCT32_avx512_PASS2_1_ROW 1
1301
+
1302
+mova m1, [r8 + %1 * 64]
1303
+
1304
+DCT32_avx512_PASS2_OPT 2, 3, 4, 14, %1 * 64 + 0 * 8
1305
+DCT32_avx512_PASS2_OPT 15, 16, 17, 18, %1 * 64 + 1 * 8
1306
+DCT32_avx512_PASS2_OPT 19, 20, 21, 22, %1 * 64 + 2 * 8
1307
+DCT32_avx512_PASS2_OPT 23, 24, 25, 26, %1 * 64 + 3 * 8
1308
+DCT32_avx512_PASS2_OPT 5, 6, 7, 8, %1 * 64 + 4 * 8
1309
+
1310
+DCT32_avx512_PASS2 20 * 64, 21 * 64, 22 * 64, 23 * 64, %1 * 64 + 5 * 8
1311
+DCT32_avx512_PASS2 24 * 64, 25 * 64, 26 * 64, 27 * 64, %1 * 64 + 6 * 8
1312
+DCT32_avx512_PASS2 28 * 64, 29 * 64, 30 * 64, 31 * 64, %1 * 64 + 7 * 8
1313
+
1314
+%endmacro
1315
+
1316
+INIT_ZMM avx512
1317
+cglobal dct32, 3, 10, 32, 0-(32*mmsize + 16*mmsize)
1318
+
1319
+%if BIT_DEPTH == 12
1320
+ %define DCT_SHIFT 8
1321
+ vpbroadcastq m0, [pd_128]
1322
+%elif BIT_DEPTH == 10
1323
+ %define DCT_SHIFT 6
1324
+ vpbroadcastq m0, [pd_32]
1325
+%elif BIT_DEPTH == 8
1326
+ %define DCT_SHIFT 4
1327
+ vpbroadcastq m0, [pd_8]
1328
+%else
1329
+ %error Unsupported BIT_DEPTH!
1330
+%endif
1331
+%define DCT_SHIFT2 11
1332
+
1333
+ add r2d, r2d
1334
+ lea r7, [tab_dct32_1]
1335
+ lea r8, [tab_dct32]
1336
+ lea r3, [r2 * 3]
1337
+ mov r5, rsp
1338
+ mov r9, 2048 ; 32 * mmsize
1339
+ add r9, rsp
1340
+
1341
+ mova m31, [dct32_shuf1_AVX512]
1342
+
1343
+ ; PASS 1
1344
+
1345
+ vbroadcasti32x8 m30, [dct8_shuf9_AVX512]
1346
+ mova m8, [dct32_shuf_AVX512]
1347
+
1348
+ DCT32_avx512_PASS_1_ROW_O
1349
+ DCT32_avx512_PASS_1_ROW_O_1_7 3
1350
+ DCT32_avx512_PASS_1_ROW_O_1_7 5
1351
+ DCT32_avx512_PASS_1_ROW_O_1_7 7
1352
+ DCT32_avx512_PASS_1_ROW_O_1_7 9
1353
+ DCT32_avx512_PASS_1_ROW_O_1_7 11
1354
+ DCT32_avx512_PASS_1_ROW_O_1_7 13
1355
+ DCT32_avx512_PASS_1_ROW_O_1_7 15
1356
+ DCT32_avx512_PASS_1_ROW_O_1_7 17
1357
+ DCT32_avx512_PASS_1_ROW_O_1_7 19
1358
+ DCT32_avx512_PASS_1_ROW_O_1_7 20 ; NOTE(review): even argument in the odd-row pass — the surrounding sequence is odd rows 3..31 only, and upstream x265 2.9 has no "20" call here; likely a diff-extraction artifact, verify against upstream dct8.asm
1359
+ DCT32_avx512_PASS_1_ROW_O_1_7 21
1360
+ DCT32_avx512_PASS_1_ROW_O_1_7 23
1361
+ DCT32_avx512_PASS_1_ROW_O_1_7 25
1362
+ DCT32_avx512_PASS_1_ROW_O_1_7 27
1363
+ DCT32_avx512_PASS_1_ROW_O_1_7 29
1364
+ DCT32_avx512_PASS_1_ROW_O_1_7 31
1365
+
1366
+ vbroadcasti32x8 m11, [dct8_shuf9_AVX512]
1367
+
1368
+ DCT32_avx512_PASS_1_ROW_EO_0
1369
+ DCT32_avx512_PASS_1_ROW_EO_1_7 6
1370
+ DCT32_avx512_PASS_1_ROW_EO_1_7 10
1371
+ DCT32_avx512_PASS_1_ROW_EO_1_7 14
1372
+ DCT32_avx512_PASS_1_ROW_EO_1_7 18
1373
+ DCT32_avx512_PASS_1_ROW_EO_1_7 22
1374
+ DCT32_avx512_PASS_1_ROW_EO_1_7 26
1375
+ DCT32_avx512_PASS_1_ROW_EO_1_7 30
1376
+
1377
+ vbroadcasti32x4 m31, [dct8_shuf]
1378
+ vbroadcasti32x8 m27, [dct32_shuf3_AVX512]
1379
+
1380
+ DCT32_avx512_LOOP_EEO
1381
+ DCT32_avx512_PASS_1_ROW_EEO_1_4 4
1382
+ DCT32_avx512_PASS_1_ROW_EEO_1_4 12
1383
+ DCT32_avx512_PASS_1_ROW_EEO_1_4 20
1384
+ DCT32_avx512_PASS_1_ROW_EEO_1_4 28
1385
+
1386
+ DCT32_avx512_PASS_1_ROW_EEEO_1_4 0
1387
+ DCT32_avx512_PASS_1_ROW_EEEO_1_4 16
1388
+ DCT32_avx512_PASS_1_ROW_EEEO_1_4 8
1389
+ DCT32_avx512_PASS_1_ROW_EEEO_1_4 24
1390
+
1391
+ ; PASS 2
1392
+
1393
+ vpbroadcastq m0, [pd_1024]
1394
+ vbroadcasti32x8 m31, [dct32_shuf4_AVX512]
1395
+ movu m30, [dct32_shuf5_AVX512]
1396
+ movu m29, [dct32_shuf6_AVX512]
1397
+ movu m28, [dct32_shuf7_AVX512]
1398
+ movu m27, [dct32_shuf8_AVX512]
1399
+
1400
+ ;Load the source coefficients into free registers and reuse them for all rows
1401
+
1402
+ mova m2, [r5 + 0 * 64]
1403
+ mova m3, [r5 + 1 * 64]
1404
+ mova m4, [r5 + 2 * 64]
1405
+ mova m14, [r5 + 3 * 64]
1406
+ mova m15, [r5 + 4 * 64]
1407
+ mova m16, [r5 + 5 * 64]
1408
+ mova m17, [r5 + 6 * 64]
1409
+ mova m18, [r5 + 7 * 64]
1410
+ mova m19, [r5 + 8 * 64]
1411
+ mova m20, [r5 + 9 * 64]
1412
+ mova m21, [r5 + 10 * 64]
1413
+ mova m22, [r5 + 11 * 64]
1414
+ mova m23, [r5 + 12 * 64]
1415
+ mova m24, [r5 + 13 * 64]
1416
+ mova m25, [r5 + 14 * 64]
1417
+ mova m26, [r5 + 15 * 64]
1418
+ mova m5, [r5 + 16 * 64]
1419
+ mova m6, [r5 + 17 * 64]
1420
+ mova m7, [r5 + 18 * 64]
1421
+ mova m8, [r5 + 19 * 64]
1422
+
1423
+ DCT32_avx512_PASS2_1_ROW 0
1424
+ DCT32_avx512_PASS2_1_ROW 1
1425
+ DCT32_avx512_PASS2_1_ROW 2
1426
+ DCT32_avx512_PASS2_1_ROW 3
1427
+ DCT32_avx512_PASS2_1_ROW 4
1428
+ DCT32_avx512_PASS2_1_ROW 5
1429
+ DCT32_avx512_PASS2_1_ROW 6
1430
+ DCT32_avx512_PASS2_1_ROW 7
1431
+ DCT32_avx512_PASS2_1_ROW 8
1432
+ DCT32_avx512_PASS2_1_ROW 9
1433
+ DCT32_avx512_PASS2_1_ROW 10
1434
+ DCT32_avx512_PASS2_1_ROW 11
1435
+ DCT32_avx512_PASS2_1_ROW 12
1436
+ DCT32_avx512_PASS2_1_ROW 13
1437
+ DCT32_avx512_PASS2_1_ROW 14
1438
+ DCT32_avx512_PASS2_1_ROW 15
1439
+ DCT32_avx512_PASS2_1_ROW 16
1440
+ DCT32_avx512_PASS2_1_ROW 17
1441
+ DCT32_avx512_PASS2_1_ROW 18
1442
+ DCT32_avx512_PASS2_1_ROW 19
1443
+ DCT32_avx512_PASS2_1_ROW 20
1444
+ DCT32_avx512_PASS2_1_ROW 21
1445
+ DCT32_avx512_PASS2_1_ROW 22
1446
+ DCT32_avx512_PASS2_1_ROW 23
1447
+ DCT32_avx512_PASS2_1_ROW 24
1448
+ DCT32_avx512_PASS2_1_ROW 25
1449
+ DCT32_avx512_PASS2_1_ROW 26
1450
+ DCT32_avx512_PASS2_1_ROW 27
1451
+ DCT32_avx512_PASS2_1_ROW 28
1452
+ DCT32_avx512_PASS2_1_ROW 29
1453
+ DCT32_avx512_PASS2_1_ROW 30
1454
+ DCT32_avx512_PASS2_1_ROW 31
1455
+
1456
+ RET
1457
+
1458
%macro IDCT8_PASS_1 1
1459
vpbroadcastd m7, [r5 + %1]
1460
vpbroadcastd m10, [r5 + %1 + 4]
1461
1462
mova [r1 + r3], xm3
1463
RET
1464
1465
+
1466
+%macro IDCT8_AVX512_PASS_1 0
1467
+ pmaddwd m5, m29, m17
1468
+ pmaddwd m6, m25, m18
1469
+ paddd m5, m6
1470
+
1471
+ pmaddwd m6, m30, m21
1472
+ pmaddwd m3, m26, m22
1473
+ paddd m6, m3
1474
+
1475
+ paddd m3, m5, m6
1476
+ paddd m3, m11
1477
+ psrad m3, IDCT_SHIFT1
1478
+
1479
+ psubd m5, m6
1480
+ paddd m5, m11
1481
+ psrad m5, IDCT_SHIFT1
1482
+
1483
+ pmaddwd m6, m29, m19
1484
+ pmaddwd m8, m25, m20
1485
+ paddd m6, m8
1486
+
1487
+ pmaddwd m8, m30, m23
1488
+ pmaddwd m9, m26, m24
1489
+ paddd m8, m9
1490
+
1491
+ paddd m9, m6, m8
1492
+ paddd m9, m11
1493
+ psrad m9, IDCT_SHIFT1
1494
+
1495
+ psubd m6, m8
1496
+ paddd m6, m11
1497
+ psrad m6, IDCT_SHIFT1
1498
+
1499
+ packssdw m3, m9
1500
+ vpermq m3, m3, 0xD8
1501
+
1502
+ packssdw m6, m5
1503
+ vpermq m6, m6, 0xD8
1504
+%endmacro
1505
+
1506
+
1507
+%macro IDCT8_AVX512_PASS_2 0
1508
+ mov r7d, 0xAAAA
1509
+ kmovd k1, r7d
1510
+ punpcklqdq m2, m3, m13
1511
+ punpckhqdq m0, m3, m13
1512
+
1513
+ pmaddwd m3, m2, [r5]
1514
+ pmaddwd m5, m2, [r5 + 1 * mmsize]
1515
+ pmaddwd m6, m2, [r5 + 2 * mmsize]
1516
+ pmaddwd m7, m2, [r5 + 3 * mmsize]
1517
+
1518
+ vpsrldq m14, m3, 4
1519
+ paddd m3, m14
1520
+ vpslldq m16, m5, 4
1521
+ paddd m5, m16
1522
+ vmovdqu32 m3 {k1}, m5
1523
+
1524
+ vpsrldq m14, m6, 4
1525
+ paddd m6, m14
1526
+ vpslldq m16, m7, 4
1527
+ paddd m7, m16
1528
+ vmovdqu32 m6 {k1}, m7
1529
+
1530
+ punpcklqdq m7, m3, m6
1531
+ punpckhqdq m3, m6
1532
+
1533
+ pmaddwd m5, m0, [r6]
1534
+ pmaddwd m6, m0, [r6 + 1 * mmsize]
1535
+ pmaddwd m8, m0, [r6 + 2 * mmsize]
1536
+ pmaddwd m9, m0, [r6 + 3 * mmsize]
1537
+
1538
+ vpsrldq m14, m5, 4
1539
+ paddd m5, m14
1540
+ vpslldq m16, m6, 4
1541
+ paddd m6, m16
1542
+ vmovdqu32 m5 {k1}, m6
1543
+
1544
+ vpsrldq m14, m8, 4
1545
+ paddd m8, m14
1546
+ vpslldq m16, m9, 4
1547
+ paddd m9, m16
1548
+ vmovdqu32 m8 {k1}, m9
1549
+
1550
+ punpcklqdq m6, m5, m8
1551
+ punpckhqdq m5, m8
1552
+
1553
+ paddd m8, m7, m6
1554
+ paddd m8, m12
1555
+ psrad m8, IDCT_SHIFT2
1556
+
1557
+ psubd m7, m6
1558
+ paddd m7, m12
1559
+ psrad m7, IDCT_SHIFT2
1560
+
1561
+ pshufb m7, [idct8_avx512_shuf3]
1562
+ packssdw m8, m7
1563
+
1564
+ paddd m9, m3, m5
1565
+ paddd m9, m12
1566
+ psrad m9, IDCT_SHIFT2
1567
+
1568
+ psubd m3, m5
1569
+ paddd m3, m12
1570
+ psrad m3, IDCT_SHIFT2
1571
+
1572
+ pshufb m3, [idct8_avx512_shuf3]
1573
+ packssdw m9, m3
1574
+%endmacro
1575
+
1576
+
1577
+%if ARCH_X86_64
1578
+INIT_ZMM avx512
1579
+cglobal idct8, 3, 8, 31
1580
+%if BIT_DEPTH == 12
1581
+ %define IDCT_SHIFT2 8
1582
+ vpbroadcastd m12, [pd_128]
1583
+%elif BIT_DEPTH == 10
1584
+ %define IDCT_SHIFT2 10
1585
+ vpbroadcastd m12, [pd_512]
1586
+%elif BIT_DEPTH == 8
1587
+ %define IDCT_SHIFT2 12
1588
+ vpbroadcastd m12, [pd_2048]
1589
+%else
1590
+ %error Unsupported BIT_DEPTH!
1591
+%endif
1592
+%define IDCT_SHIFT1 7
1593
+
1594
+ vpbroadcastd m11, [pd_64]
1595
+
1596
+ lea r4, [avx512_idct8_3]
1597
+ lea r5, [avx2_idct8_1]
1598
+ lea r6, [avx2_idct8_2]
1599
+ movu m16, [idct16_shuff2]
1600
+ movu m17, [idct16_shuff3]
1601
+
1602
+ ;pass1
1603
+ mova ym1, [r0 + 0 * 32]
1604
+ mova ym0, [r0 + 1 * 32]
1605
+ mova ym25, ym16
1606
+ mova ym26, ym17
1607
+ vpermi2w ym25, ym1, ym0
1608
+ vpermi2w ym26, ym1, ym0
1609
+
1610
+ mova ym1, [r0 + 2 * 32]
1611
+ mova ym0, [r0 + 3 * 32]
1612
+ mova ym27, ym16
1613
+ mova ym28, ym17
1614
+ vpermi2w ym27, ym1, ym0
1615
+ vpermi2w ym28, ym1, ym0
1616
+
1617
+ vperm2i128 ym29, ym25, ym26, 0x20
1618
+ vperm2i128 ym30, ym25, ym26, 0x31
1619
+ vperm2i128 ym25, ym27, ym28, 0x20
1620
+ vperm2i128 ym26, ym27, ym28, 0x31
1621
+
1622
+ vinserti64x4 m29, m29, ym29, 1
1623
+ vinserti64x4 m25, m25, ym25, 1
1624
+ vinserti64x4 m30, m30, ym30, 1
1625
+ vinserti64x4 m26, m26, ym26, 1
1626
+
1627
+ movu m17, [r4]
1628
+ movu m18, [r4 + 1 * mmsize]
1629
+ movu m19, [r4 + 2 * mmsize]
1630
+ movu m20, [r4 + 3 * mmsize]
1631
+ movu m21, [r4 + 4 * mmsize]
1632
+ movu m22, [r4 + 5 * mmsize]
1633
+ movu m23, [r4 + 6 * mmsize]
1634
+ movu m24, [r4 + 7 * mmsize]
1635
+
1636
+ IDCT8_AVX512_PASS_1
1637
+
1638
+ vextracti64x4 ym13, m3, 1
1639
+ vextracti64x4 ym14, m6, 1
1640
+ vinserti64x4 m3, m3, ym14, 1
1641
+ vinserti64x4 m13, m13, ym6, 1
1642
+
1643
+ ;pass2
1644
+ add r2d, r2d
1645
+ lea r3, [r2 * 3]
1646
+ lea r5, [avx512_idct8_1]
1647
+ lea r6, [avx512_idct8_2]
1648
+
1649
+ IDCT8_AVX512_PASS_2
1650
+
1651
+ vextracti128 xm3, ym8, 1
1652
+ mova [r1], xm8
1653
+ mova [r1 + r2], xm3
1654
+ vextracti128 xm3, ym9, 1
1655
+ mova [r1 + r2 * 2], xm9
1656
+ mova [r1 + r3], xm3
1657
+
1658
+ lea r1, [r1 + r2 * 4]
1659
+
1660
+ vextracti64x4 ym10, m8, 1
1661
+ vextracti64x4 ym11, m9, 1
1662
+
1663
+ vextracti128 xm3, ym10, 1
1664
+ mova [r1], xm10
1665
+ mova [r1 + r2], xm3
1666
+ vextracti128 xm3, ym11, 1
1667
+ mova [r1 + r2 * 2], xm11
1668
+ mova [r1 + r3], xm3
1669
+ RET
1670
+%endif
1671
+
1672
%macro IDCT_PASS1 2
1673
vbroadcasti128 m5, [tab_idct16_2 + %1 * 16]
1674
1675
1676
jnz .pass2
1677
RET
1678
1679
+
1680
+%macro IDCT16_AVX512_PASS1 3
1681
+ movu m5, [tab_AVX512_idct16_2 + %1 * 64]
1682
+ pmaddwd m9, m4, m5
1683
+ pmaddwd m10, m6, m5
1684
+
1685
+ vpsrldq m16, m9, 4
1686
+ paddd m9, m16
1687
+ vpslldq m17, m10, 4
1688
+ paddd m10, m17
1689
+ vmovdqu32 m9 {k1}, m10
1690
+
1691
+ pmaddwd m10, m7, m5
1692
+ pmaddwd m11, m8, m5
1693
+
1694
+ vpsrldq m16, m10, 4
1695
+ paddd m10, m16
1696
+ vpslldq m17, m11, 4
1697
+ paddd m11, m17
1698
+ vmovdqu32 m10 {k1}, m11
1699
+
1700
+ vpsrldq m16, m9, 8
1701
+ paddd m9, m16
1702
+ vpslldq m17, m10, 8
1703
+ paddd m10, m17
1704
+ vmovdqu32 m9 {k2}, m10
1705
+
1706
+ mova m5, [tab_AVX512_idct16_1 + %1 * 64]
1707
+ pmaddwd m10, m28, m5
1708
+ pmaddwd m11, m29, m5
1709
+
1710
+ vpsrldq m16, m10, 4
1711
+ paddd m10, m16
1712
+ vpslldq m17, m11, 4
1713
+ paddd m11, m17
1714
+ vmovdqu32 m10 {k1}, m11
1715
+
1716
+ pmaddwd m11, m30, m5
1717
+ pmaddwd m12, m31, m5
1718
+
1719
+ vpsrldq m16, m11, 4
1720
+ paddd m11, m16
1721
+ vpslldq m17, m12, 4
1722
+ paddd m12, m17
1723
+ vmovdqu32 m11 {k1}, m12
1724
+
1725
+ vpsrldq m16, m10, 8
1726
+ paddd m10, m16
1727
+ vpslldq m17, m11, 8
1728
+ paddd m11, m17
1729
+ vmovdqu32 m10 {k2}, m11
1730
+
1731
+ paddd m11, m9, m10
1732
+ paddd m11, m14
1733
+ psrad m11, IDCT_SHIFT1
1734
+
1735
+ psubd m9, m10
1736
+ paddd m9, m14
1737
+ psrad m9, IDCT_SHIFT1
1738
+
1739
+ mova m5, [tab_AVX512_idct16_2 + %1 * 64 + 64]
1740
+ pmaddwd m10, m4, m5
1741
+ pmaddwd m12, m6, m5
1742
+
1743
+
1744
+ vpsrldq m16, m10, 4
1745
+ paddd m10, m16
1746
+ vpslldq m17, m12, 4
1747
+ paddd m12, m17
1748
+ vmovdqu32 m10 {k1}, m12
1749
+
1750
+ pmaddwd m12, m7, m5
1751
+ pmaddwd m13, m8, m5
1752
+
1753
+
1754
+ vpsrldq m16, m12, 4
1755
+ paddd m12, m16
1756
+ vpslldq m17, m13, 4
1757
+ paddd m13, m17
1758
+ vmovdqu32 m12 {k1}, m13
1759
+
1760
+
1761
+ vpsrldq m16, m10, 8
1762
+ paddd m10, m16
1763
+ vpslldq m17, m12, 8
1764
+ paddd m12, m17
1765
+ vmovdqu32 m10 {k2}, m12
1766
+
1767
+
1768
+
1769
+ mova m5, [tab_AVX512_idct16_1 + %1 * 64 + 64]
1770
+ pmaddwd m12, m28, m5
1771
+ pmaddwd m13, m29, m5
1772
+
1773
+
1774
+ vpsrldq m16, m12, 4
1775
+ paddd m12, m16
1776
+ vpslldq m17, m13, 4
1777
+ paddd m13, m17
1778
+ vmovdqu32 m12 {k1}, m13
1779
+
1780
+ pmaddwd m13, m30, m5
1781
+ pmaddwd m5, m31
1782
+
1783
+
1784
+ vpsrldq m16, m13, 4
1785
+ paddd m13, m16
1786
+ vpslldq m17, m5, 4
1787
+ paddd m5, m17
1788
+ vmovdqu32 m13 {k1}, m5
1789
+
1790
+
1791
+ vpsrldq m16, m12, 8
1792
+ paddd m12, m16
1793
+ vpslldq m17, m13, 8
1794
+ paddd m13, m17
1795
+ vmovdqu32 m12 {k2}, m13
1796
+
1797
+
1798
+ paddd m5, m10, m12
1799
+ paddd m5, m14
1800
+ psrad m5, IDCT_SHIFT1
1801
+
1802
+ psubd m10, m12
1803
+ paddd m10, m14
1804
+ psrad m10, IDCT_SHIFT1
1805
+
1806
+ packssdw m11, m5
1807
+ packssdw m9, m10
1808
+
1809
+ mova m10, [idct16_AVX512_shuff]
1810
+ mova m5, [idct16_AVX512_shuff1]
1811
+
1812
+ vpermd m%2, m10, m11
1813
+ vpermd m%3, m5, m9
1814
+%endmacro
1815
+
1816
+%macro IDCT16_AVX512_PASS2 2
1817
+ vpermq m0, m%1, 0xD8
1818
+
1819
+ pmaddwd m1, m0, m7
1820
+ pmaddwd m2, m0, m8
1821
+
1822
+
1823
+ vpsrldq m14, m1, 4
1824
+ paddd m1, m14
1825
+ vpslldq m31, m2, 4
1826
+ paddd m2, m31
1827
+ vmovdqu32 m1 {k1}, m2
1828
+
1829
+ pmaddwd m2, m0, m9
1830
+ pmaddwd m3, m0, m10
1831
+
1832
+
1833
+ vpsrldq m14, m2, 4
1834
+ paddd m2, m14
1835
+ vpslldq m31, m3, 4
1836
+ paddd m3, m31
1837
+ vmovdqu32 m2 {k1}, m3
1838
+
1839
+
1840
+ vpsrldq m14, m1, 8
1841
+ paddd m1, m14
1842
+ vpslldq m31, m2, 8
1843
+ paddd m2, m31
1844
+ vmovdqu32 m1 {k2}, m2
1845
+
1846
+ pmaddwd m2, m0, m11
1847
+ pmaddwd m3, m0, m12
1848
+
1849
+
1850
+ vpsrldq m14, m2, 4
1851
+ paddd m2, m14
1852
+ vpslldq m31, m3, 4
1853
+ paddd m3, m31
1854
+ vmovdqu32 m2 {k1}, m3
1855
+
1856
+ vbroadcasti64x2 m14, [r5 + 112]
1857
+ pmaddwd m3, m0, m13
1858
+ pmaddwd m4, m0, m14
1859
+
1860
+
1861
+ vpsrldq m14, m3, 4
1862
+ paddd m3, m14
1863
+ vpslldq m31, m4, 4
1864
+ paddd m4, m31
1865
+ vmovdqu32 m3 {k1}, m4
1866
+
1867
+
1868
+ vpsrldq m14, m2, 8
1869
+ paddd m2, m14
1870
+ vpslldq m31, m3, 8
1871
+ paddd m3, m31
1872
+ vmovdqu32 m2 {k2}, m3
1873
+
1874
+ vpermq m0, m%2, 0xD8
1875
+ pmaddwd m3, m0, m16
1876
+ pmaddwd m4, m0, m17
1877
+
1878
+
1879
+ vpsrldq m14, m3, 4
1880
+ paddd m3, m14
1881
+ vpslldq m31, m4, 4
1882
+ paddd m4, m31
1883
+ vmovdqu32 m3 {k1}, m4
1884
+
1885
+ pmaddwd m4, m0, m19
1886
+ pmaddwd m5, m0, m23
1887
+
1888
+
1889
+ vpsrldq m14, m4, 4
1890
+ paddd m4, m14
1891
+ vpslldq m31, m5, 4
1892
+ paddd m5, m31
1893
+ vmovdqu32 m4 {k1}, m5
1894
+
1895
+
1896
+ vpsrldq m14, m3, 8
1897
+ paddd m3, m14
1898
+ vpslldq m31, m4, 8
1899
+ paddd m4, m31
1900
+ vmovdqu32 m3 {k2}, m4
1901
+
1902
+
1903
+ pmaddwd m4, m0, m28
1904
+ pmaddwd m5, m0, m29
1905
+
1906
+ vpsrldq m14, m4, 4
1907
+ paddd m4, m14
1908
+ vpslldq m31, m5, 4
1909
+ paddd m5, m31
1910
+ vmovdqu32 m4 {k1}, m5
1911
+
1912
+ pmaddwd m6, m0, m30
1913
+ vbroadcasti64x2 m31, [r6 + 112]
1914
+ pmaddwd m0, m31
1915
+
1916
+
1917
+ vpsrldq m14, m6, 4
1918
+ paddd m6, m14
1919
+ vpslldq m31, m0, 4
1920
+ paddd m0, m31
1921
+ vmovdqu32 m6 {k1}, m0
1922
+
1923
+
1924
+ vpsrldq m14, m4, 8
1925
+ paddd m4, m14
1926
+ vpslldq m31, m6, 8
1927
+ paddd m6, m31
1928
+ vmovdqu32 m4 {k2}, m6
1929
+
1930
+ paddd m5, m1, m3
1931
+ paddd m5, m15
1932
+ psrad m5, IDCT_SHIFT2
1933
+
1934
+ psubd m1, m3
1935
+ paddd m1, m15
1936
+ psrad m1, IDCT_SHIFT2
1937
+
1938
+ paddd m6, m2, m4
1939
+ paddd m6, m15
1940
+ psrad m6, IDCT_SHIFT2
1941
+
1942
+ psubd m2, m4
1943
+ paddd m2, m15
1944
+ psrad m2, IDCT_SHIFT2
1945
+
1946
+ packssdw m5, m6
1947
+ packssdw m1, m2
1948
+ pshufb m2, m1, [idct16_AVX512_shuff6]
1949
+%endmacro
1950
+
1951
+
1952
+;-------------------------------------------------------
1953
+; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
1954
+;-------------------------------------------------------
1955
+INIT_ZMM avx512
1956
+cglobal idct16, 3, 8, 32
1957
+%if BIT_DEPTH == 12
1958
+ %define IDCT_SHIFT2 8
1959
+ vpbroadcastd m15, [pd_128]
1960
+%elif BIT_DEPTH == 10
1961
+ %define IDCT_SHIFT2 10
1962
+ vpbroadcastd m15, [pd_512]
1963
+%elif BIT_DEPTH == 8
1964
+ %define IDCT_SHIFT2 12
1965
+ vpbroadcastd m15, [pd_2048]
1966
+%else
1967
+ %error Unsupported BIT_DEPTH!
1968
+%endif
1969
+%define IDCT_SHIFT1 7
1970
+
1971
+ vpbroadcastd m14, [pd_64]
1972
+
1973
+ add r2d, r2d
1974
+
1975
+ mov r7d, 0xAAAA
1976
+ kmovd k1, r7d
1977
+ mov r7d, 0xCCCC
1978
+ kmovd k2, r7d
1979
+ mova ym2, [idct16_shuff2]
1980
+ mova ym3, [idct16_shuff3]
1981
+ mova ym26, [idct16_shuff4]
1982
+ mova ym27, [idct16_shuff5]
1983
+
1984
+.pass1:
1985
+ movu xm0, [r0 + 0 * 32]
1986
+ vinserti128 ym0, ym0, [r0 + 8 * 32], 1
1987
+ movu xm1, [r0 + 2 * 32]
1988
+ vinserti128 ym1, ym1, [r0 + 10 * 32], 1
1989
+
1990
+ mova ym9, ym2
1991
+ mova ym10, ym3
1992
+ vpermi2w ym9, ym0, ym1
1993
+ vpermi2w ym10, ym0, ym1
1994
+
1995
+ movu xm0, [r0 + 4 * 32]
1996
+ vinserti128 ym0, ym0, [r0 + 12 * 32], 1
1997
+ movu xm1, [r0 + 6 * 32]
1998
+ vinserti128 ym1, ym1, [r0 + 14 * 32], 1
1999
+
2000
+ mova ym11, ym2
2001
+ mova ym12, ym3
2002
+ vpermi2w ym11, ym0, ym1
2003
+ vpermi2w ym12, ym0, ym1
2004
+
2005
+ mova ym4, ym26
2006
+ mova ym6, ym27
2007
+ vpermi2d ym4, ym9, ym11
2008
+ vpermi2d ym6, ym9, ym11
2009
+
2010
+ mova ym7, ym26
2011
+ mova ym8, ym27
2012
+ vpermi2d ym7, ym10, ym12
2013
+ vpermi2d ym8, ym10, ym12
2014
+
2015
+ vpermq ym4, ym4, q3120
2016
+ vpermq ym6, ym6, q3120
2017
+ vpermq ym7, ym7, q3120
2018
+ vpermq ym8, ym8, q3120
2019
+
2020
+ movu xm0, [r0 + 1 * 32]
2021
+ vinserti128 ym0, ym0, [r0 + 9 * 32], 1
2022
+ movu xm1, [r0 + 3 * 32]
2023
+ vinserti128 ym1, ym1, [r0 + 11 * 32], 1
2024
+
2025
+ mova ym9, ym2
2026
+ mova ym10, ym3
2027
+ vpermi2w ym9, ym0, ym1
2028
+ vpermi2w ym10, ym0, ym1
2029
+
2030
+ movu xm0, [r0 + 5 * 32]
2031
+ vinserti128 ym0, ym0, [r0 + 13 * 32], 1
2032
+ movu xm1, [r0 + 7 * 32]
2033
+ vinserti128 ym1, ym1, [r0 + 15 * 32], 1
2034
+
2035
+ mova ym11, ym2
2036
+ mova ym12, ym3
2037
+ vpermi2w ym11, ym0, ym1
2038
+ vpermi2w ym12, ym0, ym1
2039
+
2040
+ mova ym28, ym26
2041
+ mova ym29, ym27
2042
+ vpermi2d ym28, ym9, ym11
2043
+ vpermi2d ym29, ym9, ym11
2044
+
2045
+ mova ym30, ym26
2046
+ mova ym31, ym27
2047
+ vpermi2d ym30, ym10, ym12
2048
+ vpermi2d ym31, ym10, ym12
2049
+
2050
+ vpermq ym28, ym28, q3120
2051
+ vpermq ym29, ym29, q3120
2052
+ vpermq ym30, ym30, q3120
2053
+ vpermq ym31, ym31, q3120
2054
+
2055
+ vinserti64x4 m4, m4, ym4, 1
2056
+ vinserti64x4 m6, m6, ym6, 1
2057
+ vinserti64x4 m7, m7, ym7, 1
2058
+ vinserti64x4 m8, m8, ym8, 1
2059
+ vinserti64x4 m28, m28, ym28, 1
2060
+ vinserti64x4 m29, m29, ym29, 1
2061
+ vinserti64x4 m30, m30, ym30, 1
2062
+ vinserti64x4 m31, m31, ym31, 1
2063
+
2064
+ IDCT16_AVX512_PASS1 0, 18, 19
2065
+ IDCT16_AVX512_PASS1 2, 20, 21
2066
+
2067
+ add r0, 16
2068
+
2069
+ movu xm0, [r0 + 0 * 32]
2070
+ vinserti128 ym0, ym0, [r0 + 8 * 32], 1
2071
+ movu xm1, [r0 + 2 * 32]
2072
+ vinserti128 ym1, ym1, [r0 + 10 * 32], 1
2073
+
2074
+ mova ym9, ym2
2075
+ mova ym10, ym3
2076
+ vpermi2w ym9, ym0, ym1
2077
+ vpermi2w ym10, ym0, ym1
2078
+
2079
+ movu xm0, [r0 + 4 * 32]
2080
+ vinserti128 ym0, ym0, [r0 + 12 * 32], 1
2081
+ movu xm1, [r0 + 6 * 32]
2082
+ vinserti128 ym1, ym1, [r0 + 14 * 32], 1
2083
+
2084
+ mova ym11, ym2
2085
+ mova ym12, ym3
2086
+ vpermi2w ym11, ym0, ym1
2087
+ vpermi2w ym12, ym0, ym1
2088
+
2089
+ mova ym4, ym26
2090
+ mova ym6, ym27
2091
+ vpermi2d ym4, ym9, ym11
2092
+ vpermi2d ym6, ym9, ym11
2093
+
2094
+ mova ym7, ym26
2095
+ mova ym8, ym27
2096
+ vpermi2d ym7, ym10, ym12
2097
+ vpermi2d ym8, ym10, ym12
2098
+
2099
+ vpermq ym4, ym4, q3120
2100
+ vpermq ym6, ym6, q3120
2101
+ vpermq ym7, ym7, q3120
2102
+ vpermq ym8, ym8, q3120
2103
+
2104
+ movu xm0, [r0 + 1 * 32]
2105
+ vinserti128 ym0, ym0, [r0 + 9 * 32], 1
2106
+ movu xm1, [r0 + 3 * 32]
2107
+ vinserti128 ym1, ym1, [r0 + 11 * 32], 1
2108
+
2109
+ mova ym9, ym2
2110
+ mova ym10, ym3
2111
+ vpermi2w ym9, ym0, ym1
2112
+ vpermi2w ym10, ym0, ym1
2113
+
2114
+ movu xm0, [r0 + 5 * 32]
2115
+ vinserti128 ym0, ym0, [r0 + 13 * 32], 1
2116
+ movu xm1, [r0 + 7 * 32]
2117
+ vinserti128 ym1, ym1, [r0 + 15 * 32], 1
2118
+
2119
+ mova ym11, ym2
2120
+ mova ym12, ym3
2121
+ vpermi2w ym11, ym0, ym1
2122
+ vpermi2w ym12, ym0, ym1
2123
+
2124
+ mova ym28, ym26
2125
+ mova ym29, ym27
2126
+ vpermi2d ym28, ym9, ym11
2127
+ vpermi2d ym29, ym9, ym11
2128
+
2129
+ mova ym30, ym26
2130
+ mova ym31, ym27
2131
+ vpermi2d ym30, ym10, ym12
2132
+ vpermi2d ym31, ym10, ym12
2133
+
2134
+ vpermq ym28, ym28, q3120
2135
+ vpermq ym29, ym29, q3120
2136
+ vpermq ym30, ym30, q3120
2137
+ vpermq ym31, ym31, q3120
2138
+
2139
+ vinserti64x4 m4, m4, ym4, 1
2140
+ vinserti64x4 m6, m6, ym6, 1
2141
+ vinserti64x4 m7, m7, ym7, 1
2142
+ vinserti64x4 m8, m8, ym8, 1
2143
+ vinserti64x4 m28, m28, ym28, 1
2144
+ vinserti64x4 m29, m29, ym29, 1
2145
+ vinserti64x4 m30, m30, ym30, 1
2146
+ vinserti64x4 m31, m31, ym31, 1
2147
+
2148
+
2149
+ IDCT16_AVX512_PASS1 0, 22, 23
2150
+ IDCT16_AVX512_PASS1 2, 24, 25
2151
+
2152
+ mova m26, [idct16_AVX512_shuff2]
2153
+ mova m27, [idct16_AVX512_shuff3]
2154
+ vpermi2q m26, m18, m22
2155
+ vpermi2q m27, m18, m22
2156
+ mova m18, [idct16_AVX512_shuff2]
2157
+ mova m22, [idct16_AVX512_shuff3]
2158
+ vpermi2q m18, m20, m24
2159
+ vpermi2q m22, m20, m24
2160
+ mova m20, [idct16_AVX512_shuff4]
2161
+ mova m24, [idct16_AVX512_shuff5]
2162
+ vpermi2q m20, m21, m25
2163
+ vpermi2q m24, m21, m25
2164
+ mova m21, [idct16_AVX512_shuff4]
2165
+ mova m25, [idct16_AVX512_shuff5]
2166
+ vpermi2q m21, m19, m23
2167
+ vpermi2q m25, m19, m23
2168
+
2169
+ lea r5, [tab_idct16_2]
2170
+ lea r6, [tab_idct16_1]
2171
+
2172
+ vbroadcasti64x2 m7, [r5]
2173
+ vbroadcasti64x2 m8, [r5 + 16]
2174
+ vbroadcasti64x2 m9, [r5 + 32]
2175
+ vbroadcasti64x2 m10, [r5 + 48]
2176
+ vbroadcasti64x2 m11, [r5 + 64]
2177
+ vbroadcasti64x2 m12, [r5 + 80]
2178
+ vbroadcasti64x2 m13, [r5 + 96]
2179
+
2180
+ vbroadcasti64x2 m16, [r6]
2181
+ vbroadcasti64x2 m17, [r6 + 16]
2182
+ vbroadcasti64x2 m19, [r6 + 32]
2183
+ vbroadcasti64x2 m23, [r6 + 48]
2184
+ vbroadcasti64x2 m28, [r6 + 64]
2185
+ vbroadcasti64x2 m29, [r6 + 80]
2186
+ vbroadcasti64x2 m30, [r6 + 96]
2187
+
2188
+
2189
+ IDCT16_AVX512_PASS2 26, 27
2190
+ mova [r1], xm5
2191
+ mova [r1 + 16], xm2
2192
+ vextracti128 [r1 + r2], ym5, 1
2193
+ vextracti128 [r1 + r2 + 16], ym2, 1
2194
+ vextracti64x4 ym14, m5, 1
2195
+ vextracti64x4 ym31, m2, 1
2196
+ lea r1, [r1 + 2 * r2]
2197
+ mova [r1], xm14
2198
+ mova [r1 + 16], xm31
2199
+ vextracti128 [r1 + r2], ym14, 1
2200
+ vextracti128 [r1 + r2 + 16], ym31, 1
2201
+
2202
+ IDCT16_AVX512_PASS2 18, 22
2203
+ lea r1, [r1 + 2 * r2]
2204
+ mova [r1], xm5
2205
+ mova [r1 + 16], xm2
2206
+ vextracti128 [r1 + r2], ym5, 1
2207
+ vextracti128 [r1 + r2 + 16], ym2, 1
2208
+ vextracti64x4 ym14, m5, 1
2209
+ vextracti64x4 ym31, m2, 1
2210
+ lea r1, [r1 + 2 * r2]
2211
+ mova [r1], xm14
2212
+ mova [r1 + 16], xm31
2213
+ vextracti128 [r1 + r2], ym14, 1
2214
+ vextracti128 [r1 + r2 + 16], ym31, 1
2215
+
2216
+ IDCT16_AVX512_PASS2 20, 24
2217
+ lea r1, [r1 + 2 * r2]
2218
+ mova [r1], xm5
2219
+ mova [r1 + 16], xm2
2220
+ vextracti128 [r1 + r2], ym5, 1
2221
+ vextracti128 [r1 + r2 + 16], ym2, 1
2222
+ vextracti64x4 ym14, m5, 1
2223
+ vextracti64x4 ym31, m2, 1
2224
+ lea r1, [r1 + 2 * r2]
2225
+ mova [r1], xm14
2226
+ mova [r1 + 16], xm31
2227
+ vextracti128 [r1 + r2], ym14, 1
2228
+ vextracti128 [r1 + r2 + 16], ym31, 1
2229
+
2230
+ IDCT16_AVX512_PASS2 21, 25
2231
+ lea r1, [r1 + 2 * r2]
2232
+ mova [r1], xm5
2233
+ mova [r1 + 16], xm2
2234
+ vextracti128 [r1 + r2], ym5, 1
2235
+ vextracti128 [r1 + r2 + 16], ym2, 1
2236
+ vextracti64x4 ym14, m5, 1
2237
+ vextracti64x4 ym31, m2, 1
2238
+ lea r1, [r1 + 2 * r2]
2239
+ mova [r1], xm14
2240
+ mova [r1 + 16], xm31
2241
+ vextracti128 [r1 + r2], ym14, 1
2242
+ vextracti128 [r1 + r2 + 16], ym31, 1
2243
+ RET
2244
+
2245
+
2246
+
2247
%macro IDCT32_PASS1 1
2248
vbroadcasti128 m3, [tab_idct32_1 + %1 * 32]
2249
vbroadcasti128 m13, [tab_idct32_1 + %1 * 32 + 16]
2250
2251
jnz .pass2
2252
RET
2253
2254
+
2255
+%macro IDCT32_AVX512_PASS1 5
2256
+ pmaddwd m9, m8, m%4
2257
+ pmaddwd m10, m7, m%5
2258
+
2259
+ paddd m9, m10
2260
+ vpsrldq m0, m9, 8
2261
+ paddd m9, m0
2262
+ vpsrldq m0, m9, 4
2263
+ paddd m9, m0
2264
+
2265
+ pmaddwd m10, m4, m%4
2266
+ pmaddwd m11, m1, m%5
2267
+
2268
+ paddd m10, m11
2269
+ vpsrldq m0, m10, 8
2270
+ paddd m10, m0
2271
+ vpslldq m0, m10, 4
2272
+ paddd m10, m0
2273
+
2274
+ vmovdqu32 m9 {k3}, m10
2275
+
2276
+ mova m6, [tab_idct32_AVX512_5 + %1 * 64]
2277
+ mova m5, [tab_idct32_AVX512_5 + %1 * 64 + 64]
2278
+
2279
+ pmaddwd m10, m8, m6
2280
+ pmaddwd m11, m7, m5
2281
+
2282
+ paddd m10, m11
2283
+ vpslldq m0, m10, 8
2284
+ paddd m10, m0
2285
+ vpsrldq m0, m10, 4
2286
+ paddd m10, m0
2287
+
2288
+ pmaddwd m11, m4, m6
2289
+ pmaddwd m12, m1, m5
2290
+
2291
+ paddd m11, m12
2292
+ vpslldq m0, m11, 8
2293
+ paddd m11, m0
2294
+ vpslldq m0, m11, 4
2295
+ paddd m11, m0
2296
+
2297
+ vmovdqu32 m10 {k4}, m11
2298
+ vmovdqu32 m9 {k2}, m10
2299
+
2300
+ pmaddwd m10, m3, m%2
2301
+ pmaddwd m11, m14, m%2
2302
+
2303
+ vpsrldq m0, m10, 4
2304
+ paddd m10, m0
2305
+ vpslldq m5, m11, 4
2306
+ paddd m11, m5
2307
+ vmovdqu32 m10 {k1}, m11
2308
+
2309
+ vpsrldq m0, m10, 8
2310
+ paddd m10, m0
2311
+
2312
+ pmaddwd m11, m2, m%3
2313
+ pmaddwd m12, m13, m%3
2314
+
2315
+ vpsrldq m0, m11, 4
2316
+ paddd m11, m0
2317
+ vpslldq m5, m12, 4
2318
+ paddd m12, m5
2319
+ vmovdqu32 m11 {k1}, m12
2320
+
2321
+ vpsrldq m0, m11, 8
2322
+ paddd m11, m0
2323
+
2324
+ paddd m12, m10, m11
2325
+ psubd m10, m11
2326
+
2327
+ punpcklqdq m12, m10
2328
+ paddd m10, m9, m12
2329
+ paddd m10, m15
2330
+ psrad m10, IDCT_SHIFT1
2331
+
2332
+ psubd m12, m9
2333
+ paddd m12, m15
2334
+ psrad m12, IDCT_SHIFT1
2335
+
2336
+ packssdw m10, m12
2337
+ vextracti128 xm12, m10, 1
2338
+ vextracti64x4 ym5, m10, 1
2339
+ vextracti128 xm0, ym5, 1
2340
+
2341
+ movd [r3 + %1 * 64], xm10
2342
+ movd [r3 + 32 + %1 * 64], xm12
2343
+ pextrd [r4 - %1 * 64], xm10, 1
2344
+ pextrd [r4+ 32 - %1 * 64], xm12, 1
2345
+ pextrd [r3 + 16 * 64 + %1 *64], xm10, 3
2346
+ pextrd [r3 + 16 * 64 + 32 + %1 * 64], xm12, 3
2347
+ pextrd [r4 + 16 * 64 - %1 * 64], xm10, 2
2348
+ pextrd [r4 + 16 * 64 + 32 - %1 * 64], xm12, 2
2349
+
2350
+ movd [r3 + (%1 + 1) * 64], xm5
2351
+ movd [r3 + 32 + (%1 + 1) * 64], xm0
2352
+ pextrd [r4 - (%1 + 1) * 64], xm5, 1
2353
+ pextrd [r4+ 32 - (%1 + 1) * 64], xm0, 1
2354
+ pextrd [r3 + 16 * 64 + (%1 + 1) * 64], xm5, 3
2355
+ pextrd [r3 + 16 * 64 + 32 + (%1 + 1) * 64], xm0, 3
2356
+ pextrd [r4 + 16 * 64 - (%1 + 1) * 64], xm5, 2
2357
+ pextrd [r4 + 16 * 64 + 32 - (%1 + 1) * 64], xm0, 2
2358
+%endmacro
2359
+
2360
+%macro IDCT32_AVX512_PASS2 0
2361
+ pmaddwd m2, m0, m7
2362
+ pmaddwd m3, m0, m8
2363
+
2364
+ vpsrldq m24, m2, 4
2365
+ paddd m2, m24
2366
+ vpslldq m25, m3, 4
2367
+ paddd m3, m25
2368
+ vmovdqu32 m2 {k1}, m3
2369
+
2370
+ pmaddwd m3, m0, m9
2371
+ pmaddwd m4, m0, m10
2372
+
2373
+ vpsrldq m24, m3, 4
2374
+ paddd m3, m24
2375
+ vpslldq m25, m4, 4
2376
+ paddd m4, m25
2377
+ vmovdqu32 m3 {k1}, m4
2378
+
2379
+ vpsrldq m24, m2, 8
2380
+ paddd m2, m24
2381
+ vpslldq m25, m3, 8
2382
+ paddd m3, m25
2383
+ vmovdqu32 m2 {k2}, m3
2384
+
2385
+ pmaddwd m3, m0, m11
2386
+ pmaddwd m4, m0, m12
2387
+
2388
+ vpsrldq m24, m3, 4
2389
+ paddd m3, m24
2390
+ vpslldq m25, m4, 4
2391
+ paddd m4, m25
2392
+ vmovdqu32 m3 {k1}, m4
2393
+
2394
+ pmaddwd m4, m0, m13
2395
+ pmaddwd m5, m0, m14
2396
+
2397
+ vpsrldq m24, m4, 4
2398
+ paddd m4, m24
2399
+ vpslldq m25, m5, 4
2400
+ paddd m5, m25
2401
+ vmovdqu32 m4 {k1}, m5
2402
+
2403
+ vpsrldq m24, m3, 8
2404
+ paddd m3, m24
2405
+ vpslldq m25, m4, 8
2406
+ paddd m4, m25
2407
+ vmovdqu32 m3 {k2}, m4
2408
+
2409
+ mova m24, [idct16_AVX512_shuff3]
2410
+ mova m25, [idct16_AVX512_shuff2]
2411
+ vpermi2q m24, m2, m3
2412
+ vpermi2q m25, m2, m3
2413
+ paddd m2, m25, m24
2414
+
2415
+ pmaddwd m3, m0, m16
2416
+ pmaddwd m4, m0, m17
2417
+
2418
+ vpsrldq m24, m3, 4
2419
+ paddd m3, m24
2420
+ vpslldq m25, m4, 4
2421
+ paddd m4, m25
2422
+ vmovdqu32 m3 {k1}, m4
2423
+
2424
+ pmaddwd m4, m0, m18
2425
+ pmaddwd m5, m0, m19
2426
+
2427
+ vpsrldq m24, m4, 4
2428
+ paddd m4, m24
2429
+ vpslldq m25, m5, 4
2430
+ paddd m5, m25
2431
+ vmovdqu32 m4 {k1}, m5
2432
+
2433
+ vpsrldq m24, m3, 8
2434
+ paddd m3, m24
2435
+ vpslldq m25, m4, 8
2436
+ paddd m4, m25
2437
+ vmovdqu32 m3 {k2}, m4
2438
+
2439
+ pmaddwd m4, m0, m20
2440
+ pmaddwd m5, m0, m21
2441
+
2442
+ vpsrldq m24, m4, 4
2443
+ paddd m4, m24
2444
+ vpslldq m25, m5, 4
2445
+ paddd m5, m25
2446
+ vmovdqu32 m4 {k1}, m5
2447
+
2448
+ pmaddwd m5, m0, m22
2449
+ pmaddwd m0, m23
2450
+
2451
+ vpsrldq m24, m5, 4
2452
+ paddd m5, m24
2453
+ vpslldq m25, m0, 4
2454
+ paddd m0, m25
2455
+ vmovdqu32 m5 {k1}, m0
2456
+
2457
+ vpsrldq m24, m4, 8
2458
+ paddd m4, m24
2459
+ vpslldq m25, m5, 8
2460
+ paddd m5, m25
2461
+ vmovdqu32 m4 {k2}, m5
2462
+
2463
+ mova m24, [idct16_AVX512_shuff3]
2464
+ mova m25, [idct16_AVX512_shuff2]
2465
+ vpermi2q m24, m3, m4
2466
+ vpermi2q m25, m3, m4
2467
+ paddd m3, m25, m24
2468
+
2469
+ pmaddwd m4, m1, m26
2470
+ pmaddwd m0, m1, m27
2471
+
2472
+ vpsrldq m24, m4, 4
2473
+ paddd m4, m24
2474
+ vpslldq m25, m0, 4
2475
+ paddd m0, m25
2476
+ vmovdqu32 m4 {k1}, m0
2477
+
2478
+ pmaddwd m5, m1, m28
2479
+ pmaddwd m0, m1, m29
2480
+
2481
+ vpsrldq m24, m5, 4
2482
+ paddd m5, m24
2483
+ vpslldq m25, m0, 4
2484
+ paddd m0, m25
2485
+ vmovdqu32 m5 {k1}, m0
2486
+
2487
+
2488
+ vpsrldq m24, m4, 8
2489
+ paddd m4, m24
2490
+ vpslldq m25, m5, 8
2491
+ paddd m5, m25
2492
+ vmovdqu32 m4 {k2}, m5
2493
+
2494
+ pmaddwd m5, m1, m30
2495
+ pmaddwd m0, m1, m31
2496
+
2497
+ vpsrldq m24, m5, 4
2498
+ paddd m5, m24
2499
+ vpslldq m25, m0, 4
2500
+ paddd m0, m25
2501
+ vmovdqu32 m5 {k1}, m0
2502
+
2503
+ pmaddwd m6, m1, [tab_idct32_AVX512_4 + 6 * mmsize]
2504
+ pmaddwd m0, m1, [tab_idct32_AVX512_4 + 7 * mmsize]
2505
+
2506
+ vpsrldq m24, m6, 4
2507
+ paddd m6, m24
2508
+ vpslldq m25, m0, 4
2509
+ paddd m0, m25
2510
+ vmovdqu32 m6 {k1}, m0
2511
+
2512
+ vpsrldq m24, m5, 8
2513
+ paddd m5, m24
2514
+ vpslldq m25, m6, 8
2515
+ paddd m6, m25
2516
+ vmovdqu32 m5 {k2}, m6
2517
+
2518
+ mova m24, [idct16_AVX512_shuff3]
2519
+ mova m25, [idct16_AVX512_shuff2]
2520
+ vpermi2q m24, m4, m5
2521
+ vpermi2q m25, m4, m5
2522
+ paddd m4, m25, m24
2523
+
2524
+ pmaddwd m5, m1, [tab_idct32_AVX512_4 + 8 * mmsize]
2525
+ pmaddwd m0, m1, [tab_idct32_AVX512_4 + 9 * mmsize]
2526
+
2527
+ vpsrldq m24, m5, 4
2528
+ paddd m5, m24
2529
+ vpslldq m25, m0, 4
2530
+ paddd m0, m25
2531
+ vmovdqu32 m5 {k1}, m0
2532
+
2533
+ pmaddwd m6, m1, [tab_idct32_AVX512_4 + 10 * mmsize]
2534
+ pmaddwd m0, m1, [tab_idct32_AVX512_4 + 11 * mmsize]
2535
+
2536
+ vpsrldq m24, m6, 4
2537
+ paddd m6, m24
2538
+ vpslldq m25, m0, 4
2539
+ paddd m0, m25
2540
+ vmovdqu32 m6 {k1}, m0
2541
+
2542
+ vpsrldq m24, m5, 8
2543
+ paddd m5, m24
2544
+ vpslldq m25, m6, 8
2545
+ paddd m6, m25
2546
+ vmovdqu32 m5 {k2}, m6
2547
+
2548
+ pmaddwd m6, m1, [tab_idct32_AVX512_4 + 12 * mmsize]
2549
+ pmaddwd m0, m1, [tab_idct32_AVX512_4 + 13 * mmsize]
2550
+
2551
+ vpsrldq m24, m6, 4
2552
+ paddd m6, m24
2553
+ vpslldq m25, m0, 4
2554
+ paddd m0, m25
2555
+ vmovdqu32 m6 {k1}, m0
2556
+
2557
+ pmaddwd m0, m1, [tab_idct32_AVX512_4 + 14 * mmsize]
2558
+ pmaddwd m1, [tab_idct32_AVX512_4 + 15 * mmsize]
2559
+
2560
+ vpsrldq m24, m0, 4
2561
+ paddd m0, m24
2562
+ vpslldq m25, m1, 4
2563
+ paddd m1, m25
2564
+ vmovdqu32 m0 {k1}, m1
2565
+
2566
+ vpsrldq m24, m6, 8
2567
+ paddd m6, m24
2568
+ vpslldq m25, m0, 8
2569
+ paddd m0, m25
2570
+ vmovdqu32 m6 {k2}, m0
2571
+
2572
+ mova m24, [idct16_AVX512_shuff3]
2573
+ mova m25, [idct16_AVX512_shuff2]
2574
+ vpermi2q m24, m5, m6
2575
+ vpermi2q m25, m5, m6
2576
+ paddd m5, m25, m24
2577
+
2578
+ paddd m6, m2, m4
2579
+ paddd m6, m15
2580
+ psrad m6, IDCT_SHIFT2
2581
+
2582
+ psubd m2, m4
2583
+ paddd m2, m15
2584
+ psrad m2, IDCT_SHIFT2
2585
+
2586
+ paddd m4, m3, m5
2587
+ paddd m4, m15
2588
+ psrad m4, IDCT_SHIFT2
2589
+
2590
+ psubd m3, m5
2591
+ paddd m3, m15
2592
+ psrad m3, IDCT_SHIFT2
2593
+
2594
+ packssdw m6, m4
2595
+ packssdw m2, m3
2596
+
2597
+ vpermq m6, m6, 0xD8
2598
+ vpermq m2, m2, 0x8D
2599
+ pshufb m2, [idct16_AVX512_shuff6]
2600
+%endmacro
2601
+
2602
+;-------------------------------------------------------------------
2603
+; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride)
2604
+;-------------------------------------------------------------------
2605
+
2606
+INIT_ZMM avx512
2607
+cglobal idct32, 3, 8, 32, 0-32*64
2608
+
2609
+%define IDCT_SHIFT1 7
2610
+
2611
+ vbroadcasti128 m15, [pd_64]
2612
+
2613
+ mov r3, rsp
2614
+ lea r4, [r3 + 15 * 64]
2615
+ mov r5d, 8
2616
+ mov r7d, 0xAAAA
2617
+ kmovd k1, r7d
2618
+ mov r7d, 0xCCCC
2619
+ kmovd k2, r7d
2620
+ mov r7d, 0x2222
2621
+ kmovd k3, r7d
2622
+ mov r7d, 0x8888
2623
+ kmovd k4, r7d
2624
+
2625
+
2626
+ mova m16, [tab_idct32_AVX512_2 + 0 * 64]
2627
+ mova m17, [tab_idct32_AVX512_2 + 1 * 64]
2628
+ mova m18, [tab_idct32_AVX512_2 + 2 * 64]
2629
+ mova m19, [tab_idct32_AVX512_2 + 3 * 64]
2630
+
2631
+ mova m20, [tab_idct32_AVX512_3 + 0 * 64]
2632
+ mova m21, [tab_idct32_AVX512_3 + 1 * 64]
2633
+ mova m22, [tab_idct32_AVX512_3 + 2 * 64]
2634
+ mova m23, [tab_idct32_AVX512_3 + 3 * 64]
2635
+
2636
+ mova m24, [tab_idct32_AVX512_1 + 0 * 64]
2637
+ mova m25, [tab_idct32_AVX512_1 + 1 * 64]
2638
+ mova m26, [tab_idct32_AVX512_1 + 2 * 64]
2639
+ mova m27, [tab_idct32_AVX512_1 + 3 * 64]
2640
+ mova m28, [tab_idct32_AVX512_1 + 4 * 64]
2641
+ mova m29, [tab_idct32_AVX512_1 + 5 * 64]
2642
+ mova m30, [tab_idct32_AVX512_1 + 6 * 64]
2643
+ mova m31, [tab_idct32_AVX512_1 + 7 * 64]
2644
+
2645
+.pass1:
2646
+ movq xm0, [r0 + 2 * 64]
2647
+ movq xm1, [r0 + 18 * 64]
2648
+ punpcklqdq xm0, xm0, xm1
2649
+ movq xm1, [r0 + 0 * 64]
2650
+ movq xm2, [r0 + 16 * 64]
2651
+ punpcklqdq xm1, xm1, xm2
2652
+ vinserti128 ym0, ym0, xm1, 1 ;[2 18 0 16]
2653
+
2654
+ movq xm1, [r0 + 1 * 64]
2655
+ movq xm2, [r0 + 9 * 64]
2656
+ punpcklqdq xm1, xm1, xm2
2657
+ movq xm2, [r0 + 17 * 64]
2658
+ movq xm3, [r0 + 25 * 64]
2659
+ punpcklqdq xm2, xm2, xm3
2660
+ vinserti128 ym1, ym1, xm2, 1 ;[1 9 17 25]
2661
+
2662
+ movq xm2, [r0 + 6 * 64]
2663
+ movq xm3, [r0 + 22 * 64]
2664
+ punpcklqdq xm2, xm2, xm3
2665
+ movq xm3, [r0 + 4 * 64]
2666
+ movq xm4, [r0 + 20 * 64]
2667
+ punpcklqdq xm3, xm3, xm4
2668
+ vinserti128 ym2, ym2, xm3, 1 ;[6 22 4 20]
2669
+
2670
+ movq xm3, [r0 + 3 * 64]
2671
+ movq xm4, [r0 + 11 * 64]
2672
+ punpcklqdq xm3, xm3, xm4
2673
+ movq xm4, [r0 + 19 * 64]
2674
+ movq xm5, [r0 + 27 * 64]
2675
+ punpcklqdq xm4, xm4, xm5
2676
+ vinserti128 ym3, ym3, xm4, 1 ;[3 11 17 25]
2677
+
2678
+ movq xm4, [r0 + 10 * 64]
2679
+ movq xm5, [r0 + 26 * 64]
2680
+ punpcklqdq xm4, xm4, xm5
2681
+ movq xm5, [r0 + 8 * 64]
2682
+ movq xm6, [r0 + 24 * 64]
2683
+ punpcklqdq xm5, xm5, xm6
2684
+ vinserti128 ym4, ym4, xm5, 1 ;[10 26 8 24]
2685
+
2686
+ movq xm5, [r0 + 5 * 64]
2687
+ movq xm6, [r0 + 13 * 64]
2688
+ punpcklqdq xm5, xm5, xm6
2689
+ movq xm6, [r0 + 21 * 64]
2690
+ movq xm7, [r0 + 29 * 64]
2691
+ punpcklqdq xm6, xm6, xm7
2692
+ vinserti128 ym5, ym5, xm6, 1 ;[5 13 21 9]
2693
+
2694
+ movq xm6, [r0 + 14 * 64]
2695
+ movq xm7, [r0 + 30 * 64]
2696
+ punpcklqdq xm6, xm6, xm7
2697
+ movq xm7, [r0 + 12 * 64]
2698
+ movq xm8, [r0 + 28 * 64]
2699
+ punpcklqdq xm7, xm7, xm8
2700
+ vinserti128 ym6, ym6, xm7, 1 ;[14 30 12 28]
2701
+
2702
+ movq xm7, [r0 + 7 * 64]
2703
+ movq xm8, [r0 + 15 * 64]
2704
+ punpcklqdq xm7, xm7, xm8
2705
+ movq xm8, [r0 + 23 * 64]
2706
+ movq xm9, [r0 + 31 * 64]
2707
+ punpcklqdq xm8, xm8, xm9
2708
+ vinserti128 ym7, ym7, xm8, 1 ;[7 15 23 31]
2709
+
2710
+ punpckhwd ym8, ym0, ym2 ;[18 22 16 20]
2711
+ punpcklwd ym0, ym2 ;[2 6 0 4]
2712
+
2713
+ punpckhwd ym2, ym1, ym3 ;[9 11 25 27]
2714
+ punpcklwd ym1, ym3 ;[1 3 17 19]
2715
+
2716
+ punpckhwd ym3, ym4, ym6 ;[26 30 24 28]
2717
+ punpcklwd ym4, ym6 ;[10 14 8 12]
2718
+
2719
+ punpckhwd ym6, ym5, ym7 ;[13 15 29 31]
2720
+ punpcklwd ym5, ym7 ;[5 7 21 23]
2721
+
2722
+ punpckhdq ym7, ym0, ym4 ;[22 62 102 142 23 63 103 143 02 42 82 122 03 43 83 123]
2723
+ punpckldq ym0, ym4 ;[20 60 100 140 21 61 101 141 00 40 80 120 01 41 81 121]
2724
+
2725
+ punpckhdq ym4, ym8, ym3 ;[182 222 262 302 183 223 263 303 162 202 242 282 163 203 243 283]
2726
+ punpckldq ym8, ym3 ;[180 220 260 300 181 221 261 301 160 200 240 280 161 201 241 281]
2727
+
2728
+ punpckhdq ym3, ym1, ym5 ;[12 32 52 72 13 33 53 73 172 192 212 232 173 193 213 233]
2729
+ punpckldq ym1, ym5 ;[10 30 50 70 11 31 51 71 170 190 210 230 171 191 211 231]
2730
+
2731
+ punpckhdq ym5, ym2, ym6 ;[92 112 132 152 93 113 133 153 252 272 292 312 253 273 293 313]
2732
+ punpckldq ym2, ym6 ;[90 110 130 150 91 111 131 151 250 270 290 310 251 271 291 311]
2733
+
2734
+ punpckhqdq ym6, ym0, ym8 ;[21 61 101 141 181 221 261 301 01 41 81 121 161 201 241 281]
2735
+ punpcklqdq ym0, ym8 ;[20 60 100 140 180 220 260 300 00 40 80 120 160 200 240 280]
2736
+
2737
+ punpckhqdq ym8, ym7, ym4 ;[23 63 103 143 183 223 263 303 03 43 83 123 163 203 243 283]
2738
+ punpcklqdq ym7, ym4 ;[22 62 102 142 182 222 262 302 02 42 82 122 162 202 242 282]
2739
+
2740
+ punpckhqdq ym4, ym1, ym2 ;[11 31 51 71 91 111 131 151 171 191 211 231 251 271 291 311]
2741
+ punpcklqdq ym1, ym2 ;[10 30 50 70 90 110 130 150 170 190 210 230 250 270 290 310]
2742
+
2743
+ punpckhqdq ym2, ym3, ym5 ;[13 33 53 73 93 113 133 153 173 193 213 233 253 273 293 313]
2744
+ punpcklqdq ym3, ym5 ;[12 32 52 72 92 112 132 152 172 192 212 232 252 272 292 312]
2745
+
2746
+ vinserti64x4 m7, m7, ym7, 1
2747
+ vinserti64x4 m8, m8, ym8, 1
2748
+ movu m13, [idct16_AVX512_shuff2]
2749
+ movu m14, [idct16_AVX512_shuff3]
2750
+ vpermi2q m13, m7, m8
2751
+ vpermi2q m14, m7, m8
2752
+
2753
+ vinserti64x4 m1, m1, ym1, 1
2754
+ vinserti64x4 m4, m4, ym4, 1
2755
+ movu m7, [idct16_AVX512_shuff3]
2756
+ movu m8, [idct16_AVX512_shuff2]
2757
+ vpermi2q m7, m1, m4
2758
+ vpermi2q m8, m1, m4
2759
+
2760
+ vinserti64x4 m3, m3, ym3, 1
2761
+ vinserti64x4 m2, m2, ym2, 1
2762
+ movu m1, [idct16_AVX512_shuff3]
2763
+ movu m4, [idct16_AVX512_shuff2]
2764
+ vpermi2q m1, m3, m2
2765
+ vpermi2q m4, m3, m2
2766
+
2767
+ vinserti64x4 m0, m0, ym0, 1
2768
+ vinserti64x4 m6, m6, ym6, 1
2769
+ movu m2, [idct16_AVX512_shuff2]
2770
+ movu m3, [idct16_AVX512_shuff3]
2771
+ vpermi2q m2, m0, m6
2772
+ vpermi2q m3, m0, m6
2773
+
2774
+
2775
+ IDCT32_AVX512_PASS1 0, 16, 20, 24, 25
2776
+ IDCT32_AVX512_PASS1 2, 17, 21, 26, 27
2777
+ IDCT32_AVX512_PASS1 4, 18, 22, 28, 29
2778
+ IDCT32_AVX512_PASS1 6, 19, 23, 30, 31
2779
+
2780
+ add r0, 8
2781
+ add r3, 4
2782
+ add r4, 4
2783
+ dec r5d
2784
+ jnz .pass1
2785
+
2786
+%if BIT_DEPTH == 12
2787
+ %define IDCT_SHIFT2 8
2788
+ vpbroadcastd m15, [pd_128]
2789
+%elif BIT_DEPTH == 10
2790
+ %define IDCT_SHIFT2 10
2791
+ vpbroadcastd m15, [pd_512]
2792
+%elif BIT_DEPTH == 8
2793
+ %define IDCT_SHIFT2 12
2794
+ vpbroadcastd m15, [pd_2048]
2795
+%else
2796
+ %error Unsupported BIT_DEPTH!
2797
+%endif
2798
+
2799
+ mov r3, rsp
2800
+ add r2d, r2d
2801
+ mov r4d, 16
2802
+ mov r6d, 0xFFFF0000
2803
+ kmovd k3, r6d
2804
+
2805
+ mova m7, [tab_idct32_AVX512_6]
2806
+ mova m8, [tab_idct32_AVX512_6 + 1 * mmsize]
2807
+ mova m9, [tab_idct32_AVX512_6 + 2 * mmsize]
2808
+ mova m10, [tab_idct32_AVX512_6 + 3 * mmsize]
2809
+ mova m11, [tab_idct32_AVX512_6 + 4 * mmsize]
2810
+ mova m12, [tab_idct32_AVX512_6 + 5 * mmsize]
2811
+ mova m13, [tab_idct32_AVX512_6 + 6 * mmsize]
2812
+ mova m14, [tab_idct32_AVX512_6 + 7 * mmsize]
2813
+ mova m16, [tab_idct32_AVX512_6 + 8 * mmsize]
2814
+ mova m17, [tab_idct32_AVX512_6 + 9 * mmsize]
2815
+ mova m18, [tab_idct32_AVX512_6 + 10 * mmsize]
2816
+ mova m19, [tab_idct32_AVX512_6 + 11 * mmsize]
2817
+ mova m20, [tab_idct32_AVX512_6 + 12 * mmsize]
2818
+ mova m21, [tab_idct32_AVX512_6 + 13 * mmsize]
2819
+ mova m22, [tab_idct32_AVX512_6 + 14 * mmsize]
2820
+ mova m23, [tab_idct32_AVX512_6 + 15 * mmsize]
2821
+ mova m26, [tab_idct32_AVX512_4]
2822
+ mova m27, [tab_idct32_AVX512_4 + 1 * mmsize]
2823
+ mova m28, [tab_idct32_AVX512_4 + 2 * mmsize]
2824
+ mova m29, [tab_idct32_AVX512_4 + 3 * mmsize]
2825
+ mova m30, [tab_idct32_AVX512_4 + 4 * mmsize]
2826
+ mova m31, [tab_idct32_AVX512_4 + 5 * mmsize]
2827
+
2828
+.pass2:
2829
+ movu ym0, [r3]
2830
+ movu ym1, [r3 + 32]
2831
+ vmovdqu16 m0 {k3}, [r3 + 32]
2832
+ vmovdqu16 m1 {k3}, [r3 + 64]
2833
+
2834
+ IDCT32_AVX512_PASS2
2835
+ movu [r1], ym6
2836
+ movu [r1 + 32], ym2
2837
+ vextracti64x4 ym24, m6, 1
2838
+ vextracti64x4 ym25, m2, 1
2839
+ add r1, r2
2840
+ movu [r1 ], ym24
2841
+ movu [r1 + 32], ym25
2842
+
2843
+ add r1, r2
2844
+ add r3, 128
2845
+ dec r4d
2846
+ jnz .pass2
2847
+ RET
2848
+
2849
;-------------------------------------------------------
2850
; void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
2851
;-------------------------------------------------------
2852
2853
movhps [r1 + 2 * r2], xm0
2854
movhps [r1 + r3], xm1
2855
RET
2856
+
2857
+;static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
2858
+;{
2859
+; const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
2860
+; const int scaleBits = SCALE_BITS - 2 * transformShift;
2861
+; const uint32_t trSize = 1 << log2TrSize;
2862
+
2863
+; for (int y = 0; y < MLS_CG_SIZE; y++)
2864
+; {
2865
+; for (int x = 0; x < MLS_CG_SIZE; x++)
2866
+; {
2867
+; int signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
2868
+; costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
2869
+; *totalUncodedCost += costUncoded[blkPos + x];
2870
+; *totalRdCost += costUncoded[blkPos + x];
2871
+; }
2872
+; blkPos += trSize;
2873
+; }
2874
+;}
2875
+
2876
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
2877
+; void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
2878
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
2879
+INIT_ZMM avx512
2880
+cglobal nonPsyRdoQuant4, 5, 5, 8
2881
+ mov r4d, r4m
2882
+ lea r0, [r0 + 2 * r4]
2883
+ lea r4, [4 * r4]
2884
+ lea r1, [r1 + 2 * r4]
2885
+%if BIT_DEPTH == 12
2886
+ mov r4, [tab_nonpsyRdo12]
2887
+%elif BIT_DEPTH == 10
2888
+ mov r4, [tab_nonpsyRdo10]
2889
+%elif BIT_DEPTH == 8
2890
+ mov r4, [tab_nonpsyRdo8]
2891
+%else
2892
+ %error Unsupported BIT_DEPTH!
2893
+ %endif
2894
+ movq xm3, r4
2895
+ movq xm6, [r2]
2896
+ movq xm7, [r3]
2897
+ vpxor m4, m4
2898
+ vpxor m5, m5
2899
+;Row 1, 2
2900
+ movu xm0, [r0]
2901
+ vpmovsxwq m1, xm0
2902
+ vcvtqq2pd m2, m1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
2903
+ vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements
2904
+ vcvtpd2qq m1, m2
2905
+ vpsllq m1, xm3 ; costUncoded
2906
+ paddq m4, m1
2907
+ movu [r1], m1
2908
+ ;Row 3, 4
2909
+ movu xm0, [r0 + 16]
2910
+ vpmovsxwq m1, xm0
2911
+ vcvtqq2pd m2, m1
2912
+ vfmadd213pd m2, m2, m5
2913
+ vcvtpd2qq m1, m2
2914
+ vpsllq m1, xm3 ; costUncoded
2915
+ paddq m4, m1
2916
+ movu [r1 + 64], m1
2917
+ vextracti32x8 ym2, m4, 1
2918
+ paddq ym4, ym2
2919
+ vextracti32x4 xm2, m4, 1
2920
+ paddq xm4, xm2
2921
+ punpckhqdq xm2, xm4, xm5
2922
+ paddq xm4, xm2
2923
+
2924
+ paddq xm6, xm4
2925
+ paddq xm7, xm4
2926
+
2927
+ movq [r2], xm6
2928
+ movq [r3], xm7
2929
+ RET
2930
+INIT_ZMM avx512
2931
+cglobal nonPsyRdoQuant8, 5, 5, 8
2932
+ mov r4d, r4m
2933
+ lea r0, [r0 + 2 * r4]
2934
+ lea r4, [4 * r4]
2935
+ lea r1, [r1 + 2 * r4]
2936
+%if BIT_DEPTH == 12
2937
+ mov r4, [tab_nonpsyRdo12 + 8]
2938
+%elif BIT_DEPTH == 10
2939
+ mov r4, [tab_nonpsyRdo10 + 8]
2940
+%elif BIT_DEPTH == 8
2941
+ mov r4, [tab_nonpsyRdo8 + 8]
2942
+%else
2943
+ %error Unsupported BIT_DEPTH!
2944
+ %endif
2945
+ movq xm3, r4
2946
+ movq xm6, [r2]
2947
+ movq xm7, [r3]
2948
+ vpxor m4, m4
2949
+ vpxor m5, m5
2950
+
2951
+;Row 1, 2
2952
+ movq xm0, [r0]
2953
+ pinsrq xm0, [r0 + mmsize/4], 1
2954
+ vpmovsxwq m1, xm0
2955
+ vcvtqq2pd m2, m1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
2956
+ vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements
2957
+ vcvtpd2qq m1, m2
2958
+ vpsllq m1, xm3 ; costUncoded
2959
+ paddq m4, m1
2960
+ movu [r1], ym1
2961
+ vextracti32x8 [r1 + mmsize], m1 , 1
2962
+
2963
+ ;Row 3, 4
2964
+ movq xm0, [r0 + mmsize/2]
2965
+ pinsrq xm0, [r0 + 3 * mmsize/4], 1
2966
+ vpmovsxwq m1, xm0
2967
+ vcvtqq2pd m2, m1
2968
+ vfmadd213pd m2, m2, m5
2969
+ vcvtpd2qq m1, m2
2970
+ vpsllq m1, xm3 ; costUncoded
2971
+ paddq m4, m1
2972
+ movu [r1 + 2 * mmsize], ym1
2973
+ vextracti32x8 [r1 + 3 * mmsize], m1 , 1
2974
+
2975
+ vextracti32x8 ym2, m4, 1
2976
+ paddq ym4, ym2
2977
+ vextracti32x4 xm2, m4, 1
2978
+ paddq xm4, xm2
2979
+ punpckhqdq xm2, xm4, xm5
2980
+ paddq xm4, xm2
2981
+
2982
+ paddq xm6, xm4
2983
+ paddq xm7, xm4
2984
+
2985
+ movq [r2], xm6
2986
+ movq [r3], xm7
2987
+ RET
2988
+INIT_ZMM avx512
2989
+cglobal nonPsyRdoQuant16, 5, 5, 8
2990
+ mov r4d, r4m
2991
+ lea r0, [r0 + 2 * r4]
2992
+ lea r4, [4 * r4]
2993
+ lea r1, [r1 + 2 * r4]
2994
+%if BIT_DEPTH == 12
2995
+ mov r4, [tab_nonpsyRdo12 + 16]
2996
+%elif BIT_DEPTH == 10
2997
+ mov r4, [tab_nonpsyRdo10 + 16]
2998
+%elif BIT_DEPTH == 8
2999
+ mov r4, [tab_nonpsyRdo8 + 16]
3000
+%else
3001
+ %error Unsupported BIT_DEPTH!
3002
+ %endif
3003
+ movq xm3, r4
3004
+ movq xm6, [r2]
3005
+ movq xm7, [r3]
3006
+ vpxor m4, m4
3007
+ vpxor m5, m5
3008
+
3009
+;Row 1, 2
3010
+ movq xm0, [r0]
3011
+ pinsrq xm0, [r0 + mmsize/2], 1
3012
+ vpmovsxwq m1, xm0
3013
+ vcvtqq2pd m2, m1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
3014
+ vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements
3015
+ vcvtpd2qq m1, m2
3016
+ vpsllq m1, xm3 ; costUncoded
3017
+ paddq m4, m1
3018
+ movu [r1], ym1
3019
+ vextracti32x8 [r1 + 2 * mmsize], m1, 1
3020
+
3021
+ ;Row 3, 4
3022
+ movq xm0, [r0 + mmsize]
3023
+ pinsrq xm0, [r0 + 3 * mmsize/2], 1
3024
+ vpmovsxwq m1, xm0
3025
+ vcvtqq2pd m2, m1
3026
+ vfmadd213pd m2, m2, m5
3027
+ vcvtpd2qq m1, m2
3028
+ vpsllq m1, xm3 ; costUncoded
3029
+ paddq m4, m1
3030
+ movu [r1 + 4 * mmsize], ym1
3031
+ vextracti32x8 [r1 + 6 * mmsize], m1 , 1
3032
+
3033
+ vextracti32x8 ym2, m4, 1
3034
+ paddq ym4, ym2
3035
+ vextracti32x4 xm2, m4, 1
3036
+ paddq xm4, xm2
3037
+ punpckhqdq xm2, xm4, xm5
3038
+ paddq xm4, xm2
3039
+
3040
+ paddq xm6, xm4
3041
+ paddq xm7, xm4
3042
+
3043
+ movq [r2], xm6
3044
+ movq [r3], xm7
3045
+ RET
3046
+INIT_ZMM avx512
3047
+cglobal nonPsyRdoQuant32, 5, 5, 8
3048
+ mov r4d, r4m
3049
+ lea r0, [r0 + 2 * r4]
3050
+ lea r4, [4 * r4]
3051
+ lea r1, [r1 + 2 * r4]
3052
+%if BIT_DEPTH == 12
3053
+ mov r4, [tab_nonpsyRdo12 + 24]
3054
+%elif BIT_DEPTH == 10
3055
+ mov r4, [tab_nonpsyRdo10 + 24]
3056
+%elif BIT_DEPTH == 8
3057
+ mov r4, [tab_nonpsyRdo8 + 24]
3058
+%else
3059
+ %error Unsupported BIT_DEPTH!
3060
+ %endif
3061
+ movq xm3, r4
3062
+ movq xm6, [r2]
3063
+ movq xm7, [r3]
3064
+ vpxor m4, m4
3065
+ vpxor m5, m5
3066
+
3067
+;Row 1, 2
3068
+ movq xm0, [r0]
3069
+ pinsrq xm0, [r0 + mmsize], 1
3070
+ vpmovsxwq m1, xm0
3071
+ vcvtqq2pd m2, m1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
3072
+ vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements
3073
+ vcvtpd2qq m1, m2
3074
+ vpsllq m1, xm3 ; costUncoded
3075
+ paddq m4, m1
3076
+ movu [r1], ym1
3077
+ vextracti32x8 [r1 + 4 * mmsize], m1, 1
3078
+
3079
+ ;Row 3, 4
3080
+ movq xm0, [r0 + 2 * mmsize]
3081
+ pinsrq xm0, [r0 + 3 * mmsize], 1
3082
+ vpmovsxwq m1, xm0
3083
+ vcvtqq2pd m2, m1
3084
+ vfmadd213pd m2, m2, m5
3085
+ vcvtpd2qq m1, m2
3086
+ vpsllq m1, xm3 ; costUncoded
3087
+ paddq m4, m1
3088
+ movu [r1 + 8 * mmsize], ym1
3089
+ vextracti32x8 [r1 + 12 * mmsize], m1 , 1
3090
+
3091
+ vextracti32x8 ym2, m4, 1
3092
+ paddq ym4, ym2
3093
+ vextracti32x4 xm2, m4, 1
3094
+ paddq xm4, xm2
3095
+ punpckhqdq xm2, xm4, xm5
3096
+ paddq xm4, xm2
3097
+
3098
+ paddq xm6, xm4
3099
+ paddq xm7, xm4
3100
+
3101
+ movq [r2], xm6
3102
+ movq [r3], xm7
3103
+ RET
3104
+;static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t psyScale, uint32_t blkPos)
3105
+;{
3106
+; const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
3107
+; const int scaleBits = SCALE_BITS - 2 * transformShift;
3108
+; const uint32_t trSize = 1 << log2TrSize;
3109
+; int max = X265_MAX(0, (2 * transformShift + 1));
3110
+;
3111
+; for (int y = 0; y < MLS_CG_SIZE; y++)
3112
+; {
3113
+; for (int x = 0; x < MLS_CG_SIZE; x++)
3114
+; {
3115
+; int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
3116
+; int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
3117
+;
3118
+; costUncoded[blkPos + x] = static_cast<int64_t>((double)(signCoef * signCoef) << scaleBits);
3119
+;
3120
+; /* when no residual coefficient is coded, predicted coef == recon coef */
3121
+; costUncoded[blkPos + x] -= static_cast<int64_t>((psyScale * (predictedCoef)) >> max);
3122
+;
3123
+; *totalUncodedCost += costUncoded[blkPos + x];
3124
+; *totalRdCost += costUncoded[blkPos + x];
3125
+; }
3126
+; blkPos += trSize;
3127
+; }
3128
+;}
3129
+
3130
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
3131
+; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
3132
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
3133
+INIT_ZMM avx512
3134
+cglobal psyRdoQuant4, 5, 9, 13
3135
+%if WIN64
3136
+ mov r5, r5m
3137
+%endif
3138
+ mov r6d, r6m
3139
+ vpbroadcastq m12, [r5] ; psyScale
3140
+ lea r0, [r0 + 2 * r6]
3141
+ lea r1, [r1 + 2 * r6]
3142
+ lea r6, [4 * r6]
3143
+ lea r2, [r2 + 2 * r6]
3144
+ movq xm0, [r3]
3145
+ movq xm1, [r4]
3146
+
3147
+%if BIT_DEPTH == 12
3148
+ mov r5, [tab_nonpsyRdo12] ; scaleBits
3149
+%elif BIT_DEPTH == 10
3150
+ mov r5, [tab_nonpsyRdo10]
3151
+%elif BIT_DEPTH == 8
3152
+ mov r5, [tab_nonpsyRdo8]
3153
+%else
3154
+ %error Unsupported BIT_DEPTH!
3155
+%endif
3156
+
3157
+ movq xm2, r5
3158
+ vpxor m4, m4
3159
+ vpxor m3, m3
3160
+
3161
+;Row 1, 2
3162
+ vpmovsxwq m6, [r0]
3163
+ vpmovsxwq m7, [r1]
3164
+ psubq m7, m6 ; predictedCoef
3165
+
3166
+ vcvtqq2pd m9, m6
3167
+ vfmadd213pd m9, m9, m3
3168
+ vcvtpd2qq m8, m9
3169
+ vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits
3170
+
3171
+ vcvtqq2pd m10, m7
3172
+ vcvtqq2pd m11, m12
3173
+ vfmadd213pd m10, m11, m3
3174
+ vcvtpd2qq m9, m10
3175
+ vpsraq m9, RDO_MAX_4 ;(psyScale * predictedCoef) >> max
3176
+
3177
+ psubq m8, m9
3178
+ paddq m4, m8
3179
+ movu [r2], m8
3180
+
3181
+ ;Row 3, 4
3182
+ vpmovsxwq m6, [r0 + 16]
3183
+ vpmovsxwq m7, [r1 + 16]
3184
+ psubq m7, m6 ; predictedCoef
3185
+
3186
+ vcvtqq2pd m9, m6
3187
+ vfmadd213pd m9, m9, m3
3188
+ vcvtpd2qq m8, m9
3189
+ vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits
3190
+
3191
+ vcvtqq2pd m10, m7
3192
+ vcvtqq2pd m11, m12
3193
+ vfmadd213pd m10, m11, m3
3194
+ vcvtpd2qq m9, m10
3195
+ vpsraq m9, RDO_MAX_4 ;(psyScale * predictedCoef) >> max
3196
+
3197
+ psubq m8, m9
3198
+ paddq m4, m8
3199
+ movu [r2 + 64], m8
3200
+
3201
+ vextracti32x8 ym2, m4, 1
3202
+ paddq ym4, ym2
3203
+ vextracti32x4 xm2, m4, 1
3204
+ paddq xm4, xm2
3205
+ punpckhqdq xm2, xm4, xm3
3206
+ paddq xm4, xm2
3207
+
3208
+ paddq xm0, xm4
3209
+ paddq xm1, xm4
3210
+
3211
+ movq [r3], xm0
3212
+ movq [r4], xm1
3213
+ RET
3214
+
3215
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
3216
+; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
3217
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
3218
+INIT_ZMM avx512
3219
+cglobal psyRdoQuant8, 5, 9, 15
3220
+%if WIN64
3221
+ mov r5, r5m
3222
+%endif
3223
+ mov r6d, r6m
3224
+ vpbroadcastq m12, [r5] ; psyScale
3225
+ lea r0, [r0 + 2 * r6]
3226
+ lea r1, [r1 + 2 * r6]
3227
+ lea r6, [4 * r6]
3228
+ lea r2, [r2 + 2 * r6]
3229
+ movq xm0, [r3]
3230
+ movq xm1, [r4]
3231
+
3232
+%if BIT_DEPTH == 12
3233
+ mov r5, [tab_nonpsyRdo12 + 8] ; scaleBits
3234
+%elif BIT_DEPTH == 10
3235
+ mov r5, [tab_nonpsyRdo10 + 8]
3236
+%elif BIT_DEPTH == 8
3237
+ mov r5, [tab_nonpsyRdo8 + 8]
3238
+%else
3239
+ %error Unsupported BIT_DEPTH!
3240
+%endif
3241
+
3242
+ movq xm2, r5
3243
+ vpxor m4, m4
3244
+ vpxor m3, m3
3245
+
3246
+;Row 1, 2
3247
+ movq xm13, [r0]
3248
+ movq xm14, [r1]
3249
+ pinsrq xm13, [r0 + mmsize/4], 1
3250
+ pinsrq xm14, [r1 + mmsize/4], 1
3251
+ vpmovsxwq m6, xm13
3252
+ vpmovsxwq m7, xm14
3253
+ psubq m7, m6 ; predictedCoef
3254
+
3255
+ vcvtqq2pd m9, m6
3256
+ vfmadd213pd m9, m9, m3
3257
+ vcvtpd2qq m8, m9
3258
+ vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits
3259
+
3260
+ vcvtqq2pd m10, m7
3261
+ vcvtqq2pd m11, m12
3262
+ vfmadd213pd m10, m11, m3
3263
+ vcvtpd2qq m9, m10
3264
+ vpsraq m9, RDO_MAX_8 ;(psyScale * predictedCoef) >> max
3265
+
3266
+ psubq m8, m9
3267
+ paddq m4, m8
3268
+ movu [r2], ym8
3269
+ vextracti32x8 [r2 + mmsize], m8 , 1
3270
+
3271
+ ;Row 3, 4
3272
+ movq xm13, [r0 + mmsize/2]
3273
+ movq xm14, [r1 + mmsize/2]
3274
+ pinsrq xm13, [r0 + 3 * mmsize/4], 1
3275
+ pinsrq xm14, [r1 + 3 * mmsize/4], 1
3276
+ vpmovsxwq m6, xm13
3277
+ vpmovsxwq m7, xm14
3278
+ psubq m7, m6 ; predictedCoef
3279
+
3280
+ vcvtqq2pd m9, m6
3281
+ vfmadd213pd m9, m9, m3
3282
+ vcvtpd2qq m8, m9
3283
+ vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits
3284
+
3285
+ vcvtqq2pd m10, m7
3286
+ vcvtqq2pd m11, m12
3287
+ vfmadd213pd m10, m11, m3
3288
+ vcvtpd2qq m9, m10
3289
+ vpsraq m9, RDO_MAX_8 ;(psyScale * predictedCoef) >> max
3290
+
3291
+ psubq m8, m9
3292
+ paddq m4, m8
3293
+ movu [r2 + 2 * mmsize], ym8
3294
+ vextracti32x8 [r2 + 3 * mmsize], m8 , 1
3295
+
3296
+ vextracti32x8 ym2, m4, 1
3297
+ paddq ym4, ym2
3298
+ vextracti32x4 xm2, m4, 1
3299
+ paddq xm4, xm2
3300
+ punpckhqdq xm2, xm4, xm3
3301
+ paddq xm4, xm2
3302
+
3303
+ paddq xm0, xm4
3304
+ paddq xm1, xm4
3305
+
3306
+ movq [r3], xm0
3307
+ movq [r4], xm1
3308
+ RET
3309
+
3310
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
3311
+; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
3312
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
3313
+INIT_ZMM avx512
3314
+cglobal psyRdoQuant16, 5, 9, 15
3315
+%if WIN64
3316
+ mov r5, r5m
3317
+%endif
3318
+ mov r6d, r6m
3319
+ vpbroadcastq m12, [r5] ; psyScale
3320
+ lea r0, [r0 + 2 * r6]
3321
+ lea r1, [r1 + 2 * r6]
3322
+ lea r6, [4 * r6]
3323
+ lea r2, [r2 + 2 * r6]
3324
+ movq xm0, [r3]
3325
+ movq xm1, [r4]
3326
+
3327
+%if BIT_DEPTH == 12
3328
+ mov r5, [tab_nonpsyRdo12 + 16] ; scaleBits
3329
+%elif BIT_DEPTH == 10
3330
+ mov r5, [tab_nonpsyRdo10 + 16]
3331
+%elif BIT_DEPTH == 8
3332
+ mov r5, [tab_nonpsyRdo8 + 16]
3333
+%else
3334
+ %error Unsupported BIT_DEPTH!
3335
+%endif
3336
+
3337
+ movq xm2, r5
3338
+ vpxor m4, m4
3339
+ vpxor m3, m3
3340
+
3341
+;Row 1, 2
3342
+ movq xm13, [r0]
3343
+ movq xm14, [r1]
3344
+ pinsrq xm13, [r0 + mmsize/2], 1
3345
+ pinsrq xm14, [r1 + mmsize/2], 1
3346
+ vpmovsxwq m6, xm13
3347
+ vpmovsxwq m7, xm14
3348
+ psubq m7, m6 ; predictedCoef
3349
+
3350
+ vcvtqq2pd m9, m6
3351
+ vfmadd213pd m9, m9, m3
3352
+ vcvtpd2qq m8, m9
3353
+ vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits
3354
+
3355
+ vcvtqq2pd m10, m7
3356
+ vcvtqq2pd m11, m12
3357
+ vfmadd213pd m10, m11, m3
3358
+ vcvtpd2qq m9, m10
3359
+ vpsraq m9, RDO_MAX_16 ;(psyScale * predictedCoef) >> max
3360
+
3361
+ psubq m8, m9
3362
+ paddq m4, m8
3363
+ movu [r2], ym8
3364
+ vextracti32x8 [r2 + 2 * mmsize], m8 , 1
3365
+
3366
+ ;Row 3, 4
3367
+ movq xm13, [r0 + mmsize]
3368
+ movq xm14, [r1 + mmsize]
3369
+ pinsrq xm13, [r0 + 3 * mmsize/2], 1
3370
+ pinsrq xm14, [r1 + 3 * mmsize/2], 1
3371
+ vpmovsxwq m6, xm13
3372
+ vpmovsxwq m7, xm14
3373
+ psubq m7, m6 ; predictedCoef
3374
+
3375
+ vcvtqq2pd m9, m6
3376
+ vfmadd213pd m9, m9, m3
3377
+ vcvtpd2qq m8, m9
3378
+ vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits
3379
+
3380
+ vcvtqq2pd m10, m7
3381
+ vcvtqq2pd m11, m12
3382
+ vfmadd213pd m10, m11, m3
3383
+ vcvtpd2qq m9, m10
3384
+ vpsraq m9, RDO_MAX_16 ;(psyScale * predictedCoef) >> max
3385
+
3386
+ psubq m8, m9
3387
+ paddq m4, m8
3388
+ movu [r2 + 4 * mmsize], ym8
3389
+ vextracti32x8 [r2 + 6 * mmsize], m8 , 1
3390
+
3391
+ vextracti32x8 ym2, m4, 1
3392
+ paddq ym4, ym2
3393
+ vextracti32x4 xm2, m4, 1
3394
+ paddq xm4, xm2
3395
+ punpckhqdq xm2, xm4, xm3
3396
+ paddq xm4, xm2
3397
+
3398
+ paddq xm0, xm4
3399
+ paddq xm1, xm4
3400
+
3401
+ movq [r3], xm0
3402
+ movq [r4], xm1
3403
+ RET
3404
+
3405
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
3406
+; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
3407
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
3408
+INIT_ZMM avx512
3409
+cglobal psyRdoQuant32, 5, 9, 15
3410
+%if WIN64
3411
+ mov r5, r5m
3412
+%endif
3413
+ mov r6d, r6m
3414
+ vpbroadcastq m12, [r5] ; psyScale
3415
+ lea r0, [r0 + 2 * r6]
3416
+ lea r1, [r1 + 2 * r6]
3417
+ lea r6, [4 * r6]
3418
+ lea r2, [r2 + 2 * r6]
3419
+ movq xm0, [r3]
3420
+ movq xm1, [r4]
3421
+
3422
+%if BIT_DEPTH == 12
3423
+ mov r5, [tab_nonpsyRdo12 + 24] ; scaleBits
3424
+%elif BIT_DEPTH == 10
3425
+ mov r5, [tab_nonpsyRdo10 + 24]
3426
+%elif BIT_DEPTH == 8
3427
+ mov r5, [tab_nonpsyRdo8 + 24]
3428
+%else
3429
+ %error Unsupported BIT_DEPTH!
3430
+%endif
3431
+
3432
+ movq xm2, r5
3433
+ vpxor m4, m4
3434
+ vpxor m3, m3
3435
+
3436
+;Row 1, 2
3437
+ movq xm13, [r0]
3438
+ movq xm14, [r1]
3439
+ pinsrq xm13, [r0 + mmsize], 1
3440
+ pinsrq xm14, [r1 + mmsize], 1
3441
+ vpmovsxwq m6, xm13
3442
+ vpmovsxwq m7, xm14
3443
+ psubq m7, m6 ; predictedCoef
3444
+
3445
+ vcvtqq2pd m9, m6
3446
+ vfmadd213pd m9, m9, m3
3447
+ vcvtpd2qq m8, m9
3448
+ vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits
3449
+
3450
+ vcvtqq2pd m10, m7
3451
+ vcvtqq2pd m11, m12
3452
+ vfmadd213pd m10, m11, m3
3453
+ vcvtpd2qq m9, m10
3454
+ vpsraq m9, RDO_MAX_32 ;(psyScale * predictedCoef) >> max
3455
+
3456
+ psubq m8, m9
3457
+ paddq m4, m8
3458
+ movu [r2], ym8
3459
+ vextracti32x8 [r2 + 4 * mmsize], m8 , 1
3460
+
3461
+ ;Row 3, 4
3462
+ movq xm13, [r0 + 2 * mmsize]
3463
+ movq xm14, [r1 + 2 * mmsize]
3464
+ pinsrq xm13, [r0 + 3 * mmsize], 1
3465
+ pinsrq xm14, [r1 + 3 * mmsize], 1
3466
+ vpmovsxwq m6, xm13
3467
+ vpmovsxwq m7, xm14
3468
+ psubq m7, m6 ; predictedCoef
3469
+
3470
+ vcvtqq2pd m9, m6
3471
+ vfmadd213pd m9, m9, m3
3472
+ vcvtpd2qq m8, m9
3473
+ vpsllq m8, xm2 ;(signCoef * signCoef) << scaleBits
3474
+
3475
+ vcvtqq2pd m10, m7
3476
+ vcvtqq2pd m11, m12
3477
+ vfmadd213pd m10, m11, m3
3478
+ vcvtpd2qq m9, m10
3479
+ vpsraq m9, RDO_MAX_32 ;(psyScale * predictedCoef) >> max
3480
+
3481
+ psubq m8, m9
3482
+ paddq m4, m8
3483
+ movu [r2 + 8 * mmsize], ym8
3484
+ vextracti32x8 [r2 + 12 * mmsize], m8 , 1
3485
+
3486
+ vextracti32x8 ym2, m4, 1
3487
+ paddq ym4, ym2
3488
+ vextracti32x4 xm2, m4, 1
3489
+ paddq xm4, xm2
3490
+ punpckhqdq xm2, xm4, xm3
3491
+ paddq xm4, xm2
3492
+
3493
+ paddq xm0, xm4
3494
+ paddq xm1, xm4
3495
+
3496
+ movq [r3], xm0
3497
+ movq [r4], xm1
3498
+ RET
3499
+
3500
+INIT_YMM avx2
3501
+cglobal nonPsyRdoQuant4, 5, 9, 16
3502
+ mov r4d, r4m
3503
+ lea r0, [r0 + 2 * r4]
3504
+ lea r4, [4 * r4]
3505
+ lea r1, [r1 + 2 * r4]
3506
+ movq xm0, [r2]
3507
+ movq xm1, [r3]
3508
+
3509
+%if BIT_DEPTH == 12
3510
+ mov r5, [tab_nonpsyRdo12] ; scaleBits
3511
+%elif BIT_DEPTH == 10
3512
+ mov r5, [tab_nonpsyRdo10]
3513
+%elif BIT_DEPTH == 8
3514
+ mov r5, [tab_nonpsyRdo8]
3515
+%else
3516
+ %error Unsupported BIT_DEPTH!
3517
+%endif
3518
+ movq xm2, r5
3519
+ vpxor m4, m4
3520
+ vpxor m3, m3
3521
+ vpxor m13, m13
3522
+
3523
+ vpmovsxwd m6, [r0]
3524
+ vcvtdq2pd m9, xm6
3525
+ vfmadd213pd m9, m9, m3
3526
+ vcvtpd2dq xm8, m9
3527
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
3528
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
3529
+ paddq m4, m13
3530
+ movu [r1], m13
3531
+
3532
+ vpmovsxwd m6, [r0 + 8]
3533
+ vcvtdq2pd m9, xm6
3534
+ vfmadd213pd m9, m9, m3
3535
+ vcvtpd2dq xm8, m9
3536
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
3537
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
3538
+ paddq m4, m13
3539
+ movu [r1 + 32], m13
3540
+
3541
+ vpmovsxwd m6, [r0 + 16]
3542
+ vcvtdq2pd m9, xm6
3543
+ vfmadd213pd m9, m9, m3
3544
+ vcvtpd2dq xm8, m9
3545
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
3546
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
3547
+ paddq m4, m13
3548
+ movu [r1 + 64], m13
3549
+
3550
+ vpmovsxwd m6, [r0 +24]
3551
+ vcvtdq2pd m9, xm6
3552
+ vfmadd213pd m9, m9, m3
3553
+ vcvtpd2dq xm8, m9
3554
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
3555
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
3556
+ paddq m4, m13
3557
+ movu [r1 + 96], m13
3558
+
3559
+
3560
+ vextracti128 xm2, m4, 1
3561
+ paddq xm4, xm2
3562
+ punpckhqdq xm2, xm4, xm3
3563
+ paddq xm4, xm2
3564
+
3565
+ paddq xm0, xm4
3566
+ paddq xm1, xm4
3567
+
3568
+ movq [r2], xm0
3569
+ movq [r3], xm1
3570
+ RET
3571
+
3572
+
3573
+
3574
+INIT_YMM avx2
3575
+cglobal nonPsyRdoQuant8, 5, 5, 8
3576
+ mov r4d, r4m
3577
+ lea r0, [r0 + 2 * r4]
3578
+ lea r4, [4 * r4]
3579
+ lea r1, [r1 + 2 * r4]
3580
+%if BIT_DEPTH == 12
3581
+ mov r4, [tab_nonpsyRdo12 + 8]
3582
+%elif BIT_DEPTH == 10
3583
+ mov r4, [tab_nonpsyRdo10 + 8]
3584
+%elif BIT_DEPTH == 8
3585
+ mov r4, [tab_nonpsyRdo8 + 8]
3586
+%else
3587
+ %error Unsupported BIT_DEPTH!
3588
+ %endif
3589
+ movq xm3, r4
3590
+ movq xm6, [r2]
3591
+ movq xm7, [r3]
3592
+ vpxor m4, m4
3593
+ vpxor m5, m5
3594
+ movq xm0, [r0]
3595
+ vpmovsxwd m1, xm0
3596
+ vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
3597
+ vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements
3598
+ vcvtpd2dq xm1, m2
3599
+ vpmovsxdq m0 , xm1
3600
+ vpsllq m0, xm3 ; costUncoded
3601
+ paddq m4, m0
3602
+ movu [r1], ym0
3603
+ vpxor m0, m0
3604
+ movq xm0, [r0 +mmsize/2]
3605
+ vpmovsxwd m1, xm0
3606
+ vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
3607
+ vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements
3608
+ vcvtpd2dq xm1, m2
3609
+ vpmovsxdq m0 , xm1
3610
+ vpsllq m0, xm3 ; costUncoded
3611
+ paddq m4, m0
3612
+ movu [r1 +2*mmsize], m0
3613
+ vpxor m0, m0
3614
+ movq xm0, [r0 +mmsize]
3615
+ vpmovsxwd m1, xm0
3616
+ vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
3617
+ vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements
3618
+ vcvtpd2dq xm1, m2
3619
+ vpmovsxdq m0 , xm1
3620
+ vpsllq m0, xm3 ; costUncoded
3621
+ paddq m4, m0
3622
+ movu [r1 +4*mmsize], m0
3623
+ vpxor m0, m0
3624
+ movq xm0, [r0 +3*mmsize/2]
3625
+ vpmovsxwd m1, xm0
3626
+ vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
3627
+ vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements
3628
+ vcvtpd2dq xm1, m2
3629
+ vpmovsxdq m0 , xm1
3630
+ vpsllq m0, xm3 ; costUncoded
3631
+ paddq m4, m0
3632
+ movu [r1 +6*mmsize], m0
3633
+
3634
+ vextracti128 xm2, m4, 1
3635
+ paddq xm4, xm2
3636
+ punpckhqdq xm2, xm4, xm5
3637
+ paddq xm4, xm2
3638
+
3639
+ paddq xm6, xm4
3640
+ paddq xm7, xm4
3641
+
3642
+ movq [r2], xm6
3643
+ movq [r3], xm7
3644
+ RET
3645
+INIT_YMM avx2
3646
+cglobal nonPsyRdoQuant16, 5, 5, 8
3647
+ mov r4d, r4m
3648
+ lea r0, [r0 + 2 * r4]
3649
+ lea r4, [4 * r4]
3650
+ lea r1, [r1 + 2 * r4]
3651
+%if BIT_DEPTH == 12
3652
+ mov r4, [tab_nonpsyRdo12 + 16]
3653
+%elif BIT_DEPTH == 10
3654
+ mov r4, [tab_nonpsyRdo10 + 16]
3655
+%elif BIT_DEPTH == 8
3656
+ mov r4, [tab_nonpsyRdo8 + 16]
3657
+%else
3658
+ %error Unsupported BIT_DEPTH!
3659
+ %endif
3660
+ movq xm3, r4
3661
+ movq xm6, [r2]
3662
+ movq xm7, [r3]
3663
+ vpxor m4, m4
3664
+ vpxor m5, m5
3665
+
3666
+;Row 1, 2
3667
+ movq xm0, [r0]
3668
+ vpmovsxwd m1, xm0
3669
+ vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
3670
+ vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements
3671
+ vcvtpd2dq xm1, m2
3672
+ vpmovsxdq m0 , xm1
3673
+ vpsllq m0, xm3 ; costUncoded
3674
+ paddq m4, m0
3675
+ movu [r1], ym0
3676
+
3677
+ movq xm0, [r0 +mmsize]
3678
+ vpmovsxwd m1, xm0
3679
+ vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
3680
+ vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements
3681
+ vcvtpd2dq xm1, m2
3682
+ vpmovsxdq m0 , xm1
3683
+ vpsllq m0, xm3 ; costUncoded
3684
+ paddq m4, m0
3685
+ movu [r1+4*mmsize], ym0
3686
+
3687
+ movq xm0, [r0 + 2*mmsize]
3688
+ vpmovsxwd m1, xm0
3689
+ vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
3690
+ vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements
3691
+ vcvtpd2dq xm1, m2
3692
+ vpmovsxdq m0 , xm1
3693
+ vpsllq m0, xm3 ; costUncoded
3694
+ paddq m4, m0
3695
+ movu [r1+8*mmsize], ym0
3696
+
3697
+ movq xm0, [r0 + 3*mmsize]
3698
+ vpmovsxwd m1, xm0
3699
+ vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
3700
+ vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements
3701
+ vcvtpd2dq xm1, m2
3702
+ vpmovsxdq m0 , xm1
3703
+ vpsllq m0, xm3 ; costUncoded
3704
+ paddq m4, m0
3705
+ movu [r1+12*mmsize], ym0
3706
+
3707
+
3708
+ vextracti128 xm2, m4, 1
3709
+ paddq xm4, xm2
3710
+ punpckhqdq xm2, xm4, xm5
3711
+ paddq xm4, xm2
3712
+
3713
+ paddq xm6, xm4
3714
+ paddq xm7, xm4
3715
+
3716
+ movq [r2], xm6
3717
+ movq [r3], xm7
3718
+ RET
3719
+INIT_YMM avx2
3720
+cglobal nonPsyRdoQuant32, 5, 5, 8
3721
+ mov r4d, r4m
3722
+ lea r0, [r0 + 2 * r4]
3723
+ lea r4, [4 * r4]
3724
+ lea r1, [r1 + 2 * r4]
3725
+%if BIT_DEPTH == 12
3726
+ mov r4, [tab_nonpsyRdo12 + 24]
3727
+%elif BIT_DEPTH == 10
3728
+ mov r4, [tab_nonpsyRdo10 + 24]
3729
+%elif BIT_DEPTH == 8
3730
+ mov r4, [tab_nonpsyRdo8 + 24]
3731
+%else
3732
+ %error Unsupported BIT_DEPTH!
3733
+ %endif
3734
+ movq xm3, r4
3735
+ movq xm6, [r2]
3736
+ movq xm7, [r3]
3737
+ vpxor m4, m4
3738
+ vpxor m5, m5
3739
+
3740
+ movq xm0, [r0]
3741
+ vpmovsxwd m1, xm0
3742
+ vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
3743
+ vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements
3744
+ vcvtpd2dq xm1, m2
3745
+ vpmovsxdq m0 , xm1
3746
+ vpsllq m0, xm3 ; costUncoded
3747
+ paddq m4, m0
3748
+ movu [r1], m0
3749
+ vpxor m0, m0
3750
+
3751
+ movq xm0, [r0 +2*mmsize]
3752
+ vpmovsxwd m1, xm0
3753
+ vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
3754
+ vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements
3755
+ vcvtpd2dq xm1, m2
3756
+ vpmovsxdq m0 , xm1
3757
+ vpsllq m0, xm3 ; costUncoded
3758
+ paddq m4, m0
3759
+ movu [r1 + 8*mmsize], m0
3760
+ vpxor m0, m0
3761
+
3762
+ movq xm0, [r0 +4*mmsize]
3763
+ vpmovsxwd m1, xm0
3764
+ vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
3765
+ vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements
3766
+ vcvtpd2dq xm1, m2
3767
+ vpmovsxdq m0 , xm1
3768
+ vpsllq m0, xm3 ; costUncoded
3769
+ paddq m4, m0
3770
+ movu [r1 +16*mmsize], m0
3771
+ vpxor m0, m0
3772
+
3773
+ movq xm0, [r0 +6*mmsize]
3774
+ vpmovsxwd m1, xm0
3775
+ vcvtdq2pd m2, xm1 ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
3776
+ vfmadd213pd m2, m2, m5 ; Multiply packed double-precision (64-bit) floating-point elements
3777
+ vcvtpd2dq xm1, m2
3778
+ vpmovsxdq m0 , xm1
3779
+ vpsllq m0, xm3 ; costUncoded
3780
+ paddq m4, m0
3781
+ movu [r1 +24*mmsize], m0
3782
+
3783
+ vextracti128 xm2, m4, 1
3784
+ paddq xm4, xm2
3785
+ punpckhqdq xm2, xm4, xm5
3786
+ paddq xm4, xm2
3787
+
3788
+ paddq xm6, xm4
3789
+ paddq xm7, xm4
3790
+
3791
+ movq [r2], xm6
3792
+ movq [r3], xm7
3793
+ RET
3794
+
3795
+INIT_YMM avx2
3796
+cglobal psyRdoQuant_1p4, 5, 9, 16
3797
+ mov r4d, r4m
3798
+ lea r0, [r0 + 2 * r4]
3799
+ lea r4, [4 * r4]
3800
+ lea r1, [r1 + 2 * r4]
3801
+ movq xm0, [r2]
3802
+ movq xm1, [r3]
3803
+
3804
+%if BIT_DEPTH == 12
3805
+ mov r5, [tab_nonpsyRdo12] ; scaleBits
3806
+%elif BIT_DEPTH == 10
3807
+ mov r5, [tab_nonpsyRdo10]
3808
+%elif BIT_DEPTH == 8
3809
+ mov r5, [tab_nonpsyRdo8]
3810
+%else
3811
+ %error Unsupported BIT_DEPTH!
3812
+%endif
3813
+ movq xm2, r5
3814
+ vpxor m4, m4
3815
+ vpxor m3, m3
3816
+ vpxor m13, m13
3817
+
3818
+ vpmovsxwd m6, [r0]
3819
+ vcvtdq2pd m9, xm6
3820
+ vfmadd213pd m9, m9, m3
3821
+ vcvtpd2dq xm8, m9
3822
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
3823
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
3824
+ paddq m4, m13
3825
+ movu [r1], m13
3826
+
3827
+ vpmovsxwd m6, [r0 + 8]
3828
+ vcvtdq2pd m9, xm6
3829
+ vfmadd213pd m9, m9, m3
3830
+ vcvtpd2dq xm8, m9
3831
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
3832
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
3833
+ paddq m4, m13
3834
+ movu [r1 + 32], m13
3835
+
3836
+ vpmovsxwd m6, [r0 + 16]
3837
+ vcvtdq2pd m9, xm6
3838
+ vfmadd213pd m9, m9, m3
3839
+ vcvtpd2dq xm8, m9
3840
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
3841
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
3842
+ paddq m4, m13
3843
+ movu [r1 + 64], m13
3844
+
3845
+ vpmovsxwd m6, [r0 +24]
3846
+ vcvtdq2pd m9, xm6
3847
+ vfmadd213pd m9, m9, m3
3848
+ vcvtpd2dq xm8, m9
3849
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
3850
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
3851
+ paddq m4, m13
3852
+ movu [r1 + 96], m13
3853
+
3854
+
3855
+ vextracti128 xm2, m4, 1
3856
+ paddq xm4, xm2
3857
+ punpckhqdq xm2, xm4, xm3
3858
+ paddq xm4, xm2
3859
+
3860
+ paddq xm0, xm4
3861
+ paddq xm1, xm4
3862
+
3863
+ movq [r2], xm0
3864
+ movq [r3], xm1
3865
+ RET
3866
+INIT_YMM avx2
3867
+cglobal psyRdoQuant_1p8, 7, 9, 16
3868
+ mov r4d, r4m
3869
+ lea r0, [r0 + 2 * r4]
3870
+ lea r4, [4 * r4]
3871
+ lea r1, [r1 + 2 * r4]
3872
+ movq xm0, [r2]
3873
+ movq xm1, [r3]
3874
+%if BIT_DEPTH == 12
3875
+ mov r5, [tab_nonpsyRdo12 +8] ; scaleBits
3876
+%elif BIT_DEPTH == 10
3877
+ mov r5, [tab_nonpsyRdo10 +8]
3878
+%elif BIT_DEPTH == 8
3879
+ mov r5, [tab_nonpsyRdo8 + 8 ]
3880
+%else
3881
+ %error Unsupported BIT_DEPTH!
3882
+%endif
3883
+ movq xm2, r5
3884
+ vpxor m4, m4
3885
+ vpxor m3, m3
3886
+ vpxor m13, m13
3887
+
3888
+
3889
+ vpmovsxwd m6, [r0]
3890
+ vcvtdq2pd m9, xm6
3891
+ vfmadd213pd m9, m9, m3
3892
+ vcvtpd2dq xm8, m9
3893
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
3894
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
3895
+ paddq m4, m13
3896
+ movu [r1], m13
3897
+
3898
+ vpmovsxwd m6, [r0 + 16]
3899
+ vcvtdq2pd m9, xm6
3900
+ vfmadd213pd m9, m9, m3
3901
+ vcvtpd2dq xm8, m9
3902
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
3903
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
3904
+ paddq m4, m13
3905
+ movu [r1 + 64], m13
3906
+
3907
+ vpmovsxwd m6, [r0 +32]
3908
+ vcvtdq2pd m9, xm6
3909
+ vfmadd213pd m9, m9, m3
3910
+ vcvtpd2dq xm8, m9
3911
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
3912
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
3913
+ paddq m4, m13
3914
+ movu [r1 +128], m13
3915
+
3916
+ vpmovsxwd m6, [r0 + 48]
3917
+ vcvtdq2pd m9, xm6
3918
+ vfmadd213pd m9, m9, m3
3919
+ vcvtpd2dq xm8, m9
3920
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
3921
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
3922
+ paddq m4, m13
3923
+ movu [r1 + 192], m13
3924
+
3925
+ vextracti128 xm2, m4, 1
3926
+ paddq xm4, xm2
3927
+ punpckhqdq xm2, xm4, xm3
3928
+ paddq xm4, xm2
3929
+
3930
+ paddq xm0, xm4
3931
+ paddq xm1, xm4
3932
+
3933
+ movq [r2], xm0
3934
+ movq [r3], xm1
3935
+ RET
3936
+
3937
+INIT_YMM avx2
3938
+cglobal psyRdoQuant_1p16, 7, 9, 16
3939
+ mov r4d, r4m
3940
+ lea r0, [r0 + 2 * r4]
3941
+ lea r4, [4 * r4]
3942
+ lea r1, [r1 + 2 * r4]
3943
+ movq xm0, [r2]
3944
+ movq xm1, [r3]
3945
+%if BIT_DEPTH == 12
3946
+ mov r5, [tab_nonpsyRdo12 + 16] ; scaleBits
3947
+%elif BIT_DEPTH == 10
3948
+ mov r5, [tab_nonpsyRdo10 + 16]
3949
+%elif BIT_DEPTH == 8
3950
+ mov r5, [tab_nonpsyRdo8 + 16 ]
3951
+%else
3952
+ %error Unsupported BIT_DEPTH!
3953
+%endif
3954
+ movq xm2, r5
3955
+ vpxor m4, m4
3956
+ vpxor m3, m3
3957
+ vpxor m13, m13
3958
+
3959
+ vpmovsxwd m6, [r0]
3960
+ vcvtdq2pd m9, xm6
3961
+ vfmadd213pd m9, m9, m3
3962
+ vcvtpd2dq xm8, m9
3963
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
3964
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
3965
+ paddq m4, m13
3966
+ movu [r1], m13
3967
+
3968
+ vpmovsxwd m6, [r0 + mmsize]
3969
+
3970
+ vcvtdq2pd m9, xm6
3971
+ vfmadd213pd m9, m9, m3
3972
+ vcvtpd2dq xm8, m9
3973
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
3974
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
3975
+ paddq m4, m13
3976
+ movu [r1 + 4*mmsize], m13
3977
+
3978
+ vpmovsxwd m6, [r0 + 2 * mmsize]
3979
+ vcvtdq2pd m9, xm6
3980
+ vfmadd213pd m9, m9, m3
3981
+ vcvtpd2dq xm8, m9
3982
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
3983
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
3984
+ paddq m4, m13
3985
+ movu [r1 + 8*mmsize], m13
3986
+
3987
+ vpmovsxwd m6, [r0 + 3 * mmsize]
3988
+ vcvtdq2pd m9, xm6
3989
+ vfmadd213pd m9, m9, m3
3990
+ vcvtpd2dq xm8, m9
3991
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
3992
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
3993
+ paddq m4, m13
3994
+ movu [r1 + 12*mmsize], m13
3995
+
3996
+ vextracti128 xm2, m4, 1
3997
+ paddq xm4, xm2
3998
+ punpckhqdq xm2, xm4, xm3
3999
+ paddq xm4, xm2
4000
+
4001
+ paddq xm0, xm4
4002
+ paddq xm1, xm4
4003
+
4004
+ movq [r2], xm0
4005
+ movq [r3], xm1
4006
+ RET
4007
+
4008
+INIT_YMM avx2
4009
+cglobal psyRdoQuant_1p32, 7, 9, 16
4010
+ mov r4d, r4m
4011
+ lea r0, [r0 + 2 * r4]
4012
+ lea r4, [4 * r4]
4013
+ lea r1, [r1 + 2 * r4]
4014
+ movq xm0, [r2]
4015
+ movq xm1, [r3]
4016
+%if BIT_DEPTH == 12
4017
+ mov r5, [tab_nonpsyRdo12 + 24] ; scaleBits
4018
+%elif BIT_DEPTH == 10
4019
+ mov r5, [tab_nonpsyRdo10 + 24]
4020
+%elif BIT_DEPTH == 8
4021
+ mov r5, [tab_nonpsyRdo8 + 24]
4022
+%else
4023
+ %error Unsupported BIT_DEPTH!
4024
+%endif
4025
+ movq xm2, r5
4026
+ vpxor m4, m4
4027
+ vpxor m3, m3
4028
+ vpxor m13, m13
4029
+
4030
+
4031
+ vpmovsxwd m6, [r0]
4032
+ vcvtdq2pd m9, xm6
4033
+ vfmadd213pd m9, m9, m3
4034
+ vcvtpd2dq xm8, m9
4035
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
4036
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
4037
+ paddq m4, m13
4038
+ movu [r1], m13
4039
+
4040
+ vpmovsxwd m6, [r0 + 2 * mmsize]
4041
+ vcvtdq2pd m9, xm6
4042
+ vfmadd213pd m9, m9, m3
4043
+ vcvtpd2dq xm8, m9
4044
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
4045
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
4046
+ paddq m4, m13
4047
+ movu [r1 + 8 * mmsize], m13
4048
+
4049
+ vpmovsxwd m6, [r0 + 4 * mmsize]
4050
+ vcvtdq2pd m9, xm6
4051
+ vfmadd213pd m9, m9, m3
4052
+ vcvtpd2dq xm8, m9
4053
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
4054
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
4055
+ paddq m4, m13
4056
+ movu [r1 + 16 * mmsize], m13
4057
+
4058
+ vpmovsxwd m6, [r0 + 6 * mmsize]
4059
+ vcvtdq2pd m9, xm6
4060
+ vfmadd213pd m9, m9, m3
4061
+ vcvtpd2dq xm8, m9
4062
+ vpmovsxdq m13, xm8 ; 32 bit int to 64 bit int
4063
+ vpsllq m13, xm2 ;(signCoef * signCoef) << scaleBits
4064
+ paddq m4, m13
4065
+ movu [r1 + 24 *mmsize], m13
4066
+
4067
+ vextracti128 xm2, m4, 1
4068
+ paddq xm4, xm2
4069
+ punpckhqdq xm2, xm4, xm3
4070
+ paddq xm4, xm2
4071
+
4072
+ paddq xm0, xm4
4073
+ paddq xm1, xm4
4074
+
4075
+ movq [r2], xm0
4076
+ movq [r3], xm1
4077
+ RET
4078
+
4079
%endif
4080
x265_2.7.tar.gz/source/common/x86/dct8.h -> x265_2.9.tar.gz/source/common/x86/dct8.h
Changed
26
1
2
FUNCDEF_TU_S2(void, idct, ssse3, const int16_t* src, int16_t* dst, intptr_t dstStride);
3
FUNCDEF_TU_S2(void, idct, sse4, const int16_t* src, int16_t* dst, intptr_t dstStride);
4
FUNCDEF_TU_S2(void, idct, avx2, const int16_t* src, int16_t* dst, intptr_t dstStride);
5
+FUNCDEF_TU_S2(void, nonPsyRdoQuant, avx512, int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
6
+FUNCDEF_TU_S2(void, psyRdoQuant, avx512, int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, int64_t* costUncoded, int64_t* totalUncodedCost, int64_t* totalRdCost, int64_t *psyScale, uint32_t blkPos);
7
+FUNCDEF_TU_S2(void, nonPsyRdoQuant, avx2, int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
8
+FUNCDEF_TU_S2(void, psyRdoQuant_1p, avx2, int16_t* m_resiDctCoeff, int64_t* costUncoded, int64_t* totalUncodedCost, int64_t* totalRdCost, uint32_t blkPos);
9
+FUNCDEF_TU_S2(void, psyRdoQuant_2p, avx2, int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, int64_t* costUncoded, int64_t* totalUncodedCost, int64_t* totalRdCost, int64_t *psyScale, uint32_t blkPos);
10
11
void PFX(dst4_ssse3)(const int16_t* src, int16_t* dst, intptr_t srcStride);
12
void PFX(dst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
13
14
void PFX(idst4_avx2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
15
void PFX(denoise_dct_sse4)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
16
void PFX(denoise_dct_avx2)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
17
-
18
+void PFX(denoise_dct_avx512)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
19
+void PFX(dct8_avx512)(const int16_t* src, int16_t* dst, intptr_t srcStride);
20
+void PFX(idct8_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride);
21
+void PFX(idct16_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride);
22
+void PFX(idct32_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride);
23
+void PFX(dct32_avx512)(const int16_t* src, int16_t* dst, intptr_t srcStride);
24
+void PFX(dct16_avx512)(const int16_t* src, int16_t* dst, intptr_t srcStride);
25
#endif // ifndef X265_DCT8_H
26
x265_2.7.tar.gz/source/common/x86/h-ipfilter16.asm -> x265_2.9.tar.gz/source/common/x86/h-ipfilter16.asm
Changed
1589
1
2
3
h_pd_524800: times 8 dd 524800
4
5
-tab_LumaCoeff: dw 0, 0, 0, 64, 0, 0, 0, 0
6
+h_tab_LumaCoeff: dw 0, 0, 0, 64, 0, 0, 0, 0
7
dw -1, 4, -10, 58, 17, -5, 1, 0
8
dw -1, 4, -11, 40, 40, -11, 4, -1
9
dw 0, 1, -5, 17, 58, -10, 4, -1
10
11
db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
12
13
const interp8_hpp_shuf_new, db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
14
- db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
15
-
16
+ db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
17
+
18
+ALIGN 64
19
+interp8_hpp_shuf1_load_avx512: times 4 db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
20
+interp8_hpp_shuf2_load_avx512: times 4 db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
21
+interp8_hpp_shuf1_store_avx512: times 4 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
22
+
23
SECTION .text
24
cextern pd_8
25
cextern pd_32
26
27
add r3d, r3d
28
29
%ifdef PIC
30
- lea r6, [tab_LumaCoeff]
31
+ lea r6, [h_tab_LumaCoeff]
32
mova m0, [r6 + r4]
33
%else
34
- mova m0, [tab_LumaCoeff + r4]
35
+ mova m0, [h_tab_LumaCoeff + r4]
36
%endif
37
38
%ifidn %3, pp
39
40
;------------------------------------------------------------------------------------------------------------
41
; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
42
;------------------------------------------------------------------------------------------------------------
43
- FILTER_HOR_LUMA_sse2 4, 4, pp
44
+%if ARCH_X86_64
45
+ FILTER_HOR_LUMA_sse2 4, 4, pp
46
FILTER_HOR_LUMA_sse2 4, 8, pp
47
FILTER_HOR_LUMA_sse2 4, 16, pp
48
FILTER_HOR_LUMA_sse2 8, 4, pp
49
50
FILTER_HOR_LUMA_sse2 64, 32, ps
51
FILTER_HOR_LUMA_sse2 64, 48, ps
52
FILTER_HOR_LUMA_sse2 64, 64, ps
53
+%endif
54
55
;-----------------------------------------------------------------------------
56
; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
57
58
add r3, r3
59
60
%ifdef PIC
61
- lea r6, [tab_LumaCoeff]
62
+ lea r6, [h_tab_LumaCoeff]
63
mova m0, [r6 + r4]
64
%else
65
- mova m0, [tab_LumaCoeff + r4]
66
+ mova m0, [h_tab_LumaCoeff + r4]
67
%endif
68
69
%ifidn %3, pp
70
71
shl r4d, 4
72
73
%ifdef PIC
74
- lea r6, [tab_LumaCoeff]
75
+ lea r6, [h_tab_LumaCoeff]
76
mova m0, [r6 + r4]
77
%else
78
- mova m0, [tab_LumaCoeff + r4]
79
+ mova m0, [h_tab_LumaCoeff + r4]
80
%endif
81
82
%ifidn %3, pp
83
84
shl r4d, 4
85
86
%ifdef PIC
87
- lea r6, [tab_LumaCoeff]
88
+ lea r6, [h_tab_LumaCoeff]
89
mova m0, [r6 + r4]
90
%else
91
- mova m0, [tab_LumaCoeff + r4]
92
+ mova m0, [h_tab_LumaCoeff + r4]
93
%endif
94
%ifidn %3, pp
95
mova m1, [INTERP_OFFSET_PP]
96
97
shl r4d, 4
98
99
%ifdef PIC
100
- lea r6, [tab_LumaCoeff]
101
+ lea r6, [h_tab_LumaCoeff]
102
mova m0, [r6 + r4]
103
%else
104
- mova m0, [tab_LumaCoeff + r4]
105
+ mova m0, [h_tab_LumaCoeff + r4]
106
%endif
107
108
%ifidn %3, pp
109
110
shl r4d, 4
111
112
%ifdef PIC
113
- lea r6, [tab_LumaCoeff]
114
+ lea r6, [h_tab_LumaCoeff]
115
mova m0, [r6 + r4]
116
%else
117
- mova m0, [tab_LumaCoeff + r4]
118
+ mova m0, [h_tab_LumaCoeff + r4]
119
%endif
120
%ifidn %3, pp
121
mova m1, [pd_32]
122
123
mov r4d, r4m
124
shl r4d, 4
125
%ifdef PIC
126
- lea r5, [tab_LumaCoeff]
127
+ lea r5, [h_tab_LumaCoeff]
128
vpbroadcastq m0, [r5 + r4]
129
vpbroadcastq m1, [r5 + r4 + 8]
130
%else
131
- vpbroadcastq m0, [tab_LumaCoeff + r4]
132
- vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]
133
+ vpbroadcastq m0, [h_tab_LumaCoeff + r4]
134
+ vpbroadcastq m1, [h_tab_LumaCoeff + r4 + 8]
135
%endif
136
lea r6, [pw_pixel_max]
137
mova m3, [interp8_hpp_shuf]
138
139
;-------------------------------------------------------------------------------------------------------------
140
; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
141
;-------------------------------------------------------------------------------------------------------------
142
-%macro FILTER_HOR_LUMA_W8 1
143
+%macro PROCESS_IPFILTER_LUMA_PP_8x2_AVX2 0
144
+ movu xm7, [r0]
145
+ movu xm8, [r0 + 8]
146
+ vinserti128 m7, m7, [r0 + r1], 1
147
+ vinserti128 m8, m8, [r0 + r1 + 8], 1
148
+ pshufb m10, m7, m14
149
+ pshufb m7, m13
150
+ pshufb m11, m8, m14
151
+ pshufb m8, m13
152
+
153
+ pmaddwd m7, m0
154
+ pmaddwd m10, m1
155
+ paddd m7, m10
156
+ pmaddwd m10, m11, m3
157
+ pmaddwd m9, m8, m2
158
+ paddd m10, m9
159
+ paddd m7, m10
160
+ paddd m7, m4
161
+ psrad m7, INTERP_SHIFT_PP
162
+
163
+ movu xm9, [r0 + 16]
164
+ vinserti128 m9, m9, [r0 + r1 + 16], 1
165
+ pshufb m10, m9, m14
166
+ pshufb m9, m13
167
+ pmaddwd m8, m0
168
+ pmaddwd m11, m1
169
+ paddd m8, m11
170
+ pmaddwd m10, m3
171
+ pmaddwd m9, m2
172
+ paddd m9, m10
173
+ paddd m8, m9
174
+ paddd m8, m4
175
+ psrad m8, INTERP_SHIFT_PP
176
+
177
+ packusdw m7, m8
178
+ pshufb m7, m12
179
+ CLIPW m7, m5, m6
180
+ movu [r2], xm7
181
+ vextracti128 [r2 + r3], m7, 1
182
+%endmacro
183
+
184
+%macro IPFILTER_LUMA_AVX2_8xN 1
185
INIT_YMM avx2
186
-cglobal interp_8tap_horiz_pp_8x%1, 4,6,8
187
- add r1d, r1d
188
- add r3d, r3d
189
- sub r0, 6
190
- mov r4d, r4m
191
- shl r4d, 4
192
+cglobal interp_8tap_horiz_pp_8x%1, 5,6,15
193
+ shl r1d, 1
194
+ shl r3d, 1
195
+ sub r0, 6
196
+ mov r4d, r4m
197
+ shl r4d, 4
198
+
199
%ifdef PIC
200
- lea r5, [tab_LumaCoeff]
201
- vpbroadcastq m0, [r5 + r4]
202
- vpbroadcastq m1, [r5 + r4 + 8]
203
+ lea r5, [h_tab_LumaCoeff]
204
+ vpbroadcastd m0, [r5 + r4]
205
+ vpbroadcastd m1, [r5 + r4 + 4]
206
+ vpbroadcastd m2, [r5 + r4 + 8]
207
+ vpbroadcastd m3, [r5 + r4 + 12]
208
%else
209
- vpbroadcastq m0, [tab_LumaCoeff + r4]
210
- vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]
211
-%endif
212
- mova m3, [interp8_hpp_shuf]
213
- mova m7, [pd_32]
214
- pxor m2, m2
215
-
216
- ; register map
217
- ; m0 , m1 interpolate coeff
218
-
219
- mov r4d, %1/2
220
-
221
-.loop:
222
- vbroadcasti128 m4, [r0]
223
- vbroadcasti128 m5, [r0 + 8]
224
- pshufb m4, m3
225
- pshufb m5, m3
226
-
227
- pmaddwd m4, m0
228
- pmaddwd m5, m1
229
- paddd m4, m5
230
-
231
- vbroadcasti128 m5, [r0 + 8]
232
- vbroadcasti128 m6, [r0 + 16]
233
- pshufb m5, m3
234
- pshufb m6, m3
235
-
236
- pmaddwd m5, m0
237
- pmaddwd m6, m1
238
- paddd m5, m6
239
-
240
- phaddd m4, m5
241
- vpermq m4, m4, q3120
242
- paddd m4, m7
243
- psrad m4, INTERP_SHIFT_PP
244
-
245
- packusdw m4, m4
246
- vpermq m4, m4, q2020
247
- CLIPW m4, m2, [pw_pixel_max]
248
- movu [r2], xm4
249
-
250
- vbroadcasti128 m4, [r0 + r1]
251
- vbroadcasti128 m5, [r0 + r1 + 8]
252
- pshufb m4, m3
253
- pshufb m5, m3
254
-
255
- pmaddwd m4, m0
256
- pmaddwd m5, m1
257
- paddd m4, m5
258
-
259
- vbroadcasti128 m5, [r0 + r1 + 8]
260
- vbroadcasti128 m6, [r0 + r1 + 16]
261
- pshufb m5, m3
262
- pshufb m6, m3
263
-
264
- pmaddwd m5, m0
265
- pmaddwd m6, m1
266
- paddd m5, m6
267
-
268
- phaddd m4, m5
269
- vpermq m4, m4, q3120
270
- paddd m4, m7
271
- psrad m4, INTERP_SHIFT_PP
272
-
273
- packusdw m4, m4
274
- vpermq m4, m4, q2020
275
- CLIPW m4, m2, [pw_pixel_max]
276
- movu [r2 + r3], xm4
277
-
278
- lea r2, [r2 + 2 * r3]
279
- lea r0, [r0 + 2 * r1]
280
- dec r4d
281
- jnz .loop
282
+ vpbroadcastd m0, [h_tab_LumaCoeff + r4]
283
+ vpbroadcastd m1, [h_tab_LumaCoeff + r4 + 4]
284
+ vpbroadcastd m2, [h_tab_LumaCoeff + r4 + 8]
285
+ vpbroadcastd m3, [h_tab_LumaCoeff + r4 + 12]
286
+%endif
287
+ mova m13, [interp8_hpp_shuf1_load_avx512]
288
+ mova m14, [interp8_hpp_shuf2_load_avx512]
289
+ mova m12, [interp8_hpp_shuf1_store_avx512]
290
+ mova m4, [pd_32]
291
+ pxor m5, m5
292
+ mova m6, [pw_pixel_max]
293
+
294
+%rep %1/2 - 1
295
+ PROCESS_IPFILTER_LUMA_PP_8x2_AVX2
296
+ lea r0, [r0 + 2 * r1]
297
+ lea r2, [r2 + 2 * r3]
298
+%endrep
299
+ PROCESS_IPFILTER_LUMA_PP_8x2_AVX2
300
RET
301
%endmacro
302
-FILTER_HOR_LUMA_W8 4
303
-FILTER_HOR_LUMA_W8 8
304
-FILTER_HOR_LUMA_W8 16
305
-FILTER_HOR_LUMA_W8 32
306
+
307
+%if ARCH_X86_64
308
+ IPFILTER_LUMA_AVX2_8xN 4
309
+ IPFILTER_LUMA_AVX2_8xN 8
310
+ IPFILTER_LUMA_AVX2_8xN 16
311
+ IPFILTER_LUMA_AVX2_8xN 32
312
+%endif
313
314
;-------------------------------------------------------------------------------------------------------------
315
; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
316
;-------------------------------------------------------------------------------------------------------------
317
-%macro FILTER_HOR_LUMA_W16 1
318
+%macro PROCESS_IPFILTER_LUMA_PP_16x1_AVX2 0
319
+ movu m7, [r0]
320
+ movu m8, [r0 + 8]
321
+
322
+ pshufb m10, m7, m14
323
+ pshufb m7, m13
324
+ pshufb m11, m8, m14
325
+ pshufb m8, m13
326
+
327
+ pmaddwd m7, m0
328
+ pmaddwd m10, m1
329
+ paddd m7, m10
330
+ pmaddwd m10, m11, m3
331
+ pmaddwd m9, m8, m2
332
+ paddd m10, m9
333
+ paddd m7, m10
334
+ paddd m7, m4
335
+ psrad m7, INTERP_SHIFT_PP
336
+
337
+ movu m9, [r0 + 16]
338
+ pshufb m10, m9, m14
339
+ pshufb m9, m13
340
+ pmaddwd m8, m0
341
+ pmaddwd m11, m1
342
+ paddd m8, m11
343
+ pmaddwd m10, m3
344
+ pmaddwd m9, m2
345
+ paddd m9, m10
346
+ paddd m8, m9
347
+ paddd m8, m4
348
+ psrad m8, INTERP_SHIFT_PP
349
+
350
+ packusdw m7, m8
351
+ pshufb m7, m12
352
+ CLIPW m7, m5, m6
353
+ movu [r2], m7
354
+%endmacro
355
+
356
+%macro IPFILTER_LUMA_AVX2_16xN 1
357
INIT_YMM avx2
358
-cglobal interp_8tap_horiz_pp_16x%1, 4,6,8
359
- add r1d, r1d
360
- add r3d, r3d
361
- sub r0, 6
362
- mov r4d, r4m
363
- shl r4d, 4
364
+cglobal interp_8tap_horiz_pp_16x%1, 5,6,15
365
+ shl r1d, 1
366
+ shl r3d, 1
367
+ sub r0, 6
368
+ mov r4d, r4m
369
+ shl r4d, 4
370
+
371
%ifdef PIC
372
- lea r5, [tab_LumaCoeff]
373
- vpbroadcastq m0, [r5 + r4]
374
- vpbroadcastq m1, [r5 + r4 + 8]
375
+ lea r5, [h_tab_LumaCoeff]
376
+ vpbroadcastd m0, [r5 + r4]
377
+ vpbroadcastd m1, [r5 + r4 + 4]
378
+ vpbroadcastd m2, [r5 + r4 + 8]
379
+ vpbroadcastd m3, [r5 + r4 + 12]
380
%else
381
- vpbroadcastq m0, [tab_LumaCoeff + r4]
382
- vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]
383
-%endif
384
- mova m3, [interp8_hpp_shuf]
385
- mova m7, [pd_32]
386
- pxor m2, m2
387
-
388
- ; register map
389
- ; m0 , m1 interpolate coeff
390
-
391
- mov r4d, %1
392
-
393
-.loop:
394
- vbroadcasti128 m4, [r0]
395
- vbroadcasti128 m5, [r0 + 8]
396
- pshufb m4, m3
397
- pshufb m5, m3
398
-
399
- pmaddwd m4, m0
400
- pmaddwd m5, m1
401
- paddd m4, m5
402
-
403
- vbroadcasti128 m5, [r0 + 8]
404
- vbroadcasti128 m6, [r0 + 16]
405
- pshufb m5, m3
406
- pshufb m6, m3
407
-
408
- pmaddwd m5, m0
409
- pmaddwd m6, m1
410
- paddd m5, m6
411
-
412
- phaddd m4, m5
413
- vpermq m4, m4, q3120
414
- paddd m4, m7
415
- psrad m4, INTERP_SHIFT_PP
416
-
417
- packusdw m4, m4
418
- vpermq m4, m4, q2020
419
- CLIPW m4, m2, [pw_pixel_max]
420
- movu [r2], xm4
421
-
422
- vbroadcasti128 m4, [r0 + 16]
423
- vbroadcasti128 m5, [r0 + 24]
424
- pshufb m4, m3
425
- pshufb m5, m3
426
-
427
- pmaddwd m4, m0
428
- pmaddwd m5, m1
429
- paddd m4, m5
430
-
431
- vbroadcasti128 m5, [r0 + 24]
432
- vbroadcasti128 m6, [r0 + 32]
433
- pshufb m5, m3
434
- pshufb m6, m3
435
-
436
- pmaddwd m5, m0
437
- pmaddwd m6, m1
438
- paddd m5, m6
439
-
440
- phaddd m4, m5
441
- vpermq m4, m4, q3120
442
- paddd m4, m7
443
- psrad m4, INTERP_SHIFT_PP
444
-
445
- packusdw m4, m4
446
- vpermq m4, m4, q2020
447
- CLIPW m4, m2, [pw_pixel_max]
448
- movu [r2 + 16], xm4
449
-
450
- add r2, r3
451
- add r0, r1
452
- dec r4d
453
- jnz .loop
454
+ vpbroadcastd m0, [h_tab_LumaCoeff + r4]
455
+ vpbroadcastd m1, [h_tab_LumaCoeff + r4 + 4]
456
+ vpbroadcastd m2, [h_tab_LumaCoeff + r4 + 8]
457
+ vpbroadcastd m3, [h_tab_LumaCoeff + r4 + 12]
458
+%endif
459
+ mova m13, [interp8_hpp_shuf1_load_avx512]
460
+ mova m14, [interp8_hpp_shuf2_load_avx512]
461
+ mova m12, [interp8_hpp_shuf1_store_avx512]
462
+ mova m4, [pd_32]
463
+ pxor m5, m5
464
+ mova m6, [pw_pixel_max]
465
+
466
+%rep %1 - 1
467
+ PROCESS_IPFILTER_LUMA_PP_16x1_AVX2
468
+ lea r0, [r0 + r1]
469
+ lea r2, [r2 + r3]
470
+%endrep
471
+ PROCESS_IPFILTER_LUMA_PP_16x1_AVX2
472
RET
473
%endmacro
474
-FILTER_HOR_LUMA_W16 4
475
-FILTER_HOR_LUMA_W16 8
476
-FILTER_HOR_LUMA_W16 12
477
-FILTER_HOR_LUMA_W16 16
478
-FILTER_HOR_LUMA_W16 32
479
-FILTER_HOR_LUMA_W16 64
480
+
481
+%if ARCH_X86_64
482
+ IPFILTER_LUMA_AVX2_16xN 4
483
+ IPFILTER_LUMA_AVX2_16xN 8
484
+ IPFILTER_LUMA_AVX2_16xN 12
485
+ IPFILTER_LUMA_AVX2_16xN 16
486
+ IPFILTER_LUMA_AVX2_16xN 32
487
+ IPFILTER_LUMA_AVX2_16xN 64
488
+%endif
489
490
;-------------------------------------------------------------------------------------------------------------
491
; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
492
;-------------------------------------------------------------------------------------------------------------
493
-%macro FILTER_HOR_LUMA_W32 2
494
+%macro PROCESS_IPFILTER_LUMA_PP_32x1_AVX2 0
495
+ PROCESS_IPFILTER_LUMA_PP_16x1_AVX2
496
+
497
+ movu m7, [r0 + mmsize]
498
+ movu m8, [r0 + 8 + mmsize]
499
+
500
+ pshufb m10, m7, m14
501
+ pshufb m7, m13
502
+ pshufb m11, m8, m14
503
+ pshufb m8, m13
504
+
505
+ pmaddwd m7, m0
506
+ pmaddwd m10, m1
507
+ paddd m7, m10
508
+ pmaddwd m10, m11, m3
509
+ pmaddwd m9, m8, m2
510
+ paddd m10, m9
511
+ paddd m7, m10
512
+ paddd m7, m4
513
+ psrad m7, INTERP_SHIFT_PP
514
+
515
+ movu m9, [r0 + 16 + mmsize]
516
+ pshufb m10, m9, m14
517
+ pshufb m9, m13
518
+ pmaddwd m8, m0
519
+ pmaddwd m11, m1
520
+ paddd m8, m11
521
+ pmaddwd m10, m3
522
+ pmaddwd m9, m2
523
+ paddd m9, m10
524
+ paddd m8, m9
525
+ paddd m8, m4
526
+ psrad m8, INTERP_SHIFT_PP
527
+
528
+ packusdw m7, m8
529
+ pshufb m7, m12
530
+ CLIPW m7, m5, m6
531
+ movu [r2 + mmsize], m7
532
+%endmacro
533
+
534
+%macro IPFILTER_LUMA_AVX2_32xN 1
535
INIT_YMM avx2
536
-cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
537
- add r1d, r1d
538
- add r3d, r3d
539
- sub r0, 6
540
- mov r4d, r4m
541
- shl r4d, 4
542
+cglobal interp_8tap_horiz_pp_32x%1, 5,6,15
543
+ shl r1d, 1
544
+ shl r3d, 1
545
+ sub r0, 6
546
+ mov r4d, r4m
547
+ shl r4d, 4
548
+
549
%ifdef PIC
550
- lea r5, [tab_LumaCoeff]
551
- vpbroadcastq m0, [r5 + r4]
552
- vpbroadcastq m1, [r5 + r4 + 8]
553
+ lea r5, [h_tab_LumaCoeff]
554
+ vpbroadcastd m0, [r5 + r4]
555
+ vpbroadcastd m1, [r5 + r4 + 4]
556
+ vpbroadcastd m2, [r5 + r4 + 8]
557
+ vpbroadcastd m3, [r5 + r4 + 12]
558
%else
559
- vpbroadcastq m0, [tab_LumaCoeff + r4]
560
- vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]
561
-%endif
562
- mova m3, [interp8_hpp_shuf]
563
- mova m7, [pd_32]
564
- pxor m2, m2
565
-
566
- ; register map
567
- ; m0 , m1 interpolate coeff
568
-
569
- mov r4d, %2
570
-
571
-.loop:
572
-%assign x 0
573
-%rep %1/16
574
- vbroadcasti128 m4, [r0 + x]
575
- vbroadcasti128 m5, [r0 + 8 + x]
576
- pshufb m4, m3
577
- pshufb m5, m3
578
-
579
- pmaddwd m4, m0
580
- pmaddwd m5, m1
581
- paddd m4, m5
582
-
583
- vbroadcasti128 m5, [r0 + 8 + x]
584
- vbroadcasti128 m6, [r0 + 16 + x]
585
- pshufb m5, m3
586
- pshufb m6, m3
587
-
588
- pmaddwd m5, m0
589
- pmaddwd m6, m1
590
- paddd m5, m6
591
-
592
- phaddd m4, m5
593
- vpermq m4, m4, q3120
594
- paddd m4, m7
595
- psrad m4, INTERP_SHIFT_PP
596
-
597
- packusdw m4, m4
598
- vpermq m4, m4, q2020
599
- CLIPW m4, m2, [pw_pixel_max]
600
- movu [r2 + x], xm4
601
-
602
- vbroadcasti128 m4, [r0 + 16 + x]
603
- vbroadcasti128 m5, [r0 + 24 + x]
604
- pshufb m4, m3
605
- pshufb m5, m3
606
-
607
- pmaddwd m4, m0
608
- pmaddwd m5, m1
609
- paddd m4, m5
610
-
611
- vbroadcasti128 m5, [r0 + 24 + x]
612
- vbroadcasti128 m6, [r0 + 32 + x]
613
- pshufb m5, m3
614
- pshufb m6, m3
615
-
616
- pmaddwd m5, m0
617
- pmaddwd m6, m1
618
- paddd m5, m6
619
-
620
- phaddd m4, m5
621
- vpermq m4, m4, q3120
622
- paddd m4, m7
623
- psrad m4, INTERP_SHIFT_PP
624
-
625
- packusdw m4, m4
626
- vpermq m4, m4, q2020
627
- CLIPW m4, m2, [pw_pixel_max]
628
- movu [r2 + 16 + x], xm4
629
+ vpbroadcastd m0, [h_tab_LumaCoeff + r4]
630
+ vpbroadcastd m1, [h_tab_LumaCoeff + r4 + 4]
631
+ vpbroadcastd m2, [h_tab_LumaCoeff + r4 + 8]
632
+ vpbroadcastd m3, [h_tab_LumaCoeff + r4 + 12]
633
+%endif
634
+ mova m13, [interp8_hpp_shuf1_load_avx512]
635
+ mova m14, [interp8_hpp_shuf2_load_avx512]
636
+ mova m12, [interp8_hpp_shuf1_store_avx512]
637
+ mova m4, [pd_32]
638
+ pxor m5, m5
639
+ mova m6, [pw_pixel_max]
640
+
641
+%rep %1 - 1
642
+ PROCESS_IPFILTER_LUMA_PP_32x1_AVX2
643
+ lea r0, [r0 + r1]
644
+ lea r2, [r2 + r3]
645
+%endrep
646
+ PROCESS_IPFILTER_LUMA_PP_32x1_AVX2
647
+ RET
648
+%endmacro
649
650
+%if ARCH_X86_64
651
+ IPFILTER_LUMA_AVX2_32xN 8
652
+ IPFILTER_LUMA_AVX2_32xN 16
653
+ IPFILTER_LUMA_AVX2_32xN 24
654
+ IPFILTER_LUMA_AVX2_32xN 32
655
+ IPFILTER_LUMA_AVX2_32xN 64
656
+%endif
657
+
658
+%macro PROCESS_IPFILTER_LUMA_PP_64x1_AVX2 0
659
+ PROCESS_IPFILTER_LUMA_PP_16x1_AVX2
660
+%assign x 32
661
+%rep 3
662
+ movu m7, [r0 + x]
663
+ movu m8, [r0 + 8 + x]
664
+
665
+ pshufb m10, m7, m14
666
+ pshufb m7, m13
667
+ pshufb m11, m8, m14
668
+ pshufb m8, m13
669
+
670
+ pmaddwd m7, m0
671
+ pmaddwd m10, m1
672
+ paddd m7, m10
673
+ pmaddwd m10, m11, m3
674
+ pmaddwd m9, m8, m2
675
+ paddd m10, m9
676
+ paddd m7, m10
677
+ paddd m7, m4
678
+ psrad m7, INTERP_SHIFT_PP
679
+
680
+ movu m9, [r0 + 16 + x]
681
+ pshufb m10, m9, m14
682
+ pshufb m9, m13
683
+ pmaddwd m8, m0
684
+ pmaddwd m11, m1
685
+ paddd m8, m11
686
+ pmaddwd m10, m3
687
+ pmaddwd m9, m2
688
+ paddd m9, m10
689
+ paddd m8, m9
690
+ paddd m8, m4
691
+ psrad m8, INTERP_SHIFT_PP
692
+
693
+ packusdw m7, m8
694
+ pshufb m7, m12
695
+ CLIPW m7, m5, m6
696
+ movu [r2 + x], m7
697
%assign x x+32
698
%endrep
699
+%endmacro
700
701
- add r2, r3
702
- add r0, r1
703
- dec r4d
704
- jnz .loop
705
+%macro IPFILTER_LUMA_AVX2_64xN 1
706
+INIT_YMM avx2
707
+cglobal interp_8tap_horiz_pp_64x%1, 5,6,15
708
+ shl r1d, 1
709
+ shl r3d, 1
710
+ sub r0, 6
711
+ mov r4d, r4m
712
+ shl r4d, 4
713
+
714
+%ifdef PIC
715
+ lea r5, [h_tab_LumaCoeff]
716
+ vpbroadcastd m0, [r5 + r4]
717
+ vpbroadcastd m1, [r5 + r4 + 4]
718
+ vpbroadcastd m2, [r5 + r4 + 8]
719
+ vpbroadcastd m3, [r5 + r4 + 12]
720
+%else
721
+ vpbroadcastd m0, [h_tab_LumaCoeff + r4]
722
+ vpbroadcastd m1, [h_tab_LumaCoeff + r4 + 4]
723
+ vpbroadcastd m2, [h_tab_LumaCoeff + r4 + 8]
724
+ vpbroadcastd m3, [h_tab_LumaCoeff + r4 + 12]
725
+%endif
726
+ mova m13, [interp8_hpp_shuf1_load_avx512]
727
+ mova m14, [interp8_hpp_shuf2_load_avx512]
728
+ mova m12, [interp8_hpp_shuf1_store_avx512]
729
+ mova m4, [pd_32]
730
+ pxor m5, m5
731
+ mova m6, [pw_pixel_max]
732
+
733
+%rep %1 - 1
734
+ PROCESS_IPFILTER_LUMA_PP_64x1_AVX2
735
+ lea r0, [r0 + r1]
736
+ lea r2, [r2 + r3]
737
+%endrep
738
+ PROCESS_IPFILTER_LUMA_PP_64x1_AVX2
739
RET
740
%endmacro
741
-FILTER_HOR_LUMA_W32 32, 8
742
-FILTER_HOR_LUMA_W32 32, 16
743
-FILTER_HOR_LUMA_W32 32, 24
744
-FILTER_HOR_LUMA_W32 32, 32
745
-FILTER_HOR_LUMA_W32 32, 64
746
-FILTER_HOR_LUMA_W32 64, 16
747
-FILTER_HOR_LUMA_W32 64, 32
748
-FILTER_HOR_LUMA_W32 64, 48
749
-FILTER_HOR_LUMA_W32 64, 64
750
+
751
+%if ARCH_X86_64
752
+ IPFILTER_LUMA_AVX2_64xN 16
753
+ IPFILTER_LUMA_AVX2_64xN 32
754
+ IPFILTER_LUMA_AVX2_64xN 48
755
+ IPFILTER_LUMA_AVX2_64xN 64
756
+%endif
757
758
;-------------------------------------------------------------------------------------------------------------
759
; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
760
761
mov r4d, r4m
762
shl r4d, 4
763
%ifdef PIC
764
- lea r5, [tab_LumaCoeff]
765
+ lea r5, [h_tab_LumaCoeff]
766
vpbroadcastq m0, [r5 + r4]
767
vpbroadcastq m1, [r5 + r4 + 8]
768
%else
769
- vpbroadcastq m0, [tab_LumaCoeff + r4]
770
- vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]
771
+ vpbroadcastq m0, [h_tab_LumaCoeff + r4]
772
+ vpbroadcastq m1, [h_tab_LumaCoeff + r4 + 8]
773
%endif
774
mova m3, [interp8_hpp_shuf]
775
mova m7, [pd_32]
776
777
mov r4d, r4m
778
shl r4d, 4
779
%ifdef PIC
780
- lea r5, [tab_LumaCoeff]
781
+ lea r5, [h_tab_LumaCoeff]
782
vpbroadcastq m0, [r5 + r4]
783
vpbroadcastq m1, [r5 + r4 + 8]
784
%else
785
- vpbroadcastq m0, [tab_LumaCoeff + r4]
786
- vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]
787
+ vpbroadcastq m0, [h_tab_LumaCoeff + r4]
788
+ vpbroadcastq m1, [h_tab_LumaCoeff + r4 + 8]
789
%endif
790
mova m3, [interp8_hpp_shuf]
791
mova m7, [pd_32]
792
793
;-------------------------------------------------------------------------------------------------------------
794
; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
795
;-------------------------------------------------------------------------------------------------------------
796
+%macro PROCESS_IPFILTER_LUMA_PP_48x1_AVX2 0
797
+ PROCESS_IPFILTER_LUMA_PP_32x1_AVX2
798
+
799
+ movu m7, [r0 + 2 * mmsize]
800
+ movu m8, [r0 + 8 + 2 * mmsize]
801
+
802
+ pshufb m10, m7, m14
803
+ pshufb m7, m13
804
+ pshufb m11, m8, m14
805
+ pshufb m8, m13
806
+
807
+ pmaddwd m7, m0
808
+ pmaddwd m10, m1
809
+ paddd m7, m10
810
+ pmaddwd m10, m11, m3
811
+ pmaddwd m9, m8, m2
812
+ paddd m10, m9
813
+ paddd m7, m10
814
+ paddd m7, m4
815
+ psrad m7, INTERP_SHIFT_PP
816
+
817
+ movu m9, [r0 + 16 + 2 * mmsize]
818
+ pshufb m10, m9, m14
819
+ pshufb m9, m13
820
+ pmaddwd m8, m0
821
+ pmaddwd m11, m1
822
+ paddd m8, m11
823
+ pmaddwd m10, m3
824
+ pmaddwd m9, m2
825
+ paddd m9, m10
826
+ paddd m8, m9
827
+ paddd m8, m4
828
+ psrad m8, INTERP_SHIFT_PP
829
+
830
+ packusdw m7, m8
831
+ pshufb m7, m12
832
+ CLIPW m7, m5, m6
833
+ movu [r2 + 2 * mmsize], m7
834
+%endmacro
835
+
836
+%if ARCH_X86_64
837
INIT_YMM avx2
838
-cglobal interp_8tap_horiz_pp_48x64, 4,6,8
839
- add r1d, r1d
840
- add r3d, r3d
841
- sub r0, 6
842
- mov r4d, r4m
843
- shl r4d, 4
844
+cglobal interp_8tap_horiz_pp_48x64, 5,6,15
845
+ shl r1d, 1
846
+ shl r3d, 1
847
+ sub r0, 6
848
+ mov r4d, r4m
849
+ shl r4d, 4
850
+
851
%ifdef PIC
852
- lea r5, [tab_LumaCoeff]
853
- vpbroadcastq m0, [r5 + r4]
854
- vpbroadcastq m1, [r5 + r4 + 8]
855
+ lea r5, [h_tab_LumaCoeff]
856
+ vpbroadcastd m0, [r5 + r4]
857
+ vpbroadcastd m1, [r5 + r4 + 4]
858
+ vpbroadcastd m2, [r5 + r4 + 8]
859
+ vpbroadcastd m3, [r5 + r4 + 12]
860
%else
861
- vpbroadcastq m0, [tab_LumaCoeff + r4]
862
- vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]
863
-%endif
864
- mova m3, [interp8_hpp_shuf]
865
- mova m7, [pd_32]
866
- pxor m2, m2
867
-
868
- ; register map
869
- ; m0 , m1 interpolate coeff
870
-
871
- mov r4d, 64
872
-
873
-.loop:
874
-%assign x 0
875
-%rep 2
876
- vbroadcasti128 m4, [r0 + x]
877
- vbroadcasti128 m5, [r0 + 8 + x]
878
- pshufb m4, m3
879
- pshufb m5, m3
880
-
881
- pmaddwd m4, m0
882
- pmaddwd m5, m1
883
- paddd m4, m5
884
-
885
- vbroadcasti128 m5, [r0 + 8 + x]
886
- vbroadcasti128 m6, [r0 + 16 + x]
887
- pshufb m5, m3
888
- pshufb m6, m3
889
-
890
- pmaddwd m5, m0
891
- pmaddwd m6, m1
892
- paddd m5, m6
893
-
894
- phaddd m4, m5
895
- vpermq m4, m4, q3120
896
- paddd m4, m7
897
- psrad m4, INTERP_SHIFT_PP
898
-
899
- packusdw m4, m4
900
- vpermq m4, m4, q2020
901
- CLIPW m4, m2, [pw_pixel_max]
902
- movu [r2 + x], xm4
903
-
904
- vbroadcasti128 m4, [r0 + 16 + x]
905
- vbroadcasti128 m5, [r0 + 24 + x]
906
- pshufb m4, m3
907
- pshufb m5, m3
908
-
909
- pmaddwd m4, m0
910
- pmaddwd m5, m1
911
- paddd m4, m5
912
-
913
- vbroadcasti128 m5, [r0 + 24 + x]
914
- vbroadcasti128 m6, [r0 + 32 + x]
915
- pshufb m5, m3
916
- pshufb m6, m3
917
-
918
- pmaddwd m5, m0
919
- pmaddwd m6, m1
920
- paddd m5, m6
921
-
922
- phaddd m4, m5
923
- vpermq m4, m4, q3120
924
- paddd m4, m7
925
- psrad m4, INTERP_SHIFT_PP
926
-
927
- packusdw m4, m4
928
- vpermq m4, m4, q2020
929
- CLIPW m4, m2, [pw_pixel_max]
930
- movu [r2 + 16 + x], xm4
931
-
932
- vbroadcasti128 m4, [r0 + 32 + x]
933
- vbroadcasti128 m5, [r0 + 40 + x]
934
- pshufb m4, m3
935
- pshufb m5, m3
936
-
937
- pmaddwd m4, m0
938
- pmaddwd m5, m1
939
- paddd m4, m5
940
-
941
- vbroadcasti128 m5, [r0 + 40 + x]
942
- vbroadcasti128 m6, [r0 + 48 + x]
943
- pshufb m5, m3
944
- pshufb m6, m3
945
-
946
- pmaddwd m5, m0
947
- pmaddwd m6, m1
948
- paddd m5, m6
949
-
950
- phaddd m4, m5
951
- vpermq m4, m4, q3120
952
- paddd m4, m7
953
- psrad m4, INTERP_SHIFT_PP
954
-
955
- packusdw m4, m4
956
- vpermq m4, m4, q2020
957
- CLIPW m4, m2, [pw_pixel_max]
958
- movu [r2 + 32 + x], xm4
959
-
960
-%assign x x+48
961
+ vpbroadcastd m0, [h_tab_LumaCoeff + r4]
962
+ vpbroadcastd m1, [h_tab_LumaCoeff + r4 + 4]
963
+ vpbroadcastd m2, [h_tab_LumaCoeff + r4 + 8]
964
+ vpbroadcastd m3, [h_tab_LumaCoeff + r4 + 12]
965
+%endif
966
+ mova m13, [interp8_hpp_shuf1_load_avx512]
967
+ mova m14, [interp8_hpp_shuf2_load_avx512]
968
+ mova m12, [interp8_hpp_shuf1_store_avx512]
969
+ mova m4, [pd_32]
970
+ pxor m5, m5
971
+ mova m6, [pw_pixel_max]
972
+
973
+%rep 63
974
+ PROCESS_IPFILTER_LUMA_PP_48x1_AVX2
975
+ lea r0, [r0 + r1]
976
+ lea r2, [r2 + r3]
977
%endrep
978
-
979
- add r2, r3
980
- add r0, r1
981
- dec r4d
982
- jnz .loop
983
+ PROCESS_IPFILTER_LUMA_PP_48x1_AVX2
984
RET
985
+%endif
986
987
;-----------------------------------------------------------------------------------------------------------------------------
988
;void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
989
990
add r3d, r3d
991
992
%ifdef PIC
993
- lea r6, [tab_LumaCoeff]
994
+ lea r6, [h_tab_LumaCoeff]
995
lea r4, [r4 * 8]
996
vbroadcasti128 m0, [r6 + r4 * 2]
997
%else
998
lea r4, [r4 * 8]
999
- vbroadcasti128 m0, [tab_LumaCoeff + r4 * 2]
1000
+ vbroadcasti128 m0, [h_tab_LumaCoeff + r4 * 2]
1001
%endif
1002
1003
vbroadcasti128 m2, [INTERP_OFFSET_PS]
1004
1005
IPFILTER_LUMA_PS_4xN_AVX2 8
1006
IPFILTER_LUMA_PS_4xN_AVX2 16
1007
1008
+ %macro PROCESS_IPFILTER_LUMA_PS_8x1_AVX2 1
1009
+
1010
+ %assign x 0
1011
+ %rep %1/8
1012
+ vbroadcasti128 m4, [r0 + x]
1013
+ vbroadcasti128 m5, [r0 + 8+ x]
1014
+ pshufb m4, m3
1015
+ pshufb m7, m5, m3
1016
+ pmaddwd m4, m0
1017
+ pmaddwd m7, m1
1018
+ paddd m4, m7
1019
+
1020
+ vbroadcasti128 m6, [r0 + 16 + x]
1021
+ pshufb m5, m3
1022
+ pshufb m6, m3
1023
+ pmaddwd m5, m0
1024
+ pmaddwd m6, m1
1025
+ paddd m5, m6
1026
+
1027
+ phaddd m4, m5
1028
+ vpermq m4, m4, q3120
1029
+ paddd m4, m2
1030
+ vextracti128 xm5,m4, 1
1031
+ psrad xm4, INTERP_SHIFT_PS
1032
+ psrad xm5, INTERP_SHIFT_PS
1033
+ packssdw xm4, xm5
1034
+ movu [r2 + x], xm4
1035
+ %assign x x+16
1036
+ %endrep
1037
+ %endmacro
1038
+
1039
%macro IPFILTER_LUMA_PS_8xN_AVX2 1
1040
INIT_YMM avx2
1041
%if ARCH_X86_64 == 1
1042
cglobal interp_8tap_horiz_ps_8x%1, 4, 6, 8
1043
- add r1d, r1d
1044
- add r3d, r3d
1045
+ shl r1d, 1
1046
+ shl r3d, 1
1047
mov r4d, r4m
1048
mov r5d, r5m
1049
shl r4d, 4
1050
%ifdef PIC
1051
- lea r6, [tab_LumaCoeff]
1052
+ lea r6, [h_tab_LumaCoeff]
1053
vpbroadcastq m0, [r6 + r4]
1054
vpbroadcastq m1, [r6 + r4 + 8]
1055
%else
1056
- vpbroadcastq m0, [tab_LumaCoeff + r4]
1057
- vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]
1058
+ vpbroadcastq m0, [h_tab_LumaCoeff + r4]
1059
+ vpbroadcastq m1, [h_tab_LumaCoeff + r4 + 8]
1060
%endif
1061
mova m3, [interp8_hpp_shuf]
1062
vbroadcasti128 m2, [INTERP_OFFSET_PS]
1063
1064
add r4d, 7
1065
1066
.loop0:
1067
- vbroadcasti128 m4, [r0]
1068
- vbroadcasti128 m5, [r0 + 8]
1069
- pshufb m4, m3
1070
- pshufb m7, m5, m3
1071
- pmaddwd m4, m0
1072
- pmaddwd m7, m1
1073
- paddd m4, m7
1074
-
1075
- vbroadcasti128 m6, [r0 + 16]
1076
- pshufb m5, m3
1077
- pshufb m6, m3
1078
- pmaddwd m5, m0
1079
- pmaddwd m6, m1
1080
- paddd m5, m6
1081
-
1082
- phaddd m4, m5
1083
- vpermq m4, m4, q3120
1084
- paddd m4, m2
1085
- vextracti128 xm5,m4, 1
1086
- psrad xm4, INTERP_SHIFT_PS
1087
- psrad xm5, INTERP_SHIFT_PS
1088
- packssdw xm4, xm5
1089
-
1090
- movu [r2], xm4
1091
+ PROCESS_IPFILTER_LUMA_PS_8x1_AVX2 8
1092
add r2, r3
1093
add r0, r1
1094
dec r4d
1095
1096
mov r5d, r5m
1097
shl r4d, 4
1098
%ifdef PIC
1099
- lea r6, [tab_LumaCoeff]
1100
+ lea r6, [h_tab_LumaCoeff]
1101
vpbroadcastq m0, [r6 + r4]
1102
vpbroadcastq m1, [r6 + r4 + 8]
1103
%else
1104
- vpbroadcastq m0, [tab_LumaCoeff + r4]
1105
- vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]
1106
+ vpbroadcastq m0, [h_tab_LumaCoeff + r4]
1107
+ vpbroadcastq m1, [h_tab_LumaCoeff + r4 + 8]
1108
%endif
1109
mova m3, [interp8_hpp_shuf]
1110
vbroadcasti128 m2, [INTERP_OFFSET_PS]
1111
1112
sub r0, r6
1113
add r4d, 7
1114
1115
+
1116
.loop0:
1117
-%assign x 0
1118
-%rep 24/8
1119
- vbroadcasti128 m4, [r0 + x]
1120
- vbroadcasti128 m5, [r0 + 8 + x]
1121
- pshufb m4, m3
1122
- pshufb m7, m5, m3
1123
- pmaddwd m4, m0
1124
- pmaddwd m7, m1
1125
- paddd m4, m7
1126
+ PROCESS_IPFILTER_LUMA_PS_8x1_AVX2 24
1127
+ add r2, r3
1128
+ add r0, r1
1129
+ dec r4d
1130
+ jnz .loop0
1131
+ RET
1132
+%endif
1133
1134
- vbroadcasti128 m6, [r0 + 16 + x]
1135
- pshufb m5, m3
1136
- pshufb m6, m3
1137
- pmaddwd m5, m0
1138
- pmaddwd m6, m1
1139
- paddd m5, m6
1140
1141
- phaddd m4, m5
1142
- vpermq m4, m4, q3120
1143
- paddd m4, m2
1144
- vextracti128 xm5,m4, 1
1145
- psrad xm4, INTERP_SHIFT_PS
1146
- psrad xm5, INTERP_SHIFT_PS
1147
- packssdw xm4, xm5
1148
+%macro PROCESS_IPFILTER_LUMA_PS_16x1_AVX2 0
1149
+ movu m7, [r0]
1150
+ movu m8, [r0 + 8]
1151
+ pshufb m10, m7, m14
1152
+ pshufb m7, m13
1153
+ pshufb m11, m8, m14
1154
+ pshufb m8, m13
1155
+
1156
+ pmaddwd m7, m0
1157
+ pmaddwd m10, m1
1158
+ paddd m7, m10
1159
+ pmaddwd m10, m11, m3
1160
+ pmaddwd m9, m8, m2
1161
+ paddd m10, m9
1162
+ paddd m7, m10
1163
+ paddd m7, m4
1164
+ psrad m7, INTERP_SHIFT_PS
1165
+ movu m9, [r0 + 16]
1166
+ pshufb m10, m9, m14
1167
+ pshufb m9, m13
1168
+ pmaddwd m8, m0
1169
+ pmaddwd m11, m1
1170
+ paddd m8, m11
1171
+ pmaddwd m10, m3
1172
+ pmaddwd m9, m2
1173
+ paddd m9, m10
1174
+ paddd m8, m9
1175
+ paddd m8, m4
1176
+ psrad m8, INTERP_SHIFT_PS
1177
+ packssdw m7, m8
1178
+ pshufb m7, m12
1179
+ movu [r2], m7
1180
+%endmacro
1181
1182
- movu [r2 + x], xm4
1183
- %assign x x+16
1184
- %endrep
1185
+%macro IPFILTER_LUMA_PS_16xN_AVX2 1
1186
+INIT_YMM avx2
1187
+%if ARCH_X86_64 == 1
1188
+cglobal interp_8tap_horiz_ps_16x%1, 5, 6, 15
1189
1190
- add r2, r3
1191
- add r0, r1
1192
+ shl r1d, 1
1193
+ shl r3d, 1
1194
+ mov r4d, r4m
1195
+ mov r5d, r5m
1196
+ shl r4d, 4
1197
+%ifdef PIC
1198
+ lea r6, [h_tab_LumaCoeff]
1199
+ vpbroadcastd m0, [r6 + r4]
1200
+ vpbroadcastd m1, [r6 + r4 + 4]
1201
+ vpbroadcastd m2, [r6 + r4 + 8]
1202
+ vpbroadcastd m3, [r6 + r4 + 12]
1203
+%else
1204
+ vpbroadcastd m0, [h_tab_LumaCoeff + r4]
1205
+ vpbroadcastd m1, [h_tab_LumaCoeff + r4 + 4]
1206
+ vpbroadcastd m2, [h_tab_LumaCoeff + r4 + 8]
1207
+ vpbroadcastd m3, [h_tab_LumaCoeff + r4 + 12]
1208
+%endif
1209
+ mova m13, [interp8_hpp_shuf1_load_avx512]
1210
+ mova m14, [interp8_hpp_shuf2_load_avx512]
1211
+ mova m12, [interp8_hpp_shuf1_store_avx512]
1212
+ vbroadcasti128 m4, [INTERP_OFFSET_PS]
1213
+
1214
+ ; register map
1215
+ ; m0 , m1 interpolate coeff
1216
+
1217
+ sub r0, 6
1218
+ test r5d, r5d
1219
+ mov r4d, %1
1220
+ jz .loop0
1221
+ lea r6, [r1*3]
1222
+ sub r0, r6
1223
+ add r4d, 7
1224
+
1225
+.loop0:
1226
+
1227
+ PROCESS_IPFILTER_LUMA_PS_16x1_AVX2
1228
+ lea r0, [r0 + r1]
1229
+ lea r2, [r2 + r3]
1230
+ ;add r2, r3
1231
+ ;add r0, r1
1232
dec r4d
1233
jnz .loop0
1234
RET
1235
%endif
1236
-%macro IPFILTER_LUMA_PS_32_64_AVX2 2
1237
+%endmacro
1238
+
1239
+ IPFILTER_LUMA_PS_16xN_AVX2 4
1240
+ IPFILTER_LUMA_PS_16xN_AVX2 8
1241
+ IPFILTER_LUMA_PS_16xN_AVX2 12
1242
+ IPFILTER_LUMA_PS_16xN_AVX2 16
1243
+ IPFILTER_LUMA_PS_16xN_AVX2 32
1244
+ IPFILTER_LUMA_PS_16xN_AVX2 64
1245
+%macro PROCESS_IPFILTER_LUMA_PS_32x1_AVX2 0
1246
+ PROCESS_IPFILTER_LUMA_PS_16x1_AVX2
1247
+ movu m7, [r0 + mmsize]
1248
+ movu m8, [r0 + 8+ mmsize]
1249
+ pshufb m10, m7, m14
1250
+ pshufb m7, m13
1251
+ pshufb m11, m8, m14
1252
+ pshufb m8, m13
1253
+
1254
+ pmaddwd m7, m0
1255
+ pmaddwd m10, m1
1256
+ paddd m7, m10
1257
+ pmaddwd m10, m11, m3
1258
+ pmaddwd m9, m8, m2
1259
+ paddd m10, m9
1260
+ paddd m7, m10
1261
+ paddd m7, m4
1262
+ psrad m7, INTERP_SHIFT_PS
1263
+ movu m9, [r0 + 16+ mmsize]
1264
+ pshufb m10, m9, m14
1265
+ pshufb m9, m13
1266
+ pmaddwd m8, m0
1267
+ pmaddwd m11, m1
1268
+ paddd m8, m11
1269
+ pmaddwd m10, m3
1270
+ pmaddwd m9, m2
1271
+ paddd m9, m10
1272
+ paddd m8, m9
1273
+ paddd m8, m4
1274
+ psrad m8, INTERP_SHIFT_PS
1275
+ packssdw m7, m8
1276
+ pshufb m7, m12
1277
+ movu [r2+ mmsize], m7
1278
+%endmacro
1279
+
1280
+%macro IPFILTER_LUMA_PS_32xN_AVX2 1
1281
+INIT_YMM avx2
1282
+%if ARCH_X86_64
1283
+cglobal interp_8tap_horiz_ps_32x%1, 5, 6, 15
1284
+
1285
+ shl r1d, 1
1286
+ shl r3d, 1
1287
+ mov r4d, r4m
1288
+ mov r5d, r5m
1289
+ shl r4d, 4
1290
+%ifdef PIC
1291
+ lea r6, [h_tab_LumaCoeff]
1292
+ vpbroadcastd m0, [r6 + r4]
1293
+ vpbroadcastd m1, [r6 + r4 + 4]
1294
+ vpbroadcastd m2, [r6 + r4 + 8]
1295
+ vpbroadcastd m3, [r6 + r4 + 12]
1296
+%else
1297
+ vpbroadcastd m0, [h_tab_LumaCoeff + r4]
1298
+ vpbroadcastd m1, [h_tab_LumaCoeff + r4 + 4]
1299
+ vpbroadcastd m2, [h_tab_LumaCoeff + r4 + 8]
1300
+ vpbroadcastd m3, [h_tab_LumaCoeff + r4 + 12]
1301
+%endif
1302
+ mova m13, [interp8_hpp_shuf1_load_avx512]
1303
+ mova m14, [interp8_hpp_shuf2_load_avx512]
1304
+ mova m12, [interp8_hpp_shuf1_store_avx512]
1305
+ vbroadcasti128 m4, [INTERP_OFFSET_PS]
1306
+
1307
+ ; register map
1308
+ ; m0 , m1 interpolate coeff
1309
+
1310
+ sub r0, 6
1311
+ test r5d, r5d
1312
+ mov r4d, %1
1313
+ jz .loop0
1314
+ lea r6, [r1*3]
1315
+ sub r0, r6
1316
+ add r4d, 7
1317
+
1318
+.loop0:
1319
+ PROCESS_IPFILTER_LUMA_PS_32x1_AVX2
1320
+ lea r0, [r0 + r1]
1321
+ lea r2, [r2 + r3]
1322
+ ;add r2, r3
1323
+ ;add r0, r1
1324
+ dec r4d
1325
+ jnz .loop0
1326
+ RET
1327
+%endif
1328
+%endmacro
1329
+
1330
+ IPFILTER_LUMA_PS_32xN_AVX2 8
1331
+ IPFILTER_LUMA_PS_32xN_AVX2 16
1332
+ IPFILTER_LUMA_PS_32xN_AVX2 24
1333
+ IPFILTER_LUMA_PS_32xN_AVX2 32
1334
+ IPFILTER_LUMA_PS_32xN_AVX2 64
1335
+
1336
+%macro PROCESS_IPFILTER_LUMA_PS_64x1_AVX2 0
1337
+ PROCESS_IPFILTER_LUMA_PS_16x1_AVX2
1338
+%assign x 32
1339
+%rep 3
1340
+ movu m7, [r0 + x]
1341
+ movu m8, [r0 + 8+ x]
1342
+ pshufb m10, m7, m14
1343
+ pshufb m7, m13
1344
+ pshufb m11, m8, m14
1345
+ pshufb m8, m13
1346
+
1347
+ pmaddwd m7, m0
1348
+ pmaddwd m10, m1
1349
+ paddd m7, m10
1350
+ pmaddwd m10, m11, m3
1351
+ pmaddwd m9, m8, m2
1352
+ paddd m10, m9
1353
+ paddd m7, m10
1354
+ paddd m7, m4
1355
+ psrad m7, INTERP_SHIFT_PS
1356
+ movu m9, [r0 + 16+ x]
1357
+ pshufb m10, m9, m14
1358
+ pshufb m9, m13
1359
+ pmaddwd m8, m0
1360
+ pmaddwd m11, m1
1361
+ paddd m8, m11
1362
+ pmaddwd m10, m3
1363
+ pmaddwd m9, m2
1364
+ paddd m9, m10
1365
+ paddd m8, m9
1366
+ paddd m8, m4
1367
+ psrad m8, INTERP_SHIFT_PS
1368
+ packssdw m7, m8
1369
+ pshufb m7, m12
1370
+ movu [r2+ x], m7
1371
+%assign x x+32
1372
+%endrep
1373
+%endmacro
1374
+
1375
+%macro IPFILTER_LUMA_PS_64xN_AVX2 1
1376
+INIT_YMM avx2
1377
+%if ARCH_X86_64
1378
+cglobal interp_8tap_horiz_ps_64x%1, 5, 6, 15
1379
+
1380
+ shl r1d, 1
1381
+ shl r3d, 1
1382
+ mov r4d, r4m
1383
+ mov r5d, r5m
1384
+ shl r4d, 4
1385
+%ifdef PIC
1386
+ lea r6, [h_tab_LumaCoeff]
1387
+ vpbroadcastd m0, [r6 + r4]
1388
+ vpbroadcastd m1, [r6 + r4 + 4]
1389
+ vpbroadcastd m2, [r6 + r4 + 8]
1390
+ vpbroadcastd m3, [r6 + r4 + 12]
1391
+%else
1392
+ vpbroadcastd m0, [h_tab_LumaCoeff + r4]
1393
+ vpbroadcastd m1, [h_tab_LumaCoeff + r4 + 4]
1394
+ vpbroadcastd m2, [h_tab_LumaCoeff + r4 + 8]
1395
+ vpbroadcastd m3, [h_tab_LumaCoeff + r4 + 12]
1396
+%endif
1397
+ mova m13, [interp8_hpp_shuf1_load_avx512]
1398
+ mova m14, [interp8_hpp_shuf2_load_avx512]
1399
+ mova m12, [interp8_hpp_shuf1_store_avx512]
1400
+ vbroadcasti128 m4, [INTERP_OFFSET_PS]
1401
+
1402
+ ; register map
1403
+ ; m0 , m1 interpolate coeff
1404
+
1405
+ sub r0, 6
1406
+ test r5d, r5d
1407
+ mov r4d, %1
1408
+ jz .loop0
1409
+ lea r6, [r1*3]
1410
+ sub r0, r6
1411
+ add r4d, 7
1412
+
1413
+.loop0:
1414
+ PROCESS_IPFILTER_LUMA_PS_64x1_AVX2
1415
+ lea r0, [r0 + r1]
1416
+ lea r2, [r2 + r3]
1417
+ ;add r2, r3
1418
+ ;add r0, r1
1419
+ dec r4d
1420
+ jnz .loop0
1421
+ RET
1422
+%endif
1423
+%endmacro
1424
+
1425
+ IPFILTER_LUMA_PS_64xN_AVX2 16
1426
+ IPFILTER_LUMA_PS_64xN_AVX2 32
1427
+ IPFILTER_LUMA_PS_64xN_AVX2 48
1428
+ IPFILTER_LUMA_PS_64xN_AVX2 64
1429
+
1430
+%macro IPFILTER_LUMA_PS_48xN_AVX2 1
1431
INIT_YMM avx2
1432
%if ARCH_X86_64 == 1
1433
-cglobal interp_8tap_horiz_ps_%1x%2, 4, 6, 8
1434
+cglobal interp_8tap_horiz_ps_48x%1, 5, 9,15
1435
1436
add r1d, r1d
1437
add r3d, r3d
1438
1439
1440
sub r0, 6
1441
test r5d, r5d
1442
- mov r4d, %2
1443
+ mov r4d, %1
1444
jz .loop0
1445
lea r6, [r1*3]
1446
sub r0, r6
1447
1448
1449
.loop0:
1450
%assign x 0
1451
-%rep %1/16
1452
+%rep 3
1453
vbroadcasti128 m4, [r0 + x]
1454
vbroadcasti128 m5, [r0 + 4 * SIZEOF_PIXEL + x]
1455
pshufb m4, m3
1456
1457
RET
1458
%endif
1459
%endmacro
1460
-
1461
- IPFILTER_LUMA_PS_32_64_AVX2 32, 8
1462
- IPFILTER_LUMA_PS_32_64_AVX2 32, 16
1463
- IPFILTER_LUMA_PS_32_64_AVX2 32, 24
1464
- IPFILTER_LUMA_PS_32_64_AVX2 32, 32
1465
- IPFILTER_LUMA_PS_32_64_AVX2 32, 64
1466
-
1467
- IPFILTER_LUMA_PS_32_64_AVX2 64, 16
1468
- IPFILTER_LUMA_PS_32_64_AVX2 64, 32
1469
- IPFILTER_LUMA_PS_32_64_AVX2 64, 48
1470
- IPFILTER_LUMA_PS_32_64_AVX2 64, 64
1471
-
1472
- IPFILTER_LUMA_PS_32_64_AVX2 48, 64
1473
-
1474
-%macro IPFILTER_LUMA_PS_16xN_AVX2 1
1475
-INIT_YMM avx2
1476
-%if ARCH_X86_64 == 1
1477
-cglobal interp_8tap_horiz_ps_16x%1, 4, 6, 8
1478
-
1479
- add r1d, r1d
1480
- add r3d, r3d
1481
- mov r4d, r4m
1482
- mov r5d, r5m
1483
- shl r4d, 4
1484
-%ifdef PIC
1485
- lea r6, [tab_LumaCoeff]
1486
- vpbroadcastq m0, [r6 + r4]
1487
- vpbroadcastq m1, [r6 + r4 + 8]
1488
-%else
1489
- vpbroadcastq m0, [tab_LumaCoeff + r4]
1490
- vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]
1491
-%endif
1492
- mova m3, [interp8_hpp_shuf]
1493
- vbroadcasti128 m2, [INTERP_OFFSET_PS]
1494
-
1495
- ; register map
1496
- ; m0 , m1 interpolate coeff
1497
-
1498
- sub r0, 6
1499
- test r5d, r5d
1500
- mov r4d, %1
1501
- jz .loop0
1502
- lea r6, [r1*3]
1503
- sub r0, r6
1504
- add r4d, 7
1505
-
1506
-.loop0:
1507
- vbroadcasti128 m4, [r0]
1508
- vbroadcasti128 m5, [r0 + 8]
1509
- pshufb m4, m3
1510
- pshufb m7, m5, m3
1511
- pmaddwd m4, m0
1512
- pmaddwd m7, m1
1513
- paddd m4, m7
1514
-
1515
- vbroadcasti128 m6, [r0 + 16]
1516
- pshufb m5, m3
1517
- pshufb m7, m6, m3
1518
- pmaddwd m5, m0
1519
- pmaddwd m7, m1
1520
- paddd m5, m7
1521
-
1522
- phaddd m4, m5
1523
- vpermq m4, m4, q3120
1524
- paddd m4, m2
1525
- vextracti128 xm5, m4, 1
1526
- psrad xm4, INTERP_SHIFT_PS
1527
- psrad xm5, INTERP_SHIFT_PS
1528
- packssdw xm4, xm5
1529
- movu [r2], xm4
1530
-
1531
- vbroadcasti128 m5, [r0 + 24]
1532
- pshufb m6, m3
1533
- pshufb m7, m5, m3
1534
- pmaddwd m6, m0
1535
- pmaddwd m7, m1
1536
- paddd m6, m7
1537
-
1538
- vbroadcasti128 m7, [r0 + 32]
1539
- pshufb m5, m3
1540
- pshufb m7, m3
1541
- pmaddwd m5, m0
1542
- pmaddwd m7, m1
1543
- paddd m5, m7
1544
-
1545
- phaddd m6, m5
1546
- vpermq m6, m6, q3120
1547
- paddd m6, m2
1548
- vextracti128 xm5,m6, 1
1549
- psrad xm6, INTERP_SHIFT_PS
1550
- psrad xm5, INTERP_SHIFT_PS
1551
- packssdw xm6, xm5
1552
- movu [r2 + 16], xm6
1553
-
1554
- add r2, r3
1555
- add r0, r1
1556
- dec r4d
1557
- jnz .loop0
1558
- RET
1559
-%endif
1560
-%endmacro
1561
-
1562
- IPFILTER_LUMA_PS_16xN_AVX2 4
1563
- IPFILTER_LUMA_PS_16xN_AVX2 8
1564
- IPFILTER_LUMA_PS_16xN_AVX2 12
1565
- IPFILTER_LUMA_PS_16xN_AVX2 16
1566
- IPFILTER_LUMA_PS_16xN_AVX2 32
1567
- IPFILTER_LUMA_PS_16xN_AVX2 64
1568
-
1569
+ IPFILTER_LUMA_PS_48xN_AVX2 64
1570
INIT_YMM avx2
1571
%if ARCH_X86_64 == 1
1572
cglobal interp_8tap_horiz_ps_12x16, 4, 6, 8
1573
1574
mov r5d, r5m
1575
shl r4d, 4
1576
%ifdef PIC
1577
- lea r6, [tab_LumaCoeff]
1578
+ lea r6, [h_tab_LumaCoeff]
1579
vpbroadcastq m0, [r6 + r4]
1580
vpbroadcastq m1, [r6 + r4 + 8]
1581
%else
1582
- vpbroadcastq m0, [tab_LumaCoeff + r4]
1583
- vpbroadcastq m1, [tab_LumaCoeff + r4 + 8]
1584
+ vpbroadcastq m0, [h_tab_LumaCoeff + r4]
1585
+ vpbroadcastq m1, [h_tab_LumaCoeff + r4 + 8]
1586
%endif
1587
mova m3, [interp8_hpp_shuf]
1588
vbroadcasti128 m2, [INTERP_OFFSET_PS]
1589
x265_2.7.tar.gz/source/common/x86/h4-ipfilter16.asm -> x265_2.9.tar.gz/source/common/x86/h4-ipfilter16.asm
Changed
299
1
2
3
tab_Tm16: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
4
5
-tab_ChromaCoeff: dw 0, 64, 0, 0
6
+h4_tab_ChromaCoeff: dw 0, 64, 0, 0
7
dw -2, 58, 10, -2
8
dw -4, 54, 16, -2
9
dw -6, 46, 28, -4
10
11
add r4d, r4d
12
13
%ifdef PIC
14
- lea r6, [tab_ChromaCoeff]
15
+ lea r6, [h4_tab_ChromaCoeff]
16
movddup m0, [r6 + r4 * 4]
17
%else
18
- movddup m0, [tab_ChromaCoeff + r4 * 4]
19
+ movddup m0, [h4_tab_ChromaCoeff + r4 * 4]
20
%endif
21
22
%ifidn %3, ps
23
24
; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
25
;-----------------------------------------------------------------------------
26
27
+%if ARCH_X86_64
28
FILTER_HOR_CHROMA_sse3 2, 4, pp
29
FILTER_HOR_CHROMA_sse3 2, 8, pp
30
FILTER_HOR_CHROMA_sse3 2, 16, pp
31
32
FILTER_HOR_CHROMA_sse3 64, 32, ps
33
FILTER_HOR_CHROMA_sse3 64, 48, ps
34
FILTER_HOR_CHROMA_sse3 64, 64, ps
35
+%endif
36
37
%macro FILTER_W2_2 1
38
movu m3, [r0]
39
40
add r4d, r4d
41
42
%ifdef PIC
43
- lea r%6, [tab_ChromaCoeff]
44
+ lea r%6, [h4_tab_ChromaCoeff]
45
movh m0, [r%6 + r4 * 4]
46
%else
47
- movh m0, [tab_ChromaCoeff + r4 * 4]
48
+ movh m0, [h4_tab_ChromaCoeff + r4 * 4]
49
%endif
50
51
punpcklqdq m0, m0
52
53
add r4d, r4d
54
55
%ifdef PIC
56
- lea r%4, [tab_ChromaCoeff]
57
+ lea r%4, [h4_tab_ChromaCoeff]
58
movh m0, [r%4 + r4 * 4]
59
%else
60
- movh m0, [tab_ChromaCoeff + r4 * 4]
61
+ movh m0, [h4_tab_ChromaCoeff + r4 * 4]
62
%endif
63
64
punpcklqdq m0, m0
65
66
sub r0, 2
67
mov r4d, r4m
68
%ifdef PIC
69
- lea r5, [tab_ChromaCoeff]
70
+ lea r5, [h4_tab_ChromaCoeff]
71
vpbroadcastq m0, [r5 + r4 * 8]
72
%else
73
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
74
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
75
%endif
76
mova m1, [h4_interp8_hpp_shuf]
77
vpbroadcastd m2, [pd_32]
78
79
sub r0, 2
80
mov r4d, r4m
81
%ifdef PIC
82
- lea r5, [tab_ChromaCoeff]
83
+ lea r5, [h4_tab_ChromaCoeff]
84
vpbroadcastq m0, [r5 + r4 * 8]
85
%else
86
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
87
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
88
%endif
89
mova m1, [h4_interp8_hpp_shuf]
90
vpbroadcastd m2, [pd_32]
91
92
sub r0, 2
93
mov r4d, r4m
94
%ifdef PIC
95
- lea r5, [tab_ChromaCoeff]
96
+ lea r5, [h4_tab_ChromaCoeff]
97
vpbroadcastq m0, [r5 + r4 * 8]
98
%else
99
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
100
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
101
%endif
102
mova m1, [h4_interp8_hpp_shuf]
103
vpbroadcastd m2, [pd_32]
104
105
sub r0, 2
106
mov r4d, r4m
107
%ifdef PIC
108
- lea r5, [tab_ChromaCoeff]
109
+ lea r5, [h4_tab_ChromaCoeff]
110
vpbroadcastq m0, [r5 + r4 * 8]
111
%else
112
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
113
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
114
%endif
115
mova m1, [h4_interp8_hpp_shuf]
116
vpbroadcastd m2, [pd_32]
117
118
sub r0, 2
119
mov r4d, r4m
120
%ifdef PIC
121
- lea r5, [tab_ChromaCoeff]
122
+ lea r5, [h4_tab_ChromaCoeff]
123
vpbroadcastq m0, [r5 + r4 * 8]
124
%else
125
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
126
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
127
%endif
128
mova m1, [h4_interp8_hpp_shuf]
129
vpbroadcastd m2, [pd_32]
130
131
sub r0, 2
132
mov r4d, r4m
133
%ifdef PIC
134
- lea r5, [tab_ChromaCoeff]
135
+ lea r5, [h4_tab_ChromaCoeff]
136
vpbroadcastq m0, [r5 + r4 * 8]
137
%else
138
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
139
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
140
%endif
141
mova m1, [h4_interp8_hpp_shuf]
142
vpbroadcastd m2, [pd_32]
143
144
sub r0, 2
145
mov r4d, r4m
146
%ifdef PIC
147
- lea r5, [tab_ChromaCoeff]
148
+ lea r5, [h4_tab_ChromaCoeff]
149
vpbroadcastq m0, [r5 + r4 * 8]
150
%else
151
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
152
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
153
%endif
154
mova m1, [h4_interp8_hpp_shuf]
155
vpbroadcastd m2, [pd_32]
156
157
sub r0, 2
158
mov r4d, r4m
159
%ifdef PIC
160
- lea r5, [tab_ChromaCoeff]
161
+ lea r5, [h4_tab_ChromaCoeff]
162
vpbroadcastq m0, [r5 + r4 * 8]
163
%else
164
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
165
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
166
%endif
167
mova m1, [h4_interp8_hpp_shuf]
168
vpbroadcastd m2, [pd_32]
169
170
sub r0, 2
171
mov r4d, r4m
172
%ifdef PIC
173
- lea r5, [tab_ChromaCoeff]
174
+ lea r5, [h4_tab_ChromaCoeff]
175
vpbroadcastq m0, [r5 + r4 * 8]
176
%else
177
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
178
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
179
%endif
180
mova m1, [h4_interp8_hpp_shuf]
181
vpbroadcastd m2, [pd_32]
182
183
sub r0, 2
184
mov r4d, r4m
185
%ifdef PIC
186
- lea r5, [tab_ChromaCoeff]
187
+ lea r5, [h4_tab_ChromaCoeff]
188
vpbroadcastq m0, [r5 + r4 * 8]
189
%else
190
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
191
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
192
%endif
193
mova m1, [h4_interp8_hpp_shuf]
194
vpbroadcastd m2, [pd_32]
195
196
mov r5d, r5m
197
198
%ifdef PIC
199
- lea r6, [tab_ChromaCoeff]
200
+ lea r6, [h4_tab_ChromaCoeff]
201
vpbroadcastq m0, [r6 + r4 * 8]
202
%else
203
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
204
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
205
%endif
206
mova m3, [h4_interp8_hpp_shuf]
207
vbroadcasti128 m2, [INTERP_OFFSET_PS]
208
209
mov r5d, r5m
210
211
%ifdef PIC
212
- lea r6, [tab_ChromaCoeff]
213
+ lea r6, [h4_tab_ChromaCoeff]
214
vpbroadcastq m0, [r6 + r4 * 8]
215
%else
216
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
217
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
218
%endif
219
mova m3, [h4_interp8_hpp_shuf]
220
vbroadcasti128 m2, [INTERP_OFFSET_PS]
221
222
mov r5d, r5m
223
224
%ifdef PIC
225
- lea r6, [tab_ChromaCoeff]
226
+ lea r6, [h4_tab_ChromaCoeff]
227
vpbroadcastq m0, [r6 + r4 * 8]
228
%else
229
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
230
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
231
%endif
232
mova m3, [h4_interp8_hpp_shuf]
233
vbroadcasti128 m2, [INTERP_OFFSET_PS]
234
235
mov r5d, r5m
236
237
%ifdef PIC
238
- lea r6, [tab_ChromaCoeff]
239
+ lea r6, [h4_tab_ChromaCoeff]
240
vpbroadcastq m0, [r6 + r4 * 8]
241
%else
242
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
243
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
244
%endif
245
mova m3, [h4_interp8_hpp_shuf]
246
vbroadcasti128 m2, [INTERP_OFFSET_PS]
247
248
mov r5d, r5m
249
250
%ifdef PIC
251
- lea r6, [tab_ChromaCoeff]
252
+ lea r6, [h4_tab_ChromaCoeff]
253
vpbroadcastq m0, [r6 + r4 * 8]
254
%else
255
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
256
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
257
%endif
258
mova m3, [h4_interp8_hpp_shuf]
259
vbroadcasti128 m2, [INTERP_OFFSET_PS]
260
261
mov r5d, r5m
262
263
%ifdef PIC
264
- lea r6, [tab_ChromaCoeff]
265
+ lea r6, [h4_tab_ChromaCoeff]
266
vpbroadcastq m0, [r6 + r4 * 8]
267
%else
268
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
269
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
270
%endif
271
mova m3, [h4_interp8_hpp_shuf]
272
vbroadcasti128 m2, [INTERP_OFFSET_PS]
273
274
mov r5d, r5m
275
276
%ifdef PIC
277
- lea r6, [tab_ChromaCoeff]
278
+ lea r6, [h4_tab_ChromaCoeff]
279
vpbroadcastq m0, [r6 + r4 * 8]
280
%else
281
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
282
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
283
%endif
284
mova m3, [h4_interp8_hpp_shuf]
285
vbroadcasti128 m2, [INTERP_OFFSET_PS]
286
287
mov r5d, r5m
288
289
%ifdef PIC
290
- lea r6, [tab_ChromaCoeff]
291
+ lea r6, [h4_tab_ChromaCoeff]
292
vpbroadcastq m0, [r6 + r4 * 8]
293
%else
294
- vpbroadcastq m0, [tab_ChromaCoeff + r4 * 8]
295
+ vpbroadcastq m0, [h4_tab_ChromaCoeff + r4 * 8]
296
%endif
297
mova m3, [h4_interp8_hpp_shuf]
298
vbroadcasti128 m2, [INTERP_OFFSET_PS]
299
x265_2.7.tar.gz/source/common/x86/intrapred.h -> x265_2.9.tar.gz/source/common/x86/intrapred.h
Changed
19
1
2
FUNCDEF_TU_S2(void, intra_pred_dc, sse2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
3
FUNCDEF_TU_S2(void, intra_pred_dc, sse4, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
4
FUNCDEF_TU_S2(void, intra_pred_dc, avx2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
5
-
6
+FUNCDEF_TU_S2(void, intra_pred_dc, avx512, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
7
FUNCDEF_TU_S2(void, intra_pred_planar, sse2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
8
FUNCDEF_TU_S2(void, intra_pred_planar, sse4, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
9
FUNCDEF_TU_S2(void, intra_pred_planar, avx2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
10
11
DECL_ALL(ssse3);
12
DECL_ALL(sse4);
13
DECL_ALL(avx2);
14
-
15
+DECL_ALL(avx512);
16
#undef DECL_ALL
17
#undef DECL_ANGS
18
#undef DECL_ANG
19
x265_2.7.tar.gz/source/common/x86/intrapred16.asm -> x265_2.9.tar.gz/source/common/x86/intrapred16.asm
Changed
2638
1
2
const pw_ang8_16, db 0, 0, 0, 0, 0, 0, 12, 13, 10, 11, 6, 7, 4, 5, 0, 1
3
const pw_ang8_17, db 0, 0, 14, 15, 12, 13, 10, 11, 8, 9, 4, 5, 2, 3, 0, 1
4
const pw_swap16, times 2 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
5
-
6
+const pw_swap16_avx512, times 4 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
7
const pw_ang16_13, db 14, 15, 8, 9, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
8
const pw_ang16_16, db 0, 0, 0, 0, 0, 0, 10, 11, 8, 9, 6, 7, 2, 3, 0, 1
9
10
11
;-----------------------------------------------------------------------------------
12
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
13
;-----------------------------------------------------------------------------------
14
+%if ARCH_X86_64
15
INIT_XMM sse2
16
cglobal intra_pred_dc8, 5, 8, 2
17
movu m0, [r2 + 34]
18
19
mov [r0 + r7], r3w
20
.end:
21
RET
22
+%endif
23
24
;-------------------------------------------------------------------------------------------------------
25
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
26
;-------------------------------------------------------------------------------------------------------
27
+%if ARCH_X86_64
28
+;This code is meant for 64 bit architecture
29
INIT_XMM sse2
30
cglobal intra_pred_dc16, 5, 10, 4
31
lea r3, [r2 + 66]
32
33
mov [r9 + r1 * 8], r3w
34
.end:
35
RET
36
+%endif
37
38
;-------------------------------------------------------------------------------------------
39
; void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter)
40
41
;-------------------------------------------------------------------------------------------------------
42
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
43
;-------------------------------------------------------------------------------------------------------
44
+%if ARCH_X86_64
45
INIT_YMM avx2
46
cglobal intra_pred_dc16, 3, 9, 4
47
mov r3d, r4m
48
49
movu [r0 + r2 * 1 + 0], m0
50
movu [r0 + r2 * 1 + mmsize], m0
51
RET
52
+INIT_ZMM avx512
53
+cglobal intra_pred_dc32, 3,3,2
54
+ add r2, 2
55
+ add r1d, r1d
56
+ movu m0, [r2]
57
+ movu m1, [r2 + 2 * mmsize]
58
+ paddw m0, m1
59
+ vextracti32x8 ym1, m0, 1
60
+ paddw ym0, ym1
61
+ vextracti32x4 xm1, m0, 1
62
+ paddw xm0, xm1
63
+ pmaddwd xm0, [pw_1]
64
+ movhlps xm1, xm0
65
+ paddd xm0, xm1
66
+ vpsrldq xm1, xm0, 4
67
+ paddd xm0, xm1
68
+ paddd xm0, [pd_32] ; sum = sum + 32
69
+ psrld xm0, 6 ; sum = sum / 64
70
+ vpbroadcastw m0, xm0
71
+ lea r2, [r1 * 3]
72
+ ; store DC 32x32
73
+ movu [r0 + r1 * 0 + 0], m0
74
+ movu [r0 + r1 * 1 + 0], m0
75
+ movu [r0 + r1 * 2 + 0], m0
76
+ movu [r0 + r2 * 1 + 0], m0
77
+ lea r0, [r0 + r1 * 4]
78
+ movu [r0 + r1 * 0 + 0], m0
79
+ movu [r0 + r1 * 1 + 0], m0
80
+ movu [r0 + r1 * 2 + 0], m0
81
+ movu [r0 + r2 * 1 + 0], m0
82
+ lea r0, [r0 + r1 * 4]
83
+ movu [r0 + r1 * 0 + 0], m0
84
+ movu [r0 + r1 * 1 + 0], m0
85
+ movu [r0 + r1 * 2 + 0], m0
86
+ movu [r0 + r2 * 1 + 0], m0
87
+ lea r0, [r0 + r1 * 4]
88
+ movu [r0 + r1 * 0 + 0], m0
89
+ movu [r0 + r1 * 1 + 0], m0
90
+ movu [r0 + r1 * 2 + 0], m0
91
+ movu [r0 + r2 * 1 + 0], m0
92
+ lea r0, [r0 + r1 * 4]
93
+ movu [r0 + r1 * 0 + 0], m0
94
+ movu [r0 + r1 * 1 + 0], m0
95
+ movu [r0 + r1 * 2 + 0], m0
96
+ movu [r0 + r2 * 1 + 0], m0
97
+ lea r0, [r0 + r1 * 4]
98
+ movu [r0 + r1 * 0 + 0], m0
99
+ movu [r0 + r1 * 1 + 0], m0
100
+ movu [r0 + r1 * 2 + 0], m0
101
+ movu [r0 + r2 * 1 + 0], m0
102
+ lea r0, [r0 + r1 * 4]
103
+ movu [r0 + r1 * 0 + 0], m0
104
+ movu [r0 + r1 * 1 + 0], m0
105
+ movu [r0 + r1 * 2 + 0], m0
106
+ movu [r0 + r2 * 1 + 0], m0
107
+ lea r0, [r0 + r1 * 4]
108
+ movu [r0 + r1 * 0 + 0], m0
109
+ movu [r0 + r1 * 1 + 0], m0
110
+ movu [r0 + r1 * 2 + 0], m0
111
+ movu [r0 + r2 * 1 + 0], m0
112
+ RET
113
+%endif
114
115
;---------------------------------------------------------------------------------------
116
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
117
118
;---------------------------------------------------------------------------------------
119
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
120
;---------------------------------------------------------------------------------------
121
+%if ARCH_X86_64
122
INIT_XMM sse2
123
cglobal intra_pred_planar32, 3,3,16
124
movd m3, [r2 + 66] ; topRight = above[32]
125
126
%endrep
127
RET
128
%endif
129
-
130
+%endif
131
;---------------------------------------------------------------------------------------
132
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
133
;---------------------------------------------------------------------------------------
134
135
STORE_4x4
136
RET
137
138
+%if ARCH_X86_64
139
cglobal intra_pred_ang4_26, 3,3,3
140
movh m0, [r2 + 2] ;[8 7 6 5 4 3 2 1]
141
add r1d, r1d
142
143
mov [r0 + r3], r2w
144
.quit:
145
RET
146
+%endif
147
148
cglobal intra_pred_ang4_27, 3,3,5
149
movu m0, [r2 + 2] ;[8 7 6 5 4 3 2 1]
150
151
152
%macro TRANSPOSE_STORE_AVX2 11
153
jnz .skip%11
154
- punpckhwd m%9, m%1, m%2
155
- punpcklwd m%1, m%2
156
- punpckhwd m%2, m%3, m%4
157
- punpcklwd m%3, m%4
158
-
159
- punpckldq m%4, m%1, m%3
160
- punpckhdq m%1, m%3
161
- punpckldq m%3, m%9, m%2
162
- punpckhdq m%9, m%2
163
-
164
- punpckhwd m%10, m%5, m%6
165
- punpcklwd m%5, m%6
166
- punpckhwd m%6, m%7, m%8
167
- punpcklwd m%7, m%8
168
-
169
- punpckldq m%8, m%5, m%7
170
- punpckhdq m%5, m%7
171
- punpckldq m%7, m%10, m%6
172
- punpckhdq m%10, m%6
173
-
174
- punpcklqdq m%6, m%4, m%8
175
- punpckhqdq m%2, m%4, m%8
176
- punpcklqdq m%4, m%1, m%5
177
- punpckhqdq m%8, m%1, m%5
178
-
179
- punpcklqdq m%1, m%3, m%7
180
- punpckhqdq m%5, m%3, m%7
181
- punpcklqdq m%3, m%9, m%10
182
- punpckhqdq m%7, m%9, m%10
183
+ punpckhwd ym%9, ym%1, ym%2
184
+ punpcklwd ym%1, ym%2
185
+ punpckhwd ym%2, ym%3, ym%4
186
+ punpcklwd ym%3, ym%4
187
+
188
+ punpckldq ym%4, ym%1, ym%3
189
+ punpckhdq ym%1, ym%3
190
+ punpckldq ym%3, ym%9, ym%2
191
+ punpckhdq ym%9, ym%2
192
+
193
+ punpckhwd ym%10, ym%5, ym%6
194
+ punpcklwd ym%5, ym%6
195
+ punpckhwd ym%6, ym%7, ym%8
196
+ punpcklwd ym%7, ym%8
197
+
198
+ punpckldq ym%8, ym%5, ym%7
199
+ punpckhdq ym%5, ym%7
200
+ punpckldq ym%7, ym%10, ym%6
201
+ punpckhdq ym%10, ym%6
202
+
203
+ punpcklqdq ym%6, ym%4, ym%8
204
+ punpckhqdq ym%2, ym%4, ym%8
205
+ punpcklqdq ym%4, ym%1, ym%5
206
+ punpckhqdq ym%8, ym%1, ym%5
207
+
208
+ punpcklqdq ym%1, ym%3, ym%7
209
+ punpckhqdq ym%5, ym%3, ym%7
210
+ punpcklqdq ym%3, ym%9, ym%10
211
+ punpckhqdq ym%7, ym%9, ym%10
212
213
movu [r0 + r1 * 0 + %11], xm%6
214
movu [r0 + r1 * 1 + %11], xm%2
215
216
movu [r5 + r4 * 1 + %11], xm%7
217
218
lea r5, [r5 + r1 * 4]
219
- vextracti128 [r5 + r1 * 0 + %11], m%6, 1
220
- vextracti128 [r5 + r1 * 1 + %11], m%2, 1
221
- vextracti128 [r5 + r1 * 2 + %11], m%4, 1
222
- vextracti128 [r5 + r4 * 1 + %11], m%8, 1
223
+ vextracti128 [r5 + r1 * 0 + %11], ym%6, 1
224
+ vextracti128 [r5 + r1 * 1 + %11], ym%2, 1
225
+ vextracti128 [r5 + r1 * 2 + %11], ym%4, 1
226
+ vextracti128 [r5 + r4 * 1 + %11], ym%8, 1
227
228
lea r5, [r5 + r1 * 4]
229
- vextracti128 [r5 + r1 * 0 + %11], m%1, 1
230
- vextracti128 [r5 + r1 * 1 + %11], m%5, 1
231
- vextracti128 [r5 + r1 * 2 + %11], m%3, 1
232
- vextracti128 [r5 + r4 * 1 + %11], m%7, 1
233
+ vextracti128 [r5 + r1 * 0 + %11], ym%1, 1
234
+ vextracti128 [r5 + r1 * 1 + %11], ym%5, 1
235
+ vextracti128 [r5 + r1 * 2 + %11], ym%3, 1
236
+ vextracti128 [r5 + r4 * 1 + %11], ym%7, 1
237
jmp .end%11
238
.skip%11:
239
- movu [r0 + r1 * 0], m%1
240
- movu [r0 + r1 * 1], m%2
241
- movu [r0 + r1 * 2], m%3
242
- movu [r0 + r4 * 1], m%4
243
+ movu [r0 + r1 * 0], ym%1
244
+ movu [r0 + r1 * 1], ym%2
245
+ movu [r0 + r1 * 2], ym%3
246
+ movu [r0 + r4 * 1], ym%4
247
248
lea r0, [r0 + r1 * 4]
249
- movu [r0 + r1 * 0], m%5
250
- movu [r0 + r1 * 1], m%6
251
- movu [r0 + r1 * 2], m%7
252
- movu [r0 + r4 * 1], m%8
253
+ movu [r0 + r1 * 0], ym%5
254
+ movu [r0 + r1 * 1], ym%6
255
+ movu [r0 + r1 * 2], ym%7
256
+ movu [r0 + r4 * 1], ym%8
257
lea r0, [r0 + r1 * 4]
258
.end%11:
259
%endmacro
260
261
+%if ARCH_X86_64
262
;; angle 16, modes 3 and 33
263
cglobal ang16_mode_3_33
264
test r6d, r6d
265
266
packusdw m11, m3
267
TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 16
268
ret
269
-
270
;; angle 16, modes 7 and 29
271
cglobal ang16_mode_7_29
272
test r6d, r6d
273
274
275
mov rsp, [rsp+4*mmsize]
276
RET
277
+%endif
278
;-------------------------------------------------------------------------------------------------------
279
; end of avx2 code for intra_pred_ang32 mode 2 to 34
280
;-------------------------------------------------------------------------------------------------------
281
+;-------------------------------------------------------------------------------------------------------
282
+; avx512 code for intra_pred_ang32 mode 2 to 34 start
283
+;-------------------------------------------------------------------------------------------------------
284
+INIT_ZMM avx512
285
+cglobal intra_pred_ang32_2, 3,5,3
286
+ lea r4, [r2]
287
+ add r2, 128
288
+ cmp r3m, byte 34
289
+ cmove r2, r4
290
+ add r1d, r1d
291
+ lea r3, [r1 * 3]
292
+ movu m0, [r2 + 4]
293
+ movu m1, [r2 + 20]
294
+
295
+ movu [r0], m0
296
+ palignr m2, m1, m0, 2
297
+ movu [r0 + r1], m2
298
+ palignr m2, m1, m0, 4
299
+ movu [r0 + r1 * 2], m2
300
+ palignr m2, m1, m0, 6
301
+ movu [r0 + r3], m2
302
+
303
+ lea r0, [r0 + r1 * 4]
304
+ palignr m2, m1, m0, 8
305
+ movu [r0], m2
306
+ palignr m2, m1, m0, 10
307
+ movu [r0 + r1], m2
308
+ palignr m2, m1, m0, 12
309
+ movu [r0 + r1 * 2], m2
310
+ palignr m2, m1, m0, 14
311
+ movu [r0 + r3], m2
312
+
313
+ movu m0, [r2 + 36]
314
+ lea r0, [r0 + r1 * 4]
315
+ movu [r0], m1
316
+ palignr m2, m0, m1, 2
317
+ movu [r0 + r1], m2
318
+ palignr m2, m0, m1, 4
319
+ movu [r0 + r1 * 2], m2
320
+ palignr m2, m0, m1, 6
321
+ movu [r0 + r3], m2
322
+
323
+ lea r0, [r0 + r1 * 4]
324
+ palignr m2, m0, m1, 8
325
+ movu [r0], m2
326
+ palignr m2, m0, m1, 10
327
+ movu [r0 + r1], m2
328
+ palignr m2, m0, m1, 12
329
+ movu [r0 + r1 * 2], m2
330
+ palignr m2, m0, m1, 14
331
+ movu [r0 + r3], m2
332
+
333
+ lea r0, [r0 + r1 * 4]
334
+ movu m1, [r2 + 52]
335
+
336
+ movu [r0], m0
337
+ palignr m2, m1, m0, 2
338
+ movu [r0 + r1], m2
339
+ palignr m2, m1, m0, 4
340
+ movu [r0 + r1 * 2], m2
341
+ palignr m2, m1, m0, 6
342
+ movu [r0 + r3], m2
343
+
344
+ lea r0, [r0 + r1 * 4]
345
+ palignr m2, m1, m0, 8
346
+ movu [r0], m2
347
+ palignr m2, m1, m0, 10
348
+ movu [r0 + r1], m2
349
+ palignr m2, m1, m0, 12
350
+ movu [r0 + r1 * 2], m2
351
+ palignr m2, m1, m0, 14
352
+ movu [r0 + r3], m2
353
+
354
+ movu m0, [r2 + 68]
355
+ lea r0, [r0 + r1 * 4]
356
+ movu [r0], m1
357
+ palignr m2, m0, m1, 2
358
+ movu [r0 + r1], m2
359
+ palignr m2, m0, m1, 4
360
+ movu [r0 + r1 * 2], m2
361
+ palignr m2, m0, m1, 6
362
+ movu [r0 + r3], m2
363
+
364
+ lea r0, [r0 + r1 * 4]
365
+ palignr m2, m0, m1, 8
366
+ movu [r0], m2
367
+ palignr m2, m0, m1, 10
368
+ movu [r0 + r1], m2
369
+ palignr m2, m0, m1, 12
370
+ movu [r0 + r1 * 2], m2
371
+ palignr m2, m0, m1, 14
372
+ movu [r0 + r3], m2
373
+ RET
374
+
375
+cglobal intra_pred_ang32_10, 3,4,2
376
+ add r2, mmsize*2
377
+ add r1d, r1d
378
+ lea r3, [r1 * 3]
379
+
380
+ vpbroadcastw m0, [r2 + 2] ; [1...]
381
+ vpbroadcastw m1, [r2 + 2 + 2] ; [2...]
382
+ movu [r0], m0
383
+ movu [r0 + r1], m1
384
+
385
+ vpbroadcastw m0, [r2 + 2 + 4] ; [3...]
386
+ vpbroadcastw m1, [r2 + 2 + 6] ; [4...]
387
+ movu [r0 + r1 * 2], m0
388
+ movu [r0 + r3], m1
389
+ lea r0, [r0 + r1 * 4]
390
+
391
+ vpbroadcastw m0, [r2 + 2 + 8] ; [5...]
392
+ vpbroadcastw m1, [r2 + 2 + 10] ; [6...]
393
+ movu [r0], m0
394
+ movu [r0 + r1], m1
395
+
396
+ vpbroadcastw m0, [r2 + 2 + 12] ; [7...]
397
+ vpbroadcastw m1, [r2 + 2 + 14] ; [8...]
398
+ movu [r0 + r1 * 2], m0
399
+ movu [r0 + r3], m1
400
+ lea r0, [r0 + r1 *4]
401
+
402
+ vpbroadcastw m0, [r2 + 2 + 16] ; [9...]
403
+ vpbroadcastw m1, [r2 + 2 + 18] ; [10...]
404
+ movu [r0], m0
405
+ movu [r0 + r1], m1
406
+
407
+ vpbroadcastw m0, [r2 + 2 + 20] ; [11...]
408
+ vpbroadcastw m1, [r2 + 2 + 22] ; [12...]
409
+ movu [r0 + r1 * 2], m0
410
+ movu [r0 + r3], m1
411
+ lea r0, [r0 + r1 *4]
412
+
413
+ vpbroadcastw m0, [r2 + 2 + 24] ; [13...]
414
+ vpbroadcastw m1, [r2 + 2 + 26] ; [14...]
415
+ movu [r0], m0
416
+ movu [r0 + r1], m1
417
+
418
+ vpbroadcastw m0, [r2 + 2 + 28] ; [15...]
419
+ vpbroadcastw m1, [r2 + 2 + 30] ; [16...]
420
+ movu [r0 + r1 * 2], m0
421
+ movu [r0 + r3], m1
422
+ lea r0, [r0 + r1 *4]
423
+
424
+ vpbroadcastw m0, [r2 + 2 + 32] ; [17...]
425
+ vpbroadcastw m1, [r2 + 2 + 34] ; [18...]
426
+ movu [r0], m0
427
+ movu [r0 + r1], m1
428
+
429
+ vpbroadcastw m0, [r2 + 2 + 36] ; [19...]
430
+ vpbroadcastw m1, [r2 + 2 + 38] ; [20...]
431
+ movu [r0 + r1 * 2], m0
432
+ movu [r0 + r3], m1
433
+ lea r0, [r0 + r1 *4]
434
+
435
+ vpbroadcastw m0, [r2 + 2 + 40] ; [21...]
436
+ vpbroadcastw m1, [r2 + 2 + 42] ; [22...]
437
+ movu [r0], m0
438
+ movu [r0 + r1], m1
439
+
440
+ vpbroadcastw m0, [r2 + 2 + 44] ; [23...]
441
+ vpbroadcastw m1, [r2 + 2 + 46] ; [24...]
442
+ movu [r0 + r1 * 2], m0
443
+ movu [r0 + r3], m1
444
+ lea r0, [r0 + r1 *4]
445
+
446
+ vpbroadcastw m0, [r2 + 2 + 48] ; [25...]
447
+ vpbroadcastw m1, [r2 + 2 + 50] ; [26...]
448
+ movu [r0], m0
449
+ movu [r0 + r1], m1
450
+
451
+ vpbroadcastw m0, [r2 + 2 + 52] ; [27...]
452
+ vpbroadcastw m1, [r2 + 2 + 54] ; [28...]
453
+ movu [r0 + r1 * 2], m0
454
+ movu [r0 + r3], m1
455
+ lea r0, [r0 + r1 *4]
456
+
457
+ vpbroadcastw m0, [r2 + 2 + 56] ; [29...]
458
+ vpbroadcastw m1, [r2 + 2 + 58] ; [30...]
459
+ movu [r0], m0
460
+ movu [r0 + r1], m1
461
+
462
+ vpbroadcastw m0, [r2 + 2 + 60] ; [31...]
463
+ vpbroadcastw m1, [r2 + 2 + 62] ; [32...]
464
+ movu [r0 + r1 * 2], m0
465
+ movu [r0 + r3], m1
466
+ RET
467
+
468
+cglobal intra_pred_ang32_18, 3,6,6
469
+ mov r4, rsp
470
+ sub rsp, 4*(mmsize/2)+gprsize
471
+ and rsp, ~63
472
+ mov [rsp+4*(mmsize/2)], r4
473
+
474
+ movu m0, [r2]
475
+ mova [rsp + 2*(mmsize/2)], ym0
476
+ vextracti32x8 [rsp + 3*(mmsize/2)], m0, 1
477
+
478
+ movu m2, [r2 + 130]
479
+ pshufb m2, [pw_swap16_avx512]
480
+ vpermq m2, m2, q1032
481
+ mova [rsp + 1*(mmsize/2)], ym2
482
+ vextracti32x8 [rsp + 0*(mmsize/2)], m2, 1
483
+
484
+ add r1d, r1d
485
+ lea r2, [rsp+2*(mmsize/2)]
486
+ lea r4, [r1 * 2]
487
+ lea r3, [r1 * 3]
488
+ lea r5, [r1 * 4]
489
+
490
+ movu m0, [r2]
491
+ movu m2, [r2 - 16]
492
+ movu [r0], m0
493
+
494
+ palignr m4, m0, m2, 14
495
+ palignr m5, m0, m2, 12
496
+ movu [r0 + r1], m4
497
+ movu [r0 + r4], m5
498
+
499
+ palignr m4, m0, m2, 10
500
+ palignr m5, m0, m2, 8
501
+ movu [r0 + r3], m4
502
+ add r0, r5
503
+ movu [r0], m5
504
+
505
+ palignr m4, m0, m2, 6
506
+ palignr m5, m0, m2, 4
507
+ movu [r0 + r1], m4
508
+ movu [r0 + r4], m5
509
+
510
+ palignr m4, m0, m2, 2
511
+ movu [r0 + r3], m4
512
+ add r0, r5
513
+ movu [r0], m2
514
+
515
+ movu m0, [r2 - 32]
516
+ palignr m4, m2, m0, 14
517
+ palignr m5, m2, m0, 12
518
+ movu [r0 + r1], m4
519
+ movu [r0 + r4], m5
520
+
521
+ palignr m4, m2, m0, 10
522
+ palignr m5, m2, m0, 8
523
+ movu [r0 + r3], m4
524
+ add r0, r5
525
+ movu [r0], m5
526
+
527
+ palignr m4, m2, m0, 6
528
+ palignr m5, m2, m0, 4
529
+ movu [r0 + r1], m4
530
+ movu [r0 + r4], m5
531
+
532
+ palignr m4, m2, m0, 2
533
+ movu [r0 + r3], m4
534
+ add r0, r5
535
+ movu [r0], m0
536
+
537
+ movu m2, [r2 - 48]
538
+ palignr m4, m0, m2, 14
539
+ palignr m5, m0, m2, 12
540
+ movu [r0 + r1], m4
541
+ movu [r0 + r4], m5
542
+
543
+ palignr m4, m0, m2, 10
544
+ palignr m5, m0, m2, 8
545
+ movu [r0 + r3], m4
546
+ add r0, r5
547
+ movu [r0], m5
548
+
549
+ palignr m4, m0, m2, 6
550
+ palignr m5, m0, m2, 4
551
+ movu [r0 + r1], m4
552
+ movu [r0 + r4], m5
553
+
554
+ palignr m4, m0, m2, 2
555
+ movu [r0 + r3], m4
556
+ add r0, r5
557
+ movu [r0], m2
558
+
559
+ movu m0, [r2 - 64]
560
+ palignr m4, m2, m0, 14
561
+ palignr m5, m2, m0, 12
562
+ movu [r0 + r1], m4
563
+ movu [r0 + r4], m5
564
+
565
+ palignr m4, m2, m0, 10
566
+ palignr m5, m2, m0, 8
567
+ movu [r0 + r3], m4
568
+ add r0, r5
569
+ movu [r0], m5
570
+
571
+ palignr m4, m2, m0, 6
572
+ palignr m5, m2, m0, 4
573
+ movu [r0 + r1], m4
574
+ movu [r0 + r4], m5
575
+
576
+ palignr m4, m2, m0, 2
577
+ movu [r0 + r3], m4
578
+ mov rsp, [rsp+4*(mmsize/2)]
579
+ RET
580
+INIT_ZMM avx512
581
+cglobal intra_pred_ang32_26, 3,3,2
582
+ movu m0, [r2 + 2]
583
+ add r1d, r1d
584
+ lea r2, [r1 * 3]
585
+ movu [r0], m0
586
+ movu [r0 + r1], m0
587
+ movu [r0 + r1 * 2], m0
588
+ movu [r0 + r2], m0
589
+ lea r0, [r0 + r1 *4]
590
+ movu [r0], m0
591
+ movu [r0 + r1], m0
592
+ movu [r0 + r1 * 2], m0
593
+ movu [r0 + r2], m0
594
+ lea r0, [r0 + r1 *4]
595
+ movu [r0], m0
596
+ movu [r0 + r1], m0
597
+ movu [r0 + r1 * 2], m0
598
+ movu [r0 + r2], m0
599
+ lea r0, [r0 + r1 *4]
600
+ movu [r0], m0
601
+ movu [r0 + r1], m0
602
+ movu [r0 + r1 * 2], m0
603
+ movu [r0 + r2], m0
604
+ lea r0, [r0 + r1 *4]
605
+ movu [r0], m0
606
+ movu [r0 + r1], m0
607
+ movu [r0 + r1 * 2], m0
608
+ movu [r0 + r2], m0
609
+ lea r0, [r0 + r1 *4]
610
+ movu [r0], m0
611
+ movu [r0 + r1], m0
612
+ movu [r0 + r1 * 2], m0
613
+ movu [r0 + r2], m0
614
+ lea r0, [r0 + r1 *4]
615
+ movu [r0], m0
616
+ movu [r0 + r1], m0
617
+ movu [r0 + r1 * 2], m0
618
+ movu [r0 + r2], m0
619
+ lea r0, [r0 + r1 *4]
620
+ movu [r0], m0
621
+ movu [r0 + r1], m0
622
+ movu [r0 + r1 * 2], m0
623
+ movu [r0 + r2], m0
624
+ RET
625
+
626
+;; angle 16, modes 9 and 27
627
+cglobal ang16_mode_9_27
628
+ test r6d, r6d
629
+
630
+ vbroadcasti32x8 m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
631
+ vbroadcasti32x8 m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
632
+
633
+ punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
634
+ punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
635
+
636
+ vbroadcasti32x8 m2, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
637
+ vbroadcasti32x8 m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
638
+ punpcklwd m2, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
639
+
640
+ movu ym16, [r3 - 14 * 32] ; [2]
641
+ vinserti32x8 m16, [r3 - 12 * 32], 1 ; [4]
642
+ pmaddwd m4, m3, m16
643
+ paddd m4, m15
644
+ psrld m4, 5
645
+ pmaddwd m5, m0, m16
646
+ paddd m5, m15
647
+ psrld m5, 5
648
+ packusdw m4, m5
649
+ vextracti32x8 ym5, m4, 1
650
+ movu ym16, [r3 - 10 * 32] ; [6]
651
+ vinserti32x8 m16, [r3 - 8 * 32], 1 ; [8]
652
+ pmaddwd m6, m3, m16
653
+ paddd m6, m15
654
+ psrld m6, 5
655
+ pmaddwd m9, m0, m16
656
+ paddd m9, m15
657
+ psrld m9, 5
658
+ packusdw m6, m9
659
+ vextracti32x8 ym7, m6, 1
660
+ movu ym16, [r3 - 6 * 32] ; [10]
661
+ vinserti32x8 m16, [r3 - 4 * 32], 1 ; [12]
662
+ pmaddwd m8, m3, m16
663
+ paddd m8, m15
664
+ psrld m8, 5
665
+ pmaddwd m9, m0, m16
666
+ paddd m9, m15
667
+ psrld m9, 5
668
+ packusdw m8, m9
669
+ vextracti32x8 ym9, m8, 1
670
+ movu ym16, [r3 - 2 * 32] ; [14]
671
+ vinserti32x8 m16, [r3], 1 ; [16]
672
+ pmaddwd m10, m3, m16
673
+ paddd m10, m15
674
+ psrld m10, 5
675
+ pmaddwd m1, m0, m16
676
+ paddd m1, m15
677
+ psrld m1, 5
678
+ packusdw m10, m1
679
+ vextracti32x8 ym11, m10, 1
680
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 2, 1, 0
681
+
682
+ movu ym16, [r3 + 2 * 32] ; [18]
683
+ vinserti32x8 m16, [r3 + 4 * 32], 1 ; [20]
684
+ pmaddwd m4, m3, m16
685
+ paddd m4, m15
686
+ psrld m4, 5
687
+ pmaddwd m5, m0, m16
688
+ paddd m5, m15
689
+ psrld m5, 5
690
+ packusdw m4, m5
691
+ vextracti32x8 ym5, m4, 1
692
+ movu ym16, [r3 + 6 * 32] ; [22]
693
+ vinserti32x8 m16, [r3 + 8 * 32], 1 ; [24]
694
+ pmaddwd m6, m3, m16
695
+ paddd m6, m15
696
+ psrld m6, 5
697
+ pmaddwd m8, m0, m16
698
+ paddd m8, m15
699
+ psrld m8, 5
700
+ packusdw m6, m8
701
+ vextracti32x8 ym7, m6, 1
702
+ movu ym16, [r3 + 10 * 32] ; [26]
703
+ vinserti32x8 m16, [r3 + 12 * 32], 1 ; [28]
704
+ pmaddwd m8, m3, m16
705
+ paddd m8, m15
706
+ psrld m8, 5
707
+ pmaddwd m9, m0, m16
708
+ paddd m9, m15
709
+ psrld m9, 5
710
+ packusdw m8, m9
711
+ vextracti32x8 ym9, m8, 1
712
+ movu ym16, [r3 + 14 * 32] ; [30]
713
+ pmaddwd ym3, ym16
714
+ paddd ym3, ym15
715
+ psrld ym3, 5
716
+ pmaddwd ym0, ym16
717
+ paddd ym0, ym15
718
+ psrld ym0, 5
719
+ packusdw ym3, ym0
720
+
721
+ movu ym1, [r2 + 4]
722
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 3, 1, 0, 2, 16
723
+ ret
724
+
725
+cglobal intra_pred_ang32_9, 3,8,17
726
+ add r2, 128
727
+ xor r6d, r6d
728
+ lea r3, [ang_table_avx2 + 16 * 32]
729
+ shl r1d, 1
730
+ lea r4, [r1 * 3]
731
+ lea r7, [r0 + 8 * r1]
732
+ vbroadcasti32x8 m15, [pd_16]
733
+
734
+ call ang16_mode_9_27
735
+ add r2, 2
736
+ lea r0, [r0 + 32]
737
+ call ang16_mode_9_27
738
+ add r2, 30
739
+ lea r0, [r7 + 8 * r1]
740
+ call ang16_mode_9_27
741
+ add r2, 2
742
+ lea r0, [r0 + 32]
743
+ call ang16_mode_9_27
744
+ RET
745
+
746
+cglobal intra_pred_ang32_27, 3,7,17
747
+ xor r6d, r6d
748
+ inc r6d
749
+ lea r3, [ang_table_avx2 + 16 * 32]
750
+ shl r1d, 1
751
+ lea r4, [r1 * 3]
752
+ lea r5, [r0 + 32]
753
+ vbroadcasti32x8 m15, [pd_16]
754
+
755
+ call ang16_mode_9_27
756
+ add r2, 2
757
+ call ang16_mode_9_27
758
+ add r2, 30
759
+ mov r0, r5
760
+ call ang16_mode_9_27
761
+ add r2, 2
762
+ call ang16_mode_9_27
763
+ RET
764
+;; angle 16, modes 11 and 25
765
+cglobal ang16_mode_11_25
766
+ test r6d, r6d
767
+
768
+ vbroadcasti32x8 m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
769
+ vbroadcasti32x8 m1, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
770
+
771
+ punpcklwd m3, m0, m1 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0]
772
+ punpckhwd m0, m1 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4]
773
+
774
+ movu ym16, [r3 + 14 * 32] ; [30]
775
+ vinserti32x8 m16, [r3 + 12 * 32], 1 ; [28]
776
+ pmaddwd m4, m3, m16
777
+ paddd m4, m15
778
+ psrld m4, 5
779
+ pmaddwd m5, m0, m16
780
+ paddd m5, m15
781
+ psrld m5, 5
782
+ packusdw m4, m5
783
+ vextracti32x8 ym5, m4, 1
784
+ movu ym16, [r3 + 10 * 32] ; [26]
785
+ vinserti32x8 m16, [r3 + 8 * 32], 1 ; [24]
786
+ pmaddwd m6, m3, m16
787
+ paddd m6, m15
788
+ psrld m6, 5
789
+ pmaddwd m9, m0, m16
790
+ paddd m9, m15
791
+ psrld m9, 5
792
+ packusdw m6, m9
793
+ vextracti32x8 ym7, m6, 1
794
+ movu ym16, [r3 + 6 * 32] ; [22]
795
+ vinserti32x8 m16, [r3 + 4 * 32], 1 ; [20]
796
+ pmaddwd m8, m3, m16
797
+ paddd m8, m15
798
+ psrld m8, 5
799
+ pmaddwd m9, m0, m16
800
+ paddd m9, m15
801
+ psrld m9, 5
802
+ packusdw m8, m9
803
+ vextracti32x8 ym9, m8, 1
804
+ movu ym16, [r3 + 2 * 32] ; [18]
805
+ vinserti32x8 m16, [r3], 1 ; [16]
806
+ pmaddwd m10, m3, m16
807
+ paddd m10, m15
808
+ psrld m10, 5
809
+ pmaddwd m1, m0, m16
810
+ paddd m1, m15
811
+ psrld m1, 5
812
+ packusdw m10, m1
813
+ vextracti32x8 ym11, m10, 1
814
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 2, 1, 0
815
+
816
+ movu ym16, [r3 - 2 * 32] ; [14]
817
+ vinserti32x8 m16, [r3 - 4 * 32], 1 ; [12]
818
+ pmaddwd m4, m3, m16
819
+ paddd m4, m15
820
+ psrld m4, 5
821
+ pmaddwd m5, m0, m16
822
+ paddd m5, m15
823
+ psrld m5, 5
824
+ packusdw m4, m5
825
+ vextracti32x8 ym5, m4, 1
826
+ movu ym16, [r3 - 6 * 32] ; [10]
827
+ vinserti32x8 m16, [r3 - 8 * 32], 1 ; [8]
828
+ pmaddwd m6, m3, m16
829
+ paddd m6, m15
830
+ psrld m6, 5
831
+ pmaddwd m8, m0, m16
832
+ paddd m8, m15
833
+ psrld m8, 5
834
+ packusdw m6, m8
835
+ vextracti32x8 ym7, m6, 1
836
+ movu ym16, [r3 - 10 * 32] ; [6]
837
+ vinserti32x8 m16, [r3 - 12 * 32], 1 ; [4]
838
+ pmaddwd m8, m3, m16
839
+ paddd m8, m15
840
+ psrld m8, 5
841
+ pmaddwd m9, m0, m16
842
+ paddd m9, m15
843
+ psrld m9, 5
844
+ packusdw m8, m9
845
+ vextracti32x8 ym9, m8, 1
846
+ pmaddwd ym3, [r3 - 14 * 32] ; [2]
847
+ paddd ym3, ym15
848
+ psrld ym3, 5
849
+ pmaddwd ym0, [r3 - 14 * 32]
850
+ paddd ym0, ym15
851
+ psrld ym0, 5
852
+ packusdw ym3, ym0
853
+
854
+ movu ym1, [r2]
855
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 3, 1, 0, 2, 16
856
+ ret
857
+
858
+cglobal intra_pred_ang32_11, 3,8,17, 0-8
859
+ movzx r5d, word [r2 + 128] ; [0]
860
+ movzx r6d, word [r2]
861
+ mov [rsp], r5w
862
+ mov [r2 + 128], r6w
863
+
864
+ movzx r5d, word [r2 + 126] ; [16]
865
+ movzx r6d, word [r2 + 32]
866
+ mov [rsp + 4], r5w
867
+ mov [r2 + 126], r6w
868
+ vbroadcasti32x8 m15, [pd_16]
869
+ add r2, 128
870
+ xor r6d, r6d
871
+ lea r3, [ang_table_avx2 + 16 * 32]
872
+ shl r1d, 1
873
+ lea r4, [r1 * 3]
874
+ lea r7, [r0 + 8 * r1]
875
+
876
+ call ang16_mode_11_25
877
+ sub r2, 2
878
+ lea r0, [r0 + 32]
879
+ call ang16_mode_11_25
880
+ add r2, 34
881
+ lea r0, [r7 + 8 * r1]
882
+ call ang16_mode_11_25
883
+ sub r2, 2
884
+ lea r0, [r0 + 32]
885
+ call ang16_mode_11_25
886
+ mov r6d, [rsp]
887
+ mov [r2 - 30], r6w
888
+ mov r6d, [rsp + 4]
889
+ mov [r2 - 32], r6w
890
+ RET
891
+
892
+cglobal intra_pred_ang32_25, 3,7,17, 0-4
893
+ xor r6d, r6d
894
+ inc r6d
895
+ lea r3, [ang_table_avx2 + 16 * 32]
896
+ shl r1d, 1
897
+ vbroadcasti32x8 m15, [pd_16]
898
+ movzx r4d, word [r2 - 2]
899
+ movzx r5d, word [r2 + 160] ; [16]
900
+ mov [rsp], r4w
901
+ mov [r2 - 2], r5w
902
+
903
+ lea r4, [r1 * 3]
904
+ lea r5, [r0 + 32]
905
+ call ang16_mode_11_25
906
+ sub r2, 2
907
+ call ang16_mode_11_25
908
+ add r2, 34
909
+ mov r0, r5
910
+ call ang16_mode_11_25
911
+ sub r2, 2
912
+ call ang16_mode_11_25
913
+ mov r5d, [rsp]
914
+ mov [r2 - 32], r5w
915
+ RET
916
+
917
+cglobal intra_pred_ang16_9, 3,7,17
918
+ add r2, 64
919
+ xor r6d, r6d
920
+ lea r3, [ang_table_avx2 + 16 * 32]
921
+ shl r1d, 1
922
+ lea r4, [r1 * 3]
923
+ vbroadcasti32x8 m15, [pd_16]
924
+ call ang16_mode_9_27
925
+ RET
926
+
927
+cglobal intra_pred_ang16_27, 3,7,17
928
+ xor r6d, r6d
929
+ inc r6d
930
+ lea r3, [ang_table_avx2 + 16 * 32]
931
+ shl r1d, 1
932
+ lea r4, [r1 * 3]
933
+ vbroadcasti32x8 m15, [pd_16]
934
+ call ang16_mode_9_27
935
+ RET
936
+
937
+cglobal intra_pred_ang16_11, 3,7,17, 0-4
938
+ movzx r5d, word [r2 + 64]
939
+ movzx r6d, word [r2]
940
+ mov [rsp], r5w
941
+ mov [r2 + 64], r6w
942
+ vbroadcasti32x8 m15, [pd_16]
943
+ add r2, 64
944
+ xor r6d, r6d
945
+ lea r3, [ang_table_avx2 + 16 * 32]
946
+ shl r1d, 1
947
+ lea r4, [r1 * 3]
948
+ call ang16_mode_11_25
949
+ mov r6d, [rsp]
950
+ mov [r2], r6w
951
+ RET
952
+
953
+cglobal intra_pred_ang16_25, 3,7,17
954
+ xor r6d, r6d
955
+ inc r6d
956
+ vbroadcasti32x8 m15, [pd_16]
957
+ lea r3, [ang_table_avx2 + 16 * 32]
958
+ shl r1d, 1
959
+ lea r4, [r1 * 3]
960
+ call ang16_mode_11_25
961
+ RET
962
+cglobal ang16_mode_5_31
963
+ test r6d, r6d
964
+
965
+ vbroadcasti32x8 m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
966
+ vbroadcasti32x8 m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
967
+
968
+ punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
969
+ punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
970
+
971
+ vbroadcasti32x8 m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
972
+ vbroadcasti32x8 m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
973
+ punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
974
+ punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
975
+
976
+ pmaddwd m4, m3, [r3 + 1 * 32] ; [17]
977
+ paddd m4, m15
978
+ psrld m4, 5
979
+ pmaddwd m5, m0, [r3 + 1 * 32]
980
+ paddd m5, m15
981
+ psrld m5, 5
982
+ packusdw m4, m5
983
+
984
+ movu ym16, [r3 - 14 * 32] ; [2]
985
+ vinserti32x8 m16, [r3 + 3 * 32] ,1 ; [19]
986
+ palignr m6, m0, m3, 4
987
+ pmaddwd m5, m6, m16
988
+ paddd m5, m15
989
+ psrld m5, 5
990
+ palignr m7, m2, m0, 4
991
+ pmaddwd m8, m7, m16
992
+ paddd m8, m15
993
+ psrld m8, 5
994
+ packusdw m5, m8
995
+ vextracti32x8 ym6, m5, 1
996
+
997
+ palignr m8, m0, m3, 8
998
+ palignr m9, m2, m0, 8
999
+ movu ym16, [r3 - 12 * 32] ; [4]
1000
+ vinserti32x8 m16, [r3 + 5 * 32],1 ; [21]
1001
+ pmaddwd m7, m8, m16
1002
+ paddd m7, m15
1003
+ psrld m7, 5
1004
+ pmaddwd m10, m9,m16
1005
+ paddd m10, m15
1006
+ psrld m10, 5
1007
+ packusdw m7, m10
1008
+ vextracti32x8 ym8, m7, 1
1009
+
1010
+ palignr m10, m0, m3, 12
1011
+ palignr m11, m2, m0, 12
1012
+ movu ym16,[r3 - 10 * 32] ; [6]
1013
+ vinserti32x8 m16, [r3 + 7 * 32] ,1 ; [23]
1014
+ pmaddwd m9, m10, m16
1015
+ paddd m9, m15
1016
+ psrld m9, 5
1017
+ pmaddwd m3, m11, m16
1018
+ paddd m3, m15
1019
+ psrld m3, 5
1020
+ packusdw m9, m3
1021
+ vextracti32x8 ym10, m9, 1
1022
+
1023
+ pmaddwd m11, m0, [r3 - 8 * 32] ; [8]
1024
+ paddd m11, m15
1025
+ psrld m11, 5
1026
+ pmaddwd m3, m2, [r3 - 8 * 32]
1027
+ paddd m3, m15
1028
+ psrld m3, 5
1029
+ packusdw m11, m3
1030
+
1031
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0
1032
+
1033
+ pmaddwd m4, m0, [r3 + 9 * 32] ; [25]
1034
+ paddd m4, m15
1035
+ psrld m4, 5
1036
+ pmaddwd m5, m2, [r3 + 9 * 32]
1037
+ paddd m5, m15
1038
+ psrld m5, 5
1039
+ packusdw m4, m5
1040
+
1041
+ palignr m6, m2, m0, 4
1042
+ movu ym16, [r3 - 6 * 32] ; [10]
1043
+ vinserti32x8 m16, [r3 + 11 * 32] ,1 ; [27]
1044
+ pmaddwd m5, m6,m16
1045
+ paddd m5, m15
1046
+ psrld m5, 5
1047
+ palignr m7, m1, m2, 4
1048
+ pmaddwd m3, m7,m16
1049
+ paddd m3, m15
1050
+ psrld m3, 5
1051
+ packusdw m5, m3
1052
+ vextracti32x8 ym6, m5, 1
1053
+
1054
+ palignr m8, m2, m0, 8
1055
+ palignr m9, m1, m2, 8
1056
+ movu ym16, [r3 - 4 * 32] ; [12]
1057
+ vinserti32x8 m16, [r3 + 13 * 32] ,1 ; [29]
1058
+ pmaddwd m7, m8, m16
1059
+ paddd m7, m15
1060
+ psrld m7, 5
1061
+ pmaddwd m3, m9, m16
1062
+ paddd m3, m15
1063
+ psrld m3, 5
1064
+ packusdw m7, m3
1065
+ vextracti32x8 ym8, m7, 1
1066
+
1067
+
1068
+ palignr m10, m2, m0, 12
1069
+ palignr m11, m1, m2, 12
1070
+ movu ym16, [r3 - 2 * 32] ; [14]
1071
+ vinserti32x8 m16, [r3 + 15 * 32],1 ; [31]
1072
+ pmaddwd m9, m10, m16
1073
+ paddd m9, m15
1074
+ psrld m9, 5
1075
+ pmaddwd m3, m11, m16
1076
+ paddd m3, m15
1077
+ psrld m3, 5
1078
+ packusdw m9, m3
1079
+ vextracti32x8 ym10, m9, 1
1080
+
1081
+ pmaddwd m2, [r3] ; [16]
1082
+ paddd m2, m15
1083
+ psrld m2, 5
1084
+ pmaddwd m1, [r3]
1085
+ paddd m1, m15
1086
+ psrld m1, 5
1087
+ packusdw m2, m1
1088
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 2, 0, 1, 16
1089
+ ret
1090
+;; angle 32, modes 5 and 31
1091
+cglobal ang32_mode_5_31
1092
+ test r6d, r6d
1093
+
1094
+ vbroadcasti32x8 m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1095
+ vbroadcasti32x8 m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1096
+
1097
+ punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
1098
+ punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
1099
+
1100
+ vbroadcasti32x8 m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
1101
+ vbroadcasti32x8 m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
1102
+ punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
1103
+ punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
1104
+
1105
+ movu ym16, [r3 - 15 * 32] ; [1]
1106
+ vinserti32x8 m16, [r3 + 2 * 32],1 ; [18]
1107
+ pmaddwd m4, m3, m16
1108
+ paddd m4, m15
1109
+ psrld m4, 5
1110
+ pmaddwd m5, m0, m16
1111
+ paddd m5, m15
1112
+ psrld m5, 5
1113
+ packusdw m4, m5
1114
+ vextracti32x8 ym5, m4, 1
1115
+
1116
+
1117
+ palignr m7, m0, m3, 4
1118
+ movu ym16, [r3 - 13 * 32] ; [3]
1119
+ vinserti32x8 m16, [r3 + 4 * 32] ,1 ; [20]
1120
+ pmaddwd m6, m7, m16
1121
+ paddd m6, m15
1122
+ psrld m6, 5
1123
+ palignr m8, m2, m0, 4
1124
+ pmaddwd m9, m8,m16
1125
+ paddd m9, m15
1126
+ psrld m9, 5
1127
+ packusdw m6, m9
1128
+ vextracti32x8 ym7, m6, 1
1129
+
1130
+
1131
+ palignr m9, m0, m3, 8
1132
+ movu ym16, [r3 - 11 * 32] ; [5]
1133
+ vinserti32x8 m16, [r3 + 6 * 32] ,1 ; [22]
1134
+ pmaddwd m8, m9,m16
1135
+ paddd m8, m15
1136
+ psrld m8, 5
1137
+ palignr m10, m2, m0, 8
1138
+ pmaddwd m11, m10,m16
1139
+ paddd m11, m15
1140
+ psrld m11, 5
1141
+ packusdw m8, m11
1142
+ vextracti32x8 ym9, m8, 1
1143
+
1144
+
1145
+ palignr m11, m0, m3, 12
1146
+ movu ym16, [r3 - 9 * 32] ; [7]
1147
+ vinserti32x8 m16, [r3 + 8 * 32] ,1 ; [24]
1148
+ pmaddwd m10, m11,m16
1149
+ paddd m10, m15
1150
+ psrld m10, 5
1151
+ palignr m12, m2, m0, 12
1152
+ pmaddwd m3, m12, m16
1153
+ paddd m3, m15
1154
+ psrld m3, 5
1155
+ packusdw m10, m3
1156
+ vextracti32x8 ym11, m10, 1
1157
+
1158
+
1159
+
1160
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0
1161
+
1162
+ movu ym16, [r3 - 7 * 32] ; [9]
1163
+ vinserti32x8 m16, [r3 + 10 * 32] ,1 ; [26]
1164
+ pmaddwd m4, m0, m16
1165
+ paddd m4, m15
1166
+ psrld m4, 5
1167
+ pmaddwd m5, m2, m16
1168
+ paddd m5, m15
1169
+ psrld m5, 5
1170
+ packusdw m4, m5
1171
+ vextracti32x8 ym5, m4, 1
1172
+
1173
+
1174
+ palignr m7, m2, m0, 4
1175
+ movu ym16, [r3 - 5 * 32] ; [11]
1176
+ vinserti32x8 m16, [r3 + 12 * 32],1 ; [28]
1177
+ pmaddwd m6, m7, m16
1178
+ paddd m6, m15
1179
+ psrld m6, 5
1180
+ palignr m8, m1, m2, 4
1181
+ pmaddwd m9, m8,m16
1182
+ paddd m9, m15
1183
+ psrld m9, 5
1184
+ packusdw m6, m9
1185
+ vextracti32x8 ym7, m6, 1
1186
+
1187
+ palignr m9, m2, m0, 8
1188
+ movu ym16, [r3 - 3 * 32] ; [13]
1189
+ vinserti32x8 m16, [r3 + 14 * 32] ,1 ; [30]
1190
+ pmaddwd m8, m9, m16
1191
+ paddd m8, m15
1192
+ psrld m8, 5
1193
+ palignr m3, m1, m2, 8
1194
+ pmaddwd m10, m3, m16
1195
+ paddd m10, m15
1196
+ psrld m10, 5
1197
+ packusdw m8, m10
1198
+ vextracti32x8 ym9, m8, 1
1199
+
1200
+
1201
+
1202
+ palignr m10, m2, m0, 12
1203
+ pmaddwd m10, [r3 - 1 * 32] ; [15]
1204
+ paddd m10, m15
1205
+ psrld m10, 5
1206
+ palignr m11, m1, m2, 12
1207
+ pmaddwd m11, [r3 - 1 * 32]
1208
+ paddd m11, m15
1209
+ psrld m11, 5
1210
+ packusdw m10, m11
1211
+
1212
+ pmaddwd m2, [r3 - 16 * 32] ; [0]
1213
+ paddd m2, m15
1214
+ psrld m2, 5
1215
+ pmaddwd m1, [r3 - 16 * 32]
1216
+ paddd m1, m15
1217
+ psrld m1, 5
1218
+ packusdw m2, m1
1219
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 2, 0, 1, 16
1220
+ ret
1221
+cglobal intra_pred_ang32_5, 3,8,17
1222
+ add r2, 128
1223
+ xor r6d, r6d
1224
+ lea r3, [ang_table_avx2 + 16 * 32]
1225
+ add r1d, r1d
1226
+ lea r4, [r1 * 3]
1227
+ lea r7, [r0 + 8 * r1]
1228
+ vbroadcasti32x8 m15, [pd_16]
1229
+ call ang16_mode_5_31
1230
+
1231
+ add r2, 18
1232
+ lea r0, [r0 + 32]
1233
+
1234
+ call ang32_mode_5_31
1235
+
1236
+ add r2, 14
1237
+ lea r0, [r7 + 8 * r1]
1238
+
1239
+ call ang16_mode_5_31
1240
+ vbroadcasti32x8 m15, [pd_16]
1241
+ add r2, 18
1242
+ lea r0, [r0 + 32]
1243
+ call ang32_mode_5_31
1244
+ RET
1245
+cglobal intra_pred_ang32_31, 3,7,17
1246
+ xor r6d, r6d
1247
+ inc r6d
1248
+ lea r3, [ang_table_avx2 + 16 * 32]
1249
+ add r1d, r1d
1250
+ lea r4, [r1 * 3]
1251
+ lea r5, [r0 + 32]
1252
+ vbroadcasti32x8 m15, [pd_16]
1253
+ call ang16_mode_5_31
1254
+
1255
+ add r2, 18
1256
+
1257
+ call ang32_mode_5_31
1258
+
1259
+ add r2, 14
1260
+ mov r0, r5
1261
+
1262
+ call ang16_mode_5_31
1263
+
1264
+ add r2, 18
1265
+ call ang32_mode_5_31
1266
+ RET
1267
+cglobal intra_pred_ang16_5, 3,7,17
1268
+ add r2, 64
1269
+ xor r6d, r6d
1270
+ vbroadcasti32x8 m15, [pd_16]
1271
+ lea r3, [ang_table_avx2 + 16 * 32]
1272
+ add r1d, r1d
1273
+ lea r4, [r1 * 3]
1274
+ call ang16_mode_5_31
1275
+ RET
1276
+cglobal intra_pred_ang16_31, 3,7,17
1277
+ xor r6d, r6d
1278
+ inc r6d
1279
+ vbroadcasti32x8 m15, [pd_16]
1280
+ lea r3, [ang_table_avx2 + 16 * 32]
1281
+ add r1d, r1d
1282
+ lea r4, [r1 * 3]
1283
+ call ang16_mode_5_31
1284
+ RET
1285
+;; angle 16, modes 4 and 32
1286
+cglobal ang16_mode_4_32
1287
+ test r6d, r6d
1288
+
1289
+ vbroadcasti32x8 m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1290
+ vbroadcasti32x8 m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1291
+
1292
+ punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
1293
+ punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
1294
+
1295
+ vbroadcasti32x8 m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
1296
+ vbroadcasti32x8 m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
1297
+ punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
1298
+ punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
1299
+
1300
+ pmaddwd m4, m3, [r3 + 3 * 32] ; [21]
1301
+ paddd m4, m15
1302
+ psrld m4, 5
1303
+ pmaddwd m5, m0, [r3 + 3 * 32]
1304
+ paddd m5, m15
1305
+ psrld m5, 5
1306
+ packusdw m4, m5
1307
+
1308
+ palignr m6, m0, m3, 4 ; [14 13 13 12 12 11 11 10 6 5 5 4 4 3 3 2]
1309
+ palignr m7, m2, m0, 4 ; [18 17 17 16 16 15 15 14 10 9 9 8 8 7 7 6]
1310
+ movu ym16,[r3 - 8 * 32] ; [10]
1311
+ vinserti32x8 m16, [r3 + 13 * 32] ,1 ; [31]
1312
+ pmaddwd m5, m6, m16
1313
+ paddd m5, m15
1314
+ psrld m5, 5
1315
+ pmaddwd m8, m7,m16
1316
+ paddd m8, m15
1317
+ psrld m8, 5
1318
+ packusdw m5, m8
1319
+ vextracti32x8 ym6, m5, 1
1320
+
1321
+
1322
+ palignr m7, m0, m3, 8 ; [15 14 14 13 13 12 12 11 7 6 6 5 5 4 4 3]
1323
+ pmaddwd m7, [r3 + 2 * 32] ; [20]
1324
+ paddd m7, m15
1325
+ psrld m7, 5
1326
+ palignr m8, m2, m0, 8 ; [19 18 18 17 17 16 16 15 11 10 10 9 9 8 8 7]
1327
+ pmaddwd m8, [r3 + 2 * 32]
1328
+ paddd m8, m15
1329
+ psrld m8, 5
1330
+ packusdw m7, m8
1331
+
1332
+ palignr m9, m0, m3, 12
1333
+ palignr m3, m2, m0, 12
1334
+ movu ym16,[r3 - 9 * 32] ; [9]
1335
+ vinserti32x8 m16, [r3 + 12 * 32] ,1 ; [30]
1336
+ pmaddwd m8, m9, m16
1337
+ paddd m8, m15
1338
+ psrld m8, 5
1339
+ pmaddwd m10, m3,m16
1340
+ paddd m10,m15
1341
+ psrld m10, 5
1342
+ packusdw m8, m10
1343
+ vextracti32x8 ym9, m8, 1
1344
+
1345
+
1346
+ pmaddwd m10, m0, [r3 + 1 * 32] ; [19]
1347
+ paddd m10,m15
1348
+ psrld m10, 5
1349
+ pmaddwd m3, m2, [r3 + 1 * 32]
1350
+ paddd m3, m15
1351
+ psrld m3, 5
1352
+ packusdw m10, m3
1353
+
1354
+ palignr m11, m2, m0, 4
1355
+ pmaddwd m11, [r3 - 10 * 32] ; [8]
1356
+ paddd m11, m15
1357
+ psrld m11, 5
1358
+ palignr m3, m1, m2, 4
1359
+ pmaddwd m3, [r3 - 10 * 32]
1360
+ paddd m3, m15
1361
+ psrld m3, 5
1362
+ packusdw m11, m3
1363
+
1364
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0
1365
+
1366
+ palignr m4, m2, m0, 4
1367
+ pmaddwd m4, [r3 + 11 * 32] ; [29]
1368
+ paddd m4, m15
1369
+ psrld m4, 5
1370
+ palignr m5, m1, m2, 4
1371
+ pmaddwd m5, [r3 + 11 * 32]
1372
+ paddd m5, m15
1373
+ psrld m5, 5
1374
+ packusdw m4, m5
1375
+
1376
+ palignr m5, m2, m0, 8
1377
+ pmaddwd m5, [r3] ; [18]
1378
+ paddd m5, m15
1379
+ psrld m5, 5
1380
+ palignr m6, m1, m2, 8
1381
+ pmaddwd m6, [r3]
1382
+ paddd m6, m15
1383
+ psrld m6, 5
1384
+ packusdw m5, m6
1385
1386
+ palignr m7, m2, m0, 12
1387
+ palignr m8, m1, m2, 12
1388
+ movu ym16,[r3 - 11 * 32] ; [7]
1389
+ vinserti32x8 m16, [r3 + 10 * 32],1 ; [28]
1390
+ pmaddwd m6, m7, m16
1391
+ paddd m6, m15
1392
+ psrld m6, 5
1393
+ palignr m8, m1, m2, 12
1394
+ pmaddwd m3, m8, m16
1395
+ paddd m3,m15
1396
+ psrld m3, 5
1397
+ packusdw m6, m3
1398
+ vextracti32x8 ym7, m6, 1
1399
+
1400
+ movu m0, [r2 + 34] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
1401
+ pmaddwd m8, m2, [r3 - 1 * 32] ; [17]
1402
+ paddd m8, m15
1403
+ psrld m8, 5
1404
+ pmaddwd m9, m1, [r3 - 1 * 32]
1405
+ paddd m9, m15
1406
+ psrld m9, 5
1407
+ packusdw m8, m9
1408
+
1409
+ palignr m3, m0, m0, 2 ; [ x 32 31 30 29 28 27 26 x 24 23 22 21 20 19 18]
1410
+ punpcklwd m0, m3 ; [29 29 28 28 27 27 26 22 21 20 20 19 19 18 18 17]
1411
+
1412
+ palignr m10, m1, m2, 4
1413
+ pmaddwd m9, m10, [r3 - 12 * 32] ; [6]
1414
+ paddd m9, m15
1415
+ psrld m9, 5
1416
+ palignr m11, m0, m1, 4
1417
+ pmaddwd m3, m11, [r3 - 12 * 32]
1418
+ paddd m3, m15
1419
+ psrld m3, 5
1420
+ packusdw m9, m3
1421
+
1422
+ pmaddwd m10, [r3 + 9 * 32] ; [27]
1423
+ paddd m10,m15
1424
+ psrld m10, 5
1425
+ pmaddwd m11, [r3 + 9 * 32]
1426
+ paddd m11, m15
1427
+ psrld m11, 5
1428
+ packusdw m10, m11
1429
+
1430
+ palignr m3, m1, m2, 8
1431
+ pmaddwd m3, [r3 - 2 * 32] ; [16]
1432
+ paddd m3, m15
1433
+ psrld m3, 5
1434
+ palignr m0, m1, 8
1435
+ pmaddwd m0, [r3 - 2 * 32]
1436
+ paddd m0,m15
1437
+ psrld m0, 5
1438
+ packusdw m3, m0
1439
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 3, 0, 1, 16
1440
+ ret
1441
+;; angle 32, modes 4 and 32
1442
+cglobal ang32_mode_4_32
1443
+ test r6d, r6d
1444
+
1445
+ vbroadcasti32x8 m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1446
+ vbroadcasti32x8 m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1447
+
1448
+ punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
1449
+ punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
1450
+
1451
+ vbroadcasti32x8 m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
1452
+ vbroadcasti32x8 m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
1453
+ punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
1454
+ punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
1455
+
1456
+ movu ym16, [r3 - 13 * 32] ; [5]
1457
+ vinserti32x8 m16, [r3 + 8 * 32],1 ; [26]
1458
+ pmaddwd m4, m3, m16
1459
+ paddd m4, m15
1460
+ psrld m4, 5
1461
+ pmaddwd m5, m0,m16
1462
+ paddd m5, m15
1463
+ psrld m5, 5
1464
+ packusdw m4, m5
1465
+ vextracti32x8 ym5, m4, 1
1466
+
1467
+ palignr m6, m0, m3, 4 ; [14 13 13 12 12 11 11 10 6 5 5 4 4 3 3 2]
1468
+ pmaddwd m6, [r3 - 3 * 32] ; [15]
1469
+ paddd m6, m15
1470
+ psrld m6, 5
1471
+ palignr m7, m2, m0, 4 ; [18 17 17 16 16 15 15 14 10 9 9 8 8 7 7 6]
1472
+ pmaddwd m7, [r3 - 3 * 32]
1473
+ paddd m7, m15
1474
+ psrld m7, 5
1475
+ packusdw m6, m7
1476
+
1477
+ palignr m8, m0, m3, 8 ; [15 14 14 13 13 12 12 11 7 6 6 5 5 4 4 3]
1478
+ palignr m9, m2, m0, 8 ; [19 18 18 17 17 16 16 15 11 10 10 9 9 8 8 7]
1479
+ movu ym16, [r3 - 14 * 32] ; [4]
1480
+ vinserti32x8 m16, [r3 + 7 * 32] ,1 ; [25]
1481
+ pmaddwd m7, m8, m16
1482
+ paddd m7, m15
1483
+ psrld m7, 5
1484
+ pmaddwd m10, m9, m16
1485
+ paddd m10, m15
1486
+ psrld m10, 5
1487
+ packusdw m7, m10
1488
+ vextracti32x8 ym8, m7, 1
1489
+
1490
+ palignr m9, m0, m3, 12
1491
+ pmaddwd m9, [r3 - 4 * 32] ; [14]
1492
+ paddd m9, m15
1493
+ psrld m9, 5
1494
+ palignr m3, m2, m0, 12
1495
+ pmaddwd m3, [r3 - 4 * 32]
1496
+ paddd m3,m15
1497
+ psrld m3, 5
1498
+ packusdw m9, m3
1499
+
1500
+ movu ym16, [r3 - 15 * 32] ; [3]
1501
+ vinserti32x8 m16, [r3 + 6 * 32] ,1 ; [24]
1502
+ pmaddwd m10, m0, m16
1503
+ paddd m10, m15
1504
+ psrld m10, 5
1505
+ pmaddwd m3, m2, m16
1506
+ paddd m3,m15
1507
+ psrld m3, 5
1508
+ packusdw m10, m3
1509
+ vextracti32x8 ym11, m10, 1
1510
+
1511
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0
1512
+
1513
+ palignr m4, m2, m0, 4
1514
+ pmaddwd m4, [r3 - 5* 32] ; [13]
1515
+ paddd m4, m15
1516
+ psrld m4, 5
1517
+ palignr m5, m1, m2, 4
1518
+ pmaddwd m5, [r3 - 5 * 32]
1519
+ paddd m5, m15
1520
+ psrld m5, 5
1521
+ packusdw m4, m5
1522
+
1523
+ palignr m6, m2, m0, 8
1524
+ palignr m7, m1, m2, 8
1525
+ movu ym16, [r3 - 16 * 32] ; [2]
1526
+ vinserti32x8 m16, [r3 + 5 * 32] ,1 ; [23]
1527
+ pmaddwd m5, m6, m16
1528
+ paddd m5, m15
1529
+ psrld m5, 5
1530
+ palignr m7, m1, m2, 8
1531
+ pmaddwd m8, m7,m16
1532
+ paddd m8, m15
1533
+ psrld m8, 5
1534
+ packusdw m5, m8
1535
+ vextracti32x8 ym6, m5, 1
1536
+
1537
+
1538
+ palignr m7, m2, m0, 12
1539
+ pmaddwd m7, [r3 - 6 * 32] ; [12]
1540
+ paddd m7, m15
1541
+ psrld m7, 5
1542
+ palignr m8, m1, m2, 12
1543
+ pmaddwd m8, [r3 - 6 * 32]
1544
+ paddd m8, m15
1545
+ psrld m8, 5
1546
+ packusdw m7, m8
1547
+
1548
+ movu m0, [r2 + 34] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
1549
+ pmaddwd m8, m2, [r3 - 17 * 32] ; [1]
1550
+ paddd m8, m15
1551
+ psrld m8, 5
1552
+ pmaddwd m9, m1, [r3 - 17 * 32]
1553
+ paddd m9, m15
1554
+ psrld m9, 5
1555
+ packusdw m8, m9
1556
+
1557
+ palignr m3, m0, m0, 2 ; [ x 32 31 30 29 28 27 26 x 24 23 22 21 20 19 18]
1558
+ punpcklwd m0, m3 ; [29 29 28 28 27 27 26 22 21 20 20 19 19 18 18 17]
1559
+
1560
+ pmaddwd m9, m2, [r3 + 4 * 32] ; [22]
1561
+ paddd m9, m15
1562
+ psrld m9, 5
1563
+ pmaddwd m3, m1, [r3 + 4 * 32]
1564
+ paddd m3, m15
1565
+ psrld m3, 5
1566
+ packusdw m9, m3
1567
+
1568
+ palignr m10, m1, m2, 4
1569
+ pmaddwd m10, [r3 - 7 * 32] ; [11]
1570
+ paddd m10, m15
1571
+ psrld m10, 5
1572
+ palignr m11, m0, m1, 4
1573
+ pmaddwd m11, [r3 - 7 * 32]
1574
+ paddd m11, m15
1575
+ psrld m11, 5
1576
+ packusdw m10, m11
1577
+
1578
+ palignr m3, m1, m2, 8
1579
+ pmaddwd m3, [r3 - 18 * 32] ; [0]
1580
+ paddd m3, m15
1581
+ psrld m3, 5
1582
+ palignr m0, m1, 8
1583
+ pmaddwd m0, [r3 - 18 * 32]
1584
+ paddd m0, m15
1585
+ psrld m0, 5
1586
+ packusdw m3, m0
1587
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 3, 0, 1, 16
1588
+ ret
1589
+cglobal intra_pred_ang32_4, 3,8,17
1590
+ add r2, 128
1591
+ xor r6d, r6d
1592
+ lea r3, [ang_table_avx2 + 18 * 32]
1593
+ add r1d, r1d
1594
+ lea r4, [r1 * 3]
1595
+ lea r7, [r0 + 8 * r1]
1596
+ vbroadcasti32x8 m15, [pd_16]
1597
+ call ang16_mode_4_32
1598
+
1599
+ add r2, 22
1600
+ lea r0, [r0 + 32]
1601
+
1602
+ call ang32_mode_4_32
1603
+
1604
+ add r2, 10
1605
+ lea r0, [r7 + 8 * r1]
1606
+
1607
+ call ang16_mode_4_32
1608
+
1609
+ add r2, 22
1610
+ lea r0, [r0 + 32]
1611
+ call ang32_mode_4_32
1612
+ RET
1613
+cglobal intra_pred_ang32_32, 3,7,17
1614
+ xor r6d, r6d
1615
+ inc r6d
1616
+ lea r3, [ang_table_avx2 + 18 * 32]
1617
+ add r1d, r1d
1618
+ lea r4, [r1 * 3]
1619
+ lea r5, [r0 + 32]
1620
+ vbroadcasti32x8 m15, [pd_16]
1621
+ call ang16_mode_4_32
1622
+
1623
+ add r2, 22
1624
+
1625
+ call ang32_mode_4_32
1626
+
1627
+ add r2, 10
1628
+ mov r0, r5
1629
+
1630
+ call ang16_mode_4_32
1631
+ add r2, 22
1632
+ call ang32_mode_4_32
1633
+ RET
1634
+cglobal intra_pred_ang16_4, 3,7,17
1635
+ add r2, 64
1636
+ xor r6d, r6d
1637
+ vbroadcasti32x8 m15, [pd_16]
1638
+ lea r3, [ang_table_avx2 + 18 * 32]
1639
+ add r1d, r1d
1640
+ lea r4, [r1 * 3]
1641
+ call ang16_mode_4_32
1642
+ RET
1643
+cglobal intra_pred_ang16_32, 3,7,17
1644
+ xor r6d, r6d
1645
+ inc r6d
1646
+ vbroadcasti32x8 m15, [pd_16]
1647
+ lea r3, [ang_table_avx2 + 18 * 32]
1648
+ shl r1d, 1
1649
+ lea r4, [r1 * 3]
1650
+ call ang16_mode_4_32
1651
+ RET
1652
+;; angle 16, modes 6 and 30
1653
+cglobal ang16_mode_6_30
1654
+ test r6d, r6d
1655
+
1656
+ vbroadcasti32x8 m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1657
+ vbroadcasti32x8 m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1658
+
1659
+ punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
1660
+ punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
1661
+
1662
+ vbroadcasti32x8 m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
1663
+ vbroadcasti32x8 m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
1664
+ punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
1665
+ punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
1666
+
1667
+ movu ym16, [r3 - 2 * 32] ; [13]
1668
+ vinserti32x8 m16, [r3 + 11 * 32] ,1 ; [26]
1669
+ pmaddwd m4, m3, m16
1670
+ paddd m4, m15
1671
+ psrld m4, 5
1672
+ pmaddwd m5, m0, m16
1673
+ paddd m5, m15
1674
+ psrld m5, 5
1675
+ packusdw m4, m5
1676
+ vextracti32x8 ym5, m4, 1
1677
+
1678
+ palignr m7, m0, m3, 4
1679
+ palignr m8, m2, m0, 4
1680
+ movu ym16, [r3 - 8 * 32] ; [7]
1681
+ vinserti32x8 m16, [r3 + 5 * 32] ,1 ; [20]
1682
+ pmaddwd m6, m7, m16
1683
+ paddd m6, m15
1684
+ psrld m6, 5
1685
+ pmaddwd m9, m8, m16
1686
+ paddd m9, m15
1687
+ psrld m9, 5
1688
+ packusdw m6, m9
1689
+ vextracti32x8 ym7, m6, 1
1690
+
1691
+ palignr m10, m0, m3, 8
1692
+ palignr m11, m2, m0, 8
1693
+ movu ym16, [r3 - 14 * 32] ; [1]
1694
+ vinserti32x8 m16, [r3 - 1 * 32],1 ; [14]
1695
+ pmaddwd m8, m10, m16
1696
+ paddd m8,m15
1697
+ psrld m8, 5
1698
+ palignr m11, m2, m0, 8
1699
+ pmaddwd m9, m11, m16
1700
+ paddd m9, m15
1701
+ psrld m9, 5
1702
+ packusdw m8, m9
1703
+ vextracti32x8 ym9, m8, 1
1704
+
1705
+ pmaddwd m10, [r3 + 12 * 32] ; [27]
1706
+ paddd m10,m15
1707
+ psrld m10, 5
1708
+ pmaddwd m11, [r3 + 12 * 32]
1709
+ paddd m11, m15
1710
+ psrld m11, 5
1711
+ packusdw m10, m11
1712
+
1713
+ palignr m11, m0, m3, 12
1714
+ pmaddwd m11, [r3 - 7 * 32] ; [8]
1715
+ paddd m11, m15
1716
+ psrld m11, 5
1717
+ palignr m12, m2, m0, 12
1718
+ pmaddwd m12, [r3 - 7 * 32]
1719
+ paddd m12, m15
1720
+ psrld m12, 5
1721
+ packusdw m11, m12
1722
+
1723
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0
1724
+
1725
+ palignr m4, m0, m3, 12
1726
+ pmaddwd m4, [r3 + 6 * 32] ; [21]
1727
+ paddd m4, m15
1728
+ psrld m4, 5
1729
+ palignr m5, m2, m0, 12
1730
+ pmaddwd m5, [r3 + 6 * 32]
1731
+ paddd m5, m15
1732
+ psrld m5, 5
1733
+ packusdw m4, m5
1734
+
1735
+ movu ym16, [r3 - 13 * 32] ; [2]
1736
+ vinserti32x8 m16, [r3] ,1 ; [15]
1737
+ pmaddwd m5, m0, m16
1738
+ paddd m5, m15
1739
+ psrld m5, 5
1740
+ pmaddwd m3, m2,m16
1741
+ paddd m3, m15
1742
+ psrld m3, 5
1743
+ packusdw m5, m3
1744
+ vextracti32x8 ym6, m5, 1
1745
+
1746
+ pmaddwd m7, m0, [r3 + 13 * 32] ; [28]
1747
+ paddd m7, m15
1748
+ psrld m7, 5
1749
+ pmaddwd m3, m2, [r3 + 13 * 32]
1750
+ paddd m3, m15
1751
+ psrld m3, 5
1752
+ packusdw m7, m3
1753
+
1754
+ palignr m9, m2, m0, 4
1755
+ palignr m3, m1, m2, 4
1756
+ movu ym16, [r3 - 6 * 32] ; [9]
1757
+ vinserti32x8 m16, [r3 + 7 * 32],1 ; [22]
1758
+ pmaddwd m8, m9, m16
1759
+ paddd m8, m15
1760
+ psrld m8, 5
1761
+ pmaddwd m10, m3, m16
1762
+ paddd m10,m15
1763
+ psrld m10, 5
1764
+ packusdw m8, m10
1765
+ vextracti32x8 ym9, m8, 1
1766
+
1767
+
1768
+ palignr m11, m2, m0, 8
1769
+ pmaddwd m10, m11, [r3 - 12 * 32] ; [3]
1770
+ paddd m10, m15
1771
+ psrld m10, 5
1772
+ palignr m3, m1, m2, 8
1773
+ pmaddwd m12, m3, [r3 - 12 * 32]
1774
+ paddd m12, m15
1775
+ psrld m12, 5
1776
+ packusdw m10, m12
1777
+
1778
+ pmaddwd m11, [r3 + 1 * 32] ; [16]
1779
+ paddd m11, m15
1780
+ psrld m11, 5
1781
+ pmaddwd m3, [r3 + 1 * 32]
1782
+ paddd m3, m15
1783
+ psrld m3, 5
1784
+ packusdw m11, m3
1785
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 16
1786
+ ret
1787
+;; angle 32, modes 6 and 30
1788
+cglobal ang32_mode_6_30
1789
+ test r6d, r6d
1790
+
1791
+ vbroadcasti32x8 m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1792
+ vbroadcasti32x8 m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1793
+
1794
+ punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
1795
+ punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
1796
+
1797
+ vbroadcasti32x8 m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
1798
+ vbroadcasti32x8 m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
1799
+ punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
1800
+ punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
1801
+
1802
+ pmaddwd m4, m3, [r3 + 14 * 32] ; [29]
1803
+ paddd m4, m15
1804
+ psrld m4, 5
1805
+ pmaddwd m5, m0, [r3 + 14 * 32]
1806
+ paddd m5, m15
1807
+ psrld m5, 5
1808
+ packusdw m4, m5
1809
+
1810
+ palignr m6, m0, m3, 4
1811
+ palignr m7, m2, m0, 4
1812
+ movu ym16, [r3 - 5 * 32] ; [10]
1813
+ vinserti32x8 m16, [r3 + 8 * 32] ,1 ; [23]
1814
+ pmaddwd m5, m6, m16
1815
+ paddd m5, m15
1816
+ psrld m5, 5
1817
+ pmaddwd m8, m7, m16
1818
+ paddd m8, m15
1819
+ psrld m8, 5
1820
+ packusdw m5, m8
1821
+ vextracti32x8 ym6, m5, 1
1822
+
1823
+ palignr m9, m0, m3, 8
1824
+ palignr m12, m2, m0, 8
1825
+ movu ym16, [r3 - 11 * 32] ; [4]
1826
+ vinserti32x8 m16, [r3 + 2 * 32] ,1 ; [17]
1827
+ pmaddwd m7, m9, m16
1828
+ paddd m7,m15
1829
+ psrld m7, 5
1830
+ palignr m12, m2, m0, 8
1831
+ pmaddwd m11, m12,m16
1832
+ paddd m11,m15
1833
+ psrld m11, 5
1834
+ packusdw m7, m11
1835
+ vextracti32x8 ym8, m7, 1
1836
+
1837
+ pmaddwd m9, [r3 + 15 * 32] ; [30]
1838
+ paddd m9, m15
1839
+ psrld m9, 5
1840
+ pmaddwd m12, [r3 + 15 * 32]
1841
+ paddd m12, m15
1842
+ psrld m12, 5
1843
+ packusdw m9, m12
1844
+
1845
+ palignr m11, m0, m3, 12
1846
+ pmaddwd m10, m11, [r3 - 4 * 32] ; [11]
1847
+ paddd m10, m15
1848
+ psrld m10, 5
1849
+ palignr m12, m2, m0, 12
1850
+ pmaddwd m3, m12, [r3 - 4 * 32]
1851
+ paddd m3, m15
1852
+ psrld m3, 5
1853
+ packusdw m10, m3
1854
+
1855
+ pmaddwd m11, [r3 + 9 * 32] ; [24]
1856
+ paddd m11, m15
1857
+ psrld m11, 5
1858
+ pmaddwd m12, [r3 + 9 * 32]
1859
+ paddd m12,m15
1860
+ psrld m12, 5
1861
+ packusdw m11, m12
1862
+
1863
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0
1864
+
1865
+ movu ym16, [r3 - 10 * 32] ; [5]
1866
+ vinserti32x8 m16, [r3 + 3 * 32] ,1 ; [18]
1867
+ pmaddwd m4, m0, m16
1868
+ paddd m4, m15
1869
+ psrld m4, 5
1870
+ pmaddwd m5, m2, m16
1871
+ paddd m5, m15
1872
+ psrld m5, 5
1873
+ packusdw m4, m5
1874
+ vextracti32x8 ym5, m4, 1
1875
+
1876
+ pmaddwd m6, m0, [r3 + 16 * 32] ; [31]
1877
+ paddd m6,m15
1878
+ psrld m6, 5
1879
+ pmaddwd m7, m2, [r3 + 16 * 32]
1880
+ paddd m7,m15
1881
+ psrld m7, 5
1882
+ packusdw m6, m7
1883
+
1884
+ palignr m8, m2, m0, 4
1885
+ palignr m9, m1, m2, 4
1886
+ movu ym16, [r3 - 3 * 32] ; [12]
1887
+ vinserti32x8 m16, [r3 + 10 * 32],1 ; [25]
1888
+ pmaddwd m7, m8,m16
1889
+ paddd m7,m15
1890
+ psrld m7, 5
1891
+ pmaddwd m3, m9, m16
1892
+ paddd m3, m15
1893
+ psrld m3, 5
1894
+ packusdw m7, m3
1895
+ vextracti32x8 ym8, m7, 1
1896
+
1897
+ palignr m10, m2, m0, 8
1898
+ palignr m12, m1, m2, 8
1899
+ movu ym16, [r3 - 9 * 32] ; [6]
1900
+ vinserti32x8 m16, [r3 + 4 * 32] ,1 ; [19]
1901
+ pmaddwd m9, m10, m16
1902
+ paddd m9, m15
1903
+ psrld m9, 5
1904
+ pmaddwd m3, m12,m16
1905
+ paddd m3, m15
1906
+ psrld m3, 5
1907
+ packusdw m9, m3
1908
+ vextracti32x8 ym10, m9, 1
1909
+
1910
+
1911
+ palignr m11, m2, m0, 12
1912
+ pmaddwd m11, [r3 - 15 * 32] ; [0]
1913
+ paddd m11, m15
1914
+ psrld m11, 5
1915
+ palignr m3, m1, m2, 12
1916
+ pmaddwd m3, [r3 - 15 * 32]
1917
+ paddd m3, m15
1918
+ psrld m3, 5
1919
+ packusdw m11, m3
1920
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 16
1921
+ ret
1922
+cglobal intra_pred_ang32_6, 3,8,17
1923
+ add r2, 128
1924
+ xor r6d, r6d
1925
+ lea r3, [ang_table_avx2 + 15 * 32]
1926
+ add r1d, r1d
1927
+ lea r4, [r1 * 3]
1928
+ lea r7, [r0 + 8 * r1]
1929
+ vbroadcasti32x8 m15, [pd_16]
1930
+ call ang16_mode_6_30
1931
+
1932
+ add r2, 12
1933
+ lea r0, [r0 + 32]
1934
+
1935
+ call ang32_mode_6_30
1936
+
1937
+ add r2, 20
1938
+ lea r0, [r7 + 8 * r1]
1939
+
1940
+ call ang16_mode_6_30
1941
+
1942
+ add r2, 12
1943
+ lea r0, [r0 + 32]
1944
+ call ang32_mode_6_30
1945
+ RET
1946
+cglobal intra_pred_ang32_30, 3,7,17
1947
+ xor r6d, r6d
1948
+ inc r6d
1949
+ lea r3, [ang_table_avx2 + 15 * 32]
1950
+ add r1d, r1d
1951
+ lea r4, [r1 * 3]
1952
+ lea r5, [r0 + 32]
1953
+ vbroadcasti32x8 m15, [pd_16]
1954
+ call ang16_mode_6_30
1955
+
1956
+ add r2, 12
1957
+
1958
+ call ang32_mode_6_30
1959
+
1960
+ add r2, 20
1961
+ mov r0, r5
1962
+
1963
+ call ang16_mode_6_30
1964
+
1965
+ add r2, 12
1966
+ call ang32_mode_6_30
1967
+ RET
1968
+cglobal intra_pred_ang16_6, 3,7,17
1969
+ add r2, 64
1970
+ xor r6d, r6d
1971
+ vbroadcasti32x8 m15, [pd_16]
1972
+ lea r3, [ang_table_avx2 + 15 * 32]
1973
+ shl r1d, 1
1974
+ lea r4, [r1 * 3]
1975
+ call ang16_mode_6_30
1976
+ RET
1977
+cglobal intra_pred_ang16_30, 3,7,17
1978
+ xor r6d, r6d
1979
+ inc r6d
1980
+ vbroadcasti32x8 m15, [pd_16]
1981
+ lea r3, [ang_table_avx2 + 15 * 32]
1982
+ shl r1d, 1
1983
+ lea r4, [r1 * 3]
1984
+ call ang16_mode_6_30
1985
+ RET
1986
+
1987
+;; angle 16, modes 8 and 28
1988
+cglobal ang16_mode_8_28
1989
+ test r6d, r6d
1990
+
1991
+ vbroadcasti32x8 m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
1992
+ vbroadcasti32x8 m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
1993
+
1994
+ punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
1995
+ punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
1996
+
1997
+ vbroadcasti32x8 m2, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
1998
+ vbroadcasti32x8 m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
1999
+ punpcklwd m2, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
2000
+
2001
+ movu ym14, [r3 - 10 * 32]
2002
+ vinserti32x8 m14, [r3 - 5 * 32], 1
2003
+ pmaddwd m4, m3, m14 ; [5], [10]
2004
+ paddd m4, m15
2005
+ psrld m4, 5
2006
+ pmaddwd m5, m0, m14
2007
+ paddd m5, m15
2008
+ psrld m5, 5
2009
+ packusdw m4, m5
2010
+ vextracti32x8 ym5, m4, 1
2011
+
2012
+ movu ym14, [r3]
2013
+ vinserti32x8 m14, [r3 + 5 * 32], 1
2014
+ pmaddwd m6, m3, m14 ; [15], [20]
2015
+ paddd m6, m15
2016
+ psrld m6, 5
2017
+ pmaddwd m9, m0, m14
2018
+ paddd m9, m15
2019
+ psrld m9, 5
2020
+ packusdw m6, m9
2021
+ vextracti32x8 ym7, m6, 1
2022
+
2023
+ movu ym14, [r3 + 10 * 32]
2024
+ vinserti32x8 m14, [r3 + 15 * 32], 1
2025
+ pmaddwd m8, m3, m14 ; [25], [30]
2026
+ paddd m8, m15
2027
+ psrld m8, 5
2028
+ pmaddwd m9, m0, m14
2029
+ paddd m9, m15
2030
+ psrld m9, 5
2031
+ packusdw m8, m9
2032
+ vextracti32x8 ym9, m8, 1
2033
+
2034
+ palignr m11, m0, m3, 4
2035
+ movu ym14, [r3 - 12 * 32]
2036
+ vinserti32x8 m14, [r3 - 7 * 32], 1
2037
+ pmaddwd m10, m11, m14 ; [3], [8]
2038
+ paddd m10, m15
2039
+ psrld m10, 5
2040
+ palignr m1, m2, m0, 4
2041
+ pmaddwd m12, m1, m14
2042
+ paddd m12, m15
2043
+ psrld m12, 5
2044
+ packusdw m10, m12
2045
+ vextracti32x8 ym11, m10, 1
2046
+
2047
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 0
2048
+
2049
+ palignr m7, m0, m3, 4
2050
+ movu ym14, [r3 - 2 * 32]
2051
+ vinserti32x8 m14, [r3 + 3 * 32], 1
2052
+ pmaddwd m4, m7, m14 ; [13], [18]
2053
+ paddd m4, m15
2054
+ psrld m4, 5
2055
+ palignr m1, m2, m0, 4
2056
+ pmaddwd m5, m1, m14
2057
+ paddd m5, m15
2058
+ psrld m5, 5
2059
+ packusdw m4, m5
2060
+ vextracti32x8 ym5, m4, 1
2061
+
2062
+ movu ym14, [r3 + 8 * 32]
2063
+ vinserti32x8 m14, [r3 + 13 * 32], 1
2064
+ pmaddwd m6, m7, m14 ; [23], [28]
2065
+ paddd m6, m15
2066
+ psrld m6, 5
2067
+ pmaddwd m8, m1, m14
2068
+ paddd m8, m15
2069
+ psrld m8, 5
2070
+ packusdw m6, m8
2071
+ vextracti32x8 ym7, m6, 1
2072
+
2073
+ movu ym14, [r3 - 14 * 32]
2074
+ vinserti32x8 m14, [r3 - 9 * 32], 1
2075
+ palignr m1, m0, m3, 8
2076
+ pmaddwd m8, m1, m14 ; [1], [6]
2077
+ paddd m8, m15
2078
+ psrld m8, 5
2079
+ palignr m2, m0, 8
2080
+ pmaddwd m9, m2, m14
2081
+ paddd m9, m15
2082
+ psrld m9, 5
2083
+ packusdw m8, m9
2084
+ vextracti32x8 ym9, m8, 1
2085
+
2086
+ movu ym14, [r3 - 4 * 32]
2087
+ vinserti32x8 m14, [r3 + 1 * 32], 1
2088
+ pmaddwd m3, m1, m14 ; [11], [16]
2089
+ paddd m3, m15
2090
+ psrld m3, 5
2091
+ pmaddwd m0, m2, m14
2092
+ paddd m0, m15
2093
+ psrld m0, 5
2094
+ packusdw m3, m0
2095
+ vextracti32x8 ym1, m3, 1
2096
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 3, 1, 0, 2, 16
2097
+ ret
2098
+
2099
+;; angle 32, modes 8 and 28
2100
+cglobal ang32_mode_8_28
2101
+ test r6d, r6d
2102
+
2103
+ vbroadcasti32x8 m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2104
+ vbroadcasti32x8 m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2105
+
2106
+ punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
2107
+ punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
2108
+
2109
+ vbroadcasti32x8 m2, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
2110
+ vbroadcasti32x8 m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
2111
+ punpcklwd m2, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
2112
+
2113
+ movu ym14, [r3 + 6 * 32]
2114
+ vinserti32x8 m14, [r3 + 11 * 32], 1
2115
+ pmaddwd m4, m3, m14 ; [21], [26]
2116
+ paddd m4, m15
2117
+ psrld m4, 5
2118
+ pmaddwd m5, m0, m14
2119
+ paddd m5, m15
2120
+ psrld m5, 5
2121
+ packusdw m4, m5
2122
+ vextracti32x8 ym5, m4, 1
2123
+
2124
+ pmaddwd m6, m3, [r3 + 16 * 32] ; [31]
2125
+ paddd m6, [pd_16]
2126
+ psrld m6, 5
2127
+ pmaddwd m9, m0, [r3 + 16 * 32]
2128
+ paddd m9, [pd_16]
2129
+ psrld m9, 5
2130
+ packusdw m6, m9
2131
+
2132
+ palignr m11, m0, m3, 4
2133
+ movu ym14, [r3 - 11 * 32]
2134
+ vinserti32x8 m14, [r3 - 6 * 32], 1
2135
+ pmaddwd m7, m11, m14 ; [4], [9]
2136
+ paddd m7, m15
2137
+ psrld m7, 5
2138
+ palignr m1, m2, m0, 4
2139
+ pmaddwd m8, m1, m14
2140
+ paddd m8, m15
2141
+ psrld m8, 5
2142
+ packusdw m7, m8
2143
+ vextracti32x8 ym8, m7, 1
2144
+
2145
+ movu ym14, [r3 - 1 * 32]
2146
+ vinserti32x8 m14, [r3 + 4 * 32], 1
2147
+ pmaddwd m9, m11, m14 ; [14], [19]
2148
+ paddd m9, m15
2149
+ psrld m9, 5
2150
+ pmaddwd m10, m1, m14
2151
+ paddd m10, m15
2152
+ psrld m10, 5
2153
+ packusdw m9, m10
2154
+ vextracti32x8 ym10, m9, 1
2155
+
2156
+ pmaddwd m11, [r3 + 9 * 32] ; [24]
2157
+ paddd m11, [pd_16]
2158
+ psrld m11, 5
2159
+ pmaddwd m1, [r3 + 9 * 32]
2160
+ paddd m1, [pd_16]
2161
+ psrld m1, 5
2162
+ packusdw m11, m1
2163
+
2164
+TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 0
2165
+
2166
+ palignr m4, m0, m3, 4
2167
+ pmaddwd m4, [r3 + 14 * 32] ; [29]
2168
+ paddd m4, m15
2169
+ psrld m4, 5
2170
+ palignr m5, m2, m0, 4
2171
+ pmaddwd m5, [r3 + 14 * 32]
2172
+ paddd m5, m15
2173
+ psrld m5, 5
2174
+ packusdw m4, m5
2175
+
2176
+ palignr m1, m0, m3, 8
2177
+ pmaddwd m5, m1, [r3 - 13 * 32] ; [2]
2178
+ paddd m5, m15
2179
+ psrld m5, 5
2180
+ palignr m10, m2, m0, 8
2181
+ pmaddwd m6, m10, [r3 - 13 * 32]
2182
+ paddd m6, m15
2183
+ psrld m6, 5
2184
+ packusdw m5, m6
2185
+
2186
+ movu ym14, [r3 - 8 * 32]
2187
+ vinserti32x8 m14, [r3 - 3 * 32], 1
2188
+ pmaddwd m6, m1, m14 ; [7], [12]
2189
+ paddd m6, m15
2190
+ psrld m6, 5
2191
+ pmaddwd m8, m10, m14
2192
+ paddd m8, m15
2193
+ psrld m8, 5
2194
+ packusdw m6, m8
2195
+ vextracti32x8 ym7, m6, 1
2196
+
2197
+ movu ym14, [r3 + 2 * 32]
2198
+ vinserti32x8 m14, [r3 + 7 * 32], 1
2199
+ pmaddwd m8, m1, m14 ; [17], [22]
2200
+ paddd m8, m15
2201
+ psrld m8, 5
2202
+ pmaddwd m9, m10, m14
2203
+ paddd m9, m15
2204
+ psrld m9, 5
2205
+ packusdw m8, m9
2206
+ vextracti32x8 ym9, m8, 1
2207
+
2208
+ pmaddwd m1, [r3 + 12 * 32] ; [27]
2209
+ paddd m1, [pd_16]
2210
+ psrld m1, 5
2211
+ pmaddwd m10, [r3 + 12 * 32]
2212
+ paddd m10, [pd_16]
2213
+ psrld m10, 5
2214
+ packusdw m1, m10
2215
+
2216
+ palignr m11, m0, m3, 12
2217
+ pmaddwd m11, [r3 - 15 * 32] ; [0]
2218
+ paddd m11, [pd_16]
2219
+ psrld m11, 5
2220
+ palignr m2, m0, 12
2221
+ pmaddwd m2, [r3 - 15 * 32]
2222
+ paddd m2, [pd_16]
2223
+ psrld m2, 5
2224
+ packusdw m11, m2
2225
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 1, 11, 0, 2, 16
2226
+ ret
2227
+
2228
+
2229
+cglobal intra_pred_ang32_8, 3,8,16
2230
+ add r2, 128
2231
+ xor r6d, r6d
2232
+ lea r3, [ang_table_avx2 + 15 * 32]
2233
+ add r1d, r1d
2234
+ lea r4, [r1 * 3]
2235
+ lea r7, [r0 + 8 * r1]
2236
+ vbroadcasti32x8 m15, [pd_16]
2237
+
2238
+ call ang16_mode_8_28
2239
+
2240
+ add r2, 4
2241
+ lea r0, [r0 + 32]
2242
+
2243
+ call ang32_mode_8_28
2244
+
2245
+ add r2, 28
2246
+ lea r0, [r7 + 8 * r1]
2247
+
2248
+ call ang16_mode_8_28
2249
+
2250
+ add r2, 4
2251
+ lea r0, [r0 + 32]
2252
+
2253
+ call ang32_mode_8_28
2254
+ RET
2255
+
2256
+cglobal intra_pred_ang32_28, 3,7,16
2257
+ xor r6d, r6d
2258
+ inc r6d
2259
+ lea r3, [ang_table_avx2 + 15 * 32]
2260
+ add r1d, r1d
2261
+ lea r4, [r1 * 3]
2262
+ lea r5, [r0 + 32]
2263
+ vbroadcasti32x8 m15, [pd_16]
2264
+ call ang16_mode_8_28
2265
+
2266
+ add r2, 4
2267
+
2268
+ call ang32_mode_8_28
2269
+
2270
+ add r2, 28
2271
+ mov r0, r5
2272
+
2273
+ call ang16_mode_8_28
2274
+
2275
+ add r2, 4
2276
+ call ang32_mode_8_28
2277
+ RET
2278
+
2279
+ cglobal intra_pred_ang16_8, 3,7,16
2280
+ add r2, 64
2281
+ xor r6d, r6d
2282
+ lea r3, [ang_table_avx2 + 15 * 32]
2283
+ add r1d, r1d
2284
+ lea r4, [r1 * 3]
2285
+ vbroadcasti32x8 m15, [pd_16]
2286
+
2287
+ call ang16_mode_8_28
2288
+ RET
2289
+
2290
+cglobal intra_pred_ang16_28, 3,7,16
2291
+ xor r6d, r6d
2292
+ inc r6d
2293
+ lea r3, [ang_table_avx2 + 15 * 32]
2294
+ add r1d, r1d
2295
+ lea r4, [r1 * 3]
2296
+ vbroadcasti32x8 m15, [pd_16]
2297
+
2298
+ call ang16_mode_8_28
2299
+ RET
2300
+
2301
+;; angle 16, modes 7 and 29
2302
+cglobal ang16_mode_7_29
2303
+ test r6d, r6d
2304
+
2305
+ vbroadcasti32x8 m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2306
+ vbroadcasti32x8 m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2307
+
2308
+ punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
2309
+ punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
2310
+
2311
+ vbroadcasti32x8 m2, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
2312
+ vbroadcasti32x8 m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
2313
+ punpcklwd m2, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
2314
+
2315
+ movu ym16, [r3 - 8 * 32] ; [9]
2316
+ vinserti32x8 m16, [r3 + 1 * 32] ,1 ; [18]
2317
+ pmaddwd m4, m3,m16
2318
+ paddd m4, m15
2319
+ psrld m4, 5
2320
+ pmaddwd m5, m0, m16
2321
+ paddd m5, m15
2322
+ psrld m5, 5
2323
+ packusdw m4, m5
2324
+ vextracti32x8 ym5, m4, 1
2325
+
2326
+ pmaddwd m6, m3, [r3 + 10 * 32] ; [27]
2327
+ paddd m6, m15
2328
+ psrld m6, 5
2329
+ pmaddwd m9, m0, [r3 + 10 * 32]
2330
+ paddd m9, m15
2331
+ psrld m9, 5
2332
+ packusdw m6, m9
2333
+
2334
+ palignr m10, m0, m3, 4
2335
+ pmaddwd m7, m10, [r3 - 13 * 32] ; [4]
2336
+ paddd m7, m15
2337
+ psrld m7, 5
2338
+ palignr m11, m2, m0, 4
2339
+ pmaddwd m8, m11, [r3 - 13 * 32]
2340
+ paddd m8, m15
2341
+ psrld m8, 5
2342
+ packusdw m7, m8
2343
+
2344
+ movu ym16, [r3 - 4 * 32] ; [13]
2345
+ vinserti32x8 m16, [r3 + 5 * 32],1 ; [22]
2346
+ pmaddwd m8, m10, m16
2347
+ paddd m8, m15
2348
+ psrld m8, 5
2349
+ pmaddwd m9, m11, m16
2350
+ paddd m9, m15
2351
+ psrld m9, 5
2352
+ packusdw m8, m9
2353
+ vextracti32x8 ym9, m8, 1
2354
+
2355
+ pmaddwd m10, [r3 + 14 * 32] ; [31]
2356
+ paddd m10, m15
2357
+ psrld m10, 5
2358
+ pmaddwd m11, [r3 + 14 * 32]
2359
+ paddd m11, m15
2360
+ psrld m11, 5
2361
+ packusdw m10, m11
2362
+
2363
+ palignr m11, m0, m3, 8
2364
+ pmaddwd m11, [r3 - 9 * 32] ; [8]
2365
+ paddd m11, m15
2366
+ psrld m11, 5
2367
+ palignr m12, m2, m0, 8
2368
+ pmaddwd m12, [r3 - 9 * 32]
2369
+ paddd m12, m15
2370
+ psrld m12, 5
2371
+ packusdw m11, m12
2372
+
2373
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 0
2374
+
2375
+ palignr m5, m0, m3, 8
2376
+ palignr m6, m2, m0, 8
2377
+ movu ym16, [r3] ; [17]
2378
+ vinserti32x8 m16, [r3 + 9 * 32] ,1 ; [26]
2379
+ pmaddwd m4, m5, m16
2380
+ paddd m4, m15
2381
+ psrld m4, 5
2382
+ pmaddwd m7, m6, m16
2383
+ paddd m7, m15
2384
+ psrld m7, 5
2385
+ packusdw m4, m7
2386
+ vextracti32x8 ym5, m4, 1
2387
+
2388
+
2389
+ palignr m9, m0, m3, 12
2390
+ palignr m3, m2, m0, 12
2391
+ movu ym16, [r3 - 14 * 32] ; [3]
2392
+ vinserti32x8 m16, [r3 - 5 * 32] ,1 ; [12]
2393
+ pmaddwd m6, m9,m16
2394
+ paddd m6, m15
2395
+ psrld m6, 5
2396
+ pmaddwd m7, m3,m16
2397
+ paddd m7, m15
2398
+ psrld m7, 5
2399
+ packusdw m6, m7
2400
+ vextracti32x8 ym7, m6, 1
2401
+
2402
+ movu ym16, [r3 + 4 * 32] ; [21]
2403
+ vinserti32x8 m16, [r3 + 13 * 32] ,1 ; [30]
2404
+ pmaddwd m8, m9,m16
2405
+ paddd m8, m15
2406
+ psrld m8, 5
2407
+ pmaddwd m10, m3, m16
2408
+ paddd m10, m15
2409
+ psrld m10, 5
2410
+ packusdw m8, m10
2411
+ vextracti32x8 ym9, m8, 1
2412
+
2413
+ movu ym16,[r3 - 10 * 32] ; [7]
2414
+ vinserti32x8 m16, [r3 - 1 * 32] ,1 ; [16]
2415
+ pmaddwd m10, m0, m16
2416
+ paddd m10, m15
2417
+ psrld m10, 5
2418
+ pmaddwd m12, m2, m16
2419
+ paddd m12, m15
2420
+ psrld m12, 5
2421
+ packusdw m10, m12
2422
+ vextracti32x8 ym0, m10, 1
2423
+
2424
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 16
2425
+ ret
2426
+;; angle 32, modes 7 and 29
2427
+cglobal ang32_mode_7_29
2428
+ test r6d, r6d
2429
+
2430
+ vbroadcasti32x8 m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
2431
+ vbroadcasti32x8 m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
2432
+
2433
+ punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1]
2434
+ punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5]
2435
+
2436
+ vbroadcasti32x8 m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9]
2437
+ vbroadcasti32x8 m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
2438
+ punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9]
2439
+ punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
2440
+
2441
+ pmaddwd m4, m3, [r3 + 8 * 32] ; [25]
2442
+ paddd m4, m15
2443
+ psrld m4, 5
2444
+ pmaddwd m5, m0, [r3 + 8 * 32]
2445
+ paddd m5, m15
2446
+ psrld m5, 5
2447
+ packusdw m4, m5
2448
+
2449
+ palignr m8, m0, m3, 4
2450
+ pmaddwd m5, m8, [r3 - 15 * 32] ; [2]
2451
+ paddd m5, m15
2452
+ psrld m5, 5
2453
+ palignr m9, m2, m0, 4
2454
+ pmaddwd m10, m9, [r3 - 15 * 32]
2455
+ paddd m10, m15
2456
+ psrld m10, 5
2457
+ packusdw m5, m10
2458
+
2459
+ movu ym16,[r3 - 6 * 32] ; [11]
2460
+ vinserti32x8 m16, [r3 + 3 * 32],1 ; [20]
2461
+ pmaddwd m6, m8, m16
2462
+ paddd m6, m15
2463
+ psrld m6, 5
2464
+ pmaddwd m7, m9, m16
2465
+ paddd m7, m15
2466
+ psrld m7, 5
2467
+ packusdw m6, m7
2468
+ vextracti32x8 ym7, m6, 1
2469
+
2470
+ pmaddwd m8, [r3 + 12 * 32] ; [29]
2471
+ paddd m8, m15
2472
+ psrld m8, 5
2473
+ pmaddwd m9, [r3 + 12 * 32]
2474
+ paddd m9, m15
2475
+ psrld m9, 5
2476
+ packusdw m8, m9
2477
+
2478
+ palignr m11, m0, m3, 8
2479
+ palignr m12, m2, m0, 8
2480
+ movu ym16, [r3 - 11 * 32] ; [6]
2481
+ vinserti32x8 m16, [r3 - 2 * 32] ,1 ; [15]
2482
+ pmaddwd m9, m11, m16
2483
+ paddd m9, m15
2484
+ psrld m9, 5
2485
+ palignr m12, m2, m0, 8
2486
+ pmaddwd m10, m12, m16
2487
+ paddd m10, m15
2488
+ psrld m10, 5
2489
+ packusdw m9, m10
2490
+ vextracti32x8 ym10, m9, 1
2491
+
2492
+ pmaddwd m11, [r3 + 7 * 32] ; [24]
2493
+ paddd m11, m15
2494
+ psrld m11, 5
2495
+ pmaddwd m12, [r3 + 7 * 32]
2496
+ paddd m12, m15
2497
+ psrld m12, 5
2498
+ packusdw m11, m12
2499
+
2500
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0
2501
+
2502
+ palignr m5, m0, m3, 12
2503
+ palignr m6, m2, m0, 12
2504
+ movu ym16, [r3 - 16 * 32] ; [1]
2505
+ vinserti32x8 m16, [r3 - 7 * 32] ,1 ; [10]
2506
+ pmaddwd m4, m5, m16
2507
+ paddd m4, m15
2508
+ psrld m4, 5
2509
+ pmaddwd m7, m6, m16
2510
+ paddd m7, m15
2511
+ psrld m7, 5
2512
+ packusdw m4, m7
2513
+ vextracti32x8 ym5, m4, 1
2514
+
2515
+ palignr m9, m0, m3, 12
2516
+ pmaddwd m6, m9, [r3 + 2 * 32] ; [19]
2517
+ paddd m6, m15
2518
+ psrld m6, 5
2519
+ palignr m3, m2, m0, 12
2520
+ pmaddwd m7, m3, [r3 + 2 * 32]
2521
+ paddd m7, m15
2522
+ psrld m7, 5
2523
+ packusdw m6, m7
2524
+
2525
+ pmaddwd m7, m9, [r3 + 11 * 32] ; [28]
2526
+ paddd m7, m15
2527
+ psrld m7, 5
2528
+ pmaddwd m8, m3, [r3 + 11 * 32]
2529
+ paddd m8, m15
2530
+ psrld m8, 5
2531
+ packusdw m7, m8
2532
+
2533
+ movu ym16, [r3 - 12 * 32] ; [5]
2534
+ vinserti32x8 m16, [r3 - 3 * 32] ,1 ; [14]
2535
+ pmaddwd m8, m0, m16
2536
+ paddd m8, m15
2537
+ psrld m8, 5
2538
+ pmaddwd m10, m2, m16
2539
+ paddd m10,m15
2540
+ psrld m10, 5
2541
+ packusdw m8, m10
2542
+ vextracti32x8 ym9, m8, 1
2543
+
2544
+ pmaddwd m10, m0, [r3 + 6 * 32] ; [23]
2545
+ paddd m10,m15
2546
+ psrld m10, 5
2547
+ pmaddwd m12, m2, [r3 + 6 * 32]
2548
+ paddd m12, m15
2549
+ psrld m12, 5
2550
+ packusdw m10, m12
2551
+
2552
+ palignr m11, m2, m0, 4
2553
+ pmaddwd m11, [r3 - 17 * 32] ; [0]
2554
+ paddd m11, m15
2555
+ psrld m11, 5
2556
+ palignr m12, m1, m2, 4
2557
+ pmaddwd m12, [r3 - 17 * 32]
2558
+ paddd m12, m15
2559
+ psrld m12, 5
2560
+ packusdw m11, m12
2561
+ TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 3, 2, 16
2562
+ ret
2563
+
2564
+cglobal intra_pred_ang32_7, 3,8,17
2565
+ add r2, 128
2566
+ xor r6d, r6d
2567
+ lea r3, [ang_table_avx2 + 17 * 32]
2568
+ add r1d, r1d
2569
+ lea r4, [r1 * 3]
2570
+ lea r7, [r0 + 8 * r1]
2571
+ vbroadcasti32x8 m15, [pd_16]
2572
+ call ang16_mode_7_29
2573
+
2574
+ add r2, 8
2575
+ lea r0, [r0 + 32]
2576
+
2577
+ call ang32_mode_7_29
2578
+
2579
+ add r2, 24
2580
+ lea r0, [r7 + 8 * r1]
2581
+
2582
+ call ang16_mode_7_29
2583
+
2584
+ add r2, 8
2585
+ lea r0, [r0 + 32]
2586
+
2587
+ call ang32_mode_7_29
2588
+ RET
2589
+
2590
+cglobal intra_pred_ang32_29, 3,7,17
2591
+ xor r6d, r6d
2592
+ inc r6d
2593
+ lea r3, [ang_table_avx2 + 17 * 32]
2594
+ add r1d, r1d
2595
+ lea r4, [r1 * 3]
2596
+ lea r5, [r0 + 32]
2597
+ vbroadcasti32x8 m15, [pd_16]
2598
+ call ang16_mode_7_29
2599
+
2600
+ add r2, 8
2601
+
2602
+ call ang32_mode_7_29
2603
+
2604
+ add r2, 24
2605
+ mov r0, r5
2606
+
2607
+ call ang16_mode_7_29
2608
+ add r2, 8
2609
+ call ang32_mode_7_29
2610
+ RET
2611
+cglobal intra_pred_ang16_7, 3,7,17
2612
+ add r2, 64
2613
+ xor r6d, r6d
2614
+ vbroadcasti32x8 m15, [pd_16]
2615
+ lea r3, [ang_table_avx2 + 17 * 32]
2616
+ add r1d, r1d
2617
+ lea r4, [r1 * 3]
2618
+
2619
+ call ang16_mode_7_29
2620
+ RET
2621
+
2622
+cglobal intra_pred_ang16_29, 3,7,17
2623
+ xor r6d, r6d
2624
+ inc r6d
2625
+ vbroadcasti32x8 m15, [pd_16]
2626
+ lea r3, [ang_table_avx2 + 17 * 32]
2627
+ add r1d, r1d
2628
+ lea r4, [r1 * 3]
2629
+
2630
+ call ang16_mode_7_29
2631
+ RET
2632
+;-------------------------------------------------------------------------------------------------------
2633
+; avx512 code for intra_pred_ang32 mode 2 to 34 end
2634
+;-------------------------------------------------------------------------------------------------------
2635
%macro MODE_2_34 0
2636
movu m0, [r2 + 4]
2637
movu m1, [r2 + 20]
2638
x265_2.7.tar.gz/source/common/x86/ipfilter16.asm -> x265_2.9.tar.gz/source/common/x86/ipfilter16.asm
Changed
9510
1
2
%endif
3
4
5
-SECTION_RODATA 32
6
+SECTION_RODATA 64
7
8
tab_c_524800: times 4 dd 524800
9
tab_c_n8192: times 8 dw -8192
10
pd_524800: times 8 dd 524800
11
12
+tab_ChromaCoeff: dw 0, 64, 0, 0
13
+ dw -2, 58, 10, -2
14
+ dw -4, 54, 16, -2
15
+ dw -6, 46, 28, -4
16
+ dw -4, 36, 36, -4
17
+ dw -4, 28, 46, -6
18
+ dw -2, 16, 54, -4
19
+ dw -2, 10, 58, -2
20
+
21
+tab_LumaCoeff: dw 0, 0, 0, 64, 0, 0, 0, 0
22
+ dw -1, 4, -10, 58, 17, -5, 1, 0
23
+ dw -1, 4, -11, 40, 40, -11, 4, -1
24
+ dw 0, 1, -5, 17, 58, -10, 4, -1
25
+
26
+ALIGN 64
27
+tab_LumaCoeffH_avx512:
28
+ times 4 dw 0, 0, 0, 64, 0, 0, 0, 0
29
+ times 4 dw -1, 4, -10, 58, 17, -5, 1, 0
30
+ times 4 dw -1, 4, -11, 40, 40, -11, 4, -1
31
+ times 4 dw 0, 1, -5, 17, 58, -10, 4, -1
32
+
33
ALIGN 32
34
tab_LumaCoeffV: times 4 dw 0, 0
35
times 4 dw 0, 64
36
37
times 4 dw -5, 17
38
times 4 dw 58, -10
39
times 4 dw 4, -1
40
+
41
ALIGN 32
42
tab_LumaCoeffVer: times 8 dw 0, 0
43
times 8 dw 0, 64
44
45
times 8 dw -5, 17
46
times 8 dw 58, -10
47
times 8 dw 4, -1
48
-
49
+
50
+ALIGN 64
51
+const tab_ChromaCoeffV_avx512, times 16 dw 0, 64
52
+ times 16 dw 0, 0
53
+
54
+ times 16 dw -2, 58
55
+ times 16 dw 10, -2
56
+
57
+ times 16 dw -4, 54
58
+ times 16 dw 16, -2
59
+
60
+ times 16 dw -6, 46
61
+ times 16 dw 28, -4
62
+
63
+ times 16 dw -4, 36
64
+ times 16 dw 36, -4
65
+
66
+ times 16 dw -4, 28
67
+ times 16 dw 46, -6
68
+
69
+ times 16 dw -2, 16
70
+ times 16 dw 54, -4
71
+
72
+ times 16 dw -2, 10
73
+ times 16 dw 58, -2
74
+
75
+ALIGN 64
76
+tab_LumaCoeffVer_avx512: times 16 dw 0, 0
77
+ times 16 dw 0, 64
78
+ times 16 dw 0, 0
79
+ times 16 dw 0, 0
80
+
81
+ times 16 dw -1, 4
82
+ times 16 dw -10, 58
83
+ times 16 dw 17, -5
84
+ times 16 dw 1, 0
85
+
86
+ times 16 dw -1, 4
87
+ times 16 dw -11, 40
88
+ times 16 dw 40, -11
89
+ times 16 dw 4, -1
90
+
91
+ times 16 dw 0, 1
92
+ times 16 dw -5, 17
93
+ times 16 dw 58, -10
94
+ times 16 dw 4, -1
95
+
96
+ALIGN 64
97
+const interp8_hpp_shuf1_load_avx512, times 4 db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
98
+
99
+ALIGN 64
100
+const interp8_hpp_shuf2_load_avx512, times 4 db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
101
+
102
+ALIGN 64
103
+const interp8_hpp_shuf1_store_avx512, times 4 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
104
+
105
SECTION .text
106
cextern pd_8
107
cextern pd_32
108
109
;-------------------------------------------------------------------------------------------------------------
110
; void interp_8tap_vert_pp_%2x%3(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
111
;-------------------------------------------------------------------------------------------------------------
112
+%if ARCH_X86_64
113
FILTER_VER_LUMA_sse2 pp, 4, 4
114
FILTER_VER_LUMA_sse2 pp, 8, 8
115
FILTER_VER_LUMA_sse2 pp, 8, 4
116
117
FILTER_VER_LUMA_sse2 ps, 48, 64
118
FILTER_VER_LUMA_sse2 ps, 64, 16
119
FILTER_VER_LUMA_sse2 ps, 16, 64
120
+%endif
121
+
122
+;-----------------------------------------------------------------------------
123
+;p2s and p2s_aligned avx512 code start
124
+;-----------------------------------------------------------------------------
125
+%macro P2S_64x4_AVX512 0
126
+ movu m0, [r0]
127
+ movu m1, [r0 + r1]
128
+ movu m2, [r0 + r1 * 2]
129
+ movu m3, [r0 + r5]
130
+ psllw m0, (14 - BIT_DEPTH)
131
+ psllw m1, (14 - BIT_DEPTH)
132
+ psllw m2, (14 - BIT_DEPTH)
133
+ psllw m3, (14 - BIT_DEPTH)
134
+ psubw m0, m4
135
+ psubw m1, m4
136
+ psubw m2, m4
137
+ psubw m3, m4
138
+ movu [r2], m0
139
+ movu [r2 + r3], m1
140
+ movu [r2 + r3 * 2], m2
141
+ movu [r2 + r4], m3
142
+
143
+ movu m0, [r0 + mmsize]
144
+ movu m1, [r0 + r1 + mmsize]
145
+ movu m2, [r0 + r1 * 2 + mmsize]
146
+ movu m3, [r0 + r5 + mmsize]
147
+ psllw m0, (14 - BIT_DEPTH)
148
+ psllw m1, (14 - BIT_DEPTH)
149
+ psllw m2, (14 - BIT_DEPTH)
150
+ psllw m3, (14 - BIT_DEPTH)
151
+ psubw m0, m4
152
+ psubw m1, m4
153
+ psubw m2, m4
154
+ psubw m3, m4
155
+ movu [r2 + mmsize], m0
156
+ movu [r2 + r3 + mmsize], m1
157
+ movu [r2 + r3 * 2 + mmsize], m2
158
+ movu [r2 + r4 + mmsize], m3
159
+%endmacro
160
+
161
+%macro P2S_ALIGNED_64x4_AVX512 0
162
+ mova m0, [r0]
163
+ mova m1, [r0 + r1]
164
+ mova m2, [r0 + r1 * 2]
165
+ mova m3, [r0 + r5]
166
+ psllw m0, (14 - BIT_DEPTH)
167
+ psllw m1, (14 - BIT_DEPTH)
168
+ psllw m2, (14 - BIT_DEPTH)
169
+ psllw m3, (14 - BIT_DEPTH)
170
+ psubw m0, m4
171
+ psubw m1, m4
172
+ psubw m2, m4
173
+ psubw m3, m4
174
+ mova [r2], m0
175
+ mova [r2 + r3], m1
176
+ mova [r2 + r3 * 2], m2
177
+ mova [r2 + r4], m3
178
+
179
+ mova m0, [r0 + mmsize]
180
+ mova m1, [r0 + r1 + mmsize]
181
+ mova m2, [r0 + r1 * 2 + mmsize]
182
+ mova m3, [r0 + r5 + mmsize]
183
+ psllw m0, (14 - BIT_DEPTH)
184
+ psllw m1, (14 - BIT_DEPTH)
185
+ psllw m2, (14 - BIT_DEPTH)
186
+ psllw m3, (14 - BIT_DEPTH)
187
+ psubw m0, m4
188
+ psubw m1, m4
189
+ psubw m2, m4
190
+ psubw m3, m4
191
+ mova [r2 + mmsize], m0
192
+ mova [r2 + r3 + mmsize], m1
193
+ mova [r2 + r3 * 2 + mmsize], m2
194
+ mova [r2 + r4 + mmsize], m3
195
+%endmacro
196
+
197
+%macro P2S_32x4_AVX512 0
198
+ movu m0, [r0]
199
+ movu m1, [r0 + r1]
200
+ movu m2, [r0 + r1 * 2]
201
+ movu m3, [r0 + r5]
202
+ psllw m0, (14 - BIT_DEPTH)
203
+ psllw m1, (14 - BIT_DEPTH)
204
+ psllw m2, (14 - BIT_DEPTH)
205
+ psllw m3, (14 - BIT_DEPTH)
206
+ psubw m0, m4
207
+ psubw m1, m4
208
+ psubw m2, m4
209
+ psubw m3, m4
210
+ movu [r2], m0
211
+ movu [r2 + r3], m1
212
+ movu [r2 + r3 * 2], m2
213
+ movu [r2 + r4], m3
214
+%endmacro
215
+
216
+%macro P2S_ALIGNED_32x4_AVX512 0
217
+ mova m0, [r0]
218
+ mova m1, [r0 + r1]
219
+ mova m2, [r0 + r1 * 2]
220
+ mova m3, [r0 + r5]
221
+ psllw m0, (14 - BIT_DEPTH)
222
+ psllw m1, (14 - BIT_DEPTH)
223
+ psllw m2, (14 - BIT_DEPTH)
224
+ psllw m3, (14 - BIT_DEPTH)
225
+ psubw m0, m4
226
+ psubw m1, m4
227
+ psubw m2, m4
228
+ psubw m3, m4
229
+ mova [r2], m0
230
+ mova [r2 + r3], m1
231
+ mova [r2 + r3 * 2], m2
232
+ mova [r2 + r4], m3
233
+%endmacro
234
+
235
+%macro P2S_48x4_AVX512 0
236
+ movu m0, [r0]
237
+ movu m1, [r0 + r1]
238
+ movu m2, [r0 + r1 * 2]
239
+ movu m3, [r0 + r5]
240
+ psllw m0, (14 - BIT_DEPTH)
241
+ psllw m1, (14 - BIT_DEPTH)
242
+ psllw m2, (14 - BIT_DEPTH)
243
+ psllw m3, (14 - BIT_DEPTH)
244
+ psubw m0, m4
245
+ psubw m1, m4
246
+ psubw m2, m4
247
+ psubw m3, m4
248
+ movu [r2], m0
249
+ movu [r2 + r3], m1
250
+ movu [r2 + r3 * 2], m2
251
+ movu [r2 + r4], m3
252
+
253
+ movu ym0, [r0 + mmsize]
254
+ movu ym1, [r0 + r1 + mmsize]
255
+ movu ym2, [r0 + r1 * 2 + mmsize]
256
+ movu ym3, [r0 + r5 + mmsize]
257
+ psllw ym0, (14 - BIT_DEPTH)
258
+ psllw ym1, (14 - BIT_DEPTH)
259
+ psllw ym2, (14 - BIT_DEPTH)
260
+ psllw ym3, (14 - BIT_DEPTH)
261
+ psubw ym0, ym4
262
+ psubw ym1, ym4
263
+ psubw ym2, ym4
264
+ psubw ym3, ym4
265
+ movu [r2 + mmsize], ym0
266
+ movu [r2 + r3 + mmsize], ym1
267
+ movu [r2 + r3 * 2 + mmsize], ym2
268
+ movu [r2 + r4 + mmsize], ym3
269
+%endmacro
270
+
271
+%macro P2S_ALIGNED_48x4_AVX512 0
272
+ mova m0, [r0]
273
+ mova m1, [r0 + r1]
274
+ mova m2, [r0 + r1 * 2]
275
+ mova m3, [r0 + r5]
276
+ psllw m0, (14 - BIT_DEPTH)
277
+ psllw m1, (14 - BIT_DEPTH)
278
+ psllw m2, (14 - BIT_DEPTH)
279
+ psllw m3, (14 - BIT_DEPTH)
280
+ psubw m0, m4
281
+ psubw m1, m4
282
+ psubw m2, m4
283
+ psubw m3, m4
284
+ mova [r2], m0
285
+ mova [r2 + r3], m1
286
+ mova [r2 + r3 * 2], m2
287
+ mova [r2 + r4], m3
288
+
289
+ mova ym0, [r0 + mmsize]
290
+ mova ym1, [r0 + r1 + mmsize]
291
+ mova ym2, [r0 + r1 * 2 + mmsize]
292
+ mova ym3, [r0 + r5 + mmsize]
293
+ psllw ym0, (14 - BIT_DEPTH)
294
+ psllw ym1, (14 - BIT_DEPTH)
295
+ psllw ym2, (14 - BIT_DEPTH)
296
+ psllw ym3, (14 - BIT_DEPTH)
297
+ psubw ym0, ym4
298
+ psubw ym1, ym4
299
+ psubw ym2, ym4
300
+ psubw ym3, ym4
301
+ mova [r2 + mmsize], ym0
302
+ mova [r2 + r3 + mmsize], ym1
303
+ mova [r2 + r3 * 2 + mmsize], ym2
304
+ mova [r2 + r4 + mmsize], ym3
305
+%endmacro
306
+
307
+;-----------------------------------------------------------------------------
308
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
309
+;-----------------------------------------------------------------------------
310
+INIT_ZMM avx512
311
+cglobal filterPixelToShort_64x16, 4, 6, 5
312
+ add r1d, r1d
313
+ add r3d, r3d
314
+ lea r4, [r3 * 3]
315
+ lea r5, [r1 * 3]
316
+
317
+ ; load constant
318
+ vbroadcasti32x8 m4, [pw_2000]
319
+%rep 3
320
+ P2S_64x4_AVX512
321
+ lea r0, [r0 + r1 * 4]
322
+ lea r2, [r2 + r3 * 4]
323
+%endrep
324
+ P2S_64x4_AVX512
325
+ RET
326
+
327
+
328
+INIT_ZMM avx512
329
+cglobal filterPixelToShort_64x32, 4, 6, 5
330
+ add r1d, r1d
331
+ add r3d, r3d
332
+ lea r4, [r3 * 3]
333
+ lea r5, [r1 * 3]
334
+
335
+ ; load constant
336
+ vbroadcasti32x8 m4, [pw_2000]
337
+%rep 7
338
+ P2S_64x4_AVX512
339
+ lea r0, [r0 + r1 * 4]
340
+ lea r2, [r2 + r3 * 4]
341
+%endrep
342
+ P2S_64x4_AVX512
343
+ RET
344
+
345
+INIT_ZMM avx512
346
+cglobal filterPixelToShort_64x48, 4, 6, 5
347
+ add r1d, r1d
348
+ add r3d, r3d
349
+ lea r4, [r3 * 3]
350
+ lea r5, [r1 * 3]
351
+
352
+ ; load constant
353
+ vbroadcasti32x8 m4, [pw_2000]
354
+%rep 11
355
+ P2S_64x4_AVX512
356
+ lea r0, [r0 + r1 * 4]
357
+ lea r2, [r2 + r3 * 4]
358
+%endrep
359
+ P2S_64x4_AVX512
360
+ RET
361
+
362
+INIT_ZMM avx512
363
+cglobal filterPixelToShort_64x64, 4, 6, 5
364
+ add r1d, r1d
365
+ add r3d, r3d
366
+ lea r4, [r3 * 3]
367
+ lea r5, [r1 * 3]
368
+
369
+ ; load constant
370
+ vbroadcasti32x8 m4, [pw_2000]
371
+%rep 15
372
+ P2S_64x4_AVX512
373
+ lea r0, [r0 + r1 * 4]
374
+ lea r2, [r2 + r3 * 4]
375
+%endrep
376
+ P2S_64x4_AVX512
377
+ RET
378
+
379
+INIT_ZMM avx512
380
+cglobal filterPixelToShort_32x8, 4, 6, 5
381
+ add r1d, r1d
382
+ add r3d, r3d
383
+ lea r4, [r3 * 3]
384
+ lea r5, [r1 * 3]
385
+
386
+ ; load constant
387
+ vbroadcasti32x8 m4, [pw_2000]
388
+ P2S_32x4_AVX512
389
+ lea r0, [r0 + r1 * 4]
390
+ lea r2, [r2 + r3 * 4]
391
+ P2S_32x4_AVX512
392
+ RET
393
+
394
+INIT_ZMM avx512
395
+cglobal filterPixelToShort_32x16, 4, 6, 5
396
+ add r1d, r1d
397
+ add r3d, r3d
398
+ lea r4, [r3 * 3]
399
+ lea r5, [r1 * 3]
400
+
401
+ ; load constant
402
+ vbroadcasti32x8 m4, [pw_2000]
403
+%rep 3
404
+ P2S_32x4_AVX512
405
+ lea r0, [r0 + r1 * 4]
406
+ lea r2, [r2 + r3 * 4]
407
+%endrep
408
+ P2S_32x4_AVX512
409
+ RET
410
+
411
+INIT_ZMM avx512
412
+cglobal filterPixelToShort_32x24, 4, 6, 5
413
+ add r1d, r1d
414
+ add r3d, r3d
415
+ lea r4, [r3 * 3]
416
+ lea r5, [r1 * 3]
417
+
418
+ ; load constant
419
+ vbroadcasti32x8 m4, [pw_2000]
420
+%rep 5
421
+ P2S_32x4_AVX512
422
+ lea r0, [r0 + r1 * 4]
423
+ lea r2, [r2 + r3 * 4]
424
+%endrep
425
+ P2S_32x4_AVX512
426
+ RET
427
+
428
+INIT_ZMM avx512
429
+cglobal filterPixelToShort_32x32, 4, 6, 5
430
+ add r1d, r1d
431
+ add r3d, r3d
432
+ lea r4, [r3 * 3]
433
+ lea r5, [r1 * 3]
434
+
435
+ ; load constant
436
+ vbroadcasti32x8 m4, [pw_2000]
437
+%rep 7
438
+ P2S_32x4_AVX512
439
+ lea r0, [r0 + r1 * 4]
440
+ lea r2, [r2 + r3 * 4]
441
+%endrep
442
+ P2S_32x4_AVX512
443
+ RET
444
+
445
+INIT_ZMM avx512
446
+cglobal filterPixelToShort_32x48, 4, 6, 5
447
+ add r1d, r1d
448
+ add r3d, r3d
449
+ lea r4, [r3 * 3]
450
+ lea r5, [r1 * 3]
451
+
452
+ ; load constant
453
+ vbroadcasti32x8 m4, [pw_2000]
454
+%rep 11
455
+ P2S_32x4_AVX512
456
+ lea r0, [r0 + r1 * 4]
457
+ lea r2, [r2 + r3 * 4]
458
+%endrep
459
+ P2S_32x4_AVX512
460
+ RET
461
+
462
+INIT_ZMM avx512
463
+cglobal filterPixelToShort_32x64, 4, 6, 5
464
+ add r1d, r1d
465
+ add r3d, r3d
466
+ lea r4, [r3 * 3]
467
+ lea r5, [r1 * 3]
468
+
469
+ ; load constant
470
+ vbroadcasti32x8 m4, [pw_2000]
471
+%rep 15
472
+ P2S_32x4_AVX512
473
+ lea r0, [r0 + r1 * 4]
474
+ lea r2, [r2 + r3 * 4]
475
+%endrep
476
+ P2S_32x4_AVX512
477
+ RET
478
479
+INIT_ZMM avx512
480
+cglobal filterPixelToShort_48x64, 4, 6, 5
481
+ add r1d, r1d
482
+ add r3d, r3d
483
+ lea r4, [r3 * 3]
484
+ lea r5, [r1 * 3]
485
+
486
+ ; load constant
487
+ vbroadcasti32x8 m4, [pw_2000]
488
+%rep 15
489
+ P2S_48x4_AVX512
490
+ lea r0, [r0 + r1 * 4]
491
+ lea r2, [r2 + r3 * 4]
492
+%endrep
493
+ P2S_48x4_AVX512
494
+ RET
495
+
496
+INIT_ZMM avx512
497
+cglobal filterPixelToShort_aligned_64x16, 4, 6, 5
498
+ add r1d, r1d
499
+ add r3d, r3d
500
+ lea r4, [r3 * 3]
501
+ lea r5, [r1 * 3]
502
+
503
+ ; load constant
504
+ vbroadcasti32x8 m4, [pw_2000]
505
+%rep 3
506
+ P2S_ALIGNED_64x4_AVX512
507
+ lea r0, [r0 + r1 * 4]
508
+ lea r2, [r2 + r3 * 4]
509
+%endrep
510
+ P2S_ALIGNED_64x4_AVX512
511
+ RET
512
+
513
+
514
+INIT_ZMM avx512
515
+cglobal filterPixelToShort_aligned_64x32, 4, 6, 5
516
+ add r1d, r1d
517
+ add r3d, r3d
518
+ lea r4, [r3 * 3]
519
+ lea r5, [r1 * 3]
520
+
521
+ ; load constant
522
+ vbroadcasti32x8 m4, [pw_2000]
523
+%rep 7
524
+ P2S_ALIGNED_64x4_AVX512
525
+ lea r0, [r0 + r1 * 4]
526
+ lea r2, [r2 + r3 * 4]
527
+%endrep
528
+ P2S_ALIGNED_64x4_AVX512
529
+ RET
530
+
531
+INIT_ZMM avx512
532
+cglobal filterPixelToShort_aligned_64x48, 4, 6, 5
533
+ add r1d, r1d
534
+ add r3d, r3d
535
+ lea r4, [r3 * 3]
536
+ lea r5, [r1 * 3]
537
+
538
+ ; load constant
539
+ vbroadcasti32x8 m4, [pw_2000]
540
+%rep 11
541
+ P2S_ALIGNED_64x4_AVX512
542
+ lea r0, [r0 + r1 * 4]
543
+ lea r2, [r2 + r3 * 4]
544
+%endrep
545
+ P2S_ALIGNED_64x4_AVX512
546
+ RET
547
+
548
+INIT_ZMM avx512
549
+cglobal filterPixelToShort_aligned_64x64, 4, 6, 5
550
+ add r1d, r1d
551
+ add r3d, r3d
552
+ lea r4, [r3 * 3]
553
+ lea r5, [r1 * 3]
554
+
555
+ ; load constant
556
+ vbroadcasti32x8 m4, [pw_2000]
557
+%rep 15
558
+ P2S_ALIGNED_64x4_AVX512
559
+ lea r0, [r0 + r1 * 4]
560
+ lea r2, [r2 + r3 * 4]
561
+%endrep
562
+ P2S_ALIGNED_64x4_AVX512
563
+ RET
564
+
565
+INIT_ZMM avx512
566
+cglobal filterPixelToShort_aligned_32x8, 4, 6, 5
567
+ add r1d, r1d
568
+ add r3d, r3d
569
+ lea r4, [r3 * 3]
570
+ lea r5, [r1 * 3]
571
+
572
+ ; load constant
573
+ vbroadcasti32x8 m4, [pw_2000]
574
+ P2S_ALIGNED_32x4_AVX512
575
+ lea r0, [r0 + r1 * 4]
576
+ lea r2, [r2 + r3 * 4]
577
+ P2S_ALIGNED_32x4_AVX512
578
+ RET
579
+
580
+INIT_ZMM avx512
581
+cglobal filterPixelToShort_aligned_32x16, 4, 6, 5
582
+ add r1d, r1d
583
+ add r3d, r3d
584
+ lea r4, [r3 * 3]
585
+ lea r5, [r1 * 3]
586
+
587
+ ; load constant
588
+ vbroadcasti32x8 m4, [pw_2000]
589
+%rep 3
590
+ P2S_ALIGNED_32x4_AVX512
591
+ lea r0, [r0 + r1 * 4]
592
+ lea r2, [r2 + r3 * 4]
593
+%endrep
594
+ P2S_ALIGNED_32x4_AVX512
595
+ RET
596
+
597
+INIT_ZMM avx512
598
+cglobal filterPixelToShort_aligned_32x24, 4, 6, 5
599
+ add r1d, r1d
600
+ add r3d, r3d
601
+ lea r4, [r3 * 3]
602
+ lea r5, [r1 * 3]
603
+
604
+ ; load constant
605
+ vbroadcasti32x8 m4, [pw_2000]
606
+%rep 5
607
+ P2S_ALIGNED_32x4_AVX512
608
+ lea r0, [r0 + r1 * 4]
609
+ lea r2, [r2 + r3 * 4]
610
+%endrep
611
+ P2S_ALIGNED_32x4_AVX512
612
+ RET
613
+
614
+INIT_ZMM avx512
615
+cglobal filterPixelToShort_aligned_32x32, 4, 6, 5
616
+ add r1d, r1d
617
+ add r3d, r3d
618
+ lea r4, [r3 * 3]
619
+ lea r5, [r1 * 3]
620
+
621
+ ; load constant
622
+ vbroadcasti32x8 m4, [pw_2000]
623
+%rep 7
624
+ P2S_ALIGNED_32x4_AVX512
625
+ lea r0, [r0 + r1 * 4]
626
+ lea r2, [r2 + r3 * 4]
627
+%endrep
628
+ P2S_ALIGNED_32x4_AVX512
629
+ RET
630
+
631
+INIT_ZMM avx512
632
+cglobal filterPixelToShort_aligned_32x48, 4, 6, 5
633
+ add r1d, r1d
634
+ add r3d, r3d
635
+ lea r4, [r3 * 3]
636
+ lea r5, [r1 * 3]
637
+
638
+ ; load constant
639
+ vbroadcasti32x8 m4, [pw_2000]
640
+%rep 11
641
+ P2S_ALIGNED_32x4_AVX512
642
+ lea r0, [r0 + r1 * 4]
643
+ lea r2, [r2 + r3 * 4]
644
+%endrep
645
+ P2S_ALIGNED_32x4_AVX512
646
+ RET
647
+
648
+INIT_ZMM avx512
649
+cglobal filterPixelToShort_aligned_32x64, 4, 6, 5
650
+ add r1d, r1d
651
+ add r3d, r3d
652
+ lea r4, [r3 * 3]
653
+ lea r5, [r1 * 3]
654
+
655
+ ; load constant
656
+ vbroadcasti32x8 m4, [pw_2000]
657
+%rep 15
658
+ P2S_ALIGNED_32x4_AVX512
659
+ lea r0, [r0 + r1 * 4]
660
+ lea r2, [r2 + r3 * 4]
661
+%endrep
662
+ P2S_ALIGNED_32x4_AVX512
663
+ RET
664
+
665
+INIT_ZMM avx512
666
+cglobal filterPixelToShort_aligned_48x64, 4, 6, 5
667
+ add r1d, r1d
668
+ add r3d, r3d
669
+ lea r4, [r3 * 3]
670
+ lea r5, [r1 * 3]
671
+
672
+ ; load constant
673
+ vbroadcasti32x8 m4, [pw_2000]
674
+%rep 15
675
+ P2S_ALIGNED_48x4_AVX512
676
+ lea r0, [r0 + r1 * 4]
677
+ lea r2, [r2 + r3 * 4]
678
+%endrep
679
+ P2S_ALIGNED_48x4_AVX512
680
+ RET
681
+;-----------------------------------------------------------------------------------------------------------------------------
682
+;p2s and p2s_aligned avx512 code end
683
+;-----------------------------------------------------------------------------------------------------------------------------
684
685
%macro PROCESS_LUMA_VER_W4_4R 0
686
movq m0, [r0]
687
688
jnz .loop
689
RET
690
691
+;-------------------------------------------------------------------------------------------------------------
692
+;ipfilter_chroma_avx512 code start
693
+;-------------------------------------------------------------------------------------------------------------
694
+;-------------------------------------------------------------------------------------------------------------
695
+; avx512 chroma_hpp code start
696
+;-------------------------------------------------------------------------------------------------------------
697
+%macro PROCESS_IPFILTER_CHROMA_PP_8x4_AVX512 0
698
+ ; register map
699
+ ; m0 , m1 interpolate coeff
700
+ ; m2 , m3 shuffle order table
701
+ ; m4 - pd_32
702
+ ; m5 - zero
703
+ ; m6 - pw_pixel_max
704
+
705
+ movu xm7, [r0]
706
+ vinserti32x4 m7, [r0 + r1], 1
707
+ vinserti32x4 m7, [r0 + 2 * r1], 2
708
+ vinserti32x4 m7, [r0 + r6], 3
709
+
710
+ pshufb m9, m7, m3
711
+ pshufb m7, m2
712
+ pmaddwd m7, m0
713
+ pmaddwd m9, m1
714
+ paddd m7, m9
715
+ paddd m7, m4
716
+ psrad m7, 6
717
+
718
+ movu xm8, [r0 + 8]
719
+ vinserti32x4 m8, [r0 + r1 + 8], 1
720
+ vinserti32x4 m8, [r0 + 2 * r1 + 8], 2
721
+ vinserti32x4 m8, [r0 + r6 + 8], 3
722
+
723
+ pshufb m9, m8, m3
724
+ pshufb m8, m2
725
+ pmaddwd m8, m0
726
+ pmaddwd m9, m1
727
+ paddd m8, m9
728
+ paddd m8, m4
729
+ psrad m8, 6
730
+
731
+ packusdw m7, m8
732
+ CLIPW m7, m5, m6
733
+ pshufb m7, m10
734
+ movu [r2], xm7
735
+ vextracti32x4 [r2 + r3], m7, 1
736
+ vextracti32x4 [r2 + 2 * r3], m7, 2
737
+ vextracti32x4 [r2 + r7], m7, 3
738
+%endmacro
739
+
740
+%macro PROCESS_IPFILTER_CHROMA_PP_16x2_AVX512 0
741
+ ; register map
742
+ ; m0 , m1 interpolate coeff
743
+ ; m2 , m3 shuffle order table
744
+ ; m4 - pd_32
745
+ ; m5 - zero
746
+ ; m6 - pw_pixel_max
747
+
748
+ movu ym7, [r0]
749
+ vinserti32x8 m7, [r0 + r1], 1
750
+ movu ym8, [r0 + 8]
751
+ vinserti32x8 m8, [r0 + r1 + 8], 1
752
+
753
+ pshufb m9, m7, m3
754
+ pshufb m7, m2
755
+ pmaddwd m7, m0
756
+ pmaddwd m9, m1
757
+ paddd m7, m9
758
+ paddd m7, m4
759
+ psrad m7, 6
760
+
761
+ pshufb m9, m8, m3
762
+ pshufb m8, m2
763
+ pmaddwd m8, m0
764
+ pmaddwd m9, m1
765
+ paddd m8, m9
766
+ paddd m8, m4
767
+ psrad m8, 6
768
+
769
+ packusdw m7, m8
770
+ CLIPW m7, m5, m6
771
+ pshufb m7, m10
772
+ movu [r2], ym7
773
+ vextracti32x8 [r2 + r3], m7, 1
774
+%endmacro
775
+
776
+%macro PROCESS_IPFILTER_CHROMA_PP_24x4_AVX512 0
777
+ ; register map
778
+ ; m0 , m1 interpolate coeff
779
+ ; m2 , m3 shuffle order table
780
+ ; m4 - pd_32
781
+ ; m5 - zero
782
+ ; m6 - pw_pixel_max
783
+
784
+ movu ym7, [r0]
785
+ vinserti32x8 m7, [r0 + r1], 1
786
+ movu ym8, [r0 + 8]
787
+ vinserti32x8 m8, [r0 + r1 + 8], 1
788
+
789
+ pshufb m9, m7, m3
790
+ pshufb m7, m2
791
+ pmaddwd m7, m0
792
+ pmaddwd m9, m1
793
+ paddd m7, m9
794
+ paddd m7, m4
795
+ psrad m7, 6
796
+
797
+ pshufb m9, m8, m3
798
+ pshufb m8, m2
799
+ pmaddwd m8, m0
800
+ pmaddwd m9, m1
801
+ paddd m8, m9
802
+ paddd m8, m4
803
+ psrad m8, 6
804
+
805
+ packusdw m7, m8
806
+ CLIPW m7, m5, m6
807
+ pshufb m7, m10
808
+ movu [r2], ym7
809
+ vextracti32x8 [r2 + r3], m7, 1
810
+
811
+ movu ym7, [r0 + 2 * r1]
812
+ vinserti32x8 m7, [r0 + r6], 1
813
+ movu ym8, [r0 + 2 * r1 + 8]
814
+ vinserti32x8 m8, [r0 + r6 + 8], 1
815
+
816
+ pshufb m9, m7, m3
817
+ pshufb m7, m2
818
+ pmaddwd m7, m0
819
+ pmaddwd m9, m1
820
+ paddd m7, m9
821
+ paddd m7, m4
822
+ psrad m7, 6
823
+
824
+ pshufb m9, m8, m3
825
+ pshufb m8, m2
826
+ pmaddwd m8, m0
827
+ pmaddwd m9, m1
828
+ paddd m8, m9
829
+ paddd m8, m4
830
+ psrad m8, 6
831
+
832
+ packusdw m7, m8
833
+ CLIPW m7, m5, m6
834
+ pshufb m7, m10
835
+ movu [r2 + 2 * r3], ym7
836
+ vextracti32x8 [r2 + r7], m7, 1
837
+
838
+ movu xm7, [r0 + mmsize/2]
839
+ vinserti32x4 m7, [r0 + r1 + mmsize/2], 1
840
+ vinserti32x4 m7, [r0 + 2 * r1 + mmsize/2], 2
841
+ vinserti32x4 m7, [r0 + r6 + mmsize/2], 3
842
+
843
+ pshufb m9, m7, m3
844
+ pshufb m7, m2
845
+ pmaddwd m7, m0
846
+ pmaddwd m9, m1
847
+ paddd m7, m9
848
+ paddd m7, m4
849
+ psrad m7, 6
850
+
851
+ movu xm8, [r0 + mmsize/2 + 8]
852
+ vinserti32x4 m8, [r0 + r1 + mmsize/2 + 8], 1
853
+ vinserti32x4 m8, [r0 + 2 * r1 + mmsize/2 + 8], 2
854
+ vinserti32x4 m8, [r0 + r6 + mmsize/2 + 8], 3
855
+
856
+ pshufb m9, m8, m3
857
+ pshufb m8, m2
858
+ pmaddwd m8, m0
859
+ pmaddwd m9, m1
860
+ paddd m8, m9
861
+ paddd m8, m4
862
+ psrad m8, 6
863
+
864
+ packusdw m7, m8
865
+ CLIPW m7, m5, m6
866
+ pshufb m7, m10
867
+ movu [r2 + mmsize/2], xm7
868
+ vextracti32x4 [r2 + r3 + mmsize/2], m7, 1
869
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m7, 2
870
+ vextracti32x4 [r2 + r7 + mmsize/2], m7, 3
871
+%endmacro
872
+
873
+%macro PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512 0
874
+ ; register map
875
+ ; m0 , m1 interpolate coeff
876
+ ; m2 , m3 shuffle order table
877
+ ; m4 - pd_32
878
+ ; m5 - zero
879
+ ; m6 - pw_pixel_max
880
+
881
+ movu m7, [r0]
882
+ movu m8, [r0 + 8]
883
+
884
+ pshufb m9, m7, m3
885
+ pshufb m7, m2
886
+ pmaddwd m7, m0
887
+ pmaddwd m9, m1
888
+ paddd m7, m9
889
+ paddd m7, m4
890
+ psrad m7, 6
891
+
892
+ pshufb m9, m8, m3
893
+ pshufb m8, m2
894
+ pmaddwd m8, m0
895
+ pmaddwd m9, m1
896
+ paddd m8, m9
897
+ paddd m8, m4
898
+ psrad m8, 6
899
+
900
+ packusdw m7, m8
901
+ CLIPW m7, m5, m6
902
+ pshufb m7, m10
903
+ movu [r2], m7
904
+
905
+ movu m7, [r0 + r1]
906
+ movu m8, [r0 + r1 + 8]
907
+
908
+ pshufb m9, m7, m3
909
+ pshufb m7, m2
910
+ pmaddwd m7, m0
911
+ pmaddwd m9, m1
912
+ paddd m7, m9
913
+ paddd m7, m4
914
+ psrad m7, 6
915
+
916
+ pshufb m9, m8, m3
917
+ pshufb m8, m2
918
+ pmaddwd m8, m0
919
+ pmaddwd m9, m1
920
+ paddd m8, m9
921
+ paddd m8, m4
922
+ psrad m8, 6
923
+
924
+ packusdw m7, m8
925
+ CLIPW m7, m5, m6
926
+ pshufb m7, m10
927
+ movu [r2 + r3], m7
928
+%endmacro
929
+
930
+%macro PROCESS_IPFILTER_CHROMA_PP_48x2_AVX512 0
931
+ ; register map
932
+ ; m0 , m1 interpolate coeff
933
+ ; m2 , m3 shuffle order table
934
+ ; m4 - pd_32
935
+ ; m5 - zero
936
+ ; m6 - pw_pixel_max
937
+
938
+ movu m7, [r0]
939
+ movu m8, [r0 + 8]
940
+
941
+ pshufb m9, m7, m3
942
+ pshufb m7, m2
943
+ pmaddwd m7, m0
944
+ pmaddwd m9, m1
945
+ paddd m7, m9
946
+ paddd m7, m4
947
+ psrad m7, 6
948
+
949
+ pshufb m9, m8, m3
950
+ pshufb m8, m2
951
+ pmaddwd m8, m0
952
+ pmaddwd m9, m1
953
+ paddd m8, m9
954
+ paddd m8, m4
955
+ psrad m8, 6
956
+
957
+ packusdw m7, m8
958
+ CLIPW m7, m5, m6
959
+ pshufb m7, m10
960
+ movu [r2], m7
961
+
962
+ movu m7, [r0 + r1]
963
+ movu m8, [r0 + r1 + 8]
964
+
965
+ pshufb m9, m7, m3
966
+ pshufb m7, m2
967
+ pmaddwd m7, m0
968
+ pmaddwd m9, m1
969
+ paddd m7, m9
970
+ paddd m7, m4
971
+ psrad m7, 6
972
+
973
+ pshufb m9, m8, m3
974
+ pshufb m8, m2
975
+ pmaddwd m8, m0
976
+ pmaddwd m9, m1
977
+ paddd m8, m9
978
+ paddd m8, m4
979
+ psrad m8, 6
980
+
981
+ packusdw m7, m8
982
+ CLIPW m7, m5, m6
983
+ pshufb m7, m10
984
+ movu [r2 + r3], m7
985
+
986
+ movu ym7, [r0 + mmsize]
987
+ vinserti32x8 m7, [r0 + r1 + mmsize], 1
988
+ movu ym8, [r0 + mmsize + 8]
989
+ vinserti32x8 m8, [r0 + r1 + mmsize + 8], 1
990
+
991
+ pshufb m9, m7, m3
992
+ pshufb m7, m2
993
+ pmaddwd m7, m0
994
+ pmaddwd m9, m1
995
+ paddd m7, m9
996
+ paddd m7, m4
997
+ psrad m7, 6
998
+
999
+ pshufb m9, m8, m3
1000
+ pshufb m8, m2
1001
+ pmaddwd m8, m0
1002
+ pmaddwd m9, m1
1003
+ paddd m8, m9
1004
+ paddd m8, m4
1005
+ psrad m8, 6
1006
+
1007
+ packusdw m7, m8
1008
+ CLIPW m7, m5, m6
1009
+ pshufb m7, m10
1010
+ movu [r2 + mmsize], ym7
1011
+ vextracti32x8 [r2 + r3 + mmsize], m7, 1
1012
+%endmacro
1013
+
1014
+%macro PROCESS_IPFILTER_CHROMA_PP_64x2_AVX512 0
1015
+ ; register map
1016
+ ; m0 , m1 interpolate coeff
1017
+ ; m2 , m3 shuffle order table
1018
+ ; m4 - pd_32
1019
+ ; m5 - zero
1020
+ ; m6 - pw_pixel_max
1021
+
1022
+ movu m7, [r0]
1023
+ movu m8, [r0 + 8]
1024
+
1025
+ pshufb m9, m7, m3
1026
+ pshufb m7, m2
1027
+ pmaddwd m7, m0
1028
+ pmaddwd m9, m1
1029
+ paddd m7, m9
1030
+ paddd m7, m4
1031
+ psrad m7, 6
1032
+
1033
+ pshufb m9, m8, m3
1034
+ pshufb m8, m2
1035
+ pmaddwd m8, m0
1036
+ pmaddwd m9, m1
1037
+ paddd m8, m9
1038
+ paddd m8, m4
1039
+ psrad m8, 6
1040
+
1041
+ packusdw m7, m8
1042
+ CLIPW m7, m5, m6
1043
+ pshufb m7, m10
1044
+ movu [r2], m7
1045
+
1046
+ movu m7, [r0 + mmsize]
1047
+ movu m8, [r0 + mmsize + 8]
1048
+
1049
+ pshufb m9, m7, m3
1050
+ pshufb m7, m2
1051
+ pmaddwd m7, m0
1052
+ pmaddwd m9, m1
1053
+ paddd m7, m9
1054
+ paddd m7, m4
1055
+ psrad m7, 6
1056
+
1057
+ pshufb m9, m8, m3
1058
+ pshufb m8, m2
1059
+ pmaddwd m8, m0
1060
+ pmaddwd m9, m1
1061
+ paddd m8, m9
1062
+ paddd m8, m4
1063
+ psrad m8, 6
1064
+
1065
+ packusdw m7, m8
1066
+ CLIPW m7, m5, m6
1067
+ pshufb m7, m10
1068
+ movu [r2 + mmsize], m7
1069
+
1070
+ movu m7, [r0 + r1]
1071
+ movu m8, [r0 + r1 + 8]
1072
+
1073
+ pshufb m9, m7, m3
1074
+ pshufb m7, m2
1075
+ pmaddwd m7, m0
1076
+ pmaddwd m9, m1
1077
+ paddd m7, m9
1078
+ paddd m7, m4
1079
+ psrad m7, 6
1080
+
1081
+ pshufb m9, m8, m3
1082
+ pshufb m8, m2
1083
+ pmaddwd m8, m0
1084
+ pmaddwd m9, m1
1085
+ paddd m8, m9
1086
+ paddd m8, m4
1087
+ psrad m8, 6
1088
+
1089
+ packusdw m7, m8
1090
+ CLIPW m7, m5, m6
1091
+ pshufb m7, m10
1092
+ movu [r2 + r3], m7
1093
+
1094
+ movu m7, [r0 + r1 + mmsize]
1095
+ movu m8, [r0 + r1 + mmsize + 8]
1096
+
1097
+ pshufb m9, m7, m3
1098
+ pshufb m7, m2
1099
+ pmaddwd m7, m0
1100
+ pmaddwd m9, m1
1101
+ paddd m7, m9
1102
+ paddd m7, m4
1103
+ psrad m7, 6
1104
+
1105
+ pshufb m9, m8, m3
1106
+ pshufb m8, m2
1107
+ pmaddwd m8, m0
1108
+ pmaddwd m9, m1
1109
+ paddd m8, m9
1110
+ paddd m8, m4
1111
+ psrad m8, 6
1112
+
1113
+ packusdw m7, m8
1114
+ CLIPW m7, m5, m6
1115
+ pshufb m7, m10
1116
+ movu [r2 + r3 + mmsize], m7
1117
+%endmacro
1118
+;-------------------------------------------------------------------------------------------------------------
1119
+; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx
1120
+;-------------------------------------------------------------------------------------------------------------
1121
+%if ARCH_X86_64
1122
+INIT_ZMM avx512
1123
+cglobal interp_4tap_horiz_pp_8x4, 5,8,11
1124
+ add r1d, r1d
1125
+ add r3d, r3d
1126
+ sub r0, 2
1127
+ mov r4d, r4m
1128
+ lea r6, [3 * r1]
1129
+ lea r7, [3 * r3]
1130
+%ifdef PIC
1131
+ lea r5, [tab_ChromaCoeff]
1132
+ vpbroadcastd m0, [r5 + r4 * 8]
1133
+ vpbroadcastd m1, [r5 + r4 * 8 + 4]
1134
+%else
1135
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 8]
1136
+ vpbroadcastd m1, [tab_ChromaCoeff + r4 * 8 + 4]
1137
+%endif
1138
+ vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
1139
+ vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
1140
+ vbroadcasti32x8 m4, [pd_32]
1141
+ pxor m5, m5
1142
+ vbroadcasti32x8 m6, [pw_pixel_max]
1143
+ vbroadcasti32x8 m10, [interp8_hpp_shuf1_store_avx512]
1144
+
1145
+ PROCESS_IPFILTER_CHROMA_PP_8x4_AVX512
1146
+ RET
1147
+%endif
1148
+
1149
+%macro IPFILTER_CHROMA_AVX512_8xN 1
1150
+INIT_ZMM avx512
1151
+cglobal interp_4tap_horiz_pp_8x%1, 5,8,11
1152
+ add r1d, r1d
1153
+ add r3d, r3d
1154
+ sub r0, 2
1155
+ mov r4d, r4m
1156
+ lea r6, [3 * r1]
1157
+ lea r7, [3 * r3]
1158
+%ifdef PIC
1159
+ lea r5, [tab_ChromaCoeff]
1160
+ vpbroadcastd m0, [r5 + r4 * 8]
1161
+ vpbroadcastd m1, [r5 + r4 * 8 + 4]
1162
+%else
1163
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 8]
1164
+ vpbroadcastd m1, [tab_ChromaCoeff + r4 * 8 + 4]
1165
+%endif
1166
+ vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
1167
+ vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
1168
+ vbroadcasti32x8 m4, [pd_32]
1169
+ pxor m5, m5
1170
+ vbroadcasti32x8 m6, [pw_pixel_max]
1171
+ vbroadcasti32x8 m10, [interp8_hpp_shuf1_store_avx512]
1172
+
1173
+%rep %1/4 - 1
1174
+ PROCESS_IPFILTER_CHROMA_PP_8x4_AVX512
1175
+ lea r0, [r0 + 4 * r1]
1176
+ lea r2, [r2 + 4 * r3]
1177
+%endrep
1178
+ PROCESS_IPFILTER_CHROMA_PP_8x4_AVX512
1179
+ RET
1180
+%endmacro
1181
+
1182
+%if ARCH_X86_64
1183
+IPFILTER_CHROMA_AVX512_8xN 8
1184
+IPFILTER_CHROMA_AVX512_8xN 12
1185
+IPFILTER_CHROMA_AVX512_8xN 16
1186
+IPFILTER_CHROMA_AVX512_8xN 32
1187
+IPFILTER_CHROMA_AVX512_8xN 64
1188
+%endif
1189
+
1190
+%macro IPFILTER_CHROMA_AVX512_16xN 1
1191
+INIT_ZMM avx512
1192
+cglobal interp_4tap_horiz_pp_16x%1, 5,6,11
1193
+ add r1d, r1d
1194
+ add r3d, r3d
1195
+ sub r0, 2
1196
+ mov r4d, r4m
1197
+%ifdef PIC
1198
+ lea r5, [tab_ChromaCoeff]
1199
+ vpbroadcastd m0, [r5 + r4 * 8]
1200
+ vpbroadcastd m1, [r5 + r4 * 8 + 4]
1201
+%else
1202
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 8]
1203
+ vpbroadcastd m1, [tab_ChromaCoeff + r4 * 8 + 4]
1204
+%endif
1205
+ vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
1206
+ vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
1207
+ vbroadcasti32x8 m4, [pd_32]
1208
+ pxor m5, m5
1209
+ vbroadcasti32x8 m6, [pw_pixel_max]
1210
+ vbroadcasti32x8 m10, [interp8_hpp_shuf1_store_avx512]
1211
+
1212
+%rep %1/2 - 1
1213
+ PROCESS_IPFILTER_CHROMA_PP_16x2_AVX512
1214
+ lea r0, [r0 + 2 * r1]
1215
+ lea r2, [r2 + 2 * r3]
1216
+%endrep
1217
+ PROCESS_IPFILTER_CHROMA_PP_16x2_AVX512
1218
+ RET
1219
+%endmacro
1220
+
1221
+%if ARCH_X86_64
1222
+IPFILTER_CHROMA_AVX512_16xN 4
1223
+IPFILTER_CHROMA_AVX512_16xN 8
1224
+IPFILTER_CHROMA_AVX512_16xN 12
1225
+IPFILTER_CHROMA_AVX512_16xN 16
1226
+IPFILTER_CHROMA_AVX512_16xN 24
1227
+IPFILTER_CHROMA_AVX512_16xN 32
1228
+IPFILTER_CHROMA_AVX512_16xN 64
1229
+%endif
1230
+
1231
+%macro IPFILTER_CHROMA_AVX512_24xN 1
1232
+INIT_ZMM avx512
1233
+cglobal interp_4tap_horiz_pp_24x%1, 5,8,11
1234
+ add r1d, r1d
1235
+ add r3d, r3d
1236
+ sub r0, 2
1237
+ mov r4d, r4m
1238
+ lea r6, [3 * r1]
1239
+ lea r7, [3 * r3]
1240
+%ifdef PIC
1241
+ lea r5, [tab_ChromaCoeff]
1242
+ vpbroadcastd m0, [r5 + r4 * 8]
1243
+ vpbroadcastd m1, [r5 + r4 * 8 + 4]
1244
+%else
1245
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 8]
1246
+ vpbroadcastd m1, [tab_ChromaCoeff + r4 * 8 + 4]
1247
+%endif
1248
+ vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
1249
+ vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
1250
+ vbroadcasti32x8 m4, [pd_32]
1251
+ pxor m5, m5
1252
+ vbroadcasti32x8 m6, [pw_pixel_max]
1253
+ vbroadcasti32x8 m10, [interp8_hpp_shuf1_store_avx512]
1254
+
1255
+%rep %1/4 - 1
1256
+ PROCESS_IPFILTER_CHROMA_PP_24x4_AVX512
1257
+ lea r0, [r0 + 4 * r1]
1258
+ lea r2, [r2 + 4 * r3]
1259
+%endrep
1260
+ PROCESS_IPFILTER_CHROMA_PP_24x4_AVX512
1261
+ RET
1262
+%endmacro
1263
+
1264
+%if ARCH_X86_64
1265
+IPFILTER_CHROMA_AVX512_24xN 32
1266
+IPFILTER_CHROMA_AVX512_24xN 64
1267
+%endif
1268
+
1269
+%macro IPFILTER_CHROMA_AVX512_32xN 1
1270
+INIT_ZMM avx512
1271
+cglobal interp_4tap_horiz_pp_32x%1, 5,6,11
1272
+ add r1d, r1d
1273
+ add r3d, r3d
1274
+ sub r0, 2
1275
+ mov r4d, r4m
1276
+%ifdef PIC
1277
+ lea r5, [tab_ChromaCoeff]
1278
+ vpbroadcastd m0, [r5 + r4 * 8]
1279
+ vpbroadcastd m1, [r5 + r4 * 8 + 4]
1280
+%else
1281
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 8]
1282
+ vpbroadcastd m1, [tab_ChromaCoeff + r4 * 8 + 4]
1283
+%endif
1284
+ vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
1285
+ vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
1286
+ vbroadcasti32x8 m4, [pd_32]
1287
+ pxor m5, m5
1288
+ vbroadcasti32x8 m6, [pw_pixel_max]
1289
+ vbroadcasti32x8 m10, [interp8_hpp_shuf1_store_avx512]
1290
+
1291
+%rep %1/2 - 1
1292
+ PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512
1293
+ lea r0, [r0 + 2 * r1]
1294
+ lea r2, [r2 + 2 * r3]
1295
+%endrep
1296
+ PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512
1297
+ RET
1298
+%endmacro
1299
+
1300
+%if ARCH_X86_64
1301
+IPFILTER_CHROMA_AVX512_32xN 8
1302
+IPFILTER_CHROMA_AVX512_32xN 16
1303
+IPFILTER_CHROMA_AVX512_32xN 24
1304
+IPFILTER_CHROMA_AVX512_32xN 32
1305
+IPFILTER_CHROMA_AVX512_32xN 48
1306
+IPFILTER_CHROMA_AVX512_32xN 64
1307
+%endif
1308
+
1309
+%macro IPFILTER_CHROMA_AVX512_64xN 1
1310
+INIT_ZMM avx512
1311
+cglobal interp_4tap_horiz_pp_64x%1, 5,6,11
1312
+ add r1d, r1d
1313
+ add r3d, r3d
1314
+ sub r0, 2
1315
+ mov r4d, r4m
1316
+%ifdef PIC
1317
+ lea r5, [tab_ChromaCoeff]
1318
+ vpbroadcastd m0, [r5 + r4 * 8]
1319
+ vpbroadcastd m1, [r5 + r4 * 8 + 4]
1320
+%else
1321
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 8]
1322
+ vpbroadcastd m1, [tab_ChromaCoeff + r4 * 8 + 4]
1323
+%endif
1324
+ vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
1325
+ vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
1326
+ vbroadcasti32x8 m4, [pd_32]
1327
+ pxor m5, m5
1328
+ vbroadcasti32x8 m6, [pw_pixel_max]
1329
+ vbroadcasti32x8 m10, [interp8_hpp_shuf1_store_avx512]
1330
+
1331
+%rep %1/2 - 1
1332
+ PROCESS_IPFILTER_CHROMA_PP_64x2_AVX512
1333
+ lea r0, [r0 + 2 * r1]
1334
+ lea r2, [r2 + 2 * r3]
1335
+%endrep
1336
+ PROCESS_IPFILTER_CHROMA_PP_64x2_AVX512
1337
+ RET
1338
+%endmacro
1339
+
1340
+%if ARCH_X86_64
1341
+IPFILTER_CHROMA_AVX512_64xN 16
1342
+IPFILTER_CHROMA_AVX512_64xN 32
1343
+IPFILTER_CHROMA_AVX512_64xN 48
1344
+IPFILTER_CHROMA_AVX512_64xN 64
1345
+%endif
1346
+
1347
+%if ARCH_X86_64
1348
+INIT_ZMM avx512
1349
+cglobal interp_4tap_horiz_pp_48x64, 5,6,11
1350
+ add r1d, r1d
1351
+ add r3d, r3d
1352
+ sub r0, 2
1353
+ mov r4d, r4m
1354
+%ifdef PIC
1355
+ lea r5, [tab_ChromaCoeff]
1356
+ vpbroadcastd m0, [r5 + r4 * 8]
1357
+ vpbroadcastd m1, [r5 + r4 * 8 + 4]
1358
+%else
1359
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 8]
1360
+ vpbroadcastd m1, [tab_ChromaCoeff + r4 * 8 + 4]
1361
+%endif
1362
+ vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
1363
+ vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
1364
+ vbroadcasti32x8 m4, [pd_32]
1365
+ pxor m5, m5
1366
+ vbroadcasti32x8 m6, [pw_pixel_max]
1367
+ vbroadcasti32x8 m10, [interp8_hpp_shuf1_store_avx512]
1368
+
1369
+%rep 31
1370
+ PROCESS_IPFILTER_CHROMA_PP_48x2_AVX512
1371
+ lea r0, [r0 + 2 * r1]
1372
+ lea r2, [r2 + 2 * r3]
1373
+%endrep
1374
+ PROCESS_IPFILTER_CHROMA_PP_48x2_AVX512
1375
+ RET
1376
+%endif
1377
+;-------------------------------------------------------------------------------------------------------------
1378
+; avx512 chroma_hpp code end
1379
+;-------------------------------------------------------------------------------------------------------------
1380
+;-------------------------------------------------------------------------------------------------------------
1381
+; avx512 chroma_vpp code start
1382
+;-------------------------------------------------------------------------------------------------------------
1383
+%macro PROCESS_CHROMA_VERT_PP_8x8_AVX512 0
1384
+ movu xm1, [r0]
1385
+ lea r6, [r0 + 2 * r1]
1386
+ lea r8, [r0 + 4 * r1]
1387
+ lea r9, [r8 + 2 * r1]
1388
+ vinserti32x4 m1, [r6], 1
1389
+ vinserti32x4 m1, [r8], 2
1390
+ vinserti32x4 m1, [r9], 3
1391
+ movu xm3, [r0 + r1]
1392
+ vinserti32x4 m3, [r6 + r1], 1
1393
+ vinserti32x4 m3, [r8 + r1], 2
1394
+ vinserti32x4 m3, [r9 + r1], 3
1395
+ punpcklwd m0, m1, m3
1396
+ pmaddwd m0, [r5]
1397
+ punpckhwd m1, m3
1398
+ pmaddwd m1, [r5]
1399
+
1400
+ movu xm4, [r0 + 2 * r1]
1401
+ vinserti32x4 m4, [r6 + 2 * r1], 1
1402
+ vinserti32x4 m4, [r8 + 2 * r1], 2
1403
+ vinserti32x4 m4, [r9 + 2 * r1], 3
1404
+ punpcklwd m2, m3, m4
1405
+ pmaddwd m2, [r5]
1406
+ punpckhwd m3, m4
1407
+ pmaddwd m3, [r5]
1408
+
1409
+ movu xm5, [r0 + r10]
1410
+ vinserti32x4 m5, [r6 + r10], 1
1411
+ vinserti32x4 m5, [r8 + r10], 2
1412
+ vinserti32x4 m5, [r9 + r10], 3
1413
+ punpcklwd m6, m4, m5
1414
+ pmaddwd m6, [r5 + mmsize]
1415
+ paddd m0, m6
1416
+ punpckhwd m4, m5
1417
+ pmaddwd m4, [r5 + mmsize]
1418
+ paddd m1, m4
1419
+
1420
+ movu xm4, [r0 + 4 * r1]
1421
+ vinserti32x4 m4, [r6 + 4 * r1], 1
1422
+ vinserti32x4 m4, [r8 + 4 * r1], 2
1423
+ vinserti32x4 m4, [r9 + 4 * r1], 3
1424
+ punpcklwd m6, m5, m4
1425
+ pmaddwd m6, [r5 + mmsize]
1426
+ paddd m2, m6
1427
+ punpckhwd m5, m4
1428
+ pmaddwd m5, [r5 + mmsize]
1429
+ paddd m3, m5
1430
+
1431
+ paddd m0, m7
1432
+ paddd m1, m7
1433
+ paddd m2, m7
1434
+ paddd m3, m7
1435
+
1436
+ psrad m0, INTERP_SHIFT_PP
1437
+ psrad m1, INTERP_SHIFT_PP
1438
+ psrad m2, INTERP_SHIFT_PP
1439
+ psrad m3, INTERP_SHIFT_PP
1440
+
1441
+ packssdw m0, m1
1442
+ packssdw m2, m3
1443
+ pxor m5, m5
1444
+ CLIPW2 m0, m2, m5, m8
1445
+ movu [r2], xm0
1446
+ movu [r2 + r3], xm2
1447
+ vextracti32x4 [r2 + 2 * r3], m0, 1
1448
+ vextracti32x4 [r2 + r7], m2, 1
1449
+ lea r2, [r2 + 4 * r3]
1450
+ vextracti32x4 [r2], m0, 2
1451
+ vextracti32x4 [r2 + r3], m2, 2
1452
+ vextracti32x4 [r2 + 2 * r3], m0, 3
1453
+ vextracti32x4 [r2 + r7], m2, 3
1454
+%endmacro
1455
+
1456
+;-----------------------------------------------------------------------------------------------------------------
1457
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
1458
+;-----------------------------------------------------------------------------------------------------------------
1459
+%if ARCH_X86_64
1460
+INIT_ZMM avx512
1461
+cglobal interp_4tap_vert_pp_8x8, 5, 11, 9
1462
+ add r1d, r1d
1463
+ add r3d, r3d
1464
+ sub r0, r1
1465
+ shl r4d, 7
1466
+
1467
+%ifdef PIC
1468
+ lea r5, [tab_ChromaCoeffV_avx512]
1469
+ lea r5, [r5 + r4]
1470
+%else
1471
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
1472
+%endif
1473
+ vbroadcasti32x8 m7, [INTERP_OFFSET_PP]
1474
+ vbroadcasti32x8 m8, [pw_pixel_max]
1475
+ lea r10, [3 * r1]
1476
+ lea r7, [3 * r3]
1477
+ PROCESS_CHROMA_VERT_PP_8x8_AVX512
1478
+ RET
1479
+%endif
1480
+
1481
+%macro FILTER_VER_PP_CHROMA_8xN_AVX512 1
1482
+INIT_ZMM avx512
1483
+cglobal interp_4tap_vert_pp_8x%1, 5, 11, 9
1484
+ add r1d, r1d
1485
+ add r3d, r3d
1486
+ sub r0, r1
1487
+ shl r4d, 7
1488
+
1489
+%ifdef PIC
1490
+ lea r5, [tab_ChromaCoeffV_avx512]
1491
+ lea r5, [r5 + r4]
1492
+%else
1493
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
1494
+%endif
1495
+ vbroadcasti32x8 m7, [INTERP_OFFSET_PP]
1496
+ vbroadcasti32x8 m8, [pw_pixel_max]
1497
+ lea r10, [3 * r1]
1498
+ lea r7, [3 * r3]
1499
+%rep %1/8 - 1
1500
+ PROCESS_CHROMA_VERT_PP_8x8_AVX512
1501
+ lea r0, [r8 + 4 * r1]
1502
+ lea r2, [r2 + 4 * r3]
1503
+%endrep
1504
+ PROCESS_CHROMA_VERT_PP_8x8_AVX512
1505
+ RET
1506
+%endmacro
1507
+
1508
+%if ARCH_X86_64
1509
+FILTER_VER_PP_CHROMA_8xN_AVX512 16
1510
+FILTER_VER_PP_CHROMA_8xN_AVX512 32
1511
+FILTER_VER_PP_CHROMA_8xN_AVX512 64
1512
+%endif
1513
+
1514
+%macro PROCESS_CHROMA_VERT_PP_16x4_AVX512 0
1515
+ movu ym1, [r0]
1516
+ lea r6, [r0 + 2 * r1]
1517
+ vinserti32x8 m1, [r6], 1
1518
+ movu ym3, [r0 + r1]
1519
+ vinserti32x8 m3, [r6 + r1], 1
1520
+ punpcklwd m0, m1, m3
1521
+ pmaddwd m0, [r5]
1522
+ punpckhwd m1, m3
1523
+ pmaddwd m1, [r5]
1524
+
1525
+ movu ym4, [r0 + 2 * r1]
1526
+ vinserti32x8 m4, [r6 + 2 * r1], 1
1527
+ punpcklwd m2, m3, m4
1528
+ pmaddwd m2, [r5]
1529
+ punpckhwd m3, m4
1530
+ pmaddwd m3, [r5]
1531
+
1532
+ lea r0, [r0 + 2 * r1]
1533
+ lea r6, [r6 + 2 * r1]
1534
+
1535
+ movu ym5, [r0 + r1]
1536
+ vinserti32x8 m5, [r6 + r1], 1
1537
+ punpcklwd m6, m4, m5
1538
+ pmaddwd m6, [r5 + mmsize]
1539
+ paddd m0, m6
1540
+ punpckhwd m4, m5
1541
+ pmaddwd m4, [r5 + mmsize]
1542
+ paddd m1, m4
1543
+
1544
+ movu ym4, [r0 + 2 * r1]
1545
+ vinserti32x8 m4, [r6 + 2 * r1], 1
1546
+ punpcklwd m6, m5, m4
1547
+ pmaddwd m6, [r5 + mmsize]
1548
+ paddd m2, m6
1549
+ punpckhwd m5, m4
1550
+ pmaddwd m5, [r5 + mmsize]
1551
+ paddd m3, m5
1552
+
1553
+ paddd m0, m7
1554
+ paddd m1, m7
1555
+ paddd m2, m7
1556
+ paddd m3, m7
1557
+
1558
+ psrad m0, INTERP_SHIFT_PP
1559
+ psrad m1, INTERP_SHIFT_PP
1560
+ psrad m2, INTERP_SHIFT_PP
1561
+ psrad m3, INTERP_SHIFT_PP
1562
+
1563
+ packssdw m0, m1
1564
+ packssdw m2, m3
1565
+ pxor m5, m5
1566
+ CLIPW2 m0, m2, m5, m8
1567
+ movu [r2], ym0
1568
+ movu [r2 + r3], ym2
1569
+ vextracti32x8 [r2 + 2 * r3], m0, 1
1570
+ vextracti32x8 [r2 + r7], m2, 1
1571
+%endmacro
1572
+
1573
+;-----------------------------------------------------------------------------------------------------------------
1574
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
1575
+;-----------------------------------------------------------------------------------------------------------------
1576
+%if ARCH_X86_64
1577
+INIT_ZMM avx512
1578
+cglobal interp_4tap_vert_pp_16x4, 5, 8, 9
1579
+ add r1d, r1d
1580
+ add r3d, r3d
1581
+ sub r0, r1
1582
+ shl r4d, 7
1583
+
1584
+%ifdef PIC
1585
+ lea r5, [tab_ChromaCoeffV_avx512]
1586
+ lea r5, [r5 + r4]
1587
+%else
1588
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
1589
+%endif
1590
+ vbroadcasti32x8 m7, [INTERP_OFFSET_PP]
1591
+ vbroadcasti32x8 m8, [pw_pixel_max]
1592
+ lea r7, [3 * r3]
1593
+ PROCESS_CHROMA_VERT_PP_16x4_AVX512
1594
+ RET
1595
+%endif
1596
+
1597
+%macro FILTER_VER_PP_CHROMA_16xN_AVX512 1
1598
+INIT_ZMM avx512
1599
+cglobal interp_4tap_vert_pp_16x%1, 5, 8, 9
1600
+ add r1d, r1d
1601
+ add r3d, r3d
1602
+ sub r0, r1
1603
+ shl r4d, 7
1604
+
1605
+%ifdef PIC
1606
+ lea r5, [tab_ChromaCoeffV_avx512]
1607
+ lea r5, [r5 + r4]
1608
+%else
1609
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
1610
+%endif
1611
+ vbroadcasti32x8 m7, [INTERP_OFFSET_PP]
1612
+ vbroadcasti32x8 m8, [pw_pixel_max]
1613
+ lea r7, [3 * r3]
1614
+%rep %1/4 - 1
1615
+ PROCESS_CHROMA_VERT_PP_16x4_AVX512
1616
+ lea r0, [r0 + 2 * r1]
1617
+ lea r2, [r2 + 4 * r3]
1618
+%endrep
1619
+ PROCESS_CHROMA_VERT_PP_16x4_AVX512
1620
+ RET
1621
+%endmacro
1622
+
1623
+%if ARCH_X86_64
1624
+FILTER_VER_PP_CHROMA_16xN_AVX512 8
1625
+FILTER_VER_PP_CHROMA_16xN_AVX512 12
1626
+FILTER_VER_PP_CHROMA_16xN_AVX512 16
1627
+FILTER_VER_PP_CHROMA_16xN_AVX512 24
1628
+FILTER_VER_PP_CHROMA_16xN_AVX512 32
1629
+FILTER_VER_PP_CHROMA_16xN_AVX512 64
1630
+%endif
1631
+
1632
+%macro PROCESS_CHROMA_VERT_PP_24x8_AVX512 0
1633
+ movu ym1, [r0]
1634
+ lea r6, [r0 + 2 * r1]
1635
+ lea r8, [r0 + 4 * r1]
1636
+ lea r9, [r8 + 2 * r1]
1637
+
1638
+ movu ym10, [r8]
1639
+ movu ym3, [r0 + r1]
1640
+ movu ym12, [r8 + r1]
1641
+ vinserti32x8 m1, [r6], 1
1642
+ vinserti32x8 m10, [r9], 1
1643
+ vinserti32x8 m3, [r6 + r1], 1
1644
+ vinserti32x8 m12, [r9 + r1], 1
1645
+
1646
+ punpcklwd m0, m1, m3
1647
+ punpcklwd m9, m10, m12
1648
+ pmaddwd m0, [r5]
1649
+ pmaddwd m9, [r5]
1650
+ punpckhwd m1, m3
1651
+ punpckhwd m10, m12
1652
+ pmaddwd m1, [r5]
1653
+ pmaddwd m10, [r5]
1654
+
1655
+ movu ym4, [r0 + 2 * r1]
1656
+ movu ym13, [r8 + 2 * r1]
1657
+ vinserti32x8 m4, [r6 + 2 * r1], 1
1658
+ vinserti32x8 m13, [r9 + 2 * r1], 1
1659
+ punpcklwd m2, m3, m4
1660
+ punpcklwd m11, m12, m13
1661
+ pmaddwd m2, [r5]
1662
+ pmaddwd m11, [r5]
1663
+ punpckhwd m3, m4
1664
+ punpckhwd m12, m13
1665
+ pmaddwd m3, [r5]
1666
+ pmaddwd m12, [r5]
1667
+
1668
+ movu ym5, [r0 + r10]
1669
+ vinserti32x8 m5, [r6 + r10], 1
1670
+ movu ym14, [r8 + r10]
1671
+ vinserti32x8 m14, [r9 + r10], 1
1672
+ punpcklwd m6, m4, m5
1673
+ punpcklwd m15, m13, m14
1674
+ pmaddwd m6, [r5 + mmsize]
1675
+ pmaddwd m15, [r5 + mmsize]
1676
+ paddd m0, m6
1677
+ paddd m9, m15
1678
+ punpckhwd m4, m5
1679
+ punpckhwd m13, m14
1680
+ pmaddwd m4, [r5 + mmsize]
1681
+ pmaddwd m13, [r5 + mmsize]
1682
+ paddd m1, m4
1683
+ paddd m10, m13
1684
+
1685
+ movu ym4, [r0 + 4 * r1]
1686
+ vinserti32x8 m4, [r6 + 4 * r1], 1
1687
+ movu ym13, [r8 + 4 * r1]
1688
+ vinserti32x8 m13, [r9 + 4 * r1], 1
1689
+ punpcklwd m6, m5, m4
1690
+ punpcklwd m15, m14, m13
1691
+ pmaddwd m6, [r5 + mmsize]
1692
+ pmaddwd m15, [r5 + mmsize]
1693
+ paddd m2, m6
1694
+ paddd m11, m15
1695
+ punpckhwd m5, m4
1696
+ punpckhwd m14, m13
1697
+ pmaddwd m5, [r5 + mmsize]
1698
+ pmaddwd m14, [r5 + mmsize]
1699
+ paddd m3, m5
1700
+ paddd m12, m14
1701
+
1702
+ paddd m0, m7
1703
+ paddd m1, m7
1704
+ paddd m2, m7
1705
+ paddd m3, m7
1706
+ paddd m9, m7
1707
+ paddd m10, m7
1708
+ paddd m11, m7
1709
+ paddd m12, m7
1710
+
1711
+ psrad m0, INTERP_SHIFT_PP
1712
+ psrad m1, INTERP_SHIFT_PP
1713
+ psrad m2, INTERP_SHIFT_PP
1714
+ psrad m3, INTERP_SHIFT_PP
1715
+ psrad m9, INTERP_SHIFT_PP
1716
+ psrad m10, INTERP_SHIFT_PP
1717
+ psrad m11, INTERP_SHIFT_PP
1718
+ psrad m12, INTERP_SHIFT_PP
1719
+
1720
+ packssdw m0, m1
1721
+ packssdw m2, m3
1722
+ packssdw m9, m10
1723
+ packssdw m11, m12
1724
+ pxor m5, m5
1725
+ CLIPW2 m0, m2, m5, m8
1726
+ CLIPW2 m9, m11, m5, m8
1727
+ movu [r2], ym0
1728
+ movu [r2 + r3], ym2
1729
+ vextracti32x8 [r2 + 2 * r3], m0, 1
1730
+ vextracti32x8 [r2 + r7], m2, 1
1731
+ lea r11, [r2 + 4 * r3]
1732
+ movu [r11], ym9
1733
+ movu [r11 + r3], ym11
1734
+ vextracti32x8 [r11 + 2 * r3], m9, 1
1735
+ vextracti32x8 [r11 + r7], m11, 1
1736
+
1737
+ movu xm1, [r0 + mmsize/2]
1738
+ vinserti32x4 m1, [r6 + mmsize/2], 1
1739
+ vinserti32x4 m1, [r8 + mmsize/2], 2
1740
+ vinserti32x4 m1, [r9 + mmsize/2], 3
1741
+ movu xm3, [r0 + r1 + mmsize/2]
1742
+ vinserti32x4 m3, [r6 + r1 + mmsize/2], 1
1743
+ vinserti32x4 m3, [r8 + r1 + mmsize/2], 2
1744
+ vinserti32x4 m3, [r9 + r1 + mmsize/2], 3
1745
+ punpcklwd m0, m1, m3
1746
+ pmaddwd m0, [r5]
1747
+ punpckhwd m1, m3
1748
+ pmaddwd m1, [r5]
1749
+
1750
+ movu xm4, [r0 + 2 * r1 + mmsize/2]
1751
+ vinserti32x4 m4, [r6 + 2 * r1 + mmsize/2], 1
1752
+ vinserti32x4 m4, [r8 + 2 * r1 + mmsize/2], 2
1753
+ vinserti32x4 m4, [r9 + 2 * r1 + mmsize/2], 3
1754
+ punpcklwd m2, m3, m4
1755
+ pmaddwd m2, [r5]
1756
+ punpckhwd m3, m4
1757
+ pmaddwd m3, [r5]
1758
+
1759
+ movu xm5, [r0 + r10 + mmsize/2]
1760
+ vinserti32x4 m5, [r6 + r10 + mmsize/2], 1
1761
+ vinserti32x4 m5, [r8 + r10 + mmsize/2], 2
1762
+ vinserti32x4 m5, [r9 + r10 + mmsize/2], 3
1763
+ punpcklwd m6, m4, m5
1764
+ pmaddwd m6, [r5 + mmsize]
1765
+ paddd m0, m6
1766
+ punpckhwd m4, m5
1767
+ pmaddwd m4, [r5 + mmsize]
1768
+ paddd m1, m4
1769
+
1770
+ movu xm4, [r0 + 4 * r1 + mmsize/2]
1771
+ vinserti32x4 m4, [r6 + 4 * r1 + mmsize/2], 1
1772
+ vinserti32x4 m4, [r8 + 4 * r1 + mmsize/2], 2
1773
+ vinserti32x4 m4, [r9 + 4 * r1 + mmsize/2], 3
1774
+ punpcklwd m6, m5, m4
1775
+ pmaddwd m6, [r5 + mmsize]
1776
+ paddd m2, m6
1777
+ punpckhwd m5, m4
1778
+ pmaddwd m5, [r5 + mmsize]
1779
+ paddd m3, m5
1780
+
1781
+ paddd m0, m7
1782
+ paddd m1, m7
1783
+ paddd m2, m7
1784
+ paddd m3, m7
1785
+
1786
+ psrad m0, INTERP_SHIFT_PP
1787
+ psrad m1, INTERP_SHIFT_PP
1788
+ psrad m2, INTERP_SHIFT_PP
1789
+ psrad m3, INTERP_SHIFT_PP
1790
+
1791
+ packssdw m0, m1
1792
+ packssdw m2, m3
1793
+ pxor m5, m5
1794
+ CLIPW2 m0, m2, m5, m8
1795
+ movu [r2 + mmsize/2], xm0
1796
+ movu [r2 + r3 + mmsize/2], xm2
1797
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m0, 1
1798
+ vextracti32x4 [r2 + r7 + mmsize/2], m2, 1
1799
+ lea r2, [r2 + 4 * r3]
1800
+ vextracti32x4 [r2 + mmsize/2], m0, 2
1801
+ vextracti32x4 [r2 + r3 + mmsize/2], m2, 2
1802
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m0, 3
1803
+ vextracti32x4 [r2 + r7 + mmsize/2], m2, 3
1804
+%endmacro
1805
+
1806
+%macro FILTER_VER_PP_CHROMA_24xN_AVX512 1
1807
+INIT_ZMM avx512
1808
+cglobal interp_4tap_vert_pp_24x%1, 5, 12, 16
1809
+ add r1d, r1d
1810
+ add r3d, r3d
1811
+ sub r0, r1
1812
+ shl r4d, 7
1813
+
1814
+%ifdef PIC
1815
+ lea r5, [tab_ChromaCoeffV_avx512]
1816
+ lea r5, [r5 + r4]
1817
+%else
1818
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
1819
+%endif
1820
+ vbroadcasti32x8 m7, [INTERP_OFFSET_PP]
1821
+ vbroadcasti32x8 m8, [pw_pixel_max]
1822
+ lea r10, [3 * r1]
1823
+ lea r7, [3 * r3]
1824
+%rep %1/8 - 1
1825
+ PROCESS_CHROMA_VERT_PP_24x8_AVX512
1826
+ lea r0, [r8 + 4 * r1]
1827
+ lea r2, [r2 + 4 * r3]
1828
+%endrep
1829
+ PROCESS_CHROMA_VERT_PP_24x8_AVX512
1830
+ RET
1831
+%endmacro
1832
+
1833
+%if ARCH_X86_64
1834
+ FILTER_VER_PP_CHROMA_24xN_AVX512 32
1835
+ FILTER_VER_PP_CHROMA_24xN_AVX512 64
1836
+%endif
1837
+
1838
+%macro PROCESS_CHROMA_VERT_PP_32x2_AVX512 0
1839
+ movu m1, [r0]
1840
+ movu m3, [r0 + r1]
1841
+ punpcklwd m0, m1, m3
1842
+ pmaddwd m0, [r5]
1843
+ punpckhwd m1, m3
1844
+ pmaddwd m1, [r5]
1845
+
1846
+ movu m4, [r0 + 2 * r1]
1847
+ punpcklwd m2, m3, m4
1848
+ pmaddwd m2, [r5]
1849
+ punpckhwd m3, m4
1850
+ pmaddwd m3, [r5]
1851
+
1852
+ lea r0, [r0 + 2 * r1]
1853
+ movu m5, [r0 + r1]
1854
+ punpcklwd m6, m4, m5
1855
+ pmaddwd m6, [r5 + mmsize]
1856
+ paddd m0, m6
1857
+ punpckhwd m4, m5
1858
+ pmaddwd m4, [r5 + mmsize]
1859
+ paddd m1, m4
1860
+
1861
+ movu m4, [r0 + 2 * r1]
1862
+ punpcklwd m6, m5, m4
1863
+ pmaddwd m6, [r5 + mmsize]
1864
+ paddd m2, m6
1865
+ punpckhwd m5, m4
1866
+ pmaddwd m5, [r5 + mmsize]
1867
+ paddd m3, m5
1868
+
1869
+ paddd m0, m7
1870
+ paddd m1, m7
1871
+ paddd m2, m7
1872
+ paddd m3, m7
1873
+
1874
+ psrad m0, INTERP_SHIFT_PP
1875
+ psrad m1, INTERP_SHIFT_PP
1876
+ psrad m2, INTERP_SHIFT_PP
1877
+ psrad m3, INTERP_SHIFT_PP
1878
+
1879
+ packssdw m0, m1
1880
+ packssdw m2, m3
1881
+ pxor m5, m5
1882
+ CLIPW2 m0, m2, m5, m8
1883
+ movu [r2], m0
1884
+ movu [r2 + r3], m2
1885
+%endmacro
1886
+
1887
+;-----------------------------------------------------------------------------------------------------------------
1888
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
1889
+;-----------------------------------------------------------------------------------------------------------------
1890
+%macro FILTER_VER_PP_CHROMA_32xN_AVX512 1
1891
+INIT_ZMM avx512
1892
+cglobal interp_4tap_vert_pp_32x%1, 5, 7, 9
1893
+ add r1d, r1d
1894
+ add r3d, r3d
1895
+ sub r0, r1
1896
+ shl r4d, 7
1897
+
1898
+%ifdef PIC
1899
+ lea r5, [tab_ChromaCoeffV_avx512]
1900
+ lea r5, [r5 + r4]
1901
+%else
1902
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
1903
+%endif
1904
+ vbroadcasti32x8 m7, [INTERP_OFFSET_PP]
1905
+ vbroadcasti32x8 m8, [pw_pixel_max]
1906
+
1907
+%rep %1/2 - 1
1908
+ PROCESS_CHROMA_VERT_PP_32x2_AVX512
1909
+ lea r2, [r2 + 2 * r3]
1910
+%endrep
1911
+ PROCESS_CHROMA_VERT_PP_32x2_AVX512
1912
+ RET
1913
+%endmacro
1914
+
1915
+%if ARCH_X86_64
1916
+FILTER_VER_PP_CHROMA_32xN_AVX512 8
1917
+FILTER_VER_PP_CHROMA_32xN_AVX512 16
1918
+FILTER_VER_PP_CHROMA_32xN_AVX512 24
1919
+FILTER_VER_PP_CHROMA_32xN_AVX512 32
1920
+FILTER_VER_PP_CHROMA_32xN_AVX512 48
1921
+FILTER_VER_PP_CHROMA_32xN_AVX512 64
1922
+%endif
1923
+
1924
+%macro PROCESS_CHROMA_VERT_PP_48x4_AVX512 0
1925
+ movu m1, [r0]
1926
+ lea r6, [r0 + 2 * r1]
1927
+ movu m10, [r6]
1928
+ movu m3, [r0 + r1]
1929
+ movu m12, [r6 + r1]
1930
+ punpcklwd m0, m1, m3
1931
+ punpcklwd m9, m10, m12
1932
+ pmaddwd m0, [r5]
1933
+ pmaddwd m9, [r5]
1934
+ punpckhwd m1, m3
1935
+ punpckhwd m10, m12
1936
+ pmaddwd m1, [r5]
1937
+ pmaddwd m10, [r5]
1938
+
1939
+ movu m4, [r0 + 2 * r1]
1940
+ movu m13, [r6 + 2 * r1]
1941
+ punpcklwd m2, m3, m4
1942
+ punpcklwd m11, m12, m13
1943
+ pmaddwd m2, [r5]
1944
+ pmaddwd m11, [r5]
1945
+ punpckhwd m3, m4
1946
+ punpckhwd m12, m13
1947
+ pmaddwd m3, [r5]
1948
+ pmaddwd m12, [r5]
1949
+
1950
+ movu m5, [r0 + r7]
1951
+ movu m14, [r6 + r7]
1952
+ punpcklwd m6, m4, m5
1953
+ punpcklwd m15, m13, m14
1954
+ pmaddwd m6, [r5 + mmsize]
1955
+ pmaddwd m15, [r5 + mmsize]
1956
+ paddd m0, m6
1957
+ paddd m9, m15
1958
+ punpckhwd m4, m5
1959
+ punpckhwd m13, m14
1960
+ pmaddwd m4, [r5 + mmsize]
1961
+ pmaddwd m13, [r5 + mmsize]
1962
+ paddd m1, m4
1963
+ paddd m10, m13
1964
+
1965
+ movu m4, [r0 + 4 * r1]
1966
+ movu m13, [r6 + 4 * r1]
1967
+ punpcklwd m6, m5, m4
1968
+ punpcklwd m15, m14, m13
1969
+ pmaddwd m6, [r5 + mmsize]
1970
+ pmaddwd m15, [r5 + mmsize]
1971
+ paddd m2, m6
1972
+ paddd m11, m15
1973
+ punpckhwd m5, m4
1974
+ punpckhwd m14, m13
1975
+ pmaddwd m5, [r5 + mmsize]
1976
+ pmaddwd m14, [r5 + mmsize]
1977
+ paddd m3, m5
1978
+ paddd m12, m14
1979
+
1980
+ paddd m0, m7
1981
+ paddd m1, m7
1982
+ paddd m2, m7
1983
+ paddd m3, m7
1984
+ paddd m9, m7
1985
+ paddd m10, m7
1986
+ paddd m11, m7
1987
+ paddd m12, m7
1988
+
1989
+ psrad m0, INTERP_SHIFT_PP
1990
+ psrad m1, INTERP_SHIFT_PP
1991
+ psrad m2, INTERP_SHIFT_PP
1992
+ psrad m3, INTERP_SHIFT_PP
1993
+ psrad m9, INTERP_SHIFT_PP
1994
+ psrad m10, INTERP_SHIFT_PP
1995
+ psrad m11, INTERP_SHIFT_PP
1996
+ psrad m12, INTERP_SHIFT_PP
1997
+
1998
+ packssdw m0, m1
1999
+ packssdw m2, m3
2000
+ packssdw m9, m10
2001
+ packssdw m11, m12
2002
+ CLIPW2 m0, m2, m16, m8
2003
+ CLIPW2 m9, m11, m16, m8
2004
+ movu [r2], m0
2005
+ movu [r2 + r3], m2
2006
+ movu [r2 + 2 * r3], m9
2007
+ movu [r2 + r8], m11
2008
+
2009
+ movu ym1, [r0 + mmsize]
2010
+ vinserti32x8 m1, [r6 + mmsize], 1
2011
+ movu ym3, [r0 + r1 + mmsize]
2012
+ vinserti32x8 m3, [r6 + r1 + mmsize], 1
2013
+ punpcklwd m0, m1, m3
2014
+ pmaddwd m0, [r5]
2015
+ punpckhwd m1, m3
2016
+ pmaddwd m1, [r5]
2017
+
2018
+ movu ym4, [r0 + 2 * r1 + mmsize]
2019
+ vinserti32x8 m4, [r6 + 2 * r1 + mmsize], 1
2020
+ punpcklwd m2, m3, m4
2021
+ pmaddwd m2, [r5]
2022
+ punpckhwd m3, m4
2023
+ pmaddwd m3, [r5]
2024
+
2025
+ movu ym5, [r0 + r7 + mmsize]
2026
+ vinserti32x8 m5, [r6 + r7 + mmsize], 1
2027
+ punpcklwd m6, m4, m5
2028
+ pmaddwd m6, [r5 + mmsize]
2029
+ paddd m0, m6
2030
+ punpckhwd m4, m5
2031
+ pmaddwd m4, [r5 + mmsize]
2032
+ paddd m1, m4
2033
+
2034
+ movu ym4, [r0 + 4 * r1 + mmsize]
2035
+ vinserti32x8 m4, [r6 + 4 * r1 + mmsize], 1
2036
+ punpcklwd m6, m5, m4
2037
+ pmaddwd m6, [r5 + mmsize]
2038
+ paddd m2, m6
2039
+ punpckhwd m5, m4
2040
+ pmaddwd m5, [r5 + mmsize]
2041
+ paddd m3, m5
2042
+
2043
+ paddd m0, m7
2044
+ paddd m1, m7
2045
+ paddd m2, m7
2046
+ paddd m3, m7
2047
+
2048
+ psrad m0, INTERP_SHIFT_PP
2049
+ psrad m1, INTERP_SHIFT_PP
2050
+ psrad m2, INTERP_SHIFT_PP
2051
+ psrad m3, INTERP_SHIFT_PP
2052
+
2053
+ packssdw m0, m1
2054
+ packssdw m2, m3
2055
+ CLIPW2 m0, m2, m16, m8
2056
+ movu [r2 + mmsize], ym0
2057
+ movu [r2 + r3 + mmsize], ym2
2058
+ vextracti32x8 [r2 + 2 * r3 + mmsize], m0, 1
2059
+ vextracti32x8 [r2 + r8 + mmsize], m2, 1
2060
+%endmacro
2061
+
2062
+%if ARCH_X86_64
2063
+INIT_ZMM avx512
2064
+cglobal interp_4tap_vert_pp_48x64, 5, 9, 17
2065
+ add r1d, r1d
2066
+ add r3d, r3d
2067
+ sub r0, r1
2068
+ shl r4d, 7
2069
+%ifdef PIC
2070
+ lea r5, [tab_ChromaCoeffV_avx512]
2071
+ lea r5, [r5 + r4]
2072
+%else
2073
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
2074
+%endif
2075
+ lea r7, [3 * r1]
2076
+ lea r8, [3 * r3]
2077
+ vbroadcasti32x8 m7, [INTERP_OFFSET_PP]
2078
+ vbroadcasti32x8 m8, [pw_pixel_max]
2079
+ pxor m16, m16
2080
+
2081
+%rep 15
2082
+ PROCESS_CHROMA_VERT_PP_48x4_AVX512
2083
+ lea r0, [r0 + 4 * r1]
2084
+ lea r2, [r2 + 4 * r3]
2085
+%endrep
2086
+ PROCESS_CHROMA_VERT_PP_48x4_AVX512
2087
+ RET
2088
+%endif
2089
+
2090
+%macro PROCESS_CHROMA_VERT_PP_64x2_AVX512 0
2091
+ movu m1, [r0]
2092
+ movu m3, [r0 + r1]
2093
+ punpcklwd m0, m1, m3
2094
+ pmaddwd m0, [r5]
2095
+ punpckhwd m1, m3
2096
+ pmaddwd m1, [r5]
2097
+
2098
+ movu m9, [r0 + mmsize]
2099
+ movu m11, [r0 + r1 + mmsize]
2100
+ punpcklwd m8, m9, m11
2101
+ pmaddwd m8, [r5]
2102
+ punpckhwd m9, m11
2103
+ pmaddwd m9, [r5]
2104
+
2105
+ movu m4, [r0 + 2 * r1]
2106
+ punpcklwd m2, m3, m4
2107
+ pmaddwd m2, [r5]
2108
+ punpckhwd m3, m4
2109
+ pmaddwd m3, [r5]
2110
+
2111
+ movu m12, [r0 + 2 * r1 + mmsize]
2112
+ punpcklwd m10, m11, m12
2113
+ pmaddwd m10, [r5]
2114
+ punpckhwd m11, m12
2115
+ pmaddwd m11, [r5]
2116
+
2117
+ lea r0, [r0 + 2 * r1]
2118
+ movu m5, [r0 + r1]
2119
+ punpcklwd m6, m4, m5
2120
+ pmaddwd m6, [r5 + 1 * mmsize]
2121
+ paddd m0, m6
2122
+ punpckhwd m4, m5
2123
+ pmaddwd m4, [r5 + 1 * mmsize]
2124
+ paddd m1, m4
2125
+
2126
+ movu m13, [r0 + r1 + mmsize]
2127
+ punpcklwd m14, m12, m13
2128
+ pmaddwd m14, [r5 + 1 * mmsize]
2129
+ paddd m8, m14
2130
+ punpckhwd m12, m13
2131
+ pmaddwd m12, [r5 + 1 * mmsize]
2132
+ paddd m9, m12
2133
+
2134
+ movu m4, [r0 + 2 * r1]
2135
+ punpcklwd m6, m5, m4
2136
+ pmaddwd m6, [r5 + 1 * mmsize]
2137
+ paddd m2, m6
2138
+ punpckhwd m5, m4
2139
+ pmaddwd m5, [r5 + 1 * mmsize]
2140
+ paddd m3, m5
2141
+
2142
+ movu m12, [r0 + 2 * r1 + mmsize]
2143
+ punpcklwd m14, m13, m12
2144
+ pmaddwd m14, [r5 + 1 * mmsize]
2145
+ paddd m10, m14
2146
+ punpckhwd m13, m12
2147
+ pmaddwd m13, [r5 + 1 * mmsize]
2148
+ paddd m11, m13
2149
+
2150
+ paddd m0, m7
2151
+ paddd m1, m7
2152
+ paddd m2, m7
2153
+ paddd m3, m7
2154
+ paddd m8, m7
2155
+ paddd m9, m7
2156
+ paddd m10, m7
2157
+ paddd m11, m7
2158
+
2159
+ psrad m0, INTERP_SHIFT_PP
2160
+ psrad m1, INTERP_SHIFT_PP
2161
+ psrad m2, INTERP_SHIFT_PP
2162
+ psrad m3, INTERP_SHIFT_PP
2163
+ psrad m8, INTERP_SHIFT_PP
2164
+ psrad m9, INTERP_SHIFT_PP
2165
+ psrad m10, INTERP_SHIFT_PP
2166
+ psrad m11, INTERP_SHIFT_PP
2167
+
2168
+ packssdw m0, m1
2169
+ packssdw m2, m3
2170
+ packssdw m8, m9
2171
+ packssdw m10, m11
2172
+ pxor m5, m5
2173
+ CLIPW2 m0, m2, m5, m15
2174
+ CLIPW2 m8, m10, m5, m15
2175
+ movu [r2], m0
2176
+ movu [r2 + r3], m2
2177
+ movu [r2 + mmsize], m8
2178
+ movu [r2 + r3 + mmsize], m10
2179
+%endmacro
2180
+
2181
+;-----------------------------------------------------------------------------------------------------------------
2182
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2183
+;-----------------------------------------------------------------------------------------------------------------
2184
+%macro FILTER_VER_PP_CHROMA_64xN_AVX512 1
2185
+INIT_ZMM avx512
2186
+cglobal interp_4tap_vert_pp_64x%1, 5, 7, 16
2187
+ add r1d, r1d
2188
+ add r3d, r3d
2189
+ sub r0, r1
2190
+ shl r4d, 7
2191
+
2192
+%ifdef PIC
2193
+ lea r5, [tab_ChromaCoeffV_avx512]
2194
+ lea r5, [r5 + r4]
2195
+%else
2196
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
2197
+%endif
2198
+ vbroadcasti32x8 m7, [INTERP_OFFSET_PP]
2199
+ vbroadcasti32x8 m15, [pw_pixel_max]
2200
+
2201
+%rep %1/2 - 1
2202
+ PROCESS_CHROMA_VERT_PP_64x2_AVX512
2203
+ lea r2, [r2 + 2 * r3]
2204
+%endrep
2205
+ PROCESS_CHROMA_VERT_PP_64x2_AVX512
2206
+ RET
2207
+%endmacro
2208
+
2209
+%if ARCH_X86_64
2210
+FILTER_VER_PP_CHROMA_64xN_AVX512 16
2211
+FILTER_VER_PP_CHROMA_64xN_AVX512 32
2212
+FILTER_VER_PP_CHROMA_64xN_AVX512 48
2213
+FILTER_VER_PP_CHROMA_64xN_AVX512 64
2214
+%endif
2215
+;-------------------------------------------------------------------------------------------------------------
2216
+; avx512 chroma_vpp code end
2217
+;-------------------------------------------------------------------------------------------------------------
2218
+;-------------------------------------------------------------------------------------------------------------
2219
+; avx512 chroma_hps code start
2220
+;-------------------------------------------------------------------------------------------------------------
2221
+%macro PROCESS_IPFILTER_CHROMA_PS_32x2_AVX512 0
2222
+ ; register map
2223
+ ; m0 , m1 - interpolate coeff
2224
+ ; m2 , m3 - shuffle load order table
2225
+ ; m4 - INTERP_OFFSET_PS
2226
+ ; m5 - shuffle store order table
2227
+
2228
+ movu m6, [r0]
2229
+ movu m7, [r0 + 8]
2230
+
2231
+ pshufb m8, m6, m3
2232
+ pshufb m6, m2
2233
+ pmaddwd m6, m0
2234
+ pmaddwd m8, m1
2235
+ paddd m6, m8
2236
+ paddd m6, m4
2237
+ psrad m6, INTERP_SHIFT_PS
2238
+
2239
+ pshufb m8, m7, m3
2240
+ pshufb m7, m2
2241
+ pmaddwd m7, m0
2242
+ pmaddwd m8, m1
2243
+ paddd m7, m8
2244
+ paddd m7, m4
2245
+ psrad m7, INTERP_SHIFT_PS
2246
+
2247
+ packssdw m6, m7
2248
+ pshufb m6, m5
2249
+ movu [r2], m6
2250
+
2251
+ movu m6, [r0 + r1]
2252
+ movu m7, [r0 + r1 + 8]
2253
+
2254
+ pshufb m8, m6, m3
2255
+ pshufb m6, m2
2256
+ pmaddwd m6, m0
2257
+ pmaddwd m8, m1
2258
+ paddd m6, m8
2259
+ paddd m6, m4
2260
+ psrad m6, INTERP_SHIFT_PS
2261
+
2262
+ pshufb m8, m7, m3
2263
+ pshufb m7, m2
2264
+ pmaddwd m7, m0
2265
+ pmaddwd m8, m1
2266
+ paddd m7, m8
2267
+ paddd m7, m4
2268
+ psrad m7, INTERP_SHIFT_PS
2269
+
2270
+ packssdw m6, m7
2271
+ pshufb m6, m5
2272
+ movu [r2 + r3], m6
2273
+%endmacro
2274
+
2275
+%macro PROCESS_IPFILTER_CHROMA_PS_32x1_AVX512 0
2276
+ movu m6, [r0]
2277
+ movu m7, [r0 + 8]
2278
+
2279
+ pshufb m8, m6, m3
2280
+ pshufb m6, m2
2281
+ pmaddwd m6, m0
2282
+ pmaddwd m8, m1
2283
+ paddd m6, m8
2284
+ paddd m6, m4
2285
+ psrad m6, INTERP_SHIFT_PS
2286
+
2287
+ pshufb m8, m7, m3
2288
+ pshufb m7, m2
2289
+ pmaddwd m7, m0
2290
+ pmaddwd m8, m1
2291
+ paddd m7, m8
2292
+ paddd m7, m4
2293
+ psrad m7, INTERP_SHIFT_PS
2294
+
2295
+ packssdw m6, m7
2296
+ pshufb m6, m5
2297
+ movu [r2], m6
2298
+%endmacro
2299
+
2300
+%macro IPFILTER_CHROMA_PS_AVX512_32xN 1
2301
+%if ARCH_X86_64 == 1
2302
+INIT_ZMM avx512
2303
+cglobal interp_4tap_horiz_ps_32x%1, 4,7,9
2304
+ shl r1d, 1
2305
+ shl r3d, 1
2306
+ mov r4d, r4m
2307
+ mov r5d, r5m
2308
+%ifdef PIC
2309
+ lea r6, [tab_ChromaCoeff]
2310
+ vpbroadcastd m0, [r6 + r4 * 8]
2311
+ vpbroadcastd m1, [r6 + r4 * 8 + 4]
2312
+%else
2313
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 8]
2314
+ vpbroadcastd m1, [tab_ChromaCoeff + r4 * 8 + 4]
2315
+%endif
2316
+ vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
2317
+ vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
2318
+ vbroadcasti32x4 m4, [INTERP_OFFSET_PS]
2319
+ vbroadcasti32x8 m5, [interp8_hpp_shuf1_store_avx512]
2320
+
2321
+ mov r6d, %1
2322
+ sub r0, 2
2323
+ test r5d, r5d
2324
+ jz .loop
2325
+ sub r0, r1
2326
+ add r6d, 3
2327
+ PROCESS_IPFILTER_CHROMA_PS_32x1_AVX512
2328
+ lea r0, [r0 + r1]
2329
+ lea r2, [r2 + r3]
2330
+ dec r6d
2331
+
2332
+.loop:
2333
+ PROCESS_IPFILTER_CHROMA_PS_32x2_AVX512
2334
+ lea r0, [r0 + 2 * r1]
2335
+ lea r2, [r2 + 2 * r3]
2336
+ sub r6d, 2
2337
+ jnz .loop
2338
+ RET
2339
+%endif
2340
+%endmacro
2341
+
2342
+IPFILTER_CHROMA_PS_AVX512_32xN 8
2343
+IPFILTER_CHROMA_PS_AVX512_32xN 16
2344
+IPFILTER_CHROMA_PS_AVX512_32xN 24
2345
+IPFILTER_CHROMA_PS_AVX512_32xN 32
2346
+IPFILTER_CHROMA_PS_AVX512_32xN 48
2347
+IPFILTER_CHROMA_PS_AVX512_32xN 64
2348
+
2349
+%macro PROCESS_IPFILTER_CHROMA_PS_64x2_AVX512 0
2350
+ ; register map
2351
+ ; m0 , m1 - interpolate coeff
2352
+ ; m2 , m3 -shuffle order table
2353
+ ; m4 - INTERP_OFFSET_PS
2354
+ ; m5 - shuffle store order table
2355
+
2356
+
2357
+ movu m6, [r0]
2358
+ movu m7, [r0 + 8]
2359
+
2360
+ pshufb m8, m6, m3
2361
+ pshufb m6, m2
2362
+ pmaddwd m6, m0
2363
+ pmaddwd m8, m1
2364
+ paddd m6, m8
2365
+ paddd m6, m4
2366
+ psrad m6, INTERP_SHIFT_PS
2367
+
2368
+ pshufb m8, m7, m3
2369
+ pshufb m7, m2
2370
+ pmaddwd m7, m0
2371
+ pmaddwd m8, m1
2372
+ paddd m7, m8
2373
+ paddd m7, m4
2374
+ psrad m7, INTERP_SHIFT_PS
2375
+
2376
+ packssdw m6, m7
2377
+ pshufb m6, m5
2378
+ movu [r2], m6
2379
+
2380
+ movu m6, [r0 + mmsize]
2381
+ movu m7, [r0 + mmsize + 8]
2382
+
2383
+ pshufb m8, m6, m3
2384
+ pshufb m6, m2
2385
+ pmaddwd m6, m0
2386
+ pmaddwd m8, m1
2387
+ paddd m6, m8
2388
+ paddd m6, m4
2389
+ psrad m6, INTERP_SHIFT_PS
2390
+
2391
+ pshufb m8, m7, m3
2392
+ pshufb m7, m2
2393
+ pmaddwd m7, m0
2394
+ pmaddwd m8, m1
2395
+ paddd m7, m8
2396
+ paddd m7, m4
2397
+ psrad m7, INTERP_SHIFT_PS
2398
+
2399
+ packssdw m6, m7
2400
+ pshufb m6, m5
2401
+ movu [r2 + mmsize], m6
2402
+
2403
+ movu m6, [r0 + r1]
2404
+ movu m7, [r0 + r1 + 8]
2405
+
2406
+ pshufb m8, m6, m3
2407
+ pshufb m6, m2
2408
+ pmaddwd m6, m0
2409
+ pmaddwd m8, m1
2410
+ paddd m6, m8
2411
+ paddd m6, m4
2412
+ psrad m6, INTERP_SHIFT_PS
2413
+
2414
+ pshufb m8, m7, m3
2415
+ pshufb m7, m2
2416
+ pmaddwd m7, m0
2417
+ pmaddwd m8, m1
2418
+ paddd m7, m8
2419
+ paddd m7, m4
2420
+ psrad m7, INTERP_SHIFT_PS
2421
+
2422
+ packssdw m6, m7
2423
+ pshufb m6, m5
2424
+ movu [r2 + r3], m6
2425
+
2426
+ movu m6, [r0 + r1 + mmsize]
2427
+ movu m7, [r0 + r1 + mmsize + 8]
2428
+
2429
+ pshufb m8, m6, m3
2430
+ pshufb m6, m2
2431
+ pmaddwd m6, m0
2432
+ pmaddwd m8, m1
2433
+ paddd m6, m8
2434
+ paddd m6, m4
2435
+ psrad m6, INTERP_SHIFT_PS
2436
+
2437
+ pshufb m8, m7, m3
2438
+ pshufb m7, m2
2439
+ pmaddwd m7, m0
2440
+ pmaddwd m8, m1
2441
+ paddd m7, m8
2442
+ paddd m7, m4
2443
+ psrad m7, INTERP_SHIFT_PS
2444
+
2445
+ packssdw m6, m7
2446
+ pshufb m6, m5
2447
+ movu [r2 + r3 + mmsize], m6
2448
+%endmacro
2449
+
2450
+%macro PROCESS_IPFILTER_CHROMA_PS_64x1_AVX512 0
2451
+ movu m6, [r0]
2452
+ movu m7, [r0 + 8]
2453
+
2454
+ pshufb m8, m6, m3
2455
+ pshufb m6, m2
2456
+ pmaddwd m6, m0
2457
+ pmaddwd m8, m1
2458
+ paddd m6, m8
2459
+ paddd m6, m4
2460
+ psrad m6, INTERP_SHIFT_PS
2461
+
2462
+ pshufb m8, m7, m3
2463
+ pshufb m7, m2
2464
+ pmaddwd m7, m0
2465
+ pmaddwd m8, m1
2466
+ paddd m7, m8
2467
+ paddd m7, m4
2468
+ psrad m7, INTERP_SHIFT_PS
2469
+
2470
+ packssdw m6, m7
2471
+ pshufb m6, m5
2472
+ movu [r2], m6
2473
+
2474
+ movu m6, [r0 + mmsize]
2475
+ movu m7, [r0 + mmsize + 8]
2476
+
2477
+ pshufb m8, m6, m3
2478
+ pshufb m6, m2
2479
+ pmaddwd m6, m0
2480
+ pmaddwd m8, m1
2481
+ paddd m6, m8
2482
+ paddd m6, m4
2483
+ psrad m6, INTERP_SHIFT_PS
2484
+
2485
+ pshufb m8, m7, m3
2486
+ pshufb m7, m2
2487
+ pmaddwd m7, m0
2488
+ pmaddwd m8, m1
2489
+ paddd m7, m8
2490
+ paddd m7, m4
2491
+ psrad m7, INTERP_SHIFT_PS
2492
+
2493
+ packssdw m6, m7
2494
+ pshufb m6, m5
2495
+ movu [r2 + mmsize], m6
2496
+%endmacro
2497
+
2498
+%macro IPFILTER_CHROMA_PS_AVX512_64xN 1
2499
+%if ARCH_X86_64 == 1
2500
+INIT_ZMM avx512
2501
+cglobal interp_4tap_horiz_ps_64x%1, 4,7,9
2502
+ shl r1d, 1
2503
+ shl r3d, 1
2504
+ mov r4d, r4m
2505
+ mov r5d, r5m
2506
+%ifdef PIC
2507
+ lea r6, [tab_ChromaCoeff]
2508
+ vpbroadcastd m0, [r6 + r4 * 8]
2509
+ vpbroadcastd m1, [r6 + r4 * 8 + 4]
2510
+%else
2511
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 8]
2512
+ vpbroadcastd m1, [tab_ChromaCoeff + r4 * 8 + 4]
2513
+%endif
2514
+ vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
2515
+ vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
2516
+ vbroadcasti32x4 m4, [INTERP_OFFSET_PS]
2517
+ vbroadcasti32x8 m5, [interp8_hpp_shuf1_store_avx512]
2518
+ mov r6d, %1
2519
+ sub r0, 2
2520
+ test r5d, r5d
2521
+ jz .loop
2522
+ sub r0, r1
2523
+ add r6d, 3
2524
+ PROCESS_IPFILTER_CHROMA_PS_64x1_AVX512
2525
+ lea r0, [r0 + r1]
2526
+ lea r2, [r2 + r3]
2527
+ dec r6d
2528
+
2529
+.loop:
2530
+ PROCESS_IPFILTER_CHROMA_PS_64x2_AVX512
2531
+ lea r0, [r0 + 2 * r1]
2532
+ lea r2, [r2 + 2 * r3]
2533
+ sub r6d, 2
2534
+ jnz .loop
2535
+ RET
2536
+%endif
2537
+%endmacro
2538
+
2539
+IPFILTER_CHROMA_PS_AVX512_64xN 16
2540
+IPFILTER_CHROMA_PS_AVX512_64xN 32
2541
+IPFILTER_CHROMA_PS_AVX512_64xN 48
2542
+IPFILTER_CHROMA_PS_AVX512_64xN 64
2543
+
2544
+%macro PROCESS_IPFILTER_CHROMA_PS_16x2_AVX512 0
2545
+ ; register map
2546
+ ; m0 , m1 - interpolate coeff
2547
+ ; m2 , m3 - shuffle order table
2548
+ ; m4 - INTERP_OFFSET_PS
2549
+ ; m5 - shuffle store order table
2550
+
2551
+ movu ym6, [r0]
2552
+ vinserti32x8 m6, [r0 + r1], 1
2553
+ movu ym7, [r0 + 8]
2554
+ vinserti32x8 m7, [r0 + r1 + 8], 1
2555
+
2556
+ pshufb m8, m6, m3
2557
+ pshufb m6, m2
2558
+ pmaddwd m6, m0
2559
+ pmaddwd m8, m1
2560
+ paddd m6, m8
2561
+ paddd m6, m4
2562
+ psrad m6, INTERP_SHIFT_PS
2563
+
2564
+ pshufb m8, m7, m3
2565
+ pshufb m7, m2
2566
+ pmaddwd m7, m0
2567
+ pmaddwd m8, m1
2568
+ paddd m7, m8
2569
+ paddd m7, m4
2570
+ psrad m7, INTERP_SHIFT_PS
2571
+
2572
+ packssdw m6, m7
2573
+ pshufb m6, m5
2574
+ movu [r2], ym6
2575
+ vextracti32x8 [r2 + r3], m6, 1
2576
+%endmacro
2577
+%macro PROCESS_IPFILTER_CHROMA_PS_16x1_AVX512 0
2578
+ movu ym6, [r0]
2579
+ vinserti32x8 m6, [r0 + 8], 1
2580
+
2581
+ pshufb m8, m6, m3
2582
+ pshufb m6, m2
2583
+ pmaddwd m6, m0
2584
+ pmaddwd m8, m1
2585
+ paddd m6, m8
2586
+ paddd m6, m4
2587
+ psrad m6, INTERP_SHIFT_PS
2588
+
2589
+ vextracti32x8 ym7, m6, 1
2590
+ packssdw ym6, ym7
2591
+ pshufb ym6, ym5
2592
+ movu [r2], ym6
2593
+%endmacro
2594
+%macro IPFILTER_CHROMA_PS_AVX512_16xN 1
2595
+%if ARCH_X86_64 == 1
2596
+INIT_ZMM avx512
2597
+cglobal interp_4tap_horiz_ps_16x%1, 4,7,9
2598
+ shl r1d, 1
2599
+ shl r3d, 1
2600
+ mov r4d, r4m
2601
+ mov r5d, r5m
2602
+%ifdef PIC
2603
+ lea r6, [tab_ChromaCoeff]
2604
+ vpbroadcastd m0, [r6 + r4 * 8]
2605
+ vpbroadcastd m1, [r6 + r4 * 8 + 4]
2606
+%else
2607
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 8]
2608
+ vpbroadcastd m1, [tab_ChromaCoeff + r4 * 8 + 4]
2609
+%endif
2610
+ mova m2, [interp8_hpp_shuf1_load_avx512]
2611
+ mova m3, [interp8_hpp_shuf2_load_avx512]
2612
+ vbroadcasti32x4 m4, [INTERP_OFFSET_PS]
2613
+ mova m5, [interp8_hpp_shuf1_store_avx512]
2614
+ mov r6d, %1
2615
+ sub r0, 2
2616
+ test r5d, r5d
2617
+ jz .loop
2618
+ sub r0, r1
2619
+ add r6d, 3
2620
+ PROCESS_IPFILTER_CHROMA_PS_16x1_AVX512
2621
+ lea r0, [r0 + r1]
2622
+ lea r2, [r2 + r3]
2623
+ dec r6d
2624
+
2625
+.loop:
2626
+ PROCESS_IPFILTER_CHROMA_PS_16x2_AVX512
2627
+ lea r0, [r0 + 2 * r1]
2628
+ lea r2, [r2 + 2 * r3]
2629
+ sub r6d, 2
2630
+ jnz .loop
2631
+ RET
2632
+%endif
2633
+%endmacro
2634
+
2635
+IPFILTER_CHROMA_PS_AVX512_16xN 4
2636
+IPFILTER_CHROMA_PS_AVX512_16xN 8
2637
+IPFILTER_CHROMA_PS_AVX512_16xN 12
2638
+IPFILTER_CHROMA_PS_AVX512_16xN 16
2639
+IPFILTER_CHROMA_PS_AVX512_16xN 24
2640
+IPFILTER_CHROMA_PS_AVX512_16xN 32
2641
+IPFILTER_CHROMA_PS_AVX512_16xN 64
2642
+
2643
+%macro PROCESS_IPFILTER_CHROMA_PS_48x2_AVX512 0
2644
+ ; register map
2645
+ ; m0 , m1 - interpolate coeff
2646
+ ; m2 , m3 - shuffle load order table
2647
+ ; m4 - INTERP_OFFSET_PS
2648
+ ; m5 - shuffle store order table
2649
+
2650
+ movu m6, [r0]
2651
+ movu m7, [r0 + 8]
2652
+
2653
+ pshufb m8, m6, m3
2654
+ pshufb m6, m2
2655
+ pmaddwd m6, m0
2656
+ pmaddwd m8, m1
2657
+ paddd m6, m8
2658
+ paddd m6, m4
2659
+ psrad m6, INTERP_SHIFT_PS
2660
+
2661
+ pshufb m8, m7, m3
2662
+ pshufb m7, m2
2663
+ pmaddwd m7, m0
2664
+ pmaddwd m8, m1
2665
+ paddd m7, m8
2666
+ paddd m7, m4
2667
+ psrad m7, INTERP_SHIFT_PS
2668
+
2669
+ packssdw m6, m7
2670
+ pshufb m6, m5
2671
+ movu [r2], m6
2672
+
2673
+ movu m6, [r0 + r1]
2674
+ movu m7, [r0 + r1 + 8]
2675
+
2676
+ pshufb m8, m6, m3
2677
+ pshufb m6, m2
2678
+ pmaddwd m6, m0
2679
+ pmaddwd m8, m1
2680
+ paddd m6, m8
2681
+ paddd m6, m4
2682
+ psrad m6, INTERP_SHIFT_PS
2683
+
2684
+ pshufb m8, m7, m3
2685
+ pshufb m7, m2
2686
+ pmaddwd m7, m0
2687
+ pmaddwd m8, m1
2688
+ paddd m7, m8
2689
+ paddd m7, m4
2690
+ psrad m7, INTERP_SHIFT_PS
2691
+
2692
+ packssdw m6, m7
2693
+ pshufb m6, m5
2694
+ movu [r2 + r3], m6
2695
+
2696
+ movu ym6, [r0 + mmsize]
2697
+ vinserti32x8 m6, [r0 + r1 + mmsize], 1
2698
+ movu ym7, [r0 + mmsize + 8]
2699
+ vinserti32x8 m7, [r0 + r1 + mmsize + 8], 1
2700
+
2701
+ pshufb m8, m6, m3
2702
+ pshufb m6, m2
2703
+ pmaddwd m6, m0
2704
+ pmaddwd m8, m1
2705
+ paddd m6, m8
2706
+ paddd m6, m4
2707
+ psrad m6, INTERP_SHIFT_PS
2708
+
2709
+ pshufb m8, m7, m3
2710
+ pshufb m7, m2
2711
+ pmaddwd m7, m0
2712
+ pmaddwd m8, m1
2713
+ paddd m7, m8
2714
+ paddd m7, m4
2715
+ psrad m7, INTERP_SHIFT_PS
2716
+
2717
+ packssdw m6, m7
2718
+ pshufb m6, m5
2719
+ movu [r2 + mmsize], ym6
2720
+ vextracti32x8 [r2 + r3 + mmsize], m6, 1
2721
+%endmacro
2722
+
2723
+%macro PROCESS_IPFILTER_CHROMA_PS_48x1_AVX512 0
2724
+ ; register map
2725
+ ; m0 , m1 - interpolate coeff
2726
+ ; m2 , m3 - shuffle load order table
2727
+ ; m4 - INTERP_OFFSET_PS
2728
+ ; m5 - shuffle store order table
2729
+
2730
+ movu m6, [r0]
2731
+ movu m7, [r0 + 8]
2732
+
2733
+ pshufb m8, m6, m3
2734
+ pshufb m6, m2
2735
+ pmaddwd m6, m0
2736
+ pmaddwd m8, m1
2737
+ paddd m6, m8
2738
+ paddd m6, m4
2739
+ psrad m6, INTERP_SHIFT_PS
2740
+
2741
+ pshufb m8, m7, m3
2742
+ pshufb m7, m2
2743
+ pmaddwd m7, m0
2744
+ pmaddwd m8, m1
2745
+ paddd m7, m8
2746
+ paddd m7, m4
2747
+ psrad m7, INTERP_SHIFT_PS
2748
+
2749
+ packssdw m6, m7
2750
+ pshufb m6, m5
2751
+ movu [r2], m6
2752
+
2753
+ movu ym6, [r0 + mmsize]
2754
+ movu ym7, [r0 + mmsize + 8]
2755
+
2756
+ pshufb ym8, ym6, ym3
2757
+ pshufb ym6, ym2
2758
+ pmaddwd ym6, ym0
2759
+ pmaddwd ym8, ym1
2760
+ paddd ym6, ym8
2761
+ paddd ym6, ym4
2762
+ psrad ym6, INTERP_SHIFT_PS
2763
+
2764
+ pshufb ym8, ym7, ym3
2765
+ pshufb ym7, ym2
2766
+ pmaddwd ym7, ym0
2767
+ pmaddwd ym8, ym1
2768
+ paddd ym7, ym8
2769
+ paddd ym7, ym4
2770
+ psrad ym7, INTERP_SHIFT_PS
2771
+
2772
+ packssdw ym6, ym7
2773
+ pshufb ym6, ym5
2774
+ movu [r2 + mmsize], ym6
2775
+%endmacro
2776
+
2777
+%if ARCH_X86_64 == 1
2778
+INIT_ZMM avx512
2779
+cglobal interp_4tap_horiz_ps_48x64, 4,7,9
2780
+ shl r1d, 1
2781
+ shl r3d, 1
2782
+ mov r4d, r4m
2783
+ mov r5d, r5m
2784
+
2785
+%ifdef PIC
2786
+ lea r6, [tab_ChromaCoeff]
2787
+ vpbroadcastd m0, [r6 + r4 * 8]
2788
+ vpbroadcastd m1, [r6 + r4 * 8 + 4]
2789
+%else
2790
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 8]
2791
+ vpbroadcastd m1, [tab_ChromaCoeff + r4 * 8 + 4]
2792
+%endif
2793
+ vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
2794
+ vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
2795
+ vbroadcasti32x4 m4, [INTERP_OFFSET_PS]
2796
+ vbroadcasti32x8 m5, [interp8_hpp_shuf1_store_avx512]
2797
+
2798
+ mov r6d, 64
2799
+ sub r0, 2
2800
+ test r5d, r5d
2801
+ jz .loop
2802
+ sub r0, r1
2803
+ add r6d, 3
2804
+ PROCESS_IPFILTER_CHROMA_PS_48x1_AVX512
2805
+ lea r0, [r0 + r1]
2806
+ lea r2, [r2 + r3]
2807
+ dec r6d
2808
+.loop:
2809
+ PROCESS_IPFILTER_CHROMA_PS_48x2_AVX512
2810
+ lea r0, [r0 + 2 * r1]
2811
+ lea r2, [r2 + 2 * r3]
2812
+ sub r6d, 2
2813
+ jnz .loop
2814
+ RET
2815
+%endif
2816
+
2817
+%macro PROCESS_IPFILTER_CHROMA_PS_8x4_AVX512 0
2818
+ ; register map
2819
+ ; m0 , m1 - interpolate coeff
2820
+ ; m2 , m3 - shuffle load order table
2821
+ ; m4 - INTERP_OFFSET_PS
2822
+ ; m5 - shuffle store order table
2823
+
2824
+ movu xm6, [r0]
2825
+ vinserti32x4 m6, [r0 + r1], 1
2826
+ vinserti32x4 m6, [r0 + 2 * r1], 2
2827
+ vinserti32x4 m6, [r0 + r6], 3
2828
+
2829
+ pshufb m8, m6, m3
2830
+ pshufb m6, m2
2831
+ pmaddwd m6, m0
2832
+ pmaddwd m8, m1
2833
+ paddd m6, m8
2834
+ paddd m6, m4
2835
+ psrad m6, INTERP_SHIFT_PS
2836
+
2837
+ movu xm7, [r0 + 8]
2838
+ vinserti32x4 m7, [r0 + r1 + 8], 1
2839
+ vinserti32x4 m7, [r0 + 2 * r1 + 8], 2
2840
+ vinserti32x4 m7, [r0 + r6 + 8], 3
2841
+
2842
+ pshufb m8, m7, m3
2843
+ pshufb m7, m2
2844
+ pmaddwd m7, m0
2845
+ pmaddwd m8, m1
2846
+ paddd m7, m8
2847
+ paddd m7, m4
2848
+ psrad m7, INTERP_SHIFT_PS
2849
+
2850
+ packssdw m6, m7
2851
+ pshufb m6, m5
2852
+ movu [r2], xm6
2853
+ vextracti32x4 [r2 + r3], m6, 1
2854
+ vextracti32x4 [r2 + 2 * r3], m6, 2
2855
+ vextracti32x4 [r2 + r7], m6, 3
2856
+%endmacro
2857
+
2858
+%macro PROCESS_IPFILTER_CHROMA_PS_8x3_AVX512 0
2859
+ movu xm6, [r0]
2860
+ vinserti32x4 m6, [r0 + r1], 1
2861
+ vinserti32x4 m6, [r0 + 2 * r1], 2
2862
+
2863
+ pshufb m8, m6, m3
2864
+ pshufb m6, m2
2865
+ pmaddwd m6, m0
2866
+ pmaddwd m8, m1
2867
+ paddd m6, m8
2868
+ paddd m6, m4
2869
+ psrad m6, INTERP_SHIFT_PS
2870
+
2871
+ movu xm7, [r0 + 8]
2872
+ vinserti32x4 m7, [r0 + r1 + 8], 1
2873
+ vinserti32x4 m7, [r0 + 2 * r1 + 8], 2
2874
+
2875
+ pshufb m8, m7, m3
2876
+ pshufb m7, m2
2877
+ pmaddwd m7, m0
2878
+ pmaddwd m8, m1
2879
+ paddd m7, m8
2880
+ paddd m7, m4
2881
+ psrad m7, INTERP_SHIFT_PS
2882
+
2883
+ packssdw m6, m7
2884
+ pshufb m6, m5
2885
+ movu [r2], xm6
2886
+ vextracti32x4 [r2 + r3], m6, 1
2887
+ vextracti32x4 [r2 + 2 * r3], m6, 2
2888
+%endmacro
2889
+
2890
+%macro IPFILTER_CHROMA_PS_AVX512_8xN 1
2891
+INIT_ZMM avx512
2892
+cglobal interp_4tap_horiz_ps_8x%1, 4,9,9
2893
+ shl r1d, 1
2894
+ shl r3d, 1
2895
+ mov r4d, r4m
2896
+ mov r5d, r5m
2897
+
2898
+ lea r6, [3 * r1]
2899
+ lea r7, [3 * r3]
2900
+%ifdef PIC
2901
+ lea r8, [tab_ChromaCoeff]
2902
+ vpbroadcastd m0, [r8 + r4 * 8]
2903
+ vpbroadcastd m1, [r8 + r4 * 8 + 4]
2904
+%else
2905
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 8]
2906
+ vpbroadcastd m1, [tab_ChromaCoeff + r4 * 8 + 4]
2907
+%endif
2908
+ vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
2909
+ vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
2910
+ vbroadcasti32x4 m4, [INTERP_OFFSET_PS]
2911
+ vbroadcasti32x8 m5, [interp8_hpp_shuf1_store_avx512]
2912
+
2913
+ mov r8d, %1
2914
+ sub r0, 2
2915
+ test r5d, r5d
2916
+ jz .loop
2917
+ sub r0, r1
2918
+ add r8d, 3
2919
+ PROCESS_IPFILTER_CHROMA_PS_8x3_AVX512
2920
+ lea r0, [r0 + r6]
2921
+ lea r2, [r2 + r7]
2922
+ sub r8d, 3
2923
+
2924
+.loop:
2925
+ PROCESS_IPFILTER_CHROMA_PS_8x4_AVX512
2926
+ lea r0, [r0 + 4 * r1]
2927
+ lea r2, [r2 + 4 * r3]
2928
+ sub r8d, 4
2929
+ jnz .loop
2930
+ RET
2931
+%endmacro
2932
+
2933
+%if ARCH_X86_64
2934
+IPFILTER_CHROMA_PS_AVX512_8xN 4
2935
+IPFILTER_CHROMA_PS_AVX512_8xN 8
2936
+IPFILTER_CHROMA_PS_AVX512_8xN 12
2937
+IPFILTER_CHROMA_PS_AVX512_8xN 16
2938
+IPFILTER_CHROMA_PS_AVX512_8xN 32
2939
+IPFILTER_CHROMA_PS_AVX512_8xN 64
2940
+%endif
2941
+
2942
+%macro PROCESS_IPFILTER_CHROMA_PS_24x4_AVX512 0
2943
+ ; register map
2944
+ ; m0 , m1 - interpolate coeff
2945
+ ; m2 , m3 - shuffle order table
2946
+ ; m4 - INTERP_OFFSET_PS
2947
+ ; m5 - shuffle store order table
2948
+
2949
+ movu ym6, [r0]
2950
+ vinserti32x8 m6, [r0 + r1], 1
2951
+ movu ym7, [r0 + 8]
2952
+ vinserti32x8 m7, [r0 + r1 + 8], 1
2953
+
2954
+ pshufb m8, m6, m3
2955
+ pshufb m6, m2
2956
+ pmaddwd m6, m0
2957
+ pmaddwd m8, m1
2958
+ paddd m6, m8
2959
+ paddd m6, m4
2960
+ psrad m6, INTERP_SHIFT_PS
2961
+
2962
+ pshufb m8, m7, m3
2963
+ pshufb m7, m2
2964
+ pmaddwd m7, m0
2965
+ pmaddwd m8, m1
2966
+ paddd m7, m8
2967
+ paddd m7, m4
2968
+ psrad m7, INTERP_SHIFT_PS
2969
+
2970
+ packssdw m6, m7
2971
+ pshufb m6, m5
2972
+ movu [r2], ym6
2973
+ vextracti32x8 [r2 + r3], m6, 1
2974
+
2975
+ movu ym6, [r0 + 2 * r1]
2976
+ vinserti32x8 m6, [r0 + r6], 1
2977
+ movu ym7, [r0 + 2 * r1 + 8]
2978
+ vinserti32x8 m7, [r0 + r6 + 8], 1
2979
+
2980
+ pshufb m8, m6, m3
2981
+ pshufb m6, m2
2982
+ pmaddwd m6, m0
2983
+ pmaddwd m8, m1
2984
+ paddd m6, m8
2985
+ paddd m6, m4
2986
+ psrad m6, INTERP_SHIFT_PS
2987
+
2988
+ pshufb m8, m7, m3
2989
+ pshufb m7, m2
2990
+ pmaddwd m7, m0
2991
+ pmaddwd m8, m1
2992
+ paddd m7, m8
2993
+ paddd m7, m4
2994
+ psrad m7, INTERP_SHIFT_PS
2995
+
2996
+ packssdw m6, m7
2997
+ pshufb m6, m5
2998
+ movu [r2 + 2 * r3], ym6
2999
+ vextracti32x8 [r2 + r7], m6, 1
3000
+
3001
+ movu xm6, [r0 + mmsize/2]
3002
+ vinserti32x4 m6, [r0 + r1 + mmsize/2], 1
3003
+ vinserti32x4 m6, [r0 + 2 * r1 + mmsize/2], 2
3004
+ vinserti32x4 m6, [r0 + r6 + mmsize/2], 3
3005
+
3006
+ pshufb m8, m6, m3
3007
+ pshufb m6, m2
3008
+ pmaddwd m6, m0
3009
+ pmaddwd m8, m1
3010
+ paddd m6, m8
3011
+ paddd m6, m4
3012
+ psrad m6, INTERP_SHIFT_PS
3013
+
3014
+ movu xm7, [r0 + mmsize/2 + 8]
3015
+ vinserti32x4 m7, [r0 + r1 + mmsize/2 + 8], 1
3016
+ vinserti32x4 m7, [r0 + 2 * r1 + mmsize/2 + 8], 2
3017
+ vinserti32x4 m7, [r0 + r6 + mmsize/2 + 8], 3
3018
+
3019
+ pshufb m8, m7, m3
3020
+ pshufb m7, m2
3021
+ pmaddwd m7, m0
3022
+ pmaddwd m8, m1
3023
+ paddd m7, m8
3024
+ paddd m7, m4
3025
+ psrad m7, INTERP_SHIFT_PS
3026
+
3027
+ packssdw m6, m7
3028
+ pshufb m6, m5
3029
+ movu [r2 + mmsize/2], xm6
3030
+ vextracti32x4 [r2 + r3 + mmsize/2], m6, 1
3031
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m6, 2
3032
+ vextracti32x4 [r2 + r7 + mmsize/2], m6, 3
3033
+%endmacro
3034
+
3035
+%macro PROCESS_IPFILTER_CHROMA_PS_24x3_AVX512 0
3036
+ movu ym6, [r0]
3037
+ vinserti32x8 m6, [r0 + r1], 1
3038
+ movu ym7, [r0 + 8]
3039
+ vinserti32x8 m7, [r0 + r1 + 8], 1
3040
+
3041
+ pshufb m8, m6, m3
3042
+ pshufb m6, m2
3043
+ pmaddwd m6, m0
3044
+ pmaddwd m8, m1
3045
+ paddd m6, m8
3046
+ paddd m6, m4
3047
+ psrad m6, INTERP_SHIFT_PS
3048
+
3049
+ pshufb m8, m7, m3
3050
+ pshufb m7, m2
3051
+ pmaddwd m7, m0
3052
+ pmaddwd m8, m1
3053
+ paddd m7, m8
3054
+ paddd m7, m4
3055
+ psrad m7, INTERP_SHIFT_PS
3056
+
3057
+ packssdw m6, m7
3058
+ pshufb m6, m5
3059
+ movu [r2], ym6
3060
+ vextracti32x8 [r2 + r3], m6, 1
3061
+
3062
+ movu ym6, [r0 + 2 * r1]
3063
+ movu ym7, [r0 + 2 * r1 + 8]
3064
+
3065
+ pshufb ym8, ym6, ym3
3066
+ pshufb ym6, ym2
3067
+ pmaddwd ym6, ym0
3068
+ pmaddwd ym8, ym1
3069
+ paddd ym6, ym8
3070
+ paddd ym6, ym4
3071
+ psrad ym6, INTERP_SHIFT_PS
3072
+
3073
+ pshufb ym8, ym7, ym3
3074
+ pshufb ym7, ym2
3075
+ pmaddwd ym7, ym0
3076
+ pmaddwd ym8, ym1
3077
+ paddd ym7, ym8
3078
+ paddd ym7, ym4
3079
+ psrad ym7, INTERP_SHIFT_PS
3080
+
3081
+ packssdw ym6, ym7
3082
+ pshufb ym6, ym5
3083
+ movu [r2 + 2 * r3], ym6
3084
+
3085
+ movu xm6, [r0 + mmsize/2]
3086
+ vinserti32x4 m6, [r0 + r1 + mmsize/2], 1
3087
+ vinserti32x4 m6, [r0 + 2 * r1 + mmsize/2], 2
3088
+
3089
+ pshufb m8, m6, m3
3090
+ pshufb m6, m2
3091
+ pmaddwd m6, m0
3092
+ pmaddwd m8, m1
3093
+ paddd m6, m8
3094
+ paddd m6, m4
3095
+ psrad m6, INTERP_SHIFT_PS
3096
+
3097
+ movu xm7, [r0 + mmsize/2 + 8]
3098
+ vinserti32x4 m7, [r0 + r1 + mmsize/2 + 8], 1
3099
+ vinserti32x4 m7, [r0 + 2 * r1 + mmsize/2 + 8], 2
3100
+
3101
+ pshufb m8, m7, m3
3102
+ pshufb m7, m2
3103
+ pmaddwd m7, m0
3104
+ pmaddwd m8, m1
3105
+ paddd m7, m8
3106
+ paddd m7, m4
3107
+ psrad m7, INTERP_SHIFT_PS
3108
+
3109
+ packssdw m6, m7
3110
+ pshufb m6, m5
3111
+ movu [r2 + mmsize/2], xm6
3112
+ vextracti32x4 [r2 + r3 + mmsize/2], m6, 1
3113
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m6, 2
3114
+%endmacro
3115
+
3116
+%macro IPFILTER_CHROMA_PS_AVX512_24xN 1
3117
+INIT_ZMM avx512
3118
+cglobal interp_4tap_horiz_ps_24x%1, 4,9,9
3119
+ shl r1d, 1
3120
+ shl r3d, 1
3121
+ mov r4d, r4m
3122
+ mov r5d, r5m
3123
+
3124
+ lea r6, [3 * r1]
3125
+ lea r7, [3 * r3]
3126
+%ifdef PIC
3127
+ lea r8, [tab_ChromaCoeff]
3128
+ vpbroadcastd m0, [r8 + r4 * 8]
3129
+ vpbroadcastd m1, [r8 + r4 * 8 + 4]
3130
+%else
3131
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 8]
3132
+ vpbroadcastd m1, [tab_ChromaCoeff + r4 * 8 + 4]
3133
+%endif
3134
+ vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
3135
+ vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
3136
+ vbroadcasti32x4 m4, [INTERP_OFFSET_PS]
3137
+ vbroadcasti32x8 m5,[interp8_hpp_shuf1_store_avx512]
3138
+
3139
+ mov r8d, %1
3140
+ sub r0, 2
3141
+ test r5d, r5d
3142
+ jz .loop
3143
+ sub r0, r1
3144
+ add r8d, 3
3145
+ PROCESS_IPFILTER_CHROMA_PS_24x3_AVX512
3146
+ lea r0, [r0 + r6]
3147
+ lea r2, [r2 + r7]
3148
+ sub r8d, 3
3149
+
3150
+.loop:
3151
+ PROCESS_IPFILTER_CHROMA_PS_24x4_AVX512
3152
+ lea r0, [r0 + 4 * r1]
3153
+ lea r2, [r2 + 4 * r3]
3154
+ sub r8d, 4
3155
+ jnz .loop
3156
+ RET
3157
+%endmacro
3158
+
3159
+%if ARCH_X86_64
3160
+IPFILTER_CHROMA_PS_AVX512_24xN 32
3161
+IPFILTER_CHROMA_PS_AVX512_24xN 64
3162
+%endif
3163
+;-------------------------------------------------------------------------------------------------------------
3164
+; avx512 chroma_hps code end
3165
+;-------------------------------------------------------------------------------------------------------------
3166
+;-------------------------------------------------------------------------------------------------------------
3167
+; avx512 chroma_vps code start
3168
+;-------------------------------------------------------------------------------------------------------------
3169
+%macro PROCESS_CHROMA_VERT_PS_8x8_AVX512 0
3170
+ movu xm1, [r0]
3171
+ lea r6, [r0 + 2 * r1]
3172
+ lea r8, [r0 + 4 * r1]
3173
+ lea r9, [r8 + 2 * r1]
3174
+ vinserti32x4 m1, [r6], 1
3175
+ vinserti32x4 m1, [r8], 2
3176
+ vinserti32x4 m1, [r9], 3
3177
+ movu xm3, [r0 + r1]
3178
+ vinserti32x4 m3, [r6 + r1], 1
3179
+ vinserti32x4 m3, [r8 + r1], 2
3180
+ vinserti32x4 m3, [r9 + r1], 3
3181
+ punpcklwd m0, m1, m3
3182
+ pmaddwd m0, [r5]
3183
+ punpckhwd m1, m3
3184
+ pmaddwd m1, [r5]
3185
+
3186
+ movu xm4, [r0 + 2 * r1]
3187
+ vinserti32x4 m4, [r6 + 2 * r1], 1
3188
+ vinserti32x4 m4, [r8 + 2 * r1], 2
3189
+ vinserti32x4 m4, [r9 + 2 * r1], 3
3190
+ punpcklwd m2, m3, m4
3191
+ pmaddwd m2, [r5]
3192
+ punpckhwd m3, m4
3193
+ pmaddwd m3, [r5]
3194
+
3195
+ movu xm5, [r0 + r10]
3196
+ vinserti32x4 m5, [r6 + r10], 1
3197
+ vinserti32x4 m5, [r8 + r10], 2
3198
+ vinserti32x4 m5, [r9 + r10], 3
3199
+ punpcklwd m6, m4, m5
3200
+ pmaddwd m6, [r5 + mmsize]
3201
+ paddd m0, m6
3202
+ punpckhwd m4, m5
3203
+ pmaddwd m4, [r5 + mmsize]
3204
+ paddd m1, m4
3205
+
3206
+ movu xm4, [r0 + 4 * r1]
3207
+ vinserti32x4 m4, [r6 + 4 * r1], 1
3208
+ vinserti32x4 m4, [r8 + 4 * r1], 2
3209
+ vinserti32x4 m4, [r9 + 4 * r1], 3
3210
+ punpcklwd m6, m5, m4
3211
+ pmaddwd m6, m9
3212
+ paddd m2, m6
3213
+ punpckhwd m5, m4
3214
+ pmaddwd m5, m9
3215
+ paddd m3, m5
3216
+
3217
+ paddd m0, m7
3218
+ paddd m1, m7
3219
+ paddd m2, m7
3220
+ paddd m3, m7
3221
+
3222
+ psrad m0, INTERP_SHIFT_PS
3223
+ psrad m1, INTERP_SHIFT_PS
3224
+ psrad m2, INTERP_SHIFT_PS
3225
+ psrad m3, INTERP_SHIFT_PS
3226
+
3227
+ packssdw m0, m1
3228
+ packssdw m2, m3
3229
+ movu [r2], xm0
3230
+ movu [r2 + r3], xm2
3231
+ vextracti32x4 [r2 + 2 * r3], m0, 1
3232
+ vextracti32x4 [r2 + r7], m2, 1
3233
+ lea r2, [r2 + 4 * r3]
3234
+ vextracti32x4 [r2], m0, 2
3235
+ vextracti32x4 [r2 + r3], m2, 2
3236
+ vextracti32x4 [r2 + 2 * r3], m0, 3
3237
+ vextracti32x4 [r2 + r7], m2, 3
3238
+%endmacro
3239
+
3240
+;-----------------------------------------------------------------------------------------------------------------
3241
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3242
+;-----------------------------------------------------------------------------------------------------------------
3243
+%if ARCH_X86_64
3244
+INIT_ZMM avx512
3245
+cglobal interp_4tap_vert_ps_8x8, 5, 11, 10
3246
+ add r1d, r1d
3247
+ add r3d, r3d
3248
+ sub r0, r1
3249
+ shl r4d, 7
3250
+
3251
+%ifdef PIC
3252
+ lea r5, [tab_ChromaCoeffV_avx512]
3253
+ lea r5, [r5 + r4]
3254
+%else
3255
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
3256
+%endif
3257
+ vbroadcasti32x4 m7, [INTERP_OFFSET_PS]
3258
+ lea r10, [3 * r1]
3259
+ lea r7, [3 * r3]
3260
+ mova m8, [r5]
3261
+ mova m9, [r5 + mmsize]
3262
+ PROCESS_CHROMA_VERT_PS_8x8_AVX512
3263
+ RET
3264
+%endif
3265
+
3266
+%macro FILTER_VER_PS_CHROMA_8xN_AVX512 1
3267
+INIT_ZMM avx512
3268
+cglobal interp_4tap_vert_ps_8x%1, 5, 11, 10
3269
+ add r1d, r1d
3270
+ add r3d, r3d
3271
+ sub r0, r1
3272
+ shl r4d, 7
3273
+
3274
+%ifdef PIC
3275
+ lea r5, [tab_ChromaCoeffV_avx512]
3276
+ lea r5, [r5 + r4]
3277
+%else
3278
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
3279
+%endif
3280
+ vbroadcasti32x4 m7, [INTERP_OFFSET_PS]
3281
+ lea r10, [3 * r1]
3282
+ lea r7, [3 * r3]
3283
+ mova m8, [r5]
3284
+ mova m9, [r5 + mmsize]
3285
+%rep %1/8 - 1
3286
+ PROCESS_CHROMA_VERT_PS_8x8_AVX512
3287
+ lea r0, [r8 + 4 * r1]
3288
+ lea r2, [r2 + 4 * r3]
3289
+%endrep
3290
+ PROCESS_CHROMA_VERT_PS_8x8_AVX512
3291
+ RET
3292
+%endmacro
3293
+
3294
+%if ARCH_X86_64
3295
+FILTER_VER_PS_CHROMA_8xN_AVX512 16
3296
+FILTER_VER_PS_CHROMA_8xN_AVX512 32
3297
+FILTER_VER_PS_CHROMA_8xN_AVX512 64
3298
+%endif
3299
+
3300
+%macro PROCESS_CHROMA_VERT_PS_16x4_AVX512 0
3301
+ movu ym1, [r0]
3302
+ lea r6, [r0 + 2 * r1]
3303
+ vinserti32x8 m1, [r6], 1
3304
+ movu ym3, [r0 + r1]
3305
+ vinserti32x8 m3, [r6 + r1], 1
3306
+ punpcklwd m0, m1, m3
3307
+ pmaddwd m0, m8
3308
+ punpckhwd m1, m3
3309
+ pmaddwd m1, m8
3310
+
3311
+ movu ym4, [r0 + 2 * r1]
3312
+ vinserti32x8 m4, [r6 + 2 * r1], 1
3313
+ punpcklwd m2, m3, m4
3314
+ pmaddwd m2, m8
3315
+ punpckhwd m3, m4
3316
+ pmaddwd m3, m8
3317
+
3318
+ movu ym5, [r0 + r8]
3319
+ vinserti32x8 m5, [r6 + r8], 1
3320
+ punpcklwd m6, m4, m5
3321
+ pmaddwd m6, m9
3322
+ paddd m0, m6
3323
+ punpckhwd m4, m5
3324
+ pmaddwd m4, m9
3325
+ paddd m1, m4
3326
+
3327
+ movu ym4, [r0 + 4 * r1]
3328
+ vinserti32x8 m4, [r6 + 4 * r1], 1
3329
+ punpcklwd m6, m5, m4
3330
+ pmaddwd m6, m9
3331
+ paddd m2, m6
3332
+ punpckhwd m5, m4
3333
+ pmaddwd m5, m9
3334
+ paddd m3, m5
3335
+
3336
+ paddd m0, m7
3337
+ paddd m1, m7
3338
+ paddd m2, m7
3339
+ paddd m3, m7
3340
+
3341
+ psrad m0, INTERP_SHIFT_PS
3342
+ psrad m1, INTERP_SHIFT_PS
3343
+ psrad m2, INTERP_SHIFT_PS
3344
+ psrad m3, INTERP_SHIFT_PS
3345
+
3346
+ packssdw m0, m1
3347
+ packssdw m2, m3
3348
+ movu [r2], ym0
3349
+ movu [r2 + r3], ym2
3350
+ vextracti32x8 [r2 + 2 * r3], m0, 1
3351
+ vextracti32x8 [r2 + r7], m2, 1
3352
+%endmacro
3353
+
3354
+;-----------------------------------------------------------------------------------------------------------------
3355
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3356
+;-----------------------------------------------------------------------------------------------------------------
3357
+%if ARCH_X86_64
3358
+INIT_ZMM avx512
3359
+cglobal interp_4tap_vert_ps_16x4, 5, 9, 10
3360
+ add r1d, r1d
3361
+ add r3d, r3d
3362
+ sub r0, r1
3363
+ shl r4d, 7
3364
+
3365
+%ifdef PIC
3366
+ lea r5, [tab_ChromaCoeffV_avx512]
3367
+ lea r5, [r5 + r4]
3368
+%else
3369
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
3370
+%endif
3371
+ vbroadcasti32x4 m7, [INTERP_OFFSET_PS]
3372
+ lea r7, [3 * r3]
3373
+ lea r8, [3 * r1]
3374
+ mova m8, [r5]
3375
+ mova m9, [r5 + mmsize]
3376
+ PROCESS_CHROMA_VERT_PS_16x4_AVX512
3377
+ RET
3378
+%endif
3379
+
3380
+%macro FILTER_VER_PS_CHROMA_16xN_AVX512 1
3381
+INIT_ZMM avx512
3382
+cglobal interp_4tap_vert_ps_16x%1, 5, 9, 10
3383
+ add r1d, r1d
3384
+ add r3d, r3d
3385
+ sub r0, r1
3386
+ shl r4d, 7
3387
+
3388
+%ifdef PIC
3389
+ lea r5, [tab_ChromaCoeffV_avx512]
3390
+ lea r5, [r5 + r4]
3391
+%else
3392
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
3393
+%endif
3394
+ vbroadcasti32x4 m7, [INTERP_OFFSET_PS]
3395
+ lea r7, [3 * r3]
3396
+ lea r8, [3 * r1]
3397
+ mova m8, [r5]
3398
+ mova m9, [r5 + mmsize]
3399
+%rep %1/4 - 1
3400
+ PROCESS_CHROMA_VERT_PS_16x4_AVX512
3401
+ lea r0, [r0 + 4 * r1]
3402
+ lea r2, [r2 + 4 * r3]
3403
+%endrep
3404
+ PROCESS_CHROMA_VERT_PS_16x4_AVX512
3405
+ RET
3406
+%endmacro
3407
+
3408
+%if ARCH_X86_64
3409
+FILTER_VER_PS_CHROMA_16xN_AVX512 8
3410
+FILTER_VER_PS_CHROMA_16xN_AVX512 12
3411
+FILTER_VER_PS_CHROMA_16xN_AVX512 16
3412
+FILTER_VER_PS_CHROMA_16xN_AVX512 24
3413
+FILTER_VER_PS_CHROMA_16xN_AVX512 32
3414
+FILTER_VER_PS_CHROMA_16xN_AVX512 64
3415
+%endif
3416
+
3417
+%macro PROCESS_CHROMA_VERT_PS_24x8_AVX512 0
3418
+ movu ym1, [r0]
3419
+ lea r6, [r0 + 2 * r1]
3420
+ lea r8, [r0 + 4 * r1]
3421
+ lea r9, [r8 + 2 * r1]
3422
+
3423
+ movu ym10, [r8]
3424
+ movu ym3, [r0 + r1]
3425
+ movu ym12, [r8 + r1]
3426
+ vinserti32x8 m1, [r6], 1
3427
+ vinserti32x8 m10, [r9], 1
3428
+ vinserti32x8 m3, [r6 + r1], 1
3429
+ vinserti32x8 m12, [r9 + r1], 1
3430
+
3431
+ punpcklwd m0, m1, m3
3432
+ punpcklwd m9, m10, m12
3433
+ pmaddwd m0, m16
3434
+ pmaddwd m9, m16
3435
+ punpckhwd m1, m3
3436
+ punpckhwd m10, m12
3437
+ pmaddwd m1, m16
3438
+ pmaddwd m10, m16
3439
+
3440
+ movu ym4, [r0 + 2 * r1]
3441
+ movu ym13, [r8 + 2 * r1]
3442
+ vinserti32x8 m4, [r6 + 2 * r1], 1
3443
+ vinserti32x8 m13, [r9 + 2 * r1], 1
3444
+ punpcklwd m2, m3, m4
3445
+ punpcklwd m11, m12, m13
3446
+ pmaddwd m2, m16
3447
+ pmaddwd m11, m16
3448
+ punpckhwd m3, m4
3449
+ punpckhwd m12, m13
3450
+ pmaddwd m3, m16
3451
+ pmaddwd m12, m16
3452
+
3453
+ movu ym5, [r0 + r10]
3454
+ vinserti32x8 m5, [r6 + r10], 1
3455
+ movu ym14, [r8 + r10]
3456
+ vinserti32x8 m14, [r9 + r10], 1
3457
+ punpcklwd m6, m4, m5
3458
+ punpcklwd m15, m13, m14
3459
+ pmaddwd m6, m17
3460
+ pmaddwd m15, m17
3461
+ paddd m0, m6
3462
+ paddd m9, m15
3463
+ punpckhwd m4, m5
3464
+ punpckhwd m13, m14
3465
+ pmaddwd m4, m17
3466
+ pmaddwd m13, m17
3467
+ paddd m1, m4
3468
+ paddd m10, m13
3469
+
3470
+ movu ym4, [r0 + 4 * r1]
3471
+ vinserti32x8 m4, [r6 + 4 * r1], 1
3472
+ movu ym13, [r8 + 4 * r1]
3473
+ vinserti32x8 m13, [r9 + 4 * r1], 1
3474
+ punpcklwd m6, m5, m4
3475
+ punpcklwd m15, m14, m13
3476
+ pmaddwd m6, m17
3477
+ pmaddwd m15, m17
3478
+ paddd m2, m6
3479
+ paddd m11, m15
3480
+ punpckhwd m5, m4
3481
+ punpckhwd m14, m13
3482
+ pmaddwd m5, m17
3483
+ pmaddwd m14, m17
3484
+ paddd m3, m5
3485
+ paddd m12, m14
3486
+
3487
+ paddd m0, m7
3488
+ paddd m1, m7
3489
+ paddd m2, m7
3490
+ paddd m3, m7
3491
+ paddd m9, m7
3492
+ paddd m10, m7
3493
+ paddd m11, m7
3494
+ paddd m12, m7
3495
+
3496
+ psrad m0, INTERP_SHIFT_PS
3497
+ psrad m1, INTERP_SHIFT_PS
3498
+ psrad m2, INTERP_SHIFT_PS
3499
+ psrad m3, INTERP_SHIFT_PS
3500
+ psrad m9, INTERP_SHIFT_PS
3501
+ psrad m10, INTERP_SHIFT_PS
3502
+ psrad m11, INTERP_SHIFT_PS
3503
+ psrad m12, INTERP_SHIFT_PS
3504
+
3505
+ packssdw m0, m1
3506
+ packssdw m2, m3
3507
+ packssdw m9, m10
3508
+ packssdw m11, m12
3509
+ movu [r2], ym0
3510
+ movu [r2 + r3], ym2
3511
+ vextracti32x8 [r2 + 2 * r3], m0, 1
3512
+ vextracti32x8 [r2 + r7], m2, 1
3513
+ lea r11, [r2 + 4 * r3]
3514
+ movu [r11], ym9
3515
+ movu [r11 + r3], ym11
3516
+ vextracti32x8 [r11 + 2 * r3], m9, 1
3517
+ vextracti32x8 [r11 + r7], m11, 1
3518
+
3519
+ movu xm1, [r0 + mmsize/2]
3520
+ vinserti32x4 m1, [r6 + mmsize/2], 1
3521
+ vinserti32x4 m1, [r8 + mmsize/2], 2
3522
+ vinserti32x4 m1, [r9 + mmsize/2], 3
3523
+ movu xm3, [r0 + r1 + mmsize/2]
3524
+ vinserti32x4 m3, [r6 + r1 + mmsize/2], 1
3525
+ vinserti32x4 m3, [r8 + r1 + mmsize/2], 2
3526
+ vinserti32x4 m3, [r9 + r1 + mmsize/2], 3
3527
+ punpcklwd m0, m1, m3
3528
+ pmaddwd m0, m16
3529
+ punpckhwd m1, m3
3530
+ pmaddwd m1, m16
3531
+
3532
+ movu xm4, [r0 + 2 * r1 + mmsize/2]
3533
+ vinserti32x4 m4, [r6 + 2 * r1 + mmsize/2], 1
3534
+ vinserti32x4 m4, [r8 + 2 * r1 + mmsize/2], 2
3535
+ vinserti32x4 m4, [r9 + 2 * r1 + mmsize/2], 3
3536
+ punpcklwd m2, m3, m4
3537
+ pmaddwd m2, m16
3538
+ punpckhwd m3, m4
3539
+ pmaddwd m3, m16
3540
+
3541
+ movu xm5, [r0 + r10 + mmsize/2]
3542
+ vinserti32x4 m5, [r6 + r10 + mmsize/2], 1
3543
+ vinserti32x4 m5, [r8 + r10 + mmsize/2], 2
3544
+ vinserti32x4 m5, [r9 + r10 + mmsize/2], 3
3545
+ punpcklwd m6, m4, m5
3546
+ pmaddwd m6, m17
3547
+ paddd m0, m6
3548
+ punpckhwd m4, m5
3549
+ pmaddwd m4, m17
3550
+ paddd m1, m4
3551
+
3552
+ movu xm4, [r0 + 4 * r1 + mmsize/2]
3553
+ vinserti32x4 m4, [r6 + 4 * r1 + mmsize/2], 1
3554
+ vinserti32x4 m4, [r8 + 4 * r1 + mmsize/2], 2
3555
+ vinserti32x4 m4, [r9 + 4 * r1 + mmsize/2], 3
3556
+ punpcklwd m6, m5, m4
3557
+ pmaddwd m6, m17
3558
+ paddd m2, m6
3559
+ punpckhwd m5, m4
3560
+ pmaddwd m5, m17
3561
+ paddd m3, m5
3562
+
3563
+ paddd m0, m7
3564
+ paddd m1, m7
3565
+ paddd m2, m7
3566
+ paddd m3, m7
3567
+
3568
+ psrad m0, INTERP_SHIFT_PS
3569
+ psrad m1, INTERP_SHIFT_PS
3570
+ psrad m2, INTERP_SHIFT_PS
3571
+ psrad m3, INTERP_SHIFT_PS
3572
+
3573
+ packssdw m0, m1
3574
+ packssdw m2, m3
3575
+ movu [r2 + mmsize/2], xm0
3576
+ movu [r2 + r3 + mmsize/2], xm2
3577
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m0, 1
3578
+ vextracti32x4 [r2 + r7 + mmsize/2], m2, 1
3579
+ lea r2, [r2 + 4 * r3]
3580
+ vextracti32x4 [r2 + mmsize/2], m0, 2
3581
+ vextracti32x4 [r2 + r3 + mmsize/2], m2, 2
3582
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m0, 3
3583
+ vextracti32x4 [r2 + r7 + mmsize/2], m2, 3
3584
+%endmacro
3585
+
3586
+%macro FILTER_VER_PS_CHROMA_24xN_AVX512 1
3587
+INIT_ZMM avx512
3588
+cglobal interp_4tap_vert_ps_24x%1, 5, 12, 18
3589
+ add r1d, r1d
3590
+ add r3d, r3d
3591
+ sub r0, r1
3592
+ shl r4d, 7
3593
+
3594
+%ifdef PIC
3595
+ lea r5, [tab_ChromaCoeffV_avx512]
3596
+ lea r5, [r5 + r4]
3597
+%else
3598
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
3599
+%endif
3600
+ vbroadcasti32x4 m7, [INTERP_OFFSET_PS]
3601
+ lea r10, [3 * r1]
3602
+ lea r7, [3 * r3]
3603
+ mova m16, [r5]
3604
+ mova m17, [r5 + mmsize]
3605
+%rep %1/8 - 1
3606
+ PROCESS_CHROMA_VERT_PS_24x8_AVX512
3607
+ lea r0, [r8 + 4 * r1]
3608
+ lea r2, [r2 + 4 * r3]
3609
+%endrep
3610
+ PROCESS_CHROMA_VERT_PS_24x8_AVX512
3611
+ RET
3612
+%endmacro
3613
+
3614
+%if ARCH_X86_64
3615
+ FILTER_VER_PS_CHROMA_24xN_AVX512 32
3616
+ FILTER_VER_PS_CHROMA_24xN_AVX512 64
3617
+%endif
3618
+
3619
+%macro PROCESS_CHROMA_VERT_PS_32x2_AVX512 0
3620
+ movu m1, [r0]
3621
+ movu m3, [r0 + r1]
3622
+ punpcklwd m0, m1, m3
3623
+ pmaddwd m0, m9
3624
+ punpckhwd m1, m3
3625
+ pmaddwd m1, m9
3626
+
3627
+ movu m4, [r0 + 2 * r1]
3628
+ punpcklwd m2, m3, m4
3629
+ pmaddwd m2, m9
3630
+ punpckhwd m3, m4
3631
+ pmaddwd m3, m9
3632
+
3633
+ lea r0, [r0 + 2 * r1]
3634
+ movu m5, [r0 + r1]
3635
+ punpcklwd m6, m4, m5
3636
+ pmaddwd m6, m10
3637
+ paddd m0, m6
3638
+ punpckhwd m4, m5
3639
+ pmaddwd m4, m10
3640
+ paddd m1, m4
3641
+
3642
+ movu m4, [r0 + 2 * r1]
3643
+ punpcklwd m6, m5, m4
3644
+ pmaddwd m6, m10
3645
+ paddd m2, m6
3646
+ punpckhwd m5, m4
3647
+ pmaddwd m5, m10
3648
+ paddd m3, m5
3649
+
3650
+ paddd m0, m7
3651
+ paddd m1, m7
3652
+ paddd m2, m7
3653
+ paddd m3, m7
3654
+ psrad m0, INTERP_SHIFT_PS
3655
+ psrad m1, INTERP_SHIFT_PS
3656
+ psrad m2, INTERP_SHIFT_PS
3657
+ psrad m3, INTERP_SHIFT_PS
3658
+
3659
+ packssdw m0, m1
3660
+ packssdw m2, m3
3661
+ movu [r2], m0
3662
+ movu [r2 + r3], m2
3663
+%endmacro
3664
+
3665
+;-----------------------------------------------------------------------------------------------------------------
3666
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3667
+;-----------------------------------------------------------------------------------------------------------------
3668
+%macro FILTER_VER_PS_CHROMA_32xN_AVX512 1
3669
+INIT_ZMM avx512
3670
+cglobal interp_4tap_vert_ps_32x%1, 5, 7, 11
3671
+ add r1d, r1d
3672
+ add r3d, r3d
3673
+ sub r0, r1
3674
+ shl r4d, 7
3675
+
3676
+%ifdef PIC
3677
+ lea r5, [tab_ChromaCoeffV_avx512]
3678
+ lea r5, [r5 + r4]
3679
+%else
3680
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
3681
+%endif
3682
+ vbroadcasti32x4 m7, [INTERP_OFFSET_PS]
3683
+ mova m9, [r5]
3684
+ mova m10, [r5 + mmsize]
3685
+%rep %1/2 - 1
3686
+ PROCESS_CHROMA_VERT_PS_32x2_AVX512
3687
+ lea r2, [r2 + 2 * r3]
3688
+%endrep
3689
+ PROCESS_CHROMA_VERT_PS_32x2_AVX512
3690
+ RET
3691
+%endmacro
3692
+
3693
+%if ARCH_X86_64
3694
+FILTER_VER_PS_CHROMA_32xN_AVX512 8
3695
+FILTER_VER_PS_CHROMA_32xN_AVX512 16
3696
+FILTER_VER_PS_CHROMA_32xN_AVX512 24
3697
+FILTER_VER_PS_CHROMA_32xN_AVX512 32
3698
+FILTER_VER_PS_CHROMA_32xN_AVX512 48
3699
+FILTER_VER_PS_CHROMA_32xN_AVX512 64
3700
+%endif
3701
+
3702
+%macro PROCESS_CHROMA_VERT_PS_48x4_AVX512 0
3703
+ movu m1, [r0]
3704
+ lea r6, [r0 + 2 * r1]
3705
+ movu m10, [r6]
3706
+ movu m3, [r0 + r1]
3707
+ movu m12, [r6 + r1]
3708
+ punpcklwd m0, m1, m3
3709
+ punpcklwd m9, m10, m12
3710
+ pmaddwd m0, m16
3711
+ pmaddwd m9, m16
3712
+ punpckhwd m1, m3
3713
+ punpckhwd m10, m12
3714
+ pmaddwd m1, m16
3715
+ pmaddwd m10, m16
3716
+
3717
+ movu m4, [r0 + 2 * r1]
3718
+ movu m13, [r6 + 2 * r1]
3719
+ punpcklwd m2, m3, m4
3720
+ punpcklwd m11, m12, m13
3721
+ pmaddwd m2, m16
3722
+ pmaddwd m11, m16
3723
+ punpckhwd m3, m4
3724
+ punpckhwd m12, m13
3725
+ pmaddwd m3, m16
3726
+ pmaddwd m12, m16
3727
+
3728
+ movu m5, [r0 + r7]
3729
+ movu m14, [r6 + r7]
3730
+ punpcklwd m6, m4, m5
3731
+ punpcklwd m15, m13, m14
3732
+ pmaddwd m6, m17
3733
+ pmaddwd m15, m17
3734
+ paddd m0, m6
3735
+ paddd m9, m15
3736
+ punpckhwd m4, m5
3737
+ punpckhwd m13, m14
3738
+ pmaddwd m4, m17
3739
+ pmaddwd m13, m17
3740
+ paddd m1, m4
3741
+ paddd m10, m13
3742
+
3743
+ movu m4, [r0 + 4 * r1]
3744
+ movu m13, [r6 + 4 * r1]
3745
+ punpcklwd m6, m5, m4
3746
+ punpcklwd m15, m14, m13
3747
+ pmaddwd m6, m17
3748
+ pmaddwd m15, m17
3749
+ paddd m2, m6
3750
+ paddd m11, m15
3751
+ punpckhwd m5, m4
3752
+ punpckhwd m14, m13
3753
+ pmaddwd m5, m17
3754
+ pmaddwd m14, m17
3755
+ paddd m3, m5
3756
+ paddd m12, m14
3757
+
3758
+ paddd m0, m7
3759
+ paddd m1, m7
3760
+ paddd m2, m7
3761
+ paddd m3, m7
3762
+ paddd m9, m7
3763
+ paddd m10, m7
3764
+ paddd m11, m7
3765
+ paddd m12, m7
3766
+
3767
+ psrad m0, INTERP_SHIFT_PS
3768
+ psrad m1, INTERP_SHIFT_PS
3769
+ psrad m2, INTERP_SHIFT_PS
3770
+ psrad m3, INTERP_SHIFT_PS
3771
+ psrad m9, INTERP_SHIFT_PS
3772
+ psrad m10, INTERP_SHIFT_PS
3773
+ psrad m11, INTERP_SHIFT_PS
3774
+ psrad m12, INTERP_SHIFT_PS
3775
+
3776
+ packssdw m0, m1
3777
+ packssdw m2, m3
3778
+ packssdw m9, m10
3779
+ packssdw m11, m12
3780
+ movu [r2], m0
3781
+ movu [r2 + r3], m2
3782
+ movu [r2 + 2 * r3], m9
3783
+ movu [r2 + r8], m11
3784
+
3785
+ movu ym1, [r0 + mmsize]
3786
+ vinserti32x8 m1, [r6 + mmsize], 1
3787
+ movu ym3, [r0 + r1 + mmsize]
3788
+ vinserti32x8 m3, [r6 + r1 + mmsize], 1
3789
+ punpcklwd m0, m1, m3
3790
+ pmaddwd m0, m16
3791
+ punpckhwd m1, m3
3792
+ pmaddwd m1, m16
3793
+
3794
+ movu ym4, [r0 + 2 * r1 + mmsize]
3795
+ vinserti32x8 m4, [r6 + 2 * r1 + mmsize], 1
3796
+ punpcklwd m2, m3, m4
3797
+ pmaddwd m2, m16
3798
+ punpckhwd m3, m4
3799
+ pmaddwd m3, m16
3800
+
3801
+ movu ym5, [r0 + r7 + mmsize]
3802
+ vinserti32x8 m5, [r6 + r7 + mmsize], 1
3803
+ punpcklwd m6, m4, m5
3804
+ pmaddwd m6, m17
3805
+ paddd m0, m6
3806
+ punpckhwd m4, m5
3807
+ pmaddwd m4, m17
3808
+ paddd m1, m4
3809
+
3810
+ movu ym4, [r0 + 4 * r1 + mmsize]
3811
+ vinserti32x8 m4, [r6 + 4 * r1 + mmsize], 1
3812
+ punpcklwd m6, m5, m4
3813
+ pmaddwd m6, m17
3814
+ paddd m2, m6
3815
+ punpckhwd m5, m4
3816
+ pmaddwd m5, m17
3817
+ paddd m3, m5
3818
+
3819
+ paddd m0, m7
3820
+ paddd m1, m7
3821
+ paddd m2, m7
3822
+ paddd m3, m7
3823
+
3824
+ psrad m0, INTERP_SHIFT_PS
3825
+ psrad m1, INTERP_SHIFT_PS
3826
+ psrad m2, INTERP_SHIFT_PS
3827
+ psrad m3, INTERP_SHIFT_PS
3828
+
3829
+ packssdw m0, m1
3830
+ packssdw m2, m3
3831
+ movu [r2 + mmsize], ym0
3832
+ movu [r2 + r3 + mmsize], ym2
3833
+ vextracti32x8 [r2 + 2 * r3 + mmsize], m0, 1
3834
+ vextracti32x8 [r2 + r8 + mmsize], m2, 1
3835
+%endmacro
3836
+
3837
+%if ARCH_X86_64
3838
+INIT_ZMM avx512
3839
+cglobal interp_4tap_vert_ps_48x64, 5, 9, 18
3840
+ add r1d, r1d
3841
+ add r3d, r3d
3842
+ sub r0, r1
3843
+ shl r4d, 7
3844
+%ifdef PIC
3845
+ lea r5, [tab_ChromaCoeffV_avx512]
3846
+ lea r5, [r5 + r4]
3847
+%else
3848
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
3849
+%endif
3850
+ lea r7, [3 * r1]
3851
+ lea r8, [3 * r3]
3852
+ vbroadcasti32x4 m7, [INTERP_OFFSET_PS]
3853
+ mova m16, [r5]
3854
+ mova m17, [r5 + mmsize]
3855
+%rep 15
3856
+ PROCESS_CHROMA_VERT_PS_48x4_AVX512
3857
+ lea r0, [r0 + 4 * r1]
3858
+ lea r2, [r2 + 4 * r3]
3859
+%endrep
3860
+ PROCESS_CHROMA_VERT_PS_48x4_AVX512
3861
+ RET
3862
+%endif
3863
+
3864
+%macro PROCESS_CHROMA_VERT_PS_64x2_AVX512 0
3865
+ movu m1, [r0]
3866
+ movu m3, [r0 + r1]
3867
+ punpcklwd m0, m1, m3
3868
+ pmaddwd m0, m15
3869
+ punpckhwd m1, m3
3870
+ pmaddwd m1, m15
3871
+
3872
+ movu m9, [r0 + mmsize]
3873
+ movu m11, [r0 + r1 + mmsize]
3874
+ punpcklwd m8, m9, m11
3875
+ pmaddwd m8, m15
3876
+ punpckhwd m9, m11
3877
+ pmaddwd m9, m15
3878
+
3879
+ movu m4, [r0 + 2 * r1]
3880
+ punpcklwd m2, m3, m4
3881
+ pmaddwd m2, m15
3882
+ punpckhwd m3, m4
3883
+ pmaddwd m3, m15
3884
+
3885
+ movu m12, [r0 + 2 * r1 + mmsize]
3886
+ punpcklwd m10, m11, m12
3887
+ pmaddwd m10, m15
3888
+ punpckhwd m11, m12
3889
+ pmaddwd m11, m15
3890
+
3891
+ lea r0, [r0 + 2 * r1]
3892
+ movu m5, [r0 + r1]
3893
+ punpcklwd m6, m4, m5
3894
+ pmaddwd m6, m16
3895
+ paddd m0, m6
3896
+ punpckhwd m4, m5
3897
+ pmaddwd m4, m16
3898
+ paddd m1, m4
3899
+
3900
+ movu m13, [r0 + r1 + mmsize]
3901
+ punpcklwd m14, m12, m13
3902
+ pmaddwd m14, m16
3903
+ paddd m8, m14
3904
+ punpckhwd m12, m13
3905
+ pmaddwd m12, m16
3906
+ paddd m9, m12
3907
+
3908
+ movu m4, [r0 + 2 * r1]
3909
+ punpcklwd m6, m5, m4
3910
+ pmaddwd m6, m16
3911
+ paddd m2, m6
3912
+ punpckhwd m5, m4
3913
+ pmaddwd m5, m16
3914
+ paddd m3, m5
3915
+
3916
+ movu m12, [r0 + 2 * r1 + mmsize]
3917
+ punpcklwd m14, m13, m12
3918
+ pmaddwd m14, m16
3919
+ paddd m10, m14
3920
+ punpckhwd m13, m12
3921
+ pmaddwd m13, m16
3922
+ paddd m11, m13
3923
+
3924
+ paddd m0, m7
3925
+ paddd m1, m7
3926
+ paddd m2, m7
3927
+ paddd m3, m7
3928
+ paddd m8, m7
3929
+ paddd m9, m7
3930
+ paddd m10, m7
3931
+ paddd m11, m7
3932
+
3933
+ psrad m0, INTERP_SHIFT_PS
3934
+ psrad m1, INTERP_SHIFT_PS
3935
+ psrad m2, INTERP_SHIFT_PS
3936
+ psrad m3, INTERP_SHIFT_PS
3937
+ psrad m8, INTERP_SHIFT_PS
3938
+ psrad m9, INTERP_SHIFT_PS
3939
+ psrad m10, INTERP_SHIFT_PS
3940
+ psrad m11, INTERP_SHIFT_PS
3941
+
3942
+ packssdw m0, m1
3943
+ packssdw m2, m3
3944
+ packssdw m8, m9
3945
+ packssdw m10, m11
3946
+ movu [r2], m0
3947
+ movu [r2 + r3], m2
3948
+ movu [r2 + mmsize], m8
3949
+ movu [r2 + r3 + mmsize], m10
3950
+%endmacro
3951
+
3952
+;-----------------------------------------------------------------------------------------------------------------
3953
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3954
+;-----------------------------------------------------------------------------------------------------------------
3955
+%macro FILTER_VER_PS_CHROMA_64xN_AVX512 1
3956
+INIT_ZMM avx512
3957
+cglobal interp_4tap_vert_ps_64x%1, 5, 7, 17
3958
+ add r1d, r1d
3959
+ add r3d, r3d
3960
+ sub r0, r1
3961
+ shl r4d, 7
3962
+
3963
+%ifdef PIC
3964
+ lea r5, [tab_ChromaCoeffV_avx512]
3965
+ lea r5, [r5 + r4]
3966
+%else
3967
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
3968
+%endif
3969
+ vbroadcasti32x4 m7, [INTERP_OFFSET_PS]
3970
+ mova m15, [r5]
3971
+ mova m16, [r5 + mmsize]
3972
+
3973
+%rep %1/2 - 1
3974
+ PROCESS_CHROMA_VERT_PS_64x2_AVX512
3975
+ lea r2, [r2 + 2 * r3]
3976
+%endrep
3977
+ PROCESS_CHROMA_VERT_PS_64x2_AVX512
3978
+ RET
3979
+%endmacro
3980
+
3981
+%if ARCH_X86_64
3982
+FILTER_VER_PS_CHROMA_64xN_AVX512 16
3983
+FILTER_VER_PS_CHROMA_64xN_AVX512 32
3984
+FILTER_VER_PS_CHROMA_64xN_AVX512 48
3985
+FILTER_VER_PS_CHROMA_64xN_AVX512 64
3986
+%endif
3987
+;-------------------------------------------------------------------------------------------------------------
3988
+; avx512 chroma_vps code end
3989
+;-------------------------------------------------------------------------------------------------------------
3990
+;-------------------------------------------------------------------------------------------------------------
3991
+; avx512 chroma_vsp and chroma_vss code start
3992
+;-------------------------------------------------------------------------------------------------------------
3993
+%macro PROCESS_CHROMA_VERT_S_8x8_AVX512 1
3994
+ movu xm1, [r0]
3995
+ lea r6, [r0 + 2 * r1]
3996
+ lea r8, [r0 + 4 * r1]
3997
+ lea r9, [r8 + 2 * r1]
3998
+ vinserti32x4 m1, [r6], 1
3999
+ vinserti32x4 m1, [r8], 2
4000
+ vinserti32x4 m1, [r9], 3
4001
+ movu xm3, [r0 + r1]
4002
+ vinserti32x4 m3, [r6 + r1], 1
4003
+ vinserti32x4 m3, [r8 + r1], 2
4004
+ vinserti32x4 m3, [r9 + r1], 3
4005
+ punpcklwd m0, m1, m3
4006
+ pmaddwd m0, m8
4007
+ punpckhwd m1, m3
4008
+ pmaddwd m1, m8
4009
+
4010
+ movu xm4, [r0 + 2 * r1]
4011
+ vinserti32x4 m4, [r6 + 2 * r1], 1
4012
+ vinserti32x4 m4, [r8 + 2 * r1], 2
4013
+ vinserti32x4 m4, [r9 + 2 * r1], 3
4014
+ punpcklwd m2, m3, m4
4015
+ pmaddwd m2, m8
4016
+ punpckhwd m3, m4
4017
+ pmaddwd m3, m8
4018
+
4019
+ movu xm5, [r0 + r10]
4020
+ vinserti32x4 m5, [r6 + r10], 1
4021
+ vinserti32x4 m5, [r8 + r10], 2
4022
+ vinserti32x4 m5, [r9 + r10], 3
4023
+ punpcklwd m6, m4, m5
4024
+ pmaddwd m6, m9
4025
+ paddd m0, m6
4026
+ punpckhwd m4, m5
4027
+ pmaddwd m4, m9
4028
+ paddd m1, m4
4029
+
4030
+ movu xm4, [r0 + 4 * r1]
4031
+ vinserti32x4 m4, [r6 + 4 * r1], 1
4032
+ vinserti32x4 m4, [r8 + 4 * r1], 2
4033
+ vinserti32x4 m4, [r9 + 4 * r1], 3
4034
+ punpcklwd m6, m5, m4
4035
+ pmaddwd m6, m9
4036
+ paddd m2, m6
4037
+ punpckhwd m5, m4
4038
+ pmaddwd m5, m9
4039
+ paddd m3, m5
4040
+
4041
+%ifidn %1,sp
4042
+ paddd m0, m7
4043
+ paddd m1, m7
4044
+ paddd m2, m7
4045
+ paddd m3, m7
4046
+
4047
+ psrad m0, INTERP_SHIFT_SP
4048
+ psrad m1, INTERP_SHIFT_SP
4049
+ psrad m2, INTERP_SHIFT_SP
4050
+ psrad m3, INTERP_SHIFT_SP
4051
+
4052
+ packssdw m0, m1
4053
+ packssdw m2, m3
4054
+ CLIPW2 m0, m2, m10, m11
4055
+%else
4056
+ psrad m0, 6
4057
+ psrad m1, 6
4058
+ psrad m2, 6
4059
+ psrad m3, 6
4060
+ packssdw m0, m1
4061
+ packssdw m2, m3
4062
+%endif
4063
+
4064
+ movu [r2], xm0
4065
+ movu [r2 + r3], xm2
4066
+ vextracti32x4 [r2 + 2 * r3], m0, 1
4067
+ vextracti32x4 [r2 + r7], m2, 1
4068
+ lea r2, [r2 + 4 * r3]
4069
+ vextracti32x4 [r2], m0, 2
4070
+ vextracti32x4 [r2 + r3], m2, 2
4071
+ vextracti32x4 [r2 + 2 * r3], m0, 3
4072
+ vextracti32x4 [r2 + r7], m2, 3
4073
+%endmacro
4074
+
4075
+;-----------------------------------------------------------------------------------------------------------------
4076
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
4077
+;-----------------------------------------------------------------------------------------------------------------
4078
+%macro CHROMA_VERT_S_8x8_AVX512 1
4079
+INIT_ZMM avx512
4080
+cglobal interp_4tap_vert_%1_8x8, 5, 11, 12
4081
+ add r1d, r1d
4082
+ add r3d, r3d
4083
+ sub r0, r1
4084
+ shl r4d, 7
4085
+%ifdef PIC
4086
+ lea r5, [tab_ChromaCoeffV_avx512]
4087
+ mova m8, [r5 + r4]
4088
+ mova m9, [r5 + r4 + mmsize]
4089
+%else
4090
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
4091
+ mova m8, [r5]
4092
+ mova m9, [r5 + mmsize]
4093
+%endif
4094
+%ifidn %1, sp
4095
+ vbroadcasti32x4 m7, [INTERP_OFFSET_SP]
4096
+ pxor m10, m10
4097
+ vbroadcasti32x8 m11, [pw_pixel_max]
4098
+%endif
4099
+ lea r10, [3 * r1]
4100
+ lea r7, [3 * r3]
4101
+
4102
+ PROCESS_CHROMA_VERT_S_8x8_AVX512 %1
4103
+ RET
4104
+%endmacro
4105
+
4106
+%if ARCH_X86_64
4107
+ CHROMA_VERT_S_8x8_AVX512 ss
4108
+ CHROMA_VERT_S_8x8_AVX512 sp
4109
+%endif
4110
+%macro FILTER_VER_S_CHROMA_8xN_AVX512 2
4111
+INIT_ZMM avx512
4112
+cglobal interp_4tap_vert_%1_8x%2, 5, 11, 10
4113
+ add r1d, r1d
4114
+ add r3d, r3d
4115
+ sub r0, r1
4116
+ shl r4d, 7
4117
+%ifdef PIC
4118
+ lea r5, [tab_ChromaCoeffV_avx512]
4119
+ mova m8, [r5 + r4]
4120
+ mova m9, [r5 + r4 + mmsize]
4121
+%else
4122
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
4123
+ mova m8, [r5]
4124
+ mova m9, [r5 + mmsize]
4125
+%endif
4126
+
4127
+%ifidn %1, sp
4128
+ vbroadcasti32x4 m7, [INTERP_OFFSET_SP]
4129
+ pxor m10, m10
4130
+ vbroadcasti32x8 m11, [pw_pixel_max]
4131
+%endif
4132
+ lea r10, [3 * r1]
4133
+ lea r7, [3 * r3]
4134
+
4135
+%rep %2/8 - 1
4136
+ PROCESS_CHROMA_VERT_S_8x8_AVX512 %1
4137
+ lea r0, [r8 + 4 * r1]
4138
+ lea r2, [r2 + 4 * r3]
4139
+%endrep
4140
+ PROCESS_CHROMA_VERT_S_8x8_AVX512 %1
4141
+ RET
4142
+%endmacro
4143
+%if ARCH_X86_64
4144
+ FILTER_VER_S_CHROMA_8xN_AVX512 ss, 16
4145
+ FILTER_VER_S_CHROMA_8xN_AVX512 ss, 32
4146
+ FILTER_VER_S_CHROMA_8xN_AVX512 ss, 64
4147
+ FILTER_VER_S_CHROMA_8xN_AVX512 sp, 16
4148
+ FILTER_VER_S_CHROMA_8xN_AVX512 sp, 32
4149
+ FILTER_VER_S_CHROMA_8xN_AVX512 sp, 64
4150
+%endif
4151
+%macro PROCESS_CHROMA_VERT_S_16x4_AVX512 1
4152
+ movu ym1, [r0]
4153
+ lea r6, [r0 + 2 * r1]
4154
+ vinserti32x8 m1, [r6], 1
4155
+ movu ym3, [r0 + r1]
4156
+ vinserti32x8 m3, [r6 + r1], 1
4157
+ punpcklwd m0, m1, m3
4158
+ pmaddwd m0, m8
4159
+ punpckhwd m1, m3
4160
+ pmaddwd m1, m8
4161
+
4162
+ movu ym4, [r0 + 2 * r1]
4163
+ vinserti32x8 m4, [r6 + 2 * r1], 1
4164
+ punpcklwd m2, m3, m4
4165
+ pmaddwd m2, m8
4166
+ punpckhwd m3, m4
4167
+ pmaddwd m3, m8
4168
+
4169
+ movu ym5, [r0 + r8]
4170
+ vinserti32x8 m5, [r6 + r8], 1
4171
+ punpcklwd m6, m4, m5
4172
+ pmaddwd m6, m9
4173
+ paddd m0, m6
4174
+ punpckhwd m4, m5
4175
+ pmaddwd m4, m9
4176
+ paddd m1, m4
4177
+
4178
+ movu ym4, [r0 + 4 * r1]
4179
+ vinserti32x8 m4, [r6 + 4 * r1], 1
4180
+ punpcklwd m6, m5, m4
4181
+ pmaddwd m6, m9
4182
+ paddd m2, m6
4183
+ punpckhwd m5, m4
4184
+ pmaddwd m5, m9
4185
+ paddd m3, m5
4186
+
4187
+%ifidn %1,sp
4188
+ paddd m0, m7
4189
+ paddd m1, m7
4190
+ paddd m2, m7
4191
+ paddd m3, m7
4192
+
4193
+ psrad m0, INTERP_SHIFT_SP
4194
+ psrad m1, INTERP_SHIFT_SP
4195
+ psrad m2, INTERP_SHIFT_SP
4196
+ psrad m3, INTERP_SHIFT_SP
4197
+
4198
+ packssdw m0, m1
4199
+ packssdw m2, m3
4200
+ CLIPW2 m0, m2, m10, m11
4201
+%else
4202
+ psrad m0, 6
4203
+ psrad m1, 6
4204
+ psrad m2, 6
4205
+ psrad m3, 6
4206
+ packssdw m0, m1
4207
+ packssdw m2, m3
4208
+%endif
4209
+
4210
+ movu [r2], ym0
4211
+ movu [r2 + r3], ym2
4212
+ vextracti32x8 [r2 + 2 * r3], m0, 1
4213
+ vextracti32x8 [r2 + r7], m2, 1
4214
+%endmacro
4215
+
4216
+;-----------------------------------------------------------------------------------------------------------------
4217
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
4218
+;-----------------------------------------------------------------------------------------------------------------
4219
+%macro CHROMA_VERT_S_16x4_AVX512 1
4220
+INIT_ZMM avx512
4221
+cglobal interp_4tap_vert_%1_16x4, 5, 9, 12
4222
+ add r1d, r1d
4223
+ add r3d, r3d
4224
+ sub r0, r1
4225
+ shl r4d, 7
4226
+%ifdef PIC
4227
+ lea r5, [tab_ChromaCoeffV_avx512]
4228
+ mova m8, [r5 + r4]
4229
+ mova m9, [r5 + r4 + mmsize]
4230
+%else
4231
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
4232
+ mova m8, [r5]
4233
+ mova m9, [r5 + mmsize]
4234
+%endif
4235
+
4236
+%ifidn %1, sp
4237
+ vbroadcasti32x4 m7, [INTERP_OFFSET_SP]
4238
+ pxor m10, m10
4239
+ vbroadcasti32x8 m11, [pw_pixel_max]
4240
+%endif
4241
+ lea r7, [3 * r3]
4242
+ lea r8, [3 * r1]
4243
+ PROCESS_CHROMA_VERT_S_16x4_AVX512 %1
4244
+ RET
4245
+%endmacro
4246
+
4247
+%if ARCH_X86_64
4248
+ CHROMA_VERT_S_16x4_AVX512 ss
4249
+ CHROMA_VERT_S_16x4_AVX512 sp
4250
+%endif
4251
+%macro FILTER_VER_S_CHROMA_16xN_AVX512 2
4252
+INIT_ZMM avx512
4253
+cglobal interp_4tap_vert_%1_16x%2, 5, 9, 12
4254
+ add r1d, r1d
4255
+ add r3d, r3d
4256
+ sub r0, r1
4257
+ shl r4d, 7
4258
+%ifdef PIC
4259
+ lea r5, [tab_ChromaCoeffV_avx512]
4260
+ mova m8, [r5 + r4]
4261
+ mova m9, [r5 + r4 + mmsize]
4262
+%else
4263
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
4264
+ mova m8, [r5]
4265
+ mova m9, [r5 + mmsize]
4266
+%endif
4267
+
4268
+%ifidn %1, sp
4269
+ vbroadcasti32x4 m7, [INTERP_OFFSET_SP]
4270
+ pxor m10, m10
4271
+ vbroadcasti32x8 m11, [pw_pixel_max]
4272
+%endif
4273
+ lea r7, [3 * r3]
4274
+ lea r8, [3 * r1]
4275
+%rep %2/4 - 1
4276
+ PROCESS_CHROMA_VERT_S_16x4_AVX512 %1
4277
+ lea r0, [r0 + 4 * r1]
4278
+ lea r2, [r2 + 4 * r3]
4279
+%endrep
4280
+ PROCESS_CHROMA_VERT_S_16x4_AVX512 %1
4281
+ RET
4282
+%endmacro
4283
+
4284
+%if ARCH_X86_64
4285
+ FILTER_VER_S_CHROMA_16xN_AVX512 ss, 8
4286
+ FILTER_VER_S_CHROMA_16xN_AVX512 ss, 12
4287
+ FILTER_VER_S_CHROMA_16xN_AVX512 ss, 16
4288
+ FILTER_VER_S_CHROMA_16xN_AVX512 ss, 24
4289
+ FILTER_VER_S_CHROMA_16xN_AVX512 ss, 32
4290
+ FILTER_VER_S_CHROMA_16xN_AVX512 ss, 64
4291
+ FILTER_VER_S_CHROMA_16xN_AVX512 sp, 8
4292
+ FILTER_VER_S_CHROMA_16xN_AVX512 sp, 12
4293
+ FILTER_VER_S_CHROMA_16xN_AVX512 sp, 16
4294
+ FILTER_VER_S_CHROMA_16xN_AVX512 sp, 24
4295
+ FILTER_VER_S_CHROMA_16xN_AVX512 sp, 32
4296
+ FILTER_VER_S_CHROMA_16xN_AVX512 sp, 64
4297
+%endif
4298
+
4299
+%macro PROCESS_CHROMA_VERT_S_24x8_AVX512 1
4300
+ movu ym1, [r0]
4301
+ lea r6, [r0 + 2 * r1]
4302
+ lea r8, [r0 + 4 * r1]
4303
+ lea r9, [r8 + 2 * r1]
4304
+
4305
+ movu ym10, [r8]
4306
+ movu ym3, [r0 + r1]
4307
+ movu ym12, [r8 + r1]
4308
+ vinserti32x8 m1, [r6], 1
4309
+ vinserti32x8 m10, [r9], 1
4310
+ vinserti32x8 m3, [r6 + r1], 1
4311
+ vinserti32x8 m12, [r9 + r1], 1
4312
+
4313
+ punpcklwd m0, m1, m3
4314
+ punpcklwd m9, m10, m12
4315
+ pmaddwd m0, m16
4316
+ pmaddwd m9, m16
4317
+ punpckhwd m1, m3
4318
+ punpckhwd m10, m12
4319
+ pmaddwd m1, m16
4320
+ pmaddwd m10, m16
4321
+
4322
+ movu ym4, [r0 + 2 * r1]
4323
+ movu ym13, [r8 + 2 * r1]
4324
+ vinserti32x8 m4, [r6 + 2 * r1], 1
4325
+ vinserti32x8 m13, [r9 + 2 * r1], 1
4326
+ punpcklwd m2, m3, m4
4327
+ punpcklwd m11, m12, m13
4328
+ pmaddwd m2, m16
4329
+ pmaddwd m11, m16
4330
+ punpckhwd m3, m4
4331
+ punpckhwd m12, m13
4332
+ pmaddwd m3, m16
4333
+ pmaddwd m12, m16
4334
+
4335
+ movu ym5, [r0 + r10]
4336
+ vinserti32x8 m5, [r6 + r10], 1
4337
+ movu ym14, [r8 + r10]
4338
+ vinserti32x8 m14, [r9 + r10], 1
4339
+ punpcklwd m6, m4, m5
4340
+ punpcklwd m15, m13, m14
4341
+ pmaddwd m6, m17
4342
+ pmaddwd m15, m17
4343
+ paddd m0, m6
4344
+ paddd m9, m15
4345
+ punpckhwd m4, m5
4346
+ punpckhwd m13, m14
4347
+ pmaddwd m4, m17
4348
+ pmaddwd m13, m17
4349
+ paddd m1, m4
4350
+ paddd m10, m13
4351
+
4352
+ movu ym4, [r0 + 4 * r1]
4353
+ vinserti32x8 m4, [r6 + 4 * r1], 1
4354
+ movu ym13, [r8 + 4 * r1]
4355
+ vinserti32x8 m13, [r9 + 4 * r1], 1
4356
+ punpcklwd m6, m5, m4
4357
+ punpcklwd m15, m14, m13
4358
+ pmaddwd m6, m17
4359
+ pmaddwd m15, m17
4360
+ paddd m2, m6
4361
+ paddd m11, m15
4362
+ punpckhwd m5, m4
4363
+ punpckhwd m14, m13
4364
+ pmaddwd m5, m17
4365
+ pmaddwd m14, m17
4366
+ paddd m3, m5
4367
+ paddd m12, m14
4368
+
4369
+%ifidn %1,sp
4370
+ paddd m0, m7
4371
+ paddd m1, m7
4372
+ paddd m2, m7
4373
+ paddd m3, m7
4374
+ paddd m9, m7
4375
+ paddd m10, m7
4376
+ paddd m11, m7
4377
+ paddd m12, m7
4378
+
4379
+ psrad m0, INTERP_SHIFT_SP
4380
+ psrad m1, INTERP_SHIFT_SP
4381
+ psrad m2, INTERP_SHIFT_SP
4382
+ psrad m3, INTERP_SHIFT_SP
4383
+ psrad m9, INTERP_SHIFT_SP
4384
+ psrad m10, INTERP_SHIFT_SP
4385
+ psrad m11, INTERP_SHIFT_SP
4386
+ psrad m12, INTERP_SHIFT_SP
4387
+
4388
+ packssdw m0, m1
4389
+ packssdw m2, m3
4390
+ packssdw m9, m10
4391
+ packssdw m11, m12
4392
+ CLIPW2 m0, m2, m18, m19
4393
+ CLIPW2 m9, m11, m18, m19
4394
+%else
4395
+ psrad m0, 6
4396
+ psrad m1, 6
4397
+ psrad m2, 6
4398
+ psrad m3, 6
4399
+ psrad m9, 6
4400
+ psrad m10, 6
4401
+ psrad m11, 6
4402
+ psrad m12, 6
4403
+
4404
+ packssdw m0, m1
4405
+ packssdw m2, m3
4406
+ packssdw m9, m10
4407
+ packssdw m11, m12
4408
+%endif
4409
+
4410
+ movu [r2], ym0
4411
+ movu [r2 + r3], ym2
4412
+ vextracti32x8 [r2 + 2 * r3], m0, 1
4413
+ vextracti32x8 [r2 + r7], m2, 1
4414
+ lea r11, [r2 + 4 * r3]
4415
+ movu [r11], ym9
4416
+ movu [r11 + r3], ym11
4417
+ vextracti32x8 [r11 + 2 * r3], m9, 1
4418
+ vextracti32x8 [r11 + r7], m11, 1
4419
+
4420
+ movu xm1, [r0 + mmsize/2]
4421
+ vinserti32x4 m1, [r6 + mmsize/2], 1
4422
+ vinserti32x4 m1, [r8 + mmsize/2], 2
4423
+ vinserti32x4 m1, [r9 + mmsize/2], 3
4424
+ movu xm3, [r0 + r1 + mmsize/2]
4425
+ vinserti32x4 m3, [r6 + r1 + mmsize/2], 1
4426
+ vinserti32x4 m3, [r8 + r1 + mmsize/2], 2
4427
+ vinserti32x4 m3, [r9 + r1 + mmsize/2], 3
4428
+ punpcklwd m0, m1, m3
4429
+ pmaddwd m0, m16
4430
+ punpckhwd m1, m3
4431
+ pmaddwd m1, m16
4432
+
4433
+ movu xm4, [r0 + 2 * r1 + mmsize/2]
4434
+ vinserti32x4 m4, [r6 + 2 * r1 + mmsize/2], 1
4435
+ vinserti32x4 m4, [r8 + 2 * r1 + mmsize/2], 2
4436
+ vinserti32x4 m4, [r9 + 2 * r1 + mmsize/2], 3
4437
+ punpcklwd m2, m3, m4
4438
+ pmaddwd m2, m16
4439
+ punpckhwd m3, m4
4440
+ pmaddwd m3, m16
4441
+
4442
+ movu xm5, [r0 + r10 + mmsize/2]
4443
+ vinserti32x4 m5, [r6 + r10 + mmsize/2], 1
4444
+ vinserti32x4 m5, [r8 + r10 + mmsize/2], 2
4445
+ vinserti32x4 m5, [r9 + r10 + mmsize/2], 3
4446
+ punpcklwd m6, m4, m5
4447
+ pmaddwd m6, m17
4448
+ paddd m0, m6
4449
+ punpckhwd m4, m5
4450
+ pmaddwd m4, m17
4451
+ paddd m1, m4
4452
+
4453
+ movu xm4, [r0 + 4 * r1 + mmsize/2]
4454
+ vinserti32x4 m4, [r6 + 4 * r1 + mmsize/2], 1
4455
+ vinserti32x4 m4, [r8 + 4 * r1 + mmsize/2], 2
4456
+ vinserti32x4 m4, [r9 + 4 * r1 + mmsize/2], 3
4457
+ punpcklwd m6, m5, m4
4458
+ pmaddwd m6, m17
4459
+ paddd m2, m6
4460
+ punpckhwd m5, m4
4461
+ pmaddwd m5, m17
4462
+ paddd m3, m5
4463
+
4464
+%ifidn %1,sp
4465
+ paddd m0, m7
4466
+ paddd m1, m7
4467
+ paddd m2, m7
4468
+ paddd m3, m7
4469
+
4470
+ psrad m0, INTERP_SHIFT_SP
4471
+ psrad m1, INTERP_SHIFT_SP
4472
+ psrad m2, INTERP_SHIFT_SP
4473
+ psrad m3, INTERP_SHIFT_SP
4474
+
4475
+ packssdw m0, m1
4476
+ packssdw m2, m3
4477
+ CLIPW2 m0, m2, m18, m19
4478
+%else
4479
+ psrad m0, 6
4480
+ psrad m1, 6
4481
+ psrad m2, 6
4482
+ psrad m3, 6
4483
+
4484
+ packssdw m0, m1
4485
+ packssdw m2, m3
4486
+%endif
4487
+
4488
+ movu [r2 + mmsize/2], xm0
4489
+ movu [r2 + r3 + mmsize/2], xm2
4490
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m0, 1
4491
+ vextracti32x4 [r2 + r7 + mmsize/2], m2, 1
4492
+ lea r2, [r2 + 4 * r3]
4493
+ vextracti32x4 [r2 + mmsize/2], m0, 2
4494
+ vextracti32x4 [r2 + r3 + mmsize/2], m2, 2
4495
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m0, 3
4496
+ vextracti32x4 [r2 + r7 + mmsize/2], m2, 3
4497
+%endmacro
4498
+%macro FILTER_VER_S_CHROMA_24xN_AVX512 2
4499
+INIT_ZMM avx512
4500
+cglobal interp_4tap_vert_%1_24x%2, 5, 12, 20
4501
+ add r1d, r1d
4502
+ add r3d, r3d
4503
+ sub r0, r1
4504
+ shl r4d, 7
4505
+%ifdef PIC
4506
+ lea r5, [tab_ChromaCoeffV_avx512]
4507
+ mova m16, [r5 + r4]
4508
+ mova m17, [r5 + r4 + mmsize]
4509
+%else
4510
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
4511
+ mova m16, [r5]
4512
+ mova m17, [r5 + mmsize]
4513
+%endif
4514
+%ifidn %1, sp
4515
+ vbroadcasti32x4 m7, [INTERP_OFFSET_SP]
4516
+ pxor m18, m18
4517
+ vbroadcasti32x8 m19, [pw_pixel_max]
4518
+%endif
4519
+ lea r10, [3 * r1]
4520
+ lea r7, [3 * r3]
4521
+%rep %2/8 - 1
4522
+ PROCESS_CHROMA_VERT_S_24x8_AVX512 %1
4523
+ lea r0, [r8 + 4 * r1]
4524
+ lea r2, [r2 + 4 * r3]
4525
+%endrep
4526
+ PROCESS_CHROMA_VERT_S_24x8_AVX512 %1
4527
+ RET
4528
+%endmacro
4529
+%if ARCH_X86_64
4530
+ FILTER_VER_S_CHROMA_24xN_AVX512 ss, 32
4531
+ FILTER_VER_S_CHROMA_24xN_AVX512 ss, 64
4532
+ FILTER_VER_S_CHROMA_24xN_AVX512 sp, 32
4533
+ FILTER_VER_S_CHROMA_24xN_AVX512 sp, 64
4534
+%endif
4535
+
4536
+%macro PROCESS_CHROMA_VERT_S_32x4_AVX512 1
4537
+ movu m1, [r0]
4538
+ lea r6, [r0 + 2 * r1]
4539
+ movu m10, [r6]
4540
+ movu m3, [r0 + r1]
4541
+ movu m12, [r6 + r1]
4542
+ punpcklwd m0, m1, m3
4543
+ punpcklwd m9, m10, m12
4544
+ pmaddwd m0, m16
4545
+ pmaddwd m9, m16
4546
+ punpckhwd m1, m3
4547
+ punpckhwd m10, m12
4548
+ pmaddwd m1, m16
4549
+ pmaddwd m10, m16
4550
+ movu m4, [r0 + 2 * r1]
4551
+ movu m13, [r6 + 2 * r1]
4552
+ punpcklwd m2, m3, m4
4553
+ punpcklwd m11, m12, m13
4554
+ pmaddwd m2, m16
4555
+ pmaddwd m11, m16
4556
+ punpckhwd m3, m4
4557
+ punpckhwd m12, m13
4558
+ pmaddwd m3, m16
4559
+ pmaddwd m12, m16
4560
+
4561
+ movu m5, [r0 + r7]
4562
+ movu m14, [r6 + r7]
4563
+ punpcklwd m6, m4, m5
4564
+ punpcklwd m15, m13, m14
4565
+ pmaddwd m6, m17
4566
+ pmaddwd m15, m17
4567
+ paddd m0, m6
4568
+ paddd m9, m15
4569
+ punpckhwd m4, m5
4570
+ punpckhwd m13, m14
4571
+ pmaddwd m4, m17
4572
+ pmaddwd m13, m17
4573
+ paddd m1, m4
4574
+ paddd m10, m13
4575
+
4576
+ movu m4, [r0 + 4 * r1]
4577
+ movu m13, [r6 + 4 * r1]
4578
+ punpcklwd m6, m5, m4
4579
+ punpcklwd m15, m14, m13
4580
+ pmaddwd m6, m17
4581
+ pmaddwd m15, m17
4582
+ paddd m2, m6
4583
+ paddd m11, m15
4584
+ punpckhwd m5, m4
4585
+ punpckhwd m14, m13
4586
+ pmaddwd m5, m17
4587
+ pmaddwd m14, m17
4588
+ paddd m3, m5
4589
+ paddd m12, m14
4590
+%ifidn %1,sp
4591
+ paddd m0, m7
4592
+ paddd m1, m7
4593
+ paddd m2, m7
4594
+ paddd m3, m7
4595
+ paddd m9, m7
4596
+ paddd m10, m7
4597
+ paddd m11, m7
4598
+ paddd m12, m7
4599
+
4600
+ psrad m0, INTERP_SHIFT_SP
4601
+ psrad m1, INTERP_SHIFT_SP
4602
+ psrad m2, INTERP_SHIFT_SP
4603
+ psrad m3, INTERP_SHIFT_SP
4604
+ psrad m9, INTERP_SHIFT_SP
4605
+ psrad m10, INTERP_SHIFT_SP
4606
+ psrad m11, INTERP_SHIFT_SP
4607
+ psrad m12, INTERP_SHIFT_SP
4608
+
4609
+ packssdw m0, m1
4610
+ packssdw m2, m3
4611
+ packssdw m9, m10
4612
+ packssdw m11, m12
4613
+ CLIPW2 m0, m2, m18, m19
4614
+ CLIPW2 m9, m11, m18, m19
4615
+%else
4616
+ psrad m0, 6
4617
+ psrad m1, 6
4618
+ psrad m2, 6
4619
+ psrad m3, 6
4620
+ psrad m9, 6
4621
+ psrad m10, 6
4622
+ psrad m11, 6
4623
+ psrad m12, 6
4624
+
4625
+ packssdw m0, m1
4626
+ packssdw m2, m3
4627
+ packssdw m9, m10
4628
+ packssdw m11, m12
4629
+%endif
4630
+
4631
+ movu [r2], m0
4632
+ movu [r2 + r3], m2
4633
+ movu [r2 + 2 * r3], m9
4634
+ movu [r2 + r8], m11
4635
+%endmacro
4636
+;-----------------------------------------------------------------------------------------------------------------
4637
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
4638
+;-----------------------------------------------------------------------------------------------------------------
4639
+%macro FILTER_VER_S_CHROMA_32xN_AVX512 2
4640
+INIT_ZMM avx512
4641
+cglobal interp_4tap_vert_%1_32x%2, 5, 9, 20
4642
+ add r1d, r1d
4643
+ add r3d, r3d
4644
+ sub r0, r1
4645
+ shl r4d, 7
4646
+%ifdef PIC
4647
+ lea r5, [tab_ChromaCoeffV_avx512]
4648
+ mova m16, [r5 + r4]
4649
+ mova m17, [r5 + r4 + mmsize]
4650
+%else
4651
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
4652
+ mova m16, [r5]
4653
+ mova m17, [r5 + mmsize]
4654
+%endif
4655
+ lea r7, [3 * r1]
4656
+ lea r8, [3 * r3]
4657
+%ifidn %1, sp
4658
+ vbroadcasti32x4 m7, [INTERP_OFFSET_SP]
4659
+ pxor m18, m18
4660
+ vbroadcasti32x8 m19, [pw_pixel_max]
4661
+%endif
4662
+
4663
+%rep %2/4 - 1
4664
+ PROCESS_CHROMA_VERT_S_32x4_AVX512 %1
4665
+ lea r0, [r0 + 4 * r1]
4666
+ lea r2, [r2 + 4 * r3]
4667
+%endrep
4668
+ PROCESS_CHROMA_VERT_S_32x4_AVX512 %1
4669
+ RET
4670
+%endmacro
4671
+%if ARCH_X86_64
4672
+ FILTER_VER_S_CHROMA_32xN_AVX512 ss, 8
4673
+ FILTER_VER_S_CHROMA_32xN_AVX512 ss, 16
4674
+ FILTER_VER_S_CHROMA_32xN_AVX512 ss, 24
4675
+ FILTER_VER_S_CHROMA_32xN_AVX512 ss, 32
4676
+ FILTER_VER_S_CHROMA_32xN_AVX512 ss, 48
4677
+ FILTER_VER_S_CHROMA_32xN_AVX512 ss, 64
4678
+ FILTER_VER_S_CHROMA_32xN_AVX512 sp, 8
4679
+ FILTER_VER_S_CHROMA_32xN_AVX512 sp, 16
4680
+ FILTER_VER_S_CHROMA_32xN_AVX512 sp, 24
4681
+ FILTER_VER_S_CHROMA_32xN_AVX512 sp, 32
4682
+ FILTER_VER_S_CHROMA_32xN_AVX512 sp, 48
4683
+ FILTER_VER_S_CHROMA_32xN_AVX512 sp, 64
4684
+%endif
4685
+%macro PROCESS_CHROMA_VERT_S_48x4_AVX512 1
4686
+ movu m1, [r0]
4687
+ lea r6, [r0 + 2 * r1]
4688
+ movu m10, [r6]
4689
+ movu m3, [r0 + r1]
4690
+ movu m12, [r6 + r1]
4691
+ punpcklwd m0, m1, m3
4692
+ punpcklwd m9, m10, m12
4693
+ pmaddwd m0, m16
4694
+ pmaddwd m9, m16
4695
+ punpckhwd m1, m3
4696
+ punpckhwd m10, m12
4697
+ pmaddwd m1, m16
4698
+ pmaddwd m10, m16
4699
+
4700
+ movu m4, [r0 + 2 * r1]
4701
+ movu m13, [r6 + 2 * r1]
4702
+ punpcklwd m2, m3, m4
4703
+ punpcklwd m11, m12, m13
4704
+ pmaddwd m2, m16
4705
+ pmaddwd m11, m16
4706
+ punpckhwd m3, m4
4707
+ punpckhwd m12, m13
4708
+ pmaddwd m3, m16
4709
+ pmaddwd m12, m16
4710
+
4711
+ movu m5, [r0 + r7]
4712
+ movu m14, [r6 + r7]
4713
+ punpcklwd m6, m4, m5
4714
+ punpcklwd m15, m13, m14
4715
+ pmaddwd m6, m17
4716
+ pmaddwd m15, m17
4717
+ paddd m0, m6
4718
+ paddd m9, m15
4719
+ punpckhwd m4, m5
4720
+ punpckhwd m13, m14
4721
+ pmaddwd m4, m17
4722
+ pmaddwd m13, m17
4723
+ paddd m1, m4
4724
+ paddd m10, m13
4725
+
4726
+ movu m4, [r0 + 4 * r1]
4727
+ movu m13, [r6 + 4 * r1]
4728
+ punpcklwd m6, m5, m4
4729
+ punpcklwd m15, m14, m13
4730
+ pmaddwd m6, m17
4731
+ pmaddwd m15, m17
4732
+ paddd m2, m6
4733
+ paddd m11, m15
4734
+ punpckhwd m5, m4
4735
+ punpckhwd m14, m13
4736
+ pmaddwd m5, m17
4737
+ pmaddwd m14, m17
4738
+ paddd m3, m5
4739
+ paddd m12, m14
4740
+
4741
+%ifidn %1,sp
4742
+ paddd m0, m7
4743
+ paddd m1, m7
4744
+ paddd m2, m7
4745
+ paddd m3, m7
4746
+ paddd m9, m7
4747
+ paddd m10, m7
4748
+ paddd m11, m7
4749
+ paddd m12, m7
4750
+
4751
+ psrad m0, INTERP_SHIFT_SP
4752
+ psrad m1, INTERP_SHIFT_SP
4753
+ psrad m2, INTERP_SHIFT_SP
4754
+ psrad m3, INTERP_SHIFT_SP
4755
+ psrad m9, INTERP_SHIFT_SP
4756
+ psrad m10, INTERP_SHIFT_SP
4757
+ psrad m11, INTERP_SHIFT_SP
4758
+ psrad m12, INTERP_SHIFT_SP
4759
+
4760
+ packssdw m0, m1
4761
+ packssdw m2, m3
4762
+ packssdw m9, m10
4763
+ packssdw m11, m12
4764
+ CLIPW2 m0, m2, m18, m19
4765
+ CLIPW2 m9, m11, m18, m19
4766
+%else
4767
+ psrad m0, 6
4768
+ psrad m1, 6
4769
+ psrad m2, 6
4770
+ psrad m3, 6
4771
+ psrad m9, 6
4772
+ psrad m10, 6
4773
+ psrad m11, 6
4774
+ psrad m12, 6
4775
+ packssdw m0, m1
4776
+ packssdw m2, m3
4777
+ packssdw m9, m10
4778
+ packssdw m11, m12
4779
+%endif
4780
+
4781
+ movu [r2], m0
4782
+ movu [r2 + r3], m2
4783
+ movu [r2 + 2 * r3], m9
4784
+ movu [r2 + r8], m11
4785
+
4786
+ movu ym1, [r0 + mmsize]
4787
+ vinserti32x8 m1, [r6 + mmsize], 1
4788
+ movu ym3, [r0 + r1 + mmsize]
4789
+ vinserti32x8 m3, [r6 + r1 + mmsize], 1
4790
+ punpcklwd m0, m1, m3
4791
+ pmaddwd m0, m16
4792
+ punpckhwd m1, m3
4793
+ pmaddwd m1, m16
4794
+
4795
+ movu ym4, [r0 + 2 * r1 + mmsize]
4796
+ vinserti32x8 m4, [r6 + 2 * r1 + mmsize], 1
4797
+ punpcklwd m2, m3, m4
4798
+ pmaddwd m2, m16
4799
+ punpckhwd m3, m4
4800
+ pmaddwd m3, m16
4801
+
4802
+ movu ym5, [r0 + r7 + mmsize]
4803
+ vinserti32x8 m5, [r6 + r7 + mmsize], 1
4804
+ punpcklwd m6, m4, m5
4805
+ pmaddwd m6, m17
4806
+ paddd m0, m6
4807
+ punpckhwd m4, m5
4808
+ pmaddwd m4, m17
4809
+ paddd m1, m4
4810
+
4811
+ movu ym4, [r0 + 4 * r1 + mmsize]
4812
+ vinserti32x8 m4, [r6 + 4 * r1 + mmsize], 1
4813
+ punpcklwd m6, m5, m4
4814
+ pmaddwd m6, m17
4815
+ paddd m2, m6
4816
+ punpckhwd m5, m4
4817
+ pmaddwd m5, m17
4818
+ paddd m3, m5
4819
+
4820
+%ifidn %1,sp
4821
+ paddd m0, m7
4822
+ paddd m1, m7
4823
+ paddd m2, m7
4824
+ paddd m3, m7
4825
+
4826
+ psrad m0, INTERP_SHIFT_SP
4827
+ psrad m1, INTERP_SHIFT_SP
4828
+ psrad m2, INTERP_SHIFT_SP
4829
+ psrad m3, INTERP_SHIFT_SP
4830
+ packssdw m0, m1
4831
+ packssdw m2, m3
4832
+ CLIPW2 m0, m2, m18, m19
4833
+%else
4834
+ psrad m0, 6
4835
+ psrad m1, 6
4836
+ psrad m2, 6
4837
+ psrad m3, 6
4838
+ packssdw m0, m1
4839
+ packssdw m2, m3
4840
+%endif
4841
+
4842
+ movu [r2 + mmsize], ym0
4843
+ movu [r2 + r3 + mmsize], ym2
4844
+ vextracti32x8 [r2 + 2 * r3 + mmsize], m0, 1
4845
+ vextracti32x8 [r2 + r8 + mmsize], m2, 1
4846
+%endmacro
4847
+%macro CHROMA_VERT_S_48x4_AVX512 1
4848
+INIT_ZMM avx512
4849
+cglobal interp_4tap_vert_%1_48x64, 5, 9, 20
4850
+ add r1d, r1d
4851
+ add r3d, r3d
4852
+ sub r0, r1
4853
+ shl r4d, 7
4854
+%ifdef PIC
4855
+ lea r5, [tab_ChromaCoeffV_avx512]
4856
+ mova m16, [r5 + r4]
4857
+ mova m17, [r5 + r4 + mmsize]
4858
+%else
4859
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
4860
+ mova m16, [r5]
4861
+ mova m17, [r5 + mmsize]
4862
+%endif
4863
+ lea r7, [3 * r1]
4864
+ lea r8, [3 * r3]
4865
+%ifidn %1, sp
4866
+ vbroadcasti32x4 m7, [INTERP_OFFSET_SP]
4867
+ pxor m18, m18
4868
+ vbroadcasti32x8 m19, [pw_pixel_max]
4869
+%endif
4870
+%rep 15
4871
+ PROCESS_CHROMA_VERT_S_48x4_AVX512 %1
4872
+ lea r0, [r0 + 4 * r1]
4873
+ lea r2, [r2 + 4 * r3]
4874
+%endrep
4875
+ PROCESS_CHROMA_VERT_S_48x4_AVX512 %1
4876
+ RET
4877
+%endmacro
4878
+
4879
+%if ARCH_X86_64
4880
+ CHROMA_VERT_S_48x4_AVX512 sp
4881
+ CHROMA_VERT_S_48x4_AVX512 ss
4882
+%endif
4883
+%macro PROCESS_CHROMA_VERT_S_64x2_AVX512 1
4884
+ movu m1, [r0]
4885
+ movu m3, [r0 + r1]
4886
+ punpcklwd m0, m1, m3
4887
+ pmaddwd m0, m15
4888
+ punpckhwd m1, m3
4889
+ pmaddwd m1, m15
4890
+
4891
+ movu m9, [r0 + mmsize]
4892
+ movu m11, [r0 + r1 + mmsize]
4893
+ punpcklwd m8, m9, m11
4894
+ pmaddwd m8, m15
4895
+ punpckhwd m9, m11
4896
+ pmaddwd m9, m15
4897
+ movu m4, [r0 + 2 * r1]
4898
+ punpcklwd m2, m3, m4
4899
+ pmaddwd m2, m15
4900
+ punpckhwd m3, m4
4901
+ pmaddwd m3, m15
4902
+ movu m12, [r0 + 2 * r1 + mmsize]
4903
+ punpcklwd m10, m11, m12
4904
+ pmaddwd m10, m15
4905
+ punpckhwd m11, m12
4906
+ pmaddwd m11, m15
4907
+
4908
+ lea r0, [r0 + 2 * r1]
4909
+ movu m5, [r0 + r1]
4910
+ punpcklwd m6, m4, m5
4911
+ pmaddwd m6, m16
4912
+ paddd m0, m6
4913
+ punpckhwd m4, m5
4914
+ pmaddwd m4, m16
4915
+ paddd m1, m4
4916
+
4917
+ movu m13, [r0 + r1 + mmsize]
4918
+ punpcklwd m14, m12, m13
4919
+ pmaddwd m14, m16
4920
+ paddd m8, m14
4921
+ punpckhwd m12, m13
4922
+ pmaddwd m12, m16
4923
+ paddd m9, m12
4924
+
4925
+ movu m4, [r0 + 2 * r1]
4926
+ punpcklwd m6, m5, m4
4927
+ pmaddwd m6, m16
4928
+ paddd m2, m6
4929
+ punpckhwd m5, m4
4930
+ pmaddwd m5, m16
4931
+ paddd m3, m5
4932
+
4933
+ movu m12, [r0 + 2 * r1 + mmsize]
4934
+ punpcklwd m14, m13, m12
4935
+ pmaddwd m14, m16
4936
+ paddd m10, m14
4937
+ punpckhwd m13, m12
4938
+ pmaddwd m13, m16
4939
+ paddd m11, m13
4940
+
4941
+%ifidn %1,sp
4942
+ paddd m0, m7
4943
+ paddd m1, m7
4944
+ paddd m2, m7
4945
+ paddd m3, m7
4946
+ paddd m8, m7
4947
+ paddd m9, m7
4948
+ paddd m10, m7
4949
+ paddd m11, m7
4950
+
4951
+ psrad m0, INTERP_SHIFT_SP
4952
+ psrad m1, INTERP_SHIFT_SP
4953
+ psrad m2, INTERP_SHIFT_SP
4954
+ psrad m3, INTERP_SHIFT_SP
4955
+ psrad m8, INTERP_SHIFT_SP
4956
+ psrad m9, INTERP_SHIFT_SP
4957
+ psrad m10, INTERP_SHIFT_SP
4958
+ psrad m11, INTERP_SHIFT_SP
4959
+
4960
+ packssdw m0, m1
4961
+ packssdw m2, m3
4962
+ packssdw m8, m9
4963
+ packssdw m10, m11
4964
+ CLIPW2 m0, m2, m17, m18
4965
+ CLIPW2 m8, m10, m17, m18
4966
+%else
4967
+ psrad m0, 6
4968
+ psrad m1, 6
4969
+ psrad m2, 6
4970
+ psrad m3, 6
4971
+ psrad m8, 6
4972
+ psrad m9, 6
4973
+ psrad m10, 6
4974
+ psrad m11, 6
4975
+
4976
+ packssdw m0, m1
4977
+ packssdw m2, m3
4978
+ packssdw m8, m9
4979
+ packssdw m10, m11
4980
+%endif
4981
+
4982
+ movu [r2], m0
4983
+ movu [r2 + r3], m2
4984
+ movu [r2 + mmsize], m8
4985
+ movu [r2 + r3 + mmsize], m10
4986
+%endmacro
4987
+;-----------------------------------------------------------------------------------------------------------------
4988
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
4989
+;-----------------------------------------------------------------------------------------------------------------
4990
+%macro FILTER_VER_S_CHROMA_64xN_AVX512 2
4991
+INIT_ZMM avx512
4992
+cglobal interp_4tap_vert_%1_64x%2, 5, 7, 19
4993
+ add r1d, r1d
4994
+ add r3d, r3d
4995
+ sub r0, r1
4996
+ shl r4d, 7
4997
+%ifdef PIC
4998
+ lea r5, [tab_ChromaCoeffV_avx512]
4999
+ mova m15, [r5 + r4]
5000
+ mova m16, [r5 + r4 + mmsize]
5001
+%else
5002
+ lea r5, [tab_ChromaCoeffV_avx512 + r4]
5003
+ mova m15, [r5]
5004
+ mova m16, [r5 + mmsize]
5005
+%endif
5006
+%ifidn %1, sp
5007
+ vbroadcasti32x4 m7, [INTERP_OFFSET_SP]
5008
+ pxor m17, m17
5009
+ vbroadcasti32x8 m18, [pw_pixel_max]
5010
+%endif
5011
+%rep %2/2 - 1
5012
+ PROCESS_CHROMA_VERT_S_64x2_AVX512 %1
5013
+ lea r2, [r2 + 2 * r3]
5014
+%endrep
5015
+ PROCESS_CHROMA_VERT_S_64x2_AVX512 %1
5016
+ RET
5017
+%endmacro
5018
+
5019
+%if ARCH_X86_64
5020
+ FILTER_VER_S_CHROMA_64xN_AVX512 ss, 16
5021
+ FILTER_VER_S_CHROMA_64xN_AVX512 ss, 32
5022
+ FILTER_VER_S_CHROMA_64xN_AVX512 ss, 48
5023
+ FILTER_VER_S_CHROMA_64xN_AVX512 ss, 64
5024
+ FILTER_VER_S_CHROMA_64xN_AVX512 sp, 16
5025
+ FILTER_VER_S_CHROMA_64xN_AVX512 sp, 32
5026
+ FILTER_VER_S_CHROMA_64xN_AVX512 sp, 48
5027
+ FILTER_VER_S_CHROMA_64xN_AVX512 sp, 64
5028
+%endif
5029
+;-------------------------------------------------------------------------------------------------------------
5030
+; avx512 chroma_vsp and chroma_vss code end
5031
+;-------------------------------------------------------------------------------------------------------------
5032
+;-------------------------------------------------------------------------------------------------------------
5033
+;ipfilter_chroma_avx512 code end
5034
+;-------------------------------------------------------------------------------------------------------------
5035
+;-------------------------------------------------------------------------------------------------------------
5036
+;ipfilter_luma_avx512 code start
5037
+;-------------------------------------------------------------------------------------------------------------
5038
+%macro PROCESS_IPFILTER_LUMA_PP_8x4_AVX512 0
5039
+ ; register map
5040
+ ; m0 , m1, m2, m3 - interpolate coeff
5041
+ ; m4 , m5 load shuffle order table
5042
+ ; m6 - pd_32
5043
+ ; m7 - zero
5044
+ ; m8 - pw_pixel_max
5045
+ ; m9 - store shuffle order table
5046
+
5047
+ movu xm10, [r0]
5048
+ movu xm11, [r0 + 8]
5049
+ movu xm12, [r0 + 16]
5050
+
5051
+ vinserti32x4 m10, [r0 + r1], 1
5052
+ vinserti32x4 m11, [r0 + r1 + 8], 1
5053
+ vinserti32x4 m12, [r0 + r1 + 16], 1
5054
+
5055
+ vinserti32x4 m10, [r0 + 2 * r1], 2
5056
+ vinserti32x4 m11, [r0 + 2 * r1 + 8], 2
5057
+ vinserti32x4 m12, [r0 + 2 * r1 + 16], 2
5058
+
5059
+ vinserti32x4 m10, [r0 + r6], 3
5060
+ vinserti32x4 m11, [r0 + r6 + 8], 3
5061
+ vinserti32x4 m12, [r0 + r6 + 16], 3
5062
+
5063
+ pshufb m13, m10, m5
5064
+ pshufb m10, m4
5065
+ pshufb m14, m11, m5
5066
+ pshufb m11, m4
5067
+ pshufb m15, m12, m5
5068
+ pshufb m12, m4
5069
+
5070
+ pmaddwd m10, m0
5071
+ pmaddwd m13, m1
5072
+ paddd m10, m13
5073
+ pmaddwd m13, m14, m3
5074
+ pmaddwd m16, m11, m2
5075
+ paddd m13, m16
5076
+ paddd m10, m13
5077
+ paddd m10, m6
5078
+ psrad m10, INTERP_SHIFT_PP
5079
+
5080
+ pmaddwd m11, m0
5081
+ pmaddwd m14, m1
5082
+ paddd m11, m14
5083
+ pmaddwd m15, m3
5084
+ pmaddwd m12, m2
5085
+ paddd m12, m15
5086
+ paddd m11, m12
5087
+ paddd m11, m6
5088
+ psrad m11, INTERP_SHIFT_PP
5089
+
5090
+ packusdw m10, m11
5091
+ CLIPW m10, m7, m8
5092
+ pshufb m10, m9
5093
+ movu [r2], xm10
5094
+ vextracti32x4 [r2 + r3], m10, 1
5095
+ vextracti32x4 [r2 + 2 * r3], m10, 2
5096
+ vextracti32x4 [r2 + r7], m10, 3
5097
+%endmacro
5098
+
5099
+%macro PROCESS_IPFILTER_LUMA_PP_16x4_AVX512 0
5100
+ ; register map
5101
+ ; m0 , m1, m2, m3 - interpolate coeff
5102
+ ; m4 , m5 load shuffle order table
5103
+ ; m6 - pd_32
5104
+ ; m7 - zero
5105
+ ; m8 - pw_pixel_max
5106
+ ; m9 - store shuffle order table
5107
+
5108
+ movu ym10, [r0]
5109
+ vinserti32x8 m10, [r0 + r1], 1
5110
+ movu ym11, [r0 + 8]
5111
+ vinserti32x8 m11, [r0 + r1 + 8], 1
5112
+ movu ym12, [r0 + 16]
5113
+ vinserti32x8 m12, [r0 + r1 + 16], 1
5114
+
5115
+ pshufb m13, m10, m5
5116
+ pshufb m10, m4
5117
+ pshufb m14, m11, m5
5118
+ pshufb m11, m4
5119
+ pshufb m15, m12, m5
5120
+ pshufb m12, m4
5121
+
5122
+ pmaddwd m10, m0
5123
+ pmaddwd m13, m1
5124
+ paddd m10, m13
5125
+ pmaddwd m13, m14, m3
5126
+ pmaddwd m16, m11, m2
5127
+ paddd m13, m16
5128
+ paddd m10, m13
5129
+ paddd m10, m6
5130
+ psrad m10, INTERP_SHIFT_PP
5131
+
5132
+ pmaddwd m11, m0
5133
+ pmaddwd m14, m1
5134
+ paddd m11, m14
5135
+ pmaddwd m15, m3
5136
+ pmaddwd m12, m2
5137
+ paddd m12, m15
5138
+ paddd m11, m12
5139
+ paddd m11, m6
5140
+ psrad m11, INTERP_SHIFT_PP
5141
+
5142
+ packusdw m10, m11
5143
+ CLIPW m10, m7, m8
5144
+ pshufb m10, m9
5145
+ movu [r2], ym10
5146
+ vextracti32x8 [r2 + r3], m10, 1
5147
+
5148
+ movu ym10, [r0 + 2 * r1]
5149
+ vinserti32x8 m10, [r0 + r6], 1
5150
+ movu ym11, [r0 + 2 * r1 + 8]
5151
+ vinserti32x8 m11, [r0 + r6 + 8], 1
5152
+ movu ym12, [r0 + 2 * r1 + 16]
5153
+ vinserti32x8 m12, [r0 + r6 + 16], 1
5154
+
5155
+ pshufb m13, m10, m5
5156
+ pshufb m10, m4
5157
+ pshufb m14, m11, m5
5158
+ pshufb m11, m4
5159
+ pshufb m15, m12, m5
5160
+ pshufb m12, m4
5161
+
5162
+ pmaddwd m10, m0
5163
+ pmaddwd m13, m1
5164
+ paddd m10, m13
5165
+ pmaddwd m13, m14, m3
5166
+ pmaddwd m16, m11, m2
5167
+ paddd m13, m16
5168
+ paddd m10, m13
5169
+ paddd m10, m6
5170
+ psrad m10, INTERP_SHIFT_PP
5171
+
5172
+ pmaddwd m11, m0
5173
+ pmaddwd m14, m1
5174
+ paddd m11, m14
5175
+ pmaddwd m14, m15, m3
5176
+ pmaddwd m16, m12, m2
5177
+ paddd m14, m16
5178
+ paddd m11, m14
5179
+ paddd m11, m6
5180
+ psrad m11, INTERP_SHIFT_PP
5181
+
5182
+ packusdw m10, m11
5183
+ CLIPW m10, m7, m8
5184
+ pshufb m10, m9
5185
+ movu [r2 + 2 * r3], ym10
5186
+ vextracti32x8 [r2 + r7], m10, 1
5187
+%endmacro
5188
+
5189
+%macro PROCESS_IPFILTER_LUMA_PP_24x4_AVX512 0
5190
+ ; register map
5191
+ ; m0 , m1, m2, m3 - interpolate coeff
5192
+ ; m4 , m5 load shuffle order table
5193
+ ; m6 - pd_32
5194
+ ; m7 - zero
5195
+ ; m8 - pw_pixel_max
5196
+ ; m9 - store shuffle order table
5197
+
5198
+ PROCESS_IPFILTER_LUMA_PP_16x4_AVX512
5199
+
5200
+ movu xm10, [r0 + mmsize/2]
5201
+ movu xm11, [r0 + mmsize/2 + 8]
5202
+ movu xm12, [r0 + mmsize/2 + 16]
5203
+
5204
+ vinserti32x4 m10, [r0 + r1 + mmsize/2], 1
5205
+ vinserti32x4 m11, [r0 + r1 + mmsize/2 + 8], 1
5206
+ vinserti32x4 m12, [r0 + r1 + mmsize/2 + 16], 1
5207
+
5208
+ vinserti32x4 m10, [r0 + 2 * r1 + mmsize/2], 2
5209
+ vinserti32x4 m11, [r0 + 2 * r1 + mmsize/2 + 8], 2
5210
+ vinserti32x4 m12, [r0 + 2 * r1 + mmsize/2 + 16], 2
5211
+
5212
+ vinserti32x4 m10, [r0 + r6 + mmsize/2], 3
5213
+ vinserti32x4 m11, [r0 + r6 + mmsize/2 + 8], 3
5214
+ vinserti32x4 m12, [r0 + r6 + mmsize/2 + 16], 3
5215
+
5216
+ pshufb m13, m10, m5
5217
+ pshufb m10, m4
5218
+ pshufb m14, m11, m5
5219
+ pshufb m11, m4
5220
+ pshufb m15, m12, m5
5221
+ pshufb m12, m4
5222
+
5223
+ pmaddwd m10, m0
5224
+ pmaddwd m13, m1
5225
+ paddd m10, m13
5226
+ pmaddwd m13, m14, m3
5227
+ pmaddwd m16, m11, m2
5228
+ paddd m13, m16
5229
+ paddd m10, m13
5230
+ paddd m10, m6
5231
+ psrad m10, INTERP_SHIFT_PP
5232
+
5233
+ pmaddwd m11, m0
5234
+ pmaddwd m14, m1
5235
+ paddd m11, m14
5236
+ pmaddwd m15, m3
5237
+ pmaddwd m12, m2
5238
+ paddd m12, m15
5239
+ paddd m11, m12
5240
+ paddd m11, m6
5241
+ psrad m11, INTERP_SHIFT_PP
5242
+
5243
+ packusdw m10, m11
5244
+ CLIPW m10, m7, m8
5245
+ pshufb m10, m9
5246
+ movu [r2 + mmsize/2], xm10
5247
+ vextracti32x4 [r2 + r3 + mmsize/2], m10, 1
5248
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m10, 2
5249
+ vextracti32x4 [r2 + r7 + mmsize/2], m10, 3
5250
+%endmacro
5251
+
5252
+%macro PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 0
5253
+ ; register map
5254
+ ; m0 , m1, m2, m3 - interpolate coeff
5255
+ ; m4 , m5 load shuffle order table
5256
+ ; m6 - pd_32
5257
+ ; m7 - zero
5258
+ ; m8 - pw_pixel_max
5259
+ ; m9 - store shuffle order table
5260
+
5261
+ movu m10, [r0]
5262
+ movu m11, [r0 + 8]
5263
+ movu m12, [r0 + 16]
5264
+
5265
+ pshufb m13, m10, m5
5266
+ pshufb m10, m4
5267
+ pshufb m14, m11, m5
5268
+ pshufb m11, m4
5269
+ pshufb m15, m12, m5
5270
+ pshufb m12, m4
5271
+
5272
+ pmaddwd m10, m0
5273
+ pmaddwd m13, m1
5274
+ paddd m10, m13
5275
+ pmaddwd m13, m14, m3
5276
+ pmaddwd m16, m11, m2
5277
+ paddd m13, m16
5278
+ paddd m10, m13
5279
+ paddd m10, m6
5280
+ psrad m10, INTERP_SHIFT_PP
5281
+
5282
+ pmaddwd m11, m0
5283
+ pmaddwd m14, m1
5284
+ paddd m11, m14
5285
+ pmaddwd m15, m3
5286
+ pmaddwd m12, m2
5287
+ paddd m12, m15
5288
+ paddd m11, m12
5289
+ paddd m11, m6
5290
+ psrad m11, INTERP_SHIFT_PP
5291
+
5292
+ packusdw m10, m11
5293
+ CLIPW m10, m7, m8
5294
+ pshufb m10, m9
5295
+ movu [r2], m10
5296
+
5297
+ movu m10, [r0 + r1]
5298
+ movu m11, [r0 + r1 + 8]
5299
+ movu m12, [r0 + r1 + 16]
5300
+
5301
+ pshufb m13, m10, m5
5302
+ pshufb m10, m4
5303
+ pshufb m14, m11, m5
5304
+ pshufb m11, m4
5305
+ pshufb m15, m12, m5
5306
+ pshufb m12, m4
5307
+
5308
+ pmaddwd m10, m0
5309
+ pmaddwd m13, m1
5310
+ paddd m10, m13
5311
+ pmaddwd m13, m14, m3
5312
+ pmaddwd m16, m11, m2
5313
+ paddd m13, m16
5314
+ paddd m10, m13
5315
+ paddd m10, m6
5316
+ psrad m10, INTERP_SHIFT_PP
5317
+
5318
+ pmaddwd m11, m0
5319
+ pmaddwd m14, m1
5320
+ paddd m11, m14
5321
+ pmaddwd m14, m15, m3
5322
+ pmaddwd m16, m12, m2
5323
+ paddd m14, m16
5324
+ paddd m11, m14
5325
+ paddd m11, m6
5326
+ psrad m11, INTERP_SHIFT_PP
5327
+
5328
+ packusdw m10, m11
5329
+ CLIPW m10, m7, m8
5330
+ pshufb m10, m9
5331
+ movu [r2 + r3], m10
5332
+%endmacro
5333
+
5334
+%macro PROCESS_IPFILTER_LUMA_PP_48x4_AVX512 0
5335
+ ; register map
5336
+ ; m0 , m1, m2, m3 - interpolate coeff
5337
+ ; m4 , m5 load shuffle order table
5338
+ ; m6 - pd_32
5339
+ ; m7 - zero
5340
+ ; m8 - pw_pixel_max
5341
+ ; m9 - store shuffle order table
5342
+
5343
+ movu m10, [r0]
5344
+ movu m11, [r0 + 8]
5345
+ movu m12, [r0 + 16]
5346
+
5347
+ pshufb m13, m10, m5
5348
+ pshufb m10, m4
5349
+ pshufb m14, m11, m5
5350
+ pshufb m11, m4
5351
+ pshufb m15, m12, m5
5352
+ pshufb m12, m4
5353
+
5354
+ pmaddwd m10, m0
5355
+ pmaddwd m13, m1
5356
+ paddd m10, m13
5357
+ pmaddwd m13, m14, m3
5358
+ pmaddwd m16, m11, m2
5359
+ paddd m13, m16
5360
+ paddd m10, m13
5361
+ paddd m10, m6
5362
+ psrad m10, INTERP_SHIFT_PP
5363
+
5364
+ pmaddwd m11, m0
5365
+ pmaddwd m14, m1
5366
+ paddd m11, m14
5367
+ pmaddwd m15, m3
5368
+ pmaddwd m12, m2
5369
+ paddd m12, m15
5370
+ paddd m11, m12
5371
+ paddd m11, m6
5372
+ psrad m11, INTERP_SHIFT_PP
5373
+
5374
+ packusdw m10, m11
5375
+ CLIPW m10, m7, m8
5376
+ pshufb m10, m9
5377
+ movu [r2], m10
5378
+
5379
+ movu m10, [r0 + r1]
5380
+ movu m11, [r0 + r1 + 8]
5381
+ movu m12, [r0 + r1 + 16]
5382
+
5383
+ pshufb m13, m10, m5
5384
+ pshufb m10, m4
5385
+ pshufb m14, m11, m5
5386
+ pshufb m11, m4
5387
+ pshufb m15, m12, m5
5388
+ pshufb m12, m4
5389
+
5390
+ pmaddwd m10, m0
5391
+ pmaddwd m13, m1
5392
+ paddd m10, m13
5393
+ pmaddwd m13, m14, m3
5394
+ pmaddwd m16, m11, m2
5395
+ paddd m13, m16
5396
+ paddd m10, m13
5397
+ paddd m10, m6
5398
+ psrad m10, INTERP_SHIFT_PP
5399
+
5400
+ pmaddwd m11, m0
5401
+ pmaddwd m14, m1
5402
+ paddd m11, m14
5403
+ pmaddwd m14, m15, m3
5404
+ pmaddwd m16, m12, m2
5405
+ paddd m14, m16
5406
+ paddd m11, m14
5407
+ paddd m11, m6
5408
+ psrad m11, INTERP_SHIFT_PP
5409
+
5410
+ packusdw m10, m11
5411
+ CLIPW m10, m7, m8
5412
+ pshufb m10, m9
5413
+ movu [r2 + r3], m10
5414
+
5415
+ movu m10, [r0 + 2 * r1]
5416
+ movu m11, [r0 + 2 * r1 + 8]
5417
+ movu m12, [r0 + 2 * r1 + 16]
5418
+
5419
+ pshufb m13, m10, m5
5420
+ pshufb m10, m4
5421
+ pshufb m14, m11, m5
5422
+ pshufb m11, m4
5423
+ pshufb m15, m12, m5
5424
+ pshufb m12, m4
5425
+
5426
+ pmaddwd m10, m0
5427
+ pmaddwd m13, m1
5428
+ paddd m10, m13
5429
+ pmaddwd m13, m14, m3
5430
+ pmaddwd m16, m11, m2
5431
+ paddd m13, m16
5432
+ paddd m10, m13
5433
+ paddd m10, m6
5434
+ psrad m10, INTERP_SHIFT_PP
5435
+
5436
+ pmaddwd m11, m0
5437
+ pmaddwd m14, m1
5438
+ paddd m11, m14
5439
+ pmaddwd m15, m3
5440
+ pmaddwd m12, m2
5441
+ paddd m12, m15
5442
+ paddd m11, m12
5443
+ paddd m11, m6
5444
+ psrad m11, INTERP_SHIFT_PP
5445
+
5446
+ packusdw m10, m11
5447
+ CLIPW m10, m7, m8
5448
+ pshufb m10, m9
5449
+ movu [r2 + 2 * r3], m10
5450
+
5451
+ movu m10, [r0 + r6]
5452
+ movu m11, [r0 + r6 + 8]
5453
+ movu m12, [r0 + r6 + 16]
5454
+
5455
+ pshufb m13, m10, m5
5456
+ pshufb m10, m4
5457
+ pshufb m14, m11, m5
5458
+ pshufb m11, m4
5459
+ pshufb m15, m12, m5
5460
+ pshufb m12, m4
5461
+
5462
+ pmaddwd m10, m0
5463
+ pmaddwd m13, m1
5464
+ paddd m10, m13
5465
+ pmaddwd m13, m14, m3
5466
+ pmaddwd m16, m11, m2
5467
+ paddd m13, m16
5468
+ paddd m10, m13
5469
+ paddd m10, m6
5470
+ psrad m10, INTERP_SHIFT_PP
5471
+
5472
+ pmaddwd m11, m0
5473
+ pmaddwd m14, m1
5474
+ paddd m11, m14
5475
+ pmaddwd m14, m15, m3
5476
+ pmaddwd m16, m12, m2
5477
+ paddd m14, m16
5478
+ paddd m11, m14
5479
+ paddd m11, m6
5480
+ psrad m11, INTERP_SHIFT_PP
5481
+
5482
+ packusdw m10, m11
5483
+ CLIPW m10, m7, m8
5484
+ pshufb m10, m9
5485
+ movu [r2 + r7], m10
5486
+
5487
+ movu ym10, [r0 + mmsize]
5488
+ vinserti32x8 m10, [r0 + r1 + mmsize], 1
5489
+ movu ym11, [r0 + mmsize + 8]
5490
+ vinserti32x8 m11, [r0 + r1 + mmsize + 8], 1
5491
+ movu ym12, [r0 + mmsize + 16]
5492
+ vinserti32x8 m12, [r0 + r1 + mmsize + 16], 1
5493
+
5494
+ pshufb m13, m10, m5
5495
+ pshufb m10, m4
5496
+ pshufb m14, m11, m5
5497
+ pshufb m11, m4
5498
+ pshufb m15, m12, m5
5499
+ pshufb m12, m4
5500
+
5501
+ pmaddwd m10, m0
5502
+ pmaddwd m13, m1
5503
+ paddd m10, m13
5504
+ pmaddwd m13, m14, m3
5505
+ pmaddwd m16, m11, m2
5506
+ paddd m13, m16
5507
+ paddd m10, m13
5508
+ paddd m10, m6
5509
+ psrad m10, INTERP_SHIFT_PP
5510
+
5511
+ pmaddwd m11, m0
5512
+ pmaddwd m14, m1
5513
+ paddd m11, m14
5514
+ pmaddwd m15, m3
5515
+ pmaddwd m12, m2
5516
+ paddd m12, m15
5517
+ paddd m11, m12
5518
+ paddd m11, m6
5519
+ psrad m11, INTERP_SHIFT_PP
5520
+
5521
+ packusdw m10, m11
5522
+ CLIPW m10, m7, m8
5523
+ pshufb m10, m9
5524
+ movu [r2 + mmsize], ym10
5525
+ vextracti32x8 [r2 + r3 + mmsize], m10, 1
5526
+
5527
+ movu ym10, [r0 + 2 * r1 + mmsize]
5528
+ vinserti32x8 m10, [r0 + r6 + mmsize], 1
5529
+ movu ym11, [r0 + 2 * r1 + mmsize + 8]
5530
+ vinserti32x8 m11, [r0 + r6 + mmsize + 8], 1
5531
+ movu ym12, [r0 + 2 * r1 + mmsize + 16]
5532
+ vinserti32x8 m12, [r0 + r6 + mmsize + 16], 1
5533
+
5534
+ pshufb m13, m10, m5
5535
+ pshufb m10, m4
5536
+ pshufb m14, m11, m5
5537
+ pshufb m11, m4
5538
+ pshufb m15, m12, m5
5539
+ pshufb m12, m4
5540
+
5541
+ pmaddwd m10, m0
5542
+ pmaddwd m13, m1
5543
+ paddd m10, m13
5544
+ pmaddwd m13, m14, m3
5545
+ pmaddwd m16, m11, m2
5546
+ paddd m13, m16
5547
+ paddd m10, m13
5548
+ paddd m10, m6
5549
+ psrad m10, INTERP_SHIFT_PP
5550
+
5551
+ pmaddwd m11, m0
5552
+ pmaddwd m14, m1
5553
+ paddd m11, m14
5554
+ pmaddwd m14, m15, m3
5555
+ pmaddwd m16, m12, m2
5556
+ paddd m14, m16
5557
+ paddd m11, m14
5558
+ paddd m11, m6
5559
+ psrad m11, INTERP_SHIFT_PP
5560
+
5561
+ packusdw m10, m11
5562
+ CLIPW m10, m7, m8
5563
+ pshufb m10, m9
5564
+ movu [r2 + 2 * r3 + mmsize], ym10
5565
+ vextracti32x8 [r2 + r7 + mmsize], m10, 1
5566
+%endmacro
5567
+
5568
+%macro PROCESS_IPFILTER_LUMA_PP_64x2_AVX512 0
5569
+ ; register map
5570
+ ; m0 , m1, m2, m3 - interpolate coeff
5571
+ ; m4 , m5 load shuffle order table
5572
+ ; m6 - pd_32
5573
+ ; m7 - zero
5574
+ ; m8 - pw_pixel_max
5575
+ ; m9 - store shuffle order table
5576
+
5577
+ movu m10, [r0]
5578
+ movu m11, [r0 + 8]
5579
+ movu m12, [r0 + 16]
5580
+
5581
+ pshufb m13, m10, m5
5582
+ pshufb m10, m4
5583
+ pshufb m14, m11, m5
5584
+ pshufb m11, m4
5585
+ pshufb m15, m12, m5
5586
+ pshufb m12, m4
5587
+
5588
+ pmaddwd m10, m0
5589
+ pmaddwd m13, m1
5590
+ paddd m10, m13
5591
+ pmaddwd m13, m14, m3
5592
+ pmaddwd m16, m11, m2
5593
+ paddd m13, m16
5594
+ paddd m10, m13
5595
+ paddd m10, m6
5596
+ psrad m10, INTERP_SHIFT_PP
5597
+
5598
+ pmaddwd m11, m0
5599
+ pmaddwd m14, m1
5600
+ paddd m11, m14
5601
+ pmaddwd m15, m3
5602
+ pmaddwd m12, m2
5603
+ paddd m12, m15
5604
+ paddd m11, m12
5605
+ paddd m11, m6
5606
+ psrad m11, INTERP_SHIFT_PP
5607
+
5608
+ packusdw m10, m11
5609
+ CLIPW m10, m7, m8
5610
+ pshufb m10, m9
5611
+ movu [r2], m10
5612
+
5613
+ movu m10, [r0 + mmsize]
5614
+ movu m11, [r0 + mmsize + 8]
5615
+ movu m12, [r0 + mmsize + 16]
5616
+
5617
+ pshufb m13, m10, m5
5618
+ pshufb m10, m4
5619
+ pshufb m14, m11, m5
5620
+ pshufb m11, m4
5621
+ pshufb m15, m12, m5
5622
+ pshufb m12, m4
5623
+
5624
+ pmaddwd m10, m0
5625
+ pmaddwd m13, m1
5626
+ paddd m10, m13
5627
+ pmaddwd m13, m14, m3
5628
+ pmaddwd m16, m11, m2
5629
+ paddd m13, m16
5630
+ paddd m10, m13
5631
+ paddd m10, m6
5632
+ psrad m10, INTERP_SHIFT_PP
5633
+
5634
+ pmaddwd m11, m0
5635
+ pmaddwd m14, m1
5636
+ paddd m11, m14
5637
+ pmaddwd m15, m3
5638
+ pmaddwd m12, m2
5639
+ paddd m12, m15
5640
+ paddd m11, m12
5641
+ paddd m11, m6
5642
+ psrad m11, INTERP_SHIFT_PP
5643
+
5644
+ packusdw m10, m11
5645
+ CLIPW m10, m7, m8
5646
+ pshufb m10, m9
5647
+ movu [r2 + mmsize], m10
5648
+
5649
+ movu m10, [r0 + r1]
5650
+ movu m11, [r0 + r1 + 8]
5651
+ movu m12, [r0 + r1 + 16]
5652
+
5653
+ pshufb m13, m10, m5
5654
+ pshufb m10, m4
5655
+ pshufb m14, m11, m5
5656
+ pshufb m11, m4
5657
+ pshufb m15, m12, m5
5658
+ pshufb m12, m4
5659
+
5660
+ pmaddwd m10, m0
5661
+ pmaddwd m13, m1
5662
+ paddd m10, m13
5663
+ pmaddwd m13, m14, m3
5664
+ pmaddwd m16, m11, m2
5665
+ paddd m13, m16
5666
+ paddd m10, m13
5667
+ paddd m10, m6
5668
+ psrad m10, INTERP_SHIFT_PP
5669
+
5670
+ pmaddwd m11, m0
5671
+ pmaddwd m14, m1
5672
+ paddd m11, m14
5673
+ pmaddwd m14, m15, m3
5674
+ pmaddwd m16, m12, m2
5675
+ paddd m14, m16
5676
+ paddd m11, m14
5677
+ paddd m11, m6
5678
+ psrad m11, INTERP_SHIFT_PP
5679
+
5680
+ packusdw m10, m11
5681
+ CLIPW m10, m7, m8
5682
+ pshufb m10, m9
5683
+ movu [r2 + r3], m10
5684
+
5685
+ movu m10, [r0 + r1 + mmsize]
5686
+ movu m11, [r0 + r1 + mmsize + 8]
5687
+ movu m12, [r0 + r1 + mmsize + 16]
5688
+
5689
+ pshufb m13, m10, m5
5690
+ pshufb m10, m4
5691
+ pshufb m14, m11, m5
5692
+ pshufb m11, m4
5693
+ pshufb m15, m12, m5
5694
+ pshufb m12, m4
5695
+
5696
+ pmaddwd m10, m0
5697
+ pmaddwd m13, m1
5698
+ paddd m10, m13
5699
+ pmaddwd m13, m14, m3
5700
+ pmaddwd m16, m11, m2
5701
+ paddd m13, m16
5702
+ paddd m10, m13
5703
+ paddd m10, m6
5704
+ psrad m10, INTERP_SHIFT_PP
5705
+
5706
+ pmaddwd m11, m0
5707
+ pmaddwd m14, m1
5708
+ paddd m11, m14
5709
+ pmaddwd m14, m15, m3
5710
+ pmaddwd m16, m12, m2
5711
+ paddd m14, m16
5712
+ paddd m11, m14
5713
+ paddd m11, m6
5714
+ psrad m11, INTERP_SHIFT_PP
5715
+
5716
+ packusdw m10, m11
5717
+ CLIPW m10, m7, m8
5718
+ pshufb m10, m9
5719
+ movu [r2 + r3 + mmsize], m10
5720
+%endmacro
5721
+
5722
+%macro IPFILTER_LUMA_AVX512_8xN 1
5723
+INIT_ZMM avx512
5724
+cglobal interp_8tap_horiz_pp_8x%1, 5, 8, 17
5725
+ add r1d, r1d
5726
+ add r3d, r3d
5727
+ sub r0, 6
5728
+ mov r4d, r4m
5729
+ shl r4d, 4
5730
+
5731
+%ifdef PIC
5732
+ lea r5, [tab_LumaCoeff]
5733
+ vpbroadcastd m0, [r5 + r4]
5734
+ vpbroadcastd m1, [r5 + r4 + 4]
5735
+ vpbroadcastd m2, [r5 + r4 + 8]
5736
+ vpbroadcastd m3, [r5 + r4 + 12]
5737
+%else
5738
+ vpbroadcastd m0, [tab_LumaCoeff + r4]
5739
+ vpbroadcastd m1, [tab_LumaCoeff + r4 + 4]
5740
+ vpbroadcastd m2, [tab_LumaCoeff + r4 + 8]
5741
+ vpbroadcastd m3, [tab_LumaCoeff + r4 + 12]
5742
+%endif
5743
+ vbroadcasti32x8 m4, [interp8_hpp_shuf1_load_avx512]
5744
+ vbroadcasti32x8 m5, [interp8_hpp_shuf2_load_avx512]
5745
+ vbroadcasti32x8 m6, [pd_32]
5746
+ pxor m7, m7
5747
+ vbroadcasti32x8 m8, [pw_pixel_max]
5748
+ vbroadcasti32x8 m9, [interp8_hpp_shuf1_store_avx512]
5749
+ lea r6, [3 * r1]
5750
+ lea r7, [3 * r3]
5751
+
5752
+%rep %1/4 - 1
5753
+ PROCESS_IPFILTER_LUMA_PP_8x4_AVX512
5754
+ lea r0, [r0 + 4 * r1]
5755
+ lea r2, [r2 + 4 * r3]
5756
+%endrep
5757
+ PROCESS_IPFILTER_LUMA_PP_8x4_AVX512
5758
+ RET
5759
+%endmacro
5760
+
5761
+%if ARCH_X86_64
5762
+ IPFILTER_LUMA_AVX512_8xN 4
5763
+ IPFILTER_LUMA_AVX512_8xN 8
5764
+ IPFILTER_LUMA_AVX512_8xN 16
5765
+ IPFILTER_LUMA_AVX512_8xN 32
5766
+%endif
5767
+
5768
+%macro IPFILTER_LUMA_AVX512_16xN 1
5769
+INIT_ZMM avx512
5770
+cglobal interp_8tap_horiz_pp_16x%1, 5,8,17
5771
+ add r1d, r1d
5772
+ add r3d, r3d
5773
+ sub r0, 6
5774
+ mov r4d, r4m
5775
+ shl r4d, 4
5776
+
5777
+%ifdef PIC
5778
+ lea r5, [tab_LumaCoeff]
5779
+ vpbroadcastd m0, [r5 + r4]
5780
+ vpbroadcastd m1, [r5 + r4 + 4]
5781
+ vpbroadcastd m2, [r5 + r4 + 8]
5782
+ vpbroadcastd m3, [r5 + r4 + 12]
5783
+%else
5784
+ vpbroadcastd m0, [tab_LumaCoeff + r4]
5785
+ vpbroadcastd m1, [tab_LumaCoeff + r4 + 4]
5786
+ vpbroadcastd m2, [tab_LumaCoeff + r4 + 8]
5787
+ vpbroadcastd m3, [tab_LumaCoeff + r4 + 12]
5788
+%endif
5789
+ vbroadcasti32x8 m4, [interp8_hpp_shuf1_load_avx512]
5790
+ vbroadcasti32x8 m5, [interp8_hpp_shuf2_load_avx512]
5791
+ vbroadcasti32x8 m6, [pd_32]
5792
+ pxor m7, m7
5793
+ vbroadcasti32x8 m8, [pw_pixel_max]
5794
+ vbroadcasti32x8 m9, [interp8_hpp_shuf1_store_avx512]
5795
+ lea r6, [3 * r1]
5796
+ lea r7, [3 * r3]
5797
+
5798
+%rep %1/4 - 1
5799
+ PROCESS_IPFILTER_LUMA_PP_16x4_AVX512
5800
+ lea r0, [r0 + 4 * r1]
5801
+ lea r2, [r2 + 4 * r3]
5802
+%endrep
5803
+ PROCESS_IPFILTER_LUMA_PP_16x4_AVX512
5804
+ RET
5805
+%endmacro
5806
+
5807
+%if ARCH_X86_64
5808
+IPFILTER_LUMA_AVX512_16xN 4
5809
+IPFILTER_LUMA_AVX512_16xN 8
5810
+IPFILTER_LUMA_AVX512_16xN 12
5811
+IPFILTER_LUMA_AVX512_16xN 16
5812
+IPFILTER_LUMA_AVX512_16xN 32
5813
+IPFILTER_LUMA_AVX512_16xN 64
5814
+%endif
5815
+
5816
+%if ARCH_X86_64
5817
+INIT_ZMM avx512
5818
+cglobal interp_8tap_horiz_pp_24x32, 5, 8, 17
5819
+ add r1d, r1d
5820
+ add r3d, r3d
5821
+ sub r0, 6
5822
+ mov r4d, r4m
5823
+ shl r4d, 4
5824
+
5825
+%ifdef PIC
5826
+ lea r5, [tab_LumaCoeff]
5827
+ vpbroadcastd m0, [r5 + r4]
5828
+ vpbroadcastd m1, [r5 + r4 + 4]
5829
+ vpbroadcastd m2, [r5 + r4 + 8]
5830
+ vpbroadcastd m3, [r5 + r4 + 12]
5831
+%else
5832
+ vpbroadcastd m0, [tab_LumaCoeff + r4]
5833
+ vpbroadcastd m1, [tab_LumaCoeff + r4 + 4]
5834
+ vpbroadcastd m2, [tab_LumaCoeff + r4 + 8]
5835
+ vpbroadcastd m3, [tab_LumaCoeff + r4 + 12]
5836
+%endif
5837
+ vbroadcasti32x8 m4, [interp8_hpp_shuf1_load_avx512]
5838
+ vbroadcasti32x8 m5, [interp8_hpp_shuf2_load_avx512]
5839
+ vbroadcasti32x8 m6, [pd_32]
5840
+ pxor m7, m7
5841
+ vbroadcasti32x8 m8, [pw_pixel_max]
5842
+ vbroadcasti32x8 m9, [interp8_hpp_shuf1_store_avx512]
5843
+ lea r6, [3 * r1]
5844
+ lea r7, [3 * r3]
5845
+
5846
+%rep 7
5847
+ PROCESS_IPFILTER_LUMA_PP_24x4_AVX512
5848
+ lea r0, [r0 + 4 * r1]
5849
+ lea r2, [r2 + 4 * r3]
5850
+%endrep
5851
+ PROCESS_IPFILTER_LUMA_PP_24x4_AVX512
5852
+ RET
5853
+%endif
5854
+
5855
+%macro IPFILTER_LUMA_AVX512_32xN 1
5856
+INIT_ZMM avx512
5857
+cglobal interp_8tap_horiz_pp_32x%1, 5,6,17
5858
+ add r1d, r1d
5859
+ add r3d, r3d
5860
+ sub r0, 6
5861
+ mov r4d, r4m
5862
+ shl r4d, 4
5863
+
5864
+%ifdef PIC
5865
+ lea r5, [tab_LumaCoeff]
5866
+ vpbroadcastd m0, [r5 + r4]
5867
+ vpbroadcastd m1, [r5 + r4 + 4]
5868
+ vpbroadcastd m2, [r5 + r4 + 8]
5869
+ vpbroadcastd m3, [r5 + r4 + 12]
5870
+%else
5871
+ vpbroadcastd m0, [tab_LumaCoeff + r4]
5872
+ vpbroadcastd m1, [tab_LumaCoeff + r4 + 4]
5873
+ vpbroadcastd m2, [tab_LumaCoeff + r4 + 8]
5874
+ vpbroadcastd m3, [tab_LumaCoeff + r4 + 12]
5875
+%endif
5876
+ vbroadcasti32x8 m4, [interp8_hpp_shuf1_load_avx512]
5877
+ vbroadcasti32x8 m5, [interp8_hpp_shuf2_load_avx512]
5878
+ vbroadcasti32x8 m6, [pd_32]
5879
+ pxor m7, m7
5880
+ vbroadcasti32x8 m8, [pw_pixel_max]
5881
+ vbroadcasti32x8 m9, [interp8_hpp_shuf1_store_avx512]
5882
+
5883
+%rep %1/2 - 1
5884
+ PROCESS_IPFILTER_LUMA_PP_32x2_AVX512
5885
+ lea r0, [r0 + 2 * r1]
5886
+ lea r2, [r2 + 2 * r3]
5887
+%endrep
5888
+ PROCESS_IPFILTER_LUMA_PP_32x2_AVX512
5889
+ RET
5890
+%endmacro
5891
+
5892
+%if ARCH_X86_64
5893
+IPFILTER_LUMA_AVX512_32xN 8
5894
+IPFILTER_LUMA_AVX512_32xN 16
5895
+IPFILTER_LUMA_AVX512_32xN 24
5896
+IPFILTER_LUMA_AVX512_32xN 32
5897
+IPFILTER_LUMA_AVX512_32xN 64
5898
+%endif
5899
+
5900
+%macro IPFILTER_LUMA_AVX512_64xN 1
5901
+INIT_ZMM avx512
5902
+cglobal interp_8tap_horiz_pp_64x%1, 5,6,17
5903
+ add r1d, r1d
5904
+ add r3d, r3d
5905
+ sub r0, 6
5906
+ mov r4d, r4m
5907
+ shl r4d, 4
5908
+
5909
+%ifdef PIC
5910
+ lea r5, [tab_LumaCoeff]
5911
+ vpbroadcastd m0, [r5 + r4]
5912
+ vpbroadcastd m1, [r5 + r4 + 4]
5913
+ vpbroadcastd m2, [r5 + r4 + 8]
5914
+ vpbroadcastd m3, [r5 + r4 + 12]
5915
+%else
5916
+ vpbroadcastd m0, [tab_LumaCoeff + r4]
5917
+ vpbroadcastd m1, [tab_LumaCoeff + r4 + 4]
5918
+ vpbroadcastd m2, [tab_LumaCoeff + r4 + 8]
5919
+ vpbroadcastd m3, [tab_LumaCoeff + r4 + 12]
5920
+%endif
5921
+ vbroadcasti32x8 m4, [interp8_hpp_shuf1_load_avx512]
5922
+ vbroadcasti32x8 m5, [interp8_hpp_shuf2_load_avx512]
5923
+ vbroadcasti32x8 m6, [pd_32]
5924
+ pxor m7, m7
5925
+ vbroadcasti32x8 m8, [pw_pixel_max]
5926
+ vbroadcasti32x8 m9, [interp8_hpp_shuf1_store_avx512]
5927
+
5928
+%rep %1/2 - 1
5929
+ PROCESS_IPFILTER_LUMA_PP_64x2_AVX512
5930
+ lea r0, [r0 + 2 * r1]
5931
+ lea r2, [r2 + 2 * r3]
5932
+%endrep
5933
+ PROCESS_IPFILTER_LUMA_PP_64x2_AVX512
5934
+ RET
5935
+%endmacro
5936
+
5937
+%if ARCH_X86_64
5938
+IPFILTER_LUMA_AVX512_64xN 16
5939
+IPFILTER_LUMA_AVX512_64xN 32
5940
+IPFILTER_LUMA_AVX512_64xN 48
5941
+IPFILTER_LUMA_AVX512_64xN 64
5942
+%endif
5943
+
5944
+%if ARCH_X86_64
5945
+INIT_ZMM avx512
5946
+cglobal interp_8tap_horiz_pp_48x64, 5,8,17
5947
+ add r1d, r1d
5948
+ add r3d, r3d
5949
+ sub r0, 6
5950
+ mov r4d, r4m
5951
+ shl r4d, 4
5952
+
5953
+%ifdef PIC
5954
+ lea r5, [tab_LumaCoeff]
5955
+ vpbroadcastd m0, [r5 + r4]
5956
+ vpbroadcastd m1, [r5 + r4 + 4]
5957
+ vpbroadcastd m2, [r5 + r4 + 8]
5958
+ vpbroadcastd m3, [r5 + r4 + 12]
5959
+%else
5960
+ vpbroadcastd m0, [tab_LumaCoeff + r4]
5961
+ vpbroadcastd m1, [tab_LumaCoeff + r4 + 4]
5962
+ vpbroadcastd m2, [tab_LumaCoeff + r4 + 8]
5963
+ vpbroadcastd m3, [tab_LumaCoeff + r4 + 12]
5964
+%endif
5965
+ vbroadcasti32x8 m4, [interp8_hpp_shuf1_load_avx512]
5966
+ vbroadcasti32x8 m5, [interp8_hpp_shuf2_load_avx512]
5967
+ vbroadcasti32x8 m6, [pd_32]
5968
+ pxor m7, m7
5969
+ vbroadcasti32x8 m8, [pw_pixel_max]
5970
+ vbroadcasti32x8 m9, [interp8_hpp_shuf1_store_avx512]
5971
+ lea r6, [3 * r1]
5972
+ lea r7, [3 * r3]
5973
+
5974
+%rep 15
5975
+ PROCESS_IPFILTER_LUMA_PP_48x4_AVX512
5976
+ lea r0, [r0 + 4 * r1]
5977
+ lea r2, [r2 + 4 * r3]
5978
+%endrep
5979
+ PROCESS_IPFILTER_LUMA_PP_48x4_AVX512
5980
+ RET
5981
+%endif
5982
+;-------------------------------------------------------------------------------------------------------------
5983
+;avx512 luma_hps code start
5984
+;-------------------------------------------------------------------------------------------------------------
5985
+
5986
+%macro PROCESS_IPFILTER_LUMA_PS_32x2_AVX512 0
5987
+ ; register map
5988
+ ; m0, m1, m2, m3 - interpolate coeff
5989
+ ; m4, m5 - shuffle load order table
5990
+ ; m6 - INTERP_OFFSET_PS
5991
+ ; m7 - shuffle store order table
5992
+
5993
+ movu m8, [r0]
5994
+ movu m9, [r0 + 8]
5995
+ movu m10, [r0 + 16]
5996
+
5997
+ pshufb m11, m8, m5
5998
+ pshufb m8, m4
5999
+ pmaddwd m8, m0
6000
+ pmaddwd m11, m1
6001
+ paddd m8, m11
6002
+ pshufb m12, m9, m5
6003
+ pshufb m9, m4
6004
+ pmaddwd m11, m12, m3
6005
+ pmaddwd m14, m9, m2
6006
+ paddd m11, m14
6007
+
6008
+ paddd m8, m11
6009
+ paddd m8, m6
6010
+ psrad m8, INTERP_SHIFT_PS
6011
+
6012
+ pshufb m13, m10, m5
6013
+ pshufb m10, m4
6014
+ pmaddwd m9, m0
6015
+ pmaddwd m12, m1
6016
+ paddd m9, m12
6017
+ pmaddwd m13, m3
6018
+ pmaddwd m10, m2
6019
+ paddd m10, m13
6020
+
6021
+ paddd m9, m10
6022
+ paddd m9, m6
6023
+ psrad m9, INTERP_SHIFT_PS
6024
+
6025
+ packssdw m8, m9
6026
+ pshufb m8, m7
6027
+ movu [r2], m8
6028
+
6029
+ movu m8, [r0 + r1]
6030
+ movu m9, [r0 + r1 + 8]
6031
+ movu m10, [r0 + r1 + 16]
6032
+
6033
+ pshufb m11, m8, m5
6034
+ pshufb m8, m4
6035
+ pmaddwd m8, m0
6036
+ pmaddwd m11, m1
6037
+ paddd m8, m11
6038
+ pshufb m12, m9, m5
6039
+ pshufb m9, m4
6040
+ pmaddwd m11, m12, m3
6041
+ pmaddwd m14, m9, m2
6042
+ paddd m11, m14
6043
+
6044
+ paddd m8, m11
6045
+ paddd m8, m6
6046
+ psrad m8, INTERP_SHIFT_PS
6047
+
6048
+ pshufb m13, m10, m5
6049
+ pshufb m10, m4
6050
+ pmaddwd m9, m0
6051
+ pmaddwd m12, m1
6052
+ paddd m9, m12
6053
+ pmaddwd m12, m13, m3
6054
+ pmaddwd m14, m10, m2
6055
+ paddd m12, m14
6056
+
6057
+ paddd m9, m12
6058
+ paddd m9, m6
6059
+ psrad m9, INTERP_SHIFT_PS
6060
+
6061
+ packssdw m8, m9
6062
+ pshufb m8, m7
6063
+ movu [r2 + r3],m8
6064
+%endmacro
6065
+
6066
+%macro PROCESS_IPFILTER_LUMA_PS_32x1_AVX512 0
6067
+ movu m8, [r0]
6068
+ movu m9, [r0 + 8]
6069
+ movu m10, [r0 + 16]
6070
+
6071
+ pshufb m11, m8, m5
6072
+ pshufb m8, m4
6073
+ pmaddwd m8, m0
6074
+ pmaddwd m11, m1
6075
+ paddd m8, m11
6076
+ pshufb m12, m9, m5
6077
+ pshufb m9, m4
6078
+ pmaddwd m11, m12, m3
6079
+ pmaddwd m14, m9, m2
6080
+ paddd m11, m14
6081
+
6082
+ paddd m8, m11
6083
+ paddd m8, m6
6084
+ psrad m8, INTERP_SHIFT_PS
6085
+
6086
+ pshufb m13, m10, m5
6087
+ pshufb m10, m4
6088
+ pmaddwd m9, m0
6089
+ pmaddwd m12, m1
6090
+ paddd m9, m12
6091
+ pmaddwd m13, m3
6092
+ pmaddwd m10, m2
6093
+ paddd m10, m13
6094
+
6095
+ paddd m9, m10
6096
+ paddd m9, m6
6097
+ psrad m9, INTERP_SHIFT_PS
6098
+
6099
+ packssdw m8, m9
6100
+ pshufb m8, m7
6101
+ movu [r2], m8
6102
+%endmacro
6103
+
6104
+%macro IPFILTER_LUMA_PS_AVX512_32xN 1
6105
+INIT_ZMM avx512
6106
+cglobal interp_8tap_horiz_ps_32x%1, 4,7,15
6107
+ shl r1d, 1
6108
+ shl r3d, 1
6109
+ mov r4d, r4m
6110
+ mov r5d, r5m
6111
+ shl r4d, 6
6112
+
6113
+%ifdef PIC
6114
+ lea r6, [tab_LumaCoeffH_avx512]
6115
+ vpbroadcastd m0, [r6 + r4]
6116
+ vpbroadcastd m1, [r6 + r4 + 4]
6117
+ vpbroadcastd m2, [r6 + r4 + 8]
6118
+ vpbroadcastd m3, [r6 + r4 + 12]
6119
+%else
6120
+ vpbroadcastd m0, [tab_LumaCoeffH_avx512 + r4]
6121
+ vpbroadcastd m1, [tab_LumaCoeffH_avx512 + r4 + 4]
6122
+ vpbroadcastd m2, [tab_LumaCoeffH_avx512 + r4 + 8]
6123
+ vpbroadcastd m3, [tab_LumaCoeffH_avx512 + r4 + 12]
6124
+%endif
6125
+ vbroadcasti32x8 m4, [interp8_hpp_shuf1_load_avx512]
6126
+ vbroadcasti32x8 m5, [interp8_hpp_shuf2_load_avx512]
6127
+ vbroadcasti32x4 m6, [INTERP_OFFSET_PS]
6128
+ vbroadcasti32x8 m7, [interp8_hpp_shuf1_store_avx512]
6129
+
6130
+ sub r0, 6
6131
+ mov r4d, %1
6132
+ test r5d, r5d
6133
+ jz .loop
6134
+ lea r6, [r1 * 3]
6135
+ sub r0, r6
6136
+ add r4d, 7
6137
+ PROCESS_IPFILTER_LUMA_PS_32x1_AVX512
6138
+ lea r0, [r0 + r1]
6139
+ lea r2, [r2 + r3]
6140
+ dec r4d
6141
+
6142
+.loop:
6143
+ PROCESS_IPFILTER_LUMA_PS_32x2_AVX512
6144
+ lea r0, [r0 + 2 * r1]
6145
+ lea r2, [r2 + 2 * r3]
6146
+ sub r4d, 2
6147
+ jnz .loop
6148
+ RET
6149
+%endmacro
6150
+
6151
+%if ARCH_X86_64
6152
+IPFILTER_LUMA_PS_AVX512_32xN 8
6153
+IPFILTER_LUMA_PS_AVX512_32xN 16
6154
+IPFILTER_LUMA_PS_AVX512_32xN 24
6155
+IPFILTER_LUMA_PS_AVX512_32xN 32
6156
+IPFILTER_LUMA_PS_AVX512_32xN 64
6157
+%endif
6158
+
6159
+%macro PROCESS_IPFILTER_LUMA_PS_64x2_AVX512 0
6160
+ ; register map
6161
+ ; m0, m1, m2, m3 - interpolate coeff
6162
+ ; m4, m5 - shuffle load order table
6163
+ ; m6 - INTERP_OFFSET_PS
6164
+ ; m7 - shuffle store order table
6165
+
6166
+ movu m8, [r0]
6167
+ movu m9, [r0 + 8]
6168
+ movu m10, [r0 + 16]
6169
+
6170
+ pshufb m11, m8, m5
6171
+ pshufb m8, m4
6172
+ pmaddwd m8, m0
6173
+ pmaddwd m11, m1
6174
+ paddd m8, m11
6175
+ pshufb m12, m9, m5
6176
+ pshufb m9, m4
6177
+ pmaddwd m11, m12, m3
6178
+ pmaddwd m14, m9, m2
6179
+ paddd m11, m14
6180
+
6181
+ paddd m8, m11
6182
+ paddd m8, m6
6183
+ psrad m8, INTERP_SHIFT_PS
6184
+
6185
+ pshufb m13, m10, m5
6186
+ pshufb m10, m4
6187
+ pmaddwd m9, m0
6188
+ pmaddwd m12, m1
6189
+ paddd m9, m12
6190
+ pmaddwd m13, m3
6191
+ pmaddwd m10, m2
6192
+ paddd m10, m13
6193
+
6194
+ paddd m9, m10
6195
+ paddd m9, m6
6196
+ psrad m9, INTERP_SHIFT_PS
6197
+
6198
+ packssdw m8, m9
6199
+ pshufb m8, m7
6200
+ movu [r2], m8
6201
+
6202
+ movu m8, [r0 + mmsize]
6203
+ movu m9, [r0 + mmsize + 8]
6204
+ movu m10, [r0 + mmsize + 16]
6205
+
6206
+ pshufb m11, m8, m5
6207
+ pshufb m8, m4
6208
+ pmaddwd m8, m0
6209
+ pmaddwd m11, m1
6210
+ paddd m8, m11
6211
+ pshufb m12, m9, m5
6212
+ pshufb m9, m4
6213
+ pmaddwd m11, m12, m3
6214
+ pmaddwd m14, m9, m2
6215
+ paddd m11, m14
6216
+ paddd m8, m11
6217
+ paddd m8, m6
6218
+ psrad m8, INTERP_SHIFT_PS
6219
+
6220
+ pshufb m13, m10, m5
6221
+ pshufb m10, m4
6222
+ pmaddwd m9, m0
6223
+ pmaddwd m12, m1
6224
+ paddd m9, m12
6225
+ pmaddwd m13, m3
6226
+ pmaddwd m10, m2
6227
+ paddd m10, m13
6228
+ paddd m9, m10
6229
+ paddd m9, m6
6230
+ psrad m9, INTERP_SHIFT_PS
6231
+
6232
+ packssdw m8, m9
6233
+ pshufb m8, m7
6234
+ movu [r2 + mmsize], m8
6235
+
6236
+ movu m8, [r0 + r1]
6237
+ movu m9, [r0 + r1 + 8]
6238
+ movu m10, [r0 + r1 + 16]
6239
+
6240
+ pshufb m11, m8, m5
6241
+ pshufb m8, m4
6242
+ pmaddwd m8, m0
6243
+ pmaddwd m11, m1
6244
+ paddd m8, m11
6245
+ pshufb m12, m9, m5
6246
+ pshufb m9, m4
6247
+ pmaddwd m11, m12, m3
6248
+ pmaddwd m14, m9, m2
6249
+ paddd m11, m14
6250
+ paddd m8, m11
6251
+ paddd m8, m6
6252
+ psrad m8, INTERP_SHIFT_PS
6253
+
6254
+ pshufb m13, m10, m5
6255
+ pshufb m10, m4
6256
+ pmaddwd m9, m0
6257
+ pmaddwd m12, m1
6258
+ paddd m9, m12
6259
+ pmaddwd m12, m13, m3
6260
+ pmaddwd m14, m10, m2
6261
+ paddd m12, m14
6262
+ paddd m9, m12
6263
+ paddd m9, m6
6264
+ psrad m9, INTERP_SHIFT_PS
6265
+
6266
+ packssdw m8, m9
6267
+ pshufb m8, m7
6268
+ movu [r2 + r3],m8
6269
+
6270
+ movu m8, [r0 + r1 + mmsize]
6271
+ movu m9, [r0 + r1 + mmsize + 8]
6272
+ movu m10, [r0 + r1 + mmsize + 16]
6273
+
6274
+ pshufb m11, m8, m5
6275
+ pshufb m8, m4
6276
+ pmaddwd m8, m0
6277
+ pmaddwd m11, m1
6278
+ paddd m8, m11
6279
+ pshufb m12, m9, m5
6280
+ pshufb m9, m4
6281
+ pmaddwd m11, m12, m3
6282
+ pmaddwd m14, m9, m2
6283
+ paddd m11, m14
6284
+ paddd m8, m11
6285
+ paddd m8, m6
6286
+ psrad m8, INTERP_SHIFT_PS
6287
+
6288
+ pshufb m13, m10, m5
6289
+ pshufb m10, m4
6290
+ pmaddwd m9, m0
6291
+ pmaddwd m12, m1
6292
+ paddd m9, m12
6293
+ pmaddwd m12, m13, m3
6294
+ pmaddwd m14, m10, m2
6295
+ paddd m12, m14
6296
+ paddd m9, m12
6297
+ paddd m9, m6
6298
+ psrad m9, INTERP_SHIFT_PS
6299
+
6300
+ packssdw m8, m9
6301
+ pshufb m8, m7
6302
+ movu [r2 + r3 + mmsize], m8
6303
+%endmacro
6304
+
6305
+%macro PROCESS_IPFILTER_LUMA_PS_64x1_AVX512 0
6306
+
6307
+ movu m8, [r0]
6308
+ movu m9, [r0 + 8]
6309
+ movu m10, [r0 + 16]
6310
+
6311
+ pshufb m11, m8, m5
6312
+ pshufb m8, m4
6313
+ pmaddwd m8, m0
6314
+ pmaddwd m11, m1
6315
+ paddd m8, m11
6316
+ pshufb m12, m9, m5
6317
+ pshufb m9, m4
6318
+ pmaddwd m11, m12, m3
6319
+ pmaddwd m14, m9, m2
6320
+ paddd m11, m14
6321
+ paddd m8, m11
6322
+ paddd m8, m6
6323
+ psrad m8, INTERP_SHIFT_PS
6324
+
6325
+ pshufb m13, m10, m5
6326
+ pshufb m10, m4
6327
+ pmaddwd m9, m0
6328
+ pmaddwd m12, m1
6329
+ paddd m9, m12
6330
+ pmaddwd m13, m3
6331
+ pmaddwd m10, m2
6332
+ paddd m10, m13
6333
+ paddd m9, m10
6334
+ paddd m9, m6
6335
+ psrad m9, INTERP_SHIFT_PS
6336
+
6337
+ packssdw m8, m9
6338
+ pshufb m8, m7
6339
+ movu [r2], m8
6340
+
6341
+ movu m8, [r0 + mmsize]
6342
+ movu m9, [r0 + mmsize + 8]
6343
+ movu m10, [r0 + mmsize + 16]
6344
+
6345
+ pshufb m11, m8, m5
6346
+ pshufb m8, m4
6347
+ pmaddwd m8, m0
6348
+ pmaddwd m11, m1
6349
+ paddd m8, m11
6350
+ pshufb m12, m9, m5
6351
+ pshufb m9, m4
6352
+ pmaddwd m11, m12, m3
6353
+ pmaddwd m14, m9, m2
6354
+ paddd m11, m14
6355
+ paddd m8, m11
6356
+ paddd m8, m6
6357
+ psrad m8, INTERP_SHIFT_PS
6358
+
6359
+ pshufb m13, m10, m5
6360
+ pshufb m10, m4
6361
+ pmaddwd m9, m0
6362
+ pmaddwd m12, m1
6363
+ paddd m9, m12
6364
+ pmaddwd m13, m3
6365
+ pmaddwd m10, m2
6366
+ paddd m10, m13
6367
+ paddd m9, m10
6368
+ paddd m9, m6
6369
+ psrad m9, INTERP_SHIFT_PS
6370
+
6371
+ packssdw m8, m9
6372
+ pshufb m8, m7
6373
+ movu [r2 + mmsize], m8
6374
+%endmacro
6375
+
6376
+%macro IPFILTER_LUMA_PS_AVX512_64xN 1
6377
+INIT_ZMM avx512
6378
+cglobal interp_8tap_horiz_ps_64x%1, 4,7,15
6379
+ shl r1d, 1
6380
+ shl r3d, 1
6381
+ mov r4d, r4m
6382
+ mov r5d, r5m
6383
+ shl r4d, 6
6384
+
6385
+%ifdef PIC
6386
+ lea r6, [tab_LumaCoeffH_avx512]
6387
+ vpbroadcastd m0, [r6 + r4]
6388
+ vpbroadcastd m1, [r6 + r4 + 4]
6389
+ vpbroadcastd m2, [r6 + r4 + 8]
6390
+ vpbroadcastd m3, [r6 + r4 + 12]
6391
+%else
6392
+ vpbroadcastd m0, [tab_LumaCoeffH_avx512 + r4]
6393
+ vpbroadcastd m1, [tab_LumaCoeffH_avx512 + r4 + 4]
6394
+ vpbroadcastd m2, [tab_LumaCoeffH_avx512 + r4 + 8]
6395
+ vpbroadcastd m3, [tab_LumaCoeffH_avx512 + r4 + 12]
6396
+%endif
6397
+ vbroadcasti32x8 m4, [interp8_hpp_shuf1_load_avx512]
6398
+ vbroadcasti32x8 m5, [interp8_hpp_shuf2_load_avx512]
6399
+ vbroadcasti32x4 m6, [INTERP_OFFSET_PS]
6400
+ vbroadcasti32x8 m7, [interp8_hpp_shuf1_store_avx512]
6401
+
6402
+ sub r0, 6
6403
+ mov r4d, %1
6404
+ test r5d, r5d
6405
+ jz .loop
6406
+ lea r6, [r1 * 3]
6407
+ sub r0, r6
6408
+ add r4d, 7
6409
+ PROCESS_IPFILTER_LUMA_PS_64x1_AVX512
6410
+ lea r0, [r0 + r1]
6411
+ lea r2, [r2 + r3]
6412
+ dec r4d
6413
+
6414
+.loop:
6415
+ PROCESS_IPFILTER_LUMA_PS_64x2_AVX512
6416
+ lea r0, [r0 + 2 * r1]
6417
+ lea r2, [r2 + 2 * r3]
6418
+ sub r4d, 2
6419
+ jnz .loop
6420
+ RET
6421
+%endmacro
6422
+
6423
+%if ARCH_X86_64
6424
+IPFILTER_LUMA_PS_AVX512_64xN 16
6425
+IPFILTER_LUMA_PS_AVX512_64xN 32
6426
+IPFILTER_LUMA_PS_AVX512_64xN 48
6427
+IPFILTER_LUMA_PS_AVX512_64xN 64
6428
+%endif
6429
+
6430
+%macro PROCESS_IPFILTER_LUMA_PS_16x4_AVX512 0
6431
+ ; register map
6432
+ ; m0, m1, m2, m3 - interpolate coeff
6433
+ ; m4, m5 - shuffle load order table
6434
+ ; m6 - INTERP_OFFSET_PS
6435
+ ; m7 - shuffle store order table
6436
+
6437
+ movu ym8, [r0]
6438
+ vinserti32x8 m8, [r0 + r1], 1
6439
+ movu ym9, [r0 + 8]
6440
+ vinserti32x8 m9, [r0 + r1 + 8], 1
6441
+ movu ym10, [r0 + 16]
6442
+ vinserti32x8 m10, [r0 + r1 + 16], 1
6443
+
6444
+ pshufb m11, m8, m5
6445
+ pshufb m8, m4
6446
+ pmaddwd m8, m0
6447
+ pmaddwd m11, m1
6448
+ paddd m8, m11
6449
+ pshufb m12, m9, m5
6450
+ pshufb m9, m4
6451
+ pmaddwd m11, m12, m3
6452
+ pmaddwd m14, m9, m2
6453
+ paddd m11, m14
6454
+ paddd m8, m11
6455
+ paddd m8, m6
6456
+ psrad m8, INTERP_SHIFT_PS
6457
+
6458
+ pshufb m13, m10, m5
6459
+ pshufb m10, m4
6460
+ pmaddwd m9, m0
6461
+ pmaddwd m12, m1
6462
+ paddd m9, m12
6463
+ pmaddwd m13, m3
6464
+ pmaddwd m10, m2
6465
+ paddd m10, m13
6466
+ paddd m9, m10
6467
+ paddd m9, m6
6468
+ psrad m9, INTERP_SHIFT_PS
6469
+
6470
+ packssdw m8, m9
6471
+ pshufb m8, m7
6472
+ movu [r2], ym8
6473
+ vextracti32x8 [r2 + r3],m8, 1
6474
+
6475
+ movu ym8, [r0 + 2 * r1]
6476
+ vinserti32x8 m8, [r0 + r6], 1
6477
+ movu ym9, [r0 + 2 * r1 + 8]
6478
+ vinserti32x8 m9, [r0 + r6 + 8], 1
6479
+ movu ym10, [r0 + 2 * r1 + 16]
6480
+ vinserti32x8 m10, [r0 + r6 + 16], 1
6481
+
6482
+ pshufb m11, m8, m5
6483
+ pshufb m8, m4
6484
+ pmaddwd m8, m0
6485
+ pmaddwd m11, m1
6486
+ paddd m8, m11
6487
+ pshufb m12, m9, m5
6488
+ pshufb m9, m4
6489
+ pmaddwd m11, m12, m3
6490
+ pmaddwd m14, m9, m2
6491
+ paddd m11, m14
6492
+ paddd m8, m11
6493
+ paddd m8, m6
6494
+ psrad m8, INTERP_SHIFT_PS
6495
+
6496
+ pshufb m13, m10, m5
6497
+ pshufb m10, m4
6498
+ pmaddwd m9, m0
6499
+ pmaddwd m12, m1
6500
+ paddd m9, m12
6501
+ pmaddwd m12, m13, m3
6502
+ pmaddwd m14, m10, m2
6503
+ paddd m12, m14
6504
+ paddd m9, m12
6505
+ paddd m9, m6
6506
+ psrad m9, INTERP_SHIFT_PS
6507
+
6508
+ packssdw m8, m9
6509
+ pshufb m8, m7
6510
+ movu [r2 + 2 * r3], ym8
6511
+ vextracti32x8 [r2 + r7], m8, 1
6512
+%endmacro
6513
+
6514
+%macro PROCESS_IPFILTER_LUMA_PS_16x3_AVX512 0
6515
+ movu ym8, [r0]
6516
+ vinserti32x8 m8, [r0 + r1], 1
6517
+ movu ym9, [r0 + 8]
6518
+ vinserti32x8 m9, [r0 + r1 + 8], 1
6519
+ movu ym10, [r0 + 16]
6520
+ vinserti32x8 m10, [r0 + r1 + 16], 1
6521
+
6522
+ pshufb m11, m8, m5
6523
+ pshufb m8, m4
6524
+ pmaddwd m8, m0
6525
+ pmaddwd m11, m1
6526
+ paddd m8, m11
6527
+ pshufb m12, m9, m5
6528
+ pshufb m9, m4
6529
+ pmaddwd m11, m12, m3
6530
+ pmaddwd m14, m9, m2
6531
+ paddd m11, m14
6532
+ paddd m8, m11
6533
+ paddd m8, m6
6534
+ psrad m8, INTERP_SHIFT_PS
6535
+
6536
+ pshufb m13, m10, m5
6537
+ pshufb m10, m4
6538
+ pmaddwd m9, m0
6539
+ pmaddwd m12, m1
6540
+ paddd m9, m12
6541
+ pmaddwd m13, m3
6542
+ pmaddwd m10, m2
6543
+ paddd m10, m13
6544
+ paddd m9, m10
6545
+ paddd m9, m6
6546
+ psrad m9, INTERP_SHIFT_PS
6547
+
6548
+ packssdw m8, m9
6549
+ pshufb m8, m7
6550
+ movu [r2], ym8
6551
+ vextracti32x8 [r2 + r3],m8, 1
6552
+
6553
+ movu ym8, [r0 + 2 * r1]
6554
+ movu ym9, [r0 + 2 * r1 + 8]
6555
+ movu ym10, [r0 + 2 * r1 + 16]
6556
+
6557
+ pshufb ym11, ym8, ym5
6558
+ pshufb ym8, ym4
6559
+ pmaddwd ym8, ym0
6560
+ pmaddwd ym11, ym1
6561
+ paddd ym8, ym11
6562
+ pshufb ym12, ym9, ym5
6563
+ pshufb ym9, ym4
6564
+ pmaddwd ym11, ym12, ym3
6565
+ pmaddwd ym14, ym9, ym2
6566
+ paddd ym11, ym14
6567
+ paddd ym8, ym11
6568
+ paddd ym8, ym6
6569
+ psrad ym8, INTERP_SHIFT_PS
6570
+
6571
+ pshufb ym13, ym10, ym5
6572
+ pshufb ym10, ym4
6573
+ pmaddwd ym9, ym0
6574
+ pmaddwd ym12, ym1
6575
+ paddd ym9, ym12
6576
+ pmaddwd ym12, ym13, ym3
6577
+ pmaddwd ym14, ym10, ym2
6578
+ paddd ym12, ym14
6579
+ paddd ym9, ym12
6580
+ paddd ym9, ym6
6581
+ psrad ym9, INTERP_SHIFT_PS
6582
+
6583
+ packssdw ym8, ym9
6584
+ pshufb ym8, ym7
6585
+ movu [r2 + 2 * r3], ym8
6586
+%endmacro
6587
+
6588
+
6589
+%macro IPFILTER_LUMA_PS_AVX512_16xN 1
6590
+INIT_ZMM avx512
6591
+cglobal interp_8tap_horiz_ps_16x%1, 4,9,15
6592
+ shl r1d, 1
6593
+ shl r3d, 1
6594
+ mov r4d, r4m
6595
+ mov r5d, r5m
6596
+ shl r4d, 6
6597
+
6598
+ lea r6, [3 * r1]
6599
+ lea r7, [3 * r3]
6600
+%ifdef PIC
6601
+ lea r8, [tab_LumaCoeffH_avx512]
6602
+ vpbroadcastd m0, [r8 + r4]
6603
+ vpbroadcastd m1, [r8 + r4 + 4]
6604
+ vpbroadcastd m2, [r8 + r4 + 8]
6605
+ vpbroadcastd m3, [r8 + r4 + 12]
6606
+%else
6607
+ vpbroadcastd m0, [tab_LumaCoeffH_avx512 + r4]
6608
+ vpbroadcastd m1, [tab_LumaCoeffH_avx512 + r4 + 4]
6609
+ vpbroadcastd m2, [tab_LumaCoeffH_avx512 + r4 + 8]
6610
+ vpbroadcastd m3, [tab_LumaCoeffH_avx512 + r4 + 12]
6611
+%endif
6612
+ vbroadcasti32x8 m4, [interp8_hpp_shuf1_load_avx512]
6613
+ vbroadcasti32x8 m5, [interp8_hpp_shuf2_load_avx512]
6614
+ vbroadcasti32x4 m6, [INTERP_OFFSET_PS]
6615
+ vbroadcasti32x8 m7, [interp8_hpp_shuf1_store_avx512]
6616
+
6617
+ sub r0, 6
6618
+ mov r4d, %1
6619
+ test r5d, r5d
6620
+ jz .loop
6621
+ lea r6, [r1 * 3]
6622
+ sub r0, r6
6623
+ add r4d, 7
6624
+ PROCESS_IPFILTER_LUMA_PS_16x3_AVX512
6625
+ lea r0, [r0 + r6]
6626
+ lea r2, [r2 + r7]
6627
+ sub r4d, 3
6628
+
6629
+.loop:
6630
+ PROCESS_IPFILTER_LUMA_PS_16x4_AVX512
6631
+ lea r0, [r0 + 4 * r1]
6632
+ lea r2, [r2 + 4 * r3]
6633
+ sub r4d, 4
6634
+ jnz .loop
6635
+ RET
6636
+%endmacro
6637
+
6638
+%if ARCH_X86_64
6639
+IPFILTER_LUMA_PS_AVX512_16xN 4
6640
+IPFILTER_LUMA_PS_AVX512_16xN 8
6641
+IPFILTER_LUMA_PS_AVX512_16xN 12
6642
+IPFILTER_LUMA_PS_AVX512_16xN 16
6643
+IPFILTER_LUMA_PS_AVX512_16xN 32
6644
+IPFILTER_LUMA_PS_AVX512_16xN 64
6645
+%endif
6646
+
6647
+%macro PROCESS_IPFILTER_LUMA_PS_48x4_AVX512 0
6648
+ ; register map
6649
+ ; m0, m1, m2, m3 - interpolate coeff
6650
+ ; m4, m5 - shuffle load order table
6651
+ ; m6 - INTERP_OFFSET_PS
6652
+ ; m7 - shuffle store order table
6653
+
6654
+ movu m8, [r0]
6655
+ movu m9, [r0 + 8]
6656
+ movu m10, [r0 + 16]
6657
+
6658
+ pshufb m11, m8, m5
6659
+ pshufb m8, m4
6660
+ pmaddwd m8, m0
6661
+ pmaddwd m11, m1
6662
+ paddd m8, m11
6663
+ pshufb m12, m9, m5
6664
+ pshufb m9, m4
6665
+ pmaddwd m11, m12, m3
6666
+ pmaddwd m14, m9, m2
6667
+ paddd m11, m14
6668
+ paddd m8, m11
6669
+ paddd m8, m6
6670
+ psrad m8, INTERP_SHIFT_PS
6671
+
6672
+ pshufb m13, m10, m5
6673
+ pshufb m10, m4
6674
+ pmaddwd m9, m0
6675
+ pmaddwd m12, m1
6676
+ paddd m9, m12
6677
+ pmaddwd m13, m3
6678
+ pmaddwd m10, m2
6679
+ paddd m10, m13
6680
+ paddd m9, m10
6681
+ paddd m9, m6
6682
+ psrad m9, INTERP_SHIFT_PS
6683
+
6684
+ packssdw m8, m9
6685
+ pshufb m8, m7
6686
+ movu [r2], m8
6687
+
6688
+ movu m8, [r0 + r1]
6689
+ movu m9, [r0 + r1 + 8]
6690
+ movu m10, [r0 + r1 + 16]
6691
+
6692
+ pshufb m11, m8, m5
6693
+ pshufb m8, m4
6694
+ pmaddwd m8, m0
6695
+ pmaddwd m11, m1
6696
+ paddd m8, m11
6697
+ pshufb m12, m9, m5
6698
+ pshufb m9, m4
6699
+ pmaddwd m11, m12, m3
6700
+ pmaddwd m14, m9, m2
6701
+ paddd m11, m14
6702
+ paddd m8, m11
6703
+ paddd m8, m6
6704
+ psrad m8, INTERP_SHIFT_PS
6705
+
6706
+ pshufb m13, m10, m5
6707
+ pshufb m10, m4
6708
+ pmaddwd m9, m0
6709
+ pmaddwd m12, m1
6710
+ paddd m9, m12
6711
+ pmaddwd m12, m13, m3
6712
+ pmaddwd m14, m10, m2
6713
+ paddd m12, m14
6714
+ paddd m9, m12
6715
+ paddd m9, m6
6716
+ psrad m9, INTERP_SHIFT_PS
6717
+
6718
+ packssdw m8, m9
6719
+ pshufb m8, m7
6720
+ movu [r2 + r3],m8
6721
+
6722
+ movu m8, [r0 + 2 * r1]
6723
+ movu m9, [r0 + 2 * r1 + 8]
6724
+ movu m10, [r0 + 2 * r1 + 16]
6725
+
6726
+ pshufb m11, m8, m5
6727
+ pshufb m8, m4
6728
+ pmaddwd m8, m0
6729
+ pmaddwd m11, m1
6730
+ paddd m8, m11
6731
+ pshufb m12, m9, m5
6732
+ pshufb m9, m4
6733
+ pmaddwd m11, m12, m3
6734
+ pmaddwd m14, m9, m2
6735
+ paddd m11, m14
6736
+ paddd m8, m11
6737
+ paddd m8, m6
6738
+ psrad m8, INTERP_SHIFT_PS
6739
+
6740
+ pshufb m13, m10, m5
6741
+ pshufb m10, m4
6742
+ pmaddwd m9, m0
6743
+ pmaddwd m12, m1
6744
+ paddd m9, m12
6745
+ pmaddwd m13, m3
6746
+ pmaddwd m10, m2
6747
+ paddd m10, m13
6748
+ paddd m9, m10
6749
+ paddd m9, m6
6750
+ psrad m9, INTERP_SHIFT_PS
6751
+
6752
+ packssdw m8, m9
6753
+ pshufb m8, m7
6754
+ movu [r2 + 2 * r3], m8
6755
+
6756
+ movu m8, [r0 + r6]
6757
+ movu m9, [r0 + r6 + 8]
6758
+ movu m10, [r0 + r6 + 16]
6759
+
6760
+ pshufb m11, m8, m5
6761
+ pshufb m8, m4
6762
+ pmaddwd m8, m0
6763
+ pmaddwd m11, m1
6764
+ paddd m8, m11
6765
+ pshufb m12, m9, m5
6766
+ pshufb m9, m4
6767
+ pmaddwd m11, m12, m3
6768
+ pmaddwd m14, m9, m2
6769
+ paddd m11, m14
6770
+ paddd m8, m11
6771
+ paddd m8, m6
6772
+ psrad m8, INTERP_SHIFT_PS
6773
+
6774
+ pshufb m13, m10, m5
6775
+ pshufb m10, m4
6776
+ pmaddwd m9, m0
6777
+ pmaddwd m12, m1
6778
+ paddd m9, m12
6779
+ pmaddwd m12, m13, m3
6780
+ pmaddwd m14, m10, m2
6781
+ paddd m12, m14
6782
+ paddd m9, m12
6783
+ paddd m9, m6
6784
+ psrad m9, INTERP_SHIFT_PS
6785
+
6786
+ packssdw m8, m9
6787
+ pshufb m8, m7
6788
+ movu [r2 + r7],m8
6789
+
6790
+ movu ym8, [r0 + mmsize]
6791
+ vinserti32x8 m8, [r0 + r1 + mmsize], 1
6792
+ movu ym9, [r0 + mmsize + 8]
6793
+ vinserti32x8 m9, [r0 + r1 + mmsize + 8], 1
6794
+ movu ym10, [r0 + mmsize + 16]
6795
+ vinserti32x8 m10, [r0 + r1 + mmsize + 16], 1
6796
+
6797
+ pshufb m11, m8, m5
6798
+ pshufb m8, m4
6799
+ pmaddwd m8, m0
6800
+ pmaddwd m11, m1
6801
+ paddd m8, m11
6802
+ pshufb m12, m9, m5
6803
+ pshufb m9, m4
6804
+ pmaddwd m11, m12, m3
6805
+ pmaddwd m14, m9, m2
6806
+ paddd m11, m14
6807
+ paddd m8, m11
6808
+ paddd m8, m6
6809
+ psrad m8, INTERP_SHIFT_PS
6810
+
6811
+ pshufb m13, m10, m5
6812
+ pshufb m10, m4
6813
+ pmaddwd m9, m0
6814
+ pmaddwd m12, m1
6815
+ paddd m9, m12
6816
+ pmaddwd m13, m3
6817
+ pmaddwd m10, m2
6818
+ paddd m10, m13
6819
+ paddd m9, m10
6820
+ paddd m9, m6
6821
+ psrad m9, INTERP_SHIFT_PS
6822
+
6823
+ packssdw m8, m9
6824
+ pshufb m8, m7
6825
+ movu [r2 + mmsize], ym8
6826
+ vextracti32x8 [r2 + r3 + mmsize], m8, 1
6827
+
6828
+ movu ym8, [r0 + 2 * r1 + mmsize]
6829
+ vinserti32x8 m8, [r0 + r6 + mmsize], 1
6830
+ movu ym9, [r0 + 2 * r1 + mmsize + 8]
6831
+ vinserti32x8 m9, [r0 + r6 + mmsize + 8], 1
6832
+ movu ym10, [r0 + 2 * r1 + mmsize + 16]
6833
+ vinserti32x8 m10, [r0 + r6 + mmsize + 16], 1
6834
+
6835
+ pshufb m11, m8, m5
6836
+ pshufb m8, m4
6837
+ pmaddwd m8, m0
6838
+ pmaddwd m11, m1
6839
+ paddd m8, m11
6840
+ pshufb m12, m9, m5
6841
+ pshufb m9, m4
6842
+ pmaddwd m11, m12, m3
6843
+ pmaddwd m14, m9, m2
6844
+ paddd m11, m14
6845
+ paddd m8, m11
6846
+ paddd m8, m6
6847
+ psrad m8, INTERP_SHIFT_PS
6848
+
6849
+ pshufb m13, m10, m5
6850
+ pshufb m10, m4
6851
+ pmaddwd m9, m0
6852
+ pmaddwd m12, m1
6853
+ paddd m9, m12
6854
+ pmaddwd m12, m13, m3
6855
+ pmaddwd m14, m10, m2
6856
+ paddd m12, m14
6857
+ paddd m9, m12
6858
+ paddd m9, m6
6859
+ psrad m9, INTERP_SHIFT_PS
6860
+
6861
+ packssdw m8, m9
6862
+ pshufb m8, m7
6863
+ movu [r2 + 2 * r3 + mmsize], ym8
6864
+ vextracti32x8 [r2 + r7 + mmsize], m8, 1
6865
+%endmacro
6866
+
6867
+%macro PROCESS_IPFILTER_LUMA_PS_48x3_AVX512 0
6868
+ movu m8, [r0]
6869
+ movu m9, [r0 + 8]
6870
+ movu m10, [r0 + 16]
6871
+
6872
+ pshufb m11, m8, m5
6873
+ pshufb m8, m4
6874
+ pmaddwd m8, m0
6875
+ pmaddwd m11, m1
6876
+ paddd m8, m11
6877
+ pshufb m12, m9, m5
6878
+ pshufb m9, m4
6879
+ pmaddwd m11, m12, m3
6880
+ pmaddwd m14, m9, m2
6881
+ paddd m11, m14
6882
+ paddd m8, m11
6883
+ paddd m8, m6
6884
+ psrad m8, INTERP_SHIFT_PS
6885
+
6886
+ pshufb m13, m10, m5
6887
+ pshufb m10, m4
6888
+ pmaddwd m9, m0
6889
+ pmaddwd m12, m1
6890
+ paddd m9, m12
6891
+ pmaddwd m13, m3
6892
+ pmaddwd m10, m2
6893
+ paddd m10, m13
6894
+ paddd m9, m10
6895
+ paddd m9, m6
6896
+ psrad m9, INTERP_SHIFT_PS
6897
+
6898
+ packssdw m8, m9
6899
+ pshufb m8, m7
6900
+ movu [r2], m8
6901
+
6902
+ movu m8, [r0 + r1]
6903
+ movu m9, [r0 + r1 + 8]
6904
+ movu m10, [r0 + r1 + 16]
6905
+
6906
+ pshufb m11, m8, m5
6907
+ pshufb m8, m4
6908
+ pmaddwd m8, m0
6909
+ pmaddwd m11, m1
6910
+ paddd m8, m11
6911
+ pshufb m12, m9, m5
6912
+ pshufb m9, m4
6913
+ pmaddwd m11, m12, m3
6914
+ pmaddwd m14, m9, m2
6915
+ paddd m11, m14
6916
+ paddd m8, m11
6917
+ paddd m8, m6
6918
+ psrad m8, INTERP_SHIFT_PS
6919
+
6920
+ pshufb m13, m10, m5
6921
+ pshufb m10, m4
6922
+ pmaddwd m9, m0
6923
+ pmaddwd m12, m1
6924
+ paddd m9, m12
6925
+ pmaddwd m12, m13, m3
6926
+ pmaddwd m14, m10, m2
6927
+ paddd m12, m14
6928
+ paddd m9, m12
6929
+ paddd m9, m6
6930
+ psrad m9, INTERP_SHIFT_PS
6931
+
6932
+ packssdw m8, m9
6933
+ pshufb m8, m7
6934
+ movu [r2 + r3],m8
6935
+
6936
+ movu m8, [r0 + 2 * r1]
6937
+ movu m9, [r0 + 2 * r1 + 8]
6938
+ movu m10, [r0 + 2 * r1 + 16]
6939
+
6940
+ pshufb m11, m8, m5
6941
+ pshufb m8, m4
6942
+ pmaddwd m8, m0
6943
+ pmaddwd m11, m1
6944
+ paddd m8, m11
6945
+ pshufb m12, m9, m5
6946
+ pshufb m9, m4
6947
+ pmaddwd m11, m12, m3
6948
+ pmaddwd m14, m9, m2
6949
+ paddd m11, m14
6950
+ paddd m8, m11
6951
+ paddd m8, m6
6952
+ psrad m8, INTERP_SHIFT_PS
6953
+
6954
+ pshufb m13, m10, m5
6955
+ pshufb m10, m4
6956
+ pmaddwd m9, m0
6957
+ pmaddwd m12, m1
6958
+ paddd m9, m12
6959
+ pmaddwd m13, m3
6960
+ pmaddwd m10, m2
6961
+ paddd m10, m13
6962
+ paddd m9, m10
6963
+ paddd m9, m6
6964
+ psrad m9, INTERP_SHIFT_PS
6965
+
6966
+ packssdw m8, m9
6967
+ pshufb m8, m7
6968
+ movu [r2 + 2 * r3], m8
6969
+
6970
+ movu ym8, [r0 + mmsize]
6971
+ vinserti32x8 m8, [r0 + r1 + mmsize], 1
6972
+ movu ym9, [r0 + mmsize + 8]
6973
+ vinserti32x8 m9, [r0 + r1 + mmsize + 8], 1
6974
+ movu ym10, [r0 + mmsize + 16]
6975
+ vinserti32x8 m10, [r0 + r1 + mmsize + 16], 1
6976
+
6977
+ pshufb m11, m8, m5
6978
+ pshufb m8, m4
6979
+ pmaddwd m8, m0
6980
+ pmaddwd m11, m1
6981
+ paddd m8, m11
6982
+ pshufb m12, m9, m5
6983
+ pshufb m9, m4
6984
+ pmaddwd m11, m12, m3
6985
+ pmaddwd m14, m9, m2
6986
+ paddd m11, m14
6987
+ paddd m8, m11
6988
+ paddd m8, m6
6989
+ psrad m8, INTERP_SHIFT_PS
6990
+
6991
+ pshufb m13, m10, m5
6992
+ pshufb m10, m4
6993
+ pmaddwd m9, m0
6994
+ pmaddwd m12, m1
6995
+ paddd m9, m12
6996
+ pmaddwd m13, m3
6997
+ pmaddwd m10, m2
6998
+ paddd m10, m13
6999
+ paddd m9, m10
7000
+ paddd m9, m6
7001
+ psrad m9, INTERP_SHIFT_PS
7002
+
7003
+ packssdw m8, m9
7004
+ pshufb m8, m7
7005
+ movu [r2 + mmsize], ym8
7006
+ vextracti32x8 [r2 + r3 + mmsize], m8, 1
7007
+
7008
+ movu ym8, [r0 + 2 * r1 + mmsize]
7009
+ movu ym9, [r0 + 2 * r1 + mmsize + 8]
7010
+ movu ym10, [r0 + 2 * r1 + mmsize + 16]
7011
+
7012
+ pshufb ym11, ym8, ym5
7013
+ pshufb ym8, ym4
7014
+ pmaddwd ym8, ym0
7015
+ pmaddwd ym11, ym1
7016
+ paddd ym8, ym11
7017
+ pshufb ym12, ym9, ym5
7018
+ pshufb ym9, ym4
7019
+ pmaddwd ym11, ym12, ym3
7020
+ pmaddwd ym14, ym9, ym2
7021
+ paddd ym11, ym14
7022
+ paddd ym8, ym11
7023
+ paddd ym8, ym6
7024
+ psrad ym8, INTERP_SHIFT_PS
7025
+
7026
+ pshufb ym13, ym10, ym5
7027
+ pshufb ym10, ym4
7028
+ pmaddwd ym9, ym0
7029
+ pmaddwd ym12, ym1
7030
+ paddd ym9, ym12
7031
+ pmaddwd ym12, ym13, ym3
7032
+ pmaddwd ym14, ym10, ym2
7033
+ paddd ym12, ym14
7034
+ paddd ym9, ym12
7035
+ paddd ym9, ym6
7036
+ psrad ym9, INTERP_SHIFT_PS
7037
+
7038
+ packssdw ym8, ym9
7039
+ pshufb ym8, ym7
7040
+ movu [r2 + 2 * r3 + mmsize], ym8
7041
+%endmacro
7042
+
7043
+%if ARCH_X86_64
7044
+INIT_ZMM avx512
7045
+cglobal interp_8tap_horiz_ps_48x64, 4,9,15
7046
+ shl r1d, 1
7047
+ shl r3d, 1
7048
+ mov r4d, r4m
7049
+ mov r5d, r5m
7050
+ shl r4d, 6
7051
+ lea r6, [3 * r1]
7052
+ lea r7, [3 * r3]
7053
+%ifdef PIC
7054
+ lea r8, [tab_LumaCoeffH_avx512]
7055
+ vpbroadcastd m0, [r8 + r4]
7056
+ vpbroadcastd m1, [r8 + r4 + 4]
7057
+ vpbroadcastd m2, [r8 + r4 + 8]
7058
+ vpbroadcastd m3, [r8 + r4 + 12]
7059
+%else
7060
+ vpbroadcastd m0, [tab_LumaCoeffH_avx512 + r4]
7061
+ vpbroadcastd m1, [tab_LumaCoeffH_avx512 + r4 + 4]
7062
+ vpbroadcastd m2, [tab_LumaCoeffH_avx512 + r4 + 8]
7063
+ vpbroadcastd m3, [tab_LumaCoeffH_avx512 + r4 + 12]
7064
+%endif
7065
+ vbroadcasti32x8 m4, [interp8_hpp_shuf1_load_avx512]
7066
+ vbroadcasti32x8 m5, [interp8_hpp_shuf2_load_avx512]
7067
+ vbroadcasti32x4 m6, [INTERP_OFFSET_PS]
7068
+ vbroadcasti32x8 m7, [interp8_hpp_shuf1_store_avx512]
7069
+
7070
+ sub r0, 6
7071
+ mov r4d, 64
7072
+ test r5d, r5d
7073
+ jz .loop
7074
+ lea r6, [r1 * 3]
7075
+ sub r0, r6
7076
+ add r4d, 7
7077
+ PROCESS_IPFILTER_LUMA_PS_48x4_AVX512
7078
+ lea r0, [r0 + r6]
7079
+ lea r2, [r2 + r7]
7080
+ sub r4d, 3
7081
+
7082
+.loop:
7083
+ PROCESS_IPFILTER_LUMA_PS_48x4_AVX512
7084
+ lea r0, [r0 + 4 * r1]
7085
+ lea r2, [r2 + 4 * r3]
7086
+ sub r4d, 4
7087
+ jnz .loop
7088
+ RET
7089
+%endif
7090
+
7091
+%macro PROCESS_IPFILTER_LUMA_PS_24x4_AVX512 0
7092
+ ; register map
7093
+ ; m0 , m1, m2, m3 - interpolate coeff table
7094
+ ; m4 , m5 - load shuffle order table
7095
+ ; m6 - INTERP_OFFSET_PS
7096
+ ; m7 - store shuffle order table
7097
+
7098
+ PROCESS_IPFILTER_LUMA_PS_16x4_AVX512
7099
+
7100
+ movu xm8, [r0 + mmsize/2]
7101
+ movu xm9, [r0 + mmsize/2 + 8]
7102
+ movu xm10, [r0 + mmsize/2 + 16]
7103
+
7104
+ vinserti32x4 m8, [r0 + r1 + mmsize/2], 1
7105
+ vinserti32x4 m9, [r0 + r1 + mmsize/2 + 8], 1
7106
+ vinserti32x4 m10, [r0 + r1 + mmsize/2 + 16], 1
7107
+
7108
+ vinserti32x4 m8, [r0 + 2 * r1 + mmsize/2], 2
7109
+ vinserti32x4 m9, [r0 + 2 * r1 + mmsize/2 + 8], 2
7110
+ vinserti32x4 m10, [r0 + 2 * r1 + mmsize/2 + 16], 2
7111
+
7112
+ vinserti32x4 m8, [r0 + r6 + mmsize/2], 3
7113
+ vinserti32x4 m9, [r0 + r6 + mmsize/2 + 8], 3
7114
+ vinserti32x4 m10, [r0 + r6 + mmsize/2 + 16], 3
7115
+
7116
+ pshufb m11, m8, m5
7117
+ pshufb m8, m4
7118
+ pmaddwd m8, m0
7119
+ pmaddwd m11, m1
7120
+ paddd m8, m11
7121
+ pshufb m12, m9, m5
7122
+ pshufb m9, m4
7123
+ pmaddwd m11, m12, m3
7124
+ pmaddwd m14, m9, m2
7125
+ paddd m11, m14
7126
+
7127
+ paddd m8, m11
7128
+ paddd m8, m6
7129
+ psrad m8, INTERP_SHIFT_PS
7130
+
7131
+ pshufb m13, m10, m5
7132
+ pshufb m10, m4
7133
+ pmaddwd m9, m0
7134
+ pmaddwd m12, m1
7135
+ paddd m9, m12
7136
+ pmaddwd m13, m3
7137
+ pmaddwd m10, m2
7138
+ paddd m10, m13
7139
+
7140
+ paddd m9, m10
7141
+ paddd m9, m6
7142
+ psrad m9, INTERP_SHIFT_PS
7143
+
7144
+ packssdw m8, m9
7145
+ pshufb m8, m7
7146
+ movu [r2 + mmsize/2], xm8
7147
+ vextracti32x4 [r2 + r3 + mmsize/2], m8, 1
7148
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m8, 2
7149
+ vextracti32x4 [r2 + r7 + mmsize/2], m8, 3
7150
+%endmacro
7151
+
7152
+%macro PROCESS_IPFILTER_LUMA_PS_24x3_AVX512 0
7153
+
7154
+ PROCESS_IPFILTER_LUMA_PS_16x3_AVX512
7155
+
7156
+ movu xm8, [r0 + mmsize/2]
7157
+ movu xm9, [r0 + mmsize/2 + 8]
7158
+ movu xm10, [r0 + mmsize/2 + 16]
7159
+
7160
+ vinserti32x4 m8, [r0 + r1 + mmsize/2], 1
7161
+ vinserti32x4 m9, [r0 + r1 + mmsize/2 + 8], 1
7162
+ vinserti32x4 m10, [r0 + r1 + mmsize/2 + 16], 1
7163
+
7164
+ vinserti32x4 m8, [r0 + 2 * r1 + mmsize/2], 2
7165
+ vinserti32x4 m9, [r0 + 2 * r1 + mmsize/2 + 8], 2
7166
+ vinserti32x4 m10, [r0 + 2 * r1 + mmsize/2 + 16], 2
7167
+
7168
+ pshufb m11, m8, m5
7169
+ pshufb m8, m4
7170
+ pmaddwd m8, m0
7171
+ pmaddwd m11, m1
7172
+ paddd m8, m11
7173
+ pshufb m12, m9, m5
7174
+ pshufb m9, m4
7175
+ pmaddwd m11, m12, m3
7176
+ pmaddwd m14, m9, m2
7177
+ paddd m11, m14
7178
+
7179
+ paddd m8, m11
7180
+ paddd m8, m6
7181
+ psrad m8, INTERP_SHIFT_PS
7182
+
7183
+ pshufb m13, m10, m5
7184
+ pshufb m10, m4
7185
+ pmaddwd m9, m0
7186
+ pmaddwd m12, m1
7187
+ paddd m9, m12
7188
+ pmaddwd m13, m3
7189
+ pmaddwd m10, m2
7190
+ paddd m10, m13
7191
+
7192
+ paddd m9, m10
7193
+ paddd m9, m6
7194
+ psrad m9, INTERP_SHIFT_PS
7195
+
7196
+ packssdw m8, m9
7197
+ pshufb m8, m7
7198
+ movu [r2 + mmsize/2], xm8
7199
+ vextracti32x4 [r2 + r3 + mmsize/2], m8, 1
7200
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m8, 2
7201
+%endmacro
7202
+
7203
+%if ARCH_X86_64
7204
+INIT_ZMM avx512
7205
+cglobal interp_8tap_horiz_ps_24x32, 4, 9, 15
7206
+ shl r1d, 1
7207
+ shl r3d, 1
7208
+ mov r4d, r4m
7209
+ mov r5d, r5m
7210
+ shl r4d, 6
7211
+
7212
+ lea r6, [3 * r1]
7213
+ lea r7, [3 * r3]
7214
+
7215
+%ifdef PIC
7216
+ lea r8, [tab_LumaCoeffH_avx512]
7217
+ vpbroadcastd m0, [r8 + r4]
7218
+ vpbroadcastd m1, [r8 + r4 + 4]
7219
+ vpbroadcastd m2, [r8 + r4 + 8]
7220
+ vpbroadcastd m3, [r8 + r4 + 12]
7221
+%else
7222
+ vpbroadcastd m0, [tab_LumaCoeffH_avx512 + r4]
7223
+ vpbroadcastd m1, [tab_LumaCoeffH_avx512 + r4 + 4]
7224
+ vpbroadcastd m2, [tab_LumaCoeffH_avx512 + r4 + 8]
7225
+ vpbroadcastd m3, [tab_LumaCoeffH_avx512 + r4 + 12]
7226
+%endif
7227
+ vbroadcasti32x8 m4, [interp8_hpp_shuf1_load_avx512]
7228
+ vbroadcasti32x8 m5, [interp8_hpp_shuf2_load_avx512]
7229
+ vbroadcasti32x4 m6, [INTERP_OFFSET_PS]
7230
+ vbroadcasti32x8 m7, [interp8_hpp_shuf1_store_avx512]
7231
+
7232
+ sub r0, 6
7233
+ mov r4d, 32
7234
+ test r5d, r5d
7235
+ jz .loop
7236
+ sub r0, r6
7237
+ add r4d, 7
7238
+ PROCESS_IPFILTER_LUMA_PS_24x3_AVX512
7239
+ lea r0, [r0 + r6]
7240
+ lea r2, [r2 + r7]
7241
+ sub r4d, 3
7242
+
7243
+.loop:
7244
+ PROCESS_IPFILTER_LUMA_PS_24x4_AVX512
7245
+ lea r0, [r0 + 4 * r1]
7246
+ lea r2, [r2 + 4 * r3]
7247
+ sub r4d, 4
7248
+ jnz .loop
7249
+ RET
7250
+%endif
7251
+%macro PROCESS_IPFILTER_LUMA_PS_8x4_AVX512 0
7252
+ ; register map
7253
+ ; m0 , m1, m2, m3 - interpolate coeff table
7254
+ ; m4 , m5 - load shuffle order table
7255
+ ; m6 - INTERP_OFFSET_PS
7256
+ ; m7 - store shuffle order table
7257
+
7258
+ movu xm8, [r0]
7259
+ movu xm9, [r0 + 8]
7260
+ movu xm10, [r0 + 16]
7261
+
7262
+ vinserti32x4 m8, [r0 + r1], 1
7263
+ vinserti32x4 m9, [r0 + r1 + 8], 1
7264
+ vinserti32x4 m10, [r0 + r1 + 16], 1
7265
+
7266
+ vinserti32x4 m8, [r0 + 2 * r1], 2
7267
+ vinserti32x4 m9, [r0 + 2 * r1 + 8], 2
7268
+ vinserti32x4 m10, [r0 + 2 * r1 + 16], 2
7269
+
7270
+ vinserti32x4 m8, [r0 + r6], 3
7271
+ vinserti32x4 m9, [r0 + r6 + 8], 3
7272
+ vinserti32x4 m10, [r0 + r6 + 16], 3
7273
+
7274
+ pshufb m11, m8, m5
7275
+ pshufb m8, m4
7276
+ pmaddwd m8, m0
7277
+ pmaddwd m11, m1
7278
+ paddd m8, m11
7279
+ pshufb m12, m9, m5
7280
+ pshufb m9, m4
7281
+ pmaddwd m11, m12, m3
7282
+ pmaddwd m14, m9, m2
7283
+ paddd m11, m14
7284
+
7285
+ paddd m8, m11
7286
+ paddd m8, m6
7287
+ psrad m8, INTERP_SHIFT_PS
7288
+
7289
+ pshufb m13, m10, m5
7290
+ pshufb m10, m4
7291
+ pmaddwd m9, m0
7292
+ pmaddwd m12, m1
7293
+ paddd m9, m12
7294
+ pmaddwd m13, m3
7295
+ pmaddwd m10, m2
7296
+ paddd m10, m13
7297
+
7298
+ paddd m9, m10
7299
+ paddd m9, m6
7300
+ psrad m9, INTERP_SHIFT_PS
7301
+
7302
+ packssdw m8, m9
7303
+ pshufb m8, m7
7304
+ movu [r2], xm8
7305
+ vextracti32x4 [r2 + r3], m8, 1
7306
+ vextracti32x4 [r2 + 2 * r3], m8, 2
7307
+ vextracti32x4 [r2 + r7], m8, 3
7308
+%endmacro
7309
+
7310
+%macro PROCESS_IPFILTER_LUMA_PS_8x3_AVX512 0
7311
+ movu xm8, [r0]
7312
+ movu xm9, [r0 + 8]
7313
+ movu xm10, [r0 + 16]
7314
+
7315
+ vinserti32x4 m8, [r0 + r1], 1
7316
+ vinserti32x4 m9, [r0 + r1 + 8], 1
7317
+ vinserti32x4 m10, [r0 + r1 + 16], 1
7318
+
7319
+ vinserti32x4 m8, [r0 + 2 * r1], 2
7320
+ vinserti32x4 m9, [r0 + 2 * r1 + 8], 2
7321
+ vinserti32x4 m10, [r0 + 2 * r1 + 16], 2
7322
+
7323
+ pshufb m11, m8, m5
7324
+ pshufb m8, m4
7325
+ pmaddwd m8, m0
7326
+ pmaddwd m11, m1
7327
+ paddd m8, m11
7328
+ pshufb m12, m9, m5
7329
+ pshufb m9, m4
7330
+ pmaddwd m11, m12, m3
7331
+ pmaddwd m14, m9, m2
7332
+ paddd m11, m14
7333
+
7334
+ paddd m8, m11
7335
+ paddd m8, m6
7336
+ psrad m8, INTERP_SHIFT_PS
7337
+
7338
+ pshufb m13, m10, m5
7339
+ pshufb m10, m4
7340
+ pmaddwd m9, m0
7341
+ pmaddwd m12, m1
7342
+ paddd m9, m12
7343
+ pmaddwd m13, m3
7344
+ pmaddwd m10, m2
7345
+ paddd m10, m13
7346
+
7347
+ paddd m9, m10
7348
+ paddd m9, m6
7349
+ psrad m9, INTERP_SHIFT_PS
7350
+
7351
+ packssdw m8, m9
7352
+ pshufb m8, m7
7353
+ movu [r2], xm8
7354
+ vextracti32x4 [r2 + r3], m8, 1
7355
+ vextracti32x4 [r2 + 2 * r3], m8, 2
7356
+%endmacro
7357
+
7358
+%macro IPFILTER_LUMA_PS_AVX512_8xN 1
7359
+INIT_ZMM avx512
7360
+cglobal interp_8tap_horiz_ps_8x%1, 4, 9, 15
7361
+ shl r1d, 1
7362
+ shl r3d, 1
7363
+ mov r4d, r4m
7364
+ mov r5d, r5m
7365
+ shl r4d, 6
7366
+
7367
+ lea r6, [3 * r1]
7368
+ lea r7, [3 * r3]
7369
+
7370
+%ifdef PIC
7371
+ lea r8, [tab_LumaCoeffH_avx512]
7372
+ vpbroadcastd m0, [r8 + r4]
7373
+ vpbroadcastd m1, [r8 + r4 + 4]
7374
+ vpbroadcastd m2, [r8 + r4 + 8]
7375
+ vpbroadcastd m3, [r8 + r4 + 12]
7376
+%else
7377
+ vpbroadcastd m0, [tab_LumaCoeffH_avx512 + r4]
7378
+ vpbroadcastd m1, [tab_LumaCoeffH_avx512 + r4 + 4]
7379
+ vpbroadcastd m2, [tab_LumaCoeffH_avx512 + r4 + 8]
7380
+ vpbroadcastd m3, [tab_LumaCoeffH_avx512 + r4 + 12]
7381
+%endif
7382
+ vbroadcasti32x8 m4, [interp8_hpp_shuf1_load_avx512]
7383
+ vbroadcasti32x8 m5, [interp8_hpp_shuf2_load_avx512]
7384
+ vbroadcasti32x4 m6, [INTERP_OFFSET_PS]
7385
+ vbroadcasti32x8 m7, [interp8_hpp_shuf1_store_avx512]
7386
+
7387
+ sub r0, 6
7388
+ mov r4d, %1
7389
+ test r5d, r5d
7390
+ jz .loop
7391
+ sub r0, r6
7392
+ add r4d, 7
7393
+ PROCESS_IPFILTER_LUMA_PS_8x3_AVX512
7394
+ lea r0, [r0 + r6]
7395
+ lea r2, [r2 + r7]
7396
+ sub r4d, 3
7397
+
7398
+.loop:
7399
+ PROCESS_IPFILTER_LUMA_PS_8x4_AVX512
7400
+ lea r0, [r0 + 4 * r1]
7401
+ lea r2, [r2 + 4 * r3]
7402
+ sub r4d, 4
7403
+ jnz .loop
7404
+ RET
7405
+%endmacro
7406
+
7407
+%if ARCH_X86_64
7408
+ IPFILTER_LUMA_PS_AVX512_8xN 4
7409
+ IPFILTER_LUMA_PS_AVX512_8xN 8
7410
+ IPFILTER_LUMA_PS_AVX512_8xN 16
7411
+ IPFILTER_LUMA_PS_AVX512_8xN 32
7412
+%endif
7413
+
7414
+;-------------------------------------------------------------------------------------------------------------
7415
+;avx512 luma_hps code end
7416
+;-------------------------------------------------------------------------------------------------------------
7417
+;-------------------------------------------------------------------------------------------------------------
7418
+;avx512 luma_vss and luma_vsp code start
7419
+;-------------------------------------------------------------------------------------------------------------
7420
+%macro PROCESS_LUMA_VERT_S_8x8_AVX512 1
7421
+ lea r6, [r0 + 4 * r1]
7422
+ movu xm1, [r0] ;0 row
7423
+ vinserti32x4 m1, [r0 + 2 * r1], 1
7424
+ vinserti32x4 m1, [r0 + 4 * r1], 2
7425
+ vinserti32x4 m1, [r6 + 2 * r1], 3
7426
+ movu xm3, [r0 + r1] ;1 row
7427
+ vinserti32x4 m3, [r0 + r7], 1
7428
+ vinserti32x4 m3, [r6 + r1], 2
7429
+ vinserti32x4 m3, [r6 + r7], 3
7430
+ punpcklwd m0, m1, m3
7431
+ pmaddwd m0, m15
7432
+ punpckhwd m1, m3
7433
+ pmaddwd m1, m15
7434
+
7435
+ movu xm4, [r0 + 2 * r1] ;2 row
7436
+ vinserti32x4 m4, [r0 + 4 * r1], 1
7437
+ vinserti32x4 m4, [r6 + 2 * r1], 2
7438
+ vinserti32x4 m4, [r6 + 4 * r1], 3
7439
+ punpcklwd m2, m3, m4
7440
+ pmaddwd m2, m15
7441
+ punpckhwd m3, m4
7442
+ pmaddwd m3, m15
7443
+
7444
+ lea r4, [r6 + 4 * r1]
7445
+ movu xm5, [r0 + r7] ;3 row
7446
+ vinserti32x4 m5, [r6 + r1], 1
7447
+ vinserti32x4 m5, [r6 + r7], 2
7448
+ vinserti32x4 m5, [r4 + r1], 3
7449
+ punpcklwd m6, m4, m5
7450
+ pmaddwd m6, m16
7451
+ punpckhwd m4, m5
7452
+ pmaddwd m4, m16
7453
+
7454
+ paddd m0, m6
7455
+ paddd m1, m4
7456
+
7457
+ movu xm4, [r0 + 4 * r1] ;4 row
7458
+ vinserti32x4 m4, [r6 + 2 * r1], 1
7459
+ vinserti32x4 m4, [r6 + 4 * r1], 2
7460
+ vinserti32x4 m4, [r4 + 2 * r1], 3
7461
+ punpcklwd m6, m5, m4
7462
+ pmaddwd m6, m16
7463
+ punpckhwd m5, m4
7464
+ pmaddwd m5, m16
7465
+
7466
+ paddd m2, m6
7467
+ paddd m3, m5
7468
+
7469
+ movu xm11, [r6 + r1] ;5 row
7470
+ vinserti32x4 m11, [r6 + r7], 1
7471
+ vinserti32x4 m11, [r4 + r1], 2
7472
+ vinserti32x4 m11, [r4 + r7], 3
7473
+ punpcklwd m8, m4, m11
7474
+ pmaddwd m8, m17
7475
+ punpckhwd m4, m11
7476
+ pmaddwd m4, m17
7477
+
7478
+ movu xm12, [r6 + 2 * r1] ;6 row
7479
+ vinserti32x4 m12, [r6 + 4 * r1], 1
7480
+ vinserti32x4 m12, [r4 + 2 * r1], 2
7481
+ vinserti32x4 m12, [r4 + 4 * r1], 3
7482
+ punpcklwd m10, m11, m12
7483
+ pmaddwd m10, m17
7484
+ punpckhwd m11, m12
7485
+ pmaddwd m11, m17
7486
+
7487
+ lea r8, [r4 + 4 * r1]
7488
+ movu xm13, [r6 + r7] ;7 row
7489
+ vinserti32x4 m13, [r4 + r1], 1
7490
+ vinserti32x4 m13, [r4 + r7], 2
7491
+ vinserti32x4 m13, [r8 + r1], 3
7492
+ punpcklwd m14, m12, m13
7493
+ pmaddwd m14, m18
7494
+ punpckhwd m12, m13
7495
+ pmaddwd m12, m18
7496
+
7497
+ paddd m8, m14
7498
+ paddd m4, m12
7499
+ paddd m0, m8
7500
+ paddd m1, m4
7501
+
7502
+ movu xm12, [r6 + 4 * r1] ; 8 row
7503
+ vinserti32x4 m12, [r4 + 2 * r1], 1
7504
+ vinserti32x4 m12, [r4 + 4 * r1], 2
7505
+ vinserti32x4 m12, [r8 + 2 * r1], 3
7506
+ punpcklwd m14, m13, m12
7507
+ pmaddwd m14, m18
7508
+ punpckhwd m13, m12
7509
+ pmaddwd m13, m18
7510
+
7511
+ paddd m10, m14
7512
+ paddd m11, m13
7513
+ paddd m2, m10
7514
+ paddd m3, m11
7515
+
7516
+%ifidn %1, sp
7517
+ paddd m0, m19
7518
+ paddd m1, m19
7519
+ paddd m2, m19
7520
+ paddd m3, m19
7521
+
7522
+ psrad m0, INTERP_SHIFT_SP
7523
+ psrad m1, INTERP_SHIFT_SP
7524
+ psrad m2, INTERP_SHIFT_SP
7525
+ psrad m3, INTERP_SHIFT_SP
7526
+
7527
+ packssdw m0, m1
7528
+ packssdw m2, m3
7529
+ CLIPW2 m0, m2, m20, m21
7530
+%else
7531
+ psrad m0, 6
7532
+ psrad m1, 6
7533
+ psrad m2, 6
7534
+ psrad m3, 6
7535
+
7536
+ packssdw m0, m1
7537
+ packssdw m2, m3
7538
+%endif
7539
+
7540
+ movu [r2], xm0
7541
+ movu [r2 + r3], xm2
7542
+ vextracti32x4 [r2 + 2 * r3], m0, 1
7543
+ vextracti32x4 [r2 + r5], m2, 1
7544
+ lea r2, [r2 + 4 * r3]
7545
+ vextracti32x4 [r2], m0, 2
7546
+ vextracti32x4 [r2 + r3], m2, 2
7547
+ vextracti32x4 [r2 + 2 * r3], m0, 3
7548
+ vextracti32x4 [r2 + r5], m2, 3
7549
+%endmacro
7550
+;-----------------------------------------------------------------------------------------------------------------
7551
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
7552
+;-----------------------------------------------------------------------------------------------------------------
7553
+%macro FILTER_VER_S_LUMA_8xN_AVX512 2
7554
+INIT_ZMM avx512
7555
+cglobal interp_8tap_vert_%1_8x%2, 5, 9, 22
7556
+ add r1d, r1d
7557
+ add r3d, r3d
7558
+ lea r7, [3 * r1]
7559
+ sub r0, r7
7560
+ shl r4d, 8
7561
+%ifdef PIC
7562
+ lea r5, [tab_LumaCoeffVer_avx512]
7563
+ mova m15, [r5 + r4]
7564
+ mova m16, [r5 + r4 + 1 * mmsize]
7565
+ mova m17, [r5 + r4 + 2 * mmsize]
7566
+ mova m18, [r5 + r4 + 3 * mmsize]
7567
+%else
7568
+ lea r5, [tab_LumaCoeffVer_avx512 + r4]
7569
+ mova m15, [r5]
7570
+ mova m16, [r5 + 1 * mmsize]
7571
+ mova m17, [r5 + 2 * mmsize]
7572
+ mova m18, [r5 + 3 * mmsize]
7573
+%endif
7574
+%ifidn %1, sp
7575
+ vbroadcasti32x4 m19, [INTERP_OFFSET_SP]
7576
+ pxor m20, m20
7577
+ vbroadcasti32x8 m21, [pw_pixel_max]
7578
+%endif
7579
+ lea r5, [3 * r3]
7580
+
7581
+%rep %2/8 - 1
7582
+ PROCESS_LUMA_VERT_S_8x8_AVX512 %1
7583
+ lea r0, [r4]
7584
+ lea r2, [r2 + 4 * r3]
7585
+%endrep
7586
+ PROCESS_LUMA_VERT_S_8x8_AVX512 %1
7587
+ RET
7588
+%endmacro
7589
+
7590
+%if ARCH_X86_64
7591
+ FILTER_VER_S_LUMA_8xN_AVX512 ss, 8
7592
+ FILTER_VER_S_LUMA_8xN_AVX512 ss, 16
7593
+ FILTER_VER_S_LUMA_8xN_AVX512 ss, 32
7594
+ FILTER_VER_S_LUMA_8xN_AVX512 sp, 8
7595
+ FILTER_VER_S_LUMA_8xN_AVX512 sp, 16
7596
+ FILTER_VER_S_LUMA_8xN_AVX512 sp, 32
7597
+%endif
7598
+
7599
+%macro PROCESS_LUMA_VERT_S_16x4_AVX512 1
7600
+ movu ym1, [r0]
7601
+ movu ym3, [r0 + r1]
7602
+ vinserti32x8 m1, [r0 + 2 * r1], 1
7603
+ vinserti32x8 m3, [r0 + r7], 1
7604
+ punpcklwd m0, m1, m3
7605
+ pmaddwd m0, m15
7606
+ punpckhwd m1, m3
7607
+ pmaddwd m1, m15
7608
+
7609
+ lea r6, [r0 + 4 * r1]
7610
+ movu ym4, [r0 + 2 * r1]
7611
+ vinserti32x8 m4, [r6], 1
7612
+ punpcklwd m2, m3, m4
7613
+ pmaddwd m2, m15
7614
+ punpckhwd m3, m4
7615
+ pmaddwd m3, m15
7616
+
7617
+ movu ym5, [r0 + r7]
7618
+ vinserti32x8 m5, [r6 + r1], 1
7619
+ punpcklwd m6, m4, m5
7620
+ pmaddwd m6, m16
7621
+ punpckhwd m4, m5
7622
+ pmaddwd m4, m16
7623
+
7624
+ paddd m0, m6
7625
+ paddd m1, m4
7626
+
7627
+ movu ym4, [r6]
7628
+ vinserti32x8 m4, [r6 + 2 * r1], 1
7629
+ punpcklwd m6, m5, m4
7630
+ pmaddwd m6, m16
7631
+ punpckhwd m5, m4
7632
+ pmaddwd m5, m16
7633
+
7634
+ paddd m2, m6
7635
+ paddd m3, m5
7636
+
7637
+ movu ym11, [r6 + r1]
7638
+ vinserti32x8 m11, [r6 + r7], 1
7639
+ punpcklwd m8, m4, m11
7640
+ pmaddwd m8, m17
7641
+ punpckhwd m4, m11
7642
+ pmaddwd m4, m17
7643
+
7644
+ movu ym12, [r6 + 2 * r1]
7645
+ vinserti32x8 m12, [r6 + 4 * r1], 1
7646
+ punpcklwd m10, m11, m12
7647
+ pmaddwd m10, m17
7648
+ punpckhwd m11, m12
7649
+ pmaddwd m11, m17
7650
+
7651
+ lea r4, [r6 + 4 * r1]
7652
+ movu ym13, [r6 + r7]
7653
+ vinserti32x8 m13, [r4 + r1], 1
7654
+ punpcklwd m14, m12, m13
7655
+ pmaddwd m14, m18
7656
+ punpckhwd m12, m13
7657
+ pmaddwd m12, m18
7658
+
7659
+ paddd m8, m14
7660
+ paddd m4, m12
7661
+ paddd m0, m8
7662
+ paddd m1, m4
7663
+
7664
+ movu ym12, [r6 + 4 * r1]
7665
+ vinserti32x8 m12, [r4 + 2 * r1], 1
7666
+ punpcklwd m14, m13, m12
7667
+ pmaddwd m14, m18
7668
+ punpckhwd m13, m12
7669
+ pmaddwd m13, m18
7670
+
7671
+ paddd m10, m14
7672
+ paddd m11, m13
7673
+ paddd m2, m10
7674
+ paddd m3, m11
7675
+
7676
+%ifidn %1, sp
7677
+ paddd m0, m19
7678
+ paddd m1, m19
7679
+ paddd m2, m19
7680
+ paddd m3, m19
7681
+
7682
+ psrad m0, INTERP_SHIFT_SP
7683
+ psrad m1, INTERP_SHIFT_SP
7684
+ psrad m2, INTERP_SHIFT_SP
7685
+ psrad m3, INTERP_SHIFT_SP
7686
+
7687
+ packssdw m0, m1
7688
+ packssdw m2, m3
7689
+ CLIPW2 m0, m2, m20, m21
7690
+%else
7691
+ psrad m0, 6
7692
+ psrad m1, 6
7693
+ psrad m2, 6
7694
+ psrad m3, 6
7695
+
7696
+ packssdw m0, m1
7697
+ packssdw m2, m3
7698
+%endif
7699
+
7700
+ movu [r2], ym0
7701
+ movu [r2 + r3], ym2
7702
+ vextracti32x8 [r2 + 2 * r3], m0, 1
7703
+ vextracti32x8 [r2 + r5], m2, 1
7704
+%endmacro
7705
+;-----------------------------------------------------------------------------------------------------------------
7706
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
7707
+;-----------------------------------------------------------------------------------------------------------------
7708
+%macro FILTER_VER_S_LUMA_16xN_AVX512 2
7709
+INIT_ZMM avx512
7710
+cglobal interp_8tap_vert_%1_16x%2, 5, 8, 22
7711
+ add r1d, r1d
7712
+ add r3d, r3d
7713
+ lea r7, [3 * r1]
7714
+ sub r0, r7
7715
+ shl r4d, 8
7716
+%ifdef PIC
7717
+ lea r5, [tab_LumaCoeffVer_avx512]
7718
+ mova m15, [r5 + r4]
7719
+ mova m16, [r5 + r4 + 1 * mmsize]
7720
+ mova m17, [r5 + r4 + 2 * mmsize]
7721
+ mova m18, [r5 + r4 + 3 * mmsize]
7722
+%else
7723
+ lea r5, [tab_LumaCoeffVer_avx512 + r4]
7724
+ mova m15, [r5]
7725
+ mova m16, [r5 + 1 * mmsize]
7726
+ mova m17, [r5 + 2 * mmsize]
7727
+ mova m18, [r5 + 3 * mmsize]
7728
+%endif
7729
+%ifidn %1, sp
7730
+ vbroadcasti32x4 m19, [INTERP_OFFSET_SP]
7731
+ pxor m20, m20
7732
+ vbroadcasti32x8 m21, [pw_pixel_max]
7733
+%endif
7734
+ lea r5, [3 * r3]
7735
+%rep %2/4 - 1
7736
+ PROCESS_LUMA_VERT_S_16x4_AVX512 %1
7737
+ lea r0, [r0 + 4 * r1]
7738
+ lea r2, [r2 + 4 * r3]
7739
+%endrep
7740
+ PROCESS_LUMA_VERT_S_16x4_AVX512 %1
7741
+ RET
7742
+%endmacro
7743
+
7744
+%if ARCH_X86_64
7745
+ FILTER_VER_S_LUMA_16xN_AVX512 ss, 4
7746
+ FILTER_VER_S_LUMA_16xN_AVX512 ss, 8
7747
+ FILTER_VER_S_LUMA_16xN_AVX512 ss, 12
7748
+ FILTER_VER_S_LUMA_16xN_AVX512 ss, 16
7749
+ FILTER_VER_S_LUMA_16xN_AVX512 ss, 32
7750
+ FILTER_VER_S_LUMA_16xN_AVX512 ss, 64
7751
+ FILTER_VER_S_LUMA_16xN_AVX512 sp, 4
7752
+ FILTER_VER_S_LUMA_16xN_AVX512 sp, 8
7753
+ FILTER_VER_S_LUMA_16xN_AVX512 sp, 12
7754
+ FILTER_VER_S_LUMA_16xN_AVX512 sp, 16
7755
+ FILTER_VER_S_LUMA_16xN_AVX512 sp, 32
7756
+ FILTER_VER_S_LUMA_16xN_AVX512 sp, 64
7757
+%endif
7758
+
7759
+%macro PROCESS_LUMA_VERT_S_24x8_AVX512 1
7760
+ PROCESS_LUMA_VERT_S_16x4_AVX512 %1
7761
+ lea r4, [r6 + 4 * r1]
7762
+ lea r8, [r4 + 4 * r1]
7763
+ movu ym1, [r6]
7764
+ movu ym3, [r6 + r1]
7765
+ vinserti32x8 m1, [r6 + 2 * r1], 1
7766
+ vinserti32x8 m3, [r6 + r7], 1
7767
+ punpcklwd m0, m1, m3
7768
+ pmaddwd m0, m15
7769
+ punpckhwd m1, m3
7770
+ pmaddwd m1, m15
7771
+
7772
+ movu ym4, [r6 + 2 * r1]
7773
+ vinserti32x8 m4, [r4], 1
7774
+ punpcklwd m2, m3, m4
7775
+ pmaddwd m2, m15
7776
+ punpckhwd m3, m4
7777
+ pmaddwd m3, m15
7778
+
7779
+ movu ym5, [r6 + r7]
7780
+ vinserti32x8 m5, [r4 + r1], 1
7781
+ punpcklwd m6, m4, m5
7782
+ pmaddwd m6, m16
7783
+ punpckhwd m4, m5
7784
+ pmaddwd m4, m16
7785
+
7786
+ paddd m0, m6
7787
+ paddd m1, m4
7788
+
7789
+ movu ym4, [r4]
7790
+ vinserti32x8 m4, [r4 + 2 * r1], 1
7791
+ punpcklwd m6, m5, m4
7792
+ pmaddwd m6, m16
7793
+ punpckhwd m5, m4
7794
+ pmaddwd m5, m16
7795
+
7796
+ paddd m2, m6
7797
+ paddd m3, m5
7798
+
7799
+ movu ym11, [r4 + r1]
7800
+ vinserti32x8 m11, [r4 + r7], 1
7801
+ punpcklwd m8, m4, m11
7802
+ pmaddwd m8, m17
7803
+ punpckhwd m4, m11
7804
+ pmaddwd m4, m17
7805
+
7806
+ movu ym12, [r4 + 2 * r1]
7807
+ vinserti32x8 m12, [r4 + 4 * r1], 1
7808
+ punpcklwd m10, m11, m12
7809
+ pmaddwd m10, m17
7810
+ punpckhwd m11, m12
7811
+ pmaddwd m11, m17
7812
+
7813
+ movu ym13, [r4 + r7]
7814
+ vinserti32x8 m13, [r8 + r1], 1
7815
+ punpcklwd m14, m12, m13
7816
+ pmaddwd m14, m18
7817
+ punpckhwd m12, m13
7818
+ pmaddwd m12, m18
7819
+
7820
+ paddd m8, m14
7821
+ paddd m4, m12
7822
+ paddd m0, m8
7823
+ paddd m1, m4
7824
+
7825
+ movu ym12, [r4 + 4 * r1]
7826
+ vinserti32x8 m12, [r8 + 2 * r1], 1
7827
+ punpcklwd m14, m13, m12
7828
+ pmaddwd m14, m18
7829
+ punpckhwd m13, m12
7830
+ pmaddwd m13, m18
7831
+
7832
+ paddd m10, m14
7833
+ paddd m11, m13
7834
+ paddd m2, m10
7835
+ paddd m3, m11
7836
+
7837
+%ifidn %1, sp
7838
+ paddd m0, m19
7839
+ paddd m1, m19
7840
+ paddd m2, m19
7841
+ paddd m3, m19
7842
+
7843
+ psrad m0, INTERP_SHIFT_SP
7844
+ psrad m1, INTERP_SHIFT_SP
7845
+ psrad m2, INTERP_SHIFT_SP
7846
+ psrad m3, INTERP_SHIFT_SP
7847
+
7848
+ packssdw m0, m1
7849
+ packssdw m2, m3
7850
+ CLIPW2 m0, m2, m20, m21
7851
+%else
7852
+ psrad m0, 6
7853
+ psrad m1, 6
7854
+ psrad m2, 6
7855
+ psrad m3, 6
7856
+
7857
+ packssdw m0, m1
7858
+ packssdw m2, m3
7859
+%endif
7860
+ lea r9, [r2 + 4 * r3]
7861
+ movu [r9], ym0
7862
+ movu [r9 + r3], ym2
7863
+ vextracti32x8 [r9 + 2 * r3], m0, 1
7864
+ vextracti32x8 [r9 + r5], m2, 1
7865
+
7866
+ movu xm1, [r0 + mmsize/2]
7867
+ vinserti32x4 m1, [r0 + 2 * r1 + mmsize/2], 1
7868
+ vinserti32x4 m1, [r0 + 4 * r1 + mmsize/2], 2
7869
+ vinserti32x4 m1, [r6 + 2 * r1 + mmsize/2], 3
7870
+ movu xm3, [r0 + r1 + mmsize/2]
7871
+ vinserti32x4 m3, [r0 + r7 + mmsize/2], 1
7872
+ vinserti32x4 m3, [r6 + r1 + mmsize/2], 2
7873
+ vinserti32x4 m3, [r6 + r7 + mmsize/2], 3
7874
+ punpcklwd m0, m1, m3
7875
+ pmaddwd m0, m15
7876
+ punpckhwd m1, m3
7877
+ pmaddwd m1, m15
7878
+
7879
+ movu xm4, [r0 + 2 * r1 + mmsize/2]
7880
+ vinserti32x4 m4, [r0 + 4 * r1 + mmsize/2], 1
7881
+ vinserti32x4 m4, [r6 + 2 * r1 + mmsize/2], 2
7882
+ vinserti32x4 m4, [r6 + 4 * r1 + mmsize/2], 3
7883
+ punpcklwd m2, m3, m4
7884
+ pmaddwd m2, m15
7885
+ punpckhwd m3, m4
7886
+ pmaddwd m3, m15
7887
+
7888
+ movu xm5, [r0 + r7 + mmsize/2]
7889
+ vinserti32x4 m5, [r6 + r1 + mmsize/2], 1
7890
+ vinserti32x4 m5, [r6 + r7 + mmsize/2], 2
7891
+ vinserti32x4 m5, [r4 + r1 + mmsize/2], 3
7892
+ punpcklwd m6, m4, m5
7893
+ pmaddwd m6, m16
7894
+ punpckhwd m4, m5
7895
+ pmaddwd m4, m16
7896
+
7897
+ paddd m0, m6
7898
+ paddd m1, m4
7899
+
7900
+ movu xm4, [r0 + 4 * r1 + mmsize/2]
7901
+ vinserti32x4 m4, [r6 + 2 * r1 + mmsize/2], 1
7902
+ vinserti32x4 m4, [r6 + 4 * r1 + mmsize/2], 2
7903
+ vinserti32x4 m4, [r4 + 2 * r1 + mmsize/2], 3
7904
+ punpcklwd m6, m5, m4
7905
+ pmaddwd m6, m16
7906
+ punpckhwd m5, m4
7907
+ pmaddwd m5, m16
7908
+
7909
+ paddd m2, m6
7910
+ paddd m3, m5
7911
+
7912
+ movu xm11, [r6 + r1 + mmsize/2]
7913
+ vinserti32x4 m11, [r6 + r7 + mmsize/2], 1
7914
+ vinserti32x4 m11, [r4 + r1 + mmsize/2], 2
7915
+ vinserti32x4 m11, [r4 + r7 + mmsize/2], 3
7916
+ punpcklwd m8, m4, m11
7917
+ pmaddwd m8, m17
7918
+ punpckhwd m4, m11
7919
+ pmaddwd m4, m17
7920
+
7921
+ movu xm12, [r6 + 2 * r1 + mmsize/2]
7922
+ vinserti32x4 m12, [r6 + 4 * r1 + mmsize/2], 1
7923
+ vinserti32x4 m12, [r4 + 2 * r1 + mmsize/2], 2
7924
+ vinserti32x4 m12, [r4 + 4 * r1 + mmsize/2], 3
7925
+ punpcklwd m10, m11, m12
7926
+ pmaddwd m10, m17
7927
+ punpckhwd m11, m12
7928
+ pmaddwd m11, m17
7929
+
7930
+ movu xm13, [r6 + r7 + mmsize/2]
7931
+ vinserti32x4 m13, [r4 + r1 + mmsize/2], 1
7932
+ vinserti32x4 m13, [r4 + r7 + mmsize/2], 2
7933
+ vinserti32x4 m13, [r8 + r1 + mmsize/2], 3
7934
+ punpcklwd m14, m12, m13
7935
+ pmaddwd m14, m18
7936
+ punpckhwd m12, m13
7937
+ pmaddwd m12, m18
7938
+
7939
+ paddd m8, m14
7940
+ paddd m4, m12
7941
+ paddd m0, m8
7942
+ paddd m1, m4
7943
+
7944
+ movu xm12, [r6 + 4 * r1 + mmsize/2]
7945
+ vinserti32x4 m12, [r4 + 2 * r1 + mmsize/2], 1
7946
+ vinserti32x4 m12, [r4 + 4 * r1 + mmsize/2], 2
7947
+ vinserti32x4 m12, [r8 + 2 * r1 + mmsize/2], 3
7948
+ punpcklwd m14, m13, m12
7949
+ pmaddwd m14, m18
7950
+ punpckhwd m13, m12
7951
+ pmaddwd m13, m18
7952
+
7953
+ paddd m10, m14
7954
+ paddd m11, m13
7955
+ paddd m2, m10
7956
+ paddd m3, m11
7957
+
7958
+%ifidn %1, sp
7959
+ paddd m0, m19
7960
+ paddd m1, m19
7961
+ paddd m2, m19
7962
+ paddd m3, m19
7963
+
7964
+ psrad m0, INTERP_SHIFT_SP
7965
+ psrad m1, INTERP_SHIFT_SP
7966
+ psrad m2, INTERP_SHIFT_SP
7967
+ psrad m3, INTERP_SHIFT_SP
7968
+
7969
+ packssdw m0, m1
7970
+ packssdw m2, m3
7971
+ CLIPW2 m0, m2, m20, m21
7972
+%else
7973
+ psrad m0, 6
7974
+ psrad m1, 6
7975
+ psrad m2, 6
7976
+ psrad m3, 6
7977
+
7978
+ packssdw m0, m1
7979
+ packssdw m2, m3
7980
+%endif
7981
+
7982
+ movu [r2 + mmsize/2], xm0
7983
+ movu [r2 + r3 + mmsize/2], xm2
7984
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m0, 1
7985
+ vextracti32x4 [r2 + r5 + mmsize/2], m2, 1
7986
+ lea r2, [r2 + 4 * r3]
7987
+ vextracti32x4 [r2 + mmsize/2], m0, 2
7988
+ vextracti32x4 [r2 + r3 + mmsize/2], m2, 2
7989
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m0, 3
7990
+ vextracti32x4 [r2 + r5 + mmsize/2], m2, 3
7991
+%endmacro
7992
+;-----------------------------------------------------------------------------------------------------------------
7993
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
7994
+;-----------------------------------------------------------------------------------------------------------------
7995
+%macro FILTER_VER_S_LUMA_24x32_AVX512 1
7996
+INIT_ZMM avx512
7997
+cglobal interp_8tap_vert_%1_24x32, 5, 10, 22
7998
+ add r1d, r1d
7999
+ add r3d, r3d
8000
+ lea r7, [3 * r1]
8001
+ sub r0, r7
8002
+ shl r4d, 8
8003
+%ifdef PIC
8004
+ lea r5, [tab_LumaCoeffVer_avx512]
8005
+ mova m15, [r5 + r4]
8006
+ mova m16, [r5 + r4 + 1 * mmsize]
8007
+ mova m17, [r5 + r4 + 2 * mmsize]
8008
+ mova m18, [r5 + r4 + 3 * mmsize]
8009
+%else
8010
+ lea r5, [tab_LumaCoeffVer_avx512 + r4]
8011
+ mova m15, [r5]
8012
+ mova m16, [r5 + 1 * mmsize]
8013
+ mova m17, [r5 + 2 * mmsize]
8014
+ mova m18, [r5 + 3 * mmsize]
8015
+%endif
8016
+%ifidn %1, sp
8017
+ vbroadcasti32x4 m19, [INTERP_OFFSET_SP]
8018
+ pxor m20, m20
8019
+ vbroadcasti32x8 m21, [pw_pixel_max]
8020
+%endif
8021
+ lea r5, [3 * r3]
8022
+
8023
+%rep 3
8024
+ PROCESS_LUMA_VERT_S_24x8_AVX512 %1
8025
+ lea r0, [r4]
8026
+ lea r2, [r2 + 4 * r3]
8027
+%endrep
8028
+ PROCESS_LUMA_VERT_S_24x8_AVX512 %1
8029
+ RET
8030
+%endmacro
8031
+
8032
+%if ARCH_X86_64
8033
+ FILTER_VER_S_LUMA_24x32_AVX512 ss
8034
+ FILTER_VER_S_LUMA_24x32_AVX512 sp
8035
+%endif
8036
+
8037
+%macro PROCESS_LUMA_VERT_S_32x2_AVX512 1
8038
+ movu m1, [r0] ;0 row
8039
+ movu m3, [r0 + r1] ;1 row
8040
+ punpcklwd m0, m1, m3
8041
+ pmaddwd m0, m15
8042
+ punpckhwd m1, m3
8043
+ pmaddwd m1, m15
8044
+
8045
+ movu m4, [r0 + 2 * r1] ;2 row
8046
+ punpcklwd m2, m3, m4
8047
+ pmaddwd m2, m15
8048
+ punpckhwd m3, m4
8049
+ pmaddwd m3, m15
8050
+
8051
+ movu m5, [r0 + r7] ;3 row
8052
+ punpcklwd m6, m4, m5
8053
+ pmaddwd m6, m16
8054
+ punpckhwd m4, m5
8055
+ pmaddwd m4, m16
8056
+
8057
+ paddd m0, m6
8058
+ paddd m1, m4
8059
+
8060
+ movu m4, [r0 + 4 * r1] ;4 row
8061
+ punpcklwd m6, m5, m4
8062
+ pmaddwd m6, m16
8063
+ punpckhwd m5, m4
8064
+ pmaddwd m5, m16
8065
+
8066
+ paddd m2, m6
8067
+ paddd m3, m5
8068
+
8069
+ lea r6, [r0 + 4 * r1]
8070
+
8071
+ movu m11, [r6 + r1] ;5 row
8072
+ punpcklwd m8, m4, m11
8073
+ pmaddwd m8, m17
8074
+ punpckhwd m4, m11
8075
+ pmaddwd m4, m17
8076
+
8077
+ movu m12, [r6 + 2 * r1] ;6 row
8078
+ punpcklwd m10, m11, m12
8079
+ pmaddwd m10, m17
8080
+ punpckhwd m11, m12
8081
+ pmaddwd m11, m17
8082
+
8083
+ movu m13, [r6 + r7] ;7 row
8084
+ punpcklwd m14, m12, m13
8085
+ pmaddwd m14, m18
8086
+ punpckhwd m12, m13
8087
+ pmaddwd m12, m18
8088
+
8089
+ paddd m8, m14
8090
+ paddd m4, m12
8091
+ paddd m0, m8
8092
+ paddd m1, m4
8093
+
8094
+ movu m12, [r6 + 4 * r1] ; 8 row
8095
+ punpcklwd m14, m13, m12
8096
+ pmaddwd m14, m18
8097
+ punpckhwd m13, m12
8098
+ pmaddwd m13, m18
8099
+
8100
+ paddd m10, m14
8101
+ paddd m11, m13
8102
+ paddd m2, m10
8103
+ paddd m3, m11
8104
+
8105
+%ifidn %1, sp
8106
+ paddd m0, m19
8107
+ paddd m1, m19
8108
+ paddd m2, m19
8109
+ paddd m3, m19
8110
+
8111
+ psrad m0, INTERP_SHIFT_SP
8112
+ psrad m1, INTERP_SHIFT_SP
8113
+ psrad m2, INTERP_SHIFT_SP
8114
+ psrad m3, INTERP_SHIFT_SP
8115
+
8116
+ packssdw m0, m1
8117
+ packssdw m2, m3
8118
+ CLIPW2 m0, m2, m20, m21
8119
+%else
8120
+ psrad m0, 6
8121
+ psrad m1, 6
8122
+ psrad m2, 6
8123
+ psrad m3, 6
8124
+
8125
+ packssdw m0, m1
8126
+ packssdw m2, m3
8127
+%endif
8128
+
8129
+ movu [r2], m0
8130
+ movu [r2 + r3], m2
8131
+%endmacro
8132
+;-----------------------------------------------------------------------------------------------------------------
8133
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
8134
+;-----------------------------------------------------------------------------------------------------------------
8135
+%macro FILTER_VER_S_LUMA_32xN_AVX512 2
8136
+INIT_ZMM avx512
8137
+cglobal interp_8tap_vert_%1_32x%2, 5, 8, 22
8138
+ add r1d, r1d
8139
+ add r3d, r3d
8140
+ lea r7, [3 * r1]
8141
+ sub r0, r7
8142
+ shl r4d, 8
8143
+%ifdef PIC
8144
+ lea r5, [tab_LumaCoeffVer_avx512]
8145
+ mova m15, [r5 + r4]
8146
+ mova m16, [r5 + r4 + 1 * mmsize]
8147
+ mova m17, [r5 + r4 + 2 * mmsize]
8148
+ mova m18, [r5 + r4 + 3 * mmsize]
8149
+%else
8150
+ lea r5, [tab_LumaCoeffVer_avx512 + r4]
8151
+ mova m15, [r5]
8152
+ mova m16, [r5 + 1 * mmsize]
8153
+ mova m17, [r5 + 2 * mmsize]
8154
+ mova m18, [r5 + 3 * mmsize]
8155
+%endif
8156
+%ifidn %1, sp
8157
+ vbroadcasti32x4 m19, [INTERP_OFFSET_SP]
8158
+ pxor m20, m20
8159
+ vbroadcasti32x8 m21, [pw_pixel_max]
8160
+%endif
8161
+
8162
+%rep %2/2 - 1
8163
+ PROCESS_LUMA_VERT_S_32x2_AVX512 %1
8164
+ lea r0, [r0 + 2 * r1]
8165
+ lea r2, [r2 + 2 * r3]
8166
+%endrep
8167
+ PROCESS_LUMA_VERT_S_32x2_AVX512 %1
8168
+ RET
8169
+%endmacro
8170
+
8171
+%if ARCH_X86_64
8172
+ FILTER_VER_S_LUMA_32xN_AVX512 ss, 8
8173
+ FILTER_VER_S_LUMA_32xN_AVX512 ss, 16
8174
+ FILTER_VER_S_LUMA_32xN_AVX512 ss, 32
8175
+ FILTER_VER_S_LUMA_32xN_AVX512 ss, 24
8176
+ FILTER_VER_S_LUMA_32xN_AVX512 ss, 64
8177
+ FILTER_VER_S_LUMA_32xN_AVX512 sp, 8
8178
+ FILTER_VER_S_LUMA_32xN_AVX512 sp, 16
8179
+ FILTER_VER_S_LUMA_32xN_AVX512 sp, 32
8180
+ FILTER_VER_S_LUMA_32xN_AVX512 sp, 24
8181
+ FILTER_VER_S_LUMA_32xN_AVX512 sp, 64
8182
+%endif
8183
+
8184
+%macro PROCESS_LUMA_VERT_S_48x4_AVX512 1
8185
+ PROCESS_LUMA_VERT_S_32x2_AVX512 %1
8186
+ movu m1, [r0 + 2 * r1]
8187
+ movu m3, [r0 + r7]
8188
+ punpcklwd m0, m1, m3
8189
+ pmaddwd m0, m15
8190
+ punpckhwd m1, m3
8191
+ pmaddwd m1, m15
8192
+
8193
+ movu m4, [r0 + 4 * r1]
8194
+ punpcklwd m2, m3, m4
8195
+ pmaddwd m2, m15
8196
+ punpckhwd m3, m4
8197
+ pmaddwd m3, m15
8198
+
8199
+ movu m5, [r6 + r1]
8200
+ punpcklwd m6, m4, m5
8201
+ pmaddwd m6, m16
8202
+ punpckhwd m4, m5
8203
+ pmaddwd m4, m16
8204
+
8205
+ paddd m0, m6
8206
+ paddd m1, m4
8207
+
8208
+ lea r4, [r6 + 4 * r1]
8209
+
8210
+ movu m4, [r6 + 2 * r1]
8211
+ punpcklwd m6, m5, m4
8212
+ pmaddwd m6, m16
8213
+ punpckhwd m5, m4
8214
+ pmaddwd m5, m16
8215
+
8216
+ paddd m2, m6
8217
+ paddd m3, m5
8218
+
8219
+ movu m11, [r6 + r7]
8220
+ punpcklwd m8, m4, m11
8221
+ pmaddwd m8, m17
8222
+ punpckhwd m4, m11
8223
+ pmaddwd m4, m17
8224
+
8225
+ movu m12, [r4]
8226
+ punpcklwd m10, m11, m12
8227
+ pmaddwd m10, m17
8228
+ punpckhwd m11, m12
8229
+ pmaddwd m11, m17
8230
+
8231
+ movu m13, [r4 + r1]
8232
+ punpcklwd m14, m12, m13
8233
+ pmaddwd m14, m18
8234
+ punpckhwd m12, m13
8235
+ pmaddwd m12, m18
8236
+
8237
+ paddd m8, m14
8238
+ paddd m4, m12
8239
+ paddd m0, m8
8240
+ paddd m1, m4
8241
+
8242
+ movu m12, [r4 + 2 * r1]
8243
+ punpcklwd m14, m13, m12
8244
+ pmaddwd m14, m18
8245
+ punpckhwd m13, m12
8246
+ pmaddwd m13, m18
8247
+
8248
+ paddd m10, m14
8249
+ paddd m11, m13
8250
+ paddd m2, m10
8251
+ paddd m3, m11
8252
+
8253
+%ifidn %1, sp
8254
+ paddd m0, m19
8255
+ paddd m1, m19
8256
+ paddd m2, m19
8257
+ paddd m3, m19
8258
+
8259
+ psrad m0, INTERP_SHIFT_SP
8260
+ psrad m1, INTERP_SHIFT_SP
8261
+ psrad m2, INTERP_SHIFT_SP
8262
+ psrad m3, INTERP_SHIFT_SP
8263
+
8264
+ packssdw m0, m1
8265
+ packssdw m2, m3
8266
+ CLIPW2 m0, m2, m20, m21
8267
+%else
8268
+ psrad m0, 6
8269
+ psrad m1, 6
8270
+ psrad m2, 6
8271
+ psrad m3, 6
8272
+
8273
+ packssdw m0, m1
8274
+ packssdw m2, m3
8275
+%endif
8276
+
8277
+ movu [r2 + 2 * r3], m0
8278
+ movu [r2 + r5], m2
8279
+
8280
+ movu ym1, [r0 + mmsize]
8281
+ movu ym3, [r0 + r1 + mmsize]
8282
+ vinserti32x8 m1, [r0 + 2 * r1 + mmsize], 1
8283
+ vinserti32x8 m3, [r0 + r7 + mmsize], 1
8284
+ punpcklwd m0, m1, m3
8285
+ pmaddwd m0, m15
8286
+ punpckhwd m1, m3
8287
+ pmaddwd m1, m15
8288
+
8289
+ movu ym4, [r0 + 2 * r1 + mmsize]
8290
+ vinserti32x8 m4, [r6 + mmsize], 1
8291
+ punpcklwd m2, m3, m4
8292
+ pmaddwd m2, m15
8293
+ punpckhwd m3, m4
8294
+ pmaddwd m3, m15
8295
+
8296
+ movu ym5, [r0 + r7 + mmsize]
8297
+ vinserti32x8 m5, [r6 + r1 + mmsize], 1
8298
+ punpcklwd m6, m4, m5
8299
+ pmaddwd m6, m16
8300
+ punpckhwd m4, m5
8301
+ pmaddwd m4, m16
8302
+
8303
+ paddd m0, m6
8304
+ paddd m1, m4
8305
+
8306
+ movu ym4, [r6 + mmsize]
8307
+ vinserti32x8 m4, [r6 + 2 * r1 + mmsize], 1
8308
+ punpcklwd m6, m5, m4
8309
+ pmaddwd m6, m16
8310
+ punpckhwd m5, m4
8311
+ pmaddwd m5, m16
8312
+
8313
+ paddd m2, m6
8314
+ paddd m3, m5
8315
+
8316
+ movu ym11, [r6 + r1 + mmsize]
8317
+ vinserti32x8 m11, [r6 + r7 + mmsize], 1
8318
+ punpcklwd m8, m4, m11
8319
+ pmaddwd m8, m17
8320
+ punpckhwd m4, m11
8321
+ pmaddwd m4, m17
8322
+
8323
+ movu ym12, [r6 + 2 * r1 + mmsize]
8324
+ vinserti32x8 m12, [r6 + 4 * r1 + mmsize], 1
8325
+ punpcklwd m10, m11, m12
8326
+ pmaddwd m10, m17
8327
+ punpckhwd m11, m12
8328
+ pmaddwd m11, m17
8329
+
8330
+ movu ym13, [r6 + r7 + mmsize]
8331
+ vinserti32x8 m13, [r4 + r1 + mmsize], 1
8332
+ punpcklwd m14, m12, m13
8333
+ pmaddwd m14, m18
8334
+ punpckhwd m12, m13
8335
+ pmaddwd m12, m18
8336
+
8337
+ paddd m8, m14
8338
+ paddd m4, m12
8339
+ paddd m0, m8
8340
+ paddd m1, m4
8341
+
8342
+ movu ym12, [r6 + 4 * r1 + mmsize]
8343
+ vinserti32x8 m12, [r4 + 2 * r1 + mmsize], 1
8344
+ punpcklwd m14, m13, m12
8345
+ pmaddwd m14, m18
8346
+ punpckhwd m13, m12
8347
+ pmaddwd m13, m18
8348
+
8349
+ paddd m10, m14
8350
+ paddd m11, m13
8351
+ paddd m2, m10
8352
+ paddd m3, m11
8353
+
8354
+%ifidn %1, sp
8355
+ paddd m0, m19
8356
+ paddd m1, m19
8357
+ paddd m2, m19
8358
+ paddd m3, m19
8359
+
8360
+ psrad m0, INTERP_SHIFT_SP
8361
+ psrad m1, INTERP_SHIFT_SP
8362
+ psrad m2, INTERP_SHIFT_SP
8363
+ psrad m3, INTERP_SHIFT_SP
8364
+
8365
+ packssdw m0, m1
8366
+ packssdw m2, m3
8367
+ CLIPW2 m0, m2, m20, m21
8368
+%else
8369
+ psrad m0, 6
8370
+ psrad m1, 6
8371
+ psrad m2, 6
8372
+ psrad m3, 6
8373
+
8374
+ packssdw m0, m1
8375
+ packssdw m2, m3
8376
+%endif
8377
+
8378
+ movu [r2 + mmsize], ym0
8379
+ movu [r2 + r3 + mmsize], ym2
8380
+ vextracti32x8 [r2 + 2 * r3 + mmsize], m0, 1
8381
+ vextracti32x8 [r2 + r5 + mmsize], m2, 1
8382
+%endmacro
8383
+;-----------------------------------------------------------------------------------------------------------------
8384
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
8385
+;-----------------------------------------------------------------------------------------------------------------
8386
+%macro FILTER_VER_S_LUMA_48x64_AVX512 1
8387
+INIT_ZMM avx512
8388
+cglobal interp_8tap_vert_%1_48x64, 5, 8, 22
8389
+ add r1d, r1d
8390
+ add r3d, r3d
8391
+ lea r7, [3 * r1]
8392
+ sub r0, r7
8393
+ shl r4d, 8
8394
+%ifdef PIC
8395
+ lea r5, [tab_LumaCoeffVer_avx512]
8396
+ mova m15, [r5 + r4]
8397
+ mova m16, [r5 + r4 + 1 * mmsize]
8398
+ mova m17, [r5 + r4 + 2 * mmsize]
8399
+ mova m18, [r5 + r4 + 3 * mmsize]
8400
+%else
8401
+ lea r5, [tab_LumaCoeffVer_avx512 + r4]
8402
+ mova m15, [r5]
8403
+ mova m16, [r5 + 1 * mmsize]
8404
+ mova m17, [r5 + 2 * mmsize]
8405
+ mova m18, [r5 + 3 * mmsize]
8406
+%endif
8407
+%ifidn %1, sp
8408
+ vbroadcasti32x4 m19, [INTERP_OFFSET_SP]
8409
+ pxor m20, m20
8410
+ vbroadcasti32x8 m21, [pw_pixel_max]
8411
+%endif
8412
+
8413
+ lea r5, [3 * r3]
8414
+%rep 15
8415
+ PROCESS_LUMA_VERT_S_48x4_AVX512 %1
8416
+ lea r0, [r0 + 4 * r1]
8417
+ lea r2, [r2 + 4 * r3]
8418
+%endrep
8419
+ PROCESS_LUMA_VERT_S_48x4_AVX512 %1
8420
+ RET
8421
+%endmacro
8422
+
8423
+%if ARCH_X86_64
8424
+ FILTER_VER_S_LUMA_48x64_AVX512 ss
8425
+ FILTER_VER_S_LUMA_48x64_AVX512 sp
8426
+%endif
8427
+
8428
+%macro PROCESS_LUMA_VERT_S_64x2_AVX512 1
8429
+ movu m1, [r0] ;0 row
8430
+ movu m3, [r0 + r1] ;1 row
8431
+ punpcklwd m0, m1, m3
8432
+ pmaddwd m0, m15
8433
+ punpckhwd m1, m3
8434
+ pmaddwd m1, m15
8435
+
8436
+ movu m4, [r0 + 2 * r1] ;2 row
8437
+ punpcklwd m2, m3, m4
8438
+ pmaddwd m2, m15
8439
+ punpckhwd m3, m4
8440
+ pmaddwd m3, m15
8441
+
8442
+ movu m5, [r0 + r7] ;3 row
8443
+ punpcklwd m6, m4, m5
8444
+ pmaddwd m6, m16
8445
+ punpckhwd m4, m5
8446
+ pmaddwd m4, m16
8447
+
8448
+ paddd m0, m6
8449
+ paddd m1, m4
8450
+
8451
+ movu m4, [r0 + 4 * r1] ;4 row
8452
+ punpcklwd m6, m5, m4
8453
+ pmaddwd m6, m16
8454
+ punpckhwd m5, m4
8455
+ pmaddwd m5, m16
8456
+
8457
+ paddd m2, m6
8458
+ paddd m3, m5
8459
+
8460
+ lea r6, [r0 + 4 * r1]
8461
+
8462
+ movu m11, [r6 + r1] ;5 row
8463
+ punpcklwd m8, m4, m11
8464
+ pmaddwd m8, m17
8465
+ punpckhwd m4, m11
8466
+ pmaddwd m4, m17
8467
+
8468
+ movu m12, [r6 + 2 * r1] ;6 row
8469
+ punpcklwd m10, m11, m12
8470
+ pmaddwd m10, m17
8471
+ punpckhwd m11, m12
8472
+ pmaddwd m11, m17
8473
+
8474
+ movu m13, [r6 + r7] ;7 row
8475
+ punpcklwd m14, m12, m13
8476
+ pmaddwd m14, m18
8477
+ punpckhwd m12, m13
8478
+ pmaddwd m12, m18
8479
+
8480
+ paddd m8, m14
8481
+ paddd m4, m12
8482
+ paddd m0, m8
8483
+ paddd m1, m4
8484
+
8485
+ movu m12, [r6 + 4 * r1] ; 8 row
8486
+ punpcklwd m14, m13, m12
8487
+ pmaddwd m14, m18
8488
+ punpckhwd m13, m12
8489
+ pmaddwd m13, m18
8490
+
8491
+ paddd m10, m14
8492
+ paddd m11, m13
8493
+ paddd m2, m10
8494
+ paddd m3, m11
8495
+
8496
+%ifidn %1, sp
8497
+ paddd m0, m19
8498
+ paddd m1, m19
8499
+ paddd m2, m19
8500
+ paddd m3, m19
8501
+
8502
+ psrad m0, INTERP_SHIFT_SP
8503
+ psrad m1, INTERP_SHIFT_SP
8504
+ psrad m2, INTERP_SHIFT_SP
8505
+ psrad m3, INTERP_SHIFT_SP
8506
+
8507
+ packssdw m0, m1
8508
+ packssdw m2, m3
8509
+ CLIPW2 m0, m2, m20, m21
8510
+%else
8511
+ psrad m0, 6
8512
+ psrad m1, 6
8513
+ psrad m2, 6
8514
+ psrad m3, 6
8515
+
8516
+ packssdw m0, m1
8517
+ packssdw m2, m3
8518
+%endif
8519
+
8520
+ movu [r2], m0
8521
+ movu [r2 + r3], m2
8522
+
8523
+ movu m1, [r0 + mmsize] ;0 row
8524
+ movu m3, [r0 + r1 + mmsize] ;1 row
8525
+ punpcklwd m0, m1, m3
8526
+ pmaddwd m0, m15
8527
+ punpckhwd m1, m3
8528
+ pmaddwd m1, m15
8529
+
8530
+ movu m4, [r0 + 2 * r1 + mmsize] ;2 row
8531
+ punpcklwd m2, m3, m4
8532
+ pmaddwd m2, m15
8533
+ punpckhwd m3, m4
8534
+ pmaddwd m3, m15
8535
+
8536
+ movu m5, [r0 + r7 + mmsize] ;3 row
8537
+ punpcklwd m6, m4, m5
8538
+ pmaddwd m6, m16
8539
+ punpckhwd m4, m5
8540
+ pmaddwd m4, m16
8541
+
8542
+ paddd m0, m6
8543
+ paddd m1, m4
8544
+
8545
+ movu m4, [r0 + 4 * r1 + mmsize] ;4 row
8546
+ punpcklwd m6, m5, m4
8547
+ pmaddwd m6, m16
8548
+ punpckhwd m5, m4
8549
+ pmaddwd m5, m16
8550
+
8551
+ paddd m2, m6
8552
+ paddd m3, m5
8553
+
8554
+ movu m11, [r6 + r1 + mmsize] ;5 row
8555
+ punpcklwd m8, m4, m11
8556
+ pmaddwd m8, m17
8557
+ punpckhwd m4, m11
8558
+ pmaddwd m4, m17
8559
+
8560
+ movu m12, [r6 + 2 * r1 + mmsize] ;6 row
8561
+ punpcklwd m10, m11, m12
8562
+ pmaddwd m10, m17
8563
+ punpckhwd m11, m12
8564
+ pmaddwd m11, m17
8565
+
8566
+ movu m13, [r6 + r7 + mmsize] ;7 row
8567
+ punpcklwd m14, m12, m13
8568
+ pmaddwd m14, m18
8569
+ punpckhwd m12, m13
8570
+ pmaddwd m12, m18
8571
+
8572
+ paddd m8, m14
8573
+ paddd m4, m12
8574
+ paddd m0, m8
8575
+ paddd m1, m4
8576
+
8577
+ movu m12, [r6 + 4 * r1 + mmsize] ; 8 row
8578
+ punpcklwd m14, m13, m12
8579
+ pmaddwd m14, m18
8580
+ punpckhwd m13, m12
8581
+ pmaddwd m13, m18
8582
+
8583
+ paddd m10, m14
8584
+ paddd m11, m13
8585
+ paddd m2, m10
8586
+ paddd m3, m11
8587
+
8588
+%ifidn %1, sp
8589
+ paddd m0, m19
8590
+ paddd m1, m19
8591
+ paddd m2, m19
8592
+ paddd m3, m19
8593
+
8594
+ psrad m0, INTERP_SHIFT_SP
8595
+ psrad m1, INTERP_SHIFT_SP
8596
+ psrad m2, INTERP_SHIFT_SP
8597
+ psrad m3, INTERP_SHIFT_SP
8598
+
8599
+ packssdw m0, m1
8600
+ packssdw m2, m3
8601
+ CLIPW2 m0, m2, m20, m21
8602
+%else
8603
+ psrad m0, 6
8604
+ psrad m1, 6
8605
+ psrad m2, 6
8606
+ psrad m3, 6
8607
+
8608
+ packssdw m0, m1
8609
+ packssdw m2, m3
8610
+%endif
8611
+
8612
+ movu [r2 + mmsize], m0
8613
+ movu [r2 + r3 + mmsize], m2
8614
+%endmacro
8615
+;-----------------------------------------------------------------------------------------------------------------
8616
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
8617
+;-----------------------------------------------------------------------------------------------------------------
8618
+%macro FILTER_VER_S_LUMA_64xN_AVX512 2
8619
+INIT_ZMM avx512
8620
+cglobal interp_8tap_vert_%1_64x%2, 5, 8, 22
8621
+ add r1d, r1d
8622
+ add r3d, r3d
8623
+ lea r7, [3 * r1]
8624
+ sub r0, r7
8625
+ shl r4d, 8
8626
+%ifdef PIC
8627
+ lea r5, [tab_LumaCoeffVer_avx512]
8628
+ mova m15, [r5 + r4]
8629
+ mova m16, [r5 + r4 + 1 * mmsize]
8630
+ mova m17, [r5 + r4 + 2 * mmsize]
8631
+ mova m18, [r5 + r4 + 3 * mmsize]
8632
+%else
8633
+ lea r5, [tab_LumaCoeffVer_avx512 + r4]
8634
+ mova m15, [r5]
8635
+ mova m16, [r5 + 1 * mmsize]
8636
+ mova m17, [r5 + 2 * mmsize]
8637
+ mova m18, [r5 + 3 * mmsize]
8638
+%endif
8639
+%ifidn %1, sp
8640
+ vbroadcasti32x4 m19, [INTERP_OFFSET_SP]
8641
+ pxor m20, m20
8642
+ vbroadcasti32x8 m21, [pw_pixel_max]
8643
+%endif
8644
+
8645
+%rep %2/2 - 1
8646
+ PROCESS_LUMA_VERT_S_64x2_AVX512 %1
8647
+ lea r0, [r0 + 2 * r1]
8648
+ lea r2, [r2 + 2 * r3]
8649
+%endrep
8650
+ PROCESS_LUMA_VERT_S_64x2_AVX512 %1
8651
+ RET
8652
+%endmacro
8653
+
8654
+%if ARCH_X86_64
8655
+ FILTER_VER_S_LUMA_64xN_AVX512 ss, 16
8656
+ FILTER_VER_S_LUMA_64xN_AVX512 ss, 32
8657
+ FILTER_VER_S_LUMA_64xN_AVX512 ss, 48
8658
+ FILTER_VER_S_LUMA_64xN_AVX512 ss, 64
8659
+ FILTER_VER_S_LUMA_64xN_AVX512 sp, 16
8660
+ FILTER_VER_S_LUMA_64xN_AVX512 sp, 32
8661
+ FILTER_VER_S_LUMA_64xN_AVX512 sp, 48
8662
+ FILTER_VER_S_LUMA_64xN_AVX512 sp, 64
8663
+%endif
8664
+;-------------------------------------------------------------------------------------------------------------
8665
+;avx512 luma_vss and luma_vsp code end
8666
+;-------------------------------------------------------------------------------------------------------------
8667
+;-------------------------------------------------------------------------------------------------------------
8668
+;avx512 luma_vpp and luma_vps code start
8669
+;-------------------------------------------------------------------------------------------------------------
8670
+%macro PROCESS_LUMA_VERT_P_16x4_AVX512 1
8671
+ lea r5, [r0 + 4 * r1]
8672
+ movu ym1, [r0]
8673
+ movu ym3, [r0 + r1]
8674
+ vinserti32x8 m1, [r0 + 2 * r1], 1
8675
+ vinserti32x8 m3, [r0 + r7], 1
8676
+ punpcklwd m0, m1, m3
8677
+ pmaddwd m0, m15
8678
+ punpckhwd m1, m3
8679
+ pmaddwd m1, m15
8680
+
8681
+ movu ym4, [r0 + 2 * r1]
8682
+ vinserti32x8 m4, [r0 + 4 * r1], 1
8683
+ punpcklwd m2, m3, m4
8684
+ pmaddwd m2, m15
8685
+ punpckhwd m3, m4
8686
+ pmaddwd m3, m15
8687
+
8688
+ movu ym5, [r0 + r7]
8689
+ vinserti32x8 m5, [r5 + r1], 1
8690
+ punpcklwd m6, m4, m5
8691
+ pmaddwd m6, m16
8692
+ punpckhwd m4, m5
8693
+ pmaddwd m4, m16
8694
+
8695
+ paddd m0, m6
8696
+ paddd m1, m4
8697
+
8698
+ movu ym4, [r5]
8699
+ vinserti32x8 m4, [r5 + 2 * r1], 1
8700
+ punpcklwd m6, m5, m4
8701
+ pmaddwd m6, m16
8702
+ punpckhwd m5, m4
8703
+ pmaddwd m5, m16
8704
+
8705
+ paddd m2, m6
8706
+ paddd m3, m5
8707
+
8708
+ lea r4, [r5 + 4 * r1]
8709
+ movu ym11, [r5 + r1]
8710
+ vinserti32x8 m11, [r5 + r7], 1
8711
+ punpcklwd m8, m4, m11
8712
+ pmaddwd m8, m17
8713
+ punpckhwd m4, m11
8714
+ pmaddwd m4, m17
8715
+
8716
+ movu ym12, [r5 + 2 * r1]
8717
+ vinserti32x8 m12, [r4], 1
8718
+ punpcklwd m10, m11, m12
8719
+ pmaddwd m10, m17
8720
+ punpckhwd m11, m12
8721
+ pmaddwd m11, m17
8722
+
8723
+ movu ym13, [r5 + r7]
8724
+ vinserti32x8 m13, [r4 + r1], 1
8725
+ punpcklwd m14, m12, m13
8726
+ pmaddwd m14, m18
8727
+ punpckhwd m12, m13
8728
+ pmaddwd m12, m18
8729
+
8730
+ paddd m8, m14
8731
+ paddd m4, m12
8732
+ paddd m0, m8
8733
+ paddd m1, m4
8734
+
8735
+ movu ym12, [r4]
8736
+ vinserti32x8 m12, [r4 + 2 * r1], 1
8737
+ punpcklwd m14, m13, m12
8738
+ pmaddwd m14, m18
8739
+ punpckhwd m13, m12
8740
+ pmaddwd m13, m18
8741
+
8742
+ paddd m10, m14
8743
+ paddd m11, m13
8744
+ paddd m2, m10
8745
+ paddd m3, m11
8746
+
8747
+ paddd m0, m19
8748
+ paddd m1, m19
8749
+ paddd m2, m19
8750
+ paddd m3, m19
8751
+
8752
+%ifidn %1, pp
8753
+ psrad m0, INTERP_SHIFT_PP
8754
+ psrad m1, INTERP_SHIFT_PP
8755
+ psrad m2, INTERP_SHIFT_PP
8756
+ psrad m3, INTERP_SHIFT_PP
8757
+
8758
+ packssdw m0, m1
8759
+ packssdw m2, m3
8760
+ CLIPW2 m0, m2, m20, m21
8761
+%else
8762
+ psrad m0, INTERP_SHIFT_PS
8763
+ psrad m1, INTERP_SHIFT_PS
8764
+ psrad m2, INTERP_SHIFT_PS
8765
+ psrad m3, INTERP_SHIFT_PS
8766
+
8767
+ packssdw m0, m1
8768
+ packssdw m2, m3
8769
+%endif
8770
+
8771
+ movu [r2], ym0
8772
+ movu [r2 + r3], ym2
8773
+ vextracti32x8 [r2 + 2 * r3], m0, 1
8774
+ vextracti32x8 [r2 + r8], m2, 1
8775
+%endmacro
8776
+;-----------------------------------------------------------------------------------------------------------------
8777
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
8778
+;-----------------------------------------------------------------------------------------------------------------
8779
+%macro FILTER_VER_P_LUMA_16xN_AVX512 2
8780
+INIT_ZMM avx512
8781
+cglobal interp_8tap_vert_%1_16x%2, 5, 9, 22
8782
+ add r1d, r1d
8783
+ add r3d, r3d
8784
+ shl r4d, 8
8785
+%ifdef PIC
8786
+ lea r5, [tab_LumaCoeffVer_avx512]
8787
+ mova m15, [r5 + r4]
8788
+ mova m16, [r5 + r4 + 1 * mmsize]
8789
+ mova m17, [r5 + r4 + 2 * mmsize]
8790
+ mova m18, [r5 + r4 + 3 * mmsize]
8791
+%else
8792
+ lea r5, [tab_LumaCoeffVer_avx512 + r4]
8793
+ mova m15, [r5]
8794
+ mova m16, [r5 + 1 * mmsize]
8795
+ mova m17, [r5 + 2 * mmsize]
8796
+ mova m18, [r5 + 3 * mmsize]
8797
+%endif
8798
+%ifidn %1, pp
8799
+ vbroadcasti32x4 m19, [INTERP_OFFSET_PP]
8800
+ pxor m20, m20
8801
+ vbroadcasti32x8 m21, [pw_pixel_max]
8802
+%else
8803
+ vbroadcasti32x4 m19, [INTERP_OFFSET_PS]
8804
+%endif
8805
+ lea r7, [3 * r1]
8806
+ lea r8, [3 * r3]
8807
+ sub r0, r7
8808
+
8809
+%rep %2/4 - 1
8810
+ PROCESS_LUMA_VERT_P_16x4_AVX512 %1
8811
+ lea r0, [r0 + 4 * r1]
8812
+ lea r2, [r2 + 4 * r3]
8813
+%endrep
8814
+ PROCESS_LUMA_VERT_P_16x4_AVX512 %1
8815
+ RET
8816
+%endmacro
8817
+
8818
+%if ARCH_X86_64
8819
+ FILTER_VER_P_LUMA_16xN_AVX512 ps, 4
8820
+ FILTER_VER_P_LUMA_16xN_AVX512 ps, 8
8821
+ FILTER_VER_P_LUMA_16xN_AVX512 ps, 12
8822
+ FILTER_VER_P_LUMA_16xN_AVX512 ps, 16
8823
+ FILTER_VER_P_LUMA_16xN_AVX512 ps, 32
8824
+ FILTER_VER_P_LUMA_16xN_AVX512 ps, 64
8825
+ FILTER_VER_P_LUMA_16xN_AVX512 pp, 4
8826
+ FILTER_VER_P_LUMA_16xN_AVX512 pp, 8
8827
+ FILTER_VER_P_LUMA_16xN_AVX512 pp, 12
8828
+ FILTER_VER_P_LUMA_16xN_AVX512 pp, 16
8829
+ FILTER_VER_P_LUMA_16xN_AVX512 pp, 32
8830
+ FILTER_VER_P_LUMA_16xN_AVX512 pp, 64
8831
+%endif
8832
+
8833
+%macro PROCESS_LUMA_VERT_P_24x4_AVX512 1
8834
+ PROCESS_LUMA_VERT_P_16x4_AVX512 %1
8835
+ movu xm1, [r0 + mmsize/2]
8836
+ movu xm3, [r0 + r1 + mmsize/2]
8837
+ vinserti32x4 m1, [r0 + r1 + mmsize/2], 1
8838
+ vinserti32x4 m3, [r0 + 2 * r1 + mmsize/2], 1
8839
+ vinserti32x4 m1, [r0 + 2 * r1 + mmsize/2], 2
8840
+ vinserti32x4 m3, [r0 + r7 + mmsize/2], 2
8841
+ vinserti32x4 m1, [r0 + r7 + mmsize/2], 3
8842
+ vinserti32x4 m3, [r0 + 4 * r1 + mmsize/2], 3
8843
+
8844
+ punpcklwd m0, m1, m3
8845
+ pmaddwd m0, m15
8846
+ punpckhwd m1, m3
8847
+ pmaddwd m1, m15
8848
+
8849
+ movu xm4, [r0 + 2 * r1 + mmsize/2]
8850
+ movu xm5, [r0 + r7 + mmsize/2]
8851
+ vinserti32x4 m4, [r0 + r7 + mmsize/2], 1
8852
+ vinserti32x4 m5, [r5 + mmsize/2], 1
8853
+ vinserti32x4 m4, [r5 + mmsize/2], 2
8854
+ vinserti32x4 m5, [r5 + r1 + mmsize/2], 2
8855
+ vinserti32x4 m4, [r5 + r1 + mmsize/2], 3
8856
+ vinserti32x4 m5, [r5 + 2 * r1 + mmsize/2], 3
8857
+
8858
+ punpcklwd m3, m4, m5
8859
+ pmaddwd m3, m16
8860
+ punpckhwd m4, m5
8861
+ pmaddwd m4, m16
8862
+
8863
+ paddd m0, m3
8864
+ paddd m1, m4
8865
+
8866
+ movu xm3, [r5 + mmsize/2]
8867
+ movu xm5, [r5 + r1 + mmsize/2]
8868
+ vinserti32x4 m3, [r5 + r1 + mmsize/2], 1
8869
+ vinserti32x4 m5, [r5 + 2 * r1 + mmsize/2], 1
8870
+ vinserti32x4 m3, [r5 + 2 * r1 + mmsize/2], 2
8871
+ vinserti32x4 m5, [r5 + r7 + mmsize/2], 2
8872
+ vinserti32x4 m3, [r5 + r7 + mmsize/2], 3
8873
+ vinserti32x4 m5, [r5 + 4 * r1 + mmsize/2], 3
8874
+
8875
+ punpcklwd m2, m3, m5
8876
+ pmaddwd m2, m17
8877
+ punpckhwd m3, m5
8878
+ pmaddwd m3, m17
8879
+
8880
+ movu xm6, [r5 + 2 * r1 + mmsize/2]
8881
+ movu xm7, [r5 + r7 + mmsize/2]
8882
+ vinserti32x4 m6, [r5 + r7 + mmsize/2], 1
8883
+ vinserti32x4 m7, [r4 + mmsize/2], 1
8884
+ vinserti32x4 m6, [r4 + mmsize/2], 2
8885
+ vinserti32x4 m7, [r4 + r1 + mmsize/2], 2
8886
+ vinserti32x4 m6, [r4 + r1 + mmsize/2], 3
8887
+ vinserti32x4 m7, [r4 + 2 * r1 + mmsize/2], 3
8888
+
8889
+ punpcklwd m5, m6, m7
8890
+ pmaddwd m5, m18
8891
+ punpckhwd m6, m7
8892
+ pmaddwd m6, m18
8893
+
8894
+ paddd m2, m5
8895
+ paddd m3, m6
8896
+ paddd m0, m2
8897
+ paddd m1, m3
8898
+
8899
+ paddd m0, m19
8900
+ paddd m1, m19
8901
+
8902
+%ifidn %1, pp
8903
+ psrad m0, INTERP_SHIFT_PP
8904
+ psrad m1, INTERP_SHIFT_PP
8905
+ packssdw m0, m1
8906
+ CLIPW m0, m20, m21
8907
+%else
8908
+ psrad m0, INTERP_SHIFT_PS
8909
+ psrad m1, INTERP_SHIFT_PS
8910
+ packssdw m0, m1
8911
+%endif
8912
+
8913
+ movu [r2 + mmsize/2], xm0
8914
+ vextracti32x4 [r2 + r3 + mmsize/2], m0, 1
8915
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m0, 2
8916
+ vextracti32x4 [r2 + r8 + mmsize/2], m0, 3
8917
+%endmacro
8918
+;-----------------------------------------------------------------------------------------------------------------
8919
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
8920
+;-----------------------------------------------------------------------------------------------------------------
8921
+%macro FILTER_VER_P_LUMA_24xN_AVX512 1
8922
+INIT_ZMM avx512
8923
+cglobal interp_8tap_vert_%1_24x32, 5, 9, 22
8924
+ add r1d, r1d
8925
+ add r3d, r3d
8926
+ shl r4d, 8
8927
+%ifdef PIC
8928
+ lea r5, [tab_LumaCoeffVer_avx512]
8929
+ mova m15, [r5 + r4]
8930
+ mova m16, [r5 + r4 + 1 * mmsize]
8931
+ mova m17, [r5 + r4 + 2 * mmsize]
8932
+ mova m18, [r5 + r4 + 3 * mmsize]
8933
+%else
8934
+ lea r5, [tab_LumaCoeffVer_avx512 + r4]
8935
+ mova m15, [r5]
8936
+ mova m16, [r5 + 1 * mmsize]
8937
+ mova m17, [r5 + 2 * mmsize]
8938
+ mova m18, [r5 + 3 * mmsize]
8939
+%endif
8940
+%ifidn %1, pp
8941
+ vbroadcasti32x4 m19, [INTERP_OFFSET_PP]
8942
+ pxor m20, m20
8943
+ vbroadcasti32x8 m21, [pw_pixel_max]
8944
+%else
8945
+ vbroadcasti32x4 m19, [INTERP_OFFSET_PS]
8946
+%endif
8947
+ lea r7, [3 * r1]
8948
+ lea r8, [3 * r3]
8949
+ sub r0, r7
8950
+
8951
+%rep 7
8952
+ PROCESS_LUMA_VERT_P_24x4_AVX512 %1
8953
+ lea r0, [r0 + 4 * r1]
8954
+ lea r2, [r2 + 4 * r3]
8955
+%endrep
8956
+ PROCESS_LUMA_VERT_P_24x4_AVX512 %1
8957
+ RET
8958
+%endmacro
8959
+
8960
+%if ARCH_X86_64
8961
+ FILTER_VER_P_LUMA_24xN_AVX512 ps
8962
+ FILTER_VER_P_LUMA_24xN_AVX512 pp
8963
+%endif
8964
+
8965
+%macro PROCESS_LUMA_VERT_P_32x2_AVX512 1
8966
+ movu m1, [r0] ;0 row
8967
+ movu m3, [r0 + r1] ;1 row
8968
+ punpcklwd m0, m1, m3
8969
+ pmaddwd m0, m15
8970
+ punpckhwd m1, m3
8971
+ pmaddwd m1, m15
8972
+
8973
+ movu m4, [r0 + 2 * r1] ;2 row
8974
+ punpcklwd m2, m3, m4
8975
+ pmaddwd m2, m15
8976
+ punpckhwd m3, m4
8977
+ pmaddwd m3, m15
8978
+
8979
+ movu m5, [r0 + r7] ;3 row
8980
+ punpcklwd m6, m4, m5
8981
+ pmaddwd m6, m16
8982
+ punpckhwd m4, m5
8983
+ pmaddwd m4, m16
8984
+
8985
+ paddd m0, m6
8986
+ paddd m1, m4
8987
+
8988
+ movu m4, [r0 + 4 * r1] ;4 row
8989
+ punpcklwd m6, m5, m4
8990
+ pmaddwd m6, m16
8991
+ punpckhwd m5, m4
8992
+ pmaddwd m5, m16
8993
+
8994
+ paddd m2, m6
8995
+ paddd m3, m5
8996
+
8997
+ lea r6, [r0 + 4 * r1]
8998
+
8999
+ movu m11, [r6 + r1] ;5 row
9000
+ punpcklwd m8, m4, m11
9001
+ pmaddwd m8, m17
9002
+ punpckhwd m4, m11
9003
+ pmaddwd m4, m17
9004
+
9005
+ movu m12, [r6 + 2 * r1] ;6 row
9006
+ punpcklwd m10, m11, m12
9007
+ pmaddwd m10, m17
9008
+ punpckhwd m11, m12
9009
+ pmaddwd m11, m17
9010
+
9011
+ movu m13, [r6 + r7] ;7 row
9012
+ punpcklwd m14, m12, m13
9013
+ pmaddwd m14, m18
9014
+ punpckhwd m12, m13
9015
+ pmaddwd m12, m18
9016
+
9017
+ paddd m8, m14
9018
+ paddd m4, m12
9019
+ paddd m0, m8
9020
+ paddd m1, m4
9021
+
9022
+ movu m12, [r6 + 4 * r1] ; 8 row
9023
+ punpcklwd m14, m13, m12
9024
+ pmaddwd m14, m18
9025
+ punpckhwd m13, m12
9026
+ pmaddwd m13, m18
9027
+
9028
+ paddd m10, m14
9029
+ paddd m11, m13
9030
+ paddd m2, m10
9031
+ paddd m3, m11
9032
+
9033
+ paddd m0, m19
9034
+ paddd m1, m19
9035
+ paddd m2, m19
9036
+ paddd m3, m19
9037
+
9038
+%ifidn %1, pp
9039
+ psrad m0, INTERP_SHIFT_PP
9040
+ psrad m1, INTERP_SHIFT_PP
9041
+ psrad m2, INTERP_SHIFT_PP
9042
+ psrad m3, INTERP_SHIFT_PP
9043
+
9044
+ packssdw m0, m1
9045
+ packssdw m2, m3
9046
+ CLIPW2 m0, m2, m20, m21
9047
+%else
9048
+ psrad m0, INTERP_SHIFT_PS
9049
+ psrad m1, INTERP_SHIFT_PS
9050
+ psrad m2, INTERP_SHIFT_PS
9051
+ psrad m3, INTERP_SHIFT_PS
9052
+
9053
+ packssdw m0, m1
9054
+ packssdw m2, m3
9055
+%endif
9056
+
9057
+ movu [r2], m0
9058
+ movu [r2 + r3], m2
9059
+%endmacro
9060
+;-----------------------------------------------------------------------------------------------------------------
9061
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
9062
+;-----------------------------------------------------------------------------------------------------------------
9063
+%macro FILTER_VER_P_LUMA_32xN_AVX512 2
9064
+INIT_ZMM avx512
9065
+cglobal interp_8tap_vert_%1_32x%2, 5, 8, 22
9066
+ add r1d, r1d
9067
+ add r3d, r3d
9068
+ shl r4d, 8
9069
+%ifdef PIC
9070
+ lea r5, [tab_LumaCoeffVer_avx512]
9071
+ mova m15, [r5 + r4]
9072
+ mova m16, [r5 + r4 + 1 * mmsize]
9073
+ mova m17, [r5 + r4 + 2 * mmsize]
9074
+ mova m18, [r5 + r4 + 3 * mmsize]
9075
+%else
9076
+ lea r5, [tab_LumaCoeffVer_avx512 + r4]
9077
+ mova m15, [r5]
9078
+ mova m16, [r5 + 1 * mmsize]
9079
+ mova m17, [r5 + 2 * mmsize]
9080
+ mova m18, [r5 + 3 * mmsize]
9081
+%endif
9082
+%ifidn %1, pp
9083
+ vbroadcasti32x4 m19, [INTERP_OFFSET_PP]
9084
+ pxor m20, m20
9085
+ vbroadcasti32x8 m21, [pw_pixel_max]
9086
+%else
9087
+ vbroadcasti32x4 m19, [INTERP_OFFSET_PS]
9088
+%endif
9089
+ lea r7, [3 * r1]
9090
+ sub r0, r7
9091
+
9092
+%rep %2/2 - 1
9093
+ PROCESS_LUMA_VERT_P_32x2_AVX512 %1
9094
+ lea r0, [r0 + 2 * r1]
9095
+ lea r2, [r2 + 2 * r3]
9096
+%endrep
9097
+ PROCESS_LUMA_VERT_P_32x2_AVX512 %1
9098
+ RET
9099
+%endmacro
9100
+
9101
+%if ARCH_X86_64
9102
+ FILTER_VER_P_LUMA_32xN_AVX512 ps, 8
9103
+ FILTER_VER_P_LUMA_32xN_AVX512 ps, 16
9104
+ FILTER_VER_P_LUMA_32xN_AVX512 ps, 32
9105
+ FILTER_VER_P_LUMA_32xN_AVX512 ps, 24
9106
+ FILTER_VER_P_LUMA_32xN_AVX512 ps, 64
9107
+ FILTER_VER_P_LUMA_32xN_AVX512 pp, 8
9108
+ FILTER_VER_P_LUMA_32xN_AVX512 pp, 16
9109
+ FILTER_VER_P_LUMA_32xN_AVX512 pp, 32
9110
+ FILTER_VER_P_LUMA_32xN_AVX512 pp, 24
9111
+ FILTER_VER_P_LUMA_32xN_AVX512 pp, 64
9112
+%endif
9113
+
9114
+%macro PROCESS_LUMA_VERT_P_48x4_AVX512 1
9115
+ PROCESS_LUMA_VERT_P_32x2_AVX512 %1
9116
+ movu m1, [r0 + 2 * r1]
9117
+ movu m3, [r0 + r7]
9118
+ punpcklwd m0, m1, m3
9119
+ pmaddwd m0, m15
9120
+ punpckhwd m1, m3
9121
+ pmaddwd m1, m15
9122
+
9123
+ movu m4, [r0 + 4 * r1]
9124
+ punpcklwd m2, m3, m4
9125
+ pmaddwd m2, m15
9126
+ punpckhwd m3, m4
9127
+ pmaddwd m3, m15
9128
+
9129
+ movu m5, [r6 + r1]
9130
+ punpcklwd m6, m4, m5
9131
+ pmaddwd m6, m16
9132
+ punpckhwd m4, m5
9133
+ pmaddwd m4, m16
9134
+
9135
+ paddd m0, m6
9136
+ paddd m1, m4
9137
+
9138
+ movu m4, [r6 + 2 * r1]
9139
+ punpcklwd m6, m5, m4
9140
+ pmaddwd m6, m16
9141
+ punpckhwd m5, m4
9142
+ pmaddwd m5, m16
9143
+
9144
+ paddd m2, m6
9145
+ paddd m3, m5
9146
+
9147
+ lea r4, [r6 + 4 * r1]
9148
+
9149
+ movu m11, [r6 + r7]
9150
+ punpcklwd m8, m4, m11
9151
+ pmaddwd m8, m17
9152
+ punpckhwd m4, m11
9153
+ pmaddwd m4, m17
9154
+
9155
+ movu m12, [r6 + 4 * r1]
9156
+ punpcklwd m10, m11, m12
9157
+ pmaddwd m10, m17
9158
+ punpckhwd m11, m12
9159
+ pmaddwd m11, m17
9160
+
9161
+ movu m13, [r4 + r1]
9162
+ punpcklwd m14, m12, m13
9163
+ pmaddwd m14, m18
9164
+ punpckhwd m12, m13
9165
+ pmaddwd m12, m18
9166
+
9167
+ paddd m8, m14
9168
+ paddd m4, m12
9169
+ paddd m0, m8
9170
+ paddd m1, m4
9171
+
9172
+ movu m12, [r4 + 2 * r1]
9173
+ punpcklwd m14, m13, m12
9174
+ pmaddwd m14, m18
9175
+ punpckhwd m13, m12
9176
+ pmaddwd m13, m18
9177
+
9178
+ paddd m10, m14
9179
+ paddd m11, m13
9180
+ paddd m2, m10
9181
+ paddd m3, m11
9182
+
9183
+ paddd m0, m19
9184
+ paddd m1, m19
9185
+ paddd m2, m19
9186
+ paddd m3, m19
9187
+
9188
+%ifidn %1, pp
9189
+ psrad m0, INTERP_SHIFT_PP
9190
+ psrad m1, INTERP_SHIFT_PP
9191
+ psrad m2, INTERP_SHIFT_PP
9192
+ psrad m3, INTERP_SHIFT_PP
9193
+
9194
+ packssdw m0, m1
9195
+ packssdw m2, m3
9196
+ CLIPW2 m0, m2, m20, m21
9197
+%else
9198
+ psrad m0, INTERP_SHIFT_PS
9199
+ psrad m1, INTERP_SHIFT_PS
9200
+ psrad m2, INTERP_SHIFT_PS
9201
+ psrad m3, INTERP_SHIFT_PS
9202
+
9203
+ packssdw m0, m1
9204
+ packssdw m2, m3
9205
+%endif
9206
+ movu [r2 + 2 * r3], m0
9207
+ movu [r2 + r8], m2
9208
+
9209
+ movu ym1, [r0 + mmsize]
9210
+ movu ym3, [r0 + r1 + mmsize]
9211
+ vinserti32x8 m1, [r0 + 2 * r1 + mmsize], 1
9212
+ vinserti32x8 m3, [r0 + r7 + mmsize], 1
9213
+ punpcklwd m0, m1, m3
9214
+ pmaddwd m0, m15
9215
+ punpckhwd m1, m3
9216
+ pmaddwd m1, m15
9217
+
9218
+ movu ym4, [r0 + 2 * r1 + mmsize]
9219
+ vinserti32x8 m4, [r0 + 4 * r1 + mmsize], 1
9220
+ punpcklwd m2, m3, m4
9221
+ pmaddwd m2, m15
9222
+ punpckhwd m3, m4
9223
+ pmaddwd m3, m15
9224
+
9225
+ movu ym5, [r0 + r7 + mmsize]
9226
+ vinserti32x8 m5, [r6 + r1 + mmsize], 1
9227
+ punpcklwd m6, m4, m5
9228
+ pmaddwd m6, m16
9229
+ punpckhwd m4, m5
9230
+ pmaddwd m4, m16
9231
+
9232
+ paddd m0, m6
9233
+ paddd m1, m4
9234
+
9235
+ movu ym4, [r6 + mmsize]
9236
+ vinserti32x8 m4, [r6 + 2 * r1 + mmsize], 1
9237
+ punpcklwd m6, m5, m4
9238
+ pmaddwd m6, m16
9239
+ punpckhwd m5, m4
9240
+ pmaddwd m5, m16
9241
+
9242
+ paddd m2, m6
9243
+ paddd m3, m5
9244
+
9245
+ movu ym11, [r6 + r1 + mmsize]
9246
+ vinserti32x8 m11, [r6 + r7 + mmsize], 1
9247
+ punpcklwd m8, m4, m11
9248
+ pmaddwd m8, m17
9249
+ punpckhwd m4, m11
9250
+ pmaddwd m4, m17
9251
+
9252
+ movu ym12, [r6 + 2 * r1 + mmsize]
9253
+ vinserti32x8 m12, [r4 + mmsize], 1
9254
+ punpcklwd m10, m11, m12
9255
+ pmaddwd m10, m17
9256
+ punpckhwd m11, m12
9257
+ pmaddwd m11, m17
9258
+
9259
+ movu ym13, [r6 + r7 + mmsize]
9260
+ vinserti32x8 m13, [r4 + r1 + mmsize], 1
9261
+ punpcklwd m14, m12, m13
9262
+ pmaddwd m14, m18
9263
+ punpckhwd m12, m13
9264
+ pmaddwd m12, m18
9265
+
9266
+ paddd m8, m14
9267
+ paddd m4, m12
9268
+ paddd m0, m8
9269
+ paddd m1, m4
9270
+
9271
+ movu ym12, [r4 + mmsize]
9272
+ vinserti32x8 m12, [r4 + 2 * r1 + mmsize], 1
9273
+ punpcklwd m14, m13, m12
9274
+ pmaddwd m14, m18
9275
+ punpckhwd m13, m12
9276
+ pmaddwd m13, m18
9277
+
9278
+ paddd m10, m14
9279
+ paddd m11, m13
9280
+ paddd m2, m10
9281
+ paddd m3, m11
9282
+
9283
+ paddd m0, m19
9284
+ paddd m1, m19
9285
+ paddd m2, m19
9286
+ paddd m3, m19
9287
+
9288
+%ifidn %1, pp
9289
+ psrad m0, INTERP_SHIFT_PP
9290
+ psrad m1, INTERP_SHIFT_PP
9291
+ psrad m2, INTERP_SHIFT_PP
9292
+ psrad m3, INTERP_SHIFT_PP
9293
+
9294
+ packssdw m0, m1
9295
+ packssdw m2, m3
9296
+ CLIPW2 m0, m2, m20, m21
9297
+%else
9298
+ psrad m0, INTERP_SHIFT_PS
9299
+ psrad m1, INTERP_SHIFT_PS
9300
+ psrad m2, INTERP_SHIFT_PS
9301
+ psrad m3, INTERP_SHIFT_PS
9302
+
9303
+ packssdw m0, m1
9304
+ packssdw m2, m3
9305
+%endif
9306
+
9307
+ movu [r2 + mmsize], ym0
9308
+ movu [r2 + r3 + mmsize], ym2
9309
+ vextracti32x8 [r2 + 2 * r3 + mmsize], m0, 1
9310
+ vextracti32x8 [r2 + r8 + mmsize], m2, 1
9311
+%endmacro
9312
+;-----------------------------------------------------------------------------------------------------------------
9313
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
9314
+;-----------------------------------------------------------------------------------------------------------------
9315
+%macro FILTER_VER_P_LUMA_48x64_AVX512 1
9316
+INIT_ZMM avx512
9317
+cglobal interp_8tap_vert_%1_48x64, 5, 9, 22
9318
+ add r1d, r1d
9319
+ add r3d, r3d
9320
+ shl r4d, 8
9321
+%ifdef PIC
9322
+ lea r5, [tab_LumaCoeffVer_avx512]
9323
+ mova m15, [r5 + r4]
9324
+ mova m16, [r5 + r4 + 1 * mmsize]
9325
+ mova m17, [r5 + r4 + 2 * mmsize]
9326
+ mova m18, [r5 + r4 + 3 * mmsize]
9327
+%else
9328
+ lea r5, [tab_LumaCoeffVer_avx512 + r4]
9329
+ mova m15, [r5]
9330
+ mova m16, [r5 + 1 * mmsize]
9331
+ mova m17, [r5 + 2 * mmsize]
9332
+ mova m18, [r5 + 3 * mmsize]
9333
+%endif
9334
+%ifidn %1, pp
9335
+ vbroadcasti32x4 m19, [INTERP_OFFSET_PP]
9336
+ pxor m20, m20
9337
+ vbroadcasti32x8 m21, [pw_pixel_max]
9338
+%else
9339
+ vbroadcasti32x4 m19, [INTERP_OFFSET_PS]
9340
+%endif
9341
+ lea r7, [3 * r1]
9342
+ lea r8, [3 * r3]
9343
+ sub r0, r7
9344
+
9345
+%rep 15
9346
+ PROCESS_LUMA_VERT_P_48x4_AVX512 %1
9347
+ lea r0, [r0 + 4 * r1]
9348
+ lea r2, [r2 + 4 * r3]
9349
+%endrep
9350
+ PROCESS_LUMA_VERT_P_48x4_AVX512 %1
9351
+ RET
9352
+%endmacro
9353
+
9354
+%if ARCH_X86_64
9355
+ FILTER_VER_P_LUMA_48x64_AVX512 ps
9356
+ FILTER_VER_P_LUMA_48x64_AVX512 pp
9357
+%endif
9358
+
9359
+%macro PROCESS_LUMA_VERT_P_64x2_AVX512 1
9360
+ PROCESS_LUMA_VERT_P_32x2_AVX512 %1
9361
+ movu m1, [r0 + mmsize]
9362
+ movu m3, [r0 + r1 + mmsize]
9363
+ punpcklwd m0, m1, m3
9364
+ pmaddwd m0, m15
9365
+ punpckhwd m1, m3
9366
+ pmaddwd m1, m15
9367
+
9368
+ movu m4, [r0 + 2 * r1 + mmsize]
9369
+ punpcklwd m2, m3, m4
9370
+ pmaddwd m2, m15
9371
+ punpckhwd m3, m4
9372
+ pmaddwd m3, m15
9373
+
9374
+ movu m5, [r0 + r7 + mmsize]
9375
+ punpcklwd m6, m4, m5
9376
+ pmaddwd m6, m16
9377
+ punpckhwd m4, m5
9378
+ pmaddwd m4, m16
9379
+
9380
+ paddd m0, m6
9381
+ paddd m1, m4
9382
+
9383
+ movu m4, [r0 + 4 * r1 + mmsize]
9384
+ punpcklwd m6, m5, m4
9385
+ pmaddwd m6, m16
9386
+ punpckhwd m5, m4
9387
+ pmaddwd m5, m16
9388
+
9389
+ paddd m2, m6
9390
+ paddd m3, m5
9391
+
9392
+ movu m11, [r6 + r1 + mmsize]
9393
+ punpcklwd m8, m4, m11
9394
+ pmaddwd m8, m17
9395
+ punpckhwd m4, m11
9396
+ pmaddwd m4, m17
9397
+
9398
+ movu m12, [r6 + 2 * r1 + mmsize]
9399
+ punpcklwd m10, m11, m12
9400
+ pmaddwd m10, m17
9401
+ punpckhwd m11, m12
9402
+ pmaddwd m11, m17
9403
+
9404
+ movu m13, [r6 + r7 + mmsize]
9405
+ punpcklwd m14, m12, m13
9406
+ pmaddwd m14, m18
9407
+ punpckhwd m12, m13
9408
+ pmaddwd m12, m18
9409
+
9410
+ paddd m8, m14
9411
+ paddd m4, m12
9412
+ paddd m0, m8
9413
+ paddd m1, m4
9414
+
9415
+ movu m12, [r6 + 4 * r1 + mmsize]
9416
+ punpcklwd m14, m13, m12
9417
+ pmaddwd m14, m18
9418
+ punpckhwd m13, m12
9419
+ pmaddwd m13, m18
9420
+
9421
+ paddd m10, m14
9422
+ paddd m11, m13
9423
+ paddd m2, m10
9424
+ paddd m3, m11
9425
+
9426
+ paddd m0, m19
9427
+ paddd m1, m19
9428
+ paddd m2, m19
9429
+ paddd m3, m19
9430
+
9431
+%ifidn %1, pp
9432
+ psrad m0, INTERP_SHIFT_PP
9433
+ psrad m1, INTERP_SHIFT_PP
9434
+ psrad m2, INTERP_SHIFT_PP
9435
+ psrad m3, INTERP_SHIFT_PP
9436
+
9437
+ packssdw m0, m1
9438
+ packssdw m2, m3
9439
+ CLIPW2 m0, m2, m20, m21
9440
+%else
9441
+ psrad m0, INTERP_SHIFT_PS
9442
+ psrad m1, INTERP_SHIFT_PS
9443
+ psrad m2, INTERP_SHIFT_PS
9444
+ psrad m3, INTERP_SHIFT_PS
9445
+
9446
+ packssdw m0, m1
9447
+ packssdw m2, m3
9448
+%endif
9449
+
9450
+ movu [r2 + mmsize], m0
9451
+ movu [r2 + r3 + mmsize], m2
9452
+%endmacro
9453
+;-----------------------------------------------------------------------------------------------------------------
9454
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
9455
+;-----------------------------------------------------------------------------------------------------------------
9456
+%macro FILTER_VER_P_LUMA_64xN_AVX512 2
9457
+INIT_ZMM avx512
9458
+cglobal interp_8tap_vert_%1_64x%2, 5, 8, 22
9459
+ add r1d, r1d
9460
+ add r3d, r3d
9461
+ shl r4d, 8
9462
+%ifdef PIC
9463
+ lea r5, [tab_LumaCoeffVer_avx512]
9464
+ mova m15, [r5 + r4]
9465
+ mova m16, [r5 + r4 + 1 * mmsize]
9466
+ mova m17, [r5 + r4 + 2 * mmsize]
9467
+ mova m18, [r5 + r4 + 3 * mmsize]
9468
+%else
9469
+ lea r5, [tab_LumaCoeffVer_avx512 + r4]
9470
+ mova m15, [r5]
9471
+ mova m16, [r5 + 1 * mmsize]
9472
+ mova m17, [r5 + 2 * mmsize]
9473
+ mova m18, [r5 + 3 * mmsize]
9474
+%endif
9475
+%ifidn %1, pp
9476
+ vbroadcasti32x4 m19, [INTERP_OFFSET_PP]
9477
+ pxor m20, m20
9478
+ vbroadcasti32x8 m21, [pw_pixel_max]
9479
+%else
9480
+ vbroadcasti32x4 m19, [INTERP_OFFSET_PS]
9481
+%endif
9482
+ lea r7, [3 * r1]
9483
+ sub r0, r7
9484
+
9485
+%rep %2/2 - 1
9486
+ PROCESS_LUMA_VERT_P_64x2_AVX512 %1
9487
+ lea r0, [r0 + 2 * r1]
9488
+ lea r2, [r2 + 2 * r3]
9489
+%endrep
9490
+ PROCESS_LUMA_VERT_P_64x2_AVX512 %1
9491
+ RET
9492
+%endmacro
9493
+
9494
+%if ARCH_X86_64
9495
+ FILTER_VER_P_LUMA_64xN_AVX512 ps, 16
9496
+ FILTER_VER_P_LUMA_64xN_AVX512 ps, 32
9497
+ FILTER_VER_P_LUMA_64xN_AVX512 ps, 48
9498
+ FILTER_VER_P_LUMA_64xN_AVX512 ps, 64
9499
+ FILTER_VER_P_LUMA_64xN_AVX512 pp, 16
9500
+ FILTER_VER_P_LUMA_64xN_AVX512 pp, 32
9501
+ FILTER_VER_P_LUMA_64xN_AVX512 pp, 48
9502
+ FILTER_VER_P_LUMA_64xN_AVX512 pp, 64
9503
+%endif
9504
+;-------------------------------------------------------------------------------------------------------------
9505
+;avx512 luma_vpp and luma_vps code end
9506
+;-------------------------------------------------------------------------------------------------------------
9507
+;-------------------------------------------------------------------------------------------------------------
9508
+;ipfilter_luma_avx512 code end
9509
+;-------------------------------------------------------------------------------------------------------------
9510
x265_2.7.tar.gz/source/common/x86/ipfilter8.asm -> x265_2.9.tar.gz/source/common/x86/ipfilter8.asm
Changed
5651
1
2
%include "x86inc.asm"
3
%include "x86util.asm"
4
5
-SECTION_RODATA 32
6
+SECTION_RODATA 64
7
const tab_Tm, db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
8
db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
9
db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14
10
11
12
const pd_526336, times 8 dd 8192*64+2048
13
14
+const tab_ChromaCoeff, db 0, 64, 0, 0
15
+ db -2, 58, 10, -2
16
+ db -4, 54, 16, -2
17
+ db -6, 46, 28, -4
18
+ db -4, 36, 36, -4
19
+ db -4, 28, 46, -6
20
+ db -2, 16, 54, -4
21
+ db -2, 10, 58, -2
22
+
23
const tab_LumaCoeff, db 0, 0, 0, 64, 0, 0, 0, 0
24
db -1, 4, -10, 58, 17, -5, 1, 0
25
db -1, 4, -11, 40, 40, -11, 4, -1
26
27
times 16 db 58, -10
28
times 16 db 4, -1
29
30
+ALIGN 64
31
+const tab_ChromaCoeffVer_32_avx512, times 32 db 0, 64
32
+ times 32 db 0, 0
33
+
34
+ times 32 db -2, 58
35
+ times 32 db 10, -2
36
+
37
+ times 32 db -4, 54
38
+ times 32 db 16, -2
39
+
40
+ times 32 db -6, 46
41
+ times 32 db 28, -4
42
+
43
+ times 32 db -4, 36
44
+ times 32 db 36, -4
45
+
46
+ times 32 db -4, 28
47
+ times 32 db 46, -6
48
+
49
+ times 32 db -2, 16
50
+ times 32 db 54, -4
51
+
52
+ times 32 db -2, 10
53
+ times 32 db 58, -2
54
+
55
+ALIGN 64
56
+const pw_ChromaCoeffVer_32_avx512, times 16 dw 0, 64
57
+ times 16 dw 0, 0
58
+
59
+ times 16 dw -2, 58
60
+ times 16 dw 10, -2
61
+
62
+ times 16 dw -4, 54
63
+ times 16 dw 16, -2
64
+
65
+ times 16 dw -6, 46
66
+ times 16 dw 28, -4
67
+
68
+ times 16 dw -4, 36
69
+ times 16 dw 36, -4
70
+
71
+ times 16 dw -4, 28
72
+ times 16 dw 46, -6
73
+
74
+ times 16 dw -2, 16
75
+ times 16 dw 54, -4
76
+
77
+ times 16 dw -2, 10
78
+ times 16 dw 58, -2
79
+
80
+ALIGN 64
81
+const pw_LumaCoeffVer_avx512, times 16 dw 0, 0
82
+ times 16 dw 0, 64
83
+ times 16 dw 0, 0
84
+ times 16 dw 0, 0
85
+
86
+ times 16 dw -1, 4
87
+ times 16 dw -10, 58
88
+ times 16 dw 17, -5
89
+ times 16 dw 1, 0
90
+
91
+ times 16 dw -1, 4
92
+ times 16 dw -11, 40
93
+ times 16 dw 40, -11
94
+ times 16 dw 4, -1
95
+
96
+ times 16 dw 0, 1
97
+ times 16 dw -5, 17
98
+ times 16 dw 58, -10
99
+ times 16 dw 4, -1
100
+
101
+ALIGN 64
102
+const tab_LumaCoeffVer_32_avx512, times 32 db 0, 0
103
+ times 32 db 0, 64
104
+ times 32 db 0, 0
105
+ times 32 db 0, 0
106
+
107
+ times 32 db -1, 4
108
+ times 32 db -10, 58
109
+ times 32 db 17, -5
110
+ times 32 db 1, 0
111
+
112
+ times 32 db -1, 4
113
+ times 32 db -11, 40
114
+ times 32 db 40, -11
115
+ times 32 db 4, -1
116
+
117
+ times 32 db 0, 1
118
+ times 32 db -5, 17
119
+ times 32 db 58, -10
120
+ times 32 db 4, -1
121
+
122
const tab_c_64_n64, times 8 db 64, -64
123
124
const interp8_hps_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7
125
126
-SECTION .text
127
+const interp4_horiz_shuf_load1_avx512, times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
128
+const interp4_horiz_shuf_load2_avx512, times 2 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
129
+const interp4_horiz_shuf_load3_avx512, times 2 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
130
+
131
+ALIGN 64
132
+interp4_vps_store1_avx512: dq 0, 1, 8, 9, 2, 3, 10, 11
133
+interp4_vps_store2_avx512: dq 4, 5, 12, 13, 6, 7, 14, 15
134
+const interp4_hps_shuf_avx512, dq 0, 4, 1, 5, 2, 6, 3, 7
135
+const interp4_hps_store_16xN_avx512, dq 0, 2, 1, 3, 4, 6, 5, 7
136
+const interp8_hps_store_avx512, dq 0, 1, 4, 5, 2, 3, 6, 7
137
+const interp8_vsp_store_avx512, dq 0, 2, 4, 6, 1, 3, 5, 7
138
139
+SECTION .text
140
cextern pb_128
141
cextern pw_1
142
cextern pw_32
143
144
P2S_H_32xN_avx2 48
145
146
;-----------------------------------------------------------------------------
147
+;p2s and p2s_aligned 32xN avx512 code start
148
+;-----------------------------------------------------------------------------
149
+
150
+%macro PROCESS_P2S_32x4_AVX512 0
151
+ pmovzxbw m0, [r0]
152
+ pmovzxbw m1, [r0 + r1]
153
+ pmovzxbw m2, [r0 + r1 * 2]
154
+ pmovzxbw m3, [r0 + r5]
155
+
156
+ psllw m0, 6
157
+ psllw m1, 6
158
+ psllw m2, 6
159
+ psllw m3, 6
160
+ psubw m0, m4
161
+ psubw m1, m4
162
+ psubw m2, m4
163
+ psubw m3, m4
164
+
165
+ movu [r2], m0
166
+ movu [r2 + r3], m1
167
+ movu [r2 + r3 * 2], m2
168
+ movu [r2 + r6], m3
169
+%endmacro
170
+
171
+;-----------------------------------------------------------------------------
172
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
173
+;-----------------------------------------------------------------------------
174
+%if ARCH_X86_64
175
+INIT_ZMM avx512
176
+cglobal filterPixelToShort_32x8, 3, 7, 5
177
+ mov r3d, r3m
178
+ add r3d, r3d
179
+ lea r5, [r1 * 3]
180
+ lea r6, [r3 * 3]
181
+
182
+ ; load constant
183
+ vpbroadcastd m4, [pw_2000]
184
+
185
+ PROCESS_P2S_32x4_AVX512
186
+ lea r0, [r0 + r1 * 4]
187
+ lea r2, [r2 + r3 * 4]
188
+ PROCESS_P2S_32x4_AVX512
189
+ RET
190
+
191
+INIT_ZMM avx512
192
+cglobal filterPixelToShort_32x16, 3, 7, 5
193
+ mov r3d, r3m
194
+ add r3d, r3d
195
+ lea r5, [r1 * 3]
196
+ lea r6, [r3 * 3]
197
+
198
+ ; load constant
199
+ vpbroadcastd m4, [pw_2000]
200
+
201
+%rep 3
202
+ PROCESS_P2S_32x4_AVX512
203
+ lea r0, [r0 + r1 * 4]
204
+ lea r2, [r2 + r3 * 4]
205
+%endrep
206
+ PROCESS_P2S_32x4_AVX512
207
+ RET
208
+
209
+INIT_ZMM avx512
210
+cglobal filterPixelToShort_32x24, 3, 7, 5
211
+ mov r3d, r3m
212
+ add r3d, r3d
213
+ lea r5, [r1 * 3]
214
+ lea r6, [r3 * 3]
215
+
216
+ ; load constant
217
+ vpbroadcastd m4, [pw_2000]
218
+
219
+%rep 5
220
+ PROCESS_P2S_32x4_AVX512
221
+ lea r0, [r0 + r1 * 4]
222
+ lea r2, [r2 + r3 * 4]
223
+%endrep
224
+ PROCESS_P2S_32x4_AVX512
225
+ RET
226
+
227
+INIT_ZMM avx512
228
+cglobal filterPixelToShort_32x32, 3, 7, 5
229
+ mov r3d, r3m
230
+ add r3d, r3d
231
+ lea r5, [r1 * 3]
232
+ lea r6, [r3 * 3]
233
+
234
+ ; load constant
235
+ vpbroadcastd m4, [pw_2000]
236
+
237
+%rep 7
238
+ PROCESS_P2S_32x4_AVX512
239
+ lea r0, [r0 + r1 * 4]
240
+ lea r2, [r2 + r3 * 4]
241
+%endrep
242
+ PROCESS_P2S_32x4_AVX512
243
+ RET
244
+
245
+INIT_ZMM avx512
246
+cglobal filterPixelToShort_32x48, 3, 7, 5
247
+ mov r3d, r3m
248
+ add r3d, r3d
249
+ lea r5, [r1 * 3]
250
+ lea r6, [r3 * 3]
251
+
252
+ ; load constant
253
+ vpbroadcastd m4, [pw_2000]
254
+
255
+%rep 11
256
+ PROCESS_P2S_32x4_AVX512
257
+ lea r0, [r0 + r1 * 4]
258
+ lea r2, [r2 + r3 * 4]
259
+%endrep
260
+ PROCESS_P2S_32x4_AVX512
261
+ RET
262
+
263
+INIT_ZMM avx512
264
+cglobal filterPixelToShort_32x64, 3, 7, 5
265
+ mov r3d, r3m
266
+ add r3d, r3d
267
+ lea r5, [r1 * 3]
268
+ lea r6, [r3 * 3]
269
+
270
+ ; load constant
271
+ vpbroadcastd m4, [pw_2000]
272
+
273
+%rep 15
274
+ PROCESS_P2S_32x4_AVX512
275
+ lea r0, [r0 + r1 * 4]
276
+ lea r2, [r2 + r3 * 4]
277
+%endrep
278
+ PROCESS_P2S_32x4_AVX512
279
+ RET
280
+%endif
281
+
282
+%macro PROCESS_P2S_ALIGNED_32x4_AVX512 0
283
+ pmovzxbw m0, [r0]
284
+ pmovzxbw m1, [r0 + r1]
285
+ pmovzxbw m2, [r0 + r1 * 2]
286
+ pmovzxbw m3, [r0 + r5]
287
+
288
+ psllw m0, 6
289
+ psllw m1, 6
290
+ psllw m2, 6
291
+ psllw m3, 6
292
+ psubw m0, m4
293
+ psubw m1, m4
294
+ psubw m2, m4
295
+ psubw m3, m4
296
+
297
+ mova [r2], m0
298
+ mova [r2 + r3], m1
299
+ mova [r2 + r3 * 2], m2
300
+ mova [r2 + r6], m3
301
+%endmacro
302
+
303
+;-----------------------------------------------------------------------------
304
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
305
+;-----------------------------------------------------------------------------
306
+%if ARCH_X86_64
307
+INIT_ZMM avx512
308
+cglobal filterPixelToShort_aligned_32x8, 3, 7, 5
309
+ mov r3d, r3m
310
+ add r3d, r3d
311
+ lea r5, [r1 * 3]
312
+ lea r6, [r3 * 3]
313
+
314
+ ; load constant
315
+ vpbroadcastd m4, [pw_2000]
316
+
317
+ PROCESS_P2S_ALIGNED_32x4_AVX512
318
+ lea r0, [r0 + r1 * 4]
319
+ lea r2, [r2 + r3 * 4]
320
+ PROCESS_P2S_ALIGNED_32x4_AVX512
321
+ RET
322
+
323
+INIT_ZMM avx512
324
+cglobal filterPixelToShort_aligned_32x16, 3, 7, 5
325
+ mov r3d, r3m
326
+ add r3d, r3d
327
+ lea r5, [r1 * 3]
328
+ lea r6, [r3 * 3]
329
+
330
+ ; load constant
331
+ vpbroadcastd m4, [pw_2000]
332
+
333
+%rep 3
334
+ PROCESS_P2S_ALIGNED_32x4_AVX512
335
+ lea r0, [r0 + r1 * 4]
336
+ lea r2, [r2 + r3 * 4]
337
+%endrep
338
+ PROCESS_P2S_ALIGNED_32x4_AVX512
339
+ RET
340
+
341
+INIT_ZMM avx512
342
+cglobal filterPixelToShort_aligned_32x24, 3, 7, 5
343
+ mov r3d, r3m
344
+ add r3d, r3d
345
+ lea r5, [r1 * 3]
346
+ lea r6, [r3 * 3]
347
+
348
+ ; load constant
349
+ vpbroadcastd m4, [pw_2000]
350
+
351
+%rep 5
352
+ PROCESS_P2S_ALIGNED_32x4_AVX512
353
+ lea r0, [r0 + r1 * 4]
354
+ lea r2, [r2 + r3 * 4]
355
+%endrep
356
+ PROCESS_P2S_ALIGNED_32x4_AVX512
357
+ RET
358
+
359
+INIT_ZMM avx512
360
+cglobal filterPixelToShort_aligned_32x32, 3, 7, 5
361
+ mov r3d, r3m
362
+ add r3d, r3d
363
+ lea r5, [r1 * 3]
364
+ lea r6, [r3 * 3]
365
+
366
+ ; load constant
367
+ vpbroadcastd m4, [pw_2000]
368
+
369
+%rep 7
370
+ PROCESS_P2S_ALIGNED_32x4_AVX512
371
+ lea r0, [r0 + r1 * 4]
372
+ lea r2, [r2 + r3 * 4]
373
+%endrep
374
+ PROCESS_P2S_ALIGNED_32x4_AVX512
375
+ RET
376
+
377
+INIT_ZMM avx512
378
+cglobal filterPixelToShort_aligned_32x48, 3, 7, 5
379
+ mov r3d, r3m
380
+ add r3d, r3d
381
+ lea r5, [r1 * 3]
382
+ lea r6, [r3 * 3]
383
+
384
+ ; load constant
385
+ vpbroadcastd m4, [pw_2000]
386
+
387
+%rep 11
388
+ PROCESS_P2S_ALIGNED_32x4_AVX512
389
+ lea r0, [r0 + r1 * 4]
390
+ lea r2, [r2 + r3 * 4]
391
+%endrep
392
+ PROCESS_P2S_ALIGNED_32x4_AVX512
393
+ RET
394
+
395
+INIT_ZMM avx512
396
+cglobal filterPixelToShort_aligned_32x64, 3, 7, 5
397
+ mov r3d, r3m
398
+ add r3d, r3d
399
+ lea r5, [r1 * 3]
400
+ lea r6, [r3 * 3]
401
+
402
+ ; load constant
403
+ vpbroadcastd m4, [pw_2000]
404
+
405
+%rep 15
406
+ PROCESS_P2S_ALIGNED_32x4_AVX512
407
+ lea r0, [r0 + r1 * 4]
408
+ lea r2, [r2 + r3 * 4]
409
+%endrep
410
+ PROCESS_P2S_ALIGNED_32x4_AVX512
411
+ RET
412
+%endif
413
+;-----------------------------------------------------------------------------
414
+;p2s and p2s_aligned 32xN avx512 code end
415
+;-----------------------------------------------------------------------------
416
+;-----------------------------------------------------------------------------
417
; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
418
;-----------------------------------------------------------------------------
419
%macro P2S_H_64xN 1
420
421
P2S_H_64xN_avx2 48
422
423
;-----------------------------------------------------------------------------
424
+;p2s and p2s_aligned 64xN avx512 code start
425
+;-----------------------------------------------------------------------------
426
+%macro PROCESS_P2S_64x4_AVX512 0
427
+ pmovzxbw m0, [r0]
428
+ pmovzxbw m1, [r0 + mmsize/2]
429
+ pmovzxbw m2, [r0 + r1]
430
+ pmovzxbw m3, [r0 + r1 + mmsize/2]
431
+
432
+ psllw m0, 6
433
+ psllw m1, 6
434
+ psllw m2, 6
435
+ psllw m3, 6
436
+ psubw m0, m4
437
+ psubw m1, m4
438
+ psubw m2, m4
439
+ psubw m3, m4
440
+ movu [r2], m0
441
+ movu [r2 + mmsize], m1
442
+ movu [r2 + r3], m2
443
+ movu [r2 + r3 + mmsize], m3
444
+
445
+ pmovzxbw m0, [r0 + r1 * 2]
446
+ pmovzxbw m1, [r0 + r1 * 2 + mmsize/2]
447
+ pmovzxbw m2, [r0 + r5]
448
+ pmovzxbw m3, [r0 + r5 + mmsize/2]
449
+
450
+ psllw m0, 6
451
+ psllw m1, 6
452
+ psllw m2, 6
453
+ psllw m3, 6
454
+ psubw m0, m4
455
+ psubw m1, m4
456
+ psubw m2, m4
457
+ psubw m3, m4
458
+ movu [r2 + r3 * 2], m0
459
+ movu [r2 + r3 * 2 + mmsize], m1
460
+ movu [r2 + r6], m2
461
+ movu [r2 + r6 + mmsize], m3
462
+%endmacro
463
+
464
+%macro PROCESS_P2S_ALIGNED_64x4_AVX512 0
465
+ pmovzxbw m0, [r0]
466
+ pmovzxbw m1, [r0 + mmsize/2]
467
+ pmovzxbw m2, [r0 + r1]
468
+ pmovzxbw m3, [r0 + r1 + mmsize/2]
469
+
470
+ psllw m0, 6
471
+ psllw m1, 6
472
+ psllw m2, 6
473
+ psllw m3, 6
474
+ psubw m0, m4
475
+ psubw m1, m4
476
+ psubw m2, m4
477
+ psubw m3, m4
478
+ mova [r2], m0
479
+ mova [r2 + mmsize], m1
480
+ mova [r2 + r3], m2
481
+ mova [r2 + r3 + mmsize], m3
482
+
483
+ pmovzxbw m0, [r0 + r1 * 2]
484
+ pmovzxbw m1, [r0 + r1 * 2 + mmsize/2]
485
+ pmovzxbw m2, [r0 + r5]
486
+ pmovzxbw m3, [r0 + r5 + mmsize/2]
487
+
488
+ psllw m0, 6
489
+ psllw m1, 6
490
+ psllw m2, 6
491
+ psllw m3, 6
492
+ psubw m0, m4
493
+ psubw m1, m4
494
+ psubw m2, m4
495
+ psubw m3, m4
496
+ mova [r2 + r3 * 2], m0
497
+ mova [r2 + r3 * 2 + mmsize], m1
498
+ mova [r2 + r6], m2
499
+ mova [r2 + r6 + mmsize], m3
500
+%endmacro
501
+;-----------------------------------------------------------------------------
502
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
503
+;-----------------------------------------------------------------------------
504
+%if ARCH_X86_64
505
+INIT_ZMM avx512
506
+cglobal filterPixelToShort_64x64, 3, 7, 5
507
+ mov r3d, r3m
508
+ add r3d, r3d
509
+ lea r5, [r1 * 3]
510
+ lea r6, [r3 * 3]
511
+
512
+ ; load constant
513
+ vpbroadcastd m4, [pw_2000]
514
+
515
+%rep 15
516
+ PROCESS_P2S_64x4_AVX512
517
+ lea r0, [r0 + r1 * 4]
518
+ lea r2, [r2 + r3 * 4]
519
+%endrep
520
+ PROCESS_P2S_64x4_AVX512
521
+ RET
522
+
523
+INIT_ZMM avx512
524
+cglobal filterPixelToShort_64x48, 3, 7, 5
525
+ mov r3d, r3m
526
+ add r3d, r3d
527
+ lea r5, [r1 * 3]
528
+ lea r6, [r3 * 3]
529
+
530
+ ; load constant
531
+ vpbroadcastd m4, [pw_2000]
532
+
533
+%rep 11
534
+ PROCESS_P2S_64x4_AVX512
535
+ lea r0, [r0 + r1 * 4]
536
+ lea r2, [r2 + r3 * 4]
537
+%endrep
538
+ PROCESS_P2S_64x4_AVX512
539
+ RET
540
+
541
+INIT_ZMM avx512
542
+cglobal filterPixelToShort_64x32, 3, 7, 5
543
+ mov r3d, r3m
544
+ add r3d, r3d
545
+ lea r5, [r1 * 3]
546
+ lea r6, [r3 * 3]
547
+
548
+ ; load constant
549
+ vpbroadcastd m4, [pw_2000]
550
+
551
+%rep 7
552
+ PROCESS_P2S_64x4_AVX512
553
+ lea r0, [r0 + r1 * 4]
554
+ lea r2, [r2 + r3 * 4]
555
+%endrep
556
+ PROCESS_P2S_64x4_AVX512
557
+ RET
558
+
559
+INIT_ZMM avx512
560
+cglobal filterPixelToShort_64x16, 3, 7, 5
561
+ mov r3d, r3m
562
+ add r3d, r3d
563
+ lea r5, [r1 * 3]
564
+ lea r6, [r3 * 3]
565
+
566
+ ; load constant
567
+ vpbroadcastd m4, [pw_2000]
568
+
569
+%rep 3
570
+ PROCESS_P2S_64x4_AVX512
571
+ lea r0, [r0 + r1 * 4]
572
+ lea r2, [r2 + r3 * 4]
573
+%endrep
574
+ PROCESS_P2S_64x4_AVX512
575
+ RET
576
+
577
+INIT_ZMM avx512
578
+cglobal filterPixelToShort_aligned_64x64, 3, 7, 5
579
+ mov r3d, r3m
580
+ add r3d, r3d
581
+ lea r5, [r1 * 3]
582
+ lea r6, [r3 * 3]
583
+
584
+ ; load constant
585
+ vpbroadcastd m4, [pw_2000]
586
+
587
+%rep 15
588
+ PROCESS_P2S_ALIGNED_64x4_AVX512
589
+ lea r0, [r0 + r1 * 4]
590
+ lea r2, [r2 + r3 * 4]
591
+%endrep
592
+ PROCESS_P2S_ALIGNED_64x4_AVX512
593
+ RET
594
+
595
+INIT_ZMM avx512
596
+cglobal filterPixelToShort_aligned_64x48, 3, 7, 5
597
+ mov r3d, r3m
598
+ add r3d, r3d
599
+ lea r5, [r1 * 3]
600
+ lea r6, [r3 * 3]
601
+
602
+ ; load constant
603
+ vpbroadcastd m4, [pw_2000]
604
+
605
+%rep 11
606
+ PROCESS_P2S_ALIGNED_64x4_AVX512
607
+ lea r0, [r0 + r1 * 4]
608
+ lea r2, [r2 + r3 * 4]
609
+%endrep
610
+ PROCESS_P2S_ALIGNED_64x4_AVX512
611
+ RET
612
+
613
+INIT_ZMM avx512
614
+cglobal filterPixelToShort_aligned_64x32, 3, 7, 5
615
+ mov r3d, r3m
616
+ add r3d, r3d
617
+ lea r5, [r1 * 3]
618
+ lea r6, [r3 * 3]
619
+
620
+ ; load constant
621
+ vpbroadcastd m4, [pw_2000]
622
+
623
+%rep 7
624
+ PROCESS_P2S_ALIGNED_64x4_AVX512
625
+ lea r0, [r0 + r1 * 4]
626
+ lea r2, [r2 + r3 * 4]
627
+%endrep
628
+ PROCESS_P2S_ALIGNED_64x4_AVX512
629
+ RET
630
+
631
+INIT_ZMM avx512
632
+cglobal filterPixelToShort_aligned_64x16, 3, 7, 5
633
+ mov r3d, r3m
634
+ add r3d, r3d
635
+ lea r5, [r1 * 3]
636
+ lea r6, [r3 * 3]
637
+
638
+ ; load constant
639
+ vpbroadcastd m4, [pw_2000]
640
+
641
+%rep 3
642
+ PROCESS_P2S_ALIGNED_64x4_AVX512
643
+ lea r0, [r0 + r1 * 4]
644
+ lea r2, [r2 + r3 * 4]
645
+%endrep
646
+ PROCESS_P2S_ALIGNED_64x4_AVX512
647
+ RET
648
+%endif
649
+;-----------------------------------------------------------------------------
650
+;p2s and p2s_aligned 64xN avx512 code end
651
+;-----------------------------------------------------------------------------
652
+
653
+;-----------------------------------------------------------------------------
654
; void filterPixelToShort(pixel src, intptr_t srcStride, int16_t dst, int16_t dstStride)
655
;-----------------------------------------------------------------------------
656
%macro P2S_H_12xN 1
657
658
jnz .loop
659
RET
660
661
+;-----------------------------------------------------------------------------
662
+;p2s and p2s_aligned 48xN avx512 code start
663
+;-----------------------------------------------------------------------------
664
+%macro PROCESS_P2S_48x8_AVX512 0
665
+ pmovzxbw m0, [r0]
666
+ pmovzxbw m1, [r0 + r1]
667
+ pmovzxbw m2, [r0 + r1 * 2]
668
+ pmovzxbw m3, [r0 + r5]
669
+ psllw m0, 6
670
+ psllw m1, 6
671
+ psllw m2, 6
672
+ psllw m3, 6
673
+ psubw m0, m4
674
+ psubw m1, m4
675
+ psubw m2, m4
676
+ psubw m3, m4
677
+ movu [r2], m0
678
+ movu [r2 + r3], m1
679
+ movu [r2 + r3 * 2], m2
680
+ movu [r2 + r6], m3
681
+
682
+ pmovzxbw ym0, [r0 + 32]
683
+ pmovzxbw ym1, [r0 + r1 + 32]
684
+ pmovzxbw ym2, [r0 + r1 * 2 + 32]
685
+ pmovzxbw ym3, [r0 + r5 + 32]
686
+ psllw ym0, 6
687
+ psllw ym1, 6
688
+ psllw ym2, 6
689
+ psllw ym3, 6
690
+ psubw ym0, ym4
691
+ psubw ym1, ym4
692
+ psubw ym2, ym4
693
+ psubw ym3, ym4
694
+ movu [r2 + 64], ym0
695
+ movu [r2 + r3 + 64], ym1
696
+ movu [r2 + r3 * 2 + 64], ym2
697
+ movu [r2 + r6 + 64], ym3
698
+
699
+ lea r0, [r0 + r1 * 4]
700
+ lea r2, [r2 + r3 * 4]
701
+
702
+ pmovzxbw m0, [r0]
703
+ pmovzxbw m1, [r0 + r1]
704
+ pmovzxbw m2, [r0 + r1 * 2]
705
+ pmovzxbw m3, [r0 + r5]
706
+ psllw m0, 6
707
+ psllw m1, 6
708
+ psllw m2, 6
709
+ psllw m3, 6
710
+ psubw m0, m4
711
+ psubw m1, m4
712
+ psubw m2, m4
713
+ psubw m3, m4
714
+ movu [r2], m0
715
+ movu [r2 + r3], m1
716
+ movu [r2 + r3 * 2], m2
717
+ movu [r2 + r6], m3
718
+
719
+ pmovzxbw ym0, [r0 + 32]
720
+ pmovzxbw ym1, [r0 + r1 + 32]
721
+ pmovzxbw ym2, [r0 + r1 * 2 + 32]
722
+ pmovzxbw ym3, [r0 + r5 + 32]
723
+ psllw ym0, 6
724
+ psllw ym1, 6
725
+ psllw ym2, 6
726
+ psllw ym3, 6
727
+ psubw ym0, ym4
728
+ psubw ym1, ym4
729
+ psubw ym2, ym4
730
+ psubw ym3, ym4
731
+ movu [r2 + 64], ym0
732
+ movu [r2 + r3 + 64], ym1
733
+ movu [r2 + r3 * 2 + 64], ym2
734
+ movu [r2 + r6 + 64], ym3
735
+%endmacro
736
+
737
+%macro PROCESS_P2S_ALIGNED_48x8_AVX512 0
738
+ pmovzxbw m0, [r0]
739
+ pmovzxbw m1, [r0 + r1]
740
+ pmovzxbw m2, [r0 + r1 * 2]
741
+ pmovzxbw m3, [r0 + r5]
742
+ psllw m0, 6
743
+ psllw m1, 6
744
+ psllw m2, 6
745
+ psllw m3, 6
746
+ psubw m0, m4
747
+ psubw m1, m4
748
+ psubw m2, m4
749
+ psubw m3, m4
750
+ mova [r2], m0
751
+ mova [r2 + r3], m1
752
+ mova [r2 + r3 * 2], m2
753
+ mova [r2 + r6], m3
754
+
755
+ pmovzxbw ym0, [r0 + 32]
756
+ pmovzxbw ym1, [r0 + r1 + 32]
757
+ pmovzxbw ym2, [r0 + r1 * 2 + 32]
758
+ pmovzxbw ym3, [r0 + r5 + 32]
759
+ psllw ym0, 6
760
+ psllw ym1, 6
761
+ psllw ym2, 6
762
+ psllw ym3, 6
763
+ psubw ym0, ym4
764
+ psubw ym1, ym4
765
+ psubw ym2, ym4
766
+ psubw ym3, ym4
767
+ mova [r2 + 64], ym0
768
+ mova [r2 + r3 + 64], ym1
769
+ mova [r2 + r3 * 2 + 64], ym2
770
+ mova [r2 + r6 + 64], ym3
771
+
772
+ lea r0, [r0 + r1 * 4]
773
+ lea r2, [r2 + r3 * 4]
774
+
775
+ pmovzxbw m0, [r0]
776
+ pmovzxbw m1, [r0 + r1]
777
+ pmovzxbw m2, [r0 + r1 * 2]
778
+ pmovzxbw m3, [r0 + r5]
779
+ psllw m0, 6
780
+ psllw m1, 6
781
+ psllw m2, 6
782
+ psllw m3, 6
783
+ psubw m0, m4
784
+ psubw m1, m4
785
+ psubw m2, m4
786
+ psubw m3, m4
787
+ mova [r2], m0
788
+ mova [r2 + r3], m1
789
+ mova [r2 + r3 * 2], m2
790
+ mova [r2 + r6], m3
791
+
792
+ pmovzxbw ym0, [r0 + 32]
793
+ pmovzxbw ym1, [r0 + r1 + 32]
794
+ pmovzxbw ym2, [r0 + r1 * 2 + 32]
795
+ pmovzxbw ym3, [r0 + r5 + 32]
796
+ psllw ym0, 6
797
+ psllw ym1, 6
798
+ psllw ym2, 6
799
+ psllw ym3, 6
800
+ psubw ym0, ym4
801
+ psubw ym1, ym4
802
+ psubw ym2, ym4
803
+ psubw ym3, ym4
804
+ mova [r2 + 64], ym0
805
+ mova [r2 + r3 + 64], ym1
806
+ mova [r2 + r3 * 2 + 64], ym2
807
+ mova [r2 + r6 + 64], ym3
808
+%endmacro
809
+;-----------------------------------------------------------------------------
810
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
811
+;-----------------------------------------------------------------------------
812
+%if ARCH_X86_64
813
+INIT_ZMM avx512
814
+cglobal filterPixelToShort_48x64, 3,7,5
815
+ mov r3d, r3m
816
+ add r3d, r3d
817
+ lea r5, [r1 * 3]
818
+ lea r6, [r3 * 3]
819
+
820
+ ; load constant
821
+ vpbroadcastd m4, [pw_2000]
822
+
823
+ PROCESS_P2S_48x8_AVX512
824
+ lea r0, [r0 + r1 * 4]
825
+ lea r2, [r2 + r3 * 4]
826
+ PROCESS_P2S_48x8_AVX512
827
+ lea r0, [r0 + r1 * 4]
828
+ lea r2, [r2 + r3 * 4]
829
+ PROCESS_P2S_48x8_AVX512
830
+ lea r0, [r0 + r1 * 4]
831
+ lea r2, [r2 + r3 * 4]
832
+ PROCESS_P2S_48x8_AVX512
833
+ lea r0, [r0 + r1 * 4]
834
+ lea r2, [r2 + r3 * 4]
835
+ PROCESS_P2S_48x8_AVX512
836
+ lea r0, [r0 + r1 * 4]
837
+ lea r2, [r2 + r3 * 4]
838
+ PROCESS_P2S_48x8_AVX512
839
+ lea r0, [r0 + r1 * 4]
840
+ lea r2, [r2 + r3 * 4]
841
+ PROCESS_P2S_48x8_AVX512
842
+ lea r0, [r0 + r1 * 4]
843
+ lea r2, [r2 + r3 * 4]
844
+ PROCESS_P2S_48x8_AVX512
845
+ RET
846
+
847
+INIT_ZMM avx512
848
+cglobal filterPixelToShort_aligned_48x64, 3,7,5
849
+ mov r3d, r3m
850
+ add r3d, r3d
851
+ lea r5, [r1 * 3]
852
+ lea r6, [r3 * 3]
853
+
854
+ ; load constant
855
+ vpbroadcastd m4, [pw_2000]
856
+
857
+ PROCESS_P2S_ALIGNED_48x8_AVX512
858
+ lea r0, [r0 + r1 * 4]
859
+ lea r2, [r2 + r3 * 4]
860
+ PROCESS_P2S_ALIGNED_48x8_AVX512
861
+ lea r0, [r0 + r1 * 4]
862
+ lea r2, [r2 + r3 * 4]
863
+ PROCESS_P2S_ALIGNED_48x8_AVX512
864
+ lea r0, [r0 + r1 * 4]
865
+ lea r2, [r2 + r3 * 4]
866
+ PROCESS_P2S_ALIGNED_48x8_AVX512
867
+ lea r0, [r0 + r1 * 4]
868
+ lea r2, [r2 + r3 * 4]
869
+ PROCESS_P2S_ALIGNED_48x8_AVX512
870
+ lea r0, [r0 + r1 * 4]
871
+ lea r2, [r2 + r3 * 4]
872
+ PROCESS_P2S_ALIGNED_48x8_AVX512
873
+ lea r0, [r0 + r1 * 4]
874
+ lea r2, [r2 + r3 * 4]
875
+ PROCESS_P2S_ALIGNED_48x8_AVX512
876
+ lea r0, [r0 + r1 * 4]
877
+ lea r2, [r2 + r3 * 4]
878
+ PROCESS_P2S_ALIGNED_48x8_AVX512
879
+ RET
880
+%endif
881
+;-----------------------------------------------------------------------------
882
+;p2s and p2s_aligned 48xN avx512 code end
883
+;-----------------------------------------------------------------------------
884
885
%macro PROCESS_LUMA_W4_4R 0
886
movd m0, [r0]
887
888
889
FILTER_VER_LUMA_S_AVX2_32x24 sp
890
FILTER_VER_LUMA_S_AVX2_32x24 ss
891
+;-------------------------------------------------------------------------------------------------------------
892
+;ipfilter_chroma_avx512 code start
893
+;-------------------------------------------------------------------------------------------------------------
894
+%macro PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512 0
895
+ ; register map
896
+ ; m0 - interpolate coeff
897
+ ; m1, m2 - shuffle order table
898
+ ; m3 - constant word 1
899
+ ; m4 - constant word 512
900
+
901
+ movu m5, [r0]
902
+ pshufb m6, m5, m2
903
+ pshufb m5, m5, m1
904
+ pmaddubsw m5, m0
905
+ pmaddubsw m6, m0
906
+ pmaddwd m5, m3
907
+ pmaddwd m6, m3
908
+
909
+ movu m7, [r0 + 4]
910
+ pshufb m8, m7, m2
911
+ pshufb m7, m7, m1
912
+ pmaddubsw m7, m0
913
+ pmaddubsw m8, m0
914
+ pmaddwd m7, m3
915
+ pmaddwd m8, m3
916
+
917
+ packssdw m5, m7
918
+ packssdw m6, m8
919
+ pmulhrsw m5, m4
920
+ pmulhrsw m6, m4
921
+ packuswb m5, m6
922
+ movu [r2], m5
923
+%endmacro
924
+
925
+%macro PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512 0
926
+ ; register map
927
+ ; m0 - interpolate coeff
928
+ ; m1, m2 - shuffle order table
929
+ ; m3 - constant word 1
930
+ ; m4 - constant word 512
931
+ ; m9 - store shuffle order table
932
+
933
+ movu ym5, [r0]
934
+ vinserti32x8 m5, [r0 + r1], 1
935
+ movu ym7, [r0 + 4]
936
+ vinserti32x8 m7, [r0 + r1 + 4], 1
937
+
938
+ pshufb m6, m5, m2
939
+ pshufb m5, m1
940
+ pshufb m8, m7, m2
941
+ pshufb m7, m1
942
+
943
+ pmaddubsw m5, m0
944
+ pmaddubsw m7, m0
945
+ pmaddwd m5, m3
946
+ pmaddwd m7, m3
947
+
948
+ pmaddubsw m6, m0
949
+ pmaddubsw m8, m0
950
+ pmaddwd m6, m3
951
+ pmaddwd m8, m3
952
+
953
+ packssdw m5, m7
954
+ packssdw m6, m8
955
+ pmulhrsw m5, m4
956
+ pmulhrsw m6, m4
957
+ packuswb m5, m6
958
+ movu [r2], ym5
959
+ vextracti32x8 [r2 + r3], m5, 1
960
+%endmacro
961
+
962
+%macro PROCESS_IPFILTER_CHROMA_PP_16x4_AVX512 0
963
+ ; register map
964
+ ; m0 - interpolate coeff
965
+ ; m1, m2 - shuffle order table
966
+ ; m3 - constant word 1
967
+ ; m4 - constant word 512
968
+
969
+ movu xm5, [r0]
970
+ vinserti32x4 m5, [r0 + r1], 1
971
+ vinserti32x4 m5, [r0 + 2 * r1], 2
972
+ vinserti32x4 m5, [r0 + r6], 3
973
+ pshufb m6, m5, m2
974
+ pshufb m5, m1
975
+
976
+ movu xm7, [r0 + 4]
977
+ vinserti32x4 m7, [r0 + r1 + 4], 1
978
+ vinserti32x4 m7, [r0 + 2 * r1 + 4], 2
979
+ vinserti32x4 m7, [r0 + r6 + 4], 3
980
+ pshufb m8, m7, m2
981
+ pshufb m7, m1
982
+
983
+ pmaddubsw m5, m0
984
+ pmaddubsw m7, m0
985
+ pmaddwd m5, m3
986
+ pmaddwd m7, m3
987
+
988
+ pmaddubsw m6, m0
989
+ pmaddubsw m8, m0
990
+ pmaddwd m6, m3
991
+ pmaddwd m8, m3
992
+
993
+ packssdw m5, m7
994
+ packssdw m6, m8
995
+ pmulhrsw m5, m4
996
+ pmulhrsw m6, m4
997
+ packuswb m5, m6
998
+ movu [r2], xm5
999
+ vextracti32x4 [r2 + r3], m5, 1
1000
+ vextracti32x4 [r2 + 2 * r3], m5, 2
1001
+ vextracti32x4 [r2 + r7], m5, 3
1002
+%endmacro
1003
+
1004
+%macro PROCESS_IPFILTER_CHROMA_PP_48x4_AVX512 0
1005
+ ; register map
1006
+ ; m0 - interpolate coeff
1007
+ ; m1, m2 - shuffle order table
1008
+ ; m3 - constant word 1
1009
+ ; m4 - constant word 512
1010
+ movu ym5, [r0]
1011
+ vinserti32x8 m5, [r0 + r1], 1
1012
+ movu ym7, [r0 + 4]
1013
+ vinserti32x8 m7, [r0 + r1 + 4], 1
1014
+
1015
+ pshufb m6, m5, m2
1016
+ pshufb m5, m1
1017
+ pshufb m8, m7, m2
1018
+ pshufb m7, m1
1019
+
1020
+ pmaddubsw m5, m0
1021
+ pmaddubsw m7, m0
1022
+ pmaddwd m5, m3
1023
+ pmaddwd m7, m3
1024
+
1025
+ pmaddubsw m6, m0
1026
+ pmaddubsw m8, m0
1027
+ pmaddwd m6, m3
1028
+ pmaddwd m8, m3
1029
+
1030
+ packssdw m5, m7
1031
+ packssdw m6, m8
1032
+ pmulhrsw m5, m4
1033
+ pmulhrsw m6, m4
1034
+ packuswb m5, m6
1035
+ movu [r2], ym5
1036
+ vextracti32x8 [r2 + r3], m5, 1
1037
+
1038
+ movu ym5, [r0 + 2 * r1]
1039
+ vinserti32x8 m5, [r0 + r6], 1
1040
+ movu ym7, [r0 + 2 * r1 + 4]
1041
+ vinserti32x8 m7, [r0 + r6 + 4], 1
1042
+
1043
+ pshufb m6, m5, m2
1044
+ pshufb m5, m1
1045
+ pshufb m8, m7, m2
1046
+ pshufb m7, m1
1047
+
1048
+ pmaddubsw m5, m0
1049
+ pmaddubsw m7, m0
1050
+ pmaddwd m5, m3
1051
+ pmaddwd m7, m3
1052
+
1053
+ pmaddubsw m6, m0
1054
+ pmaddubsw m8, m0
1055
+ pmaddwd m6, m3
1056
+ pmaddwd m8, m3
1057
+
1058
+ packssdw m5, m7
1059
+ packssdw m6, m8
1060
+ pmulhrsw m5, m4
1061
+ pmulhrsw m6, m4
1062
+ packuswb m5, m6
1063
+ movu [r2 + 2 * r3], ym5
1064
+ vextracti32x8 [r2 + r7], m5, 1
1065
+
1066
+ movu xm5, [r0 + mmsize/2]
1067
+ vinserti32x4 m5, [r0 + r1 + mmsize/2], 1
1068
+ vinserti32x4 m5, [r0 + 2 * r1 + mmsize/2], 2
1069
+ vinserti32x4 m5, [r0 + r6 + mmsize/2], 3
1070
+ pshufb m6, m5, m2
1071
+ pshufb m5, m1
1072
+
1073
+ movu xm7, [r0 + 36]
1074
+ vinserti32x4 m7, [r0 + r1 + 36], 1
1075
+ vinserti32x4 m7, [r0 + 2 * r1 + 36], 2
1076
+ vinserti32x4 m7, [r0 + r6 + 36], 3
1077
+ pshufb m8, m7, m2
1078
+ pshufb m7, m1
1079
+
1080
+ pmaddubsw m5, m0
1081
+ pmaddubsw m7, m0
1082
+ pmaddwd m5, m3
1083
+ pmaddwd m7, m3
1084
+
1085
+ pmaddubsw m6, m0
1086
+ pmaddubsw m8, m0
1087
+ pmaddwd m6, m3
1088
+ pmaddwd m8, m3
1089
+
1090
+ packssdw m5, m7
1091
+ packssdw m6, m8
1092
+ pmulhrsw m5, m4
1093
+ pmulhrsw m6, m4
1094
+ packuswb m5, m6
1095
+ movu [r2 + mmsize/2], xm5
1096
+ vextracti32x4 [r2 + r3 + mmsize/2], m5, 1
1097
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m5, 2
1098
+ vextracti32x4 [r2 + r7 + mmsize/2], m5, 3
1099
+%endmacro
1100
+
1101
+;-------------------------------------------------------------------------------------------------------------
1102
+; void interp_4tap_horiz_pp_64xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx
1103
+;-------------------------------------------------------------------------------------------------------------
1104
+%macro IPFILTER_CHROMA_PP_64xN_AVX512 1
1105
+INIT_ZMM avx512
1106
+cglobal interp_4tap_horiz_pp_64x%1, 4,6,9
1107
+ mov r4d, r4m
1108
+
1109
+%ifdef PIC
1110
+ lea r5, [tab_ChromaCoeff]
1111
+ vpbroadcastd m0, [r5 + r4 * 4]
1112
+%else
1113
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
1114
+%endif
1115
+
1116
+ vbroadcasti32x8 m1, [interp4_horiz_shuf_load1_avx512]
1117
+ vbroadcasti32x8 m2, [interp4_horiz_shuf_load2_avx512]
1118
+ vbroadcasti32x8 m3, [pw_1]
1119
+ vbroadcasti32x8 m4, [pw_512]
1120
+ dec r0
1121
+
1122
+%rep %1 - 1
1123
+ PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512
1124
+ lea r2, [r2 + r3]
1125
+ lea r0, [r0 + r1]
1126
+%endrep
1127
+ PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512
1128
+ RET
1129
+%endmacro
1130
+
1131
+%if ARCH_X86_64
1132
+ IPFILTER_CHROMA_PP_64xN_AVX512 64
1133
+ IPFILTER_CHROMA_PP_64xN_AVX512 32
1134
+ IPFILTER_CHROMA_PP_64xN_AVX512 48
1135
+ IPFILTER_CHROMA_PP_64xN_AVX512 16
1136
+%endif
1137
+
1138
+%macro IPFILTER_CHROMA_PP_32xN_AVX512 1
1139
+INIT_ZMM avx512
1140
+cglobal interp_4tap_horiz_pp_32x%1, 4,6,9
1141
+ mov r4d, r4m
1142
+
1143
+%ifdef PIC
1144
+ lea r5, [tab_ChromaCoeff]
1145
+ vpbroadcastd m0, [r5 + r4 * 4]
1146
+%else
1147
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
1148
+%endif
1149
+
1150
+ vbroadcasti32x8 m1, [interp4_horiz_shuf_load1_avx512]
1151
+ vbroadcasti32x8 m2, [interp4_horiz_shuf_load2_avx512]
1152
+ vbroadcasti32x8 m3, [pw_1]
1153
+ vbroadcasti32x8 m4, [pw_512]
1154
+ dec r0
1155
+
1156
+%rep %1/2 - 1
1157
+ PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512
1158
+ lea r2, [r2 + 2 * r3]
1159
+ lea r0, [r0 + 2 * r1]
1160
+%endrep
1161
+ PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512
1162
+ RET
1163
+%endmacro
1164
+
1165
+%if ARCH_X86_64
1166
+ IPFILTER_CHROMA_PP_32xN_AVX512 16
1167
+ IPFILTER_CHROMA_PP_32xN_AVX512 24
1168
+ IPFILTER_CHROMA_PP_32xN_AVX512 8
1169
+ IPFILTER_CHROMA_PP_32xN_AVX512 32
1170
+ IPFILTER_CHROMA_PP_32xN_AVX512 64
1171
+ IPFILTER_CHROMA_PP_32xN_AVX512 48
1172
+%endif
1173
+
1174
+%macro IPFILTER_CHROMA_PP_16xN_AVX512 1
1175
+INIT_ZMM avx512
1176
+cglobal interp_4tap_horiz_pp_16x%1, 4,8,9
1177
+ mov r4d, r4m
1178
+ lea r6, [3 * r1]
1179
+ lea r7, [3 * r3]
1180
+%ifdef PIC
1181
+ lea r5, [tab_ChromaCoeff]
1182
+ vpbroadcastd m0, [r5 + r4 * 4]
1183
+%else
1184
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
1185
+%endif
1186
+
1187
+ vbroadcasti32x8 m1, [interp4_horiz_shuf_load1_avx512]
1188
+ vbroadcasti32x8 m2, [interp4_horiz_shuf_load2_avx512]
1189
+ vbroadcasti32x8 m3, [pw_1]
1190
+ vbroadcasti32x8 m4, [pw_512]
1191
+ dec r0
1192
+
1193
+%rep %1/4 - 1
1194
+ PROCESS_IPFILTER_CHROMA_PP_16x4_AVX512
1195
+ lea r2, [r2 + 4 * r3]
1196
+ lea r0, [r0 + 4 * r1]
1197
+%endrep
1198
+ PROCESS_IPFILTER_CHROMA_PP_16x4_AVX512
1199
+ RET
1200
+%endmacro
1201
+
1202
+%if ARCH_X86_64
1203
+ IPFILTER_CHROMA_PP_16xN_AVX512 4
1204
+ IPFILTER_CHROMA_PP_16xN_AVX512 8
1205
+ IPFILTER_CHROMA_PP_16xN_AVX512 12
1206
+ IPFILTER_CHROMA_PP_16xN_AVX512 16
1207
+ IPFILTER_CHROMA_PP_16xN_AVX512 24
1208
+ IPFILTER_CHROMA_PP_16xN_AVX512 32
1209
+ IPFILTER_CHROMA_PP_16xN_AVX512 64
1210
+%endif
1211
+
1212
+%if ARCH_X86_64
1213
+INIT_ZMM avx512
1214
+cglobal interp_4tap_horiz_pp_48x64, 4,8,9
1215
+ mov r4d, r4m
1216
+ lea r6, [3 * r1]
1217
+ lea r7, [3 * r3]
1218
+%ifdef PIC
1219
+ lea r5, [tab_ChromaCoeff]
1220
+ vpbroadcastd m0, [r5 + r4 * 4]
1221
+%else
1222
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
1223
+%endif
1224
+
1225
+ vbroadcasti32x8 m1, [interp4_horiz_shuf_load1_avx512]
1226
+ vbroadcasti32x8 m2, [interp4_horiz_shuf_load2_avx512]
1227
+ vbroadcasti32x8 m3, [pw_1]
1228
+ vbroadcasti32x8 m4, [pw_512]
1229
+ dec r0
1230
+
1231
+%rep 15
1232
+ PROCESS_IPFILTER_CHROMA_PP_48x4_AVX512
1233
+ lea r2, [r2 + 4 * r3]
1234
+ lea r0, [r0 + 4 * r1]
1235
+%endrep
1236
+ PROCESS_IPFILTER_CHROMA_PP_48x4_AVX512
1237
+ RET
1238
+%endif
1239
+
1240
+%macro PROCESS_IPFILTER_CHROMA_PS_64x1_AVX512 0
1241
+ movu ym6, [r0]
1242
+ vinserti32x8 m6, [r0 + 4], 1
1243
+ pshufb m7, m6, m2
1244
+ pshufb m6, m1
1245
+ pmaddubsw m6, m0
1246
+ pmaddubsw m7, m0
1247
+ pmaddwd m6, m3
1248
+ pmaddwd m7, m3
1249
+
1250
+ movu ym8, [r0 + 32]
1251
+ vinserti32x8 m8, [r0 + 36], 1
1252
+ pshufb m9, m8, m2
1253
+ pshufb m8, m1
1254
+ pmaddubsw m8, m0
1255
+ pmaddubsw m9, m0
1256
+ pmaddwd m8, m3
1257
+ pmaddwd m9, m3
1258
+
1259
+ packssdw m6, m7
1260
+ packssdw m8, m9
1261
+ psubw m6, m4
1262
+ psubw m8, m4
1263
+ vpermq m6, m10, m6
1264
+ vpermq m8, m10, m8
1265
+ movu [r2], m6
1266
+ movu [r2 + mmsize],m8
1267
+%endmacro
1268
+
1269
+;-------------------------------------------------------------------------------------------------------------
1270
+; void interp_horiz_ps_64xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
1271
+;-------------------------------------------------------------------------------------------------------------
1272
+%macro IPFILTER_CHROMA_PS_64xN_AVX512 1
1273
+INIT_ZMM avx512
1274
+cglobal interp_4tap_horiz_ps_64x%1, 4,7,11
1275
+ mov r4d, r4m
1276
+ mov r5d, r5m
1277
+
1278
+%ifdef PIC
1279
+ lea r6, [tab_ChromaCoeff]
1280
+ vpbroadcastd m0, [r6 + r4 * 4]
1281
+%else
1282
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
1283
+%endif
1284
+
1285
+ vbroadcasti32x8 m1, [interp4_horiz_shuf_load1_avx512]
1286
+ vbroadcasti32x8 m2, [interp4_horiz_shuf_load2_avx512]
1287
+ vbroadcasti32x8 m3, [pw_1]
1288
+ vbroadcasti32x8 m4, [pw_2000]
1289
+ mova m10, [interp4_hps_shuf_avx512]
1290
+
1291
+ ; register map
1292
+ ; m0 - interpolate coeff
1293
+ ; m1,m2 - load shuffle order table
1294
+ ; m3 - constant word 1
1295
+ ; m4 - constant word 2000
1296
+ ; m10 - store shuffle order table
1297
+
1298
+ mov r6d, %1
1299
+ dec r0
1300
+ test r5d, r5d
1301
+ je .loop
1302
+ sub r0, r1
1303
+ add r6d, 3
1304
+
1305
+.loop:
1306
+ PROCESS_IPFILTER_CHROMA_PS_64x1_AVX512
1307
+ lea r2, [r2 + 2 * r3]
1308
+ lea r0, [r0 + r1]
1309
+ dec r6d
1310
+ jnz .loop
1311
+ RET
1312
+%endmacro
1313
+
1314
+%if ARCH_X86_64
1315
+ IPFILTER_CHROMA_PS_64xN_AVX512 64
1316
+ IPFILTER_CHROMA_PS_64xN_AVX512 32
1317
+ IPFILTER_CHROMA_PS_64xN_AVX512 48
1318
+ IPFILTER_CHROMA_PS_64xN_AVX512 16
1319
+%endif
1320
+
1321
+%macro PROCESS_IPFILTER_CHROMA_PS_32x1_AVX512 0
1322
+ movu ym6, [r0]
1323
+ vinserti32x8 m6, [r0 + 4], 1
1324
+ pshufb m7, m6, m2
1325
+ pshufb m6, m6, m1
1326
+ pmaddubsw m6, m0
1327
+ pmaddubsw m7, m0
1328
+ pmaddwd m6, m3
1329
+ pmaddwd m7, m3
1330
+
1331
+ packssdw m6, m7
1332
+ psubw m6, m4
1333
+ vpermq m6, m8, m6
1334
+ movu [r2], m6
1335
+%endmacro
1336
+
1337
+;-------------------------------------------------------------------------------------------------------------
1338
+; void interp_horiz_ps_32xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
1339
+;-------------------------------------------------------------------------------------------------------------
1340
+%macro IPFILTER_CHROMA_PS_32xN_AVX512 1
1341
+INIT_ZMM avx512
1342
+cglobal interp_4tap_horiz_ps_32x%1, 4,7,9
1343
+ mov r4d, r4m
1344
+ mov r5d, r5m
1345
+
1346
+%ifdef PIC
1347
+ lea r6, [tab_ChromaCoeff]
1348
+ vpbroadcastd m0, [r6 + r4 * 4]
1349
+%else
1350
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
1351
+%endif
1352
+
1353
+ vbroadcasti32x8 m1, [interp4_horiz_shuf_load1_avx512]
1354
+ vbroadcasti32x8 m2, [interp4_horiz_shuf_load2_avx512]
1355
+ vbroadcasti32x8 m3, [pw_1]
1356
+ vbroadcasti32x8 m4, [pw_2000]
1357
+ mova m8, [interp4_hps_shuf_avx512]
1358
+
1359
+ ; register map
1360
+ ; m0 - interpolate coeff
1361
+ ; m1,m2 - load shuffle order table
1362
+ ; m3 - constant word 1
1363
+ ; m4 - constant word 2000
1364
+ ; m8 - store shuffle order table
1365
+
1366
+ mov r6d, %1
1367
+ dec r0
1368
+ test r5d, r5d
1369
+ je .loop
1370
+ sub r0, r1
1371
+ add r6d, 3
1372
+
1373
+.loop:
1374
+ PROCESS_IPFILTER_CHROMA_PS_32x1_AVX512
1375
+ lea r2, [r2 + 2 * r3]
1376
+ lea r0, [r0 + r1]
1377
+ dec r6d
1378
+ jnz .loop
1379
+ RET
1380
+%endmacro
1381
+
1382
+%if ARCH_X86_64
1383
+ IPFILTER_CHROMA_PS_32xN_AVX512 64
1384
+ IPFILTER_CHROMA_PS_32xN_AVX512 48
1385
+ IPFILTER_CHROMA_PS_32xN_AVX512 32
1386
+ IPFILTER_CHROMA_PS_32xN_AVX512 24
1387
+ IPFILTER_CHROMA_PS_32xN_AVX512 16
1388
+ IPFILTER_CHROMA_PS_32xN_AVX512 8
1389
+%endif
1390
+
1391
+%macro PROCESS_IPFILTER_CHROMA_PS_16x2_AVX512 0
1392
+ movu xm6, [r0]
1393
+ vinserti32x4 m6, [r0 + 4], 1
1394
+ vinserti32x4 m6, [r0 + r1], 2
1395
+ vinserti32x4 m6, [r0 + r1 + 4], 3
1396
+
1397
+ pshufb m7, m6, m2
1398
+ pshufb m6, m6, m1
1399
+ pmaddubsw m6, m0
1400
+ pmaddubsw m7, m0
1401
+ pmaddwd m6, m3
1402
+ pmaddwd m7, m3
1403
+
1404
+ packssdw m6, m7
1405
+ psubw m6, m4
1406
+ vpermq m6, m8, m6
1407
+ movu [r2], ym6
1408
+ vextracti32x8 [r2 + r3], m6, 1
1409
+%endmacro
1410
+
1411
+%macro PROCESS_IPFILTER_CHROMA_PS_16x1_AVX512 0
1412
+ movu xm6, [r0]
1413
+ vinserti32x4 m6, [r0 + 4], 1
1414
+
1415
+ pshufb ym7, ym6, ym2
1416
+ pshufb ym6, ym6, ym1
1417
+ pmaddubsw ym6, ym0
1418
+ pmaddubsw ym7, ym0
1419
+ pmaddwd ym6, ym3
1420
+ pmaddwd ym7, ym3
1421
+
1422
+ packssdw ym6, ym7
1423
+ psubw ym6, ym4
1424
+ vpermq ym6, ym8, ym6
1425
+ movu [r2], ym6
1426
+%endmacro
1427
+
1428
+;-------------------------------------------------------------------------------------------------------------
1429
+; void interp_horiz_ps_16xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
1430
+;-------------------------------------------------------------------------------------------------------------
1431
+%macro IPFILTER_CHROMA_PS_16xN_AVX512 1
1432
+INIT_ZMM avx512
1433
+cglobal interp_4tap_horiz_ps_16x%1, 4,7,9
1434
+ mov r4d, r4m
1435
+ mov r5d, r5m
1436
+ add r3, r3
1437
+
1438
+%ifdef PIC
1439
+ lea r6, [tab_ChromaCoeff]
1440
+ vpbroadcastd m0, [r6 + r4 * 4]
1441
+%else
1442
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
1443
+%endif
1444
+
1445
+ vbroadcasti32x8 m1, [interp4_horiz_shuf_load1_avx512]
1446
+ vbroadcasti32x8 m2, [interp4_horiz_shuf_load2_avx512]
1447
+ vbroadcasti32x8 m3, [pw_1]
1448
+ vbroadcasti32x8 m4, [pw_2000]
1449
+ mova m8, [interp4_hps_store_16xN_avx512]
1450
+
1451
+ ; register map
1452
+ ; m0 - interpolate coeff
1453
+ ; m1,m2 - load shuffle order table
1454
+ ; m3 - constant word 1
1455
+ ; m4 - constant word 2000
1456
+ ; m8 - store shuffle order table
1457
+
1458
+ mov r6d, %1
1459
+ dec r0
1460
+ test r5d, r5d
1461
+ je .loop
1462
+ sub r0, r1
1463
+ add r6d, 3
1464
+ PROCESS_IPFILTER_CHROMA_PS_16x1_AVX512
1465
+ lea r2, [r2 + r3]
1466
+ lea r0, [r0 + r1]
1467
+ dec r6d
1468
+
1469
+.loop:
1470
+ PROCESS_IPFILTER_CHROMA_PS_16x2_AVX512
1471
+ lea r2, [r2 + 2 * r3]
1472
+ lea r0, [r0 + 2 * r1]
1473
+ sub r6d, 2
1474
+ jnz .loop
1475
+
1476
+ RET
1477
+%endmacro
1478
+
1479
+%if ARCH_X86_64 == 1
1480
+ IPFILTER_CHROMA_PS_16xN_AVX512 64
1481
+ IPFILTER_CHROMA_PS_16xN_AVX512 32
1482
+ IPFILTER_CHROMA_PS_16xN_AVX512 24
1483
+ IPFILTER_CHROMA_PS_16xN_AVX512 16
1484
+ IPFILTER_CHROMA_PS_16xN_AVX512 12
1485
+ IPFILTER_CHROMA_PS_16xN_AVX512 8
1486
+ IPFILTER_CHROMA_PS_16xN_AVX512 4
1487
+%endif
1488
+
1489
+%macro PROCESS_IPFILTER_CHROMA_PS_48x1_AVX512 0
1490
+ movu ym6, [r0]
1491
+ vinserti32x8 m6, [r0 + 4], 1
1492
+ pshufb m7, m6, m2
1493
+ pshufb m6, m6, m1
1494
+ pmaddubsw m6, m0
1495
+ pmaddubsw m7, m0
1496
+ pmaddwd m6, m3
1497
+ pmaddwd m7, m3
1498
+
1499
+ packssdw m6, m7
1500
+ psubw m6, m4
1501
+ vpermq m6, m8, m6
1502
+ movu [r2], m6
1503
+
1504
+ movu xm6, [r0 + 32]
1505
+ vinserti32x4 m6, [r0 + 36], 1
1506
+ pshufb ym7, ym6, ym2
1507
+ pshufb ym6, ym6, ym1
1508
+ pmaddubsw ym6, ym0
1509
+ pmaddubsw ym7, ym0
1510
+ pmaddwd ym6, ym3
1511
+ pmaddwd ym7, ym3
1512
+
1513
+ packssdw ym6, ym7
1514
+ psubw ym6, ym4
1515
+ vpermq ym6, ym9, ym6
1516
+ movu [r2 + mmsize],ym6
1517
+%endmacro
1518
+
1519
+;-------------------------------------------------------------------------------------------------------------
1520
+; void interp_horiz_ps_48xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
1521
+;-------------------------------------------------------------------------------------------------------------
1522
+%macro IPFILTER_CHROMA_PS_48xN_AVX512 1
1523
+INIT_ZMM avx512
1524
+cglobal interp_4tap_horiz_ps_48x%1, 4,7,10
1525
+ mov r4d, r4m
1526
+ mov r5d, r5m
1527
+
1528
+%ifdef PIC
1529
+ lea r6, [tab_ChromaCoeff]
1530
+ vpbroadcastd m0, [r6 + r4 * 4]
1531
+%else
1532
+ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4]
1533
+%endif
1534
+
1535
+ vbroadcasti32x8 m1, [interp4_horiz_shuf_load1_avx512]
1536
+ vbroadcasti32x8 m2, [interp4_horiz_shuf_load2_avx512]
1537
+ vbroadcasti32x8 m3, [pw_1]
1538
+ vbroadcasti32x8 m4, [pw_2000]
1539
+ mova m8, [interp4_hps_shuf_avx512]
1540
+ mova m9, [interp4_hps_store_16xN_avx512]
1541
+
1542
+ ; register map
1543
+ ; m0 - interpolate coeff
1544
+ ; m1,m2 - load shuffle order table
1545
+ ; m3 - constant word 1
1546
+ ; m4 - constant word 2000
1547
+ ; m8 - store shuffle order table
1548
+
1549
+ mov r6d, %1
1550
+ dec r0
1551
+ test r5d, r5d
1552
+ je .loop
1553
+ sub r0, r1
1554
+ add r6d, 3
1555
+
1556
+.loop:
1557
+ PROCESS_IPFILTER_CHROMA_PS_48x1_AVX512
1558
+ lea r2, [r2 + 2 * r3]
1559
+ lea r0, [r0 + r1]
1560
+ dec r6d
1561
+ jnz .loop
1562
+ RET
1563
+%endmacro
1564
+
1565
+%if ARCH_X86_64 == 1
1566
+ IPFILTER_CHROMA_PS_48xN_AVX512 64
1567
+%endif
1568
+
1569
+;-------------------------------------------------------------------------------------------------------------
1570
+;avx512 chroma_vpp and chroma_vps code start
1571
+;-------------------------------------------------------------------------------------------------------------
1572
+%macro PROCESS_CHROMA_VERT_16x4_AVX512 1
1573
+ lea r5, [r0 + 4 * r1]
1574
+ movu xm1, [r0]
1575
+ movu xm3, [r0 + r1]
1576
+ vinserti32x4 m1, [r0 + r1], 1
1577
+ vinserti32x4 m3, [r0 + 2 * r1], 1
1578
+ vinserti32x4 m1, [r0 + 2 * r1], 2
1579
+ vinserti32x4 m3, [r0 + r6], 2
1580
+ vinserti32x4 m1, [r0 + r6], 3
1581
+ vinserti32x4 m3, [r0 + 4 * r1], 3
1582
+
1583
+ punpcklbw m0, m1, m3
1584
+ pmaddubsw m0, m8
1585
+ punpckhbw m1, m3
1586
+ pmaddubsw m1, m8
1587
+
1588
+ movu xm4, [r0 + 2 * r1]
1589
+ movu xm5, [r0 + r6]
1590
+ vinserti32x4 m4, [r0 + r6], 1
1591
+ vinserti32x4 m5, [r5], 1
1592
+ vinserti32x4 m4, [r5], 2
1593
+ vinserti32x4 m5, [r5 + r1], 2
1594
+ vinserti32x4 m4, [r5 + r1], 3
1595
+ vinserti32x4 m5, [r5 + 2 * r1], 3
1596
+
1597
+ punpcklbw m3, m4, m5
1598
+ pmaddubsw m3, m9
1599
+ punpckhbw m4, m5
1600
+ pmaddubsw m4, m9
1601
+
1602
+ paddw m0, m3
1603
+ paddw m1, m4
1604
+%ifidn %1,pp
1605
+ pmulhrsw m0, m7
1606
+ pmulhrsw m1, m7
1607
+ packuswb m0, m1
1608
+ movu [r2], xm0
1609
+ vextracti32x4 [r2 + r3], m0, 1
1610
+ vextracti32x4 [r2 + 2 * r3], m0, 2
1611
+ vextracti32x4 [r2 + r7], m0, 3
1612
+%else
1613
+ psubw m0, m7
1614
+ psubw m1, m7
1615
+ mova m2, m10
1616
+ mova m3, m11
1617
+
1618
+ vpermi2q m2, m0, m1
1619
+ vpermi2q m3, m0, m1
1620
+
1621
+ movu [r2], ym2
1622
+ vextracti32x8 [r2 + r3], m2, 1
1623
+ movu [r2 + 2 * r3], ym3
1624
+ vextracti32x8 [r2 + r7], m3, 1
1625
+%endif
1626
+%endmacro
1627
+
1628
+;-----------------------------------------------------------------------------------------------------------------
1629
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
1630
+;-----------------------------------------------------------------------------------------------------------------
1631
+%macro FILTER_VERT_CHROMA_16xN_AVX512 2
1632
+INIT_ZMM avx512
1633
+cglobal interp_4tap_vert_%1_16x%2, 4, 10, 12
1634
+ mov r4d, r4m
1635
+ shl r4d, 7
1636
+ sub r0, r1
1637
+
1638
+%ifdef PIC
1639
+ lea r5, [tab_ChromaCoeffVer_32_avx512]
1640
+ mova m8, [r5 + r4]
1641
+ mova m9, [r5 + r4 + mmsize]
1642
+%else
1643
+ mova m8, [tab_ChromaCoeffVer_32_avx512 + r4]
1644
+ mova m9, [tab_ChromaCoeffVer_32_avx512 + r4 + mmsize]
1645
+%endif
1646
+
1647
+%ifidn %1, pp
1648
+ vbroadcasti32x8 m7, [pw_512]
1649
+%else
1650
+ shl r3d, 1
1651
+ vbroadcasti32x8 m7, [pw_2000]
1652
+ mova m10, [interp4_vps_store1_avx512]
1653
+ mova m11, [interp4_vps_store2_avx512]
1654
+%endif
1655
+ lea r6, [3 * r1]
1656
+ lea r7, [3 * r3]
1657
+
1658
+%rep %2/4 - 1
1659
+ PROCESS_CHROMA_VERT_16x4_AVX512 %1
1660
+ lea r0, [r0 + 4 * r1]
1661
+ lea r2, [r2 + 4 * r3]
1662
+%endrep
1663
+ PROCESS_CHROMA_VERT_16x4_AVX512 %1
1664
+ RET
1665
+%endmacro
1666
+
1667
+%if ARCH_X86_64
1668
+ FILTER_VERT_CHROMA_16xN_AVX512 pp, 4
1669
+ FILTER_VERT_CHROMA_16xN_AVX512 pp, 8
1670
+ FILTER_VERT_CHROMA_16xN_AVX512 pp, 12
1671
+ FILTER_VERT_CHROMA_16xN_AVX512 pp, 16
1672
+ FILTER_VERT_CHROMA_16xN_AVX512 pp, 24
1673
+ FILTER_VERT_CHROMA_16xN_AVX512 pp, 32
1674
+ FILTER_VERT_CHROMA_16xN_AVX512 pp, 64
1675
+
1676
+ FILTER_VERT_CHROMA_16xN_AVX512 ps, 4
1677
+ FILTER_VERT_CHROMA_16xN_AVX512 ps, 8
1678
+ FILTER_VERT_CHROMA_16xN_AVX512 ps, 12
1679
+ FILTER_VERT_CHROMA_16xN_AVX512 ps, 16
1680
+ FILTER_VERT_CHROMA_16xN_AVX512 ps, 24
1681
+ FILTER_VERT_CHROMA_16xN_AVX512 ps, 32
1682
+ FILTER_VERT_CHROMA_16xN_AVX512 ps, 64
1683
+%endif
1684
+%macro PROCESS_CHROMA_VERT_32x4_AVX512 1
1685
+ movu ym1, [r0]
1686
+ movu ym3, [r0 + r1]
1687
+ vinserti32x8 m1, [r0 + 2 * r1], 1
1688
+ vinserti32x8 m3, [r0 + r6], 1
1689
+ punpcklbw m0, m1, m3
1690
+ pmaddubsw m0, m8
1691
+ punpckhbw m1, m3
1692
+ pmaddubsw m1, m8
1693
+
1694
+ movu ym4, [r0 + 2 * r1]
1695
+ vinserti32x8 m4, [r0 + 4 * r1], 1
1696
+ punpcklbw m2, m3, m4
1697
+ pmaddubsw m2, m8
1698
+ punpckhbw m3, m4
1699
+ pmaddubsw m3, m8
1700
+
1701
+ lea r0, [r0 + 2 * r1]
1702
+
1703
+ movu ym5, [r0 + r1]
1704
+ vinserti32x8 m5, [r0 + r6], 1
1705
+ punpcklbw m6, m4, m5
1706
+ pmaddubsw m6, m9
1707
+ paddw m0, m6
1708
+ punpckhbw m4, m5
1709
+ pmaddubsw m4, m9
1710
+ paddw m1, m4
1711
+
1712
+ movu ym4, [r0 + 2 * r1]
1713
+ vinserti32x8 m4, [r0 + 4 * r1], 1
1714
+ punpcklbw m6, m5, m4
1715
+ pmaddubsw m6, m9
1716
+ paddw m2, m6
1717
+ punpckhbw m5, m4
1718
+ pmaddubsw m5, m9
1719
+ paddw m3, m5
1720
+
1721
+%ifidn %1,pp
1722
+ pmulhrsw m0, m7
1723
+ pmulhrsw m1, m7
1724
+ pmulhrsw m2, m7
1725
+ pmulhrsw m3, m7
1726
+ packuswb m0, m1
1727
+ packuswb m2, m3
1728
+ movu [r2], ym0
1729
+ movu [r2 + r3], ym2
1730
+ vextracti32x8 [r2 + 2 * r3], m0, 1
1731
+ vextracti32x8 [r2 + r7], m2, 1
1732
+%else
1733
+ psubw m0, m7
1734
+ psubw m1, m7
1735
+ psubw m2, m7
1736
+ psubw m3, m7
1737
+
1738
+ mova m4, m10
1739
+ mova m5, m11
1740
+ vpermi2q m4, m0, m1
1741
+ vpermi2q m5, m0, m1
1742
+ mova m6, m10
1743
+ mova m12, m11
1744
+ vpermi2q m6, m2, m3
1745
+ vpermi2q m12, m2, m3
1746
+
1747
+ movu [r2], m4
1748
+ movu [r2 + r3], m6
1749
+ movu [r2 + 2 * r3], m5
1750
+ movu [r2 + r7], m12
1751
+%endif
1752
+%endmacro
1753
+
1754
+;-----------------------------------------------------------------------------------------------------------------
1755
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
1756
+;-----------------------------------------------------------------------------------------------------------------
1757
+%macro FILTER_VERT_CHROMA_32xN_AVX512 2
1758
+INIT_ZMM avx512
1759
+cglobal interp_4tap_vert_%1_32x%2, 4, 8, 13
1760
+ mov r4d, r4m
1761
+ shl r4d, 7
1762
+ sub r0, r1
1763
+
1764
+%ifdef PIC
1765
+ lea r5, [tab_ChromaCoeffVer_32_avx512]
1766
+ mova m8, [r5 + r4]
1767
+ mova m9, [r5 + r4 + mmsize]
1768
+%else
1769
+ mova m8, [tab_ChromaCoeffVer_32_avx512 + r4]
1770
+ mova m9, [tab_ChromaCoeffVer_32_avx512 + r4 + mmsize]
1771
+%endif
1772
+
1773
+%ifidn %1,pp
1774
+ vbroadcasti32x8 m7, [pw_512]
1775
+%else
1776
+ shl r3d, 1
1777
+ vbroadcasti32x8 m7, [pw_2000]
1778
+ mova m10, [interp4_vps_store1_avx512]
1779
+ mova m11, [interp4_vps_store2_avx512]
1780
+%endif
1781
+
1782
+ lea r6, [3 * r1]
1783
+ lea r7, [3 * r3]
1784
+
1785
+%rep %2/4 - 1
1786
+ PROCESS_CHROMA_VERT_32x4_AVX512 %1
1787
+ lea r0, [r0 + 2 * r1]
1788
+ lea r2, [r2 + 4 * r3]
1789
+%endrep
1790
+ PROCESS_CHROMA_VERT_32x4_AVX512 %1
1791
+ RET
1792
+%endmacro
1793
+
1794
+%if ARCH_X86_64
1795
+ FILTER_VERT_CHROMA_32xN_AVX512 pp, 8
1796
+ FILTER_VERT_CHROMA_32xN_AVX512 pp, 16
1797
+ FILTER_VERT_CHROMA_32xN_AVX512 pp, 24
1798
+ FILTER_VERT_CHROMA_32xN_AVX512 pp, 32
1799
+ FILTER_VERT_CHROMA_32xN_AVX512 pp, 48
1800
+ FILTER_VERT_CHROMA_32xN_AVX512 pp, 64
1801
+
1802
+ FILTER_VERT_CHROMA_32xN_AVX512 ps, 8
1803
+ FILTER_VERT_CHROMA_32xN_AVX512 ps, 16
1804
+ FILTER_VERT_CHROMA_32xN_AVX512 ps, 24
1805
+ FILTER_VERT_CHROMA_32xN_AVX512 ps, 32
1806
+ FILTER_VERT_CHROMA_32xN_AVX512 ps, 48
1807
+ FILTER_VERT_CHROMA_32xN_AVX512 ps, 64
1808
+%endif
1809
+%macro PROCESS_CHROMA_VERT_48x4_AVX512 1
1810
+ movu ym1, [r0]
1811
+ movu ym3, [r0 + r1]
1812
+ vinserti32x8 m1, [r0 + 2 * r1], 1
1813
+ vinserti32x8 m3, [r0 + r6], 1
1814
+ punpcklbw m0, m1, m3
1815
+ pmaddubsw m0, m8
1816
+ punpckhbw m1, m3
1817
+ pmaddubsw m1, m8
1818
+
1819
+ movu ym4, [r0 + 2 * r1]
1820
+ vinserti32x8 m4, [r0 + 4 * r1], 1
1821
+ punpcklbw m2, m3, m4
1822
+ pmaddubsw m2, m8
1823
+ punpckhbw m3, m4
1824
+ pmaddubsw m3, m8
1825
+
1826
+ lea r5, [r0 + 4 * r1]
1827
+
1828
+ movu ym5, [r0 + r6]
1829
+ vinserti32x8 m5, [r5 + r1], 1
1830
+ punpcklbw m6, m4, m5
1831
+ pmaddubsw m6, m9
1832
+ paddw m0, m6
1833
+ punpckhbw m4, m5
1834
+ pmaddubsw m4, m9
1835
+ paddw m1, m4
1836
+
1837
+ movu ym4, [r0 + 4 * r1]
1838
+ vinserti32x8 m4, [r5 + 2 * r1], 1
1839
+ punpcklbw m6, m5, m4
1840
+ pmaddubsw m6, m9
1841
+ paddw m2, m6
1842
+ punpckhbw m5, m4
1843
+ pmaddubsw m5, m9
1844
+ paddw m3, m5
1845
+%ifidn %1, pp
1846
+ pmulhrsw m0, m7
1847
+ pmulhrsw m1, m7
1848
+ pmulhrsw m2, m7
1849
+ pmulhrsw m3, m7
1850
+
1851
+ packuswb m0, m1
1852
+ packuswb m2, m3
1853
+ movu [r2], ym0
1854
+ movu [r2 + r3], ym2
1855
+ vextracti32x8 [r2 + 2 * r3], m0, 1
1856
+ vextracti32x8 [r2 + r7], m2, 1
1857
+%else
1858
+ psubw m0, m7
1859
+ psubw m1, m7
1860
+ psubw m2, m7
1861
+ psubw m3, m7
1862
+
1863
+ mova m4, m10
1864
+ mova m5, m11
1865
+ vpermi2q m4, m0, m1
1866
+ vpermi2q m5, m0, m1
1867
+ mova m6, m10
1868
+ mova m12, m11
1869
+ vpermi2q m6, m2, m3
1870
+ vpermi2q m12, m2, m3
1871
+
1872
+ movu [r2], m4
1873
+ movu [r2 + r3], m6
1874
+ movu [r2 + 2 * r3], m5
1875
+ movu [r2 + r7], m12
1876
+%endif
1877
+ movu xm1, [r0 + mmsize/2]
1878
+ movu xm3, [r0 + r1 + mmsize/2]
1879
+ vinserti32x4 m1, [r0 + r1 + mmsize/2], 1
1880
+ vinserti32x4 m3, [r0 + 2 * r1 + mmsize/2], 1
1881
+ vinserti32x4 m1, [r0 + 2 * r1 + mmsize/2], 2
1882
+ vinserti32x4 m3, [r0 + r6 + mmsize/2], 2
1883
+ vinserti32x4 m1, [r0 + r6 + mmsize/2], 3
1884
+ vinserti32x4 m3, [r0 + 4 * r1 + mmsize/2], 3
1885
+
1886
+ punpcklbw m0, m1, m3
1887
+ pmaddubsw m0, m8
1888
+ punpckhbw m1, m3
1889
+ pmaddubsw m1, m8
1890
+
1891
+ movu xm4, [r0 + 2 * r1 + mmsize/2]
1892
+ movu xm5, [r0 + r6 + mmsize/2]
1893
+ vinserti32x4 m4, [r0 + r6 + mmsize/2], 1
1894
+ vinserti32x4 m5, [r5 + mmsize/2], 1
1895
+ vinserti32x4 m4, [r5 + mmsize/2], 2
1896
+ vinserti32x4 m5, [r5 + r1 + mmsize/2], 2
1897
+ vinserti32x4 m4, [r5 + r1 + mmsize/2], 3
1898
+ vinserti32x4 m5, [r5 + 2 * r1 + mmsize/2], 3
1899
+
1900
+ punpcklbw m3, m4, m5
1901
+ pmaddubsw m3, m9
1902
+ punpckhbw m4, m5
1903
+ pmaddubsw m4, m9
1904
+ paddw m0, m3
1905
+ paddw m1, m4
1906
+%ifidn %1, pp
1907
+ pmulhrsw m0, m7
1908
+ pmulhrsw m1, m7
1909
+ packuswb m0, m1
1910
+ movu [r2 + mmsize/2], xm0
1911
+ vextracti32x4 [r2 + r3 + mmsize/2], m0, 1
1912
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m0, 2
1913
+ vextracti32x4 [r2 + r7 + mmsize/2], m0, 3
1914
+%else
1915
+ psubw m0, m7
1916
+ psubw m1, m7
1917
+ mova m2, m10
1918
+ mova m3, m11
1919
+
1920
+ vpermi2q m2, m0, m1
1921
+ vpermi2q m3, m0, m1
1922
+
1923
+ movu [r2 + mmsize], ym2
1924
+ vextracti32x8 [r2 + r3 + mmsize], m2, 1
1925
+ movu [r2 + 2 * r3 + mmsize], ym3
1926
+ vextracti32x8 [r2 + r7 + mmsize], m3, 1
1927
+%endif
1928
+%endmacro
1929
+;-----------------------------------------------------------------------------------------------------------------
1930
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
1931
+;-----------------------------------------------------------------------------------------------------------------
1932
+%macro FILTER_VERT_CHROMA_48x64_AVX512 1
1933
+INIT_ZMM avx512
1934
+cglobal interp_4tap_vert_%1_48x64, 4, 8, 13
1935
+ mov r4d, r4m
1936
+ shl r4d, 7
1937
+ sub r0, r1
1938
+
1939
+%ifdef PIC
1940
+ lea r5, [tab_ChromaCoeffVer_32_avx512]
1941
+ mova m8, [r5 + r4]
1942
+ mova m9, [r5 + r4 + mmsize]
1943
+%else
1944
+ mova m8, [tab_ChromaCoeffVer_32_avx512 + r4]
1945
+ mova m9, [tab_ChromaCoeffVer_32_avx512 + r4 + mmsize]
1946
+%endif
1947
+
1948
+%ifidn %1, pp
1949
+ vbroadcasti32x8 m7, [pw_512]
1950
+%else
1951
+ shl r3d, 1
1952
+ vbroadcasti32x8 m7, [pw_2000]
1953
+ mova m10, [interp4_vps_store1_avx512]
1954
+ mova m11, [interp4_vps_store2_avx512]
1955
+%endif
1956
+
1957
+ lea r6, [3 * r1]
1958
+ lea r7, [3 * r3]
1959
+%rep 15
1960
+ PROCESS_CHROMA_VERT_48x4_AVX512 %1
1961
+ lea r0, [r0 + 4 * r1]
1962
+ lea r2, [r2 + 4 * r3]
1963
+%endrep
1964
+ PROCESS_CHROMA_VERT_48x4_AVX512 %1
1965
+ RET
1966
+%endmacro
1967
+
1968
+%if ARCH_X86_64
1969
+ FILTER_VERT_CHROMA_48x64_AVX512 pp
1970
+ FILTER_VERT_CHROMA_48x64_AVX512 ps
1971
+%endif
1972
+%macro PROCESS_CHROMA_VERT_64x4_AVX512 1
1973
+ movu m0, [r0] ; m0 = row 0
1974
+ movu m1, [r0 + r1] ; m1 = row 1
1975
+ punpcklbw m2, m0, m1
1976
+ punpckhbw m3, m0, m1
1977
+ pmaddubsw m2, m10
1978
+ pmaddubsw m3, m10
1979
+ movu m0, [r0 + r1 * 2] ; m0 = row 2
1980
+ punpcklbw m4, m1, m0
1981
+ punpckhbw m5, m1, m0
1982
+ pmaddubsw m4, m10
1983
+ pmaddubsw m5, m10
1984
+ movu m1, [r0 + r4] ; m1 = row 3
1985
+ punpcklbw m6, m0, m1
1986
+ punpckhbw m7, m0, m1
1987
+ pmaddubsw m8, m6, m11
1988
+ pmaddubsw m9, m7, m11
1989
+ pmaddubsw m6, m10
1990
+ pmaddubsw m7, m10
1991
+ paddw m2, m8
1992
+ paddw m3, m9
1993
+
1994
+%ifidn %1,pp
1995
+ pmulhrsw m2, m12
1996
+ pmulhrsw m3, m12
1997
+ packuswb m2, m3
1998
+ movu [r2], m2
1999
+%else
2000
+ psubw m2, m12
2001
+ psubw m3, m12
2002
+ movu m8, m13
2003
+ movu m9, m14
2004
+ vpermi2q m8, m2, m3
2005
+ vpermi2q m9, m2, m3
2006
+ movu [r2], m8
2007
+ movu [r2 + mmsize], m9
2008
+%endif
2009
+
2010
+ lea r0, [r0 + r1 * 4]
2011
+ movu m0, [r0] ; m0 = row 4
2012
+ punpcklbw m2, m1, m0
2013
+ punpckhbw m3, m1, m0
2014
+ pmaddubsw m8, m2, m11
2015
+ pmaddubsw m9, m3, m11
2016
+ pmaddubsw m2, m10
2017
+ pmaddubsw m3, m10
2018
+ paddw m4, m8
2019
+ paddw m5, m9
2020
+
2021
+%ifidn %1,pp
2022
+ pmulhrsw m4, m12
2023
+ pmulhrsw m5, m12
2024
+ packuswb m4, m5
2025
+ movu [r2 + r3], m4
2026
+%else
2027
+ psubw m4, m12
2028
+ psubw m5, m12
2029
+ movu m8, m13
2030
+ movu m9, m14
2031
+ vpermi2q m8, m4, m5
2032
+ vpermi2q m9, m4, m5
2033
+ movu [r2 + r3], m8
2034
+ movu [r2 + r3 + mmsize], m9
2035
+%endif
2036
+
2037
+ movu m1, [r0 + r1] ; m1 = row 5
2038
+ punpcklbw m4, m0, m1
2039
+ punpckhbw m5, m0, m1
2040
+ pmaddubsw m4, m11
2041
+ pmaddubsw m5, m11
2042
+ paddw m6, m4
2043
+ paddw m7, m5
2044
+
2045
+%ifidn %1,pp
2046
+ pmulhrsw m6, m12
2047
+ pmulhrsw m7, m12
2048
+ packuswb m6, m7
2049
+ movu [r2 + r3 * 2], m6
2050
+%else
2051
+ psubw m6, m12
2052
+ psubw m7, m12
2053
+ movu m8, m13
2054
+ movu m9, m14
2055
+ vpermi2q m8, m6, m7
2056
+ vpermi2q m9, m6, m7
2057
+ movu [r2 + 2 * r3], m8
2058
+ movu [r2 + 2 * r3 + mmsize], m9
2059
+%endif
2060
+ movu m0, [r0 + r1 * 2] ; m0 = row 6
2061
+ punpcklbw m6, m1, m0
2062
+ punpckhbw m7, m1, m0
2063
+ pmaddubsw m6, m11
2064
+ pmaddubsw m7, m11
2065
+ paddw m2, m6
2066
+ paddw m3, m7
2067
+
2068
+%ifidn %1,pp
2069
+ pmulhrsw m2, m12
2070
+ pmulhrsw m3, m12
2071
+ packuswb m2, m3
2072
+ movu [r2 + r5], m2
2073
+%else
2074
+ psubw m2, m12
2075
+ psubw m3, m12
2076
+ movu m8, m13
2077
+ movu m9, m14
2078
+ vpermi2q m8, m2, m3
2079
+ vpermi2q m9, m2, m3
2080
+ movu [r2 + r5], m8
2081
+ movu [r2 + r5 + mmsize], m9
2082
+%endif
2083
+%endmacro
2084
+
2085
+%macro FILTER_VER_CHROMA_AVX512_64xN 2
2086
+INIT_ZMM avx512
2087
+cglobal interp_4tap_vert_%1_64x%2, 4, 6, 15
2088
+ mov r4d, r4m
2089
+ shl r4d, 7
2090
+
2091
+%ifdef PIC
2092
+ lea r5, [tab_ChromaCoeffVer_32_avx512]
2093
+ mova m10, [r5 + r4]
2094
+ mova m11, [r5 + r4 + mmsize]
2095
+%else
2096
+ mova m10, [tab_ChromaCoeffVer_32_avx512 + r4]
2097
+ mova m11, [tab_ChromaCoeffVer_32_avx512 + r4 + mmsize]
2098
+%endif
2099
+
2100
+%ifidn %1,pp
2101
+ vbroadcasti32x8 m12, [pw_512]
2102
+%else
2103
+ shl r3d, 1
2104
+ vbroadcasti32x8 m12, [pw_2000]
2105
+ mova m13, [interp4_vps_store1_avx512]
2106
+ mova m14, [interp4_vps_store2_avx512]
2107
+%endif
2108
+ lea r4, [r1 * 3]
2109
+ sub r0, r1
2110
+ lea r5, [r3 * 3]
2111
+
2112
+%rep %2/4 - 1
2113
+ PROCESS_CHROMA_VERT_64x4_AVX512 %1
2114
+ lea r2, [r2 + r3 * 4]
2115
+%endrep
2116
+ PROCESS_CHROMA_VERT_64x4_AVX512 %1
2117
+ RET
2118
+%endmacro
2119
+
2120
+%if ARCH_X86_64 == 1
2121
+FILTER_VER_CHROMA_AVX512_64xN pp, 64
2122
+FILTER_VER_CHROMA_AVX512_64xN pp, 48
2123
+FILTER_VER_CHROMA_AVX512_64xN pp, 32
2124
+FILTER_VER_CHROMA_AVX512_64xN pp, 16
2125
+
2126
+FILTER_VER_CHROMA_AVX512_64xN ps, 64
2127
+FILTER_VER_CHROMA_AVX512_64xN ps, 48
2128
+FILTER_VER_CHROMA_AVX512_64xN ps, 32
2129
+FILTER_VER_CHROMA_AVX512_64xN ps, 16
2130
+%endif
2131
+;-------------------------------------------------------------------------------------------------------------
2132
+;avx512 chroma_vpp and chroma_vps code end
2133
+;-------------------------------------------------------------------------------------------------------------
2134
+;-------------------------------------------------------------------------------------------------------------
2135
+;avx512 chroma_vss code start
2136
+;-------------------------------------------------------------------------------------------------------------
2137
+%macro PROCESS_CHROMA_VERT_SS_8x4_AVX512 0
2138
+ lea r5, [r0 + 4 * r1]
2139
+ movu xm1, [r0]
2140
+ movu xm3, [r0 + r1]
2141
+ vinserti32x4 m1, [r0 + r1], 1
2142
+ vinserti32x4 m3, [r0 + 2 * r1], 1
2143
+ vinserti32x4 m1, [r0 + 2 * r1], 2
2144
+ vinserti32x4 m3, [r0 + r6], 2
2145
+ vinserti32x4 m1, [r0 + r6], 3
2146
+ vinserti32x4 m3, [r0 + 4 * r1], 3
2147
+
2148
+ punpcklwd m0, m1, m3
2149
+ pmaddwd m0, m8
2150
+ punpckhwd m1, m3
2151
+ pmaddwd m1, m8
2152
+
2153
+ movu xm4, [r0 + 2 * r1]
2154
+ movu xm5, [r0 + r6]
2155
+ vinserti32x4 m4, [r0 + r6], 1
2156
+ vinserti32x4 m5, [r5], 1
2157
+ vinserti32x4 m4, [r5], 2
2158
+ vinserti32x4 m5, [r5 + r1], 2
2159
+ vinserti32x4 m4, [r5 + r1], 3
2160
+ vinserti32x4 m5, [r5 + 2 * r1], 3
2161
+
2162
+ punpcklwd m3, m4, m5
2163
+ pmaddwd m3, m9
2164
+ punpckhwd m4, m5
2165
+ pmaddwd m4, m9
2166
+
2167
+ paddd m0, m3
2168
+ paddd m1, m4
2169
+
2170
+ psrad m0, 6
2171
+ psrad m1, 6
2172
+ packssdw m0, m1
2173
+ movu [r2], xm0
2174
+ vextracti32x4 [r2 + r3], m0, 1
2175
+ vextracti32x4 [r2 + 2 * r3], m0, 2
2176
+ vextracti32x4 [r2 + r7], m0, 3
2177
+%endmacro
2178
+
2179
+;-----------------------------------------------------------------------------------------------------------------
2180
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2181
+;-----------------------------------------------------------------------------------------------------------------
2182
+%macro FILTER_VER_SS_CHROMA_8xN_AVX512 1
2183
+INIT_ZMM avx512
2184
+cglobal interp_4tap_vert_ss_8x%1, 5, 8, 10
2185
+ add r1d, r1d
2186
+ add r3d, r3d
2187
+ sub r0, r1
2188
+ shl r4d, 7
2189
+%ifdef PIC
2190
+ lea r5, [pw_ChromaCoeffVer_32_avx512]
2191
+ mova m8, [r5 + r4]
2192
+ mova m9, [r5 + r4 + mmsize]
2193
+%else
2194
+ lea r5, [pw_ChromaCoeffVer_32_avx512 + r4]
2195
+ mova m8, [r5]
2196
+ mova m9, [r5 + mmsize]
2197
+%endif
2198
+ lea r6, [3 * r1]
2199
+ lea r7, [3 * r3]
2200
+
2201
+%rep %1/4 - 1
2202
+ PROCESS_CHROMA_VERT_SS_8x4_AVX512
2203
+ lea r0, [r0 + 4 * r1]
2204
+ lea r2, [r2 + 4 * r3]
2205
+%endrep
2206
+ PROCESS_CHROMA_VERT_SS_8x4_AVX512
2207
+ RET
2208
+%endmacro
2209
+
2210
+%if ARCH_X86_64
2211
+ FILTER_VER_SS_CHROMA_8xN_AVX512 4
2212
+ FILTER_VER_SS_CHROMA_8xN_AVX512 8
2213
+ FILTER_VER_SS_CHROMA_8xN_AVX512 12
2214
+ FILTER_VER_SS_CHROMA_8xN_AVX512 16
2215
+ FILTER_VER_SS_CHROMA_8xN_AVX512 32
2216
+ FILTER_VER_SS_CHROMA_8xN_AVX512 64
2217
+%endif
2218
+
2219
+%macro PROCESS_CHROMA_VERT_S_16x4_AVX512 1
2220
+ movu ym1, [r0]
2221
+ lea r6, [r0 + 2 * r1]
2222
+ vinserti32x8 m1, [r6], 1
2223
+ movu ym3, [r0 + r1]
2224
+ vinserti32x8 m3, [r6 + r1], 1
2225
+ punpcklwd m0, m1, m3
2226
+ pmaddwd m0, m7
2227
+ punpckhwd m1, m3
2228
+ pmaddwd m1, m7
2229
+
2230
+ movu ym4, [r0 + 2 * r1]
2231
+ vinserti32x8 m4, [r6 + 2 * r1], 1
2232
+ punpcklwd m2, m3, m4
2233
+ pmaddwd m2, m7
2234
+ punpckhwd m3, m4
2235
+ pmaddwd m3, m7
2236
+
2237
+ movu ym5, [r0 + r4]
2238
+ vinserti32x8 m5, [r6 + r4], 1
2239
+ punpcklwd m6, m4, m5
2240
+ pmaddwd m6, m8
2241
+ paddd m0, m6
2242
+ punpckhwd m4, m5
2243
+ pmaddwd m4, m8
2244
+ paddd m1, m4
2245
+
2246
+ movu ym4, [r0 + 4 * r1]
2247
+ vinserti32x8 m4, [r6 + 4 * r1], 1
2248
+ punpcklwd m6, m5, m4
2249
+ pmaddwd m6, m8
2250
+ paddd m2, m6
2251
+ punpckhwd m5, m4
2252
+ pmaddwd m5, m8
2253
+ paddd m3, m5
2254
+
2255
+%ifidn %1, sp
2256
+ paddd m0, m9
2257
+ paddd m1, m9
2258
+ paddd m2, m9
2259
+ paddd m3, m9
2260
+
2261
+ psrad m0, 12
2262
+ psrad m1, 12
2263
+ psrad m2, 12
2264
+ psrad m3, 12
2265
+
2266
+ packssdw m0, m1
2267
+ packssdw m2, m3
2268
+ packuswb m0, m2
2269
+ vpermq m0, m10, m0
2270
+ movu [r2], xm0
2271
+ vextracti32x4 [r2 + r3], m0, 2
2272
+ vextracti32x4 [r2 + 2 * r3], m0, 1
2273
+ vextracti32x4 [r2 + r5], m0, 3
2274
+%else
2275
+ psrad m0, 6
2276
+ psrad m1, 6
2277
+ psrad m2, 6
2278
+ psrad m3, 6
2279
+ packssdw m0, m1
2280
+ packssdw m2, m3
2281
+
2282
+ movu [r2], ym0
2283
+ movu [r2 + r3], ym2
2284
+ vextracti32x8 [r2 + 2 * r3], m0, 1
2285
+ vextracti32x8 [r2 + r5], m2, 1
2286
+%endif
2287
+%endmacro
2288
+
2289
+;-----------------------------------------------------------------------------------------------------------------
2290
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2291
+;-----------------------------------------------------------------------------------------------------------------
2292
+%macro FILTER_VER_S_CHROMA_16xN_AVX512 2
2293
+INIT_ZMM avx512
2294
+cglobal interp_4tap_vert_%1_16x%2, 4, 7, 11
2295
+ mov r4d, r4m
2296
+ shl r4d, 7
2297
+
2298
+%ifdef PIC
2299
+ lea r5, [pw_ChromaCoeffVer_32_avx512]
2300
+ mova m7, [r5 + r4]
2301
+ mova m8, [r5 + r4 + mmsize]
2302
+%else
2303
+ mova m7, [pw_ChromaCoeffVer_32_avx512 + r4]
2304
+ mova m8, [pw_ChromaCoeffVer_32_avx512 + r4 + mmsize]
2305
+%endif
2306
+
2307
+%ifidn %1, sp
2308
+ vbroadcasti32x4 m9, [pd_526336]
2309
+ mova m10, [interp8_vsp_store_avx512]
2310
+%else
2311
+ add r3d, r3d
2312
+%endif
2313
+ add r1d, r1d
2314
+ sub r0, r1
2315
+ lea r4, [r1 * 3]
2316
+ lea r5, [r3 * 3]
2317
+
2318
+%rep %2/4 - 1
2319
+ PROCESS_CHROMA_VERT_S_16x4_AVX512 %1
2320
+ lea r0, [r0 + 4 * r1]
2321
+ lea r2, [r2 + 4 * r3]
2322
+%endrep
2323
+ PROCESS_CHROMA_VERT_S_16x4_AVX512 %1
2324
+ RET
2325
+%endmacro
2326
+
2327
+%if ARCH_X86_64
2328
+ FILTER_VER_S_CHROMA_16xN_AVX512 ss, 4
2329
+ FILTER_VER_S_CHROMA_16xN_AVX512 ss, 8
2330
+ FILTER_VER_S_CHROMA_16xN_AVX512 ss, 12
2331
+ FILTER_VER_S_CHROMA_16xN_AVX512 ss, 16
2332
+ FILTER_VER_S_CHROMA_16xN_AVX512 ss, 24
2333
+ FILTER_VER_S_CHROMA_16xN_AVX512 ss, 32
2334
+ FILTER_VER_S_CHROMA_16xN_AVX512 ss, 64
2335
+ FILTER_VER_S_CHROMA_16xN_AVX512 sp, 4
2336
+ FILTER_VER_S_CHROMA_16xN_AVX512 sp, 8
2337
+ FILTER_VER_S_CHROMA_16xN_AVX512 sp, 12
2338
+ FILTER_VER_S_CHROMA_16xN_AVX512 sp, 16
2339
+ FILTER_VER_S_CHROMA_16xN_AVX512 sp, 24
2340
+ FILTER_VER_S_CHROMA_16xN_AVX512 sp, 32
2341
+ FILTER_VER_S_CHROMA_16xN_AVX512 sp, 64
2342
+%endif
2343
+
2344
+%macro PROCESS_CHROMA_VERT_SS_24x8_AVX512 0
2345
+ movu ym1, [r0]
2346
+ lea r6, [r0 + 2 * r1]
2347
+ lea r8, [r0 + 4 * r1]
2348
+ lea r9, [r8 + 2 * r1]
2349
+
2350
+ movu ym10, [r8]
2351
+ movu ym3, [r0 + r1]
2352
+ movu ym12, [r8 + r1]
2353
+ vinserti32x8 m1, [r6], 1
2354
+ vinserti32x8 m10, [r9], 1
2355
+ vinserti32x8 m3, [r6 + r1], 1
2356
+ vinserti32x8 m12, [r9 + r1], 1
2357
+
2358
+ punpcklwd m0, m1, m3
2359
+ punpcklwd m9, m10, m12
2360
+ pmaddwd m0, m16
2361
+ pmaddwd m9, m16
2362
+ punpckhwd m1, m3
2363
+ punpckhwd m10, m12
2364
+ pmaddwd m1, m16
2365
+ pmaddwd m10, m16
2366
+
2367
+ movu ym4, [r0 + 2 * r1]
2368
+ movu ym13, [r8 + 2 * r1]
2369
+ vinserti32x8 m4, [r6 + 2 * r1], 1
2370
+ vinserti32x8 m13, [r9 + 2 * r1], 1
2371
+ punpcklwd m2, m3, m4
2372
+ punpcklwd m11, m12, m13
2373
+ pmaddwd m2, m16
2374
+ pmaddwd m11, m16
2375
+ punpckhwd m3, m4
2376
+ punpckhwd m12, m13
2377
+ pmaddwd m3, m16
2378
+ pmaddwd m12, m16
2379
+
2380
+ movu ym5, [r0 + r10]
2381
+ vinserti32x8 m5, [r6 + r10], 1
2382
+ movu ym14, [r8 + r10]
2383
+ vinserti32x8 m14, [r9 + r10], 1
2384
+ punpcklwd m6, m4, m5
2385
+ punpcklwd m15, m13, m14
2386
+ pmaddwd m6, m17
2387
+ pmaddwd m15, m17
2388
+ paddd m0, m6
2389
+ paddd m9, m15
2390
+ punpckhwd m4, m5
2391
+ punpckhwd m13, m14
2392
+ pmaddwd m4, m17
2393
+ pmaddwd m13, m17
2394
+ paddd m1, m4
2395
+ paddd m10, m13
2396
+
2397
+ movu ym4, [r0 + 4 * r1]
2398
+ vinserti32x8 m4, [r6 + 4 * r1], 1
2399
+ movu ym13, [r8 + 4 * r1]
2400
+ vinserti32x8 m13, [r9 + 4 * r1], 1
2401
+ punpcklwd m6, m5, m4
2402
+ punpcklwd m15, m14, m13
2403
+ pmaddwd m6, m17
2404
+ pmaddwd m15, m17
2405
+ paddd m2, m6
2406
+ paddd m11, m15
2407
+ punpckhwd m5, m4
2408
+ punpckhwd m14, m13
2409
+ pmaddwd m5, m17
2410
+ pmaddwd m14, m17
2411
+ paddd m3, m5
2412
+ paddd m12, m14
2413
+
2414
+ psrad m0, 6
2415
+ psrad m1, 6
2416
+ psrad m2, 6
2417
+ psrad m3, 6
2418
+ psrad m9, 6
2419
+ psrad m10, 6
2420
+ psrad m11, 6
2421
+ psrad m12, 6
2422
+
2423
+ packssdw m0, m1
2424
+ packssdw m2, m3
2425
+ packssdw m9, m10
2426
+ packssdw m11, m12
2427
+
2428
+ movu [r2], ym0
2429
+ movu [r2 + r3], ym2
2430
+ vextracti32x8 [r2 + 2 * r3], m0, 1
2431
+ vextracti32x8 [r2 + r7], m2, 1
2432
+ lea r11, [r2 + 4 * r3]
2433
+ movu [r11], ym9
2434
+ movu [r11 + r3], ym11
2435
+ vextracti32x8 [r11 + 2 * r3], m9, 1
2436
+ vextracti32x8 [r11 + r7], m11, 1
2437
+
2438
+ movu xm1, [r0 + mmsize/2]
2439
+ vinserti32x4 m1, [r6 + mmsize/2], 1
2440
+ vinserti32x4 m1, [r8 + mmsize/2], 2
2441
+ vinserti32x4 m1, [r9 + mmsize/2], 3
2442
+ movu xm3, [r0 + r1 + mmsize/2]
2443
+ vinserti32x4 m3, [r6 + r1 + mmsize/2], 1
2444
+ vinserti32x4 m3, [r8 + r1 + mmsize/2], 2
2445
+ vinserti32x4 m3, [r9 + r1 + mmsize/2], 3
2446
+ punpcklwd m0, m1, m3
2447
+ pmaddwd m0, m16
2448
+ punpckhwd m1, m3
2449
+ pmaddwd m1, m16
2450
+
2451
+ movu xm4, [r0 + 2 * r1 + mmsize/2]
2452
+ vinserti32x4 m4, [r6 + 2 * r1 + mmsize/2], 1
2453
+ vinserti32x4 m4, [r8 + 2 * r1 + mmsize/2], 2
2454
+ vinserti32x4 m4, [r9 + 2 * r1 + mmsize/2], 3
2455
+ punpcklwd m2, m3, m4
2456
+ pmaddwd m2, m16
2457
+ punpckhwd m3, m4
2458
+ pmaddwd m3, m16
2459
+
2460
+ movu xm5, [r0 + r10 + mmsize/2]
2461
+ vinserti32x4 m5, [r6 + r10 + mmsize/2], 1
2462
+ vinserti32x4 m5, [r8 + r10 + mmsize/2], 2
2463
+ vinserti32x4 m5, [r9 + r10 + mmsize/2], 3
2464
+ punpcklwd m6, m4, m5
2465
+ pmaddwd m6, m17
2466
+ paddd m0, m6
2467
+ punpckhwd m4, m5
2468
+ pmaddwd m4, m17
2469
+ paddd m1, m4
2470
+
2471
+ movu xm4, [r0 + 4 * r1 + mmsize/2]
2472
+ vinserti32x4 m4, [r6 + 4 * r1 + mmsize/2], 1
2473
+ vinserti32x4 m4, [r8 + 4 * r1 + mmsize/2], 2
2474
+ vinserti32x4 m4, [r9 + 4 * r1 + mmsize/2], 3
2475
+ punpcklwd m6, m5, m4
2476
+ pmaddwd m6, m17
2477
+ paddd m2, m6
2478
+ punpckhwd m5, m4
2479
+ pmaddwd m5, m17
2480
+ paddd m3, m5
2481
+
2482
+ psrad m0, 6
2483
+ psrad m1, 6
2484
+ psrad m2, 6
2485
+ psrad m3, 6
2486
+
2487
+ packssdw m0, m1
2488
+ packssdw m2, m3
2489
+
2490
+ movu [r2 + mmsize/2], xm0
2491
+ movu [r2 + r3 + mmsize/2], xm2
2492
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m0, 1
2493
+ vextracti32x4 [r2 + r7 + mmsize/2], m2, 1
2494
+ lea r2, [r2 + 4 * r3]
2495
+ vextracti32x4 [r2 + mmsize/2], m0, 2
2496
+ vextracti32x4 [r2 + r3 + mmsize/2], m2, 2
2497
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m0, 3
2498
+ vextracti32x4 [r2 + r7 + mmsize/2], m2, 3
2499
+%endmacro
2500
+
2501
+%macro FILTER_VER_SS_CHROMA_24xN_AVX512 1
2502
+INIT_ZMM avx512
2503
+cglobal interp_4tap_vert_ss_24x%1, 5, 12, 18
2504
+ add r1d, r1d
2505
+ add r3d, r3d
2506
+ sub r0, r1
2507
+ shl r4d, 7
2508
+%ifdef PIC
2509
+ lea r5, [pw_ChromaCoeffVer_32_avx512]
2510
+ mova m16, [r5 + r4]
2511
+ mova m17, [r5 + r4 + mmsize]
2512
+%else
2513
+ lea r5, [pw_ChromaCoeffVer_32_avx512 + r4]
2514
+ mova m16, [r5]
2515
+ mova m17, [r5 + mmsize]
2516
+%endif
2517
+ lea r10, [3 * r1]
2518
+ lea r7, [3 * r3]
2519
+%rep %1/8 - 1
2520
+ PROCESS_CHROMA_VERT_SS_24x8_AVX512
2521
+ lea r0, [r8 + 4 * r1]
2522
+ lea r2, [r2 + 4 * r3]
2523
+%endrep
2524
+ PROCESS_CHROMA_VERT_SS_24x8_AVX512
2525
+ RET
2526
+%endmacro
2527
+
2528
+%if ARCH_X86_64
2529
+ FILTER_VER_SS_CHROMA_24xN_AVX512 32
2530
+ FILTER_VER_SS_CHROMA_24xN_AVX512 64
2531
+%endif
2532
+%macro PROCESS_CHROMA_VERT_S_32x2_AVX512 1
2533
+ movu m1, [r0]
2534
+ movu m3, [r0 + r1]
2535
+ punpcklwd m0, m1, m3
2536
+ pmaddwd m0, m7
2537
+ punpckhwd m1, m3
2538
+ pmaddwd m1, m7
2539
+ movu m4, [r0 + 2 * r1]
2540
+ punpcklwd m2, m3, m4
2541
+ pmaddwd m2, m7
2542
+ punpckhwd m3, m4
2543
+ pmaddwd m3, m7
2544
+ movu m5, [r0 + r4]
2545
+ punpcklwd m6, m4, m5
2546
+ pmaddwd m6, m8
2547
+ paddd m0, m6
2548
+ punpckhwd m4, m5
2549
+ pmaddwd m4, m8
2550
+ paddd m1, m4
2551
+ movu m4, [r0 + 4 * r1]
2552
+ punpcklwd m6, m5, m4
2553
+ pmaddwd m6, m8
2554
+ paddd m2, m6
2555
+ punpckhwd m5, m4
2556
+ pmaddwd m5, m8
2557
+ paddd m3, m5
2558
+%ifidn %1, sp
2559
+ paddd m0, m9
2560
+ paddd m1, m9
2561
+ paddd m2, m9
2562
+ paddd m3, m9
2563
+
2564
+ psrad m0, 12
2565
+ psrad m1, 12
2566
+ psrad m2, 12
2567
+ psrad m3, 12
2568
+
2569
+ packssdw m0, m1
2570
+ packssdw m2, m3
2571
+ packuswb m0, m2
2572
+ vpermq m0, m10, m0
2573
+ movu [r2], ym0
2574
+ vextracti32x8 [r2 + r3], m0, 1
2575
+%else
2576
+ psrad m0, 6
2577
+ psrad m1, 6
2578
+ psrad m2, 6
2579
+ psrad m3, 6
2580
+ packssdw m0, m1
2581
+ packssdw m2, m3
2582
+ movu [r2], m0
2583
+ movu [r2 + r3], m2
2584
+%endif
2585
+%endmacro
2586
+
2587
+%macro FILTER_VER_S_CHROMA_32xN_AVX512 2
2588
+INIT_ZMM avx512
2589
+cglobal interp_4tap_vert_%1_32x%2, 4, 6, 11
2590
+ mov r4d, r4m
2591
+ shl r4d, 7
2592
+%ifdef PIC
2593
+ lea r5, [pw_ChromaCoeffVer_32_avx512]
2594
+ mova m7, [r5 + r4]
2595
+ mova m8, [r5 + r4 + mmsize]
2596
+%else
2597
+ mova m7, [pw_ChromaCoeffVer_32_avx512 + r4]
2598
+ mova m8, [pw_ChromaCoeffVer_32_avx512 + r4 + mmsize]
2599
+%endif
2600
+%ifidn %1, sp
2601
+ vbroadcasti32x4 m9, [pd_526336]
2602
+ mova m10, [interp8_vsp_store_avx512]
2603
+%else
2604
+ add r3d, r3d
2605
+%endif
2606
+ add r1d, r1d
2607
+ sub r0, r1
2608
+ lea r4, [r1 * 3]
2609
+ lea r5, [r3 * 3]
2610
+%rep %2/2 - 1
2611
+ PROCESS_CHROMA_VERT_S_32x2_AVX512 %1
2612
+ lea r0, [r0 + r1 * 2]
2613
+ lea r2, [r2 + r3 * 2]
2614
+%endrep
2615
+ PROCESS_CHROMA_VERT_S_32x2_AVX512 %1
2616
+ RET
2617
+%endmacro
2618
+
2619
+%if ARCH_X86_64
2620
+ FILTER_VER_S_CHROMA_32xN_AVX512 ss, 8
2621
+ FILTER_VER_S_CHROMA_32xN_AVX512 ss, 16
2622
+ FILTER_VER_S_CHROMA_32xN_AVX512 ss, 24
2623
+ FILTER_VER_S_CHROMA_32xN_AVX512 ss, 32
2624
+ FILTER_VER_S_CHROMA_32xN_AVX512 ss, 48
2625
+ FILTER_VER_S_CHROMA_32xN_AVX512 ss, 64
2626
+ FILTER_VER_S_CHROMA_32xN_AVX512 sp, 8
2627
+ FILTER_VER_S_CHROMA_32xN_AVX512 sp, 16
2628
+ FILTER_VER_S_CHROMA_32xN_AVX512 sp, 24
2629
+ FILTER_VER_S_CHROMA_32xN_AVX512 sp, 32
2630
+ FILTER_VER_S_CHROMA_32xN_AVX512 sp, 48
2631
+ FILTER_VER_S_CHROMA_32xN_AVX512 sp, 64
2632
+%endif
2633
+
2634
+%macro PROCESS_CHROMA_VERT_S_48x4_AVX512 1
2635
+ PROCESS_CHROMA_VERT_S_32x2_AVX512 %1
2636
+ lea r6, [r0 + 2 * r1]
2637
+
2638
+ movu m1, [r6]
2639
+ movu m3, [r6 + r1]
2640
+ punpcklwd m0, m1, m3
2641
+ pmaddwd m0, m7
2642
+ punpckhwd m1, m3
2643
+ pmaddwd m1, m7
2644
+ movu m4, [r6 + 2 * r1]
2645
+ punpcklwd m2, m3, m4
2646
+ pmaddwd m2, m7
2647
+ punpckhwd m3, m4
2648
+ pmaddwd m3, m7
2649
+
2650
+ movu m5, [r6 + r4]
2651
+ punpcklwd m6, m4, m5
2652
+ pmaddwd m6, m8
2653
+ paddd m0, m6
2654
+ punpckhwd m4, m5
2655
+ pmaddwd m4, m8
2656
+ paddd m1, m4
2657
+
2658
+ movu m4, [r6 + 4 * r1]
2659
+ punpcklwd m6, m5, m4
2660
+ pmaddwd m6, m8
2661
+ paddd m2, m6
2662
+ punpckhwd m5, m4
2663
+ pmaddwd m5, m8
2664
+ paddd m3, m5
2665
+
2666
+%ifidn %1, sp
2667
+ paddd m0, m9
2668
+ paddd m1, m9
2669
+ paddd m2, m9
2670
+ paddd m3, m9
2671
+
2672
+ psrad m0, 12
2673
+ psrad m1, 12
2674
+ psrad m2, 12
2675
+ psrad m3, 12
2676
+
2677
+ packssdw m0, m1
2678
+ packssdw m2, m3
2679
+ packuswb m0, m2
2680
+ vpermq m0, m10, m0
2681
+ movu [r2 + 2 * r3], ym0
2682
+ vextracti32x8 [r2 + r5], m0, 1
2683
+%else
2684
+ psrad m0, 6
2685
+ psrad m1, 6
2686
+ psrad m2, 6
2687
+ psrad m3, 6
2688
+
2689
+ packssdw m0, m1
2690
+ packssdw m2, m3
2691
+ movu [r2 + 2 * r3], m0
2692
+ movu [r2 + r5], m2
2693
+%endif
2694
+
2695
+ movu ym1, [r0 + mmsize]
2696
+ vinserti32x8 m1, [r6 + mmsize], 1
2697
+ movu ym3, [r0 + r1 + mmsize]
2698
+ vinserti32x8 m3, [r6 + r1 + mmsize], 1
2699
+ punpcklwd m0, m1, m3
2700
+ pmaddwd m0, m7
2701
+ punpckhwd m1, m3
2702
+ pmaddwd m1, m7
2703
+
2704
+ movu ym4, [r0 + 2 * r1 + mmsize]
2705
+ vinserti32x8 m4, [r6 + 2 * r1 + mmsize], 1
2706
+ punpcklwd m2, m3, m4
2707
+ pmaddwd m2, m7
2708
+ punpckhwd m3, m4
2709
+ pmaddwd m3, m7
2710
+
2711
+ movu ym5, [r0 + r4 + mmsize]
2712
+ vinserti32x8 m5, [r6 + r4 + mmsize], 1
2713
+ punpcklwd m6, m4, m5
2714
+ pmaddwd m6, m8
2715
+ paddd m0, m6
2716
+ punpckhwd m4, m5
2717
+ pmaddwd m4, m8
2718
+ paddd m1, m4
2719
+
2720
+ movu ym4, [r0 + 4 * r1 + mmsize]
2721
+ vinserti32x8 m4, [r6 + 4 * r1 + mmsize], 1
2722
+ punpcklwd m6, m5, m4
2723
+ pmaddwd m6, m8
2724
+ paddd m2, m6
2725
+ punpckhwd m5, m4
2726
+ pmaddwd m5, m8
2727
+ paddd m3, m5
2728
+
2729
+%ifidn %1, sp
2730
+ paddd m0, m9
2731
+ paddd m1, m9
2732
+ paddd m2, m9
2733
+ paddd m3, m9
2734
+
2735
+ psrad m0, 12
2736
+ psrad m1, 12
2737
+ psrad m2, 12
2738
+ psrad m3, 12
2739
+
2740
+ packssdw m0, m1
2741
+ packssdw m2, m3
2742
+ packuswb m0, m2
2743
+ vpermq m0, m10, m0
2744
+ movu [r2 + mmsize/2], xm0
2745
+ vextracti32x4 [r2 + r3 + mmsize/2], m0, 2
2746
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m0, 1
2747
+ vextracti32x4 [r2 + r5 + mmsize/2], m0, 3
2748
+%else
2749
+ psrad m0, 6
2750
+ psrad m1, 6
2751
+ psrad m2, 6
2752
+ psrad m3, 6
2753
+ packssdw m0, m1
2754
+ packssdw m2, m3
2755
+
2756
+ movu [r2 + mmsize], ym0
2757
+ movu [r2 + r3 + mmsize], ym2
2758
+ vextracti32x8 [r2 + 2 * r3 + mmsize], m0, 1
2759
+ vextracti32x8 [r2 + r5 + mmsize], m2, 1
2760
+%endif
2761
+%endmacro
2762
+
2763
+%macro FILTER_VER_S_CHROMA_48x64_AVX512 1
2764
+INIT_ZMM avx512
2765
+cglobal interp_4tap_vert_%1_48x64, 4, 7, 11
2766
+ mov r4d, r4m
2767
+ shl r4d, 7
2768
+
2769
+%ifdef PIC
2770
+ lea r5, [pw_ChromaCoeffVer_32_avx512]
2771
+ mova m7, [r5 + r4]
2772
+ mova m8, [r5 + r4 + mmsize]
2773
+%else
2774
+ mova m7, [pw_ChromaCoeffVer_32_avx512 + r4]
2775
+ mova m8, [pw_ChromaCoeffVer_32_avx512 + r4 + mmsize]
2776
+%endif
2777
+
2778
+%ifidn %1, sp
2779
+ vbroadcasti32x4 m9, [pd_526336]
2780
+ mova m10, [interp8_vsp_store_avx512]
2781
+%else
2782
+ add r3d, r3d
2783
+%endif
2784
+ add r1d, r1d
2785
+ sub r0, r1
2786
+ lea r4, [r1 * 3]
2787
+ lea r5, [r3 * 3]
2788
+
2789
+%rep 15
2790
+ PROCESS_CHROMA_VERT_S_48x4_AVX512 %1
2791
+ lea r0, [r0 + 4 * r1]
2792
+ lea r2, [r2 + 4 * r3]
2793
+%endrep
2794
+ PROCESS_CHROMA_VERT_S_48x4_AVX512 %1
2795
+ RET
2796
+%endmacro
2797
+
2798
+%if ARCH_X86_64
2799
+ FILTER_VER_S_CHROMA_48x64_AVX512 ss
2800
+ FILTER_VER_S_CHROMA_48x64_AVX512 sp
2801
+%endif
2802
+
2803
+%macro PROCESS_CHROMA_VERT_S_64x2_AVX512 1
2804
+ PROCESS_CHROMA_VERT_S_32x2_AVX512 %1
2805
+ movu m1, [r0 + mmsize]
2806
+ movu m3, [r0 + r1 + mmsize]
2807
+ punpcklwd m0, m1, m3
2808
+ pmaddwd m0, m7
2809
+ punpckhwd m1, m3
2810
+ pmaddwd m1, m7
2811
+ movu m4, [r0 + 2 * r1 + mmsize]
2812
+ punpcklwd m2, m3, m4
2813
+ pmaddwd m2, m7
2814
+ punpckhwd m3, m4
2815
+ pmaddwd m3, m7
2816
+
2817
+ movu m5, [r0 + r4 + mmsize]
2818
+ punpcklwd m6, m4, m5
2819
+ pmaddwd m6, m8
2820
+ paddd m0, m6
2821
+ punpckhwd m4, m5
2822
+ pmaddwd m4, m8
2823
+ paddd m1, m4
2824
+
2825
+ movu m4, [r0 + 4 * r1 + mmsize]
2826
+ punpcklwd m6, m5, m4
2827
+ pmaddwd m6, m8
2828
+ paddd m2, m6
2829
+ punpckhwd m5, m4
2830
+ pmaddwd m5, m8
2831
+ paddd m3, m5
2832
+
2833
+%ifidn %1, sp
2834
+ paddd m0, m9
2835
+ paddd m1, m9
2836
+ paddd m2, m9
2837
+ paddd m3, m9
2838
+
2839
+ psrad m0, 12
2840
+ psrad m1, 12
2841
+ psrad m2, 12
2842
+ psrad m3, 12
2843
+
2844
+ packssdw m0, m1
2845
+ packssdw m2, m3
2846
+ packuswb m0, m2
2847
+ vpermq m0, m10, m0
2848
+ movu [r2 + mmsize/2], ym0
2849
+ vextracti32x8 [r2 + r3 + mmsize/2], m0, 1
2850
+%else
2851
+ psrad m0, 6
2852
+ psrad m1, 6
2853
+ psrad m2, 6
2854
+ psrad m3, 6
2855
+
2856
+ packssdw m0, m1
2857
+ packssdw m2, m3
2858
+ movu [r2 + mmsize], m0
2859
+ movu [r2 + r3 + mmsize], m2
2860
+%endif
2861
+%endmacro
2862
+
2863
+%macro FILTER_VER_S_CHROMA_64xN_AVX512 2
2864
+INIT_ZMM avx512
2865
+cglobal interp_4tap_vert_%1_64x%2, 4, 6, 11
2866
+ mov r4d, r4m
2867
+ shl r4d, 7
2868
+%ifdef PIC
2869
+ lea r5, [pw_ChromaCoeffVer_32_avx512]
2870
+ mova m7, [r5 + r4]
2871
+ mova m8, [r5 + r4 + mmsize]
2872
+%else
2873
+ mova m7, [pw_ChromaCoeffVer_32_avx512 + r4]
2874
+ mova m8, [pw_ChromaCoeffVer_32_avx512 + r4 + mmsize]
2875
+%endif
2876
+
2877
+%ifidn %1, sp
2878
+ vbroadcasti32x4 m9, [pd_526336]
2879
+ mova m10, [interp8_vsp_store_avx512]
2880
+%else
2881
+ add r3d, r3d
2882
+%endif
2883
+ add r1d, r1d
2884
+ sub r0, r1
2885
+ lea r4, [r1 * 3]
2886
+ lea r5, [r3 * 3]
2887
+
2888
+%rep %2/2 - 1
2889
+ PROCESS_CHROMA_VERT_S_64x2_AVX512 %1
2890
+ lea r0, [r0 + r1 * 2]
2891
+ lea r2, [r2 + r3 * 2]
2892
+%endrep
2893
+ PROCESS_CHROMA_VERT_S_64x2_AVX512 %1
2894
+ RET
2895
+%endmacro
2896
+
2897
+%if ARCH_X86_64
2898
+ FILTER_VER_S_CHROMA_64xN_AVX512 ss, 16
2899
+ FILTER_VER_S_CHROMA_64xN_AVX512 ss, 32
2900
+ FILTER_VER_S_CHROMA_64xN_AVX512 ss, 48
2901
+ FILTER_VER_S_CHROMA_64xN_AVX512 ss, 64
2902
+ FILTER_VER_S_CHROMA_64xN_AVX512 sp, 16
2903
+ FILTER_VER_S_CHROMA_64xN_AVX512 sp, 32
2904
+ FILTER_VER_S_CHROMA_64xN_AVX512 sp, 48
2905
+ FILTER_VER_S_CHROMA_64xN_AVX512 sp, 64
2906
+%endif
2907
+;-------------------------------------------------------------------------------------------------------------
2908
+;avx512 chroma_vss code end
2909
+;-------------------------------------------------------------------------------------------------------------
2910
+;-------------------------------------------------------------------------------------------------------------
2911
+;ipfilter_chroma_avx512 code end
2912
+;-------------------------------------------------------------------------------------------------------------
2913
+;-------------------------------------------------------------------------------------------------------------
2914
+;ipfilter_luma_avx512 code start
2915
+;-------------------------------------------------------------------------------------------------------------
2916
+%macro PROCESS_IPFILTER_LUMA_PP_64x1_AVX512 0
2917
+ ; register map
2918
+ ; m0 , m1 interpolate coeff
2919
+ ; m2 , m3, m4 shuffle order table
2920
+ ; m5 - pw_1
2921
+ ; m6 - pw_512
2922
+
2923
+ movu m7, [r0]
2924
+ movu m9, [r0 + 8]
2925
+
2926
+ pshufb m8, m7, m3
2927
+ pshufb m7, m2
2928
+ pshufb m10, m9, m3
2929
+ pshufb m11, m9, m4
2930
+ pshufb m9, m2
2931
+
2932
+
2933
+ pmaddubsw m7, m0
2934
+ pmaddubsw m12, m8, m1
2935
+ pmaddwd m7, m5
2936
+ pmaddwd m12, m5
2937
+ paddd m7, m12
2938
+
2939
+ pmaddubsw m8, m0
2940
+ pmaddubsw m12, m9, m1
2941
+ pmaddwd m8, m5
2942
+ pmaddwd m12, m5
2943
+ paddd m8, m12
2944
+
2945
+ pmaddubsw m9, m0
2946
+ pmaddubsw m12, m10, m1
2947
+ pmaddwd m9, m5
2948
+ pmaddwd m12, m5
2949
+ paddd m9, m12
2950
+
2951
+ pmaddubsw m10, m0
2952
+ pmaddubsw m12, m11, m1
2953
+ pmaddwd m10, m5
2954
+ pmaddwd m12, m5
2955
+ paddd m10, m12
2956
+
2957
+ packssdw m7, m8
2958
+ packssdw m9, m10
2959
+ pmulhrsw m7, m6
2960
+ pmulhrsw m9, m6
2961
+ packuswb m7, m9
2962
+ movu [r2], m7
2963
+%endmacro
2964
+
2965
+%macro PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 0
2966
+ ; register map
2967
+ ; m0 , m1 interpolate coeff
2968
+ ; m2 , m3, m4 shuffle order table
2969
+ ; m5 - pw_1
2970
+ ; m6 - pw_512
2971
+
2972
+ movu ym7, [r0]
2973
+ vinserti32x8 m7, [r0 + r1], 1
2974
+ movu ym9, [r0 + 8]
2975
+ vinserti32x8 m9, [r0 + r1 + 8], 1
2976
+
2977
+ pshufb m8, m7, m3
2978
+ pshufb m7, m2
2979
+ pshufb m10, m9, m3
2980
+ pshufb m11, m9, m4
2981
+ pshufb m9, m2
2982
+
2983
+ pmaddubsw m7, m0
2984
+ pmaddubsw m12, m8, m1
2985
+ pmaddwd m7, m5
2986
+ pmaddwd m12, m5
2987
+ paddd m7, m12
2988
+
2989
+ pmaddubsw m8, m0
2990
+ pmaddubsw m12, m9, m1
2991
+ pmaddwd m8, m5
2992
+ pmaddwd m12, m5
2993
+ paddd m8, m12
2994
+
2995
+ pmaddubsw m9, m0
2996
+ pmaddubsw m12, m10, m1
2997
+ pmaddwd m9, m5
2998
+ pmaddwd m12, m5
2999
+ paddd m9, m12
3000
+
3001
+ pmaddubsw m10, m0
3002
+ pmaddubsw m12, m11, m1
3003
+ pmaddwd m10, m5
3004
+ pmaddwd m12, m5
3005
+ paddd m10, m12
3006
+
3007
+ packssdw m7, m8
3008
+ packssdw m9, m10
3009
+ pmulhrsw m7, m6
3010
+ pmulhrsw m9, m6
3011
+ packuswb m7, m9
3012
+ movu [r2], ym7
3013
+ vextracti32x8 [r2 + r3], m7, 1
3014
+%endmacro
3015
+
3016
+%macro PROCESS_IPFILTER_LUMA_PP_16x4_AVX512 0
3017
+ ; register map
3018
+ ; m0 , m1 interpolate coeff
3019
+ ; m2 , m3, m4 shuffle order table
3020
+ ; m5 - pw_1
3021
+ ; m6 - pw_512
3022
+
3023
+ movu xm7, [r0]
3024
+ vinserti32x4 m7, [r0 + r1], 1
3025
+ vinserti32x4 m7, [r0 + 2 * r1], 2
3026
+ vinserti32x4 m7, [r0 + r6], 3
3027
+
3028
+ pshufb m8, m7, m3
3029
+ pshufb m7, m2
3030
+
3031
+ movu xm9, [r0 + 8]
3032
+ vinserti32x4 m9, [r0 + r1 + 8], 1
3033
+ vinserti32x4 m9, [r0 + 2 * r1 + 8], 2
3034
+ vinserti32x4 m9, [r0 + r6 + 8], 3
3035
+
3036
+ pshufb m10, m9, m3
3037
+ pshufb m11, m9, m4
3038
+ pshufb m9, m2
3039
+
3040
+ pmaddubsw m7, m0
3041
+ pmaddubsw m12, m8, m1
3042
+ pmaddwd m7, m5
3043
+ pmaddwd m12, m5
3044
+ paddd m7, m12
3045
+
3046
+ pmaddubsw m8, m0
3047
+ pmaddubsw m12, m9, m1
3048
+ pmaddwd m8, m5
3049
+ pmaddwd m12, m5
3050
+ paddd m8, m12
3051
+
3052
+ pmaddubsw m9, m0
3053
+ pmaddubsw m12, m10, m1
3054
+ pmaddwd m9, m5
3055
+ pmaddwd m12, m5
3056
+ paddd m9, m12
3057
+
3058
+ pmaddubsw m10, m0
3059
+ pmaddubsw m12, m11, m1
3060
+ pmaddwd m10, m5
3061
+ pmaddwd m12, m5
3062
+ paddd m10, m12
3063
+
3064
+ packssdw m7, m8
3065
+ packssdw m9, m10
3066
+ pmulhrsw m7, m6
3067
+ pmulhrsw m9, m6
3068
+ packuswb m7, m9
3069
+ movu [r2], xm7
3070
+ vextracti32x4 [r2 + r3], m7, 1
3071
+ vextracti32x4 [r2 + 2 * r3], m7, 2
3072
+ vextracti32x4 [r2 + r7], m7, 3
3073
+%endmacro
3074
+
3075
+%macro PROCESS_IPFILTER_LUMA_PP_48x4_AVX512 0
3076
+ ; register map
3077
+ ; m0 , m1 interpolate coeff
3078
+ ; m2 , m3, m4 shuffle order table
3079
+ ; m5 - pw_1
3080
+ ; m6 - pw_512
3081
+
3082
+ movu ym7, [r0]
3083
+ vinserti32x8 m7, [r0 + r1], 1
3084
+ movu ym9, [r0 + 8]
3085
+ vinserti32x8 m9, [r0 + r1 + 8], 1
3086
+
3087
+ pshufb m8, m7, m3
3088
+ pshufb m7, m2
3089
+ pshufb m10, m9, m3
3090
+ pshufb m11, m9, m4
3091
+ pshufb m9, m2
3092
+
3093
+ pmaddubsw m7, m0
3094
+ pmaddubsw m12, m8, m1
3095
+ pmaddwd m7, m5
3096
+ pmaddwd m12, m5
3097
+ paddd m7, m12
3098
+
3099
+ pmaddubsw m8, m0
3100
+ pmaddubsw m12, m9, m1
3101
+ pmaddwd m8, m5
3102
+ pmaddwd m12, m5
3103
+ paddd m8, m12
3104
+
3105
+ pmaddubsw m9, m0
3106
+ pmaddubsw m12, m10, m1
3107
+ pmaddwd m9, m5
3108
+ pmaddwd m12, m5
3109
+ paddd m9, m12
3110
+
3111
+ pmaddubsw m10, m0
3112
+ pmaddubsw m12, m11, m1
3113
+ pmaddwd m10, m5
3114
+ pmaddwd m12, m5
3115
+ paddd m10, m12
3116
+
3117
+ packssdw m7, m8
3118
+ packssdw m9, m10
3119
+ pmulhrsw m7, m6
3120
+ pmulhrsw m9, m6
3121
+ packuswb m7, m9
3122
+ movu [r2], ym7
3123
+ vextracti32x8 [r2 + r3], m7, 1
3124
+
3125
+ movu ym7, [r0 + 2 * r1]
3126
+ vinserti32x8 m7, [r0 + r6], 1
3127
+ movu ym9, [r0 + 2 * r1 + 8]
3128
+ vinserti32x8 m9, [r0 + r6 + 8], 1
3129
+
3130
+ pshufb m8, m7, m3
3131
+ pshufb m7, m2
3132
+ pshufb m10, m9, m3
3133
+ pshufb m11, m9, m4
3134
+ pshufb m9, m2
3135
+
3136
+ pmaddubsw m7, m0
3137
+ pmaddubsw m12, m8, m1
3138
+ pmaddwd m7, m5
3139
+ pmaddwd m12, m5
3140
+ paddd m7, m12
3141
+
3142
+ pmaddubsw m8, m0
3143
+ pmaddubsw m12, m9, m1
3144
+ pmaddwd m8, m5
3145
+ pmaddwd m12, m5
3146
+ paddd m8, m12
3147
+
3148
+ pmaddubsw m9, m0
3149
+ pmaddubsw m12, m10, m1
3150
+ pmaddwd m9, m5
3151
+ pmaddwd m12, m5
3152
+ paddd m9, m12
3153
+
3154
+ pmaddubsw m10, m0
3155
+ pmaddubsw m12, m11, m1
3156
+ pmaddwd m10, m5
3157
+ pmaddwd m12, m5
3158
+ paddd m10, m12
3159
+
3160
+ packssdw m7, m8
3161
+ packssdw m9, m10
3162
+ pmulhrsw m7, m6
3163
+ pmulhrsw m9, m6
3164
+ packuswb m7, m9
3165
+ movu [r2 + 2 * r3], ym7
3166
+ vextracti32x8 [r2 + r7], m7, 1
3167
+
3168
+ movu xm7, [r0 + mmsize/2]
3169
+ vinserti32x4 m7, [r0 + r1 + mmsize/2], 1
3170
+ vinserti32x4 m7, [r0 + 2 * r1 + mmsize/2], 2
3171
+ vinserti32x4 m7, [r0 + r6 + mmsize/2], 3
3172
+
3173
+ pshufb m8, m7, m3
3174
+ pshufb m7, m2
3175
+
3176
+ movu xm9, [r0 + 40]
3177
+ vinserti32x4 m9, [r0 + r1 + 40], 1
3178
+ vinserti32x4 m9, [r0 + 2 * r1 + 40], 2
3179
+ vinserti32x4 m9, [r0 + r6 + 40], 3
3180
+
3181
+ pshufb m10, m9, m3
3182
+ pshufb m11, m9, m4
3183
+ pshufb m9, m2
3184
+
3185
+ pmaddubsw m7, m0
3186
+ pmaddubsw m12, m8, m1
3187
+ pmaddwd m7, m5
3188
+ pmaddwd m12, m5
3189
+ paddd m7, m12
3190
+
3191
+ pmaddubsw m8, m0
3192
+ pmaddubsw m12, m9, m1
3193
+ pmaddwd m8, m5
3194
+ pmaddwd m12, m5
3195
+ paddd m8, m12
3196
+
3197
+ pmaddubsw m9, m0
3198
+ pmaddubsw m12, m10, m1
3199
+ pmaddwd m9, m5
3200
+ pmaddwd m12, m5
3201
+ paddd m9, m12
3202
+
3203
+ pmaddubsw m10, m0
3204
+ pmaddubsw m12, m11, m1
3205
+ pmaddwd m10, m5
3206
+ pmaddwd m12, m5
3207
+ paddd m10, m12
3208
+
3209
+ packssdw m7, m8
3210
+ packssdw m9, m10
3211
+ pmulhrsw m7, m6
3212
+ pmulhrsw m9, m6
3213
+ packuswb m7, m9
3214
+ movu [r2 + mmsize/2], xm7
3215
+ vextracti32x4 [r2 + r3 + mmsize/2], m7, 1
3216
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m7, 2
3217
+ vextracti32x4 [r2 + r7 + mmsize/2], m7, 3
3218
+%endmacro
3219
+
3220
+%macro IPFILTER_LUMA_64xN_AVX512 1
3221
+INIT_ZMM avx512
3222
+cglobal interp_8tap_horiz_pp_64x%1, 4,6,13
3223
+ sub r0, 3
3224
+ mov r4d, r4m
3225
+%ifdef PIC
3226
+ lea r5, [tab_LumaCoeff]
3227
+ vpbroadcastd m0, [r5 + r4 * 8]
3228
+ vpbroadcastd m1, [r5 + r4 * 8 + 4]
3229
+%else
3230
+ vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
3231
+ vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
3232
+%endif
3233
+ vbroadcasti32x8 m2, [interp4_horiz_shuf_load1_avx512]
3234
+ vbroadcasti32x8 m3, [interp4_horiz_shuf_load3_avx512]
3235
+ vbroadcasti32x8 m4, [interp4_horiz_shuf_load2_avx512]
3236
+ vpbroadcastd m5, [pw_1]
3237
+ vbroadcasti32x8 m6, [pw_512]
3238
+
3239
+%rep %1-1
3240
+ PROCESS_IPFILTER_LUMA_PP_64x1_AVX512
3241
+ lea r0, [r0 + r1]
3242
+ lea r2, [r2 + r3]
3243
+%endrep
3244
+ PROCESS_IPFILTER_LUMA_PP_64x1_AVX512
3245
+ RET
3246
+%endmacro
3247
+
3248
+%if ARCH_X86_64
3249
+IPFILTER_LUMA_64xN_AVX512 16
3250
+IPFILTER_LUMA_64xN_AVX512 32
3251
+IPFILTER_LUMA_64xN_AVX512 48
3252
+IPFILTER_LUMA_64xN_AVX512 64
3253
+%endif
3254
+
3255
+%macro IPFILTER_LUMA_32xN_AVX512 1
3256
+INIT_ZMM avx512
3257
+cglobal interp_8tap_horiz_pp_32x%1, 4,6,13
3258
+ sub r0, 3
3259
+ mov r4d, r4m
3260
+%ifdef PIC
3261
+ lea r5, [tab_LumaCoeff]
3262
+ vpbroadcastd m0, [r5 + r4 * 8]
3263
+ vpbroadcastd m1, [r5 + r4 * 8 + 4]
3264
+%else
3265
+ vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
3266
+ vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
3267
+%endif
3268
+ vbroadcasti32x8 m2, [interp4_horiz_shuf_load1_avx512]
3269
+ vbroadcasti32x8 m3, [interp4_horiz_shuf_load3_avx512]
3270
+ vbroadcasti32x8 m4, [interp4_horiz_shuf_load2_avx512]
3271
+ vpbroadcastd m5, [pw_1]
3272
+ vbroadcasti32x8 m6, [pw_512]
3273
+
3274
+%rep %1/2 -1
3275
+ PROCESS_IPFILTER_LUMA_PP_32x2_AVX512
3276
+ lea r0, [r0 + 2 * r1]
3277
+ lea r2, [r2 + 2 * r3]
3278
+%endrep
3279
+ PROCESS_IPFILTER_LUMA_PP_32x2_AVX512
3280
+ RET
3281
+%endmacro
3282
+
3283
+%if ARCH_X86_64
3284
+IPFILTER_LUMA_32xN_AVX512 8
3285
+IPFILTER_LUMA_32xN_AVX512 16
3286
+IPFILTER_LUMA_32xN_AVX512 24
3287
+IPFILTER_LUMA_32xN_AVX512 32
3288
+IPFILTER_LUMA_32xN_AVX512 64
3289
+%endif
3290
+
3291
+%macro IPFILTER_LUMA_16xN_AVX512 1
3292
+INIT_ZMM avx512
3293
+cglobal interp_8tap_horiz_pp_16x%1, 4,8,14
3294
+ sub r0, 3
3295
+ mov r4d, r4m
3296
+ lea r6, [3 * r1]
3297
+ lea r7, [3 * r3]
3298
+%ifdef PIC
3299
+ lea r5, [tab_LumaCoeff]
3300
+ vpbroadcastd m0, [r5 + r4 * 8]
3301
+ vpbroadcastd m1, [r5 + r4 * 8 + 4]
3302
+%else
3303
+ vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
3304
+ vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
3305
+%endif
3306
+ vbroadcasti32x8 m2, [interp4_horiz_shuf_load1_avx512]
3307
+ vbroadcasti32x8 m3, [interp4_horiz_shuf_load3_avx512]
3308
+ vbroadcasti32x8 m4, [interp4_horiz_shuf_load2_avx512]
3309
+ vpbroadcastd m5, [pw_1]
3310
+ vbroadcasti32x8 m6, [pw_512]
3311
+
3312
+%rep %1/4 -1
3313
+ PROCESS_IPFILTER_LUMA_PP_16x4_AVX512
3314
+ lea r0, [r0 + 4 * r1]
3315
+ lea r2, [r2 + 4 * r3]
3316
+%endrep
3317
+ PROCESS_IPFILTER_LUMA_PP_16x4_AVX512
3318
+ RET
3319
+%endmacro
3320
+
3321
+%if ARCH_X86_64
3322
+IPFILTER_LUMA_16xN_AVX512 4
3323
+IPFILTER_LUMA_16xN_AVX512 8
3324
+IPFILTER_LUMA_16xN_AVX512 12
3325
+IPFILTER_LUMA_16xN_AVX512 16
3326
+IPFILTER_LUMA_16xN_AVX512 32
3327
+IPFILTER_LUMA_16xN_AVX512 64
3328
+%endif
3329
+
3330
+%if ARCH_X86_64
3331
+INIT_ZMM avx512
3332
+cglobal interp_8tap_horiz_pp_48x64, 4,8,14
3333
+ sub r0, 3
3334
+ mov r4d, r4m
3335
+ lea r6, [3 * r1]
3336
+ lea r7, [3 * r3]
3337
+%ifdef PIC
3338
+ lea r5, [tab_LumaCoeff]
3339
+ vpbroadcastd m0, [r5 + r4 * 8]
3340
+ vpbroadcastd m1, [r5 + r4 * 8 + 4]
3341
+%else
3342
+ vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
3343
+ vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
3344
+%endif
3345
+ vbroadcasti32x8 m2, [interp4_horiz_shuf_load1_avx512]
3346
+ vbroadcasti32x8 m3, [interp4_horiz_shuf_load3_avx512]
3347
+ vbroadcasti32x8 m4, [interp4_horiz_shuf_load2_avx512]
3348
+ vpbroadcastd m5, [pw_1]
3349
+ vbroadcasti32x8 m6, [pw_512]
3350
+
3351
+%rep 15
3352
+ PROCESS_IPFILTER_LUMA_PP_48x4_AVX512
3353
+ lea r0, [r0 + 4 * r1]
3354
+ lea r2, [r2 + 4 * r3]
3355
+%endrep
3356
+ PROCESS_IPFILTER_LUMA_PP_48x4_AVX512
3357
+ RET
3358
+%endif
3359
+
3360
+%macro PROCESS_IPFILTER_LUMA_PS_64x1_AVX512 0
3361
+ ; register map
3362
+ ; m0 , m1 - interpolate coeff
3363
+ ; m2 , m3, m4 - load shuffle order table
3364
+ ; m5 - pw_1
3365
+ ; m6 - pw_2000
3366
+ ; m7 - store shuffle order table
3367
+
3368
+ movu ym8, [r0]
3369
+ vinserti32x8 m8, [r0 + 8], 1
3370
+ pshufb m9, m8, m3
3371
+ pshufb m10, m8, m4
3372
+ pshufb m8, m2
3373
+
3374
+ movu ym11, [r0 + mmsize/2]
3375
+ vinserti32x8 m11, [r0 + mmsize/2 + 8], 1
3376
+ pshufb m12, m11, m3
3377
+ pshufb m13, m11, m4
3378
+ pshufb m11, m2
3379
+
3380
+ pmaddubsw m8, m0
3381
+ pmaddubsw m14, m9, m1
3382
+ pmaddwd m8, m5
3383
+ pmaddwd m14, m5
3384
+ paddd m8, m14
3385
+
3386
+ pmaddubsw m9, m0
3387
+ pmaddubsw m14, m10, m1
3388
+ pmaddwd m9, m5
3389
+ pmaddwd m14, m5
3390
+ paddd m9, m14
3391
+
3392
+ pmaddubsw m11, m0
3393
+ pmaddubsw m14, m12, m1
3394
+ pmaddwd m11, m5
3395
+ pmaddwd m14, m5
3396
+ paddd m11, m14
3397
+
3398
+ pmaddubsw m12, m0
3399
+ pmaddubsw m14, m13, m1
3400
+ pmaddwd m12, m5
3401
+ pmaddwd m14, m5
3402
+ paddd m12, m14
3403
+
3404
+
3405
+ packssdw m8, m9
3406
+ packssdw m11, m12
3407
+ psubw m8, m6
3408
+ psubw m11, m6
3409
+ vpermq m8, m7, m8
3410
+ vpermq m11, m7, m11
3411
+ movu [r2], m8
3412
+ movu [r2 + mmsize], m11
3413
+%endmacro
3414
+
3415
+%macro IPFILTER_LUMA_PS_64xN_AVX512 1
3416
+INIT_ZMM avx512
3417
+cglobal interp_8tap_horiz_ps_64x%1, 4,7,15
3418
+ mov r4d, r4m
3419
+ mov r5d, r5m
3420
+
3421
+%ifdef PIC
3422
+ lea r6, [tab_LumaCoeff]
3423
+ vpbroadcastd m0, [r6 + r4 * 8]
3424
+ vpbroadcastd m1, [r6 + r4 * 8 + 4]
3425
+%else
3426
+ vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
3427
+ vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
3428
+%endif
3429
+ vbroadcasti32x8 m2, [interp4_horiz_shuf_load1_avx512]
3430
+ vbroadcasti32x8 m3, [interp4_horiz_shuf_load3_avx512]
3431
+ vbroadcasti32x8 m4, [interp4_horiz_shuf_load2_avx512]
3432
+ vpbroadcastd m5, [pw_1]
3433
+ vbroadcasti32x8 m6, [pw_2000]
3434
+ mova m7, [interp8_hps_store_avx512]
3435
+
3436
+ mov r4d, %1
3437
+ sub r0, 3
3438
+ test r5d, r5d
3439
+ jz .loop
3440
+ lea r6, [r1 * 3]
3441
+ sub r0, r6 ; r0(src)-r6
3442
+ add r4d, 7 ; blkheight += N - 1
3443
+
3444
+.loop:
3445
+ PROCESS_IPFILTER_LUMA_PS_64x1_AVX512
3446
+ lea r0, [r0 + r1]
3447
+ lea r2, [r2 + 2 * r3]
3448
+ dec r4d
3449
+ jnz .loop
3450
+ RET
3451
+%endmacro
3452
+
3453
+%if ARCH_X86_64 == 1
3454
+ IPFILTER_LUMA_PS_64xN_AVX512 16
3455
+ IPFILTER_LUMA_PS_64xN_AVX512 32
3456
+ IPFILTER_LUMA_PS_64xN_AVX512 48
3457
+ IPFILTER_LUMA_PS_64xN_AVX512 64
3458
+%endif
3459
+
3460
+%macro PROCESS_IPFILTER_LUMA_PS_32x1_AVX512 0
3461
+ ; register map
3462
+ ; m0 , m1 - interpolate coeff
3463
+ ; m2 , m3, m4 - load shuffle order table
3464
+ ; m5 - pw_1
3465
+ ; m6 - pw_2000
3466
+ ; m7 - store shuffle order table
3467
+
3468
+ movu ym8, [r0]
3469
+ vinserti32x8 m8, [r0 + 8], 1
3470
+ pshufb m9, m8, m3
3471
+ pshufb m10, m8, m4
3472
+ pshufb m8, m2
3473
+
3474
+ pmaddubsw m8, m0
3475
+ pmaddubsw m11, m9, m1
3476
+ pmaddwd m8, m5
3477
+ pmaddwd m11, m5
3478
+ paddd m8, m11
3479
+
3480
+ pmaddubsw m9, m0
3481
+ pmaddubsw m11, m10, m1
3482
+ pmaddwd m9, m5
3483
+ pmaddwd m11, m5
3484
+ paddd m9, m11
3485
+
3486
+ packssdw m8, m9
3487
+ psubw m8, m6
3488
+ vpermq m8, m7, m8
3489
+ movu [r2], m8
3490
+%endmacro
3491
+
3492
+%macro IPFILTER_LUMA_PS_32xN_AVX512 1
3493
+INIT_ZMM avx512
3494
+cglobal interp_8tap_horiz_ps_32x%1, 4,7,12
3495
+ mov r4d, r4m
3496
+ mov r5d, r5m
3497
+
3498
+%ifdef PIC
3499
+ lea r6, [tab_LumaCoeff]
3500
+ vpbroadcastd m0, [r6 + r4 * 8]
3501
+ vpbroadcastd m1, [r6 + r4 * 8 + 4]
3502
+%else
3503
+ vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
3504
+ vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
3505
+%endif
3506
+ vbroadcasti32x8 m2, [interp4_horiz_shuf_load1_avx512]
3507
+ vbroadcasti32x8 m3, [interp4_horiz_shuf_load3_avx512]
3508
+ vbroadcasti32x8 m4, [interp4_horiz_shuf_load2_avx512]
3509
+ vpbroadcastd m5, [pw_1]
3510
+ vbroadcasti32x8 m6, [pw_2000]
3511
+ mova m7, [interp8_hps_store_avx512]
3512
+
3513
+ mov r4d, %1
3514
+ sub r0, 3
3515
+ test r5d, r5d
3516
+ jz .loop
3517
+ lea r6, [r1 * 3]
3518
+ sub r0, r6 ; r0(src)-r6
3519
+ add r4d, 7 ; blkheight += N - 1
3520
+
3521
+.loop:
3522
+ PROCESS_IPFILTER_LUMA_PS_32x1_AVX512
3523
+ lea r0, [r0 + r1]
3524
+ lea r2, [r2 + 2 * r3]
3525
+ dec r4d
3526
+ jnz .loop
3527
+ RET
3528
+%endmacro
3529
+
3530
+%if ARCH_X86_64 == 1
3531
+ IPFILTER_LUMA_PS_32xN_AVX512 8
3532
+ IPFILTER_LUMA_PS_32xN_AVX512 16
3533
+ IPFILTER_LUMA_PS_32xN_AVX512 24
3534
+ IPFILTER_LUMA_PS_32xN_AVX512 32
3535
+ IPFILTER_LUMA_PS_32xN_AVX512 64
3536
+%endif
3537
+
3538
+%macro PROCESS_IPFILTER_LUMA_PS_8TAP_16x2_AVX512 0
3539
+ movu xm7, [r0]
3540
+ vinserti32x4 m7, [r0 + 8], 1
3541
+ vinserti32x4 m7, [r0 + r1], 2
3542
+ vinserti32x4 m7, [r0 + r1 + 8], 3
3543
+ pshufb m8, m7, m3
3544
+ pshufb m9, m7, m4
3545
+ pshufb m7, m2
3546
+
3547
+ pmaddubsw m7, m0
3548
+ pmaddubsw m10, m8, m1
3549
+ pmaddwd m7, m5
3550
+ pmaddwd m10, m5
3551
+ paddd m7, m10
3552
+
3553
+ pmaddubsw m8, m0
3554
+ pmaddubsw m10, m9, m1
3555
+ pmaddwd m8, m5
3556
+ pmaddwd m10, m5
3557
+ paddd m8, m10
3558
+
3559
+ packssdw m7, m8
3560
+ psubw m7, m6
3561
+ movu [r2], ym7
3562
+ vextracti32x8 [r2 + r3], m7, 1
3563
+%endmacro
3564
+
3565
+%macro PROCESS_IPFILTER_LUMA_PS_8TAP_16x1_AVX512 0
3566
+ movu xm7, [r0]
3567
+ vinserti32x4 m7, [r0 + 8], 1
3568
+ pshufb ym8, ym7, ym3
3569
+ pshufb ym9, ym7, ym4
3570
+ pshufb ym7, ym2
3571
+
3572
+ pmaddubsw ym7, ym0
3573
+ pmaddubsw ym10, ym8, ym1
3574
+ pmaddwd ym7, ym5
3575
+ pmaddwd ym10, ym5
3576
+ paddd ym7, ym10
3577
+
3578
+ pmaddubsw ym8, ym0
3579
+ pmaddubsw ym10, ym9, ym1
3580
+ pmaddwd ym8, ym5
3581
+ pmaddwd ym10, ym5
3582
+ paddd ym8, ym10
3583
+
3584
+ packssdw ym7, ym8
3585
+ psubw ym7, ym6
3586
+ movu [r2], ym7
3587
+%endmacro
3588
+
3589
+;-------------------------------------------------------------------------------------------------------------
3590
+; void interp_horiz_ps_16xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
3591
+;-------------------------------------------------------------------------------------------------------------
3592
+%macro IPFILTER_LUMA_PS_8TAP_16xN_AVX512 1
3593
+INIT_ZMM avx512
3594
+cglobal interp_8tap_horiz_ps_16x%1, 4,7,11
3595
+ mov r4d, r4m
3596
+ mov r5d, r5m
3597
+ add r3, r3
3598
+
3599
+%ifdef PIC
3600
+ lea r6, [tab_LumaCoeff]
3601
+ vpbroadcastd m0, [r6 + r4 * 8]
3602
+ vpbroadcastd m1, [r6 + r4 * 8 + 4]
3603
+%else
3604
+ vpbroadcastd m0, [tab_LumaCoeff + r4 * 8]
3605
+ vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4]
3606
+%endif
3607
+ vbroadcasti32x8 m2, [interp4_horiz_shuf_load1_avx512]
3608
+ vbroadcasti32x8 m3, [interp4_horiz_shuf_load3_avx512]
3609
+ vbroadcasti32x8 m4, [interp4_horiz_shuf_load2_avx512]
3610
+ vpbroadcastd m5, [pw_1]
3611
+ vbroadcasti32x8 m6, [pw_2000]
3612
+
3613
+ ; register map
3614
+ ; m0 , m1 - interpolate coeff
3615
+ ; m2 , m3, m4 - load shuffle order table
3616
+ ; m5 - pw_1
3617
+ ; m6 - pw_2000
3618
+
3619
+ mov r4d, %1
3620
+ sub r0, 3
3621
+ test r5d, r5d
3622
+ jz .loop
3623
+ lea r6, [r1 * 3]
3624
+ sub r0, r6 ; r0(src)-r6
3625
+ add r4d, 7 ; blkheight += N - 1
3626
+ PROCESS_IPFILTER_LUMA_PS_8TAP_16x1_AVX512
3627
+ lea r0, [r0 + r1]
3628
+ lea r2, [r2 + r3]
3629
+ dec r4d
3630
+
3631
+.loop:
3632
+ PROCESS_IPFILTER_LUMA_PS_8TAP_16x2_AVX512
3633
+ lea r0, [r0 + 2 * r1]
3634
+ lea r2, [r2 + 2 * r3]
3635
+ sub r4d, 2
3636
+ jnz .loop
3637
+ RET
3638
+%endmacro
3639
+
3640
+%if ARCH_X86_64 == 1
3641
+ IPFILTER_LUMA_PS_8TAP_16xN_AVX512 4
3642
+ IPFILTER_LUMA_PS_8TAP_16xN_AVX512 8
3643
+ IPFILTER_LUMA_PS_8TAP_16xN_AVX512 12
3644
+ IPFILTER_LUMA_PS_8TAP_16xN_AVX512 16
3645
+ IPFILTER_LUMA_PS_8TAP_16xN_AVX512 32
3646
+ IPFILTER_LUMA_PS_8TAP_16xN_AVX512 64
3647
+%endif
3648
+
3649
; One row of the 48-wide horizontal 'ps' filter:
; pixels 0..31 in a full-zmm pass, pixels 32..47 in a second ymm-width pass.
; Clobbers m8-m11; expects m0-m7 preloaded by the caller (see register map below).
%macro PROCESS_IPFILTER_LUMA_PS_48x1_AVX512 0
    ; register map
    ; m0 , m1 - interpolate coeff
    ; m2 , m3, m4 - load shuffle order table
    ; m5 - pw_1
    ; m6 - pw_2000
    ; m7 - store shuffle order table

    ; ---- pixels 0..31 ----
    movu              ym8, [r0]
    vinserti32x8      m8, [r0 + 8], 1           ; low half covers x0.., high half covers x8.. (overlap feeds the taps)
    pshufb            m9, m8, m3
    pshufb            m10, m8, m4
    pshufb            m8, m2

    pmaddubsw         m8, m0
    pmaddubsw         m11, m9, m1
    pmaddwd           m8, m5                    ; pw_1 madd: horizontal pairwise add of the 16-bit products
    pmaddwd           m11, m5
    paddd             m8, m11

    pmaddubsw         m9, m0
    pmaddubsw         m11, m10, m1
    pmaddwd           m9, m5
    pmaddwd           m11, m5
    paddd             m9, m11

    packssdw          m8, m9
    psubw             m8, m6                    ; subtract pw_2000: 'ps' output offset
    vpermq            m8, m7, m8                ; restore lane order before the store
    movu              [r2], m8

    ; ---- pixels 32..47 (ymm width) ----
    movu              ym8, [r0 + 32]
    ; NOTE(review): x4 insert into lane 1 after a ym load is asymmetric vs the zmm
    ; pass above ([r0+8] x8 insert) — presumably intentional for the 16-pixel tail,
    ; but verify against the load shuffle tables.
    vinserti32x4      m8, [r0 + 40], 1
    pshufb            ym9, ym8, ym3
    pshufb            ym10, ym8, ym4
    pshufb            ym8, ym2

    pmaddubsw         ym8, ym0
    pmaddubsw         ym11, ym9, ym1
    pmaddwd           ym8, ym5
    pmaddwd           ym11, ym5
    paddd             ym8, ym11

    pmaddubsw         ym9, ym0
    pmaddubsw         ym11, ym10, ym1
    pmaddwd           ym9, ym5
    pmaddwd           ym11, ym5
    paddd             ym9, ym11

    packssdw          ym8, ym9
    psubw             ym8, ym6
    movu              [r2 + mmsize], ym8        ; second 32 int16 outputs start at byte 64
%endmacro
3702
+
3703
;-------------------------------------------------------------------------------------------------------------
; void interp_horiz_ps_48xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;-------------------------------------------------------------------------------------------------------------
; %1 = block height N. Processes one row per loop iteration via PROCESS_IPFILTER_LUMA_PS_48x1_AVX512.
; Note: unlike the 16xN variant, r3 is kept in int16 units and the loop steps dst by 2*r3 bytes.
%macro IPFILTER_LUMA_PS_48xN_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_horiz_ps_48x%1, 4,7,12
    mov               r4d, r4m
    mov               r5d, r5m

%ifdef PIC
    lea               r6, [tab_LumaCoeff]
    vpbroadcastd      m0, [r6 + r4 * 8]
    vpbroadcastd      m1, [r6 + r4 * 8 + 4]
%else
    vpbroadcastd      m0, [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1, [tab_LumaCoeff + r4 * 8 + 4]
%endif
    vbroadcasti32x8   m2, [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m3, [interp4_horiz_shuf_load3_avx512]
    vbroadcasti32x8   m4, [interp4_horiz_shuf_load2_avx512]
    vpbroadcastd      m5, [pw_1]
    vbroadcasti32x8   m6, [pw_2000]
    mova              m7, [interp8_hps_store_avx512]

    mov               r4d, %1
    sub               r0, 3                     ; back up 3 pixels for the 8-tap window
    test              r5d, r5d
    jz                .loop
    ; isRowExt: extend upward by 3 margin rows; row loop is 1 row/iter so no peel needed
    lea               r6, [r1 * 3]
    sub               r0, r6                    ; r0(src)-r6
    add               r4d, 7                    ; blkheight += N - 1

.loop:
    PROCESS_IPFILTER_LUMA_PS_48x1_AVX512
    lea               r0, [r0 + r1]
    lea               r2, [r2 + 2 * r3]         ; dst stride in int16 -> 2*r3 bytes
    dec               r4d
    jnz               .loop
    RET
%endmacro

%if ARCH_X86_64 == 1
    IPFILTER_LUMA_PS_48xN_AVX512 64
%endif
3747
+
3748
;-------------------------------------------------------------------------------------------------------------
;avx512 luma_vss code start
;-------------------------------------------------------------------------------------------------------------
; Vertical 8-tap 'ss' filter (int16 src -> int16 dst, >>6) for one 8x8 block.
; Four output rows are packed per zmm (one 8x16-bit row per 128-bit lane).
; Caller contract: r0 = src, r1 = srcStride (bytes), r2 = dst, r3 = dstStride (bytes),
; r7 = 3*r1, r5 = 3*r3; m15-m18 = coefficient pairs (taps 0/1, 2/3, 4/5, 6/7).
; Side effects callers rely on: r4 ends up at r0 + 8*r1 (next 8-row group),
; r2 is advanced by 4*r3 mid-macro; r6, r8 are clobbered.
%macro PROCESS_LUMA_VERT_SS_8x8_AVX512 0
    lea               r6, [r0 + 4 * r1]
    movu              xm1, [r0]                 ;0 row
    vinserti32x4      m1, [r0 + 2 * r1], 1
    vinserti32x4      m1, [r0 + 4 * r1], 2
    vinserti32x4      m1, [r6 + 2 * r1], 3
    movu              xm3, [r0 + r1]            ;1 row
    vinserti32x4      m3, [r0 + r7], 1
    vinserti32x4      m3, [r6 + r1], 2
    vinserti32x4      m3, [r6 + r7], 3
    punpcklwd         m0, m1, m3                ; interleave rows n / n+1 for pmaddwd pair-products
    pmaddwd           m0, m15
    punpckhwd         m1, m3
    pmaddwd           m1, m15

    movu              xm4, [r0 + 2 * r1]        ;2 row
    vinserti32x4      m4, [r0 + 4 * r1], 1
    vinserti32x4      m4, [r6 + 2 * r1], 2
    vinserti32x4      m4, [r6 + 4 * r1], 3
    punpcklwd         m2, m3, m4
    pmaddwd           m2, m15
    punpckhwd         m3, m4
    pmaddwd           m3, m15

    lea               r4, [r6 + 4 * r1]         ; r4 = r0 + 8*r1
    movu              xm5, [r0 + r7]            ;3 row
    vinserti32x4      m5, [r6 + r1], 1
    vinserti32x4      m5, [r6 + r7], 2
    vinserti32x4      m5, [r4 + r1], 3
    punpcklwd         m6, m4, m5
    pmaddwd           m6, m16
    punpckhwd         m4, m5
    pmaddwd           m4, m16

    paddd             m0, m6
    paddd             m1, m4

    movu              xm4, [r0 + 4 * r1]        ;4 row
    vinserti32x4      m4, [r6 + 2 * r1], 1
    vinserti32x4      m4, [r6 + 4 * r1], 2
    vinserti32x4      m4, [r4 + 2 * r1], 3
    punpcklwd         m6, m5, m4
    pmaddwd           m6, m16
    punpckhwd         m5, m4
    pmaddwd           m5, m16

    paddd             m2, m6
    paddd             m3, m5

    movu              xm11, [r6 + r1]           ;5 row
    vinserti32x4      m11, [r6 + r7], 1
    vinserti32x4      m11, [r4 + r1], 2
    vinserti32x4      m11, [r4 + r7], 3
    punpcklwd         m8, m4, m11
    pmaddwd           m8, m17
    punpckhwd         m4, m11
    pmaddwd           m4, m17

    movu              xm12, [r6 + 2 * r1]       ;6 row
    vinserti32x4      m12, [r6 + 4 * r1], 1
    vinserti32x4      m12, [r4 + 2 * r1], 2
    vinserti32x4      m12, [r4 + 4 * r1], 3
    punpcklwd         m10, m11, m12
    pmaddwd           m10, m17
    punpckhwd         m11, m12
    pmaddwd           m11, m17

    lea               r8, [r4 + 4 * r1]         ; r8 = r0 + 12*r1
    movu              xm13, [r6 + r7]           ;7 row
    vinserti32x4      m13, [r4 + r1], 1
    vinserti32x4      m13, [r4 + r7], 2
    vinserti32x4      m13, [r8 + r1], 3
    punpcklwd         m14, m12, m13
    pmaddwd           m14, m18
    punpckhwd         m12, m13
    pmaddwd           m12, m18

    paddd             m8, m14
    paddd             m4, m12
    paddd             m0, m8
    paddd             m1, m4

    movu              xm12, [r6 + 4 * r1]       ; 8 row
    vinserti32x4      m12, [r4 + 2 * r1], 1
    vinserti32x4      m12, [r4 + 4 * r1], 2
    vinserti32x4      m12, [r8 + 2 * r1], 3
    punpcklwd         m14, m13, m12
    pmaddwd           m14, m18
    punpckhwd         m13, m12
    pmaddwd           m13, m18

    paddd             m10, m14
    paddd             m11, m13
    paddd             m2, m10
    paddd             m3, m11

    psrad             m0, 6                     ; 'ss' normalization shift
    psrad             m1, 6
    psrad             m2, 6
    psrad             m3, 6

    packssdw          m0, m1
    packssdw          m2, m3

    ; scatter the 8 output rows, one 128-bit lane each
    movu              [r2], xm0
    movu              [r2 + r3], xm2
    vextracti32x4     [r2 + 2 * r3], m0, 1
    vextracti32x4     [r2 + r5], m2, 1
    lea               r2, [r2 + 4 * r3]         ; caller-visible: r2 advanced by 4 rows
    vextracti32x4     [r2], m0, 2
    vextracti32x4     [r2 + r3], m2, 2
    vextracti32x4     [r2 + 2 * r3], m0, 3
    vextracti32x4     [r2 + r5], m2, 3
%endmacro
3865
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
; 8xN vertical 'ss' entry point; %1 = height N (multiple of 8).
%macro FILTER_VER_SS_LUMA_8xN_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_vert_ss_8x%1, 5, 9, 19
    add               r1d, r1d                  ; strides are in int16 units -> bytes
    add               r3d, r3d
    lea               r7, [3 * r1]
    sub               r0, r7                    ; start 3 rows above: top of the 8-tap window
    shl               r4d, 8                    ; coeffIdx * 256 = 4 zmm (4*mmsize) per index
%ifdef PIC
    lea               r5, [pw_LumaCoeffVer_avx512]
    mova              m15, [r5 + r4]
    mova              m16, [r5 + r4 + 1 * mmsize]
    mova              m17, [r5 + r4 + 2 * mmsize]
    mova              m18, [r5 + r4 + 3 * mmsize]
%else
    lea               r5, [pw_LumaCoeffVer_avx512 + r4]
    mova              m15, [r5]
    mova              m16, [r5 + 1 * mmsize]
    mova              m17, [r5 + 2 * mmsize]
    mova              m18, [r5 + 3 * mmsize]
%endif

    lea               r5, [3 * r3]
%rep %1/8 - 1
    PROCESS_LUMA_VERT_SS_8x8_AVX512
    lea               r0, [r4]                  ; r4 = r0 + 8*r1, set inside the process macro
    lea               r2, [r2 + 4 * r3]         ; macro already advanced r2 by 4*r3; total 8 rows
%endrep
    PROCESS_LUMA_VERT_SS_8x8_AVX512
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VER_SS_LUMA_8xN_AVX512 8
    FILTER_VER_SS_LUMA_8xN_AVX512 16
    FILTER_VER_SS_LUMA_8xN_AVX512 32
%endif
3905
; Vertical 8-tap filter for one 16x4 block; %1 selects the variant:
;   sp: int16 src -> pixel dst (round-add m19, >>12, pack+permute with m20)
;   ss: int16 src -> int16 dst (>>6)
; Two output rows per zmm (one 16x16-bit row per 256-bit half).
; Caller contract: r7 = 3*r1, r5 = 3*r3(sp)/3*dstStrideBytes(ss); m15-m18 = coeff pairs.
; Clobbers r4, r6 (r6 left at r0 + 4*r1).
%macro PROCESS_LUMA_VERT_S_16x4_AVX512 1
    movu              ym1, [r0]
    movu              ym3, [r0 + r1]
    vinserti32x8      m1, [r0 + 2 * r1], 1
    vinserti32x8      m3, [r0 + r7], 1
    punpcklwd         m0, m1, m3
    pmaddwd           m0, m15
    punpckhwd         m1, m3
    pmaddwd           m1, m15

    lea               r6, [r0 + 4 * r1]
    movu              ym4, [r0 + 2 * r1]
    vinserti32x8      m4, [r6], 1
    punpcklwd         m2, m3, m4
    pmaddwd           m2, m15
    punpckhwd         m3, m4
    pmaddwd           m3, m15

    movu              ym5, [r0 + r7]
    vinserti32x8      m5, [r6 + r1], 1
    punpcklwd         m6, m4, m5
    pmaddwd           m6, m16
    punpckhwd         m4, m5
    pmaddwd           m4, m16

    paddd             m0, m6
    paddd             m1, m4

    movu              ym4, [r6]
    vinserti32x8      m4, [r6 + 2 * r1], 1
    punpcklwd         m6, m5, m4
    pmaddwd           m6, m16
    punpckhwd         m5, m4
    pmaddwd           m5, m16

    paddd             m2, m6
    paddd             m3, m5

    movu              ym11, [r6 + r1]
    vinserti32x8      m11, [r6 + r7], 1
    punpcklwd         m8, m4, m11
    pmaddwd           m8, m17
    punpckhwd         m4, m11
    pmaddwd           m4, m17

    movu              ym12, [r6 + 2 * r1]
    vinserti32x8      m12, [r6 + 4 * r1], 1
    punpcklwd         m10, m11, m12
    pmaddwd           m10, m17
    punpckhwd         m11, m12
    pmaddwd           m11, m17

    lea               r4, [r6 + 4 * r1]
    movu              ym13, [r6 + r7]
    vinserti32x8      m13, [r4 + r1], 1
    punpcklwd         m14, m12, m13
    pmaddwd           m14, m18
    punpckhwd         m12, m13
    pmaddwd           m12, m18

    paddd             m8, m14
    paddd             m4, m12
    paddd             m0, m8
    paddd             m1, m4

    movu              ym12, [r6 + 4 * r1]
    vinserti32x8      m12, [r4 + 2 * r1], 1
    punpcklwd         m14, m13, m12
    pmaddwd           m14, m18
    punpckhwd         m13, m12
    pmaddwd           m13, m18

    paddd             m10, m14
    paddd             m11, m13
    paddd             m2, m10
    paddd             m3, m11
%ifidn %1, sp
    paddd             m0, m19                   ; rounding offset before the 12-bit shift
    paddd             m1, m19
    paddd             m2, m19
    paddd             m3, m19

    psrad             m0, 12
    psrad             m1, 12
    psrad             m2, 12
    psrad             m3, 12

    packssdw          m0, m1
    packssdw          m2, m3
    packuswb          m0, m2                    ; clamp to pixel range
    vpermq            m0, m20, m0               ; undo pack lane interleave
    movu              [r2], xm0
    vextracti32x4     [r2 + r3], m0, 2
    vextracti32x4     [r2 + 2 * r3], m0, 1
    vextracti32x4     [r2 + r5], m0, 3
%else
    psrad             m0, 6
    psrad             m1, 6
    psrad             m2, 6
    psrad             m3, 6

    packssdw          m0, m1
    packssdw          m2, m3

    movu              [r2], ym0
    movu              [r2 + r3], ym2
    vextracti32x8     [r2 + 2 * r3], m0, 1
    vextracti32x8     [r2 + r5], m2, 1
%endif
%endmacro
4015
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
; 16xN vertical entry point; %1 = ss/sp variant, %2 = height N (multiple of 4).
%macro FILTER_VER_S_LUMA_16xN_AVX512 2
INIT_ZMM avx512
cglobal interp_8tap_vert_%1_16x%2, 5, 8, 21
    add               r1d, r1d                  ; src stride: int16 units -> bytes
    lea               r7, [3 * r1]
    sub               r0, r7                    ; start 3 rows above for the 8-tap window
    shl               r4d, 8                    ; coeffIdx * 4*mmsize
%ifdef PIC
    lea               r5, [pw_LumaCoeffVer_avx512]
    mova              m15, [r5 + r4]
    mova              m16, [r5 + r4 + 1 * mmsize]
    mova              m17, [r5 + r4 + 2 * mmsize]
    mova              m18, [r5 + r4 + 3 * mmsize]
%else
    lea               r5, [pw_LumaCoeffVer_avx512 + r4]
    mova              m15, [r5]
    mova              m16, [r5 + 1 * mmsize]
    mova              m17, [r5 + 2 * mmsize]
    mova              m18, [r5 + 3 * mmsize]
%endif
%ifidn %1, sp
    vbroadcasti32x4   m19, [pd_526336]          ; sp rounding constant
    mova              m20, [interp8_vsp_store_avx512]
%else
    add               r3d, r3d                  ; ss writes int16: dst stride -> bytes
%endif

    lea               r5, [3 * r3]
%rep %2/4 - 1
    PROCESS_LUMA_VERT_S_16x4_AVX512 %1
    lea               r0, [r0 + 4 * r1]
    lea               r2, [r2 + 4 * r3]
%endrep
    PROCESS_LUMA_VERT_S_16x4_AVX512 %1
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VER_S_LUMA_16xN_AVX512 ss, 4
    FILTER_VER_S_LUMA_16xN_AVX512 ss, 8
    FILTER_VER_S_LUMA_16xN_AVX512 ss, 12
    FILTER_VER_S_LUMA_16xN_AVX512 ss, 16
    FILTER_VER_S_LUMA_16xN_AVX512 ss, 32
    FILTER_VER_S_LUMA_16xN_AVX512 ss, 64
    FILTER_VER_S_LUMA_16xN_AVX512 sp, 4
    FILTER_VER_S_LUMA_16xN_AVX512 sp, 8
    FILTER_VER_S_LUMA_16xN_AVX512 sp, 12
    FILTER_VER_S_LUMA_16xN_AVX512 sp, 16
    FILTER_VER_S_LUMA_16xN_AVX512 sp, 32
    FILTER_VER_S_LUMA_16xN_AVX512 sp, 64
%endif
4069
; Vertical 8-tap 'ss' filter for one 24x8 block, composed as:
;   1) 16x4 via PROCESS_LUMA_VERT_S_16x4_AVX512 (leaves r6 = r0 + 4*r1)
;   2) the next 16x4 rows inline (writes through r9 = r2 + 4*r3)
;   3) the right 8x8 strip inline at column offset mmsize/2 = 32 bytes
; Caller contract: r7 = 3*r1, r5 = 3*r3; m15-m18 = coeff pairs.
; Side effects: r4 ends at r0 + 8*r1 (used by the caller to step src),
; r2 is advanced by 4*r3 in step 3; r6, r8, r9 clobbered.
%macro PROCESS_LUMA_VERT_SS_24x8_AVX512 0
    PROCESS_LUMA_VERT_S_16x4_AVX512 ss
    lea               r4, [r6 + 4 * r1]         ; r4 = r0 + 8*r1
    lea               r8, [r4 + 4 * r1]         ; r8 = r0 + 12*r1
    ; ---- left 16 columns, rows 4..7 ----
    movu              ym1, [r6]
    movu              ym3, [r6 + r1]
    vinserti32x8      m1, [r6 + 2 * r1], 1
    vinserti32x8      m3, [r6 + r7], 1
    punpcklwd         m0, m1, m3
    pmaddwd           m0, m15
    punpckhwd         m1, m3
    pmaddwd           m1, m15

    movu              ym4, [r6 + 2 * r1]
    vinserti32x8      m4, [r4], 1
    punpcklwd         m2, m3, m4
    pmaddwd           m2, m15
    punpckhwd         m3, m4
    pmaddwd           m3, m15

    movu              ym5, [r6 + r7]
    vinserti32x8      m5, [r4 + r1], 1
    punpcklwd         m6, m4, m5
    pmaddwd           m6, m16
    punpckhwd         m4, m5
    pmaddwd           m4, m16

    paddd             m0, m6
    paddd             m1, m4

    movu              ym4, [r4]
    vinserti32x8      m4, [r4 + 2 * r1], 1
    punpcklwd         m6, m5, m4
    pmaddwd           m6, m16
    punpckhwd         m5, m4
    pmaddwd           m5, m16

    paddd             m2, m6
    paddd             m3, m5

    movu              ym11, [r4 + r1]
    vinserti32x8      m11, [r4 + r7], 1
    punpcklwd         m8, m4, m11
    pmaddwd           m8, m17
    punpckhwd         m4, m11
    pmaddwd           m4, m17

    movu              ym12, [r4 + 2 * r1]
    vinserti32x8      m12, [r4 + 4 * r1], 1
    punpcklwd         m10, m11, m12
    pmaddwd           m10, m17
    punpckhwd         m11, m12
    pmaddwd           m11, m17

    movu              ym13, [r4 + r7]
    vinserti32x8      m13, [r8 + r1], 1
    punpcklwd         m14, m12, m13
    pmaddwd           m14, m18
    punpckhwd         m12, m13
    pmaddwd           m12, m18

    paddd             m8, m14
    paddd             m4, m12
    paddd             m0, m8
    paddd             m1, m4

    movu              ym12, [r4 + 4 * r1]
    vinserti32x8      m12, [r8 + 2 * r1], 1
    punpcklwd         m14, m13, m12
    pmaddwd           m14, m18
    punpckhwd         m13, m12
    pmaddwd           m13, m18

    paddd             m10, m14
    paddd             m11, m13
    paddd             m2, m10
    paddd             m3, m11

    psrad             m0, 6
    psrad             m1, 6
    psrad             m2, 6
    psrad             m3, 6

    packssdw          m0, m1
    packssdw          m2, m3

    lea               r9, [r2 + 4 * r3]         ; rows 4..7 of dst (r2 itself still at row 0)
    movu              [r9], ym0
    movu              [r9 + r3], ym2
    vextracti32x8     [r9 + 2 * r3], m0, 1
    vextracti32x8     [r9 + r5], m2, 1

    ; ---- right 8 columns (offset mmsize/2 bytes), rows 0..7 ----
    movu              xm1, [r0 + mmsize/2]
    vinserti32x4      m1, [r0 + 2 * r1 + mmsize/2], 1
    vinserti32x4      m1, [r0 + 4 * r1 + mmsize/2], 2
    vinserti32x4      m1, [r6 + 2 * r1 + mmsize/2], 3
    movu              xm3, [r0 + r1 + mmsize/2]
    vinserti32x4      m3, [r0 + r7 + mmsize/2], 1
    vinserti32x4      m3, [r6 + r1 + mmsize/2], 2
    vinserti32x4      m3, [r6 + r7 + mmsize/2], 3
    punpcklwd         m0, m1, m3
    pmaddwd           m0, m15
    punpckhwd         m1, m3
    pmaddwd           m1, m15

    movu              xm4, [r0 + 2 * r1 + mmsize/2]
    vinserti32x4      m4, [r0 + 4 * r1 + mmsize/2], 1
    vinserti32x4      m4, [r6 + 2 * r1 + mmsize/2], 2
    vinserti32x4      m4, [r6 + 4 * r1 + mmsize/2], 3
    punpcklwd         m2, m3, m4
    pmaddwd           m2, m15
    punpckhwd         m3, m4
    pmaddwd           m3, m15

    movu              xm5, [r0 + r7 + mmsize/2]
    vinserti32x4      m5, [r6 + r1 + mmsize/2], 1
    vinserti32x4      m5, [r6 + r7 + mmsize/2], 2
    vinserti32x4      m5, [r4 + r1 + mmsize/2], 3
    punpcklwd         m6, m4, m5
    pmaddwd           m6, m16
    punpckhwd         m4, m5
    pmaddwd           m4, m16

    paddd             m0, m6
    paddd             m1, m4

    movu              xm4, [r0 + 4 * r1 + mmsize/2]
    vinserti32x4      m4, [r6 + 2 * r1 + mmsize/2], 1
    vinserti32x4      m4, [r6 + 4 * r1 + mmsize/2], 2
    vinserti32x4      m4, [r4 + 2 * r1 + mmsize/2], 3
    punpcklwd         m6, m5, m4
    pmaddwd           m6, m16
    punpckhwd         m5, m4
    pmaddwd           m5, m16

    paddd             m2, m6
    paddd             m3, m5

    movu              xm11, [r6 + r1 + mmsize/2]
    vinserti32x4      m11, [r6 + r7 + mmsize/2], 1
    vinserti32x4      m11, [r4 + r1 + mmsize/2], 2
    vinserti32x4      m11, [r4 + r7 + mmsize/2], 3
    punpcklwd         m8, m4, m11
    pmaddwd           m8, m17
    punpckhwd         m4, m11
    pmaddwd           m4, m17

    movu              xm12, [r6 + 2 * r1 + mmsize/2]
    vinserti32x4      m12, [r6 + 4 * r1 + mmsize/2], 1
    vinserti32x4      m12, [r4 + 2 * r1 + mmsize/2], 2
    vinserti32x4      m12, [r4 + 4 * r1 + mmsize/2], 3
    punpcklwd         m10, m11, m12
    pmaddwd           m10, m17
    punpckhwd         m11, m12
    pmaddwd           m11, m17

    movu              xm13, [r6 + r7 + mmsize/2]
    vinserti32x4      m13, [r4 + r1 + mmsize/2], 1
    vinserti32x4      m13, [r4 + r7 + mmsize/2], 2
    vinserti32x4      m13, [r8 + r1 + mmsize/2], 3
    punpcklwd         m14, m12, m13
    pmaddwd           m14, m18
    punpckhwd         m12, m13
    pmaddwd           m12, m18

    paddd             m8, m14
    paddd             m4, m12
    paddd             m0, m8
    paddd             m1, m4

    movu              xm12, [r6 + 4 * r1 + mmsize/2]
    vinserti32x4      m12, [r4 + 2 * r1 + mmsize/2], 1
    vinserti32x4      m12, [r4 + 4 * r1 + mmsize/2], 2
    vinserti32x4      m12, [r8 + 2 * r1 + mmsize/2], 3
    punpcklwd         m14, m13, m12
    pmaddwd           m14, m18
    punpckhwd         m13, m12
    pmaddwd           m13, m18

    paddd             m10, m14
    paddd             m11, m13
    paddd             m2, m10
    paddd             m3, m11

    psrad             m0, 6
    psrad             m1, 6
    psrad             m2, 6
    psrad             m3, 6

    packssdw          m0, m1
    packssdw          m2, m3

    movu              [r2 + mmsize/2], xm0
    movu              [r2 + r3 + mmsize/2], xm2
    vextracti32x4     [r2 + 2 * r3 + mmsize/2], m0, 1
    vextracti32x4     [r2 + r5 + mmsize/2], m2, 1
    lea               r2, [r2 + 4 * r3]         ; caller-visible: r2 advanced by 4 rows
    vextracti32x4     [r2 + mmsize/2], m0, 2
    vextracti32x4     [r2 + r3 + mmsize/2], m2, 2
    vextracti32x4     [r2 + 2 * r3 + mmsize/2], m0, 3
    vextracti32x4     [r2 + r5 + mmsize/2], m2, 3
%endmacro
4271
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
; 24x32 vertical 'ss' entry point: 4 iterations of the 24x8 process macro.
%if ARCH_X86_64
INIT_ZMM avx512
cglobal interp_8tap_vert_ss_24x32, 5, 10, 19
    add               r1d, r1d                  ; strides: int16 units -> bytes
    add               r3d, r3d
    lea               r7, [3 * r1]
    sub               r0, r7                    ; start 3 rows above for the 8-tap window
    shl               r4d, 8                    ; coeffIdx * 4*mmsize
%ifdef PIC
    lea               r5, [pw_LumaCoeffVer_avx512]
    mova              m15, [r5 + r4]
    mova              m16, [r5 + r4 + 1 * mmsize]
    mova              m17, [r5 + r4 + 2 * mmsize]
    mova              m18, [r5 + r4 + 3 * mmsize]
%else
    lea               r5, [pw_LumaCoeffVer_avx512 + r4]
    mova              m15, [r5]
    mova              m16, [r5 + 1 * mmsize]
    mova              m17, [r5 + 2 * mmsize]
    mova              m18, [r5 + 3 * mmsize]
%endif

    lea               r5, [3 * r3]
%rep 3
    PROCESS_LUMA_VERT_SS_24x8_AVX512
    lea               r0, [r4]                  ; r4 = r0 + 8*r1, set inside the process macro
    lea               r2, [r2 + 4 * r3]         ; macro already moved r2 by 4*r3; total 8 rows
%endrep
    PROCESS_LUMA_VERT_SS_24x8_AVX512
    RET
%endif
4305
+
4306
; Vertical 8-tap filter for one 32x2 block (one full-zmm row per output row).
; %1 = sp (int16 -> pixel, +m19, >>12) or ss (int16 -> int16, >>6).
; Caller contract: r7 = 3*r1; m15-m18 = coeff pairs; r6 is clobbered (left at r0 + 4*r1).
%macro PROCESS_LUMA_VERT_S_32x2_AVX512 1
    movu              m1, [r0]                  ;0 row
    movu              m3, [r0 + r1]             ;1 row
    punpcklwd         m0, m1, m3
    pmaddwd           m0, m15
    punpckhwd         m1, m3
    pmaddwd           m1, m15

    movu              m4, [r0 + 2 * r1]         ;2 row
    punpcklwd         m2, m3, m4
    pmaddwd           m2, m15
    punpckhwd         m3, m4
    pmaddwd           m3, m15

    movu              m5, [r0 + r7]             ;3 row
    punpcklwd         m6, m4, m5
    pmaddwd           m6, m16
    punpckhwd         m4, m5
    pmaddwd           m4, m16

    paddd             m0, m6
    paddd             m1, m4

    movu              m4, [r0 + 4 * r1]         ;4 row
    punpcklwd         m6, m5, m4
    pmaddwd           m6, m16
    punpckhwd         m5, m4
    pmaddwd           m5, m16

    paddd             m2, m6
    paddd             m3, m5

    lea               r6, [r0 + 4 * r1]

    movu              m11, [r6 + r1]            ;5 row
    punpcklwd         m8, m4, m11
    pmaddwd           m8, m17
    punpckhwd         m4, m11
    pmaddwd           m4, m17

    movu              m12, [r6 + 2 * r1]        ;6 row
    punpcklwd         m10, m11, m12
    pmaddwd           m10, m17
    punpckhwd         m11, m12
    pmaddwd           m11, m17

    movu              m13, [r6 + r7]            ;7 row
    punpcklwd         m14, m12, m13
    pmaddwd           m14, m18
    punpckhwd         m12, m13
    pmaddwd           m12, m18
    paddd             m8, m14
    paddd             m4, m12
    movu              m12, [r6 + 4 * r1]        ; 8 row
    punpcklwd         m14, m13, m12
    pmaddwd           m14, m18
    punpckhwd         m13, m12
    pmaddwd           m13, m18
    paddd             m10, m14
    paddd             m11, m13

    paddd             m0, m8
    paddd             m1, m4
    paddd             m2, m10
    paddd             m3, m11
%ifidn %1, sp
    paddd             m0, m19                   ; rounding offset before the 12-bit shift
    paddd             m1, m19
    paddd             m2, m19
    paddd             m3, m19

    psrad             m0, 12
    psrad             m1, 12
    psrad             m2, 12
    psrad             m3, 12

    packssdw          m0, m1
    packssdw          m2, m3
    packuswb          m0, m2                    ; clamp to pixel range
    vpermq            m0, m20, m0               ; undo pack lane interleave
    movu              [r2], ym0
    vextracti32x8     [r2 + r3], m0, 1
%else
    psrad             m0, 6
    psrad             m1, 6
    psrad             m2, 6
    psrad             m3, 6

    packssdw          m0, m1
    packssdw          m2, m3
    movu              [r2], m0
    movu              [r2 + r3], m2
%endif
%endmacro
4400
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
; 32xN vertical entry point; %1 = ss/sp variant, %2 = height N (multiple of 2).
%macro FILTER_VER_S_LUMA_32xN_AVX512 2
INIT_ZMM avx512
cglobal interp_8tap_vert_%1_32x%2, 5, 8, 21
    add               r1d, r1d                  ; src stride: int16 units -> bytes
    lea               r7, [3 * r1]
    sub               r0, r7                    ; start 3 rows above for the 8-tap window
    shl               r4d, 8                    ; coeffIdx * 4*mmsize
%ifdef PIC
    lea               r5, [pw_LumaCoeffVer_avx512]
    mova              m15, [r5 + r4]
    mova              m16, [r5 + r4 + 1 * mmsize]
    mova              m17, [r5 + r4 + 2 * mmsize]
    mova              m18, [r5 + r4 + 3 * mmsize]
%else
    lea               r5, [pw_LumaCoeffVer_avx512 + r4]
    mova              m15, [r5]
    mova              m16, [r5 + 1 * mmsize]
    mova              m17, [r5 + 2 * mmsize]
    mova              m18, [r5 + 3 * mmsize]
%endif
%ifidn %1, sp
    vbroadcasti32x4   m19, [pd_526336]          ; sp rounding constant
    mova              m20, [interp8_vsp_store_avx512]
%else
    add               r3d, r3d                  ; ss writes int16: dst stride -> bytes
%endif

%rep %2/2 - 1
    PROCESS_LUMA_VERT_S_32x2_AVX512 %1
    lea               r0, [r0 + 2 * r1]
    lea               r2, [r2 + 2 * r3]
%endrep
    PROCESS_LUMA_VERT_S_32x2_AVX512 %1
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VER_S_LUMA_32xN_AVX512 ss, 8
    FILTER_VER_S_LUMA_32xN_AVX512 ss, 16
    FILTER_VER_S_LUMA_32xN_AVX512 ss, 32
    FILTER_VER_S_LUMA_32xN_AVX512 ss, 24
    FILTER_VER_S_LUMA_32xN_AVX512 ss, 64
    FILTER_VER_S_LUMA_32xN_AVX512 sp, 8
    FILTER_VER_S_LUMA_32xN_AVX512 sp, 16
    FILTER_VER_S_LUMA_32xN_AVX512 sp, 32
    FILTER_VER_S_LUMA_32xN_AVX512 sp, 24
    FILTER_VER_S_LUMA_32xN_AVX512 sp, 64
%endif
4451
+
4452
; Vertical 8-tap filter for one 48x4 block, composed as:
;   1) left 32 columns, rows 0..1 via PROCESS_LUMA_VERT_S_32x2_AVX512 (leaves r6 = r0 + 4*r1)
;   2) left 32 columns, rows 2..3 inline
;   3) right 16 columns (offset mmsize = 64 bytes), rows 0..3 inline (two rows per zmm)
; %1 = ss/sp. Caller contract: r7 = 3*r1, r5 = 3*r3; m15-m18 = coeff pairs
; (sp also needs m19/m20). Clobbers r4, r6.
%macro PROCESS_LUMA_VERT_S_48x4_AVX512 1
    PROCESS_LUMA_VERT_S_32x2_AVX512 %1
    ; ---- left 32 columns, rows 2..3 ----
    movu              m1, [r0 + 2 * r1]
    movu              m3, [r0 + r7]
    punpcklwd         m0, m1, m3
    pmaddwd           m0, m15
    punpckhwd         m1, m3
    pmaddwd           m1, m15

    movu              m4, [r0 + 4 * r1]
    punpcklwd         m2, m3, m4
    pmaddwd           m2, m15
    punpckhwd         m3, m4
    pmaddwd           m3, m15

    movu              m5, [r6 + r1]
    punpcklwd         m6, m4, m5
    pmaddwd           m6, m16
    punpckhwd         m4, m5
    pmaddwd           m4, m16

    paddd             m0, m6
    paddd             m1, m4

    lea               r4, [r6 + 4 * r1]         ; r4 = r0 + 8*r1

    movu              m4, [r6 + 2 * r1]
    punpcklwd         m6, m5, m4
    pmaddwd           m6, m16
    punpckhwd         m5, m4
    pmaddwd           m5, m16

    paddd             m2, m6
    paddd             m3, m5

    movu              m11, [r6 + r7]
    punpcklwd         m8, m4, m11
    pmaddwd           m8, m17
    punpckhwd         m4, m11
    pmaddwd           m4, m17

    movu              m12, [r4]
    punpcklwd         m10, m11, m12
    pmaddwd           m10, m17
    punpckhwd         m11, m12
    pmaddwd           m11, m17

    movu              m13, [r4 + r1]
    punpcklwd         m14, m12, m13
    pmaddwd           m14, m18
    punpckhwd         m12, m13
    pmaddwd           m12, m18
    paddd             m8, m14
    paddd             m4, m12
    movu              m12, [r4 + 2 * r1]
    punpcklwd         m14, m13, m12
    pmaddwd           m14, m18
    punpckhwd         m13, m12
    pmaddwd           m13, m18
    paddd             m10, m14
    paddd             m11, m13

    paddd             m0, m8
    paddd             m1, m4
    paddd             m2, m10
    paddd             m3, m11
%ifidn %1, sp
    paddd             m0, m19
    paddd             m1, m19
    paddd             m2, m19
    paddd             m3, m19

    psrad             m0, 12
    psrad             m1, 12
    psrad             m2, 12
    psrad             m3, 12

    packssdw          m0, m1
    packssdw          m2, m3
    packuswb          m0, m2
    vpermq            m0, m20, m0
    movu              [r2 + 2 * r3], ym0
    vextracti32x8     [r2 + r5], m0, 1
%else
    psrad             m0, 6
    psrad             m1, 6
    psrad             m2, 6
    psrad             m3, 6

    packssdw          m0, m1
    packssdw          m2, m3
    movu              [r2 + 2 * r3], m0
    movu              [r2 + r5], m2
%endif
    ; ---- right 16 columns (byte offset mmsize), rows 0..3 ----
    movu              ym1, [r0 + mmsize]
    movu              ym3, [r0 + r1 + mmsize]
    vinserti32x8      m1, [r0 + 2 * r1 + mmsize], 1
    vinserti32x8      m3, [r0 + r7 + mmsize], 1
    punpcklwd         m0, m1, m3
    pmaddwd           m0, m15
    punpckhwd         m1, m3
    pmaddwd           m1, m15

    movu              ym4, [r0 + 2 * r1 + mmsize]
    vinserti32x8      m4, [r6 + mmsize], 1
    punpcklwd         m2, m3, m4
    pmaddwd           m2, m15
    punpckhwd         m3, m4
    pmaddwd           m3, m15

    movu              ym5, [r0 + r7 + mmsize]
    vinserti32x8      m5, [r6 + r1 + mmsize], 1
    punpcklwd         m6, m4, m5
    pmaddwd           m6, m16
    punpckhwd         m4, m5
    pmaddwd           m4, m16

    paddd             m0, m6
    paddd             m1, m4

    movu              ym4, [r6 + mmsize]
    vinserti32x8      m4, [r6 + 2 * r1 + mmsize], 1
    punpcklwd         m6, m5, m4
    pmaddwd           m6, m16
    punpckhwd         m5, m4
    pmaddwd           m5, m16

    paddd             m2, m6
    paddd             m3, m5

    movu              ym11, [r6 + r1 + mmsize]
    vinserti32x8      m11, [r6 + r7 + mmsize], 1
    punpcklwd         m8, m4, m11
    pmaddwd           m8, m17
    punpckhwd         m4, m11
    pmaddwd           m4, m17

    movu              ym12, [r6 + 2 * r1 + mmsize]
    vinserti32x8      m12, [r6 + 4 * r1 + mmsize], 1
    punpcklwd         m10, m11, m12
    pmaddwd           m10, m17
    punpckhwd         m11, m12
    pmaddwd           m11, m17

    movu              ym13, [r6 + r7 + mmsize]
    vinserti32x8      m13, [r4 + r1 + mmsize], 1
    punpcklwd         m14, m12, m13
    pmaddwd           m14, m18
    punpckhwd         m12, m13
    pmaddwd           m12, m18
    paddd             m8, m14
    paddd             m4, m12
    movu              ym12, [r6 + 4 * r1 + mmsize]
    vinserti32x8      m12, [r4 + 2 * r1 + mmsize], 1
    punpcklwd         m14, m13, m12
    pmaddwd           m14, m18
    punpckhwd         m13, m12
    pmaddwd           m13, m18
    paddd             m10, m14
    paddd             m11, m13

    paddd             m0, m8
    paddd             m1, m4
    paddd             m2, m10
    paddd             m3, m11
%ifidn %1, sp
    paddd             m0, m19
    paddd             m1, m19
    paddd             m2, m19
    paddd             m3, m19

    psrad             m0, 12
    psrad             m1, 12
    psrad             m2, 12
    psrad             m3, 12

    packssdw          m0, m1
    packssdw          m2, m3
    packuswb          m0, m2
    vpermq            m0, m20, m0
    movu              [r2 + mmsize/2], xm0      ; pixel output: 16 bytes per row at column 32
    vextracti32x4     [r2 + r3 + mmsize/2], m0, 2
    vextracti32x4     [r2 + 2 * r3 + mmsize/2], m0, 1
    vextracti32x4     [r2 + r5 + mmsize/2], m0, 3
%else
    psrad             m0, 6
    psrad             m1, 6
    psrad             m2, 6
    psrad             m3, 6

    packssdw          m0, m1
    packssdw          m2, m3

    movu              [r2 + mmsize], ym0        ; int16 output: 32 bytes per row at byte offset 64
    movu              [r2 + r3 + mmsize], ym2
    vextracti32x8     [r2 + 2 * r3 + mmsize], m0, 1
    vextracti32x8     [r2 + r5 + mmsize], m2, 1
%endif
%endmacro
4651
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
; 48x64 vertical entry point; %1 = ss/sp. 16 iterations of the 48x4 process macro.
; (Original header comment said "interp_4tap_vert"; this is the luma 8-tap path.)
%macro FILTER_VER_S_LUMA_48x64_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_vert_%1_48x64, 5, 8, 21
    add               r1d, r1d                  ; src stride: int16 units -> bytes
    lea               r7, [3 * r1]
    sub               r0, r7                    ; start 3 rows above for the 8-tap window
    shl               r4d, 8                    ; coeffIdx * 4*mmsize
%ifdef PIC
    lea               r5, [pw_LumaCoeffVer_avx512]
    mova              m15, [r5 + r4]
    mova              m16, [r5 + r4 + 1 * mmsize]
    mova              m17, [r5 + r4 + 2 * mmsize]
    mova              m18, [r5 + r4 + 3 * mmsize]
%else
    lea               r5, [pw_LumaCoeffVer_avx512 + r4]
    mova              m15, [r5]
    mova              m16, [r5 + 1 * mmsize]
    mova              m17, [r5 + 2 * mmsize]
    mova              m18, [r5 + 3 * mmsize]
%endif
%ifidn %1, sp
    vbroadcasti32x4   m19, [pd_526336]          ; sp rounding constant
    mova              m20, [interp8_vsp_store_avx512]
%else
    add               r3d, r3d                  ; ss writes int16: dst stride -> bytes
%endif

    lea               r5, [3 * r3]
%rep 15
    PROCESS_LUMA_VERT_S_48x4_AVX512 %1
    lea               r0, [r0 + 4 * r1]
    lea               r2, [r2 + 4 * r3]
%endrep
    PROCESS_LUMA_VERT_S_48x4_AVX512 %1
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VER_S_LUMA_48x64_AVX512 ss
    FILTER_VER_S_LUMA_48x64_AVX512 sp
%endif
4695
+
4696
; Vertical 8-tap filter for one 64x2 block:
; left 32 columns via PROCESS_LUMA_VERT_S_32x2_AVX512 (leaves r6 = r0 + 4*r1),
; right 32 columns inline at byte offset mmsize.
; %1 = ss/sp. Caller contract: r7 = 3*r1; m15-m18 = coeff pairs (sp also m19/m20).
%macro PROCESS_LUMA_VERT_S_64x2_AVX512 1
    PROCESS_LUMA_VERT_S_32x2_AVX512 %1
    ; ---- right 32 columns (byte offset mmsize) ----
    movu              m1, [r0 + mmsize]         ;0 row
    movu              m3, [r0 + r1 + mmsize]    ;1 row
    punpcklwd         m0, m1, m3
    pmaddwd           m0, m15
    punpckhwd         m1, m3
    pmaddwd           m1, m15

    movu              m4, [r0 + 2 * r1 + mmsize] ;2 row
    punpcklwd         m2, m3, m4
    pmaddwd           m2, m15
    punpckhwd         m3, m4
    pmaddwd           m3, m15

    movu              m5, [r0 + r7 + mmsize]    ;3 row
    punpcklwd         m6, m4, m5
    pmaddwd           m6, m16
    punpckhwd         m4, m5
    pmaddwd           m4, m16

    paddd             m0, m6
    paddd             m1, m4

    movu              m4, [r0 + 4 * r1 + mmsize] ;4 row
    punpcklwd         m6, m5, m4
    pmaddwd           m6, m16
    punpckhwd         m5, m4
    pmaddwd           m5, m16

    paddd             m2, m6
    paddd             m3, m5

    movu              m11, [r6 + r1 + mmsize]   ;5 row
    punpcklwd         m8, m4, m11
    pmaddwd           m8, m17
    punpckhwd         m4, m11
    pmaddwd           m4, m17

    movu              m12, [r6 + 2 * r1 + mmsize] ;6 row
    punpcklwd         m10, m11, m12
    pmaddwd           m10, m17
    punpckhwd         m11, m12
    pmaddwd           m11, m17

    movu              m13, [r6 + r7 + mmsize]   ;7 row
    punpcklwd         m14, m12, m13
    pmaddwd           m14, m18
    punpckhwd         m12, m13
    pmaddwd           m12, m18
    paddd             m8, m14
    paddd             m4, m12
    movu              m12, [r6 + 4 * r1 + mmsize] ; 8 row
    punpcklwd         m14, m13, m12
    pmaddwd           m14, m18
    punpckhwd         m13, m12
    pmaddwd           m13, m18
    paddd             m10, m14
    paddd             m11, m13

    paddd             m0, m8
    paddd             m1, m4
    paddd             m2, m10
    paddd             m3, m11
%ifidn %1, sp
    paddd             m0, m19
    paddd             m1, m19
    paddd             m2, m19
    paddd             m3, m19

    psrad             m0, 12
    psrad             m1, 12
    psrad             m2, 12
    psrad             m3, 12

    packssdw          m0, m1
    packssdw          m2, m3
    packuswb          m0, m2
    vpermq            m0, m20, m0
    movu              [r2 + mmsize/2], ym0      ; pixel output: 32 bytes per row at column 32
    vextracti32x8     [r2 + r3 + mmsize/2], m0, 1
%else
    psrad             m0, 6
    psrad             m1, 6
    psrad             m2, 6
    psrad             m3, 6
    packssdw          m0, m1
    packssdw          m2, m3
    movu              [r2 + mmsize], m0         ; int16 output: 64 bytes per row at byte offset 64
    movu              [r2 + r3 + mmsize], m2
%endif
%endmacro
4788
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
; 64xN vertical entry point; %1 = ss/sp variant, %2 = height N (multiple of 2).
%macro FILTER_VER_S_LUMA_64xN_AVX512 2
INIT_ZMM avx512
cglobal interp_8tap_vert_%1_64x%2, 5, 8, 21
    add               r1d, r1d                  ; src stride: int16 units -> bytes
    lea               r7, [3 * r1]
    sub               r0, r7                    ; start 3 rows above for the 8-tap window
    shl               r4d, 8                    ; coeffIdx * 4*mmsize
%ifdef PIC
    lea               r5, [pw_LumaCoeffVer_avx512]
    mova              m15, [r5 + r4]
    mova              m16, [r5 + r4 + 1 * mmsize]
    mova              m17, [r5 + r4 + 2 * mmsize]
    mova              m18, [r5 + r4 + 3 * mmsize]
%else
    lea               r5, [pw_LumaCoeffVer_avx512 + r4]
    mova              m15, [r5]
    mova              m16, [r5 + 1 * mmsize]
    mova              m17, [r5 + 2 * mmsize]
    mova              m18, [r5 + 3 * mmsize]
%endif
%ifidn %1, sp
    vbroadcasti32x4   m19, [pd_526336]          ; sp rounding constant
    mova              m20, [interp8_vsp_store_avx512]
%else
    add               r3d, r3d                  ; ss writes int16: dst stride -> bytes
%endif

%rep %2/2 - 1
    PROCESS_LUMA_VERT_S_64x2_AVX512 %1
    lea               r0, [r0 + 2 * r1]
    lea               r2, [r2 + 2 * r3]
%endrep
    PROCESS_LUMA_VERT_S_64x2_AVX512 %1
    RET
%endmacro

%if ARCH_X86_64
    FILTER_VER_S_LUMA_64xN_AVX512 ss, 16
    FILTER_VER_S_LUMA_64xN_AVX512 ss, 32
    FILTER_VER_S_LUMA_64xN_AVX512 ss, 48
    FILTER_VER_S_LUMA_64xN_AVX512 ss, 64
    FILTER_VER_S_LUMA_64xN_AVX512 sp, 16
    FILTER_VER_S_LUMA_64xN_AVX512 sp, 32
    FILTER_VER_S_LUMA_64xN_AVX512 sp, 48
    FILTER_VER_S_LUMA_64xN_AVX512 sp, 64
%endif
4837
+;-------------------------------------------------------------------------------------------------------------
4838
+;avx512 luma_vss code end
4839
+;-------------------------------------------------------------------------------------------------------------
4840
+;-------------------------------------------------------------------------------------------------------------
4841
+;avx512 luma_vpp and luma_vps code start
4842
+;-------------------------------------------------------------------------------------------------------------
4843
+%macro PROCESS_LUMA_VERT_16x8_AVX512 1
4844
+ lea r5, [r0 + 4 * r1]
4845
+ lea r4, [r5 + 4 * r1]
4846
+ movu xm1, [r0]
4847
+ vinserti32x4 m1, [r0 + 2 * r1], 1
4848
+ vinserti32x4 m1, [r5], 2
4849
+ vinserti32x4 m1, [r5 + 2 * r1], 3
4850
+ movu xm3, [r0 + r1]
4851
+ vinserti32x4 m3, [r0 + r6], 1
4852
+ vinserti32x4 m3, [r5 + r1], 2
4853
+ vinserti32x4 m3, [r5 + r6], 3
4854
+ punpcklbw m0, m1, m3
4855
+ pmaddubsw m0, m8
4856
+ punpckhbw m1, m3
4857
+ pmaddubsw m1, m8
4858
+
4859
+ movu xm4, [r0 + 2 * r1]
4860
+ vinserti32x4 m4, [r0 + 4 * r1], 1
4861
+ vinserti32x4 m4, [r5 + 2 * r1], 2
4862
+ vinserti32x4 m4, [r5 + 4 * r1], 3
4863
+ punpcklbw m2, m3, m4
4864
+ pmaddubsw m2, m8
4865
+ punpckhbw m3, m4
4866
+ pmaddubsw m3, m8
4867
+
4868
+ movu xm5, [r0 + r6]
4869
+ vinserti32x4 m5, [r5 + r1], 1
4870
+ vinserti32x4 m5, [r5 + r6], 2
4871
+ vinserti32x4 m5, [r4 + r1], 3
4872
+ punpcklbw m6, m4, m5
4873
+ pmaddubsw m6, m9
4874
+ punpckhbw m4, m5
4875
+ pmaddubsw m4, m9
4876
+
4877
+ paddw m0, m6
4878
+ paddw m1, m4
4879
+
4880
+ movu xm4, [r0 + 4 * r1]
4881
+ vinserti32x4 m4, [r5 + 2 * r1], 1
4882
+ vinserti32x4 m4, [r5 + 4 * r1], 2
4883
+ vinserti32x4 m4, [r4 + 2 * r1], 3
4884
+ punpcklbw m6, m5, m4
4885
+ pmaddubsw m6, m9
4886
+ punpckhbw m5, m4
4887
+ pmaddubsw m5, m9
4888
+
4889
+ paddw m2, m6
4890
+ paddw m3, m5
4891
+
4892
+ movu xm15, [r5 + r1]
4893
+ vinserti32x4 m15, [r5 + r6], 1
4894
+ vinserti32x4 m15, [r4 + r1], 2
4895
+ vinserti32x4 m15, [r4 + r6], 3
4896
+ punpcklbw m12, m4, m15
4897
+ pmaddubsw m12, m10
4898
+ punpckhbw m13, m4, m15
4899
+ pmaddubsw m13, m10
4900
+
4901
+ lea r8, [r4 + 4 * r1]
4902
+ movu xm4, [r5 + 2 * r1]
4903
+ vinserti32x4 m4, [r5 + 4 * r1], 1
4904
+ vinserti32x4 m4, [r4 + 2 * r1], 2
4905
+ vinserti32x4 m4, [r4 + 4 * r1], 3
4906
+ punpcklbw m14, m15, m4
4907
+ pmaddubsw m14, m10
4908
+ punpckhbw m15, m4
4909
+ pmaddubsw m15, m10
4910
+
4911
+ movu xm5, [r5 + r6]
4912
+ vinserti32x4 m5, [r4 + r1], 1
4913
+ vinserti32x4 m5, [r4 + r6], 2
4914
+ vinserti32x4 m5, [r8 + r1], 3
4915
+ punpcklbw m6, m4, m5
4916
+ pmaddubsw m6, m11
4917
+ punpckhbw m4, m5
4918
+ pmaddubsw m4, m11
4919
+
4920
+ paddw m12, m6
4921
+ paddw m13, m4
4922
+
4923
+ movu xm4, [r5 + 4 * r1]
4924
+ vinserti32x4 m4, [r4 + 2 * r1], 1
4925
+ vinserti32x4 m4, [r4 + 4 * r1], 2
4926
+ vinserti32x4 m4, [r8 + 2 * r1], 3
4927
+ punpcklbw m6, m5, m4
4928
+ pmaddubsw m6, m11
4929
+ punpckhbw m5, m4
4930
+ pmaddubsw m5, m11
4931
+
4932
+ paddw m14, m6
4933
+ paddw m15, m5
4934
+
4935
+ paddw m0, m12
4936
+ paddw m1, m13
4937
+ paddw m2, m14
4938
+ paddw m3, m15
4939
+%ifidn %1,pp
4940
+ pmulhrsw m0, m7
4941
+ pmulhrsw m1, m7
4942
+ pmulhrsw m2, m7
4943
+ pmulhrsw m3, m7
4944
+
4945
+ packuswb m0, m1
4946
+ packuswb m2, m3
4947
+ movu [r2], xm0
4948
+ movu [r2 + r3], xm2
4949
+ vextracti32x4 [r2 + 2 * r3], m0, 1
4950
+ vextracti32x4 [r2 + r7], m2, 1
4951
+ lea r2, [r2 + 4 * r3]
4952
+ vextracti32x4 [r2], m0, 2
4953
+ vextracti32x4 [r2 + r3], m2, 2
4954
+ vextracti32x4 [r2 + 2 * r3], m0, 3
4955
+ vextracti32x4 [r2 + r7], m2, 3
4956
+%else
4957
+ psubw m0, m7
4958
+ psubw m1, m7
4959
+ mova m12, m16
4960
+ mova m13, m17
4961
+ vpermi2q m12, m0, m1
4962
+ vpermi2q m13, m0, m1
4963
+ movu [r2], ym12
4964
+ vextracti32x8 [r2 + 2 * r3], m12, 1
4965
+
4966
+ psubw m2, m7
4967
+ psubw m3, m7
4968
+ mova m14, m16
4969
+ mova m15, m17
4970
+ vpermi2q m14, m2, m3
4971
+ vpermi2q m15, m2, m3
4972
+ movu [r2 + r3], ym14
4973
+ vextracti32x8 [r2 + r7], m14, 1
4974
+ lea r2, [r2 + 4 * r3]
4975
+
4976
+ movu [r2], ym13
4977
+ movu [r2 + r3], ym15
4978
+ vextracti32x8 [r2 + 2 * r3], m13, 1
4979
+ vextracti32x8 [r2 + r7], m15, 1
4980
+%endif
4981
+%endmacro
4982
+;-----------------------------------------------------------------------------------------------------------------
4983
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
4984
+;-----------------------------------------------------------------------------------------------------------------
4985
+%macro FILTER_VERT_LUMA_16xN_AVX512 2
4986
+INIT_ZMM avx512
4987
+cglobal interp_8tap_vert_%1_16x%2, 5, 9, 18
4988
+ mov r4d, r4m
4989
+ shl r4d, 8
4990
+%ifdef PIC
4991
+ lea r5, [tab_LumaCoeffVer_32_avx512]
4992
+ mova m8, [r5 + r4]
4993
+ mova m9, [r5 + r4 + 1 * mmsize]
4994
+ mova m10, [r5 + r4 + 2 * mmsize]
4995
+ mova m11, [r5 + r4 + 3 * mmsize]
4996
+%else
4997
+ mova m8, [tab_LumaCoeffVer_32_avx512 + r4]
4998
+ mova m9, [tab_LumaCoeffVer_32_avx512 + r4 + 1 * mmsize]
4999
+ mova m10, [tab_LumaCoeffVer_32_avx512 + r4 + 2 * mmsize]
5000
+ mova m11, [tab_LumaCoeffVer_32_avx512 + r4 + 3 * mmsize]
5001
+%endif
5002
+%ifidn %1, pp
5003
+ vbroadcasti32x8 m7, [pw_512]
5004
+%else
5005
+ shl r3d, 1
5006
+ vbroadcasti32x8 m7, [pw_2000]
5007
+ mova m16, [interp4_vps_store1_avx512]
5008
+ mova m17, [interp4_vps_store2_avx512]
5009
+%endif
5010
+
5011
+ lea r6, [3 * r1]
5012
+ lea r7, [3 * r3]
5013
+ sub r0, r6
5014
+
5015
+%rep %2/8 - 1
5016
+ PROCESS_LUMA_VERT_16x8_AVX512 %1
5017
+ lea r0, [r4]
5018
+ lea r2, [r2 + 4 * r3]
5019
+%endrep
5020
+ PROCESS_LUMA_VERT_16x8_AVX512 %1
5021
+ RET
5022
+%endmacro
5023
+
5024
+%if ARCH_X86_64
5025
+ FILTER_VERT_LUMA_16xN_AVX512 pp, 8
5026
+ FILTER_VERT_LUMA_16xN_AVX512 pp, 16
5027
+ FILTER_VERT_LUMA_16xN_AVX512 pp, 32
5028
+ FILTER_VERT_LUMA_16xN_AVX512 pp, 64
5029
+
5030
+ FILTER_VERT_LUMA_16xN_AVX512 ps, 8
5031
+ FILTER_VERT_LUMA_16xN_AVX512 ps, 16
5032
+ FILTER_VERT_LUMA_16xN_AVX512 ps, 32
5033
+ FILTER_VERT_LUMA_16xN_AVX512 ps, 64
5034
+%endif
5035
+%macro PROCESS_LUMA_VERT_32x4_AVX512 1
5036
+ lea r5, [r0 + 4 * r1]
5037
+ movu ym1, [r0]
5038
+ vinserti32x8 m1, [r0 + 2 * r1], 1
5039
+ movu ym3, [r0 + r1]
5040
+ vinserti32x8 m3, [r0 + r6], 1
5041
+ punpcklbw m0, m1, m3
5042
+ pmaddubsw m0, m8
5043
+ punpckhbw m1, m3
5044
+ pmaddubsw m1, m8
5045
+
5046
+ movu ym4, [r0 + 2 * r1]
5047
+ vinserti32x8 m4, [r0 + 4 * r1], 1
5048
+ punpcklbw m2, m3, m4
5049
+ pmaddubsw m2, m8
5050
+ punpckhbw m3, m4
5051
+ pmaddubsw m3, m8
5052
+
5053
+ movu ym5, [r0 + r6]
5054
+ vinserti32x8 m5, [r5 + r1], 1
5055
+ punpcklbw m6, m4, m5
5056
+ pmaddubsw m6, m9
5057
+ punpckhbw m4, m5
5058
+ pmaddubsw m4, m9
5059
+
5060
+ paddw m0, m6
5061
+ paddw m1, m4
5062
+
5063
+ movu ym4, [r0 + 4 * r1]
5064
+ vinserti32x8 m4, [r5 + 2 * r1], 1
5065
+ punpcklbw m6, m5, m4
5066
+ pmaddubsw m6, m9
5067
+ punpckhbw m5, m4
5068
+ pmaddubsw m5, m9
5069
+
5070
+ paddw m2, m6
5071
+ paddw m3, m5
5072
+
5073
+ lea r4, [r5 + 4 * r1]
5074
+ movu ym15, [r5 + r1]
5075
+ vinserti32x8 m15, [r5 + r6], 1
5076
+ punpcklbw m12, m4, m15
5077
+ pmaddubsw m12, m10
5078
+ punpckhbw m13, m4, m15
5079
+ pmaddubsw m13, m10
5080
+
5081
+ movu ym4, [r5 + 2 * r1]
5082
+ vinserti32x8 m4, [r5 + 4 * r1], 1
5083
+ punpcklbw m14, m15, m4
5084
+ pmaddubsw m14, m10
5085
+ punpckhbw m15, m4
5086
+ pmaddubsw m15, m10
5087
+
5088
+ movu ym5, [r5 + r6]
5089
+ vinserti32x8 m5, [r4 + r1], 1
5090
+ punpcklbw m6, m4, m5
5091
+ pmaddubsw m6, m11
5092
+ punpckhbw m4, m5
5093
+ pmaddubsw m4, m11
5094
+
5095
+ paddw m12, m6
5096
+ paddw m13, m4
5097
+
5098
+ movu ym4, [r5 + 4 * r1]
5099
+ vinserti32x8 m4, [r4 + 2 * r1], 1
5100
+ punpcklbw m6, m5, m4
5101
+ pmaddubsw m6, m11
5102
+ punpckhbw m5, m4
5103
+ pmaddubsw m5, m11
5104
+
5105
+ paddw m14, m6
5106
+ paddw m15, m5
5107
+
5108
+ paddw m0, m12
5109
+ paddw m1, m13
5110
+ paddw m2, m14
5111
+ paddw m3, m15
5112
+%ifidn %1,pp
5113
+ pmulhrsw m0, m7
5114
+ pmulhrsw m1, m7
5115
+ pmulhrsw m2, m7
5116
+ pmulhrsw m3, m7
5117
+
5118
+ packuswb m0, m1
5119
+ packuswb m2, m3
5120
+ movu [r2], ym0
5121
+ movu [r2 + r3], ym2
5122
+ vextracti32x8 [r2 + 2 * r3], m0, 1
5123
+ vextracti32x8 [r2 + r7], m2, 1
5124
+%else
5125
+ psubw m0, m7
5126
+ psubw m1, m7
5127
+ mova m12, m16
5128
+ mova m13, m17
5129
+ vpermi2q m12, m0, m1
5130
+ vpermi2q m13, m0, m1
5131
+ movu [r2], m12
5132
+ movu [r2 + 2 * r3], m13
5133
+
5134
+ psubw m2, m7
5135
+ psubw m3, m7
5136
+ mova m14, m16
5137
+ mova m15, m17
5138
+ vpermi2q m14, m2, m3
5139
+ vpermi2q m15, m2, m3
5140
+ movu [r2 + r3], m14
5141
+ movu [r2 + r7], m15
5142
+%endif
5143
+%endmacro
5144
+;-----------------------------------------------------------------------------------------------------------------
5145
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
5146
+;-----------------------------------------------------------------------------------------------------------------
5147
+%macro FILTER_VERT_LUMA_32xN_AVX512 2
5148
+INIT_ZMM avx512
5149
+cglobal interp_8tap_vert_%1_32x%2, 5, 8, 18
5150
+ mov r4d, r4m
5151
+ shl r4d, 8
5152
+%ifdef PIC
5153
+ lea r5, [tab_LumaCoeffVer_32_avx512]
5154
+ mova m8, [r5 + r4]
5155
+ mova m9, [r5 + r4 + 1 * mmsize]
5156
+ mova m10, [r5 + r4 + 2 * mmsize]
5157
+ mova m11, [r5 + r4 + 3 * mmsize]
5158
+%else
5159
+ mova m8, [tab_LumaCoeffVer_32_avx512 + r4]
5160
+ mova m9, [tab_LumaCoeffVer_32_avx512 + r4 + 1 * mmsize]
5161
+ mova m10, [tab_LumaCoeffVer_32_avx512 + r4 + 2 * mmsize]
5162
+ mova m11, [tab_LumaCoeffVer_32_avx512 + r4 + 3 * mmsize]
5163
+%endif
5164
+%ifidn %1, pp
5165
+ vbroadcasti32x8 m7, [pw_512]
5166
+%else
5167
+ shl r3d, 1
5168
+ vbroadcasti32x8 m7, [pw_2000]
5169
+ mova m16, [interp4_vps_store1_avx512]
5170
+ mova m17, [interp4_vps_store2_avx512]
5171
+%endif
5172
+
5173
+ lea r6, [3 * r1]
5174
+ lea r7, [3 * r3]
5175
+ sub r0, r6
5176
+
5177
+%rep %2/4 - 1
5178
+ PROCESS_LUMA_VERT_32x4_AVX512 %1
5179
+ lea r0, [r0 + 4 * r1]
5180
+ lea r2, [r2 + 4 * r3]
5181
+%endrep
5182
+ PROCESS_LUMA_VERT_32x4_AVX512 %1
5183
+ RET
5184
+%endmacro
5185
+
5186
+%if ARCH_X86_64
5187
+ FILTER_VERT_LUMA_32xN_AVX512 pp, 8
5188
+ FILTER_VERT_LUMA_32xN_AVX512 pp, 16
5189
+ FILTER_VERT_LUMA_32xN_AVX512 pp, 24
5190
+ FILTER_VERT_LUMA_32xN_AVX512 pp, 32
5191
+ FILTER_VERT_LUMA_32xN_AVX512 pp, 64
5192
+
5193
+ FILTER_VERT_LUMA_32xN_AVX512 ps, 8
5194
+ FILTER_VERT_LUMA_32xN_AVX512 ps, 16
5195
+ FILTER_VERT_LUMA_32xN_AVX512 ps, 24
5196
+ FILTER_VERT_LUMA_32xN_AVX512 ps, 32
5197
+ FILTER_VERT_LUMA_32xN_AVX512 ps, 64
5198
+%endif
5199
+%macro PROCESS_LUMA_VERT_48x8_AVX512 1
5200
+%ifidn %1, pp
5201
+ PROCESS_LUMA_VERT_32x4_AVX512 pp
5202
+%else
5203
+ PROCESS_LUMA_VERT_32x4_AVX512 ps
5204
+%endif
5205
+ lea r8, [r4 + 4 * r1]
5206
+ lea r9, [r2 + 4 * r3]
5207
+ movu ym1, [r5]
5208
+ vinserti32x8 m1, [r5 + 2 * r1], 1
5209
+ movu ym3, [r5 + r1]
5210
+ vinserti32x8 m3, [r5 + r6], 1
5211
+ punpcklbw m0, m1, m3
5212
+ pmaddubsw m0, m8
5213
+ punpckhbw m1, m3
5214
+ pmaddubsw m1, m8
5215
+
5216
+ movu ym4, [r5 + 2 * r1]
5217
+ vinserti32x8 m4, [r5 + 4 * r1], 1
5218
+ punpcklbw m2, m3, m4
5219
+ pmaddubsw m2, m8
5220
+ punpckhbw m3, m4
5221
+ pmaddubsw m3, m8
5222
+
5223
+ movu ym5, [r5 + r6]
5224
+ vinserti32x8 m5, [r4 + r1], 1
5225
+ punpcklbw m6, m4, m5
5226
+ pmaddubsw m6, m9
5227
+ punpckhbw m4, m5
5228
+ pmaddubsw m4, m9
5229
+
5230
+ paddw m0, m6
5231
+ paddw m1, m4
5232
+
5233
+ movu ym4, [r5 + 4 * r1]
5234
+ vinserti32x8 m4, [r4 + 2 * r1], 1
5235
+ punpcklbw m6, m5, m4
5236
+ pmaddubsw m6, m9
5237
+ punpckhbw m5, m4
5238
+ pmaddubsw m5, m9
5239
+
5240
+ paddw m2, m6
5241
+ paddw m3, m5
5242
+
5243
+ movu ym15, [r4 + r1]
5244
+ vinserti32x8 m15, [r4 + r6], 1
5245
+ punpcklbw m12, m4, m15
5246
+ pmaddubsw m12, m10
5247
+ punpckhbw m13, m4, m15
5248
+ pmaddubsw m13, m10
5249
+
5250
+ movu ym4, [r4 + 2 * r1]
5251
+ vinserti32x8 m4, [r4 + 4 * r1], 1
5252
+ punpcklbw m14, m15, m4
5253
+ pmaddubsw m14, m10
5254
+ punpckhbw m15, m4
5255
+ pmaddubsw m15, m10
5256
+
5257
+ movu ym5, [r4 + r6]
5258
+ vinserti32x8 m5, [r8 + r1], 1
5259
+ punpcklbw m6, m4, m5
5260
+ pmaddubsw m6, m11
5261
+ punpckhbw m4, m5
5262
+ pmaddubsw m4, m11
5263
+
5264
+ paddw m12, m6
5265
+ paddw m13, m4
5266
+
5267
+ movu ym4, [r4 + 4 * r1]
5268
+ vinserti32x8 m4, [r8 + 2 * r1], 1
5269
+ punpcklbw m6, m5, m4
5270
+ pmaddubsw m6, m11
5271
+ punpckhbw m5, m4
5272
+ pmaddubsw m5, m11
5273
+
5274
+ paddw m14, m6
5275
+ paddw m15, m5
5276
+
5277
+ paddw m0, m12
5278
+ paddw m1, m13
5279
+ paddw m2, m14
5280
+ paddw m3, m15
5281
+%ifidn %1,pp
5282
+ pmulhrsw m0, m7
5283
+ pmulhrsw m1, m7
5284
+ pmulhrsw m2, m7
5285
+ pmulhrsw m3, m7
5286
+ packuswb m0, m1
5287
+ packuswb m2, m3
5288
+
5289
+ movu [r9], ym0
5290
+ movu [r9 + r3], ym2
5291
+ vextracti32x8 [r9 + 2 * r3], m0, 1
5292
+ vextracti32x8 [r9 + r7], m2, 1
5293
+%else
5294
+ psubw m0, m7
5295
+ psubw m1, m7
5296
+ mova m12, m16
5297
+ mova m13, m17
5298
+ vpermi2q m12, m0, m1
5299
+ vpermi2q m13, m0, m1
5300
+ movu [r9], m12
5301
+ movu [r9 + 2 * r3], m13
5302
+
5303
+ psubw m2, m7
5304
+ psubw m3, m7
5305
+ mova m14, m16
5306
+ mova m15, m17
5307
+ vpermi2q m14, m2, m3
5308
+ vpermi2q m15, m2, m3
5309
+ movu [r9 + r3], m14
5310
+ movu [r9 + r7], m15
5311
+%endif
5312
+ movu xm1, [r0 + mmsize/2]
5313
+ vinserti32x4 m1, [r0 + 2 * r1 + mmsize/2], 1
5314
+ vinserti32x4 m1, [r5 + mmsize/2], 2
5315
+ vinserti32x4 m1, [r5 + 2 * r1 + mmsize/2], 3
5316
+ movu xm3, [r0 + r1 + mmsize/2]
5317
+ vinserti32x4 m3, [r0 + r6 + mmsize/2], 1
5318
+ vinserti32x4 m3, [r5 + r1 + mmsize/2], 2
5319
+ vinserti32x4 m3, [r5 + r6 + mmsize/2], 3
5320
+ punpcklbw m0, m1, m3
5321
+ pmaddubsw m0, m8
5322
+ punpckhbw m1, m3
5323
+ pmaddubsw m1, m8
5324
+
5325
+ movu xm4, [r0 + 2 * r1 + mmsize/2]
5326
+ vinserti32x4 m4, [r0 + 4 * r1 + mmsize/2], 1
5327
+ vinserti32x4 m4, [r5 + 2 * r1 + mmsize/2], 2
5328
+ vinserti32x4 m4, [r5 + 4 * r1 + mmsize/2], 3
5329
+ punpcklbw m2, m3, m4
5330
+ pmaddubsw m2, m8
5331
+ punpckhbw m3, m4
5332
+ pmaddubsw m3, m8
5333
+
5334
+ movu xm5, [r0 + r6 + mmsize/2]
5335
+ vinserti32x4 m5, [r5 + r1 + mmsize/2], 1
5336
+ vinserti32x4 m5, [r5 + r6 + mmsize/2], 2
5337
+ vinserti32x4 m5, [r4 + r1 + mmsize/2], 3
5338
+ punpcklbw m6, m4, m5
5339
+ pmaddubsw m6, m9
5340
+ punpckhbw m4, m5
5341
+ pmaddubsw m4, m9
5342
+
5343
+ paddw m0, m6
5344
+ paddw m1, m4
5345
+
5346
+ movu xm4, [r0 + 4 * r1 + mmsize/2]
5347
+ vinserti32x4 m4, [r5 + 2 * r1 + mmsize/2], 1
5348
+ vinserti32x4 m4, [r5 + 4 * r1 + mmsize/2], 2
5349
+ vinserti32x4 m4, [r4 + 2 * r1 + mmsize/2], 3
5350
+ punpcklbw m6, m5, m4
5351
+ pmaddubsw m6, m9
5352
+ punpckhbw m5, m4
5353
+ pmaddubsw m5, m9
5354
+
5355
+ paddw m2, m6
5356
+ paddw m3, m5
5357
+
5358
+ movu xm15, [r5 + r1 + mmsize/2]
5359
+ vinserti32x4 m15, [r5 + r6 + mmsize/2], 1
5360
+ vinserti32x4 m15, [r4 + r1 + mmsize/2], 2
5361
+ vinserti32x4 m15, [r4 + r6 + mmsize/2], 3
5362
+ punpcklbw m12, m4, m15
5363
+ pmaddubsw m12, m10
5364
+ punpckhbw m13, m4, m15
5365
+ pmaddubsw m13, m10
5366
+
5367
+ movu xm4, [r5 + 2 * r1 + mmsize/2]
5368
+ vinserti32x4 m4, [r5 + 4 * r1 + mmsize/2], 1
5369
+ vinserti32x4 m4, [r4 + 2 * r1 + mmsize/2], 2
5370
+ vinserti32x4 m4, [r4 + 4 * r1 + mmsize/2], 3
5371
+ punpcklbw m14, m15, m4
5372
+ pmaddubsw m14, m10
5373
+ punpckhbw m15, m4
5374
+ pmaddubsw m15, m10
5375
+
5376
+ movu xm5, [r5 + r6 + mmsize/2]
5377
+ vinserti32x4 m5, [r4 + r1 + mmsize/2], 1
5378
+ vinserti32x4 m5, [r4 + r6 + mmsize/2], 2
5379
+ vinserti32x4 m5, [r8 + r1 + mmsize/2], 3
5380
+ punpcklbw m6, m4, m5
5381
+ pmaddubsw m6, m11
5382
+ punpckhbw m4, m5
5383
+ pmaddubsw m4, m11
5384
+
5385
+ paddw m12, m6
5386
+ paddw m13, m4
5387
+
5388
+ movu xm4, [r5 + 4 * r1 + mmsize/2]
5389
+ vinserti32x4 m4, [r4 + 2 * r1 + mmsize/2], 1
5390
+ vinserti32x4 m4, [r4 + 4 * r1 + mmsize/2], 2
5391
+ vinserti32x4 m4, [r8 + 2 * r1 + mmsize/2], 3
5392
+ punpcklbw m6, m5, m4
5393
+ pmaddubsw m6, m11
5394
+ punpckhbw m5, m4
5395
+ pmaddubsw m5, m11
5396
+
5397
+ paddw m14, m6
5398
+ paddw m15, m5
5399
+
5400
+ paddw m0, m12
5401
+ paddw m1, m13
5402
+ paddw m2, m14
5403
+ paddw m3, m15
5404
+%ifidn %1, pp
5405
+ pmulhrsw m0, m7
5406
+ pmulhrsw m1, m7
5407
+ pmulhrsw m2, m7
5408
+ pmulhrsw m3, m7
5409
+
5410
+ packuswb m0, m1
5411
+ packuswb m2, m3
5412
+ movu [r2 + mmsize/2], xm0
5413
+ movu [r2 + r3 + mmsize/2], xm2
5414
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m0, 1
5415
+ vextracti32x4 [r2 + r7 + mmsize/2], m2, 1
5416
+ lea r2, [r2 + 4 * r3]
5417
+ vextracti32x4 [r2 + mmsize/2], m0, 2
5418
+ vextracti32x4 [r2 + r3 + mmsize/2], m2, 2
5419
+ vextracti32x4 [r2 + 2 * r3 + mmsize/2], m0, 3
5420
+ vextracti32x4 [r2 + r7 + mmsize/2], m2, 3
5421
+%else
5422
+ psubw m0, m7
5423
+ psubw m1, m7
5424
+ mova m12, m16
5425
+ mova m13, m17
5426
+ vpermi2q m12, m0, m1
5427
+ vpermi2q m13, m0, m1
5428
+ movu [r2 + mmsize], ym12
5429
+ vextracti32x8 [r2 + 2 * r3 + mmsize], m12, 1
5430
+
5431
+ psubw m2, m7
5432
+ psubw m3, m7
5433
+ mova m14, m16
5434
+ mova m15, m17
5435
+ vpermi2q m14, m2, m3
5436
+ vpermi2q m15, m2, m3
5437
+ movu [r2 + r3 + mmsize], ym14
5438
+ vextracti32x8 [r2 + r7 + mmsize], m14, 1
5439
+ lea r2, [r2 + 4 * r3]
5440
+
5441
+ movu [r2 + mmsize], ym13
5442
+ movu [r2 + r3 + mmsize], ym15
5443
+ vextracti32x8 [r2 + 2 * r3 + mmsize], m13, 1
5444
+ vextracti32x8 [r2 + r7 + mmsize], m15, 1
5445
+%endif
5446
+%endmacro
5447
+;-----------------------------------------------------------------------------------------------------------------
5448
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
5449
+;-----------------------------------------------------------------------------------------------------------------
5450
+%macro FILTER_VERT_LUMA_48x64_AVX512 1
5451
+INIT_ZMM avx512
5452
+cglobal interp_8tap_vert_%1_48x64, 5, 10, 18
5453
+ mov r4d, r4m
5454
+ shl r4d, 8
5455
+
5456
+%ifdef PIC
5457
+ lea r5, [tab_LumaCoeffVer_32_avx512]
5458
+ mova m8, [r5 + r4]
5459
+ mova m9, [r5 + r4 + 1 * mmsize]
5460
+ mova m10, [r5 + r4 + 2 * mmsize]
5461
+ mova m11, [r5 + r4 + 3 * mmsize]
5462
+%else
5463
+ mova m8, [tab_LumaCoeffVer_32_avx512 + r4]
5464
+ mova m9, [tab_LumaCoeffVer_32_avx512 + r4 + 1 * mmsize]
5465
+ mova m10, [tab_LumaCoeffVer_32_avx512 + r4 + 2 * mmsize]
5466
+ mova m11, [tab_LumaCoeffVer_32_avx512 + r4 + 3 * mmsize]
5467
+%endif
5468
+%ifidn %1, pp
5469
+ vbroadcasti32x8 m7, [pw_512]
5470
+%else
5471
+ shl r3d, 1
5472
+ vbroadcasti32x8 m7, [pw_2000]
5473
+ mova m16, [interp4_vps_store1_avx512]
5474
+ mova m17, [interp4_vps_store2_avx512]
5475
+%endif
5476
+
5477
+ lea r6, [3 * r1]
5478
+ lea r7, [3 * r3]
5479
+ sub r0, r6
5480
+
5481
+%rep 7
5482
+ PROCESS_LUMA_VERT_48x8_AVX512 %1
5483
+ lea r0, [r4]
5484
+ lea r2, [r2 + 4 * r3]
5485
+%endrep
5486
+ PROCESS_LUMA_VERT_48x8_AVX512 %1
5487
+ RET
5488
+%endmacro
5489
+
5490
+%if ARCH_X86_64
5491
+ FILTER_VERT_LUMA_48x64_AVX512 pp
5492
+ FILTER_VERT_LUMA_48x64_AVX512 ps
5493
+%endif
5494
+%macro PROCESS_LUMA_VERT_64x2_AVX512 1
5495
+ lea r5, [r0 + 4 * r1]
5496
+ movu m1, [r0]
5497
+ movu m3, [r0 + r1]
5498
+ punpcklbw m0, m1, m3
5499
+ pmaddubsw m0, m8
5500
+ punpckhbw m1, m3
5501
+ pmaddubsw m1, m8
5502
+
5503
+ movu m4, [r0 + 2 * r1]
5504
+ punpcklbw m2, m3, m4
5505
+ pmaddubsw m2, m8
5506
+ punpckhbw m3, m4
5507
+ pmaddubsw m3, m8
5508
+
5509
+ movu m5, [r0 + r6]
5510
+ punpcklbw m6, m4, m5
5511
+ pmaddubsw m6, m9
5512
+ punpckhbw m4, m5
5513
+ pmaddubsw m4, m9
5514
+
5515
+ paddw m0, m6
5516
+ paddw m1, m4
5517
+
5518
+ movu m4, [r0 + 4 * r1]
5519
+ punpcklbw m6, m5, m4
5520
+ pmaddubsw m6, m9
5521
+ punpckhbw m5, m4
5522
+ pmaddubsw m5, m9
5523
+
5524
+ paddw m2, m6
5525
+ paddw m3, m5
5526
+
5527
+ movu m15, [r5 + r1]
5528
+ punpcklbw m12, m4, m15
5529
+ pmaddubsw m12, m10
5530
+ punpckhbw m13, m4, m15
5531
+ pmaddubsw m13, m10
5532
+
5533
+ movu m4, [r5 + 2 * r1]
5534
+ punpcklbw m14, m15, m4
5535
+ pmaddubsw m14, m10
5536
+ punpckhbw m15, m4
5537
+ pmaddubsw m15, m10
5538
+
5539
+ movu m5, [r5 + r6]
5540
+ punpcklbw m6, m4, m5
5541
+ pmaddubsw m6, m11
5542
+ punpckhbw m4, m5
5543
+ pmaddubsw m4, m11
5544
+
5545
+ paddw m12, m6
5546
+ paddw m13, m4
5547
+
5548
+ movu m4, [r5 + 4 * r1]
5549
+ punpcklbw m6, m5, m4
5550
+ pmaddubsw m6, m11
5551
+ punpckhbw m5, m4
5552
+ pmaddubsw m5, m11
5553
+
5554
+ paddw m14, m6
5555
+ paddw m15, m5
5556
+
5557
+ paddw m0, m12
5558
+ paddw m1, m13
5559
+ paddw m2, m14
5560
+ paddw m3, m15
5561
+%ifidn %1,pp
5562
+ pmulhrsw m0, m7
5563
+ pmulhrsw m1, m7
5564
+ pmulhrsw m2, m7
5565
+ pmulhrsw m3, m7
5566
+
5567
+ packuswb m0, m1
5568
+ packuswb m2, m3
5569
+ movu [r2], m0
5570
+ movu [r2 + r3], m2
5571
+%else
5572
+ psubw m0, m7
5573
+ psubw m1, m7
5574
+ mova m12, m16
5575
+ mova m13, m17
5576
+ vpermi2q m12, m0, m1
5577
+ vpermi2q m13, m0, m1
5578
+ movu [r2], m12
5579
+ movu [r2 + mmsize], m13
5580
+
5581
+ psubw m2, m7
5582
+ psubw m3, m7
5583
+ mova m14, m16
5584
+ mova m15, m17
5585
+ vpermi2q m14, m2, m3
5586
+ vpermi2q m15, m2, m3
5587
+ movu [r2 + r3], m14
5588
+ movu [r2 + r3 + mmsize], m15
5589
+%endif
5590
+%endmacro
5591
+;-----------------------------------------------------------------------------------------------------------------
5592
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
5593
+;-----------------------------------------------------------------------------------------------------------------
5594
+%macro FILTER_VERT_LUMA_64xN_AVX512 2
5595
+INIT_ZMM avx512
5596
+cglobal interp_8tap_vert_%1_64x%2, 5, 8, 18
5597
+ mov r4d, r4m
5598
+ shl r4d, 8
5599
+%ifdef PIC
5600
+ lea r5, [tab_LumaCoeffVer_32_avx512]
5601
+ mova m8, [r5 + r4]
5602
+ mova m9, [r5 + r4 + 1 * mmsize]
5603
+ mova m10, [r5 + r4 + 2 * mmsize]
5604
+ mova m11, [r5 + r4 + 3 * mmsize]
5605
+%else
5606
+ mova m8, [tab_LumaCoeffVer_32_avx512 + r4]
5607
+ mova m9, [tab_LumaCoeffVer_32_avx512 + r4 + 1 * mmsize]
5608
+ mova m10, [tab_LumaCoeffVer_32_avx512 + r4 + 2 * mmsize]
5609
+ mova m11, [tab_LumaCoeffVer_32_avx512 + r4 + 3 * mmsize]
5610
+%endif
5611
+%ifidn %1, pp
5612
+ vbroadcasti32x8 m7, [pw_512]
5613
+%else
5614
+ shl r3d, 1
5615
+ vbroadcasti32x8 m7, [pw_2000]
5616
+ mova m16, [interp4_vps_store1_avx512]
5617
+ mova m17, [interp4_vps_store2_avx512]
5618
+%endif
5619
+
5620
+ lea r6, [3 * r1]
5621
+ sub r0, r6
5622
+ lea r7, [3 * r3]
5623
+
5624
+%rep %2/2 - 1
5625
+ PROCESS_LUMA_VERT_64x2_AVX512 %1
5626
+ lea r0, [r0 + 2 * r1]
5627
+ lea r2, [r2 + 2 * r3]
5628
+%endrep
5629
+ PROCESS_LUMA_VERT_64x2_AVX512 %1
5630
+ RET
5631
+%endmacro
5632
+
5633
+%if ARCH_X86_64
5634
+FILTER_VERT_LUMA_64xN_AVX512 pp, 16
5635
+FILTER_VERT_LUMA_64xN_AVX512 pp, 32
5636
+FILTER_VERT_LUMA_64xN_AVX512 pp, 48
5637
+FILTER_VERT_LUMA_64xN_AVX512 pp, 64
5638
+
5639
+FILTER_VERT_LUMA_64xN_AVX512 ps, 16
5640
+FILTER_VERT_LUMA_64xN_AVX512 ps, 32
5641
+FILTER_VERT_LUMA_64xN_AVX512 ps, 48
5642
+FILTER_VERT_LUMA_64xN_AVX512 ps, 64
5643
+%endif
5644
+;-------------------------------------------------------------------------------------------------------------
5645
+;avx512 luma_vpp and luma_vps code end
5646
+;-------------------------------------------------------------------------------------------------------------
5647
+;-------------------------------------------------------------------------------------------------------------
5648
+;ipfilter_luma_avx512 code end
5649
+;-------------------------------------------------------------------------------------------------------------
5650
\ No newline at end of file
5651
x265_2.7.tar.gz/source/common/x86/ipfilter8.h -> x265_2.9.tar.gz/source/common/x86/ipfilter8.h
Changed
16
1
2
FUNCDEF_PU(void, interp_8tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
3
FUNCDEF_PU(void, interp_8tap_hv_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); \
4
FUNCDEF_CHROMA_PU(void, filterPixelToShort, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
5
+ FUNCDEF_CHROMA_PU(void, filterPixelToShort_aligned, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
6
FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
7
FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
8
FUNCDEF_CHROMA_PU(void, interp_4tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
9
10
SETUP_FUNC_DEF(sse3);
11
SETUP_FUNC_DEF(sse4);
12
SETUP_FUNC_DEF(avx2);
13
+SETUP_FUNC_DEF(avx512);
14
15
#endif // ifndef X265_IPFILTER8_H
16
x265_2.7.tar.gz/source/common/x86/loopfilter.asm -> x265_2.9.tar.gz/source/common/x86/loopfilter.asm
Changed
50
1
2
;============================================================================================================
3
INIT_XMM sse4
4
%if HIGH_BIT_DEPTH
5
+%if ARCH_X86_64
6
cglobal saoCuOrgE0, 4,5,9
7
mov r4d, r4m
8
movh m6, [r1]
9
10
sub r4d, 16
11
jnz .loopH
12
RET
13
-
14
+%endif
15
%else ; HIGH_BIT_DEPTH == 1
16
17
cglobal saoCuOrgE0, 5, 5, 8, rec, offsetEo, lcuWidth, signLeft, stride
18
19
20
INIT_YMM avx2
21
%if HIGH_BIT_DEPTH
22
+%if ARCH_X86_64
23
cglobal saoCuOrgE0, 4,4,9
24
vbroadcasti128 m6, [r1]
25
movzx r1d, byte [r3]
26
27
dec r2d
28
jnz .loop
29
RET
30
+%endif
31
%else ; HIGH_BIT_DEPTH
32
cglobal saoCuOrgE0, 5, 5, 7, rec, offsetEo, lcuWidth, signLeft, stride
33
34
35
RET
36
%endif
37
38
+%if ARCH_X86_64
39
INIT_YMM avx2
40
%if HIGH_BIT_DEPTH
41
cglobal saoCuOrgB0, 5,7,8
42
43
.end:
44
RET
45
%endif
46
+%endif
47
48
;============================================================================================================
49
; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int width)
50
x265_2.7.tar.gz/source/common/x86/mc-a.asm -> x265_2.9.tar.gz/source/common/x86/mc-a.asm
Changed
1841
1
2
%error Unsupport bit depth!
3
%endif
4
5
-SECTION_RODATA 32
6
+SECTION_RODATA 64
7
8
-ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
9
-ch_shuf_adj: times 8 db 0
10
- times 8 db 2
11
- times 8 db 4
12
- times 8 db 6
13
+ALIGN 64
14
+const shuf_avx512, dq 0, 2, 4, 6, 1, 3, 5, 7
15
16
SECTION .text
17
18
19
;------------------------------------------------------------------------------
20
; avx2 asm for addAvg high_bit_depth
21
;------------------------------------------------------------------------------
22
+%if ARCH_X86_64
23
INIT_YMM avx2
24
cglobal addAvg_8x2, 6,6,2, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
25
movu xm0, [r0]
26
27
movu [r2], xm0
28
movu [r2 + r5], xm2
29
RET
30
+%endif
31
32
%macro ADDAVG_W8_H4_AVX2 1
33
cglobal addAvg_8x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
34
35
RET
36
%endmacro
37
38
+%if ARCH_X86_64
39
ADDAVG_W8_H4_AVX2 4
40
ADDAVG_W8_H4_AVX2 8
41
ADDAVG_W8_H4_AVX2 12
42
ADDAVG_W8_H4_AVX2 16
43
ADDAVG_W8_H4_AVX2 32
44
ADDAVG_W8_H4_AVX2 64
45
+%endif
46
47
+%if ARCH_X86_64
48
cglobal addAvg_12x16, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
49
mova m4, [pw_ %+ ADDAVG_ROUND]
50
mova m5, [pw_pixel_max]
51
52
dec r6d
53
jnz .loop
54
RET
55
+%endif
56
57
%macro ADDAVG_W16_H4_AVX2 1
58
cglobal addAvg_16x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
59
60
RET
61
%endmacro
62
63
+%if ARCH_X86_64
64
ADDAVG_W16_H4_AVX2 4
65
ADDAVG_W16_H4_AVX2 8
66
ADDAVG_W16_H4_AVX2 12
67
68
ADDAVG_W16_H4_AVX2 24
69
ADDAVG_W16_H4_AVX2 32
70
ADDAVG_W16_H4_AVX2 64
71
+%endif
72
73
+%if ARCH_X86_64
74
cglobal addAvg_24x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
75
mova m4, [pw_ %+ ADDAVG_ROUND]
76
mova m5, [pw_pixel_max]
77
78
dec r6d
79
jnz .loop
80
RET
81
+%endif
82
83
%macro ADDAVG_W32_H2_AVX2 1
84
cglobal addAvg_32x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
85
86
RET
87
%endmacro
88
89
+%if ARCH_X86_64
90
ADDAVG_W32_H2_AVX2 8
91
ADDAVG_W32_H2_AVX2 16
92
ADDAVG_W32_H2_AVX2 24
93
ADDAVG_W32_H2_AVX2 32
94
ADDAVG_W32_H2_AVX2 48
95
ADDAVG_W32_H2_AVX2 64
96
+%endif
97
98
+%if ARCH_X86_64
99
cglobal addAvg_48x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
100
mova m4, [pw_ %+ ADDAVG_ROUND]
101
mova m5, [pw_pixel_max]
102
103
dec r6d
104
jnz .loop
105
RET
106
+%endif
107
108
%macro ADDAVG_W64_H1_AVX2 1
109
cglobal addAvg_64x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
110
111
RET
112
%endmacro
113
114
+%if ARCH_X86_64
115
ADDAVG_W64_H1_AVX2 16
116
ADDAVG_W64_H1_AVX2 32
117
ADDAVG_W64_H1_AVX2 48
118
ADDAVG_W64_H1_AVX2 64
119
+%endif
120
+;-----------------------------------------------------------------------------
121
+;addAvg avx512 high bit depth code start
122
+;-----------------------------------------------------------------------------
123
+%macro PROCESS_ADDAVG_16x4_HBD_AVX512 0
124
+ movu ym0, [r0]
125
+ vinserti32x8 m0, [r0 + r3], 1
126
+ movu ym1, [r1]
127
+ vinserti32x8 m1, [r1 + r4], 1
128
+
129
+ paddw m0, m1
130
+ pmulhrsw m0, m3
131
+ paddw m0, m4
132
+ pmaxsw m0, m2
133
+ pminsw m0, m5
134
+
135
+ movu [r2], ym0
136
+ vextracti32x8 [r2 + r5], m0, 1
137
+
138
+ movu ym0, [r0 + 2 * r3]
139
+ vinserti32x8 m0, [r0 + r6], 1
140
+ movu ym1, [r1 + 2 * r4]
141
+ vinserti32x8 m1, [r1 + r7], 1
142
+
143
+ paddw m0, m1
144
+ pmulhrsw m0, m3
145
+ paddw m0, m4
146
+ pmaxsw m0, m2
147
+ pminsw m0, m5
148
+
149
+ movu [r2 + 2 * r5], ym0
150
+ vextracti32x8 [r2 + r8], m0, 1
151
+%endmacro
152
+
153
+%macro PROCESS_ADDAVG_32x4_HBD_AVX512 0
154
+ movu m0, [r0]
155
+ movu m1, [r1]
156
+ paddw m0, m1
157
+ pmulhrsw m0, m3
158
+ paddw m0, m4
159
+ pmaxsw m0, m2
160
+ pminsw m0, m5
161
+ movu [r2], m0
162
+
163
+ movu m0, [r0 + r3]
164
+ movu m1, [r1 + r4]
165
+ paddw m0, m1
166
+ pmulhrsw m0, m3
167
+ paddw m0, m4
168
+ pmaxsw m0, m2
169
+ pminsw m0, m5
170
+ movu [r2 + r5], m0
171
+
172
+ movu m0, [r0 + 2 * r3]
173
+ movu m1, [r1 + 2 * r4]
174
+ paddw m0, m1
175
+ pmulhrsw m0, m3
176
+ paddw m0, m4
177
+ pmaxsw m0, m2
178
+ pminsw m0, m5
179
+ movu [r2 + 2 * r5], m0
180
+
181
+ movu m0, [r0 + r6]
182
+ movu m1, [r1 + r7]
183
+ paddw m0, m1
184
+ pmulhrsw m0, m3
185
+ paddw m0, m4
186
+ pmaxsw m0, m2
187
+ pminsw m0, m5
188
+ movu [r2 + r8], m0
189
+%endmacro
190
+
191
+%macro PROCESS_ADDAVG_64x4_HBD_AVX512 0
192
+ movu m0, [r0]
193
+ movu m1, [r1]
194
+ paddw m0, m1
195
+ pmulhrsw m0, m3
196
+ paddw m0, m4
197
+ pmaxsw m0, m2
198
+ pminsw m0, m5
199
+ movu [r2], m0
200
+
201
+ movu m0, [r0 + mmsize]
202
+ movu m1, [r1 + mmsize]
203
+ paddw m0, m1
204
+ pmulhrsw m0, m3
205
+ paddw m0, m4
206
+ pmaxsw m0, m2
207
+ pminsw m0, m5
208
+ movu [r2 + mmsize], m0
209
+
210
+ movu m0, [r0 + r3]
211
+ movu m1, [r1 + r4]
212
+ paddw m0, m1
213
+ pmulhrsw m0, m3
214
+ paddw m0, m4
215
+ pmaxsw m0, m2
216
+ pminsw m0, m5
217
+ movu [r2 + r5], m0
218
+
219
+ movu m0, [r0 + r3 + mmsize]
220
+ movu m1, [r1 + r4 + mmsize]
221
+ paddw m0, m1
222
+ pmulhrsw m0, m3
223
+ paddw m0, m4
224
+ pmaxsw m0, m2
225
+ pminsw m0, m5
226
+ movu [r2 + r5 + mmsize], m0
227
+
228
+ movu m0, [r0 + 2 * r3]
229
+ movu m1, [r1 + 2 * r4]
230
+ paddw m0, m1
231
+ pmulhrsw m0, m3
232
+ paddw m0, m4
233
+ pmaxsw m0, m2
234
+ pminsw m0, m5
235
+ movu [r2 + 2 * r5], m0
236
+
237
+ movu m0, [r0 + 2 * r3 + mmsize]
238
+ movu m1, [r1 + 2 * r4 + mmsize]
239
+ paddw m0, m1
240
+ pmulhrsw m0, m3
241
+ paddw m0, m4
242
+ pmaxsw m0, m2
243
+ pminsw m0, m5
244
+ movu [r2 + 2 * r5 + mmsize], m0
245
+
246
+ movu m0, [r0 + r6]
247
+ movu m1, [r1 + r7]
248
+ paddw m0, m1
249
+ pmulhrsw m0, m3
250
+ paddw m0, m4
251
+ pmaxsw m0, m2
252
+ pminsw m0, m5
253
+ movu [r2 + r8], m0
254
+
255
+ movu m0, [r0 + r6 + mmsize]
256
+ movu m1, [r1 + r7 + mmsize]
257
+ paddw m0, m1
258
+ pmulhrsw m0, m3
259
+ paddw m0, m4
260
+ pmaxsw m0, m2
261
+ pminsw m0, m5
262
+ movu [r2 + r8 + mmsize], m0
263
+%endmacro
264
+
265
+%macro PROCESS_ADDAVG_48x4_HBD_AVX512 0
266
+ movu m0, [r0]
267
+ movu m1, [r1]
268
+ paddw m0, m1
269
+ pmulhrsw m0, m3
270
+ paddw m0, m4
271
+ pmaxsw m0, m2
272
+ pminsw m0, m5
273
+ movu [r2], m0
274
+
275
+ movu ym0, [r0 + mmsize]
276
+ movu ym1, [r1 + mmsize]
277
+ paddw ym0, ym1
278
+ pmulhrsw ym0, ym3
279
+ paddw ym0, ym4
280
+ pmaxsw ym0, ym2
281
+ pminsw ym0, ym5
282
+ movu [r2 + mmsize], ym0
283
+
284
+ movu m0, [r0 + r3]
285
+ movu m1, [r1 + r4]
286
+ paddw m0, m1
287
+ pmulhrsw m0, m3
288
+ paddw m0, m4
289
+ pmaxsw m0, m2
290
+ pminsw m0, m5
291
+ movu [r2 + r5], m0
292
+
293
+ movu ym0, [r0 + r3 + mmsize]
294
+ movu ym1, [r1 + r4 + mmsize]
295
+ paddw ym0, ym1
296
+ pmulhrsw ym0, ym3
297
+ paddw ym0, ym4
298
+ pmaxsw ym0, ym2
299
+ pminsw ym0, ym5
300
+ movu [r2 + r5 + mmsize], ym0
301
+
302
+ movu m0, [r0 + 2 * r3]
303
+ movu m1, [r1 + 2 * r4]
304
+ paddw m0, m1
305
+ pmulhrsw m0, m3
306
+ paddw m0, m4
307
+ pmaxsw m0, m2
308
+ pminsw m0, m5
309
+ movu [r2 + 2 * r5], m0
310
+
311
+ movu ym0, [r0 + 2 * r3 + mmsize]
312
+ movu ym1, [r1 + 2 * r4 + mmsize]
313
+ paddw ym0, ym1
314
+ pmulhrsw ym0, ym3
315
+ paddw ym0, ym4
316
+ pmaxsw ym0, ym2
317
+ pminsw ym0, ym5
318
+ movu [r2 + 2 * r5 + mmsize], ym0
319
+
320
+ movu m0, [r0 + r6]
321
+ movu m1, [r1 + r7]
322
+ paddw m0, m1
323
+ pmulhrsw m0, m3
324
+ paddw m0, m4
325
+ pmaxsw m0, m2
326
+ pminsw m0, m5
327
+ movu [r2 + r8], m0
328
+
329
+ movu ym0, [r0 + r6 + mmsize]
330
+ movu ym1, [r1 + r7 + mmsize]
331
+ paddw ym0, ym1
332
+ pmulhrsw ym0, ym3
333
+ paddw ym0, ym4
334
+ pmaxsw ym0, ym2
335
+ pminsw ym0, ym5
336
+ movu [r2 + r8 + mmsize], ym0
337
+%endmacro
338
+;-----------------------------------------------------------------------------
339
+;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
340
+;-----------------------------------------------------------------------------
341
+%if ARCH_X86_64 ; uses r6-r8 (9 GPRs) -> 64-bit only
342
+INIT_ZMM avx512
343
+cglobal addAvg_16x4, 6,9,6 ; addAvg(int16_t *src0, int16_t *src1, pixel *dst, src0Stride, src1Stride, dstStride)
344
+ vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND] ; rounding offset (token-pasted constant, defined elsewhere)
345
+ vbroadcasti32x8 m5, [pw_pixel_max] ; clamp ceiling
346
+ vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR] ; pmulhrsw scale factor
347
+ pxor m2, m2 ; clamp floor = 0
348
+ add r3, r3 ; int16 samples: convert pixel strides to byte strides
349
+ add r4, r4
350
+ add r5, r5
351
+ lea r6, [3 * r3] ; 3*stride, addresses the 4th row of each strip
352
+ lea r7, [3 * r4]
353
+ lea r8, [3 * r5]
354
+ PROCESS_ADDAVG_16x4_HBD_AVX512
355
+ RET
356
+%endif
357
+
358
+%macro ADDAVG_W16_HBD_AVX512 1 ; emit addAvg_16xH (high bit depth) for H = %1; H must be a multiple of 4
359
+INIT_ZMM avx512
360
+cglobal addAvg_16x%1, 6,9,6 ; addAvg(src0, src1, dst, src0Stride, src1Stride, dstStride)
361
+ vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND] ; rounding offset
362
+ vbroadcasti32x8 m5, [pw_pixel_max] ; clamp ceiling
363
+ vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR] ; pmulhrsw scale factor
364
+ pxor m2, m2 ; clamp floor = 0
365
+ add r3, r3 ; pixel strides -> byte strides (int16 samples)
366
+ add r4, r4
367
+ add r5, r5
368
+ lea r6, [3 * r3] ; 3*stride for row 3 of each 4-row strip
369
+ lea r7, [3 * r4]
370
+ lea r8, [3 * r5]
371
+
372
+%rep %1/4 - 1 ; every 4-row strip except the last advances the pointers
373
+ PROCESS_ADDAVG_16x4_HBD_AVX512
374
+ lea r2, [r2 + 4 * r5]
375
+ lea r0, [r0 + 4 * r3]
376
+ lea r1, [r1 + 4 * r4]
377
+%endrep
378
+ PROCESS_ADDAVG_16x4_HBD_AVX512 ; final strip, no pointer update needed
379
+ RET
380
+%endmacro
381
+
382
+%if ARCH_X86_64
383
+ADDAVG_W16_HBD_AVX512 8
384
+ADDAVG_W16_HBD_AVX512 12
385
+ADDAVG_W16_HBD_AVX512 16
386
+ADDAVG_W16_HBD_AVX512 24
387
+ADDAVG_W16_HBD_AVX512 32
388
+ADDAVG_W16_HBD_AVX512 64
389
+%endif
390
+
391
+%macro ADDAVG_W32_HBD_AVX512 1
392
+INIT_ZMM avx512
393
+cglobal addAvg_32x%1, 6,9,6
394
+ vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
395
+ vbroadcasti32x8 m5, [pw_pixel_max]
396
+ vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR]
397
+ pxor m2, m2
398
+ add r3, r3
399
+ add r4, r4
400
+ add r5, r5
401
+ lea r6, [3 * r3]
402
+ lea r7, [3 * r4]
403
+ lea r8, [3 * r5]
404
+
405
+%rep %1/4 - 1
406
+ PROCESS_ADDAVG_32x4_HBD_AVX512
407
+ lea r2, [r2 + 4 * r5]
408
+ lea r0, [r0 + 4 * r3]
409
+ lea r1, [r1 + 4 * r4]
410
+%endrep
411
+ PROCESS_ADDAVG_32x4_HBD_AVX512
412
+ RET
413
+%endmacro
414
+
415
+%if ARCH_X86_64
416
+ADDAVG_W32_HBD_AVX512 8
417
+ADDAVG_W32_HBD_AVX512 16
418
+ADDAVG_W32_HBD_AVX512 24
419
+ADDAVG_W32_HBD_AVX512 32
420
+ADDAVG_W32_HBD_AVX512 48
421
+ADDAVG_W32_HBD_AVX512 64
422
+%endif
423
+
424
+%macro ADDAVG_W64_HBD_AVX512 1
425
+INIT_ZMM avx512
426
+cglobal addAvg_64x%1, 6,9,6
427
+ vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
428
+ vbroadcasti32x8 m5, [pw_pixel_max]
429
+ vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR]
430
+ pxor m2, m2
431
+ add r3, r3
432
+ add r4, r4
433
+ add r5, r5
434
+ lea r6, [3 * r3]
435
+ lea r7, [3 * r4]
436
+ lea r8, [3 * r5]
437
+
438
+%rep %1/4 - 1
439
+ PROCESS_ADDAVG_64x4_HBD_AVX512
440
+ lea r2, [r2 + 4 * r5]
441
+ lea r0, [r0 + 4 * r3]
442
+ lea r1, [r1 + 4 * r4]
443
+%endrep
444
+ PROCESS_ADDAVG_64x4_HBD_AVX512
445
+ RET
446
+%endmacro
447
+
448
+%if ARCH_X86_64
449
+ADDAVG_W64_HBD_AVX512 16
450
+ADDAVG_W64_HBD_AVX512 32
451
+ADDAVG_W64_HBD_AVX512 48
452
+ADDAVG_W64_HBD_AVX512 64
453
+%endif
454
+
455
+%if ARCH_X86_64
456
+INIT_ZMM avx512
457
+cglobal addAvg_48x64, 6,9,6
458
+ vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
459
+ vbroadcasti32x8 m5, [pw_pixel_max]
460
+ vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR]
461
+ pxor m2, m2
462
+ add r3, r3
463
+ add r4, r4
464
+ add r5, r5
465
+ lea r6, [3 * r3]
466
+ lea r7, [3 * r4]
467
+ lea r8, [3 * r5]
468
+
469
+%rep 15
470
+ PROCESS_ADDAVG_48x4_HBD_AVX512
471
+ lea r2, [r2 + 4 * r5]
472
+ lea r0, [r0 + 4 * r3]
473
+ lea r1, [r1 + 4 * r4]
474
+%endrep
475
+ PROCESS_ADDAVG_48x4_HBD_AVX512
476
+ RET
477
+%endif
478
+
479
+%macro PROCESS_ADDAVG_ALIGNED_16x4_HBD_AVX512 0 ; avg 4 rows x 16 HBD pixels, two 16-pixel rows packed per zmm; NOTE(review): despite the _ALIGNED name, loads/stores here are still unaligned movu
480
+ movu ym0, [r0] ; src0 row 0 -> low ymm half
481
+ vinserti32x8 m0, [r0 + r3], 1 ; src0 row 1 -> high half
482
+ movu ym1, [r1] ; src1 rows 0-1, same packing
483
+ vinserti32x8 m1, [r1 + r4], 1
484
+
485
+ paddw m0, m1 ; s0 + s1
486
+ pmulhrsw m0, m3 ; m3 = pw_ADDAVG_FACTOR (caller)
487
+ paddw m0, m4 ; m4 = pw_ADDAVG_ROUND (caller)
488
+ pmaxsw m0, m2 ; clamp to [0, pixel_max]
489
+ pminsw m0, m5
490
+
491
+ movu [r2], ym0 ; row 0 -> dst
492
+ vextracti32x8 [r2 + r5], m0, 1 ; row 1 -> dst + stride
493
+
494
+ movu ym0, [r0 + 2 * r3] ; rows 2-3, same pattern (r6/r7/r8 = 3*stride, caller)
495
+ vinserti32x8 m0, [r0 + r6], 1
496
+ movu ym1, [r1 + 2 * r4]
497
+ vinserti32x8 m1, [r1 + r7], 1
498
+
499
+ paddw m0, m1
500
+ pmulhrsw m0, m3
501
+ paddw m0, m4
502
+ pmaxsw m0, m2
503
+ pminsw m0, m5
504
+
505
+ movu [r2 + 2 * r5], ym0
506
+ vextracti32x8 [r2 + r8], m0, 1
507
+%endmacro
508
+
509
+%macro PROCESS_ADDAVG_ALIGNED_32x4_HBD_AVX512 0
510
+ movu m0, [r0]
511
+ movu m1, [r1]
512
+ paddw m0, m1
513
+ pmulhrsw m0, m3
514
+ paddw m0, m4
515
+ pmaxsw m0, m2
516
+ pminsw m0, m5
517
+ movu [r2], m0
518
+
519
+ movu m0, [r0 + r3]
520
+ movu m1, [r1 + r4]
521
+ paddw m0, m1
522
+ pmulhrsw m0, m3
523
+ paddw m0, m4
524
+ pmaxsw m0, m2
525
+ pminsw m0, m5
526
+ movu [r2 + r5], m0
527
+
528
+ movu m0, [r0 + 2 * r3]
529
+ movu m1, [r1 + 2 * r4]
530
+ paddw m0, m1
531
+ pmulhrsw m0, m3
532
+ paddw m0, m4
533
+ pmaxsw m0, m2
534
+ pminsw m0, m5
535
+ movu [r2 + 2 * r5], m0
536
+
537
+ movu m0, [r0 + r6]
538
+ movu m1, [r1 + r7]
539
+ paddw m0, m1
540
+ pmulhrsw m0, m3
541
+ paddw m0, m4
542
+ pmaxsw m0, m2
543
+ pminsw m0, m5
544
+ movu [r2 + r8], m0
545
+%endmacro
546
+
547
+%macro PROCESS_ADDAVG_ALIGNED_64x4_HBD_AVX512 0
548
+ movu m0, [r0]
549
+ movu m1, [r1]
550
+ paddw m0, m1
551
+ pmulhrsw m0, m3
552
+ paddw m0, m4
553
+ pmaxsw m0, m2
554
+ pminsw m0, m5
555
+ movu [r2], m0
556
+
557
+ movu m0, [r0 + mmsize]
558
+ movu m1, [r1 + mmsize]
559
+ paddw m0, m1
560
+ pmulhrsw m0, m3
561
+ paddw m0, m4
562
+ pmaxsw m0, m2
563
+ pminsw m0, m5
564
+ movu [r2 + mmsize], m0
565
+
566
+ movu m0, [r0 + r3]
567
+ movu m1, [r1 + r4]
568
+ paddw m0, m1
569
+ pmulhrsw m0, m3
570
+ paddw m0, m4
571
+ pmaxsw m0, m2
572
+ pminsw m0, m5
573
+ movu [r2 + r5], m0
574
+
575
+ movu m0, [r0 + r3 + mmsize]
576
+ movu m1, [r1 + r4 + mmsize]
577
+ paddw m0, m1
578
+ pmulhrsw m0, m3
579
+ paddw m0, m4
580
+ pmaxsw m0, m2
581
+ pminsw m0, m5
582
+ movu [r2 + r5 + mmsize], m0
583
+
584
+ movu m0, [r0 + 2 * r3]
585
+ movu m1, [r1 + 2 * r4]
586
+ paddw m0, m1
587
+ pmulhrsw m0, m3
588
+ paddw m0, m4
589
+ pmaxsw m0, m2
590
+ pminsw m0, m5
591
+ movu [r2 + 2 * r5], m0
592
+
593
+ movu m0, [r0 + 2 * r3 + mmsize]
594
+ movu m1, [r1 + 2 * r4 + mmsize]
595
+ paddw m0, m1
596
+ pmulhrsw m0, m3
597
+ paddw m0, m4
598
+ pmaxsw m0, m2
599
+ pminsw m0, m5
600
+ movu [r2 + 2 * r5 + mmsize], m0
601
+
602
+ movu m0, [r0 + r6]
603
+ movu m1, [r1 + r7]
604
+ paddw m0, m1
605
+ pmulhrsw m0, m3
606
+ paddw m0, m4
607
+ pmaxsw m0, m2
608
+ pminsw m0, m5
609
+ movu [r2 + r8], m0
610
+
611
+ movu m0, [r0 + r6 + mmsize]
612
+ movu m1, [r1 + r7 + mmsize]
613
+ paddw m0, m1
614
+ pmulhrsw m0, m3
615
+ paddw m0, m4
616
+ pmaxsw m0, m2
617
+ pminsw m0, m5
618
+ movu [r2 + r8 + mmsize], m0
619
+%endmacro
620
+
621
+%macro PROCESS_ADDAVG_ALIGNED_48x4_HBD_AVX512 0
622
+ movu m0, [r0]
623
+ movu m1, [r1]
624
+ paddw m0, m1
625
+ pmulhrsw m0, m3
626
+ paddw m0, m4
627
+ pmaxsw m0, m2
628
+ pminsw m0, m5
629
+ movu [r2], m0
630
+
631
+ movu ym0, [r0 + mmsize]
632
+ movu ym1, [r1 + mmsize]
633
+ paddw ym0, ym1
634
+ pmulhrsw ym0, ym3
635
+ paddw ym0, ym4
636
+ pmaxsw ym0, ym2
637
+ pminsw ym0, ym5
638
+ movu [r2 + mmsize], ym0
639
+
640
+ movu m0, [r0 + r3]
641
+ movu m1, [r1 + r4]
642
+ paddw m0, m1
643
+ pmulhrsw m0, m3
644
+ paddw m0, m4
645
+ pmaxsw m0, m2
646
+ pminsw m0, m5
647
+ movu [r2 + r5], m0
648
+
649
+ movu ym0, [r0 + r3 + mmsize]
650
+ movu ym1, [r1 + r4 + mmsize]
651
+ paddw ym0, ym1
652
+ pmulhrsw ym0, ym3
653
+ paddw ym0, ym4
654
+ pmaxsw ym0, ym2
655
+ pminsw ym0, ym5
656
+ movu [r2 + r5 + mmsize], ym0
657
+
658
+ movu m0, [r0 + 2 * r3]
659
+ movu m1, [r1 + 2 * r4]
660
+ paddw m0, m1
661
+ pmulhrsw m0, m3
662
+ paddw m0, m4
663
+ pmaxsw m0, m2
664
+ pminsw m0, m5
665
+ movu [r2 + 2 * r5], m0
666
+
667
+ movu ym0, [r0 + 2 * r3 + mmsize]
668
+ movu ym1, [r1 + 2 * r4 + mmsize]
669
+ paddw ym0, ym1
670
+ pmulhrsw ym0, ym3
671
+ paddw ym0, ym4
672
+ pmaxsw ym0, ym2
673
+ pminsw ym0, ym5
674
+ movu [r2 + 2 * r5 + mmsize], ym0
675
+
676
+ movu m0, [r0 + r6]
677
+ movu m1, [r1 + r7]
678
+ paddw m0, m1
679
+ pmulhrsw m0, m3
680
+ paddw m0, m4
681
+ pmaxsw m0, m2
682
+ pminsw m0, m5
683
+ movu [r2 + r8], m0
684
+
685
+ movu ym0, [r0 + r6 + mmsize]
686
+ movu ym1, [r1 + r7 + mmsize]
687
+ paddw ym0, ym1
688
+ pmulhrsw ym0, ym3
689
+ paddw ym0, ym4
690
+ pmaxsw ym0, ym2
691
+ pminsw ym0, ym5
692
+ movu [r2 + r8 + mmsize], ym0
693
+%endmacro
694
+;-----------------------------------------------------------------------------
695
+;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
696
+;-----------------------------------------------------------------------------
697
+%if ARCH_X86_64
698
+INIT_ZMM avx512
699
+cglobal addAvg_aligned_16x4, 6,9,6
700
+ vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
701
+ vbroadcasti32x8 m5, [pw_pixel_max]
702
+ vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR]
703
+ pxor m2, m2
704
+ add r3, r3
705
+ add r4, r4
706
+ add r5, r5
707
+ lea r6, [3 * r3]
708
+ lea r7, [3 * r4]
709
+ lea r8, [3 * r5]
710
+ PROCESS_ADDAVG_ALIGNED_16x4_HBD_AVX512
711
+ RET
712
+%endif
713
+
714
+%macro ADDAVG_ALIGNED_W16_HBD_AVX512 1
715
+INIT_ZMM avx512
716
+cglobal addAvg_aligned_16x%1, 6,9,6
717
+ vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
718
+ vbroadcasti32x8 m5, [pw_pixel_max]
719
+ vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR]
720
+ pxor m2, m2
721
+ add r3, r3
722
+ add r4, r4
723
+ add r5, r5
724
+ lea r6, [3 * r3]
725
+ lea r7, [3 * r4]
726
+ lea r8, [3 * r5]
727
+
728
+%rep %1/4 - 1
729
+ PROCESS_ADDAVG_ALIGNED_16x4_HBD_AVX512
730
+ lea r2, [r2 + 4 * r5]
731
+ lea r0, [r0 + 4 * r3]
732
+ lea r1, [r1 + 4 * r4]
733
+%endrep
734
+ PROCESS_ADDAVG_ALIGNED_16x4_HBD_AVX512
735
+ RET
736
+%endmacro
737
+
738
+%if ARCH_X86_64
739
+ADDAVG_ALIGNED_W16_HBD_AVX512 8
740
+ADDAVG_ALIGNED_W16_HBD_AVX512 12
741
+ADDAVG_ALIGNED_W16_HBD_AVX512 16
742
+ADDAVG_ALIGNED_W16_HBD_AVX512 24
743
+ADDAVG_ALIGNED_W16_HBD_AVX512 32
744
+ADDAVG_ALIGNED_W16_HBD_AVX512 64
745
+%endif
746
+
747
+%macro ADDAVG_ALIGNED_W32_HBD_AVX512 1
748
+INIT_ZMM avx512
749
+cglobal addAvg_aligned_32x%1, 6,9,6
750
+ vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
751
+ vbroadcasti32x8 m5, [pw_pixel_max]
752
+ vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR]
753
+ pxor m2, m2
754
+ add r3, r3
755
+ add r4, r4
756
+ add r5, r5
757
+ lea r6, [3 * r3]
758
+ lea r7, [3 * r4]
759
+ lea r8, [3 * r5]
760
+
761
+%rep %1/4 - 1
762
+ PROCESS_ADDAVG_ALIGNED_32x4_HBD_AVX512
763
+ lea r2, [r2 + 4 * r5]
764
+ lea r0, [r0 + 4 * r3]
765
+ lea r1, [r1 + 4 * r4]
766
+%endrep
767
+ PROCESS_ADDAVG_ALIGNED_32x4_HBD_AVX512
768
+ RET
769
+%endmacro
770
+
771
+%if ARCH_X86_64
772
+ADDAVG_ALIGNED_W32_HBD_AVX512 8
773
+ADDAVG_ALIGNED_W32_HBD_AVX512 16
774
+ADDAVG_ALIGNED_W32_HBD_AVX512 24
775
+ADDAVG_ALIGNED_W32_HBD_AVX512 32
776
+ADDAVG_ALIGNED_W32_HBD_AVX512 48
777
+ADDAVG_ALIGNED_W32_HBD_AVX512 64
778
+%endif
779
+
780
+%macro ADDAVG_ALIGNED_W64_HBD_AVX512 1
781
+INIT_ZMM avx512
782
+cglobal addAvg_aligned_64x%1, 6,9,6
783
+ vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
784
+ vbroadcasti32x8 m5, [pw_pixel_max]
785
+ vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR]
786
+ pxor m2, m2
787
+ add r3, r3
788
+ add r4, r4
789
+ add r5, r5
790
+ lea r6, [3 * r3]
791
+ lea r7, [3 * r4]
792
+ lea r8, [3 * r5]
793
+
794
+%rep %1/4 - 1
795
+ PROCESS_ADDAVG_ALIGNED_64x4_HBD_AVX512
796
+ lea r2, [r2 + 4 * r5]
797
+ lea r0, [r0 + 4 * r3]
798
+ lea r1, [r1 + 4 * r4]
799
+%endrep
800
+ PROCESS_ADDAVG_ALIGNED_64x4_HBD_AVX512
801
+ RET
802
+%endmacro
803
+
804
+%if ARCH_X86_64
805
+ADDAVG_ALIGNED_W64_HBD_AVX512 16
806
+ADDAVG_ALIGNED_W64_HBD_AVX512 32
807
+ADDAVG_ALIGNED_W64_HBD_AVX512 48
808
+ADDAVG_ALIGNED_W64_HBD_AVX512 64
809
+%endif
810
+
811
+%if ARCH_X86_64
812
+INIT_ZMM avx512
813
+cglobal addAvg_aligned_48x64, 6,9,6
814
+ vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
815
+ vbroadcasti32x8 m5, [pw_pixel_max]
816
+ vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR]
817
+ pxor m2, m2
818
+ add r3, r3
819
+ add r4, r4
820
+ add r5, r5
821
+ lea r6, [3 * r3]
822
+ lea r7, [3 * r4]
823
+ lea r8, [3 * r5]
824
+
825
+%rep 15
826
+ PROCESS_ADDAVG_ALIGNED_48x4_HBD_AVX512
827
+ lea r2, [r2 + 4 * r5]
828
+ lea r0, [r0 + 4 * r3]
829
+ lea r1, [r1 + 4 * r4]
830
+%endrep
831
+ PROCESS_ADDAVG_ALIGNED_48x4_HBD_AVX512
832
+ RET
833
+%endif
834
+;-----------------------------------------------------------------------------
835
+;addAvg avx512 high bit depth code end
836
+;-----------------------------------------------------------------------------
837
;-----------------------------------------------------------------------------
838
%else ; !HIGH_BIT_DEPTH
839
;-----------------------------------------------------------------------------
840
841
;-----------------------------------------------------------------------------
842
; addAvg avx2 code end
843
;-----------------------------------------------------------------------------
844
+; addAvg avx512 code start
845
+;-----------------------------------------------------------------------------
846
+%macro PROCESS_ADDAVG_64x2_AVX512 0
847
+ movu m0, [r0]
848
+ movu m1, [r1]
849
+ movu m2, [r0 + mmsize]
850
+ movu m3, [r1 + mmsize]
851
+
852
+ paddw m0, m1
853
+ pmulhrsw m0, m4
854
+ paddw m0, m5
855
+ paddw m2, m3
856
+ pmulhrsw m2, m4
857
+ paddw m2, m5
858
859
+ packuswb m0, m2
860
+ vpermq m0, m6, m0
861
+ movu [r2], m0
862
+
863
+ movu m0, [r0 + r3]
864
+ movu m1, [r1 + r4]
865
+ movu m2, [r0 + r3 + mmsize]
866
+ movu m3, [r1 + r4 + mmsize]
867
+
868
+ paddw m0, m1
869
+ pmulhrsw m0, m4
870
+ paddw m0, m5
871
+ paddw m2, m3
872
+ pmulhrsw m2, m4
873
+ paddw m2, m5
874
+
875
+ packuswb m0, m2
876
+ vpermq m0, m6, m0
877
+ movu [r2 + r5], m0
878
+%endmacro
879
+
880
+%macro PROCESS_ADDAVG_32x2_AVX512 0 ; avg 2 rows x 32 pixels, int16 inputs -> 8-bit output
881
+ movu m0, [r0] ; row 0: 32 int16 samples from src0
882
+ movu m1, [r1] ; row 0: 32 int16 samples from src1
883
+ movu m2, [r0 + r3] ; row 1 (r3/r4 = byte strides, pre-doubled by caller)
884
+ movu m3, [r1 + r4]
885
+
886
+ paddw m0, m1 ; row 0: s0 + s1
887
+ pmulhrsw m0, m4 ; m4 = pw_256: rounded high-half multiply (scale)
888
+ paddw m0, m5 ; m5 = pw_128: post-scale offset
889
+ paddw m2, m3 ; row 1, same arithmetic
890
+ pmulhrsw m2, m4
891
+ paddw m2, m5
892
+
893
+ packuswb m0, m2 ; saturate both rows to unsigned bytes (per 128-bit lane)
894
+ vpermq m0, m6, m0 ; m6 = shuf_avx512: restore cross-lane order after in-lane pack
895
+ movu [r2], ym0 ; row 0 -> dst
896
+ vextracti32x8 [r2 + r5], m0, 1 ; row 1 -> dst + dstStride
897
+%endmacro
898
+;--------------------------------------------------------------------------------------------------------------------
899
+;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
900
+;--------------------------------------------------------------------------------------------------------------------
901
+%macro ADDAVG_W64_AVX512 1 ; emit addAvg_64xH (8-bit output) for H = %1; H must be a multiple of 2
902
+INIT_ZMM avx512
903
+cglobal addAvg_64x%1, 6,6,7 ; addAvg(int16_t *src0, int16_t *src1, pixel *dst, src0Stride, src1Stride, dstStride)
904
+ vbroadcasti32x8 m4, [pw_256] ; pmulhrsw scale
905
+ vbroadcasti32x8 m5, [pw_128] ; post-scale offset
906
+ mova m6, [shuf_avx512] ; permutation to undo in-lane packuswb ordering
907
+
908
+ add r3, r3 ; int16 sources: pixel strides -> byte strides
909
+ add r4, r4 ; (dst stride r5 stays as-is: 8-bit pixels)
910
+
911
+%rep %1/2 - 1 ; every 2-row strip but the last advances the pointers
912
+ PROCESS_ADDAVG_64x2_AVX512
913
+ lea r2, [r2 + 2 * r5]
914
+ lea r0, [r0 + 2 * r3]
915
+ lea r1, [r1 + 2 * r4]
916
+%endrep
917
+ PROCESS_ADDAVG_64x2_AVX512 ; final strip
918
+ RET
919
+%endmacro
920
+
921
+ADDAVG_W64_AVX512 16
922
+ADDAVG_W64_AVX512 32
923
+ADDAVG_W64_AVX512 48
924
+ADDAVG_W64_AVX512 64
925
+
926
+%macro ADDAVG_W32_AVX512 1
927
+INIT_ZMM avx512
928
+cglobal addAvg_32x%1, 6,6,7
929
+ vbroadcasti32x8 m4, [pw_256]
930
+ vbroadcasti32x8 m5, [pw_128]
931
+ mova m6, [shuf_avx512]
932
+ add r3, r3
933
+ add r4, r4
934
+
935
+%rep %1/2 - 1
936
+ PROCESS_ADDAVG_32x2_AVX512
937
+ lea r2, [r2 + 2 * r5]
938
+ lea r0, [r0 + 2 * r3]
939
+ lea r1, [r1 + 2 * r4]
940
+%endrep
941
+ PROCESS_ADDAVG_32x2_AVX512
942
+ RET
943
+%endmacro
944
+
945
+ADDAVG_W32_AVX512 8
946
+ADDAVG_W32_AVX512 16
947
+ADDAVG_W32_AVX512 24
948
+ADDAVG_W32_AVX512 32
949
+ADDAVG_W32_AVX512 48
950
+ADDAVG_W32_AVX512 64
951
+
952
+%macro PROCESS_ADDAVG_ALIGNED_64x2_AVX512 0
953
+ mova m0, [r0]
954
+ mova m1, [r1]
955
+ mova m2, [r0 + mmsize]
956
+ mova m3, [r1 + mmsize]
957
+
958
+ paddw m0, m1
959
+ pmulhrsw m0, m4
960
+ paddw m0, m5
961
+ paddw m2, m3
962
+ pmulhrsw m2, m4
963
+ paddw m2, m5
964
+
965
+ packuswb m0, m2
966
+ vpermq m0, m6, m0
967
+ mova [r2], m0
968
+
969
+ mova m0, [r0 + r3]
970
+ mova m1, [r1 + r4]
971
+ mova m2, [r0 + r3 + mmsize]
972
+ mova m3, [r1 + r4 + mmsize]
973
+
974
+ paddw m0, m1
975
+ pmulhrsw m0, m4
976
+ paddw m0, m5
977
+ paddw m2, m3
978
+ pmulhrsw m2, m4
979
+ paddw m2, m5
980
+
981
+ packuswb m0, m2
982
+ vpermq m0, m6, m0
983
+ mova [r2 + r5], m0
984
+%endmacro
985
+
986
+%macro PROCESS_ADDAVG_ALIGNED_32x2_AVX512 0
987
+ mova m0, [r0]
988
+ mova m1, [r1]
989
+ mova m2, [r0 + r3]
990
+ mova m3, [r1 + r4]
991
+
992
+ paddw m0, m1
993
+ pmulhrsw m0, m4
994
+ paddw m0, m5
995
+ paddw m2, m3
996
+ pmulhrsw m2, m4
997
+ paddw m2, m5
998
+
999
+ packuswb m0, m2
1000
+ vpermq m0, m6, m0
1001
+ mova [r2], ym0
1002
+ vextracti32x8 [r2 + r5], m0, 1
1003
+%endmacro
1004
+;--------------------------------------------------------------------------------------------------------------------
1005
+;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
1006
+;--------------------------------------------------------------------------------------------------------------------
1007
+%macro ADDAVG_ALIGNED_W64_AVX512 1
1008
+INIT_ZMM avx512
1009
+cglobal addAvg_aligned_64x%1, 6,6,7
1010
+ vbroadcasti32x8 m4, [pw_256]
1011
+ vbroadcasti32x8 m5, [pw_128]
1012
+ mova m6, [shuf_avx512]
1013
+
1014
+ add r3, r3
1015
+ add r4, r4
1016
+
1017
+%rep %1/2 - 1
1018
+ PROCESS_ADDAVG_ALIGNED_64x2_AVX512
1019
+ lea r2, [r2 + 2 * r5]
1020
+ lea r0, [r0 + 2 * r3]
1021
+ lea r1, [r1 + 2 * r4]
1022
+%endrep
1023
+ PROCESS_ADDAVG_ALIGNED_64x2_AVX512
1024
+ RET
1025
+%endmacro
1026
+
1027
+ADDAVG_ALIGNED_W64_AVX512 16
1028
+ADDAVG_ALIGNED_W64_AVX512 32
1029
+ADDAVG_ALIGNED_W64_AVX512 48
1030
+ADDAVG_ALIGNED_W64_AVX512 64
1031
+
1032
+%macro ADDAVG_ALIGNED_W32_AVX512 1
1033
+INIT_ZMM avx512
1034
+cglobal addAvg_aligned_32x%1, 6,6,7
1035
+ vbroadcasti32x8 m4, [pw_256]
1036
+ vbroadcasti32x8 m5, [pw_128]
1037
+ mova m6, [shuf_avx512]
1038
+ add r3, r3
1039
+ add r4, r4
1040
+
1041
+%rep %1/2 - 1
1042
+ PROCESS_ADDAVG_ALIGNED_32x2_AVX512
1043
+ lea r2, [r2 + 2 * r5]
1044
+ lea r0, [r0 + 2 * r3]
1045
+ lea r1, [r1 + 2 * r4]
1046
+%endrep
1047
+ PROCESS_ADDAVG_ALIGNED_32x2_AVX512
1048
+ RET
1049
+%endmacro
1050
+
1051
+ADDAVG_ALIGNED_W32_AVX512 8
1052
+ADDAVG_ALIGNED_W32_AVX512 16
1053
+ADDAVG_ALIGNED_W32_AVX512 24
1054
+ADDAVG_ALIGNED_W32_AVX512 32
1055
+ADDAVG_ALIGNED_W32_AVX512 48
1056
+ADDAVG_ALIGNED_W32_AVX512 64
1057
+;-----------------------------------------------------------------------------
1058
+; addAvg avx512 code end
1059
;-----------------------------------------------------------------------------
1060
%macro ADDAVG_W24_H2 2
1061
INIT_XMM sse4
1062
1063
%endmacro
1064
%endif
1065
1066
-%macro AVG_END 0
1067
- lea t4, [t4+t5*2*SIZEOF_PIXEL]
1068
+%macro AVG_END 0-1 2;rows
1069
lea t2, [t2+t3*2*SIZEOF_PIXEL]
1070
+ lea t4, [t4+t5*2*SIZEOF_PIXEL]
1071
lea t0, [t0+t1*2*SIZEOF_PIXEL]
1072
- sub eax, 2
1073
+ sub eax, %1
1074
jg .height_loop
1075
%ifidn movu,movq ; detect MMX
1076
EMMS
1077
1078
%endmacro
1079
1080
%macro BIWEIGHT_START_SSSE3 0
1081
- movzx t6d, byte r6m ; FIXME x86_64
1082
- mov t7d, 64
1083
- sub t7d, t6d
1084
- shl t7d, 8
1085
- add t6d, t7d
1086
- mova m4, [pw_512]
1087
- movd xm3, t6d
1088
+ movzx t6d, byte r6m ; FIXME x86_64
1089
+%if mmsize > 16
1090
+ vbroadcasti128 m4, [pw_512]
1091
+%else
1092
+ mova m4, [pw_512]
1093
+%endif
1094
+ lea t7d, [t6+(64<<8)]
1095
+ shl t6d, 8
1096
+ sub t7d, t6d
1097
+%if cpuflag(avx512)
1098
+ vpbroadcastw m3, t7d
1099
+%else
1100
+ movd xm3, t7d
1101
%if cpuflag(avx2)
1102
- vpbroadcastw m3, xm3
1103
+ vpbroadcastw m3, xm3
1104
%else
1105
- SPLATW m3, m3 ; weight_dst,src
1106
+ SPLATW m3, m3 ; weight_dst,src
1107
+%endif
1108
%endif
1109
%endmacro
1110
1111
1112
AVG_WEIGHT 24, 7
1113
AVG_WEIGHT 48, 7
1114
1115
+INIT_YMM avx512
1116
+cglobal pixel_avg_weight_w8
1117
+ BIWEIGHT_START
1118
+ kxnorb k1, k1, k1
1119
+ kaddb k1, k1, k1
1120
+ AVG_START 5
1121
+.height_loop:
1122
+ movq xm0, [t2]
1123
+ movq xm2, [t4]
1124
+ movq xm1, [t2+t3]
1125
+ movq xm5, [t4+t5]
1126
+ lea t2, [t2+t3*2]
1127
+ lea t4, [t4+t5*2]
1128
+ vpbroadcastq m0 {k1}, [t2]
1129
+ vpbroadcastq m2 {k1}, [t4]
1130
+ vpbroadcastq m1 {k1}, [t2+t3]
1131
+ vpbroadcastq m5 {k1}, [t4+t5]
1132
+ punpcklbw m0, m2
1133
+ punpcklbw m1, m5
1134
+ pmaddubsw m0, m3
1135
+ pmaddubsw m1, m3
1136
+ pmulhrsw m0, m4
1137
+ pmulhrsw m1, m4
1138
+ packuswb m0, m1
1139
+ vextracti128 xmm1, m0, 1
1140
+ movq [t0], xm0
1141
+ movhps [t0+t1], xm0
1142
+ lea t0, [t0+t1*2]
1143
+ movq [t0], xmm1
1144
+ movhps [t0+t1], xmm1
1145
+ AVG_END 4
1146
+
1147
INIT_YMM avx2
1148
cglobal pixel_avg_weight_w16
1149
BIWEIGHT_START
1150
1151
vextracti128 [t0+t1], m0, 1
1152
AVG_END
1153
1154
+INIT_ZMM avx512
1155
+ cglobal pixel_avg_weight_w16
1156
+ BIWEIGHT_START
1157
+ AVG_START 5
1158
+.height_loop:
1159
+ movu xm0, [t2]
1160
+ movu xm1, [t4]
1161
+ vinserti128 ym0, [t2+t3], 1
1162
+ vinserti128 ym1, [t4+t5], 1
1163
+ lea t2, [t2+t3*2]
1164
+ lea t4, [t4+t5*2]
1165
+ vinserti32x4 m0, [t2], 2
1166
+ vinserti32x4 m1, [t4], 2
1167
+ vinserti32x4 m0, [t2+t3], 3
1168
+ vinserti32x4 m1, [t4+t5], 3
1169
+ SBUTTERFLY bw, 0, 1, 2
1170
+ pmaddubsw m0, m3
1171
+ pmaddubsw m1, m3
1172
+ pmulhrsw m0, m4
1173
+ pmulhrsw m1, m4
1174
+ packuswb m0, m1
1175
+ mova [t0], xm0
1176
+ vextracti128 [t0+t1], ym0, 1
1177
+ lea t0, [t0+t1*2]
1178
+ vextracti32x4 [t0], m0, 2
1179
+ vextracti32x4 [t0+t1], m0, 3
1180
+ AVG_END 4
1181
+
1182
+INIT_YMM avx2
1183
cglobal pixel_avg_weight_w32
1184
BIWEIGHT_START
1185
AVG_START 5
1186
1187
mova [t0], m0
1188
AVG_END
1189
1190
+INIT_YMM avx2
1191
cglobal pixel_avg_weight_w64
1192
BIWEIGHT_START
1193
AVG_START 5
1194
1195
AVGH 16, 8
1196
AVGH 16, 4
1197
1198
+INIT_XMM avx512
1199
+AVGH 16, 64
1200
+AVGH 16, 32
1201
+AVGH 16, 16
1202
+AVGH 16, 12
1203
+AVGH 16, 8
1204
+AVGH 16, 4
1205
+AVGH 8, 32
1206
+AVGH 8, 16
1207
+AVGH 8, 8
1208
+AVGH 8, 4
1209
+
1210
%endif ;HIGH_BIT_DEPTH
1211
1212
;-------------------------------------------------------------------------------------------------------------------------------
1213
1214
RET
1215
%endif
1216
1217
+;-----------------------------------------------------------------------------
1218
+;pixel_avg_pp avx512 code start
1219
+;-----------------------------------------------------------------------------
1220
+%macro PROCESS_PIXELAVG_64x4_AVX512 0 ; dst = pavgb(src0, src1) for 4 rows x 64 bytes (rounded unsigned byte average)
1221
+ movu m0, [r2] ; src0 rows 0-1 (r2 = src0, r3 = sstride0)
1222
+ movu m2, [r2 + r3]
1223
+ movu m1, [r4] ; src1 rows 0-1 (r4 = src1, r5 = sstride1)
1224
+ movu m3, [r4 + r5]
1225
+ pavgb m0, m1 ; (a + b + 1) >> 1 per byte
1226
+ pavgb m2, m3
1227
+ movu [r0], m0 ; dst rows 0-1 (r0 = dst, r1 = dstride)
1228
+ movu [r0 + r1], m2
1229
+
1230
+ movu m0, [r2 + 2 * r3] ; rows 2-3; r6/r7/r8 = 3*dstride/3*sstride0/3*sstride1 (caller)
1231
+ movu m2, [r2 + r7]
1232
+ movu m1, [r4 + 2 * r5]
1233
+ movu m3, [r4 + r8]
1234
+ pavgb m0, m1
1235
+ pavgb m2, m3
1236
+ movu [r0 + 2 * r1], m0
1237
+ movu [r0 + r6], m2
1238
+%endmacro
1239
+
1240
+;-------------------------------------------------------------------------------------------------------------------------------
1241
+;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
1242
+;-------------------------------------------------------------------------------------------------------------------------------
1243
+%if ARCH_X86_64 && BIT_DEPTH == 8
1244
+%macro PIXEL_AVG_64xN_AVX512 1 ; emit pixel_avg_64xH for H = %1; H must be a multiple of 4 (8-bit only, see BIT_DEPTH guard above)
1245
+INIT_ZMM avx512
1246
+cglobal pixel_avg_64x%1, 6, 9, 4 ; pixelavg_pp(pixel *dst, dstride, src0, sstride0, src1, sstride1, int)
1247
+ lea r6, [3 * r1] ; 3*dstride
1248
+ lea r7, [3 * r3] ; 3*sstride0
1249
+ lea r8, [3 * r5] ; 3*sstride1
1250
+
1251
+%rep %1/4 - 1 ; every 4-row strip except the last advances the pointers
1252
+ PROCESS_PIXELAVG_64x4_AVX512
1253
+ lea r2, [r2 + r3 * 4]
1254
+ lea r4, [r4 + r5 * 4]
1255
+ lea r0, [r0 + r1 * 4]
1256
+%endrep
1257
+ PROCESS_PIXELAVG_64x4_AVX512 ; final strip
1258
+ RET
1259
+%endmacro
1260
+
1261
+PIXEL_AVG_64xN_AVX512 16
1262
+PIXEL_AVG_64xN_AVX512 32
1263
+PIXEL_AVG_64xN_AVX512 48
1264
+PIXEL_AVG_64xN_AVX512 64
1265
+%endif
1266
+;-----------------------------------------------------------------------------
1267
+;pixel_avg_pp avx512 code end
1268
+;-----------------------------------------------------------------------------
1269
;=============================================================================
1270
; pixel avg2
1271
;=============================================================================
1272
1273
RET
1274
%endif
1275
1276
+;-----------------------------------------------------------------------------
1277
+;pixel_avg_pp avx512 high bit depth code start
1278
+;-----------------------------------------------------------------------------
1279
+%macro PROCESS_PIXELAVG_32x8_HBD_AVX512 0
1280
+ movu m0, [r2]
1281
+ movu m1, [r4]
1282
+ movu m2, [r2 + r3]
1283
+ movu m3, [r4 + r5]
1284
+ pavgw m0, m1
1285
+ pavgw m2, m3
1286
+ movu [r0], m0
1287
+ movu [r0 + r1], m2
1288
+
1289
+ movu m0, [r2 + r3 * 2]
1290
+ movu m1, [r4 + r5 * 2]
1291
+ movu m2, [r2 + r6]
1292
+ movu m3, [r4 + r7]
1293
+ pavgw m0, m1
1294
+ pavgw m2, m3
1295
+ movu [r0 + r1 * 2], m0
1296
+ movu [r0 + r8], m2
1297
+
1298
+ lea r0, [r0 + 4 * r1]
1299
+ lea r2, [r2 + 4 * r3]
1300
+ lea r4, [r4 + 4 * r5]
1301
+
1302
+ movu m0, [r2]
1303
+ movu m1, [r4]
1304
+ movu m2, [r2 + r3]
1305
+ movu m3, [r4 + r5]
1306
+ pavgw m0, m1
1307
+ pavgw m2, m3
1308
+ movu [r0], m0
1309
+ movu [r0 + r1], m2
1310
+
1311
+ movu m0, [r2 + r3 * 2]
1312
+ movu m1, [r4 + r5 * 2]
1313
+ movu m2, [r2 + r6]
1314
+ movu m3, [r4 + r7]
1315
+ pavgw m0, m1
1316
+ pavgw m2, m3
1317
+ movu [r0 + r1 * 2], m0
1318
+ movu [r0 + r8], m2
1319
+%endmacro
1320
+%macro PROCESS_PIXELAVG_ALIGNED_32x8_HBD_AVX512 0
1321
+ mova m0, [r2]
1322
+ mova m1, [r4]
1323
+ mova m2, [r2 + r3]
1324
+ mova m3, [r4 + r5]
1325
+ pavgw m0, m1
1326
+ pavgw m2, m3
1327
+ mova [r0], m0
1328
+ mova [r0 + r1], m2
1329
+
1330
+ mova m0, [r2 + r3 * 2]
1331
+ mova m1, [r4 + r5 * 2]
1332
+ mova m2, [r2 + r6]
1333
+ mova m3, [r4 + r7]
1334
+ pavgw m0, m1
1335
+ pavgw m2, m3
1336
+ mova [r0 + r1 * 2], m0
1337
+ mova [r0 + r8], m2
1338
+
1339
+ lea r0, [r0 + 4 * r1]
1340
+ lea r2, [r2 + 4 * r3]
1341
+ lea r4, [r4 + 4 * r5]
1342
+
1343
+ mova m0, [r2]
1344
+ mova m1, [r4]
1345
+ mova m2, [r2 + r3]
1346
+ mova m3, [r4 + r5]
1347
+ pavgw m0, m1
1348
+ pavgw m2, m3
1349
+ mova [r0], m0
1350
+ mova [r0 + r1], m2
1351
+
1352
+ mova m0, [r2 + r3 * 2]
1353
+ mova m1, [r4 + r5 * 2]
1354
+ mova m2, [r2 + r6]
1355
+ mova m3, [r4 + r7]
1356
+ pavgw m0, m1
1357
+ pavgw m2, m3
1358
+ mova [r0 + r1 * 2], m0
1359
+ mova [r0 + r8], m2
1360
+%endmacro
1361
+
1362
+%macro PROCESS_PIXELAVG_64x8_HBD_AVX512 0
1363
+ movu m0, [r2]
1364
+ movu m1, [r4]
1365
+ movu m2, [r2 + r3]
1366
+ movu m3, [r4 + r5]
1367
+ pavgw m0, m1
1368
+ pavgw m2, m3
1369
+ movu [r0], m0
1370
+ movu [r0 + r1], m2
1371
+
1372
+ movu m0, [r2 + mmsize]
1373
+ movu m1, [r4 + mmsize]
1374
+ movu m2, [r2 + r3 + mmsize]
1375
+ movu m3, [r4 + r5 + mmsize]
1376
+ pavgw m0, m1
1377
+ pavgw m2, m3
1378
+ movu [r0 + mmsize], m0
1379
+ movu [r0 + r1 + mmsize], m2
1380
+
1381
+ movu m0, [r2 + r3 * 2]
1382
+ movu m1, [r4 + r5 * 2]
1383
+ movu m2, [r2 + r6]
1384
+ movu m3, [r4 + r7]
1385
+ pavgw m0, m1
1386
+ pavgw m2, m3
1387
+ movu [r0 + r1 * 2], m0
1388
+ movu [r0 + r8], m2
1389
+
1390
+ movu m0, [r2 + r3 * 2 + mmsize]
1391
+ movu m1, [r4 + r5 * 2 + mmsize]
1392
+ movu m2, [r2 + r6 + mmsize]
1393
+ movu m3, [r4 + r7 + mmsize]
1394
+ pavgw m0, m1
1395
+ pavgw m2, m3
1396
+ movu [r0 + r1 * 2 + mmsize], m0
1397
+ movu [r0 + r8 + mmsize], m2
1398
+
1399
+ lea r0, [r0 + 4 * r1]
1400
+ lea r2, [r2 + 4 * r3]
1401
+ lea r4, [r4 + 4 * r5]
1402
+
1403
+ movu m0, [r2]
1404
+ movu m1, [r4]
1405
+ movu m2, [r2 + r3]
1406
+ movu m3, [r4 + r5]
1407
+ pavgw m0, m1
1408
+ pavgw m2, m3
1409
+ movu [r0], m0
1410
+ movu [r0 + r1], m2
1411
+
1412
+ movu m0, [r2 + mmsize]
1413
+ movu m1, [r4 + mmsize]
1414
+ movu m2, [r2 + r3 + mmsize]
1415
+ movu m3, [r4 + r5 + mmsize]
1416
+ pavgw m0, m1
1417
+ pavgw m2, m3
1418
+ movu [r0 + mmsize], m0
1419
+ movu [r0 + r1 + mmsize], m2
1420
+
1421
+ movu m0, [r2 + r3 * 2]
1422
+ movu m1, [r4 + r5 * 2]
1423
+ movu m2, [r2 + r6]
1424
+ movu m3, [r4 + r7]
1425
+ pavgw m0, m1
1426
+ pavgw m2, m3
1427
+ movu [r0 + r1 * 2], m0
1428
+ movu [r0 + r8], m2
1429
+
1430
+ movu m0, [r2 + r3 * 2 + mmsize]
1431
+ movu m1, [r4 + r5 * 2 + mmsize]
1432
+ movu m2, [r2 + r6 + mmsize]
1433
+ movu m3, [r4 + r7 + mmsize]
1434
+ pavgw m0, m1
1435
+ pavgw m2, m3
1436
+ movu [r0 + r1 * 2 + mmsize], m0
1437
+ movu [r0 + r8 + mmsize], m2
1438
+%endmacro
1439
+%macro PROCESS_PIXELAVG_ALIGNED_64x8_HBD_AVX512 0
1440
+ mova m0, [r2]
1441
+ mova m1, [r4]
1442
+ mova m2, [r2 + r3]
1443
+ mova m3, [r4 + r5]
1444
+ pavgw m0, m1
1445
+ pavgw m2, m3
1446
+ mova [r0], m0
1447
+ mova [r0 + r1], m2
1448
+
1449
+ mova m0, [r2 + mmsize]
1450
+ mova m1, [r4 + mmsize]
1451
+ mova m2, [r2 + r3 + mmsize]
1452
+ mova m3, [r4 + r5 + mmsize]
1453
+ pavgw m0, m1
1454
+ pavgw m2, m3
1455
+ mova [r0 + mmsize], m0
1456
+ mova [r0 + r1 + mmsize], m2
1457
+
1458
+ mova m0, [r2 + r3 * 2]
1459
+ mova m1, [r4 + r5 * 2]
1460
+ mova m2, [r2 + r6]
1461
+ mova m3, [r4 + r7]
1462
+ pavgw m0, m1
1463
+ pavgw m2, m3
1464
+ mova [r0 + r1 * 2], m0
1465
+ mova [r0 + r8], m2
1466
+
1467
+ mova m0, [r2 + r3 * 2 + mmsize]
1468
+ mova m1, [r4 + r5 * 2 + mmsize]
1469
+ mova m2, [r2 + r6 + mmsize]
1470
+ mova m3, [r4 + r7 + mmsize]
1471
+ pavgw m0, m1
1472
+ pavgw m2, m3
1473
+ mova [r0 + r1 * 2 + mmsize], m0
1474
+ mova [r0 + r8 + mmsize], m2
1475
+
1476
+ lea r0, [r0 + 4 * r1]
1477
+ lea r2, [r2 + 4 * r3]
1478
+ lea r4, [r4 + 4 * r5]
1479
+
1480
+ mova m0, [r2]
1481
+ mova m1, [r4]
1482
+ mova m2, [r2 + r3]
1483
+ mova m3, [r4 + r5]
1484
+ pavgw m0, m1
1485
+ pavgw m2, m3
1486
+ mova [r0], m0
1487
+ mova [r0 + r1], m2
1488
+
1489
+ mova m0, [r2 + mmsize]
1490
+ mova m1, [r4 + mmsize]
1491
+ mova m2, [r2 + r3 + mmsize]
1492
+ mova m3, [r4 + r5 + mmsize]
1493
+ pavgw m0, m1
1494
+ pavgw m2, m3
1495
+ mova [r0 + mmsize], m0
1496
+ mova [r0 + r1 + mmsize], m2
1497
+
1498
+ mova m0, [r2 + r3 * 2]
1499
+ mova m1, [r4 + r5 * 2]
1500
+ mova m2, [r2 + r6]
1501
+ mova m3, [r4 + r7]
1502
+ pavgw m0, m1
1503
+ pavgw m2, m3
1504
+ mova [r0 + r1 * 2], m0
1505
+ mova [r0 + r8], m2
1506
+
1507
+ mova m0, [r2 + r3 * 2 + mmsize]
1508
+ mova m1, [r4 + r5 * 2 + mmsize]
1509
+ mova m2, [r2 + r6 + mmsize]
1510
+ mova m3, [r4 + r7 + mmsize]
1511
+ pavgw m0, m1
1512
+ pavgw m2, m3
1513
+ mova [r0 + r1 * 2 + mmsize], m0
1514
+ mova [r0 + r8 + mmsize], m2
1515
+%endmacro
1516
+
1517
+%macro PROCESS_PIXELAVG_48x8_HBD_AVX512 0
1518
+ movu m0, [r2]
1519
+ movu m1, [r4]
1520
+ movu m2, [r2 + r3]
1521
+ movu m3, [r4 + r5]
1522
+ pavgw m0, m1
1523
+ pavgw m2, m3
1524
+ movu [r0], m0
1525
+ movu [r0 + r1], m2
1526
+
1527
+ movu ym0, [r2 + mmsize]
1528
+ movu ym1, [r4 + mmsize]
1529
+ movu ym2, [r2 + r3 + mmsize]
1530
+ movu ym3, [r4 + r5 + mmsize]
1531
+ pavgw ym0, ym1
1532
+ pavgw ym2, ym3
1533
+ movu [r0 + mmsize], ym0
1534
+ movu [r0 + r1 + mmsize], ym2
1535
+
1536
+ movu m0, [r2 + r3 * 2]
1537
+ movu m1, [r4 + r5 * 2]
1538
+ movu m2, [r2 + r6]
1539
+ movu m3, [r4 + r7]
1540
+ pavgw m0, m1
1541
+ pavgw m2, m3
1542
+ movu [r0 + r1 * 2], m0
1543
+ movu [r0 + r8], m2
1544
+
1545
+ movu ym0, [r2 + r3 * 2 + mmsize]
1546
+ movu ym1, [r4 + r5 * 2 + mmsize]
1547
+ movu ym2, [r2 + r6 + mmsize]
1548
+ movu ym3, [r4 + r7 + mmsize]
1549
+ pavgw ym0, ym1
1550
+ pavgw ym2, ym3
1551
+ movu [r0 + r1 * 2 + mmsize], ym0
1552
+ movu [r0 + r8 + mmsize], ym2
1553
+
1554
+ lea r0, [r0 + 4 * r1]
1555
+ lea r2, [r2 + 4 * r3]
1556
+ lea r4, [r4 + 4 * r5]
1557
+
1558
+ movu m0, [r2]
1559
+ movu m1, [r4]
1560
+ movu m2, [r2 + r3]
1561
+ movu m3, [r4 + r5]
1562
+ pavgw m0, m1
1563
+ pavgw m2, m3
1564
+ movu [r0], m0
1565
+ movu [r0 + r1], m2
1566
+
1567
+ movu ym0, [r2 + mmsize]
1568
+ movu ym1, [r4 + mmsize]
1569
+ movu ym2, [r2 + r3 + mmsize]
1570
+ movu ym3, [r4 + r5 + mmsize]
1571
+ pavgw ym0, ym1
1572
+ pavgw ym2, ym3
1573
+ movu [r0 + mmsize], ym0
1574
+ movu [r0 + r1 + mmsize], ym2
1575
+
1576
+ movu m0, [r2 + r3 * 2]
1577
+ movu m1, [r4 + r5 * 2]
1578
+ movu m2, [r2 + r6]
1579
+ movu m3, [r4 + r7]
1580
+ pavgw m0, m1
1581
+ pavgw m2, m3
1582
+ movu [r0 + r1 * 2], m0
1583
+ movu [r0 + r8], m2
1584
+
1585
+ movu ym0, [r2 + r3 * 2 + mmsize]
1586
+ movu ym1, [r4 + r5 * 2 + mmsize]
1587
+ movu ym2, [r2 + r6 + mmsize]
1588
+ movu ym3, [r4 + r7 + mmsize]
1589
+ pavgw ym0, ym1
1590
+ pavgw ym2, ym3
1591
+ movu [r0 + r1 * 2 + mmsize], ym0
1592
+ movu [r0 + r8 + mmsize], ym2
1593
+%endmacro
1594
+%macro PROCESS_PIXELAVG_ALIGNED_48x8_HBD_AVX512 0
1595
+ mova m0, [r2]
1596
+ mova m1, [r4]
1597
+ mova m2, [r2 + r3]
1598
+ mova m3, [r4 + r5]
1599
+ pavgw m0, m1
1600
+ pavgw m2, m3
1601
+ mova [r0], m0
1602
+ mova [r0 + r1], m2
1603
+
1604
+ mova ym0, [r2 + mmsize]
1605
+ mova ym1, [r4 + mmsize]
1606
+ mova ym2, [r2 + r3 + mmsize]
1607
+ mova ym3, [r4 + r5 + mmsize]
1608
+ pavgw ym0, ym1
1609
+ pavgw ym2, ym3
1610
+ mova [r0 + mmsize], ym0
1611
+ mova [r0 + r1 + mmsize], ym2
1612
+
1613
+ mova m0, [r2 + r3 * 2]
1614
+ mova m1, [r4 + r5 * 2]
1615
+ mova m2, [r2 + r6]
1616
+ mova m3, [r4 + r7]
1617
+ pavgw m0, m1
1618
+ pavgw m2, m3
1619
+ mova [r0 + r1 * 2], m0
1620
+ mova [r0 + r8], m2
1621
+
1622
+ mova ym0, [r2 + r3 * 2 + mmsize]
1623
+ mova ym1, [r4 + r5 * 2 + mmsize]
1624
+ mova ym2, [r2 + r6 + mmsize]
1625
+ mova ym3, [r4 + r7 + mmsize]
1626
+ pavgw ym0, ym1
1627
+ pavgw ym2, ym3
1628
+ mova [r0 + r1 * 2 + mmsize], ym0
1629
+ mova [r0 + r8 + mmsize], ym2
1630
+
1631
+ lea r0, [r0 + 4 * r1]
1632
+ lea r2, [r2 + 4 * r3]
1633
+ lea r4, [r4 + 4 * r5]
1634
+
1635
+ mova m0, [r2]
1636
+ mova m1, [r4]
1637
+ mova m2, [r2 + r3]
1638
+ mova m3, [r4 + r5]
1639
+ pavgw m0, m1
1640
+ pavgw m2, m3
1641
+ mova [r0], m0
1642
+ mova [r0 + r1], m2
1643
+
1644
+ mova ym0, [r2 + mmsize]
1645
+ mova ym1, [r4 + mmsize]
1646
+ mova ym2, [r2 + r3 + mmsize]
1647
+ mova ym3, [r4 + r5 + mmsize]
1648
+ pavgw ym0, ym1
1649
+ pavgw ym2, ym3
1650
+ mova [r0 + mmsize], ym0
1651
+ mova [r0 + r1 + mmsize], ym2
1652
+
1653
+ mova m0, [r2 + r3 * 2]
1654
+ mova m1, [r4 + r5 * 2]
1655
+ mova m2, [r2 + r6]
1656
+ mova m3, [r4 + r7]
1657
+ pavgw m0, m1
1658
+ pavgw m2, m3
1659
+ mova [r0 + r1 * 2], m0
1660
+ mova [r0 + r8], m2
1661
+
1662
+ mova ym0, [r2 + r3 * 2 + mmsize]
1663
+ mova ym1, [r4 + r5 * 2 + mmsize]
1664
+ mova ym2, [r2 + r6 + mmsize]
1665
+ mova ym3, [r4 + r7 + mmsize]
1666
+ pavgw ym0, ym1
1667
+ pavgw ym2, ym3
1668
+ mova [r0 + r1 * 2 + mmsize], ym0
1669
+ mova [r0 + r8 + mmsize], ym2
1670
+%endmacro
1671
+
1672
+%macro PIXEL_AVG_HBD_W32 1
1673
+INIT_ZMM avx512
1674
+cglobal pixel_avg_32x%1, 6,9,4
1675
+ shl r1d, 1
1676
+ shl r3d, 1
1677
+ shl r5d, 1
1678
+ lea r6, [r3 * 3]
1679
+ lea r7, [r5 * 3]
1680
+ lea r8, [r1 * 3]
1681
+
1682
+%rep %1/8 - 1
1683
+ PROCESS_PIXELAVG_32x8_HBD_AVX512
1684
+ lea r0, [r0 + 4 * r1]
1685
+ lea r2, [r2 + 4 * r3]
1686
+ lea r4, [r4 + 4 * r5]
1687
+%endrep
1688
+ PROCESS_PIXELAVG_32x8_HBD_AVX512
1689
+ RET
1690
+%endmacro
1691
+
1692
+%if ARCH_X86_64
1693
+PIXEL_AVG_HBD_W32 8
1694
+PIXEL_AVG_HBD_W32 16
1695
+PIXEL_AVG_HBD_W32 24
1696
+PIXEL_AVG_HBD_W32 32
1697
+PIXEL_AVG_HBD_W32 64
1698
+%endif
1699
+%macro PIXEL_AVG_HBD_ALIGNED_W32 1
1700
+INIT_ZMM avx512
1701
+cglobal pixel_avg_aligned_32x%1, 6,9,4
1702
+ shl r1d, 1
1703
+ shl r3d, 1
1704
+ shl r5d, 1
1705
+ lea r6, [r3 * 3]
1706
+ lea r7, [r5 * 3]
1707
+ lea r8, [r1 * 3]
1708
+
1709
+%rep %1/8 - 1
1710
+ PROCESS_PIXELAVG_ALIGNED_32x8_HBD_AVX512
1711
+ lea r0, [r0 + 4 * r1]
1712
+ lea r2, [r2 + 4 * r3]
1713
+ lea r4, [r4 + 4 * r5]
1714
+%endrep
1715
+ PROCESS_PIXELAVG_ALIGNED_32x8_HBD_AVX512
1716
+ RET
1717
+%endmacro
1718
+
1719
+%if ARCH_X86_64
1720
+PIXEL_AVG_HBD_ALIGNED_W32 8
1721
+PIXEL_AVG_HBD_ALIGNED_W32 16
1722
+PIXEL_AVG_HBD_ALIGNED_W32 24
1723
+PIXEL_AVG_HBD_ALIGNED_W32 32
1724
+PIXEL_AVG_HBD_ALIGNED_W32 64
1725
+%endif
1726
+
1727
+%macro PIXEL_AVG_HBD_W64 1
1728
+INIT_ZMM avx512
1729
+cglobal pixel_avg_64x%1, 6,9,4
1730
+ shl r1d, 1
1731
+ shl r3d, 1
1732
+ shl r5d, 1
1733
+ lea r6, [r3 * 3]
1734
+ lea r7, [r5 * 3]
1735
+ lea r8, [r1 * 3]
1736
+
1737
+%rep %1/8 - 1
1738
+ PROCESS_PIXELAVG_64x8_HBD_AVX512
1739
+ lea r0, [r0 + 4 * r1]
1740
+ lea r2, [r2 + 4 * r3]
1741
+ lea r4, [r4 + 4 * r5]
1742
+%endrep
1743
+ PROCESS_PIXELAVG_64x8_HBD_AVX512
1744
+ RET
1745
+%endmacro
1746
+
1747
+%if ARCH_X86_64
1748
+PIXEL_AVG_HBD_W64 16
1749
+PIXEL_AVG_HBD_W64 32
1750
+PIXEL_AVG_HBD_W64 48
1751
+PIXEL_AVG_HBD_W64 64
1752
+%endif
1753
+%macro PIXEL_AVG_HBD_ALIGNED_W64 1
1754
+INIT_ZMM avx512
1755
+cglobal pixel_avg_aligned_64x%1, 6,9,4
1756
+ shl r1d, 1
1757
+ shl r3d, 1
1758
+ shl r5d, 1
1759
+ lea r6, [r3 * 3]
1760
+ lea r7, [r5 * 3]
1761
+ lea r8, [r1 * 3]
1762
+
1763
+%rep %1/8 - 1
1764
+ PROCESS_PIXELAVG_ALIGNED_64x8_HBD_AVX512
1765
+ lea r0, [r0 + 4 * r1]
1766
+ lea r2, [r2 + 4 * r3]
1767
+ lea r4, [r4 + 4 * r5]
1768
+%endrep
1769
+ PROCESS_PIXELAVG_ALIGNED_64x8_HBD_AVX512
1770
+ RET
1771
+%endmacro
1772
+
1773
+%if ARCH_X86_64
1774
+PIXEL_AVG_HBD_ALIGNED_W64 16
1775
+PIXEL_AVG_HBD_ALIGNED_W64 32
1776
+PIXEL_AVG_HBD_ALIGNED_W64 48
1777
+PIXEL_AVG_HBD_ALIGNED_W64 64
1778
+%endif
1779
+
1780
+%if ARCH_X86_64
1781
+INIT_ZMM avx512
1782
+cglobal pixel_avg_48x64, 6,9,4
1783
+ shl r1d, 1
1784
+ shl r3d, 1
1785
+ shl r5d, 1
1786
+ lea r6, [r3 * 3]
1787
+ lea r7, [r5 * 3]
1788
+ lea r8, [r1 * 3]
1789
+
1790
+%rep 7
1791
+ PROCESS_PIXELAVG_48x8_HBD_AVX512
1792
+ lea r0, [r0 + 4 * r1]
1793
+ lea r2, [r2 + 4 * r3]
1794
+ lea r4, [r4 + 4 * r5]
1795
+%endrep
1796
+ PROCESS_PIXELAVG_48x8_HBD_AVX512
1797
+ RET
1798
+%endif
1799
+
1800
+%if ARCH_X86_64
1801
+INIT_ZMM avx512
1802
+cglobal pixel_avg_aligned_48x64, 6,9,4
1803
+ shl r1d, 1
1804
+ shl r3d, 1
1805
+ shl r5d, 1
1806
+ lea r6, [r3 * 3]
1807
+ lea r7, [r5 * 3]
1808
+ lea r8, [r1 * 3]
1809
+
1810
+%rep 7
1811
+ PROCESS_PIXELAVG_ALIGNED_48x8_HBD_AVX512
1812
+ lea r0, [r0 + 4 * r1]
1813
+ lea r2, [r2 + 4 * r3]
1814
+ lea r4, [r4 + 4 * r5]
1815
+%endrep
1816
+ PROCESS_PIXELAVG_ALIGNED_48x8_HBD_AVX512
1817
+ RET
1818
+%endif
1819
+;-----------------------------------------------------------------------------
1820
+;pixel_avg_pp avx512 high bit depth code end
1821
+;-----------------------------------------------------------------------------
1822
%endif ; HIGH_BIT_DEPTH
1823
1824
%if HIGH_BIT_DEPTH == 0
1825
1826
jg .height_loop
1827
RET
1828
1829
+%if ARCH_X86_64
1830
INIT_YMM avx2
1831
cglobal pixel_avg2_w20, 6,7
1832
sub r2, r4
1833
1834
sub r5d, 2
1835
jg .height_loop
1836
RET
1837
+%endif
1838
1839
; Cacheline split code for processors with high latencies for loads
1840
; split over cache lines. See sad-a.asm for a more detailed explanation.
1841
x265_2.7.tar.gz/source/common/x86/pixel-a.asm -> x265_2.9.tar.gz/source/common/x86/pixel-a.asm
Changed
1567
1
2
times 2 dw 1, -1
3
times 4 dw 1
4
times 2 dw 1, -1
5
+psy_pp_shuff1: dq 0, 1, 8, 9, 4, 5, 12, 13
6
+psy_pp_shuff2: dq 2, 3, 10, 11, 6, 7, 14, 15
7
+psy_pp_shuff3: dq 0, 0, 8, 8, 1, 1, 9, 9
8
9
ALIGN 32
10
transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
11
12
%endif ; ARCH_X86_64=1
13
%endif ; HIGH_BIT_DEPTH
14
15
+%macro SATD_AVX512_LOAD4 2 ; size, opmask
16
+ vpbroadcast%1 m0, [r0]
17
+ vpbroadcast%1 m0 {%2}, [r0+2*r1]
18
+ vpbroadcast%1 m2, [r2]
19
+ vpbroadcast%1 m2 {%2}, [r2+2*r3]
20
+ add r0, r1
21
+ add r2, r3
22
+ vpbroadcast%1 m1, [r0]
23
+ vpbroadcast%1 m1 {%2}, [r0+2*r1]
24
+ vpbroadcast%1 m3, [r2]
25
+ vpbroadcast%1 m3 {%2}, [r2+2*r3]
26
+%endmacro
27
+
28
+%macro SATD_AVX512_LOAD8 5 ; size, halfreg, opmask1, opmask2, opmask3
29
+ vpbroadcast%1 %{2}0, [r0]
30
+ vpbroadcast%1 %{2}0 {%3}, [r0+2*r1]
31
+ vpbroadcast%1 %{2}2, [r2]
32
+ vpbroadcast%1 %{2}2 {%3}, [r2+2*r3]
33
+ vpbroadcast%1 m0 {%4}, [r0+4*r1]
34
+ vpbroadcast%1 m2 {%4}, [r2+4*r3]
35
+ vpbroadcast%1 m0 {%5}, [r0+2*r4]
36
+ vpbroadcast%1 m2 {%5}, [r2+2*r5]
37
+ vpbroadcast%1 %{2}1, [r0+r1]
38
+ vpbroadcast%1 %{2}1 {%3}, [r0+r4]
39
+ vpbroadcast%1 %{2}3, [r2+r3]
40
+ vpbroadcast%1 %{2}3 {%3}, [r2+r5]
41
+ lea r0, [r0+4*r1]
42
+ lea r2, [r2+4*r3]
43
+ vpbroadcast%1 m1 {%4}, [r0+r1]
44
+ vpbroadcast%1 m3 {%4}, [r2+r3]
45
+ vpbroadcast%1 m1 {%5}, [r0+r4]
46
+ vpbroadcast%1 m3 {%5}, [r2+r5]
47
+%endmacro
48
+
49
+%macro SATD_AVX512_PACKED 0
50
+ DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
51
+ SUMSUB_BA w, 0, 1, 2
52
+ SBUTTERFLY qdq, 0, 1, 2
53
+ SUMSUB_BA w, 0, 1, 2
54
+ HMAXABSW2 0, 1, 2, 3
55
+%endmacro
56
+
57
+%macro SATD_AVX512_END 0-1 0 ; sa8d
58
+ paddw m0 {k1}{z}, m1 ; zero-extend to dwords
59
+%if ARCH_X86_64
60
+%if mmsize == 64
61
+ vextracti32x8 ym1, m0, 1
62
+ paddd ym0, ym1
63
+%endif
64
+%if mmsize >= 32
65
+ vextracti128 xm1, ym0, 1
66
+ paddd xmm0, xm0, xm1
67
+%endif
68
+ punpckhqdq xmm1, xmm0, xmm0
69
+ paddd xmm0, xmm1
70
+ movq rax, xmm0
71
+ rorx rdx, rax, 32
72
+%if %1
73
+ lea eax, [rax+rdx+1]
74
+ shr eax, 1
75
+%else
76
+ add eax, edx
77
+%endif
78
+%else
79
+ HADDD m0, m1
80
+ movd eax, xm0
81
+%if %1
82
+ inc eax
83
+ shr eax, 1
84
+%endif
85
+%endif
86
+ RET
87
+%endmacro
88
+
89
+%macro HMAXABSW2 4 ; a, b, tmp1, tmp2
90
+ pabsw m%1, m%1
91
+ pabsw m%2, m%2
92
+ psrldq m%3, m%1, 2
93
+ psrld m%4, m%2, 16
94
+ pmaxsw m%1, m%3
95
+ pmaxsw m%2, m%4
96
+%endmacro
97
+%if HIGH_BIT_DEPTH==0
98
+INIT_ZMM avx512
99
+cglobal pixel_satd_16x8_internal
100
+ vbroadcasti64x4 m6, [hmul_16p]
101
+ kxnorb k2, k2, k2
102
+ mov r4d, 0x55555555
103
+ knotw k2, k2
104
+ kmovd k1, r4d
105
+ lea r4, [3*r1]
106
+ lea r5, [3*r3]
107
+satd_16x8_avx512:
108
+ vbroadcasti128 ym0, [r0]
109
+ vbroadcasti32x4 m0 {k2}, [r0+4*r1] ; 0 0 4 4
110
+ vbroadcasti128 ym4, [r2]
111
+ vbroadcasti32x4 m4 {k2}, [r2+4*r3]
112
+ vbroadcasti128 ym2, [r0+2*r1]
113
+ vbroadcasti32x4 m2 {k2}, [r0+2*r4] ; 2 2 6 6
114
+ vbroadcasti128 ym5, [r2+2*r3]
115
+ vbroadcasti32x4 m5 {k2}, [r2+2*r5]
116
+ DIFF_SUMSUB_SSSE3 0, 4, 2, 5, 6
117
+ vbroadcasti128 ym1, [r0+r1]
118
+ vbroadcasti128 ym4, [r2+r3]
119
+ vbroadcasti128 ym3, [r0+r4]
120
+ vbroadcasti128 ym5, [r2+r5]
121
+ lea r0, [r0+4*r1]
122
+ lea r2, [r2+4*r3]
123
+ vbroadcasti32x4 m1 {k2}, [r0+r1] ; 1 1 5 5
124
+ vbroadcasti32x4 m4 {k2}, [r2+r3]
125
+ vbroadcasti32x4 m3 {k2}, [r0+r4] ; 3 3 7 7
126
+ vbroadcasti32x4 m5 {k2}, [r2+r5]
127
+ DIFF_SUMSUB_SSSE3 1, 4, 3, 5, 6
128
+ HADAMARD4_V 0, 1, 2, 3, 4
129
+ HMAXABSW2 0, 2, 4, 5
130
+ HMAXABSW2 1, 3, 4, 5
131
+ paddw m4, m0, m2 ; m1
132
+ paddw m2, m1, m3 ; m0
133
+ ret
134
+
135
+cglobal pixel_satd_8x8_internal
136
+ vbroadcasti64x4 m4, [hmul_16p]
137
+ mov r4d, 0x55555555
138
+ kmovd k1, r4d ; 01010101
139
+ kshiftlb k2, k1, 5 ; 10100000
140
+ kshiftlb k3, k1, 4 ; 01010000
141
+ lea r4, [3*r1]
142
+ lea r5, [3*r3]
143
+satd_8x8_avx512:
144
+ SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4
145
+ SATD_AVX512_PACKED ; 3 1 3 1 7 5 7 5
146
+ ret
147
+
148
+cglobal pixel_satd_16x8, 4,6
149
+ call pixel_satd_16x8_internal_avx512
150
+ jmp satd_zmm_avx512_end
151
+
152
+cglobal pixel_satd_16x16, 4,6
153
+ call pixel_satd_16x8_internal_avx512
154
+ lea r0, [r0+4*r1]
155
+ lea r2, [r2+4*r3]
156
+ paddw m7, m0, m1
157
+ call satd_16x8_avx512
158
+ paddw m1, m7
159
+ jmp satd_zmm_avx512_end
160
+
161
+cglobal pixel_satd_8x8, 4,6
162
+ call pixel_satd_8x8_internal_avx512
163
+satd_zmm_avx512_end:
164
+ SATD_AVX512_END
165
+
166
+cglobal pixel_satd_8x16, 4,6
167
+ call pixel_satd_8x8_internal_avx512
168
+ lea r0, [r0+4*r1]
169
+ lea r2, [r2+4*r3]
170
+ paddw m5, m0, m1
171
+ call satd_8x8_avx512
172
+ paddw m1, m5
173
+ jmp satd_zmm_avx512_end
174
+
175
+INIT_YMM avx512
176
+cglobal pixel_satd_4x8_internal
177
+ vbroadcasti128 m4, [hmul_4p]
178
+ mov r4d, 0x55550c
179
+ kmovd k2, r4d ; 00001100
180
+ kshiftlb k3, k2, 2 ; 00110000
181
+ kshiftlb k4, k2, 4 ; 11000000
182
+ kshiftrd k1, k2, 8 ; 01010101
183
+ lea r4, [3*r1]
184
+ lea r5, [3*r3]
185
+satd_4x8_avx512:
186
+ SATD_AVX512_LOAD8 d, xm, k2, k3, k4 ; 0 0 2 2 4 4 6 6
187
+satd_ymm_avx512: ; 1 1 3 3 5 5 7 7
188
+ SATD_AVX512_PACKED
189
+ ret
190
+
191
+cglobal pixel_satd_8x4, 4,5
192
+ mova m4, [hmul_16p]
193
+ mov r4d, 0x5555
194
+ kmovw k1, r4d
195
+ SATD_AVX512_LOAD4 q, k1 ; 2 0 2 0
196
+ call satd_ymm_avx512 ; 3 1 3 1
197
+ jmp satd_ymm_avx512_end2
198
+
199
+cglobal pixel_satd_4x8, 4,6
200
+ call pixel_satd_4x8_internal_avx512
201
+satd_ymm_avx512_end:
202
+%if ARCH_X86_64 == 0
203
+ pop r5d
204
+ %assign regs_used 5
205
+%endif
206
+satd_ymm_avx512_end2:
207
+ SATD_AVX512_END
208
+
209
+cglobal pixel_satd_4x16, 4,6
210
+ call pixel_satd_4x8_internal_avx512
211
+ lea r0, [r0+4*r1]
212
+ lea r2, [r2+4*r3]
213
+ paddw m5, m0, m1
214
+ call satd_4x8_avx512
215
+ paddw m1, m5
216
+ jmp satd_ymm_avx512_end
217
+
218
+INIT_XMM avx512
219
+cglobal pixel_satd_4x4, 4,5
220
+ mova m4, [hmul_4p]
221
+ mov r4d, 0x550c
222
+ kmovw k2, r4d
223
+ kshiftrw k1, k2, 8
224
+ SATD_AVX512_LOAD4 d, k2 ; 0 0 2 2
225
+ SATD_AVX512_PACKED ; 1 1 3 3
226
+ SWAP 0, 1
227
+ SATD_AVX512_END
228
+
229
+INIT_ZMM avx512
230
+cglobal pixel_sa8d_8x8, 4,6
231
+ vbroadcasti64x4 m4, [hmul_16p]
232
+ mov r4d, 0x55555555
233
+ kmovd k1, r4d ; 01010101
234
+ kshiftlb k2, k1, 5 ; 10100000
235
+ kshiftlb k3, k1, 4 ; 01010000
236
+ lea r4, [3*r1]
237
+ lea r5, [3*r3]
238
+ SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4
239
+ DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 ; 3 1 3 1 7 5 7 5
240
+ SUMSUB_BA w, 0, 1, 2
241
+ SBUTTERFLY qdq, 0, 1, 2
242
+ SUMSUB_BA w, 0, 1, 2
243
+ shufps m2, m0, m1, q2020
244
+ shufps m1, m0, m1, q3131
245
+ SUMSUB_BA w, 2, 1, 0
246
+ vshufi32x4 m0, m2, m1, q1010
247
+ vshufi32x4 m1, m2, m1, q3232
248
+ SUMSUB_BA w, 0, 1, 2
249
+ HMAXABSW2 0, 1, 2, 3
250
+ SATD_AVX512_END 1
251
+%endif
252
; Input 10bit, Output 8bit
253
;------------------------------------------------------------------------------------------------------------------------
254
;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
255
256
257
.end:
258
RET
259
+INIT_ZMM avx512
260
+cglobal upShift_16, 4,7,4
261
+ mov r4d, r4m
262
+ mov r5d, r5m
263
+ movd xm0, r6m ; m0 = shift
264
+ vbroadcasti32x4 m3, [pw_pixel_max]
265
+ FIX_STRIDES r1d, r3d
266
+ dec r5d
267
+.loopH:
268
+ xor r6d, r6d
269
+.loopW:
270
+ movu m1, [r0 + r6 * SIZEOF_PIXEL]
271
+ psllw m1, xm0
272
+ pand m1, m3
273
+ movu [r2 + r6 * SIZEOF_PIXEL], m1
274
+
275
+ add r6, mmsize / SIZEOF_PIXEL
276
+ cmp r6d, r4d
277
+ jl .loopW
278
+
279
+ ; move to next row
280
+ add r0, r1
281
+ add r2, r3
282
+ dec r5d
283
+ jnz .loopH
284
285
+ ; processing last row of every frame [To handle width which not a multiple of 32]
286
287
+.loop32:
288
+ movu m1, [r0 + (r4 - mmsize/2) * 2]
289
+ psllw m1, xm0
290
+ pand m1, m3
291
+ movu [r2 + (r4 - mmsize/2) * 2], m1
292
+
293
+ sub r4d, mmsize/2
294
+ jz .end
295
+ cmp r4d, mmsize/2
296
+ jge .loop32
297
+
298
+ ; process partial pixels
299
+ movu m1, [r0]
300
+ psllw m1, xm0
301
+ pand m1, m3
302
+ movu [r2], m1
303
+
304
+.end:
305
+ RET
306
;---------------------------------------------------------------------------------------------------------------------
307
;int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
308
;---------------------------------------------------------------------------------------------------------------------
309
310
pabsd m11, m11
311
%endmacro
312
313
+%macro PSY_COST_PP_8x8_AVX512_MAIN12 0
314
+ ; load source and recon pixels
315
+ lea r4, [r1 * 3]
316
+ pmovzxwd ym0, [r0]
317
+ pmovzxwd ym1, [r0 + r1]
318
+ pmovzxwd ym2, [r0 + r1 * 2]
319
+ pmovzxwd ym3, [r0 + r4]
320
+ lea r5, [r0 + r1 * 4]
321
+ pmovzxwd ym4, [r5]
322
+ pmovzxwd ym5, [r5 + r1]
323
+ pmovzxwd ym6, [r5 + r1 * 2]
324
+ pmovzxwd ym7, [r5 + r4]
325
+
326
+ lea r4, [r3 * 3]
327
+ pmovzxwd ym16, [r2]
328
+ pmovzxwd ym17, [r2 + r3]
329
+ pmovzxwd ym18, [r2 + r3 * 2]
330
+ pmovzxwd ym19, [r2 + r4]
331
+ lea r5, [r2 + r3 * 4]
332
+ pmovzxwd ym20, [r5]
333
+ pmovzxwd ym21, [r5 + r3]
334
+ pmovzxwd ym22, [r5 + r3 * 2]
335
+ pmovzxwd ym23, [r5 + r4]
336
+
337
+ vinserti64x4 m0, m0, ym16, 1
338
+ vinserti64x4 m1, m1, ym17, 1
339
+ vinserti64x4 m2, m2, ym18, 1
340
+ vinserti64x4 m3, m3, ym19, 1
341
+ vinserti64x4 m4, m4, ym20, 1
342
+ vinserti64x4 m5, m5, ym21, 1
343
+ vinserti64x4 m6, m6, ym22, 1
344
+ vinserti64x4 m7, m7, ym23, 1
345
+
346
+ ; source + recon SAD
347
+ paddd m8, m0, m1
348
+ paddd m8, m2
349
+ paddd m8, m3
350
+ paddd m8, m4
351
+ paddd m8, m5
352
+ paddd m8, m6
353
+ paddd m8, m7
354
+
355
+ vextracti64x4 ym15, m8, 1
356
+
357
+ vextracti128 xm9, ym8, 1
358
+ paddd ym8, ym9 ; sad_8x8
359
+ movhlps xm9, xm8
360
+ paddd xm8, xm9
361
+ pshuflw xm9, xm8, 0Eh
362
+ paddd xm8, xm9
363
+ psrld ym8, 2
364
+
365
+ vextracti128 xm9, ym15, 1
366
+ paddd ym15, ym9 ; sad_8x8
367
+ movhlps xm9, xm15
368
+ paddd xm15, xm9
369
+ pshuflw xm9, xm15, 0Eh
370
+ paddd xm15, xm9
371
+ psrld ym15, 2
372
+
373
+ ; source and recon SA8D
374
+ psubd m9, m1, m0
375
+ paddd m0, m1
376
+ psubd m1, m3, m2
377
+ paddd m2, m3
378
+ punpckhdq m3, m0, m9
379
+ punpckldq m0, m9
380
+ psubd m9, m3, m0
381
+ paddd m0, m3
382
+ punpckhdq m3, m2, m1
383
+ punpckldq m2, m1
384
+ psubd m10, m3, m2
385
+ paddd m2, m3
386
+ psubd m3, m5, m4
387
+ paddd m4, m5
388
+ psubd m5, m7, m6
389
+ paddd m6, m7
390
+ punpckhdq m1, m4, m3
391
+ punpckldq m4, m3
392
+ psubd m7, m1, m4
393
+ paddd m4, m1
394
+ punpckhdq m3, m6, m5
395
+ punpckldq m6, m5
396
+ psubd m1, m3, m6
397
+ paddd m6, m3
398
+ psubd m3, m2, m0
399
+ paddd m0, m2
400
+ psubd m2, m10, m9
401
+ paddd m9, m10
402
+ punpckhqdq m5, m0, m3
403
+ punpcklqdq m0, m3
404
+ psubd m10, m5, m0
405
+ paddd m0, m5
406
+ punpckhqdq m3, m9, m2
407
+ punpcklqdq m9, m2
408
+ psubd m5, m3, m9
409
+ paddd m9, m3
410
+ psubd m3, m6, m4
411
+ paddd m4, m6
412
+ psubd m6, m1, m7
413
+ paddd m7, m1
414
+ punpckhqdq m2, m4, m3
415
+ punpcklqdq m4, m3
416
+ psubd m1, m2, m4
417
+ paddd m4, m2
418
+ punpckhqdq m3, m7, m6
419
+ punpcklqdq m7, m6
420
+
421
+ psubd m2, m3, m7
422
+ paddd m7, m3
423
+ psubd m3, m4, m0
424
+ paddd m0, m4
425
+ psubd m4, m1, m10
426
+ paddd m10, m1
427
+
428
+ mova m16, m13
429
+ mova m17, m14
430
+ vpermi2q m16, m0, m3
431
+ vpermi2q m17, m0, m3
432
+
433
+ pabsd m17, m17
434
+ pabsd m16, m16
435
+ pmaxsd m17, m16
436
+
437
+ mova m18, m13
438
+ mova m19, m14
439
+ vpermi2q m18, m10, m4
440
+ vpermi2q m19, m10, m4
441
+
442
+ pabsd m19, m19
443
+ pabsd m18, m18
444
+ pmaxsd m19, m18
445
+ psubd m18, m7, m9
446
+ paddd m9, m7
447
+ psubd m7, m2, m5
448
+ paddd m5, m2
449
+
450
+ mova m20, m13
451
+ mova m21, m14
452
+ vpermi2q m20, m9, m18
453
+ vpermi2q m21, m9, m18
454
+
455
+ pabsd m21, m21
456
+ pabsd m20, m20
457
+ pmaxsd m21, m20
458
+
459
+ mova m22, m13
460
+ mova m23, m14
461
+ vpermi2q m22, m5, m7
462
+ vpermi2q m23, m5, m7
463
+
464
+ pabsd m23, m23
465
+ pabsd m22, m22
466
+ pmaxsd m23, m22
467
+ paddd m17, m21
468
+ paddd m17, m19
469
+ paddd m17, m23
470
+
471
+ vextracti64x4 ym26, m17, 1
472
+
473
+ vextracti128 xm9, m17, 1
474
+ paddd ym17, ym9 ; sad_8x8
475
+ movhlps xm9, xm17
476
+ paddd xm17, xm9
477
+ pshuflw xm9, xm17, 0Eh
478
+ paddd xm17, xm9
479
+ paddd ym17, [pd_1]
480
+ psrld ym17, 1 ; sa8d_8x8
481
+
482
+ vextracti128 xm9, ym26, 1
483
+ paddd ym26, ym9 ; sad_8x8
484
+ movhlps xm9, xm26
485
+ paddd xm26, xm9
486
+ pshuflw xm9, xm26, 0Eh
487
+ paddd xm26, xm9
488
+ paddd ym26, [pd_1]
489
+ psrld ym26, 1 ; sa8d_8x8
490
+
491
+
492
+
493
+ psubd ym11, ym17, ym8 ; sa8d_8x8 - sad_8x8
494
+ psubd ym12, ym26, ym15 ; sa8d_8x8 - sad_8x8
495
+
496
+ psubd ym11, ym12
497
+ pabsd ym11, ym11
498
+%endmacro
499
+
500
+%macro PSY_PP_INPUT_AVX512_MAIN10 0
501
+ lea r4, [r1 * 3]
502
+ movu xm0, [r0]
503
+ movu xm1, [r0 + r1]
504
+ movu xm2, [r0 + r1 * 2]
505
+ movu xm3, [r0 + r4]
506
+ lea r5, [r0 + r1 * 4]
507
+ movu xm4, [r5]
508
+ movu xm5, [r5 + r1]
509
+ movu xm6, [r5 + r1 * 2]
510
+ movu xm7, [r5 + r4]
511
+
512
+ lea r4, [r3 * 3]
513
+ vinserti128 ym0, ym0, [r2], 1
514
+ vinserti128 ym1, ym1, [r2 + r3], 1
515
+ vinserti128 ym2, ym2, [r2 + r3 * 2], 1
516
+ vinserti128 ym3, ym3, [r2 + r4], 1
517
+ lea r5, [r2 + r3 * 4]
518
+ vinserti128 ym4, ym4, [r5], 1
519
+ vinserti128 ym5, ym5, [r5 + r3], 1
520
+ vinserti128 ym6, ym6, [r5 + r3 * 2], 1
521
+ vinserti128 ym7, ym7, [r5 + r4], 1
522
+
523
+ add r0, 16
524
+ add r2, 16
525
+
526
+ lea r4, [r1 * 3]
527
+ vinserti32x4 m0, m0, [r0], 2
528
+ vinserti32x4 m1, m1, [r0 + r1], 2
529
+ vinserti32x4 m2, m2, [r0 + r1 * 2], 2
530
+ vinserti32x4 m3, m3, [r0 + r4], 2
531
+ lea r5, [r0 + r1 * 4]
532
+ vinserti32x4 m4, m4, [r5], 2
533
+ vinserti32x4 m5, m5, [r5 + r1], 2
534
+ vinserti32x4 m6, m6, [r5 + r1 * 2], 2
535
+ vinserti32x4 m7, m7, [r5 + r4], 2
536
+
537
+ lea r4, [r3 * 3]
538
+ vinserti32x4 m0, m0, [r2], 3
539
+ vinserti32x4 m1, m1, [r2 + r3], 3
540
+ vinserti32x4 m2, m2, [r2 + r3 * 2], 3
541
+ vinserti32x4 m3, m3, [r2 + r4], 3
542
+ lea r5, [r2 + r3 * 4]
543
+ vinserti32x4 m4, m4, [r5], 3
544
+ vinserti32x4 m5, m5, [r5 + r3], 3
545
+ vinserti32x4 m6, m6, [r5 + r3 * 2], 3
546
+ vinserti32x4 m7, m7, [r5 + r4], 3
547
+%endmacro
548
+
549
+
550
+%macro PSY_PP_16x8_AVX512_MAIN10 0
551
+ paddw m8, m0, m1
552
+ paddw m8, m2
553
+ paddw m8, m3
554
+ paddw m8, m4
555
+ paddw m8, m5
556
+ paddw m8, m6
557
+ paddw m8, m7
558
+ pmaddwd m8, m14
559
+
560
+ psrldq m9, m8, 8
561
+ paddd m8, m9
562
+ psrldq m9, m8, 4
563
+ paddd m8, m9
564
+ psrld m8, 2
565
+
566
+ psubw m9, m1, m0
567
+ paddw m0, m1
568
+ psubw m1, m3, m2
569
+ paddw m2, m3
570
+ punpckhwd m3, m0, m9
571
+ punpcklwd m0, m9
572
+ psubw m9, m3, m0
573
+ paddw m0, m3
574
+ punpckhwd m3, m2, m1
575
+ punpcklwd m2, m1
576
+ psubw m10, m3, m2
577
+ paddw m2, m3
578
+
579
+ psubw m3, m5, m4
580
+ paddw m4, m5
581
+ psubw m5, m7, m6
582
+ paddw m6, m7
583
+ punpckhwd m1, m4, m3
584
+ punpcklwd m4, m3
585
+ psubw m7, m1, m4
586
+ paddw m4, m1
587
+ punpckhwd m3, m6, m5
588
+ punpcklwd m6, m5
589
+ psubw m1, m3, m6
590
+ paddw m6, m3
591
+
592
+ psubw m3, m2, m0
593
+ paddw m0, m2
594
+ psubw m2, m10, m9
595
+ paddw m9, m10
596
+ punpckhdq m5, m0, m3
597
+ punpckldq m0, m3
598
+ psubw m10, m5, m0
599
+ paddw m0, m5
600
+ punpckhdq m3, m9, m2
601
+ punpckldq m9, m2
602
+ psubw m5, m3, m9
603
+ paddw m9, m3
604
+
605
+ psubw m3, m6, m4
606
+ paddw m4, m6
607
+ psubw m6, m1, m7
608
+ paddw m7, m1
609
+ punpckhdq m2, m4, m3
610
+ punpckldq m4, m3
611
+ psubw m1, m2, m4
612
+ paddw m4, m2
613
+ punpckhdq m3, m7, m6
614
+ punpckldq m7, m6
615
+ psubw m2, m3, m7
616
+ paddw m7, m3
617
+
618
+ psubw m3, m4, m0
619
+ paddw m0, m4
620
+ psubw m4, m1, m10
621
+ paddw m10, m1
622
+ punpckhqdq m6, m0, m3
623
+ punpcklqdq m0, m3
624
+ pabsw m0, m0
625
+ pabsw m6, m6
626
+ pmaxsw m0, m6
627
+ punpckhqdq m3, m10, m4
628
+ punpcklqdq m10, m4
629
+ pabsw m10, m10
630
+ pabsw m3, m3
631
+ pmaxsw m10, m3
632
+
633
+ psubw m3, m7, m9
634
+ paddw m9, m7
635
+ psubw m7, m2, m5
636
+ paddw m5, m2
637
+ punpckhqdq m4, m9, m3
638
+ punpcklqdq m9, m3
639
+ pabsw m9, m9
640
+ pabsw m4, m4
641
+ pmaxsw m9, m4
642
+ punpckhqdq m3, m5, m7
643
+ punpcklqdq m5, m7
644
+ pabsw m5, m5
645
+ pabsw m3, m3
646
+ pmaxsw m5, m3
647
+
648
+ paddd m0, m9
649
+ paddd m0, m10
650
+ paddd m0, m5
651
+ psrld m9, m0, 16
652
+ pslld m0, 16
653
+ psrld m0, 16
654
+ paddd m0, m9
655
+ psrldq m9, m0, 8
656
+ paddd m0, m9
657
+ psrldq m9, m0, 4
658
+ paddd m0, m9
659
+ paddd m0, m15
660
+ psrld m0, 1
661
+ psubd m0, m8
662
+
663
+ vextracti64x4 ym2, m0, 1
664
+
665
+ vextracti128 xm3, ym2, 1
666
+ psubd xm3, xm2
667
+ pabsd xm3, xm3
668
+
669
+ vextracti128 xm1, ym0, 1
670
+ psubd xm1, xm0
671
+ pabsd xm1, xm1
672
+ paddd xm1, xm3
673
+%endmacro
674
+
675
+%macro PSY_PP_INPUT_AVX512_MAIN 0
676
+ movu xm16, [r0 + r1 * 0]
677
+ movu xm17, [r0 + r1 * 1]
678
+ movu xm18, [r0 + r1 * 2]
679
+ movu xm19, [r0 + r4 * 1]
680
+
681
+ movu xm20, [r2 + r3 * 0]
682
+ movu xm21, [r2 + r3 * 1]
683
+ movu xm22, [r2 + r3 * 2]
684
+ movu xm23, [r2 + r7 * 1]
685
+
686
+ mova m0, m26
687
+ vpermi2q m0, m16, m20
688
+ mova m1, m26
689
+ vpermi2q m1, m17, m21
690
+ mova m2, m26
691
+ vpermi2q m2, m18, m22
692
+ mova m3, m26
693
+ vpermi2q m3, m19, m23
694
+
695
+
696
+ lea r5, [r0 + r1 * 4]
697
+ lea r6, [r2 + r3 * 4]
698
+
699
+ movu xm16, [r5 + r1 * 0]
700
+ movu xm17, [r5 + r1 * 1]
701
+ movu xm18, [r5 + r1 * 2]
702
+ movu xm19, [r5 + r4 * 1]
703
+
704
+ movu xm20, [r6 + r3 * 0]
705
+ movu xm21, [r6 + r3 * 1]
706
+ movu xm22, [r6 + r3 * 2]
707
+ movu xm23, [r6 + r7 * 1]
708
+
709
+ mova m4, m26
710
+ vpermi2q m4, m16, m20
711
+ mova m5, m26
712
+ vpermi2q m5, m17, m21
713
+ mova m6, m26
714
+ vpermi2q m6, m18, m22
715
+ mova m7, m26
716
+ vpermi2q m7, m19, m23
717
+%endmacro
718
+
719
+%macro PSY_PP_16x8_AVX512_MAIN 0
720
+ pmaddubsw m0, m8
721
+ pmaddubsw m1, m8
722
+ pmaddubsw m2, m8
723
+ pmaddubsw m3, m8
724
+ pmaddubsw m4, m8
725
+ pmaddubsw m5, m8
726
+ pmaddubsw m6, m8
727
+ pmaddubsw m7, m8
728
+
729
+ paddw m11, m0, m1
730
+ paddw m11, m2
731
+ paddw m11, m3
732
+ paddw m11, m4
733
+ paddw m11, m5
734
+ paddw m11, m6
735
+ paddw m11, m7
736
+
737
+ pmaddwd m11, m14
738
+ psrldq m10, m11, 4
739
+ paddd m11, m10
740
+ psrld m11, 2
741
+
742
+ mova m9, m0
743
+ paddw m0, m1
744
+ psubw m1, m9
745
+ mova m9, m2
746
+ paddw m2, m3
747
+ psubw m3, m9
748
+ mova m9, m0
749
+ paddw m0, m2
750
+ psubw m2, m9
751
+ mova m9, m1
752
+ paddw m1, m3
753
+ psubw m3, m9
754
+
755
+ movdqa m9, m4
756
+ paddw m4, m5
757
+ psubw m5, m9
758
+ movdqa m9, m6
759
+ paddw m6, m7
760
+ psubw m7, m9
761
+ movdqa m9, m4
762
+ paddw m4, m6
763
+ psubw m6, m9
764
+ movdqa m9, m5
765
+ paddw m5, m7
766
+ psubw m7, m9
767
+
768
+ movdqa m9, m0
769
+ paddw m0, m4
770
+ psubw m4, m9
771
+ movdqa m9, m1
772
+ paddw m1, m5
773
+ psubw m5, m9
774
+
775
+ mova m9, m0
776
+ vshufps m9, m9, m4, 11011101b
777
+ vshufps m0, m0, m4, 10001000b
778
+
779
+ movdqa m4, m0
780
+ paddw m16, m0, m9
781
+ psubw m17, m9, m4
782
+
783
+ movaps m4, m1
784
+ vshufps m4, m4, m5, 11011101b
785
+ vshufps m1, m1, m5, 10001000b
786
+
787
+ movdqa m5, m1
788
+ paddw m18, m1, m4
789
+ psubw m19, m4, m5
790
+
791
+ movdqa m5, m2
792
+ paddw m2, m6
793
+ psubw m6, m5
794
+ movdqa m5, m3
795
+ paddw m3, m7
796
+ psubw m7, m5
797
+
798
+ movaps m5, m2
799
+ vshufps m5, m5, m6, 11011101b
800
+ vshufps m2, m2, m6, 10001000b
801
+
802
+ movdqa m6, m2
803
+ paddw m20, m2, m5
804
+ psubw m21, m5, m6
805
+
806
+ movaps m6, m3
807
+
808
+ vshufps m6, m6, m7, 11011101b
809
+ vshufps m3, m3, m7, 10001000b
810
+
811
+ movdqa m7, m3
812
+ paddw m22, m3, m6
813
+ psubw m23, m6, m7
814
+
815
+ movdqa m7, m16
816
+
817
+ vextracti64x4 ym24, m16, 1
818
+ vextracti64x4 ym25, m17, 1
819
+ pblendw ym16, ym17, 10101010b
820
+ pblendw ym24, ym25, 10101010b
821
+ vinserti64x4 m16, m16, ym24, 1
822
+
823
+ pslld m17, 10h
824
+ psrld m7, 10h
825
+ por m17, m7
826
+ pabsw m16, m16
827
+ pabsw m17, m17
828
+ pmaxsw m16, m17
829
+ movdqa m7, m18
830
+
831
+ vextracti64x4 ym24, m18, 1
832
+ vextracti64x4 ym25, m19, 1
833
+ pblendw ym18, ym19, 10101010b
834
+ pblendw ym24, ym25, 10101010b
835
+ vinserti64x4 m18, m18, ym24, 1
836
+
837
+ pslld m19, 10h
838
+ psrld m7, 10h
839
+ por m19, m7
840
+ pabsw m18, m18
841
+ pabsw m19, m19
842
+ pmaxsw m18, m19
843
+ movdqa m7, m20
844
+
845
+ vextracti64x4 ym24, m20, 1
846
+ vextracti64x4 ym25, m21, 1
847
+ pblendw ym20, ym21, 10101010b
848
+ pblendw ym24, ym25, 10101010b
849
+ vinserti64x4 m20, m20, ym24, 1
850
+
851
+ pslld m21, 10h
852
+ psrld m7, 10h
853
+ por m21, m7
854
+ pabsw m20, m20
855
+ pabsw m21, m21
856
+ pmaxsw m20, m21
857
+ mova m7, m22
858
+
859
+ vextracti64x4 ym24, m22, 1
860
+ vextracti64x4 ym25, m23, 1
861
+ pblendw ym22, ym23, 10101010b
862
+ pblendw ym24, ym25, 10101010b
863
+ vinserti64x4 m22, m22, ym24, 1
864
+
865
+ pslld m23, 10h
866
+ psrld m7, 10h
867
+ por m23, m7
868
+ pabsw m22, m22
869
+ pabsw m23, m23
870
+ pmaxsw m22, m23
871
+ paddw m16, m18
872
+ paddw m16, m20
873
+ paddw m16, m22
874
+ pmaddwd m16, m14
875
+ psrldq m1, m16, 8
876
+ paddd m16, m1
877
+
878
+ pshuflw m1, m16, 00001110b
879
+ paddd m16, m1
880
+ paddd m16, m15
881
+ psrld m16, 1
882
+
883
+ psubd m16, m11
884
+ vextracti64x4 ym2, m16, 1
885
+
886
+ vextracti128 xm1, ym16, 1
887
+ psubd xm16, xm1
888
+ pabsd xm16, xm16
889
+
890
+ vextracti128 xm3, ym2, 1
891
+ psubd xm3, xm2
892
+ pabsd xm3, xm3
893
+ paddd xm16, xm3
894
+%endmacro
895
+
896
+
897
%if ARCH_X86_64
898
INIT_YMM avx2
899
%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
900
901
RET
902
%endif
903
%endif
904
+%if ARCH_X86_64
905
+INIT_ZMM avx512
906
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
907
+cglobal psyCost_pp_16x16, 4, 10, 27
908
+ add r1d, r1d
909
+ add r3d, r3d
910
+ pxor m24, m24
911
+ movu m13, [psy_pp_shuff1]
912
+ movu m14, [psy_pp_shuff2]
913
+
914
+ mov r8d, 2
915
+.loopH:
916
+ mov r9d, 2
917
+.loopW:
918
+ PSY_COST_PP_8x8_AVX512_MAIN12
919
+
920
+ paddd xm24, xm11
921
+ add r0, 16
922
+ add r2, 16
923
+ dec r9d
924
+ jnz .loopW
925
+ lea r0, [r0 + r1 * 8 - 32]
926
+ lea r2, [r2 + r3 * 8 - 32]
927
+ dec r8d
928
+ jnz .loopH
929
+ movd eax, xm24
930
+ RET
931
+%endif
932
+
933
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
934
+cglobal psyCost_pp_16x16, 4, 10, 16
935
+ add r1d, r1d
936
+ add r3d, r3d
937
+ pxor m11, m11
938
+ vbroadcasti32x8 m14, [pw_1]
939
+ vbroadcasti32x8 m15, [pd_1]
940
+
941
+ mov r8d, 2
942
+.loopH:
943
+ PSY_PP_INPUT_AVX512_MAIN10
944
+ PSY_PP_16x8_AVX512_MAIN10
945
+
946
+ paddd xm11, xm1
947
+ lea r0, [r0 + r1 * 8 - 16]
948
+ lea r2, [r2 + r3 * 8 - 16]
949
+ dec r8d
950
+ jnz .loopH
951
+ movd eax, xm11
952
+ RET
953
+%endif
954
+
955
+%if BIT_DEPTH == 8
956
+cglobal psyCost_pp_16x16, 4, 10, 27
957
+ lea r4, [3 * r1]
958
+ lea r7, [3 * r3]
959
+ vbroadcasti32x8 m8, [hmul_8p]
960
+ pxor m13, m13
961
+ vbroadcasti32x8 m14, [pw_1]
962
+ vbroadcasti32x8 m15, [pd_1]
963
+ movu m26, [psy_pp_shuff3]
964
+
965
+ mov r8d, 2
966
+.loopH:
967
+ PSY_PP_INPUT_AVX512_MAIN
968
+ PSY_PP_16x8_AVX512_MAIN
969
+
970
+ paddd m13, m16
971
+ lea r0, [r0 + r1 * 8]
972
+ lea r2, [r2 + r3 * 8]
973
+ dec r8d
974
+ jnz .loopH
975
+ movd eax, xm13
976
+ RET
977
+%endif
978
+%endif
979
+
980
+%if ARCH_X86_64
981
+INIT_ZMM avx512
982
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
983
+cglobal psyCost_pp_32x32, 4, 10, 27
984
+ add r1d, r1d
985
+ add r3d, r3d
986
+ pxor m24, m24
987
+ movu m13, [psy_pp_shuff1]
988
+ movu m14, [psy_pp_shuff2]
989
+
990
+ mov r8d, 4
991
+.loopH:
992
+ mov r9d, 4
993
+.loopW:
994
+ PSY_COST_PP_8x8_AVX512_MAIN12
995
+
996
+ paddd xm24, xm11
997
+ add r0, 16
998
+ add r2, 16
999
+ dec r9d
1000
+ jnz .loopW
1001
+ lea r0, [r0 + r1 * 8 - 64]
1002
+ lea r2, [r2 + r3 * 8 - 64]
1003
+ dec r8d
1004
+ jnz .loopH
1005
+ movd eax, xm24
1006
+ RET
1007
+%endif
1008
+
1009
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
1010
+cglobal psyCost_pp_32x32, 4, 10, 16
1011
+ add r1d, r1d
1012
+ add r3d, r3d
1013
+ pxor m11, m11
1014
+ vbroadcasti32x8 m14, [pw_1]
1015
+ vbroadcasti32x8 m15, [pd_1]
1016
+
1017
+ mov r8d, 4
1018
+.loopH:
1019
+ mov r9d, 2
1020
+.loopW:
1021
+ PSY_PP_INPUT_AVX512_MAIN10
1022
+ PSY_PP_16x8_AVX512_MAIN10
1023
+
1024
+ paddd xm11, xm1
1025
+ add r0, 16
1026
+ add r2, 16
1027
+ dec r9d
1028
+ jnz .loopW
1029
+ lea r0, [r0 + r1 * 8 - 64]
1030
+ lea r2, [r2 + r3 * 8 - 64]
1031
+ dec r8d
1032
+ jnz .loopH
1033
+ movd eax, xm11
1034
+ RET
1035
+%endif
1036
+
1037
+%if BIT_DEPTH == 8
1038
+cglobal psyCost_pp_32x32, 4, 10, 27
1039
+ lea r4, [3 * r1]
1040
+ lea r7, [3 * r3]
1041
+ vbroadcasti32x8 m8, [hmul_8p]
1042
+ pxor m13, m13
1043
+ vbroadcasti32x8 m14, [pw_1]
1044
+ vbroadcasti32x8 m15, [pd_1]
1045
+ movu m26, [psy_pp_shuff3]
1046
+
1047
+ mov r8d, 4
1048
+.loopH:
1049
+ mov r9d, 2
1050
+.loopW:
1051
+ PSY_PP_INPUT_AVX512_MAIN
1052
+ PSY_PP_16x8_AVX512_MAIN
1053
+
1054
+ paddd m13, m16
1055
+ add r0, 16
1056
+ add r2, 16
1057
+ dec r9d
1058
+ jnz .loopW
1059
+ lea r0, [r0 + r1 * 8 - 32]
1060
+ lea r2, [r2 + r3 * 8 - 32]
1061
+ dec r8d
1062
+ jnz .loopH
1063
+ movd eax, xm13
1064
+ RET
1065
+%endif
1066
+%endif
1067
+
1068
+%if ARCH_X86_64
1069
+INIT_ZMM avx512
1070
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
1071
+cglobal psyCost_pp_64x64, 4, 10, 27
1072
+ add r1d, r1d
1073
+ add r3d, r3d
1074
+ pxor m24, m24
1075
+ movu m13, [psy_pp_shuff1]
1076
+ movu m14, [psy_pp_shuff2]
1077
+
1078
+ mov r8d, 8
1079
+.loopH:
1080
+ mov r9d, 8
1081
+.loopW:
1082
+ PSY_COST_PP_8x8_AVX512_MAIN12
1083
+
1084
+ paddd xm24, xm11
1085
+ add r0, 16
1086
+ add r2, 16
1087
+ dec r9d
1088
+ jnz .loopW
1089
+ lea r0, [r0 + r1 * 8 - 128]
1090
+ lea r2, [r2 + r3 * 8 - 128]
1091
+ dec r8d
1092
+ jnz .loopH
1093
+ movd eax, xm24
1094
+ RET
1095
+%endif
1096
+
1097
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
1098
+cglobal psyCost_pp_64x64, 4, 10, 16
1099
+ add r1d, r1d
1100
+ add r3d, r3d
1101
+ pxor m11, m11
1102
+ vbroadcasti32x8 m14, [pw_1]
1103
+ vbroadcasti32x8 m15, [pd_1]
1104
+
1105
+ mov r8d, 8
1106
+.loopH:
1107
+ mov r9d, 4
1108
+.loopW:
1109
+ PSY_PP_INPUT_AVX512_MAIN10
1110
+ PSY_PP_16x8_AVX512_MAIN10
1111
+
1112
+ paddd xm11, xm1
1113
+ add r0, 16
1114
+ add r2, 16
1115
+ dec r9d
1116
+ jnz .loopW
1117
+ lea r0, [r0 + r1 * 8 - 128]
1118
+ lea r2, [r2 + r3 * 8 - 128]
1119
+ dec r8d
1120
+ jnz .loopH
1121
+ movd eax, xm11
1122
+ RET
1123
+%endif
1124
+
1125
+%if BIT_DEPTH == 8
1126
+cglobal psyCost_pp_64x64, 4, 10, 27
1127
+ lea r4, [3 * r1]
1128
+ lea r7, [3 * r3]
1129
+ vbroadcasti32x8 m8, [hmul_8p]
1130
+ pxor m13, m13
1131
+ vbroadcasti32x8 m14, [pw_1]
1132
+ vbroadcasti32x8 m15, [pd_1]
1133
+ movu m26, [psy_pp_shuff3]
1134
+
1135
+ mov r8d, 8
1136
+.loopH:
1137
+ mov r9d, 4
1138
+.loopW:
1139
+ PSY_PP_INPUT_AVX512_MAIN
1140
+ PSY_PP_16x8_AVX512_MAIN
1141
+
1142
+ paddd m13, m16
1143
+ add r0, 16
1144
+ add r2, 16
1145
+ dec r9d
1146
+ jnz .loopW
1147
+ lea r0, [r0 + r1 * 8 - 64]
1148
+ lea r2, [r2 + r3 * 8 - 64]
1149
+ dec r8d
1150
+ jnz .loopH
1151
+ movd eax, xm13
1152
+ RET
1153
+%endif
1154
+%endif
1155
1156
;---------------------------------------------------------------------------------------------------------------------
1157
;int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
1158
1159
paddd xm0, xm1
1160
movd eax, xm0
1161
RET
1162
-%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
1163
1164
+%macro PROCESS_SATD_32x4_AVX512 0 ; function to compute satd cost for 32 columns, 4 rows
1165
+ ; rows 0-3
1166
+ pmovzxbw m0, [r0]
1167
+ pmovzxbw m4, [r2]
1168
+ psubw m0, m4
1169
+ pmovzxbw m1, [r0 + r1]
1170
+ pmovzxbw m5, [r2 + r3]
1171
+ psubw m1, m5
1172
+ pmovzxbw m2, [r0 + r1 * 2]
1173
+ pmovzxbw m4, [r2 + r3 * 2]
1174
+ psubw m2, m4
1175
+ pmovzxbw m3, [r0 + r4]
1176
+ pmovzxbw m5, [r2 + r5]
1177
+ psubw m3, m5
1178
+ paddw m4, m0, m1
1179
+ psubw m1, m0
1180
+ paddw m0, m2, m3
1181
+ psubw m3, m2
1182
+ punpckhwd m2, m4, m1
1183
+ punpcklwd m4, m1
1184
+ punpckhwd m1, m0, m3
1185
+ punpcklwd m0, m3
1186
+ paddw m3, m4, m0
1187
+ psubw m0, m4
1188
+ paddw m4, m2, m1
1189
+ psubw m1, m2
1190
+ punpckhdq m2, m3, m0
1191
+ punpckldq m3, m0
1192
+ paddw m0, m3, m2
1193
+ psubw m2, m3
1194
+ punpckhdq m3, m4, m1
1195
+ punpckldq m4, m1
1196
+ paddw m1, m4, m3
1197
+ psubw m3, m4
1198
+ punpckhqdq m4, m0, m1
1199
+ punpcklqdq m0, m1
1200
+ pabsw m0, m0
1201
+ pabsw m4, m4
1202
+ pmaxsw m0, m0, m4
1203
+ punpckhqdq m1, m2, m3
1204
+ punpcklqdq m2, m3
1205
+ pabsw m2, m2
1206
+ pabsw m1, m1
1207
+ pmaxsw m2, m1
1208
+ pxor m7, m7
1209
+ mova m1, m0
1210
+ punpcklwd m1, m7
1211
+ paddd m6, m1
1212
+ mova m1, m0
1213
+ punpckhwd m1, m7
1214
+ paddd m6, m1
1215
+ pxor m7, m7
1216
+ mova m1, m2
1217
+ punpcklwd m1, m7
1218
+ paddd m6, m1
1219
+ mova m1, m2
1220
+ punpckhwd m1, m7
1221
+ paddd m6, m1
1222
+%endmacro
1223
+
1224
+%macro SATD_MAIN_AVX512_END 0
1225
+ vextracti32x8 ym7, m6, 1
1226
+ paddd ym6, ym7
1227
+ vextracti128 xm7, ym6, 1
1228
+ paddd xm6, xm6, xm7
1229
+ punpckhqdq xm7, xm6, xm6
1230
+ paddd xm6, xm7
1231
+ movq rax, xm6
1232
+ rorx rdx, rax, 32
1233
+ add eax, edx
1234
+%endmacro
1235
+
1236
+%macro SATD_32xN_AVX512 1
1237
+INIT_ZMM avx512
1238
+cglobal pixel_satd_32x%1, 4,6,8
1239
+ lea r4, [3 * r1]
1240
+ lea r5, [3 * r3]
1241
+ pxor m6, m6
1242
+%rep %1/4 - 1
1243
+ PROCESS_SATD_32x4_AVX512
1244
+ lea r0, [r0 + 4 * r1]
1245
+ lea r2, [r2 + 4 * r3]
1246
+%endrep
1247
+ PROCESS_SATD_32x4_AVX512
1248
+ SATD_MAIN_AVX512_END
1249
+ RET
1250
+%endmacro
1251
+
1252
+SATD_32xN_AVX512 8
1253
+SATD_32xN_AVX512 16
1254
+SATD_32xN_AVX512 24
1255
+SATD_32xN_AVX512 32
1256
+SATD_32xN_AVX512 48
1257
+SATD_32xN_AVX512 64
1258
+
1259
+%macro SATD_64xN_AVX512 1
1260
+INIT_ZMM avx512
1261
+cglobal pixel_satd_64x%1, 4,8,8
1262
+ lea r4, [3 * r1]
1263
+ lea r5, [3 * r3]
1264
+ pxor m6, m6
1265
+ mov r6, r0
1266
+ mov r7, r2
1267
+
1268
+%rep %1/4 - 1
1269
+ PROCESS_SATD_32x4_AVX512
1270
+ lea r0, [r0 + 4 * r1]
1271
+ lea r2, [r2 + 4 * r3]
1272
+%endrep
1273
+ PROCESS_SATD_32x4_AVX512
1274
+ lea r0, [r6 + mmsize/2]
1275
+ lea r2, [r7 + mmsize/2]
1276
+%rep %1/4 - 1
1277
+ PROCESS_SATD_32x4_AVX512
1278
+ lea r0, [r0 + 4 * r1]
1279
+ lea r2, [r2 + 4 * r3]
1280
+%endrep
1281
+ PROCESS_SATD_32x4_AVX512
1282
+ SATD_MAIN_AVX512_END
1283
+ RET
1284
+%endmacro
1285
+
1286
+SATD_64xN_AVX512 16
1287
+SATD_64xN_AVX512 32
1288
+SATD_64xN_AVX512 48
1289
+SATD_64xN_AVX512 64
1290
+%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
1291
%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1
1292
INIT_YMM avx2
1293
cglobal calc_satd_16x8 ; function to compute satd cost for 16 columns, 8 rows
1294
1295
paddd xm6, xm7
1296
movd eax, xm6
1297
RET
1298
+
1299
+%macro SATD_HBD_AVX512_END 0
1300
+ vextracti32x8 ym7, m6, 1
1301
+ paddd ym6, ym7
1302
+ vextracti128 xm7, ym6, 1
1303
+ paddd xm6, xm7
1304
+ pxor xm7, xm7
1305
+ movhlps xm7, xm6
1306
+ paddd xm6, xm7
1307
+ pshufd xm7, xm6, 1
1308
+ paddd xm6, xm7
1309
+ movd eax, xm6
1310
+%endmacro
1311
+%macro PROCESS_SATD_16x8_HBD_AVX512 0 ; function to compute satd cost for 16 columns, 8 rows
1312
+ ; rows 0-3
1313
+ lea r6, [r0 + r1 * 4]
1314
+ lea r7, [r2 + r3 * 4]
1315
+ movu ym0, [r0]
1316
+ movu ym4, [r2]
1317
+ vinserti32x8 m0, [r6], 1
1318
+ vinserti32x8 m4, [r7], 1
1319
+ psubw m0, m4
1320
+ movu ym1, [r0 + r1]
1321
+ movu ym5, [r2 + r3]
1322
+ vinserti32x8 m1, [r6 + r1], 1
1323
+ vinserti32x8 m5, [r7 + r3], 1
1324
+ psubw m1, m5
1325
+ movu ym2, [r0 + r1 * 2]
1326
+ movu ym4, [r2 + r3 * 2]
1327
+ vinserti32x8 m2, [r6 + r1 * 2], 1
1328
+ vinserti32x8 m4, [r7 + r3 * 2], 1
1329
+ psubw m2, m4
1330
+ movu ym3, [r0 + r4]
1331
+ movu ym5, [r2 + r5]
1332
+ vinserti32x8 m3, [r6 + r4], 1
1333
+ vinserti32x8 m5, [r7 + r5], 1
1334
+ psubw m3, m5
1335
+
1336
+ paddw m4, m0, m1
1337
+ psubw m1, m0
1338
+ paddw m0, m2, m3
1339
+ psubw m3, m2
1340
+ punpckhwd m2, m4, m1
1341
+ punpcklwd m4, m1
1342
+ punpckhwd m1, m0, m3
1343
+ punpcklwd m0, m3
1344
+ paddw m3, m4, m0
1345
+ psubw m0, m4
1346
+ paddw m4, m2, m1
1347
+ psubw m1, m2
1348
+ punpckhdq m2, m3, m0
1349
+ punpckldq m3, m0
1350
+ paddw m0, m3, m2
1351
+ psubw m2, m3
1352
+ punpckhdq m3, m4, m1
1353
+ punpckldq m4, m1
1354
+ paddw m1, m4, m3
1355
+ psubw m3, m4
1356
+ punpckhqdq m4, m0, m1
1357
+ punpcklqdq m0, m1
1358
+ pabsw m0, m0
1359
+ pabsw m4, m4
1360
+ pmaxsw m0, m0, m4
1361
+ punpckhqdq m1, m2, m3
1362
+ punpcklqdq m2, m3
1363
+ pabsw m2, m2
1364
+ pabsw m1, m1
1365
+ pmaxsw m2, m1
1366
+ pxor m7, m7
1367
+ mova m1, m0
1368
+ punpcklwd m1, m7
1369
+ paddd m6, m1
1370
+ mova m1, m0
1371
+ punpckhwd m1, m7
1372
+ paddd m6, m1
1373
+ pxor m7, m7
1374
+ mova m1, m2
1375
+ punpcklwd m1, m7
1376
+ paddd m6, m1
1377
+ mova m1, m2
1378
+ punpckhwd m1, m7
1379
+ paddd m6, m1
1380
+%endmacro
1381
+%macro PROCESS_SATD_32x4_HBD_AVX512 0 ; function to compute satd cost for 32 columns, 4 rows
1382
+ ; rows 0-3
1383
+ movu m0, [r0]
1384
+ movu m4, [r2]
1385
+ psubw m0, m4
1386
+ movu m1, [r0 + r1]
1387
+ movu m5, [r2 + r3]
1388
+ psubw m1, m5
1389
+ movu m2, [r0 + r1 * 2]
1390
+ movu m4, [r2 + r3 * 2]
1391
+ psubw m2, m4
1392
+ movu m3, [r0 + r4]
1393
+ movu m5, [r2 + r5]
1394
+ psubw m3, m5
1395
+ paddw m4, m0, m1
1396
+ psubw m1, m0
1397
+ paddw m0, m2, m3
1398
+ psubw m3, m2
1399
+ punpckhwd m2, m4, m1
1400
+ punpcklwd m4, m1
1401
+ punpckhwd m1, m0, m3
1402
+ punpcklwd m0, m3
1403
+ paddw m3, m4, m0
1404
+ psubw m0, m4
1405
+ paddw m4, m2, m1
1406
+ psubw m1, m2
1407
+ punpckhdq m2, m3, m0
1408
+ punpckldq m3, m0
1409
+ paddw m0, m3, m2
1410
+ psubw m2, m3
1411
+ punpckhdq m3, m4, m1
1412
+ punpckldq m4, m1
1413
+ paddw m1, m4, m3
1414
+ psubw m3, m4
1415
+ punpckhqdq m4, m0, m1
1416
+ punpcklqdq m0, m1
1417
+ pabsw m0, m0
1418
+ pabsw m4, m4
1419
+ pmaxsw m0, m0, m4
1420
+ punpckhqdq m1, m2, m3
1421
+ punpcklqdq m2, m3
1422
+ pabsw m2, m2
1423
+ pabsw m1, m1
1424
+ pmaxsw m2, m1
1425
+ pxor m7, m7
1426
+ mova m1, m0
1427
+ punpcklwd m1, m7
1428
+ paddd m6, m1
1429
+ mova m1, m0
1430
+ punpckhwd m1, m7
1431
+ paddd m6, m1
1432
+ pxor m7, m7
1433
+ mova m1, m2
1434
+ punpcklwd m1, m7
1435
+ paddd m6, m1
1436
+ mova m1, m2
1437
+ punpckhwd m1, m7
1438
+ paddd m6, m1
1439
+%endmacro
1440
+
1441
+%macro SATD_16xN_HBD_AVX512 1
1442
+INIT_ZMM avx512
1443
+cglobal pixel_satd_16x%1, 4,8,8
1444
+ add r1d, r1d
1445
+ add r3d, r3d
1446
+ lea r4, [3 * r1]
1447
+ lea r5, [3 * r3]
1448
+ pxor m6, m6
1449
+
1450
+%rep %1/8 - 1
1451
+ PROCESS_SATD_16x8_HBD_AVX512
1452
+ lea r0, [r6 + 4 * r1]
1453
+ lea r2, [r7 + 4 * r3]
1454
+%endrep
1455
+ PROCESS_SATD_16x8_HBD_AVX512
1456
+ SATD_HBD_AVX512_END
1457
+ RET
1458
+%endmacro
1459
+
1460
+SATD_16xN_HBD_AVX512 8
1461
+SATD_16xN_HBD_AVX512 16
1462
+SATD_16xN_HBD_AVX512 32
1463
+SATD_16xN_HBD_AVX512 64
1464
+
1465
+%macro SATD_32xN_HBD_AVX512 1
1466
+INIT_ZMM avx512
1467
+cglobal pixel_satd_32x%1, 4,8,8
1468
+ add r1d, r1d
1469
+ add r3d, r3d
1470
+ lea r4, [3 * r1]
1471
+ lea r5, [3 * r3]
1472
+ pxor m6, m6
1473
+ mov r6, r0
1474
+ mov r7, r2
1475
+%rep %1/4 - 1
1476
+ PROCESS_SATD_32x4_HBD_AVX512
1477
+ lea r0, [r0 + 4 * r1]
1478
+ lea r2, [r2 + 4 * r3]
1479
+%endrep
1480
+ PROCESS_SATD_32x4_HBD_AVX512
1481
+ SATD_HBD_AVX512_END
1482
+ RET
1483
+%endmacro
1484
+
1485
+SATD_32xN_HBD_AVX512 8
1486
+SATD_32xN_HBD_AVX512 16
1487
+SATD_32xN_HBD_AVX512 24
1488
+SATD_32xN_HBD_AVX512 32
1489
+SATD_32xN_HBD_AVX512 64
1490
+INIT_ZMM avx512
1491
+cglobal pixel_satd_48x64, 4,10,8
1492
+ add r1d, r1d
1493
+ add r3d, r3d
1494
+ lea r4, [3 * r1]
1495
+ lea r5, [3 * r3]
1496
+ pxor m6, m6
1497
+ mov r8, r0
1498
+ mov r9, r2
1499
+
1500
+%rep 15
1501
+ PROCESS_SATD_32x4_HBD_AVX512
1502
+ lea r0, [r0 + 4 * r1]
1503
+ lea r2, [r2 + 4 * r3]
1504
+%endrep
1505
+ PROCESS_SATD_32x4_HBD_AVX512
1506
+ lea r0, [r8 + mmsize]
1507
+ lea r2, [r9 + mmsize]
1508
+%rep 7
1509
+ PROCESS_SATD_16x8_HBD_AVX512
1510
+ lea r0, [r6 + 4 * r1]
1511
+ lea r2, [r7 + 4 * r3]
1512
+%endrep
1513
+ PROCESS_SATD_16x8_HBD_AVX512
1514
+ SATD_HBD_AVX512_END
1515
+ RET
1516
+
1517
+%macro SATD_64xN_HBD_AVX512 1
1518
+INIT_ZMM avx512
1519
+cglobal pixel_satd_64x%1, 4,8,8
1520
+ add r1d, r1d
1521
+ add r3d, r3d
1522
+ lea r4, [3 * r1]
1523
+ lea r5, [3 * r3]
1524
+ pxor m6, m6
1525
+ mov r6, r0
1526
+ mov r7, r2
1527
+%rep %1/4 - 1
1528
+ PROCESS_SATD_32x4_HBD_AVX512
1529
+ lea r0, [r0 + 4 * r1]
1530
+ lea r2, [r2 + 4 * r3]
1531
+%endrep
1532
+ PROCESS_SATD_32x4_HBD_AVX512
1533
+ lea r0, [r6 + mmsize]
1534
+ lea r2, [r7 + mmsize]
1535
+%rep %1/4 - 1
1536
+ PROCESS_SATD_32x4_HBD_AVX512
1537
+ lea r0, [r0 + 4 * r1]
1538
+ lea r2, [r2 + 4 * r3]
1539
+%endrep
1540
+ PROCESS_SATD_32x4_HBD_AVX512
1541
+ SATD_HBD_AVX512_END
1542
+ RET
1543
+%endmacro
1544
+
1545
+SATD_64xN_HBD_AVX512 16
1546
+SATD_64xN_HBD_AVX512 32
1547
+SATD_64xN_HBD_AVX512 48
1548
+SATD_64xN_HBD_AVX512 64
1549
%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1
1550
1551
1552
1553
;lea %8, [%8+4*r3]
1554
%endmacro
1555
1556
+%if ARCH_X86_64
1557
INIT_YMM avx2
1558
cglobal pixel_satd_8x8, 4,4,7
1559
1560
1561
1562
movd eax, xm0
1563
RET
1564
-
1565
+%endif
1566
%endif ; HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10
1567
x265_2.7.tar.gz/source/common/x86/pixel-util.h -> x265_2.9.tar.gz/source/common/x86/pixel-util.h
Changed
33
1
2
3
#define DEFINE_UTILS(cpu) \
4
FUNCDEF_TU_S2(void, getResidual, cpu, const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); \
5
+ FUNCDEF_TU_S2(void, getResidual_aligned, cpu, const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); \
6
FUNCDEF_TU_S2(void, transpose, cpu, pixel* dest, const pixel* src, intptr_t stride); \
7
FUNCDEF_TU(int, count_nonzero, cpu, const int16_t* quantCoeff); \
8
uint32_t PFX(quant_ ## cpu(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)); \
9
10
void PFX(weight_pp_ ## cpu(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)); \
11
void PFX(weight_sp_ ## cpu(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)); \
12
void PFX(scale1D_128to64_ ## cpu(pixel*, const pixel*)); \
13
+ void PFX(scale1D_128to64_aligned_ ## cpu(pixel*, const pixel*)); \
14
void PFX(scale2D_64to32_ ## cpu(pixel*, const pixel*, intptr_t)); \
15
uint32_t PFX(costCoeffRemain_ ## cpu(uint16_t *absCoeff, int numNonZero, int idx)); \
16
uint32_t PFX(costC1C2Flag_sse2(uint16_t *absCoeff, intptr_t numNonZero, uint8_t *baseCtxMod, intptr_t ctxOffset)); \
17
18
DEFINE_UTILS(ssse3);
19
DEFINE_UTILS(sse4);
20
DEFINE_UTILS(avx2);
21
+DEFINE_UTILS(avx512);
22
23
#undef DEFINE_UTILS
24
25
26
uint32_t PFX(costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
27
uint32_t PFX(costCoeffNxN_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
28
29
+int PFX(count_nonzero_16x16_avx512(const int16_t* quantCoeff));
30
+int PFX(count_nonzero_32x32_avx512(const int16_t* quantCoeff));
31
+
32
#endif // ifndef X265_PIXEL_UTIL_H
33
x265_2.7.tar.gz/source/common/x86/pixel-util8.asm -> x265_2.9.tar.gz/source/common/x86/pixel-util8.asm
Changed
1798
1
2
;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
3
;* Nabajit Deka <nabajit@multicorewareinc.com>
4
;* Rajesh Paulraj <rajesh@multicorewareinc.com>
5
+;* Praveen Kumar Tiwari <praveen@multicorewareinc.com>
6
;*
7
;* This program is free software; you can redistribute it and/or modify
8
;* it under the terms of the GNU General Public License as published by
9
10
%include "x86inc.asm"
11
%include "x86util.asm"
12
13
-SECTION_RODATA 32
14
+SECTION_RODATA 64
15
+
16
+var_shuf_avx512: db 0,-1, 1,-1, 2,-1, 3,-1, 4,-1, 5,-1, 6,-1, 7,-1
17
+ db 8,-1, 9,-1,10,-1,11,-1,12,-1,13,-1,14,-1,15,-1
18
+ALIGN 64
19
+const dequant_shuf1_avx512, dq 0, 2, 4, 6, 1, 3, 5, 7
20
+const dequant_shuf2_avx512, dq 0, 4, 1, 5, 2, 6, 3, 7
21
22
%if BIT_DEPTH == 12
23
ssim_c1: times 4 dd 107321.76 ; .01*.01*4095*4095*64
24
25
%endrep
26
RET
27
%endif
28
+
29
+%macro PROCESS_GETRESIDUAL32_W4_HBD_AVX512 0
30
+ movu m0, [r0]
31
+ movu m1, [r0 + r3]
32
+ movu m2, [r0 + r3 * 2]
33
+ movu m3, [r0 + r4]
34
+ lea r0, [r0 + r3 * 4]
35
+
36
+ movu m4, [r1]
37
+ movu m5, [r1 + r3]
38
+ movu m6, [r1 + r3 * 2]
39
+ movu m7, [r1 + r4]
40
+ lea r1, [r1 + r3 * 4]
41
+
42
+ psubw m0, m4
43
+ psubw m1, m5
44
+ psubw m2, m6
45
+ psubw m3, m7
46
+
47
+ movu [r2], m0
48
+ movu [r2 + r3], m1
49
+ movu [r2 + r3 * 2], m2
50
+ movu [r2 + r4], m3
51
+ lea r2, [r2 + r3 * 4]
52
+%endmacro
53
+
54
+%macro PROCESS_GETRESIDUAL32_W4_HBD_AVX512_END 0
55
+ movu m0, [r0]
56
+ movu m1, [r0 + r3]
57
+ movu m2, [r0 + r3 * 2]
58
+ movu m3, [r0 + r4]
59
+
60
+ movu m4, [r1]
61
+ movu m5, [r1 + r3]
62
+ movu m6, [r1 + r3 * 2]
63
+ movu m7, [r1 + r4]
64
+
65
+ psubw m0, m4
66
+ psubw m1, m5
67
+ psubw m2, m6
68
+ psubw m3, m7
69
+
70
+ movu [r2], m0
71
+ movu [r2 + r3], m1
72
+ movu [r2 + r3 * 2], m2
73
+ movu [r2 + r4], m3
74
+%endmacro
75
+
76
+%macro PROCESS_GETRESIDUAL32_W4_AVX512 0
77
+ pmovzxbw m0, [r0]
78
+ pmovzxbw m1, [r0 + r3]
79
+ pmovzxbw m2, [r0 + r3 * 2]
80
+ pmovzxbw m3, [r0 + r4]
81
+ lea r0, [r0 + r3 * 4]
82
+
83
+ pmovzxbw m4, [r1]
84
+ pmovzxbw m5, [r1 + r3]
85
+ pmovzxbw m6, [r1 + r3 * 2]
86
+ pmovzxbw m7, [r1 + r4]
87
+ lea r1, [r1 + r3 * 4]
88
+
89
+ psubw m0, m4
90
+ psubw m1, m5
91
+ psubw m2, m6
92
+ psubw m3, m7
93
+
94
+ movu [r2], m0
95
+ movu [r2 + r3 * 2], m1
96
+ lea r2, [r2 + r3 * 4]
97
+ movu [r2], m2
98
+ movu [r2 + r3 * 2], m3
99
+ lea r2, [r2 + r3 * 4]
100
+%endmacro
101
+
102
+%macro PROCESS_GETRESIDUAL32_W4_AVX512_END 0
103
+ pmovzxbw m0, [r0]
104
+ pmovzxbw m1, [r0 + r3]
105
+ pmovzxbw m2, [r0 + r3 * 2]
106
+ pmovzxbw m3, [r0 + r4]
107
+
108
+ pmovzxbw m4, [r1]
109
+ pmovzxbw m5, [r1 + r3]
110
+ pmovzxbw m6, [r1 + r3 * 2]
111
+ pmovzxbw m7, [r1 + r4]
112
+
113
+ psubw m0, m4
114
+ psubw m1, m5
115
+ psubw m2, m6
116
+ psubw m3, m7
117
+
118
+ movu [r2], m0
119
+ movu [r2 + r3 * 2], m1
120
+ lea r2, [r2 + r3 * 4]
121
+ movu [r2], m2
122
+ movu [r2 + r3 * 2], m3
123
+%endmacro
124
+
125
+
126
+%if HIGH_BIT_DEPTH
127
+INIT_ZMM avx512
128
+cglobal getResidual32, 4,5,8
129
+ add r3, r3
130
+ lea r4, [r3 * 3]
131
+
132
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512
133
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512
134
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512
135
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512
136
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512
137
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512
138
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512
139
+ PROCESS_GETRESIDUAL32_W4_HBD_AVX512_END
140
+ RET
141
+%else
142
+INIT_ZMM avx512
143
+cglobal getResidual32, 4,5,8
144
+ lea r4, [r3 * 3]
145
+
146
+ PROCESS_GETRESIDUAL32_W4_AVX512
147
+ PROCESS_GETRESIDUAL32_W4_AVX512
148
+ PROCESS_GETRESIDUAL32_W4_AVX512
149
+ PROCESS_GETRESIDUAL32_W4_AVX512
150
+ PROCESS_GETRESIDUAL32_W4_AVX512
151
+ PROCESS_GETRESIDUAL32_W4_AVX512
152
+ PROCESS_GETRESIDUAL32_W4_AVX512
153
+ PROCESS_GETRESIDUAL32_W4_AVX512_END
154
+ RET
155
+%endif
156
+
157
+%macro PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512 0
158
+ movu m0, [r0]
159
+ movu m1, [r0 + r3]
160
+ movu m2, [r0 + r3 * 2]
161
+ movu m3, [r0 + r4]
162
+ lea r0, [r0 + r3 * 4]
163
+
164
+ movu m4, [r1]
165
+ movu m5, [r1 + r3]
166
+ movu m6, [r1 + r3 * 2]
167
+ movu m7, [r1 + r4]
168
+ lea r1, [r1 + r3 * 4]
169
+
170
+ psubw m0, m4
171
+ psubw m1, m5
172
+ psubw m2, m6
173
+ psubw m3, m7
174
+
175
+ movu [r2], m0
176
+ movu [r2 + r3], m1
177
+ movu [r2 + r3 * 2], m2
178
+ movu [r2 + r4], m3
179
+ lea r2, [r2 + r3 * 4]
180
+%endmacro
181
+
182
+%macro PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512_END 0
183
+ movu m0, [r0]
184
+ movu m1, [r0 + r3]
185
+ movu m2, [r0 + r3 * 2]
186
+ movu m3, [r0 + r4]
187
+
188
+ movu m4, [r1]
189
+ movu m5, [r1 + r3]
190
+ movu m6, [r1 + r3 * 2]
191
+ movu m7, [r1 + r4]
192
+
193
+ psubw m0, m4
194
+ psubw m1, m5
195
+ psubw m2, m6
196
+ psubw m3, m7
197
+
198
+ movu [r2], m0
199
+ movu [r2 + r3], m1
200
+ movu [r2 + r3 * 2], m2
201
+ movu [r2 + r4], m3
202
+%endmacro
203
+
204
+%macro PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512 0
205
+ pmovzxbw m0, [r0]
206
+ pmovzxbw m1, [r0 + r3]
207
+ pmovzxbw m2, [r0 + r3 * 2]
208
+ pmovzxbw m3, [r0 + r4]
209
+ lea r0, [r0 + r3 * 4]
210
+
211
+ pmovzxbw m4, [r1]
212
+ pmovzxbw m5, [r1 + r3]
213
+ pmovzxbw m6, [r1 + r3 * 2]
214
+ pmovzxbw m7, [r1 + r4]
215
+ lea r1, [r1 + r3 * 4]
216
+
217
+ psubw m0, m4
218
+ psubw m1, m5
219
+ psubw m2, m6
220
+ psubw m3, m7
221
+
222
+ movu [r2], m0
223
+ movu [r2 + r3 * 2], m1
224
+ lea r2, [r2 + r3 * 4]
225
+ movu [r2], m2
226
+ movu [r2 + r3 * 2], m3
227
+ lea r2, [r2 + r3 * 4]
228
+%endmacro
229
+
230
+%macro PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512_END 0
231
+ pmovzxbw m0, [r0]
232
+ pmovzxbw m1, [r0 + r3]
233
+ pmovzxbw m2, [r0 + r3 * 2]
234
+ pmovzxbw m3, [r0 + r4]
235
+
236
+ pmovzxbw m4, [r1]
237
+ pmovzxbw m5, [r1 + r3]
238
+ pmovzxbw m6, [r1 + r3 * 2]
239
+ pmovzxbw m7, [r1 + r4]
240
+
241
+ psubw m0, m4
242
+ psubw m1, m5
243
+ psubw m2, m6
244
+ psubw m3, m7
245
+
246
+ movu [r2], m0
247
+ movu [r2 + r3 * 2], m1
248
+ lea r2, [r2 + r3 * 4]
249
+ movu [r2], m2
250
+ movu [r2 + r3 * 2], m3
251
+%endmacro
252
+
253
+
254
+%if HIGH_BIT_DEPTH
255
+INIT_ZMM avx512
256
+cglobal getResidual_aligned32, 4,5,8
257
+ add r3, r3
258
+ lea r4, [r3 * 3]
259
+
260
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
261
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
262
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
263
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
264
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
265
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
266
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
267
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512_END
268
+ RET
269
+%else
270
+INIT_ZMM avx512
271
+cglobal getResidual_aligned32, 4,5,8
272
+ lea r4, [r3 * 3]
273
+
274
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
275
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
276
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
277
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
278
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
279
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
280
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
281
+ PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512_END
282
+ RET
283
+%endif
284
;-----------------------------------------------------------------------------
285
; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
286
;-----------------------------------------------------------------------------
287
288
%endif ; ARCH_X86_64 == 1
289
290
291
+%if ARCH_X86_64 == 1
292
+INIT_ZMM avx512
293
+cglobal quant, 5, 6, 22
294
+ ; fill qbits
295
+ movd xm4, r4d ; m4 = qbits
296
+
297
+ ; fill qbits-8
298
+ sub r4d, 8
299
+ movd xm6, r4d ; m6 = qbits8
300
+
301
+ ; fill offset
302
+%if UNIX64 == 0
303
+ vpbroadcastd m5, r5m ; m5 = add
304
+%else ; Mac
305
+ movd xm5, r5m
306
+ vpbroadcastd m5, xm5 ; m5 = add
307
+%endif
308
+
309
+ vbroadcasti32x8 m9, [pw_1]
310
+
311
+ mov r4d, r6m
312
+ pxor m7, m7
313
+ sub r4d, 32
314
+ jl .coeff16
315
+ add r4d, 32
316
+ shr r4d, 5
317
+ jmp .loop
318
+
319
+.coeff16:
320
+ ; 16 coeff
321
+ pxor m7, m7
322
+ pmovsxwd m16, [r0] ; m16 = level
323
+ pabsd m1, m16
324
+ pmulld m1, [r1]
325
+ paddd m17, m1, m5
326
+ psrad m17, xm4 ; m17 = level1
327
+
328
+ pslld m3, m17, 8
329
+ psrad m1, xm6
330
+ psubd m1, m3 ; m1 = deltaU1
331
+ movu [r2], m1
332
+ vextracti64x4 ym19, m17, 1
333
+ vextracti64x4 ym20, m16, 1
334
+ psignd ym17, ym16
335
+ psignd ym19, ym20
336
+ packssdw ym17, ym19
337
+ vpermq ym17, ym17, q3120
338
+ movu [r3], ym17
339
+
340
+ pminuw ym17, ym9
341
+ paddw ym7, ym17
342
+
343
+ ; sum count
344
+ xorpd m0, m0
345
+ psadbw ym7, ym0
346
+ vextracti128 xm1, ym7, 1
347
+ paddd xm7, xm1
348
+ movhlps xm0, xm7
349
+ paddd xm7, xm0
350
+ movd eax, xm7
351
+ RET
352
+
353
+.loop:
354
+ ; 16 coeff
355
+ pmovsxwd m16, [r0] ; m16 = level
356
+ pabsd m1, m16
357
+ pmulld m1, [r1]
358
+ paddd m17, m1, m5
359
+ psrad m17, xm4 ; m17 = level1
360
+
361
+ pslld m3, m17, 8
362
+ psrad m1, xm6
363
+ psubd m1, m3 ; m1 = deltaU1
364
+ movu [r2], m1
365
+ vextracti64x4 ym19, m17, 1
366
+ vextracti64x4 ym20, m16, 1
367
+ psignd ym17, ym16
368
+ psignd ym19, ym20
369
+ packssdw ym17, ym19
370
+
371
+ ; 16 coeff
372
+ pmovsxwd m16, [r0 + mmsize/2] ; m16 = level
373
+ pabsd m1, m16
374
+ pmulld m1, [r1 + mmsize]
375
+ paddd m18, m1, m5
376
+ psrad m18, xm4 ; m2 = level1
377
+
378
+ pslld m8, m18, 8
379
+ psrad m1, xm6
380
+ psubd m1, m8 ; m1 = deltaU1
381
+ movu [r2 + mmsize], m1
382
+ vextracti64x4 ym21, m18, 1
383
+ vextracti64x4 ym20, m16, 1
384
+ psignd ym18, ym16
385
+ psignd ym21, ym20
386
+ packssdw ym18, ym21
387
+ vinserti64x4 m17, m17, ym18, 1
388
+ vpermq m17, m17, q3120
389
+
390
+ movu [r3], m17
391
+
392
+ pminuw m17, m9
393
+ paddw m7, m17
394
+
395
+ add r0, mmsize
396
+ add r1, mmsize * 2
397
+ add r2, mmsize * 2
398
+ add r3, mmsize
399
+
400
+ dec r4d
401
+ jnz .loop
402
+
403
+ ; sum count
404
+ xorpd m0, m0
405
+ psadbw m7, m0
406
+ vextracti32x8 ym1, m7, 1
407
+ paddd ym7, ym1
408
+ vextracti64x2 xm1, m7, 1
409
+ paddd xm7, xm1
410
+ pshufd xm1, xm7, 2
411
+ paddd xm7, xm1
412
+ movd eax, xm7
413
+ RET
414
+%endif ; ARCH_X86_64 == 1
415
+
416
+
417
+
418
;-----------------------------------------------------------------------------
419
; uint32_t nquant(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
420
;-----------------------------------------------------------------------------
421
422
paddd xm5, xm0
423
movd eax, xm5
424
RET
425
+%if ARCH_X86_64 == 1
426
+INIT_ZMM avx512
427
+cglobal nquant, 3,5,22
428
+%if UNIX64 == 0
429
+ vpbroadcastd m4, r4m
430
+%else ; Mac
431
+ movd xm4, r4m
432
+ vpbroadcastd m4, xm4
433
+%endif
434
435
+ vbroadcasti32x8 m6, [pw_1]
436
+ mov r4d, r5m
437
+ pxor m5, m5
438
+ movd xm3, r3m
439
+ sub r4d, 16
440
+ je .coeff16
441
+ add r4d, 16
442
+ shr r4d, 5
443
+ jmp .loop
444
+
445
+.coeff16:
446
+ pmovsxwd m16, [r0]
447
+ pabsd m17, m16
448
+ pmulld m17, [r1]
449
+ paddd m17, m4
450
+ psrad m17, xm3
451
+
452
+ vextracti64x4 ym19, m17, 1
453
+ vextracti64x4 ym20, m16, 1
454
+ psignd ym17, ym16
455
+ psignd ym19, ym20
456
+ packssdw ym17, ym19
457
+ vpermq ym17, ym17, q3120
458
+ pabsw ym17, ym17
459
+ movu [r2], ym17
460
+ pminuw ym17, ym6
461
+ paddw ym5, ym17
462
+ pxor m0, m0
463
+ psadbw ym5, ym0
464
+ vextracti128 xm0, ym5, 1
465
+ paddd xm5, xm0
466
+ pshufd xm0, xm5, 2
467
+ paddd xm5, xm0
468
+ movd eax, xm5
469
+ RET
470
+
471
+.loop:
472
+ pmovsxwd m16, [r0]
473
+ pabsd m17, m16
474
+ pmulld m17, [r1]
475
+ paddd m17, m4
476
+ psrad m17, xm3
477
+ vextracti64x4 ym19, m17, 1
478
+ vextracti64x4 ym20, m16, 1
479
+ psignd ym17, ym16
480
+ psignd ym19, ym20
481
+ packssdw ym17, ym19
482
+
483
+ pmovsxwd m16, [r0 + mmsize/2]
484
+ pabsd m18, m16
485
+ pmulld m18, [r1 + mmsize]
486
+ paddd m18, m4
487
+ psrad m18, xm3
488
+ vextracti64x4 ym21, m18, 1
489
+ vextracti64x4 ym20, m16, 1
490
+ psignd ym18, ym16
491
+ psignd ym21, ym20
492
+ packssdw ym18, ym21
493
+ vinserti64x4 m17, m17, ym18, 1
494
+ vpermq m17, m17, q3120
495
+
496
+ pabsw m17, m17
497
+ movu [r2], m17
498
+
499
+ add r0, mmsize
500
+ add r1, mmsize * 2
501
+ add r2, mmsize
502
+
503
+ pminuw m17, m6
504
+ paddw m5, m17
505
+
506
+ dec r4d
507
+ jnz .loop
508
+
509
+ pxor m0, m0
510
+ psadbw m5, m0
511
+ vextracti32x8 ym1, m5, 1
512
+ paddd ym5, ym1
513
+ vextracti64x2 xm1, m5, 1
514
+ paddd xm5, xm1
515
+ pshufd xm1, xm5, 2
516
+ paddd xm5, xm1
517
+ movd eax, xm5
518
+ RET
519
+%endif ; ARCH_X86_64 == 1
520
521
;-----------------------------------------------------------------------------
522
; void dequant_normal(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift)
523
524
jnz .loop
525
RET
526
527
+;----------------------------------------------------------------------------------------------------------------------
528
+;void dequant_scaling(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)
529
+;----------------------------------------------------------------------------------------------------------------------
530
+INIT_ZMM avx512
531
+cglobal dequant_scaling, 6,7,8
532
+ mova m6, [dequant_shuf1_avx512]
533
+ mova m7, [dequant_shuf2_avx512]
534
+ add r5d, 4
535
+ mov r6d, r3d
536
+ shr r3d, 5 ; num/32
537
+ cmp r5d, r4d
538
+ jle .skip
539
+ sub r5d, r4d
540
+ vpbroadcastd m0, [pd_1]
541
+ movd xm1, r5d ; shift - per
542
+ dec r5d
543
+ movd xm2, r5d ; shift - per - 1
544
+ pslld m0, xm2 ; 1 << shift - per - 1
545
+
546
+.part0:
547
+ pmovsxwd m2, [r0]
548
+ pmovsxwd m4, [r0 + 32]
549
+ movu m3, [r1]
550
+ movu m5, [r1 + 64]
551
+ pmulld m2, m3
552
+ pmulld m4, m5
553
+ paddd m2, m0
554
+ paddd m4, m0
555
+ psrad m2, xm1
556
+ psrad m4, xm1
557
+ packssdw m2, m4
558
+ vpermq m2, m6, m2
559
+ cmp r6d, 16
560
+ je .num16part0
561
+ movu [r2], m2
562
+
563
+ add r0, 64
564
+ add r1, 128
565
+ add r2, 64
566
+ dec r3d
567
+ jnz .part0
568
+ jmp .end
569
+
570
+.num16part0:
571
+ movu [r2], ym2
572
+ jmp .end
573
+
574
+.skip:
575
+ sub r4d, r5d ; per - shift
576
+ movd xm0, r4d
577
+
578
+.part1:
579
+ pmovsxwd m2, [r0]
580
+ pmovsxwd m4, [r0 + 32]
581
+ movu m3, [r1]
582
+ movu m5, [r1 + 64]
583
+ pmulld m2, m3
584
+ pmulld m4, m5
585
+ packssdw m2, m4
586
+
587
+ vextracti32x8 ym4, m2, 1
588
+ pmovsxwd m1, ym2
589
+ pmovsxwd m2, ym4
590
+ pslld m1, xm0
591
+ pslld m2, xm0
592
+ packssdw m1, m2
593
+
594
+ vpermq m1, m7, m1
595
+ cmp r6d, 16
596
+ je .num16part1
597
+ movu [r2], m1
598
+
599
+ add r0, 64
600
+ add r1, 128
601
+ add r2, 64
602
+ dec r3d
603
+ jnz .part1
604
+
605
+.num16part1:
606
+ movu [r2], ym1
607
+
608
+.end:
609
+ RET
610
+
611
+INIT_ZMM avx512
612
+cglobal dequant_normal, 5,5,7
613
+ vpbroadcastd m2, [pw_1] ; m2 = word [1]
614
+ vpbroadcastd m5, [pd_32767] ; m5 = dword [32767]
615
+ vpbroadcastd m6, [pd_n32768] ; m6 = dword [-32768]
616
+%if HIGH_BIT_DEPTH
617
+ cmp r3d, 32767
618
+ jle .skip
619
+ shr r3d, (BIT_DEPTH - 8)
620
+ sub r4d, (BIT_DEPTH - 8)
621
+.skip:
622
+%endif
623
+ movd xm0, r4d ; m0 = shift
624
+ add r4d, -1+16
625
+ bts r3d, r4d
626
+
627
+ movd xm1, r3d
628
+ vpbroadcastd m1, xm1 ; m1 = dword [add scale]
629
+
630
+ ; m0 = shift
631
+ ; m1 = scale
632
+ ; m2 = word [1]
633
+ mov r3d, r2d
634
+ shr r2d, 5
635
+.loop:
636
+ movu m3, [r0]
637
+ punpckhwd m4, m3, m2
638
+ punpcklwd m3, m2
639
+ pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add)
640
+ pmaddwd m4, m1
641
+ psrad m3, xm0
642
+ psrad m4, xm0
643
+ pminsd m3, m5
644
+ pmaxsd m3, m6
645
+ pminsd m4, m5
646
+ pmaxsd m4, m6
647
+ packssdw m3, m4
648
+
649
+ mova [r1 + 0 * mmsize/2], ym3
650
+ cmp r3d, 16
651
+ je .num16
652
+ vextracti32x8 [r1 + 1 * mmsize/2], m3, 1
653
+
654
+ add r0, mmsize
655
+ add r1, mmsize
656
+
657
+ dec r2d
658
+ jnz .loop
659
+ RET
660
+.num16:
661
+ RET
662
+
663
664
;-----------------------------------------------------------------------------
665
; int x265_count_nonzero_4x4_sse2(const int16_t *quantCoeff);
666
667
movd eax, xm0
668
RET
669
670
+;-----------------------------------------------------------------------------
671
+; int x265_count_nonzero_16x16_avx512(const int16_t *quantCoeff);
672
+;-----------------------------------------------------------------------------
673
+%if ARCH_X86_64
674
+INIT_ZMM avx512
675
+cglobal count_nonzero_16x16, 1,4,2
676
+ mov r1, 0xFFFFFFFFFFFFFFFF
677
+ kmovq k2, r1
678
+ xor r3, r3
679
+ pxor m0, m0
680
681
+%assign x 0
682
+%rep 4
683
+ movu m1, [r0 + x]
684
+ vpacksswb m1, [r0 + x + 64]
685
+%assign x x+128
686
+ vpcmpb k1 {k2}, m1, m0, 00000100b
687
+ kmovq r1, k1
688
+ popcnt r2, r1
689
+ add r3d, r2d
690
+%endrep
691
+ mov eax, r3d
692
+ RET
693
+%endif
694
;-----------------------------------------------------------------------------
695
; int x265_count_nonzero_32x32_sse2(const int16_t *quantCoeff);
696
;-----------------------------------------------------------------------------
697
698
RET
699
700
701
+;-----------------------------------------------------------------------------
702
+; int x265_count_nonzero_32x32_avx512(const int16_t *quantCoeff);
703
+;-----------------------------------------------------------------------------
704
+%if ARCH_X86_64
705
+INIT_ZMM avx512
706
+cglobal count_nonzero_32x32, 1,4,2
707
+ mov r1, 0xFFFFFFFFFFFFFFFF
708
+ kmovq k2, r1
709
+ xor r3, r3
710
+ pxor m0, m0
711
+
712
+%assign x 0
713
+%rep 16
714
+ movu m1, [r0 + x]
715
+ vpacksswb m1, [r0 + x + 64]
716
+%assign x x+128
717
+ vpcmpb k1 {k2}, m1, m0, 00000100b
718
+ kmovq r1, k1
719
+ popcnt r2, r1
720
+ add r3d, r2d
721
+%endrep
722
+ mov eax, r3d
723
+ RET
724
+%endif
725
;-----------------------------------------------------------------------------------------------------------------------------------------------
726
;void weight_pp(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
727
;-----------------------------------------------------------------------------------------------------------------------------------------------
728
729
jnz .loopH
730
RET
731
%endif
732
+
733
+%if HIGH_BIT_DEPTH
734
+INIT_ZMM avx512
735
+cglobal weight_pp, 6, 7, 7
736
+%define correction (14 - BIT_DEPTH)
737
+ mov r6d, r6m
738
+ shl r6d, 16 - correction
739
+ or r6d, r5d
740
+
741
+ movd xm0, r6d
742
+ vpbroadcastd m0, xm0
743
+ mov r5d, r7m
744
+ sub r5d, correction
745
+ movd xm1, r5d
746
+
747
+ vpbroadcastd m2, r8m
748
+ vbroadcasti32x8 m5, [pw_1]
749
+ vbroadcasti32x8 m6, [pw_pixel_max]
750
+
751
+ add r2d, r2d
752
+ add r3d, r3d
753
+ sub r2d, r3d
754
+ shr r3d, 6
755
+
756
+.loopH:
757
+ mov r5d, r3d
758
+
759
+.loopW:
760
+ movu m4, [r0]
761
+ punpcklwd m3, m4, m5
762
+ pmaddwd m3, m0
763
+ psrad m3, xm1
764
+ paddd m3, m2
765
+
766
+ punpckhwd m4, m5
767
+ pmaddwd m4, m0
768
+ psrad m4, xm1
769
+ paddd m4, m2
770
+
771
+ packusdw m3, m4
772
+ pminuw m3, m6
773
+ movu [r1], m3
774
+
775
+ add r0, 64
776
+ add r1, 64
777
+
778
+ dec r5d
779
+ jnz .loopW
780
+
781
+ lea r0, [r0 + r2]
782
+ lea r1, [r1 + r2]
783
+
784
+ dec r4d
785
+ jnz .loopH
786
+%undef correction
787
+ RET
788
+%else
789
+INIT_ZMM avx512
790
+cglobal weight_pp, 6, 7, 6
791
+
792
+ shl r5d, 6
793
+ mov r6d, r6m
794
+ shl r6d, 16
795
+ or r6d, r5d
796
+
797
+ movd xm0, r6d
798
+ vpbroadcastd m0, xm0
799
+ movd xm1, r7m
800
+ vpbroadcastd m2, r8m
801
+
802
+ vbroadcasti32x8 m5, [pw_1]
803
+
804
+ sub r2d, r3d
805
+ shr r3d, 5
806
+
807
+.loopH:
808
+ mov r5d, r3d
809
+
810
+.loopW:
811
+ pmovzxbw m4, [r0]
812
+ punpcklwd m3, m4, m5
813
+ pmaddwd m3, m0
814
+ psrad m3, xm1
815
+ paddd m3, m2
816
+
817
+ punpckhwd m4, m5
818
+ pmaddwd m4, m0
819
+ psrad m4, xm1
820
+ paddd m4, m2
821
+
822
+ packssdw m3, m4
823
+ vextracti64x4 ym4, m3, 1
824
+ packuswb ym3, ym4
825
+ vpermq ym3, ym3, q3120
826
+ movu [r1], ym3
827
+
828
+ add r0, 32
829
+ add r1, 32
830
+
831
+ dec r5d
832
+ jnz .loopW
833
+
834
+ lea r0, [r0 + r2]
835
+ lea r1, [r1 + r2]
836
+
837
+ dec r4d
838
+ jnz .loopH
839
+ RET
840
+%endif
841
+
842
;-------------------------------------------------------------------------------------------------------------------------------------------------
843
;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
844
;-------------------------------------------------------------------------------------------------------------------------------------------------
845
846
%endif
847
%endif
848
849
+%if ARCH_X86_64 == 1
850
+%if HIGH_BIT_DEPTH
851
+INIT_ZMM avx512
852
+cglobal weight_sp, 6,9,8
853
+ vbroadcasti32x8 m1, [pw_pixel_max]
854
+ vbroadcasti32x8 m2, [pw_1]
855
+
856
+ mov r6d, r7m
857
+ shl r6d, 16
858
+ or r6d, r6m
859
+ movd xm3, r6d
860
+ vpbroadcastd m3, xm3 ; m3 = [round w0]
861
+ movd xm4, r8m ; m4 = [shift]
862
+ vpbroadcastd m5, r9m ; m5 = [offset]
863
+
864
+ ; correct row stride
865
+ add r3d, r3d
866
+ add r2d, r2d
867
+ mov r6d, r4d
868
+ and r6d, ~(mmsize / SIZEOF_PIXEL - 1)
869
+ shl r6d, 1
870
+ sub r3d, r6d
871
+ sub r2d, r6d
872
+
873
+ mov r6d, r4d
874
+ and r6d, (mmsize / SIZEOF_PIXEL - 1)
875
+
876
+.loopH:
877
+ mov r6d, r4d
878
+
879
+.loopW:
880
+ movu m6, [r0]
881
+ vbroadcasti32x8 m8, [pw_2000]
882
+ paddw m6, m8
883
+
884
+ punpcklwd m7, m6, m2
885
+ pmaddwd m7, m3 ;(round w0)
886
+ psrad m7, xm4 ;(shift)
887
+ paddd m7, m5 ;(offset)
888
+
889
+ punpckhwd m6, m2
890
+ pmaddwd m6, m3
891
+ psrad m6, xm4
892
+ paddd m6, m5
893
+
894
+ packusdw m7, m6
895
+ pminuw m7, m1
896
+
897
+ sub r6d, (mmsize / SIZEOF_PIXEL)
898
+ jl .widthLess30
899
+ movu [r1], m7
900
+ lea r0, [r0 + mmsize]
901
+ lea r1, [r1 + mmsize]
902
+ je .nextH
903
+ jmp .loopW
904
+
905
+.widthLess30:
906
+ mov r8d, 0xFFFFFFFF
907
+ NEG r6d
908
+ shrx r8d, r8d, r6d
909
+ kmovd k1, r8d
910
+ vmovdqu16 [r1] {k1}, m7
911
+ jmp .nextH
912
+
913
+.nextH:
914
+ add r0, r2
915
+ add r1, r3
916
+
917
+ dec r5d
918
+ jnz .loopH
919
+ RET
920
+
921
+%else
922
+INIT_ZMM avx512
923
+cglobal weight_sp, 6, 10, 7
924
+ mov r7d, r7m
925
+ shl r7d, 16
926
+ or r7d, r6m
927
+ movd xm0, r7d
928
+ vpbroadcastd m0, xm0 ; m0 = times 8 dw w0, round
929
+ movd xm1, r8m ; m1 = [shift]
930
+ vpbroadcastd m2, r9m ; m2 = times 16 dw offset
931
+ vpbroadcastw m3, [pw_1]
932
+ vpbroadcastw m4, [pw_2000]
933
+
934
+ add r2d, r2d ; 2 * srcstride
935
+
936
+ mov r7, r0
937
+ mov r8, r1
938
+.loopH:
939
+ mov r6d, r4d ; width
940
+
941
+ ; save old src and dst
942
+ mov r0, r7 ; src
943
+ mov r1, r8 ; dst
944
+
945
+.loopW:
946
+ movu m5, [r0]
947
+ paddw m5, m4
948
+
949
+ punpcklwd m6, m5, m3
950
+ pmaddwd m6, m0
951
+ psrad m6, xm1
952
+ paddd m6, m2
953
+
954
+ punpckhwd m5, m3
955
+ pmaddwd m5, m0
956
+ psrad m5, xm1
957
+ paddd m5, m2
958
+
959
+ packssdw m6, m5
960
+ vextracti64x4 ym5, m6, 1
961
+ packuswb ym6, ym5
962
+ vpermq ym6, ym6, q3120
963
+
964
+ sub r6d, 32
965
+ jl .widthLess30
966
+ movu [r1], ym6
967
+ je .nextH
968
+ add r0, 64
969
+ add r1, 32
970
+ jmp .loopW
971
+
972
+
973
+.widthLess30:
974
+ mov r9d, 0xFFFFFFFF
975
+ NEG r6d
976
+ shrx r9d, r9d, r6d
977
+ kmovd k1, r9d
978
+ vmovdqu8 [r1] {k1}, ym6
979
+ jmp .nextH
980
+
981
+.nextH:
982
+ lea r7, [r7 + r2]
983
+ lea r8, [r8 + r3]
984
+
985
+ dec r5d
986
+ jnz .loopH
987
+ RET
988
+%endif
989
+%endif
990
+
991
+
992
;-----------------------------------------------------------------
993
; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
994
;-----------------------------------------------------------------
995
996
RET
997
%endif
998
999
+%if HIGH_BIT_DEPTH == 0
1000
+INIT_ZMM avx512
1001
+cglobal scale1D_128to64, 2, 2, 7
1002
+ pxor m4, m4
1003
+ mova m6, [dequant_shuf1_avx512]
1004
+ vbroadcasti32x8 m5, [pb_1]
1005
+
1006
+ ;Top pixel
1007
+ movu m0, [r1]
1008
+ movu m1, [r1 + 1 * mmsize]
1009
+ movu m2, [r1 + 2 * mmsize]
1010
+ movu m3, [r1 + 3 * mmsize]
1011
+
1012
+ pmaddubsw m0, m5
1013
+ pavgw m0, m4
1014
+ pmaddubsw m1, m5
1015
+ pavgw m1, m4
1016
+ packuswb m0, m1
1017
+ vpermq m0, m6, m0
1018
+ movu [r0], m0
1019
+
1020
+ ;Left pixel
1021
+ pmaddubsw m2, m5
1022
+ pavgw m2, m4
1023
+ pmaddubsw m3, m5
1024
+ pavgw m3, m4
1025
+ packuswb m2, m3
1026
+ vpermq m2, m6, m2
1027
+ movu [r0 + mmsize], m2
1028
+ RET
1029
+
1030
+INIT_ZMM avx512
1031
+cglobal scale1D_128to64_aligned, 2, 2, 7
1032
+ pxor m4, m4
1033
+ mova m6, [dequant_shuf1_avx512]
1034
+ vbroadcasti32x8 m5, [pb_1]
1035
+
1036
+ ;Top pixel
1037
+ mova m0, [r1]
1038
+ mova m1, [r1 + 1 * mmsize]
1039
+ mova m2, [r1 + 2 * mmsize]
1040
+ mova m3, [r1 + 3 * mmsize]
1041
+
1042
+ pmaddubsw m0, m5
1043
+ pavgw m0, m4
1044
+ pmaddubsw m1, m5
1045
+ pavgw m1, m4
1046
+ packuswb m0, m1
1047
+ vpermq m0, m6, m0
1048
+ mova [r0], m0
1049
+
1050
+ ;Left pixel
1051
+ pmaddubsw m2, m5
1052
+ pavgw m2, m4
1053
+ pmaddubsw m3, m5
1054
+ pavgw m3, m4
1055
+ packuswb m2, m3
1056
+ vpermq m2, m6, m2
1057
+ mova [r0 + mmsize], m2
1058
+ RET
1059
+%endif
1060
+
1061
;-----------------------------------------------------------------
1062
; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
1063
;-----------------------------------------------------------------
1064
1065
PIXELSUB_PS_W32_H8_avx2 32, 64
1066
%endif
1067
1068
+%macro PROCESS_SUB_PS_32x8_AVX512 0
1069
+ pmovzxbw m0, [r2]
1070
+ pmovzxbw m1, [r3]
1071
+ pmovzxbw m2, [r2 + r4]
1072
+ pmovzxbw m3, [r3 + r5]
1073
+ pmovzxbw m4, [r2 + 2 * r4]
1074
+ pmovzxbw m5, [r3 + 2 * r5]
1075
+ pmovzxbw m6, [r2 + r7]
1076
+ pmovzxbw m7, [r3 + r8]
1077
+
1078
+ psubw m0, m1
1079
+ psubw m2, m3
1080
+ psubw m4, m5
1081
+ psubw m6, m7
1082
+
1083
+ movu [r0], m0
1084
+ movu [r0 + r1], m2
1085
+ movu [r0 + r1 * 2 ], m4
1086
+ movu [r0 + r9], m6
1087
+
1088
+ lea r2, [r2 + r4 * 4]
1089
+ lea r3, [r3 + r5 * 4]
1090
+ lea r0, [r0 + r1 * 4]
1091
+
1092
+ pmovzxbw m0, [r2]
1093
+ pmovzxbw m1, [r3]
1094
+ pmovzxbw m2, [r2 + r4]
1095
+ pmovzxbw m3, [r3 + r5]
1096
+ pmovzxbw m4, [r2 + 2 * r4]
1097
+ pmovzxbw m5, [r3 + 2 * r5]
1098
+ pmovzxbw m6, [r2 + r7]
1099
+ pmovzxbw m7, [r3 + r8]
1100
+
1101
+ psubw m0, m1
1102
+ psubw m2, m3
1103
+ psubw m4, m5
1104
+ psubw m6, m7
1105
+
1106
+ movu [r0], m0
1107
+ movu [r0 + r1], m2
1108
+ movu [r0 + r1 * 2 ], m4
1109
+ movu [r0 + r9], m6
1110
+%endmacro
1111
+
1112
+%macro PROCESS_SUB_PS_32x8_HBD_AVX512 0
1113
+ movu m0, [r2]
1114
+ movu m1, [r3]
1115
+ movu m2, [r2 + r4]
1116
+ movu m3, [r3 + r5]
1117
+ psubw m0, m1
1118
+ psubw m2, m3
1119
+
1120
+ movu [r0], m0
1121
+ movu [r0 + r1], m2
1122
+
1123
+ movu m0, [r2 + r4 * 2]
1124
+ movu m1, [r3 + r5 * 2]
1125
+ movu m2, [r2 + r7]
1126
+ movu m3, [r3 + r8]
1127
+ psubw m0, m1
1128
+ psubw m2, m3
1129
+
1130
+ movu [r0 + r1 * 2], m0
1131
+ movu [r0 + r6], m2
1132
+
1133
+ lea r0, [r0 + r1 * 4]
1134
+ lea r2, [r2 + r4 * 4]
1135
+ lea r3, [r3 + r5 * 4]
1136
+
1137
+ movu m0, [r2]
1138
+ movu m1, [r3]
1139
+ movu m2, [r2 + r4]
1140
+ movu m3, [r3 + r5]
1141
+ psubw m0, m1
1142
+ psubw m2, m3
1143
+
1144
+ movu [r0], m0
1145
+ movu [r0 + r1], m2
1146
+
1147
+ movu m0, [r2 + r4 * 2]
1148
+ movu m1, [r3 + r5 * 2]
1149
+ movu m2, [r2 + r7]
1150
+ movu m3, [r3 + r8]
1151
+ psubw m0, m1
1152
+ psubw m2, m3
1153
+
1154
+ movu [r0 + r1 * 2], m0
1155
+ movu [r0 + r6], m2
1156
+%endmacro
1157
+
1158
+;-----------------------------------------------------------------------------
1159
+; void pixel_sub_ps_32x32(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
1160
+;-----------------------------------------------------------------------------
1161
+%if HIGH_BIT_DEPTH
1162
+%if ARCH_X86_64
1163
+INIT_ZMM avx512
1164
+cglobal pixel_sub_ps_32x32, 6, 9, 4
1165
+ add r1d, r1d
1166
+ add r4d, r4d
1167
+ add r5d, r5d
1168
+ lea r6, [r1 * 3]
1169
+ lea r7, [r4 * 3]
1170
+ lea r8, [r5 * 3]
1171
+ PROCESS_SUB_PS_32x8_HBD_AVX512
1172
+ lea r0, [r0 + r1 * 4]
1173
+ lea r2, [r2 + r4 * 4]
1174
+ lea r3, [r3 + r5 * 4]
1175
+ PROCESS_SUB_PS_32x8_HBD_AVX512
1176
+ lea r0, [r0 + r1 * 4]
1177
+ lea r2, [r2 + r4 * 4]
1178
+ lea r3, [r3 + r5 * 4]
1179
+ PROCESS_SUB_PS_32x8_HBD_AVX512
1180
+ lea r0, [r0 + r1 * 4]
1181
+ lea r2, [r2 + r4 * 4]
1182
+ lea r3, [r3 + r5 * 4]
1183
+ PROCESS_SUB_PS_32x8_HBD_AVX512
1184
+ RET
1185
+
1186
+cglobal pixel_sub_ps_32x64, 6, 9, 4
1187
+ add r1d, r1d
1188
+ add r4d, r4d
1189
+ add r5d, r5d
1190
+ lea r6, [r1 * 3]
1191
+ lea r7, [r4 * 3]
1192
+ lea r8, [r5 * 3]
1193
+ PROCESS_SUB_PS_32x8_HBD_AVX512
1194
+ lea r0, [r0 + r1 * 4]
1195
+ lea r2, [r2 + r4 * 4]
1196
+ lea r3, [r3 + r5 * 4]
1197
+ PROCESS_SUB_PS_32x8_HBD_AVX512
1198
+ lea r0, [r0 + r1 * 4]
1199
+ lea r2, [r2 + r4 * 4]
1200
+ lea r3, [r3 + r5 * 4]
1201
+ PROCESS_SUB_PS_32x8_HBD_AVX512
1202
+ lea r0, [r0 + r1 * 4]
1203
+ lea r2, [r2 + r4 * 4]
1204
+ lea r3, [r3 + r5 * 4]
1205
+ PROCESS_SUB_PS_32x8_HBD_AVX512
1206
+ lea r0, [r0 + r1 * 4]
1207
+ lea r2, [r2 + r4 * 4]
1208
+ lea r3, [r3 + r5 * 4]
1209
+ PROCESS_SUB_PS_32x8_HBD_AVX512
1210
+ lea r0, [r0 + r1 * 4]
1211
+ lea r2, [r2 + r4 * 4]
1212
+ lea r3, [r3 + r5 * 4]
1213
+ PROCESS_SUB_PS_32x8_HBD_AVX512
1214
+ lea r0, [r0 + r1 * 4]
1215
+ lea r2, [r2 + r4 * 4]
1216
+ lea r3, [r3 + r5 * 4]
1217
+ PROCESS_SUB_PS_32x8_HBD_AVX512
1218
+ lea r0, [r0 + r1 * 4]
1219
+ lea r2, [r2 + r4 * 4]
1220
+ lea r3, [r3 + r5 * 4]
1221
+ PROCESS_SUB_PS_32x8_HBD_AVX512
1222
+ RET
1223
+%endif
1224
+%else
1225
+%if ARCH_X86_64
1226
+INIT_ZMM avx512
1227
+cglobal pixel_sub_ps_32x32, 6, 10, 8
1228
+ add r1, r1
1229
+ lea r7, [r4 * 3]
1230
+ lea r8, [r5 * 3]
1231
+ lea r9, [r1 * 3]
1232
+
1233
+ PROCESS_SUB_PS_32x8_AVX512
1234
+ lea r2, [r2 + r4 * 4]
1235
+ lea r3, [r3 + r5 * 4]
1236
+ lea r0, [r0 + r1 * 4]
1237
+ PROCESS_SUB_PS_32x8_AVX512
1238
+ lea r2, [r2 + r4 * 4]
1239
+ lea r3, [r3 + r5 * 4]
1240
+ lea r0, [r0 + r1 * 4]
1241
+ PROCESS_SUB_PS_32x8_AVX512
1242
+ lea r2, [r2 + r4 * 4]
1243
+ lea r3, [r3 + r5 * 4]
1244
+ lea r0, [r0 + r1 * 4]
1245
+ PROCESS_SUB_PS_32x8_AVX512
1246
+ RET
1247
+
1248
+INIT_ZMM avx512
1249
+cglobal pixel_sub_ps_32x64, 6, 10, 8
1250
+ add r1, r1
1251
+ lea r7, [r4 * 3]
1252
+ lea r8, [r5 * 3]
1253
+ lea r9, [r1 * 3]
1254
+
1255
+ PROCESS_SUB_PS_32x8_AVX512
1256
+ lea r2, [r2 + r4 * 4]
1257
+ lea r3, [r3 + r5 * 4]
1258
+ lea r0, [r0 + r1 * 4]
1259
+ PROCESS_SUB_PS_32x8_AVX512
1260
+ lea r2, [r2 + r4 * 4]
1261
+ lea r3, [r3 + r5 * 4]
1262
+ lea r0, [r0 + r1 * 4]
1263
+ PROCESS_SUB_PS_32x8_AVX512
1264
+ lea r2, [r2 + r4 * 4]
1265
+ lea r3, [r3 + r5 * 4]
1266
+ lea r0, [r0 + r1 * 4]
1267
+ PROCESS_SUB_PS_32x8_AVX512
1268
+ lea r2, [r2 + r4 * 4]
1269
+ lea r3, [r3 + r5 * 4]
1270
+ lea r0, [r0 + r1 * 4]
1271
+ PROCESS_SUB_PS_32x8_AVX512
1272
+ lea r2, [r2 + r4 * 4]
1273
+ lea r3, [r3 + r5 * 4]
1274
+ lea r0, [r0 + r1 * 4]
1275
+ PROCESS_SUB_PS_32x8_AVX512
1276
+ lea r2, [r2 + r4 * 4]
1277
+ lea r3, [r3 + r5 * 4]
1278
+ lea r0, [r0 + r1 * 4]
1279
+ PROCESS_SUB_PS_32x8_AVX512
1280
+ lea r2, [r2 + r4 * 4]
1281
+ lea r3, [r3 + r5 * 4]
1282
+ lea r0, [r0 + r1 * 4]
1283
+ PROCESS_SUB_PS_32x8_AVX512
1284
+ RET
1285
+%endif
1286
+%endif
1287
+
1288
;-----------------------------------------------------------------------------
1289
; void pixel_sub_ps_64x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
1290
;-----------------------------------------------------------------------------
1291
1292
jnz .loop
1293
RET
1294
%endif
1295
+
1296
+%macro PROCESS_SUB_PS_64x8_AVX512 0
1297
+ pmovzxbw m0, [r2]
1298
+ pmovzxbw m1, [r2 + 32]
1299
+ pmovzxbw m2, [r3]
1300
+ pmovzxbw m3, [r3 + 32]
1301
+ pmovzxbw m4, [r2 + r4]
1302
+ pmovzxbw m5, [r2 + r4 + 32]
1303
+ pmovzxbw m6, [r3 + r5]
1304
+ pmovzxbw m7, [r3 + r5 + 32]
1305
+
1306
+ psubw m0, m2
1307
+ psubw m1, m3
1308
+ psubw m4, m6
1309
+ psubw m5, m7
1310
+ movu [r0], m0
1311
+ movu [r0 + 64], m1
1312
+ movu [r0 + 2 * r1], m4
1313
+ movu [r0 + 2 * r1 + 64], m5
1314
+
1315
+ lea r0, [r0 + 4 * r1]
1316
+ lea r2, [r2 + 2 * r4]
1317
+ lea r3, [r3 + 2 * r5]
1318
+
1319
+ pmovzxbw m0, [r2]
1320
+ pmovzxbw m1, [r2 + 32]
1321
+ pmovzxbw m2, [r3]
1322
+ pmovzxbw m3, [r3 + 32]
1323
+ pmovzxbw m4, [r2 + r4]
1324
+ pmovzxbw m5, [r2 + r4 + 32]
1325
+ pmovzxbw m6, [r3 + r5]
1326
+ pmovzxbw m7, [r3 + r5 + 32]
1327
+
1328
+ psubw m0, m2
1329
+ psubw m1, m3
1330
+ psubw m4, m6
1331
+ psubw m5, m7
1332
+ movu [r0], m0
1333
+ movu [r0 + 64], m1
1334
+ movu [r0 + 2 * r1], m4
1335
+ movu [r0 + 2 * r1 + 64], m5
1336
+
1337
+ lea r0, [r0 + 4 * r1]
1338
+ lea r2, [r2 + 2 * r4]
1339
+ lea r3, [r3 + 2 * r5]
1340
+
1341
+ pmovzxbw m0, [r2]
1342
+ pmovzxbw m1, [r2 + 32]
1343
+ pmovzxbw m2, [r3]
1344
+ pmovzxbw m3, [r3 + 32]
1345
+ pmovzxbw m4, [r2 + r4]
1346
+ pmovzxbw m5, [r2 + r4 + 32]
1347
+ pmovzxbw m6, [r3 + r5]
1348
+ pmovzxbw m7, [r3 + r5 + 32]
1349
+
1350
+ psubw m0, m2
1351
+ psubw m1, m3
1352
+ psubw m4, m6
1353
+ psubw m5, m7
1354
+ movu [r0], m0
1355
+ movu [r0 + 64], m1
1356
+ movu [r0 + 2 * r1], m4
1357
+ movu [r0 + 2 * r1 + 64], m5
1358
+
1359
+ lea r0, [r0 + 4 * r1]
1360
+ lea r2, [r2 + 2 * r4]
1361
+ lea r3, [r3 + 2 * r5]
1362
+
1363
+ pmovzxbw m0, [r2]
1364
+ pmovzxbw m1, [r2 + 32]
1365
+ pmovzxbw m2, [r3]
1366
+ pmovzxbw m3, [r3 + 32]
1367
+ pmovzxbw m4, [r2 + r4]
1368
+ pmovzxbw m5, [r2 + r4 + 32]
1369
+ pmovzxbw m6, [r3 + r5]
1370
+ pmovzxbw m7, [r3 + r5 + 32]
1371
+
1372
+ psubw m0, m2
1373
+ psubw m1, m3
1374
+ psubw m4, m6
1375
+ psubw m5, m7
1376
+ movu [r0], m0
1377
+ movu [r0 + 64], m1
1378
+ movu [r0 + 2 * r1], m4
1379
+ movu [r0 + 2 * r1 + 64], m5
1380
+%endmacro
1381
+
1382
+%macro PROCESS_SUB_PS_64x8_HBD_AVX512 0
1383
+ movu m0, [r2]
1384
+ movu m1, [r2 + 64]
1385
+ movu m4, [r3]
1386
+ movu m5, [r3 + 64]
1387
+ psubw m0, m4
1388
+ psubw m1, m5
1389
+ movu m2, [r2 + r4]
1390
+ movu m3, [r2 + r4 + 64]
1391
+ movu m6, [r3 + r5]
1392
+ movu m7, [r3 + r5 + 64]
1393
+ psubw m2, m6
1394
+ psubw m3, m7
1395
+
1396
+ movu [r0], m0
1397
+ movu [r0 + 64], m1
1398
+ movu [r0 + r1], m2
1399
+ movu [r0 + r1 + 64], m3
1400
+
1401
+ movu m0, [r2 + r4 * 2]
1402
+ movu m1, [r2 + r4 * 2 + 64]
1403
+ movu m4, [r3 + r5 * 2]
1404
+ movu m5, [r3 + r5 * 2 + 64]
1405
+ psubw m0, m4
1406
+ psubw m1, m5
1407
+ movu m2, [r2 + r7]
1408
+ movu m3, [r2 + r7 + 64]
1409
+ movu m6, [r3 + r8]
1410
+ movu m7, [r3 + r8 + 64]
1411
+ psubw m2, m6
1412
+ psubw m3, m7
1413
+
1414
+ movu [r0 + r1 * 2], m0
1415
+ movu [r0 + r1 * 2 + 64], m1
1416
+ movu [r0 + r6], m2
1417
+ movu [r0 + r6 + 64], m3
1418
+
1419
+ lea r0, [r0 + r1 * 4]
1420
+ lea r2, [r2 + r4 * 4]
1421
+ lea r3, [r3 + r5 * 4]
1422
+
1423
+ movu m0, [r2]
1424
+ movu m1, [r2 + 64]
1425
+ movu m4, [r3]
1426
+ movu m5, [r3 + 64]
1427
+ psubw m0, m4
1428
+ psubw m1, m5
1429
+ movu m2, [r2 + r4]
1430
+ movu m3, [r2 + r4 + 64]
1431
+ movu m6, [r3 + r5]
1432
+ movu m7, [r3 + r5 + 64]
1433
+ psubw m2, m6
1434
+ psubw m3, m7
1435
+
1436
+ movu [r0], m0
1437
+ movu [r0 + 64], m1
1438
+ movu [r0 + r1], m2
1439
+ movu [r0 + r1 + 64], m3
1440
+
1441
+ movu m0, [r2 + r4 * 2]
1442
+ movu m1, [r2 + r4 * 2 + 64]
1443
+ movu m4, [r3 + r5 * 2]
1444
+ movu m5, [r3 + r5 * 2 + 64]
1445
+ psubw m0, m4
1446
+ psubw m1, m5
1447
+ movu m2, [r2 + r7]
1448
+ movu m3, [r2 + r7 + 64]
1449
+ movu m6, [r3 + r8]
1450
+ movu m7, [r3 + r8 + 64]
1451
+ psubw m2, m6
1452
+ psubw m3, m7
1453
+
1454
+ movu [r0 + r1 * 2], m0
1455
+ movu [r0 + r1 * 2 + 64], m1
1456
+ movu [r0 + r6], m2
1457
+ movu [r0 + r6 + 64], m3
1458
+%endmacro
1459
+;-----------------------------------------------------------------------------
1460
+; void pixel_sub_ps_64x64(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
1461
+;-----------------------------------------------------------------------------
1462
+%if HIGH_BIT_DEPTH
1463
+%if ARCH_X86_64
1464
+INIT_ZMM avx512
1465
+cglobal pixel_sub_ps_64x64, 6, 9, 8
1466
+ add r1d, r1d
1467
+ add r4d, r4d
1468
+ add r5d, r5d
1469
+ lea r6, [r1 * 3]
1470
+ lea r7, [r4 * 3]
1471
+ lea r8, [r5 * 3]
1472
+
1473
+ PROCESS_SUB_PS_64x8_HBD_AVX512
1474
+ lea r0, [r0 + r1 * 4]
1475
+ lea r2, [r2 + r4 * 4]
1476
+ lea r3, [r3 + r5 * 4]
1477
+ PROCESS_SUB_PS_64x8_HBD_AVX512
1478
+ lea r0, [r0 + r1 * 4]
1479
+ lea r2, [r2 + r4 * 4]
1480
+ lea r3, [r3 + r5 * 4]
1481
+ PROCESS_SUB_PS_64x8_HBD_AVX512
1482
+ lea r0, [r0 + r1 * 4]
1483
+ lea r2, [r2 + r4 * 4]
1484
+ lea r3, [r3 + r5 * 4]
1485
+ PROCESS_SUB_PS_64x8_HBD_AVX512
1486
+ lea r0, [r0 + r1 * 4]
1487
+ lea r2, [r2 + r4 * 4]
1488
+ lea r3, [r3 + r5 * 4]
1489
+ PROCESS_SUB_PS_64x8_HBD_AVX512
1490
+ lea r0, [r0 + r1 * 4]
1491
+ lea r2, [r2 + r4 * 4]
1492
+ lea r3, [r3 + r5 * 4]
1493
+ PROCESS_SUB_PS_64x8_HBD_AVX512
1494
+ lea r0, [r0 + r1 * 4]
1495
+ lea r2, [r2 + r4 * 4]
1496
+ lea r3, [r3 + r5 * 4]
1497
+ PROCESS_SUB_PS_64x8_HBD_AVX512
1498
+ lea r0, [r0 + r1 * 4]
1499
+ lea r2, [r2 + r4 * 4]
1500
+ lea r3, [r3 + r5 * 4]
1501
+ PROCESS_SUB_PS_64x8_HBD_AVX512
1502
+ RET
1503
+%endif
1504
+%else
1505
+%if ARCH_X86_64
1506
+INIT_ZMM avx512
1507
+cglobal pixel_sub_ps_64x64, 6, 7, 8
1508
+ PROCESS_SUB_PS_64x8_AVX512
1509
+ lea r0, [r0 + 4 * r1]
1510
+ lea r2, [r2 + 2 * r4]
1511
+ lea r3, [r3 + 2 * r5]
1512
+ PROCESS_SUB_PS_64x8_AVX512
1513
+ lea r0, [r0 + 4 * r1]
1514
+ lea r2, [r2 + 2 * r4]
1515
+ lea r3, [r3 + 2 * r5]
1516
+ PROCESS_SUB_PS_64x8_AVX512
1517
+ lea r0, [r0 + 4 * r1]
1518
+ lea r2, [r2 + 2 * r4]
1519
+ lea r3, [r3 + 2 * r5]
1520
+ PROCESS_SUB_PS_64x8_AVX512
1521
+ lea r0, [r0 + 4 * r1]
1522
+ lea r2, [r2 + 2 * r4]
1523
+ lea r3, [r3 + 2 * r5]
1524
+ PROCESS_SUB_PS_64x8_AVX512
1525
+ lea r0, [r0 + 4 * r1]
1526
+ lea r2, [r2 + 2 * r4]
1527
+ lea r3, [r3 + 2 * r5]
1528
+ PROCESS_SUB_PS_64x8_AVX512
1529
+ lea r0, [r0 + 4 * r1]
1530
+ lea r2, [r2 + 2 * r4]
1531
+ lea r3, [r3 + 2 * r5]
1532
+ PROCESS_SUB_PS_64x8_AVX512
1533
+ lea r0, [r0 + 4 * r1]
1534
+ lea r2, [r2 + 2 * r4]
1535
+ lea r3, [r3 + 2 * r5]
1536
+ PROCESS_SUB_PS_64x8_AVX512
1537
+ RET
1538
+%endif
1539
+%endif
1540
;=============================================================================
1541
; variance
1542
;=============================================================================
1543
1544
%if HIGH_BIT_DEPTH == 0
1545
%if %1
1546
mova m7, [pw_00ff]
1547
-%elif mmsize < 32
1548
+%elif mmsize == 16
1549
pxor m7, m7 ; zero
1550
%endif
1551
%endif ; !HIGH_BIT_DEPTH
1552
1553
RET
1554
%endif ; !HIGH_BIT_DEPTH
1555
1556
+%macro PROCESS_VAR_32x8_AVX512 0
1557
+ pmovzxbw m0, [r0]
1558
+ pmovzxbw m1, [r0 + r1]
1559
+ pmovzxbw m2, [r0 + 2 * r1]
1560
+ pmovzxbw m3, [r0 + r2]
1561
+
1562
+ paddw m4, m0
1563
+ paddw m4, m1
1564
+ paddw m4, m2
1565
+ paddw m4, m3
1566
+ pmaddwd m0, m0
1567
+ pmaddwd m1, m1
1568
+ pmaddwd m2, m2
1569
+ pmaddwd m3, m3
1570
+ paddd m5, m0
1571
+ paddd m5, m1
1572
+ paddd m5, m2
1573
+ paddd m5, m3
1574
+
1575
+ lea r0, [r0 + r1 * 4]
1576
+
1577
+ pmovzxbw m0, [r0]
1578
+ pmovzxbw m1, [r0 + r1]
1579
+ pmovzxbw m2, [r0 + 2 * r1]
1580
+ pmovzxbw m3, [r0 + r2]
1581
+
1582
+ paddw m4, m0
1583
+ paddw m4, m1
1584
+ paddw m4, m2
1585
+ paddw m4, m3
1586
+ pmaddwd m0, m0
1587
+ pmaddwd m1, m1
1588
+ pmaddwd m2, m2
1589
+ pmaddwd m3, m3
1590
+ paddd m5, m0
1591
+ paddd m5, m1
1592
+ paddd m5, m2
1593
+ paddd m5, m3
1594
+%endmacro
1595
+
1596
+%macro PROCESS_VAR_AVX512_END 0
1597
+ vextracti32x8 ym0, m4, 1
1598
+ vextracti32x8 ym1, m5, 1
1599
+ paddw ym4, ym0
1600
+ paddd ym5, ym1
1601
+ vextracti32x4 xm0, m4, 1
1602
+ vextracti32x4 xm1, m5, 1
1603
+ paddw xm4, xm0
1604
+ paddd xm5, xm1
1605
+ HADDW xm4, xm2
1606
+ HADDD xm5, xm1
1607
+%if ARCH_X86_64
1608
+ punpckldq xm4, xm5
1609
+ movq rax, xm4
1610
+%else
1611
+ movd eax, xm4
1612
+ movd edx, xm5
1613
+%endif
1614
+%endmacro
1615
+%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
1616
+;-----------------------------------------------------------------------------
1617
+; int pixel_var_wxh( uint8_t *, intptr_t )
1618
+;-----------------------------------------------------------------------------
1619
+INIT_ZMM avx512
1620
+cglobal pixel_var_32x32, 2,4,6
1621
+ pxor m4, m4 ; sum
1622
+ pxor m5, m5 ; sum squared
1623
+ lea r2, [3 * r1]
1624
+
1625
+ PROCESS_VAR_32x8_AVX512
1626
+ lea r0, [r0 + r1 * 4]
1627
+ PROCESS_VAR_32x8_AVX512
1628
+ lea r0, [r0 + r1 * 4]
1629
+ PROCESS_VAR_32x8_AVX512
1630
+ lea r0, [r0 + r1 * 4]
1631
+ PROCESS_VAR_32x8_AVX512
1632
+ PROCESS_VAR_AVX512_END
1633
+ RET
1634
+
1635
+INIT_ZMM avx512
1636
+cglobal pixel_var_64x64, 2,4,7
1637
+ pxor m5, m5 ; sum
1638
+ pxor m6, m6 ; sum squared
1639
+ mov r2d, 32
1640
+
1641
+.loop:
1642
+ pmovzxbw m0, [r0]
1643
+ pmovzxbw m3, [r0 + mmsize/2]
1644
+ pmovzxbw m1, [r0 + r1]
1645
+ pmovzxbw m4, [r0 + r1 + mmsize/2]
1646
+
1647
+ lea r0, [r0 + 2 * r1]
1648
+
1649
+ paddw m5, m0
1650
+ paddw m5, m3
1651
+ paddw m5, m1
1652
+ paddw m5, m4
1653
+ pmaddwd m0, m0
1654
+ pmaddwd m3, m3
1655
+ pmaddwd m1, m1
1656
+ pmaddwd m4, m4
1657
+ paddd m6, m0
1658
+ paddd m6, m3
1659
+ paddd m6, m1
1660
+ paddd m6, m4
1661
+
1662
+ dec r2d
1663
+ jg .loop
1664
+
1665
+ pxor m1, m1
1666
+ punpcklwd m0, m5, m1
1667
+ punpckhwd m5, m1
1668
+ paddd m5, m0
1669
+ vextracti32x8 ym2, m5, 1
1670
+ vextracti32x8 ym1, m6, 1
1671
+ paddd ym5, ym2
1672
+ paddd ym6, ym1
1673
+ vextracti32x4 xm2, m5, 1
1674
+ vextracti32x4 xm1, m6, 1
1675
+ paddd xm5, xm2
1676
+ paddd xm6, xm1
1677
+ HADDD xm5, xm2
1678
+ HADDD xm6, xm1
1679
+ punpckldq xm5, xm6
1680
+ movq rax, xm5
1681
+ RET
1682
+%endif
1683
+%macro VAR_AVX512_CORE 1 ; accum
1684
+%if %1
1685
+ paddw m0, m2
1686
+ pmaddwd m2, m2
1687
+ paddw m0, m3
1688
+ pmaddwd m3, m3
1689
+ paddd m1, m2
1690
+ paddd m1, m3
1691
+%else
1692
+ paddw m0, m2, m3
1693
+ pmaddwd m2, m2
1694
+ pmaddwd m3, m3
1695
+ paddd m1, m2, m3
1696
+%endif
1697
+%endmacro
1698
+
1699
+%macro VAR_AVX512_CORE_16x16 1 ; accum
1700
+%if HIGH_BIT_DEPTH
1701
+ mova ym2, [r0]
1702
+ vinserti64x4 m2, [r0+r1], 1
1703
+ mova ym3, [r0+2*r1]
1704
+ vinserti64x4 m3, [r0+r3], 1
1705
+%else
1706
+ vbroadcasti64x2 ym2, [r0]
1707
+ vbroadcasti64x2 m2 {k1}, [r0+r1]
1708
+ vbroadcasti64x2 ym3, [r0+2*r1]
1709
+ vbroadcasti64x2 m3 {k1}, [r0+r3]
1710
+ pshufb m2, m4
1711
+ pshufb m3, m4
1712
+%endif
1713
+ VAR_AVX512_CORE %1
1714
+%endmacro
1715
+
1716
+%macro VAR_AVX512_CORE_8x8 1 ; accum
1717
+%if HIGH_BIT_DEPTH
1718
+ mova xm2, [r0]
1719
+ mova xm3, [r0+r1]
1720
+%else
1721
+ movq xm2, [r0]
1722
+ movq xm3, [r0+r1]
1723
+%endif
1724
+ vinserti128 ym2, [r0+2*r1], 1
1725
+ vinserti128 ym3, [r0+r2], 1
1726
+ lea r0, [r0+4*r1]
1727
+ vinserti32x4 m2, [r0], 2
1728
+ vinserti32x4 m3, [r0+r1], 2
1729
+ vinserti32x4 m2, [r0+2*r1], 3
1730
+ vinserti32x4 m3, [r0+r2], 3
1731
+%if HIGH_BIT_DEPTH == 0
1732
+ punpcklbw m2, m4
1733
+ punpcklbw m3, m4
1734
+%endif
1735
+ VAR_AVX512_CORE %1
1736
+%endmacro
1737
+
1738
+INIT_ZMM avx512
1739
+cglobal pixel_var_16x16, 2,4
1740
+ FIX_STRIDES r1
1741
+ mov r2d, 0xf0
1742
+ lea r3, [3*r1]
1743
+%if HIGH_BIT_DEPTH == 0
1744
+ vbroadcasti64x4 m4, [var_shuf_avx512]
1745
+ kmovb k1, r2d
1746
+%endif
1747
+ VAR_AVX512_CORE_16x16 0
1748
+.loop:
1749
+ lea r0, [r0+4*r1]
1750
+ VAR_AVX512_CORE_16x16 1
1751
+ sub r2d, 0x50
1752
+ jg .loop
1753
+%if ARCH_X86_64 == 0
1754
+ pop r3d
1755
+ %assign regs_used 3
1756
+%endif
1757
+var_avx512_end:
1758
+ vbroadcasti32x4 m2, [pw_1]
1759
+ pmaddwd m0, m2
1760
+ SBUTTERFLY dq, 0, 1, 2
1761
+ paddd m0, m1
1762
+ vextracti32x8 ym1, m0, 1
1763
+ paddd ym0, ym1
1764
+ vextracti128 xm1, ym0, 1
1765
+ paddd xmm0, xm0, xm1
1766
+ punpckhqdq xmm1, xmm0, xmm0
1767
+ paddd xmm0, xmm1
1768
+%if ARCH_X86_64
1769
+ movq rax, xmm0
1770
+%else
1771
+ movd eax, xmm0
1772
+ pextrd edx, xmm0, 1
1773
+ %endif
1774
+ RET
1775
+
1776
+%if HIGH_BIT_DEPTH == 0 ; 8x8 doesn't benefit from AVX-512 in high bit-depth
1777
+cglobal pixel_var_8x8, 2,3
1778
+ lea r2, [3*r1]
1779
+ pxor xm4, xm4
1780
+ VAR_AVX512_CORE_8x8 0
1781
+ jmp var_avx512_end
1782
+%endif
1783
+
1784
+cglobal pixel_var_8x16, 2,3
1785
+ FIX_STRIDES r1
1786
+ lea r2, [3*r1]
1787
+%if HIGH_BIT_DEPTH == 0
1788
+ pxor xm4, xm4
1789
+%endif
1790
+ VAR_AVX512_CORE_8x8 0
1791
+ lea r0, [r0+4*r1]
1792
+ VAR_AVX512_CORE_8x8 1
1793
+ jmp var_avx512_end
1794
+
1795
%macro VAR2_END 3
1796
HADDW %2, xm1
1797
movd r1d, %2
1798
x265_2.7.tar.gz/source/common/x86/pixel.h -> x265_2.9.tar.gz/source/common/x86/pixel.h
Changed
37
1
2
void PFX(downShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
3
void PFX(upShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
4
void PFX(upShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
5
+void PFX(upShift_16_avx512)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
6
void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
7
void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
8
pixel PFX(planeClipAndMax_avx2)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
9
10
FUNCDEF_PU(void, pixel_sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
11
FUNCDEF_PU(void, pixel_sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
12
FUNCDEF_PU(void, pixel_avg, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
13
+ FUNCDEF_PU(void, pixel_avg_aligned, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
14
FUNCDEF_PU(void, pixel_add_ps, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
15
+ FUNCDEF_PU(void, pixel_add_ps_aligned, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
16
FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
17
FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
18
FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
19
FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
20
FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
21
+ FUNCDEF_CHROMA_PU(void, addAvg_aligned, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
22
FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
23
+ FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
24
FUNCDEF_TU_S(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
25
+ FUNCDEF_TU_S(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
26
FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \
27
FUNCDEF_TU(int, psyCost_pp, cpu, const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); \
28
FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
29
30
DECL_PIXELS(avx);
31
DECL_PIXELS(xop);
32
DECL_PIXELS(avx2);
33
+DECL_PIXELS(avx512);
34
35
#undef DECL_PIXELS
36
37
x265_2.7.tar.gz/source/common/x86/pixeladd8.asm -> x265_2.9.tar.gz/source/common/x86/pixeladd8.asm
Changed
530
1
2
3
%include "x86inc.asm"
4
%include "x86util.asm"
5
+SECTION_RODATA 64
6
7
-SECTION_RODATA 32
8
-
9
+ALIGN 64
10
+const store_shuf1_avx512, dq 0, 2, 4, 6, 1, 3, 5, 7
11
SECTION .text
12
-
13
cextern pw_pixel_max
14
15
;-----------------------------------------------------------------------------
16
17
PIXEL_ADD_PS_W32_H4_avx2 32
18
PIXEL_ADD_PS_W32_H4_avx2 64
19
20
-
21
;-----------------------------------------------------------------------------
22
; void pixel_add_ps_64x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
23
;-----------------------------------------------------------------------------
24
25
RET
26
27
%endif
28
+
29
+;-----------------------------------------------------------------------------
30
+; pixel_add_ps avx512 code start
31
+;-----------------------------------------------------------------------------
32
+%macro PROCESS_ADD_PS_64x4_AVX512 0
33
+ pmovzxbw m0, [r2]
34
+ pmovzxbw m1, [r2 + mmsize/2]
35
+ movu m2, [r3]
36
+ movu m3, [r3 + mmsize]
37
+ paddw m0, m2
38
+ paddw m1, m3
39
+ packuswb m0, m1
40
+ vpermq m0, m4, m0
41
+ movu [r0], m0
42
+ pmovzxbw m0, [r2 + r4]
43
+ pmovzxbw m1, [r2 + r4 + mmsize/2]
44
+ movu m2, [r3 + r5]
45
+ movu m3, [r3 + r5 + mmsize]
46
+ paddw m0, m2
47
+ paddw m1, m3
48
+ packuswb m0, m1
49
+ vpermq m0, m4, m0
50
+ movu [r0 + r1], m0
51
+ pmovzxbw m0, [r2 + 2 * r4]
52
+ pmovzxbw m1, [r2 + 2 * r4 + mmsize/2]
53
+ movu m2, [r3 + 2 * r5]
54
+ movu m3, [r3 + 2 * r5 + mmsize]
55
+ paddw m0, m2
56
+ paddw m1, m3
57
+ packuswb m0, m1
58
+ vpermq m0, m4, m0
59
+ movu [r0 + 2 * r1], m0
60
+
61
+ pmovzxbw m0, [r2 + r7]
62
+ pmovzxbw m1, [r2 + r7 + mmsize/2]
63
+ movu m2, [r3 + r8]
64
+ movu m3, [r3 + r8 + mmsize]
65
+ paddw m0, m2
66
+ paddw m1, m3
67
+ packuswb m0, m1
68
+ vpermq m0, m4, m0
69
+ movu [r0 + r6], m0
70
+%endmacro
71
+
72
+%macro PROCESS_ADD_PS_64x4_HBD_AVX512 0
73
+ movu m0, [r2]
74
+ movu m1, [r2 + mmsize]
75
+ movu m2, [r3]
76
+ movu m3, [r3 + mmsize]
77
+ paddw m0, m2
78
+ paddw m1, m3
79
+
80
+ CLIPW2 m0, m1, m4, m5
81
+ movu [r0], m0
82
+ movu [r0 + mmsize], m1
83
+
84
+ movu m0, [r2 + r4]
85
+ movu m1, [r2 + r4 + mmsize]
86
+ movu m2, [r3 + r5]
87
+ movu m3, [r3 + r5 + mmsize]
88
+ paddw m0, m2
89
+ paddw m1, m3
90
+
91
+ CLIPW2 m0, m1, m4, m5
92
+ movu [r0 + r1], m0
93
+ movu [r0 + r1 + mmsize], m1
94
+
95
+ movu m0, [r2 + r4 * 2]
96
+ movu m1, [r2 + r4 * 2 + mmsize]
97
+ movu m2, [r3 + r5 * 2]
98
+ movu m3, [r3 + r5 * 2 + mmsize]
99
+ paddw m0, m2
100
+ paddw m1, m3
101
+
102
+ CLIPW2 m0, m1, m4, m5
103
+ movu [r0 + r1 * 2], m0
104
+ movu [r0 + r1 * 2 + mmsize], m1
105
+
106
+ movu m0, [r2 + r6]
107
+ movu m1, [r2 + r6 + mmsize]
108
+ movu m2, [r3 + r7]
109
+ movu m3, [r3 + r7 + mmsize]
110
+ paddw m0, m2
111
+ paddw m1, m3
112
+
113
+ CLIPW2 m0, m1, m4, m5
114
+ movu [r0 + r8], m0
115
+ movu [r0 + r8 + mmsize], m1
116
+%endmacro
117
+
118
+%macro PROCESS_ADD_PS_64x4_ALIGNED_AVX512 0
119
+ pmovzxbw m0, [r2]
120
+ pmovzxbw m1, [r2 + mmsize/2]
121
+ mova m2, [r3]
122
+ mova m3, [r3 + mmsize]
123
+ paddw m0, m2
124
+ paddw m1, m3
125
+ packuswb m0, m1
126
+ vpermq m0, m4, m0
127
+ mova [r0], m0
128
+ pmovzxbw m0, [r2 + r4]
129
+ pmovzxbw m1, [r2 + r4 + mmsize/2]
130
+ mova m2, [r3 + r5]
131
+ mova m3, [r3 + r5 + mmsize]
132
+ paddw m0, m2
133
+ paddw m1, m3
134
+ packuswb m0, m1
135
+ vpermq m0, m4, m0
136
+ mova [r0 + r1], m0
137
+ pmovzxbw m0, [r2 + 2 * r4]
138
+ pmovzxbw m1, [r2 + 2 * r4 + mmsize/2]
139
+ mova m2, [r3 + 2 * r5]
140
+ mova m3, [r3 + 2 * r5 + mmsize]
141
+ paddw m0, m2
142
+ paddw m1, m3
143
+ packuswb m0, m1
144
+ vpermq m0, m4, m0
145
+ mova [r0 + 2 * r1], m0
146
+
147
+ pmovzxbw m0, [r2 + r7]
148
+ pmovzxbw m1, [r2 + r7 + mmsize/2]
149
+ mova m2, [r3 + r8]
150
+ mova m3, [r3 + r8 + mmsize]
151
+ paddw m0, m2
152
+ paddw m1, m3
153
+ packuswb m0, m1
154
+ vpermq m0, m4, m0
155
+ mova [r0 + r6], m0
156
+%endmacro
157
+
158
+%macro PROCESS_ADD_PS_64x4_HBD_ALIGNED_AVX512 0
159
+ mova m0, [r2]
160
+ mova m1, [r2 + mmsize]
161
+ mova m2, [r3]
162
+ mova m3, [r3 + mmsize]
163
+ paddw m0, m2
164
+ paddw m1, m3
165
+
166
+ CLIPW2 m0, m1, m4, m5
167
+ mova [r0], m0
168
+ mova [r0 + mmsize], m1
169
+
170
+ mova m0, [r2 + r4]
171
+ mova m1, [r2 + r4 + mmsize]
172
+ mova m2, [r3 + r5]
173
+ mova m3, [r3 + r5 + mmsize]
174
+ paddw m0, m2
175
+ paddw m1, m3
176
+
177
+ CLIPW2 m0, m1, m4, m5
178
+ mova [r0 + r1], m0
179
+ mova [r0 + r1 + mmsize], m1
180
+
181
+ mova m0, [r2 + r4 * 2]
182
+ mova m1, [r2 + r4 * 2 + mmsize]
183
+ mova m2, [r3 + r5 * 2]
184
+ mova m3, [r3 + r5 * 2 + mmsize]
185
+ paddw m0, m2
186
+ paddw m1, m3
187
+
188
+ CLIPW2 m0, m1, m4, m5
189
+ mova [r0 + r1 * 2], m0
190
+ mova [r0 + r1 * 2 + mmsize], m1
191
+
192
+ mova m0, [r2 + r6]
193
+ mova m1, [r2 + r6 + mmsize]
194
+ mova m2, [r3 + r7]
195
+ mova m3, [r3 + r7 + mmsize]
196
+ paddw m0, m2
197
+ paddw m1, m3
198
+
199
+ CLIPW2 m0, m1, m4, m5
200
+ mova [r0 + r8], m0
201
+ mova [r0 + r8 + mmsize], m1
202
+%endmacro
203
+
204
+;-----------------------------------------------------------------------------
205
+; void pixel_add_ps_64x64(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
206
+;-----------------------------------------------------------------------------
207
+%if HIGH_BIT_DEPTH
208
+%if ARCH_X86_64
209
+INIT_ZMM avx512
210
+cglobal pixel_add_ps_64x64, 6, 9, 6
211
+ vbroadcasti32x8 m5, [pw_pixel_max]
212
+ pxor m4, m4
213
+ add r4d, r4d
214
+ add r5d, r5d
215
+ add r1d, r1d
216
+ lea r6, [r4 * 3]
217
+ lea r7, [r5 * 3]
218
+ lea r8, [r1 * 3]
219
+%rep 15
220
+ PROCESS_ADD_PS_64x4_HBD_AVX512
221
+ lea r2, [r2 + r4 * 4]
222
+ lea r3, [r3 + r5 * 4]
223
+ lea r0, [r0 + r1 * 4]
224
+%endrep
225
+ PROCESS_ADD_PS_64x4_HBD_AVX512
226
+ RET
227
+
228
+INIT_ZMM avx512
229
+cglobal pixel_add_ps_aligned_64x64, 6, 9, 6
230
+ vbroadcasti32x8 m5, [pw_pixel_max]
231
+ pxor m4, m4
232
+ add r4d, r4d
233
+ add r5d, r5d
234
+ add r1d, r1d
235
+ lea r6, [r4 * 3]
236
+ lea r7, [r5 * 3]
237
+ lea r8, [r1 * 3]
238
+%rep 15
239
+ PROCESS_ADD_PS_64x4_HBD_ALIGNED_AVX512
240
+ lea r2, [r2 + r4 * 4]
241
+ lea r3, [r3 + r5 * 4]
242
+ lea r0, [r0 + r1 * 4]
243
+%endrep
244
+ PROCESS_ADD_PS_64x4_HBD_ALIGNED_AVX512
245
+ RET
246
+%endif
247
+%else
248
+%if ARCH_X86_64
249
+INIT_ZMM avx512
250
+cglobal pixel_add_ps_64x64, 6, 9, 4
251
+ add r5, r5
252
+ lea r6, [3 * r1]
253
+ lea r7, [3 * r4]
254
+ lea r8, [3 * r5]
255
+ mova m4, [store_shuf1_avx512]
256
+%rep 15
257
+ PROCESS_ADD_PS_64x4_AVX512
258
+ lea r2, [r2 + r4 * 4]
259
+ lea r3, [r3 + r5 * 4]
260
+ lea r0, [r0 + r1 * 4]
261
+%endrep
262
+ PROCESS_ADD_PS_64x4_AVX512
263
+ RET
264
+
265
+INIT_ZMM avx512
266
+cglobal pixel_add_ps_aligned_64x64, 6, 9, 4
267
+ add r5, r5
268
+ lea r6, [3 * r1]
269
+ lea r7, [3 * r4]
270
+ lea r8, [3 * r5]
271
+ mova m4, [store_shuf1_avx512]
272
+%rep 15
273
+ PROCESS_ADD_PS_64x4_ALIGNED_AVX512
274
+ lea r2, [r2 + r4 * 4]
275
+ lea r3, [r3 + r5 * 4]
276
+ lea r0, [r0 + r1 * 4]
277
+%endrep
278
+ PROCESS_ADD_PS_64x4_ALIGNED_AVX512
279
+ RET
280
+%endif
281
+%endif
282
+
283
+%macro PROCESS_ADD_PS_32x4_AVX512 0
284
+ pmovzxbw m0, [r2]
285
+ movu m1, [r3]
286
+ pmovzxbw m2, [r2 + r4]
287
+ movu m3, [r3 + r5]
288
+ paddw m0, m1
289
+ paddw m2, m3
290
+ packuswb m0, m2
291
+ vpermq m0, m4, m0
292
+ movu [r0], ym0
293
+ vextracti32x8 [r0 + r1], m0, 1
294
+ pmovzxbw m0, [r2 + r4 * 2]
295
+ movu m1, [r3 + r5 * 2]
296
+ pmovzxbw m2, [r2 + r6]
297
+ movu m3, [r3 + r7]
298
+ paddw m0, m1
299
+ paddw m2, m3
300
+ packuswb m0, m2
301
+ vpermq m0, m4, m0
302
+ movu [r0 + r1 * 2], ym0
303
+ vextracti32x8 [r0 + r8], m0, 1
304
+%endmacro
305
+
306
+%macro PROCESS_ADD_PS_32x4_HBD_AVX512 0
307
+ movu m0, [r2]
308
+ movu m1, [r2 + r4]
309
+ movu m2, [r3]
310
+ movu m3, [r3 + r5]
311
+ paddw m0, m2
312
+ paddw m1, m3
313
+
314
+ CLIPW2 m0, m1, m4, m5
315
+ movu [r0], m0
316
+ movu [r0 + r1], m1
317
+
318
+ movu m0, [r2 + r4 * 2]
319
+ movu m1, [r2 + r6]
320
+ movu m2, [r3 + r5 * 2]
321
+ movu m3, [r3 + r7]
322
+ paddw m0, m2
323
+ paddw m1, m3
324
+
325
+ CLIPW2 m0, m1, m4, m5
326
+ movu [r0 + r1 * 2], m0
327
+ movu [r0 + r8], m1
328
+%endmacro
329
+
330
+%macro PROCESS_ADD_PS_32x4_ALIGNED_AVX512 0
331
+ pmovzxbw m0, [r2]
332
+ mova m1, [r3]
333
+ pmovzxbw m2, [r2 + r4]
334
+ mova m3, [r3 + r5]
335
+ paddw m0, m1
336
+ paddw m2, m3
337
+ packuswb m0, m2
338
+ vpermq m0, m4, m0
339
+ mova [r0], ym0
340
+ vextracti32x8 [r0 + r1], m0, 1
341
+ pmovzxbw m0, [r2 + r4 * 2]
342
+ mova m1, [r3 + r5 * 2]
343
+ pmovzxbw m2, [r2 + r6]
344
+ mova m3, [r3 + r7]
345
+ paddw m0, m1
346
+ paddw m2, m3
347
+ packuswb m0, m2
348
+ vpermq m0, m4, m0
349
+ mova [r0 + r1 * 2], ym0
350
+ vextracti32x8 [r0 + r8], m0, 1
351
+%endmacro
352
+
353
+%macro PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512 0
354
+ mova m0, [r2]
355
+ mova m1, [r2 + r4]
356
+ mova m2, [r3]
357
+ mova m3, [r3 + r5]
358
+ paddw m0, m2
359
+ paddw m1, m3
360
+
361
+ CLIPW2 m0, m1, m4, m5
362
+ mova [r0], m0
363
+ mova [r0 + r1], m1
364
+
365
+ mova m0, [r2 + r4 * 2]
366
+ mova m1, [r2 + r6]
367
+ mova m2, [r3 + r5 * 2]
368
+ mova m3, [r3 + r7]
369
+ paddw m0, m2
370
+ paddw m1, m3
371
+
372
+ CLIPW2 m0, m1, m4, m5
373
+ mova [r0 + r1 * 2], m0
374
+ mova [r0 + r8], m1
375
+%endmacro
376
+
377
+;-----------------------------------------------------------------------------
378
+; void pixel_add_ps_32x32(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
379
+;-----------------------------------------------------------------------------
380
+%if HIGH_BIT_DEPTH
381
+%if ARCH_X86_64
382
+INIT_ZMM avx512
383
+cglobal pixel_add_ps_32x32, 6, 9, 6
384
+ vbroadcasti32x8 m5, [pw_pixel_max]
385
+ pxor m4, m4
386
+ add r4d, r4d
387
+ add r5d, r5d
388
+ add r1d, r1d
389
+ lea r6, [r4 * 3]
390
+ lea r7, [r5 * 3]
391
+ lea r8, [r1 * 3]
392
+%rep 7
393
+ PROCESS_ADD_PS_32x4_HBD_AVX512
394
+ lea r2, [r2 + r4 * 4]
395
+ lea r3, [r3 + r5 * 4]
396
+ lea r0, [r0 + r1 * 4]
397
+%endrep
398
+ PROCESS_ADD_PS_32x4_HBD_AVX512
399
+ RET
400
+
401
+INIT_ZMM avx512
402
+cglobal pixel_add_ps_32x64, 6, 9, 6
403
+ vbroadcasti32x8 m5, [pw_pixel_max]
404
+ pxor m4, m4
405
+ add r4d, r4d
406
+ add r5d, r5d
407
+ add r1d, r1d
408
+ lea r6, [r4 * 3]
409
+ lea r7, [r5 * 3]
410
+ lea r8, [r1 * 3]
411
+%rep 15
412
+ PROCESS_ADD_PS_32x4_HBD_AVX512
413
+ lea r2, [r2 + r4 * 4]
414
+ lea r3, [r3 + r5 * 4]
415
+ lea r0, [r0 + r1 * 4]
416
+%endrep
417
+ PROCESS_ADD_PS_32x4_HBD_AVX512
418
+ RET
419
+
420
+INIT_ZMM avx512
421
+cglobal pixel_add_ps_aligned_32x32, 6, 9, 6
422
+ vbroadcasti32x8 m5, [pw_pixel_max]
423
+ pxor m4, m4
424
+ add r4d, r4d
425
+ add r5d, r5d
426
+ add r1d, r1d
427
+ lea r6, [r4 * 3]
428
+ lea r7, [r5 * 3]
429
+ lea r8, [r1 * 3]
430
+%rep 7
431
+ PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512
432
+ lea r2, [r2 + r4 * 4]
433
+ lea r3, [r3 + r5 * 4]
434
+ lea r0, [r0 + r1 * 4]
435
+%endrep
436
+ PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512
437
+ RET
438
+
439
+INIT_ZMM avx512
440
+cglobal pixel_add_ps_aligned_32x64, 6, 9, 6
441
+ vbroadcasti32x8 m5, [pw_pixel_max]
442
+ pxor m4, m4
443
+ add r4d, r4d
444
+ add r5d, r5d
445
+ add r1d, r1d
446
+ lea r6, [r4 * 3]
447
+ lea r7, [r5 * 3]
448
+ lea r8, [r1 * 3]
449
+%rep 15
450
+ PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512
451
+ lea r2, [r2 + r4 * 4]
452
+ lea r3, [r3 + r5 * 4]
453
+ lea r0, [r0 + r1 * 4]
454
+%endrep
455
+ PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512
456
+ RET
457
+%endif
458
+%else
459
+%if ARCH_X86_64
460
+INIT_ZMM avx512
461
+cglobal pixel_add_ps_32x32, 6, 9, 5
462
+ add r5, r5
463
+ lea r6, [r4 * 3]
464
+ lea r7, [r5 * 3]
465
+ lea r8, [r1 * 3]
466
+ mova m4, [store_shuf1_avx512]
467
+%rep 7
468
+ PROCESS_ADD_PS_32x4_AVX512
469
+ lea r2, [r2 + r4 * 4]
470
+ lea r3, [r3 + r5 * 4]
471
+ lea r0, [r0 + r1 * 4]
472
+%endrep
473
+ PROCESS_ADD_PS_32x4_AVX512
474
+ RET
475
+
476
+INIT_ZMM avx512
477
+cglobal pixel_add_ps_32x64, 6, 9, 5
478
+ add r5, r5
479
+ lea r6, [r4 * 3]
480
+ lea r7, [r5 * 3]
481
+ lea r8, [r1 * 3]
482
+ mova m4, [store_shuf1_avx512]
483
+
484
+%rep 15
485
+ PROCESS_ADD_PS_32x4_AVX512
486
+ lea r2, [r2 + r4 * 4]
487
+ lea r3, [r3 + r5 * 4]
488
+ lea r0, [r0 + r1 * 4]
489
+%endrep
490
+ PROCESS_ADD_PS_32x4_AVX512
491
+ RET
492
+
493
+INIT_ZMM avx512
494
+cglobal pixel_add_ps_aligned_32x32, 6, 9, 5
495
+ add r5, r5
496
+ lea r6, [r4 * 3]
497
+ lea r7, [r5 * 3]
498
+ lea r8, [r1 * 3]
499
+ mova m4, [store_shuf1_avx512]
500
+%rep 7
501
+ PROCESS_ADD_PS_32x4_ALIGNED_AVX512
502
+ lea r2, [r2 + r4 * 4]
503
+ lea r3, [r3 + r5 * 4]
504
+ lea r0, [r0 + r1 * 4]
505
+%endrep
506
+ PROCESS_ADD_PS_32x4_ALIGNED_AVX512
507
+ RET
508
+
509
+INIT_ZMM avx512
510
+cglobal pixel_add_ps_aligned_32x64, 6, 9, 5
511
+ add r5, r5
512
+ lea r6, [r4 * 3]
513
+ lea r7, [r5 * 3]
514
+ lea r8, [r1 * 3]
515
+ mova m4, [store_shuf1_avx512]
516
+
517
+%rep 15
518
+ PROCESS_ADD_PS_32x4_ALIGNED_AVX512
519
+ lea r2, [r2 + r4 * 4]
520
+ lea r3, [r3 + r5 * 4]
521
+ lea r0, [r0 + r1 * 4]
522
+%endrep
523
+ PROCESS_ADD_PS_32x4_ALIGNED_AVX512
524
+ RET
525
+%endif
526
+%endif
527
+;-----------------------------------------------------------------------------
528
+; pixel_add_ps avx512 code end
529
+;-----------------------------------------------------------------------------
530
x265_2.7.tar.gz/source/common/x86/sad-a.asm -> x265_2.9.tar.gz/source/common/x86/sad-a.asm
Changed
877
1
2
lea r0, [r0 + r1]
3
%endmacro
4
5
-%macro SAD_W16 0
6
-;-----------------------------------------------------------------------------
7
-; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
8
-;-----------------------------------------------------------------------------
9
-cglobal pixel_sad_16x16, 4,4,8
10
- movu m0, [r2]
11
- movu m1, [r2+r3]
12
- lea r2, [r2+2*r3]
13
- movu m2, [r2]
14
- movu m3, [r2+r3]
15
- lea r2, [r2+2*r3]
16
- psadbw m0, [r0]
17
- psadbw m1, [r0+r1]
18
- lea r0, [r0+2*r1]
19
- movu m4, [r2]
20
- paddw m0, m1
21
- psadbw m2, [r0]
22
- psadbw m3, [r0+r1]
23
- lea r0, [r0+2*r1]
24
- movu m5, [r2+r3]
25
- lea r2, [r2+2*r3]
26
- paddw m2, m3
27
- movu m6, [r2]
28
- movu m7, [r2+r3]
29
- lea r2, [r2+2*r3]
30
- paddw m0, m2
31
- psadbw m4, [r0]
32
- psadbw m5, [r0+r1]
33
- lea r0, [r0+2*r1]
34
- movu m1, [r2]
35
- paddw m4, m5
36
- psadbw m6, [r0]
37
- psadbw m7, [r0+r1]
38
- lea r0, [r0+2*r1]
39
- movu m2, [r2+r3]
40
- lea r2, [r2+2*r3]
41
- paddw m6, m7
42
- movu m3, [r2]
43
- paddw m0, m4
44
- movu m4, [r2+r3]
45
- lea r2, [r2+2*r3]
46
- paddw m0, m6
47
- psadbw m1, [r0]
48
- psadbw m2, [r0+r1]
49
- lea r0, [r0+2*r1]
50
- movu m5, [r2]
51
- paddw m1, m2
52
- psadbw m3, [r0]
53
- psadbw m4, [r0+r1]
54
- lea r0, [r0+2*r1]
55
- movu m6, [r2+r3]
56
- lea r2, [r2+2*r3]
57
- paddw m3, m4
58
- movu m7, [r2]
59
- paddw m0, m1
60
- movu m1, [r2+r3]
61
- paddw m0, m3
62
- psadbw m5, [r0]
63
- psadbw m6, [r0+r1]
64
- lea r0, [r0+2*r1]
65
- paddw m5, m6
66
- psadbw m7, [r0]
67
- psadbw m1, [r0+r1]
68
- paddw m7, m1
69
- paddw m0, m5
70
- paddw m0, m7
71
- SAD_END_SSE2
72
+%macro SAD_W16 1 ; h
73
+cglobal pixel_sad_16x%1, 4,4
74
+%ifidn cpuname, sse2
75
+.skip_prologue:
76
+%endif
77
+%assign %%i 0
78
+%if ARCH_X86_64
79
+ lea r6, [3*r1] ; r6 results in fewer REX prefixes than r4 and both are volatile
80
+ lea r5, [3*r3]
81
+%rep %1/4
82
+ movu m1, [r2]
83
+ psadbw m1, [r0]
84
+ movu m3, [r2+r3]
85
+ psadbw m3, [r0+r1]
86
+ movu m2, [r2+2*r3]
87
+ psadbw m2, [r0+2*r1]
88
+ movu m4, [r2+r5]
89
+ psadbw m4, [r0+r6]
90
+%if %%i != %1/4-1
91
+ lea r2, [r2+4*r3]
92
+ lea r0, [r0+4*r1]
93
+%endif
94
+ paddw m1, m3
95
+ paddw m2, m4
96
+ ACCUM paddw, 0, 1, %%i
97
+ paddw m0, m2
98
+ %assign %%i %%i+1
99
+%endrep
100
+%else ; The cost of having to save and restore registers on x86-32
101
+%rep %1/2 ; nullifies the benefit of having 3*stride in registers.
102
+ movu m1, [r2]
103
+ psadbw m1, [r0]
104
+ movu m2, [r2+r3]
105
+ psadbw m2, [r0+r1]
106
+%if %%i != %1/2-1
107
+ lea r2, [r2+2*r3]
108
+ lea r0, [r0+2*r1]
109
+%endif
110
+ ACCUM paddw, 0, 1, %%i
111
+ paddw m0, m2
112
+ %assign %%i %%i+1
113
+%endrep
114
+%endif
115
+ SAD_END_SSE2
116
+ %endmacro
117
118
-;-----------------------------------------------------------------------------
119
-; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
120
-;-----------------------------------------------------------------------------
121
-cglobal pixel_sad_16x8, 4,4
122
- movu m0, [r2]
123
- movu m2, [r2+r3]
124
- lea r2, [r2+2*r3]
125
- movu m3, [r2]
126
- movu m4, [r2+r3]
127
- psadbw m0, [r0]
128
- psadbw m2, [r0+r1]
129
- lea r0, [r0+2*r1]
130
- psadbw m3, [r0]
131
- psadbw m4, [r0+r1]
132
- lea r0, [r0+2*r1]
133
- lea r2, [r2+2*r3]
134
- paddw m0, m2
135
- paddw m3, m4
136
- paddw m0, m3
137
- movu m1, [r2]
138
- movu m2, [r2+r3]
139
- lea r2, [r2+2*r3]
140
- movu m3, [r2]
141
- movu m4, [r2+r3]
142
- psadbw m1, [r0]
143
- psadbw m2, [r0+r1]
144
- lea r0, [r0+2*r1]
145
- psadbw m3, [r0]
146
- psadbw m4, [r0+r1]
147
- lea r0, [r0+2*r1]
148
- lea r2, [r2+2*r3]
149
- paddw m1, m2
150
- paddw m3, m4
151
- paddw m0, m1
152
- paddw m0, m3
153
- SAD_END_SSE2
154
+INIT_XMM sse2
155
+SAD_W16 8
156
+SAD_W16 16
157
+INIT_XMM sse3
158
+SAD_W16 8
159
+SAD_W16 16
160
+INIT_XMM sse2, aligned
161
+SAD_W16 8
162
+SAD_W16 16
163
164
+%macro SAD_Wx 0
165
;-----------------------------------------------------------------------------
166
; int pixel_sad_16x12( uint8_t *, intptr_t, uint8_t *, intptr_t )
167
;-----------------------------------------------------------------------------
168
169
%endmacro
170
171
INIT_XMM sse2
172
-SAD_W16
173
+SAD_Wx
174
INIT_XMM sse3
175
-SAD_W16
176
+SAD_Wx
177
INIT_XMM sse2, aligned
178
-SAD_W16
179
+SAD_Wx
180
181
%macro SAD_INC_4x8P_SSE 1
182
movq m1, [r0]
183
184
SAD_INC_4x8P_SSE 1
185
SAD_INC_4x8P_SSE 1
186
SAD_END_SSE2
187
+
188
+%macro SAD_W48_AVX512 3 ; w, h, d/q
189
+cglobal pixel_sad_%1x%2, 4,4
190
+ kxnorb k1, k1, k1
191
+ kaddb k1, k1, k1
192
+%assign %%i 0
193
+%if ARCH_X86_64 && %2 != 4
194
+ lea r6, [3*r1]
195
+ lea r5, [3*r3]
196
+%rep %2/4
197
+ mov%3 m1, [r0]
198
+ vpbroadcast%3 m1 {k1}, [r0+r1]
199
+ mov%3 m3, [r2]
200
+ vpbroadcast%3 m3 {k1}, [r2+r3]
201
+ mov%3 m2, [r0+2*r1]
202
+ vpbroadcast%3 m2 {k1}, [r0+r6]
203
+ mov%3 m4, [r2+2*r3]
204
+ vpbroadcast%3 m4 {k1}, [r2+r5]
205
+%if %%i != %2/4-1
206
+ lea r0, [r0+4*r1]
207
+ lea r2, [r2+4*r3]
208
+%endif
209
+ psadbw m1, m3
210
+ psadbw m2, m4
211
+ ACCUM paddd, 0, 1, %%i
212
+ paddd m0, m2
213
+ %assign %%i %%i+1
214
+%endrep
215
+%else
216
+%rep %2/2
217
+ mov%3 m1, [r0]
218
+ vpbroadcast%3 m1 {k1}, [r0+r1]
219
+ mov%3 m2, [r2]
220
+ vpbroadcast%3 m2 {k1}, [r2+r3]
221
+%if %%i != %2/2-1
222
+ lea r0, [r0+2*r1]
223
+ lea r2, [r2+2*r3]
224
+%endif
225
+ psadbw m1, m2
226
+ ACCUM paddd, 0, 1, %%i
227
+ %assign %%i %%i+1
228
+%endrep
229
+%endif
230
+%if %1 == 8
231
+ punpckhqdq m1, m0, m0
232
+ paddd m0, m1
233
+%endif
234
+ movd eax, m0
235
+ RET
236
+%endmacro
237
+
238
+INIT_XMM avx512
239
+SAD_W48_AVX512 4, 4, d
240
+SAD_W48_AVX512 4, 8, d
241
+SAD_W48_AVX512 4, 16, d
242
+SAD_W48_AVX512 8, 4, q
243
+SAD_W48_AVX512 8, 8, q
244
+SAD_W48_AVX512 8, 16, q
245
+
246
+%macro SAD_W16_AVX512_START 1 ; h
247
+ cmp r1d, 16 ; optimized for width = 16, which has the
248
+ jne pixel_sad_16x%1_sse2.skip_prologue ; rows laid out contiguously in memory
249
+ lea r1, [3*r3]
250
+%endmacro
251
+
252
+%macro SAD_W16_AVX512_END 0
253
+ paddd m0, m1
254
+ paddd m0, m2
255
+ paddd m0, m3
256
+%if mmsize == 64
257
+ vextracti32x8 ym1, m0, 1
258
+ paddd ym0, ym1
259
+%endif
260
+ vextracti128 xm1, ym0, 1
261
+ paddd xmm0, xm0, xm1
262
+ punpckhqdq xmm1, xmm0, xmm0
263
+ paddd xmm0, xmm1
264
+ movd eax, xmm0
265
RET
266
+%endmacro
267
+
268
+INIT_YMM avx512
269
+cglobal pixel_sad_16x8, 4,4
270
+ SAD_W16_AVX512_START 8
271
+ movu xm0, [r2]
272
+ vinserti128 m0, [r2+r3], 1
273
+ psadbw m0, [r0+0*32]
274
+ movu xm1, [r2+2*r3]
275
+ vinserti128 m1, [r2+r1], 1
276
+ lea r2, [r2+4*r3]
277
+ psadbw m1, [r0+1*32]
278
+ movu xm2, [r2]
279
+ vinserti128 m2, [r2+r3], 1
280
+ psadbw m2, [r0+2*32]
281
+ movu xm3, [r2+2*r3]
282
+ vinserti128 m3, [r2+r1], 1
283
+ psadbw m3, [r0+3*32]
284
+ SAD_W16_AVX512_END
285
+
286
+INIT_ZMM avx512
287
+cglobal pixel_sad_16x16, 4,4
288
+ SAD_W16_AVX512_START 16
289
+ movu xm0, [r2]
290
+ vinserti128 ym0, [r2+r3], 1
291
+ movu xm1, [r2+4*r3]
292
+ vinserti32x4 m0, [r2+2*r3], 2
293
+ vinserti32x4 m1, [r2+2*r1], 2
294
+ vinserti32x4 m0, [r2+r1], 3
295
+ lea r2, [r2+4*r3]
296
+ vinserti32x4 m1, [r2+r3], 1
297
+ psadbw m0, [r0+0*64]
298
+ vinserti32x4 m1, [r2+r1], 3
299
+ lea r2, [r2+4*r3]
300
+ psadbw m1, [r0+1*64]
301
+ movu xm2, [r2]
302
+ vinserti128 ym2, [r2+r3], 1
303
+ movu xm3, [r2+4*r3]
304
+ vinserti32x4 m2, [r2+2*r3], 2
305
+ vinserti32x4 m3, [r2+2*r1], 2
306
+ vinserti32x4 m2, [r2+r1], 3
307
+ lea r2, [r2+4*r3]
308
+ vinserti32x4 m3, [r2+r3], 1
309
+ psadbw m2, [r0+2*64]
310
+ vinserti32x4 m3, [r2+r1], 3
311
+ psadbw m3, [r0+3*64]
312
+ SAD_W16_AVX512_END
313
314
;=============================================================================
315
; SAD x3/x4 MMX
316
317
SAD_X4_48x8_AVX2
318
PIXEL_SAD_X4_END_AVX2
319
RET
320
+
321
+;------------------------------------------------------------
322
+;sad_x4 avx512 code start
323
+;------------------------------------------------------------
324
+%macro PROCESS_SAD_X4_64x4_AVX512 0
325
+ movu m4, [r0]
326
+ movu m5, [r1]
327
+ movu m6, [r2]
328
+ movu m7, [r3]
329
+ movu m8, [r4]
330
+
331
+ psadbw m9, m4, m5
332
+ psadbw m5, m4, m6
333
+ psadbw m6, m4, m7
334
+ psadbw m4, m8
335
+
336
+ paddd m0, m9
337
+ paddd m1, m5
338
+ paddd m2, m6
339
+ paddd m3, m4
340
+
341
+ movu m4, [r0 + FENC_STRIDE]
342
+ movu m5, [r1 + r5]
343
+ movu m6, [r2 + r5]
344
+ movu m7, [r3 + r5]
345
+ movu m8, [r4 + r5]
346
+
347
+ psadbw m9, m4, m5
348
+ psadbw m5, m4, m6
349
+ psadbw m6, m4, m7
350
+ psadbw m4, m8
351
+ paddd m0, m9
352
+ paddd m1, m5
353
+ paddd m2, m6
354
+ paddd m3, m4
355
+
356
+ movu m4, [r0 + FENC_STRIDE * 2]
357
+ movu m5, [r1 + r5 * 2]
358
+ movu m6, [r2 + r5 * 2]
359
+ movu m7, [r3 + r5 * 2]
360
+ movu m8, [r4 + r5 * 2]
361
+
362
+ psadbw m9, m4, m5
363
+ psadbw m5, m4, m6
364
+ psadbw m6, m4, m7
365
+ psadbw m4, m8
366
+
367
+ paddd m0, m9
368
+ paddd m1, m5
369
+ paddd m2, m6
370
+ paddd m3, m4
371
+
372
+ movu m4, [r0 + FENC_STRIDE * 3]
373
+ movu m5, [r1 + r7]
374
+ movu m6, [r2 + r7]
375
+ movu m7, [r3 + r7]
376
+ movu m8, [r4 + r7]
377
+
378
+ psadbw m9, m4, m5
379
+ psadbw m5, m4, m6
380
+ psadbw m6, m4, m7
381
+ psadbw m4, m8
382
+ paddd m0, m9
383
+ paddd m1, m5
384
+ paddd m2, m6
385
+ paddd m3, m4
386
+%endmacro
387
+
388
+%macro PROCESS_SAD_X4_32x4_AVX512 0
389
+ movu ym4, [r0]
390
+ movu ym5, [r1]
391
+ movu ym6, [r2]
392
+ movu ym7, [r3]
393
+ movu ym8, [r4]
394
+
395
+ vinserti32x8 m4, [r0 + FENC_STRIDE], 1
396
+ vinserti32x8 m5, [r1 + r5], 1
397
+ vinserti32x8 m6, [r2 + r5], 1
398
+ vinserti32x8 m7, [r3 + r5], 1
399
+ vinserti32x8 m8, [r4 + r5], 1
400
+
401
+ psadbw m9, m4, m5
402
+ psadbw m5, m4, m6
403
+ psadbw m6, m4, m7
404
+ psadbw m4, m8
405
+
406
+ paddd m0, m9
407
+ paddd m1, m5
408
+ paddd m2, m6
409
+ paddd m3, m4
410
+
411
+ movu ym4, [r0 + FENC_STRIDE * 2]
412
+ movu ym5, [r1 + r5 * 2]
413
+ movu ym6, [r2 + r5 * 2]
414
+ movu ym7, [r3 + r5 * 2]
415
+ movu ym8, [r4 + r5 * 2]
416
+
417
+ vinserti32x8 m4, [r0 + FENC_STRIDE * 3], 1
418
+ vinserti32x8 m5, [r1 + r7], 1
419
+ vinserti32x8 m6, [r2 + r7], 1
420
+ vinserti32x8 m7, [r3 + r7], 1
421
+ vinserti32x8 m8, [r4 + r7], 1
422
+
423
+ psadbw m9, m4, m5
424
+ psadbw m5, m4, m6
425
+ psadbw m6, m4, m7
426
+ psadbw m4, m8
427
+
428
+ paddd m0, m9
429
+ paddd m1, m5
430
+ paddd m2, m6
431
+ paddd m3, m4
432
+%endmacro
433
+
434
+%macro PROCESS_SAD_X4_48x4_AVX512 0
435
+ movu ym4, [r0]
436
+ movu ym5, [r1]
437
+ movu ym6, [r2]
438
+ movu ym7, [r3]
439
+ movu ym8, [r4]
440
+
441
+ vinserti32x8 m4, [r0 + FENC_STRIDE], 1
442
+ vinserti32x8 m5, [r1 + r5], 1
443
+ vinserti32x8 m6, [r2 + r5], 1
444
+ vinserti32x8 m7, [r3 + r5], 1
445
+ vinserti32x8 m8, [r4 + r5], 1
446
+
447
+ psadbw m9, m4, m5
448
+ psadbw m5, m4, m6
449
+ psadbw m6, m4, m7
450
+ psadbw m4, m8
451
+
452
+ paddd m0, m9
453
+ paddd m1, m5
454
+ paddd m2, m6
455
+ paddd m3, m4
456
+
457
+ movu ym4, [r0 + FENC_STRIDE * 2]
458
+ movu ym5, [r1 + r5 * 2]
459
+ movu ym6, [r2 + r5 * 2]
460
+ movu ym7, [r3 + r5 * 2]
461
+ movu ym8, [r4 + r5 * 2]
462
+
463
+ vinserti32x8 m4, [r0 + FENC_STRIDE * 3], 1
464
+ vinserti32x8 m5, [r1 + r7], 1
465
+ vinserti32x8 m6, [r2 + r7], 1
466
+ vinserti32x8 m7, [r3 + r7], 1
467
+ vinserti32x8 m8, [r4 + r7], 1
468
+
469
+ psadbw m9, m4, m5
470
+ psadbw m5, m4, m6
471
+ psadbw m6, m4, m7
472
+ psadbw m4, m8
473
+
474
+ paddd m0, m9
475
+ paddd m1, m5
476
+ paddd m2, m6
477
+ paddd m3, m4
478
+
479
+ movu xm4, [r0 + mmsize/2]
480
+ movu xm5, [r1 + mmsize/2]
481
+ movu xm6, [r2 + mmsize/2]
482
+ movu xm7, [r3 + mmsize/2]
483
+ movu xm8, [r4 + mmsize/2]
484
+ vinserti32x4 m4, [r0 + FENC_STRIDE + mmsize/2], 1
485
+ vinserti32x4 m5, [r1 + r5 + mmsize/2], 1
486
+ vinserti32x4 m6, [r2 + r5 + mmsize/2], 1
487
+ vinserti32x4 m7, [r3 + r5 + mmsize/2], 1
488
+ vinserti32x4 m8, [r4 + r5 + mmsize/2], 1
489
+
490
+ vinserti32x4 m4, [r0 + FENC_STRIDE * 2 + mmsize/2], 2
491
+ vinserti32x4 m5, [r1 + r5 * 2 + mmsize/2], 2
492
+ vinserti32x4 m6, [r2 + r5 * 2 + mmsize/2], 2
493
+ vinserti32x4 m7, [r3 + r5 * 2 + mmsize/2], 2
494
+ vinserti32x4 m8, [r4 + r5 * 2 + mmsize/2], 2
495
+ vinserti32x4 m4, [r0 + FENC_STRIDE * 3 + mmsize/2], 3
496
+ vinserti32x4 m5, [r1 + r7 + mmsize/2], 3
497
+ vinserti32x4 m6, [r2 + r7 + mmsize/2], 3
498
+ vinserti32x4 m7, [r3 + r7 + mmsize/2], 3
499
+ vinserti32x4 m8, [r4 + r7 + mmsize/2], 3
500
+
501
+ psadbw m9, m4, m5
502
+ psadbw m5, m4, m6
503
+ psadbw m6, m4, m7
504
+ psadbw m4, m8
505
+ paddd m0, m9
506
+ paddd m1, m5
507
+ paddd m2, m6
508
+ paddd m3, m4
509
+%endmacro
510
+
511
+%macro PIXEL_SAD_X4_END_AVX512 0
512
+ vextracti32x8 ym4, m0, 1
513
+ vextracti32x8 ym5, m1, 1
514
+ vextracti32x8 ym6, m2, 1
515
+ vextracti32x8 ym7, m3, 1
516
+ paddd ym0, ym4
517
+ paddd ym1, ym5
518
+ paddd ym2, ym6
519
+ paddd ym3, ym7
520
+ vextracti64x2 xm4, m0, 1
521
+ vextracti64x2 xm5, m1, 1
522
+ vextracti64x2 xm6, m2, 1
523
+ vextracti64x2 xm7, m3, 1
524
+ paddd xm0, xm4
525
+ paddd xm1, xm5
526
+ paddd xm2, xm6
527
+ paddd xm3, xm7
528
+ pshufd xm4, xm0, 2
529
+ pshufd xm5, xm1, 2
530
+ pshufd xm6, xm2, 2
531
+ pshufd xm7, xm3, 2
532
+ paddd xm0, xm4
533
+ paddd xm1, xm5
534
+ paddd xm2, xm6
535
+ paddd xm3, xm7
536
+ movd [r6 + 0], xm0
537
+ movd [r6 + 4], xm1
538
+ movd [r6 + 8], xm2
539
+ movd [r6 + 12], xm3
540
+%endmacro
541
+
542
+%macro SAD_X4_AVX512 2
543
+INIT_ZMM avx512
544
+cglobal pixel_sad_x4_%1x%2, 7,8,10
545
+ pxor m0, m0
546
+ pxor m1, m1
547
+ pxor m2, m2
548
+ pxor m3, m3
549
+ lea r7, [r5 * 3]
550
+
551
+%rep %2/4 - 1
552
+ PROCESS_SAD_X4_%1x4_AVX512
553
+ add r0, FENC_STRIDE * 4
554
+ lea r1, [r1 + r5 * 4]
555
+ lea r2, [r2 + r5 * 4]
556
+ lea r3, [r3 + r5 * 4]
557
+ lea r4, [r4 + r5 * 4]
558
+%endrep
559
+ PROCESS_SAD_X4_%1x4_AVX512
560
+ PIXEL_SAD_X4_END_AVX512
561
+ RET
562
+%endmacro
563
+
564
+SAD_X4_AVX512 64, 64
565
+SAD_X4_AVX512 64, 48
566
+SAD_X4_AVX512 64, 32
567
+SAD_X4_AVX512 64, 16
568
+SAD_X4_AVX512 32, 64
569
+SAD_X4_AVX512 32, 32
570
+SAD_X4_AVX512 32, 24
571
+SAD_X4_AVX512 32, 16
572
+SAD_X4_AVX512 32, 8
573
+SAD_X4_AVX512 48, 64
574
+;------------------------------------------------------------
575
+;sad_x4 avx512 code end
576
+;------------------------------------------------------------
577
%endif
578
579
INIT_XMM sse2
580
581
RET
582
%endif
583
584
+;------------------------------------------------------------
585
+;sad_x3 avx512 code start
586
+;------------------------------------------------------------
587
+%macro PROCESS_SAD_X3_64x4_AVX512 0
588
+ movu m3, [r0]
589
+ movu m4, [r1]
590
+ movu m5, [r2]
591
+ movu m6, [r3]
592
+
593
+ psadbw m7, m3, m4
594
+ psadbw m4, m3, m5
595
+ psadbw m3, m6
596
+
597
+ paddd m0, m7
598
+ paddd m1, m4
599
+ paddd m2, m3
600
+
601
+ movu m3, [r0 + FENC_STRIDE]
602
+ movu m4, [r1 + r4]
603
+ movu m5, [r2 + r4]
604
+ movu m6, [r3 + r4]
605
+
606
+ psadbw m7, m3, m4
607
+ psadbw m4, m3, m5
608
+ psadbw m3, m6
609
+
610
+ paddd m0, m7
611
+ paddd m1, m4
612
+ paddd m2, m3
613
+
614
+ movu m3, [r0 + FENC_STRIDE * 2]
615
+ movu m4, [r1 + r4 * 2]
616
+ movu m5, [r2 + r4 * 2]
617
+ movu m6, [r3 + r4 * 2]
618
+
619
+ psadbw m7, m3, m4
620
+ psadbw m4, m3, m5
621
+ psadbw m3, m6
622
+
623
+ paddd m0, m7
624
+ paddd m1, m4
625
+ paddd m2, m3
626
+
627
+ movu m3, [r0 + FENC_STRIDE * 3]
628
+ movu m4, [r1 + r6]
629
+ movu m5, [r2 + r6]
630
+ movu m6, [r3 + r6]
631
+
632
+ psadbw m7, m3, m4
633
+ psadbw m4, m3, m5
634
+ psadbw m3, m6
635
+
636
+ paddd m0, m7
637
+ paddd m1, m4
638
+ paddd m2, m3
639
+%endmacro
640
+
641
+%macro PROCESS_SAD_X3_32x4_AVX512 0
642
+ movu ym3, [r0]
643
+ movu ym4, [r1]
644
+ movu ym5, [r2]
645
+ movu ym6, [r3]
646
+ vinserti32x8 m3, [r0 + FENC_STRIDE], 1
647
+ vinserti32x8 m4, [r1 + r4], 1
648
+ vinserti32x8 m5, [r2 + r4], 1
649
+ vinserti32x8 m6, [r3 + r4], 1
650
+
651
+ psadbw m7, m3, m4
652
+ psadbw m4, m3, m5
653
+ psadbw m3, m6
654
+
655
+ paddd m0, m7
656
+ paddd m1, m4
657
+ paddd m2, m3
658
+
659
+ movu ym3, [r0 + FENC_STRIDE * 2]
660
+ movu ym4, [r1 + r4 * 2]
661
+ movu ym5, [r2 + r4 * 2]
662
+ movu ym6, [r3 + r4 * 2]
663
+ vinserti32x8 m3, [r0 + FENC_STRIDE * 3], 1
664
+ vinserti32x8 m4, [r1 + r6], 1
665
+ vinserti32x8 m5, [r2 + r6], 1
666
+ vinserti32x8 m6, [r3 + r6], 1
667
+
668
+ psadbw m7, m3, m4
669
+ psadbw m4, m3, m5
670
+ psadbw m3, m6
671
+
672
+ paddd m0, m7
673
+ paddd m1, m4
674
+ paddd m2, m3
675
+%endmacro
676
+
677
+%macro PROCESS_SAD_X3_48x4_AVX512 0
678
+ movu ym3, [r0]
679
+ movu ym4, [r1]
680
+ movu ym5, [r2]
681
+ movu ym6, [r3]
682
+ vinserti32x8 m3, [r0 + FENC_STRIDE], 1
683
+ vinserti32x8 m4, [r1 + r4], 1
684
+ vinserti32x8 m5, [r2 + r4], 1
685
+ vinserti32x8 m6, [r3 + r4], 1
686
+
687
+ psadbw m7, m3, m4
688
+ psadbw m4, m3, m5
689
+ psadbw m3, m6
690
+
691
+ paddd m0, m7
692
+ paddd m1, m4
693
+ paddd m2, m3
694
+
695
+ movu ym3, [r0 + FENC_STRIDE * 2]
696
+ movu ym4, [r1 + r4 * 2]
697
+ movu ym5, [r2 + r4 * 2]
698
+ movu ym6, [r3 + r4 * 2]
699
+ vinserti32x8 m3, [r0 + FENC_STRIDE * 3], 1
700
+ vinserti32x8 m4, [r1 + r6], 1
701
+ vinserti32x8 m5, [r2 + r6], 1
702
+ vinserti32x8 m6, [r3 + r6], 1
703
+
704
+ psadbw m7, m3, m4
705
+ psadbw m4, m3, m5
706
+ psadbw m3, m6
707
+
708
+ paddd m0, m7
709
+ paddd m1, m4
710
+ paddd m2, m3
711
+
712
+ movu xm3, [r0 + mmsize/2]
713
+ movu xm4, [r1 + mmsize/2]
714
+ movu xm5, [r2 + mmsize/2]
715
+ movu xm6, [r3 + mmsize/2]
716
+ vinserti32x4 m3, [r0 + FENC_STRIDE + mmsize/2], 1
717
+ vinserti32x4 m4, [r1 + r4 + mmsize/2], 1
718
+ vinserti32x4 m5, [r2 + r4 + mmsize/2], 1
719
+ vinserti32x4 m6, [r3 + r4 + mmsize/2], 1
720
+
721
+ vinserti32x4 m3, [r0 + 2 * FENC_STRIDE + mmsize/2], 2
722
+ vinserti32x4 m4, [r1 + 2 * r4 + mmsize/2], 2
723
+ vinserti32x4 m5, [r2 + 2 * r4 + mmsize/2], 2
724
+ vinserti32x4 m6, [r3 + 2 * r4 + mmsize/2], 2
725
+ vinserti32x4 m3, [r0 + 3 * FENC_STRIDE + mmsize/2], 3
726
+ vinserti32x4 m4, [r1 + r6 + mmsize/2], 3
727
+ vinserti32x4 m5, [r2 + r6 + mmsize/2], 3
728
+ vinserti32x4 m6, [r3 + r6 + mmsize/2], 3
729
+
730
+ psadbw m7, m3, m4
731
+ psadbw m4, m3, m5
732
+ psadbw m3, m6
733
+ paddd m0, m7
734
+ paddd m1, m4
735
+ paddd m2, m3
736
+%endmacro
737
+
738
+%macro PIXEL_SAD_X3_END_AVX512 0
739
+ vextracti32x8 ym3, m0, 1
740
+ vextracti32x8 ym4, m1, 1
741
+ vextracti32x8 ym5, m2, 1
742
+ paddd ym0, ym3
743
+ paddd ym1, ym4
744
+ paddd ym2, ym5
745
+ vextracti64x2 xm3, m0, 1
746
+ vextracti64x2 xm4, m1, 1
747
+ vextracti64x2 xm5, m2, 1
748
+ paddd xm0, xm3
749
+ paddd xm1, xm4
750
+ paddd xm2, xm5
751
+ pshufd xm3, xm0, 2
752
+ pshufd xm4, xm1, 2
753
+ pshufd xm5, xm2, 2
754
+ paddd xm0, xm3
755
+ paddd xm1, xm4
756
+ paddd xm2, xm5
757
+ movd [r5 + 0], xm0
758
+ movd [r5 + 4], xm1
759
+ movd [r5 + 8], xm2
760
+%endmacro
761
+
762
+%macro SAD_X3_AVX512 2
763
+INIT_ZMM avx512
764
+cglobal pixel_sad_x3_%1x%2, 6,7,8
765
+ pxor m0, m0
766
+ pxor m1, m1
767
+ pxor m2, m2
768
+ lea r6, [r4 * 3]
769
+
770
+%rep %2/4 - 1
771
+ PROCESS_SAD_X3_%1x4_AVX512
772
+ add r0, FENC_STRIDE * 4
773
+ lea r1, [r1 + r4 * 4]
774
+ lea r2, [r2 + r4 * 4]
775
+ lea r3, [r3 + r4 * 4]
776
+%endrep
777
+ PROCESS_SAD_X3_%1x4_AVX512
778
+ PIXEL_SAD_X3_END_AVX512
779
+ RET
780
+%endmacro
781
+
782
+SAD_X3_AVX512 64, 64
783
+SAD_X3_AVX512 64, 48
784
+SAD_X3_AVX512 64, 32
785
+SAD_X3_AVX512 64, 16
786
+SAD_X3_AVX512 32, 64
787
+SAD_X3_AVX512 32, 32
788
+SAD_X3_AVX512 32, 24
789
+SAD_X3_AVX512 32, 16
790
+SAD_X3_AVX512 32, 8
791
+SAD_X3_AVX512 48, 64
792
+;------------------------------------------------------------
793
+;sad_x3 avx512 code end
794
+;------------------------------------------------------------
795
+
796
INIT_YMM avx2
797
cglobal pixel_sad_x4_8x8, 7,7,5
798
xorps m0, m0
799
800
movd eax, xm0
801
RET
802
803
+%macro PROCESS_SAD_64x4_AVX512 0
804
+ movu m1, [r0]
805
+ movu m2, [r2]
806
+ movu m3, [r0 + r1]
807
+ movu m4, [r2 + r3]
808
+ psadbw m1, m2
809
+ psadbw m3, m4
810
+ paddd m0, m1
811
+ paddd m0, m3
812
+ movu m1, [r0 + 2 * r1]
813
+ movu m2, [r2 + 2 * r3]
814
+ movu m3, [r0 + r5]
815
+ movu m4, [r2 + r6]
816
+ psadbw m1, m2
817
+ psadbw m3, m4
818
+ paddd m0, m1
819
+ paddd m0, m3
820
+%endmacro
821
+
822
+%macro PROCESS_SAD_32x4_AVX512 0
823
+ movu ym1, [r0]
824
+ movu ym2, [r2]
825
+ movu ym3, [r0 + 2 * r1]
826
+ movu ym4, [r2 + 2 * r3]
827
+ vinserti32x8 m1, [r0 + r1], 1
828
+ vinserti32x8 m2, [r2 + r3], 1
829
+ vinserti32x8 m3, [r0 + r5], 1
830
+ vinserti32x8 m4, [r2 + r6], 1
831
+
832
+ psadbw m1, m2
833
+ psadbw m3, m4
834
+ paddd m0, m1
835
+ paddd m0, m3
836
+%endmacro
837
+
838
+%macro PROCESS_SAD_AVX512_END 0
839
+ vextracti32x8 ym1, m0, 1
840
+ paddd ym0, ym1
841
+ vextracti64x2 xm1, m0, 1
842
+ paddd xm0, xm1
843
+ pshufd xm1, xm0, 2
844
+ paddd xm0, xm1
845
+ movd eax, xm0
846
+%endmacro
847
+;-----------------------------------------------------------------------------
848
+; int pixel_sad_64x%1( uint8_t *, intptr_t, uint8_t *, intptr_t )
849
+;-----------------------------------------------------------------------------
850
+%macro SAD_MxN_AVX512 2
851
+INIT_ZMM avx512
852
+cglobal pixel_sad_%1x%2, 4, 7, 5
853
+ pxor m0, m0
854
+ lea r5, [3 * r1]
855
+ lea r6, [3 * r3]
856
+
857
+%rep %2/4 - 1
858
+ PROCESS_SAD_%1x4_AVX512
859
+ lea r2, [r2 + 4 * r3]
860
+ lea r0, [r0 + 4 * r1]
861
+%endrep
862
+ PROCESS_SAD_%1x4_AVX512
863
+ PROCESS_SAD_AVX512_END
864
+ RET
865
+%endmacro
866
+
867
+SAD_MxN_AVX512 64, 16
868
+SAD_MxN_AVX512 64, 32
869
+SAD_MxN_AVX512 64, 48
870
+SAD_MxN_AVX512 64, 64
871
+SAD_MxN_AVX512 32, 8
872
+SAD_MxN_AVX512 32, 16
873
+SAD_MxN_AVX512 32, 24
874
+SAD_MxN_AVX512 32, 32
875
+SAD_MxN_AVX512 32, 64
876
%endif
877
x265_2.7.tar.gz/source/common/x86/sad16-a.asm -> x265_2.9.tar.gz/source/common/x86/sad16-a.asm
Changed
2819
1
2
SAD_12 12, 16
3
4
5
+%macro PROCESS_SAD_64x8_AVX512 0
6
+ movu m1, [r2]
7
+ movu m2, [r2 + mmsize]
8
+ movu m3, [r2 + r3]
9
+ movu m4, [r2 + r3 + mmsize]
10
+ psubw m1, [r0]
11
+ psubw m2, [r0 + mmsize]
12
+ psubw m3, [r0 + r1]
13
+ psubw m4, [r0 + r1 + mmsize]
14
+ pabsw m1, m1
15
+ pabsw m2, m2
16
+ pabsw m3, m3
17
+ pabsw m4, m4
18
+ paddw m1, m2
19
+ paddw m3, m4
20
+ paddw m5, m1, m3
21
+
22
+ movu m1, [r2 + 2 * r3]
23
+ movu m2, [r2 + 2 * r3 + mmsize]
24
+ movu m3, [r2 + r5]
25
+ movu m4, [r2 + r5 + mmsize]
26
+ psubw m1, [r0 + 2 * r1]
27
+ psubw m2, [r0 + 2 * r1 + mmsize]
28
+ psubw m3, [r0 + r4]
29
+ psubw m4, [r0 + r4 + mmsize]
30
+ pabsw m1, m1
31
+ pabsw m2, m2
32
+ pabsw m3, m3
33
+ pabsw m4, m4
34
+ paddw m1, m2
35
+ paddw m3, m4
36
+ paddw m1, m3
37
+
38
+ lea r0, [r0 + 4 * r1]
39
+ lea r2, [r2 + 4 * r3]
40
+
41
+ pmaddwd m5, m6
42
+ paddd m0, m5
43
+ pmaddwd m1, m6
44
+ paddd m0, m1
45
+
46
+ movu m1, [r2]
47
+ movu m2, [r2 + mmsize]
48
+ movu m3, [r2 + r3]
49
+ movu m4, [r2 + r3 + mmsize]
50
+ psubw m1, [r0]
51
+ psubw m2, [r0 + mmsize]
52
+ psubw m3, [r0 + r1]
53
+ psubw m4, [r0 + r1 + mmsize]
54
+ pabsw m1, m1
55
+ pabsw m2, m2
56
+ pabsw m3, m3
57
+ pabsw m4, m4
58
+ paddw m1, m2
59
+ paddw m3, m4
60
+ paddw m5, m1, m3
61
+
62
+ movu m1, [r2 + 2 * r3]
63
+ movu m2, [r2 + 2 * r3 + mmsize]
64
+ movu m3, [r2 + r5]
65
+ movu m4, [r2 + r5 + mmsize]
66
+ psubw m1, [r0 + 2 * r1]
67
+ psubw m2, [r0 + 2 * r1 + mmsize]
68
+ psubw m3, [r0 + r4]
69
+ psubw m4, [r0 + r4 + mmsize]
70
+ pabsw m1, m1
71
+ pabsw m2, m2
72
+ pabsw m3, m3
73
+ pabsw m4, m4
74
+ paddw m1, m2
75
+ paddw m3, m4
76
+ paddw m1, m3
77
+
78
+ pmaddwd m5, m6
79
+ paddd m0, m5
80
+ pmaddwd m1, m6
81
+ paddd m0, m1
82
+%endmacro
83
+
84
+
85
+%macro PROCESS_SAD_32x8_AVX512 0
86
+ movu m1, [r2]
87
+ movu m2, [r2 + r3]
88
+ movu m3, [r2 + 2 * r3]
89
+ movu m4, [r2 + r5]
90
+ psubw m1, [r0]
91
+ psubw m2, [r0 + r1]
92
+ psubw m3, [r0 + 2 * r1]
93
+ psubw m4, [r0 + r4]
94
+ pabsw m1, m1
95
+ pabsw m2, m2
96
+ pabsw m3, m3
97
+ pabsw m4, m4
98
+ paddw m1, m2
99
+ paddw m3, m4
100
+ paddw m5, m1, m3
101
+
102
+ lea r0, [r0 + 4 * r1]
103
+ lea r2, [r2 + 4 * r3]
104
+
105
+ movu m1, [r2]
106
+ movu m2, [r2 + r3]
107
+ movu m3, [r2 + 2 * r3]
108
+ movu m4, [r2 + r5]
109
+ psubw m1, [r0]
110
+ psubw m2, [r0 + r1]
111
+ psubw m3, [r0 + 2 * r1]
112
+ psubw m4, [r0 + r4]
113
+ pabsw m1, m1
114
+ pabsw m2, m2
115
+ pabsw m3, m3
116
+ pabsw m4, m4
117
+ paddw m1, m2
118
+ paddw m3, m4
119
+ paddw m1, m3
120
+
121
+ pmaddwd m5, m6
122
+ paddd m0, m5
123
+ pmaddwd m1, m6
124
+ paddd m0, m1
125
+%endmacro
126
+
127
+%macro PROCESS_SAD_16x8_AVX512 0
128
+ movu ym1, [r2]
129
+ vinserti64x4 m1, [r2 + r3], 1
130
+ movu ym2, [r2 + 2 * r3]
131
+ vinserti64x4 m2, [r2 + r5], 1
132
+ movu ym3, [r0]
133
+ vinserti64x4 m3, [r0 + r1], 1
134
+ movu ym4, [r0 + 2 * r1]
135
+ vinserti64x4 m4, [r0 + r4], 1
136
+
137
+ psubw m1, m3
138
+ psubw m2, m4
139
+ pabsw m1, m1
140
+ pabsw m2, m2
141
+ paddw m5, m1, m2
142
+
143
+ lea r0, [r0 + 4 * r1]
144
+ lea r2, [r2 + 4 * r3]
145
+
146
+ movu ym1, [r2]
147
+ vinserti64x4 m1, [r2 + r3], 1
148
+ movu ym2, [r2 + 2 * r3]
149
+ vinserti64x4 m2, [r2 + r5], 1
150
+ movu ym3, [r0]
151
+ vinserti64x4 m3, [r0 + r1], 1
152
+ movu ym4, [r0 + 2 * r1]
153
+ vinserti64x4 m4, [r0 + r4], 1
154
+
155
+ psubw m1, m3
156
+ psubw m2, m4
157
+ pabsw m1, m1
158
+ pabsw m2, m2
159
+ paddw m1, m2
160
+
161
+ pmaddwd m5, m6
162
+ paddd m0, m5
163
+ pmaddwd m1, m6
164
+ paddd m0, m1
165
+%endmacro
166
+
167
+%macro PROCESS_SAD_AVX512_END 0
168
+ vextracti32x8 ym1, m0, 1
169
+ paddd ym0, ym1
170
+ vextracti64x2 xm1, m0, 1
171
+ paddd xm0, xm1
172
+ pshufd xm1, xm0, 00001110b
173
+ paddd xm0, xm1
174
+ pshufd xm1, xm0, 00000001b
175
+ paddd xm0, xm1
176
+ movd eax, xm0
177
+%endmacro
178
+
179
+;-----------------------------------------------------------------------------
180
+; int pixel_sad_64x%1( uint16_t *, intptr_t, uint16_t *, intptr_t )
181
+;-----------------------------------------------------------------------------
182
+%if ARCH_X86_64
183
+INIT_ZMM avx512
184
+cglobal pixel_sad_64x16, 4,6,7
185
+ pxor m0, m0
186
+
187
+ vbroadcasti32x8 m6, [pw_1]
188
+
189
+ add r3d, r3d
190
+ add r1d, r1d
191
+ lea r4d, [r1 * 3]
192
+ lea r5d, [r3 * 3]
193
+
194
+ PROCESS_SAD_64x8_AVX512
195
+ lea r2, [r2 + 4 * r3]
196
+ lea r0, [r0 + 4 * r1]
197
+ PROCESS_SAD_64x8_AVX512
198
+ PROCESS_SAD_AVX512_END
199
+ RET
200
+
201
+INIT_ZMM avx512
202
+cglobal pixel_sad_64x32, 4,6,7
203
+ pxor m0, m0
204
+
205
+ vbroadcasti32x8 m6, [pw_1]
206
+
207
+ add r3d, r3d
208
+ add r1d, r1d
209
+ lea r4d, [r1 * 3]
210
+ lea r5d, [r3 * 3]
211
+
212
+ PROCESS_SAD_64x8_AVX512
213
+ lea r2, [r2 + 4 * r3]
214
+ lea r0, [r0 + 4 * r1]
215
+ PROCESS_SAD_64x8_AVX512
216
+ lea r2, [r2 + 4 * r3]
217
+ lea r0, [r0 + 4 * r1]
218
+ PROCESS_SAD_64x8_AVX512
219
+ lea r2, [r2 + 4 * r3]
220
+ lea r0, [r0 + 4 * r1]
221
+ PROCESS_SAD_64x8_AVX512
222
+ PROCESS_SAD_AVX512_END
223
+ RET
224
+
225
+INIT_ZMM avx512
226
+cglobal pixel_sad_64x48, 4,6,7
227
+ pxor m0, m0
228
+
229
+ vbroadcasti32x8 m6, [pw_1]
230
+
231
+ add r3d, r3d
232
+ add r1d, r1d
233
+ lea r4d, [r1 * 3]
234
+ lea r5d, [r3 * 3]
235
+
236
+ PROCESS_SAD_64x8_AVX512
237
+ lea r2, [r2 + 4 * r3]
238
+ lea r0, [r0 + 4 * r1]
239
+ PROCESS_SAD_64x8_AVX512
240
+ lea r2, [r2 + 4 * r3]
241
+ lea r0, [r0 + 4 * r1]
242
+ PROCESS_SAD_64x8_AVX512
243
+ lea r2, [r2 + 4 * r3]
244
+ lea r0, [r0 + 4 * r1]
245
+ PROCESS_SAD_64x8_AVX512
246
+ lea r2, [r2 + 4 * r3]
247
+ lea r0, [r0 + 4 * r1]
248
+ PROCESS_SAD_64x8_AVX512
249
+ lea r2, [r2 + 4 * r3]
250
+ lea r0, [r0 + 4 * r1]
251
+ PROCESS_SAD_64x8_AVX512
252
+ PROCESS_SAD_AVX512_END
253
+ RET
254
+
255
+INIT_ZMM avx512
256
+cglobal pixel_sad_64x64, 4,6,7
257
+ pxor m0, m0
258
+
259
+ vbroadcasti32x8 m6, [pw_1]
260
+
261
+ add r3d, r3d
262
+ add r1d, r1d
263
+ lea r4d, [r1 * 3]
264
+ lea r5d, [r3 * 3]
265
+
266
+ PROCESS_SAD_64x8_AVX512
267
+ lea r2, [r2 + 4 * r3]
268
+ lea r0, [r0 + 4 * r1]
269
+ PROCESS_SAD_64x8_AVX512
270
+ lea r2, [r2 + 4 * r3]
271
+ lea r0, [r0 + 4 * r1]
272
+ PROCESS_SAD_64x8_AVX512
273
+ lea r2, [r2 + 4 * r3]
274
+ lea r0, [r0 + 4 * r1]
275
+ PROCESS_SAD_64x8_AVX512
276
+ lea r2, [r2 + 4 * r3]
277
+ lea r0, [r0 + 4 * r1]
278
+ PROCESS_SAD_64x8_AVX512
279
+ lea r2, [r2 + 4 * r3]
280
+ lea r0, [r0 + 4 * r1]
281
+ PROCESS_SAD_64x8_AVX512
282
+ lea r2, [r2 + 4 * r3]
283
+ lea r0, [r0 + 4 * r1]
284
+ PROCESS_SAD_64x8_AVX512
285
+ lea r2, [r2 + 4 * r3]
286
+ lea r0, [r0 + 4 * r1]
287
+ PROCESS_SAD_64x8_AVX512
288
+ PROCESS_SAD_AVX512_END
289
+ RET
290
+%endif
291
+
292
+;-----------------------------------------------------------------------------
293
+; int pixel_sad_32x%1( uint16_t *, intptr_t, uint16_t *, intptr_t )
294
+;-----------------------------------------------------------------------------
295
+%if ARCH_X86_64
296
+INIT_ZMM avx512
297
+cglobal pixel_sad_32x8, 4,6,7
298
+ pxor m0, m0
299
+
300
+ vbroadcasti32x8 m6, [pw_1]
301
+
302
+ add r3d, r3d
303
+ add r1d, r1d
304
+ lea r4d, [r1 * 3]
305
+ lea r5d, [r3 * 3]
306
+
307
+ PROCESS_SAD_32x8_AVX512
308
+ PROCESS_SAD_AVX512_END
309
+ RET
310
+
311
+
312
+INIT_ZMM avx512
313
+cglobal pixel_sad_32x16, 4,6,7
314
+ pxor m0, m0
315
+
316
+ vbroadcasti32x8 m6, [pw_1]
317
+
318
+ add r3d, r3d
319
+ add r1d, r1d
320
+ lea r4d, [r1 * 3]
321
+ lea r5d, [r3 * 3]
322
+
323
+ PROCESS_SAD_32x8_AVX512
324
+ lea r2, [r2 + 4 * r3]
325
+ lea r0, [r0 + 4 * r1]
326
+ PROCESS_SAD_32x8_AVX512
327
+ PROCESS_SAD_AVX512_END
328
+ RET
329
+
330
+INIT_ZMM avx512
331
+cglobal pixel_sad_32x24, 4,6,7
332
+ pxor m0, m0
333
+
334
+ vbroadcasti32x8 m6, [pw_1]
335
+
336
+ add r3d, r3d
337
+ add r1d, r1d
338
+ lea r4d, [r1 * 3]
339
+ lea r5d, [r3 * 3]
340
+
341
+ PROCESS_SAD_32x8_AVX512
342
+ lea r2, [r2 + 4 * r3]
343
+ lea r0, [r0 + 4 * r1]
344
+ PROCESS_SAD_32x8_AVX512
345
+ lea r2, [r2 + 4 * r3]
346
+ lea r0, [r0 + 4 * r1]
347
+ PROCESS_SAD_32x8_AVX512
348
+ PROCESS_SAD_AVX512_END
349
+ RET
350
+
351
+INIT_ZMM avx512
352
+cglobal pixel_sad_32x32, 4,6,7
353
+ pxor m0, m0
354
+
355
+ vbroadcasti32x8 m6, [pw_1]
356
+
357
+ add r3d, r3d
358
+ add r1d, r1d
359
+ lea r4d, [r1 * 3]
360
+ lea r5d, [r3 * 3]
361
+
362
+ PROCESS_SAD_32x8_AVX512
363
+ lea r2, [r2 + 4 * r3]
364
+ lea r0, [r0 + 4 * r1]
365
+ PROCESS_SAD_32x8_AVX512
366
+ lea r2, [r2 + 4 * r3]
367
+ lea r0, [r0 + 4 * r1]
368
+ PROCESS_SAD_32x8_AVX512
369
+ lea r2, [r2 + 4 * r3]
370
+ lea r0, [r0 + 4 * r1]
371
+ PROCESS_SAD_32x8_AVX512
372
+ PROCESS_SAD_AVX512_END
373
+ RET
374
+
375
+INIT_ZMM avx512
376
+cglobal pixel_sad_32x64, 4,6,7
377
+ pxor m0, m0
378
+
379
+ vbroadcasti32x8 m6, [pw_1]
380
+
381
+ add r3d, r3d
382
+ add r1d, r1d
383
+ lea r4d, [r1 * 3]
384
+ lea r5d, [r3 * 3]
385
+
386
+ PROCESS_SAD_32x8_AVX512
387
+ lea r2, [r2 + 4 * r3]
388
+ lea r0, [r0 + 4 * r1]
389
+ PROCESS_SAD_32x8_AVX512
390
+ lea r2, [r2 + 4 * r3]
391
+ lea r0, [r0 + 4 * r1]
392
+ PROCESS_SAD_32x8_AVX512
393
+ lea r2, [r2 + 4 * r3]
394
+ lea r0, [r0 + 4 * r1]
395
+ PROCESS_SAD_32x8_AVX512
396
+ lea r2, [r2 + 4 * r3]
397
+ lea r0, [r0 + 4 * r1]
398
+ PROCESS_SAD_32x8_AVX512
399
+ lea r2, [r2 + 4 * r3]
400
+ lea r0, [r0 + 4 * r1]
401
+ PROCESS_SAD_32x8_AVX512
402
+ lea r2, [r2 + 4 * r3]
403
+ lea r0, [r0 + 4 * r1]
404
+ PROCESS_SAD_32x8_AVX512
405
+ lea r2, [r2 + 4 * r3]
406
+ lea r0, [r0 + 4 * r1]
407
+ PROCESS_SAD_32x8_AVX512
408
+ PROCESS_SAD_AVX512_END
409
+ RET
410
+%endif
411
+
412
+;-----------------------------------------------------------------------------
413
+; int pixel_sad_16x%1( uint16_t *, intptr_t, uint16_t *, intptr_t )
414
+;-----------------------------------------------------------------------------
415
+%if ARCH_X86_64
416
+INIT_ZMM avx512
417
+cglobal pixel_sad_16x32, 4,6,7
418
+ pxor m0, m0
419
+
420
+ vbroadcasti32x8 m6, [pw_1]
421
+
422
+ add r3d, r3d
423
+ add r1d, r1d
424
+ lea r4d, [r1 * 3]
425
+ lea r5d, [r3 * 3]
426
+
427
+ %rep 3
428
+ PROCESS_SAD_16x8_AVX512
429
+ lea r2, [r2 + 4 * r3]
430
+ lea r0, [r0 + 4 * r1]
431
+ %endrep
432
+ PROCESS_SAD_16x8_AVX512
433
+ PROCESS_SAD_AVX512_END
434
+ RET
435
+
436
+INIT_ZMM avx512
437
+cglobal pixel_sad_16x64, 4,6,7
438
+ pxor m0, m0
439
+
440
+ vbroadcasti32x8 m6, [pw_1]
441
+
442
+ add r3d, r3d
443
+ add r1d, r1d
444
+ lea r4d, [r1 * 3]
445
+ lea r5d, [r3 * 3]
446
+
447
+ %rep 7
448
+ PROCESS_SAD_16x8_AVX512
449
+ lea r2, [r2 + 4 * r3]
450
+ lea r0, [r0 + 4 * r1]
451
+ %endrep
452
+ PROCESS_SAD_16x8_AVX512
453
+ PROCESS_SAD_AVX512_END
454
+ RET
455
+%endif
456
+
457
+;-----------------------------------------------------------------------------
458
+; int pixel_sad_48x64( uint16_t *, intptr_t, uint16_t *, intptr_t )
459
+;-----------------------------------------------------------------------------
460
+%if ARCH_X86_64
461
+INIT_ZMM avx512
462
+cglobal pixel_sad_48x64, 4, 7, 9
463
+ pxor m0, m0
464
+ mov r6d, 64/8
465
+
466
+ vbroadcasti32x8 m8, [pw_1]
467
+
468
+ add r3d, r3d
469
+ add r1d, r1d
470
+ lea r4d, [r1 * 3]
471
+ lea r5d, [r3 * 3]
472
+.loop:
473
+ movu m1, [r2]
474
+ movu m2, [r2 + r3]
475
+ movu ym3, [r2 + mmsize]
476
+ vinserti32x8 m3, [r2 + r3 + mmsize], 1
477
+ movu m4, [r0]
478
+ movu m5, [r0 + r1]
479
+ movu ym6, [r0 + mmsize]
480
+ vinserti32x8 m6, [r0 + r1 + mmsize], 1
481
+
482
+ psubw m1, m4
483
+ psubw m2, m5
484
+ psubw m3, m6
485
+ pabsw m1, m1
486
+ pabsw m2, m2
487
+ pabsw m3, m3
488
+ paddw m1, m2
489
+ paddw m7, m3, m1
490
+
491
+ movu m1, [r2 + 2 * r3]
492
+ movu m2, [r2 + r5]
493
+ movu ym3, [r2 + 2 * r3 + mmsize]
494
+ vinserti32x8 m3, [r2 + r5 + mmsize], 1
495
+ movu m4, [r0 + 2 * r1]
496
+ movu m5, [r0 + r4]
497
+ movu ym6, [r0 + 2 * r1 + mmsize]
498
+ vinserti32x8 m6, [r0 + r4 + mmsize], 1
499
+ psubw m1, m4
500
+ psubw m2, m5
501
+ psubw m3, m6
502
+ pabsw m1, m1
503
+ pabsw m2, m2
504
+ pabsw m3, m3
505
+ paddw m1, m2
506
+ paddw m1, m3
507
+
508
+ pmaddwd m7, m8
509
+ paddd m0, m7
510
+ pmaddwd m1, m8
511
+ paddd m0, m1
512
+ lea r0, [r0 + 4 * r1]
513
+ lea r2, [r2 + 4 * r3]
514
+
515
+ movu m1, [r2]
516
+ movu m2, [r2 + r3]
517
+ movu ym3, [r2 + mmsize]
518
+ vinserti32x8 m3, [r2 + r3 + mmsize], 1
519
+ movu m4, [r0]
520
+ movu m5, [r0 + r1]
521
+ movu ym6, [r0 + mmsize]
522
+ vinserti32x8 m6, [r0 + r1 + mmsize], 1
523
+
524
+ psubw m1, m4
525
+ psubw m2, m5
526
+ psubw m3, m6
527
+ pabsw m1, m1
528
+ pabsw m2, m2
529
+ pabsw m3, m3
530
+ paddw m1, m2
531
+ paddw m7, m3, m1
532
+
533
+ movu m1, [r2 + 2 * r3]
534
+ movu m2, [r2 + r5]
535
+ movu ym3, [r2 + 2 * r3 + mmsize]
536
+ vinserti32x8 m3, [r2 + r5 + mmsize], 1
537
+ movu m4, [r0 + 2 * r1]
538
+ movu m5, [r0 + r4]
539
+ movu ym6, [r0 + 2 * r1 + mmsize]
540
+ vinserti32x8 m6, [r0 + r4 + mmsize], 1
541
+ psubw m1, m4
542
+ psubw m2, m5
543
+ psubw m3, m6
544
+ pabsw m1, m1
545
+ pabsw m2, m2
546
+ pabsw m3, m3
547
+ paddw m1, m2
548
+ paddw m1, m3
549
+
550
+ pmaddwd m7, m8
551
+ paddd m0, m7
552
+ pmaddwd m1, m8
553
+ paddd m0, m1
554
+ lea r0, [r0 + 4 * r1]
555
+ lea r2, [r2 + 4 * r3]
556
+
557
+ dec r6d
558
+ jg .loop
559
+
560
+ PROCESS_SAD_AVX512_END
561
+ RET
562
+%endif
563
+
564
;=============================================================================
565
; SAD x3/x4
566
;=============================================================================
567
568
SAD_X 4, 64, 48
569
SAD_X 4, 64, 64
570
571
+;============================
572
+; SAD x3/x4 avx512 code start
573
+;============================
574
+
575
+%macro PROCESS_SAD_X4_16x4_AVX512 0
576
+ movu ym8, [r0]
577
+ vinserti64x4 m8, [r0 + 2 * FENC_STRIDE], 1
578
+ movu ym4, [r1]
579
+ vinserti64x4 m4, [r1 + r5], 1
580
+ movu ym5, [r2]
581
+ vinserti64x4 m5, [r2 + r5], 1
582
+ movu ym6, [r3]
583
+ vinserti64x4 m6, [r3 + r5], 1
584
+ movu ym7, [r4]
585
+ vinserti64x4 m7, [r4 + r5], 1
586
+
587
+
588
+ psubw m4, m8
589
+ psubw m5, m8
590
+ psubw m6, m8
591
+ psubw m7, m8
592
+ pabsw m4, m4
593
+ pabsw m5, m5
594
+ pabsw m6, m6
595
+ pabsw m7, m7
596
+
597
+ pmaddwd m4, m9
598
+ paddd m0, m4
599
+ pmaddwd m5, m9
600
+ paddd m1, m5
601
+ pmaddwd m6, m9
602
+ paddd m2, m6
603
+ pmaddwd m7, m9
604
+ paddd m3, m7
605
+
606
+ movu ym8, [r0 + 4 * FENC_STRIDE]
607
+ vinserti64x4 m8, [r0 + 6 * FENC_STRIDE], 1
608
+ movu ym4, [r1 + 2 * r5]
609
+ vinserti64x4 m4, [r1 + r7], 1
610
+ movu ym5, [r2 + 2 * r5]
611
+ vinserti64x4 m5, [r2 + r7], 1
612
+ movu ym6, [r3 + 2 * r5]
613
+ vinserti64x4 m6, [r3 + r7], 1
614
+ movu ym7, [r4 + 2 * r5]
615
+ vinserti64x4 m7, [r4 + r7], 1
616
+
617
+ psubw m4, m8
618
+ psubw m5, m8
619
+ psubw m6, m8
620
+ psubw m7, m8
621
+ pabsw m4, m4
622
+ pabsw m5, m5
623
+ pabsw m6, m6
624
+ pabsw m7, m7
625
+
626
+ pmaddwd m4, m9
627
+ paddd m0, m4
628
+ pmaddwd m5, m9
629
+ paddd m1, m5
630
+ pmaddwd m6, m9
631
+ paddd m2, m6
632
+ pmaddwd m7, m9
633
+ paddd m3, m7
634
+%endmacro
635
+
636
+%macro PROCESS_SAD_X4_32x4_AVX512 0
637
+ movu m8, [r0]
638
+ movu m4, [r1]
639
+ movu m5, [r2]
640
+ movu m6, [r3]
641
+ movu m7, [r4]
642
+
643
+
644
+ psubw m4, m8
645
+ psubw m5, m8
646
+ psubw m6, m8
647
+ psubw m7, m8
648
+ pabsw m4, m4
649
+ pabsw m5, m5
650
+ pabsw m6, m6
651
+ pabsw m7, m7
652
+
653
+ pmaddwd m4, m9
654
+ paddd m0, m4
655
+ pmaddwd m5, m9
656
+ paddd m1, m5
657
+ pmaddwd m6, m9
658
+ paddd m2, m6
659
+ pmaddwd m7, m9
660
+ paddd m3, m7
661
+
662
+
663
+ movu m8, [r0 + 2 * FENC_STRIDE]
664
+ movu m4, [r1 + r5]
665
+ movu m5, [r2 + r5]
666
+ movu m6, [r3 + r5]
667
+ movu m7, [r4 + r5]
668
+
669
+
670
+ psubw m4, m8
671
+ psubw m5, m8
672
+ psubw m6, m8
673
+ psubw m7, m8
674
+ pabsw m4, m4
675
+ pabsw m5, m5
676
+ pabsw m6, m6
677
+ pabsw m7, m7
678
+
679
+ pmaddwd m4, m9
680
+ paddd m0, m4
681
+ pmaddwd m5, m9
682
+ paddd m1, m5
683
+ pmaddwd m6, m9
684
+ paddd m2, m6
685
+ pmaddwd m7, m9
686
+ paddd m3, m7
687
+
688
+ movu m8, [r0 + 4 * FENC_STRIDE]
689
+ movu m4, [r1 + 2 * r5]
690
+ movu m5, [r2 + 2 * r5]
691
+ movu m6, [r3 + 2 * r5]
692
+ movu m7, [r4 + 2 * r5]
693
+
694
+
695
+ psubw m4, m8
696
+ psubw m5, m8
697
+ psubw m6, m8
698
+ psubw m7, m8
699
+ pabsw m4, m4
700
+ pabsw m5, m5
701
+ pabsw m6, m6
702
+ pabsw m7, m7
703
+
704
+ pmaddwd m4, m9
705
+ paddd m0, m4
706
+ pmaddwd m5, m9
707
+ paddd m1, m5
708
+ pmaddwd m6, m9
709
+ paddd m2, m6
710
+ pmaddwd m7, m9
711
+ paddd m3, m7
712
+
713
+ movu m8, [r0 + 6 * FENC_STRIDE]
714
+ movu m4, [r1 + r7]
715
+ movu m5, [r2 + r7]
716
+ movu m6, [r3 + r7]
717
+ movu m7, [r4 + r7]
718
+
719
+
720
+ psubw m4, m8
721
+ psubw m5, m8
722
+ psubw m6, m8
723
+ psubw m7, m8
724
+ pabsw m4, m4
725
+ pabsw m5, m5
726
+ pabsw m6, m6
727
+ pabsw m7, m7
728
+
729
+ pmaddwd m4, m9
730
+ paddd m0, m4
731
+ pmaddwd m5, m9
732
+ paddd m1, m5
733
+ pmaddwd m6, m9
734
+ paddd m2, m6
735
+ pmaddwd m7, m9
736
+ paddd m3, m7
737
+%endmacro
738
+
739
+%macro PROCESS_SAD_X4_64x4_AVX512 0
740
+ movu m8, [r0]
741
+ movu m10, [r0 + mmsize]
742
+ movu m4, [r1]
743
+ movu m11, [r1 + mmsize]
744
+ movu m5, [r2]
745
+ movu m12, [r2 + mmsize]
746
+ movu m6, [r3]
747
+ movu m13, [r3 + mmsize]
748
+ movu m7, [r4]
749
+ movu m14, [r4 + mmsize]
750
+
751
+ psubw m4, m8
752
+ psubw m5, m8
753
+ psubw m6, m8
754
+ psubw m7, m8
755
+ psubw m11, m10
756
+ psubw m12, m10
757
+ psubw m13, m10
758
+ psubw m14, m10
759
+ pabsw m4, m4
760
+ pabsw m5, m5
761
+ pabsw m6, m6
762
+ pabsw m7, m7
763
+ pabsw m11, m11
764
+ pabsw m12, m12
765
+ pabsw m13, m13
766
+ pabsw m14, m14
767
+ paddw m4, m11
768
+ paddw m5, m12
769
+ paddw m6, m13
770
+ paddw m7, m14
771
+
772
+ pmaddwd m4, m9
773
+ paddd m0, m4
774
+ pmaddwd m5, m9
775
+ paddd m1, m5
776
+ pmaddwd m6, m9
777
+ paddd m2, m6
778
+ pmaddwd m7, m9
779
+ paddd m3, m7
780
+
781
+
782
+ movu m8, [r0 + 2 * FENC_STRIDE]
783
+ movu m10, [r0 + 2 * FENC_STRIDE + mmsize]
784
+ movu m4, [r1 + r5]
785
+ movu m11, [r1 + r5 + mmsize]
786
+ movu m5, [r2 + r5]
787
+ movu m12, [r2 + r5 + mmsize]
788
+ movu m6, [r3 + r5]
789
+ movu m13, [r3 + r5 + mmsize]
790
+ movu m7, [r4 + r5]
791
+ movu m14, [r4 + r5 + mmsize]
792
+
793
+ psubw m4, m8
794
+ psubw m5, m8
795
+ psubw m6, m8
796
+ psubw m7, m8
797
+ psubw m11, m10
798
+ psubw m12, m10
799
+ psubw m13, m10
800
+ psubw m14, m10
801
+ pabsw m4, m4
802
+ pabsw m5, m5
803
+ pabsw m6, m6
804
+ pabsw m7, m7
805
+ pabsw m11, m11
806
+ pabsw m12, m12
807
+ pabsw m13, m13
808
+ pabsw m14, m14
809
+ paddw m4, m11
810
+ paddw m5, m12
811
+ paddw m6, m13
812
+ paddw m7, m14
813
+
814
+ pmaddwd m4, m9
815
+ paddd m0, m4
816
+ pmaddwd m5, m9
817
+ paddd m1, m5
818
+ pmaddwd m6, m9
819
+ paddd m2, m6
820
+ pmaddwd m7, m9
821
+ paddd m3, m7
822
+
823
+ movu m8, [r0 + 4 * FENC_STRIDE]
824
+ movu m10, [r0 + 4 * FENC_STRIDE + mmsize]
825
+ movu m4, [r1 + 2 * r5]
826
+ movu m11, [r1 + 2 * r5 + mmsize]
827
+ movu m5, [r2 + 2 * r5]
828
+ movu m12, [r2 + 2 * r5 + mmsize]
829
+ movu m6, [r3 + 2 * r5]
830
+ movu m13, [r3 + 2 * r5 + mmsize]
831
+ movu m7, [r4 + 2 * r5]
832
+ movu m14, [r4 + 2 * r5 + mmsize]
833
+
834
+ psubw m4, m8
835
+ psubw m5, m8
836
+ psubw m6, m8
837
+ psubw m7, m8
838
+ psubw m11, m10
839
+ psubw m12, m10
840
+ psubw m13, m10
841
+ psubw m14, m10
842
+ pabsw m4, m4
843
+ pabsw m5, m5
844
+ pabsw m6, m6
845
+ pabsw m7, m7
846
+ pabsw m11, m11
847
+ pabsw m12, m12
848
+ pabsw m13, m13
849
+ pabsw m14, m14
850
+ paddw m4, m11
851
+ paddw m5, m12
852
+ paddw m6, m13
853
+ paddw m7, m14
854
+
855
+ pmaddwd m4, m9
856
+ paddd m0, m4
857
+ pmaddwd m5, m9
858
+ paddd m1, m5
859
+ pmaddwd m6, m9
860
+ paddd m2, m6
861
+ pmaddwd m7, m9
862
+ paddd m3, m7
863
+
864
+ movu m8, [r0 + 6 * FENC_STRIDE]
865
+ movu m10, [r0 + 6 * FENC_STRIDE + mmsize]
866
+ movu m4, [r1 + r7]
867
+ movu m11, [r1 + r7 + mmsize]
868
+ movu m5, [r2 + r7]
869
+ movu m12, [r2 + r7 + mmsize]
870
+ movu m6, [r3 + r7]
871
+ movu m13, [r3 + r7 + mmsize]
872
+ movu m7, [r4 + r7]
873
+ movu m14, [r4 + r7 + mmsize]
874
+
875
+ psubw m4, m8
876
+ psubw m5, m8
877
+ psubw m6, m8
878
+ psubw m7, m8
879
+ psubw m11, m10
880
+ psubw m12, m10
881
+ psubw m13, m10
882
+ psubw m14, m10
883
+ pabsw m4, m4
884
+ pabsw m5, m5
885
+ pabsw m6, m6
886
+ pabsw m7, m7
887
+ pabsw m11, m11
888
+ pabsw m12, m12
889
+ pabsw m13, m13
890
+ pabsw m14, m14
891
+ paddw m4, m11
892
+ paddw m5, m12
893
+ paddw m6, m13
894
+ paddw m7, m14
895
+
896
+ pmaddwd m4, m9
897
+ paddd m0, m4
898
+ pmaddwd m5, m9
899
+ paddd m1, m5
900
+ pmaddwd m6, m9
901
+ paddd m2, m6
902
+ pmaddwd m7, m9
903
+ paddd m3, m7
904
+%endmacro
905
+
906
+%macro PROCESS_SAD_X4_END_AVX512 0
907
+ vextracti32x8 ym4, m0, 1
908
+ vextracti32x8 ym5, m1, 1
909
+ vextracti32x8 ym6, m2, 1
910
+ vextracti32x8 ym7, m3, 1
911
+
912
+ paddd ym0, ym4
913
+ paddd ym1, ym5
914
+ paddd ym2, ym6
915
+ paddd ym3, ym7
916
+
917
+ vextracti64x2 xm4, m0, 1
918
+ vextracti64x2 xm5, m1, 1
919
+ vextracti64x2 xm6, m2, 1
920
+ vextracti64x2 xm7, m3, 1
921
+
922
+ paddd xm0, xm4
923
+ paddd xm1, xm5
924
+ paddd xm2, xm6
925
+ paddd xm3, xm7
926
+
927
+ pshufd xm4, xm0, 00001110b
928
+ pshufd xm5, xm1, 00001110b
929
+ pshufd xm6, xm2, 00001110b
930
+ pshufd xm7, xm3, 00001110b
931
+
932
+ paddd xm0, xm4
933
+ paddd xm1, xm5
934
+ paddd xm2, xm6
935
+ paddd xm3, xm7
936
+
937
+ pshufd xm4, xm0, 00000001b
938
+ pshufd xm5, xm1, 00000001b
939
+ pshufd xm6, xm2, 00000001b
940
+ pshufd xm7, xm3, 00000001b
941
+
942
+ paddd xm0, xm4
943
+ paddd xm1, xm5
944
+ paddd xm2, xm6
945
+ paddd xm3, xm7
946
+
947
+ mov r0, r6mp
948
+ movd [r0 + 0], xm0
949
+ movd [r0 + 4], xm1
950
+ movd [r0 + 8], xm2
951
+ movd [r0 + 12], xm3
952
+%endmacro
953
+
954
+
955
+%macro PROCESS_SAD_X3_16x4_AVX512 0
956
+ movu ym6, [r0]
957
+ vinserti64x4 m6, [r0 + 2 * FENC_STRIDE], 1
958
+ movu ym3, [r1]
959
+ vinserti64x4 m3, [r1 + r4], 1
960
+ movu ym4, [r2]
961
+ vinserti64x4 m4, [r2 + r4], 1
962
+ movu ym5, [r3]
963
+ vinserti64x4 m5, [r3 + r4], 1
964
+
965
+ psubw m3, m6
966
+ psubw m4, m6
967
+ psubw m5, m6
968
+ pabsw m3, m3
969
+ pabsw m4, m4
970
+ pabsw m5, m5
971
+
972
+ pmaddwd m3, m7
973
+ paddd m0, m3
974
+ pmaddwd m4, m7
975
+ paddd m1, m4
976
+ pmaddwd m5, m7
977
+ paddd m2, m5
978
+
979
+ movu ym6, [r0 + 4 * FENC_STRIDE]
980
+ vinserti64x4 m6, [r0 + 6 * FENC_STRIDE], 1
981
+ movu ym3, [r1 + 2 * r4]
982
+ vinserti64x4 m3, [r1 + r6], 1
983
+ movu ym4, [r2 + 2 * r4]
984
+ vinserti64x4 m4, [r2 + r6], 1
985
+ movu ym5, [r3 + 2 * r4]
986
+ vinserti64x4 m5, [r3 + r6], 1
987
+
988
+ psubw m3, m6
989
+ psubw m4, m6
990
+ psubw m5, m6
991
+ pabsw m3, m3
992
+ pabsw m4, m4
993
+ pabsw m5, m5
994
+
995
+ pmaddwd m3, m7
996
+ paddd m0, m3
997
+ pmaddwd m4, m7
998
+ paddd m1, m4
999
+ pmaddwd m5, m7
1000
+ paddd m2, m5
1001
+%endmacro
1002
+
1003
+
1004
+%macro PROCESS_SAD_X3_32x4_AVX512 0
1005
+ movu m6, [r0]
1006
+ movu m3, [r1]
1007
+ movu m4, [r2]
1008
+ movu m5, [r3]
1009
+
1010
+
1011
+ psubw m3, m6
1012
+ psubw m4, m6
1013
+ psubw m5, m6
1014
+ pabsw m3, m3
1015
+ pabsw m4, m4
1016
+ pabsw m5, m5
1017
+
1018
+ pmaddwd m3, m7
1019
+ paddd m0, m3
1020
+ pmaddwd m4, m7
1021
+ paddd m1, m4
1022
+ pmaddwd m5, m7
1023
+ paddd m2, m5
1024
+
1025
+ movu m6, [r0 + 2 * FENC_STRIDE]
1026
+ movu m3, [r1 + r4]
1027
+ movu m4, [r2 + r4]
1028
+ movu m5, [r3 + r4]
1029
+
1030
+ psubw m3, m6
1031
+ psubw m4, m6
1032
+ psubw m5, m6
1033
+ pabsw m3, m3
1034
+ pabsw m4, m4
1035
+ pabsw m5, m5
1036
+
1037
+ pmaddwd m3, m7
1038
+ paddd m0, m3
1039
+ pmaddwd m4, m7
1040
+ paddd m1, m4
1041
+ pmaddwd m5, m7
1042
+ paddd m2, m5
1043
+
1044
+ movu m6, [r0 + 4 * FENC_STRIDE]
1045
+ movu m3, [r1 + 2 * r4]
1046
+ movu m4, [r2 + 2 * r4]
1047
+ movu m5, [r3 + 2 * r4]
1048
+
1049
+ psubw m3, m6
1050
+ psubw m4, m6
1051
+ psubw m5, m6
1052
+ pabsw m3, m3
1053
+ pabsw m4, m4
1054
+ pabsw m5, m5
1055
+
1056
+ pmaddwd m3, m7
1057
+ paddd m0, m3
1058
+ pmaddwd m4, m7
1059
+ paddd m1, m4
1060
+ pmaddwd m5, m7
1061
+ paddd m2, m5
1062
+
1063
+ movu m6, [r0 + 6 * FENC_STRIDE]
1064
+ movu m3, [r1 + r6]
1065
+ movu m4, [r2 + r6]
1066
+ movu m5, [r3 + r6]
1067
+
1068
+ psubw m3, m6
1069
+ psubw m4, m6
1070
+ psubw m5, m6
1071
+ pabsw m3, m3
1072
+ pabsw m4, m4
1073
+ pabsw m5, m5
1074
+
1075
+ pmaddwd m3, m7
1076
+ paddd m0, m3
1077
+ pmaddwd m4, m7
1078
+ paddd m1, m4
1079
+ pmaddwd m5, m7
1080
+ paddd m2, m5
1081
+%endmacro
1082
+
1083
+%macro PROCESS_SAD_X3_64x4_AVX512 0
1084
+ movu m6, [r0]
1085
+ movu m8, [r0 + mmsize]
1086
+ movu m3, [r1]
1087
+ movu m9, [r1 + mmsize]
1088
+ movu m4, [r2]
1089
+ movu m10, [r2 + mmsize]
1090
+ movu m5, [r3]
1091
+ movu m11, [r3 + mmsize]
1092
+
1093
+ psubw m3, m6
1094
+ psubw m9, m8
1095
+ psubw m4, m6
1096
+ psubw m10, m8
1097
+ psubw m5, m6
1098
+ psubw m11, m8
1099
+ pabsw m3, m3
1100
+ pabsw m4, m4
1101
+ pabsw m5, m5
1102
+ pabsw m9, m9
1103
+ pabsw m10, m10
1104
+ pabsw m11, m11
1105
+ paddw m3, m9
1106
+ paddw m4, m10
1107
+ paddw m5, m11
1108
+
1109
+ pmaddwd m3, m7
1110
+ paddd m0, m3
1111
+ pmaddwd m4, m7
1112
+ paddd m1, m4
1113
+ pmaddwd m5, m7
1114
+ paddd m2, m5
1115
+
1116
+ movu m6, [r0 + 2 * FENC_STRIDE]
1117
+ movu m8, [r0 + 2 * FENC_STRIDE + mmsize]
1118
+ movu m3, [r1 + r4]
1119
+ movu m9, [r1 + r4 + mmsize]
1120
+ movu m4, [r2 + r4]
1121
+ movu m10, [r2 + r4 + mmsize]
1122
+ movu m5, [r3 + r4]
1123
+ movu m11, [r3 + r4 + mmsize]
1124
+
1125
+ psubw m3, m6
1126
+ psubw m9, m8
1127
+ psubw m4, m6
1128
+ psubw m10, m8
1129
+ psubw m5, m6
1130
+ psubw m11, m8
1131
+ pabsw m3, m3
1132
+ pabsw m4, m4
1133
+ pabsw m5, m5
1134
+ pabsw m9, m9
1135
+ pabsw m10, m10
1136
+ pabsw m11, m11
1137
+ paddw m3, m9
1138
+ paddw m4, m10
1139
+ paddw m5, m11
1140
+
1141
+ pmaddwd m3, m7
1142
+ paddd m0, m3
1143
+ pmaddwd m4, m7
1144
+ paddd m1, m4
1145
+ pmaddwd m5, m7
1146
+ paddd m2, m5
1147
+
1148
+ movu m6, [r0 + 4 * FENC_STRIDE]
1149
+ movu m8, [r0 + 4 * FENC_STRIDE + mmsize]
1150
+ movu m3, [r1 + 2 * r4]
1151
+ movu m9, [r1 + 2 * r4 + mmsize]
1152
+ movu m4, [r2 + 2 * r4]
1153
+ movu m10, [r2 + 2 * r4 + mmsize]
1154
+ movu m5, [r3 + 2 * r4]
1155
+ movu m11, [r3 + 2 * r4 + mmsize]
1156
+
1157
+ psubw m3, m6
1158
+ psubw m9, m8
1159
+ psubw m4, m6
1160
+ psubw m10, m8
1161
+ psubw m5, m6
1162
+ psubw m11, m8
1163
+ pabsw m3, m3
1164
+ pabsw m4, m4
1165
+ pabsw m5, m5
1166
+ pabsw m9, m9
1167
+ pabsw m10, m10
1168
+ pabsw m11, m11
1169
+ paddw m3, m9
1170
+ paddw m4, m10
1171
+ paddw m5, m11
1172
+
1173
+ pmaddwd m3, m7
1174
+ paddd m0, m3
1175
+ pmaddwd m4, m7
1176
+ paddd m1, m4
1177
+ pmaddwd m5, m7
1178
+ paddd m2, m5
1179
+
1180
+ movu m6, [r0 + 6 * FENC_STRIDE]
1181
+ movu m8, [r0 + 6 * FENC_STRIDE + mmsize]
1182
+ movu m3, [r1 + r6]
1183
+ movu m9, [r1 + r6 + mmsize]
1184
+ movu m4, [r2 + r6]
1185
+ movu m10, [r2 + r6 + mmsize]
1186
+ movu m5, [r3 + r6]
1187
+ movu m11, [r3 + r6 + mmsize]
1188
+
1189
+ psubw m3, m6
1190
+ psubw m9, m8
1191
+ psubw m4, m6
1192
+ psubw m10, m8
1193
+ psubw m5, m6
1194
+ psubw m11, m8
1195
+ pabsw m3, m3
1196
+ pabsw m4, m4
1197
+ pabsw m5, m5
1198
+ pabsw m9, m9
1199
+ pabsw m10, m10
1200
+ pabsw m11, m11
1201
+ paddw m3, m9
1202
+ paddw m4, m10
1203
+ paddw m5, m11
1204
+
1205
+ pmaddwd m3, m7
1206
+ paddd m0, m3
1207
+ pmaddwd m4, m7
1208
+ paddd m1, m4
1209
+ pmaddwd m5, m7
1210
+ paddd m2, m5
1211
+%endmacro
1212
+
1213
+%macro PROCESS_SAD_X3_END_AVX512 0
1214
+ vextracti32x8 ym3, m0, 1
1215
+ vextracti32x8 ym4, m1, 1
1216
+ vextracti32x8 ym5, m2, 1
1217
+
1218
+ paddd ym0, ym3
1219
+ paddd ym1, ym4
1220
+ paddd ym2, ym5
1221
+
1222
+ vextracti64x2 xm3, m0, 1
1223
+ vextracti64x2 xm4, m1, 1
1224
+ vextracti64x2 xm5, m2, 1
1225
+
1226
+ paddd xm0, xm3
1227
+ paddd xm1, xm4
1228
+ paddd xm2, xm5
1229
+
1230
+ pshufd xm3, xm0, 00001110b
1231
+ pshufd xm4, xm1, 00001110b
1232
+ pshufd xm5, xm2, 00001110b
1233
+
1234
+ paddd xm0, xm3
1235
+ paddd xm1, xm4
1236
+ paddd xm2, xm5
1237
+
1238
+ pshufd xm3, xm0, 00000001b
1239
+ pshufd xm4, xm1, 00000001b
1240
+ pshufd xm5, xm2, 00000001b
1241
+
1242
+ paddd xm0, xm3
1243
+ paddd xm1, xm4
1244
+ paddd xm2, xm5
1245
+
1246
+ %if UNIX64
1247
+ movd [r5 + 0], xm0
1248
+ movd [r5 + 4], xm1
1249
+ movd [r5 + 8], xm2
1250
+ %else
1251
+ mov r0, r5mp
1252
+ movd [r0 + 0], xm0
1253
+ movd [r0 + 4], xm1
1254
+ movd [r0 + 8], xm2
1255
+%endif
1256
+%endmacro
1257
+
1258
+
1259
+;------------------------------------------------------------------------------------------------------------------------------------------
1260
+; void pixel_sad_x3_16x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res )
1261
+;------------------------------------------------------------------------------------------------------------------------------------------
1262
+%if ARCH_X86_64
1263
INIT_ZMM avx512
; SAD of one 16x8 fenc block (16-bit pixels) against three candidate
; reference blocks; the three sums are written to the int32_t* res array.
cglobal pixel_sad_x3_16x8, 6,7,8
    pxor            m0, m0              ; SAD accumulator for ref0 (r1)
    pxor            m1, m1              ; SAD accumulator for ref1 (r2)
    pxor            m2, m2              ; SAD accumulator for ref2 (r3)

    vbroadcasti32x8 m7, [pw_1]          ; word ones: pmaddwd folds words->dwords

    add             r4d, r4d            ; frefstride in bytes (pixels are 16-bit)
    lea             r6d, [r4 * 3]       ; 3 * stride, used inside the 16x4 macro

    PROCESS_SAD_X3_16x4_AVX512          ; rows 0-3
    add             r0, FENC_STRIDE * 8 ; advance fenc by 4 rows (16-bit pixels)
    lea             r1, [r1 + r4 * 4]   ; advance each reference by 4 rows
    lea             r2, [r2 + r4 * 4]
    lea             r3, [r3 + r4 * 4]
    PROCESS_SAD_X3_16x4_AVX512          ; rows 4-7
    PROCESS_SAD_X3_END_AVX512           ; reduce m0..m2 and store to res
    RET
1282
+
1283
INIT_ZMM avx512
; SAD of one 16x12 fenc block (16-bit pixels) against three references.
cglobal pixel_sad_x3_16x12, 6,7,8
    pxor            m0, m0              ; per-ref SAD accumulators
    pxor            m1, m1
    pxor            m2, m2

    vbroadcasti32x8 m7, [pw_1]          ; for pmaddwd word->dword reduction

    add             r4d, r4d            ; stride in bytes
    lea             r6d, [r4 * 3]
    ; 12 rows: two 4-row chunks with a pointer advance, then a final chunk.
    %rep 2
    PROCESS_SAD_X3_16x4_AVX512
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r4 * 4]
    lea             r2, [r2 + r4 * 4]
    lea             r3, [r3 + r4 * 4]
    %endrep
    PROCESS_SAD_X3_16x4_AVX512
    PROCESS_SAD_X3_END_AVX512           ; reduce m0..m2 and store to res
    RET
1303
+
1304
INIT_ZMM avx512
; SAD of one 16x16 fenc block (16-bit pixels) against three references.
cglobal pixel_sad_x3_16x16, 6,7,8
    pxor            m0, m0              ; per-ref SAD accumulators
    pxor            m1, m1
    pxor            m2, m2

    vbroadcasti32x8 m7, [pw_1]          ; for pmaddwd word->dword reduction

    add             r4d, r4d            ; stride in bytes
    lea             r6d, [r4 * 3]

    ; 16 rows: three 4-row chunks with a pointer advance, then a final chunk.
    %rep 3
    PROCESS_SAD_X3_16x4_AVX512
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r4 * 4]
    lea             r2, [r2 + r4 * 4]
    lea             r3, [r3 + r4 * 4]
    %endrep
    PROCESS_SAD_X3_16x4_AVX512
    PROCESS_SAD_X3_END_AVX512           ; reduce m0..m2 and store to res
    RET
1325
+
1326
INIT_ZMM avx512
; SAD of one 16x32 fenc block (16-bit pixels) against three references.
cglobal pixel_sad_x3_16x32, 6,7,8
    pxor            m0, m0              ; per-ref SAD accumulators
    pxor            m1, m1
    pxor            m2, m2

    vbroadcasti32x8 m7, [pw_1]          ; for pmaddwd word->dword reduction

    add             r4d, r4d            ; stride in bytes
    lea             r6d, [r4 * 3]

    ; 32 rows: seven 4-row chunks with a pointer advance, then a final chunk.
    %rep 7
    PROCESS_SAD_X3_16x4_AVX512
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r4 * 4]
    lea             r2, [r2 + r4 * 4]
    lea             r3, [r3 + r4 * 4]
    %endrep
    PROCESS_SAD_X3_16x4_AVX512
    PROCESS_SAD_X3_END_AVX512           ; reduce m0..m2 and store to res
    RET
1347
+
1348
INIT_ZMM avx512
; SAD of one 16x64 fenc block (16-bit pixels) against three references.
cglobal pixel_sad_x3_16x64, 6,7,8
    pxor            m0, m0              ; per-ref SAD accumulators
    pxor            m1, m1
    pxor            m2, m2

    vbroadcasti32x8 m7, [pw_1]          ; for pmaddwd word->dword reduction

    add             r4d, r4d            ; stride in bytes
    lea             r6d, [r4 * 3]

    ; 64 rows: fifteen 4-row chunks with a pointer advance, then a final chunk.
    %rep 15
    PROCESS_SAD_X3_16x4_AVX512
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r4 * 4]
    lea             r2, [r2 + r4 * 4]
    lea             r3, [r3 + r4 * 4]
    %endrep
    PROCESS_SAD_X3_16x4_AVX512
    PROCESS_SAD_X3_END_AVX512           ; reduce m0..m2 and store to res
    RET
1369
+%endif
1370
+
1371
+;------------------------------------------------------------------------------------------------------------------------------------------
1372
+; void pixel_sad_x3_32x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res )
1373
+;------------------------------------------------------------------------------------------------------------------------------------------
1374
+%if ARCH_X86_64
1375
INIT_ZMM avx512
; SAD of one 32x8 fenc block (16-bit pixels) against three references.
cglobal pixel_sad_x3_32x8, 6,7,8
    pxor            m0, m0              ; per-ref SAD accumulators
    pxor            m1, m1
    pxor            m2, m2

    vbroadcasti32x8 m7, [pw_1]          ; for pmaddwd word->dword reduction

    add             r4d, r4d            ; stride in bytes
    lea             r6d, [r4 * 3]

    PROCESS_SAD_X3_32x4_AVX512          ; rows 0-3
    add             r0, FENC_STRIDE * 8 ; advance fenc by 4 rows
    lea             r1, [r1 + r4 * 4]   ; advance each reference by 4 rows
    lea             r2, [r2 + r4 * 4]
    lea             r3, [r3 + r4 * 4]
    PROCESS_SAD_X3_32x4_AVX512          ; rows 4-7
    PROCESS_SAD_X3_END_AVX512           ; reduce m0..m2 and store to res
    RET
1394
+
1395
+
1396
INIT_ZMM avx512
; SAD of one 32x16 fenc block (16-bit pixels) against three references.
cglobal pixel_sad_x3_32x16, 6,7,8
    pxor            m0, m0              ; per-ref SAD accumulators
    pxor            m1, m1
    pxor            m2, m2

    vbroadcasti32x8 m7, [pw_1]          ; for pmaddwd word->dword reduction

    add             r4d, r4d            ; stride in bytes
    lea             r6d, [r4 * 3]

    ; 16 rows = four 4-row chunks; advance pointers between chunks only.
%rep 3
    PROCESS_SAD_X3_32x4_AVX512
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r4 * 4]
    lea             r2, [r2 + r4 * 4]
    lea             r3, [r3 + r4 * 4]
%endrep
    PROCESS_SAD_X3_32x4_AVX512
    PROCESS_SAD_X3_END_AVX512           ; reduce m0..m2 and store to res
    RET
1425
+
1426
INIT_ZMM avx512
; SAD of one 32x24 fenc block (16-bit pixels) against three references.
cglobal pixel_sad_x3_32x24, 6,7,8
    pxor            m0, m0              ; per-ref SAD accumulators
    pxor            m1, m1
    pxor            m2, m2

    vbroadcasti32x8 m7, [pw_1]          ; for pmaddwd word->dword reduction

    add             r4d, r4d            ; stride in bytes
    lea             r6d, [r4 * 3]

    ; 24 rows = six 4-row chunks; advance pointers between chunks only.
%rep 5
    PROCESS_SAD_X3_32x4_AVX512
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r4 * 4]
    lea             r2, [r2 + r4 * 4]
    lea             r3, [r3 + r4 * 4]
%endrep
    PROCESS_SAD_X3_32x4_AVX512
    PROCESS_SAD_X3_END_AVX512           ; reduce m0..m2 and store to res
    RET
1465
+
1466
+
1467
INIT_ZMM avx512
; SAD of one 32x32 fenc block (16-bit pixels) against three references.
cglobal pixel_sad_x3_32x32, 6,7,8
    pxor            m0, m0              ; per-ref SAD accumulators
    pxor            m1, m1
    pxor            m2, m2

    vbroadcasti32x8 m7, [pw_1]          ; for pmaddwd word->dword reduction

    add             r4d, r4d            ; stride in bytes
    lea             r6d, [r4 * 3]

    ; 32 rows = eight 4-row chunks; advance pointers between chunks only.
%rep 7
    PROCESS_SAD_X3_32x4_AVX512
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r4 * 4]
    lea             r2, [r2 + r4 * 4]
    lea             r3, [r3 + r4 * 4]
%endrep
    PROCESS_SAD_X3_32x4_AVX512
    PROCESS_SAD_X3_END_AVX512           ; reduce m0..m2 and store to res
    RET
1516
+
1517
INIT_ZMM avx512
; SAD of one 32x64 fenc block (16-bit pixels) against three references.
cglobal pixel_sad_x3_32x64, 6,7,8
    pxor            m0, m0              ; per-ref SAD accumulators
    pxor            m1, m1
    pxor            m2, m2

    vbroadcasti32x8 m7, [pw_1]          ; for pmaddwd word->dword reduction

    add             r4d, r4d            ; stride in bytes
    lea             r6d, [r4 * 3]

    ; 64 rows = sixteen 4-row chunks; advance pointers between chunks only.
%rep 15
    PROCESS_SAD_X3_32x4_AVX512
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r4 * 4]
    lea             r2, [r2 + r4 * 4]
    lea             r3, [r3 + r4 * 4]
%endrep
    PROCESS_SAD_X3_32x4_AVX512
    PROCESS_SAD_X3_END_AVX512           ; reduce m0..m2 and store to res
    RET
1606
+
1607
;----------------------------------------------------------------------------------------------------------------------------------------
; void pixel_sad_x3_48x64( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res )
;----------------------------------------------------------------------------------------------------------------------------------------
1610
INIT_ZMM avx512
; SAD of one 48x64 fenc block (16-bit pixels) against three references.
; Each 48-pixel row is 96 bytes: one full zmm load plus a ymm tail; the
; tails of two consecutive rows are packed into a single zmm (m6/m9/m12/m15)
; so each loop iteration handles 4 rows.
; NOTE(review): the PROLOGUE declares only 4 loaded args, yet r4 (frefstride)
; and r5 (res, via r5mp on Win64) are used — presumably relying on x86inc /
; SysV argument placement; confirm against x86inc.asm semantics.
cglobal pixel_sad_x3_48x64, 4, 8, 17
    pxor            m0, m0              ; SAD accumulator for ref0 (r1)
    pxor            m1, m1              ; SAD accumulator for ref1 (r2)
    pxor            m2, m2              ; SAD accumulator for ref2 (r3)
    mov             r7d, 64/4           ; loop counter: 64 rows, 4 per iteration
    vbroadcasti32x8 m16, [pw_1]         ; for pmaddwd word->dword reduction

    add             r4d, r4d            ; stride in bytes (16-bit pixels)
    lea             r6d, [r4 * 3]       ; 3 * stride
.loop:
    ; --- rows 0 and 1 ---
    movu            m4, [r0]                                    ; fenc row 0, first 32 pixels
    movu            m5, [r0 + 2 * FENC_STRIDE]                  ; fenc row 1, first 32 pixels
    movu            ym6, [r0 + mmsize]                          ; fenc row 0, last 16 pixels
    vinserti32x8    m6, [r0 + 2 * FENC_STRIDE + mmsize], 1      ; + row 1 tail in upper half
    movu            m7, [r1]
    movu            m8, [r1 + r4]
    movu            ym9, [r1 + mmsize]
    vinserti32x8    m9, [r1 + r4 + mmsize], 1
    movu            m10, [r2]
    movu            m11, [r2 + r4]
    movu            ym12, [r2 + mmsize]
    vinserti32x8    m12, [r2 + r4 + mmsize], 1
    movu            m13, [r3]
    movu            m14, [r3 + r4]
    movu            ym15, [r3 + mmsize]
    vinserti32x8    m15, [r3 + r4 + mmsize], 1

    ; absolute differences ref - fenc
    psubw           m7, m4
    psubw           m8, m5
    psubw           m9, m6
    psubw           m10, m4
    psubw           m11, m5
    psubw           m12, m6
    psubw           m13, m4
    psubw           m14, m5
    psubw           m15, m6

    pabsw           m7, m7
    pabsw           m8, m8
    pabsw           m9, m9
    pabsw           m10, m10
    pabsw           m11, m11
    pabsw           m12, m12
    pabsw           m13, m13
    pabsw           m14, m14
    pabsw           m15, m15

    ; sum the word differences per reference
    paddw           m7, m8
    paddw           m7, m9
    paddw           m10, m11
    paddw           m10, m12
    paddw           m13, m14
    paddw           m13, m15

    ; widen to dwords and accumulate into m0/m1/m2
    pmaddwd         m7, m16
    paddd           m0, m7
    pmaddwd         m10, m16
    paddd           m1, m10
    pmaddwd         m13, m16
    paddd           m2, m13

    ; --- rows 2 and 3 ---
    movu            m4, [r0 + 4 * FENC_STRIDE]
    movu            m5, [r0 + 6 * FENC_STRIDE]
    movu            ym6, [r0 + 4 * FENC_STRIDE + mmsize]
    vinserti32x8    m6, [r0 + 6 * FENC_STRIDE + mmsize], 1
    movu            m7, [r1 + 2 * r4]
    movu            m8, [r1 + r6]
    movu            ym9, [r1 + 2 * r4 + mmsize]
    vinserti32x8    m9, [r1 + r6 + mmsize], 1
    movu            m10, [r2 + 2 * r4]
    movu            m11, [r2 + r6]
    movu            ym12, [r2 + 2 * r4 + mmsize]
    vinserti32x8    m12, [r2 + r6 + mmsize], 1
    movu            m13, [r3 + 2 * r4]
    movu            m14, [r3 + r6]
    movu            ym15, [r3 + 2 * r4 + mmsize]
    vinserti32x8    m15, [r3 + r6 + mmsize], 1

    psubw           m7, m4
    psubw           m8, m5
    psubw           m9, m6
    psubw           m10, m4
    psubw           m11, m5
    psubw           m12, m6
    psubw           m13, m4
    psubw           m14, m5
    psubw           m15, m6

    pabsw           m7, m7
    pabsw           m8, m8
    pabsw           m9, m9
    pabsw           m10, m10
    pabsw           m11, m11
    pabsw           m12, m12
    pabsw           m13, m13
    pabsw           m14, m14
    pabsw           m15, m15

    paddw           m7, m8
    paddw           m7, m9
    paddw           m10, m11
    paddw           m10, m12
    paddw           m13, m14
    paddw           m13, m15

    pmaddwd         m7, m16
    paddd           m0, m7
    pmaddwd         m10, m16
    paddd           m1, m10
    pmaddwd         m13, m16
    paddd           m2, m13

    ; advance all pointers by 4 rows
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r4 * 4]
    lea             r2, [r2 + r4 * 4]
    lea             r3, [r3 + r4 * 4]

    dec             r7d
    jg              .loop

    PROCESS_SAD_X3_END_AVX512           ; reduce m0..m2 and store to res
    RET
1733
+%endif
1734
+
1735
+;------------------------------------------------------------------------------------------------------------------------------------------
1736
+; void pixel_sad_x3_64x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res )
1737
+;------------------------------------------------------------------------------------------------------------------------------------------
1738
+%if ARCH_X86_64
1739
INIT_ZMM avx512
; SAD of one 64x16 fenc block (16-bit pixels) against three references.
cglobal pixel_sad_x3_64x16, 6,7,12
    pxor            m0, m0              ; per-ref SAD accumulators
    pxor            m1, m1
    pxor            m2, m2

    vbroadcasti32x8 m7, [pw_1]          ; for pmaddwd word->dword reduction

    add             r4d, r4d            ; stride in bytes
    lea             r6d, [r4 * 3]

    ; 16 rows = four 4-row chunks; advance pointers between chunks only.
%rep 3
    PROCESS_SAD_X3_64x4_AVX512
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r4 * 4]
    lea             r2, [r2 + r4 * 4]
    lea             r3, [r3 + r4 * 4]
%endrep
    PROCESS_SAD_X3_64x4_AVX512
    PROCESS_SAD_X3_END_AVX512           ; reduce m0..m2 and store to res
    RET
1768
+
1769
INIT_ZMM avx512
; SAD of one 64x32 fenc block (16-bit pixels) against three references.
cglobal pixel_sad_x3_64x32, 6,7,12
    pxor            m0, m0              ; per-ref SAD accumulators
    pxor            m1, m1
    pxor            m2, m2

    vbroadcasti32x8 m7, [pw_1]          ; for pmaddwd word->dword reduction

    add             r4d, r4d            ; stride in bytes
    lea             r6d, [r4 * 3]

    ; 32 rows = eight 4-row chunks; advance pointers between chunks only.
%rep 7
    PROCESS_SAD_X3_64x4_AVX512
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r4 * 4]
    lea             r2, [r2 + r4 * 4]
    lea             r3, [r3 + r4 * 4]
%endrep
    PROCESS_SAD_X3_64x4_AVX512
    PROCESS_SAD_X3_END_AVX512           ; reduce m0..m2 and store to res
    RET
1818
+
1819
INIT_ZMM avx512
; SAD of one 64x48 fenc block (16-bit pixels) against three references.
cglobal pixel_sad_x3_64x48, 6,7,12
    pxor            m0, m0              ; per-ref SAD accumulators
    pxor            m1, m1
    pxor            m2, m2

    vbroadcasti32x8 m7, [pw_1]          ; for pmaddwd word->dword reduction

    add             r4d, r4d            ; stride in bytes
    lea             r6d, [r4 * 3]

    ; 48 rows = twelve 4-row chunks; advance pointers between chunks only.
%rep 11
    PROCESS_SAD_X3_64x4_AVX512
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r4 * 4]
    lea             r2, [r2 + r4 * 4]
    lea             r3, [r3 + r4 * 4]
%endrep
    PROCESS_SAD_X3_64x4_AVX512
    PROCESS_SAD_X3_END_AVX512           ; reduce m0..m2 and store to res
    RET
1888
+
1889
INIT_ZMM avx512
; SAD of one 64x64 fenc block (16-bit pixels) against three references.
cglobal pixel_sad_x3_64x64, 6,7,12
    pxor            m0, m0              ; per-ref SAD accumulators
    pxor            m1, m1
    pxor            m2, m2

    vbroadcasti32x8 m7, [pw_1]          ; for pmaddwd word->dword reduction

    add             r4d, r4d            ; stride in bytes
    lea             r6d, [r4 * 3]

    ; 64 rows = sixteen 4-row chunks; advance pointers between chunks only.
%rep 15
    PROCESS_SAD_X3_64x4_AVX512
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r4 * 4]
    lea             r2, [r2 + r4 * 4]
    lea             r3, [r3 + r4 * 4]
%endrep
    PROCESS_SAD_X3_64x4_AVX512
    PROCESS_SAD_X3_END_AVX512           ; reduce m0..m2 and store to res
    RET
1978
+%endif
1979
+
1980
+;------------------------------------------------------------------------------------------------------------------------------------------------------------
1981
+; void pixel_sad_x4_16x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res )
1982
+;------------------------------------------------------------------------------------------------------------------------------------------------------------
1983
+%if ARCH_X86_64
1984
INIT_ZMM avx512
; SAD of one 16x8 fenc block (16-bit pixels) against four candidate
; reference blocks (r1..r4); stride is r5, res pointer is the 7th argument.
cglobal pixel_sad_x4_16x8, 6,8,10
    pxor            m0, m0              ; SAD accumulators for ref0..ref3
    pxor            m1, m1
    pxor            m2, m2
    pxor            m3, m3

    vbroadcasti32x8 m9, [pw_1]          ; for pmaddwd word->dword reduction

    add             r5d, r5d            ; frefstride in bytes (16-bit pixels)
    lea             r7d, [r5 * 3]

    PROCESS_SAD_X4_16x4_AVX512          ; rows 0-3
    add             r0, FENC_STRIDE * 8 ; advance fenc by 4 rows
    lea             r1, [r1 + r5 * 4]   ; advance each reference by 4 rows
    lea             r2, [r2 + r5 * 4]
    lea             r3, [r3 + r5 * 4]
    lea             r4, [r4 + r5 * 4]
    PROCESS_SAD_X4_16x4_AVX512          ; rows 4-7
    PROCESS_SAD_X4_END_AVX512           ; reduce m0..m3 and store to res
    RET
2005
+
2006
INIT_ZMM avx512
; SAD of one 16x12 fenc block (16-bit pixels) against four references.
cglobal pixel_sad_x4_16x12, 6,8,10
    pxor            m0, m0              ; SAD accumulators for ref0..ref3
    pxor            m1, m1
    pxor            m2, m2
    pxor            m3, m3

    vbroadcasti32x8 m9, [pw_1]          ; for pmaddwd word->dword reduction

    add             r5d, r5d            ; stride in bytes
    lea             r7d, [r5 * 3]

    ; 12 rows: two 4-row chunks with a pointer advance, then a final chunk.
    %rep 2
    PROCESS_SAD_X4_16x4_AVX512
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r5 * 4]
    lea             r2, [r2 + r5 * 4]
    lea             r3, [r3 + r5 * 4]
    lea             r4, [r4 + r5 * 4]
    %endrep
    PROCESS_SAD_X4_16x4_AVX512
    PROCESS_SAD_X4_END_AVX512           ; reduce m0..m3 and store to res
    RET
2029
+
2030
INIT_ZMM avx512
; SAD of one 16x16 fenc block (16-bit pixels) against four references.
cglobal pixel_sad_x4_16x16, 6,8,10
    pxor            m0, m0              ; SAD accumulators for ref0..ref3
    pxor            m1, m1
    pxor            m2, m2
    pxor            m3, m3

    vbroadcasti32x8 m9, [pw_1]          ; for pmaddwd word->dword reduction

    add             r5d, r5d            ; stride in bytes
    lea             r7d, [r5 * 3]

    ; 16 rows: three 4-row chunks with a pointer advance, then a final chunk.
    %rep 3
    PROCESS_SAD_X4_16x4_AVX512
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r5 * 4]
    lea             r2, [r2 + r5 * 4]
    lea             r3, [r3 + r5 * 4]
    lea             r4, [r4 + r5 * 4]
    %endrep
    PROCESS_SAD_X4_16x4_AVX512
    PROCESS_SAD_X4_END_AVX512           ; reduce m0..m3 and store to res
    RET
2053
+
2054
INIT_ZMM avx512
; SAD of one 16x32 fenc block (16-bit pixels) against four references.
cglobal pixel_sad_x4_16x32, 6,8,10
    pxor            m0, m0              ; SAD accumulators for ref0..ref3
    pxor            m1, m1
    pxor            m2, m2
    pxor            m3, m3

    vbroadcasti32x8 m9, [pw_1]          ; for pmaddwd word->dword reduction

    add             r5d, r5d            ; stride in bytes
    lea             r7d, [r5 * 3]

    ; 32 rows: seven 4-row chunks with a pointer advance, then a final chunk.
    %rep 7
    PROCESS_SAD_X4_16x4_AVX512
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r5 * 4]
    lea             r2, [r2 + r5 * 4]
    lea             r3, [r3 + r5 * 4]
    lea             r4, [r4 + r5 * 4]
    %endrep
    PROCESS_SAD_X4_16x4_AVX512
    PROCESS_SAD_X4_END_AVX512           ; reduce m0..m3 and store to res
    RET
2077
+
2078
INIT_ZMM avx512
; SAD of one 16x64 fenc block (16-bit pixels) against four references.
cglobal pixel_sad_x4_16x64, 6,8,10
    pxor            m0, m0              ; SAD accumulators for ref0..ref3
    pxor            m1, m1
    pxor            m2, m2
    pxor            m3, m3

    vbroadcasti32x8 m9, [pw_1]          ; for pmaddwd word->dword reduction

    add             r5d, r5d            ; stride in bytes
    lea             r7d, [r5 * 3]

    ; 64 rows: fifteen 4-row chunks with a pointer advance, then a final chunk.
    %rep 15
    PROCESS_SAD_X4_16x4_AVX512
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r5 * 4]
    lea             r2, [r2 + r5 * 4]
    lea             r3, [r3 + r5 * 4]
    lea             r4, [r4 + r5 * 4]
    %endrep
    PROCESS_SAD_X4_16x4_AVX512
    PROCESS_SAD_X4_END_AVX512           ; reduce m0..m3 and store to res
    RET
2101
+%endif
2102
+
2103
+;------------------------------------------------------------------------------------------------------------------------------------------------------------
2104
+; void pixel_sad_x4_32x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res )
2105
+;------------------------------------------------------------------------------------------------------------------------------------------------------------
2106
+%if ARCH_X86_64
2107
INIT_ZMM avx512
; SAD of one 32x8 fenc block (16-bit pixels) against four references.
cglobal pixel_sad_x4_32x8, 6,8,10
    pxor            m0, m0              ; SAD accumulators for ref0..ref3
    pxor            m1, m1
    pxor            m2, m2
    pxor            m3, m3

    vbroadcasti32x8 m9, [pw_1]          ; for pmaddwd word->dword reduction

    add             r5d, r5d            ; stride in bytes
    lea             r7d, [r5 * 3]

    PROCESS_SAD_X4_32x4_AVX512          ; rows 0-3
    add             r0, FENC_STRIDE * 8 ; advance fenc by 4 rows
    lea             r1, [r1 + r5 * 4]   ; advance each reference by 4 rows
    lea             r2, [r2 + r5 * 4]
    lea             r3, [r3 + r5 * 4]
    lea             r4, [r4 + r5 * 4]
    PROCESS_SAD_X4_32x4_AVX512          ; rows 4-7
    PROCESS_SAD_X4_END_AVX512           ; reduce m0..m3 and store to res
    RET
2128
+
2129
INIT_ZMM avx512
; SAD of one 32x16 fenc block (16-bit pixels) against four references.
cglobal pixel_sad_x4_32x16, 6,8,10
    pxor            m0, m0              ; SAD accumulators for ref0..ref3
    pxor            m1, m1
    pxor            m2, m2
    pxor            m3, m3

    vbroadcasti32x8 m9, [pw_1]          ; for pmaddwd word->dword reduction

    add             r5d, r5d            ; stride in bytes
    lea             r7d, [r5 * 3]

    ; 16 rows = four 4-row chunks; advance pointers between chunks only.
%rep 3
    PROCESS_SAD_X4_32x4_AVX512
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r5 * 4]
    lea             r2, [r2 + r5 * 4]
    lea             r3, [r3 + r5 * 4]
    lea             r4, [r4 + r5 * 4]
%endrep
    PROCESS_SAD_X4_32x4_AVX512
    PROCESS_SAD_X4_END_AVX512           ; reduce m0..m3 and store to res
    RET
2162
+
2163
INIT_ZMM avx512
; SAD of one 32x24 fenc block (16-bit pixels) against four references.
cglobal pixel_sad_x4_32x24, 6,8,10
    pxor            m0, m0              ; SAD accumulators for ref0..ref3
    pxor            m1, m1
    pxor            m2, m2
    pxor            m3, m3

    vbroadcasti32x8 m9, [pw_1]          ; for pmaddwd word->dword reduction

    add             r5d, r5d            ; stride in bytes
    lea             r7d, [r5 * 3]

    ; 24 rows = six 4-row chunks; advance pointers between chunks only.
%rep 5
    PROCESS_SAD_X4_32x4_AVX512
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r5 * 4]
    lea             r2, [r2 + r5 * 4]
    lea             r3, [r3 + r5 * 4]
    lea             r4, [r4 + r5 * 4]
%endrep
    PROCESS_SAD_X4_32x4_AVX512
    PROCESS_SAD_X4_END_AVX512           ; reduce m0..m3 and store to res
    RET
2208
+
2209
+
2210
INIT_ZMM avx512
; SAD of one 32x32 fenc block (16-bit pixels) against four references.
cglobal pixel_sad_x4_32x32, 6,8,10
    pxor            m0, m0              ; SAD accumulators for ref0..ref3
    pxor            m1, m1
    pxor            m2, m2
    pxor            m3, m3

    vbroadcasti32x8 m9, [pw_1]          ; for pmaddwd word->dword reduction

    add             r5d, r5d            ; stride in bytes
    lea             r7d, [r5 * 3]

    ; 32 rows = eight 4-row chunks; advance pointers between chunks only.
%rep 7
    PROCESS_SAD_X4_32x4_AVX512
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r5 * 4]
    lea             r2, [r2 + r5 * 4]
    lea             r3, [r3 + r5 * 4]
    lea             r4, [r4 + r5 * 4]
%endrep
    PROCESS_SAD_X4_32x4_AVX512
    PROCESS_SAD_X4_END_AVX512           ; reduce m0..m3 and store to res
    RET
2267
+
2268
INIT_ZMM avx512
; SAD of one 32x64 fenc block (16-bit pixels) against four references.
cglobal pixel_sad_x4_32x64, 6,8,10
    pxor            m0, m0              ; SAD accumulators for ref0..ref3
    pxor            m1, m1
    pxor            m2, m2
    pxor            m3, m3

    vbroadcasti32x8 m9, [pw_1]          ; for pmaddwd word->dword reduction

    add             r5d, r5d            ; stride in bytes
    lea             r7d, [r5 * 3]

    ; 64 rows = sixteen 4-row chunks; advance pointers between chunks only.
%rep 15
    PROCESS_SAD_X4_32x4_AVX512
    add             r0, FENC_STRIDE * 8
    lea             r1, [r1 + r5 * 4]
    lea             r2, [r2 + r5 * 4]
    lea             r3, [r3 + r5 * 4]
    lea             r4, [r4 + r5 * 4]
%endrep
    PROCESS_SAD_X4_32x4_AVX512
    PROCESS_SAD_X4_END_AVX512           ; reduce m0..m3 and store to res
    RET
2373
+%endif
2374
+;------------------------------------------------------------------------------------------------------------------------------------------------------------
2375
+; void pixel_sad_x4_48x64( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res )
2376
+;------------------------------------------------------------------------------------------------------------------------------------------------------------
2377
+%if ARCH_X86_64
2378
+INIT_ZMM avx512
2379
+cglobal pixel_sad_x4_48x64, 4, 9, 20
2380
+ pxor m0, m0
2381
+ pxor m1, m1
2382
+ pxor m2, m2
2383
+ pxor m3, m3
2384
+ mov r8d, 64/4
2385
+
2386
+ vbroadcasti32x8 m19, [pw_1]
2387
+
2388
+ add r5d, r5d
2389
+ lea r7d, [r5 * 3]
2390
+.loop:
2391
+ movu m4, [r0]
2392
+ movu m5, [r0 + 2 * FENC_STRIDE]
2393
+ movu ym6, [r0 + mmsize]
2394
+ vinserti32x8 m6, [r0 + 2 * FENC_STRIDE + mmsize], 1
2395
+ movu m7, [r1]
2396
+ movu m8, [r1 + r5]
2397
+ movu ym9, [r1 + mmsize]
2398
+ vinserti32x8 m9, [r1 + r5 + mmsize], 1
2399
+ movu m10, [r2]
2400
+ movu m11, [r2 + r5]
2401
+ movu ym12, [r2 + mmsize]
2402
+ vinserti32x8 m12, [r2 + r5 + mmsize], 1
2403
+ movu m13, [r3]
2404
+ movu m14, [r3 + r5]
2405
+ movu ym15, [r3 + mmsize]
2406
+ vinserti32x8 m15, [r3 + r5 + mmsize], 1
2407
+ movu m16, [r4]
2408
+ movu m17, [r4 + r5]
2409
+ movu ym18, [r4 + mmsize]
2410
+ vinserti32x8 m18, [r4 + r5 + mmsize], 1
2411
+
2412
+ psubw m7, m4
2413
+ psubw m8, m5
2414
+ psubw m9, m6
2415
+ psubw m10, m4
2416
+ psubw m11, m5
2417
+ psubw m12, m6
2418
+ psubw m13, m4
2419
+ psubw m14, m5
2420
+ psubw m15, m6
2421
+ psubw m16, m4
2422
+ psubw m17, m5
2423
+ psubw m18, m6
2424
+
2425
+ pabsw m7, m7
2426
+ pabsw m8, m8
2427
+ pabsw m9, m9
2428
+ pabsw m10, m10
2429
+ pabsw m11, m11
2430
+ pabsw m12, m12
2431
+ pabsw m13, m13
2432
+ pabsw m14, m14
2433
+ pabsw m15, m15
2434
+ pabsw m16, m16
2435
+ pabsw m17, m17
2436
+ pabsw m18, m18
2437
+
2438
+ paddw m7, m8
2439
+ paddw m7, m9
2440
+ paddw m10, m11
2441
+ paddw m10, m12
2442
+ paddw m13, m14
2443
+ paddw m13, m15
2444
+ paddw m16, m17
2445
+ paddw m16, m18
2446
+
2447
+ pmaddwd m7, m19
2448
+ paddd m0, m7
2449
+ pmaddwd m10, m19
2450
+ paddd m1, m10
2451
+ pmaddwd m13, m19
2452
+ paddd m2, m13
2453
+ pmaddwd m16, m19
2454
+ paddd m3, m16
2455
+
2456
+ movu m4, [r0 + 4 * FENC_STRIDE]
2457
+ movu m5, [r0 + 6 * FENC_STRIDE]
2458
+ movu ym6, [r0 + 4 * FENC_STRIDE + mmsize]
2459
+ vinserti32x8 m6, [r0 + 6 * FENC_STRIDE + mmsize], 1
2460
+ movu m7, [r1 + 2 * r5]
2461
+ movu m8, [r1 + r7]
2462
+ movu ym9, [r1 + 2 * r5 + mmsize]
2463
+ vinserti32x8 m9, [r1 + r7 + mmsize], 1
2464
+ movu m10, [r2 + 2 * r5]
2465
+ movu m11, [r2 + r7]
2466
+ movu ym12, [r2 + 2 * r5 + mmsize]
2467
+ vinserti32x8 m12, [r2 + r7 + mmsize], 1
2468
+ movu m13, [r3 + 2 * r5]
2469
+ movu m14, [r3 + r7]
2470
+ movu ym15, [r3 + 2 * r5 + mmsize]
2471
+ vinserti32x8 m15, [r3 + r7 + mmsize], 1
2472
+ movu m16, [r4 + 2 * r5]
2473
+ movu m17, [r4 + r7]
2474
+ movu ym18, [r4 + 2 * r5 + mmsize]
2475
+ vinserti32x8 m18, [r4 + r7 + mmsize], 1
2476
+
2477
+
2478
+ psubw m7, m4
2479
+ psubw m8, m5
2480
+ psubw m9, m6
2481
+ psubw m10, m4
2482
+ psubw m11, m5
2483
+ psubw m12, m6
2484
+ psubw m13, m4
2485
+ psubw m14, m5
2486
+ psubw m15, m6
2487
+ psubw m16, m4
2488
+ psubw m17, m5
2489
+ psubw m18, m6
2490
+
2491
+ pabsw m7, m7
2492
+ pabsw m8, m8
2493
+ pabsw m9, m9
2494
+ pabsw m10, m10
2495
+ pabsw m11, m11
2496
+ pabsw m12, m12
2497
+ pabsw m13, m13
2498
+ pabsw m14, m14
2499
+ pabsw m15, m15
2500
+ pabsw m16, m16
2501
+ pabsw m17, m17
2502
+ pabsw m18, m18
2503
+
2504
+ paddw m7, m8
2505
+ paddw m7, m9
2506
+ paddw m10, m11
2507
+ paddw m10, m12
2508
+ paddw m13, m14
2509
+ paddw m13, m15
2510
+ paddw m16, m17
2511
+ paddw m16, m18
2512
+
2513
+ pmaddwd m7, m19
2514
+ paddd m0, m7
2515
+ pmaddwd m10, m19
2516
+ paddd m1, m10
2517
+ pmaddwd m13, m19
2518
+ paddd m2, m13
2519
+ pmaddwd m16, m19
2520
+ paddd m3, m16
2521
+
2522
+ add r0, FENC_STRIDE * 8
2523
+ lea r1, [r1 + r5 * 4]
2524
+ lea r2, [r2 + r5 * 4]
2525
+ lea r3, [r3 + r5 * 4]
2526
+ lea r4, [r4 + r5 * 4]
2527
+
2528
+ dec r8d
2529
+ jg .loop
2530
+
2531
+ PROCESS_SAD_X4_END_AVX512
2532
+ RET
2533
+%endif
2534
+
2535
+;------------------------------------------------------------------------------------------------------------------------------------------------------------
2536
+; void pixel_sad_x4_64x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res )
2537
+;------------------------------------------------------------------------------------------------------------------------------------------------------------
2538
+%if ARCH_X86_64
2539
+INIT_ZMM avx512
2540
+cglobal pixel_sad_x4_64x16, 6,8,15
2541
+ pxor m0, m0
2542
+ pxor m1, m1
2543
+ pxor m2, m2
2544
+ pxor m3, m3
2545
+
2546
+ vbroadcasti32x8 m9, [pw_1]
2547
+
2548
+ add r5d, r5d
2549
+ lea r7d, [r5 * 3]
2550
+
2551
+ PROCESS_SAD_X4_64x4_AVX512
2552
+ add r0, FENC_STRIDE * 8
2553
+ lea r1, [r1 + r5 * 4]
2554
+ lea r2, [r2 + r5 * 4]
2555
+ lea r3, [r3 + r5 * 4]
2556
+ lea r4, [r4 + r5 * 4]
2557
+ PROCESS_SAD_X4_64x4_AVX512
2558
+ add r0, FENC_STRIDE * 8
2559
+ lea r1, [r1 + r5 * 4]
2560
+ lea r2, [r2 + r5 * 4]
2561
+ lea r3, [r3 + r5 * 4]
2562
+ lea r4, [r4 + r5 * 4]
2563
+ PROCESS_SAD_X4_64x4_AVX512
2564
+ add r0, FENC_STRIDE * 8
2565
+ lea r1, [r1 + r5 * 4]
2566
+ lea r2, [r2 + r5 * 4]
2567
+ lea r3, [r3 + r5 * 4]
2568
+ lea r4, [r4 + r5 * 4]
2569
+ PROCESS_SAD_X4_64x4_AVX512
2570
+ PROCESS_SAD_X4_END_AVX512
2571
+ RET
2572
+
2573
+INIT_ZMM avx512
2574
+cglobal pixel_sad_x4_64x32, 6,8,15
2575
+ pxor m0, m0
2576
+ pxor m1, m1
2577
+ pxor m2, m2
2578
+ pxor m3, m3
2579
+
2580
+ vbroadcasti32x8 m9, [pw_1]
2581
+
2582
+ add r5d, r5d
2583
+ lea r7d, [r5 * 3]
2584
+
2585
+ PROCESS_SAD_X4_64x4_AVX512
2586
+ add r0, FENC_STRIDE * 8
2587
+ lea r1, [r1 + r5 * 4]
2588
+ lea r2, [r2 + r5 * 4]
2589
+ lea r3, [r3 + r5 * 4]
2590
+ lea r4, [r4 + r5 * 4]
2591
+ PROCESS_SAD_X4_64x4_AVX512
2592
+ add r0, FENC_STRIDE * 8
2593
+ lea r1, [r1 + r5 * 4]
2594
+ lea r2, [r2 + r5 * 4]
2595
+ lea r3, [r3 + r5 * 4]
2596
+ lea r4, [r4 + r5 * 4]
2597
+ PROCESS_SAD_X4_64x4_AVX512
2598
+ add r0, FENC_STRIDE * 8
2599
+ lea r1, [r1 + r5 * 4]
2600
+ lea r2, [r2 + r5 * 4]
2601
+ lea r3, [r3 + r5 * 4]
2602
+ lea r4, [r4 + r5 * 4]
2603
+ PROCESS_SAD_X4_64x4_AVX512
2604
+ add r0, FENC_STRIDE * 8
2605
+ lea r1, [r1 + r5 * 4]
2606
+ lea r2, [r2 + r5 * 4]
2607
+ lea r3, [r3 + r5 * 4]
2608
+ lea r4, [r4 + r5 * 4]
2609
+ PROCESS_SAD_X4_64x4_AVX512
2610
+ add r0, FENC_STRIDE * 8
2611
+ lea r1, [r1 + r5 * 4]
2612
+ lea r2, [r2 + r5 * 4]
2613
+ lea r3, [r3 + r5 * 4]
2614
+ lea r4, [r4 + r5 * 4]
2615
+ PROCESS_SAD_X4_64x4_AVX512
2616
+ add r0, FENC_STRIDE * 8
2617
+ lea r1, [r1 + r5 * 4]
2618
+ lea r2, [r2 + r5 * 4]
2619
+ lea r3, [r3 + r5 * 4]
2620
+ lea r4, [r4 + r5 * 4]
2621
+ PROCESS_SAD_X4_64x4_AVX512
2622
+ add r0, FENC_STRIDE * 8
2623
+ lea r1, [r1 + r5 * 4]
2624
+ lea r2, [r2 + r5 * 4]
2625
+ lea r3, [r3 + r5 * 4]
2626
+ lea r4, [r4 + r5 * 4]
2627
+ PROCESS_SAD_X4_64x4_AVX512
2628
+ PROCESS_SAD_X4_END_AVX512
2629
+ RET
2630
+
2631
+INIT_ZMM avx512
2632
+cglobal pixel_sad_x4_64x48, 6,8,15
2633
+ pxor m0, m0
2634
+ pxor m1, m1
2635
+ pxor m2, m2
2636
+ pxor m3, m3
2637
+
2638
+ vbroadcasti32x8 m9, [pw_1]
2639
+
2640
+ add r5d, r5d
2641
+ lea r7d, [r5 * 3]
2642
+
2643
+ PROCESS_SAD_X4_64x4_AVX512
2644
+ add r0, FENC_STRIDE * 8
2645
+ lea r1, [r1 + r5 * 4]
2646
+ lea r2, [r2 + r5 * 4]
2647
+ lea r3, [r3 + r5 * 4]
2648
+ lea r4, [r4 + r5 * 4]
2649
+ PROCESS_SAD_X4_64x4_AVX512
2650
+ add r0, FENC_STRIDE * 8
2651
+ lea r1, [r1 + r5 * 4]
2652
+ lea r2, [r2 + r5 * 4]
2653
+ lea r3, [r3 + r5 * 4]
2654
+ lea r4, [r4 + r5 * 4]
2655
+ PROCESS_SAD_X4_64x4_AVX512
2656
+ add r0, FENC_STRIDE * 8
2657
+ lea r1, [r1 + r5 * 4]
2658
+ lea r2, [r2 + r5 * 4]
2659
+ lea r3, [r3 + r5 * 4]
2660
+ lea r4, [r4 + r5 * 4]
2661
+ PROCESS_SAD_X4_64x4_AVX512
2662
+ add r0, FENC_STRIDE * 8
2663
+ lea r1, [r1 + r5 * 4]
2664
+ lea r2, [r2 + r5 * 4]
2665
+ lea r3, [r3 + r5 * 4]
2666
+ lea r4, [r4 + r5 * 4]
2667
+ PROCESS_SAD_X4_64x4_AVX512
2668
+ add r0, FENC_STRIDE * 8
2669
+ lea r1, [r1 + r5 * 4]
2670
+ lea r2, [r2 + r5 * 4]
2671
+ lea r3, [r3 + r5 * 4]
2672
+ lea r4, [r4 + r5 * 4]
2673
+ PROCESS_SAD_X4_64x4_AVX512
2674
+ add r0, FENC_STRIDE * 8
2675
+ lea r1, [r1 + r5 * 4]
2676
+ lea r2, [r2 + r5 * 4]
2677
+ lea r3, [r3 + r5 * 4]
2678
+ lea r4, [r4 + r5 * 4]
2679
+ PROCESS_SAD_X4_64x4_AVX512
2680
+ add r0, FENC_STRIDE * 8
2681
+ lea r1, [r1 + r5 * 4]
2682
+ lea r2, [r2 + r5 * 4]
2683
+ lea r3, [r3 + r5 * 4]
2684
+ lea r4, [r4 + r5 * 4]
2685
+ PROCESS_SAD_X4_64x4_AVX512
2686
+ add r0, FENC_STRIDE * 8
2687
+ lea r1, [r1 + r5 * 4]
2688
+ lea r2, [r2 + r5 * 4]
2689
+ lea r3, [r3 + r5 * 4]
2690
+ lea r4, [r4 + r5 * 4]
2691
+ PROCESS_SAD_X4_64x4_AVX512
2692
+ add r0, FENC_STRIDE * 8
2693
+ lea r1, [r1 + r5 * 4]
2694
+ lea r2, [r2 + r5 * 4]
2695
+ lea r3, [r3 + r5 * 4]
2696
+ lea r4, [r4 + r5 * 4]
2697
+ PROCESS_SAD_X4_64x4_AVX512
2698
+ add r0, FENC_STRIDE * 8
2699
+ lea r1, [r1 + r5 * 4]
2700
+ lea r2, [r2 + r5 * 4]
2701
+ lea r3, [r3 + r5 * 4]
2702
+ lea r4, [r4 + r5 * 4]
2703
+ PROCESS_SAD_X4_64x4_AVX512
2704
+ add r0, FENC_STRIDE * 8
2705
+ lea r1, [r1 + r5 * 4]
2706
+ lea r2, [r2 + r5 * 4]
2707
+ lea r3, [r3 + r5 * 4]
2708
+ lea r4, [r4 + r5 * 4]
2709
+ PROCESS_SAD_X4_64x4_AVX512
2710
+ PROCESS_SAD_X4_END_AVX512
2711
+ RET
2712
+
2713
+INIT_ZMM avx512
2714
+cglobal pixel_sad_x4_64x64, 6,8,15
2715
+ pxor m0, m0
2716
+ pxor m1, m1
2717
+ pxor m2, m2
2718
+ pxor m3, m3
2719
+
2720
+ vbroadcasti32x8 m9, [pw_1]
2721
+
2722
+ add r5d, r5d
2723
+ lea r7d, [r5 * 3]
2724
+
2725
+ PROCESS_SAD_X4_64x4_AVX512
2726
+ add r0, FENC_STRIDE * 8
2727
+ lea r1, [r1 + r5 * 4]
2728
+ lea r2, [r2 + r5 * 4]
2729
+ lea r3, [r3 + r5 * 4]
2730
+ lea r4, [r4 + r5 * 4]
2731
+ PROCESS_SAD_X4_64x4_AVX512
2732
+ add r0, FENC_STRIDE * 8
2733
+ lea r1, [r1 + r5 * 4]
2734
+ lea r2, [r2 + r5 * 4]
2735
+ lea r3, [r3 + r5 * 4]
2736
+ lea r4, [r4 + r5 * 4]
2737
+ PROCESS_SAD_X4_64x4_AVX512
2738
+ add r0, FENC_STRIDE * 8
2739
+ lea r1, [r1 + r5 * 4]
2740
+ lea r2, [r2 + r5 * 4]
2741
+ lea r3, [r3 + r5 * 4]
2742
+ lea r4, [r4 + r5 * 4]
2743
+ PROCESS_SAD_X4_64x4_AVX512
2744
+ add r0, FENC_STRIDE * 8
2745
+ lea r1, [r1 + r5 * 4]
2746
+ lea r2, [r2 + r5 * 4]
2747
+ lea r3, [r3 + r5 * 4]
2748
+ lea r4, [r4 + r5 * 4]
2749
+ PROCESS_SAD_X4_64x4_AVX512
2750
+ add r0, FENC_STRIDE * 8
2751
+ lea r1, [r1 + r5 * 4]
2752
+ lea r2, [r2 + r5 * 4]
2753
+ lea r3, [r3 + r5 * 4]
2754
+ lea r4, [r4 + r5 * 4]
2755
+ PROCESS_SAD_X4_64x4_AVX512
2756
+ add r0, FENC_STRIDE * 8
2757
+ lea r1, [r1 + r5 * 4]
2758
+ lea r2, [r2 + r5 * 4]
2759
+ lea r3, [r3 + r5 * 4]
2760
+ lea r4, [r4 + r5 * 4]
2761
+ PROCESS_SAD_X4_64x4_AVX512
2762
+ add r0, FENC_STRIDE * 8
2763
+ lea r1, [r1 + r5 * 4]
2764
+ lea r2, [r2 + r5 * 4]
2765
+ lea r3, [r3 + r5 * 4]
2766
+ lea r4, [r4 + r5 * 4]
2767
+ PROCESS_SAD_X4_64x4_AVX512
2768
+ add r0, FENC_STRIDE * 8
2769
+ lea r1, [r1 + r5 * 4]
2770
+ lea r2, [r2 + r5 * 4]
2771
+ lea r3, [r3 + r5 * 4]
2772
+ lea r4, [r4 + r5 * 4]
2773
+ PROCESS_SAD_X4_64x4_AVX512
2774
+ add r0, FENC_STRIDE * 8
2775
+ lea r1, [r1 + r5 * 4]
2776
+ lea r2, [r2 + r5 * 4]
2777
+ lea r3, [r3 + r5 * 4]
2778
+ lea r4, [r4 + r5 * 4]
2779
+ PROCESS_SAD_X4_64x4_AVX512
2780
+ add r0, FENC_STRIDE * 8
2781
+ lea r1, [r1 + r5 * 4]
2782
+ lea r2, [r2 + r5 * 4]
2783
+ lea r3, [r3 + r5 * 4]
2784
+ lea r4, [r4 + r5 * 4]
2785
+ PROCESS_SAD_X4_64x4_AVX512
2786
+ add r0, FENC_STRIDE * 8
2787
+ lea r1, [r1 + r5 * 4]
2788
+ lea r2, [r2 + r5 * 4]
2789
+ lea r3, [r3 + r5 * 4]
2790
+ lea r4, [r4 + r5 * 4]
2791
+ PROCESS_SAD_X4_64x4_AVX512
2792
+ add r0, FENC_STRIDE * 8
2793
+ lea r1, [r1 + r5 * 4]
2794
+ lea r2, [r2 + r5 * 4]
2795
+ lea r3, [r3 + r5 * 4]
2796
+ lea r4, [r4 + r5 * 4]
2797
+ PROCESS_SAD_X4_64x4_AVX512
2798
+ add r0, FENC_STRIDE * 8
2799
+ lea r1, [r1 + r5 * 4]
2800
+ lea r2, [r2 + r5 * 4]
2801
+ lea r3, [r3 + r5 * 4]
2802
+ lea r4, [r4 + r5 * 4]
2803
+ PROCESS_SAD_X4_64x4_AVX512
2804
+ add r0, FENC_STRIDE * 8
2805
+ lea r1, [r1 + r5 * 4]
2806
+ lea r2, [r2 + r5 * 4]
2807
+ lea r3, [r3 + r5 * 4]
2808
+ lea r4, [r4 + r5 * 4]
2809
+ PROCESS_SAD_X4_64x4_AVX512
2810
+ add r0, FENC_STRIDE * 8
2811
+ lea r1, [r1 + r5 * 4]
2812
+ lea r2, [r2 + r5 * 4]
2813
+ lea r3, [r3 + r5 * 4]
2814
+ lea r4, [r4 + r5 * 4]
2815
+ PROCESS_SAD_X4_64x4_AVX512
2816
+ PROCESS_SAD_X4_END_AVX512
2817
+ RET
2818
+%endif
2819
x265_2.7.tar.gz/source/common/x86/ssd-a.asm -> x265_2.9.tar.gz/source/common/x86/ssd-a.asm
Changed
590
1
2
3
; Function to find ssd for 32x16 block, sse2, 12 bit depth
4
; Defined sepeartely to be called from SSD_ONE_32 macro
5
+%if ARCH_X86_64
6
+;This code is written for 64 bit architecture
7
INIT_XMM sse2
8
cglobal ssd_ss_32x16
9
pxor m8, m8
10
11
paddq m4, m5
12
paddq m9, m4
13
ret
14
+%endif
15
16
%macro SSD_ONE_32 0
17
+%if ARCH_X86_64
18
cglobal pixel_ssd_ss_32x64, 4,7,10
19
add r1d, r1d
20
add r3d, r3d
21
22
call ssd_ss_32x16
23
movq rax, m9
24
RET
25
+%endif
26
%endmacro
27
+
28
%macro SSD_ONE_SS_32 0
29
cglobal pixel_ssd_ss_32x32, 4,5,8
30
add r1d, r1d
31
32
RET
33
%endmacro
34
35
+%if ARCH_X86_64
36
INIT_YMM avx2
37
cglobal pixel_ssd_16x16, 4,7,3
38
FIX_STRIDES r1, r3
39
40
movq rax, xm3
41
RET
42
43
+INIT_ZMM avx512
44
+cglobal pixel_ssd_32x2
45
+ pxor m0, m0
46
+ movu m1, [r0]
47
+ psubw m1, [r2]
48
+ pmaddwd m1, m1
49
+ paddd m0, m1
50
+ movu m1, [r0 + r1]
51
+ psubw m1, [r2 + r3]
52
+ pmaddwd m1, m1
53
+ paddd m0, m1
54
+ lea r0, [r0 + r1 * 2]
55
+ lea r2, [r2 + r3 * 2]
56
+
57
+ mova m1, m0
58
+ pxor m2, m2
59
+ punpckldq m0, m2
60
+ punpckhdq m1, m2
61
+
62
+ paddq m3, m0
63
+ paddq m3, m1
64
+ret
65
+
66
+INIT_ZMM avx512
67
+cglobal pixel_ssd_32x32, 4,5,5
68
+ shl r1d, 1
69
+ shl r3d, 1
70
+ pxor m3, m3
71
+ mov r4, 16
72
+.iterate:
73
+ call pixel_ssd_32x2
74
+ dec r4d
75
+ jne .iterate
76
+
77
+ vextracti32x8 ym4, m3, 1
78
+ paddq ym3, ym4
79
+ vextracti32x4 xm4, m3, 1
80
+ paddq xm3, xm4
81
+ movhlps xm4, xm3
82
+ paddq xm3, xm4
83
+ movq rax, xm3
84
+RET
85
+
86
+INIT_ZMM avx512
87
+cglobal pixel_ssd_32x64, 4,5,5
88
+ shl r1d, 1
89
+ shl r3d, 1
90
+ pxor m3, m3
91
+ mov r4, 32
92
+.iterate:
93
+ call pixel_ssd_32x2
94
+ dec r4d
95
+ jne .iterate
96
+
97
+ vextracti32x8 ym4, m3, 1
98
+ paddq ym3, ym4
99
+ vextracti32x4 xm4, m3, 1
100
+ paddq xm3, xm4
101
+ movhlps xm4, xm3
102
+ paddq xm3, xm4
103
+ movq rax, xm3
104
+RET
105
+
106
+INIT_ZMM avx512
107
+cglobal pixel_ssd_64x64, 4,5,5
108
+ FIX_STRIDES r1, r3
109
+ mov r4d, 64
110
+ pxor m3, m3
111
+
112
+.loop:
113
+ pxor m0, m0
114
+ movu m1, [r0]
115
+ psubw m1, [r2]
116
+ pmaddwd m1, m1
117
+ paddd m0, m1
118
+ movu m1, [r0 + mmsize]
119
+ psubw m1, [r2 + mmsize]
120
+ pmaddwd m1, m1
121
+ paddd m0, m1
122
+
123
+ lea r0, [r0 + r1]
124
+ lea r2, [r2 + r3]
125
+
126
+ mova m1, m0
127
+ pxor m2, m2
128
+ punpckldq m0, m2
129
+ punpckhdq m1, m2
130
+ paddq m3, m0
131
+ paddq m3, m1
132
+
133
+ dec r4d
134
+ jg .loop
135
+
136
+ vextracti32x8 ym4, m3, 1
137
+ paddq ym3, ym4
138
+ vextracti32x4 xm4, m3, 1
139
+ paddq xm3, xm4
140
+ movhlps xm4, xm3
141
+ paddq xm3, xm4
142
+ movq rax, xm3
143
+ RET
144
+%endif
145
INIT_MMX mmx2
146
SSD_ONE 4, 4
147
SSD_ONE 4, 8
148
149
%if BIT_DEPTH <= 10
150
SSD_ONE 32, 64
151
SSD_ONE 32, 32
152
+%if ARCH_X86_64
153
SSD_TWO 64, 64
154
+%endif
155
%else
156
SSD_ONE_32
157
SSD_ONE_SS_32
158
159
HADDD m2, m0
160
movd eax, xm2
161
RET
162
+;-----------------------------------------------------------------------------
163
+; ssd_ss avx512 code start
164
+;-----------------------------------------------------------------------------
165
+%if ARCH_X86_64
166
+%macro PROCESS_SSD_SS_64x4_AVX512 0
167
+ movu m0, [r0]
168
+ movu m1, [r0 + mmsize]
169
+ movu m2, [r0 + r1]
170
+ movu m3, [r0 + r1 + mmsize]
171
+ movu m4, [r2]
172
+ movu m5, [r2 + mmsize]
173
+ movu m6, [r2 + r3]
174
+ movu m7, [r2 + r3 + mmsize]
175
+
176
+ psubw m0, m4
177
+ psubw m1, m5
178
+ psubw m2, m6
179
+ psubw m3, m7
180
+ pmaddwd m0, m0
181
+ pmaddwd m1, m1
182
+ pmaddwd m2, m2
183
+ pmaddwd m3, m3
184
+ paddd m8, m0
185
+ paddd m8, m1
186
+ paddd m8, m2
187
+ paddd m8, m3
188
189
+ movu m0, [r0 + 2 * r1]
190
+ movu m1, [r0 + 2 * r1 + mmsize]
191
+ movu m2, [r0 + r5]
192
+ movu m3, [r0 + r5 + mmsize]
193
+ movu m4, [r2 + 2 * r3]
194
+ movu m5, [r2 + 2 * r3 + mmsize]
195
+ movu m6, [r2 + r6]
196
+ movu m7, [r2 + r6 + mmsize]
197
+
198
+ psubw m0, m4
199
+ psubw m1, m5
200
+ psubw m2, m6
201
+ psubw m3, m7
202
+ pmaddwd m0, m0
203
+ pmaddwd m1, m1
204
+ pmaddwd m2, m2
205
+ pmaddwd m3, m3
206
+ paddd m8, m0
207
+ paddd m8, m1
208
+ paddd m8, m2
209
+ paddd m8, m3
210
+%endmacro
211
+
212
+%macro PROCESS_SSD_SS_32x4_AVX512 0
213
+ movu m0, [r0]
214
+ movu m1, [r0 + r1]
215
+ movu m2, [r0 + 2 * r1]
216
+ movu m3, [r0 + r5]
217
+ movu m4, [r2]
218
+ movu m5, [r2 + r3]
219
+ movu m6, [r2 + 2 * r3]
220
+ movu m7, [r2 + r6]
221
+
222
+ psubw m0, m4
223
+ psubw m1, m5
224
+ psubw m2, m6
225
+ psubw m3, m7
226
+ pmaddwd m0, m0
227
+ pmaddwd m1, m1
228
+ pmaddwd m2, m2
229
+ pmaddwd m3, m3
230
+ paddd m8, m0
231
+ paddd m8, m1
232
+ paddd m8, m2
233
+ paddd m8, m3
234
+%endmacro
235
+
236
+%macro PROCESS_SSD_SS_16x4_AVX512 0
237
+ movu ym0, [r0]
238
+ vinserti32x8 m0, [r0 + r1], 1
239
+ movu ym1, [r0 + 2 * r1]
240
+ vinserti32x8 m1, [r0 + r5], 1
241
+ movu ym4, [r2]
242
+ vinserti32x8 m4, [r2 + r3], 1
243
+ movu ym5, [r2 + 2 * r3]
244
+ vinserti32x8 m5, [r2 + r6], 1
245
+
246
+ psubw m0, m4
247
+ psubw m1, m5
248
+ pmaddwd m0, m0
249
+ pmaddwd m1, m1
250
+ paddd m8, m0
251
+ paddd m8, m1
252
+%endmacro
253
+
254
+%macro SSD_SS_AVX512 2
255
+INIT_ZMM avx512
256
+cglobal pixel_ssd_ss_%1x%2, 4,7,9
257
+ add r1d, r1d
258
+ add r3d, r3d
259
+ lea r5, [r1 * 3]
260
+ lea r6, [r3 * 3]
261
+ pxor m8, m8
262
+
263
+%rep %2/4 - 1
264
+ PROCESS_SSD_SS_%1x4_AVX512
265
+ lea r0, [r0 + 4 * r1]
266
+ lea r2, [r2 + 4 * r3]
267
+%endrep
268
+ PROCESS_SSD_SS_%1x4_AVX512
269
+ HADDD m8, m0
270
+ movd eax, xm8
271
+ RET
272
+%endmacro
273
+
274
+
275
+SSD_SS_AVX512 64, 64
276
+SSD_SS_AVX512 32, 32
277
+SSD_SS_AVX512 16, 16
278
+%endif
279
+;-----------------------------------------------------------------------------
280
+; ssd_ss avx512 code end
281
+;-----------------------------------------------------------------------------
282
%endif ; !HIGH_BIT_DEPTH
283
284
%if HIGH_BIT_DEPTH == 0
285
286
movd eax, m0
287
RET
288
289
-
290
+%if ARCH_X86_64 && BIT_DEPTH >= 10
291
INIT_XMM sse2
292
cglobal pixel_ssd_s_32, 2,3,5
293
add r1, r1
294
295
dec r2d
296
jnz .loop
297
298
-%if BIT_DEPTH >= 10
299
movu m1, m0
300
pxor m2, m2
301
punpckldq m0, m2
302
303
movhlps m1, m0
304
paddq m0, m1
305
movq rax, xm0
306
-%else
307
+ RET
308
+%endif
309
+
310
+%if BIT_DEPTH == 8
311
+INIT_XMM sse2
312
+cglobal pixel_ssd_s_32, 2,3,5
313
+ add r1, r1
314
+
315
+ mov r2d, 16
316
+ pxor m0, m0
317
+.loop:
318
+ movu m1, [r0 + 0 * mmsize]
319
+ movu m2, [r0 + 1 * mmsize]
320
+ movu m3, [r0 + 2 * mmsize]
321
+ movu m4, [r0 + 3 * mmsize]
322
+ add r0, r1
323
+
324
+ pmaddwd m1, m1
325
+ pmaddwd m2, m2
326
+ pmaddwd m3, m3
327
+ pmaddwd m4, m4
328
+ paddd m1, m2
329
+ paddd m3, m4
330
+ paddd m1, m3
331
+ paddd m0, m1
332
+
333
+ movu m1, [r0 + 0 * mmsize]
334
+ movu m2, [r0 + 1 * mmsize]
335
+ movu m3, [r0 + 2 * mmsize]
336
+ movu m4, [r0 + 3 * mmsize]
337
+ add r0, r1
338
+
339
+ pmaddwd m1, m1
340
+ pmaddwd m2, m2
341
+ pmaddwd m3, m3
342
+ pmaddwd m4, m4
343
+ paddd m1, m2
344
+ paddd m3, m4
345
+ paddd m1, m3
346
+ paddd m0, m1
347
+
348
+ dec r2d
349
+ jnz .loop
350
; calculate sum and return
351
HADDD m0, m1
352
movd eax, m0
353
-%endif
354
RET
355
+%endif
356
357
+%if ARCH_X86_64
358
INIT_YMM avx2
359
cglobal pixel_ssd_s_16, 2,4,5
360
add r1, r1
361
362
movd eax, xm0
363
%endif
364
RET
365
+%endif
366
+;-----------------------------------------------------------------------------
367
+; ssd_s avx512 code start
368
+;-----------------------------------------------------------------------------
369
+%macro PROCESS_SSD_S_32x8_AVX512 0
370
+ movu m1, [r0]
371
+ movu m2, [r0 + r1]
372
+ movu m3, [r0 + 2 * r1]
373
+ movu m4, [r0 + r3]
374
+
375
+ pmaddwd m1, m1
376
+ pmaddwd m2, m2
377
+ pmaddwd m3, m3
378
+ pmaddwd m4, m4
379
+ paddd m1, m2
380
+ paddd m3, m4
381
+ paddd m1, m3
382
+ paddd m0, m1
383
+
384
+ lea r0, [r0 + 4 * r1]
385
+
386
+ movu m1, [r0]
387
+ movu m2, [r0 + r1]
388
+ movu m3, [r0 + 2 * r1]
389
+ movu m4, [r0 + r3]
390
+
391
+ pmaddwd m1, m1
392
+ pmaddwd m2, m2
393
+ pmaddwd m3, m3
394
+ pmaddwd m4, m4
395
+ paddd m1, m2
396
+ paddd m3, m4
397
+ paddd m1, m3
398
+ paddd m0, m1
399
+%endmacro
400
+
401
+%macro PROCESS_SSD_S_16x8_AVX512 0
402
+ movu ym1, [r0]
403
+ vinserti32x8 m1, [r0 + r1], 1
404
+ movu ym2, [r0 + 2 * r1]
405
+ vinserti32x8 m2, [r0 + r3], 1
406
+ lea r0, [r0 + 4 * r1]
407
+ movu ym3, [r0]
408
+ vinserti32x8 m3, [r0 + r1], 1
409
+ movu ym4, [r0 + 2 * r1]
410
+ vinserti32x8 m4, [r0 + r3], 1
411
+ pmaddwd m1, m1
412
+ pmaddwd m2, m2
413
+ pmaddwd m3, m3
414
+ pmaddwd m4, m4
415
+ paddd m1, m2
416
+ paddd m3, m4
417
+ paddd m1, m3
418
+ paddd m0, m1
419
+%endmacro
420
+;-----------------------------------------------------------------------------
421
+; int pixel_ssd_s( int16_t *ref, intptr_t i_stride )
422
+;-----------------------------------------------------------------------------
423
+%if ARCH_X86_64
424
+INIT_ZMM avx512
425
+cglobal pixel_ssd_s_32, 2,4,5
426
+ add r1, r1
427
+ lea r3, [r1 * 3]
428
+ pxor m0, m0
429
+
430
+ PROCESS_SSD_S_32x8_AVX512
431
+ lea r0, [r0 + 4 * r1]
432
+ PROCESS_SSD_S_32x8_AVX512
433
+ lea r0, [r0 + 4 * r1]
434
+ PROCESS_SSD_S_32x8_AVX512
435
+ lea r0, [r0 + 4 * r1]
436
+ PROCESS_SSD_S_32x8_AVX512
437
+
438
+ ; calculate sum and return
439
+%if BIT_DEPTH >= 10
440
+ movu m1, m0
441
+ pxor m2, m2
442
+ punpckldq m0, m2
443
+ punpckhdq m1, m2
444
+ paddq m0, m1
445
+ vextracti32x8 ym2, m0, 1
446
+ paddq ym0, ym2
447
+ vextracti32x4 xm2, m0, 1
448
+ paddq xm2, xm0
449
+ movhlps xm1, xm2
450
+ paddq xm2, xm1
451
+ movq rax, xm2
452
+%else
453
+ HADDD m0, m1
454
+ movd eax, xm0
455
+%endif
456
+ RET
457
+
458
+INIT_ZMM avx512
459
+cglobal pixel_ssd_s_16, 2,4,5
460
+ add r1, r1
461
+ lea r3, [r1 * 3]
462
+ pxor m0, m0
463
+
464
+ PROCESS_SSD_S_16x8_AVX512
465
+ lea r0, [r0 + 4 * r1]
466
+ PROCESS_SSD_S_16x8_AVX512
467
+
468
+ ; calculate sum and return
469
+ HADDD m0, m1
470
+ movd eax, xm0
471
+ RET
472
+%endif
473
+;-----------------------------------------------------------------------------
474
+; ssd_s avx512 code end
475
+;-----------------------------------------------------------------------------
476
+;-----------------------------------------------------------------------------
477
+;ALigned version of macro
478
+;-----------------------------------------------------------------------------
479
+%macro PROCESS_SSD_S_16x8_ALIGNED_AVX512 0
480
+ mova ym1, [r0]
481
+ vinserti32x8 m1, [r0 + r1], 1
482
+ mova ym2, [r0 + 2 * r1]
483
+ vinserti32x8 m2, [r0 + r3], 1
484
+ lea r0, [r0 + 4 * r1]
485
+ mova ym3, [r0]
486
+ vinserti32x8 m3, [r0 + r1], 1
487
+ mova ym4, [r0 + 2 * r1]
488
+ vinserti32x8 m4, [r0 + r3], 1
489
+ pmaddwd m1, m1
490
+ pmaddwd m2, m2
491
+ pmaddwd m3, m3
492
+ pmaddwd m4, m4
493
+ paddd m1, m2
494
+ paddd m3, m4
495
+ paddd m1, m3
496
+ paddd m0, m1
497
+%endmacro
498
+;---------------------------------------------------------------------------------
499
+;int pixel_ssd_s_aligned( int16_t *ref, intptr_t i_stride )
500
+;-----------------------------------------------------------------------------------
501
+%if ARCH_X86_64
502
+INIT_ZMM avx512
503
+
504
+INIT_ZMM avx512
505
+cglobal pixel_ssd_s_aligned_16, 2,4,5
506
+ add r1, r1
507
+ lea r3, [r1 * 3]
508
+ pxor m0, m0
509
+
510
+ PROCESS_SSD_S_16x8_ALIGNED_AVX512
511
+ lea r0, [r0 + 4 * r1]
512
+ PROCESS_SSD_S_16x8_ALIGNED_AVX512
513
+
514
+ ; calculate sum and return
515
+ HADDD m0, m1
516
+ movd eax, xm0
517
+ RET
518
+%endif
519
+;---------------------------------------------------------------------------------------------
520
+; aligned implementation for 32
521
+;---------------------------------------------------------------------------------------------
522
+%macro PROCESS_SSD_S_32x8_ALIGNED_AVX512 0
523
+ mova m1, [r0]
524
+ mova m2, [r0 + r1]
525
+ mova m3, [r0 + 2 * r1]
526
+ mova m4, [r0 + r3]
527
+
528
+ pmaddwd m1, m1
529
+ pmaddwd m2, m2
530
+ pmaddwd m3, m3
531
+ pmaddwd m4, m4
532
+ paddd m1, m2
533
+ paddd m3, m4
534
+ paddd m1, m3
535
+ paddd m0, m1
536
+
537
+ lea r0, [r0 + 4 * r1]
538
+
539
+ mova m1, [r0]
540
+ mova m2, [r0 + r1]
541
+ mova m3, [r0 + 2 * r1]
542
+ mova m4, [r0 + r3]
543
+
544
+ pmaddwd m1, m1
545
+ pmaddwd m2, m2
546
+ pmaddwd m3, m3
547
+ pmaddwd m4, m4
548
+ paddd m1, m2
549
+ paddd m3, m4
550
+ paddd m1, m3
551
+ paddd m0, m1
552
+%endmacro
553
+
554
+%if ARCH_X86_64
555
+INIT_ZMM avx512
556
+cglobal pixel_ssd_s_aligned_32, 2,4,5
557
+ add r1, r1
558
+ lea r3, [r1 * 3]
559
+ pxor m0, m0
560
+
561
+ PROCESS_SSD_S_32x8_AVX512
562
+ lea r0, [r0 + 4 * r1]
563
+ PROCESS_SSD_S_32x8_ALIGNED_AVX512
564
+ lea r0, [r0 + 4 * r1]
565
+ PROCESS_SSD_S_32x8_ALIGNED_AVX512
566
+ lea r0, [r0 + 4 * r1]
567
+ PROCESS_SSD_S_32x8_ALIGNED_AVX512
568
+
569
+ ; calculate sum and return
570
+%if BIT_DEPTH >= 10
571
+ mova m1, m0
572
+ pxor m2, m2
573
+ punpckldq m0, m2
574
+ punpckhdq m1, m2
575
+ paddq m0, m1
576
+ vextracti32x8 ym2, m0, 1
577
+ paddq ym0, ym2
578
+ vextracti32x4 xm2, m0, 1
579
+ paddq xm2, xm0
580
+ movhlps xm1, xm2
581
+ paddq xm2, xm1
582
+ movq rax, xm2
583
+%else
584
+ HADDD m0, m1
585
+ movd eax, xm0
586
+%endif
587
+ RET
588
+%endif
589
\ No newline at end of file
590
x265_2.7.tar.gz/source/common/x86/v4-ipfilter16.asm -> x265_2.9.tar.gz/source/common/x86/v4-ipfilter16.asm
Changed
17
1
2
RET
3
%endmacro
4
5
+%if ARCH_X86_64
6
FILTER_VER_CHROMA_AVX2_4xN pp, 16, 1, 6
7
FILTER_VER_CHROMA_AVX2_4xN ps, 16, 0, INTERP_SHIFT_PS
8
FILTER_VER_CHROMA_AVX2_4xN sp, 16, 1, INTERP_SHIFT_SP
9
10
FILTER_VER_CHROMA_AVX2_4xN ps, 32, 0, INTERP_SHIFT_PS
11
FILTER_VER_CHROMA_AVX2_4xN sp, 32, 1, INTERP_SHIFT_SP
12
FILTER_VER_CHROMA_AVX2_4xN ss, 32, 0, 6
13
+%endif
14
15
%macro FILTER_VER_CHROMA_AVX2_8x8 3
16
INIT_YMM avx2
17
x265_2.7.tar.gz/source/common/x86/v4-ipfilter8.asm -> x265_2.9.tar.gz/source/common/x86/v4-ipfilter8.asm
Changed
359
1
2
const v4_interp4_vpp_shuf1, dd 0, 1, 1, 2, 2, 3, 3, 4
3
dd 2, 3, 3, 4, 4, 5, 5, 6
4
5
-const tab_ChromaCoeff, db 0, 64, 0, 0
6
+const v4_tab_ChromaCoeff, db 0, 64, 0, 0
7
db -2, 58, 10, -2
8
db -4, 54, 16, -2
9
db -6, 46, 28, -4
10
11
mova m6, [r5 + r4]
12
mova m5, [r5 + r4 + 16]
13
%else
14
- mova m6, [tab_ChromaCoeff + r4]
15
- mova m5, [tab_ChromaCoeff + r4 + 16]
16
+ mova m6, [v4_tab_ChromaCoeff + r4]
17
+ mova m5, [v4_tab_ChromaCoeff + r4 + 16]
18
%endif
19
20
%ifidn %1,pp
21
22
sub r0, r1
23
24
%ifdef PIC
25
- lea r5, [tab_ChromaCoeff]
26
+ lea r5, [v4_tab_ChromaCoeff]
27
movd m0, [r5 + r4 * 4]
28
%else
29
- movd m0, [tab_ChromaCoeff + r4 * 4]
30
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
31
%endif
32
lea r4, [r1 * 3]
33
lea r5, [r0 + 4 * r1]
34
35
sub r0, r1
36
37
%ifdef PIC
38
- lea r5, [tab_ChromaCoeff]
39
+ lea r5, [v4_tab_ChromaCoeff]
40
movd m0, [r5 + r4 * 4]
41
%else
42
- movd m0, [tab_ChromaCoeff + r4 * 4]
43
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
44
%endif
45
46
pshufb m0, [tab_Cm]
47
48
sub r0, r1
49
50
%ifdef PIC
51
- lea r5, [tab_ChromaCoeff]
52
+ lea r5, [v4_tab_ChromaCoeff]
53
movd m0, [r5 + r4 * 4]
54
%else
55
- movd m0, [tab_ChromaCoeff + r4 * 4]
56
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
57
%endif
58
59
pshufb m0, [tab_Cm]
60
61
sub r0, r1
62
63
%ifdef PIC
64
- lea r5, [tab_ChromaCoeff]
65
+ lea r5, [v4_tab_ChromaCoeff]
66
movd m0, [r5 + r4 * 4]
67
%else
68
- movd m0, [tab_ChromaCoeff + r4 * 4]
69
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
70
%endif
71
72
pshufb m0, [tab_Cm]
73
74
sub r0, r1
75
76
%ifdef PIC
77
- lea r5, [tab_ChromaCoeff]
78
+ lea r5, [v4_tab_ChromaCoeff]
79
movd m0, [r5 + r4 * 4]
80
%else
81
- movd m0, [tab_ChromaCoeff + r4 * 4]
82
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
83
%endif
84
85
pshufb m0, [tab_Cm]
86
87
punpcklbw m4, m2, m3
88
89
%ifdef PIC
90
- lea r6, [tab_ChromaCoeff]
91
+ lea r6, [v4_tab_ChromaCoeff]
92
movd m5, [r6 + r4 * 4]
93
%else
94
- movd m5, [tab_ChromaCoeff + r4 * 4]
95
+ movd m5, [v4_tab_ChromaCoeff + r4 * 4]
96
%endif
97
98
pshufb m6, m5, [tab_Vm]
99
100
add r3d, r3d
101
102
%ifdef PIC
103
- lea r5, [tab_ChromaCoeff]
104
+ lea r5, [v4_tab_ChromaCoeff]
105
movd m0, [r5 + r4 * 4]
106
%else
107
- movd m0, [tab_ChromaCoeff + r4 * 4]
108
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
109
%endif
110
111
pshufb m0, [tab_Cm]
112
113
add r3d, r3d
114
115
%ifdef PIC
116
- lea r5, [tab_ChromaCoeff]
117
+ lea r5, [v4_tab_ChromaCoeff]
118
movd m0, [r5 + r4 * 4]
119
%else
120
- movd m0, [tab_ChromaCoeff + r4 * 4]
121
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
122
%endif
123
124
pshufb m0, [tab_Cm]
125
126
add r3d, r3d
127
128
%ifdef PIC
129
- lea r5, [tab_ChromaCoeff]
130
+ lea r5, [v4_tab_ChromaCoeff]
131
movd m0, [r5 + r4 * 4]
132
%else
133
- movd m0, [tab_ChromaCoeff + r4 * 4]
134
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
135
%endif
136
137
pshufb m0, [tab_Cm]
138
139
add r3d, r3d
140
141
%ifdef PIC
142
- lea r5, [tab_ChromaCoeff]
143
+ lea r5, [v4_tab_ChromaCoeff]
144
movd m5, [r5 + r4 * 4]
145
%else
146
- movd m5, [tab_ChromaCoeff + r4 * 4]
147
+ movd m5, [v4_tab_ChromaCoeff + r4 * 4]
148
%endif
149
150
pshufb m6, m5, [tab_Vm]
151
152
add r3d, r3d
153
154
%ifdef PIC
155
- lea r5, [tab_ChromaCoeff]
156
+ lea r5, [v4_tab_ChromaCoeff]
157
movd m5, [r5 + r4 * 4]
158
%else
159
- movd m5, [tab_ChromaCoeff + r4 * 4]
160
+ movd m5, [v4_tab_ChromaCoeff + r4 * 4]
161
%endif
162
163
pshufb m6, m5, [tab_Vm]
164
165
add r3d, r3d
166
167
%ifdef PIC
168
- lea r5, [tab_ChromaCoeff]
169
+ lea r5, [v4_tab_ChromaCoeff]
170
movd m5, [r5 + r4 * 4]
171
%else
172
- movd m5, [tab_ChromaCoeff + r4 * 4]
173
+ movd m5, [v4_tab_ChromaCoeff + r4 * 4]
174
%endif
175
176
pshufb m6, m5, [tab_Vm]
177
178
add r3d, r3d
179
180
%ifdef PIC
181
- lea r5, [tab_ChromaCoeff]
182
+ lea r5, [v4_tab_ChromaCoeff]
183
movd m0, [r5 + r4 * 4]
184
%else
185
- movd m0, [tab_ChromaCoeff + r4 * 4]
186
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
187
%endif
188
189
pshufb m1, m0, [tab_Vm]
190
191
add r3d, r3d
192
193
%ifdef PIC
194
- lea r5, [tab_ChromaCoeff]
195
+ lea r5, [v4_tab_ChromaCoeff]
196
movd m0, [r5 + r4 * 4]
197
%else
198
- movd m0, [tab_ChromaCoeff + r4 * 4]
199
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
200
%endif
201
202
pshufb m1, m0, [tab_Vm]
203
204
add r3d, r3d
205
206
%ifdef PIC
207
- lea r5, [tab_ChromaCoeff]
208
+ lea r5, [v4_tab_ChromaCoeff]
209
movd m0, [r5 + r4 * 4]
210
%else
211
- movd m0, [tab_ChromaCoeff + r4 * 4]
212
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
213
%endif
214
215
pshufb m1, m0, [tab_Vm]
216
217
add r3d, r3d
218
219
%ifdef PIC
220
- lea r5, [tab_ChromaCoeff]
221
+ lea r5, [v4_tab_ChromaCoeff]
222
movd m0, [r5 + r4 * 4]
223
%else
224
- movd m0, [tab_ChromaCoeff + r4 * 4]
225
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
226
%endif
227
228
pshufb m1, m0, [tab_Vm]
229
230
sub r0, r1
231
232
%ifdef PIC
233
- lea r5, [tab_ChromaCoeff]
234
+ lea r5, [v4_tab_ChromaCoeff]
235
movd m5, [r5 + r4 * 4]
236
%else
237
- movd m5, [tab_ChromaCoeff + r4 * 4]
238
+ movd m5, [v4_tab_ChromaCoeff + r4 * 4]
239
%endif
240
241
pshufb m6, m5, [tab_Vm]
242
243
sub r0, r1
244
245
%ifdef PIC
246
- lea r5, [tab_ChromaCoeff]
247
+ lea r5, [v4_tab_ChromaCoeff]
248
movd m5, [r5 + r4 * 4]
249
%else
250
- movd m5, [tab_ChromaCoeff + r4 * 4]
251
+ movd m5, [v4_tab_ChromaCoeff + r4 * 4]
252
%endif
253
254
pshufb m6, m5, [tab_Vm]
255
256
sub r0, r1
257
258
%ifdef PIC
259
- lea r5, [tab_ChromaCoeff]
260
+ lea r5, [v4_tab_ChromaCoeff]
261
movd m0, [r5 + r4 * 4]
262
%else
263
- movd m0, [tab_ChromaCoeff + r4 * 4]
264
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
265
%endif
266
267
pshufb m1, m0, [tab_Vm]
268
269
sub r0, r1
270
271
%ifdef PIC
272
- lea r5, [tab_ChromaCoeff]
273
+ lea r5, [v4_tab_ChromaCoeff]
274
movd m0, [r5 + r4 * 4]
275
%else
276
- movd m0, [tab_ChromaCoeff + r4 * 4]
277
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
278
%endif
279
280
pshufb m1, m0, [tab_Vm]
281
282
sub r0, r1
283
284
%ifdef PIC
285
- lea r5, [tab_ChromaCoeff]
286
+ lea r5, [v4_tab_ChromaCoeff]
287
movd m0, [r5 + r4 * 4]
288
%else
289
- movd m0, [tab_ChromaCoeff + r4 * 4]
290
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
291
%endif
292
293
pshufb m1, m0, [tab_Vm]
294
295
sub r0, r1
296
297
%ifdef PIC
298
- lea r5, [tab_ChromaCoeff]
299
+ lea r5, [v4_tab_ChromaCoeff]
300
movd m0, [r5 + r4 * 4]
301
%else
302
- movd m0, [tab_ChromaCoeff + r4 * 4]
303
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
304
%endif
305
306
pshufb m1, m0, [tab_Vm]
307
308
sub r0, r1
309
310
%ifdef PIC
311
- lea r5, [tab_ChromaCoeff]
312
+ lea r5, [v4_tab_ChromaCoeff]
313
movd m0, [r5 + r4 * 4]
314
%else
315
- movd m0, [tab_ChromaCoeff + r4 * 4]
316
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
317
%endif
318
319
pshufb m1, m0, [tab_Vm]
320
321
add r3d, r3d
322
323
%ifdef PIC
324
- lea r5, [tab_ChromaCoeff]
325
+ lea r5, [v4_tab_ChromaCoeff]
326
movd m0, [r5 + r4 * 4]
327
%else
328
- movd m0, [tab_ChromaCoeff + r4 * 4]
329
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
330
%endif
331
332
pshufb m1, m0, [tab_Vm]
333
334
add r3d, r3d
335
336
%ifdef PIC
337
- lea r5, [tab_ChromaCoeff]
338
+ lea r5, [v4_tab_ChromaCoeff]
339
movd m0, [r5 + r4 * 4]
340
%else
341
- movd m0, [tab_ChromaCoeff + r4 * 4]
342
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
343
%endif
344
345
pshufb m0, [tab_Cm]
346
347
add r3d, r3d
348
349
%ifdef PIC
350
- lea r5, [tab_ChromaCoeff]
351
+ lea r5, [v4_tab_ChromaCoeff]
352
movd m0, [r5 + r4 * 4]
353
%else
354
- movd m0, [tab_ChromaCoeff + r4 * 4]
355
+ movd m0, [v4_tab_ChromaCoeff + r4 * 4]
356
%endif
357
358
pshufb m0, [tab_Cm]
359
x265_2.7.tar.gz/source/common/x86/x86inc.asm -> x265_2.9.tar.gz/source/common/x86/x86inc.asm
Changed
502
1
2
%endif
3
4
%macro SECTION_RODATA 0-1 32
5
- SECTION .rodata align=%1
6
+ %ifidn __OUTPUT_FORMAT__,win32
7
+ SECTION .rdata align=%1
8
+ %elif WIN64
9
+ SECTION .rdata align=%1
10
+ %else
11
+ SECTION .rodata align=%1
12
+ %endif
13
%endmacro
14
15
%if WIN64
16
17
%endmacro
18
19
%define required_stack_alignment ((mmsize + 15) & ~15)
20
+%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
21
+%define high_mm_regs (16*cpuflag(avx512))
22
23
%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
24
%ifnum %1
25
26
27
%macro WIN64_PUSH_XMM 0
28
; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
29
- %if xmm_regs_used > 6
30
+ %if xmm_regs_used > 6 + high_mm_regs
31
movaps [rstk + stack_offset + 8], xmm6
32
%endif
33
- %if xmm_regs_used > 7
34
+ %if xmm_regs_used > 7 + high_mm_regs
35
movaps [rstk + stack_offset + 24], xmm7
36
%endif
37
- %if xmm_regs_used > 8
38
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
39
+ %if %%xmm_regs_on_stack > 0
40
%assign %%i 8
41
- %rep xmm_regs_used-8
42
+ %rep %%xmm_regs_on_stack
43
movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
44
%assign %%i %%i+1
45
%endrep
46
47
48
%macro WIN64_SPILL_XMM 1
49
%assign xmm_regs_used %1
50
- ASSERT xmm_regs_used <= 16
51
- %if xmm_regs_used > 8
52
+ ASSERT xmm_regs_used <= 16 + high_mm_regs
53
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
54
+ %if %%xmm_regs_on_stack > 0
55
; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
56
%assign %%pad (xmm_regs_used-8)*16 + 32
57
%assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
58
59
60
%macro WIN64_RESTORE_XMM_INTERNAL 0
61
%assign %%pad_size 0
62
- %if xmm_regs_used > 8
63
- %assign %%i xmm_regs_used
64
- %rep xmm_regs_used-8
65
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
66
+ %if %%xmm_regs_on_stack > 0
67
+ %assign %%i xmm_regs_used - high_mm_regs
68
+ %rep %%xmm_regs_on_stack
69
%assign %%i %%i-1
70
movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
71
%endrep
72
73
%assign %%pad_size stack_size_padded
74
%endif
75
%endif
76
- %if xmm_regs_used > 7
77
+ %if xmm_regs_used > 7 + high_mm_regs
78
movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
79
%endif
80
- %if xmm_regs_used > 6
81
+ %if xmm_regs_used > 6 + high_mm_regs
82
movaps xmm6, [rsp + stack_offset - %%pad_size + 8]
83
%endif
84
%endmacro
85
86
%assign xmm_regs_used 0
87
%endmacro
88
89
-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
90
+%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6 + high_mm_regs
91
92
%macro RET 0
93
WIN64_RESTORE_XMM_INTERNAL
94
POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
95
- %if mmsize == 32
96
+ %if vzeroupper_required
97
vzeroupper
98
%endif
99
AUTO_REP_RET
100
101
DECLARE_REG 13, R12, 64
102
DECLARE_REG 14, R13, 72
103
104
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
105
+%macro PROLOGUE 2-5+ 0; #args, #regs, #xmm_regs, [stack_size,] arg_names...
106
%assign num_args %1
107
%assign regs_used %2
108
+ %assign xmm_regs_used %3
109
ASSERT regs_used >= num_args
110
SETUP_STACK_POINTER %4
111
ASSERT regs_used <= 15
112
113
DEFINE_ARGS_INTERNAL %0, %4, %5
114
%endmacro
115
116
-%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
117
+%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
118
119
%macro RET 0
120
%if stack_size_padded > 0
121
122
%endif
123
%endif
124
POP_IF_USED 14, 13, 12, 11, 10, 9
125
- %if mmsize == 32
126
+ %if vzeroupper_required
127
vzeroupper
128
%endif
129
AUTO_REP_RET
130
131
DEFINE_ARGS_INTERNAL %0, %4, %5
132
%endmacro
133
134
-%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
135
+%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
136
137
%macro RET 0
138
%if stack_size_padded > 0
139
140
%endif
141
%endif
142
POP_IF_USED 6, 5, 4, 3
143
- %if mmsize == 32
144
+ %if vzeroupper_required
145
vzeroupper
146
%endif
147
AUTO_REP_RET
148
149
%assign stack_offset 0 ; stack pointer offset relative to the return address
150
%assign stack_size 0 ; amount of stack space that can be freely used inside a function
151
%assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
152
- %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
153
+ %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
154
%ifnidn %3, ""
155
PROLOGUE %3
156
%endif
157
%endmacro
158
159
+; Create a global symbol from a local label with the correct name mangling and type
160
+%macro cglobal_label 1
161
+ %if FORMAT_ELF
162
+ global current_function %+ %1:function hidden
163
+ %else
164
+ global current_function %+ %1
165
+ %endif
166
+ %1:
167
+%endmacro
168
+
169
%macro cextern 1
170
%xdefine %1 mangle(private_prefix %+ _ %+ %1)
171
CAT_XDEFINE cglobaled_, %1, 1
172
173
%assign cpuflags_bmi1 (1<<16)| cpuflags_avx | cpuflags_lzcnt
174
%assign cpuflags_bmi2 (1<<17)| cpuflags_bmi1
175
%assign cpuflags_avx2 (1<<18)| cpuflags_fma3 | cpuflags_bmi2
176
+%assign cpuflags_avx512 (1<<19)| cpuflags_avx2 ; F, CD, BW, DQ, VL
177
178
-%assign cpuflags_cache32 (1<<19)
179
-%assign cpuflags_cache64 (1<<20)
180
-%assign cpuflags_slowctz (1<<21)
181
+%assign cpuflags_cache32 (1<<20)
182
+%assign cpuflags_cache64 (1<<21)
183
%assign cpuflags_aligned (1<<22) ; not a cpu feature, but a function variant
184
%assign cpuflags_atom (1<<23)
185
186
187
%endif
188
%endmacro
189
190
-; Merge mmx and sse*
191
+; Merge mmx and sse*, and avx*
192
; m# is a simd register of the currently selected size
193
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
194
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
195
-; (All 3 remain in sync through SWAP.)
196
+; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
197
+; (All 4 remain in sync through SWAP.)
198
199
%macro CAT_XDEFINE 3
200
%xdefine %1%2 %3
201
202
%undef %1%2
203
%endmacro
204
205
+%macro DEFINE_MMREGS 1 ; mmtype
206
+ %assign %%prev_mmregs 0
207
+ %ifdef num_mmregs
208
+ %assign %%prev_mmregs num_mmregs
209
+ %endif
210
+
211
+ %assign num_mmregs 8
212
+ %if ARCH_X86_64 && mmsize >= 16
213
+ %assign num_mmregs 16
214
+ %if cpuflag(avx512) || mmsize == 64
215
+ %assign num_mmregs 32
216
+ %endif
217
+ %endif
218
+
219
+ %assign %%i 0
220
+ %rep num_mmregs
221
+ CAT_XDEFINE m, %%i, %1 %+ %%i
222
+ CAT_XDEFINE nn%1, %%i, %%i
223
+ %assign %%i %%i+1
224
+ %endrep
225
+ %if %%prev_mmregs > num_mmregs
226
+ %rep %%prev_mmregs - num_mmregs
227
+ CAT_UNDEF m, %%i
228
+ CAT_UNDEF nn %+ mmtype, %%i
229
+ %assign %%i %%i+1
230
+ %endrep
231
+ %endif
232
+ %xdefine mmtype %1
233
+%endmacro
234
+
235
+; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
236
+%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
237
+ %if ARCH_X86_64 && cpuflag(avx512)
238
+ %assign %%i %1
239
+ %rep 16-%1
240
+ %assign %%i_high %%i+16
241
+ SWAP %%i, %%i_high
242
+ %assign %%i %%i+1
243
+ %endrep
244
+ %endif
245
+%endmacro
246
+
247
%macro INIT_MMX 0-1+
248
%assign avx_enabled 0
249
%define RESET_MM_PERMUTATION INIT_MMX %1
250
%define mmsize 8
251
- %define num_mmregs 8
252
%define mova movq
253
%define movu movq
254
%define movh movd
255
%define movnta movntq
256
- %assign %%i 0
257
- %rep 8
258
- CAT_XDEFINE m, %%i, mm %+ %%i
259
- CAT_XDEFINE nnmm, %%i, %%i
260
- %assign %%i %%i+1
261
- %endrep
262
- %rep 8
263
- CAT_UNDEF m, %%i
264
- CAT_UNDEF nnmm, %%i
265
- %assign %%i %%i+1
266
- %endrep
267
INIT_CPUFLAGS %1
268
+ DEFINE_MMREGS mm
269
%endmacro
270
271
%macro INIT_XMM 0-1+
272
%assign avx_enabled 0
273
%define RESET_MM_PERMUTATION INIT_XMM %1
274
%define mmsize 16
275
- %define num_mmregs 8
276
- %if ARCH_X86_64
277
- %define num_mmregs 16
278
- %endif
279
%define mova movdqa
280
%define movu movdqu
281
%define movh movq
282
%define movnta movntdq
283
- %assign %%i 0
284
- %rep num_mmregs
285
- CAT_XDEFINE m, %%i, xmm %+ %%i
286
- CAT_XDEFINE nnxmm, %%i, %%i
287
- %assign %%i %%i+1
288
- %endrep
289
INIT_CPUFLAGS %1
290
+ DEFINE_MMREGS xmm
291
+ %if WIN64
292
+ ; Swap callee-saved registers with volatile registers
293
+ AVX512_MM_PERMUTATION 6
294
+ %endif
295
%endmacro
296
297
%macro INIT_YMM 0-1+
298
%assign avx_enabled 1
299
%define RESET_MM_PERMUTATION INIT_YMM %1
300
%define mmsize 32
301
- %define num_mmregs 8
302
- %if ARCH_X86_64
303
- %define num_mmregs 16
304
- %endif
305
%define mova movdqa
306
%define movu movdqu
307
%undef movh
308
%define movnta movntdq
309
- %assign %%i 0
310
- %rep num_mmregs
311
- CAT_XDEFINE m, %%i, ymm %+ %%i
312
- CAT_XDEFINE nnymm, %%i, %%i
313
- %assign %%i %%i+1
314
- %endrep
315
INIT_CPUFLAGS %1
316
+ DEFINE_MMREGS ymm
317
+ AVX512_MM_PERMUTATION
318
+%endmacro
319
+
320
+%macro INIT_ZMM 0-1+
321
+ %assign avx_enabled 1
322
+ %define RESET_MM_PERMUTATION INIT_ZMM %1
323
+ %define mmsize 64
324
+ %define mova movdqa
325
+ %define movu movdqu
326
+ %undef movh
327
+ %define movnta movntdq
328
+ INIT_CPUFLAGS %1
329
+ DEFINE_MMREGS zmm
330
+ AVX512_MM_PERMUTATION
331
%endmacro
332
333
INIT_XMM
334
335
%define mmmm%1 mm%1
336
%define mmxmm%1 mm%1
337
%define mmymm%1 mm%1
338
+ %define mmzmm%1 mm%1
339
%define xmmmm%1 mm%1
340
%define xmmxmm%1 xmm%1
341
%define xmmymm%1 xmm%1
342
+ %define xmmzmm%1 xmm%1
343
%define ymmmm%1 mm%1
344
%define ymmxmm%1 xmm%1
345
%define ymmymm%1 ymm%1
346
+ %define ymmzmm%1 ymm%1
347
+ %define zmmmm%1 mm%1
348
+ %define zmmxmm%1 xmm%1
349
+ %define zmmymm%1 ymm%1
350
+ %define zmmzmm%1 zmm%1
351
%define xm%1 xmm %+ m%1
352
%define ym%1 ymm %+ m%1
353
+ %define zm%1 zmm %+ m%1
354
%endmacro
355
356
%assign i 0
357
-%rep 16
358
+%rep 32
359
DECLARE_MMCAST i
360
%assign i i+1
361
%endrep
362
363
;=============================================================================
364
365
%assign i 0
366
-%rep 16
367
+%rep 32
368
%if i < 8
369
CAT_XDEFINE sizeofmm, i, 8
370
+ CAT_XDEFINE regnumofmm, i, i
371
%endif
372
CAT_XDEFINE sizeofxmm, i, 16
373
CAT_XDEFINE sizeofymm, i, 32
374
+ CAT_XDEFINE sizeofzmm, i, 64
375
+ CAT_XDEFINE regnumofxmm, i, i
376
+ CAT_XDEFINE regnumofymm, i, i
377
+ CAT_XDEFINE regnumofzmm, i, i
378
%assign i i+1
379
%endrep
380
%undef i
381
382
%endmacro
383
%endmacro
384
385
-; Instructions with both VEX and non-VEX encodings
386
+; Instructions with both VEX/EVEX and legacy encodings
387
; Non-destructive instructions are written without parameters
388
AVX_INSTR addpd, sse2, 1, 0, 1
389
AVX_INSTR addps, sse, 1, 0, 1
390
391
AVX_INSTR addss, sse, 1, 0, 0
392
AVX_INSTR addsubpd, sse3, 1, 0, 0
393
AVX_INSTR addsubps, sse3, 1, 0, 0
394
-AVX_INSTR aesdec, fnord, 0, 0, 0
395
-AVX_INSTR aesdeclast, fnord, 0, 0, 0
396
-AVX_INSTR aesenc, fnord, 0, 0, 0
397
-AVX_INSTR aesenclast, fnord, 0, 0, 0
398
-AVX_INSTR aesimc
399
-AVX_INSTR aeskeygenassist
400
+AVX_INSTR aesdec, aesni, 0, 0, 0
401
+AVX_INSTR aesdeclast, aesni, 0, 0, 0
402
+AVX_INSTR aesenc, aesni, 0, 0, 0
403
+AVX_INSTR aesenclast, aesni, 0, 0, 0
404
+AVX_INSTR aesimc, aesni
405
+AVX_INSTR aeskeygenassist, aesni
406
AVX_INSTR andnpd, sse2, 1, 0, 0
407
AVX_INSTR andnps, sse, 1, 0, 0
408
AVX_INSTR andpd, sse2, 1, 0, 1
409
410
AVX_INSTR blendps, sse4, 1, 1, 0
411
AVX_INSTR blendvpd, sse4 ; can't be emulated
412
AVX_INSTR blendvps, sse4 ; can't be emulated
413
+AVX_INSTR cmpeqpd, sse2, 1, 0, 1
414
+AVX_INSTR cmpeqps, sse, 1, 0, 1
415
+AVX_INSTR cmpeqsd, sse2, 1, 0, 0
416
+AVX_INSTR cmpeqss, sse, 1, 0, 0
417
+AVX_INSTR cmplepd, sse2, 1, 0, 0
418
+AVX_INSTR cmpleps, sse, 1, 0, 0
419
+AVX_INSTR cmplesd, sse2, 1, 0, 0
420
+AVX_INSTR cmpless, sse, 1, 0, 0
421
+AVX_INSTR cmpltpd, sse2, 1, 0, 0
422
+AVX_INSTR cmpltps, sse, 1, 0, 0
423
+AVX_INSTR cmpltsd, sse2, 1, 0, 0
424
+AVX_INSTR cmpltss, sse, 1, 0, 0
425
+AVX_INSTR cmpneqpd, sse2, 1, 0, 1
426
+AVX_INSTR cmpneqps, sse, 1, 0, 1
427
+AVX_INSTR cmpneqsd, sse2, 1, 0, 0
428
+AVX_INSTR cmpneqss, sse, 1, 0, 0
429
+AVX_INSTR cmpnlepd, sse2, 1, 0, 0
430
+AVX_INSTR cmpnleps, sse, 1, 0, 0
431
+AVX_INSTR cmpnlesd, sse2, 1, 0, 0
432
+AVX_INSTR cmpnless, sse, 1, 0, 0
433
+AVX_INSTR cmpnltpd, sse2, 1, 0, 0
434
+AVX_INSTR cmpnltps, sse, 1, 0, 0
435
+AVX_INSTR cmpnltsd, sse2, 1, 0, 0
436
+AVX_INSTR cmpnltss, sse, 1, 0, 0
437
+AVX_INSTR cmpordpd, sse2 1, 0, 1
438
+AVX_INSTR cmpordps, sse 1, 0, 1
439
+AVX_INSTR cmpordsd, sse2 1, 0, 0
440
+AVX_INSTR cmpordss, sse 1, 0, 0
441
AVX_INSTR cmppd, sse2, 1, 1, 0
442
AVX_INSTR cmpps, sse, 1, 1, 0
443
AVX_INSTR cmpsd, sse2, 1, 1, 0
444
AVX_INSTR cmpss, sse, 1, 1, 0
445
+AVX_INSTR cmpunordpd, sse2, 1, 0, 1
446
+AVX_INSTR cmpunordps, sse, 1, 0, 1
447
+AVX_INSTR cmpunordsd, sse2, 1, 0, 0
448
+AVX_INSTR cmpunordss, sse, 1, 0, 0
449
AVX_INSTR comisd, sse2
450
AVX_INSTR comiss, sse
451
AVX_INSTR cvtdq2pd, sse2
452
453
FMA4_INSTR fmsubadd, pd, ps
454
FMA4_INSTR fnmadd, pd, ps, sd, ss
455
FMA4_INSTR fnmsub, pd, ps, sd, ss
456
+
457
+; Macros for converting VEX instructions to equivalent EVEX ones.
458
+%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
459
+ %macro %1 2-7 fnord, fnord, %1, %2, %3
460
+ %ifidn %3, fnord
461
+ %define %%args %1, %2
462
+ %elifidn %4, fnord
463
+ %define %%args %1, %2, %3
464
+ %else
465
+ %define %%args %1, %2, %3, %4
466
+ %endif
467
+ %assign %%evex_required cpuflag(avx512) & %7
468
+ %ifnum regnumof%1
469
+ %if regnumof%1 >= 16 || sizeof%1 > 32
470
+ %assign %%evex_required 1
471
+ %endif
472
+ %endif
473
+ %ifnum regnumof%2
474
+ %if regnumof%2 >= 16 || sizeof%2 > 32
475
+ %assign %%evex_required 1
476
+ %endif
477
+ %endif
478
+ %if %%evex_required
479
+ %6 %%args
480
+ %else
481
+ %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
482
+ %endif
483
+ %endmacro
484
+%endmacro
485
+
486
+EVEX_INSTR vbroadcastf128, vbroadcastf32x4
487
+EVEX_INSTR vbroadcasti128, vbroadcasti32x4
488
+EVEX_INSTR vextractf128, vextractf32x4
489
+EVEX_INSTR vextracti128, vextracti32x4
490
+EVEX_INSTR vinsertf128, vinsertf32x4
491
+EVEX_INSTR vinserti128, vinserti32x4
492
+EVEX_INSTR vmovdqa, vmovdqa32
493
+EVEX_INSTR vmovdqu, vmovdqu32
494
+EVEX_INSTR vpand, vpandd
495
+EVEX_INSTR vpandn, vpandnd
496
+EVEX_INSTR vpor, vpord
497
+EVEX_INSTR vpxor, vpxord
498
+EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision
499
+EVEX_INSTR vrcpss, vrcp14ss, 1
500
+EVEX_INSTR vrsqrtps, vrsqrt14ps, 1
501
+EVEX_INSTR vrsqrtss, vrsqrt14ss, 1
502
x265_2.7.tar.gz/source/common/x86/x86util.asm -> x265_2.9.tar.gz/source/common/x86/x86util.asm
Changed
101
1
2
pminsw %2, %4
3
%endmacro
4
5
+%macro MOVHL 2 ; dst, src
6
+%ifidn %1, %2
7
+ punpckhqdq %1, %2
8
+%elif cpuflag(avx)
9
+ punpckhqdq %1, %2, %2
10
+%elif cpuflag(sse4)
11
+ pshufd %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones
12
+%else
13
+ movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst
14
+%endif
15
+%endmacro
16
+
17
%macro HADDD 2 ; sum junk
18
-%if sizeof%1 == 32
19
-%define %2 xmm%2
20
- vextracti128 %2, %1, 1
21
-%define %1 xmm%1
22
- paddd %1, %2
23
+%if sizeof%1 >= 64
24
+ vextracti32x8 ymm%2, zmm%1, 1
25
+ paddd ymm%1, ymm%2
26
%endif
27
-%if mmsize >= 16
28
-%if cpuflag(xop) && sizeof%1 == 16
29
- vphadddq %1, %1
30
+%if sizeof%1 >= 32
31
+ vextracti128 xmm%2, ymm%1, 1
32
+ paddd xmm%1, xmm%2
33
+%endif
34
+%if sizeof%1 >= 16
35
+ MOVHL xmm%2, xmm%1
36
+ paddd xmm%1, xmm%2
37
%endif
38
- movhlps %2, %1
39
- paddd %1, %2
40
+%if cpuflag(xop) && sizeof%1 == 16
41
+ vphadddq xmm%1, xmm%1
42
%endif
43
%if notcpuflag(xop)
44
- PSHUFLW %2, %1, q0032
45
- paddd %1, %2
46
+ PSHUFLW xmm%2, xmm%1, q1032
47
+ paddd xmm%1, xmm%2
48
%endif
49
-%undef %1
50
-%undef %2
51
%endmacro
52
53
%macro HADDW 2 ; reg, tmp
54
%if cpuflag(xop) && sizeof%1 == 16
55
vphaddwq %1, %1
56
- movhlps %2, %1
57
+ MOVHL %2, %1
58
paddd %1, %2
59
%else
60
pmaddwd %1, [pw_1]
61
62
%macro HADDUW 2
63
%if cpuflag(xop) && sizeof%1 == 16
64
vphadduwq %1, %1
65
- movhlps %2, %1
66
+ MOVHL %2, %1
67
paddd %1, %2
68
%else
69
HADDUWD %1, %2
70
71
%if %6 ; %5 aligned?
72
mova %1, %4
73
psubw %1, %5
74
+%elif cpuflag(avx)
75
+ movu %1, %4
76
+ psubw %1, %5
77
%else
78
movu %1, %4
79
movu %2, %5
80
psubw %1, %2
81
%endif
82
%else ; !HIGH_BIT_DEPTH
83
-%ifidn %3, none
84
movh %1, %4
85
movh %2, %5
86
+%ifidn %3, none
87
punpcklbw %1, %2
88
punpcklbw %2, %2
89
- psubw %1, %2
90
%else
91
- movh %1, %4
92
punpcklbw %1, %3
93
- movh %2, %5
94
punpcklbw %2, %3
95
- psubw %1, %2
96
%endif
97
+ psubw %1, %2
98
%endif ; HIGH_BIT_DEPTH
99
%endmacro
100
101
x265_2.7.tar.gz/source/common/yuv.cpp -> x265_2.9.tar.gz/source/common/yuv.cpp
Changed
39
1
2
3
void Yuv::addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL, int picCsp)
4
{
5
- primitives.cu[log2SizeL - 2].add_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
6
+ primitives.cu[log2SizeL - 2].add_ps[(m_size % 64 == 0) && (srcYuv0.m_size % 64 == 0) && (srcYuv1.m_size % 64 == 0)](m_buf[0],
7
+ m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
8
if (m_csp != X265_CSP_I400 && picCsp != X265_CSP_I400)
9
{
10
- primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
11
- primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
12
+ primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps[(m_csize % 64 == 0) && (srcYuv0.m_csize % 64 ==0) && (srcYuv1.m_csize % 64 == 0)](m_buf[1],
13
+ m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
14
+ primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps[(m_csize % 64 == 0) && (srcYuv0.m_csize % 64 == 0) && (srcYuv1.m_csize % 64 == 0)](m_buf[2],
15
+ m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
16
}
17
if (picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
18
{
19
20
const int16_t* srcY0 = srcYuv0.getLumaAddr(absPartIdx);
21
const int16_t* srcY1 = srcYuv1.getLumaAddr(absPartIdx);
22
pixel* dstY = getLumaAddr(absPartIdx);
23
- primitives.pu[part].addAvg(srcY0, srcY1, dstY, srcYuv0.m_size, srcYuv1.m_size, m_size);
24
+ primitives.pu[part].addAvg[(srcYuv0.m_size % 64 == 0) && (srcYuv1.m_size % 64 == 0) && (m_size % 64 == 0)](srcY0, srcY1, dstY, srcYuv0.m_size, srcYuv1.m_size, m_size);
25
}
26
if (bChroma)
27
{
28
29
const int16_t* srcV1 = srcYuv1.getCrAddr(absPartIdx);
30
pixel* dstU = getCbAddr(absPartIdx);
31
pixel* dstV = getCrAddr(absPartIdx);
32
- primitives.chroma[m_csp].pu[part].addAvg(srcU0, srcU1, dstU, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
33
- primitives.chroma[m_csp].pu[part].addAvg(srcV0, srcV1, dstV, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
34
+ primitives.chroma[m_csp].pu[part].addAvg[(srcYuv0.m_csize % 64 == 0) && (srcYuv1.m_csize % 64 == 0) && (m_csize % 64 == 0)](srcU0, srcU1, dstU, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
35
+ primitives.chroma[m_csp].pu[part].addAvg[(srcYuv0.m_csize % 64 == 0) && (srcYuv1.m_csize % 64 == 0) && (m_csize % 64 == 0)](srcV0, srcV1, dstV, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
36
}
37
}
38
39
x265_2.7.tar.gz/source/common/yuv.h -> x265_2.9.tar.gz/source/common/yuv.h
Changed
9
1
2
class Yuv
3
{
4
public:
5
-
6
pixel* m_buf[3];
7
8
uint32_t m_size;
9
x265_2.7.tar.gz/source/dynamicHDR10/SeiMetadataDictionary.cpp -> x265_2.9.tar.gz/source/dynamicHDR10/SeiMetadataDictionary.cpp
Changed
28
1
2
const std::string BezierCurveNames::NumberOfAnchors = std::string("NumberOfAnchors");
3
const std::string BezierCurveNames::KneePointX = std::string("KneePointX");
4
const std::string BezierCurveNames::KneePointY = std::string("KneePointY");
5
+const std::string BezierCurveNames::AnchorsTag = std::string("Anchors");
6
const std::string BezierCurveNames::Anchors[] = {std::string("Anchor0"),
7
std::string("Anchor1"),
8
std::string("Anchor2"),
9
10
11
const std::string PercentileNames::TagName = std::string("PercentileLuminance");
12
const std::string PercentileNames::NumberOfPercentiles = std::string("NumberOfPercentiles");
13
+const std::string PercentileNames::DistributionIndex = std::string("DistributionIndex");
14
+const std::string PercentileNames::DistributionValues = std::string("DistributionValues");
15
const std::string PercentileNames::PercentilePercentageValue[] = {std::string("PercentilePercentage0"),
16
std::string("PercentilePercentage1"),
17
std::string("PercentilePercentage2"),
18
19
20
21
const std::string LuminanceNames::TagName = std::string("LuminanceParameters");
22
+const std::string LuminanceNames::LlcTagName = std::string("LuminanceDistributions");
23
const std::string LuminanceNames::AverageRGB = std::string("AverageRGB");
24
+const std::string LuminanceNames::MaxSCL = std::string("MaxScl");
25
const std::string LuminanceNames::MaxSCL0 = std::string("MaxScl0");
26
const std::string LuminanceNames::MaxSCL1 = std::string("MaxScl1");
27
const std::string LuminanceNames::MaxSCL2 = std::string("MaxScl2");
28
x265_2.7.tar.gz/source/dynamicHDR10/SeiMetadataDictionary.h -> x265_2.9.tar.gz/source/dynamicHDR10/SeiMetadataDictionary.h
Changed
28
1
2
static const std::string NumberOfAnchors;
3
static const std::string KneePointX;
4
static const std::string KneePointY;
5
+ static const std::string AnchorsTag;
6
static const std::string Anchors[14];
7
};
8
//Ellipse Selection Data
9
10
public:
11
static const std::string TagName;
12
static const std::string NumberOfPercentiles;
13
+ static const std::string DistributionIndex;
14
+ static const std::string DistributionValues;
15
static const std::string PercentilePercentageValue[15];
16
static const std::string PercentileLuminanceValue[15];
17
};
18
19
{
20
public:
21
static const std::string TagName;
22
+ static const std::string LlcTagName;
23
static const std::string AverageRGB;
24
+ static const std::string MaxSCL;
25
static const std::string MaxSCL0;
26
static const std::string MaxSCL1;
27
static const std::string MaxSCL2;
28
x265_2.7.tar.gz/source/dynamicHDR10/metadataFromJson.cpp -> x265_2.9.tar.gz/source/dynamicHDR10/metadataFromJson.cpp
Changed
534
1
2
int mCurrentStreamBit;
3
int mCurrentStreamByte;
4
5
- bool luminanceParamFromJson(const Json &data, LuminanceParameters &obj)
6
+ bool luminanceParamFromJson(const Json &data, LuminanceParameters &obj, const JsonType jsonType)
7
{
8
JsonObject lumJsonData = data.object_items();
9
if(!lumJsonData.empty())
10
{
11
- JsonObject percentileData = lumJsonData[PercentileNames::TagName].object_items();
12
- obj.order = percentileData[PercentileNames::NumberOfPercentiles].int_value();
13
-
14
- obj.averageLuminance = static_cast<float>(lumJsonData[LuminanceNames::AverageRGB].number_value());
15
- obj.maxRLuminance = static_cast<float>(lumJsonData[LuminanceNames::MaxSCL0].number_value());
16
- obj.maxGLuminance = static_cast<float>(lumJsonData[LuminanceNames::MaxSCL1].number_value());
17
- obj.maxBLuminance = static_cast<float>(lumJsonData[LuminanceNames::MaxSCL2].number_value());
18
-
19
- if(!percentileData.empty())
20
- {
21
- obj.percentiles.resize(obj.order);
22
- for(int i = 0; i < obj.order; ++i)
23
- {
24
- std::string percentileTag = PercentileNames::TagName;
25
- percentileTag += std::to_string(i);
26
- obj.percentiles[i] = static_cast<unsigned int>(percentileData[percentileTag].int_value());
27
- }
28
- }
29
-
30
- return true;
31
- }
32
- return false;
33
- }
34
-
35
- bool percentagesFromJson(const Json &data, std::vector<unsigned int> &percentages)
36
- {
37
- JsonObject jsonData = data.object_items();
38
- if(!jsonData.empty())
39
- {
40
- JsonObject percentileData = jsonData[PercentileNames::TagName].object_items();
41
- int order = percentileData[PercentileNames::NumberOfPercentiles].int_value();
42
-
43
- percentages.resize(order);
44
- for(int i = 0; i < order; ++i)
45
- {
46
- std::string percentileTag = PercentileNames::PercentilePercentageValue[i];
47
- percentages[i] = static_cast<unsigned int>(percentileData[percentileTag].int_value());
48
- }
49
-
50
- return true;
51
- }
52
+ switch(jsonType)
53
+ {
54
+ case LEGACY:
55
+ {
56
+ obj.averageLuminance = static_cast<float>(lumJsonData[LuminanceNames::AverageRGB].number_value());
57
+ obj.maxRLuminance = static_cast<float>(lumJsonData[LuminanceNames::MaxSCL0].number_value());
58
+ obj.maxGLuminance = static_cast<float>(lumJsonData[LuminanceNames::MaxSCL1].number_value());
59
+ obj.maxBLuminance = static_cast<float>(lumJsonData[LuminanceNames::MaxSCL2].number_value());
60
+
61
+ JsonObject percentileData = lumJsonData[PercentileNames::TagName].object_items();
62
+ obj.order = percentileData[PercentileNames::NumberOfPercentiles].int_value();
63
+ if(!percentileData.empty())
64
+ {
65
+ obj.percentiles.resize(obj.order);
66
+ for(int i = 0; i < obj.order; ++i)
67
+ {
68
+ std::string percentileTag = PercentileNames::TagName;
69
+ percentileTag += std::to_string(i);
70
+ obj.percentiles[i] = static_cast<unsigned int>(percentileData[percentileTag].int_value());
71
+ }
72
+ }
73
+ return true;
74
+ } break;
75
+ case LLC:
76
+ {
77
+ obj.averageLuminance = static_cast<float>(lumJsonData[LuminanceNames::AverageRGB].number_value());
78
+ JsonArray maxScl = lumJsonData[LuminanceNames::MaxSCL].array_items();
79
+ obj.maxRLuminance = static_cast<float>(maxScl[0].number_value());
80
+ obj.maxGLuminance = static_cast<float>(maxScl[1].number_value());
81
+ obj.maxBLuminance = static_cast<float>(maxScl[2].number_value());
82
+
83
+ JsonObject percentileData = lumJsonData[LuminanceNames::LlcTagName].object_items();
84
+ if(!percentileData.empty())
85
+ {
86
+ JsonArray distributionValues = percentileData[PercentileNames::DistributionValues].array_items();
87
+ obj.order = static_cast<int>(distributionValues.size());
88
+ obj.percentiles.resize(obj.order);
89
+ for(int i = 0; i < obj.order; ++i)
90
+ {
91
+ obj.percentiles[i] = static_cast<unsigned int>(distributionValues[i].int_value());
92
+ }
93
+ }
94
+ return true;
95
+ } break;
96
+ }
97
+ }
98
return false;
99
}
100
101
- bool percentagesFromJson(const Json &data, unsigned int *percentages)
102
+ bool percentagesFromJson(const Json &data, std::vector<unsigned int> &percentages, const JsonType jsonType)
103
{
104
JsonObject jsonData = data.object_items();
105
if(!jsonData.empty())
106
{
107
- JsonObject percentileData = jsonData[PercentileNames::TagName].object_items();
108
- int order = percentileData[PercentileNames::NumberOfPercentiles].int_value();
109
-
110
- for(int i = 0; i < order; ++i)
111
- {
112
- std::string percentileTag = PercentileNames::PercentilePercentageValue[i];
113
- percentages[i] = static_cast<unsigned int>(percentileData[percentileTag].int_value());
114
- }
115
+ switch(jsonType)
116
+ {
117
+ case LEGACY:
118
+ {
119
+ JsonObject percentileData = jsonData[PercentileNames::TagName].object_items();
120
+ int order = percentileData[PercentileNames::NumberOfPercentiles].int_value();
121
+ percentages.resize(order);
122
+ for(int i = 0; i < order; ++i)
123
+ {
124
+ std::string percentileTag = PercentileNames::PercentilePercentageValue[i];
125
+ percentages[i] = static_cast<unsigned int>(percentileData[percentileTag].int_value());
126
+ }
127
+ return true;
128
+ } break;
129
+ case LLC:
130
+ {
131
+ JsonObject percentileData = jsonData[LuminanceNames::LlcTagName].object_items();
132
+ if(!percentileData.empty())
133
+ {
134
+ JsonArray percentageValues = percentileData[PercentileNames::DistributionIndex].array_items();
135
+ int order = static_cast<int>(percentageValues.size());
136
+ percentages.resize(order);
137
+ for(int i = 0; i < order; ++i)
138
+ {
139
+ percentages[i] = static_cast<unsigned int>(percentageValues[i].int_value());
140
+ }
141
+ }
142
+ return true;
143
+ } break;
144
+ }
145
146
- return true;
147
}
148
return false;
149
}
150
151
- bool bezierCurveFromJson(const Json &data, BezierCurveData &obj)
152
+ bool bezierCurveFromJson(const Json &data, BezierCurveData &obj, const JsonType jsonType)
153
{
154
JsonObject jsonData = data.object_items();
155
if(!jsonData.empty())
156
{
157
- obj.order = jsonData[BezierCurveNames::NumberOfAnchors].int_value();
158
- obj.coeff.resize(obj.order);
159
- obj.sPx = jsonData[BezierCurveNames::KneePointX].int_value();
160
- obj.sPy = jsonData[BezierCurveNames::KneePointY].int_value();
161
- for(int i = 0; i < obj.order; ++i)
162
- {
163
- obj.coeff[i] = jsonData[BezierCurveNames::Anchors[i]].int_value();
164
- }
165
-
166
- return true;
167
+ switch(jsonType)
168
+ {
169
+ case LEGACY:
170
+ {
171
+ obj.sPx = jsonData[BezierCurveNames::KneePointX].int_value();
172
+ obj.sPy = jsonData[BezierCurveNames::KneePointY].int_value();
173
+ obj.order = jsonData[BezierCurveNames::NumberOfAnchors].int_value();
174
+ obj.coeff.resize(obj.order);
175
+ for(int i = 0; i < obj.order; ++i)
176
+ {
177
+ obj.coeff[i] = jsonData[BezierCurveNames::Anchors[i]].int_value();
178
+ }
179
+ return true;
180
+ } break;
181
+ case LLC:
182
+ {
183
+ obj.sPx = jsonData[BezierCurveNames::KneePointX].int_value();
184
+ obj.sPy = jsonData[BezierCurveNames::KneePointY].int_value();
185
+ JsonArray anchorValues = data[BezierCurveNames::AnchorsTag].array_items();
186
+ obj.order = static_cast<int>(anchorValues.size());
187
+ obj.coeff.resize(obj.order);
188
+ for(int i = 0; i < obj.order; ++i)
189
+ {
190
+ obj.coeff[i] = anchorValues[i].int_value();
191
+ }
192
+ return true;
193
+ } break;
194
+ }
195
}
196
return false;
197
}
198
199
void setPayloadSize(uint8_t *dataStream, int positionOnStream, int payload)
200
{
201
int payloadBytes = 1;
202
-
203
for(;payload >= 0xFF; payload -= 0xFF, ++payloadBytes);
204
-
205
if(payloadBytes > 1)
206
{
207
shiftData(dataStream, payloadBytes-1, mCurrentStreamByte, positionOnStream);
208
209
}
210
}
211
212
-// const std::string LocalParameters = std::string("LocalParameters");
213
-// const std::string TargetDisplayLuminance = std::string("TargetedSystemDisplayMaximumLuminance");
214
};
215
216
metadataFromJson::metadataFromJson() :
217
218
delete mPimpl;
219
}
220
221
-
222
bool metadataFromJson::frameMetadataFromJson(const char* filePath,
223
int frame,
224
uint8_t *&metadata)
225
{
226
std::string path(filePath);
227
JsonArray fileData = JsonHelper::readJsonArray(path);
228
-
229
+ JsonType jsonType = LEGACY;
230
if(fileData.empty())
231
{
232
- return false;
233
+ jsonType = LLC;
234
+ fileData = JsonHelper::readJson(filePath).at("SceneInfo").array_items();
235
}
236
237
// frame = frame + 1; //index on the array start at 0 frames starts at 1
238
239
}
240
241
int mSEIBytesToRead = 509;
242
-
243
if(metadata)
244
{
245
delete(metadata);
246
247
metadata = new uint8_t[mSEIBytesToRead];
248
mPimpl->mCurrentStreamBit = 8;
249
mPimpl->mCurrentStreamByte = 1;
250
+ memset(metadata, 0, mSEIBytesToRead);
251
252
- for(int j = 0; j < mSEIBytesToRead; ++j)
253
- {
254
- (metadata)[j] = 0;
255
- }
256
-
257
- fillMetadataArray(fileData, frame, metadata);
258
+ fillMetadataArray(fileData, frame, jsonType, metadata);
259
mPimpl->setPayloadSize(metadata, 0, mPimpl->mCurrentStreamByte);
260
return true;
261
}
262
263
{
264
std::string path(filePath);
265
JsonArray fileData = JsonHelper::readJsonArray(path);
266
+ JsonType jsonType = LEGACY;
267
if (fileData.empty())
268
{
269
- return -1;
270
+ jsonType = LLC;
271
+ fileData = JsonHelper::readJson(filePath).at("SceneInfo").array_items();
272
}
273
274
int numFrames = static_cast<int>(fileData.size());
275
276
for (int frame = 0; frame < numFrames; ++frame)
277
{
278
metadata[frame] = new uint8_t[509];
279
- for (int i = 0; i < 509; ++i)
280
- {
281
- metadata[frame][i] = 0;
282
- }
283
+ memset(metadata[frame], 0, 509);
284
mPimpl->mCurrentStreamBit = 8;
285
mPimpl->mCurrentStreamByte = 1;
286
287
- fillMetadataArray(fileData, frame, metadata[frame]);
288
-
289
+ fillMetadataArray(fileData, frame, jsonType, metadata[frame]);
290
mPimpl->setPayloadSize(metadata[frame], 0, mPimpl->mCurrentStreamByte);
291
-
292
}
293
294
return numFrames;
295
296
/* NOTE: We leave TWO BYTES of space for the payload */
297
mPimpl->mCurrentStreamByte += 2;
298
299
- fillMetadataArray(fileData, frame, metadata);
300
+ fillMetadataArray(fileData, frame, LEGACY, metadata);
301
302
/* Set payload in bytes 2 & 3 as indicated in Extended InfoFrame Type syntax */
303
metadata[2] = (mPimpl->mCurrentStreamByte & 0xFF00) >> 8;
304
305
306
int metadataFromJson::movieExtendedInfoFrameMetadataFromJson(const char* filePath, uint8_t **&metadata)
307
{
308
- std::string path(filePath);
309
+ std::string path(filePath);
310
JsonArray fileData = JsonHelper::readJsonArray(path);
311
if(fileData.empty())
312
{
313
314
{
315
metadata[frame] = new uint8_t[509];
316
for(int i = 0; i < 509; ++i)
317
- {
318
- metadata[frame][i] = 0;
319
- }
320
+ {
321
+ metadata[frame][i] = 0;
322
+ }
323
mPimpl->mCurrentStreamBit = 8;
324
mPimpl->mCurrentStreamByte = 0;
325
326
327
/* NOTE: We leave TWO BYTES of space for the payload */
328
mPimpl->mCurrentStreamByte += 2;
329
330
- fillMetadataArray(fileData, frame, metadata[frame]);
331
+ fillMetadataArray(fileData, frame, LEGACY, metadata[frame]);
332
333
/* Set payload in bytes 2 & 3 as indicated in Extended InfoFrame Type syntax */
334
metadata[frame][2] = (mPimpl->mCurrentStreamByte & 0xFF00) >> 8;
335
336
return numFrames;
337
}
338
339
-void metadataFromJson::fillMetadataArray(const JsonArray &fileData, int frame, uint8_t *&metadata)
340
+void metadataFromJson::fillMetadataArray(const JsonArray &fileData, int frame, const JsonType jsonType, uint8_t *&metadata)
341
{
342
const uint8_t countryCode = 0xB5;
343
const uint16_t terminalProviderCode = 0x003C;
344
345
mPimpl->appendBits(metadata, applicationIdentifier, 8);
346
mPimpl->appendBits(metadata, applicationVersion, 8);
347
348
- //Note: Validated only add up to two local selections, ignore the rest
349
- JsonArray jsonArray = fileData[frame][JsonDataKeys::LocalParameters].array_items();
350
- int ellipsesNum = static_cast<int>(jsonArray.size() > 2 ? 2 : jsonArray.size());
351
- uint16_t numWindows = (uint16_t)fileData[frame][JsonDataKeys::NumberOfWindows].int_value();
352
- mPimpl->appendBits(metadata, numWindows, 2);
353
- for (int i = 0; i < ellipsesNum; ++i)
354
+ uint16_t numWindows = 0;
355
+ /* HDR10+ LLC doesn't consider local windows */
356
+ if(jsonType & LLC)
357
+ {
358
+ numWindows = 1;
359
+ mPimpl->appendBits(metadata, numWindows, 2);
360
+ }
361
+ else
362
{
363
- mPimpl->appendBits(metadata, jsonArray[i][EllipseSelectionNames::WindowData]
364
- [EllipseSelectionNames::WindowUpperLeftCornerX].int_value(), 16);
365
- mPimpl->appendBits(metadata, jsonArray[i][EllipseSelectionNames::WindowData]
366
- [EllipseSelectionNames::WindowUpperLeftCornerY].int_value(), 16);
367
- mPimpl->appendBits(metadata, jsonArray[i][EllipseSelectionNames::WindowData]
368
- [EllipseSelectionNames::WindowLowerRightCornerX].int_value(), 16);
369
- mPimpl->appendBits(metadata, jsonArray[i][EllipseSelectionNames::WindowData]
370
- [EllipseSelectionNames::WindowLowerRightCornerY].int_value(), 16);
371
+ //Note: Validated only add up to two local selections, ignore the rest
372
+ JsonArray jsonArray = fileData[frame][JsonDataKeys::LocalParameters].array_items();
373
+ int ellipsesNum = static_cast<int>(jsonArray.size() > 2 ? 2 : jsonArray.size());
374
+ numWindows = (uint16_t)fileData[frame][JsonDataKeys::NumberOfWindows].int_value();
375
+ mPimpl->appendBits(metadata, numWindows, 2);
376
+ for (int i = 0; i < ellipsesNum; ++i)
377
+ {
378
+ mPimpl->appendBits(metadata, jsonArray[i][EllipseSelectionNames::WindowData]
379
+ [EllipseSelectionNames::WindowUpperLeftCornerX].int_value(), 16);
380
+ mPimpl->appendBits(metadata, jsonArray[i][EllipseSelectionNames::WindowData]
381
+ [EllipseSelectionNames::WindowUpperLeftCornerY].int_value(), 16);
382
+ mPimpl->appendBits(metadata, jsonArray[i][EllipseSelectionNames::WindowData]
383
+ [EllipseSelectionNames::WindowLowerRightCornerX].int_value(), 16);
384
+ mPimpl->appendBits(metadata, jsonArray[i][EllipseSelectionNames::WindowData]
385
+ [EllipseSelectionNames::WindowLowerRightCornerY].int_value(), 16);
386
387
- JsonObject ellipseJsonObject = jsonArray[i][EllipseNames::TagName].object_items();
388
+ JsonObject ellipseJsonObject = jsonArray[i][EllipseNames::TagName].object_items();
389
390
- mPimpl->appendBits(metadata,
391
- static_cast<uint16_t>(ellipseJsonObject[EllipseNames::CenterOfEllipseX].int_value()),
392
- 16);
393
+ mPimpl->appendBits(metadata,
394
+ static_cast<uint16_t>(ellipseJsonObject[EllipseNames::CenterOfEllipseX].int_value()),
395
+ 16);
396
397
- mPimpl->appendBits(metadata,
398
- static_cast<uint16_t>(ellipseJsonObject[EllipseNames::CenterOfEllipseY].int_value()),
399
- 16);
400
+ mPimpl->appendBits(metadata,
401
+ static_cast<uint16_t>(ellipseJsonObject[EllipseNames::CenterOfEllipseY].int_value()),
402
+ 16);
403
404
- int angle = ellipseJsonObject[EllipseNames::RotationAngle].int_value();
405
- uint8_t rotationAngle = static_cast<uint8_t>((angle > 180.0) ? angle - 180.0 : angle);
406
- mPimpl->appendBits(metadata, rotationAngle, 8);
407
+ int angle = ellipseJsonObject[EllipseNames::RotationAngle].int_value();
408
+ uint8_t rotationAngle = static_cast<uint8_t>((angle > 180.0) ? angle - 180.0 : angle);
409
+ mPimpl->appendBits(metadata, rotationAngle, 8);
410
411
- uint16_t semimajorExternalAxis =
412
- static_cast<uint16_t>(ellipseJsonObject[EllipseNames::SemiMajorAxisExternalEllipse].int_value());
413
+ uint16_t semimajorExternalAxis =
414
+ static_cast<uint16_t>(ellipseJsonObject[EllipseNames::SemiMajorAxisExternalEllipse].int_value());
415
416
- uint16_t semiminorExternalAxis =
417
- static_cast<uint16_t>(ellipseJsonObject[EllipseNames::SemiMinorAxisExternalEllipse].int_value());
418
+ uint16_t semiminorExternalAxis =
419
+ static_cast<uint16_t>(ellipseJsonObject[EllipseNames::SemiMinorAxisExternalEllipse].int_value());
420
421
- uint16_t semimajorInternalEllipse =
422
- static_cast<uint16_t>(ellipseJsonObject[EllipseNames::SemiMajorAxisInternalEllipse].int_value());
423
+ uint16_t semimajorInternalEllipse =
424
+ static_cast<uint16_t>(ellipseJsonObject[EllipseNames::SemiMajorAxisInternalEllipse].int_value());
425
426
- mPimpl->appendBits(metadata, semimajorInternalEllipse, 16);
427
+ mPimpl->appendBits(metadata, semimajorInternalEllipse, 16);
428
429
- mPimpl->appendBits(metadata, semimajorExternalAxis, 16);
430
- mPimpl->appendBits(metadata, semiminorExternalAxis, 16);
431
- uint8_t overlapProcessOption = static_cast<uint8_t>(ellipseJsonObject[EllipseNames::OverlapProcessOption].int_value());
432
- //TODO: Uses Layering method, the value is "1"
433
- mPimpl->appendBits(metadata, overlapProcessOption, 1);
434
+ mPimpl->appendBits(metadata, semimajorExternalAxis, 16);
435
+ mPimpl->appendBits(metadata, semiminorExternalAxis, 16);
436
+ uint8_t overlapProcessOption = static_cast<uint8_t>(ellipseJsonObject[EllipseNames::OverlapProcessOption].int_value());
437
+ //TODO: Uses Layering method, the value is "1"
438
+ mPimpl->appendBits(metadata, overlapProcessOption, 1);
439
+ }
440
}
441
+
442
/* Targeted System Display Data */
443
- uint32_t monitorPeak = fileData[frame][JsonDataKeys::TargetDisplayLuminance].int_value(); //500;
444
+ uint32_t monitorPeak = fileData[frame][JsonDataKeys::TargetDisplayLuminance].int_value();
445
mPimpl->appendBits(metadata, monitorPeak, 27);
446
- //NOTE: Set as false for now, as requested
447
+
448
uint8_t targetedSystemDisplayActualPeakLuminanceFlag = 0;
449
mPimpl->appendBits(metadata, targetedSystemDisplayActualPeakLuminanceFlag, 1);
450
if (targetedSystemDisplayActualPeakLuminanceFlag)
451
452
//TODO
453
}
454
455
- /* Max rgb values (maxScl)*/
456
+ /* Max RGB values (maxScl)*/
457
/* Luminance values/percentile for each window */
458
for (int w = 0; w < numWindows; ++w)
459
{
460
Json lumObj = fileData[frame][LuminanceNames::TagName];
461
LuminanceParameters luminanceData;
462
- if (!mPimpl->luminanceParamFromJson(lumObj, luminanceData))
463
+ if(!mPimpl->luminanceParamFromJson(lumObj, luminanceData, jsonType))
464
{
465
std::cout << "error parsing luminance parameters frame: " << w << std::endl;
466
}
467
468
- /* NOTE: Maxscl from 0 t 100,000 based on data that says in values of 0.00001
469
+ /* NOTE: Maxscl from 0 to 100,000 based on data that says in values of 0.00001
470
* one for each channel R,G,B
471
*/
472
-
473
mPimpl->appendBits(metadata, static_cast<uint8_t>(((int)luminanceData.maxRLuminance & 0x10000) >> 16), 1);
474
mPimpl->appendBits(metadata, static_cast<uint16_t>((int)luminanceData.maxRLuminance & 0xFFFF), 16);
475
mPimpl->appendBits(metadata, static_cast<uint8_t>(((int)luminanceData.maxGLuminance & 0x10000) >> 16), 1);
476
477
uint8_t numDistributionMaxrgbPercentiles = static_cast<uint8_t>(luminanceData.order);
478
mPimpl->appendBits(metadata, numDistributionMaxrgbPercentiles, 4);
479
480
- std::vector<unsigned int>percentilPercentages;
481
- mPimpl->percentagesFromJson(lumObj, percentilPercentages);
482
+ std::vector<unsigned int>percentilePercentages;
483
+ mPimpl->percentagesFromJson(lumObj, percentilePercentages, jsonType);
484
+
485
for (int i = 0; i < numDistributionMaxrgbPercentiles; ++i)
486
{
487
- uint8_t distributionMaxrgbPercentage = static_cast<uint8_t>(percentilPercentages.at(i));
488
+ uint8_t distributionMaxrgbPercentage = static_cast<uint8_t>(percentilePercentages.at(i));
489
mPimpl->appendBits(metadata, distributionMaxrgbPercentage, 7);
490
491
/* 17bits: 1bit then 16 */
492
493
}
494
495
/* 10bits: Fraction bright pixels */
496
- uint16_t fractionBrightPixels = 1;
497
+ uint16_t fractionBrightPixels = 0;
498
mPimpl->appendBits(metadata, fractionBrightPixels, 10);
499
500
}
501
502
/* Bezier Curve Data */
503
for (int w = 0; w < numWindows; ++w)
504
{
505
- uint8_t toneMappingFlag = 1;
506
+ uint8_t toneMappingFlag = 0;
507
/* Check if the window contains tone mapping bezier curve data and set toneMappingFlag appropriately */
508
- //Json bezierData = fileData[frame][BezierCurveNames::TagName];
509
BezierCurveData curveData;
510
/* Select curve data based on global window */
511
if (w == 0)
512
- {
513
- if (!mPimpl->bezierCurveFromJson(fileData[frame][BezierCurveNames::TagName], curveData))
514
+ {
515
+ if (mPimpl->bezierCurveFromJson(fileData[frame][BezierCurveNames::TagName], curveData, jsonType))
516
{
517
- toneMappingFlag = 0;
518
+ toneMappingFlag = 1;
519
}
520
}
521
- /* Select curve data based on local window */
522
+ /* Select curve data based on local window */
523
else
524
{
525
- if (!mPimpl->bezierCurveFromJson(jsonArray[w - 1][BezierCurveNames::TagName], curveData))
526
+ JsonArray jsonArray = fileData[frame][JsonDataKeys::LocalParameters].array_items();
527
+ if (mPimpl->bezierCurveFromJson(jsonArray[w - 1][BezierCurveNames::TagName], curveData, jsonType))
528
{
529
- toneMappingFlag = 0;
530
+ toneMappingFlag = 1;
531
}
532
}
533
mPimpl->appendBits(metadata, toneMappingFlag, 1);
534
x265_2.7.tar.gz/source/dynamicHDR10/metadataFromJson.h -> x265_2.9.tar.gz/source/dynamicHDR10/metadataFromJson.h
Changed
31
1
2
#define METADATAFROMJSON_H
3
4
#include<stdint.h>
5
-#include "string"
6
+#include<cstring>
7
#include "JsonHelper.h"
8
9
class metadataFromJson
10
11
metadataFromJson();
12
~metadataFromJson();
13
14
+ enum JsonType{
15
+ LEGACY,
16
+ LLC
17
+ };
18
+
19
20
/**
21
* @brief frameMetadataFromJson: Generates a sigle frame metadata array from Json file with all
22
23
24
class DynamicMetaIO;
25
DynamicMetaIO *mPimpl;
26
- void fillMetadataArray(const JsonArray &fileData, int frame, uint8_t *&metadata);
27
+ void fillMetadataArray(const JsonArray &fileData, int frame, const JsonType jsonType, uint8_t *&metadata);
28
};
29
30
#endif // METADATAFROMJSON_H
31
x265_2.7.tar.gz/source/encoder/analysis.cpp -> x265_2.9.tar.gz/source/encoder/analysis.cpp
Changed
634
1
2
using namespace X265_NS;
3
4
/* An explanation of rate distortion levels (--rd-level)
5
- *
6
+ *
7
* rd-level 0 generates no recon per CU (NO RDO or Quant)
8
*
9
* sa8d selection between merge / skip / inter / intra and split
10
11
for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
12
ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
13
}
14
- if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead)
15
+ if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && (m_slice->m_sliceType != I_SLICE))
16
{
17
- m_multipassAnalysis = (analysis2PassFrameData*)m_frame->m_analysis2Pass.analysisFramedata;
18
- m_multipassDepth = &m_multipassAnalysis->depth[ctu.m_cuAddr * ctu.m_numPartitions];
19
- if (m_slice->m_sliceType != I_SLICE)
20
+ int numPredDir = m_slice->isInterP() ? 1 : 2;
21
+ m_reuseInterDataCTU = m_frame->m_analysisData.interData;
22
+ for (int dir = 0; dir < numPredDir; dir++)
23
{
24
- int numPredDir = m_slice->isInterP() ? 1 : 2;
25
- for (int dir = 0; dir < numPredDir; dir++)
26
- {
27
- m_multipassMv[dir] = &m_multipassAnalysis->m_mv[dir][ctu.m_cuAddr * ctu.m_numPartitions];
28
- m_multipassMvpIdx[dir] = &m_multipassAnalysis->mvpIdx[dir][ctu.m_cuAddr * ctu.m_numPartitions];
29
- m_multipassRef[dir] = &m_multipassAnalysis->ref[dir][ctu.m_cuAddr * ctu.m_numPartitions];
30
- }
31
- m_multipassModes = &m_multipassAnalysis->modes[ctu.m_cuAddr * ctu.m_numPartitions];
32
+ m_reuseMv[dir] = &m_reuseInterDataCTU->mv[dir][ctu.m_cuAddr * ctu.m_numPartitions];
33
+ m_reuseMvpIdx[dir] = &m_reuseInterDataCTU->mvpIdx[dir][ctu.m_cuAddr * ctu.m_numPartitions];
34
}
35
+ m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * ctu.m_numPartitions];
36
+ m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
37
+ m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
38
}
39
-
40
+
41
if ((m_param->analysisSave || m_param->analysisLoad) && m_slice->m_sliceType != I_SLICE && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel < 10)
42
{
43
int numPredDir = m_slice->isInterP() ? 1 : 2;
44
- m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
45
+ m_reuseInterDataCTU = m_frame->m_analysisData.interData;
46
m_reuseRef = &m_reuseInterDataCTU->ref [ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
47
m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
48
m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
49
50
51
if (m_slice->m_sliceType == I_SLICE)
52
{
53
- analysis_intra_data* intraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
54
+ x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
55
if (m_param->analysisLoad && m_param->analysisReuseLevel > 1)
56
{
57
memcpy(ctu.m_cuDepth, &intraDataCTU->depth[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
58
59
60
if (bCopyAnalysis)
61
{
62
- analysis_inter_data* interDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
63
+ x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
64
int posCTU = ctu.m_cuAddr * numPartition;
65
memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
66
memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
67
68
69
if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !m_param->bMVType)
70
{
71
- analysis_intra_data* intraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
72
+ x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
73
memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
74
memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
75
}
76
77
}
78
else if ((m_param->analysisLoad && m_param->analysisReuseLevel == 10) || ((m_param->bMVType == AVC_INFO) && m_param->analysisReuseLevel >= 7 && ctu.m_numPartitions <= 16))
79
{
80
- analysis_inter_data* interDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
81
+ x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
82
int posCTU = ctu.m_cuAddr * numPartition;
83
memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
84
memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
85
memcpy(ctu.m_partSize, &interDataCTU->partSize[posCTU], sizeof(uint8_t) * numPartition);
86
if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !(m_param->bMVType == AVC_INFO))
87
{
88
- analysis_intra_data* intraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
89
+ x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
90
memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
91
memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
92
}
93
94
bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
95
bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
96
97
- bool bAlreadyDecided = parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX;
98
- bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
99
+ bool bAlreadyDecided = m_param->intraRefine != 4 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX;
100
+ bool bDecidedDepth = m_param->intraRefine != 4 && parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
101
int split = 0;
102
- if (m_param->intraRefine)
103
+ if (m_param->intraRefine && m_param->intraRefine != 4)
104
{
105
- split = ((cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1)) && bDecidedDepth);
106
+ split = m_param->scaleFactor && bDecidedDepth && (!mightNotSplit ||
107
+ ((cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
108
if (cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize]) && !bDecidedDepth)
109
bAlreadyDecided = false;
110
}
111
112
if (bAlreadyDecided)
113
{
114
- if (bDecidedDepth)
115
+ if (bDecidedDepth && mightNotSplit)
116
{
117
Mode& mode = md.pred[0];
118
md.bestMode = &mode;
119
120
121
if (m_evaluateInter)
122
{
123
- if (m_param->interRefine == 2)
124
+ if (m_refineLevel == 2)
125
{
126
if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)
127
skipModes = true;
128
129
}
130
}
131
}
132
- if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_multipassAnalysis)
133
+ if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
134
{
135
- if (mightNotSplit && depth == m_multipassDepth[cuGeom.absPartIdx])
136
+ if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
137
{
138
- if (m_multipassModes[cuGeom.absPartIdx] == MODE_SKIP)
139
+ if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
140
{
141
md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
142
md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
143
144
md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
145
checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
146
if (m_param->rdLevel)
147
- skipModes = (m_param->bEnableEarlySkip || m_param->interRefine == 2)
148
+ skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2)
149
&& md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
150
}
151
if (md.bestMode && m_param->bEnableRecursionSkip && !bCtuInfoCheck && !(m_param->bMVType && m_param->analysisReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
152
153
154
if (m_evaluateInter)
155
{
156
- if (m_param->interRefine == 2)
157
+ if (m_refineLevel == 2)
158
{
159
if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)
160
skipModes = true;
161
162
}
163
}
164
165
- if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_multipassAnalysis)
166
+ if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
167
{
168
- if (mightNotSplit && depth == m_multipassDepth[cuGeom.absPartIdx])
169
+ if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
170
{
171
- if (m_multipassModes[cuGeom.absPartIdx] == MODE_SKIP)
172
+ if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
173
{
174
md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
175
md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
176
177
md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
178
md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
179
checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
180
- skipModes = (m_param->bEnableEarlySkip || m_param->interRefine == 2) &&
181
+ skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2) &&
182
md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
183
refMasks[0] = allSplitRefs;
184
md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
185
186
bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
187
bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
188
189
- int split = (m_param->interRefine && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1) && bDecidedDepth);
190
+ TrainingData td;
191
+ td.init(parentCTU, cuGeom);
192
193
- if (bDecidedDepth)
194
+ if (!m_param->bDynamicRefine)
195
+ m_refineLevel = m_param->interRefine;
196
+ else
197
+ m_refineLevel = m_frame->m_classifyFrame ? 1 : 3;
198
+ int split = (m_param->scaleFactor && bDecidedDepth && (!mightNotSplit ||
199
+ (m_refineLevel && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
200
+ td.split = split;
201
+
202
+ if (bDecidedDepth && mightNotSplit)
203
{
204
setLambdaFromQP(parentCTU, qp, lqp);
205
206
207
md.bestMode = &mode;
208
mode.cu.initSubCU(parentCTU, cuGeom, qp);
209
PartSize size = (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx];
210
- if (parentCTU.isIntra(cuGeom.absPartIdx) && m_param->interRefine < 2)
211
+ if (parentCTU.isIntra(cuGeom.absPartIdx) && m_refineLevel < 2)
212
{
213
- bool reuseModes = !((m_param->intraRefine == 3) ||
214
- (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX));
215
- if (reuseModes)
216
+ if (m_param->intraRefine == 4)
217
+ compressIntraCU(parentCTU, cuGeom, qp);
218
+ else
219
{
220
- memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
221
- memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
222
+ bool reuseModes = !((m_param->intraRefine == 3) ||
223
+ (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX));
224
+ if (reuseModes)
225
+ {
226
+ memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
227
+ memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
228
+ }
229
+ checkIntra(mode, cuGeom, size);
230
}
231
- checkIntra(mode, cuGeom, size);
232
}
233
- else if (!parentCTU.isIntra(cuGeom.absPartIdx) && m_param->interRefine < 2)
234
+ else if (!parentCTU.isIntra(cuGeom.absPartIdx) && m_refineLevel < 2)
235
{
236
mode.cu.copyFromPic(parentCTU, cuGeom, m_csp, false);
237
uint32_t numPU = parentCTU.getNumPartInter(cuGeom.absPartIdx);
238
for (uint32_t part = 0; part < numPU; part++)
239
{
240
PredictionUnit pu(mode.cu, cuGeom, part);
241
- if (m_param->analysisReuseLevel >= 7)
242
+ if ((m_param->analysisLoad && m_param->analysisReuseLevel == 10) || (m_param->bMVType == AVC_INFO && m_param->analysisReuseLevel >= 7))
243
{
244
- analysis_inter_data* interDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
245
+ x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
246
int cuIdx = (mode.cu.m_cuAddr * parentCTU.m_numPartitions) + cuGeom.absPartIdx;
247
mode.cu.m_mergeFlag[pu.puAbsPartIdx] = interDataCTU->mergeFlag[cuIdx + part];
248
mode.cu.setPUInterDir(interDataCTU->interDir[cuIdx + part], pu.puAbsPartIdx, part);
249
for (int list = 0; list < m_slice->isInterB() + 1; list++)
250
{
251
- mode.cu.setPUMv(list, interDataCTU->mv[list][cuIdx + part], pu.puAbsPartIdx, part);
252
+ mode.cu.setPUMv(list, interDataCTU->mv[list][cuIdx + part].word, pu.puAbsPartIdx, part);
253
mode.cu.setPURefIdx(list, interDataCTU->refIdx[list][cuIdx + part], pu.puAbsPartIdx, part);
254
mode.cu.m_mvpIdx[list][pu.puAbsPartIdx] = interDataCTU->mvpIdx[list][cuIdx + part];
255
}
256
if (!mode.cu.m_mergeFlag[pu.puAbsPartIdx])
257
{
258
- if (m_param->mvRefine)
259
+ if (m_param->mvRefine || m_param->interRefine == 1)
260
m_me.setSourcePU(*mode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, false);
261
//AMVP
262
MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
263
264
int ref = mode.cu.m_refIdx[list][pu.puAbsPartIdx];
265
if (ref == -1)
266
continue;
267
- mode.cu.getPMV(mode.interNeighbours, list, ref, mode.amvpCand[list][ref], mvc);
268
- MV mvp = mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]];
269
- if (m_param->mvRefine)
270
+ MV mvp;
271
+
272
+ int numMvc = mode.cu.getPMV(mode.interNeighbours, list, ref, mode.amvpCand[list][ref], mvc);
273
+ if (m_param->interRefine != 1)
274
+ mvp = mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]];
275
+ else
276
+ mvp = interDataCTU->mv[list][cuIdx + part].word;
277
+ if (m_param->mvRefine || m_param->interRefine == 1)
278
{
279
MV outmv;
280
- searchMV(mode, pu, list, ref, outmv);
281
+ searchMV(mode, pu, list, ref, outmv, mvp, numMvc, mvc);
282
mode.cu.setPUMv(list, outmv, pu.puAbsPartIdx, part);
283
}
284
- mode.cu.m_mvd[list][pu.puAbsPartIdx] = mode.cu.m_mv[list][pu.puAbsPartIdx] - mvp;
285
+ mode.cu.m_mvd[list][pu.puAbsPartIdx] = mode.cu.m_mv[list][pu.puAbsPartIdx] - mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]]/*mvp*/;
286
}
287
}
288
- else if(m_param->scaleFactor)
289
+ else
290
{
291
MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
292
uint8_t candDir[MRG_MAX_NUM_CANDS];
293
mode.cu.getInterMergeCandidates(pu.puAbsPartIdx, part, candMvField, candDir);
294
uint8_t mvpIdx = mode.cu.m_mvpIdx[0][pu.puAbsPartIdx];
295
+ if (mode.cu.isBipredRestriction())
296
+ {
297
+ /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */
298
+ if (candDir[mvpIdx] == 3)
299
+ {
300
+ candDir[mvpIdx] = 1;
301
+ candMvField[mvpIdx][1].refIdx = REF_NOT_VALID;
302
+ }
303
+ }
304
mode.cu.setPUInterDir(candDir[mvpIdx], pu.puAbsPartIdx, part);
305
mode.cu.setPUMv(0, candMvField[mvpIdx][0].mv, pu.puAbsPartIdx, part);
306
mode.cu.setPUMv(1, candMvField[mvpIdx][1].mv, pu.puAbsPartIdx, part);
307
308
}
309
motionCompensation(mode.cu, pu, mode.predYuv, true, (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));
310
}
311
- if (!m_param->interRefine && parentCTU.isSkipped(cuGeom.absPartIdx))
312
+ if (!m_param->interRefine && !m_param->bDynamicRefine && parentCTU.isSkipped(cuGeom.absPartIdx))
313
encodeResAndCalcRdSkipCU(mode);
314
else
315
encodeResAndCalcRdInterCU(mode, cuGeom);
316
317
checkDQP(mode, cuGeom);
318
}
319
320
- if (m_param->interRefine < 2)
321
+ if (m_refineLevel < 2)
322
{
323
if (m_bTryLossless)
324
tryLossless(cuGeom);
325
326
}
327
}
328
329
- if (m_param->interRefine > 1 || (m_param->interRefine && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP && !mode.cu.isSkipped(0)))
330
+ if (m_param->bDynamicRefine)
331
+ classifyCU(parentCTU,cuGeom, *md.bestMode, td);
332
+
333
+ if (m_refineLevel > 1 || (m_refineLevel && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP && !mode.cu.isSkipped(0)))
334
{
335
m_evaluateInter = 1;
336
m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, cuGeom, qp) : compressInterCU_rd0_4(parentCTU, cuGeom, qp);
337
338
else
339
updateModeCost(*splitPred);
340
341
- if (m_param->interRefine)
342
+ if (m_refineLevel)
343
{
344
if (m_param->rdLevel > 1)
345
checkBestMode(*splitPred, cuGeom.depth);
346
347
md.bestMode->cu.copyToPic(depth);
348
md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
349
}
350
+ if (m_param->bDynamicRefine && bDecidedDepth)
351
+ trainCU(parentCTU, cuGeom, *md.bestMode, td);
352
+}
353
+
354
+void Analysis::classifyCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData)
355
+{
356
+ uint32_t depth = cuGeom.depth;
357
+ trainData.cuVariance = calculateCUVariance(ctu, cuGeom);
358
+ if (m_frame->m_classifyFrame)
359
+ {
360
+ uint64_t diffRefine[X265_REFINE_INTER_LEVELS];
361
+ uint64_t diffRefineRd[X265_REFINE_INTER_LEVELS];
362
+ float probRefine[X265_REFINE_INTER_LEVELS] = { 0 };
363
+ uint8_t varRefineLevel = 1;
364
+ uint8_t rdRefineLevel = 1;
365
+ uint64_t cuCost = bestMode.rdCost;
366
+ int offset = (depth * X265_REFINE_INTER_LEVELS);
367
+ if (cuCost < m_frame->m_classifyRd[offset])
368
+ m_refineLevel = 1;
369
+ else
370
+ {
371
+ uint64_t trainingCount = 0;
372
+ for (uint8_t i = 0; i < X265_REFINE_INTER_LEVELS; i++)
373
+ {
374
+ offset = (depth * X265_REFINE_INTER_LEVELS) + i;
375
+ trainingCount += m_frame->m_classifyCount[offset];
376
+ }
377
+ for (uint8_t i = 0; i < X265_REFINE_INTER_LEVELS; i++)
378
+ {
379
+ offset = (depth * X265_REFINE_INTER_LEVELS) + i;
380
+ /* Calculate distance values */
381
+ diffRefine[i] = abs((int64_t)(trainData.cuVariance - m_frame->m_classifyVariance[offset]));
382
+ diffRefineRd[i] = abs((int64_t)(cuCost - m_frame->m_classifyRd[offset]));
383
+
384
+ /* Calculate prior probability - ranges between 0 and 1 */
385
+ if (trainingCount)
386
+ probRefine[i] = ((float)m_frame->m_classifyCount[offset] / (float)trainingCount);
387
+
388
+ /* Bayesian classification - P(c|x)P(x) = P(x|c)P(c)
389
+ P(c|x) is the posterior probability of class given predictor.
390
+ P(c) is the prior probability of class.
391
+ P(x|c) is the likelihood which is the probability of predictor given class.
392
+ P(x) is the prior probability of predictor.*/
393
+ int curRefineLevel = m_refineLevel - 1;
394
+ if ((diffRefine[i] * probRefine[curRefineLevel]) < (diffRefine[curRefineLevel] * probRefine[i]))
395
+ varRefineLevel = i + 1;
396
+ if ((diffRefineRd[i] * probRefine[curRefineLevel]) < (diffRefineRd[curRefineLevel] * probRefine[i]))
397
+ rdRefineLevel = i + 1;
398
+ }
399
+ m_refineLevel = X265_MAX(varRefineLevel, rdRefineLevel);
400
+ }
401
+ }
402
+}
403
+
404
+void Analysis::trainCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData)
405
+{
406
+ uint32_t depth = cuGeom.depth;
407
+ int classify = 1;
408
+ if (!m_frame->m_classifyFrame)
409
+ {
410
+ /* classify = 1 : CUs for which the save data matches with that after encoding with refine-inter 3
411
+ and CUs that has split.
412
+ classify = 2 : CUs which are encoded as simple modes (Skip/Merge/2Nx2N).
413
+ classify = 3 : CUs encoded as any other mode. */
414
+
415
+ bool refineInter0 = (trainData.predMode == ctu.m_predMode[cuGeom.absPartIdx] &&
416
+ trainData.partSize == ctu.m_partSize[cuGeom.absPartIdx] &&
417
+ trainData.mergeFlag == ctu.m_mergeFlag[cuGeom.absPartIdx]);
418
+ bool refineInter1 = (depth == m_param->maxCUDepth - 1) && trainData.split;
419
+ if (refineInter0 || refineInter1)
420
+ classify = 1;
421
+ else if (trainData.partSize == SIZE_2Nx2N && trainData.partSize == ctu.m_partSize[cuGeom.absPartIdx])
422
+ classify = 2;
423
+ else
424
+ classify = 3;
425
+ }
426
+ else
427
+ classify = m_refineLevel;
428
+ uint64_t cuCost = bestMode.rdCost;
429
+ int offset = (depth * X265_REFINE_INTER_LEVELS) + classify - 1;
430
+ ctu.m_collectCURd[offset] += cuCost;
431
+ ctu.m_collectCUVariance[offset] += trainData.cuVariance;
432
+ ctu.m_collectCUCount[offset]++;
433
}
434
435
/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
436
437
}
438
}
439
440
- if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_multipassAnalysis)
441
+ if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
442
{
443
uint32_t numPU = interMode.cu.getNumPartInter(0);
444
for (uint32_t part = 0; part < numPU; part++)
445
446
MotionData* bestME = interMode.bestME[part];
447
for (int32_t i = 0; i < numPredDir; i++)
448
{
449
- bestME[i].ref = m_multipassRef[i][cuGeom.absPartIdx];
450
- bestME[i].mv = m_multipassMv[i][cuGeom.absPartIdx];
451
- bestME[i].mvpIdx = m_multipassMvpIdx[i][cuGeom.absPartIdx];
452
+ int* ref = &m_reuseRef[i * m_frame->m_analysisData.numPartitions * m_frame->m_analysisData.numCUsInFrame];
453
+ bestME[i].ref = ref[cuGeom.absPartIdx];
454
+ bestME[i].mv = m_reuseMv[i][cuGeom.absPartIdx].word;
455
+ bestME[i].mvpIdx = m_reuseMvpIdx[i][cuGeom.absPartIdx];
456
}
457
}
458
}
459
460
}
461
}
462
463
- if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_multipassAnalysis)
464
+ if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
465
{
466
uint32_t numPU = interMode.cu.getNumPartInter(0);
467
for (uint32_t part = 0; part < numPU; part++)
468
469
MotionData* bestME = interMode.bestME[part];
470
for (int32_t i = 0; i < numPredDir; i++)
471
{
472
- bestME[i].ref = m_multipassRef[i][cuGeom.absPartIdx];
473
- bestME[i].mv = m_multipassMv[i][cuGeom.absPartIdx];
474
- bestME[i].mvpIdx = m_multipassMvpIdx[i][cuGeom.absPartIdx];
475
+ int* ref = &m_reuseRef[i * m_frame->m_analysisData.numPartitions * m_frame->m_analysisData.numCUsInFrame];
476
+ bestME[i].ref = ref[cuGeom.absPartIdx];
477
+ bestME[i].mv = m_reuseMv[i][cuGeom.absPartIdx].word;
478
+ bestME[i].mvpIdx = m_reuseMvpIdx[i][cuGeom.absPartIdx];
479
}
480
}
481
}
482
483
pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
484
pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
485
intptr_t refStride = m_slice->m_mref[0][0].lumaStride;
486
-
487
- primitives.pu[partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
488
+ primitives.pu[partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
489
zsa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
490
}
491
-
492
uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
493
uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
494
uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
495
496
* resiYuv. Generate the recon pixels by adding it to the prediction */
497
498
if (cu.m_cbf[0][0])
499
- primitives.cu[sizeIdx].add_ps(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
500
- predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
501
+ {
502
+ bool reconPicAlign = (reconPic.m_cuOffsetY[cu.m_cuAddr] + reconPic.m_buOffsetY[absPartIdx]) % 64 == 0;
503
+ bool predYalign = predYuv.getAddrOffset(absPartIdx, predYuv.m_size) % 64 == 0;
504
+ primitives.cu[sizeIdx].add_ps[reconPicAlign && predYalign && (reconPic.m_stride % 64 == 0) && (predYuv.m_size % 64 == 0) &&
505
+ (resiYuv.m_size % 64 == 0)](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride, predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
506
+ }
507
else
508
primitives.cu[sizeIdx].copy_pp(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
509
predY, predYuv.m_size);
510
511
{
512
pixel* predU = predYuv.getCbAddr(absPartIdx);
513
pixel* predV = predYuv.getCrAddr(absPartIdx);
514
- if (cu.m_cbf[1][0])
515
- primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
516
- predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
517
+ if (cu.m_cbf[1][0])
518
+ {
519
+ bool reconPicAlign = (reconPic.m_cuOffsetC[cu.m_cuAddr] + reconPic.m_buOffsetC[absPartIdx]) % 64 == 0;
520
+ bool predUalign = predYuv.getChromaAddrOffset(absPartIdx) % 64 == 0;
521
+ primitives.chroma[m_csp].cu[sizeIdx].add_ps[reconPicAlign && predUalign && (reconPic.m_strideC % 64 == 0) && (predYuv.m_csize % 64 == 0) &&
522
+ (resiYuv.m_csize % 64 == 0)](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
523
+ }
524
else
525
primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
526
predU, predYuv.m_csize);
527
528
if (cu.m_cbf[2][0])
529
- primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
530
- predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
531
+ {
532
+ bool reconPicAlign = (reconPic.m_cuOffsetC[cu.m_cuAddr] + reconPic.m_buOffsetC[absPartIdx]) % 64 == 0;
533
+ bool predValign = predYuv.getChromaAddrOffset(absPartIdx) % 64 == 0;
534
+ primitives.chroma[m_csp].cu[sizeIdx].add_ps[reconPicAlign && predValign && (reconPic.m_strideC % 64 == 0) && (predYuv.m_csize % 64 == 0) &&
535
+ (resiYuv.m_csize % 64 == 0)](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
536
+ }
537
else
538
primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
539
predV, predYuv.m_csize);
540
541
return false;
542
}
543
544
+uint32_t Analysis::calculateCUVariance(const CUData& ctu, const CUGeom& cuGeom)
545
+{
546
+ uint32_t cuVariance = 0;
547
+ uint32_t *blockVariance = m_frame->m_lowres.blockVariance;
548
+ int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
549
+
550
+ uint32_t width = m_frame->m_fencPic->m_picWidth;
551
+ uint32_t height = m_frame->m_fencPic->m_picHeight;
552
+ uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
553
+ uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
554
+ uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
555
+ uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
556
+ uint32_t cnt = 0;
557
+
558
+ for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
559
+ {
560
+ for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
561
+ {
562
+ uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
563
+ cuVariance += blockVariance[idx];
564
+ cnt++;
565
+ }
566
+ }
567
+
568
+ return cuVariance / cnt;
569
+}
570
+
571
int Analysis::calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, int32_t complexCheck, double baseQp)
572
{
573
FrameData& curEncData = *m_frame->m_encData;
574
575
576
if (m_param->analysisMultiPassDistortion && m_param->rc.bStatRead)
577
{
578
- m_multipassAnalysis = (analysis2PassFrameData*)m_frame->m_analysis2Pass.analysisFramedata;
579
- if ((m_multipassAnalysis->threshold[ctu.m_cuAddr] < 0.9 || m_multipassAnalysis->threshold[ctu.m_cuAddr] > 1.1)
580
- && m_multipassAnalysis->highDistortionCtuCount && m_multipassAnalysis->lowDistortionCtuCount)
581
- qp += m_multipassAnalysis->offset[ctu.m_cuAddr];
582
+ x265_analysis_distortion_data* distortionData = m_frame->m_analysisData.distortionData;
583
+ if ((distortionData->threshold[ctu.m_cuAddr] < 0.9 || distortionData->threshold[ctu.m_cuAddr] > 1.1)
584
+ && distortionData->highDistortionCtuCount && distortionData->lowDistortionCtuCount)
585
+ qp += distortionData->offset[ctu.m_cuAddr];
586
}
587
588
- int loopIncr;
589
- if (m_param->rc.qgSize == 8)
590
- loopIncr = 8;
591
- else
592
- loopIncr = 16;
593
+ int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
594
+
595
/* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
596
bool isReferenced = IS_REFERENCED(m_frame);
597
- double *qpoffs;
598
- if (complexCheck)
599
- qpoffs = m_frame->m_lowres.qpAqOffset;
600
- else
601
- qpoffs = (isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
602
+ double *qpoffs = (isReferenced && m_param->rc.cuTree && !complexCheck) ? m_frame->m_lowres.qpCuTreeOffset :
603
+ m_frame->m_lowres.qpAqOffset;
604
if (qpoffs)
605
{
606
uint32_t width = m_frame->m_fencPic->m_picWidth;
607
608
uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
609
double qp_offset = 0;
610
uint32_t cnt = 0;
611
- uint32_t idx;
612
-
613
for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
614
{
615
for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
616
{
617
- idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
618
+ uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
619
qp_offset += qpoffs[idx];
620
cnt++;
621
}
622
623
int32_t offset = (int32_t)(qp_offset * 100 + .5);
624
double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
625
int32_t max_threshold = (int32_t)(threshold * 100 + .5);
626
- if (offset < max_threshold)
627
- return 1;
628
- else
629
- return 0;
630
+ return (offset < max_threshold);
631
}
632
}
633
634
x265_2.7.tar.gz/source/encoder/analysis.h -> x265_2.9.tar.gz/source/encoder/analysis.h
Changed
69
1
2
3
protected:
4
/* Analysis data for save/load mode, writes/reads data based on absPartIdx */
5
- analysis_inter_data* m_reuseInterDataCTU;
6
- int32_t* m_reuseRef;
7
- uint8_t* m_reuseDepth;
8
- uint8_t* m_reuseModes;
9
- uint8_t* m_reusePartSize;
10
- uint8_t* m_reuseMergeFlag;
11
+ x265_analysis_inter_data* m_reuseInterDataCTU;
12
+ int32_t* m_reuseRef;
13
+ uint8_t* m_reuseDepth;
14
+ uint8_t* m_reuseModes;
15
+ uint8_t* m_reusePartSize;
16
+ uint8_t* m_reuseMergeFlag;
17
+ x265_analysis_MV* m_reuseMv[2];
18
+ uint8_t* m_reuseMvpIdx[2];
19
20
uint32_t m_splitRefIdx[4];
21
uint64_t* cacheCost;
22
23
-
24
- analysis2PassFrameData* m_multipassAnalysis;
25
- uint8_t* m_multipassDepth;
26
- MV* m_multipassMv[2];
27
- int* m_multipassMvpIdx[2];
28
- int32_t* m_multipassRef[2];
29
- uint8_t* m_multipassModes;
30
-
31
uint8_t m_evaluateInter;
32
+ int32_t m_refineLevel;
33
+
34
uint8_t* m_additionalCtuInfo;
35
int* m_prevCtuInfoChange;
36
+
37
+ struct TrainingData
38
+ {
39
+ uint32_t cuVariance;
40
+ uint8_t predMode;
41
+ uint8_t partSize;
42
+ uint8_t mergeFlag;
43
+ int split;
44
+
45
+ void init(const CUData& parentCTU, const CUGeom& cuGeom)
46
+ {
47
+ cuVariance = 0;
48
+ predMode = parentCTU.m_predMode[cuGeom.absPartIdx];
49
+ partSize = parentCTU.m_partSize[cuGeom.absPartIdx];
50
+ mergeFlag = parentCTU.m_mergeFlag[cuGeom.absPartIdx];
51
+ split = 0;
52
+ }
53
+ };
54
+
55
/* refine RD based on QP for rd-levels 5 and 6 */
56
void qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp);
57
58
59
void encodeResidue(const CUData& parentCTU, const CUGeom& cuGeom);
60
61
int calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, int32_t complexCheck = 0, double baseQP = -1);
62
+ uint32_t calculateCUVariance(const CUData& ctu, const CUGeom& cuGeom);
63
+
64
+ void classifyCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData);
65
+ void trainCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData);
66
67
void calculateNormFactor(CUData& ctu, int qp);
68
void normFactor(const pixel* src, uint32_t blockSize, CUData& ctu, int qp, TextType ttype);
69
x265_2.7.tar.gz/source/encoder/api.cpp -> x265_2.9.tar.gz/source/encoder/api.cpp
Changed
674
1
2
#include "nal.h"
3
#include "bitcost.h"
4
5
+#if ENABLE_LIBVMAF
6
+#include "libvmaf.h"
7
+#endif
8
+
9
/* multilib namespace reflectors */
10
#if LINKED_8BIT
11
namespace x265_8bit {
12
13
pic_in->analysisData.wt = NULL;
14
pic_in->analysisData.intraData = NULL;
15
pic_in->analysisData.interData = NULL;
16
- pic_in->analysis2Pass.analysisFramedata = NULL;
17
+ pic_in->analysisData.distortionData = NULL;
18
}
19
20
- if (pp_nal && numEncoded > 0)
21
+ if (pp_nal && numEncoded > 0 && encoder->m_outputCount >= encoder->m_latestParam->chunkStart)
22
{
23
*pp_nal = &encoder->m_nalList.m_nal[0];
24
if (pi_nal) *pi_nal = encoder->m_nalList.m_numNal;
25
26
else if (pi_nal)
27
*pi_nal = 0;
28
29
- if (numEncoded && encoder->m_param->csvLogLevel)
30
+ if (numEncoded && encoder->m_param->csvLogLevel && encoder->m_outputCount >= encoder->m_latestParam->chunkStart)
31
x265_csvlog_frame(encoder->m_param, pic_out);
32
33
if (numEncoded < 0)
34
35
encoder->fetchStats(outputStats, statsSizeBytes);
36
}
37
}
38
+#if ENABLE_LIBVMAF
39
+void x265_vmaf_encoder_log(x265_encoder* enc, int argc, char **argv, x265_param *param, x265_vmaf_data *vmafdata)
40
+{
41
+ if (enc)
42
+ {
43
+ Encoder *encoder = static_cast<Encoder*>(enc);
44
+ x265_stats stats;
45
+ stats.aggregateVmafScore = x265_calculate_vmafscore(param, vmafdata);
46
+ if(vmafdata->reference_file)
47
+ fclose(vmafdata->reference_file);
48
+ if(vmafdata->distorted_file)
49
+ fclose(vmafdata->distorted_file);
50
+ if(vmafdata)
51
+ x265_free(vmafdata);
52
+ encoder->fetchStats(&stats, sizeof(stats));
53
+ int padx = encoder->m_sps.conformanceWindow.rightOffset;
54
+ int pady = encoder->m_sps.conformanceWindow.bottomOffset;
55
+ x265_csvlog_encode(encoder->m_param, &stats, padx, pady, argc, argv);
56
+ }
57
+}
58
+#endif
59
60
void x265_encoder_log(x265_encoder* enc, int argc, char **argv)
61
{
62
if (enc)
63
{
64
Encoder *encoder = static_cast<Encoder*>(enc);
65
- x265_stats stats;
66
+ x265_stats stats;
67
encoder->fetchStats(&stats, sizeof(stats));
68
int padx = encoder->m_sps.conformanceWindow.rightOffset;
69
int pady = encoder->m_sps.conformanceWindow.bottomOffset;
70
71
return -1;
72
}
73
74
+void x265_alloc_analysis_data(x265_param *param, x265_analysis_data* analysis)
75
+{
76
+ x265_analysis_inter_data *interData = analysis->interData = NULL;
77
+ x265_analysis_intra_data *intraData = analysis->intraData = NULL;
78
+ x265_analysis_distortion_data *distortionData = analysis->distortionData = NULL;
79
+ bool isVbv = param->rc.vbvMaxBitrate > 0 && param->rc.vbvBufferSize > 0;
80
+ int numDir = 2; //irrespective of P or B slices set direction as 2
81
+ uint32_t numPlanes = param->internalCsp == X265_CSP_I400 ? 1 : 3;
82
+
83
+#if X265_DEPTH < 10 && (LINKED_10BIT || LINKED_12BIT)
84
+ uint32_t numCUs_sse_t = param->internalBitDepth > 8 ? analysis->numCUsInFrame << 1 : analysis->numCUsInFrame;
85
+#elif X265_DEPTH >= 10 && LINKED_8BIT
86
+ uint32_t numCUs_sse_t = param->internalBitDepth > 8 ? analysis->numCUsInFrame : (analysis->numCUsInFrame + 1U) >> 1;
87
+#else
88
+ uint32_t numCUs_sse_t = analysis->numCUsInFrame;
89
+#endif
90
+
91
+ //Allocate memory for distortionData pointer
92
+ CHECKED_MALLOC_ZERO(distortionData, x265_analysis_distortion_data, 1);
93
+ CHECKED_MALLOC_ZERO(distortionData->distortion, sse_t, analysis->numPartitions * numCUs_sse_t);
94
+ if (param->rc.bStatRead)
95
+ {
96
+ CHECKED_MALLOC_ZERO(distortionData->ctuDistortion, sse_t, numCUs_sse_t);
97
+ CHECKED_MALLOC_ZERO(distortionData->scaledDistortion, double, analysis->numCUsInFrame);
98
+ CHECKED_MALLOC_ZERO(distortionData->offset, double, analysis->numCUsInFrame);
99
+ CHECKED_MALLOC_ZERO(distortionData->threshold, double, analysis->numCUsInFrame);
100
+ }
101
+ analysis->distortionData = distortionData;
102
+
103
+ if (param->bDisableLookahead && isVbv)
104
+ {
105
+ CHECKED_MALLOC_ZERO(analysis->lookahead.intraSatdForVbv, uint32_t, analysis->numCuInHeight);
106
+ CHECKED_MALLOC_ZERO(analysis->lookahead.satdForVbv, uint32_t, analysis->numCuInHeight);
107
+ CHECKED_MALLOC_ZERO(analysis->lookahead.intraVbvCost, uint32_t, analysis->numCUsInFrame);
108
+ CHECKED_MALLOC_ZERO(analysis->lookahead.vbvCost, uint32_t, analysis->numCUsInFrame);
109
+ }
110
+
111
+ //Allocate memory for weightParam pointer
112
+ if (!(param->bMVType == AVC_INFO))
113
+ CHECKED_MALLOC_ZERO(analysis->wt, x265_weight_param, numPlanes * numDir);
114
+
115
+ if (param->analysisReuseLevel < 2)
116
+ return;
117
+
118
+ //Allocate memory for intraData pointer
119
+ CHECKED_MALLOC_ZERO(intraData, x265_analysis_intra_data, 1);
120
+ CHECKED_MALLOC(intraData->depth, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
121
+ CHECKED_MALLOC(intraData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
122
+ CHECKED_MALLOC(intraData->partSizes, char, analysis->numPartitions * analysis->numCUsInFrame);
123
+ CHECKED_MALLOC(intraData->chromaModes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
124
+ analysis->intraData = intraData;
125
+
126
+ //Allocate memory for interData pointer based on ReuseLevels
127
+ CHECKED_MALLOC_ZERO(interData, x265_analysis_inter_data, 1);
128
+ CHECKED_MALLOC(interData->depth, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
129
+ CHECKED_MALLOC(interData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
130
+
131
+ CHECKED_MALLOC_ZERO(interData->mvpIdx[0], uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
132
+ CHECKED_MALLOC_ZERO(interData->mvpIdx[1], uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
133
+ CHECKED_MALLOC_ZERO(interData->mv[0], x265_analysis_MV, analysis->numPartitions * analysis->numCUsInFrame);
134
+ CHECKED_MALLOC_ZERO(interData->mv[1], x265_analysis_MV, analysis->numPartitions * analysis->numCUsInFrame);
135
+
136
+ if (param->analysisReuseLevel > 4)
137
+ {
138
+ CHECKED_MALLOC(interData->partSize, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
139
+ CHECKED_MALLOC_ZERO(interData->mergeFlag, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
140
+ }
141
+ if (param->analysisReuseLevel >= 7)
142
+ {
143
+ CHECKED_MALLOC(interData->interDir, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
144
+ CHECKED_MALLOC(interData->sadCost, int64_t, analysis->numPartitions * analysis->numCUsInFrame);
145
+ for (int dir = 0; dir < numDir; dir++)
146
+ {
147
+ CHECKED_MALLOC(interData->refIdx[dir], int8_t, analysis->numPartitions * analysis->numCUsInFrame);
148
+ CHECKED_MALLOC_ZERO(analysis->modeFlag[dir], uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
149
+ }
150
+ }
151
+ else
152
+ {
153
+ if (param->analysisMultiPassRefine || param->analysisMultiPassDistortion){
154
+ CHECKED_MALLOC_ZERO(interData->ref, int32_t, 2 * analysis->numPartitions * analysis->numCUsInFrame);
155
+ }
156
+ else
157
+ CHECKED_MALLOC_ZERO(interData->ref, int32_t, analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir);
158
+ }
159
+ analysis->interData = interData;
160
+
161
+ return;
162
+
163
+fail:
164
+ x265_free_analysis_data(param, analysis);
165
+}
166
+
167
+void x265_free_analysis_data(x265_param *param, x265_analysis_data* analysis)
168
+{
169
+ bool isVbv = param->rc.vbvMaxBitrate > 0 && param->rc.vbvBufferSize > 0;
170
+
171
+ //Free memory for Lookahead pointers
172
+ if (param->bDisableLookahead && isVbv)
173
+ {
174
+ X265_FREE(analysis->lookahead.satdForVbv);
175
+ X265_FREE(analysis->lookahead.intraSatdForVbv);
176
+ X265_FREE(analysis->lookahead.vbvCost);
177
+ X265_FREE(analysis->lookahead.intraVbvCost);
178
+ }
179
+
180
+ //Free memory for distortionData pointers
181
+ if (analysis->distortionData)
182
+ {
183
+ X265_FREE((analysis->distortionData)->distortion);
184
+ if (param->rc.bStatRead)
185
+ {
186
+ X265_FREE((analysis->distortionData)->ctuDistortion);
187
+ X265_FREE((analysis->distortionData)->scaledDistortion);
188
+ X265_FREE((analysis->distortionData)->offset);
189
+ X265_FREE((analysis->distortionData)->threshold);
190
+ }
191
+ X265_FREE(analysis->distortionData);
192
+ }
193
+
194
+ /* Early exit freeing weights alone if level is 1 (when there is no analysis inter/intra) */
195
+ if (analysis->wt && !(param->bMVType == AVC_INFO))
196
+ X265_FREE(analysis->wt);
197
+
198
+ if (param->analysisReuseLevel < 2)
199
+ return;
200
+
201
+ //Free memory for intraData pointers
202
+ if (analysis->intraData)
203
+ {
204
+ X265_FREE((analysis->intraData)->depth);
205
+ X265_FREE((analysis->intraData)->modes);
206
+ X265_FREE((analysis->intraData)->partSizes);
207
+ X265_FREE((analysis->intraData)->chromaModes);
208
+ X265_FREE(analysis->intraData);
209
+ analysis->intraData = NULL;
210
+ }
211
+
212
+ //Free interData pointers
213
+ if (analysis->interData)
214
+ {
215
+ X265_FREE((analysis->interData)->depth);
216
+ X265_FREE((analysis->interData)->modes);
217
+ X265_FREE((analysis->interData)->mvpIdx[0]);
218
+ X265_FREE((analysis->interData)->mvpIdx[1]);
219
+ X265_FREE((analysis->interData)->mv[0]);
220
+ X265_FREE((analysis->interData)->mv[1]);
221
+
222
+ if (param->analysisReuseLevel > 4)
223
+ {
224
+ X265_FREE((analysis->interData)->mergeFlag);
225
+ X265_FREE((analysis->interData)->partSize);
226
+ }
227
+ if (param->analysisReuseLevel >= 7)
228
+ {
229
+ int numDir = 2;
230
+ X265_FREE((analysis->interData)->interDir);
231
+ X265_FREE((analysis->interData)->sadCost);
232
+ for (int dir = 0; dir < numDir; dir++)
233
+ {
234
+ X265_FREE((analysis->interData)->refIdx[dir]);
235
+ if (analysis->modeFlag[dir] != NULL)
236
+ {
237
+ X265_FREE(analysis->modeFlag[dir]);
238
+ analysis->modeFlag[dir] = NULL;
239
+ }
240
+ }
241
+ }
242
+ else
243
+ X265_FREE((analysis->interData)->ref);
244
+ X265_FREE(analysis->interData);
245
+ analysis->interData = NULL;
246
+ }
247
+}
248
+
249
void x265_cleanup(void)
250
{
251
BitCost::destroy();
252
253
&x265_csvlog_frame,
254
&x265_csvlog_encode,
255
&x265_dither_image,
256
- &x265_set_analysis_data
257
+ &x265_set_analysis_data,
258
+#if ENABLE_LIBVMAF
259
+ &x265_calculate_vmafscore,
260
+ &x265_calculate_vmaf_framelevelscore,
261
+ &x265_vmaf_encoder_log
262
+#endif
263
+
264
};
265
266
typedef const x265_api* (*api_get_func)(int bitDepth);
267
268
if (param->rc.rateControlMode == X265_RC_CRF)
269
fprintf(csvfp, "RateFactor, ");
270
if (param->rc.vbvBufferSize)
271
- fprintf(csvfp, "BufferFill, ");
272
+ fprintf(csvfp, "BufferFill, BufferFillFinal, ");
273
if (param->bEnablePsnr)
274
fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, ");
275
if (param->bEnableSsim)
276
277
/* detailed performance statistics */
278
fprintf(csvfp, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms),"
279
"Stall Time (ms), Total frame time (ms), Avg WPP, Row Blocks");
280
+#if ENABLE_LIBVMAF
281
+ fprintf(csvfp, ", VMAF Frame Score");
282
+#endif
283
}
284
fprintf(csvfp, "\n");
285
}
286
287
fputs(summaryCSVHeader, csvfp);
288
if (param->csvLogLevel >= 2 || param->maxCLL || param->maxFALL)
289
fputs("MaxCLL, MaxFALL,", csvfp);
290
+#if ENABLE_LIBVMAF
291
+ fputs(" Aggregate VMAF Score,", csvfp);
292
+#endif
293
fputs(" Version\n", csvfp);
294
}
295
}
296
297
if (param->rc.rateControlMode == X265_RC_CRF)
298
fprintf(param->csvfpt, "%.3lf,", frameStats->rateFactor);
299
if (param->rc.vbvBufferSize)
300
- fprintf(param->csvfpt, "%.3lf,", frameStats->bufferFill);
301
+ fprintf(param->csvfpt, "%.3lf, %.3lf,", frameStats->bufferFill, frameStats->bufferFillFinal);
302
if (param->bEnablePsnr)
303
fprintf(param->csvfpt, "%.3lf, %.3lf, %.3lf, %.3lf,", frameStats->psnrY, frameStats->psnrU, frameStats->psnrV, frameStats->psnr);
304
if (param->bEnableSsim)
305
306
frameStats->totalFrameTime);
307
308
fprintf(param->csvfpt, " %.3lf, %d", frameStats->avgWPP, frameStats->countRowBlocks);
309
+#if ENABLE_LIBVMAF
310
+ fprintf(param->csvfpt, ", %lf", frameStats->vmafFrameScore);
311
+#endif
312
}
313
fprintf(param->csvfpt, "\n");
314
fflush(stderr);
315
316
fputs(summaryCSVHeader, p->csvfpt);
317
if (p->csvLogLevel >= 2 || p->maxCLL || p->maxFALL)
318
fputs("MaxCLL, MaxFALL,", p->csvfpt);
319
+#if ENABLE_LIBVMAF
320
+ fputs(" Aggregate VMAF score,", p->csvfpt);
321
+#endif
322
fputs(" Version\n",p->csvfpt);
323
+
324
}
325
// CLI arguments or other
326
if (argc)
327
328
fputc('"', p->csvfpt);
329
fputs(opts, p->csvfpt);
330
fputc('"', p->csvfpt);
331
+ X265_FREE(opts);
332
}
333
}
334
335
336
char buffer[200];
337
strftime(buffer, 128, "%c", timeinfo);
338
fprintf(p->csvfpt, ", %s, ", buffer);
339
-
340
// elapsed time, fps, bitrate
341
fprintf(p->csvfpt, "%.2f, %.2f, %.2f,",
342
stats->elapsedEncodeTime, stats->encodedPictureCount / stats->elapsedEncodeTime, stats->bitrate);
343
344
fprintf(p->csvfpt, " -, -, -, -, -, -, -,");
345
if (p->csvLogLevel >= 2 || p->maxCLL || p->maxFALL)
346
fprintf(p->csvfpt, " %-6u, %-6u,", stats->maxCLL, stats->maxFALL);
347
+#if ENABLE_LIBVMAF
348
+ fprintf(p->csvfpt, " %lf,", stats->aggregateVmafScore);
349
+#endif
350
fprintf(p->csvfpt, " %s\n", api->version_str);
351
+
352
}
353
}
354
355
356
}
357
}
358
359
+#if ENABLE_LIBVMAF
360
+/* Read y values of single frame for 8-bit input */
361
+int read_image_byte(FILE *file, float *buf, int width, int height, int stride)
362
+{
363
+ char *byte_ptr = (char *)buf;
364
+ unsigned char *tmp_buf = 0;
365
+ int i, j;
366
+ int ret = 1;
367
+
368
+ if (width <= 0 || height <= 0)
369
+ {
370
+ goto fail_or_end;
371
+ }
372
+
373
+ if (!(tmp_buf = (unsigned char*)malloc(width)))
374
+ {
375
+ goto fail_or_end;
376
+ }
377
+
378
+ for (i = 0; i < height; ++i)
379
+ {
380
+ float *row_ptr = (float *)byte_ptr;
381
+
382
+ if (fread(tmp_buf, 1, width, file) != (size_t)width)
383
+ {
384
+ goto fail_or_end;
385
+ }
386
+
387
+ for (j = 0; j < width; ++j)
388
+ {
389
+ row_ptr[j] = tmp_buf[j];
390
+ }
391
+
392
+ byte_ptr += stride;
393
+ }
394
+
395
+ ret = 0;
396
+
397
+fail_or_end:
398
+ free(tmp_buf);
399
+ return ret;
400
+}
401
+/* Read y values of single frame for 10-bit input */
402
+int read_image_word(FILE *file, float *buf, int width, int height, int stride)
403
+{
404
+ char *byte_ptr = (char *)buf;
405
+ unsigned short *tmp_buf = 0;
406
+ int i, j;
407
+ int ret = 1;
408
+
409
+ if (width <= 0 || height <= 0)
410
+ {
411
+ goto fail_or_end;
412
+ }
413
+
414
+ if (!(tmp_buf = (unsigned short*)malloc(width * 2))) // '*2' to accommodate words
415
+ {
416
+ goto fail_or_end;
417
+ }
418
+
419
+ for (i = 0; i < height; ++i)
420
+ {
421
+ float *row_ptr = (float *)byte_ptr;
422
+
423
+ if (fread(tmp_buf, 2, width, file) != (size_t)width) // '2' for word
424
+ {
425
+ goto fail_or_end;
426
+ }
427
+
428
+ for (j = 0; j < width; ++j)
429
+ {
430
+ row_ptr[j] = tmp_buf[j] / 4.0; // '/4' to convert from 10 to 8-bit
431
+ }
432
+
433
+ byte_ptr += stride;
434
+ }
435
+
436
+ ret = 0;
437
+
438
+fail_or_end:
439
+ free(tmp_buf);
440
+ return ret;
441
+}
442
+
443
+int read_frame(float *reference_data, float *distorted_data, float *temp_data, int stride_byte, void *s)
444
+{
445
+ x265_vmaf_data *user_data = (x265_vmaf_data *)s;
446
+ int ret;
447
+
448
+ // read reference y
449
+ if (user_data->internalBitDepth == 8)
450
+ {
451
+ ret = read_image_byte(user_data->reference_file, reference_data, user_data->width, user_data->height, stride_byte);
452
+ }
453
+ else if (user_data->internalBitDepth == 10)
454
+ {
455
+ ret = read_image_word(user_data->reference_file, reference_data, user_data->width, user_data->height, stride_byte);
456
+ }
457
+ else
458
+ {
459
+ x265_log(NULL, X265_LOG_ERROR, "Invalid bitdepth\n");
460
+ return 1;
461
+ }
462
+ if (ret)
463
+ {
464
+ if (feof(user_data->reference_file))
465
+ {
466
+ ret = 2; // OK if end of file
467
+ }
468
+ return ret;
469
+ }
470
+
471
+ // read distorted y
472
+ if (user_data->internalBitDepth == 8)
473
+ {
474
+ ret = read_image_byte(user_data->distorted_file, distorted_data, user_data->width, user_data->height, stride_byte);
475
+ }
476
+ else if (user_data->internalBitDepth == 10)
477
+ {
478
+ ret = read_image_word(user_data->distorted_file, distorted_data, user_data->width, user_data->height, stride_byte);
479
+ }
480
+ else
481
+ {
482
+ x265_log(NULL, X265_LOG_ERROR, "Invalid bitdepth\n");
483
+ return 1;
484
+ }
485
+ if (ret)
486
+ {
487
+ if (feof(user_data->distorted_file))
488
+ {
489
+ ret = 2; // OK if end of file
490
+ }
491
+ return ret;
492
+ }
493
+
494
+ // reference skip u and v
495
+ if (user_data->internalBitDepth == 8)
496
+ {
497
+ if (fread(temp_data, 1, user_data->offset, user_data->reference_file) != (size_t)user_data->offset)
498
+ {
499
+ x265_log(NULL, X265_LOG_ERROR, "reference fread to skip u and v failed.\n");
500
+ goto fail_or_end;
501
+ }
502
+ }
503
+ else if (user_data->internalBitDepth == 10)
504
+ {
505
+ if (fread(temp_data, 2, user_data->offset, user_data->reference_file) != (size_t)user_data->offset)
506
+ {
507
+ x265_log(NULL, X265_LOG_ERROR, "reference fread to skip u and v failed.\n");
508
+ goto fail_or_end;
509
+ }
510
+ }
511
+ else
512
+ {
513
+ x265_log(NULL, X265_LOG_ERROR, "Invalid format\n");
514
+ goto fail_or_end;
515
+ }
516
+
517
+ // distorted skip u and v
518
+ if (user_data->internalBitDepth == 8)
519
+ {
520
+ if (fread(temp_data, 1, user_data->offset, user_data->distorted_file) != (size_t)user_data->offset)
521
+ {
522
+ x265_log(NULL, X265_LOG_ERROR, "distorted fread to skip u and v failed.\n");
523
+ goto fail_or_end;
524
+ }
525
+ }
526
+ else if (user_data->internalBitDepth == 10)
527
+ {
528
+ if (fread(temp_data, 2, user_data->offset, user_data->distorted_file) != (size_t)user_data->offset)
529
+ {
530
+ x265_log(NULL, X265_LOG_ERROR, "distorted fread to skip u and v failed.\n");
531
+ goto fail_or_end;
532
+ }
533
+ }
534
+ else
535
+ {
536
+ x265_log(NULL, X265_LOG_ERROR, "Invalid format\n");
537
+ goto fail_or_end;
538
+ }
539
+
540
+
541
+fail_or_end:
542
+ return ret;
543
+}
544
+
545
+double x265_calculate_vmafscore(x265_param *param, x265_vmaf_data *data)
546
+{
547
+ double score;
548
+
549
+ data->width = param->sourceWidth;
550
+ data->height = param->sourceHeight;
551
+ data->internalBitDepth = param->internalBitDepth;
552
+
553
+ if (param->internalCsp == X265_CSP_I420)
554
+ {
555
+ if ((param->sourceWidth * param->sourceHeight) % 2 != 0)
556
+ x265_log(NULL, X265_LOG_ERROR, "Invalid file size\n");
557
+ data->offset = param->sourceWidth * param->sourceHeight / 2;
558
+ }
559
+ else if (param->internalCsp == X265_CSP_I422)
560
+ data->offset = param->sourceWidth * param->sourceHeight;
561
+ else if (param->internalCsp == X265_CSP_I444)
562
+ data->offset = param->sourceWidth * param->sourceHeight * 2;
563
+ else
564
+ x265_log(NULL, X265_LOG_ERROR, "Invalid format\n");
565
+
566
+ compute_vmaf(&score, vcd->format, data->width, data->height, read_frame, data, vcd->model_path, vcd->log_path, vcd->log_fmt, vcd->disable_clip, vcd->disable_avx, vcd->enable_transform, vcd->phone_model, vcd->psnr, vcd->ssim, vcd->ms_ssim, vcd->pool);
567
+
568
+ return score;
569
+}
570
+
571
+int read_frame_10bit(float *reference_data, float *distorted_data, float *temp_data, int stride, void *s)
572
+{
573
+ x265_vmaf_framedata *user_data = (x265_vmaf_framedata *)s;
574
+
575
+ PicYuv *reference_frame = (PicYuv *)user_data->reference_frame;
576
+ PicYuv *distorted_frame = (PicYuv *)user_data->distorted_frame;
577
+
578
+ if(!user_data->frame_set) {
579
+
580
+ int reference_stride = reference_frame->m_stride;
581
+ int distorted_stride = distorted_frame->m_stride;
582
+
583
+ const uint16_t *reference_ptr = (const uint16_t *)reference_frame->m_picOrg[0];
584
+ const uint16_t *distorted_ptr = (const uint16_t *)distorted_frame->m_picOrg[0];
585
+
586
+ temp_data = reference_data;
587
+
588
+ int height = user_data->height;
589
+ int width = user_data->width;
590
+
591
+ int i,j;
592
+ for (i = 0; i < height; i++) {
593
+ for ( j = 0; j < width; j++) {
594
+ temp_data[j] = ((float)reference_ptr[j] / 4.0);
595
+ }
596
+ reference_ptr += reference_stride;
597
+ temp_data += stride / sizeof(*temp_data);
598
+ }
599
+
600
+ temp_data = distorted_data;
601
+ for (i = 0; i < height; i++) {
602
+ for (j = 0; j < width; j++) {
603
+ temp_data[j] = ((float)distorted_ptr[j] / 4.0);
604
+ }
605
+ distorted_ptr += distorted_stride;
606
+ temp_data += stride / sizeof(*temp_data);
607
+ }
608
+
609
+ user_data->frame_set = 1;
610
+ return 0;
611
+ }
612
+ return 2;
613
+}
614
+
615
+int read_frame_8bit(float *reference_data, float *distorted_data, float *temp_data, int stride, void *s)
616
+{
617
+ x265_vmaf_framedata *user_data = (x265_vmaf_framedata *)s;
618
+
619
+ PicYuv *reference_frame = (PicYuv *)user_data->reference_frame;
620
+ PicYuv *distorted_frame = (PicYuv *)user_data->distorted_frame;
621
+
622
+ if(!user_data->frame_set) {
623
+
624
+ int reference_stride = reference_frame->m_stride;
625
+ int distorted_stride = distorted_frame->m_stride;
626
+
627
+ const uint8_t *reference_ptr = (const uint8_t *)reference_frame->m_picOrg[0];
628
+ const uint8_t *distorted_ptr = (const uint8_t *)distorted_frame->m_picOrg[0];
629
+
630
+ temp_data = reference_data;
631
+
632
+ int height = user_data->height;
633
+ int width = user_data->width;
634
+
635
+ int i,j;
636
+ for (i = 0; i < height; i++) {
637
+ for ( j = 0; j < width; j++) {
638
+ temp_data[j] = (float)reference_ptr[j];
639
+ }
640
+ reference_ptr += reference_stride;
641
+ temp_data += stride / sizeof(*temp_data);
642
+ }
643
+
644
+ temp_data = distorted_data;
645
+ for (i = 0; i < height; i++) {
646
+ for (j = 0; j < width; j++) {
647
+ temp_data[j] = (float)distorted_ptr[j];
648
+ }
649
+ distorted_ptr += distorted_stride;
650
+ temp_data += stride / sizeof(*temp_data);
651
+ }
652
+
653
+ user_data->frame_set = 1;
654
+ return 0;
655
+ }
656
+ return 2;
657
+}
658
+
659
+double x265_calculate_vmaf_framelevelscore(x265_vmaf_framedata *vmafframedata)
660
+{
661
+ double score;
662
+ int (*read_frame)(float *reference_data, float *distorted_data, float *temp_data,
663
+ int stride, void *s);
664
+ if (vmafframedata->internalBitDepth == 8)
665
+ read_frame = read_frame_8bit;
666
+ else
667
+ read_frame = read_frame_10bit;
668
+ compute_vmaf(&score, vcd->format, vmafframedata->width, vmafframedata->height, read_frame, vmafframedata, vcd->model_path, vcd->log_path, vcd->log_fmt, vcd->disable_clip, vcd->disable_avx, vcd->enable_transform, vcd->phone_model, vcd->psnr, vcd->ssim, vcd->ms_ssim, vcd->pool);
669
+
670
+ return score;
671
+}
672
+#endif
673
} /* end namespace or extern "C" */
674
x265_2.7.tar.gz/source/encoder/dpb.cpp -> x265_2.9.tar.gz/source/encoder/dpb.cpp
Changed
35
1
2
int pocCurr = slice->m_poc;
3
int type = newFrame->m_lowres.sliceType;
4
bool bIsKeyFrame = newFrame->m_lowres.bKeyframe;
5
-
6
slice->m_nalUnitType = getNalUnitType(pocCurr, bIsKeyFrame);
7
- if (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL)
8
+ if (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_N_LP)
9
m_lastIDR = pocCurr;
10
slice->m_lastIDR = m_lastIDR;
11
slice->m_sliceType = IS_X265_TYPE_B(type) ? B_SLICE : (type == X265_TYPE_P) ? P_SLICE : I_SLICE;
12
13
/* Marking reference pictures when an IDR/CRA is encountered. */
14
void DPB::decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType)
15
{
16
- if (nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL)
17
+ if (nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL || nalUnitType == NAL_UNIT_CODED_SLICE_IDR_N_LP)
18
{
19
/* If the nal_unit_type is IDR, all pictures in the reference picture
20
* list are marked as "unused for reference" */
21
22
NalUnitType DPB::getNalUnitType(int curPOC, bool bIsKeyFrame)
23
{
24
if (!curPOC)
25
- return NAL_UNIT_CODED_SLICE_IDR_W_RADL;
26
-
27
+ return NAL_UNIT_CODED_SLICE_IDR_N_LP;
28
if (bIsKeyFrame)
29
- return m_bOpenGOP ? NAL_UNIT_CODED_SLICE_CRA : NAL_UNIT_CODED_SLICE_IDR_W_RADL;
30
-
31
+ return m_bOpenGOP ? NAL_UNIT_CODED_SLICE_CRA : m_bhasLeadingPicture ? NAL_UNIT_CODED_SLICE_IDR_W_RADL : NAL_UNIT_CODED_SLICE_IDR_N_LP;
32
if (m_pocCRA && curPOC < m_pocCRA)
33
// All leading pictures are being marked as TFD pictures here since
34
// current encoder uses all reference pictures while encoding leading
35
x265_2.7.tar.gz/source/encoder/dpb.h -> x265_2.9.tar.gz/source/encoder/dpb.h
Changed
17
1
2
int m_lastIDR;
3
int m_pocCRA;
4
int m_bOpenGOP;
5
+ int m_bhasLeadingPicture;
6
bool m_bRefreshPending;
7
bool m_bTemporalSublayer;
8
PicList m_picList;
9
10
{
11
m_lastIDR = 0;
12
m_pocCRA = 0;
13
+ m_bhasLeadingPicture = param->radl;
14
m_bRefreshPending = false;
15
m_frameDataFreeList = NULL;
16
m_bOpenGOP = param->bOpenGOP;
17
x265_2.7.tar.gz/source/encoder/encoder.cpp -> x265_2.9.tar.gz/source/encoder/encoder.cpp
Changed
2234
1
2
m_threadPool = NULL;
3
m_analysisFileIn = NULL;
4
m_analysisFileOut = NULL;
5
+ m_naluFile = NULL;
6
m_offsetEmergency = NULL;
7
m_iFrameNum = 0;
8
m_iPPSQpMinus26 = 0;
9
10
#endif
11
12
m_prevTonemapPayload.payload = NULL;
13
+ m_startPoint = 0;
14
+ m_saveCTUSize = 0;
15
}
16
inline char *strcatFilename(const char *input, const char *suffix)
17
{
18
19
20
if (m_param->bEmitHRDSEI)
21
m_rateControl->initHRD(m_sps);
22
+
23
if (!m_rateControl->init(m_sps))
24
m_aborted = true;
25
if (!m_lookahead->create())
26
m_aborted = true;
27
+
28
initRefIdx();
29
if (m_param->analysisSave && m_param->bUseAnalysisFile)
30
{
31
32
33
m_emitCLLSEI = p->maxCLL || p->maxFALL;
34
35
+ if (m_param->naluFile)
36
+ {
37
+ m_naluFile = x265_fopen(m_param->naluFile, "r");
38
+ if (!m_naluFile)
39
+ {
40
+ x265_log_file(NULL, X265_LOG_ERROR, "%s file not found or Failed to open\n", m_param->naluFile);
41
+ m_aborted = true;
42
+ }
43
+ else
44
+ m_enableNal = 1;
45
+ }
46
+ else
47
+ m_enableNal = 0;
48
+
49
#if ENABLE_HDR10_PLUS
50
if (m_bToneMap)
51
m_numCimInfo = m_hdr10plus_api->hdr10plus_json_to_movie_cim(m_param->toneMapFile, m_cim);
52
#endif
53
+ if (m_param->bDynamicRefine)
54
+ {
55
+ /* Allocate memory for 1 GOP and reuse it for the subsequent GOPs */
56
+ int size = (m_param->keyframeMax + m_param->lookaheadDepth) * m_param->maxCUDepth * X265_REFINE_INTER_LEVELS;
57
+ CHECKED_MALLOC_ZERO(m_variance, uint64_t, size);
58
+ CHECKED_MALLOC_ZERO(m_rdCost, uint64_t, size);
59
+ CHECKED_MALLOC_ZERO(m_trainingCount, uint32_t, size);
60
+ return;
61
+ fail:
62
+ m_aborted = true;
63
+ }
64
}
65
66
void Encoder::stopJobs()
67
68
curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
69
int num16x16inCUWidth = m_param->maxCUSize >> 4;
70
uint32_t ctuAddr, offset, cuPos;
71
- analysis_intra_data * intraData = (analysis_intra_data *)curFrame->m_analysisData.intraData;
72
- analysis_intra_data * srcIntraData = (analysis_intra_data *)analysis_data->intraData;
73
+ x265_analysis_intra_data * intraData = curFrame->m_analysisData.intraData;
74
+ x265_analysis_intra_data * srcIntraData = analysis_data->intraData;
75
for (int i = 0; i < mbImageHeight; i++)
76
{
77
for (int j = 0; j < mbImageWidth; j++)
78
79
curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
80
int num16x16inCUWidth = m_param->maxCUSize >> 4;
81
uint32_t ctuAddr, offset, cuPos;
82
- analysis_inter_data * interData = (analysis_inter_data *)curFrame->m_analysisData.interData;
83
- analysis_inter_data * srcInterData = (analysis_inter_data*)analysis_data->interData;
84
+ x265_analysis_inter_data * interData = curFrame->m_analysisData.interData;
85
+ x265_analysis_inter_data * srcInterData = analysis_data->interData;
86
for (int i = 0; i < mbImageHeight; i++)
87
{
88
for (int j = 0; j < mbImageWidth; j++)
89
90
curFrame->m_analysisData = (*analysis_data);
91
curFrame->m_analysisData.numCUsInFrame = widthInCU * heightInCU;
92
curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
93
- allocAnalysis(&curFrame->m_analysisData);
94
+ x265_alloc_analysis_data(m_param, &curFrame->m_analysisData);
95
if (m_param->maxCUSize == 16)
96
{
97
if (analysis_data->sliceType == X265_TYPE_IDR || analysis_data->sliceType == X265_TYPE_I)
98
99
100
curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
101
size_t count = 0;
102
- analysis_intra_data * currIntraData = (analysis_intra_data *)curFrame->m_analysisData.intraData;
103
- analysis_intra_data * intraData = (analysis_intra_data *)analysis_data->intraData;
104
+ x265_analysis_intra_data * currIntraData = curFrame->m_analysisData.intraData;
105
+ x265_analysis_intra_data * intraData = analysis_data->intraData;
106
for (uint32_t d = 0; d < cuBytes; d++)
107
{
108
int bytes = curFrame->m_analysisData.numPartitions >> ((intraData)->depth[d] * 2);
109
110
111
curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
112
size_t count = 0;
113
- analysis_inter_data * currInterData = (analysis_inter_data *)curFrame->m_analysisData.interData;
114
- analysis_inter_data * interData = (analysis_inter_data *)analysis_data->interData;
115
+ x265_analysis_inter_data * currInterData = curFrame->m_analysisData.interData;
116
+ x265_analysis_inter_data * interData = analysis_data->interData;
117
for (uint32_t d = 0; d < cuBytes; d++)
118
{
119
int bytes = curFrame->m_analysisData.numPartitions >> ((interData)->depth[d] * 2);
120
memset(&(currInterData)->depth[count], (interData)->depth[d], bytes);
121
memset(&(currInterData)->modes[count], (interData)->modes[d], bytes);
122
- memcpy(&(currInterData)->sadCost[count], &((analysis_inter_data*)analysis_data->interData)->sadCost[d], bytes);
123
+ memcpy(&(currInterData)->sadCost[count], &(analysis_data->interData)->sadCost[d], bytes);
124
if (m_param->analysisReuseLevel > 4)
125
{
126
memset(&(currInterData)->partSize[count], (interData)->partSize[d], bytes);
127
128
if (m_bToneMap)
129
m_hdr10plus_api->hdr10plus_clear_movie(m_cim, m_numCimInfo);
130
#endif
131
-
132
+
133
+ if (m_param->bDynamicRefine)
134
+ {
135
+ X265_FREE(m_variance);
136
+ X265_FREE(m_rdCost);
137
+ X265_FREE(m_trainingCount);
138
+ }
139
if (m_exportedPic)
140
{
141
ATOMIC_DEC(&m_exportedPic->m_countRefEncoders);
142
143
}
144
X265_FREE(temp);
145
}
146
+ if (m_naluFile)
147
+ fclose(m_naluFile);
148
if (m_param)
149
{
150
if (m_param->csvfpt)
151
152
}
153
}
154
155
+void Encoder::copyUserSEIMessages(Frame *frame, const x265_picture* pic_in)
156
+{
157
+ x265_sei_payload toneMap;
158
+ toneMap.payload = NULL;
159
+ int toneMapPayload = 0;
160
+
161
+#if ENABLE_HDR10_PLUS
162
+ if (m_bToneMap)
163
+ {
164
+ int currentPOC = m_pocLast;
165
+ if (currentPOC < m_numCimInfo)
166
+ {
167
+ int32_t i = 0;
168
+ toneMap.payloadSize = 0;
169
+ while (m_cim[currentPOC][i] == 0xFF)
170
+ toneMap.payloadSize += m_cim[currentPOC][i++];
171
+ toneMap.payloadSize += m_cim[currentPOC][i];
172
+
173
+ toneMap.payload = (uint8_t*)x265_malloc(sizeof(uint8_t) * toneMap.payloadSize);
174
+ toneMap.payloadType = USER_DATA_REGISTERED_ITU_T_T35;
175
+ memcpy(toneMap.payload, &m_cim[currentPOC][i + 1], toneMap.payloadSize);
176
+ toneMapPayload = 1;
177
+ }
178
+ }
179
+#endif
180
+ /* seiMsg will contain SEI messages specified in a fixed file format in POC order.
181
+ * Format of the file : <POC><space><PREFIX><space><NAL UNIT TYPE>/<SEI TYPE><space><SEI Payload> */
182
+ x265_sei_payload seiMsg;
183
+ seiMsg.payload = NULL;
184
+ int userPayload = 0;
185
+ if (m_enableNal)
186
+ {
187
+ readUserSeiFile(seiMsg, m_pocLast);
188
+ if (seiMsg.payload)
189
+ userPayload = 1;;
190
+ }
191
+
192
+ int numPayloads = pic_in->userSEI.numPayloads + toneMapPayload + userPayload;
193
+ frame->m_userSEI.numPayloads = numPayloads;
194
+
195
+ if (frame->m_userSEI.numPayloads)
196
+ {
197
+ if (!frame->m_userSEI.payloads)
198
+ {
199
+ frame->m_userSEI.payloads = new x265_sei_payload[numPayloads];
200
+ for (int i = 0; i < numPayloads; i++)
201
+ frame->m_userSEI.payloads[i].payload = NULL;
202
+ }
203
+ for (int i = 0; i < numPayloads; i++)
204
+ {
205
+ x265_sei_payload input;
206
+ if ((i == (numPayloads - 1)) && toneMapPayload)
207
+ input = toneMap;
208
+ else if (m_enableNal)
209
+ input = seiMsg;
210
+ else
211
+ input = pic_in->userSEI.payloads[i];
212
+
213
+ if (!frame->m_userSEI.payloads[i].payload)
214
+ frame->m_userSEI.payloads[i].payload = new uint8_t[input.payloadSize];
215
+ memcpy(frame->m_userSEI.payloads[i].payload, input.payload, input.payloadSize);
216
+ frame->m_userSEI.payloads[i].payloadSize = input.payloadSize;
217
+ frame->m_userSEI.payloads[i].payloadType = input.payloadType;
218
+ }
219
+ if (toneMap.payload)
220
+ x265_free(toneMap.payload);
221
+ if (seiMsg.payload)
222
+ x265_free(seiMsg.payload);
223
+ }
224
+}
225
+
226
/**
227
* Feed one new input frame into the encoder, get one frame out. If pic_in is
228
* NULL, a flush condition is implied and pic_in must be NULL for all subsequent
229
230
if (m_exportedPic)
231
{
232
if (!m_param->bUseAnalysisFile && m_param->analysisSave)
233
- freeAnalysis(&m_exportedPic->m_analysisData);
234
+ x265_free_analysis_data(m_param, &m_exportedPic->m_analysisData);
235
ATOMIC_DEC(&m_exportedPic->m_countRefEncoders);
236
m_exportedPic = NULL;
237
m_dpb->recycleUnreferenced();
238
}
239
- if (pic_in)
240
+ if (pic_in && (!m_param->chunkEnd || (m_encodedFrameNum < m_param->chunkEnd)))
241
{
242
if (m_latestParam->forceFlush == 1)
243
{
244
245
m_latestParam->forceFlush = 0;
246
}
247
248
- x265_sei_payload toneMap;
249
- toneMap.payload = NULL;
250
-#if ENABLE_HDR10_PLUS
251
- if (m_bToneMap)
252
- {
253
- int currentPOC = m_pocLast + 1;
254
- if (currentPOC < m_numCimInfo)
255
- {
256
- int32_t i = 0;
257
- toneMap.payloadSize = 0;
258
- while (m_cim[currentPOC][i] == 0xFF)
259
- toneMap.payloadSize += m_cim[currentPOC][i++];
260
- toneMap.payloadSize += m_cim[currentPOC][i];
261
-
262
- toneMap.payload = (uint8_t*)x265_malloc(sizeof(uint8_t) * toneMap.payloadSize);
263
- toneMap.payloadType = USER_DATA_REGISTERED_ITU_T_T35;
264
- memcpy(toneMap.payload, &m_cim[currentPOC][i+1], toneMap.payloadSize);
265
- }
266
- }
267
-#endif
268
-
269
if (pic_in->bitDepth < 8 || pic_in->bitDepth > 16)
270
{
271
x265_log(m_param, X265_LOG_ERROR, "Input bit depth (%d) must be between 8 and 16\n",
272
273
inFrame->m_forceqp = pic_in->forceqp;
274
inFrame->m_param = (m_reconfigure || m_reconfigureRc) ? m_latestParam : m_param;
275
276
- int toneMapEnable = 0;
277
- if (m_bToneMap && toneMap.payload)
278
- toneMapEnable = 1;
279
- int numPayloads = pic_in->userSEI.numPayloads + toneMapEnable;
280
- inFrame->m_userSEI.numPayloads = numPayloads;
281
-
282
- if (inFrame->m_userSEI.numPayloads)
283
- {
284
- if (!inFrame->m_userSEI.payloads)
285
- {
286
- inFrame->m_userSEI.payloads = new x265_sei_payload[numPayloads];
287
- for (int i = 0; i < numPayloads; i++)
288
- inFrame->m_userSEI.payloads[i].payload = NULL;
289
- }
290
- for (int i = 0; i < numPayloads; i++)
291
- {
292
- x265_sei_payload input;
293
- if ((i == (numPayloads - 1)) && toneMapEnable)
294
- input = toneMap;
295
- else
296
- input = pic_in->userSEI.payloads[i];
297
- int size = inFrame->m_userSEI.payloads[i].payloadSize = input.payloadSize;
298
- inFrame->m_userSEI.payloads[i].payloadType = input.payloadType;
299
- if (!inFrame->m_userSEI.payloads[i].payload)
300
- inFrame->m_userSEI.payloads[i].payload = new uint8_t[size];
301
- memcpy(inFrame->m_userSEI.payloads[i].payload, input.payload, size);
302
- }
303
- if (toneMap.payload)
304
- x265_free(toneMap.payload);
305
- }
306
+ copyUserSEIMessages(inFrame, pic_in);
307
308
if (pic_in->quantOffsets != NULL)
309
{
310
311
/* Load analysis data before lookahead->addPicture, since sliceType has been decided */
312
if (m_param->analysisLoad)
313
{
314
- /* readAnalysisFile reads analysis data for the frame and allocates memory based on slicetype */
315
- readAnalysisFile(&inFrame->m_analysisData, inFrame->m_poc, pic_in);
316
+ /* reads analysis data for the frame and allocates memory based on slicetype */
317
+ static int paramBytes = 0;
318
+ if (!inFrame->m_poc)
319
+ {
320
+ x265_analysis_data analysisData = pic_in->analysisData;
321
+ paramBytes = validateAnalysisData(&analysisData, 0);
322
+ if (paramBytes == -1)
323
+ {
324
+ m_aborted = true;
325
+ return -1;
326
+ }
327
+ }
328
+ if (m_saveCTUSize)
329
+ {
330
+ cuLocation cuLocInFrame;
331
+ cuLocInFrame.init(m_param);
332
+ /* Set skipWidth/skipHeight flags when the out of bound pixels in lowRes is greater than half of maxCUSize */
333
+ int extendedWidth = ((m_param->sourceWidth / 2 + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize) * m_param->maxCUSize;
334
+ int extendedHeight = ((m_param->sourceHeight / 2 + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize) * m_param->maxCUSize;
335
+ uint32_t outOfBoundaryLowres = extendedWidth - m_param->sourceWidth / 2;
336
+ if (outOfBoundaryLowres * 2 >= m_param->maxCUSize)
337
+ cuLocInFrame.skipWidth = true;
338
+ uint32_t outOfBoundaryLowresH = extendedHeight - m_param->sourceHeight / 2;
339
+ if (outOfBoundaryLowresH * 2 >= m_param->maxCUSize)
340
+ cuLocInFrame.skipHeight = true;
341
+ readAnalysisFile(&inFrame->m_analysisData, inFrame->m_poc, pic_in, paramBytes, cuLocInFrame);
342
+ }
343
+ else
344
+ readAnalysisFile(&inFrame->m_analysisData, inFrame->m_poc, pic_in, paramBytes);
345
inFrame->m_poc = inFrame->m_analysisData.poc;
346
sliceType = inFrame->m_analysisData.sliceType;
347
inFrame->m_lowres.bScenecut = !!inFrame->m_analysisData.bScenecut;
348
349
350
/* Free up pic_in->analysisData since it has already been used */
351
if ((m_param->analysisLoad && !m_param->analysisSave) || (m_param->bMVType && slice->m_sliceType != I_SLICE))
352
- freeAnalysis(&outFrame->m_analysisData);
353
+ x265_free_analysis_data(m_param, &outFrame->m_analysisData);
354
355
if (pic_out)
356
{
357
358
359
pic_out->pts = outFrame->m_pts;
360
pic_out->dts = outFrame->m_dts;
361
+ pic_out->reorderedPts = outFrame->m_reorderedPts;
362
pic_out->sliceType = outFrame->m_lowres.sliceType;
363
pic_out->planes[0] = recpic->m_picOrg[0];
364
pic_out->stride[0] = (int)(recpic->m_stride * sizeof(pixel));
365
366
pic_out->analysisData.intraData = outFrame->m_analysisData.intraData;
367
pic_out->analysisData.modeFlag[0] = outFrame->m_analysisData.modeFlag[0];
368
pic_out->analysisData.modeFlag[1] = outFrame->m_analysisData.modeFlag[1];
369
+ pic_out->analysisData.distortionData = outFrame->m_analysisData.distortionData;
370
if (m_param->bDisableLookahead)
371
{
372
int factor = 1;
373
374
factor = m_param->scaleFactor * 2;
375
pic_out->analysisData.numCuInHeight = outFrame->m_analysisData.numCuInHeight;
376
pic_out->analysisData.lookahead.dts = outFrame->m_dts;
377
+ pic_out->analysisData.lookahead.reorderedPts = outFrame->m_reorderedPts;
378
pic_out->analysisData.satdCost *= factor;
379
pic_out->analysisData.lookahead.keyframe = outFrame->m_lowres.bKeyframe;
380
pic_out->analysisData.lookahead.lastMiniGopBFrame = outFrame->m_lowres.bLastMiniGopBFrame;
381
382
int vbvCount = m_param->lookaheadDepth + m_param->bframes + 2;
383
for (int index = 0; index < vbvCount; index++)
384
{
385
- pic_out->analysisData.lookahead.plannedSatd[index] = outFrame->m_lowres.plannedSatd[index] * factor;
386
+ pic_out->analysisData.lookahead.plannedSatd[index] = outFrame->m_lowres.plannedSatd[index];
387
pic_out->analysisData.lookahead.plannedType[index] = outFrame->m_lowres.plannedType[index];
388
}
389
for (uint32_t index = 0; index < pic_out->analysisData.numCuInHeight; index++)
390
{
391
- outFrame->m_analysisData.lookahead.intraSatdForVbv[index] = outFrame->m_encData->m_rowStat[index].intraSatdForVbv * factor;
392
- outFrame->m_analysisData.lookahead.satdForVbv[index] = outFrame->m_encData->m_rowStat[index].satdForVbv * factor;
393
+ outFrame->m_analysisData.lookahead.intraSatdForVbv[index] = outFrame->m_encData->m_rowStat[index].intraSatdForVbv;
394
+ outFrame->m_analysisData.lookahead.satdForVbv[index] = outFrame->m_encData->m_rowStat[index].satdForVbv;
395
}
396
pic_out->analysisData.lookahead.intraSatdForVbv = outFrame->m_analysisData.lookahead.intraSatdForVbv;
397
pic_out->analysisData.lookahead.satdForVbv = outFrame->m_analysisData.lookahead.satdForVbv;
398
for (uint32_t index = 0; index < pic_out->analysisData.numCUsInFrame; index++)
399
{
400
- outFrame->m_analysisData.lookahead.intraVbvCost[index] = outFrame->m_encData->m_cuStat[index].intraVbvCost * factor;
401
- outFrame->m_analysisData.lookahead.vbvCost[index] = outFrame->m_encData->m_cuStat[index].vbvCost * factor;
402
+ outFrame->m_analysisData.lookahead.intraVbvCost[index] = outFrame->m_encData->m_cuStat[index].intraVbvCost;
403
+ outFrame->m_analysisData.lookahead.vbvCost[index] = outFrame->m_encData->m_cuStat[index].vbvCost;
404
}
405
pic_out->analysisData.lookahead.intraVbvCost = outFrame->m_analysisData.lookahead.intraVbvCost;
406
pic_out->analysisData.lookahead.vbvCost = outFrame->m_analysisData.lookahead.vbvCost;
407
}
408
}
409
writeAnalysisFile(&pic_out->analysisData, *outFrame->m_encData);
410
+ pic_out->analysisData.saveParam = pic_out->analysisData.saveParam;
411
if (m_param->bUseAnalysisFile)
412
- freeAnalysis(&pic_out->analysisData);
413
+ x265_free_analysis_data(m_param, &pic_out->analysisData);
414
}
415
}
416
if (m_param->rc.bStatWrite && (m_param->analysisMultiPassRefine || m_param->analysisMultiPassDistortion))
417
{
418
if (pic_out)
419
{
420
- pic_out->analysis2Pass.poc = pic_out->poc;
421
- pic_out->analysis2Pass.analysisFramedata = outFrame->m_analysis2Pass.analysisFramedata;
422
+ pic_out->analysisData.poc = pic_out->poc;
423
+ pic_out->analysisData.interData = outFrame->m_analysisData.interData;
424
+ pic_out->analysisData.intraData = outFrame->m_analysisData.intraData;
425
+ pic_out->analysisData.distortionData = outFrame->m_analysisData.distortionData;
426
}
427
- writeAnalysis2PassFile(&outFrame->m_analysis2Pass, *outFrame->m_encData, outFrame->m_lowres.sliceType);
428
+ writeAnalysisFileRefine(&outFrame->m_analysisData, *outFrame->m_encData);
429
}
430
if (m_param->analysisMultiPassRefine || m_param->analysisMultiPassDistortion)
431
- freeAnalysis2Pass(&outFrame->m_analysis2Pass, outFrame->m_lowres.sliceType);
432
+ x265_free_analysis_data(m_param, &outFrame->m_analysisData);
433
if (m_param->internalCsp == X265_CSP_I400)
434
{
435
if (slice->m_sliceType == P_SLICE)
436
{
437
- if (slice->m_weightPredTable[0][0][0].bPresentFlag)
438
+ if (slice->m_weightPredTable[0][0][0].wtPresent)
439
m_numLumaWPFrames++;
440
}
441
else if (slice->m_sliceType == B_SLICE)
442
443
bool bLuma = false;
444
for (int l = 0; l < 2; l++)
445
{
446
- if (slice->m_weightPredTable[l][0][0].bPresentFlag)
447
+ if (slice->m_weightPredTable[l][0][0].wtPresent)
448
bLuma = true;
449
}
450
if (bLuma)
451
452
{
453
if (slice->m_sliceType == P_SLICE)
454
{
455
- if (slice->m_weightPredTable[0][0][0].bPresentFlag)
456
+ if (slice->m_weightPredTable[0][0][0].wtPresent)
457
m_numLumaWPFrames++;
458
- if (slice->m_weightPredTable[0][0][1].bPresentFlag ||
459
- slice->m_weightPredTable[0][0][2].bPresentFlag)
460
+ if (slice->m_weightPredTable[0][0][1].wtPresent ||
461
+ slice->m_weightPredTable[0][0][2].wtPresent)
462
m_numChromaWPFrames++;
463
}
464
else if (slice->m_sliceType == B_SLICE)
465
466
bool bLuma = false, bChroma = false;
467
for (int l = 0; l < 2; l++)
468
{
469
- if (slice->m_weightPredTable[l][0][0].bPresentFlag)
470
+ if (slice->m_weightPredTable[l][0][0].wtPresent)
471
bLuma = true;
472
- if (slice->m_weightPredTable[l][0][1].bPresentFlag ||
473
- slice->m_weightPredTable[l][0][2].bPresentFlag)
474
+ if (slice->m_weightPredTable[l][0][1].wtPresent ||
475
+ slice->m_weightPredTable[l][0][2].wtPresent)
476
bChroma = true;
477
}
478
479
480
if (m_aborted)
481
return -1;
482
483
- finishFrameStats(outFrame, curEncoder, frameData, m_pocLast);
484
+ if ((m_outputCount + 1) >= m_param->chunkStart)
485
+ finishFrameStats(outFrame, curEncoder, frameData, m_pocLast);
486
487
/* Write RateControl Frame level stats in multipass encodes */
488
if (m_param->rc.bStatWrite)
489
490
}
491
else
492
m_exportedPic = outFrame;
493
-
494
- m_numDelayedPic--;
495
+
496
+ m_outputCount++;
497
+ if (m_param->chunkEnd == m_outputCount)
498
+ m_numDelayedPic = 0;
499
+ else
500
+ m_numDelayedPic--;
501
502
ret = 1;
503
}
504
505
* curEncoder is guaranteed to be idle at this point */
506
if (!pass)
507
frameEnc = m_lookahead->getDecidedPicture();
508
- if (frameEnc && !pass)
509
+ if (frameEnc && !pass && (!m_param->chunkEnd || (m_encodedFrameNum < m_param->chunkEnd)))
510
{
511
if (m_param->analysisMultiPassRefine || m_param->analysisMultiPassDistortion)
512
{
513
- allocAnalysis2Pass(&frameEnc->m_analysis2Pass, frameEnc->m_lowres.sliceType);
514
- frameEnc->m_analysis2Pass.poc = frameEnc->m_poc;
515
+ uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
516
+ uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
517
+ frameEnc->m_analysisData.numCUsInFrame = widthInCU * heightInCU;
518
+ frameEnc->m_analysisData.numPartitions = m_param->num4x4Partitions;
519
+ x265_alloc_analysis_data(m_param, &frameEnc->m_analysisData);
520
+ frameEnc->m_analysisData.poc = frameEnc->m_poc;
521
if (m_param->rc.bStatRead)
522
- readAnalysis2PassFile(&frameEnc->m_analysis2Pass, frameEnc->m_poc, frameEnc->m_lowres.sliceType);
523
+ readAnalysisFile(&frameEnc->m_analysisData, frameEnc->m_poc, frameEnc->m_lowres.sliceType);
524
}
525
526
if (frameEnc->m_reconfigureRc && m_reconfigureRc)
527
528
if (m_param->analysisLoad && m_param->bDisableLookahead)
529
{
530
frameEnc->m_dts = frameEnc->m_analysisData.lookahead.dts;
531
+ frameEnc->m_reorderedPts = frameEnc->m_analysisData.lookahead.reorderedPts;
532
if (m_rateControl->m_isVbv)
533
{
534
for (uint32_t index = 0; index < frameEnc->m_analysisData.numCuInHeight; index++)
535
536
frameEnc->m_encData->m_slice->m_iNumRPSInSPS = m_sps.spsrpsNum;
537
538
curEncoder->m_rce.encodeOrder = frameEnc->m_encodeOrder = m_encodedFrameNum++;
539
+
540
if (!m_param->analysisLoad || !m_param->bDisableLookahead)
541
{
542
if (m_bframeDelay)
543
544
analysis->numCUsInFrame = numCUsInFrame;
545
analysis->numCuInHeight = heightInCU;
546
analysis->numPartitions = m_param->num4x4Partitions;
547
- allocAnalysis(analysis);
548
+ x265_alloc_analysis_data(m_param, analysis);
549
}
550
/* determine references, setup RPS, etc */
551
m_dpb->prepareEncode(frameEnc);
552
553
{
554
const int picOrderCntLSB = slice->m_poc - slice->m_lastIDR;
555
556
- frameStats->encoderOrder = m_outputCount++;
557
+ frameStats->encoderOrder = m_outputCount;
558
frameStats->sliceType = c;
559
frameStats->poc = picOrderCntLSB;
560
frameStats->qp = curEncData.m_avgQpAq;
561
562
if (m_param->csvLogLevel >= 2)
563
frameStats->ipCostRatio = curFrame->m_lowres.ipCostRatio;
564
frameStats->bufferFill = m_rateControl->m_bufferFillActual;
565
+ frameStats->bufferFillFinal = m_rateControl->m_bufferFillFinal;
566
frameStats->frameLatency = inPoc - poc;
567
if (m_param->rc.rateControlMode == X265_RC_CRF)
568
frameStats->rateFactor = curEncData.m_rateFactor;
569
570
#define ELAPSED_MSEC(start, end) (((double)(end) - (start)) / 1000)
571
if (m_param->csvLogLevel >= 2)
572
{
573
+#if ENABLE_LIBVMAF
574
+ frameStats->vmafFrameScore = curFrame->m_fencPic->m_vmafScore;
575
+#endif
576
frameStats->decideWaitTime = ELAPSED_MSEC(0, curEncoder->m_slicetypeWaitTime);
577
frameStats->row0WaitTime = ELAPSED_MSEC(curEncoder->m_startCompressTime, curEncoder->m_row0WaitTime);
578
frameStats->wallTime = ELAPSED_MSEC(curEncoder->m_row0WaitTime, curEncoder->m_endCompressTime);
579
580
list.serialize(NAL_UNIT_SPS, bs);
581
582
bs.resetBits();
583
- sbacCoder.codePPS( m_pps, (m_param->maxSlices <= 1), m_iPPSQpMinus26);
584
+ sbacCoder.codePPS(m_pps, (m_param->maxSlices <= 1), m_iPPSQpMinus26);
585
bs.writeByteAlignment();
586
list.serialize(NAL_UNIT_PPS, bs);
587
588
+ if (m_param->bSingleSeiNal)
589
+ bs.resetBits();
590
+
591
if (m_param->bEmitHDRSEI)
592
{
593
SEIContentLightLevel cllsei;
594
cllsei.max_content_light_level = m_param->maxCLL;
595
cllsei.max_pic_average_light_level = m_param->maxFALL;
596
- bs.resetBits();
597
- cllsei.write(bs, m_sps);
598
- bs.writeByteAlignment();
599
- list.serialize(NAL_UNIT_PREFIX_SEI, bs);
600
+ cllsei.writeSEImessages(bs, m_sps, NAL_UNIT_PREFIX_SEI, list, m_param->bSingleSeiNal);
601
602
if (m_param->masteringDisplayColorVolume)
603
{
604
SEIMasteringDisplayColorVolume mdsei;
605
if (mdsei.parse(m_param->masteringDisplayColorVolume))
606
- {
607
- bs.resetBits();
608
- mdsei.write(bs, m_sps);
609
- bs.writeByteAlignment();
610
- list.serialize(NAL_UNIT_PREFIX_SEI, bs);
611
- }
612
+ mdsei.writeSEImessages(bs, m_sps, NAL_UNIT_PREFIX_SEI, list, m_param->bSingleSeiNal);
613
else
614
x265_log(m_param, X265_LOG_WARNING, "unable to parse mastering display color volume info\n");
615
}
616
617
if (opts)
618
{
619
char *buffer = X265_MALLOC(char, strlen(opts) + strlen(PFX(version_str)) +
620
- strlen(PFX(build_info_str)) + 200);
621
+ strlen(PFX(build_info_str)) + 200);
622
if (buffer)
623
{
624
sprintf(buffer, "x265 (build %d) - %s:%s - H.265/HEVC codec - "
625
- "Copyright 2013-2018 (c) Multicoreware, Inc - "
626
- "http://x265.org - options: %s",
627
- X265_BUILD, PFX(version_str), PFX(build_info_str), opts);
628
-
629
- bs.resetBits();
630
+ "Copyright 2013-2018 (c) Multicoreware, Inc - "
631
+ "http://x265.org - options: %s",
632
+ X265_BUILD, PFX(version_str), PFX(build_info_str), opts);
633
+
634
SEIuserDataUnregistered idsei;
635
idsei.m_userData = (uint8_t*)buffer;
636
idsei.setSize((uint32_t)strlen(buffer));
637
- idsei.write(bs, m_sps);
638
- bs.writeByteAlignment();
639
- list.serialize(NAL_UNIT_PREFIX_SEI, bs);
640
+ idsei.writeSEImessages(bs, m_sps, NAL_UNIT_PREFIX_SEI, list, m_param->bSingleSeiNal);
641
642
X265_FREE(buffer);
643
}
644
645
SEIActiveParameterSets sei;
646
sei.m_selfContainedCvsFlag = true;
647
sei.m_noParamSetUpdateFlag = true;
648
-
649
- bs.resetBits();
650
- sei.write(bs, m_sps);
651
- bs.writeByteAlignment();
652
- list.serialize(NAL_UNIT_PREFIX_SEI, bs);
653
+ sei.writeSEImessages(bs, m_sps, NAL_UNIT_PREFIX_SEI, list, m_param->bSingleSeiNal);
654
}
655
}
656
657
658
vui.defaultDisplayWindow.bottomOffset = m_param->vui.defDispWinBottomOffset;
659
vui.defaultDisplayWindow.leftOffset = m_param->vui.defDispWinLeftOffset;
660
661
- vui.frameFieldInfoPresentFlag = !!m_param->interlaceMode;
662
+ vui.frameFieldInfoPresentFlag = !!m_param->interlaceMode || (m_param->pictureStructure >= 0);
663
vui.fieldSeqFlag = !!m_param->interlaceMode;
664
665
vui.hrdParametersPresentFlag = m_param->bEmitHRDSEI;
666
667
void Encoder::initPPS(PPS *pps)
668
{
669
bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
670
+ bool bEnableDistOffset = m_param->analysisMultiPassDistortion && m_param->rc.bStatRead;
671
672
if (!m_param->bLossless && (m_param->rc.aqMode || bIsVbv || m_param->bAQMotion))
673
{
674
675
pps->maxCuDQPDepth = g_log2Size[m_param->maxCUSize] - g_log2Size[m_param->rc.qgSize];
676
X265_CHECK(pps->maxCuDQPDepth <= 3, "max CU DQP depth cannot be greater than 3\n");
677
}
678
+ else if (!m_param->bLossless && bEnableDistOffset)
679
+ {
680
+ pps->bUseDQP = true;
681
+ pps->maxCuDQPDepth = 0;
682
+ }
683
else
684
{
685
pps->bUseDQP = false;
686
687
{
688
p->scaleFactor = 0;
689
}
690
- else if ((!p->analysisLoad && !p->analysisSave) || p->analysisReuseLevel < 10)
691
+ else if ((!p->analysisLoad && !p->analysisSave) || (p->analysisReuseLevel > 6 && p->analysisReuseLevel != 10))
692
{
693
- x265_log(p, X265_LOG_WARNING, "Input scaling works with analysis load/save, analysis-reuse-level 10. Disabling scale-factor.\n");
694
+ x265_log(p, X265_LOG_WARNING, "Input scaling works with analysis load/save and analysis-reuse-level 1-6 and 10. Disabling scale-factor.\n");
695
p->scaleFactor = 0;
696
}
697
}
698
699
if (p->intraRefine)
700
{
701
- if (!p->analysisLoad || p->analysisReuseLevel < 10 || !p->scaleFactor)
702
+ if (!p->analysisLoad || p->analysisReuseLevel < 10)
703
{
704
- x265_log(p, X265_LOG_WARNING, "Intra refinement requires analysis load, analysis-reuse-level 10, scale factor. Disabling intra refine.\n");
705
+ x265_log(p, X265_LOG_WARNING, "Intra refinement requires analysis load, analysis-reuse-level 10. Disabling intra refine.\n");
706
p->intraRefine = 0;
707
}
708
}
709
710
if (p->interRefine)
711
{
712
- if (!p->analysisLoad || p->analysisReuseLevel < 10 || !p->scaleFactor)
713
+ if (!p->analysisLoad || p->analysisReuseLevel < 10)
714
+ {
715
+ x265_log(p, X265_LOG_WARNING, "Inter refinement requires analysis load, analysis-reuse-level 10. Disabling inter refine.\n");
716
+ p->interRefine = 0;
717
+ }
718
+ }
719
+
720
+ if (p->bDynamicRefine)
721
+ {
722
+ if (!p->analysisLoad || p->analysisReuseLevel < 10)
723
+ {
724
+ x265_log(p, X265_LOG_WARNING, "Dynamic refinement requires analysis load, analysis-reuse-level 10. Disabling dynamic refine.\n");
725
+ p->bDynamicRefine = 0;
726
+ }
727
+ if (p->interRefine)
728
{
729
- x265_log(p, X265_LOG_WARNING, "Inter refinement requires analysis load, analysis-reuse-level 10, scale factor. Disabling inter refine.\n");
730
+ x265_log(p, X265_LOG_WARNING, "Inter refine cannot be used with dynamic refine. Disabling refine-inter.\n");
731
p->interRefine = 0;
732
}
733
}
734
+ if (p->scaleFactor && p->analysisLoad && !p->interRefine && !p->bDynamicRefine && p->analysisReuseLevel == 10)
735
+ {
736
+ x265_log(p, X265_LOG_WARNING, "Inter refinement 0 is not supported with scaling and analysis-reuse-level=10. Enabling refine-inter 1.\n");
737
+ p->interRefine = 1;
738
+ }
739
740
- if (p->limitTU && p->interRefine)
741
+ if (p->limitTU && (p->interRefine || p->bDynamicRefine))
742
{
743
x265_log(p, X265_LOG_WARNING, "Inter refinement does not support limitTU. Disabling limitTU.\n");
744
p->limitTU = 0;
745
746
747
if (p->mvRefine)
748
{
749
- if (!p->analysisLoad || p->analysisReuseLevel < 10 || !p->scaleFactor)
750
+ if (!p->analysisLoad || p->analysisReuseLevel < 10)
751
{
752
- x265_log(p, X265_LOG_WARNING, "MV refinement requires analysis load, analysis-reuse-level 10, scale factor. Disabling MV refine.\n");
753
+ x265_log(p, X265_LOG_WARNING, "MV refinement requires analysis load, analysis-reuse-level 10. Disabling MV refine.\n");
754
p->mvRefine = 0;
755
}
756
else if (p->interRefine >= 2)
757
758
p->bDistributeMotionEstimation = p->bDistributeModeAnalysis = 0;
759
}
760
761
- if (p->rc.bEnableGrain)
762
- {
763
- x265_log(p, X265_LOG_WARNING, "Rc Grain removes qp fluctuations caused by aq/cutree, Disabling aq,cu-tree\n");
764
- p->rc.cuTree = 0;
765
- p->rc.aqMode = 0;
766
- }
767
-
768
if (p->bDistributeModeAnalysis && (p->limitReferences >> 1) && 1)
769
{
770
x265_log(p, X265_LOG_WARNING, "Limit reference options 2 and 3 are not supported with pmode. Disabling limit reference\n");
771
772
p->radl = 0;
773
x265_log(p, X265_LOG_WARNING, "Radl requires fixed gop-length (keyint == min-keyint). Disabling radl.\n");
774
}
775
-}
776
-
777
-void Encoder::allocAnalysis(x265_analysis_data* analysis)
778
-{
779
- X265_CHECK(analysis->sliceType, "invalid slice type\n");
780
- analysis->interData = analysis->intraData = NULL;
781
- if (m_param->bDisableLookahead && m_rateControl->m_isVbv)
782
- {
783
- CHECKED_MALLOC_ZERO(analysis->lookahead.intraSatdForVbv, uint32_t, analysis->numCuInHeight);
784
- CHECKED_MALLOC_ZERO(analysis->lookahead.satdForVbv, uint32_t, analysis->numCuInHeight);
785
- CHECKED_MALLOC_ZERO(analysis->lookahead.intraVbvCost, uint32_t, analysis->numCUsInFrame);
786
- CHECKED_MALLOC_ZERO(analysis->lookahead.vbvCost, uint32_t, analysis->numCUsInFrame);
787
- }
788
- if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
789
- {
790
- if (m_param->analysisReuseLevel < 2)
791
- return;
792
-
793
- analysis_intra_data *intraData = (analysis_intra_data*)analysis->intraData;
794
- CHECKED_MALLOC_ZERO(intraData, analysis_intra_data, 1);
795
- CHECKED_MALLOC(intraData->depth, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
796
- CHECKED_MALLOC(intraData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
797
- CHECKED_MALLOC(intraData->partSizes, char, analysis->numPartitions * analysis->numCUsInFrame);
798
- CHECKED_MALLOC(intraData->chromaModes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
799
- analysis->intraData = intraData;
800
- }
801
- else
802
- {
803
- int numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2;
804
- uint32_t numPlanes = m_param->internalCsp == X265_CSP_I400 ? 1 : 3;
805
- if (!(m_param->bMVType == AVC_INFO))
806
- CHECKED_MALLOC_ZERO(analysis->wt, WeightParam, numPlanes * numDir);
807
- if (m_param->analysisReuseLevel < 2)
808
- return;
809
-
810
- analysis_inter_data *interData = (analysis_inter_data*)analysis->interData;
811
- CHECKED_MALLOC_ZERO(interData, analysis_inter_data, 1);
812
- CHECKED_MALLOC(interData->depth, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
813
- CHECKED_MALLOC(interData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
814
- if (m_param->analysisReuseLevel > 4)
815
- {
816
- CHECKED_MALLOC(interData->partSize, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
817
- CHECKED_MALLOC(interData->mergeFlag, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
818
- }
819
-
820
- if (m_param->analysisReuseLevel >= 7)
821
- {
822
- CHECKED_MALLOC(interData->interDir, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
823
- CHECKED_MALLOC(interData->sadCost, int64_t, analysis->numPartitions * analysis->numCUsInFrame);
824
- for (int dir = 0; dir < numDir; dir++)
825
- {
826
- CHECKED_MALLOC(interData->mvpIdx[dir], uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
827
- CHECKED_MALLOC(interData->refIdx[dir], int8_t, analysis->numPartitions * analysis->numCUsInFrame);
828
- CHECKED_MALLOC(interData->mv[dir], MV, analysis->numPartitions * analysis->numCUsInFrame);
829
- CHECKED_MALLOC_ZERO(analysis->modeFlag[dir], uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
830
- }
831
- /* Allocate intra in inter */
832
- if (analysis->sliceType == X265_TYPE_P || m_param->bIntraInBFrames)
833
- {
834
- analysis_intra_data *intraData = (analysis_intra_data*)analysis->intraData;
835
- CHECKED_MALLOC_ZERO(intraData, analysis_intra_data, 1);
836
- CHECKED_MALLOC(intraData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
837
- CHECKED_MALLOC(intraData->chromaModes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
838
- analysis->intraData = intraData;
839
- }
840
- }
841
- else
842
- CHECKED_MALLOC_ZERO(interData->ref, int32_t, analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir);
843
-
844
- analysis->interData = interData;
845
- }
846
- return;
847
-
848
-fail:
849
- freeAnalysis(analysis);
850
- m_aborted = true;
851
-}
852
-void Encoder::freeAnalysis(x265_analysis_data* analysis)
853
-{
854
- if (m_param->bDisableLookahead && m_rateControl->m_isVbv)
855
- {
856
- X265_FREE(analysis->lookahead.satdForVbv);
857
- X265_FREE(analysis->lookahead.intraSatdForVbv);
858
- X265_FREE(analysis->lookahead.vbvCost);
859
- X265_FREE(analysis->lookahead.intraVbvCost);
860
- }
861
- /* Early exit freeing weights alone if level is 1 (when there is no analysis inter/intra) */
862
- if (analysis->sliceType > X265_TYPE_I && analysis->wt && !(m_param->bMVType == AVC_INFO))
863
- X265_FREE(analysis->wt);
864
- if (m_param->analysisReuseLevel < 2)
865
- return;
866
867
- if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
868
+ if ((p->chunkStart || p->chunkEnd) && p->bOpenGOP)
869
{
870
- if (analysis->intraData)
871
- {
872
- X265_FREE(((analysis_intra_data*)analysis->intraData)->depth);
873
- X265_FREE(((analysis_intra_data*)analysis->intraData)->modes);
874
- X265_FREE(((analysis_intra_data*)analysis->intraData)->partSizes);
875
- X265_FREE(((analysis_intra_data*)analysis->intraData)->chromaModes);
876
- X265_FREE(analysis->intraData);
877
- analysis->intraData = NULL;
878
- }
879
+ p->chunkStart = p->chunkEnd = 0;
880
+ x265_log(p, X265_LOG_WARNING, "Chunking requires closed gop structure. Disabling chunking.\n");
881
}
882
- else
883
- {
884
- if (analysis->intraData)
885
- {
886
- X265_FREE(((analysis_intra_data*)analysis->intraData)->modes);
887
- X265_FREE(((analysis_intra_data*)analysis->intraData)->chromaModes);
888
- X265_FREE(analysis->intraData);
889
- analysis->intraData = NULL;
890
- }
891
- if (analysis->interData)
892
- {
893
- X265_FREE(((analysis_inter_data*)analysis->interData)->depth);
894
- X265_FREE(((analysis_inter_data*)analysis->interData)->modes);
895
- if (m_param->analysisReuseLevel > 4)
896
- {
897
- X265_FREE(((analysis_inter_data*)analysis->interData)->mergeFlag);
898
- X265_FREE(((analysis_inter_data*)analysis->interData)->partSize);
899
- }
900
- if (m_param->analysisReuseLevel >= 7)
901
- {
902
- X265_FREE(((analysis_inter_data*)analysis->interData)->interDir);
903
- X265_FREE(((analysis_inter_data*)analysis->interData)->sadCost);
904
- int numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2;
905
- for (int dir = 0; dir < numDir; dir++)
906
- {
907
- X265_FREE(((analysis_inter_data*)analysis->interData)->mvpIdx[dir]);
908
- X265_FREE(((analysis_inter_data*)analysis->interData)->refIdx[dir]);
909
- X265_FREE(((analysis_inter_data*)analysis->interData)->mv[dir]);
910
- if (analysis->modeFlag[dir] != NULL)
911
- {
912
- X265_FREE(analysis->modeFlag[dir]);
913
- analysis->modeFlag[dir] = NULL;
914
- }
915
- }
916
- }
917
- else
918
- X265_FREE(((analysis_inter_data*)analysis->interData)->ref);
919
-
920
- X265_FREE(analysis->interData);
921
- analysis->interData = NULL;
922
- }
923
- }
924
-}
925
-
926
-void Encoder::allocAnalysis2Pass(x265_analysis_2Pass* analysis, int sliceType)
927
-{
928
- analysis->analysisFramedata = NULL;
929
- analysis2PassFrameData *analysisFrameData = (analysis2PassFrameData*)analysis->analysisFramedata;
930
- uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
931
- uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
932
933
- uint32_t numCUsInFrame = widthInCU * heightInCU;
934
- CHECKED_MALLOC_ZERO(analysisFrameData, analysis2PassFrameData, 1);
935
- CHECKED_MALLOC_ZERO(analysisFrameData->depth, uint8_t, m_param->num4x4Partitions * numCUsInFrame);
936
- CHECKED_MALLOC_ZERO(analysisFrameData->distortion, sse_t, m_param->num4x4Partitions * numCUsInFrame);
937
- if (m_param->rc.bStatRead)
938
- {
939
- CHECKED_MALLOC_ZERO(analysisFrameData->ctuDistortion, sse_t, numCUsInFrame);
940
- CHECKED_MALLOC_ZERO(analysisFrameData->scaledDistortion, double, numCUsInFrame);
941
- CHECKED_MALLOC_ZERO(analysisFrameData->offset, double, numCUsInFrame);
942
- CHECKED_MALLOC_ZERO(analysisFrameData->threshold, double, numCUsInFrame);
943
- }
944
- if (!IS_X265_TYPE_I(sliceType))
945
+ if (p->chunkEnd < p->chunkStart)
946
{
947
- CHECKED_MALLOC_ZERO(analysisFrameData->m_mv[0], MV, m_param->num4x4Partitions * numCUsInFrame);
948
- CHECKED_MALLOC_ZERO(analysisFrameData->m_mv[1], MV, m_param->num4x4Partitions * numCUsInFrame);
949
- CHECKED_MALLOC_ZERO(analysisFrameData->mvpIdx[0], int, m_param->num4x4Partitions * numCUsInFrame);
950
- CHECKED_MALLOC_ZERO(analysisFrameData->mvpIdx[1], int, m_param->num4x4Partitions * numCUsInFrame);
951
- CHECKED_MALLOC_ZERO(analysisFrameData->ref[0], int32_t, m_param->num4x4Partitions * numCUsInFrame);
952
- CHECKED_MALLOC_ZERO(analysisFrameData->ref[1], int32_t, m_param->num4x4Partitions * numCUsInFrame);
953
- CHECKED_MALLOC(analysisFrameData->modes, uint8_t, m_param->num4x4Partitions * numCUsInFrame);
954
+ p->chunkStart = p->chunkEnd = 0;
955
+ x265_log(p, X265_LOG_WARNING, "chunk-end cannot be less than chunk-start. Disabling chunking.\n");
956
}
957
958
- analysis->analysisFramedata = analysisFrameData;
959
-
960
- return;
961
-
962
-fail:
963
- freeAnalysis2Pass(analysis, sliceType);
964
- m_aborted = true;
965
-}
966
-
967
-void Encoder::freeAnalysis2Pass(x265_analysis_2Pass* analysis, int sliceType)
968
-{
969
- if (analysis->analysisFramedata)
970
- {
971
- X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->depth);
972
- X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->distortion);
973
- if (m_param->rc.bStatRead)
974
- {
975
- X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->ctuDistortion);
976
- X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->scaledDistortion);
977
- X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->offset);
978
- X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->threshold);
979
- }
980
- if (!IS_X265_TYPE_I(sliceType))
981
- {
982
- X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->m_mv[0]);
983
- X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->m_mv[1]);
984
- X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->mvpIdx[0]);
985
- X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->mvpIdx[1]);
986
- X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->ref[0]);
987
- X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->ref[1]);
988
- X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->modes);
989
- }
990
- X265_FREE(analysis->analysisFramedata);
991
- }
992
}
993
994
-void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x265_picture* picIn)
995
+void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x265_picture* picIn, int paramBytes)
996
{
997
-
998
#define X265_FREAD(val, size, readSize, fileOffset, src)\
999
if (!m_param->bUseAnalysisFile)\
1000
- {\
1001
+ {\
1002
memcpy(val, src, (size * readSize));\
1003
- }\
1004
- else if (fread(val, size, readSize, fileOffset) != readSize)\
1005
+ }\
1006
+ else if (fread(val, size, readSize, fileOffset) != readSize)\
1007
{\
1008
x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data\n");\
1009
- freeAnalysis(analysis);\
1010
+ x265_free_analysis_data(m_param, analysis);\
1011
m_aborted = true;\
1012
return;\
1013
}\
1014
1015
static uint64_t totalConsumedBytes = 0;
1016
uint32_t depthBytes = 0;
1017
if (m_param->bUseAnalysisFile)
1018
- fseeko(m_analysisFileIn, totalConsumedBytes, SEEK_SET);
1019
+ fseeko(m_analysisFileIn, totalConsumedBytes + paramBytes, SEEK_SET);
1020
const x265_analysis_data *picData = &(picIn->analysisData);
1021
- analysis_intra_data *intraPic = (analysis_intra_data *)picData->intraData;
1022
- analysis_inter_data *interPic = (analysis_inter_data *)picData->interData;
1023
+ x265_analysis_intra_data *intraPic = picData->intraData;
1024
+ x265_analysis_inter_data *interPic = picData->interData;
1025
1026
int poc; uint32_t frameRecordSize;
1027
X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->frameRecordSize));
1028
1029
while (poc != curPoc && !feof(m_analysisFileIn))
1030
{
1031
currentOffset += frameRecordSize;
1032
- fseeko(m_analysisFileIn, currentOffset, SEEK_SET);
1033
+ fseeko(m_analysisFileIn, currentOffset + paramBytes, SEEK_SET);
1034
X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->frameRecordSize));
1035
X265_FREAD(&depthBytes, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->depthBytes));
1036
X265_FREAD(&poc, sizeof(int), 1, m_analysisFileIn, &(picData->poc));
1037
1038
if (poc != curPoc || feof(m_analysisFileIn))
1039
{
1040
x265_log(NULL, X265_LOG_WARNING, "Error reading analysis data: Cannot find POC %d\n", curPoc);
1041
- freeAnalysis(analysis);
1042
+ x265_free_analysis_data(m_param, analysis);
1043
return;
1044
}
1045
}
1046
1047
if (m_param->scaleFactor)
1048
analysis->numPartitions *= factor;
1049
/* Memory is allocated for inter and intra analysis data based on the slicetype */
1050
- allocAnalysis(analysis);
1051
+ x265_alloc_analysis_data(m_param, analysis);
1052
if (m_param->bDisableLookahead && m_rateControl->m_isVbv)
1053
{
1054
+ size_t vbvCount = m_param->lookaheadDepth + m_param->bframes + 2;
1055
X265_FREAD(analysis->lookahead.intraVbvCost, sizeof(uint32_t), analysis->numCUsInFrame, m_analysisFileIn, picData->lookahead.intraVbvCost);
1056
X265_FREAD(analysis->lookahead.vbvCost, sizeof(uint32_t), analysis->numCUsInFrame, m_analysisFileIn, picData->lookahead.vbvCost);
1057
X265_FREAD(analysis->lookahead.satdForVbv, sizeof(uint32_t), analysis->numCuInHeight, m_analysisFileIn, picData->lookahead.satdForVbv);
1058
X265_FREAD(analysis->lookahead.intraSatdForVbv, sizeof(uint32_t), analysis->numCuInHeight, m_analysisFileIn, picData->lookahead.intraSatdForVbv);
1059
+ X265_FREAD(analysis->lookahead.plannedSatd, sizeof(int64_t), vbvCount, m_analysisFileIn, picData->lookahead.plannedSatd);
1060
+
1061
+ if (m_param->scaleFactor)
1062
+ {
1063
+ for (uint64_t index = 0; index < vbvCount; index++)
1064
+ analysis->lookahead.plannedSatd[index] *= factor;
1065
+
1066
+ for (uint32_t i = 0; i < analysis->numCuInHeight; i++)
1067
+ {
1068
+ analysis->lookahead.satdForVbv[i] *= factor;
1069
+ analysis->lookahead.intraSatdForVbv[i] *= factor;
1070
+ }
1071
+ for (uint32_t i = 0; i < analysis->numCUsInFrame; i++)
1072
+ {
1073
+ analysis->lookahead.vbvCost[i] *= factor;
1074
+ analysis->lookahead.intraVbvCost[i] *= factor;
1075
+ }
1076
+ }
1077
}
1078
if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
1079
{
1080
1081
if (partSizes[d] == SIZE_NxN)
1082
partSizes[d] = SIZE_2Nx2N;
1083
}
1084
- memset(&((analysis_intra_data *)analysis->intraData)->depth[count], depthBuf[d], bytes);
1085
- memset(&((analysis_intra_data *)analysis->intraData)->chromaModes[count], modeBuf[d], bytes);
1086
- memset(&((analysis_intra_data *)analysis->intraData)->partSizes[count], partSizes[d], bytes);
1087
+ memset(&(analysis->intraData)->depth[count], depthBuf[d], bytes);
1088
+ memset(&(analysis->intraData)->chromaModes[count], modeBuf[d], bytes);
1089
+ memset(&(analysis->intraData)->partSizes[count], partSizes[d], bytes);
1090
count += bytes;
1091
}
1092
1093
if (!m_param->scaleFactor)
1094
{
1095
- X265_FREAD(((analysis_intra_data *)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileIn, intraPic->modes);
1096
+ X265_FREAD((analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileIn, intraPic->modes);
1097
}
1098
else
1099
{
1100
uint8_t *tempLumaBuf = X265_MALLOC(uint8_t, analysis->numCUsInFrame * scaledNumPartition);
1101
X265_FREAD(tempLumaBuf, sizeof(uint8_t), analysis->numCUsInFrame * scaledNumPartition, m_analysisFileIn, intraPic->modes);
1102
for (uint32_t ctu32Idx = 0, cnt = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++, cnt += factor)
1103
- memset(&((analysis_intra_data *)analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor);
1104
+ memset(&(analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor);
1105
X265_FREE(tempLumaBuf);
1106
}
1107
X265_FREE(tempBuf);
1108
1109
{
1110
int bytes = analysis->numPartitions >> (depthBuf[d] * 2);
1111
if (m_param->scaleFactor && modeBuf[d] == MODE_INTRA && depthBuf[d] == 0)
1112
- depthBuf[d] = 1;
1113
- memset(&((analysis_inter_data *)analysis->interData)->depth[count], depthBuf[d], bytes);
1114
- memset(&((analysis_inter_data *)analysis->interData)->modes[count], modeBuf[d], bytes);
1115
+ depthBuf[d] = 1;
1116
+ memset(&(analysis->interData)->depth[count], depthBuf[d], bytes);
1117
+ memset(&(analysis->interData)->modes[count], modeBuf[d], bytes);
1118
if (m_param->analysisReuseLevel > 4)
1119
{
1120
if (m_param->scaleFactor && modeBuf[d] == MODE_INTRA && partSize[d] == SIZE_NxN)
1121
- partSize[d] = SIZE_2Nx2N;
1122
- memset(&((analysis_inter_data *)analysis->interData)->partSize[count], partSize[d], bytes);
1123
+ partSize[d] = SIZE_2Nx2N;
1124
+ memset(&(analysis->interData)->partSize[count], partSize[d], bytes);
1125
int numPU = (modeBuf[d] == MODE_INTRA) ? 1 : nbPartsTable[(int)partSize[d]];
1126
for (int pu = 0; pu < numPU; pu++)
1127
{
1128
if (pu) d++;
1129
- ((analysis_inter_data *)analysis->interData)->mergeFlag[count + pu] = mergeFlag[d];
1130
+ (analysis->interData)->mergeFlag[count + pu] = mergeFlag[d];
1131
if (m_param->analysisReuseLevel == 10)
1132
{
1133
- ((analysis_inter_data *)analysis->interData)->interDir[count + pu] = interDir[d];
1134
+ (analysis->interData)->interDir[count + pu] = interDir[d];
1135
for (uint32_t i = 0; i < numDir; i++)
1136
{
1137
- ((analysis_inter_data *)analysis->interData)->mvpIdx[i][count + pu] = mvpIdx[i][d];
1138
- ((analysis_inter_data *)analysis->interData)->refIdx[i][count + pu] = refIdx[i][d];
1139
+ (analysis->interData)->mvpIdx[i][count + pu] = mvpIdx[i][d];
1140
+ (analysis->interData)->refIdx[i][count + pu] = refIdx[i][d];
1141
if (m_param->scaleFactor)
1142
{
1143
mv[i][d].x *= (int16_t)m_param->scaleFactor;
1144
mv[i][d].y *= (int16_t)m_param->scaleFactor;
1145
}
1146
- memcpy(&((analysis_inter_data *)analysis->interData)->mv[i][count + pu], &mv[i][d], sizeof(MV));
1147
+ memcpy(&(analysis->interData)->mv[i][count + pu], &mv[i][d], sizeof(MV));
1148
}
1149
}
1150
}
1151
if (m_param->analysisReuseLevel == 10 && bIntraInInter)
1152
- memset(&((analysis_intra_data *)analysis->intraData)->chromaModes[count], chromaDir[d], bytes);
1153
+ memset(&(analysis->intraData)->chromaModes[count], chromaDir[d], bytes);
1154
}
1155
count += bytes;
1156
}
1157
1158
{
1159
if (!m_param->scaleFactor)
1160
{
1161
- X265_FREAD(((analysis_intra_data *)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileIn, intraPic->modes);
1162
+ X265_FREAD((analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileIn, intraPic->modes);
1163
}
1164
else
1165
{
1166
uint8_t *tempLumaBuf = X265_MALLOC(uint8_t, analysis->numCUsInFrame * scaledNumPartition);
1167
X265_FREAD(tempLumaBuf, sizeof(uint8_t), analysis->numCUsInFrame * scaledNumPartition, m_analysisFileIn, intraPic->modes);
1168
for (uint32_t ctu32Idx = 0, cnt = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++, cnt += factor)
1169
- memset(&((analysis_intra_data *)analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor);
1170
+ memset(&(analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor);
1171
X265_FREE(tempLumaBuf);
1172
}
1173
}
1174
}
1175
else
1176
- X265_FREAD(((analysis_inter_data *)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFileIn, interPic->ref);
1177
+ X265_FREAD((analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFileIn, interPic->ref);
1178
1179
consumedBytes += frameRecordSize;
1180
if (numDir == 1)
1181
1182
#undef X265_FREAD
1183
}
1184
1185
-void Encoder::readAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, int curPoc, int sliceType)
1186
+void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x265_picture* picIn, int paramBytes, cuLocation cuLoc)
1187
+{
1188
+#define X265_FREAD(val, size, readSize, fileOffset, src)\
1189
+ if (!m_param->bUseAnalysisFile)\
1190
+ {\
1191
+ memcpy(val, src, (size * readSize));\
1192
+ }\
1193
+ else if (fread(val, size, readSize, fileOffset) != readSize)\
1194
+ {\
1195
+ x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data\n");\
1196
+ x265_free_analysis_data(m_param, analysis);\
1197
+ m_aborted = true;\
1198
+ return;\
1199
+ }\
1200
+
1201
+ static uint64_t consumedBytes = 0;
1202
+ static uint64_t totalConsumedBytes = 0;
1203
+ uint32_t depthBytes = 0;
1204
+ if (m_param->bUseAnalysisFile)
1205
+ fseeko(m_analysisFileIn, totalConsumedBytes + paramBytes, SEEK_SET);
1206
+
1207
+ const x265_analysis_data *picData = &(picIn->analysisData);
1208
+ x265_analysis_intra_data *intraPic = picData->intraData;
1209
+ x265_analysis_inter_data *interPic = picData->interData;
1210
+
1211
+ int poc; uint32_t frameRecordSize;
1212
+ X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->frameRecordSize));
1213
+ X265_FREAD(&depthBytes, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->depthBytes));
1214
+ X265_FREAD(&poc, sizeof(int), 1, m_analysisFileIn, &(picData->poc));
1215
+
1216
+ if (m_param->bUseAnalysisFile)
1217
+ {
1218
+ uint64_t currentOffset = totalConsumedBytes;
1219
+
1220
+ /* Seeking to the right frame Record */
1221
+ while (poc != curPoc && !feof(m_analysisFileIn))
1222
+ {
1223
+ currentOffset += frameRecordSize;
1224
+ fseeko(m_analysisFileIn, currentOffset + paramBytes, SEEK_SET);
1225
+ X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->frameRecordSize));
1226
+ X265_FREAD(&depthBytes, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->depthBytes));
1227
+ X265_FREAD(&poc, sizeof(int), 1, m_analysisFileIn, &(picData->poc));
1228
+ }
1229
+ if (poc != curPoc || feof(m_analysisFileIn))
1230
+ {
1231
+ x265_log(NULL, X265_LOG_WARNING, "Error reading analysis data: Cannot find POC %d\n", curPoc);
1232
+ x265_free_analysis_data(m_param, analysis);
1233
+ return;
1234
+ }
1235
+ }
1236
+
1237
+ /* Now arrived at the right frame, read the record */
1238
+ analysis->poc = poc;
1239
+ analysis->frameRecordSize = frameRecordSize;
1240
+ X265_FREAD(&analysis->sliceType, sizeof(int), 1, m_analysisFileIn, &(picData->sliceType));
1241
+ X265_FREAD(&analysis->bScenecut, sizeof(int), 1, m_analysisFileIn, &(picData->bScenecut));
1242
+ X265_FREAD(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFileIn, &(picData->satdCost));
1243
+ X265_FREAD(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFileIn, &(picData->numCUsInFrame));
1244
+ X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFileIn, &(picData->numPartitions));
1245
+ if (m_param->bDisableLookahead)
1246
+ {
1247
+ X265_FREAD(&analysis->numCuInHeight, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->numCuInHeight));
1248
+ X265_FREAD(&analysis->lookahead, sizeof(x265_lookahead_data), 1, m_analysisFileIn, &(picData->lookahead));
1249
+ }
1250
+ int scaledNumPartition = analysis->numPartitions;
1251
+ int factor = 1 << m_param->scaleFactor;
1252
+
1253
+ int numPartitions = analysis->numPartitions;
1254
+ int numCUsInFrame = analysis->numCUsInFrame;
1255
+ int numCuInHeight = analysis->numCuInHeight;
1256
+ /* Allocate memory for scaled resoultion's numPartitions and numCUsInFrame*/
1257
+ analysis->numPartitions = m_param->num4x4Partitions;
1258
+ analysis->numCUsInFrame = cuLoc.heightInCU * cuLoc.widthInCU;
1259
+ analysis->numCuInHeight = cuLoc.heightInCU;
1260
+
1261
+ /* Memory is allocated for inter and intra analysis data based on the slicetype */
1262
+ x265_alloc_analysis_data(m_param, analysis);
1263
+
1264
+ analysis->numPartitions = numPartitions * factor;
1265
+ analysis->numCUsInFrame = numCUsInFrame;
1266
+ analysis->numCuInHeight = numCuInHeight;
1267
+ if (m_param->bDisableLookahead && m_rateControl->m_isVbv)
1268
+ {
1269
+ uint32_t width = analysis->numCUsInFrame / analysis->numCuInHeight;
1270
+ bool skipLastRow = (analysis->numCuInHeight * 2) > cuLoc.heightInCU;
1271
+ bool skipLastCol = (width * 2) > cuLoc.widthInCU;
1272
+ uint32_t *intraVbvCostBuf = NULL, *vbvCostBuf = NULL, *satdForVbvBuf = NULL, *intraSatdForVbvBuf = NULL;
1273
+ intraVbvCostBuf = X265_MALLOC(uint32_t, analysis->numCUsInFrame);
1274
+ vbvCostBuf = X265_MALLOC(uint32_t, analysis->numCUsInFrame);
1275
+ satdForVbvBuf = X265_MALLOC(uint32_t, analysis->numCuInHeight);
1276
+ intraSatdForVbvBuf = X265_MALLOC(uint32_t, analysis->numCuInHeight);
1277
+
1278
+ X265_FREAD(intraVbvCostBuf, sizeof(uint32_t), analysis->numCUsInFrame, m_analysisFileIn, picData->lookahead.intraVbvCost);
1279
+ X265_FREAD(vbvCostBuf, sizeof(uint32_t), analysis->numCUsInFrame, m_analysisFileIn, picData->lookahead.vbvCost);
1280
+ X265_FREAD(satdForVbvBuf, sizeof(uint32_t), analysis->numCuInHeight, m_analysisFileIn, picData->lookahead.satdForVbv);
1281
+ X265_FREAD(intraSatdForVbvBuf, sizeof(uint32_t), analysis->numCuInHeight, m_analysisFileIn, picData->lookahead.intraSatdForVbv);
1282
+
1283
+ int k = 0;
1284
+ for (uint32_t i = 0; i < analysis->numCuInHeight; i++)
1285
+ {
1286
+ analysis->lookahead.satdForVbv[m_param->scaleFactor * i] = satdForVbvBuf[i] * m_param->scaleFactor;
1287
+ analysis->lookahead.intraSatdForVbv[m_param->scaleFactor * i] = intraSatdForVbvBuf[i] * m_param->scaleFactor;
1288
+ if (!(i == (analysis->numCuInHeight - 1) && skipLastRow))
1289
+ {
1290
+ analysis->lookahead.satdForVbv[(m_param->scaleFactor * i) + 1] = satdForVbvBuf[i] * m_param->scaleFactor;
1291
+ analysis->lookahead.intraSatdForVbv[(m_param->scaleFactor * i) + 1] = intraSatdForVbvBuf[i] * m_param->scaleFactor;
1292
+ }
1293
+
1294
+ for (uint32_t j = 0; j < width; j++, k++)
1295
+ {
1296
+ analysis->lookahead.vbvCost[(i * m_param->scaleFactor * cuLoc.widthInCU) + (j * m_param->scaleFactor)] = vbvCostBuf[k];
1297
+ analysis->lookahead.intraVbvCost[(i * m_param->scaleFactor * cuLoc.widthInCU) + (j * m_param->scaleFactor)] = intraVbvCostBuf[k];
1298
+
1299
+ if (!(j == (width - 1) && skipLastCol))
1300
+ {
1301
+ analysis->lookahead.vbvCost[(i * m_param->scaleFactor * cuLoc.widthInCU) + (j * m_param->scaleFactor) + 1] = vbvCostBuf[k];
1302
+ analysis->lookahead.intraVbvCost[(i * m_param->scaleFactor * cuLoc.widthInCU) + (j * m_param->scaleFactor) + 1] = intraVbvCostBuf[k];
1303
+ }
1304
+ if (!(i == (analysis->numCuInHeight - 1) && skipLastRow))
1305
+ {
1306
+ analysis->lookahead.vbvCost[(i * m_param->scaleFactor * cuLoc.widthInCU) + cuLoc.widthInCU + (j * m_param->scaleFactor)] = vbvCostBuf[k];
1307
+ analysis->lookahead.intraVbvCost[(i * m_param->scaleFactor * cuLoc.widthInCU) + cuLoc.widthInCU + (j * m_param->scaleFactor)] = intraVbvCostBuf[k];
1308
+ if (!(j == (width - 1) && skipLastCol))
1309
+ {
1310
+ analysis->lookahead.vbvCost[(i * m_param->scaleFactor * cuLoc.widthInCU) + cuLoc.widthInCU + (j * m_param->scaleFactor) + 1] = vbvCostBuf[k];
1311
+ analysis->lookahead.intraVbvCost[(i * m_param->scaleFactor * cuLoc.widthInCU) + cuLoc.widthInCU + (j * m_param->scaleFactor) + 1] = intraVbvCostBuf[k];
1312
+ }
1313
+ }
1314
+ }
1315
+ }
1316
+ X265_FREE(satdForVbvBuf);
1317
+ X265_FREE(intraSatdForVbvBuf);
1318
+ X265_FREE(intraVbvCostBuf);
1319
+ X265_FREE(vbvCostBuf);
1320
+ }
1321
+
1322
+ if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
1323
+ {
1324
+ if (m_param->analysisReuseLevel < 2)
1325
+ return;
1326
+
1327
+ uint8_t *tempBuf = NULL, *depthBuf = NULL, *modeBuf = NULL, *partSizes = NULL;
1328
+
1329
+ tempBuf = X265_MALLOC(uint8_t, depthBytes * 3);
1330
+ depthBuf = tempBuf;
1331
+ modeBuf = tempBuf + depthBytes;
1332
+ partSizes = tempBuf + 2 * depthBytes;
1333
+
1334
+ X265_FREAD(depthBuf, sizeof(uint8_t), depthBytes, m_analysisFileIn, intraPic->depth);
1335
+ X265_FREAD(modeBuf, sizeof(uint8_t), depthBytes, m_analysisFileIn, intraPic->chromaModes);
1336
+ X265_FREAD(partSizes, sizeof(uint8_t), depthBytes, m_analysisFileIn, intraPic->partSizes);
1337
+
1338
+ uint32_t count = 0;
1339
+ for (uint32_t d = 0; d < depthBytes; d++)
1340
+ {
1341
+ int bytes = analysis->numPartitions >> (depthBuf[d] * 2);
1342
+ int numCTUCopied = 1;
1343
+ if (!depthBuf[d]) //copy data of one 64x64 to four scaled 64x64 CTUs.
1344
+ {
1345
+ bytes /= 4;
1346
+ numCTUCopied = 4;
1347
+ }
1348
+ if (partSizes[d] == SIZE_NxN)
1349
+ partSizes[d] = SIZE_2Nx2N;
1350
+ if ((depthBuf[d] > 1 && m_param->maxCUSize == 64) || (depthBuf[d] && m_param->maxCUSize != 64))
1351
+ depthBuf[d]--;
1352
+
1353
+ for (int numCTU = 0; numCTU < numCTUCopied; numCTU++)
1354
+ {
1355
+ memset(&(analysis->intraData)->depth[count], depthBuf[d], bytes);
1356
+ memset(&(analysis->intraData)->chromaModes[count], modeBuf[d], bytes);
1357
+ memset(&(analysis->intraData)->partSizes[count], partSizes[d], bytes);
1358
+ count += bytes;
1359
+ d += getCUIndex(&cuLoc, &count, bytes, 1);
1360
+ }
1361
+ }
1362
+
1363
+ cuLoc.evenRowIndex = 0;
1364
+ cuLoc.oddRowIndex = m_param->num4x4Partitions * cuLoc.widthInCU;
1365
+ cuLoc.switchCondition = 0;
1366
+ uint8_t *tempLumaBuf = X265_MALLOC(uint8_t, analysis->numCUsInFrame * scaledNumPartition);
1367
+ X265_FREAD(tempLumaBuf, sizeof(uint8_t), analysis->numCUsInFrame * scaledNumPartition, m_analysisFileIn, intraPic->modes);
1368
+ uint32_t cnt = 0;
1369
+ for (uint32_t ctu32Idx = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++)
1370
+ {
1371
+ memset(&(analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor);
1372
+ cnt += factor;
1373
+ ctu32Idx += getCUIndex(&cuLoc, &cnt, factor, 0);
1374
+ }
1375
+ X265_FREE(tempLumaBuf);
1376
+ X265_FREE(tempBuf);
1377
+ consumedBytes += frameRecordSize;
1378
+ }
1379
+
1380
+ else
1381
+ {
1382
+ uint32_t numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2;
1383
+ uint32_t numPlanes = m_param->internalCsp == X265_CSP_I400 ? 1 : 3;
1384
+ X265_FREAD((WeightParam*)analysis->wt, sizeof(WeightParam), numPlanes * numDir, m_analysisFileIn, (picIn->analysisData.wt));
1385
+ if (m_param->analysisReuseLevel < 2)
1386
+ return;
1387
+
1388
+ uint8_t *tempBuf = NULL, *depthBuf = NULL, *modeBuf = NULL, *partSize = NULL, *mergeFlag = NULL;
1389
+ uint8_t *interDir = NULL, *chromaDir = NULL, *mvpIdx[2];
1390
+ MV* mv[2];
1391
+ int8_t* refIdx[2];
1392
+
1393
+ int numBuf = m_param->analysisReuseLevel > 4 ? 4 : 2;
1394
+ bool bIntraInInter = false;
1395
+ if (m_param->analysisReuseLevel == 10)
1396
+ {
1397
+ numBuf++;
1398
+ bIntraInInter = (analysis->sliceType == X265_TYPE_P || m_param->bIntraInBFrames);
1399
+ if (bIntraInInter) numBuf++;
1400
+ }
1401
+
1402
+ tempBuf = X265_MALLOC(uint8_t, depthBytes * numBuf);
1403
+ depthBuf = tempBuf;
1404
+ modeBuf = tempBuf + depthBytes;
1405
+
1406
+ X265_FREAD(depthBuf, sizeof(uint8_t), depthBytes, m_analysisFileIn, interPic->depth);
1407
+ X265_FREAD(modeBuf, sizeof(uint8_t), depthBytes, m_analysisFileIn, interPic->modes);
1408
+ if (m_param->analysisReuseLevel > 4)
1409
+ {
1410
+ partSize = modeBuf + depthBytes;
1411
+ mergeFlag = partSize + depthBytes;
1412
+ X265_FREAD(partSize, sizeof(uint8_t), depthBytes, m_analysisFileIn, interPic->partSize);
1413
+ X265_FREAD(mergeFlag, sizeof(uint8_t), depthBytes, m_analysisFileIn, interPic->mergeFlag);
1414
+ if (m_param->analysisReuseLevel == 10)
1415
+ {
1416
+ interDir = mergeFlag + depthBytes;
1417
+ X265_FREAD(interDir, sizeof(uint8_t), depthBytes, m_analysisFileIn, interPic->interDir);
1418
+ if (bIntraInInter)
1419
+ {
1420
+ chromaDir = interDir + depthBytes;
1421
+ X265_FREAD(chromaDir, sizeof(uint8_t), depthBytes, m_analysisFileIn, intraPic->chromaModes);
1422
+ }
1423
+ for (uint32_t i = 0; i < numDir; i++)
1424
+ {
1425
+ mvpIdx[i] = X265_MALLOC(uint8_t, depthBytes);
1426
+ refIdx[i] = X265_MALLOC(int8_t, depthBytes);
1427
+ mv[i] = X265_MALLOC(MV, depthBytes);
1428
+ X265_FREAD(mvpIdx[i], sizeof(uint8_t), depthBytes, m_analysisFileIn, interPic->mvpIdx[i]);
1429
+ X265_FREAD(refIdx[i], sizeof(int8_t), depthBytes, m_analysisFileIn, interPic->refIdx[i]);
1430
+ X265_FREAD(mv[i], sizeof(MV), depthBytes, m_analysisFileIn, interPic->mv[i]);
1431
+ }
1432
+ }
1433
+ }
1434
+
1435
+ uint32_t count = 0;
1436
+ cuLoc.switchCondition = 0;
1437
+ for (uint32_t d = 0; d < depthBytes; d++)
1438
+ {
1439
+ int bytes = analysis->numPartitions >> (depthBuf[d] * 2);
1440
+ bool isScaledMaxCUSize = false;
1441
+ int numCTUCopied = 1;
1442
+ int writeDepth = depthBuf[d];
1443
+ if (!depthBuf[d]) //copy data of one 64x64 to four scaled 64x64 CTUs.
1444
+ {
1445
+ isScaledMaxCUSize = true;
1446
+ bytes /= 4;
1447
+ numCTUCopied = 4;
1448
+ }
1449
+ if ((modeBuf[d] != MODE_INTRA && depthBuf[d] != 0) || (modeBuf[d] == MODE_INTRA && depthBuf[d] > 1))
1450
+ writeDepth--;
1451
+
1452
+ for (int numCTU = 0; numCTU < numCTUCopied; numCTU++)
1453
+ {
1454
+ memset(&(analysis->interData)->depth[count], writeDepth, bytes);
1455
+ memset(&(analysis->interData)->modes[count], modeBuf[d], bytes);
1456
+ if (m_param->analysisReuseLevel == 10 && bIntraInInter)
1457
+ memset(&(analysis->intraData)->chromaModes[count], chromaDir[d], bytes);
1458
+
1459
+ if (m_param->analysisReuseLevel > 4)
1460
+ {
1461
+ puOrientation puOrient;
1462
+ puOrient.init();
1463
+ if (modeBuf[d] == MODE_INTRA && partSize[d] == SIZE_NxN)
1464
+ partSize[d] = SIZE_2Nx2N;
1465
+ int partitionSize = partSize[d];
1466
+ if (isScaledMaxCUSize && partSize[d] != SIZE_2Nx2N)
1467
+ partitionSize = getPuShape(&puOrient, partSize[d], numCTU);
1468
+ memset(&(analysis->interData)->partSize[count], partitionSize, bytes);
1469
+ int numPU = (modeBuf[d] == MODE_INTRA) ? 1 : nbPartsTable[(int)partSize[d]];
1470
+ for (int pu = 0; pu < numPU; pu++)
1471
+ {
1472
+ if (!isScaledMaxCUSize && pu)
1473
+ d++;
1474
+ int restoreD = d;
1475
+ /* Adjust d value when the current CTU takes data from 2nd PU */
1476
+ if (puOrient.isRect || (puOrient.isAmp && partitionSize == SIZE_2Nx2N))
1477
+ {
1478
+ if ((numCTU > 1 && !puOrient.isVert) || ((numCTU % 2 == 1) && puOrient.isVert))
1479
+ d++;
1480
+ }
1481
+ if (puOrient.isAmp && pu)
1482
+ d++;
1483
+
1484
+ (analysis->interData)->mergeFlag[count + pu] = mergeFlag[d];
1485
+ if (m_param->analysisReuseLevel == 10)
1486
+ {
1487
+ (analysis->interData)->interDir[count + pu] = interDir[d];
1488
+ MV mvCopy[2];
1489
+ for (uint32_t i = 0; i < numDir; i++)
1490
+ {
1491
+ (analysis->interData)->mvpIdx[i][count + pu] = mvpIdx[i][d];
1492
+ (analysis->interData)->refIdx[i][count + pu] = refIdx[i][d];
1493
+ mvCopy[i].x = mv[i][d].x * (int16_t)m_param->scaleFactor;
1494
+ mvCopy[i].y = mv[i][d].y * (int16_t)m_param->scaleFactor;
1495
+ memcpy(&(analysis->interData)->mv[i][count + pu], &mvCopy[i], sizeof(MV));
1496
+ }
1497
+ }
1498
+ d = restoreD; // Restore d value after copying each of the 4 64x64 CTUs
1499
+
1500
+ if (isScaledMaxCUSize && (puOrient.isRect || puOrient.isAmp))
1501
+ {
1502
+ /* Skip PU index when current CTU is a 2Nx2N */
1503
+ if (partitionSize == SIZE_2Nx2N)
1504
+ pu++;
1505
+ /* Adjust d after completion of all 4 CTU copies */
1506
+ if (numCTU == 3 && (pu == (numPU - 1)))
1507
+ d++;
1508
+ }
1509
+ }
1510
+ }
1511
+ count += bytes;
1512
+ d += getCUIndex(&cuLoc, &count, bytes, 1);
1513
+ }
1514
+ }
1515
+
1516
+ X265_FREE(tempBuf);
1517
+
1518
+ if (m_param->analysisReuseLevel == 10)
1519
+ {
1520
+ for (uint32_t i = 0; i < numDir; i++)
1521
+ {
1522
+ X265_FREE(mvpIdx[i]);
1523
+ X265_FREE(refIdx[i]);
1524
+ X265_FREE(mv[i]);
1525
+ }
1526
+ if (bIntraInInter)
1527
+ {
1528
+ cuLoc.evenRowIndex = 0;
1529
+ cuLoc.oddRowIndex = m_param->num4x4Partitions * cuLoc.widthInCU;
1530
+ cuLoc.switchCondition = 0;
1531
+ uint8_t *tempLumaBuf = X265_MALLOC(uint8_t, analysis->numCUsInFrame * scaledNumPartition);
1532
+ X265_FREAD(tempLumaBuf, sizeof(uint8_t), analysis->numCUsInFrame * scaledNumPartition, m_analysisFileIn, intraPic->modes);
1533
+ uint32_t cnt = 0;
1534
+ for (uint32_t ctu32Idx = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++)
1535
+ {
1536
+ memset(&(analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor);
1537
+ cnt += factor;
1538
+ ctu32Idx += getCUIndex(&cuLoc, &cnt, factor, 0);
1539
+ }
1540
+ X265_FREE(tempLumaBuf);
1541
+ }
1542
+ }
1543
+ else
1544
+ X265_FREAD((analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFileIn, interPic->ref);
1545
+
1546
+ consumedBytes += frameRecordSize;
1547
+ if (numDir == 1)
1548
+ totalConsumedBytes = consumedBytes;
1549
+ }
1550
+
1551
+ /* Restore to the current encode's numPartitions and numCUsInFrame */
1552
+ analysis->numPartitions = m_param->num4x4Partitions;
1553
+ analysis->numCUsInFrame = cuLoc.heightInCU * cuLoc.widthInCU;
1554
+ analysis->numCuInHeight = cuLoc.heightInCU;
1555
+#undef X265_FREAD
1556
+}
1557
+
1558
+
1559
+int Encoder::validateAnalysisData(x265_analysis_data* analysis, int writeFlag)
1560
+{
1561
+#define X265_PARAM_VALIDATE(analysisParam, size, bytes, param, errorMsg)\
1562
+ if(!writeFlag)\
1563
+ {\
1564
+ fileOffset = m_analysisFileIn;\
1565
+ if ((!m_param->bUseAnalysisFile && analysisParam != (int)*param) || \
1566
+ (m_param->bUseAnalysisFile && (fread(&readValue, size, bytes, fileOffset) != bytes || (readValue != (int)*param))))\
1567
+ {\
1568
+ x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data. Incompatible option : <%s> \n", #errorMsg);\
1569
+ m_aborted = true;\
1570
+ return -1;\
1571
+ }\
1572
+ }\
1573
+ if(writeFlag)\
1574
+ {\
1575
+ fileOffset = m_analysisFileOut;\
1576
+ if(!m_param->bUseAnalysisFile)\
1577
+ analysisParam = *param;\
1578
+ else if(fwrite(param, size, bytes, fileOffset) < bytes)\
1579
+ {\
1580
+ x265_log(NULL, X265_LOG_ERROR, "Error writing analysis data\n"); \
1581
+ m_aborted = true;\
1582
+ return -1; \
1583
+ }\
1584
+ }\
1585
+ count++;
1586
+
1587
+#define X265_FREAD(val, size, readSize, fileOffset, src)\
1588
+ if (!m_param->bUseAnalysisFile)\
1589
+ {\
1590
+ memcpy(val, src, (size * readSize));\
1591
+ }\
1592
+ else if (fread(val, size, readSize, fileOffset) != readSize)\
1593
+ {\
1594
+ x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data\n");\
1595
+ m_aborted = true;\
1596
+ return -1;\
1597
+ }\
1598
+ count++;
1599
+
1600
+ x265_analysis_validate *saveParam = &analysis->saveParam;
1601
+ FILE* fileOffset = NULL;
1602
+ int readValue = 0;
1603
+ int count = 0;
1604
+
1605
+ X265_PARAM_VALIDATE(saveParam->intraRefresh, sizeof(int), 1, &m_param->bIntraRefresh, intra-refresh);
1606
+ X265_PARAM_VALIDATE(saveParam->maxNumReferences, sizeof(int), 1, &m_param->maxNumReferences, ref);
1607
+ X265_PARAM_VALIDATE(saveParam->analysisReuseLevel, sizeof(int), 1, &m_param->analysisReuseLevel, analysis-reuse-level);
1608
+ X265_PARAM_VALIDATE(saveParam->keyframeMax, sizeof(int), 1, &m_param->keyframeMax, keyint);
1609
+ X265_PARAM_VALIDATE(saveParam->keyframeMin, sizeof(int), 1, &m_param->keyframeMin, min-keyint);
1610
+ X265_PARAM_VALIDATE(saveParam->openGOP, sizeof(int), 1, &m_param->bOpenGOP, open-gop);
1611
+ X265_PARAM_VALIDATE(saveParam->bframes, sizeof(int), 1, &m_param->bframes, bframes);
1612
+ X265_PARAM_VALIDATE(saveParam->bPyramid, sizeof(int), 1, &m_param->bBPyramid, bPyramid);
1613
+ X265_PARAM_VALIDATE(saveParam->minCUSize, sizeof(int), 1, &m_param->minCUSize, min - cu - size);
1614
+ X265_PARAM_VALIDATE(saveParam->lookaheadDepth, sizeof(int), 1, &m_param->lookaheadDepth, rc - lookahead);
1615
+ X265_PARAM_VALIDATE(saveParam->chunkStart, sizeof(int), 1, &m_param->chunkStart, chunk-start);
1616
+ X265_PARAM_VALIDATE(saveParam->chunkEnd, sizeof(int), 1, &m_param->chunkEnd, chunk-end);
1617
+
1618
+ int sourceHeight, sourceWidth;
1619
+ if (writeFlag)
1620
+ {
1621
+ sourceHeight = m_param->sourceHeight - m_conformanceWindow.bottomOffset;
1622
+ sourceWidth = m_param->sourceWidth - m_conformanceWindow.rightOffset;
1623
+ X265_PARAM_VALIDATE(saveParam->sourceWidth, sizeof(int), 1, &sourceWidth, res-width);
1624
+ X265_PARAM_VALIDATE(saveParam->sourceHeight, sizeof(int), 1, &sourceHeight, res-height);
1625
+ X265_PARAM_VALIDATE(saveParam->maxCUSize, sizeof(int), 1, &m_param->maxCUSize, ctu);
1626
+ }
1627
+ else
1628
+ {
1629
+ fileOffset = m_analysisFileIn;
1630
+ bool error = false;
1631
+ int curSourceHeight = m_param->sourceHeight - m_conformanceWindow.bottomOffset;
1632
+ int curSourceWidth = m_param->sourceWidth - m_conformanceWindow.rightOffset;
1633
+
1634
+ X265_FREAD(&sourceWidth, sizeof(int), 1, m_analysisFileIn, &(saveParam->sourceWidth));
1635
+ X265_FREAD(&sourceHeight, sizeof(int), 1, m_analysisFileIn, &(saveParam->sourceHeight));
1636
+ X265_FREAD(&readValue, sizeof(int), 1, m_analysisFileIn, &(saveParam->maxCUSize));
1637
+
1638
+ bool isScaledRes = (2 * sourceHeight == curSourceHeight) && (2 * sourceWidth == curSourceWidth);
1639
+ if (!isScaledRes && (sourceHeight != curSourceHeight || sourceWidth != curSourceWidth
1640
+ || readValue != (int)m_param->maxCUSize || m_param->scaleFactor))
1641
+ error = true;
1642
+ else if (isScaledRes && !m_param->scaleFactor)
1643
+ error = true;
1644
+ else if (isScaledRes && (int)m_param->maxCUSize == readValue)
1645
+ m_saveCTUSize = 1;
1646
+ else if (isScaledRes && (g_log2Size[m_param->maxCUSize] - g_log2Size[readValue]) != 1)
1647
+ error = true;
1648
+
1649
+ if (error)
1650
+ {
1651
+ x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data. Incompatible option : <input-res / scale-factor / ctu> \n");
1652
+ m_aborted = true;
1653
+ return -1;
1654
+ }
1655
+ }
1656
+ return (count * sizeof(int));
1657
+
1658
+#undef X265_FREAD
1659
+#undef X265_PARAM_VALIDATE
1660
+}
1661
+
1662
+/* Toggle between two consecutive CTU rows. The save's CTU is copied
1663
+twice consecutively in the first and second CTU row of load*/
1664
+
1665
+int Encoder::getCUIndex(cuLocation* cuLoc, uint32_t* count, int bytes, int flag)
1666
+{
1667
+ int index = 0;
1668
+ cuLoc->switchCondition += bytes;
1669
+ int isBoundaryW = (*count % (m_param->num4x4Partitions * cuLoc->widthInCU) == 0);
1670
+
1671
+ /* Width boundary case :
1672
+ Skip to appropriate index when out of boundary cases occur
1673
+ Out of boundary may occur when the out of bound pixels along
1674
+ the width in low resoultion is greater than half of the maxCUSize */
1675
+ if (cuLoc->skipWidth && isBoundaryW)
1676
+ {
1677
+ if (flag)
1678
+ index++;
1679
+ else
1680
+ {
1681
+ /* Number of 4x4 blocks in out of bound region */
1682
+ int outOfBound = m_param->maxCUSize / 2;
1683
+ uint32_t sum = (uint32_t)pow((outOfBound >> 2), 2);
1684
+ index += sum;
1685
+ }
1686
+ cuLoc->switchCondition += m_param->num4x4Partitions;
1687
+ }
1688
+
1689
+ /* Completed writing 2 CTUs - move to the last remembered index of the next CTU row*/
1690
+ if (cuLoc->switchCondition == 2 * m_param->num4x4Partitions)
1691
+ {
1692
+ if (isBoundaryW)
1693
+ cuLoc->evenRowIndex = *count + (m_param->num4x4Partitions * cuLoc->widthInCU); // end of row - skip to the next even row
1694
+ else
1695
+ cuLoc->evenRowIndex = *count;
1696
+ *count = cuLoc->oddRowIndex;
1697
+
1698
+ /* Height boundary case :
1699
+ Skip to appropriate index when out of boundary cases occur
1700
+ Out of boundary may occur when the out of bound pixels along
1701
+ the height in low resoultion is greater than half of the maxCUSize */
1702
+ int isBoundaryH = (*count >= (m_param->num4x4Partitions * cuLoc->heightInCU * cuLoc->widthInCU));
1703
+ if (cuLoc->skipHeight && isBoundaryH)
1704
+ {
1705
+ if (flag)
1706
+ index += 2;
1707
+ else
1708
+ {
1709
+ int outOfBound = m_param->maxCUSize / 2;
1710
+ uint32_t sum = (uint32_t)(2 * pow((abs(outOfBound) >> 2), 2));
1711
+ index += sum;
1712
+ }
1713
+ *count = cuLoc->evenRowIndex;
1714
+ cuLoc->switchCondition = 0;
1715
+ }
1716
+ }
1717
+ /* Completed writing 4 CTUs - move to the last remembered index of
1718
+ the previous CTU row to copy the next save CTU's data*/
1719
+ else if (cuLoc->switchCondition == 4 * m_param->num4x4Partitions)
1720
+ {
1721
+ if (isBoundaryW)
1722
+ cuLoc->oddRowIndex = *count + (m_param->num4x4Partitions * cuLoc->widthInCU); // end of row - skip to the next odd row
1723
+ else
1724
+ cuLoc->oddRowIndex = *count;
1725
+ *count = cuLoc->evenRowIndex;
1726
+ cuLoc->switchCondition = 0;
1727
+ }
1728
+ return index;
1729
+}
1730
+
1731
+/* save load
1732
+ CTU0 CTU1 CTU2 CTU3
1733
+ 2NxN 2Nx2N 2Nx2N 2Nx2N 2Nx2N
1734
+ NX2N 2Nx2N 2Nx2N 2Nx2N 2Nx2N
1735
+ 2NxnU 2NxN 2NxN 2Nx2N 2Nx2N
1736
+ 2NxnD 2Nx2N 2Nx2N 2NxN 2NxN
1737
+ nLx2N Nx2N 2Nx2N Nx2N 2Nx2N
1738
+ nRx2N 2Nx2N Nx2N 2Nx2N Nx2N
1739
+*/
1740
+int Encoder::getPuShape(puOrientation* puOrient, int partSize, int numCTU)
1741
+{
1742
+ puOrient->isRect = true;
1743
+ if (partSize == SIZE_Nx2N)
1744
+ puOrient->isVert = true;
1745
+ if (partSize >= SIZE_2NxnU) // All AMP modes
1746
+ {
1747
+ puOrient->isAmp = true;
1748
+ puOrient->isRect = false;
1749
+ if (partSize == SIZE_2NxnD && numCTU > 1)
1750
+ return SIZE_2NxN;
1751
+ else if (partSize == SIZE_2NxnU && numCTU < 2)
1752
+ return SIZE_2NxN;
1753
+ else if (partSize == SIZE_nLx2N)
1754
+ {
1755
+ puOrient->isVert = true;
1756
+ if (!(numCTU % 2))
1757
+ return SIZE_Nx2N;
1758
+ }
1759
+ else if (partSize == SIZE_nRx2N)
1760
+ {
1761
+ puOrient->isVert = true;
1762
+ if (numCTU % 2)
1763
+ return SIZE_Nx2N;
1764
+ }
1765
+ }
1766
+ return SIZE_2Nx2N;
1767
+}
1768
+
1769
+void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, int sliceType)
1770
{
1771
1772
#define X265_FREAD(val, size, readSize, fileOffset)\
1773
if (fread(val, size, readSize, fileOffset) != readSize)\
1774
{\
1775
x265_log(NULL, X265_LOG_ERROR, "Error reading analysis 2 pass data\n"); \
1776
- freeAnalysis2Pass(analysis2Pass, sliceType); \
1777
+ x265_alloc_analysis_data(m_param, analysis); \
1778
m_aborted = true; \
1779
return; \
1780
}\
1781
1782
uint32_t depthBytes = 0;
1783
- uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
1784
- uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
1785
- uint32_t numCUsInFrame = widthInCU * heightInCU;
1786
-
1787
int poc; uint32_t frameRecordSize;
1788
X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFileIn);
1789
X265_FREAD(&depthBytes, sizeof(uint32_t), 1, m_analysisFileIn);
1790
1791
if (poc != curPoc || feof(m_analysisFileIn))
1792
{
1793
x265_log(NULL, X265_LOG_WARNING, "Error reading analysis 2 pass data: Cannot find POC %d\n", curPoc);
1794
- freeAnalysis2Pass(analysis2Pass, sliceType);
1795
+ x265_free_analysis_data(m_param, analysis);
1796
return;
1797
}
1798
/* Now arrived at the right frame, read the record */
1799
- analysis2Pass->frameRecordSize = frameRecordSize;
1800
+ analysis->frameRecordSize = frameRecordSize;
1801
uint8_t* tempBuf = NULL, *depthBuf = NULL;
1802
sse_t *tempdistBuf = NULL, *distortionBuf = NULL;
1803
tempBuf = X265_MALLOC(uint8_t, depthBytes);
1804
1805
X265_FREAD(tempdistBuf, sizeof(sse_t), depthBytes, m_analysisFileIn);
1806
depthBuf = tempBuf;
1807
distortionBuf = tempdistBuf;
1808
- analysis2PassFrameData* analysisFrameData = (analysis2PassFrameData*)analysis2Pass->analysisFramedata;
1809
+ x265_analysis_data *analysisData = (x265_analysis_data*)analysis;
1810
+ x265_analysis_intra_data *intraData = analysisData->intraData;
1811
+ x265_analysis_inter_data *interData = analysisData->interData;
1812
+ x265_analysis_distortion_data *distortionData = analysisData->distortionData;
1813
+
1814
size_t count = 0;
1815
uint32_t ctuCount = 0;
1816
double sum = 0, sqrSum = 0;
1817
for (uint32_t d = 0; d < depthBytes; d++)
1818
{
1819
- int bytes = m_param->num4x4Partitions >> (depthBuf[d] * 2);
1820
- memset(&analysisFrameData->depth[count], depthBuf[d], bytes);
1821
- analysisFrameData->distortion[count] = distortionBuf[d];
1822
- analysisFrameData->ctuDistortion[ctuCount] += analysisFrameData->distortion[count];
1823
+ int bytes = analysis->numPartitions >> (depthBuf[d] * 2);
1824
+ if (IS_X265_TYPE_I(sliceType))
1825
+ memset(&intraData->depth[count], depthBuf[d], bytes);
1826
+ else
1827
+ memset(&interData->depth[count], depthBuf[d], bytes);
1828
+ distortionData->distortion[count] = distortionBuf[d];
1829
+ distortionData->ctuDistortion[ctuCount] += distortionData->distortion[count];
1830
count += bytes;
1831
- if ((count % (unsigned)m_param->num4x4Partitions) == 0)
1832
+ if ((count % (unsigned)analysis->numPartitions) == 0)
1833
{
1834
- analysisFrameData->scaledDistortion[ctuCount] = X265_LOG2(X265_MAX(analysisFrameData->ctuDistortion[ctuCount], 1));
1835
- sum += analysisFrameData->scaledDistortion[ctuCount];
1836
- sqrSum += analysisFrameData->scaledDistortion[ctuCount] * analysisFrameData->scaledDistortion[ctuCount];
1837
+ distortionData->scaledDistortion[ctuCount] = X265_LOG2(X265_MAX(distortionData->ctuDistortion[ctuCount], 1));
1838
+ sum += distortionData->scaledDistortion[ctuCount];
1839
+ sqrSum += distortionData->scaledDistortion[ctuCount] * distortionData->scaledDistortion[ctuCount];
1840
ctuCount++;
1841
}
1842
}
1843
- double avg = sum / numCUsInFrame;
1844
- analysisFrameData->sdDistortion = pow(((sqrSum / numCUsInFrame) - (avg * avg)), 0.5);
1845
- analysisFrameData->averageDistortion = avg;
1846
- analysisFrameData->highDistortionCtuCount = analysisFrameData->lowDistortionCtuCount = 0;
1847
- for (uint32_t i = 0; i < numCUsInFrame; ++i)
1848
- {
1849
- analysisFrameData->threshold[i] = analysisFrameData->scaledDistortion[i] / analysisFrameData->averageDistortion;
1850
- analysisFrameData->offset[i] = (analysisFrameData->averageDistortion - analysisFrameData->scaledDistortion[i]) / analysisFrameData->sdDistortion;
1851
- if (analysisFrameData->threshold[i] < 0.9 && analysisFrameData->offset[i] >= 1)
1852
- analysisFrameData->lowDistortionCtuCount++;
1853
- else if (analysisFrameData->threshold[i] > 1.1 && analysisFrameData->offset[i] <= -1)
1854
- analysisFrameData->highDistortionCtuCount++;
1855
+ double avg = sum / analysis->numCUsInFrame;
1856
+ distortionData->sdDistortion = pow(((sqrSum / analysis->numCUsInFrame) - (avg * avg)), 0.5);
1857
+ distortionData->averageDistortion = avg;
1858
+ distortionData->highDistortionCtuCount = distortionData->lowDistortionCtuCount = 0;
1859
+ for (uint32_t i = 0; i < analysis->numCUsInFrame; ++i)
1860
+ {
1861
+ distortionData->threshold[i] = distortionData->scaledDistortion[i] / distortionData->averageDistortion;
1862
+ distortionData->offset[i] = (distortionData->averageDistortion - distortionData->scaledDistortion[i]) / distortionData->sdDistortion;
1863
+ if (distortionData->threshold[i] < 0.9 && distortionData->offset[i] >= 1)
1864
+ distortionData->lowDistortionCtuCount++;
1865
+ else if (distortionData->threshold[i] > 1.1 && distortionData->offset[i] <= -1)
1866
+ distortionData->highDistortionCtuCount++;
1867
}
1868
if (!IS_X265_TYPE_I(sliceType))
1869
{
1870
MV *tempMVBuf[2], *MVBuf[2];
1871
- int32_t *tempRefBuf[2], *refBuf[2];
1872
- int *tempMvpBuf[2], *mvpBuf[2];
1873
+ int32_t *tempRefBuf, *refBuf;
1874
+ uint8_t *tempMvpBuf[2], *mvpBuf[2];
1875
uint8_t* tempModeBuf = NULL, *modeBuf = NULL;
1876
-
1877
int numDir = sliceType == X265_TYPE_P ? 1 : 2;
1878
+ tempRefBuf = X265_MALLOC(int32_t, numDir * depthBytes);
1879
+
1880
for (int i = 0; i < numDir; i++)
1881
{
1882
tempMVBuf[i] = X265_MALLOC(MV, depthBytes);
1883
X265_FREAD(tempMVBuf[i], sizeof(MV), depthBytes, m_analysisFileIn);
1884
MVBuf[i] = tempMVBuf[i];
1885
- tempMvpBuf[i] = X265_MALLOC(int, depthBytes);
1886
- X265_FREAD(tempMvpBuf[i], sizeof(int), depthBytes, m_analysisFileIn);
1887
+ tempMvpBuf[i] = X265_MALLOC(uint8_t, depthBytes);
1888
+ X265_FREAD(tempMvpBuf[i], sizeof(uint8_t), depthBytes, m_analysisFileIn);
1889
mvpBuf[i] = tempMvpBuf[i];
1890
- tempRefBuf[i] = X265_MALLOC(int32_t, depthBytes);
1891
- X265_FREAD(tempRefBuf[i], sizeof(int32_t), depthBytes, m_analysisFileIn);
1892
- refBuf[i] = tempRefBuf[i];
1893
+ X265_FREAD(&tempRefBuf[i*depthBytes], sizeof(int32_t), depthBytes, m_analysisFileIn);
1894
}
1895
+ refBuf = tempRefBuf;
1896
tempModeBuf = X265_MALLOC(uint8_t, depthBytes);
1897
X265_FREAD(tempModeBuf, sizeof(uint8_t), depthBytes, m_analysisFileIn);
1898
modeBuf = tempModeBuf;
1899
-
1900
+
1901
count = 0;
1902
+
1903
for (uint32_t d = 0; d < depthBytes; d++)
1904
{
1905
- size_t bytes = m_param->num4x4Partitions >> (depthBuf[d] * 2);
1906
+ size_t bytes = analysis->numPartitions >> (depthBuf[d] * 2);
1907
for (int i = 0; i < numDir; i++)
1908
{
1909
+ int32_t* ref = &(analysis->interData)->ref[i * analysis->numPartitions * analysis->numCUsInFrame];
1910
for (size_t j = count, k = 0; k < bytes; j++, k++)
1911
{
1912
- memcpy(&((analysis2PassFrameData*)analysis2Pass->analysisFramedata)->m_mv[i][j], MVBuf[i] + d, sizeof(MV));
1913
- memcpy(&((analysis2PassFrameData*)analysis2Pass->analysisFramedata)->mvpIdx[i][j], mvpBuf[i] + d, sizeof(int));
1914
- memcpy(&((analysis2PassFrameData*)analysis2Pass->analysisFramedata)->ref[i][j], refBuf[i] + d, sizeof(int32_t));
1915
+ memcpy(&(analysis->interData)->mv[i][j], MVBuf[i] + d, sizeof(MV));
1916
+ memcpy(&(analysis->interData)->mvpIdx[i][j], mvpBuf[i] + d, sizeof(uint8_t));
1917
+ memcpy(&ref[j], refBuf + (i * depthBytes) + d, sizeof(int32_t));
1918
}
1919
}
1920
- memset(&((analysis2PassFrameData *)analysis2Pass->analysisFramedata)->modes[count], modeBuf[d], bytes);
1921
+ memset(&(analysis->interData)->modes[count], modeBuf[d], bytes);
1922
count += bytes;
1923
}
1924
1925
1926
{
1927
X265_FREE(tempMVBuf[i]);
1928
X265_FREE(tempMvpBuf[i]);
1929
- X265_FREE(tempRefBuf[i]);
1930
}
1931
+ X265_FREE(tempRefBuf);
1932
X265_FREE(tempModeBuf);
1933
}
1934
X265_FREE(tempBuf);
1935
1936
if (fwrite(val, size, writeSize, fileOffset) < writeSize)\
1937
{\
1938
x265_log(NULL, X265_LOG_ERROR, "Error writing analysis data\n");\
1939
- freeAnalysis(analysis);\
1940
+ x265_free_analysis_data(m_param, analysis);\
1941
m_aborted = true;\
1942
return;\
1943
}\
1944
1945
uint32_t numDir, numPlanes;
1946
bool bIntraInInter = false;
1947
1948
+ if (!analysis->poc)
1949
+ {
1950
+ if (validateAnalysisData(analysis, 1) == -1)
1951
+ {
1952
+ m_aborted = true;
1953
+ return;
1954
+ }
1955
+ }
1956
+
1957
/* calculate frameRecordSize */
1958
analysis->frameRecordSize = sizeof(analysis->frameRecordSize) + sizeof(depthBytes) + sizeof(analysis->poc) + sizeof(analysis->sliceType) +
1959
sizeof(analysis->numCUsInFrame) + sizeof(analysis->numPartitions) + sizeof(analysis->bScenecut) + sizeof(analysis->satdCost);
1960
1961
uint8_t partSize = 0;
1962
1963
CUData* ctu = curEncData.getPicCTU(cuAddr);
1964
- analysis_intra_data* intraDataCTU = (analysis_intra_data*)analysis->intraData;
1965
+ x265_analysis_intra_data* intraDataCTU = analysis->intraData;
1966
1967
for (uint32_t absPartIdx = 0; absPartIdx < ctu->m_numPartitions; depthBytes++)
1968
{
1969
1970
uint8_t partSize = 0;
1971
1972
CUData* ctu = curEncData.getPicCTU(cuAddr);
1973
- analysis_inter_data* interDataCTU = (analysis_inter_data*)analysis->interData;
1974
- analysis_intra_data* intraDataCTU = (analysis_intra_data*)analysis->intraData;
1975
+ x265_analysis_inter_data* interDataCTU = analysis->interData;
1976
+ x265_analysis_intra_data* intraDataCTU = analysis->intraData;
1977
1978
for (uint32_t absPartIdx = 0; absPartIdx < ctu->m_numPartitions; depthBytes++)
1979
{
1980
1981
{
1982
interDataCTU->mvpIdx[dir][depthBytes] = ctu->m_mvpIdx[dir][puabsPartIdx];
1983
interDataCTU->refIdx[dir][depthBytes] = ctu->m_refIdx[dir][puabsPartIdx];
1984
- interDataCTU->mv[dir][depthBytes] = ctu->m_mv[dir][puabsPartIdx];
1985
+ interDataCTU->mv[dir][depthBytes].word = ctu->m_mv[dir][puabsPartIdx].word;
1986
}
1987
}
1988
}
1989
1990
1991
if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
1992
{
1993
- X265_FWRITE(((analysis_intra_data*)analysis->intraData)->depth, sizeof(uint8_t), depthBytes, m_analysisFileOut);
1994
- X265_FWRITE(((analysis_intra_data*)analysis->intraData)->chromaModes, sizeof(uint8_t), depthBytes, m_analysisFileOut);
1995
- X265_FWRITE(((analysis_intra_data*)analysis->intraData)->partSizes, sizeof(char), depthBytes, m_analysisFileOut);
1996
- X265_FWRITE(((analysis_intra_data*)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileOut);
1997
+ X265_FWRITE((analysis->intraData)->depth, sizeof(uint8_t), depthBytes, m_analysisFileOut);
1998
+ X265_FWRITE((analysis->intraData)->chromaModes, sizeof(uint8_t), depthBytes, m_analysisFileOut);
1999
+ X265_FWRITE((analysis->intraData)->partSizes, sizeof(char), depthBytes, m_analysisFileOut);
2000
+ X265_FWRITE((analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileOut);
2001
}
2002
else
2003
{
2004
- X265_FWRITE(((analysis_inter_data*)analysis->interData)->depth, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2005
- X265_FWRITE(((analysis_inter_data*)analysis->interData)->modes, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2006
+ X265_FWRITE((analysis->interData)->depth, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2007
+ X265_FWRITE((analysis->interData)->modes, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2008
if (m_param->analysisReuseLevel > 4)
2009
{
2010
- X265_FWRITE(((analysis_inter_data*)analysis->interData)->partSize, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2011
- X265_FWRITE(((analysis_inter_data*)analysis->interData)->mergeFlag, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2012
+ X265_FWRITE((analysis->interData)->partSize, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2013
+ X265_FWRITE((analysis->interData)->mergeFlag, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2014
if (m_param->analysisReuseLevel == 10)
2015
{
2016
- X265_FWRITE(((analysis_inter_data*)analysis->interData)->interDir, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2017
- if (bIntraInInter) X265_FWRITE(((analysis_intra_data*)analysis->intraData)->chromaModes, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2018
+ X265_FWRITE((analysis->interData)->interDir, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2019
+ if (bIntraInInter) X265_FWRITE((analysis->intraData)->chromaModes, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2020
for (uint32_t dir = 0; dir < numDir; dir++)
2021
{
2022
- X265_FWRITE(((analysis_inter_data*)analysis->interData)->mvpIdx[dir], sizeof(uint8_t), depthBytes, m_analysisFileOut);
2023
- X265_FWRITE(((analysis_inter_data*)analysis->interData)->refIdx[dir], sizeof(int8_t), depthBytes, m_analysisFileOut);
2024
- X265_FWRITE(((analysis_inter_data*)analysis->interData)->mv[dir], sizeof(MV), depthBytes, m_analysisFileOut);
2025
+ X265_FWRITE((analysis->interData)->mvpIdx[dir], sizeof(uint8_t), depthBytes, m_analysisFileOut);
2026
+ X265_FWRITE((analysis->interData)->refIdx[dir], sizeof(int8_t), depthBytes, m_analysisFileOut);
2027
+ X265_FWRITE((analysis->interData)->mv[dir], sizeof(MV), depthBytes, m_analysisFileOut);
2028
}
2029
if (bIntraInInter)
2030
- X265_FWRITE(((analysis_intra_data*)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileOut);
2031
+ X265_FWRITE((analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileOut);
2032
}
2033
}
2034
if (m_param->analysisReuseLevel != 10)
2035
- X265_FWRITE(((analysis_inter_data*)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFileOut);
2036
+ X265_FWRITE((analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFileOut);
2037
2038
}
2039
#undef X265_FWRITE
2040
}
2041
2042
-void Encoder::writeAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, FrameData &curEncData, int slicetype)
2043
+void Encoder::writeAnalysisFileRefine(x265_analysis_data* analysis, FrameData &curEncData)
2044
{
2045
#define X265_FWRITE(val, size, writeSize, fileOffset)\
2046
if (fwrite(val, size, writeSize, fileOffset) < writeSize)\
2047
{\
2048
x265_log(NULL, X265_LOG_ERROR, "Error writing analysis 2 pass data\n"); \
2049
- freeAnalysis2Pass(analysis2Pass, slicetype); \
2050
+ x265_free_analysis_data(m_param, analysis); \
2051
m_aborted = true; \
2052
return; \
2053
}\
2054
2055
uint32_t depthBytes = 0;
2056
- uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
2057
- uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
2058
- uint32_t numCUsInFrame = widthInCU * heightInCU;
2059
- analysis2PassFrameData* analysisFrameData = (analysis2PassFrameData*)analysis2Pass->analysisFramedata;
2060
+ x265_analysis_data *analysisData = (x265_analysis_data*)analysis;
2061
+ x265_analysis_intra_data *intraData = analysisData->intraData;
2062
+ x265_analysis_inter_data *interData = analysisData->interData;
2063
+ x265_analysis_distortion_data *distortionData = analysisData->distortionData;
2064
2065
- for (uint32_t cuAddr = 0; cuAddr < numCUsInFrame; cuAddr++)
2066
+ for (uint32_t cuAddr = 0; cuAddr < analysis->numCUsInFrame; cuAddr++)
2067
{
2068
uint8_t depth = 0;
2069
2070
2071
for (uint32_t absPartIdx = 0; absPartIdx < ctu->m_numPartitions; depthBytes++)
2072
{
2073
depth = ctu->m_cuDepth[absPartIdx];
2074
- analysisFrameData->depth[depthBytes] = depth;
2075
- analysisFrameData->distortion[depthBytes] = ctu->m_distortion[absPartIdx];
2076
+ if (curEncData.m_slice->m_sliceType == I_SLICE)
2077
+ intraData->depth[depthBytes] = depth;
2078
+ else
2079
+ interData->depth[depthBytes] = depth;
2080
+ distortionData->distortion[depthBytes] = ctu->m_distortion[absPartIdx];
2081
absPartIdx += ctu->m_numPartitions >> (depth * 2);
2082
}
2083
}
2084
2085
if (curEncData.m_slice->m_sliceType != I_SLICE)
2086
{
2087
+ int32_t* ref[2];
2088
+ ref[0] = (analysis->interData)->ref;
2089
+ ref[1] = &(analysis->interData)->ref[analysis->numPartitions * analysis->numCUsInFrame];
2090
depthBytes = 0;
2091
- for (uint32_t cuAddr = 0; cuAddr < numCUsInFrame; cuAddr++)
2092
+ for (uint32_t cuAddr = 0; cuAddr < analysis->numCUsInFrame; cuAddr++)
2093
{
2094
uint8_t depth = 0;
2095
uint8_t predMode = 0;
2096
2097
CUData* ctu = curEncData.getPicCTU(cuAddr);
2098
-
2099
for (uint32_t absPartIdx = 0; absPartIdx < ctu->m_numPartitions; depthBytes++)
2100
{
2101
depth = ctu->m_cuDepth[absPartIdx];
2102
- analysisFrameData->m_mv[0][depthBytes] = ctu->m_mv[0][absPartIdx];
2103
- analysisFrameData->mvpIdx[0][depthBytes] = ctu->m_mvpIdx[0][absPartIdx];
2104
- analysisFrameData->ref[0][depthBytes] = ctu->m_refIdx[0][absPartIdx];
2105
+ interData->mv[0][depthBytes].word = ctu->m_mv[0][absPartIdx].word;
2106
+ interData->mvpIdx[0][depthBytes] = ctu->m_mvpIdx[0][absPartIdx];
2107
+ ref[0][depthBytes] = ctu->m_refIdx[0][absPartIdx];
2108
predMode = ctu->m_predMode[absPartIdx];
2109
if (ctu->m_refIdx[1][absPartIdx] != -1)
2110
{
2111
- analysisFrameData->m_mv[1][depthBytes] = ctu->m_mv[1][absPartIdx];
2112
- analysisFrameData->mvpIdx[1][depthBytes] = ctu->m_mvpIdx[1][absPartIdx];
2113
- analysisFrameData->ref[1][depthBytes] = ctu->m_refIdx[1][absPartIdx];
2114
+ interData->mv[1][depthBytes].word = ctu->m_mv[1][absPartIdx].word;
2115
+ interData->mvpIdx[1][depthBytes] = ctu->m_mvpIdx[1][absPartIdx];
2116
+ ref[1][depthBytes] = ctu->m_refIdx[1][absPartIdx];
2117
predMode = 4; // used as indiacator if the block is coded as bidir
2118
}
2119
- analysisFrameData->modes[depthBytes] = predMode;
2120
+ interData->modes[depthBytes] = predMode;
2121
2122
absPartIdx += ctu->m_numPartitions >> (depth * 2);
2123
}
2124
2125
}
2126
2127
/* calculate frameRecordSize */
2128
- analysis2Pass->frameRecordSize = sizeof(analysis2Pass->frameRecordSize) + sizeof(depthBytes) + sizeof(analysis2Pass->poc);
2129
-
2130
- analysis2Pass->frameRecordSize += depthBytes * sizeof(uint8_t);
2131
- analysis2Pass->frameRecordSize += depthBytes * sizeof(sse_t);
2132
+ analysis->frameRecordSize = sizeof(analysis->frameRecordSize) + sizeof(depthBytes) + sizeof(analysis->poc);
2133
+ analysis->frameRecordSize += depthBytes * sizeof(uint8_t);
2134
+ analysis->frameRecordSize += depthBytes * sizeof(sse_t);
2135
if (curEncData.m_slice->m_sliceType != I_SLICE)
2136
{
2137
int numDir = (curEncData.m_slice->m_sliceType == P_SLICE) ? 1 : 2;
2138
- analysis2Pass->frameRecordSize += depthBytes * sizeof(MV) * numDir;
2139
- analysis2Pass->frameRecordSize += depthBytes * sizeof(int32_t) * numDir;
2140
- analysis2Pass->frameRecordSize += depthBytes * sizeof(int) * numDir;
2141
- analysis2Pass->frameRecordSize += depthBytes * sizeof(uint8_t);
2142
+ analysis->frameRecordSize += depthBytes * sizeof(MV) * numDir;
2143
+ analysis->frameRecordSize += depthBytes * sizeof(int32_t) * numDir;
2144
+ analysis->frameRecordSize += depthBytes * sizeof(uint8_t) * numDir;
2145
+ analysis->frameRecordSize += depthBytes * sizeof(uint8_t);
2146
}
2147
- X265_FWRITE(&analysis2Pass->frameRecordSize, sizeof(uint32_t), 1, m_analysisFileOut);
2148
+ X265_FWRITE(&analysis->frameRecordSize, sizeof(uint32_t), 1, m_analysisFileOut);
2149
X265_FWRITE(&depthBytes, sizeof(uint32_t), 1, m_analysisFileOut);
2150
- X265_FWRITE(&analysis2Pass->poc, sizeof(uint32_t), 1, m_analysisFileOut);
2151
-
2152
- X265_FWRITE(analysisFrameData->depth, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2153
- X265_FWRITE(analysisFrameData->distortion, sizeof(sse_t), depthBytes, m_analysisFileOut);
2154
+ X265_FWRITE(&analysis->poc, sizeof(uint32_t), 1, m_analysisFileOut);
2155
+ if (curEncData.m_slice->m_sliceType == I_SLICE)
2156
+ {
2157
+ X265_FWRITE((analysis->intraData)->depth, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2158
+ }
2159
+ else
2160
+ {
2161
+ X265_FWRITE((analysis->interData)->depth, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2162
+ }
2163
+ X265_FWRITE(distortionData->distortion, sizeof(sse_t), depthBytes, m_analysisFileOut);
2164
if (curEncData.m_slice->m_sliceType != I_SLICE)
2165
{
2166
int numDir = curEncData.m_slice->m_sliceType == P_SLICE ? 1 : 2;
2167
for (int i = 0; i < numDir; i++)
2168
{
2169
- X265_FWRITE(analysisFrameData->m_mv[i], sizeof(MV), depthBytes, m_analysisFileOut);
2170
- X265_FWRITE(analysisFrameData->mvpIdx[i], sizeof(int), depthBytes, m_analysisFileOut);
2171
- X265_FWRITE(analysisFrameData->ref[i], sizeof(int32_t), depthBytes, m_analysisFileOut);
2172
+ int32_t* ref = &(analysis->interData)->ref[i * analysis->numPartitions * analysis->numCUsInFrame];
2173
+ X265_FWRITE(interData->mv[i], sizeof(MV), depthBytes, m_analysisFileOut);
2174
+ X265_FWRITE(interData->mvpIdx[i], sizeof(uint8_t), depthBytes, m_analysisFileOut);
2175
+ X265_FWRITE(ref, sizeof(int32_t), depthBytes, m_analysisFileOut);
2176
}
2177
- X265_FWRITE(analysisFrameData->modes, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2178
+ X265_FWRITE((analysis->interData)->modes, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2179
}
2180
#undef X265_FWRITE
2181
}
2182
2183
TOOLCMP(oldParam->rc.rfConstant, newParam->rc.rfConstant, "crf=%f to %f\n");
2184
}
2185
2186
+void Encoder::readUserSeiFile(x265_sei_payload& seiMsg, int curPoc)
2187
+{
2188
+ char line[1024];
2189
+ while (fgets(line, sizeof(line), m_naluFile))
2190
+ {
2191
+ int poc = atoi(strtok(line, " "));
2192
+ char *prefix = strtok(NULL, " ");
2193
+ int nalType = atoi(strtok(NULL, "/"));
2194
+ int payloadType = atoi(strtok(NULL, " "));
2195
+ char *base64Encode = strtok(NULL, "\n");
2196
+ int base64EncodeLength = (int)strlen(base64Encode);
2197
+ char *base64Decode = SEI::base64Decode(base64Encode, base64EncodeLength);
2198
+ if (nalType == NAL_UNIT_PREFIX_SEI && (!strcmp(prefix, "PREFIX")))
2199
+ {
2200
+ int currentPOC = curPoc;
2201
+ if (currentPOC == poc)
2202
+ {
2203
+ seiMsg.payloadSize = (base64EncodeLength / 4) * 3;
2204
+ seiMsg.payload = (uint8_t*)x265_malloc(sizeof(uint8_t) * seiMsg.payloadSize);
2205
+ if (!seiMsg.payload)
2206
+ {
2207
+ x265_log(m_param, X265_LOG_ERROR, "Unable to allocate memory for SEI payload\n");
2208
+ break;
2209
+ }
2210
+ if (payloadType == 4)
2211
+ seiMsg.payloadType = USER_DATA_REGISTERED_ITU_T_T35;
2212
+ else if (payloadType == 5)
2213
+ seiMsg.payloadType = USER_DATA_UNREGISTERED;
2214
+ else
2215
+ {
2216
+ x265_log(m_param, X265_LOG_WARNING, "Unsupported SEI payload Type for frame %d\n", poc);
2217
+ break;
2218
+ }
2219
+ memcpy(seiMsg.payload, base64Decode, seiMsg.payloadSize);
2220
+ break;
2221
+ }
2222
+ }
2223
+ else
2224
+ {
2225
+ x265_log(m_param, X265_LOG_WARNING, "SEI message for frame %d is not inserted. Will support only PREFIX SEI messages.\n", poc);
2226
+ break;
2227
+ }
2228
+ }
2229
+}
2230
+
2231
bool Encoder::computeSPSRPSIndex()
2232
{
2233
RPS* rpsInSPS = m_sps.spsrps;
2234
x265_2.7.tar.gz/source/encoder/encoder.h -> x265_2.9.tar.gz/source/encoder/encoder.h
Changed
121
1
2
RPSListNode* prior;
3
};
4
5
+struct cuLocation
6
+{
7
+ bool skipWidth;
8
+ bool skipHeight;
9
+ uint32_t heightInCU;
10
+ uint32_t widthInCU;
11
+ uint32_t oddRowIndex;
12
+ uint32_t evenRowIndex;
13
+ uint32_t switchCondition;
14
+
15
+ void init(x265_param* param)
16
+ {
17
+ skipHeight = false;
18
+ skipWidth = false;
19
+ heightInCU = (param->sourceHeight + param->maxCUSize - 1) >> param->maxLog2CUSize;
20
+ widthInCU = (param->sourceWidth + param->maxCUSize - 1) >> param->maxLog2CUSize;
21
+ evenRowIndex = 0;
22
+ oddRowIndex = param->num4x4Partitions * widthInCU;
23
+ switchCondition = 0; // To switch between odd and even rows
24
+ }
25
+};
26
+
27
+struct puOrientation
28
+{
29
+ bool isVert;
30
+ bool isRect;
31
+ bool isAmp;
32
+
33
+ void init()
34
+ {
35
+ isRect = false;
36
+ isAmp = false;
37
+ isVert = false;
38
+ }
39
+};
40
+
41
+
42
class FrameEncoder;
43
class DPB;
44
class Lookahead;
45
46
Frame* m_exportedPic;
47
FILE* m_analysisFileIn;
48
FILE* m_analysisFileOut;
49
+ FILE* m_naluFile;
50
x265_param* m_param;
51
x265_param* m_latestParam; // Holds latest param during a reconfigure
52
RateControl* m_rateControl;
53
54
double m_cR;
55
56
int m_bToneMap; // Enables tone-mapping
57
+ int m_enableNal;
58
59
#ifdef ENABLE_HDR10_PLUS
60
const hdr10plus_api *m_hdr10plus_api;
61
62
63
x265_sei_payload m_prevTonemapPayload;
64
65
+ /* Collect frame level feature data */
66
+ uint64_t* m_rdCost;
67
+ uint64_t* m_variance;
68
+ uint32_t* m_trainingCount;
69
+ int32_t m_startPoint;
70
+ Lock m_dynamicRefineLock;
71
+
72
+ bool m_saveCTUSize;
73
+
74
Encoder();
75
~Encoder()
76
{
77
78
79
void updateVbvPlan(RateControl* rc);
80
81
- void allocAnalysis(x265_analysis_data* analysis);
82
+ void readAnalysisFile(x265_analysis_data* analysis, int poc, int sliceType);
83
+
84
+ void readAnalysisFile(x265_analysis_data* analysis, int poc, const x265_picture* picIn, int paramBytes);
85
86
- void freeAnalysis(x265_analysis_data* analysis);
87
+ void readAnalysisFile(x265_analysis_data* analysis, int poc, const x265_picture* picIn, int paramBytes, cuLocation cuLoc);
88
89
- void allocAnalysis2Pass(x265_analysis_2Pass* analysis, int sliceType);
90
+ int getCUIndex(cuLocation* cuLoc, uint32_t* count, int bytes, int flag);
91
92
- void freeAnalysis2Pass(x265_analysis_2Pass* analysis, int sliceType);
93
+ int getPuShape(puOrientation* puOrient, int partSize, int numCTU);
94
95
- void readAnalysisFile(x265_analysis_data* analysis, int poc, const x265_picture* picIn);
96
+ void writeAnalysisFile(x265_analysis_data* analysis, FrameData &curEncData);
97
+
98
+ void writeAnalysisFileRefine(x265_analysis_data* analysis, FrameData &curEncData);
99
100
- void writeAnalysisFile(x265_analysis_data* pic, FrameData &curEncData);
101
- void readAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, int poc, int sliceType);
102
- void writeAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, FrameData &curEncData, int slicetype);
103
void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, x265_frame_stats* frameStats, int inPoc);
104
105
+ int validateAnalysisData(x265_analysis_data* analysis, int readWriteFlag);
106
+
107
+ void readUserSeiFile(x265_sei_payload& seiMsg, int poc);
108
+
109
void calcRefreshInterval(Frame* frameEnc);
110
111
void initRefIdx();
112
113
void updateRefIdx();
114
bool computeSPSRPSIndex();
115
116
+ void copyUserSEIMessages(Frame *frame, const x265_picture* pic_in);
117
+
118
protected:
119
120
void initVPS(VPS *vps);
121
x265_2.7.tar.gz/source/encoder/entropy.cpp -> x265_2.9.tar.gz/source/encoder/entropy.cpp
Changed
40
1
2
}
3
bDenomCoded = true;
4
}
5
- WRITE_FLAG(wp[0].bPresentFlag, "luma_weight_lX_flag");
6
- totalSignalledWeightFlags += wp[0].bPresentFlag;
7
+ WRITE_FLAG(!!wp[0].wtPresent, "luma_weight_lX_flag");
8
+ totalSignalledWeightFlags += wp[0].wtPresent;
9
}
10
11
if (bChroma)
12
13
for (int ref = 0; ref < slice.m_numRefIdx[list]; ref++)
14
{
15
wp = slice.m_weightPredTable[list][ref];
16
- WRITE_FLAG(wp[1].bPresentFlag, "chroma_weight_lX_flag");
17
- totalSignalledWeightFlags += 2 * wp[1].bPresentFlag;
18
+ WRITE_FLAG(!!wp[1].wtPresent, "chroma_weight_lX_flag");
19
+ totalSignalledWeightFlags += 2 * wp[1].wtPresent;
20
}
21
}
22
23
for (int ref = 0; ref < slice.m_numRefIdx[list]; ref++)
24
{
25
wp = slice.m_weightPredTable[list][ref];
26
- if (wp[0].bPresentFlag)
27
+ if (wp[0].wtPresent)
28
{
29
int deltaWeight = (wp[0].inputWeight - (1 << wp[0].log2WeightDenom));
30
WRITE_SVLC(deltaWeight, "delta_luma_weight_lX");
31
32
33
if (bChroma)
34
{
35
- if (wp[1].bPresentFlag)
36
+ if (wp[1].wtPresent)
37
{
38
for (int plane = 1; plane < 3; plane++)
39
{
40
x265_2.7.tar.gz/source/encoder/frameencoder.cpp -> x265_2.9.tar.gz/source/encoder/frameencoder.cpp
Changed
695
1
2
ok &= m_rce.picTimingSEI && m_rce.hrdTiming;
3
}
4
5
- if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
6
+ if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
7
m_nr = X265_MALLOC(NoiseReduction, 1);
8
if (m_nr)
9
memset(m_nr, 0, sizeof(NoiseReduction));
10
11
return length;
12
}
13
14
+bool FrameEncoder::writeToneMapInfo(x265_sei_payload *payload)
15
+{
16
+ bool payloadChange = false;
17
+ if (m_top->m_prevTonemapPayload.payload != NULL && payload->payloadSize == m_top->m_prevTonemapPayload.payloadSize)
18
+ {
19
+ if (memcmp(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize) != 0)
20
+ payloadChange = true;
21
+ }
22
+ else
23
+ {
24
+ payloadChange = true;
25
+ if (m_top->m_prevTonemapPayload.payload != NULL)
26
+ x265_free(m_top->m_prevTonemapPayload.payload);
27
+ m_top->m_prevTonemapPayload.payload = (uint8_t*)x265_malloc(sizeof(uint8_t)* payload->payloadSize);
28
+ }
29
+
30
+ if (payloadChange)
31
+ {
32
+ m_top->m_prevTonemapPayload.payloadType = payload->payloadType;
33
+ m_top->m_prevTonemapPayload.payloadSize = payload->payloadSize;
34
+ memcpy(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize);
35
+ }
36
+
37
+ bool isIDR = m_frame->m_lowres.sliceType == X265_TYPE_IDR;
38
+ return (payloadChange || isIDR);
39
+}
40
+
41
+void FrameEncoder::writeTrailingSEIMessages()
42
+{
43
+ Slice* slice = m_frame->m_encData->m_slice;
44
+ int planes = (m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
45
+ int32_t payloadSize = 0;
46
+
47
+ if (m_param->decodedPictureHashSEI == 1)
48
+ {
49
+ m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::MD5;
50
+ for (int i = 0; i < planes; i++)
51
+ MD5Final(&m_seiReconPictureDigest.m_state[i], m_seiReconPictureDigest.m_digest[i]);
52
+ payloadSize = 1 + 16 * planes;
53
+ }
54
+ else if (m_param->decodedPictureHashSEI == 2)
55
+ {
56
+ m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CRC;
57
+ for (int i = 0; i < planes; i++)
58
+ crcFinish(m_seiReconPictureDigest.m_crc[i], m_seiReconPictureDigest.m_digest[i]);
59
+ payloadSize = 1 + 2 * planes;
60
+ }
61
+ else if (m_param->decodedPictureHashSEI == 3)
62
+ {
63
+ m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CHECKSUM;
64
+ for (int i = 0; i < planes; i++)
65
+ checksumFinish(m_seiReconPictureDigest.m_checksum[i], m_seiReconPictureDigest.m_digest[i]);
66
+ payloadSize = 1 + 4 * planes;
67
+ }
68
+
69
+ m_seiReconPictureDigest.setSize(payloadSize);
70
+ m_seiReconPictureDigest.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_SUFFIX_SEI, m_nalList, false);
71
+}
72
+
73
void FrameEncoder::compressFrame()
74
{
75
ProfileScopeEvent(frameThread);
76
77
* not repeating headers (since AUD is supposed to be the first NAL in the access
78
* unit) */
79
Slice* slice = m_frame->m_encData->m_slice;
80
+
81
if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders))
82
{
83
m_bs.resetBits();
84
85
m_entropyCoder.codeAUD(*slice);
86
m_bs.writeByteAlignment();
87
m_nalList.serialize(NAL_UNIT_ACCESS_UNIT_DELIMITER, m_bs);
88
+ if (m_param->bSingleSeiNal)
89
+ m_bs.resetBits();
90
}
91
if (m_frame->m_lowres.bKeyframe && m_param->bRepeatHeaders)
92
{
93
94
wa.waitForExit();
95
else
96
weightAnalyse(*slice, *m_frame, *m_param);
97
-
98
}
99
-
100
}
101
else
102
slice->disableWeights();
103
104
for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
105
{
106
WeightParam *w = NULL;
107
- if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].bPresentFlag)
108
+ if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].wtPresent)
109
w = slice->m_weightPredTable[l][ref];
110
slice->m_refReconPicList[l][ref] = slice->m_refFrameList[l][ref]->m_reconPic;
111
m_mref[l][ref].init(slice->m_refReconPicList[l][ref], w, *m_param);
112
113
114
/* Get the QP for this frame from rate control. This call may block until
115
* frames ahead of it in encode order have called rateControlEnd() */
116
- m_rce.encodeOrder = m_frame->m_encodeOrder;
117
- bool payloadChange = false;
118
- bool writeSei = true;
119
- if (m_param->bDhdr10opt)
120
- {
121
- for (int i = 0; i < m_frame->m_userSEI.numPayloads; i++)
122
- {
123
- x265_sei_payload *payload = &m_frame->m_userSEI.payloads[i];
124
- if(payload->payloadType == USER_DATA_REGISTERED_ITU_T_T35)
125
- {
126
- if (m_top->m_prevTonemapPayload.payload != NULL && payload->payloadSize == m_top->m_prevTonemapPayload.payloadSize)
127
- {
128
- if (memcmp(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize) != 0)
129
- payloadChange = true;
130
- }
131
- else
132
- {
133
- payloadChange = true;
134
- if (m_top->m_prevTonemapPayload.payload != NULL)
135
- x265_free(m_top->m_prevTonemapPayload.payload);
136
- m_top->m_prevTonemapPayload.payload = (uint8_t*)x265_malloc(sizeof(uint8_t) * payload->payloadSize);
137
- }
138
-
139
- if (payloadChange)
140
- {
141
- m_top->m_prevTonemapPayload.payloadType = payload->payloadType;
142
- m_top->m_prevTonemapPayload.payloadSize = payload->payloadSize;
143
- memcpy(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize);
144
- }
145
-
146
- bool isIDR = m_frame->m_lowres.sliceType == X265_TYPE_IDR;
147
- writeSei = payloadChange || isIDR;
148
- }
149
- }
150
- }
151
int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
152
m_rce.newQp = qp;
153
154
155
156
/* reset entropy coders and compute slice id */
157
m_entropyCoder.load(m_initSliceContext);
158
-
159
for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
160
for (uint32_t row = m_sliceBaseRow[sliceId]; row < m_sliceBaseRow[sliceId + 1]; row++)
161
m_rows[row].init(m_initSliceContext, sliceId);
162
163
m_outStreams[i].resetBits();
164
}
165
166
+ m_rce.encodeOrder = m_frame->m_encodeOrder;
167
int prevBPSEI = m_rce.encodeOrder ? m_top->m_lastBPSEI : 0;
168
169
if (m_frame->m_lowres.bKeyframe)
170
171
bpSei->m_auCpbRemovalDelayDelta = 1;
172
bpSei->m_cpbDelayOffset = 0;
173
bpSei->m_dpbDelayOffset = 0;
174
-
175
// hrdFullness() calculates the initial CPB removal delay and offset
176
m_top->m_rateControl->hrdFullness(bpSei);
177
-
178
- m_bs.resetBits();
179
- bpSei->write(m_bs, *slice->m_sps);
180
- m_bs.writeByteAlignment();
181
-
182
- m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
183
+ bpSei->writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
184
185
m_top->m_lastBPSEI = m_rce.encodeOrder;
186
}
187
+
188
+ if (m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_param->bEmitIDRRecoverySEI)
189
+ {
190
+ /* Recovery Point SEI require the SPS to be "activated" */
191
+ SEIRecoveryPoint sei;
192
+ sei.m_recoveryPocCnt = 0;
193
+ sei.m_exactMatchingFlag = true;
194
+ sei.m_brokenLinkFlag = false;
195
+ sei.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
196
+ }
197
}
198
199
if ((m_param->bEmitHRDSEI || !!m_param->interlaceMode))
200
201
else if (m_param->interlaceMode == 1)
202
sei->m_picStruct = (poc & 1) ? 2 /* bottom */ : 1 /* top */;
203
else
204
- sei->m_picStruct = 0;
205
- sei->m_sourceScanType = 0;
206
+ sei->m_picStruct = m_param->pictureStructure;
207
+
208
+ sei->m_sourceScanType = m_param->interlaceMode ? 0 : 1;
209
+
210
sei->m_duplicateFlag = false;
211
}
212
213
214
sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics + poc - m_rce.encodeOrder;
215
}
216
217
- m_bs.resetBits();
218
- sei->write(m_bs, *slice->m_sps);
219
- m_bs.writeByteAlignment();
220
- m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
221
+ sei->writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
222
+ }
223
+
224
+ if (m_param->preferredTransferCharacteristics > -1 && slice->isIRAP())
225
+ {
226
+ SEIAlternativeTC m_seiAlternativeTC;
227
+ m_seiAlternativeTC.m_preferredTransferCharacteristics = m_param->preferredTransferCharacteristics;
228
+ m_seiAlternativeTC.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
229
}
230
231
/* Write user SEI */
232
233
{
234
SEIuserDataUnregistered sei;
235
sei.m_userData = payload->payload;
236
- m_bs.resetBits();
237
sei.setSize(payload->payloadSize);
238
- sei.write(m_bs, *slice->m_sps);
239
- m_bs.writeByteAlignment();
240
- m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
241
+ sei.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
242
}
243
else if (payload->payloadType == USER_DATA_REGISTERED_ITU_T_T35)
244
{
245
+ bool writeSei = m_param->bDhdr10opt ? writeToneMapInfo(payload) : true;
246
if (writeSei)
247
{
248
- SEICreativeIntentMeta sei;
249
- sei.m_payload = payload->payload;
250
- m_bs.resetBits();
251
+ SEIuserDataRegistered sei;
252
+ sei.m_userData = payload->payload;
253
sei.setSize(payload->payloadSize);
254
- sei.write(m_bs, *slice->m_sps);
255
- m_bs.writeByteAlignment();
256
- m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
257
+ sei.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
258
}
259
}
260
else
261
x265_log(m_param, X265_LOG_ERROR, "Unrecognized SEI type\n");
262
}
263
+
264
+ bool isSei = ((m_frame->m_lowres.bKeyframe && m_param->bRepeatHeaders) || m_param->bEmitHRDSEI ||
265
+ !!m_param->interlaceMode || (m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_param->bEmitIDRRecoverySEI) ||
266
+ m_frame->m_userSEI.numPayloads);
267
+
268
+ if (isSei && m_param->bSingleSeiNal)
269
+ {
270
+ m_bs.writeByteAlignment();
271
+ m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
272
+ }
273
/* CQP and CRF (without capped VBV) doesn't use mid-frame statistics to
274
* tune RateControl parameters for other frames.
275
* Hence, for these modes, update m_startEndOrder and unlock RC for previous threads waiting in
276
277
m_top->m_rateControl->m_startEndOrder.incr(); // faked rateControlEnd calls for negative frames
278
}
279
280
+ if (m_param->bDynamicRefine)
281
+ computeAvgTrainingData();
282
+
283
/* Analyze CTU rows, most of the hard work is done here. Frame is
284
* compressed in a wave-front pattern if WPP is enabled. Row based loop
285
* filters runs behind the CTU compression and reconstruction */
286
287
m_frameFilter.processRow(i - m_filterRowDelay);
288
}
289
}
290
+#if ENABLE_LIBVMAF
291
+ vmafFrameLevelScore();
292
+#endif
293
294
if (m_param->maxSlices > 1)
295
{
296
PicYuv *reconPic = m_frame->m_reconPic;
297
uint32_t height = reconPic->m_picHeight;
298
- uint32_t width = reconPic->m_picWidth;
299
- intptr_t stride = reconPic->m_stride;
300
- const uint32_t hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp);
301
- const uint32_t vChromaShift = CHROMA_V_SHIFT(m_param->internalCsp);
302
+ initDecodedPictureHashSEI(0, 0, height);
303
+ }
304
305
- if (m_param->decodedPictureHashSEI == 1)
306
- {
307
-
308
- MD5Init(&m_state[0]);
309
-
310
- updateMD5Plane(m_state[0], reconPic->m_picOrg[0], width, height, stride);
311
-
312
- if (m_param->internalCsp != X265_CSP_I400)
313
- {
314
- MD5Init(&m_state[1]);
315
- MD5Init(&m_state[2]);
316
-
317
- width >>= hChromaShift;
318
- height >>= vChromaShift;
319
- stride = reconPic->m_strideC;
320
-
321
- updateMD5Plane(m_state[1], reconPic->m_picOrg[1], width, height, stride);
322
- updateMD5Plane(m_state[2], reconPic->m_picOrg[2], width, height, stride);
323
- }
324
- }
325
- // TODO: NOT verify code in below mode
326
- else if (m_param->decodedPictureHashSEI == 2)
327
- {
328
- m_crc[0] = 0xffff;
329
-
330
- updateCRC(reconPic->m_picOrg[0], m_crc[0], height, width, stride);
331
-
332
- if (m_param->internalCsp != X265_CSP_I400)
333
- {
334
- width >>= hChromaShift;
335
- height >>= vChromaShift;
336
- stride = reconPic->m_strideC;
337
- m_crc[1] = m_crc[2] = 0xffff;
338
-
339
- updateCRC(reconPic->m_picOrg[1], m_crc[1], height, width, stride);
340
- updateCRC(reconPic->m_picOrg[2], m_crc[2], height, width, stride);
341
- }
342
- }
343
- else if (m_param->decodedPictureHashSEI == 3)
344
- {
345
- uint32_t cuHeight = m_param->maxCUSize;
346
-
347
- m_checksum[0] = 0;
348
-
349
- updateChecksum(reconPic->m_picOrg[0], m_checksum[0], height, width, stride, 0, cuHeight);
350
-
351
- if (m_param->internalCsp != X265_CSP_I400)
352
- {
353
- width >>= hChromaShift;
354
- height >>= vChromaShift;
355
- stride = reconPic->m_strideC;
356
- cuHeight >>= vChromaShift;
357
-
358
- m_checksum[1] = m_checksum[2] = 0;
359
-
360
- updateChecksum(reconPic->m_picOrg[1], m_checksum[1], height, width, stride, 0, cuHeight);
361
- updateChecksum(reconPic->m_picOrg[2], m_checksum[2], height, width, stride, 0, cuHeight);
362
- }
363
- }
364
- } // end of (m_param->maxSlices > 1)
365
+ if (m_param->bDynamicRefine && m_top->m_startPoint <= m_frame->m_encodeOrder) //Avoid collecting data that will not be used by future frames.
366
+ collectDynDataFrame();
367
368
if (m_param->rc.bStatWrite)
369
{
370
371
m_bs.resetBits();
372
373
const uint32_t sliceAddr = nextSliceRow * m_numCols;
374
- //CUData* ctu = m_frame->m_encData->getPicCTU(sliceAddr);
375
- //const int sliceQp = ctu->m_qp[0];
376
if (m_param->bOptRefListLengthPPS)
377
{
378
ScopedLock refIdxLock(m_top->m_sliceRefIdxLock);
379
380
m_nalList.serialize(slice->m_nalUnitType, m_bs);
381
}
382
383
-
384
if (m_param->decodedPictureHashSEI)
385
- {
386
- int planes = (m_frame->m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
387
- int32_t payloadSize = 0;
388
- if (m_param->decodedPictureHashSEI == 1)
389
- {
390
- m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::MD5;
391
- for (int i = 0; i < planes; i++)
392
- MD5Final(&m_state[i], m_seiReconPictureDigest.m_digest[i]);
393
- payloadSize = 1 + 16 * planes;
394
- }
395
- else if (m_param->decodedPictureHashSEI == 2)
396
- {
397
- m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CRC;
398
- for (int i = 0; i < planes; i++)
399
- crcFinish(m_crc[i], m_seiReconPictureDigest.m_digest[i]);
400
- payloadSize = 1 + 2 * planes;
401
- }
402
- else if (m_param->decodedPictureHashSEI == 3)
403
- {
404
- m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CHECKSUM;
405
- for (int i = 0; i < planes; i++)
406
- checksumFinish(m_checksum[i], m_seiReconPictureDigest.m_digest[i]);
407
- payloadSize = 1 + 4 * planes;
408
- }
409
- m_bs.resetBits();
410
- m_seiReconPictureDigest.setSize(payloadSize);
411
- m_seiReconPictureDigest.write(m_bs, *slice->m_sps);
412
- m_bs.writeByteAlignment();
413
- m_nalList.serialize(NAL_UNIT_SUFFIX_SEI, m_bs);
414
- }
415
+ writeTrailingSEIMessages();
416
417
uint64_t bytes = 0;
418
for (uint32_t i = 0; i < m_nalList.m_numNal; i++)
419
420
m_cuStats.accumulate(m_tld[i].analysis.m_stats[m_jpId], *m_param);
421
#endif
422
423
- m_endFrameTime = x265_mdate();
424
+ m_endFrameTime = x265_mdate();
425
+}
426
+
427
+void FrameEncoder::initDecodedPictureHashSEI(int row, int cuAddr, int height)
428
+{
429
+ PicYuv *reconPic = m_frame->m_reconPic;
430
+ uint32_t width = reconPic->m_picWidth;
431
+ intptr_t stride = reconPic->m_stride;
432
+ uint32_t maxCUHeight = m_param->maxCUSize;
433
+
434
+ const uint32_t hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp);
435
+ const uint32_t vChromaShift = CHROMA_V_SHIFT(m_param->internalCsp);
436
+
437
+ if (m_param->decodedPictureHashSEI == 1)
438
+ {
439
+ if (!row)
440
+ MD5Init(&m_seiReconPictureDigest.m_state[0]);
441
+
442
+ updateMD5Plane(m_seiReconPictureDigest.m_state[0], reconPic->getLumaAddr(cuAddr), width, height, stride);
443
+ if (m_param->internalCsp != X265_CSP_I400)
444
+ {
445
+ if (!row)
446
+ {
447
+ MD5Init(&m_seiReconPictureDigest.m_state[1]);
448
+ MD5Init(&m_seiReconPictureDigest.m_state[2]);
449
+ }
450
+
451
+ width >>= hChromaShift;
452
+ height >>= vChromaShift;
453
+ stride = reconPic->m_strideC;
454
+
455
+ updateMD5Plane(m_seiReconPictureDigest.m_state[1], reconPic->getCbAddr(cuAddr), width, height, stride);
456
+ updateMD5Plane(m_seiReconPictureDigest.m_state[2], reconPic->getCrAddr(cuAddr), width, height, stride);
457
+ }
458
+ }
459
+ else if (m_param->decodedPictureHashSEI == 2)
460
+ {
461
+
462
+ if (!row)
463
+ m_seiReconPictureDigest.m_crc[0] = 0xffff;
464
+
465
+ updateCRC(reconPic->getLumaAddr(cuAddr), m_seiReconPictureDigest.m_crc[0], height, width, stride);
466
+ if (m_param->internalCsp != X265_CSP_I400)
467
+ {
468
+ width >>= hChromaShift;
469
+ height >>= vChromaShift;
470
+ stride = reconPic->m_strideC;
471
+ m_seiReconPictureDigest.m_crc[1] = m_seiReconPictureDigest.m_crc[2] = 0xffff;
472
+
473
+ updateCRC(reconPic->getCbAddr(cuAddr), m_seiReconPictureDigest.m_crc[1], height, width, stride);
474
+ updateCRC(reconPic->getCrAddr(cuAddr), m_seiReconPictureDigest.m_crc[2], height, width, stride);
475
+ }
476
+ }
477
+ else if (m_param->decodedPictureHashSEI == 3)
478
+ {
479
+ if (!row)
480
+ m_seiReconPictureDigest.m_checksum[0] = 0;
481
+
482
+ updateChecksum(reconPic->m_picOrg[0], m_seiReconPictureDigest.m_checksum[0], height, width, stride, row, maxCUHeight);
483
+ if (m_param->internalCsp != X265_CSP_I400)
484
+ {
485
+ width >>= hChromaShift;
486
+ height >>= vChromaShift;
487
+ stride = reconPic->m_strideC;
488
+ maxCUHeight >>= vChromaShift;
489
+
490
+ if (!row)
491
+ m_seiReconPictureDigest.m_checksum[1] = m_seiReconPictureDigest.m_checksum[2] = 0;
492
+
493
+ updateChecksum(reconPic->m_picOrg[1], m_seiReconPictureDigest.m_checksum[1], height, width, stride, row, maxCUHeight);
494
+ updateChecksum(reconPic->m_picOrg[2], m_seiReconPictureDigest.m_checksum[2], height, width, stride, row, maxCUHeight);
495
+ }
496
+ }
497
}
498
499
void FrameEncoder::encodeSlice(uint32_t sliceAddr)
500
501
}
502
curRow.avgQPComputed = 1;
503
}
504
- }
505
+ }
506
507
// Initialize restrict on MV range in slices
508
tld.analysis.m_sliceMinY = -(int16_t)(rowInSlice * m_param->maxCUSize * 4) + 3 * 4;
509
510
// Does all the CU analysis, returns best top level mode decision
511
Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder);
512
513
+ /* startPoint > encodeOrder is true when the start point changes for
514
+ a new GOP but few frames from the previous GOP is still incomplete.
515
+ The data of frames in this interval will not be used by any future frames. */
516
+ if (m_param->bDynamicRefine && m_top->m_startPoint <= m_frame->m_encodeOrder)
517
+ collectDynDataRow(*ctu, &curRow.rowStats);
518
+
519
// take a sample of the current active worker count
520
ATOMIC_ADD(&m_totalActiveWorkerCount, m_activeWorkerCount);
521
ATOMIC_INC(&m_activeWorkerCountSamples);
522
523
{
524
// NOTE: in VBV mode, we may reencode anytime, so we can't do Deblock stage-Horizon and SAO
525
if (!bIsVbv)
526
- {
527
+ {
528
// Delay one row to avoid intra prediction conflict
529
if (m_pool && !bFirstRowInSlice)
530
{
531
532
else if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom))
533
rowCount = X265_MIN((maxRows + 1) / 2, maxRows - 1);
534
else
535
- rowCount = X265_MIN(m_refLagRows / m_param->maxSlices, maxRows - 1);
536
+ rowCount = X265_MIN(m_refLagRows / m_param->maxSlices, maxRows - 1);
537
538
if (rowInSlice == rowCount)
539
{
540
m_rowSliceTotalBits[sliceId] = 0;
541
if (bIsVbv && !(m_param->rc.bEnableConstVbv && m_param->bEnableWavefront))
542
- {
543
+ {
544
for (uint32_t i = m_sliceBaseRow[sliceId]; i < rowCount + m_sliceBaseRow[sliceId]; i++)
545
m_rowSliceTotalBits[sliceId] += curEncData.m_rowStat[i].encodedBits;
546
}
547
else
548
{
549
uint32_t startAddr = m_sliceBaseRow[sliceId] * numCols;
550
- uint32_t finishAddr = startAddr + rowCount * numCols;
551
+ uint32_t finishAddr = startAddr + rowCount * numCols;
552
553
- for (uint32_t cuAddr = startAddr; cuAddr < finishAddr; cuAddr++)
554
+ for (uint32_t cuAddr = startAddr; cuAddr < finishAddr; cuAddr++)
555
m_rowSliceTotalBits[sliceId] += curEncData.m_cuStat[cuAddr].totalBits;
556
- }
557
+ }
558
559
if (ATOMIC_INC(&m_sliceCnt) == (int)m_param->maxSlices)
560
{
561
562
m_completionEvent.trigger();
563
}
564
565
+void FrameEncoder::collectDynDataRow(CUData& ctu, FrameStats* rowStats)
566
+{
567
+ for (uint32_t i = 0; i < X265_REFINE_INTER_LEVELS; i++)
568
+ {
569
+ for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++)
570
+ {
571
+ int offset = (depth * X265_REFINE_INTER_LEVELS) + i;
572
+ if (ctu.m_collectCUCount[offset])
573
+ {
574
+ rowStats->rowVarDyn[offset] += ctu.m_collectCUVariance[offset];
575
+ rowStats->rowRdDyn[offset] += ctu.m_collectCURd[offset];
576
+ rowStats->rowCntDyn[offset] += ctu.m_collectCUCount[offset];
577
+ }
578
+ }
579
+ }
580
+}
581
+
582
+void FrameEncoder::collectDynDataFrame()
583
+{
584
+ for (uint32_t row = 0; row < m_numRows; row++)
585
+ {
586
+ for (uint32_t refLevel = 0; refLevel < X265_REFINE_INTER_LEVELS; refLevel++)
587
+ {
588
+ for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++)
589
+ {
590
+ int offset = (depth * X265_REFINE_INTER_LEVELS) + refLevel;
591
+ int curFrameIndex = m_frame->m_encodeOrder - m_top->m_startPoint;
592
+ int index = (curFrameIndex * X265_REFINE_INTER_LEVELS * m_param->maxCUDepth) + offset;
593
+ if (m_rows[row].rowStats.rowCntDyn[offset])
594
+ {
595
+ m_top->m_variance[index] += m_rows[row].rowStats.rowVarDyn[offset];
596
+ m_top->m_rdCost[index] += m_rows[row].rowStats.rowRdDyn[offset];
597
+ m_top->m_trainingCount[index] += m_rows[row].rowStats.rowCntDyn[offset];
598
+ }
599
+ }
600
+ }
601
+ }
602
+}
603
+
604
+void FrameEncoder::computeAvgTrainingData()
605
+{
606
+ if (m_frame->m_lowres.bScenecut || m_frame->m_lowres.bKeyframe)
607
+ {
608
+ m_top->m_startPoint = m_frame->m_encodeOrder;
609
+ int size = (m_param->keyframeMax + m_param->lookaheadDepth) * m_param->maxCUDepth * X265_REFINE_INTER_LEVELS;
610
+ memset(m_top->m_variance, 0, size * sizeof(uint64_t));
611
+ memset(m_top->m_rdCost, 0, size * sizeof(uint64_t));
612
+ memset(m_top->m_trainingCount, 0, size * sizeof(uint32_t));
613
+ }
614
+ if (m_frame->m_encodeOrder - m_top->m_startPoint < 2 * m_param->frameNumThreads)
615
+ m_frame->m_classifyFrame = false;
616
+ else
617
+ m_frame->m_classifyFrame = true;
618
+
619
+ int size = m_param->maxCUDepth * X265_REFINE_INTER_LEVELS;
620
+ memset(m_frame->m_classifyRd, 0, size * sizeof(uint64_t));
621
+ memset(m_frame->m_classifyVariance, 0, size * sizeof(uint64_t));
622
+ memset(m_frame->m_classifyCount, 0, size * sizeof(uint32_t));
623
+ if (m_frame->m_classifyFrame)
624
+ {
625
+ uint32_t limit = m_frame->m_encodeOrder - m_top->m_startPoint - m_param->frameNumThreads;
626
+ for (uint32_t i = 1; i < limit; i++)
627
+ {
628
+ for (uint32_t j = 0; j < X265_REFINE_INTER_LEVELS; j++)
629
+ {
630
+ for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++)
631
+ {
632
+ int offset = (depth * X265_REFINE_INTER_LEVELS) + j;
633
+ int index = (i* X265_REFINE_INTER_LEVELS * m_param->maxCUDepth) + offset;
634
+ if (m_top->m_trainingCount[index])
635
+ {
636
+ m_frame->m_classifyRd[offset] += m_top->m_rdCost[index] / m_top->m_trainingCount[index];
637
+ m_frame->m_classifyVariance[offset] += m_top->m_variance[index] / m_top->m_trainingCount[index];
638
+ m_frame->m_classifyCount[offset] += m_top->m_trainingCount[index];
639
+ }
640
+ }
641
+ }
642
+ }
643
+ /* Calculates the average feature values of historic frames that are being considered for the current frame */
644
+ int historyCount = m_frame->m_encodeOrder - m_param->frameNumThreads - m_top->m_startPoint - 1;
645
+ if (historyCount)
646
+ {
647
+ for (uint32_t j = 0; j < X265_REFINE_INTER_LEVELS; j++)
648
+ {
649
+ for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++)
650
+ {
651
+ int offset = (depth * X265_REFINE_INTER_LEVELS) + j;
652
+ m_frame->m_classifyRd[offset] /= historyCount;
653
+ m_frame->m_classifyVariance[offset] /= historyCount;
654
+ }
655
+ }
656
+ }
657
+ }
658
+}
659
+
660
/* collect statistics about CU coding decisions, return total QP */
661
int FrameEncoder::collectCTUStatistics(const CUData& ctu, FrameStats* log)
662
{
663
664
m_nr->nrOffsetDenoise[cat][0] = 0;
665
}
666
}
667
+#if ENABLE_LIBVMAF
668
+void FrameEncoder::vmafFrameLevelScore()
669
+{
670
+ PicYuv *fenc = m_frame->m_fencPic;
671
+ PicYuv *recon = m_frame->m_reconPic;
672
+
673
+ x265_vmaf_framedata *vmafframedata = (x265_vmaf_framedata*)x265_malloc(sizeof(x265_vmaf_framedata));
674
+ if (!vmafframedata)
675
+ {
676
+ x265_log(NULL, X265_LOG_ERROR, "vmaf frame data alloc failed\n");
677
+ }
678
+
679
+ vmafframedata->height = fenc->m_picHeight;
680
+ vmafframedata->width = fenc->m_picWidth;
681
+ vmafframedata->frame_set = 0;
682
+ vmafframedata->internalBitDepth = m_param->internalBitDepth;
683
+ vmafframedata->reference_frame = fenc;
684
+ vmafframedata->distorted_frame = recon;
685
+
686
+ fenc->m_vmafScore = x265_calculate_vmaf_framelevelscore(vmafframedata);
687
+
688
+ if (vmafframedata)
689
+ x265_free(vmafframedata);
690
+}
691
+#endif
692
693
Frame *FrameEncoder::getEncodedPicture(NALList& output)
694
{
695
x265_2.7.tar.gz/source/encoder/frameencoder.h -> x265_2.9.tar.gz/source/encoder/frameencoder.h
Changed
42
1
2
/* blocks until worker thread is done, returns access unit */
3
Frame *getEncodedPicture(NALList& list);
4
5
+ void initDecodedPictureHashSEI(int row, int cuAddr, int height);
6
+
7
Event m_enable;
8
Event m_done;
9
Event m_completionEvent;
10
11
double m_ssim;
12
uint64_t m_accessUnitBits;
13
uint32_t m_ssimCnt;
14
- MD5Context m_state[3];
15
- uint32_t m_crc[3];
16
- uint32_t m_checksum[3];
17
18
volatile int m_activeWorkerCount; // count of workers currently encoding or filtering CTUs
19
volatile int m_totalActiveWorkerCount; // sum of m_activeWorkerCount sampled at end of each CTU
20
21
void threadMain();
22
int collectCTUStatistics(const CUData& ctu, FrameStats* frameLog);
23
void noiseReductionUpdate();
24
+ void writeTrailingSEIMessages();
25
+ bool writeToneMapInfo(x265_sei_payload *payload);
26
27
/* Called by WaveFront::findJob() */
28
virtual void processRow(int row, int threadId);
29
30
void enqueueRowFilter(int row) { WaveFront::enqueueRow(row * 2 + 1); }
31
void enableRowEncoder(int row) { WaveFront::enableRow(row * 2 + 0); }
32
void enableRowFilter(int row) { WaveFront::enableRow(row * 2 + 1); }
33
+#if ENABLE_LIBVMAF
34
+ void vmafFrameLevelScore();
35
+#endif
36
+ void collectDynDataFrame();
37
+ void computeAvgTrainingData();
38
+ void collectDynDataRow(CUData& ctu, FrameStats* rowStats);
39
};
40
}
41
42
x265_2.7.tar.gz/source/encoder/framefilter.cpp -> x265_2.9.tar.gz/source/encoder/framefilter.cpp
Changed
82
1
2
3
if (m_param->maxSlices == 1)
4
{
5
- if (m_param->decodedPictureHashSEI == 1)
6
- {
7
- uint32_t height = m_parallelFilter[row].getCUHeight();
8
- uint32_t width = reconPic->m_picWidth;
9
- intptr_t stride = reconPic->m_stride;
10
-
11
- if (!row)
12
- MD5Init(&m_frameEncoder->m_state[0]);
13
-
14
- updateMD5Plane(m_frameEncoder->m_state[0], reconPic->getLumaAddr(cuAddr), width, height, stride);
15
- if (m_param->internalCsp != X265_CSP_I400)
16
- {
17
- if (!row)
18
- {
19
- MD5Init(&m_frameEncoder->m_state[1]);
20
- MD5Init(&m_frameEncoder->m_state[2]);
21
- }
22
-
23
- width >>= m_hChromaShift;
24
- height >>= m_vChromaShift;
25
- stride = reconPic->m_strideC;
26
-
27
- updateMD5Plane(m_frameEncoder->m_state[1], reconPic->getCbAddr(cuAddr), width, height, stride);
28
- updateMD5Plane(m_frameEncoder->m_state[2], reconPic->getCrAddr(cuAddr), width, height, stride);
29
- }
30
- }
31
- else if (m_param->decodedPictureHashSEI == 2)
32
- {
33
- uint32_t height = m_parallelFilter[row].getCUHeight();
34
- uint32_t width = reconPic->m_picWidth;
35
- intptr_t stride = reconPic->m_stride;
36
-
37
- if (!row)
38
- m_frameEncoder->m_crc[0] = 0xffff;
39
-
40
- updateCRC(reconPic->getLumaAddr(cuAddr), m_frameEncoder->m_crc[0], height, width, stride);
41
- if (m_param->internalCsp != X265_CSP_I400)
42
- {
43
- width >>= m_hChromaShift;
44
- height >>= m_vChromaShift;
45
- stride = reconPic->m_strideC;
46
- m_frameEncoder->m_crc[1] = m_frameEncoder->m_crc[2] = 0xffff;
47
-
48
- updateCRC(reconPic->getCbAddr(cuAddr), m_frameEncoder->m_crc[1], height, width, stride);
49
- updateCRC(reconPic->getCrAddr(cuAddr), m_frameEncoder->m_crc[2], height, width, stride);
50
- }
51
- }
52
- else if (m_param->decodedPictureHashSEI == 3)
53
- {
54
- uint32_t width = reconPic->m_picWidth;
55
- uint32_t height = m_parallelFilter[row].getCUHeight();
56
- intptr_t stride = reconPic->m_stride;
57
- uint32_t cuHeight = m_param->maxCUSize;
58
-
59
- if (!row)
60
- m_frameEncoder->m_checksum[0] = 0;
61
-
62
- updateChecksum(reconPic->m_picOrg[0], m_frameEncoder->m_checksum[0], height, width, stride, row, cuHeight);
63
- if (m_param->internalCsp != X265_CSP_I400)
64
- {
65
- width >>= m_hChromaShift;
66
- height >>= m_vChromaShift;
67
- stride = reconPic->m_strideC;
68
- cuHeight >>= m_vChromaShift;
69
-
70
- if (!row)
71
- m_frameEncoder->m_checksum[1] = m_frameEncoder->m_checksum[2] = 0;
72
-
73
- updateChecksum(reconPic->m_picOrg[1], m_frameEncoder->m_checksum[1], height, width, stride, row, cuHeight);
74
- updateChecksum(reconPic->m_picOrg[2], m_frameEncoder->m_checksum[2], height, width, stride, row, cuHeight);
75
- }
76
- }
77
+ uint32_t height = m_parallelFilter[row].getCUHeight();
78
+ m_frameEncoder->initDecodedPictureHashSEI(row, cuAddr, height);
79
} // end of (m_param->maxSlices == 1)
80
81
if (ATOMIC_INC(&m_frameEncoder->m_completionCount) == 2 * (int)m_frameEncoder->m_numRows)
82
x265_2.7.tar.gz/source/encoder/ratecontrol.cpp -> x265_2.9.tar.gz/source/encoder/ratecontrol.cpp
Changed
114
1
2
m_predictedBits = m_totalBits;
3
updateVbvPlan(enc);
4
rce->bufferFill = m_bufferFill;
5
+ rce->vbvEndAdj = false;
6
+ if (m_param->vbvBufferEnd && rce->encodeOrder >= m_param->vbvEndFrameAdjust * m_param->totalFrames)
7
+ {
8
+ rce->vbvEndAdj = true;
9
+ rce->targetFill = 0;
10
+ }
11
12
int mincr = enc->m_vps.ptl.minCrForLevel;
13
/* Profiles above Main10 don't require maxAU size check, so just set the maximum to a large value. */
14
15
else
16
{
17
/* The spec has a special case for the first frame. */
18
- if (rce->encodeOrder == 0)
19
+ if (curFrame->m_lowres.bKeyframe)
20
{
21
/* 1.5 * (Max( PicSizeInSamplesY, fR * MaxLumaSr) + MaxLumaSr * (AuCpbRemovalTime[ 0 ] -AuNominalRemovalTime[ 0 ])) ? MinCr */
22
double fr = 1. / 300;
23
24
/* 1.5 * MaxLumaSr * (AuCpbRemovalTime[ n ] - AuCpbRemovalTime[ n - 1 ]) / MinCr */
25
rce->frameSizeMaximum = 8 * 1.5 * enc->m_vps.ptl.maxLumaSrForLevel * m_frameDuration / mincr;
26
}
27
+ rce->frameSizeMaximum *= m_param->maxAUSizeFactor;
28
}
29
}
30
if (!m_isAbr && m_2pass && m_param->rc.rateControlMode == X265_RC_CRF)
31
32
curBits = predictSize(&m_pred[predType], frameQ[type], (double)satd);
33
bufferFillCur -= curBits;
34
}
35
- if (m_param->vbvBufferEnd && rce->encodeOrder >= m_param->vbvEndFrameAdjust * m_param->totalFrames)
36
+ if (rce->vbvEndAdj)
37
{
38
bool loopBreak = false;
39
double bufferDiff = m_param->vbvBufferEnd - (m_bufferFill / m_bufferSize);
40
- targetFill = m_bufferFill + m_bufferSize * (bufferDiff / (m_param->totalFrames - rce->encodeOrder));
41
- if (bufferFillCur < targetFill)
42
+ rce->targetFill = m_bufferFill + m_bufferSize * (bufferDiff / (m_param->totalFrames - rce->encodeOrder));
43
+ if (bufferFillCur < rce->targetFill)
44
{
45
q *= 1.01;
46
loopTerminate |= 1;
47
48
double rcTol = bufferLeftPlanned / m_param->frameNumThreads * m_rateTolerance;
49
int32_t encodedBitsSoFar = 0;
50
double accFrameBits = predictRowsSizeSum(curFrame, rce, qpVbv, encodedBitsSoFar);
51
+ double vbvEndBias = 0.95;
52
53
/* * Don't increase the row QPs until a sufficent amount of the bits of
54
* the frame have been processed, in case a flat area at the top of the
55
56
while (qpVbv < qpMax
57
&& (((accFrameBits > rce->frameSizePlanned + rcTol) ||
58
(rce->bufferFill - accFrameBits < bufferLeftPlanned * 0.5) ||
59
- (accFrameBits > rce->frameSizePlanned && qpVbv < rce->qpNoVbv))
60
+ (accFrameBits > rce->frameSizePlanned && qpVbv < rce->qpNoVbv) ||
61
+ (rce->vbvEndAdj && ((rce->bufferFill - accFrameBits) < (rce->targetFill * vbvEndBias))))
62
&& (!m_param->rc.bStrictCbr ? 1 : abrOvershoot > 0.1)))
63
{
64
qpVbv += stepSize;
65
66
while (qpVbv > qpMin
67
&& (qpVbv > curEncData.m_rowStat[0].rowQp || m_singleFrameVbv)
68
&& (((accFrameBits < rce->frameSizePlanned * 0.8f && qpVbv <= prevRowQp)
69
- || accFrameBits < (rce->bufferFill - m_bufferSize + m_bufferRate) * 1.1)
70
+ || accFrameBits < (rce->bufferFill - m_bufferSize + m_bufferRate) * 1.1
71
+ || (rce->vbvEndAdj && ((rce->bufferFill - accFrameBits) > (rce->targetFill * vbvEndBias))))
72
&& (!m_param->rc.bStrictCbr ? 1 : abrOvershoot < 0)))
73
{
74
qpVbv -= stepSize;
75
76
FrameData& curEncData = *curFrame->m_encData;
77
int64_t actualBits = bits;
78
Slice *slice = curEncData.m_slice;
79
+ bool bEnableDistOffset = m_param->analysisMultiPassDistortion && m_param->rc.bStatRead;
80
81
- if (m_param->rc.aqMode || m_isVbv || m_param->bAQMotion)
82
+ if (m_param->rc.aqMode || m_isVbv || m_param->bAQMotion || bEnableDistOffset)
83
{
84
if (m_isVbv && !(m_2pass && m_param->rc.rateControlMode == X265_RC_CRF))
85
{
86
87
rce->qpaRc = curEncData.m_avgQpRc;
88
}
89
90
- if (m_param->rc.aqMode || m_param->bAQMotion)
91
+ if (m_param->rc.aqMode || m_param->bAQMotion || bEnableDistOffset)
92
{
93
double avgQpAq = 0;
94
- /* determine actual avg encoded QP, after AQ/cutree adjustments */
95
+ /* determine actual avg encoded QP, after AQ/cutree/distortion adjustments */
96
for (uint32_t i = 0; i < slice->m_sps->numCuInHeight; i++)
97
avgQpAq += curEncData.m_rowStat[i].sumQpAq;
98
99
100
/* called to write out the rate control frame stats info in multipass encodes */
101
int RateControl::writeRateControlFrameStats(Frame* curFrame, RateControlEntry* rce)
102
{
103
- FrameData& curEncData = *curFrame->m_encData;
104
- int ncu;
105
- if (m_param->rc.qgSize == 8)
106
- ncu = m_ncu * 4;
107
- else
108
- ncu = m_ncu;
109
+ FrameData& curEncData = *curFrame->m_encData;
110
+ int ncu = (m_param->rc.qgSize == 8) ? m_ncu * 4 : m_ncu;
111
char cType = rce->sliceType == I_SLICE ? (curFrame->m_lowres.sliceType == X265_TYPE_IDR ? 'I' : 'i')
112
: rce->sliceType == P_SLICE ? 'P'
113
: IS_REFERENCED(curFrame) ? 'B' : 'b';
114
x265_2.7.tar.gz/source/encoder/ratecontrol.h -> x265_2.9.tar.gz/source/encoder/ratecontrol.h
Changed
10
1
2
double rowCplxrSum;
3
double qpNoVbv;
4
double bufferFill;
5
+ double targetFill;
6
+ bool vbvEndAdj;
7
double frameDuration;
8
double clippedDuration;
9
double frameSizeEstimated; /* hold frameSize, updated from cu level vbv rc */
10
x265_2.7.tar.gz/source/encoder/reference.cpp -> x265_2.9.tar.gz/source/encoder/reference.cpp
Changed
24
1
2
cuHeight >>= reconPic->m_vChromaShift;
3
}
4
5
- if (wp[c].bPresentFlag)
6
+ if (wp[c].wtPresent)
7
{
8
if (!weightBuffer[c])
9
{
10
11
12
const pixel* src = reconPic->m_picOrg[c] + numWeightedRows * cuHeight * stride;
13
pixel* dst = fpelPlane[c] + numWeightedRows * cuHeight * stride;
14
-
15
// Computing weighted CU rows
16
int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
17
- int padwidth = (width + 15) & ~15; // weightp assembly needs even 16 byte widths
18
+ int padwidth = (width + 31) & ~31; // weightp assembly needs even 32 byte widths
19
primitives.weight_pp(src, dst, stride, padwidth, height, w[c].weight, w[c].round << correction, w[c].shift + correction, w[c].offset);
20
-
21
// Extending Left & Right
22
primitives.extendRowBorder(dst, stride, width, height, marginX);
23
24
x265_2.7.tar.gz/source/encoder/search.cpp -> x265_2.9.tar.gz/source/encoder/search.cpp
Changed
409
1
2
m_me.init(param.internalCsp);
3
4
bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder);
5
- if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
6
+ if (m_param->noiseReductionIntra || m_param->noiseReductionInter )
7
ok &= m_quant.allocNoiseReduction(param);
8
9
ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
10
11
// store original entropy coding status
12
if (bEnableRDOQ)
13
m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
14
-
15
- primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
16
+ primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
17
18
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
19
if (numSig)
20
{
21
m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
22
- primitives.cu[sizeIdx].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
23
+ bool reconQtYuvAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
24
+ bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
25
+ bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
26
+ bool bufferAlignCheck = (reconQtStride % 64 == 0) && (stride % 64 == 0) && reconQtYuvAlign && predAlign && residualAlign;
27
+ primitives.cu[sizeIdx].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride);
28
}
29
else
30
// no coded residual, recon = pred
31
32
33
coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffY);
34
pixel* tmpRecon = (useTSkip ? m_tsRecon : reconQt);
35
+ bool tmpReconAlign = (useTSkip ? 1 : (m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0));
36
uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
37
38
- primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
39
+ primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
40
41
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
42
if (numSig)
43
{
44
m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
45
- primitives.cu[sizeIdx].add_ps(tmpRecon, tmpReconStride, pred, residual, stride, stride);
46
+ bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size) % 64 == 0;
47
+ bool predAlign = predYuv->getAddrOffset(absPartIdx, predYuv->m_size) % 64 == 0;
48
+ bool bufferAlignCheck = (stride % 64 == 0) && (tmpReconStride % 64 == 0) && tmpReconAlign && residualAlign && predAlign;
49
+ primitives.cu[sizeIdx].add_ps[bufferAlignCheck](tmpRecon, tmpReconStride, pred, residual, stride, stride);
50
}
51
else if (useTSkip)
52
{
53
54
coeff_t* coeffY = cu.m_trCoeff[0] + coeffOffsetY;
55
56
uint32_t sizeIdx = log2TrSize - 2;
57
- primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
58
+ primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
59
60
PicYuv* reconPic = m_frame->m_reconPic;
61
pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
62
63
if (numSig)
64
{
65
m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
66
- primitives.cu[sizeIdx].add_ps(picReconY, picStride, pred, residual, stride, stride);
67
+ bool picReconYAlign = (reconPic->m_cuOffsetY[cu.m_cuAddr] + reconPic->m_buOffsetY[cuGeom.absPartIdx + absPartIdx]) % 64 == 0;
68
+ bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
69
+ bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size)% 64 == 0;
70
+ bool bufferAlignCheck = (picStride % 64 == 0) && (stride % 64 == 0) && picReconYAlign && predAlign && residualAlign;
71
+ primitives.cu[sizeIdx].add_ps[bufferAlignCheck](picReconY, picStride, pred, residual, stride, stride);
72
cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
73
}
74
else
75
76
predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
77
cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
78
79
- primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
80
+ primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
81
+
82
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
83
if (numSig)
84
{
85
m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
86
- primitives.cu[sizeIdxC].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
87
+ bool reconQtAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
88
+ bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
89
+ bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
90
+ bool bufferAlignCheck = reconQtAlign && predAlign && residualAlign && (reconQtStride % 64 == 0) && (stride % 64 == 0);
91
+ primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride);
92
cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
93
}
94
else
95
96
pixel* recon = (useTSkip ? m_tsRecon : reconQt);
97
uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
98
99
- primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
100
+ primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
101
102
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
103
if (numSig)
104
{
105
m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
106
- primitives.cu[sizeIdxC].add_ps(recon, reconStride, pred, residual, stride, stride);
107
+ bool reconAlign = (useTSkip ? 1 : m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC)) % 64 == 0;
108
+ bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
109
+ bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
110
+ bool bufferAlignCheck = reconAlign && predYuvAlign && residualAlign && (reconStride % 64 == 0) && (stride % 64 == 0);
111
+ primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](recon, reconStride, pred, residual, stride, stride);
112
cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
113
}
114
else if (useTSkip)
115
116
117
X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");
118
119
- primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
120
+ primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
121
+
122
uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
123
if (numSig)
124
{
125
m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
126
- primitives.cu[sizeIdxC].add_ps(picReconC, picStride, pred, residual, stride, stride);
127
+ bool picReconCAlign = (reconPic->m_cuOffsetC[cu.m_cuAddr] + reconPic->m_buOffsetC[cuGeom.absPartIdx + absPartIdxC]) % 64 == 0;
128
+ bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
129
+ bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC)% 64 == 0;
130
+ bool bufferAlignCheck = picReconCAlign && predAlign && residualAlign && (picStride % 64 == 0) && (stride % 64 == 0);
131
+ primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](picReconC, picStride, pred, residual, stride, stride);
132
cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
133
}
134
else
135
136
137
pixel nScale[129];
138
intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0];
139
- primitives.scale1D_128to64(nScale + 1, intraNeighbourBuf[0] + 1);
140
+ primitives.scale1D_128to64[NONALIGNED](nScale + 1, intraNeighbourBuf[0] + 1);
141
142
// we do not estimate filtering for downscaled samples
143
memcpy(&intraNeighbourBuf[0][1], &nScale[1], 2 * 64 * sizeof(pixel)); // Top & Left pixels
144
145
bestME[list].mvCost = mvCost;
146
}
147
}
148
-
149
-void Search::searchMV(Mode& interMode, const PredictionUnit& pu, int list, int ref, MV& outmv)
150
+void Search::searchMV(Mode& interMode, const PredictionUnit& pu, int list, int ref, MV& outmv, MV mvp, int numMvc, MV* mvc)
151
{
152
CUData& cu = interMode.cu;
153
const Slice *slice = m_slice;
154
- MV mv = cu.m_mv[list][pu.puAbsPartIdx];
155
+ MV mv;
156
+ if (m_param->interRefine == 1)
157
+ mv = mvp;
158
+ else
159
+ mv = cu.m_mv[list][pu.puAbsPartIdx];
160
cu.clipMv(mv);
161
MV mvmin, mvmax;
162
setSearchRange(cu, mv, m_param->searchRange, mvmin, mvmax);
163
- m_me.refineMV(&slice->m_mref[list][ref], mvmin, mvmax, mv, outmv);
164
+ if (m_param->interRefine == 1)
165
+ m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mv, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
166
+ m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
167
+ else
168
+ m_me.refineMV(&slice->m_mref[list][ref], mvmin, mvmax, mv, outmv);
169
}
170
-
171
/* find the best inter prediction for each PU of specified mode */
172
void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2])
173
{
174
175
int totalmebits = 0;
176
MV mvzero(0, 0);
177
Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
178
-
179
MergeData merge;
180
memset(&merge, 0, sizeof(merge));
181
-
182
+ bool useAsMVP = false;
183
for (int puIdx = 0; puIdx < numPart; puIdx++)
184
{
185
MotionData* bestME = interMode.bestME[puIdx];
186
PredictionUnit pu(cu, cuGeom, puIdx);
187
-
188
m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
189
-
190
+ useAsMVP = false;
191
+ x265_analysis_inter_data* interDataCTU = NULL;
192
+ int cuIdx;
193
+ cuIdx = (interMode.cu.m_cuAddr * m_param->num4x4Partitions) + cuGeom.absPartIdx;
194
+ if (m_param->analysisReuseLevel == 10 && m_param->interRefine > 1)
195
+ {
196
+ interDataCTU = m_frame->m_analysisData.interData;
197
+ if ((cu.m_predMode[pu.puAbsPartIdx] == interDataCTU->modes[cuIdx + pu.puAbsPartIdx])
198
+ && (cu.m_partSize[pu.puAbsPartIdx] == interDataCTU->partSize[cuIdx + pu.puAbsPartIdx])
199
+ && !(interDataCTU->mergeFlag[cuIdx + puIdx])
200
+ && (cu.m_cuDepth[0] == interDataCTU->depth[cuIdx]))
201
+ useAsMVP = true;
202
+ }
203
/* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
204
uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge);
205
-
206
bestME[0].cost = MAX_UINT;
207
bestME[1].cost = MAX_UINT;
208
209
210
bool bDoUnidir = true;
211
212
cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours);
213
-
214
/* Uni-directional prediction */
215
if ((m_param->analysisLoad && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10)
216
- || (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) || (m_param->bMVType == AVC_INFO))
217
+ || (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) || (m_param->bMVType == AVC_INFO) || (useAsMVP))
218
{
219
for (int list = 0; list < numPredDir; list++)
220
{
221
- int ref = bestME[list].ref;
222
+
223
+ int ref = -1;
224
+ if (useAsMVP)
225
+ ref = interDataCTU->refIdx[list][cuIdx + puIdx];
226
+
227
+ else
228
+ ref = bestME[list].ref;
229
if (ref < 0)
230
+ {
231
continue;
232
-
233
+ }
234
uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
235
bits += getTUBits(ref, numRefIdx[list]);
236
237
int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
238
-
239
const MV* amvp = interMode.amvpCand[list][ref];
240
int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
241
- MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
242
-
243
+ MV mvmin, mvmax, outmv, mvp;
244
+ if (useAsMVP)
245
+ {
246
+ mvp = interDataCTU->mv[list][cuIdx + puIdx].word;
247
+ mvpIdx = interDataCTU->mvpIdx[list][cuIdx + puIdx];
248
+ }
249
+ else
250
+ mvp = amvp[mvpIdx];
251
if (m_param->searchMethod == X265_SEA)
252
{
253
int puX = puIdx & 1;
254
255
bits += m_me.bitcost(outmv);
256
uint32_t mvCost = m_me.mvcost(outmv);
257
uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
258
-
259
/* Refine MVP selection, updates: mvpIdx, bits, cost */
260
- if (!m_param->analysisMultiPassRefine)
261
+ if (!(m_param->analysisMultiPassRefine || useAsMVP))
262
mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
263
else
264
{
265
266
bestME[list].cost = cost;
267
bestME[list].bits = bits;
268
bestME[list].mvCost = mvCost;
269
+ bestME[list].ref = ref;
270
}
271
bDoUnidir = false;
272
}
273
274
/* Generate reference subpels */
275
predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);
276
predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);
277
-
278
- primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
279
+ primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (bidirYuv[0].m_size % 64 == 0) && (bidirYuv[1].m_size % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
280
bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);
281
satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
282
}
283
284
const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
285
const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
286
intptr_t refStride = slice->m_mref[0][0].lumaStride;
287
-
288
- primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
289
+ primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
290
satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
291
}
292
-
293
MV mvp0 = bestME[0].mvp;
294
int mvpIdx0 = bestME[0].mvpIdx;
295
uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
296
297
}
298
else
299
{
300
- primitives.cu[sizeIdx].blockfill_s(curResiY, strideResiY, 0);
301
+ primitives.cu[sizeIdx].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
302
cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth);
303
}
304
305
306
}
307
else
308
{
309
- primitives.cu[sizeIdxC].blockfill_s(curResiU, strideResiC, 0);
310
+ primitives.cu[sizeIdxC].blockfill_s[strideResiC % 64 == 0](curResiU, strideResiC, 0);
311
cu.setCbfPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
312
}
313
314
315
}
316
else
317
{
318
- primitives.cu[sizeIdxC].blockfill_s(curResiV, strideResiC, 0);
319
+ primitives.cu[sizeIdxC].blockfill_s[strideResiC % 64 == 0](curResiV, strideResiC, 0);
320
cu.setCbfPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
321
}
322
}
323
324
// non-zero cost calculation for luma - This is an approximation
325
// finally we have to encode correct cbf after comparing with null cost
326
pixel* curReconY = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
327
+ bool curReconYAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0;
328
uint32_t strideReconY = m_rqt[qtLayer].reconQtYuv.m_size;
329
- primitives.cu[partSize].add_ps(curReconY, strideReconY, mode.predYuv.getLumaAddr(absPartIdx), curResiY, mode.predYuv.m_size, strideResiY);
330
+ bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
331
+ bool curResiYAlign = m_rqt[qtLayer].resiQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].resiQtYuv.m_size) % 64 == 0;
332
+ bool bufferAlignCheck = curReconYAlign && predYuvAlign && curResiYAlign && (strideReconY % 64 == 0) && (mode.predYuv.m_size % 64 == 0) && (strideResiY % 64 == 0);
333
+ primitives.cu[partSize].add_ps[bufferAlignCheck](curReconY, strideReconY, mode.predYuv.getLumaAddr(absPartIdx), curResiY, mode.predYuv.m_size, strideResiY);
334
335
const sse_t nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, curReconY, strideReconY);
336
uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
337
338
{
339
cbfFlag[TEXT_LUMA][0] = 0;
340
singleBits[TEXT_LUMA][0] = 0;
341
- primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
342
+ primitives.cu[partSize].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
343
#if CHECKED_BUILD || _DEBUG
344
uint32_t numCoeffY = 1 << (log2TrSize << 1);
345
memset(coeffCurY, 0, sizeof(coeff_t)* numCoeffY);
346
347
{
348
if (checkTransformSkipY)
349
minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
350
- primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
351
+ primitives.cu[partSize].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
352
singleDist[TEXT_LUMA][0] = zeroDistY;
353
singleBits[TEXT_LUMA][0] = 0;
354
singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
355
356
// finally we have to encode correct cbf after comparing with null cost
357
pixel* curReconC = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
358
uint32_t strideReconC = m_rqt[qtLayer].reconQtYuv.m_csize;
359
- primitives.cu[partSizeC].add_ps(curReconC, strideReconC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), curResiC, mode.predYuv.m_csize, strideResiC);
360
+ bool curReconCAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
361
+ bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
362
+ bool curResiCAlign = m_rqt[qtLayer].resiQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
363
+ bool bufferAlignCheck = curReconCAlign && predYuvAlign && curResiCAlign && (strideReconC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (strideResiC % 64 == 0);
364
+ primitives.cu[partSizeC].add_ps[bufferAlignCheck](curReconC, strideReconC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), curResiC, mode.predYuv.m_csize, strideResiC);
365
sse_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, curReconC, strideReconC));
366
uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
367
uint32_t nonZeroEnergyC = 0; uint64_t singleCostC = 0;
368
369
{
370
cbfFlag[chromaId][tuIterator.section] = 0;
371
singleBits[chromaId][tuIterator.section] = 0;
372
- primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
373
+ primitives.cu[partSizeC].blockfill_s[strideResiC % 64 == 0](curResiC, strideResiC, 0);
374
#if CHECKED_BUILD || _DEBUG
375
uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
376
memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
377
378
{
379
if (checkTransformSkipC)
380
minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId);
381
- primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
382
+ primitives.cu[partSizeC].blockfill_s[strideResiC % 64 == 0](curResiC, strideResiC, 0);
383
singleBits[chromaId][tuIterator.section] = 0;
384
singleDist[chromaId][tuIterator.section] = zeroDistC;
385
singleEnergy[chromaId][tuIterator.section] = zeroEnergyC;
386
387
const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();
388
389
m_quant.invtransformNxN(cu, m_tsResidual, trSize, m_tsCoeff, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
390
+ bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
391
392
- primitives.cu[partSize].add_ps(m_tsRecon, trSize, mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, trSize);
393
+ bool bufferAlignCheck = predYuvAlign && (trSize % 64 == 0) && (mode.predYuv.m_size % 64 == 0);
394
+ primitives.cu[partSize].add_ps[bufferAlignCheck](m_tsRecon, trSize, mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, trSize);
395
nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, m_tsRecon, trSize);
396
397
if (m_rdCost.m_psyRd)
398
399
400
m_quant.invtransformNxN(cu, m_tsResidual, trSizeC, m_tsCoeff,
401
log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
402
- primitives.cu[partSizeC].add_ps(m_tsRecon, trSizeC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, mode.predYuv.m_csize, trSizeC);
403
+ bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
404
+ bool bufferAlignCheck = predYuvAlign && (trSizeC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (trSizeC % 64 == 0);
405
+ primitives.cu[partSizeC].add_ps[bufferAlignCheck](m_tsRecon, trSizeC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, mode.predYuv.m_csize, trSizeC);
406
nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, m_tsRecon, trSizeC));
407
if (m_rdCost.m_psyRd)
408
{
409
x265_2.7.tar.gz/source/encoder/search.h -> x265_2.9.tar.gz/source/encoder/search.h
Changed
11
1
2
3
// estimation inter prediction (non-skip)
4
void predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks[2]);
5
-
6
- void searchMV(Mode& interMode, const PredictionUnit& pu, int list, int ref, MV& outmv);
7
+ void searchMV(Mode& interMode, const PredictionUnit& pu, int list, int ref, MV& outmv, MV mvp, int numMvc, MV* mvc);
8
// encode residual and compute rd-cost for inter mode
9
void encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom);
10
void encodeResAndCalcRdSkipCU(Mode& interMode);
11
x265_2.7.tar.gz/source/encoder/sei.cpp -> x265_2.9.tar.gz/source/encoder/sei.cpp
Changed
141
1
2
};
3
4
/* marshal a single SEI message sei, storing the marshalled representation
5
- * in bitstream bs */
6
-void SEI::write(Bitstream& bs, const SPS& sps)
7
+* in bitstream bs */
8
+void SEI::writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested)
9
{
10
- uint32_t type = m_payloadType;
11
+ if (!isNested)
12
+ bs.resetBits();
13
+
14
+ BitCounter counter;
15
+ m_bitIf = &counter;
16
+ writeSEI(sps);
17
+ /* count the size of the payload and return the size in bits */
18
+ X265_CHECK(0 == (counter.getNumberOfWrittenBits() & 7), "payload unaligned\n");
19
+ uint32_t payloadData = counter.getNumberOfWrittenBits() >> 3;
20
+
21
+ // set bitstream
22
m_bitIf = &bs;
23
- BitCounter count;
24
- bool hrdTypes = (m_payloadType == ACTIVE_PARAMETER_SETS || m_payloadType == PICTURE_TIMING || m_payloadType == BUFFERING_PERIOD);
25
- if (hrdTypes)
26
- {
27
- m_bitIf = &count;
28
- /* virtual writeSEI method, write to bit counter to determine size */
29
- writeSEI(sps);
30
- m_bitIf = &bs;
31
- uint32_t payloadType = m_payloadType;
32
- for (; payloadType >= 0xff; payloadType -= 0xff)
33
- WRITE_CODE(0xff, 8, "payload_type");
34
- }
35
- WRITE_CODE(type, 8, "payload_type");
36
- uint32_t payloadSize;
37
- if (hrdTypes || m_payloadType == USER_DATA_UNREGISTERED || m_payloadType == USER_DATA_REGISTERED_ITU_T_T35)
38
+
39
+ uint32_t payloadType = m_payloadType;
40
+ for (; payloadType >= 0xff; payloadType -= 0xff)
41
+ WRITE_CODE(0xff, 8, "payload_type");
42
+ WRITE_CODE(payloadType, 8, "payload_type");
43
+
44
+ uint32_t payloadSize = payloadData;
45
+ for (; payloadSize >= 0xff; payloadSize -= 0xff)
46
+ WRITE_CODE(0xff, 8, "payload_size");
47
+ WRITE_CODE(payloadSize, 8, "payload_size");
48
+
49
+ // virtual writeSEI method, write to bs
50
+ writeSEI(sps);
51
+
52
+ if (!isNested)
53
{
54
- if (hrdTypes)
55
- {
56
- X265_CHECK(0 == (count.getNumberOfWrittenBits() & 7), "payload unaligned\n");
57
- payloadSize = count.getNumberOfWrittenBits() >> 3;
58
- }
59
- else if (m_payloadType == USER_DATA_UNREGISTERED)
60
- payloadSize = m_payloadSize + 16;
61
- else
62
- payloadSize = m_payloadSize;
63
-
64
- for (; payloadSize >= 0xff; payloadSize -= 0xff)
65
- WRITE_CODE(0xff, 8, "payload_size");
66
- WRITE_CODE(payloadSize, 8, "payload_size");
67
+ bs.writeByteAlignment();
68
+ list.serialize(nalUnitType, bs);
69
}
70
- else
71
- WRITE_CODE(m_payloadSize, 8, "payload_size");
72
- /* virtual writeSEI method, write to bs */
73
- writeSEI(sps);
74
}
75
76
void SEI::writeByteAlign()
77
78
{
79
m_payloadSize = size;
80
}
81
+
82
+/* charSet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" */
83
+
84
+char* SEI::base64Decode(char encodedString[], int base64EncodeLength)
85
+{
86
+ char* decodedString;
87
+ decodedString = (char*)malloc(sizeof(char) * ((base64EncodeLength / 4) * 3));
88
+ int i, j, k = 0;
89
+ // stores the bitstream
90
+ int bitstream = 0;
91
+ // countBits stores current number of bits in bitstream
92
+ int countBits = 0;
93
+ // selects 4 characters from encodedString at a time. Find the position of each encoded character in charSet and stores in bitstream
94
+ for (i = 0; i < base64EncodeLength; i += 4)
95
+ {
96
+ bitstream = 0, countBits = 0;
97
+ for (j = 0; j < 4; j++)
98
+ {
99
+ // make space for 6 bits
100
+ if (encodedString[i + j] != '=')
101
+ {
102
+ bitstream = bitstream << 6;
103
+ countBits += 6;
104
+ }
105
+ // Finding the position of each encoded character in charSet and storing in bitstream, use OR '|' operator to store bits
106
+
107
+ if (encodedString[i + j] >= 'A' && encodedString[i + j] <= 'Z')
108
+ bitstream = bitstream | (encodedString[i + j] - 'A');
109
+
110
+ else if (encodedString[i + j] >= 'a' && encodedString[i + j] <= 'z')
111
+ bitstream = bitstream | (encodedString[i + j] - 'a' + 26);
112
+
113
+ else if (encodedString[i + j] >= '0' && encodedString[i + j] <= '9')
114
+ bitstream = bitstream | (encodedString[i + j] - '0' + 52);
115
+
116
+ // '+' occurs in 62nd position in charSet
117
+ else if (encodedString[i + j] == '+')
118
+ bitstream = bitstream | 62;
119
+
120
+ // '/' occurs in 63rd position in charSet
121
+ else if (encodedString[i + j] == '/')
122
+ bitstream = bitstream | 63;
123
+
124
+ // to delete appended bits during encoding
125
+ else
126
+ {
127
+ bitstream = bitstream >> 2;
128
+ countBits -= 2;
129
+ }
130
+ }
131
+
132
+ while (countBits != 0)
133
+ {
134
+ countBits -= 8;
135
+ decodedString[k++] = (bitstream >> countBits) & 255;
136
+ }
137
+ }
138
+ return decodedString;
139
+}
140
+
141
x265_2.7.tar.gz/source/encoder/sei.h -> x265_2.9.tar.gz/source/encoder/sei.h
Changed
135
1
2
#include "common.h"
3
#include "bitstream.h"
4
#include "slice.h"
5
+#include "nal.h"
6
+#include "md5.h"
7
8
namespace X265_NS {
9
// private namespace
10
11
class SEI : public SyntaxElementWriter
12
{
13
public:
14
- /* SEI users call write() to marshal an SEI to a bitstream.
15
- * The write() method calls writeSEI() which encodes the header */
16
- void write(Bitstream& bs, const SPS& sps);
17
-
18
+ /* SEI users call writeSEImessages() to marshal an SEI to a bitstream.
19
+ * The writeSEImessages() method calls writeSEI() which encodes the header */
20
+ void writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested);
21
void setSize(uint32_t size);
22
+ static char* base64Decode(char encodedString[], int base64EncodeLength);
23
virtual ~SEI() {}
24
protected:
25
SEIPayloadType m_payloadType;
26
27
void writeByteAlign();
28
};
29
30
+//seongnam.oh@samsung.com :: for the Creative Intent Meta Data Encoding
31
+class SEIuserDataRegistered : public SEI
32
+{
33
+public:
34
+ SEIuserDataRegistered()
35
+ {
36
+ m_payloadType = USER_DATA_REGISTERED_ITU_T_T35;
37
+ m_payloadSize = 0;
38
+ }
39
+
40
+ uint8_t *m_userData;
41
+
42
+ // daniel.vt@samsung.com :: for the Creative Intent Meta Data Encoding ( seongnam.oh@samsung.com )
43
+ void writeSEI(const SPS&)
44
+ {
45
+ if (!m_userData)
46
+ return;
47
+
48
+ uint32_t i = 0;
49
+ for (; i < m_payloadSize; ++i)
50
+ WRITE_CODE(m_userData[i], 8, "creative_intent_metadata");
51
+ }
52
+};
53
+
54
+static const uint32_t ISO_IEC_11578_LEN = 16;
55
+
56
class SEIuserDataUnregistered : public SEI
57
{
58
public:
59
60
m_payloadType = USER_DATA_UNREGISTERED;
61
m_payloadSize = 0;
62
}
63
- static const uint8_t m_uuid_iso_iec_11578[16];
64
+ static const uint8_t m_uuid_iso_iec_11578[ISO_IEC_11578_LEN];
65
uint8_t *m_userData;
66
void writeSEI(const SPS&)
67
{
68
- for (uint32_t i = 0; i < 16; i++)
69
+ for (uint32_t i = 0; i < ISO_IEC_11578_LEN; i++)
70
WRITE_CODE(m_uuid_iso_iec_11578[i], 8, "sei.uuid_iso_iec_11578[i]");
71
for (uint32_t i = 0; i < m_payloadSize; i++)
72
WRITE_CODE(m_userData[i], 8, "user_data");
73
74
CRC,
75
CHECKSUM,
76
} m_method;
77
- uint8_t m_digest[3][16];
78
+
79
+ MD5Context m_state[3];
80
+ uint32_t m_crc[3];
81
+ uint32_t m_checksum[3];
82
+ uint8_t m_digest[3][16];
83
+
84
void writeSEI(const SPS& sps)
85
{
86
int planes = (sps.chromaFormatIdc != X265_CSP_I400) ? 3 : 1;
87
88
class SEIRecoveryPoint : public SEI
89
{
90
public:
91
+ SEIRecoveryPoint()
92
+ {
93
+ m_payloadType = RECOVERY_POINT;
94
+ m_payloadSize = 0;
95
+ }
96
int m_recoveryPocCnt;
97
bool m_exactMatchingFlag;
98
bool m_brokenLinkFlag;
99
100
}
101
};
102
103
-//seongnam.oh@samsung.com :: for the Creative Intent Meta Data Encoding
104
-class SEICreativeIntentMeta : public SEI
105
+class SEIAlternativeTC : public SEI
106
{
107
public:
108
- SEICreativeIntentMeta()
109
+ int m_preferredTransferCharacteristics;
110
+ SEIAlternativeTC()
111
{
112
- m_payloadType = USER_DATA_REGISTERED_ITU_T_T35;
113
+ m_payloadType = ALTERNATIVE_TRANSFER_CHARACTERISTICS;
114
m_payloadSize = 0;
115
+ m_preferredTransferCharacteristics = -1;
116
}
117
118
- uint8_t *m_payload;
119
-
120
- // daniel.vt@samsung.com :: for the Creative Intent Meta Data Encoding ( seongnam.oh@samsung.com )
121
void writeSEI(const SPS&)
122
{
123
- if (!m_payload)
124
- return;
125
-
126
- uint32_t i = 0;
127
- for (; i < m_payloadSize; ++i)
128
- WRITE_CODE(m_payload[i], 8, "creative_intent_metadata");
129
+ WRITE_CODE(m_preferredTransferCharacteristics, 8, "Preferred transfer characteristics");
130
}
131
};
132
+
133
}
134
#endif // ifndef X265_SEI_H
135
x265_2.7.tar.gz/source/encoder/slicetype.cpp -> x265_2.9.tar.gz/source/encoder/slicetype.cpp
Changed
400
1
2
curFrame->m_lowres.wp_sum[y] = 0;
3
}
4
5
- /* Calculate Qp offset for each 16x16 or 8x8 block in the frame */
6
- int blockXY = 0;
7
- int blockX = 0, blockY = 0;
8
- double strength = 0.f;
9
+ /* Calculate Qp offset for each 16x16 or 8x8 block in the frame */
10
if ((param->rc.aqMode == X265_AQ_NONE || param->rc.aqStrength == 0) || (param->rc.bStatRead && param->rc.cuTree && IS_REFERENCED(curFrame)))
11
{
12
- /* Need to init it anyways for CU tree */
13
- int cuCount = blockCount;
14
-
15
if (param->rc.aqMode && param->rc.aqStrength == 0)
16
{
17
if (quantOffsets)
18
{
19
- for (int cuxy = 0; cuxy < cuCount; cuxy++)
20
+ for (int cuxy = 0; cuxy < blockCount; cuxy++)
21
{
22
curFrame->m_lowres.qpCuTreeOffset[cuxy] = curFrame->m_lowres.qpAqOffset[cuxy] = quantOffsets[cuxy];
23
curFrame->m_lowres.invQscaleFactor[cuxy] = x265_exp2fix8(curFrame->m_lowres.qpCuTreeOffset[cuxy]);
24
25
}
26
else
27
{
28
- memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
29
- memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
30
- for (int cuxy = 0; cuxy < cuCount; cuxy++)
31
- curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
32
+ memset(curFrame->m_lowres.qpCuTreeOffset, 0, blockCount * sizeof(double));
33
+ memset(curFrame->m_lowres.qpAqOffset, 0, blockCount * sizeof(double));
34
+ for (int cuxy = 0; cuxy < blockCount; cuxy++)
35
+ curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
36
}
37
}
38
39
- /* Need variance data for weighted prediction */
40
+ /* Need variance data for weighted prediction and dynamic refinement*/
41
if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
42
{
43
- for (blockY = 0; blockY < maxRow; blockY += loopIncr)
44
- for (blockX = 0; blockX < maxCol; blockX += loopIncr)
45
- acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
46
+ for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
47
+ for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
48
+ acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
49
}
50
}
51
else
52
{
53
- blockXY = 0;
54
- double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
55
- double bias_strength = 0.f;
56
+ int blockXY = 0;
57
+ double avg_adj_pow2 = 0.f, avg_adj = 0.f, qp_adj = 0.f;
58
+ double bias_strength = 0.f, strength = 0.f;
59
if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
60
{
61
- double bit_depth_correction = 1.f / (1 << (2*(X265_DEPTH-8)));
62
- curFrame->m_lowres.frameVariance = 0;
63
- uint64_t rowVariance = 0;
64
- for (blockY = 0; blockY < maxRow; blockY += loopIncr)
65
- {
66
- rowVariance = 0;
67
- for (blockX = 0; blockX < maxCol; blockX += loopIncr)
68
- {
69
- uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
70
- curFrame->m_lowres.blockVariance[blockXY] = energy;
71
- rowVariance += energy;
72
+ double bit_depth_correction = 1.f / (1 << (2*(X265_DEPTH-8)));
73
+
74
+ for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
75
+ {
76
+ for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
77
+ {
78
+ uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
79
qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
80
curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
81
avg_adj += qp_adj;
82
avg_adj_pow2 += qp_adj * qp_adj;
83
blockXY++;
84
}
85
- curFrame->m_lowres.frameVariance += (rowVariance / maxCol);
86
}
87
- curFrame->m_lowres.frameVariance /= maxRow;
88
avg_adj /= blockCount;
89
avg_adj_pow2 /= blockCount;
90
strength = param->rc.aqStrength * avg_adj;
91
- avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (modeTwoConst)) / avg_adj;
92
+ avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - modeTwoConst) / avg_adj;
93
bias_strength = param->rc.aqStrength;
94
}
95
else
96
strength = param->rc.aqStrength * 1.0397f;
97
98
blockXY = 0;
99
- for (blockY = 0; blockY < maxRow; blockY += loopIncr)
100
+ for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
101
{
102
- for (blockX = 0; blockX < maxCol; blockX += loopIncr)
103
+ for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
104
{
105
if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
106
{
107
108
else
109
{
110
uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp,param->rc.qgSize);
111
- qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8)));
112
+ qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8)));
113
}
114
115
if (param->bHDROpt)
116
117
curFrame->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]);
118
}
119
}
120
+
121
+ if (param->bDynamicRefine)
122
+ {
123
+ int blockXY = 0;
124
+ for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
125
+ for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
126
+ {
127
+ curFrame->m_lowres.blockVariance[blockXY] = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
128
+ blockXY++;
129
+ }
130
+ }
131
}
132
133
void LookaheadTLD::lowresIntraEstimate(Lowres& fenc, uint32_t qgSize)
134
135
pixel *src = ref.fpelPlane[0];
136
intptr_t stride = fenc.lumaStride;
137
138
- if (wp.bPresentFlag)
139
+ if (wp.wtPresent)
140
{
141
int offset = wp.inputOffset << (X265_DEPTH - 8);
142
int scale = wp.inputWeight;
143
144
int deltaIndex = fenc.frameNum - ref.frameNum;
145
146
WeightParam wp;
147
- wp.bPresentFlag = false;
148
+ wp.wtPresent = 0;
149
150
if (!wbuffer[0])
151
{
152
153
}
154
155
int bframes, brefs;
156
- for (bframes = 0, brefs = 0;; bframes++)
157
+ if (!m_param->analysisLoad)
158
{
159
- Lowres& frm = list[bframes]->m_lowres;
160
-
161
- if (frm.sliceType == X265_TYPE_BREF && !m_param->bBPyramid && brefs == m_param->bBPyramid)
162
+ for (bframes = 0, brefs = 0;; bframes++)
163
{
164
- frm.sliceType = X265_TYPE_B;
165
- x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid\n",
166
- frm.frameNum);
167
- }
168
+ Lowres& frm = list[bframes]->m_lowres;
169
170
- /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available.
171
- * smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it. */
172
- else if (frm.sliceType == X265_TYPE_BREF && m_param->bBPyramid && brefs &&
173
- m_param->maxNumReferences <= (brefs + 3))
174
- {
175
- frm.sliceType = X265_TYPE_B;
176
- x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid and %d reference frames\n",
177
- frm.sliceType, m_param->maxNumReferences);
178
- }
179
- if ((!m_param->bIntraRefresh || frm.frameNum == 0) && frm.frameNum - m_lastKeyframe >= m_param->keyframeMax &&
180
- (!m_extendGopBoundary || frm.frameNum - m_lastKeyframe >= m_param->keyframeMax + m_param->gopLookahead))
181
- {
182
- if (frm.sliceType == X265_TYPE_AUTO || frm.sliceType == X265_TYPE_I)
183
- frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
184
- bool warn = frm.sliceType != X265_TYPE_IDR;
185
- if (warn && m_param->bOpenGOP)
186
- warn &= frm.sliceType != X265_TYPE_I;
187
- if (warn)
188
+ if (frm.sliceType == X265_TYPE_BREF && !m_param->bBPyramid && brefs == m_param->bBPyramid)
189
{
190
- x265_log(m_param, X265_LOG_WARNING, "specified frame type (%d) at %d is not compatible with keyframe interval\n",
191
- frm.sliceType, frm.frameNum);
192
- frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
193
+ frm.sliceType = X265_TYPE_B;
194
+ x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid\n",
195
+ frm.frameNum);
196
}
197
- }
198
- if (frm.sliceType == X265_TYPE_I && frm.frameNum - m_lastKeyframe >= m_param->keyframeMin)
199
- {
200
- if (m_param->bOpenGOP)
201
+
202
+ /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available.
203
+ * smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it. */
204
+ else if (frm.sliceType == X265_TYPE_BREF && m_param->bBPyramid && brefs &&
205
+ m_param->maxNumReferences <= (brefs + 3))
206
+ {
207
+ frm.sliceType = X265_TYPE_B;
208
+ x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid and %d reference frames\n",
209
+ frm.sliceType, m_param->maxNumReferences);
210
+ }
211
+ if (((!m_param->bIntraRefresh || frm.frameNum == 0) && frm.frameNum - m_lastKeyframe >= m_param->keyframeMax &&
212
+ (!m_extendGopBoundary || frm.frameNum - m_lastKeyframe >= m_param->keyframeMax + m_param->gopLookahead)) ||
213
+ (frm.frameNum == (m_param->chunkStart - 1)) || (frm.frameNum == m_param->chunkEnd))
214
+ {
215
+ if (frm.sliceType == X265_TYPE_AUTO || frm.sliceType == X265_TYPE_I)
216
+ frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
217
+ bool warn = frm.sliceType != X265_TYPE_IDR;
218
+ if (warn && m_param->bOpenGOP)
219
+ warn &= frm.sliceType != X265_TYPE_I;
220
+ if (warn)
221
+ {
222
+ x265_log(m_param, X265_LOG_WARNING, "specified frame type (%d) at %d is not compatible with keyframe interval\n",
223
+ frm.sliceType, frm.frameNum);
224
+ frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
225
+ }
226
+ }
227
+ if ((frm.sliceType == X265_TYPE_I && frm.frameNum - m_lastKeyframe >= m_param->keyframeMin) || (frm.frameNum == (m_param->chunkStart - 1)) || (frm.frameNum == m_param->chunkEnd))
228
{
229
+ if (m_param->bOpenGOP)
230
+ {
231
+ m_lastKeyframe = frm.frameNum;
232
+ frm.bKeyframe = true;
233
+ }
234
+ else
235
+ frm.sliceType = X265_TYPE_IDR;
236
+ }
237
+ if (frm.sliceType == X265_TYPE_IDR)
238
+ {
239
+ /* Closed GOP */
240
m_lastKeyframe = frm.frameNum;
241
frm.bKeyframe = true;
242
+ if (bframes > 0 && !m_param->radl)
243
+ {
244
+ list[bframes - 1]->m_lowres.sliceType = X265_TYPE_P;
245
+ bframes--;
246
+ }
247
}
248
- else
249
- frm.sliceType = X265_TYPE_IDR;
250
- }
251
- if (frm.sliceType == X265_TYPE_IDR)
252
- {
253
- /* Closed GOP */
254
- m_lastKeyframe = frm.frameNum;
255
- frm.bKeyframe = true;
256
- if (bframes > 0 && !m_param->radl)
257
+ if (bframes == m_param->bframes || !list[bframes + 1])
258
{
259
- list[bframes - 1]->m_lowres.sliceType = X265_TYPE_P;
260
- bframes--;
261
+ if (IS_X265_TYPE_B(frm.sliceType))
262
+ x265_log(m_param, X265_LOG_WARNING, "specified frame type is not compatible with max B-frames\n");
263
+ if (frm.sliceType == X265_TYPE_AUTO || IS_X265_TYPE_B(frm.sliceType))
264
+ frm.sliceType = X265_TYPE_P;
265
}
266
- }
267
- if (m_param->radl && !m_param->bOpenGOP && list[bframes + 1])
268
- {
269
- if ((frm.frameNum - m_lastKeyframe) > (m_param->keyframeMax - m_param->radl - 1) && (frm.frameNum - m_lastKeyframe) < m_param->keyframeMax)
270
+ if (frm.sliceType == X265_TYPE_BREF)
271
+ brefs++;
272
+ if (frm.sliceType == X265_TYPE_AUTO)
273
frm.sliceType = X265_TYPE_B;
274
- if ((frm.frameNum - m_lastKeyframe) == (m_param->keyframeMax - m_param->radl - 1))
275
- frm.sliceType = X265_TYPE_P;
276
+ else if (!IS_X265_TYPE_B(frm.sliceType))
277
+ break;
278
}
279
-
280
- if (bframes == m_param->bframes || !list[bframes + 1])
281
+ }
282
+ else
283
+ {
284
+ for (bframes = 0, brefs = 0;; bframes++)
285
{
286
- if (IS_X265_TYPE_B(frm.sliceType))
287
- x265_log(m_param, X265_LOG_WARNING, "specified frame type is not compatible with max B-frames\n");
288
- if (frm.sliceType == X265_TYPE_AUTO || IS_X265_TYPE_B(frm.sliceType))
289
- frm.sliceType = X265_TYPE_P;
290
- }
291
- if (frm.sliceType == X265_TYPE_BREF)
292
- brefs++;
293
- if (frm.sliceType == X265_TYPE_AUTO)
294
- frm.sliceType = X265_TYPE_B;
295
- else if (!IS_X265_TYPE_B(frm.sliceType))
296
- break;
297
+ Lowres& frm = list[bframes]->m_lowres;
298
+ if (frm.sliceType == X265_TYPE_BREF)
299
+ brefs++;
300
+ if ((IS_X265_TYPE_I(frm.sliceType) && frm.frameNum - m_lastKeyframe >= m_param->keyframeMin)
301
+ || (frm.frameNum == (m_param->chunkStart - 1)) || (frm.frameNum == m_param->chunkEnd))
302
+ {
303
+ m_lastKeyframe = frm.frameNum;
304
+ frm.bKeyframe = true;
305
+ }
306
+ if (!IS_X265_TYPE_B(frm.sliceType))
307
+ break;
308
+ }
309
}
310
-
311
if (bframes)
312
list[bframes - 1]->m_lowres.bLastMiniGopBFrame = true;
313
list[bframes]->m_lowres.leadingBframes = bframes;
314
315
return;
316
}
317
frames[framecnt + 1] = NULL;
318
- int keyFrameLimit = m_param->keyframeMax + m_lastKeyframe - frames[0]->frameNum - 1;
319
+
320
+ int keylimit = m_param->keyframeMax;
321
+ if (frames[0]->frameNum < m_param->chunkEnd)
322
+ {
323
+ int chunkStart = (m_param->chunkStart - m_lastKeyframe - 1);
324
+ int chunkEnd = (m_param->chunkEnd - m_lastKeyframe);
325
+ if ((chunkStart > 0) && (chunkStart < m_param->keyframeMax))
326
+ keylimit = chunkStart;
327
+ else if ((chunkEnd > 0) && (chunkEnd < m_param->keyframeMax))
328
+ keylimit = chunkEnd;
329
+ }
330
+
331
+ int keyFrameLimit = keylimit + m_lastKeyframe - frames[0]->frameNum - 1;
332
if (m_param->gopLookahead && keyFrameLimit <= m_param->bframes + 1)
333
keyintLimit = keyFrameLimit + m_param->gopLookahead;
334
else
335
336
int numBFrames = 0;
337
int numAnalyzed = numFrames;
338
bool isScenecut = scenecut(frames, 0, 1, true, origNumFrames);
339
+
340
/* When scenecut threshold is set, use scenecut detection for I frame placements */
341
if (m_param->scenecutThreshold && isScenecut)
342
{
343
344
frames[numFrames]->sliceType = X265_TYPE_P;
345
}
346
347
- /* Check scenecut on the first minigop. */
348
- for (int j = 1; j < numBFrames + 1; j++)
349
+ bool bForceRADL = m_param->radl && !m_param->bOpenGOP;
350
+ bool bLastMiniGop = (framecnt >= m_param->bframes + 1) ? false : true;
351
+ int preRADL = m_lastKeyframe + m_param->keyframeMax - m_param->radl - 1; /*Frame preceeding RADL in POC order*/
352
+ if (bForceRADL && (frames[0]->frameNum == preRADL) && !bLastMiniGop)
353
+ {
354
+ int j = 1;
355
+ numBFrames = m_param->radl;
356
+ for (; j <= m_param->radl; j++)
357
+ frames[j]->sliceType = X265_TYPE_B;
358
+ frames[j]->sliceType = X265_TYPE_I;
359
+ }
360
+ else /* Check scenecut and RADL on the first minigop. */
361
{
362
- if (scenecut(frames, j, j + 1, false, origNumFrames))
363
+ for (int j = 1; j < numBFrames + 1; j++)
364
{
365
- frames[j]->sliceType = X265_TYPE_P;
366
- numAnalyzed = j;
367
- break;
368
+ if (scenecut(frames, j, j + 1, false, origNumFrames) ||
369
+ (bForceRADL && (frames[j]->frameNum == preRADL)))
370
+ {
371
+ frames[j]->sliceType = X265_TYPE_P;
372
+ numAnalyzed = j;
373
+ break;
374
+ }
375
}
376
}
377
resetStart = bKeyframe ? 1 : X265_MIN(numBFrames + 2, numAnalyzed + 1);
378
379
intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
380
pixel *src0 = fref0->lowresMC(pelOffset, fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0);
381
pixel *src1 = fref1->lowresMC(pelOffset, fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1);
382
-
383
ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
384
- primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
385
+ primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
386
int bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
387
COPY2_IF_LT(bcost, bicost, listused, 3);
388
-
389
/* coloc candidate */
390
src0 = fref0->lowresPlane[0] + pelOffset;
391
src1 = fref1->lowresPlane[0] + pelOffset;
392
- primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, fref0->lumaStride, src1, fref1->lumaStride, 32);
393
+ primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, X265_LOWRES_CU_SIZE, src0, fref0->lumaStride, src1, fref1->lumaStride, 32);
394
bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
395
COPY2_IF_LT(bcost, bicost, listused, 3);
396
-
397
bcost += lowresPenalty;
398
}
399
else /* P, also consider intra */
400
x265_2.7.tar.gz/source/encoder/weightPrediction.cpp -> x265_2.9.tar.gz/source/encoder/weightPrediction.cpp
Changed
56
1
2
int denom = w->log2WeightDenom;
3
int round = denom ? 1 << (denom - 1) : 0;
4
int correction = IF_INTERNAL_PREC - X265_DEPTH; /* intermediate interpolation depth */
5
- int pwidth = ((width + 15) >> 4) << 4;
6
-
7
+ int pwidth = ((width + 31) >> 5) << 5;
8
primitives.weight_pp(ref, weightTemp, stride, pwidth, height,
9
weight, round << correction, denom + correction, offset);
10
ref = weightTemp;
11
12
for (int plane = 0; plane < (param.internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
13
{
14
denom = plane ? chromaDenom : lumaDenom;
15
- if (plane && !weights[0].bPresentFlag)
16
+ if (plane && !weights[0].wtPresent)
17
break;
18
19
/* Early termination */
20
21
}
22
}
23
24
- if (weights[0].bPresentFlag)
25
+ if (weights[0].wtPresent)
26
{
27
// Make sure both chroma channels match
28
- if (weights[1].bPresentFlag != weights[2].bPresentFlag)
29
+ if (weights[1].wtPresent != weights[2].wtPresent)
30
{
31
- if (weights[1].bPresentFlag)
32
+ if (weights[1].wtPresent)
33
weights[2] = weights[1];
34
else
35
weights[1] = weights[2];
36
37
for (int list = 0; list < numPredDir; list++)
38
{
39
WeightParam* w = &wp[list][0][0];
40
- if (w[0].bPresentFlag || w[1].bPresentFlag || w[2].bPresentFlag)
41
+ if (w[0].wtPresent || w[1].wtPresent || w[2].wtPresent)
42
{
43
bWeighted = true;
44
p += sprintf(buf + p, " [L%d:R0 ", list);
45
- if (w[0].bPresentFlag)
46
+ if (w[0].wtPresent)
47
p += sprintf(buf + p, "Y{%d/%d%+d}", w[0].inputWeight, 1 << w[0].log2WeightDenom, w[0].inputOffset);
48
- if (w[1].bPresentFlag)
49
+ if (w[1].wtPresent)
50
p += sprintf(buf + p, "U{%d/%d%+d}", w[1].inputWeight, 1 << w[1].log2WeightDenom, w[1].inputOffset);
51
- if (w[2].bPresentFlag)
52
+ if (w[2].wtPresent)
53
p += sprintf(buf + p, "V{%d/%d%+d}", w[2].inputWeight, 1 << w[2].log2WeightDenom, w[2].inputOffset);
54
p += sprintf(buf + p, "]");
55
}
56
x265_2.7.tar.gz/source/test/ipfilterharness.cpp -> x265_2.9.tar.gz/source/test/ipfilterharness.cpp
Changed
268
1
2
return true;
3
}
4
5
+bool IPFilterHarness::check_IPFilterLumaP2S_aligned_primitive(filter_p2s_t ref, filter_p2s_t opt)
6
+{
7
+ for (int i = 0; i < TEST_CASES; i++)
8
+ {
9
+ int index = i % TEST_CASES;
10
+ intptr_t rand_srcStride[] = { 128, 192, 256, 512 };
11
+ intptr_t dstStride[] = { 192, 256, 512, 576 };
12
+ for (int p = 0; p < 4; p++)
13
+ {
14
+ ref(pixel_test_buff[index], rand_srcStride[p], IPF_C_output_s, dstStride[p]);
15
+ checked(opt, pixel_test_buff[index] + (64 * i), rand_srcStride[p], IPF_vec_output_s, dstStride[p]);
16
+ if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t)))
17
+ return false;
18
+ }
19
+ reportfail();
20
+ }
21
+
22
+ return true;
23
+}
24
+
25
bool IPFilterHarness::check_IPFilterChromaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt)
26
{
27
for (int i = 0; i < ITERS; i++)
28
29
return true;
30
}
31
32
+bool IPFilterHarness::check_IPFilterChromaP2S_aligned_primitive(filter_p2s_t ref, filter_p2s_t opt)
33
+{
34
+ for (int i = 0; i < TEST_CASES; i++)
35
+ {
36
+ int index = i % TEST_CASES;
37
+ intptr_t rand_srcStride[] = { 128, 192, 256, 512};
38
+ intptr_t dstStride[] = { 192, 256, 512, 576 };
39
+
40
+ for (int p = 0; p < 4; p++)
41
+ {
42
+ ref(pixel_test_buff[index], rand_srcStride[p], IPF_C_output_s, dstStride[p]);
43
+
44
+ checked(opt, pixel_test_buff[index], rand_srcStride[p], IPF_vec_output_s, dstStride[p]);
45
+
46
+ if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t)))
47
+ return false;
48
+ }
49
+ reportfail();
50
+ }
51
+
52
+ return true;
53
+}
54
+
55
bool IPFilterHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
56
{
57
58
59
return false;
60
}
61
}
62
- if (opt.pu[value].convert_p2s)
63
+ if (opt.pu[value].convert_p2s[NONALIGNED])
64
{
65
- if (!check_IPFilterLumaP2S_primitive(ref.pu[value].convert_p2s, opt.pu[value].convert_p2s))
66
+ if (!check_IPFilterLumaP2S_primitive(ref.pu[value].convert_p2s[NONALIGNED], opt.pu[value].convert_p2s[NONALIGNED]))
67
{
68
printf("convert_p2s[%s]", lumaPartStr[value]);
69
return false;
70
}
71
}
72
+ if (opt.pu[value].convert_p2s[ALIGNED])
73
+ {
74
+ if (!check_IPFilterLumaP2S_aligned_primitive(ref.pu[value].convert_p2s[ALIGNED], opt.pu[value].convert_p2s[ALIGNED]))
75
+ {
76
+ printf("convert_p2s_aligned[%s]", lumaPartStr[value]);
77
+ return false;
78
+ }
79
+ }
80
}
81
82
for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++)
83
84
return false;
85
}
86
}
87
- if (opt.chroma[csp].pu[value].p2s)
88
+ if (opt.chroma[csp].pu[value].p2s[ALIGNED])
89
+ {
90
+ if (!check_IPFilterChromaP2S_aligned_primitive(ref.chroma[csp].pu[value].p2s[ALIGNED], opt.chroma[csp].pu[value].p2s[ALIGNED]))
91
+ {
92
+ printf("chroma_p2s_aligned[%s]", chromaPartStr[csp][value]);
93
+ return false;
94
+ }
95
+ }
96
+ if (opt.chroma[csp].pu[value].p2s[NONALIGNED])
97
{
98
- if (!check_IPFilterChromaP2S_primitive(ref.chroma[csp].pu[value].p2s, opt.chroma[csp].pu[value].p2s))
99
+ if (!check_IPFilterChromaP2S_primitive(ref.chroma[csp].pu[value].p2s[NONALIGNED], opt.chroma[csp].pu[value].p2s[NONALIGNED]))
100
{
101
printf("chroma_p2s[%s]", chromaPartStr[csp][value]);
102
return false;
103
104
105
void IPFilterHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
106
{
107
- int16_t srcStride = 96;
108
- int16_t dstStride = 96;
109
+ int16_t srcStride = 192; /* Multiple of 64 */
110
+ int16_t dstStride = 192;
111
int maxVerticalfilterHalfDistance = 3;
112
113
for (int value = 0; value < NUM_PU_SIZES; value++)
114
115
{
116
printf("luma_hpp[%s]\t", lumaPartStr[value]);
117
REPORT_SPEEDUP(opt.pu[value].luma_hpp, ref.pu[value].luma_hpp,
118
- pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
119
+ pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
120
}
121
122
if (opt.pu[value].luma_hps)
123
{
124
printf("luma_hps[%s]\t", lumaPartStr[value]);
125
REPORT_SPEEDUP(opt.pu[value].luma_hps, ref.pu[value].luma_hps,
126
- pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
127
- IPF_vec_output_s, dstStride, 1, 1);
128
+ pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
129
+ IPF_vec_output_s, dstStride, 1, 1);
130
}
131
132
if (opt.pu[value].luma_vpp)
133
{
134
printf("luma_vpp[%s]\t", lumaPartStr[value]);
135
REPORT_SPEEDUP(opt.pu[value].luma_vpp, ref.pu[value].luma_vpp,
136
- pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
137
- IPF_vec_output_p, dstStride, 1);
138
+ pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
139
+ IPF_vec_output_p, dstStride, 1);
140
}
141
142
if (opt.pu[value].luma_vps)
143
{
144
printf("luma_vps[%s]\t", lumaPartStr[value]);
145
REPORT_SPEEDUP(opt.pu[value].luma_vps, ref.pu[value].luma_vps,
146
- pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
147
- IPF_vec_output_s, dstStride, 1);
148
+ pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
149
+ IPF_vec_output_s, dstStride, 1);
150
}
151
152
if (opt.pu[value].luma_vsp)
153
{
154
printf("luma_vsp[%s]\t", lumaPartStr[value]);
155
REPORT_SPEEDUP(opt.pu[value].luma_vsp, ref.pu[value].luma_vsp,
156
- short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
157
- IPF_vec_output_p, dstStride, 1);
158
+ short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
159
+ IPF_vec_output_p, dstStride, 1);
160
}
161
162
if (opt.pu[value].luma_vss)
163
{
164
printf("luma_vss[%s]\t", lumaPartStr[value]);
165
REPORT_SPEEDUP(opt.pu[value].luma_vss, ref.pu[value].luma_vss,
166
- short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
167
- IPF_vec_output_s, dstStride, 1);
168
+ short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
169
+ IPF_vec_output_s, dstStride, 1);
170
}
171
172
if (opt.pu[value].luma_hvpp)
173
{
174
printf("luma_hv [%s]\t", lumaPartStr[value]);
175
REPORT_SPEEDUP(opt.pu[value].luma_hvpp, ref.pu[value].luma_hvpp,
176
- pixel_buff + 3 * srcStride, srcStride, IPF_vec_output_p, srcStride, 1, 3);
177
+ pixel_buff + 3 * srcStride, srcStride, IPF_vec_output_p, srcStride, 1, 3);
178
}
179
180
- if (opt.pu[value].convert_p2s)
181
+ if (opt.pu[value].convert_p2s[NONALIGNED])
182
{
183
printf("convert_p2s[%s]\t", lumaPartStr[value]);
184
- REPORT_SPEEDUP(opt.pu[value].convert_p2s, ref.pu[value].convert_p2s,
185
- pixel_buff, srcStride,
186
- IPF_vec_output_s, dstStride);
187
+ REPORT_SPEEDUP(opt.pu[value].convert_p2s[NONALIGNED], ref.pu[value].convert_p2s[NONALIGNED],
188
+ pixel_buff, srcStride,
189
+ IPF_vec_output_s, dstStride);
190
+ }
191
+
192
+ if (opt.pu[value].convert_p2s[ALIGNED])
193
+ {
194
+ printf("convert_p2s_aligned[%s]\t", lumaPartStr[value]);
195
+ REPORT_SPEEDUP(opt.pu[value].convert_p2s[ALIGNED], ref.pu[value].convert_p2s[ALIGNED],
196
+ pixel_buff, srcStride,
197
+ IPF_vec_output_s, dstStride);
198
}
199
}
200
201
202
{
203
printf("chroma_hpp[%s]", chromaPartStr[csp][value]);
204
REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_hpp, ref.chroma[csp].pu[value].filter_hpp,
205
- pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
206
+ pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
207
}
208
if (opt.chroma[csp].pu[value].filter_hps)
209
{
210
printf("chroma_hps[%s]", chromaPartStr[csp][value]);
211
REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_hps, ref.chroma[csp].pu[value].filter_hps,
212
- pixel_buff + srcStride, srcStride, IPF_vec_output_s, dstStride, 1, 1);
213
+ pixel_buff + srcStride, srcStride, IPF_vec_output_s, dstStride, 1, 1);
214
}
215
if (opt.chroma[csp].pu[value].filter_vpp)
216
{
217
printf("chroma_vpp[%s]", chromaPartStr[csp][value]);
218
REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_vpp, ref.chroma[csp].pu[value].filter_vpp,
219
- pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
220
- IPF_vec_output_p, dstStride, 1);
221
+ pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
222
+ IPF_vec_output_p, dstStride, 1);
223
}
224
if (opt.chroma[csp].pu[value].filter_vps)
225
{
226
printf("chroma_vps[%s]", chromaPartStr[csp][value]);
227
REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_vps, ref.chroma[csp].pu[value].filter_vps,
228
- pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
229
- IPF_vec_output_s, dstStride, 1);
230
+ pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
231
+ IPF_vec_output_s, dstStride, 1);
232
}
233
if (opt.chroma[csp].pu[value].filter_vsp)
234
{
235
printf("chroma_vsp[%s]", chromaPartStr[csp][value]);
236
REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_vsp, ref.chroma[csp].pu[value].filter_vsp,
237
- short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
238
- IPF_vec_output_p, dstStride, 1);
239
+ short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
240
+ IPF_vec_output_p, dstStride, 1);
241
}
242
if (opt.chroma[csp].pu[value].filter_vss)
243
{
244
printf("chroma_vss[%s]", chromaPartStr[csp][value]);
245
REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_vss, ref.chroma[csp].pu[value].filter_vss,
246
- short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
247
- IPF_vec_output_s, dstStride, 1);
248
+ short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
249
+ IPF_vec_output_s, dstStride, 1);
250
}
251
- if (opt.chroma[csp].pu[value].p2s)
252
+ if (opt.chroma[csp].pu[value].p2s[NONALIGNED])
253
{
254
printf("chroma_p2s[%s]\t", chromaPartStr[csp][value]);
255
- REPORT_SPEEDUP(opt.chroma[csp].pu[value].p2s, ref.chroma[csp].pu[value].p2s,
256
- pixel_buff, srcStride, IPF_vec_output_s, dstStride);
257
+ REPORT_SPEEDUP(opt.chroma[csp].pu[value].p2s[NONALIGNED], ref.chroma[csp].pu[value].p2s[NONALIGNED],
258
+ pixel_buff, srcStride, IPF_vec_output_s, dstStride);
259
+ }
260
+ if (opt.chroma[csp].pu[value].p2s[ALIGNED])
261
+ {
262
+ printf("chroma_p2s_aligned[%s]\t", chromaPartStr[csp][value]);
263
+ REPORT_SPEEDUP(opt.chroma[csp].pu[value].p2s[ALIGNED], ref.chroma[csp].pu[value].p2s[ALIGNED],
264
+ pixel_buff, srcStride, IPF_vec_output_s, dstStride);
265
}
266
}
267
}
268
x265_2.7.tar.gz/source/test/ipfilterharness.h -> x265_2.9.tar.gz/source/test/ipfilterharness.h
Changed
35
1
2
enum { TEST_CASES = 3 };
3
enum { SMAX = 1 << 12 };
4
enum { SMIN = (unsigned)-1 << 12 };
5
- ALIGN_VAR_32(pixel, pixel_buff[TEST_BUF_SIZE]);
6
- int16_t short_buff[TEST_BUF_SIZE];
7
- int16_t IPF_vec_output_s[TEST_BUF_SIZE];
8
- int16_t IPF_C_output_s[TEST_BUF_SIZE];
9
- pixel IPF_vec_output_p[TEST_BUF_SIZE];
10
- pixel IPF_C_output_p[TEST_BUF_SIZE];
11
+ ALIGN_VAR_64(pixel, pixel_buff[TEST_BUF_SIZE]);
12
+ ALIGN_VAR_64(int16_t, short_buff[TEST_BUF_SIZE]);
13
+ ALIGN_VAR_64(int16_t, IPF_vec_output_s[TEST_BUF_SIZE]);
14
+ ALIGN_VAR_64(int16_t, IPF_C_output_s[TEST_BUF_SIZE]);
15
+ ALIGN_VAR_64(pixel, IPF_vec_output_p[TEST_BUF_SIZE]);
16
+ ALIGN_VAR_64(pixel, IPF_C_output_p[TEST_BUF_SIZE]);
17
18
- pixel pixel_test_buff[TEST_CASES][TEST_BUF_SIZE];
19
- int16_t short_test_buff[TEST_CASES][TEST_BUF_SIZE];
20
+ ALIGN_VAR_64(pixel, pixel_test_buff[TEST_CASES][TEST_BUF_SIZE]);
21
+ ALIGN_VAR_64(int16_t, short_test_buff[TEST_CASES][TEST_BUF_SIZE]);
22
23
bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt);
24
bool check_IPFilterChroma_ps_primitive(filter_ps_t ref, filter_ps_t opt);
25
26
bool check_IPFilterLuma_ss_primitive(filter_ss_t ref, filter_ss_t opt);
27
bool check_IPFilterLumaHV_primitive(filter_hv_pp_t ref, filter_hv_pp_t opt);
28
bool check_IPFilterLumaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt);
29
+ bool check_IPFilterLumaP2S_aligned_primitive(filter_p2s_t ref, filter_p2s_t opt);
30
bool check_IPFilterChromaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt);
31
+ bool check_IPFilterChromaP2S_aligned_primitive(filter_p2s_t ref, filter_p2s_t opt);
32
33
public:
34
35
x265_2.7.tar.gz/source/test/mbdstharness.cpp -> x265_2.9.tar.gz/source/test/mbdstharness.cpp
Changed
256
1
2
for (int i = 0; i < TEST_BUF_SIZE; i++)
3
{
4
short_test_buff[0][i] = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
5
+ short_test_buff1[0][i] = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
6
int_test_buff[0][i] = rand() % PIXEL_MAX;
7
int_idct_test_buff[0][i] = (rand() % (SHORT_MAX - SHORT_MIN)) - SHORT_MAX;
8
short_denoise_test_buff1[0][i] = short_denoise_test_buff2[0][i] = (rand() & SHORT_MAX) - (rand() & SHORT_MAX);
9
-
10
short_test_buff[1][i] = -PIXEL_MAX;
11
+ short_test_buff1[1][i] = -PIXEL_MAX;
12
int_test_buff[1][i] = -PIXEL_MAX;
13
int_idct_test_buff[1][i] = SHORT_MIN;
14
short_denoise_test_buff1[1][i] = short_denoise_test_buff2[1][i] = -SHORT_MAX;
15
-
16
short_test_buff[2][i] = PIXEL_MAX;
17
+ short_test_buff1[2][i] = PIXEL_MAX;
18
int_test_buff[2][i] = PIXEL_MAX;
19
int_idct_test_buff[2][i] = SHORT_MAX;
20
short_denoise_test_buff1[2][i] = short_denoise_test_buff2[2][i] = SHORT_MAX;
21
22
bool MBDstHarness::check_nquant_primitive(nquant_t ref, nquant_t opt)
23
{
24
int j = 0;
25
-
26
for (int i = 0; i < ITERS; i++)
27
{
28
- int width = (rand() % 4 + 1) * 4;
29
+ int width = 1 << (rand() % 4 + 2);
30
int height = width;
31
-
32
uint32_t optReturnValue = 0;
33
uint32_t refReturnValue = 0;
34
35
36
reportfail();
37
j += INCR;
38
}
39
+ return true;
40
+}
41
+
42
+bool MBDstHarness::check_nonPsyRdoQuant_primitive(nonPsyRdoQuant_t ref, nonPsyRdoQuant_t opt)
43
+{
44
+ int j = 0;
45
+ int trSize[4] = { 16, 64, 256, 1024 };
46
+
47
+ ALIGN_VAR_32(int64_t, ref_dest[4 * MAX_TU_SIZE]);
48
+ ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
49
+
50
+ for (int i = 0; i < ITERS; i++)
51
+ {
52
+ int64_t totalRdCostRef = rand();
53
+ int64_t totalUncodedCostRef = rand();
54
+ int64_t totalRdCostOpt = totalRdCostRef;
55
+ int64_t totalUncodedCostOpt = totalUncodedCostRef;
56
+
57
+ int index = rand() % 4;
58
+ uint32_t blkPos = trSize[index];
59
+ int cmp_size = 4 * MAX_TU_SIZE;
60
+
61
+ memset(ref_dest, 0, MAX_TU_SIZE * sizeof(int64_t));
62
+ memset(opt_dest, 0, MAX_TU_SIZE * sizeof(int64_t));
63
+
64
+ int index1 = rand() % TEST_CASES;
65
+
66
+ ref(short_test_buff[index1] + j, ref_dest, &totalUncodedCostRef, &totalRdCostRef, blkPos);
67
+ checked(opt, short_test_buff[index1] + j, opt_dest, &totalUncodedCostOpt, &totalRdCostOpt, blkPos);
68
+
69
+ if (memcmp(ref_dest, opt_dest, cmp_size))
70
+ return false;
71
+
72
+ if (totalUncodedCostRef != totalUncodedCostOpt)
73
+ return false;
74
+
75
+ if (totalRdCostRef != totalRdCostOpt)
76
+ return false;
77
+
78
+ reportfail();
79
+ j += INCR;
80
+ }
81
+
82
+ return true;
83
+}
84
+bool MBDstHarness::check_psyRdoQuant_primitive(psyRdoQuant_t ref, psyRdoQuant_t opt)
85
+{
86
+ int j = 0;
87
+ int trSize[4] = { 16, 64, 256, 1024 };
88
+
89
+ ALIGN_VAR_32(int64_t, ref_dest[4 * MAX_TU_SIZE]);
90
+ ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
91
+
92
+ for (int i = 0; i < ITERS; i++)
93
+ {
94
+ int64_t totalRdCostRef = rand();
95
+ int64_t totalUncodedCostRef = rand();
96
+ int64_t totalRdCostOpt = totalRdCostRef;
97
+ int64_t totalUncodedCostOpt = totalUncodedCostRef;
98
+ int64_t *psyScale = X265_MALLOC(int64_t, 1);
99
+ *psyScale = rand();
100
+
101
+ int index = rand() % 4;
102
+ uint32_t blkPos = trSize[index];
103
+ int cmp_size = 4 * MAX_TU_SIZE;
104
+
105
+ memset(ref_dest, 0, MAX_TU_SIZE * sizeof(int64_t));
106
+ memset(opt_dest, 0, MAX_TU_SIZE * sizeof(int64_t));
107
+
108
+ int index1 = rand() % TEST_CASES;
109
+
110
+ ref(short_test_buff[index1] + j, short_test_buff1[index1] + j, ref_dest, &totalUncodedCostRef, &totalRdCostRef, psyScale, blkPos);
111
+ checked(opt, short_test_buff[index1] + j, short_test_buff1[index1] + j, opt_dest, &totalUncodedCostOpt, &totalRdCostOpt, psyScale, blkPos);
112
+
113
+ X265_FREE(psyScale);
114
+ if (memcmp(ref_dest, opt_dest, cmp_size))
115
+ return false;
116
+
117
+ if (totalUncodedCostRef != totalUncodedCostOpt)
118
+ return false;
119
+
120
+ if (totalRdCostRef != totalRdCostOpt)
121
+ return false;
122
+
123
+ reportfail();
124
+ j += INCR;
125
+ }
126
+
127
+ return true;
128
+}
129
+bool MBDstHarness::check_psyRdoQuant_primitive_avx2(psyRdoQuant_t1 ref, psyRdoQuant_t1 opt)
130
+{
131
+ int j = 0;
132
+ int trSize[4] = { 16, 64, 256, 1024 };
133
+
134
+ ALIGN_VAR_32(int64_t, ref_dest[4 * MAX_TU_SIZE]);
135
+ ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
136
+
137
+ for (int i = 0; i < ITERS; i++)
138
+ {
139
+ int64_t totalRdCostRef = rand();
140
+ int64_t totalUncodedCostRef = rand();
141
+ int64_t totalRdCostOpt = totalRdCostRef;
142
+ int64_t totalUncodedCostOpt = totalUncodedCostRef;
143
+
144
+ int index = rand() % 4;
145
+ uint32_t blkPos = trSize[index];
146
+ int cmp_size = 4 * MAX_TU_SIZE;
147
+
148
+ memset(ref_dest, 0, MAX_TU_SIZE * sizeof(int64_t));
149
+ memset(opt_dest, 0, MAX_TU_SIZE * sizeof(int64_t));
150
+
151
+ int index1 = rand() % TEST_CASES;
152
+
153
+ ref(short_test_buff[index1] + j, ref_dest, &totalUncodedCostRef, &totalRdCostRef, blkPos);
154
+ checked(opt, short_test_buff[index1] + j, opt_dest, &totalUncodedCostOpt, &totalRdCostOpt, blkPos);
155
+
156
+
157
+ if (memcmp(ref_dest, opt_dest, cmp_size))
158
+ return false;
159
+
160
+ if (totalUncodedCostRef != totalUncodedCostOpt)
161
+ return false;
162
+
163
+ if (totalRdCostRef != totalRdCostOpt)
164
+ return false;
165
+
166
+ reportfail();
167
+ j += INCR;
168
+ }
169
170
return true;
171
}
172
173
return false;
174
}
175
}
176
+
177
+ for (int i = 0; i < NUM_TR_SIZE; i++)
178
+ {
179
+ if (opt.cu[i].nonPsyRdoQuant)
180
+ {
181
+ if (!check_nonPsyRdoQuant_primitive(ref.cu[i].nonPsyRdoQuant, opt.cu[i].nonPsyRdoQuant))
182
+ {
183
+ printf("nonPsyRdoQuant[%dx%d]: Failed!\n", 4 << i, 4 << i);
184
+ return false;
185
+ }
186
+ }
187
+ }
188
+ for (int i = 0; i < NUM_TR_SIZE; i++)
189
+ {
190
+ if (opt.cu[i].psyRdoQuant)
191
+ {
192
+ if (!check_psyRdoQuant_primitive(ref.cu[i].psyRdoQuant, opt.cu[i].psyRdoQuant))
193
+ {
194
+ printf("psyRdoQuant[%dx%d]: Failed!\n", 4 << i, 4 << i);
195
+ return false;
196
+ }
197
+ }
198
+ }
199
+ for (int i = 0; i < NUM_TR_SIZE; i++)
200
+ {
201
+ if (opt.cu[i].psyRdoQuant_1p)
202
+ {
203
+ if (!check_psyRdoQuant_primitive_avx2(ref.cu[i].psyRdoQuant_1p, opt.cu[i].psyRdoQuant_1p))
204
+ {
205
+ printf("psyRdoQuant_1p[%dx%d]: Failed!\n", 4 << i, 4 << i);
206
+ return false;
207
+ }
208
+ }
209
+ }
210
for (int i = 0; i < NUM_TR_SIZE; i++)
211
{
212
if (opt.cu[i].count_nonzero)
213
214
printf("nquant\t\t");
215
REPORT_SPEEDUP(opt.nquant, ref.nquant, short_test_buff[0], int_test_buff[1], mshortbuf2, 23, 23785, 32 * 32);
216
}
217
+
218
+ for (int value = 0; value < NUM_TR_SIZE; value++)
219
+ {
220
+ if (opt.cu[value].nonPsyRdoQuant)
221
+ {
222
+ ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
223
+ int64_t totalRdCost = 0;
224
+ int64_t totalUncodedCost = 0;
225
+ printf("nonPsyRdoQuant[%dx%d]", 4 << value, 4 << value);
226
+ REPORT_SPEEDUP(opt.cu[value].nonPsyRdoQuant, ref.cu[value].nonPsyRdoQuant, short_test_buff[0], opt_dest, &totalUncodedCost, &totalRdCost, 0);
227
+ }
228
+ }
229
+ for (int value = 0; value < NUM_TR_SIZE; value++)
230
+ {
231
+ if (opt.cu[value].psyRdoQuant)
232
+ {
233
+ ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
234
+ int64_t totalRdCost = 0;
235
+ int64_t totalUncodedCost = 0;
236
+ int64_t *psyScale = X265_MALLOC(int64_t, 1);
237
+ *psyScale = 0;
238
+ printf("psyRdoQuant[%dx%d]", 4 << value, 4 << value);
239
+ REPORT_SPEEDUP(opt.cu[value].psyRdoQuant, ref.cu[value].psyRdoQuant, short_test_buff[0], short_test_buff1[0], opt_dest, &totalUncodedCost, &totalRdCost, psyScale, 0);
240
+ }
241
+ }
242
+ for (int value = 0; value < NUM_TR_SIZE; value++)
243
+ {
244
+ if (opt.cu[value].psyRdoQuant_1p)
245
+ {
246
+ ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
247
+ int64_t totalRdCost = 0;
248
+ int64_t totalUncodedCost = 0;
249
+ printf("psyRdoQuant_1p[%dx%d]", 4 << value, 4 << value);
250
+ REPORT_SPEEDUP(opt.cu[value].psyRdoQuant_1p, ref.cu[value].psyRdoQuant_1p, short_test_buff[0], opt_dest, &totalUncodedCost, &totalRdCost, 0);
251
+ }
252
+ }
253
for (int value = 0; value < NUM_TR_SIZE; value++)
254
{
255
if (opt.cu[value].count_nonzero)
256
x265_2.7.tar.gz/source/test/mbdstharness.h -> x265_2.9.tar.gz/source/test/mbdstharness.h
Changed
32
1
2
int mintbuf2[MAX_TU_SIZE];
3
int mintbuf3[MAX_TU_SIZE];
4
int mintbuf4[MAX_TU_SIZE];
5
-
6
int16_t short_test_buff[TEST_CASES][TEST_BUF_SIZE];
7
+ int16_t short_test_buff1[TEST_CASES][TEST_BUF_SIZE];
8
int int_test_buff[TEST_CASES][TEST_BUF_SIZE];
9
int int_idct_test_buff[TEST_CASES][TEST_BUF_SIZE];
10
-
11
uint32_t mubuf1[MAX_TU_SIZE];
12
uint32_t mubuf2[MAX_TU_SIZE];
13
uint16_t mushortbuf1[MAX_TU_SIZE];
14
15
int16_t short_denoise_test_buff1[TEST_CASES][TEST_BUF_SIZE];
16
int16_t short_denoise_test_buff2[TEST_CASES][TEST_BUF_SIZE];
17
-
18
bool check_dequant_primitive(dequant_scaling_t ref, dequant_scaling_t opt);
19
bool check_dequant_primitive(dequant_normal_t ref, dequant_normal_t opt);
20
+ bool check_nonPsyRdoQuant_primitive(nonPsyRdoQuant_t ref, nonPsyRdoQuant_t opt);
21
+ bool check_psyRdoQuant_primitive(psyRdoQuant_t ref, psyRdoQuant_t opt);
22
bool check_quant_primitive(quant_t ref, quant_t opt);
23
bool check_nquant_primitive(nquant_t ref, nquant_t opt);
24
bool check_dct_primitive(dct_t ref, dct_t opt, intptr_t width);
25
bool check_idct_primitive(idct_t ref, idct_t opt, intptr_t width);
26
bool check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt);
27
bool check_denoise_dct_primitive(denoiseDct_t ref, denoiseDct_t opt);
28
+ bool check_psyRdoQuant_primitive_avx2(psyRdoQuant_t1 ref, psyRdoQuant_t1 opt);
29
30
public:
31
32
x265_2.7.tar.gz/source/test/pixelharness.cpp -> x265_2.9.tar.gz/source/test/pixelharness.cpp
Changed
775
1
2
return true;
3
}
4
5
+bool PixelHarness::check_calresidual_aligned(calcresidual_t ref, calcresidual_t opt)
6
+{
7
+ ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
8
+ ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
9
+ memset(ref_dest, 0, 64 * 64 * sizeof(int16_t));
10
+ memset(opt_dest, 0, 64 * 64 * sizeof(int16_t));
11
+
12
+ int j = 0;
13
+ intptr_t stride = STRIDE;
14
+ for (int i = 0; i < ITERS; i++)
15
+ {
16
+ int index = i % TEST_CASES;
17
+ checked(opt, pbuf1 + j, pixel_test_buff[index] + j, opt_dest, stride);
18
+ ref(pbuf1 + j, pixel_test_buff[index] + j, ref_dest, stride);
19
+
20
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
21
+ return false;
22
+
23
+ reportfail();
24
+ j += INCR;
25
+ }
26
+
27
+ return true;
28
+}
29
+
30
bool PixelHarness::check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt)
31
{
32
int j = 0;
33
34
reportfail();
35
j += INCR;
36
}
37
-
38
return true;
39
}
40
+bool PixelHarness::check_ssd_s_aligned(pixel_ssd_s_t ref, pixel_ssd_s_t opt)
41
+{
42
+ int j = 0;
43
+ for (int i = 0; i < ITERS; i++)
44
+ {
45
+ // NOTE: stride must be multiple of 16, because minimum block is 4x4
46
+ int stride = STRIDE;
47
+ sse_t cres = ref(sbuf1 + j, stride);
48
+ sse_t vres = (sse_t)checked(opt, sbuf1 + j, (intptr_t)stride);
49
+
50
+ if (cres != vres)
51
+ return false;
52
+
53
+ reportfail();
54
+ j += INCR+32;
55
+ }
56
57
+ return true;
58
+}
59
bool PixelHarness::check_weightp(weightp_sp_t ref, weightp_sp_t opt)
60
{
61
ALIGN_VAR_16(pixel, ref_dest[64 * (64 + 1)]);
62
63
memset(ref_dest, 0, 64 * 64 * sizeof(pixel));
64
memset(opt_dest, 0, 64 * 64 * sizeof(pixel));
65
int j = 0;
66
+ bool enableavx512 = true;
67
int width = 16 * (rand() % 4 + 1);
68
+ int cpuid = X265_NS::cpu_detect(enableavx512);
69
+ if (cpuid & X265_CPU_AVX512)
70
+ width = 32 * (rand() % 2 + 1);
71
int height = 8;
72
int w0 = rand() % 128;
73
int shift = rand() % 8; // maximum is 7, see setFromWeightAndOffset()
74
75
76
return true;
77
}
78
-
79
bool PixelHarness::check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt)
80
{
81
- ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
82
- ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
83
-
84
+ ALIGN_VAR_64(int16_t, ref_dest[64 * 64]);
85
+ ALIGN_VAR_64(int16_t, opt_dest[64 * 64]);
86
memset(ref_dest, 0xCD, sizeof(ref_dest));
87
memset(opt_dest, 0xCD, sizeof(opt_dest));
88
89
90
91
return true;
92
}
93
+bool PixelHarness::check_cpy1Dto2D_shl_aligned_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt)
94
+{
95
+ ALIGN_VAR_64(int16_t, ref_dest[64 * 64]);
96
+ ALIGN_VAR_64(int16_t, opt_dest[64 * 64]);
97
+
98
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
99
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
100
+
101
+ int j = 0;
102
+ intptr_t stride = STRIDE;
103
+ for (int i = 0; i < ITERS; i++)
104
+ {
105
+ int shift = (rand() % 7 + 1);
106
+
107
+ int index = i % TEST_CASES;
108
+ checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
109
+ ref(ref_dest, short_test_buff[index] + j, stride, shift);
110
+
111
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
112
+ return false;
113
+
114
+ reportfail();
115
+ j += INCR + 32;
116
+ }
117
+
118
+ return true;
119
+}
120
121
bool PixelHarness::check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt)
122
{
123
124
125
return true;
126
}
127
-
128
bool PixelHarness::check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt)
129
{
130
- ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
131
- ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
132
+ ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
133
+ ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
134
+ int j = 0;
135
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
136
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
137
+
138
+ intptr_t stride = STRIDE;
139
+ for (int i = 0; i < ITERS; i++)
140
+ {
141
+ int index1 = rand() % TEST_CASES;
142
+ int index2 = rand() % TEST_CASES;
143
+ checked(ref, ref_dest, stride, pixel_test_buff[index1] + j,
144
+ stride, pixel_test_buff[index2] + j, stride, 32);
145
+ opt(opt_dest, stride, pixel_test_buff[index1] + j,
146
+ stride, pixel_test_buff[index2] + j, stride, 32);
147
+
148
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
149
+ return false;
150
+
151
+ reportfail();
152
+ j += INCR;
153
+ }
154
+
155
+ return true;
156
+}
157
+bool PixelHarness::check_pixelavg_pp_aligned(pixelavg_pp_t ref, pixelavg_pp_t opt)
158
+{
159
+ ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
160
+ ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
161
162
int j = 0;
163
164
165
return false;
166
167
reportfail();
168
- j += INCR;
169
+ j += INCR + 32;
170
}
171
172
return true;
173
174
175
bool PixelHarness::check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt)
176
{
177
- ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
178
- ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
179
+ ALIGN_VAR_64(int16_t, ref_dest[64 * 64]);
180
+ ALIGN_VAR_64(int16_t, opt_dest[64 * 64]);
181
+
182
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
183
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
184
+
185
+ intptr_t stride = 64;
186
+ for (int i = 0; i < ITERS; i++)
187
+ {
188
+ int16_t value = (rand() % SHORT_MAX) + 1;
189
+
190
+ checked(opt, opt_dest, stride, value);
191
+ ref(ref_dest, stride, value);
192
+
193
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
194
+ return false;
195
+
196
+ reportfail();
197
+ }
198
+
199
+ return true;
200
+}
201
+
202
+bool PixelHarness::check_blockfill_s_aligned(blockfill_s_t ref, blockfill_s_t opt)
203
+{
204
+ ALIGN_VAR_64(int16_t, ref_dest[64 * 64]);
205
+ ALIGN_VAR_64(int16_t, opt_dest[64 * 64]);
206
207
memset(ref_dest, 0xCD, sizeof(ref_dest));
208
memset(opt_dest, 0xCD, sizeof(opt_dest));
209
210
211
bool PixelHarness::check_scale1D_pp(scale1D_t ref, scale1D_t opt)
212
{
213
- ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
214
- ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
215
+ ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
216
+ ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
217
218
memset(ref_dest, 0, sizeof(ref_dest));
219
memset(opt_dest, 0, sizeof(opt_dest));
220
221
return true;
222
}
223
224
+bool PixelHarness::check_scale1D_pp_aligned(scale1D_t ref, scale1D_t opt)
225
+{
226
+ ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
227
+ ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
228
+
229
+ memset(ref_dest, 0, sizeof(ref_dest));
230
+ memset(opt_dest, 0, sizeof(opt_dest));
231
+
232
+ int j = 0;
233
+ for (int i = 0; i < ITERS; i++)
234
+ {
235
+ int index = i % TEST_CASES;
236
+ checked(opt, opt_dest, pixel_test_buff[index] + j);
237
+ ref(ref_dest, pixel_test_buff[index] + j);
238
+
239
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
240
+ return false;
241
+
242
+ reportfail();
243
+ j += INCR * 2;
244
+ }
245
+
246
+ return true;
247
+}
248
+
249
bool PixelHarness::check_scale2D_pp(scale2D_t ref, scale2D_t opt)
250
{
251
ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
252
253
return true;
254
}
255
256
+bool PixelHarness::check_pixel_add_ps_aligned(pixel_add_ps_t ref, pixel_add_ps_t opt)
257
+{
258
+ ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
259
+ ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
260
+
261
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
262
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
263
+
264
+ int j = 0;
265
+ intptr_t stride2 = 64, stride = STRIDE;
266
+ for (int i = 0; i < ITERS; i++)
267
+ {
268
+ int index1 = rand() % TEST_CASES;
269
+ int index2 = rand() % TEST_CASES;
270
+ checked(opt, opt_dest, stride2, pixel_test_buff[index1] + j, short_test_buff[index2] + j, stride, stride);
271
+ ref(ref_dest, stride2, pixel_test_buff[index1] + j, short_test_buff[index2] + j, stride, stride);
272
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
273
+ return false;
274
+
275
+ reportfail();
276
+ j += 2 * INCR;
277
+ }
278
+ return true;
279
+}
280
+
281
bool PixelHarness::check_pixel_var(var_t ref, var_t opt)
282
{
283
int j = 0;
284
285
286
bool PixelHarness::check_addAvg(addAvg_t ref, addAvg_t opt)
287
{
288
- ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
289
- ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
290
+ ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
291
+ ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
292
293
int j = 0;
294
295
296
return true;
297
}
298
299
+bool PixelHarness::check_addAvg_aligned(addAvg_t ref, addAvg_t opt)
300
+{
301
+ ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
302
+ ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
303
+
304
+ int j = 0;
305
+
306
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
307
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
308
+ intptr_t stride = STRIDE;
309
+
310
+ for (int i = 0; i < ITERS; i++)
311
+ {
312
+ int index1 = rand() % TEST_CASES;
313
+ int index2 = rand() % TEST_CASES;
314
+ ref(short_test_buff2[index1] + j, short_test_buff2[index2] + j, ref_dest, stride, stride, stride);
315
+ checked(opt, short_test_buff2[index1] + j, short_test_buff2[index2] + j, opt_dest, stride, stride, stride);
316
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
317
+ return false;
318
+
319
+ reportfail();
320
+ j += INCR * 2;
321
+ }
322
+
323
+ return true;
324
+}
325
bool PixelHarness::check_calSign(sign_t ref, sign_t opt)
326
{
327
ALIGN_VAR_16(int8_t, ref_dest[64 * 2]);
328
329
return false;
330
}
331
}
332
-
333
- if (opt.pu[part].pixelavg_pp)
334
+ if (opt.pu[part].pixelavg_pp[NONALIGNED])
335
{
336
- if (!check_pixelavg_pp(ref.pu[part].pixelavg_pp, opt.pu[part].pixelavg_pp))
337
+ if (!check_pixelavg_pp(ref.pu[part].pixelavg_pp[NONALIGNED], opt.pu[part].pixelavg_pp[NONALIGNED]))
338
{
339
printf("pixelavg_pp[%s]: failed!\n", lumaPartStr[part]);
340
return false;
341
}
342
}
343
+ if (opt.pu[part].pixelavg_pp[ALIGNED])
344
+ {
345
+ if (!check_pixelavg_pp_aligned(ref.pu[part].pixelavg_pp[ALIGNED], opt.pu[part].pixelavg_pp[ALIGNED]))
346
+ {
347
+ printf("pixelavg_pp_aligned[%s]: failed!\n", lumaPartStr[part]);
348
+ return false;
349
+ }
350
+ }
351
352
if (opt.pu[part].copy_pp)
353
{
354
355
}
356
}
357
358
- if (opt.pu[part].addAvg)
359
+ if (opt.pu[part].addAvg[NONALIGNED])
360
{
361
- if (!check_addAvg(ref.pu[part].addAvg, opt.pu[part].addAvg))
362
+ if (!check_addAvg(ref.pu[part].addAvg[NONALIGNED], opt.pu[part].addAvg[NONALIGNED]))
363
{
364
printf("addAvg[%s] failed\n", lumaPartStr[part]);
365
return false;
366
}
367
}
368
369
+ if (opt.pu[part].addAvg[ALIGNED])
370
+ {
371
+ if (!check_addAvg_aligned(ref.pu[part].addAvg[ALIGNED], opt.pu[part].addAvg[ALIGNED]))
372
+ {
373
+ printf("addAvg_aligned[%s] failed\n", lumaPartStr[part]);
374
+ return false;
375
+ }
376
+ }
377
+
378
if (part < NUM_CU_SIZES)
379
{
380
if (opt.cu[part].sse_pp)
381
382
}
383
}
384
385
- if (opt.cu[part].add_ps)
386
+ if (opt.cu[part].add_ps[NONALIGNED])
387
{
388
- if (!check_pixel_add_ps(ref.cu[part].add_ps, opt.cu[part].add_ps))
389
+ if (!check_pixel_add_ps(ref.cu[part].add_ps[NONALIGNED], opt.cu[part].add_ps[NONALIGNED]))
390
{
391
printf("add_ps[%s] failed\n", lumaPartStr[part]);
392
return false;
393
}
394
}
395
396
+ if (opt.cu[part].add_ps[ALIGNED])
397
+ {
398
+ if (!check_pixel_add_ps_aligned(ref.cu[part].add_ps[ALIGNED], opt.cu[part].add_ps[ALIGNED]))
399
+ {
400
+ printf("add_ps_aligned[%s] failed\n", lumaPartStr[part]);
401
+ return false;
402
+ }
403
+ }
404
+
405
if (opt.cu[part].copy_ss)
406
{
407
if (!check_copy_ss(ref.cu[part].copy_ss, opt.cu[part].copy_ss))
408
409
return false;
410
}
411
}
412
- if (opt.chroma[i].pu[part].addAvg)
413
+ if (opt.chroma[i].pu[part].addAvg[NONALIGNED])
414
{
415
- if (!check_addAvg(ref.chroma[i].pu[part].addAvg, opt.chroma[i].pu[part].addAvg))
416
+ if (!check_addAvg(ref.chroma[i].pu[part].addAvg[NONALIGNED], opt.chroma[i].pu[part].addAvg[NONALIGNED]))
417
{
418
printf("chroma_addAvg[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
419
return false;
420
}
421
}
422
+ if (opt.chroma[i].pu[part].addAvg[ALIGNED])
423
+ {
424
+ if (!check_addAvg_aligned(ref.chroma[i].pu[part].addAvg[ALIGNED], opt.chroma[i].pu[part].addAvg[ALIGNED]))
425
+ {
426
+ printf("chroma_addAvg_aligned[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
427
+ return false;
428
+ }
429
+ }
430
if (opt.chroma[i].pu[part].satd)
431
{
432
if (!check_pixelcmp(ref.chroma[i].pu[part].satd, opt.chroma[i].pu[part].satd))
433
434
return false;
435
}
436
}
437
- if (opt.chroma[i].cu[part].add_ps)
438
+ if (opt.chroma[i].cu[part].add_ps[NONALIGNED])
439
{
440
- if (!check_pixel_add_ps(ref.chroma[i].cu[part].add_ps, opt.chroma[i].cu[part].add_ps))
441
+ if (!check_pixel_add_ps(ref.chroma[i].cu[part].add_ps[NONALIGNED], opt.chroma[i].cu[part].add_ps[NONALIGNED]))
442
{
443
printf("chroma_add_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
444
return false;
445
}
446
}
447
+ if (opt.chroma[i].cu[part].add_ps[ALIGNED])
448
+ {
449
+ if (!check_pixel_add_ps_aligned(ref.chroma[i].cu[part].add_ps[ALIGNED], opt.chroma[i].cu[part].add_ps[ALIGNED]))
450
+ {
451
+ printf("chroma_add_ps_aligned[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
452
+ return false;
453
+ }
454
+ }
455
if (opt.chroma[i].cu[part].copy_sp)
456
{
457
if (!check_copy_sp(ref.chroma[i].cu[part].copy_sp, opt.chroma[i].cu[part].copy_sp))
458
459
}
460
}
461
462
- if (opt.cu[i].blockfill_s)
463
+ if (opt.cu[i].blockfill_s[NONALIGNED])
464
{
465
- if (!check_blockfill_s(ref.cu[i].blockfill_s, opt.cu[i].blockfill_s))
466
+ if (!check_blockfill_s(ref.cu[i].blockfill_s[NONALIGNED], opt.cu[i].blockfill_s[NONALIGNED]))
467
{
468
printf("blockfill_s[%dx%d]: failed!\n", 4 << i, 4 << i);
469
return false;
470
}
471
}
472
473
+ if (opt.cu[i].blockfill_s[ALIGNED])
474
+ {
475
+ if (!check_blockfill_s_aligned(ref.cu[i].blockfill_s[ALIGNED], opt.cu[i].blockfill_s[ALIGNED]))
476
+ {
477
+ printf("blockfill_s_aligned[%dx%d]: failed!\n", 4 << i, 4 << i);
478
+ return false;
479
+ }
480
+ }
481
if (opt.cu[i].var)
482
{
483
if (!check_pixel_var(ref.cu[i].var, opt.cu[i].var))
484
485
{
486
/* TU only primitives */
487
488
- if (opt.cu[i].calcresidual)
489
+ if (opt.cu[i].calcresidual[NONALIGNED])
490
{
491
- if (!check_calresidual(ref.cu[i].calcresidual, opt.cu[i].calcresidual))
492
+ if (!check_calresidual(ref.cu[i].calcresidual[NONALIGNED], opt.cu[i].calcresidual[NONALIGNED]))
493
{
494
printf("calcresidual width: %d failed!\n", 4 << i);
495
return false;
496
}
497
}
498
499
+ if (opt.cu[i].calcresidual[ALIGNED])
500
+ {
501
+ if (!check_calresidual_aligned(ref.cu[i].calcresidual[ALIGNED], opt.cu[i].calcresidual[ALIGNED]))
502
+ {
503
+ printf("calcresidual_aligned width: %d failed!\n", 4 << i);
504
+ return false;
505
+ }
506
+ }
507
+
508
if (opt.cu[i].transpose)
509
{
510
if (!check_transpose(ref.cu[i].transpose, opt.cu[i].transpose))
511
512
return false;
513
}
514
}
515
-
516
- if (opt.cu[i].ssd_s)
517
+ if (opt.cu[i].ssd_s[NONALIGNED])
518
{
519
- if (!check_ssd_s(ref.cu[i].ssd_s, opt.cu[i].ssd_s))
520
+ if (!check_ssd_s(ref.cu[i].ssd_s[NONALIGNED], opt.cu[i].ssd_s[NONALIGNED]))
521
{
522
printf("ssd_s[%dx%d]: failed!\n", 4 << i, 4 << i);
523
return false;
524
}
525
}
526
-
527
+ if (opt.cu[i].ssd_s[ALIGNED])
528
+ {
529
+ if (!check_ssd_s_aligned(ref.cu[i].ssd_s[ALIGNED], opt.cu[i].ssd_s[ALIGNED]))
530
+ {
531
+ printf("ssd_s_aligned[%dx%d]: failed!\n", 4 << i, 4 << i);
532
+ return false;
533
+ }
534
+ }
535
if (opt.cu[i].copy_cnt)
536
{
537
if (!check_copy_cnt_t(ref.cu[i].copy_cnt, opt.cu[i].copy_cnt))
538
539
return false;
540
}
541
}
542
-
543
- if (opt.cu[i].cpy1Dto2D_shl)
544
+ if (opt.cu[i].cpy1Dto2D_shl[NONALIGNED])
545
{
546
- if (!check_cpy1Dto2D_shl_t(ref.cu[i].cpy1Dto2D_shl, opt.cu[i].cpy1Dto2D_shl))
547
+ if (!check_cpy1Dto2D_shl_t(ref.cu[i].cpy1Dto2D_shl[NONALIGNED], opt.cu[i].cpy1Dto2D_shl[NONALIGNED]))
548
{
549
printf("cpy1Dto2D_shl[%dx%d] failed!\n", 4 << i, 4 << i);
550
return false;
551
}
552
}
553
+ if (opt.cu[i].cpy1Dto2D_shl[ALIGNED])
554
+ {
555
+ if (!check_cpy1Dto2D_shl_aligned_t(ref.cu[i].cpy1Dto2D_shl[ALIGNED], opt.cu[i].cpy1Dto2D_shl[ALIGNED]))
556
+ {
557
+ printf("cpy1Dto2D_shl_aligned[%dx%d] failed!\n", 4 << i, 4 << i);
558
+ return false;
559
+ }
560
+ }
561
562
if (opt.cu[i].cpy1Dto2D_shr)
563
{
564
565
}
566
}
567
568
- if (opt.scale1D_128to64)
569
+ if (opt.scale1D_128to64[NONALIGNED])
570
{
571
- if (!check_scale1D_pp(ref.scale1D_128to64, opt.scale1D_128to64))
572
+ if (!check_scale1D_pp(ref.scale1D_128to64[NONALIGNED], opt.scale1D_128to64[NONALIGNED]))
573
{
574
printf("scale1D_128to64 failed!\n");
575
return false;
576
}
577
}
578
579
+ if (opt.scale1D_128to64[ALIGNED])
580
+ {
581
+ if (!check_scale1D_pp_aligned(ref.scale1D_128to64[ALIGNED], opt.scale1D_128to64[ALIGNED]))
582
+ {
583
+ printf("scale1D_128to64_aligned failed!\n");
584
+ return false;
585
+ }
586
+ }
587
+
588
if (opt.scale2D_64to32)
589
{
590
if (!check_scale2D_pp(ref.scale2D_64to32, opt.scale2D_64to32))
591
592
HEADER("satd[%s]", lumaPartStr[part]);
593
REPORT_SPEEDUP(opt.pu[part].satd, ref.pu[part].satd, pbuf1, STRIDE, fref, STRIDE);
594
}
595
-
596
- if (opt.pu[part].pixelavg_pp)
597
+ if (opt.pu[part].pixelavg_pp[NONALIGNED])
598
{
599
HEADER("avg_pp[%s]", lumaPartStr[part]);
600
- REPORT_SPEEDUP(opt.pu[part].pixelavg_pp, ref.pu[part].pixelavg_pp, pbuf1, STRIDE, pbuf2, STRIDE, pbuf3, STRIDE, 32);
601
+ REPORT_SPEEDUP(opt.pu[part].pixelavg_pp[NONALIGNED], ref.pu[part].pixelavg_pp[NONALIGNED], pbuf1, STRIDE, pbuf2, STRIDE, pbuf3, STRIDE, 32);
602
}
603
604
+ if (opt.pu[part].pixelavg_pp[ALIGNED])
605
+ {
606
+ HEADER("avg_pp_aligned[%s]", lumaPartStr[part]);
607
+ REPORT_SPEEDUP(opt.pu[part].pixelavg_pp[ALIGNED], ref.pu[part].pixelavg_pp[ALIGNED], pbuf1, STRIDE, pbuf2, STRIDE, pbuf3, STRIDE, 32);
608
+ }
609
if (opt.pu[part].sad)
610
{
611
HEADER("sad[%s]", lumaPartStr[part]);
612
613
REPORT_SPEEDUP(opt.pu[part].copy_pp, ref.pu[part].copy_pp, pbuf1, 64, pbuf2, 64);
614
}
615
616
- if (opt.pu[part].addAvg)
617
+ if (opt.pu[part].addAvg[NONALIGNED])
618
{
619
HEADER("addAvg[%s]", lumaPartStr[part]);
620
- REPORT_SPEEDUP(opt.pu[part].addAvg, ref.pu[part].addAvg, sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
621
+ REPORT_SPEEDUP(opt.pu[part].addAvg[NONALIGNED], ref.pu[part].addAvg[NONALIGNED], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
622
+ }
623
+ if (opt.pu[part].addAvg[ALIGNED])
624
+ {
625
+ HEADER("addAvg_aligned[%s]", lumaPartStr[part]);
626
+ REPORT_SPEEDUP(opt.pu[part].addAvg[ALIGNED], ref.pu[part].addAvg[ALIGNED], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
627
}
628
629
if (part < NUM_CU_SIZES)
630
631
HEADER("sub_ps[%s]", lumaPartStr[part]);
632
REPORT_SPEEDUP(opt.cu[part].sub_ps, ref.cu[part].sub_ps, (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
633
}
634
- if (opt.cu[part].add_ps)
635
+ if (opt.cu[part].add_ps[NONALIGNED])
636
{
637
HEADER("add_ps[%s]", lumaPartStr[part]);
638
- REPORT_SPEEDUP(opt.cu[part].add_ps, ref.cu[part].add_ps, pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
639
+ REPORT_SPEEDUP(opt.cu[part].add_ps[NONALIGNED], ref.cu[part].add_ps[NONALIGNED], pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
640
+ }
641
+ if (opt.cu[part].add_ps[ALIGNED])
642
+ {
643
+ HEADER("add_ps_aligned[%s]", lumaPartStr[part]);
644
+ REPORT_SPEEDUP(opt.cu[part].add_ps[ALIGNED], ref.cu[part].add_ps[ALIGNED], pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
645
}
646
if (opt.cu[part].copy_ss)
647
{
648
649
HEADER("[%s] copy_pp[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
650
REPORT_SPEEDUP(opt.chroma[i].pu[part].copy_pp, ref.chroma[i].pu[part].copy_pp, pbuf1, 64, pbuf2, 128);
651
}
652
- if (opt.chroma[i].pu[part].addAvg)
653
+ if (opt.chroma[i].pu[part].addAvg[NONALIGNED])
654
{
655
HEADER("[%s] addAvg[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
656
- REPORT_SPEEDUP(opt.chroma[i].pu[part].addAvg, ref.chroma[i].pu[part].addAvg, sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
657
+ REPORT_SPEEDUP(opt.chroma[i].pu[part].addAvg[NONALIGNED], ref.chroma[i].pu[part].addAvg[NONALIGNED], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
658
+ }
659
+ if (opt.chroma[i].pu[part].addAvg[ALIGNED])
660
+ {
661
+ HEADER("[%s] addAvg_aligned[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
662
+ REPORT_SPEEDUP(opt.chroma[i].pu[part].addAvg[ALIGNED], ref.chroma[i].pu[part].addAvg[ALIGNED], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
663
}
664
if (opt.chroma[i].pu[part].satd)
665
{
666
667
HEADER("[%s] sub_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
668
REPORT_SPEEDUP(opt.chroma[i].cu[part].sub_ps, ref.chroma[i].cu[part].sub_ps, (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
669
}
670
- if (opt.chroma[i].cu[part].add_ps)
671
+ if (opt.chroma[i].cu[part].add_ps[NONALIGNED])
672
{
673
HEADER("[%s] add_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
674
- REPORT_SPEEDUP(opt.chroma[i].cu[part].add_ps, ref.chroma[i].cu[part].add_ps, pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
675
+ REPORT_SPEEDUP(opt.chroma[i].cu[part].add_ps[NONALIGNED], ref.chroma[i].cu[part].add_ps[NONALIGNED], pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
676
+ }
677
+ if (opt.chroma[i].cu[part].add_ps[ALIGNED])
678
+ {
679
+ HEADER("[%s] add_ps_aligned[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
680
+ REPORT_SPEEDUP(opt.chroma[i].cu[part].add_ps[ALIGNED], ref.chroma[i].cu[part].add_ps[ALIGNED], pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
681
}
682
if (opt.chroma[i].cu[part].sa8d)
683
{
684
685
measurePartition(part, ref, opt);
686
}
687
}
688
-
689
for (int i = 0; i < NUM_CU_SIZES; i++)
690
{
691
- if ((i <= BLOCK_32x32) && opt.cu[i].ssd_s)
692
+ if ((i <= BLOCK_32x32) && opt.cu[i].ssd_s[NONALIGNED])
693
{
694
HEADER("ssd_s[%dx%d]", 4 << i, 4 << i);
695
- REPORT_SPEEDUP(opt.cu[i].ssd_s, ref.cu[i].ssd_s, sbuf1, STRIDE);
696
+ REPORT_SPEEDUP(opt.cu[i].ssd_s[NONALIGNED], ref.cu[i].ssd_s[NONALIGNED], sbuf1, STRIDE);
697
+ }
698
+ if ((i <= BLOCK_32x32) && opt.cu[i].ssd_s[ALIGNED])
699
+ {
700
+ HEADER("ssd_s_aligned[%dx%d]", 4 << i, 4 << i);
701
+ REPORT_SPEEDUP(opt.cu[i].ssd_s[ALIGNED], ref.cu[i].ssd_s[ALIGNED], sbuf1, STRIDE);
702
}
703
if (opt.cu[i].sa8d)
704
{
705
HEADER("sa8d[%dx%d]", 4 << i, 4 << i);
706
REPORT_SPEEDUP(opt.cu[i].sa8d, ref.cu[i].sa8d, pbuf1, STRIDE, pbuf2, STRIDE);
707
}
708
- if (opt.cu[i].calcresidual)
709
+ if (opt.cu[i].calcresidual[NONALIGNED])
710
{
711
HEADER("residual[%dx%d]", 4 << i, 4 << i);
712
- REPORT_SPEEDUP(opt.cu[i].calcresidual, ref.cu[i].calcresidual, pbuf1, pbuf2, sbuf1, 64);
713
+ REPORT_SPEEDUP(opt.cu[i].calcresidual[NONALIGNED], ref.cu[i].calcresidual[NONALIGNED], pbuf1, pbuf2, sbuf1, 64);
714
}
715
-
716
- if (opt.cu[i].blockfill_s)
717
+ if (opt.cu[i].calcresidual[ALIGNED])
718
+ {
719
+ HEADER("residual_aligned[%dx%d]", 4 << i, 4 << i);
720
+ REPORT_SPEEDUP(opt.cu[i].calcresidual[ALIGNED], ref.cu[i].calcresidual[ALIGNED], pbuf1, pbuf2, sbuf1, 64);
721
+ }
722
+ if (opt.cu[i].blockfill_s[NONALIGNED])
723
{
724
HEADER("blkfill[%dx%d]", 4 << i, 4 << i);
725
- REPORT_SPEEDUP(opt.cu[i].blockfill_s, ref.cu[i].blockfill_s, sbuf1, 64, SHORT_MAX);
726
+ REPORT_SPEEDUP(opt.cu[i].blockfill_s[NONALIGNED], ref.cu[i].blockfill_s[NONALIGNED], sbuf1, 64, SHORT_MAX);
727
+ }
728
+ if (opt.cu[i].blockfill_s[ALIGNED])
729
+ {
730
+ HEADER("blkfill_aligned[%dx%d]", 4 << i, 4 << i);
731
+ REPORT_SPEEDUP(opt.cu[i].blockfill_s[ALIGNED], ref.cu[i].blockfill_s[ALIGNED], sbuf1, 64, SHORT_MAX);
732
}
733
734
if (opt.cu[i].transpose)
735
736
HEADER("cpy2Dto1D_shr[%dx%d]", 4 << i, 4 << i);
737
REPORT_SPEEDUP(opt.cu[i].cpy2Dto1D_shr, ref.cu[i].cpy2Dto1D_shr, sbuf1, sbuf2, STRIDE, 3);
738
}
739
-
740
- if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shl)
741
+ if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shl[NONALIGNED])
742
{
743
HEADER("cpy1Dto2D_shl[%dx%d]", 4 << i, 4 << i);
744
- REPORT_SPEEDUP(opt.cu[i].cpy1Dto2D_shl, ref.cu[i].cpy1Dto2D_shl, sbuf1, sbuf2, STRIDE, 64);
745
+ REPORT_SPEEDUP(opt.cu[i].cpy1Dto2D_shl[NONALIGNED], ref.cu[i].cpy1Dto2D_shl[NONALIGNED], sbuf1, sbuf2, STRIDE, 64);
746
}
747
748
+ if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shl[ALIGNED])
749
+ {
750
+ HEADER("cpy1Dto2D_shl_aligned[%dx%d]", 4 << i, 4 << i);
751
+ REPORT_SPEEDUP(opt.cu[i].cpy1Dto2D_shl[ALIGNED], ref.cu[i].cpy1Dto2D_shl[ALIGNED], sbuf1, sbuf2, STRIDE, 64);
752
+ }
753
if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shr)
754
{
755
HEADER("cpy1Dto2D_shr[%dx%d]", 4 << i, 4 << i);
756
757
REPORT_SPEEDUP(opt.frameInitLowres, ref.frameInitLowres, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64);
758
}
759
760
- if (opt.scale1D_128to64)
761
+ if (opt.scale1D_128to64[NONALIGNED])
762
{
763
HEADER0("scale1D_128to64");
764
- REPORT_SPEEDUP(opt.scale1D_128to64, ref.scale1D_128to64, pbuf2, pbuf1);
765
+ REPORT_SPEEDUP(opt.scale1D_128to64[NONALIGNED], ref.scale1D_128to64[NONALIGNED], pbuf2, pbuf1);
766
+ }
767
+
768
+ if (opt.scale1D_128to64[ALIGNED])
769
+ {
770
+ HEADER0("scale1D_128to64_aligned");
771
+ REPORT_SPEEDUP(opt.scale1D_128to64[ALIGNED], ref.scale1D_128to64[ALIGNED], pbuf2, pbuf1);
772
}
773
774
if (opt.scale2D_64to32)
775
x265_2.7.tar.gz/source/test/pixelharness.h -> x265_2.9.tar.gz/source/test/pixelharness.h
Changed
89
1
2
enum { RMAX = PIXEL_MAX - PIXEL_MIN }; //The maximum value obtained by subtracting pixel values (residual max)
3
enum { RMIN = PIXEL_MIN - PIXEL_MAX }; //The minimum value obtained by subtracting pixel values (residual min)
4
5
- ALIGN_VAR_32(pixel, pbuf1[BUFFSIZE]);
6
- pixel pbuf2[BUFFSIZE];
7
- pixel pbuf3[BUFFSIZE];
8
- pixel pbuf4[BUFFSIZE];
9
- int ibuf1[BUFFSIZE];
10
- int8_t psbuf1[BUFFSIZE];
11
- int8_t psbuf2[BUFFSIZE];
12
- int8_t psbuf3[BUFFSIZE];
13
- int8_t psbuf4[BUFFSIZE];
14
- int8_t psbuf5[BUFFSIZE];
15
+ ALIGN_VAR_64(pixel, pbuf1[BUFFSIZE]);
16
+ ALIGN_VAR_64(pixel, pbuf2[BUFFSIZE]);
17
+ ALIGN_VAR_64(pixel, pbuf3[BUFFSIZE]);
18
+ ALIGN_VAR_64(pixel, pbuf4[BUFFSIZE]);
19
+ ALIGN_VAR_64(int, ibuf1[BUFFSIZE]);
20
+ ALIGN_VAR_64(int8_t, psbuf1[BUFFSIZE]);
21
+ ALIGN_VAR_64(int8_t, psbuf2[BUFFSIZE]);
22
+ ALIGN_VAR_64(int8_t, psbuf3[BUFFSIZE]);
23
+ ALIGN_VAR_64(int8_t, psbuf4[BUFFSIZE]);
24
+ ALIGN_VAR_64(int8_t, psbuf5[BUFFSIZE]);
25
26
- int16_t sbuf1[BUFFSIZE];
27
- int16_t sbuf2[BUFFSIZE];
28
- int16_t sbuf3[BUFFSIZE];
29
+ ALIGN_VAR_64(int16_t, sbuf1[BUFFSIZE]);
30
+ ALIGN_VAR_64(int16_t, sbuf2[BUFFSIZE]);
31
+ ALIGN_VAR_64(int16_t, sbuf3[BUFFSIZE]);
32
33
- pixel pixel_test_buff[TEST_CASES][BUFFSIZE];
34
- int16_t short_test_buff[TEST_CASES][BUFFSIZE];
35
- int16_t short_test_buff1[TEST_CASES][BUFFSIZE];
36
- int16_t short_test_buff2[TEST_CASES][BUFFSIZE];
37
- int int_test_buff[TEST_CASES][BUFFSIZE];
38
- uint16_t ushort_test_buff[TEST_CASES][BUFFSIZE];
39
- uint8_t uchar_test_buff[TEST_CASES][BUFFSIZE];
40
- double double_test_buff[TEST_CASES][BUFFSIZE];
41
- int16_t residual_test_buff[TEST_CASES][BUFFSIZE];
42
+ ALIGN_VAR_64(pixel, pixel_test_buff[TEST_CASES][BUFFSIZE]);
43
+ ALIGN_VAR_64(int16_t, short_test_buff[TEST_CASES][BUFFSIZE]);
44
+ ALIGN_VAR_64(int16_t, short_test_buff1[TEST_CASES][BUFFSIZE]);
45
+ ALIGN_VAR_64(int16_t, short_test_buff2[TEST_CASES][BUFFSIZE]);
46
+ ALIGN_VAR_64(int, int_test_buff[TEST_CASES][BUFFSIZE]);
47
+ ALIGN_VAR_64(uint16_t, ushort_test_buff[TEST_CASES][BUFFSIZE]);
48
+ ALIGN_VAR_64(uint8_t, uchar_test_buff[TEST_CASES][BUFFSIZE]);
49
+ ALIGN_VAR_64(double, double_test_buff[TEST_CASES][BUFFSIZE]);
50
+ ALIGN_VAR_64(int16_t, residual_test_buff[TEST_CASES][BUFFSIZE]);
51
52
bool check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt);
53
bool check_pixel_sse(pixel_sse_t ref, pixel_sse_t opt);
54
55
bool check_copy_ps(copy_ps_t ref, copy_ps_t opt);
56
bool check_copy_ss(copy_ss_t ref, copy_ss_t opt);
57
bool check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt);
58
+ bool check_pixelavg_pp_aligned(pixelavg_pp_t ref, pixelavg_pp_t opt);
59
bool check_pixel_sub_ps(pixel_sub_ps_t ref, pixel_sub_ps_t opt);
60
bool check_pixel_add_ps(pixel_add_ps_t ref, pixel_add_ps_t opt);
61
+ bool check_pixel_add_ps_aligned(pixel_add_ps_t ref, pixel_add_ps_t opt);
62
bool check_scale1D_pp(scale1D_t ref, scale1D_t opt);
63
+ bool check_scale1D_pp_aligned(scale1D_t ref, scale1D_t opt);
64
bool check_scale2D_pp(scale2D_t ref, scale2D_t opt);
65
bool check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt);
66
+ bool check_ssd_s_aligned(pixel_ssd_s_t ref, pixel_ssd_s_t opt);
67
bool check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt);
68
+ bool check_blockfill_s_aligned(blockfill_s_t ref, blockfill_s_t opt);
69
bool check_calresidual(calcresidual_t ref, calcresidual_t opt);
70
+ bool check_calresidual_aligned(calcresidual_t ref, calcresidual_t opt);
71
bool check_transpose(transpose_t ref, transpose_t opt);
72
bool check_weightp(weightp_pp_t ref, weightp_pp_t opt);
73
bool check_weightp(weightp_sp_t ref, weightp_sp_t opt);
74
75
bool check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt);
76
bool check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt);
77
bool check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt);
78
+ bool check_cpy1Dto2D_shl_aligned_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt);
79
bool check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt);
80
bool check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt);
81
bool check_pixel_var(var_t ref, var_t opt);
82
bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt);
83
bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
84
bool check_addAvg(addAvg_t, addAvg_t);
85
+ bool check_addAvg_aligned(addAvg_t, addAvg_t);
86
bool check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt);
87
bool check_saoCuOrgE1_t(saoCuOrgE1_t ref, saoCuOrgE1_t opt);
88
bool check_saoCuOrgE2_t(saoCuOrgE2_t ref[], saoCuOrgE2_t opt[]);
89
x265_2.7.tar.gz/source/test/regression-tests.txt -> x265_2.9.tar.gz/source/test/regression-tests.txt
Changed
63
1
2
BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0 --limit-tu 4
3
BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 10 --bitrate 7000 --limit-tu 0::--preset slower --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 10 --bitrate 7000 --limit-tu 0
4
BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3 --limit-tu 3
5
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-save x265_analysis.dat --bitrate 7000 --tskip-fast --limit-tu 2::--preset veryslow --no-cutree --analysis-load x265_analysis.dat --bitrate 7000 --tskip-fast --limit-tu 2
6
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-save x265_analysis.dat --crf 18 --tskip-fast --limit-tu 2::--preset veryslow --no-cutree --analysis-load x265_analysis.dat --crf 18 --tskip-fast --limit-tu 2
7
BasketballDrive_1920x1080_50.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
8
Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
9
Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
10
Coastguard-4k.y4m,--preset superfast --tune grain --pme --aq-strength 2 --merange 190
11
-Coastguard-4k.y4m,--preset veryfast --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 1 --bitrate 15000::--preset veryfast --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 1 --bitrate 15000
12
+Coastguard-4k.y4m,--preset veryfast --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 1 --qp 35::--preset veryfast --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 1 --qp 35
13
Coastguard-4k.y4m,--preset medium --rdoq-level 1 --tune ssim --no-signhide --me umh --slices 2
14
Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1
15
CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
16
17
KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0 --limit-modes --limit-tu 1
18
NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset superfast --tune psnr
19
NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain --limit-refs 2
20
-NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-save x265_analysis.dat --rd 5 --analysis-reuse-level 10 --bitrate 9000::--preset slow --no-cutree --analysis-load x265_analysis.dat --rd 5 --analysis-reuse-level 10 --bitrate 9000
21
+NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-save x265_analysis.dat --rd 5 --analysis-reuse-level 10 --bitrate 9000 --vbv-maxrate 9000 --vbv-bufsize 9000::--preset slow --no-cutree --analysis-load x265_analysis.dat --rd 5 --analysis-reuse-level 10 --bitrate 9000 --vbv-maxrate 9000 --vbv-bufsize 9000
22
News-4k.y4m,--preset ultrafast --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 2 --bitrate 15000::--preset ultrafast --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 2 --bitrate 15000
23
News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0
24
News-4k.y4m,--preset superfast --slices 4 --aq-mode 0
25
News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
26
-News-4k.y4m,--preset slower --opt-cu-delta-qp
27
News-4k.y4m,--preset veryslow --no-rskip
28
News-4k.y4m,--preset veryslow --pme --crf 40
29
OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
30
31
city_4cif_60fps.y4m,--preset superfast --rdpenalty 1 --tu-intra-depth 2
32
city_4cif_60fps.y4m,--preset medium --crf 4 --cu-lossless --sao-non-deblock
33
city_4cif_60fps.y4m,--preset slower --scaling-list default
34
-city_4cif_60fps.y4m,--preset veryslow --opt-cu-delta-qp
35
city_4cif_60fps.y4m,--preset veryslow --rdpenalty 2 --sao-non-deblock --no-b-intra --limit-refs 0
36
ducks_take_off_420_720p50.y4m,--preset ultrafast --constrained-intra --rd 1
37
ducks_take_off_444_720p50.y4m,--preset superfast --weightp --limit-refs 2
38
39
Kimono1_1920x1080_24_400.yuv,--preset veryslow --crf 4 --cu-lossless --slices 2 --limit-refs 3 --limit-modes
40
Kimono1_1920x1080_24_400.yuv,--preset placebo --ctu 32 --max-tu-size 8 --limit-tu 2
41
big_buck_bunny_360p24.y4m, --keyint 60 --min-keyint 40 --gop-lookahead 14
42
-BasketballDrive_1920x1080_50.y4m, --preset medium --no-open-gop --keyint 50 --min-keyint 50 --radl 2
43
+BasketballDrive_1920x1080_50.y4m, --preset medium --no-open-gop --keyint 50 --min-keyint 50 --radl 2 --vbv-maxrate 5000 --vbv-bufsize 5000
44
45
# Main12 intraCost overflow bug test
46
720p50_parkrun_ter.y4m,--preset medium
47
48
#low-pass dct test
49
720p50_parkrun_ter.y4m,--preset medium --lowpass-dct
50
51
+#scaled save/load test
52
+crowd_run_1080p50.y4m,--preset ultrafast --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_2160p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000
53
+crowd_run_1080p50.y4m,--preset superfast --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000
54
+crowd_run_1080p50.y4m,--preset fast --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 5 --scale-factor 2 --qp 18
55
+crowd_run_1080p50.y4m,--preset medium --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
56
+RaceHorses_416x240_30.y4m,--preset slow --no-cutree --ctu 16 --analysis-save x265_analysis.dat --analysis-reuse-level 10 --scale-factor 2 --crf 22 --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m, --preset slow --no-cutree --ctu 32 --analysis-load x265_analysis.dat --analysis-save x265_analysis_2.dat --analysis-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,--preset slow --no-cutree --ctu 64 --analysis-load x265_analysis_2.dat --analysis-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
57
+ElFunete_960x540_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune psnr --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500 --analysis-reuse-level 10 --analysis-save elfuente_960x540.dat --scale-factor 2::ElFunete_1920x1080_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune psnr --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500 --analysis-reuse-level 10 --analysis-save elfuente_1920x1080.dat --limit-tu 0 --scale-factor 2 --analysis-load elfuente_960x540.dat --refine-intra 4 --refine-inter 2::ElFuente_3840x2160_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune=psnr --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000 --analysis-reuse-level 10 --limit-tu 0 --scale-factor 2 --analysis-load elfuente_1920x1080.dat --refine-intra 4 --refine-inter 2
58
+
59
+#segment encoding
60
+BasketballDrive_1920x1080_50.y4m, --preset ultrafast --no-open-gop --chunk-start 100 --chunk-end 200
61
+
62
# vim: tw=200
63
x265_2.7.tar.gz/source/test/smoke-tests.txt -> x265_2.9.tar.gz/source/test/smoke-tests.txt
Changed
10
1
2
old_town_cross_444_720p50.y4m,--preset=fast --keyint 20 --min-cu-size 16
3
old_town_cross_444_720p50.y4m,--preset=slow --sao-non-deblock --pmode --qg-size 32
4
RaceHorses_416x240_30_10bit.yuv,--preset=veryfast --max-tu-size 8
5
-RaceHorses_416x240_30_10bit.yuv,--preset=slower --bitrate 500 -F4 --rdoq-level 1 --opt-cu-delta-qp
6
+RaceHorses_416x240_30_10bit.yuv,--preset=slower --bitrate 500 -F4 --rdoq-level 1
7
CrowdRun_1920x1080_50_10bit_444.yuv,--preset=ultrafast --constrained-intra --min-keyint 5 --keyint 10
8
CrowdRun_1920x1080_50_10bit_444.yuv,--preset=medium --max-tu-size 16 --tu-inter-depth 2 --limit-tu 3
9
DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryfast --min-cu 16
10
x265_2.7.tar.gz/source/test/testbench.cpp -> x265_2.9.tar.gz/source/test/testbench.cpp
Changed
28
1
2
3
int main(int argc, char *argv[])
4
{
5
- int cpuid = X265_NS::cpu_detect();
6
+ bool enableavx512 = true;
7
+ int cpuid = X265_NS::cpu_detect(enableavx512);
8
const char *testname = 0;
9
10
if (!(argc & 1))
11
12
if (!strncmp(name, "cpuid", strlen(name)))
13
{
14
bool bError = false;
15
- cpuid = parseCpuName(value, bError);
16
+ cpuid = parseCpuName(value, bError, enableavx512);
17
if (bError)
18
{
19
printf("Invalid CPU name: %s\n", value);
20
21
{ "XOP", X265_CPU_XOP },
22
{ "AVX2", X265_CPU_AVX2 },
23
{ "BMI2", X265_CPU_AVX2 | X265_CPU_BMI1 | X265_CPU_BMI2 },
24
+ { "AVX512", X265_CPU_AVX512 },
25
{ "ARMv6", X265_CPU_ARMV6 },
26
{ "NEON", X265_CPU_NEON },
27
{ "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
28
x265_2.7.tar.gz/source/test/testharness.h -> x265_2.9.tar.gz/source/test/testharness.h
Changed
19
1
2
#include <x86intrin.h>
3
#elif ( !defined(__APPLE__) && defined (__GNUC__) && defined(__ARM_NEON__))
4
#include <arm_neon.h>
5
-#elif defined(__GNUC__)
6
+#elif defined(__GNUC__) && (!defined(__clang__) || __clang_major__ < 4)
7
/* fallback for older GCC/MinGW */
8
static inline uint32_t __rdtsc(void)
9
{
10
11
}
12
#endif // ifdef _MSC_VER
13
14
-#define BENCH_RUNS 1000
15
+#define BENCH_RUNS 2000
16
17
// Adapted from checkasm.c, runs each optimized primitive four times, measures rdtsc
18
// and discards invalid times. Repeats BENCH_RUNS times to get a good average. Then measures
19
x265_2.7.tar.gz/source/x265.cpp -> x265_2.9.tar.gz/source/x265.cpp
Changed
121
1
2
const char* reconPlayCmd;
3
const x265_api* api;
4
x265_param* param;
5
+ x265_vmaf_data* vmafData;
6
bool bProgress;
7
bool bForceY4m;
8
bool bDither;
9
10
reconPlayCmd = NULL;
11
api = NULL;
12
param = NULL;
13
+ vmafData = NULL;
14
framesToBeEncoded = seek = 0;
15
totalbytes = 0;
16
bProgress = true;
17
18
{
19
int eta = (int)(elapsed * (framesToBeEncoded - frameNum) / ((int64_t)frameNum * 1000000));
20
sprintf(buf, "x265 [%.1f%%] %d/%d frames, %.2f fps, %.2f kb/s, eta %d:%02d:%02d",
21
- 100. * frameNum / framesToBeEncoded, frameNum, framesToBeEncoded, fps, bitrate,
22
+ 100. * frameNum / (param->chunkEnd ? param->chunkEnd : param->totalFrames), frameNum, (param->chunkEnd ? param->chunkEnd : param->totalFrames), fps, bitrate,
23
eta / 3600, (eta / 60) % 60, eta % 60);
24
}
25
else
26
27
x265_log(NULL, X265_LOG_ERROR, "param alloc failed\n");
28
return true;
29
}
30
+#if ENABLE_LIBVMAF
31
+ vmafData = (x265_vmaf_data*)x265_malloc(sizeof(x265_vmaf_data));
32
+ if(!vmafData)
33
+ {
34
+ x265_log(NULL, X265_LOG_ERROR, "vmaf data alloc failed\n");
35
+ return true;
36
+ }
37
+#endif
38
39
if (api->param_default_preset(param, preset, tune) < 0)
40
{
41
42
info.frameCount = 0;
43
getParamAspectRatio(param, info.sarWidth, info.sarHeight);
44
45
+
46
this->input = InputFile::open(info, this->bForceY4m);
47
if (!this->input || this->input->isFail())
48
{
49
50
if (this->framesToBeEncoded == 0 && info.frameCount > (int)seek)
51
this->framesToBeEncoded = info.frameCount - seek;
52
param->totalFrames = this->framesToBeEncoded;
53
-
54
+
55
/* Force CFR until we have support for VFR */
56
info.timebaseNum = param->fpsDenom;
57
info.timebaseDenom = param->fpsNum;
58
59
param->sourceWidth, param->sourceHeight, param->fpsNum, param->fpsDenom,
60
x265_source_csp_names[param->internalCsp]);
61
}
62
+#if ENABLE_LIBVMAF
63
+ if (!reconfn)
64
+ {
65
+ x265_log(param, X265_LOG_ERROR, "recon file must be specified to get VMAF score, try --help for help\n");
66
+ return true;
67
+ }
68
+ const char *str = strrchr(info.filename, '.');
69
70
+ if (!strcmp(str, ".y4m"))
71
+ {
72
+ x265_log(param, X265_LOG_ERROR, "VMAF supports YUV file format only.\n");
73
+ return true;
74
+ }
75
+ if(param->internalCsp == X265_CSP_I420 || param->internalCsp == X265_CSP_I422 || param->internalCsp == X265_CSP_I444)
76
+ {
77
+ vmafData->reference_file = x265_fopen(inputfn, "rb");
78
+ vmafData->distorted_file = x265_fopen(reconfn, "rb");
79
+ }
80
+ else
81
+ {
82
+ x265_log(param, X265_LOG_ERROR, "VMAF will support only yuv420p, yu422p, yu444p, yuv420p10le, yuv422p10le, yuv444p10le formats.\n");
83
+ return true;
84
+ }
85
+#endif
86
this->output = OutputFile::open(outputfn, info);
87
if (this->output->isFail())
88
{
89
90
91
x265_param* param = cliopt.param;
92
const x265_api* api = cliopt.api;
93
-
94
+#if ENABLE_LIBVMAF
95
+ x265_vmaf_data* vmafdata = cliopt.vmafData;
96
+#endif
97
/* This allows muxers to modify bitstream format */
98
cliopt.output->setParam(param);
99
100
101
if (!numEncoded)
102
break;
103
}
104
-
105
+
106
/* clear progress report */
107
if (cliopt.bProgress)
108
fprintf(stderr, "%*s\r", 80, " ");
109
110
111
api->encoder_get_stats(encoder, &stats, sizeof(stats));
112
if (param->csvfn && !b_ctrl_c)
113
+#if ENABLE_LIBVMAF
114
+ api->vmaf_encoder_log(encoder, argc, argv, param, vmafdata);
115
+#else
116
api->encoder_log(encoder, argc, argv);
117
+#endif
118
api->encoder_close(encoder);
119
120
int64_t second_largest_pts = 0;
121
x265_2.7.tar.gz/source/x265.h -> x265_2.9.tar.gz/source/x265.h
Changed
425
1
2
extern "C" {
3
#endif
4
5
+#if _MSC_VER
6
+#pragma warning(disable: 4201) // non-standard extension used (nameless struct/union)
7
+#endif
8
+
9
/* x265_encoder:
10
* opaque handler for encoder */
11
typedef struct x265_encoder x265_encoder;
12
13
int lastMiniGopBFrame;
14
int plannedType[X265_LOOKAHEAD_MAX + 1];
15
int64_t dts;
16
+ int64_t reorderedPts;
17
} x265_lookahead_data;
18
19
+typedef struct x265_analysis_validate
20
+{
21
+ int maxNumReferences;
22
+ int analysisReuseLevel;
23
+ int sourceWidth;
24
+ int sourceHeight;
25
+ int keyframeMax;
26
+ int keyframeMin;
27
+ int openGOP;
28
+ int bframes;
29
+ int bPyramid;
30
+ int maxCUSize;
31
+ int minCUSize;
32
+ int intraRefresh;
33
+ int lookaheadDepth;
34
+ int chunkStart;
35
+ int chunkEnd;
36
+}x265_analysis_validate;
37
+
38
+/* Stores intra analysis data for a single frame. This struct needs better packing */
39
+typedef struct x265_analysis_intra_data
40
+{
41
+ uint8_t* depth;
42
+ uint8_t* modes;
43
+ char* partSizes;
44
+ uint8_t* chromaModes;
45
+}x265_analysis_intra_data;
46
+
47
+typedef struct x265_analysis_MV
48
+{
49
+ union{
50
+ struct { int16_t x, y; };
51
+
52
+ int32_t word;
53
+ };
54
+}x265_analysis_MV;
55
+
56
+/* Stores inter analysis data for a single frame */
57
+typedef struct x265_analysis_inter_data
58
+{
59
+ int32_t* ref;
60
+ uint8_t* depth;
61
+ uint8_t* modes;
62
+ uint8_t* partSize;
63
+ uint8_t* mergeFlag;
64
+ uint8_t* interDir;
65
+ uint8_t* mvpIdx[2];
66
+ int8_t* refIdx[2];
67
+ x265_analysis_MV* mv[2];
68
+ int64_t* sadCost;
69
+}x265_analysis_inter_data;
70
+
71
+typedef struct x265_weight_param
72
+{
73
+ uint32_t log2WeightDenom;
74
+ int inputWeight;
75
+ int inputOffset;
76
+ int wtPresent;
77
+}x265_weight_param;
78
+
79
+#if X265_DEPTH < 10
80
+typedef uint32_t sse_t;
81
+#else
82
+typedef uint64_t sse_t;
83
+#endif
84
+
85
+typedef struct x265_analysis_distortion_data
86
+{
87
+ sse_t* distortion;
88
+ sse_t* ctuDistortion;
89
+ double* scaledDistortion;
90
+ double averageDistortion;
91
+ double sdDistortion;
92
+ uint32_t highDistortionCtuCount;
93
+ uint32_t lowDistortionCtuCount;
94
+ double* offset;
95
+ double* threshold;
96
+}x265_analysis_distortion_data;
97
+
98
/* Stores all analysis data for a single frame */
99
typedef struct x265_analysis_data
100
{
101
- int64_t satdCost;
102
- uint32_t frameRecordSize;
103
- uint32_t poc;
104
- uint32_t sliceType;
105
- uint32_t numCUsInFrame;
106
- uint32_t numPartitions;
107
- uint32_t depthBytes;
108
- int bScenecut;
109
- void* wt;
110
- void* interData;
111
- void* intraData;
112
- uint32_t numCuInHeight;
113
- x265_lookahead_data lookahead;
114
- uint8_t* modeFlag[2];
115
+ int64_t satdCost;
116
+ uint32_t frameRecordSize;
117
+ uint32_t poc;
118
+ uint32_t sliceType;
119
+ uint32_t numCUsInFrame;
120
+ uint32_t numPartitions;
121
+ uint32_t depthBytes;
122
+ int bScenecut;
123
+ x265_weight_param* wt;
124
+ x265_analysis_inter_data* interData;
125
+ x265_analysis_intra_data* intraData;
126
+ uint32_t numCuInHeight;
127
+ x265_lookahead_data lookahead;
128
+ uint8_t* modeFlag[2];
129
+ x265_analysis_validate saveParam;
130
+ x265_analysis_distortion_data* distortionData;
131
} x265_analysis_data;
132
133
/* cu statistics */
134
135
/* All the above values will add up to 100%. */
136
} x265_pu_stats;
137
138
-
139
-typedef struct x265_analysis_2Pass
140
-{
141
- uint32_t poc;
142
- uint32_t frameRecordSize;
143
- void* analysisFramedata;
144
-}x265_analysis_2Pass;
145
-
146
/* Frame level statistics */
147
typedef struct x265_frame_stats
148
{
149
150
x265_cu_stats cuStats;
151
x265_pu_stats puStats;
152
double totalFrameTime;
153
+ double vmafFrameScore;
154
+ double bufferFillFinal;
155
} x265_frame_stats;
156
157
typedef struct x265_ctu_info_t
158
159
REGION_REFRESH_INFO = 134,
160
MASTERING_DISPLAY_INFO = 137,
161
CONTENT_LIGHT_LEVEL_INFO = 144,
162
+ ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147,
163
} SEIPayloadType;
164
165
typedef struct x265_sei_payload
166
167
168
int height;
169
170
- x265_analysis_2Pass analysis2Pass;
171
+ // pts is reordered in the order of encoding.
172
+ int64_t reorderedPts;
173
} x265_picture;
174
175
typedef enum
176
177
/* CPU flags */
178
179
/* x86 */
180
-#define X265_CPU_CMOV 0x0000001
181
-#define X265_CPU_MMX 0x0000002
182
-#define X265_CPU_MMX2 0x0000004 /* MMX2 aka MMXEXT aka ISSE */
183
+#define X265_CPU_MMX (1 << 0)
184
+#define X265_CPU_MMX2 (1 << 1) /* MMX2 aka MMXEXT aka ISSE */
185
#define X265_CPU_MMXEXT X265_CPU_MMX2
186
-#define X265_CPU_SSE 0x0000008
187
-#define X265_CPU_SSE2 0x0000010
188
-#define X265_CPU_SSE3 0x0000020
189
-#define X265_CPU_SSSE3 0x0000040
190
-#define X265_CPU_SSE4 0x0000080 /* SSE4.1 */
191
-#define X265_CPU_SSE42 0x0000100 /* SSE4.2 */
192
-#define X265_CPU_LZCNT 0x0000200 /* Phenom support for "leading zero count" instruction. */
193
-#define X265_CPU_AVX 0x0000400 /* AVX support: requires OS support even if YMM registers aren't used. */
194
-#define X265_CPU_XOP 0x0000800 /* AMD XOP */
195
-#define X265_CPU_FMA4 0x0001000 /* AMD FMA4 */
196
-#define X265_CPU_AVX2 0x0002000 /* AVX2 */
197
-#define X265_CPU_FMA3 0x0004000 /* Intel FMA3 */
198
-#define X265_CPU_BMI1 0x0008000 /* BMI1 */
199
-#define X265_CPU_BMI2 0x0010000 /* BMI2 */
200
+#define X265_CPU_SSE (1 << 2)
201
+#define X265_CPU_SSE2 (1 << 3)
202
+#define X265_CPU_LZCNT (1 << 4)
203
+#define X265_CPU_SSE3 (1 << 5)
204
+#define X265_CPU_SSSE3 (1 << 6)
205
+#define X265_CPU_SSE4 (1 << 7) /* SSE4.1 */
206
+#define X265_CPU_SSE42 (1 << 8) /* SSE4.2 */
207
+#define X265_CPU_AVX (1 << 9) /* Requires OS support even if YMM registers aren't used. */
208
+#define X265_CPU_XOP (1 << 10) /* AMD XOP */
209
+#define X265_CPU_FMA4 (1 << 11) /* AMD FMA4 */
210
+#define X265_CPU_FMA3 (1 << 12) /* Intel FMA3 */
211
+#define X265_CPU_BMI1 (1 << 13) /* BMI1 */
212
+#define X265_CPU_BMI2 (1 << 14) /* BMI2 */
213
+#define X265_CPU_AVX2 (1 << 15) /* AVX2 */
214
+#define X265_CPU_AVX512 (1 << 16) /* AVX-512 {F, CD, BW, DQ, VL}, requires OS support */
215
/* x86 modifiers */
216
-#define X265_CPU_CACHELINE_32 0x0020000 /* avoid memory loads that span the border between two cachelines */
217
-#define X265_CPU_CACHELINE_64 0x0040000 /* 32/64 is the size of a cacheline in bytes */
218
-#define X265_CPU_SSE2_IS_SLOW 0x0080000 /* avoid most SSE2 functions on Athlon64 */
219
-#define X265_CPU_SSE2_IS_FAST 0x0100000 /* a few functions are only faster on Core2 and Phenom */
220
-#define X265_CPU_SLOW_SHUFFLE 0x0200000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
221
-#define X265_CPU_STACK_MOD4 0x0400000 /* if stack is only mod4 and not mod16 */
222
-#define X265_CPU_SLOW_CTZ 0x0800000 /* BSR/BSF x86 instructions are really slow on some CPUs */
223
-#define X265_CPU_SLOW_ATOM 0x1000000 /* The Atom is terrible: slow SSE unaligned loads, slow
224
+#define X265_CPU_CACHELINE_32 (1 << 17) /* avoid memory loads that span the border between two cachelines */
225
+#define X265_CPU_CACHELINE_64 (1 << 18) /* 32/64 is the size of a cacheline in bytes */
226
+#define X265_CPU_SSE2_IS_SLOW (1 << 19) /* avoid most SSE2 functions on Athlon64 */
227
+#define X265_CPU_SSE2_IS_FAST (1 << 20) /* a few functions are only faster on Core2 and Phenom */
228
+#define X265_CPU_SLOW_SHUFFLE (1 << 21) /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
229
+#define X265_CPU_STACK_MOD4 (1 << 22) /* if stack is only mod4 and not mod16 */
230
+#define X265_CPU_SLOW_ATOM (1 << 23) /* The Atom is terrible: slow SSE unaligned loads, slow
231
* SIMD multiplies, slow SIMD variable shifts, slow pshufb,
232
* cacheline split penalties -- gather everything here that
233
* isn't shared by other CPUs to avoid making half a dozen
234
* new SLOW flags. */
235
-#define X265_CPU_SLOW_PSHUFB 0x2000000 /* such as on the Intel Atom */
236
-#define X265_CPU_SLOW_PALIGNR 0x4000000 /* such as on the AMD Bobcat */
237
+#define X265_CPU_SLOW_PSHUFB (1 << 24) /* such as on the Intel Atom */
238
+#define X265_CPU_SLOW_PALIGNR (1 << 25) /* such as on the AMD Bobcat */
239
240
/* ARM */
241
#define X265_CPU_ARMV6 0x0000001
242
243
#define X265_AQ_VARIANCE 1
244
#define X265_AQ_AUTO_VARIANCE 2
245
#define X265_AQ_AUTO_VARIANCE_BIASED 3
246
-
247
#define x265_ADAPT_RD_STRENGTH 4
248
-
249
+#define X265_REFINE_INTER_LEVELS 3
250
/* NOTE! For this release only X265_CSP_I420 and X265_CSP_I444 are supported */
251
-
252
/* Supported internal color space types (according to semantics of chroma_format_idc) */
253
#define X265_CSP_I400 0 /* yuv 4:0:0 planar */
254
#define X265_CSP_I420 1 /* yuv 4:2:0 planar */
255
256
double elapsedEncodeTime; /* wall time since encoder was opened */
257
double elapsedVideoTime; /* encoded picture count / frame rate */
258
double bitrate; /* accBits / elapsed video time */
259
+ double aggregateVmafScore; /* aggregate VMAF score for input video*/
260
uint64_t accBits; /* total bits output thus far */
261
uint32_t encodedPictureCount; /* number of output pictures thus far */
262
uint32_t totalWPFrames; /* number of uni-directional weighted frames used */
263
264
float bitrateFactor;
265
} x265_zone;
266
267
+/* data to calculate aggregate VMAF score */
268
+typedef struct x265_vmaf_data
269
+{
270
+ int width;
271
+ int height;
272
+ size_t offset;
273
+ int internalBitDepth;
274
+ FILE *reference_file; /* FILE pointer for input file */
275
+ FILE *distorted_file; /* FILE pointer for recon file generated*/
276
+}x265_vmaf_data;
277
+
278
+/* data to calculate frame level VMAF score */
279
+typedef struct x265_vmaf_framedata
280
+{
281
+ int width;
282
+ int height;
283
+ int frame_set;
284
+ int internalBitDepth;
285
+ void *reference_frame; /* points to fenc of particular frame */
286
+ void *distorted_frame; /* points to recon of particular frame */
287
+}x265_vmaf_framedata;
288
+
289
+/* common data needed to calculate both frame level and video level VMAF scores */
290
+typedef struct x265_vmaf_commondata
291
+{
292
+ char *format;
293
+ char *model_path;
294
+ char *log_path;
295
+ char *log_fmt;
296
+ int disable_clip;
297
+ int disable_avx;
298
+ int enable_transform;
299
+ int phone_model;
300
+ int psnr;
301
+ int ssim;
302
+ int ms_ssim;
303
+ char *pool;
304
+}x265_vmaf_commondata;
305
+
306
+static const x265_vmaf_commondata vcd[] = { { NULL, (char *)"/usr/local/share/model/vmaf_v0.6.1.pkl", NULL, NULL, 0, 0, 0, 0, 0, 0, 0, NULL } };
307
+
308
/* x265 input parameters
309
*
310
* For version safety you may use x265_param_alloc/free() to manage the
311
312
* somehow flawed on your target hardware. The asm function tables are
313
* process global, the first encoder configures them for all encoders */
314
int cpuid;
315
-
316
/*== Parallelism Features ==*/
317
318
/* Number of concurrently encoded frames between 1 and X265_MAX_FRAME_THREADS
319
320
* Default is 0, which is recommended */
321
int crQpOffset;
322
323
+ /* Specifies the preferred transfer characteristics syntax element in the
324
+ * alternative transfer characteristics SEI message (see D.2.38 and D.3.38 of
325
+ * JCTVC-W1005 http://phenix.it-sudparis.eu/jct/doc_end_user/documents/23_San%20Diego/wg11/JCTVC-W1005-v4.zip
326
+ * */
327
+ int preferredTransferCharacteristics;
328
+
329
+ /*
330
+ * Specifies the value for the pic_struct syntax element of the picture timing SEI message (See D2.3 and D3.3)
331
+ * of the HEVC spec. for a detailed explanation
332
+ * */
333
+ int pictureStructure;
334
+
335
struct
336
{
337
/* Explicit mode of rate-control, necessary for API users. It must
338
339
340
/*Number of RADL pictures allowed in front of IDR*/
341
int radl;
342
+
343
+ /* This value controls the maximum AU size defined in specification
344
+ * It represents the percentage of maximum AU size used.
345
+ * Default is 1 (which is 100%). Range is 0.5 to 1. */
346
+ double maxAUSizeFactor;
347
+
348
+ /* Enables the emission of a Recovery Point SEI with the stream headers
349
+ * at each IDR frame describing poc of the recovery point, exact matching flag
350
+ * and broken link flag. Default is disabled. */
351
+ int bEmitIDRRecoverySEI;
352
+
353
+ /* Dynamically change refine-inter at block level*/
354
+ int bDynamicRefine;
355
+
356
+ /* Enable writing all SEI messages in one single NAL unit instead of multiple NAL units */
357
+ int bSingleSeiNal;
358
+
359
+
360
+ /* First frame of the chunk. Frames preceding this in display order will
361
+ * be encoded, however, they will be discarded in the bitstream.
362
+ * Default 0 (disabled). */
363
+ int chunkStart;
364
+
365
+ /* Last frame of the chunk. Frames following this in display order will be
366
+ * used in taking lookahead decisions, but, they will not be encoded.
367
+ * Default 0 (disabled). */
368
+ int chunkEnd;
369
+ /* File containing base64 encoded SEI messages in POC order */
370
+ const char* naluFile;
371
+
372
} x265_param;
373
374
/* x265_param_alloc:
375
376
* A static string describing the compiler and target architecture */
377
X265_API extern const char *x265_build_info_str;
378
379
+/* x265_alloc_analysis_data:
380
+* Allocate memory for the x265_analysis_data object's internal structures. */
381
+void x265_alloc_analysis_data(x265_param *param, x265_analysis_data* analysis);
382
+
383
+/*
384
+* Free the allocated memory for x265_analysis_data object's internal structures. */
385
+void x265_free_analysis_data(x265_param *param, x265_analysis_data* analysis);
386
+
387
/* Force a link error in the case of linking against an incompatible API version.
388
* Glue #defines exist to force correct macro expansion; the final output of the macro
389
* is x265_encoder_open_##X265_BUILD (for purposes of dlopen). */
390
391
/* In-place downshift from a bit-depth greater than 8 to a bit-depth of 8, using
392
* the residual bits to dither each row. */
393
void x265_dither_image(x265_picture *, int picWidth, int picHeight, int16_t *errorBuf, int bitDepth);
394
+#if ENABLE_LIBVMAF
395
+/* x265_calculate_vmafScore:
396
+ * returns VMAF score for the input video.
397
+ * This api must be called only after encoding was done. */
398
+double x265_calculate_vmafscore(x265_param*, x265_vmaf_data*);
399
+
400
+/* x265_calculate_vmaf_framelevelscore:
401
+ * returns VMAF score for each frame in a given input video. */
402
+double x265_calculate_vmaf_framelevelscore(x265_vmaf_framedata*);
403
+/* x265_vmaf_encoder_log:
404
+ * write a line to the configured CSV file. If a CSV filename was not
405
+ * configured, or file open failed, this function will perform no write.
406
+ * This api will be called only when ENABLE_LIBVMAF cmake option is set */
407
+void x265_vmaf_encoder_log(x265_encoder *encoder, int argc, char **argv, x265_param*, x265_vmaf_data*);
408
+
409
+#endif
410
411
#define X265_MAJOR_VERSION 1
412
413
414
void (*csvlog_encode)(const x265_param*, const x265_stats *, int, int, int, char**);
415
void (*dither_image)(x265_picture*, int, int, int16_t*, int);
416
int (*set_analysis_data)(x265_encoder *encoder, x265_analysis_data *analysis_data, int poc, uint32_t cuBytes);
417
+#if ENABLE_LIBVMAF
418
+ double (*calculate_vmafscore)(x265_param *, x265_vmaf_data *);
419
+ double (*calculate_vmaf_framelevelscore)(x265_vmaf_framedata *);
420
+ void (*vmaf_encoder_log)(x265_encoder*, int, char**, x265_param *, x265_vmaf_data *);
421
+#endif
422
/* add new pointers to the end, or increment X265_MAJOR_VERSION */
423
} x265_api;
424
425
x265_2.7.tar.gz/source/x265cli.h -> x265_2.9.tar.gz/source/x265cli.h
Changed
104
1
2
{ "vbv-init", required_argument, NULL, 0 },
3
{ "vbv-end", required_argument, NULL, 0 },
4
{ "vbv-end-fr-adj", required_argument, NULL, 0 },
5
+ { "chunk-start", required_argument, NULL, 0 },
6
+ { "chunk-end", required_argument, NULL, 0 },
7
{ "bitrate", required_argument, NULL, 0 },
8
{ "qp", required_argument, NULL, 'q' },
9
{ "aq-mode", required_argument, NULL, 0 },
10
11
{ "scale-factor", required_argument, NULL, 0 },
12
{ "refine-intra", required_argument, NULL, 0 },
13
{ "refine-inter", required_argument, NULL, 0 },
14
+ { "dynamic-refine", no_argument, NULL, 0 },
15
+ { "no-dynamic-refine", no_argument, NULL, 0 },
16
{ "strict-cbr", no_argument, NULL, 0 },
17
{ "temporal-layers", no_argument, NULL, 0 },
18
{ "no-temporal-layers", no_argument, NULL, 0 },
19
20
{ "refine-mv-type", required_argument, NULL, 0 },
21
{ "copy-pic", no_argument, NULL, 0 },
22
{ "no-copy-pic", no_argument, NULL, 0 },
23
+ { "max-ausize-factor", required_argument, NULL, 0 },
24
+ { "idr-recovery-sei", no_argument, NULL, 0 },
25
+ { "no-idr-recovery-sei", no_argument, NULL, 0 },
26
+ { "single-sei", no_argument, NULL, 0 },
27
+ { "no-single-sei", no_argument, NULL, 0 },
28
+ { "atc-sei", required_argument, NULL, 0 },
29
+ { "pic-struct", required_argument, NULL, 0 },
30
+ { "nalu-file", required_argument, NULL, 0 },
31
{ 0, 0, 0, 0 },
32
{ 0, 0, 0, 0 },
33
{ 0, 0, 0, 0 },
34
35
H0(" --dhdr10-info <filename> JSON file containing the Creative Intent Metadata to be encoded as Dynamic Tone Mapping\n");
36
H0(" --[no-]dhdr10-opt Insert tone mapping SEI only for IDR frames and when the tone mapping information changes. Default disabled\n");
37
#endif
38
+ H0(" --nalu-file <filename> Text file containing SEI messages in the following format : <POC><space><PREFIX><space><NAL UNIT TYPE>/<SEI TYPE><space><SEI Payload>\n");
39
H0("-f/--frames <integer> Maximum number of frames to encode. Default all\n");
40
H0(" --seek <integer> First frame to encode\n");
41
H1(" --[no-]interlace <bff|tff> Indicate input pictures are interlace fields in temporal order. Default progressive\n");
42
43
H0(" --[no-]early-skip Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
44
H0(" --[no-]rskip Enable early exit from recursion. Default %s\n", OPT(param->bEnableRecursionSkip));
45
H1(" --[no-]tskip-fast Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
46
- H1(" --[no-]splitrd-skip Enable skipping split RD analysis when sum of split CU rdCost larger than none split CU rdCost for Intra CU. Default %s\n", OPT(param->bEnableSplitRdSkip));
47
+ H1(" --[no-]splitrd-skip Enable skipping split RD analysis when sum of split CU rdCost larger than one split CU rdCost for Intra CU. Default %s\n", OPT(param->bEnableSplitRdSkip));
48
H1(" --nr-intra <integer> An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n");
49
H1(" --nr-inter <integer> An integer value in range of 0 to 2000, which denotes strength of noise reduction in inter CUs. Default 0\n");
50
H0(" --ctu-info <integer> Enable receiving ctu information asynchronously and determine reaction to the CTU information (0, 1, 2, 4, 6) Default 0\n"
51
52
H0(" --vbv-init <float> Initial VBV buffer occupancy (fraction of bufsize or in kbits). Default %.2f\n", param->rc.vbvBufferInit);
53
H0(" --vbv-end <float> Final VBV buffer emptiness (fraction of bufsize or in kbits). Default 0 (disabled)\n");
54
H0(" --vbv-end-fr-adj <float> Frame from which qp has to be adjusted to achieve final decode buffer emptiness. Default 0\n");
55
+ H0(" --chunk-start <integer> First frame of the chunk. Default 0 (disabled)\n");
56
+ H0(" --chunk-end <integer> Last frame of the chunk. Default 0 (disabled)\n");
57
H0(" --pass Multi pass rate control.\n"
58
" - 1 : First pass, creates stats file\n"
59
" - 2 : Last pass, does not overwrite stats file\n"
60
61
H0(" --analysis-reuse-level <1..10> Level of analysis reuse indicates amount of info stored/reused in save/load mode, 1:least..10:most. Default %d\n", param->analysisReuseLevel);
62
H0(" --refine-mv-type <string> Reuse MV information received through API call. Supported option is avc. Default disabled - %d\n", param->bMVType);
63
H0(" --scale-factor <int> Specify factor by which input video is scaled down for analysis save mode. Default %d\n", param->scaleFactor);
64
- H0(" --refine-intra <0..3> Enable intra refinement for encode that uses analysis-load.\n"
65
+ H0(" --refine-intra <0..4> Enable intra refinement for encode that uses analysis-load.\n"
66
" - 0 : Forces both mode and depth from the save encode.\n"
67
" - 1 : Functionality of (0) + evaluate all intra modes at min-cu-size's depth when current depth is one smaller than min-cu-size's depth.\n"
68
" - 2 : Functionality of (1) + irrespective of size evaluate all angular modes when the save encode decides the best mode as angular.\n"
69
" - 3 : Functionality of (1) + irrespective of size evaluate all intra modes.\n"
70
+ " - 4 : Re-evaluate all intra blocks, does not reuse data from save encode.\n"
71
" Default:%d\n", param->intraRefine);
72
H0(" --refine-inter <0..3> Enable inter refinement for encode that uses analysis-load.\n"
73
" - 0 : Forces both mode and depth from the save encode.\n"
74
75
" - 2 : Functionality of (1) + irrespective of size restrict the modes evaluated when specific modes are decided as the best mode by the save encode.\n"
76
" - 3 : Functionality of (1) + irrespective of size evaluate all inter modes.\n"
77
" Default:%d\n", param->interRefine);
78
+ H0(" --[no-]dynamic-refine Dynamically changes refine-inter level for each CU. Default %s\n", OPT(param->bDynamicRefine));
79
H0(" --[no-]refine-mv Enable mv refinement for load mode. Default %s\n", OPT(param->mvRefine));
80
H0(" --aq-mode <integer> Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance 3:auto variance with bias to dark scenes. Default %d\n", param->rc.aqMode);
81
H0(" --aq-strength <float> Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
82
83
H1(" MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n");
84
H1(" Blank lines and lines starting with hash(#) are ignored\n");
85
H1(" Comma is considered to be white-space\n");
86
+ H0(" --max-ausize-factor <float> This value controls the maximum AU size defined in specification.\n");
87
+ H0(" It represents the percentage of maximum AU size used. Default %.1f\n", param->maxAUSizeFactor);
88
H0("\nLoop filters (deblock and SAO):\n");
89
H0(" --[no-]deblock Enable Deblocking Loop Filter, optionally specify tC:Beta offsets Default %s\n", OPT(param->bEnableLoopFilter));
90
H0(" --[no-]sao Enable Sample Adaptive Offset. Default %s\n", OPT(param->bEnableSAO));
91
92
H0(" --[no-]repeat-headers Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
93
H0(" --[no-]info Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));
94
H0(" --[no-]hrd Enable HRD parameters signaling. Default %s\n", OPT(param->bEmitHRDSEI));
95
+ H0(" --[no-]idr-recovery-sei Emit recovery point infor SEI at each IDR frame \n");
96
H0(" --[no-]temporal-layers Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
97
H0(" --[no-]aud Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
98
H1(" --hash <integer> Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
99
+ H0(" --atc-sei <integer> Emit the alternative transfer characteristics SEI message where the integer is the preferred transfer characteristics. Default disabled\n");
100
+ H0(" --pic-struct <integer> Set the picture structure and emits it in the picture timing SEI message. Values in the range 0..12. See D.3.3 of the HEVC spec. for a detailed explanation.\n");
101
H0(" --log2-max-poc-lsb <integer> Maximum of the picture order count\n");
102
H0(" --[no-]vui-timing-info Emit VUI timing information in the bistream. Default %s\n", OPT(param->bEmitVUITimingInfo));
103
H0(" --[no-]vui-hrd-info Emit VUI HRD information in the bistream. Default %s\n", OPT(param->bEmitVUIHRDInfo));
104
Refresh
No build results available
Refresh
No rpmlint results available
Login required, please
login
or
signup
in order to comment