Overview

Request 4285 (accepted)

No description set
Submit package home:Aloysius:branches:Essentials / x265 to package Essentials / x265

x265.changes Changed
x
 
1
@@ -1,4 +1,66 @@
2
 -------------------------------------------------------------------
3
+Tue Oct  9 20:03:53 UTC 2018 - aloisio@gmx.com
4
+
5
+- Update to version 2.9
6
+  New features:
7
+  * Support for chunked encoding
8
+    + :option:`--chunk-start and --chunk-end`
9
+    + Frames preceding first frame of chunk in display order
10
+      will be encoded, however, they will be discarded in the
11
+      bitstream.
12
+    + Frames following last frame of the chunk in display order
13
+      will be used in taking lookahead decisions, but, they will
14
+      not be encoded.
15
+    + This feature can be enabled only in closed GOP structures.
16
+      Default disabled.
17
+  * Support for HDR10+ version 1 SEI messages.
18
+  Encoder enhancements:
19
+  * Create API function for allocating and freeing
20
+    x265_analysis_data.
21
+  * CEA 608/708 support: Read SEI messages from text file and
22
+    encode it using userSEI message.
23
+  Bug fixes:
24
+  * Disable noise reduction when vbv is enabled.
25
+  * Support minLuma and maxLuma values changed by the
26
+    commandline.
27
+  version 2.8
28
+  New features:
29
+  * :option:`--asm avx512` used to enable AVX-512 in x265.
30
+    Default disabled.  
31
+    + For 4K main10 high-quality encoding, we are seeing good
32
+      gains; for other resolutions and presets, we don't
33
+      recommend using this setting for now.
34
+  * :option:`--dynamic-refine` dynamically switches between
35
+    different inter refine levels. Default disabled.
36
+    + It is recommended to use :option:`--refine-intra 4` with
37
+      dynamic refinement for a better trade-off between encode
38
+      efficiency and performance than using static refinement.
39
+  * :option:`--single-sei`
40
+    + Encode SEI messages in a single NAL unit instead of
41
+      multiple NAL units. Default disabled.
42
+  * :option:`--max-ausize-factor` controls the maximum AU size
43
+    defined in HEVC specification.
44
+    + It represents the percentage of maximum AU size used.
45
+      Default is 1.
46
+  * VMAF (Video Multi-Method Assessment Fusion)
47
+    + Added VMAF support for objective quality measurement of a
48
+      video sequence.
49
+    + Enable cmake option ENABLE_LIBVMAF to report per frame and
50
+      aggregate VMAF score. The frame level VMAF score does not
51
+      include temporal scores.
52
+    + This is supported only on linux for now.
53
+  Encoder enhancements:
54
+  * Introduced refine-intra level 4 to improve quality.
55
+  * Support for HLG-graded content and pic_struct in SEI message.
56
+  Bug Fixes:
57
+  * Fix 32 bit build error (using CMAKE GUI) in Linux.
58
+  * Fix 32 bit build error for asm primitives.
59
+  * Fix build error on mac OS.
60
+  * Fix VBV Lookahead in analysis load to achieve target bitrate.
61
+
62
+- Added x265-fix_enable512.patch
63
+
64
+-------------------------------------------------------------------
65
 Fri May  4 22:21:57 UTC 2018 - zaitor@opensuse.org
66
 
67
 - Build with nasm >= 2.13 for openSUSE Leap 42.3 and SLE-12, since
68
x265.spec Changed
83
 
1
@@ -1,10 +1,10 @@
2
 # based on the spec file from https://build.opensuse.org/package/view_file/home:Simmphonie/libx265/
3
 
4
 Name:           x265
5
-%define soname  151
6
+%define soname  165
7
 %define libname lib%{name}
8
 %define libsoname %{libname}-%{soname}
9
-Version:        2.7
10
+Version:        2.9
11
 Release:        0
12
 License:        GPL-2.0+
13
 Summary:        A free h265/HEVC encoder - encoder binary
14
@@ -13,17 +13,15 @@
15
 Source0:        https://bitbucket.org/multicoreware/x265/downloads/%{name}_%{version}.tar.gz
16
 Patch0:         arm.patch
17
 Patch1:         x265.pkgconfig.patch
18
+Patch2:         x265-fix_enable512.patch
19
 BuildRequires:  gcc
20
 BuildRequires:  gcc-c++
21
 BuildRequires:  cmake >= 2.8.8
22
 BuildRequires:  pkg-config
23
 BuildRequires:  nasm >= 2.13
24
-%if 0%{?suse_version} > 1310
25
 %ifarch x86_64
26
 BuildRequires:  libnuma-devel >= 2.0.9
27
 %endif
28
-%endif
29
-BuildRoot:      %{_tmppath}/%{name}-%{version}-build
30
 
31
 %description
32
 x265 is a free library for encoding next-generation H265/HEVC video
33
@@ -47,18 +45,19 @@
34
 
35
 %description -n %{libname}-devel
36
 x265 is a free library for encoding next-generation H265/HEVC video
37
-streams. 
38
+streams.
39
 
40
 %prep
41
 %setup -q -n %{name}_%{version}
42
 %patch0 -p1
43
 %patch1 -p1
44
+%patch2 -p1
45
 
46
 sed -i -e "s/0.0/%{soname}.0/g" source/cmake/version.cmake
47
 
48
 
49
 %build
50
-%if 0%{?suse_version} < 1330
51
+%if 0%{?suse_version} < 1500
52
 cd source
53
 %else
54
 %define __builddir ./source/build
55
@@ -68,7 +67,7 @@
56
 make %{?_smp_mflags}
57
 
58
 %install
59
-%if 0%{?suse_version} < 1330
60
+%if 0%{?suse_version} < 1500
61
 cd source
62
 %endif
63
 %cmake_install
64
@@ -79,15 +78,14 @@
65
 %postun -n %{libsoname} -p /sbin/ldconfig
66
 
67
 %files -n %{libsoname}
68
-%defattr(0644,root,root)
69
 %{_libdir}/%{libname}.so.%{soname}*
70
 
71
-%files 
72
-%defattr(0755,root,root)
73
+%files
74
 %{_bindir}/%{name}
75
 
76
 %files -n %{libname}-devel
77
-%defattr(0644,root,root)
78
+%license COPYING
79
+%doc readme.rst
80
 %{_includedir}/%{name}.h
81
 %{_includedir}/%{name}_config.h
82
 %{_libdir}/pkgconfig/%{name}.pc
83
x265-fix_enable512.patch Added
27
 
1
@@ -0,0 +1,25 @@
2
+--- a/source/common/cpu.cpp
3
++++ b/source/common/cpu.cpp
4
+@@ -110,6 +110,11 @@ const cpu_name_t cpu_names[] =
5
+     { "", 0 },
6
+ };
7
+ 
8
++bool detect512()
9
++{
10
++    return(enable512);
11
++}
12
++
13
+ #if X265_ARCH_X86
14
+ 
15
+ extern "C" {
16
+@@ -123,10 +128,6 @@ uint64_t PFX(cpu_xgetbv)(int xcr);
17
+ #pragma warning(disable: 4309) // truncation of constant value
18
+ #endif
19
+ 
20
+-bool detect512()
21
+-{
22
+-    return(enable512);
23
+-}
24
+ uint32_t cpu_detect(bool benableavx512 )
25
+ {
26
+ 
27
x265_2.7.tar.gz/.hg_archival.txt -> x265_2.9.tar.gz/.hg_archival.txt Changed
8
 
1
@@ -1,4 +1,4 @@
2
 repo: 09fe40627f03a0f9c3e6ac78b22ac93da23f9fdf
3
-node: e41a9bf2bac4a7af2bec2bbadf91e63752d320ef
4
+node: f9681d731f2e56c2ca185cec10daece5939bee07
5
 branch: stable
6
-tag: 2.7
7
+tag: 2.9
8
x265_2.7.tar.gz/.hgtags -> x265_2.9.tar.gz/.hgtags Changed
7
 
1
@@ -25,3 +25,5 @@
2
 e7a4dd48293b7956d4a20df257d23904cc78e376 2.4
3
 64b2d0bf45a52511e57a6b7299160b961ca3d51c 2.5
4
 0e9ea76945c89962cd46cee6537586e2054b2935 2.6
5
+e41a9bf2bac4a7af2bec2bbadf91e63752d320ef 2.7
6
+a158a3a029663133455268e2a63ae6b0af2df720 2.8
7
x265_2.7.tar.gz/doc/reST/api.rst -> x265_2.9.tar.gz/doc/reST/api.rst Changed
51
 
1
@@ -223,6 +223,18 @@
2
      *     returns negative on error, 0 access unit were output.*/
3
      int x265_set_analysis_data(x265_encoder *encoder, x265_analysis_data *analysis_data, int poc, uint32_t cuBytes);
4
 
5
+**x265_alloc_analysis_data()** may be used to allocate memory for the x265_analysis_data::
6
+
7
+    /* x265_alloc_analysis_data:
8
+     *     Allocate memory for the x265_analysis_data object's internal structures. */
9
+     void x265_alloc_analysis_data(x265_param *param, x265_analysis_data* analysis);
10
+
11
+**x265_free_analysis_data()** may be used to free memory for the x265_analysis_data::
12
+
13
+    /* x265_free_analysis_data:
14
+     *    Free the allocated memory for x265_analysis_data object's internal structures. */
15
+     void x265_free_analysis_data(x265_param *param, x265_analysis_data* analysis);
16
+
17
 Pictures
18
 ========
19
 
20
@@ -398,7 +410,30 @@
21
     *     release library static allocations, reset configured CTU size */
22
    void x265_cleanup(void);
23
 
24
+VMAF (Video Multi-Method Assessment Fusion)
25
+==========================================
26
+
27
+If you set the ENABLE_LIBVMAF cmake option to ON, then x265 will report per frame
28
+and aggregate VMAF score for the given input and dump the scores in csv file.
29
+The user also need to specify the :option:`--recon` in command line to get the VMAF scores.
30
+ 
31
+    /* x265_calculate_vmafScore:
32
+     *    returns VMAF score for the input video.
33
+     *    This api must be called only after encoding was done. */
34
+    double x265_calculate_vmafscore(x265_param*, x265_vmaf_data*);
35
+
36
+    /* x265_calculate_vmaf_framelevelscore:
37
+     *    returns VMAF score for each frame in a given input video. The frame level VMAF score does not include temporal scores. */
38
+    double x265_calculate_vmaf_framelevelscore(x265_vmaf_framedata*);
39
+    
40
+.. Note::
41
 
42
+    When setting ENABLE_LIBVMAF cmake option to ON, it is recommended to
43
+    also set ENABLE_SHARED to OFF to prevent build problems.  
44
+    We only need the static library from these builds.
45
+    
46
+    Binaries build with windows will not have VMAF support.
47
+      
48
 Multi-library Interface
49
 =======================
50
 
51
x265_2.7.tar.gz/doc/reST/cli.rst -> x265_2.9.tar.gz/doc/reST/cli.rst Changed
625
 
1
@@ -52,7 +52,7 @@
2
    2. unable to open encoder
3
    3. unable to generate stream headers
4
    4. encoder abort
5
-   
6
+
7
 Logging/Statistic Options
8
 =========================
9
 
10
@@ -104,6 +104,8 @@
11
    **BufferFill** Bits available for the next frame. Includes bits carried
12
    over from the current frame.
13
    
14
+   **BufferFillFinal** Buffer bits available after removing the frame out of CPB.
15
+   
16
    **Latency** Latency in terms of number of frames between when the frame 
17
    was given in and when the frame is given out.
18
    
19
@@ -183,11 +185,11 @@
20
    
21
 .. option:: --csv-log-level <integer>
22
 
23
-    Controls the level of detail (and size) of --csv log files
24
-       
25
-    0. summary **(default)**
26
-    1. frame level logging
27
-    2. frame level logging with performance statistics
28
+   Controls the level of detail (and size) of --csv log files
29
+
30
+   0. summary **(default)**
31
+   1. frame level logging
32
+   2. frame level logging with performance statistics
33
 
34
 .. option:: --ssim, --no-ssim
35
 
36
@@ -254,7 +256,7 @@
37
    "*"       - same as default
38
    "none"    - no thread pools are created, only frame parallelism possible
39
    "-"       - same as "none"
40
-   "10"      - allocate one pool, using up to 10 cores on node 0
41
+   "10"      - allocate one pool, using up to 10 cores on all available nodes
42
    "-,+"     - allocate one pool, using all cores on node 1
43
    "+,-,+"   - allocate one pool, using only cores on nodes 0 and 2
44
    "+,-,+,-" - allocate one pool, using only cores on nodes 0 and 2
45
@@ -535,6 +537,20 @@
46
 
47
    **CLI ONLY**
48
 
49
+.. option:: --chunk-start <integer>
50
+
51
+   First frame of the chunk. Frames preceeding this in display order will
52
+   be encoded, however, they will be discarded in the bitstream. This
53
+   feature can be enabled only in closed GOP structures.
54
+   Default 0 (disabled).
55
+   
56
+.. option:: --chunk-end <integer>
57
+
58
+   Last frame of the chunk. Frames following this in display order will be
59
+   used in taking lookahead decisions, but, they will not be encoded.
60
+   This feature can be enabled only in closed GOP structures.
61
+   Default 0 (disabled).
62
+
63
 Profile, Level, Tier
64
 ====================
65
 
66
@@ -646,9 +662,9 @@
67
     encoding options, the encoder will attempt to modify/set the right 
68
     encode specifications. If the encoder is unable to do so, this option
69
     will be turned OFF. Highly experimental.
70
-   
71
+
72
     Default: disabled
73
-   
74
+
75
 .. note::
76
 
77
    :option:`--profile`, :option:`--level-idc`, and
78
@@ -773,7 +789,7 @@
79
    Default 3.
80
 
81
 .. option:: --limit-modes, --no-limit-modes
82
-    
83
+
84
    When enabled, limit-modes will limit modes analyzed for each CU using cost 
85
    metrics from the 4 sub-CUs. When multiple inter modes like :option:`--rect`
86
    and/or :option:`--amp` are enabled, this feature will use motion cost 
87
@@ -820,6 +836,11 @@
88
 
89
    Default: enabled, disabled for :option:`--tune grain`
90
 
91
+.. option:: --splitrd-skip, --no-splitrd-skip
92
+
93
+   Enable skipping split RD analysis when sum of split CU rdCost larger than one
94
+   split CU rdCost for Intra CU. Default disabled.
95
+
96
 .. option:: --fast-intra, --no-fast-intra
97
 
98
    Perform an initial scan of every fifth intra angular mode, then
99
@@ -888,35 +909,36 @@
100
 
101
    Note that --analysis-reuse-level must be paired with analysis-reuse-mode.
102
 
103
-    +--------------+------------------------------------------+
104
-    | Level        | Description                              |
105
-    +==============+==========================================+
106
-    | 1            | Lookahead information                    |
107
-    +--------------+------------------------------------------+
108
-    | 2 to 4       | Level 1 + intra/inter modes, ref's       |
109
-    +--------------+------------------------------------------+
110
-    | 5,6 and 9    | Level 2 + rect-amp                       |
111
-    +--------------+------------------------------------------+
112
-    | 7            | Level 5 + AVC size CU refinement         |
113
-    +--------------+------------------------------------------+
114
-    | 8            | Level 5 + AVC size Full CU analysis-info |
115
-    +--------------+------------------------------------------+
116
-    | 10           | Level 5 + Full CU analysis-info          |
117
-    +--------------+------------------------------------------+
118
+   +--------------+------------------------------------------+
119
+   | Level        | Description                              |
120
+   +==============+==========================================+
121
+   | 1            | Lookahead information                    |
122
+   +--------------+------------------------------------------+
123
+   | 2 to 4       | Level 1 + intra/inter modes, ref's       |
124
+   +--------------+------------------------------------------+
125
+   | 5 and 6      | Level 2 + rect-amp                       |
126
+   +--------------+------------------------------------------+
127
+   | 7            | Level 5 + AVC size CU refinement         |
128
+   +--------------+------------------------------------------+
129
+   | 8 and 9      | Level 5 + AVC size Full CU analysis-info |
130
+   +--------------+------------------------------------------+
131
+   | 10           | Level 5 + Full CU analysis-info          |
132
+   +--------------+------------------------------------------+
133
 
134
 .. option:: --refine-mv-type <string>
135
 
136
-    Reuse MV information received through API call. Currently receives information for AVC size and the accepted 
137
-    string input is "avc". Default is disabled.
138
+   Reuse MV information received through API call. Currently receives information for AVC size and the accepted 
139
+   string input is "avc". Default is disabled.
140
 
141
 .. option:: --scale-factor
142
 
143
-       Factor by which input video is scaled down for analysis save mode.
144
-       This option should be coupled with analysis-reuse-mode option, --analysis-reuse-level 10.
145
-       The ctu size of load should be double the size of save. Default 0.
146
+   Factor by which input video is scaled down for analysis save mode.
147
+   This option should be coupled with analysis-reuse-mode option, 
148
+   --analysis-reuse-level 10. The ctu size of load can either be the 
149
+   same as that of save or double the size of save. Default 0.
150
+
151
+.. option:: --refine-intra <0..4>
152
 
153
-.. option:: --refine-intra <0..3>
154
-   
155
    Enables refinement of intra blocks in current encode. 
156
    
157
    Level 0 - Forces both mode and depth from the save encode.
158
@@ -931,8 +953,10 @@
159
    
160
    Level 3 - Perform analysis of intra modes for depth reused from first encode.
161
    
162
-   Default 0.
163
+   Level 4 - Does not reuse any analysis information - redo analysis for the intra block.
164
    
165
+   Default 0.
166
+
167
 .. option:: --refine-inter <0..3>
168
 
169
    Enables refinement of inter blocks in current encode. 
170
@@ -954,11 +978,17 @@
171
    
172
    Default 0.
173
 
174
+.. option:: --dynamic-refine, --no-dynamic-refine
175
+
176
+   Dynamically switches :option:`--refine-inter` levels 0-3 based on the content and 
177
+   the encoder settings. It is recommended to use :option:`--refine-intra` 4 with dynamic 
178
+   refinement. Default disabled.
179
+
180
 .. option:: --refine-mv
181
    
182
    Enables refinement of motion vector for scaled video. Evaluates the best 
183
    motion vector by searching the surrounding eight integer and subpel pixel
184
-    positions.
185
+   positions.
186
 
187
 Options which affect the transform unit quad-tree, sometimes referred to
188
 as the residual quad-tree (RQT).
189
@@ -1094,9 +1124,9 @@
190
    quad-tree begins at the same depth of the coded tree unit, but if the
191
    maximum TU size is smaller than the CU size then transform QT begins 
192
    at the depth of the max-tu-size. Default: 32.
193
-   
194
+
195
 .. option:: --dynamic-rd <0..4>
196
-   
197
+
198
    Increases the RD level at points where quality drops due to VBV rate 
199
    control enforcement. The number of CUs for which the RD is reconfigured 
200
    is determined based on the strength. Strength 1 gives the best FPS, 
201
@@ -1107,13 +1137,13 @@
202
 
203
 .. option:: --ssim-rd, --no-ssim-rd
204
 
205
-    Enable/Disable SSIM RDO. SSIM is a better perceptual quality assessment
206
-    method as compared to MSE. SSIM based RDO calculation is based on residual
207
-    divisive normalization scheme. This normalization is consistent with the 
208
-    luminance and contrast masking effect of Human Visual System. It is used
209
-    for mode selection during analysis of CTUs and can achieve significant 
210
-    gain in terms of objective quality metrics SSIM and PSNR. It only has effect
211
-    on presets which use RDO-based mode decisions (:option:`--rd` 3 and above).
212
+   Enable/Disable SSIM RDO. SSIM is a better perceptual quality assessment
213
+   method as compared to MSE. SSIM based RDO calculation is based on residual
214
+   divisive normalization scheme. This normalization is consistent with the 
215
+   luminance and contrast masking effect of Human Visual System. It is used
216
+   for mode selection during analysis of CTUs and can achieve significant 
217
+   gain in terms of objective quality metrics SSIM and PSNR. It only has effect
218
+   on presets which use RDO-based mode decisions (:option:`--rd` 3 and above).
219
 
220
 Temporal / motion search options
221
 ================================
222
@@ -1216,8 +1246,8 @@
223
 
224
 .. option:: --analyze-src-pics, --no-analyze-src-pics
225
 
226
-    Enalbe motion estimation with source frame pixels, in this mode, 
227
-    motion estimation can be computed independently. Default disabled.
228
+   Enable motion estimation with source frame pixels, in this mode, 
229
+   motion estimation can be computed independently. Default disabled.
230
 
231
 Spatial/intra options
232
 =====================
233
@@ -1362,12 +1392,12 @@
234
 
235
 .. option:: --ctu-info <0, 1, 2, 4, 6>
236
 
237
-   This value enables receiving CTU information asynchronously and determine reaction to the CTU information. Default 0.
238
-   1: force the partitions if CTU information is present.
239
-   2: functionality of (1) and reduce qp if CTU information has changed.
240
-   4: functionality of (1) and force Inter modes when CTU Information has changed, merge/skip otherwise.
241
-   This option should be enabled only when planning to invoke the API function x265_encoder_ctu_info to copy ctu-info asynchronously. 
242
-   If enabled without calling the API function, the encoder will wait indefinitely.
243
+   This value enables receiving CTU information asynchronously and determine reaction to the CTU information. Default 0.
244
+   1: force the partitions if CTU information is present.
245
+   2: functionality of (1) and reduce qp if CTU information has changed.
246
+   4: functionality of (1) and force Inter modes when CTU Information has changed, merge/skip otherwise.
247
+   This option should be enabled only when planning to invoke the API function x265_encoder_ctu_info to copy ctu-info asynchronously. 
248
+   If enabled without calling the API function, the encoder will wait indefinitely.
249
 
250
 .. option:: --intra-refresh
251
 
252
@@ -1387,16 +1417,17 @@
253
    Default 20
254
 
255
    **Range of values:** Between the maximum consecutive bframe count (:option:`--bframes`) and 250
256
+
257
 .. option:: --gop-lookahead <integer>
258
 
259
-        Number of frames for GOP boundary decision lookahead. If a scenecut frame is found
260
-        within this from the gop boundary set by `--keyint`, the GOP will be extented until such a point,
261
-        otherwise the GOP will be terminated as set by `--keyint`. Default 0.
262
+   Number of frames for GOP boundary decision lookahead. If a scenecut frame is found
263
+   within this from the gop boundary set by `--keyint`, the GOP will be extented until such a point,
264
+   otherwise the GOP will be terminated as set by `--keyint`. Default 0.
265
 
266
-        **Range of values:** Between 0 and (`--rc-lookahead` - mini-GOP length)
267
+   **Range of values:** Between 0 and (`--rc-lookahead` - mini-GOP length)
268
 
269
-        It is recommended to have `--gop-lookahaed` less than `--min-keyint` as scenecuts beyond
270
-        `--min-keyint` are already being coded as keyframes.
271
+   It is recommended to have `--gop-lookahaed` less than `--min-keyint` as scenecuts beyond
272
+   `--min-keyint` are already being coded as keyframes.
273
 
274
 .. option:: --lookahead-slices <0..16>
275
 
276
@@ -1412,30 +1443,30 @@
277
    on systems with many threads.
278
 
279
    The encoder may internally lower the number of slices or disable
280
-    slicing to ensure each slice codes at least 10 16x16 rows of lowres
281
-    blocks to minimize the impact on quality. For example, for 720p and
282
-    1080p videos, the number of slices is capped to 4 and 6, respectively.
283
-    For resolutions lesser than 720p, slicing is auto-disabled.
284
-        
285
-    If slices are used in lookahead, they are logged in the list of tools
286
-    as *lslices*
287
+   slicing to ensure each slice codes at least 10 16x16 rows of lowres
288
+   blocks to minimize the impact on quality. For example, for 720p and
289
+   1080p videos, the number of slices is capped to 4 and 6, respectively.
290
+   For resolutions lesser than 720p, slicing is auto-disabled.
291
+
292
+   If slices are used in lookahead, they are logged in the list of tools
293
+   as *lslices*
294
 
295
    **Values:** 0 - disabled. 1 is the same as 0. Max 16.
296
-    Default: 8 for ultrafast, superfast, faster, fast, medium
297
-             4 for slow, slower
298
-             disabled for veryslow, slower
299
-            
300
+   Default: 8 for ultrafast, superfast, faster, fast, medium
301
+            4 for slow, slower
302
+            disabled for veryslow, slower
303
+
304
 .. option:: --lookahead-threads <integer>
305
 
306
-    Use multiple worker threads dedicated to doing only lookahead instead of sharing
307
-    the worker threads with frame Encoders. A dedicated lookahead threadpool is created with the
308
-    specified number of worker threads. This can range from 0 upto half the
309
-    hardware threads available for encoding. Using too many threads for lookahead can starve
310
-    resources for frame Encoder and can harm performance. Default is 0 - disabled, Lookahead 
311
+   Use multiple worker threads dedicated to doing only lookahead instead of sharing
312
+   the worker threads with frame Encoders. A dedicated lookahead threadpool is created with the
313
+   specified number of worker threads. This can range from 0 upto half the
314
+   hardware threads available for encoding. Using too many threads for lookahead can starve
315
+   resources for frame Encoder and can harm performance. Default is 0 - disabled, Lookahead 
316
    shares worker threads with other FrameEncoders . 
317
 
318
     **Values:** 0 - disabled(default). Max - Half of available hardware threads.
319
-   
320
+
321
 .. option:: --b-adapt <integer>
322
 
323
    Set the level of effort in determining B frame placement.
324
@@ -1466,11 +1497,11 @@
325
 .. option:: --b-pyramid, --no-b-pyramid
326
 
327
    Use B-frames as references, when possible. Default enabled
328
-   
329
+
330
 .. option:: --force-flush <integer>
331
 
332
    Force the encoder to flush frames. Default is 0.
333
-   
334
+
335
    Values:
336
    0 - flush the encoder only when all the input pictures are over.
337
    1 - flush all the frames even when the input is not over. 
338
@@ -1502,7 +1533,7 @@
339
    any given frame (ensuring a max QP).  This is dangerous when CRF is
340
    used in combination with VBV as it may result in buffer underruns.
341
    Default disabled
342
-        
343
+
344
 .. option:: --crf-min <0..51.0>
345
 
346
    Specify an lower limit to the rate factor which may be assigned to
347
@@ -1541,7 +1572,7 @@
348
    Default 0.9
349
 
350
    **Range of values:** fractional: 0 - 1.0, or kbits: 2 .. bufsize
351
-   
352
+
353
 .. option:: --vbv-end <float>
354
 
355
    Final buffer emptiness. The portion of the decode buffer that must be 
356
@@ -1553,7 +1584,7 @@
357
    can specify the starting and ending state of the VBV buffer so that VBV 
358
    compliance can be maintained when chunks are independently encoded and 
359
    stitched together.
360
-   
361
+
362
 .. option:: --vbv-end-fr-adj <float>
363
 
364
    Frame from which qp has to be adjusted to achieve final decode buffer
365
@@ -1671,31 +1702,31 @@
366
 
367
 .. option:: --multi-pass-opt-analysis, --no-multi-pass-opt-analysis
368
 
369
-    Enable/Disable multipass analysis refinement along with multipass ratecontrol. Based on 
370
-    the information stored in pass 1, in subsequent passes analysis data is refined 
371
-    and also redundant steps are skipped.
372
-    In pass 1 analysis information like motion vector, depth, reference and prediction
373
-    modes of the final best CTU partition is stored for each CTU.
374
-    Multipass analysis refinement cannot be enabled when 'analysis-save/analysis-load' option
375
-    is enabled and both will be disabled when enabled together. This feature requires 'pmode/pme'
376
-    to be disabled and hence pmode/pme will be disabled when enabled at the same time.
377
+   Enable/Disable multipass analysis refinement along with multipass ratecontrol. Based on 
378
+   the information stored in pass 1, in subsequent passes analysis data is refined 
379
+   and also redundant steps are skipped.
380
+   In pass 1 analysis information like motion vector, depth, reference and prediction
381
+   modes of the final best CTU partition is stored for each CTU.
382
+   Multipass analysis refinement cannot be enabled when 'analysis-save/analysis-load' option
383
+   is enabled and both will be disabled when enabled together. This feature requires 'pmode/pme'
384
+   to be disabled and hence pmode/pme will be disabled when enabled at the same time.
385
 
386
-    Default: disabled.
387
+   Default: disabled.
388
 
389
 .. option:: --multi-pass-opt-distortion, --no-multi-pass-opt-distortion
390
 
391
-    Enable/Disable multipass refinement of qp based on distortion data along with multipass
392
-    ratecontrol. In pass 1 distortion of best CTU partition is stored. CTUs with high
393
-    distortion get lower(negative)qp offsets and vice-versa for low distortion CTUs in pass 2.
394
-    This helps to improve the subjective quality.
395
-    Multipass refinement of qp cannot be enabled when 'analysis-save/analysis-load' option
396
-    is enabled and both will be disabled when enabled together. 'multi-pass-opt-distortion' 
397
-    requires 'pmode/pme' to be disabled and hence pmode/pme will be disabled when enabled along with it.
398
+   Enable/Disable multipass refinement of qp based on distortion data along with multipass
399
+   ratecontrol. In pass 1 distortion of best CTU partition is stored. CTUs with high
400
+   distortion get lower(negative)qp offsets and vice-versa for low distortion CTUs in pass 2.
401
+   This helps to improve the subjective quality.
402
+   Multipass refinement of qp cannot be enabled when 'analysis-save/analysis-load' option
403
+   is enabled and both will be disabled when enabled together. 'multi-pass-opt-distortion' 
404
+   requires 'pmode/pme' to be disabled and hence pmode/pme will be disabled when enabled along with it.
405
 
406
-    Default: disabled.
407
+   Default: disabled.
408
 
409
 .. option:: --strict-cbr, --no-strict-cbr
410
-   
411
+
412
    Enables stricter conditions to control bitrate deviance from the 
413
    target bitrate in ABR mode. Bit rate adherence is prioritised
414
    over quality. Rate tolerance is reduced to 50%. Default disabled.
415
@@ -1708,7 +1739,7 @@
416
    encoded frames to control QP. strict-cbr allows the encoder to be 
417
    more aggressive in hitting the target bitrate even for short segment 
418
    videos.
419
-   
420
+
421
 .. option:: --cbqpoffs <integer>
422
 
423
    Offset of Cb chroma QP from the luma QP selected by rate control.
424
@@ -1741,14 +1772,13 @@
425
 
426
    qComp sets the quantizer curve compression factor. It weights the
427
    frame quantizer based on the complexity of residual (measured by
428
-   lookahead).  Default value is 0.6. Increasing it to 1 will
429
-   effectively generate CQP
430
+   lookahead). It's value must be between 0.5 and 1.0. Default value is
431
+   0.6. Increasing it to 1.0 will effectively generate CQP.
432
 
433
 .. option:: --qpstep <integer>
434
 
435
-   The maximum single adjustment in QP allowed to rate control. Default
436
-   4
437
-   
438
+   The maximum single adjustment in QP allowed to rate control. Default 4
439
+
440
 .. option:: --qpmin <integer>
441
 
442
    sets a hard lower limit on QP allowed to ratecontrol. Default 0
443
@@ -1756,21 +1786,21 @@
444
 .. option:: --qpmax <integer>
445
 
446
    sets a hard upper limit on QP allowed to ratecontrol. Default 69
447
-   
448
+
449
 .. option:: --rc-grain, --no-rc-grain
450
 
451
-   Enables a specialised ratecontrol algorithm for film grain content. This 
452
-   parameter strictly minimises QP fluctuations within and across frames 
453
-   and removes pulsing of grain. Default disabled. 
454
-   Enabled when :option:'--tune' grain is applied. It is highly recommended 
455
-   that this option is used through the tune grain feature where a combination 
456
-   of param options are used to improve visual quality.
457
-   
458
+   Enables a specialised ratecontrol algorithm for film grain content. This 
459
+   parameter strictly minimises QP fluctuations within and across frames 
460
+   and removes pulsing of grain. Default disabled. 
461
+   Enabled when :option:'--tune' grain is applied. It is highly recommended 
462
+   that this option is used through the tune grain feature where a combination 
463
+   of param options are used to improve visual quality.
464
+
465
 .. option:: --const-vbv, --no-const-vbv
466
 
467
-   Enables VBV algorithm to be consistent across runs. Default disabled. 
468
-   Enabled when :option:'--tune' grain is applied.
469
-   
470
+   Enables VBV algorithm to be consistent across runs. Default disabled. 
471
+   Enabled when :option:'--tune' grain is applied.
472
+
473
 .. option:: --qblur <float>
474
 
475
    Temporally blur quants. Default 0.5
476
@@ -1831,17 +1861,18 @@
477
    HEVC specifies a default set of scaling lists which may be enabled
478
    without requiring them to be signaled in the SPS. Those scaling
479
    lists can be enabled via :option:`--scaling-list` *default*.
480
-    
481
+
482
    All other strings indicate a filename containing custom scaling
483
    lists in the HM format. The encode will abort if the file is not
484
-   parsed correctly. Custom lists must be signaled in the SPS
485
+   parsed correctly. Custom lists must be signaled in the SPS. A sample
486
+   scaling list file is available in `the downloads page <https://bitbucket.org/multicoreware/x265/downloads/reference_scalinglist.txt>`_
487
 
488
 .. option:: --lambda-file <filename>
489
 
490
    Specify a text file containing values for x265_lambda_tab and
491
    x265_lambda2_tab. Each table requires MAX_MAX_QP+1 (70) float
492
    values.
493
-   
494
+
495
    The text file syntax is simple. Comma is considered to be
496
    white-space. All white-space is ignored. Lines must be less than 2k
497
    bytes in length. Content following hash (#) characters are ignored.
498
@@ -1856,6 +1887,11 @@
499
    vectors and splits) and less on residual. This feature is intended
500
    for experimentation.
501
 
502
+.. option:: --max-ausize-factor <float>
503
+
504
+   It controls the maximum AU size defined in specification. It represents
505
+   the percentage of maximum AU size used. Default is 1. Range is 0.5 to 1.
506
+
507
 Loop filters
508
 ============
509
 
510
@@ -1975,9 +2011,9 @@
511
    7. smpte240m
512
    8. film
513
    9. bt2020
514
-    10. smpte428
515
-    11. smpte431
516
-    12. smpte432
517
+   10. smpte428
518
+   11. smpte431
519
+   12. smpte432
520
 
521
 .. option:: --transfer <integer|string>
522
 
523
@@ -2018,10 +2054,10 @@
524
    8. YCgCo
525
    9. bt2020nc
526
    10. bt2020c
527
-    11. smpte2085
528
-    12. chroma-derived-nc
529
-    13. chroma-derived-c
530
-    14. ictcp
531
+   11. smpte2085
532
+   12. chroma-derived-nc
533
+   13. chroma-derived-c
534
+   14. ictcp
535
 
536
 .. option:: --chromaloc <0..5>
537
 
538
@@ -2075,13 +2111,13 @@
539
    automatically when :option:`--master-display` or :option:`--max-cll` is
540
    specified. Useful when there is a desire to signal 0 values for max-cll
541
    and max-fall. Default disabled.
542
-   
543
+
544
 .. option:: --hdr-opt, --no-hdr-opt
545
 
546
    Add luma and chroma offsets for HDR/WCG content.
547
    Input video should be 10 bit 4:2:0. Applicable for HDR content. It is recommended
548
    that AQ-mode be enabled along with this feature. Default disabled.
549
-   
550
+
551
 .. option:: --dhdr10-info <filename>
552
 
553
    Inserts tone mapping information as an SEI message. It takes as input, 
554
@@ -2107,6 +2143,24 @@
555
    Maximum luma value allowed for input pictures. Any values above max-luma
556
    are clipped.  No default.
557
 
558
+.. option:: --nalu-file <filename>
559
+
560
+   Text file containing userSEI in POC order : <POC><space><PREFIX><space><NAL UNIT TYPE>/<SEI TYPE><space><SEI Payload>
561
+   Parse the input file specified and inserts SEI messages into the bitstream. 
562
+   Currently, we support only PREFIX SEI messages. This is an "application-only" feature.
563
+
564
+.. option:: --atc-sei <integer>
565
+
566
+   Emit the alternative transfer characteristics SEI message where the integer
567
+   is the preferred transfer characteristics. Required for HLG (Hybrid Log Gamma)
568
+   signalling. Not signalled by default.
569
+
570
+.. option:: --pic-struct <integer>
571
+
572
+   Set the picture structure and emits it in the picture timing SEI message.
573
+   Values in the range 0..12. See D.3.3 of the HEVC spec. for a detailed explanation.
574
+   Required for HLG (Hybrid Log Gamma) signalling. Not signalled by default.
575
+
576
 Bitstream options
577
 =================
578
 
579
@@ -2173,7 +2227,7 @@
580
 
581
 .. option:: --log2-max-poc-lsb <integer>
582
 
583
-  Maximum of the picture order count. Default 8
584
+   Maximum of the picture order count. Default 8
585
 
586
 .. option:: --vui-timing-info, --no-vui-timing-info
587
 
588
@@ -2205,21 +2259,28 @@
589
 
590
    Only effective at RD levels 5 and 6
591
 
592
+.. option:: --idr-recovery-sei, --no-idr-recovery-sei
593
+   Emit RecoveryPoint info as sei in bitstream for each IDR frame. Default disabled.
594
+
595
+.. option:: --single-sei, --no-single-sei
596
+   Emit SEI messages in a single NAL unit instead of multiple NALs. Default disabled.
597
+   When HRD SEI is enabled the HM decoder will throw a warning.
598
+
599
 DCT Approximations
600
 =================
601
 
602
 .. option:: --lowpass-dct
603
 
604
-    If enabled, x265 will use low-pass subband dct approximation instead of the
605
-    standard dct for 16x16 and 32x32 blocks. This approximation is less computational 
606
-    intensive but it generates truncated coefficient matrixes for the transformed block. 
607
-    Empirical analysis shows marginal loss in compression and performance gains up to 10%,
608
-    paticularly at moderate bit-rates.
609
+   If enabled, x265 will use low-pass subband dct approximation instead of the
610
+   standard dct for 16x16 and 32x32 blocks. This approximation is less computational 
611
+   intensive but it generates truncated coefficient matrixes for the transformed block. 
612
+   Empirical analysis shows marginal loss in compression and performance gains up to 10%,
613
+   paticularly at moderate bit-rates.
614
 
615
-    This approximation should be considered for platforms with performance and time 
616
-    constrains.
617
+   This approximation should be considered for platforms with performance and time 
618
+   constrains.
619
 
620
-    Default disabled. **Experimental feature**
621
+   Default disabled. **Experimental feature**
622
 
623
 Debugging options
624
 =================
625
x265_2.7.tar.gz/doc/reST/presets.rst -> x265_2.9.tar.gz/doc/reST/presets.rst Changed
13
 
1
@@ -156,7 +156,10 @@
2
 that strictly minimises QP fluctuations across frames, while still allowing 
3
 the encoder to hit bitrate targets and VBV buffer limits (with a slightly 
4
 higher margin of error than normal). It is highly recommended that this 
5
-algorithm is used only through the :option:`--tune` *grain* feature.
6
+algorithm is used only through the :option:`--tune` *grain* feature. 
7
+Overriding the `--tune` *grain* settings might result in grain strobing, especially
8
+when enabling features like :option:`--aq-mode` and :option:`--cutree` that modify
9
+per-block QPs within a given frame.
10
 
11
 Fast Decode
12
 ~~~~~~~~~~~
13
x265_2.7.tar.gz/doc/reST/releasenotes.rst -> x265_2.9.tar.gz/doc/reST/releasenotes.rst Changed
71
 
1
@@ -2,6 +2,69 @@
2
 Release Notes
3
 *************
4
 
5
+Version 2.9
6
+===========
7
+
8
+Release date - 05/10/2018
9
+
10
+New features
11
+-------------
12
+1. Support for chunked encoding
13
+
14
+   :option:`--chunk-start and --chunk-end` 
15
+   Frames preceding first frame of chunk in display order will be encoded, however, they will be discarded in the bitstream.
16
+   Frames following last frame of the chunk in display order will be used in taking lookahead decisions, but, they will not be encoded. 
17
+   This feature can be enabled only in closed GOP structures. Default disabled.
18
+
19
+2. Support for HDR10+ version 1 SEI messages.
20
+
21
+Encoder enhancements
22
+--------------------
23
+1. Create API function for allocating and freeing x265_analysis_data.
24
+2. CEA 608/708 support: Read SEI messages from text file and encode it using userSEI message.
25
+
26
+Bug fixes
27
+---------
28
+1. Disable noise reduction when vbv is enabled.
29
+2. Support minLuma and maxLuma values changed by the commandline.
30
+
31
+Version 2.8
32
+===========
33
+
34
+Release date - 21/05/2018
35
+
36
+New features
37
+-------------
38
+1. :option:`--asm avx512` used to enable AVX-512 in x265. Default disabled.    
39
+    For 4K main10 high-quality encoding, we are seeing good gains; for other resolutions and presets, we don't recommend using this setting for now.
40
+
41
+2. :option:`--dynamic-refine` dynamically switches between different inter refine levels. Default disabled.
42
+    It is recommended to use :option:`--refine-intra 4' with dynamic refinement for a better trade-off between encode efficiency and performance than using static refinement.
43
+
44
+3. :option:`--single-sei`
45
+    Encode SEI messages in a single NAL unit instead of multiple NAL units. Default disabled. 
46
+
47
+4. :option:`--max-ausize-factor` controls the maximum AU size defined in HEVC specification.
48
+    It represents the percentage of maximum AU size used. Default is 1. 
49
+     
50
+5. VMAF (Video Multi-Method Assessment Fusion)
51
+   Added VMAF support for objective quality measurement of a video sequence. 
52
+   Enable cmake option ENABLE_LIBVMAF to report per frame and aggregate VMAF score. The frame level VMAF score does not include temporal scores.
53
+   This is supported only on linux for now.
54
+ 
55
+Encoder enhancements
56
+--------------------
57
+1. Introduced refine-intra level 4 to improve quality. 
58
+2. Support for HLG-graded content and pic_struct in SEI message.
59
+
60
+Bug Fixes
61
+---------
62
+1. Fix 32 bit build error (using CMAKE GUI) in Linux.
63
+2. Fix 32 bit build error for asm primitives.
64
+3. Fix build error on mac OS.
65
+4. Fix VBV Lookahead in analysis load to achieve target bitrate.
66
+
67
+
68
 Version 2.7
69
 ===========
70
 
71
x265_2.7.tar.gz/source/CMakeLists.txt -> x265_2.9.tar.gz/source/CMakeLists.txt Changed
57
 
1
@@ -29,7 +29,7 @@
2
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
3
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
4
 # X265_BUILD must be incremented each time the public API is changed
5
-set(X265_BUILD 151)
6
+set(X265_BUILD 165)
7
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
8
                "${PROJECT_BINARY_DIR}/x265.def")
9
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
10
@@ -48,12 +48,12 @@
11
 if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
12
     set(X86 1)
13
     add_definitions(-DX265_ARCH_X86=1)
14
-    if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
15
+    if(CMAKE_CXX_FLAGS STREQUAL "-m32")
16
+        message(STATUS "Detected x86 target processor")
17
+    elseif("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
18
         set(X64 1)
19
         add_definitions(-DX86_64=1)
20
         message(STATUS "Detected x86_64 target processor")
21
-    else()
22
-        message(STATUS "Detected x86 target processor")
23
     endif()
24
 elseif(POWERMATCH GREATER "-1")
25
     message(STATUS "Detected POWER target processor")
26
@@ -109,6 +109,11 @@
27
     if(NO_ATOMICS)
28
         add_definitions(-DNO_ATOMICS=1)
29
     endif(NO_ATOMICS)
30
+    find_library(VMAF vmaf)
31
+    option(ENABLE_LIBVMAF "Enable VMAF" OFF)
32
+    if(ENABLE_LIBVMAF)
33
+        add_definitions(-DENABLE_LIBVMAF)
34
+    endif()
35
 endif(UNIX)
36
 
37
 if(X64 AND NOT WIN32)
38
@@ -536,6 +541,9 @@
39
 if(EXTRA_LIB)
40
     target_link_libraries(x265-static ${EXTRA_LIB})
41
 endif()
42
+if(ENABLE_LIBVMAF)
43
+    target_link_libraries(x265-static ${VMAF})
44
+endif()
45
 install(TARGETS x265-static
46
     LIBRARY DESTINATION ${LIB_INSTALL_DIR}
47
     ARCHIVE DESTINATION ${LIB_INSTALL_DIR})
48
@@ -546,7 +554,7 @@
49
         ARCHIVE DESTINATION ${LIB_INSTALL_DIR})
50
 endif()
51
 install(FILES x265.h "${PROJECT_BINARY_DIR}/x265_config.h" DESTINATION include)
52
-if(WIN32)
53
+if((WIN32 AND ENABLE_CLI) OR (WIN32 AND ENABLE_SHARED))
54
     if(MSVC_IDE)
55
         install(FILES "${PROJECT_BINARY_DIR}/Debug/x265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS Debug)
56
         install(FILES "${PROJECT_BINARY_DIR}/RelWithDebInfo/x265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS RelWithDebInfo)
57
x265_2.7.tar.gz/source/common/common.cpp -> x265_2.9.tar.gz/source/common/common.cpp Changed
10
 
1
@@ -54,7 +54,7 @@
2
 #endif
3
 }
4
 
5
-#define X265_ALIGNBYTES 32
6
+#define X265_ALIGNBYTES 64
7
 
8
 #if _WIN32
9
 #if defined(__MINGW32__) && !defined(__MINGW64_VERSION_MAJOR)
10
x265_2.7.tar.gz/source/common/common.h -> x265_2.9.tar.gz/source/common/common.h Changed
26
 
1
@@ -75,6 +75,7 @@
2
 #define ALIGN_VAR_8(T, var)  T var __attribute__((aligned(8)))
3
 #define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16)))
4
 #define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32)))
5
+#define ALIGN_VAR_64(T, var) T var __attribute__((aligned(64)))
6
 #if defined(__MINGW32__)
7
 #define fseeko fseeko64
8
 #define ftello ftello64
9
@@ -85,6 +86,7 @@
10
 #define ALIGN_VAR_8(T, var)  __declspec(align(8)) T var
11
 #define ALIGN_VAR_16(T, var) __declspec(align(16)) T var
12
 #define ALIGN_VAR_32(T, var) __declspec(align(32)) T var
13
+#define ALIGN_VAR_64(T, var) __declspec(align(64)) T var
14
 #define fseeko _fseeki64
15
 #define ftello _ftelli64
16
 #endif // if defined(__GNUC__)
17
@@ -330,6 +332,8 @@
18
 #define START_CODE_OVERHEAD 3 
19
 #define FILLER_OVERHEAD (NAL_TYPE_OVERHEAD + START_CODE_OVERHEAD + 1)
20
 
21
+#define MAX_NUM_DYN_REFINE          (NUM_CU_DEPTH * X265_REFINE_INTER_LEVELS)
22
+
23
 namespace X265_NS {
24
 
25
 enum { SAO_NUM_OFFSET = 4 };
26
x265_2.7.tar.gz/source/common/cpu.cpp -> x265_2.9.tar.gz/source/common/cpu.cpp Changed
200
 
1
@@ -58,10 +58,11 @@
2
 #endif // if X265_ARCH_ARM
3
 
4
 namespace X265_NS {
5
+static bool enable512 = false;
6
 const cpu_name_t cpu_names[] =
7
 {
8
 #if X265_ARCH_X86
9
-#define MMX2 X265_CPU_MMX | X265_CPU_MMX2 | X265_CPU_CMOV
10
+#define MMX2 X265_CPU_MMX | X265_CPU_MMX2
11
     { "MMX2",        MMX2 },
12
     { "MMXEXT",      MMX2 },
13
     { "SSE",         MMX2 | X265_CPU_SSE },
14
@@ -84,13 +85,13 @@
15
     { "BMI2",        AVX | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 },
16
 #define AVX2 AVX | X265_CPU_FMA3 | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 | X265_CPU_AVX2
17
     { "AVX2", AVX2},
18
+    { "AVX512", AVX2 | X265_CPU_AVX512 },
19
 #undef AVX2
20
 #undef AVX
21
 #undef SSE2
22
 #undef MMX2
23
     { "Cache32",         X265_CPU_CACHELINE_32 },
24
     { "Cache64",         X265_CPU_CACHELINE_64 },
25
-    { "SlowCTZ",         X265_CPU_SLOW_CTZ },
26
     { "SlowAtom",        X265_CPU_SLOW_ATOM },
27
     { "SlowPshufb",      X265_CPU_SLOW_PSHUFB },
28
     { "SlowPalignr",     X265_CPU_SLOW_PALIGNR },
29
@@ -115,28 +116,32 @@
30
 /* cpu-a.asm */
31
 int PFX(cpu_cpuid_test)(void);
32
 void PFX(cpu_cpuid)(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
33
-void PFX(cpu_xgetbv)(uint32_t op, uint32_t *eax, uint32_t *edx);
34
+uint64_t PFX(cpu_xgetbv)(int xcr);
35
 }
36
 
37
 #if defined(_MSC_VER)
38
 #pragma warning(disable: 4309) // truncation of constant value
39
 #endif
40
 
41
-uint32_t cpu_detect(void)
42
+bool detect512()
43
+{
44
+    return(enable512);
45
+}
46
+uint32_t cpu_detect(bool benableavx512 )
47
 {
48
-    uint32_t cpu = 0;
49
 
50
+    uint32_t cpu = 0; 
51
     uint32_t eax, ebx, ecx, edx;
52
     uint32_t vendor[4] = { 0 };
53
     uint32_t max_extended_cap, max_basic_cap;
54
+    uint64_t xcr0 = 0;
55
 
56
 #if !X86_64
57
     if (!PFX(cpu_cpuid_test)())
58
         return 0;
59
 #endif
60
 
61
-    PFX(cpu_cpuid)(0, &eax, vendor + 0, vendor + 2, vendor + 1);
62
-    max_basic_cap = eax;
63
+    PFX(cpu_cpuid)(0, &max_basic_cap, vendor + 0, vendor + 2, vendor + 1);
64
     if (max_basic_cap == 0)
65
         return 0;
66
 
67
@@ -147,27 +152,24 @@
68
         return cpu;
69
     if (edx & 0x02000000)
70
         cpu |= X265_CPU_MMX2 | X265_CPU_SSE;
71
-    if (edx & 0x00008000)
72
-        cpu |= X265_CPU_CMOV;
73
-    else
74
-        return cpu;
75
     if (edx & 0x04000000)
76
         cpu |= X265_CPU_SSE2;
77
     if (ecx & 0x00000001)
78
         cpu |= X265_CPU_SSE3;
79
     if (ecx & 0x00000200)
80
-        cpu |= X265_CPU_SSSE3;
81
+        cpu |= X265_CPU_SSSE3 | X265_CPU_SSE2_IS_FAST;
82
     if (ecx & 0x00080000)
83
         cpu |= X265_CPU_SSE4;
84
     if (ecx & 0x00100000)
85
         cpu |= X265_CPU_SSE42;
86
-    /* Check OXSAVE and AVX bits */
87
-    if ((ecx & 0x18000000) == 0x18000000)
88
+
89
+    if (ecx & 0x08000000) /* XGETBV supported and XSAVE enabled by OS */
90
     {
91
         /* Check for OS support */
92
-        PFX(cpu_xgetbv)(0, &eax, &edx);
93
-        if ((eax & 0x6) == 0x6)
94
+        xcr0 = PFX(cpu_xgetbv)(0);
95
+        if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
96
         {
97
+            if (ecx & 0x10000000)
98
             cpu |= X265_CPU_AVX;
99
             if (ecx & 0x00001000)
100
                 cpu |= X265_CPU_FMA3;
101
@@ -178,19 +180,29 @@
102
     {
103
         PFX(cpu_cpuid)(7, &eax, &ebx, &ecx, &edx);
104
         /* AVX2 requires OS support, but BMI1/2 don't. */
105
-        if ((cpu & X265_CPU_AVX) && (ebx & 0x00000020))
106
-            cpu |= X265_CPU_AVX2;
107
         if (ebx & 0x00000008)
108
-        {
109
             cpu |= X265_CPU_BMI1;
110
-            if (ebx & 0x00000100)
111
-                cpu |= X265_CPU_BMI2;
112
+        if (ebx & 0x00000100)
113
+            cpu |= X265_CPU_BMI2;
114
+
115
+        if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */
116
+        {
117
+            if (ebx & 0x00000020)
118
+                cpu |= X265_CPU_AVX2;
119
+            if (benableavx512)
120
+            {
121
+                if ((xcr0 & 0xE0) == 0xE0) /* OPMASK/ZMM state */
122
+                {
123
+                    if ((ebx & 0xD0030000) == 0xD0030000)
124
+                    {
125
+                        cpu |= X265_CPU_AVX512;
126
+                        enable512 = true;
127
+                    }
128
+                }
129
+            }
130
         }
131
     }
132
 
133
-    if (cpu & X265_CPU_SSSE3)
134
-        cpu |= X265_CPU_SSE2_IS_FAST;
135
-
136
     PFX(cpu_cpuid)(0x80000000, &eax, &ebx, &ecx, &edx);
137
     max_extended_cap = eax;
138
 
139
@@ -230,8 +242,6 @@
140
         {
141
             if (edx & 0x00400000)
142
                 cpu |= X265_CPU_MMX2;
143
-            if (!(cpu & X265_CPU_LZCNT))
144
-                cpu |= X265_CPU_SLOW_CTZ;
145
             if ((cpu & X265_CPU_SSE2) && !(cpu & X265_CPU_SSE2_IS_FAST))
146
                 cpu |= X265_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
147
         }
148
@@ -244,19 +254,10 @@
149
         int model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
150
         if (family == 6)
151
         {
152
-            /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
153
-             * theoretically support sse2, but it's significantly slower than mmx for
154
-             * almost all of x264's functions, so let's just pretend they don't. */
155
-            if (model == 9 || model == 13 || model == 14)
156
-            {
157
-                cpu &= ~(X265_CPU_SSE2 | X265_CPU_SSE3);
158
-                X265_CHECK(!(cpu & (X265_CPU_SSSE3 | X265_CPU_SSE4)), "unexpected CPU ID %d\n", cpu);
159
-            }
160
             /* Detect Atom CPU */
161
-            else if (model == 28)
162
+            if (model == 28)
163
             {
164
                 cpu |= X265_CPU_SLOW_ATOM;
165
-                cpu |= X265_CPU_SLOW_CTZ;
166
                 cpu |= X265_CPU_SLOW_PSHUFB;
167
             }
168
 
169
@@ -328,7 +329,7 @@
170
 int PFX(cpu_fast_neon_mrc_test)(void);
171
 }
172
 
173
-uint32_t cpu_detect(void)
174
+uint32_t cpu_detect(bool benableavx512)
175
 {
176
     int flags = 0;
177
 
178
@@ -371,7 +372,7 @@
179
 
180
 #elif X265_ARCH_POWER8
181
 
182
-uint32_t cpu_detect(void)
183
+uint32_t cpu_detect(bool benableavx512)
184
 {
185
 #if HAVE_ALTIVEC
186
     return X265_CPU_ALTIVEC;
187
@@ -382,10 +383,11 @@
188
 
189
 #else // if X265_ARCH_POWER8
190
 
191
-uint32_t cpu_detect(void)
192
+uint32_t cpu_detect(bool benableavx512)
193
 {
194
     return 0;
195
 }
196
 
197
 #endif // if X265_ARCH_X86
198
 }
199
+
200
x265_2.7.tar.gz/source/common/cpu.h -> x265_2.9.tar.gz/source/common/cpu.h Changed
19
 
1
@@ -26,7 +26,6 @@
2
 #define X265_CPU_H
3
 
4
 #include "common.h"
5
-
6
 /* All assembly functions are prefixed with X265_NS (macro expanded) */
7
 #define PFX3(prefix, name) prefix ## _ ## name
8
 #define PFX2(prefix, name) PFX3(prefix, name)
9
@@ -50,7 +49,8 @@
10
 #endif
11
 
12
 namespace X265_NS {
13
-uint32_t cpu_detect(void);
14
+uint32_t cpu_detect(bool);
15
+bool detect512();
16
 
17
 struct cpu_name_t
18
 {
19
x265_2.7.tar.gz/source/common/cudata.cpp -> x265_2.9.tar.gz/source/common/cudata.cpp Changed
29
 
1
@@ -1626,11 +1626,6 @@
2
                 dir |= (1 << list);
3
                 candMvField[count][list].mv = colmv;
4
                 candMvField[count][list].refIdx = refIdx;
5
-                if (m_encData->m_param->scaleFactor && m_encData->m_param->analysisSave && m_log2CUSize[0] < 4)
6
-                {
7
-                    MV dist(MAX_MV, MAX_MV);
8
-                    candMvField[count][list].mv = dist;
9
-                }
10
             }
11
         }
12
 
13
@@ -1790,14 +1785,7 @@
14
 
15
             int curRefPOC = m_slice->m_refPOCList[picList][refIdx];
16
             int curPOC = m_slice->m_poc;
17
-
18
-            if (m_encData->m_param->scaleFactor && m_encData->m_param->analysisSave && (m_log2CUSize[0] < 4))
19
-            {
20
-                MV dist(MAX_MV, MAX_MV);
21
-                pmv[numMvc++] = amvpCand[num++] = dist;
22
-            }
23
-            else
24
-                pmv[numMvc++] = amvpCand[num++] = scaleMvByPOCDist(neighbours[MD_COLLOCATED].mv[picList], curPOC, curRefPOC, colPOC, colRefPOC);
25
+            pmv[numMvc++] = amvpCand[num++] = scaleMvByPOCDist(neighbours[MD_COLLOCATED].mv[picList], curPOC, curRefPOC, colPOC, colRefPOC);
26
         }
27
     }
28
 
29
x265_2.7.tar.gz/source/common/cudata.h -> x265_2.9.tar.gz/source/common/cudata.h Changed
27
 
1
@@ -224,6 +224,11 @@
2
     uint64_t      m_fAc_den[3];
3
     uint64_t      m_fDc_den[3];
4
 
5
+    /* Feature values per CTU for dynamic refinement */
6
+    uint64_t*       m_collectCURd;
7
+    uint32_t*       m_collectCUVariance;
8
+    uint32_t*       m_collectCUCount;
9
+
10
     CUData();
11
 
12
     void     initialize(const CUDataMemPool& dataPool, uint32_t depth, const x265_param& param, int instance);
13
@@ -348,8 +353,12 @@
14
     coeff_t* trCoeffMemBlock;
15
     MV*      mvMemBlock;
16
     sse_t*   distortionMemBlock;
17
+    uint64_t* dynRefineRdBlock;
18
+    uint32_t* dynRefCntBlock;
19
+    uint32_t* dynRefVarBlock;
20
 
21
-    CUDataMemPool() { charMemBlock = NULL; trCoeffMemBlock = NULL; mvMemBlock = NULL; distortionMemBlock = NULL; }
22
+    CUDataMemPool() { charMemBlock = NULL; trCoeffMemBlock = NULL; mvMemBlock = NULL; distortionMemBlock = NULL; 
23
+                      dynRefineRdBlock = NULL; dynRefCntBlock = NULL; dynRefVarBlock = NULL;}
24
 
25
     bool create(uint32_t depth, uint32_t csp, uint32_t numInstances, const x265_param& param)
26
     {
27
x265_2.7.tar.gz/source/common/dct.cpp -> x265_2.9.tar.gz/source/common/dct.cpp Changed
130
 
1
@@ -980,19 +980,110 @@
2
             sum += sbacGetEntropyBits(mstate, firstC2Flag);
3
         }
4
     }
5
-
6
     return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
7
 }
8
+template<int log2TrSize>
9
+static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
10
+{
11
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
12
+    const int scaleBits = SCALE_BITS - 2 * transformShift;
13
+    const uint32_t trSize = 1 << log2TrSize;
14
+
15
+    for (int y = 0; y < MLS_CG_SIZE; y++)
16
+    {
17
+        for (int x = 0; x < MLS_CG_SIZE; x++)
18
+        {
19
+             int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
20
+             costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
21
+             *totalUncodedCost += costUncoded[blkPos + x];
22
+             *totalRdCost += costUncoded[blkPos + x];
23
+        }
24
+        blkPos += trSize;
25
+    }
26
+}
27
+template<int log2TrSize>
28
+static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
29
+{
30
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
31
+    const int scaleBits = SCALE_BITS - 2 * transformShift;
32
+    const uint32_t trSize = 1 << log2TrSize;
33
+    int max = X265_MAX(0, (2 * transformShift + 1));
34
+
35
+    for (int y = 0; y < MLS_CG_SIZE; y++)
36
+    {
37
+        for (int x = 0; x < MLS_CG_SIZE; x++)
38
+        {
39
+            int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
40
+            int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
41
+
42
+            costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
43
+
44
+            /* when no residual coefficient is coded, predicted coef == recon coef */
45
+            costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
46
+
47
+            *totalUncodedCost += costUncoded[blkPos + x];
48
+            *totalRdCost += costUncoded[blkPos + x];
49
+        }
50
+        blkPos += trSize;
51
+    }
52
+}
53
+template<int log2TrSize>
54
+static void psyRdoQuant_c_1(int16_t *m_resiDctCoeff, /*int16_t  *m_fencDctCoeff, */ int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, /* int64_t *psyScale,*/ uint32_t blkPos)
55
+{
56
+   const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
57
+   const int scaleBits = SCALE_BITS - 2 * transformShift;
58
+   const uint32_t trSize = 1 << log2TrSize;
59
+
60
+   for (int y = 0; y < MLS_CG_SIZE; y++)
61
+   {
62
+       for (int x = 0; x < MLS_CG_SIZE; x++)
63
+       {
64
+           int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
65
+           costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
66
+           *totalUncodedCost += costUncoded[blkPos + x];
67
+           *totalRdCost += costUncoded[blkPos + x];
68
+       }
69
+       blkPos += trSize;
70
+   }
71
+}
72
+template<int log2TrSize>
73
+static void psyRdoQuant_c_2(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
74
+{
75
+   const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
76
+
77
+   const uint32_t trSize = 1 << log2TrSize;
78
+   int max = X265_MAX(0, (2 * transformShift + 1));
79
+
80
+   for (int y = 0; y < MLS_CG_SIZE; y++)
81
+   {
82
+       for (int x = 0; x < MLS_CG_SIZE; x++)
83
+       {
84
+           int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
85
+           int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
86
+           costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
87
+           *totalUncodedCost += costUncoded[blkPos + x];
88
+           *totalRdCost += costUncoded[blkPos + x];
89
+       }
90
+       blkPos += trSize;
91
+   }
92
+}
93
 
94
 namespace X265_NS {
95
 // x265 private namespace
96
-
97
 void setupDCTPrimitives_c(EncoderPrimitives& p)
98
 {
99
     p.dequant_scaling = dequant_scaling_c;
100
     p.dequant_normal = dequant_normal_c;
101
     p.quant = quant_c;
102
     p.nquant = nquant_c;
103
+    p.cu[BLOCK_4x4].nonPsyRdoQuant   = nonPsyRdoQuant_c<2>;
104
+    p.cu[BLOCK_8x8].nonPsyRdoQuant   = nonPsyRdoQuant_c<3>;
105
+    p.cu[BLOCK_16x16].nonPsyRdoQuant = nonPsyRdoQuant_c<4>;
106
+    p.cu[BLOCK_32x32].nonPsyRdoQuant = nonPsyRdoQuant_c<5>;
107
+    p.cu[BLOCK_4x4].psyRdoQuant = psyRdoQuant_c<2>;
108
+    p.cu[BLOCK_8x8].psyRdoQuant = psyRdoQuant_c<3>;
109
+    p.cu[BLOCK_16x16].psyRdoQuant = psyRdoQuant_c<4>;
110
+    p.cu[BLOCK_32x32].psyRdoQuant = psyRdoQuant_c<5>;
111
     p.dst4x4 = dst4_c;
112
     p.cu[BLOCK_4x4].dct   = dct4_c;
113
     p.cu[BLOCK_8x8].dct   = dct8_c;
114
@@ -1013,7 +1104,14 @@
115
     p.cu[BLOCK_8x8].copy_cnt   = copy_count<8>;
116
     p.cu[BLOCK_16x16].copy_cnt = copy_count<16>;
117
     p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
118
-
119
+   p.cu[BLOCK_4x4].psyRdoQuant_1p = psyRdoQuant_c_1<2>;
120
+   p.cu[BLOCK_4x4].psyRdoQuant_2p = psyRdoQuant_c_2<2>;
121
+   p.cu[BLOCK_8x8].psyRdoQuant_1p = psyRdoQuant_c_1<3>;
122
+   p.cu[BLOCK_8x8].psyRdoQuant_2p = psyRdoQuant_c_2<3>;
123
+   p.cu[BLOCK_16x16].psyRdoQuant_1p = psyRdoQuant_c_1<4>;
124
+   p.cu[BLOCK_16x16].psyRdoQuant_2p = psyRdoQuant_c_2<4>;
125
+   p.cu[BLOCK_32x32].psyRdoQuant_1p = psyRdoQuant_c_1<5>;
126
+   p.cu[BLOCK_32x32].psyRdoQuant_2p = psyRdoQuant_c_2<5>;
127
     p.scanPosLast = scanPosLast_c;
128
     p.findPosFirstLast = findPosFirstLast_c;
129
     p.costCoeffNxN = costCoeffNxN_c;
130
x265_2.7.tar.gz/source/common/frame.cpp -> x265_2.9.tar.gz/source/common/frame.cpp Changed
56
 
1
@@ -53,6 +53,7 @@
2
     m_addOnDepth = NULL;
3
     m_addOnCtuInfo = NULL;
4
     m_addOnPrevChange = NULL;
5
+    m_classifyFrame = false;
6
 }
7
 
8
 bool Frame::create(x265_param *param, float* quantOffsets)
9
@@ -82,10 +83,18 @@
10
         m_analysisData.wt = NULL;
11
         m_analysisData.intraData = NULL;
12
         m_analysisData.interData = NULL;
13
-        m_analysis2Pass.analysisFramedata = NULL;
14
+        m_analysisData.distortionData = NULL;
15
     }
16
 
17
-    if (m_fencPic->create(param, !!m_param->bCopyPicToFrame) && m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode || !!param->bAQMotion, param->rc.qgSize))
18
+    if (param->bDynamicRefine)
19
+    {
20
+        int size = m_param->maxCUDepth * X265_REFINE_INTER_LEVELS;
21
+        CHECKED_MALLOC_ZERO(m_classifyRd, uint64_t, size);
22
+        CHECKED_MALLOC_ZERO(m_classifyVariance, uint64_t, size);
23
+        CHECKED_MALLOC_ZERO(m_classifyCount, uint32_t, size);
24
+    }
25
+
26
+    if (m_fencPic->create(param, !!m_param->bCopyPicToFrame) && m_lowres.create(param, m_fencPic, param->rc.qgSize))
27
     {
28
         X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized");
29
         m_numRows = (m_fencPic->m_picHeight + param->maxCUSize - 1)  / param->maxCUSize;
30
@@ -94,11 +103,8 @@
31
 
32
         if (quantOffsets)
33
         {
34
-            int32_t cuCount;
35
-            if (param->rc.qgSize == 8)
36
-                cuCount = m_lowres.maxBlocksInRowFullRes * m_lowres.maxBlocksInColFullRes;
37
-            else
38
-                cuCount = m_lowres.maxBlocksInRow * m_lowres.maxBlocksInCol;
39
+            int32_t cuCount = (param->rc.qgSize == 8) ? m_lowres.maxBlocksInRowFullRes * m_lowres.maxBlocksInColFullRes :
40
+                                                        m_lowres.maxBlocksInRow * m_lowres.maxBlocksInCol;
41
             m_quantOffsets = new float[cuCount];
42
         }
43
         return true;
44
@@ -226,4 +232,11 @@
45
     }
46
     m_lowres.destroy();
47
     X265_FREE(m_rcData);
48
+
49
+    if (m_param->bDynamicRefine)
50
+    {
51
+        X265_FREE_ZERO(m_classifyRd);
52
+        X265_FREE_ZERO(m_classifyVariance);
53
+        X265_FREE_ZERO(m_classifyCount);
54
+    }
55
 }
56
x265_2.7.tar.gz/source/common/frame.h -> x265_2.9.tar.gz/source/common/frame.h Changed
24
 
1
@@ -109,7 +109,6 @@
2
     Frame*                 m_prev;
3
     x265_param*            m_param;              // Points to the latest param set for the frame.
4
     x265_analysis_data     m_analysisData;
5
-    x265_analysis_2Pass    m_analysis2Pass;
6
     RcStats*               m_rcData;
7
 
8
     Event                  m_copyMVType;
9
@@ -122,6 +121,14 @@
10
     uint8_t**              m_addOnDepth;
11
     uint8_t**              m_addOnCtuInfo;
12
     int**                  m_addOnPrevChange;
13
+
14
+    /* Average feature values of frames being considered for classification */
15
+    uint64_t*              m_classifyRd;
16
+    uint64_t*              m_classifyVariance;
17
+    uint32_t*              m_classifyCount;
18
+
19
+    bool                   m_classifyFrame;
20
+
21
     Frame();
22
 
23
     bool create(x265_param *param, float* quantOffsets);
24
x265_2.7.tar.gz/source/common/framedata.cpp -> x265_2.9.tar.gz/source/common/framedata.cpp Changed
53
 
1
@@ -41,9 +41,25 @@
2
     if (param.rc.bStatWrite)
3
         m_spsrps = const_cast<RPS*>(sps.spsrps);
4
     bool isallocated = m_cuMemPool.create(0, param.internalCsp, sps.numCUsInFrame, param);
5
+    if (m_param->bDynamicRefine)
6
+    {
7
+        CHECKED_MALLOC_ZERO(m_cuMemPool.dynRefineRdBlock, uint64_t, MAX_NUM_DYN_REFINE * sps.numCUsInFrame);
8
+        CHECKED_MALLOC_ZERO(m_cuMemPool.dynRefCntBlock, uint32_t, MAX_NUM_DYN_REFINE * sps.numCUsInFrame);
9
+        CHECKED_MALLOC_ZERO(m_cuMemPool.dynRefVarBlock, uint32_t, MAX_NUM_DYN_REFINE * sps.numCUsInFrame);
10
+    }
11
     if (isallocated)
12
+    {
13
         for (uint32_t ctuAddr = 0; ctuAddr < sps.numCUsInFrame; ctuAddr++)
14
+        {
15
+            if (m_param->bDynamicRefine)
16
+            {
17
+                m_picCTU[ctuAddr].m_collectCURd = m_cuMemPool.dynRefineRdBlock + (ctuAddr * MAX_NUM_DYN_REFINE);
18
+                m_picCTU[ctuAddr].m_collectCUVariance = m_cuMemPool.dynRefVarBlock + (ctuAddr * MAX_NUM_DYN_REFINE);
19
+                m_picCTU[ctuAddr].m_collectCUCount = m_cuMemPool.dynRefCntBlock + (ctuAddr * MAX_NUM_DYN_REFINE);
20
+            }
21
             m_picCTU[ctuAddr].initialize(m_cuMemPool, 0, param, ctuAddr);
22
+        }
23
+    }
24
     else
25
         return false;
26
     CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame);
27
@@ -65,6 +81,12 @@
28
 {
29
     memset(m_cuStat, 0, sps.numCUsInFrame * sizeof(*m_cuStat));
30
     memset(m_rowStat, 0, sps.numCuInHeight * sizeof(*m_rowStat));
31
+    if (m_param->bDynamicRefine)
32
+    {
33
+        memset(m_picCTU->m_collectCURd, 0, MAX_NUM_DYN_REFINE * sizeof(uint64_t));
34
+        memset(m_picCTU->m_collectCUVariance, 0, MAX_NUM_DYN_REFINE * sizeof(uint32_t));
35
+        memset(m_picCTU->m_collectCUCount, 0, MAX_NUM_DYN_REFINE * sizeof(uint32_t));
36
+    }
37
 }
38
 
39
 void FrameData::destroy()
40
@@ -75,6 +97,12 @@
41
 
42
     m_cuMemPool.destroy();
43
 
44
+    if (m_param->bDynamicRefine)
45
+    {
46
+        X265_FREE(m_cuMemPool.dynRefineRdBlock);
47
+        X265_FREE(m_cuMemPool.dynRefCntBlock);
48
+        X265_FREE(m_cuMemPool.dynRefVarBlock);
49
+    }
50
     X265_FREE(m_cuStat);
51
     X265_FREE(m_rowStat);
52
     for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
53
x265_2.7.tar.gz/source/common/framedata.h -> x265_2.9.tar.gz/source/common/framedata.h Changed
61
 
1
@@ -88,6 +88,11 @@
2
     uint64_t    cntInterPu[NUM_CU_DEPTH][INTER_MODES - 1];
3
     uint64_t    cntMergePu[NUM_CU_DEPTH][INTER_MODES - 1];
4
 
5
+    /* Feature values per row for dynamic refinement */
6
+    uint64_t       rowRdDyn[MAX_NUM_DYN_REFINE];
7
+    uint32_t       rowVarDyn[MAX_NUM_DYN_REFINE];
8
+    uint32_t       rowCntDyn[MAX_NUM_DYN_REFINE];
9
+
10
     FrameStats()
11
     {
12
         memset(this, 0, sizeof(FrameStats));
13
@@ -174,47 +179,5 @@
14
     inline CUData* getPicCTU(uint32_t ctuAddr) { return &m_picCTU[ctuAddr]; }
15
 };
16
 
17
-/* Stores intra analysis data for a single frame. This struct needs better packing */
18
-struct analysis_intra_data
19
-{
20
-    uint8_t*  depth;
21
-    uint8_t*  modes;
22
-    char*     partSizes;
23
-    uint8_t*  chromaModes;
24
-};
25
-
26
-/* Stores inter analysis data for a single frame */
27
-struct analysis_inter_data
28
-{
29
-    int32_t*    ref;
30
-    uint8_t*    depth;
31
-    uint8_t*    modes;
32
-    uint8_t*    partSize;
33
-    uint8_t*    mergeFlag;
34
-    uint8_t*    interDir;
35
-    uint8_t*    mvpIdx[2];
36
-    int8_t*     refIdx[2];
37
-    MV*         mv[2];
38
-   int64_t*     sadCost;
39
-};
40
-
41
-struct analysis2PassFrameData
42
-{
43
-    uint8_t*      depth;
44
-    MV*           m_mv[2];
45
-    int*          mvpIdx[2];
46
-    int32_t*      ref[2];
47
-    uint8_t*      modes;
48
-    sse_t*        distortion;
49
-    sse_t*        ctuDistortion;
50
-    double*       scaledDistortion;
51
-    double        averageDistortion;
52
-    double        sdDistortion;
53
-    uint32_t      highDistortionCtuCount;
54
-    uint32_t      lowDistortionCtuCount;
55
-    double*       offset;
56
-    double*       threshold;
57
-};
58
-
59
 }
60
 #endif // ifndef X265_FRAMEDATA_H
61
x265_2.7.tar.gz/source/common/ipfilter.cpp -> x265_2.9.tar.gz/source/common/ipfilter.cpp Changed
41
 
1
@@ -379,7 +379,8 @@
2
     p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>;  \
3
     p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>;  \
4
     p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
5
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
6
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_c<W, H>;\
7
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_c<W, H>;
8
 
9
 #define CHROMA_422(W, H) \
10
     p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
11
@@ -388,7 +389,8 @@
12
     p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>;  \
13
     p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>;  \
14
     p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
15
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
16
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_c<W, H>;\
17
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_c<W, H>;
18
 
19
 #define CHROMA_444(W, H) \
20
     p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
21
@@ -397,7 +399,8 @@
22
     p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>;  \
23
     p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>;  \
24
     p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
25
-    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s = filterPixelToShort_c<W, H>;
26
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s[NONALIGNED] = filterPixelToShort_c<W, H>;\
27
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].p2s[ALIGNED] = filterPixelToShort_c<W, H>;
28
 
29
 #define LUMA(W, H) \
30
     p.pu[LUMA_ ## W ## x ## H].luma_hpp     = interp_horiz_pp_c<8, W, H>; \
31
@@ -407,7 +410,8 @@
32
     p.pu[LUMA_ ## W ## x ## H].luma_vsp     = interp_vert_sp_c<8, W, H>;  \
33
     p.pu[LUMA_ ## W ## x ## H].luma_vss     = interp_vert_ss_c<8, W, H>;  \
34
     p.pu[LUMA_ ## W ## x ## H].luma_hvpp    = interp_hv_pp_c<8, W, H>; \
35
-    p.pu[LUMA_ ## W ## x ## H].convert_p2s = filterPixelToShort_c<W, H>;
36
+    p.pu[LUMA_ ## W ## x ## H].convert_p2s[NONALIGNED] = filterPixelToShort_c<W, H>;\
37
+    p.pu[LUMA_ ## W ## x ## H].convert_p2s[ALIGNED] = filterPixelToShort_c<W, H>;
38
 
39
 void setupFilterPrimitives_c(EncoderPrimitives& p)
40
 {
41
x265_2.7.tar.gz/source/common/lowres.cpp -> x265_2.9.tar.gz/source/common/lowres.cpp Changed
66
 
1
@@ -27,10 +27,10 @@
2
 
3
 using namespace X265_NS;
4
 
5
-bool Lowres::create(PicYuv *origPic, int _bframes, bool bAQEnabled, uint32_t qgSize)
6
+bool Lowres::create(x265_param* param, PicYuv *origPic, uint32_t qgSize)
7
 {
8
     isLowres = true;
9
-    bframes = _bframes;
10
+    bframes = param->bframes;
11
     width = origPic->m_picWidth / 2;
12
     lines = origPic->m_picHeight / 2;
13
     lumaStride = width + 2 * origPic->m_lumaMarginX;
14
@@ -41,11 +41,7 @@
15
     maxBlocksInRowFullRes = maxBlocksInRow * 2;
16
     maxBlocksInColFullRes = maxBlocksInCol * 2;
17
     int cuCount = maxBlocksInRow * maxBlocksInCol;
18
-    int cuCountFullRes;
19
-    if (qgSize == 8)
20
-        cuCountFullRes = maxBlocksInRowFullRes * maxBlocksInColFullRes;
21
-    else
22
-        cuCountFullRes = cuCount;
23
+    int cuCountFullRes = (qgSize > 8) ? cuCount : cuCount << 2;
24
 
25
     /* rounding the width to multiple of lowres CU size */
26
     width = maxBlocksInRow * X265_LOWRES_CU_SIZE;
27
@@ -53,16 +49,18 @@
28
 
29
     size_t planesize = lumaStride * (lines + 2 * origPic->m_lumaMarginY);
30
     size_t padoffset = lumaStride * origPic->m_lumaMarginY + origPic->m_lumaMarginX;
31
-    if (bAQEnabled)
32
+    if (!!param->rc.aqMode)
33
     {
34
         CHECKED_MALLOC_ZERO(qpAqOffset, double, cuCountFullRes);
35
-        CHECKED_MALLOC_ZERO(qpAqMotionOffset, double, cuCountFullRes);
36
         CHECKED_MALLOC_ZERO(invQscaleFactor, int, cuCountFullRes);
37
         CHECKED_MALLOC_ZERO(qpCuTreeOffset, double, cuCountFullRes);
38
-        CHECKED_MALLOC_ZERO(blockVariance, uint32_t, cuCountFullRes);
39
         if (qgSize == 8)
40
             CHECKED_MALLOC_ZERO(invQscaleFactor8x8, int, cuCount);
41
     }
42
+    if (origPic->m_param->bAQMotion)
43
+        CHECKED_MALLOC_ZERO(qpAqMotionOffset, double, cuCountFullRes);
44
+    if (origPic->m_param->bDynamicRefine)
45
+        CHECKED_MALLOC_ZERO(blockVariance, uint32_t, cuCountFullRes);
46
     CHECKED_MALLOC(propagateCost, uint16_t, cuCount);
47
 
48
     /* allocate lowres buffers */
49
@@ -126,14 +124,13 @@
50
         X265_FREE(lowresMvCosts[1][i]);
51
     }
52
     X265_FREE(qpAqOffset);
53
-    X265_FREE(qpAqMotionOffset);
54
     X265_FREE(invQscaleFactor);
55
     X265_FREE(qpCuTreeOffset);
56
     X265_FREE(propagateCost);
57
-    X265_FREE(blockVariance);
58
     X265_FREE(invQscaleFactor8x8);
59
+    X265_FREE(qpAqMotionOffset);
60
+    X265_FREE(blockVariance);
61
 }
62
-
63
 // (re) initialize lowres state
64
 void Lowres::init(PicYuv *origPic, int poc)
65
 {
66
x265_2.7.tar.gz/source/common/lowres.h -> x265_2.9.tar.gz/source/common/lowres.h Changed
35
 
1
@@ -69,7 +69,7 @@
2
             int qmvy = qmv.y + (qmv.y & 1);
3
             int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
4
             pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
5
-            primitives.pu[LUMA_8x8].pixelavg_pp(buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);
6
+            primitives.pu[LUMA_8x8].pixelavg_pp[(outstride % 64 == 0) && (lumaStride % 64 == 0)](buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);
7
             return buf;
8
         }
9
         else
10
@@ -91,7 +91,7 @@
11
             int qmvy = qmv.y + (qmv.y & 1);
12
             int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
13
             pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
14
-            primitives.pu[LUMA_8x8].pixelavg_pp(subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32);
15
+            primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32);
16
             return comp(fenc, FENC_STRIDE, subpelbuf, 8);
17
         }
18
         else
19
@@ -152,14 +152,12 @@
20
     uint32_t* blockVariance;
21
     uint64_t  wp_ssd[3];       // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
22
     uint64_t  wp_sum[3];
23
-    uint64_t  frameVariance;
24
 
25
     /* cutree intermediate data */
26
     uint16_t* propagateCost;
27
     double    weightedCostDelta[X265_BFRAME_MAX + 2];
28
     ReferencePlanes weightedRef[X265_BFRAME_MAX + 2];
29
-
30
-    bool create(PicYuv *origPic, int _bframes, bool bAqEnabled, uint32_t qgSize);
31
+    bool create(x265_param* param, PicYuv *origPic, uint32_t qgSize);
32
     void destroy();
33
     void init(PicYuv *origPic, int poc);
34
 };
35
x265_2.7.tar.gz/source/common/param.cpp -> x265_2.9.tar.gz/source/common/param.cpp Changed
224
 
1
@@ -105,7 +105,7 @@
2
     memset(param, 0, sizeof(x265_param));
3
 
4
     /* Applying default values to all elements in the param structure */
5
-    param->cpuid = X265_NS::cpu_detect();
6
+    param->cpuid = X265_NS::cpu_detect(false);
7
     param->bEnableWavefront = 1;
8
     param->frameNumThreads = 0;
9
 
10
@@ -133,6 +133,7 @@
11
     param->bEmitHRDSEI = 0;
12
     param->bEmitInfoSEI = 1;
13
     param->bEmitHDRSEI = 0;
14
+    param->bEmitIDRRecoverySEI = 0;
15
 
16
     /* CU definitions */
17
     param->maxCUSize = 64;
18
@@ -155,6 +156,9 @@
19
     param->lookaheadThreads = 0;
20
     param->scenecutBias = 5.0;
21
     param->radl = 0;
22
+    param->chunkStart = 0;
23
+    param->chunkEnd = 0;
24
+
25
     /* Intra Coding Tools */
26
     param->bEnableConstrainedIntra = 0;
27
     param->bEnableStrongIntraSmoothing = 1;
28
@@ -192,6 +196,7 @@
29
     param->bEnableSAO = 1;
30
     param->bSaoNonDeblocked = 0;
31
     param->bLimitSAO = 0;
32
+
33
     /* Coding Quality */
34
     param->cbQpOffset = 0;
35
     param->crQpOffset = 0;
36
@@ -289,16 +294,24 @@
37
     param->scaleFactor = 0;
38
     param->intraRefine = 0;
39
     param->interRefine = 0;
40
+    param->bDynamicRefine = 0;
41
     param->mvRefine = 0;
42
     param->bUseAnalysisFile = 1;
43
     param->csvfpt = NULL;
44
     param->forceFlush = 0;
45
     param->bDisableLookahead = 0;
46
     param->bCopyPicToFrame = 1;
47
+    param->maxAUSizeFactor = 1;
48
+    param->naluFile = NULL;
49
 
50
     /* DCT Approximations */
51
     param->bLowPassDct = 0;
52
     param->bMVType = 0;
53
+    param->bSingleSeiNal = 0;
54
+
55
+    /* SEI messages */
56
+    param->preferredTransferCharacteristics = -1;
57
+    param->pictureStructure = -1;
58
 }
59
 
60
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
61
@@ -606,10 +619,26 @@
62
     if (0) ;
63
     OPT("asm")
64
     {
65
+#if X265_ARCH_X86
66
+        if (!strcasecmp(value, "avx512"))
67
+        {
68
+            p->cpuid = X265_NS::cpu_detect(true);
69
+            if (!(p->cpuid & X265_CPU_AVX512))
70
+                x265_log(p, X265_LOG_WARNING, "AVX512 is not supported\n");
71
+        }
72
+        else
73
+        {
74
+            if (bValueWasNull)
75
+                p->cpuid = atobool(value);
76
+            else
77
+                p->cpuid = parseCpuName(value, bError, false);
78
+        }
79
+#else
80
         if (bValueWasNull)
81
             p->cpuid = atobool(value);
82
         else
83
-            p->cpuid = parseCpuName(value, bError);
84
+            p->cpuid = parseCpuName(value, bError, false);
85
+#endif
86
     }
87
     OPT("fps")
88
     {
89
@@ -981,6 +1010,7 @@
90
         OPT("limit-sao") p->bLimitSAO = atobool(value);
91
         OPT("dhdr10-info") p->toneMapFile = strdup(value);
92
         OPT("dhdr10-opt") p->bDhdr10opt = atobool(value);
93
+        OPT("idr-recovery-sei") p->bEmitIDRRecoverySEI = atobool(value);
94
         OPT("const-vbv") p->rc.bEnableConstVbv = atobool(value);
95
         OPT("ctu-info") p->bCTUInfo = atoi(value);
96
         OPT("scale-factor") p->scaleFactor = atoi(value);
97
@@ -989,7 +1019,7 @@
98
         OPT("refine-mv")p->mvRefine = atobool(value);
99
         OPT("force-flush")p->forceFlush = atoi(value);
100
         OPT("splitrd-skip") p->bEnableSplitRdSkip = atobool(value);
101
-       OPT("lowpass-dct") p->bLowPassDct = atobool(value);
102
+        OPT("lowpass-dct") p->bLowPassDct = atobool(value);
103
         OPT("vbv-end") p->vbvBufferEnd = atof(value);
104
         OPT("vbv-end-fr-adj") p->vbvEndFrameAdjust = atof(value);
105
         OPT("copy-pic") p->bCopyPicToFrame = atobool(value);
106
@@ -1007,11 +1037,19 @@
107
             {
108
                 bError = true;
109
             }
110
-         }
111
+        }
112
         OPT("gop-lookahead") p->gopLookahead = atoi(value);
113
         OPT("analysis-save") p->analysisSave = strdup(value);
114
         OPT("analysis-load") p->analysisLoad = strdup(value);
115
         OPT("radl") p->radl = atoi(value);
116
+        OPT("max-ausize-factor") p->maxAUSizeFactor = atof(value);
117
+        OPT("dynamic-refine") p->bDynamicRefine = atobool(value);
118
+        OPT("single-sei") p->bSingleSeiNal = atobool(value);
119
+        OPT("atc-sei") p->preferredTransferCharacteristics = atoi(value);
120
+        OPT("pic-struct") p->pictureStructure = atoi(value);
121
+        OPT("chunk-start") p->chunkStart = atoi(value);
122
+        OPT("chunk-end") p->chunkEnd = atoi(value);
123
+        OPT("nalu-file") p->naluFile = strdup(value);
124
         else
125
             return X265_PARAM_BAD_NAME;
126
     }
127
@@ -1054,7 +1092,7 @@
128
  *   false || no  - disabled
129
  *   integer bitmap value
130
  *   comma separated list of SIMD names, eg: SSE4.1,XOP */
131
-int parseCpuName(const char* value, bool& bError)
132
+int parseCpuName(const char* value, bool& bError, bool bEnableavx512)
133
 {
134
     if (!value)
135
     {
136
@@ -1065,7 +1103,7 @@
137
     if (isdigit(value[0]))
138
         cpu = x265_atoi(value, bError);
139
     else
140
-        cpu = !strcmp(value, "auto") || x265_atobool(value, bError) ? X265_NS::cpu_detect() : 0;
141
+        cpu = !strcmp(value, "auto") || x265_atobool(value, bError) ? X265_NS::cpu_detect(bEnableavx512) : 0;
142
 
143
     if (bError)
144
     {
145
@@ -1365,8 +1403,10 @@
146
         "Supported values for bCTUInfo are 0, 1, 2, 4, 6");
147
     CHECK(param->interRefine > 3 || param->interRefine < 0,
148
         "Invalid refine-inter value, refine-inter levels 0 to 3 supported");
149
-    CHECK(param->intraRefine > 3 || param->intraRefine < 0,
150
+    CHECK(param->intraRefine > 4 || param->intraRefine < 0,
151
         "Invalid refine-intra value, refine-intra levels 0 to 3 supported");
152
+    CHECK(param->maxAUSizeFactor < 0.5 || param->maxAUSizeFactor > 1.0,
153
+        "Supported factor for controlling max AU size is from 0.5 to 1");
154
 #if !X86_64
155
     CHECK(param->searchMethod == X265_SEA && (param->sourceWidth > 840 || param->sourceHeight > 480),
156
         "SEA motion search does not support resolutions greater than 480p in 32 bit build");
157
@@ -1375,6 +1415,21 @@
158
     if (param->masteringDisplayColorVolume || param->maxFALL || param->maxCLL)
159
         param->bEmitHDRSEI = 1;
160
 
161
+    bool isSingleSEI = (param->bRepeatHeaders
162
+                     || param->bEmitHRDSEI
163
+                     || param->bEmitInfoSEI
164
+                     || param->bEmitHDRSEI
165
+                     || param->bEmitIDRRecoverySEI
166
+                   || !!param->interlaceMode
167
+                     || param->preferredTransferCharacteristics > 1
168
+                     || param->toneMapFile
169
+                     || param->naluFile);
170
+
171
+    if (!isSingleSEI && param->bSingleSeiNal)
172
+    {
173
+        param->bSingleSeiNal = 0;
174
+        x265_log(param, X265_LOG_WARNING, "None of the SEI messages are enabled. Disabling Single SEI NAL\n");
175
+    }
176
     return check_failed;
177
 }
178
 
179
@@ -1504,6 +1559,7 @@
180
     TOOLVAL(param->bCTUInfo, "ctu-info=%d");
181
     if (param->bMVType == AVC_INFO)
182
         TOOLOPT(param->bMVType, "refine-mv-type=avc");
183
+    TOOLOPT(param->bDynamicRefine, "dynamic-refine");
184
     if (param->maxSlices > 1)
185
         TOOLVAL(param->maxSlices, "slices=%d");
186
     if (param->bEnableLoopFilter)
187
@@ -1520,6 +1576,7 @@
188
     TOOLOPT(!param->bSaoNonDeblocked && param->bEnableSAO, "sao");
189
     TOOLOPT(param->rc.bStatWrite, "stats-write");
190
     TOOLOPT(param->rc.bStatRead,  "stats-read");
191
+    TOOLOPT(param->bSingleSeiNal, "single-sei");
192
 #if ENABLE_HDR10_PLUS
193
     TOOLOPT(param->toneMapFile != NULL, "dhdr10-info");
194
 #endif
195
@@ -1560,6 +1617,10 @@
196
     s += sprintf(s, " input-res=%dx%d", p->sourceWidth - padx, p->sourceHeight - pady);
197
     s += sprintf(s, " interlace=%d", p->interlaceMode);
198
     s += sprintf(s, " total-frames=%d", p->totalFrames);
199
+    if (p->chunkStart)
200
+        s += sprintf(s, " chunk-start=%d", p->chunkStart);
201
+    if (p->chunkEnd)
202
+        s += sprintf(s, " chunk-end=%d", p->chunkEnd);
203
     s += sprintf(s, " level-idc=%d", p->levelIdc);
204
     s += sprintf(s, " high-tier=%d", p->bHighTier);
205
     s += sprintf(s, " uhd-bd=%d", p->uhdBluray);
206
@@ -1726,6 +1787,7 @@
207
     BOOL(p->bEmitHDRSEI, "hdr");
208
     BOOL(p->bHDROpt, "hdr-opt");
209
     BOOL(p->bDhdr10opt, "dhdr10-opt");
210
+    BOOL(p->bEmitIDRRecoverySEI, "idr-recovery-sei");
211
     if (p->analysisSave)
212
         s += sprintf(s, " analysis-save");
213
     if (p->analysisLoad)
214
@@ -1740,6 +1802,9 @@
215
     BOOL(p->bLowPassDct, "lowpass-dct");
216
     s += sprintf(s, " refine-mv-type=%d", p->bMVType);
217
     s += sprintf(s, " copy-pic=%d", p->bCopyPicToFrame);
218
+    s += sprintf(s, " max-ausize-factor=%.1f", p->maxAUSizeFactor);
219
+    BOOL(p->bDynamicRefine, "dynamic-refine");
220
+    BOOL(p->bSingleSeiNal, "single-sei");
221
 #undef BOOL
222
     return buf;
223
 }
224
x265_2.7.tar.gz/source/common/param.h -> x265_2.9.tar.gz/source/common/param.h Changed
10
 
1
@@ -33,7 +33,7 @@
2
 char* x265_param2string(x265_param *param, int padx, int pady);
3
 int   x265_atoi(const char *str, bool& bError);
4
 double x265_atof(const char *str, bool& bError);
5
-int   parseCpuName(const char *value, bool& bError);
6
+int   parseCpuName(const char *value, bool& bError, bool bEnableavx512);
7
 void  setParamAspectRatio(x265_param *p, int width, int height);
8
 void  getParamAspectRatio(x265_param *p, int& width, int& height);
9
 bool  parseLambdaFile(x265_param *param);
10
x265_2.7.tar.gz/source/common/picyuv.cpp -> x265_2.9.tar.gz/source/common/picyuv.cpp Changed
21
 
1
@@ -358,6 +358,19 @@
2
     pixel *uPic = m_picOrg[1];
3
     pixel *vPic = m_picOrg[2];
4
 
5
+    if(param.minLuma != 0 || param.maxLuma != PIXEL_MAX)
6
+    {
7
+        for (int r = 0; r < height; r++)
8
+        {
9
+            for (int c = 0; c < width; c++)
10
+            {
11
+                yPic[c] = X265_MIN(yPic[c], (pixel)param.maxLuma);
12
+                yPic[c] = X265_MAX(yPic[c], (pixel)param.minLuma);
13
+            }
14
+            yPic += m_stride;
15
+        }
16
+    }
17
+    yPic = m_picOrg[0];
18
     if (param.csvLogLevel >= 2 || param.maxCLL || param.maxFALL)
19
     {
20
         for (int r = 0; r < height; r++)
21
x265_2.7.tar.gz/source/common/picyuv.h -> x265_2.9.tar.gz/source/common/picyuv.h Changed
9
 
1
@@ -72,6 +72,7 @@
2
     pixel   m_maxChromaVLevel;
3
     pixel   m_minChromaVLevel;
4
     double  m_avgChromaVLevel;
5
+    double  m_vmafScore;
6
     x265_param *m_param;
7
 
8
     PicYuv();
9
x265_2.7.tar.gz/source/common/pixel.cpp -> x265_2.9.tar.gz/source/common/pixel.cpp Changed
102
 
1
@@ -922,7 +922,7 @@
2
 static void cuTreeFix8Pack(uint16_t *dst, double *src, int count)
3
 {
4
     for (int i = 0; i < count; i++)
5
-        dst[i] = (uint16_t)(src[i] * 256.0);
6
+        dst[i] = (uint16_t)(int16_t)(src[i] * 256.0);
7
 }
8
 
9
 static void cuTreeFix8Unpack(double *dst, uint16_t *src, int count)
10
@@ -986,28 +986,34 @@
11
 {
12
 #define LUMA_PU(W, H) \
13
     p.pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
14
-    p.pu[LUMA_ ## W ## x ## H].addAvg = addAvg<W, H>; \
15
+    p.pu[LUMA_ ## W ## x ## H].addAvg[NONALIGNED] = addAvg<W, H>; \
16
+    p.pu[LUMA_ ## W ## x ## H].addAvg[ALIGNED] = addAvg<W, H>; \
17
     p.pu[LUMA_ ## W ## x ## H].sad = sad<W, H>; \
18
     p.pu[LUMA_ ## W ## x ## H].sad_x3 = sad_x3<W, H>; \
19
     p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4<W, H>; \
20
-    p.pu[LUMA_ ## W ## x ## H].pixelavg_pp = pixelavg_pp<W, H>;
21
-
22
+    p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[NONALIGNED] = pixelavg_pp<W, H>; \
23
+    p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[ALIGNED] = pixelavg_pp<W, H>;
24
 #define LUMA_CU(W, H) \
25
     p.cu[BLOCK_ ## W ## x ## H].sub_ps        = pixel_sub_ps_c<W, H>; \
26
-    p.cu[BLOCK_ ## W ## x ## H].add_ps        = pixel_add_ps_c<W, H>; \
27
+    p.cu[BLOCK_ ## W ## x ## H].add_ps[NONALIGNED]    = pixel_add_ps_c<W, H>; \
28
+    p.cu[BLOCK_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_c<W, H>; \
29
     p.cu[BLOCK_ ## W ## x ## H].copy_sp       = blockcopy_sp_c<W, H>; \
30
     p.cu[BLOCK_ ## W ## x ## H].copy_ps       = blockcopy_ps_c<W, H>; \
31
     p.cu[BLOCK_ ## W ## x ## H].copy_ss       = blockcopy_ss_c<W, H>; \
32
-    p.cu[BLOCK_ ## W ## x ## H].blockfill_s   = blockfill_s_c<W>;  \
33
+    p.cu[BLOCK_ ## W ## x ## H].blockfill_s[NONALIGNED] = blockfill_s_c<W>;  \
34
+    p.cu[BLOCK_ ## W ## x ## H].blockfill_s[ALIGNED]    = blockfill_s_c<W>;  \
35
     p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl<W>; \
36
     p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shr = cpy2Dto1D_shr<W>; \
37
-    p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl = cpy1Dto2D_shl<W>; \
38
+    p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl<W>; \
39
+    p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl<W>; \
40
     p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shr = cpy1Dto2D_shr<W>; \
41
     p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp   = psyCost_pp<BLOCK_ ## W ## x ## H>; \
42
     p.cu[BLOCK_ ## W ## x ## H].transpose     = transpose<W>; \
43
-    p.cu[BLOCK_ ## W ## x ## H].ssd_s         = pixel_ssd_s_c<W>; \
44
+    p.cu[BLOCK_ ## W ## x ## H].ssd_s[NONALIGNED]         = pixel_ssd_s_c<W>; \
45
+    p.cu[BLOCK_ ## W ## x ## H].ssd_s[ALIGNED] = pixel_ssd_s_c<W>; \
46
     p.cu[BLOCK_ ## W ## x ## H].var           = pixel_var<W>; \
47
-    p.cu[BLOCK_ ## W ## x ## H].calcresidual  = getResidual<W>; \
48
+    p.cu[BLOCK_ ## W ## x ## H].calcresidual[NONALIGNED]  = getResidual<W>; \
49
+    p.cu[BLOCK_ ## W ## x ## H].calcresidual[ALIGNED]     = getResidual<W>; \
50
     p.cu[BLOCK_ ## W ## x ## H].sse_pp        = sse<W, H, pixel, pixel>; \
51
     p.cu[BLOCK_ ## W ## x ## H].sse_ss        = sse<W, H, int16_t, int16_t>;
52
 
53
@@ -1102,7 +1108,8 @@
54
     p.cu[BLOCK_64x64].sa8d = sa8d16<64, 64>;
55
 
56
 #define CHROMA_PU_420(W, H) \
57
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg  = addAvg<W, H>;         \
58
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg[NONALIGNED]  = addAvg<W, H>;         \
59
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg[ALIGNED]  = addAvg<W, H>;         \
60
     p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
61
 
62
     CHROMA_PU_420(2, 2);
63
@@ -1165,7 +1172,8 @@
64
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
65
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
66
     p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>;  \
67
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>;
68
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_c<W, H>; \
69
+    p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_c<W, H>;
70
 
71
     CHROMA_CU_420(2, 2)
72
     CHROMA_CU_420(4, 4)
73
@@ -1179,7 +1187,8 @@
74
     p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = sa8d16<32, 32>;
75
 
76
 #define CHROMA_PU_422(W, H) \
77
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg  = addAvg<W, H>;         \
78
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg[NONALIGNED]  = addAvg<W, H>;         \
79
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg[ALIGNED]  = addAvg<W, H>;         \
80
     p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].copy_pp = blockcopy_pp_c<W, H>; \
81
 
82
     CHROMA_PU_422(2, 4);
83
@@ -1242,7 +1251,8 @@
84
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
85
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
86
     p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_c<W, H>; \
87
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps = pixel_add_ps_c<W, H>;
88
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_c<W, H>; \
89
+    p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_c<W, H>;
90
 
91
     CHROMA_CU_422(2, 4)
92
     CHROMA_CU_422(4, 8)
93
@@ -1258,7 +1268,7 @@
94
     p.weight_pp = weight_pp_c;
95
     p.weight_sp = weight_sp_c;
96
 
97
-    p.scale1D_128to64 = scale1D_128to64;
98
+    p.scale1D_128to64[NONALIGNED] = p.scale1D_128to64[ALIGNED] = scale1D_128to64;
99
     p.scale2D_64to32 = scale2D_64to32;
100
     p.frameInitLowres = frame_init_lowres_core;
101
     p.ssim_4x4x2_core = ssim_4x4x2_core;
102
x265_2.7.tar.gz/source/common/predict.cpp -> x265_2.9.tar.gz/source/common/predict.cpp Changed
72
 
1
@@ -91,7 +91,7 @@
2
         MV mv0 = cu.m_mv[0][pu.puAbsPartIdx];
3
         cu.clipMv(mv0);
4
 
5
-        if (cu.m_slice->m_pps->bUseWeightPred && wp0->bPresentFlag)
6
+        if (cu.m_slice->m_pps->bUseWeightPred && wp0->wtPresent)
7
         {
8
             for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
9
             {
10
@@ -133,7 +133,7 @@
11
             pwp0 = refIdx0 >= 0 ? cu.m_slice->m_weightPredTable[0][refIdx0] : NULL;
12
             pwp1 = refIdx1 >= 0 ? cu.m_slice->m_weightPredTable[1][refIdx1] : NULL;
13
 
14
-            if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
15
+            if (pwp0 && pwp1 && (pwp0->wtPresent || pwp1->wtPresent))
16
             {
17
                 /* biprediction weighting */
18
                 for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
19
@@ -183,7 +183,7 @@
20
                 predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
21
             }
22
 
23
-            if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
24
+            if (pwp0 && pwp1 && (pwp0->wtPresent || pwp1->wtPresent))
25
                 addWeightBi(pu, predYuv, m_predShortYuv[0], m_predShortYuv[1], wv0, wv1, bLuma, bChroma);
26
             else
27
                 predYuv.addAvg(m_predShortYuv[0], m_predShortYuv[1], pu.puAbsPartIdx, pu.width, pu.height, bLuma, bChroma);
28
@@ -193,7 +193,7 @@
29
             MV mv0 = cu.m_mv[0][pu.puAbsPartIdx];
30
             cu.clipMv(mv0);
31
 
32
-            if (pwp0 && pwp0->bPresentFlag)
33
+            if (pwp0 && pwp0->wtPresent)
34
             {
35
                 ShortYuv& shortYuv = m_predShortYuv[0];
36
 
37
@@ -220,7 +220,7 @@
38
             /* uniprediction to L1 */
39
             X265_CHECK(refIdx1 >= 0, "refidx1 was not positive\n");
40
 
41
-            if (pwp1 && pwp1->bPresentFlag)
42
+            if (pwp1 && pwp1->wtPresent)
43
             {
44
                 ShortYuv& shortYuv = m_predShortYuv[0];
45
 
46
@@ -283,7 +283,11 @@
47
     int yFrac = mv.y & 3;
48
 
49
     if (!(yFrac | xFrac))
50
-        primitives.pu[partEnum].convert_p2s(src, srcStride, dst, dstStride);
51
+    {
52
+        bool srcbufferAlignCheck = (refPic.m_cuOffsetY[pu.ctuAddr] + refPic.m_buOffsetY[pu.cuAbsPartIdx + pu.puAbsPartIdx] + srcOffset) % 64 == 0;
53
+        bool dstbufferAlignCheck = (dstSYuv.getAddrOffset(pu.puAbsPartIdx, dstSYuv.m_size) % 64) == 0;
54
+        primitives.pu[partEnum].convert_p2s[srcStride % 64 == 0 && dstStride % 64 == 0 && srcbufferAlignCheck && dstbufferAlignCheck](src, srcStride, dst, dstStride);
55
+    }
56
     else if (!yFrac)
57
         primitives.pu[partEnum].luma_hps(src, srcStride, dst, dstStride, xFrac, 0);
58
     else if (!xFrac)
59
@@ -375,8 +379,10 @@
60
 
61
     if (!(yFrac | xFrac))
62
     {
63
-        primitives.chroma[m_csp].pu[partEnum].p2s(refCb, refStride, dstCb, dstStride);
64
-        primitives.chroma[m_csp].pu[partEnum].p2s(refCr, refStride, dstCr, dstStride);
65
+        bool srcbufferAlignCheckC = (refPic.m_cuOffsetC[pu.ctuAddr] + refPic.m_buOffsetC[pu.cuAbsPartIdx + pu.puAbsPartIdx] + refOffset) % 64 == 0;
66
+        bool dstbufferAlignCheckC = dstSYuv.getChromaAddrOffset(pu.puAbsPartIdx) % 64 == 0;
67
+        primitives.chroma[m_csp].pu[partEnum].p2s[refStride % 64 == 0 && dstStride % 64 == 0 && srcbufferAlignCheckC && dstbufferAlignCheckC](refCb, refStride, dstCb, dstStride);
68
+        primitives.chroma[m_csp].pu[partEnum].p2s[refStride % 64 == 0 && dstStride % 64 == 0 && srcbufferAlignCheckC && dstbufferAlignCheckC](refCr, refStride, dstCr, dstStride);
69
     }
70
     else if (!yFrac)
71
     {
72
x265_2.7.tar.gz/source/common/primitives.cpp -> x265_2.9.tar.gz/source/common/primitives.cpp Changed
25
 
1
@@ -114,9 +114,11 @@
2
     for (int i = 0; i < NUM_PU_SIZES; i++)
3
     {
4
         p.chroma[X265_CSP_I444].pu[i].copy_pp = p.pu[i].copy_pp;
5
-        p.chroma[X265_CSP_I444].pu[i].addAvg  = p.pu[i].addAvg;
6
+        p.chroma[X265_CSP_I444].pu[i].addAvg[NONALIGNED]  = p.pu[i].addAvg[NONALIGNED];
7
+        p.chroma[X265_CSP_I444].pu[i].addAvg[ALIGNED] = p.pu[i].addAvg[ALIGNED];
8
         p.chroma[X265_CSP_I444].pu[i].satd    = p.pu[i].satd;
9
-        p.chroma[X265_CSP_I444].pu[i].p2s     = p.pu[i].convert_p2s;
10
+        p.chroma[X265_CSP_I444].pu[i].p2s[NONALIGNED]     = p.pu[i].convert_p2s[NONALIGNED];
11
+        p.chroma[X265_CSP_I444].pu[i].p2s[ALIGNED] = p.pu[i].convert_p2s[ALIGNED];
12
     }
13
 
14
     for (int i = 0; i < NUM_CU_SIZES; i++)
15
@@ -124,7 +126,8 @@
16
         p.chroma[X265_CSP_I444].cu[i].sa8d    = p.cu[i].sa8d;
17
         p.chroma[X265_CSP_I444].cu[i].sse_pp  = p.cu[i].sse_pp;
18
         p.chroma[X265_CSP_I444].cu[i].sub_ps  = p.cu[i].sub_ps;
19
-        p.chroma[X265_CSP_I444].cu[i].add_ps  = p.cu[i].add_ps;
20
+        p.chroma[X265_CSP_I444].cu[i].add_ps[NONALIGNED]  = p.cu[i].add_ps[NONALIGNED];
21
+        p.chroma[X265_CSP_I444].cu[i].add_ps[ALIGNED] = p.cu[i].add_ps[ALIGNED];
22
         p.chroma[X265_CSP_I444].cu[i].copy_ps = p.cu[i].copy_ps;
23
         p.chroma[X265_CSP_I444].cu[i].copy_sp = p.cu[i].copy_sp;
24
         p.chroma[X265_CSP_I444].cu[i].copy_ss = p.cu[i].copy_ss;
25
x265_2.7.tar.gz/source/common/primitives.h -> x265_2.9.tar.gz/source/common/primitives.h Changed
117
 
1
@@ -62,6 +62,13 @@
2
     NUM_CU_SIZES
3
 };
4
 
5
+enum AlignPrimitive
6
+{
7
+    NONALIGNED,
8
+    ALIGNED,
9
+    NUM_ALIGNMENT_TYPES
10
+};
11
+
12
 enum { NUM_TR_SIZE = 4 }; // TU are 4x4, 8x8, 16x16, and 32x32
13
 
14
 
15
@@ -216,7 +223,10 @@
16
 
17
 typedef void (*integralv_t)(uint32_t *sum, intptr_t stride);
18
 typedef void (*integralh_t)(uint32_t *sum, pixel *pix, intptr_t stride);
19
-
20
+typedef void(*nonPsyRdoQuant_t)(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
21
+typedef void(*psyRdoQuant_t)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);
22
+typedef void(*psyRdoQuant_t1)(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost,uint32_t blkPos);
23
+typedef void(*psyRdoQuant_t2)(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos);
24
 /* Function pointers to optimized encoder primitives. Each pointer can reference
25
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
26
 struct EncoderPrimitives
27
@@ -242,12 +252,10 @@
28
         filter_sp_t    luma_vsp;
29
         filter_ss_t    luma_vss;
30
         filter_hv_pp_t luma_hvpp;   // combines hps + vsp
31
-
32
-        pixelavg_pp_t  pixelavg_pp; // quick bidir using pixels (borrowed from x264)
33
-        addAvg_t       addAvg;      // bidir motion compensation, uses 16bit values
34
-
35
+        pixelavg_pp_t  pixelavg_pp[NUM_ALIGNMENT_TYPES]; // quick bidir using pixels (borrowed from x264)
36
+        addAvg_t       addAvg[NUM_ALIGNMENT_TYPES];      // bidir motion compensation, uses 16bit values
37
         copy_pp_t      copy_pp;
38
-        filter_p2s_t   convert_p2s;
39
+        filter_p2s_t   convert_p2s[NUM_ALIGNMENT_TYPES];
40
     }
41
     pu[NUM_PU_SIZES];
42
 
43
@@ -265,17 +273,16 @@
44
         dct_t           standard_dct;   // original dct function, used by lowpass_dct
45
         dct_t           lowpass_dct;    // lowpass dct approximation
46
 
47
-        calcresidual_t  calcresidual;
48
+        calcresidual_t  calcresidual[NUM_ALIGNMENT_TYPES];
49
         pixel_sub_ps_t  sub_ps;
50
-        pixel_add_ps_t  add_ps;
51
-        blockfill_s_t   blockfill_s;   // block fill, for DC transforms
52
+        pixel_add_ps_t  add_ps[NUM_ALIGNMENT_TYPES];
53
+        blockfill_s_t   blockfill_s[NUM_ALIGNMENT_TYPES];   // block fill, for DC transforms
54
         copy_cnt_t      copy_cnt;      // copy coeff while counting non-zero
55
         count_nonzero_t count_nonzero;
56
         cpy2Dto1D_shl_t cpy2Dto1D_shl;
57
         cpy2Dto1D_shr_t cpy2Dto1D_shr;
58
-        cpy1Dto2D_shl_t cpy1Dto2D_shl;
59
+        cpy1Dto2D_shl_t cpy1Dto2D_shl[NUM_ALIGNMENT_TYPES];
60
         cpy1Dto2D_shr_t cpy1Dto2D_shr;
61
-
62
         copy_sp_t       copy_sp;
63
         copy_ps_t       copy_ps;
64
         copy_ss_t       copy_ss;
65
@@ -286,16 +293,18 @@
66
         pixel_sse_t     sse_pp;        // Sum of Square Error (pixel, pixel) fenc alignment not assumed
67
         pixel_sse_ss_t  sse_ss;        // Sum of Square Error (short, short) fenc alignment not assumed
68
         pixelcmp_t      psy_cost_pp;   // difference in AC energy between two pixel blocks
69
-        pixel_ssd_s_t   ssd_s;         // Sum of Square Error (residual coeff to self)
70
+        pixel_ssd_s_t   ssd_s[NUM_ALIGNMENT_TYPES];         // Sum of Square Error (residual coeff to self)
71
         pixelcmp_t      sa8d;          // Sum of Transformed Differences (8x8 Hadamard), uses satd for 4x4 intra TU
72
-
73
         transpose_t     transpose;     // transpose pixel block; for use with intra all-angs
74
         intra_allangs_t intra_pred_allangs;
75
         intra_filter_t  intra_filter;
76
         intra_pred_t    intra_pred[NUM_INTRA_MODE];
77
+        nonPsyRdoQuant_t nonPsyRdoQuant;
78
+        psyRdoQuant_t    psyRdoQuant;
79
+       psyRdoQuant_t1   psyRdoQuant_1p;
80
+       psyRdoQuant_t2   psyRdoQuant_2p;
81
     }
82
     cu[NUM_CU_SIZES];
83
-
84
     /* These remaining primitives work on either fixed block sizes or take
85
      * block dimensions as arguments and thus do not belong in either the PU or
86
      * the CU arrays */
87
@@ -307,7 +316,7 @@
88
     dequant_scaling_t     dequant_scaling;
89
     dequant_normal_t      dequant_normal;
90
     denoiseDct_t          denoiseDct;
91
-    scale1D_t             scale1D_128to64;
92
+    scale1D_t             scale1D_128to64[NUM_ALIGNMENT_TYPES];
93
     scale2D_t             scale2D_64to32;
94
 
95
     ssim_4x4x2_core_t     ssim_4x4x2_core;
96
@@ -384,9 +393,9 @@
97
             filter_ss_t  filter_vss;
98
             filter_pp_t  filter_hpp;
99
             filter_hps_t filter_hps;
100
-            addAvg_t     addAvg;
101
+            addAvg_t     addAvg[NUM_ALIGNMENT_TYPES];
102
             copy_pp_t    copy_pp;
103
-            filter_p2s_t p2s;
104
+            filter_p2s_t p2s[NUM_ALIGNMENT_TYPES];
105
 
106
         }
107
         pu[NUM_PU_SIZES];
108
@@ -397,7 +406,7 @@
109
             pixelcmp_t     sa8d;    // if chroma CU is not multiple of 8x8, will use satd
110
             pixel_sse_t    sse_pp;
111
             pixel_sub_ps_t sub_ps;
112
-            pixel_add_ps_t add_ps;
113
+            pixel_add_ps_t add_ps[NUM_ALIGNMENT_TYPES];
114
 
115
             copy_ps_t      copy_ps;
116
             copy_sp_t      copy_sp;
117
x265_2.7.tar.gz/source/common/quant.cpp -> x265_2.9.tar.gz/source/common/quant.cpp Changed
163
 
1
@@ -560,13 +560,11 @@
2
                             uint32_t log2TrSize, TextType ttype, bool bIntra, bool useTransformSkip, uint32_t numSig)
3
 {
4
     const uint32_t sizeIdx = log2TrSize - 2;
5
-
6
     if (cu.m_tqBypass[0])
7
     {
8
-        primitives.cu[sizeIdx].cpy1Dto2D_shl(residual, coeff, resiStride, 0);
9
+        primitives.cu[sizeIdx].cpy1Dto2D_shl[resiStride % 64 == 0](residual, coeff, resiStride, 0);
10
         return;
11
     }
12
-
13
     // Values need to pass as input parameter in dequant
14
     int rem = m_qpParam[ttype].rem;
15
     int per = m_qpParam[ttype].per;
16
@@ -595,7 +593,7 @@
17
         if (transformShift > 0)
18
             primitives.cu[sizeIdx].cpy1Dto2D_shr(residual, m_resiDctCoeff, resiStride, transformShift);
19
         else
20
-            primitives.cu[sizeIdx].cpy1Dto2D_shl(residual, m_resiDctCoeff, resiStride, -transformShift);
21
+            primitives.cu[sizeIdx].cpy1Dto2D_shl[resiStride % 64 == 0](residual, m_resiDctCoeff, resiStride, -transformShift);
22
 #endif
23
     }
24
     else
25
@@ -611,7 +609,7 @@
26
             const int add_2nd = 1 << (shift_2nd - 1);
27
 
28
             int dc_val = (((m_resiDctCoeff[0] * (64 >> 6) + add_1st) >> shift_1st) * (64 >> 3) + add_2nd) >> shift_2nd;
29
-            primitives.cu[sizeIdx].blockfill_s(residual, resiStride, (int16_t)dc_val);
30
+            primitives.cu[sizeIdx].blockfill_s[resiStride % 64 == 0](residual, resiStride, (int16_t)dc_val);
31
             return;
32
         }
33
 
34
@@ -644,11 +642,9 @@
35
     X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n");
36
     if (!numSig)
37
         return 0;
38
-
39
     const uint32_t trSize = 1 << log2TrSize;
40
     int64_t lambda2 = m_qpParam[ttype].lambda2;
41
-    const int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);
42
-
43
+    int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);
44
     /* unquant constants for measuring distortion. Scaling list quant coefficients have a (1 << 4)
45
      * scale applied that must be removed during unquant. Note that in real dequant there is clipping
46
      * at several stages. We skip the clipping for simplicity when measuring RD cost */
47
@@ -725,27 +721,15 @@
48
         for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
49
         {
50
             X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
51
-
52
             uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
53
             uint32_t blkPos      = codeParams.scan[scanPosBase];
54
-
55
-            // TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA
56
-            for (int y = 0; y < MLS_CG_SIZE; y++)
57
+            bool enable512 = detect512();
58
+            if (enable512)
59
+                primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
60
+            else
61
             {
62
-                for (int x = 0; x < MLS_CG_SIZE; x++)
63
-                {
64
-                    int signCoef         = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
65
-                    int predictedCoef    = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
66
-
67
-                    costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
68
-
69
-                    /* when no residual coefficient is coded, predicted coef == recon coef */
70
-                    costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
71
-
72
-                    totalUncodedCost += costUncoded[blkPos + x];
73
-                    totalRdCost += costUncoded[blkPos + x];
74
-                }
75
-                blkPos += trSize;
76
+                primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff,  costUncoded, &totalUncodedCost, &totalRdCost,blkPos);
77
+                primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
78
             }
79
         }
80
     }
81
@@ -755,25 +739,11 @@
82
         for (int cgScanPos = cgLastScanPos + 1; cgScanPos < (int)cgNum ; cgScanPos++)
83
         {
84
             X265_CHECK(coeffNum[cgScanPos] == 0, "count of coeff failure\n");
85
-
86
             uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
87
             uint32_t blkPos      = codeParams.scan[scanPosBase];
88
-
89
-            for (int y = 0; y < MLS_CG_SIZE; y++)
90
-            {
91
-                for (int x = 0; x < MLS_CG_SIZE; x++)
92
-                {
93
-                    int signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
94
-                    costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
95
-
96
-                    totalUncodedCost += costUncoded[blkPos + x];
97
-                    totalRdCost += costUncoded[blkPos + x];
98
-                }
99
-                blkPos += trSize;
100
-            }
101
+            primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
102
         }
103
     }
104
-
105
     static const uint8_t table_cnt[5][SCAN_SET_SIZE] =
106
     {
107
         // patternSigCtx = 0
108
@@ -833,25 +803,22 @@
109
             // TODO: does we need zero-coeff cost?
110
             const uint32_t scanPosBase = (cgScanPos << MLS_CG_SIZE);
111
             uint32_t blkPos = codeParams.scan[scanPosBase];
112
-
113
             if (usePsyMask)
114
             {
115
-                // TODO: we can't SIMD optimize because PSYVALUE need 64-bits multiplication, convert to Double can work faster by FMA
116
+                bool enable512 = detect512();
117
+
118
+                if (enable512)
119
+                    primitives.cu[log2TrSize - 2].psyRdoQuant(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
120
+                else
121
+                {
122
+                    primitives.cu[log2TrSize - 2].psyRdoQuant_1p(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
123
+                    primitives.cu[log2TrSize - 2].psyRdoQuant_2p(m_resiDctCoeff, m_fencDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, &psyScale, blkPos);
124
+                }
125
+                blkPos = codeParams.scan[scanPosBase];
126
                 for (int y = 0; y < MLS_CG_SIZE; y++)
127
                 {
128
                     for (int x = 0; x < MLS_CG_SIZE; x++)
129
                     {
130
-                        int signCoef         = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
131
-                        int predictedCoef    = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
132
-
133
-                        costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
134
-
135
-                        /* when no residual coefficient is coded, predicted coef == recon coef */
136
-                        costUncoded[blkPos + x] -= PSYVALUE(predictedCoef);
137
-
138
-                        totalUncodedCost += costUncoded[blkPos + x];
139
-                        totalRdCost += costUncoded[blkPos + x];
140
-
141
                         const uint32_t scanPosOffset =  y * MLS_CG_SIZE + x;
142
                         const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
143
                         X265_CHECK(trSize > 4, "trSize check failure\n");
144
@@ -867,16 +834,12 @@
145
             else
146
             {
147
                 // non-psy path
148
+                primitives.cu[log2TrSize - 2].nonPsyRdoQuant(m_resiDctCoeff, costUncoded, &totalUncodedCost, &totalRdCost, blkPos);
149
+                blkPos = codeParams.scan[scanPosBase];
150
                 for (int y = 0; y < MLS_CG_SIZE; y++)
151
                 {
152
                     for (int x = 0; x < MLS_CG_SIZE; x++)
153
                     {
154
-                        int signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
155
-                        costUncoded[blkPos + x] = ((int64_t)signCoef * signCoef) << scaleBits;
156
-
157
-                        totalUncodedCost += costUncoded[blkPos + x];
158
-                        totalRdCost += costUncoded[blkPos + x];
159
-
160
                         const uint32_t scanPosOffset =  y * MLS_CG_SIZE + x;
161
                         const uint32_t ctxSig = table_cnt[patternSigCtx][g_scan4x4[codeParams.scanType][scanPosOffset]] + ctxSigOffset;
162
                         X265_CHECK(trSize > 4, "trSize check failure\n");
163
x265_2.7.tar.gz/source/common/slice.cpp -> x265_2.9.tar.gz/source/common/slice.cpp Changed
10
 
1
@@ -138,7 +138,7 @@
2
             for (int yuv = 0; yuv < 3; yuv++)
3
             {
4
                 WeightParam& wp = m_weightPredTable[l][i][yuv];
5
-                wp.bPresentFlag = false;
6
+                wp.wtPresent = 0;
7
                 wp.log2WeightDenom = 0;
8
                 wp.inputWeight = 1;
9
                 wp.inputOffset = 0;
10
x265_2.7.tar.gz/source/common/slice.h -> x265_2.9.tar.gz/source/common/slice.h Changed
37
 
1
@@ -298,7 +298,7 @@
2
     uint32_t log2WeightDenom;
3
     int      inputWeight;
4
     int      inputOffset;
5
-    bool     bPresentFlag;
6
+    int      wtPresent;
7
 
8
     /* makes a non-h265 weight (i.e. fix7), into an h265 weight */
9
     void setFromWeightAndOffset(int w, int o, int denom, bool bNormalize)
10
@@ -321,7 +321,7 @@
11
         (w).inputWeight = (s); \
12
         (w).log2WeightDenom = (d); \
13
         (w).inputOffset = (o); \
14
-        (w).bPresentFlag = (b); \
15
+        (w).wtPresent = (b); \
16
     }
17
 
18
 class Slice
19
@@ -385,14 +385,14 @@
20
     bool getRapPicFlag() const
21
     {
22
         return m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL
23
+            || m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_N_LP
24
             || m_nalUnitType == NAL_UNIT_CODED_SLICE_CRA;
25
     }
26
-
27
     bool getIdrPicFlag() const
28
     {
29
-        return m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL;
30
+        return m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL
31
+            || m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_N_LP;
32
     }
33
-
34
     bool isIRAP() const   { return m_nalUnitType >= 16 && m_nalUnitType <= 23; }
35
 
36
     bool isIntra()  const { return m_sliceType == I_SLICE; }
37
x265_2.7.tar.gz/source/common/x86/asm-primitives.cpp -> x265_2.9.tar.gz/source/common/x86/asm-primitives.cpp Changed
2936
 
1
@@ -404,36 +404,58 @@
2
     p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d = PFX(pixel_sa8d_8x16_ ## cpu); \
3
     p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = PFX(pixel_sa8d_16x32_ ## cpu); \
4
     p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = PFX(pixel_sa8d_32x64_ ## cpu)
5
-
6
 #define PIXEL_AVG(cpu) \
7
-    p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_ ## cpu); \
8
-    p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_ ## cpu); \
9
-    p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_ ## cpu); \
10
-    p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_ ## cpu); \
11
-    p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_ ## cpu); \
12
-    p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_ ## cpu); \
13
-    p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_ ## cpu); \
14
-    p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_ ## cpu); \
15
-    p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_ ## cpu); \
16
-    p.pu[LUMA_32x8].pixelavg_pp  = PFX(pixel_avg_32x8_ ## cpu); \
17
-    p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_24x32_ ## cpu); \
18
-    p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_ ## cpu); \
19
-    p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_ ## cpu); \
20
-    p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_ ## cpu); \
21
-    p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_ ## cpu); \
22
-    p.pu[LUMA_16x8].pixelavg_pp  = PFX(pixel_avg_16x8_ ## cpu); \
23
-    p.pu[LUMA_16x4].pixelavg_pp  = PFX(pixel_avg_16x4_ ## cpu); \
24
-    p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_ ## cpu); \
25
-    p.pu[LUMA_8x32].pixelavg_pp  = PFX(pixel_avg_8x32_ ## cpu); \
26
-    p.pu[LUMA_8x16].pixelavg_pp  = PFX(pixel_avg_8x16_ ## cpu); \
27
-    p.pu[LUMA_8x8].pixelavg_pp   = PFX(pixel_avg_8x8_ ## cpu); \
28
-    p.pu[LUMA_8x4].pixelavg_pp   = PFX(pixel_avg_8x4_ ## cpu);
29
-
30
+    p.pu[LUMA_64x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x64_ ## cpu); \
31
+    p.pu[LUMA_64x48].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x48_ ## cpu); \
32
+    p.pu[LUMA_64x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x32_ ## cpu); \
33
+    p.pu[LUMA_64x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x16_ ## cpu); \
34
+    p.pu[LUMA_48x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_48x64_ ## cpu); \
35
+    p.pu[LUMA_32x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x64_ ## cpu); \
36
+    p.pu[LUMA_32x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x32_ ## cpu); \
37
+    p.pu[LUMA_32x24].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x24_ ## cpu); \
38
+    p.pu[LUMA_32x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x16_ ## cpu); \
39
+    p.pu[LUMA_32x8].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_32x8_ ## cpu); \
40
+    p.pu[LUMA_24x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_24x32_ ## cpu); \
41
+    p.pu[LUMA_16x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x64_ ## cpu); \
42
+    p.pu[LUMA_16x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x32_ ## cpu); \
43
+    p.pu[LUMA_16x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x16_ ## cpu); \
44
+    p.pu[LUMA_16x12].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_16x12_ ## cpu); \
45
+    p.pu[LUMA_16x8].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_16x8_ ## cpu); \
46
+    p.pu[LUMA_16x4].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_16x4_ ## cpu); \
47
+    p.pu[LUMA_12x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_12x16_ ## cpu); \
48
+    p.pu[LUMA_8x32].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_8x32_ ## cpu); \
49
+    p.pu[LUMA_8x16].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_8x16_ ## cpu); \
50
+    p.pu[LUMA_8x8].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_8x8_ ## cpu); \
51
+    p.pu[LUMA_8x4].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_8x4_ ## cpu); \
52
+    p.pu[LUMA_64x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_64x64_ ## cpu); \
53
+    p.pu[LUMA_64x48].pixelavg_pp[ALIGNED] = PFX(pixel_avg_64x48_ ## cpu); \
54
+    p.pu[LUMA_64x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_64x32_ ## cpu); \
55
+    p.pu[LUMA_64x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_64x16_ ## cpu); \
56
+    p.pu[LUMA_48x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_48x64_ ## cpu); \
57
+    p.pu[LUMA_32x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_32x64_ ## cpu); \
58
+    p.pu[LUMA_32x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_32x32_ ## cpu); \
59
+    p.pu[LUMA_32x24].pixelavg_pp[ALIGNED] = PFX(pixel_avg_32x24_ ## cpu); \
60
+    p.pu[LUMA_32x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_32x16_ ## cpu); \
61
+    p.pu[LUMA_32x8].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_32x8_ ## cpu); \
62
+    p.pu[LUMA_24x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_24x32_ ## cpu); \
63
+    p.pu[LUMA_16x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x64_ ## cpu); \
64
+    p.pu[LUMA_16x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x32_ ## cpu); \
65
+    p.pu[LUMA_16x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x16_ ## cpu); \
66
+    p.pu[LUMA_16x12].pixelavg_pp[ALIGNED] = PFX(pixel_avg_16x12_ ## cpu); \
67
+    p.pu[LUMA_16x8].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_16x8_ ## cpu); \
68
+    p.pu[LUMA_16x4].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_16x4_ ## cpu); \
69
+    p.pu[LUMA_12x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_12x16_ ## cpu); \
70
+    p.pu[LUMA_8x32].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_8x32_ ## cpu); \
71
+    p.pu[LUMA_8x16].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_8x16_ ## cpu); \
72
+    p.pu[LUMA_8x8].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_8x8_ ## cpu); \
73
+    p.pu[LUMA_8x4].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_8x4_ ## cpu);
74
 #define PIXEL_AVG_W4(cpu) \
75
-    p.pu[LUMA_4x4].pixelavg_pp  = PFX(pixel_avg_4x4_ ## cpu); \
76
-    p.pu[LUMA_4x8].pixelavg_pp  = PFX(pixel_avg_4x8_ ## cpu); \
77
-    p.pu[LUMA_4x16].pixelavg_pp = PFX(pixel_avg_4x16_ ## cpu);
78
-
79
+    p.pu[LUMA_4x4].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_4x4_ ## cpu); \
80
+    p.pu[LUMA_4x8].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_4x8_ ## cpu); \
81
+    p.pu[LUMA_4x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_4x16_ ## cpu); \
82
+    p.pu[LUMA_4x4].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_4x4_ ## cpu); \
83
+    p.pu[LUMA_4x8].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_4x8_ ## cpu); \
84
+    p.pu[LUMA_4x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_4x16_ ## cpu);
85
 #define CHROMA_420_FILTERS(cpu) \
86
     ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
87
     ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, cpu); \
88
@@ -633,23 +655,32 @@
89
 
90
 #define LUMA_PIXELSUB(cpu) \
91
     p.cu[BLOCK_4x4].sub_ps = PFX(pixel_sub_ps_4x4_ ## cpu); \
92
-    p.cu[BLOCK_4x4].add_ps = PFX(pixel_add_ps_4x4_ ## cpu); \
93
+    p.cu[BLOCK_4x4].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x4_ ## cpu); \
94
+    p.cu[BLOCK_4x4].add_ps[ALIGNED] = PFX(pixel_add_ps_4x4_ ## cpu); \
95
     ALL_LUMA_CU(sub_ps, pixel_sub_ps, cpu); \
96
-    ALL_LUMA_CU(add_ps, pixel_add_ps, cpu);
97
+    ALL_LUMA_CU(add_ps[NONALIGNED], pixel_add_ps, cpu); \
98
+    ALL_LUMA_CU(add_ps[ALIGNED], pixel_add_ps, cpu);
99
 
100
 #define CHROMA_420_PIXELSUB_PS(cpu) \
101
     ALL_CHROMA_420_CU(sub_ps, pixel_sub_ps, cpu); \
102
-    ALL_CHROMA_420_CU(add_ps, pixel_add_ps, cpu);
103
+    ALL_CHROMA_420_CU(add_ps[NONALIGNED], pixel_add_ps, cpu); \
104
+    ALL_CHROMA_420_CU(add_ps[ALIGNED], pixel_add_ps, cpu);
105
 
106
 #define CHROMA_422_PIXELSUB_PS(cpu) \
107
     ALL_CHROMA_422_CU(sub_ps, pixel_sub_ps, cpu); \
108
-    ALL_CHROMA_422_CU(add_ps, pixel_add_ps, cpu);
109
+    ALL_CHROMA_422_CU(add_ps[NONALIGNED], pixel_add_ps, cpu); \
110
+    ALL_CHROMA_422_CU(add_ps[ALIGNED], pixel_add_ps, cpu);
111
 
112
 #define LUMA_VAR(cpu)          ALL_LUMA_CU(var, pixel_var, cpu)
113
 
114
-#define LUMA_ADDAVG(cpu)       ALL_LUMA_PU(addAvg, addAvg, cpu); p.pu[LUMA_4x4].addAvg = PFX(addAvg_4x4_ ## cpu)
115
-#define CHROMA_420_ADDAVG(cpu) ALL_CHROMA_420_PU(addAvg, addAvg, cpu);
116
-#define CHROMA_422_ADDAVG(cpu) ALL_CHROMA_422_PU(addAvg, addAvg, cpu);
117
+#define LUMA_ADDAVG(cpu)       ALL_LUMA_PU(addAvg[NONALIGNED], addAvg, cpu); \
118
+                               p.pu[LUMA_4x4].addAvg[NONALIGNED] = PFX(addAvg_4x4_ ## cpu); \
119
+                               ALL_LUMA_PU(addAvg[ALIGNED], addAvg, cpu); \
120
+                               p.pu[LUMA_4x4].addAvg[ALIGNED] = PFX(addAvg_4x4_ ## cpu)
121
+#define CHROMA_420_ADDAVG(cpu) ALL_CHROMA_420_PU(addAvg[NONALIGNED], addAvg, cpu); \
122
+                               ALL_CHROMA_420_PU(addAvg[ALIGNED], addAvg, cpu)
123
+#define CHROMA_422_ADDAVG(cpu) ALL_CHROMA_422_PU(addAvg[NONALIGNED], addAvg, cpu); \
124
+                               ALL_CHROMA_422_PU(addAvg[ALIGNED], addAvg, cpu)
125
 
126
 #define SETUP_INTRA_ANG_COMMON(mode, fno, cpu) \
127
     p.cu[BLOCK_4x4].intra_pred[mode] = PFX(intra_pred_ang4_ ## fno ## _ ## cpu); \
128
@@ -855,6 +886,10 @@
129
     ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, cpu); \
130
     ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, cpu);
131
 
132
+#define ASSIGN2(func, fname) \
133
+    func[ALIGNED] = PFX(fname); \
134
+    func[NONALIGNED] = PFX(fname)
135
+
136
 namespace X265_NS {
137
 // private x265 namespace
138
 
139
@@ -873,10 +908,6 @@
140
 
141
 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // Main10
142
 {
143
-#if !defined(X86_64)
144
-#error "Unsupported build configuration (32bit x86 and HIGH_BIT_DEPTH), you must configure ENABLE_ASSEMBLY=OFF"
145
-#endif
146
-
147
 #if X86_64
148
     p.scanPosLast = PFX(scanPosLast_x64);
149
 #endif
150
@@ -937,35 +968,69 @@
151
         CHROMA_422_VERT_FILTERS(_sse2);
152
         CHROMA_444_VERT_FILTERS(sse2);
153
 
154
+#if X86_64
155
         ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
156
         p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_sse2);
157
         ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
158
         p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_sse2);
159
         ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, sse2);
160
         ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, sse2);
161
+#endif
162
 
163
         p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_sse2);
164
         p.ssim_end_4 = PFX(pixel_ssim_end4_sse2);
165
-        PIXEL_AVG(sse2);
166
+        ASSIGN2(p.pu[LUMA_64x64].pixelavg_pp, pixel_avg_64x64_sse2);
167
+        ASSIGN2(p.pu[LUMA_64x48].pixelavg_pp, pixel_avg_64x48_sse2);
168
+        ASSIGN2(p.pu[LUMA_64x32].pixelavg_pp, pixel_avg_64x32_sse2);
169
+        ASSIGN2(p.pu[LUMA_64x16].pixelavg_pp, pixel_avg_64x16_sse2);
170
+        ASSIGN2(p.pu[LUMA_48x64].pixelavg_pp, pixel_avg_48x64_sse2);
171
+        ASSIGN2(p.pu[LUMA_32x64].pixelavg_pp, pixel_avg_32x64_sse2);
172
+        ASSIGN2(p.pu[LUMA_32x32].pixelavg_pp, pixel_avg_32x32_sse2);
173
+        ASSIGN2(p.pu[LUMA_32x24].pixelavg_pp, pixel_avg_32x24_sse2);
174
+        ASSIGN2(p.pu[LUMA_32x16].pixelavg_pp, pixel_avg_32x16_sse2);
175
+        ASSIGN2(p.pu[LUMA_32x8].pixelavg_pp, pixel_avg_32x8_sse2);
176
+        ASSIGN2(p.pu[LUMA_24x32].pixelavg_pp, pixel_avg_24x32_sse2);
177
+        ASSIGN2(p.pu[LUMA_16x64].pixelavg_pp, pixel_avg_16x64_sse2);
178
+        ASSIGN2(p.pu[LUMA_16x32].pixelavg_pp, pixel_avg_16x32_sse2);
179
+        ASSIGN2(p.pu[LUMA_16x16].pixelavg_pp, pixel_avg_16x16_sse2);
180
+        ASSIGN2(p.pu[LUMA_16x12].pixelavg_pp, pixel_avg_16x12_sse2);
181
+        ASSIGN2(p.pu[LUMA_16x8].pixelavg_pp, pixel_avg_16x8_sse2);
182
+        ASSIGN2(p.pu[LUMA_16x4].pixelavg_pp, pixel_avg_16x4_sse2);
183
+        ASSIGN2(p.pu[LUMA_12x16].pixelavg_pp, pixel_avg_12x16_sse2);
184
+#if X86_64
185
+        ASSIGN2(p.pu[LUMA_8x32].pixelavg_pp, pixel_avg_8x32_sse2);
186
+        ASSIGN2(p.pu[LUMA_8x16].pixelavg_pp, pixel_avg_8x16_sse2);
187
+        ASSIGN2(p.pu[LUMA_8x8].pixelavg_pp, pixel_avg_8x8_sse2);
188
+        ASSIGN2(p.pu[LUMA_8x4].pixelavg_pp, pixel_avg_8x4_sse2);
189
+#endif
190
         PIXEL_AVG_W4(mmx2);
191
         LUMA_VAR(sse2);
192
 
193
 
194
-        ALL_LUMA_TU(blockfill_s, blockfill_s, sse2);
195
+        ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, sse2);
196
+        ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, sse2);
197
         ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2);
198
-        ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, sse2);
199
+        ALL_LUMA_TU_S(cpy1Dto2D_shl[ALIGNED], cpy1Dto2D_shl_, sse2);
200
+        ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, sse2);
201
         ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2);
202
         ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2);
203
-        ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2);
204
-        ALL_LUMA_TU_S(calcresidual, getResidual, sse2);
205
+#if X86_64
206
+        ASSIGN2(p.cu[BLOCK_4x4].ssd_s,pixel_ssd_s_4_sse2 );
207
+        ASSIGN2(p.cu[BLOCK_8x8].ssd_s,pixel_ssd_s_8_sse2);
208
+        ASSIGN2(p.cu[BLOCK_16x16].ssd_s,pixel_ssd_s_16_sse2);
209
+        ASSIGN2(p.cu[BLOCK_32x32].ssd_s,pixel_ssd_s_32_sse2 );
210
+#endif
211
+        ALL_LUMA_TU_S(calcresidual[ALIGNED], getResidual, sse2);
212
+        ALL_LUMA_TU_S(calcresidual[NONALIGNED], getResidual, sse2);
213
         ALL_LUMA_TU_S(transpose, transpose, sse2);
214
 
215
         p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse2);
216
         p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse2);
217
         p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse2);
218
+#if X86_64
219
         p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_sse2);
220
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2);
221
-
222
+#endif
223
         p.cu[BLOCK_4x4].intra_pred[2] = PFX(intra_pred_ang4_2_sse2);
224
         p.cu[BLOCK_4x4].intra_pred[3] = PFX(intra_pred_ang4_3_sse2);
225
         p.cu[BLOCK_4x4].intra_pred[4] = PFX(intra_pred_ang4_4_sse2);
226
@@ -990,7 +1055,9 @@
227
         p.cu[BLOCK_4x4].intra_pred[23] = PFX(intra_pred_ang4_23_sse2);
228
         p.cu[BLOCK_4x4].intra_pred[24] = PFX(intra_pred_ang4_24_sse2);
229
         p.cu[BLOCK_4x4].intra_pred[25] = PFX(intra_pred_ang4_25_sse2);
230
+#if X86_64
231
         p.cu[BLOCK_4x4].intra_pred[26] = PFX(intra_pred_ang4_26_sse2);
232
+#endif
233
         p.cu[BLOCK_4x4].intra_pred[27] = PFX(intra_pred_ang4_27_sse2);
234
         p.cu[BLOCK_4x4].intra_pred[28] = PFX(intra_pred_ang4_28_sse2);
235
         p.cu[BLOCK_4x4].intra_pred[29] = PFX(intra_pred_ang4_29_sse2);
236
@@ -999,19 +1066,24 @@
237
         p.cu[BLOCK_4x4].intra_pred[32] = PFX(intra_pred_ang4_32_sse2);
238
         p.cu[BLOCK_4x4].intra_pred[33] = PFX(intra_pred_ang4_33_sse2);
239
 
240
+#if X86_64 && X265_DEPTH <= 10
241
+        p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
242
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_sse2);
243
         p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_4x8_mmx2);
244
         p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_8x16_sse2);
245
         p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_sse2);
246
-#if X265_DEPTH <= 10
247
-        p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
248
-        ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
249
+
250
+        p.cu[BLOCK_8x8].sse_ss = PFX(pixel_ssd_ss_8x8_sse2);
251
+        p.cu[BLOCK_16x16].sse_ss = PFX(pixel_ssd_ss_16x16_sse2);
252
+        p.cu[BLOCK_32x32].sse_ss = PFX(pixel_ssd_ss_32x32_sse2);
253
+        p.cu[BLOCK_64x64].sse_ss = PFX(pixel_ssd_ss_64x64_sse2);
254
 #endif
255
         p.cu[BLOCK_4x4].dct = PFX(dct4_sse2);
256
         p.cu[BLOCK_8x8].dct = PFX(dct8_sse2);
257
         p.cu[BLOCK_4x4].idct = PFX(idct4_sse2);
258
+#if X86_64
259
         p.cu[BLOCK_8x8].idct = PFX(idct8_sse2);
260
-
261
+#endif
262
         p.idst4x4 = PFX(idst4_sse2);
263
         p.dst4x4 = PFX(dst4_sse2);
264
 
265
@@ -1022,25 +1094,31 @@
266
         //p.planecopy_sp = PFX(downShift_16_sse2);
267
         p.planecopy_sp_shl = PFX(upShift_16_sse2);
268
 
269
-        ALL_CHROMA_420_PU(p2s, filterPixelToShort, sse2);
270
-        ALL_CHROMA_422_PU(p2s, filterPixelToShort, sse2);
271
-        ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
272
-        ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
273
+        ALL_CHROMA_420_PU(p2s[ALIGNED], filterPixelToShort, sse2);
274
+        ALL_CHROMA_422_PU(p2s[ALIGNED], filterPixelToShort, sse2);
275
+        ALL_CHROMA_444_PU(p2s[ALIGNED], filterPixelToShort, sse2);
276
+        ALL_LUMA_PU(convert_p2s[ALIGNED], filterPixelToShort, sse2);
277
+        ALL_CHROMA_420_PU(p2s[NONALIGNED], filterPixelToShort, sse2);
278
+        ALL_CHROMA_422_PU(p2s[NONALIGNED], filterPixelToShort, sse2);
279
+        ALL_CHROMA_444_PU(p2s[NONALIGNED], filterPixelToShort, sse2);
280
+        ALL_LUMA_PU(convert_p2s[NONALIGNED], filterPixelToShort, sse2);
281
         ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
282
         p.propagateCost = PFX(mbtree_propagate_cost_sse2);
283
     }
284
     if (cpuMask & X265_CPU_SSE3)
285
     {
286
+#if X86_64
287
         ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
288
         ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
289
         ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
290
         ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, sse3);
291
         ALL_CHROMA_422_PU(filter_hps, interp_4tap_horiz_ps, sse3);
292
         ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, sse3);
293
+#endif
294
     }
295
     if (cpuMask & X265_CPU_SSSE3)
296
     {
297
-        p.scale1D_128to64 = PFX(scale1D_128to64_ssse3);
298
+        ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
299
         p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
300
 
301
         // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_ssse3); this one is broken
302
@@ -1055,60 +1133,65 @@
303
 
304
         p.frameInitLowres = PFX(frame_init_lowres_core_ssse3);
305
 
306
-        ALL_LUMA_PU(convert_p2s, filterPixelToShort, ssse3);
307
+        ALL_LUMA_PU(convert_p2s[ALIGNED], filterPixelToShort, ssse3);
308
+        ALL_LUMA_PU(convert_p2s[NONALIGNED], filterPixelToShort, ssse3);
309
+
310
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].p2s, filterPixelToShort_4x4_ssse3);
311
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].p2s, filterPixelToShort_4x8_ssse3);
312
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].p2s, filterPixelToShort_4x16_ssse3);
313
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].p2s, filterPixelToShort_8x4_ssse3);
314
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].p2s, filterPixelToShort_8x8_ssse3);
315
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].p2s, filterPixelToShort_8x16_ssse3);
316
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].p2s, filterPixelToShort_8x32_ssse3);
317
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].p2s, filterPixelToShort_16x4_ssse3);
318
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s, filterPixelToShort_16x8_ssse3);
319
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s, filterPixelToShort_16x12_ssse3);
320
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s, filterPixelToShort_16x16_ssse3);
321
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s, filterPixelToShort_16x32_ssse3);
322
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s, filterPixelToShort_32x8_ssse3);
323
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s, filterPixelToShort_32x16_ssse3);
324
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s, filterPixelToShort_32x24_ssse3);
325
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s, filterPixelToShort_32x32_ssse3);
326
+
327
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].p2s, filterPixelToShort_4x4_ssse3);
328
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].p2s, filterPixelToShort_4x8_ssse3);
329
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].p2s, filterPixelToShort_4x16_ssse3);
330
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].p2s, filterPixelToShort_4x32_ssse3);
331
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].p2s, filterPixelToShort_8x4_ssse3);
332
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].p2s, filterPixelToShort_8x8_ssse3);
333
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].p2s, filterPixelToShort_8x12_ssse3);
334
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].p2s, filterPixelToShort_8x16_ssse3);
335
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].p2s, filterPixelToShort_8x32_ssse3);
336
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].p2s, filterPixelToShort_8x64_ssse3);
337
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].p2s, filterPixelToShort_12x32_ssse3);
338
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s, filterPixelToShort_16x8_ssse3);
339
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s, filterPixelToShort_16x16_ssse3);
340
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s, filterPixelToShort_16x24_ssse3);
341
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s, filterPixelToShort_16x32_ssse3);
342
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s, filterPixelToShort_16x64_ssse3);
343
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s, filterPixelToShort_24x64_ssse3);
344
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s, filterPixelToShort_32x16_ssse3);
345
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s, filterPixelToShort_32x32_ssse3);
346
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s, filterPixelToShort_32x48_ssse3);
347
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s, filterPixelToShort_32x64_ssse3);
348
+
349
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].p2s, filterPixelToShort_4x2_ssse3);
350
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s, filterPixelToShort_8x2_ssse3);
351
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s, filterPixelToShort_8x6_ssse3);
352
 
353
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].p2s = PFX(filterPixelToShort_4x4_ssse3);
354
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].p2s = PFX(filterPixelToShort_4x8_ssse3);
355
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].p2s = PFX(filterPixelToShort_4x16_ssse3);
356
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].p2s = PFX(filterPixelToShort_8x4_ssse3);
357
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].p2s = PFX(filterPixelToShort_8x8_ssse3);
358
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].p2s = PFX(filterPixelToShort_8x16_ssse3);
359
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].p2s = PFX(filterPixelToShort_8x32_ssse3);
360
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].p2s = PFX(filterPixelToShort_16x4_ssse3);
361
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s = PFX(filterPixelToShort_16x8_ssse3);
362
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s = PFX(filterPixelToShort_16x12_ssse3);
363
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s = PFX(filterPixelToShort_16x16_ssse3);
364
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s = PFX(filterPixelToShort_16x32_ssse3);
365
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_ssse3);
366
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_ssse3);
367
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_ssse3);
368
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_ssse3);
369
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].p2s = PFX(filterPixelToShort_4x4_ssse3);
370
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].p2s = PFX(filterPixelToShort_4x8_ssse3);
371
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].p2s = PFX(filterPixelToShort_4x16_ssse3);
372
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].p2s = PFX(filterPixelToShort_4x32_ssse3);
373
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].p2s = PFX(filterPixelToShort_8x4_ssse3);
374
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].p2s = PFX(filterPixelToShort_8x8_ssse3);
375
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].p2s = PFX(filterPixelToShort_8x12_ssse3);
376
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].p2s = PFX(filterPixelToShort_8x16_ssse3);
377
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].p2s = PFX(filterPixelToShort_8x32_ssse3);
378
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].p2s = PFX(filterPixelToShort_8x64_ssse3);
379
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].p2s = PFX(filterPixelToShort_12x32_ssse3);
380
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s = PFX(filterPixelToShort_16x8_ssse3);
381
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s = PFX(filterPixelToShort_16x16_ssse3);
382
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s = PFX(filterPixelToShort_16x24_ssse3);
383
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s = PFX(filterPixelToShort_16x32_ssse3);
384
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s = PFX(filterPixelToShort_16x64_ssse3);
385
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s = PFX(filterPixelToShort_24x64_ssse3);
386
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_ssse3);
387
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_ssse3);
388
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_ssse3);
389
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_ssse3);
390
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].p2s = PFX(filterPixelToShort_4x2_ssse3);
391
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s = PFX(filterPixelToShort_8x2_ssse3);
392
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s = PFX(filterPixelToShort_8x6_ssse3);
393
         p.findPosFirstLast = PFX(findPosFirstLast_ssse3);
394
         p.fix8Unpack = PFX(cutree_fix8_unpack_ssse3);
395
         p.fix8Pack = PFX(cutree_fix8_pack_ssse3);
396
     }
397
     if (cpuMask & X265_CPU_SSE4)
398
     {
399
+#if X86_64
400
         p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
401
         p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4);
402
         p.pelFilterChroma[0] = PFX(pelFilterChroma_V_sse4);
403
         p.pelFilterChroma[1] = PFX(pelFilterChroma_H_sse4);
404
-
405
         p.saoCuOrgE0 = PFX(saoCuOrgE0_sse4);
406
+#endif
407
         p.saoCuOrgE1 = PFX(saoCuOrgE1_sse4);
408
         p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_sse4);
409
         p.saoCuOrgE2[0] = PFX(saoCuOrgE2_sse4);
410
@@ -1123,6 +1206,68 @@
411
         CHROMA_422_ADDAVG(sse4);
412
 
413
         LUMA_FILTERS(sse4);
414
+
415
+#if X86_64
416
+        p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_sse4);
417
+        p.pu[LUMA_4x8].luma_hpp = PFX(interp_8tap_horiz_pp_4x8_sse4);
418
+        p.pu[LUMA_4x16].luma_hpp = PFX(interp_8tap_horiz_pp_4x16_sse4);
419
+        p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_sse4);
420
+        p.pu[LUMA_4x8].luma_hps = PFX(interp_8tap_horiz_ps_4x8_sse4);
421
+        p.pu[LUMA_4x16].luma_hps = PFX(interp_8tap_horiz_ps_4x16_sse4);
422
+#endif
423
+
424
+        p.pu[LUMA_8x8].luma_hpp = PFX(interp_8tap_horiz_pp_8x8_sse4);
425
+        p.pu[LUMA_16x16].luma_hpp = PFX(interp_8tap_horiz_pp_16x16_sse4);
426
+        p.pu[LUMA_32x32].luma_hpp = PFX(interp_8tap_horiz_pp_32x32_sse4);
427
+        p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_sse4);
428
+        p.pu[LUMA_8x4].luma_hpp = PFX(interp_8tap_horiz_pp_8x4_sse4);
429
+
430
+        p.pu[LUMA_16x8].luma_hpp = PFX(interp_8tap_horiz_pp_16x8_sse4);
431
+        p.pu[LUMA_8x16].luma_hpp = PFX(interp_8tap_horiz_pp_8x16_sse4);
432
+        p.pu[LUMA_16x32].luma_hpp = PFX(interp_8tap_horiz_pp_16x32_sse4);
433
+        p.pu[LUMA_32x16].luma_hpp = PFX(interp_8tap_horiz_pp_32x16_sse4);
434
+        p.pu[LUMA_64x32].luma_hpp = PFX(interp_8tap_horiz_pp_64x32_sse4);
435
+        p.pu[LUMA_32x64].luma_hpp = PFX(interp_8tap_horiz_pp_32x64_sse4);
436
+        p.pu[LUMA_16x12].luma_hpp = PFX(interp_8tap_horiz_pp_16x12_sse4);
437
+        p.pu[LUMA_12x16].luma_hpp = PFX(interp_8tap_horiz_pp_12x16_sse4);
438
+        p.pu[LUMA_16x4].luma_hpp = PFX(interp_8tap_horiz_pp_16x4_sse4);
439
+
440
+        p.pu[LUMA_32x24].luma_hpp = PFX(interp_8tap_horiz_pp_32x24_sse4);
441
+        p.pu[LUMA_24x32].luma_hpp = PFX(interp_8tap_horiz_pp_24x32_sse4);
442
+        p.pu[LUMA_32x8].luma_hpp = PFX(interp_8tap_horiz_pp_32x8_sse4);
443
+        p.pu[LUMA_8x32].luma_hpp = PFX(interp_8tap_horiz_pp_8x32_sse4);
444
+        p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_sse4);
445
+        p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_sse4);
446
+        p.pu[LUMA_64x16].luma_hpp = PFX(interp_8tap_horiz_pp_64x16_sse4);
447
+        p.pu[LUMA_16x64].luma_hpp = PFX(interp_8tap_horiz_pp_16x64_sse4);
448
+
449
+        p.pu[LUMA_8x8].luma_hps = PFX(interp_8tap_horiz_ps_8x8_sse4);
450
+        p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_sse4);
451
+        p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_sse4);
452
+        p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_sse4);
453
+        p.pu[LUMA_8x4].luma_hps = PFX(interp_8tap_horiz_ps_8x4_sse4);
454
+        p.pu[LUMA_16x8].luma_hps = PFX(interp_8tap_horiz_ps_16x8_sse4);
455
+        p.pu[LUMA_8x16].luma_hps = PFX(interp_8tap_horiz_ps_8x16_sse4);
456
+        p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_sse4);
457
+        p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_sse4);
458
+        p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_sse4);
459
+        p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_sse4);
460
+        p.pu[LUMA_16x12].luma_hps = PFX(interp_8tap_horiz_ps_16x12_sse4);
461
+        p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_sse4);
462
+        p.pu[LUMA_16x4].luma_hps = PFX(interp_8tap_horiz_ps_16x4_sse4);
463
+        p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_sse4);
464
+        p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_sse4);
465
+        p.pu[LUMA_32x8].luma_hps = PFX(interp_8tap_horiz_ps_32x8_sse4);
466
+        p.pu[LUMA_8x32].luma_hps = PFX(interp_8tap_horiz_ps_8x32_sse4);
467
+        p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_sse4);
468
+        p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_sse4);
469
+        p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_sse4);
470
+        p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_sse4);
471
+
472
+        ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, sse4); p.pu[LUMA_4x4].luma_vpp = PFX(interp_8tap_vert_pp_4x4_sse4);
473
+        ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, sse4); p.pu[LUMA_4x4].luma_vps = PFX(interp_8tap_vert_ps_4x4_sse4);
474
+        ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, sse4); p.pu[LUMA_4x4].luma_vsp = PFX(interp_8tap_vert_sp_4x4_sse4);
475
+        ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu); p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;
476
         CHROMA_420_HORIZ_FILTERS(sse4);
477
         CHROMA_420_VERT_FILTERS_SSE4(_sse4);
478
         CHROMA_422_HORIZ_FILTERS(_sse4);
479
@@ -1162,16 +1307,16 @@
480
 
481
         // TODO: check POPCNT flag!
482
         ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
483
-#if X265_DEPTH <= 10
484
+#if X86_64 && X265_DEPTH <= 10
485
         ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
486
 #endif
487
 
488
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s = PFX(filterPixelToShort_2x4_sse4);
489
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
490
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].p2s = PFX(filterPixelToShort_6x8_sse4);
491
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
492
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s = PFX(filterPixelToShort_2x16_sse4);
493
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
494
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s[NONALIGNED] = PFX(filterPixelToShort_2x4_sse4);
495
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s[NONALIGNED] = PFX(filterPixelToShort_2x8_sse4);
496
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].p2s[NONALIGNED] = PFX(filterPixelToShort_6x8_sse4);
497
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s[NONALIGNED] = PFX(filterPixelToShort_2x8_sse4);
498
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s[NONALIGNED] = PFX(filterPixelToShort_2x16_sse4);
499
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s[NONALIGNED] = PFX(filterPixelToShort_6x16_sse4);
500
         p.costCoeffRemain = PFX(costCoeffRemain_sse4);
501
 #if X86_64
502
         p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
503
@@ -1180,6 +1325,7 @@
504
         p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
505
 #endif
506
     }
507
+#if X86_64
508
     if (cpuMask & X265_CPU_AVX)
509
     {
510
         // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_avx); fails tests
511
@@ -1411,83 +1557,81 @@
512
         p.cu[BLOCK_32x32].intra_pred[32]    = PFX(intra_pred_ang32_32_avx2);
513
         p.cu[BLOCK_32x32].intra_pred[33]    = PFX(intra_pred_ang32_33_avx2);
514
         p.cu[BLOCK_32x32].intra_pred[34]    = PFX(intra_pred_ang32_2_avx2);
515
-
516
-        p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_12x16_avx2);
517
-        p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2);
518
-        p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2);
519
-        p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_avx2);
520
-        p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx2);
521
-        p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx2);
522
-        p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx2);
523
-        p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_24x32_avx2);
524
-        p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_avx2);
525
-        p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_avx2);
526
-        p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_avx2);
527
-        p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_avx2);
528
-        p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_avx2);
529
-        p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx2);
530
-        p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx2);
531
-        p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx2);
532
-        p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx2);
533
-        p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx2);
534
-
535
-        p.pu[LUMA_8x4].addAvg   = PFX(addAvg_8x4_avx2);
536
-        p.pu[LUMA_8x8].addAvg   = PFX(addAvg_8x8_avx2);
537
-        p.pu[LUMA_8x16].addAvg  = PFX(addAvg_8x16_avx2);
538
-        p.pu[LUMA_8x32].addAvg  = PFX(addAvg_8x32_avx2);
539
-        p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_avx2);
540
-        p.pu[LUMA_16x4].addAvg  = PFX(addAvg_16x4_avx2);
541
-        p.pu[LUMA_16x8].addAvg  = PFX(addAvg_16x8_avx2);
542
-        p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_avx2);
543
-        p.pu[LUMA_16x16].addAvg = PFX(addAvg_16x16_avx2);
544
-        p.pu[LUMA_16x32].addAvg = PFX(addAvg_16x32_avx2);
545
-        p.pu[LUMA_16x64].addAvg = PFX(addAvg_16x64_avx2);
546
-        p.pu[LUMA_24x32].addAvg = PFX(addAvg_24x32_avx2);
547
-        p.pu[LUMA_32x8].addAvg  = PFX(addAvg_32x8_avx2);
548
-        p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_avx2);
549
-        p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_avx2);
550
-        p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_avx2);
551
-        p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_avx2);
552
-        p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_avx2);
553
-        p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_avx2);
554
-        p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx2);
555
-        p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx2);
556
-        p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx2);
557
-
558
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg   = PFX(addAvg_8x2_avx2);
559
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg   = PFX(addAvg_8x4_avx2);
560
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg   = PFX(addAvg_8x6_avx2);
561
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg   = PFX(addAvg_8x8_avx2);
562
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg  = PFX(addAvg_8x16_avx2);
563
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg  = PFX(addAvg_8x32_avx2);
564
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = PFX(addAvg_12x16_avx2);
565
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg  = PFX(addAvg_16x4_avx2);
566
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg  = PFX(addAvg_16x8_avx2);
567
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = PFX(addAvg_16x12_avx2);
568
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg = PFX(addAvg_16x16_avx2);
569
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg = PFX(addAvg_16x32_avx2);
570
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg  = PFX(addAvg_32x8_avx2);
571
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_avx2);
572
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_avx2);
573
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_avx2);
574
-
575
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg = PFX(addAvg_8x16_avx2);
576
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg = PFX(addAvg_16x32_avx2);
577
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_avx2);
578
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg = PFX(addAvg_8x8_avx2);
579
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg = PFX(addAvg_16x16_avx2);
580
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg = PFX(addAvg_8x32_avx2);
581
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_avx2);
582
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg = PFX(addAvg_16x64_avx2);
583
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg = PFX(addAvg_8x12_avx2);
584
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg = PFX(addAvg_8x4_avx2);
585
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg = PFX(addAvg_16x24_avx2);
586
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg = PFX(addAvg_16x8_avx2);
587
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg = PFX(addAvg_8x64_avx2);
588
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg = PFX(addAvg_24x64_avx2);
589
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg = PFX(addAvg_12x32_avx2);
590
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_avx2);
591
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_avx2);
592
+        ASSIGN2(p.pu[LUMA_12x16].pixelavg_pp, pixel_avg_12x16_avx2);
593
+        ASSIGN2(p.pu[LUMA_16x4].pixelavg_pp, pixel_avg_16x4_avx2);
594
+        ASSIGN2(p.pu[LUMA_16x8].pixelavg_pp, pixel_avg_16x8_avx2);
595
+        ASSIGN2(p.pu[LUMA_16x12].pixelavg_pp, pixel_avg_16x12_avx2);
596
+        ASSIGN2(p.pu[LUMA_16x16].pixelavg_pp, pixel_avg_16x16_avx2);
597
+        ASSIGN2(p.pu[LUMA_16x32].pixelavg_pp, pixel_avg_16x32_avx2);
598
+        ASSIGN2(p.pu[LUMA_16x64].pixelavg_pp, pixel_avg_16x64_avx2);
599
+        ASSIGN2(p.pu[LUMA_24x32].pixelavg_pp, pixel_avg_24x32_avx2);
600
+        ASSIGN2(p.pu[LUMA_32x8].pixelavg_pp, pixel_avg_32x8_avx2);
601
+        ASSIGN2(p.pu[LUMA_32x16].pixelavg_pp, pixel_avg_32x16_avx2);
602
+        ASSIGN2(p.pu[LUMA_32x24].pixelavg_pp, pixel_avg_32x24_avx2);
603
+        ASSIGN2(p.pu[LUMA_32x32].pixelavg_pp, pixel_avg_32x32_avx2);
604
+        ASSIGN2(p.pu[LUMA_32x64].pixelavg_pp, pixel_avg_32x64_avx2);
605
+        ASSIGN2(p.pu[LUMA_64x16].pixelavg_pp, pixel_avg_64x16_avx2);
606
+        ASSIGN2(p.pu[LUMA_64x32].pixelavg_pp, pixel_avg_64x32_avx2);
607
+        ASSIGN2(p.pu[LUMA_64x48].pixelavg_pp, pixel_avg_64x48_avx2);
608
+        ASSIGN2(p.pu[LUMA_64x64].pixelavg_pp, pixel_avg_64x64_avx2);
609
+        ASSIGN2(p.pu[LUMA_48x64].pixelavg_pp, pixel_avg_48x64_avx2);
610
+        ASSIGN2(p.pu[LUMA_8x4].addAvg, addAvg_8x4_avx2);
611
+        ASSIGN2(p.pu[LUMA_8x8].addAvg, addAvg_8x8_avx2);
612
+        ASSIGN2(p.pu[LUMA_8x16].addAvg, addAvg_8x16_avx2);
613
+        ASSIGN2(p.pu[LUMA_8x32].addAvg, addAvg_8x32_avx2);
614
+        ASSIGN2(p.pu[LUMA_12x16].addAvg, addAvg_12x16_avx2);
615
+        ASSIGN2(p.pu[LUMA_16x4].addAvg, addAvg_16x4_avx2);
616
+        ASSIGN2(p.pu[LUMA_16x8].addAvg, addAvg_16x8_avx2);
617
+        ASSIGN2(p.pu[LUMA_16x12].addAvg, addAvg_16x12_avx2);
618
+        ASSIGN2(p.pu[LUMA_16x16].addAvg, addAvg_16x16_avx2);
619
+        ASSIGN2(p.pu[LUMA_16x32].addAvg, addAvg_16x32_avx2);
620
+        ASSIGN2(p.pu[LUMA_16x64].addAvg, addAvg_16x64_avx2);
621
+        ASSIGN2(p.pu[LUMA_24x32].addAvg, addAvg_24x32_avx2);
622
+        ASSIGN2(p.pu[LUMA_32x8].addAvg, addAvg_32x8_avx2);
623
+        ASSIGN2(p.pu[LUMA_32x16].addAvg, addAvg_32x16_avx2);
624
+        ASSIGN2(p.pu[LUMA_32x24].addAvg, addAvg_32x24_avx2);
625
+        ASSIGN2(p.pu[LUMA_32x32].addAvg, addAvg_32x32_avx2);
626
+        ASSIGN2(p.pu[LUMA_32x64].addAvg, addAvg_32x64_avx2);
627
+        ASSIGN2(p.pu[LUMA_48x64].addAvg, addAvg_48x64_avx2);
628
+        ASSIGN2(p.pu[LUMA_64x16].addAvg, addAvg_64x16_avx2);
629
+        ASSIGN2(p.pu[LUMA_64x32].addAvg, addAvg_64x32_avx2);
630
+        ASSIGN2(p.pu[LUMA_64x48].addAvg, addAvg_64x48_avx2);
631
+        ASSIGN2(p.pu[LUMA_64x64].addAvg, addAvg_64x64_avx2);
632
+
633
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg, addAvg_8x2_avx2);
634
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg, addAvg_8x4_avx2);
635
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg, addAvg_8x6_avx2);
636
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg, addAvg_8x8_avx2);
637
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg, addAvg_8x16_avx2);
638
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg, addAvg_8x32_avx2);
639
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg, addAvg_12x16_avx2);
640
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg, addAvg_16x4_avx2);
641
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg, addAvg_16x8_avx2);
642
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg, addAvg_16x12_avx2);
643
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg, addAvg_16x16_avx2);
644
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg, addAvg_16x32_avx2);
645
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg, addAvg_32x8_avx2);
646
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg, addAvg_32x16_avx2);
647
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg, addAvg_32x24_avx2);
648
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg, addAvg_32x32_avx2);
649
+
650
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg, addAvg_8x16_avx2);
651
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg, addAvg_16x32_avx2);
652
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg, addAvg_32x64_avx2);
653
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg, addAvg_8x8_avx2);
654
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg,addAvg_16x16_avx2);
655
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg, addAvg_8x32_avx2);
656
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg, addAvg_32x32_avx2);
657
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg, addAvg_16x64_avx2);
658
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg, addAvg_8x12_avx2);
659
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg, addAvg_8x4_avx2);
660
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg, addAvg_16x24_avx2);
661
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg, addAvg_16x8_avx2);
662
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg, addAvg_8x64_avx2);
663
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg, addAvg_24x64_avx2);
664
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg, addAvg_12x32_avx2);
665
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg, addAvg_32x16_avx2);
666
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg, addAvg_32x48_avx2);
667
 
668
         p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_avx2);
669
         p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
670
@@ -1537,9 +1681,8 @@
671
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_avx2);
672
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx2);
673
 
674
-        p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
675
-        p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
676
-
677
+        ASSIGN2( p.cu[BLOCK_16x16].ssd_s,pixel_ssd_s_16_avx2);
678
+        ASSIGN2( p.cu[BLOCK_32x32].ssd_s,pixel_ssd_s_32_avx2);
679
         p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_16x16_avx2);
680
         p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_32x32_avx2);
681
         p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_64x64_avx2);
682
@@ -1555,7 +1698,7 @@
683
         p.idst4x4 = PFX(idst4_avx2);
684
         p.denoiseDct = PFX(denoise_dct_avx2);
685
 
686
-        p.scale1D_128to64 = PFX(scale1D_128to64_avx2);
687
+        ASSIGN2(p.scale1D_128to64, scale1D_128to64_avx2);
688
         p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
689
 
690
         p.weight_pp = PFX(weight_pp_avx2);
691
@@ -1563,16 +1706,15 @@
692
         p.sign = PFX(calSign_avx2);
693
         p.planecopy_cp = PFX(upShift_8_avx2);
694
 
695
-        p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);
696
-        p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx2);
697
-
698
-        p.cu[BLOCK_16x16].blockfill_s = PFX(blockfill_s_16x16_avx2);
699
-        p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_avx2);
700
+        ASSIGN2(p.cu[BLOCK_16x16].calcresidual, getResidual16_avx2);
701
+        ASSIGN2(p.cu[BLOCK_32x32].calcresidual, getResidual32_avx2);
702
 
703
+        ASSIGN2(p.cu[BLOCK_16x16].blockfill_s, blockfill_s_16x16_avx2);
704
+        ASSIGN2(p.cu[BLOCK_32x32].blockfill_s, blockfill_s_32x32_avx2);
705
         ALL_LUMA_TU(count_nonzero, count_nonzero, avx2);
706
-        ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, avx2);
707
+        ALL_LUMA_TU_S(cpy1Dto2D_shl[ALIGNED], cpy1Dto2D_shl_, avx2);
708
+        ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, avx2);
709
         ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2);
710
-
711
         p.cu[BLOCK_8x8].copy_cnt = PFX(copy_cnt_8_avx2);
712
         p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx2);
713
         p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx2);
714
@@ -1596,13 +1738,13 @@
715
         ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, avx2);
716
         p.pu[LUMA_4x4].luma_vsp = PFX(interp_8tap_vert_sp_4x4_avx2);               // since ALL_LUMA_PU didn't declare 4x4 size, calling separately luma_vsp function to use 
717
 
718
-        p.cu[BLOCK_16x16].add_ps = PFX(pixel_add_ps_16x16_avx2);
719
-        p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
720
-        p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx2);
721
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps = PFX(pixel_add_ps_16x16_avx2);
722
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
723
-        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps = PFX(pixel_add_ps_16x32_avx2);
724
-        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx2);
725
+        ASSIGN2(p.cu[BLOCK_16x16].add_ps, pixel_add_ps_16x16_avx2);
726
+        ASSIGN2(p.cu[BLOCK_32x32].add_ps, pixel_add_ps_32x32_avx2);
727
+        ASSIGN2(p.cu[BLOCK_64x64].add_ps, pixel_add_ps_64x64_avx2);
728
+        ASSIGN2(p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps, pixel_add_ps_16x16_avx2);
729
+        ASSIGN2(p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps, pixel_add_ps_32x32_avx2);
730
+        ASSIGN2(p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps, pixel_add_ps_16x32_avx2);
731
+        ASSIGN2(p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps, pixel_add_ps_32x64_avx2);
732
 
733
         p.cu[BLOCK_16x16].sub_ps = PFX(pixel_sub_ps_16x16_avx2);
734
         p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx2);
735
@@ -1663,44 +1805,45 @@
736
         p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx2);
737
         p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx2);
738
 
739
-        p.pu[LUMA_16x4].convert_p2s = PFX(filterPixelToShort_16x4_avx2);
740
-        p.pu[LUMA_16x8].convert_p2s = PFX(filterPixelToShort_16x8_avx2);
741
-        p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_avx2);
742
-        p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_avx2);
743
-        p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_avx2);
744
-        p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_avx2);
745
-        p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_avx2);
746
-        p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_avx2);
747
-        p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx2);
748
-        p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx2);
749
-        p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx2);
750
-        p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_avx2);
751
-        p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx2);
752
-        p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx2);
753
-        p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx2);
754
-        p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_avx2);
755
-        p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_avx2);
756
-
757
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].p2s = PFX(filterPixelToShort_16x4_avx2);
758
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s = PFX(filterPixelToShort_16x8_avx2);
759
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s = PFX(filterPixelToShort_16x12_avx2);
760
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s = PFX(filterPixelToShort_16x16_avx2);
761
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s = PFX(filterPixelToShort_16x32_avx2);
762
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].p2s = PFX(filterPixelToShort_24x32_avx2);
763
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx2);
764
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx2);
765
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx2);
766
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_avx2);
767
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s = PFX(filterPixelToShort_16x8_avx2);
768
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s = PFX(filterPixelToShort_16x16_avx2);
769
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s = PFX(filterPixelToShort_16x24_avx2);
770
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s = PFX(filterPixelToShort_16x32_avx2);
771
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s = PFX(filterPixelToShort_16x64_avx2);
772
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s = PFX(filterPixelToShort_24x64_avx2);
773
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_avx2);
774
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx2);
775
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx2);
776
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx2);
777
+        ASSIGN2(p.pu[LUMA_16x4].convert_p2s, filterPixelToShort_16x4_avx2);
778
+        ASSIGN2(p.pu[LUMA_16x8].convert_p2s, filterPixelToShort_16x8_avx2);
779
+        ASSIGN2(p.pu[LUMA_16x12].convert_p2s, filterPixelToShort_16x12_avx2);
780
+        ASSIGN2(p.pu[LUMA_16x16].convert_p2s, filterPixelToShort_16x16_avx2);
781
+        ASSIGN2(p.pu[LUMA_16x32].convert_p2s, filterPixelToShort_16x32_avx2);
782
+        ASSIGN2(p.pu[LUMA_16x64].convert_p2s, filterPixelToShort_16x64_avx2);
783
+        ASSIGN2(p.pu[LUMA_32x8].convert_p2s, filterPixelToShort_32x8_avx2);
784
+        ASSIGN2(p.pu[LUMA_32x16].convert_p2s, filterPixelToShort_32x16_avx2);
785
+        ASSIGN2(p.pu[LUMA_32x24].convert_p2s, filterPixelToShort_32x24_avx2);
786
+        ASSIGN2(p.pu[LUMA_32x32].convert_p2s, filterPixelToShort_32x32_avx2);
787
+        ASSIGN2(p.pu[LUMA_32x64].convert_p2s, filterPixelToShort_32x64_avx2);
788
+        ASSIGN2(p.pu[LUMA_64x16].convert_p2s, filterPixelToShort_64x16_avx2);
789
+        ASSIGN2(p.pu[LUMA_64x32].convert_p2s, filterPixelToShort_64x32_avx2);
790
+        ASSIGN2(p.pu[LUMA_64x48].convert_p2s, filterPixelToShort_64x48_avx2);
791
+        ASSIGN2(p.pu[LUMA_64x64].convert_p2s, filterPixelToShort_64x64_avx2);
792
+        ASSIGN2(p.pu[LUMA_24x32].convert_p2s, filterPixelToShort_24x32_avx2);
793
+        ASSIGN2(p.pu[LUMA_48x64].convert_p2s, filterPixelToShort_48x64_avx2);
794
+
795
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].p2s, filterPixelToShort_16x4_avx2);
796
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s, filterPixelToShort_16x8_avx2);
797
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s, filterPixelToShort_16x12_avx2);
798
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s, filterPixelToShort_16x16_avx2);
799
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s, filterPixelToShort_16x32_avx2);
800
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].p2s, filterPixelToShort_24x32_avx2);
801
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s, filterPixelToShort_32x8_avx2);
802
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s, filterPixelToShort_32x16_avx2);
803
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s, filterPixelToShort_32x24_avx2);
804
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s, filterPixelToShort_32x32_avx2);
805
+
806
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s, filterPixelToShort_16x8_avx2);
807
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s, filterPixelToShort_16x16_avx2);
808
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s, filterPixelToShort_16x24_avx2);
809
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s, filterPixelToShort_16x32_avx2);
810
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s, filterPixelToShort_16x64_avx2);
811
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s, filterPixelToShort_24x64_avx2);
812
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s, filterPixelToShort_32x16_avx2);
813
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s, filterPixelToShort_32x32_avx2);
814
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s, filterPixelToShort_32x48_avx2);
815
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s, filterPixelToShort_32x64_avx2);
816
 
817
         p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_avx2);
818
         p.pu[LUMA_4x8].luma_hps = PFX(interp_8tap_horiz_ps_4x8_avx2);
819
@@ -2167,6 +2310,14 @@
820
         p.integral_inith[INTEGRAL_8] = PFX(integral8h_avx2);
821
         p.integral_inith[INTEGRAL_12] = PFX(integral12h_avx2);
822
         p.integral_inith[INTEGRAL_16] = PFX(integral16h_avx2);
823
+        p.cu[BLOCK_4x4].nonPsyRdoQuant = PFX(nonPsyRdoQuant4_avx2);
824
+        p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx2);
825
+        p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx2);
826
+        p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx2);
827
+        p.cu[BLOCK_4x4].psyRdoQuant_1p = PFX(psyRdoQuant_1p4_avx2);
828
+        p.cu[BLOCK_8x8].psyRdoQuant_1p = PFX(psyRdoQuant_1p8_avx2);
829
+        p.cu[BLOCK_16x16].psyRdoQuant_1p = PFX(psyRdoQuant_1p16_avx2);
830
+        p.cu[BLOCK_32x32].psyRdoQuant_1p = PFX(psyRdoQuant_1p32_avx2);
831
 
832
         /* TODO: This kernel needs to be modified to work with HIGH_BIT_DEPTH only 
833
         p.planeClipAndMax = PFX(planeClipAndMax_avx2); */
834
@@ -2188,6 +2339,844 @@
835
             p.costCoeffNxN = PFX(costCoeffNxN_avx2_bmi2);
836
         }
837
     }
838
+    if (cpuMask & X265_CPU_AVX512)
839
+    {
840
+        p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512);
841
+        p.cu[BLOCK_32x32].calcresidual[NONALIGNED] = PFX(getResidual32_avx512);
842
+        p.cu[BLOCK_32x32].calcresidual[ALIGNED] = PFX(getResidual_aligned32_avx512);
843
+        p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx512);
844
+        p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
845
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
846
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx512);
847
+
848
+        p.cu[BLOCK_64x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_64x64_avx512);
849
+        p.cu[BLOCK_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_avx512);
850
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_avx512);
851
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x64_avx512);
852
+
853
+        p.cu[BLOCK_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_aligned_32x32_avx512);
854
+        p.cu[BLOCK_64x64].add_ps[ALIGNED] = PFX(pixel_add_ps_aligned_64x64_avx512);
855
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_aligned_32x32_avx512);
856
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[ALIGNED] = PFX(pixel_add_ps_aligned_32x64_avx512);
857
+
858
+        // 64 X N
859
+        p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
860
+        p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx512);
861
+        p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx512);
862
+        p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx512);
863
+        p.pu[LUMA_64x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x16_avx512);
864
+        p.cu[BLOCK_64x64].copy_ps = (copy_ps_t)PFX(blockcopy_ss_64x64_avx512);
865
+        p.cu[BLOCK_64x64].copy_sp = (copy_sp_t)PFX(blockcopy_ss_64x64_avx512);
866
+
867
+        // 32 X N
868
+        p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
869
+        p.pu[LUMA_32x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x64_avx512);
870
+        p.pu[LUMA_32x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x32_avx512);
871
+        p.pu[LUMA_32x24].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x24_avx512);
872
+        p.pu[LUMA_32x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x16_avx512);
873
+        p.pu[LUMA_32x8].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x8_avx512);
874
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x8_avx512);
875
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x16_avx512);
876
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x24_avx512);
877
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x32_avx512);
878
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x16_avx512);
879
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x32_avx512);
880
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x48_avx512);
881
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x64_avx512);
882
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
883
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_avx512);
884
+        p.cu[BLOCK_32x32].copy_ps = (copy_ps_t)PFX(blockcopy_ss_32x32_avx512);
885
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ps = (copy_ps_t)PFX(blockcopy_ss_32x32_avx512);
886
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ps = (copy_ps_t)PFX(blockcopy_ss_32x64_avx512);
887
+        p.cu[BLOCK_32x32].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x32_avx512);
888
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x32_avx512);
889
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x64_avx512);
890
+
891
+        p.pu[LUMA_64x16].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x16_avx512);
892
+        p.pu[LUMA_64x32].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x32_avx512);
893
+        p.pu[LUMA_64x48].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x48_avx512);
894
+        p.pu[LUMA_64x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x64_avx512);
895
+        p.pu[LUMA_32x8].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x8_avx512);
896
+        p.pu[LUMA_32x16].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x16_avx512);
897
+        p.pu[LUMA_32x24].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x24_avx512);
898
+        p.pu[LUMA_32x32].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x32_avx512);
899
+        p.pu[LUMA_32x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x64_avx512);
900
+        p.pu[LUMA_48x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_48x64_avx512);
901
+
902
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s[ALIGNED] = PFX(filterPixelToShort_2x4_sse4);
903
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s[ALIGNED] = PFX(filterPixelToShort_2x8_sse4);
904
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].p2s[ALIGNED] = PFX(filterPixelToShort_6x8_sse4);
905
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s[NONALIGNED] = PFX(filterPixelToShort_32x8_avx512);
906
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s[NONALIGNED] = PFX(filterPixelToShort_32x16_avx512);
907
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s[NONALIGNED] = PFX(filterPixelToShort_32x24_avx512);
908
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s[NONALIGNED] = PFX(filterPixelToShort_32x32_avx512);
909
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s[NONALIGNED] = PFX(filterPixelToShort_32x16_avx512);
910
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s[NONALIGNED] = PFX(filterPixelToShort_32x32_avx512);
911
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s[NONALIGNED] = PFX(filterPixelToShort_32x48_avx512);
912
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s[NONALIGNED] = PFX(filterPixelToShort_32x64_avx512);
913
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s[ALIGNED] = PFX(filterPixelToShort_2x8_sse4);
914
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s[ALIGNED] = PFX(filterPixelToShort_2x16_sse4);
915
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s[ALIGNED] = PFX(filterPixelToShort_6x16_sse4);
916
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s[NONALIGNED] = PFX(filterPixelToShort_32x8_avx512);
917
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s[NONALIGNED] = PFX(filterPixelToShort_32x16_avx512);
918
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s[NONALIGNED] = PFX(filterPixelToShort_32x24_avx512);
919
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s[NONALIGNED] = PFX(filterPixelToShort_32x32_avx512);
920
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].p2s[NONALIGNED] = PFX(filterPixelToShort_32x64_avx512);
921
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].p2s[NONALIGNED] = PFX(filterPixelToShort_64x16_avx512);
922
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s[NONALIGNED] = PFX(filterPixelToShort_64x32_avx512);
923
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s[NONALIGNED] = PFX(filterPixelToShort_64x48_avx512);
924
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s[NONALIGNED] = PFX(filterPixelToShort_64x64_avx512);
925
+
926
+        p.pu[LUMA_64x16].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x16_avx512);
927
+        p.pu[LUMA_64x32].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x32_avx512);
928
+        p.pu[LUMA_64x48].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x48_avx512);
929
+        p.pu[LUMA_64x64].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x64_avx512);
930
+        p.pu[LUMA_32x8].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x8_avx512);
931
+        p.pu[LUMA_32x16].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x16_avx512);
932
+        p.pu[LUMA_32x24].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x24_avx512);
933
+        p.pu[LUMA_32x32].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x32_avx512);
934
+        p.pu[LUMA_32x64].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x64_avx512);
935
+        p.pu[LUMA_48x64].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_48x64_avx512);
936
+
937
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x8_avx512);
938
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x16_avx512);
939
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x24_avx512);
940
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x32_avx512);
941
+
942
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x16_avx512);
943
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x32_avx512);
944
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x48_avx512);
945
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x64_avx512);
946
+
947
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x8_avx512);
948
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x16_avx512);
949
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x24_avx512);
950
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x32_avx512);
951
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x64_avx512);
952
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x16_avx512);
953
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x32_avx512);
954
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x48_avx512);
955
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x64_avx512);
956
+        p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32_avx512);
957
+        p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_aligned_32_avx512);
958
+        p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16_avx512);
959
+        p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_aligned_16_avx512);
960
+        p.pu[LUMA_16x32].sad = PFX(pixel_sad_16x32_avx512);
961
+        p.pu[LUMA_16x64].sad = PFX(pixel_sad_16x64_avx512);
962
+        p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512);
963
+        p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512);
964
+        p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512);
965
+        p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512);
966
+        p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512);
967
+        p.pu[LUMA_48x64].sad = PFX(pixel_sad_48x64_avx512);
968
+        p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512);
969
+        p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512);
970
+        p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512);
971
+        p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512);
972
+
973
+        p.pu[LUMA_64x16].addAvg[NONALIGNED] = PFX(addAvg_64x16_avx512);
974
+        p.pu[LUMA_64x32].addAvg[NONALIGNED] = PFX(addAvg_64x32_avx512);
975
+        p.pu[LUMA_64x48].addAvg[NONALIGNED] = PFX(addAvg_64x48_avx512);
976
+        p.pu[LUMA_64x64].addAvg[NONALIGNED] = PFX(addAvg_64x64_avx512);
977
+        p.pu[LUMA_32x8].addAvg[NONALIGNED] = PFX(addAvg_32x8_avx512);
978
+        p.pu[LUMA_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_avx512);
979
+        p.pu[LUMA_32x24].addAvg[NONALIGNED] = PFX(addAvg_32x24_avx512);
980
+        p.pu[LUMA_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_avx512);
981
+        p.pu[LUMA_32x64].addAvg[NONALIGNED] = PFX(addAvg_32x64_avx512);
982
+        p.pu[LUMA_16x4].addAvg[NONALIGNED] = PFX(addAvg_16x4_avx512);
983
+        p.pu[LUMA_16x8].addAvg[NONALIGNED] = PFX(addAvg_16x8_avx512);
984
+        p.pu[LUMA_16x12].addAvg[NONALIGNED] = PFX(addAvg_16x12_avx512);
985
+        p.pu[LUMA_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_avx512);
986
+        p.pu[LUMA_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_avx512);
987
+        p.pu[LUMA_16x64].addAvg[NONALIGNED] = PFX(addAvg_16x64_avx512);
988
+        p.pu[LUMA_48x64].addAvg[NONALIGNED] = PFX(addAvg_48x64_avx512);
989
+
990
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg[NONALIGNED] = PFX(addAvg_32x8_avx512);
991
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_avx512);
992
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg[NONALIGNED] = PFX(addAvg_32x24_avx512);
993
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_avx512);
994
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg[NONALIGNED] = PFX(addAvg_16x4_avx512);
995
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg[NONALIGNED] = PFX(addAvg_16x8_avx512);
996
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg[NONALIGNED] = PFX(addAvg_16x12_avx512);
997
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_avx512);
998
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_avx512);
999
+
1000
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_avx512);
1001
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_avx512);
1002
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg[NONALIGNED] = PFX(addAvg_32x48_avx512);
1003
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg[NONALIGNED] = PFX(addAvg_32x64_avx512);
1004
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_avx512);
1005
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_avx512);
1006
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg[NONALIGNED] = PFX(addAvg_16x64_avx512);
1007
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg[NONALIGNED] = PFX(addAvg_16x24_avx512);
1008
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg[NONALIGNED] = PFX(addAvg_16x8_avx512);
1009
+        p.pu[LUMA_32x8].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x8_avx512);
1010
+        p.pu[LUMA_32x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x16_avx512);
1011
+        p.pu[LUMA_32x24].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x24_avx512);
1012
+        p.pu[LUMA_32x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x32_avx512);
1013
+        p.pu[LUMA_32x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_32x64_avx512);
1014
+        p.pu[LUMA_64x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x16_avx512);
1015
+        p.pu[LUMA_64x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x32_avx512);
1016
+        p.pu[LUMA_64x48].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x48_avx512);
1017
+        p.pu[LUMA_64x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_64x64_avx512);
1018
+        p.pu[LUMA_48x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_48x64_avx512);
1019
+
1020
+        p.pu[LUMA_32x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_32x8_avx512);
1021
+        p.pu[LUMA_32x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_32x16_avx512);
1022
+        p.pu[LUMA_32x24].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_32x24_avx512);
1023
+        p.pu[LUMA_32x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_32x32_avx512);
1024
+        p.pu[LUMA_32x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_32x64_avx512);
1025
+        p.pu[LUMA_48x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_48x64_avx512);
1026
+        p.pu[LUMA_64x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_64x16_avx512);
1027
+        p.pu[LUMA_64x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_64x32_avx512);
1028
+        p.pu[LUMA_64x48].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_64x48_avx512);
1029
+        p.pu[LUMA_64x64].pixelavg_pp[ALIGNED] = PFX(pixel_avg_aligned_64x64_avx512);
1030
+        p.pu[LUMA_16x8].sad_x3 = PFX(pixel_sad_x3_16x8_avx512);
1031
+        p.pu[LUMA_16x12].sad_x3 = PFX(pixel_sad_x3_16x12_avx512);
1032
+        p.pu[LUMA_16x16].sad_x3 = PFX(pixel_sad_x3_16x16_avx512);
1033
+        p.pu[LUMA_16x32].sad_x3 = PFX(pixel_sad_x3_16x32_avx512);
1034
+        p.pu[LUMA_16x64].sad_x3 = PFX(pixel_sad_x3_16x64_avx512);
1035
+        p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx512);
1036
+        p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx512);
1037
+        p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512);
1038
+        p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512);
1039
+        p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512);
1040
+        //p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx512);
1041
+        p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_avx512);
1042
+        p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx512);
1043
+        p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512);
1044
+        p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx512);
1045
+
1046
+        p.pu[LUMA_16x8].sad_x4 = PFX(pixel_sad_x4_16x8_avx512);
1047
+        p.pu[LUMA_16x12].sad_x4 = PFX(pixel_sad_x4_16x12_avx512);
1048
+        p.pu[LUMA_16x16].sad_x4 = PFX(pixel_sad_x4_16x16_avx512);
1049
+        p.pu[LUMA_16x32].sad_x4 = PFX(pixel_sad_x4_16x32_avx512);
1050
+        p.pu[LUMA_16x64].sad_x4 = PFX(pixel_sad_x4_16x64_avx512);
1051
+        p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx512);
1052
+        p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512);
1053
+        p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512);
1054
+        p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512);
1055
+        p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512);
1056
+        //p.pu[LUMA_48x64].sad_x4 = PFX(pixel_sad_x4_48x64_avx512);
1057
+        p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx512);
1058
+        p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512);
1059
+        p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512);
1060
+        p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx512);
1061
+        p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512);
1062
+        p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
1063
+        p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32_avx512);
1064
+        p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_aligned_32_avx512);
1065
+        p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16_avx512);
1066
+        p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32_avx512);
1067
+
1068
+        p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16_avx512);
1069
+        p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx512);
1070
+
1071
+        p.weight_pp = PFX(weight_pp_avx512);
1072
+        p.weight_sp = PFX(weight_sp_avx512);
1073
+        p.dequant_normal = PFX(dequant_normal_avx512);
1074
+        p.dequant_scaling = PFX(dequant_scaling_avx512);
1075
+        p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
1076
+        p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
1077
+
1078
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_hpp = PFX(interp_4tap_horiz_pp_8x4_avx512);
1079
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hpp = PFX(interp_4tap_horiz_pp_8x8_avx512);
1080
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_hpp = PFX(interp_4tap_horiz_pp_8x16_avx512);
1081
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_hpp = PFX(interp_4tap_horiz_pp_8x32_avx512);
1082
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx512);
1083
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512);
1084
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx512);
1085
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512);
1086
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512);
1087
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512);
1088
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
1089
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512);
1090
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
1091
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_hpp = PFX(interp_4tap_horiz_pp_24x32_avx512);
1092
+
1093
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_hpp = PFX(interp_4tap_horiz_pp_8x4_avx512);
1094
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_hpp = PFX(interp_4tap_horiz_pp_8x8_avx512);
1095
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_hpp = PFX(interp_4tap_horiz_pp_8x12_avx512);
1096
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_hpp = PFX(interp_4tap_horiz_pp_8x16_avx512);
1097
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_hpp = PFX(interp_4tap_horiz_pp_8x32_avx512);
1098
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_hpp = PFX(interp_4tap_horiz_pp_8x64_avx512);
1099
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512);
1100
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512);
1101
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hpp = PFX(interp_4tap_horiz_pp_16x24_avx512);
1102
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512);
1103
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_avx512);
1104
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
1105
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
1106
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512);
1107
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512);
1108
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_hpp = PFX(interp_4tap_horiz_pp_24x64_avx512);
1109
+
1110
+        p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_hpp = PFX(interp_4tap_horiz_pp_8x4_avx512);
1111
+        p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_hpp = PFX(interp_4tap_horiz_pp_8x8_avx512);
1112
+        p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_hpp = PFX(interp_4tap_horiz_pp_8x16_avx512);
1113
+        p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_hpp = PFX(interp_4tap_horiz_pp_8x32_avx512);
1114
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx512);
1115
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512);
1116
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx512);
1117
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512);
1118
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512);
1119
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_avx512);
1120
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512);
1121
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
1122
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512);
1123
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
1124
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512);
1125
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = PFX(interp_4tap_horiz_pp_64x16_avx512);
1126
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512);
1127
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = PFX(interp_4tap_horiz_pp_64x48_avx512);
1128
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512);
1129
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = PFX(interp_4tap_horiz_pp_48x64_avx512);
1130
+        p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_hpp = PFX(interp_4tap_horiz_pp_24x32_avx512);
1131
+
1132
+        p.pu[LUMA_16x4].addAvg[ALIGNED] = PFX(addAvg_aligned_16x4_avx512);
1133
+        p.pu[LUMA_16x8].addAvg[ALIGNED] = PFX(addAvg_aligned_16x8_avx512);
1134
+        p.pu[LUMA_16x12].addAvg[ALIGNED] = PFX(addAvg_aligned_16x12_avx512);
1135
+        p.pu[LUMA_16x16].addAvg[ALIGNED] = PFX(addAvg_aligned_16x16_avx512);
1136
+        p.pu[LUMA_16x32].addAvg[ALIGNED] = PFX(addAvg_aligned_16x32_avx512);
1137
+        p.pu[LUMA_16x64].addAvg[ALIGNED] = PFX(addAvg_aligned_16x64_avx512);
1138
+        p.pu[LUMA_48x64].addAvg[ALIGNED] = PFX(addAvg_aligned_48x64_avx512);
1139
+        p.pu[LUMA_32x8].addAvg[ALIGNED] = PFX(addAvg_aligned_32x8_avx512);
1140
+        p.pu[LUMA_32x16].addAvg[ALIGNED] = PFX(addAvg_aligned_32x16_avx512);
1141
+        p.pu[LUMA_32x24].addAvg[ALIGNED] = PFX(addAvg_aligned_32x24_avx512);
1142
+        p.pu[LUMA_32x32].addAvg[ALIGNED] = PFX(addAvg_aligned_32x32_avx512);
1143
+        p.pu[LUMA_32x64].addAvg[ALIGNED] = PFX(addAvg_aligned_32x64_avx512);
1144
+        p.pu[LUMA_64x16].addAvg[ALIGNED] = PFX(addAvg_aligned_64x16_avx512);
1145
+        p.pu[LUMA_64x32].addAvg[ALIGNED] = PFX(addAvg_aligned_64x32_avx512);
1146
+        p.pu[LUMA_64x48].addAvg[ALIGNED] = PFX(addAvg_aligned_64x48_avx512);
1147
+        p.pu[LUMA_64x64].addAvg[ALIGNED] = PFX(addAvg_aligned_64x64_avx512);
1148
+
1149
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg[ALIGNED] = PFX(addAvg_aligned_16x4_avx512);
1150
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg[ALIGNED] = PFX(addAvg_aligned_16x8_avx512);
1151
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg[ALIGNED] = PFX(addAvg_aligned_16x12_avx512);
1152
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg[ALIGNED] = PFX(addAvg_aligned_16x16_avx512);
1153
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg[ALIGNED] = PFX(addAvg_aligned_16x32_avx512);
1154
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg[ALIGNED] = PFX(addAvg_aligned_32x8_avx512);
1155
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg[ALIGNED] = PFX(addAvg_aligned_32x16_avx512);
1156
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg[ALIGNED] = PFX(addAvg_aligned_32x24_avx512);
1157
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg[ALIGNED] = PFX(addAvg_aligned_32x32_avx512);
1158
+
1159
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg[ALIGNED] = PFX(addAvg_aligned_16x32_avx512);
1160
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg[ALIGNED] = PFX(addAvg_aligned_16x16_avx512);
1161
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg[ALIGNED] = PFX(addAvg_aligned_16x64_avx512);
1162
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg[ALIGNED] = PFX(addAvg_aligned_16x24_avx512);
1163
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg[ALIGNED] = PFX(addAvg_aligned_16x8_avx512);
1164
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg[ALIGNED] = PFX(addAvg_aligned_32x16_avx512);
1165
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg[ALIGNED] = PFX(addAvg_aligned_32x32_avx512);
1166
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg[ALIGNED] = PFX(addAvg_aligned_32x48_avx512);
1167
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg[ALIGNED] = PFX(addAvg_aligned_32x64_avx512);
1168
+        p.cu[BLOCK_32x32].blockfill_s[NONALIGNED] = PFX(blockfill_s_32x32_avx512);
1169
+        p.cu[BLOCK_32x32].blockfill_s[ALIGNED] = PFX(blockfill_s_aligned_32x32_avx512);
1170
+        p.pu[LUMA_8x4].luma_hpp = PFX(interp_8tap_horiz_pp_8x4_avx512);
1171
+        p.pu[LUMA_8x8].luma_hpp = PFX(interp_8tap_horiz_pp_8x8_avx512);
1172
+        p.pu[LUMA_8x16].luma_hpp = PFX(interp_8tap_horiz_pp_8x16_avx512);
1173
+        p.pu[LUMA_8x32].luma_hpp = PFX(interp_8tap_horiz_pp_8x32_avx512);
1174
+        p.pu[LUMA_16x4].luma_hpp = PFX(interp_8tap_horiz_pp_16x4_avx512);
1175
+        p.pu[LUMA_16x8].luma_hpp = PFX(interp_8tap_horiz_pp_16x8_avx512);
1176
+        p.pu[LUMA_16x12].luma_hpp = PFX(interp_8tap_horiz_pp_16x12_avx512);
1177
+        p.pu[LUMA_16x16].luma_hpp = PFX(interp_8tap_horiz_pp_16x16_avx512);
1178
+        p.pu[LUMA_16x32].luma_hpp = PFX(interp_8tap_horiz_pp_16x32_avx512);
1179
+        p.pu[LUMA_16x64].luma_hpp = PFX(interp_8tap_horiz_pp_16x64_avx512);
1180
+        p.pu[LUMA_24x32].luma_hpp = PFX(interp_8tap_horiz_pp_24x32_avx512);
1181
+        p.pu[LUMA_32x8].luma_hpp = PFX(interp_8tap_horiz_pp_32x8_avx512);
1182
+        p.pu[LUMA_32x16].luma_hpp = PFX(interp_8tap_horiz_pp_32x16_avx512);
1183
+        p.pu[LUMA_32x24].luma_hpp = PFX(interp_8tap_horiz_pp_32x24_avx512);
1184
+        p.pu[LUMA_32x32].luma_hpp = PFX(interp_8tap_horiz_pp_32x32_avx512);
1185
+        p.pu[LUMA_32x64].luma_hpp = PFX(interp_8tap_horiz_pp_32x64_avx512);
1186
+        p.pu[LUMA_64x16].luma_hpp = PFX(interp_8tap_horiz_pp_64x16_avx512);
1187
+        p.pu[LUMA_64x32].luma_hpp = PFX(interp_8tap_horiz_pp_64x32_avx512);
1188
+        p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_avx512);
1189
+        p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_avx512);
1190
+        p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx512);
1191
+
1192
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx512);
1193
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = PFX(interp_4tap_vert_pp_64x32_avx512);
1194
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vpp = PFX(interp_4tap_vert_pp_64x48_avx512);
1195
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vpp = PFX(interp_4tap_vert_pp_64x64_avx512);
1196
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vps = PFX(interp_4tap_vert_ps_64x16_avx512);
1197
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = PFX(interp_4tap_vert_ps_64x32_avx512);
1198
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = PFX(interp_4tap_vert_ps_64x48_avx512);
1199
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vps = PFX(interp_4tap_vert_ps_64x64_avx512);
1200
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vsp = PFX(interp_4tap_vert_sp_64x16_avx512);
1201
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vsp = PFX(interp_4tap_vert_sp_64x32_avx512);
1202
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = PFX(interp_4tap_vert_sp_64x48_avx512);
1203
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx512);
1204
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vss = PFX(interp_4tap_vert_ss_64x16_avx512);
1205
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vss = PFX(interp_4tap_vert_ss_64x32_avx512);
1206
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vss = PFX(interp_4tap_vert_ss_64x48_avx512);
1207
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vss = PFX(interp_4tap_vert_ss_64x64_avx512);
1208
+
1209
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vpp = PFX(interp_4tap_vert_pp_48x64_avx512);
1210
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vps = PFX(interp_4tap_vert_ps_48x64_avx512);
1211
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp = PFX(interp_4tap_vert_sp_48x64_avx512);
1212
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vss = PFX(interp_4tap_vert_ss_48x64_avx512);
1213
+
1214
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx512);
1215
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512);
1216
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512);
1217
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512);
1218
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx512);
1219
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx512);
1220
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512);
1221
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx512);
1222
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512);
1223
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx512);
1224
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vss = PFX(interp_4tap_vert_ss_32x8_avx512);
1225
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512);
1226
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vss = PFX(interp_4tap_vert_ss_32x24_avx512);
1227
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512);
1228
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vss = PFX(interp_4tap_vert_ss_32x64_avx512);
1229
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vsp = PFX(interp_4tap_vert_sp_32x8_avx512);
1230
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx512);
1231
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vsp = PFX(interp_4tap_vert_sp_32x24_avx512);
1232
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx512);
1233
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_avx512);
1234
+
1235
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx512);
1236
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512);
1237
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx512);
1238
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512);
1239
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512);
1240
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_avx512);
1241
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vps = PFX(interp_4tap_vert_ps_16x4_avx512);
1242
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx512);
1243
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vps = PFX(interp_4tap_vert_ps_16x12_avx512);
1244
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx512);
1245
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx512);
1246
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vps = PFX(interp_4tap_vert_ps_16x64_avx512);
1247
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vss = PFX(interp_4tap_vert_ss_16x4_avx512);
1248
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vss = PFX(interp_4tap_vert_ss_16x8_avx512);
1249
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vss = PFX(interp_4tap_vert_ss_16x12_avx512);
1250
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx512);
1251
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx512);
1252
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vss = PFX(interp_4tap_vert_ss_16x64_avx512);
1253
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vsp = PFX(interp_4tap_vert_sp_16x4_avx512);
1254
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_avx512);
1255
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vsp = PFX(interp_4tap_vert_sp_16x12_avx512);
1256
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_avx512);
1257
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_avx512);
1258
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vsp = PFX(interp_4tap_vert_sp_16x64_avx512);
1259
+
1260
+        p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vpp = PFX(interp_4tap_vert_pp_8x8_avx512);
1261
+        p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_avx512);
1262
+        p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_avx512);
1263
+        p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vps = PFX(interp_4tap_vert_ps_8x8_avx512);
1264
+        p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vps = PFX(interp_4tap_vert_ps_8x16_avx512);
1265
+        p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vps = PFX(interp_4tap_vert_ps_8x32_avx512);
1266
+        p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx512);
1267
+        p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx512);
1268
+        p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx512);
1269
+        p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vsp = PFX(interp_4tap_vert_sp_8x8_avx512);
1270
+        p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vsp = PFX(interp_4tap_vert_sp_8x16_avx512);
1271
+        p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vsp = PFX(interp_4tap_vert_sp_8x32_avx512);
1272
+
1273
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512);
1274
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512);
1275
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = PFX(interp_4tap_vert_pp_32x48_avx512);
1276
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx512);
1277
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512);
1278
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512);
1279
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vps = PFX(interp_4tap_vert_ps_32x48_avx512);
1280
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx512);
1281
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512);
1282
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512);
1283
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vss = PFX(interp_4tap_vert_ss_32x48_avx512);
1284
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vss = PFX(interp_4tap_vert_ss_32x64_avx512);
1285
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx512);
1286
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx512);
1287
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vsp = PFX(interp_4tap_vert_sp_32x48_avx512);
1288
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_avx512);
1289
+
1290
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512);
1291
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512);
1292
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = PFX(interp_4tap_vert_pp_16x24_avx512);
1293
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512);
1294
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_avx512);
1295
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx512);
1296
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx512);
1297
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vps = PFX(interp_4tap_vert_ps_16x24_avx512);
1298
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx512);
1299
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vps = PFX(interp_4tap_vert_ps_16x64_avx512);
1300
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vss = PFX(interp_4tap_vert_ss_16x8_avx512);
1301
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx512);
1302
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vss = PFX(interp_4tap_vert_ss_16x24_avx512);
1303
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx512);
1304
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vss = PFX(interp_4tap_vert_ss_16x64_avx512);
1305
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_avx512);
1306
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_avx512);
1307
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vsp = PFX(interp_4tap_vert_sp_16x24_avx512);
1308
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_avx512);
1309
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vsp = PFX(interp_4tap_vert_sp_16x64_avx512);
1310
+
1311
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vpp = PFX(interp_4tap_vert_pp_8x8_avx512);
1312
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_avx512);
1313
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_avx512);
1314
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vpp = PFX(interp_4tap_vert_pp_8x64_avx512);
1315
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vps = PFX(interp_4tap_vert_ps_8x8_avx512);
1316
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vps = PFX(interp_4tap_vert_ps_8x16_avx512);
1317
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vps = PFX(interp_4tap_vert_ps_8x32_avx512);
1318
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vps = PFX(interp_4tap_vert_ps_8x64_avx512);
1319
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx512);
1320
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx512);
1321
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx512);
1322
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vss = PFX(interp_4tap_vert_ss_8x64_avx512);
1323
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vsp = PFX(interp_4tap_vert_sp_8x8_avx512);
1324
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vsp = PFX(interp_4tap_vert_sp_8x16_avx512);
1325
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vsp = PFX(interp_4tap_vert_sp_8x32_avx512);
1326
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vsp = PFX(interp_4tap_vert_sp_8x64_avx512);
1327
+
1328
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx512);
1329
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512);
1330
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512);
1331
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512);
1332
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx512);
1333
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512);
1334
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx512);
1335
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512);
1336
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vss = PFX(interp_4tap_vert_ss_32x8_avx512);
1337
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512);
1338
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vss = PFX(interp_4tap_vert_ss_32x24_avx512);
1339
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512);
1340
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vsp = PFX(interp_4tap_vert_sp_32x8_avx512);
1341
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx512);
1342
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vsp = PFX(interp_4tap_vert_sp_32x24_avx512);
1343
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx512);
1344
+
1345
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx512);
1346
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512);
1347
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx512);
1348
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512);
1349
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512);
1350
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vps = PFX(interp_4tap_vert_ps_16x4_avx512);
1351
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx512);
1352
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vps = PFX(interp_4tap_vert_ps_16x12_avx512);
1353
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx512);
1354
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx512);
1355
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vss = PFX(interp_4tap_vert_ss_16x4_avx512);
1356
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vss = PFX(interp_4tap_vert_ss_16x8_avx512);
1357
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vss = PFX(interp_4tap_vert_ss_16x12_avx512);
1358
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx512);
1359
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx512);
1360
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vsp = PFX(interp_4tap_vert_sp_16x4_avx512);
1361
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_avx512);
1362
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vsp = PFX(interp_4tap_vert_sp_16x12_avx512);
1363
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_avx512);
1364
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_avx512);
1365
+
1366
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vpp = PFX(interp_4tap_vert_pp_8x8_avx512);
1367
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_avx512);
1368
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_avx512);
1369
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vps = PFX(interp_4tap_vert_ps_8x8_avx512);
1370
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vps = PFX(interp_4tap_vert_ps_8x16_avx512);
1371
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vps = PFX(interp_4tap_vert_ps_8x32_avx512);
1372
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx512);
1373
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx512);
1374
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx512);
1375
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vsp = PFX(interp_4tap_vert_sp_8x8_avx512);
1376
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vsp = PFX(interp_4tap_vert_sp_8x16_avx512);
1377
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vsp = PFX(interp_4tap_vert_sp_8x32_avx512);
1378
+
1379
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vpp = PFX(interp_4tap_vert_pp_24x32_avx512);
1380
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vpp = PFX(interp_4tap_vert_pp_24x64_avx512);
1381
+        p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vpp = PFX(interp_4tap_vert_pp_24x32_avx512);
1382
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vps = PFX(interp_4tap_vert_ps_24x32_avx512);
1383
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vps = PFX(interp_4tap_vert_ps_24x64_avx512);
1384
+        p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vps = PFX(interp_4tap_vert_ps_24x32_avx512);
1385
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vss = PFX(interp_4tap_vert_ss_24x32_avx512);
1386
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vss = PFX(interp_4tap_vert_ss_24x64_avx512);
1387
+        p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vss = PFX(interp_4tap_vert_ss_24x32_avx512);
1388
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vsp = PFX(interp_4tap_vert_sp_24x32_avx512);
1389
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vsp = PFX(interp_4tap_vert_sp_24x64_avx512);
1390
+        p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vsp = PFX(interp_4tap_vert_sp_24x32_avx512);
1391
+
1392
+        p.pu[LUMA_8x8].luma_vss = PFX(interp_8tap_vert_ss_8x8_avx512);
1393
+        p.pu[LUMA_8x16].luma_vss = PFX(interp_8tap_vert_ss_8x16_avx512);
1394
+        p.pu[LUMA_8x32].luma_vss = PFX(interp_8tap_vert_ss_8x32_avx512);
1395
+        p.pu[LUMA_16x4].luma_vss = PFX(interp_8tap_vert_ss_16x4_avx512);
1396
+        p.pu[LUMA_16x8].luma_vss = PFX(interp_8tap_vert_ss_16x8_avx512);
1397
+        p.pu[LUMA_16x12].luma_vss = PFX(interp_8tap_vert_ss_16x12_avx512);
1398
+        p.pu[LUMA_16x16].luma_vss = PFX(interp_8tap_vert_ss_16x16_avx512);
1399
+        p.pu[LUMA_16x32].luma_vss = PFX(interp_8tap_vert_ss_16x32_avx512);
1400
+        p.pu[LUMA_16x64].luma_vss = PFX(interp_8tap_vert_ss_16x64_avx512);
1401
+        p.pu[LUMA_24x32].luma_vss = PFX(interp_8tap_vert_ss_24x32_avx512);
1402
+        p.pu[LUMA_32x8].luma_vss = PFX(interp_8tap_vert_ss_32x8_avx512);
1403
+        p.pu[LUMA_32x16].luma_vss = PFX(interp_8tap_vert_ss_32x16_avx512);
1404
+        p.pu[LUMA_32x32].luma_vss = PFX(interp_8tap_vert_ss_32x32_avx512);
1405
+        p.pu[LUMA_32x24].luma_vss = PFX(interp_8tap_vert_ss_32x24_avx512);
1406
+        p.pu[LUMA_32x64].luma_vss = PFX(interp_8tap_vert_ss_32x64_avx512);
1407
+        p.pu[LUMA_64x16].luma_vss = PFX(interp_8tap_vert_ss_64x16_avx512);
1408
+        p.pu[LUMA_64x32].luma_vss = PFX(interp_8tap_vert_ss_64x32_avx512);
1409
+        p.pu[LUMA_64x48].luma_vss = PFX(interp_8tap_vert_ss_64x48_avx512);
1410
+        p.pu[LUMA_64x64].luma_vss = PFX(interp_8tap_vert_ss_64x64_avx512);
1411
+        p.pu[LUMA_48x64].luma_vss = PFX(interp_8tap_vert_ss_48x64_avx512);
1412
+
1413
+        p.pu[LUMA_8x8].luma_vsp = PFX(interp_8tap_vert_sp_8x8_avx512);
1414
+        p.pu[LUMA_8x16].luma_vsp = PFX(interp_8tap_vert_sp_8x16_avx512);
1415
+        p.pu[LUMA_8x32].luma_vsp = PFX(interp_8tap_vert_sp_8x32_avx512);
1416
+        p.pu[LUMA_16x4].luma_vsp = PFX(interp_8tap_vert_sp_16x4_avx512);
1417
+        p.pu[LUMA_16x8].luma_vsp = PFX(interp_8tap_vert_sp_16x8_avx512);
1418
+        p.pu[LUMA_16x12].luma_vsp = PFX(interp_8tap_vert_sp_16x12_avx512);
1419
+        p.pu[LUMA_16x16].luma_vsp = PFX(interp_8tap_vert_sp_16x16_avx512);
1420
+        p.pu[LUMA_16x32].luma_vsp = PFX(interp_8tap_vert_sp_16x32_avx512);
1421
+        p.pu[LUMA_16x64].luma_vsp = PFX(interp_8tap_vert_sp_16x64_avx512);
1422
+        p.pu[LUMA_24x32].luma_vsp = PFX(interp_8tap_vert_sp_24x32_avx512);
1423
+        p.pu[LUMA_32x8].luma_vsp = PFX(interp_8tap_vert_sp_32x8_avx512);
1424
+        p.pu[LUMA_32x16].luma_vsp = PFX(interp_8tap_vert_sp_32x16_avx512);
1425
+        p.pu[LUMA_32x32].luma_vsp = PFX(interp_8tap_vert_sp_32x32_avx512);
1426
+        p.pu[LUMA_32x24].luma_vsp = PFX(interp_8tap_vert_sp_32x24_avx512);
1427
+        p.pu[LUMA_32x64].luma_vsp = PFX(interp_8tap_vert_sp_32x64_avx512);
1428
+        p.pu[LUMA_64x16].luma_vsp = PFX(interp_8tap_vert_sp_64x16_avx512);
1429
+        p.pu[LUMA_64x32].luma_vsp = PFX(interp_8tap_vert_sp_64x32_avx512);
1430
+        p.pu[LUMA_64x48].luma_vsp = PFX(interp_8tap_vert_sp_64x48_avx512);
1431
+        p.pu[LUMA_64x64].luma_vsp = PFX(interp_8tap_vert_sp_64x64_avx512);
1432
+        p.pu[LUMA_48x64].luma_vsp = PFX(interp_8tap_vert_sp_48x64_avx512);
1433
+
1434
+        p.pu[LUMA_16x4].luma_vpp = PFX(interp_8tap_vert_pp_16x4_avx512);
1435
+        p.pu[LUMA_16x8].luma_vpp = PFX(interp_8tap_vert_pp_16x8_avx512);
1436
+        p.pu[LUMA_16x12].luma_vpp = PFX(interp_8tap_vert_pp_16x12_avx512);
1437
+        p.pu[LUMA_16x16].luma_vpp = PFX(interp_8tap_vert_pp_16x16_avx512);
1438
+        p.pu[LUMA_16x32].luma_vpp = PFX(interp_8tap_vert_pp_16x32_avx512);
1439
+        p.pu[LUMA_16x64].luma_vpp = PFX(interp_8tap_vert_pp_16x64_avx512);
1440
+        p.pu[LUMA_24x32].luma_vpp = PFX(interp_8tap_vert_pp_24x32_avx512);
1441
+        p.pu[LUMA_32x8].luma_vpp = PFX(interp_8tap_vert_pp_32x8_avx512);
1442
+        p.pu[LUMA_32x16].luma_vpp = PFX(interp_8tap_vert_pp_32x16_avx512);
1443
+        p.pu[LUMA_32x32].luma_vpp = PFX(interp_8tap_vert_pp_32x32_avx512);
1444
+        p.pu[LUMA_32x24].luma_vpp = PFX(interp_8tap_vert_pp_32x24_avx512);
1445
+        p.pu[LUMA_32x64].luma_vpp = PFX(interp_8tap_vert_pp_32x64_avx512);
1446
+        p.pu[LUMA_48x64].luma_vpp = PFX(interp_8tap_vert_pp_48x64_avx512);
1447
+        p.pu[LUMA_64x16].luma_vpp = PFX(interp_8tap_vert_pp_64x16_avx512);
1448
+        p.pu[LUMA_64x32].luma_vpp = PFX(interp_8tap_vert_pp_64x32_avx512);
1449
+        p.pu[LUMA_64x48].luma_vpp = PFX(interp_8tap_vert_pp_64x48_avx512);
1450
+        p.pu[LUMA_64x64].luma_vpp = PFX(interp_8tap_vert_pp_64x64_avx512);
1451
+
1452
+        p.pu[LUMA_16x4].luma_vps = PFX(interp_8tap_vert_ps_16x4_avx512);
1453
+        p.pu[LUMA_16x8].luma_vps = PFX(interp_8tap_vert_ps_16x8_avx512);
1454
+        p.pu[LUMA_16x12].luma_vps = PFX(interp_8tap_vert_ps_16x12_avx512);
1455
+        p.pu[LUMA_16x16].luma_vps = PFX(interp_8tap_vert_ps_16x16_avx512);
1456
+        p.pu[LUMA_16x32].luma_vps = PFX(interp_8tap_vert_ps_16x32_avx512);
1457
+        p.pu[LUMA_16x64].luma_vps = PFX(interp_8tap_vert_ps_16x64_avx512);
1458
+        p.pu[LUMA_24x32].luma_vps = PFX(interp_8tap_vert_ps_24x32_avx512);
1459
+        p.pu[LUMA_32x8].luma_vps = PFX(interp_8tap_vert_ps_32x8_avx512);
1460
+        p.pu[LUMA_32x16].luma_vps = PFX(interp_8tap_vert_ps_32x16_avx512);
1461
+        p.pu[LUMA_32x32].luma_vps = PFX(interp_8tap_vert_ps_32x32_avx512);
1462
+        p.pu[LUMA_32x24].luma_vps = PFX(interp_8tap_vert_ps_32x24_avx512);
1463
+        p.pu[LUMA_32x64].luma_vps = PFX(interp_8tap_vert_ps_32x64_avx512);
1464
+        p.pu[LUMA_48x64].luma_vps = PFX(interp_8tap_vert_ps_48x64_avx512);
1465
+        p.pu[LUMA_64x16].luma_vps = PFX(interp_8tap_vert_ps_64x16_avx512);
1466
+        p.pu[LUMA_64x32].luma_vps = PFX(interp_8tap_vert_ps_64x32_avx512);
1467
+        p.pu[LUMA_64x48].luma_vps = PFX(interp_8tap_vert_ps_64x48_avx512);
1468
+        p.pu[LUMA_64x64].luma_vps = PFX(interp_8tap_vert_ps_64x64_avx512);
1469
+
1470
+        p.cu[BLOCK_8x8].dct = PFX(dct8_avx512);
1471
+        /* TODO: Currently these kernels performance are similar to AVX2 version, we need a to improve them further to ebable
1472
+        * it. Probably a Vtune analysis will help here.
1473
+
1474
+        * p.cu[BLOCK_16x16].dct  = PFX(dct16_avx512);
1475
+        * p.cu[BLOCK_32x32].dct  = PFX(dct32_avx512); */
1476
+
1477
+        p.cu[BLOCK_8x8].idct = PFX(idct8_avx512);
1478
+        p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
1479
+        p.cu[BLOCK_32x32].idct = PFX(idct32_avx512);
1480
+        p.quant = PFX(quant_avx512);
1481
+        p.nquant = PFX(nquant_avx512);
1482
+        p.denoiseDct = PFX(denoise_dct_avx512);
1483
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
1484
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
1485
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512);
1486
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512);
1487
+
1488
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hps = PFX(interp_4tap_horiz_ps_32x64_avx512);
1489
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
1490
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hps = PFX(interp_4tap_horiz_ps_32x48_avx512);
1491
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
1492
+
1493
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
1494
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
1495
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hps = PFX(interp_4tap_horiz_ps_32x64_avx512);
1496
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512);
1497
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512);
1498
+
1499
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hps = PFX(interp_4tap_horiz_ps_64x64_avx512);
1500
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hps = PFX(interp_4tap_horiz_ps_64x48_avx512);
1501
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hps = PFX(interp_4tap_horiz_ps_64x32_avx512);
1502
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hps = PFX(interp_4tap_horiz_ps_64x16_avx512);
1503
+
1504
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx512);
1505
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx512);
1506
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx512);
1507
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hps = PFX(interp_4tap_horiz_ps_16x12_avx512);
1508
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hps = PFX(interp_4tap_horiz_ps_16x4_avx512);
1509
+
1510
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx512);
1511
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx512);
1512
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_hps = PFX(interp_4tap_horiz_ps_16x64_avx512);
1513
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hps = PFX(interp_4tap_horiz_ps_16x24_avx512);
1514
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx512);
1515
+
1516
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx512);
1517
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx512);
1518
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx512);
1519
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hps = PFX(interp_4tap_horiz_ps_16x12_avx512);
1520
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hps = PFX(interp_4tap_horiz_ps_16x4_avx512);
1521
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hps = PFX(interp_4tap_horiz_ps_16x64_avx512);
1522
+
1523
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hps = PFX(interp_4tap_horiz_ps_48x64_avx512);
1524
+
1525
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hps = PFX(interp_4tap_horiz_ps_8x8_avx512);
1526
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_hps = PFX(interp_4tap_horiz_ps_8x4_avx512);
1527
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_hps = PFX(interp_4tap_horiz_ps_8x16_avx512);
1528
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_hps = PFX(interp_4tap_horiz_ps_8x32_avx512);
1529
+
1530
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_hps = PFX(interp_4tap_horiz_ps_8x8_avx512);
1531
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_hps = PFX(interp_4tap_horiz_ps_8x16_avx512);
1532
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_hps = PFX(interp_4tap_horiz_ps_8x32_avx512);
1533
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_hps = PFX(interp_4tap_horiz_ps_8x12_avx512);
1534
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_hps = PFX(interp_4tap_horiz_ps_8x64_avx512);
1535
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_hps = PFX(interp_4tap_horiz_ps_8x4_avx512);
1536
+
1537
+        p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_hps = PFX(interp_4tap_horiz_ps_8x8_avx512);
1538
+        p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_hps = PFX(interp_4tap_horiz_ps_8x4_avx512);
1539
+        p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_hps = PFX(interp_4tap_horiz_ps_8x16_avx512);
1540
+        p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_hps = PFX(interp_4tap_horiz_ps_8x32_avx512);
1541
+
1542
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_hps = PFX(interp_4tap_horiz_ps_24x32_avx512);
1543
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_hps = PFX(interp_4tap_horiz_ps_24x64_avx512);
1544
+        p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_hps = PFX(interp_4tap_horiz_ps_24x32_avx512);
1545
+
1546
+        //Luma_hps_32xN
1547
+        p.pu[LUMA_32x8].luma_hps = PFX(interp_8tap_horiz_ps_32x8_avx512);
1548
+        p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_avx512);
1549
+        p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_avx512);
1550
+        p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_avx512);
1551
+        p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_avx512);
1552
+        //Luma_hps_64xN
1553
+        p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_avx512);
1554
+        p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_avx512);
1555
+        p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_avx512);
1556
+        p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_avx512);
1557
+        //Luma_hps_16xN
1558
+        p.pu[LUMA_16x4].luma_hps = PFX(interp_8tap_horiz_ps_16x4_avx512);
1559
+        p.pu[LUMA_16x8].luma_hps = PFX(interp_8tap_horiz_ps_16x8_avx512);
1560
+        p.pu[LUMA_16x12].luma_hps = PFX(interp_8tap_horiz_ps_16x12_avx512);
1561
+        p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_avx512);
1562
+        p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_avx512);
1563
+        p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_avx512);
1564
+        //Luma_hps_48x64
1565
+        p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx512);
1566
+        //Luma_hps_24x32
1567
+        p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_avx512);
1568
+       //Luma_hps_8xN
1569
+        p.pu[LUMA_8x4].luma_hps = PFX(interp_8tap_horiz_ps_8x4_avx512);
1570
+        p.pu[LUMA_8x8].luma_hps = PFX(interp_8tap_horiz_ps_8x8_avx512);
1571
+        p.pu[LUMA_8x16].luma_hps = PFX(interp_8tap_horiz_ps_8x16_avx512);
1572
+        p.pu[LUMA_8x32].luma_hps = PFX(interp_8tap_horiz_ps_8x32_avx512);
1573
+        p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_avx512);
1574
+        p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx512);
1575
+        p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_avx512);
1576
+        p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_avx512);
1577
+        p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx512);
1578
+        p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx512);
1579
+        p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx512);
1580
+        p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx512);
1581
+        p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx512);
1582
+        p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_avx512);
1583
+        p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_avx512);
1584
+        p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx512);
1585
+        p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx512);
1586
+        p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_avx512);
1587
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = PFX(pixel_satd_16x32_avx512);
1588
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_avx512);
1589
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_avx512);
1590
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_avx512);
1591
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_avx512);
1592
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_avx512);
1593
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_avx512);
1594
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = PFX(pixel_satd_16x64_avx512);
1595
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = PFX(pixel_satd_16x32_avx512);
1596
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_avx512);
1597
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_avx512);
1598
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_avx512);
1599
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512);
1600
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
1601
+
1602
+        p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx512);
1603
+        p.cu[BLOCK_32x32].intra_pred[2]      = PFX(intra_pred_ang32_2_avx512);
1604
+        p.cu[BLOCK_32x32].intra_pred[34]     = PFX(intra_pred_ang32_2_avx512);
1605
+        p.cu[BLOCK_32x32].intra_pred[9] = PFX(intra_pred_ang32_9_avx512);
1606
+        p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx512);
1607
+        p.cu[BLOCK_32x32].intra_pred[11]    = PFX(intra_pred_ang32_11_avx512);
1608
+        p.cu[BLOCK_32x32].intra_pred[18]    = PFX(intra_pred_ang32_18_avx512);
1609
+        p.cu[BLOCK_32x32].intra_pred[25]    = PFX(intra_pred_ang32_25_avx512);
1610
+        p.cu[BLOCK_32x32].intra_pred[26]    = PFX(intra_pred_ang32_26_avx512);
1611
+        p.cu[BLOCK_32x32].intra_pred[27]    = PFX(intra_pred_ang32_27_avx512);
1612
+        p.cu[BLOCK_32x32].intra_pred[5] = PFX(intra_pred_ang32_5_avx512);
1613
+        p.cu[BLOCK_32x32].intra_pred[31] = PFX(intra_pred_ang32_31_avx512);
1614
+        p.cu[BLOCK_32x32].intra_pred[32] = PFX(intra_pred_ang32_32_avx512);
1615
+        p.cu[BLOCK_32x32].intra_pred[4] = PFX(intra_pred_ang32_4_avx512);
1616
+        p.cu[BLOCK_32x32].intra_pred[30] = PFX(intra_pred_ang32_30_avx512);
1617
+        p.cu[BLOCK_32x32].intra_pred[6] = PFX(intra_pred_ang32_6_avx512);
1618
+        p.cu[BLOCK_32x32].intra_pred[29] = PFX(intra_pred_ang32_29_avx512);
1619
+        p.cu[BLOCK_32x32].intra_pred[7] = PFX(intra_pred_ang32_7_avx512);
1620
+        p.cu[BLOCK_32x32].intra_pred[8]    = PFX(intra_pred_ang32_8_avx512);
1621
+        p.cu[BLOCK_32x32].intra_pred[28]    = PFX(intra_pred_ang32_28_avx512);
1622
+        p.cu[BLOCK_16x16].intra_pred[9]     = PFX(intra_pred_ang16_9_avx512);
1623
+        p.cu[BLOCK_16x16].intra_pred[11]    = PFX(intra_pred_ang16_11_avx512);
1624
+        p.cu[BLOCK_16x16].intra_pred[25]    = PFX(intra_pred_ang16_25_avx512);
1625
+        p.cu[BLOCK_16x16].intra_pred[27]    = PFX(intra_pred_ang16_27_avx512);
1626
+        p.cu[BLOCK_16x16].intra_pred[8]     = PFX(intra_pred_ang16_8_avx512);
1627
+        p.cu[BLOCK_16x16].intra_pred[28]    = PFX(intra_pred_ang16_28_avx512);
1628
+        p.cu[BLOCK_16x16].intra_pred[5] = PFX(intra_pred_ang16_5_avx512);
1629
+        p.cu[BLOCK_16x16].intra_pred[31] = PFX(intra_pred_ang16_31_avx512);
1630
+        p.cu[BLOCK_16x16].intra_pred[4] = PFX(intra_pred_ang16_4_avx512);
1631
+        p.cu[BLOCK_16x16].intra_pred[32] = PFX(intra_pred_ang16_32_avx512);
1632
+        p.cu[BLOCK_16x16].intra_pred[6] = PFX(intra_pred_ang16_6_avx512);
1633
+        p.cu[BLOCK_16x16].intra_pred[30] = PFX(intra_pred_ang16_30_avx512);
1634
+        p.cu[BLOCK_16x16].intra_pred[7] = PFX(intra_pred_ang16_7_avx512);
1635
+        p.cu[BLOCK_16x16].intra_pred[29] = PFX(intra_pred_ang16_29_avx512);
1636
+        p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x64>;
1637
+        p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x48>;
1638
+        p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x32>;
1639
+        p.pu[LUMA_64x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x16>;
1640
+        p.pu[LUMA_32x8].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x8>;
1641
+        p.pu[LUMA_32x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x16>;
1642
+        p.pu[LUMA_32x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x32>;
1643
+        p.pu[LUMA_32x24].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x24>;
1644
+        p.pu[LUMA_32x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x64>;
1645
+        p.pu[LUMA_16x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x4>;
1646
+        p.pu[LUMA_16x8].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x8>;
1647
+        p.pu[LUMA_16x12].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x12>;
1648
+        p.pu[LUMA_16x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x16>;
1649
+        p.pu[LUMA_16x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x32>;
1650
+        p.pu[LUMA_16x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x64>;
1651
+        p.pu[LUMA_48x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_48x64>;
1652
+
1653
+        p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx512);
1654
+        p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx512);
1655
+        p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx512);
1656
+
1657
+        p.cu[BLOCK_4x4].nonPsyRdoQuant = PFX(nonPsyRdoQuant4_avx512);
1658
+        p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx512);
1659
+        p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx512);
1660
+        p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx512);
1661
+        p.cu[BLOCK_4x4].psyRdoQuant = PFX(psyRdoQuant4_avx512);
1662
+        p.cu[BLOCK_8x8].psyRdoQuant = PFX(psyRdoQuant8_avx512);
1663
+        p.cu[BLOCK_16x16].psyRdoQuant = PFX(psyRdoQuant16_avx512);
1664
+        p.cu[BLOCK_32x32].psyRdoQuant = PFX(psyRdoQuant32_avx512);
1665
+
1666
+        p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_32x32_avx512);
1667
+        p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_64x64_avx512);
1668
+        p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx512);
1669
+        p.cu[BLOCK_64x64].sse_pp = PFX(pixel_ssd_64x64_avx512);
1670
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x32_avx512);
1671
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x64_avx512);
1672
+        p.planecopy_sp_shl = PFX(upShift_16_avx512);
1673
+
1674
+    }
1675
+#endif
1676
 }
1677
 #else // if HIGH_BIT_DEPTH
1678
 
1679
@@ -2295,16 +3284,16 @@
1680
         //p.frameInitLowres = PFX(frame_init_lowres_core_mmx2);
1681
         p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
1682
 
1683
-        ALL_LUMA_TU(blockfill_s, blockfill_s, sse2);
1684
+        ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, sse2);
1685
+        ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, sse2);
1686
         ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2);
1687
         ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2);
1688
-        ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, sse2);
1689
+        ALL_LUMA_TU_S(cpy1Dto2D_shl[ALIGNED], cpy1Dto2D_shl_, sse2);
1690
+        ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, sse2);
1691
         ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2);
1692
-        ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2);
1693
-
1694
+        ALL_LUMA_TU_S(ssd_s[NONALIGNED], pixel_ssd_s_, sse2);
1695
         ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse2);
1696
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2);
1697
-
1698
         p.cu[BLOCK_4x4].intra_pred[2] = PFX(intra_pred_ang4_2_sse2);
1699
         p.cu[BLOCK_4x4].intra_pred[3] = PFX(intra_pred_ang4_3_sse2);
1700
         p.cu[BLOCK_4x4].intra_pred[4] = PFX(intra_pred_ang4_4_sse2);
1701
@@ -2339,9 +3328,8 @@
1702
         p.cu[BLOCK_4x4].intra_pred[33] = PFX(intra_pred_ang4_33_sse2);
1703
 
1704
         p.cu[BLOCK_4x4].intra_pred_allangs = PFX(all_angs_pred_4x4_sse2);
1705
-
1706
-        p.cu[BLOCK_4x4].calcresidual = PFX(getResidual4_sse2);
1707
-        p.cu[BLOCK_8x8].calcresidual = PFX(getResidual8_sse2);
1708
+        ASSIGN2(p.cu[BLOCK_4x4].calcresidual, getResidual4_sse2);
1709
+        ASSIGN2(p.cu[BLOCK_8x8].calcresidual, getResidual8_sse2);
1710
 
1711
         ALL_LUMA_TU_S(transpose, transpose, sse2);
1712
         p.cu[BLOCK_64x64].transpose = PFX(transpose64_sse2);
1713
@@ -2362,10 +3350,14 @@
1714
         p.dst4x4 = PFX(dst4_sse2);
1715
 
1716
         p.planecopy_sp = PFX(downShift_16_sse2);
1717
-        ALL_CHROMA_420_PU(p2s, filterPixelToShort, sse2);
1718
-        ALL_CHROMA_422_PU(p2s, filterPixelToShort, sse2);
1719
-        ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
1720
-        ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
1721
+        ALL_CHROMA_420_PU(p2s[NONALIGNED], filterPixelToShort, sse2);
1722
+        ALL_CHROMA_422_PU(p2s[NONALIGNED], filterPixelToShort, sse2);
1723
+        ALL_CHROMA_444_PU(p2s[NONALIGNED], filterPixelToShort, sse2);
1724
+        ALL_CHROMA_420_PU(p2s[ALIGNED], filterPixelToShort, sse2);
1725
+        ALL_CHROMA_422_PU(p2s[ALIGNED], filterPixelToShort, sse2);
1726
+        ALL_CHROMA_444_PU(p2s[ALIGNED], filterPixelToShort, sse2);
1727
+        ALL_LUMA_PU(convert_p2s[NONALIGNED], filterPixelToShort, sse2);
1728
+        ALL_LUMA_PU(convert_p2s[ALIGNED], filterPixelToShort, sse2);
1729
         ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
1730
         p.propagateCost = PFX(mbtree_propagate_cost_sse2);
1731
     }
1732
@@ -2411,64 +3403,61 @@
1733
         p.pu[LUMA_8x8].luma_hvpp = PFX(interp_8tap_hv_pp_8x8_ssse3);
1734
 
1735
         p.frameInitLowres = PFX(frame_init_lowres_core_ssse3);
1736
-        p.scale1D_128to64 = PFX(scale1D_128to64_ssse3);
1737
+        ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
1738
         p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
1739
 
1740
-        p.pu[LUMA_8x4].convert_p2s = PFX(filterPixelToShort_8x4_ssse3);
1741
-        p.pu[LUMA_8x8].convert_p2s = PFX(filterPixelToShort_8x8_ssse3);
1742
-        p.pu[LUMA_8x16].convert_p2s = PFX(filterPixelToShort_8x16_ssse3);
1743
-        p.pu[LUMA_8x32].convert_p2s = PFX(filterPixelToShort_8x32_ssse3);
1744
-        p.pu[LUMA_16x4].convert_p2s = PFX(filterPixelToShort_16x4_ssse3);
1745
-        p.pu[LUMA_16x8].convert_p2s = PFX(filterPixelToShort_16x8_ssse3);
1746
-        p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_ssse3);
1747
-        p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_ssse3);
1748
-        p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_ssse3);
1749
-        p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_ssse3);
1750
-        p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_ssse3);
1751
-        p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_ssse3);
1752
-        p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_ssse3);
1753
-        p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_ssse3);
1754
-        p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_ssse3);
1755
-        p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_ssse3);
1756
-        p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_ssse3);
1757
-        p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_ssse3);
1758
-        p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_ssse3);
1759
-        p.pu[LUMA_12x16].convert_p2s = PFX(filterPixelToShort_12x16_ssse3);
1760
-        p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_ssse3);
1761
-        p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_ssse3);
1762
-
1763
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s = PFX(filterPixelToShort_8x2_ssse3);
1764
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].p2s = PFX(filterPixelToShort_8x4_ssse3);
1765
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s = PFX(filterPixelToShort_8x6_ssse3);
1766
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].p2s = PFX(filterPixelToShort_8x8_ssse3);
1767
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].p2s = PFX(filterPixelToShort_8x16_ssse3);
1768
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].p2s = PFX(filterPixelToShort_8x32_ssse3);
1769
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].p2s = PFX(filterPixelToShort_16x4_ssse3);
1770
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s = PFX(filterPixelToShort_16x8_ssse3);
1771
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s = PFX(filterPixelToShort_16x12_ssse3);
1772
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s = PFX(filterPixelToShort_16x16_ssse3);
1773
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s = PFX(filterPixelToShort_16x32_ssse3);
1774
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_ssse3);
1775
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_ssse3);
1776
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_ssse3);
1777
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_ssse3);
1778
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].p2s = PFX(filterPixelToShort_8x4_ssse3);
1779
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].p2s = PFX(filterPixelToShort_8x8_ssse3);
1780
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].p2s = PFX(filterPixelToShort_8x12_ssse3);
1781
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].p2s = PFX(filterPixelToShort_8x16_ssse3);
1782
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].p2s = PFX(filterPixelToShort_8x32_ssse3);
1783
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].p2s = PFX(filterPixelToShort_8x64_ssse3);
1784
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].p2s = PFX(filterPixelToShort_12x32_ssse3);
1785
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s = PFX(filterPixelToShort_16x8_ssse3);
1786
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s = PFX(filterPixelToShort_16x16_ssse3);
1787
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s = PFX(filterPixelToShort_16x24_ssse3);
1788
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s = PFX(filterPixelToShort_16x32_ssse3);
1789
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s = PFX(filterPixelToShort_16x64_ssse3);
1790
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s = PFX(filterPixelToShort_24x64_ssse3);
1791
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_ssse3);
1792
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_ssse3);
1793
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_ssse3);
1794
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_ssse3);
1795
+        ASSIGN2(p.pu[LUMA_8x4].convert_p2s, filterPixelToShort_8x4_ssse3);
1796
+        ASSIGN2(p.pu[LUMA_8x8].convert_p2s, filterPixelToShort_8x8_ssse3);
1797
+        ASSIGN2(p.pu[LUMA_8x16].convert_p2s, filterPixelToShort_8x16_ssse3);
1798
+        ASSIGN2(p.pu[LUMA_8x32].convert_p2s, filterPixelToShort_8x32_ssse3);
1799
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s, filterPixelToShort_8x2_ssse3);
1800
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].p2s, filterPixelToShort_8x4_ssse3);
1801
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s, filterPixelToShort_8x6_ssse3);
1802
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].p2s, filterPixelToShort_8x8_ssse3);
1803
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].p2s, filterPixelToShort_8x16_ssse3);
1804
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].p2s, filterPixelToShort_8x32_ssse3);
1805
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].p2s, filterPixelToShort_8x4_ssse3);
1806
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].p2s, filterPixelToShort_8x8_ssse3);
1807
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].p2s, filterPixelToShort_8x12_ssse3);
1808
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].p2s, filterPixelToShort_8x16_ssse3);
1809
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].p2s, filterPixelToShort_8x32_ssse3);
1810
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].p2s, filterPixelToShort_8x64_ssse3);
1811
+
1812
+        ASSIGN2(p.pu[LUMA_16x4].convert_p2s, filterPixelToShort_16x4_ssse3);
1813
+        ASSIGN2(p.pu[LUMA_16x8].convert_p2s, filterPixelToShort_16x8_ssse3);
1814
+        ASSIGN2(p.pu[LUMA_16x12].convert_p2s, filterPixelToShort_16x12_ssse3);
1815
+        ASSIGN2(p.pu[LUMA_16x16].convert_p2s, filterPixelToShort_16x16_ssse3);
1816
+        ASSIGN2(p.pu[LUMA_16x32].convert_p2s, filterPixelToShort_16x32_ssse3);
1817
+        ASSIGN2(p.pu[LUMA_16x64].convert_p2s, filterPixelToShort_16x64_ssse3);
1818
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].p2s, filterPixelToShort_16x4_ssse3);
1819
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s, filterPixelToShort_16x8_ssse3);
1820
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s, filterPixelToShort_16x12_ssse3);
1821
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s, filterPixelToShort_16x16_ssse3);
1822
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s, filterPixelToShort_16x32_ssse3);
1823
+
1824
+        ASSIGN2(p.pu[LUMA_32x8].convert_p2s, filterPixelToShort_32x8_ssse3);
1825
+        ASSIGN2(p.pu[LUMA_32x16].convert_p2s, filterPixelToShort_32x16_ssse3);
1826
+        ASSIGN2(p.pu[LUMA_32x24].convert_p2s, filterPixelToShort_32x24_ssse3);
1827
+        ASSIGN2(p.pu[LUMA_32x32].convert_p2s, filterPixelToShort_32x32_ssse3);
1828
+        ASSIGN2(p.pu[LUMA_32x64].convert_p2s, filterPixelToShort_32x64_ssse3);
1829
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s, filterPixelToShort_32x8_ssse3);
1830
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s, filterPixelToShort_32x16_ssse3);
1831
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s, filterPixelToShort_32x24_ssse3);
1832
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s, filterPixelToShort_32x32_ssse3);
1833
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s, filterPixelToShort_32x16_ssse3);
1834
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s, filterPixelToShort_32x32_ssse3);
1835
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s, filterPixelToShort_32x48_ssse3);
1836
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s, filterPixelToShort_32x64_ssse3);
1837
+
1838
+        ASSIGN2(p.pu[LUMA_64x16].convert_p2s, filterPixelToShort_64x16_ssse3);
1839
+        ASSIGN2(p.pu[LUMA_64x32].convert_p2s, filterPixelToShort_64x32_ssse3);
1840
+        ASSIGN2(p.pu[LUMA_64x48].convert_p2s, filterPixelToShort_64x48_ssse3);
1841
+        ASSIGN2(p.pu[LUMA_64x64].convert_p2s, filterPixelToShort_64x64_ssse3);
1842
+        ASSIGN2(p.pu[LUMA_12x16].convert_p2s, filterPixelToShort_12x16_ssse3);
1843
+        ASSIGN2(p.pu[LUMA_24x32].convert_p2s, filterPixelToShort_24x32_ssse3);
1844
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s, filterPixelToShort_24x64_ssse3);
1845
+        ASSIGN2(p.pu[LUMA_48x64].convert_p2s, filterPixelToShort_48x64_ssse3);
1846
+
1847
         p.findPosFirstLast = PFX(findPosFirstLast_ssse3);
1848
         p.fix8Unpack = PFX(cutree_fix8_unpack_ssse3);
1849
         p.fix8Pack = PFX(cutree_fix8_pack_ssse3);
1850
@@ -2519,8 +3508,8 @@
1851
         CHROMA_420_CU_BLOCKCOPY(ps, sse4);
1852
         CHROMA_422_CU_BLOCKCOPY(ps, sse4);
1853
 
1854
-        p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_sse4);
1855
-        p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_sse4);
1856
+        ASSIGN2(p.cu[BLOCK_16x16].calcresidual, getResidual16_sse4);
1857
+        ASSIGN2(p.cu[BLOCK_32x32].calcresidual, getResidual32_sse4);
1858
         p.cu[BLOCK_8x8].dct = PFX(dct8_sse4);
1859
         p.denoiseDct = PFX(denoise_dct_sse4);
1860
         p.quant = PFX(quant_sse4);
1861
@@ -2545,24 +3534,25 @@
1862
 
1863
         p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_sse4);
1864
 
1865
-        p.pu[LUMA_4x4].convert_p2s = PFX(filterPixelToShort_4x4_sse4);
1866
-        p.pu[LUMA_4x8].convert_p2s = PFX(filterPixelToShort_4x8_sse4);
1867
-        p.pu[LUMA_4x16].convert_p2s = PFX(filterPixelToShort_4x16_sse4);
1868
-
1869
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s = PFX(filterPixelToShort_2x4_sse4);
1870
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
1871
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].p2s = PFX(filterPixelToShort_4x2_sse4);
1872
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].p2s = PFX(filterPixelToShort_4x4_sse4);
1873
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].p2s = PFX(filterPixelToShort_4x8_sse4);
1874
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].p2s = PFX(filterPixelToShort_4x16_sse4);
1875
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].p2s = PFX(filterPixelToShort_6x8_sse4);
1876
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
1877
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s = PFX(filterPixelToShort_2x16_sse4);
1878
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].p2s = PFX(filterPixelToShort_4x4_sse4);
1879
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].p2s = PFX(filterPixelToShort_4x8_sse4);
1880
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].p2s = PFX(filterPixelToShort_4x16_sse4);
1881
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].p2s = PFX(filterPixelToShort_4x32_sse4);
1882
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
1883
+        ASSIGN2(p.pu[LUMA_4x4].convert_p2s, filterPixelToShort_4x4_sse4);
1884
+        ASSIGN2(p.pu[LUMA_4x8].convert_p2s, filterPixelToShort_4x8_sse4);
1885
+        ASSIGN2(p.pu[LUMA_4x16].convert_p2s, filterPixelToShort_4x16_sse4);
1886
+
1887
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s, filterPixelToShort_2x4_sse4);
1888
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s, filterPixelToShort_2x8_sse4);
1889
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].p2s, filterPixelToShort_4x2_sse4);
1890
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].p2s, filterPixelToShort_4x4_sse4);
1891
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].p2s, filterPixelToShort_4x8_sse4);
1892
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].p2s, filterPixelToShort_4x16_sse4);
1893
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].p2s, filterPixelToShort_6x8_sse4);
1894
+
1895
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s, filterPixelToShort_2x8_sse4);
1896
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s, filterPixelToShort_2x16_sse4);
1897
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].p2s, filterPixelToShort_4x4_sse4);
1898
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].p2s, filterPixelToShort_4x8_sse4);
1899
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].p2s, filterPixelToShort_4x16_sse4);
1900
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].p2s, filterPixelToShort_4x32_sse4);
1901
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s, filterPixelToShort_6x16_sse4);
1902
 
1903
 #if X86_64
1904
         p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
1905
@@ -2732,69 +3722,63 @@
1906
         p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx2);
1907
         p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx2);
1908
 
1909
-        p.pu[LUMA_8x4].addAvg = PFX(addAvg_8x4_avx2);
1910
-        p.pu[LUMA_8x8].addAvg = PFX(addAvg_8x8_avx2);
1911
-        p.pu[LUMA_8x16].addAvg = PFX(addAvg_8x16_avx2);
1912
-        p.pu[LUMA_8x32].addAvg = PFX(addAvg_8x32_avx2);
1913
-
1914
-        p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_avx2);
1915
-
1916
-        p.pu[LUMA_16x4].addAvg = PFX(addAvg_16x4_avx2);
1917
-        p.pu[LUMA_16x8].addAvg = PFX(addAvg_16x8_avx2);
1918
-        p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_avx2);
1919
-        p.pu[LUMA_16x16].addAvg = PFX(addAvg_16x16_avx2);
1920
-        p.pu[LUMA_16x32].addAvg = PFX(addAvg_16x32_avx2);
1921
-        p.pu[LUMA_16x64].addAvg = PFX(addAvg_16x64_avx2);
1922
-
1923
-        p.pu[LUMA_24x32].addAvg = PFX(addAvg_24x32_avx2);
1924
-
1925
-        p.pu[LUMA_32x8].addAvg = PFX(addAvg_32x8_avx2);
1926
-        p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_avx2);
1927
-        p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_avx2);
1928
-        p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_avx2);
1929
-        p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_avx2);
1930
-
1931
-        p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_avx2);
1932
-
1933
-        p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_avx2);
1934
-        p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx2);
1935
-        p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx2);
1936
-        p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx2);
1937
-
1938
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg = PFX(addAvg_8x2_avx2);
1939
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg = PFX(addAvg_8x4_avx2);
1940
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg = PFX(addAvg_8x6_avx2);
1941
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg = PFX(addAvg_8x8_avx2);
1942
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg = PFX(addAvg_8x16_avx2);
1943
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg = PFX(addAvg_8x32_avx2);
1944
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = PFX(addAvg_12x16_avx2);
1945
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg = PFX(addAvg_16x4_avx2);
1946
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg = PFX(addAvg_16x8_avx2);
1947
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = PFX(addAvg_16x12_avx2);
1948
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg = PFX(addAvg_16x16_avx2);
1949
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg = PFX(addAvg_16x32_avx2);
1950
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg = PFX(addAvg_32x8_avx2);
1951
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_avx2);
1952
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_avx2);
1953
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_avx2);
1954
-
1955
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg = PFX(addAvg_8x4_avx2);
1956
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg = PFX(addAvg_8x8_avx2);
1957
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg = PFX(addAvg_8x12_avx2);
1958
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg = PFX(addAvg_8x16_avx2);
1959
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg = PFX(addAvg_8x32_avx2);
1960
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg = PFX(addAvg_8x64_avx2);
1961
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg = PFX(addAvg_12x32_avx2);
1962
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg = PFX(addAvg_16x8_avx2);
1963
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg = PFX(addAvg_16x16_avx2);
1964
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg = PFX(addAvg_16x24_avx2);
1965
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg = PFX(addAvg_16x32_avx2);
1966
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg = PFX(addAvg_16x64_avx2);
1967
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg = PFX(addAvg_24x64_avx2);
1968
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_avx2);
1969
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_avx2);
1970
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_avx2);
1971
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_avx2);
1972
+        ASSIGN2(p.pu[LUMA_8x4].addAvg, addAvg_8x4_avx2);
1973
+        ASSIGN2(p.pu[LUMA_8x8].addAvg, addAvg_8x8_avx2);
1974
+        ASSIGN2(p.pu[LUMA_8x16].addAvg, addAvg_8x16_avx2);
1975
+        ASSIGN2(p.pu[LUMA_8x32].addAvg, addAvg_8x32_avx2);
1976
+        ASSIGN2(p.pu[LUMA_12x16].addAvg, addAvg_12x16_avx2);
1977
+        ASSIGN2(p.pu[LUMA_16x4].addAvg, addAvg_16x4_avx2);
1978
+        ASSIGN2(p.pu[LUMA_16x8].addAvg, addAvg_16x8_avx2);
1979
+        ASSIGN2(p.pu[LUMA_16x12].addAvg, addAvg_16x12_avx2);
1980
+        ASSIGN2(p.pu[LUMA_16x16].addAvg, addAvg_16x16_avx2);
1981
+        ASSIGN2(p.pu[LUMA_16x32].addAvg, addAvg_16x32_avx2);
1982
+        ASSIGN2(p.pu[LUMA_16x64].addAvg, addAvg_16x64_avx2);
1983
+        ASSIGN2(p.pu[LUMA_24x32].addAvg, addAvg_24x32_avx2);
1984
+        ASSIGN2(p.pu[LUMA_32x8].addAvg, addAvg_32x8_avx2);
1985
+        ASSIGN2(p.pu[LUMA_32x16].addAvg, addAvg_32x16_avx2);
1986
+        ASSIGN2(p.pu[LUMA_32x24].addAvg, addAvg_32x24_avx2);
1987
+        ASSIGN2(p.pu[LUMA_32x32].addAvg, addAvg_32x32_avx2);
1988
+        ASSIGN2(p.pu[LUMA_32x64].addAvg, addAvg_32x64_avx2);
1989
+        ASSIGN2(p.pu[LUMA_48x64].addAvg, addAvg_48x64_avx2);
1990
+        ASSIGN2(p.pu[LUMA_64x16].addAvg, addAvg_64x16_avx2);
1991
+        ASSIGN2(p.pu[LUMA_64x32].addAvg, addAvg_64x32_avx2);
1992
+        ASSIGN2(p.pu[LUMA_64x48].addAvg, addAvg_64x48_avx2);
1993
+        ASSIGN2(p.pu[LUMA_64x64].addAvg, addAvg_64x64_avx2);
1994
+
1995
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg, addAvg_8x2_avx2);
1996
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg, addAvg_8x4_avx2);
1997
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg, addAvg_8x6_avx2);
1998
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg, addAvg_8x8_avx2);
1999
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg, addAvg_8x16_avx2);
2000
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg, addAvg_8x32_avx2);
2001
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg, addAvg_12x16_avx2);
2002
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg, addAvg_16x4_avx2);
2003
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg, addAvg_16x8_avx2);
2004
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg, addAvg_16x12_avx2);
2005
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg, addAvg_16x16_avx2);
2006
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg, addAvg_16x32_avx2);
2007
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg, addAvg_32x8_avx2);
2008
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg, addAvg_32x16_avx2);
2009
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg, addAvg_32x24_avx2);
2010
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg, addAvg_32x32_avx2);
2011
+
2012
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg, addAvg_8x4_avx2);
2013
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg, addAvg_8x8_avx2);
2014
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg, addAvg_8x12_avx2);
2015
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg, addAvg_8x16_avx2);
2016
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg, addAvg_8x32_avx2);
2017
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg, addAvg_8x64_avx2);
2018
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg, addAvg_12x32_avx2);
2019
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg, addAvg_16x8_avx2);
2020
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg, addAvg_16x16_avx2);
2021
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg, addAvg_16x24_avx2);
2022
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg, addAvg_16x32_avx2);
2023
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg, addAvg_16x64_avx2);
2024
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg, addAvg_24x64_avx2);
2025
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg, addAvg_32x16_avx2);
2026
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg, addAvg_32x32_avx2);
2027
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg, addAvg_32x48_avx2);
2028
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg, addAvg_32x64_avx2);
2029
 
2030
         p.cu[BLOCK_8x8].sa8d = PFX(pixel_sa8d_8x8_avx2);
2031
         p.cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_avx2);
2032
@@ -2803,13 +3787,13 @@
2033
         p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = PFX(pixel_sa8d_16x16_avx2);
2034
         p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = PFX(pixel_sa8d_32x32_avx2);
2035
 
2036
-        p.cu[BLOCK_16x16].add_ps = PFX(pixel_add_ps_16x16_avx2);
2037
-        p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
2038
-        p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx2);
2039
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps = PFX(pixel_add_ps_16x16_avx2);
2040
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
2041
-        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps = PFX(pixel_add_ps_16x32_avx2);
2042
-        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx2);
2043
+        ASSIGN2(p.cu[BLOCK_16x16].add_ps, pixel_add_ps_16x16_avx2);
2044
+        ASSIGN2(p.cu[BLOCK_32x32].add_ps, pixel_add_ps_32x32_avx2);
2045
+        ASSIGN2(p.cu[BLOCK_64x64].add_ps, pixel_add_ps_64x64_avx2);
2046
+        ASSIGN2(p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps, pixel_add_ps_16x16_avx2);
2047
+        ASSIGN2(p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps, pixel_add_ps_32x32_avx2);
2048
+        ASSIGN2(p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps, pixel_add_ps_16x32_avx2);
2049
+        ASSIGN2(p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps, pixel_add_ps_32x64_avx2);
2050
 
2051
         p.cu[BLOCK_16x16].sub_ps = PFX(pixel_sub_ps_16x16_avx2);
2052
         p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx2);
2053
@@ -2818,25 +3802,23 @@
2054
         p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx2);
2055
         p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sub_ps = PFX(pixel_sub_ps_16x32_avx2);
2056
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx2);
2057
-
2058
-        p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx2);
2059
-        p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx2);
2060
-        p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_avx2);
2061
-        p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx2);
2062
-        p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx2);
2063
-        p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx2);
2064
-
2065
-        p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_avx2);
2066
-        p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_avx2);
2067
-        p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_avx2);
2068
-        p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_avx2);
2069
-        p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_avx2);
2070
-        p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx2);
2071
-        p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx2);
2072
-        p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx2);
2073
-        p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx2);
2074
-        p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx2);
2075
-
2076
+        ASSIGN2(p.pu[LUMA_16x4].pixelavg_pp, pixel_avg_16x4_avx2);
2077
+        ASSIGN2(p.pu[LUMA_16x8].pixelavg_pp, pixel_avg_16x8_avx2);
2078
+        ASSIGN2(p.pu[LUMA_16x12].pixelavg_pp, pixel_avg_16x12_avx2);
2079
+        ASSIGN2(p.pu[LUMA_16x16].pixelavg_pp, pixel_avg_16x16_avx2);
2080
+        ASSIGN2(p.pu[LUMA_16x32].pixelavg_pp, pixel_avg_16x32_avx2);
2081
+        ASSIGN2(p.pu[LUMA_16x64].pixelavg_pp, pixel_avg_16x64_avx2);
2082
+
2083
+        ASSIGN2(p.pu[LUMA_32x64].pixelavg_pp, pixel_avg_32x64_avx2);
2084
+        ASSIGN2(p.pu[LUMA_32x32].pixelavg_pp, pixel_avg_32x32_avx2);
2085
+        ASSIGN2(p.pu[LUMA_32x24].pixelavg_pp, pixel_avg_32x24_avx2);
2086
+        ASSIGN2(p.pu[LUMA_32x16].pixelavg_pp, pixel_avg_32x16_avx2);
2087
+        ASSIGN2(p.pu[LUMA_32x8].pixelavg_pp, pixel_avg_32x8_avx2);
2088
+        ASSIGN2(p.pu[LUMA_48x64].pixelavg_pp, pixel_avg_48x64_avx2);
2089
+        ASSIGN2(p.pu[LUMA_64x64].pixelavg_pp, pixel_avg_64x64_avx2);
2090
+        ASSIGN2(p.pu[LUMA_64x48].pixelavg_pp, pixel_avg_64x48_avx2);
2091
+        ASSIGN2(p.pu[LUMA_64x32].pixelavg_pp, pixel_avg_64x32_avx2);
2092
+        ASSIGN2(p.pu[LUMA_64x16].pixelavg_pp, pixel_avg_64x16_avx2);
2093
         p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx2);
2094
         p.pu[LUMA_16x8].satd  = PFX(pixel_satd_16x8_avx2);
2095
         p.pu[LUMA_8x16].satd  = PFX(pixel_satd_8x16_avx2);
2096
@@ -2895,19 +3877,15 @@
2097
         p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
2098
         p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
2099
 
2100
-        p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
2101
-        p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
2102
-
2103
+        ASSIGN2(p.cu[BLOCK_16x16].ssd_s, pixel_ssd_s_16_avx2);
2104
+        ASSIGN2(p.cu[BLOCK_32x32].ssd_s, pixel_ssd_s_32_avx2);
2105
         p.cu[BLOCK_8x8].copy_cnt = PFX(copy_cnt_8_avx2);
2106
         p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx2);
2107
         p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx2);
2108
-
2109
-        p.cu[BLOCK_16x16].blockfill_s = PFX(blockfill_s_16x16_avx2);
2110
-        p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_avx2);
2111
-
2112
-        ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, avx2);
2113
+        ASSIGN2(p.cu[BLOCK_16x16].blockfill_s, blockfill_s_16x16_avx2);
2114
+        ALL_LUMA_TU_S(cpy1Dto2D_shl[ALIGNED], cpy1Dto2D_shl_, avx2);
2115
+        ALL_LUMA_TU_S(cpy1Dto2D_shl[NONALIGNED], cpy1Dto2D_shl_, avx2);
2116
         ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2);
2117
-
2118
         p.cu[BLOCK_8x8].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_8_avx2);
2119
         p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx2);
2120
         p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx2);
2121
@@ -2923,10 +3901,10 @@
2122
         p.dequant_normal = PFX(dequant_normal_avx2);
2123
         p.dequant_scaling = PFX(dequant_scaling_avx2);
2124
 
2125
-        p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);
2126
-        p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx2);
2127
+        ASSIGN2(p.cu[BLOCK_16x16].calcresidual, getResidual16_avx2);
2128
+        ASSIGN2(p.cu[BLOCK_32x32].calcresidual, getResidual32_avx2);
2129
 
2130
-        p.scale1D_128to64 = PFX(scale1D_128to64_avx2);
2131
+        ASSIGN2(p.scale1D_128to64, scale1D_128to64_avx2);
2132
         p.weight_pp = PFX(weight_pp_avx2);
2133
         p.weight_sp = PFX(weight_sp_avx2);
2134
 
2135
@@ -3354,44 +4332,45 @@
2136
         ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);
2137
         p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;
2138
 
2139
-        p.pu[LUMA_16x4].convert_p2s = PFX(filterPixelToShort_16x4_avx2);
2140
-        p.pu[LUMA_16x8].convert_p2s = PFX(filterPixelToShort_16x8_avx2);
2141
-        p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_avx2);
2142
-        p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_avx2);
2143
-        p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_avx2);
2144
-        p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_avx2);
2145
-        p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_avx2);
2146
-        p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_avx2);
2147
-        p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx2);
2148
-        p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx2);
2149
-        p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx2);
2150
-        p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_avx2);
2151
-        p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx2);
2152
-        p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx2);
2153
-        p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx2);
2154
-        p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_avx2);
2155
-        p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_avx2);
2156
-
2157
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].p2s = PFX(filterPixelToShort_16x4_avx2);
2158
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s = PFX(filterPixelToShort_16x8_avx2);
2159
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s = PFX(filterPixelToShort_16x12_avx2);
2160
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s = PFX(filterPixelToShort_16x16_avx2);
2161
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s = PFX(filterPixelToShort_16x32_avx2);
2162
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].p2s = PFX(filterPixelToShort_24x32_avx2);
2163
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx2);
2164
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx2);
2165
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx2);
2166
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_avx2);
2167
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s = PFX(filterPixelToShort_16x8_avx2);
2168
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s = PFX(filterPixelToShort_16x16_avx2);
2169
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s = PFX(filterPixelToShort_16x24_avx2);
2170
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s = PFX(filterPixelToShort_16x32_avx2);
2171
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s = PFX(filterPixelToShort_16x64_avx2);
2172
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s = PFX(filterPixelToShort_24x64_avx2);
2173
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_avx2);
2174
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx2);
2175
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx2);
2176
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx2);
2177
+        ASSIGN2(p.pu[LUMA_16x4].convert_p2s, filterPixelToShort_16x4_avx2);
2178
+        ASSIGN2(p.pu[LUMA_16x8].convert_p2s, filterPixelToShort_16x8_avx2);
2179
+        ASSIGN2(p.pu[LUMA_16x12].convert_p2s, filterPixelToShort_16x12_avx2);
2180
+        ASSIGN2(p.pu[LUMA_16x16].convert_p2s, filterPixelToShort_16x16_avx2);
2181
+        ASSIGN2(p.pu[LUMA_16x32].convert_p2s, filterPixelToShort_16x32_avx2);
2182
+        ASSIGN2(p.pu[LUMA_16x64].convert_p2s, filterPixelToShort_16x64_avx2);
2183
+        ASSIGN2(p.pu[LUMA_32x8].convert_p2s, filterPixelToShort_32x8_avx2);
2184
+        ASSIGN2(p.pu[LUMA_32x16].convert_p2s, filterPixelToShort_32x16_avx2);
2185
+        ASSIGN2(p.pu[LUMA_32x24].convert_p2s, filterPixelToShort_32x24_avx2);
2186
+        ASSIGN2(p.pu[LUMA_32x32].convert_p2s, filterPixelToShort_32x32_avx2);
2187
+        ASSIGN2(p.pu[LUMA_32x64].convert_p2s, filterPixelToShort_32x64_avx2);
2188
+        ASSIGN2(p.pu[LUMA_64x16].convert_p2s, filterPixelToShort_64x16_avx2);
2189
+        ASSIGN2(p.pu[LUMA_64x32].convert_p2s, filterPixelToShort_64x32_avx2);
2190
+        ASSIGN2(p.pu[LUMA_64x48].convert_p2s, filterPixelToShort_64x48_avx2);
2191
+        ASSIGN2(p.pu[LUMA_64x64].convert_p2s, filterPixelToShort_64x64_avx2);
2192
+        ASSIGN2(p.pu[LUMA_48x64].convert_p2s, filterPixelToShort_48x64_avx2);
2193
+        ASSIGN2(p.pu[LUMA_24x32].convert_p2s, filterPixelToShort_24x32_avx2);
2194
+
2195
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].p2s, filterPixelToShort_16x4_avx2);
2196
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s, filterPixelToShort_16x8_avx2);
2197
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s, filterPixelToShort_16x12_avx2);
2198
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s, filterPixelToShort_16x16_avx2);
2199
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s, filterPixelToShort_16x32_avx2);
2200
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].p2s, filterPixelToShort_24x32_avx2);
2201
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s, filterPixelToShort_32x8_avx2);
2202
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s, filterPixelToShort_32x16_avx2);
2203
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s, filterPixelToShort_32x24_avx2);
2204
+        ASSIGN2(p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s, filterPixelToShort_32x32_avx2);
2205
+
2206
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s, filterPixelToShort_16x8_avx2);
2207
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s, filterPixelToShort_16x16_avx2);
2208
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s, filterPixelToShort_16x24_avx2);
2209
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s, filterPixelToShort_16x32_avx2);
2210
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s, filterPixelToShort_16x64_avx2);
2211
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s, filterPixelToShort_24x64_avx2);
2212
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s, filterPixelToShort_32x16_avx2);
2213
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s, filterPixelToShort_32x32_avx2);
2214
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s, filterPixelToShort_32x48_avx2);
2215
+        ASSIGN2(p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s, filterPixelToShort_32x64_avx2);
2216
 
2217
         //i422 for chroma_hpp
2218
         p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_hpp = PFX(interp_4tap_horiz_pp_12x32_avx2);
2219
@@ -3718,6 +4697,707 @@
2220
         p.integral_inith[INTEGRAL_16] = PFX(integral16h_avx2);
2221
         p.integral_inith[INTEGRAL_24] = PFX(integral24h_avx2);
2222
         p.integral_inith[INTEGRAL_32] = PFX(integral32h_avx2);
2223
+        p.cu[BLOCK_4x4].nonPsyRdoQuant = PFX(nonPsyRdoQuant4_avx2);
2224
+        p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx2);
2225
+        p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx2);
2226
+        p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx2);
2227
+        p.cu[BLOCK_4x4].psyRdoQuant_1p = PFX(psyRdoQuant_1p4_avx2);
2228
+        p.cu[BLOCK_8x8].psyRdoQuant_1p = PFX(psyRdoQuant_1p8_avx2);
2229
+        p.cu[BLOCK_16x16].psyRdoQuant_1p = PFX(psyRdoQuant_1p16_avx2);
2230
+        p.cu[BLOCK_32x32].psyRdoQuant_1p = PFX(psyRdoQuant_1p32_avx2);
2231
+
2232
+    }
2233
+    if (cpuMask & X265_CPU_AVX512)
2234
+    {
2235
+        p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512);
2236
+      //  p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512);
2237
+        p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512);
2238
+        p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512);
2239
+        //p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512);
2240
+        p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512);
2241
+        p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512);
2242
+        p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512);
2243
+        p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512);
2244
+
2245
+        p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx512);
2246
+        p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx512);
2247
+        p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512);
2248
+        p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512);
2249
+        p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512);
2250
+        p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_avx512);
2251
+        p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx512);
2252
+        p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512);
2253
+        p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx512);
2254
+        p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx512);
2255
+
2256
+        p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512);
2257
+        p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512);
2258
+        p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512);
2259
+        p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512);
2260
+        p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx512);
2261
+        p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx512);
2262
+        p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512);
2263
+        p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512);
2264
+        p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx512);
2265
+        p.pu[LUMA_48x64].sad_x4 = PFX(pixel_sad_x4_48x64_avx512);
2266
+
2267
+        p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_avx512);
2268
+        p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_avx512);
2269
+        p.pu[LUMA_4x16].satd = PFX(pixel_satd_4x16_avx512);
2270
+        p.pu[LUMA_8x4].satd = PFX(pixel_satd_8x4_avx512);
2271
+        p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx512);
2272
+        p.pu[LUMA_8x16].satd = PFX(pixel_satd_8x16_avx512);
2273
+        p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_avx512);
2274
+        p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx512);
2275
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx512);
2276
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_avx512);
2277
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_avx512);
2278
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_avx512);
2279
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_avx512);
2280
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_avx512);
2281
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_avx512);
2282
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_avx512);
2283
+
2284
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = PFX(pixel_satd_4x4_avx512);
2285
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx512);
2286
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_avx512);
2287
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_avx512);
2288
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = PFX(pixel_satd_8x8_avx512);
2289
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = PFX(pixel_satd_8x16_avx512);
2290
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_avx512);
2291
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_avx512);
2292
+
2293
+        p.cu[BLOCK_8x8].sa8d = PFX(pixel_sa8d_8x8_avx512);
2294
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx512);
2295
+
2296
+        p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx512);
2297
+        p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512);
2298
+        p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx512);
2299
+        p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_avx512);
2300
+        ASSIGN2(p.pu[LUMA_16x64].pixelavg_pp, pixel_avg_16x64_avx512);
2301
+        ASSIGN2(p.pu[LUMA_16x32].pixelavg_pp, pixel_avg_16x32_avx512);
2302
+        ASSIGN2(p.pu[LUMA_16x16].pixelavg_pp, pixel_avg_16x16_avx512);
2303
+        ASSIGN2(p.pu[LUMA_16x12].pixelavg_pp, pixel_avg_16x12_avx512);
2304
+        ASSIGN2(p.pu[LUMA_16x8].pixelavg_pp, pixel_avg_16x8_avx512);
2305
+        ASSIGN2(p.pu[LUMA_16x4].pixelavg_pp, pixel_avg_16x4_avx512);
2306
+        ASSIGN2(p.pu[LUMA_8x32].pixelavg_pp, pixel_avg_8x32_avx512);
2307
+        ASSIGN2(p.pu[LUMA_8x16].pixelavg_pp, pixel_avg_8x16_avx512);
2308
+        ASSIGN2(p.pu[LUMA_8x8].pixelavg_pp, pixel_avg_8x8_avx512);
2309
+        //p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx512);
2310
+        p.pu[LUMA_4x4].sad = PFX(pixel_sad_4x4_avx512);
2311
+        p.pu[LUMA_4x8].sad = PFX(pixel_sad_4x8_avx512);
2312
+        p.pu[LUMA_4x16].sad = PFX(pixel_sad_4x16_avx512);
2313
+        p.pu[LUMA_8x4].sad = PFX(pixel_sad_8x4_avx512);
2314
+        p.pu[LUMA_8x8].sad = PFX(pixel_sad_8x8_avx512);
2315
+       // p.pu[LUMA_8x16].sad = PFX(pixel_sad_8x16_avx512);
2316
+        p.pu[LUMA_16x8].sad = PFX(pixel_sad_16x8_avx512);
2317
+        p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx512);
2318
+
2319
+        p.pu[LUMA_64x64].copy_pp = PFX(blockcopy_pp_64x64_avx512);
2320
+        p.pu[LUMA_64x32].copy_pp = PFX(blockcopy_pp_64x32_avx512);
2321
+        p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_avx512);
2322
+        p.pu[LUMA_64x16].copy_pp = PFX(blockcopy_pp_64x16_avx512);
2323
+        p.pu[LUMA_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx512);
2324
+        p.pu[LUMA_32x24].copy_pp = PFX(blockcopy_pp_32x24_avx512);
2325
+        p.pu[LUMA_32x32].copy_pp  = PFX(blockcopy_pp_32x32_avx512);
2326
+        p.pu[LUMA_32x64].copy_pp = PFX(blockcopy_pp_32x64_avx512);
2327
+
2328
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx512);
2329
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = PFX(blockcopy_pp_32x24_avx512);
2330
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = PFX(blockcopy_pp_32x32_avx512);
2331
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx512);
2332
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = PFX(blockcopy_pp_32x32_avx512);
2333
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = PFX(blockcopy_pp_32x48_avx512);
2334
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = PFX(blockcopy_pp_32x64_avx512);
2335
+
2336
+        p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512);
2337
+        p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_avx512);
2338
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = PFX(blockcopy_sp_32x32_avx512);
2339
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = PFX(blockcopy_sp_32x64_avx512);
2340
+
2341
+        p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
2342
+        p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512);
2343
+        p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx512);
2344
+        p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx512);
2345
+
2346
+        p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_avx512);
2347
+        p.scale1D_128to64[ALIGNED] = PFX(scale1D_128to64_aligned_avx512);
2348
+
2349
+        p.pu[LUMA_64x16].addAvg[NONALIGNED] = PFX(addAvg_64x16_avx512);
2350
+        p.pu[LUMA_64x32].addAvg[NONALIGNED] = PFX(addAvg_64x32_avx512);
2351
+        p.pu[LUMA_64x48].addAvg[NONALIGNED] = PFX(addAvg_64x48_avx512);
2352
+        p.pu[LUMA_64x64].addAvg[NONALIGNED] = PFX(addAvg_64x64_avx512);
2353
+        p.pu[LUMA_32x8].addAvg[NONALIGNED] = PFX(addAvg_32x8_avx512);
2354
+        p.pu[LUMA_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_avx512);
2355
+        p.pu[LUMA_32x24].addAvg[NONALIGNED] = PFX(addAvg_32x24_avx512);
2356
+        p.pu[LUMA_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_avx512);
2357
+        p.pu[LUMA_32x64].addAvg[NONALIGNED] = PFX(addAvg_32x64_avx512);
2358
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg[NONALIGNED] = PFX(addAvg_32x8_avx512);
2359
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_avx512);
2360
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg[NONALIGNED] = PFX(addAvg_32x24_avx512);
2361
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_avx512);
2362
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_avx512);
2363
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg[NONALIGNED] = PFX(addAvg_32x48_avx512);
2364
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg[NONALIGNED] = PFX(addAvg_32x64_avx512);
2365
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_avx512);
2366
+
2367
+        p.pu[LUMA_32x8].addAvg[ALIGNED] = PFX(addAvg_aligned_32x8_avx512);
2368
+        p.pu[LUMA_32x16].addAvg[ALIGNED] = PFX(addAvg_aligned_32x16_avx512);
2369
+        p.pu[LUMA_32x24].addAvg[ALIGNED] = PFX(addAvg_aligned_32x24_avx512);
2370
+        p.pu[LUMA_32x32].addAvg[ALIGNED] = PFX(addAvg_aligned_32x32_avx512);
2371
+        p.pu[LUMA_32x64].addAvg[ALIGNED] = PFX(addAvg_aligned_32x64_avx512);
2372
+        p.pu[LUMA_64x16].addAvg[ALIGNED] = PFX(addAvg_aligned_64x16_avx512);
2373
+        p.pu[LUMA_64x32].addAvg[ALIGNED] = PFX(addAvg_aligned_64x32_avx512);
2374
+        p.pu[LUMA_64x48].addAvg[ALIGNED] = PFX(addAvg_aligned_64x48_avx512);
2375
+        p.pu[LUMA_64x64].addAvg[ALIGNED] = PFX(addAvg_aligned_64x64_avx512);
2376
+
2377
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg[ALIGNED] = PFX(addAvg_aligned_32x8_avx512);
2378
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg[ALIGNED] = PFX(addAvg_aligned_32x16_avx512);
2379
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg[ALIGNED] = PFX(addAvg_aligned_32x24_avx512);
2380
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg[ALIGNED] = PFX(addAvg_aligned_32x32_avx512);
2381
+
2382
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg[ALIGNED] = PFX(addAvg_aligned_32x16_avx512);
2383
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg[ALIGNED] = PFX(addAvg_aligned_32x48_avx512);
2384
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg[ALIGNED] = PFX(addAvg_aligned_32x64_avx512);
2385
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg[ALIGNED] = PFX(addAvg_aligned_32x32_avx512);
2386
+
2387
+        p.cu[BLOCK_32x32].blockfill_s[NONALIGNED] = PFX(blockfill_s_32x32_avx512);
2388
+        p.cu[BLOCK_32x32].blockfill_s[ALIGNED] = PFX(blockfill_s_aligned_32x32_avx512);
2389
+
2390
+        p.cu[BLOCK_64x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_64x64_avx512);
2391
+        p.cu[BLOCK_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_avx512);
2392
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_avx512);
2393
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x64_avx512);
2394
+
2395
+        p.cu[BLOCK_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_aligned_32x32_avx512);
2396
+        p.cu[BLOCK_64x64].add_ps[ALIGNED] = PFX(pixel_add_ps_aligned_64x64_avx512);
2397
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_aligned_32x32_avx512);
2398
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[ALIGNED] = PFX(pixel_add_ps_aligned_32x64_avx512);
2399
+
2400
+        p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx512);
2401
+        p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
2402
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
2403
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx512);
2404
+
2405
+        p.pu[LUMA_64x16].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x16_avx512);
2406
+        p.pu[LUMA_64x32].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x32_avx512);
2407
+        p.pu[LUMA_64x48].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x48_avx512);
2408
+        p.pu[LUMA_64x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x64_avx512);
2409
+        p.pu[LUMA_32x8].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x8_avx2);
2410
+        p.pu[LUMA_32x16].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x16_avx512);
2411
+        p.pu[LUMA_32x24].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x24_avx512);
2412
+        p.pu[LUMA_32x32].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x32_avx512);
2413
+        p.pu[LUMA_32x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x64_avx512);
2414
+        p.pu[LUMA_48x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_48x64_avx512);
2415
+
2416
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s[NONALIGNED] = PFX(filterPixelToShort_32x8_avx512);
2417
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s[NONALIGNED] = PFX(filterPixelToShort_32x16_avx512);
2418
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s[NONALIGNED] = PFX(filterPixelToShort_32x24_avx512);
2419
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s[NONALIGNED] = PFX(filterPixelToShort_32x32_avx512);
2420
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s[NONALIGNED] = PFX(filterPixelToShort_32x16_avx512);
2421
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s[NONALIGNED] = PFX(filterPixelToShort_32x32_avx512);
2422
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s[NONALIGNED] = PFX(filterPixelToShort_32x48_avx512);
2423
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s[NONALIGNED] = PFX(filterPixelToShort_32x64_avx512);
2424
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s[NONALIGNED] = PFX(filterPixelToShort_32x8_avx2);
2425
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s[NONALIGNED] = PFX(filterPixelToShort_32x16_avx512);
2426
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s[NONALIGNED] = PFX(filterPixelToShort_32x24_avx512);
2427
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s[NONALIGNED] = PFX(filterPixelToShort_32x32_avx512);
2428
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].p2s[NONALIGNED] = PFX(filterPixelToShort_32x64_avx512);
2429
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].p2s[NONALIGNED] = PFX(filterPixelToShort_64x16_avx512);
2430
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s[NONALIGNED] = PFX(filterPixelToShort_64x32_avx512);
2431
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s[NONALIGNED] = PFX(filterPixelToShort_64x48_avx512);
2432
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s[NONALIGNED] = PFX(filterPixelToShort_64x64_avx512);
2433
+
2434
+        p.pu[LUMA_64x16].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x16_avx512);
2435
+        p.pu[LUMA_64x32].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x32_avx512);
2436
+        p.pu[LUMA_64x48].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x48_avx512);
2437
+        p.pu[LUMA_64x64].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x64_avx512);
2438
+        p.pu[LUMA_32x8].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x8_avx512);
2439
+        p.pu[LUMA_32x16].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x16_avx512);
2440
+        p.pu[LUMA_32x24].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x24_avx512);
2441
+        p.pu[LUMA_32x32].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x32_avx512);
2442
+        p.pu[LUMA_32x64].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x64_avx512);
2443
+        p.pu[LUMA_48x64].convert_p2s[ALIGNED] = PFX(filterPixelToShort_aligned_48x64_avx512);
2444
+
2445
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x8_avx512);
2446
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x16_avx512);
2447
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x24_avx512);
2448
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x32_avx512);
2449
+
2450
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x16_avx512);
2451
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x32_avx512);
2452
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x48_avx512);
2453
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x64_avx512);
2454
+
2455
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x8_avx512);
2456
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x16_avx512);
2457
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x24_avx512);
2458
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x32_avx512);
2459
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_32x64_avx512);
2460
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x16_avx512);
2461
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x32_avx512);
2462
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x48_avx512);
2463
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s[ALIGNED] = PFX(filterPixelToShort_aligned_64x64_avx512);
2464
+
2465
+        p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
2466
+        p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
2467
+        p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_16x16_avx512);
2468
+        p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32_avx512);
2469
+        p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_32_avx512);
2470
+        p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16_avx512);
2471
+        p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_aligned_16_avx512);
2472
+        p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
2473
+        p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
2474
+        p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_avx512);
2475
+        p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
2476
+
2477
+        p.cu[BLOCK_32x32].calcresidual[NONALIGNED] = PFX(getResidual32_avx512);
2478
+        p.cu[BLOCK_32x32].calcresidual[ALIGNED] = PFX(getResidual_aligned32_avx512);
2479
+        p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512);
2480
+        p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
2481
+        p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32_avx512);
2482
+        p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_aligned_32_avx512);
2483
+        p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16_avx512);
2484
+        p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32_avx512);
2485
+
2486
+        p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16_avx512);
2487
+        p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx512);
2488
+
2489
+        p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
2490
+        p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
2491
+
2492
+        p.dequant_normal = PFX(dequant_normal_avx512);
2493
+        p.dequant_scaling = PFX(dequant_scaling_avx512);
2494
+        //i444 chroma_hpp
2495
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512);
2496
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512);
2497
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = PFX(interp_4tap_horiz_pp_64x48_avx512);
2498
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = PFX(interp_4tap_horiz_pp_64x16_avx512);
2499
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
2500
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512);
2501
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512);
2502
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512);
2503
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
2504
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512);
2505
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512);
2506
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512);
2507
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx512);
2508
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx512);
2509
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_avx512);
2510
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = PFX(interp_4tap_horiz_pp_48x64_avx512);
2511
+
2512
+        //i422 chroma_hpp
2513
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512);
2514
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512);
2515
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512);
2516
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_avx512);
2517
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hpp = PFX(interp_4tap_horiz_pp_16x24_avx512);
2518
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
2519
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
2520
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512);
2521
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512);
2522
+
2523
+        //i420 chroma_hpp
2524
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx512);
2525
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512);
2526
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx512);
2527
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512);
2528
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512);
2529
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
2530
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
2531
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512);
2532
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512);
2533
+
2534
+        p.weight_pp = PFX(weight_pp_avx512);
2535
+        p.weight_sp = PFX(weight_sp_avx512);
2536
+
2537
+        //i444 chroma_hps
2538
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hps = PFX(interp_4tap_horiz_ps_64x64_avx512);
2539
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hps = PFX(interp_4tap_horiz_ps_64x32_avx512);
2540
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hps = PFX(interp_4tap_horiz_ps_64x48_avx512);
2541
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hps = PFX(interp_4tap_horiz_ps_64x16_avx512);
2542
+
2543
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
2544
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
2545
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hps = PFX(interp_4tap_horiz_ps_32x64_avx512);
2546
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512);
2547
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512);
2548
+
2549
+        //i422 chroma_hps
2550
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
2551
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
2552
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hps = PFX(interp_4tap_horiz_ps_32x64_avx512);
2553
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hps = PFX(interp_4tap_horiz_ps_32x48_avx512);
2554
+
2555
+        //i420 chroma_hps
2556
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
2557
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
2558
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512);
2559
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512);
2560
+
2561
+        p.pu[LUMA_16x4].luma_hpp = PFX(interp_8tap_horiz_pp_16x4_avx512);
2562
+        p.pu[LUMA_16x8].luma_hpp = PFX(interp_8tap_horiz_pp_16x8_avx512);
2563
+        p.pu[LUMA_16x12].luma_hpp = PFX(interp_8tap_horiz_pp_16x12_avx512);
2564
+        p.pu[LUMA_16x16].luma_hpp = PFX(interp_8tap_horiz_pp_16x16_avx512);
2565
+        p.pu[LUMA_16x32].luma_hpp = PFX(interp_8tap_horiz_pp_16x32_avx512);
2566
+        p.pu[LUMA_16x64].luma_hpp = PFX(interp_8tap_horiz_pp_16x64_avx512);
2567
+        p.pu[LUMA_32x8].luma_hpp = PFX(interp_8tap_horiz_pp_32x8_avx512);
2568
+        p.pu[LUMA_32x16].luma_hpp = PFX(interp_8tap_horiz_pp_32x16_avx512);
2569
+        p.pu[LUMA_32x24].luma_hpp = PFX(interp_8tap_horiz_pp_32x24_avx512);
2570
+        p.pu[LUMA_32x32].luma_hpp = PFX(interp_8tap_horiz_pp_32x32_avx512);
2571
+        p.pu[LUMA_32x64].luma_hpp = PFX(interp_8tap_horiz_pp_32x64_avx512);
2572
+        p.pu[LUMA_64x16].luma_hpp = PFX(interp_8tap_horiz_pp_64x16_avx512);
2573
+        p.pu[LUMA_64x32].luma_hpp = PFX(interp_8tap_horiz_pp_64x32_avx512);
2574
+        p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_avx512);
2575
+        p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_avx512);
2576
+        p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx512);
2577
+        ASSIGN2(p.pu[LUMA_64x16].pixelavg_pp, pixel_avg_64x16_avx512);
2578
+        ASSIGN2(p.pu[LUMA_64x32].pixelavg_pp, pixel_avg_64x32_avx512);
2579
+        ASSIGN2(p.pu[LUMA_64x48].pixelavg_pp, pixel_avg_64x48_avx512);
2580
+        ASSIGN2(p.pu[LUMA_64x64].pixelavg_pp, pixel_avg_64x64_avx512);
2581
+        //luma hps
2582
+        p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_avx512);
2583
+        p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_avx512);
2584
+        p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_avx512);
2585
+        p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_avx512);
2586
+
2587
+        p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_avx512);
2588
+        p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_avx512);
2589
+        p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_avx512);
2590
+        p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_avx512);
2591
+        p.pu[LUMA_32x8].luma_hps = PFX(interp_8tap_horiz_ps_32x8_avx512);
2592
+
2593
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx512);
2594
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hps = PFX(interp_4tap_horiz_ps_16x12_avx512);
2595
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx512);
2596
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hps = PFX(interp_4tap_horiz_ps_16x4_avx512);
2597
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx512);
2598
+
2599
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx512);
2600
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx512);
2601
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx512);
2602
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_hps = PFX(interp_4tap_horiz_ps_16x64_avx512);
2603
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hps = PFX(interp_4tap_horiz_ps_16x24_avx512);
2604
+
2605
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_avx512);
2606
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hps = PFX(interp_4tap_horiz_ps_16x8_avx512);
2607
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_avx512);
2608
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hps = PFX(interp_4tap_horiz_ps_16x12_avx512);
2609
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hps = PFX(interp_4tap_horiz_ps_16x4_avx512);
2610
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hps = PFX(interp_4tap_horiz_ps_16x64_avx512);
2611
+
2612
+        p.pu[LUMA_16x8].luma_hps = PFX(interp_8tap_horiz_ps_16x8_avx512);
2613
+        p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_avx512);
2614
+        p.pu[LUMA_16x12].luma_hps = PFX(interp_8tap_horiz_ps_16x12_avx512);
2615
+        p.pu[LUMA_16x4].luma_hps = PFX(interp_8tap_horiz_ps_16x4_avx512);
2616
+        p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_avx512);
2617
+        p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_avx512);
2618
+
2619
+        p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx512);
2620
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hps = PFX(interp_4tap_horiz_ps_48x64_avx512);
2621
+
2622
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx512);
2623
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512);
2624
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx512);
2625
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512);
2626
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512);
2627
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx512);
2628
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512);
2629
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512);
2630
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512);
2631
+
2632
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vss = PFX(interp_4tap_vert_ss_8x4_avx512);
2633
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx512);
2634
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx512);
2635
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx512);
2636
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vss = PFX(interp_4tap_vert_ss_16x4_avx512);
2637
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vss = PFX(interp_4tap_vert_ss_16x8_avx512);
2638
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vss = PFX(interp_4tap_vert_ss_16x12_avx512);
2639
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx512);
2640
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx512);
2641
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vss = PFX(interp_4tap_vert_ss_24x32_avx512);
2642
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vss = PFX(interp_4tap_vert_ss_32x8_avx512);
2643
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512);
2644
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vss = PFX(interp_4tap_vert_ss_32x24_avx512);
2645
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512);
2646
+
2647
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vsp = PFX(interp_4tap_vert_sp_16x4_avx512);
2648
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_avx512);
2649
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vsp = PFX(interp_4tap_vert_sp_16x12_avx512);
2650
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_avx512);
2651
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_avx512);
2652
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vsp = PFX(interp_4tap_vert_sp_32x8_avx512);
2653
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx512);
2654
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vsp = PFX(interp_4tap_vert_sp_32x24_avx512);
2655
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx512);
2656
+
2657
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512);
2658
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512);
2659
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = PFX(interp_4tap_vert_pp_16x24_avx512);
2660
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512);
2661
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_avx512);
2662
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = PFX(interp_4tap_vert_pp_16x24_avx512);
2663
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512);
2664
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512);
2665
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = PFX(interp_4tap_vert_pp_32x48_avx512);
2666
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx512);
2667
+
2668
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vss = PFX(interp_4tap_vert_ss_8x4_avx512);
2669
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx512);
2670
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vss = PFX(interp_4tap_vert_ss_8x12_avx512);
2671
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx512);
2672
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx512);
2673
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vss = PFX(interp_4tap_vert_ss_8x64_avx512);
2674
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vss = PFX(interp_4tap_vert_ss_16x8_avx512);
2675
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx512);
2676
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vss = PFX(interp_4tap_vert_ss_16x24_avx512);
2677
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx512);
2678
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vss = PFX(interp_4tap_vert_ss_16x64_avx512);
2679
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vss = PFX(interp_4tap_vert_ss_24x64_avx512);
2680
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512);
2681
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512);
2682
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vss = PFX(interp_4tap_vert_ss_32x48_avx512);
2683
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vss = PFX(interp_4tap_vert_ss_32x64_avx512);
2684
+
2685
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_avx512);
2686
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_avx512);
2687
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vsp = PFX(interp_4tap_vert_sp_16x24_avx512);
2688
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_avx512);
2689
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vsp = PFX(interp_4tap_vert_sp_16x64_avx512);
2690
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx512);
2691
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx512);
2692
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vsp = PFX(interp_4tap_vert_sp_32x48_avx512);
2693
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_avx512);
2694
+
2695
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx512);
2696
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512);
2697
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx512);
2698
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512);
2699
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512);
2700
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_avx512);
2701
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx512);
2702
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512);
2703
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512);
2704
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512);
2705
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx512);
2706
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vpp = PFX(interp_4tap_vert_pp_48x64_avx512);
2707
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vpp = PFX(interp_4tap_vert_pp_64x64_avx512);
2708
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vpp = PFX(interp_4tap_vert_pp_64x48_avx512);
2709
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = PFX(interp_4tap_vert_pp_64x32_avx512);
2710
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx512);
2711
+
2712
+        p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vss = PFX(interp_4tap_vert_ss_8x4_avx512);
2713
+        p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vss = PFX(interp_4tap_vert_ss_8x8_avx512);
2714
+        p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vss = PFX(interp_4tap_vert_ss_8x16_avx512);
2715
+        p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vss = PFX(interp_4tap_vert_ss_8x32_avx512);
2716
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vss = PFX(interp_4tap_vert_ss_16x4_avx512);
2717
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vss = PFX(interp_4tap_vert_ss_16x8_avx512);
2718
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vss = PFX(interp_4tap_vert_ss_16x12_avx512);
2719
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx512);
2720
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx512);
2721
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vss = PFX(interp_4tap_vert_ss_16x64_avx512);
2722
+        p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vss = PFX(interp_4tap_vert_ss_24x32_avx512);
2723
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vss = PFX(interp_4tap_vert_ss_32x8_avx512);
2724
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512);
2725
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vss = PFX(interp_4tap_vert_ss_32x24_avx512);
2726
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512);
2727
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vss = PFX(interp_4tap_vert_ss_32x64_avx512);
2728
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vss = PFX(interp_4tap_vert_ss_64x64_avx512);
2729
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vss = PFX(interp_4tap_vert_ss_64x48_avx512);
2730
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vss = PFX(interp_4tap_vert_ss_64x32_avx512);
2731
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vss = PFX(interp_4tap_vert_ss_64x16_avx512);
2732
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vss = PFX(interp_4tap_vert_ss_48x64_avx512);
2733
+
2734
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vsp = PFX(interp_4tap_vert_sp_16x4_avx512);
2735
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_avx512);
2736
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vsp = PFX(interp_4tap_vert_sp_16x12_avx512);
2737
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_avx512);
2738
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_avx512);
2739
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vsp = PFX(interp_4tap_vert_sp_16x64_avx512);
2740
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vsp = PFX(interp_4tap_vert_sp_32x8_avx512);
2741
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx512);
2742
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vsp = PFX(interp_4tap_vert_sp_32x24_avx512);
2743
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx512);
2744
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_avx512);
2745
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp = PFX(interp_4tap_vert_sp_48x64_avx512);
2746
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx512);
2747
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = PFX(interp_4tap_vert_sp_64x48_avx512);
2748
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vsp = PFX(interp_4tap_vert_sp_64x32_avx512);
2749
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vsp = PFX(interp_4tap_vert_sp_64x16_avx512);
2750
+
2751
+        p.pu[LUMA_8x8].luma_vss = PFX(interp_8tap_vert_ss_8x8_avx512);
2752
+        p.pu[LUMA_8x16].luma_vss = PFX(interp_8tap_vert_ss_8x16_avx512);
2753
+        p.pu[LUMA_8x32].luma_vss = PFX(interp_8tap_vert_ss_8x32_avx512);
2754
+        p.pu[LUMA_16x4].luma_vss = PFX(interp_8tap_vert_ss_16x4_avx512);
2755
+        p.pu[LUMA_16x8].luma_vss = PFX(interp_8tap_vert_ss_16x8_avx512);
2756
+        p.pu[LUMA_16x12].luma_vss = PFX(interp_8tap_vert_ss_16x12_avx512);
2757
+        p.pu[LUMA_16x16].luma_vss = PFX(interp_8tap_vert_ss_16x16_avx512);
2758
+        p.pu[LUMA_16x32].luma_vss = PFX(interp_8tap_vert_ss_16x32_avx512);
2759
+        p.pu[LUMA_16x64].luma_vss = PFX(interp_8tap_vert_ss_16x64_avx512);
2760
+        p.pu[LUMA_24x32].luma_vss = PFX(interp_8tap_vert_ss_24x32_avx512);
2761
+        p.pu[LUMA_32x64].luma_vss = PFX(interp_8tap_vert_ss_32x64_avx512);
2762
+        p.pu[LUMA_32x32].luma_vss = PFX(interp_8tap_vert_ss_32x32_avx512);
2763
+        p.pu[LUMA_32x24].luma_vss = PFX(interp_8tap_vert_ss_32x24_avx512);
2764
+        p.pu[LUMA_32x16].luma_vss = PFX(interp_8tap_vert_ss_32x16_avx512);
2765
+        p.pu[LUMA_32x8].luma_vss = PFX(interp_8tap_vert_ss_32x8_avx512);
2766
+        p.pu[LUMA_48x64].luma_vss = PFX(interp_8tap_vert_ss_48x64_avx512);
2767
+        p.pu[LUMA_64x64].luma_vss = PFX(interp_8tap_vert_ss_64x64_avx512);
2768
+        p.pu[LUMA_64x48].luma_vss = PFX(interp_8tap_vert_ss_64x48_avx512);
2769
+        p.pu[LUMA_64x32].luma_vss = PFX(interp_8tap_vert_ss_64x32_avx512);
2770
+        p.pu[LUMA_64x16].luma_vss = PFX(interp_8tap_vert_ss_64x16_avx512);
2771
+
2772
+        p.pu[LUMA_16x64].luma_vpp = PFX(interp_8tap_vert_pp_16x64_avx512);
2773
+        p.pu[LUMA_16x32].luma_vpp = PFX(interp_8tap_vert_pp_16x32_avx512);
2774
+        p.pu[LUMA_16x16].luma_vpp = PFX(interp_8tap_vert_pp_16x16_avx512);
2775
+        p.pu[LUMA_16x8].luma_vpp = PFX(interp_8tap_vert_pp_16x8_avx512);
2776
+        p.pu[LUMA_32x64].luma_vpp = PFX(interp_8tap_vert_pp_32x64_avx512);
2777
+        p.pu[LUMA_32x32].luma_vpp = PFX(interp_8tap_vert_pp_32x32_avx512);
2778
+        p.pu[LUMA_32x24].luma_vpp = PFX(interp_8tap_vert_pp_32x24_avx512);
2779
+        p.pu[LUMA_32x16].luma_vpp = PFX(interp_8tap_vert_pp_32x16_avx512);
2780
+        p.pu[LUMA_32x8].luma_vpp = PFX(interp_8tap_vert_pp_32x8_avx512);
2781
+        p.pu[LUMA_48x64].luma_vpp = PFX(interp_8tap_vert_pp_48x64_avx512);
2782
+        p.pu[LUMA_64x64].luma_vpp = PFX(interp_8tap_vert_pp_64x64_avx512);
2783
+        p.pu[LUMA_64x48].luma_vpp = PFX(interp_8tap_vert_pp_64x48_avx512);
2784
+        p.pu[LUMA_64x32].luma_vpp = PFX(interp_8tap_vert_pp_64x32_avx512);
2785
+        p.pu[LUMA_64x16].luma_vpp = PFX(interp_8tap_vert_pp_64x16_avx512);
2786
+        p.pu[LUMA_16x4].luma_vsp = PFX(interp_8tap_vert_sp_16x4_avx512);
2787
+        p.pu[LUMA_16x8].luma_vsp = PFX(interp_8tap_vert_sp_16x8_avx512);
2788
+        p.pu[LUMA_16x12].luma_vsp = PFX(interp_8tap_vert_sp_16x12_avx512);
2789
+        p.pu[LUMA_16x16].luma_vsp = PFX(interp_8tap_vert_sp_16x16_avx512);
2790
+        p.pu[LUMA_16x32].luma_vsp = PFX(interp_8tap_vert_sp_16x32_avx512);
2791
+        p.pu[LUMA_16x64].luma_vsp = PFX(interp_8tap_vert_sp_16x64_avx512);
2792
+        p.pu[LUMA_32x64].luma_vsp = PFX(interp_8tap_vert_sp_32x64_avx512);
2793
+        p.pu[LUMA_32x32].luma_vsp = PFX(interp_8tap_vert_sp_32x32_avx512);
2794
+        p.pu[LUMA_32x24].luma_vsp = PFX(interp_8tap_vert_sp_32x24_avx512);
2795
+        p.pu[LUMA_32x16].luma_vsp = PFX(interp_8tap_vert_sp_32x16_avx512);
2796
+        p.pu[LUMA_32x8].luma_vsp = PFX(interp_8tap_vert_sp_32x8_avx512);
2797
+        p.pu[LUMA_48x64].luma_vsp = PFX(interp_8tap_vert_sp_48x64_avx512);
2798
+        p.pu[LUMA_64x64].luma_vsp = PFX(interp_8tap_vert_sp_64x64_avx512);
2799
+        p.pu[LUMA_64x48].luma_vsp = PFX(interp_8tap_vert_sp_64x48_avx512);
2800
+        p.pu[LUMA_64x32].luma_vsp = PFX(interp_8tap_vert_sp_64x32_avx512);
2801
+        p.pu[LUMA_64x16].luma_vsp = PFX(interp_8tap_vert_sp_64x16_avx512);
2802
+
2803
+        p.cu[BLOCK_8x8].dct    = PFX(dct8_avx512);
2804
+        /* TODO: Currently these kernels performance are similar to AVX2 version, we need a to improve them further to ebable
2805
+         * it. Probably a Vtune analysis will help here.
2806
+
2807
+         * p.cu[BLOCK_16x16].dct  = PFX(dct16_avx512);
2808
+         * p.cu[BLOCK_32x32].dct  = PFX(dct32_avx512); */
2809
+
2810
+        p.cu[BLOCK_8x8].idct   = PFX(idct8_avx512);
2811
+        p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
2812
+        p.cu[BLOCK_32x32].idct = PFX(idct32_avx512);
2813
+        p.quant = PFX(quant_avx512);
2814
+        p.nquant = PFX(nquant_avx512);
2815
+        p.denoiseDct = PFX(denoise_dct_avx512);
2816
+
2817
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vps = PFX(interp_4tap_vert_ps_64x64_avx512);
2818
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = PFX(interp_4tap_vert_ps_64x48_avx512);
2819
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = PFX(interp_4tap_vert_ps_64x32_avx512);
2820
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vps = PFX(interp_4tap_vert_ps_64x16_avx512);
2821
+
2822
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512);
2823
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx512);
2824
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512);
2825
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx512);
2826
+
2827
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512);
2828
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512);
2829
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx512);
2830
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vps = PFX(interp_4tap_vert_ps_32x48_avx512);
2831
+
2832
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512);
2833
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512);
2834
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx512);
2835
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx512);
2836
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx512);
2837
+
2838
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vps = PFX(interp_4tap_vert_ps_16x4_avx512);
2839
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx512);
2840
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vps = PFX(interp_4tap_vert_ps_16x12_avx512);
2841
+        //p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx512);
2842
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx512);
2843
+
2844
+        /*p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx512);
2845
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx512);
2846
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx512);
2847
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vps = PFX(interp_4tap_vert_ps_16x64_avx512);
2848
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vps = PFX(interp_4tap_vert_ps_16x24_avx512);*/
2849
+
2850
+        //p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx512);
2851
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx512);
2852
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx512);
2853
+        //p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vps = PFX(interp_4tap_vert_ps_16x12_avx512);
2854
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vps = PFX(interp_4tap_vert_ps_16x4_avx512);
2855
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vps = PFX(interp_4tap_vert_ps_16x64_avx512);
2856
+        p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx512);
2857
+        p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx512);
2858
+        p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx512);
2859
+
2860
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vps = PFX(interp_4tap_vert_ps_48x64_avx512);
2861
+
2862
+        p.pu[LUMA_64x16].luma_vps = PFX(interp_8tap_vert_ps_64x16_avx512);
2863
+        p.pu[LUMA_64x32].luma_vps = PFX(interp_8tap_vert_ps_64x32_avx512);
2864
+        p.pu[LUMA_64x48].luma_vps = PFX(interp_8tap_vert_ps_64x48_avx512);
2865
+        p.pu[LUMA_64x64].luma_vps = PFX(interp_8tap_vert_ps_64x64_avx512);
2866
+
2867
+        p.pu[LUMA_32x8].luma_vps = PFX(interp_8tap_vert_ps_32x8_avx512);
2868
+        p.pu[LUMA_32x16].luma_vps = PFX(interp_8tap_vert_ps_32x16_avx512);
2869
+        p.pu[LUMA_32x32].luma_vps = PFX(interp_8tap_vert_ps_32x32_avx512);
2870
+        p.pu[LUMA_32x24].luma_vps = PFX(interp_8tap_vert_ps_32x24_avx512);
2871
+        p.pu[LUMA_32x64].luma_vps = PFX(interp_8tap_vert_ps_32x64_avx512);
2872
+
2873
+        p.pu[LUMA_16x8].luma_vps = PFX(interp_8tap_vert_ps_16x8_avx512);
2874
+        p.pu[LUMA_16x16].luma_vps = PFX(interp_8tap_vert_ps_16x16_avx512);
2875
+        p.pu[LUMA_16x32].luma_vps = PFX(interp_8tap_vert_ps_16x32_avx512);
2876
+        //p.pu[LUMA_16x64].luma_vps = PFX(interp_8tap_vert_ps_16x64_avx512);
2877
+        p.pu[LUMA_48x64].luma_vps = PFX(interp_8tap_vert_ps_48x64_avx512);
2878
+
2879
+        p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x64>;
2880
+        p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x48>;
2881
+        p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x32>;
2882
+        p.pu[LUMA_64x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x16>;
2883
+        p.pu[LUMA_32x8].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x8>;
2884
+        p.pu[LUMA_32x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x16>;
2885
+        p.pu[LUMA_32x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x32>;
2886
+        p.pu[LUMA_32x24].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x24>;
2887
+        p.pu[LUMA_32x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x64>;
2888
+        p.pu[LUMA_16x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x4>;
2889
+        p.pu[LUMA_16x8].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x8>;
2890
+        p.pu[LUMA_16x12].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x12>;
2891
+        p.pu[LUMA_16x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x16>;
2892
+        p.pu[LUMA_16x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x32>;
2893
+        p.pu[LUMA_16x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x64>;
2894
+        p.pu[LUMA_48x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_48x64>;
2895
+
2896
+        p.cu[BLOCK_4x4].nonPsyRdoQuant = PFX(nonPsyRdoQuant4_avx512);
2897
+        p.cu[BLOCK_8x8].nonPsyRdoQuant = PFX(nonPsyRdoQuant8_avx512);
2898
+        p.cu[BLOCK_16x16].nonPsyRdoQuant = PFX(nonPsyRdoQuant16_avx512);
2899
+        p.cu[BLOCK_32x32].nonPsyRdoQuant = PFX(nonPsyRdoQuant32_avx512);
2900
+        p.cu[BLOCK_4x4].psyRdoQuant = PFX(psyRdoQuant4_avx512);
2901
+        p.cu[BLOCK_8x8].psyRdoQuant = PFX(psyRdoQuant8_avx512);
2902
+        p.cu[BLOCK_16x16].psyRdoQuant = PFX(psyRdoQuant16_avx512);
2903
+        p.cu[BLOCK_32x32].psyRdoQuant = PFX(psyRdoQuant32_avx512);
2904
+        p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx512);
2905
+        p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx512);
2906
+        p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx512);
2907
+        p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx512);
2908
+        p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx512);
2909
+        p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_avx512);
2910
+        p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_avx512);
2911
+        p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx512);
2912
+        p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx512);
2913
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_avx512);
2914
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_avx512);
2915
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_avx512);
2916
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_avx512);
2917
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_avx512);
2918
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_avx512);
2919
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512);
2920
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
2921
+        p.planecopy_sp_shl = PFX(upShift_16_avx512);
2922
+        p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16x16_avx512);
2923
+        p.cu[BLOCK_32x32].count_nonzero = PFX(count_nonzero_32x32_avx512);
2924
 
2925
     }
2926
 #endif
2927
@@ -3738,7 +5418,7 @@
2928
 // CPU dispatcher function
2929
 void PFX(intel_cpu_indicator_init)(void)
2930
 {
2931
-    uint32_t cpu = x265::cpu_detect();
2932
+    uint32_t cpu = x265::cpu_detect(false);
2933
 
2934
     if (cpu & X265_CPU_AVX)
2935
         __intel_cpu_indicator = 0x20000;
2936
x265_2.7.tar.gz/source/common/x86/blockcopy8.asm -> x265_2.9.tar.gz/source/common/x86/blockcopy8.asm Changed
1157
 
1
@@ -26,7 +26,10 @@
2
 %include "x86inc.asm"
3
 %include "x86util.asm"
4
 
5
-SECTION_RODATA 32
6
+SECTION_RODATA 64
7
+
8
+ALIGN 64
9
+const shuf1_avx512,  dq 0, 2, 4, 6, 1, 3, 5, 7
10
 
11
 cextern pb_4
12
 cextern pb_1
13
@@ -1103,6 +1106,82 @@
14
 BLOCKCOPY_PP_W64_H4_avx 64, 48
15
 BLOCKCOPY_PP_W64_H4_avx 64, 64
16
 
17
+;----------------------------------------------------------------------------------------------
18
+; blockcopy_pp avx512 code start
19
+;----------------------------------------------------------------------------------------------
20
+%macro PROCESS_BLOCKCOPY_PP_64X4_avx512 0
21
+movu    m0, [r2]
22
+movu    m1, [r2 + r3]
23
+movu    m2, [r2 + 2 * r3]
24
+movu    m3, [r2 + r4]
25
+
26
+movu    [r0] , m0
27
+movu    [r0 + r1] , m1
28
+movu    [r0 + 2 * r1]  , m2
29
+movu    [r0 + r5] , m3
30
+%endmacro
31
+
32
+%macro PROCESS_BLOCKCOPY_PP_32X4_avx512 0
33
+movu           ym0, [r2]
34
+vinserti32x8   m0,  [r2 + r3],     1
35
+movu           ym1, [r2 + 2 * r3]
36
+vinserti32x8   m1,  [r2 + r4],     1
37
+
38
+movu           [r0] ,              ym0
39
+vextracti32x8  [r0 + r1] ,         m0,    1
40
+movu           [r0 + 2 * r1]  ,    ym1
41
+vextracti32x8  [r0 + r5] ,         m1,    1
42
+%endmacro
43
+
44
+;----------------------------------------------------------------------------------------------
45
+; void blockcopy_pp_64x%1(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
46
+;----------------------------------------------------------------------------------------------
47
+%macro BLOCKCOPY_PP_W64_H4_avx512 1
48
+INIT_ZMM avx512
49
+cglobal blockcopy_pp_64x%1, 4, 6, 4
50
+lea    r4,  [3 * r3]
51
+lea    r5,  [3 * r1]
52
+
53
+%rep %1/4 - 1
54
+PROCESS_BLOCKCOPY_PP_64X4_avx512
55
+lea     r2, [r2 + 4 * r3]
56
+lea     r0, [r0 + 4 * r1] 
57
+%endrep
58
+
59
+PROCESS_BLOCKCOPY_PP_64X4_avx512
60
+RET
61
+%endmacro
62
+
63
+BLOCKCOPY_PP_W64_H4_avx512 16
64
+BLOCKCOPY_PP_W64_H4_avx512 32
65
+BLOCKCOPY_PP_W64_H4_avx512 48
66
+BLOCKCOPY_PP_W64_H4_avx512 64
67
+
68
+%macro BLOCKCOPY_PP_W32_H4_avx512 1
69
+INIT_ZMM avx512
70
+cglobal blockcopy_pp_32x%1, 4, 6, 2
71
+    lea    r4,  [3 * r3]
72
+    lea    r5,  [3 * r1]
73
+
74
+%rep %1/4 - 1
75
+    PROCESS_BLOCKCOPY_PP_32X4_avx512
76
+    lea     r2, [r2 + 4 * r3]
77
+    lea     r0, [r0 + 4 * r1] 
78
+%endrep
79
+    PROCESS_BLOCKCOPY_PP_32X4_avx512
80
+    RET
81
+%endmacro
82
+
83
+BLOCKCOPY_PP_W32_H4_avx512 8
84
+BLOCKCOPY_PP_W32_H4_avx512 16
85
+BLOCKCOPY_PP_W32_H4_avx512 24
86
+BLOCKCOPY_PP_W32_H4_avx512 32
87
+BLOCKCOPY_PP_W32_H4_avx512 48
88
+BLOCKCOPY_PP_W32_H4_avx512 64
89
+;----------------------------------------------------------------------------------------------
90
+; blockcopy_pp avx512 code end
91
+;----------------------------------------------------------------------------------------------
92
+
93
 ;-----------------------------------------------------------------------------
94
 ; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
95
 ;-----------------------------------------------------------------------------
96
@@ -2121,6 +2200,86 @@
97
 
98
 BLOCKCOPY_SP_W64_H4_avx2 64, 64
99
 
100
+%macro PROCESS_BLOCKCOPY_SP_64x4_AVX512 0
101
+    movu               m0,             [r2]
102
+    movu               m1,             [r2 + 64]
103
+    movu               m2,             [r2 + r3]
104
+    movu               m3,             [r2 + r3 + 64]
105
+
106
+    packuswb           m0,             m1
107
+    packuswb           m2,             m3
108
+    vpermq             m0,             m4,         m0
109
+    vpermq             m2,             m4,         m2
110
+    movu               [r0],           m0
111
+    movu               [r0 + r1],      m2
112
+
113
+    movu               m0,             [r2 + 2 * r3]
114
+    movu               m1,             [r2 + 2 * r3 + 64]
115
+    movu               m2,             [r2 + r4]
116
+    movu               m3,             [r2 + r4 + 64]
117
+
118
+    packuswb           m0,             m1
119
+    packuswb           m2,             m3
120
+    vpermq             m0,             m4,         m0
121
+    vpermq             m2,             m4,         m2
122
+    movu               [r0 + 2 * r1],  m0
123
+    movu               [r0 + r5],      m2
124
+%endmacro
125
+
126
+%macro PROCESS_BLOCKCOPY_SP_32x4_AVX512 0
127
+    movu               m0,             [r2]
128
+    movu               m1,             [r2 + r3]
129
+    movu               m2,             [r2 + 2 * r3]
130
+    movu               m3,             [r2 + r4]
131
+
132
+    packuswb           m0,             m1
133
+    packuswb           m2,             m3
134
+    vpermq             m0,             m4,         m0
135
+    vpermq             m2,             m4,         m2
136
+    movu               [r0],           ym0
137
+    vextracti32x8      [r0 + r1],      m0,         1
138
+    movu               [r0 + 2 * r1],  ym2
139
+    vextracti32x8      [r0 + r5],      m2,         1
140
+%endmacro
141
+
142
+;-----------------------------------------------------------------------------
143
+; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
144
+;-----------------------------------------------------------------------------
145
+INIT_ZMM avx512
146
+cglobal blockcopy_sp_64x64, 4, 6, 5
147
+    mova   m4, [shuf1_avx512]
148
+    add    r3,  r3
149
+    lea    r4,  [3 * r3]
150
+    lea    r5,  [3 * r1]
151
+
152
+%rep 15
153
+    PROCESS_BLOCKCOPY_SP_64x4_AVX512
154
+    lea    r0, [r0 + 4 * r1]
155
+    lea    r2, [r2 + 4 * r3]
156
+%endrep
157
+    PROCESS_BLOCKCOPY_SP_64x4_AVX512
158
+    RET
159
+
160
+%macro BLOCKCOPY_SP_32xN_AVX512 1
161
+INIT_ZMM avx512
162
+cglobal blockcopy_sp_32x%1, 4, 6, 5
163
+    mova   m4, [shuf1_avx512]
164
+    add    r3,  r3
165
+    lea    r4,  [3 * r3]
166
+    lea    r5,  [3 * r1]
167
+
168
+%rep %1/4 - 1
169
+    PROCESS_BLOCKCOPY_SP_32x4_AVX512
170
+    lea    r0, [r0 + 4 * r1]
171
+    lea    r2, [r2 + 4 * r3]
172
+%endrep
173
+    PROCESS_BLOCKCOPY_SP_32x4_AVX512
174
+    RET
175
+%endmacro
176
+
177
+BLOCKCOPY_SP_32xN_AVX512 32
178
+BLOCKCOPY_SP_32xN_AVX512 64
179
+
180
 ;-----------------------------------------------------------------------------
181
 ; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val)
182
 ;-----------------------------------------------------------------------------
183
@@ -2396,6 +2555,43 @@
184
 movu       [r0 + r3 + 32], m0
185
 RET
186
 
187
+;--------------------------------------------------------------------
188
+; void blockfill_s_32x32(int16_t* dst, intptr_t dstride, int16_t val)
189
+;--------------------------------------------------------------------
190
+INIT_ZMM avx512
191
+cglobal blockfill_s_32x32, 3, 4, 1
192
+add          r1, r1
193
+lea          r3, [3 * r1]
194
+movd         xm0, r2d
195
+vpbroadcastw m0, xm0
196
+
197
+%rep 8
198
+movu       [r0], m0
199
+movu       [r0 + r1], m0
200
+movu       [r0 + 2 * r1], m0
201
+movu       [r0 + r3], m0
202
+lea        r0, [r0 + 4 * r1]
203
+%endrep
204
+RET
205
+
206
+;--------------------------------------------------------------------
207
+; void blockfill_s_aligned_32x32(int16_t* dst, intptr_t dstride, int16_t val)
208
+;--------------------------------------------------------------------
209
+INIT_ZMM avx512
210
+cglobal blockfill_s_aligned_32x32, 3, 4, 1
211
+add          r1, r1
212
+lea          r3, [3 * r1]
213
+movd         xm0, r2d
214
+vpbroadcastw m0, xm0
215
+
216
+%rep 8
217
+mova       [r0], m0
218
+mova       [r0 + r1], m0
219
+mova       [r0 + 2 * r1], m0
220
+mova       [r0 + r3], m0
221
+lea        r0, [r0 + 4 * r1]
222
+%endrep
223
+RET
224
 ;-----------------------------------------------------------------------------
225
 ; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
226
 ;-----------------------------------------------------------------------------
227
@@ -3077,6 +3273,79 @@
228
 BLOCKCOPY_PS_W32_H4_avx2 32, 32
229
 BLOCKCOPY_PS_W32_H4_avx2 32, 64
230
 
231
+%macro PROCESS_BLOCKCOPY_PS_32x8_AVX512 0
232
+    pmovzxbw      m0, [r2]
233
+    pmovzxbw      m1, [r2 + r3]
234
+    pmovzxbw      m2, [r2 + r3 * 2]
235
+    pmovzxbw      m3, [r2 + r4]
236
+
237
+    movu          [r0], m0
238
+    movu          [r0 + r1], m1
239
+    movu          [r0 + r1 * 2], m2
240
+    movu          [r0 + r5], m3
241
+
242
+    lea           r0, [r0 + 4 * r1]
243
+    lea           r2, [r2 + 4 * r3]
244
+
245
+    pmovzxbw      m0, [r2]
246
+    pmovzxbw      m1, [r2 + r3]
247
+    pmovzxbw      m2, [r2 + r3 * 2]
248
+    pmovzxbw      m3, [r2 + r4]
249
+
250
+    movu          [r0], m0
251
+    movu          [r0 + r1], m1
252
+    movu          [r0 + r1 * 2], m2
253
+    movu          [r0 + r5], m3
254
+%endmacro
255
+
256
+INIT_ZMM avx512
257
+cglobal blockcopy_ps_32x32, 4, 6, 4
258
+    add     r1, r1
259
+    lea     r4, [3 * r3]
260
+    lea     r5, [3 * r1]
261
+
262
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
263
+    lea           r0, [r0 + 4 * r1]
264
+    lea           r2, [r2 + 4 * r3]
265
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
266
+    lea           r0, [r0 + 4 * r1]
267
+    lea           r2, [r2 + 4 * r3]
268
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
269
+    lea           r0, [r0 + 4 * r1]
270
+    lea           r2, [r2 + 4 * r3]
271
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
272
+    RET
273
+
274
+INIT_ZMM avx512
275
+cglobal blockcopy_ps_32x64, 4, 6, 4
276
+    add     r1, r1
277
+    lea     r4, [3 * r3]
278
+    lea     r5, [3 * r1]
279
+
280
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
281
+    lea           r0, [r0 + 4 * r1]
282
+    lea           r2, [r2 + 4 * r3]
283
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
284
+    lea           r0, [r0 + 4 * r1]
285
+    lea           r2, [r2 + 4 * r3]
286
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
287
+    lea           r0, [r0 + 4 * r1]
288
+    lea           r2, [r2 + 4 * r3]
289
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
290
+    lea           r0, [r0 + 4 * r1]
291
+    lea           r2, [r2 + 4 * r3]
292
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
293
+    lea           r0, [r0 + 4 * r1]
294
+    lea           r2, [r2 + 4 * r3]
295
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
296
+    lea           r0, [r0 + 4 * r1]
297
+    lea           r2, [r2 + 4 * r3]
298
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
299
+    lea           r0, [r0 + 4 * r1]
300
+    lea           r2, [r2 + 4 * r3]
301
+    PROCESS_BLOCKCOPY_PS_32x8_AVX512
302
+    RET
303
+
304
 ;-----------------------------------------------------------------------------
305
 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
306
 ;-----------------------------------------------------------------------------
307
@@ -3262,6 +3531,79 @@
308
     jnz           .loop
309
     RET
310
 
311
+%macro PROCESS_BLOCKCOPY_PS_64x8_AVX512 0
312
+    pmovzxbw      m0, [r2]
313
+    pmovzxbw      m1, [r2 + 32]
314
+    pmovzxbw      m2, [r2 + r3]
315
+    pmovzxbw      m3, [r2 + r3 + 32]
316
+    movu          [r0], m0
317
+    movu          [r0 + 64], m1
318
+    movu          [r0 + r1], m2
319
+    movu          [r0 + r1 + 64], m3
320
+
321
+    pmovzxbw      m0, [r2 + r3 * 2]
322
+    pmovzxbw      m1, [r2 + r3 * 2 + 32]
323
+    pmovzxbw      m2, [r2 + r4]
324
+    pmovzxbw      m3, [r2 + r4 + 32]
325
+    movu          [r0 + r1 * 2], m0
326
+    movu          [r0 + r1 * 2 + 64], m1
327
+    movu          [r0 + r5], m2
328
+    movu          [r0 + r5 + 64], m3
329
+
330
+    lea           r0, [r0 + 4 * r1]
331
+    lea           r2, [r2 + 4 * r3]
332
+
333
+    pmovzxbw      m0, [r2]
334
+    pmovzxbw      m1, [r2 + 32]
335
+    pmovzxbw      m2, [r2 + r3]
336
+    pmovzxbw      m3, [r2 + r3 + 32]
337
+    movu          [r0], m0
338
+    movu          [r0 + 64], m1
339
+    movu          [r0 + r1], m2
340
+    movu          [r0 + r1 + 64], m3
341
+
342
+    pmovzxbw      m0, [r2 + r3 * 2]
343
+    pmovzxbw      m1, [r2 + r3 * 2 + 32]
344
+    pmovzxbw      m2, [r2 + r4]
345
+    pmovzxbw      m3, [r2 + r4 + 32]
346
+    movu          [r0 + r1 * 2], m0
347
+    movu          [r0 + r1 * 2 + 64], m1
348
+    movu          [r0 + r5], m2
349
+    movu          [r0 + r5 + 64], m3
350
+%endmacro
351
+;-----------------------------------------------------------------------------
352
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
353
+;-----------------------------------------------------------------------------
354
+INIT_ZMM avx512
355
+cglobal blockcopy_ps_64x64, 4, 6, 4
356
+    add     r1, r1
357
+    lea     r4, [3 * r3]
358
+    lea     r5, [3 * r1]
359
+
360
+    PROCESS_BLOCKCOPY_PS_64x8_AVX512
361
+    lea           r0, [r0 + 4 * r1]
362
+    lea           r2, [r2 + 4 * r3]
363
+    PROCESS_BLOCKCOPY_PS_64x8_AVX512
364
+    lea           r0, [r0 + 4 * r1]
365
+    lea           r2, [r2 + 4 * r3]
366
+    PROCESS_BLOCKCOPY_PS_64x8_AVX512
367
+    lea           r0, [r0 + 4 * r1]
368
+    lea           r2, [r2 + 4 * r3]
369
+    PROCESS_BLOCKCOPY_PS_64x8_AVX512
370
+    lea           r0, [r0 + 4 * r1]
371
+    lea           r2, [r2 + 4 * r3]
372
+    PROCESS_BLOCKCOPY_PS_64x8_AVX512
373
+    lea           r0, [r0 + 4 * r1]
374
+    lea           r2, [r2 + 4 * r3]
375
+    PROCESS_BLOCKCOPY_PS_64x8_AVX512
376
+    lea           r0, [r0 + 4 * r1]
377
+    lea           r2, [r2 + 4 * r3]
378
+    PROCESS_BLOCKCOPY_PS_64x8_AVX512
379
+    lea           r0, [r0 + 4 * r1]
380
+    lea           r2, [r2 + 4 * r3]
381
+    PROCESS_BLOCKCOPY_PS_64x8_AVX512
382
+    RET
383
+
384
 ;-----------------------------------------------------------------------------
385
 ; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
386
 ;-----------------------------------------------------------------------------
387
@@ -4051,6 +4393,143 @@
388
 BLOCKCOPY_SS_W32_H4_avx 32, 48
389
 BLOCKCOPY_SS_W32_H4_avx 32, 64
390
 
391
+%macro PROCESS_BLOCKCOPY_SS_W32_H8_avx512 0
392
+    movu    m0, [r2]
393
+    movu    m1, [r2 + r3]
394
+    movu    m2, [r2 + 2 * r3]
395
+    movu    m3, [r2 + r6]
396
+    lea     r2, [r2 + 4 * r3]
397
+
398
+    movu    [r0],          m0
399
+    movu    [r0 + r1],     m1
400
+    movu    [r0 + 2 * r1], m2
401
+    movu    [r0 + r5],     m3
402
+    lea     r0, [r0 + 4 * r1]
403
+
404
+    movu    m0, [r2]
405
+    movu    m1, [r2 + r3]
406
+    movu    m2, [r2 + 2 * r3]
407
+    movu    m3, [r2 + r6]
408
+    lea     r2, [r2 + 4 * r3]
409
+
410
+    movu    [r0],          m0
411
+    movu    [r0 + r1],     m1
412
+    movu    [r0 + 2 * r1], m2
413
+    movu    [r0 + r5],     m3
414
+    lea     r0, [r0 + 4 * r1]
415
+%endmacro
416
+
417
+%macro PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 0
418
+    movu    m0, [r2]
419
+    movu    m1, [r2 + r3]
420
+    movu    m2, [r2 + 2 * r3]
421
+    movu    m3, [r2 + r6]
422
+    lea     r2, [r2 + 4 * r3]
423
+
424
+    movu    [r0],          m0
425
+    movu    [r0 + r1],     m1
426
+    movu    [r0 + 2 * r1], m2
427
+    movu    [r0 + r5],     m3
428
+    lea     r0, [r0 + 4 * r1]
429
+
430
+    movu    m0, [r2]
431
+    movu    m1, [r2 + r3]
432
+    movu    m2, [r2 + 2 * r3]
433
+    movu    m3, [r2 + r6]
434
+
435
+    movu    [r0],          m0
436
+    movu    [r0 + r1],     m1
437
+    movu    [r0 + 2 * r1], m2
438
+    movu    [r0 + r5],     m3
439
+%endmacro
440
+
441
+;-----------------------------------------------------------------------------
442
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
443
+;-----------------------------------------------------------------------------
444
+INIT_ZMM avx512
445
+cglobal blockcopy_ss_32x8, 4, 7, 4
446
+
447
+    add    r1, r1
448
+    add    r3, r3
449
+    lea    r5, [3 * r1]
450
+    lea    r6, [3 * r3]
451
+
452
+    PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512
453
+    RET
454
+
455
+INIT_ZMM avx512
456
+cglobal blockcopy_ss_32x16, 4, 7, 4
457
+
458
+    add    r1, r1
459
+    add    r3, r3
460
+    lea    r5, [3 * r1]
461
+    lea    r6, [3 * r3]
462
+
463
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512
464
+    PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512
465
+    RET
466
+
467
+INIT_ZMM avx512
468
+cglobal blockcopy_ss_32x24, 4, 7, 4
469
+
470
+    add    r1, r1
471
+    add    r3, r3
472
+    lea    r5, [3 * r1]
473
+    lea    r6, [3 * r3]
474
+
475
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512
476
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512
477
+    PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512
478
+    RET
479
+
480
+INIT_ZMM avx512
481
+cglobal blockcopy_ss_32x32, 4, 7, 4
482
+
483
+    add    r1, r1
484
+    add    r3, r3
485
+    lea    r5, [3 * r1]
486
+    lea    r6, [3 * r3]
487
+
488
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512
489
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512
490
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512
491
+    PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512
492
+    RET
493
+
494
+INIT_ZMM avx512
495
+cglobal blockcopy_ss_32x48, 4, 7, 4
496
+
497
+    add    r1, r1
498
+    add    r3, r3
499
+    lea    r5, [3 * r1]
500
+    lea    r6, [3 * r3]
501
+
502
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512
503
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512
504
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512
505
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512
506
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512
507
+    PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512
508
+    RET
509
+
510
+INIT_ZMM avx512
511
+cglobal blockcopy_ss_32x64, 4, 7, 4
512
+
513
+    add    r1, r1
514
+    add    r3, r3
515
+    lea    r5, [3 * r1]
516
+    lea    r6, [3 * r3]
517
+
518
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512
519
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512
520
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512
521
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512
522
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512
523
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512
524
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512
525
+    PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512
526
+    RET
527
+
528
 ;-----------------------------------------------------------------------------
529
 ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
530
 ;-----------------------------------------------------------------------------
531
@@ -4349,6 +4828,154 @@
532
 BLOCKCOPY_SS_W64_H4_avx 64, 48
533
 BLOCKCOPY_SS_W64_H4_avx 64, 64
534
 
535
+%macro PROCESS_BLOCKCOPY_SS_W64_H8_avx512 0
536
+    movu    m0, [r2]
537
+    movu    m1, [r2 + mmsize]
538
+    movu    m2, [r2 + r3]
539
+    movu    m3, [r2 + r3 + mmsize]
540
+
541
+    movu    [r0],               m0
542
+    movu    [r0 + mmsize],      m1
543
+    movu    [r0 + r1],          m2
544
+    movu    [r0 + r1 + mmsize], m3
545
+
546
+    movu    m0, [r2 + 2 * r3]
547
+    movu    m1, [r2 + 2 * r3 + mmsize]
548
+    movu    m2, [r2 + r6]
549
+    movu    m3, [r2 + r6 + mmsize]
550
+    lea     r2, [r2 + 4 * r3]
551
+
552
+    movu    [r0 + 2 * r1],          m0
553
+    movu    [r0 + 2 * r1 + mmsize], m1
554
+    movu    [r0 + r5],              m2
555
+    movu    [r0 + r5 + mmsize],     m3
556
+    lea     r0, [r0 + 4 * r1]
557
+
558
+    movu    m0, [r2]
559
+    movu    m1, [r2 + mmsize]
560
+    movu    m2, [r2 + r3]
561
+    movu    m3, [r2 + r3 + mmsize]
562
+
563
+    movu    [r0],               m0
564
+    movu    [r0 + mmsize],      m1
565
+    movu    [r0 + r1],          m2
566
+    movu    [r0 + r1 + mmsize], m3
567
+
568
+    movu    m0, [r2 + 2 * r3]
569
+    movu    m1, [r2 + 2 * r3 + mmsize]
570
+    movu    m2, [r2 + r6]
571
+    movu    m3, [r2 + r6 + mmsize]
572
+    lea     r2, [r2 + 4 * r3]
573
+
574
+    movu    [r0 + 2 * r1],          m0
575
+    movu    [r0 + 2 * r1 + mmsize], m1
576
+    movu    [r0 + r5],              m2
577
+    movu    [r0 + r5 + mmsize],     m3
578
+    lea     r0, [r0 + 4 * r1]
579
+%endmacro
580
+
581
+%macro PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 0
582
+    movu    m0, [r2]
583
+    movu    m1, [r2 + mmsize]
584
+    movu    m2, [r2 + r3]
585
+    movu    m3, [r2 + r3 + mmsize]
586
+
587
+    movu    [r0],               m0
588
+    movu    [r0 + mmsize],      m1
589
+    movu    [r0 + r1],          m2
590
+    movu    [r0 + r1 + mmsize], m3
591
+
592
+    movu    m0, [r2 + 2 * r3]
593
+    movu    m1, [r2 + 2 * r3 + mmsize]
594
+    movu    m2, [r2 + r6]
595
+    movu    m3, [r2 + r6 + mmsize]
596
+    lea     r2, [r2 + 4 * r3]
597
+
598
+    movu    [r0 + 2 * r1],          m0
599
+    movu    [r0 + 2 * r1 + mmsize], m1
600
+    movu    [r0 + r5],              m2
601
+    movu    [r0 + r5 + mmsize],     m3
602
+    lea     r0, [r0 + 4 * r1]
603
+
604
+    movu    m0, [r2]
605
+    movu    m1, [r2 + mmsize]
606
+    movu    m2, [r2 + r3]
607
+    movu    m3, [r2 + r3 + mmsize]
608
+
609
+    movu    [r0],               m0
610
+    movu    [r0 + mmsize],      m1
611
+    movu    [r0 + r1],          m2
612
+    movu    [r0 + r1 + mmsize], m3
613
+
614
+    movu    m0, [r2 + 2 * r3]
615
+    movu    m1, [r2 + 2 * r3 + mmsize]
616
+    movu    m2, [r2 + r6]
617
+    movu    m3, [r2 + r6 + mmsize]
618
+
619
+    movu    [r0 + 2 * r1],          m0
620
+    movu    [r0 + 2 * r1 + mmsize], m1
621
+    movu    [r0 + r5],              m2
622
+    movu    [r0 + r5 + mmsize],     m3
623
+%endmacro
624
+
625
+;-----------------------------------------------------------------------------
626
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
627
+;-----------------------------------------------------------------------------
628
+INIT_ZMM avx512
629
+cglobal blockcopy_ss_64x16, 4, 7, 4
630
+    add     r1, r1
631
+    add     r3, r3
632
+    lea     r5, [3 * r1]
633
+    lea     r6, [3 * r3]
634
+
635
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
636
+    PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512
637
+    RET
638
+
639
+INIT_ZMM avx512
640
+cglobal blockcopy_ss_64x32, 4, 7, 4
641
+    add     r1, r1
642
+    add     r3, r3
643
+    lea     r5, [3 * r1]
644
+    lea     r6, [3 * r3]
645
+
646
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
647
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
648
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
649
+    PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512
650
+    RET
651
+
652
+INIT_ZMM avx512
653
+cglobal blockcopy_ss_64x48, 4, 7, 4
654
+    add     r1, r1
655
+    add     r3, r3
656
+    lea     r5, [3 * r1]
657
+    lea     r6, [3 * r3]
658
+
659
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
660
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
661
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
662
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
663
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
664
+    PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512
665
+    RET
666
+
667
+INIT_ZMM avx512
668
+cglobal blockcopy_ss_64x64, 4, 7, 4
669
+    add     r1, r1
670
+    add     r3, r3
671
+    lea     r5, [3 * r1]
672
+    lea     r6, [3 * r3]
673
+
674
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
675
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
676
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
677
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
678
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
679
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
680
+    PROCESS_BLOCKCOPY_SS_W64_H8_avx512
681
+    PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512
682
+    RET
683
 ;--------------------------------------------------------------------------------------
684
 ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
685
 ;--------------------------------------------------------------------------------------
686
@@ -4572,6 +5199,53 @@
687
     jnz        .loop
688
     RET
689
 
690
+INIT_ZMM avx512
691
+cglobal cpy2Dto1D_shr_16, 4, 5, 4
692
+    shl                 r2d,             1
693
+    movd                xm0,             r3d
694
+    pcmpeqw             ymm1,            ymm1
695
+    psllw               ym1,             ymm1,        xm0
696
+    psraw               ym1,             1
697
+    vinserti32x8        m1,              ym1,         1
698
+    lea                 r3,              [r2 * 3]
699
+    mov                 r4d,             2
700
+
701
+.loop:
702
+    ; Row 0-1
703
+    movu                ym2,             [r1]
704
+    vinserti32x8        m2,              [r1 + r2],   1
705
+    psubw               m2,              m1
706
+    psraw               m2,              xm0
707
+    movu                [r0],            m2
708
+
709
+    ; Row 2-3
710
+    movu                ym2,             [r1 + 2 * r2]
711
+    vinserti32x8        m2,              [r1 + r3],   1
712
+    psubw               m2,              m1
713
+    psraw               m2,              xm0
714
+    movu                [r0 + mmsize],   m2
715
+
716
+    lea        r1, [r1 + 4 * r2]
717
+    ; Row 4-5
718
+
719
+    movu                ym2,             [r1]
720
+    vinserti32x8        m2,              [r1 + r2],   1
721
+    psubw               m2,              m1
722
+    psraw               m2,              xm0
723
+    movu                [r0 + 2 * mmsize], m2
724
+
725
+    ; Row 6-7
726
+    movu                ym2,             [r1 + 2 * r2]
727
+    vinserti32x8        m2,              [r1 + r3],   1
728
+    psubw               m2,              m1
729
+    psraw               m2,              xm0
730
+    movu                [r0 + 3 * mmsize], m2
731
+
732
+    add                 r0,              4 * mmsize
733
+    lea                 r1,              [r1 + 4 * r2]
734
+    dec                 r4d
735
+    jnz                 .loop
736
+    RET
737
 
738
 ;--------------------------------------------------------------------------------------
739
 ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
740
@@ -4675,6 +5349,48 @@
741
     jnz        .loop
742
     RET
743
 
744
+INIT_ZMM avx512
745
+cglobal cpy2Dto1D_shr_32, 4, 5, 4
746
+    shl                 r2d,             1
747
+    movd                xm0,             r3d
748
+    pcmpeqw             ymm1,            ymm1
749
+    psllw               ym1,             ymm1,       xm0
750
+    psraw               ym1,             1
751
+    vinserti32x8        m1,              ym1,        1
752
+    lea                 r3,              [r2 * 3]
753
+    mov                 r4d,             8
754
+
755
+.loop:
756
+    ; Row 0
757
+    movu                m2,              [r1]
758
+    psubw               m2,              m1
759
+    psraw               m2,              xm0
760
+    movu                [r0],            m2
761
+
762
+    ; Row 1
763
+    movu                m2,              [r1 + r2]
764
+    psubw               m2,              m1
765
+    psraw               m2,              xm0
766
+    movu                [r0 + mmsize],   m2
767
+
768
+    ; Row 2
769
+    movu                m2,              [r1 + 2 * r2]
770
+    psubw               m2,              m1
771
+    psraw               m2,              xm0
772
+    movu                [r0 + 2 * mmsize], m2
773
+
774
+    ; Row 3
775
+    movu                m2,              [r1 + r3]
776
+    psubw               m2,              m1
777
+    psraw               m2,              xm0
778
+    movu                [r0 + 3 * mmsize], m2
779
+
780
+    add                 r0,              4 * mmsize
781
+    lea                 r1,              [r1 + 4 * r2]
782
+    dec                 r4d
783
+    jnz                 .loop
784
+    RET
785
+
786
 ;--------------------------------------------------------------------------------------
787
 ; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
788
 ;--------------------------------------------------------------------------------------
789
@@ -4931,7 +5647,103 @@
790
     jnz        .loop
791
     RET
792
 
793
+;--------------------------------------------------------------------------------------
794
+; cpy_1Dto2D_shl avx512 code start
795
+;--------------------------------------------------------------------------------------
796
+%macro PROCESS_CPY1Dto2D_SHL_32x8_AVX512 0
797
+    movu        m1,            [r1 + 0 * mmsize]
798
+    movu        m2,            [r1 + 1 * mmsize]
799
+    movu        m3,            [r1 + 2 * mmsize]
800
+    movu        m4,            [r1 + 3 * mmsize]
801
+    psllw       m1,            xm0
802
+    psllw       m2,            xm0
803
+    psllw       m3,            xm0
804
+    psllw       m4,            xm0
805
+    movu        [r0],          m1
806
+    movu        [r0 + r2],     m2
807
+    movu        [r0 + 2 * r2], m3
808
+    movu        [r0 + r3],     m4
809
+
810
+    add         r1,            4 * mmsize
811
+    lea         r0,            [r0 + r2 * 4]
812
+
813
+    movu        m1,            [r1 + 0 * mmsize]
814
+    movu        m2,            [r1 + 1 * mmsize]
815
+    movu        m3,            [r1 + 2 * mmsize]
816
+    movu        m4,            [r1 + 3 * mmsize]
817
+    psllw       m1,            xm0
818
+    psllw       m2,            xm0
819
+    psllw       m3,            xm0
820
+    psllw       m4,            xm0
821
+    movu        [r0],          m1
822
+    movu        [r0 + r2],     m2
823
+    movu        [r0 + 2 * r2], m3
824
+    movu        [r0 + r3],     m4
825
+%endmacro
826
+;--------------------------------------------------------------------------------------
827
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
828
+;--------------------------------------------------------------------------------------
829
+INIT_ZMM avx512
830
+cglobal cpy1Dto2D_shl_32, 4, 4, 5
831
+    add         r2d, r2d
832
+    movd        xm0, r3d
833
+    lea         r3, [3 * r2]
834
+%rep 3
835
+    PROCESS_CPY1Dto2D_SHL_32x8_AVX512
836
+    add         r1, 4 * mmsize
837
+    lea         r0, [r0 + r2 * 4]
838
+%endrep
839
+    PROCESS_CPY1Dto2D_SHL_32x8_AVX512
840
+    RET
841
 
842
+%macro PROCESS_CPY1Dto2D_SHL_ALIGNED_32x8_AVX512 0
843
+    mova        m1,            [r1 + 0 * mmsize]
844
+    mova        m2,            [r1 + 1 * mmsize]
845
+    mova        m3,            [r1 + 2 * mmsize]
846
+    mova        m4,            [r1 + 3 * mmsize]
847
+    psllw       m1,            xm0
848
+    psllw       m2,            xm0
849
+    psllw       m3,            xm0
850
+    psllw       m4,            xm0
851
+    mova        [r0],          m1
852
+    mova        [r0 + r2],     m2
853
+    mova        [r0 + 2 * r2], m3
854
+    mova        [r0 + r3],     m4
855
+
856
+    add         r1,            4 * mmsize
857
+    lea         r0,            [r0 + r2 * 4]
858
+
859
+    mova        m1,            [r1 + 0 * mmsize]
860
+    mova        m2,            [r1 + 1 * mmsize]
861
+    mova        m3,            [r1 + 2 * mmsize]
862
+    mova        m4,            [r1 + 3 * mmsize]
863
+    psllw       m1,            xm0
864
+    psllw       m2,            xm0
865
+    psllw       m3,            xm0
866
+    psllw       m4,            xm0
867
+    mova        [r0],          m1
868
+    mova        [r0 + r2],     m2
869
+    mova        [r0 + 2 * r2], m3
870
+    mova        [r0 + r3],     m4
871
+%endmacro
872
+;--------------------------------------------------------------------------------------
873
+; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
874
+;--------------------------------------------------------------------------------------
875
+INIT_ZMM avx512
876
+cglobal cpy1Dto2D_shl_aligned_32, 4, 4, 5
877
+    add         r2d, r2d
878
+    movd        xm0, r3d
879
+    lea         r3, [3 * r2]
880
+%rep 3
881
+    PROCESS_CPY1Dto2D_SHL_ALIGNED_32x8_AVX512
882
+    add         r1, 4 * mmsize
883
+    lea         r0, [r0 + r2 * 4]
884
+%endrep
885
+    PROCESS_CPY1Dto2D_SHL_ALIGNED_32x8_AVX512
886
+    RET
887
+;--------------------------------------------------------------------------------------
888
+; copy_cnt avx512 code end
889
+;--------------------------------------------------------------------------------------
890
 ;--------------------------------------------------------------------------------------
891
 ; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
892
 ;--------------------------------------------------------------------------------------
893
@@ -5294,7 +6106,91 @@
894
     movd         eax, xm4
895
     RET
896
 
897
+;--------------------------------------------------------------------------------------
898
+; copy_cnt avx512 code start
899
+;--------------------------------------------------------------------------------------
900
+%macro PROCESS_COPY_CNT_32x4_AVX512 0
901
+    movu        m0,                  [r1]
902
+    movu        m1,                  [r1 + r2]
903
+    movu        [r0],                m0
904
+    movu        [r0 + mmsize],       m1
905
+    packsswb    m0,                  m1
906
+    pminub      m0,                  m3
907
+
908
+    movu        m1,                  [r1 + 2 * r2]
909
+    movu        m2,                  [r1 + r3]
910
+    movu        [r0 + 2 * mmsize],   m1
911
+    movu        [r0 + 3 * mmsize],   m2
912
+    packsswb    m1,                  m2
913
+    pminub      m1,                  m3
914
+
915
+    paddb       m0,                  m1
916
+    paddb       m4,                  m0
917
+%endmacro
918
+
919
+%macro PROCESS_COPY_CNT_16x4_AVX512 0
920
+    movu          ym0,               [r1]
921
+    vinserti32x8   m0,               [r1 + r2],    1
922
+    movu          ym1,               [r1 + 2 * r2]
923
+    vinserti32x8   m1,               [r1 + r3],    1
924
+    movu         [r0],               m0
925
+    movu         [r0 + mmsize],      m1
926
+    packsswb       m0,               m1
927
+    pminub         m0,               m3
928
+    paddb          m4,               m0
929
+%endmacro
930
+
931
+%macro PROCESS_COPY_CNT_END_AVX512 0
932
+    pxor           m0,  m0
933
+    vextracti32x8  ym1, m4, 1
934
+    paddb          ym4, ym1
935
+    vextracti32x4  xm1, ym4, 1
936
+    paddb          xm4, xm1
937
+    psadbw         xm4, xm0
938
+    movhlps        xm1, xm4
939
+    paddd          xm4, xm1
940
+    movd           eax, xm4
941
+%endmacro
942
+
943
+;--------------------------------------------------------------------------------------
944
+; uint32_t copy_cnt(int32_t* dst, const int16_t* src, intptr_t stride);
945
+;--------------------------------------------------------------------------------------
946
+INIT_ZMM avx512
947
+cglobal copy_cnt_32, 3, 4, 5
948
+    add              r2d,  r2d
949
+    lea              r3,   [3 * r2]
950
+
951
+    vbroadcasti32x8  m3,   [pb_1]
952
+    pxor             m4,   m4
953
+
954
+%rep 7
955
+    PROCESS_COPY_CNT_32x4_AVX512
956
+    add              r0,  4 * mmsize
957
+    lea              r1,  [r1 + 4 * r2]
958
+%endrep
959
+    PROCESS_COPY_CNT_32x4_AVX512
960
+    PROCESS_COPY_CNT_END_AVX512
961
+    RET
962
+
963
+INIT_ZMM avx512
964
+cglobal copy_cnt_16, 3, 4, 5
965
+    add              r2d,  r2d
966
+    lea              r3,   [3 * r2]
967
+
968
+    vbroadcasti32x8  m3,   [pb_1]
969
+    pxor             m4,   m4
970
 
971
+%rep 3
972
+    PROCESS_COPY_CNT_16x4_AVX512
973
+    add              r0,  2 * mmsize
974
+    lea              r1,  [r1 + 4 * r2]
975
+%endrep
976
+    PROCESS_COPY_CNT_16x4_AVX512
977
+    PROCESS_COPY_CNT_END_AVX512
978
+    RET
979
+;--------------------------------------------------------------------------------------
980
+; copy_cnt avx512 code end
981
+;--------------------------------------------------------------------------------------
982
 ;--------------------------------------------------------------------------------------
983
 ; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
984
 ;--------------------------------------------------------------------------------------
985
@@ -5558,6 +6454,102 @@
986
     RET
987
 
988
 ;--------------------------------------------------------------------------------------
989
+; cpy2Dto1D_shl avx512 code start
990
+;--------------------------------------------------------------------------------------
991
+%macro PROCESS_CPY2Dto1D_SHL_16x8_AVX512 0
992
+    movu             m1,    [r1]
993
+    vinserti32x8     m1,    [r1 + r2], 1
994
+    movu             m2,    [r1 + 2 * r2]
995
+    vinserti32x8     m2,    [r1 + r3], 1
996
+
997
+    psllw    m1, xm0
998
+    psllw    m2, xm0
999
+    movu     [r0], m1
1000
+    movu     [r0 + mmsize], m2
1001
+
1002
+    add      r0, 2 * mmsize
1003
+    lea      r1, [r1 + r2 * 4]
1004
+
1005
+    movu             m1,    [r1]
1006
+    vinserti32x8     m1,    [r1 + r2], 1
1007
+    movu             m2,    [r1 + 2 * r2]
1008
+    vinserti32x8     m2,    [r1 + r3], 1
1009
+
1010
+    psllw    m1, xm0
1011
+    psllw    m2, xm0
1012
+    movu     [r0], m1
1013
+    movu     [r0 + mmsize], m2
1014
+%endmacro
1015
+
1016
+%macro PROCESS_CPY2Dto1D_SHL_32x8_AVX512 0
1017
+    movu     m1, [r1]
1018
+    movu     m2, [r1 + r2]
1019
+    movu     m3, [r1 + 2 * r2]
1020
+    movu     m4, [r1 + r3]
1021
+
1022
+    psllw    m1, xm0
1023
+    psllw    m2, xm0
1024
+    psllw    m3, xm0
1025
+    psllw    m4, xm0
1026
+    movu     [r0], m1
1027
+    movu     [r0 + mmsize], m2
1028
+    movu     [r0 + 2 * mmsize], m3
1029
+    movu     [r0 + 3 * mmsize], m4
1030
+
1031
+    add      r0, 4 * mmsize
1032
+    lea      r1, [r1 + r2 * 4]
1033
+
1034
+    movu     m1, [r1]
1035
+    movu     m2, [r1 + r2]
1036
+    movu     m3, [r1 + 2 * r2]
1037
+    movu     m4, [r1 + r3]
1038
+
1039
+    psllw    m1, xm0
1040
+    psllw    m2, xm0
1041
+    psllw    m3, xm0
1042
+    psllw    m4, xm0
1043
+    movu     [r0], m1
1044
+    movu     [r0 + mmsize], m2
1045
+    movu     [r0 + 2 * mmsize], m3
1046
+    movu     [r0 + 3 * mmsize], m4
1047
+%endmacro
1048
+
1049
+;--------------------------------------------------------------------------------------
1050
+; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
1051
+;--------------------------------------------------------------------------------------
1052
+INIT_ZMM avx512
1053
+cglobal cpy2Dto1D_shl_32, 4, 4, 5
1054
+    add     r2d, r2d
1055
+    movd    xm0, r3d
1056
+    lea     r3, [3 * r2]
1057
+
1058
+    PROCESS_CPY2Dto1D_SHL_32x8_AVX512
1059
+    add      r0, 4 * mmsize
1060
+    lea      r1, [r1 + r2 * 4]
1061
+    PROCESS_CPY2Dto1D_SHL_32x8_AVX512
1062
+    add      r0, 4 * mmsize
1063
+    lea      r1, [r1 + r2 * 4]
1064
+    PROCESS_CPY2Dto1D_SHL_32x8_AVX512
1065
+    add      r0, 4 * mmsize
1066
+    lea      r1, [r1 + r2 * 4]
1067
+    PROCESS_CPY2Dto1D_SHL_32x8_AVX512
1068
+    RET
1069
+
1070
+INIT_ZMM avx512
1071
+cglobal cpy2Dto1D_shl_16, 4, 4, 3
1072
+    add     r2d, r2d
1073
+    movd    xm0, r3d
1074
+    lea     r3, [3 * r2]
1075
+
1076
+    PROCESS_CPY2Dto1D_SHL_16x8_AVX512
1077
+    add      r0, 2 * mmsize
1078
+    lea      r1, [r1 + r2 * 4]
1079
+    PROCESS_CPY2Dto1D_SHL_16x8_AVX512
1080
+    RET
1081
+;--------------------------------------------------------------------------------------
1082
+; cpy2Dto1D_shl avx512 code end
1083
+;--------------------------------------------------------------------------------------
1084
+;--------------------------------------------------------------------------------------
1085
 ; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
1086
 ;--------------------------------------------------------------------------------------
1087
 INIT_XMM sse2
1088
@@ -5785,6 +6777,37 @@
1089
     jnz        .loop
1090
     RET
1091
 
1092
+INIT_ZMM avx512
1093
+cglobal cpy1Dto2D_shr_16, 3, 5, 4
1094
+    shl                 r2d,             1
1095
+    movd                xm0,             r3m
1096
+    pcmpeqw             xmm1,            xmm1
1097
+    psllw               xm1,             xmm1,       xm0
1098
+    psraw               xm1,             1
1099
+    vpbroadcastw         m1,              xm1
1100
+    mov                 r3d,             4
1101
+    lea                 r4,              [r2 * 3]
1102
+
1103
+.loop:
1104
+    ; Row 0-1
1105
+    movu                m2,              [r1]
1106
+    psubw               m2,              m1
1107
+    psraw               m2,              xm0
1108
+    movu                [r0],            ym2
1109
+    vextracti32x8       [r0 + r2],       m2,         1
1110
+
1111
+    ; Row 2-3
1112
+    movu                m2,              [r1 + mmsize]
1113
+    psubw               m2,              m1
1114
+    psraw               m2,              xm0
1115
+    movu                [r0 + r2 * 2],   ym2
1116
+    vextracti32x8       [r0 + r4],       m2,         1
1117
+
1118
+    add                 r1,              2 * mmsize
1119
+    lea                 r0,              [r0 + r2 * 4]
1120
+    dec                 r3d
1121
+    jnz                 .loop
1122
+    RET
1123
 
1124
 ;--------------------------------------------------------------------------------------
1125
 ; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
1126
@@ -5875,3 +6898,30 @@
1127
     dec         r3d
1128
     jnz        .loop
1129
     RET
1130
+
1131
+INIT_ZMM avx512
1132
+cglobal cpy1Dto2D_shr_32, 3, 4, 6
1133
+    shl                 r2d,             1
1134
+    movd                xm0,             r3m
1135
+    pcmpeqw             xmm1,            xmm1
1136
+    psllw               xm1,             xmm1,       xm0
1137
+    psraw               xm1,             1
1138
+    vpbroadcastw        m1,              xm1
1139
+    mov                 r3d,             16
1140
+
1141
+.loop:
1142
+    ; Row 0-1
1143
+    movu                m2,              [r1]
1144
+    movu                m3,              [r1 + mmsize]
1145
+    psubw               m2,              m1
1146
+    psubw               m3,              m1
1147
+    psraw               m2,              xm0
1148
+    psraw               m3,              xm0
1149
+    movu                [r0],            m2
1150
+    movu                [r0 + r2],       m3
1151
+
1152
+    add                 r1,              2 * mmsize
1153
+    lea                 r0,              [r0 + r2 * 2]
1154
+    dec                 r3d
1155
+    jnz                 .loop
1156
+    RET
1157
x265_2.7.tar.gz/source/common/x86/blockcopy8.h -> x265_2.9.tar.gz/source/common/x86/blockcopy8.h Changed
51
 
1
@@ -28,37 +28,48 @@
2
 FUNCDEF_TU_S(void, cpy2Dto1D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
3
 FUNCDEF_TU_S(void, cpy2Dto1D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
4
 FUNCDEF_TU_S(void, cpy2Dto1D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
5
+FUNCDEF_TU_S(void, cpy2Dto1D_shl, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
6
 
7
 FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
8
 FUNCDEF_TU_S(void, cpy2Dto1D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
9
 FUNCDEF_TU_S(void, cpy2Dto1D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
10
+FUNCDEF_TU_S(void, cpy2Dto1D_shr, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
11
 
12
 FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
13
 FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
14
 FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
15
-
16
+FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
17
+FUNCDEF_TU_S(void, cpy1Dto2D_shl_aligned, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
18
 FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
19
 FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
20
 FUNCDEF_TU_S(void, cpy1Dto2D_shr, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
21
+FUNCDEF_TU_S(void, cpy1Dto2D_shr, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
22
 
23
 FUNCDEF_TU_S(uint32_t, copy_cnt, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride);
24
 FUNCDEF_TU_S(uint32_t, copy_cnt, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride);
25
 FUNCDEF_TU_S(uint32_t, copy_cnt, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride);
26
+FUNCDEF_TU_S(uint32_t, copy_cnt, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride);
27
 
28
 FUNCDEF_TU(void, blockfill_s, sse2, int16_t* dst, intptr_t dstride, int16_t val);
29
 FUNCDEF_TU(void, blockfill_s, avx2, int16_t* dst, intptr_t dstride, int16_t val);
30
+FUNCDEF_TU(void, blockfill_s, avx512, int16_t* dst, intptr_t dstride, int16_t val);
31
+FUNCDEF_TU(void, blockfill_s_aligned, avx512, int16_t* dst, intptr_t dstride, int16_t val);
32
 
33
 FUNCDEF_CHROMA_PU(void, blockcopy_ss, sse2, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
34
 FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
35
+FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx512, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
36
 
37
 FUNCDEF_CHROMA_PU(void, blockcopy_pp, sse2, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
38
 FUNCDEF_CHROMA_PU(void, blockcopy_pp, avx, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
39
+FUNCDEF_CHROMA_PU(void, blockcopy_pp, avx512, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
40
 
41
 FUNCDEF_PU(void, blockcopy_sp, sse2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
42
 FUNCDEF_PU(void, blockcopy_sp, sse4, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
43
 FUNCDEF_PU(void, blockcopy_sp, avx2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
44
+FUNCDEF_PU(void, blockcopy_sp, avx512, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
45
 FUNCDEF_PU(void, blockcopy_ps, sse2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
46
 FUNCDEF_PU(void, blockcopy_ps, sse4, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
47
 FUNCDEF_PU(void, blockcopy_ps, avx2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
48
+FUNCDEF_PU(void, blockcopy_ps, avx512, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
49
 
50
 #endif // ifndef X265_I386_PIXEL_H
51
x265_2.7.tar.gz/source/common/x86/const-a.asm -> x265_2.9.tar.gz/source/common/x86/const-a.asm Changed
10
 
1
@@ -28,7 +28,7 @@
2
 
3
 %include "x86inc.asm"
4
 
5
-SECTION_RODATA 32
6
+SECTION_RODATA 64
7
 
8
 ;; 8-bit constants
9
 
10
x265_2.7.tar.gz/source/common/x86/cpu-a.asm -> x265_2.9.tar.gz/source/common/x86/cpu-a.asm Changed
46
 
1
@@ -54,18 +54,16 @@
2
     RET
3
 
4
 ;-----------------------------------------------------------------------------
5
-; void cpu_xgetbv( int op, int *eax, int *edx )
6
+; uint64_t cpu_xgetbv( int xcr )
7
 ;-----------------------------------------------------------------------------
8
-cglobal cpu_xgetbv, 3,7
9
-    push  r2
10
-    push  r1
11
-    mov  ecx, r0d
12
+cglobal cpu_xgetbv
13
+    movifnidn ecx, r0m
14
     xgetbv
15
-    pop   r4
16
-    mov [r4], eax
17
-    pop   r4
18
-    mov [r4], edx
19
-    RET
20
+%if ARCH_X86_64
21
+    shl       rdx, 32
22
+    or        rax, rdx
23
+%endif
24
+    ret
25
 
26
 %if ARCH_X86_64
27
 
28
@@ -78,7 +76,7 @@
29
 %if WIN64
30
     sub  rsp, 32 ; shadow space
31
 %endif
32
-    and  rsp, ~31
33
+    and  rsp, ~(STACK_ALIGNMENT - 1)
34
     mov  rax, r0
35
     mov   r0, r1
36
     mov   r1, r2
37
@@ -119,7 +117,7 @@
38
     push ebp
39
     mov  ebp, esp
40
     sub  esp, 12
41
-    and  esp, ~31
42
+    and  esp, ~(STACK_ALIGNMENT - 1)
43
     mov  ecx, [ebp+8]
44
     mov  edx, [ebp+12]
45
     mov  [esp], edx
46
x265_2.7.tar.gz/source/common/x86/dct8.asm -> x265_2.9.tar.gz/source/common/x86/dct8.asm Changed
4080
 
1
@@ -28,7 +28,89 @@
2
 
3
 %include "x86inc.asm"
4
 %include "x86util.asm"
5
-SECTION_RODATA 32
6
+SECTION_RODATA 64
7
+
8
+tab_dct32:      dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
9
+                dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13,  4, -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90
10
+                dw 90, 87, 80, 70, 57, 43, 25,  9, -9, -25, -43, -57, -70, -80, -87, -90, -90, -87, -80, -70, -57, -43, -25, -9,  9, 25, 43, 57, 70, 80, 87, 90
11
+                dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, 13, 38, 61, 78, 88, 90, 85, 73, 54, 31,  4, -22, -46, -67, -82, -90
12
+                dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89, 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
13
+                dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, -22, -61, -85, -90, -73, -38,  4, 46, 78, 90, 82, 54, 13, -31, -67, -88
14
+                dw 87, 57,  9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87, -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43,  9, 57, 87
15
+                dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, 31, 78, 90, 61,  4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85
16
+                dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
17
+                dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67,  4, 73, 88, 38, -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82
18
+                dw 80,  9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80, -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70,  9, 80
19
+                dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82,  4, -78
20
+                dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75, 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
21
+                dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, -54, -85,  4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73
22
+                dw 70, -43, -87,  9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70, -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90,  9, -87, -43, 70
23
+                dw 67, -54, -78, 38, 85, -22, -90,  4, 90, 13, -88, -31, 82, 46, -73, -61, 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67
24
+                dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
25
+                dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, -67, -54, 78, 38, -85, -22, 90,  4, -90, 13, 88, -31, -82, 46, 73, -61
26
+                dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87,  9, -90, 25, 80, -57, -57, 80, 25, -90,  9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57
27
+                dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88,  4, 85, -54
28
+                dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50, 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
29
+                dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82,  4, 78, -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46
30
+                dw 43, -90, 57, 25, -87, 70,  9, -80, 80, -9, -70, 87, -25, -57, 90, -43, -43, 90, -57, -25, 87, -70, -9, 80, -80,  9, 70, -87, 25, 57, -90, 43
31
+                dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67,  4, -73, 88, -38
32
+                dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
33
+                dw 31, -78, 90, -61,  4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31
34
+                dw 25, -70, 90, -80, 43,  9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25, -25, 70, -90, 80, -43, -9, 57, -87, 87, -57,  9, 43, -80, 90, -70, 25
35
+                dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, 88, -67, 31, 13, -54, 82, -90, 78, -46,  4, 38, -73, 90, -85, 61, -22
36
+                dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18, 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
37
+                dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31,  4, 22, -46, 67, -82, 90, -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13
38
+                dw  9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9, -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25,  9
39
+                dw  4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4
40
+tab_dct16:      dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
41
+                dw 90, 87, 80, 70, 57, 43, 25,  9, -9, -25, -43, -57, -70, -80, -87, -90
42
+                dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
43
+                dw 87, 57,  9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87
44
+                dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
45
+                dw 80,  9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80
46
+                dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
47
+                dw 70, -43, -87,  9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70
48
+                dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
49
+                dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87,  9, -90, 25, 80, -57
50
+                dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
51
+                dw 43, -90, 57, 25, -87, 70,  9, -80, 80, -9, -70, 87, -25, -57, 90, -43
52
+                dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
53
+                dw 25, -70, 90, -80, 43,  9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25
54
+                dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
55
+                dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9
56
+
57
+dct16_shuf_AVX512:  dq 0, 1, 8, 9, 4, 5, 12, 13
58
+dct16_shuf1_AVX512: dq 2, 3, 10, 11, 6, 7, 14, 15
59
+dct16_shuf3_AVX512: dq 0, 1, 4, 5, 8, 9, 12, 13
60
+dct16_shuf4_AVX512: dq 2, 3, 6, 7, 10, 11, 14, 15
61
+dct16_shuf2_AVX512: dd 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30
62
+
63
+dct8_shuf5_AVX512: dq 0, 2, 4, 6, 1, 3, 5, 7
64
+dct8_shuf6_AVX512: dq 0, 2, 4, 6, 1, 3, 5, 7
65
+dct8_shuf8_AVX512: dd 0, 2, 8, 10, 4, 6, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
66
+dct8_shuf4_AVX512: times 2 dd 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
67
+dct16_shuf7_AVX512: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
68
+dct16_shuf9_AVX512: dd 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
69
+
70
+dct32_shuf_AVX512:  dd 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20 , 21, 24, 25, 28, 29
71
+dct32_shuf4_AVX512: times 2 dd 0, 4, 8, 12, 0, 4, 8, 12
72
+dct32_shuf5_AVX512: dd 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0
73
+dct32_shuf6_AVX512: dd 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0
74
+dct32_shuf7_AVX512: dd 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1
75
+dct32_shuf8_AVX512: dd -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
76
+dct16_shuf5_AVX512: dw 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
77
+dct16_shuf6_AVX512: dw 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
78
+dct16_shuf8_AVX512: dw 20, 0, 4, 2, 28, 8, 6, 10, 22, 16, 12, 18, 30, 24, 14, 26
79
+
80
+dct8_shuf7_AVX512: dw 0, 2, 16, 18, 8, 10, 24, 26, 4, 6, 20, 22, 12, 14, 28, 30
81
+dct8_shuf9_AVX512: times 2 dw 0, 8, 16, 24, 4, 12, 20, 28
82
+dct32_shuf1_AVX512: dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
83
+dct32_shuf2_AVX512: dw 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, 15, 14, 13, 12, 11, 10, 9, 8, 31, 30, 29, 28, 27, 26, 25, 24
84
+dct32_shuf3_AVX512: times 2 dw 0, 8, 16, 24, 2, 10, 18, 26
85
+
86
+dct8_shuf:         times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
87
+dct8_shuf_AVX512:  times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11
88
+
89
 tab_dct8:       dw 64, 64, 64, 64, 64, 64, 64, 64
90
                 dw 89, 75, 50, 18, -18, -50, -75, -89
91
                 dw 83, 36, -36, -83, -83, -36, 36, 83
92
@@ -38,7 +120,10 @@
93
                 dw 36, -83, 83, -36, -36, 83, -83, 36
94
                 dw 18, -50, 75, -89, 89, -75, 50, -18
95
 
96
-dct8_shuf:      times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
97
+tab_dct8_avx512: dw 64, 64, 64, 64, 89, 75, 50, 18
98
+                 dw 83, 36, -36, -83, 75, -18, -89, -50
99
+                 dw 64, -64, -64, 64, 50, -89, 18, 75
100
+                 dw 36, -83, 83, -36, 18, -50, 75, -89
101
 
102
 tab_dct16_1:    dw 64, 64, 64, 64, 64, 64, 64, 64
103
                 dw 90, 87, 80, 70, 57, 43, 25,  9
104
@@ -57,7 +142,6 @@
105
                 dw 18, -50, 75, -89, 89, -75, 50, -18
106
                 dw  9, -25, 43, -57, 70, -80, 87, -90
107
 
108
-
109
 tab_dct16_2:    dw 64, 64, 64, 64, 64, 64, 64, 64
110
                 dw -9, -25, -43, -57, -70, -80, -87, -90
111
                 dw -89, -75, -50, -18, 18, 50, 75, 89
112
@@ -155,12 +239,34 @@
113
                 times 4 dw 50, -89, 18, 75
114
                 times 4 dw 18, -50, 75, -89
115
 
116
+avx512_idct8_1:   times 8 dw 64, 83, 64, 36
117
+                  times 8 dw 64, 36, -64, -83
118
+                  times 8 dw 64, -36, -64, 83
119
+                  times 8 dw 64, -83, 64, -36
120
+
121
+avx512_idct8_2:   times 8 dw 89, 75, 50, 18
122
+                  times 8 dw 75, -18, -89, -50
123
+                  times 8 dw 50, -89, 18, 75
124
+                  times 8 dw 18, -50, 75, -89
125
+
126
+avx512_idct8_3:   dw 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, 83, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36
127
+                  dw 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83, -64, 83
128
+                  dw 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, 36, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83, 64, -83
129
+                  dw -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, -64, -83, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36, 64, -36
130
+                  dw 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 89, 75, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89, 50, -89
131
+                  dw 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 50, 18, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75, 18, 75
132
+                  dw 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 75, -18, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50, 18, -50
133
+                  dw -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, -89, -50, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89, 75, -89
134
+
135
 idct8_shuf1:    dd 0, 2, 4, 6, 1, 3, 5, 7
136
 
137
 const idct8_shuf2,    times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
138
 
139
 idct8_shuf3:    times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
140
 
141
+
142
+idct8_avx512_shuf3:    times 4 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
143
+
144
 tab_idct16_1:   dw 90, 87, 80, 70, 57, 43, 25, 9
145
                 dw 87, 57, 9, -43, -80, -90, -70, -25
146
                 dw 80, 9, -70, -87, -25, 57, 90, 43
147
@@ -182,6 +288,31 @@
148
 idct16_shuff:   dd 0, 4, 2, 6, 1, 5, 3, 7
149
 
150
 idct16_shuff1:  dd 2, 6, 0, 4, 3, 7, 1, 5
151
+idct16_shuff2:  dw 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
152
+idct16_shuff3:  dw 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31
153
+idct16_shuff4:  dd 0, 8, 2, 10, 4, 12, 6, 14
154
+idct16_shuff5:  dd 1, 9, 3, 11, 5, 13, 7, 15
155
+
156
+
157
+tab_AVX512_idct16_1:   dw 90, 87, 80, 70, 57, 43, 25, 9, 90, 87, 80, 70, 57, 43, 25, 9, 80, 9, -70, -87, -25, 57, 90, 43, 80, 9, -70, -87, -25, 57, 90, 43
158
+                       dw 87, 57, 9, -43, -80, -90, -70, -25, 87, 57, 9, -43, -80, -90, -70, -25, 70, -43, -87, 9, 90, 25, -80, -57, 70, -43, -87, 9, 90, 25, -80, -57
159
+                       dw 57, -80, -25, 90, -9, -87, 43, 70, 57, -80, -25, 90, -9, -87, 43, 70, 25, -70, 90, -80, 43, 9, -57, 87, 25, -70, 90, -80, 43, 9, -57, 87
160
+                       dw 43, -90, 57, 25, -87, 70, 9, -80, 43, -90, 57, 25, -87, 70, 9, -80, 9, -25, 43, -57, 70, -80, 87, -90, 9, -25, 43, -57, 70, -80, 87, -90
161
+
162
+tab_AVX512_idct16_2:   dw 64, 89, 83, 75, 64, 50, 36, 18, 64, 89, 83, 75, 64, 50, 36, 18, 64, 50, -36, -89, -64, 18, 83, 75, 64, 50, -36, -89, -64, 18, 83, 75
163
+                       dw 64, 75, 36, -18, -64, -89, -83, -50, 64, 75, 36, -18, -64, -89, -83, -50, 64, 18, -83, -50, 64, 75, -36, -89, 64, 18, -83, -50, 64, 75, -36, -89
164
+                       dw 64, -18, -83, 50, 64, -75, -36, 89, 64, -18, -83, 50, 64, -75, -36, 89, 64, -75, 36, 18, -64, 89, -83, 50, 64, -75, 36, 18, -64, 89, -83, 50
165
+                       dw 64, -50, -36, 89, -64, -18, 83, -75, 64, -50, -36, 89, -64, -18, 83, -75, 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18
166
+
167
+idct16_AVX512_shuff:   dd 0, 4, 2, 6, 1, 5, 3, 7, 8, 12, 10, 14, 9, 13, 11, 15
168
+
169
+idct16_AVX512_shuff1:  dd 2, 6, 0, 4, 3, 7, 1, 5, 10, 14, 8, 12, 11, 15, 9, 13
170
+
171
+idct16_AVX512_shuff2:   dq 0, 1, 8, 9, 4, 5, 12, 13
172
+idct16_AVX512_shuff3:   dq 2, 3, 10, 11, 6, 7, 14, 15
173
+idct16_AVX512_shuff4:   dq 4, 5, 12, 13, 0, 1, 8, 9
174
+idct16_AVX512_shuff5:   dq 6, 7, 14, 15, 2, 3, 10, 11
175
+idct16_AVX512_shuff6:   times 4 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
176
 
177
 tab_idct32_1:   dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
178
                 dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
179
@@ -237,6 +368,71 @@
180
                 dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25
181
                 dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9
182
 
183
+
184
+tab_idct32_AVX512_1:   dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 90 ,90 ,88 ,85, 82, 78, 73, 67, 90, 82, 67, 46, 22, -4, -31, -54, 90, 82, 67, 46, 22, -4, -31, -54
185
+                       dw 61, 54, 46, 38, 31, 22, 13, 4, 61, 54, 46, 38, 31, 22, 13, 4, -73, -85, -90, -88, -78, -61, -38, -13, -73, -85, -90, -88, -78, -61, -38, -13
186
+                       dw 88, 67, 31, -13, -54, -82, -90, -78, 88, 67, 31, -13, -54, -82, -90, -78, 85, 46, -13, -67, -90, -73, -22, 38, 85, 46, -13, -67, -90, -73, -22, 38
187
+                       dw -46, -4, 38, 73, 90, 85, 61, 22, -46, -4, 38, 73, 90, 85, 61, 22, 82, 88, 54, -4, -61, -90, -78, -31, 82, 88, 54, -4, -61, -90, -78, -31
188
+                       dw 82, 22, -54, -90, -61, 13, 78, 85, 82, 22, -54, -90, -61, 13, 78, 85, 78, -4, -82, -73, 13, 85, 67, -22, 78, -4, -82, -73, 13, 85, 67, -22
189
+                       dw 31, -46, -90, -67, 4, 73, 88, 38, 31, -46, -90, -67, 4, 73, 88, 38, -88, -61, 31, 90, 54, -38, -90, -46, -88, -61, 31, 90, 54, -38, -90, -46
190
+                       dw 73, -31, -90, -22, 78, 67, -38, -90, 73, -31, -90, -22, 78, 67, -38, -90, 67, -54, -78, 38, 85, -22, -90, 4, 67, -54, -78, 38, 85, -22, -90, 4
191
+                       dw -13, 82, 61, -46, -88, -4, 85, 54, -13, 82, 61, -46, -88, -4, 85, 54, 90, 13, -88, -31, 82, 46, -73, -61, 90, 13, -88, -31, 82, 46, -73, -61
192
+
193
+tab_idct32_AVX512_5:   dw 4, -13, 22, -31, 38, -46, 54, -61, 4, -13, 22, -31, 38, -46, 54, -61, 13, -38, 61, -78, 88, -90, 85, -73, 13, -38, 61, -78, 88, -90, 85, -73
194
+                       dw 67, -73, 78, -82, 85, -88, 90, -90, 67, -73, 78, -82, 85, -88, 90, -90, 54, -31, 4, 22, -46, 67, -82, 90, 54, -31, 4, 22, -46, 67, -82, 90
195
+                       dw 22, -61, 85, -90, 73, -38, -4, 46, 22, -61, 85, -90, 73, -38, -4, 46, 31, -78, 90, -61, 4, 54, -88, 82, 31, -78, 90, -61, 4, 54, -88, 82
196
+                       dw -78, 90, -82, 54, -13, -31, 67, -88, -78, 90, -82, 54, -13, -31, 67, -88, -38, -22, 73, -90, 67, -13, -46, 85, -38, -22, 73, -90, 67, -13, -46, 85
197
+                       dw 38, -88, 73, -4, -67, 90, -46, -31, 38, -88, 73, -4, -67, 90, -46, -31, 46, -90, 38, 54, -90, 31, 61, -88, 46, -90, 38, 54, -90, 31, 61, -88
198
+                       dw 85, -78, 13, 61, -90, 54, 22, -82, 85, -78, 13, 61, -90, 54, 22, -82, 22, 67, -85, 13, 73, -82, 4, 78, 22, 67, -85, 13, 73, -82, 4, 78
199
+                       dw 54, -85, -4, 88, -46, -61, 82, 13, 54, -85, -4, 88, -46, -61, 82, 13, 61, -73, -46, 82, 31, -88, -13, 90, 61, -73, -46, 82, 31, -88, -13, 90
200
+                       dw -90, 38, 67, -78, -22, 90, -31, -73, -90, 38, 67, -78, -22, 90, -31, -73, -4, -90, 22, 85, -38, -78, 54, 67, -4, -90, 22, 85, -38, -78, 54, 67
201
+
202
+
203
+tab_idct32_AVX512_2:   dw 64, 89, 83, 75, 64, 50, 36, 18, 64, 89, 83, 75, 64, 50, 36, 18, 64, 75, 36, -18, -64, -89, -83, -50, 64, 75, 36, -18, -64, -89, -83, -50
204
+                       dw 64, 50, -36, -89, -64, 18, 83, 75, 64, 50, -36, -89, -64, 18, 83, 75, 64, 18, -83, -50, 64, 75, -36, -89, 64, 18, -83, -50, 64, 75, -36, -89
205
+                       dw 64, -18, -83, 50, 64, -75, -36, 89, 64, -18, -83, 50, 64, -75, -36, 89, 64, -50, -36, 89, -64, -18, 83, -75, 64, -50, -36, 89, -64, -18, 83, -75
206
+                       dw 64, -75, 36, 18, -64, 89, -83, 50, 64, -75, 36, 18, -64, 89, -83, 50, 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18
207
+
208
+tab_idct32_AVX512_3:   dw 90, 87, 80, 70, 57, 43, 25, 9, 90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25, 87, 57, 9, -43, -80, -90, -70, -25
209
+                       dw 80, 9, -70, -87, -25, 57, 90, 43, 80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57, 70, -43, -87, 9, 90, 25, -80, -57
210
+                       dw 57, -80, -25, 90, -9, -87, 43, 70, 57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80, 43, -90, 57, 25, -87, 70, 9, -80
211
+                       dw 25, -70, 90, -80, 43, 9, -57, 87, 25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90, 9, -25, 43, -57, 70, -80, 87, -90
212
+
213
+tab_idct32_AVX512_4:   dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
214
+                       dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
215
+                       dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
216
+                       dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
217
+                       dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38
218
+                       dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
219
+                       dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
220
+                       dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61
221
+                       dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
222
+                       dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
223
+                       dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78, 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78
224
+                       dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
225
+                       dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
226
+                       dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
227
+                       dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90
228
+                       dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
229
+
230
+tab_idct32_AVX512_6:   dw 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9, 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9
231
+                       dw 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25, 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25
232
+                       dw 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43, 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43
233
+                       dw 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57, 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57
234
+                       dw 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70, 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70
235
+                       dw 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80, 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80
236
+                       dw 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87, 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87
237
+                       dw 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90, 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90
238
+                       dw 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90, 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90
239
+                       dw 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87, 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87
240
+                       dw 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80, 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80
241
+                       dw 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70, 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70
242
+                       dw 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57, 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57
243
+                       dw 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43, 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43
244
+                       dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25, 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25
245
+                       dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9, 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9
246
+
247
+
248
 avx2_dct4:      dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
249
                 dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83
250
 
251
@@ -314,9 +510,13 @@
252
 
253
 tab_idct8_2:    times 1 dw 89, 75, 50, 18, 75, -18, -89, -50
254
                 times 1 dw 50, -89, 18, 75, 18, -50, 75, -89
255
-
256
 pb_idct8odd:    db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15
257
 
258
+;Scale bits table for rdoQuant
259
+tab_nonpsyRdo8 : dq 5, 7, 9, 11
260
+tab_nonpsyRdo10: dq 9, 11, 13, 15
261
+tab_nonpsyRdo12: dq 13, 15, 17, 19
262
+
263
 SECTION .text
264
 cextern pd_1
265
 cextern pd_2
266
@@ -343,6 +543,10 @@
267
     %define     DST4_ROUND          16
268
     %define     DCT8_SHIFT1         6
269
     %define     DCT8_ROUND1         32
270
+    %define     RDO_MAX_4           3
271
+    %define     RDO_MAX_8           1
272
+    %define     RDO_MAX_16          0
273
+    %define     RDO_MAX_32          0
274
 %elif BIT_DEPTH == 10
275
     %define     DCT4_SHIFT          3
276
     %define     DCT4_ROUND          4
277
@@ -352,6 +556,10 @@
278
     %define     DST4_ROUND          4
279
     %define     DCT8_SHIFT1         4
280
     %define     DCT8_ROUND1         8
281
+    %define     RDO_MAX_4           7
282
+    %define     RDO_MAX_8           5
283
+    %define     RDO_MAX_16          3
284
+    %define     RDO_MAX_32          1
285
 %elif BIT_DEPTH == 8
286
     %define     DCT4_SHIFT          1
287
     %define     DCT4_ROUND          1
288
@@ -361,6 +569,10 @@
289
     %define     DST4_ROUND          1
290
     %define     DCT8_SHIFT1         2
291
     %define     DCT8_ROUND1         2
292
+    %define     RDO_MAX_4           11
293
+    %define     RDO_MAX_8           9
294
+    %define     RDO_MAX_16          7
295
+    %define     RDO_MAX_32          5
296
 %else
297
     %error Unsupported BIT_DEPTH!
298
 %endif
299
@@ -2165,6 +2377,67 @@
300
     dec      r3d
301
     jnz .loop
302
     RET
303
+%if ARCH_X86_64 == 1
304
+INIT_ZMM avx512
305
+cglobal denoise_dct, 4, 4, 22
306
+    pxor     m16,  m16
307
+    sub      r3d,   16
308
+    je       .coeff16
309
+    add      r3d,   16
310
+    shr      r3d,    5
311
+    jmp      .loop
312
+
313
+.coeff16:
314
+    movu          ym19,  [r0]
315
+    pabsw         ym17, ym19
316
+    movu            m2, [r1]
317
+    pmovsxwd       m18, ym17
318
+    paddd           m2,  m18
319
+    movu          [r1],   m2
320
+    movu           ym3, [r2]
321
+    psubusw       ym17, ym3
322
+    pcmpgtw       ym18, ym17, ym16
323
+    pand          ym17, ym18
324
+    psignw        ym17, ym19
325
+    movu          [r0], ym17
326
+    RET
327
+
328
+.loop:
329
+    movu          m21, [r0]
330
+    pabsw         m17, m21
331
+    movu           m2, [r1]
332
+    pmovsxwd       m4, ym17
333
+    paddd          m2,  m4
334
+    movu         [r1],  m2
335
+    vextracti64x4 ym4, m17, 1
336
+
337
+    movu           m2, [r1 + mmsize]
338
+    pmovsxwd       m3, ym4
339
+    paddd          m2, m3
340
+    movu           [r1 + mmsize], m2
341
+    movu           m3, [r2]
342
+    psubusw       m17, m3
343
+
344
+    vextracti64x4 ym20,  m17,    1
345
+    pcmpgtw       ym18, ym17, ym16
346
+    pcmpgtw       ym19, ym20, ym16
347
+    vinserti64x4   m18,  m18, ym19, 1
348
+
349
+    pand           m17,  m18
350
+    vextracti64x4 ym19,  m17, 1
351
+    vextracti64x4 ym20,  m21, 1
352
+    psignw        ym17, ym21
353
+    psignw        ym19, ym20
354
+    vinserti64x4   m17,  m17, ym19, 1
355
+
356
+    movu          [r0],  m17
357
+    add             r0,  mmsize
358
+    add             r1,  mmsize * 2
359
+    add             r2,  mmsize
360
+    dec             r3d
361
+    jnz             .loop
362
+    RET
363
+%endif ; ARCH_X86_64 == 1
364
 
365
 %if ARCH_X86_64 == 1
366
 %macro DCT8_PASS_1 4
367
@@ -2270,6 +2543,168 @@
368
     movu            [r1 + 96],         m10
369
     RET
370
 
371
+
372
+%macro DCT8_AVX512_PASS_1 4
373
+    vpmaddwd        m%2,               m3, m%1
374
+    vpsrlq          m8,                m%2, 32
375
+    vpaddd          m%2,               m8
376
+    vpaddd          m%2,               m5
377
+    vpsrad          m%2,               DCT8_SHIFT1
378
+
379
+    vpmaddwd        m%4,               m2, m%3
380
+    vpsrlq          m8,                m%4, 32
381
+    vpaddd          m%4,               m8
382
+    vpaddd          m%4,               m5
383
+    vpsrad          m%4,               DCT8_SHIFT1
384
+
385
+    vpackssdw       m%2,               m%4
386
+    vpermw          m%2,               m1, m%2
387
+%endmacro
388
+
389
+%macro DCT8_AVX512_PASS_2 4
390
+    vpmaddwd         m0,               m9,  m%1
391
+    vpmaddwd         m1,               m10, m%1
392
+    vpsrldq          m2,               m0,  8
393
+    vpsrldq          m3,               m1,  8
394
+    vpaddd           m0,               m2
395
+    vpaddd           m1,               m3
396
+    vpsrlq           m2,               m0,  32
397
+    vpsrlq           m3,               m1,  32
398
+    vpaddd           m0,               m2
399
+    vpaddd           m1,               m3
400
+    vpaddd           m0,               m5
401
+    vpsrad           m0,               DCT8_SHIFT2
402
+    vpaddd           m1,               m5
403
+    vpsrad           m1,               DCT8_SHIFT2
404
+    vpackssdw        m0,               m1
405
+    vpermw           m0,               m19, m0
406
+
407
+    vpmaddwd         m1,               m9,  m%2
408
+    vpmaddwd         m2,               m10, m%2
409
+    vpsrldq          m3,               m1,  8
410
+    vpsrldq          m4,               m2,  8
411
+    vpaddd           m1,               m3
412
+    vpaddd           m2,               m4
413
+    vpsrlq           m3,               m1,  32
414
+    vpsrlq           m4,               m2,  32
415
+    vpaddd           m1,               m3
416
+    vpaddd           m2,               m4
417
+    vpaddd           m1,               m5
418
+    vpsrad           m1,               DCT8_SHIFT2
419
+    vpaddd           m2,               m5
420
+    vpsrad           m2,               DCT8_SHIFT2
421
+    vpackssdw        m1,               m2
422
+    vpermw           m1,               m19, m1
423
+    vinserti128      ym0,              ym0, xm1, 1
424
+
425
+    vpmaddwd         m1,               m9,  m%3
426
+    vpmaddwd         m2,               m10, m%3
427
+    vpsrldq          m3,               m1,  8
428
+    vpsrldq          m4,               m2,  8
429
+    vpaddd           m1,               m3
430
+    vpaddd           m2,               m4
431
+    vpsrlq           m3,               m1,  32
432
+    vpsrlq           m4,               m2,  32
433
+    vpaddd           m1,               m3
434
+    vpaddd           m2,               m4
435
+    vpaddd           m1,               m5
436
+    vpsrad           m1,               DCT8_SHIFT2
437
+    vpaddd           m2,               m5
438
+    vpsrad           m2,               DCT8_SHIFT2
439
+    vpackssdw        m1,               m2
440
+    vpermw           m1,               m19, m1
441
+
442
+    vpmaddwd         m2,               m9,  m%4
443
+    vpmaddwd         m3,               m10, m%4
444
+    vpsrldq          m4,               m2,  8
445
+    vpsrldq          m6,               m3,  8
446
+    vpaddd           m2,               m4
447
+    vpaddd           m3,               m6
448
+    vpsrlq           m4,               m2,  32
449
+    vpsrlq           m6,               m3,  32
450
+    vpaddd           m2,               m4
451
+    vpaddd           m3,               m6
452
+    vpaddd           m2,               m5
453
+    vpsrad           m2,               DCT8_SHIFT2
454
+    vpaddd           m3,               m5
455
+    vpsrad           m3,               DCT8_SHIFT2
456
+    vpackssdw        m2,               m3
457
+    vpermw           m2,               m19, m2
458
+
459
+    vinserti128      ym1,              ym1, xm2, 1
460
+    vinserti64x4     m0,               m0, ym1, 1
461
+%endmacro
462
+
463
+INIT_ZMM avx512
464
+cglobal dct8, 3, 7, 24
465
+
466
+    vbroadcasti32x4  m5,               [pd_ %+ DCT8_ROUND1]
467
+    vbroadcasti32x8  m4,               [dct8_shuf]
468
+    vbroadcasti32x4  m19,              [dct8_shuf9_AVX512]
469
+
470
+    add              r2d,              r2d
471
+    lea              r3,               [r2 * 3]
472
+    lea              r4,               [r0 + r2 * 4]
473
+    lea              r5,               [tab_dct8]
474
+    lea              r6,               [tab_dct8_avx512]
475
+
476
+    ;pass1
477
+    mova            xm0,               [r0]
478
+    vinserti128     ym0,               ym0, [r4], 1
479
+    mova            xm1,               [r0 + r2]
480
+    vinserti128     ym1,               ym1, [r4 + r2], 1
481
+    mova            xm2,               [r0 + r2 * 2]
482
+    vinserti128     ym2,               ym2, [r4 + r2 * 2], 1
483
+    mova            xm3,               [r0 + r3]
484
+    vinserti128     ym3,               ym3,  [r4 + r3], 1
485
+
486
+    vinserti64x4    m0,                m0, ym2, 1
487
+    vinserti64x4    m1,                m1, ym3, 1
488
+
489
+    vpunpcklqdq     m2,                m0, m1
490
+    vpunpckhqdq     m0,                m1
491
+
492
+    vpshufb         m0,                m4
493
+    vpaddw          m3,                m2, m0
494
+    vpsubw          m2,                m0
495
+
496
+    vbroadcasti32x8 m1,                [dct8_shuf7_AVX512]
497
+
498
+    ; Load all the coefficients togather for better caching
499
+    vpbroadcastq    m20,               [r6 + 0 * 8]
500
+    vpbroadcastq    m21,               [r6 + 1 * 8]
501
+    vpbroadcastq    m22,               [r6 + 2 * 8]
502
+    vpbroadcastq    m23,               [r6 + 3 * 8]
503
+    vpbroadcastq    m7,                [r6 + 4 * 8]
504
+    vpbroadcastq    m12,               [r6 + 5 * 8]
505
+    vpbroadcastq    m14,               [r6 + 6 * 8]
506
+    vpbroadcastq    m16,               [r6 + 7 * 8]
507
+
508
+    DCT8_AVX512_PASS_1     20,       9, 21,      10
509
+    DCT8_AVX512_PASS_1     22,      11, 23,      10
510
+    DCT8_AVX512_PASS_1     7,       13, 12,      10
511
+    DCT8_AVX512_PASS_1     14,      15, 16,      10
512
+
513
+    ;pass2
514
+    vbroadcasti32x4        m5,          [pd_ %+ DCT8_ROUND2]
515
+
516
+    vinserti64x4           m9,          m9,  ym11, 1
517
+    vinserti64x4           m10,         m13, ym15, 1
518
+
519
+    ;Load all the coefficients togather for better caching and reuse common coefficients from PASS 1
520
+    vbroadcasti32x4    m21,                [r5 + 1 * 16]
521
+    vbroadcasti32x4    m22,                [r5 + 2 * 16]
522
+    vbroadcasti32x4    m23,                [r5 + 3 * 16]
523
+    vbroadcasti32x4    m12,                [r5 + 5 * 16]
524
+    vbroadcasti32x4    m14,                [r5 + 6 * 16]
525
+    vbroadcasti32x4    m16,                [r5 + 7 * 16]
526
+
527
+    DCT8_AVX512_PASS_2     20, 21, 22, 23
528
+    movu                   [r1],        m0
529
+    DCT8_AVX512_PASS_2     7, 12, 14, 16
530
+    movu                   [r1 + 64],   m0
531
+    RET
532
+
533
 %macro DCT16_PASS_1_E 2
534
     vpbroadcastq    m7,                [r7 + %1]
535
 
536
@@ -2527,10 +2962,401 @@
537
     dec             r4d
538
     jnz             .pass2
539
     RET
540
+%macro DCT16_avx512_PASS_1_O 4
541
+    vbroadcasti32x4  m1,               [r5 + %1]
542
+
543
+    pmaddwd          m3,               m6,  m1
544
+    vpsrldq          m11,              m3,  8
545
+    vpaddd           m3,               m11
546
+
547
+    pmaddwd          m11,              m8,  m1
548
+    vpsrldq          m12,              m11, 8
549
+    vpaddd           m11,              m12
550
+
551
+    vpunpcklqdq      m12,              m3, m11
552
+    vpsrldq          m11,              m12, 4
553
+    vpaddd           m11,              m12
554
+
555
+    pmaddwd          m3,               m10, m1
556
+    vpsrldq          m12,              m3,  8
557
+    vpaddd           m3,               m12
558
+
559
+    pmaddwd          m12,              m2,  m1
560
+    vpsrldq          m13,              m12, 8
561
+    vpaddd           m12,              m13
562
+
563
+    vpunpcklqdq      m13,              m3, m12
564
+    vpsrldq          m12,              m13, 4
565
+    vpaddd           m12,              m13
566
+
567
+    mova             m%3,              m26
568
+    vpermi2d         m%3,              m11, m12
569
+    paddd            m%3,              m0
570
+    psrad            m%3,              DCT_SHIFT
571
+
572
+    ; next row start
573
+    vbroadcasti32x4  m1,               [r5 + %2]
574
+
575
+    pmaddwd          m3,               m6,  m1
576
+    vpsrldq          m11,              m3,  8
577
+    vpaddd           m3,               m11
578
+
579
+    pmaddwd          m11,              m8,  m1
580
+    vpsrldq          m12,              m11, 8
581
+    vpaddd           m11,              m12
582
+
583
+    vpunpcklqdq      m12,              m3, m11
584
+    vpsrldq          m11,              m12, 4
585
+    vpaddd           m11,              m12
586
+
587
+    pmaddwd          m3,               m10, m1
588
+    vpsrldq          m12,              m3,  8
589
+    vpaddd           m3,               m12
590
+
591
+    pmaddwd          m12,              m2,  m1
592
+    vpsrldq          m13,              m12, 8
593
+    vpaddd           m12,              m13
594
+
595
+    vpunpcklqdq      m13,              m3, m12
596
+    vpsrldq          m12,              m13, 4
597
+    vpaddd           m12,              m13
598
+
599
+    mova             m%4,              m26
600
+    vpermi2d         m%4,              m11, m12
601
+    paddd            m%4,              m0
602
+    psrad            m%4,              DCT_SHIFT
603
+   ;next row end
604
+
605
+    packssdw         m%3,              m%4
606
+    vpermw           m%4,              m25, m%3
607
+%endmacro
608
+
609
+%macro DCT16_AVX512_PASS_1_LOOP 0
610
+    vbroadcasti32x8 m1,                [dct16_shuf1]
611
+    mova            m2,                [dct16_shuf3_AVX512]
612
+    mova            m3,                [dct16_shuf4_AVX512]
613
+
614
+    movu            ym4,               [r0]
615
+    movu            ym5,               [r0 + r2]
616
+    vinserti64x4    m4,                m4, ym5, 1
617
+
618
+    movu            ym5,               [r0 + 2 * r2]
619
+    movu            ym6,               [r0 + r3]
620
+    vinserti64x4    m5,                m5, ym6, 1
621
+
622
+    mova            m6,                m2
623
+    mova            m7,                m3
624
+    vpermi2q        m6,                m4, m5
625
+    vpermi2q        m7,                m4, m5
626
+
627
+    movu            ym4,               [r4]
628
+    movu            ym5,               [r4 + r2]
629
+    vinserti64x4    m4,                m4, ym5, 1
630
+
631
+    movu            ym5,               [r4 + 2 * r2]
632
+    movu            ym8,               [r4 + r3]
633
+    vinserti64x4    m5,                m5, ym8, 1
634
+
635
+    mova            m8,                m2
636
+    mova            m9,                m3
637
+    vpermi2q        m8,                m4, m5
638
+    vpermi2q        m9,                m4, m5
639
+
640
+    vpshufb         m7,                m1
641
+    vpshufb         m9,                m1
642
+
643
+    paddw           m4,                m6, m7
644
+    psubw           m6,                m7
645
+
646
+    paddw           m5,                m8, m9
647
+    psubw           m8,                m9
648
+
649
+    lea             r0,                [r0 + 8 * r2]
650
+    lea             r4,                [r0 + r2 * 4]
651
+
652
+    movu            ym7,               [r0]
653
+    movu            ym9,               [r0 + r2]
654
+    vinserti64x4    m7,                m7, ym9, 1
655
+
656
+    movu            ym9,               [r0 + 2 * r2]
657
+    movu            ym10,              [r0 + r3]
658
+    vinserti64x4    m9,                m9, ym10, 1
659
+
660
+    mova            m10,               m2
661
+    mova            m11,               m3
662
+    vpermi2q        m10,               m7, m9
663
+    vpermi2q        m11,               m7, m9
664
+
665
+    vpshufb         m11,               m1
666
+    paddw           m7,                m10, m11
667
+    psubw           m10,               m11
668
+
669
+    movu            ym9,               [r4]
670
+    movu            ym11,              [r4 + r2]
671
+    vinserti64x4    m9,                m9, ym11, 1
672
+
673
+    movu            ym11,              [r4 + 2 * r2]
674
+    movu            ym12,              [r4 + r3]
675
+    vinserti64x4    m11,               m11, ym12, 1
676
+
677
+    vpermi2q        m2,                m9, m11
678
+    vpermi2q        m3,                m9, m11
679
+
680
+    vpshufb         m3,                m1
681
+    paddw           m9,                m2, m3
682
+    psubw           m2,                m3
683
+%endmacro
684
+
685
+%macro DCT16_avx512_PASS_1_E 4
686
+    vpbroadcastq      m1,              [r5 + %1]
687
+
688
+    pmaddwd          m19,              m11,  m1
689
+    vpsrldq          m12,              m19,  4
690
+    vpaddd           m12,              m19
691
+
692
+    pmaddwd          m19,              m13,  m1
693
+    vpsrldq          m18,              m19,  4
694
+    vpaddd           m18,              m19
695
+
696
+    mova             m%2,              m27
697
+    vpermi2d         m%2,              m12, m18
698
+    paddd            m%2,              m0
699
+    psrad            m%2,              DCT_SHIFT
700
+
701
+    ; 2nd row
702
+    vpbroadcastq      m1,              [r5 + %3]
703
+
704
+    pmaddwd          m19,              m11,  m1
705
+    vpsrldq          m12,              m19,  4
706
+    vpaddd           m12,              m19
707
+
708
+    pmaddwd          m19,              m13,  m1
709
+    vpsrldq          m18,              m19,  4
710
+    vpaddd           m18,              m19
711
+
712
+    mova             m%4,              m27
713
+    vpermi2d         m%4,              m12, m18
714
+    paddd            m%4,              m0
715
+    psrad            m%4,              DCT_SHIFT
716
+
717
+    packssdw         m%2,              m%4
718
+    vpermw           m%4,              m25, m%2
719
+%endmacro
720
+
721
+%macro DCT16_PASS2_AVX512 10
722
+    vpmaddwd         m5,   m%2, m%1
723
+    vpsrldq          m6,   m5,  8
724
+    vpaddd           m5,   m6
725
+    vpsrldq          m6,   m5,  4
726
+    vpaddd           m5,   m6
727
+
728
+    vpmaddwd         m6,   m%3, m%1
729
+    vpsrldq          m7,   m6,  8
730
+    vpaddd           m6,   m7
731
+    vpsrldq          m7,   m6,  4
732
+    vpaddd           m6,   m7
733
+    vpunpckldq       m7,   m5, m6
734
+
735
+    vpmaddwd         m5,   m%4, m%1
736
+    vpsrldq          m6,   m5,  8
737
+    vpaddd           m5,   m6
738
+    vpsrldq          m6,   m5,  4
739
+    vpaddd           m5,   m6
740
+
741
+    vpmaddwd         m6,   m%5, m%1
742
+    vpsrldq          m8,   m6,  8
743
+    vpaddd           m6,   m8
744
+    vpsrldq          m8,   m6,  4
745
+    vpaddd           m6,   m8
746
+    vpunpckldq       m8,   m5, m6
747
+
748
+    vpunpcklqdq      m5,   m7, m8
749
+    vpermd           m5,   m2, m5
750
+    vpsrldq          m6,   m5,  4
751
+    vpaddd           m5,   m6
752
+
753
+    vpmaddwd         m6,   m%6, m%1
754
+    vpsrldq          m7,   m6,  8
755
+    vpaddd           m6,   m7
756
+    vpsrldq          m7,   m6,  4
757
+    vpaddd           m6,   m7
758
+
759
+    vpmaddwd         m7,   m%7, m%1
760
+    vpsrldq          m8,   m7,  8
761
+    vpaddd           m7,   m8
762
+    vpsrldq          m8,   m7,  4
763
+    vpaddd           m7,   m8
764
+    vpunpckldq       m8,   m6, m7
765
+
766
+    vpmaddwd         m6,   m%8, m%1
767
+    vpsrldq          m7,   m6,  8
768
+    vpaddd           m6,   m7
769
+    vpsrldq          m7,   m6,  4
770
+    vpaddd           m6,   m7
771
+
772
+    vpmaddwd         m7,   m%9, m%1
773
+    vpsrldq          m4,   m7,  8
774
+    vpaddd           m7,   m4
775
+    vpsrldq          m4,   m7,  4
776
+    vpaddd           m7,   m4
777
+    vpunpckldq       m4,   m6, m7
778
+
779
+    vpunpcklqdq      m6,   m8, m4
780
+    vpermd           m6,   m2, m6
781
+    vpsrldq          m7,   m6,  4
782
+    vpaddd           m6,   m7
783
+
784
+    paddd            m5, m0
785
+    psrad            m5, DCT_SHIFT2
786
+    paddd            m6, m0
787
+    psrad            m6, DCT_SHIFT2
788
+
789
+    packssdw         m5, m6
790
+    vpermw           m%10, m3, m5
791
+%endmacro
792
+
793
+INIT_ZMM avx512
794
+cglobal dct16, 3, 6, 29
795
+
796
+%if BIT_DEPTH == 12
797
+    %define          DCT_SHIFT          7
798
+    vbroadcasti32x4  m0,                [pd_64]
799
+%elif BIT_DEPTH == 10
800
+    %define          DCT_SHIFT          5
801
+    vbroadcasti32x4  m0,                [pd_16]
802
+%elif BIT_DEPTH == 8
803
+    %define          DCT_SHIFT          3
804
+    vbroadcasti32x4  m0,                [pd_4]
805
+%else
806
+    %error Unsupported BIT_DEPTH!
807
+%endif
808
+%define             DCT_SHIFT2         10
809
+
810
+    add             r2d,               r2d
811
+    lea             r3,                [r2 * 3]
812
+    lea             r4,                [r0 + r2 * 4]
813
+    lea             r5,                [tab_dct16_1 + 8 * 16]
814
+
815
+    ;Load reuseable table once to save memory movments
816
+    mova             m25,              [dct16_shuf5_AVX512]
817
+    mova             m26,              [dct16_shuf2_AVX512]
818
+    mova             m27,              [dct16_shuf7_AVX512]
819
+    vbroadcasti32x8  m28,              [dct16_shuf6_AVX512]
820
+
821
+    DCT16_AVX512_PASS_1_LOOP
822
+    DCT16_avx512_PASS_1_O              -7 * 16, -5 * 16, 15, 14    ;row 1,   3
823
+    DCT16_avx512_PASS_1_O              -3 * 16, -1 * 16, 16, 15    ;row 5,   7
824
+    DCT16_avx512_PASS_1_O               1 * 16,  3 * 16, 17, 16    ;row 9,  11
825
+    DCT16_avx512_PASS_1_O               5 * 16,  7 * 16, 18, 17    ;row 13, 15
826
+
827
+    vbroadcasti32x8 m1,                [dct16_shuf2]
828
+    pshufb          m4,                m1
829
+    pshufb          m5,                m1
830
+    pshufb          m7,                m1
831
+    pshufb          m9,                m1
832
+
833
+    vpsrldq          m3,              m4,  2
834
+    vpsubw           m11,             m4,  m3
835
+    vpsrldq          m6,              m5,  2
836
+    vpsubw           m12,             m5,  m6
837
+    vpsrldq          m8,              m7,  2
838
+    vpsubw           m13,             m7,  m8
839
+    vpsrldq          m10,             m9,  2
840
+    vpsubw           m18,             m9,  m10
841
+
842
+    vpermw           m11,             m28, m11
843
+    vpermw           m12,             m28, m12
844
+    vinserti64x4     m11,             m11, ym12, 1
845
+
846
+    vpermw           m13,             m28, m13
847
+    vpermw           m18,             m28, m18
848
+    vinserti64x4     m13,             m13, ym18, 1
849
+
850
+    DCT16_avx512_PASS_1_E            -6 * 16, 21, -2 * 16, 20    ; row  2,  6
851
+    DCT16_avx512_PASS_1_E             2 * 16, 22,  6 * 16, 21    ; row 10, 14
852
+
853
+    vpaddw           m11,             m4,  m3
854
+    vpaddw           m12,             m5,  m6
855
+    vpaddw           m13,             m7,  m8
856
+    vpaddw           m18,             m9,  m10
857
+
858
+    vpermw           m11,             m28, m11
859
+    vpermw           m12,             m28, m12
860
+    vinserti64x4     m11,             m11, ym12, 1
861
+
862
+    vpermw           m13,             m28, m13
863
+    vpermw           m18,             m28, m18
864
+    vinserti64x4     m13,             m13, ym18, 1
865
+
866
+    DCT16_avx512_PASS_1_E            -8 * 16, 23, 0 * 16, 22    ; row 0, 8
867
+    DCT16_avx512_PASS_1_E            -4 * 16, 24, 4 * 16, 23    ; row 4, 12
868
+
869
+    ;PASS2
870
+    vbroadcasti128    m0,             [pd_512]
871
+
872
+    lea              r5,              [tab_dct16]
873
+    mova             m2,              [dct16_shuf9_AVX512]
874
+    vbroadcasti32x8  m3,              [dct16_shuf8_AVX512]
875
+
876
+    vbroadcasti32x8  m1,              [r5 + 0 * 32]
877
+    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
878
+    vbroadcasti32x8  m1,              [r5 + 1 * 32]
879
+    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
880
+    vinserti64x4     m9,              m9, ym10, 1
881
+    movu             [r1 + 0 * 64],   m9
882
+
883
+    vbroadcasti32x8  m1,              [r5 + 2 * 32]
884
+    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
885
+    vbroadcasti32x8  m1,              [r5 + 3 * 32]
886
+    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
887
+    vinserti64x4     m9,              m9, ym10, 1
888
+    movu             [r1 + 1 * 64],   m9
889
+
890
+    vbroadcasti32x8  m1,              [r5 + 4 * 32]
891
+    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
892
+    vbroadcasti32x8  m1,              [r5 + 5 * 32]
893
+    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
894
+    vinserti64x4     m9,              m9, ym10, 1
895
+    movu             [r1 + 2 * 64],   m9
896
+
897
+    vbroadcasti32x8  m1,              [r5 + 6 * 32]
898
+    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
899
+    vbroadcasti32x8  m1,              [r5 + 7 * 32]
900
+    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
901
+    vinserti64x4     m9,              m9, ym10, 1
902
+    movu             [r1 + 3 * 64],   m9
903
+
904
+    vbroadcasti32x8  m1,              [r5 + 8 * 32]
905
+    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
906
+    vbroadcasti32x8  m1,              [r5 + 9 * 32]
907
+    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
908
+    vinserti64x4     m9,              m9, ym10, 1
909
+    movu             [r1 + 4 * 64],   m9
910
+
911
+    vbroadcasti32x8  m1,              [r5 + 10 * 32]
912
+    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
913
+    vbroadcasti32x8  m1,              [r5 + 11 * 32]
914
+    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
915
+    vinserti64x4     m9,              m9, ym10, 1
916
+    movu             [r1 + 5 * 64],   m9
917
+
918
+    vbroadcasti32x8  m1,              [r5 + 12 * 32]
919
+    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
920
+    vbroadcasti32x8  m1,              [r5 + 13 * 32]
921
+    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
922
+    vinserti64x4     m9,              m9, ym10, 1
923
+    movu             [r1 + 6 * 64],   m9
924
+
925
+    vbroadcasti32x8  m1,              [r5 + 14 * 32]
926
+    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
927
+    vbroadcasti32x8  m1,              [r5 + 15 * 32]
928
+    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
929
+    vinserti64x4     m9,              m9, ym10, 1
930
+    movu             [r1 + 7 * 64],   m9
931
+    RET
932
 
933
 %macro DCT32_PASS_1 4
934
     vbroadcasti128  m8,                [r7 + %1]
935
-
936
     pmaddwd         m11,               m%3, m8
937
     pmaddwd         m12,               m%4, m8
938
     phaddd          m11,               m12
939
@@ -2791,6 +3617,521 @@
940
     jnz             .pass2
941
     RET
942
 
943
+
944
+%macro DCT32_avx512_LOOP 4
945
+    movu            m1,               [r0]
946
+    movu            m2,               [r0 + r2]
947
+
948
+    vinserti64x4    m3,               m1, ym2, 1    ; row 0l, 1l
949
+    vextracti64x4   ym4,              m1, 1
950
+    vinserti64x4    m2,               m2, ym4, 0    ; row 0h, 1h
951
+    vpermw          m2,               m31, m2
952
+
953
+    psubw           m%1,              m3, m2        ; O
954
+    paddw           m3,               m2            ; E
955
+    mova            [r9 + %3 * 64],   m3
956
+
957
+    movu            m1,               [r0 + 2 * r2]
958
+    movu            m5,               [r0 + r3]
959
+
960
+    vinserti64x4    m6,               m1, ym5, 1    ; row 2l, 3l
961
+    vextracti64x4   ym7,              m1, 1
962
+    vinserti64x4    m5,               m5, ym7, 0    ; row 2h, 3h
963
+    vpermw          m5,               m31, m5
964
+
965
+    psubw           m%2,              m6, m5        ; O
966
+    paddw           m6,               m5            ; E
967
+    mova            [r9 + %4 * 64],   m6
968
+%endmacro
969
+
970
+%macro DCT32_avx512_PASS_1_O 3
971
+    pmaddwd          m10,              m%2,  m9
972
+    vpsrldq          m11,              m10, 8
973
+    vpaddd           m10,              m11
974
+
975
+    pmaddwd          m11,              m%3,  m9
976
+    vpsrldq          m12,              m11, 8
977
+    vpaddd           m11,              m12
978
+
979
+    mova             m12,              m8
980
+    vpermi2d         m12,              m10, m11
981
+    vpsrldq          m10,              m12, 8
982
+    vpaddd           m12,              m10
983
+    vpsrldq          m10,              m12, 4
984
+    vpaddd           m12,              m10
985
+
986
+    vpaddd           m12,              m0
987
+    vpsrad           m12,              DCT_SHIFT
988
+    vpackssdw        m12,              m12
989
+    vpermw           m12,              m30, m12
990
+    movq             [r5 + %1],        xm12
991
+%endmacro
992
+
993
+%macro DCT32_avx512_PASS_1_ROW_O 0
994
+    vbroadcasti32x8  m9,               [r7 + 1 * 32]
995
+
996
+    DCT32_avx512_LOOP 13, 14, 0, 1
997
+    DCT32_avx512_PASS_1_O              1 * 64 + 0 * 8, 13, 14
998
+
999
+    lea             r0,                [r0 + 4 * r2]
1000
+    DCT32_avx512_LOOP 15, 16, 2, 3
1001
+    DCT32_avx512_PASS_1_O              1 * 64 + 1 * 8, 15, 16
1002
+
1003
+    lea             r0,                [r0 + 4 * r2]
1004
+    DCT32_avx512_LOOP 17, 18, 4, 5
1005
+    DCT32_avx512_PASS_1_O              1 * 64 + 2 * 8, 17, 18
1006
+
1007
+    lea             r0,                [r0 + 4 * r2]
1008
+    DCT32_avx512_LOOP 19, 20, 6, 7
1009
+    DCT32_avx512_PASS_1_O              1 * 64 + 3 * 8, 19, 20
1010
+
1011
+    lea             r0,                [r0 + 4 * r2]
1012
+    DCT32_avx512_LOOP 21, 22, 8, 9
1013
+    DCT32_avx512_PASS_1_O              1 * 64 + 4 * 8, 21, 22
1014
+
1015
+    lea             r0,                [r0 + 4 * r2]
1016
+    DCT32_avx512_LOOP 23, 24, 10, 11
1017
+    DCT32_avx512_PASS_1_O              1 * 64 + 5 * 8, 23, 24
1018
+
1019
+    lea             r0,                [r0 + 4 * r2]
1020
+    DCT32_avx512_LOOP 25, 26, 12, 13
1021
+    DCT32_avx512_PASS_1_O              1 * 64 + 6 * 8, 25, 26
1022
+
1023
+    lea             r0,                [r0 + 4 * r2]
1024
+    DCT32_avx512_LOOP 27, 28, 14, 15
1025
+    DCT32_avx512_PASS_1_O              1 * 64 + 7 * 8, 27, 28
1026
+%endmacro
1027
+
1028
+%macro DCT32_avx512_PASS_1_ROW_O_1_7 1
1029
+    vbroadcasti32x8  m9,               [r7 + %1 * 32]
1030
+
1031
+    DCT32_avx512_PASS_1_O              %1 * 64 + 0 * 8, 13, 14
1032
+    DCT32_avx512_PASS_1_O              %1 * 64 + 1 * 8, 15, 16
1033
+    DCT32_avx512_PASS_1_O              %1 * 64 + 2 * 8, 17, 18
1034
+    DCT32_avx512_PASS_1_O              %1 * 64 + 3 * 8, 19, 20
1035
+    DCT32_avx512_PASS_1_O              %1 * 64 + 4 * 8, 21, 22
1036
+    DCT32_avx512_PASS_1_O              %1 * 64 + 5 * 8, 23, 24
1037
+    DCT32_avx512_PASS_1_O              %1 * 64 + 6 * 8, 25, 26
1038
+    DCT32_avx512_PASS_1_O              %1 * 64 + 7 * 8, 27, 28
1039
+%endmacro
1040
+
1041
+%macro DCT32_avx512_LOOP_EO 4
1042
+    mova            m4,                [rsp + 32 * mmsize + %3 * 64]
1043
+    vpermw          m4,                m8, m4
1044
+    vextracti64x4   ym5,               m4, 1
1045
+
1046
+    mova            m6,                [rsp + 32 * mmsize + %4 * 64]
1047
+    vpermw          m6,                m8, m6
1048
+    vextracti64x4   ym7,               m6, 1
1049
+
1050
+    vinserti64x4    m4,                m4, ym6, 1
1051
+    vinserti64x4    m5,                m5, ym7, 1
1052
+
1053
+    psubw           m%1,               m4, m5      ; EO
1054
+    paddw           m%2,               m4, m5      ; EE
1055
+%endmacro
1056
+
1057
+%macro DCT32_avx512_PASS_1_ROW_EO 2
1058
+    pmaddwd          m29,              m%2,  m12
1059
+    vpsrldq          m30,              m29,  8
1060
+    vpaddd           m30,              m29
1061
+    vpsrldq          m29,              m30,  4
1062
+    vpaddd           m29,              m30
1063
+
1064
+    vpaddd           m29,              m0
1065
+    vpsrad           m29,              DCT_SHIFT
1066
+    vpackssdw        m29,              m29
1067
+
1068
+    vpermw           m29,              m11, m29
1069
+    movq             [r5 + %1],        xm29
1070
+%endmacro
1071
+
1072
+%macro DCT32_avx512_PASS_1_ROW_EO_0 0
1073
+
1074
+    mova            m8,               [dct32_shuf2_AVX512]
1075
+    vbroadcasti32x4 m12,              [r7 + 2 * 32]
1076
+
1077
+    DCT32_avx512_LOOP_EO 13, 14, 0, 1
1078
+    DCT32_avx512_PASS_1_ROW_EO    2 * 64 + 0 * 8, 13
1079
+
1080
+    lea             r9,           [r9 + 4 * r2]
1081
+    DCT32_avx512_LOOP_EO 15, 16, 2, 3
1082
+    DCT32_avx512_PASS_1_ROW_EO    2 * 64 + 1 * 8, 15
1083
+
1084
+    lea             r9,           [r9 + 4 * r2]
1085
+    DCT32_avx512_LOOP_EO 17, 18, 4, 5
1086
+    DCT32_avx512_PASS_1_ROW_EO    2 * 64 + 2 * 8, 17
1087
+
1088
+    lea             r9,           [r9 + 4 * r2]
1089
+    DCT32_avx512_LOOP_EO 19, 20, 6, 7
1090
+    DCT32_avx512_PASS_1_ROW_EO    2 * 64 + 3 * 8, 19
1091
+
1092
+    lea             r9,           [r9 + 4 * r2]
1093
+    DCT32_avx512_LOOP_EO 21, 22, 8, 9
1094
+    DCT32_avx512_PASS_1_ROW_EO    2 * 64 + 4 * 8, 21
1095
+
1096
+    lea             r9,           [r9 + 4 * r2]
1097
+    DCT32_avx512_LOOP_EO 23, 24, 10, 11
1098
+    DCT32_avx512_PASS_1_ROW_EO    2 * 64 + 5 * 8, 23
1099
+
1100
+    lea             r9,           [r9 + 4 * r2]
1101
+    DCT32_avx512_LOOP_EO 25, 26, 12, 13
1102
+    DCT32_avx512_PASS_1_ROW_EO    2 * 64 + 6 * 8, 25
1103
+
1104
+    lea             r9,           [r9 + 4 * r2]
1105
+    DCT32_avx512_LOOP_EO 27, 28, 14, 15
1106
+    DCT32_avx512_PASS_1_ROW_EO    2 * 64 + 7 * 8, 27
1107
+
1108
+%endmacro
1109
+
1110
+%macro DCT32_avx512_PASS_1_ROW_EO_1_7 1
1111
+
1112
+    vbroadcasti32x4 m12,         [r7 + %1 * 32]
1113
+
1114
+    DCT32_avx512_PASS_1_ROW_EO   %1 * 64 + 0 * 8, 13
1115
+    DCT32_avx512_PASS_1_ROW_EO   %1 * 64 + 1 * 8, 15
1116
+    DCT32_avx512_PASS_1_ROW_EO   %1 * 64 + 2 * 8, 17
1117
+    DCT32_avx512_PASS_1_ROW_EO   %1 * 64 + 3 * 8, 19
1118
+    DCT32_avx512_PASS_1_ROW_EO   %1 * 64 + 4 * 8, 21
1119
+    DCT32_avx512_PASS_1_ROW_EO   %1 * 64 + 5 * 8, 23
1120
+    DCT32_avx512_PASS_1_ROW_EO   %1 * 64 + 6 * 8, 25
1121
+    DCT32_avx512_PASS_1_ROW_EO   %1 * 64 + 7 * 8, 27
1122
+
1123
+%endmacro
1124
+
1125
+%macro DCT32_avx512_LOOP_EEO 0
1126
+    vpunpcklqdq        m2,  m14, m16
1127
+    vpunpckhqdq        m14, m16
1128
+    vpshufb            m14, m31
1129
+
1130
+    vpaddw             m16, m2, m14     ; EEE
1131
+    vpsubw             m2,  m14         ; EE0
1132
+
1133
+    vpunpcklqdq        m3,  m18, m20
1134
+    vpunpckhqdq        m18, m20
1135
+    vpshufb            m18, m31
1136
+
1137
+    vpaddw             m20, m3, m18     ; EEE
1138
+    vpsubw             m3,  m18         ; EE0
1139
+
1140
+    vpunpcklqdq        m4,  m22, m24
1141
+    vpunpckhqdq        m22, m24
1142
+    vpshufb            m22, m31
1143
+
1144
+    vpaddw             m24, m4, m22     ; EEE
1145
+    vpsubw             m4,  m22         ; EE0
1146
+
1147
+    vpunpcklqdq        m5,  m26, m28
1148
+    vpunpckhqdq        m26, m28
1149
+    vpshufb            m26, m31
1150
+
1151
+    vpaddw             m28, m5, m26     ; EEE
1152
+    vpsubw             m5,  m26         ; EE0
1153
+%endmacro
1154
+
1155
+%macro DCT32_avx512_PASS_1_ROW_EEO 2
1156
+    pmaddwd          m30,              m%2,  m1
1157
+    vpsrldq          m29,              m30,  4
1158
+    vpaddd           m29,              m30
1159
+
1160
+    vpaddd           m29,              m0
1161
+    vpsrad           m29,              DCT_SHIFT
1162
+    vpackssdw        m29,              m29
1163
+
1164
+    vpermw           m29,              m27, m29
1165
+    movu             [r5 + %1],        xm29
1166
+%endmacro
1167
+
1168
+%macro DCT32_avx512_PASS_1_ROW_EEO_1_4 1
1169
+
1170
+vpbroadcastq     m1,            [r7 + %1 * 32]
1171
+DCT32_avx512_PASS_1_ROW_EEO     %1 * 64 + 0 * 16, 2
1172
+DCT32_avx512_PASS_1_ROW_EEO     %1 * 64 + 1 * 16, 3
1173
+DCT32_avx512_PASS_1_ROW_EEO     %1 * 64 + 2 * 16, 4
1174
+DCT32_avx512_PASS_1_ROW_EEO     %1 * 64 + 3 * 16, 5
1175
+
1176
+%endmacro
1177
+
1178
+%macro DCT32_avx512_PASS_1_ROW_EEEO_1_4 1
1179
+
1180
+vpbroadcastq     m1,            [r7 + %1 * 32]
1181
+DCT32_avx512_PASS_1_ROW_EEO     %1 * 64 + 0 * 16, 16
1182
+DCT32_avx512_PASS_1_ROW_EEO     %1 * 64 + 1 * 16, 20
1183
+DCT32_avx512_PASS_1_ROW_EEO     %1 * 64 + 2 * 16, 24
1184
+DCT32_avx512_PASS_1_ROW_EEO     %1 * 64 + 3 * 16, 28
1185
+
1186
+%endmacro
1187
+
1188
+%macro DCT32_avx512_PASS2_OPT 5
1189
+    pmaddwd         m9,                m1,  m%1
1190
+    vpsrldq         m10,               m9,  8
1191
+    vpaddd          m9,                m10
1192
+
1193
+    pmaddwd         m10,               m1,  m%2
1194
+    vpsrldq         m11,               m10, 8
1195
+    vpaddd          m10,               m11
1196
+
1197
+    pmaddwd         m11,               m1,  m%3
1198
+    vpsrldq         m12,               m11, 8
1199
+    vpaddd          m11,               m12
1200
+
1201
+    pmaddwd         m12,               m1,  m%4
1202
+    vpsrldq         m13,               m12, 8
1203
+    vpaddd          m12,               m13
1204
+
1205
+    vpsrldq         m13,               m9,  4
1206
+    vpaddd          m9,                m13
1207
+    vpsrldq         m13,               m10, 4
1208
+    vpaddd          m10,               m13
1209
+    vpsrldq         m13,               m11, 4
1210
+    vpaddd          m11,               m13
1211
+    vpsrldq         m13,               m12, 4
1212
+    vpaddd          m12,               m13
1213
+
1214
+    vpermd           m9,               m31,  m9
1215
+    vpermd          m10,               m31, m10
1216
+    vpermd          m11,               m31, m11
1217
+    vpermd          m12,               m31, m12
1218
+
1219
+    vpandd          m9,                m27
1220
+    vpandd          m10,               m30
1221
+    vpandd          m11,               m29
1222
+    vpandd          m12,               m28
1223
+
1224
+    vpaddd          m9,                m10
1225
+    vpaddd          m11,               m12
1226
+    vpaddd          m9,                m11
1227
+
1228
+    vpsrldq         m10,               m9, 8
1229
+    vpaddd          m9,                m10
1230
+    vpsrldq         m10,               m9, 4
1231
+    vpaddd          m9,                m10
1232
+
1233
+    vpermd          m9,                m31, m9
1234
+    vpaddd          m9,                m0
1235
+    vpsrad          m9,                DCT_SHIFT2
1236
+    vpackssdw       m9,                m9
1237
+    movq            [r1 + %5],         xm9
1238
+
1239
+%endmacro
1240
+
1241
+%macro DCT32_avx512_PASS2 5
1242
+
1243
+    mova            m9,                [r5 + %1]
1244
+    mova            m10,               [r5 + %2]
1245
+    mova            m11,               [r5 + %3]
1246
+    mova            m12,               [r5 + %4]
1247
+
1248
+    pmaddwd         m9,                m1,  m9
1249
+    vpsrldq         m13,               m9,  8
1250
+    vpaddd          m9,                m13
1251
+
1252
+    pmaddwd         m10,               m1,  m10
1253
+    vpsrldq         m13,               m10, 8
1254
+    vpaddd          m10,               m13
1255
+
1256
+    pmaddwd         m11,               m1,  m11
1257
+    vpsrldq         m13,               m11, 8
1258
+    vpaddd          m11,               m13
1259
+
1260
+    pmaddwd         m12,               m1,  m12
1261
+    vpsrldq         m13,               m12, 8
1262
+    vpaddd          m12,               m13
1263
+
1264
+    vpsrldq         m13,               m9,  4
1265
+    vpaddd          m9,                m13
1266
+    vpsrldq         m13,               m10, 4
1267
+    vpaddd          m10,               m13
1268
+    vpsrldq         m13,               m11, 4
1269
+    vpaddd          m11,               m13
1270
+    vpsrldq         m13,               m12, 4
1271
+    vpaddd          m12,               m13
1272
+
1273
+    vpermd           m9,               m31,  m9
1274
+    vpermd          m10,               m31, m10
1275
+    vpermd          m11,               m31, m11
1276
+    vpermd          m12,               m31, m12
1277
+
1278
+    vpandd          m9,                m27
1279
+    vpandd          m10,               m30
1280
+    vpandd          m11,               m29
1281
+    vpandd          m12,               m28
1282
+
1283
+    vpaddd          m9,                m10
1284
+    vpaddd          m11,               m12
1285
+    vpaddd          m9,                m11
1286
+
1287
+    vpsrldq         m10,               m9, 8
1288
+    vpaddd          m9,                m10
1289
+    vpsrldq         m10,               m9, 4
1290
+    vpaddd          m9,                m10
1291
+
1292
+    vpermd          m9,                m31, m9
1293
+    vpaddd          m9,                m0
1294
+    vpsrad          m9,                DCT_SHIFT2
1295
+    vpackssdw       m9,                m9
1296
+    movq            [r1 + %5],         xm9
1297
+
1298
+%endmacro
1299
+
1300
+%macro DCT32_avx512_PASS2_1_ROW 1
1301
+
1302
+mova            m1,                [r8 + %1 * 64]
1303
+
1304
+DCT32_avx512_PASS2_OPT  2,  3,  4, 14, %1 * 64 + 0 * 8
1305
+DCT32_avx512_PASS2_OPT 15, 16, 17, 18, %1 * 64 + 1 * 8
1306
+DCT32_avx512_PASS2_OPT 19, 20, 21, 22, %1 * 64 + 2 * 8
1307
+DCT32_avx512_PASS2_OPT 23, 24, 25, 26, %1 * 64 + 3 * 8
1308
+DCT32_avx512_PASS2_OPT  5,  6,  7,  8, %1 * 64 + 4 * 8
1309
+
1310
+DCT32_avx512_PASS2 20 * 64, 21 * 64, 22 * 64, 23 * 64, %1 * 64 + 5 * 8
1311
+DCT32_avx512_PASS2 24 * 64, 25 * 64, 26 * 64, 27 * 64, %1 * 64 + 6 * 8
1312
+DCT32_avx512_PASS2 28 * 64, 29 * 64, 30 * 64, 31 * 64, %1 * 64 + 7 * 8
1313
+
1314
+%endmacro
1315
+
1316
+INIT_ZMM avx512
1317
+cglobal dct32, 3, 10, 32, 0-(32*mmsize + 16*mmsize)
1318
+
1319
+%if BIT_DEPTH == 12
1320
+    %define         DCT_SHIFT          8
1321
+    vpbroadcastq    m0,                [pd_128]
1322
+%elif BIT_DEPTH == 10
1323
+    %define         DCT_SHIFT          6
1324
+    vpbroadcastq    m0,                [pd_32]
1325
+%elif BIT_DEPTH == 8
1326
+    %define         DCT_SHIFT          4
1327
+    vpbroadcastq    m0,                [pd_8]
1328
+%else
1329
+    %error Unsupported BIT_DEPTH!
1330
+%endif
1331
+%define             DCT_SHIFT2         11
1332
+
1333
+    add             r2d,               r2d
1334
+    lea             r7,                [tab_dct32_1]
1335
+    lea             r8,                [tab_dct32]
1336
+    lea             r3,                [r2 * 3]
1337
+    mov             r5,                rsp
1338
+    mov             r9,                2048    ; 32 * mmsize
1339
+    add             r9,                rsp
1340
+
1341
+    mova            m31,               [dct32_shuf1_AVX512]
1342
+
1343
+    ; PASSS 1
1344
+
1345
+    vbroadcasti32x8 m30,               [dct8_shuf9_AVX512]
1346
+    mova            m8,                [dct32_shuf_AVX512]
1347
+
1348
+    DCT32_avx512_PASS_1_ROW_O
1349
+    DCT32_avx512_PASS_1_ROW_O_1_7  3
1350
+    DCT32_avx512_PASS_1_ROW_O_1_7  5
1351
+    DCT32_avx512_PASS_1_ROW_O_1_7  7
1352
+    DCT32_avx512_PASS_1_ROW_O_1_7  9
1353
+    DCT32_avx512_PASS_1_ROW_O_1_7 11
1354
+    DCT32_avx512_PASS_1_ROW_O_1_7 13
1355
+    DCT32_avx512_PASS_1_ROW_O_1_7 15
1356
+    DCT32_avx512_PASS_1_ROW_O_1_7 17
1357
+    DCT32_avx512_PASS_1_ROW_O_1_7 19
1358
+    DCT32_avx512_PASS_1_ROW_O_1_7 20
1359
+    DCT32_avx512_PASS_1_ROW_O_1_7 21
1360
+    DCT32_avx512_PASS_1_ROW_O_1_7 23
1361
+    DCT32_avx512_PASS_1_ROW_O_1_7 25
1362
+    DCT32_avx512_PASS_1_ROW_O_1_7 27
1363
+    DCT32_avx512_PASS_1_ROW_O_1_7 29
1364
+    DCT32_avx512_PASS_1_ROW_O_1_7 31
1365
+
1366
+    vbroadcasti32x8  m11,               [dct8_shuf9_AVX512]
1367
+
1368
+    DCT32_avx512_PASS_1_ROW_EO_0
1369
+    DCT32_avx512_PASS_1_ROW_EO_1_7 6
1370
+    DCT32_avx512_PASS_1_ROW_EO_1_7 10
1371
+    DCT32_avx512_PASS_1_ROW_EO_1_7 14
1372
+    DCT32_avx512_PASS_1_ROW_EO_1_7 18
1373
+    DCT32_avx512_PASS_1_ROW_EO_1_7 22
1374
+    DCT32_avx512_PASS_1_ROW_EO_1_7 26
1375
+    DCT32_avx512_PASS_1_ROW_EO_1_7 30
1376
+
1377
+    vbroadcasti32x4  m31,               [dct8_shuf]
1378
+    vbroadcasti32x8  m27,               [dct32_shuf3_AVX512]
1379
+
1380
+    DCT32_avx512_LOOP_EEO
1381
+    DCT32_avx512_PASS_1_ROW_EEO_1_4 4
1382
+    DCT32_avx512_PASS_1_ROW_EEO_1_4 12
1383
+    DCT32_avx512_PASS_1_ROW_EEO_1_4 20
1384
+    DCT32_avx512_PASS_1_ROW_EEO_1_4 28
1385
+
1386
+    DCT32_avx512_PASS_1_ROW_EEEO_1_4 0
1387
+    DCT32_avx512_PASS_1_ROW_EEEO_1_4 16
1388
+    DCT32_avx512_PASS_1_ROW_EEEO_1_4 8
1389
+    DCT32_avx512_PASS_1_ROW_EEEO_1_4 24
1390
+
1391
+    ; PASS 2
1392
+
1393
+    vpbroadcastq    m0,               [pd_1024]
1394
+    vbroadcasti32x8 m31,              [dct32_shuf4_AVX512]
1395
+    movu            m30,              [dct32_shuf5_AVX512]
1396
+    movu            m29,              [dct32_shuf6_AVX512]
1397
+    movu            m28,              [dct32_shuf7_AVX512]
1398
+    movu            m27,              [dct32_shuf8_AVX512]
1399
+
1400
+    ;Load the source coefficents into free registers and reuse them for all rows
1401
+
1402
+    mova            m2,               [r5 +  0 * 64]
1403
+    mova            m3,               [r5 +  1 * 64]
1404
+    mova            m4,               [r5 +  2 * 64]
1405
+    mova            m14,              [r5 +  3 * 64]
1406
+    mova            m15,              [r5 +  4 * 64]
1407
+    mova            m16,              [r5 +  5 * 64]
1408
+    mova            m17,              [r5 +  6 * 64]
1409
+    mova            m18,              [r5 +  7 * 64]
1410
+    mova            m19,              [r5 +  8 * 64]
1411
+    mova            m20,              [r5 +  9 * 64]
1412
+    mova            m21,              [r5 + 10 * 64]
1413
+    mova            m22,              [r5 + 11 * 64]
1414
+    mova            m23,              [r5 + 12 * 64]
1415
+    mova            m24,              [r5 + 13 * 64]
1416
+    mova            m25,              [r5 + 14 * 64]
1417
+    mova            m26,              [r5 + 15 * 64]
1418
+    mova             m5,              [r5 + 16 * 64]
1419
+    mova             m6,              [r5 + 17 * 64]
1420
+    mova             m7,              [r5 + 18 * 64]
1421
+    mova             m8,              [r5 + 19 * 64]
1422
+
1423
+    DCT32_avx512_PASS2_1_ROW 0
1424
+    DCT32_avx512_PASS2_1_ROW 1
1425
+    DCT32_avx512_PASS2_1_ROW 2
1426
+    DCT32_avx512_PASS2_1_ROW 3
1427
+    DCT32_avx512_PASS2_1_ROW 4
1428
+    DCT32_avx512_PASS2_1_ROW 5
1429
+    DCT32_avx512_PASS2_1_ROW 6
1430
+    DCT32_avx512_PASS2_1_ROW 7
1431
+    DCT32_avx512_PASS2_1_ROW 8
1432
+    DCT32_avx512_PASS2_1_ROW 9
1433
+    DCT32_avx512_PASS2_1_ROW 10
1434
+    DCT32_avx512_PASS2_1_ROW 11
1435
+    DCT32_avx512_PASS2_1_ROW 12
1436
+    DCT32_avx512_PASS2_1_ROW 13
1437
+    DCT32_avx512_PASS2_1_ROW 14
1438
+    DCT32_avx512_PASS2_1_ROW 15
1439
+    DCT32_avx512_PASS2_1_ROW 16
1440
+    DCT32_avx512_PASS2_1_ROW 17
1441
+    DCT32_avx512_PASS2_1_ROW 18
1442
+    DCT32_avx512_PASS2_1_ROW 19
1443
+    DCT32_avx512_PASS2_1_ROW 20
1444
+    DCT32_avx512_PASS2_1_ROW 21
1445
+    DCT32_avx512_PASS2_1_ROW 22
1446
+    DCT32_avx512_PASS2_1_ROW 23
1447
+    DCT32_avx512_PASS2_1_ROW 24
1448
+    DCT32_avx512_PASS2_1_ROW 25
1449
+    DCT32_avx512_PASS2_1_ROW 26
1450
+    DCT32_avx512_PASS2_1_ROW 27
1451
+    DCT32_avx512_PASS2_1_ROW 28
1452
+    DCT32_avx512_PASS2_1_ROW 29
1453
+    DCT32_avx512_PASS2_1_ROW 30
1454
+    DCT32_avx512_PASS2_1_ROW 31
1455
+
1456
+    RET
1457
+
1458
 %macro IDCT8_PASS_1 1
1459
     vpbroadcastd    m7,                [r5 + %1]
1460
     vpbroadcastd    m10,               [r5 + %1 + 4]
1461
@@ -2969,6 +4310,213 @@
1462
     mova            [r1 + r3],         xm3
1463
     RET
1464
 
1465
+
1466
+%macro IDCT8_AVX512_PASS_1 0
1467
+    pmaddwd         m5,                m29, m17
1468
+    pmaddwd         m6,                m25, m18
1469
+    paddd           m5,                m6
1470
+
1471
+    pmaddwd         m6,                m30, m21
1472
+    pmaddwd         m3,                m26, m22
1473
+    paddd           m6,                m3
1474
+
1475
+    paddd           m3,                m5, m6
1476
+    paddd           m3,                m11
1477
+    psrad           m3,                IDCT_SHIFT1
1478
+
1479
+    psubd           m5,                m6
1480
+    paddd           m5,                m11
1481
+    psrad           m5,                IDCT_SHIFT1
1482
+
1483
+    pmaddwd         m6,                m29, m19
1484
+    pmaddwd         m8,                m25, m20
1485
+    paddd           m6,                m8
1486
+
1487
+    pmaddwd         m8,                m30, m23
1488
+    pmaddwd         m9,                m26, m24
1489
+    paddd           m8,                m9
1490
+
1491
+    paddd           m9,                m6, m8
1492
+    paddd           m9,                m11
1493
+    psrad           m9,                IDCT_SHIFT1
1494
+
1495
+    psubd           m6,                m8
1496
+    paddd           m6,                m11
1497
+    psrad           m6,                IDCT_SHIFT1
1498
+
1499
+    packssdw        m3,                m9
1500
+    vpermq          m3,                m3, 0xD8
1501
+
1502
+    packssdw        m6,                m5
1503
+    vpermq          m6,                m6, 0xD8
1504
+%endmacro
1505
+
1506
+
1507
+%macro IDCT8_AVX512_PASS_2 0
1508
+    mov             r7d, 0xAAAA
1509
+    kmovd           k1, r7d
1510
+    punpcklqdq      m2,                m3, m13
1511
+    punpckhqdq      m0,                m3, m13
1512
+
1513
+    pmaddwd         m3,                m2, [r5]
1514
+    pmaddwd         m5,                m2, [r5 + 1 * mmsize]
1515
+    pmaddwd         m6,                m2, [r5 + 2 * mmsize]
1516
+    pmaddwd         m7,                m2, [r5 + 3 * mmsize]
1517
+
1518
+    vpsrldq         m14,   m3, 4
1519
+    paddd            m3,  m14
1520
+    vpslldq         m16,   m5, 4
1521
+    paddd            m5,  m16
1522
+    vmovdqu32        m3   {k1}, m5
1523
+
1524
+    vpsrldq         m14,   m6, 4
1525
+    paddd            m6,  m14
1526
+    vpslldq         m16,   m7, 4
1527
+    paddd            m7,  m16
1528
+    vmovdqu32        m6   {k1}, m7
1529
+
1530
+    punpcklqdq      m7,                m3, m6
1531
+    punpckhqdq      m3,                m6
1532
+
1533
+    pmaddwd         m5,                m0, [r6]
1534
+    pmaddwd         m6,                m0, [r6 + 1 * mmsize]
1535
+    pmaddwd         m8,                m0, [r6 + 2 * mmsize]
1536
+    pmaddwd         m9,                m0, [r6 + 3 * mmsize]
1537
+
1538
+    vpsrldq         m14,   m5, 4
1539
+    paddd            m5,  m14
1540
+    vpslldq         m16,   m6, 4
1541
+    paddd            m6,  m16
1542
+    vmovdqu32        m5   {k1}, m6
1543
+
1544
+    vpsrldq         m14,   m8, 4
1545
+    paddd            m8,  m14
1546
+    vpslldq         m16,   m9, 4
1547
+    paddd            m9,  m16
1548
+    vmovdqu32        m8   {k1}, m9
1549
+
1550
+    punpcklqdq      m6,                m5, m8
1551
+    punpckhqdq      m5,                m8
1552
+
1553
+    paddd           m8,                m7, m6
1554
+    paddd           m8,                m12
1555
+    psrad           m8,                IDCT_SHIFT2
1556
+
1557
+    psubd           m7,                m6
1558
+    paddd           m7,                m12
1559
+    psrad           m7,                IDCT_SHIFT2
1560
+
1561
+    pshufb          m7,                [idct8_avx512_shuf3]
1562
+    packssdw        m8,                 m7
1563
+
1564
+    paddd           m9,                m3, m5
1565
+    paddd           m9,                m12
1566
+    psrad           m9,                IDCT_SHIFT2
1567
+
1568
+    psubd           m3,                m5
1569
+    paddd           m3,                m12
1570
+    psrad           m3,                IDCT_SHIFT2
1571
+
1572
+    pshufb          m3,                [idct8_avx512_shuf3]
1573
+    packssdw        m9,                m3
1574
+%endmacro
1575
+
1576
+
1577
+%if ARCH_X86_64
1578
+INIT_ZMM avx512
1579
+cglobal idct8, 3, 8, 31
1580
+%if BIT_DEPTH == 12
1581
+    %define         IDCT_SHIFT2        8
1582
+    vpbroadcastd    m12,                [pd_128]
1583
+%elif BIT_DEPTH == 10
1584
+    %define         IDCT_SHIFT2        10
1585
+    vpbroadcastd    m12,                [pd_512]
1586
+%elif BIT_DEPTH == 8
1587
+    %define         IDCT_SHIFT2        12
1588
+    vpbroadcastd    m12,                [pd_2048]
1589
+%else
1590
+    %error Unsupported BIT_DEPTH!
1591
+%endif
1592
+%define             IDCT_SHIFT1         7
1593
+
1594
+    vpbroadcastd     m11,               [pd_64]
1595
+
1596
+    lea             r4,                [avx512_idct8_3]
1597
+    lea             r5,                [avx2_idct8_1]
1598
+    lea             r6,                [avx2_idct8_2]
1599
+    movu           m16,                [idct16_shuff2]
1600
+    movu           m17,                [idct16_shuff3]
1601
+
1602
+    ;pass1
1603
+    mova            ym1, [r0 + 0 * 32]
1604
+    mova            ym0, [r0 + 1 * 32]
1605
+    mova            ym25, ym16
1606
+    mova            ym26, ym17
1607
+    vpermi2w        ym25,  ym1, ym0
1608
+    vpermi2w        ym26,  ym1, ym0
1609
+
1610
+    mova            ym1, [r0 + 2 * 32]
1611
+    mova            ym0, [r0 + 3 * 32]
1612
+    mova            ym27, ym16
1613
+    mova            ym28, ym17
1614
+    vpermi2w        ym27,  ym1, ym0
1615
+    vpermi2w        ym28,  ym1, ym0
1616
+    
1617
+    vperm2i128      ym29, ym25, ym26, 0x20
1618
+    vperm2i128      ym30, ym25, ym26, 0x31
1619
+    vperm2i128      ym25, ym27, ym28, 0x20
1620
+    vperm2i128      ym26, ym27, ym28, 0x31
1621
+
1622
+    vinserti64x4    m29,        m29,      ym29, 1
1623
+    vinserti64x4    m25,        m25,      ym25, 1
1624
+    vinserti64x4    m30,        m30,      ym30, 1
1625
+    vinserti64x4    m26,        m26,      ym26, 1
1626
+
1627
+    movu            m17,                [r4]
1628
+    movu            m18,                [r4 + 1 * mmsize]
1629
+    movu            m19,                [r4 + 2 * mmsize]
1630
+    movu            m20,                [r4 + 3 * mmsize]
1631
+    movu            m21,                [r4 + 4 * mmsize]
1632
+    movu            m22,                [r4 + 5 * mmsize]
1633
+    movu            m23,                [r4 + 6 * mmsize]
1634
+    movu            m24,                [r4 + 7 * mmsize]
1635
+
1636
+    IDCT8_AVX512_PASS_1
1637
+
1638
+    vextracti64x4   ym13,       m3,      1
1639
+    vextracti64x4   ym14,       m6,      1
1640
+    vinserti64x4      m3,       m3,      ym14, 1
1641
+    vinserti64x4     m13,      m13,       ym6, 1
1642
+
1643
+    ;pass2
1644
+    add             r2d,               r2d
1645
+    lea             r3,                [r2 * 3]
1646
+    lea             r5,                [avx512_idct8_1]
1647
+    lea             r6,                [avx512_idct8_2]
1648
+
1649
+    IDCT8_AVX512_PASS_2
1650
+
1651
+    vextracti128    xm3,               ym8, 1
1652
+    mova            [r1],              xm8
1653
+    mova            [r1 + r2],         xm3
1654
+    vextracti128    xm3,               ym9, 1
1655
+    mova            [r1 + r2 * 2],     xm9
1656
+    mova            [r1 + r3],         xm3
1657
+
1658
+    lea             r1,                [r1 + r2 * 4]
1659
+
1660
+    vextracti64x4   ym10,   m8, 1
1661
+    vextracti64x4   ym11,   m9, 1
1662
+
1663
+    vextracti128    xm3,               ym10, 1
1664
+    mova            [r1],              xm10
1665
+    mova            [r1 + r2],         xm3
1666
+    vextracti128    xm3,               ym11, 1
1667
+    mova            [r1 + r2 * 2],     xm11
1668
+    mova            [r1 + r3],         xm3
1669
+    RET
1670
+%endif
1671
+
1672
 %macro IDCT_PASS1 2
1673
     vbroadcasti128  m5, [tab_idct16_2 + %1 * 16]
1674
 
1675
@@ -3266,6 +4814,574 @@
1676
     jnz             .pass2
1677
     RET
1678
 
1679
+
1680
+%macro IDCT16_AVX512_PASS1 3
1681
+    movu            m5,  [tab_AVX512_idct16_2 + %1 * 64]
1682
+    pmaddwd         m9, m4, m5
1683
+    pmaddwd         m10, m6, m5
1684
+
1685
+    vpsrldq         m16,   m9, 4
1686
+    paddd            m9,  m16
1687
+    vpslldq         m17,   m10, 4
1688
+    paddd            m10,  m17
1689
+    vmovdqu32        m9   {k1}, m10
1690
+
1691
+    pmaddwd         m10, m7, m5
1692
+    pmaddwd         m11, m8, m5
1693
+
1694
+    vpsrldq         m16,   m10, 4
1695
+    paddd            m10,  m16
1696
+    vpslldq         m17,   m11, 4
1697
+    paddd            m11,  m17
1698
+    vmovdqu32        m10   {k1}, m11
1699
+
1700
+    vpsrldq         m16,   m9, 8
1701
+    paddd            m9,  m16
1702
+    vpslldq         m17,   m10, 8
1703
+    paddd            m10,  m17
1704
+    vmovdqu32        m9   {k2}, m10
1705
+
1706
+    mova            m5,  [tab_AVX512_idct16_1 + %1 * 64]
1707
+    pmaddwd         m10, m28, m5
1708
+    pmaddwd         m11, m29, m5
1709
+
1710
+    vpsrldq         m16,   m10, 4
1711
+    paddd            m10,  m16
1712
+    vpslldq         m17,   m11, 4
1713
+    paddd            m11,  m17
1714
+    vmovdqu32        m10   {k1}, m11
1715
+
1716
+    pmaddwd         m11, m30, m5
1717
+    pmaddwd         m12, m31, m5
1718
+
1719
+    vpsrldq         m16,   m11, 4
1720
+    paddd            m11,  m16
1721
+    vpslldq         m17,   m12, 4
1722
+    paddd            m12,  m17
1723
+    vmovdqu32        m11   {k1}, m12
1724
+
1725
+    vpsrldq         m16,   m10, 8
1726
+    paddd            m10,  m16
1727
+    vpslldq         m17,   m11, 8
1728
+    paddd            m11,  m17
1729
+    vmovdqu32        m10   {k2}, m11
1730
+
1731
+    paddd           m11, m9, m10
1732
+    paddd           m11, m14
1733
+    psrad           m11, IDCT_SHIFT1
1734
+
1735
+    psubd           m9, m10
1736
+    paddd           m9, m14
1737
+    psrad           m9, IDCT_SHIFT1
1738
+
1739
+    mova            m5,  [tab_AVX512_idct16_2 + %1 * 64 + 64]
1740
+    pmaddwd         m10, m4, m5
1741
+    pmaddwd         m12, m6, m5
1742
+
1743
+
1744
+    vpsrldq         m16,   m10, 4
1745
+    paddd            m10,  m16
1746
+    vpslldq         m17,   m12, 4
1747
+    paddd            m12,  m17
1748
+    vmovdqu32        m10   {k1}, m12
1749
+
1750
+    pmaddwd         m12, m7, m5
1751
+    pmaddwd         m13, m8, m5
1752
+
1753
+
1754
+    vpsrldq         m16,   m12, 4
1755
+    paddd            m12,  m16
1756
+    vpslldq         m17,   m13, 4
1757
+    paddd            m13,  m17
1758
+    vmovdqu32        m12   {k1}, m13
1759
+
1760
+
1761
+    vpsrldq         m16,   m10, 8
1762
+    paddd            m10,  m16
1763
+    vpslldq         m17,   m12, 8
1764
+    paddd            m12,  m17
1765
+    vmovdqu32        m10   {k2}, m12
1766
+
1767
+
1768
+
1769
+    mova            m5,  [tab_AVX512_idct16_1 + %1 * 64 + 64] 
1770
+    pmaddwd         m12, m28, m5
1771
+    pmaddwd         m13, m29, m5
1772
+
1773
+
1774
+    vpsrldq         m16,   m12, 4
1775
+    paddd            m12,  m16
1776
+    vpslldq         m17,   m13, 4
1777
+    paddd            m13,  m17
1778
+    vmovdqu32        m12   {k1}, m13
1779
+
1780
+    pmaddwd         m13, m30, m5
1781
+    pmaddwd         m5, m31
1782
+
1783
+
1784
+    vpsrldq         m16,   m13, 4
1785
+    paddd            m13,  m16
1786
+    vpslldq         m17,   m5, 4
1787
+    paddd            m5,  m17
1788
+    vmovdqu32        m13   {k1}, m5
1789
+
1790
+
1791
+    vpsrldq         m16,   m12, 8
1792
+    paddd            m12,  m16
1793
+    vpslldq         m17,   m13, 8
1794
+    paddd            m13,  m17
1795
+    vmovdqu32        m12   {k2}, m13
1796
+
1797
+
1798
+    paddd           m5, m10, m12
1799
+    paddd           m5, m14
1800
+    psrad           m5, IDCT_SHIFT1
1801
+
1802
+    psubd           m10, m12
1803
+    paddd           m10, m14
1804
+    psrad           m10, IDCT_SHIFT1
1805
+
1806
+    packssdw        m11, m5
1807
+    packssdw        m9, m10
1808
+
1809
+    mova            m10, [idct16_AVX512_shuff]
1810
+    mova            m5,  [idct16_AVX512_shuff1]
1811
+
1812
+    vpermd          m%2, m10, m11
1813
+    vpermd          m%3, m5, m9
1814
+%endmacro
1815
+
1816
+%macro IDCT16_AVX512_PASS2 2
1817
+    vpermq          m0, m%1, 0xD8
1818
+
1819
+    pmaddwd         m1, m0, m7
1820
+    pmaddwd         m2, m0, m8
1821
+
1822
+
1823
+    vpsrldq         m14,   m1, 4
1824
+    paddd            m1,  m14
1825
+    vpslldq         m31,   m2, 4
1826
+    paddd            m2,  m31
1827
+    vmovdqu32        m1   {k1}, m2
1828
+
1829
+    pmaddwd         m2, m0, m9
1830
+    pmaddwd         m3, m0, m10
1831
+
1832
+
1833
+    vpsrldq         m14,   m2, 4
1834
+    paddd            m2,  m14
1835
+    vpslldq         m31,   m3, 4
1836
+    paddd            m3,  m31
1837
+    vmovdqu32        m2   {k1}, m3
1838
+
1839
+
1840
+    vpsrldq         m14,   m1, 8
1841
+    paddd            m1,  m14
1842
+    vpslldq         m31,   m2, 8
1843
+    paddd            m2,  m31
1844
+    vmovdqu32        m1   {k2}, m2
1845
+
1846
+    pmaddwd         m2, m0, m11
1847
+    pmaddwd         m3, m0, m12
1848
+
1849
+
1850
+    vpsrldq         m14,   m2, 4
1851
+    paddd            m2,  m14
1852
+    vpslldq         m31,   m3, 4
1853
+    paddd            m3,  m31
1854
+    vmovdqu32        m2   {k1}, m3
1855
+
1856
+    vbroadcasti64x2  m14, [r5 + 112]
1857
+    pmaddwd         m3, m0, m13
1858
+    pmaddwd         m4, m0, m14
1859
+
1860
+
1861
+    vpsrldq         m14,   m3, 4
1862
+    paddd            m3,  m14
1863
+    vpslldq         m31,   m4, 4
1864
+    paddd            m4,  m31
1865
+    vmovdqu32        m3   {k1}, m4
1866
+
1867
+
1868
+    vpsrldq         m14,   m2, 8
1869
+    paddd            m2,  m14
1870
+    vpslldq         m31,   m3, 8
1871
+    paddd            m3,  m31
1872
+    vmovdqu32        m2   {k2}, m3
1873
+
1874
+    vpermq          m0, m%2, 0xD8
1875
+    pmaddwd         m3, m0, m16
1876
+    pmaddwd         m4, m0, m17
1877
+
1878
+
1879
+    vpsrldq         m14,   m3, 4
1880
+    paddd            m3,  m14
1881
+    vpslldq         m31,   m4, 4
1882
+    paddd            m4,  m31
1883
+    vmovdqu32        m3   {k1}, m4
1884
+
1885
+    pmaddwd         m4, m0, m19
1886
+    pmaddwd         m5, m0, m23
1887
+
1888
+
1889
+    vpsrldq         m14,   m4, 4
1890
+    paddd            m4,  m14
1891
+    vpslldq         m31,   m5, 4
1892
+    paddd            m5,  m31
1893
+    vmovdqu32        m4   {k1}, m5
1894
+
1895
+
1896
+    vpsrldq         m14,   m3, 8
1897
+    paddd            m3,  m14
1898
+    vpslldq         m31,   m4, 8
1899
+    paddd            m4,  m31
1900
+    vmovdqu32        m3   {k2}, m4
1901
+
1902
+
1903
+    pmaddwd         m4, m0, m28
1904
+    pmaddwd         m5, m0, m29
1905
+
1906
+    vpsrldq         m14,   m4, 4
1907
+    paddd            m4,  m14
1908
+    vpslldq         m31,   m5, 4
1909
+    paddd            m5,  m31
1910
+    vmovdqu32        m4   {k1}, m5
1911
+
1912
+    pmaddwd         m6, m0, m30
1913
+    vbroadcasti64x2  m31, [r6 + 112]
1914
+    pmaddwd         m0, m31
1915
+
1916
+
1917
+    vpsrldq         m14,   m6, 4
1918
+    paddd            m6,  m14
1919
+    vpslldq         m31,   m0, 4
1920
+    paddd            m0,  m31
1921
+    vmovdqu32        m6   {k1}, m0
1922
+
1923
+
1924
+    vpsrldq         m14,   m4, 8
1925
+    paddd            m4,  m14
1926
+    vpslldq         m31,   m6, 8
1927
+    paddd            m6,  m31
1928
+    vmovdqu32        m4   {k2}, m6
1929
+
1930
+    paddd           m5, m1, m3
1931
+    paddd           m5, m15
1932
+    psrad           m5, IDCT_SHIFT2
1933
+
1934
+    psubd           m1, m3
1935
+    paddd           m1, m15
1936
+    psrad           m1, IDCT_SHIFT2
1937
+
1938
+    paddd           m6, m2, m4
1939
+    paddd           m6, m15
1940
+    psrad           m6, IDCT_SHIFT2
1941
+
1942
+    psubd           m2, m4
1943
+    paddd           m2, m15
1944
+    psrad           m2, IDCT_SHIFT2
1945
+
1946
+    packssdw        m5, m6
1947
+    packssdw        m1, m2
1948
+    pshufb          m2, m1, [idct16_AVX512_shuff6]
1949
+%endmacro
1950
+
1951
+
1952
+;-------------------------------------------------------
1953
+; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
1954
+;-------------------------------------------------------
1955
+INIT_ZMM avx512
1956
+cglobal idct16, 3, 8, 32
1957
+%if BIT_DEPTH == 12
1958
+    %define         IDCT_SHIFT2        8
1959
+    vpbroadcastd    m15,                [pd_128]
1960
+%elif BIT_DEPTH == 10
1961
+    %define         IDCT_SHIFT2        10
1962
+    vpbroadcastd    m15,                [pd_512]
1963
+%elif BIT_DEPTH == 8
1964
+    %define         IDCT_SHIFT2        12
1965
+    vpbroadcastd    m15,                [pd_2048]
1966
+%else
1967
+    %error Unsupported BIT_DEPTH!
1968
+%endif
1969
+%define             IDCT_SHIFT1         7
1970
+
1971
+    vpbroadcastd    m14,               [pd_64]
1972
+
1973
+    add             r2d,               r2d
1974
+
1975
+    mov             r7d,    0xAAAA
1976
+    kmovd            k1,    r7d
1977
+    mov             r7d,    0xCCCC
1978
+    kmovd            k2,    r7d
1979
+    mova          ym2, [idct16_shuff2]
1980
+    mova          ym3, [idct16_shuff3]
1981
+    mova         ym26, [idct16_shuff4]
1982
+    mova         ym27, [idct16_shuff5]
1983
+
1984
+.pass1:
1985
+    movu          xm0, [r0 + 0 * 32]
1986
+    vinserti128   ym0, ym0, [r0 + 8 * 32], 1
1987
+    movu          xm1, [r0 + 2 * 32]
1988
+    vinserti128   ym1, ym1, [r0 + 10 * 32], 1
1989
+
1990
+    mova          ym9, ym2
1991
+    mova         ym10, ym3
1992
+    vpermi2w      ym9, ym0, ym1
1993
+    vpermi2w     ym10, ym0, ym1
1994
+
1995
+    movu          xm0, [r0 + 4 * 32]
1996
+    vinserti128   ym0, ym0, [r0 + 12 * 32], 1
1997
+    movu          xm1, [r0 + 6 * 32]
1998
+    vinserti128   ym1, ym1, [r0 + 14 * 32], 1
1999
+
2000
+    mova         ym11, ym2
2001
+    mova         ym12, ym3
2002
+    vpermi2w     ym11, ym0,  ym1
2003
+    vpermi2w     ym12, ym0,  ym1
2004
+
2005
+    mova         ym4,  ym26
2006
+    mova         ym6,  ym27
2007
+    vpermi2d     ym4,   ym9, ym11
2008
+    vpermi2d     ym6,   ym9, ym11
2009
+
2010
+    mova         ym7, ym26
2011
+    mova         ym8, ym27
2012
+    vpermi2d     ym7, ym10, ym12
2013
+    vpermi2d     ym8, ym10, ym12
2014
+
2015
+    vpermq       ym4, ym4,  q3120
2016
+    vpermq       ym6, ym6,  q3120
2017
+    vpermq       ym7, ym7,  q3120
2018
+    vpermq       ym8, ym8,  q3120
2019
+
2020
+    movu          xm0, [r0 + 1 * 32]
2021
+    vinserti128   ym0, ym0, [r0 + 9 * 32], 1
2022
+    movu          xm1, [r0 + 3 * 32]
2023
+    vinserti128   ym1, ym1, [r0 + 11 * 32], 1
2024
+
2025
+    mova          ym9, ym2
2026
+    mova         ym10, ym3
2027
+    vpermi2w      ym9,  ym0, ym1
2028
+    vpermi2w     ym10,  ym0, ym1
2029
+
2030
+    movu          xm0, [r0 + 5 * 32]
2031
+    vinserti128   ym0, ym0, [r0 + 13 * 32], 1
2032
+    movu          xm1, [r0 + 7 * 32]
2033
+    vinserti128   ym1, ym1, [r0 + 15 * 32], 1
2034
+
2035
+    mova         ym11,  ym2
2036
+    mova         ym12,  ym3
2037
+    vpermi2w     ym11,  ym0,  ym1
2038
+    vpermi2w     ym12,  ym0,  ym1
2039
+
2040
+    mova         ym28,  ym26
2041
+    mova         ym29,  ym27
2042
+    vpermi2d     ym28,  ym9, ym11
2043
+    vpermi2d     ym29,  ym9, ym11
2044
+
2045
+    mova         ym30, ym26
2046
+    mova         ym31, ym27
2047
+    vpermi2d     ym30, ym10, ym12
2048
+    vpermi2d     ym31, ym10, ym12
2049
+
2050
+    vpermq       ym28, ym28,  q3120
2051
+    vpermq       ym29, ym29,  q3120
2052
+    vpermq       ym30, ym30,  q3120
2053
+    vpermq       ym31, ym31,  q3120
2054
+
2055
+    vinserti64x4    m4,          m4,      ym4, 1
2056
+    vinserti64x4    m6,          m6,      ym6, 1
2057
+    vinserti64x4    m7,          m7,      ym7, 1
2058
+    vinserti64x4    m8,          m8,      ym8, 1
2059
+    vinserti64x4    m28,        m28,      ym28, 1
2060
+    vinserti64x4    m29,        m29,      ym29, 1
2061
+    vinserti64x4    m30,        m30,      ym30, 1
2062
+    vinserti64x4    m31,        m31,      ym31, 1
2063
+
2064
+    IDCT16_AVX512_PASS1      0, 18, 19
2065
+    IDCT16_AVX512_PASS1      2, 20, 21
2066
+
2067
+    add             r0, 16
2068
+
2069
+    movu          xm0, [r0 + 0 * 32]
2070
+    vinserti128   ym0, ym0, [r0 + 8 * 32], 1
2071
+    movu          xm1, [r0 + 2 * 32]
2072
+    vinserti128   ym1, ym1, [r0 + 10 * 32], 1
2073
+
2074
+    mova          ym9, ym2
2075
+    mova         ym10, ym3
2076
+    vpermi2w      ym9, ym0, ym1
2077
+    vpermi2w     ym10, ym0, ym1
2078
+
2079
+    movu          xm0, [r0 + 4 * 32]
2080
+    vinserti128   ym0, ym0, [r0 + 12 * 32], 1
2081
+    movu          xm1, [r0 + 6 * 32]
2082
+    vinserti128   ym1, ym1, [r0 + 14 * 32], 1
2083
+
2084
+    mova         ym11, ym2
2085
+    mova         ym12, ym3
2086
+    vpermi2w     ym11, ym0,  ym1
2087
+    vpermi2w     ym12, ym0,  ym1
2088
+
2089
+    mova         ym4,  ym26
2090
+    mova         ym6,  ym27
2091
+    vpermi2d     ym4,   ym9, ym11
2092
+    vpermi2d     ym6,   ym9, ym11
2093
+
2094
+    mova         ym7, ym26
2095
+    mova         ym8, ym27
2096
+    vpermi2d     ym7, ym10, ym12
2097
+    vpermi2d     ym8, ym10, ym12
2098
+
2099
+    vpermq       ym4, ym4,  q3120
2100
+    vpermq       ym6, ym6,  q3120
2101
+    vpermq       ym7, ym7,  q3120
2102
+    vpermq       ym8, ym8,  q3120
2103
+
2104
+    movu          xm0, [r0 + 1 * 32]
2105
+    vinserti128   ym0, ym0, [r0 + 9 * 32], 1
2106
+    movu          xm1, [r0 + 3 * 32]
2107
+    vinserti128   ym1, ym1, [r0 + 11 * 32], 1
2108
+
2109
+    mova          ym9, ym2
2110
+    mova         ym10, ym3
2111
+    vpermi2w      ym9,  ym0, ym1
2112
+    vpermi2w     ym10,  ym0, ym1
2113
+
2114
+    movu          xm0, [r0 + 5 * 32]
2115
+    vinserti128   ym0, ym0, [r0 + 13 * 32], 1
2116
+    movu          xm1, [r0 + 7 * 32]
2117
+    vinserti128   ym1, ym1, [r0 + 15 * 32], 1
2118
+
2119
+    mova         ym11,  ym2
2120
+    mova         ym12,  ym3
2121
+    vpermi2w     ym11,  ym0,  ym1
2122
+    vpermi2w     ym12,  ym0,  ym1
2123
+
2124
+    mova         ym28,  ym26
2125
+    mova         ym29,  ym27
2126
+    vpermi2d     ym28,  ym9, ym11
2127
+    vpermi2d     ym29,  ym9, ym11
2128
+
2129
+    mova         ym30, ym26
2130
+    mova         ym31, ym27
2131
+    vpermi2d     ym30, ym10, ym12
2132
+    vpermi2d     ym31, ym10, ym12
2133
+
2134
+    vpermq       ym28, ym28,  q3120
2135
+    vpermq       ym29, ym29,  q3120
2136
+    vpermq       ym30, ym30,  q3120
2137
+    vpermq       ym31, ym31,  q3120
2138
+
2139
+    vinserti64x4    m4,          m4,      ym4, 1
2140
+    vinserti64x4    m6,          m6,      ym6, 1
2141
+    vinserti64x4    m7,          m7,      ym7, 1
2142
+    vinserti64x4    m8,          m8,      ym8, 1
2143
+    vinserti64x4    m28,        m28,      ym28, 1
2144
+    vinserti64x4    m29,        m29,      ym29, 1
2145
+    vinserti64x4    m30,        m30,      ym30, 1
2146
+    vinserti64x4    m31,        m31,      ym31, 1
2147
+
2148
+
2149
+    IDCT16_AVX512_PASS1      0, 22, 23
2150
+    IDCT16_AVX512_PASS1      2, 24, 25
2151
+
2152
+    mova       m26,    [idct16_AVX512_shuff2]
2153
+    mova       m27,    [idct16_AVX512_shuff3]
2154
+    vpermi2q   m26,    m18, m22
2155
+    vpermi2q   m27,    m18, m22
2156
+    mova       m18,    [idct16_AVX512_shuff2]
2157
+    mova       m22,    [idct16_AVX512_shuff3]
2158
+    vpermi2q   m18,    m20, m24
2159
+    vpermi2q   m22,    m20, m24
2160
+    mova       m20,    [idct16_AVX512_shuff4]
2161
+    mova       m24,    [idct16_AVX512_shuff5]
2162
+    vpermi2q   m20,    m21, m25
2163
+    vpermi2q   m24,    m21, m25
2164
+    mova       m21,    [idct16_AVX512_shuff4]
2165
+    mova       m25,    [idct16_AVX512_shuff5]
2166
+    vpermi2q   m21,    m19, m23
2167
+    vpermi2q   m25,    m19, m23
2168
+
2169
+    lea             r5, [tab_idct16_2]
2170
+    lea             r6, [tab_idct16_1]
2171
+
2172
+    vbroadcasti64x2  m7,  [r5]
2173
+    vbroadcasti64x2  m8,  [r5 + 16]
2174
+    vbroadcasti64x2  m9,  [r5 + 32]
2175
+    vbroadcasti64x2  m10, [r5 + 48]
2176
+    vbroadcasti64x2  m11, [r5 + 64]
2177
+    vbroadcasti64x2  m12, [r5 + 80]
2178
+    vbroadcasti64x2  m13, [r5 + 96]
2179
+
2180
+    vbroadcasti64x2  m16, [r6]
2181
+    vbroadcasti64x2  m17, [r6 + 16]
2182
+    vbroadcasti64x2  m19, [r6 + 32]
2183
+    vbroadcasti64x2  m23, [r6 + 48]
2184
+    vbroadcasti64x2  m28, [r6 + 64]
2185
+    vbroadcasti64x2  m29, [r6 + 80]
2186
+    vbroadcasti64x2  m30, [r6 + 96]
2187
+
2188
+
2189
+    IDCT16_AVX512_PASS2 26, 27
2190
+     mova            [r1], xm5
2191
+     mova            [r1 + 16], xm2
2192
+     vextracti128    [r1 + r2], ym5, 1
2193
+     vextracti128    [r1 + r2 + 16], ym2, 1
2194
+     vextracti64x4   ym14, m5, 1
2195
+     vextracti64x4   ym31, m2, 1
2196
+     lea             r1, [r1 + 2 * r2]
2197
+     mova            [r1], xm14
2198
+     mova            [r1 + 16], xm31
2199
+     vextracti128    [r1 + r2], ym14, 1
2200
+     vextracti128    [r1 + r2 + 16], ym31, 1
2201
+
2202
+    IDCT16_AVX512_PASS2 18, 22
2203
+     lea             r1, [r1 + 2 * r2]
2204
+     mova            [r1], xm5
2205
+     mova            [r1 + 16], xm2
2206
+     vextracti128    [r1 + r2], ym5, 1
2207
+     vextracti128    [r1 + r2 + 16], ym2, 1
2208
+     vextracti64x4   ym14, m5, 1
2209
+     vextracti64x4   ym31, m2, 1
2210
+     lea             r1, [r1 + 2 * r2]
2211
+     mova            [r1], xm14
2212
+     mova            [r1 + 16], xm31
2213
+     vextracti128    [r1 + r2], ym14, 1
2214
+     vextracti128    [r1 + r2 + 16], ym31, 1
2215
+
2216
+    IDCT16_AVX512_PASS2 20, 24
2217
+     lea             r1, [r1 + 2 * r2]
2218
+     mova            [r1], xm5
2219
+     mova            [r1 + 16], xm2
2220
+     vextracti128    [r1 + r2], ym5, 1
2221
+     vextracti128    [r1 + r2 + 16], ym2, 1
2222
+     vextracti64x4   ym14, m5, 1
2223
+     vextracti64x4   ym31, m2, 1
2224
+     lea             r1, [r1 + 2 * r2]
2225
+     mova            [r1], xm14
2226
+     mova            [r1 + 16], xm31
2227
+     vextracti128    [r1 + r2], ym14, 1
2228
+     vextracti128    [r1 + r2 + 16], ym31, 1
2229
+
2230
+    IDCT16_AVX512_PASS2 21, 25
2231
+     lea             r1, [r1 + 2 * r2]
2232
+     mova            [r1], xm5
2233
+     mova            [r1 + 16], xm2
2234
+     vextracti128    [r1 + r2], ym5, 1
2235
+     vextracti128    [r1 + r2 + 16], ym2, 1
2236
+     vextracti64x4   ym14, m5, 1
2237
+     vextracti64x4   ym31, m2, 1
2238
+     lea             r1, [r1 + 2 * r2]
2239
+     mova            [r1], xm14
2240
+     mova            [r1 + 16], xm31
2241
+     vextracti128    [r1 + r2], ym14, 1
2242
+     vextracti128    [r1 + r2 + 16], ym31, 1
2243
+    RET
2244
+
2245
+
2246
+
2247
 %macro IDCT32_PASS1 1
2248
     vbroadcasti128  m3, [tab_idct32_1 + %1 * 32]
2249
     vbroadcasti128  m13, [tab_idct32_1 + %1 * 32 + 16]
2250
@@ -3630,6 +5746,601 @@
2251
     jnz             .pass2
2252
     RET
2253
 
2254
+
2255
+%macro IDCT32_AVX512_PASS1 5
2256
+    pmaddwd         m9,  m8, m%4
2257
+    pmaddwd         m10, m7, m%5
2258
+
2259
+    paddd            m9,  m10
2260
+    vpsrldq          m0,   m9, 8
2261
+    paddd            m9,   m0
2262
+    vpsrldq          m0,   m9, 4
2263
+    paddd            m9,   m0
2264
+
2265
+    pmaddwd         m10, m4, m%4
2266
+    pmaddwd         m11, m1, m%5
2267
+
2268
+    paddd           m10,   m11
2269
+    vpsrldq          m0,   m10, 8
2270
+    paddd           m10,   m0
2271
+    vpslldq          m0,   m10, 4
2272
+    paddd           m10,    m0
2273
+
2274
+    vmovdqu32       m9 {k3}, m10
2275
+
2276
+    mova            m6,  [tab_idct32_AVX512_5 + %1 * 64]
2277
+    mova            m5,  [tab_idct32_AVX512_5 + %1 * 64 + 64]
2278
+
2279
+    pmaddwd         m10, m8, m6
2280
+    pmaddwd         m11, m7, m5
2281
+
2282
+    paddd           m10,  m11
2283
+    vpslldq         m0,   m10, 8
2284
+    paddd           m10,   m0
2285
+    vpsrldq          m0,  m10, 4
2286
+    paddd           m10,   m0
2287
+
2288
+    pmaddwd         m11, m4, m6
2289
+    pmaddwd         m12, m1, m5
2290
+
2291
+    paddd           m11,   m12
2292
+    vpslldq          m0,   m11, 8
2293
+    paddd           m11,    m0
2294
+    vpslldq          m0,   m11, 4
2295
+    paddd           m11,    m0
2296
+
2297
+    vmovdqu32        m10  {k4},  m11
2298
+    vmovdqu32        m9  {k2}, m10
2299
+
2300
+    pmaddwd         m10, m3, m%2
2301
+    pmaddwd         m11, m14, m%2
2302
+
2303
+    vpsrldq          m0,   m10, 4
2304
+    paddd           m10,    m0
2305
+    vpslldq          m5,   m11, 4
2306
+    paddd           m11,    m5
2307
+    vmovdqu32       m10   {k1}, m11
2308
+
2309
+    vpsrldq         m0,    m10, 8
2310
+    paddd           m10,    m0
2311
+
2312
+    pmaddwd         m11, m2, m%3
2313
+    pmaddwd         m12, m13, m%3
2314
+
2315
+    vpsrldq          m0,   m11, 4
2316
+    paddd           m11,    m0
2317
+    vpslldq          m5,   m12, 4
2318
+    paddd           m12,    m5
2319
+    vmovdqu32       m11   {k1}, m12
2320
+
2321
+    vpsrldq          m0,   m11, 8
2322
+    paddd           m11,    m0
2323
+
2324
+    paddd           m12, m10, m11
2325
+    psubd           m10, m11
2326
+
2327
+    punpcklqdq      m12, m10
2328
+    paddd           m10, m9, m12
2329
+    paddd           m10, m15
2330
+    psrad           m10, IDCT_SHIFT1
2331
+
2332
+    psubd           m12, m9
2333
+    paddd           m12, m15
2334
+    psrad           m12, IDCT_SHIFT1
2335
+
2336
+    packssdw        m10, m12
2337
+    vextracti128    xm12, m10, 1
2338
+    vextracti64x4   ym5,  m10, 1
2339
+    vextracti128    xm0, ym5, 1
2340
+
2341
+    movd            [r3 + %1 * 64], xm10
2342
+    movd            [r3 + 32 + %1 * 64], xm12
2343
+    pextrd          [r4 - %1 * 64], xm10, 1
2344
+    pextrd          [r4+ 32 - %1 * 64], xm12, 1
2345
+    pextrd          [r3 + 16 * 64 + %1 *64], xm10, 3
2346
+    pextrd          [r3 + 16 * 64 + 32 + %1 * 64], xm12, 3
2347
+    pextrd          [r4 + 16 * 64 - %1 * 64], xm10, 2
2348
+    pextrd          [r4 + 16 * 64 + 32 - %1 * 64], xm12, 2
2349
+
2350
+    movd            [r3 + (%1 + 1) * 64], xm5
2351
+    movd            [r3 + 32 + (%1 + 1) * 64], xm0
2352
+    pextrd          [r4 - (%1 + 1) * 64], xm5, 1
2353
+    pextrd          [r4+ 32 - (%1 + 1) * 64], xm0, 1
2354
+    pextrd          [r3 + 16 * 64 + (%1 + 1) * 64], xm5, 3
2355
+    pextrd          [r3 + 16 * 64 + 32 + (%1 + 1) * 64], xm0, 3
2356
+    pextrd          [r4 + 16 * 64 - (%1 + 1) * 64], xm5, 2
2357
+    pextrd          [r4 + 16 * 64 + 32 - (%1 + 1) * 64], xm0, 2
2358
+%endmacro
2359
+
2360
+%macro IDCT32_AVX512_PASS2 0
2361
+    pmaddwd         m2, m0, m7
2362
+    pmaddwd         m3, m0, m8
2363
+
2364
+    vpsrldq         m24,   m2, 4
2365
+    paddd            m2,  m24
2366
+    vpslldq         m25,   m3, 4
2367
+    paddd            m3,  m25
2368
+    vmovdqu32        m2   {k1}, m3
2369
+
2370
+    pmaddwd         m3, m0, m9
2371
+    pmaddwd         m4, m0, m10
2372
+
2373
+    vpsrldq         m24,   m3, 4
2374
+    paddd            m3,  m24
2375
+    vpslldq         m25,   m4, 4
2376
+    paddd            m4,  m25
2377
+    vmovdqu32        m3   {k1}, m4
2378
+
2379
+    vpsrldq         m24,   m2, 8
2380
+    paddd            m2,  m24
2381
+    vpslldq         m25,   m3, 8
2382
+    paddd            m3,  m25
2383
+    vmovdqu32        m2   {k2}, m3
2384
+
2385
+    pmaddwd         m3, m0, m11
2386
+    pmaddwd         m4, m0, m12
2387
+
2388
+    vpsrldq         m24,   m3, 4
2389
+    paddd            m3,  m24
2390
+    vpslldq         m25,   m4, 4
2391
+    paddd            m4,  m25
2392
+    vmovdqu32        m3   {k1}, m4
2393
+
2394
+    pmaddwd         m4, m0, m13
2395
+    pmaddwd         m5, m0, m14
2396
+
2397
+    vpsrldq         m24,   m4, 4
2398
+    paddd            m4,  m24
2399
+    vpslldq         m25,   m5, 4
2400
+    paddd            m5,  m25
2401
+    vmovdqu32        m4   {k1}, m5
2402
+
2403
+    vpsrldq         m24,   m3, 8
2404
+    paddd            m3,  m24
2405
+    vpslldq         m25,   m4, 8
2406
+    paddd            m4,  m25
2407
+    vmovdqu32        m3   {k2}, m4
2408
+
2409
+    mova           m24,        [idct16_AVX512_shuff3]
2410
+    mova           m25,        [idct16_AVX512_shuff2]
2411
+    vpermi2q       m24,        m2,       m3
2412
+    vpermi2q       m25,        m2,       m3
2413
+    paddd           m2, m25, m24
2414
+
2415
+    pmaddwd         m3, m0, m16
2416
+    pmaddwd         m4, m0, m17
2417
+
2418
+    vpsrldq         m24,   m3, 4
2419
+    paddd            m3,  m24
2420
+    vpslldq         m25,   m4, 4
2421
+    paddd            m4,  m25
2422
+    vmovdqu32        m3   {k1}, m4
2423
+
2424
+    pmaddwd         m4, m0, m18
2425
+    pmaddwd         m5, m0, m19
2426
+
2427
+    vpsrldq         m24,   m4, 4
2428
+    paddd            m4,  m24
2429
+    vpslldq         m25,   m5, 4
2430
+    paddd            m5,  m25
2431
+    vmovdqu32        m4   {k1}, m5
2432
+
2433
+    vpsrldq         m24,   m3, 8
2434
+    paddd            m3,  m24
2435
+    vpslldq         m25,   m4, 8
2436
+    paddd            m4,  m25
2437
+    vmovdqu32        m3   {k2}, m4
2438
+
2439
+    pmaddwd         m4, m0, m20
2440
+    pmaddwd         m5, m0, m21
2441
+
2442
+    vpsrldq         m24,   m4, 4
2443
+    paddd            m4,  m24
2444
+    vpslldq         m25,   m5, 4
2445
+    paddd            m5,  m25
2446
+    vmovdqu32        m4   {k1}, m5
2447
+
2448
+    pmaddwd         m5, m0, m22
2449
+    pmaddwd         m0,     m23
2450
+
2451
+    vpsrldq         m24,   m5, 4
2452
+    paddd            m5,  m24
2453
+    vpslldq         m25,   m0, 4
2454
+    paddd            m0,  m25
2455
+    vmovdqu32        m5   {k1}, m0
2456
+
2457
+    vpsrldq         m24,   m4, 8
2458
+    paddd            m4,  m24
2459
+    vpslldq         m25,   m5, 8
2460
+    paddd            m5,  m25
2461
+    vmovdqu32        m4   {k2}, m5
2462
+
2463
+    mova           m24,        [idct16_AVX512_shuff3]
2464
+    mova           m25,        [idct16_AVX512_shuff2]
2465
+    vpermi2q       m24,        m3,       m4
2466
+    vpermi2q       m25,        m3,       m4
2467
+    paddd           m3, m25, m24
2468
+
2469
+    pmaddwd         m4, m1, m26
2470
+    pmaddwd         m0, m1, m27
2471
+
2472
+    vpsrldq         m24,   m4, 4
2473
+    paddd            m4,  m24
2474
+    vpslldq         m25,   m0, 4
2475
+    paddd            m0,  m25
2476
+    vmovdqu32        m4   {k1}, m0
2477
+
2478
+    pmaddwd         m5, m1, m28
2479
+    pmaddwd         m0, m1, m29
2480
+
2481
+    vpsrldq         m24,   m5, 4
2482
+    paddd            m5,  m24
2483
+    vpslldq         m25,   m0, 4
2484
+    paddd            m0,  m25
2485
+    vmovdqu32        m5   {k1}, m0
2486
+
2487
+
2488
+    vpsrldq         m24,   m4, 8
2489
+    paddd            m4,  m24
2490
+    vpslldq         m25,   m5, 8
2491
+    paddd            m5,  m25
2492
+    vmovdqu32        m4   {k2}, m5
2493
+
2494
+    pmaddwd         m5, m1, m30
2495
+    pmaddwd         m0, m1, m31
2496
+
2497
+    vpsrldq         m24,   m5, 4
2498
+    paddd            m5,  m24
2499
+    vpslldq         m25,   m0, 4
2500
+    paddd            m0,  m25
2501
+    vmovdqu32        m5   {k1}, m0
2502
+
2503
+    pmaddwd         m6, m1, [tab_idct32_AVX512_4 + 6 * mmsize]
2504
+    pmaddwd         m0, m1, [tab_idct32_AVX512_4 + 7 * mmsize]
2505
+
2506
+    vpsrldq         m24,   m6, 4
2507
+    paddd            m6,  m24
2508
+    vpslldq         m25,   m0, 4
2509
+    paddd            m0,  m25
2510
+    vmovdqu32        m6   {k1}, m0
2511
+
2512
+    vpsrldq         m24,   m5, 8
2513
+    paddd            m5,  m24
2514
+    vpslldq         m25,   m6, 8
2515
+    paddd            m6,  m25
2516
+    vmovdqu32        m5   {k2}, m6
2517
+
2518
+    mova           m24,        [idct16_AVX512_shuff3]
2519
+    mova           m25,        [idct16_AVX512_shuff2]
2520
+    vpermi2q       m24,        m4,       m5
2521
+    vpermi2q       m25,        m4,       m5
2522
+    paddd           m4, m25, m24
2523
+
2524
+    pmaddwd         m5, m1, [tab_idct32_AVX512_4 + 8 * mmsize]
2525
+    pmaddwd         m0, m1, [tab_idct32_AVX512_4 + 9 * mmsize]
2526
+
2527
+    vpsrldq         m24,   m5, 4
2528
+    paddd            m5,  m24
2529
+    vpslldq         m25,   m0, 4
2530
+    paddd            m0,  m25
2531
+    vmovdqu32        m5   {k1}, m0
2532
+
2533
+    pmaddwd         m6, m1, [tab_idct32_AVX512_4 + 10 * mmsize]
2534
+    pmaddwd         m0, m1, [tab_idct32_AVX512_4 + 11 * mmsize]
2535
+
2536
+    vpsrldq         m24,   m6, 4
2537
+    paddd            m6,  m24
2538
+    vpslldq         m25,   m0, 4
2539
+    paddd            m0,  m25
2540
+    vmovdqu32        m6   {k1}, m0
2541
+
2542
+    vpsrldq         m24,   m5, 8
2543
+    paddd            m5,  m24
2544
+    vpslldq         m25,   m6, 8
2545
+    paddd            m6,  m25
2546
+    vmovdqu32        m5   {k2}, m6
2547
+
2548
+    pmaddwd         m6, m1, [tab_idct32_AVX512_4 + 12 * mmsize]
2549
+    pmaddwd         m0, m1, [tab_idct32_AVX512_4 + 13 * mmsize]
2550
+
2551
+    vpsrldq         m24,   m6, 4
2552
+    paddd            m6,  m24
2553
+    vpslldq         m25,   m0, 4
2554
+    paddd            m0,  m25
2555
+    vmovdqu32        m6   {k1}, m0
2556
+
2557
+    pmaddwd         m0, m1, [tab_idct32_AVX512_4 + 14 * mmsize]
2558
+    pmaddwd         m1,     [tab_idct32_AVX512_4 + 15 * mmsize]
2559
+
2560
+    vpsrldq         m24,   m0, 4
2561
+    paddd            m0,  m24
2562
+    vpslldq         m25,   m1, 4
2563
+    paddd            m1,  m25
2564
+    vmovdqu32        m0   {k1}, m1
2565
+
2566
+    vpsrldq         m24,   m6, 8
2567
+    paddd            m6,  m24
2568
+    vpslldq         m25,   m0, 8
2569
+    paddd            m0,  m25
2570
+    vmovdqu32        m6   {k2}, m0
2571
+
2572
+    mova           m24,        [idct16_AVX512_shuff3]
2573
+    mova           m25,        [idct16_AVX512_shuff2]
2574
+    vpermi2q       m24,        m5,       m6
2575
+    vpermi2q       m25,        m5,       m6
2576
+    paddd           m5, m25, m24
2577
+
2578
+    paddd           m6, m2, m4
2579
+    paddd           m6, m15
2580
+    psrad           m6, IDCT_SHIFT2
2581
+
2582
+    psubd           m2, m4
2583
+    paddd           m2, m15
2584
+    psrad           m2, IDCT_SHIFT2
2585
+
2586
+    paddd           m4, m3, m5
2587
+    paddd           m4, m15
2588
+    psrad           m4, IDCT_SHIFT2
2589
+
2590
+    psubd           m3, m5
2591
+    paddd           m3, m15
2592
+    psrad           m3, IDCT_SHIFT2
2593
+
2594
+    packssdw        m6, m4
2595
+    packssdw        m2, m3
2596
+
2597
+    vpermq          m6, m6, 0xD8
2598
+    vpermq          m2, m2, 0x8D
2599
+    pshufb          m2, [idct16_AVX512_shuff6]
2600
+%endmacro
2601
+
2602
+;-------------------------------------------------------------------
2603
+; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride)
2604
+;-------------------------------------------------------------------
2605
+
2606
+INIT_ZMM avx512
2607
+cglobal idct32, 3, 8, 32, 0-32*64
2608
+
2609
+%define             IDCT_SHIFT1         7
2610
+
2611
+    vbroadcasti128  m15, [pd_64]
2612
+
2613
+    mov             r3,  rsp
2614
+    lea             r4,  [r3 + 15 * 64]
2615
+    mov             r5d, 8
2616
+    mov             r7d, 0xAAAA
2617
+    kmovd            k1, r7d
2618
+    mov             r7d, 0xCCCC
2619
+    kmovd            k2, r7d
2620
+    mov             r7d, 0x2222
2621
+    kmovd            k3, r7d
2622
+    mov             r7d, 0x8888
2623
+    kmovd            k4, r7d
2624
+
2625
+
2626
+    mova            m16, [tab_idct32_AVX512_2 + 0 * 64]
2627
+    mova            m17, [tab_idct32_AVX512_2 + 1 * 64]
2628
+    mova            m18, [tab_idct32_AVX512_2 + 2 * 64]
2629
+    mova            m19, [tab_idct32_AVX512_2 + 3 * 64]
2630
+
2631
+    mova            m20, [tab_idct32_AVX512_3 + 0 * 64]
2632
+    mova            m21, [tab_idct32_AVX512_3 + 1 * 64]
2633
+    mova            m22, [tab_idct32_AVX512_3 + 2 * 64]
2634
+    mova            m23, [tab_idct32_AVX512_3 + 3 * 64]
2635
+
2636
+    mova            m24, [tab_idct32_AVX512_1 + 0 * 64]
2637
+    mova            m25, [tab_idct32_AVX512_1 + 1 * 64]
2638
+    mova            m26, [tab_idct32_AVX512_1 + 2 * 64]
2639
+    mova            m27, [tab_idct32_AVX512_1 + 3 * 64]
2640
+    mova            m28, [tab_idct32_AVX512_1 + 4 * 64]
2641
+    mova            m29, [tab_idct32_AVX512_1 + 5 * 64]
2642
+    mova            m30, [tab_idct32_AVX512_1 + 6 * 64]
2643
+    mova            m31, [tab_idct32_AVX512_1 + 7 * 64]
2644
+
2645
+.pass1:
2646
+    movq            xm0,    [r0 +  2 * 64]
2647
+    movq            xm1,    [r0 + 18 * 64]
2648
+    punpcklqdq      xm0,    xm0,  xm1
2649
+    movq            xm1,    [r0 +  0 * 64]
2650
+    movq            xm2,    [r0 + 16 * 64]
2651
+    punpcklqdq      xm1,    xm1,  xm2
2652
+    vinserti128     ym0,    ym0,  xm1, 1             ;[2 18 0 16]
2653
+
2654
+    movq            xm1,    [r0 + 1 * 64]
2655
+    movq            xm2,    [r0 + 9 * 64]
2656
+    punpcklqdq      xm1,    xm1,  xm2
2657
+    movq            xm2,    [r0 + 17 * 64]
2658
+    movq            xm3,    [r0 + 25 * 64]
2659
+    punpcklqdq      xm2,    xm2,  xm3
2660
+    vinserti128     ym1,    ym1,  xm2, 1             ;[1 9 17 25]
2661
+
2662
+    movq            xm2,    [r0 + 6 * 64]
2663
+    movq            xm3,    [r0 + 22 * 64]
2664
+    punpcklqdq      xm2,    xm2,  xm3
2665
+    movq            xm3,    [r0 + 4 * 64]
2666
+    movq            xm4,    [r0 + 20 * 64]
2667
+    punpcklqdq      xm3,    xm3,  xm4
2668
+    vinserti128     ym2,    ym2,  xm3, 1             ;[6 22 4 20]
2669
+
2670
+    movq            xm3,    [r0 + 3 * 64]
2671
+    movq            xm4,    [r0 + 11 * 64]
2672
+    punpcklqdq      xm3,    xm3,  xm4
2673
+    movq            xm4,    [r0 + 19 * 64]
2674
+    movq            xm5,    [r0 + 27 * 64]
2675
+    punpcklqdq      xm4,    xm4,  xm5
2676
+    vinserti128     ym3,    ym3,  xm4, 1             ;[3 11 19 27]
2677
+
2678
+    movq            xm4,    [r0 + 10 * 64]
2679
+    movq            xm5,    [r0 + 26 * 64]
2680
+    punpcklqdq      xm4,    xm4,  xm5
2681
+    movq            xm5,    [r0 + 8 * 64]
2682
+    movq            xm6,    [r0 + 24 * 64]
2683
+    punpcklqdq      xm5,    xm5,  xm6
2684
+    vinserti128     ym4,    ym4,  xm5, 1             ;[10 26 8 24]
2685
+
2686
+    movq            xm5,    [r0 + 5 * 64]
2687
+    movq            xm6,    [r0 + 13 * 64]
2688
+    punpcklqdq      xm5,    xm5,  xm6
2689
+    movq            xm6,    [r0 + 21 * 64]
2690
+    movq            xm7,    [r0 + 29 * 64]
2691
+    punpcklqdq      xm6,    xm6,  xm7
2692
+    vinserti128     ym5,    ym5,  xm6, 1             ;[5 13 21 29]
2693
+
2694
+    movq            xm6,    [r0 + 14 * 64]
2695
+    movq            xm7,    [r0 + 30 * 64]
2696
+    punpcklqdq      xm6,    xm6,  xm7
2697
+    movq            xm7,    [r0 + 12 * 64]
2698
+    movq            xm8,    [r0 + 28 * 64]
2699
+    punpcklqdq      xm7,    xm7,  xm8
2700
+    vinserti128     ym6,    ym6,  xm7, 1             ;[14 30 12 28]
2701
+
2702
+    movq            xm7,    [r0 + 7 * 64]
2703
+    movq            xm8,    [r0 + 15 * 64]
2704
+    punpcklqdq      xm7,    xm7,  xm8
2705
+    movq            xm8,    [r0 + 23 * 64]
2706
+    movq            xm9,    [r0 + 31 * 64]
2707
+    punpcklqdq      xm8,    xm8,  xm9
2708
+    vinserti128     ym7,    ym7,  xm8, 1             ;[7 15 23 31]
2709
+
2710
+    punpckhwd       ym8, ym0, ym2                  ;[18 22 16 20]
2711
+    punpcklwd       ym0, ym2                       ;[2 6 0 4]
2712
+
2713
+    punpckhwd       ym2, ym1, ym3                  ;[9 11 25 27]
2714
+    punpcklwd       ym1, ym3                       ;[1 3 17 19]
2715
+
2716
+    punpckhwd       ym3, ym4, ym6                  ;[26 30 24 28]
2717
+    punpcklwd       ym4, ym6                       ;[10 14 8 12]
2718
+
2719
+    punpckhwd       ym6, ym5, ym7                  ;[13 15 29 31]
2720
+    punpcklwd       ym5, ym7                       ;[5 7 21 23]
2721
+
2722
+    punpckhdq       ym7, ym0, ym4                  ;[22 62 102 142 23 63 103 143 02 42 82 122 03 43 83 123]
2723
+    punpckldq       ym0, ym4                       ;[20 60 100 140 21 61 101 141 00 40 80 120 01 41 81 121]
2724
+
2725
+    punpckhdq       ym4, ym8, ym3                  ;[182 222 262 302 183 223 263 303 162 202 242 282 163 203 243 283]
2726
+    punpckldq       ym8, ym3                       ;[180 220 260 300 181 221 261 301 160 200 240 280 161 201 241 281]
2727
+
2728
+    punpckhdq       ym3, ym1, ym5                  ;[12 32 52 72 13 33 53 73 172 192 212 232 173 193 213 233]
2729
+    punpckldq       ym1, ym5                       ;[10 30 50 70 11 31 51 71 170 190 210 230 171 191 211 231]
2730
+
2731
+    punpckhdq       ym5, ym2, ym6                  ;[92 112 132 152 93 113 133 153 252 272 292 312 253 273 293 313]
2732
+    punpckldq       ym2, ym6                       ;[90 110 130 150 91 111 131 151 250 270 290 310 251 271 291 311]
2733
+
2734
+    punpckhqdq      ym6, ym0, ym8                  ;[21 61 101 141 181 221 261 301 01 41 81 121 161 201 241 281]
2735
+    punpcklqdq      ym0, ym8                       ;[20 60 100 140 180 220 260 300 00 40 80 120 160 200 240 280]
2736
+
2737
+    punpckhqdq      ym8, ym7, ym4                  ;[23 63 103 143 183 223 263 303 03 43 83 123 163 203 243 283]
2738
+    punpcklqdq      ym7, ym4                       ;[22 62 102 142 182 222 262 302 02 42 82 122 162 202 242 282]
2739
+
2740
+    punpckhqdq      ym4, ym1, ym2                  ;[11 31 51 71 91 111 131 151 171 191 211 231 251 271 291 311]
2741
+    punpcklqdq      ym1, ym2                       ;[10 30 50 70 90 110 130 150 170 190 210 230 250 270 290 310]
2742
+
2743
+    punpckhqdq      ym2, ym3, ym5                  ;[13 33 53 73 93 113 133 153 173 193 213 233 253 273 293 313]
2744
+    punpcklqdq      ym3, ym5                       ;[12 32 52 72 92 112 132 152 172 192 212 232 252 272 292 312]
2745
+
2746
+    vinserti64x4    m7,        m7,      ym7, 1
2747
+    vinserti64x4    m8,        m8,      ym8, 1
2748
+    movu           m13,        [idct16_AVX512_shuff2]
2749
+    movu           m14,        [idct16_AVX512_shuff3]
2750
+    vpermi2q       m13,        m7,       m8
2751
+    vpermi2q       m14,        m7,       m8
2752
+
2753
+    vinserti64x4    m1,        m1,      ym1, 1
2754
+    vinserti64x4    m4,        m4,      ym4, 1
2755
+    movu            m7,        [idct16_AVX512_shuff3]
2756
+    movu            m8,        [idct16_AVX512_shuff2]
2757
+    vpermi2q        m7,        m1,       m4
2758
+    vpermi2q        m8,        m1,       m4
2759
+
2760
+    vinserti64x4    m3,        m3,      ym3, 1
2761
+    vinserti64x4    m2,        m2,      ym2, 1
2762
+    movu            m1,        [idct16_AVX512_shuff3]
2763
+    movu            m4,        [idct16_AVX512_shuff2]
2764
+    vpermi2q        m1,        m3,       m2
2765
+    vpermi2q        m4,        m3,       m2
2766
+
2767
+    vinserti64x4    m0,        m0,      ym0, 1
2768
+    vinserti64x4    m6,        m6,      ym6, 1
2769
+    movu            m2,        [idct16_AVX512_shuff2]
2770
+    movu            m3,        [idct16_AVX512_shuff3]
2771
+    vpermi2q        m2,        m0,       m6
2772
+    vpermi2q        m3,        m0,       m6
2773
+
2774
+
2775
+    IDCT32_AVX512_PASS1 0, 16, 20, 24, 25
2776
+    IDCT32_AVX512_PASS1 2, 17, 21, 26, 27
2777
+    IDCT32_AVX512_PASS1 4, 18, 22, 28, 29
2778
+    IDCT32_AVX512_PASS1 6, 19, 23, 30, 31
2779
+
2780
+    add             r0, 8
2781
+    add             r3, 4
2782
+    add             r4, 4
2783
+    dec             r5d
2784
+    jnz             .pass1
2785
+
2786
+%if BIT_DEPTH == 12
2787
+    %define         IDCT_SHIFT2        8
2788
+    vpbroadcastd    m15,                [pd_128]
2789
+%elif BIT_DEPTH == 10
2790
+    %define         IDCT_SHIFT2        10
2791
+    vpbroadcastd    m15,                [pd_512]
2792
+%elif BIT_DEPTH == 8
2793
+    %define         IDCT_SHIFT2        12
2794
+    vpbroadcastd    m15,                [pd_2048]
2795
+%else
2796
+    %error Unsupported BIT_DEPTH!
2797
+%endif
2798
+
2799
+    mov             r3,  rsp
2800
+    add             r2d, r2d
2801
+    mov             r4d, 16
2802
+    mov             r6d, 0xFFFF0000
2803
+    kmovd            k3, r6d
2804
+
2805
+    mova            m7,  [tab_idct32_AVX512_6]
2806
+    mova            m8,  [tab_idct32_AVX512_6 + 1 * mmsize]
2807
+    mova            m9,  [tab_idct32_AVX512_6 + 2 * mmsize]
2808
+    mova            m10, [tab_idct32_AVX512_6 + 3 * mmsize]
2809
+    mova            m11, [tab_idct32_AVX512_6 + 4 * mmsize]
2810
+    mova            m12, [tab_idct32_AVX512_6 + 5 * mmsize]
2811
+    mova            m13, [tab_idct32_AVX512_6 + 6 * mmsize]
2812
+    mova            m14, [tab_idct32_AVX512_6 + 7 * mmsize]
2813
+    mova            m16, [tab_idct32_AVX512_6 + 8 * mmsize]
2814
+    mova            m17, [tab_idct32_AVX512_6 + 9 * mmsize]
2815
+    mova            m18, [tab_idct32_AVX512_6 + 10 * mmsize]
2816
+    mova            m19, [tab_idct32_AVX512_6 + 11 * mmsize]
2817
+    mova            m20, [tab_idct32_AVX512_6 + 12 * mmsize]
2818
+    mova            m21, [tab_idct32_AVX512_6 + 13 * mmsize]
2819
+    mova            m22, [tab_idct32_AVX512_6 + 14 * mmsize]
2820
+    mova            m23, [tab_idct32_AVX512_6 + 15 * mmsize]
2821
+    mova            m26, [tab_idct32_AVX512_4]
2822
+    mova            m27, [tab_idct32_AVX512_4 + 1 * mmsize]
2823
+    mova            m28, [tab_idct32_AVX512_4 + 2 * mmsize]
2824
+    mova            m29, [tab_idct32_AVX512_4 + 3 * mmsize]
2825
+    mova            m30, [tab_idct32_AVX512_4 + 4 * mmsize]
2826
+    mova            m31, [tab_idct32_AVX512_4 + 5 * mmsize]
2827
+
2828
+.pass2:
2829
+    movu            ym0, [r3]
2830
+    movu            ym1, [r3 + 32]
2831
+    vmovdqu16        m0  {k3}, [r3 + 32]
2832
+    vmovdqu16        m1  {k3}, [r3 + 64]
2833
+
2834
+    IDCT32_AVX512_PASS2
2835
+    movu            [r1],      ym6
2836
+    movu            [r1 + 32], ym2
2837
+    vextracti64x4   ym24,       m6, 1
2838
+    vextracti64x4   ym25,       m2, 1
2839
+    add             r1,         r2
2840
+    movu            [r1 ],     ym24
2841
+    movu            [r1 + 32], ym25
2842
+
2843
+    add             r1, r2
2844
+    add             r3, 128
2845
+    dec             r4d
2846
+    jnz             .pass2
2847
+    RET
2848
+
2849
 ;-------------------------------------------------------
2850
 ; void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
2851
 ;-------------------------------------------------------
2852
@@ -3704,4 +6415,1227 @@
2853
     movhps          [r1 + 2 * r2], xm0
2854
     movhps          [r1 + r3], xm1
2855
     RET
2856
+
2857
+;static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
2858
+;{
2859
+;    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
2860
+;    const int scaleBits = SCALE_BITS - 2 * transformShift;
2861
+;    const uint32_t trSize = 1 << log2TrSize;
2862
+
2863
+;    for (int y = 0; y < MLS_CG_SIZE; y++)
2864
+;    {
2865
+;        for (int x = 0; x < MLS_CG_SIZE; x++)
2866
+;        {
2867
+;             int signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
2868
+;             costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
2869
+;             *totalUncodedCost += costUncoded[blkPos + x];
2870
+;             *totalRdCost += costUncoded[blkPos + x];
2871
+;        }
2872
+;        blkPos += trSize;
2873
+;    }
2874
+;}
2875
+
2876
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
2877
+; void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
2878
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
2879
+INIT_ZMM avx512
2880
+cglobal nonPsyRdoQuant4, 5, 5, 8
2881
+    mov            r4d,        r4m
2882
+    lea             r0,        [r0 + 2 * r4]
2883
+    lea             r4,        [4 * r4]
2884
+    lea             r1,        [r1 + 2 * r4]
2885
+%if BIT_DEPTH == 12
2886
+    mov             r4,        [tab_nonpsyRdo12]
2887
+%elif BIT_DEPTH == 10
2888
+    mov             r4,        [tab_nonpsyRdo10]
2889
+%elif BIT_DEPTH == 8
2890
+    mov             r4,        [tab_nonpsyRdo8]
2891
+%else
2892
+    %error Unsupported BIT_DEPTH!
2893
+ %endif
2894
+    movq           xm3,        r4
2895
+    movq           xm6,        [r2]
2896
+    movq           xm7,        [r3]
2897
+    vpxor           m4,        m4
2898
+    vpxor           m5,        m5
2899
+;Row 1, 2
2900
+    movu           xm0,        [r0]
2901
+    vpmovsxwq      m1,         xm0
2902
+    vcvtqq2pd      m2,         m1                              ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
2903
+    vfmadd213pd    m2,         m2,             m5              ; Multiply packed double-precision (64-bit) floating-point elements
2904
+    vcvtpd2qq      m1,         m2
2905
+    vpsllq         m1,         xm3                              ; costUncoded
2906
+    paddq          m4,         m1
2907
+    movu           [r1],       m1
2908
+    ;Row 3, 4
2909
+    movu           xm0,        [r0 + 16]
2910
+    vpmovsxwq      m1,         xm0
2911
+    vcvtqq2pd      m2,         m1
2912
+    vfmadd213pd    m2,         m2,             m5
2913
+    vcvtpd2qq      m1,         m2
2914
+    vpsllq         m1,         xm3                              ; costUncoded
2915
+    paddq          m4,         m1
2916
+    movu           [r1 + 64],  m1
2917
+    vextracti32x8  ym2,        m4,             1
2918
+    paddq          ym4,        ym2
2919
+    vextracti32x4  xm2,        m4,             1
2920
+    paddq          xm4,        xm2
2921
+    punpckhqdq     xm2,        xm4,            xm5
2922
+    paddq          xm4,        xm2
2923
+
2924
+    paddq          xm6,        xm4
2925
+    paddq          xm7,        xm4
2926
+
2927
+    movq           [r2],       xm6
2928
+    movq           [r3],       xm7
2929
+    RET
2930
+INIT_ZMM avx512
2931
+cglobal nonPsyRdoQuant8, 5, 5, 8
2932
+    mov            r4d,        r4m
2933
+    lea             r0,        [r0 + 2 * r4]
2934
+    lea             r4,        [4 * r4]
2935
+    lea             r1,        [r1 + 2 * r4]
2936
+%if BIT_DEPTH == 12
2937
+    mov             r4,        [tab_nonpsyRdo12 + 8]
2938
+%elif BIT_DEPTH == 10
2939
+    mov             r4,        [tab_nonpsyRdo10 + 8]
2940
+%elif BIT_DEPTH == 8
2941
+    mov             r4,        [tab_nonpsyRdo8 + 8]
2942
+%else
2943
+    %error Unsupported BIT_DEPTH!
2944
+ %endif
2945
+    movq           xm3,        r4
2946
+    movq           xm6,        [r2]
2947
+    movq           xm7,        [r3]
2948
+    vpxor           m4,        m4
2949
+    vpxor           m5,        m5
2950
+
2951
+;Row 1, 2
2952
+    movq           xm0,        [r0]
2953
+    pinsrq         xm0,        [r0 + mmsize/4], 1
2954
+    vpmovsxwq      m1,         xm0
2955
+    vcvtqq2pd      m2,         m1                              ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
2956
+    vfmadd213pd    m2,         m2,             m5              ; Multiply packed double-precision (64-bit) floating-point elements
2957
+    vcvtpd2qq      m1,         m2
2958
+    vpsllq         m1,         xm3                              ; costUncoded
2959
+    paddq          m4,         m1
2960
+    movu           [r1],       ym1
2961
+    vextracti32x8  [r1 + mmsize],  m1 ,        1
2962
+
2963
+    ;Row 3, 4
2964
+    movq           xm0,        [r0 + mmsize/2]
2965
+    pinsrq         xm0,        [r0 + 3 * mmsize/4],      1
2966
+    vpmovsxwq      m1,         xm0
2967
+    vcvtqq2pd      m2,         m1
2968
+    vfmadd213pd    m2,         m2,             m5
2969
+    vcvtpd2qq      m1,         m2
2970
+    vpsllq         m1,         xm3                              ; costUncoded
2971
+    paddq          m4,         m1
2972
+    movu           [r1 + 2 * mmsize], ym1
2973
+    vextracti32x8  [r1 + 3 * mmsize], m1 ,     1
2974
+
2975
+    vextracti32x8  ym2,        m4,             1
2976
+    paddq          ym4,        ym2
2977
+    vextracti32x4  xm2,        m4,             1
2978
+    paddq          xm4,        xm2
2979
+    punpckhqdq     xm2,        xm4,            xm5
2980
+    paddq          xm4,        xm2
2981
+
2982
+    paddq          xm6,        xm4
2983
+    paddq          xm7,        xm4
2984
+
2985
+    movq           [r2],       xm6
2986
+    movq           [r3],       xm7
2987
+    RET
2988
+INIT_ZMM avx512
2989
+cglobal nonPsyRdoQuant16, 5, 5, 8
2990
+    mov            r4d,        r4m
2991
+    lea             r0,        [r0 + 2 * r4]
2992
+    lea             r4,        [4 * r4]
2993
+    lea             r1,        [r1 + 2 * r4]
2994
+%if BIT_DEPTH == 12
2995
+    mov             r4,        [tab_nonpsyRdo12 + 16]
2996
+%elif BIT_DEPTH == 10
2997
+    mov             r4,        [tab_nonpsyRdo10 + 16]
2998
+%elif BIT_DEPTH == 8
2999
+    mov             r4,        [tab_nonpsyRdo8 + 16]
3000
+%else
3001
+    %error Unsupported BIT_DEPTH!
3002
+ %endif
3003
+    movq           xm3,        r4
3004
+    movq           xm6,        [r2]
3005
+    movq           xm7,        [r3]
3006
+    vpxor           m4,        m4
3007
+    vpxor           m5,        m5
3008
+
3009
+;Row 1, 2
3010
+    movq           xm0,        [r0]
3011
+    pinsrq         xm0,        [r0 + mmsize/2],       1
3012
+    vpmovsxwq      m1,         xm0
3013
+    vcvtqq2pd      m2,         m1                              ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
3014
+    vfmadd213pd    m2,         m2,             m5              ; Multiply packed double-precision (64-bit) floating-point elements
3015
+    vcvtpd2qq      m1,         m2
3016
+    vpsllq         m1,         xm3                              ; costUncoded
3017
+    paddq          m4,         m1
3018
+    movu           [r1],       ym1
3019
+    vextracti32x8  [r1 + 2 * mmsize],  m1,     1
3020
+
3021
+    ;Row 3, 4
3022
+    movq           xm0,        [r0 + mmsize]
3023
+    pinsrq         xm0,        [r0 + 3 * mmsize/2],      1
3024
+    vpmovsxwq      m1,         xm0
3025
+    vcvtqq2pd      m2,         m1
3026
+    vfmadd213pd    m2,         m2,             m5
3027
+    vcvtpd2qq      m1,         m2
3028
+    vpsllq         m1,         xm3                              ; costUncoded
3029
+    paddq          m4,         m1
3030
+    movu           [r1 + 4 * mmsize],         ym1
3031
+    vextracti32x8  [r1 + 6 * mmsize],          m1 ,            1
3032
+
3033
+    vextracti32x8  ym2,        m4,             1
3034
+    paddq          ym4,        ym2
3035
+    vextracti32x4  xm2,        m4,             1
3036
+    paddq          xm4,        xm2
3037
+    punpckhqdq     xm2,        xm4,            xm5
3038
+    paddq          xm4,        xm2
3039
+
3040
+    paddq          xm6,        xm4
3041
+    paddq          xm7,        xm4
3042
+
3043
+    movq           [r2],       xm6
3044
+    movq           [r3],       xm7
3045
+    RET
3046
+INIT_ZMM avx512
3047
+cglobal nonPsyRdoQuant32, 5, 5, 8
3048
+    mov            r4d,        r4m
3049
+    lea             r0,        [r0 + 2 * r4]
3050
+    lea             r4,        [4 * r4]
3051
+    lea             r1,        [r1 + 2 * r4]
3052
+%if BIT_DEPTH == 12
3053
+    mov             r4,        [tab_nonpsyRdo12 + 24]
3054
+%elif BIT_DEPTH == 10
3055
+    mov             r4,        [tab_nonpsyRdo10 + 24]
3056
+%elif BIT_DEPTH == 8
3057
+    mov             r4,        [tab_nonpsyRdo8 + 24]
3058
+%else
3059
+    %error Unsupported BIT_DEPTH!
3060
+ %endif
3061
+    movq           xm3,        r4
3062
+    movq           xm6,        [r2]
3063
+    movq           xm7,        [r3]
3064
+    vpxor           m4,        m4
3065
+    vpxor           m5,        m5
3066
+
3067
+;Row 1, 2
3068
+    movq           xm0,        [r0]
3069
+    pinsrq         xm0,        [r0 + mmsize],  1
3070
+    vpmovsxwq      m1,         xm0
3071
+    vcvtqq2pd      m2,         m1                              ; Convert packed 64-bit integers to packed double-precision (64-bit) floating-point elements
3072
+    vfmadd213pd    m2,         m2,             m5              ; Multiply packed double-precision (64-bit) floating-point elements
3073
+    vcvtpd2qq      m1,         m2
3074
+    vpsllq         m1,         xm3                              ; costUncoded
3075
+    paddq          m4,         m1
3076
+    movu           [r1],       ym1
3077
+    vextracti32x8  [r1 + 4 * mmsize],  m1,     1
3078
+
3079
+    ;Row 3, 4
3080
+    movq           xm0,        [r0 + 2 * mmsize]
3081
+    pinsrq         xm0,        [r0 + 3 * mmsize],      1
3082
+    vpmovsxwq      m1,         xm0
3083
+    vcvtqq2pd      m2,         m1
3084
+    vfmadd213pd    m2,         m2,             m5
3085
+    vcvtpd2qq      m1,         m2
3086
+    vpsllq         m1,         xm3                              ; costUncoded
3087
+    paddq          m4,         m1
3088
+    movu           [r1 + 8 * mmsize],         ym1
3089
+    vextracti32x8  [r1 + 12 * mmsize],         m1 ,            1
3090
+
3091
+    vextracti32x8  ym2,        m4,             1
3092
+    paddq          ym4,        ym2
3093
+    vextracti32x4  xm2,        m4,             1
3094
+    paddq          xm4,        xm2
3095
+    punpckhqdq     xm2,        xm4,            xm5
3096
+    paddq          xm4,        xm2
3097
+
3098
+    paddq          xm6,        xm4
3099
+    paddq          xm7,        xm4
3100
+
3101
+    movq           [r2],       xm6
3102
+    movq           [r3],       xm7
3103
+    RET
3104
+;static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t psyScale, uint32_t blkPos)
3105
+;{
3106
+;    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
3107
+;    const int scaleBits = SCALE_BITS - 2 * transformShift;
3108
+;    const uint32_t trSize = 1 << log2TrSize;
3109
+;    int max = X265_MAX(0, (2 * transformShift + 1));
3110
+;
3111
+;    for (int y = 0; y < MLS_CG_SIZE; y++)
3112
+;    {
3113
+;        for (int x = 0; x < MLS_CG_SIZE; x++)
3114
+;        {
3115
+;            int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
3116
+;            int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
3117
+;
3118
+;            costUncoded[blkPos + x] = static_cast<int64_t>((double)(signCoef * signCoef) << scaleBits);
3119
+;
3120
+;            /* when no residual coefficient is coded, predicted coef == recon coef */
3121
+;            costUncoded[blkPos + x] -= static_cast<int64_t>((psyScale * (predictedCoef)) >> max);
3122
+;
3123
+;            *totalUncodedCost += costUncoded[blkPos + x];
3124
+;            *totalRdCost += costUncoded[blkPos + x];
3125
+;        }
3126
+;        blkPos += trSize;
3127
+;    }
3128
+;}
3129
+
3130
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
3131
+; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
3132
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
3133
+INIT_ZMM avx512
3134
+cglobal psyRdoQuant4, 5, 9, 13
3135
+%if WIN64
3136
+    mov             r5,        r5m
3137
+%endif
3138
+    mov            r6d,        r6m
3139
+    vpbroadcastq   m12,        [r5]                              ; psyScale
3140
+    lea             r0,        [r0 + 2 * r6]
3141
+    lea             r1,        [r1 + 2 * r6]
3142
+    lea             r6,        [4 * r6]
3143
+    lea             r2,        [r2 + 2 * r6]
3144
+    movq           xm0,        [r3]
3145
+    movq           xm1,        [r4]
3146
+
3147
+%if BIT_DEPTH == 12
3148
+    mov            r5,         [tab_nonpsyRdo12]                 ; scaleBits
3149
+%elif BIT_DEPTH == 10
3150
+    mov            r5,         [tab_nonpsyRdo10]
3151
+%elif BIT_DEPTH == 8
3152
+    mov            r5,         [tab_nonpsyRdo8]
3153
+%else
3154
+    %error Unsupported BIT_DEPTH!
3155
+%endif
3156
+
3157
+    movq           xm2,        r5
3158
+    vpxor           m4,        m4
3159
+    vpxor           m3,        m3
3160
+
3161
+;Row 1, 2
3162
+    vpmovsxwq       m6,        [r0]
3163
+    vpmovsxwq       m7,        [r1]
3164
+    psubq           m7,        m6                              ; predictedCoef
3165
+
3166
+    vcvtqq2pd       m9,        m6
3167
+    vfmadd213pd     m9,        m9,             m3
3168
+    vcvtpd2qq       m8,        m9
3169
+    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits
3170
+
3171
+    vcvtqq2pd      m10,        m7
3172
+    vcvtqq2pd      m11,        m12
3173
+    vfmadd213pd    m10,        m11,            m3
3174
+    vcvtpd2qq       m9,        m10
3175
+    vpsraq          m9,        RDO_MAX_4                       ;(psyScale * predictedCoef) >> max
3176
+
3177
+    psubq           m8,        m9
3178
+    paddq           m4,        m8
3179
+    movu           [r2],       m8
3180
+
3181
+    ;Row 3, 4
3182
+    vpmovsxwq       m6,        [r0 + 16]
3183
+    vpmovsxwq       m7,        [r1 + 16]
3184
+    psubq           m7,        m6                              ; predictedCoef
3185
+
3186
+    vcvtqq2pd       m9,        m6
3187
+    vfmadd213pd     m9,        m9,             m3
3188
+    vcvtpd2qq       m8,        m9
3189
+    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits
3190
+
3191
+    vcvtqq2pd      m10,        m7
3192
+    vcvtqq2pd      m11,        m12
3193
+    vfmadd213pd    m10,        m11,             m3
3194
+    vcvtpd2qq       m9,        m10
3195
+    vpsraq          m9,        RDO_MAX_4                      ;(psyScale * predictedCoef) >> max
3196
+
3197
+    psubq           m8,         m9
3198
+    paddq           m4,         m8
3199
+    movu           [r2 + 64],   m8
3200
+
3201
+    vextracti32x8  ym2,         m4,            1
3202
+    paddq          ym4,        ym2
3203
+    vextracti32x4  xm2,         m4,            1
3204
+    paddq          xm4,        xm2
3205
+    punpckhqdq     xm2,        xm4,            xm3
3206
+    paddq          xm4,        xm2
3207
+
3208
+    paddq          xm0,        xm4
3209
+    paddq          xm1,        xm4
3210
+
3211
+    movq           [r3],       xm0
3212
+    movq           [r4],       xm1
3213
+    RET
3214
+
3215
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
3216
+; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
3217
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
3218
+INIT_ZMM avx512
3219
+cglobal psyRdoQuant8, 5, 9, 15
3220
+%if WIN64
3221
+    mov             r5,        r5m
3222
+%endif
3223
+    mov            r6d,        r6m
3224
+    vpbroadcastq   m12,        [r5]                              ; psyScale
3225
+    lea             r0,        [r0 + 2 * r6]
3226
+    lea             r1,        [r1 + 2 * r6]
3227
+    lea             r6,        [4 * r6]
3228
+    lea             r2,        [r2 + 2 * r6]
3229
+    movq           xm0,        [r3]
3230
+    movq           xm1,        [r4]
3231
+
3232
+%if BIT_DEPTH == 12
3233
+    mov            r5,         [tab_nonpsyRdo12 + 8]                 ; scaleBits
3234
+%elif BIT_DEPTH == 10
3235
+    mov            r5,         [tab_nonpsyRdo10 + 8]
3236
+%elif BIT_DEPTH == 8
3237
+    mov            r5,         [tab_nonpsyRdo8 + 8]
3238
+%else
3239
+    %error Unsupported BIT_DEPTH!
3240
+%endif
3241
+
3242
+    movq           xm2,        r5
3243
+    vpxor           m4,        m4
3244
+    vpxor           m3,        m3
3245
+
3246
+;Row 1, 2
3247
+    movq           xm13,       [r0]
3248
+    movq           xm14,       [r1]
3249
+    pinsrq         xm13,       [r0 + mmsize/4], 1
3250
+    pinsrq         xm14,       [r1 + mmsize/4], 1
3251
+    vpmovsxwq       m6,        xm13
3252
+    vpmovsxwq       m7,        xm14
3253
+    psubq           m7,        m6                              ; predictedCoef
3254
+
3255
+    vcvtqq2pd       m9,        m6
3256
+    vfmadd213pd     m9,        m9,             m3
3257
+    vcvtpd2qq       m8,        m9
3258
+    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits
3259
+
3260
+    vcvtqq2pd      m10,        m7
3261
+    vcvtqq2pd      m11,        m12
3262
+    vfmadd213pd    m10,        m11,            m3
3263
+    vcvtpd2qq       m9,        m10
3264
+    vpsraq          m9,        RDO_MAX_8                       ;(psyScale * predictedCoef) >> max
3265
+
3266
+    psubq           m8,        m9
3267
+    paddq           m4,        m8
3268
+    movu           [r2],       ym8
3269
+    vextracti32x8  [r2 + mmsize],  m8 ,        1
3270
+
3271
+    ;Row 3, 4
3272
+    movq           xm13,       [r0 + mmsize/2]
3273
+    movq           xm14,       [r1 + mmsize/2]
3274
+    pinsrq         xm13,       [r0 + 3 * mmsize/4],      1
3275
+    pinsrq         xm14,       [r1 + 3 * mmsize/4],      1
3276
+    vpmovsxwq       m6,        xm13
3277
+    vpmovsxwq       m7,        xm14
3278
+    psubq           m7,        m6                              ; predictedCoef
3279
+
3280
+    vcvtqq2pd       m9,        m6
3281
+    vfmadd213pd     m9,        m9,             m3
3282
+    vcvtpd2qq       m8,        m9
3283
+    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits
3284
+
3285
+    vcvtqq2pd      m10,        m7
3286
+    vcvtqq2pd      m11,        m12
3287
+    vfmadd213pd    m10,        m11,             m3
3288
+    vcvtpd2qq       m9,        m10
3289
+    vpsraq          m9,        RDO_MAX_8                      ;(psyScale * predictedCoef) >> max
3290
+
3291
+    psubq           m8,         m9
3292
+    paddq           m4,         m8
3293
+    movu           [r2 + 2 * mmsize],       ym8
3294
+    vextracti32x8  [r2 + 3 * mmsize],  m8 ,    1
3295
+
3296
+    vextracti32x8  ym2,         m4,            1
3297
+    paddq          ym4,        ym2
3298
+    vextracti32x4  xm2,         m4,            1
3299
+    paddq          xm4,        xm2
3300
+    punpckhqdq     xm2,        xm4,            xm3
3301
+    paddq          xm4,        xm2
3302
+
3303
+    paddq          xm0,        xm4
3304
+    paddq          xm1,        xm4
3305
+
3306
+    movq           [r3],       xm0
3307
+    movq           [r4],       xm1
3308
+    RET
3309
+
3310
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
3311
+; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
3312
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
3313
+INIT_ZMM avx512
3314
+; 16x16 TU variant: processes 4 coefficients from each of 4 rows starting at blkPos.
+; Row stride is mmsize/2 bytes in the int16_t coeff arrays (r0/r1) and 2*mmsize bytes
+; in the int64_t costUncoded array (r2).
+cglobal psyRdoQuant16, 5, 9, 15
3315
+%if WIN64
3316
+    mov             r5,        r5m                               ; Win64: 6th argument (psyScale pointer) lives on the stack
3317
+%endif
3318
+    mov            r6d,        r6m                               ; r6 = blkPos
3319
+    vpbroadcastq   m12,        [r5]                              ; psyScale
3320
+    lea             r0,        [r0 + 2 * r6]                     ; r0 += blkPos * sizeof(int16_t)
3321
+    lea             r1,        [r1 + 2 * r6]                     ; r1 += blkPos * sizeof(int16_t)
3322
+    lea             r6,        [4 * r6]
3323
+    lea             r2,        [r2 + 2 * r6]                     ; r2 += blkPos * sizeof(int64_t)  (r6 pre-scaled by 4 above)
3324
+    movq           xm0,        [r3]                              ; xm0 = *totalUncodedCost
3325
+    movq           xm1,        [r4]                              ; xm1 = *totalRdCost
3326
+
3327
+%if BIT_DEPTH == 12
3328
+    mov            r5,         [tab_nonpsyRdo12 + 16]                 ; scaleBits
3329
+%elif BIT_DEPTH == 10
3330
+    mov            r5,         [tab_nonpsyRdo10 + 16]
3331
+%elif BIT_DEPTH == 8
3332
+    mov            r5,         [tab_nonpsyRdo8 + 16]
3333
+%else
3334
+    %error Unsupported BIT_DEPTH!
3335
+%endif
3336
+
3337
+    movq           xm2,        r5                                ; xm2 = scaleBits shift count
3338
+    vpxor           m4,        m4                                ; m4 = running cost accumulator
3339
+    vpxor           m3,        m3                                ; m3 = 0.0, used as the FMA addend
3340
+
3341
+;Row 1, 2
3342
+    movq           xm13,       [r0]
3343
+    movq           xm14,       [r1]
3344
+    pinsrq         xm13,       [r0 + mmsize/2], 1
3345
+    pinsrq         xm14,       [r1 + mmsize/2], 1
3346
+    vpmovsxwq       m6,        xm13                             ; sign-extend int16 coeffs to int64 lanes
3347
+    vpmovsxwq       m7,        xm14
3348
+    psubq           m7,        m6                              ; predictedCoef
3349
+
3350
+    vcvtqq2pd       m9,        m6
3351
+    vfmadd213pd     m9,        m9,             m3              ; m9 = signCoef * signCoef (double precision, +0)
3352
+    vcvtpd2qq       m8,        m9
3353
+    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits
3354
+
3355
+    vcvtqq2pd      m10,        m7
3356
+    vcvtqq2pd      m11,        m12
3357
+    vfmadd213pd    m10,        m11,            m3              ; m10 = psyScale * predictedCoef (double precision, +0)
3358
+    vcvtpd2qq       m9,        m10
3359
+    vpsraq          m9,        RDO_MAX_16                      ;(psyScale * predictedCoef) >> max
3360
+
3361
+    psubq           m8,        m9                              ; per-coefficient cost
3362
+    paddq           m4,        m8
3363
+    movu           [r2],       ym8
3364
+    vextracti32x8  [r2 + 2 * mmsize],  m8 ,        1
3365
+
3366
+    ;Row 3, 4
3367
+    movq           xm13,       [r0 + mmsize]
3368
+    movq           xm14,       [r1 + mmsize]
3369
+    pinsrq         xm13,       [r0 + 3 * mmsize/2],      1
3370
+    pinsrq         xm14,       [r1 + 3 * mmsize/2],      1
3371
+    vpmovsxwq       m6,        xm13
3372
+    vpmovsxwq       m7,        xm14
3373
+    psubq           m7,        m6                              ; predictedCoef
3374
+
3375
+    vcvtqq2pd       m9,        m6
3376
+    vfmadd213pd     m9,        m9,             m3
3377
+    vcvtpd2qq       m8,        m9
3378
+    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits
3379
+
3380
+    vcvtqq2pd      m10,        m7
3381
+    vcvtqq2pd      m11,        m12
3382
+    vfmadd213pd    m10,        m11,             m3
3383
+    vcvtpd2qq       m9,        m10
3384
+    vpsraq          m9,        RDO_MAX_16                      ;(psyScale * predictedCoef) >> max
3385
+
3386
+    psubq           m8,         m9
3387
+    paddq           m4,         m8
3388
+    movu           [r2 + 4 * mmsize],       ym8
3389
+    vextracti32x8  [r2 + 6 * mmsize],  m8 ,    1
3390
+
3391
+; horizontal reduction: fold the 8 qword lanes of m4 down to one scalar sum
+    vextracti32x8  ym2,         m4,            1
3392
+    paddq          ym4,        ym2
3393
+    vextracti32x4  xm2,         m4,            1
3394
+    paddq          xm4,        xm2
3395
+    punpckhqdq     xm2,        xm4,            xm3
3396
+    paddq          xm4,        xm2
3397
+
3398
+    paddq          xm0,        xm4                             ; *totalUncodedCost += sum
3399
+    paddq          xm1,        xm4                             ; *totalRdCost += sum
3400
+
3401
+    movq           [r3],       xm0
3402
+    movq           [r4],       xm1
3403
+    RET
3404
+
3405
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
3406
+; void psyRdoQuant(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
3407
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
3408
+INIT_ZMM avx512
3409
+; 32x32 TU variant: processes 4 coefficients from each of 4 rows starting at blkPos.
+; Row stride is mmsize bytes in the int16_t coeff arrays (r0/r1) and 4*mmsize bytes
+; in the int64_t costUncoded array (r2).
+cglobal psyRdoQuant32, 5, 9, 15
3410
+%if WIN64
3411
+    mov             r5,        r5m                               ; Win64: 6th argument (psyScale pointer) lives on the stack
3412
+%endif
3413
+    mov            r6d,        r6m                               ; r6 = blkPos
3414
+    vpbroadcastq   m12,        [r5]                              ; psyScale
3415
+    lea             r0,        [r0 + 2 * r6]                     ; r0 += blkPos * sizeof(int16_t)
3416
+    lea             r1,        [r1 + 2 * r6]                     ; r1 += blkPos * sizeof(int16_t)
3417
+    lea             r6,        [4 * r6]
3418
+    lea             r2,        [r2 + 2 * r6]                     ; r2 += blkPos * sizeof(int64_t)  (r6 pre-scaled by 4 above)
3419
+    movq           xm0,        [r3]                              ; xm0 = *totalUncodedCost
3420
+    movq           xm1,        [r4]                              ; xm1 = *totalRdCost
3421
+
3422
+%if BIT_DEPTH == 12
3423
+    mov            r5,         [tab_nonpsyRdo12 + 24]                 ; scaleBits
3424
+%elif BIT_DEPTH == 10
3425
+    mov            r5,         [tab_nonpsyRdo10 + 24]
3426
+%elif BIT_DEPTH == 8
3427
+    mov            r5,         [tab_nonpsyRdo8 + 24]
3428
+%else
3429
+    %error Unsupported BIT_DEPTH!
3430
+%endif
3431
+
3432
+    movq           xm2,        r5                                ; xm2 = scaleBits shift count
3433
+    vpxor           m4,        m4                                ; m4 = running cost accumulator
3434
+    vpxor           m3,        m3                                ; m3 = 0.0, used as the FMA addend
3435
+
3436
+;Row 1, 2
3437
+    movq           xm13,       [r0]
3438
+    movq           xm14,       [r1]
3439
+    pinsrq         xm13,       [r0 + mmsize], 1
3440
+    pinsrq         xm14,       [r1 + mmsize], 1
3441
+    vpmovsxwq       m6,        xm13                             ; sign-extend int16 coeffs to int64 lanes
3442
+    vpmovsxwq       m7,        xm14
3443
+    psubq           m7,        m6                              ; predictedCoef
3444
+
3445
+    vcvtqq2pd       m9,        m6
3446
+    vfmadd213pd     m9,        m9,             m3              ; m9 = signCoef * signCoef (double precision, +0)
3447
+    vcvtpd2qq       m8,        m9
3448
+    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits
3449
+
3450
+    vcvtqq2pd      m10,        m7
3451
+    vcvtqq2pd      m11,        m12
3452
+    vfmadd213pd    m10,        m11,            m3              ; m10 = psyScale * predictedCoef (double precision, +0)
3453
+    vcvtpd2qq       m9,        m10
3454
+    vpsraq          m9,        RDO_MAX_32                      ;(psyScale * predictedCoef) >> max
3455
+
3456
+    psubq           m8,        m9                              ; per-coefficient cost
3457
+    paddq           m4,        m8
3458
+    movu           [r2],       ym8
3459
+    vextracti32x8  [r2 + 4 * mmsize],  m8 ,        1
3460
+
3461
+    ;Row 3, 4
3462
+    movq           xm13,       [r0 + 2 * mmsize]
3463
+    movq           xm14,       [r1 + 2 * mmsize]
3464
+    pinsrq         xm13,       [r0 + 3 * mmsize],      1
3465
+    pinsrq         xm14,       [r1 + 3 * mmsize],      1
3466
+    vpmovsxwq       m6,        xm13
3467
+    vpmovsxwq       m7,        xm14
3468
+    psubq           m7,        m6                              ; predictedCoef
3469
+
3470
+    vcvtqq2pd       m9,        m6
3471
+    vfmadd213pd     m9,        m9,             m3
3472
+    vcvtpd2qq       m8,        m9
3473
+    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits
3474
+
3475
+    vcvtqq2pd      m10,        m7
3476
+    vcvtqq2pd      m11,        m12
3477
+    vfmadd213pd    m10,        m11,             m3
3478
+    vcvtpd2qq       m9,        m10
3479
+    vpsraq          m9,        RDO_MAX_32                      ;(psyScale * predictedCoef) >> max
3480
+
3481
+    psubq           m8,         m9
3482
+    paddq           m4,         m8
3483
+    movu           [r2 + 8 * mmsize],       ym8
3484
+    vextracti32x8  [r2 + 12 * mmsize], m8 ,    1
3485
+
3486
+; horizontal reduction: fold the 8 qword lanes of m4 down to one scalar sum
+    vextracti32x8  ym2,         m4,            1
3487
+    paddq          ym4,        ym2
3488
+    vextracti32x4  xm2,         m4,            1
3489
+    paddq          xm4,        xm2
3490
+    punpckhqdq     xm2,        xm4,            xm3
3491
+    paddq          xm4,        xm2
3492
+
3493
+    paddq          xm0,        xm4                             ; *totalUncodedCost += sum
3494
+    paddq          xm1,        xm4                             ; *totalRdCost += sum
3495
+
3496
+    movq           [r3],       xm0
3497
+    movq           [r4],       xm1
3498
+    RET
3499
+
3500
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+; void nonPsyRdoQuant(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
+; 4x4 TU, psy-rd disabled: cost is (signCoef * signCoef) << scaleBits for each of the 16 coefficients.
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
3501
+cglobal nonPsyRdoQuant4, 5, 9, 16
3502
+    mov            r4d,        r4m                              ; r4 = blkPos
3503
+    lea             r0,        [r0 + 2 * r4]                    ; r0 += blkPos * sizeof(int16_t)
3504
+    lea             r4,        [4 * r4]
3505
+    lea             r1,        [r1 + 2 * r4]                    ; r1 += blkPos * sizeof(int64_t)  (r4 pre-scaled by 4)
3506
+    movq           xm0,        [r2]                             ; xm0 = *totalUncodedCost
3507
+    movq           xm1,        [r3]                             ; xm1 = *totalRdCost
3508
+
3509
+%if BIT_DEPTH == 12
3510
+    mov            r5,         [tab_nonpsyRdo12]                 ; scaleBits
3511
+%elif BIT_DEPTH == 10
3512
+    mov            r5,         [tab_nonpsyRdo10]
3513
+%elif BIT_DEPTH == 8
3514
+    mov            r5,         [tab_nonpsyRdo8]
3515
+%else
3516
+    %error Unsupported BIT_DEPTH!
3517
+%endif
3518
+    movq           xm2,        r5
3519
+    vpxor           m4,        m4                               ; m4 = running cost accumulator
3520
+    vpxor           m3,        m3                               ; m3 = 0.0, used as the FMA addend
3521
+    vpxor           m13,       m13
3522
+
3523
+    vpmovsxwd                  m6,        [r0]
3524
+    vcvtdq2pd                  m9,        xm6
3525
+    vfmadd213pd                m9,        m9,             m3
3526
+    vcvtpd2dq                  xm8,       m9
3527
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
3528
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
3529
+    paddq                      m4,        m13
3530
+    movu                       [r1],       m13
3531
+
3532
+    vpmovsxwd                 m6,        [r0 + 8]
3533
+    vcvtdq2pd                 m9,        xm6
3534
+    vfmadd213pd               m9,        m9,             m3
3535
+    vcvtpd2dq                 xm8,       m9
3536
+    vpmovsxdq                 m13,       xm8                              ; 32 bit int to 64 bit int
3537
+    vpsllq                    m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
3538
+    paddq                     m4,        m13
3539
+    movu                      [r1 + 32], m13
3540
+
3541
+    vpmovsxwd                 m6,        [r0 + 16]
3542
+    vcvtdq2pd                 m9,        xm6
3543
+    vfmadd213pd               m9,        m9,             m3
3544
+    vcvtpd2dq                 xm8,       m9
3545
+    vpmovsxdq                 m13,       xm8                              ; 32 bit int to 64 bit int
3546
+    vpsllq                    m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
3547
+    paddq                     m4,        m13
3548
+    movu                      [r1 + 64], m13
3549
+
3550
+    vpmovsxwd                 m6,        [r0 +24]
3551
+    vcvtdq2pd                 m9,        xm6
3552
+    vfmadd213pd               m9,        m9,             m3
3553
+    vcvtpd2dq                 xm8,       m9
3554
+    vpmovsxdq                 m13,       xm8                              ; 32 bit int to 64 bit int 
3555
+    vpsllq                    m13,       xm2                             ;(signCoef * signCoef) << scaleBits
3556
+    paddq                     m4,        m13
3557
+    movu                      [r1 + 96], m13
3558
+
3559
+
3560
+; horizontal reduction: fold the 4 qword lanes of m4 down to one scalar sum
+    vextracti128              xm2,       m4,            1
3561
+    paddq                     xm4,       xm2
3562
+    punpckhqdq                xm2,       xm4,            xm3
3563
+    paddq                     xm4,       xm2
3564
+
3565
+    paddq                     xm0,       xm4                    ; *totalUncodedCost += sum
3566
+    paddq                     xm1,       xm4                    ; *totalRdCost += sum
3567
+
3568
+    movq                      [r2],      xm0
3569
+    movq                      [r3],      xm1
3570
+    RET
3571
+
3572
+
3573
+
3574
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+; void nonPsyRdoQuant(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
+; 8x8 TU, psy-rd disabled: 4 coefficients from each of 4 rows; row stride is mmsize/2 in r0 and 2*mmsize in r1.
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
3575
+cglobal nonPsyRdoQuant8, 5, 5, 8
3576
+    mov            r4d,        r4m                              ; r4 = blkPos
3577
+    lea             r0,        [r0 + 2 * r4]                    ; r0 += blkPos * sizeof(int16_t)
3578
+    lea             r4,        [4 * r4]
3579
+    lea             r1,        [r1 + 2 * r4]                    ; r1 += blkPos * sizeof(int64_t)  (r4 pre-scaled by 4)
3580
+%if BIT_DEPTH == 12
3581
+    mov             r4,        [tab_nonpsyRdo12 + 8]
3582
+%elif BIT_DEPTH == 10
3583
+    mov             r4,        [tab_nonpsyRdo10 + 8]
3584
+%elif BIT_DEPTH == 8
3585
+    mov             r4,        [tab_nonpsyRdo8 + 8]
3586
+%else
3587
+    %error Unsupported BIT_DEPTH!
3588
+ %endif
3589
+    movq           xm3,        r4                               ; xm3 = scaleBits shift count
3590
+    movq           xm6,        [r2]                             ; xm6 = *totalUncodedCost
3591
+    movq           xm7,        [r3]                             ; xm7 = *totalRdCost
3592
+    vpxor           m4,        m4                               ; m4 = running cost accumulator
3593
+    vpxor           m5,        m5                               ; m5 = 0.0, used as the FMA addend
3594
+    movq           xm0,        [r0]
3595
+    vpmovsxwd       m1,         xm0
3596
+    vcvtdq2pd       m2,         xm1                              ; Convert packed 32-bit integers to packed double-precision (64-bit) floating-point elements
3597
+    vfmadd213pd     m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
3598
+    vcvtpd2dq       xm1,        m2
3599
+    vpmovsxdq       m0 ,        xm1
3600
+    vpsllq          m0,         xm3                              ; costUncoded
3601
+    paddq           m4,         m0
3602
+    movu            [r1],       ym0
3603
+    vpxor           m0,         m0
3604
+    movq            xm0,        [r0 +mmsize/2]
3605
+    vpmovsxwd       m1,         xm0
3606
+    vcvtdq2pd       m2,         xm1                              ; Convert packed 32-bit integers to packed double-precision (64-bit) floating-point elements
3607
+    vfmadd213pd     m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
3608
+    vcvtpd2dq       xm1,        m2
3609
+    vpmovsxdq       m0 ,       xm1
3610
+    vpsllq          m0,         xm3                              ; costUncoded
3611
+    paddq           m4,         m0
3612
+    movu            [r1 +2*mmsize],       m0
3613
+    vpxor           m0,         m0
3614
+    movq            xm0,        [r0 +mmsize]
3615
+    vpmovsxwd       m1,         xm0
3616
+    vcvtdq2pd       m2,         xm1                              ; Convert packed 32-bit integers to packed double-precision (64-bit) floating-point elements
3617
+    vfmadd213pd     m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
3618
+    vcvtpd2dq       xm1,        m2
3619
+    vpmovsxdq       m0 ,       xm1
3620
+    vpsllq          m0,         xm3                              ; costUncoded
3621
+    paddq           m4,         m0
3622
+    movu            [r1 +4*mmsize],       m0
3623
+    vpxor           m0,         m0
3624
+    movq            xm0,        [r0 +3*mmsize/2]
3625
+    vpmovsxwd       m1,         xm0
3626
+    vcvtdq2pd       m2,         xm1                              ; Convert packed 32-bit integers to packed double-precision (64-bit) floating-point elements
3627
+    vfmadd213pd     m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
3628
+    vcvtpd2dq       xm1,        m2
3629
+    vpmovsxdq       m0 ,       xm1
3630
+    vpsllq          m0,         xm3                              ; costUncoded
3631
+    paddq           m4,         m0
3632
+    movu            [r1 +6*mmsize],       m0
3633
+
3634
+; horizontal reduction: fold the 4 qword lanes of m4 down to one scalar sum
+    vextracti128    xm2,        m4,             1
3635
+    paddq           xm4,        xm2
3636
+    punpckhqdq      xm2,        xm4,            xm5
3637
+    paddq           xm4,        xm2
3638
+
3639
+    paddq          xm6,        xm4                              ; *totalUncodedCost += sum
3640
+    paddq          xm7,        xm4                              ; *totalRdCost += sum
3641
+
3642
+    movq           [r2],       xm6
3643
+    movq           [r3],       xm7
3644
+    RET
3645
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+; void nonPsyRdoQuant(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
+; 16x16 TU, psy-rd disabled: 4 coefficients from each of 4 rows; row stride is mmsize in r0 and 4*mmsize in r1.
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
3646
+cglobal nonPsyRdoQuant16, 5, 5, 8
3647
+    mov            r4d,        r4m                              ; r4 = blkPos
3648
+    lea             r0,        [r0 + 2 * r4]                    ; r0 += blkPos * sizeof(int16_t)
3649
+    lea             r4,        [4 * r4]
3650
+    lea             r1,        [r1 + 2 * r4]                    ; r1 += blkPos * sizeof(int64_t)  (r4 pre-scaled by 4)
3651
+%if BIT_DEPTH == 12
3652
+    mov             r4,        [tab_nonpsyRdo12 + 16]
3653
+%elif BIT_DEPTH == 10
3654
+    mov             r4,        [tab_nonpsyRdo10 + 16]
3655
+%elif BIT_DEPTH == 8
3656
+    mov             r4,        [tab_nonpsyRdo8 + 16]
3657
+%else
3658
+    %error Unsupported BIT_DEPTH!
3659
+ %endif
3660
+    movq           xm3,        r4                               ; xm3 = scaleBits shift count
3661
+    movq           xm6,        [r2]                             ; xm6 = *totalUncodedCost
3662
+    movq           xm7,        [r3]                             ; xm7 = *totalRdCost
3663
+    vpxor           m4,        m4                               ; m4 = running cost accumulator
3664
+    vpxor           m5,        m5                               ; m5 = 0.0, used as the FMA addend
3665
+
3666
+;Row 1, 2
3667
+    movq           xm0,        [r0]
3668
+    vpmovsxwd      m1,         xm0
3669
+    vcvtdq2pd      m2,         xm1                              ; Convert packed 32-bit integers to packed double-precision (64-bit) floating-point elements
3670
+    vfmadd213pd    m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
3671
+    vcvtpd2dq      xm1,         m2
3672
+    vpmovsxdq      m0 ,       xm1
3673
+    vpsllq         m0,         xm3                              ; costUncoded
3674
+    paddq          m4,         m0
3675
+    movu           [r1],       ym0
3676
+   
3677
+    movq           xm0,        [r0 +mmsize]
3678
+    vpmovsxwd      m1,         xm0
3679
+    vcvtdq2pd      m2,         xm1                              ; Convert packed 32-bit integers to packed double-precision (64-bit) floating-point elements
3680
+    vfmadd213pd    m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
3681
+    vcvtpd2dq      xm1,         m2
3682
+    vpmovsxdq      m0 ,       xm1
3683
+    vpsllq         m0,         xm3                              ; costUncoded
3684
+    paddq          m4,         m0
3685
+    movu           [r1+4*mmsize],       ym0
3686
+   
3687
+    movq           xm0,        [r0 + 2*mmsize]
3688
+    vpmovsxwd      m1,         xm0
3689
+    vcvtdq2pd      m2,         xm1                              ; Convert packed 32-bit integers to packed double-precision (64-bit) floating-point elements
3690
+    vfmadd213pd    m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
3691
+    vcvtpd2dq      xm1,         m2
3692
+    vpmovsxdq      m0 ,       xm1
3693
+    vpsllq         m0,         xm3                              ; costUncoded
3694
+    paddq          m4,         m0
3695
+    movu           [r1+8*mmsize],       ym0
3696
+
3697
+    movq           xm0,        [r0 + 3*mmsize]
3698
+    vpmovsxwd      m1,         xm0
3699
+    vcvtdq2pd      m2,         xm1                              ; Convert packed 32-bit integers to packed double-precision (64-bit) floating-point elements
3700
+    vfmadd213pd    m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
3701
+    vcvtpd2dq      xm1,         m2
3702
+    vpmovsxdq      m0 ,       xm1
3703
+    vpsllq         m0,         xm3                              ; costUncoded
3704
+    paddq          m4,         m0
3705
+    movu           [r1+12*mmsize],       ym0
3706
+
3707
+ 
3708
+; horizontal reduction: fold the 4 qword lanes of m4 down to one scalar sum
+    vextracti128  xm2,        m4,             1
3709
+    paddq          xm4,        xm2
3710
+    punpckhqdq     xm2,        xm4,            xm5
3711
+    paddq          xm4,        xm2
3712
+
3713
+    paddq          xm6,        xm4                              ; *totalUncodedCost += sum
3714
+    paddq          xm7,        xm4                              ; *totalRdCost += sum
3715
+
3716
+    movq           [r2],       xm6
3717
+    movq           [r3],       xm7
3718
+    RET
3719
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+; void nonPsyRdoQuant(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
+; 32x32 TU, psy-rd disabled: 4 coefficients from each of 4 rows; row stride is 2*mmsize in r0 and 8*mmsize in r1.
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
3720
+cglobal nonPsyRdoQuant32, 5, 5, 8
3721
+    mov            r4d,        r4m                              ; r4 = blkPos
3722
+    lea             r0,        [r0 + 2 * r4]                    ; r0 += blkPos * sizeof(int16_t)
3723
+    lea             r4,        [4 * r4]
3724
+    lea             r1,        [r1 + 2 * r4]                    ; r1 += blkPos * sizeof(int64_t)  (r4 pre-scaled by 4)
3725
+%if BIT_DEPTH == 12
3726
+    mov             r4,        [tab_nonpsyRdo12 + 24]
3727
+%elif BIT_DEPTH == 10
3728
+    mov             r4,        [tab_nonpsyRdo10 + 24]
3729
+%elif BIT_DEPTH == 8
3730
+    mov             r4,        [tab_nonpsyRdo8 + 24]
3731
+%else
3732
+    %error Unsupported BIT_DEPTH!
3733
+ %endif
3734
+    movq           xm3,        r4                               ; xm3 = scaleBits shift count
3735
+    movq           xm6,        [r2]                             ; xm6 = *totalUncodedCost
3736
+    movq           xm7,        [r3]                             ; xm7 = *totalRdCost
3737
+    vpxor           m4,        m4                               ; m4 = running cost accumulator
3738
+    vpxor           m5,        m5                               ; m5 = 0.0, used as the FMA addend
3739
+
3740
+    movq           xm0,        [r0]
3741
+    vpmovsxwd      m1,         xm0
3742
+    vcvtdq2pd      m2,         xm1                              ; Convert packed 32-bit integers to packed double-precision (64-bit) floating-point elements
3743
+    vfmadd213pd    m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
3744
+    vcvtpd2dq      xm1,         m2
3745
+    vpmovsxdq      m0 ,       xm1
3746
+    vpsllq         m0,         xm3                              ; costUncoded
3747
+    paddq          m4,         m0
3748
+    movu           [r1],       m0
3749
+    vpxor           m0,        m0
3750
+   
3751
+    movq           xm0,        [r0 +2*mmsize]
3752
+    vpmovsxwd      m1,         xm0
3753
+    vcvtdq2pd      m2,         xm1                              ; Convert packed 32-bit integers to packed double-precision (64-bit) floating-point elements
3754
+    vfmadd213pd    m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
3755
+    vcvtpd2dq      xm1,         m2
3756
+    vpmovsxdq      m0 ,       xm1
3757
+    vpsllq         m0,         xm3                              ; costUncoded
3758
+    paddq          m4,         m0
3759
+    movu           [r1 + 8*mmsize],       m0
3760
+    vpxor           m0,        m0
3761
+    
3762
+    movq           xm0,        [r0 +4*mmsize]
3763
+    vpmovsxwd      m1,         xm0
3764
+    vcvtdq2pd      m2,         xm1                              ; Convert packed 32-bit integers to packed double-precision (64-bit) floating-point elements
3765
+    vfmadd213pd    m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
3766
+    vcvtpd2dq      xm1,         m2
3767
+    vpmovsxdq      m0 ,       xm1
3768
+    vpsllq         m0,         xm3                              ; costUncoded
3769
+    paddq          m4,         m0
3770
+    movu           [r1 +16*mmsize],       m0
3771
+    vpxor           m0,        m0
3772
+
3773
+    movq           xm0,        [r0 +6*mmsize]
3774
+    vpmovsxwd      m1,         xm0
3775
+    vcvtdq2pd      m2,         xm1                              ; Convert packed 32-bit integers to packed double-precision (64-bit) floating-point elements
3776
+    vfmadd213pd    m2,         m2,             m5              ; m2 = m2 * m2 + m5 (m5 == 0): square the coefficients
3777
+    vcvtpd2dq      xm1,         m2
3778
+    vpmovsxdq      m0 ,       xm1
3779
+    vpsllq         m0,         xm3                              ; costUncoded
3780
+    paddq          m4,         m0
3781
+    movu           [r1 +24*mmsize],       m0
3782
+
3783
+; horizontal reduction: fold the 4 qword lanes of m4 down to one scalar sum
+    vextracti128   xm2,        m4,             1
3784
+    paddq          xm4,        xm2
3785
+    punpckhqdq     xm2,        xm4,            xm5
3786
+    paddq          xm4,        xm2
3787
+
3788
+    paddq          xm6,        xm4                              ; *totalUncodedCost += sum
3789
+    paddq          xm7,        xm4                              ; *totalRdCost += sum
3790
+
3791
+    movq           [r2],       xm6
3792
+    movq           [r3],       xm7
3793
+    RET
3794
+
3795
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+; void psyRdoQuant_1p(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
+; 4x4 TU single-pass variant (psy term folded out); NOTE(review): body matches nonPsyRdoQuant4 above — confirm intended.
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
3796
+cglobal psyRdoQuant_1p4, 5, 9, 16
3797
+    mov            r4d,        r4m                              ; r4 = blkPos
3798
+    lea             r0,        [r0 + 2 * r4]                    ; r0 += blkPos * sizeof(int16_t)
3799
+    lea             r4,        [4 * r4]
3800
+    lea             r1,        [r1 + 2 * r4]                    ; r1 += blkPos * sizeof(int64_t)  (r4 pre-scaled by 4)
3801
+    movq           xm0,        [r2]                             ; xm0 = *totalUncodedCost
3802
+    movq           xm1,        [r3]                             ; xm1 = *totalRdCost
3803
+
3804
+%if BIT_DEPTH == 12
3805
+    mov            r5,         [tab_nonpsyRdo12]                 ; scaleBits
3806
+%elif BIT_DEPTH == 10
3807
+    mov            r5,         [tab_nonpsyRdo10]
3808
+%elif BIT_DEPTH == 8
3809
+    mov            r5,         [tab_nonpsyRdo8]
3810
+%else
3811
+    %error Unsupported BIT_DEPTH!
3812
+%endif
3813
+    movq           xm2,        r5
3814
+    vpxor           m4,        m4                               ; m4 = running cost accumulator
3815
+    vpxor           m3,        m3                               ; m3 = 0.0, used as the FMA addend
3816
+    vpxor           m13,       m13
3817
+
3818
+    vpmovsxwd                  m6,        [r0]
3819
+    vcvtdq2pd                  m9,        xm6
3820
+    vfmadd213pd                m9,        m9,             m3
3821
+    vcvtpd2dq                  xm8,       m9
3822
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
3823
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
3824
+    paddq                      m4,        m13
3825
+    movu                       [r1],       m13
3826
+
3827
+    vpmovsxwd                 m6,        [r0 + 8]
3828
+    vcvtdq2pd                 m9,        xm6
3829
+    vfmadd213pd               m9,        m9,             m3
3830
+    vcvtpd2dq                 xm8,       m9
3831
+    vpmovsxdq                 m13,       xm8                              ; 32 bit int to 64 bit int
3832
+    vpsllq                    m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
3833
+    paddq                     m4,        m13
3834
+    movu                      [r1 + 32], m13
3835
+
3836
+    vpmovsxwd                 m6,        [r0 + 16]
3837
+    vcvtdq2pd                 m9,        xm6
3838
+    vfmadd213pd               m9,        m9,             m3
3839
+    vcvtpd2dq                 xm8,       m9
3840
+    vpmovsxdq                 m13,       xm8                              ; 32 bit int to 64 bit int
3841
+    vpsllq                    m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
3842
+    paddq                     m4,        m13
3843
+    movu                      [r1 + 64], m13
3844
+
3845
+    vpmovsxwd                 m6,        [r0 +24]
3846
+    vcvtdq2pd                 m9,        xm6
3847
+    vfmadd213pd               m9,        m9,             m3
3848
+    vcvtpd2dq                 xm8,       m9
3849
+    vpmovsxdq                 m13,       xm8                              ; 32 bit int to 64 bit int 
3850
+    vpsllq                    m13,       xm2                             ;(signCoef * signCoef) << scaleBits
3851
+    paddq                     m4,        m13
3852
+    movu                      [r1 + 96], m13
3853
+
3854
+
3855
+; horizontal reduction: fold the 4 qword lanes of m4 down to one scalar sum
+    vextracti128              xm2,       m4,            1
3856
+    paddq                     xm4,       xm2
3857
+    punpckhqdq                xm2,       xm4,            xm3
3858
+    paddq                     xm4,       xm2
3859
+
3860
+    paddq                     xm0,       xm4                    ; *totalUncodedCost += sum
3861
+    paddq                     xm1,       xm4                    ; *totalRdCost += sum
3862
+
3863
+    movq                      [r2],      xm0
3864
+    movq                      [r3],      xm1
3865
+    RET
3866
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+; void psyRdoQuant_1p(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
+; 8x8 TU single-pass variant: 4 coefficients from each of 4 rows (row stride 16 bytes in r0, 64 bytes in r1).
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
3867
+cglobal psyRdoQuant_1p8, 7, 9, 16
3868
+    mov            r4d,        r4m                              ; r4 = blkPos
3869
+    lea             r0,        [r0 + 2 * r4]                    ; r0 += blkPos * sizeof(int16_t)
3870
+    lea             r4,        [4 * r4]
3871
+    lea             r1,        [r1 + 2 * r4]                    ; r1 += blkPos * sizeof(int64_t)  (r4 pre-scaled by 4)
3872
+    movq           xm0,        [r2]                             ; xm0 = *totalUncodedCost
3873
+    movq           xm1,        [r3]                             ; xm1 = *totalRdCost
3874
+%if BIT_DEPTH == 12
3875
+    mov            r5,         [tab_nonpsyRdo12 +8]                 ; scaleBits
3876
+%elif BIT_DEPTH == 10
3877
+    mov            r5,         [tab_nonpsyRdo10 +8]
3878
+%elif BIT_DEPTH == 8
3879
+    mov            r5,         [tab_nonpsyRdo8 + 8 ]
3880
+%else
3881
+    %error Unsupported BIT_DEPTH!
3882
+%endif
3883
+    movq           xm2,        r5
3884
+    vpxor           m4,        m4                               ; m4 = running cost accumulator
3885
+    vpxor           m3,        m3                               ; m3 = 0.0, used as the FMA addend
3886
+    vpxor           m13,       m13
3887
+
3888
+
3889
+    vpmovsxwd                  m6,        [r0]
3890
+    vcvtdq2pd                  m9,        xm6
3891
+    vfmadd213pd                m9,        m9,             m3
3892
+    vcvtpd2dq                  xm8,       m9
3893
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
3894
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
3895
+    paddq                      m4,        m13
3896
+    movu                       [r1],       m13
3897
+
3898
+    vpmovsxwd                  m6,        [r0 + 16]
3899
+    vcvtdq2pd                  m9,        xm6
3900
+    vfmadd213pd                m9,        m9,             m3
3901
+    vcvtpd2dq                  xm8,       m9
3902
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
3903
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
3904
+    paddq                      m4,        m13
3905
+    movu                       [r1 + 64],       m13
3906
+
3907
+    vpmovsxwd                  m6,        [r0 +32]
3908
+    vcvtdq2pd                  m9,        xm6
3909
+    vfmadd213pd                m9,        m9,             m3
3910
+    vcvtpd2dq                  xm8,       m9
3911
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
3912
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
3913
+    paddq                      m4,        m13
3914
+    movu                       [r1 +128],       m13
3915
+
3916
+    vpmovsxwd                  m6,        [r0 + 48]
3917
+    vcvtdq2pd                  m9,        xm6
3918
+    vfmadd213pd                m9,        m9,             m3
3919
+    vcvtpd2dq                  xm8,       m9
3920
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
3921
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
3922
+    paddq                      m4,        m13
3923
+    movu                       [r1 + 192],       m13
3924
+
3925
+; horizontal reduction: fold the 4 qword lanes of m4 down to one scalar sum
+    vextracti128              xm2,       m4,            1
3926
+    paddq                     xm4,       xm2
3927
+    punpckhqdq                xm2,       xm4,            xm3
3928
+    paddq                     xm4,       xm2
3929
+
3930
+    paddq                     xm0,       xm4                    ; *totalUncodedCost += sum
3931
+    paddq                     xm1,       xm4                    ; *totalRdCost += sum
3932
+
3933
+    movq                      [r2],      xm0
3934
+    movq                      [r3],      xm1
3935
+    RET
3936
+
3937
+INIT_YMM avx2
3938
+cglobal psyRdoQuant_1p16, 7, 9, 16
3939
+    mov            r4d,        r4m
3940
+    lea             r0,        [r0 + 2 * r4]
3941
+    lea             r4,        [4 * r4]
3942
+    lea             r1,        [r1 + 2 * r4]
3943
+    movq           xm0,        [r2]
3944
+    movq           xm1,        [r3]
3945
+%if BIT_DEPTH == 12
3946
+    mov            r5,         [tab_nonpsyRdo12 + 16]                 ; scaleBits
3947
+%elif BIT_DEPTH == 10
3948
+    mov            r5,         [tab_nonpsyRdo10 + 16]
3949
+%elif BIT_DEPTH == 8
3950
+    mov            r5,         [tab_nonpsyRdo8 + 16 ]
3951
+%else
3952
+    %error Unsupported BIT_DEPTH!
3953
+%endif
3954
+    movq           xm2,        r5
3955
+    vpxor           m4,        m4
3956
+    vpxor           m3,        m3
3957
+    vpxor           m13,       m13
3958
+
3959
+    vpmovsxwd                  m6,        [r0]
3960
+    vcvtdq2pd                  m9,        xm6
3961
+    vfmadd213pd                m9,        m9,             m3
3962
+    vcvtpd2dq                  xm8,       m9
3963
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
3964
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
3965
+    paddq                      m4,        m13
3966
+    movu                       [r1],       m13
3967
+
3968
+    vpmovsxwd                  m6,        [r0 + mmsize]
3969
+
3970
+    vcvtdq2pd                  m9,        xm6
3971
+    vfmadd213pd                m9,        m9,             m3
3972
+    vcvtpd2dq                  xm8,       m9
3973
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
3974
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
3975
+    paddq                      m4,        m13
3976
+    movu                       [r1 + 4*mmsize],       m13
3977
+
3978
+    vpmovsxwd                  m6,        [r0 + 2 * mmsize]
3979
+    vcvtdq2pd                  m9,        xm6
3980
+    vfmadd213pd                m9,        m9,             m3
3981
+    vcvtpd2dq                  xm8,       m9
3982
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
3983
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
3984
+    paddq                      m4,        m13
3985
+    movu                       [r1 + 8*mmsize],       m13
3986
+
3987
+    vpmovsxwd                  m6,        [r0 + 3 * mmsize]
3988
+    vcvtdq2pd                  m9,        xm6
3989
+    vfmadd213pd                m9,        m9,             m3
3990
+    vcvtpd2dq                  xm8,       m9
3991
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
3992
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
3993
+    paddq                      m4,        m13
3994
+    movu                       [r1 + 12*mmsize],       m13
3995
+
3996
+    vextracti128              xm2,       m4,            1
3997
+    paddq                     xm4,       xm2
3998
+    punpckhqdq                xm2,       xm4,            xm3
3999
+    paddq                     xm4,       xm2
4000
+
4001
+    paddq                     xm0,       xm4
4002
+    paddq                     xm1,       xm4
4003
+
4004
+    movq                      [r2],      xm0
4005
+    movq                      [r3],      xm1
4006
+    RET
4007
+
4008
+INIT_YMM avx2
4009
+cglobal psyRdoQuant_1p32, 7, 9, 16
4010
+   mov            r4d,        r4m
4011
+    lea             r0,        [r0 + 2 * r4]
4012
+    lea             r4,        [4 * r4]
4013
+    lea             r1,        [r1 + 2 * r4]
4014
+    movq           xm0,        [r2]
4015
+    movq           xm1,        [r3]
4016
+%if BIT_DEPTH == 12
4017
+    mov            r5,         [tab_nonpsyRdo12 + 24]                 ; scaleBits
4018
+%elif BIT_DEPTH == 10
4019
+    mov            r5,         [tab_nonpsyRdo10 + 24]
4020
+%elif BIT_DEPTH == 8
4021
+    mov            r5,         [tab_nonpsyRdo8 + 24]
4022
+%else
4023
+    %error Unsupported BIT_DEPTH!
4024
+%endif
4025
+    movq           xm2,        r5
4026
+    vpxor           m4,        m4
4027
+    vpxor           m3,        m3
4028
+    vpxor           m13,       m13
4029
+
4030
+
4031
+    vpmovsxwd                  m6,        [r0]
4032
+    vcvtdq2pd                  m9,        xm6
4033
+    vfmadd213pd                m9,        m9,             m3
4034
+    vcvtpd2dq                  xm8,       m9
4035
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
4036
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
4037
+    paddq                      m4,        m13
4038
+    movu                       [r1],       m13
4039
+
4040
+    vpmovsxwd                  m6,        [r0 + 2 * mmsize]
4041
+    vcvtdq2pd                  m9,        xm6
4042
+    vfmadd213pd                m9,        m9,             m3
4043
+    vcvtpd2dq                  xm8,       m9
4044
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
4045
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
4046
+    paddq                      m4,        m13
4047
+    movu                       [r1 + 8 * mmsize],       m13
4048
+
4049
+    vpmovsxwd                  m6,        [r0 + 4 * mmsize]
4050
+    vcvtdq2pd                  m9,        xm6
4051
+    vfmadd213pd                m9,        m9,             m3
4052
+    vcvtpd2dq                  xm8,       m9
4053
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
4054
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
4055
+    paddq                      m4,        m13
4056
+    movu                       [r1 + 16 * mmsize],       m13
4057
+
4058
+    vpmovsxwd                  m6,        [r0 + 6 * mmsize]
4059
+    vcvtdq2pd                  m9,        xm6
4060
+    vfmadd213pd                m9,        m9,             m3
4061
+    vcvtpd2dq                  xm8,       m9
4062
+    vpmovsxdq                  m13,       xm8                              ; 32 bit int to 64 bit int
4063
+    vpsllq                     m13,       xm2                             ;(signCoef * signCoef) << scaleBits 
4064
+    paddq                      m4,        m13
4065
+    movu                       [r1  + 24 *mmsize],       m13
4066
+
4067
+    vextracti128              xm2,       m4,            1
4068
+    paddq                     xm4,       xm2
4069
+    punpckhqdq                xm2,       xm4,            xm3
4070
+    paddq                     xm4,       xm2
4071
+
4072
+    paddq                     xm0,       xm4
4073
+    paddq                     xm1,       xm4
4074
+
4075
+    movq                      [r2],      xm0
4076
+    movq                      [r3],      xm1
4077
+    RET
4078
+
4079
 %endif
4080
x265_2.7.tar.gz/source/common/x86/dct8.h -> x265_2.9.tar.gz/source/common/x86/dct8.h Changed
26
 
1
@@ -34,6 +34,11 @@
2
 FUNCDEF_TU_S2(void, idct, ssse3, const int16_t* src, int16_t* dst, intptr_t dstStride);
3
 FUNCDEF_TU_S2(void, idct, sse4, const int16_t* src, int16_t* dst, intptr_t dstStride);
4
 FUNCDEF_TU_S2(void, idct, avx2, const int16_t* src, int16_t* dst, intptr_t dstStride);
5
+FUNCDEF_TU_S2(void, nonPsyRdoQuant, avx512, int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
6
+FUNCDEF_TU_S2(void, psyRdoQuant, avx512, int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, int64_t* costUncoded, int64_t* totalUncodedCost, int64_t* totalRdCost, int64_t *psyScale, uint32_t blkPos);
7
+FUNCDEF_TU_S2(void, nonPsyRdoQuant, avx2, int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos);
8
+FUNCDEF_TU_S2(void, psyRdoQuant_1p, avx2, int16_t* m_resiDctCoeff,  int64_t* costUncoded, int64_t* totalUncodedCost, int64_t* totalRdCost,  uint32_t blkPos);
9
+FUNCDEF_TU_S2(void, psyRdoQuant_2p, avx2, int16_t* m_resiDctCoeff, int16_t* m_fencDctCoeff, int64_t* costUncoded, int64_t* totalUncodedCost, int64_t* totalRdCost, int64_t *psyScale, uint32_t blkPos);
10
 
11
 void PFX(dst4_ssse3)(const int16_t* src, int16_t* dst, intptr_t srcStride);
12
 void PFX(dst4_sse2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
13
@@ -42,5 +47,11 @@
14
 void PFX(idst4_avx2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
15
 void PFX(denoise_dct_sse4)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
16
 void PFX(denoise_dct_avx2)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
17
-
18
+void PFX(denoise_dct_avx512)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
19
+void PFX(dct8_avx512)(const int16_t* src, int16_t* dst, intptr_t srcStride);
20
+void PFX(idct8_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride);
21
+void PFX(idct16_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride);
22
+void PFX(idct32_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride);
23
+void PFX(dct32_avx512)(const int16_t* src, int16_t* dst, intptr_t srcStride);
24
+void PFX(dct16_avx512)(const int16_t* src, int16_t* dst, intptr_t srcStride);
25
 #endif // ifndef X265_DCT8_H
26
x265_2.7.tar.gz/source/common/x86/h-ipfilter16.asm -> x265_2.9.tar.gz/source/common/x86/h-ipfilter16.asm Changed
1589
 
1
@@ -47,7 +47,7 @@
2
 
3
 h_pd_524800:        times 8 dd 524800
4
                                     
5
-tab_LumaCoeff:    dw   0, 0,  0,  64,  0,   0,  0,  0
6
+h_tab_LumaCoeff:    dw   0, 0,  0,  64,  0,   0,  0,  0
7
                   dw  -1, 4, -10, 58,  17, -5,  1,  0
8
                   dw  -1, 4, -11, 40,  40, -11, 4, -1
9
                   dw   0, 1, -5,  17,  58, -10, 4, -1
10
@@ -79,8 +79,13 @@
11
                             db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 
12
 
13
 const interp8_hpp_shuf_new, db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
14
-                            db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13                         
15
-                            
16
+                            db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
17
+
18
+ALIGN 64
19
+interp8_hpp_shuf1_load_avx512: times 4 db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
20
+interp8_hpp_shuf2_load_avx512: times 4 db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
21
+interp8_hpp_shuf1_store_avx512: times 4 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
22
+
23
 SECTION .text
24
 cextern pd_8
25
 cextern pd_32
26
@@ -207,10 +212,10 @@
27
     add         r3d,    r3d
28
 
29
 %ifdef PIC
30
-    lea         r6,     [tab_LumaCoeff]
31
+    lea         r6,     [h_tab_LumaCoeff]
32
     mova        m0,     [r6 + r4]
33
 %else
34
-    mova        m0,     [tab_LumaCoeff + r4]
35
+    mova        m0,     [h_tab_LumaCoeff + r4]
36
 %endif
37
 
38
 %ifidn %3, pp
39
@@ -285,7 +290,8 @@
40
 ;------------------------------------------------------------------------------------------------------------
41
 ; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
42
 ;------------------------------------------------------------------------------------------------------------
43
-    FILTER_HOR_LUMA_sse2 4, 4, pp
44
+%if ARCH_X86_64
45
+   FILTER_HOR_LUMA_sse2 4, 4, pp
46
     FILTER_HOR_LUMA_sse2 4, 8, pp
47
     FILTER_HOR_LUMA_sse2 4, 16, pp
48
     FILTER_HOR_LUMA_sse2 8, 4, pp
49
@@ -339,6 +345,7 @@
50
     FILTER_HOR_LUMA_sse2 64, 32, ps
51
     FILTER_HOR_LUMA_sse2 64, 48, ps
52
     FILTER_HOR_LUMA_sse2 64, 64, ps
53
+%endif
54
 
55
 ;-----------------------------------------------------------------------------
56
 ; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
57
@@ -625,10 +632,10 @@
58
     add         r3, r3
59
 
60
 %ifdef PIC
61
-    lea         r6, [tab_LumaCoeff]
62
+    lea         r6, [h_tab_LumaCoeff]
63
     mova        m0, [r6 + r4]
64
 %else
65
-    mova        m0, [tab_LumaCoeff + r4]
66
+    mova        m0, [h_tab_LumaCoeff + r4]
67
 %endif
68
 
69
 %ifidn %3, pp
70
@@ -712,10 +719,10 @@
71
     shl         r4d, 4
72
 
73
 %ifdef PIC
74
-    lea         r6, [tab_LumaCoeff]
75
+    lea         r6, [h_tab_LumaCoeff]
76
     mova        m0, [r6 + r4]
77
 %else
78
-    mova        m0, [tab_LumaCoeff + r4]
79
+    mova        m0, [h_tab_LumaCoeff + r4]
80
 %endif
81
 
82
 %ifidn %3, pp
83
@@ -815,10 +822,10 @@
84
     shl         r4d, 4
85
 
86
 %ifdef PIC
87
-    lea         r6, [tab_LumaCoeff]
88
+    lea         r6, [h_tab_LumaCoeff]
89
     mova        m0, [r6 + r4]
90
 %else
91
-    mova        m0, [tab_LumaCoeff + r4]
92
+    mova        m0, [h_tab_LumaCoeff + r4]
93
 %endif
94
 %ifidn %3, pp
95
     mova        m1, [INTERP_OFFSET_PP]
96
@@ -936,10 +943,10 @@
97
     shl         r4d, 4
98
 
99
 %ifdef PIC
100
-    lea         r6, [tab_LumaCoeff]
101
+    lea         r6, [h_tab_LumaCoeff]
102
     mova        m0, [r6 + r4]
103
 %else
104
-    mova        m0, [tab_LumaCoeff + r4]
105
+    mova        m0, [h_tab_LumaCoeff + r4]
106
 %endif
107
 
108
 %ifidn %3, pp
109
@@ -1132,10 +1139,10 @@
110
     shl         r4d, 4
111
 
112
 %ifdef PIC
113
-    lea         r6, [tab_LumaCoeff]
114
+    lea         r6, [h_tab_LumaCoeff]
115
     mova        m0, [r6 + r4]
116
 %else
117
-    mova        m0, [tab_LumaCoeff + r4]
118
+    mova        m0, [h_tab_LumaCoeff + r4]
119
 %endif
120
 %ifidn %3, pp
121
     mova        m1, [pd_32]
122
@@ -1307,12 +1314,12 @@
123
     mov              r4d, r4m
124
     shl              r4d, 4
125
 %ifdef PIC
126
-    lea              r5, [tab_LumaCoeff]
127
+    lea              r5, [h_tab_LumaCoeff]
128
     vpbroadcastq     m0, [r5 + r4]
129
     vpbroadcastq     m1, [r5 + r4 + 8]
130
 %else
131
-    vpbroadcastq     m0, [tab_LumaCoeff + r4]
132
-    vpbroadcastq     m1, [tab_LumaCoeff + r4 + 8]
133
+    vpbroadcastq     m0, [h_tab_LumaCoeff + r4]
134
+    vpbroadcastq     m1, [h_tab_LumaCoeff + r4 + 8]
135
 %endif
136
     lea              r6, [pw_pixel_max]
137
     mova             m3, [interp8_hpp_shuf]
138
@@ -1376,302 +1383,352 @@
139
 ;-------------------------------------------------------------------------------------------------------------
140
 ; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
141
 ;-------------------------------------------------------------------------------------------------------------
142
-%macro FILTER_HOR_LUMA_W8 1
143
+%macro PROCESS_IPFILTER_LUMA_PP_8x2_AVX2 0
144
+    movu            xm7,        [r0]
145
+    movu            xm8,        [r0 + 8]
146
+    vinserti128     m7,        m7,        [r0 + r1],          1
147
+    vinserti128     m8,        m8,        [r0 + r1 + 8],      1
148
+    pshufb          m10,       m7,        m14
149
+    pshufb          m7,                   m13
150
+    pshufb          m11,       m8,        m14
151
+    pshufb          m8,                   m13
152
+
153
+    pmaddwd         m7,        m0
154
+    pmaddwd         m10,       m1
155
+    paddd           m7,        m10
156
+    pmaddwd         m10,       m11,       m3
157
+    pmaddwd         m9,        m8,        m2
158
+    paddd           m10,       m9
159
+    paddd           m7,        m10
160
+    paddd           m7,        m4
161
+    psrad           m7,        INTERP_SHIFT_PP
162
+
163
+    movu            xm9,        [r0 + 16]
164
+    vinserti128     m9,        m9,        [r0 + r1 + 16],      1
165
+    pshufb          m10,       m9,        m14
166
+    pshufb          m9,                   m13
167
+    pmaddwd         m8,        m0
168
+    pmaddwd         m11,       m1
169
+    paddd           m8,        m11
170
+    pmaddwd         m10,       m3
171
+    pmaddwd         m9,        m2
172
+    paddd           m9,        m10
173
+    paddd           m8,        m9
174
+    paddd           m8,        m4
175
+    psrad           m8,        INTERP_SHIFT_PP
176
+
177
+    packusdw        m7,        m8
178
+    pshufb          m7,        m12
179
+    CLIPW           m7,        m5,         m6
180
+    movu            [r2],      xm7
181
+    vextracti128    [r2 + r3], m7,         1
182
+%endmacro
183
+
184
+%macro IPFILTER_LUMA_AVX2_8xN 1
185
 INIT_YMM avx2
186
-cglobal interp_8tap_horiz_pp_8x%1, 4,6,8
187
-    add              r1d, r1d
188
-    add              r3d, r3d
189
-    sub              r0, 6
190
-    mov              r4d, r4m
191
-    shl              r4d, 4
192
+cglobal interp_8tap_horiz_pp_8x%1, 5,6,15
193
+    shl              r1d,        1
194
+    shl              r3d,        1
195
+    sub              r0,         6
196
+    mov              r4d,        r4m
197
+    shl              r4d,        4
198
+
199
 %ifdef PIC
200
-    lea              r5, [tab_LumaCoeff]
201
-    vpbroadcastq     m0, [r5 + r4]
202
-    vpbroadcastq     m1, [r5 + r4 + 8]
203
+    lea              r5,         [h_tab_LumaCoeff]
204
+    vpbroadcastd     m0,         [r5 + r4]
205
+    vpbroadcastd     m1,         [r5 + r4 + 4]
206
+    vpbroadcastd     m2,         [r5 + r4 + 8]
207
+    vpbroadcastd     m3,         [r5 + r4 + 12]
208
 %else
209
-    vpbroadcastq     m0, [tab_LumaCoeff + r4]
210
-    vpbroadcastq     m1, [h_ab_LumaCoeff + r4 + 8]
211
-%endif
212
-    mova             m3, [interp8_hpp_shuf]
213
-    mova             m7, [pd_32]
214
-    pxor             m2, m2
215
-
216
-    ; register map
217
-    ; m0 , m1 interpolate coeff
218
-
219
-    mov              r4d, %1/2
220
-
221
-.loop:
222
-    vbroadcasti128   m4, [r0]
223
-    vbroadcasti128   m5, [r0 + 8]
224
-    pshufb           m4, m3
225
-    pshufb           m5, m3
226
-
227
-    pmaddwd          m4, m0
228
-    pmaddwd          m5, m1
229
-    paddd            m4, m5
230
-
231
-    vbroadcasti128   m5, [r0 + 8]
232
-    vbroadcasti128   m6, [r0 + 16]
233
-    pshufb           m5, m3
234
-    pshufb           m6, m3
235
-
236
-    pmaddwd          m5, m0
237
-    pmaddwd          m6, m1
238
-    paddd            m5, m6
239
-
240
-    phaddd           m4, m5
241
-    vpermq           m4, m4, q3120
242
-    paddd            m4, m7
243
-    psrad            m4, INTERP_SHIFT_PP
244
-
245
-    packusdw         m4, m4
246
-    vpermq           m4, m4, q2020
247
-    CLIPW            m4, m2, [pw_pixel_max]
248
-    movu             [r2], xm4
249
-
250
-    vbroadcasti128   m4, [r0 + r1]
251
-    vbroadcasti128   m5, [r0 + r1 + 8]
252
-    pshufb           m4, m3
253
-    pshufb           m5, m3
254
-
255
-    pmaddwd          m4, m0
256
-    pmaddwd          m5, m1
257
-    paddd            m4, m5
258
-
259
-    vbroadcasti128   m5, [r0 + r1 + 8]
260
-    vbroadcasti128   m6, [r0 + r1 + 16]
261
-    pshufb           m5, m3
262
-    pshufb           m6, m3
263
-
264
-    pmaddwd          m5, m0
265
-    pmaddwd          m6, m1
266
-    paddd            m5, m6
267
-
268
-    phaddd           m4, m5
269
-    vpermq           m4, m4, q3120
270
-    paddd            m4, m7
271
-    psrad            m4, INTERP_SHIFT_PP
272
-
273
-    packusdw         m4, m4
274
-    vpermq           m4, m4, q2020
275
-    CLIPW            m4, m2, [pw_pixel_max]
276
-    movu             [r2 + r3], xm4
277
-
278
-    lea              r2, [r2 + 2 * r3]
279
-    lea              r0, [r0 + 2 * r1]
280
-    dec              r4d
281
-    jnz              .loop
282
+    vpbroadcastd     m0,         [h_tab_LumaCoeff + r4]
283
+    vpbroadcastd     m1,         [h_tab_LumaCoeff + r4 + 4]
284
+    vpbroadcastd     m2,         [h_tab_LumaCoeff + r4 + 8]
285
+    vpbroadcastd     m3,         [h_tab_LumaCoeff + r4 + 12]
286
+%endif
287
+    mova             m13,        [interp8_hpp_shuf1_load_avx512]
288
+    mova             m14,        [interp8_hpp_shuf2_load_avx512]
289
+    mova             m12,        [interp8_hpp_shuf1_store_avx512]
290
+    mova             m4,         [pd_32]
291
+    pxor             m5,         m5
292
+    mova             m6,         [pw_pixel_max]
293
+
294
+%rep %1/2 - 1
295
+    PROCESS_IPFILTER_LUMA_PP_8x2_AVX2
296
+    lea              r0,         [r0 + 2 * r1]
297
+    lea              r2,         [r2 + 2 * r3]
298
+%endrep
299
+    PROCESS_IPFILTER_LUMA_PP_8x2_AVX2
300
     RET
301
 %endmacro
302
-FILTER_HOR_LUMA_W8 4
303
-FILTER_HOR_LUMA_W8 8
304
-FILTER_HOR_LUMA_W8 16
305
-FILTER_HOR_LUMA_W8 32
306
+
307
+%if ARCH_X86_64
308
+    IPFILTER_LUMA_AVX2_8xN 4
309
+    IPFILTER_LUMA_AVX2_8xN 8
310
+    IPFILTER_LUMA_AVX2_8xN 16
311
+    IPFILTER_LUMA_AVX2_8xN 32
312
+%endif
313
 
314
 ;-------------------------------------------------------------------------------------------------------------
315
 ; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
316
 ;-------------------------------------------------------------------------------------------------------------
317
-%macro FILTER_HOR_LUMA_W16 1
318
+%macro PROCESS_IPFILTER_LUMA_PP_16x1_AVX2 0
319
+    movu            m7,        [r0]
320
+    movu            m8,        [r0 + 8]
321
+
322
+    pshufb          m10,       m7,        m14
323
+    pshufb          m7,                   m13
324
+    pshufb          m11,       m8,        m14
325
+    pshufb          m8,                   m13
326
+
327
+    pmaddwd         m7,        m0
328
+    pmaddwd         m10,       m1
329
+    paddd           m7,        m10
330
+    pmaddwd         m10,       m11,       m3
331
+    pmaddwd         m9,        m8,        m2
332
+    paddd           m10,       m9
333
+    paddd           m7,        m10
334
+    paddd           m7,        m4
335
+    psrad           m7,        INTERP_SHIFT_PP
336
+
337
+    movu            m9,        [r0 + 16]
338
+    pshufb          m10,       m9,        m14
339
+    pshufb          m9,                   m13
340
+    pmaddwd         m8,        m0
341
+    pmaddwd         m11,       m1
342
+    paddd           m8,        m11
343
+    pmaddwd         m10,       m3
344
+    pmaddwd         m9,        m2
345
+    paddd           m9,        m10
346
+    paddd           m8,        m9
347
+    paddd           m8,        m4
348
+    psrad           m8,        INTERP_SHIFT_PP
349
+
350
+    packusdw        m7,        m8
351
+    pshufb          m7,        m12
352
+    CLIPW           m7,        m5,         m6
353
+    movu            [r2],      m7
354
+%endmacro
355
+
356
+%macro IPFILTER_LUMA_AVX2_16xN 1
357
 INIT_YMM avx2
358
-cglobal interp_8tap_horiz_pp_16x%1, 4,6,8
359
-    add              r1d, r1d
360
-    add              r3d, r3d
361
-    sub              r0, 6
362
-    mov              r4d, r4m
363
-    shl              r4d, 4
364
+cglobal interp_8tap_horiz_pp_16x%1, 5,6,15
365
+    shl              r1d,        1
366
+    shl              r3d,        1
367
+    sub              r0,         6
368
+    mov              r4d,        r4m
369
+    shl              r4d,        4
370
+
371
 %ifdef PIC
372
-    lea              r5, [tab_LumaCoeff]
373
-    vpbroadcastq     m0, [r5 + r4]
374
-    vpbroadcastq     m1, [r5 + r4 + 8]
375
+    lea              r5,         [h_tab_LumaCoeff]
376
+    vpbroadcastd     m0,         [r5 + r4]
377
+    vpbroadcastd     m1,         [r5 + r4 + 4]
378
+    vpbroadcastd     m2,         [r5 + r4 + 8]
379
+    vpbroadcastd     m3,         [r5 + r4 + 12]
380
 %else
381
-    vpbroadcastq     m0, [tab_LumaCoeff + r4]
382
-    vpbroadcastq     m1, [tab_LumaCoeff + r4 + 8]
383
-%endif
384
-    mova             m3, [interp8_hpp_shuf]
385
-    mova             m7, [pd_32]
386
-    pxor             m2, m2
387
-
388
-    ; register map
389
-    ; m0 , m1 interpolate coeff
390
-
391
-    mov              r4d, %1
392
-
393
-.loop:
394
-    vbroadcasti128   m4, [r0]
395
-    vbroadcasti128   m5, [r0 + 8]
396
-    pshufb           m4, m3
397
-    pshufb           m5, m3
398
-
399
-    pmaddwd          m4, m0
400
-    pmaddwd          m5, m1
401
-    paddd            m4, m5
402
-
403
-    vbroadcasti128   m5, [r0 + 8]
404
-    vbroadcasti128   m6, [r0 + 16]
405
-    pshufb           m5, m3
406
-    pshufb           m6, m3
407
-
408
-    pmaddwd          m5, m0
409
-    pmaddwd          m6, m1
410
-    paddd            m5, m6
411
-
412
-    phaddd           m4, m5
413
-    vpermq           m4, m4, q3120
414
-    paddd            m4, m7
415
-    psrad            m4, INTERP_SHIFT_PP
416
-
417
-    packusdw         m4, m4
418
-    vpermq           m4, m4, q2020
419
-    CLIPW            m4, m2, [pw_pixel_max]
420
-    movu             [r2], xm4
421
-
422
-    vbroadcasti128   m4, [r0 + 16]
423
-    vbroadcasti128   m5, [r0 + 24]
424
-    pshufb           m4, m3
425
-    pshufb           m5, m3
426
-
427
-    pmaddwd          m4, m0
428
-    pmaddwd          m5, m1
429
-    paddd            m4, m5
430
-
431
-    vbroadcasti128   m5, [r0 + 24]
432
-    vbroadcasti128   m6, [r0 + 32]
433
-    pshufb           m5, m3
434
-    pshufb           m6, m3
435
-
436
-    pmaddwd          m5, m0
437
-    pmaddwd          m6, m1
438
-    paddd            m5, m6
439
-
440
-    phaddd           m4, m5
441
-    vpermq           m4, m4, q3120
442
-    paddd            m4, m7
443
-    psrad            m4, INTERP_SHIFT_PP
444
-
445
-    packusdw         m4, m4
446
-    vpermq           m4, m4, q2020
447
-    CLIPW            m4, m2, [pw_pixel_max]
448
-    movu             [r2 + 16], xm4
449
-
450
-    add              r2, r3
451
-    add              r0, r1
452
-    dec              r4d
453
-    jnz              .loop
454
+    vpbroadcastd     m0,         [h_tab_LumaCoeff + r4]
455
+    vpbroadcastd     m1,         [h_tab_LumaCoeff + r4 + 4]
456
+    vpbroadcastd     m2,         [h_tab_LumaCoeff + r4 + 8]
457
+    vpbroadcastd     m3,         [h_tab_LumaCoeff + r4 + 12]
458
+%endif
459
+    mova             m13,        [interp8_hpp_shuf1_load_avx512]
460
+    mova             m14,        [interp8_hpp_shuf2_load_avx512]
461
+    mova             m12,        [interp8_hpp_shuf1_store_avx512]
462
+    mova             m4,         [pd_32]
463
+    pxor             m5,         m5
464
+    mova             m6,         [pw_pixel_max]
465
+
466
+%rep %1 - 1
467
+    PROCESS_IPFILTER_LUMA_PP_16x1_AVX2
468
+    lea              r0,         [r0 + r1]
469
+    lea              r2,         [r2 + r3]
470
+%endrep
471
+    PROCESS_IPFILTER_LUMA_PP_16x1_AVX2
472
     RET
473
 %endmacro
474
-FILTER_HOR_LUMA_W16 4
475
-FILTER_HOR_LUMA_W16 8
476
-FILTER_HOR_LUMA_W16 12
477
-FILTER_HOR_LUMA_W16 16
478
-FILTER_HOR_LUMA_W16 32
479
-FILTER_HOR_LUMA_W16 64
480
+
481
+%if ARCH_X86_64
482
+    IPFILTER_LUMA_AVX2_16xN 4
483
+    IPFILTER_LUMA_AVX2_16xN 8
484
+    IPFILTER_LUMA_AVX2_16xN 12
485
+    IPFILTER_LUMA_AVX2_16xN 16
486
+    IPFILTER_LUMA_AVX2_16xN 32
487
+    IPFILTER_LUMA_AVX2_16xN 64
488
+%endif
489
 
490
 ;-------------------------------------------------------------------------------------------------------------
491
 ; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
492
 ;-------------------------------------------------------------------------------------------------------------
493
-%macro FILTER_HOR_LUMA_W32 2
494
+%macro PROCESS_IPFILTER_LUMA_PP_32x1_AVX2 0
495
+    PROCESS_IPFILTER_LUMA_PP_16x1_AVX2
496
+
497
+    movu            m7,        [r0 + mmsize]
498
+    movu            m8,        [r0 + 8 + mmsize]
499
+
500
+    pshufb          m10,       m7,        m14
501
+    pshufb          m7,                   m13
502
+    pshufb          m11,       m8,        m14
503
+    pshufb          m8,                   m13
504
+
505
+    pmaddwd         m7,        m0
506
+    pmaddwd         m10,       m1
507
+    paddd           m7,        m10
508
+    pmaddwd         m10,       m11,       m3
509
+    pmaddwd         m9,        m8,        m2
510
+    paddd           m10,       m9
511
+    paddd           m7,        m10
512
+    paddd           m7,        m4
513
+    psrad           m7,        INTERP_SHIFT_PP
514
+
515
+    movu            m9,        [r0 + 16 + mmsize]
516
+    pshufb          m10,       m9,        m14
517
+    pshufb          m9,                   m13
518
+    pmaddwd         m8,        m0
519
+    pmaddwd         m11,       m1
520
+    paddd           m8,        m11
521
+    pmaddwd         m10,       m3
522
+    pmaddwd         m9,        m2
523
+    paddd           m9,        m10
524
+    paddd           m8,        m9
525
+    paddd           m8,        m4
526
+    psrad           m8,        INTERP_SHIFT_PP
527
+
528
+    packusdw        m7,        m8
529
+    pshufb          m7,        m12
530
+    CLIPW           m7,        m5,         m6
531
+    movu            [r2 + mmsize],         m7
532
+%endmacro
533
+
534
+%macro IPFILTER_LUMA_AVX2_32xN 1
535
 INIT_YMM avx2
536
-cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
537
-    add              r1d, r1d
538
-    add              r3d, r3d
539
-    sub              r0, 6
540
-    mov              r4d, r4m
541
-    shl              r4d, 4
542
+cglobal interp_8tap_horiz_pp_32x%1, 5,6,15
543
+    shl              r1d,        1
544
+    shl              r3d,        1
545
+    sub              r0,         6
546
+    mov              r4d,        r4m
547
+    shl              r4d,        4
548
+
549
 %ifdef PIC
550
-    lea              r5, [tab_LumaCoeff]
551
-    vpbroadcastq     m0, [r5 + r4]
552
-    vpbroadcastq     m1, [r5 + r4 + 8]
553
+    lea              r5,         [h_tab_LumaCoeff]
554
+    vpbroadcastd     m0,         [r5 + r4]
555
+    vpbroadcastd     m1,         [r5 + r4 + 4]
556
+    vpbroadcastd     m2,         [r5 + r4 + 8]
557
+    vpbroadcastd     m3,         [r5 + r4 + 12]
558
 %else
559
-    vpbroadcastq     m0, [tab_LumaCoeff + r4]
560
-    vpbroadcastq     m1, [tab_LumaCoeff + r4 + 8]
561
-%endif
562
-    mova             m3, [interp8_hpp_shuf]
563
-    mova             m7, [pd_32]
564
-    pxor             m2, m2
565
-
566
-    ; register map
567
-    ; m0 , m1 interpolate coeff
568
-
569
-    mov              r4d, %2
570
-
571
-.loop:
572
-%assign x 0
573
-%rep %1/16
574
-    vbroadcasti128   m4, [r0 + x]
575
-    vbroadcasti128   m5, [r0 + 8 + x]
576
-    pshufb           m4, m3
577
-    pshufb           m5, m3
578
-
579
-    pmaddwd          m4, m0
580
-    pmaddwd          m5, m1
581
-    paddd            m4, m5
582
-
583
-    vbroadcasti128   m5, [r0 + 8 + x]
584
-    vbroadcasti128   m6, [r0 + 16 + x]
585
-    pshufb           m5, m3
586
-    pshufb           m6, m3
587
-
588
-    pmaddwd          m5, m0
589
-    pmaddwd          m6, m1
590
-    paddd            m5, m6
591
-
592
-    phaddd           m4, m5
593
-    vpermq           m4, m4, q3120
594
-    paddd            m4, m7
595
-    psrad            m4, INTERP_SHIFT_PP
596
-
597
-    packusdw         m4, m4
598
-    vpermq           m4, m4, q2020
599
-    CLIPW            m4, m2, [pw_pixel_max]
600
-    movu             [r2 + x], xm4
601
-
602
-    vbroadcasti128   m4, [r0 + 16 + x]
603
-    vbroadcasti128   m5, [r0 + 24 + x]
604
-    pshufb           m4, m3
605
-    pshufb           m5, m3
606
-
607
-    pmaddwd          m4, m0
608
-    pmaddwd          m5, m1
609
-    paddd            m4, m5
610
-
611
-    vbroadcasti128   m5, [r0 + 24 + x]
612
-    vbroadcasti128   m6, [r0 + 32 + x]
613
-    pshufb           m5, m3
614
-    pshufb           m6, m3
615
-
616
-    pmaddwd          m5, m0
617
-    pmaddwd          m6, m1
618
-    paddd            m5, m6
619
-
620
-    phaddd           m4, m5
621
-    vpermq           m4, m4, q3120
622
-    paddd            m4, m7
623
-    psrad            m4, INTERP_SHIFT_PP
624
-
625
-    packusdw         m4, m4
626
-    vpermq           m4, m4, q2020
627
-    CLIPW            m4, m2, [pw_pixel_max]
628
-    movu             [r2 + 16 + x], xm4
629
+    vpbroadcastd     m0,         [h_tab_LumaCoeff + r4]
630
+    vpbroadcastd     m1,         [h_tab_LumaCoeff + r4 + 4]
631
+    vpbroadcastd     m2,         [h_tab_LumaCoeff + r4 + 8]
632
+    vpbroadcastd     m3,         [h_tab_LumaCoeff + r4 + 12]
633
+%endif
634
+    mova             m13,        [interp8_hpp_shuf1_load_avx512]
635
+    mova             m14,        [interp8_hpp_shuf2_load_avx512]
636
+    mova             m12,        [interp8_hpp_shuf1_store_avx512]
637
+    mova             m4,         [pd_32]
638
+    pxor             m5,         m5
639
+    mova             m6,         [pw_pixel_max]
640
+
641
+%rep %1 - 1
642
+    PROCESS_IPFILTER_LUMA_PP_32x1_AVX2
643
+    lea              r0,         [r0 + r1]
644
+    lea              r2,         [r2 + r3]
645
+%endrep
646
+    PROCESS_IPFILTER_LUMA_PP_32x1_AVX2
647
+    RET
648
+%endmacro
649
 
650
+%if ARCH_X86_64
651
+    IPFILTER_LUMA_AVX2_32xN 8
652
+    IPFILTER_LUMA_AVX2_32xN 16
653
+    IPFILTER_LUMA_AVX2_32xN 24
654
+    IPFILTER_LUMA_AVX2_32xN 32
655
+    IPFILTER_LUMA_AVX2_32xN 64
656
+%endif
657
+
658
+%macro PROCESS_IPFILTER_LUMA_PP_64x1_AVX2 0
659
+    PROCESS_IPFILTER_LUMA_PP_16x1_AVX2
660
+%assign x 32
661
+%rep 3
662
+    movu            m7,        [r0 + x]
663
+    movu            m8,        [r0 + 8 + x]
664
+
665
+    pshufb          m10,       m7,        m14
666
+    pshufb          m7,                   m13
667
+    pshufb          m11,       m8,        m14
668
+    pshufb          m8,                   m13
669
+
670
+    pmaddwd         m7,        m0
671
+    pmaddwd         m10,       m1
672
+    paddd           m7,        m10
673
+    pmaddwd         m10,       m11,       m3
674
+    pmaddwd         m9,        m8,        m2
675
+    paddd           m10,       m9
676
+    paddd           m7,        m10
677
+    paddd           m7,        m4
678
+    psrad           m7,        INTERP_SHIFT_PP
679
+
680
+    movu            m9,        [r0 + 16 + x]
681
+    pshufb          m10,       m9,        m14
682
+    pshufb          m9,                   m13
683
+    pmaddwd         m8,        m0
684
+    pmaddwd         m11,       m1
685
+    paddd           m8,        m11
686
+    pmaddwd         m10,       m3
687
+    pmaddwd         m9,        m2
688
+    paddd           m9,        m10
689
+    paddd           m8,        m9
690
+    paddd           m8,        m4
691
+    psrad           m8,        INTERP_SHIFT_PP
692
+
693
+    packusdw        m7,        m8
694
+    pshufb          m7,        m12
695
+    CLIPW           m7,        m5,         m6
696
+    movu            [r2 + x],  m7
697
 %assign x x+32
698
 %endrep
699
+%endmacro
700
 
701
-    add              r2, r3
702
-    add              r0, r1
703
-    dec              r4d
704
-    jnz              .loop
705
+%macro IPFILTER_LUMA_AVX2_64xN 1
706
+INIT_YMM avx2
707
+cglobal interp_8tap_horiz_pp_64x%1, 5,6,15
708
+    shl              r1d,        1
709
+    shl              r3d,        1
710
+    sub              r0,         6
711
+    mov              r4d,        r4m
712
+    shl              r4d,        4
713
+
714
+%ifdef PIC
715
+    lea              r5,         [h_tab_LumaCoeff]
716
+    vpbroadcastd     m0,         [r5 + r4]
717
+    vpbroadcastd     m1,         [r5 + r4 + 4]
718
+    vpbroadcastd     m2,         [r5 + r4 + 8]
719
+    vpbroadcastd     m3,         [r5 + r4 + 12]
720
+%else
721
+    vpbroadcastd     m0,         [h_tab_LumaCoeff + r4]
722
+    vpbroadcastd     m1,         [h_tab_LumaCoeff + r4 + 4]
723
+    vpbroadcastd     m2,         [h_tab_LumaCoeff + r4 + 8]
724
+    vpbroadcastd     m3,         [h_tab_LumaCoeff + r4 + 12]
725
+%endif
726
+    mova             m13,        [interp8_hpp_shuf1_load_avx512]
727
+    mova             m14,        [interp8_hpp_shuf2_load_avx512]
728
+    mova             m12,        [interp8_hpp_shuf1_store_avx512]
729
+    mova             m4,         [pd_32]
730
+    pxor             m5,         m5
731
+    mova             m6,         [pw_pixel_max]
732
+
733
+%rep %1 - 1
734
+    PROCESS_IPFILTER_LUMA_PP_64x1_AVX2
735
+    lea              r0,         [r0 + r1]
736
+    lea              r2,         [r2 + r3]
737
+%endrep
738
+    PROCESS_IPFILTER_LUMA_PP_64x1_AVX2
739
     RET
740
 %endmacro
741
-FILTER_HOR_LUMA_W32 32, 8
742
-FILTER_HOR_LUMA_W32 32, 16
743
-FILTER_HOR_LUMA_W32 32, 24
744
-FILTER_HOR_LUMA_W32 32, 32
745
-FILTER_HOR_LUMA_W32 32, 64
746
-FILTER_HOR_LUMA_W32 64, 16
747
-FILTER_HOR_LUMA_W32 64, 32
748
-FILTER_HOR_LUMA_W32 64, 48
749
-FILTER_HOR_LUMA_W32 64, 64
750
+
751
+%if ARCH_X86_64
752
+    IPFILTER_LUMA_AVX2_64xN 16
753
+    IPFILTER_LUMA_AVX2_64xN 32
754
+    IPFILTER_LUMA_AVX2_64xN 48
755
+    IPFILTER_LUMA_AVX2_64xN 64
756
+%endif
757
 
758
 ;-------------------------------------------------------------------------------------------------------------
759
 ; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
760
@@ -1684,12 +1741,12 @@
761
     mov              r4d, r4m
762
     shl              r4d, 4
763
 %ifdef PIC
764
-    lea              r5, [tab_LumaCoeff]
765
+    lea              r5, [h_tab_LumaCoeff]
766
     vpbroadcastq     m0, [r5 + r4]
767
     vpbroadcastq     m1, [r5 + r4 + 8]
768
 %else
769
-    vpbroadcastq     m0, [tab_LumaCoeff + r4]
770
-    vpbroadcastq     m1, [tab_LumaCoeff + r4 + 8]
771
+    vpbroadcastq     m0, [h_tab_LumaCoeff + r4]
772
+    vpbroadcastq     m1, [h_tab_LumaCoeff + r4 + 8]
773
 %endif
774
     mova             m3, [interp8_hpp_shuf]
775
     mova             m7, [pd_32]
776
@@ -1774,12 +1831,12 @@
777
     mov              r4d, r4m
778
     shl              r4d, 4
779
 %ifdef PIC
780
-    lea              r5, [tab_LumaCoeff]
781
+    lea              r5, [h_tab_LumaCoeff]
782
     vpbroadcastq     m0, [r5 + r4]
783
     vpbroadcastq     m1, [r5 + r4 + 8]
784
 %else
785
-    vpbroadcastq     m0, [tab_LumaCoeff + r4]
786
-    vpbroadcastq     m1, [tab_LumaCoeff + r4 + 8]
787
+    vpbroadcastq     m0, [h_tab_LumaCoeff + r4]
788
+    vpbroadcastq     m1, [h_tab_LumaCoeff + r4 + 8]
789
 %endif
790
     mova             m3, [interp8_hpp_shuf]
791
     mova             m7, [pd_32]
792
@@ -1884,125 +1941,82 @@
793
 ;-------------------------------------------------------------------------------------------------------------
794
 ; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
795
 ;-------------------------------------------------------------------------------------------------------------
796
+%macro PROCESS_IPFILTER_LUMA_PP_48x1_AVX2 0
797
+    PROCESS_IPFILTER_LUMA_PP_32x1_AVX2
798
+
799
+    movu            m7,        [r0 + 2 * mmsize]
800
+    movu            m8,        [r0 + 8 + 2 * mmsize]
801
+
802
+    pshufb          m10,       m7,        m14
803
+    pshufb          m7,                   m13
804
+    pshufb          m11,       m8,        m14
805
+    pshufb          m8,                   m13
806
+
807
+    pmaddwd         m7,        m0
808
+    pmaddwd         m10,       m1
809
+    paddd           m7,        m10
810
+    pmaddwd         m10,       m11,       m3
811
+    pmaddwd         m9,        m8,        m2
812
+    paddd           m10,       m9
813
+    paddd           m7,        m10
814
+    paddd           m7,        m4
815
+    psrad           m7,        INTERP_SHIFT_PP
816
+
817
+    movu            m9,        [r0 + 16 + 2 * mmsize]
818
+    pshufb          m10,       m9,        m14
819
+    pshufb          m9,                   m13
820
+    pmaddwd         m8,        m0
821
+    pmaddwd         m11,       m1
822
+    paddd           m8,        m11
823
+    pmaddwd         m10,       m3
824
+    pmaddwd         m9,        m2
825
+    paddd           m9,        m10
826
+    paddd           m8,        m9
827
+    paddd           m8,        m4
828
+    psrad           m8,        INTERP_SHIFT_PP
829
+
830
+    packusdw        m7,        m8
831
+    pshufb          m7,        m12
832
+    CLIPW           m7,        m5,         m6
833
+    movu            [r2 + 2 * mmsize],     m7
834
+%endmacro
835
+
836
+%if ARCH_X86_64
837
 INIT_YMM avx2
838
-cglobal interp_8tap_horiz_pp_48x64, 4,6,8
839
-    add              r1d, r1d
840
-    add              r3d, r3d
841
-    sub              r0, 6
842
-    mov              r4d, r4m
843
-    shl              r4d, 4
844
+cglobal interp_8tap_horiz_pp_48x64, 5,6,15
845
+    shl              r1d,        1
846
+    shl              r3d,        1
847
+    sub              r0,         6
848
+    mov              r4d,        r4m
849
+    shl              r4d,        4
850
+
851
 %ifdef PIC
852
-    lea              r5, [tab_LumaCoeff]
853
-    vpbroadcastq     m0, [r5 + r4]
854
-    vpbroadcastq     m1, [r5 + r4 + 8]
855
+    lea              r5,         [h_tab_LumaCoeff]
856
+    vpbroadcastd     m0,         [r5 + r4]
857
+    vpbroadcastd     m1,         [r5 + r4 + 4]
858
+    vpbroadcastd     m2,         [r5 + r4 + 8]
859
+    vpbroadcastd     m3,         [r5 + r4 + 12]
860
 %else
861
-    vpbroadcastq     m0, [tab_LumaCoeff + r4]
862
-    vpbroadcastq     m1, [tab_LumaCoeff + r4 + 8]
863
-%endif
864
-    mova             m3, [interp8_hpp_shuf]
865
-    mova             m7, [pd_32]
866
-    pxor             m2, m2
867
-
868
-    ; register map
869
-    ; m0 , m1 interpolate coeff
870
-
871
-    mov              r4d, 64
872
-
873
-.loop:
874
-%assign x 0
875
-%rep 2
876
-    vbroadcasti128   m4, [r0 + x]
877
-    vbroadcasti128   m5, [r0 + 8 + x]
878
-    pshufb           m4, m3
879
-    pshufb           m5, m3
880
-
881
-    pmaddwd          m4, m0
882
-    pmaddwd          m5, m1
883
-    paddd            m4, m5
884
-
885
-    vbroadcasti128   m5, [r0 + 8 + x]
886
-    vbroadcasti128   m6, [r0 + 16 + x]
887
-    pshufb           m5, m3
888
-    pshufb           m6, m3
889
-
890
-    pmaddwd          m5, m0
891
-    pmaddwd          m6, m1
892
-    paddd            m5, m6
893
-
894
-    phaddd           m4, m5
895
-    vpermq           m4, m4, q3120
896
-    paddd            m4, m7
897
-    psrad            m4, INTERP_SHIFT_PP
898
-
899
-    packusdw         m4, m4
900
-    vpermq           m4, m4, q2020
901
-    CLIPW            m4, m2, [pw_pixel_max]
902
-    movu             [r2 + x], xm4
903
-
904
-    vbroadcasti128   m4, [r0 + 16 + x]
905
-    vbroadcasti128   m5, [r0 + 24 + x]
906
-    pshufb           m4, m3
907
-    pshufb           m5, m3
908
-
909
-    pmaddwd          m4, m0
910
-    pmaddwd          m5, m1
911
-    paddd            m4, m5
912
-
913
-    vbroadcasti128   m5, [r0 + 24 + x]
914
-    vbroadcasti128   m6, [r0 + 32 + x]
915
-    pshufb           m5, m3
916
-    pshufb           m6, m3
917
-
918
-    pmaddwd          m5, m0
919
-    pmaddwd          m6, m1
920
-    paddd            m5, m6
921
-
922
-    phaddd           m4, m5
923
-    vpermq           m4, m4, q3120
924
-    paddd            m4, m7
925
-    psrad            m4, INTERP_SHIFT_PP
926
-
927
-    packusdw         m4, m4
928
-    vpermq           m4, m4, q2020
929
-    CLIPW            m4, m2, [pw_pixel_max]
930
-    movu             [r2 + 16 + x], xm4
931
-
932
-    vbroadcasti128   m4, [r0 + 32 + x]
933
-    vbroadcasti128   m5, [r0 + 40 + x]
934
-    pshufb           m4, m3
935
-    pshufb           m5, m3
936
-
937
-    pmaddwd          m4, m0
938
-    pmaddwd          m5, m1
939
-    paddd            m4, m5
940
-
941
-    vbroadcasti128   m5, [r0 + 40 + x]
942
-    vbroadcasti128   m6, [r0 + 48 + x]
943
-    pshufb           m5, m3
944
-    pshufb           m6, m3
945
-
946
-    pmaddwd          m5, m0
947
-    pmaddwd          m6, m1
948
-    paddd            m5, m6
949
-
950
-    phaddd           m4, m5
951
-    vpermq           m4, m4, q3120
952
-    paddd            m4, m7
953
-    psrad            m4, INTERP_SHIFT_PP
954
-
955
-    packusdw         m4, m4
956
-    vpermq           m4, m4, q2020
957
-    CLIPW            m4, m2, [pw_pixel_max]
958
-    movu             [r2 + 32 + x], xm4
959
-
960
-%assign x x+48
961
+    vpbroadcastd     m0,         [h_tab_LumaCoeff + r4]
962
+    vpbroadcastd     m1,         [h_tab_LumaCoeff + r4 + 4]
963
+    vpbroadcastd     m2,         [h_tab_LumaCoeff + r4 + 8]
964
+    vpbroadcastd     m3,         [h_tab_LumaCoeff + r4 + 12]
965
+%endif
966
+    mova             m13,        [interp8_hpp_shuf1_load_avx512]
967
+    mova             m14,        [interp8_hpp_shuf2_load_avx512]
968
+    mova             m12,        [interp8_hpp_shuf1_store_avx512]
969
+    mova             m4,         [pd_32]
970
+    pxor             m5,         m5
971
+    mova             m6,         [pw_pixel_max]
972
+
973
+%rep 63
974
+    PROCESS_IPFILTER_LUMA_PP_48x1_AVX2
975
+    lea              r0,         [r0 + r1]
976
+    lea              r2,         [r2 + r3]
977
 %endrep
978
-
979
-    add              r2, r3
980
-    add              r0, r1
981
-    dec              r4d
982
-    jnz              .loop
983
+    PROCESS_IPFILTER_LUMA_PP_48x1_AVX2
984
     RET
985
+%endif
986
 
987
 ;-----------------------------------------------------------------------------------------------------------------------------
988
 ;void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
989
@@ -2018,12 +2032,12 @@
990
     add                         r3d,               r3d
991
 
992
 %ifdef PIC
993
-    lea                         r6,                [tab_LumaCoeff]
994
+    lea                         r6,                [h_tab_LumaCoeff]
995
     lea                         r4,                [r4 * 8]
996
     vbroadcasti128              m0,                [r6 + r4 * 2]
997
 %else
998
     lea                         r4,                [r4 * 8]
999
-    vbroadcasti128              m0,                [tab_LumaCoeff + r4 * 2]
1000
+    vbroadcasti128              m0,                [h_tab_LumaCoeff + r4 * 2]
1001
 %endif
1002
 
1003
     vbroadcasti128              m2,                [INTERP_OFFSET_PS]
1004
@@ -2119,22 +2133,53 @@
1005
     IPFILTER_LUMA_PS_4xN_AVX2 8
1006
     IPFILTER_LUMA_PS_4xN_AVX2 16
1007
 
1008
+   %macro PROCESS_IPFILTER_LUMA_PS_8x1_AVX2 1
1009
+
1010
+     %assign x 0
1011
+    %rep %1/8
1012
+    vbroadcasti128      m4, [r0 + x]
1013
+    vbroadcasti128      m5, [r0 + 8+ x]
1014
+    pshufb              m4, m3
1015
+    pshufb              m7, m5, m3
1016
+    pmaddwd             m4, m0
1017
+    pmaddwd             m7, m1
1018
+    paddd               m4, m7
1019
+
1020
+    vbroadcasti128      m6, [r0 + 16 + x]
1021
+    pshufb              m5, m3
1022
+    pshufb              m6, m3
1023
+    pmaddwd             m5, m0
1024
+    pmaddwd             m6, m1
1025
+    paddd               m5, m6
1026
+
1027
+    phaddd              m4, m5
1028
+    vpermq              m4, m4, q3120
1029
+    paddd               m4, m2
1030
+    vextracti128        xm5,m4, 1
1031
+    psrad               xm4, INTERP_SHIFT_PS
1032
+    psrad               xm5, INTERP_SHIFT_PS
1033
+    packssdw            xm4, xm5
1034
+    movu                [r2 + x], xm4
1035
+    %assign x x+16
1036
+     %endrep
1037
+    %endmacro
1038
+
1039
 %macro IPFILTER_LUMA_PS_8xN_AVX2 1
1040
 INIT_YMM avx2
1041
 %if ARCH_X86_64 == 1
1042
 cglobal interp_8tap_horiz_ps_8x%1, 4, 6, 8
1043
-    add                 r1d, r1d
1044
-    add                 r3d, r3d
1045
+    shl                 r1d, 1
1046
+    shl                 r3d, 1
1047
     mov                 r4d, r4m
1048
     mov                 r5d, r5m
1049
     shl                 r4d, 4
1050
 %ifdef PIC
1051
-    lea                 r6, [tab_LumaCoeff]
1052
+    lea                 r6, [h_tab_LumaCoeff]
1053
     vpbroadcastq        m0, [r6 + r4]
1054
     vpbroadcastq        m1, [r6 + r4 + 8]
1055
 %else
1056
-    vpbroadcastq        m0, [tab_LumaCoeff + r4]
1057
-    vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
1058
+    vpbroadcastq        m0, [h_tab_LumaCoeff + r4]
1059
+    vpbroadcastq        m1, [h_tab_LumaCoeff + r4 + 8]
1060
 %endif
1061
     mova                m3, [interp8_hpp_shuf]
1062
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
1063
@@ -2151,30 +2196,7 @@
1064
     add                 r4d, 7
1065
 
1066
 .loop0:
1067
-    vbroadcasti128      m4, [r0]
1068
-    vbroadcasti128      m5, [r0 + 8]
1069
-    pshufb              m4, m3
1070
-    pshufb              m7, m5, m3
1071
-    pmaddwd             m4, m0
1072
-    pmaddwd             m7, m1
1073
-    paddd               m4, m7
1074
-
1075
-    vbroadcasti128      m6, [r0 + 16]
1076
-    pshufb              m5, m3
1077
-    pshufb              m6, m3
1078
-    pmaddwd             m5, m0
1079
-    pmaddwd             m6, m1
1080
-    paddd               m5, m6
1081
-
1082
-    phaddd              m4, m5
1083
-    vpermq              m4, m4, q3120
1084
-    paddd               m4, m2
1085
-    vextracti128        xm5,m4, 1
1086
-    psrad               xm4, INTERP_SHIFT_PS
1087
-    psrad               xm5, INTERP_SHIFT_PS
1088
-    packssdw            xm4, xm5
1089
-
1090
-    movu                [r2], xm4
1091
+    PROCESS_IPFILTER_LUMA_PS_8x1_AVX2 8
1092
     add                 r2, r3
1093
     add                 r0, r1
1094
     dec                 r4d
1095
@@ -2197,12 +2219,12 @@
1096
     mov                 r5d, r5m
1097
     shl                 r4d, 4
1098
 %ifdef PIC
1099
-    lea                 r6, [tab_LumaCoeff]
1100
+    lea                 r6, [h_tab_LumaCoeff]
1101
     vpbroadcastq        m0, [r6 + r4]
1102
     vpbroadcastq        m1, [r6 + r4 + 8]
1103
 %else
1104
-    vpbroadcastq        m0, [tab_LumaCoeff + r4]
1105
-    vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
1106
+    vpbroadcastq        m0, [h_tab_LumaCoeff + r4]
1107
+    vpbroadcastq        m1, [h_tab_LumaCoeff + r4 + 8]
1108
 %endif
1109
     mova                m3, [interp8_hpp_shuf]
1110
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
1111
@@ -2218,46 +2240,297 @@
1112
     sub                 r0, r6
1113
     add                 r4d, 7
1114
 
1115
+
1116
 .loop0:
1117
-%assign x 0
1118
-%rep 24/8
1119
-    vbroadcasti128      m4, [r0 + x]
1120
-    vbroadcasti128      m5, [r0 + 8 + x]
1121
-    pshufb              m4, m3
1122
-    pshufb              m7, m5, m3
1123
-    pmaddwd             m4, m0
1124
-    pmaddwd             m7, m1
1125
-    paddd               m4, m7
1126
+    PROCESS_IPFILTER_LUMA_PS_8x1_AVX2 24
1127
+    add                 r2, r3
1128
+    add                 r0, r1
1129
+    dec                 r4d
1130
+    jnz                 .loop0
1131
+    RET
1132
+%endif
1133
 
1134
-    vbroadcasti128      m6, [r0 + 16 + x]
1135
-    pshufb              m5, m3
1136
-    pshufb              m6, m3
1137
-    pmaddwd             m5, m0
1138
-    pmaddwd             m6, m1
1139
-    paddd               m5, m6
1140
 
1141
-    phaddd              m4, m5
1142
-    vpermq              m4, m4, q3120
1143
-    paddd               m4, m2
1144
-    vextracti128        xm5,m4, 1
1145
-    psrad               xm4, INTERP_SHIFT_PS
1146
-    psrad               xm5, INTERP_SHIFT_PS
1147
-    packssdw            xm4, xm5
1148
+%macro PROCESS_IPFILTER_LUMA_PS_16x1_AVX2 0
1149
+    movu            m7,        [r0]
1150
+    movu            m8,        [r0 + 8]
1151
+    pshufb          m10,       m7,        m14
1152
+    pshufb          m7,                   m13
1153
+    pshufb          m11,       m8,        m14
1154
+    pshufb          m8,                   m13
1155
+
1156
+    pmaddwd         m7,        m0
1157
+    pmaddwd         m10,       m1
1158
+    paddd           m7,        m10
1159
+    pmaddwd         m10,       m11,       m3
1160
+    pmaddwd         m9,        m8,        m2
1161
+    paddd           m10,       m9
1162
+    paddd           m7,        m10
1163
+    paddd           m7,        m4
1164
+    psrad           m7,        INTERP_SHIFT_PS
1165
+    movu            m9,        [r0 + 16]
1166
+    pshufb          m10,       m9,        m14
1167
+    pshufb          m9,                   m13
1168
+    pmaddwd         m8,        m0
1169
+    pmaddwd         m11,       m1
1170
+    paddd           m8,        m11
1171
+    pmaddwd         m10,       m3
1172
+    pmaddwd         m9,        m2
1173
+    paddd           m9,        m10
1174
+    paddd           m8,        m9
1175
+    paddd           m8,        m4
1176
+    psrad           m8,        INTERP_SHIFT_PS
1177
+    packssdw        m7,        m8
1178
+    pshufb          m7,        m12
1179
+    movu            [r2],      m7
1180
+%endmacro
1181
 
1182
-    movu                [r2 + x], xm4
1183
-    %assign x x+16
1184
-    %endrep
1185
+%macro IPFILTER_LUMA_PS_16xN_AVX2 1
1186
+INIT_YMM avx2
1187
+%if ARCH_X86_64 == 1
1188
+cglobal interp_8tap_horiz_ps_16x%1, 5, 6, 15
1189
 
1190
-    add                 r2, r3
1191
-    add                 r0, r1
1192
+    shl                 r1d, 1
1193
+    shl                 r3d, 1
1194
+    mov                 r4d, r4m
1195
+    mov                 r5d, r5m
1196
+    shl                 r4d, 4
1197
+%ifdef PIC
1198
+    lea                 r6, [h_tab_LumaCoeff]
1199
+    vpbroadcastd     m0,         [r6 + r4]
1200
+    vpbroadcastd     m1,         [r6 + r4 + 4]
1201
+    vpbroadcastd     m2,         [r6 + r4 + 8]
1202
+    vpbroadcastd     m3,         [r6 + r4 + 12]
1203
+%else
1204
+    vpbroadcastd     m0,         [h_tab_LumaCoeff + r4]
1205
+    vpbroadcastd     m1,         [h_tab_LumaCoeff + r4 + 4]
1206
+    vpbroadcastd     m2,         [h_tab_LumaCoeff + r4 + 8]
1207
+    vpbroadcastd     m3,         [h_tab_LumaCoeff + r4 + 12]
1208
+%endif
1209
+    mova             m13,        [interp8_hpp_shuf1_load_avx512]
1210
+    mova             m14,        [interp8_hpp_shuf2_load_avx512]
1211
+    mova             m12,        [interp8_hpp_shuf1_store_avx512]
1212
+    vbroadcasti128           m4,         [INTERP_OFFSET_PS]
1213
+
1214
+    ; register map
1215
+    ; m0 , m1 interpolate coeff
1216
+
1217
+    sub                 r0, 6
1218
+    test                r5d, r5d
1219
+    mov                 r4d, %1
1220
+    jz                  .loop0
1221
+    lea                 r6, [r1*3]
1222
+    sub                 r0, r6
1223
+    add                 r4d, 7
1224
+
1225
+.loop0:
1226
+
1227
+     PROCESS_IPFILTER_LUMA_PS_16x1_AVX2
1228
+     lea              r0,         [r0 + r1]
1229
+    lea              r2,         [r2 + r3]
1230
+    ;add                 r2, r3
1231
+    ;add                 r0, r1
1232
     dec                 r4d
1233
     jnz                 .loop0
1234
     RET
1235
 %endif
1236
-%macro IPFILTER_LUMA_PS_32_64_AVX2 2
1237
+%endmacro
1238
+
1239
+    IPFILTER_LUMA_PS_16xN_AVX2 4
1240
+    IPFILTER_LUMA_PS_16xN_AVX2 8
1241
+    IPFILTER_LUMA_PS_16xN_AVX2 12
1242
+    IPFILTER_LUMA_PS_16xN_AVX2 16
1243
+    IPFILTER_LUMA_PS_16xN_AVX2 32
1244
+    IPFILTER_LUMA_PS_16xN_AVX2 64
1245
+%macro PROCESS_IPFILTER_LUMA_PS_32x1_AVX2 0
1246
+     PROCESS_IPFILTER_LUMA_PS_16x1_AVX2
1247
+    movu            m7,        [r0 + mmsize]
1248
+    movu            m8,        [r0 + 8+ mmsize]
1249
+    pshufb          m10,       m7,        m14
1250
+    pshufb          m7,                   m13
1251
+    pshufb          m11,       m8,        m14
1252
+    pshufb          m8,                   m13
1253
+
1254
+    pmaddwd         m7,        m0
1255
+    pmaddwd         m10,       m1
1256
+    paddd           m7,        m10
1257
+    pmaddwd         m10,       m11,       m3
1258
+    pmaddwd         m9,        m8,        m2
1259
+    paddd           m10,       m9
1260
+    paddd           m7,        m10
1261
+    paddd           m7,        m4
1262
+    psrad           m7,        INTERP_SHIFT_PS
1263
+    movu            m9,        [r0 + 16+ mmsize]
1264
+    pshufb          m10,       m9,        m14
1265
+    pshufb          m9,                   m13
1266
+    pmaddwd         m8,        m0
1267
+    pmaddwd         m11,       m1
1268
+    paddd           m8,        m11
1269
+    pmaddwd         m10,       m3
1270
+    pmaddwd         m9,        m2
1271
+    paddd           m9,        m10
1272
+    paddd           m8,        m9
1273
+    paddd           m8,        m4
1274
+    psrad           m8,        INTERP_SHIFT_PS
1275
+    packssdw        m7,        m8
1276
+    pshufb          m7,        m12
1277
+    movu            [r2+ mmsize],      m7
1278
+%endmacro
1279
+
1280
+%macro IPFILTER_LUMA_PS_32xN_AVX2 1
1281
+INIT_YMM avx2
1282
+%if ARCH_X86_64
1283
+cglobal interp_8tap_horiz_ps_32x%1, 5, 6, 15
1284
+
1285
+    shl                 r1d, 1
1286
+    shl                 r3d, 1
1287
+    mov                 r4d, r4m
1288
+    mov                 r5d, r5m
1289
+    shl                 r4d, 4
1290
+%ifdef PIC
1291
+    lea                 r6, [h_tab_LumaCoeff]
1292
+    vpbroadcastd     m0,         [r6 + r4]
1293
+    vpbroadcastd     m1,         [r6 + r4 + 4]
1294
+    vpbroadcastd     m2,         [r6 + r4 + 8]
1295
+    vpbroadcastd     m3,         [r6 + r4 + 12]
1296
+%else
1297
+    vpbroadcastd     m0,         [h_tab_LumaCoeff + r4]
1298
+    vpbroadcastd     m1,         [h_tab_LumaCoeff + r4 + 4]
1299
+    vpbroadcastd     m2,         [h_tab_LumaCoeff + r4 + 8]
1300
+    vpbroadcastd     m3,         [h_tab_LumaCoeff + r4 + 12]
1301
+%endif
1302
+    mova             m13,        [interp8_hpp_shuf1_load_avx512]
1303
+    mova             m14,        [interp8_hpp_shuf2_load_avx512]
1304
+    mova             m12,        [interp8_hpp_shuf1_store_avx512]
1305
+    vbroadcasti128           m4,         [INTERP_OFFSET_PS]
1306
+
1307
+    ; register map
1308
+    ; m0 , m1 interpolate coeff
1309
+
1310
+    sub                 r0, 6
1311
+    test                r5d, r5d
1312
+    mov                 r4d, %1
1313
+    jz                  .loop0
1314
+    lea                 r6, [r1*3]
1315
+    sub                 r0, r6
1316
+    add                 r4d, 7
1317
+
1318
+.loop0:
1319
+    PROCESS_IPFILTER_LUMA_PS_32x1_AVX2
1320
+    lea              r0,         [r0 + r1]
1321
+    lea              r2,         [r2 + r3]
1322
+    ;add                 r2, r3
1323
+    ;add                 r0, r1
1324
+    dec                 r4d
1325
+    jnz                 .loop0
1326
+    RET
1327
+%endif
1328
+%endmacro
1329
+
1330
+    IPFILTER_LUMA_PS_32xN_AVX2 8
1331
+    IPFILTER_LUMA_PS_32xN_AVX2 16
1332
+    IPFILTER_LUMA_PS_32xN_AVX2 24
1333
+    IPFILTER_LUMA_PS_32xN_AVX2 32
1334
+    IPFILTER_LUMA_PS_32xN_AVX2 64
1335
+
1336
+%macro PROCESS_IPFILTER_LUMA_PS_64x1_AVX2 0
1337
+     PROCESS_IPFILTER_LUMA_PS_16x1_AVX2
1338
+%assign x 32
1339
+%rep 3
1340
+    movu            m7,        [r0 + x]
1341
+    movu            m8,        [r0 + 8+ x]
1342
+    pshufb          m10,       m7,        m14
1343
+    pshufb          m7,                   m13
1344
+    pshufb          m11,       m8,        m14
1345
+    pshufb          m8,                   m13
1346
+
1347
+    pmaddwd         m7,        m0
1348
+    pmaddwd         m10,       m1
1349
+    paddd           m7,        m10
1350
+    pmaddwd         m10,       m11,       m3
1351
+    pmaddwd         m9,        m8,        m2
1352
+    paddd           m10,       m9
1353
+    paddd           m7,        m10
1354
+    paddd           m7,        m4
1355
+    psrad           m7,        INTERP_SHIFT_PS
1356
+    movu            m9,        [r0 + 16+ x]
1357
+    pshufb          m10,       m9,        m14
1358
+    pshufb          m9,                   m13
1359
+    pmaddwd         m8,        m0
1360
+    pmaddwd         m11,       m1
1361
+    paddd           m8,        m11
1362
+    pmaddwd         m10,       m3
1363
+    pmaddwd         m9,        m2
1364
+    paddd           m9,        m10
1365
+    paddd           m8,        m9
1366
+    paddd           m8,        m4
1367
+    psrad           m8,        INTERP_SHIFT_PS
1368
+    packssdw        m7,        m8
1369
+    pshufb          m7,        m12
1370
+    movu            [r2+ x],      m7
1371
+%assign x x+32
1372
+%endrep
1373
+%endmacro
1374
+
1375
+%macro IPFILTER_LUMA_PS_64xN_AVX2 1
1376
+INIT_YMM avx2
1377
+%if ARCH_X86_64
1378
+cglobal interp_8tap_horiz_ps_64x%1, 5, 6, 15
1379
+
1380
+    shl                 r1d, 1
1381
+    shl                 r3d, 1
1382
+    mov                 r4d, r4m
1383
+    mov                 r5d, r5m
1384
+    shl                 r4d, 4
1385
+%ifdef PIC
1386
+    lea                 r6, [h_tab_LumaCoeff]
1387
+    vpbroadcastd     m0,         [r6 + r4]
1388
+    vpbroadcastd     m1,         [r6 + r4 + 4]
1389
+    vpbroadcastd     m2,         [r6 + r4 + 8]
1390
+    vpbroadcastd     m3,         [r6 + r4 + 12]
1391
+%else
1392
+    vpbroadcastd     m0,         [h_tab_LumaCoeff + r4]
1393
+    vpbroadcastd     m1,         [h_tab_LumaCoeff + r4 + 4]
1394
+    vpbroadcastd     m2,         [h_tab_LumaCoeff + r4 + 8]
1395
+    vpbroadcastd     m3,         [h_tab_LumaCoeff + r4 + 12]
1396
+%endif
1397
+    mova             m13,        [interp8_hpp_shuf1_load_avx512]
1398
+    mova             m14,        [interp8_hpp_shuf2_load_avx512]
1399
+    mova             m12,        [interp8_hpp_shuf1_store_avx512]
1400
+    vbroadcasti128           m4,         [INTERP_OFFSET_PS]
1401
+
1402
+    ; register map
1403
+    ; m0 , m1 interpolate coeff
1404
+
1405
+    sub                 r0, 6
1406
+    test                r5d, r5d
1407
+    mov                 r4d, %1
1408
+    jz                  .loop0
1409
+    lea                 r6, [r1*3]
1410
+    sub                 r0, r6
1411
+    add                 r4d, 7
1412
+
1413
+.loop0:
1414
+    PROCESS_IPFILTER_LUMA_PS_64x1_AVX2
1415
+    lea              r0,         [r0 + r1]
1416
+    lea              r2,         [r2 + r3]
1417
+    ;add                 r2, r3
1418
+    ;add                 r0, r1
1419
+    dec                 r4d
1420
+    jnz                 .loop0
1421
+    RET
1422
+%endif
1423
+%endmacro
1424
+
1425
+    IPFILTER_LUMA_PS_64xN_AVX2 16
1426
+    IPFILTER_LUMA_PS_64xN_AVX2 32
1427
+    IPFILTER_LUMA_PS_64xN_AVX2 48
1428
+    IPFILTER_LUMA_PS_64xN_AVX2 64
1429
+
1430
+%macro IPFILTER_LUMA_PS_48xN_AVX2 1
1431
 INIT_YMM avx2
1432
 %if ARCH_X86_64 == 1
1433
-cglobal interp_8tap_horiz_ps_%1x%2, 4, 6, 8
1434
+cglobal interp_8tap_horiz_ps_48x%1, 5, 9,15
1435
 
1436
     add                 r1d, r1d
1437
     add                 r3d, r3d
1438
@@ -2280,7 +2553,7 @@
1439
 
1440
     sub                 r0, 6
1441
     test                r5d, r5d
1442
-    mov                 r4d, %2
1443
+    mov                 r4d, %1
1444
     jz                 .loop0
1445
     lea                 r6, [r1*3]
1446
     sub                 r0, r6
1447
@@ -2288,7 +2561,7 @@
1448
 
1449
 .loop0:
1450
 %assign x 0
1451
-%rep %1/16
1452
+%rep 3
1453
     vbroadcasti128      m4, [r0 + x]
1454
     vbroadcasti128      m5, [r0 + 4 * SIZEOF_PIXEL + x]
1455
     pshufb              m4, m3
1456
@@ -2351,115 +2624,7 @@
1457
     RET
1458
 %endif
1459
 %endmacro
1460
-
1461
-    IPFILTER_LUMA_PS_32_64_AVX2 32, 8
1462
-    IPFILTER_LUMA_PS_32_64_AVX2 32, 16
1463
-    IPFILTER_LUMA_PS_32_64_AVX2 32, 24
1464
-    IPFILTER_LUMA_PS_32_64_AVX2 32, 32
1465
-    IPFILTER_LUMA_PS_32_64_AVX2 32, 64
1466
-
1467
-    IPFILTER_LUMA_PS_32_64_AVX2 64, 16
1468
-    IPFILTER_LUMA_PS_32_64_AVX2 64, 32
1469
-    IPFILTER_LUMA_PS_32_64_AVX2 64, 48
1470
-    IPFILTER_LUMA_PS_32_64_AVX2 64, 64
1471
-
1472
-    IPFILTER_LUMA_PS_32_64_AVX2 48, 64
1473
-
1474
-%macro IPFILTER_LUMA_PS_16xN_AVX2 1
1475
-INIT_YMM avx2
1476
-%if ARCH_X86_64 == 1
1477
-cglobal interp_8tap_horiz_ps_16x%1, 4, 6, 8
1478
-
1479
-    add                 r1d, r1d
1480
-    add                 r3d, r3d
1481
-    mov                 r4d, r4m
1482
-    mov                 r5d, r5m
1483
-    shl                 r4d, 4
1484
-%ifdef PIC
1485
-    lea                 r6, [tab_LumaCoeff]
1486
-    vpbroadcastq        m0, [r6 + r4]
1487
-    vpbroadcastq        m1, [r6 + r4 + 8]
1488
-%else
1489
-    vpbroadcastq        m0, [tab_LumaCoeff + r4]
1490
-    vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
1491
-%endif
1492
-    mova                m3, [interp8_hpp_shuf]
1493
-    vbroadcasti128      m2, [INTERP_OFFSET_PS]
1494
-
1495
-    ; register map
1496
-    ; m0 , m1 interpolate coeff
1497
-
1498
-    sub                 r0, 6
1499
-    test                r5d, r5d
1500
-    mov                 r4d, %1
1501
-    jz                  .loop0
1502
-    lea                 r6, [r1*3]
1503
-    sub                 r0, r6
1504
-    add                 r4d, 7
1505
-
1506
-.loop0:
1507
-    vbroadcasti128      m4, [r0]
1508
-    vbroadcasti128      m5, [r0 + 8]
1509
-    pshufb              m4, m3
1510
-    pshufb              m7, m5, m3
1511
-    pmaddwd             m4, m0
1512
-    pmaddwd             m7, m1
1513
-    paddd               m4, m7
1514
-
1515
-    vbroadcasti128      m6, [r0 + 16]
1516
-    pshufb              m5, m3
1517
-    pshufb              m7, m6, m3
1518
-    pmaddwd             m5, m0
1519
-    pmaddwd             m7, m1
1520
-    paddd               m5, m7
1521
-
1522
-    phaddd              m4, m5
1523
-    vpermq              m4, m4, q3120
1524
-    paddd               m4, m2
1525
-    vextracti128        xm5, m4, 1
1526
-    psrad               xm4, INTERP_SHIFT_PS
1527
-    psrad               xm5, INTERP_SHIFT_PS
1528
-    packssdw            xm4, xm5
1529
-    movu                [r2], xm4
1530
-
1531
-    vbroadcasti128      m5, [r0 + 24]
1532
-    pshufb              m6, m3
1533
-    pshufb              m7, m5, m3
1534
-    pmaddwd             m6, m0
1535
-    pmaddwd             m7, m1
1536
-    paddd               m6, m7
1537
-
1538
-    vbroadcasti128      m7, [r0 + 32]
1539
-    pshufb              m5, m3
1540
-    pshufb              m7, m3
1541
-    pmaddwd             m5, m0
1542
-    pmaddwd             m7, m1
1543
-    paddd               m5, m7
1544
-
1545
-    phaddd              m6, m5
1546
-    vpermq              m6, m6, q3120
1547
-    paddd               m6, m2
1548
-    vextracti128        xm5,m6, 1
1549
-    psrad               xm6, INTERP_SHIFT_PS
1550
-    psrad               xm5, INTERP_SHIFT_PS
1551
-    packssdw            xm6, xm5
1552
-    movu                [r2 + 16], xm6
1553
-
1554
-    add                 r2, r3
1555
-    add                 r0, r1
1556
-    dec                 r4d
1557
-    jnz                 .loop0
1558
-    RET
1559
-%endif
1560
-%endmacro
1561
-
1562
-    IPFILTER_LUMA_PS_16xN_AVX2 4
1563
-    IPFILTER_LUMA_PS_16xN_AVX2 8
1564
-    IPFILTER_LUMA_PS_16xN_AVX2 12
1565
-    IPFILTER_LUMA_PS_16xN_AVX2 16
1566
-    IPFILTER_LUMA_PS_16xN_AVX2 32
1567
-    IPFILTER_LUMA_PS_16xN_AVX2 64
1568
-
1569
+      IPFILTER_LUMA_PS_48xN_AVX2 64
1570
 INIT_YMM avx2
1571
 %if ARCH_X86_64 == 1
1572
 cglobal interp_8tap_horiz_ps_12x16, 4, 6, 8
1573
@@ -2469,12 +2634,12 @@
1574
     mov                 r5d, r5m
1575
     shl                 r4d, 4
1576
 %ifdef PIC
1577
-    lea                 r6, [tab_LumaCoeff]
1578
+    lea                 r6, [h_tab_LumaCoeff]
1579
     vpbroadcastq        m0, [r6 + r4]
1580
     vpbroadcastq        m1, [r6 + r4 + 8]
1581
 %else
1582
-    vpbroadcastq        m0, [tab_LumaCoeff + r4]
1583
-    vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
1584
+    vpbroadcastq        m0, [h_tab_LumaCoeff + r4]
1585
+    vpbroadcastq        m1, [h_tab_LumaCoeff + r4 + 8]
1586
 %endif
1587
     mova                m3, [interp8_hpp_shuf]
1588
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
1589
x265_2.7.tar.gz/source/common/x86/h4-ipfilter16.asm -> x265_2.9.tar.gz/source/common/x86/h4-ipfilter16.asm Changed
299
 
1
@@ -52,7 +52,7 @@
2
 
3
 tab_Tm16:         db 0, 1, 2, 3, 4,  5,  6, 7, 2, 3, 4,  5, 6, 7, 8, 9
4
 
5
-tab_ChromaCoeff:  dw  0, 64,  0,  0
6
+h4_tab_ChromaCoeff:  dw  0, 64,  0,  0
7
                   dw -2, 58, 10, -2
8
                   dw -4, 54, 16, -2
9
                   dw -6, 46, 28, -4
10
@@ -279,10 +279,10 @@
11
     add         r4d,    r4d
12
 
13
 %ifdef PIC
14
-    lea         r6,     [tab_ChromaCoeff]
15
+    lea         r6,     [h4_tab_ChromaCoeff]
16
     movddup     m0,     [r6 + r4 * 4]
17
 %else
18
-    movddup     m0,     [tab_ChromaCoeff + r4 * 4]
19
+    movddup     m0,     [h4_tab_ChromaCoeff + r4 * 4]
20
 %endif
21
 
22
 %ifidn %3, ps
23
@@ -377,6 +377,7 @@
24
 ; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
25
 ;-----------------------------------------------------------------------------
26
 
27
+%if ARCH_X86_64
28
 FILTER_HOR_CHROMA_sse3 2, 4, pp
29
 FILTER_HOR_CHROMA_sse3 2, 8, pp
30
 FILTER_HOR_CHROMA_sse3 2, 16, pp
31
@@ -462,6 +463,7 @@
32
 FILTER_HOR_CHROMA_sse3 64, 32, ps
33
 FILTER_HOR_CHROMA_sse3 64, 48, ps
34
 FILTER_HOR_CHROMA_sse3 64, 64, ps
35
+%endif
36
 
37
 %macro FILTER_W2_2 1
38
     movu        m3,         [r0]
39
@@ -530,10 +532,10 @@
40
     add         r4d,      r4d
41
 
42
 %ifdef PIC
43
-    lea         r%6,      [tab_ChromaCoeff]
44
+    lea         r%6,      [h4_tab_ChromaCoeff]
45
     movh        m0,       [r%6 + r4 * 4]
46
 %else
47
-    movh        m0,       [tab_ChromaCoeff + r4 * 4]
48
+    movh        m0,       [h4_tab_ChromaCoeff + r4 * 4]
49
 %endif
50
 
51
     punpcklqdq  m0,       m0
52
@@ -1129,10 +1131,10 @@
53
     add         r4d,        r4d
54
 
55
 %ifdef PIC
56
-    lea         r%4,       [tab_ChromaCoeff]
57
+    lea         r%4,       [h4_tab_ChromaCoeff]
58
     movh        m0,       [r%4 + r4 * 4]
59
 %else
60
-    movh        m0,       [tab_ChromaCoeff + r4 * 4]
61
+    movh        m0,       [h4_tab_ChromaCoeff + r4 * 4]
62
 %endif
63
 
64
     punpcklqdq  m0,       m0
65
@@ -1246,10 +1248,10 @@
66
     sub             r0, 2
67
     mov             r4d, r4m
68
 %ifdef PIC
69
-    lea             r5, [tab_ChromaCoeff]
70
+    lea             r5, [h4_tab_ChromaCoeff]
71
     vpbroadcastq    m0, [r5 + r4 * 8]
72
 %else
73
-    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
74
+    vpbroadcastq    m0, [h4_tab_ChromaCoeff + r4 * 8]
75
 %endif
76
     mova            m1, [h4_interp8_hpp_shuf]
77
     vpbroadcastd    m2, [pd_32]
78
@@ -1314,10 +1316,10 @@
79
     sub             r0, 2
80
     mov             r4d, r4m
81
 %ifdef PIC
82
-    lea             r5, [tab_ChromaCoeff]
83
+    lea             r5, [h4_tab_ChromaCoeff]
84
     vpbroadcastq    m0, [r5 + r4 * 8]
85
 %else
86
-    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
87
+    vpbroadcastq    m0, [h4_tab_ChromaCoeff + r4 * 8]
88
 %endif
89
     mova            m1, [h4_interp8_hpp_shuf]
90
     vpbroadcastd    m2, [pd_32]
91
@@ -1370,10 +1372,10 @@
92
     sub             r0, 2
93
     mov             r4d, r4m
94
 %ifdef PIC
95
-    lea             r5, [tab_ChromaCoeff]
96
+    lea             r5, [h4_tab_ChromaCoeff]
97
     vpbroadcastq    m0, [r5 + r4 * 8]
98
 %else
99
-    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
100
+    vpbroadcastq    m0, [h4_tab_ChromaCoeff + r4 * 8]
101
 %endif
102
     mova            m1, [h4_interp8_hpp_shuf]
103
     vpbroadcastd    m2, [pd_32]
104
@@ -1432,10 +1434,10 @@
105
     sub             r0, 2
106
     mov             r4d, r4m
107
 %ifdef PIC
108
-    lea             r5, [tab_ChromaCoeff]
109
+    lea             r5, [h4_tab_ChromaCoeff]
110
     vpbroadcastq    m0, [r5 + r4 * 8]
111
 %else
112
-    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
113
+    vpbroadcastq    m0, [h4_tab_ChromaCoeff + r4 * 8]
114
 %endif
115
     mova            m1, [h4_interp8_hpp_shuf]
116
     vpbroadcastd    m2, [pd_32]
117
@@ -1504,10 +1506,10 @@
118
     sub             r0, 2
119
     mov             r4d, r4m
120
 %ifdef PIC
121
-    lea             r5, [tab_ChromaCoeff]
122
+    lea             r5, [h4_tab_ChromaCoeff]
123
     vpbroadcastq    m0, [r5 + r4 * 8]
124
 %else
125
-    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
126
+    vpbroadcastq    m0, [h4_tab_ChromaCoeff + r4 * 8]
127
 %endif
128
     mova            m1, [h4_interp8_hpp_shuf]
129
     vpbroadcastd    m2, [pd_32]
130
@@ -1579,10 +1581,10 @@
131
     sub             r0, 2
132
     mov             r4d, r4m
133
 %ifdef PIC
134
-    lea             r5, [tab_ChromaCoeff]
135
+    lea             r5, [h4_tab_ChromaCoeff]
136
     vpbroadcastq    m0, [r5 + r4 * 8]
137
 %else
138
-    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
139
+    vpbroadcastq    m0, [h4_tab_ChromaCoeff + r4 * 8]
140
 %endif
141
     mova            m1, [h4_interp8_hpp_shuf]
142
     vpbroadcastd    m2, [pd_32]
143
@@ -1655,10 +1657,10 @@
144
     sub             r0, 2
145
     mov             r4d, r4m
146
 %ifdef PIC
147
-    lea             r5, [tab_ChromaCoeff]
148
+    lea             r5, [h4_tab_ChromaCoeff]
149
     vpbroadcastq    m0, [r5 + r4 * 8]
150
 %else
151
-    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
152
+    vpbroadcastq    m0, [h4_tab_ChromaCoeff + r4 * 8]
153
 %endif
154
     mova            m1, [h4_interp8_hpp_shuf]
155
     vpbroadcastd    m2, [pd_32]
156
@@ -1724,10 +1726,10 @@
157
     sub             r0, 2
158
     mov             r4d, r4m
159
 %ifdef PIC
160
-    lea             r5, [tab_ChromaCoeff]
161
+    lea             r5, [h4_tab_ChromaCoeff]
162
     vpbroadcastq    m0, [r5 + r4 * 8]
163
 %else
164
-    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
165
+    vpbroadcastq    m0, [h4_tab_ChromaCoeff + r4 * 8]
166
 %endif
167
     mova            m1, [h4_interp8_hpp_shuf]
168
     vpbroadcastd    m2, [pd_32]
169
@@ -1804,10 +1806,10 @@
170
     sub             r0, 2
171
     mov             r4d, r4m
172
 %ifdef PIC
173
-    lea             r5, [tab_ChromaCoeff]
174
+    lea             r5, [h4_tab_ChromaCoeff]
175
     vpbroadcastq    m0, [r5 + r4 * 8]
176
 %else
177
-    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
178
+    vpbroadcastq    m0, [h4_tab_ChromaCoeff + r4 * 8]
179
 %endif
180
     mova            m1, [h4_interp8_hpp_shuf]
181
     vpbroadcastd    m2, [pd_32]
182
@@ -1872,10 +1874,10 @@
183
     sub             r0, 2
184
     mov             r4d, r4m
185
 %ifdef PIC
186
-    lea             r5, [tab_ChromaCoeff]
187
+    lea             r5, [h4_tab_ChromaCoeff]
188
     vpbroadcastq    m0, [r5 + r4 * 8]
189
 %else
190
-    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
191
+    vpbroadcastq    m0, [h4_tab_ChromaCoeff + r4 * 8]
192
 %endif
193
     mova            m1, [h4_interp8_hpp_shuf]
194
     vpbroadcastd    m2, [pd_32]
195
@@ -1934,10 +1936,10 @@
196
     mov                 r5d, r5m
197
 
198
 %ifdef PIC
199
-    lea                 r6, [tab_ChromaCoeff]
200
+    lea                 r6, [h4_tab_ChromaCoeff]
201
     vpbroadcastq        m0, [r6 + r4 * 8]
202
 %else
203
-    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
204
+    vpbroadcastq        m0, [h4_tab_ChromaCoeff + r4 * 8]
205
 %endif
206
     mova                m3, [h4_interp8_hpp_shuf]
207
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
208
@@ -1993,10 +1995,10 @@
209
     mov                 r5d, r5m
210
 
211
 %ifdef PIC
212
-    lea                 r6, [tab_ChromaCoeff]
213
+    lea                 r6, [h4_tab_ChromaCoeff]
214
     vpbroadcastq        m0, [r6 + r4 * 8]
215
 %else
216
-    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
217
+    vpbroadcastq        m0, [h4_tab_ChromaCoeff + r4 * 8]
218
 %endif
219
     mova                m3, [h4_interp8_hpp_shuf]
220
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
221
@@ -2066,10 +2068,10 @@
222
     mov                 r5d, r5m
223
 
224
 %ifdef PIC
225
-    lea                 r6, [tab_ChromaCoeff]
226
+    lea                 r6, [h4_tab_ChromaCoeff]
227
     vpbroadcastq        m0, [r6 + r4 * 8]
228
 %else
229
-    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
230
+    vpbroadcastq        m0, [h4_tab_ChromaCoeff + r4 * 8]
231
 %endif
232
     mova                m3, [h4_interp8_hpp_shuf]
233
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
234
@@ -2148,10 +2150,10 @@
235
     mov                 r5d, r5m
236
 
237
 %ifdef PIC
238
-    lea                 r6, [tab_ChromaCoeff]
239
+    lea                 r6, [h4_tab_ChromaCoeff]
240
     vpbroadcastq        m0, [r6 + r4 * 8]
241
 %else
242
-    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
243
+    vpbroadcastq        m0, [h4_tab_ChromaCoeff + r4 * 8]
244
 %endif
245
     mova                m3, [h4_interp8_hpp_shuf]
246
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
247
@@ -2213,10 +2215,10 @@
248
     mov                 r5d, r5m
249
 
250
 %ifdef PIC
251
-    lea                 r6, [tab_ChromaCoeff]
252
+    lea                 r6, [h4_tab_ChromaCoeff]
253
     vpbroadcastq        m0, [r6 + r4 * 8]
254
 %else
255
-    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
256
+    vpbroadcastq        m0, [h4_tab_ChromaCoeff + r4 * 8]
257
 %endif
258
     mova                m3, [h4_interp8_hpp_shuf]
259
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
260
@@ -2314,10 +2316,10 @@
261
     mov                 r5d, r5m
262
 
263
 %ifdef PIC
264
-    lea                 r6, [tab_ChromaCoeff]
265
+    lea                 r6, [h4_tab_ChromaCoeff]
266
     vpbroadcastq        m0, [r6 + r4 * 8]
267
 %else
268
-    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
269
+    vpbroadcastq        m0, [h4_tab_ChromaCoeff + r4 * 8]
270
 %endif
271
     mova                m3, [h4_interp8_hpp_shuf]
272
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
273
@@ -2467,10 +2469,10 @@
274
     mov                 r5d, r5m
275
 
276
 %ifdef PIC
277
-    lea                 r6, [tab_ChromaCoeff]
278
+    lea                 r6, [h4_tab_ChromaCoeff]
279
     vpbroadcastq        m0, [r6 + r4 * 8]
280
 %else
281
-    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
282
+    vpbroadcastq        m0, [h4_tab_ChromaCoeff + r4 * 8]
283
 %endif
284
     mova                m3, [h4_interp8_hpp_shuf]
285
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
286
@@ -2587,10 +2589,10 @@
287
     mov                 r5d, r5m
288
 
289
 %ifdef PIC
290
-    lea                 r6, [tab_ChromaCoeff]
291
+    lea                 r6, [h4_tab_ChromaCoeff]
292
     vpbroadcastq        m0, [r6 + r4 * 8]
293
 %else
294
-    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
295
+    vpbroadcastq        m0, [h4_tab_ChromaCoeff + r4 * 8]
296
 %endif
297
     mova                m3, [h4_interp8_hpp_shuf]
298
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
299
x265_2.7.tar.gz/source/common/x86/intrapred.h -> x265_2.9.tar.gz/source/common/x86/intrapred.h Changed
19
 
1
@@ -76,7 +76,7 @@
2
 FUNCDEF_TU_S2(void, intra_pred_dc, sse2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
3
 FUNCDEF_TU_S2(void, intra_pred_dc, sse4, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
4
 FUNCDEF_TU_S2(void, intra_pred_dc, avx2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
5
-
6
+FUNCDEF_TU_S2(void, intra_pred_dc, avx512, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
7
 FUNCDEF_TU_S2(void, intra_pred_planar, sse2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
8
 FUNCDEF_TU_S2(void, intra_pred_planar, sse4, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
9
 FUNCDEF_TU_S2(void, intra_pred_planar, avx2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
10
@@ -85,7 +85,7 @@
11
 DECL_ALL(ssse3);
12
 DECL_ALL(sse4);
13
 DECL_ALL(avx2);
14
-
15
+DECL_ALL(avx512);
16
 #undef DECL_ALL
17
 #undef DECL_ANGS
18
 #undef DECL_ANG
19
x265_2.7.tar.gz/source/common/x86/intrapred16.asm -> x265_2.9.tar.gz/source/common/x86/intrapred16.asm Changed
2638
 
1
@@ -71,7 +71,7 @@
2
 const pw_ang8_16,                   db  0,  0,  0,  0,  0,  0, 12, 13, 10, 11,  6,  7,  4,  5,  0,  1
3
 const pw_ang8_17,                   db  0,  0, 14, 15, 12, 13, 10, 11,  8,  9,  4,  5,  2,  3,  0,  1
4
 const pw_swap16,            times 2 db 14, 15, 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1
5
-
6
+const pw_swap16_avx512,     times 4 db 14, 15, 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1
7
 const pw_ang16_13,                  db 14, 15,  8,  9,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
8
 const pw_ang16_16,                  db  0,  0,  0,  0,  0,  0, 10, 11,  8,  9,  6,  7,  2,  3,  0,  1
9
 
10
@@ -196,6 +196,7 @@
11
 ;-----------------------------------------------------------------------------------
12
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
13
 ;-----------------------------------------------------------------------------------
14
+%if ARCH_X86_64
15
 INIT_XMM sse2
16
 cglobal intra_pred_dc8, 5, 8, 2
17
     movu            m0,            [r2 + 34]
18
@@ -275,10 +276,13 @@
19
     mov             [r0 + r7],     r3w
20
 .end:
21
     RET
22
+%endif
23
 
24
 ;-------------------------------------------------------------------------------------------------------
25
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
26
 ;-------------------------------------------------------------------------------------------------------
27
+%if ARCH_X86_64
28
+;This code is meant for 64 bit architecture
29
 INIT_XMM sse2
30
 cglobal intra_pred_dc16, 5, 10, 4
31
     lea             r3,                  [r2 + 66]
32
@@ -410,6 +414,7 @@
33
     mov             [r9 + r1 * 8],       r3w
34
 .end:
35
     RET
36
+%endif
37
 
38
 ;-------------------------------------------------------------------------------------------
39
 ; void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter)
40
@@ -474,6 +479,7 @@
41
 ;-------------------------------------------------------------------------------------------------------
42
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
43
 ;-------------------------------------------------------------------------------------------------------
44
+%if ARCH_X86_64
45
 INIT_YMM avx2
46
 cglobal intra_pred_dc16, 3, 9, 4
47
     mov             r3d,                 r4m
48
@@ -682,6 +688,68 @@
49
     movu            [r0 + r2 * 1 +  0], m0
50
     movu            [r0 + r2 * 1 + mmsize], m0
51
     RET
52
+INIT_ZMM avx512
53
+cglobal intra_pred_dc32, 3,3,2
54
+    add              r2, 2
55
+    add             r1d, r1d
56
+    movu             m0, [r2]
57
+    movu             m1, [r2 + 2 * mmsize]
58
+    paddw            m0, m1
59
+    vextracti32x8   ym1, m0, 1
60
+    paddw           ym0, ym1
61
+    vextracti32x4   xm1, m0, 1
62
+    paddw           xm0, xm1
63
+    pmaddwd         xm0, [pw_1]
64
+    movhlps         xm1, xm0
65
+    paddd           xm0, xm1
66
+    vpsrldq         xm1, xm0, 4
67
+    paddd           xm0, xm1
68
+    paddd           xm0, [pd_32]                        ; sum = sum + 32
69
+    psrld           xm0, 6                              ; sum = sum / 64
70
+    vpbroadcastw     m0, xm0
71
+    lea              r2, [r1 * 3]
72
+    ; store DC 32x32
73
+    movu            [r0 + r1 * 0 +  0], m0
74
+    movu            [r0 + r1 * 1 +  0], m0
75
+    movu            [r0 + r1 * 2 +  0], m0
76
+    movu            [r0 + r2 * 1 +  0], m0
77
+    lea             r0, [r0 + r1 * 4]
78
+    movu            [r0 + r1 * 0 +  0], m0
79
+    movu            [r0 + r1 * 1 +  0], m0
80
+    movu            [r0 + r1 * 2 +  0], m0
81
+    movu            [r0 + r2 * 1 +  0], m0
82
+    lea             r0, [r0 + r1 * 4]
83
+    movu            [r0 + r1 * 0 +  0], m0
84
+    movu            [r0 + r1 * 1 +  0], m0
85
+    movu            [r0 + r1 * 2 +  0], m0
86
+    movu            [r0 + r2 * 1 +  0], m0
87
+    lea             r0, [r0 + r1 * 4]
88
+    movu            [r0 + r1 * 0 +  0], m0
89
+    movu            [r0 + r1 * 1 +  0], m0
90
+    movu            [r0 + r1 * 2 +  0], m0
91
+    movu            [r0 + r2 * 1 +  0], m0
92
+    lea             r0, [r0 + r1 * 4]
93
+    movu            [r0 + r1 * 0 +  0], m0
94
+    movu            [r0 + r1 * 1 +  0], m0
95
+    movu            [r0 + r1 * 2 +  0], m0
96
+    movu            [r0 + r2 * 1 +  0], m0
97
+    lea             r0, [r0 + r1 * 4]
98
+    movu            [r0 + r1 * 0 +  0], m0
99
+    movu            [r0 + r1 * 1 +  0], m0
100
+    movu            [r0 + r1 * 2 +  0], m0
101
+    movu            [r0 + r2 * 1 +  0], m0
102
+    lea             r0, [r0 + r1 * 4]
103
+    movu            [r0 + r1 * 0 +  0], m0
104
+    movu            [r0 + r1 * 1 +  0], m0
105
+    movu            [r0 + r1 * 2 +  0], m0
106
+    movu            [r0 + r2 * 1 +  0], m0
107
+    lea             r0, [r0 + r1 * 4]
108
+    movu            [r0 + r1 * 0 +  0], m0
109
+    movu            [r0 + r1 * 1 +  0], m0
110
+    movu            [r0 + r1 * 2 +  0], m0
111
+    movu            [r0 + r2 * 1 +  0], m0
112
+    RET
113
+%endif
114
 
115
 ;---------------------------------------------------------------------------------------
116
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
117
@@ -1104,6 +1172,7 @@
118
 ;---------------------------------------------------------------------------------------
119
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
120
 ;---------------------------------------------------------------------------------------
121
+%if ARCH_X86_64
122
 INIT_XMM sse2
123
 cglobal intra_pred_planar32, 3,3,16
124
     movd            m3, [r2 + 66]               ; topRight   = above[32]
125
@@ -1209,7 +1278,7 @@
126
 %endrep
127
     RET
128
 %endif
129
-
130
+%endif
131
 ;---------------------------------------------------------------------------------------
132
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
133
 ;---------------------------------------------------------------------------------------
134
@@ -2063,6 +2132,7 @@
135
     STORE_4x4
136
     RET
137
 
138
+%if ARCH_X86_64
139
 cglobal intra_pred_ang4_26, 3,3,3
140
     movh        m0,             [r2 + 2] ;[8 7 6 5 4 3 2 1]
141
     add         r1d,            r1d
142
@@ -2098,6 +2168,7 @@
143
     mov         [r0 + r3],      r2w
144
 .quit:
145
     RET
146
+%endif
147
 
148
 cglobal intra_pred_ang4_27, 3,3,5
149
     movu        m0, [r2 + 2]            ;[8 7 6 5 4 3 2 1]
150
@@ -11054,35 +11125,35 @@
151
 
152
 %macro TRANSPOSE_STORE_AVX2 11
153
     jnz             .skip%11
154
-    punpckhwd       m%9,  m%1,  m%2
155
-    punpcklwd       m%1,  m%2
156
-    punpckhwd       m%2,  m%3,  m%4
157
-    punpcklwd       m%3,  m%4
158
-
159
-    punpckldq       m%4,  m%1,  m%3
160
-    punpckhdq       m%1,  m%3
161
-    punpckldq       m%3,  m%9,  m%2
162
-    punpckhdq       m%9,  m%2
163
-
164
-    punpckhwd       m%10, m%5,  m%6
165
-    punpcklwd       m%5,  m%6
166
-    punpckhwd       m%6,  m%7,  m%8
167
-    punpcklwd       m%7,  m%8
168
-
169
-    punpckldq       m%8,  m%5,  m%7
170
-    punpckhdq       m%5,  m%7
171
-    punpckldq       m%7,  m%10, m%6
172
-    punpckhdq       m%10, m%6
173
-
174
-    punpcklqdq      m%6,  m%4,  m%8
175
-    punpckhqdq      m%2,  m%4,  m%8
176
-    punpcklqdq      m%4,  m%1,  m%5
177
-    punpckhqdq      m%8,  m%1,  m%5
178
-
179
-    punpcklqdq      m%1,  m%3,  m%7
180
-    punpckhqdq      m%5,  m%3,  m%7
181
-    punpcklqdq      m%3,  m%9,  m%10
182
-    punpckhqdq      m%7,  m%9,  m%10
183
+    punpckhwd       ym%9,  ym%1,  ym%2
184
+    punpcklwd       ym%1,  ym%2
185
+    punpckhwd       ym%2,  ym%3,  ym%4
186
+    punpcklwd       ym%3,  ym%4
187
+
188
+    punpckldq       ym%4,  ym%1,  ym%3
189
+    punpckhdq       ym%1,  ym%3
190
+    punpckldq       ym%3,  ym%9,  ym%2
191
+    punpckhdq       ym%9,  ym%2
192
+
193
+    punpckhwd       ym%10, ym%5,  ym%6
194
+    punpcklwd       ym%5,  ym%6
195
+    punpckhwd       ym%6,  ym%7,  ym%8
196
+    punpcklwd       ym%7,  ym%8
197
+
198
+    punpckldq       ym%8,  ym%5,  ym%7
199
+    punpckhdq       ym%5,  ym%7
200
+    punpckldq       ym%7,  ym%10, ym%6
201
+    punpckhdq       ym%10, ym%6
202
+
203
+    punpcklqdq      ym%6,  ym%4,  ym%8
204
+    punpckhqdq      ym%2,  ym%4,  ym%8
205
+    punpcklqdq      ym%4,  ym%1,  ym%5
206
+    punpckhqdq      ym%8,  ym%1,  ym%5
207
+
208
+    punpcklqdq      ym%1,  ym%3,  ym%7
209
+    punpckhqdq      ym%5,  ym%3,  ym%7
210
+    punpcklqdq      ym%3,  ym%9,  ym%10
211
+    punpckhqdq      ym%7,  ym%9,  ym%10
212
 
213
     movu            [r0 + r1 * 0 + %11], xm%6
214
     movu            [r0 + r1 * 1 + %11], xm%2
215
@@ -11096,32 +11167,33 @@
216
     movu            [r5 + r4 * 1 + %11], xm%7
217
 
218
     lea             r5, [r5 + r1 * 4]
219
-    vextracti128    [r5 + r1 * 0 + %11], m%6, 1
220
-    vextracti128    [r5 + r1 * 1 + %11], m%2, 1
221
-    vextracti128    [r5 + r1 * 2 + %11], m%4, 1
222
-    vextracti128    [r5 + r4 * 1 + %11], m%8, 1
223
+    vextracti128    [r5 + r1 * 0 + %11], ym%6, 1
224
+    vextracti128    [r5 + r1 * 1 + %11], ym%2, 1
225
+    vextracti128    [r5 + r1 * 2 + %11], ym%4, 1
226
+    vextracti128    [r5 + r4 * 1 + %11], ym%8, 1
227
 
228
     lea             r5, [r5 + r1 * 4]
229
-    vextracti128    [r5 + r1 * 0 + %11], m%1, 1
230
-    vextracti128    [r5 + r1 * 1 + %11], m%5, 1
231
-    vextracti128    [r5 + r1 * 2 + %11], m%3, 1
232
-    vextracti128    [r5 + r4 * 1 + %11], m%7, 1
233
+    vextracti128    [r5 + r1 * 0 + %11], ym%1, 1
234
+    vextracti128    [r5 + r1 * 1 + %11], ym%5, 1
235
+    vextracti128    [r5 + r1 * 2 + %11], ym%3, 1
236
+    vextracti128    [r5 + r4 * 1 + %11], ym%7, 1
237
     jmp             .end%11
238
 .skip%11:
239
-    movu            [r0 + r1 * 0], m%1
240
-    movu            [r0 + r1 * 1], m%2
241
-    movu            [r0 + r1 * 2], m%3
242
-    movu            [r0 + r4 * 1], m%4
243
+    movu            [r0 + r1 * 0], ym%1
244
+    movu            [r0 + r1 * 1], ym%2
245
+    movu            [r0 + r1 * 2], ym%3
246
+    movu            [r0 + r4 * 1], ym%4
247
 
248
     lea             r0, [r0 + r1 * 4]
249
-    movu            [r0 + r1 * 0], m%5
250
-    movu            [r0 + r1 * 1], m%6
251
-    movu            [r0 + r1 * 2], m%7
252
-    movu            [r0 + r4 * 1], m%8
253
+    movu            [r0 + r1 * 0], ym%5
254
+    movu            [r0 + r1 * 1], ym%6
255
+    movu            [r0 + r1 * 2], ym%7
256
+    movu            [r0 + r4 * 1], ym%8
257
     lea             r0, [r0 + r1 * 4]
258
 .end%11:
259
 %endmacro
260
 
261
+%if ARCH_X86_64
262
 ;; angle 16, modes 3 and 33
263
 cglobal ang16_mode_3_33
264
     test            r6d, r6d
265
@@ -11771,7 +11843,6 @@
266
     packusdw        m11, m3
267
     TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 16
268
     ret
269
-
270
 ;; angle 16, modes 7 and 29
271
 cglobal ang16_mode_7_29
272
     test            r6d, r6d
273
@@ -18220,10 +18291,2364 @@
274
 
275
     mov         rsp,                [rsp+4*mmsize]
276
     RET
277
+%endif
278
 ;-------------------------------------------------------------------------------------------------------
279
 ; end of avx2 code for intra_pred_ang32 mode 2 to 34
280
 ;-------------------------------------------------------------------------------------------------------
281
+;-------------------------------------------------------------------------------------------------------
282
+; avx512 code for intra_pred_ang32 mode 2 to 34 start
283
+;-------------------------------------------------------------------------------------------------------
284
+INIT_ZMM avx512
285
+cglobal intra_pred_ang32_2, 3,5,3
286
+    lea         r4,                 [r2]
287
+    add         r2,                 128
288
+    cmp         r3m,                byte 34
289
+    cmove       r2,                 r4
290
+    add         r1d,                 r1d
291
+    lea         r3,                 [r1 * 3]
292
+    movu        m0,                 [r2 + 4]
293
+    movu        m1,                 [r2 + 20]
294
+
295
+    movu        [r0],               m0
296
+    palignr     m2,                 m1, m0, 2
297
+    movu        [r0 + r1],          m2
298
+    palignr     m2,                 m1, m0, 4
299
+    movu        [r0 + r1 * 2],      m2
300
+    palignr     m2,                 m1, m0, 6
301
+    movu        [r0 + r3],          m2
302
+
303
+    lea         r0,                 [r0 + r1 * 4]
304
+    palignr     m2,                 m1, m0, 8
305
+    movu        [r0],               m2
306
+    palignr     m2,                 m1, m0, 10
307
+    movu        [r0 + r1],          m2
308
+    palignr     m2,                 m1, m0, 12
309
+    movu        [r0 + r1 * 2],      m2
310
+    palignr     m2,                 m1, m0, 14
311
+    movu        [r0 + r3],          m2
312
+
313
+    movu        m0,                 [r2 + 36]
314
+    lea         r0,                 [r0 + r1 * 4]
315
+    movu        [r0],               m1
316
+    palignr     m2,                 m0, m1, 2
317
+    movu        [r0 + r1],          m2
318
+    palignr     m2,                 m0, m1, 4
319
+    movu        [r0 + r1 * 2],      m2
320
+    palignr     m2,                 m0, m1, 6
321
+    movu        [r0 + r3],          m2
322
+
323
+    lea         r0,                 [r0 + r1 * 4]
324
+    palignr     m2,                 m0, m1, 8
325
+    movu        [r0],               m2
326
+    palignr     m2,                 m0, m1, 10
327
+    movu        [r0 + r1],          m2
328
+    palignr     m2,                 m0, m1, 12
329
+    movu        [r0 + r1 * 2],      m2
330
+    palignr     m2,                 m0, m1, 14
331
+    movu        [r0 + r3],          m2
332
+
333
+    lea         r0,                 [r0 + r1 * 4]
334
+    movu        m1,                 [r2 + 52]
335
+
336
+    movu        [r0],               m0
337
+    palignr     m2,                 m1, m0, 2
338
+    movu        [r0 + r1],          m2
339
+    palignr     m2,                 m1, m0, 4
340
+    movu        [r0 + r1 * 2],      m2
341
+    palignr     m2,                 m1, m0, 6
342
+    movu        [r0 + r3],          m2
343
+
344
+    lea         r0,                 [r0 + r1 * 4]
345
+    palignr     m2,                 m1, m0, 8
346
+    movu        [r0],               m2
347
+    palignr     m2,                 m1, m0, 10
348
+    movu        [r0 + r1],          m2
349
+    palignr     m2,                 m1, m0, 12
350
+    movu        [r0 + r1 * 2],      m2
351
+    palignr     m2,                 m1, m0, 14
352
+    movu        [r0 + r3],          m2
353
+
354
+    movu        m0,                 [r2 + 68]
355
+    lea         r0,                 [r0 + r1 * 4]
356
+    movu        [r0],               m1
357
+    palignr     m2,                 m0, m1, 2
358
+    movu        [r0 + r1],          m2
359
+    palignr     m2,                 m0, m1, 4
360
+    movu        [r0 + r1 * 2],      m2
361
+    palignr     m2,                 m0, m1, 6
362
+    movu        [r0 + r3],          m2
363
+
364
+    lea         r0,                 [r0 + r1 * 4]
365
+    palignr     m2,                 m0, m1, 8
366
+    movu        [r0],               m2
367
+    palignr     m2,                 m0, m1, 10
368
+    movu        [r0 + r1],          m2
369
+    palignr     m2,                 m0, m1, 12
370
+    movu        [r0 + r1 * 2],      m2
371
+    palignr     m2,                 m0, m1, 14
372
+    movu        [r0 + r3],          m2
373
+    RET
374
+
375
+cglobal intra_pred_ang32_10, 3,4,2
376
+    add             r2, mmsize*2
377
+    add             r1d, r1d
378
+    lea             r3, [r1 * 3]
379
+
380
+    vpbroadcastw    m0, [r2 + 2]       ; [1...]
381
+    vpbroadcastw    m1, [r2 + 2 + 2]   ; [2...]
382
+    movu            [r0], m0
383
+    movu            [r0 + r1], m1
384
+
385
+    vpbroadcastw    m0, [r2 + 2 + 4]   ; [3...]
386
+    vpbroadcastw    m1, [r2 + 2 + 6]   ; [4...]
387
+    movu            [r0 + r1 * 2], m0
388
+    movu            [r0 + r3], m1
389
+    lea             r0, [r0 + r1 * 4]
390
+
391
+    vpbroadcastw    m0, [r2 + 2 + 8]   ; [5...]
392
+    vpbroadcastw    m1, [r2 + 2 + 10]  ; [6...]
393
+    movu            [r0], m0
394
+    movu            [r0 + r1], m1
395
+
396
+    vpbroadcastw    m0, [r2 + 2 + 12]  ; [7...]
397
+    vpbroadcastw    m1, [r2 + 2 + 14]  ; [8...]
398
+    movu            [r0 + r1 * 2], m0
399
+    movu            [r0 + r3], m1
400
+    lea             r0, [r0 + r1 *4]
401
+
402
+    vpbroadcastw    m0, [r2 + 2 + 16]  ; [9...]
403
+    vpbroadcastw    m1, [r2 + 2 + 18]  ; [10...]
404
+    movu            [r0], m0
405
+    movu            [r0 + r1], m1
406
+
407
+    vpbroadcastw    m0, [r2 + 2 + 20]  ; [11...]
408
+    vpbroadcastw    m1, [r2 + 2 + 22]  ; [12...]
409
+    movu            [r0 + r1 * 2], m0
410
+    movu            [r0 + r3], m1
411
+    lea             r0, [r0 + r1 *4]
412
+
413
+    vpbroadcastw    m0, [r2 + 2 + 24]  ; [13...]
414
+    vpbroadcastw    m1, [r2 + 2 + 26]  ; [14...]
415
+    movu            [r0], m0
416
+    movu            [r0 + r1], m1
417
+
418
+    vpbroadcastw    m0, [r2 + 2 + 28]  ; [15...]
419
+    vpbroadcastw    m1, [r2 + 2 + 30]  ; [16...]
420
+    movu            [r0 + r1 * 2], m0
421
+    movu            [r0 + r3], m1
422
+    lea             r0, [r0 + r1 *4]
423
+
424
+    vpbroadcastw    m0, [r2 + 2 + 32]  ; [17...]
425
+    vpbroadcastw    m1, [r2 + 2 + 34]  ; [18...]
426
+    movu            [r0], m0
427
+    movu            [r0 + r1], m1
428
+
429
+    vpbroadcastw    m0, [r2 + 2 + 36]  ; [19...]
430
+    vpbroadcastw    m1, [r2 + 2 + 38]  ; [20...]
431
+    movu            [r0 + r1 * 2], m0
432
+    movu            [r0 + r3], m1
433
+    lea             r0, [r0 + r1 *4]
434
+
435
+    vpbroadcastw    m0, [r2 + 2 + 40]  ; [21...]
436
+    vpbroadcastw    m1, [r2 + 2 + 42]  ; [22...]
437
+    movu            [r0], m0
438
+    movu            [r0 + r1], m1
439
+
440
+    vpbroadcastw    m0, [r2 + 2 + 44]  ; [23...]
441
+    vpbroadcastw    m1, [r2 + 2 + 46]  ; [24...]
442
+    movu            [r0 + r1 * 2], m0
443
+    movu            [r0 + r3], m1
444
+    lea             r0, [r0 + r1 *4]
445
+
446
+    vpbroadcastw    m0, [r2 + 2 + 48]  ; [25...]
447
+    vpbroadcastw    m1, [r2 + 2 + 50]  ; [26...]
448
+    movu            [r0], m0
449
+    movu            [r0 + r1], m1
450
+
451
+    vpbroadcastw    m0, [r2 + 2 + 52]  ; [27...]
452
+    vpbroadcastw    m1, [r2 + 2 + 54]  ; [28...]
453
+    movu            [r0 + r1 * 2], m0
454
+    movu            [r0 + r3], m1
455
+    lea             r0, [r0 + r1 *4]
456
+
457
+    vpbroadcastw    m0, [r2 + 2 + 56]  ; [29...]
458
+    vpbroadcastw    m1, [r2 + 2 + 58]  ; [30...]
459
+    movu            [r0], m0
460
+    movu            [r0 + r1], m1
461
+
462
+    vpbroadcastw    m0, [r2 + 2 + 60]  ; [31...]
463
+    vpbroadcastw    m1, [r2 + 2 + 62]  ; [32...]
464
+    movu            [r0 + r1 * 2], m0
465
+    movu            [r0 + r3], m1
466
+    RET
467
+
468
+cglobal intra_pred_ang32_18, 3,6,6
469
+    mov         r4,                 rsp
470
+    sub         rsp,                4*(mmsize/2)+gprsize
471
+    and         rsp,                ~63
472
+    mov         [rsp+4*(mmsize/2)],     r4
473
+
474
+    movu        m0,                 [r2]
475
+    mova                            [rsp + 2*(mmsize/2)],   ym0
476
+    vextracti32x8                   [rsp + 3*(mmsize/2)],   m0, 1
477
+
478
+    movu        m2,                 [r2 + 130]
479
+    pshufb      m2,                 [pw_swap16_avx512]
480
+    vpermq      m2,                 m2, q1032
481
+    mova                            [rsp + 1*(mmsize/2)],   ym2
482
+    vextracti32x8                   [rsp + 0*(mmsize/2)],   m2, 1
483
+
484
+    add         r1d,                r1d
485
+    lea         r2,                 [rsp+2*(mmsize/2)]
486
+    lea         r4,                 [r1 * 2]
487
+    lea         r3,                 [r1 * 3]
488
+    lea         r5,                 [r1 * 4]
489
+
490
+    movu        m0,                 [r2]
491
+    movu        m2,                 [r2 - 16]
492
+    movu        [r0],               m0
493
+
494
+    palignr     m4,                 m0, m2, 14
495
+    palignr     m5,                 m0, m2, 12
496
+    movu        [r0 + r1],          m4
497
+    movu        [r0 + r4],          m5
498
+
499
+    palignr     m4,                 m0, m2, 10
500
+    palignr     m5,                 m0, m2, 8
501
+    movu        [r0 + r3],          m4
502
+    add         r0,                 r5
503
+    movu        [r0],               m5
504
+
505
+    palignr     m4,                 m0, m2, 6
506
+    palignr     m5,                 m0, m2, 4
507
+    movu        [r0 + r1],          m4
508
+    movu        [r0 + r4],          m5
509
+
510
+    palignr     m4,                 m0, m2, 2
511
+    movu        [r0 + r3],          m4
512
+    add         r0,                 r5
513
+    movu        [r0],               m2
514
+
515
+    movu        m0,                 [r2 - 32]
516
+    palignr     m4,                 m2, m0, 14
517
+    palignr     m5,                 m2, m0, 12
518
+    movu        [r0 + r1],          m4
519
+    movu        [r0 + r4],          m5
520
+
521
+    palignr     m4,                 m2, m0, 10
522
+    palignr     m5,                 m2, m0, 8
523
+    movu        [r0 + r3],          m4
524
+    add         r0,                 r5
525
+    movu        [r0],               m5
526
+
527
+    palignr     m4,                 m2, m0, 6
528
+    palignr     m5,                 m2, m0, 4
529
+    movu        [r0 + r1],          m4
530
+    movu        [r0 + r4],          m5
531
+
532
+    palignr     m4,                 m2, m0, 2
533
+    movu        [r0 + r3],          m4
534
+    add         r0,                 r5
535
+    movu        [r0],               m0
536
+
537
+    movu        m2,                 [r2 - 48]
538
+    palignr     m4,                 m0, m2, 14
539
+    palignr     m5,                 m0, m2, 12
540
+    movu        [r0 + r1],          m4
541
+    movu        [r0 + r4],          m5
542
+
543
+    palignr     m4,                 m0, m2, 10
544
+    palignr     m5,                 m0, m2, 8
545
+    movu        [r0 + r3],          m4
546
+    add         r0,                 r5
547
+    movu        [r0],               m5
548
+
549
+    palignr     m4,                 m0, m2, 6
550
+    palignr     m5,                 m0, m2, 4
551
+    movu        [r0 + r1],          m4
552
+    movu        [r0 + r4],          m5
553
+
554
+    palignr     m4,                 m0, m2, 2
555
+    movu        [r0 + r3],          m4
556
+    add         r0,                 r5
557
+    movu        [r0],               m2
558
+
559
+    movu        m0,                 [r2 - 64]
560
+    palignr     m4,                 m2, m0, 14
561
+    palignr     m5,                 m2, m0, 12
562
+    movu        [r0 + r1],          m4
563
+    movu        [r0 + r4],          m5
564
+
565
+    palignr     m4,                 m2, m0, 10
566
+    palignr     m5,                 m2, m0, 8
567
+    movu        [r0 + r3],          m4
568
+    add         r0,                 r5
569
+    movu        [r0],               m5
570
+
571
+    palignr     m4,                 m2, m0, 6
572
+    palignr     m5,                 m2, m0, 4
573
+    movu        [r0 + r1],          m4
574
+    movu        [r0 + r4],          m5
575
+
576
+    palignr     m4,                 m2, m0, 2
577
+    movu        [r0 + r3],          m4
578
+    mov         rsp,                [rsp+4*(mmsize/2)]
579
+    RET
580
+; intra_pred_ang32_26: 32x32 angular mode 26 (pure vertical). Loads the
+; 32-sample reference row at [r2 + 2] once and stores it unchanged into all
+; 32 destination rows (one zmm store per row, 4 rows per group).
+; r0 = dst, r1 = stride (doubled below -- samples are presumably 16-bit,
+; so this converts a pixel stride to a byte stride), r2 = reference array,
+; then reused as 3*stride once the reference row has been loaded.
+; Fix vs. submitted patch: the second store used a 3-space indent, breaking
+; the file's uniform 4-space column alignment; realigned.
+INIT_ZMM avx512
581
+cglobal intra_pred_ang32_26, 3,3,2
582
+    movu        m0,                 [r2 + 2]
583
+    add         r1d,                r1d
584
+    lea         r2,                 [r1 * 3]
585
+    movu        [r0],               m0
586
+    movu        [r0 + r1],          m0
587
+    movu        [r0 + r1 * 2],      m0
588
+    movu        [r0 + r2],          m0
589
+    lea         r0,                 [r0 + r1 *4]
590
+    movu        [r0],               m0
591
+    movu        [r0 + r1],          m0
592
+    movu        [r0 + r1 * 2],      m0
593
+    movu        [r0 + r2],          m0
594
+    lea         r0,                 [r0 + r1 *4]
595
+    movu        [r0],               m0
596
+    movu        [r0 + r1],          m0
597
+    movu        [r0 + r1 * 2],      m0
598
+    movu        [r0 + r2],          m0
599
+    lea         r0,                 [r0 + r1 *4]
600
+    movu        [r0],               m0
601
+    movu        [r0 + r1],          m0
602
+    movu        [r0 + r1 * 2],      m0
603
+    movu        [r0 + r2],          m0
604
+    lea         r0,                 [r0 + r1 *4]
605
+    movu        [r0],               m0
606
+    movu        [r0 + r1],          m0
607
+    movu        [r0 + r1 * 2],      m0
608
+    movu        [r0 + r2],          m0
609
+    lea         r0,                 [r0 + r1 *4]
610
+    movu        [r0],               m0
611
+    movu        [r0 + r1],          m0
612
+    movu        [r0 + r1 * 2],      m0
613
+    movu        [r0 + r2],          m0
614
+    lea         r0,                 [r0 + r1 *4]
615
+    movu        [r0],               m0
616
+    movu        [r0 + r1],          m0
617
+    movu        [r0 + r1 * 2],      m0
618
+    movu        [r0 + r2],          m0
619
+    lea         r0,                 [r0 + r1 *4]
620
+    movu        [r0],               m0
621
+    movu        [r0 + r1],          m0
622
+    movu        [r0 + r1 * 2],      m0
623
+    movu        [r0 + r2],          m0
624
+    RET
625
+
626
+;; angle 16, modes 9 and 27
627
+; ang16_mode_9_27: shared 16x16 worker for angular modes 9 and 27.
+; Inputs (set by callers): r2 = reference array, r3 = ang_table_avx2 + 16*32,
+; r6d = mode flag (0 from the mode-9 wrappers, 1 from mode-27),
+; m15 = pd_16 rounding constant.
+; Each pmaddwd/paddd m15/psrld 5 triple computes (w0*ref + w1*ref' + 16) >> 5;
+; two weight rows are packed per zmm (ymm load + vinserti32x8), so rows are
+; produced two at a time.
+cglobal ang16_mode_9_27
628
+    test            r6d, r6d
629
+; NOTE(review): no visible consumer of these flags before the
+; TRANSPOSE_STORE_AVX2 invocations; presumably the macro branches on them
+; (transposed store for mode 9 vs. direct for mode 27) -- confirm against
+; the macro definition. SIMD ops below do not clobber EFLAGS.
630
+    vbroadcasti32x8 m0, [r2 + 2]                    ; [16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
631
+    vbroadcasti32x8 m1, [r2 + 4]                    ; [17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
632
+
633
+    punpcklwd       m3, m0, m1                       ; [13 12 12 11 11 10 10  9  5  4  4  3  3  2  2  1]
634
+    punpckhwd       m0, m1                           ; [17 16 16 15 15 14 14 13  9  8  8  7  7  6  6  5]
635
+
636
+    vbroadcasti32x8 m2, [r2 + 18]                    ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9]
637
+    vbroadcasti32x8 m4, [r2 + 20]                    ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
638
+    punpcklwd       m2, m4                           ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10  9]
639
+
640
+    movu            ym16, [r3 - 14 * 32]            ; [2]
641
+    vinserti32x8    m16,  [r3 - 12 * 32], 1         ; [4]
642
+    pmaddwd         m4, m3, m16
643
+    paddd           m4, m15
644
+    psrld           m4, 5
645
+    pmaddwd         m5, m0, m16
646
+    paddd           m5, m15
647
+    psrld           m5, 5
648
+    packusdw        m4, m5
649
+    vextracti32x8   ym5, m4, 1
650
+    movu            ym16, [r3 - 10 * 32]            ; [6]
651
+    vinserti32x8    m16,  [r3 - 8 * 32], 1          ; [8]
652
+    pmaddwd         m6, m3, m16
653
+    paddd           m6, m15
654
+    psrld           m6, 5
655
+    pmaddwd         m9, m0, m16
656
+    paddd           m9, m15
657
+    psrld           m9, 5
658
+    packusdw        m6, m9
659
+    vextracti32x8   ym7, m6, 1
660
+    movu            ym16, [r3 - 6 * 32]             ; [10]
661
+    vinserti32x8    m16,  [r3 - 4 * 32], 1          ; [12]
662
+    pmaddwd         m8, m3, m16
663
+    paddd           m8, m15
664
+    psrld           m8, 5
665
+    pmaddwd         m9, m0, m16
666
+    paddd           m9, m15
667
+    psrld           m9, 5
668
+    packusdw        m8, m9
669
+    vextracti32x8   ym9, m8, 1
670
+    movu            ym16, [r3 - 2 * 32]             ; [14]
671
+    vinserti32x8    m16,  [r3], 1                   ; [16]
672
+    pmaddwd         m10, m3, m16
673
+    paddd           m10, m15
674
+    psrld           m10, 5
675
+    pmaddwd         m1, m0, m16
676
+    paddd           m1, m15
677
+    psrld           m1, 5
678
+    packusdw        m10, m1
679
+    vextracti32x8   ym11, m10, 1
680
+; first 8 output rows done (fractions 2..16)
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 2, 1, 0
681
+
682
+    movu            ym16, [r3 + 2 * 32]             ; [18]
683
+    vinserti32x8    m16,  [r3 + 4 * 32], 1          ; [20]
684
+    pmaddwd         m4, m3, m16
685
+    paddd           m4, m15
686
+    psrld           m4, 5
687
+    pmaddwd         m5, m0, m16
688
+    paddd           m5, m15
689
+    psrld           m5, 5
690
+    packusdw        m4, m5
691
+    vextracti32x8   ym5, m4, 1
692
+    movu            ym16, [r3 + 6 * 32]             ; [22]
693
+    vinserti32x8    m16,  [r3 + 8 * 32], 1          ; [24]
694
+    pmaddwd         m6, m3, m16
695
+    paddd           m6, m15
696
+    psrld           m6, 5
697
+    pmaddwd         m8, m0, m16
698
+    paddd           m8, m15
699
+    psrld           m8, 5
700
+    packusdw        m6, m8
701
+    vextracti32x8   ym7, m6, 1
702
+    movu            ym16, [r3 + 10 * 32]            ; [26]
703
+    vinserti32x8    m16,  [r3 + 12 * 32], 1         ; [28]
704
+    pmaddwd         m8, m3, m16
705
+    paddd           m8, m15
706
+    psrld           m8, 5
707
+    pmaddwd         m9, m0, m16
708
+    paddd           m9, m15
709
+    psrld           m9, 5
710
+    packusdw        m8, m9
711
+    vextracti32x8   ym9, m8, 1
712
+; last fraction row [30] only needs a single weight row -> ymm width suffices
+    movu            ym16, [r3 + 14 * 32]            ; [30]
713
+    pmaddwd         ym3, ym16
714
+    paddd           ym3, ym15
715
+    psrld           ym3, 5
716
+    pmaddwd         ym0, ym16
717
+    paddd           ym0, ym15
718
+    psrld           ym0, 5
719
+    packusdw        ym3, ym0
720
+
721
+; row 16 is the unfiltered reference [r2 + 4] (fraction 0 of the next column)
+    movu            ym1, [r2 + 4]
722
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 3, 1, 0, 2, 16
723
+    ret
724
+
725
+; intra_pred_ang32_9: 32x32 angular mode 9, built from four 16x16 calls to
+; the shared worker ang16_mode_9_27 (r6d = 0 selects the mode-9 behaviour
+; in the worker's "test r6d, r6d").
+; r2 += 128: presumably skips to the second half of the packed reference
+; array -- verify against the caller's reference layout.
+; r3 -> centre of ang_table_avx2 (worker indexes it +/- 14*32).
+; "shl r1d, 1": stride converted to bytes (16-bit samples, presumed).
+; m15 = pd_16, the worker's rounding constant (paddd m15 / psrld 5).
+; r7 remembers r0 + 8*stride so the lower half can be addressed after r0
+; has been advanced; r2 is nudged +2/+30 to step the reference window.
+cglobal intra_pred_ang32_9, 3,8,17
726
+    add         r2,        128
727
+    xor         r6d,       r6d
728
+    lea         r3,        [ang_table_avx2 + 16 * 32]
729
+    shl         r1d,       1
730
+    lea         r4,        [r1 * 3]
731
+    lea         r7,        [r0 + 8 * r1]
732
+    vbroadcasti32x8  m15,  [pd_16]
733
+
734
+    call        ang16_mode_9_27
735
+    add         r2,        2
736
+    lea         r0,        [r0 + 32]
737
+    call        ang16_mode_9_27
738
+    add         r2,        30
739
+    lea         r0,        [r7 + 8 * r1]
740
+    call        ang16_mode_9_27
741
+    add         r2,        2
742
+    lea         r0,        [r0 + 32]
743
+    call        ang16_mode_9_27
744
+    RET
745
+
746
+; intra_pred_ang32_27: 32x32 angular mode 27 -- same worker as mode 9 but
+; with r6d = 1 (the worker tests r6d to pick its store path). Unlike the
+; mode-9 wrapper it does not offset r2 by 128, i.e. it reads the first half
+; of the reference array. r5 holds r0 + 32 (right 16-column half of the
+; destination); other registers as in intra_pred_ang32_9.
+cglobal intra_pred_ang32_27, 3,7,17
747
+    xor         r6d,       r6d
748
+    inc         r6d
749
+    lea         r3,        [ang_table_avx2 + 16 * 32]
750
+    shl         r1d,       1
751
+    lea         r4,        [r1 * 3]
752
+    lea         r5,        [r0 + 32]
753
+    vbroadcasti32x8  m15,  [pd_16]
754
+
755
+    call        ang16_mode_9_27
756
+    add         r2,        2
757
+    call        ang16_mode_9_27
758
+    add         r2,        30
759
+    mov         r0,        r5
760
+    call        ang16_mode_9_27
761
+    add         r2,        2
762
+    call        ang16_mode_9_27
763
+    RET
764
+;; angle 16, modes 11 and 25
765
+cglobal ang16_mode_11_25
766
+    test            r6d, r6d
767
+
768
+    vbroadcasti32x8  m0, [r2]                        ; [15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0]
769
+    vbroadcasti32x8  m1, [r2 + 2]                    ; [16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
770
+
771
+    punpcklwd       m3, m0, m1                      ; [12 11 11 10 10  9  9  8  4  3  3  2  2  1  1  0]
772
+    punpckhwd       m0, m1                          ; [16 15 15 14 14 13 13 12  8  7  7  6  6  5  5  4]
773
+
774
+    movu            ym16, [r3 + 14 * 32]          ; [30]
775
+    vinserti32x8    m16, [r3 + 12 * 32], 1           ; [28]
776
+    pmaddwd         m4, m3, m16
777
+    paddd           m4, m15
778
+    psrld           m4, 5
779
+    pmaddwd         m5, m0, m16
780
+    paddd           m5, m15
781
+    psrld           m5, 5
782
+    packusdw        m4, m5
783
+    vextracti32x8   ym5, m4, 1
784
+    movu            ym16, [r3 + 10 * 32]          ; [26]
785
+    vinserti32x8    m16, [r3 + 8 * 32], 1            ; [24]
786
+    pmaddwd         m6, m3, m16
787
+    paddd           m6, m15
788
+    psrld           m6, 5
789
+    pmaddwd         m9, m0, m16
790
+    paddd           m9, m15
791
+    psrld           m9, 5
792
+    packusdw        m6, m9
793
+    vextracti32x8   ym7, m6, 1
794
+    movu            ym16, [r3 + 6 * 32]           ; [22]
795
+    vinserti32x8    m16, [r3 + 4 * 32], 1            ; [20]
796
+    pmaddwd         m8, m3, m16
797
+    paddd           m8, m15
798
+    psrld           m8, 5
799
+    pmaddwd         m9, m0, m16
800
+    paddd           m9, m15
801
+    psrld           m9, 5
802
+    packusdw        m8, m9
803
+    vextracti32x8   ym9, m8, 1
804
+    movu            ym16, [r3 + 2 * 32]           ; [18]
805
+    vinserti32x8    m16, [r3], 1                     ; [16]
806
+    pmaddwd         m10, m3, m16
807
+    paddd           m10, m15
808
+    psrld           m10, 5
809
+    pmaddwd         m1, m0, m16
810
+    paddd           m1, m15
811
+    psrld           m1, 5
812
+    packusdw        m10, m1
813
+    vextracti32x8   ym11, m10, 1
814
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 2, 1, 0
815
+
816
+    movu            ym16, [r3 - 2 * 32]             ; [14]
817
+    vinserti32x8    m16, [r3 - 4 * 32], 1              ; [12]
818
+    pmaddwd         m4, m3, m16
819
+    paddd           m4, m15
820
+    psrld           m4, 5
821
+    pmaddwd         m5, m0, m16
822
+    paddd           m5, m15
823
+    psrld           m5, 5
824
+    packusdw        m4, m5
825
+    vextracti32x8   ym5, m4, 1
826
+    movu            ym16, [r3 - 6 * 32]             ; [10]
827
+    vinserti32x8    m16, [r3 - 8 * 32], 1              ; [8]
828
+    pmaddwd         m6, m3, m16
829
+    paddd           m6, m15
830
+    psrld           m6, 5
831
+    pmaddwd         m8, m0, m16
832
+    paddd           m8, m15
833
+    psrld           m8, 5
834
+    packusdw        m6, m8
835
+    vextracti32x8   ym7, m6, 1
836
+    movu            ym16, [r3 - 10 * 32]             ; [6]
837
+    vinserti32x8    m16, [r3 - 12 * 32], 1              ; [4]
838
+    pmaddwd         m8, m3, m16
839
+    paddd           m8, m15
840
+    psrld           m8, 5
841
+    pmaddwd         m9, m0, m16
842
+    paddd           m9, m15
843
+    psrld           m9, 5
844
+    packusdw        m8, m9
845
+    vextracti32x8   ym9, m8, 1
846
+    pmaddwd         ym3, [r3 - 14 * 32]              ; [2]
847
+    paddd           ym3, ym15
848
+    psrld           ym3, 5
849
+    pmaddwd         ym0, [r3 - 14 * 32]
850
+    paddd           ym0, ym15
851
+    psrld           ym0, 5
852
+    packusdw        ym3, ym0
853
+
854
+    movu            ym1, [r2]
855
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 3, 1, 0, 2, 16
856
+    ret
857
+
858
+; intra_pred_ang32_11: 32x32 angular mode 11 via four ang16_mode_11_25 calls
+; (r6d = 0). Stack frame "0-8": 8 bytes of scratch to save two reference
+; words that are temporarily overwritten:
+;   word [r2+128] saved to [rsp],   replaced by word [r2]      ; [0]
+;   word [r2+126] saved to [rsp+4], replaced by word [r2+32]   ; [16]
+; After the calls r2 has moved by net +158 (+128 -2 +34 -2), so the
+; restores [r2-30] and [r2-32] write back to the original [r2+128] and
+; [r2+126] -- the substitution is fully undone on exit.
+cglobal intra_pred_ang32_11, 3,8,17, 0-8
859
+    movzx       r5d,        word [r2 + 128]  ; [0]
860
+    movzx       r6d,        word [r2]
861
+    mov         [rsp],      r5w
862
+    mov         [r2 + 128], r6w
863
+
864
+    movzx       r5d,        word [r2 + 126]  ; [16]
865
+    movzx       r6d,        word [r2 + 32]
866
+    mov         [rsp + 4],  r5w
867
+    mov         [r2 + 126], r6w
868
+    vbroadcasti32x8  m15,  [pd_16]
869
+    add         r2,         128
870
+    xor         r6d,        r6d
871
+    lea         r3,         [ang_table_avx2 + 16 * 32]
872
+    shl         r1d,        1
873
+    lea         r4,         [r1 * 3]
874
+    lea         r7,         [r0 + 8 * r1]
875
+
876
+    call        ang16_mode_11_25
877
+    sub         r2,         2
878
+    lea         r0,         [r0 + 32]
879
+    call        ang16_mode_11_25
880
+    add         r2,         34
881
+    lea         r0,         [r7 + 8 * r1]
882
+    call        ang16_mode_11_25
883
+    sub         r2,         2
884
+    lea         r0,         [r0 + 32]
885
+    call        ang16_mode_11_25
886
+    mov         r6d,        [rsp]
887
+    mov         [r2 - 30], r6w
888
+    mov         r6d,       [rsp + 4]
889
+    mov         [r2 - 32], r6w
890
+    RET
891
+
892
+; intra_pred_ang32_25: 32x32 angular mode 25, sharing ang16_mode_11_25 with
+; r6d = 1. Saves word [r2-2] to the 4-byte stack slot and substitutes
+; word [r2+160] ("[16]"); after the calls r2 has moved net +30
+; (-2 +34 -2), so the restore at [r2-32] writes back to the original
+; [r2-2]. r5 holds r0 + 32 for the right 16-column half.
+cglobal intra_pred_ang32_25, 3,7,17, 0-4
893
+    xor         r6d,        r6d
894
+    inc         r6d
895
+    lea         r3,         [ang_table_avx2 + 16 * 32]
896
+    shl         r1d,        1
897
+    vbroadcasti32x8  m15,  [pd_16]
898
+    movzx       r4d,        word [r2 - 2]
899
+    movzx       r5d,        word [r2 + 160]     ; [16]
900
+    mov         [rsp],      r4w
901
+    mov         [r2 - 2],   r5w
902
+
903
+    lea         r4,         [r1 * 3]
904
+    lea         r5,         [r0 + 32]
905
+    call        ang16_mode_11_25
906
+    sub         r2,         2
907
+    call        ang16_mode_11_25
908
+    add         r2,         34
909
+    mov         r0,         r5
910
+    call        ang16_mode_11_25
911
+    sub         r2,         2
912
+    call        ang16_mode_11_25
913
+    mov         r5d,        [rsp]
914
+    mov         [r2 - 32],  r5w
915
+    RET
916
+
917
+; intra_pred_ang16_9: 16x16 angular mode 9 -- a single call into the shared
+; worker with r6d = 0. r2 += 64 presumably selects the second half of the
+; 16x16 reference array (cf. the 32x32 wrapper's +128); verify with caller.
+cglobal intra_pred_ang16_9, 3,7,17
918
+    add         r2,        64
919
+    xor         r6d,       r6d
920
+    lea         r3,        [ang_table_avx2 + 16 * 32]
921
+    shl         r1d,       1
922
+    lea         r4,        [r1 * 3]
923
+    vbroadcasti32x8  m15,  [pd_16]
924
+    call        ang16_mode_9_27
925
+    RET
926
+
927
+; intra_pred_ang16_27: 16x16 angular mode 27 -- single worker call with
+; r6d = 1 and no r2 offset (first half of the reference array).
+cglobal intra_pred_ang16_27, 3,7,17
928
+    xor         r6d,       r6d
929
+    inc         r6d
930
+    lea         r3,        [ang_table_avx2 + 16 * 32]
931
+    shl         r1d,       1
932
+    lea         r4,        [r1 * 3]
933
+    vbroadcasti32x8  m15,  [pd_16]
934
+    call        ang16_mode_9_27
935
+    RET
936
+
937
+; intra_pred_ang16_11: 16x16 angular mode 11 (r6d = 0). Saves word [r2+64]
+; on the stack and substitutes word [r2]; after "add r2, 64" the final
+; "mov [r2], r6w" restores the original word at [r2+64] -- net effect on
+; the reference array is nil.
+cglobal intra_pred_ang16_11, 3,7,17, 0-4
938
+    movzx       r5d,       word [r2 + 64]
939
+    movzx       r6d,       word [r2]
940
+    mov         [rsp],     r5w
941
+    mov         [r2 + 64], r6w
942
+    vbroadcasti32x8  m15,  [pd_16]
943
+    add         r2,        64
944
+    xor         r6d,       r6d
945
+    lea         r3,        [ang_table_avx2 + 16 * 32]
946
+    shl         r1d,       1
947
+    lea         r4,        [r1 * 3]
948
+    call        ang16_mode_11_25
949
+    mov         r6d,       [rsp]
950
+    mov         [r2],      r6w
951
+    RET
952
+
953
+; intra_pred_ang16_25: 16x16 angular mode 25 -- single worker call with
+; r6d = 1. Note: unlike the 32x32 variant, no reference-sample substitution
+; is needed here.
+cglobal intra_pred_ang16_25, 3,7,17
954
+    xor         r6d,       r6d
955
+    inc         r6d
956
+    vbroadcasti32x8  m15,  [pd_16]
957
+    lea         r3,        [ang_table_avx2 + 16 * 32]
958
+    shl         r1d,       1
959
+    lea         r4,        [r1 * 3]
960
+    call        ang16_mode_11_25
961
+    RET
962
+cglobal ang16_mode_5_31
963
+    test            r6d, r6d
964
+
965
+    vbroadcasti32x8            m0, [r2 + 2]                    ; [16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
966
+    vbroadcasti32x8            m1, [r2 + 4]                    ; [17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
967
+
968
+    punpcklwd       m3, m0, m1                      ; [13 12 12 11 11 10 10  9  5  4  4  3  3  2  2  1]
969
+    punpckhwd       m0, m1                          ; [17 16 16 15 15 14 14 13  9  8  8  7  7  6  6  5]
970
+
971
+    vbroadcasti32x8            m1, [r2 + 18]                   ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9]
972
+    vbroadcasti32x8            m4, [r2 + 20]                   ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
973
+    punpcklwd       m2, m1, m4                      ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10  9]
974
+    punpckhwd       m1, m4                          ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
975
+
976
+    pmaddwd         m4, m3, [r3 + 1 * 32]           ; [17]
977
+    paddd           m4, m15
978
+    psrld           m4, 5
979
+    pmaddwd         m5, m0, [r3 + 1 * 32]
980
+    paddd           m5, m15
981
+    psrld           m5, 5
982
+    packusdw        m4, m5
983
+
984
+    movu            ym16, [r3 - 14 * 32]          ; [2]
985
+    vinserti32x8    m16, [r3 + 3 * 32] ,1         ; [19]
986
+    palignr         m6, m0, m3, 4
987
+    pmaddwd         m5, m6, m16
988
+    paddd           m5, m15
989
+    psrld           m5, 5
990
+    palignr         m7, m2, m0, 4
991
+    pmaddwd         m8, m7, m16
992
+    paddd           m8, m15
993
+    psrld           m8, 5
994
+    packusdw        m5, m8
995
+    vextracti32x8   ym6, m5, 1
996
+
997
+    palignr         m8, m0, m3, 8
998
+    palignr         m9, m2, m0, 8
999
+    movu            ym16, [r3 - 12 * 32]          ; [4]
1000
+    vinserti32x8    m16, [r3 + 5 * 32],1          ; [21]
1001
+    pmaddwd         m7, m8, m16
1002
+    paddd           m7, m15
1003
+    psrld           m7, 5
1004
+    pmaddwd         m10, m9,m16
1005
+    paddd           m10, m15
1006
+    psrld           m10, 5
1007
+    packusdw        m7, m10
1008
+    vextracti32x8   ym8, m7, 1
1009
+
1010
+    palignr         m10, m0, m3, 12
1011
+    palignr         m11, m2, m0, 12
1012
+    movu            ym16,[r3 - 10 * 32]         ; [6]
1013
+    vinserti32x8    m16,  [r3 + 7 * 32] ,1      ; [23]
1014
+    pmaddwd         m9, m10, m16
1015
+    paddd           m9, m15
1016
+    psrld           m9, 5
1017
+    pmaddwd         m3, m11, m16
1018
+    paddd           m3, m15
1019
+    psrld           m3, 5
1020
+    packusdw        m9, m3
1021
+    vextracti32x8   ym10, m9, 1
1022
+
1023
+    pmaddwd         m11, m0, [r3 - 8 * 32]          ; [8]
1024
+    paddd           m11, m15
1025
+    psrld           m11, 5
1026
+    pmaddwd         m3, m2, [r3 - 8 * 32]
1027
+    paddd           m3, m15
1028
+    psrld           m3, 5
1029
+    packusdw        m11, m3
1030
+
1031
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0
1032
+
1033
+    pmaddwd         m4, m0, [r3 + 9 * 32]           ; [25]
1034
+    paddd           m4, m15
1035
+    psrld           m4, 5
1036
+    pmaddwd         m5, m2, [r3 + 9  * 32]
1037
+    paddd           m5, m15
1038
+    psrld           m5, 5
1039
+    packusdw        m4, m5
1040
+
1041
+    palignr         m6, m2, m0, 4
1042
+    movu            ym16, [r3 - 6 * 32]           ; [10]
1043
+    vinserti32x8    m16, [r3 + 11 * 32] ,1        ; [27]
1044
+    pmaddwd         m5, m6,m16
1045
+    paddd           m5, m15
1046
+    psrld           m5, 5
1047
+    palignr         m7, m1, m2, 4
1048
+    pmaddwd         m3, m7,m16
1049
+    paddd           m3, m15
1050
+    psrld           m3, 5
1051
+    packusdw        m5, m3
1052
+    vextracti32x8   ym6, m5, 1
1053
+
1054
+    palignr         m8, m2, m0, 8
1055
+    palignr         m9, m1, m2, 8
1056
+    movu            ym16, [r3 - 4 * 32]           ; [12]
1057
+    vinserti32x8    m16, [r3 + 13 * 32]  ,1       ; [29]
1058
+    pmaddwd         m7, m8, m16
1059
+    paddd           m7, m15
1060
+    psrld           m7, 5
1061
+    pmaddwd         m3, m9, m16
1062
+    paddd           m3, m15
1063
+    psrld           m3, 5
1064
+    packusdw        m7, m3
1065
+    vextracti32x8   ym8, m7, 1
1066
+
1067
+
1068
+    palignr         m10, m2, m0, 12
1069
+    palignr         m11, m1, m2, 12
1070
+    movu            ym16, [r3 - 2 * 32]          ; [14]
1071
+    vinserti32x8    m16,  [r3 + 15 * 32],1       ; [31]
1072
+    pmaddwd         m9, m10, m16
1073
+    paddd           m9, m15
1074
+    psrld           m9, 5
1075
+    pmaddwd         m3, m11, m16
1076
+    paddd           m3, m15
1077
+    psrld           m3, 5
1078
+    packusdw        m9, m3
1079
+    vextracti32x8   ym10, m9, 1
1080
+
1081
+    pmaddwd         m2, [r3]                        ; [16]
1082
+    paddd           m2, m15
1083
+    psrld           m2, 5
1084
+    pmaddwd         m1, [r3]
1085
+    paddd           m1, m15
1086
+    psrld           m1, 5
1087
+    packusdw        m2, m1
1088
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 2, 0, 1, 16
1089
+    ret
1090
+;; angle 32, modes 5 and 31
1091
+cglobal ang32_mode_5_31
1092
+    test            r6d, r6d
1093
+
1094
+    vbroadcasti32x8            m0, [r2 + 2]                    ; [16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
1095
+    vbroadcasti32x8            m1, [r2 + 4]                    ; [17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
1096
+
1097
+    punpcklwd       m3, m0, m1                      ; [13 12 12 11 11 10 10  9  5  4  4  3  3  2  2  1]
1098
+    punpckhwd       m0, m1                          ; [17 16 16 15 15 14 14 13  9  8  8  7  7  6  6  5]
1099
+
1100
+    vbroadcasti32x8            m1, [r2 + 18]                   ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9]
1101
+    vbroadcasti32x8            m4, [r2 + 20]                   ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
1102
+    punpcklwd       m2, m1, m4                      ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10  9]
1103
+    punpckhwd       m1, m4                          ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
1104
+
1105
+   movu            ym16, [r3 - 15 * 32]          ; [1]
1106
+    vinserti32x8    m16, [r3 + 2 * 32],1         ; [18]
1107
+    pmaddwd         m4, m3, m16
1108
+    paddd           m4, m15
1109
+    psrld           m4, 5
1110
+    pmaddwd         m5, m0, m16
1111
+    paddd           m5, m15
1112
+    psrld           m5, 5
1113
+    packusdw        m4, m5
1114
+    vextracti32x8   ym5, m4, 1
1115
+
1116
+
1117
+    palignr         m7, m0, m3, 4
1118
+    movu            ym16, [r3 - 13 * 32]          ; [3]
1119
+    vinserti32x8    m16, [r3 + 4 * 32] ,1         ; [20]
1120
+    pmaddwd         m6, m7, m16
1121
+    paddd           m6, m15
1122
+    psrld           m6, 5
1123
+    palignr         m8, m2, m0, 4
1124
+    pmaddwd         m9, m8,m16
1125
+    paddd           m9, m15
1126
+    psrld           m9, 5
1127
+    packusdw        m6, m9
1128
+    vextracti32x8   ym7, m6, 1
1129
+
1130
+
1131
+    palignr         m9, m0, m3, 8
1132
+    movu            ym16, [r3 - 11 * 32]          ; [5]
1133
+    vinserti32x8    m16, [r3 + 6 * 32] ,1         ; [22]
1134
+    pmaddwd         m8, m9,m16
1135
+    paddd           m8, m15
1136
+    psrld           m8, 5
1137
+    palignr         m10, m2, m0, 8
1138
+    pmaddwd         m11, m10,m16
1139
+    paddd           m11, m15
1140
+    psrld           m11, 5
1141
+    packusdw        m8, m11
1142
+    vextracti32x8   ym9, m8, 1
1143
+
1144
+
1145
+    palignr         m11, m0, m3, 12
1146
+    movu            ym16, [r3 - 9 * 32]         ; [7]
1147
+    vinserti32x8    m16, [r3 + 8 * 32] ,1       ; [24]
1148
+    pmaddwd         m10, m11,m16
1149
+    paddd           m10, m15
1150
+    psrld           m10, 5
1151
+    palignr         m12, m2, m0, 12
1152
+    pmaddwd         m3, m12, m16
1153
+    paddd           m3, m15
1154
+    psrld           m3, 5
1155
+    packusdw        m10, m3
1156
+    vextracti32x8   ym11, m10, 1
1157
+
1158
+
1159
+
1160
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0
1161
+
1162
+    movu            ym16, [r3 - 7 * 32]           ; [9]
1163
+    vinserti32x8    m16, [r3 + 10 * 32]  ,1       ; [26]
1164
+    pmaddwd         m4, m0, m16
1165
+    paddd           m4, m15
1166
+    psrld           m4, 5
1167
+    pmaddwd         m5, m2, m16
1168
+    paddd           m5, m15
1169
+    psrld           m5, 5
1170
+    packusdw        m4, m5
1171
+    vextracti32x8   ym5, m4, 1
1172
+
1173
+
1174
+    palignr         m7, m2, m0, 4
1175
+    movu            ym16, [r3 - 5 * 32]           ; [11]
1176
+    vinserti32x8    m16, [r3 + 12 * 32],1         ; [28]
1177
+    pmaddwd         m6, m7, m16
1178
+    paddd           m6, m15
1179
+    psrld           m6, 5
1180
+    palignr         m8, m1, m2, 4
1181
+    pmaddwd         m9, m8,m16
1182
+    paddd           m9, m15
1183
+    psrld           m9, 5
1184
+    packusdw        m6, m9
1185
+    vextracti32x8   ym7, m6, 1
1186
+
1187
+    palignr         m9, m2, m0, 8
1188
+    movu            ym16, [r3 - 3 * 32]           ; [13]
1189
+    vinserti32x8    m16, [r3 + 14 * 32]  ,1       ; [30]
1190
+    pmaddwd         m8, m9, m16
1191
+    paddd           m8, m15
1192
+    psrld           m8, 5
1193
+    palignr         m3, m1, m2, 8
1194
+    pmaddwd         m10, m3, m16
1195
+    paddd           m10, m15
1196
+    psrld           m10, 5
1197
+    packusdw        m8, m10
1198
+    vextracti32x8    ym9, m8, 1
1199
+
1200
+
1201
+
1202
+    palignr         m10, m2, m0, 12
1203
+    pmaddwd         m10, [r3 - 1 * 32]              ; [15]
1204
+    paddd           m10, m15
1205
+    psrld           m10, 5
1206
+    palignr         m11, m1, m2, 12
1207
+    pmaddwd         m11, [r3 - 1 * 32]
1208
+    paddd           m11, m15
1209
+    psrld           m11, 5
1210
+    packusdw        m10, m11
1211
+
1212
+    pmaddwd         m2, [r3 - 16 * 32]              ; [0]
1213
+    paddd           m2, m15
1214
+    psrld           m2, 5
1215
+    pmaddwd         m1, [r3 - 16 * 32]
1216
+    paddd           m1, m15
1217
+    psrld           m1, 5
1218
+    packusdw        m2, m1
1219
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 2, 0, 1, 16
1220
+    ret
1221
+; intra_pred_ang32_5: 32x32 angular mode 5 -- top half via ang16_mode_5_31,
+; bottom half via ang32_mode_5_31, twice (left/right 16 columns), with
+; r6d = 0. r2 += 128 selects the second half of the reference array
+; (presumed; verify with caller); r7 remembers r0 + 8*stride.
+cglobal intra_pred_ang32_5, 3,8,17
1222
+    add         r2,        128
1223
+    xor         r6d,       r6d
1224
+    lea         r3,        [ang_table_avx2 + 16 * 32]
1225
+    add         r1d,       r1d
1226
+    lea         r4,        [r1 * 3]
1227
+    lea         r7,        [r0 + 8 * r1]
1228
+    vbroadcasti32x8  m15,  [pd_16]
1229
+    call        ang16_mode_5_31
1230
+
1231
+    add         r2,        18
1232
+    lea         r0,        [r0 + 32]
1233
+
1234
+    call        ang32_mode_5_31
1235
+
1236
+    add         r2,        14
1237
+    lea         r0,        [r7 + 8 * r1]
1238
+
1239
+    call        ang16_mode_5_31
1240
+; NOTE(review): this second m15 reload looks redundant -- the 5_31 workers
+; only read m15 (paddd), and intra_pred_ang32_31 calls the same workers
+; without reloading. Harmless, but confirm and consider dropping.
+    vbroadcasti32x8  m15,  [pd_16]
1241
+    add         r2,        18
1242
+    lea         r0,        [r0 + 32]
1243
+    call        ang32_mode_5_31
1244
+    RET
1245
+; intra_pred_ang32_31: 32x32 angular mode 31 -- same worker pair as mode 5
+; but with r6d = 1 and no r2 offset (first half of the reference array).
+; r5 holds r0 + 32 for the right 16-column half; r2 steps +18/+14 to move
+; the reference window between quadrants.
+cglobal intra_pred_ang32_31, 3,7,17
1246
+    xor         r6d,       r6d
1247
+    inc         r6d
1248
+    lea         r3,        [ang_table_avx2 + 16 * 32]
1249
+    add         r1d,       r1d
1250
+    lea         r4,        [r1 * 3]
1251
+    lea         r5,        [r0 + 32]
1252
+    vbroadcasti32x8  m15,  [pd_16]
1253
+    call        ang16_mode_5_31
1254
+
1255
+    add         r2,        18
1256
+
1257
+    call        ang32_mode_5_31
1258
+
1259
+    add         r2,        14
1260
+    mov         r0,        r5
1261
+
1262
+    call        ang16_mode_5_31
1263
+
1264
+    add         r2,        18
1265
+    call        ang32_mode_5_31
1266
+    RET
1267
+; intra_pred_ang16_5: 16x16 angular mode 5 -- single call into
+; ang16_mode_5_31 with r6d = 0; r2 += 64 selects the second half of the
+; reference array (presumed; verify with caller).
+cglobal intra_pred_ang16_5, 3,7,17
1268
+    add         r2,        64
1269
+    xor         r6d,       r6d
1270
+    vbroadcasti32x8  m15,  [pd_16]
1271
+    lea         r3,        [ang_table_avx2 + 16 * 32]
1272
+    add         r1d,       r1d
1273
+    lea         r4,        [r1 * 3]
1274
+    call        ang16_mode_5_31
1275
+    RET
1276
+; intra_pred_ang16_31: 16x16 angular mode 31 -- single ang16_mode_5_31 call
+; with r6d = 1 and no r2 offset.
+cglobal intra_pred_ang16_31, 3,7,17
1277
+    xor         r6d,       r6d
1278
+    inc         r6d
1279
+    vbroadcasti32x8  m15,  [pd_16]
1280
+    lea         r3,        [ang_table_avx2 + 16 * 32]
1281
+    add         r1d,       r1d
1282
+    lea         r4,        [r1 * 3]
1283
+    call        ang16_mode_5_31
1284
+    RET
1285
+;; angle 16, modes 4 and 32
1286
+cglobal ang16_mode_4_32
1287
+    test            r6d, r6d
1288
+
1289
+    vbroadcasti32x8            m0, [r2 + 2]                    ; [16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
1290
+    vbroadcasti32x8            m1, [r2 + 4]                    ; [17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
1291
+
1292
+    punpcklwd       m3, m0, m1                      ; [13 12 12 11 11 10 10  9  5  4  4  3  3  2  2  1]
1293
+    punpckhwd       m0, m1                          ; [17 16 16 15 15 14 14 13  9  8  8  7  7  6  6  5]
1294
+
1295
+    vbroadcasti32x8            m1, [r2 + 18]                   ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9]
1296
+    vbroadcasti32x8            m4, [r2 + 20]                   ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
1297
+    punpcklwd       m2, m1, m4                      ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10  9]
1298
+    punpckhwd       m1, m4                          ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
1299
+
1300
+    pmaddwd         m4, m3, [r3 + 3 * 32]           ; [21]
1301
+    paddd           m4, m15
1302
+    psrld           m4, 5
1303
+    pmaddwd         m5, m0, [r3 + 3 * 32]
1304
+    paddd           m5, m15
1305
+    psrld           m5, 5
1306
+    packusdw        m4, m5
1307
+
1308
+    palignr         m6, m0, m3, 4                   ; [14 13 13 12 12 11 11 10  6  5  5  4  4  3  3  2]
1309
+    palignr         m7, m2, m0, 4                   ; [18 17 17 16 16 15 15 14 10  9  9  8  8  7  7  6]
1310
+    movu            ym16,[r3 - 8 * 32]           ; [10]
1311
+    vinserti32x8    m16, [r3 + 13 * 32] ,1             ; [31]
1312
+    pmaddwd         m5, m6, m16
1313
+    paddd           m5, m15
1314
+    psrld           m5, 5
1315
+    pmaddwd         m8, m7,m16
1316
+    paddd           m8, m15
1317
+    psrld           m8, 5
1318
+    packusdw        m5, m8
1319
+    vextracti32x8   ym6, m5, 1
1320
+
1321
+
1322
+    palignr         m7, m0, m3, 8                   ; [15 14 14 13 13 12 12 11  7  6  6  5  5  4  4  3]
1323
+    pmaddwd         m7, [r3 + 2 * 32]               ; [20]
1324
+    paddd           m7, m15
1325
+    psrld           m7, 5
1326
+    palignr         m8, m2, m0, 8                   ; [19 18 18 17 17 16 16 15 11 10 10  9  9  8  8  7]
1327
+    pmaddwd         m8, [r3 + 2 * 32]
1328
+    paddd           m8, m15
1329
+    psrld           m8, 5
1330
+    packusdw        m7, m8
1331
+
1332
+    palignr         m9, m0, m3, 12
1333
+    palignr         m3, m2, m0, 12
1334
+    movu            ym16,[r3 - 9 * 32]           ; [9]
1335
+    vinserti32x8    m16, [r3 + 12 * 32] ,1      ; [30]
1336
+    pmaddwd         m8, m9, m16
1337
+    paddd           m8, m15
1338
+    psrld           m8, 5
1339
+    pmaddwd         m10, m3,m16
1340
+    paddd           m10,m15
1341
+    psrld           m10, 5
1342
+    packusdw        m8, m10
1343
+    vextracti32x8   ym9, m8, 1
1344
+
1345
+
1346
+    pmaddwd         m10, m0, [r3 + 1 * 32]          ; [19]
1347
+    paddd           m10,m15
1348
+    psrld           m10, 5
1349
+    pmaddwd         m3, m2, [r3 + 1 * 32]
1350
+    paddd           m3, m15
1351
+    psrld           m3, 5
1352
+    packusdw        m10, m3
1353
+
1354
+    palignr         m11, m2, m0, 4
1355
+    pmaddwd         m11, [r3 - 10 * 32]             ; [8]
1356
+    paddd           m11, m15
1357
+    psrld           m11, 5
1358
+    palignr         m3, m1, m2, 4
1359
+    pmaddwd         m3, [r3 - 10 * 32]
1360
+    paddd           m3, m15
1361
+    psrld           m3, 5
1362
+    packusdw        m11, m3
1363
+
1364
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0
1365
+
1366
+    palignr         m4, m2, m0, 4
1367
+    pmaddwd         m4, [r3 + 11 * 32]              ; [29]
1368
+    paddd           m4, m15
1369
+    psrld           m4, 5
1370
+    palignr         m5, m1, m2, 4
1371
+    pmaddwd         m5, [r3 + 11  * 32]
1372
+    paddd           m5, m15
1373
+    psrld           m5, 5
1374
+    packusdw        m4, m5
1375
+
1376
+    palignr         m5, m2, m0, 8
1377
+    pmaddwd         m5, [r3]                        ; [18]
1378
+    paddd           m5, m15
1379
+    psrld           m5, 5
1380
+    palignr         m6, m1, m2, 8
1381
+    pmaddwd         m6, [r3]
1382
+    paddd           m6, m15
1383
+    psrld           m6, 5
1384
+    packusdw        m5, m6
1385
 
1386
+    palignr         m7, m2, m0, 12
1387
+    palignr         m8, m1, m2, 12
1388
+    movu            ym16,[r3 - 11 * 32]          ; [7]
1389
+    vinserti32x8    m16, [r3 + 10 * 32],1        ; [28]
1390
+    pmaddwd         m6, m7, m16
1391
+    paddd           m6, m15
1392
+    psrld           m6, 5
1393
+    palignr         m8, m1, m2, 12
1394
+    pmaddwd         m3, m8, m16
1395
+    paddd           m3,m15
1396
+    psrld           m3, 5
1397
+    packusdw        m6, m3
1398
+    vextracti32x8   ym7, m6, 1
1399
+
1400
+    movu            m0, [r2 + 34]                   ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
1401
+    pmaddwd         m8, m2, [r3 - 1 * 32]           ; [17]
1402
+    paddd           m8, m15
1403
+    psrld           m8, 5
1404
+    pmaddwd         m9, m1, [r3 - 1 * 32]
1405
+    paddd           m9, m15
1406
+    psrld           m9, 5
1407
+    packusdw        m8, m9
1408
+
1409
+    palignr         m3, m0, m0, 2                   ; [ x 32 31 30 29 28 27 26  x 24 23 22 21 20 19 18]
1410
+    punpcklwd       m0, m3                          ; [29 29 28 28 27 27 26 22 21 20 20 19 19 18 18 17]
1411
+
1412
+    palignr         m10, m1, m2, 4
1413
+    pmaddwd         m9, m10, [r3 - 12 * 32]         ; [6]
1414
+    paddd           m9, m15
1415
+    psrld           m9, 5
1416
+    palignr         m11, m0, m1, 4
1417
+    pmaddwd         m3, m11, [r3 - 12 * 32]
1418
+    paddd           m3, m15
1419
+    psrld           m3, 5
1420
+    packusdw        m9, m3
1421
+
1422
+    pmaddwd         m10, [r3 + 9 * 32]              ; [27]
1423
+    paddd           m10,m15
1424
+    psrld           m10, 5
1425
+    pmaddwd         m11, [r3 + 9 * 32]
1426
+    paddd           m11, m15
1427
+    psrld           m11, 5
1428
+    packusdw        m10, m11
1429
+
1430
+    palignr         m3, m1, m2, 8
1431
+    pmaddwd         m3, [r3 - 2 * 32]               ; [16]
1432
+    paddd           m3, m15
1433
+    psrld           m3, 5
1434
+    palignr         m0, m1, 8
1435
+    pmaddwd         m0, [r3 - 2 * 32]
1436
+    paddd           m0,m15
1437
+    psrld           m0, 5
1438
+    packusdw        m3, m0
1439
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 3, 0, 1, 16
1440
+    ret
1441
+;; angle 32, modes 4 and 32
1442
+cglobal ang32_mode_4_32
1443
+    test            r6d, r6d
1444
+
1445
+    vbroadcasti32x8            m0, [r2 + 2]                    ; [16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
1446
+    vbroadcasti32x8            m1, [r2 + 4]                    ; [17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
1447
+
1448
+    punpcklwd       m3, m0, m1                      ; [13 12 12 11 11 10 10  9  5  4  4  3  3  2  2  1]
1449
+    punpckhwd       m0, m1                          ; [17 16 16 15 15 14 14 13  9  8  8  7  7  6  6  5]
1450
+
1451
+   vbroadcasti32x8          m1, [r2 + 18]                   ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9]
1452
+   vbroadcasti32x8           m4, [r2 + 20]                   ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
1453
+    punpcklwd       m2, m1, m4                      ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10  9]
1454
+    punpckhwd       m1, m4                          ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
1455
+
1456
+    movu            ym16, [r3 - 13 * 32]          ; [5]
1457
+    vinserti32x8    m16, [r3 + 8 * 32],1          ; [26]
1458
+    pmaddwd         m4, m3, m16
1459
+    paddd           m4, m15
1460
+    psrld           m4, 5
1461
+    pmaddwd         m5, m0,m16
1462
+    paddd           m5, m15
1463
+    psrld           m5, 5
1464
+    packusdw        m4, m5
1465
+   vextracti32x8   ym5, m4, 1
1466
+
1467
+    palignr         m6, m0, m3, 4                   ; [14 13 13 12 12 11 11 10  6  5  5  4  4  3  3  2]
1468
+    pmaddwd         m6, [r3 - 3 * 32]               ; [15]
1469
+    paddd           m6, m15
1470
+    psrld           m6, 5
1471
+    palignr         m7, m2, m0, 4                   ; [18 17 17 16 16 15 15 14 10  9  9  8  8  7  7  6]
1472
+    pmaddwd         m7, [r3 - 3 * 32]
1473
+    paddd           m7, m15
1474
+    psrld           m7, 5
1475
+    packusdw        m6, m7
1476
+
1477
+    palignr         m8, m0, m3, 8                   ; [15 14 14 13 13 12 12 11  7  6  6  5  5  4  4  3]
1478
+    palignr         m9, m2, m0, 8                   ; [19 18 18 17 17 16 16 15 11 10 10  9  9  8  8  7]
1479
+    movu            ym16,  [r3 - 14 * 32]              ; [4]
1480
+    vinserti32x8    m16, [r3 + 7 * 32] ,1               ; [25]
1481
+    pmaddwd         m7, m8, m16
1482
+    paddd           m7, m15
1483
+    psrld           m7, 5
1484
+    pmaddwd         m10, m9, m16
1485
+    paddd           m10, m15
1486
+    psrld           m10, 5
1487
+    packusdw        m7, m10
1488
+    vextracti32x8    ym8, m7, 1
1489
+
1490
+    palignr         m9, m0, m3, 12
1491
+    pmaddwd         m9, [r3 - 4 * 32]               ; [14]
1492
+    paddd           m9, m15
1493
+    psrld           m9, 5
1494
+    palignr         m3, m2, m0, 12
1495
+    pmaddwd         m3, [r3 - 4 * 32]
1496
+    paddd           m3,m15
1497
+    psrld           m3, 5
1498
+    packusdw        m9, m3
1499
+
1500
+    movu            ym16,   [r3 - 15 * 32]         ; [3]
1501
+    vinserti32x8    m16, [r3 + 6 * 32]  ,1        ; [24]
1502
+    pmaddwd         m10, m0, m16
1503
+    paddd           m10, m15
1504
+    psrld           m10, 5
1505
+    pmaddwd         m3, m2, m16
1506
+    paddd           m3,m15
1507
+    psrld           m3, 5
1508
+    packusdw        m10, m3
1509
+    vextracti32x8    ym11, m10, 1
1510
+
1511
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0
1512
+
1513
+    palignr         m4, m2, m0, 4
1514
+    pmaddwd         m4, [r3 - 5* 32]                ; [13]
1515
+    paddd           m4, m15
1516
+    psrld           m4, 5
1517
+    palignr         m5, m1, m2, 4
1518
+    pmaddwd         m5, [r3 - 5  * 32]
1519
+    paddd           m5, m15
1520
+    psrld           m5, 5
1521
+    packusdw        m4, m5
1522
+
1523
+    palignr         m6, m2, m0, 8
1524
+    palignr         m7, m1, m2, 8
1525
+    movu            ym16, [r3 - 16 * 32]          ; [2]
1526
+    vinserti32x8    m16, [r3 + 5 * 32] ,1        ; [23]
1527
+    pmaddwd         m5, m6, m16
1528
+    paddd           m5, m15
1529
+    psrld           m5, 5
1530
+    palignr         m7, m1, m2, 8
1531
+    pmaddwd         m8, m7,m16
1532
+    paddd           m8, m15
1533
+    psrld           m8, 5
1534
+    packusdw        m5, m8
1535
+   vextracti32x8   ym6, m5, 1
1536
+
1537
+
1538
+    palignr         m7, m2, m0, 12
1539
+    pmaddwd         m7, [r3 - 6 * 32]               ; [12]
1540
+    paddd           m7, m15
1541
+    psrld           m7, 5
1542
+    palignr         m8, m1, m2, 12
1543
+    pmaddwd         m8, [r3 - 6 * 32]
1544
+    paddd           m8, m15
1545
+    psrld           m8, 5
1546
+    packusdw        m7, m8
1547
+
1548
+    movu            m0, [r2 + 34]                   ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
1549
+    pmaddwd         m8, m2, [r3 - 17 * 32]          ; [1]
1550
+    paddd           m8, m15
1551
+    psrld           m8, 5
1552
+    pmaddwd         m9, m1, [r3 - 17 * 32]
1553
+    paddd           m9, m15
1554
+    psrld           m9, 5
1555
+    packusdw        m8, m9
1556
+
1557
+    palignr         m3, m0, m0, 2                   ; [ x 32 31 30 29 28 27 26  x 24 23 22 21 20 19 18]
1558
+    punpcklwd       m0, m3                          ; [29 29 28 28 27 27 26 22 21 20 20 19 19 18 18 17]
1559
+
1560
+    pmaddwd         m9, m2, [r3 + 4 * 32]           ; [22]
1561
+    paddd           m9, m15
1562
+    psrld           m9, 5
1563
+    pmaddwd         m3, m1, [r3 + 4 * 32]
1564
+    paddd           m3, m15
1565
+    psrld           m3, 5
1566
+    packusdw        m9, m3
1567
+
1568
+    palignr         m10, m1, m2, 4
1569
+    pmaddwd         m10, [r3 - 7 * 32]              ; [11]
1570
+    paddd           m10, m15
1571
+    psrld           m10, 5
1572
+    palignr         m11, m0, m1, 4
1573
+    pmaddwd         m11, [r3 - 7 * 32]
1574
+    paddd           m11, m15
1575
+    psrld           m11, 5
1576
+    packusdw        m10, m11
1577
+
1578
+    palignr         m3, m1, m2, 8
1579
+    pmaddwd         m3, [r3 - 18 * 32]              ; [0]
1580
+    paddd           m3, m15
1581
+    psrld           m3, 5
1582
+    palignr         m0, m1, 8
1583
+    pmaddwd         m0, [r3 - 18 * 32]
1584
+    paddd           m0, m15
1585
+    psrld           m0, 5
1586
+    packusdw        m3, m0
1587
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 3, 0, 1, 16
1588
+    ret
1589
+cglobal intra_pred_ang32_4, 3,8,17
1590
+    add         r2,        128
1591
+    xor         r6d,       r6d
1592
+    lea         r3,        [ang_table_avx2 + 18 * 32]
1593
+    add         r1d,       r1d
1594
+    lea         r4,        [r1 * 3]
1595
+    lea         r7,        [r0 + 8 * r1]
1596
+    vbroadcasti32x8  m15,  [pd_16]
1597
+    call        ang16_mode_4_32
1598
+
1599
+    add         r2,        22
1600
+    lea         r0,        [r0 + 32]
1601
+
1602
+    call        ang32_mode_4_32
1603
+
1604
+    add         r2,        10
1605
+    lea         r0,        [r7 + 8 * r1]
1606
+
1607
+    call        ang16_mode_4_32
1608
+
1609
+    add         r2,        22
1610
+    lea         r0,        [r0 + 32]
1611
+    call        ang32_mode_4_32
1612
+    RET
1613
+cglobal intra_pred_ang32_32, 3,7,17
1614
+    xor         r6d,       r6d
1615
+    inc         r6d
1616
+    lea         r3,        [ang_table_avx2 + 18 * 32]
1617
+    add         r1d,       r1d
1618
+    lea         r4,        [r1 * 3]
1619
+    lea         r5,        [r0 + 32]
1620
+    vbroadcasti32x8  m15,  [pd_16]
1621
+    call        ang16_mode_4_32
1622
+
1623
+    add         r2,        22
1624
+
1625
+    call        ang32_mode_4_32
1626
+
1627
+    add         r2,        10
1628
+    mov         r0,        r5
1629
+
1630
+    call        ang16_mode_4_32
1631
+    add         r2,        22
1632
+    call        ang32_mode_4_32
1633
+    RET
1634
+cglobal intra_pred_ang16_4, 3,7,17
1635
+    add         r2,        64
1636
+    xor         r6d,       r6d
1637
+    vbroadcasti32x8  m15,  [pd_16]
1638
+    lea         r3,        [ang_table_avx2 + 18 * 32]
1639
+    add         r1d,       r1d
1640
+    lea         r4,        [r1 * 3]
1641
+    call        ang16_mode_4_32
1642
+    RET
1643
+cglobal intra_pred_ang16_32, 3,7,17
1644
+    xor         r6d,       r6d
1645
+    inc         r6d
1646
+    vbroadcasti32x8  m15,  [pd_16]
1647
+    lea         r3,        [ang_table_avx2 + 18 * 32]
1648
+    shl         r1d,       1
1649
+    lea         r4,        [r1 * 3]
1650
+    call        ang16_mode_4_32
1651
+    RET
1652
+;; angle 16, modes 6 and 30
1653
+cglobal ang16_mode_6_30
1654
+    test            r6d, r6d
1655
+
1656
+    vbroadcasti32x8            m0, [r2 + 2]                    ; [16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
1657
+    vbroadcasti32x8            m1, [r2 + 4]                    ; [17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
1658
+
1659
+    punpcklwd       m3, m0, m1                      ; [13 12 12 11 11 10 10  9  5  4  4  3  3  2  2  1]
1660
+    punpckhwd       m0, m1                          ; [17 16 16 15 15 14 14 13  9  8  8  7  7  6  6  5]
1661
+
1662
+    vbroadcasti32x8            m1, [r2 + 18]                   ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9]
1663
+    vbroadcasti32x8            m4, [r2 + 20]                   ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
1664
+    punpcklwd       m2, m1, m4                      ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10  9]
1665
+    punpckhwd       m1, m4                          ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
1666
+
1667
+    movu            ym16,  [r3 - 2 * 32]           ; [13]
1668
+    vinserti32x8    m16, [r3 + 11 * 32] ,1         ; [26]
1669
+    pmaddwd         m4, m3, m16
1670
+    paddd           m4, m15
1671
+    psrld           m4, 5
1672
+    pmaddwd         m5, m0, m16
1673
+    paddd           m5, m15
1674
+    psrld           m5, 5
1675
+    packusdw        m4, m5
1676
+    vextracti32x8   ym5, m4, 1
1677
+
1678
+    palignr         m7, m0, m3, 4
1679
+    palignr         m8, m2, m0, 4
1680
+    movu            ym16, [r3 - 8 * 32]           ; [7]
1681
+    vinserti32x8    m16, [r3 + 5 * 32] ,1              ; [20]
1682
+    pmaddwd         m6, m7, m16
1683
+    paddd           m6, m15
1684
+    psrld           m6, 5
1685
+    pmaddwd         m9, m8, m16
1686
+    paddd           m9, m15
1687
+    psrld           m9, 5
1688
+    packusdw        m6, m9
1689
+    vextracti32x8   ym7, m6, 1
1690
+
1691
+    palignr         m10, m0, m3, 8
1692
+    palignr         m11, m2, m0, 8
1693
+    movu            ym16,  [r3 - 14 * 32]         ; [1]
1694
+    vinserti32x8    m16, [r3 - 1 * 32],1          ; [14]
1695
+    pmaddwd         m8, m10, m16
1696
+    paddd           m8,m15
1697
+    psrld           m8, 5
1698
+    palignr         m11, m2, m0, 8
1699
+    pmaddwd         m9, m11, m16
1700
+    paddd           m9, m15
1701
+    psrld           m9, 5
1702
+    packusdw        m8, m9
1703
+    vextracti32x8   ym9, m8, 1
1704
+
1705
+    pmaddwd         m10, [r3 + 12 * 32]             ; [27]
1706
+    paddd           m10,m15
1707
+    psrld           m10, 5
1708
+    pmaddwd         m11, [r3 + 12 * 32]
1709
+    paddd           m11, m15
1710
+    psrld           m11, 5
1711
+    packusdw        m10, m11
1712
+
1713
+    palignr         m11, m0, m3, 12
1714
+    pmaddwd         m11, [r3 - 7 * 32]              ; [8]
1715
+    paddd           m11, m15
1716
+    psrld           m11, 5
1717
+    palignr         m12, m2, m0, 12
1718
+    pmaddwd         m12, [r3 - 7 * 32]
1719
+    paddd           m12, m15
1720
+    psrld           m12, 5
1721
+    packusdw        m11, m12
1722
+
1723
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0
1724
+
1725
+    palignr         m4, m0, m3, 12
1726
+    pmaddwd         m4, [r3 + 6 * 32]               ; [21]
1727
+    paddd           m4, m15
1728
+    psrld           m4, 5
1729
+    palignr         m5, m2, m0, 12
1730
+    pmaddwd         m5, [r3 + 6  * 32]
1731
+    paddd           m5, m15
1732
+    psrld           m5, 5
1733
+    packusdw        m4, m5
1734
+
1735
+    movu            ym16,  [r3 - 13 * 32]          ; [2]
1736
+    vinserti32x8    m16,  [r3]   ,1                ; [15]
1737
+    pmaddwd         m5, m0, m16
1738
+    paddd           m5, m15
1739
+    psrld           m5, 5
1740
+    pmaddwd         m3, m2,m16
1741
+    paddd           m3, m15
1742
+    psrld           m3, 5
1743
+    packusdw        m5, m3
1744
+    vextracti32x8   ym6, m5, 1
1745
+
1746
+    pmaddwd         m7, m0, [r3 + 13 * 32]          ; [28]
1747
+    paddd           m7, m15
1748
+    psrld           m7, 5
1749
+    pmaddwd         m3, m2, [r3 + 13 * 32]
1750
+    paddd           m3, m15
1751
+    psrld           m3, 5
1752
+    packusdw        m7, m3
1753
+
1754
+    palignr         m9, m2, m0, 4
1755
+    palignr         m3, m1, m2, 4
1756
+    movu            ym16, [r3 - 6 * 32]           ; [9]
1757
+    vinserti32x8    m16,  [r3 + 7 * 32],1               ; [22]
1758
+    pmaddwd         m8, m9, m16
1759
+    paddd           m8, m15
1760
+    psrld           m8, 5
1761
+    pmaddwd         m10, m3, m16
1762
+    paddd           m10,m15
1763
+    psrld           m10, 5
1764
+    packusdw        m8, m10
1765
+    vextracti32x8   ym9, m8, 1
1766
+
1767
+
1768
+    palignr         m11, m2, m0, 8
1769
+    pmaddwd         m10, m11, [r3 - 12 * 32]        ; [3]
1770
+    paddd           m10, m15
1771
+    psrld           m10, 5
1772
+    palignr         m3, m1, m2, 8
1773
+    pmaddwd         m12, m3, [r3 - 12 * 32]
1774
+    paddd           m12, m15
1775
+    psrld           m12, 5
1776
+    packusdw        m10, m12
1777
+
1778
+    pmaddwd         m11, [r3 + 1 * 32]              ; [16]
1779
+    paddd           m11, m15
1780
+    psrld           m11, 5
1781
+    pmaddwd         m3, [r3 + 1 * 32]
1782
+    paddd           m3, m15
1783
+    psrld           m3, 5
1784
+    packusdw        m11, m3
1785
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 16
1786
+    ret
1787
+;; angle 32, modes 6 and 30
1788
+cglobal ang32_mode_6_30
1789
+    test            r6d, r6d
1790
+
1791
+    vbroadcasti32x8            m0, [r2 + 2]                    ; [16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
1792
+    vbroadcasti32x8            m1, [r2 + 4]                    ; [17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
1793
+
1794
+    punpcklwd       m3, m0, m1                      ; [13 12 12 11 11 10 10  9  5  4  4  3  3  2  2  1]
1795
+    punpckhwd       m0, m1                          ; [17 16 16 15 15 14 14 13  9  8  8  7  7  6  6  5]
1796
+
1797
+    vbroadcasti32x8            m1, [r2 + 18]                   ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9]
1798
+    vbroadcasti32x8            m4, [r2 + 20]                   ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
1799
+    punpcklwd       m2, m1, m4                      ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10  9]
1800
+    punpckhwd       m1, m4                          ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
1801
+
1802
+    pmaddwd         m4, m3, [r3 + 14 * 32]          ; [29]
1803
+    paddd           m4, m15
1804
+    psrld           m4, 5
1805
+    pmaddwd         m5, m0, [r3 + 14 * 32]
1806
+    paddd           m5, m15
1807
+    psrld           m5, 5
1808
+    packusdw        m4, m5
1809
+
1810
+    palignr         m6, m0, m3, 4
1811
+    palignr         m7, m2, m0, 4
1812
+    movu            ym16, [r3 - 5 * 32]           ; [10]
1813
+    vinserti32x8    m16,  [r3 + 8 * 32] ,1              ; [23]
1814
+    pmaddwd         m5, m6, m16
1815
+    paddd           m5, m15
1816
+    psrld           m5, 5
1817
+    pmaddwd         m8, m7, m16
1818
+    paddd           m8, m15
1819
+    psrld           m8, 5
1820
+    packusdw        m5, m8
1821
+    vextracti32x8   ym6, m5, 1
1822
+
1823
+    palignr         m9, m0, m3, 8
1824
+    palignr         m12, m2, m0, 8
1825
+    movu            ym16, [r3 - 11 * 32]          ; [4]
1826
+    vinserti32x8    m16, [r3 + 2 * 32] ,1          ; [17]
1827
+    pmaddwd         m7, m9, m16
1828
+    paddd           m7,m15
1829
+    psrld           m7, 5
1830
+    palignr         m12, m2, m0, 8
1831
+    pmaddwd         m11, m12,m16
1832
+    paddd           m11,m15
1833
+    psrld           m11, 5
1834
+    packusdw        m7, m11
1835
+    vextracti32x8   ym8, m7, 1
1836
+
1837
+    pmaddwd         m9, [r3 + 15 * 32]              ; [30]
1838
+    paddd           m9, m15
1839
+    psrld           m9, 5
1840
+    pmaddwd         m12, [r3 + 15 * 32]
1841
+    paddd           m12, m15
1842
+    psrld           m12, 5
1843
+    packusdw        m9, m12
1844
+
1845
+    palignr         m11, m0, m3, 12
1846
+    pmaddwd         m10, m11, [r3 - 4 * 32]         ; [11]
1847
+    paddd           m10, m15
1848
+    psrld           m10, 5
1849
+    palignr         m12, m2, m0, 12
1850
+    pmaddwd         m3, m12, [r3 - 4 * 32]
1851
+    paddd           m3, m15
1852
+    psrld           m3, 5
1853
+    packusdw        m10, m3
1854
+
1855
+    pmaddwd         m11, [r3 + 9 * 32]              ; [24]
1856
+    paddd           m11, m15
1857
+    psrld           m11, 5
1858
+    pmaddwd         m12, [r3 + 9 * 32]
1859
+    paddd           m12,m15
1860
+    psrld           m12, 5
1861
+    packusdw        m11, m12
1862
+
1863
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0
1864
+
1865
+    movu            ym16,  [r3 - 10 * 32]          ; [5]
1866
+    vinserti32x8    m16,  [r3 + 3 * 32] ,1          ; [18]
1867
+    pmaddwd         m4, m0, m16
1868
+    paddd           m4, m15
1869
+    psrld           m4, 5
1870
+    pmaddwd         m5, m2, m16
1871
+    paddd           m5, m15
1872
+    psrld           m5, 5
1873
+    packusdw        m4, m5
1874
+    vextracti32x8   ym5, m4, 1
1875
+
1876
+    pmaddwd         m6, m0, [r3 + 16 * 32]          ; [31]
1877
+    paddd           m6,m15
1878
+    psrld           m6, 5
1879
+    pmaddwd         m7, m2, [r3 + 16 * 32]
1880
+    paddd           m7,m15
1881
+    psrld           m7, 5
1882
+    packusdw        m6, m7
1883
+
1884
+    palignr         m8, m2, m0, 4
1885
+    palignr         m9, m1, m2, 4
1886
+    movu            ym16, [r3 - 3 * 32]           ; [12]
1887
+    vinserti32x8    m16, [r3 + 10 * 32],1              ; [25]
1888
+    pmaddwd         m7, m8,m16
1889
+    paddd           m7,m15
1890
+    psrld           m7, 5
1891
+    pmaddwd         m3, m9, m16
1892
+    paddd           m3, m15
1893
+    psrld           m3, 5
1894
+    packusdw        m7, m3
1895
+    vextracti32x8   ym8, m7, 1
1896
+
1897
+    palignr         m10, m2, m0, 8
1898
+    palignr         m12, m1, m2, 8
1899
+    movu            ym16,  [r3 - 9 * 32]          ; [6]
1900
+    vinserti32x8    m16, [r3 + 4 * 32]  ,1            ; [19]
1901
+    pmaddwd         m9, m10, m16
1902
+    paddd           m9, m15
1903
+    psrld           m9, 5
1904
+    pmaddwd         m3, m12,m16
1905
+    paddd           m3, m15
1906
+    psrld           m3, 5
1907
+    packusdw        m9, m3
1908
+    vextracti32x8   ym10, m9, 1
1909
+
1910
+
1911
+    palignr         m11, m2, m0, 12
1912
+    pmaddwd         m11, [r3 - 15 * 32]             ; [0]
1913
+    paddd           m11, m15
1914
+    psrld           m11, 5
1915
+    palignr         m3, m1, m2, 12
1916
+    pmaddwd         m3, [r3 - 15 * 32]
1917
+    paddd           m3, m15
1918
+    psrld           m3, 5
1919
+    packusdw        m11, m3
1920
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 16
1921
+    ret
1922
+cglobal intra_pred_ang32_6, 3,8,17
1923
+    add         r2,        128
1924
+    xor         r6d,       r6d
1925
+    lea         r3,        [ang_table_avx2 + 15 * 32]
1926
+    add         r1d,       r1d
1927
+    lea         r4,        [r1 * 3]
1928
+    lea         r7,        [r0 + 8 * r1]
1929
+    vbroadcasti32x8  m15,  [pd_16]
1930
+    call        ang16_mode_6_30
1931
+
1932
+    add         r2,        12
1933
+    lea         r0,        [r0 + 32]
1934
+
1935
+    call        ang32_mode_6_30
1936
+
1937
+    add         r2,        20
1938
+    lea         r0,        [r7 + 8 * r1]
1939
+
1940
+    call        ang16_mode_6_30
1941
+
1942
+    add         r2,        12
1943
+    lea         r0,        [r0 + 32]
1944
+    call        ang32_mode_6_30
1945
+    RET
1946
+cglobal intra_pred_ang32_30, 3,7,17
1947
+    xor         r6d,       r6d
1948
+    inc         r6d
1949
+    lea         r3,        [ang_table_avx2 + 15 * 32]
1950
+    add         r1d,       r1d
1951
+    lea         r4,        [r1 * 3]
1952
+    lea         r5,        [r0 + 32]
1953
+    vbroadcasti32x8  m15,  [pd_16]
1954
+    call        ang16_mode_6_30
1955
+
1956
+    add         r2,        12
1957
+
1958
+    call        ang32_mode_6_30
1959
+
1960
+    add         r2,        20
1961
+    mov         r0,        r5
1962
+
1963
+    call        ang16_mode_6_30
1964
+
1965
+    add         r2,        12
1966
+    call        ang32_mode_6_30
1967
+    RET
1968
+cglobal intra_pred_ang16_6, 3,7,17
1969
+    add         r2,        64
1970
+    xor         r6d,       r6d
1971
+    vbroadcasti32x8  m15,  [pd_16]
1972
+    lea         r3,        [ang_table_avx2 + 15 * 32]
1973
+    shl         r1d,       1
1974
+    lea         r4,        [r1 * 3]
1975
+    call        ang16_mode_6_30
1976
+    RET
1977
+cglobal intra_pred_ang16_30, 3,7,17
1978
+    xor         r6d,       r6d
1979
+    inc         r6d
1980
+    vbroadcasti32x8  m15,  [pd_16]
1981
+    lea         r3,        [ang_table_avx2 + 15 * 32]
1982
+     shl         r1d,       1
1983
+    lea         r4,        [r1 * 3]
1984
+    call        ang16_mode_6_30
1985
+    RET
1986
+
1987
+;; angle 16, modes 8 and 28
1988
+cglobal ang16_mode_8_28
1989
+    test            r6d, r6d
1990
+
1991
+    vbroadcasti32x8            m0, [r2 + 2]         ; [16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
1992
+    vbroadcasti32x8            m1, [r2 + 4]         ; [17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
1993
+
1994
+    punpcklwd       m3, m0, m1                      ; [13 12 12 11 11 10 10  9  5  4  4  3  3  2  2  1]
1995
+    punpckhwd       m0, m1                          ; [17 16 16 15 15 14 14 13  9  8  8  7  7  6  6  5]
1996
+
1997
+    vbroadcasti32x8            m2, [r2 + 18]        ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9]
1998
+    vbroadcasti32x8            m4, [r2 + 20]        ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
1999
+    punpcklwd       m2, m4                          ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10  9]
2000
+
2001
+    movu            ym14, [r3 - 10 * 32]
2002
+    vinserti32x8    m14, [r3 - 5 * 32], 1
2003
+    pmaddwd         m4, m3, m14                    ; [5], [10]
2004
+    paddd           m4, m15
2005
+    psrld           m4, 5
2006
+    pmaddwd         m5, m0, m14
2007
+    paddd           m5, m15
2008
+    psrld           m5, 5
2009
+    packusdw        m4, m5
2010
+    vextracti32x8   ym5, m4, 1
2011
+
2012
+    movu            ym14, [r3]
2013
+    vinserti32x8    m14, [r3 + 5 * 32], 1
2014
+    pmaddwd         m6, m3, m14                    ; [15], [20]
2015
+    paddd           m6, m15
2016
+    psrld           m6, 5
2017
+    pmaddwd         m9, m0, m14
2018
+    paddd           m9, m15
2019
+    psrld           m9, 5
2020
+    packusdw        m6, m9
2021
+    vextracti32x8   ym7, m6, 1
2022
+
2023
+    movu            ym14, [r3 + 10 * 32]
2024
+    vinserti32x8    m14, [r3 +  15 * 32], 1
2025
+    pmaddwd         m8, m3, m14                     ; [25], [30]
2026
+    paddd           m8, m15
2027
+    psrld           m8, 5
2028
+    pmaddwd         m9, m0, m14
2029
+    paddd           m9, m15
2030
+    psrld           m9, 5
2031
+    packusdw        m8, m9
2032
+    vextracti32x8   ym9, m8, 1
2033
+
2034
+    palignr         m11, m0, m3, 4
2035
+    movu            ym14, [r3 - 12 * 32]
2036
+    vinserti32x8    m14, [r3 - 7 * 32], 1
2037
+    pmaddwd         m10, m11, m14                     ; [3], [8]
2038
+    paddd           m10, m15
2039
+    psrld           m10, 5
2040
+    palignr         m1, m2, m0, 4
2041
+    pmaddwd         m12, m1, m14
2042
+    paddd           m12, m15
2043
+    psrld           m12, 5
2044
+    packusdw        m10, m12
2045
+    vextracti32x8   ym11, m10, 1
2046
+
2047
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 0
2048
+
2049
+    palignr         m7, m0, m3, 4
2050
+    movu            ym14, [r3 - 2 * 32]
2051
+    vinserti32x8    m14, [r3 + 3 * 32], 1
2052
+    pmaddwd         m4, m7, m14                      ; [13], [18]
2053
+    paddd           m4, m15
2054
+    psrld           m4, 5
2055
+    palignr         m1, m2, m0, 4
2056
+    pmaddwd         m5, m1, m14
2057
+    paddd           m5, m15
2058
+    psrld           m5, 5
2059
+    packusdw        m4, m5
2060
+    vextracti32x8   ym5, m4, 1
2061
+
2062
+    movu            ym14, [r3 + 8 * 32]
2063
+    vinserti32x8    m14, [r3 + 13 * 32], 1
2064
+    pmaddwd         m6, m7, m14                      ; [23], [28]
2065
+    paddd           m6, m15
2066
+    psrld           m6, 5
2067
+    pmaddwd         m8, m1, m14
2068
+    paddd           m8, m15
2069
+    psrld           m8, 5
2070
+    packusdw        m6, m8
2071
+    vextracti32x8   ym7, m6, 1
2072
+
2073
+    movu            ym14, [r3 - 14 * 32]
2074
+    vinserti32x8    m14, [r3 - 9 * 32], 1
2075
+    palignr         m1, m0, m3, 8
2076
+    pmaddwd         m8, m1, m14                      ; [1], [6]
2077
+    paddd           m8, m15
2078
+    psrld           m8, 5
2079
+    palignr         m2, m0, 8
2080
+    pmaddwd         m9, m2, m14
2081
+    paddd           m9, m15
2082
+    psrld           m9, 5
2083
+    packusdw        m8, m9
2084
+    vextracti32x8   ym9, m8, 1
2085
+
2086
+    movu            ym14, [r3 - 4 * 32]
2087
+    vinserti32x8    m14, [r3 + 1 * 32], 1
2088
+    pmaddwd         m3, m1, m14                      ; [11], [16]
2089
+    paddd           m3, m15
2090
+    psrld           m3, 5
2091
+    pmaddwd         m0, m2, m14
2092
+    paddd           m0, m15
2093
+    psrld           m0, 5
2094
+    packusdw        m3, m0
2095
+    vextracti32x8   ym1, m3, 1
2096
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 3, 1, 0, 2, 16
2097
+    ret
2098
+
2099
+;; angle 32, modes 8 and 28
2100
+cglobal ang32_mode_8_28
2101
+    test            r6d, r6d
2102
+
2103
+    vbroadcasti32x8            m0, [r2 + 2]         ; [16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
2104
+    vbroadcasti32x8            m1, [r2 + 4]         ; [17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
2105
+
2106
+    punpcklwd       m3, m0, m1                      ; [13 12 12 11 11 10 10  9  5  4  4  3  3  2  2  1]
2107
+    punpckhwd       m0, m1                          ; [17 16 16 15 15 14 14 13  9  8  8  7  7  6  6  5]
2108
+
2109
+    vbroadcasti32x8            m2, [r2 + 18]        ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9]
2110
+    vbroadcasti32x8            m4, [r2 + 20]        ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
2111
+    punpcklwd       m2, m4                          ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10  9]
2112
+
2113
+    movu            ym14, [r3 + 6 * 32]
2114
+    vinserti32x8    m14, [r3 + 11 * 32], 1
2115
+    pmaddwd         m4, m3, m14                     ; [21], [26]
2116
+    paddd           m4, m15
2117
+    psrld           m4, 5
2118
+    pmaddwd         m5, m0, m14
2119
+    paddd           m5, m15
2120
+    psrld           m5, 5
2121
+    packusdw        m4, m5
2122
+    vextracti32x8   ym5, m4, 1
2123
+
2124
+    pmaddwd         m6, m3, [r3 + 16 * 32]          ; [31]
2125
+    paddd           m6, [pd_16]
2126
+    psrld           m6, 5
2127
+    pmaddwd         m9, m0, [r3 + 16 * 32]
2128
+    paddd           m9, [pd_16]
2129
+    psrld           m9, 5
2130
+    packusdw        m6, m9
2131
+
2132
+    palignr         m11, m0, m3, 4
2133
+    movu            ym14, [r3 - 11 * 32]
2134
+    vinserti32x8    m14, [r3 - 6 * 32], 1
2135
+    pmaddwd         m7, m11, m14                    ; [4], [9]
2136
+    paddd           m7, m15
2137
+    psrld           m7, 5
2138
+    palignr         m1, m2, m0, 4
2139
+    pmaddwd         m8, m1, m14
2140
+    paddd           m8, m15
2141
+    psrld           m8, 5
2142
+    packusdw        m7, m8
2143
+    vextracti32x8   ym8, m7, 1
2144
+
2145
+    movu            ym14, [r3 - 1 * 32]
2146
+    vinserti32x8    m14, [r3 + 4 * 32], 1
2147
+    pmaddwd         m9, m11, m14                     ; [14], [19]
2148
+    paddd           m9, m15
2149
+    psrld           m9, 5
2150
+    pmaddwd         m10, m1, m14
2151
+    paddd           m10, m15
2152
+    psrld           m10, 5
2153
+    packusdw        m9, m10
2154
+    vextracti32x8   ym10, m9, 1
2155
+
2156
+    pmaddwd         m11, [r3 + 9 * 32]              ; [24]
2157
+    paddd           m11, [pd_16]
2158
+    psrld           m11, 5
2159
+    pmaddwd         m1, [r3 + 9 * 32]
2160
+    paddd           m1, [pd_16]
2161
+    psrld           m1, 5
2162
+    packusdw        m11, m1
2163
+
2164
+TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 0
2165
+
2166
+    palignr         m4, m0, m3, 4
2167
+    pmaddwd         m4, [r3 + 14 * 32]              ; [29]
2168
+    paddd           m4, m15
2169
+    psrld           m4, 5
2170
+    palignr         m5, m2, m0, 4
2171
+    pmaddwd         m5, [r3 + 14 * 32]
2172
+    paddd           m5, m15
2173
+    psrld           m5, 5
2174
+    packusdw        m4, m5
2175
+
2176
+    palignr         m1, m0, m3, 8
2177
+    pmaddwd         m5, m1, [r3 - 13 * 32]          ; [2]
2178
+    paddd           m5, m15
2179
+    psrld           m5, 5
2180
+    palignr         m10, m2, m0, 8
2181
+    pmaddwd         m6, m10, [r3 - 13 * 32]
2182
+    paddd           m6, m15
2183
+    psrld           m6, 5
2184
+    packusdw        m5, m6
2185
+
2186
+    movu            ym14, [r3 - 8 * 32]
2187
+    vinserti32x8    m14, [r3 - 3 * 32], 1
2188
+    pmaddwd         m6, m1, m14                     ; [7], [12]
2189
+    paddd           m6, m15
2190
+    psrld           m6, 5
2191
+    pmaddwd         m8, m10, m14
2192
+    paddd           m8, m15
2193
+    psrld           m8, 5
2194
+    packusdw        m6, m8
2195
+    vextracti32x8   ym7, m6, 1
2196
+
2197
+    movu            ym14, [r3 + 2 * 32]
2198
+    vinserti32x8    m14, [r3 + 7 * 32], 1
2199
+    pmaddwd         m8, m1, m14                     ; [17], [22]
2200
+    paddd           m8, m15
2201
+    psrld           m8, 5
2202
+    pmaddwd         m9, m10, m14
2203
+    paddd           m9, m15
2204
+    psrld           m9, 5
2205
+    packusdw        m8, m9
2206
+    vextracti32x8   ym9, m8, 1
2207
+
2208
+    pmaddwd         m1, [r3 + 12 * 32]              ; [27]
2209
+    paddd           m1, [pd_16]
2210
+    psrld           m1, 5
2211
+    pmaddwd         m10, [r3 + 12 * 32]
2212
+    paddd           m10, [pd_16]
2213
+    psrld           m10, 5
2214
+    packusdw        m1, m10
2215
+
2216
+    palignr         m11, m0, m3, 12
2217
+    pmaddwd         m11, [r3 - 15 * 32]             ; [0]
2218
+    paddd           m11, [pd_16]
2219
+    psrld           m11, 5
2220
+    palignr         m2, m0, 12
2221
+    pmaddwd         m2, [r3 - 15 * 32]
2222
+    paddd           m2, [pd_16]
2223
+    psrld           m2, 5
2224
+    packusdw        m11, m2
2225
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 1, 11, 0, 2, 16
2226
+    ret
2227
+
2228
+
2229
+cglobal intra_pred_ang32_8, 3,8,16
2230
+    add         r2,        128
2231
+    xor         r6d,       r6d
2232
+    lea         r3,        [ang_table_avx2 + 15 * 32]
2233
+    add         r1d,       r1d
2234
+    lea         r4,        [r1 * 3]
2235
+    lea         r7,        [r0 + 8 * r1]
2236
+    vbroadcasti32x8        m15, [pd_16]
2237
+
2238
+    call        ang16_mode_8_28
2239
+
2240
+    add         r2,        4
2241
+    lea         r0,        [r0 + 32]
2242
+
2243
+    call        ang32_mode_8_28
2244
+
2245
+    add         r2,        28
2246
+    lea         r0,        [r7 + 8 * r1]
2247
+
2248
+    call        ang16_mode_8_28
2249
+
2250
+    add         r2,        4
2251
+    lea         r0,        [r0 + 32]
2252
+
2253
+    call        ang32_mode_8_28
2254
+    RET
2255
+
2256
+cglobal intra_pred_ang32_28, 3,7,16
2257
+    xor         r6d,       r6d
2258
+    inc         r6d
2259
+    lea         r3,        [ang_table_avx2 + 15 * 32]
2260
+    add         r1d,       r1d
2261
+    lea         r4,        [r1 * 3]
2262
+    lea         r5,        [r0 + 32]
2263
+    vbroadcasti32x8  m15,  [pd_16]
2264
+    call        ang16_mode_8_28
2265
+
2266
+    add         r2,        4
2267
+
2268
+    call        ang32_mode_8_28
2269
+
2270
+    add         r2,        28
2271
+    mov         r0,        r5
2272
+
2273
+    call        ang16_mode_8_28
2274
+
2275
+    add         r2,        4
2276
+    call        ang32_mode_8_28
2277
+    RET
2278
+
2279
+    cglobal intra_pred_ang16_8, 3,7,16
2280
+    add         r2,        64
2281
+    xor         r6d,       r6d
2282
+    lea         r3,        [ang_table_avx2 + 15 * 32]
2283
+    add         r1d,       r1d
2284
+    lea         r4,        [r1 * 3]
2285
+    vbroadcasti32x8  m15,  [pd_16]
2286
+
2287
+    call        ang16_mode_8_28
2288
+    RET
2289
+
2290
+cglobal intra_pred_ang16_28, 3,7,16
2291
+    xor         r6d,       r6d
2292
+    inc         r6d
2293
+    lea         r3,        [ang_table_avx2 + 15 * 32]
2294
+    add         r1d,       r1d
2295
+    lea         r4,        [r1 * 3]
2296
+    vbroadcasti32x8  m15,  [pd_16]
2297
+
2298
+    call        ang16_mode_8_28
2299
+    RET
2300
+
2301
+;; angle 16, modes 7 and 29
2302
+cglobal ang16_mode_7_29
2303
+    test            r6d, r6d
2304
+
2305
+    vbroadcasti32x8             m0, [r2 + 2]                    ; [16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
2306
+    vbroadcasti32x8            m1, [r2 + 4]                    ; [17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
2307
+
2308
+    punpcklwd       m3, m0, m1                      ; [13 12 12 11 11 10 10  9  5  4  4  3  3  2  2  1]
2309
+    punpckhwd       m0, m1                          ; [17 16 16 15 15 14 14 13  9  8  8  7  7  6  6  5]
2310
+
2311
+    vbroadcasti32x8             m2, [r2 + 18]                   ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9]
2312
+    vbroadcasti32x8            m4, [r2 + 20]                   ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
2313
+    punpcklwd       m2, m4                          ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10  9]
2314
+
2315
+    movu            ym16, [r3 - 8 * 32]           ; [9]
2316
+    vinserti32x8    m16,  [r3 + 1 * 32] ,1          ; [18]
2317
+    pmaddwd         m4, m3,m16
2318
+    paddd           m4, m15
2319
+    psrld           m4, 5
2320
+    pmaddwd         m5, m0, m16
2321
+    paddd           m5, m15
2322
+    psrld           m5, 5
2323
+    packusdw        m4, m5
2324
+    vextracti32x8   ym5, m4, 1
2325
+
2326
+    pmaddwd         m6, m3, [r3 + 10 * 32]          ; [27]
2327
+    paddd           m6, m15
2328
+    psrld           m6, 5
2329
+    pmaddwd         m9, m0, [r3 + 10 * 32]
2330
+    paddd           m9, m15
2331
+    psrld           m9, 5
2332
+    packusdw        m6, m9
2333
+
2334
+    palignr         m10, m0, m3, 4
2335
+    pmaddwd         m7, m10, [r3 - 13 * 32]         ; [4]
2336
+    paddd           m7, m15
2337
+    psrld           m7, 5
2338
+    palignr         m11, m2, m0, 4
2339
+    pmaddwd         m8, m11, [r3 - 13 * 32]
2340
+    paddd           m8, m15
2341
+    psrld           m8, 5
2342
+    packusdw        m7, m8
2343
+
2344
+    movu            ym16,  [r3 - 4 * 32]          ; [13]
2345
+    vinserti32x8    m16,  [r3 + 5 * 32],1          ; [22]
2346
+    pmaddwd         m8, m10, m16
2347
+    paddd           m8, m15
2348
+    psrld           m8, 5
2349
+    pmaddwd         m9, m11, m16
2350
+    paddd           m9, m15
2351
+    psrld           m9, 5
2352
+    packusdw        m8, m9
2353
+    vextracti32x8   ym9, m8, 1
2354
+
2355
+    pmaddwd         m10, [r3 + 14 * 32]             ; [31]
2356
+    paddd           m10, m15
2357
+    psrld           m10, 5
2358
+    pmaddwd         m11, [r3 + 14 * 32]
2359
+    paddd           m11, m15
2360
+    psrld           m11, 5
2361
+    packusdw        m10, m11
2362
+
2363
+    palignr         m11, m0, m3, 8
2364
+    pmaddwd         m11, [r3 - 9 * 32]              ; [8]
2365
+    paddd           m11, m15
2366
+    psrld           m11, 5
2367
+    palignr         m12, m2, m0, 8
2368
+    pmaddwd         m12, [r3 - 9 * 32]
2369
+    paddd           m12, m15
2370
+    psrld           m12, 5
2371
+    packusdw        m11, m12
2372
+
2373
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 0
2374
+
2375
+    palignr         m5, m0, m3, 8
2376
+    palignr         m6, m2, m0, 8
2377
+    movu            ym16, [r3]                    ; [17]
2378
+    vinserti32x8    m16,  [r3 + 9 * 32] ,1              ; [26]
2379
+    pmaddwd         m4, m5, m16
2380
+    paddd           m4, m15
2381
+    psrld           m4, 5
2382
+    pmaddwd         m7, m6, m16
2383
+    paddd           m7, m15
2384
+    psrld           m7, 5
2385
+    packusdw        m4, m7
2386
+    vextracti32x8   ym5, m4, 1
2387
+
2388
+
2389
+    palignr         m9, m0, m3, 12
2390
+    palignr         m3, m2, m0, 12
2391
+    movu            ym16, [r3 - 14 * 32]          ; [3]
2392
+    vinserti32x8    m16, [r3 - 5 * 32] ,1          ; [12]
2393
+    pmaddwd         m6, m9,m16
2394
+    paddd           m6, m15
2395
+    psrld           m6, 5
2396
+    pmaddwd         m7, m3,m16
2397
+    paddd           m7, m15
2398
+    psrld           m7, 5
2399
+    packusdw        m6, m7
2400
+    vextracti32x8   ym7, m6, 1
2401
+
2402
+    movu            ym16, [r3 + 4 * 32]           ; [21]
2403
+    vinserti32x8    m16, [r3 + 13 * 32] ,1             ; [30]
2404
+    pmaddwd         m8, m9,m16
2405
+    paddd           m8, m15
2406
+    psrld           m8, 5
2407
+    pmaddwd         m10, m3, m16
2408
+    paddd           m10, m15
2409
+    psrld           m10, 5
2410
+    packusdw        m8, m10
2411
+    vextracti32x8   ym9, m8, 1
2412
+
2413
+    movu            ym16,[r3 - 10 * 32]         ; [7]
2414
+    vinserti32x8    m16, [r3 - 1 * 32] ,1              ; [16]
2415
+    pmaddwd         m10, m0, m16
2416
+    paddd           m10, m15
2417
+    psrld           m10, 5
2418
+    pmaddwd         m12, m2, m16
2419
+    paddd           m12, m15
2420
+    psrld           m12, 5
2421
+    packusdw        m10, m12
2422
+    vextracti32x8   ym0, m10, 1
2423
+
2424
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 16
2425
+    ret
2426
+;; angle 32, modes 7 and 29
2427
+cglobal ang32_mode_7_29
2428
+    test            r6d, r6d
2429
+
2430
+    vbroadcasti32x8             m0, [r2 + 2]                    ; [16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
2431
+    vbroadcasti32x8             m1, [r2 + 4]                    ; [17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
2432
+
2433
+    punpcklwd       m3, m0, m1                      ; [13 12 12 11 11 10 10  9  5  4  4  3  3  2  2  1]
2434
+    punpckhwd       m0, m1                          ; [17 16 16 15 15 14 14 13  9  8  8  7  7  6  6  5]
2435
+
2436
+    vbroadcasti32x8             m1, [r2 + 18]                   ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9]
2437
+    vbroadcasti32x8            m4, [r2 + 20]                   ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
2438
+    punpcklwd       m2, m1, m4                      ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10  9]
2439
+    punpckhwd       m1, m4                          ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13]
2440
+
2441
+    pmaddwd         m4, m3, [r3 + 8 * 32]           ; [25]
2442
+    paddd           m4, m15
2443
+    psrld           m4, 5
2444
+    pmaddwd         m5, m0, [r3 + 8 * 32]
2445
+    paddd           m5, m15
2446
+    psrld           m5, 5
2447
+    packusdw        m4, m5
2448
+
2449
+    palignr         m8, m0, m3, 4
2450
+    pmaddwd         m5, m8, [r3 - 15 * 32]          ; [2]
2451
+    paddd           m5, m15
2452
+    psrld           m5, 5
2453
+    palignr         m9, m2, m0, 4
2454
+    pmaddwd         m10, m9, [r3 - 15 * 32]
2455
+    paddd           m10, m15
2456
+    psrld           m10, 5
2457
+    packusdw        m5, m10
2458
+
2459
+    movu            ym16,[r3 - 6 * 32]           ; [11]
2460
+    vinserti32x8    m16, [r3 + 3 * 32],1           ; [20]
2461
+    pmaddwd         m6, m8, m16
2462
+    paddd           m6, m15
2463
+    psrld           m6, 5
2464
+    pmaddwd         m7, m9, m16
2465
+    paddd           m7, m15
2466
+    psrld           m7, 5
2467
+    packusdw        m6, m7
2468
+    vextracti32x8   ym7, m6, 1
2469
+
2470
+    pmaddwd         m8, [r3 + 12 * 32]              ; [29]
2471
+    paddd           m8, m15
2472
+    psrld           m8, 5
2473
+    pmaddwd         m9, [r3 + 12 * 32]
2474
+    paddd           m9, m15
2475
+    psrld           m9, 5
2476
+    packusdw        m8, m9
2477
+
2478
+    palignr         m11, m0, m3, 8
2479
+    palignr         m12, m2, m0, 8
2480
+    movu            ym16, [r3 - 11 * 32]         ; [6]
2481
+    vinserti32x8    m16, [r3 - 2 * 32] ,1        ; [15]
2482
+    pmaddwd         m9, m11, m16
2483
+    paddd           m9, m15
2484
+    psrld           m9, 5
2485
+    palignr         m12, m2, m0, 8
2486
+    pmaddwd         m10, m12, m16
2487
+    paddd           m10, m15
2488
+    psrld           m10, 5
2489
+    packusdw        m9, m10
2490
+    vextracti32x8   ym10, m9, 1
2491
+
2492
+    pmaddwd         m11, [r3 + 7 * 32]              ; [24]
2493
+    paddd           m11, m15
2494
+    psrld           m11, 5
2495
+    pmaddwd         m12, [r3 + 7 * 32]
2496
+    paddd           m12, m15
2497
+    psrld           m12, 5
2498
+    packusdw        m11, m12
2499
+
2500
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0
2501
+
2502
+    palignr         m5, m0, m3, 12
2503
+    palignr         m6, m2, m0, 12
2504
+    movu            ym16, [r3 - 16 * 32]          ; [1]
2505
+    vinserti32x8    m16, [r3 - 7 * 32]  ,1             ; [10]
2506
+    pmaddwd         m4, m5, m16
2507
+    paddd           m4, m15
2508
+    psrld           m4, 5
2509
+    pmaddwd         m7, m6, m16
2510
+    paddd           m7, m15
2511
+    psrld           m7, 5
2512
+    packusdw        m4, m7
2513
+    vextracti32x8   ym5, m4, 1
2514
+
2515
+    palignr         m9, m0, m3, 12
2516
+    pmaddwd         m6, m9, [r3 + 2 * 32]           ; [19]
2517
+    paddd           m6, m15
2518
+    psrld           m6, 5
2519
+    palignr         m3, m2, m0, 12
2520
+    pmaddwd         m7, m3, [r3 + 2 * 32]
2521
+    paddd           m7, m15
2522
+    psrld           m7, 5
2523
+    packusdw        m6, m7
2524
+
2525
+    pmaddwd         m7, m9, [r3 + 11 * 32]          ; [28]
2526
+    paddd           m7, m15
2527
+    psrld           m7, 5
2528
+    pmaddwd         m8, m3, [r3 + 11 * 32]
2529
+    paddd           m8, m15
2530
+    psrld           m8, 5
2531
+    packusdw        m7, m8
2532
+
2533
+    movu            ym16, [r3 - 12 * 32]          ; [5]
2534
+    vinserti32x8    m16, [r3 - 3 * 32]  ,1         ; [14]
2535
+    pmaddwd         m8, m0, m16
2536
+    paddd           m8, m15
2537
+    psrld           m8, 5
2538
+    pmaddwd         m10, m2, m16
2539
+    paddd           m10,m15
2540
+    psrld           m10, 5
2541
+    packusdw        m8, m10
2542
+    vextracti32x8   ym9, m8, 1
2543
+
2544
+    pmaddwd         m10, m0, [r3 + 6 * 32]          ; [23]
2545
+    paddd           m10,m15
2546
+    psrld           m10, 5
2547
+    pmaddwd         m12, m2, [r3 + 6 * 32]
2548
+    paddd           m12, m15
2549
+    psrld           m12, 5
2550
+    packusdw        m10, m12
2551
+
2552
+    palignr         m11, m2, m0, 4
2553
+    pmaddwd         m11, [r3 - 17 * 32]             ; [0]
2554
+    paddd           m11, m15
2555
+    psrld           m11, 5
2556
+    palignr         m12, m1, m2, 4
2557
+    pmaddwd         m12, [r3 - 17 * 32]
2558
+    paddd           m12, m15
2559
+    psrld           m12, 5
2560
+    packusdw        m11, m12
2561
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 3, 2, 16
2562
+    ret
2563
+
2564
+cglobal intra_pred_ang32_7, 3,8,17
2565
+    add         r2,        128
2566
+    xor         r6d,       r6d
2567
+    lea         r3,        [ang_table_avx2 + 17 * 32]
2568
+    add         r1d,       r1d
2569
+    lea         r4,        [r1 * 3]
2570
+    lea         r7,        [r0 + 8 * r1]
2571
+    vbroadcasti32x8  m15,  [pd_16]
2572
+    call        ang16_mode_7_29
2573
+
2574
+    add         r2,        8
2575
+    lea         r0,        [r0 + 32]
2576
+
2577
+    call        ang32_mode_7_29
2578
+
2579
+    add         r2,        24
2580
+    lea         r0,        [r7 + 8 * r1]
2581
+
2582
+    call        ang16_mode_7_29
2583
+
2584
+    add         r2,        8
2585
+    lea         r0,        [r0 + 32]
2586
+
2587
+    call        ang32_mode_7_29
2588
+    RET
2589
+
2590
+cglobal intra_pred_ang32_29, 3,7,17
2591
+    xor         r6d,       r6d
2592
+    inc         r6d
2593
+    lea         r3,        [ang_table_avx2 + 17 * 32]
2594
+    add         r1d,       r1d
2595
+    lea         r4,        [r1 * 3]
2596
+    lea         r5,        [r0 + 32]
2597
+    vbroadcasti32x8  m15,  [pd_16]
2598
+    call        ang16_mode_7_29
2599
+
2600
+    add         r2,        8
2601
+
2602
+    call        ang32_mode_7_29
2603
+
2604
+    add         r2,        24
2605
+    mov         r0,        r5
2606
+
2607
+    call        ang16_mode_7_29
2608
+    add         r2,        8
2609
+    call        ang32_mode_7_29
2610
+    RET
2611
+cglobal intra_pred_ang16_7, 3,7,17
2612
+    add         r2,        64
2613
+    xor         r6d,       r6d
2614
+    vbroadcasti32x8  m15,  [pd_16]
2615
+    lea         r3,        [ang_table_avx2 + 17 * 32]
2616
+    add         r1d,       r1d
2617
+    lea         r4,        [r1 * 3]
2618
+
2619
+    call        ang16_mode_7_29
2620
+    RET
2621
+
2622
+cglobal intra_pred_ang16_29, 3,7,17
2623
+    xor         r6d,       r6d
2624
+    inc         r6d
2625
+    vbroadcasti32x8  m15,  [pd_16]
2626
+    lea         r3,        [ang_table_avx2 + 17 * 32]
2627
+    add         r1d,       r1d
2628
+    lea         r4,        [r1 * 3]
2629
+
2630
+    call        ang16_mode_7_29
2631
+    RET
2632
+;-------------------------------------------------------------------------------------------------------
2633
+; avx512 code for intra_pred_ang32 mode 2 to 34 end
2634
+;-------------------------------------------------------------------------------------------------------
2635
 %macro MODE_2_34 0
2636
     movu            m0, [r2 + 4]
2637
     movu            m1, [r2 + 20]
2638
x265_2.7.tar.gz/source/common/x86/ipfilter16.asm -> x265_2.9.tar.gz/source/common/x86/ipfilter16.asm Changed
9510
 
1
@@ -45,12 +45,33 @@
2
 %endif
3
 
4
 
5
-SECTION_RODATA 32
6
+SECTION_RODATA 64
7
 
8
 tab_c_524800:     times 4 dd 524800
9
 tab_c_n8192:      times 8 dw -8192
10
 pd_524800:        times 8 dd 524800
11
 
12
+tab_ChromaCoeff:  dw  0, 64,  0,  0
13
+                  dw -2, 58, 10, -2
14
+                  dw -4, 54, 16, -2
15
+                  dw -6, 46, 28, -4
16
+                  dw -4, 36, 36, -4
17
+                  dw -4, 28, 46, -6
18
+                  dw -2, 16, 54, -4
19
+                  dw -2, 10, 58, -2
20
+               
21
+tab_LumaCoeff:    dw   0, 0,  0,  64,  0,   0,  0,  0
22
+                  dw  -1, 4, -10, 58,  17, -5,  1,  0
23
+                  dw  -1, 4, -11, 40,  40, -11, 4, -1
24
+                  dw   0, 1, -5,  17,  58, -10, 4, -1
25
+
26
+ALIGN 64
27
+tab_LumaCoeffH_avx512:
28
+                  times 4 dw  0, 0,  0,  64,  0,   0,  0,  0
29
+                  times 4 dw  -1, 4, -10, 58,  17, -5,  1,  0
30
+                  times 4 dw  -1, 4, -11, 40,  40, -11, 4, -1
31
+                  times 4 dw   0, 1, -5,  17,  58, -10, 4, -1
32
+
33
 ALIGN 32
34
 tab_LumaCoeffV:   times 4 dw 0, 0
35
                   times 4 dw 0, 64
36
@@ -71,6 +92,7 @@
37
                   times 4 dw -5, 17
38
                   times 4 dw 58, -10
39
                   times 4 dw 4, -1
40
+
41
 ALIGN 32
42
 tab_LumaCoeffVer: times 8 dw 0, 0
43
                   times 8 dw 0, 64
44
@@ -91,7 +113,62 @@
45
                   times 8 dw -5, 17
46
                   times 8 dw 58, -10
47
                   times 8 dw 4, -1
48
-
49
+                 
50
+ALIGN 64
51
+const tab_ChromaCoeffV_avx512,  times 16 dw 0, 64
52
+                                times 16 dw 0, 0
53
+
54
+                                times 16 dw -2, 58
55
+                                times 16 dw 10, -2
56
+
57
+                                times 16 dw -4, 54
58
+                                times 16 dw 16, -2
59
+
60
+                                times 16 dw -6, 46
61
+                                times 16 dw 28, -4
62
+
63
+                                times 16 dw -4, 36
64
+                                times 16 dw 36, -4
65
+
66
+                                times 16 dw -4, 28
67
+                                times 16 dw 46, -6
68
+
69
+                                times 16 dw -2, 16
70
+                                times 16 dw 54, -4
71
+
72
+                                times 16 dw -2, 10
73
+                                times 16 dw 58, -2
74
+
75
+ALIGN 64
76
+tab_LumaCoeffVer_avx512: times 16 dw 0, 0
77
+                         times 16 dw 0, 64
78
+                         times 16 dw 0, 0
79
+                         times 16 dw 0, 0
80
+
81
+                         times 16 dw -1, 4
82
+                         times 16 dw -10, 58
83
+                         times 16 dw 17, -5
84
+                         times 16 dw 1, 0
85
+
86
+                         times 16 dw -1, 4
87
+                         times 16 dw -11, 40
88
+                         times 16 dw 40, -11
89
+                         times 16 dw 4, -1
90
+
91
+                         times 16 dw 0, 1
92
+                         times 16 dw -5, 17
93
+                         times 16 dw 58, -10
94
+                         times 16 dw 4, -1
95
+
96
+ALIGN 64
97
+const interp8_hpp_shuf1_load_avx512, times 4 db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
98
+
99
+ALIGN 64
100
+const interp8_hpp_shuf2_load_avx512, times 4 db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
101
+
102
+ALIGN 64
103
+const interp8_hpp_shuf1_store_avx512, times 4 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
104
+ 
105
 SECTION .text
106
 cextern pd_8
107
 cextern pd_32
108
@@ -246,6 +323,7 @@
109
 ;-------------------------------------------------------------------------------------------------------------
110
 ; void interp_8tap_vert_pp_%2x%3(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
111
 ;-------------------------------------------------------------------------------------------------------------
112
+%if ARCH_X86_64
113
     FILTER_VER_LUMA_sse2 pp, 4, 4
114
     FILTER_VER_LUMA_sse2 pp, 8, 8
115
     FILTER_VER_LUMA_sse2 pp, 8, 4
116
@@ -300,7 +378,570 @@
117
     FILTER_VER_LUMA_sse2 ps, 48, 64
118
     FILTER_VER_LUMA_sse2 ps, 64, 16
119
     FILTER_VER_LUMA_sse2 ps, 16, 64
120
+%endif
121
+
122
+;-----------------------------------------------------------------------------
123
+;p2s and p2s_aligned avx512 code start
124
+;-----------------------------------------------------------------------------
125
+%macro P2S_64x4_AVX512 0
126
+    movu       m0, [r0]
127
+    movu       m1, [r0 + r1]
128
+    movu       m2, [r0 + r1 * 2]
129
+    movu       m3, [r0 + r5]
130
+    psllw      m0, (14 - BIT_DEPTH)
131
+    psllw      m1, (14 - BIT_DEPTH)
132
+    psllw      m2, (14 - BIT_DEPTH)
133
+    psllw      m3, (14 - BIT_DEPTH)
134
+    psubw      m0, m4
135
+    psubw      m1, m4
136
+    psubw      m2, m4
137
+    psubw      m3, m4
138
+    movu       [r2], m0
139
+    movu       [r2 + r3], m1
140
+    movu       [r2 + r3 * 2], m2
141
+    movu       [r2 + r4], m3
142
+
143
+    movu       m0, [r0 + mmsize]
144
+    movu       m1, [r0 + r1 + mmsize]
145
+    movu       m2, [r0 + r1 * 2 + mmsize]
146
+    movu       m3, [r0 + r5 + mmsize]
147
+    psllw      m0, (14 - BIT_DEPTH)
148
+    psllw      m1, (14 - BIT_DEPTH)
149
+    psllw      m2, (14 - BIT_DEPTH)
150
+    psllw      m3, (14 - BIT_DEPTH)
151
+    psubw      m0, m4
152
+    psubw      m1, m4
153
+    psubw      m2, m4
154
+    psubw      m3, m4
155
+    movu       [r2 + mmsize], m0
156
+    movu       [r2 + r3 + mmsize], m1
157
+    movu       [r2 + r3 * 2 + mmsize], m2
158
+    movu       [r2 + r4 + mmsize], m3
159
+%endmacro
160
+
161
+%macro P2S_ALIGNED_64x4_AVX512 0
162
+    mova       m0, [r0]
163
+    mova       m1, [r0 + r1]
164
+    mova       m2, [r0 + r1 * 2]
165
+    mova       m3, [r0 + r5]
166
+    psllw      m0, (14 - BIT_DEPTH)
167
+    psllw      m1, (14 - BIT_DEPTH)
168
+    psllw      m2, (14 - BIT_DEPTH)
169
+    psllw      m3, (14 - BIT_DEPTH)
170
+    psubw      m0, m4
171
+    psubw      m1, m4
172
+    psubw      m2, m4
173
+    psubw      m3, m4
174
+    mova       [r2], m0
175
+    mova       [r2 + r3], m1
176
+    mova       [r2 + r3 * 2], m2
177
+    mova       [r2 + r4], m3
178
+
179
+    mova       m0, [r0 + mmsize]
180
+    mova       m1, [r0 + r1 + mmsize]
181
+    mova       m2, [r0 + r1 * 2 + mmsize]
182
+    mova       m3, [r0 + r5 + mmsize]
183
+    psllw      m0, (14 - BIT_DEPTH)
184
+    psllw      m1, (14 - BIT_DEPTH)
185
+    psllw      m2, (14 - BIT_DEPTH)
186
+    psllw      m3, (14 - BIT_DEPTH)
187
+    psubw      m0, m4
188
+    psubw      m1, m4
189
+    psubw      m2, m4
190
+    psubw      m3, m4
191
+    mova       [r2 + mmsize], m0
192
+    mova       [r2 + r3 + mmsize], m1
193
+    mova       [r2 + r3 * 2 + mmsize], m2
194
+    mova       [r2 + r4 + mmsize], m3
195
+%endmacro
196
+
197
+%macro P2S_32x4_AVX512 0
198
+    movu       m0, [r0]
199
+    movu       m1, [r0 + r1]
200
+    movu       m2, [r0 + r1 * 2]
201
+    movu       m3, [r0 + r5]
202
+    psllw      m0, (14 - BIT_DEPTH)
203
+    psllw      m1, (14 - BIT_DEPTH)
204
+    psllw      m2, (14 - BIT_DEPTH)
205
+    psllw      m3, (14 - BIT_DEPTH)
206
+    psubw      m0, m4
207
+    psubw      m1, m4
208
+    psubw      m2, m4
209
+    psubw      m3, m4
210
+    movu       [r2], m0
211
+    movu       [r2 + r3], m1
212
+    movu       [r2 + r3 * 2], m2
213
+    movu       [r2 + r4], m3
214
+%endmacro
215
+
216
+%macro P2S_ALIGNED_32x4_AVX512 0
217
+    mova       m0, [r0]
218
+    mova       m1, [r0 + r1]
219
+    mova       m2, [r0 + r1 * 2]
220
+    mova       m3, [r0 + r5]
221
+    psllw      m0, (14 - BIT_DEPTH)
222
+    psllw      m1, (14 - BIT_DEPTH)
223
+    psllw      m2, (14 - BIT_DEPTH)
224
+    psllw      m3, (14 - BIT_DEPTH)
225
+    psubw      m0, m4
226
+    psubw      m1, m4
227
+    psubw      m2, m4
228
+    psubw      m3, m4
229
+    mova       [r2], m0
230
+    mova       [r2 + r3], m1
231
+    mova       [r2 + r3 * 2], m2
232
+    mova       [r2 + r4], m3
233
+%endmacro
234
+
235
+%macro P2S_48x4_AVX512 0
236
+    movu       m0, [r0]
237
+    movu       m1, [r0 + r1]
238
+    movu       m2, [r0 + r1 * 2]
239
+    movu       m3, [r0 + r5]
240
+    psllw      m0, (14 - BIT_DEPTH)
241
+    psllw      m1, (14 - BIT_DEPTH)
242
+    psllw      m2, (14 - BIT_DEPTH)
243
+    psllw      m3, (14 - BIT_DEPTH)
244
+    psubw      m0, m4
245
+    psubw      m1, m4
246
+    psubw      m2, m4
247
+    psubw      m3, m4
248
+    movu       [r2], m0
249
+    movu       [r2 + r3], m1
250
+    movu       [r2 + r3 * 2], m2
251
+    movu       [r2 + r4], m3
252
+
253
+    movu       ym0, [r0 + mmsize]
254
+    movu       ym1, [r0 + r1 + mmsize]
255
+    movu       ym2, [r0 + r1 * 2 + mmsize]
256
+    movu       ym3, [r0 + r5 + mmsize]
257
+    psllw      ym0, (14 - BIT_DEPTH)
258
+    psllw      ym1, (14 - BIT_DEPTH)
259
+    psllw      ym2, (14 - BIT_DEPTH)
260
+    psllw      ym3, (14 - BIT_DEPTH)
261
+    psubw      ym0, ym4
262
+    psubw      ym1, ym4
263
+    psubw      ym2, ym4
264
+    psubw      ym3, ym4
265
+    movu       [r2 + mmsize], ym0
266
+    movu       [r2 + r3 + mmsize], ym1
267
+    movu       [r2 + r3 * 2 + mmsize], ym2
268
+    movu       [r2 + r4 + mmsize], ym3
269
+%endmacro
270
+
271
+%macro P2S_ALIGNED_48x4_AVX512 0
272
+    mova       m0, [r0]
273
+    mova       m1, [r0 + r1]
274
+    mova       m2, [r0 + r1 * 2]
275
+    mova       m3, [r0 + r5]
276
+    psllw      m0, (14 - BIT_DEPTH)
277
+    psllw      m1, (14 - BIT_DEPTH)
278
+    psllw      m2, (14 - BIT_DEPTH)
279
+    psllw      m3, (14 - BIT_DEPTH)
280
+    psubw      m0, m4
281
+    psubw      m1, m4
282
+    psubw      m2, m4
283
+    psubw      m3, m4
284
+    mova       [r2], m0
285
+    mova       [r2 + r3], m1
286
+    mova       [r2 + r3 * 2], m2
287
+    mova       [r2 + r4], m3
288
+
289
+    mova       ym0, [r0 + mmsize]
290
+    mova       ym1, [r0 + r1 + mmsize]
291
+    mova       ym2, [r0 + r1 * 2 + mmsize]
292
+    mova       ym3, [r0 + r5 + mmsize]
293
+    psllw      ym0, (14 - BIT_DEPTH)
294
+    psllw      ym1, (14 - BIT_DEPTH)
295
+    psllw      ym2, (14 - BIT_DEPTH)
296
+    psllw      ym3, (14 - BIT_DEPTH)
297
+    psubw      ym0, ym4
298
+    psubw      ym1, ym4
299
+    psubw      ym2, ym4
300
+    psubw      ym3, ym4
301
+    mova       [r2 + mmsize], ym0
302
+    mova       [r2 + r3 + mmsize], ym1
303
+    mova       [r2 + r3 * 2 + mmsize], ym2
304
+    mova       [r2 + r4 + mmsize], ym3
305
+%endmacro
306
+
307
+;-----------------------------------------------------------------------------
308
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
309
+;-----------------------------------------------------------------------------
310
+INIT_ZMM avx512
311
+cglobal filterPixelToShort_64x16, 4, 6, 5
312
+    add        r1d, r1d
313
+    add        r3d, r3d
314
+    lea        r4, [r3 * 3]
315
+    lea        r5, [r1 * 3]
316
+
317
+    ; load constant
318
+    vbroadcasti32x8    m4, [pw_2000]
319
+%rep 3
320
+    P2S_64x4_AVX512
321
+    lea        r0, [r0 + r1 * 4]
322
+    lea        r2, [r2 + r3 * 4]
323
+%endrep
324
+    P2S_64x4_AVX512
325
+    RET
326
+
327
+
328
+INIT_ZMM avx512
329
+cglobal filterPixelToShort_64x32, 4, 6, 5
330
+    add        r1d, r1d
331
+    add        r3d, r3d
332
+    lea        r4, [r3 * 3]
333
+    lea        r5, [r1 * 3]
334
+
335
+    ; load constant
336
+    vbroadcasti32x8    m4, [pw_2000]
337
+%rep 7
338
+    P2S_64x4_AVX512
339
+    lea        r0, [r0 + r1 * 4]
340
+    lea        r2, [r2 + r3 * 4]
341
+%endrep
342
+    P2S_64x4_AVX512
343
+    RET
344
+
345
+INIT_ZMM avx512
346
+cglobal filterPixelToShort_64x48, 4, 6, 5
347
+    add        r1d, r1d
348
+    add        r3d, r3d
349
+    lea        r4, [r3 * 3]
350
+    lea        r5, [r1 * 3]
351
+
352
+    ; load constant
353
+    vbroadcasti32x8    m4, [pw_2000]
354
+%rep 11
355
+    P2S_64x4_AVX512
356
+    lea        r0, [r0 + r1 * 4]
357
+    lea        r2, [r2 + r3 * 4]
358
+%endrep
359
+    P2S_64x4_AVX512
360
+    RET
361
+
362
+INIT_ZMM avx512
363
+cglobal filterPixelToShort_64x64, 4, 6, 5
364
+    add        r1d, r1d
365
+    add        r3d, r3d
366
+    lea        r4, [r3 * 3]
367
+    lea        r5, [r1 * 3]
368
+
369
+    ; load constant
370
+    vbroadcasti32x8    m4, [pw_2000]
371
+%rep 15
372
+    P2S_64x4_AVX512
373
+    lea        r0, [r0 + r1 * 4]
374
+    lea        r2, [r2 + r3 * 4]
375
+%endrep
376
+    P2S_64x4_AVX512
377
+    RET
378
+
379
+INIT_ZMM avx512
380
+cglobal filterPixelToShort_32x8, 4, 6, 5
381
+    add        r1d, r1d
382
+    add        r3d, r3d
383
+    lea        r4, [r3 * 3]
384
+    lea        r5, [r1 * 3]
385
+
386
+    ; load constant
387
+    vbroadcasti32x8    m4, [pw_2000]
388
+    P2S_32x4_AVX512
389
+    lea        r0, [r0 + r1 * 4]
390
+    lea        r2, [r2 + r3 * 4]
391
+    P2S_32x4_AVX512
392
+    RET
393
+
394
+INIT_ZMM avx512
395
+cglobal filterPixelToShort_32x16, 4, 6, 5
396
+    add        r1d, r1d
397
+    add        r3d, r3d
398
+    lea        r4, [r3 * 3]
399
+    lea        r5, [r1 * 3]
400
+
401
+    ; load constant
402
+    vbroadcasti32x8    m4, [pw_2000]
403
+%rep 3
404
+    P2S_32x4_AVX512
405
+    lea        r0, [r0 + r1 * 4]
406
+    lea        r2, [r2 + r3 * 4]
407
+%endrep
408
+    P2S_32x4_AVX512
409
+    RET
410
+
411
+INIT_ZMM avx512
412
+cglobal filterPixelToShort_32x24, 4, 6, 5
413
+    add        r1d, r1d
414
+    add        r3d, r3d
415
+    lea        r4, [r3 * 3]
416
+    lea        r5, [r1 * 3]
417
+
418
+    ; load constant
419
+    vbroadcasti32x8    m4, [pw_2000]
420
+%rep 5
421
+    P2S_32x4_AVX512
422
+    lea        r0, [r0 + r1 * 4]
423
+    lea        r2, [r2 + r3 * 4]
424
+%endrep
425
+    P2S_32x4_AVX512
426
+    RET
427
+
428
+INIT_ZMM avx512
429
+cglobal filterPixelToShort_32x32, 4, 6, 5
430
+    add        r1d, r1d
431
+    add        r3d, r3d
432
+    lea        r4, [r3 * 3]
433
+    lea        r5, [r1 * 3]
434
+
435
+    ; load constant
436
+    vbroadcasti32x8    m4, [pw_2000]
437
+%rep 7
438
+    P2S_32x4_AVX512
439
+    lea        r0, [r0 + r1 * 4]
440
+    lea        r2, [r2 + r3 * 4]
441
+%endrep
442
+    P2S_32x4_AVX512
443
+    RET
444
+
445
+INIT_ZMM avx512
446
+cglobal filterPixelToShort_32x48, 4, 6, 5
447
+    add        r1d, r1d
448
+    add        r3d, r3d
449
+    lea        r4, [r3 * 3]
450
+    lea        r5, [r1 * 3]
451
+
452
+    ; load constant
453
+    vbroadcasti32x8    m4, [pw_2000]
454
+%rep 11
455
+    P2S_32x4_AVX512
456
+    lea        r0, [r0 + r1 * 4]
457
+    lea        r2, [r2 + r3 * 4]
458
+%endrep
459
+    P2S_32x4_AVX512
460
+    RET
461
+
462
+INIT_ZMM avx512
463
+cglobal filterPixelToShort_32x64, 4, 6, 5
464
+    add        r1d, r1d
465
+    add        r3d, r3d
466
+    lea        r4, [r3 * 3]
467
+    lea        r5, [r1 * 3]
468
+
469
+    ; load constant
470
+    vbroadcasti32x8    m4, [pw_2000]
471
+%rep 15
472
+    P2S_32x4_AVX512
473
+    lea        r0, [r0 + r1 * 4]
474
+    lea        r2, [r2 + r3 * 4]
475
+%endrep
476
+    P2S_32x4_AVX512
477
+    RET
478
 
479
+INIT_ZMM avx512
480
+cglobal filterPixelToShort_48x64, 4, 6, 5
481
+    add        r1d, r1d
482
+    add        r3d, r3d
483
+    lea        r4, [r3 * 3]
484
+    lea        r5, [r1 * 3]
485
+
486
+    ; load constant
487
+    vbroadcasti32x8    m4, [pw_2000]
488
+%rep 15
489
+    P2S_48x4_AVX512
490
+    lea        r0, [r0 + r1 * 4]
491
+    lea        r2, [r2 + r3 * 4]
492
+%endrep
493
+    P2S_48x4_AVX512
494
+    RET
495
+
496
+INIT_ZMM avx512
497
+cglobal filterPixelToShort_aligned_64x16, 4, 6, 5
498
+    add        r1d, r1d
499
+    add        r3d, r3d
500
+    lea        r4, [r3 * 3]
501
+    lea        r5, [r1 * 3]
502
+
503
+    ; load constant
504
+    vbroadcasti32x8    m4, [pw_2000]
505
+%rep 3
506
+    P2S_ALIGNED_64x4_AVX512
507
+    lea        r0, [r0 + r1 * 4]
508
+    lea        r2, [r2 + r3 * 4]
509
+%endrep
510
+    P2S_ALIGNED_64x4_AVX512
511
+    RET
512
+
513
+
514
+INIT_ZMM avx512
515
+cglobal filterPixelToShort_aligned_64x32, 4, 6, 5
516
+    add        r1d, r1d
517
+    add        r3d, r3d
518
+    lea        r4, [r3 * 3]
519
+    lea        r5, [r1 * 3]
520
+
521
+    ; load constant
522
+    vbroadcasti32x8    m4, [pw_2000]
523
+%rep 7
524
+    P2S_ALIGNED_64x4_AVX512
525
+    lea        r0, [r0 + r1 * 4]
526
+    lea        r2, [r2 + r3 * 4]
527
+%endrep
528
+    P2S_ALIGNED_64x4_AVX512
529
+    RET
530
+
531
+INIT_ZMM avx512
532
+cglobal filterPixelToShort_aligned_64x48, 4, 6, 5
533
+    add        r1d, r1d
534
+    add        r3d, r3d
535
+    lea        r4, [r3 * 3]
536
+    lea        r5, [r1 * 3]
537
+
538
+    ; load constant
539
+    vbroadcasti32x8    m4, [pw_2000]
540
+%rep 11
541
+    P2S_ALIGNED_64x4_AVX512
542
+    lea        r0, [r0 + r1 * 4]
543
+    lea        r2, [r2 + r3 * 4]
544
+%endrep
545
+    P2S_ALIGNED_64x4_AVX512
546
+    RET
547
+
548
+INIT_ZMM avx512
549
+cglobal filterPixelToShort_aligned_64x64, 4, 6, 5
550
+    add        r1d, r1d
551
+    add        r3d, r3d
552
+    lea        r4, [r3 * 3]
553
+    lea        r5, [r1 * 3]
554
+
555
+    ; load constant
556
+    vbroadcasti32x8    m4, [pw_2000]
557
+%rep 15
558
+    P2S_ALIGNED_64x4_AVX512
559
+    lea        r0, [r0 + r1 * 4]
560
+    lea        r2, [r2 + r3 * 4]
561
+%endrep
562
+    P2S_ALIGNED_64x4_AVX512
563
+    RET
564
+
565
+INIT_ZMM avx512
566
+cglobal filterPixelToShort_aligned_32x8, 4, 6, 5
567
+    add        r1d, r1d
568
+    add        r3d, r3d
569
+    lea        r4, [r3 * 3]
570
+    lea        r5, [r1 * 3]
571
+
572
+    ; load constant
573
+    vbroadcasti32x8    m4, [pw_2000]
574
+    P2S_ALIGNED_32x4_AVX512
575
+    lea        r0, [r0 + r1 * 4]
576
+    lea        r2, [r2 + r3 * 4]
577
+    P2S_ALIGNED_32x4_AVX512
578
+    RET
579
+
580
+INIT_ZMM avx512
581
+cglobal filterPixelToShort_aligned_32x16, 4, 6, 5
582
+    add        r1d, r1d
583
+    add        r3d, r3d
584
+    lea        r4, [r3 * 3]
585
+    lea        r5, [r1 * 3]
586
+
587
+    ; load constant
588
+    vbroadcasti32x8    m4, [pw_2000]
589
+%rep 3
590
+    P2S_ALIGNED_32x4_AVX512
591
+    lea        r0, [r0 + r1 * 4]
592
+    lea        r2, [r2 + r3 * 4]
593
+%endrep
594
+    P2S_ALIGNED_32x4_AVX512
595
+    RET
596
+
597
+INIT_ZMM avx512
598
+cglobal filterPixelToShort_aligned_32x24, 4, 6, 5
599
+    add        r1d, r1d
600
+    add        r3d, r3d
601
+    lea        r4, [r3 * 3]
602
+    lea        r5, [r1 * 3]
603
+
604
+    ; load constant
605
+    vbroadcasti32x8    m4, [pw_2000]
606
+%rep 5
607
+    P2S_ALIGNED_32x4_AVX512
608
+    lea        r0, [r0 + r1 * 4]
609
+    lea        r2, [r2 + r3 * 4]
610
+%endrep
611
+    P2S_ALIGNED_32x4_AVX512
612
+    RET
613
+
614
+INIT_ZMM avx512
615
+cglobal filterPixelToShort_aligned_32x32, 4, 6, 5
616
+    add        r1d, r1d
617
+    add        r3d, r3d
618
+    lea        r4, [r3 * 3]
619
+    lea        r5, [r1 * 3]
620
+
621
+    ; load constant
622
+    vbroadcasti32x8    m4, [pw_2000]
623
+%rep 7
624
+    P2S_ALIGNED_32x4_AVX512
625
+    lea        r0, [r0 + r1 * 4]
626
+    lea        r2, [r2 + r3 * 4]
627
+%endrep
628
+    P2S_ALIGNED_32x4_AVX512
629
+    RET
630
+
631
+INIT_ZMM avx512
632
+cglobal filterPixelToShort_aligned_32x48, 4, 6, 5
633
+    add        r1d, r1d
634
+    add        r3d, r3d
635
+    lea        r4, [r3 * 3]
636
+    lea        r5, [r1 * 3]
637
+
638
+    ; load constant
639
+    vbroadcasti32x8    m4, [pw_2000]
640
+%rep 11
641
+    P2S_ALIGNED_32x4_AVX512
642
+    lea        r0, [r0 + r1 * 4]
643
+    lea        r2, [r2 + r3 * 4]
644
+%endrep
645
+    P2S_ALIGNED_32x4_AVX512
646
+    RET
647
+
648
+INIT_ZMM avx512
649
+cglobal filterPixelToShort_aligned_32x64, 4, 6, 5
650
+    add        r1d, r1d
651
+    add        r3d, r3d
652
+    lea        r4, [r3 * 3]
653
+    lea        r5, [r1 * 3]
654
+
655
+    ; load constant
656
+    vbroadcasti32x8    m4, [pw_2000]
657
+%rep 15
658
+    P2S_ALIGNED_32x4_AVX512
659
+    lea        r0, [r0 + r1 * 4]
660
+    lea        r2, [r2 + r3 * 4]
661
+%endrep
662
+    P2S_ALIGNED_32x4_AVX512
663
+    RET
664
+
665
+INIT_ZMM avx512
666
+cglobal filterPixelToShort_aligned_48x64, 4, 6, 5
667
+    add        r1d, r1d
668
+    add        r3d, r3d
669
+    lea        r4, [r3 * 3]
670
+    lea        r5, [r1 * 3]
671
+
672
+    ; load constant
673
+    vbroadcasti32x8    m4, [pw_2000]
674
+%rep 15
675
+    P2S_ALIGNED_48x4_AVX512
676
+    lea        r0, [r0 + r1 * 4]
677
+    lea        r2, [r2 + r3 * 4]
678
+%endrep
679
+    P2S_ALIGNED_48x4_AVX512
680
+    RET
681
+;-----------------------------------------------------------------------------------------------------------------------------
682
+;p2s and p2s_aligned avx512 code end
683
+;-----------------------------------------------------------------------------------------------------------------------------
684
 
685
 %macro PROCESS_LUMA_VER_W4_4R 0
686
     movq       m0, [r0]
687
@@ -4611,3 +5252,8822 @@
688
     jnz        .loop
689
     RET
690
 
691
+;-------------------------------------------------------------------------------------------------------------
692
+;ipfilter_chroma_avx512 code start
693
+;-------------------------------------------------------------------------------------------------------------
694
+;-------------------------------------------------------------------------------------------------------------
695
+; avx512 chroma_hpp code start
696
+;-------------------------------------------------------------------------------------------------------------
697
+%macro PROCESS_IPFILTER_CHROMA_PP_8x4_AVX512 0
698
+    ; register map
699
+    ; m0 , m1 interpolate coeff
700
+    ; m2 , m3  shuffle order table
701
+    ; m4 - pd_32
702
+    ; m5 - zero
703
+    ; m6 - pw_pixel_max
704
+
705
+    movu            xm7,       [r0]
706
+    vinserti32x4    m7,        [r0 + r1],      1
707
+    vinserti32x4    m7,        [r0 + 2 * r1],  2
708
+    vinserti32x4    m7,        [r0 + r6],      3
709
+
710
+    pshufb          m9,        m7,        m3
711
+    pshufb          m7,        m2
712
+    pmaddwd         m7,        m0
713
+    pmaddwd         m9,        m1
714
+    paddd           m7,        m9
715
+    paddd           m7,        m4
716
+    psrad           m7,        6
717
+
718
+    movu            xm8,       [r0 + 8]
719
+    vinserti32x4    m8,        [r0 + r1 + 8],      1
720
+    vinserti32x4    m8,        [r0 + 2 * r1 + 8],  2
721
+    vinserti32x4    m8,        [r0 + r6 + 8],      3
722
+
723
+    pshufb          m9,        m8,        m3
724
+    pshufb          m8,        m2
725
+    pmaddwd         m8,        m0
726
+    pmaddwd         m9,        m1
727
+    paddd           m8,        m9
728
+    paddd           m8,        m4
729
+    psrad           m8,        6
730
+
731
+    packusdw        m7,        m8
732
+    CLIPW           m7,        m5,        m6
733
+    pshufb          m7,        m10
734
+    movu            [r2],      xm7
735
+    vextracti32x4   [r2 + r3],     m7,        1
736
+    vextracti32x4   [r2 + 2 * r3], m7,        2
737
+    vextracti32x4   [r2 + r7],     m7,        3
738
+%endmacro
739
+
740
+%macro PROCESS_IPFILTER_CHROMA_PP_16x2_AVX512 0
741
+    ; register map
742
+    ; m0 , m1 interpolate coeff
743
+    ; m2 , m3  shuffle order table
744
+    ; m4 - pd_32
745
+    ; m5 - zero
746
+    ; m6 - pw_pixel_max
747
+
748
+    movu            ym7,       [r0]
749
+    vinserti32x8    m7,        [r0 + r1],      1
750
+    movu            ym8,       [r0 + 8]
751
+    vinserti32x8    m8,        [r0 + r1 + 8],  1
752
+
753
+    pshufb          m9,        m7,        m3
754
+    pshufb          m7,        m2
755
+    pmaddwd         m7,        m0
756
+    pmaddwd         m9,        m1
757
+    paddd           m7,        m9
758
+    paddd           m7,        m4
759
+    psrad           m7,        6
760
+
761
+    pshufb          m9,        m8,        m3
762
+    pshufb          m8,        m2
763
+    pmaddwd         m8,        m0
764
+    pmaddwd         m9,        m1
765
+    paddd           m8,        m9
766
+    paddd           m8,        m4
767
+    psrad           m8,        6
768
+
769
+    packusdw        m7,        m8
770
+    CLIPW           m7,        m5,        m6
771
+    pshufb          m7,        m10
772
+    movu            [r2],      ym7
773
+    vextracti32x8   [r2 + r3], m7,        1
774
+%endmacro
775
+
776
+%macro PROCESS_IPFILTER_CHROMA_PP_24x4_AVX512 0
777
+    ; register map
778
+    ; m0 , m1 interpolate coeff
779
+    ; m2 , m3  shuffle order table
780
+    ; m4 - pd_32
781
+    ; m5 - zero
782
+    ; m6 - pw_pixel_max
783
+
784
+    movu            ym7,       [r0]
785
+    vinserti32x8    m7,        [r0 + r1],      1
786
+    movu            ym8,       [r0 + 8]
787
+    vinserti32x8    m8,        [r0 + r1 + 8],  1
788
+
789
+    pshufb          m9,        m7,        m3
790
+    pshufb          m7,        m2
791
+    pmaddwd         m7,        m0
792
+    pmaddwd         m9,        m1
793
+    paddd           m7,        m9
794
+    paddd           m7,        m4
795
+    psrad           m7,        6
796
+
797
+    pshufb          m9,        m8,        m3
798
+    pshufb          m8,        m2
799
+    pmaddwd         m8,        m0
800
+    pmaddwd         m9,        m1
801
+    paddd           m8,        m9
802
+    paddd           m8,        m4
803
+    psrad           m8,        6
804
+
805
+    packusdw        m7,        m8
806
+    CLIPW           m7,        m5,        m6
807
+    pshufb          m7,        m10
808
+    movu            [r2],      ym7
809
+    vextracti32x8   [r2 + r3], m7,        1
810
+
811
+    movu            ym7,       [r0 + 2 * r1]
812
+    vinserti32x8    m7,        [r0 + r6],      1
813
+    movu            ym8,       [r0 + 2 * r1 + 8]
814
+    vinserti32x8    m8,        [r0 + r6 + 8],  1
815
+
816
+    pshufb          m9,        m7,        m3
817
+    pshufb          m7,        m2
818
+    pmaddwd         m7,        m0
819
+    pmaddwd         m9,        m1
820
+    paddd           m7,        m9
821
+    paddd           m7,        m4
822
+    psrad           m7,        6
823
+
824
+    pshufb          m9,        m8,        m3
825
+    pshufb          m8,        m2
826
+    pmaddwd         m8,        m0
827
+    pmaddwd         m9,        m1
828
+    paddd           m8,        m9
829
+    paddd           m8,        m4
830
+    psrad           m8,        6
831
+
832
+    packusdw        m7,        m8
833
+    CLIPW           m7,        m5,        m6
834
+    pshufb          m7,        m10
835
+    movu            [r2 + 2 * r3],        ym7
836
+    vextracti32x8   [r2 + r7], m7,        1
837
+
838
+    movu            xm7,       [r0 + mmsize/2]
839
+    vinserti32x4    m7,        [r0 + r1 + mmsize/2],      1
840
+    vinserti32x4    m7,        [r0 + 2 * r1 + mmsize/2],  2
841
+    vinserti32x4    m7,        [r0 + r6 + mmsize/2],      3
842
+
843
+    pshufb          m9,        m7,        m3
844
+    pshufb          m7,        m2
845
+    pmaddwd         m7,        m0
846
+    pmaddwd         m9,        m1
847
+    paddd           m7,        m9
848
+    paddd           m7,        m4
849
+    psrad           m7,        6
850
+
851
+    movu            xm8,       [r0 + mmsize/2 + 8]
852
+    vinserti32x4    m8,        [r0 + r1 + mmsize/2 + 8],      1
853
+    vinserti32x4    m8,        [r0 + 2 * r1 + mmsize/2 + 8],  2
854
+    vinserti32x4    m8,        [r0 + r6 + mmsize/2 + 8],      3
855
+
856
+    pshufb          m9,        m8,        m3
857
+    pshufb          m8,        m2
858
+    pmaddwd         m8,        m0
859
+    pmaddwd         m9,        m1
860
+    paddd           m8,        m9
861
+    paddd           m8,        m4
862
+    psrad           m8,        6
863
+
864
+    packusdw        m7,        m8
865
+    CLIPW           m7,        m5,        m6
866
+    pshufb          m7,        m10
867
+    movu            [r2 + mmsize/2],      xm7
868
+    vextracti32x4   [r2 + r3 + mmsize/2],     m7,        1
869
+    vextracti32x4   [r2 + 2 * r3 + mmsize/2], m7,        2
870
+    vextracti32x4   [r2 + r7 + mmsize/2],     m7,        3
871
+%endmacro
872
+
873
+%macro PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512 0
874
+    ; register map
875
+    ; m0 , m1 interpolate coeff
876
+    ; m2 , m3  shuffle order table
877
+    ; m4 - pd_32
878
+    ; m5 - zero
879
+    ; m6 - pw_pixel_max
880
+
881
+    movu            m7,        [r0]
882
+    movu            m8,        [r0 + 8]
883
+
884
+    pshufb          m9,        m7,        m3
885
+    pshufb          m7,        m2
886
+    pmaddwd         m7,        m0
887
+    pmaddwd         m9,        m1
888
+    paddd           m7,        m9
889
+    paddd           m7,        m4
890
+    psrad           m7,        6
891
+
892
+    pshufb          m9,        m8,        m3
893
+    pshufb          m8,        m2
894
+    pmaddwd         m8,        m0
895
+    pmaddwd         m9,        m1
896
+    paddd           m8,        m9
897
+    paddd           m8,        m4
898
+    psrad           m8,        6
899
+
900
+    packusdw        m7,        m8
901
+    CLIPW           m7,        m5,        m6
902
+    pshufb          m7,        m10
903
+    movu            [r2],      m7
904
+
905
+    movu            m7,        [r0 + r1]
906
+    movu            m8,        [r0 + r1 + 8]
907
+
908
+    pshufb          m9,        m7,        m3
909
+    pshufb          m7,        m2
910
+    pmaddwd         m7,        m0
911
+    pmaddwd         m9,        m1
912
+    paddd           m7,        m9
913
+    paddd           m7,        m4
914
+    psrad           m7,        6
915
+
916
+    pshufb          m9,        m8,        m3
917
+    pshufb          m8,        m2
918
+    pmaddwd         m8,        m0
919
+    pmaddwd         m9,        m1
920
+    paddd           m8,        m9
921
+    paddd           m8,        m4
922
+    psrad           m8,        6
923
+
924
+    packusdw        m7,        m8
925
+    CLIPW           m7,        m5,        m6
926
+    pshufb          m7,        m10
927
+    movu            [r2 + r3], m7
928
+%endmacro
929
+
930
+%macro PROCESS_IPFILTER_CHROMA_PP_48x2_AVX512 0
931
+    ; register map
932
+    ; m0 , m1 interpolate coeff
933
+    ; m2 , m3  shuffle order table
934
+    ; m4 - pd_32
935
+    ; m5 - zero
936
+    ; m6 - pw_pixel_max
937
+
938
+    movu            m7,        [r0]
939
+    movu            m8,        [r0 + 8]
940
+
941
+    pshufb          m9,        m7,        m3
942
+    pshufb          m7,        m2
943
+    pmaddwd         m7,        m0
944
+    pmaddwd         m9,        m1
945
+    paddd           m7,        m9
946
+    paddd           m7,        m4
947
+    psrad           m7,        6
948
+
949
+    pshufb          m9,        m8,        m3
950
+    pshufb          m8,        m2
951
+    pmaddwd         m8,        m0
952
+    pmaddwd         m9,        m1
953
+    paddd           m8,        m9
954
+    paddd           m8,        m4
955
+    psrad           m8,        6
956
+
957
+    packusdw        m7,        m8
958
+    CLIPW           m7,        m5,        m6
959
+    pshufb          m7,        m10
960
+    movu            [r2],      m7
961
+
962
+    movu            m7,        [r0 + r1]
963
+    movu            m8,        [r0 + r1 + 8]
964
+
965
+    pshufb          m9,        m7,        m3
966
+    pshufb          m7,        m2
967
+    pmaddwd         m7,        m0
968
+    pmaddwd         m9,        m1
969
+    paddd           m7,        m9
970
+    paddd           m7,        m4
971
+    psrad           m7,        6
972
+
973
+    pshufb          m9,        m8,        m3
974
+    pshufb          m8,        m2
975
+    pmaddwd         m8,        m0
976
+    pmaddwd         m9,        m1
977
+    paddd           m8,        m9
978
+    paddd           m8,        m4
979
+    psrad           m8,        6
980
+
981
+    packusdw        m7,        m8
982
+    CLIPW           m7,        m5,        m6
983
+    pshufb          m7,        m10
984
+    movu            [r2 + r3], m7
985
+
986
+    movu            ym7,       [r0 + mmsize]
987
+    vinserti32x8    m7,        [r0 + r1 + mmsize],     1
988
+    movu            ym8,       [r0 + mmsize + 8]
989
+    vinserti32x8    m8,        [r0 + r1 + mmsize + 8],  1
990
+
991
+    pshufb          m9,        m7,        m3
992
+    pshufb          m7,        m2
993
+    pmaddwd         m7,        m0
994
+    pmaddwd         m9,        m1
995
+    paddd           m7,        m9
996
+    paddd           m7,        m4
997
+    psrad           m7,        6
998
+
999
+    pshufb          m9,        m8,        m3
1000
+    pshufb          m8,        m2
1001
+    pmaddwd         m8,        m0
1002
+    pmaddwd         m9,        m1
1003
+    paddd           m8,        m9
1004
+    paddd           m8,        m4
1005
+    psrad           m8,        6
1006
+
1007
+    packusdw        m7,        m8
1008
+    CLIPW           m7,        m5,        m6
1009
+    pshufb          m7,        m10
1010
+    movu            [r2 + mmsize],      ym7
1011
+    vextracti32x8   [r2 + r3 + mmsize], m7,        1
1012
+%endmacro
1013
+
1014
+%macro PROCESS_IPFILTER_CHROMA_PP_64x2_AVX512 0
1015
+    ; register map
1016
+    ; m0 , m1 interpolate coeff
1017
+    ; m2 , m3  shuffle order table
1018
+    ; m4 - pd_32
1019
+    ; m5 - zero
1020
+    ; m6 - pw_pixel_max
1021
+
1022
+    movu            m7,        [r0]
1023
+    movu            m8,        [r0 + 8]
1024
+
1025
+    pshufb          m9,        m7,        m3
1026
+    pshufb          m7,        m2
1027
+    pmaddwd         m7,        m0
1028
+    pmaddwd         m9,        m1
1029
+    paddd           m7,        m9
1030
+    paddd           m7,        m4
1031
+    psrad           m7,        6
1032
+
1033
+    pshufb          m9,        m8,        m3
1034
+    pshufb          m8,        m2
1035
+    pmaddwd         m8,        m0
1036
+    pmaddwd         m9,        m1
1037
+    paddd           m8,        m9
1038
+    paddd           m8,        m4
1039
+    psrad           m8,        6
1040
+
1041
+    packusdw        m7,        m8
1042
+    CLIPW           m7,        m5,        m6
1043
+    pshufb          m7,        m10
1044
+    movu            [r2],      m7
1045
+
1046
+    movu            m7,        [r0 + mmsize]
1047
+    movu            m8,        [r0 + mmsize + 8]
1048
+
1049
+    pshufb          m9,        m7,        m3
1050
+    pshufb          m7,        m2
1051
+    pmaddwd         m7,        m0
1052
+    pmaddwd         m9,        m1
1053
+    paddd           m7,        m9
1054
+    paddd           m7,        m4
1055
+    psrad           m7,        6
1056
+
1057
+    pshufb          m9,        m8,        m3
1058
+    pshufb          m8,        m2
1059
+    pmaddwd         m8,        m0
1060
+    pmaddwd         m9,        m1
1061
+    paddd           m8,        m9
1062
+    paddd           m8,        m4
1063
+    psrad           m8,        6
1064
+
1065
+    packusdw        m7,        m8
1066
+    CLIPW           m7,        m5,        m6
1067
+    pshufb          m7,        m10
1068
+    movu            [r2 + mmsize],        m7
1069
+
1070
+    movu            m7,        [r0 + r1]
1071
+    movu            m8,        [r0 + r1 + 8]
1072
+
1073
+    pshufb          m9,        m7,        m3
1074
+    pshufb          m7,        m2
1075
+    pmaddwd         m7,        m0
1076
+    pmaddwd         m9,        m1
1077
+    paddd           m7,        m9
1078
+    paddd           m7,        m4
1079
+    psrad           m7,        6
1080
+
1081
+    pshufb          m9,        m8,        m3
1082
+    pshufb          m8,        m2
1083
+    pmaddwd         m8,        m0
1084
+    pmaddwd         m9,        m1
1085
+    paddd           m8,        m9
1086
+    paddd           m8,        m4
1087
+    psrad           m8,        6
1088
+
1089
+    packusdw        m7,        m8
1090
+    CLIPW           m7,        m5,        m6
1091
+    pshufb          m7,        m10
1092
+    movu            [r2 + r3], m7
1093
+
1094
+    movu            m7,        [r0 + r1 + mmsize]
1095
+    movu            m8,        [r0 + r1 + mmsize + 8]
1096
+
1097
+    pshufb          m9,        m7,        m3
1098
+    pshufb          m7,        m2
1099
+    pmaddwd         m7,        m0
1100
+    pmaddwd         m9,        m1
1101
+    paddd           m7,        m9
1102
+    paddd           m7,        m4
1103
+    psrad           m7,        6
1104
+
1105
+    pshufb          m9,        m8,        m3
1106
+    pshufb          m8,        m2
1107
+    pmaddwd         m8,        m0
1108
+    pmaddwd         m9,        m1
1109
+    paddd           m8,        m9
1110
+    paddd           m8,        m4
1111
+    psrad           m8,        6
1112
+
1113
+    packusdw        m7,        m8
1114
+    CLIPW           m7,        m5,        m6
1115
+    pshufb          m7,        m10
1116
+    movu            [r2 + r3 + mmsize],   m7
1117
+%endmacro
1118
+;-------------------------------------------------------------------------------------------------------------
1119
+; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx
1120
+;-------------------------------------------------------------------------------------------------------------
1121
+%if ARCH_X86_64
1122
+INIT_ZMM avx512
1123
+cglobal interp_4tap_horiz_pp_8x4, 5,8,11
1124
+    add             r1d, r1d
1125
+    add             r3d, r3d
1126
+    sub             r0, 2
1127
+    mov             r4d, r4m
1128
+    lea             r6, [3 * r1]
1129
+    lea             r7, [3 * r3]
1130
+%ifdef PIC
1131
+    lea             r5, [tab_ChromaCoeff]
1132
+    vpbroadcastd    m0, [r5 + r4 * 8]
1133
+    vpbroadcastd    m1, [r5 + r4 * 8 + 4]
1134
+%else
1135
+    vpbroadcastd    m0, [tab_ChromaCoeff + r4 * 8]
1136
+    vpbroadcastd    m1, [tab_ChromaCoeff + r4 * 8 + 4]
1137
+%endif
1138
+    vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
1139
+    vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
1140
+    vbroadcasti32x8 m4, [pd_32]
1141
+    pxor            m5, m5
1142
+    vbroadcasti32x8 m6, [pw_pixel_max]
1143
+    vbroadcasti32x8 m10, [interp8_hpp_shuf1_store_avx512]
1144
+
1145
+    PROCESS_IPFILTER_CHROMA_PP_8x4_AVX512
1146
+    RET
1147
+%endif
1148
+
1149
+%macro IPFILTER_CHROMA_AVX512_8xN 1
1150
+INIT_ZMM avx512
1151
+cglobal interp_4tap_horiz_pp_8x%1, 5,8,11
1152
+    add             r1d, r1d
1153
+    add             r3d, r3d
1154
+    sub             r0, 2
1155
+    mov             r4d, r4m
1156
+    lea             r6, [3 * r1]
1157
+    lea             r7, [3 * r3]
1158
+%ifdef PIC
1159
+    lea             r5, [tab_ChromaCoeff]
1160
+    vpbroadcastd    m0, [r5 + r4 * 8]
1161
+    vpbroadcastd    m1, [r5 + r4 * 8 + 4]
1162
+%else
1163
+    vpbroadcastd    m0, [tab_ChromaCoeff + r4 * 8]
1164
+    vpbroadcastd    m1, [tab_ChromaCoeff + r4 * 8 + 4]
1165
+%endif
1166
+    vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
1167
+    vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
1168
+    vbroadcasti32x8 m4, [pd_32]
1169
+    pxor            m5, m5
1170
+    vbroadcasti32x8 m6, [pw_pixel_max]
1171
+    vbroadcasti32x8 m10, [interp8_hpp_shuf1_store_avx512]
1172
+
1173
+%rep %1/4 - 1
1174
+    PROCESS_IPFILTER_CHROMA_PP_8x4_AVX512
1175
+    lea             r0, [r0 + 4 * r1]
1176
+    lea             r2, [r2 + 4 * r3]
1177
+%endrep
1178
+    PROCESS_IPFILTER_CHROMA_PP_8x4_AVX512
1179
+    RET
1180
+%endmacro
1181
+
1182
+%if ARCH_X86_64
1183
+IPFILTER_CHROMA_AVX512_8xN 8
1184
+IPFILTER_CHROMA_AVX512_8xN 12
1185
+IPFILTER_CHROMA_AVX512_8xN 16
1186
+IPFILTER_CHROMA_AVX512_8xN 32
1187
+IPFILTER_CHROMA_AVX512_8xN 64
1188
+%endif
1189
+
1190
+%macro IPFILTER_CHROMA_AVX512_16xN 1
1191
+INIT_ZMM avx512
1192
+cglobal interp_4tap_horiz_pp_16x%1, 5,6,11
1193
+    add             r1d, r1d
1194
+    add             r3d, r3d
1195
+    sub             r0, 2
1196
+    mov             r4d, r4m
1197
+%ifdef PIC
1198
+    lea             r5, [tab_ChromaCoeff]
1199
+    vpbroadcastd    m0, [r5 + r4 * 8]
1200
+    vpbroadcastd    m1, [r5 + r4 * 8 + 4]
1201
+%else
1202
+    vpbroadcastd    m0, [tab_ChromaCoeff + r4 * 8]
1203
+    vpbroadcastd    m1, [tab_ChromaCoeff + r4 * 8 + 4]
1204
+%endif
1205
+    vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
1206
+    vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
1207
+    vbroadcasti32x8 m4, [pd_32]
1208
+    pxor            m5, m5
1209
+    vbroadcasti32x8 m6, [pw_pixel_max]
1210
+    vbroadcasti32x8 m10, [interp8_hpp_shuf1_store_avx512]
1211
+
1212
+%rep %1/2 - 1
1213
+    PROCESS_IPFILTER_CHROMA_PP_16x2_AVX512
1214
+    lea             r0, [r0 + 2 * r1]
1215
+    lea             r2, [r2 + 2 * r3]
1216
+%endrep
1217
+    PROCESS_IPFILTER_CHROMA_PP_16x2_AVX512
1218
+    RET
1219
+%endmacro
1220
+
1221
+%if ARCH_X86_64
1222
+IPFILTER_CHROMA_AVX512_16xN 4
1223
+IPFILTER_CHROMA_AVX512_16xN 8
1224
+IPFILTER_CHROMA_AVX512_16xN 12
1225
+IPFILTER_CHROMA_AVX512_16xN 16
1226
+IPFILTER_CHROMA_AVX512_16xN 24
1227
+IPFILTER_CHROMA_AVX512_16xN 32
1228
+IPFILTER_CHROMA_AVX512_16xN 64
1229
+%endif
1230
+
1231
+; interp_4tap_horiz_pp_24x%1 (AVX-512): 4-tap horizontal chroma filter,
+; pel-to-pel.  %1 = block height; 4 rows are produced per loop iteration.
+; Strides (r1, r3) are doubled, so samples appear to be 16-bit — high
+; bit-depth build (clamped against pw_pixel_max below).
+%macro IPFILTER_CHROMA_AVX512_24xN 1
1232
+INIT_ZMM avx512
1233
+cglobal interp_4tap_horiz_pp_24x%1, 5,8,11
1234
+    add             r1d, r1d
1235
+    add             r3d, r3d
1236
+    ; back up src one 16-bit pixel — presumably to cover the leading
+    ; filter tap; NOTE(review): confirm against the 4-tap window layout.
+    sub             r0, 2
1237
+    mov             r4d, r4m
1238
+    lea             r6, [3 * r1]
1239
+    lea             r7, [3 * r3]
1240
+; broadcast the two halves of the 4-tap coefficient set for coeffIdx (r4)
+%ifdef PIC
1241
+    lea             r5, [tab_ChromaCoeff]
1242
+    vpbroadcastd    m0, [r5 + r4 * 8]
1243
+    vpbroadcastd    m1, [r5 + r4 * 8 + 4]
1244
+%else
1245
+    vpbroadcastd    m0, [tab_ChromaCoeff + r4 * 8]
1246
+    vpbroadcastd    m1, [tab_ChromaCoeff + r4 * 8 + 4]
1247
+%endif
1248
+; m2/m3/m10 = load/store shuffle masks, m4 = rounding constant (pd_32),
+; m5 = zero, m6 = clip ceiling — invariants consumed by the PROCESS macro
+    vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
1249
+    vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
1250
+    vbroadcasti32x8 m4, [pd_32]
1251
+    pxor            m5, m5
1252
+    vbroadcasti32x8 m6, [pw_pixel_max]
1253
+    vbroadcasti32x8 m10, [interp8_hpp_shuf1_store_avx512]
1254
+
1255
+; last iteration is peeled so the src/dst pointers are not advanced past
+; the end of the block
+%rep %1/4 - 1
1256
+    PROCESS_IPFILTER_CHROMA_PP_24x4_AVX512
1257
+    lea             r0, [r0 + 4 * r1]
1258
+    lea             r2, [r2 + 4 * r3]
1259
+%endrep
1260
+    PROCESS_IPFILTER_CHROMA_PP_24x4_AVX512
1261
+    RET
1262
+%endmacro
1263
+
1264
+%if ARCH_X86_64
1265
+IPFILTER_CHROMA_AVX512_24xN 32
1266
+IPFILTER_CHROMA_AVX512_24xN 64
1267
+%endif
1268
+
1269
+; interp_4tap_horiz_pp_32x%1 (AVX-512): 4-tap horizontal chroma filter,
+; pel-to-pel, 32-wide.  %1 = block height; 2 rows per loop iteration.
+; Same setup as the 24xN variant, but no r6/r7 row-triple pointers are
+; needed (only 2 rows per step), so just 6 GPRs are reserved.
+%macro IPFILTER_CHROMA_AVX512_32xN 1
1270
+INIT_ZMM avx512
1271
+cglobal interp_4tap_horiz_pp_32x%1, 5,6,11
1272
+    add             r1d, r1d
1273
+    add             r3d, r3d
1274
+    ; step src back one 16-bit pixel for the leading tap (see 24xN note)
+    sub             r0, 2
1275
+    mov             r4d, r4m
1276
+; broadcast coefficient pairs for coeffIdx (r4) from tab_ChromaCoeff
+%ifdef PIC
1277
+    lea             r5, [tab_ChromaCoeff]
1278
+    vpbroadcastd    m0, [r5 + r4 * 8]
1279
+    vpbroadcastd    m1, [r5 + r4 * 8 + 4]
1280
+%else
1281
+    vpbroadcastd    m0, [tab_ChromaCoeff + r4 * 8]
1282
+    vpbroadcastd    m1, [tab_ChromaCoeff + r4 * 8 + 4]
1283
+%endif
1284
+; invariants for PROCESS_IPFILTER_CHROMA_PP_32x2: shuffles, rounding,
+; zero, clip ceiling
+    vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
1285
+    vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
1286
+    vbroadcasti32x8 m4, [pd_32]
1287
+    pxor            m5, m5
1288
+    vbroadcasti32x8 m6, [pw_pixel_max]
1289
+    vbroadcasti32x8 m10, [interp8_hpp_shuf1_store_avx512]
1290
+
1291
+; final 2-row iteration peeled to skip the pointer advance
+%rep %1/2 - 1
1292
+    PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512
1293
+    lea             r0, [r0 + 2 * r1]
1294
+    lea             r2, [r2 + 2 * r3]
1295
+%endrep
1296
+    PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512
1297
+    RET
1298
+%endmacro
1299
+
1300
+%if ARCH_X86_64
1301
+IPFILTER_CHROMA_AVX512_32xN 8
1302
+IPFILTER_CHROMA_AVX512_32xN 16
1303
+IPFILTER_CHROMA_AVX512_32xN 24
1304
+IPFILTER_CHROMA_AVX512_32xN 32
1305
+IPFILTER_CHROMA_AVX512_32xN 48
1306
+IPFILTER_CHROMA_AVX512_32xN 64
1307
+%endif
1308
+
1309
+; interp_4tap_horiz_pp_64x%1 (AVX-512): 4-tap horizontal chroma filter,
+; pel-to-pel, 64-wide.  %1 = block height; 2 rows per loop iteration.
+; Structurally identical to the 32xN variant except for the 64-wide
+; PROCESS macro it invokes.
+%macro IPFILTER_CHROMA_AVX512_64xN 1
1310
+INIT_ZMM avx512
1311
+cglobal interp_4tap_horiz_pp_64x%1, 5,6,11
1312
+    add             r1d, r1d
1313
+    add             r3d, r3d
1314
+    ; step src back one 16-bit pixel for the leading tap
+    sub             r0, 2
1315
+    mov             r4d, r4m
1316
+; broadcast coefficient pairs for coeffIdx (r4)
+%ifdef PIC
1317
+    lea             r5, [tab_ChromaCoeff]
1318
+    vpbroadcastd    m0, [r5 + r4 * 8]
1319
+    vpbroadcastd    m1, [r5 + r4 * 8 + 4]
1320
+%else
1321
+    vpbroadcastd    m0, [tab_ChromaCoeff + r4 * 8]
1322
+    vpbroadcastd    m1, [tab_ChromaCoeff + r4 * 8 + 4]
1323
+%endif
1324
+; invariants consumed by PROCESS_IPFILTER_CHROMA_PP_64x2
+    vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
1325
+    vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
1326
+    vbroadcasti32x8 m4, [pd_32]
1327
+    pxor            m5, m5
1328
+    vbroadcasti32x8 m6, [pw_pixel_max]
1329
+    vbroadcasti32x8 m10, [interp8_hpp_shuf1_store_avx512]
1330
+
1331
+%rep %1/2 - 1
1332
+    PROCESS_IPFILTER_CHROMA_PP_64x2_AVX512
1333
+    lea             r0, [r0 + 2 * r1]
1334
+    lea             r2, [r2 + 2 * r3]
1335
+%endrep
1336
+    PROCESS_IPFILTER_CHROMA_PP_64x2_AVX512
1337
+    RET
1338
+%endmacro
1339
+
1340
+%if ARCH_X86_64
1341
+IPFILTER_CHROMA_AVX512_64xN 16
1342
+IPFILTER_CHROMA_AVX512_64xN 32
1343
+IPFILTER_CHROMA_AVX512_64xN 48
1344
+IPFILTER_CHROMA_AVX512_64xN 64
1345
+%endif
1346
+
1347
+; interp_4tap_horiz_pp_48x64 (AVX-512): 4-tap horizontal chroma filter,
+; pel-to-pel.  48x64 is the only 48-wide size, so it is written out
+; directly rather than via a height-parameterized macro; 31 unrolled
+; 2-row iterations plus one peeled tail cover all 64 rows.
+%if ARCH_X86_64
1348
+INIT_ZMM avx512
1349
+cglobal interp_4tap_horiz_pp_48x64, 5,6,11
1350
+    add             r1d, r1d
1351
+    add             r3d, r3d
1352
+    ; step src back one 16-bit pixel for the leading tap
+    sub             r0, 2
1353
+    mov             r4d, r4m
1354
+; broadcast coefficient pairs for coeffIdx (r4)
+%ifdef PIC
1355
+    lea             r5, [tab_ChromaCoeff]
1356
+    vpbroadcastd    m0, [r5 + r4 * 8]
1357
+    vpbroadcastd    m1, [r5 + r4 * 8 + 4]
1358
+%else
1359
+    vpbroadcastd    m0, [tab_ChromaCoeff + r4 * 8]
1360
+    vpbroadcastd    m1, [tab_ChromaCoeff + r4 * 8 + 4]
1361
+%endif
1362
+; invariants consumed by PROCESS_IPFILTER_CHROMA_PP_48x2
+    vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
1363
+    vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
1364
+    vbroadcasti32x8 m4, [pd_32]
1365
+    pxor            m5, m5
1366
+    vbroadcasti32x8 m6, [pw_pixel_max]
1367
+    vbroadcasti32x8 m10, [interp8_hpp_shuf1_store_avx512]
1368
+
1369
+%rep 31
1370
+    PROCESS_IPFILTER_CHROMA_PP_48x2_AVX512
1371
+    lea             r0, [r0 + 2 * r1]
1372
+    lea             r2, [r2 + 2 * r3]
1373
+%endrep
1374
+    PROCESS_IPFILTER_CHROMA_PP_48x2_AVX512
1375
+    RET
1376
+%endif
1377
+;-------------------------------------------------------------------------------------------------------------
1378
+; avx512 chroma_hpp code end
1379
+;-------------------------------------------------------------------------------------------------------------
1380
+;-------------------------------------------------------------------------------------------------------------
1381
+; avx512 chroma_vpp code start
1382
+;-------------------------------------------------------------------------------------------------------------
1383
+; One 8x8 tile of the 4-tap vertical chroma PP filter (16-bit samples).
+; In:  r0 = src (callers pre-offset it one row up via `sub r0, r1`),
+;      r1 = src stride (bytes), r2 = dst, r3 = dst stride, r5 = coeff
+;      table (two zmm-sized halves), r10 = 3*r1, r7 = 3*r3,
+;      m7 = rounding offset, m8 = clip ceiling.
+; Out: writes 8 rows of 8 pixels at r2, then advances r2 by 4*r3 halfway
+;      through the stores; r0 is NOT advanced (callers step it via r8).
+; Clobbers r6/r8/r9 (row-group pointers) and m0-m6.
+; Four non-adjacent source rows are packed per zmm (lanes 0..3 via
+; vinserti32x4); vertically adjacent rows are interleaved with
+; punpckl/hwd and multiply-accumulated with pmaddwd.
+%macro PROCESS_CHROMA_VERT_PP_8x8_AVX512 0
1384
+    movu                  xm1,                [r0]
1385
+    ; r6/r8/r9 = rows 2, 4 and 6 of the source tile
+    lea                   r6,                 [r0 + 2 * r1]
1386
+    lea                   r8,                 [r0 + 4 * r1]
1387
+    lea                   r9,                 [r8 + 2 * r1]
1388
+    vinserti32x4          m1,                 [r6],                1
1389
+    vinserti32x4          m1,                 [r8],                2
1390
+    vinserti32x4          m1,                 [r9],                3
1391
+    movu                  xm3,                [r0 + r1]
1392
+    vinserti32x4          m3,                 [r6 + r1],           1
1393
+    vinserti32x4          m3,                 [r8 + r1],           2
1394
+    vinserti32x4          m3,                 [r9 + r1],           3
1395
+    ; taps 0/1: rows n and n+1, low/high halves, first coefficient pair
+    punpcklwd             m0,                 m1,                  m3
1396
+    pmaddwd               m0,                 [r5]
1397
+    punpckhwd             m1,                 m3
1398
+    pmaddwd               m1,                 [r5]
1399
+
1400
+    movu                  xm4,                [r0 + 2 * r1]
1401
+    vinserti32x4          m4,                 [r6 + 2 * r1],       1
1402
+    vinserti32x4          m4,                 [r8 + 2 * r1],       2
1403
+    vinserti32x4          m4,                 [r9 + 2 * r1],       3
1404
+    punpcklwd             m2,                 m3,                  m4
1405
+    pmaddwd               m2,                 [r5]
1406
+    punpckhwd             m3,                 m4
1407
+    pmaddwd               m3,                 [r5]
1408
+
1409
+    ; taps 2/3 use the second coefficient half at [r5 + mmsize]
+    movu                  xm5,                [r0 + r10]
1410
+    vinserti32x4          m5,                 [r6 + r10],          1
1411
+    vinserti32x4          m5,                 [r8 + r10],          2
1412
+    vinserti32x4          m5,                 [r9 + r10],          3
1413
+    punpcklwd             m6,                 m4,                  m5
1414
+    pmaddwd               m6,                 [r5 + mmsize]
1415
+    paddd                 m0,                 m6
1416
+    punpckhwd             m4,                 m5
1417
+    pmaddwd               m4,                 [r5 + mmsize]
1418
+    paddd                 m1,                 m4
1419
+
1420
+    movu                  xm4,                [r0 + 4 * r1]
1421
+    vinserti32x4          m4,                 [r6 + 4 * r1],       1
1422
+    vinserti32x4          m4,                 [r8 + 4 * r1],       2
1423
+    vinserti32x4          m4,                 [r9 + 4 * r1],       3
1424
+    punpcklwd             m6,                 m5,                  m4
1425
+    pmaddwd               m6,                 [r5 + mmsize]
1426
+    paddd                 m2,                 m6
1427
+    punpckhwd             m5,                 m4
1428
+    pmaddwd               m5,                 [r5 + mmsize]
1429
+    paddd                 m3,                 m5
1430
+
1431
+    ; round, shift back to pixel precision, pack and clip to [0, m8]
+    paddd                 m0,                 m7
1432
+    paddd                 m1,                 m7
1433
+    paddd                 m2,                 m7
1434
+    paddd                 m3,                 m7
1435
+
1436
+    psrad                 m0,                 INTERP_SHIFT_PP
1437
+    psrad                 m1,                 INTERP_SHIFT_PP
1438
+    psrad                 m2,                 INTERP_SHIFT_PP
1439
+    psrad                 m3,                 INTERP_SHIFT_PP
1440
+
1441
+    packssdw              m0,                 m1
1442
+    packssdw              m2,                 m3
1443
+    pxor                  m5,                 m5
1444
+    CLIPW2                m0,                 m2,                  m5,                 m8
1445
+    ; scatter the four 128-bit lanes back to eight destination rows;
+    ; note r2 is left pointing at the tile's second half
+    movu                  [r2],               xm0
1446
+    movu                  [r2 + r3],          xm2
1447
+    vextracti32x4         [r2 + 2 * r3],      m0,                  1
1448
+    vextracti32x4         [r2 + r7],          m2,                  1
1449
+    lea                   r2,                 [r2 + 4 * r3]
1450
+    vextracti32x4         [r2],               m0,                  2
1451
+    vextracti32x4         [r2 + r3],          m2,                  2
1452
+    vextracti32x4         [r2 + 2 * r3],      m0,                  3
1453
+    vextracti32x4         [r2 + r7],          m2,                  3
1454
+%endmacro
1455
+
1456
+;-----------------------------------------------------------------------------------------------------------------
1457
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
1458
+;-----------------------------------------------------------------------------------------------------------------
1459
+; interp_4tap_vert_pp_8x8 (AVX-512): single-tile entry point for the
+; 8-wide 4-tap vertical chroma PP filter; all the work is done by
+; PROCESS_CHROMA_VERT_PP_8x8_AVX512.
+%if ARCH_X86_64
1460
+INIT_ZMM avx512
1461
+cglobal interp_4tap_vert_pp_8x8, 5, 11, 9
1462
+    add                   r1d,                r1d
1463
+    add                   r3d,                r3d
1464
+    ; start one row above the block so the first tap is covered
+    sub                   r0,                 r1
1465
+    ; coeffIdx * 128: byte offset of this filter's entry in the table
+    shl                   r4d,                7
1466
+
1467
+%ifdef PIC
1468
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
1469
+    lea                   r5,                 [r5 + r4]
1470
+%else
1471
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
1472
+%endif
1473
+    vbroadcasti32x8       m7,                 [INTERP_OFFSET_PP]
1474
+    vbroadcasti32x8       m8,                 [pw_pixel_max]
1475
+    lea                   r10,                [3 * r1]
1476
+    lea                   r7,                 [3 * r3]
1477
+    PROCESS_CHROMA_VERT_PP_8x8_AVX512
1478
+    RET
1479
+%endif
1480
+
1481
+; interp_4tap_vert_pp_8x%1 (AVX-512): loops the 8x8 tile macro down the
+; block.  Between tiles, src steps from r8 (= r0 + 4*r1, set inside the
+; macro) so r0 advances 8 rows total; dst only needs +4*r3 because the
+; macro itself already advanced r2 by 4*r3 during its stores.
+%macro FILTER_VER_PP_CHROMA_8xN_AVX512 1
1482
+INIT_ZMM avx512
1483
+cglobal interp_4tap_vert_pp_8x%1, 5, 11, 9
1484
+    add                   r1d,                r1d
1485
+    add                   r3d,                r3d
1486
+    sub                   r0,                 r1
1487
+    shl                   r4d,                7
1488
+
1489
+%ifdef PIC
1490
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
1491
+    lea                   r5,                 [r5 + r4]
1492
+%else
1493
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
1494
+%endif
1495
+    vbroadcasti32x8       m7,                 [INTERP_OFFSET_PP]
1496
+    vbroadcasti32x8       m8,                 [pw_pixel_max]
1497
+    lea                   r10,                [3 * r1]
1498
+    lea                   r7,                 [3 * r3]
1499
+%rep %1/8 - 1
1500
+    PROCESS_CHROMA_VERT_PP_8x8_AVX512
1501
+    lea                   r0,                 [r8 + 4 * r1]
1502
+    lea                   r2,                 [r2 + 4 * r3]
1503
+%endrep
1504
+    PROCESS_CHROMA_VERT_PP_8x8_AVX512
1505
+    RET
1506
+%endmacro
1507
+
1508
+%if ARCH_X86_64
1509
+FILTER_VER_PP_CHROMA_8xN_AVX512 16
1510
+FILTER_VER_PP_CHROMA_8xN_AVX512 32
1511
+FILTER_VER_PP_CHROMA_8xN_AVX512 64
1512
+%endif
1513
+
1514
+; One 16x4 tile of the 4-tap vertical chroma PP filter (16-bit samples).
+; In:  r0 = src (one row above the tile), r1 = src stride, r2 = dst,
+;      r3 = dst stride, r5 = coeff table, r7 = 3*r3, m7 = rounding
+;      offset, m8 = clip ceiling.
+; Two 16-pixel rows are packed per zmm (vinserti32x8 lane 1 = row+2).
+; Side effects: advances r0 by 2*r1 (callers add another 2*r1 between
+; tiles for the full 4-row step); r2 is NOT advanced.  Clobbers r6,
+; m0-m6.
+%macro PROCESS_CHROMA_VERT_PP_16x4_AVX512 0
1515
+    movu                  ym1,                [r0]
1516
+    lea                   r6,                 [r0 + 2 * r1]
1517
+    vinserti32x8          m1,                 [r6],                1
1518
+    movu                  ym3,                [r0 + r1]
1519
+    vinserti32x8          m3,                 [r6 + r1],           1
1520
+    ; taps 0/1 with the first coefficient pair
+    punpcklwd             m0,                 m1,                  m3
1521
+    pmaddwd               m0,                 [r5]
1522
+    punpckhwd             m1,                 m3
1523
+    pmaddwd               m1,                 [r5]
1524
+
1525
+    movu                  ym4,                [r0 + 2 * r1]
1526
+    vinserti32x8          m4,                 [r6 + 2 * r1],       1
1527
+    punpcklwd             m2,                 m3,                  m4
1528
+    pmaddwd               m2,                 [r5]
1529
+    punpckhwd             m3,                 m4
1530
+    pmaddwd               m3,                 [r5]
1531
+
1532
+    ; slide the row window down two rows before the second tap pair
+    lea                   r0,                 [r0 + 2 * r1]
1533
+    lea                   r6,                 [r6 + 2 * r1]
1534
+
1535
+    movu                  ym5,                [r0 + r1]
1536
+    vinserti32x8          m5,                 [r6 + r1],           1
1537
+    punpcklwd             m6,                 m4,                  m5
1538
+    pmaddwd               m6,                 [r5 + mmsize]
1539
+    paddd                 m0,                 m6
1540
+    punpckhwd             m4,                 m5
1541
+    pmaddwd               m4,                 [r5 + mmsize]
1542
+    paddd                 m1,                 m4
1543
+
1544
+    movu                  ym4,                [r0 + 2 * r1]
1545
+    vinserti32x8          m4,                 [r6 + 2 * r1],       1
1546
+    punpcklwd             m6,                 m5,                  m4
1547
+    pmaddwd               m6,                 [r5 + mmsize]
1548
+    paddd                 m2,                 m6
1549
+    punpckhwd             m5,                 m4
1550
+    pmaddwd               m5,                 [r5 + mmsize]
1551
+    paddd                 m3,                 m5
1552
+
1553
+    ; round, shift, pack to words and clip to [0, m8]
+    paddd                 m0,                 m7
1554
+    paddd                 m1,                 m7
1555
+    paddd                 m2,                 m7
1556
+    paddd                 m3,                 m7
1557
+
1558
+    psrad                 m0,                 INTERP_SHIFT_PP
1559
+    psrad                 m1,                 INTERP_SHIFT_PP
1560
+    psrad                 m2,                 INTERP_SHIFT_PP
1561
+    psrad                 m3,                 INTERP_SHIFT_PP
1562
+
1563
+    packssdw              m0,                 m1
1564
+    packssdw              m2,                 m3
1565
+    pxor                  m5,                 m5
1566
+    CLIPW2                m0,                 m2,                  m5,                 m8
1567
+    movu                  [r2],               ym0
1568
+    movu                  [r2 + r3],          ym2
1569
+    vextracti32x8         [r2 + 2 * r3],      m0,                  1
1570
+    vextracti32x8         [r2 + r7],          m2,                  1
1571
+%endmacro
1572
+
1573
+;-----------------------------------------------------------------------------------------------------------------
1574
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
1575
+;-----------------------------------------------------------------------------------------------------------------
1576
+; interp_4tap_vert_pp_16x4 (AVX-512): single-tile entry point for the
+; 16-wide 4-tap vertical chroma PP filter.
+%if ARCH_X86_64
1577
+INIT_ZMM avx512
1578
+cglobal interp_4tap_vert_pp_16x4, 5, 8, 9
1579
+    add                   r1d,                r1d
1580
+    add                   r3d,                r3d
1581
+    ; start one row above the block so the first tap is covered
+    sub                   r0,                 r1
1582
+    ; coeffIdx * 128: byte offset into tab_ChromaCoeffV_avx512
+    shl                   r4d,                7
1583
+
1584
+%ifdef PIC
1585
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
1586
+    lea                   r5,                 [r5 + r4]
1587
+%else
1588
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
1589
+%endif
1590
+    vbroadcasti32x8       m7,                 [INTERP_OFFSET_PP]
1591
+    vbroadcasti32x8       m8,                 [pw_pixel_max]
1592
+    lea                   r7,                 [3 * r3]
1593
+    PROCESS_CHROMA_VERT_PP_16x4_AVX512
1594
+    RET
1595
+%endif
1596
+
1597
+; interp_4tap_vert_pp_16x%1 (AVX-512): loops the 16x4 tile macro.
+; The tile macro already advanced r0 by 2*r1 internally, so only
+; another 2*r1 is added here to reach the next 4-row tile; dst moves
+; the full 4*r3 because the macro leaves r2 untouched.
+%macro FILTER_VER_PP_CHROMA_16xN_AVX512 1
1598
+INIT_ZMM avx512
1599
+cglobal interp_4tap_vert_pp_16x%1, 5, 8, 9
1600
+    add                   r1d,                r1d
1601
+    add                   r3d,                r3d
1602
+    sub                   r0,                 r1
1603
+    shl                   r4d,                7
1604
+
1605
+%ifdef PIC
1606
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
1607
+    lea                   r5,                 [r5 + r4]
1608
+%else
1609
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
1610
+%endif
1611
+    vbroadcasti32x8       m7,                 [INTERP_OFFSET_PP]
1612
+    vbroadcasti32x8       m8,                 [pw_pixel_max]
1613
+    lea                   r7,                 [3 * r3]
1614
+%rep %1/4 - 1
1615
+    PROCESS_CHROMA_VERT_PP_16x4_AVX512
1616
+    lea                   r0,                 [r0 + 2 * r1]
1617
+    lea                   r2,                 [r2 + 4 * r3]
1618
+%endrep
1619
+    PROCESS_CHROMA_VERT_PP_16x4_AVX512
1620
+    RET
1621
+%endmacro
1622
+
1623
+%if ARCH_X86_64
1624
+FILTER_VER_PP_CHROMA_16xN_AVX512 8
1625
+FILTER_VER_PP_CHROMA_16xN_AVX512 12
1626
+FILTER_VER_PP_CHROMA_16xN_AVX512 16
1627
+FILTER_VER_PP_CHROMA_16xN_AVX512 24
1628
+FILTER_VER_PP_CHROMA_16xN_AVX512 32
1629
+FILTER_VER_PP_CHROMA_16xN_AVX512 64
1630
+%endif
1631
+
1632
+; One 24x8 tile of the 4-tap vertical chroma PP filter (16-bit samples).
+; Split into two passes: the left 16 columns of all 8 rows (two rows per
+; zmm via vinserti32x8, register pairs m0..m3 / m9..m12 for the upper
+; and lower 4-row halves), then the right 8 columns (offset mmsize/2
+; bytes) with four rows per zmm via vinserti32x4, like the 8x8 macro.
+; In:  r0 = src (one row above tile), r1 = src stride, r2 = dst,
+;      r3 = dst stride, r5 = coeff table, r10 = 3*r1, r7 = 3*r3,
+;      m7 = rounding offset, m8 = clip ceiling.
+; Side effects: advances r2 by 4*r3 near the end; r0 unchanged (callers
+; step via r8 = r0 + 4*r1).  Clobbers r6/r8/r9/r11 and m0-m6, m9-m15.
+%macro PROCESS_CHROMA_VERT_PP_24x8_AVX512 0
1633
+    movu                  ym1,                [r0]
1634
+    ; r6/r8/r9 = rows 2, 4 and 6 of the tile
+    lea                   r6,                 [r0 + 2 * r1]
1635
+    lea                   r8,                 [r0 + 4 * r1]
1636
+    lea                   r9,                 [r8 + 2 * r1]
1637
+
1638
+    movu                  ym10,               [r8]
1639
+    movu                  ym3,                [r0 + r1]
1640
+    movu                  ym12,               [r8 + r1]
1641
+    vinserti32x8          m1,                 [r6],                1
1642
+    vinserti32x8          m10,                [r9],                1
1643
+    vinserti32x8          m3,                 [r6 + r1],           1
1644
+    vinserti32x8          m12,                [r9 + r1],           1
1645
+
1646
+    ; taps 0/1 for both 4-row halves, first coefficient pair
+    punpcklwd             m0,                 m1,                  m3
1647
+    punpcklwd             m9,                 m10,                 m12
1648
+    pmaddwd               m0,                 [r5]
1649
+    pmaddwd               m9,                 [r5]
1650
+    punpckhwd             m1,                 m3
1651
+    punpckhwd             m10,                m12
1652
+    pmaddwd               m1,                 [r5]
1653
+    pmaddwd               m10,                [r5]
1654
+
1655
+    movu                  ym4,                [r0 + 2 * r1]
1656
+    movu                  ym13,               [r8 + 2 * r1]
1657
+    vinserti32x8          m4,                 [r6 + 2 * r1],       1
1658
+    vinserti32x8          m13,                [r9 + 2 * r1],       1
1659
+    punpcklwd             m2,                 m3,                  m4
1660
+    punpcklwd             m11,                m12,                 m13
1661
+    pmaddwd               m2,                 [r5]
1662
+    pmaddwd               m11,                [r5]
1663
+    punpckhwd             m3,                 m4
1664
+    punpckhwd             m12,                m13
1665
+    pmaddwd               m3,                 [r5]
1666
+    pmaddwd               m12,                [r5]
1667
+
1668
+    ; taps 2/3 use the second coefficient half at [r5 + mmsize]
+    movu                  ym5,                [r0 + r10]
1669
+    vinserti32x8          m5,                 [r6 + r10],          1
1670
+    movu                  ym14,               [r8 + r10]
1671
+    vinserti32x8          m14,                [r9 + r10],          1
1672
+    punpcklwd             m6,                 m4,                  m5
1673
+    punpcklwd             m15,                m13,                 m14
1674
+    pmaddwd               m6,                 [r5 + mmsize]
1675
+    pmaddwd               m15,                [r5 + mmsize]
1676
+    paddd                 m0,                 m6
1677
+    paddd                 m9,                 m15
1678
+    punpckhwd             m4,                 m5
1679
+    punpckhwd             m13,                m14
1680
+    pmaddwd               m4,                 [r5 + mmsize]
1681
+    pmaddwd               m13,                [r5 + mmsize]
1682
+    paddd                 m1,                 m4
1683
+    paddd                 m10,                m13
1684
+
1685
+    movu                  ym4,                [r0 + 4 * r1]
1686
+    vinserti32x8          m4,                 [r6 + 4 * r1],       1
1687
+    movu                  ym13,               [r8 + 4 * r1]
1688
+    vinserti32x8          m13,                [r9 + 4 * r1],       1
1689
+    punpcklwd             m6,                 m5,                  m4
1690
+    punpcklwd             m15,                m14,                 m13
1691
+    pmaddwd               m6,                 [r5 + mmsize]
1692
+    pmaddwd               m15,                [r5 + mmsize]
1693
+    paddd                 m2,                 m6
1694
+    paddd                 m11,                m15
1695
+    punpckhwd             m5,                 m4
1696
+    punpckhwd             m14,                m13
1697
+    pmaddwd               m5,                 [r5 + mmsize]
1698
+    pmaddwd               m14,                [r5 + mmsize]
1699
+    paddd                 m3,                 m5
1700
+    paddd                 m12,                m14
1701
+
1702
+    ; round, shift, pack, clip all eight accumulators
+    paddd                 m0,                 m7
1703
+    paddd                 m1,                 m7
1704
+    paddd                 m2,                 m7
1705
+    paddd                 m3,                 m7
1706
+    paddd                 m9,                 m7
1707
+    paddd                 m10,                m7
1708
+    paddd                 m11,                m7
1709
+    paddd                 m12,                m7
1710
+
1711
+    psrad                 m0,                 INTERP_SHIFT_PP
1712
+    psrad                 m1,                 INTERP_SHIFT_PP
1713
+    psrad                 m2,                 INTERP_SHIFT_PP
1714
+    psrad                 m3,                 INTERP_SHIFT_PP
1715
+    psrad                 m9,                 INTERP_SHIFT_PP
1716
+    psrad                 m10,                INTERP_SHIFT_PP
1717
+    psrad                 m11,                INTERP_SHIFT_PP
1718
+    psrad                 m12,                INTERP_SHIFT_PP
1719
+
1720
+    packssdw              m0,                 m1
1721
+    packssdw              m2,                 m3
1722
+    packssdw              m9,                 m10
1723
+    packssdw              m11,                m12
1724
+    pxor                  m5,                 m5
1725
+    CLIPW2                m0,                 m2,                  m5,                 m8
1726
+    CLIPW2                m9,                 m11,                 m5,                 m8  
1727
+    ; store the left 16 columns of rows 0-3 (via r2) and 4-7 (via r11)
+    movu                  [r2],               ym0
1728
+    movu                  [r2 + r3],          ym2
1729
+    vextracti32x8         [r2 + 2 * r3],      m0,                  1
1730
+    vextracti32x8         [r2 + r7],          m2,                  1
1731
+    lea                   r11,                [r2 + 4 * r3]
1732
+    movu                  [r11],              ym9
1733
+    movu                  [r11 + r3],         ym11
1734
+    vextracti32x8         [r11 + 2 * r3],     m9,                  1
1735
+    vextracti32x8         [r11 + r7],         m11,                 1
1736
+
1737
+    ; second pass: remaining 8 columns (byte offset mmsize/2 = 32,
+    ; i.e. 16 pixels in), four rows per zmm as in the 8x8 macro
+    movu                  xm1,                [r0 + mmsize/2]
1738
+    vinserti32x4          m1,                 [r6 + mmsize/2],                1
1739
+    vinserti32x4          m1,                 [r8 + mmsize/2],                2
1740
+    vinserti32x4          m1,                 [r9 + mmsize/2],                3
1741
+    movu                  xm3,                [r0 + r1 + mmsize/2]
1742
+    vinserti32x4          m3,                 [r6 + r1 + mmsize/2],           1
1743
+    vinserti32x4          m3,                 [r8 + r1 + mmsize/2],           2
1744
+    vinserti32x4          m3,                 [r9 + r1 + mmsize/2],           3
1745
+    punpcklwd             m0,                 m1,                             m3
1746
+    pmaddwd               m0,                 [r5]
1747
+    punpckhwd             m1,                 m3
1748
+    pmaddwd               m1,                 [r5]
1749
+
1750
+    movu                  xm4,                [r0 + 2 * r1 + mmsize/2]
1751
+    vinserti32x4          m4,                 [r6 + 2 * r1 + mmsize/2],       1
1752
+    vinserti32x4          m4,                 [r8 + 2 * r1 + mmsize/2],       2
1753
+    vinserti32x4          m4,                 [r9 + 2 * r1 + mmsize/2],       3
1754
+    punpcklwd             m2,                 m3,                             m4
1755
+    pmaddwd               m2,                 [r5]
1756
+    punpckhwd             m3,                 m4
1757
+    pmaddwd               m3,                 [r5]
1758
+
1759
+    movu                  xm5,                [r0 + r10 + mmsize/2]
1760
+    vinserti32x4          m5,                 [r6 + r10 + mmsize/2],          1
1761
+    vinserti32x4          m5,                 [r8 + r10 + mmsize/2],          2
1762
+    vinserti32x4          m5,                 [r9 + r10 + mmsize/2],          3
1763
+    punpcklwd             m6,                 m4,                             m5
1764
+    pmaddwd               m6,                 [r5 + mmsize]
1765
+    paddd                 m0,                 m6
1766
+    punpckhwd             m4,                 m5
1767
+    pmaddwd               m4,                 [r5 + mmsize]
1768
+    paddd                 m1,                 m4
1769
+
1770
+    movu                  xm4,                [r0 + 4 * r1 + mmsize/2]
1771
+    vinserti32x4          m4,                 [r6 + 4 * r1 + mmsize/2],       1
1772
+    vinserti32x4          m4,                 [r8 + 4 * r1 + mmsize/2],       2
1773
+    vinserti32x4          m4,                 [r9 + 4 * r1 + mmsize/2],       3
1774
+    punpcklwd             m6,                 m5,                             m4
1775
+    pmaddwd               m6,                 [r5 + mmsize]
1776
+    paddd                 m2,                 m6
1777
+    punpckhwd             m5,                 m4
1778
+    pmaddwd               m5,                 [r5 + mmsize]
1779
+    paddd                 m3,                 m5
1780
+
1781
+    paddd                 m0,                 m7
1782
+    paddd                 m1,                 m7
1783
+    paddd                 m2,                 m7
1784
+    paddd                 m3,                 m7
1785
+
1786
+    psrad                 m0,                 INTERP_SHIFT_PP
1787
+    psrad                 m1,                 INTERP_SHIFT_PP
1788
+    psrad                 m2,                 INTERP_SHIFT_PP
1789
+    psrad                 m3,                 INTERP_SHIFT_PP
1790
+
1791
+    packssdw              m0,                 m1
1792
+    packssdw              m2,                 m3
1793
+    pxor                  m5,                 m5
1794
+    CLIPW2                m0,                 m2,        m5,                  m8
1795
+    ; store the right 8 columns; r2 is advanced here (tile rows 4-7)
+    movu                  [r2 + mmsize/2],               xm0
1796
+    movu                  [r2 + r3 + mmsize/2],          xm2
1797
+    vextracti32x4         [r2 + 2 * r3 + mmsize/2],      m0,                  1
1798
+    vextracti32x4         [r2 + r7 + mmsize/2],          m2,                  1
1799
+    lea                   r2,                            [r2 + 4 * r3]
1800
+    vextracti32x4         [r2 + mmsize/2],               m0,                  2
1801
+    vextracti32x4         [r2 + r3 + mmsize/2],          m2,                  2
1802
+    vextracti32x4         [r2 + 2 * r3 + mmsize/2],      m0,                  3
1803
+    vextracti32x4         [r2 + r7 + mmsize/2],          m2,                  3    
1804
+%endmacro
1805
+
1806
+; interp_4tap_vert_pp_24x%1 (AVX-512): loops the 24x8 tile macro.
+; Between tiles, src restarts from r8 (= r0 + 4*r1, set inside the
+; macro) for a net 8-row step; dst only adds 4*r3 because the macro
+; already advanced r2 by 4*r3 while storing.
+%macro FILTER_VER_PP_CHROMA_24xN_AVX512 1
1807
+INIT_ZMM avx512
1808
+cglobal interp_4tap_vert_pp_24x%1, 5, 12, 16
1809
+    add                   r1d,                r1d
1810
+    add                   r3d,                r3d
1811
+    ; start one row above the block so the first tap is covered
+    sub                   r0,                 r1
1812
+    ; coeffIdx * 128: byte offset into tab_ChromaCoeffV_avx512
+    shl                   r4d,                7
1813
+
1814
+%ifdef PIC
1815
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
1816
+    lea                   r5,                 [r5 + r4]
1817
+%else
1818
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
1819
+%endif
1820
+    vbroadcasti32x8       m7,                 [INTERP_OFFSET_PP]
1821
+    vbroadcasti32x8       m8,                 [pw_pixel_max]
1822
+    lea                   r10,                [3 * r1]
1823
+    lea                   r7,                 [3 * r3]
1824
+%rep %1/8 - 1
1825
+    PROCESS_CHROMA_VERT_PP_24x8_AVX512
1826
+    lea                   r0,                 [r8 + 4 * r1]
1827
+    lea                   r2,                 [r2 + 4 * r3]
1828
+%endrep
1829
+    PROCESS_CHROMA_VERT_PP_24x8_AVX512
1830
+    RET
1831
+%endmacro
1832
+
1833
+%if ARCH_X86_64
1834
+    FILTER_VER_PP_CHROMA_24xN_AVX512 32
1835
+    FILTER_VER_PP_CHROMA_24xN_AVX512 64
1836
+%endif
1837
+
1838
+; One 32x2 tile of the 4-tap vertical chroma PP filter (16-bit samples).
+; A full zmm holds one 32-pixel row, so no lane packing is needed.
+; In:  r0 = src (one row above the tile), r1 = src stride, r2 = dst,
+;      r3 = dst stride, r5 = coeff table, m7 = rounding offset,
+;      m8 = clip ceiling.
+; Side effects: advances r0 by 2*r1 (so callers only step r2 between
+; tiles).  Clobbers m0-m6.
+%macro PROCESS_CHROMA_VERT_PP_32x2_AVX512 0
1839
+    movu                  m1,                 [r0]
1840
+    movu                  m3,                 [r0 + r1]
1841
+    ; taps 0/1 for both output rows, first coefficient pair
+    punpcklwd             m0,                 m1,                  m3
1842
+    pmaddwd               m0,                 [r5]
1843
+    punpckhwd             m1,                 m3
1844
+    pmaddwd               m1,                 [r5]
1845
+
1846
+    movu                  m4,                 [r0 + 2 * r1]
1847
+    punpcklwd             m2,                 m3,                  m4
1848
+    pmaddwd               m2,                 [r5]
1849
+    punpckhwd             m3,                 m4
1850
+    pmaddwd               m3,                 [r5]
1851
+
1852
+    ; window slides down two rows; taps 2/3 use [r5 + mmsize]
+    lea                   r0,                 [r0 + 2 * r1]
1853
+    movu                  m5,                 [r0 + r1]
1854
+    punpcklwd             m6,                 m4,                  m5
1855
+    pmaddwd               m6,                 [r5 + mmsize]
1856
+    paddd                 m0,                 m6
1857
+    punpckhwd             m4,                 m5
1858
+    pmaddwd               m4,                 [r5 + mmsize]
1859
+    paddd                 m1,                 m4
1860
+
1861
+    movu                  m4,                 [r0 + 2 * r1]
1862
+    punpcklwd             m6,                 m5,                  m4
1863
+    pmaddwd               m6,                 [r5 + mmsize]
1864
+    paddd                 m2,                 m6
1865
+    punpckhwd             m5,                 m4
1866
+    pmaddwd               m5,                 [r5 + mmsize]
1867
+    paddd                 m3,                 m5
1868
+
1869
+    ; round, shift back to pixel precision, pack and clip to [0, m8]
+    paddd                 m0,                 m7
1870
+    paddd                 m1,                 m7
1871
+    paddd                 m2,                 m7
1872
+    paddd                 m3,                 m7
1873
+
1874
+    psrad                 m0,                 INTERP_SHIFT_PP
1875
+    psrad                 m1,                 INTERP_SHIFT_PP
1876
+    psrad                 m2,                 INTERP_SHIFT_PP
1877
+    psrad                 m3,                 INTERP_SHIFT_PP
1878
+
1879
+    packssdw              m0,                 m1
1880
+    packssdw              m2,                 m3
1881
+    pxor                  m5,                 m5
1882
+    CLIPW2                m0,                 m2,                  m5,                 m8
1883
+    movu                  [r2],               m0
1884
+    movu                  [r2 + r3],          m2
1885
+%endmacro
1886
+
1887
;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
; Vertical 4-tap chroma pixel-to-pixel (PP) filter for 32xN blocks of 16-bit
; samples, AVX-512. The body processes two output rows per iteration via
; PROCESS_CHROMA_VERT_PP_32x2_AVX512 (which advances r0 itself; only the
; destination pointer r2 is stepped here).
; %1 = block height N.
%macro FILTER_VER_PP_CHROMA_32xN_AVX512 1
INIT_ZMM avx512
cglobal interp_4tap_vert_pp_32x%1, 5, 7, 9
    add                   r1d,                r1d        ; strides are in samples; convert to bytes (16-bit)
    add                   r3d,                r3d
    sub                   r0,                 r1         ; back up one row: first tap reads row -1
    shl                   r4d,                7          ; coeffIdx * 128 = byte offset into coeff table

%ifdef PIC
    lea                   r5,                 [tab_ChromaCoeffV_avx512]
    lea                   r5,                 [r5 + r4]
%else
    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
%endif
    vbroadcasti32x8       m7,                 [INTERP_OFFSET_PP]   ; rounding offset added before the shift
    vbroadcasti32x8       m8,                 [pw_pixel_max]       ; clip ceiling for CLIPW2

%rep %1/2 - 1
    PROCESS_CHROMA_VERT_PP_32x2_AVX512
    lea                   r2,                 [r2 + 2 * r3]
%endrep
    PROCESS_CHROMA_VERT_PP_32x2_AVX512
    RET
%endmacro

%if ARCH_X86_64
FILTER_VER_PP_CHROMA_32xN_AVX512 8
FILTER_VER_PP_CHROMA_32xN_AVX512 16
FILTER_VER_PP_CHROMA_32xN_AVX512 24
FILTER_VER_PP_CHROMA_32xN_AVX512 32
FILTER_VER_PP_CHROMA_32xN_AVX512 48
FILTER_VER_PP_CHROMA_32xN_AVX512 64
%endif
1923
+
1924
; One 48x4 tile of the vertical 4-tap chroma PP filter (16-bit samples), AVX-512.
; Caller contract: r5 -> coeff table ([r5] = taps 0/1, [r5+mmsize] = taps 2/3),
; r7 = 3*r1, r8 = 3*r3, m7 = rounding offset, m8 = pw_pixel_max, m16 = 0 (clip
; floor). r6 is clobbered (set to r0 + 2*r1, the base of the second row pair).
; Layout: the left 32 samples of rows 0/1 live in m0..m3 and of rows 2/3 in
; m9..m12; the right 16 samples are handled afterwards with both row pairs
; packed into one zmm (low ymm = rows off r0, high ymm = rows off r6).
%macro PROCESS_CHROMA_VERT_PP_48x4_AVX512 0
    movu                  m1,                 [r0]
    lea                   r6,                 [r0 + 2 * r1]
    movu                  m10,                [r6]
    movu                  m3,                 [r0 + r1]
    movu                  m12,                [r6 + r1]
    punpcklwd             m0,                 m1,                  m3
    punpcklwd             m9,                 m10,                 m12
    pmaddwd               m0,                 [r5]
    pmaddwd               m9,                 [r5]
    punpckhwd             m1,                 m3
    punpckhwd             m10,                m12
    pmaddwd               m1,                 [r5]
    pmaddwd               m10,                [r5]

    movu                  m4,                 [r0 + 2 * r1]
    movu                  m13,                [r6 + 2 * r1]
    punpcklwd             m2,                 m3,                  m4
    punpcklwd             m11,                m12,                 m13
    pmaddwd               m2,                 [r5]
    pmaddwd               m11,                [r5]
    punpckhwd             m3,                 m4
    punpckhwd             m12,                m13
    pmaddwd               m3,                 [r5]
    pmaddwd               m12,                [r5]

    ; rows +3 (r7 = 3*r1): second coefficient pair, accumulate
    movu                  m5,                 [r0 + r7]
    movu                  m14,                [r6 + r7]
    punpcklwd             m6,                 m4,                  m5
    punpcklwd             m15,                m13,                 m14
    pmaddwd               m6,                 [r5 + mmsize]
    pmaddwd               m15,                [r5 + mmsize]
    paddd                 m0,                 m6
    paddd                 m9,                 m15
    punpckhwd             m4,                 m5
    punpckhwd             m13,                m14
    pmaddwd               m4,                 [r5 + mmsize]
    pmaddwd               m13,                [r5 + mmsize]
    paddd                 m1,                 m4
    paddd                 m10,                m13

    movu                  m4,                 [r0 + 4 * r1]
    movu                  m13,                [r6 + 4 * r1]
    punpcklwd             m6,                 m5,                  m4
    punpcklwd             m15,                m14,                 m13
    pmaddwd               m6,                 [r5 + mmsize]
    pmaddwd               m15,                [r5 + mmsize]
    paddd                 m2,                 m6
    paddd                 m11,                m15
    punpckhwd             m5,                 m4
    punpckhwd             m14,                m13
    pmaddwd               m5,                 [r5 + mmsize]
    pmaddwd               m14,                [r5 + mmsize]
    paddd                 m3,                 m5
    paddd                 m12,                m14

    ; round, shift, pack to 16-bit and clip the left 32 samples of all 4 rows
    paddd                 m0,                 m7
    paddd                 m1,                 m7
    paddd                 m2,                 m7
    paddd                 m3,                 m7
    paddd                 m9,                 m7
    paddd                 m10,                m7
    paddd                 m11,                m7
    paddd                 m12,                m7

    psrad                 m0,                 INTERP_SHIFT_PP
    psrad                 m1,                 INTERP_SHIFT_PP
    psrad                 m2,                 INTERP_SHIFT_PP
    psrad                 m3,                 INTERP_SHIFT_PP
    psrad                 m9,                 INTERP_SHIFT_PP
    psrad                 m10,                INTERP_SHIFT_PP
    psrad                 m11,                INTERP_SHIFT_PP
    psrad                 m12,                INTERP_SHIFT_PP

    packssdw              m0,                 m1
    packssdw              m2,                 m3
    packssdw              m9,                 m10
    packssdw              m11,                m12
    CLIPW2                m0,                 m2,                 m16,                 m8
    CLIPW2                m9,                 m11,                m16,                 m8
    movu                  [r2],               m0
    movu                  [r2 + r3],          m2
    movu                  [r2 + 2 * r3],      m9
    movu                  [r2 + r8],          m11

    ; right 16 samples: rows 0/1 (off r0) in the low ymm, rows 2/3 (off r6)
    ; in the high ymm of each zmm, so one pass covers all 4 rows
    movu                  ym1,                [r0 + mmsize]
    vinserti32x8          m1,                 [r6 + mmsize],       1
    movu                  ym3,                [r0 + r1 + mmsize]
    vinserti32x8          m3,                 [r6 + r1 + mmsize],  1
    punpcklwd             m0,                 m1,                  m3
    pmaddwd               m0,                 [r5]
    punpckhwd             m1,                 m3
    pmaddwd               m1,                 [r5]

    movu                  ym4,                [r0 + 2 * r1 + mmsize]
    vinserti32x8          m4,                 [r6 + 2 * r1 + mmsize],  1
    punpcklwd             m2,                 m3,                  m4
    pmaddwd               m2,                 [r5]
    punpckhwd             m3,                 m4
    pmaddwd               m3,                 [r5]

    movu                  ym5,                [r0 + r7 + mmsize]
    vinserti32x8          m5,                 [r6 + r7 + mmsize],  1
    punpcklwd             m6,                 m4,                  m5
    pmaddwd               m6,                 [r5 + mmsize]
    paddd                 m0,                 m6
    punpckhwd             m4,                 m5
    pmaddwd               m4,                 [r5 + mmsize]
    paddd                 m1,                 m4

    movu                  ym4,                [r0 + 4 * r1 + mmsize]
    vinserti32x8          m4,                 [r6 + 4 * r1 + mmsize],  1
    punpcklwd             m6,                 m5,                  m4
    pmaddwd               m6,                 [r5 + mmsize]
    paddd                 m2,                 m6
    punpckhwd             m5,                 m4
    pmaddwd               m5,                 [r5 + mmsize]
    paddd                 m3,                 m5

    paddd                 m0,                 m7
    paddd                 m1,                 m7
    paddd                 m2,                 m7
    paddd                 m3,                 m7

    psrad                 m0,                 INTERP_SHIFT_PP
    psrad                 m1,                 INTERP_SHIFT_PP
    psrad                 m2,                 INTERP_SHIFT_PP
    psrad                 m3,                 INTERP_SHIFT_PP

    packssdw              m0,                 m1
    packssdw              m2,                 m3
    CLIPW2                m0,                 m2,                 m16,                 m8
    ; low ymm -> rows 0/1, high ymm -> rows 2/3 of the right 16 samples
    movu                  [r2 + mmsize],               ym0
    movu                  [r2 + r3 + mmsize],          ym2
    vextracti32x8         [r2 + 2 * r3 + mmsize],      m0,                  1
    vextracti32x8         [r2 + r8 + mmsize],          m2,                  1
%endmacro
2061
+
2062
%if ARCH_X86_64
; Vertical 4-tap chroma PP filter, 48x64 (16-bit samples), AVX-512.
; Sets up the registers PROCESS_CHROMA_VERT_PP_48x4_AVX512 expects
; (r5 coeff base, r7 = 3*srcStride, r8 = 3*dstStride, m7 offset,
; m8 pw_pixel_max, m16 = 0) and runs 16 tiles of 4 rows.
INIT_ZMM avx512
cglobal interp_4tap_vert_pp_48x64, 5, 9, 17
    add                   r1d,                r1d        ; sample strides -> byte strides
    add                   r3d,                r3d
    sub                   r0,                 r1         ; first tap reads row -1
    shl                   r4d,                7          ; coeffIdx * 128 = table offset
%ifdef PIC
    lea                   r5,                 [tab_ChromaCoeffV_avx512]
    lea                   r5,                 [r5 + r4]
%else
    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
%endif
    lea                   r7,                 [3 * r1]
    lea                   r8,                 [3 * r3]
    vbroadcasti32x8       m7,                 [INTERP_OFFSET_PP]
    vbroadcasti32x8       m8,                 [pw_pixel_max]
    pxor                  m16,                m16        ; clip floor

%rep 15
    PROCESS_CHROMA_VERT_PP_48x4_AVX512
    lea                   r0,                 [r0 + 4 * r1]
    lea                   r2,                 [r2 + 4 * r3]
%endrep
    PROCESS_CHROMA_VERT_PP_48x4_AVX512       ; 16th tile: 16 * 4 = 64 rows
    RET
%endif
2089
+
2090
; Two rows of the vertical 4-tap chroma PP filter for 64-wide blocks
; (16-bit samples), AVX-512. The 64 samples per row are split into two
; 32-sample halves: left half accumulated in m0..m3, right half ([r0+mmsize])
; in m8..m11. Advances r0 by two rows; caller advances r2.
; Caller contract: r5 -> coeff table, m7 = rounding offset, m15 = pw_pixel_max.
%macro PROCESS_CHROMA_VERT_PP_64x2_AVX512 0
    movu                 m1,                  [r0]
    movu                 m3,                  [r0 + r1]
    punpcklwd            m0,                  m1,                     m3
    pmaddwd              m0,                  [r5]
    punpckhwd            m1,                  m3
    pmaddwd              m1,                  [r5]

    movu                 m9,                  [r0 + mmsize]
    movu                 m11,                 [r0 + r1 + mmsize]
    punpcklwd            m8,                  m9,                     m11
    pmaddwd              m8,                  [r5]
    punpckhwd            m9,                  m11
    pmaddwd              m9,                  [r5]

    movu                 m4,                  [r0 + 2 * r1]
    punpcklwd            m2,                  m3,                     m4
    pmaddwd              m2,                  [r5]
    punpckhwd            m3,                  m4
    pmaddwd              m3,                  [r5]

    movu                 m12,                 [r0 + 2 * r1 + mmsize]
    punpcklwd            m10,                 m11,                    m12
    pmaddwd              m10,                 [r5]
    punpckhwd            m11,                 m12
    pmaddwd              m11,                 [r5]

    ; advance two rows; remaining taps use the second coefficient pair
    lea                  r0,                  [r0 + 2 * r1]
    movu                 m5,                  [r0 + r1]
    punpcklwd            m6,                  m4,                     m5
    pmaddwd              m6,                  [r5 + 1 * mmsize]
    paddd                m0,                  m6
    punpckhwd            m4,                  m5
    pmaddwd              m4,                  [r5 + 1 * mmsize]
    paddd                m1,                  m4

    movu                 m13,                 [r0 + r1 + mmsize]
    punpcklwd            m14,                 m12,                    m13
    pmaddwd              m14,                 [r5 + 1 * mmsize]
    paddd                m8,                  m14
    punpckhwd            m12,                 m13
    pmaddwd              m12,                 [r5 + 1 * mmsize]
    paddd                m9,                  m12

    movu                 m4,                  [r0 + 2 * r1]
    punpcklwd            m6,                  m5,                     m4
    pmaddwd              m6,                  [r5 + 1 * mmsize]
    paddd                m2,                  m6
    punpckhwd            m5,                  m4
    pmaddwd              m5,                  [r5 + 1 * mmsize]
    paddd                m3,                  m5

    movu                 m12,                 [r0 + 2 * r1 + mmsize]
    punpcklwd            m14,                 m13,                    m12
    pmaddwd              m14,                 [r5 + 1 * mmsize]
    paddd                m10,                 m14
    punpckhwd            m13,                 m12
    pmaddwd              m13,                 [r5 + 1 * mmsize]
    paddd                m11,                 m13

    ; round, shift, pack and clip both rows (left and right halves)
    paddd                m0,                  m7
    paddd                m1,                  m7
    paddd                m2,                  m7
    paddd                m3,                  m7
    paddd                m8,                  m7
    paddd                m9,                  m7
    paddd                m10,                 m7
    paddd                m11,                 m7

    psrad                m0,                  INTERP_SHIFT_PP
    psrad                m1,                  INTERP_SHIFT_PP
    psrad                m2,                  INTERP_SHIFT_PP
    psrad                m3,                  INTERP_SHIFT_PP
    psrad                m8,                  INTERP_SHIFT_PP
    psrad                m9,                  INTERP_SHIFT_PP
    psrad                m10,                 INTERP_SHIFT_PP
    psrad                m11,                 INTERP_SHIFT_PP

    packssdw             m0,                  m1
    packssdw             m2,                  m3
    packssdw             m8,                  m9
    packssdw             m10,                 m11
    pxor                 m5,                  m5        ; clip floor (m5 is dead by now)
    CLIPW2               m0,                  m2,                  m5,                 m15
    CLIPW2               m8,                  m10,                 m5,                 m15
    movu                 [r2],                m0
    movu                 [r2 + r3],           m2
    movu                 [r2 + mmsize],       m8
    movu                 [r2 + r3 + mmsize],  m10
%endmacro
2180
+
2181
;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
; Vertical 4-tap chroma PP filter for 64xN blocks of 16-bit samples, AVX-512.
; Two output rows per iteration via PROCESS_CHROMA_VERT_PP_64x2_AVX512
; (which advances r0 itself; only r2 is stepped here). %1 = height N.
%macro FILTER_VER_PP_CHROMA_64xN_AVX512 1
INIT_ZMM avx512
cglobal interp_4tap_vert_pp_64x%1, 5, 7, 16
    add                   r1d,                r1d        ; sample strides -> byte strides
    add                   r3d,                r3d
    sub                   r0,                 r1         ; first tap reads row -1
    shl                   r4d,                7          ; coeffIdx * 128 = table offset

%ifdef PIC
    lea                   r5,                 [tab_ChromaCoeffV_avx512]
    lea                   r5,                 [r5 + r4]
%else
    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
%endif
    vbroadcasti32x8       m7,                 [INTERP_OFFSET_PP]
    vbroadcasti32x8       m15,                [pw_pixel_max]

%rep %1/2 - 1
    PROCESS_CHROMA_VERT_PP_64x2_AVX512
    lea                   r2,                 [r2 + 2 * r3]
%endrep
    PROCESS_CHROMA_VERT_PP_64x2_AVX512
    RET
%endmacro

%if ARCH_X86_64
FILTER_VER_PP_CHROMA_64xN_AVX512 16
FILTER_VER_PP_CHROMA_64xN_AVX512 32
FILTER_VER_PP_CHROMA_64xN_AVX512 48
FILTER_VER_PP_CHROMA_64xN_AVX512 64
%endif
2215
+;-------------------------------------------------------------------------------------------------------------
2216
+; avx512 chroma_vpp code end
2217
+;-------------------------------------------------------------------------------------------------------------
2218
+;-------------------------------------------------------------------------------------------------------------
2219
+; avx512 chroma_hps code start
2220
+;-------------------------------------------------------------------------------------------------------------
2221
; Horizontal 4-tap chroma PS filter: two rows of 32 16-bit samples, AVX-512.
; Output keeps intermediate precision (no clipping) — pixel-to-short.
%macro PROCESS_IPFILTER_CHROMA_PS_32x2_AVX512 0
    ; register map
    ; m0 , m1 - interpolate coeff
    ; m2 , m3 - shuffle load order table
    ; m4      - INTERP_OFFSET_PS
    ; m5      - shuffle store order table

    ; row 0: two overlapping loads cover the 4-tap footprint
    movu            m6,        [r0]
    movu            m7,        [r0 + 8]

    pshufb          m8,        m6,        m3
    pshufb          m6,        m2
    pmaddwd         m6,        m0
    pmaddwd         m8,        m1
    paddd           m6,        m8
    paddd           m6,        m4
    psrad           m6,        INTERP_SHIFT_PS

    pshufb          m8,        m7,        m3
    pshufb          m7,        m2
    pmaddwd         m7,        m0
    pmaddwd         m8,        m1
    paddd           m7,        m8
    paddd           m7,        m4
    psrad           m7,        INTERP_SHIFT_PS

    packssdw        m6,        m7
    pshufb          m6,        m5        ; restore sample order after packing
    movu            [r2],      m6

    ; row 1: same computation one stride down
    movu            m6,        [r0 + r1]
    movu            m7,        [r0 + r1 + 8]

    pshufb          m8,        m6,        m3
    pshufb          m6,        m2
    pmaddwd         m6,        m0
    pmaddwd         m8,        m1
    paddd           m6,        m8
    paddd           m6,        m4
    psrad           m6,        INTERP_SHIFT_PS

    pshufb          m8,        m7,        m3
    pshufb          m7,        m2
    pmaddwd         m7,        m0
    pmaddwd         m8,        m1
    paddd           m7,        m8
    paddd           m7,        m4
    psrad           m7,        INTERP_SHIFT_PS

    packssdw        m6,        m7
    pshufb          m6,        m5
    movu            [r2 + r3], m6
%endmacro
2274
+
2275
; Horizontal 4-tap chroma PS filter: one row of 32 16-bit samples, AVX-512.
; Used for the odd leading row when row extension (isRowExt) is requested.
; Same register map as PROCESS_IPFILTER_CHROMA_PS_32x2_AVX512.
%macro PROCESS_IPFILTER_CHROMA_PS_32x1_AVX512 0
    movu            m6,        [r0]
    movu            m7,        [r0 + 8]

    pshufb          m8,        m6,        m3
    pshufb          m6,        m2
    pmaddwd         m6,        m0
    pmaddwd         m8,        m1
    paddd           m6,        m8
    paddd           m6,        m4
    psrad           m6,        INTERP_SHIFT_PS

    pshufb          m8,        m7,        m3
    pshufb          m7,        m2
    pmaddwd         m7,        m0
    pmaddwd         m8,        m1
    paddd           m7,        m8
    paddd           m7,        m4
    psrad           m7,        INTERP_SHIFT_PS

    packssdw        m6,        m7
    pshufb          m6,        m5
    movu            [r2],      m6
%endmacro
2299
+
2300
; Entry points interp_4tap_horiz_ps_32xN (16-bit samples), AVX-512.
; r4m = coeffIdx, r5m = isRowExt: when set, start one row early and emit
; N + 3 rows (the extra rows feed a following vertical pass); the odd first
; row goes through the 32x1 macro, the rest through the 32x2 loop.
%macro IPFILTER_CHROMA_PS_AVX512_32xN 1
%if ARCH_X86_64 == 1
INIT_ZMM avx512
cglobal interp_4tap_horiz_ps_32x%1, 4,7,9
    shl             r1d, 1                  ; sample strides -> byte strides
    shl             r3d, 1
    mov             r4d, r4m
    mov             r5d, r5m
%ifdef PIC
    lea             r6, [tab_ChromaCoeff]
    vpbroadcastd    m0, [r6 + r4 * 8]       ; taps 0/1
    vpbroadcastd    m1, [r6 + r4 * 8 + 4]   ; taps 2/3
%else
    vpbroadcastd    m0, [tab_ChromaCoeff + r4 * 8]
    vpbroadcastd    m1, [tab_ChromaCoeff + r4 * 8 + 4]
%endif
    vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
    vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
    vbroadcasti32x4 m4, [INTERP_OFFSET_PS]
    vbroadcasti32x8 m5, [interp8_hpp_shuf1_store_avx512]

    mov               r6d,         %1       ; remaining row count
    sub               r0,          2        ; centre the 4-tap window (-1 sample)
    test              r5d,         r5d
    jz                .loop
    sub               r0,          r1       ; row extension: start one row above
    add               r6d,         3        ; and produce 3 extra rows
    PROCESS_IPFILTER_CHROMA_PS_32x1_AVX512  ; peel the odd row so the loop stays 2-row
    lea               r0, [r0 + r1]
    lea               r2, [r2 + r3]
    dec               r6d

.loop:
    PROCESS_IPFILTER_CHROMA_PS_32x2_AVX512
    lea             r0,  [r0 + 2 * r1]
    lea             r2,  [r2 + 2 * r3]
    sub             r6d, 2
    jnz             .loop
    RET
%endif
%endmacro

IPFILTER_CHROMA_PS_AVX512_32xN 8
IPFILTER_CHROMA_PS_AVX512_32xN 16
IPFILTER_CHROMA_PS_AVX512_32xN 24
IPFILTER_CHROMA_PS_AVX512_32xN 32
IPFILTER_CHROMA_PS_AVX512_32xN 48
IPFILTER_CHROMA_PS_AVX512_32xN 64
2348
+
2349
; Horizontal 4-tap chroma PS filter: two rows of 64 16-bit samples, AVX-512.
; Each row is two 32-sample halves ([r0] and [r0 + mmsize]); each half is the
; same shuffle/madd/round/pack sequence as the 32-wide macro.
%macro PROCESS_IPFILTER_CHROMA_PS_64x2_AVX512 0
    ; register map
    ; m0 , m1 - interpolate coeff
    ; m2 , m3 -shuffle order table
    ; m4      - INTERP_OFFSET_PS
    ; m5      - shuffle store order table

    ; row 0, left half
    movu            m6,        [r0]
    movu            m7,        [r0 + 8]

    pshufb          m8,        m6,        m3
    pshufb          m6,        m2
    pmaddwd         m6,        m0
    pmaddwd         m8,        m1
    paddd           m6,        m8
    paddd           m6,        m4
    psrad           m6,        INTERP_SHIFT_PS

    pshufb          m8,        m7,        m3
    pshufb          m7,        m2
    pmaddwd         m7,        m0
    pmaddwd         m8,        m1
    paddd           m7,        m8
    paddd           m7,        m4
    psrad           m7,        INTERP_SHIFT_PS

    packssdw        m6,        m7
    pshufb          m6,        m5
    movu            [r2],      m6

    ; row 0, right half
    movu            m6,        [r0 + mmsize]
    movu            m7,        [r0 + mmsize + 8]

    pshufb          m8,        m6,        m3
    pshufb          m6,        m2
    pmaddwd         m6,        m0
    pmaddwd         m8,        m1
    paddd           m6,        m8
    paddd           m6,        m4
    psrad           m6,        INTERP_SHIFT_PS

    pshufb          m8,        m7,        m3
    pshufb          m7,        m2
    pmaddwd         m7,        m0
    pmaddwd         m8,        m1
    paddd           m7,        m8
    paddd           m7,        m4
    psrad           m7,        INTERP_SHIFT_PS

    packssdw        m6,        m7
    pshufb          m6,        m5
    movu            [r2 + mmsize],        m6

    ; row 1, left half
    movu            m6,        [r0 + r1]
    movu            m7,        [r0 + r1 + 8]

    pshufb          m8,        m6,        m3
    pshufb          m6,        m2
    pmaddwd         m6,        m0
    pmaddwd         m8,        m1
    paddd           m6,        m8
    paddd           m6,        m4
    psrad           m6,        INTERP_SHIFT_PS

    pshufb          m8,        m7,        m3
    pshufb          m7,        m2
    pmaddwd         m7,        m0
    pmaddwd         m8,        m1
    paddd           m7,        m8
    paddd           m7,        m4
    psrad           m7,        INTERP_SHIFT_PS

    packssdw        m6,        m7
    pshufb          m6,        m5
    movu            [r2 + r3], m6

    ; row 1, right half
    movu            m6,        [r0 + r1 + mmsize]
    movu            m7,        [r0 + r1 + mmsize + 8]

    pshufb          m8,        m6,        m3
    pshufb          m6,        m2
    pmaddwd         m6,        m0
    pmaddwd         m8,        m1
    paddd           m6,        m8
    paddd           m6,        m4
    psrad           m6,        INTERP_SHIFT_PS

    pshufb          m8,        m7,        m3
    pshufb          m7,        m2
    pmaddwd         m7,        m0
    pmaddwd         m8,        m1
    paddd           m7,        m8
    paddd           m7,        m4
    psrad           m7,        INTERP_SHIFT_PS

    packssdw        m6,        m7
    pshufb          m6,        m5
    movu            [r2 + r3 + mmsize],   m6
%endmacro
2449
+
2450
; Horizontal 4-tap chroma PS filter: one row of 64 16-bit samples, AVX-512.
; Used for the odd leading row when row extension is requested.
; Same register map as PROCESS_IPFILTER_CHROMA_PS_64x2_AVX512.
%macro PROCESS_IPFILTER_CHROMA_PS_64x1_AVX512 0
    ; left 32 samples
    movu            m6,        [r0]
    movu            m7,        [r0 + 8]

    pshufb          m8,        m6,        m3
    pshufb          m6,        m2
    pmaddwd         m6,        m0
    pmaddwd         m8,        m1
    paddd           m6,        m8
    paddd           m6,        m4
    psrad           m6,        INTERP_SHIFT_PS

    pshufb          m8,        m7,        m3
    pshufb          m7,        m2
    pmaddwd         m7,        m0
    pmaddwd         m8,        m1
    paddd           m7,        m8
    paddd           m7,        m4
    psrad           m7,        INTERP_SHIFT_PS

    packssdw        m6,        m7
    pshufb          m6,        m5
    movu            [r2],      m6

    ; right 32 samples
    movu            m6,        [r0 + mmsize]
    movu            m7,        [r0 + mmsize + 8]

    pshufb          m8,        m6,        m3
    pshufb          m6,        m2
    pmaddwd         m6,        m0
    pmaddwd         m8,        m1
    paddd           m6,        m8
    paddd           m6,        m4
    psrad           m6,        INTERP_SHIFT_PS

    pshufb          m8,        m7,        m3
    pshufb          m7,        m2
    pmaddwd         m7,        m0
    pmaddwd         m8,        m1
    paddd           m7,        m8
    paddd           m7,        m4
    psrad           m7,        INTERP_SHIFT_PS

    packssdw        m6,        m7
    pshufb          m6,        m5
    movu            [r2 + mmsize],        m6
%endmacro
2497
+
2498
; Entry points interp_4tap_horiz_ps_64xN (16-bit samples), AVX-512.
; Same structure as the 32xN variant: r5m (isRowExt) adds 3 rows and peels
; one odd row via the 64x1 macro before the 2-row main loop.
%macro IPFILTER_CHROMA_PS_AVX512_64xN 1
%if ARCH_X86_64 == 1
INIT_ZMM avx512
cglobal interp_4tap_horiz_ps_64x%1, 4,7,9
    shl             r1d, 1                  ; sample strides -> byte strides
    shl             r3d, 1
    mov             r4d, r4m
    mov             r5d, r5m
%ifdef PIC
    lea             r6, [tab_ChromaCoeff]
    vpbroadcastd    m0, [r6 + r4 * 8]       ; taps 0/1
    vpbroadcastd    m1, [r6 + r4 * 8 + 4]   ; taps 2/3
%else
    vpbroadcastd    m0, [tab_ChromaCoeff + r4 * 8]
    vpbroadcastd    m1, [tab_ChromaCoeff + r4 * 8 + 4]
%endif
    vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
    vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
    vbroadcasti32x4 m4, [INTERP_OFFSET_PS]
    vbroadcasti32x8 m5, [interp8_hpp_shuf1_store_avx512]
    mov               r6d,         %1       ; remaining row count
    sub               r0,          2        ; centre the 4-tap window
    test              r5d,         r5d
    jz                .loop
    sub               r0,          r1       ; row extension: start one row above
    add               r6d,         3
    PROCESS_IPFILTER_CHROMA_PS_64x1_AVX512
    lea               r0, [r0 + r1]
    lea               r2, [r2 + r3]
    dec               r6d

.loop:
    PROCESS_IPFILTER_CHROMA_PS_64x2_AVX512
    lea             r0,  [r0 + 2 * r1]
    lea             r2,  [r2 + 2 * r3]
    sub             r6d, 2
    jnz             .loop
    RET
%endif
%endmacro

IPFILTER_CHROMA_PS_AVX512_64xN 16
IPFILTER_CHROMA_PS_AVX512_64xN 32
IPFILTER_CHROMA_PS_AVX512_64xN 48
IPFILTER_CHROMA_PS_AVX512_64xN 64
2543
+
2544
; Horizontal 4-tap chroma PS filter: two rows of 16 16-bit samples, AVX-512.
; Both rows are packed into a single zmm (low ymm = row 0, high ymm = row 1)
; so one shuffle/madd pass covers both.
%macro PROCESS_IPFILTER_CHROMA_PS_16x2_AVX512 0
    ; register map
    ; m0 , m1 - interpolate coeff
    ; m2 , m3 - shuffle order table
    ; m4      - INTERP_OFFSET_PS
    ; m5      - shuffle store order table

    movu            ym6,       [r0]
    vinserti32x8    m6,        [r0 + r1],      1
    movu            ym7,       [r0 + 8]
    vinserti32x8    m7,        [r0 + r1 + 8],  1

    pshufb          m8,        m6,        m3
    pshufb          m6,        m2
    pmaddwd         m6,        m0
    pmaddwd         m8,        m1
    paddd           m6,        m8
    paddd           m6,        m4
    psrad           m6,        INTERP_SHIFT_PS

    pshufb          m8,        m7,        m3
    pshufb          m7,        m2
    pmaddwd         m7,        m0
    pmaddwd         m8,        m1
    paddd           m7,        m8
    paddd           m7,        m4
    psrad           m7,        INTERP_SHIFT_PS

    packssdw        m6,        m7
    pshufb          m6,        m5        ; restore sample order after packing
    movu            [r2],      ym6       ; low ymm  -> row 0
    vextracti32x8   [r2 + r3], m6,        1   ; high ymm -> row 1
%endmacro
2577
; Horizontal 4-tap chroma PS filter: one row of 16 16-bit samples, AVX-512.
; The two overlapping loads ([r0] / [r0+8]) share one zmm (low/high ymm);
; the high half is folded back with vextracti32x8 before the pack.
; Used for the odd leading row when row extension is requested.
%macro PROCESS_IPFILTER_CHROMA_PS_16x1_AVX512 0
    movu            ym6,       [r0]
    vinserti32x8    m6,        [r0 + 8],  1

    pshufb          m8,        m6,        m3
    pshufb          m6,        m2
    pmaddwd         m6,        m0
    pmaddwd         m8,        m1
    paddd           m6,        m8
    paddd           m6,        m4
    psrad           m6,        INTERP_SHIFT_PS

    vextracti32x8   ym7,       m6,        1
    packssdw        ym6,       ym7
    pshufb          ym6,       ym5
    movu            [r2],      ym6
%endmacro
2594
; Entry points interp_4tap_horiz_ps_16xN (16-bit samples), AVX-512.
; Same row-extension handling as the 32xN/64xN variants. Note the shuffle
; tables are loaded with mova here (full-width tables) rather than the
; vbroadcasti32x8 used by the wider variants — presumably because the 16x2
; macro packs two distinct rows per zmm; verify against the table definitions.
%macro IPFILTER_CHROMA_PS_AVX512_16xN 1
%if ARCH_X86_64 == 1
INIT_ZMM avx512
cglobal interp_4tap_horiz_ps_16x%1, 4,7,9
    shl             r1d, 1                  ; sample strides -> byte strides
    shl             r3d, 1
    mov             r4d, r4m
    mov             r5d, r5m
%ifdef PIC
    lea             r6, [tab_ChromaCoeff]
    vpbroadcastd    m0, [r6 + r4 * 8]       ; taps 0/1
    vpbroadcastd    m1, [r6 + r4 * 8 + 4]   ; taps 2/3
%else
    vpbroadcastd    m0, [tab_ChromaCoeff + r4 * 8]
    vpbroadcastd    m1, [tab_ChromaCoeff + r4 * 8 + 4]
%endif
    mova            m2, [interp8_hpp_shuf1_load_avx512]
    mova            m3, [interp8_hpp_shuf2_load_avx512]
    vbroadcasti32x4 m4, [INTERP_OFFSET_PS]
    mova            m5, [interp8_hpp_shuf1_store_avx512]
    mov               r6d,         %1       ; remaining row count
    sub               r0,          2        ; centre the 4-tap window
    test              r5d,         r5d
    jz                .loop
    sub               r0,          r1       ; row extension: start one row above
    add               r6d,         3
    PROCESS_IPFILTER_CHROMA_PS_16x1_AVX512
    lea               r0, [r0 + r1]
    lea               r2, [r2 + r3]
    dec               r6d

.loop:
    PROCESS_IPFILTER_CHROMA_PS_16x2_AVX512
    lea             r0,  [r0 + 2 * r1]
    lea             r2,  [r2 + 2 * r3]
    sub             r6d, 2
    jnz             .loop
    RET
%endif
%endmacro

IPFILTER_CHROMA_PS_AVX512_16xN 4
IPFILTER_CHROMA_PS_AVX512_16xN 8
IPFILTER_CHROMA_PS_AVX512_16xN 12
IPFILTER_CHROMA_PS_AVX512_16xN 16
IPFILTER_CHROMA_PS_AVX512_16xN 24
IPFILTER_CHROMA_PS_AVX512_16xN 32
IPFILTER_CHROMA_PS_AVX512_16xN 64
2642
+
2643
+%macro PROCESS_IPFILTER_CHROMA_PS_48x2_AVX512 0
2644
+    ; register map
2645
+    ; m0 , m1 - interpolate coeff
2646
+    ; m2 , m3 - shuffle load order table
2647
+    ; m4      - INTERP_OFFSET_PS
2648
+    ; m5      - shuffle store order table
2649
+
2650
+    movu            m6,        [r0]
2651
+    movu            m7,        [r0 + 8]
2652
+
2653
+    pshufb          m8,        m6,        m3
2654
+    pshufb          m6,        m2
2655
+    pmaddwd         m6,        m0
2656
+    pmaddwd         m8,        m1
2657
+    paddd           m6,        m8
2658
+    paddd           m6,        m4
2659
+    psrad           m6,        INTERP_SHIFT_PS
2660
+
2661
+    pshufb          m8,        m7,        m3
2662
+    pshufb          m7,        m2
2663
+    pmaddwd         m7,        m0
2664
+    pmaddwd         m8,        m1
2665
+    paddd           m7,        m8
2666
+    paddd           m7,        m4
2667
+    psrad           m7,        INTERP_SHIFT_PS
2668
+
2669
+    packssdw        m6,        m7
2670
+    pshufb          m6,        m5
2671
+    movu            [r2],      m6
2672
+
2673
+    movu            m6,        [r0 + r1]
2674
+    movu            m7,        [r0 + r1 + 8]
2675
+
2676
+    pshufb          m8,        m6,        m3
2677
+    pshufb          m6,        m2
2678
+    pmaddwd         m6,        m0
2679
+    pmaddwd         m8,        m1
2680
+    paddd           m6,        m8
2681
+    paddd           m6,        m4
2682
+    psrad           m6,        INTERP_SHIFT_PS
2683
+
2684
+    pshufb          m8,        m7,        m3
2685
+    pshufb          m7,        m2
2686
+    pmaddwd         m7,        m0
2687
+    pmaddwd         m8,        m1
2688
+    paddd           m7,        m8
2689
+    paddd           m7,        m4
2690
+    psrad           m7,        INTERP_SHIFT_PS
2691
+
2692
+    packssdw        m6,        m7
2693
+    pshufb          m6,        m5
2694
+    movu            [r2 + r3], m6
2695
+
2696
+    movu            ym6,       [r0 + mmsize]
2697
+    vinserti32x8    m6,        [r0 + r1 + mmsize],     1
2698
+    movu            ym7,       [r0 + mmsize + 8]
2699
+    vinserti32x8    m7,        [r0 + r1 + mmsize + 8],  1
2700
+
2701
+    pshufb          m8,        m6,        m3
2702
+    pshufb          m6,        m2
2703
+    pmaddwd         m6,        m0
2704
+    pmaddwd         m8,        m1
2705
+    paddd           m6,        m8
2706
+    paddd           m6,        m4
2707
+    psrad           m6,        INTERP_SHIFT_PS
2708
+
2709
+    pshufb          m8,        m7,        m3
2710
+    pshufb          m7,        m2
2711
+    pmaddwd         m7,        m0
2712
+    pmaddwd         m8,        m1
2713
+    paddd           m7,        m8
2714
+    paddd           m7,        m4
2715
+    psrad           m7,        INTERP_SHIFT_PS
2716
+
2717
+    packssdw        m6,        m7
2718
+    pshufb          m6,        m5
2719
+    movu            [r2 + mmsize],      ym6
2720
+    vextracti32x8   [r2 + r3 + mmsize], m6,        1
2721
+%endmacro
2722
+
2723
+%macro PROCESS_IPFILTER_CHROMA_PS_48x1_AVX512 0
2724
+    ; register map
2725
+    ; m0 , m1 - interpolate coeff
2726
+    ; m2 , m3 - shuffle load order table
2727
+    ; m4      - INTERP_OFFSET_PS
2728
+    ; m5      - shuffle store order table
2729
+
2730
+    movu            m6,        [r0]
2731
+    movu            m7,        [r0 + 8]
2732
+
2733
+    pshufb          m8,        m6,        m3
2734
+    pshufb          m6,        m2
2735
+    pmaddwd         m6,        m0
2736
+    pmaddwd         m8,        m1
2737
+    paddd           m6,        m8
2738
+    paddd           m6,        m4
2739
+    psrad           m6,        INTERP_SHIFT_PS
2740
+
2741
+    pshufb          m8,        m7,        m3
2742
+    pshufb          m7,        m2
2743
+    pmaddwd         m7,        m0
2744
+    pmaddwd         m8,        m1
2745
+    paddd           m7,        m8
2746
+    paddd           m7,        m4
2747
+    psrad           m7,        INTERP_SHIFT_PS
2748
+
2749
+    packssdw        m6,        m7
2750
+    pshufb          m6,        m5
2751
+    movu            [r2],      m6
2752
+
2753
+    movu            ym6,       [r0 + mmsize]
2754
+    movu            ym7,       [r0 + mmsize + 8]
2755
+
2756
+    pshufb          ym8,        ym6,        ym3
2757
+    pshufb          ym6,        ym2
2758
+    pmaddwd         ym6,        ym0
2759
+    pmaddwd         ym8,        ym1
2760
+    paddd           ym6,        ym8
2761
+    paddd           ym6,        ym4
2762
+    psrad           ym6,        INTERP_SHIFT_PS
2763
+
2764
+    pshufb          ym8,        ym7,        ym3
2765
+    pshufb          ym7,        ym2
2766
+    pmaddwd         ym7,        ym0
2767
+    pmaddwd         ym8,        ym1
2768
+    paddd           ym7,        ym8
2769
+    paddd           ym7,        ym4
2770
+    psrad           ym7,        INTERP_SHIFT_PS
2771
+
2772
+    packssdw        ym6,        ym7
2773
+    pshufb          ym6,        ym5
2774
+    movu            [r2 + mmsize],       ym6
2775
+%endmacro
2776
+
2777
+%if ARCH_X86_64 == 1
2778
+INIT_ZMM avx512
2779
+cglobal interp_4tap_horiz_ps_48x64, 4,7,9
2780
+    shl             r1d, 1
2781
+    shl             r3d, 1
2782
+    mov             r4d, r4m
2783
+    mov             r5d, r5m
2784
+
2785
+%ifdef PIC
2786
+    lea             r6, [tab_ChromaCoeff]
2787
+    vpbroadcastd    m0, [r6 + r4 * 8]
2788
+    vpbroadcastd    m1, [r6 + r4 * 8 + 4]
2789
+%else
2790
+    vpbroadcastd    m0, [tab_ChromaCoeff + r4 * 8]
2791
+    vpbroadcastd    m1, [tab_ChromaCoeff + r4 * 8 + 4]
2792
+%endif
2793
+    vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
2794
+    vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
2795
+    vbroadcasti32x4 m4, [INTERP_OFFSET_PS]
2796
+    vbroadcasti32x8 m5, [interp8_hpp_shuf1_store_avx512]
2797
+
2798
+    mov               r6d,         64
2799
+    sub               r0,          2
2800
+    test              r5d,         r5d
2801
+    jz                .loop
2802
+    sub               r0,          r1
2803
+    add               r6d,         3
2804
+    PROCESS_IPFILTER_CHROMA_PS_48x1_AVX512
2805
+    lea               r0, [r0 + r1]
2806
+    lea               r2, [r2 + r3]
2807
+    dec               r6d
2808
+.loop:
2809
+    PROCESS_IPFILTER_CHROMA_PS_48x2_AVX512
2810
+    lea             r0,  [r0 + 2 * r1]
2811
+    lea             r2,  [r2 + 2 * r3]
2812
+    sub             r6d, 2
2813
+    jnz             .loop
2814
+    RET
2815
+%endif
2816
+
2817
+%macro PROCESS_IPFILTER_CHROMA_PS_8x4_AVX512 0
2818
+    ; register map
2819
+    ; m0 , m1 - interpolate coeff
2820
+    ; m2 , m3 - shuffle load order table
2821
+    ; m4      - INTERP_OFFSET_PS
2822
+    ; m5      - shuffle store order table
2823
+
2824
+    movu            xm6,       [r0]
2825
+    vinserti32x4    m6,        [r0 + r1],      1
2826
+    vinserti32x4    m6,        [r0 + 2 * r1],  2
2827
+    vinserti32x4    m6,        [r0 + r6],      3
2828
+
2829
+    pshufb          m8,        m6,        m3
2830
+    pshufb          m6,        m2
2831
+    pmaddwd         m6,        m0
2832
+    pmaddwd         m8,        m1
2833
+    paddd           m6,        m8
2834
+    paddd           m6,        m4
2835
+    psrad           m6,        INTERP_SHIFT_PS
2836
+
2837
+    movu            xm7,       [r0 + 8]
2838
+    vinserti32x4    m7,        [r0 + r1 + 8],      1
2839
+    vinserti32x4    m7,        [r0 + 2 * r1 + 8],  2
2840
+    vinserti32x4    m7,        [r0 + r6 + 8],      3
2841
+
2842
+    pshufb          m8,        m7,        m3
2843
+    pshufb          m7,        m2
2844
+    pmaddwd         m7,        m0
2845
+    pmaddwd         m8,        m1
2846
+    paddd           m7,        m8
2847
+    paddd           m7,        m4
2848
+    psrad           m7,        INTERP_SHIFT_PS
2849
+
2850
+    packssdw        m6,        m7
2851
+    pshufb          m6,        m5
2852
+    movu            [r2],      xm6
2853
+    vextracti32x4   [r2 + r3],     m6,        1
2854
+    vextracti32x4   [r2 + 2 * r3], m6,        2
2855
+    vextracti32x4   [r2 + r7],     m6,        3
2856
+%endmacro
2857
+
2858
+%macro PROCESS_IPFILTER_CHROMA_PS_8x3_AVX512 0
2859
+    movu            xm6,       [r0]
2860
+    vinserti32x4    m6,        [r0 + r1],      1
2861
+    vinserti32x4    m6,        [r0 + 2 * r1],  2
2862
+
2863
+    pshufb          m8,        m6,        m3
2864
+    pshufb          m6,        m2
2865
+    pmaddwd         m6,        m0
2866
+    pmaddwd         m8,        m1
2867
+    paddd           m6,        m8
2868
+    paddd           m6,        m4
2869
+    psrad           m6,        INTERP_SHIFT_PS
2870
+
2871
+    movu            xm7,       [r0 + 8]
2872
+    vinserti32x4    m7,        [r0 + r1 + 8],      1
2873
+    vinserti32x4    m7,        [r0 + 2 * r1 + 8],  2
2874
+
2875
+    pshufb          m8,        m7,        m3
2876
+    pshufb          m7,        m2
2877
+    pmaddwd         m7,        m0
2878
+    pmaddwd         m8,        m1
2879
+    paddd           m7,        m8
2880
+    paddd           m7,        m4
2881
+    psrad           m7,        INTERP_SHIFT_PS
2882
+
2883
+    packssdw        m6,        m7
2884
+    pshufb          m6,        m5
2885
+    movu            [r2],      xm6
2886
+    vextracti32x4   [r2 + r3],     m6,        1
2887
+    vextracti32x4   [r2 + 2 * r3], m6,        2
2888
+%endmacro
2889
+
2890
+%macro IPFILTER_CHROMA_PS_AVX512_8xN 1
2891
+INIT_ZMM avx512
2892
+cglobal interp_4tap_horiz_ps_8x%1, 4,9,9
2893
+    shl             r1d, 1
2894
+    shl             r3d, 1
2895
+    mov             r4d, r4m
2896
+    mov             r5d, r5m
2897
+
2898
+    lea             r6, [3 * r1]
2899
+    lea             r7, [3 * r3]
2900
+%ifdef PIC
2901
+    lea             r8, [tab_ChromaCoeff]
2902
+    vpbroadcastd    m0, [r8 + r4 * 8]
2903
+    vpbroadcastd    m1, [r8 + r4 * 8 + 4]
2904
+%else
2905
+    vpbroadcastd    m0, [tab_ChromaCoeff + r4 * 8]
2906
+    vpbroadcastd    m1, [tab_ChromaCoeff + r4 * 8 + 4]
2907
+%endif
2908
+    vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
2909
+    vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
2910
+    vbroadcasti32x4 m4, [INTERP_OFFSET_PS]
2911
+    vbroadcasti32x8 m5, [interp8_hpp_shuf1_store_avx512]
2912
+
2913
+    mov               r8d,         %1
2914
+    sub               r0,          2
2915
+    test              r5d,         r5d
2916
+    jz                .loop
2917
+    sub               r0,          r1
2918
+    add               r8d,         3
2919
+    PROCESS_IPFILTER_CHROMA_PS_8x3_AVX512
2920
+    lea               r0,  [r0 + r6]
2921
+    lea               r2,  [r2 + r7]
2922
+    sub               r8d, 3
2923
+
2924
+.loop:
2925
+    PROCESS_IPFILTER_CHROMA_PS_8x4_AVX512
2926
+    lea             r0,  [r0 + 4 * r1]
2927
+    lea             r2,  [r2 + 4 * r3]
2928
+    sub             r8d, 4
2929
+    jnz             .loop
2930
+    RET
2931
+%endmacro
2932
+
2933
+%if ARCH_X86_64
2934
+IPFILTER_CHROMA_PS_AVX512_8xN 4
2935
+IPFILTER_CHROMA_PS_AVX512_8xN 8
2936
+IPFILTER_CHROMA_PS_AVX512_8xN 12
2937
+IPFILTER_CHROMA_PS_AVX512_8xN 16
2938
+IPFILTER_CHROMA_PS_AVX512_8xN 32
2939
+IPFILTER_CHROMA_PS_AVX512_8xN 64
2940
+%endif
2941
+
2942
+%macro PROCESS_IPFILTER_CHROMA_PS_24x4_AVX512 0
2943
+    ; register map
2944
+    ; m0 , m1 - interpolate coeff
2945
+    ; m2 , m3 - shuffle order table
2946
+    ; m4      - INTERP_OFFSET_PS
2947
+    ; m5      - shuffle store order table
2948
+
2949
+    movu            ym6,       [r0]
2950
+    vinserti32x8    m6,        [r0 + r1],      1
2951
+    movu            ym7,       [r0 + 8]
2952
+    vinserti32x8    m7,        [r0 + r1 + 8],  1
2953
+
2954
+    pshufb          m8,        m6,        m3
2955
+    pshufb          m6,        m2
2956
+    pmaddwd         m6,        m0
2957
+    pmaddwd         m8,        m1
2958
+    paddd           m6,        m8
2959
+    paddd           m6,        m4
2960
+    psrad           m6,        INTERP_SHIFT_PS
2961
+
2962
+    pshufb          m8,        m7,        m3
2963
+    pshufb          m7,        m2
2964
+    pmaddwd         m7,        m0
2965
+    pmaddwd         m8,        m1
2966
+    paddd           m7,        m8
2967
+    paddd           m7,        m4
2968
+    psrad           m7,        INTERP_SHIFT_PS
2969
+
2970
+    packssdw        m6,        m7
2971
+    pshufb          m6,        m5
2972
+    movu            [r2],      ym6
2973
+    vextracti32x8   [r2 + r3], m6,        1
2974
+
2975
+    movu            ym6,       [r0 + 2 * r1]
2976
+    vinserti32x8    m6,        [r0 + r6],      1
2977
+    movu            ym7,       [r0 + 2 * r1 + 8]
2978
+    vinserti32x8    m7,        [r0 + r6 + 8],  1
2979
+
2980
+    pshufb          m8,        m6,        m3
2981
+    pshufb          m6,        m2
2982
+    pmaddwd         m6,        m0
2983
+    pmaddwd         m8,        m1
2984
+    paddd           m6,        m8
2985
+    paddd           m6,        m4
2986
+    psrad           m6,        INTERP_SHIFT_PS
2987
+
2988
+    pshufb          m8,        m7,        m3
2989
+    pshufb          m7,        m2
2990
+    pmaddwd         m7,        m0
2991
+    pmaddwd         m8,        m1
2992
+    paddd           m7,        m8
2993
+    paddd           m7,        m4
2994
+    psrad           m7,        INTERP_SHIFT_PS
2995
+
2996
+    packssdw        m6,        m7
2997
+    pshufb          m6,        m5
2998
+    movu            [r2 + 2 * r3],        ym6
2999
+    vextracti32x8   [r2 + r7], m6,        1
3000
+
3001
+    movu            xm6,       [r0 + mmsize/2]
3002
+    vinserti32x4    m6,        [r0 + r1 + mmsize/2],      1
3003
+    vinserti32x4    m6,        [r0 + 2 * r1 + mmsize/2],  2
3004
+    vinserti32x4    m6,        [r0 + r6 + mmsize/2],      3
3005
+
3006
+    pshufb          m8,        m6,        m3
3007
+    pshufb          m6,        m2
3008
+    pmaddwd         m6,        m0
3009
+    pmaddwd         m8,        m1
3010
+    paddd           m6,        m8
3011
+    paddd           m6,        m4
3012
+    psrad           m6,        INTERP_SHIFT_PS
3013
+
3014
+    movu            xm7,       [r0 + mmsize/2 + 8]
3015
+    vinserti32x4    m7,        [r0 + r1 + mmsize/2 + 8],      1
3016
+    vinserti32x4    m7,        [r0 + 2 * r1 + mmsize/2 + 8],  2
3017
+    vinserti32x4    m7,        [r0 + r6 + mmsize/2 + 8],      3
3018
+
3019
+    pshufb          m8,        m7,        m3
3020
+    pshufb          m7,        m2
3021
+    pmaddwd         m7,        m0
3022
+    pmaddwd         m8,        m1
3023
+    paddd           m7,        m8
3024
+    paddd           m7,        m4
3025
+    psrad           m7,        INTERP_SHIFT_PS
3026
+
3027
+    packssdw        m6,        m7
3028
+    pshufb          m6,        m5
3029
+    movu            [r2 + mmsize/2],      xm6
3030
+    vextracti32x4   [r2 + r3 + mmsize/2],     m6,        1
3031
+    vextracti32x4   [r2 + 2 * r3 + mmsize/2], m6,        2
3032
+    vextracti32x4   [r2 + r7 + mmsize/2],     m6,        3
3033
+%endmacro
3034
+
3035
+%macro PROCESS_IPFILTER_CHROMA_PS_24x3_AVX512 0
3036
+    movu            ym6,       [r0]
3037
+    vinserti32x8    m6,        [r0 + r1],      1
3038
+    movu            ym7,       [r0 + 8]
3039
+    vinserti32x8    m7,        [r0 + r1 + 8],  1
3040
+
3041
+    pshufb          m8,        m6,        m3
3042
+    pshufb          m6,        m2
3043
+    pmaddwd         m6,        m0
3044
+    pmaddwd         m8,        m1
3045
+    paddd           m6,        m8
3046
+    paddd           m6,        m4
3047
+    psrad           m6,        INTERP_SHIFT_PS
3048
+
3049
+    pshufb          m8,        m7,        m3
3050
+    pshufb          m7,        m2
3051
+    pmaddwd         m7,        m0
3052
+    pmaddwd         m8,        m1
3053
+    paddd           m7,        m8
3054
+    paddd           m7,        m4
3055
+    psrad           m7,        INTERP_SHIFT_PS
3056
+
3057
+    packssdw        m6,        m7
3058
+    pshufb          m6,        m5
3059
+    movu            [r2],      ym6
3060
+    vextracti32x8   [r2 + r3], m6,        1
3061
+
3062
+    movu            ym6,       [r0 + 2 * r1]
3063
+    movu            ym7,       [r0 + 2 * r1 + 8]
3064
+
3065
+    pshufb          ym8,        ym6,        ym3
3066
+    pshufb          ym6,        ym2
3067
+    pmaddwd         ym6,        ym0
3068
+    pmaddwd         ym8,        ym1
3069
+    paddd           ym6,        ym8
3070
+    paddd           ym6,        ym4
3071
+    psrad           ym6,        INTERP_SHIFT_PS
3072
+
3073
+    pshufb          ym8,        ym7,        ym3
3074
+    pshufb          ym7,        ym2
3075
+    pmaddwd         ym7,        ym0
3076
+    pmaddwd         ym8,        ym1
3077
+    paddd           ym7,        ym8
3078
+    paddd           ym7,        ym4
3079
+    psrad           ym7,        INTERP_SHIFT_PS
3080
+
3081
+    packssdw        ym6,        ym7
3082
+    pshufb          ym6,        ym5
3083
+    movu            [r2 + 2 * r3],        ym6
3084
+
3085
+    movu            xm6,       [r0 + mmsize/2]
3086
+    vinserti32x4    m6,        [r0 + r1 + mmsize/2],      1
3087
+    vinserti32x4    m6,        [r0 + 2 * r1 + mmsize/2],  2
3088
+
3089
+    pshufb          m8,        m6,        m3
3090
+    pshufb          m6,        m2
3091
+    pmaddwd         m6,        m0
3092
+    pmaddwd         m8,        m1
3093
+    paddd           m6,        m8
3094
+    paddd           m6,        m4
3095
+    psrad           m6,        INTERP_SHIFT_PS
3096
+
3097
+    movu            xm7,       [r0 + mmsize/2 + 8]
3098
+    vinserti32x4    m7,        [r0 + r1 + mmsize/2 + 8],      1
3099
+    vinserti32x4    m7,        [r0 + 2 * r1 + mmsize/2 + 8],  2
3100
+
3101
+    pshufb          m8,        m7,        m3
3102
+    pshufb          m7,        m2
3103
+    pmaddwd         m7,        m0
3104
+    pmaddwd         m8,        m1
3105
+    paddd           m7,        m8
3106
+    paddd           m7,        m4
3107
+    psrad           m7,        INTERP_SHIFT_PS
3108
+
3109
+    packssdw        m6,        m7
3110
+    pshufb          m6,        m5
3111
+    movu            [r2 + mmsize/2],      xm6
3112
+    vextracti32x4   [r2 + r3 + mmsize/2],     m6,        1
3113
+    vextracti32x4   [r2 + 2 * r3 + mmsize/2], m6,        2
3114
+%endmacro
3115
+
3116
+%macro IPFILTER_CHROMA_PS_AVX512_24xN 1
3117
+INIT_ZMM avx512
3118
+cglobal interp_4tap_horiz_ps_24x%1, 4,9,9
3119
+    shl             r1d, 1
3120
+    shl             r3d, 1
3121
+    mov             r4d, r4m
3122
+    mov             r5d, r5m
3123
+
3124
+    lea             r6, [3 * r1]
3125
+    lea             r7, [3 * r3]
3126
+%ifdef PIC
3127
+    lea             r8, [tab_ChromaCoeff]
3128
+    vpbroadcastd    m0, [r8 + r4 * 8]
3129
+    vpbroadcastd    m1, [r8 + r4 * 8 + 4]
3130
+%else
3131
+    vpbroadcastd    m0, [tab_ChromaCoeff + r4 * 8]
3132
+    vpbroadcastd    m1, [tab_ChromaCoeff + r4 * 8 + 4]
3133
+%endif
3134
+    vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
3135
+    vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
3136
+    vbroadcasti32x4 m4, [INTERP_OFFSET_PS]
3137
+    vbroadcasti32x8 m5,[interp8_hpp_shuf1_store_avx512]
3138
+
3139
+    mov               r8d,         %1
3140
+    sub               r0,          2
3141
+    test              r5d,         r5d
3142
+    jz                .loop
3143
+    sub               r0,          r1
3144
+    add               r8d,         3
3145
+    PROCESS_IPFILTER_CHROMA_PS_24x3_AVX512
3146
+    lea               r0,  [r0 + r6]
3147
+    lea               r2,  [r2 + r7]
3148
+    sub               r8d, 3
3149
+
3150
+.loop:
3151
+    PROCESS_IPFILTER_CHROMA_PS_24x4_AVX512
3152
+    lea             r0,  [r0 + 4 * r1]
3153
+    lea             r2,  [r2 + 4 * r3]
3154
+    sub             r8d, 4
3155
+    jnz             .loop
3156
+    RET
3157
+%endmacro
3158
+
3159
+%if ARCH_X86_64
3160
+IPFILTER_CHROMA_PS_AVX512_24xN 32
3161
+IPFILTER_CHROMA_PS_AVX512_24xN 64
3162
+%endif
3163
+;-------------------------------------------------------------------------------------------------------------
3164
+; avx512 chroma_hps code end
3165
+;-------------------------------------------------------------------------------------------------------------
3166
+;-------------------------------------------------------------------------------------------------------------
3167
+; avx512 chroma_vps code start
3168
+;-------------------------------------------------------------------------------------------------------------
3169
+%macro PROCESS_CHROMA_VERT_PS_8x8_AVX512 0
3170
+    movu                  xm1,                [r0]
3171
+    lea                   r6,                 [r0 + 2 * r1]
3172
+    lea                   r8,                 [r0 + 4 * r1]
3173
+    lea                   r9,                 [r8 + 2 * r1]
3174
+    vinserti32x4          m1,                 [r6],                1
3175
+    vinserti32x4          m1,                 [r8],                2
3176
+    vinserti32x4          m1,                 [r9],                3
3177
+    movu                  xm3,                [r0 + r1]
3178
+    vinserti32x4          m3,                 [r6 + r1],           1
3179
+    vinserti32x4          m3,                 [r8 + r1],           2
3180
+    vinserti32x4          m3,                 [r9 + r1],           3
3181
+    punpcklwd             m0,                 m1,                  m3
3182
+    pmaddwd               m0,                 [r5]
3183
+    punpckhwd             m1,                 m3
3184
+    pmaddwd               m1,                 [r5]
3185
+
3186
+    movu                  xm4,                [r0 + 2 * r1]
3187
+    vinserti32x4          m4,                 [r6 + 2 * r1],       1
3188
+    vinserti32x4          m4,                 [r8 + 2 * r1],       2
3189
+    vinserti32x4          m4,                 [r9 + 2 * r1],       3
3190
+    punpcklwd             m2,                 m3,                  m4
3191
+    pmaddwd               m2,                 [r5]
3192
+    punpckhwd             m3,                 m4
3193
+    pmaddwd               m3,                 [r5]
3194
+
3195
+    movu                  xm5,                [r0 + r10]
3196
+    vinserti32x4          m5,                 [r6 + r10],          1
3197
+    vinserti32x4          m5,                 [r8 + r10],          2
3198
+    vinserti32x4          m5,                 [r9 + r10],          3
3199
+    punpcklwd             m6,                 m4,                  m5
3200
+    pmaddwd               m6,                 [r5 + mmsize]
3201
+    paddd                 m0,                 m6
3202
+    punpckhwd             m4,                 m5
3203
+    pmaddwd               m4,                 [r5 + mmsize]
3204
+    paddd                 m1,                 m4
3205
+
3206
+    movu                  xm4,                [r0 + 4 * r1]
3207
+    vinserti32x4          m4,                 [r6 + 4 * r1],       1
3208
+    vinserti32x4          m4,                 [r8 + 4 * r1],       2
3209
+    vinserti32x4          m4,                 [r9 + 4 * r1],       3
3210
+    punpcklwd             m6,                 m5,                  m4
3211
+    pmaddwd               m6,                 m9
3212
+    paddd                 m2,                 m6
3213
+    punpckhwd             m5,                 m4
3214
+    pmaddwd               m5,                 m9
3215
+    paddd                 m3,                 m5
3216
+
3217
+    paddd                 m0,                 m7
3218
+    paddd                 m1,                 m7
3219
+    paddd                 m2,                 m7
3220
+    paddd                 m3,                 m7
3221
+
3222
+    psrad                 m0,                 INTERP_SHIFT_PS
3223
+    psrad                 m1,                 INTERP_SHIFT_PS
3224
+    psrad                 m2,                 INTERP_SHIFT_PS
3225
+    psrad                 m3,                 INTERP_SHIFT_PS
3226
+
3227
+    packssdw              m0,                 m1
3228
+    packssdw              m2,                 m3
3229
+    movu                  [r2],               xm0
3230
+    movu                  [r2 + r3],          xm2
3231
+    vextracti32x4         [r2 + 2 * r3],      m0,                  1
3232
+    vextracti32x4         [r2 + r7],          m2,                  1
3233
+    lea                   r2,                 [r2 + 4 * r3]
3234
+    vextracti32x4         [r2],               m0,                  2
3235
+    vextracti32x4         [r2 + r3],          m2,                  2
3236
+    vextracti32x4         [r2 + 2 * r3],      m0,                  3
3237
+    vextracti32x4         [r2 + r7],          m2,                  3
3238
+%endmacro
3239
+
3240
+;-----------------------------------------------------------------------------------------------------------------
3241
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3242
+;-----------------------------------------------------------------------------------------------------------------
3243
+%if ARCH_X86_64
3244
+INIT_ZMM avx512
3245
+cglobal interp_4tap_vert_ps_8x8, 5, 11, 10
3246
+    add                   r1d,                r1d
3247
+    add                   r3d,                r3d
3248
+    sub                   r0,                 r1
3249
+    shl                   r4d,                7
3250
+
3251
+%ifdef PIC
3252
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
3253
+    lea                   r5,                 [r5 + r4]
3254
+%else
3255
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
3256
+%endif
3257
+    vbroadcasti32x4       m7,                 [INTERP_OFFSET_PS]
3258
+    lea                   r10,                [3 * r1]
3259
+    lea                   r7,                 [3 * r3]
3260
+    mova                  m8,                 [r5]
3261
+    mova                  m9,                 [r5 + mmsize]
3262
+    PROCESS_CHROMA_VERT_PS_8x8_AVX512
3263
+    RET
3264
+%endif
3265
+
3266
+%macro FILTER_VER_PS_CHROMA_8xN_AVX512 1
3267
+INIT_ZMM avx512
3268
+cglobal interp_4tap_vert_ps_8x%1, 5, 11, 10
3269
+    add                   r1d,                r1d
3270
+    add                   r3d,                r3d
3271
+    sub                   r0,                 r1
3272
+    shl                   r4d,                7
3273
+
3274
+%ifdef PIC
3275
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
3276
+    lea                   r5,                 [r5 + r4]
3277
+%else
3278
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
3279
+%endif
3280
+    vbroadcasti32x4       m7,                 [INTERP_OFFSET_PS]
3281
+    lea                   r10,                [3 * r1]
3282
+    lea                   r7,                 [3 * r3]
3283
+    mova                  m8,                 [r5]
3284
+    mova                  m9,                 [r5 + mmsize]
3285
+%rep %1/8 - 1
3286
+    PROCESS_CHROMA_VERT_PS_8x8_AVX512
3287
+    lea                   r0,                 [r8 + 4 * r1]
3288
+    lea                   r2,                 [r2 + 4 * r3]
3289
+%endrep
3290
+    PROCESS_CHROMA_VERT_PS_8x8_AVX512
3291
+    RET
3292
+%endmacro
3293
+
3294
+%if ARCH_X86_64
3295
+FILTER_VER_PS_CHROMA_8xN_AVX512 16
3296
+FILTER_VER_PS_CHROMA_8xN_AVX512 32
3297
+FILTER_VER_PS_CHROMA_8xN_AVX512 64
3298
+%endif
3299
+
3300
+%macro PROCESS_CHROMA_VERT_PS_16x4_AVX512 0
3301
+    movu                  ym1,                [r0]
3302
+    lea                   r6,                 [r0 + 2 * r1]
3303
+    vinserti32x8          m1,                 [r6],                1
3304
+    movu                  ym3,                [r0 + r1]
3305
+    vinserti32x8          m3,                 [r6 + r1],           1
3306
+    punpcklwd             m0,                 m1,                  m3
3307
+    pmaddwd               m0,                 m8
3308
+    punpckhwd             m1,                 m3
3309
+    pmaddwd               m1,                 m8
3310
+
3311
+    movu                  ym4,                [r0 + 2 * r1]
3312
+    vinserti32x8          m4,                 [r6 + 2 * r1],       1
3313
+    punpcklwd             m2,                 m3,                  m4
3314
+    pmaddwd               m2,                 m8
3315
+    punpckhwd             m3,                 m4
3316
+    pmaddwd               m3,                 m8
3317
+
3318
+    movu                  ym5,                [r0 + r8]
3319
+    vinserti32x8          m5,                 [r6 + r8],           1
3320
+    punpcklwd             m6,                 m4,                  m5
3321
+    pmaddwd               m6,                 m9
3322
+    paddd                 m0,                 m6
3323
+    punpckhwd             m4,                 m5
3324
+    pmaddwd               m4,                 m9
3325
+    paddd                 m1,                 m4
3326
+
3327
+    movu                  ym4,                [r0 + 4 * r1]
3328
+    vinserti32x8          m4,                 [r6 + 4 * r1],       1
3329
+    punpcklwd             m6,                 m5,                  m4
3330
+    pmaddwd               m6,                 m9
3331
+    paddd                 m2,                 m6
3332
+    punpckhwd             m5,                 m4
3333
+    pmaddwd               m5,                 m9
3334
+    paddd                 m3,                 m5
3335
+
3336
+    paddd                 m0,                 m7
3337
+    paddd                 m1,                 m7
3338
+    paddd                 m2,                 m7
3339
+    paddd                 m3,                 m7
3340
+
3341
+    psrad                 m0,                 INTERP_SHIFT_PS
3342
+    psrad                 m1,                 INTERP_SHIFT_PS
3343
+    psrad                 m2,                 INTERP_SHIFT_PS
3344
+    psrad                 m3,                 INTERP_SHIFT_PS
3345
+
3346
+    packssdw              m0,                 m1
3347
+    packssdw              m2,                 m3
3348
+    movu                  [r2],               ym0
3349
+    movu                  [r2 + r3],          ym2
3350
+    vextracti32x8         [r2 + 2 * r3],      m0,                  1
3351
+    vextracti32x8         [r2 + r7],          m2,                  1
3352
+%endmacro
3353
+
3354
+;-----------------------------------------------------------------------------------------------------------------
3355
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3356
+;-----------------------------------------------------------------------------------------------------------------
3357
+%if ARCH_X86_64
3358
+INIT_ZMM avx512
3359
+cglobal interp_4tap_vert_ps_16x4, 5, 9, 10
3360
+    add                   r1d,                r1d
3361
+    add                   r3d,                r3d
3362
+    sub                   r0,                 r1
3363
+    shl                   r4d,                7
3364
+
3365
+%ifdef PIC
3366
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
3367
+    lea                   r5,                 [r5 + r4]
3368
+%else
3369
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
3370
+%endif
3371
+    vbroadcasti32x4       m7,                 [INTERP_OFFSET_PS]
3372
+    lea                   r7,                 [3 * r3]
3373
+    lea                   r8,                 [3 * r1]
3374
+    mova                  m8,                 [r5]
3375
+    mova                  m9,                 [r5 + mmsize]
3376
+    PROCESS_CHROMA_VERT_PS_16x4_AVX512
3377
+    RET
3378
+%endif
3379
+
3380
+%macro FILTER_VER_PS_CHROMA_16xN_AVX512 1
3381
+INIT_ZMM avx512
3382
+cglobal interp_4tap_vert_ps_16x%1, 5, 9, 10
3383
+    add                   r1d,                r1d
3384
+    add                   r3d,                r3d
3385
+    sub                   r0,                 r1
3386
+    shl                   r4d,                7
3387
+
3388
+%ifdef PIC
3389
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
3390
+    lea                   r5,                 [r5 + r4]
3391
+%else
3392
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
3393
+%endif
3394
+    vbroadcasti32x4       m7,                 [INTERP_OFFSET_PS]
3395
+    lea                   r7,                 [3 * r3]
3396
+    lea                   r8,                 [3 * r1]
3397
+    mova                  m8,                 [r5]
3398
+    mova                  m9,                 [r5 + mmsize]
3399
+%rep %1/4 - 1
3400
+    PROCESS_CHROMA_VERT_PS_16x4_AVX512
3401
+    lea                   r0,                 [r0 + 4 * r1]
3402
+    lea                   r2,                 [r2 + 4 * r3]
3403
+%endrep
3404
+    PROCESS_CHROMA_VERT_PS_16x4_AVX512
3405
+    RET
3406
+%endmacro
3407
+
3408
+%if ARCH_X86_64
3409
+FILTER_VER_PS_CHROMA_16xN_AVX512 8
3410
+FILTER_VER_PS_CHROMA_16xN_AVX512 12
3411
+FILTER_VER_PS_CHROMA_16xN_AVX512 16
3412
+FILTER_VER_PS_CHROMA_16xN_AVX512 24
3413
+FILTER_VER_PS_CHROMA_16xN_AVX512 32
3414
+FILTER_VER_PS_CHROMA_16xN_AVX512 64
3415
+%endif
3416
+
3417
+%macro PROCESS_CHROMA_VERT_PS_24x8_AVX512 0
3418
+    movu                  ym1,                [r0]
3419
+    lea                   r6,                 [r0 + 2 * r1]
3420
+    lea                   r8,                 [r0 + 4 * r1]
3421
+    lea                   r9,                 [r8 + 2 * r1]
3422
+
3423
+    movu                  ym10,               [r8]
3424
+    movu                  ym3,                [r0 + r1]
3425
+    movu                  ym12,               [r8 + r1]
3426
+    vinserti32x8          m1,                 [r6],                1
3427
+    vinserti32x8          m10,                [r9],                1
3428
+    vinserti32x8          m3,                 [r6 + r1],           1
3429
+    vinserti32x8          m12,                [r9 + r1],           1
3430
+
3431
+    punpcklwd             m0,                 m1,                  m3
3432
+    punpcklwd             m9,                 m10,                 m12
3433
+    pmaddwd               m0,                 m16
3434
+    pmaddwd               m9,                 m16
3435
+    punpckhwd             m1,                 m3
3436
+    punpckhwd             m10,                m12
3437
+    pmaddwd               m1,                 m16
3438
+    pmaddwd               m10,                m16
3439
+
3440
+    movu                  ym4,                [r0 + 2 * r1]
3441
+    movu                  ym13,               [r8 + 2 * r1]
3442
+    vinserti32x8          m4,                 [r6 + 2 * r1],       1
3443
+    vinserti32x8          m13,                [r9 + 2 * r1],       1
3444
+    punpcklwd             m2,                 m3,                  m4
3445
+    punpcklwd             m11,                m12,                 m13
3446
+    pmaddwd               m2,                 m16
3447
+    pmaddwd               m11,                m16
3448
+    punpckhwd             m3,                 m4
3449
+    punpckhwd             m12,                m13
3450
+    pmaddwd               m3,                 m16
3451
+    pmaddwd               m12,                m16
3452
+
3453
+    movu                  ym5,                [r0 + r10]
3454
+    vinserti32x8          m5,                 [r6 + r10],          1
3455
+    movu                  ym14,               [r8 + r10]
3456
+    vinserti32x8          m14,                [r9 + r10],          1
3457
+    punpcklwd             m6,                 m4,                  m5
3458
+    punpcklwd             m15,                m13,                 m14
3459
+    pmaddwd               m6,                 m17
3460
+    pmaddwd               m15,                m17
3461
+    paddd                 m0,                 m6
3462
+    paddd                 m9,                 m15
3463
+    punpckhwd             m4,                 m5
3464
+    punpckhwd             m13,                m14
3465
+    pmaddwd               m4,                 m17
3466
+    pmaddwd               m13,                m17
3467
+    paddd                 m1,                 m4
3468
+    paddd                 m10,                m13
3469
+
3470
+    movu                  ym4,                [r0 + 4 * r1]
3471
+    vinserti32x8          m4,                 [r6 + 4 * r1],       1
3472
+    movu                  ym13,               [r8 + 4 * r1]
3473
+    vinserti32x8          m13,                [r9 + 4 * r1],       1
3474
+    punpcklwd             m6,                 m5,                  m4
3475
+    punpcklwd             m15,                m14,                 m13
3476
+    pmaddwd               m6,                 m17
3477
+    pmaddwd               m15,                m17
3478
+    paddd                 m2,                 m6
3479
+    paddd                 m11,                m15
3480
+    punpckhwd             m5,                 m4
3481
+    punpckhwd             m14,                m13
3482
+    pmaddwd               m5,                 m17
3483
+    pmaddwd               m14,                m17
3484
+    paddd                 m3,                 m5
3485
+    paddd                 m12,                m14
3486
+
3487
+    paddd                 m0,                 m7
3488
+    paddd                 m1,                 m7
3489
+    paddd                 m2,                 m7
3490
+    paddd                 m3,                 m7
3491
+    paddd                 m9,                 m7
3492
+    paddd                 m10,                m7
3493
+    paddd                 m11,                m7
3494
+    paddd                 m12,                m7
3495
+
3496
+    psrad                 m0,                 INTERP_SHIFT_PS
3497
+    psrad                 m1,                 INTERP_SHIFT_PS
3498
+    psrad                 m2,                 INTERP_SHIFT_PS
3499
+    psrad                 m3,                 INTERP_SHIFT_PS
3500
+    psrad                 m9,                 INTERP_SHIFT_PS
3501
+    psrad                 m10,                INTERP_SHIFT_PS
3502
+    psrad                 m11,                INTERP_SHIFT_PS
3503
+    psrad                 m12,                INTERP_SHIFT_PS
3504
+
3505
+    packssdw              m0,                 m1
3506
+    packssdw              m2,                 m3
3507
+    packssdw              m9,                 m10
3508
+    packssdw              m11,                m12
3509
+    movu                  [r2],               ym0
3510
+    movu                  [r2 + r3],          ym2
3511
+    vextracti32x8         [r2 + 2 * r3],      m0,                  1
3512
+    vextracti32x8         [r2 + r7],          m2,                  1
3513
+    lea                   r11,                [r2 + 4 * r3]
3514
+    movu                  [r11],              ym9
3515
+    movu                  [r11 + r3],         ym11
3516
+    vextracti32x8         [r11 + 2 * r3],     m9,                  1
3517
+    vextracti32x8         [r11 + r7],         m11,                 1
3518
+
3519
+    movu                  xm1,                [r0 + mmsize/2]
3520
+    vinserti32x4          m1,                 [r6 + mmsize/2],                1
3521
+    vinserti32x4          m1,                 [r8 + mmsize/2],                2
3522
+    vinserti32x4          m1,                 [r9 + mmsize/2],                3
3523
+    movu                  xm3,                [r0 + r1 + mmsize/2]
3524
+    vinserti32x4          m3,                 [r6 + r1 + mmsize/2],           1
3525
+    vinserti32x4          m3,                 [r8 + r1 + mmsize/2],           2
3526
+    vinserti32x4          m3,                 [r9 + r1 + mmsize/2],           3
3527
+    punpcklwd             m0,                 m1,                             m3
3528
+    pmaddwd               m0,                 m16
3529
+    punpckhwd             m1,                 m3
3530
+    pmaddwd               m1,                 m16
3531
+
3532
+    movu                  xm4,                [r0 + 2 * r1 + mmsize/2]
3533
+    vinserti32x4          m4,                 [r6 + 2 * r1 + mmsize/2],       1
3534
+    vinserti32x4          m4,                 [r8 + 2 * r1 + mmsize/2],       2
3535
+    vinserti32x4          m4,                 [r9 + 2 * r1 + mmsize/2],       3
3536
+    punpcklwd             m2,                 m3,                             m4
3537
+    pmaddwd               m2,                 m16
3538
+    punpckhwd             m3,                 m4
3539
+    pmaddwd               m3,                 m16
3540
+
3541
+    movu                  xm5,                [r0 + r10 + mmsize/2]
3542
+    vinserti32x4          m5,                 [r6 + r10 + mmsize/2],          1
3543
+    vinserti32x4          m5,                 [r8 + r10 + mmsize/2],          2
3544
+    vinserti32x4          m5,                 [r9 + r10 + mmsize/2],          3
3545
+    punpcklwd             m6,                 m4,                             m5
3546
+    pmaddwd               m6,                 m17
3547
+    paddd                 m0,                 m6
3548
+    punpckhwd             m4,                 m5
3549
+    pmaddwd               m4,                 m17
3550
+    paddd                 m1,                 m4
3551
+
3552
+    movu                  xm4,                [r0 + 4 * r1 + mmsize/2]
3553
+    vinserti32x4          m4,                 [r6 + 4 * r1 + mmsize/2],       1
3554
+    vinserti32x4          m4,                 [r8 + 4 * r1 + mmsize/2],       2
3555
+    vinserti32x4          m4,                 [r9 + 4 * r1 + mmsize/2],       3
3556
+    punpcklwd             m6,                 m5,                             m4
3557
+    pmaddwd               m6,                 m17
3558
+    paddd                 m2,                 m6
3559
+    punpckhwd             m5,                 m4
3560
+    pmaddwd               m5,                 m17
3561
+    paddd                 m3,                 m5
3562
+
3563
+    paddd                 m0,                 m7
3564
+    paddd                 m1,                 m7
3565
+    paddd                 m2,                 m7
3566
+    paddd                 m3,                 m7
3567
+
3568
+    psrad                 m0,                 INTERP_SHIFT_PS
3569
+    psrad                 m1,                 INTERP_SHIFT_PS
3570
+    psrad                 m2,                 INTERP_SHIFT_PS
3571
+    psrad                 m3,                 INTERP_SHIFT_PS
3572
+
3573
+    packssdw              m0,                 m1
3574
+    packssdw              m2,                 m3
3575
+    movu                  [r2 + mmsize/2],               xm0
3576
+    movu                  [r2 + r3 + mmsize/2],          xm2
3577
+    vextracti32x4         [r2 + 2 * r3 + mmsize/2],      m0,                  1
3578
+    vextracti32x4         [r2 + r7 + mmsize/2],          m2,                  1
3579
+    lea                   r2,                            [r2 + 4 * r3]
3580
+    vextracti32x4         [r2 + mmsize/2],               m0,                  2
3581
+    vextracti32x4         [r2 + r3 + mmsize/2],          m2,                  2
3582
+    vextracti32x4         [r2 + 2 * r3 + mmsize/2],      m0,                  3
3583
+    vextracti32x4         [r2 + r7 + mmsize/2],          m2,                  3
3584
+%endmacro
3585
+
3586
+%macro FILTER_VER_PS_CHROMA_24xN_AVX512 1
3587
+INIT_ZMM avx512
3588
+cglobal interp_4tap_vert_ps_24x%1, 5, 12, 18
3589
+    add                   r1d,                r1d
3590
+    add                   r3d,                r3d
3591
+    sub                   r0,                 r1
3592
+    shl                   r4d,                7
3593
+
3594
+%ifdef PIC
3595
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
3596
+    lea                   r5,                 [r5 + r4]
3597
+%else
3598
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
3599
+%endif
3600
+    vbroadcasti32x4       m7,                 [INTERP_OFFSET_PS]
3601
+    lea                   r10,                [3 * r1]
3602
+    lea                   r7,                 [3 * r3]
3603
+    mova                  m16,                [r5]
3604
+    mova                  m17,                [r5 + mmsize]
3605
+%rep %1/8 - 1
3606
+    PROCESS_CHROMA_VERT_PS_24x8_AVX512
3607
+    lea                   r0,                 [r8 + 4 * r1]
3608
+    lea                   r2,                 [r2 + 4 * r3]
3609
+%endrep
3610
+    PROCESS_CHROMA_VERT_PS_24x8_AVX512
3611
+    RET
3612
+%endmacro
3613
+
3614
+%if ARCH_X86_64
3615
+    FILTER_VER_PS_CHROMA_24xN_AVX512 32
3616
+    FILTER_VER_PS_CHROMA_24xN_AVX512 64
3617
+%endif
3618
+
3619
+%macro PROCESS_CHROMA_VERT_PS_32x2_AVX512 0
3620
+    movu                  m1,                 [r0]
3621
+    movu                  m3,                 [r0 + r1]
3622
+    punpcklwd             m0,                 m1,                  m3
3623
+    pmaddwd               m0,                 m9
3624
+    punpckhwd             m1,                 m3
3625
+    pmaddwd               m1,                 m9
3626
+
3627
+    movu                  m4,                 [r0 + 2 * r1]
3628
+    punpcklwd             m2,                 m3,                  m4
3629
+    pmaddwd               m2,                 m9
3630
+    punpckhwd             m3,                 m4
3631
+    pmaddwd               m3,                 m9
3632
+
3633
+    lea                   r0,                 [r0 + 2 * r1]
3634
+    movu                  m5,                 [r0 + r1]
3635
+    punpcklwd             m6,                 m4,                  m5
3636
+    pmaddwd               m6,                 m10
3637
+    paddd                 m0,                 m6
3638
+    punpckhwd             m4,                 m5
3639
+    pmaddwd               m4,                 m10
3640
+    paddd                 m1,                 m4
3641
+
3642
+    movu                  m4,                 [r0 + 2 * r1]
3643
+    punpcklwd             m6,                 m5,                  m4
3644
+    pmaddwd               m6,                 m10
3645
+    paddd                 m2,                 m6
3646
+    punpckhwd             m5,                 m4
3647
+    pmaddwd               m5,                 m10
3648
+    paddd                 m3,                 m5
3649
+
3650
+    paddd                 m0,                 m7
3651
+    paddd                 m1,                 m7
3652
+    paddd                 m2,                 m7
3653
+    paddd                 m3,                 m7
3654
+    psrad                 m0,                 INTERP_SHIFT_PS
3655
+    psrad                 m1,                 INTERP_SHIFT_PS
3656
+    psrad                 m2,                 INTERP_SHIFT_PS
3657
+    psrad                 m3,                 INTERP_SHIFT_PS
3658
+
3659
+    packssdw              m0,                 m1
3660
+    packssdw              m2,                 m3
3661
+    movu                  [r2],               m0
3662
+    movu                  [r2 + r3],          m2
3663
+%endmacro
3664
+
3665
+;-----------------------------------------------------------------------------------------------------------------
3666
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3667
+;-----------------------------------------------------------------------------------------------------------------
3668
+%macro FILTER_VER_PS_CHROMA_32xN_AVX512 1
3669
+INIT_ZMM avx512
3670
+cglobal interp_4tap_vert_ps_32x%1, 5, 7, 11
3671
+    add                   r1d,                r1d
3672
+    add                   r3d,                r3d
3673
+    sub                   r0,                 r1
3674
+    shl                   r4d,                7
3675
+
3676
+%ifdef PIC
3677
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
3678
+    lea                   r5,                 [r5 + r4]
3679
+%else
3680
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
3681
+%endif
3682
+    vbroadcasti32x4       m7,                 [INTERP_OFFSET_PS]
3683
+    mova                  m9,                 [r5]
3684
+    mova                  m10,                [r5 + mmsize]
3685
+%rep %1/2 - 1
3686
+    PROCESS_CHROMA_VERT_PS_32x2_AVX512
3687
+    lea                   r2,                 [r2 + 2 * r3]
3688
+%endrep
3689
+    PROCESS_CHROMA_VERT_PS_32x2_AVX512
3690
+    RET
3691
+%endmacro
3692
+
3693
+%if ARCH_X86_64
3694
+FILTER_VER_PS_CHROMA_32xN_AVX512 8
3695
+FILTER_VER_PS_CHROMA_32xN_AVX512 16
3696
+FILTER_VER_PS_CHROMA_32xN_AVX512 24
3697
+FILTER_VER_PS_CHROMA_32xN_AVX512 32
3698
+FILTER_VER_PS_CHROMA_32xN_AVX512 48
3699
+FILTER_VER_PS_CHROMA_32xN_AVX512 64
3700
+%endif
3701
+
3702
+%macro PROCESS_CHROMA_VERT_PS_48x4_AVX512 0
3703
+    movu                  m1,                 [r0]
3704
+    lea                   r6,                 [r0 + 2 * r1]
3705
+    movu                  m10,                [r6]
3706
+    movu                  m3,                 [r0 + r1]
3707
+    movu                  m12,                [r6 + r1]
3708
+    punpcklwd             m0,                 m1,                  m3
3709
+    punpcklwd             m9,                 m10,                 m12
3710
+    pmaddwd               m0,                 m16
3711
+    pmaddwd               m9,                 m16
3712
+    punpckhwd             m1,                 m3
3713
+    punpckhwd             m10,                m12
3714
+    pmaddwd               m1,                 m16
3715
+    pmaddwd               m10,                m16
3716
+
3717
+    movu                  m4,                 [r0 + 2 * r1]
3718
+    movu                  m13,                [r6 + 2 * r1]
3719
+    punpcklwd             m2,                 m3,                  m4
3720
+    punpcklwd             m11,                m12,                 m13
3721
+    pmaddwd               m2,                 m16
3722
+    pmaddwd               m11,                m16
3723
+    punpckhwd             m3,                 m4
3724
+    punpckhwd             m12,                m13
3725
+    pmaddwd               m3,                 m16
3726
+    pmaddwd               m12,                m16
3727
+
3728
+    movu                  m5,                 [r0 + r7]
3729
+    movu                  m14,                [r6 + r7]
3730
+    punpcklwd             m6,                 m4,                  m5
3731
+    punpcklwd             m15,                m13,                 m14
3732
+    pmaddwd               m6,                 m17
3733
+    pmaddwd               m15,                m17
3734
+    paddd                 m0,                 m6
3735
+    paddd                 m9,                 m15
3736
+    punpckhwd             m4,                 m5
3737
+    punpckhwd             m13,                m14
3738
+    pmaddwd               m4,                 m17
3739
+    pmaddwd               m13,                m17
3740
+    paddd                 m1,                 m4
3741
+    paddd                 m10,                m13
3742
+
3743
+    movu                  m4,                 [r0 + 4 * r1]
3744
+    movu                  m13,                [r6 + 4 * r1]
3745
+    punpcklwd             m6,                 m5,                  m4
3746
+    punpcklwd             m15,                m14,                 m13
3747
+    pmaddwd               m6,                 m17
3748
+    pmaddwd               m15,                m17
3749
+    paddd                 m2,                 m6
3750
+    paddd                 m11,                m15
3751
+    punpckhwd             m5,                 m4
3752
+    punpckhwd             m14,                m13
3753
+    pmaddwd               m5,                 m17
3754
+    pmaddwd               m14,                m17
3755
+    paddd                 m3,                 m5
3756
+    paddd                 m12,                m14
3757
+
3758
+    paddd                 m0,                 m7
3759
+    paddd                 m1,                 m7
3760
+    paddd                 m2,                 m7
3761
+    paddd                 m3,                 m7
3762
+    paddd                 m9,                 m7
3763
+    paddd                 m10,                m7
3764
+    paddd                 m11,                m7
3765
+    paddd                 m12,                m7
3766
+
3767
+    psrad                 m0,                 INTERP_SHIFT_PS
3768
+    psrad                 m1,                 INTERP_SHIFT_PS
3769
+    psrad                 m2,                 INTERP_SHIFT_PS
3770
+    psrad                 m3,                 INTERP_SHIFT_PS
3771
+    psrad                 m9,                 INTERP_SHIFT_PS
3772
+    psrad                 m10,                INTERP_SHIFT_PS
3773
+    psrad                 m11,                INTERP_SHIFT_PS
3774
+    psrad                 m12,                INTERP_SHIFT_PS
3775
+
3776
+    packssdw              m0,                 m1
3777
+    packssdw              m2,                 m3
3778
+    packssdw              m9,                 m10
3779
+    packssdw              m11,                m12
3780
+    movu                  [r2],               m0
3781
+    movu                  [r2 + r3],          m2
3782
+    movu                  [r2 + 2 * r3],      m9
3783
+    movu                  [r2 + r8],          m11
3784
+
3785
+    movu                  ym1,                [r0 + mmsize]
3786
+    vinserti32x8          m1,                 [r6 + mmsize],       1
3787
+    movu                  ym3,                [r0 + r1 + mmsize]
3788
+    vinserti32x8          m3,                 [r6 + r1 + mmsize],  1
3789
+    punpcklwd             m0,                 m1,                  m3
3790
+    pmaddwd               m0,                 m16
3791
+    punpckhwd             m1,                 m3
3792
+    pmaddwd               m1,                 m16
3793
+
3794
+    movu                  ym4,                [r0 + 2 * r1 + mmsize]
3795
+    vinserti32x8          m4,                 [r6 + 2 * r1 + mmsize],  1
3796
+    punpcklwd             m2,                 m3,                  m4
3797
+    pmaddwd               m2,                 m16
3798
+    punpckhwd             m3,                 m4
3799
+    pmaddwd               m3,                 m16
3800
+
3801
+    movu                  ym5,                [r0 + r7 + mmsize]
3802
+    vinserti32x8          m5,                 [r6 + r7 + mmsize],  1
3803
+    punpcklwd             m6,                 m4,                  m5
3804
+    pmaddwd               m6,                 m17
3805
+    paddd                 m0,                 m6
3806
+    punpckhwd             m4,                 m5
3807
+    pmaddwd               m4,                 m17
3808
+    paddd                 m1,                 m4
3809
+
3810
+    movu                  ym4,                [r0 + 4 * r1 + mmsize]
3811
+    vinserti32x8          m4,                 [r6 + 4 * r1 + mmsize],  1
3812
+    punpcklwd             m6,                 m5,                  m4
3813
+    pmaddwd               m6,                 m17
3814
+    paddd                 m2,                 m6
3815
+    punpckhwd             m5,                 m4
3816
+    pmaddwd               m5,                 m17
3817
+    paddd                 m3,                 m5
3818
+
3819
+    paddd                 m0,                 m7
3820
+    paddd                 m1,                 m7
3821
+    paddd                 m2,                 m7
3822
+    paddd                 m3,                 m7
3823
+
3824
+    psrad                 m0,                 INTERP_SHIFT_PS
3825
+    psrad                 m1,                 INTERP_SHIFT_PS
3826
+    psrad                 m2,                 INTERP_SHIFT_PS
3827
+    psrad                 m3,                 INTERP_SHIFT_PS
3828
+
3829
+    packssdw              m0,                 m1
3830
+    packssdw              m2,                 m3
3831
+    movu                  [r2 + mmsize],               ym0
3832
+    movu                  [r2 + r3 + mmsize],          ym2
3833
+    vextracti32x8         [r2 + 2 * r3 + mmsize],      m0,                  1
3834
+    vextracti32x8         [r2 + r8 + mmsize],          m2,                  1
3835
+%endmacro
3836
+
3837
+%if ARCH_X86_64
3838
+INIT_ZMM avx512
3839
+cglobal interp_4tap_vert_ps_48x64, 5, 9, 18
3840
+     add                   r1d,                r1d
3841
+     add                   r3d,                r3d
3842
+     sub                   r0,                 r1
3843
+     shl                   r4d,                7
3844
+%ifdef PIC
3845
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
3846
+    lea                   r5,                 [r5 + r4]
3847
+%else
3848
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
3849
+%endif
3850
+    lea                   r7,                 [3 * r1]
3851
+    lea                   r8,                 [3 * r3]
3852
+    vbroadcasti32x4       m7,                 [INTERP_OFFSET_PS]
3853
+    mova                  m16,                [r5]
3854
+    mova                  m17,                [r5 + mmsize]
3855
+%rep 15
3856
+    PROCESS_CHROMA_VERT_PS_48x4_AVX512
3857
+    lea                   r0,                 [r0 + 4 * r1]
3858
+    lea                   r2,                 [r2 + 4 * r3]
3859
+%endrep
3860
+    PROCESS_CHROMA_VERT_PS_48x4_AVX512
3861
+    RET
3862
+%endif
3863
+
3864
+%macro PROCESS_CHROMA_VERT_PS_64x2_AVX512 0
3865
+    movu                 m1,                  [r0]
3866
+    movu                 m3,                  [r0 + r1]
3867
+    punpcklwd            m0,                  m1,                     m3
3868
+    pmaddwd              m0,                  m15
3869
+    punpckhwd            m1,                  m3
3870
+    pmaddwd              m1,                  m15
3871
+
3872
+    movu                 m9,                  [r0 + mmsize]
3873
+    movu                 m11,                 [r0 + r1 + mmsize]
3874
+    punpcklwd            m8,                  m9,                     m11
3875
+    pmaddwd              m8,                  m15
3876
+    punpckhwd            m9,                  m11
3877
+    pmaddwd              m9,                  m15
3878
+
3879
+    movu                 m4,                  [r0 + 2 * r1]
3880
+    punpcklwd            m2,                  m3,                     m4
3881
+    pmaddwd              m2,                  m15
3882
+    punpckhwd            m3,                  m4
3883
+    pmaddwd              m3,                  m15
3884
+
3885
+    movu                 m12,                 [r0 + 2 * r1 + mmsize]
3886
+    punpcklwd            m10,                 m11,                    m12
3887
+    pmaddwd              m10,                 m15
3888
+    punpckhwd            m11,                 m12
3889
+    pmaddwd              m11,                 m15
3890
+
3891
+    lea                  r0,                  [r0 + 2 * r1]
3892
+    movu                 m5,                  [r0 + r1]
3893
+    punpcklwd            m6,                  m4,                     m5
3894
+    pmaddwd              m6,                  m16
3895
+    paddd                m0,                  m6
3896
+    punpckhwd            m4,                  m5
3897
+    pmaddwd              m4,                  m16
3898
+    paddd                m1,                  m4
3899
+
3900
+    movu                 m13,                 [r0 + r1 + mmsize]
3901
+    punpcklwd            m14,                 m12,                    m13
3902
+    pmaddwd              m14,                 m16
3903
+    paddd                m8,                  m14
3904
+    punpckhwd            m12,                 m13
3905
+    pmaddwd              m12,                 m16
3906
+    paddd                m9,                  m12
3907
+
3908
+    movu                 m4,                  [r0 + 2 * r1]
3909
+    punpcklwd            m6,                  m5,                     m4
3910
+    pmaddwd              m6,                  m16
3911
+    paddd                m2,                  m6
3912
+    punpckhwd            m5,                  m4
3913
+    pmaddwd              m5,                  m16
3914
+    paddd                m3,                  m5
3915
+
3916
+    movu                 m12,                 [r0 + 2 * r1 + mmsize]
3917
+    punpcklwd            m14,                 m13,                    m12
3918
+    pmaddwd              m14,                 m16
3919
+    paddd                m10,                 m14
3920
+    punpckhwd            m13,                 m12
3921
+    pmaddwd              m13,                 m16
3922
+    paddd                m11,                 m13
3923
+
3924
+    paddd                m0,                  m7
3925
+    paddd                m1,                  m7
3926
+    paddd                m2,                  m7
3927
+    paddd                m3,                  m7
3928
+    paddd                m8,                  m7
3929
+    paddd                m9,                  m7
3930
+    paddd                m10,                 m7
3931
+    paddd                m11,                 m7
3932
+
3933
+    psrad                m0,                  INTERP_SHIFT_PS
3934
+    psrad                m1,                  INTERP_SHIFT_PS
3935
+    psrad                m2,                  INTERP_SHIFT_PS
3936
+    psrad                m3,                  INTERP_SHIFT_PS
3937
+    psrad                m8,                  INTERP_SHIFT_PS
3938
+    psrad                m9,                  INTERP_SHIFT_PS
3939
+    psrad                m10,                 INTERP_SHIFT_PS
3940
+    psrad                m11,                 INTERP_SHIFT_PS
3941
+
3942
+    packssdw             m0,                  m1
3943
+    packssdw             m2,                  m3
3944
+    packssdw             m8,                  m9
3945
+    packssdw             m10,                 m11
3946
+    movu                 [r2],                m0
3947
+    movu                 [r2 + r3],           m2
3948
+    movu                 [r2 + mmsize],       m8
3949
+    movu                 [r2 + r3 + mmsize],  m10
3950
+%endmacro
3951
+
3952
+;-----------------------------------------------------------------------------------------------------------------
3953
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3954
+;-----------------------------------------------------------------------------------------------------------------
3955
+%macro FILTER_VER_PS_CHROMA_64xN_AVX512 1
3956
+INIT_ZMM avx512
3957
+cglobal interp_4tap_vert_ps_64x%1, 5, 7, 17
3958
+    add                   r1d,                r1d
3959
+    add                   r3d,                r3d
3960
+    sub                   r0,                 r1
3961
+    shl                   r4d,                7
3962
+
3963
+%ifdef PIC
3964
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
3965
+    lea                   r5,                 [r5 + r4]
3966
+%else
3967
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
3968
+%endif
3969
+    vbroadcasti32x4       m7,                 [INTERP_OFFSET_PS]
3970
+    mova                  m15,                [r5]
3971
+    mova                  m16,                [r5 + mmsize]
3972
+
3973
+%rep %1/2 - 1
3974
+    PROCESS_CHROMA_VERT_PS_64x2_AVX512
3975
+    lea                   r2,                 [r2 + 2 * r3]
3976
+%endrep
3977
+    PROCESS_CHROMA_VERT_PS_64x2_AVX512
3978
+    RET
3979
+%endmacro
3980
+
3981
+%if ARCH_X86_64
3982
+FILTER_VER_PS_CHROMA_64xN_AVX512 16
3983
+FILTER_VER_PS_CHROMA_64xN_AVX512 32
3984
+FILTER_VER_PS_CHROMA_64xN_AVX512 48
3985
+FILTER_VER_PS_CHROMA_64xN_AVX512 64
3986
+%endif
3987
+;-------------------------------------------------------------------------------------------------------------
3988
+; avx512 chroma_vps code end
3989
+;-------------------------------------------------------------------------------------------------------------
3990
+;-------------------------------------------------------------------------------------------------------------
3991
+; avx512 chroma_vsp and chroma_vss code start
3992
+;-------------------------------------------------------------------------------------------------------------
3993
+%macro PROCESS_CHROMA_VERT_S_8x8_AVX512 1
3994
+    movu                  xm1,                [r0]
3995
+    lea                   r6,                 [r0 + 2 * r1]
3996
+    lea                   r8,                 [r0 + 4 * r1]
3997
+    lea                   r9,                 [r8 + 2 * r1]
3998
+    vinserti32x4          m1,                 [r6],                1
3999
+    vinserti32x4          m1,                 [r8],                2
4000
+    vinserti32x4          m1,                 [r9],                3
4001
+    movu                  xm3,                [r0 + r1]
4002
+    vinserti32x4          m3,                 [r6 + r1],           1
4003
+    vinserti32x4          m3,                 [r8 + r1],           2
4004
+    vinserti32x4          m3,                 [r9 + r1],           3
4005
+    punpcklwd             m0,                 m1,                  m3
4006
+    pmaddwd               m0,                 m8
4007
+    punpckhwd             m1,                 m3
4008
+    pmaddwd               m1,                 m8
4009
+
4010
+    movu                  xm4,                [r0 + 2 * r1]
4011
+    vinserti32x4          m4,                 [r6 + 2 * r1],       1
4012
+    vinserti32x4          m4,                 [r8 + 2 * r1],       2
4013
+    vinserti32x4          m4,                 [r9 + 2 * r1],       3
4014
+    punpcklwd             m2,                 m3,                  m4
4015
+    pmaddwd               m2,                 m8
4016
+    punpckhwd             m3,                 m4
4017
+    pmaddwd               m3,                 m8
4018
+
4019
+    movu                  xm5,                [r0 + r10]
4020
+    vinserti32x4          m5,                 [r6 + r10],          1
4021
+    vinserti32x4          m5,                 [r8 + r10],          2
4022
+    vinserti32x4          m5,                 [r9 + r10],          3
4023
+    punpcklwd             m6,                 m4,                  m5
4024
+    pmaddwd               m6,                 m9
4025
+    paddd                 m0,                 m6
4026
+    punpckhwd             m4,                 m5
4027
+    pmaddwd               m4,                 m9
4028
+    paddd                 m1,                 m4
4029
+
4030
+    movu                  xm4,                [r0 + 4 * r1]
4031
+    vinserti32x4          m4,                 [r6 + 4 * r1],       1
4032
+    vinserti32x4          m4,                 [r8 + 4 * r1],       2
4033
+    vinserti32x4          m4,                 [r9 + 4 * r1],       3
4034
+    punpcklwd             m6,                 m5,                  m4
4035
+    pmaddwd               m6,                 m9
4036
+    paddd                 m2,                 m6
4037
+    punpckhwd             m5,                 m4
4038
+    pmaddwd               m5,                 m9
4039
+    paddd                 m3,                 m5
4040
+
4041
+%ifidn %1,sp
4042
+    paddd                 m0,                 m7
4043
+    paddd                 m1,                 m7
4044
+    paddd                 m2,                 m7
4045
+    paddd                 m3,                 m7
4046
+
4047
+    psrad                 m0,                 INTERP_SHIFT_SP
4048
+    psrad                 m1,                 INTERP_SHIFT_SP
4049
+    psrad                 m2,                 INTERP_SHIFT_SP
4050
+    psrad                 m3,                 INTERP_SHIFT_SP
4051
+
4052
+    packssdw              m0,                 m1
4053
+    packssdw              m2,                 m3
4054
+    CLIPW2                m0,                 m2,                  m10,                 m11
4055
+%else
4056
+    psrad                 m0,                 6
4057
+    psrad                 m1,                 6
4058
+    psrad                 m2,                 6
4059
+    psrad                 m3,                 6
4060
+    packssdw              m0,                 m1
4061
+    packssdw              m2,                 m3
4062
+%endif
4063
+
4064
+    movu                  [r2],               xm0
4065
+    movu                  [r2 + r3],          xm2
4066
+    vextracti32x4         [r2 + 2 * r3],      m0,                  1
4067
+    vextracti32x4         [r2 + r7],          m2,                  1
4068
+    lea                   r2,                 [r2 + 4 * r3]
4069
+    vextracti32x4         [r2],               m0,                  2
4070
+    vextracti32x4         [r2 + r3],          m2,                  2
4071
+    vextracti32x4         [r2 + 2 * r3],      m0,                  3
4072
+    vextracti32x4         [r2 + r7],          m2,                  3
4073
+%endmacro
4074
+
4075
+;-----------------------------------------------------------------------------------------------------------------
4076
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
4077
+;-----------------------------------------------------------------------------------------------------------------
4078
+%macro CHROMA_VERT_S_8x8_AVX512 1
4079
+INIT_ZMM avx512
4080
+cglobal interp_4tap_vert_%1_8x8, 5, 11, 12
4081
+    add                   r1d,                r1d
4082
+    add                   r3d,                r3d
4083
+    sub                   r0,                 r1
4084
+    shl                   r4d,                7
4085
+%ifdef PIC
4086
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
4087
+    mova                  m8,                 [r5 + r4]
4088
+    mova                  m9,                 [r5 + r4 + mmsize]
4089
+%else
4090
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
4091
+    mova                  m8,                 [r5]
4092
+    mova                  m9,                 [r5 + mmsize]
4093
+%endif
4094
+%ifidn %1, sp
4095
+    vbroadcasti32x4       m7,                 [INTERP_OFFSET_SP]
4096
+    pxor                  m10,                m10
4097
+    vbroadcasti32x8       m11,                [pw_pixel_max]
4098
+%endif
4099
+    lea                   r10,                [3 * r1]
4100
+    lea                   r7,                 [3 * r3]
4101
+
4102
+    PROCESS_CHROMA_VERT_S_8x8_AVX512 %1
4103
+    RET
4104
+%endmacro
4105
+
4106
+%if ARCH_X86_64
4107
+    CHROMA_VERT_S_8x8_AVX512 ss
4108
+    CHROMA_VERT_S_8x8_AVX512 sp
4109
+%endif
4110
+%macro FILTER_VER_S_CHROMA_8xN_AVX512 2
4111
+INIT_ZMM avx512
4112
+cglobal interp_4tap_vert_%1_8x%2, 5, 11, 10
4113
+    add                   r1d,                r1d
4114
+    add                   r3d,                r3d
4115
+    sub                   r0,                 r1
4116
+    shl                   r4d,                7
4117
+%ifdef PIC
4118
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
4119
+    mova                  m8,                 [r5 + r4]
4120
+    mova                  m9,                 [r5 + r4 + mmsize]
4121
+%else
4122
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
4123
+    mova                  m8,                 [r5]
4124
+    mova                  m9,                 [r5 + mmsize]
4125
+%endif
4126
+
4127
+%ifidn %1, sp
4128
+    vbroadcasti32x4       m7,                 [INTERP_OFFSET_SP]
4129
+    pxor                  m10,                m10
4130
+    vbroadcasti32x8       m11,                [pw_pixel_max]
4131
+%endif
4132
+    lea                   r10,                [3 * r1]
4133
+    lea                   r7,                 [3 * r3]
4134
+
4135
+%rep %2/8 - 1
4136
+    PROCESS_CHROMA_VERT_S_8x8_AVX512 %1
4137
+    lea                   r0,                 [r8 + 4 * r1]
4138
+    lea                   r2,                 [r2 + 4 * r3]
4139
+%endrep
4140
+    PROCESS_CHROMA_VERT_S_8x8_AVX512 %1
4141
+    RET
4142
+%endmacro
4143
+%if ARCH_X86_64
4144
+    FILTER_VER_S_CHROMA_8xN_AVX512 ss, 16
4145
+    FILTER_VER_S_CHROMA_8xN_AVX512 ss, 32
4146
+    FILTER_VER_S_CHROMA_8xN_AVX512 ss, 64
4147
+    FILTER_VER_S_CHROMA_8xN_AVX512 sp, 16
4148
+    FILTER_VER_S_CHROMA_8xN_AVX512 sp, 32
4149
+    FILTER_VER_S_CHROMA_8xN_AVX512 sp, 64
4150
+%endif
4151
+%macro PROCESS_CHROMA_VERT_S_16x4_AVX512 1
4152
+    movu                  ym1,                [r0]
4153
+    lea                   r6,                 [r0 + 2 * r1]
4154
+    vinserti32x8          m1,                 [r6],                1
4155
+    movu                  ym3,                [r0 + r1]
4156
+    vinserti32x8          m3,                 [r6 + r1],           1
4157
+    punpcklwd             m0,                 m1,                  m3
4158
+    pmaddwd               m0,                 m8
4159
+    punpckhwd             m1,                 m3
4160
+    pmaddwd               m1,                 m8
4161
+
4162
+    movu                  ym4,                [r0 + 2 * r1]
4163
+    vinserti32x8          m4,                 [r6 + 2 * r1],       1
4164
+    punpcklwd             m2,                 m3,                  m4
4165
+    pmaddwd               m2,                 m8
4166
+    punpckhwd             m3,                 m4
4167
+    pmaddwd               m3,                 m8
4168
+
4169
+    movu                  ym5,                [r0 + r8]
4170
+    vinserti32x8          m5,                 [r6 + r8],           1
4171
+    punpcklwd             m6,                 m4,                  m5
4172
+    pmaddwd               m6,                 m9
4173
+    paddd                 m0,                 m6
4174
+    punpckhwd             m4,                 m5
4175
+    pmaddwd               m4,                 m9
4176
+    paddd                 m1,                 m4
4177
+
4178
+    movu                  ym4,                [r0 + 4 * r1]
4179
+    vinserti32x8          m4,                 [r6 + 4 * r1],       1
4180
+    punpcklwd             m6,                 m5,                  m4
4181
+    pmaddwd               m6,                 m9
4182
+    paddd                 m2,                 m6
4183
+    punpckhwd             m5,                 m4
4184
+    pmaddwd               m5,                 m9
4185
+    paddd                 m3,                 m5
4186
+
4187
+%ifidn %1,sp
4188
+    paddd                 m0,                 m7
4189
+    paddd                 m1,                 m7
4190
+    paddd                 m2,                 m7
4191
+    paddd                 m3,                 m7
4192
+
4193
+    psrad                 m0,                 INTERP_SHIFT_SP
4194
+    psrad                 m1,                 INTERP_SHIFT_SP
4195
+    psrad                 m2,                 INTERP_SHIFT_SP
4196
+    psrad                 m3,                 INTERP_SHIFT_SP
4197
+
4198
+    packssdw              m0,                 m1
4199
+    packssdw              m2,                 m3
4200
+    CLIPW2                m0,                 m2,                  m10,                 m11
4201
+%else
4202
+    psrad                 m0,                 6
4203
+    psrad                 m1,                 6
4204
+    psrad                 m2,                 6
4205
+    psrad                 m3,                 6
4206
+    packssdw              m0,                 m1
4207
+    packssdw              m2,                 m3
4208
+%endif
4209
+
4210
+    movu                  [r2],               ym0
4211
+    movu                  [r2 + r3],          ym2
4212
+    vextracti32x8         [r2 + 2 * r3],      m0,                  1
4213
+    vextracti32x8         [r2 + r7],          m2,                  1
4214
+%endmacro
4215
+
4216
+;-----------------------------------------------------------------------------------------------------------------
4217
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
4218
+;-----------------------------------------------------------------------------------------------------------------
4219
+%macro CHROMA_VERT_S_16x4_AVX512 1
4220
+INIT_ZMM avx512
4221
+cglobal interp_4tap_vert_%1_16x4, 5, 9, 12
4222
+    add                   r1d,                r1d
4223
+    add                   r3d,                r3d
4224
+    sub                   r0,                 r1
4225
+    shl                   r4d,                7
4226
+%ifdef PIC
4227
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
4228
+    mova                  m8,                 [r5 + r4]
4229
+    mova                  m9,                 [r5 + r4 + mmsize]
4230
+%else
4231
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
4232
+    mova                  m8,                 [r5]
4233
+    mova                  m9,                 [r5 + mmsize]
4234
+%endif
4235
+
4236
+%ifidn %1, sp
4237
+    vbroadcasti32x4       m7,                 [INTERP_OFFSET_SP]
4238
+    pxor                  m10,                m10
4239
+    vbroadcasti32x8       m11,                [pw_pixel_max]
4240
+%endif
4241
+    lea                   r7,                 [3 * r3]
4242
+    lea                   r8,                 [3 * r1]
4243
+    PROCESS_CHROMA_VERT_S_16x4_AVX512 %1
4244
+    RET
4245
+%endmacro
4246
+
4247
+%if ARCH_X86_64
4248
+    CHROMA_VERT_S_16x4_AVX512 ss
4249
+    CHROMA_VERT_S_16x4_AVX512 sp
4250
+%endif
4251
+%macro FILTER_VER_S_CHROMA_16xN_AVX512 2
4252
+INIT_ZMM avx512
4253
+cglobal interp_4tap_vert_%1_16x%2, 5, 9, 12
4254
+    add                   r1d,                r1d
4255
+    add                   r3d,                r3d
4256
+    sub                   r0,                 r1
4257
+    shl                   r4d,                7
4258
+%ifdef PIC
4259
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
4260
+    mova                  m8,                 [r5 + r4]
4261
+    mova                  m9,                 [r5 + r4 + mmsize]
4262
+%else
4263
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
4264
+    mova                  m8,                 [r5]
4265
+    mova                  m9,                 [r5 + mmsize]
4266
+%endif
4267
+
4268
+%ifidn %1, sp
4269
+    vbroadcasti32x4       m7,                 [INTERP_OFFSET_SP]
4270
+    pxor                  m10,                m10
4271
+    vbroadcasti32x8       m11,                [pw_pixel_max]
4272
+%endif
4273
+    lea                   r7,                 [3 * r3]
4274
+    lea                   r8,                 [3 * r1]
4275
+%rep %2/4 - 1
4276
+    PROCESS_CHROMA_VERT_S_16x4_AVX512 %1
4277
+    lea                   r0,                 [r0 + 4 * r1]
4278
+    lea                   r2,                 [r2 + 4 * r3]
4279
+%endrep
4280
+    PROCESS_CHROMA_VERT_S_16x4_AVX512 %1
4281
+    RET
4282
+%endmacro
4283
+
4284
+%if ARCH_X86_64
4285
+    FILTER_VER_S_CHROMA_16xN_AVX512 ss, 8
4286
+    FILTER_VER_S_CHROMA_16xN_AVX512 ss, 12
4287
+    FILTER_VER_S_CHROMA_16xN_AVX512 ss, 16
4288
+    FILTER_VER_S_CHROMA_16xN_AVX512 ss, 24
4289
+    FILTER_VER_S_CHROMA_16xN_AVX512 ss, 32
4290
+    FILTER_VER_S_CHROMA_16xN_AVX512 ss, 64
4291
+    FILTER_VER_S_CHROMA_16xN_AVX512 sp, 8
4292
+    FILTER_VER_S_CHROMA_16xN_AVX512 sp, 12
4293
+    FILTER_VER_S_CHROMA_16xN_AVX512 sp, 16
4294
+    FILTER_VER_S_CHROMA_16xN_AVX512 sp, 24
4295
+    FILTER_VER_S_CHROMA_16xN_AVX512 sp, 32
4296
+    FILTER_VER_S_CHROMA_16xN_AVX512 sp, 64
4297
+%endif
4298
+
4299
+%macro PROCESS_CHROMA_VERT_S_24x8_AVX512 1
4300
+    movu                  ym1,                [r0]
4301
+    lea                   r6,                 [r0 + 2 * r1]
4302
+    lea                   r8,                 [r0 + 4 * r1]
4303
+    lea                   r9,                 [r8 + 2 * r1]
4304
+
4305
+    movu                  ym10,               [r8]
4306
+    movu                  ym3,                [r0 + r1]
4307
+    movu                  ym12,               [r8 + r1]
4308
+    vinserti32x8          m1,                 [r6],                1
4309
+    vinserti32x8          m10,                [r9],                1
4310
+    vinserti32x8          m3,                 [r6 + r1],           1
4311
+    vinserti32x8          m12,                [r9 + r1],           1
4312
+
4313
+    punpcklwd             m0,                 m1,                  m3
4314
+    punpcklwd             m9,                 m10,                 m12
4315
+    pmaddwd               m0,                 m16
4316
+    pmaddwd               m9,                 m16
4317
+    punpckhwd             m1,                 m3
4318
+    punpckhwd             m10,                m12
4319
+    pmaddwd               m1,                 m16
4320
+    pmaddwd               m10,                m16
4321
+
4322
+    movu                  ym4,                [r0 + 2 * r1]
4323
+    movu                  ym13,               [r8 + 2 * r1]
4324
+    vinserti32x8          m4,                 [r6 + 2 * r1],       1
4325
+    vinserti32x8          m13,                [r9 + 2 * r1],       1
4326
+    punpcklwd             m2,                 m3,                  m4
4327
+    punpcklwd             m11,                m12,                 m13
4328
+    pmaddwd               m2,                 m16
4329
+    pmaddwd               m11,                m16
4330
+    punpckhwd             m3,                 m4
4331
+    punpckhwd             m12,                m13
4332
+    pmaddwd               m3,                 m16
4333
+    pmaddwd               m12,                m16
4334
+
4335
+    movu                  ym5,                [r0 + r10]
4336
+    vinserti32x8          m5,                 [r6 + r10],          1
4337
+    movu                  ym14,               [r8 + r10]
4338
+    vinserti32x8          m14,                [r9 + r10],          1
4339
+    punpcklwd             m6,                 m4,                  m5
4340
+    punpcklwd             m15,                m13,                 m14
4341
+    pmaddwd               m6,                 m17
4342
+    pmaddwd               m15,                m17
4343
+    paddd                 m0,                 m6
4344
+    paddd                 m9,                 m15
4345
+    punpckhwd             m4,                 m5
4346
+    punpckhwd             m13,                m14
4347
+    pmaddwd               m4,                 m17
4348
+    pmaddwd               m13,                m17
4349
+    paddd                 m1,                 m4
4350
+    paddd                 m10,                m13
4351
+
4352
+    movu                  ym4,                [r0 + 4 * r1]
4353
+    vinserti32x8          m4,                 [r6 + 4 * r1],       1
4354
+    movu                  ym13,               [r8 + 4 * r1]
4355
+    vinserti32x8          m13,                [r9 + 4 * r1],       1
4356
+    punpcklwd             m6,                 m5,                  m4
4357
+    punpcklwd             m15,                m14,                 m13
4358
+    pmaddwd               m6,                 m17
4359
+    pmaddwd               m15,                m17
4360
+    paddd                 m2,                 m6
4361
+    paddd                 m11,                m15
4362
+    punpckhwd             m5,                 m4
4363
+    punpckhwd             m14,                m13
4364
+    pmaddwd               m5,                 m17
4365
+    pmaddwd               m14,                m17
4366
+    paddd                 m3,                 m5
4367
+    paddd                 m12,                m14
4368
+
4369
+%ifidn %1,sp
4370
+    paddd                 m0,                 m7
4371
+    paddd                 m1,                 m7
4372
+    paddd                 m2,                 m7
4373
+    paddd                 m3,                 m7
4374
+    paddd                 m9,                 m7
4375
+    paddd                 m10,                m7
4376
+    paddd                 m11,                m7
4377
+    paddd                 m12,                m7
4378
+
4379
+    psrad                 m0,                 INTERP_SHIFT_SP
4380
+    psrad                 m1,                 INTERP_SHIFT_SP
4381
+    psrad                 m2,                 INTERP_SHIFT_SP
4382
+    psrad                 m3,                 INTERP_SHIFT_SP
4383
+    psrad                 m9,                 INTERP_SHIFT_SP
4384
+    psrad                 m10,                INTERP_SHIFT_SP
4385
+    psrad                 m11,                INTERP_SHIFT_SP
4386
+    psrad                 m12,                INTERP_SHIFT_SP
4387
+
4388
+    packssdw              m0,                 m1
4389
+    packssdw              m2,                 m3
4390
+    packssdw              m9,                 m10
4391
+    packssdw              m11,                m12
4392
+    CLIPW2                m0,                 m2,                m18,                m19
4393
+    CLIPW2                m9,                 m11,               m18,                m19
4394
+%else
4395
+    psrad                 m0,                 6
4396
+    psrad                 m1,                 6
4397
+    psrad                 m2,                 6
4398
+    psrad                 m3,                 6
4399
+    psrad                 m9,                 6
4400
+    psrad                 m10,                6
4401
+    psrad                 m11,                6
4402
+    psrad                 m12,                6
4403
+
4404
+    packssdw              m0,                 m1
4405
+    packssdw              m2,                 m3
4406
+    packssdw              m9,                 m10
4407
+    packssdw              m11,                m12
4408
+%endif
4409
+
4410
+    movu                  [r2],               ym0
4411
+    movu                  [r2 + r3],          ym2
4412
+    vextracti32x8         [r2 + 2 * r3],      m0,                  1
4413
+    vextracti32x8         [r2 + r7],          m2,                  1
4414
+    lea                   r11,                [r2 + 4 * r3]
4415
+    movu                  [r11],              ym9
4416
+    movu                  [r11 + r3],         ym11
4417
+    vextracti32x8         [r11 + 2 * r3],     m9,                  1
4418
+    vextracti32x8         [r11 + r7],         m11,                 1
4419
+
4420
+    movu                  xm1,                [r0 + mmsize/2]
4421
+    vinserti32x4          m1,                 [r6 + mmsize/2],                1
4422
+    vinserti32x4          m1,                 [r8 + mmsize/2],                2
4423
+    vinserti32x4          m1,                 [r9 + mmsize/2],                3
4424
+    movu                  xm3,                [r0 + r1 + mmsize/2]
4425
+    vinserti32x4          m3,                 [r6 + r1 + mmsize/2],           1
4426
+    vinserti32x4          m3,                 [r8 + r1 + mmsize/2],           2
4427
+    vinserti32x4          m3,                 [r9 + r1 + mmsize/2],           3
4428
+    punpcklwd             m0,                 m1,                             m3
4429
+    pmaddwd               m0,                 m16
4430
+    punpckhwd             m1,                 m3
4431
+    pmaddwd               m1,                 m16
4432
+
4433
+    movu                  xm4,                [r0 + 2 * r1 + mmsize/2]
4434
+    vinserti32x4          m4,                 [r6 + 2 * r1 + mmsize/2],       1
4435
+    vinserti32x4          m4,                 [r8 + 2 * r1 + mmsize/2],       2
4436
+    vinserti32x4          m4,                 [r9 + 2 * r1 + mmsize/2],       3
4437
+    punpcklwd             m2,                 m3,                             m4
4438
+    pmaddwd               m2,                 m16
4439
+    punpckhwd             m3,                 m4
4440
+    pmaddwd               m3,                 m16
4441
+
4442
+    movu                  xm5,                [r0 + r10 + mmsize/2]
4443
+    vinserti32x4          m5,                 [r6 + r10 + mmsize/2],          1
4444
+    vinserti32x4          m5,                 [r8 + r10 + mmsize/2],          2
4445
+    vinserti32x4          m5,                 [r9 + r10 + mmsize/2],          3
4446
+    punpcklwd             m6,                 m4,                             m5
4447
+    pmaddwd               m6,                 m17
4448
+    paddd                 m0,                 m6
4449
+    punpckhwd             m4,                 m5
4450
+    pmaddwd               m4,                 m17
4451
+    paddd                 m1,                 m4
4452
+
4453
+    movu                  xm4,                [r0 + 4 * r1 + mmsize/2]
4454
+    vinserti32x4          m4,                 [r6 + 4 * r1 + mmsize/2],       1
4455
+    vinserti32x4          m4,                 [r8 + 4 * r1 + mmsize/2],       2
4456
+    vinserti32x4          m4,                 [r9 + 4 * r1 + mmsize/2],       3
4457
+    punpcklwd             m6,                 m5,                             m4
4458
+    pmaddwd               m6,                 m17
4459
+    paddd                 m2,                 m6
4460
+    punpckhwd             m5,                 m4
4461
+    pmaddwd               m5,                 m17
4462
+    paddd                 m3,                 m5
4463
+
4464
+%ifidn %1,sp
4465
+    paddd                 m0,                 m7
4466
+    paddd                 m1,                 m7
4467
+    paddd                 m2,                 m7
4468
+    paddd                 m3,                 m7
4469
+
4470
+    psrad                 m0,                 INTERP_SHIFT_SP
4471
+    psrad                 m1,                 INTERP_SHIFT_SP
4472
+    psrad                 m2,                 INTERP_SHIFT_SP
4473
+    psrad                 m3,                 INTERP_SHIFT_SP
4474
+
4475
+    packssdw              m0,                 m1
4476
+    packssdw              m2,                 m3
4477
+    CLIPW2                m0,                 m2,                 m18,                m19
4478
+%else
4479
+    psrad                 m0,                 6
4480
+    psrad                 m1,                 6
4481
+    psrad                 m2,                 6
4482
+    psrad                 m3,                 6
4483
+
4484
+    packssdw              m0,                 m1
4485
+    packssdw              m2,                 m3
4486
+%endif
4487
+
4488
+    movu                  [r2 + mmsize/2],               xm0
4489
+    movu                  [r2 + r3 + mmsize/2],          xm2
4490
+    vextracti32x4         [r2 + 2 * r3 + mmsize/2],      m0,                  1
4491
+    vextracti32x4         [r2 + r7 + mmsize/2],          m2,                  1
4492
+    lea                   r2,                            [r2 + 4 * r3]
4493
+    vextracti32x4         [r2 + mmsize/2],               m0,                  2
4494
+    vextracti32x4         [r2 + r3 + mmsize/2],          m2,                  2
4495
+    vextracti32x4         [r2 + 2 * r3 + mmsize/2],      m0,                  3
4496
+    vextracti32x4         [r2 + r7 + mmsize/2],          m2,                  3
4497
+%endmacro
4498
+%macro FILTER_VER_S_CHROMA_24xN_AVX512 2
4499
+INIT_ZMM avx512
4500
+cglobal interp_4tap_vert_%1_24x%2, 5, 12, 20
4501
+    add                   r1d,                r1d
4502
+    add                   r3d,                r3d
4503
+    sub                   r0,                 r1
4504
+    shl                   r4d,                7
4505
+%ifdef PIC
4506
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
4507
+    mova                  m16,                [r5 + r4]
4508
+    mova                  m17,                [r5 + r4 + mmsize]
4509
+%else
4510
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
4511
+    mova                  m16,                [r5]
4512
+    mova                  m17,                [r5 + mmsize]
4513
+%endif
4514
+%ifidn %1, sp
4515
+    vbroadcasti32x4       m7,                 [INTERP_OFFSET_SP]
4516
+    pxor                  m18,                m18
4517
+    vbroadcasti32x8       m19,                [pw_pixel_max]
4518
+%endif
4519
+    lea                   r10,                [3 * r1]
4520
+    lea                   r7,                 [3 * r3]
4521
+%rep %2/8 - 1
4522
+    PROCESS_CHROMA_VERT_S_24x8_AVX512 %1
4523
+    lea                   r0,                 [r8 + 4 * r1]
4524
+    lea                   r2,                 [r2 + 4 * r3]
4525
+%endrep
4526
+    PROCESS_CHROMA_VERT_S_24x8_AVX512 %1
4527
+    RET
4528
+%endmacro
4529
+%if ARCH_X86_64
4530
+    FILTER_VER_S_CHROMA_24xN_AVX512 ss, 32
4531
+    FILTER_VER_S_CHROMA_24xN_AVX512 ss, 64
4532
+    FILTER_VER_S_CHROMA_24xN_AVX512 sp, 32
4533
+    FILTER_VER_S_CHROMA_24xN_AVX512 sp, 64
4534
+%endif
4535
+
4536
+;-----------------------------------------------------------------------------------------------------------------
+; Vertically filter four rows of a 32-sample-wide block of 16-bit data with a
+; 4-tap chroma filter.
+;   %1 = sp : add rounding offset m7, shift by INTERP_SHIFT_SP, pack and clip
+;             to [m18, m19] (pixel range)
+;   %1 = ss : shift by 6 only, pack to signed 16-bit residuals
+; Expects (prepared by the caller): r0 = src - srcStride, r1 = srcStride and
+; r3 = dstStride in bytes, r2 = dst, r7 = 3*r1, r8 = 3*r3, filter taps 0/1 in
+; m16 and taps 2/3 in m17.  Clobbers m0-m6, m9-m15 and r6.
+%macro PROCESS_CHROMA_VERT_S_32x4_AVX512 1
4537
+    movu                  m1,                 [r0]
4538
+    lea                   r6,                 [r0 + 2 * r1]
4539
+    movu                  m10,                [r6]
4540
+    movu                  m3,                 [r0 + r1]
4541
+    movu                  m12,                [r6 + r1]
+    ; interleave vertically adjacent rows and multiply-accumulate with the
+    ; first two taps (m16); rows 0/1 feed outputs 0/1, rows 2/3 feed 2/3
4542
+    punpcklwd             m0,                 m1,                  m3
4543
+    punpcklwd             m9,                 m10,                 m12
4544
+    pmaddwd               m0,                 m16
4545
+    pmaddwd               m9,                 m16
4546
+    punpckhwd             m1,                 m3
4547
+    punpckhwd             m10,                m12
4548
+    pmaddwd               m1,                 m16
4549
+    pmaddwd               m10,                m16
4550
+    movu                  m4,                 [r0 + 2 * r1]
4551
+    movu                  m13,                [r6 + 2 * r1]
4552
+    punpcklwd             m2,                 m3,                  m4
4553
+    punpcklwd             m11,                m12,                 m13
4554
+    pmaddwd               m2,                 m16
4555
+    pmaddwd               m11,                m16
4556
+    punpckhwd             m3,                 m4
4557
+    punpckhwd             m12,                m13
4558
+    pmaddwd               m3,                 m16
4559
+    pmaddwd               m12,                m16
4560
+
+    ; rows at +3 and +4 strides: accumulate the last two taps (m17)
4561
+    movu                  m5,                 [r0 + r7]
4562
+    movu                  m14,                [r6 + r7]
4563
+    punpcklwd             m6,                 m4,                  m5
4564
+    punpcklwd             m15,                m13,                 m14
4565
+    pmaddwd               m6,                 m17
4566
+    pmaddwd               m15,                m17
4567
+    paddd                 m0,                 m6
4568
+    paddd                 m9,                 m15
4569
+    punpckhwd             m4,                 m5
4570
+    punpckhwd             m13,                m14
4571
+    pmaddwd               m4,                 m17
4572
+    pmaddwd               m13,                m17
4573
+    paddd                 m1,                 m4
4574
+    paddd                 m10,                m13
4575
+
4576
+    movu                  m4,                 [r0 + 4 * r1]
4577
+    movu                  m13,                [r6 + 4 * r1]
4578
+    punpcklwd             m6,                 m5,                  m4
4579
+    punpcklwd             m15,                m14,                 m13
4580
+    pmaddwd               m6,                 m17
4581
+    pmaddwd               m15,                m17
4582
+    paddd                 m2,                 m6
4583
+    paddd                 m11,                m15
4584
+    punpckhwd             m5,                 m4
4585
+    punpckhwd             m14,                m13
4586
+    pmaddwd               m5,                 m17
4587
+    pmaddwd               m14,                m17
4588
+    paddd                 m3,                 m5
4589
+    paddd                 m12,                m14
4590
+%ifidn %1,sp
+    ; sp: round, downshift, pack to words and clip to the pixel range
4591
+    paddd                 m0,                 m7
4592
+    paddd                 m1,                 m7
4593
+    paddd                 m2,                 m7
4594
+    paddd                 m3,                 m7
4595
+    paddd                 m9,                 m7
4596
+    paddd                 m10,                m7
4597
+    paddd                 m11,                m7
4598
+    paddd                 m12,                m7
4599
+
4600
+    psrad                 m0,                 INTERP_SHIFT_SP
4601
+    psrad                 m1,                 INTERP_SHIFT_SP
4602
+    psrad                 m2,                 INTERP_SHIFT_SP
4603
+    psrad                 m3,                 INTERP_SHIFT_SP
4604
+    psrad                 m9,                 INTERP_SHIFT_SP
4605
+    psrad                 m10,                INTERP_SHIFT_SP
4606
+    psrad                 m11,                INTERP_SHIFT_SP
4607
+    psrad                 m12,                INTERP_SHIFT_SP
4608
+
4609
+    packssdw              m0,                 m1
4610
+    packssdw              m2,                 m3
4611
+    packssdw              m9,                 m10
4612
+    packssdw              m11,                m12
4613
+    CLIPW2                m0,                 m2,                m18,              m19
4614
+    CLIPW2                m9,                 m11,               m18,              m19
4615
+%else
+    ; ss: no offset/clip, just the fixed 6-bit downshift of the filter sum
4616
+    psrad                 m0,                 6
4617
+    psrad                 m1,                 6
4618
+    psrad                 m2,                 6
4619
+    psrad                 m3,                 6
4620
+    psrad                 m9,                 6
4621
+    psrad                 m10,                6
4622
+    psrad                 m11,                6
4623
+    psrad                 m12,                6
4624
+
4625
+    packssdw              m0,                 m1
4626
+    packssdw              m2,                 m3
4627
+    packssdw              m9,                 m10
4628
+    packssdw              m11,                m12
4629
+%endif
4630
+
+    ; store the four filtered 32-sample rows
4631
+    movu                  [r2],               m0
4632
+    movu                  [r2 + r3],          m2
4633
+    movu                  [r2 + 2 * r3],      m9
4634
+    movu                  [r2 + r8],          m11
4635
+%endmacro
4636
+;-----------------------------------------------------------------------------------------------------------------
4637
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
4638
+;-----------------------------------------------------------------------------------------------------------------
+; Emit interp_4tap_vert_%1_32x%2 (%1 = ss/sp, %2 = height): set up strides,
+; coefficients and clip constants, then run the 32x4 kernel height/4 times.
4639
+%macro FILTER_VER_S_CHROMA_32xN_AVX512 2
4640
+INIT_ZMM avx512
4641
+cglobal interp_4tap_vert_%1_32x%2, 5, 9, 20
+    ; strides are in 16-bit elements; convert to bytes
4642
+    add                   r1d,                r1d
4643
+    add                   r3d,                r3d
+    ; step back one row so the tap window covers rows -1..+2
4644
+    sub                   r0,                 r1
+    ; each coefficient entry is 2*mmsize (128) bytes -> scale coeffIdx
4645
+    shl                   r4d,                7
4646
+%ifdef PIC
4647
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
4648
+    mova                  m16,                [r5 + r4]
4649
+    mova                  m17,                [r5 + r4 + mmsize]
4650
+%else
4651
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
4652
+    mova                  m16,                [r5]
4653
+    mova                  m17,                [r5 + mmsize]
4654
+%endif
+    ; r7/r8 = 3*stride, used by the row kernel for the +3 row/store
4655
+    lea                   r7,                 [3 * r1]
4656
+    lea                   r8,                 [3 * r3]
4657
+%ifidn %1, sp
+    ; sp only: rounding offset, zero and max-pixel constants for clipping
4658
+    vbroadcasti32x4       m7,                 [INTERP_OFFSET_SP]
4659
+    pxor                  m18,                m18
4660
+    vbroadcasti32x8       m19,                [pw_pixel_max]
4661
+%endif
4662
+
4663
+%rep %2/4 - 1
4664
+    PROCESS_CHROMA_VERT_S_32x4_AVX512 %1
4665
+    lea                   r0,                 [r0 + 4 * r1]
4666
+    lea                   r2,                 [r2 + 4 * r3]
4667
+%endrep
+    ; final 4-row batch needs no pointer advance
4668
+    PROCESS_CHROMA_VERT_S_32x4_AVX512 %1
4669
+    RET
4670
+%endmacro
4671
+; Instantiate the 32-wide vertical chroma filters for all supported heights.
+%if ARCH_X86_64
4672
+    FILTER_VER_S_CHROMA_32xN_AVX512 ss, 8
4673
+    FILTER_VER_S_CHROMA_32xN_AVX512 ss, 16
4674
+    FILTER_VER_S_CHROMA_32xN_AVX512 ss, 24
4675
+    FILTER_VER_S_CHROMA_32xN_AVX512 ss, 32
4676
+    FILTER_VER_S_CHROMA_32xN_AVX512 ss, 48
4677
+    FILTER_VER_S_CHROMA_32xN_AVX512 ss, 64
4678
+    FILTER_VER_S_CHROMA_32xN_AVX512 sp, 8
4679
+    FILTER_VER_S_CHROMA_32xN_AVX512 sp, 16
4680
+    FILTER_VER_S_CHROMA_32xN_AVX512 sp, 24
4681
+    FILTER_VER_S_CHROMA_32xN_AVX512 sp, 32
4682
+    FILTER_VER_S_CHROMA_32xN_AVX512 sp, 48
4683
+    FILTER_VER_S_CHROMA_32xN_AVX512 sp, 64
4684
+%endif
4685
+; Vertically filter four rows of a 48-sample-wide block: the first 32 columns
+; are handled exactly like the 32x4 kernel (full zmm loads), the remaining 16
+; columns are handled with ymm loads paired across rows via vinserti32x8.
+; %1 and the expected register/stride setup are the same as for
+; PROCESS_CHROMA_VERT_S_32x4_AVX512 (caller provides r7 = 3*r1, r8 = 3*r3,
+; taps in m16/m17 and, for 'sp', m7/m18/m19).  Clobbers m0-m6, m9-m15, r6.
+%macro PROCESS_CHROMA_VERT_S_48x4_AVX512 1
4686
+    movu                  m1,                 [r0]
4687
+    lea                   r6,                 [r0 + 2 * r1]
4688
+    movu                  m10,                [r6]
4689
+    movu                  m3,                 [r0 + r1]
4690
+    movu                  m12,                [r6 + r1]
+    ; columns 0-31: interleave row pairs, apply taps 0/1 (m16)
4691
+    punpcklwd             m0,                 m1,                  m3
4692
+    punpcklwd             m9,                 m10,                 m12
4693
+    pmaddwd               m0,                 m16
4694
+    pmaddwd               m9,                 m16
4695
+    punpckhwd             m1,                 m3
4696
+    punpckhwd             m10,                m12
4697
+    pmaddwd               m1,                 m16
4698
+    pmaddwd               m10,                m16
4699
+
4700
+    movu                  m4,                 [r0 + 2 * r1]
4701
+    movu                  m13,                [r6 + 2 * r1]
4702
+    punpcklwd             m2,                 m3,                  m4
4703
+    punpcklwd             m11,                m12,                 m13
4704
+    pmaddwd               m2,                 m16
4705
+    pmaddwd               m11,                m16
4706
+    punpckhwd             m3,                 m4
4707
+    punpckhwd             m12,                m13
4708
+    pmaddwd               m3,                 m16
4709
+    pmaddwd               m12,                m16
4710
+
+    ; rows +3 and +4: accumulate taps 2/3 (m17)
4711
+    movu                  m5,                 [r0 + r7]
4712
+    movu                  m14,                [r6 + r7]
4713
+    punpcklwd             m6,                 m4,                  m5
4714
+    punpcklwd             m15,                m13,                 m14
4715
+    pmaddwd               m6,                 m17
4716
+    pmaddwd               m15,                m17
4717
+    paddd                 m0,                 m6
4718
+    paddd                 m9,                 m15
4719
+    punpckhwd             m4,                 m5
4720
+    punpckhwd             m13,                m14
4721
+    pmaddwd               m4,                 m17
4722
+    pmaddwd               m13,                m17
4723
+    paddd                 m1,                 m4
4724
+    paddd                 m10,                m13
4725
+
4726
+    movu                  m4,                 [r0 + 4 * r1]
4727
+    movu                  m13,                [r6 + 4 * r1]
4728
+    punpcklwd             m6,                 m5,                  m4
4729
+    punpcklwd             m15,                m14,                 m13
4730
+    pmaddwd               m6,                 m17
4731
+    pmaddwd               m15,                m17
4732
+    paddd                 m2,                 m6
4733
+    paddd                 m11,                m15
4734
+    punpckhwd             m5,                 m4
4735
+    punpckhwd             m14,                m13
4736
+    pmaddwd               m5,                 m17
4737
+    pmaddwd               m14,                m17
4738
+    paddd                 m3,                 m5
4739
+    paddd                 m12,                m14
4740
+
4741
+%ifidn %1,sp
+    ; sp: round, downshift, pack and clip to the pixel range
4742
+    paddd                 m0,                 m7
4743
+    paddd                 m1,                 m7
4744
+    paddd                 m2,                 m7
4745
+    paddd                 m3,                 m7
4746
+    paddd                 m9,                 m7
4747
+    paddd                 m10,                m7
4748
+    paddd                 m11,                m7
4749
+    paddd                 m12,                m7
4750
+
4751
+    psrad                 m0,                 INTERP_SHIFT_SP
4752
+    psrad                 m1,                 INTERP_SHIFT_SP
4753
+    psrad                 m2,                 INTERP_SHIFT_SP
4754
+    psrad                 m3,                 INTERP_SHIFT_SP
4755
+    psrad                 m9,                 INTERP_SHIFT_SP
4756
+    psrad                 m10,                INTERP_SHIFT_SP
4757
+    psrad                 m11,                INTERP_SHIFT_SP
4758
+    psrad                 m12,                INTERP_SHIFT_SP
4759
+
4760
+    packssdw              m0,                 m1
4761
+    packssdw              m2,                 m3
4762
+    packssdw              m9,                 m10
4763
+    packssdw              m11,                m12
4764
+    CLIPW2                m0,                 m2,               m18,                 m19
4765
+    CLIPW2                m9,                 m11,              m18,                 m19
4766
+%else
+    ; ss: fixed 6-bit downshift, keep signed residuals
4767
+    psrad                 m0,                 6
4768
+    psrad                 m1,                 6
4769
+    psrad                 m2,                 6
4770
+    psrad                 m3,                 6
4771
+    psrad                 m9,                 6
4772
+    psrad                 m10,                6
4773
+    psrad                 m11,                6
4774
+    psrad                 m12,                6
4775
+    packssdw              m0,                 m1
4776
+    packssdw              m2,                 m3
4777
+    packssdw              m9,                 m10
4778
+    packssdw              m11,                m12
4779
+%endif
4780
+
+    ; store the first 32 columns of all four rows
4781
+    movu                  [r2],               m0
4782
+    movu                  [r2 + r3],          m2
4783
+    movu                  [r2 + 2 * r3],      m9
4784
+    movu                  [r2 + r8],          m11
4785
+
+    ; columns 32-47: load 16 samples per row, pair rows 0/2, 1/3 etc. in the
+    ; two 256-bit halves of each zmm register
4786
+    movu                  ym1,                [r0 + mmsize]
4787
+    vinserti32x8          m1,                 [r6 + mmsize],       1
4788
+    movu                  ym3,                [r0 + r1 + mmsize]
4789
+    vinserti32x8          m3,                 [r6 + r1 + mmsize],  1
4790
+    punpcklwd             m0,                 m1,                  m3
4791
+    pmaddwd               m0,                 m16
4792
+    punpckhwd             m1,                 m3
4793
+    pmaddwd               m1,                 m16
4794
+
4795
+    movu                  ym4,                [r0 + 2 * r1 + mmsize]
4796
+    vinserti32x8          m4,                 [r6 + 2 * r1 + mmsize],  1
4797
+    punpcklwd             m2,                 m3,                  m4
4798
+    pmaddwd               m2,                 m16
4799
+    punpckhwd             m3,                 m4
4800
+    pmaddwd               m3,                 m16
4801
+
4802
+    movu                  ym5,                [r0 + r7 + mmsize]
4803
+    vinserti32x8          m5,                 [r6 + r7 + mmsize],  1
4804
+    punpcklwd             m6,                 m4,                  m5
4805
+    pmaddwd               m6,                 m17
4806
+    paddd                 m0,                 m6
4807
+    punpckhwd             m4,                 m5
4808
+    pmaddwd               m4,                 m17
4809
+    paddd                 m1,                 m4
4810
+
4811
+    movu                  ym4,                [r0 + 4 * r1 + mmsize]
4812
+    vinserti32x8          m4,                 [r6 + 4 * r1 + mmsize],  1
4813
+    punpcklwd             m6,                 m5,                  m4
4814
+    pmaddwd               m6,                 m17
4815
+    paddd                 m2,                 m6
4816
+    punpckhwd             m5,                 m4
4817
+    pmaddwd               m5,                 m17
4818
+    paddd                 m3,                 m5
4819
+
4820
+%ifidn %1,sp
4821
+    paddd                 m0,                 m7
4822
+    paddd                 m1,                 m7
4823
+    paddd                 m2,                 m7
4824
+    paddd                 m3,                 m7
4825
+
4826
+    psrad                 m0,                 INTERP_SHIFT_SP
4827
+    psrad                 m1,                 INTERP_SHIFT_SP
4828
+    psrad                 m2,                 INTERP_SHIFT_SP
4829
+    psrad                 m3,                 INTERP_SHIFT_SP
4830
+    packssdw              m0,                 m1
4831
+    packssdw              m2,                 m3
4832
+    CLIPW2                m0,                 m2,                m18,                 m19
4833
+%else
4834
+    psrad                 m0,                 6
4835
+    psrad                 m1,                 6
4836
+    psrad                 m2,                 6
4837
+    psrad                 m3,                 6
4838
+    packssdw              m0,                 m1
4839
+    packssdw              m2,                 m3
4840
+%endif
4841
+
+    ; store the last 16 columns: low halves are rows 0/1, high halves rows 2/3
4842
+    movu                  [r2 + mmsize],               ym0
4843
+    movu                  [r2 + r3 + mmsize],          ym2
4844
+    vextracti32x8         [r2 + 2 * r3 + mmsize],      m0,                  1
4845
+    vextracti32x8         [r2 + r8 + mmsize],          m2,                  1
4846
+%endmacro
4847
+; Emit interp_4tap_vert_%1_48x64 (%1 = ss/sp): same setup as the 32xN
+; generator, then 16 iterations of the 48x4 row kernel (64 rows total).
+%macro CHROMA_VERT_S_48x4_AVX512 1
4849
+INIT_ZMM avx512
4850
+cglobal interp_4tap_vert_%1_48x64, 5, 9, 20
+    ; element strides -> byte strides; start one row above the block
4851
+    add                   r1d,                r1d
4852
+    add                   r3d,                r3d
4853
+    sub                   r0,                 r1
+    ; coefficient entries are 128 bytes apart
4854
+    shl                   r4d,                7
4855
+%ifdef PIC
4856
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
4857
+    mova                  m16,                [r5 + r4]
4858
+    mova                  m17,                [r5 + r4 + mmsize]
4859
+%else
4860
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
4861
+    mova                  m16,                [r5]
4862
+    mova                  m17,                [r5 + mmsize]
4863
+%endif
4864
+    lea                   r7,                 [3 * r1]
4865
+    lea                   r8,                 [3 * r3]
4866
+%ifidn %1, sp
+    ; sp only: rounding offset and clip constants
4867
+    vbroadcasti32x4       m7,                 [INTERP_OFFSET_SP]
4868
+    pxor                  m18,                m18
4869
+    vbroadcasti32x8       m19,                [pw_pixel_max]
4870
+%endif
+    ; 15 batches with pointer advance + 1 final batch = 64 rows
4871
+%rep 15
4872
+    PROCESS_CHROMA_VERT_S_48x4_AVX512 %1
4873
+    lea                   r0,                 [r0 + 4 * r1]
4874
+    lea                   r2,                 [r2 + 4 * r3]
4875
+%endrep
4876
+    PROCESS_CHROMA_VERT_S_48x4_AVX512 %1
4877
+    RET
4878
+%endmacro
4878
+
4879
+; Instantiate the 48x64 vertical chroma filters (sp and ss); 64-bit only.
+%if ARCH_X86_64
4880
+    CHROMA_VERT_S_48x4_AVX512 sp
4881
+    CHROMA_VERT_S_48x4_AVX512 ss
4882
+%endif
4883
+; Vertically filter TWO rows of a 64-sample-wide block (two zmm registers per
+; row).  %1 = sp/ss as in the other kernels.  Coefficients live in m15 (taps
+; 0/1) and m16 (taps 2/3) here, with sp constants in m7/m17/m18 (caller sets
+; these up).  NOTE: unlike the 32/48-wide kernels this macro advances r0 by
+; two rows itself, so the caller's loop only steps r2.
+; Clobbers m0-m6, m8-m14.
+%macro PROCESS_CHROMA_VERT_S_64x2_AVX512 1
4884
+    movu                 m1,                  [r0]
4885
+    movu                 m3,                  [r0 + r1]
+    ; columns 0-31: taps 0/1 for both output rows
4886
+    punpcklwd            m0,                  m1,                     m3
4887
+    pmaddwd              m0,                  m15
4888
+    punpckhwd            m1,                  m3
4889
+    pmaddwd              m1,                  m15
4890
+
+    ; columns 32-63: same, in the m8-m14 register group
4891
+    movu                 m9,                  [r0 + mmsize]
4892
+    movu                 m11,                 [r0 + r1 + mmsize]
4893
+    punpcklwd            m8,                  m9,                     m11
4894
+    pmaddwd              m8,                  m15
4895
+    punpckhwd            m9,                  m11
4896
+    pmaddwd              m9,                  m15
4897
+    movu                 m4,                  [r0 + 2 * r1]
4898
+    punpcklwd            m2,                  m3,                     m4
4899
+    pmaddwd              m2,                  m15
4900
+    punpckhwd            m3,                  m4
4901
+    pmaddwd              m3,                  m15
4902
+    movu                 m12,                 [r0 + 2 * r1 + mmsize]
4903
+    punpcklwd            m10,                 m11,                    m12
4904
+    pmaddwd              m10,                 m15
4905
+    punpckhwd            m11,                 m12
4906
+    pmaddwd              m11,                 m15
4907
+
+    ; advance src two rows; remaining taps are read relative to the new r0
4908
+    lea                  r0,                  [r0 + 2 * r1]
4909
+    movu                 m5,                  [r0 + r1]
4910
+    punpcklwd            m6,                  m4,                     m5
4911
+    pmaddwd              m6,                  m16
4912
+    paddd                m0,                  m6
4913
+    punpckhwd            m4,                  m5
4914
+    pmaddwd              m4,                  m16
4915
+    paddd                m1,                  m4
4916
+
4917
+    movu                 m13,                 [r0 + r1 + mmsize]
4918
+    punpcklwd            m14,                 m12,                    m13
4919
+    pmaddwd              m14,                 m16
4920
+    paddd                m8,                  m14
4921
+    punpckhwd            m12,                 m13
4922
+    pmaddwd              m12,                 m16
4923
+    paddd                m9,                  m12
4924
+
4925
+    movu                 m4,                  [r0 + 2 * r1]
4926
+    punpcklwd            m6,                  m5,                     m4
4927
+    pmaddwd              m6,                  m16
4928
+    paddd                m2,                  m6
4929
+    punpckhwd            m5,                  m4
4930
+    pmaddwd              m5,                  m16
4931
+    paddd                m3,                  m5
4932
+
4933
+    movu                 m12,                 [r0 + 2 * r1 + mmsize]
4934
+    punpcklwd            m14,                 m13,                    m12
4935
+    pmaddwd              m14,                 m16
4936
+    paddd                m10,                 m14
4937
+    punpckhwd            m13,                 m12
4938
+    pmaddwd              m13,                 m16
4939
+    paddd                m11,                 m13
4940
+
4941
+%ifidn %1,sp
+    ; sp: round, downshift, pack and clip to the pixel range
4942
+    paddd                m0,                  m7
4943
+    paddd                m1,                  m7
4944
+    paddd                m2,                  m7
4945
+    paddd                m3,                  m7
4946
+    paddd                m8,                  m7
4947
+    paddd                m9,                  m7
4948
+    paddd                m10,                 m7
4949
+    paddd                m11,                 m7
4950
+
4951
+    psrad                m0,                  INTERP_SHIFT_SP
4952
+    psrad                m1,                  INTERP_SHIFT_SP
4953
+    psrad                m2,                  INTERP_SHIFT_SP
4954
+    psrad                m3,                  INTERP_SHIFT_SP
4955
+    psrad                m8,                  INTERP_SHIFT_SP
4956
+    psrad                m9,                  INTERP_SHIFT_SP
4957
+    psrad                m10,                 INTERP_SHIFT_SP
4958
+    psrad                m11,                 INTERP_SHIFT_SP
4959
+
4960
+    packssdw             m0,                  m1
4961
+    packssdw             m2,                  m3
4962
+    packssdw             m8,                  m9
4963
+    packssdw             m10,                 m11
4964
+    CLIPW2               m0,                  m2,                   m17,              m18
4965
+    CLIPW2               m8,                  m10,                  m17,              m18
4966
+%else
+    ; ss: fixed 6-bit downshift only
4967
+    psrad                m0,                  6
4968
+    psrad                m1,                  6
4969
+    psrad                m2,                  6
4970
+    psrad                m3,                  6
4971
+    psrad                m8,                  6
4972
+    psrad                m9,                  6
4973
+    psrad                m10,                 6
4974
+    psrad                m11,                 6
4975
+
4976
+    packssdw             m0,                  m1
4977
+    packssdw             m2,                  m3
4978
+    packssdw             m8,                  m9
4979
+    packssdw             m10,                 m11
4980
+%endif
4981
+
+    ; store both 64-sample rows (two zmm stores per row)
4982
+    movu                 [r2],                m0
4983
+    movu                 [r2 + r3],           m2
4984
+    movu                 [r2 + mmsize],       m8
4985
+    movu                 [r2 + r3 + mmsize],  m10
4986
+%endmacro
4987
+;-----------------------------------------------------------------------------------------------------------------
4988
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
4989
+;-----------------------------------------------------------------------------------------------------------------
+; Emit interp_4tap_vert_%1_64x%2 (%1 = ss/sp, %2 = height).  The 64x2 kernel
+; advances r0 internally, so the loop below only steps the destination.
4990
+%macro FILTER_VER_S_CHROMA_64xN_AVX512 2
4991
+INIT_ZMM avx512
4992
+cglobal interp_4tap_vert_%1_64x%2, 5, 7, 19
+    ; element strides -> byte strides; start one row above the block
4993
+    add                   r1d,                r1d
4994
+    add                   r3d,                r3d
4995
+    sub                   r0,                 r1
+    ; coefficient entries are 128 bytes apart
4996
+    shl                   r4d,                7
4997
+%ifdef PIC
4998
+    lea                   r5,                 [tab_ChromaCoeffV_avx512]
4999
+    mova                  m15,                [r5 + r4]
5000
+    mova                  m16,                [r5 + r4 + mmsize]
5001
+%else
5002
+    lea                   r5,                 [tab_ChromaCoeffV_avx512 + r4]
5003
+    mova                  m15,                [r5]
5004
+    mova                  m16,                [r5 + mmsize]
5005
+%endif
5006
+%ifidn %1, sp
+    ; sp only: rounding offset and clip constants
5007
+    vbroadcasti32x4       m7,                 [INTERP_OFFSET_SP]
5008
+    pxor                  m17,                m17
5009
+    vbroadcasti32x8       m18,                [pw_pixel_max]
5010
+%endif
5011
+%rep %2/2 - 1
5012
+    PROCESS_CHROMA_VERT_S_64x2_AVX512 %1
5013
+    lea                   r2,                 [r2 + 2 * r3]
5014
+%endrep
5015
+    PROCESS_CHROMA_VERT_S_64x2_AVX512 %1
5016
+    RET
5017
+%endmacro
5018
+
5019
+; Instantiate the 64-wide vertical chroma filters for all supported heights.
+%if ARCH_X86_64
5020
+    FILTER_VER_S_CHROMA_64xN_AVX512 ss, 16
5021
+    FILTER_VER_S_CHROMA_64xN_AVX512 ss, 32
5022
+    FILTER_VER_S_CHROMA_64xN_AVX512 ss, 48
5023
+    FILTER_VER_S_CHROMA_64xN_AVX512 ss, 64
5024
+    FILTER_VER_S_CHROMA_64xN_AVX512 sp, 16
5025
+    FILTER_VER_S_CHROMA_64xN_AVX512 sp, 32
5026
+    FILTER_VER_S_CHROMA_64xN_AVX512 sp, 48
5027
+    FILTER_VER_S_CHROMA_64xN_AVX512 sp, 64
5028
+%endif
5029
+;-------------------------------------------------------------------------------------------------------------
5030
+; avx512 chroma_vsp and chroma_vss code end
5031
+;-------------------------------------------------------------------------------------------------------------
5032
+;-------------------------------------------------------------------------------------------------------------
5033
+;ipfilter_chroma_avx512 code end
5034
+;-------------------------------------------------------------------------------------------------------------
5035
+;-------------------------------------------------------------------------------------------------------------
5036
+;ipfilter_luma_avx512 code start
5037
+;-------------------------------------------------------------------------------------------------------------
5038
+; Horizontally filter an 8x4 tile for the luma pixel-to-pixel (PP) path:
+; four rows are packed one per 128-bit lane of the zmm registers, shuffled
+; into tap order (m4/m5), multiply-accumulated with the coefficients
+; (m0-m3), rounded/shifted and clipped to [m7, m8], then stored row by row.
+; NOTE(review): r6/r7 appear to be 3*srcStride/3*dstStride prepared by the
+; caller (not visible here) - confirm against the generator macro.
+; m10-m16 are used as temporaries.
+%macro PROCESS_IPFILTER_LUMA_PP_8x4_AVX512 0
5039
+    ; register map
5040
+    ; m0 , m1, m2, m3 - interpolate coeff
5041
+    ; m4 , m5  load shuffle order table
5042
+    ; m6 - pd_32
5043
+    ; m7 - zero
5044
+    ; m8 - pw_pixel_max
5045
+    ; m9 - store shuffle order table
5046
+
+    ; gather the 8-tap window of each of the four rows into lanes 0..3
5047
+    movu            xm10,      [r0]
5048
+    movu            xm11,      [r0 + 8]
5049
+    movu            xm12,      [r0 + 16]
5050
+
5051
+    vinserti32x4     m10,      [r0 + r1],      1
5052
+    vinserti32x4     m11,      [r0 + r1 + 8],  1
5053
+    vinserti32x4     m12,      [r0 + r1 + 16], 1
5054
+
5055
+    vinserti32x4     m10,      [r0 + 2 * r1],           2
5056
+    vinserti32x4     m11,      [r0 + 2 * r1 + 8],       2
5057
+    vinserti32x4     m12,      [r0 + 2 * r1 + 16],      2
5058
+
5059
+    vinserti32x4     m10,      [r0 + r6],      3
5060
+    vinserti32x4     m11,      [r0 + r6 + 8],  3
5061
+    vinserti32x4     m12,      [r0 + r6 + 16], 3
5062
+
+    ; reorder samples into the two tap layouts expected by pmaddwd
5063
+    pshufb          m13,       m10,        m5
5064
+    pshufb          m10,       m4
5065
+    pshufb          m14,       m11,        m5
5066
+    pshufb          m11,       m4
5067
+    pshufb          m15,       m12,        m5
5068
+    pshufb          m12,       m4
5069
+
+    ; accumulate all four coefficient pairs, round (m6) and downshift
5070
+    pmaddwd         m10,       m0
5071
+    pmaddwd         m13,       m1
5072
+    paddd           m10,       m13
5073
+    pmaddwd         m13,       m14,       m3
5074
+    pmaddwd         m16,       m11,       m2
5075
+    paddd           m13,       m16
5076
+    paddd           m10,       m13
5077
+    paddd           m10,       m6
5078
+    psrad           m10,       INTERP_SHIFT_PP
5079
+
5080
+    pmaddwd         m11,       m0
5081
+    pmaddwd         m14,       m1
5082
+    paddd           m11,       m14
5083
+    pmaddwd         m15,       m3
5084
+    pmaddwd         m12,       m2
5085
+    paddd           m12,       m15
5086
+    paddd           m11,       m12
5087
+    paddd           m11,       m6
5088
+    psrad           m11,       INTERP_SHIFT_PP
5089
+
+    ; pack, clip to pixel range, restore sample order and store one row
+    ; per 128-bit lane
5090
+    packusdw        m10,       m11
5091
+    CLIPW           m10,       m7,         m8
5092
+    pshufb          m10,       m9
5093
+    movu            [r2],      xm10
5094
+    vextracti32x4   [r2 + r3],     m10,        1
5095
+    vextracti32x4   [r2 + 2 * r3], m10,        2
5096
+    vextracti32x4   [r2 + r7],     m10,        3
5097
+%endmacro
5098
+
5099
+; Horizontally filter a 16x4 tile for the luma PP path: rows are processed in
+; two pairs, each pair packed into the two 256-bit halves of the zmm
+; registers.  Same coefficient/constant register contract as the 8x4 kernel.
+; NOTE(review): r6/r7 appear to be 3*srcStride/3*dstStride prepared by the
+; caller (not visible here) - confirm against the generator macro.
+; m10-m16 are used as temporaries.
+%macro PROCESS_IPFILTER_LUMA_PP_16x4_AVX512 0
5100
+    ; register map
5101
+    ; m0 , m1, m2, m3 - interpolate coeff
5102
+    ; m4 , m5  load shuffle order table
5103
+    ; m6 - pd_32
5104
+    ; m7 - zero
5105
+    ; m8 - pw_pixel_max
5106
+    ; m9 - store shuffle order table
5107
+
+    ; rows 0/1: low half = row 0, high half = row 1
5108
+    movu            ym10,      [r0]
5109
+    vinserti32x8     m10,      [r0 + r1],      1
5110
+    movu            ym11,      [r0 + 8]
5111
+    vinserti32x8     m11,      [r0 + r1 + 8],  1
5112
+    movu            ym12,      [r0 + 16]
5113
+    vinserti32x8     m12,      [r0 + r1 + 16], 1
5114
+
5115
+    pshufb          m13,       m10,        m5
5116
+    pshufb          m10,       m4
5117
+    pshufb          m14,       m11,        m5
5118
+    pshufb          m11,       m4
5119
+    pshufb          m15,       m12,        m5
5120
+    pshufb          m12,       m4
5121
+
5122
+    pmaddwd         m10,       m0
5123
+    pmaddwd         m13,       m1
5124
+    paddd           m10,       m13
5125
+    pmaddwd         m13,       m14,       m3
5126
+    pmaddwd         m16,       m11,       m2
5127
+    paddd           m13,       m16
5128
+    paddd           m10,       m13
5129
+    paddd           m10,       m6
5130
+    psrad           m10,       INTERP_SHIFT_PP
5131
+
5132
+    pmaddwd         m11,       m0
5133
+    pmaddwd         m14,       m1
5134
+    paddd           m11,       m14
5135
+    pmaddwd         m15,       m3
5136
+    pmaddwd         m12,       m2
5137
+    paddd           m12,       m15
5138
+    paddd           m11,       m12
5139
+    paddd           m11,       m6
5140
+    psrad           m11,       INTERP_SHIFT_PP
5141
+
5142
+    packusdw        m10,       m11
5143
+    CLIPW           m10,       m7,         m8
5144
+    pshufb          m10,       m9
5145
+    movu            [r2],      ym10
5146
+    vextracti32x8   [r2 + r3], m10,        1
5147
+
+    ; rows 2/3: same pipeline for the second row pair
5148
+    movu            ym10,      [r0 + 2 * r1]
5149
+    vinserti32x8     m10,      [r0 + r6],            1
5150
+    movu            ym11,      [r0 + 2 * r1 + 8]
5151
+    vinserti32x8     m11,      [r0 + r6 + 8],        1
5152
+    movu            ym12,      [r0 + 2 * r1 + 16]
5153
+    vinserti32x8     m12,      [r0 + r6 + 16],       1
5154
+
5155
+    pshufb          m13,       m10,        m5
5156
+    pshufb          m10,       m4
5157
+    pshufb          m14,       m11,        m5
5158
+    pshufb          m11,       m4
5159
+    pshufb          m15,       m12,        m5
5160
+    pshufb          m12,       m4
5161
+
5162
+    pmaddwd         m10,       m0
5163
+    pmaddwd         m13,       m1
5164
+    paddd           m10,       m13
5165
+    pmaddwd         m13,       m14,       m3
5166
+    pmaddwd         m16,       m11,       m2
5167
+    paddd           m13,       m16
5168
+    paddd           m10,       m13
5169
+    paddd           m10,       m6
5170
+    psrad           m10,       INTERP_SHIFT_PP
5171
+
5172
+    pmaddwd         m11,       m0
5173
+    pmaddwd         m14,       m1
5174
+    paddd           m11,       m14
5175
+    pmaddwd         m14,       m15,       m3
5176
+    pmaddwd         m16,       m12,       m2
5177
+    paddd           m14,       m16
5178
+    paddd           m11,       m14
5179
+    paddd           m11,       m6
5180
+    psrad           m11,       INTERP_SHIFT_PP
5181
+
5182
+    packusdw        m10,       m11
5183
+    CLIPW           m10,       m7,         m8
5184
+    pshufb          m10,       m9
5185
+    movu            [r2 + 2 * r3],        ym10
5186
+    vextracti32x8   [r2 + r7],      m10,     1
5187
+%endmacro
5188
+
5189
+; Horizontally filter a 24x4 tile for the luma PP path: delegate the first 16
+; columns to the 16x4 kernel, then handle the remaining 8 columns (offset
+; mmsize/2 = 32 bytes) with the lane-per-row scheme of the 8x4 kernel.
+; Same register contract as PROCESS_IPFILTER_LUMA_PP_16x4_AVX512;
+; m10-m16 are used as temporaries.
+%macro PROCESS_IPFILTER_LUMA_PP_24x4_AVX512 0
5190
+    ; register map
5191
+    ; m0 , m1, m2, m3 - interpolate coeff
5192
+    ; m4 , m5  load shuffle order table
5193
+    ; m6 - pd_32
5194
+    ; m7 - zero
5195
+    ; m8 - pw_pixel_max
5196
+    ; m9 - store shuffle order table
5197
+
+    ; columns 0-15
5198
+    PROCESS_IPFILTER_LUMA_PP_16x4_AVX512
5199
+
+    ; columns 16-23: one row per 128-bit lane
5200
+    movu            xm10,      [r0 + mmsize/2]
5201
+    movu            xm11,      [r0 + mmsize/2 + 8]
5202
+    movu            xm12,      [r0 + mmsize/2 + 16]
5203
+
5204
+    vinserti32x4     m10,      [r0 + r1 + mmsize/2],      1
5205
+    vinserti32x4     m11,      [r0 + r1 + mmsize/2 + 8],  1
5206
+    vinserti32x4     m12,      [r0 + r1 + mmsize/2 + 16], 1
5207
+
5208
+    vinserti32x4     m10,      [r0 + 2 * r1 + mmsize/2],           2
5209
+    vinserti32x4     m11,      [r0 + 2 * r1 + mmsize/2 + 8],       2
5210
+    vinserti32x4     m12,      [r0 + 2 * r1 + mmsize/2 + 16],      2
5211
+
5212
+    vinserti32x4     m10,      [r0 + r6 + mmsize/2],      3
5213
+    vinserti32x4     m11,      [r0 + r6 + mmsize/2 + 8],  3
5214
+    vinserti32x4     m12,      [r0 + r6 + mmsize/2 + 16], 3
5215
+
5216
+    pshufb          m13,       m10,        m5
5217
+    pshufb          m10,       m4
5218
+    pshufb          m14,       m11,        m5
5219
+    pshufb          m11,       m4
5220
+    pshufb          m15,       m12,        m5
5221
+    pshufb          m12,       m4
5222
+
5223
+    pmaddwd         m10,       m0
5224
+    pmaddwd         m13,       m1
5225
+    paddd           m10,       m13
5226
+    pmaddwd         m13,       m14,       m3
5227
+    pmaddwd         m16,       m11,       m2
5228
+    paddd           m13,       m16
5229
+    paddd           m10,       m13
5230
+    paddd           m10,       m6
5231
+    psrad           m10,       INTERP_SHIFT_PP
5232
+
5233
+    pmaddwd         m11,       m0
5234
+    pmaddwd         m14,       m1
5235
+    paddd           m11,       m14
5236
+    pmaddwd         m15,       m3
5237
+    pmaddwd         m12,       m2
5238
+    paddd           m12,       m15
5239
+    paddd           m11,       m12
5240
+    paddd           m11,       m6
5241
+    psrad           m11,       INTERP_SHIFT_PP
5242
+
+    ; pack, clip, restore order and store the 8-column remainder per row
5243
+    packusdw        m10,       m11
5244
+    CLIPW           m10,       m7,         m8
5245
+    pshufb          m10,       m9
5246
+    movu            [r2 + mmsize/2],      xm10
5247
+    vextracti32x4   [r2 + r3 + mmsize/2],     m10,        1
5248
+    vextracti32x4   [r2 + 2 * r3 + mmsize/2], m10,        2
5249
+    vextracti32x4   [r2 + r7 + mmsize/2],     m10,        3
5250
+%endmacro
5251
+
5252
%macro PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 0
    ; Horizontal 8-tap luma interpolation, pixel-to-pixel, 32 columns x 2 rows.
    ; One full zmm (32 x 16-bit pixels) of output per row; each row needs
    ; three staggered loads to cover the 8-tap window for every column.
    ; Caller-provided GPRs: r0 = src, r1 = src stride, r2 = dst, r3 = dst stride.
    ; register map
    ; m0 , m1, m2, m3 - interpolate coeff
    ; m4 , m5  load shuffle order table
    ; m6 - pd_32
    ; m7 - zero
    ; m8 - pw_pixel_max
    ; m9 - store shuffle order table

    ; row 0
    movu            m10,       [r0]
    movu            m11,       [r0 + 8]
    movu            m12,       [r0 + 16]

    ; rearrange words into the two interleavings pmaddwd needs
    pshufb          m13,       m10,        m5
    pshufb          m10,       m4
    pshufb          m14,       m11,        m5
    pshufb          m11,       m4
    pshufb          m15,       m12,        m5
    pshufb          m12,       m4

    ; accumulate coefficient-pair partial products, round (pd_32), shift
    pmaddwd         m10,       m0
    pmaddwd         m13,       m1
    paddd           m10,       m13
    pmaddwd         m13,       m14,       m3
    pmaddwd         m16,       m11,       m2
    paddd           m13,       m16
    paddd           m10,       m13
    paddd           m10,       m6
    psrad           m10,       INTERP_SHIFT_PP

    pmaddwd         m11,       m0
    pmaddwd         m14,       m1
    paddd           m11,       m14
    pmaddwd         m15,       m3
    pmaddwd         m12,       m2
    paddd           m12,       m15
    paddd           m11,       m12
    paddd           m11,       m6
    psrad           m11,       INTERP_SHIFT_PP

    ; pack, clamp to [0, pw_pixel_max], restore pixel order, store row 0
    packusdw        m10,       m11
    CLIPW           m10,       m7,         m8
    pshufb          m10,       m9
    movu            [r2],      m10

    ; row 1 (same pipeline, source advanced by one stride)
    movu            m10,       [r0 + r1]
    movu            m11,       [r0 + r1 + 8]
    movu            m12,       [r0 + r1 + 16]

    pshufb          m13,       m10,        m5
    pshufb          m10,       m4
    pshufb          m14,       m11,        m5
    pshufb          m11,       m4
    pshufb          m15,       m12,        m5
    pshufb          m12,       m4

    pmaddwd         m10,       m0
    pmaddwd         m13,       m1
    paddd           m10,       m13
    pmaddwd         m13,       m14,       m3
    pmaddwd         m16,       m11,       m2
    paddd           m13,       m16
    paddd           m10,       m13
    paddd           m10,       m6
    psrad           m10,       INTERP_SHIFT_PP

    pmaddwd         m11,       m0
    pmaddwd         m14,       m1
    paddd           m11,       m14
    pmaddwd         m14,       m15,       m3
    pmaddwd         m16,       m12,       m2
    paddd           m14,       m16
    paddd           m11,       m14
    paddd           m11,       m6
    psrad           m11,       INTERP_SHIFT_PP

    packusdw        m10,       m11
    CLIPW           m10,       m7,         m8
    pshufb          m10,       m9
    movu            [r2 + r3], m10
%endmacro
5333
+
5334
%macro PROCESS_IPFILTER_LUMA_PP_48x4_AVX512 0
    ; Horizontal 8-tap luma interpolation, pixel-to-pixel, 48 columns x 4 rows.
    ; Layout: columns 0-31 are one full zmm per row (four full-width passes),
    ; then columns 32-47 (at byte offset mmsize) are done as ymm halves packed
    ; two rows per zmm (rows 0-1, then rows 2-3).
    ; Caller-provided GPRs: r0 = src, r1 = src stride, r2 = dst,
    ; r3 = dst stride, r6 = 3*r1, r7 = 3*r3.
    ; register map
    ; m0 , m1, m2, m3 - interpolate coeff
    ; m4 , m5  load shuffle order table
    ; m6 - pd_32
    ; m7 - zero
    ; m8 - pw_pixel_max
    ; m9 - store shuffle order table

    ; row 0, columns 0-31
    movu            m10,       [r0]
    movu            m11,       [r0 + 8]
    movu            m12,       [r0 + 16]

    ; rearrange words into the two interleavings pmaddwd needs
    pshufb          m13,       m10,        m5
    pshufb          m10,       m4
    pshufb          m14,       m11,        m5
    pshufb          m11,       m4
    pshufb          m15,       m12,        m5
    pshufb          m12,       m4

    ; accumulate coefficient-pair partial products, round (pd_32), shift
    pmaddwd         m10,       m0
    pmaddwd         m13,       m1
    paddd           m10,       m13
    pmaddwd         m13,       m14,       m3
    pmaddwd         m16,       m11,       m2
    paddd           m13,       m16
    paddd           m10,       m13
    paddd           m10,       m6
    psrad           m10,       INTERP_SHIFT_PP

    pmaddwd         m11,       m0
    pmaddwd         m14,       m1
    paddd           m11,       m14
    pmaddwd         m15,       m3
    pmaddwd         m12,       m2
    paddd           m12,       m15
    paddd           m11,       m12
    paddd           m11,       m6
    psrad           m11,       INTERP_SHIFT_PP

    ; pack, clamp to [0, pw_pixel_max], restore pixel order, store
    packusdw        m10,       m11
    CLIPW           m10,       m7,         m8
    pshufb          m10,       m9
    movu            [r2],      m10

    ; row 1, columns 0-31
    movu            m10,       [r0 + r1]
    movu            m11,       [r0 + r1 + 8]
    movu            m12,       [r0 + r1 + 16]

    pshufb          m13,       m10,        m5
    pshufb          m10,       m4
    pshufb          m14,       m11,        m5
    pshufb          m11,       m4
    pshufb          m15,       m12,        m5
    pshufb          m12,       m4

    pmaddwd         m10,       m0
    pmaddwd         m13,       m1
    paddd           m10,       m13
    pmaddwd         m13,       m14,       m3
    pmaddwd         m16,       m11,       m2
    paddd           m13,       m16
    paddd           m10,       m13
    paddd           m10,       m6
    psrad           m10,       INTERP_SHIFT_PP

    pmaddwd         m11,       m0
    pmaddwd         m14,       m1
    paddd           m11,       m14
    pmaddwd         m14,       m15,       m3
    pmaddwd         m16,       m12,       m2
    paddd           m14,       m16
    paddd           m11,       m14
    paddd           m11,       m6
    psrad           m11,       INTERP_SHIFT_PP

    packusdw        m10,       m11
    CLIPW           m10,       m7,         m8
    pshufb          m10,       m9
    movu            [r2 + r3], m10

    ; row 2, columns 0-31
    movu            m10,       [r0 + 2 * r1]
    movu            m11,       [r0 + 2 * r1 + 8]
    movu            m12,       [r0 + 2 * r1 + 16]

    pshufb          m13,       m10,        m5
    pshufb          m10,       m4
    pshufb          m14,       m11,        m5
    pshufb          m11,       m4
    pshufb          m15,       m12,        m5
    pshufb          m12,       m4

    pmaddwd         m10,       m0
    pmaddwd         m13,       m1
    paddd           m10,       m13
    pmaddwd         m13,       m14,       m3
    pmaddwd         m16,       m11,       m2
    paddd           m13,       m16
    paddd           m10,       m13
    paddd           m10,       m6
    psrad           m10,       INTERP_SHIFT_PP

    pmaddwd         m11,       m0
    pmaddwd         m14,       m1
    paddd           m11,       m14
    pmaddwd         m15,       m3
    pmaddwd         m12,       m2
    paddd           m12,       m15
    paddd           m11,       m12
    paddd           m11,       m6
    psrad           m11,       INTERP_SHIFT_PP

    packusdw        m10,       m11
    CLIPW           m10,       m7,         m8
    pshufb          m10,       m9
    movu            [r2 + 2 * r3],         m10

    ; row 3, columns 0-31
    movu            m10,       [r0 + r6]
    movu            m11,       [r0 + r6 + 8]
    movu            m12,       [r0 + r6 + 16]

    pshufb          m13,       m10,        m5
    pshufb          m10,       m4
    pshufb          m14,       m11,        m5
    pshufb          m11,       m4
    pshufb          m15,       m12,        m5
    pshufb          m12,       m4

    pmaddwd         m10,       m0
    pmaddwd         m13,       m1
    paddd           m10,       m13
    pmaddwd         m13,       m14,       m3
    pmaddwd         m16,       m11,       m2
    paddd           m13,       m16
    paddd           m10,       m13
    paddd           m10,       m6
    psrad           m10,       INTERP_SHIFT_PP

    pmaddwd         m11,       m0
    pmaddwd         m14,       m1
    paddd           m11,       m14
    pmaddwd         m14,       m15,       m3
    pmaddwd         m16,       m12,       m2
    paddd           m14,       m16
    paddd           m11,       m14
    paddd           m11,       m6
    psrad           m11,       INTERP_SHIFT_PP

    packusdw        m10,       m11
    CLIPW           m10,       m7,         m8
    pshufb          m10,       m9
    movu            [r2 + r7], m10

    ; rows 0-1, columns 32-47: ymm half per row, packed into one zmm
    movu            ym10,      [r0 + mmsize]
    vinserti32x8     m10,      [r0 + r1 + mmsize],      1
    movu            ym11,      [r0 + mmsize + 8]
    vinserti32x8     m11,      [r0 + r1 + mmsize + 8],  1
    movu            ym12,      [r0 + mmsize + 16]
    vinserti32x8     m12,      [r0 + r1 + mmsize + 16], 1

    pshufb          m13,       m10,        m5
    pshufb          m10,       m4
    pshufb          m14,       m11,        m5
    pshufb          m11,       m4
    pshufb          m15,       m12,        m5
    pshufb          m12,       m4

    pmaddwd         m10,       m0
    pmaddwd         m13,       m1
    paddd           m10,       m13
    pmaddwd         m13,       m14,       m3
    pmaddwd         m16,       m11,       m2
    paddd           m13,       m16
    paddd           m10,       m13
    paddd           m10,       m6
    psrad           m10,       INTERP_SHIFT_PP

    pmaddwd         m11,       m0
    pmaddwd         m14,       m1
    paddd           m11,       m14
    pmaddwd         m15,       m3
    pmaddwd         m12,       m2
    paddd           m12,       m15
    paddd           m11,       m12
    paddd           m11,       m6
    psrad           m11,       INTERP_SHIFT_PP

    packusdw        m10,       m11
    CLIPW           m10,       m7,         m8
    pshufb          m10,       m9
    movu            [r2 + mmsize],      ym10
    vextracti32x8   [r2 + r3 + mmsize], m10,        1

    ; rows 2-3, columns 32-47
    movu            ym10,      [r0 + 2 * r1 + mmsize]
    vinserti32x8     m10,      [r0 + r6 + mmsize],            1
    movu            ym11,      [r0 + 2 * r1 + mmsize + 8]
    vinserti32x8     m11,      [r0 + r6 + mmsize + 8],        1
    movu            ym12,      [r0 + 2 * r1 + mmsize + 16]
    vinserti32x8     m12,      [r0 + r6 + mmsize + 16],       1

    pshufb          m13,       m10,        m5
    pshufb          m10,       m4
    pshufb          m14,       m11,        m5
    pshufb          m11,       m4
    pshufb          m15,       m12,        m5
    pshufb          m12,       m4

    pmaddwd         m10,       m0
    pmaddwd         m13,       m1
    paddd           m10,       m13
    pmaddwd         m13,       m14,       m3
    pmaddwd         m16,       m11,       m2
    paddd           m13,       m16
    paddd           m10,       m13
    paddd           m10,       m6
    psrad           m10,       INTERP_SHIFT_PP

    pmaddwd         m11,       m0
    pmaddwd         m14,       m1
    paddd           m11,       m14
    pmaddwd         m14,       m15,       m3
    pmaddwd         m16,       m12,       m2
    paddd           m14,       m16
    paddd           m11,       m14
    paddd           m11,       m6
    psrad           m11,       INTERP_SHIFT_PP

    packusdw        m10,       m11
    CLIPW           m10,       m7,         m8
    pshufb          m10,       m9
    movu            [r2 + 2 * r3 + mmsize],        ym10
    vextracti32x8   [r2 + r7 + mmsize],      m10,     1
%endmacro
5567
+
5568
%macro PROCESS_IPFILTER_LUMA_PP_64x2_AVX512 0
    ; Horizontal 8-tap luma interpolation, pixel-to-pixel, 64 columns x 2 rows.
    ; Each row is two full-zmm passes: columns 0-31, then columns 32-63
    ; (at byte offset mmsize). Same compute pipeline in all four passes.
    ; Caller-provided GPRs: r0 = src, r1 = src stride, r2 = dst, r3 = dst stride.
    ; register map
    ; m0 , m1, m2, m3 - interpolate coeff
    ; m4 , m5  load shuffle order table
    ; m6 - pd_32
    ; m7 - zero
    ; m8 - pw_pixel_max
    ; m9 - store shuffle order table

    ; row 0, columns 0-31
    movu            m10,       [r0]
    movu            m11,       [r0 + 8]
    movu            m12,       [r0 + 16]

    ; rearrange words into the two interleavings pmaddwd needs
    pshufb          m13,       m10,        m5
    pshufb          m10,       m4
    pshufb          m14,       m11,        m5
    pshufb          m11,       m4
    pshufb          m15,       m12,        m5
    pshufb          m12,       m4

    ; accumulate coefficient-pair partial products, round (pd_32), shift
    pmaddwd         m10,       m0
    pmaddwd         m13,       m1
    paddd           m10,       m13
    pmaddwd         m13,       m14,       m3
    pmaddwd         m16,       m11,       m2
    paddd           m13,       m16
    paddd           m10,       m13
    paddd           m10,       m6
    psrad           m10,       INTERP_SHIFT_PP

    pmaddwd         m11,       m0
    pmaddwd         m14,       m1
    paddd           m11,       m14
    pmaddwd         m15,       m3
    pmaddwd         m12,       m2
    paddd           m12,       m15
    paddd           m11,       m12
    paddd           m11,       m6
    psrad           m11,       INTERP_SHIFT_PP

    ; pack, clamp to [0, pw_pixel_max], restore pixel order, store
    packusdw        m10,       m11
    CLIPW           m10,       m7,         m8
    pshufb          m10,       m9
    movu            [r2],      m10

    ; row 0, columns 32-63
    movu            m10,       [r0 + mmsize]
    movu            m11,       [r0 + mmsize + 8]
    movu            m12,       [r0 + mmsize + 16]

    pshufb          m13,       m10,        m5
    pshufb          m10,       m4
    pshufb          m14,       m11,        m5
    pshufb          m11,       m4
    pshufb          m15,       m12,        m5
    pshufb          m12,       m4

    pmaddwd         m10,       m0
    pmaddwd         m13,       m1
    paddd           m10,       m13
    pmaddwd         m13,       m14,       m3
    pmaddwd         m16,       m11,       m2
    paddd           m13,       m16
    paddd           m10,       m13
    paddd           m10,       m6
    psrad           m10,       INTERP_SHIFT_PP

    pmaddwd         m11,       m0
    pmaddwd         m14,       m1
    paddd           m11,       m14
    pmaddwd         m15,       m3
    pmaddwd         m12,       m2
    paddd           m12,       m15
    paddd           m11,       m12
    paddd           m11,       m6
    psrad           m11,       INTERP_SHIFT_PP

    packusdw        m10,       m11
    CLIPW           m10,       m7,         m8
    pshufb          m10,       m9
    movu            [r2 + mmsize],         m10

    ; row 1, columns 0-31
    movu            m10,       [r0 + r1]
    movu            m11,       [r0 + r1 + 8]
    movu            m12,       [r0 + r1 + 16]

    pshufb          m13,       m10,        m5
    pshufb          m10,       m4
    pshufb          m14,       m11,        m5
    pshufb          m11,       m4
    pshufb          m15,       m12,        m5
    pshufb          m12,       m4

    pmaddwd         m10,       m0
    pmaddwd         m13,       m1
    paddd           m10,       m13
    pmaddwd         m13,       m14,       m3
    pmaddwd         m16,       m11,       m2
    paddd           m13,       m16
    paddd           m10,       m13
    paddd           m10,       m6
    psrad           m10,       INTERP_SHIFT_PP

    pmaddwd         m11,       m0
    pmaddwd         m14,       m1
    paddd           m11,       m14
    pmaddwd         m14,       m15,       m3
    pmaddwd         m16,       m12,       m2
    paddd           m14,       m16
    paddd           m11,       m14
    paddd           m11,       m6
    psrad           m11,       INTERP_SHIFT_PP

    packusdw        m10,       m11
    CLIPW           m10,       m7,         m8
    pshufb          m10,       m9
    movu            [r2 + r3], m10

    ; row 1, columns 32-63
    movu            m10,       [r0 + r1 + mmsize]
    movu            m11,       [r0 + r1 + mmsize + 8]
    movu            m12,       [r0 + r1 + mmsize + 16]

    pshufb          m13,       m10,        m5
    pshufb          m10,       m4
    pshufb          m14,       m11,        m5
    pshufb          m11,       m4
    pshufb          m15,       m12,        m5
    pshufb          m12,       m4

    pmaddwd         m10,       m0
    pmaddwd         m13,       m1
    paddd           m10,       m13
    pmaddwd         m13,       m14,       m3
    pmaddwd         m16,       m11,       m2
    paddd           m13,       m16
    paddd           m10,       m13
    paddd           m10,       m6
    psrad           m10,       INTERP_SHIFT_PP

    pmaddwd         m11,       m0
    pmaddwd         m14,       m1
    paddd           m11,       m14
    pmaddwd         m14,       m15,       m3
    pmaddwd         m16,       m12,       m2
    paddd           m14,       m16
    paddd           m11,       m14
    paddd           m11,       m6
    psrad           m11,       INTERP_SHIFT_PP

    packusdw        m10,       m11
    CLIPW           m10,       m7,         m8
    pshufb          m10,       m9
    movu            [r2 + r3 + mmsize],    m10
%endmacro
5721
+
5722
%macro IPFILTER_LUMA_AVX512_8xN 1
; interp_8tap_horiz_pp_8x%1 - horizontal 8-tap luma filter, pp variant,
; 8-pixel-wide blocks, height %1 (AVX-512, 16-bit pixel path: strides are
; doubled below because each pixel is 2 bytes).
; Operand roles per the code below: r0 is read (src), r2 is written (dst),
; r1/r3 are the respective strides, r4m selects the coefficient row.
INIT_ZMM avx512
cglobal interp_8tap_horiz_pp_8x%1, 5, 8, 17
    add              r1d,        r1d
    add              r3d,        r3d
    sub              r0,         6              ; back up 3 pixels (6 bytes) for the left filter taps
    mov              r4d,        r4m
    shl              r4d,        4              ; coeffIdx * 16 bytes = one 8-word row of tab_LumaCoeff

%ifdef PIC
    ; PIC build: materialize the table address before indexing
    lea              r5,         [tab_LumaCoeff]
    vpbroadcastd     m0,         [r5 + r4]
    vpbroadcastd     m1,         [r5 + r4 + 4]
    vpbroadcastd     m2,         [r5 + r4 + 8]
    vpbroadcastd     m3,         [r5 + r4 + 12]
%else
    vpbroadcastd     m0,         [tab_LumaCoeff + r4]
    vpbroadcastd     m1,         [tab_LumaCoeff + r4 + 4]
    vpbroadcastd     m2,         [tab_LumaCoeff + r4 + 8]
    vpbroadcastd     m3,         [tab_LumaCoeff + r4 + 12]
%endif
    ; shared constants expected by the PROCESS_* macros (see their register maps)
    vbroadcasti32x8  m4,         [interp8_hpp_shuf1_load_avx512]
    vbroadcasti32x8  m5,         [interp8_hpp_shuf2_load_avx512]
    vbroadcasti32x8  m6,         [pd_32]
    pxor             m7,         m7
    vbroadcasti32x8  m8,         [pw_pixel_max]
    vbroadcasti32x8  m9,         [interp8_hpp_shuf1_store_avx512]
    lea              r6,         [3 * r1]
    lea              r7,         [3 * r3]

    ; %1/4 four-row groups; the last one omits the pointer advance
%rep %1/4 - 1
    PROCESS_IPFILTER_LUMA_PP_8x4_AVX512
    lea              r0,         [r0 + 4 * r1]
    lea              r2,         [r2 + 4 * r3]
%endrep
    PROCESS_IPFILTER_LUMA_PP_8x4_AVX512
    RET
%endmacro
5760
+
5761
%if ARCH_X86_64
    ; instantiate the 8-wide luma pp kernels for the supported heights
    ; (x86-64 only: the kernels use 17 SIMD and 8 GP registers)
    IPFILTER_LUMA_AVX512_8xN 4
    IPFILTER_LUMA_AVX512_8xN 8
    IPFILTER_LUMA_AVX512_8xN 16
    IPFILTER_LUMA_AVX512_8xN 32
%endif
5767
+
5768
%macro IPFILTER_LUMA_AVX512_16xN 1
; interp_8tap_horiz_pp_16x%1 - horizontal 8-tap luma filter, pp variant,
; 16-pixel-wide blocks, height %1. Same prologue as the 8xN kernel; only
; the inner row macro differs.
; Operand roles per the code below: r0 = src (read), r2 = dst (written),
; r1/r3 strides (doubled for 2-byte pixels), r4m = coefficient row index.
INIT_ZMM avx512
cglobal interp_8tap_horiz_pp_16x%1, 5,8,17
    add              r1d,        r1d
    add              r3d,        r3d
    sub              r0,         6              ; back up 3 pixels (6 bytes) for the left filter taps
    mov              r4d,        r4m
    shl              r4d,        4              ; coeffIdx * 16 bytes = one 8-word row of tab_LumaCoeff

%ifdef PIC
    ; PIC build: materialize the table address before indexing
    lea              r5,         [tab_LumaCoeff]
    vpbroadcastd     m0,         [r5 + r4]
    vpbroadcastd     m1,         [r5 + r4 + 4]
    vpbroadcastd     m2,         [r5 + r4 + 8]
    vpbroadcastd     m3,         [r5 + r4 + 12]
%else
    vpbroadcastd     m0,         [tab_LumaCoeff + r4]
    vpbroadcastd     m1,         [tab_LumaCoeff + r4 + 4]
    vpbroadcastd     m2,         [tab_LumaCoeff + r4 + 8]
    vpbroadcastd     m3,         [tab_LumaCoeff + r4 + 12]
%endif
    ; shared constants expected by the PROCESS_* macros (see their register maps)
    vbroadcasti32x8  m4,         [interp8_hpp_shuf1_load_avx512]
    vbroadcasti32x8  m5,         [interp8_hpp_shuf2_load_avx512]
    vbroadcasti32x8  m6,         [pd_32]
    pxor             m7,         m7
    vbroadcasti32x8  m8,         [pw_pixel_max]
    vbroadcasti32x8  m9,         [interp8_hpp_shuf1_store_avx512]
    lea              r6,         [3 * r1]
    lea              r7,         [3 * r3]

    ; %1/4 four-row groups; the last one omits the pointer advance
%rep %1/4 - 1
    PROCESS_IPFILTER_LUMA_PP_16x4_AVX512
    lea              r0,         [r0 + 4 * r1]
    lea              r2,         [r2 + 4 * r3]
%endrep
    PROCESS_IPFILTER_LUMA_PP_16x4_AVX512
    RET
%endmacro
5806
+
5807
%if ARCH_X86_64
; instantiate the 16-wide luma pp kernels for the supported heights (x86-64 only)
IPFILTER_LUMA_AVX512_16xN 4
IPFILTER_LUMA_AVX512_16xN 8
IPFILTER_LUMA_AVX512_16xN 12
IPFILTER_LUMA_AVX512_16xN 16
IPFILTER_LUMA_AVX512_16xN 32
IPFILTER_LUMA_AVX512_16xN 64
%endif
5815
+
5816
%if ARCH_X86_64
INIT_ZMM avx512
; interp_8tap_horiz_pp_24x32 - horizontal 8-tap luma filter, pp variant, for
; the single 24x32 block size (8 groups of 4 rows). Same prologue as the
; width-macro kernels above; r0 = src (read), r2 = dst (written), r1/r3
; strides (doubled for 2-byte pixels), r4m = coefficient row index.
cglobal interp_8tap_horiz_pp_24x32, 5, 8, 17
    add              r1d,        r1d
    add              r3d,        r3d
    sub              r0,         6              ; back up 3 pixels (6 bytes) for the left filter taps
    mov              r4d,        r4m
    shl              r4d,        4              ; coeffIdx * 16 bytes = one 8-word row of tab_LumaCoeff

%ifdef PIC
    ; PIC build: materialize the table address before indexing
    lea              r5,         [tab_LumaCoeff]
    vpbroadcastd     m0,         [r5 + r4]
    vpbroadcastd     m1,         [r5 + r4 + 4]
    vpbroadcastd     m2,         [r5 + r4 + 8]
    vpbroadcastd     m3,         [r5 + r4 + 12]
%else
    vpbroadcastd     m0,         [tab_LumaCoeff + r4]
    vpbroadcastd     m1,         [tab_LumaCoeff + r4 + 4]
    vpbroadcastd     m2,         [tab_LumaCoeff + r4 + 8]
    vpbroadcastd     m3,         [tab_LumaCoeff + r4 + 12]
%endif
    ; shared constants expected by the PROCESS_* macros (see their register maps)
    vbroadcasti32x8  m4,         [interp8_hpp_shuf1_load_avx512]
    vbroadcasti32x8  m5,         [interp8_hpp_shuf2_load_avx512]
    vbroadcasti32x8  m6,         [pd_32]
    pxor             m7,         m7
    vbroadcasti32x8  m8,         [pw_pixel_max]
    vbroadcasti32x8  m9,         [interp8_hpp_shuf1_store_avx512]
    lea              r6,         [3 * r1]
    lea              r7,         [3 * r3]

    ; 32 rows = 8 four-row groups; the last one omits the pointer advance
%rep 7
    PROCESS_IPFILTER_LUMA_PP_24x4_AVX512
    lea              r0,         [r0 + 4 * r1]
    lea              r2,         [r2 + 4 * r3]
%endrep
    PROCESS_IPFILTER_LUMA_PP_24x4_AVX512
    RET
%endif
5854
+
5855
%macro IPFILTER_LUMA_AVX512_32xN 1
; interp_8tap_horiz_pp_32x%1 - horizontal 8-tap luma filter, pp variant,
; 32-pixel-wide blocks, height %1, processed two rows at a time. Note the
; narrower GPR budget (5,6,17): the 32x2 row macro needs no r6/r7 multiples.
; Operand roles per the code below: r0 = src (read), r2 = dst (written),
; r1/r3 strides (doubled for 2-byte pixels), r4m = coefficient row index.
INIT_ZMM avx512
cglobal interp_8tap_horiz_pp_32x%1, 5,6,17
    add              r1d,        r1d
    add              r3d,        r3d
    sub              r0,         6              ; back up 3 pixels (6 bytes) for the left filter taps
    mov              r4d,        r4m
    shl              r4d,        4              ; coeffIdx * 16 bytes = one 8-word row of tab_LumaCoeff

%ifdef PIC
    ; PIC build: materialize the table address before indexing
    lea              r5,         [tab_LumaCoeff]
    vpbroadcastd     m0,         [r5 + r4]
    vpbroadcastd     m1,         [r5 + r4 + 4]
    vpbroadcastd     m2,         [r5 + r4 + 8]
    vpbroadcastd     m3,         [r5 + r4 + 12]
%else
    vpbroadcastd     m0,         [tab_LumaCoeff + r4]
    vpbroadcastd     m1,         [tab_LumaCoeff + r4 + 4]
    vpbroadcastd     m2,         [tab_LumaCoeff + r4 + 8]
    vpbroadcastd     m3,         [tab_LumaCoeff + r4 + 12]
%endif
    ; shared constants expected by the PROCESS_* macros (see their register maps)
    vbroadcasti32x8  m4,         [interp8_hpp_shuf1_load_avx512]
    vbroadcasti32x8  m5,         [interp8_hpp_shuf2_load_avx512]
    vbroadcasti32x8  m6,         [pd_32]
    pxor             m7,         m7
    vbroadcasti32x8  m8,         [pw_pixel_max]
    vbroadcasti32x8  m9,         [interp8_hpp_shuf1_store_avx512]

    ; %1/2 two-row groups; the last one omits the pointer advance
%rep %1/2 - 1
    PROCESS_IPFILTER_LUMA_PP_32x2_AVX512
    lea              r0,         [r0 + 2 * r1]
    lea              r2,         [r2 + 2 * r3]
%endrep
    PROCESS_IPFILTER_LUMA_PP_32x2_AVX512
    RET
%endmacro
5891
+
5892
%if ARCH_X86_64
; instantiate the 32-wide luma pp kernels for the supported heights (x86-64 only)
IPFILTER_LUMA_AVX512_32xN 8
IPFILTER_LUMA_AVX512_32xN 16
IPFILTER_LUMA_AVX512_32xN 24
IPFILTER_LUMA_AVX512_32xN 32
IPFILTER_LUMA_AVX512_32xN 64
%endif
5899
+
5900
%macro IPFILTER_LUMA_AVX512_64xN 1
; interp_8tap_horiz_pp_64x%1 - horizontal 8-tap luma filter, pp variant,
; 64-pixel-wide blocks, height %1, processed two rows at a time. Like the
; 32xN kernel it needs no r6/r7 stride multiples (GPR budget 5,6,17).
; Operand roles per the code below: r0 = src (read), r2 = dst (written),
; r1/r3 strides (doubled for 2-byte pixels), r4m = coefficient row index.
INIT_ZMM avx512
cglobal interp_8tap_horiz_pp_64x%1, 5,6,17
    add              r1d,        r1d
    add              r3d,        r3d
    sub              r0,         6              ; back up 3 pixels (6 bytes) for the left filter taps
    mov              r4d,        r4m
    shl              r4d,        4              ; coeffIdx * 16 bytes = one 8-word row of tab_LumaCoeff

%ifdef PIC
    ; PIC build: materialize the table address before indexing
    lea              r5,         [tab_LumaCoeff]
    vpbroadcastd     m0,         [r5 + r4]
    vpbroadcastd     m1,         [r5 + r4 + 4]
    vpbroadcastd     m2,         [r5 + r4 + 8]
    vpbroadcastd     m3,         [r5 + r4 + 12]
%else
    vpbroadcastd     m0,         [tab_LumaCoeff + r4]
    vpbroadcastd     m1,         [tab_LumaCoeff + r4 + 4]
    vpbroadcastd     m2,         [tab_LumaCoeff + r4 + 8]
    vpbroadcastd     m3,         [tab_LumaCoeff + r4 + 12]
%endif
    ; shared constants expected by the PROCESS_* macros (see their register maps)
    vbroadcasti32x8  m4,         [interp8_hpp_shuf1_load_avx512]
    vbroadcasti32x8  m5,         [interp8_hpp_shuf2_load_avx512]
    vbroadcasti32x8  m6,         [pd_32]
    pxor             m7,         m7
    vbroadcasti32x8  m8,         [pw_pixel_max]
    vbroadcasti32x8  m9,         [interp8_hpp_shuf1_store_avx512]

    ; %1/2 two-row groups; the last one omits the pointer advance
%rep %1/2 - 1
    PROCESS_IPFILTER_LUMA_PP_64x2_AVX512
    lea              r0,         [r0 + 2 * r1]
    lea              r2,         [r2 + 2 * r3]
%endrep
    PROCESS_IPFILTER_LUMA_PP_64x2_AVX512
    RET
%endmacro
5936
+
5937
%if ARCH_X86_64
; instantiate the 64-wide luma pp kernels for the supported heights (x86-64 only)
IPFILTER_LUMA_AVX512_64xN 16
IPFILTER_LUMA_AVX512_64xN 32
IPFILTER_LUMA_AVX512_64xN 48
IPFILTER_LUMA_AVX512_64xN 64
%endif
5943
+
5944
%if ARCH_X86_64
INIT_ZMM avx512
; interp_8tap_horiz_pp_48x64 - horizontal 8-tap luma filter, pp variant, for
; the single 48x64 block size (16 groups of 4 rows). Same prologue as the
; width-macro kernels above; r0 = src (read), r2 = dst (written), r1/r3
; strides (doubled for 2-byte pixels), r4m = coefficient row index.
cglobal interp_8tap_horiz_pp_48x64, 5,8,17
    add              r1d,        r1d
    add              r3d,        r3d
    sub              r0,         6              ; back up 3 pixels (6 bytes) for the left filter taps
    mov              r4d,        r4m
    shl              r4d,        4              ; coeffIdx * 16 bytes = one 8-word row of tab_LumaCoeff

%ifdef PIC
    ; PIC build: materialize the table address before indexing
    lea              r5,         [tab_LumaCoeff]
    vpbroadcastd     m0,         [r5 + r4]
    vpbroadcastd     m1,         [r5 + r4 + 4]
    vpbroadcastd     m2,         [r5 + r4 + 8]
    vpbroadcastd     m3,         [r5 + r4 + 12]
%else
    vpbroadcastd     m0,         [tab_LumaCoeff + r4]
    vpbroadcastd     m1,         [tab_LumaCoeff + r4 + 4]
    vpbroadcastd     m2,         [tab_LumaCoeff + r4 + 8]
    vpbroadcastd     m3,         [tab_LumaCoeff + r4 + 12]
%endif
    ; shared constants expected by the PROCESS_* macros (see their register maps)
    vbroadcasti32x8  m4,         [interp8_hpp_shuf1_load_avx512]
    vbroadcasti32x8  m5,         [interp8_hpp_shuf2_load_avx512]
    vbroadcasti32x8  m6,         [pd_32]
    pxor             m7,         m7
    vbroadcasti32x8  m8,         [pw_pixel_max]
    vbroadcasti32x8  m9,         [interp8_hpp_shuf1_store_avx512]
    lea              r6,         [3 * r1]
    lea              r7,         [3 * r3]

    ; 64 rows = 16 four-row groups; the last one omits the pointer advance
%rep 15
    PROCESS_IPFILTER_LUMA_PP_48x4_AVX512
    lea              r0,         [r0 + 4 * r1]
    lea              r2,         [r2 + 4 * r3]
%endrep
    PROCESS_IPFILTER_LUMA_PP_48x4_AVX512
    RET
%endif
5982
+;-------------------------------------------------------------------------------------------------------------
5983
+;avx512 luma_hps code start
5984
+;-------------------------------------------------------------------------------------------------------------
5985
+
5986
; Filter two rows of 32 16-bit pixels horizontally with the 8-tap luma filter,
; producing unclamped 16-bit intermediate ("ps") output rows at r2 and r2+r3.
; Each row is handled as three overlapping unaligned loads ([r0], [r0+8],
; [r0+16]) that together cover the 8-tap window of every output pixel; the
; shuffle tables in m4/m5 regroup neighboring samples so that pmaddwd against
; the broadcast coefficient pairs m0..m3 yields 32-bit partial sums.
; NOTE(review): output is offset by m6 and shifted by INTERP_SHIFT_PS, with no
; clamp to pixel range — consistent with a ps (pixel-to-short) kernel.
%macro PROCESS_IPFILTER_LUMA_PS_32x2_AVX512 0
    ; register map
    ; m0, m1, m2, m3 - interpolate coeff (broadcast tap pairs)
    ; m4, m5         - shuffle load order table
    ; m6             - INTERP_OFFSET_PS (rounding offset)
    ; m7             - shuffle store order table
    ; m8-m14         - scratch

    ; ---- row 0 ----
    movu            m8,       [r0]
    movu            m9,       [r0 + 8]
    movu            m10,      [r0 + 16]

    ; low half of the row: taps 0/1 from m8, taps 2/3 from m9
    pshufb          m11,      m8,        m5
    pshufb          m8,       m4
    pmaddwd         m8,       m0
    pmaddwd         m11,      m1
    paddd           m8,       m11
    pshufb          m12,      m9,        m5
    pshufb          m9,       m4
    pmaddwd         m11,      m12,       m3
    pmaddwd         m14,      m9,        m2
    paddd           m11,      m14

    paddd           m8,       m11
    paddd           m8,       m6
    psrad           m8,       INTERP_SHIFT_PS

    ; high half of the row: m9/m12 (still holding shuffled mid samples)
    ; provide taps 0/1, m10/m13 provide taps 2/3
    pshufb          m13,      m10,       m5
    pshufb          m10,      m4
    pmaddwd         m9,       m0
    pmaddwd         m12,      m1
    paddd           m9,       m12
    pmaddwd         m13,      m3
    pmaddwd         m10,      m2
    paddd           m10,      m13

    paddd           m9,       m10
    paddd           m9,       m6
    psrad           m9,       INTERP_SHIFT_PS

    ; narrow the two 32-bit halves back to 16-bit and restore lane order
    packssdw        m8,       m9
    pshufb          m8,       m7
    movu            [r2],     m8

    ; ---- row 1 (same pipeline at stride offset) ----
    movu            m8,       [r0 + r1]
    movu            m9,       [r0 + r1 + 8]
    movu            m10,      [r0 + r1 + 16]

    pshufb          m11,      m8,        m5
    pshufb          m8,       m4
    pmaddwd         m8,       m0
    pmaddwd         m11,      m1
    paddd           m8,       m11
    pshufb          m12,      m9,        m5
    pshufb          m9,       m4
    pmaddwd         m11,      m12,       m3
    pmaddwd         m14,      m9,        m2
    paddd           m11,      m14

    paddd           m8,       m11
    paddd           m8,       m6
    psrad           m8,       INTERP_SHIFT_PS

    pshufb          m13,      m10,       m5
    pshufb          m10,      m4
    pmaddwd         m9,       m0
    pmaddwd         m12,      m1
    paddd           m9,       m12
    pmaddwd         m12,      m13,       m3
    pmaddwd         m14,      m10,       m2
    paddd           m12,      m14

    paddd           m9,       m12
    paddd           m9,       m6
    psrad           m9,       INTERP_SHIFT_PS

    packssdw        m8,       m9
    pshufb          m8,       m7
    movu            [r2 + r3],m8
%endmacro
6065
+
6066
; Single-row variant of PROCESS_IPFILTER_LUMA_PS_32x2_AVX512: filter one row of
; 32 16-bit pixels with the 8-tap luma filter and store the unclamped 16-bit
; intermediate result at [r2]. Used by the caller for the odd leading row when
; the row-extension flag is set. Same register map as the 32x2 macro.
%macro PROCESS_IPFILTER_LUMA_PS_32x1_AVX512 0
    movu            m8,       [r0]
    movu            m9,       [r0 + 8]
    movu            m10,      [r0 + 16]

    ; low half: taps 0/1 from [r0], taps 2/3 from [r0+8]
    pshufb          m11,      m8,        m5
    pshufb          m8,       m4
    pmaddwd         m8,       m0
    pmaddwd         m11,      m1
    paddd           m8,       m11
    pshufb          m12,      m9,        m5
    pshufb          m9,       m4
    pmaddwd         m11,      m12,       m3
    pmaddwd         m14,      m9,        m2
    paddd           m11,      m14

    paddd           m8,       m11
    paddd           m8,       m6
    psrad           m8,       INTERP_SHIFT_PS

    ; high half: reuse shuffled [r0+8] data (m9/m12) plus [r0+16] (m10/m13)
    pshufb          m13,      m10,       m5
    pshufb          m10,      m4
    pmaddwd         m9,       m0
    pmaddwd         m12,      m1
    paddd           m9,       m12
    pmaddwd         m13,      m3
    pmaddwd         m10,      m2
    paddd           m10,      m13

    paddd           m9,       m10
    paddd           m9,       m6
    psrad           m9,       INTERP_SHIFT_PS

    ; pack both 32-bit halves to 16-bit and fix lane order for the store
    packssdw        m8,       m9
    pshufb          m8,       m7
    movu            [r2],     m8
%endmacro
6103
+
6104
; Emit interp_8tap_horiz_ps_32x%1 (high-bit-depth, AVX-512):
; horizontal 8-tap luma filter for a 32-wide block of %1 rows, writing 16-bit
; intermediate output. Arguments follow the x265 ipfilter convention:
;   r0 = src, r1 = srcStride (in pixels), r2 = dst, r3 = dstStride (in pixels),
;   r4m = coeffIdx, r5m = isRowExt flag.
%macro IPFILTER_LUMA_PS_AVX512_32xN 1
INIT_ZMM avx512
cglobal interp_8tap_horiz_ps_32x%1, 4,7,15
    shl              r1d,        1                  ; pixel strides -> byte strides (16-bit samples)
    shl              r3d,        1
    mov              r4d,        r4m
    mov              r5d,        r5m
    shl              r4d,        6                  ; coeffIdx * 64 = byte offset into coeff table

%ifdef PIC
    lea              r6,         [tab_LumaCoeffH_avx512]
    vpbroadcastd     m0,         [r6 + r4]
    vpbroadcastd     m1,         [r6 + r4 + 4]
    vpbroadcastd     m2,         [r6 + r4 + 8]
    vpbroadcastd     m3,         [r6 + r4 + 12]
%else
    vpbroadcastd     m0,         [tab_LumaCoeffH_avx512 + r4]
    vpbroadcastd     m1,         [tab_LumaCoeffH_avx512 + r4 + 4]
    vpbroadcastd     m2,         [tab_LumaCoeffH_avx512 + r4 + 8]
    vpbroadcastd     m3,         [tab_LumaCoeffH_avx512 + r4 + 12]
%endif
    vbroadcasti32x8  m4,         [interp8_hpp_shuf1_load_avx512]
    vbroadcasti32x8  m5,         [interp8_hpp_shuf2_load_avx512]
    vbroadcasti32x4  m6,         [INTERP_OFFSET_PS]
    vbroadcasti32x8  m7,         [interp8_hpp_shuf1_store_avx512]

    sub              r0,  6                         ; back up 3 samples (6 bytes) for the 8-tap window
    mov              r4d, %1                        ; r4d = remaining row count
    test             r5d, r5d
    jz               .loop
    ; row extension: start 3 rows above and emit 7 extra rows (one here, six in the loop)
    lea              r6,  [r1 * 3]
    sub              r0,  r6
    add              r4d, 7
    PROCESS_IPFILTER_LUMA_PS_32x1_AVX512
    lea              r0,  [r0 + r1]
    lea              r2,  [r2 + r3]
    dec              r4d                            ; remaining count is now even again

.loop:
    PROCESS_IPFILTER_LUMA_PS_32x2_AVX512
    lea              r0,  [r0 + 2 * r1]
    lea              r2,  [r2 + 2 * r3]
    sub              r4d, 2
    jnz              .loop
    RET
%endmacro

; 64-bit only: the kernels use zmm8-zmm14, unavailable in 32-bit mode.
%if ARCH_X86_64
IPFILTER_LUMA_PS_AVX512_32xN 8
IPFILTER_LUMA_PS_AVX512_32xN 16
IPFILTER_LUMA_PS_AVX512_32xN 24
IPFILTER_LUMA_PS_AVX512_32xN 32
IPFILTER_LUMA_PS_AVX512_32xN 64
%endif
6158
+
6159
; Filter two rows of 64 16-bit pixels horizontally with the 8-tap luma filter,
; writing unclamped 16-bit intermediate output. Each row is processed as two
; 32-pixel chunks: columns 0-31 at [r0], columns 32-63 at [r0 + mmsize]
; (mmsize = 64 bytes = 32 samples for zmm). The per-chunk pipeline is the same
; three-load / shuffle / pmaddwd scheme as the 32-wide macros.
%macro PROCESS_IPFILTER_LUMA_PS_64x2_AVX512 0
    ; register map
    ; m0, m1, m2, m3 - interpolate coeff
    ; m4, m5         - shuffle load order table
    ; m6             - INTERP_OFFSET_PS
    ; m7             - shuffle store order table

    ; ---- row 0, columns 0-31 ----
    movu            m8,       [r0]
    movu            m9,       [r0 + 8]
    movu            m10,      [r0 + 16]

    pshufb          m11,      m8,        m5
    pshufb          m8,       m4
    pmaddwd         m8,       m0
    pmaddwd         m11,      m1
    paddd           m8,       m11
    pshufb          m12,      m9,        m5
    pshufb          m9,       m4
    pmaddwd         m11,      m12,       m3
    pmaddwd         m14,      m9,        m2
    paddd           m11,      m14

    paddd           m8,       m11
    paddd           m8,       m6
    psrad           m8,       INTERP_SHIFT_PS

    pshufb          m13,      m10,       m5
    pshufb          m10,      m4
    pmaddwd         m9,       m0
    pmaddwd         m12,      m1
    paddd           m9,       m12
    pmaddwd         m13,      m3
    pmaddwd         m10,      m2
    paddd           m10,      m13

    paddd           m9,       m10
    paddd           m9,       m6
    psrad           m9,       INTERP_SHIFT_PS

    packssdw        m8,       m9
    pshufb          m8,       m7
    movu            [r2],     m8

    ; ---- row 0, columns 32-63 ----
    movu            m8,       [r0 + mmsize]
    movu            m9,       [r0 + mmsize + 8]
    movu            m10,      [r0 + mmsize + 16]

    pshufb          m11,      m8,        m5
    pshufb          m8,       m4
    pmaddwd         m8,       m0
    pmaddwd         m11,      m1
    paddd           m8,       m11
    pshufb          m12,      m9,        m5
    pshufb          m9,       m4
    pmaddwd         m11,      m12,       m3
    pmaddwd         m14,      m9,        m2
    paddd           m11,      m14
    paddd           m8,       m11
    paddd           m8,       m6
    psrad           m8,       INTERP_SHIFT_PS

    pshufb          m13,      m10,       m5
    pshufb          m10,      m4
    pmaddwd         m9,       m0
    pmaddwd         m12,      m1
    paddd           m9,       m12
    pmaddwd         m13,      m3
    pmaddwd         m10,      m2
    paddd           m10,      m13
    paddd           m9,       m10
    paddd           m9,       m6
    psrad           m9,       INTERP_SHIFT_PS

    packssdw        m8,       m9
    pshufb          m8,       m7
    movu            [r2 + mmsize],       m8

    ; ---- row 1, columns 0-31 ----
    movu            m8,       [r0 + r1]
    movu            m9,       [r0 + r1 + 8]
    movu            m10,      [r0 + r1 + 16]

    pshufb          m11,      m8,        m5
    pshufb          m8,       m4
    pmaddwd         m8,       m0
    pmaddwd         m11,      m1
    paddd           m8,       m11
    pshufb          m12,      m9,        m5
    pshufb          m9,       m4
    pmaddwd         m11,      m12,       m3
    pmaddwd         m14,      m9,        m2
    paddd           m11,      m14
    paddd           m8,       m11
    paddd           m8,       m6
    psrad           m8,       INTERP_SHIFT_PS

    pshufb          m13,      m10,       m5
    pshufb          m10,      m4
    pmaddwd         m9,       m0
    pmaddwd         m12,      m1
    paddd           m9,       m12
    pmaddwd         m12,      m13,       m3
    pmaddwd         m14,      m10,       m2
    paddd           m12,      m14
    paddd           m9,       m12
    paddd           m9,       m6
    psrad           m9,       INTERP_SHIFT_PS

    packssdw        m8,       m9
    pshufb          m8,       m7
    movu            [r2 + r3],m8

    ; ---- row 1, columns 32-63 ----
    movu            m8,       [r0 + r1 + mmsize]
    movu            m9,       [r0 + r1 + mmsize + 8]
    movu            m10,      [r0 + r1 + mmsize + 16]

    pshufb          m11,      m8,        m5
    pshufb          m8,       m4
    pmaddwd         m8,       m0
    pmaddwd         m11,      m1
    paddd           m8,       m11
    pshufb          m12,      m9,        m5
    pshufb          m9,       m4
    pmaddwd         m11,      m12,       m3
    pmaddwd         m14,      m9,        m2
    paddd           m11,      m14
    paddd           m8,       m11
    paddd           m8,       m6
    psrad           m8,       INTERP_SHIFT_PS

    pshufb          m13,      m10,       m5
    pshufb          m10,      m4
    pmaddwd         m9,       m0
    pmaddwd         m12,      m1
    paddd           m9,       m12
    pmaddwd         m12,      m13,       m3
    pmaddwd         m14,      m10,       m2
    paddd           m12,      m14
    paddd           m9,       m12
    paddd           m9,       m6
    psrad           m9,       INTERP_SHIFT_PS

    packssdw        m8,       m9
    pshufb          m8,       m7
    movu            [r2 + r3 + mmsize],  m8
%endmacro
6304
+
6305
; Single-row variant of PROCESS_IPFILTER_LUMA_PS_64x2_AVX512: filter one row of
; 64 16-bit pixels (two 32-pixel chunks at [r0] and [r0 + mmsize]) and store
; the unclamped 16-bit intermediate results at [r2] and [r2 + mmsize].
; Used by the caller for the odd leading row when row extension is requested.
%macro PROCESS_IPFILTER_LUMA_PS_64x1_AVX512 0

    ; ---- columns 0-31 ----
    movu            m8,       [r0]
    movu            m9,       [r0 + 8]
    movu            m10,      [r0 + 16]

    pshufb          m11,      m8,        m5
    pshufb          m8,       m4
    pmaddwd         m8,       m0
    pmaddwd         m11,      m1
    paddd           m8,       m11
    pshufb          m12,      m9,        m5
    pshufb          m9,       m4
    pmaddwd         m11,      m12,       m3
    pmaddwd         m14,      m9,        m2
    paddd           m11,      m14
    paddd           m8,       m11
    paddd           m8,       m6
    psrad           m8,       INTERP_SHIFT_PS

    pshufb          m13,      m10,       m5
    pshufb          m10,      m4
    pmaddwd         m9,       m0
    pmaddwd         m12,      m1
    paddd           m9,       m12
    pmaddwd         m13,      m3
    pmaddwd         m10,      m2
    paddd           m10,      m13
    paddd           m9,       m10
    paddd           m9,       m6
    psrad           m9,       INTERP_SHIFT_PS

    packssdw        m8,       m9
    pshufb          m8,       m7
    movu            [r2],     m8

    ; ---- columns 32-63 ----
    movu            m8,       [r0 + mmsize]
    movu            m9,       [r0 + mmsize + 8]
    movu            m10,      [r0 + mmsize + 16]

    pshufb          m11,      m8,        m5
    pshufb          m8,       m4
    pmaddwd         m8,       m0
    pmaddwd         m11,      m1
    paddd           m8,       m11
    pshufb          m12,      m9,        m5
    pshufb          m9,       m4
    pmaddwd         m11,      m12,       m3
    pmaddwd         m14,      m9,        m2
    paddd           m11,      m14
    paddd           m8,       m11
    paddd           m8,       m6
    psrad           m8,       INTERP_SHIFT_PS

    pshufb          m13,      m10,       m5
    pshufb          m10,      m4
    pmaddwd         m9,       m0
    pmaddwd         m12,      m1
    paddd           m9,       m12
    pmaddwd         m13,      m3
    pmaddwd         m10,      m2
    paddd           m10,      m13
    paddd           m9,       m10
    paddd           m9,       m6
    psrad           m9,       INTERP_SHIFT_PS

    packssdw        m8,       m9
    pshufb          m8,       m7
    movu            [r2 + mmsize],       m8
%endmacro
6375
+
6376
; Emit interp_8tap_horiz_ps_64x%1 (high-bit-depth, AVX-512):
; horizontal 8-tap luma filter for a 64-wide block of %1 rows, 16-bit
; intermediate output. Same argument convention and control flow as the
; 32xN generator; only the row kernels differ.
;   r0 = src, r1 = srcStride (pixels), r2 = dst, r3 = dstStride (pixels),
;   r4m = coeffIdx, r5m = isRowExt flag.
%macro IPFILTER_LUMA_PS_AVX512_64xN 1
INIT_ZMM avx512
cglobal interp_8tap_horiz_ps_64x%1, 4,7,15
    shl              r1d,        1                  ; strides to bytes (16-bit samples)
    shl              r3d,        1
    mov              r4d,        r4m
    mov              r5d,        r5m
    shl              r4d,        6                  ; coeffIdx * 64 byte table offset

%ifdef PIC
    lea              r6,         [tab_LumaCoeffH_avx512]
    vpbroadcastd     m0,         [r6 + r4]
    vpbroadcastd     m1,         [r6 + r4 + 4]
    vpbroadcastd     m2,         [r6 + r4 + 8]
    vpbroadcastd     m3,         [r6 + r4 + 12]
%else
    vpbroadcastd     m0,         [tab_LumaCoeffH_avx512 + r4]
    vpbroadcastd     m1,         [tab_LumaCoeffH_avx512 + r4 + 4]
    vpbroadcastd     m2,         [tab_LumaCoeffH_avx512 + r4 + 8]
    vpbroadcastd     m3,         [tab_LumaCoeffH_avx512 + r4 + 12]
%endif
    vbroadcasti32x8  m4,         [interp8_hpp_shuf1_load_avx512]
    vbroadcasti32x8  m5,         [interp8_hpp_shuf2_load_avx512]
    vbroadcasti32x4  m6,         [INTERP_OFFSET_PS]
    vbroadcasti32x8  m7,         [interp8_hpp_shuf1_store_avx512]

    sub              r0,  6                         ; back up 3 samples for the 8-tap window
    mov              r4d, %1
    test             r5d, r5d
    jz               .loop
    ; row extension: start 3 rows earlier and produce 7 extra rows
    lea              r6,  [r1 * 3]
    sub              r0,  r6
    add              r4d, 7
    PROCESS_IPFILTER_LUMA_PS_64x1_AVX512
    lea              r0,  [r0 + r1]
    lea              r2,  [r2 + r3]
    dec              r4d                            ; row count back to even

.loop:
    PROCESS_IPFILTER_LUMA_PS_64x2_AVX512
    lea              r0,  [r0 + 2 * r1]
    lea              r2,  [r2 + 2 * r3]
    sub              r4d, 2
    jnz              .loop
    RET
%endmacro

; 64-bit only: kernels use zmm8-zmm14.
%if ARCH_X86_64
IPFILTER_LUMA_PS_AVX512_64xN 16
IPFILTER_LUMA_PS_AVX512_64xN 32
IPFILTER_LUMA_PS_AVX512_64xN 48
IPFILTER_LUMA_PS_AVX512_64xN 64
%endif
6429
+
6430
; Filter four rows of 16 16-bit pixels horizontally with the 8-tap luma filter.
; Two source rows are packed into each zmm register (row n in the low ymm half,
; row n+1 inserted into the high half) so the 16-wide case reuses the 32-wide
; arithmetic pipeline. Results are split back out with vextracti32x8.
; Expects r6 = 3*srcStride and r7 = 3*dstStride precomputed by the caller.
%macro PROCESS_IPFILTER_LUMA_PS_16x4_AVX512 0
    ; register map
    ; m0, m1, m2, m3 - interpolate coeff
    ; m4, m5         - shuffle load order table
    ; m6             - INTERP_OFFSET_PS
    ; m7             - shuffle store order table

    ; ---- rows 0 and 1, paired in one zmm ----
    movu            ym8,      [r0]
    vinserti32x8     m8,      [r0 + r1],      1
    movu            ym9,      [r0 + 8]
    vinserti32x8     m9,      [r0 + r1 + 8],  1
    movu            ym10,     [r0 + 16]
    vinserti32x8     m10,     [r0 + r1 + 16], 1

    pshufb          m11,      m8,        m5
    pshufb          m8,       m4
    pmaddwd         m8,       m0
    pmaddwd         m11,      m1
    paddd           m8,       m11
    pshufb          m12,      m9,        m5
    pshufb          m9,       m4
    pmaddwd         m11,      m12,       m3
    pmaddwd         m14,      m9,        m2
    paddd           m11,      m14
    paddd           m8,       m11
    paddd           m8,       m6
    psrad           m8,       INTERP_SHIFT_PS

    pshufb          m13,      m10,       m5
    pshufb          m10,      m4
    pmaddwd         m9,       m0
    pmaddwd         m12,      m1
    paddd           m9,       m12
    pmaddwd         m13,      m3
    pmaddwd         m10,      m2
    paddd           m10,      m13
    paddd           m9,       m10
    paddd           m9,       m6
    psrad           m9,       INTERP_SHIFT_PS

    packssdw        m8,       m9
    pshufb          m8,       m7
    movu            [r2],     ym8                    ; row 0 from the low half
    vextracti32x8   [r2 + r3],m8,        1           ; row 1 from the high half

    ; ---- rows 2 and 3, paired in one zmm ----
    movu            ym8,      [r0 + 2 * r1]
    vinserti32x8     m8,      [r0 + r6],            1
    movu            ym9,      [r0 + 2 * r1 + 8]
    vinserti32x8     m9,      [r0 + r6 + 8],        1
    movu            ym10,     [r0 + 2 * r1 + 16]
    vinserti32x8     m10,     [r0 + r6 + 16],       1

    pshufb          m11,      m8,        m5
    pshufb          m8,       m4
    pmaddwd         m8,       m0
    pmaddwd         m11,      m1
    paddd           m8,       m11
    pshufb          m12,      m9,        m5
    pshufb          m9,       m4
    pmaddwd         m11,      m12,       m3
    pmaddwd         m14,      m9,        m2
    paddd           m11,      m14
    paddd           m8,       m11
    paddd           m8,       m6
    psrad           m8,       INTERP_SHIFT_PS

    pshufb          m13,      m10,       m5
    pshufb          m10,      m4
    pmaddwd         m9,       m0
    pmaddwd         m12,      m1
    paddd           m9,       m12
    pmaddwd         m12,      m13,       m3
    pmaddwd         m14,      m10,       m2
    paddd           m12,      m14
    paddd           m9,       m12
    paddd           m9,       m6
    psrad           m9,       INTERP_SHIFT_PS

    packssdw        m8,       m9
    pshufb          m8,       m7
    movu            [r2 + 2 * r3],       ym8        ; row 2
    vextracti32x8   [r2 + r7],      m8,  1          ; row 3
%endmacro
6513
+
6514
; Filter three rows of 16 16-bit pixels: rows 0 and 1 are paired into zmm
; registers (as in the 16x4 macro), while the leftover row 2 is processed at
; ymm width with the low halves of the same constant registers. Used by the
; 16xN caller to burn the odd 3-row prefix when row extension is requested.
%macro PROCESS_IPFILTER_LUMA_PS_16x3_AVX512 0
    ; ---- rows 0 and 1, paired in one zmm ----
    movu            ym8,      [r0]
    vinserti32x8     m8,      [r0 + r1],      1
    movu            ym9,      [r0 + 8]
    vinserti32x8     m9,      [r0 + r1 + 8],  1
    movu            ym10,     [r0 + 16]
    vinserti32x8     m10,     [r0 + r1 + 16], 1

    pshufb          m11,      m8,             m5
    pshufb          m8,       m4
    pmaddwd         m8,       m0
    pmaddwd         m11,      m1
    paddd           m8,       m11
    pshufb          m12,      m9,             m5
    pshufb          m9,       m4
    pmaddwd         m11,      m12,            m3
    pmaddwd         m14,      m9,             m2
    paddd           m11,      m14
    paddd           m8,       m11
    paddd           m8,       m6
    psrad           m8,       INTERP_SHIFT_PS

    pshufb          m13,      m10,            m5
    pshufb          m10,      m4
    pmaddwd         m9,       m0
    pmaddwd         m12,      m1
    paddd           m9,       m12
    pmaddwd         m13,      m3
    pmaddwd         m10,      m2
    paddd           m10,      m13
    paddd           m9,       m10
    paddd           m9,       m6
    psrad           m9,       INTERP_SHIFT_PS

    packssdw        m8,       m9
    pshufb          m8,       m7
    movu            [r2],     ym8                    ; row 0
    vextracti32x8   [r2 + r3],m8,             1      ; row 1

    ; ---- row 2 alone, at ymm width ----
    movu            ym8,      [r0 + 2 * r1]
    movu            ym9,      [r0 + 2 * r1 + 8]
    movu            ym10,     [r0 + 2 * r1 + 16]

    pshufb          ym11,     ym8,            ym5
    pshufb          ym8,      ym4
    pmaddwd         ym8,      ym0
    pmaddwd         ym11,     ym1
    paddd           ym8,      ym11
    pshufb          ym12,     ym9,            ym5
    pshufb          ym9,      ym4
    pmaddwd         ym11,     ym12,           ym3
    pmaddwd         ym14,     ym9,            ym2
    paddd           ym11,     ym14
    paddd           ym8,      ym11
    paddd           ym8,      ym6
    psrad           ym8,      INTERP_SHIFT_PS

    pshufb          ym13,     ym10,           ym5
    pshufb          ym10,     ym4
    pmaddwd         ym9,      ym0
    pmaddwd         ym12,     ym1
    paddd           ym9,      ym12
    pmaddwd         ym12,     ym13,           ym3
    pmaddwd         ym14,     ym10,           ym2
    paddd           ym12,     ym14
    paddd           ym9,      ym12
    paddd           ym9,      ym6
    psrad           ym9,      INTERP_SHIFT_PS

    packssdw        ym8,      ym9
    pshufb          ym8,      ym7
    movu            [r2 + 2 * r3],            ym8    ; row 2
%endmacro
6587
+
6588
+
6589
; Emit interp_8tap_horiz_ps_16x%1 (high-bit-depth, AVX-512):
; horizontal 8-tap luma filter for a 16-wide block of %1 rows, 16-bit
; intermediate output. Needs 9 GPRs: r6/r7 hold 3*srcStride / 3*dstStride
; for the 16x3/16x4 row kernels, so the coeff table pointer moves to r8.
;   r0 = src, r1 = srcStride (pixels), r2 = dst, r3 = dstStride (pixels),
;   r4m = coeffIdx, r5m = isRowExt flag.
%macro IPFILTER_LUMA_PS_AVX512_16xN 1
INIT_ZMM avx512
cglobal interp_8tap_horiz_ps_16x%1, 4,9,15
    shl              r1d,        1                  ; strides to bytes (16-bit samples)
    shl              r3d,        1
    mov              r4d,        r4m
    mov              r5d,        r5m
    shl              r4d,        6                  ; coeffIdx * 64 byte table offset

    lea              r6,         [3 * r1]           ; r6 = 3 * srcStride
    lea              r7,         [3 * r3]           ; r7 = 3 * dstStride
%ifdef PIC
    lea              r8,         [tab_LumaCoeffH_avx512]
    vpbroadcastd     m0,         [r8 + r4]
    vpbroadcastd     m1,         [r8 + r4 + 4]
    vpbroadcastd     m2,         [r8 + r4 + 8]
    vpbroadcastd     m3,         [r8 + r4 + 12]
%else
    vpbroadcastd     m0,         [tab_LumaCoeffH_avx512 + r4]
    vpbroadcastd     m1,         [tab_LumaCoeffH_avx512 + r4 + 4]
    vpbroadcastd     m2,         [tab_LumaCoeffH_avx512 + r4 + 8]
    vpbroadcastd     m3,         [tab_LumaCoeffH_avx512 + r4 + 12]
%endif
    vbroadcasti32x8  m4,         [interp8_hpp_shuf1_load_avx512]
    vbroadcasti32x8  m5,         [interp8_hpp_shuf2_load_avx512]
    vbroadcasti32x4  m6,         [INTERP_OFFSET_PS]
    vbroadcasti32x8  m7,         [interp8_hpp_shuf1_store_avx512]

    sub              r0,  6                         ; back up 3 samples for the 8-tap window
    mov              r4d, %1
    test             r5d, r5d
    jz               .loop
    ; row extension: start 3 rows earlier, emit 7 extra rows (3 here, 4 in loop)
    lea              r6,  [r1 * 3]                  ; NOTE(review): same value r6 already holds
    sub              r0,  r6
    add              r4d, 7
    PROCESS_IPFILTER_LUMA_PS_16x3_AVX512
    lea              r0,  [r0 + r6]
    lea              r2,  [r2 + r7]
    sub              r4d, 3                         ; remaining count is a multiple of 4 again

.loop:
    PROCESS_IPFILTER_LUMA_PS_16x4_AVX512
    lea              r0,  [r0 + 4 * r1]
    lea              r2,  [r2 + 4 * r3]
    sub              r4d, 4
    jnz              .loop
    RET
%endmacro

; 64-bit only: kernels use zmm8-zmm14 and GPRs r6-r8.
%if ARCH_X86_64
IPFILTER_LUMA_PS_AVX512_16xN 4
IPFILTER_LUMA_PS_AVX512_16xN 8
IPFILTER_LUMA_PS_AVX512_16xN 12
IPFILTER_LUMA_PS_AVX512_16xN 16
IPFILTER_LUMA_PS_AVX512_16xN 32
IPFILTER_LUMA_PS_AVX512_16xN 64
%endif
6646
+
6647
; Filter four rows of 48 16-bit pixels horizontally with the 8-tap luma filter,
; writing unclamped 16-bit intermediate output. Columns 0-31 of each of the four
; rows are processed as full zmm chunks; the remaining columns 32-47 (at byte
; offset mmsize) are handled 16-wide, pairing two rows per zmm as in the 16xN
; kernels. Expects r6 = 3*srcStride and r7 = 3*dstStride from the caller.
%macro PROCESS_IPFILTER_LUMA_PS_48x4_AVX512 0
    ; register map
    ; m0, m1, m2, m3 - interpolate coeff
    ; m4, m5         - shuffle load order table
    ; m6             - INTERP_OFFSET_PS
    ; m7             - shuffle store order table

    ; ---- row 0, columns 0-31 ----
    movu            m8,       [r0]
    movu            m9,       [r0 + 8]
    movu            m10,      [r0 + 16]

    pshufb          m11,      m8,        m5
    pshufb          m8,       m4
    pmaddwd         m8,       m0
    pmaddwd         m11,      m1
    paddd           m8,       m11
    pshufb          m12,      m9,        m5
    pshufb          m9,       m4
    pmaddwd         m11,      m12,       m3
    pmaddwd         m14,      m9,        m2
    paddd           m11,      m14
    paddd           m8,       m11
    paddd           m8,       m6
    psrad           m8,       INTERP_SHIFT_PS

    pshufb          m13,      m10,       m5
    pshufb          m10,      m4
    pmaddwd         m9,       m0
    pmaddwd         m12,      m1
    paddd           m9,       m12
    pmaddwd         m13,      m3
    pmaddwd         m10,      m2
    paddd           m10,      m13
    paddd           m9,       m10
    paddd           m9,       m6
    psrad           m9,       INTERP_SHIFT_PS

    packssdw        m8,       m9
    pshufb          m8,       m7
    movu            [r2],     m8

    ; ---- row 1, columns 0-31 ----
    movu            m8,       [r0 + r1]
    movu            m9,       [r0 + r1 + 8]
    movu            m10,      [r0 + r1 + 16]

    pshufb          m11,      m8,        m5
    pshufb          m8,       m4
    pmaddwd         m8,       m0
    pmaddwd         m11,      m1
    paddd           m8,       m11
    pshufb          m12,      m9,        m5
    pshufb          m9,       m4
    pmaddwd         m11,      m12,       m3
    pmaddwd         m14,      m9,        m2
    paddd           m11,      m14
    paddd           m8,       m11
    paddd           m8,       m6
    psrad           m8,       INTERP_SHIFT_PS

    pshufb          m13,      m10,       m5
    pshufb          m10,      m4
    pmaddwd         m9,       m0
    pmaddwd         m12,      m1
    paddd           m9,       m12
    pmaddwd         m12,      m13,       m3
    pmaddwd         m14,      m10,       m2
    paddd           m12,      m14
    paddd           m9,       m12
    paddd           m9,       m6
    psrad           m9,       INTERP_SHIFT_PS

    packssdw        m8,       m9
    pshufb          m8,       m7
    movu            [r2 + r3],m8

    ; ---- row 2, columns 0-31 ----
    movu            m8,       [r0 + 2 * r1]
    movu            m9,       [r0 + 2 * r1 + 8]
    movu            m10,      [r0 + 2 * r1 + 16]

    pshufb          m11,      m8,        m5
    pshufb          m8,       m4
    pmaddwd         m8,       m0
    pmaddwd         m11,      m1
    paddd           m8,       m11
    pshufb          m12,      m9,        m5
    pshufb          m9,       m4
    pmaddwd         m11,      m12,       m3
    pmaddwd         m14,      m9,        m2
    paddd           m11,      m14
    paddd           m8,       m11
    paddd           m8,       m6
    psrad           m8,       INTERP_SHIFT_PS

    pshufb          m13,      m10,       m5
    pshufb          m10,      m4
    pmaddwd         m9,       m0
    pmaddwd         m12,      m1
    paddd           m9,       m12
    pmaddwd         m13,      m3
    pmaddwd         m10,      m2
    paddd           m10,      m13
    paddd           m9,       m10
    paddd           m9,       m6
    psrad           m9,       INTERP_SHIFT_PS

    packssdw        m8,       m9
    pshufb          m8,       m7
    movu            [r2 + 2 * r3],       m8

    ; ---- row 3, columns 0-31 (r6 = 3 * srcStride) ----
    movu            m8,       [r0 + r6]
    movu            m9,       [r0 + r6 + 8]
    movu            m10,      [r0 + r6 + 16]

    pshufb          m11,      m8,        m5
    pshufb          m8,       m4
    pmaddwd         m8,       m0
    pmaddwd         m11,      m1
    paddd           m8,       m11
    pshufb          m12,      m9,        m5
    pshufb          m9,       m4
    pmaddwd         m11,      m12,       m3
    pmaddwd         m14,      m9,        m2
    paddd           m11,      m14
    paddd           m8,       m11
    paddd           m8,       m6
    psrad           m8,       INTERP_SHIFT_PS

    pshufb          m13,      m10,       m5
    pshufb          m10,      m4
    pmaddwd         m9,       m0
    pmaddwd         m12,      m1
    paddd           m9,       m12
    pmaddwd         m12,      m13,       m3
    pmaddwd         m14,      m10,       m2
    paddd           m12,      m14
    paddd           m9,       m12
    paddd           m9,       m6
    psrad           m9,       INTERP_SHIFT_PS

    packssdw        m8,       m9
    pshufb          m8,       m7
    movu            [r2 + r7],m8

    ; ---- rows 0 and 1, columns 32-47, paired per zmm ----
    movu            ym8,      [r0 + mmsize]
    vinserti32x8     m8,      [r0 + r1 + mmsize],      1
    movu            ym9,      [r0 + mmsize + 8]
    vinserti32x8     m9,      [r0 + r1 + mmsize + 8],  1
    movu            ym10,     [r0 + mmsize + 16]
    vinserti32x8     m10,     [r0 + r1 + mmsize + 16], 1

    pshufb          m11,      m8,        m5
    pshufb          m8,       m4
    pmaddwd         m8,       m0
    pmaddwd         m11,      m1
    paddd           m8,       m11
    pshufb          m12,      m9,        m5
    pshufb          m9,       m4
    pmaddwd         m11,      m12,       m3
    pmaddwd         m14,      m9,        m2
    paddd           m11,      m14
    paddd           m8,       m11
    paddd           m8,       m6
    psrad           m8,       INTERP_SHIFT_PS

    pshufb          m13,      m10,       m5
    pshufb          m10,      m4
    pmaddwd         m9,       m0
    pmaddwd         m12,      m1
    paddd           m9,       m12
    pmaddwd         m13,      m3
    pmaddwd         m10,      m2
    paddd           m10,      m13
    paddd           m9,       m10
    paddd           m9,       m6
    psrad           m9,       INTERP_SHIFT_PS

    packssdw        m8,       m9
    pshufb          m8,       m7
    movu            [r2 + mmsize],      ym8          ; row 0, cols 32-47
    vextracti32x8   [r2 + r3 + mmsize], m8,        1 ; row 1, cols 32-47

    ; ---- rows 2 and 3, columns 32-47, paired per zmm ----
    movu            ym8,      [r0 + 2 * r1 + mmsize]
    vinserti32x8     m8,      [r0 + r6 + mmsize],            1
    movu            ym9,      [r0 + 2 * r1 + mmsize + 8]
    vinserti32x8     m9,      [r0 + r6 + mmsize + 8],        1
    movu            ym10,     [r0 + 2 * r1 + mmsize + 16]
    vinserti32x8     m10,     [r0 + r6 + mmsize + 16],       1

    pshufb          m11,      m8,       m5
    pshufb          m8,       m4
    pmaddwd         m8,       m0
    pmaddwd         m11,      m1
    paddd           m8,       m11
    pshufb          m12,      m9,       m5
    pshufb          m9,       m4
    pmaddwd         m11,      m12,      m3
    pmaddwd         m14,      m9,       m2
    paddd           m11,      m14
    paddd           m8,       m11
    paddd           m8,       m6
    psrad           m8,       INTERP_SHIFT_PS

    pshufb          m13,      m10,      m5
    pshufb          m10,      m4
    pmaddwd         m9,       m0
    pmaddwd         m12,      m1
    paddd           m9,       m12
    pmaddwd         m12,      m13,      m3
    pmaddwd         m14,      m10,      m2
    paddd           m12,      m14
    paddd           m9,       m12
    paddd           m9,       m6
    psrad           m9,       INTERP_SHIFT_PS

    packssdw        m8,       m9
    pshufb          m8,       m7
    movu            [r2 + 2 * r3 + mmsize],        ym8  ; row 2, cols 32-47
    vextracti32x8   [r2 + r7 + mmsize],     m8,      1  ; row 3, cols 32-47
%endmacro
6866
+
6867
+%macro PROCESS_IPFILTER_LUMA_PS_48x3_AVX512 0
6868
+    movu            m8,       [r0]
6869
+    movu            m9,       [r0 + 8]
6870
+    movu            m10,      [r0 + 16]
6871
+
6872
+    pshufb          m11,      m8,        m5
6873
+    pshufb          m8,       m4
6874
+    pmaddwd         m8,       m0
6875
+    pmaddwd         m11,      m1
6876
+    paddd           m8,       m11
6877
+    pshufb          m12,      m9,        m5
6878
+    pshufb          m9,       m4
6879
+    pmaddwd         m11,      m12,       m3
6880
+    pmaddwd         m14,      m9,        m2
6881
+    paddd           m11,      m14
6882
+    paddd           m8,       m11
6883
+    paddd           m8,       m6
6884
+    psrad           m8,       INTERP_SHIFT_PS
6885
+
6886
+    pshufb          m13,      m10,       m5
6887
+    pshufb          m10,      m4
6888
+    pmaddwd         m9,       m0
6889
+    pmaddwd         m12,      m1
6890
+    paddd           m9,       m12
6891
+    pmaddwd         m13,      m3
6892
+    pmaddwd         m10,      m2
6893
+    paddd           m10,      m13
6894
+    paddd           m9,       m10
6895
+    paddd           m9,       m6
6896
+    psrad           m9,       INTERP_SHIFT_PS
6897
+
6898
+    packssdw        m8,       m9
6899
+    pshufb          m8,       m7
6900
+    movu            [r2],     m8
6901
+
6902
+    movu            m8,       [r0 + r1]
6903
+    movu            m9,       [r0 + r1 + 8]
6904
+    movu            m10,      [r0 + r1 + 16]
6905
+
6906
+    pshufb          m11,      m8,        m5
6907
+    pshufb          m8,       m4
6908
+    pmaddwd         m8,       m0
6909
+    pmaddwd         m11,      m1
6910
+    paddd           m8,       m11
6911
+    pshufb          m12,      m9,        m5
6912
+    pshufb          m9,       m4
6913
+    pmaddwd         m11,      m12,       m3
6914
+    pmaddwd         m14,      m9,        m2
6915
+    paddd           m11,      m14
6916
+    paddd           m8,       m11
6917
+    paddd           m8,       m6
6918
+    psrad           m8,       INTERP_SHIFT_PS
6919
+
6920
+    pshufb          m13,      m10,       m5
6921
+    pshufb          m10,      m4
6922
+    pmaddwd         m9,       m0
6923
+    pmaddwd         m12,      m1
6924
+    paddd           m9,       m12
6925
+    pmaddwd         m12,      m13,       m3
6926
+    pmaddwd         m14,      m10,       m2
6927
+    paddd           m12,      m14
6928
+    paddd           m9,       m12
6929
+    paddd           m9,       m6
6930
+    psrad           m9,       INTERP_SHIFT_PS
6931
+
6932
+    packssdw        m8,       m9
6933
+    pshufb          m8,       m7
6934
+    movu            [r2 + r3],m8
6935
+
6936
+    movu            m8,       [r0 + 2 * r1]
6937
+    movu            m9,       [r0 + 2 * r1 + 8]
6938
+    movu            m10,      [r0 + 2 * r1 + 16]
6939
+
6940
+    pshufb          m11,      m8,        m5
6941
+    pshufb          m8,       m4
6942
+    pmaddwd         m8,       m0
6943
+    pmaddwd         m11,      m1
6944
+    paddd           m8,       m11
6945
+    pshufb          m12,      m9,        m5
6946
+    pshufb          m9,       m4
6947
+    pmaddwd         m11,      m12,       m3
6948
+    pmaddwd         m14,      m9,        m2
6949
+    paddd           m11,      m14
6950
+    paddd           m8,       m11
6951
+    paddd           m8,       m6
6952
+    psrad           m8,       INTERP_SHIFT_PS
6953
+
6954
+    pshufb          m13,      m10,       m5
6955
+    pshufb          m10,      m4
6956
+    pmaddwd         m9,       m0
6957
+    pmaddwd         m12,      m1
6958
+    paddd           m9,       m12
6959
+    pmaddwd         m13,      m3
6960
+    pmaddwd         m10,      m2
6961
+    paddd           m10,      m13
6962
+    paddd           m9,       m10
6963
+    paddd           m9,       m6
6964
+    psrad           m9,       INTERP_SHIFT_PS
6965
+
6966
+    packssdw        m8,       m9
6967
+    pshufb          m8,       m7
6968
+    movu            [r2 + 2 * r3],       m8
6969
+
6970
+    movu            ym8,      [r0 + mmsize]
6971
+    vinserti32x8     m8,      [r0 + r1 + mmsize],      1
6972
+    movu            ym9,      [r0 + mmsize + 8]
6973
+    vinserti32x8     m9,      [r0 + r1 + mmsize + 8],  1
6974
+    movu            ym10,     [r0 + mmsize + 16]
6975
+    vinserti32x8     m10,     [r0 + r1 + mmsize + 16], 1
6976
+
6977
+    pshufb          m11,      m8,        m5
6978
+    pshufb          m8,       m4
6979
+    pmaddwd         m8,       m0
6980
+    pmaddwd         m11,      m1
6981
+    paddd           m8,       m11
6982
+    pshufb          m12,      m9,        m5
6983
+    pshufb          m9,       m4
6984
+    pmaddwd         m11,      m12,       m3
6985
+    pmaddwd         m14,      m9,        m2
6986
+    paddd           m11,      m14
6987
+    paddd           m8,       m11
6988
+    paddd           m8,       m6
6989
+    psrad           m8,       INTERP_SHIFT_PS
6990
+
6991
+    pshufb          m13,      m10,       m5
6992
+    pshufb          m10,      m4
6993
+    pmaddwd         m9,       m0
6994
+    pmaddwd         m12,      m1
6995
+    paddd           m9,       m12
6996
+    pmaddwd         m13,      m3
6997
+    pmaddwd         m10,      m2
6998
+    paddd           m10,      m13
6999
+    paddd           m9,       m10
7000
+    paddd           m9,       m6
7001
+    psrad           m9,       INTERP_SHIFT_PS
7002
+
7003
+    packssdw        m8,       m9
7004
+    pshufb          m8,       m7
7005
+    movu            [r2 + mmsize],      ym8
7006
+    vextracti32x8   [r2 + r3 + mmsize], m8,        1
7007
+
7008
+    movu            ym8,      [r0 + 2 * r1 + mmsize]
7009
+    movu            ym9,      [r0 + 2 * r1 + mmsize + 8]
7010
+    movu            ym10,     [r0 + 2 * r1 + mmsize + 16]
7011
+
7012
+    pshufb          ym11,      ym8,       ym5
7013
+    pshufb          ym8,       ym4
7014
+    pmaddwd         ym8,       ym0
7015
+    pmaddwd         ym11,      ym1
7016
+    paddd           ym8,       ym11
7017
+    pshufb          ym12,      ym9,       ym5
7018
+    pshufb          ym9,       ym4
7019
+    pmaddwd         ym11,      ym12,      ym3
7020
+    pmaddwd         ym14,      ym9,       ym2
7021
+    paddd           ym11,      ym14
7022
+    paddd           ym8,       ym11
7023
+    paddd           ym8,       ym6
7024
+    psrad           ym8,       INTERP_SHIFT_PS
7025
+
7026
+    pshufb          ym13,      ym10,      ym5
7027
+    pshufb          ym10,      ym4
7028
+    pmaddwd         ym9,       ym0
7029
+    pmaddwd         ym12,      ym1
7030
+    paddd           ym9,       ym12
7031
+    pmaddwd         ym12,      ym13,      ym3
7032
+    pmaddwd         ym14,      ym10,      ym2
7033
+    paddd           ym12,      ym14
7034
+    paddd           ym9,       ym12
7035
+    paddd           ym9,       ym6
7036
+    psrad           ym9,       INTERP_SHIFT_PS
7037
+
7038
+    packssdw        ym8,       ym9
7039
+    pshufb          ym8,       ym7
7040
+    movu            [r2 + 2 * r3 + mmsize],        ym8
7041
+%endmacro
7042
+
7043
+%if ARCH_X86_64
7044
+INIT_ZMM avx512
7045
+cglobal interp_8tap_horiz_ps_48x64, 4,9,15
7046
+    shl              r1d,        1
7047
+    shl              r3d,        1
7048
+    mov              r4d,        r4m
7049
+    mov              r5d,        r5m
7050
+    shl              r4d,        6
7051
+    lea              r6,         [3 * r1]
7052
+    lea              r7,         [3 * r3]
7053
+%ifdef PIC
7054
+    lea              r8,         [tab_LumaCoeffH_avx512]
7055
+    vpbroadcastd     m0,         [r8 + r4]
7056
+    vpbroadcastd     m1,         [r8 + r4 + 4]
7057
+    vpbroadcastd     m2,         [r8 + r4 + 8]
7058
+    vpbroadcastd     m3,         [r8 + r4 + 12]
7059
+%else
7060
+    vpbroadcastd     m0,         [tab_LumaCoeffH_avx512 + r4]
7061
+    vpbroadcastd     m1,         [tab_LumaCoeffH_avx512 + r4 + 4]
7062
+    vpbroadcastd     m2,         [tab_LumaCoeffH_avx512 + r4 + 8]
7063
+    vpbroadcastd     m3,         [tab_LumaCoeffH_avx512 + r4 + 12]
7064
+%endif
7065
+    vbroadcasti32x8  m4,         [interp8_hpp_shuf1_load_avx512]
7066
+    vbroadcasti32x8  m5,         [interp8_hpp_shuf2_load_avx512]
7067
+    vbroadcasti32x4  m6,         [INTERP_OFFSET_PS]
7068
+    vbroadcasti32x8  m7,         [interp8_hpp_shuf1_store_avx512]
7069
+
7070
+    sub              r0,  6
7071
+    mov              r4d, 64
7072
+    test             r5d, r5d
7073
+    jz               .loop
7074
+    lea              r6,  [r1 * 3]
7075
+    sub              r0,  r6
7076
+    add              r4d, 7
7077
+    PROCESS_IPFILTER_LUMA_PS_48x4_AVX512
7078
+    lea              r0,  [r0 + r6]
7079
+    lea              r2,  [r2 + r7]
7080
+    sub              r4d, 3
7081
+
7082
+.loop:
7083
+    PROCESS_IPFILTER_LUMA_PS_48x4_AVX512
7084
+    lea              r0,  [r0 + 4 * r1]
7085
+    lea              r2,  [r2 + 4 * r3]
7086
+    sub              r4d, 4
7087
+    jnz              .loop
7088
+    RET
7089
+%endif
7090
+
7091
+%macro PROCESS_IPFILTER_LUMA_PS_24x4_AVX512 0
7092
+    ; register map
7093
+    ; m0 , m1, m2, m3 - interpolate coeff table
7094
+    ; m4 , m5         - load shuffle order table
7095
+    ; m6              - INTERP_OFFSET_PS
7096
+    ; m7              - store shuffle order table
7097
+
7098
+    PROCESS_IPFILTER_LUMA_PS_16x4_AVX512
7099
+
7100
+    movu            xm8,      [r0 + mmsize/2]
7101
+    movu            xm9,      [r0 + mmsize/2 + 8]
7102
+    movu            xm10,     [r0 + mmsize/2 + 16]
7103
+
7104
+    vinserti32x4     m8,      [r0 + r1 + mmsize/2],      1
7105
+    vinserti32x4     m9,      [r0 + r1 + mmsize/2 + 8],  1
7106
+    vinserti32x4     m10,     [r0 + r1 + mmsize/2 + 16], 1
7107
+
7108
+    vinserti32x4     m8,      [r0 + 2 * r1 + mmsize/2],           2
7109
+    vinserti32x4     m9,      [r0 + 2 * r1 + mmsize/2 + 8],       2
7110
+    vinserti32x4     m10,     [r0 + 2 * r1 + mmsize/2 + 16],      2
7111
+
7112
+    vinserti32x4     m8,      [r0 + r6 + mmsize/2],      3
7113
+    vinserti32x4     m9,      [r0 + r6 + mmsize/2 + 8],  3
7114
+    vinserti32x4     m10,     [r0 + r6 + mmsize/2 + 16], 3
7115
+
7116
+    pshufb          m11,      m8,        m5
7117
+    pshufb          m8,       m4
7118
+    pmaddwd         m8,       m0
7119
+    pmaddwd         m11,      m1
7120
+    paddd           m8,       m11
7121
+    pshufb          m12,      m9,        m5
7122
+    pshufb          m9,       m4
7123
+    pmaddwd         m11,      m12,       m3
7124
+    pmaddwd         m14,      m9,        m2
7125
+    paddd           m11,      m14
7126
+
7127
+    paddd           m8,       m11
7128
+    paddd           m8,       m6
7129
+    psrad           m8,       INTERP_SHIFT_PS
7130
+
7131
+    pshufb          m13,      m10,       m5
7132
+    pshufb          m10,      m4
7133
+    pmaddwd         m9,       m0
7134
+    pmaddwd         m12,      m1
7135
+    paddd           m9,       m12
7136
+    pmaddwd         m13,      m3
7137
+    pmaddwd         m10,      m2
7138
+    paddd           m10,      m13
7139
+
7140
+    paddd           m9,       m10
7141
+    paddd           m9,       m6
7142
+    psrad           m9,       INTERP_SHIFT_PS
7143
+
7144
+    packssdw        m8,       m9
7145
+    pshufb          m8,       m7
7146
+    movu            [r2 + mmsize/2],      xm8
7147
+    vextracti32x4   [r2 + r3 + mmsize/2],     m8,        1
7148
+    vextracti32x4   [r2 + 2 * r3 + mmsize/2], m8,        2
7149
+    vextracti32x4   [r2 + r7 + mmsize/2],     m8,        3
7150
+%endmacro
7151
+
7152
+%macro PROCESS_IPFILTER_LUMA_PS_24x3_AVX512 0
7153
+
7154
+    PROCESS_IPFILTER_LUMA_PS_16x3_AVX512
7155
+
7156
+    movu            xm8,      [r0 + mmsize/2]
7157
+    movu            xm9,      [r0 + mmsize/2 + 8]
7158
+    movu            xm10,     [r0 + mmsize/2 + 16]
7159
+
7160
+    vinserti32x4     m8,      [r0 + r1 + mmsize/2],      1
7161
+    vinserti32x4     m9,      [r0 + r1 + mmsize/2 + 8],  1
7162
+    vinserti32x4     m10,     [r0 + r1 + mmsize/2 + 16], 1
7163
+
7164
+    vinserti32x4     m8,      [r0 + 2 * r1 + mmsize/2],           2
7165
+    vinserti32x4     m9,      [r0 + 2 * r1 + mmsize/2 + 8],       2
7166
+    vinserti32x4     m10,     [r0 + 2 * r1 + mmsize/2 + 16],      2
7167
+
7168
+    pshufb          m11,      m8,        m5
7169
+    pshufb          m8,       m4
7170
+    pmaddwd         m8,       m0
7171
+    pmaddwd         m11,      m1
7172
+    paddd           m8,       m11
7173
+    pshufb          m12,      m9,        m5
7174
+    pshufb          m9,       m4
7175
+    pmaddwd         m11,      m12,       m3
7176
+    pmaddwd         m14,      m9,        m2
7177
+    paddd           m11,      m14
7178
+
7179
+    paddd           m8,       m11
7180
+    paddd           m8,       m6
7181
+    psrad           m8,       INTERP_SHIFT_PS
7182
+
7183
+    pshufb          m13,      m10,       m5
7184
+    pshufb          m10,      m4
7185
+    pmaddwd         m9,       m0
7186
+    pmaddwd         m12,      m1
7187
+    paddd           m9,       m12
7188
+    pmaddwd         m13,      m3
7189
+    pmaddwd         m10,      m2
7190
+    paddd           m10,      m13
7191
+
7192
+    paddd           m9,       m10
7193
+    paddd           m9,       m6
7194
+    psrad           m9,       INTERP_SHIFT_PS
7195
+
7196
+    packssdw        m8,       m9
7197
+    pshufb          m8,       m7
7198
+    movu            [r2 + mmsize/2],      xm8
7199
+    vextracti32x4   [r2 + r3 + mmsize/2],     m8,        1
7200
+    vextracti32x4   [r2 + 2 * r3 + mmsize/2], m8,        2
7201
+%endmacro
7202
+
7203
+%if ARCH_X86_64
7204
+INIT_ZMM avx512
7205
+cglobal interp_8tap_horiz_ps_24x32, 4, 9, 15
7206
+    shl              r1d,        1
7207
+    shl              r3d,        1
7208
+    mov              r4d,        r4m
7209
+    mov              r5d,        r5m
7210
+    shl              r4d,        6
7211
+
7212
+    lea              r6,         [3 * r1]
7213
+    lea              r7,         [3 * r3]
7214
+
7215
+%ifdef PIC
7216
+    lea              r8,         [tab_LumaCoeffH_avx512]
7217
+    vpbroadcastd     m0,         [r8 + r4]
7218
+    vpbroadcastd     m1,         [r8 + r4 + 4]
7219
+    vpbroadcastd     m2,         [r8 + r4 + 8]
7220
+    vpbroadcastd     m3,         [r8 + r4 + 12]
7221
+%else
7222
+    vpbroadcastd     m0,         [tab_LumaCoeffH_avx512 + r4]
7223
+    vpbroadcastd     m1,         [tab_LumaCoeffH_avx512 + r4 + 4]
7224
+    vpbroadcastd     m2,         [tab_LumaCoeffH_avx512 + r4 + 8]
7225
+    vpbroadcastd     m3,         [tab_LumaCoeffH_avx512 + r4 + 12]
7226
+%endif
7227
+    vbroadcasti32x8  m4,         [interp8_hpp_shuf1_load_avx512]
7228
+    vbroadcasti32x8  m5,         [interp8_hpp_shuf2_load_avx512]
7229
+    vbroadcasti32x4  m6,         [INTERP_OFFSET_PS]
7230
+    vbroadcasti32x8  m7,         [interp8_hpp_shuf1_store_avx512]
7231
+
7232
+    sub              r0,         6
7233
+    mov              r4d, 32
7234
+    test             r5d, r5d
7235
+    jz               .loop
7236
+    sub              r0,  r6
7237
+    add              r4d, 7
7238
+    PROCESS_IPFILTER_LUMA_PS_24x3_AVX512
7239
+    lea              r0,  [r0 + r6]
7240
+    lea              r2,  [r2 + r7]
7241
+    sub              r4d, 3
7242
+
7243
+.loop:
7244
+    PROCESS_IPFILTER_LUMA_PS_24x4_AVX512
7245
+    lea              r0,         [r0 + 4 * r1]
7246
+    lea              r2,         [r2 + 4 * r3]
7247
+    sub              r4d,        4
7248
+    jnz              .loop
7249
+    RET
7250
+%endif
7251
+%macro PROCESS_IPFILTER_LUMA_PS_8x4_AVX512 0
7252
+    ; register map
7253
+    ; m0 , m1, m2, m3 - interpolate coeff table
7254
+    ; m4 , m5         - load shuffle order table
7255
+    ; m6              - INTERP_OFFSET_PS
7256
+    ; m7              - store shuffle order table
7257
+
7258
+    movu            xm8,      [r0]
7259
+    movu            xm9,      [r0 + 8]
7260
+    movu            xm10,     [r0 + 16]
7261
+
7262
+    vinserti32x4     m8,      [r0 + r1],      1
7263
+    vinserti32x4     m9,      [r0 + r1 + 8],  1
7264
+    vinserti32x4     m10,     [r0 + r1 + 16], 1
7265
+
7266
+    vinserti32x4     m8,      [r0 + 2 * r1],           2
7267
+    vinserti32x4     m9,      [r0 + 2 * r1 + 8],       2
7268
+    vinserti32x4     m10,     [r0 + 2 * r1 + 16],      2
7269
+
7270
+    vinserti32x4     m8,      [r0 + r6],      3
7271
+    vinserti32x4     m9,      [r0 + r6 + 8],  3
7272
+    vinserti32x4     m10,     [r0 + r6 + 16], 3
7273
+
7274
+    pshufb          m11,      m8,         m5
7275
+    pshufb          m8,       m4
7276
+    pmaddwd         m8,       m0
7277
+    pmaddwd         m11,      m1
7278
+    paddd           m8,       m11
7279
+    pshufb          m12,      m9,         m5
7280
+    pshufb          m9,       m4
7281
+    pmaddwd         m11,      m12,       m3
7282
+    pmaddwd         m14,      m9,       m2
7283
+    paddd           m11,      m14
7284
+
7285
+    paddd           m8,       m11
7286
+    paddd           m8,       m6
7287
+    psrad           m8,       INTERP_SHIFT_PS
7288
+
7289
+    pshufb          m13,      m10,        m5
7290
+    pshufb          m10,      m4
7291
+    pmaddwd         m9,       m0
7292
+    pmaddwd         m12,      m1
7293
+    paddd           m9,       m12
7294
+    pmaddwd         m13,      m3
7295
+    pmaddwd         m10,      m2
7296
+    paddd           m10,      m13
7297
+
7298
+    paddd           m9,       m10
7299
+    paddd           m9,       m6
7300
+    psrad           m9,       INTERP_SHIFT_PS
7301
+
7302
+    packssdw        m8,       m9
7303
+    pshufb          m8,       m7
7304
+    movu            [r2],     xm8
7305
+    vextracti32x4   [r2 + r3],     m8,        1
7306
+    vextracti32x4   [r2 + 2 * r3], m8,        2
7307
+    vextracti32x4   [r2 + r7],     m8,        3
7308
+%endmacro
7309
+
7310
+%macro PROCESS_IPFILTER_LUMA_PS_8x3_AVX512 0
7311
+    movu            xm8,      [r0]
7312
+    movu            xm9,      [r0 + 8]
7313
+    movu            xm10,     [r0 + 16]
7314
+
7315
+    vinserti32x4     m8,      [r0 + r1],      1
7316
+    vinserti32x4     m9,      [r0 + r1 + 8],  1
7317
+    vinserti32x4     m10,     [r0 + r1 + 16], 1
7318
+
7319
+    vinserti32x4     m8,      [r0 + 2 * r1],           2
7320
+    vinserti32x4     m9,      [r0 + 2 * r1 + 8],       2
7321
+    vinserti32x4     m10,     [r0 + 2 * r1 + 16],      2
7322
+
7323
+    pshufb          m11,      m8,        m5
7324
+    pshufb          m8,       m4
7325
+    pmaddwd         m8,       m0
7326
+    pmaddwd         m11,      m1
7327
+    paddd           m8,       m11
7328
+    pshufb          m12,      m9,        m5
7329
+    pshufb          m9,       m4
7330
+    pmaddwd         m11,      m12,       m3
7331
+    pmaddwd         m14,      m9,       m2
7332
+    paddd           m11,      m14
7333
+
7334
+    paddd           m8,       m11
7335
+    paddd           m8,       m6
7336
+    psrad           m8,       INTERP_SHIFT_PS
7337
+
7338
+    pshufb          m13,      m10,        m5
7339
+    pshufb          m10,      m4
7340
+    pmaddwd         m9,       m0
7341
+    pmaddwd         m12,      m1
7342
+    paddd           m9,       m12
7343
+    pmaddwd         m13,      m3
7344
+    pmaddwd         m10,      m2
7345
+    paddd           m10,      m13
7346
+
7347
+    paddd           m9,       m10
7348
+    paddd           m9,       m6
7349
+    psrad           m9,       INTERP_SHIFT_PS
7350
+
7351
+    packssdw        m8,       m9
7352
+    pshufb          m8,       m7
7353
+    movu            [r2],     xm8
7354
+    vextracti32x4   [r2 + r3],     m8,        1
7355
+    vextracti32x4   [r2 + 2 * r3], m8,        2
7356
+%endmacro
7357
+
7358
+%macro IPFILTER_LUMA_PS_AVX512_8xN 1
7359
+INIT_ZMM avx512
7360
+cglobal interp_8tap_horiz_ps_8x%1, 4, 9, 15
7361
+    shl              r1d,        1
7362
+    shl              r3d,        1
7363
+    mov              r4d,        r4m
7364
+    mov              r5d,        r5m
7365
+    shl              r4d,        6
7366
+
7367
+    lea              r6,         [3 * r1]
7368
+    lea              r7,         [3 * r3]
7369
+
7370
+%ifdef PIC
7371
+    lea              r8,         [tab_LumaCoeffH_avx512]
7372
+    vpbroadcastd     m0,         [r8 + r4]
7373
+    vpbroadcastd     m1,         [r8 + r4 + 4]
7374
+    vpbroadcastd     m2,         [r8 + r4 + 8]
7375
+    vpbroadcastd     m3,         [r8 + r4 + 12]
7376
+%else
7377
+    vpbroadcastd     m0,         [tab_LumaCoeffH_avx512 + r4]
7378
+    vpbroadcastd     m1,         [tab_LumaCoeffH_avx512 + r4 + 4]
7379
+    vpbroadcastd     m2,         [tab_LumaCoeffH_avx512 + r4 + 8]
7380
+    vpbroadcastd     m3,         [tab_LumaCoeffH_avx512 + r4 + 12]
7381
+%endif
7382
+    vbroadcasti32x8  m4,         [interp8_hpp_shuf1_load_avx512]
7383
+    vbroadcasti32x8  m5,         [interp8_hpp_shuf2_load_avx512]
7384
+    vbroadcasti32x4  m6,         [INTERP_OFFSET_PS]
7385
+    vbroadcasti32x8  m7,         [interp8_hpp_shuf1_store_avx512]
7386
+
7387
+    sub              r0,         6
7388
+    mov              r4d, %1
7389
+    test             r5d, r5d
7390
+    jz               .loop
7391
+    sub              r0,  r6
7392
+    add              r4d, 7
7393
+    PROCESS_IPFILTER_LUMA_PS_8x3_AVX512
7394
+    lea              r0,  [r0 + r6]
7395
+    lea              r2,  [r2 + r7]
7396
+    sub              r4d, 3
7397
+
7398
+.loop:
7399
+    PROCESS_IPFILTER_LUMA_PS_8x4_AVX512
7400
+    lea              r0,         [r0 + 4 * r1]
7401
+    lea              r2,         [r2 + 4 * r3]
7402
+    sub              r4d,        4
7403
+    jnz              .loop
7404
+    RET
7405
+%endmacro
7406
+
7407
+%if ARCH_X86_64
7408
+    IPFILTER_LUMA_PS_AVX512_8xN 4
7409
+    IPFILTER_LUMA_PS_AVX512_8xN 8
7410
+    IPFILTER_LUMA_PS_AVX512_8xN 16
7411
+    IPFILTER_LUMA_PS_AVX512_8xN 32
7412
+%endif
7413
+
7414
+;-------------------------------------------------------------------------------------------------------------
7415
+;avx512 luma_hps code end
7416
+;-------------------------------------------------------------------------------------------------------------
7417
+;-------------------------------------------------------------------------------------------------------------
7418
+;avx512 luma_vss and luma_vsp code start
7419
+;-------------------------------------------------------------------------------------------------------------
7420
+%macro PROCESS_LUMA_VERT_S_8x8_AVX512 1
7421
+    lea                  r6,                  [r0 + 4 * r1]
7422
+    movu                 xm1,                 [r0]                           ;0 row
7423
+    vinserti32x4         m1,                  [r0 + 2 * r1],          1
7424
+    vinserti32x4         m1,                  [r0 + 4 * r1],          2
7425
+    vinserti32x4         m1,                  [r6 + 2 * r1],          3
7426
+    movu                 xm3,                 [r0 + r1]                      ;1 row
7427
+    vinserti32x4         m3,                  [r0 + r7],              1
7428
+    vinserti32x4         m3,                  [r6 + r1],              2
7429
+    vinserti32x4         m3,                  [r6 + r7],              3
7430
+    punpcklwd            m0,                  m1,                     m3
7431
+    pmaddwd              m0,                  m15
7432
+    punpckhwd            m1,                  m3
7433
+    pmaddwd              m1,                  m15
7434
+
7435
+    movu                 xm4,                 [r0 + 2 * r1]                  ;2 row
7436
+    vinserti32x4         m4,                  [r0 + 4 * r1],          1
7437
+    vinserti32x4         m4,                  [r6 + 2 * r1],          2
7438
+    vinserti32x4         m4,                  [r6 + 4 * r1],          3
7439
+    punpcklwd            m2,                  m3,                     m4
7440
+    pmaddwd              m2,                  m15
7441
+    punpckhwd            m3,                  m4
7442
+    pmaddwd              m3,                  m15
7443
+
7444
+    lea                  r4,                  [r6 + 4 * r1]
7445
+    movu                 xm5,                 [r0 + r7]                      ;3 row
7446
+    vinserti32x4         m5,                  [r6 + r1],              1
7447
+    vinserti32x4         m5,                  [r6 + r7],              2
7448
+    vinserti32x4         m5,                  [r4 + r1],              3
7449
+    punpcklwd            m6,                  m4,                     m5
7450
+    pmaddwd              m6,                  m16
7451
+    punpckhwd            m4,                  m5
7452
+    pmaddwd              m4,                  m16
7453
+
7454
+    paddd                m0,                  m6
7455
+    paddd                m1,                  m4
7456
+
7457
+    movu                 xm4,                 [r0 + 4 * r1]                  ;4 row
7458
+    vinserti32x4         m4,                  [r6 + 2 * r1],              1
7459
+    vinserti32x4         m4,                  [r6 + 4 * r1],              2
7460
+    vinserti32x4         m4,                  [r4 + 2 * r1],              3
7461
+    punpcklwd            m6,                  m5,                     m4
7462
+    pmaddwd              m6,                  m16
7463
+    punpckhwd            m5,                  m4
7464
+    pmaddwd              m5,                  m16
7465
+
7466
+    paddd                m2,                  m6
7467
+    paddd                m3,                  m5
7468
+
7469
+    movu                 xm11,                [r6 + r1]                      ;5 row
7470
+    vinserti32x4         m11,                 [r6 + r7],              1
7471
+    vinserti32x4         m11,                 [r4 + r1],              2
7472
+    vinserti32x4         m11,                 [r4 + r7],              3
7473
+    punpcklwd            m8,                  m4,                     m11
7474
+    pmaddwd              m8,                  m17
7475
+    punpckhwd            m4,                  m11
7476
+    pmaddwd              m4,                  m17
7477
+
7478
+    movu                 xm12,                [r6 + 2 * r1]                  ;6 row
7479
+    vinserti32x4         m12,                 [r6 + 4 * r1],          1
7480
+    vinserti32x4         m12,                 [r4 + 2 * r1],          2
7481
+    vinserti32x4         m12,                 [r4 + 4 * r1],          3
7482
+    punpcklwd            m10,                 m11,                    m12
7483
+    pmaddwd              m10,                 m17
7484
+    punpckhwd            m11,                 m12
7485
+    pmaddwd              m11,                 m17
7486
+
7487
+    lea                  r8,                  [r4 + 4 * r1]
7488
+    movu                 xm13,                [r6 + r7]                      ;7 row
7489
+    vinserti32x4         m13,                 [r4 + r1],              1
7490
+    vinserti32x4         m13,                 [r4 + r7],              2
7491
+    vinserti32x4         m13,                 [r8 + r1],              3
7492
+    punpcklwd            m14,                 m12,                    m13
7493
+    pmaddwd              m14,                 m18
7494
+    punpckhwd            m12,                 m13
7495
+    pmaddwd              m12,                 m18
7496
+
7497
+    paddd                m8,                  m14
7498
+    paddd                m4,                  m12
7499
+    paddd                m0,                  m8
7500
+    paddd                m1,                  m4
7501
+
7502
+    movu                 xm12,                [r6 + 4 * r1]                 ; 8 row
7503
+    vinserti32x4         m12,                 [r4 + 2 * r1],          1
7504
+    vinserti32x4         m12,                 [r4 + 4 * r1],          2
7505
+    vinserti32x4         m12,                 [r8 + 2 * r1],          3
7506
+    punpcklwd            m14,                 m13,                    m12
7507
+    pmaddwd              m14,                 m18
7508
+    punpckhwd            m13,                 m12
7509
+    pmaddwd              m13,                 m18
7510
+
7511
+    paddd                m10,                 m14
7512
+    paddd                m11,                 m13
7513
+    paddd                m2,                  m10
7514
+    paddd                m3,                  m11
7515
+
7516
+%ifidn %1, sp
7517
+    paddd                m0,                  m19
7518
+    paddd                m1,                  m19
7519
+    paddd                m2,                  m19
7520
+    paddd                m3,                  m19
7521
+
7522
+    psrad                m0,                  INTERP_SHIFT_SP
7523
+    psrad                m1,                  INTERP_SHIFT_SP
7524
+    psrad                m2,                  INTERP_SHIFT_SP
7525
+    psrad                m3,                  INTERP_SHIFT_SP
7526
+
7527
+    packssdw             m0,                  m1
7528
+    packssdw             m2,                  m3
7529
+    CLIPW2               m0,                  m2,                   m20,                 m21
7530
+%else
7531
+    psrad                m0,                  6
7532
+    psrad                m1,                  6
7533
+    psrad                m2,                  6
7534
+    psrad                m3,                  6
7535
+
7536
+    packssdw             m0,                  m1
7537
+    packssdw             m2,                  m3
7538
+%endif
7539
+
7540
+    movu                 [r2],                xm0
7541
+    movu                 [r2 + r3],           xm2
7542
+    vextracti32x4        [r2 + 2 * r3],       m0,                  1
7543
+    vextracti32x4        [r2 + r5],           m2,                  1
7544
+    lea                  r2,                  [r2 + 4 * r3]
7545
+    vextracti32x4        [r2],                m0,                  2
7546
+    vextracti32x4        [r2 + r3],           m2,                  2
7547
+    vextracti32x4        [r2 + 2 * r3],       m0,                  3
7548
+    vextracti32x4        [r2 + r5],           m2,                  3
7549
+%endmacro
7550
+;-----------------------------------------------------------------------------------------------------------------
7551
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
7552
+;-----------------------------------------------------------------------------------------------------------------
7553
; Driver macro: vertical 8-tap luma interpolation for 8xN blocks, AVX-512.
; %1 = variant: 'ss' (16-bit in -> 16-bit out) or 'sp' (16-bit in -> clamped pel out)
; %2 = block height N (must be a multiple of 8; body emits N/8 8x8 tiles)
; args (cglobal): r0=src int16_t*, r1=srcStride, r2=dst, r3=dstStride, r4=coeffIdx
+%macro FILTER_VER_S_LUMA_8xN_AVX512 2
7554
+INIT_ZMM avx512
7555
+cglobal interp_8tap_vert_%1_8x%2, 5, 9, 22
7556
; strides are element counts; double them to byte strides (int16_t input)
+    add                   r1d,                r1d
7557
+    add                   r3d,                r3d
7558
+    lea                   r7,                 [3 * r1]
7559
; rewind src by 3 rows: the 8-tap filter needs 3 rows above the target row
+    sub                   r0,                 r7
7560
; coeffIdx * 256 = byte offset of this filter's 4 zmm coefficient rows
+    shl                   r4d,                8
7561
+%ifdef PIC
7562
+    lea                   r5,                 [tab_LumaCoeffVer_avx512]
7563
; m15..m18 hold the 4 interleaved coefficient pairs for the 8 taps
+    mova                  m15,                [r5 + r4]
7564
+    mova                  m16,                [r5 + r4 + 1 * mmsize]
7565
+    mova                  m17,                [r5 + r4 + 2 * mmsize]
7566
+    mova                  m18,                [r5 + r4 + 3 * mmsize]
7567
+%else
7568
+    lea                   r5,                 [tab_LumaCoeffVer_avx512 + r4]
7569
+    mova                  m15,                [r5]
7570
+    mova                  m16,                [r5 + 1 * mmsize]
7571
+    mova                  m17,                [r5 + 2 * mmsize]
7572
+    mova                  m18,                [r5 + 3 * mmsize]
7573
+%endif
7574
+%ifidn %1, sp
7575
; sp only: rounding offset (m19), zero (m20) and pw_pixel_max (m21) for the final clamp
+    vbroadcasti32x4       m19,                [INTERP_OFFSET_SP]
7576
+    pxor                  m20,                m20
7577
+    vbroadcasti32x8       m21,                [pw_pixel_max]
7578
+%endif
7579
+    lea                   r5,                 [3 * r3]
7580
+
7581
+%rep %2/8 - 1
7582
+    PROCESS_LUMA_VERT_S_8x8_AVX512 %1
7583
; NOTE(review): r4 is presumably left pointing at the next source tile by
; PROCESS_LUMA_VERT_S_8x8_AVX512 (defined above, not fully visible here) - confirm
+    lea                   r0,                 [r4]
7584
+    lea                   r2,                 [r2 + 4 * r3]
7585
+%endrep
7586
+    PROCESS_LUMA_VERT_S_8x8_AVX512 %1
7587
+    RET
7588
+%endmacro
7589
+
7590
; instantiate ss/sp variants; zmm16+ registers require 64-bit mode
+%if ARCH_X86_64
7591
+    FILTER_VER_S_LUMA_8xN_AVX512 ss, 8
7592
+    FILTER_VER_S_LUMA_8xN_AVX512 ss, 16
7593
+    FILTER_VER_S_LUMA_8xN_AVX512 ss, 32
7594
+    FILTER_VER_S_LUMA_8xN_AVX512 sp, 8
7595
+    FILTER_VER_S_LUMA_8xN_AVX512 sp, 16
7596
+    FILTER_VER_S_LUMA_8xN_AVX512 sp, 32
7597
+%endif
7598
+
7599
; Core macro: one 16x4 output tile of the vertical 8-tap luma filter.
; %1 = 'ss' or 'sp'. Expects: r0=src (3 rows above target), r1=src byte stride,
; r2=dst, r3=dst byte stride, r5=3*r3, r7=3*r1, m15-m18=coefficient pairs,
; and (sp only) m19=offset, m20=0, m21=pw_pixel_max.
; Rows are loaded two-at-a-time into the ymm halves of a zmm, interleaved
; word-wise (punpck[lh]wd) and multiplied with pmaddwd so each dword lane
; accumulates one coefficient pair.
+%macro PROCESS_LUMA_VERT_S_16x4_AVX512 1
7600
+    movu                 ym1,                 [r0]
7601
+    movu                 ym3,                 [r0 + r1]
7602
+    vinserti32x8         m1,                  [r0 + 2 * r1],          1
7603
+    vinserti32x8         m3,                  [r0 + r7],              1
7604
; taps 0/1: rows (0,1) and (2,3) paired in low/high zmm halves
+    punpcklwd            m0,                  m1,                     m3
7605
+    pmaddwd              m0,                  m15
7606
+    punpckhwd            m1,                  m3
7607
+    pmaddwd              m1,                  m15
7608
+
7609
+    lea                  r6,                  [r0 + 4 * r1]
7610
+    movu                 ym4,                 [r0 + 2 * r1]
7611
+    vinserti32x8         m4,                  [r6],                   1
7612
+    punpcklwd            m2,                  m3,                     m4
7613
+    pmaddwd              m2,                  m15
7614
+    punpckhwd            m3,                  m4
7615
+    pmaddwd              m3,                  m15
7616
+
7617
; taps 2/3 (coefficients in m16)
+    movu                 ym5,                 [r0 + r7]
7618
+    vinserti32x8         m5,                  [r6 + r1],              1
7619
+    punpcklwd            m6,                  m4,                     m5
7620
+    pmaddwd              m6,                  m16
7621
+    punpckhwd            m4,                  m5
7622
+    pmaddwd              m4,                  m16
7623
+
7624
+    paddd                m0,                  m6
7625
+    paddd                m1,                  m4
7626
+
7627
+    movu                 ym4,                 [r6]
7628
+    vinserti32x8         m4,                  [r6 + 2 * r1],          1
7629
+    punpcklwd            m6,                  m5,                     m4
7630
+    pmaddwd              m6,                  m16
7631
+    punpckhwd            m5,                  m4
7632
+    pmaddwd              m5,                  m16
7633
+
7634
+    paddd                m2,                  m6
7635
+    paddd                m3,                  m5
7636
+
7637
; taps 4/5 (coefficients in m17)
+    movu                 ym11,                [r6 + r1]
7638
+    vinserti32x8         m11,                 [r6 + r7],              1
7639
+    punpcklwd            m8,                  m4,                     m11
7640
+    pmaddwd              m8,                  m17
7641
+    punpckhwd            m4,                  m11
7642
+    pmaddwd              m4,                  m17
7643
+
7644
+    movu                 ym12,                [r6 + 2 * r1]
7645
+    vinserti32x8         m12,                 [r6 + 4 * r1],          1
7646
+    punpcklwd            m10,                 m11,                    m12
7647
+    pmaddwd              m10,                 m17
7648
+    punpckhwd            m11,                 m12
7649
+    pmaddwd              m11,                 m17
7650
+
7651
; r4 = src + 8 rows; also serves callers as the next-tile source pointer
+    lea                  r4,                  [r6 + 4 * r1]
7652
; taps 6/7 (coefficients in m18)
+    movu                 ym13,                [r6 + r7]
7653
+    vinserti32x8         m13,                 [r4 + r1],              1
7654
+    punpcklwd            m14,                 m12,                    m13
7655
+    pmaddwd              m14,                 m18
7656
+    punpckhwd            m12,                 m13
7657
+    pmaddwd              m12,                 m18
7658
+
7659
+    paddd                m8,                  m14
7660
+    paddd                m4,                  m12
7661
+    paddd                m0,                  m8
7662
+    paddd                m1,                  m4
7663
+
7664
+    movu                 ym12,                [r6 + 4 * r1]
7665
+    vinserti32x8         m12,                 [r4 + 2 * r1],          1
7666
+    punpcklwd            m14,                 m13,                    m12
7667
+    pmaddwd              m14,                 m18
7668
+    punpckhwd            m13,                 m12
7669
+    pmaddwd              m13,                 m18
7670
+
7671
+    paddd                m10,                 m14
7672
+    paddd                m11,                 m13
7673
+    paddd                m2,                  m10
7674
+    paddd                m3,                  m11
7675
+
7676
+%ifidn %1, sp
7677
; sp: round, shift, pack to words and clamp to [0, pw_pixel_max]
+    paddd                m0,                  m19
7678
+    paddd                m1,                  m19
7679
+    paddd                m2,                  m19
7680
+    paddd                m3,                  m19
7681
+
7682
+    psrad                m0,                  INTERP_SHIFT_SP
7683
+    psrad                m1,                  INTERP_SHIFT_SP
7684
+    psrad                m2,                  INTERP_SHIFT_SP
7685
+    psrad                m3,                  INTERP_SHIFT_SP
7686
+
7687
+    packssdw             m0,                  m1
7688
+    packssdw             m2,                  m3
7689
+    CLIPW2               m0,                  m2,                   m20,                 m21
7690
+%else
7691
; ss: plain >>6 and saturating pack, no clamp
+    psrad                m0,                  6
7692
+    psrad                m1,                  6
7693
+    psrad                m2,                  6
7694
+    psrad                m3,                  6
7695
+
7696
+    packssdw             m0,                  m1
7697
+    packssdw             m2,                  m3
7698
+%endif
7699
+
7700
; store 4 output rows of 16 words each
+    movu                 [r2],                ym0
7701
+    movu                 [r2 + r3],           ym2
7702
+    vextracti32x8        [r2 + 2 * r3],       m0,                1
7703
+    vextracti32x8        [r2 + r5],           m2,                1
7704
+%endmacro
7705
+;-----------------------------------------------------------------------------------------------------------------
7706
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
7707
+;-----------------------------------------------------------------------------------------------------------------
7708
; Driver macro: vertical 8-tap luma interpolation for 16xN blocks, AVX-512.
; %1 = 'ss' or 'sp'; %2 = height N (multiple of 4; body emits N/4 16x4 tiles).
; Same setup as the 8xN driver: byte strides, src rewound 3 rows, coefficients
; in m15-m18, sp-only clamp constants in m19-m21.
+%macro FILTER_VER_S_LUMA_16xN_AVX512 2
7709
+INIT_ZMM avx512
7710
+cglobal interp_8tap_vert_%1_16x%2, 5, 8, 22
7711
+    add                   r1d,                r1d
7712
+    add                   r3d,                r3d
7713
+    lea                   r7,                 [3 * r1]
7714
+    sub                   r0,                 r7
7715
; coeffIdx * 256 selects the 4-zmm coefficient set
+    shl                   r4d,                8
7716
+%ifdef PIC
7717
+    lea                   r5,                 [tab_LumaCoeffVer_avx512]
7718
+    mova                  m15,                [r5 + r4]
7719
+    mova                  m16,                [r5 + r4 + 1 * mmsize]
7720
+    mova                  m17,                [r5 + r4 + 2 * mmsize]
7721
+    mova                  m18,                [r5 + r4 + 3 * mmsize]
7722
+%else
7723
+    lea                   r5,                 [tab_LumaCoeffVer_avx512 + r4]
7724
+    mova                  m15,                [r5]
7725
+    mova                  m16,                [r5 + 1 * mmsize]
7726
+    mova                  m17,                [r5 + 2 * mmsize]
7727
+    mova                  m18,                [r5 + 3 * mmsize]
7728
+%endif
7729
+%ifidn %1, sp
7730
+    vbroadcasti32x4       m19,                [INTERP_OFFSET_SP]
7731
+    pxor                  m20,                m20
7732
+    vbroadcasti32x8       m21,                [pw_pixel_max]
7733
+%endif
7734
+    lea                   r5,                 [3 * r3]
7735
; advance src/dst by 4 rows between tiles (r4 is clobbered inside the tile macro,
; so r0 is advanced explicitly here, unlike the 8xN driver)
+%rep %2/4 - 1
7736
+    PROCESS_LUMA_VERT_S_16x4_AVX512 %1
7737
+    lea                   r0,                 [r0 + 4 * r1]
7738
+    lea                   r2,                 [r2 + 4 * r3]
7739
+%endrep
7740
+    PROCESS_LUMA_VERT_S_16x4_AVX512 %1
7741
+    RET
7742
+%endmacro
7743
+
7744
; instantiate ss/sp variants; zmm16+ registers require 64-bit mode
+%if ARCH_X86_64
7745
+    FILTER_VER_S_LUMA_16xN_AVX512 ss, 4
7746
+    FILTER_VER_S_LUMA_16xN_AVX512 ss, 8
7747
+    FILTER_VER_S_LUMA_16xN_AVX512 ss, 12
7748
+    FILTER_VER_S_LUMA_16xN_AVX512 ss, 16
7749
+    FILTER_VER_S_LUMA_16xN_AVX512 ss, 32
7750
+    FILTER_VER_S_LUMA_16xN_AVX512 ss, 64
7751
+    FILTER_VER_S_LUMA_16xN_AVX512 sp, 4
7752
+    FILTER_VER_S_LUMA_16xN_AVX512 sp, 8
7753
+    FILTER_VER_S_LUMA_16xN_AVX512 sp, 12
7754
+    FILTER_VER_S_LUMA_16xN_AVX512 sp, 16
7755
+    FILTER_VER_S_LUMA_16xN_AVX512 sp, 32
7756
+    FILTER_VER_S_LUMA_16xN_AVX512 sp, 64
7757
+%endif
7758
+
7759
; Core macro: one 24x8 output tile of the vertical 8-tap luma filter.
; Layout: (1) rows 0-3 of the left 16 columns via PROCESS_LUMA_VERT_S_16x4;
; (2) rows 4-7 of the left 16 columns inline (stored at r9 = r2 + 4*r3);
; (3) all 8 rows of the right 8 columns (byte offset mmsize/2 = 32) using
;     xmm loads inserted 4-deep into a zmm, stored 4 lanes at a time.
; Side effects: r2 is advanced by 4*r3 near the end; r4/r6/r8/r9 are clobbered
; (r4 ends at src+8 rows, which the 24x32 driver reuses as the next source).
+%macro PROCESS_LUMA_VERT_S_24x8_AVX512 1
7760
+    PROCESS_LUMA_VERT_S_16x4_AVX512 %1
7761
; r6 = src+4 rows (set by the 16x4 macro); r4 = src+8 rows; r8 = src+12 rows
+    lea                  r4,                  [r6 + 4 * r1]
7762
+    lea                  r8,                  [r4 + 4 * r1]
7763
; left 16 columns, rows 4-7: same tap pattern as the 16x4 macro, based at r6
+    movu                 ym1,                 [r6]
7764
+    movu                 ym3,                 [r6 + r1]
7765
+    vinserti32x8         m1,                  [r6 + 2 * r1],          1
7766
+    vinserti32x8         m3,                  [r6 + r7],              1
7767
+    punpcklwd            m0,                  m1,                     m3
7768
+    pmaddwd              m0,                  m15
7769
+    punpckhwd            m1,                  m3
7770
+    pmaddwd              m1,                  m15
7771
+
7772
+    movu                 ym4,                 [r6 + 2 * r1]
7773
+    vinserti32x8         m4,                  [r4],                   1
7774
+    punpcklwd            m2,                  m3,                     m4
7775
+    pmaddwd              m2,                  m15
7776
+    punpckhwd            m3,                  m4
7777
+    pmaddwd              m3,                  m15
7778
+
7779
+    movu                 ym5,                 [r6 + r7]
7780
+    vinserti32x8         m5,                  [r4 + r1],              1
7781
+    punpcklwd            m6,                  m4,                     m5
7782
+    pmaddwd              m6,                  m16
7783
+    punpckhwd            m4,                  m5
7784
+    pmaddwd              m4,                  m16
7785
+
7786
+    paddd                m0,                  m6
7787
+    paddd                m1,                  m4
7788
+
7789
+    movu                 ym4,                 [r4]
7790
+    vinserti32x8         m4,                  [r4 + 2 * r1],          1
7791
+    punpcklwd            m6,                  m5,                     m4
7792
+    pmaddwd              m6,                  m16
7793
+    punpckhwd            m5,                  m4
7794
+    pmaddwd              m5,                  m16
7795
+
7796
+    paddd                m2,                  m6
7797
+    paddd                m3,                  m5
7798
+
7799
+    movu                 ym11,                [r4 + r1]
7800
+    vinserti32x8         m11,                 [r4 + r7],              1
7801
+    punpcklwd            m8,                  m4,                     m11
7802
+    pmaddwd              m8,                  m17
7803
+    punpckhwd            m4,                  m11
7804
+    pmaddwd              m4,                  m17
7805
+
7806
+    movu                 ym12,                [r4 + 2 * r1]
7807
+    vinserti32x8         m12,                 [r4 + 4 * r1],          1
7808
+    punpcklwd            m10,                 m11,                    m12
7809
+    pmaddwd              m10,                 m17
7810
+    punpckhwd            m11,                 m12
7811
+    pmaddwd              m11,                 m17
7812
+
7813
+    movu                 ym13,                [r4 + r7]
7814
+    vinserti32x8         m13,                 [r8 + r1],              1
7815
+    punpcklwd            m14,                 m12,                    m13
7816
+    pmaddwd              m14,                 m18
7817
+    punpckhwd            m12,                 m13
7818
+    pmaddwd              m12,                 m18
7819
+
7820
+    paddd                m8,                  m14
7821
+    paddd                m4,                  m12
7822
+    paddd                m0,                  m8
7823
+    paddd                m1,                  m4
7824
+
7825
+    movu                 ym12,                [r4 + 4 * r1]
7826
+    vinserti32x8         m12,                 [r8 + 2 * r1],          1
7827
+    punpcklwd            m14,                 m13,                    m12
7828
+    pmaddwd              m14,                 m18
7829
+    punpckhwd            m13,                 m12
7830
+    pmaddwd              m13,                 m18
7831
+
7832
+    paddd                m10,                 m14
7833
+    paddd                m11,                 m13
7834
+    paddd                m2,                  m10
7835
+    paddd                m3,                  m11
7836
+
7837
+%ifidn %1, sp
7838
+    paddd                m0,                  m19
7839
+    paddd                m1,                  m19
7840
+    paddd                m2,                  m19
7841
+    paddd                m3,                  m19
7842
+
7843
+    psrad                m0,                  INTERP_SHIFT_SP
7844
+    psrad                m1,                  INTERP_SHIFT_SP
7845
+    psrad                m2,                  INTERP_SHIFT_SP
7846
+    psrad                m3,                  INTERP_SHIFT_SP
7847
+
7848
+    packssdw             m0,                  m1
7849
+    packssdw             m2,                  m3
7850
+    CLIPW2               m0,                  m2,                   m20,                 m21
7851
+%else
7852
+    psrad                m0,                  6
7853
+    psrad                m1,                  6
7854
+    psrad                m2,                  6
7855
+    psrad                m3,                  6
7856
+
7857
+    packssdw             m0,                  m1
7858
+    packssdw             m2,                  m3
7859
+%endif
7860
; store left-16 rows 4-7 at dst + 4 rows
+    lea                  r9,                  [r2 + 4 * r3]
7861
+    movu                 [r9],                ym0
7862
+    movu                 [r9 + r3],           ym2
7863
+    vextracti32x8        [r9 + 2 * r3],       m0,                1
7864
+    vextracti32x8        [r9 + r5],           m2,                1
7865
+
7866
; right 8 columns (offset mmsize/2), rows 0-7: four rows per zmm via xmm inserts
+    movu                 xm1,                 [r0 + mmsize/2]
7867
+    vinserti32x4         m1,                  [r0 + 2 * r1 + mmsize/2],          1
7868
+    vinserti32x4         m1,                  [r0 + 4 * r1 + mmsize/2],          2
7869
+    vinserti32x4         m1,                  [r6 + 2 * r1 + mmsize/2],          3
7870
+    movu                 xm3,                 [r0 + r1 + mmsize/2]
7871
+    vinserti32x4         m3,                  [r0 + r7 + mmsize/2],              1
7872
+    vinserti32x4         m3,                  [r6 + r1 + mmsize/2],              2
7873
+    vinserti32x4         m3,                  [r6 + r7 + mmsize/2],              3
7874
+    punpcklwd            m0,                  m1,                     m3
7875
+    pmaddwd              m0,                  m15
7876
+    punpckhwd            m1,                  m3
7877
+    pmaddwd              m1,                  m15
7878
+
7879
+    movu                 xm4,                 [r0 + 2 * r1 + mmsize/2]
7880
+    vinserti32x4         m4,                  [r0 + 4 * r1 + mmsize/2],          1
7881
+    vinserti32x4         m4,                  [r6 + 2 * r1 + mmsize/2],          2
7882
+    vinserti32x4         m4,                  [r6 + 4 * r1 + mmsize/2],          3
7883
+    punpcklwd            m2,                  m3,                     m4
7884
+    pmaddwd              m2,                  m15
7885
+    punpckhwd            m3,                  m4
7886
+    pmaddwd              m3,                  m15
7887
+
7888
+    movu                 xm5,                 [r0 + r7 + mmsize/2]
7889
+    vinserti32x4         m5,                  [r6 + r1 + mmsize/2],              1
7890
+    vinserti32x4         m5,                  [r6 + r7 + mmsize/2],              2
7891
+    vinserti32x4         m5,                  [r4 + r1 + mmsize/2],              3
7892
+    punpcklwd            m6,                  m4,                     m5
7893
+    pmaddwd              m6,                  m16
7894
+    punpckhwd            m4,                  m5
7895
+    pmaddwd              m4,                  m16
7896
+
7897
+    paddd                m0,                  m6
7898
+    paddd                m1,                  m4
7899
+
7900
+    movu                 xm4,                 [r0 + 4 * r1 + mmsize/2]
7901
+    vinserti32x4         m4,                  [r6 + 2 * r1 + mmsize/2],              1
7902
+    vinserti32x4         m4,                  [r6 + 4 * r1 + mmsize/2],              2
7903
+    vinserti32x4         m4,                  [r4 + 2 * r1 + mmsize/2],              3
7904
+    punpcklwd            m6,                  m5,                     m4
7905
+    pmaddwd              m6,                  m16
7906
+    punpckhwd            m5,                  m4
7907
+    pmaddwd              m5,                  m16
7908
+
7909
+    paddd                m2,                  m6
7910
+    paddd                m3,                  m5
7911
+
7912
+    movu                 xm11,                [r6 + r1 + mmsize/2]
7913
+    vinserti32x4         m11,                 [r6 + r7 + mmsize/2],              1
7914
+    vinserti32x4         m11,                 [r4 + r1 + mmsize/2],              2
7915
+    vinserti32x4         m11,                 [r4 + r7 + mmsize/2],              3
7916
+    punpcklwd            m8,                  m4,                     m11
7917
+    pmaddwd              m8,                  m17
7918
+    punpckhwd            m4,                  m11
7919
+    pmaddwd              m4,                  m17
7920
+
7921
+    movu                 xm12,                [r6 + 2 * r1 + mmsize/2]
7922
+    vinserti32x4         m12,                 [r6 + 4 * r1 + mmsize/2],          1
7923
+    vinserti32x4         m12,                 [r4 + 2 * r1 + mmsize/2],          2
7924
+    vinserti32x4         m12,                 [r4 + 4 * r1 + mmsize/2],          3
7925
+    punpcklwd            m10,                 m11,                    m12
7926
+    pmaddwd              m10,                 m17
7927
+    punpckhwd            m11,                 m12
7928
+    pmaddwd              m11,                 m17
7929
+
7930
+    movu                 xm13,                [r6 + r7 + mmsize/2]
7931
+    vinserti32x4         m13,                 [r4 + r1 + mmsize/2],              1
7932
+    vinserti32x4         m13,                 [r4 + r7 + mmsize/2],              2
7933
+    vinserti32x4         m13,                 [r8 + r1 + mmsize/2],              3
7934
+    punpcklwd            m14,                 m12,                    m13
7935
+    pmaddwd              m14,                 m18
7936
+    punpckhwd            m12,                 m13
7937
+    pmaddwd              m12,                 m18
7938
+
7939
+    paddd                m8,                  m14
7940
+    paddd                m4,                  m12
7941
+    paddd                m0,                  m8
7942
+    paddd                m1,                  m4
7943
+
7944
+    movu                 xm12,                [r6 + 4 * r1 + mmsize/2]
7945
+    vinserti32x4         m12,                 [r4 + 2 * r1 + mmsize/2],          1
7946
+    vinserti32x4         m12,                 [r4 + 4 * r1 + mmsize/2],          2
7947
+    vinserti32x4         m12,                 [r8 + 2 * r1 + mmsize/2],          3
7948
+    punpcklwd            m14,                 m13,                    m12
7949
+    pmaddwd              m14,                 m18
7950
+    punpckhwd            m13,                 m12
7951
+    pmaddwd              m13,                 m18
7952
+
7953
+    paddd                m10,                 m14
7954
+    paddd                m11,                 m13
7955
+    paddd                m2,                  m10
7956
+    paddd                m3,                  m11
7957
+
7958
+%ifidn %1, sp
7959
+    paddd                m0,                  m19
7960
+    paddd                m1,                  m19
7961
+    paddd                m2,                  m19
7962
+    paddd                m3,                  m19
7963
+
7964
+    psrad                m0,                  INTERP_SHIFT_SP
7965
+    psrad                m1,                  INTERP_SHIFT_SP
7966
+    psrad                m2,                  INTERP_SHIFT_SP
7967
+    psrad                m3,                  INTERP_SHIFT_SP
7968
+
7969
+    packssdw             m0,                  m1
7970
+    packssdw             m2,                  m3
7971
+    CLIPW2               m0,                  m2,                   m20,                 m21
7972
+%else
7973
+    psrad                m0,                  6
7974
+    psrad                m1,                  6
7975
+    psrad                m2,                  6
7976
+    psrad                m3,                  6
7977
+
7978
+    packssdw             m0,                  m1
7979
+    packssdw             m2,                  m3
7980
+%endif
7981
+
7982
; store right-8 rows 0-7; note r2 itself advances 4 rows here (caller adds 4 more)
+    movu                 [r2 + mmsize/2],                xm0
7983
+    movu                 [r2 + r3 + mmsize/2],           xm2
7984
+    vextracti32x4        [r2 + 2 * r3 + mmsize/2],       m0,                  1
7985
+    vextracti32x4        [r2 + r5 + mmsize/2],           m2,                  1
7986
+    lea                  r2,                             [r2 + 4 * r3]
7987
+    vextracti32x4        [r2 + mmsize/2],                m0,                  2
7988
+    vextracti32x4        [r2 + r3 + mmsize/2],           m2,                  2
7989
+    vextracti32x4        [r2 + 2 * r3 + mmsize/2],       m0,                  3
7990
+    vextracti32x4        [r2 + r5 + mmsize/2],           m2,                  3
7991
+%endmacro
7992
+;-----------------------------------------------------------------------------------------------------------------
7993
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
7994
+;-----------------------------------------------------------------------------------------------------------------
7995
; Driver macro: vertical 8-tap luma interpolation for the fixed 24x32 block.
; %1 = 'ss' or 'sp'. Emits four 24x8 tiles (3 in the %rep, plus one final).
+%macro FILTER_VER_S_LUMA_24x32_AVX512 1
7997
+INIT_ZMM avx512
7998
+cglobal interp_8tap_vert_%1_24x32, 5, 10, 22
7999
+    add                   r1d,                r1d
8000
+    add                   r3d,                r3d
8001
+    lea                   r7,                 [3 * r1]
8002
+    sub                   r0,                 r7
8003
+    shl                   r4d,                8
8004
+%ifdef PIC
8005
+    lea                   r5,                 [tab_LumaCoeffVer_avx512]
8006
+    mova                  m15,                [r5 + r4]
8007
+    mova                  m16,                [r5 + r4 + 1 * mmsize]
8008
+    mova                  m17,                [r5 + r4 + 2 * mmsize]
8009
+    mova                  m18,                [r5 + r4 + 3 * mmsize]
8010
+%else
8011
+    lea                   r5,                 [tab_LumaCoeffVer_avx512 + r4]
8012
+    mova                  m15,                [r5]
8013
+    mova                  m16,                [r5 + 1 * mmsize]
8014
+    mova                  m17,                [r5 + 2 * mmsize]
8015
+    mova                  m18,                [r5 + 3 * mmsize]
8016
+%endif
8017
+%ifidn %1, sp
8018
+    vbroadcasti32x4       m19,                [INTERP_OFFSET_SP]
8019
+    pxor                  m20,                m20
8020
+    vbroadcasti32x8       m21,                [pw_pixel_max]
8021
+%endif
8022
+    lea                   r5,                 [3 * r3]
8023
+
8024
+%rep 3
8025
+    PROCESS_LUMA_VERT_S_24x8_AVX512 %1
8026
; r4 = src + 8 rows (left by the tile macro); r2 already advanced 4 rows inside
; the tile, so the extra 4*r3 here totals the 8-row step
+    lea                   r0,                 [r4]
8027
+    lea                   r2,                 [r2 + 4 * r3]
8028
+%endrep
8029
+    PROCESS_LUMA_VERT_S_24x8_AVX512 %1
8030
+    RET
8031
+%endmacro
8032
+
8033
; instantiate ss/sp variants; zmm16+ registers require 64-bit mode
+%if ARCH_X86_64
8034
+    FILTER_VER_S_LUMA_24x32_AVX512 ss
8035
+    FILTER_VER_S_LUMA_24x32_AVX512 sp
8036
+%endif
8036
+
8037
; Core macro: one 32x2 output tile of the vertical 8-tap luma filter.
; %1 = 'ss' or 'sp'. A full zmm holds one 32-word source row, so each output
; row consumes 8 consecutive source rows; two output rows share 9 loads.
; Clobbers r6 (src + 4 rows); does not advance r0/r2 (caller steps 2 rows).
+%macro PROCESS_LUMA_VERT_S_32x2_AVX512 1
8038
+    movu                 m1,                  [r0]                           ;0 row
8039
+    movu                 m3,                  [r0 + r1]                      ;1 row
8040
; taps 0/1 for output rows 0 (m0/m1) and 1 (m2/m3)
+    punpcklwd            m0,                  m1,                     m3
8041
+    pmaddwd              m0,                  m15
8042
+    punpckhwd            m1,                  m3
8043
+    pmaddwd              m1,                  m15
8044
+
8045
+    movu                 m4,                  [r0 + 2 * r1]                  ;2 row
8046
+    punpcklwd            m2,                  m3,                     m4
8047
+    pmaddwd              m2,                  m15
8048
+    punpckhwd            m3,                  m4
8049
+    pmaddwd              m3,                  m15
8050
+
8051
+    movu                 m5,                  [r0 + r7]                      ;3 row
8052
+    punpcklwd            m6,                  m4,                     m5
8053
+    pmaddwd              m6,                  m16
8054
+    punpckhwd            m4,                  m5
8055
+    pmaddwd              m4,                  m16
8056
+
8057
+    paddd                m0,                  m6
8058
+    paddd                m1,                  m4
8059
+
8060
+    movu                 m4,                  [r0 + 4 * r1]                  ;4 row
8061
+    punpcklwd            m6,                  m5,                     m4
8062
+    pmaddwd              m6,                  m16
8063
+    punpckhwd            m5,                  m4
8064
+    pmaddwd              m5,                  m16
8065
+
8066
+    paddd                m2,                  m6
8067
+    paddd                m3,                  m5
8068
+
8069
+    lea                  r6,                  [r0 + 4 * r1]
8070
+
8071
+    movu                 m11,                 [r6 + r1]                      ;5 row
8072
+    punpcklwd            m8,                  m4,                     m11
8073
+    pmaddwd              m8,                  m17
8074
+    punpckhwd            m4,                  m11
8075
+    pmaddwd              m4,                  m17
8076
+
8077
+    movu                 m12,                 [r6 + 2 * r1]                  ;6 row
8078
+    punpcklwd            m10,                 m11,                    m12
8079
+    pmaddwd              m10,                 m17
8080
+    punpckhwd            m11,                 m12
8081
+    pmaddwd              m11,                 m17
8082
+
8083
+    movu                 m13,                 [r6 + r7]                      ;7 row
8084
+    punpcklwd            m14,                 m12,                    m13
8085
+    pmaddwd              m14,                 m18
8086
+    punpckhwd            m12,                 m13
8087
+    pmaddwd              m12,                 m18
8088
+
8089
+    paddd                m8,                  m14
8090
+    paddd                m4,                  m12
8091
+    paddd                m0,                  m8
8092
+    paddd                m1,                  m4
8093
+
8094
+    movu                 m12,                 [r6 + 4 * r1]                 ; 8 row
8095
+    punpcklwd            m14,                 m13,                    m12
8096
+    pmaddwd              m14,                 m18
8097
+    punpckhwd            m13,                 m12
8098
+    pmaddwd              m13,                 m18
8099
+
8100
+    paddd                m10,                 m14
8101
+    paddd                m11,                 m13
8102
+    paddd                m2,                  m10
8103
+    paddd                m3,                  m11
8104
+
8105
+%ifidn %1, sp
8106
; sp: round, shift, pack and clamp to [0, pw_pixel_max]
+    paddd                m0,                  m19
8107
+    paddd                m1,                  m19
8108
+    paddd                m2,                  m19
8109
+    paddd                m3,                  m19
8110
+
8111
+    psrad                m0,                  INTERP_SHIFT_SP
8112
+    psrad                m1,                  INTERP_SHIFT_SP
8113
+    psrad                m2,                  INTERP_SHIFT_SP
8114
+    psrad                m3,                  INTERP_SHIFT_SP
8115
+
8116
+    packssdw             m0,                  m1
8117
+    packssdw             m2,                  m3
8118
+    CLIPW2               m0,                  m2,                   m20,                 m21
8119
+%else
8120
; ss: plain >>6 and saturating pack
+    psrad                m0,                  6
8121
+    psrad                m1,                  6
8122
+    psrad                m2,                  6
8123
+    psrad                m3,                  6
8124
+
8125
+    packssdw             m0,                  m1
8126
+    packssdw             m2,                  m3
8127
+%endif
8128
+
8129
; two full 32-word output rows
+    movu                 [r2],                m0
8130
+    movu                 [r2 + r3],           m2
8131
+%endmacro
8132
+;-----------------------------------------------------------------------------------------------------------------
8133
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
8134
+;-----------------------------------------------------------------------------------------------------------------
8135
+%macro FILTER_VER_S_LUMA_32xN_AVX512 2
8136
+INIT_ZMM avx512
8137
+cglobal interp_8tap_vert_%1_32x%2, 5, 8, 22
8138
+    add                   r1d,                r1d
8139
+    add                   r3d,                r3d
8140
+    lea                   r7,                 [3 * r1]
8141
+    sub                   r0,                 r7
8142
+    shl                   r4d,                8
8143
+%ifdef PIC
8144
+    lea                   r5,                 [tab_LumaCoeffVer_avx512]
8145
+    mova                  m15,                [r5 + r4]
8146
+    mova                  m16,                [r5 + r4 + 1 * mmsize]
8147
+    mova                  m17,                [r5 + r4 + 2 * mmsize]
8148
+    mova                  m18,                [r5 + r4 + 3 * mmsize]
8149
+%else
8150
+    lea                   r5,                 [tab_LumaCoeffVer_avx512 + r4]
8151
+    mova                  m15,                [r5]
8152
+    mova                  m16,                [r5 + 1 * mmsize]
8153
+    mova                  m17,                [r5 + 2 * mmsize]
8154
+    mova                  m18,                [r5 + 3 * mmsize]
8155
+%endif
8156
+%ifidn %1, sp
8157
+    vbroadcasti32x4       m19,                [INTERP_OFFSET_SP]
8158
+    pxor                  m20,                m20
8159
+    vbroadcasti32x8       m21,                [pw_pixel_max]
8160
+%endif
8161
+
8162
+%rep %2/2 - 1
8163
+    PROCESS_LUMA_VERT_S_32x2_AVX512 %1
8164
+    lea                   r0,                 [r0 + 2 * r1]
8165
+    lea                   r2,                 [r2 + 2 * r3]
8166
+%endrep
8167
+    PROCESS_LUMA_VERT_S_32x2_AVX512 %1
8168
+    RET
8169
+%endmacro

; Instantiate the 32-wide ss/sp kernels for every HEVC luma block height.
; 64-bit only: the kernels use r7 and xmm15-xmm21.
%if ARCH_X86_64
    FILTER_VER_S_LUMA_32xN_AVX512 ss, 8
    FILTER_VER_S_LUMA_32xN_AVX512 ss, 16
    FILTER_VER_S_LUMA_32xN_AVX512 ss, 32
    FILTER_VER_S_LUMA_32xN_AVX512 ss, 24
    FILTER_VER_S_LUMA_32xN_AVX512 ss, 64
    FILTER_VER_S_LUMA_32xN_AVX512 sp, 8
    FILTER_VER_S_LUMA_32xN_AVX512 sp, 16
    FILTER_VER_S_LUMA_32xN_AVX512 sp, 32
    FILTER_VER_S_LUMA_32xN_AVX512 sp, 24
    FILTER_VER_S_LUMA_32xN_AVX512 sp, 64
%endif

; Filter four rows of a 48-wide block.
; Layout: the first 32 columns are full zmm rows — rows 0-1 via
; PROCESS_LUMA_VERT_S_32x2_AVX512, rows 2-3 inline below; the remaining 16
; columns (offset +mmsize... NOTE(review): mmsize here is 64 bytes = 32 shorts)
; are processed as ymm halves with two output rows packed per zmm.
; Preconditions: r7 = 3*r1; r5 = 3*r3 (set by the caller);
; r6 is presumably set to r0 + 4*r1 inside PROCESS_LUMA_VERT_S_32x2_AVX512
; (its definition is above this hunk) — TODO confirm.  r4 is clobbered.
%macro PROCESS_LUMA_VERT_S_48x4_AVX512 1
    PROCESS_LUMA_VERT_S_32x2_AVX512 %1
    ; --- columns 0-31, output rows 2 and 3 ---
    ; m15/m16/m17/m18 hold coefficient pairs for taps (0,1)/(2,3)/(4,5)/(6,7);
    ; punpcklwd/punpckhwd interleave adjacent input rows so pmaddwd computes
    ; c[i]*row[n] + c[i+1]*row[n+1] per 32-bit lane
    movu                 m1,                  [r0 + 2 * r1]
    movu                 m3,                  [r0 + r7]
    punpcklwd            m0,                  m1,                     m3
    pmaddwd              m0,                  m15
    punpckhwd            m1,                  m3
    pmaddwd              m1,                  m15

    movu                 m4,                  [r0 + 4 * r1]
    punpcklwd            m2,                  m3,                     m4
    pmaddwd              m2,                  m15
    punpckhwd            m3,                  m4
    pmaddwd              m3,                  m15

    movu                 m5,                  [r6 + r1]
    punpcklwd            m6,                  m4,                     m5
    pmaddwd              m6,                  m16
    punpckhwd            m4,                  m5
    pmaddwd              m4,                  m16

    paddd                m0,                  m6
    paddd                m1,                  m4

    lea                  r4,                  [r6 + 4 * r1]

    movu                 m4,                  [r6 + 2 * r1]
    punpcklwd            m6,                  m5,                     m4
    pmaddwd              m6,                  m16
    punpckhwd            m5,                  m4
    pmaddwd              m5,                  m16

    paddd                m2,                  m6
    paddd                m3,                  m5

    movu                 m11,                 [r6 + r7]
    punpcklwd            m8,                  m4,                     m11
    pmaddwd              m8,                  m17
    punpckhwd            m4,                  m11
    pmaddwd              m4,                  m17

    movu                 m12,                 [r4]
    punpcklwd            m10,                 m11,                    m12
    pmaddwd              m10,                 m17
    punpckhwd            m11,                 m12
    pmaddwd              m11,                 m17

    movu                 m13,                 [r4 + r1]
    punpcklwd            m14,                 m12,                    m13
    pmaddwd              m14,                 m18
    punpckhwd            m12,                 m13
    pmaddwd              m12,                 m18

    paddd                m8,                  m14
    paddd                m4,                  m12
    paddd                m0,                  m8
    paddd                m1,                  m4

    movu                 m12,                 [r4 + 2 * r1]
    punpcklwd            m14,                 m13,                    m12
    pmaddwd              m14,                 m18
    punpckhwd            m13,                 m12
    pmaddwd              m13,                 m18

    paddd                m10,                 m14
    paddd                m11,                 m13
    paddd                m2,                  m10
    paddd                m3,                  m11

%ifidn %1, sp
    ; sp: round, shift, pack to 16-bit and clip to [0, pw_pixel_max]
    paddd                m0,                  m19
    paddd                m1,                  m19
    paddd                m2,                  m19
    paddd                m3,                  m19

    psrad                m0,                  INTERP_SHIFT_SP
    psrad                m1,                  INTERP_SHIFT_SP
    psrad                m2,                  INTERP_SHIFT_SP
    psrad                m3,                  INTERP_SHIFT_SP

    packssdw             m0,                  m1
    packssdw             m2,                  m3
    CLIPW2               m0,                  m2,                   m20,                 m21
%else
    ; ss: plain downshift by 6, keep signed 16-bit intermediates
    psrad                m0,                  6
    psrad                m1,                  6
    psrad                m2,                  6
    psrad                m3,                  6

    packssdw             m0,                  m1
    packssdw             m2,                  m3
%endif

    movu                 [r2 + 2 * r3],       m0
    movu                 [r2 + r5],           m2

    ; --- columns 32-47, all four output rows (low ymm half = earlier row,
    ;     high ymm half = row+2 via vinserti32x8) ---
    movu                 ym1,                 [r0 + mmsize]
    movu                 ym3,                 [r0 + r1 + mmsize]
    vinserti32x8         m1,                  [r0 + 2 * r1 + mmsize], 1
    vinserti32x8         m3,                  [r0 + r7 + mmsize],     1
    punpcklwd            m0,                  m1,                     m3
    pmaddwd              m0,                  m15
    punpckhwd            m1,                  m3
    pmaddwd              m1,                  m15

    movu                 ym4,                 [r0 + 2 * r1 + mmsize]
    vinserti32x8         m4,                  [r6 + mmsize],          1
    punpcklwd            m2,                  m3,                     m4
    pmaddwd              m2,                  m15
    punpckhwd            m3,                  m4
    pmaddwd              m3,                  m15

    movu                 ym5,                 [r0 + r7 + mmsize]
    vinserti32x8         m5,                  [r6 + r1 + mmsize],     1
    punpcklwd            m6,                  m4,                     m5
    pmaddwd              m6,                  m16
    punpckhwd            m4,                  m5
    pmaddwd              m4,                  m16

    paddd                m0,                  m6
    paddd                m1,                  m4

    movu                 ym4,                 [r6 + mmsize]
    vinserti32x8         m4,                  [r6 + 2 * r1 + mmsize], 1
    punpcklwd            m6,                  m5,                     m4
    pmaddwd              m6,                  m16
    punpckhwd            m5,                  m4
    pmaddwd              m5,                  m16

    paddd                m2,                  m6
    paddd                m3,                  m5

    movu                 ym11,                [r6 + r1 + mmsize]
    vinserti32x8         m11,                 [r6 + r7 + mmsize],     1
    punpcklwd            m8,                  m4,                     m11
    pmaddwd              m8,                  m17
    punpckhwd            m4,                  m11
    pmaddwd              m4,                  m17

    movu                 ym12,                [r6 + 2 * r1 + mmsize]
    vinserti32x8         m12,                 [r6 + 4 * r1 + mmsize], 1
    punpcklwd            m10,                 m11,                    m12
    pmaddwd              m10,                 m17
    punpckhwd            m11,                 m12
    pmaddwd             m11,                 m12
    pmaddwd              m11,                 m17

    movu                 ym13,                [r6 + r7 + mmsize]
    vinserti32x8         m13,                 [r4 + r1 + mmsize],     1
    punpcklwd            m14,                 m12,                    m13
    pmaddwd              m14,                 m18
    punpckhwd            m12,                 m13
    pmaddwd              m12,                 m18

    paddd                m8,                  m14
    paddd                m4,                  m12
    paddd                m0,                  m8
    paddd                m1,                  m4

    movu                 ym12,                [r6 + 4 * r1 + mmsize]
    vinserti32x8         m12,                 [r4 + 2 * r1 + mmsize], 1
    punpcklwd            m14,                 m13,                    m12
    pmaddwd              m14,                 m18
    punpckhwd            m13,                 m12
    pmaddwd              m13,                 m18

    paddd                m10,                 m14
    paddd                m11,                 m13
    paddd                m2,                  m10
    paddd                m3,                  m11

%ifidn %1, sp
    paddd                m0,                  m19
    paddd                m1,                  m19
    paddd                m2,                  m19
    paddd                m3,                  m19

    psrad                m0,                  INTERP_SHIFT_SP
    psrad                m1,                  INTERP_SHIFT_SP
    psrad                m2,                  INTERP_SHIFT_SP
    psrad                m3,                  INTERP_SHIFT_SP

    packssdw             m0,                  m1
    packssdw             m2,                  m3
    CLIPW2               m0,                  m2,                   m20,                 m21
%else
    psrad                m0,                  6
    psrad                m1,                  6
    psrad                m2,                  6
    psrad                m3,                  6

    packssdw             m0,                  m1
    packssdw             m2,                  m3
%endif

    ; store the 16-wide tail: low halves are rows 0/1, high halves rows 2/3
    movu                 [r2 + mmsize],                ym0
    movu                 [r2 + r3 + mmsize],           ym2
    vextracti32x8        [r2 + 2 * r3 + mmsize],       m0,                1
    vextracti32x8        [r2 + r5 + mmsize],           m2,                1
%endmacro
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
; AVX-512 8-tap vertical luma filter entry point for the 48x64 block,
; ss and sp variants; four rows per iteration of PROCESS_LUMA_VERT_S_48x4_AVX512.
; (The original banner said "interp_4tap_vert" — copy/paste error; this is an 8-tap kernel.)
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_S_LUMA_48x64_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_vert_%1_48x64, 5, 8, 22
    ; int16_t strides -> byte strides
    add                   r1d,                r1d
    add                   r3d,                r3d
    ; back src up 3 rows (8-tap window spans rows -3..+4)
    lea                   r7,                 [3 * r1]
    sub                   r0,                 r7
    ; coeffIdx * 256 indexes the four-zmm coefficient group
    shl                   r4d,                8
%ifdef PIC
    lea                   r5,                 [tab_LumaCoeffVer_avx512]
    mova                  m15,                [r5 + r4]
    mova                  m16,                [r5 + r4 + 1 * mmsize]
    mova                  m17,                [r5 + r4 + 2 * mmsize]
    mova                  m18,                [r5 + r4 + 3 * mmsize]
%else
    lea                   r5,                 [tab_LumaCoeffVer_avx512 + r4]
    mova                  m15,                [r5]
    mova                  m16,                [r5 + 1 * mmsize]
    mova                  m17,                [r5 + 2 * mmsize]
    mova                  m18,                [r5 + 3 * mmsize]
%endif
%ifidn %1, sp
    vbroadcasti32x4       m19,                [INTERP_OFFSET_SP]
    pxor                  m20,                m20
    vbroadcasti32x8       m21,                [pw_pixel_max]
%endif

    ; r5 is done serving as the table pointer; reuse it as 3*dstStride,
    ; which PROCESS_LUMA_VERT_S_48x4_AVX512 expects
    lea                   r5,                 [3 * r3]
    ; 16 x 4 rows = 64; last iteration skips the pointer advance
%rep 15
    PROCESS_LUMA_VERT_S_48x4_AVX512 %1
    lea                   r0,                 [r0 + 4 * r1]
    lea                   r2,                 [r2 + 4 * r3]
%endrep
    PROCESS_LUMA_VERT_S_48x4_AVX512 %1
    RET
%endmacro

; Instantiate the 48x64 ss/sp kernels (64-bit only: r7 and xmm15+ are used).
%if ARCH_X86_64
    FILTER_VER_S_LUMA_48x64_AVX512 ss
    FILTER_VER_S_LUMA_48x64_AVX512 sp
%endif

; Filter two rows of a 64-wide block: columns 0-31 as one zmm pass, columns
; 32-63 (offset +mmsize) as a second identical pass.  Eight input rows feed
; each output row; coefficient pairs live in m15..m18 (taps 0-1/2-3/4-5/6-7).
; Preconditions: r7 = 3*r1.  Sets r6 = r0 + 4*r1 (also used by callers of the
; 48x4 macro).  %1 selects ss (shift by 6) or sp (round, shift, clip).
%macro PROCESS_LUMA_VERT_S_64x2_AVX512 1
    movu                 m1,                  [r0]                           ;0 row
    movu                 m3,                  [r0 + r1]                      ;1 row
    punpcklwd            m0,                  m1,                     m3
    pmaddwd              m0,                  m15
    punpckhwd            m1,                  m3
    pmaddwd              m1,                  m15

    movu                 m4,                  [r0 + 2 * r1]                  ;2 row
    punpcklwd            m2,                  m3,                     m4
    pmaddwd              m2,                  m15
    punpckhwd            m3,                  m4
    pmaddwd              m3,                  m15

    movu                 m5,                  [r0 + r7]                      ;3 row
    punpcklwd            m6,                  m4,                     m5
    pmaddwd              m6,                  m16
    punpckhwd            m4,                  m5
    pmaddwd              m4,                  m16

    paddd                m0,                  m6
    paddd                m1,                  m4

    movu                 m4,                  [r0 + 4 * r1]                  ;4 row
    punpcklwd            m6,                  m5,                     m4
    pmaddwd              m6,                  m16
    punpckhwd            m5,                  m4
    pmaddwd              m5,                  m16

    paddd                m2,                  m6
    paddd                m3,                  m5

    lea                  r6,                  [r0 + 4 * r1]

    movu                 m11,                 [r6 + r1]                      ;5 row
    punpcklwd            m8,                  m4,                     m11
    pmaddwd              m8,                  m17
    punpckhwd            m4,                  m11
    pmaddwd              m4,                  m17

    movu                 m12,                 [r6 + 2 * r1]                  ;6 row
    punpcklwd            m10,                 m11,                    m12
    pmaddwd              m10,                 m17
    punpckhwd            m11,                 m12
    pmaddwd              m11,                 m17

    movu                 m13,                 [r6 + r7]                      ;7 row
    punpcklwd            m14,                 m12,                    m13
    pmaddwd              m14,                 m18
    punpckhwd            m12,                 m13
    pmaddwd              m12,                 m18

    paddd                m8,                  m14
    paddd                m4,                  m12
    paddd                m0,                  m8
    paddd                m1,                  m4

    movu                 m12,                 [r6 + 4 * r1]                 ; 8 row
    punpcklwd            m14,                 m13,                    m12
    pmaddwd              m14,                 m18
    punpckhwd            m13,                 m12
    pmaddwd              m13,                 m18

    paddd                m10,                 m14
    paddd                m11,                 m13
    paddd                m2,                  m10
    paddd                m3,                  m11

%ifidn %1, sp
    ; sp: round, shift, pack and clip to [0, pw_pixel_max]
    paddd                m0,                  m19
    paddd                m1,                  m19
    paddd                m2,                  m19
    paddd                m3,                  m19

    psrad                m0,                  INTERP_SHIFT_SP
    psrad                m1,                  INTERP_SHIFT_SP
    psrad                m2,                  INTERP_SHIFT_SP
    psrad                m3,                  INTERP_SHIFT_SP

    packssdw             m0,                  m1
    packssdw             m2,                  m3
    CLIPW2               m0,                  m2,                   m20,                 m21
%else
    ; ss: plain downshift by 6
    psrad                m0,                  6
    psrad                m1,                  6
    psrad                m2,                  6
    psrad                m3,                  6

    packssdw             m0,                  m1
    packssdw             m2,                  m3
%endif

    movu                 [r2],                m0
    movu                 [r2 + r3],           m2

    ; --- columns 32-63: same eight-row convolution at +mmsize bytes ---
    movu                 m1,                  [r0 + mmsize]                  ;0 row
    movu                 m3,                  [r0 + r1 + mmsize]             ;1 row
    punpcklwd            m0,                  m1,                     m3
    pmaddwd              m0,                  m15
    punpckhwd            m1,                  m3
    pmaddwd              m1,                  m15

    movu                 m4,                  [r0 + 2 * r1 + mmsize]         ;2 row
    punpcklwd            m2,                  m3,                     m4
    pmaddwd              m2,                  m15
    punpckhwd            m3,                  m4
    pmaddwd              m3,                  m15

    movu                 m5,                  [r0 + r7 + mmsize]             ;3 row
    punpcklwd            m6,                  m4,                     m5
    pmaddwd              m6,                  m16
    punpckhwd            m4,                  m5
    pmaddwd              m4,                  m16

    paddd                m0,                  m6
    paddd                m1,                  m4

    movu                 m4,                  [r0 + 4 * r1 + mmsize]         ;4 row
    punpcklwd            m6,                  m5,                     m4
    pmaddwd              m6,                  m16
    punpckhwd            m5,                  m4
    pmaddwd              m5,                  m16

    paddd                m2,                  m6
    paddd                m3,                  m5

    movu                 m11,                 [r6 + r1 + mmsize]             ;5 row
    punpcklwd            m8,                  m4,                     m11
    pmaddwd              m8,                  m17
    punpckhwd            m4,                  m11
    pmaddwd              m4,                  m17

    movu                 m12,                 [r6 + 2 * r1 + mmsize]         ;6 row
    punpcklwd            m10,                 m11,                    m12
    pmaddwd              m10,                 m17
    punpckhwd            m11,                 m12
    pmaddwd              m11,                 m17

    movu                 m13,                 [r6 + r7 + mmsize]             ;7 row
    punpcklwd            m14,                 m12,                    m13
    pmaddwd              m14,                 m18
    punpckhwd            m12,                 m13
    pmaddwd              m12,                 m18

    paddd                m8,                  m14
    paddd                m4,                  m12
    paddd                m0,                  m8
    paddd                m1,                  m4

    movu                 m12,                 [r6 + 4 * r1 + mmsize]         ; 8 row
    punpcklwd            m14,                 m13,                    m12
    pmaddwd              m14,                 m18
    punpckhwd            m13,                 m12
    pmaddwd              m13,                 m18

    paddd                m10,                 m14
    paddd                m11,                 m13
    paddd                m2,                  m10
    paddd                m3,                  m11

%ifidn %1, sp
    paddd                m0,                  m19
    paddd                m1,                  m19
    paddd                m2,                  m19
    paddd                m3,                  m19

    psrad                m0,                  INTERP_SHIFT_SP
    psrad                m1,                  INTERP_SHIFT_SP
    psrad                m2,                  INTERP_SHIFT_SP
    psrad                m3,                  INTERP_SHIFT_SP

    packssdw             m0,                  m1
    packssdw             m2,                  m3
    CLIPW2               m0,                  m2,                   m20,                 m21
%else
    psrad                m0,                  6
    psrad                m1,                  6
    psrad                m2,                  6
    psrad                m3,                  6

    packssdw             m0,                  m1
    packssdw             m2,                  m3
%endif

    movu                 [r2 + mmsize],       m0
    movu                 [r2 + r3 + mmsize],  m2
%endmacro
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
; AVX-512 8-tap vertical luma filter entry points for 64x%2 blocks, ss/sp
; variants; two rows per iteration of PROCESS_LUMA_VERT_S_64x2_AVX512.
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_S_LUMA_64xN_AVX512 2
INIT_ZMM avx512
cglobal interp_8tap_vert_%1_64x%2, 5, 8, 22
    ; int16_t strides -> byte strides
    add                   r1d,                r1d
    add                   r3d,                r3d
    ; back src up 3 rows (8-tap window spans rows -3..+4)
    lea                   r7,                 [3 * r1]
    sub                   r0,                 r7
    ; coeffIdx * 256 indexes the four-zmm coefficient group
    shl                   r4d,                8
%ifdef PIC
    lea                   r5,                 [tab_LumaCoeffVer_avx512]
    mova                  m15,                [r5 + r4]
    mova                  m16,                [r5 + r4 + 1 * mmsize]
    mova                  m17,                [r5 + r4 + 2 * mmsize]
    mova                  m18,                [r5 + r4 + 3 * mmsize]
%else
    lea                   r5,                 [tab_LumaCoeffVer_avx512 + r4]
    mova                  m15,                [r5]
    mova                  m16,                [r5 + 1 * mmsize]
    mova                  m17,                [r5 + 2 * mmsize]
    mova                  m18,                [r5 + 3 * mmsize]
%endif
%ifidn %1, sp
    ; sp only: rounding offset, clip-low zero, clip-high pixel max
    vbroadcasti32x4       m19,                [INTERP_OFFSET_SP]
    pxor                  m20,                m20
    vbroadcasti32x8       m21,                [pw_pixel_max]
%endif

    ; %2/2 two-row iterations; last one falls through without advancing
%rep %2/2 - 1
    PROCESS_LUMA_VERT_S_64x2_AVX512 %1
    lea                   r0,                 [r0 + 2 * r1]
    lea                   r2,                 [r2 + 2 * r3]
%endrep
    PROCESS_LUMA_VERT_S_64x2_AVX512 %1
    RET
%endmacro

; Instantiate the 64-wide ss/sp kernels for every HEVC luma height (64-bit only).
%if ARCH_X86_64
    FILTER_VER_S_LUMA_64xN_AVX512 ss, 16
    FILTER_VER_S_LUMA_64xN_AVX512 ss, 32
    FILTER_VER_S_LUMA_64xN_AVX512 ss, 48
    FILTER_VER_S_LUMA_64xN_AVX512 ss, 64
    FILTER_VER_S_LUMA_64xN_AVX512 sp, 16
    FILTER_VER_S_LUMA_64xN_AVX512 sp, 32
    FILTER_VER_S_LUMA_64xN_AVX512 sp, 48
    FILTER_VER_S_LUMA_64xN_AVX512 sp, 64
%endif
;-------------------------------------------------------------------------------------------------------------
;avx512 luma_vss and luma_vsp code end
;-------------------------------------------------------------------------------------------------------------
;-------------------------------------------------------------------------------------------------------------
;avx512 luma_vpp and luma_vps code start
;-------------------------------------------------------------------------------------------------------------
; Filter four rows of a 16-wide block for the pp/ps (pel-input) paths.
; Two output rows are packed per zmm: the low ymm half holds one row, the
; high half (vinserti32x8 ..., 1) the row two below it, so the eight-row
; convolution produces rows 0/2 in m0:m1 and rows 1/3 in m2:m3.
; Preconditions: r7 = 3*r1, r8 = 3*r3, coefficients in m15..m18; for pp,
; m19/m20/m21 = offset/zero/pixel-max; for ps, m19 = PS offset.
; Clobbers r4 and r5 (row pointers).
%macro PROCESS_LUMA_VERT_P_16x4_AVX512 1
    lea                  r5,                  [r0 + 4 * r1]
    movu                 ym1,                 [r0]
    movu                 ym3,                 [r0 + r1]
    vinserti32x8         m1,                  [r0 + 2 * r1],          1
    vinserti32x8         m3,                  [r0 + r7],              1
    punpcklwd            m0,                  m1,                     m3
    pmaddwd              m0,                  m15
    punpckhwd            m1,                  m3
    pmaddwd              m1,                  m15

    movu                 ym4,                 [r0 + 2 * r1]
    vinserti32x8         m4,                  [r0 + 4 * r1],          1
    punpcklwd            m2,                  m3,                     m4
    pmaddwd              m2,                  m15
    punpckhwd            m3,                  m4
    pmaddwd              m3,                  m15

    movu                 ym5,                 [r0 + r7]
    vinserti32x8         m5,                  [r5 + r1],              1
    punpcklwd            m6,                  m4,                     m5
    pmaddwd              m6,                  m16
    punpckhwd            m4,                  m5
    pmaddwd              m4,                  m16

    paddd                m0,                  m6
    paddd                m1,                  m4

    movu                 ym4,                 [r5]
    vinserti32x8         m4,                  [r5 + 2 * r1],          1
    punpcklwd            m6,                  m5,                     m4
    pmaddwd              m6,                  m16
    punpckhwd            m5,                  m4
    pmaddwd              m5,                  m16

    paddd                m2,                  m6
    paddd                m3,                  m5

    lea                  r4,                  [r5 + 4 * r1]
    movu                 ym11,                [r5 + r1]
    vinserti32x8         m11,                 [r5 + r7],              1
    punpcklwd            m8,                  m4,                     m11
    pmaddwd              m8,                  m17
    punpckhwd            m4,                  m11
    pmaddwd              m4,                  m17

    movu                 ym12,                [r5 + 2 * r1]
    vinserti32x8         m12,                 [r4],                   1
    punpcklwd            m10,                 m11,                    m12
    pmaddwd              m10,                 m17
    punpckhwd            m11,                 m12
    pmaddwd              m11,                 m17

    movu                 ym13,                [r5 + r7]
    vinserti32x8         m13,                 [r4 + r1],              1
    punpcklwd            m14,                 m12,                    m13
    pmaddwd              m14,                 m18
    punpckhwd            m12,                 m13
    pmaddwd              m12,                 m18

    paddd                m8,                  m14
    paddd                m4,                  m12
    paddd                m0,                  m8
    paddd                m1,                  m4

    movu                 ym12,                [r4]
    vinserti32x8         m12,                 [r4 + 2 * r1],          1
    punpcklwd            m14,                 m13,                    m12
    pmaddwd              m14,                 m18
    punpckhwd            m13,                 m12
    pmaddwd              m13,                 m18

    paddd                m10,                 m14
    paddd                m11,                 m13
    paddd                m2,                  m10
    paddd                m3,                  m11

    ; both pp and ps add their rounding offset (m19 differs per variant)
    paddd                m0,                  m19
    paddd                m1,                  m19
    paddd                m2,                  m19
    paddd                m3,                  m19

%ifidn %1, pp
    ; pp: shift and clip to [0, pw_pixel_max]
    psrad                m0,                  INTERP_SHIFT_PP
    psrad                m1,                  INTERP_SHIFT_PP
    psrad                m2,                  INTERP_SHIFT_PP
    psrad                m3,                  INTERP_SHIFT_PP

    packssdw             m0,                  m1
    packssdw             m2,                  m3
    CLIPW2               m0,                  m2,                   m20,                 m21
%else
    ; ps: shift only, keep signed 16-bit intermediates
    psrad                m0,                  INTERP_SHIFT_PS
    psrad                m1,                  INTERP_SHIFT_PS
    psrad                m2,                  INTERP_SHIFT_PS
    psrad                m3,                  INTERP_SHIFT_PS

    packssdw             m0,                  m1
    packssdw             m2,                  m3
%endif

    ; low halves = rows 0/1, high halves = rows 2/3
    movu                 [r2],                ym0
    movu                 [r2 + r3],           ym2
    vextracti32x8        [r2 + 2 * r3],       m0,                    1
    vextracti32x8        [r2 + r8],           m2,                    1
%endmacro
8776
+;-----------------------------------------------------------------------------------------------------------------
8777
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
8778
+;-----------------------------------------------------------------------------------------------------------------
8779
+%macro FILTER_VER_P_LUMA_16xN_AVX512 2    ; %1 = pp|ps, %2 = height: 8-tap vertical luma interpolation, 16 columns, 4 rows per iteration
8780
+INIT_ZMM avx512
8781
+cglobal interp_8tap_vert_%1_16x%2, 5, 9, 22    ; (src, srcStride, dst, dstStride, coeffIdx)
8782
+    add                   r1d,                r1d    ; srcStride in bytes (16-bit samples)
8783
+    add                   r3d,                r3d    ; dstStride in bytes
8784
+    shl                   r4d,                8    ; coeffIdx * 256: each filter occupies 4 * mmsize bytes in the table
8785
+%ifdef PIC
8786
+    lea                   r5,                 [tab_LumaCoeffVer_avx512]
8787
+    mova                  m15,                [r5 + r4]    ; m15-m18 = the four interleaved tap pairs of the 8-tap filter
8788
+    mova                  m16,                [r5 + r4 + 1 * mmsize]
8789
+    mova                  m17,                [r5 + r4 + 2 * mmsize]
8790
+    mova                  m18,                [r5 + r4 + 3 * mmsize]
8791
+%else
8792
+    lea                   r5,                 [tab_LumaCoeffVer_avx512 + r4]
8793
+    mova                  m15,                [r5]
8794
+    mova                  m16,                [r5 + 1 * mmsize]
8795
+    mova                  m17,                [r5 + 2 * mmsize]
8796
+    mova                  m18,                [r5 + 3 * mmsize]
8797
+%endif
8798
+%ifidn %1, pp
8799
+    vbroadcasti32x4       m19,                [INTERP_OFFSET_PP]    ; pp: rounding offset plus clamp bounds in m20/m21
8800
+    pxor                  m20,                m20
8801
+    vbroadcasti32x8       m21,                [pw_pixel_max]
8802
+%else
8803
+    vbroadcasti32x4       m19,                [INTERP_OFFSET_PS]    ; ps: rounding offset only, result stays signed 16-bit
8804
+%endif
8805
+    lea                   r7,                 [3 * r1]
8806
+    lea                   r8,                 [3 * r3]
8807
+    sub                   r0,                 r7    ; start 3 rows above the first output row (8-tap window)
8808
+
8809
+%rep %2/4 - 1    ; every 4-row group except the last advances src/dst
8810
+    PROCESS_LUMA_VERT_P_16x4_AVX512 %1
8811
+    lea                   r0,                 [r0 + 4 * r1]
8812
+    lea                   r2,                 [r2 + 4 * r3]
8813
+%endrep
8814
+    PROCESS_LUMA_VERT_P_16x4_AVX512 %1
8815
+    RET
8816
+%endmacro
8817
+
8818
+%if ARCH_X86_64    ; x86-64 only: these kernels use r8 and zmm16-zmm21
8819
+    FILTER_VER_P_LUMA_16xN_AVX512 ps, 4
8820
+    FILTER_VER_P_LUMA_16xN_AVX512 ps, 8
8821
+    FILTER_VER_P_LUMA_16xN_AVX512 ps, 12
8822
+    FILTER_VER_P_LUMA_16xN_AVX512 ps, 16
8823
+    FILTER_VER_P_LUMA_16xN_AVX512 ps, 32
8824
+    FILTER_VER_P_LUMA_16xN_AVX512 ps, 64
8825
+    FILTER_VER_P_LUMA_16xN_AVX512 pp, 4
8826
+    FILTER_VER_P_LUMA_16xN_AVX512 pp, 8
8827
+    FILTER_VER_P_LUMA_16xN_AVX512 pp, 12
8828
+    FILTER_VER_P_LUMA_16xN_AVX512 pp, 16
8829
+    FILTER_VER_P_LUMA_16xN_AVX512 pp, 32
8830
+    FILTER_VER_P_LUMA_16xN_AVX512 pp, 64
8831
+%endif
8832
+
8833
+%macro PROCESS_LUMA_VERT_P_24x4_AVX512 1    ; %1 = pp|ps: one 24x4 tile; columns 0-15 via the 16x4 macro, then columns 16-23 below
8834
+    PROCESS_LUMA_VERT_P_16x4_AVX512 %1
8835
+    movu                  xm1,                [r0 + mmsize/2]    ; last 8 columns (byte offset mmsize/2), four xmm rows packed per zmm
8836
+    movu                  xm3,                [r0 + r1 + mmsize/2]
8837
+    vinserti32x4          m1,                 [r0 + r1 + mmsize/2],           1
8838
+    vinserti32x4          m3,                 [r0 + 2 * r1 + mmsize/2],       1
8839
+    vinserti32x4          m1,                 [r0 + 2 * r1 + mmsize/2],       2
8840
+    vinserti32x4          m3,                 [r0 + r7 + mmsize/2],           2
8841
+    vinserti32x4          m1,                 [r0 + r7 + mmsize/2],           3
8842
+    vinserti32x4          m3,                 [r0 + 4 * r1 + mmsize/2],       3
8843
+
8844
+    punpcklwd             m0,                 m1,                  m3
8845
+    pmaddwd               m0,                 m15
8846
+    punpckhwd             m1,                 m3
8847
+    pmaddwd               m1,                 m15
8848
+
8849
+    movu                  xm4,                [r0 + 2 * r1 + mmsize/2]
8850
+    movu                  xm5,                [r0 + r7 + mmsize/2]
8851
+    vinserti32x4          m4,                 [r0 + r7 + mmsize/2],           1
8852
+    vinserti32x4          m5,                 [r5 + mmsize/2],                1    ; NOTE(review): r5/r4 appear to be row pointers (src+4 / src+8 rows) set inside PROCESS_LUMA_VERT_P_16x4_AVX512 -- confirm in that macro
8853
+    vinserti32x4          m4,                 [r5 + mmsize/2],                2
8854
+    vinserti32x4          m5,                 [r5 + r1 + mmsize/2],           2
8855
+    vinserti32x4          m4,                 [r5 + r1 + mmsize/2],           3
8856
+    vinserti32x4          m5,                 [r5 + 2 * r1 + mmsize/2],       3
8857
+
8858
+    punpcklwd             m3,                 m4,                  m5
8859
+    pmaddwd               m3,                 m16
8860
+    punpckhwd             m4,                 m5
8861
+    pmaddwd               m4,                 m16
8862
+
8863
+    paddd                 m0,                 m3
8864
+    paddd                 m1,                 m4
8865
+
8866
+    movu                  xm3,                [r5 + mmsize/2]
8867
+    movu                  xm5,                [r5 + r1 + mmsize/2]
8868
+    vinserti32x4          m3,                 [r5 + r1 + mmsize/2],           1
8869
+    vinserti32x4          m5,                 [r5 + 2 * r1 + mmsize/2],       1
8870
+    vinserti32x4          m3,                 [r5 + 2 * r1 + mmsize/2],       2
8871
+    vinserti32x4          m5,                 [r5 + r7 + mmsize/2],           2
8872
+    vinserti32x4          m3,                 [r5 + r7 + mmsize/2],           3
8873
+    vinserti32x4          m5,                 [r5 + 4 * r1 + mmsize/2],       3
8874
+
8875
+    punpcklwd             m2,                 m3,                  m5
8876
+    pmaddwd               m2,                 m17
8877
+    punpckhwd             m3,                 m5
8878
+    pmaddwd               m3,                 m17
8879
+
8880
+    movu                  xm6,                [r5 + 2 * r1 + mmsize/2]
8881
+    movu                  xm7,                [r5 + r7 + mmsize/2]
8882
+    vinserti32x4          m6,                 [r5 + r7 + mmsize/2],           1
8883
+    vinserti32x4          m7,                 [r4 + mmsize/2],                1
8884
+    vinserti32x4          m6,                 [r4 + mmsize/2],                2
8885
+    vinserti32x4          m7,                 [r4 + r1 + mmsize/2],           2
8886
+    vinserti32x4          m6,                 [r4 + r1 + mmsize/2],           3
8887
+    vinserti32x4          m7,                 [r4 + 2 * r1 + mmsize/2],       3
8888
+
8889
+    punpcklwd             m5,                 m6,                  m7
8890
+    pmaddwd               m5,                 m18
8891
+    punpckhwd             m6,                 m7
8892
+    pmaddwd               m6,                 m18
8893
+
8894
+    paddd                 m2,                 m5
8895
+    paddd                 m3,                 m6
8896
+    paddd                 m0,                 m2
8897
+    paddd                 m1,                 m3
8898
+
8899
+    paddd                 m0,                 m19    ; add rounding offset (INTERP_OFFSET_PP/PS)
8900
+    paddd                 m1,                 m19
8901
+
8902
+%ifidn %1, pp
8903
+    psrad                 m0,                 INTERP_SHIFT_PP
8904
+    psrad                 m1,                 INTERP_SHIFT_PP
8905
+    packssdw              m0,                 m1
8906
+    CLIPW                 m0,                 m20,                 m21    ; pp: clamp to [0, pw_pixel_max]
8907
+%else
8908
+    psrad                 m0,                 INTERP_SHIFT_PS
8909
+    psrad                 m1,                 INTERP_SHIFT_PS
8910
+    packssdw              m0,                 m1
8911
+%endif
8912
+
8913
+    movu                 [r2 + mmsize/2],                xm0    ; write 8 results per row at column 16
8914
+    vextracti32x4        [r2 + r3 + mmsize/2],           m0,                    1
8915
+    vextracti32x4        [r2 + 2 * r3 + mmsize/2],       m0,                    2
8916
+    vextracti32x4        [r2 + r8 + mmsize/2],           m0,                    3
8917
+%endmacro
8918
+;-----------------------------------------------------------------------------------------------------------------
8919
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
8920
+;-----------------------------------------------------------------------------------------------------------------
8921
+%macro FILTER_VER_P_LUMA_24xN_AVX512 1    ; emits interp_8tap_vert_{pp|ps}_24x32 (%1 = pp|ps)
8922
+INIT_ZMM avx512
8923
+cglobal interp_8tap_vert_%1_24x32, 5, 9, 22    ; (src, srcStride, dst, dstStride, coeffIdx)
8924
+    add                   r1d,                r1d    ; strides in bytes (16-bit samples)
8925
+    add                   r3d,                r3d
8926
+    shl                   r4d,                8    ; coeffIdx * 256 = offset of this filter's 4 zmm coeff rows
8927
+%ifdef PIC
8928
+    lea                   r5,                 [tab_LumaCoeffVer_avx512]
8929
+    mova                  m15,                [r5 + r4]
8930
+    mova                  m16,                [r5 + r4 + 1 * mmsize]
8931
+    mova                  m17,                [r5 + r4 + 2 * mmsize]
8932
+    mova                  m18,                [r5 + r4 + 3 * mmsize]
8933
+%else
8934
+    lea                   r5,                 [tab_LumaCoeffVer_avx512 + r4]
8935
+    mova                  m15,                [r5]
8936
+    mova                  m16,                [r5 + 1 * mmsize]
8937
+    mova                  m17,                [r5 + 2 * mmsize]
8938
+    mova                  m18,                [r5 + 3 * mmsize]
8939
+%endif
8940
+%ifidn %1, pp
8941
+    vbroadcasti32x4       m19,                [INTERP_OFFSET_PP]
8942
+    pxor                  m20,                m20
8943
+    vbroadcasti32x8       m21,                [pw_pixel_max]
8944
+%else
8945
+    vbroadcasti32x4       m19,                [INTERP_OFFSET_PS]
8946
+%endif
8947
+    lea                   r7,                 [3 * r1]
8948
+    lea                   r8,                 [3 * r3]
8949
+    sub                   r0,                 r7    ; back up 3 input rows for the 8-tap window
8950
+
8951
+%rep 7    ; 8 tiles of 24x4 = 32 rows; the last tile leaves the pointers unchanged
8952
+    PROCESS_LUMA_VERT_P_24x4_AVX512 %1
8953
+    lea                   r0,                 [r0 + 4 * r1]
8954
+    lea                   r2,                 [r2 + 4 * r3]
8955
+%endrep
8956
+    PROCESS_LUMA_VERT_P_24x4_AVX512 %1
8957
+    RET
8958
+%endmacro
8959
+
8960
+%if ARCH_X86_64    ; x86-64 only: these kernels use r8 and zmm16-zmm21
8961
+    FILTER_VER_P_LUMA_24xN_AVX512 ps
8962
+    FILTER_VER_P_LUMA_24xN_AVX512 pp
8963
+%endif
8964
+
8965
+%macro PROCESS_LUMA_VERT_P_32x2_AVX512 1    ; %1 = pp|ps: two 32-wide output rows; output row k accumulates input rows k..k+7
8966
+    movu                 m1,                  [r0]                           ;0 row
8967
+    movu                 m3,                  [r0 + r1]                      ;1 row
8968
+    punpcklwd            m0,                  m1,                     m3    ; interleave adjacent rows so pmaddwd applies one tap pair
8969
+    pmaddwd              m0,                  m15
8970
+    punpckhwd            m1,                  m3
8971
+    pmaddwd              m1,                  m15
8972
+
8973
+    movu                 m4,                  [r0 + 2 * r1]                  ;2 row
8974
+    punpcklwd            m2,                  m3,                     m4
8975
+    pmaddwd              m2,                  m15
8976
+    punpckhwd            m3,                  m4
8977
+    pmaddwd              m3,                  m15
8978
+
8979
+    movu                 m5,                  [r0 + r7]                      ;3 row
8980
+    punpcklwd            m6,                  m4,                     m5
8981
+    pmaddwd              m6,                  m16
8982
+    punpckhwd            m4,                  m5
8983
+    pmaddwd              m4,                  m16
8984
+
8985
+    paddd                m0,                  m6
8986
+    paddd                m1,                  m4
8987
+
8988
+    movu                 m4,                  [r0 + 4 * r1]                  ;4 row
8989
+    punpcklwd            m6,                  m5,                     m4
8990
+    pmaddwd              m6,                  m16
8991
+    punpckhwd            m5,                  m4
8992
+    pmaddwd              m5,                  m16
8993
+
8994
+    paddd                m2,                  m6
8995
+    paddd                m3,                  m5
8996
+
8997
+    lea                  r6,                  [r0 + 4 * r1]    ; r6 = 4th input row; also reused by the 48x4/64x2 macros built on this one
8998
+
8999
+    movu                 m11,                 [r6 + r1]                      ;5 row
9000
+    punpcklwd            m8,                  m4,                     m11
9001
+    pmaddwd              m8,                  m17
9002
+    punpckhwd            m4,                  m11
9003
+    pmaddwd              m4,                  m17
9004
+
9005
+    movu                 m12,                 [r6 + 2 * r1]                  ;6 row
9006
+    punpcklwd            m10,                 m11,                    m12
9007
+    pmaddwd              m10,                 m17
9008
+    punpckhwd            m11,                 m12
9009
+    pmaddwd              m11,                 m17
9010
+
9011
+    movu                 m13,                 [r6 + r7]                      ;7 row
9012
+    punpcklwd            m14,                 m12,                    m13
9013
+    pmaddwd              m14,                 m18
9014
+    punpckhwd            m12,                 m13
9015
+    pmaddwd              m12,                 m18
9016
+
9017
+    paddd                m8,                  m14
9018
+    paddd                m4,                  m12
9019
+    paddd                m0,                  m8
9020
+    paddd                m1,                  m4
9021
+
9022
+    movu                 m12,                 [r6 + 4 * r1]                 ; 8 row
9023
+    punpcklwd            m14,                 m13,                    m12
9024
+    pmaddwd              m14,                 m18
9025
+    punpckhwd            m13,                 m12
9026
+    pmaddwd              m13,                 m18
9027
+
9028
+    paddd                m10,                 m14
9029
+    paddd                m11,                 m13
9030
+    paddd                m2,                  m10
9031
+    paddd                m3,                  m11
9032
+
9033
+    paddd                m0,                  m19    ; add rounding offset (INTERP_OFFSET_PP/PS)
9034
+    paddd                m1,                  m19
9035
+    paddd                m2,                  m19
9036
+    paddd                m3,                  m19
9037
+
9038
+%ifidn %1, pp
9039
+    psrad                m0,                  INTERP_SHIFT_PP
9040
+    psrad                m1,                  INTERP_SHIFT_PP
9041
+    psrad                m2,                  INTERP_SHIFT_PP
9042
+    psrad                m3,                  INTERP_SHIFT_PP
9043
+
9044
+    packssdw             m0,                  m1
9045
+    packssdw             m2,                  m3
9046
+    CLIPW2               m0,                  m2,                   m20,                 m21    ; pp: clamp both rows to [0, pw_pixel_max]
9047
+%else
9048
+    psrad                m0,                  INTERP_SHIFT_PS
9049
+    psrad                m1,                  INTERP_SHIFT_PS
9050
+    psrad                m2,                  INTERP_SHIFT_PS
9051
+    psrad                m3,                  INTERP_SHIFT_PS
9052
+
9053
+    packssdw             m0,                  m1
9054
+    packssdw             m2,                  m3
9055
+%endif
9056
+
9057
+    movu                 [r2],                m0    ; two finished 32-sample rows
9058
+    movu                 [r2 + r3],           m2
9059
+%endmacro
9060
+;-----------------------------------------------------------------------------------------------------------------
9061
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
9062
+;-----------------------------------------------------------------------------------------------------------------
9063
+%macro FILTER_VER_P_LUMA_32xN_AVX512 2    ; %1 = pp|ps, %2 = height: 8-tap vertical luma interpolation, 32 columns, 2 rows per iteration
9064
+INIT_ZMM avx512
9065
+cglobal interp_8tap_vert_%1_32x%2, 5, 8, 22    ; (src, srcStride, dst, dstStride, coeffIdx)
9066
+    add                   r1d,                r1d    ; strides in bytes (16-bit samples)
9067
+    add                   r3d,                r3d
9068
+    shl                   r4d,                8    ; coeffIdx * 256 = offset of this filter's 4 zmm coeff rows
9069
+%ifdef PIC
9070
+    lea                   r5,                 [tab_LumaCoeffVer_avx512]
9071
+    mova                  m15,                [r5 + r4]
9072
+    mova                  m16,                [r5 + r4 + 1 * mmsize]
9073
+    mova                  m17,                [r5 + r4 + 2 * mmsize]
9074
+    mova                  m18,                [r5 + r4 + 3 * mmsize]
9075
+%else
9076
+    lea                   r5,                 [tab_LumaCoeffVer_avx512 + r4]
9077
+    mova                  m15,                [r5]
9078
+    mova                  m16,                [r5 + 1 * mmsize]
9079
+    mova                  m17,                [r5 + 2 * mmsize]
9080
+    mova                  m18,                [r5 + 3 * mmsize]
9081
+%endif
9082
+%ifidn %1, pp
9083
+    vbroadcasti32x4       m19,                [INTERP_OFFSET_PP]
9084
+    pxor                  m20,                m20
9085
+    vbroadcasti32x8       m21,                [pw_pixel_max]
9086
+%else
9087
+    vbroadcasti32x4       m19,                [INTERP_OFFSET_PS]
9088
+%endif
9089
+    lea                   r7,                 [3 * r1]
9090
+    sub                   r0,                 r7    ; back up 3 input rows for the 8-tap window
9091
+
9092
+%rep %2/2 - 1    ; every 2-row group except the last advances src/dst
9093
+    PROCESS_LUMA_VERT_P_32x2_AVX512 %1
9094
+    lea                   r0,                 [r0 + 2 * r1]
9095
+    lea                   r2,                 [r2 + 2 * r3]
9096
+%endrep
9097
+    PROCESS_LUMA_VERT_P_32x2_AVX512 %1
9098
+    RET
9099
+%endmacro
9100
+
9101
+%if ARCH_X86_64    ; x86-64 only: these kernels use zmm16-zmm21
9102
+    FILTER_VER_P_LUMA_32xN_AVX512 ps, 8
9103
+    FILTER_VER_P_LUMA_32xN_AVX512 ps, 16
9104
+    FILTER_VER_P_LUMA_32xN_AVX512 ps, 32
9105
+    FILTER_VER_P_LUMA_32xN_AVX512 ps, 24
9106
+    FILTER_VER_P_LUMA_32xN_AVX512 ps, 64
9107
+    FILTER_VER_P_LUMA_32xN_AVX512 pp, 8
9108
+    FILTER_VER_P_LUMA_32xN_AVX512 pp, 16
9109
+    FILTER_VER_P_LUMA_32xN_AVX512 pp, 32
9110
+    FILTER_VER_P_LUMA_32xN_AVX512 pp, 24
9111
+    FILTER_VER_P_LUMA_32xN_AVX512 pp, 64
9112
+%endif
9113
+
9114
+%macro PROCESS_LUMA_VERT_P_48x4_AVX512 1    ; %1 = pp|ps: one 48x4 tile; rows 0-1 of cols 0-31 come from the 32x2 macro (which sets r6 = src + 4 rows)
9115
+    PROCESS_LUMA_VERT_P_32x2_AVX512 %1
9116
+    movu                 m1,                  [r0 + 2 * r1]    ; rows 2-3 of the 32-wide half
9117
+    movu                 m3,                  [r0 + r7]
9118
+    punpcklwd            m0,                  m1,                     m3
9119
+    pmaddwd              m0,                  m15
9120
+    punpckhwd            m1,                  m3
9121
+    pmaddwd              m1,                  m15
9122
+
9123
+    movu                 m4,                  [r0 + 4 * r1]
9124
+    punpcklwd            m2,                  m3,                     m4
9125
+    pmaddwd              m2,                  m15
9126
+    punpckhwd            m3,                  m4
9127
+    pmaddwd              m3,                  m15
9128
+
9129
+    movu                 m5,                  [r6 + r1]
9130
+    punpcklwd            m6,                  m4,                     m5
9131
+    pmaddwd              m6,                  m16
9132
+    punpckhwd            m4,                  m5
9133
+    pmaddwd              m4,                  m16
9134
+
9135
+    paddd                m0,                  m6
9136
+    paddd                m1,                  m4
9137
+
9138
+    movu                 m4,                  [r6 + 2 * r1]
9139
+    punpcklwd            m6,                  m5,                     m4
9140
+    pmaddwd              m6,                  m16
9141
+    punpckhwd            m5,                  m4
9142
+    pmaddwd              m5,                  m16
9143
+
9144
+    paddd                m2,                  m6
9145
+    paddd                m3,                  m5
9146
+
9147
+    lea                  r4,                  [r6 + 4 * r1]    ; coeffIdx in r4 is dead (taps already in m15-m18); reuse r4 as src + 8 rows
9148
+
9149
+    movu                 m11,                 [r6 + r7]
9150
+    punpcklwd            m8,                  m4,                     m11
9151
+    pmaddwd              m8,                  m17
9152
+    punpckhwd            m4,                  m11
9153
+    pmaddwd              m4,                  m17
9154
+
9155
+    movu                 m12,                 [r6 + 4 * r1]
9156
+    punpcklwd            m10,                 m11,                    m12
9157
+    pmaddwd              m10,                 m17
9158
+    punpckhwd            m11,                 m12
9159
+    pmaddwd              m11,                 m17
9160
+
9161
+    movu                 m13,                 [r4 + r1]
9162
+    punpcklwd            m14,                 m12,                    m13
9163
+    pmaddwd              m14,                 m18
9164
+    punpckhwd            m12,                 m13
9165
+    pmaddwd              m12,                 m18
9166
+
9167
+    paddd                m8,                  m14
9168
+    paddd                m4,                  m12
9169
+    paddd                m0,                  m8
9170
+    paddd                m1,                  m4
9171
+
9172
+    movu                 m12,                 [r4 + 2 * r1]
9173
+    punpcklwd            m14,                 m13,                    m12
9174
+    pmaddwd              m14,                 m18
9175
+    punpckhwd            m13,                 m12
9176
+    pmaddwd              m13,                 m18
9177
+
9178
+    paddd                m10,                 m14
9179
+    paddd                m11,                 m13
9180
+    paddd                m2,                  m10
9181
+    paddd                m3,                  m11
9182
+
9183
+    paddd                m0,                  m19    ; add rounding offset (INTERP_OFFSET_PP/PS)
9184
+    paddd                m1,                  m19
9185
+    paddd                m2,                  m19
9186
+    paddd                m3,                  m19
9187
+
9188
+%ifidn %1, pp
9189
+    psrad                m0,                  INTERP_SHIFT_PP
9190
+    psrad                m1,                  INTERP_SHIFT_PP
9191
+    psrad                m2,                  INTERP_SHIFT_PP
9192
+    psrad                m3,                  INTERP_SHIFT_PP
9193
+
9194
+    packssdw             m0,                  m1
9195
+    packssdw             m2,                  m3
9196
+    CLIPW2               m0,                  m2,                   m20,                 m21    ; pp: clamp to [0, pw_pixel_max]
9197
+%else
9198
+    psrad                m0,                  INTERP_SHIFT_PS
9199
+    psrad                m1,                  INTERP_SHIFT_PS
9200
+    psrad                m2,                  INTERP_SHIFT_PS
9201
+    psrad                m3,                  INTERP_SHIFT_PS
9202
+
9203
+    packssdw             m0,                  m1
9204
+    packssdw             m2,                  m3
9205
+%endif
9206
+    movu                 [r2 + 2 * r3],       m0    ; rows 2-3 of the 32-wide half
9207
+    movu                 [r2 + r8],           m2
9208
+
9209
+    movu                 ym1,                 [r0 + mmsize]    ; columns 32-47 (byte offset mmsize), two rows packed per zmm
9210
+    movu                 ym3,                 [r0 + r1 + mmsize]
9211
+    vinserti32x8         m1,                  [r0 + 2 * r1 + mmsize], 1
9212
+    vinserti32x8         m3,                  [r0 + r7 + mmsize],     1
9213
+    punpcklwd            m0,                  m1,                     m3
9214
+    pmaddwd              m0,                  m15
9215
+    punpckhwd            m1,                  m3
9216
+    pmaddwd              m1,                  m15
9217
+
9218
+    movu                 ym4,                 [r0 + 2 * r1 + mmsize]
9219
+    vinserti32x8         m4,                  [r0 + 4 * r1 + mmsize], 1
9220
+    punpcklwd            m2,                  m3,                     m4
9221
+    pmaddwd              m2,                  m15
9222
+    punpckhwd            m3,                  m4
9223
+    pmaddwd              m3,                  m15
9224
+
9225
+    movu                 ym5,                 [r0 + r7 + mmsize]
9226
+    vinserti32x8         m5,                  [r6 + r1 + mmsize],     1
9227
+    punpcklwd            m6,                  m4,                     m5
9228
+    pmaddwd              m6,                  m16
9229
+    punpckhwd            m4,                  m5
9230
+    pmaddwd              m4,                  m16
9231
+
9232
+    paddd                m0,                  m6
9233
+    paddd                m1,                  m4
9234
+
9235
+    movu                 ym4,                 [r6 + mmsize]
9236
+    vinserti32x8         m4,                  [r6 + 2 * r1 + mmsize], 1
9237
+    punpcklwd            m6,                  m5,                     m4
9238
+    pmaddwd              m6,                  m16
9239
+    punpckhwd            m5,                  m4
9240
+    pmaddwd              m5,                  m16
9241
+
9242
+    paddd                m2,                  m6
9243
+    paddd                m3,                  m5
9244
+
9245
+    movu                 ym11,                [r6 + r1 + mmsize]
9246
+    vinserti32x8         m11,                 [r6 + r7 + mmsize],     1
9247
+    punpcklwd            m8,                  m4,                     m11
9248
+    pmaddwd              m8,                  m17
9249
+    punpckhwd            m4,                  m11
9250
+    pmaddwd              m4,                  m17
9251
+
9252
+    movu                 ym12,                [r6 + 2 * r1 + mmsize]
9253
+    vinserti32x8         m12,                 [r4 + mmsize],          1
9254
+    punpcklwd            m10,                 m11,                    m12
9255
+    pmaddwd              m10,                 m17
9256
+    punpckhwd            m11,                 m12
9257
+    pmaddwd              m11,                 m17
9258
+
9259
+    movu                 ym13,                [r6 + r7 + mmsize]
9260
+    vinserti32x8         m13,                 [r4 + r1 + mmsize],     1
9261
+    punpcklwd            m14,                 m12,                    m13
9262
+    pmaddwd              m14,                 m18
9263
+    punpckhwd            m12,                 m13
9264
+    pmaddwd              m12,                 m18
9265
+
9266
+    paddd                m8,                  m14
9267
+    paddd                m4,                  m12
9268
+    paddd                m0,                  m8
9269
+    paddd                m1,                  m4
9270
+
9271
+    movu                 ym12,                [r4 + mmsize]
9272
+    vinserti32x8         m12,                 [r4 + 2 * r1 + mmsize], 1
9273
+    punpcklwd            m14,                 m13,                    m12
9274
+    pmaddwd              m14,                 m18
9275
+    punpckhwd            m13,                 m12
9276
+    pmaddwd              m13,                 m18
9277
+
9278
+    paddd                m10,                 m14
9279
+    paddd                m11,                 m13
9280
+    paddd                m2,                  m10
9281
+    paddd                m3,                  m11
9282
+
9283
+    paddd                m0,                  m19
9284
+    paddd                m1,                  m19
9285
+    paddd                m2,                  m19
9286
+    paddd                m3,                  m19
9287
+
9288
+%ifidn %1, pp
9289
+    psrad                m0,                  INTERP_SHIFT_PP
9290
+    psrad                m1,                  INTERP_SHIFT_PP
9291
+    psrad                m2,                  INTERP_SHIFT_PP
9292
+    psrad                m3,                  INTERP_SHIFT_PP
9293
+
9294
+    packssdw             m0,                  m1
9295
+    packssdw             m2,                  m3
9296
+    CLIPW2               m0,                  m2,                   m20,                 m21
9297
+%else
9298
+    psrad                m0,                  INTERP_SHIFT_PS
9299
+    psrad                m1,                  INTERP_SHIFT_PS
9300
+    psrad                m2,                  INTERP_SHIFT_PS
9301
+    psrad                m3,                  INTERP_SHIFT_PS
9302
+
9303
+    packssdw             m0,                  m1
9304
+    packssdw             m2,                  m3
9305
+%endif
9306
+
9307
+    movu                 [r2 + mmsize],                ym0    ; rows 0-3 of columns 32-47
9308
+    movu                 [r2 + r3 + mmsize],           ym2
9309
+    vextracti32x8        [r2 + 2 * r3 + mmsize],       m0,                    1
9310
+    vextracti32x8        [r2 + r8 + mmsize],           m2,                    1
9311
+%endmacro
9312
+;-----------------------------------------------------------------------------------------------------------------
9313
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
9314
+;-----------------------------------------------------------------------------------------------------------------
9315
+%macro FILTER_VER_P_LUMA_48x64_AVX512 1    ; emits interp_8tap_vert_{pp|ps}_48x64 (%1 = pp|ps)
9316
+INIT_ZMM avx512
9317
+cglobal interp_8tap_vert_%1_48x64, 5, 9, 22    ; (src, srcStride, dst, dstStride, coeffIdx)
9318
+    add                   r1d,                r1d    ; strides in bytes (16-bit samples)
9319
+    add                   r3d,                r3d
9320
+    shl                   r4d,                8    ; coeffIdx * 256 = offset of this filter's 4 zmm coeff rows
9321
+%ifdef PIC
9322
+    lea                   r5,                 [tab_LumaCoeffVer_avx512]
9323
+    mova                  m15,                [r5 + r4]
9324
+    mova                  m16,                [r5 + r4 + 1 * mmsize]
9325
+    mova                  m17,                [r5 + r4 + 2 * mmsize]
9326
+    mova                  m18,                [r5 + r4 + 3 * mmsize]
9327
+%else
9328
+    lea                   r5,                 [tab_LumaCoeffVer_avx512 + r4]
9329
+    mova                  m15,                [r5]
9330
+    mova                  m16,                [r5 + 1 * mmsize]
9331
+    mova                  m17,                [r5 + 2 * mmsize]
9332
+    mova                  m18,                [r5 + 3 * mmsize]
9333
+%endif
9334
+%ifidn %1, pp
9335
+    vbroadcasti32x4       m19,                [INTERP_OFFSET_PP]
9336
+    pxor                  m20,                m20
9337
+    vbroadcasti32x8       m21,                [pw_pixel_max]
9338
+%else
9339
+    vbroadcasti32x4       m19,                [INTERP_OFFSET_PS]
9340
+%endif
9341
+    lea                   r7,                 [3 * r1]
9342
+    lea                   r8,                 [3 * r3]
9343
+    sub                   r0,                 r7    ; back up 3 input rows for the 8-tap window
9344
+
9345
+%rep 15    ; 16 tiles of 48x4 = 64 rows; the last tile leaves the pointers unchanged
9346
+    PROCESS_LUMA_VERT_P_48x4_AVX512 %1
9347
+    lea                   r0,                 [r0 + 4 * r1]
9348
+    lea                   r2,                 [r2 + 4 * r3]
9349
+%endrep
9350
+    PROCESS_LUMA_VERT_P_48x4_AVX512 %1
9351
+    RET
9352
+%endmacro
9353
+
9354
+%if ARCH_X86_64    ; x86-64 only: these kernels use r8 and zmm16-zmm21
9355
+    FILTER_VER_P_LUMA_48x64_AVX512 ps
9356
+    FILTER_VER_P_LUMA_48x64_AVX512 pp
9357
+%endif
9358
+
9359
+%macro PROCESS_LUMA_VERT_P_64x2_AVX512 1
9360
+    PROCESS_LUMA_VERT_P_32x2_AVX512 %1
9361
+    movu                 m1,                  [r0 + mmsize]
9362
+    movu                 m3,                  [r0 + r1 + mmsize]
9363
+    punpcklwd            m0,                  m1,                     m3
9364
+    pmaddwd              m0,                  m15
9365
+    punpckhwd            m1,                  m3
9366
+    pmaddwd              m1,                  m15
9367
+
9368
+    movu                 m4,                  [r0 + 2 * r1 + mmsize]
9369
+    punpcklwd            m2,                  m3,                     m4
9370
+    pmaddwd              m2,                  m15
9371
+    punpckhwd            m3,                  m4
9372
+    pmaddwd              m3,                  m15
9373
+
9374
+    movu                 m5,                  [r0 + r7 + mmsize]
9375
+    punpcklwd            m6,                  m4,                     m5
9376
+    pmaddwd              m6,                  m16
9377
+    punpckhwd            m4,                  m5
9378
+    pmaddwd              m4,                  m16
9379
+
9380
+    paddd                m0,                  m6
9381
+    paddd                m1,                  m4
9382
+
9383
+    movu                 m4,                  [r0 + 4 * r1 + mmsize]
9384
+    punpcklwd            m6,                  m5,                     m4
9385
+    pmaddwd              m6,                  m16
9386
+    punpckhwd            m5,                  m4
9387
+    pmaddwd              m5,                  m16
9388
+
9389
+    paddd                m2,                  m6
9390
+    paddd                m3,                  m5
9391
+
9392
+    movu                 m11,                 [r6 + r1 + mmsize]
9393
+    punpcklwd            m8,                  m4,                     m11
9394
+    pmaddwd              m8,                  m17
9395
+    punpckhwd            m4,                  m11
9396
+    pmaddwd              m4,                  m17
9397
+
9398
+    movu                 m12,                 [r6 + 2 * r1 + mmsize]
9399
+    punpcklwd            m10,                 m11,                    m12
9400
+    pmaddwd              m10,                 m17
9401
+    punpckhwd            m11,                 m12
9402
+    pmaddwd              m11,                 m17
9403
+
9404
+    movu                 m13,                 [r6 + r7 + mmsize]
9405
+    punpcklwd            m14,                 m12,                    m13
9406
+    pmaddwd              m14,                 m18
9407
+    punpckhwd            m12,                 m13
9408
+    pmaddwd              m12,                 m18
9409
+
9410
+    paddd                m8,                  m14
9411
+    paddd                m4,                  m12
9412
+    paddd                m0,                  m8
9413
+    paddd                m1,                  m4
9414
+
9415
+    movu                 m12,                 [r6 + 4 * r1 + mmsize]
9416
+    punpcklwd            m14,                 m13,                    m12
9417
+    pmaddwd              m14,                 m18
9418
+    punpckhwd            m13,                 m12
9419
+    pmaddwd              m13,                 m18
9420
+
9421
+    paddd                m10,                 m14
9422
+    paddd                m11,                 m13
9423
+    paddd                m2,                  m10
9424
+    paddd                m3,                  m11
9425
+
9426
+    paddd                m0,                  m19
9427
+    paddd                m1,                  m19
9428
+    paddd                m2,                  m19
9429
+    paddd                m3,                  m19
9430
+
9431
+%ifidn %1, pp
9432
+    psrad                m0,                  INTERP_SHIFT_PP
9433
+    psrad                m1,                  INTERP_SHIFT_PP
9434
+    psrad                m2,                  INTERP_SHIFT_PP
9435
+    psrad                m3,                  INTERP_SHIFT_PP
9436
+
9437
+    packssdw             m0,                  m1
9438
+    packssdw             m2,                  m3
9439
+    CLIPW2               m0,                  m2,                   m20,                 m21
9440
+%else
9441
+    psrad                m0,                  INTERP_SHIFT_PS
9442
+    psrad                m1,                  INTERP_SHIFT_PS
9443
+    psrad                m2,                  INTERP_SHIFT_PS
9444
+    psrad                m3,                  INTERP_SHIFT_PS
9445
+
9446
+    packssdw             m0,                  m1
9447
+    packssdw             m2,                  m3
9448
+%endif
9449
+
9450
+    movu                 [r2 + mmsize],       m0
9451
+    movu                 [r2 + r3 + mmsize],  m2
9452
+%endmacro
9453
+;-----------------------------------------------------------------------------------------------------------------
9454
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
9455
+;-----------------------------------------------------------------------------------------------------------------
9456
+%macro FILTER_VER_P_LUMA_64xN_AVX512 2
9457
+INIT_ZMM avx512
9458
+cglobal interp_8tap_vert_%1_64x%2, 5, 8, 22
9459
+    add                   r1d,                r1d
9460
+    add                   r3d,                r3d
9461
+    shl                   r4d,                8
9462
+%ifdef PIC
9463
+    lea                   r5,                 [tab_LumaCoeffVer_avx512]
9464
+    mova                  m15,                [r5 + r4]
9465
+    mova                  m16,                [r5 + r4 + 1 * mmsize]
9466
+    mova                  m17,                [r5 + r4 + 2 * mmsize]
9467
+    mova                  m18,                [r5 + r4 + 3 * mmsize]
9468
+%else
9469
+    lea                   r5,                 [tab_LumaCoeffVer_avx512 + r4]
9470
+    mova                  m15,                [r5]
9471
+    mova                  m16,                [r5 + 1 * mmsize]
9472
+    mova                  m17,                [r5 + 2 * mmsize]
9473
+    mova                  m18,                [r5 + 3 * mmsize]
9474
+%endif
9475
+%ifidn %1, pp
9476
+    vbroadcasti32x4       m19,                [INTERP_OFFSET_PP]
9477
+    pxor                  m20,                m20
9478
+    vbroadcasti32x8       m21,                [pw_pixel_max]
9479
+%else
9480
+    vbroadcasti32x4       m19,                [INTERP_OFFSET_PS]
9481
+%endif
9482
+    lea                   r7,                 [3 * r1]
9483
+    sub                   r0,                 r7
9484
+
9485
+%rep %2/2 - 1
9486
+    PROCESS_LUMA_VERT_P_64x2_AVX512 %1
9487
+    lea                   r0,                 [r0 + 2 * r1]
9488
+    lea                   r2,                 [r2 + 2 * r3]
9489
+%endrep
9490
+    PROCESS_LUMA_VERT_P_64x2_AVX512 %1
9491
+    RET
9492
+%endmacro
9493
+
9494
+%if ARCH_X86_64
9495
+    FILTER_VER_P_LUMA_64xN_AVX512 ps, 16
9496
+    FILTER_VER_P_LUMA_64xN_AVX512 ps, 32
9497
+    FILTER_VER_P_LUMA_64xN_AVX512 ps, 48
9498
+    FILTER_VER_P_LUMA_64xN_AVX512 ps, 64
9499
+    FILTER_VER_P_LUMA_64xN_AVX512 pp, 16
9500
+    FILTER_VER_P_LUMA_64xN_AVX512 pp, 32
9501
+    FILTER_VER_P_LUMA_64xN_AVX512 pp, 48
9502
+    FILTER_VER_P_LUMA_64xN_AVX512 pp, 64
9503
+%endif
9504
+;-------------------------------------------------------------------------------------------------------------
9505
+;avx512 luma_vpp and luma_vps code end
9506
+;-------------------------------------------------------------------------------------------------------------
9507
+;-------------------------------------------------------------------------------------------------------------
9508
+;ipfilter_luma_avx512 code end
9509
+;-------------------------------------------------------------------------------------------------------------
9510
x265_2.7.tar.gz/source/common/x86/ipfilter8.asm -> x265_2.9.tar.gz/source/common/x86/ipfilter8.asm Changed
5651
 
1
@@ -26,7 +26,7 @@
2
 %include "x86inc.asm"
3
 %include "x86util.asm"
4
 
5
-SECTION_RODATA 32
6
+SECTION_RODATA 64
7
 const tab_Tm,    db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
8
                  db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
9
                  db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14
10
@@ -43,6 +43,15 @@
11
 
12
 const pd_526336, times 8 dd 8192*64+2048
13
 
14
+const tab_ChromaCoeff, db  0, 64,  0,  0
15
+                       db -2, 58, 10, -2
16
+                       db -4, 54, 16, -2
17
+                       db -6, 46, 28, -4
18
+                       db -4, 36, 36, -4
19
+                       db -4, 28, 46, -6
20
+                       db -2, 16, 54, -4
21
+                       db -2, 10, 58, -2
22
+
23
 const tab_LumaCoeff,   db   0, 0,  0,  64,  0,   0,  0,  0
24
                        db  -1, 4, -10, 58,  17, -5,  1,  0
25
                        db  -1, 4, -11, 40,  40, -11, 4, -1
26
@@ -133,12 +142,115 @@
27
                             times 16 db 58, -10
28
                             times 16 db 4, -1
29
 
30
+ALIGN 64
31
+const tab_ChromaCoeffVer_32_avx512,     times 32 db 0, 64
32
+                                        times 32 db 0, 0
33
+
34
+                                        times 32 db -2, 58
35
+                                        times 32 db 10, -2
36
+
37
+                                        times 32 db -4, 54
38
+                                        times 32 db 16, -2
39
+
40
+                                        times 32 db -6, 46
41
+                                        times 32 db 28, -4
42
+
43
+                                        times 32 db -4, 36
44
+                                        times 32 db 36, -4
45
+
46
+                                        times 32 db -4, 28
47
+                                        times 32 db 46, -6
48
+
49
+                                        times 32 db -2, 16
50
+                                        times 32 db 54, -4
51
+
52
+                                        times 32 db -2, 10
53
+                                        times 32 db 58, -2
54
+
55
+ALIGN 64
56
+const pw_ChromaCoeffVer_32_avx512,      times 16 dw 0, 64
57
+                                        times 16 dw 0, 0
58
+
59
+                                        times 16 dw -2, 58
60
+                                        times 16 dw 10, -2
61
+
62
+                                        times 16 dw -4, 54
63
+                                        times 16 dw 16, -2
64
+
65
+                                        times 16 dw -6, 46
66
+                                        times 16 dw 28, -4
67
+
68
+                                        times 16 dw -4, 36
69
+                                        times 16 dw 36, -4
70
+
71
+                                        times 16 dw -4, 28
72
+                                        times 16 dw 46, -6
73
+
74
+                                        times 16 dw -2, 16
75
+                                        times 16 dw 54, -4
76
+
77
+                                        times 16 dw -2, 10
78
+                                        times 16 dw 58, -2
79
+
80
+ALIGN 64
81
+const pw_LumaCoeffVer_avx512,           times 16 dw 0, 0
82
+                                        times 16 dw 0, 64
83
+                                        times 16 dw 0, 0
84
+                                        times 16 dw 0, 0
85
+
86
+                                        times 16 dw -1, 4
87
+                                        times 16 dw -10, 58
88
+                                        times 16 dw 17, -5
89
+                                        times 16 dw 1, 0
90
+
91
+                                        times 16 dw -1, 4
92
+                                        times 16 dw -11, 40
93
+                                        times 16 dw 40, -11
94
+                                        times 16 dw 4, -1
95
+
96
+                                        times 16 dw 0, 1
97
+                                        times 16 dw -5, 17
98
+                                        times 16 dw 58, -10
99
+                                        times 16 dw 4, -1
100
+
101
+ALIGN 64
102
+const tab_LumaCoeffVer_32_avx512,       times 32 db 0, 0
103
+                                        times 32 db 0, 64
104
+                                        times 32 db 0, 0
105
+                                        times 32 db 0, 0
106
+
107
+                                        times 32 db -1, 4
108
+                                        times 32 db -10, 58
109
+                                        times 32 db 17, -5
110
+                                        times 32 db 1, 0
111
+
112
+                                        times 32 db -1, 4
113
+                                        times 32 db -11, 40
114
+                                        times 32 db 40, -11
115
+                                        times 32 db 4, -1
116
+
117
+                                        times 32 db 0, 1
118
+                                        times 32 db -5, 17
119
+                                        times 32 db 58, -10
120
+                                        times 32 db 4, -1
121
+
122
 const tab_c_64_n64, times 8 db 64, -64
123
 
124
 const interp8_hps_shuf,     dd 0, 4, 1, 5, 2, 6, 3, 7
125
 
126
-SECTION .text
127
+const interp4_horiz_shuf_load1_avx512,  times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
128
+const interp4_horiz_shuf_load2_avx512,  times 2 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
129
+const interp4_horiz_shuf_load3_avx512,  times 2 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
130
+
131
+ALIGN 64
132
+interp4_vps_store1_avx512:   dq 0, 1, 8, 9, 2, 3, 10, 11
133
+interp4_vps_store2_avx512:   dq 4, 5, 12, 13, 6, 7, 14, 15
134
+const interp4_hps_shuf_avx512,  dq 0, 4, 1, 5, 2, 6, 3, 7
135
+const interp4_hps_store_16xN_avx512,  dq 0, 2, 1, 3, 4, 6, 5, 7
136
+const interp8_hps_store_avx512,  dq 0, 1, 4, 5, 2, 3, 6, 7
137
+const interp8_vsp_store_avx512,  dq 0, 2, 4, 6, 1, 3, 5, 7
138
 
139
+SECTION .text
140
 cextern pb_128
141
 cextern pw_1
142
 cextern pw_32
143
@@ -1954,6 +2066,276 @@
144
     P2S_H_32xN_avx2 48
145
 
146
 ;-----------------------------------------------------------------------------
147
+;p2s and p2s_aligned 32xN avx512 code start
148
+;-----------------------------------------------------------------------------
149
+
150
+%macro PROCESS_P2S_32x4_AVX512 0
151
+    pmovzxbw    m0, [r0]
152
+    pmovzxbw    m1, [r0 + r1]
153
+    pmovzxbw    m2, [r0 + r1 * 2]
154
+    pmovzxbw    m3, [r0 + r5]
155
+
156
+    psllw       m0, 6
157
+    psllw       m1, 6
158
+    psllw       m2, 6
159
+    psllw       m3, 6
160
+    psubw       m0, m4
161
+    psubw       m1, m4
162
+    psubw       m2, m4
163
+    psubw       m3, m4
164
+
165
+    movu        [r2],           m0
166
+    movu        [r2 + r3],      m1
167
+    movu        [r2 + r3 * 2],  m2
168
+    movu        [r2 + r6],      m3
169
+%endmacro
170
+
171
+;-----------------------------------------------------------------------------
172
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
173
+;-----------------------------------------------------------------------------
174
+%if ARCH_X86_64
175
+INIT_ZMM avx512
176
+cglobal filterPixelToShort_32x8, 3, 7, 5
177
+    mov         r3d, r3m
178
+    add         r3d, r3d
179
+    lea         r5, [r1 * 3]
180
+    lea         r6, [r3 * 3]
181
+
182
+    ; load constant
183
+    vpbroadcastd      m4, [pw_2000]
184
+
185
+    PROCESS_P2S_32x4_AVX512
186
+    lea         r0, [r0 + r1 * 4]
187
+    lea         r2, [r2 + r3 * 4]
188
+    PROCESS_P2S_32x4_AVX512
189
+    RET
190
+
191
+INIT_ZMM avx512
192
+cglobal filterPixelToShort_32x16, 3, 7, 5
193
+    mov         r3d, r3m
194
+    add         r3d, r3d
195
+    lea         r5, [r1 * 3]
196
+    lea         r6, [r3 * 3]
197
+
198
+    ; load constant
199
+    vpbroadcastd      m4, [pw_2000]
200
+
201
+%rep 3
202
+    PROCESS_P2S_32x4_AVX512
203
+    lea         r0, [r0 + r1 * 4]
204
+    lea         r2, [r2 + r3 * 4]
205
+%endrep
206
+    PROCESS_P2S_32x4_AVX512
207
+    RET
208
+
209
+INIT_ZMM avx512
210
+cglobal filterPixelToShort_32x24, 3, 7, 5
211
+    mov         r3d, r3m
212
+    add         r3d, r3d
213
+    lea         r5, [r1 * 3]
214
+    lea         r6, [r3 * 3]
215
+
216
+    ; load constant
217
+    vpbroadcastd      m4, [pw_2000]
218
+
219
+%rep 5
220
+    PROCESS_P2S_32x4_AVX512
221
+    lea         r0, [r0 + r1 * 4]
222
+    lea         r2, [r2 + r3 * 4]
223
+%endrep
224
+    PROCESS_P2S_32x4_AVX512
225
+    RET
226
+
227
+INIT_ZMM avx512
228
+cglobal filterPixelToShort_32x32, 3, 7, 5
229
+    mov         r3d, r3m
230
+    add         r3d, r3d
231
+    lea         r5, [r1 * 3]
232
+    lea         r6, [r3 * 3]
233
+
234
+    ; load constant
235
+    vpbroadcastd      m4, [pw_2000]
236
+
237
+%rep 7
238
+    PROCESS_P2S_32x4_AVX512
239
+    lea         r0, [r0 + r1 * 4]
240
+    lea         r2, [r2 + r3 * 4]
241
+%endrep
242
+    PROCESS_P2S_32x4_AVX512
243
+    RET
244
+
245
+INIT_ZMM avx512
246
+cglobal filterPixelToShort_32x48, 3, 7, 5
247
+    mov         r3d, r3m
248
+    add         r3d, r3d
249
+    lea         r5, [r1 * 3]
250
+    lea         r6, [r3 * 3]
251
+
252
+    ; load constant
253
+    vpbroadcastd      m4, [pw_2000]
254
+
255
+%rep 11
256
+    PROCESS_P2S_32x4_AVX512
257
+    lea         r0, [r0 + r1 * 4]
258
+    lea         r2, [r2 + r3 * 4]
259
+%endrep
260
+    PROCESS_P2S_32x4_AVX512
261
+    RET
262
+
263
+INIT_ZMM avx512
264
+cglobal filterPixelToShort_32x64, 3, 7, 5
265
+    mov         r3d, r3m
266
+    add         r3d, r3d
267
+    lea         r5, [r1 * 3]
268
+    lea         r6, [r3 * 3]
269
+
270
+    ; load constant
271
+    vpbroadcastd      m4, [pw_2000]
272
+
273
+%rep 15
274
+    PROCESS_P2S_32x4_AVX512
275
+    lea         r0, [r0 + r1 * 4]
276
+    lea         r2, [r2 + r3 * 4]
277
+%endrep
278
+    PROCESS_P2S_32x4_AVX512
279
+    RET
280
+%endif
281
+
282
+%macro PROCESS_P2S_ALIGNED_32x4_AVX512 0
283
+    pmovzxbw    m0, [r0]
284
+    pmovzxbw    m1, [r0 + r1]
285
+    pmovzxbw    m2, [r0 + r1 * 2]
286
+    pmovzxbw    m3, [r0 + r5]
287
+
288
+    psllw       m0, 6
289
+    psllw       m1, 6
290
+    psllw       m2, 6
291
+    psllw       m3, 6
292
+    psubw       m0, m4
293
+    psubw       m1, m4
294
+    psubw       m2, m4
295
+    psubw       m3, m4
296
+
297
+    mova        [r2],           m0
298
+    mova        [r2 + r3],      m1
299
+    mova        [r2 + r3 * 2],  m2
300
+    mova        [r2 + r6],      m3
301
+%endmacro
302
+
303
+;-----------------------------------------------------------------------------
304
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
305
+;-----------------------------------------------------------------------------
306
+%if ARCH_X86_64
307
+INIT_ZMM avx512
308
+cglobal filterPixelToShort_aligned_32x8, 3, 7, 5
309
+    mov         r3d, r3m
310
+    add         r3d, r3d
311
+    lea         r5, [r1 * 3]
312
+    lea         r6, [r3 * 3]
313
+
314
+    ; load constant
315
+    vpbroadcastd      m4, [pw_2000]
316
+
317
+    PROCESS_P2S_ALIGNED_32x4_AVX512
318
+    lea         r0, [r0 + r1 * 4]
319
+    lea         r2, [r2 + r3 * 4]
320
+    PROCESS_P2S_ALIGNED_32x4_AVX512
321
+    RET
322
+
323
+INIT_ZMM avx512
324
+cglobal filterPixelToShort_aligned_32x16, 3, 7, 5
325
+    mov         r3d, r3m
326
+    add         r3d, r3d
327
+    lea         r5, [r1 * 3]
328
+    lea         r6, [r3 * 3]
329
+
330
+    ; load constant
331
+    vpbroadcastd      m4, [pw_2000]
332
+
333
+%rep 3
334
+    PROCESS_P2S_ALIGNED_32x4_AVX512
335
+    lea         r0, [r0 + r1 * 4]
336
+    lea         r2, [r2 + r3 * 4]
337
+%endrep
338
+    PROCESS_P2S_ALIGNED_32x4_AVX512
339
+    RET
340
+
341
+INIT_ZMM avx512
342
+cglobal filterPixelToShort_aligned_32x24, 3, 7, 5
343
+    mov         r3d, r3m
344
+    add         r3d, r3d
345
+    lea         r5, [r1 * 3]
346
+    lea         r6, [r3 * 3]
347
+
348
+    ; load constant
349
+    vpbroadcastd      m4, [pw_2000]
350
+
351
+%rep 5
352
+    PROCESS_P2S_ALIGNED_32x4_AVX512
353
+    lea         r0, [r0 + r1 * 4]
354
+    lea         r2, [r2 + r3 * 4]
355
+%endrep
356
+    PROCESS_P2S_ALIGNED_32x4_AVX512
357
+    RET
358
+
359
+INIT_ZMM avx512
360
+cglobal filterPixelToShort_aligned_32x32, 3, 7, 5
361
+    mov         r3d, r3m
362
+    add         r3d, r3d
363
+    lea         r5, [r1 * 3]
364
+    lea         r6, [r3 * 3]
365
+
366
+    ; load constant
367
+    vpbroadcastd      m4, [pw_2000]
368
+
369
+%rep 7
370
+    PROCESS_P2S_ALIGNED_32x4_AVX512
371
+    lea         r0, [r0 + r1 * 4]
372
+    lea         r2, [r2 + r3 * 4]
373
+%endrep
374
+    PROCESS_P2S_ALIGNED_32x4_AVX512
375
+    RET
376
+
377
+INIT_ZMM avx512
378
+cglobal filterPixelToShort_aligned_32x48, 3, 7, 5
379
+    mov         r3d, r3m
380
+    add         r3d, r3d
381
+    lea         r5, [r1 * 3]
382
+    lea         r6, [r3 * 3]
383
+
384
+    ; load constant
385
+    vpbroadcastd      m4, [pw_2000]
386
+
387
+%rep 11
388
+    PROCESS_P2S_ALIGNED_32x4_AVX512
389
+    lea         r0, [r0 + r1 * 4]
390
+    lea         r2, [r2 + r3 * 4]
391
+%endrep
392
+    PROCESS_P2S_ALIGNED_32x4_AVX512
393
+    RET
394
+
395
+INIT_ZMM avx512
396
+cglobal filterPixelToShort_aligned_32x64, 3, 7, 5
397
+    mov         r3d, r3m
398
+    add         r3d, r3d
399
+    lea         r5, [r1 * 3]
400
+    lea         r6, [r3 * 3]
401
+
402
+    ; load constant
403
+    vpbroadcastd      m4, [pw_2000]
404
+
405
+%rep 15
406
+    PROCESS_P2S_ALIGNED_32x4_AVX512
407
+    lea         r0, [r0 + r1 * 4]
408
+    lea         r2, [r2 + r3 * 4]
409
+%endrep
410
+    PROCESS_P2S_ALIGNED_32x4_AVX512
411
+    RET
412
+%endif
413
+;-----------------------------------------------------------------------------
414
+;p2s and p2s_aligned 32xN avx512 code end
415
+;-----------------------------------------------------------------------------
416
+;-----------------------------------------------------------------------------
417
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
418
 ;-----------------------------------------------------------------------------
419
 %macro P2S_H_64xN 1
420
@@ -2269,6 +2651,236 @@
421
     P2S_H_64xN_avx2 48
422
 
423
 ;-----------------------------------------------------------------------------
424
+;p2s and p2s_aligned 64xN avx512 code start
425
+;-----------------------------------------------------------------------------
426
+%macro PROCESS_P2S_64x4_AVX512 0
427
+    pmovzxbw    m0, [r0]
428
+    pmovzxbw    m1, [r0 + mmsize/2]
429
+    pmovzxbw    m2, [r0 + r1]
430
+    pmovzxbw    m3, [r0 + r1 + mmsize/2]
431
+
432
+    psllw       m0, 6
433
+    psllw       m1, 6
434
+    psllw       m2, 6
435
+    psllw       m3, 6
436
+    psubw       m0, m4
437
+    psubw       m1, m4
438
+    psubw       m2, m4
439
+    psubw       m3, m4
440
+    movu        [r2], m0
441
+    movu        [r2 + mmsize], m1
442
+    movu        [r2 + r3], m2
443
+    movu        [r2 + r3 + mmsize], m3
444
+
445
+    pmovzxbw    m0, [r0 + r1 * 2]
446
+    pmovzxbw    m1, [r0 + r1 * 2 + mmsize/2]
447
+    pmovzxbw    m2, [r0 + r5]
448
+    pmovzxbw    m3, [r0 + r5 + mmsize/2]
449
+
450
+    psllw       m0, 6
451
+    psllw       m1, 6
452
+    psllw       m2, 6
453
+    psllw       m3, 6
454
+    psubw       m0, m4
455
+    psubw       m1, m4
456
+    psubw       m2, m4
457
+    psubw       m3, m4
458
+    movu        [r2 + r3 * 2], m0
459
+    movu        [r2 + r3 * 2 + mmsize], m1
460
+    movu        [r2 + r6], m2
461
+    movu        [r2 + r6 + mmsize], m3
462
+%endmacro
463
+
464
+%macro PROCESS_P2S_ALIGNED_64x4_AVX512 0
465
+    pmovzxbw    m0, [r0]
466
+    pmovzxbw    m1, [r0 + mmsize/2]
467
+    pmovzxbw    m2, [r0 + r1]
468
+    pmovzxbw    m3, [r0 + r1 + mmsize/2]
469
+
470
+    psllw       m0, 6
471
+    psllw       m1, 6
472
+    psllw       m2, 6
473
+    psllw       m3, 6
474
+    psubw       m0, m4
475
+    psubw       m1, m4
476
+    psubw       m2, m4
477
+    psubw       m3, m4
478
+    mova        [r2], m0
479
+    mova        [r2 + mmsize], m1
480
+    mova        [r2 + r3], m2
481
+    mova        [r2 + r3 + mmsize], m3
482
+
483
+    pmovzxbw    m0, [r0 + r1 * 2]
484
+    pmovzxbw    m1, [r0 + r1 * 2 + mmsize/2]
485
+    pmovzxbw    m2, [r0 + r5]
486
+    pmovzxbw    m3, [r0 + r5 + mmsize/2]
487
+
488
+    psllw       m0, 6
489
+    psllw       m1, 6
490
+    psllw       m2, 6
491
+    psllw       m3, 6
492
+    psubw       m0, m4
493
+    psubw       m1, m4
494
+    psubw       m2, m4
495
+    psubw       m3, m4
496
+    mova        [r2 + r3 * 2], m0
497
+    mova        [r2 + r3 * 2 + mmsize], m1
498
+    mova        [r2 + r6], m2
499
+    mova        [r2 + r6 + mmsize], m3
500
+%endmacro
501
+;-----------------------------------------------------------------------------
502
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
503
+;-----------------------------------------------------------------------------
504
+%if ARCH_X86_64
505
+INIT_ZMM avx512
506
+cglobal filterPixelToShort_64x64, 3, 7, 5
507
+    mov         r3d, r3m
508
+    add         r3d, r3d
509
+    lea         r5, [r1 * 3]
510
+    lea         r6, [r3 * 3]
511
+
512
+    ; load constant
513
+    vpbroadcastd      m4, [pw_2000]
514
+
515
+%rep 15
516
+    PROCESS_P2S_64x4_AVX512
517
+    lea         r0, [r0 + r1 * 4]
518
+    lea         r2, [r2 + r3 * 4]
519
+%endrep
520
+    PROCESS_P2S_64x4_AVX512
521
+    RET
522
+
523
+INIT_ZMM avx512
524
+cglobal filterPixelToShort_64x48, 3, 7, 5
525
+    mov         r3d, r3m
526
+    add         r3d, r3d
527
+    lea         r5, [r1 * 3]
528
+    lea         r6, [r3 * 3]
529
+
530
+    ; load constant
531
+    vpbroadcastd      m4, [pw_2000]
532
+
533
+%rep 11
534
+    PROCESS_P2S_64x4_AVX512
535
+    lea         r0, [r0 + r1 * 4]
536
+    lea         r2, [r2 + r3 * 4]
537
+%endrep
538
+    PROCESS_P2S_64x4_AVX512
539
+    RET
540
+
541
+INIT_ZMM avx512
542
+cglobal filterPixelToShort_64x32, 3, 7, 5
543
+    mov         r3d, r3m
544
+    add         r3d, r3d
545
+    lea         r5, [r1 * 3]
546
+    lea         r6, [r3 * 3]
547
+
548
+    ; load constant
549
+    vpbroadcastd      m4, [pw_2000]
550
+
551
+%rep 7
552
+    PROCESS_P2S_64x4_AVX512
553
+    lea         r0, [r0 + r1 * 4]
554
+    lea         r2, [r2 + r3 * 4]
555
+%endrep
556
+    PROCESS_P2S_64x4_AVX512
557
+    RET
558
+
559
+INIT_ZMM avx512
560
+cglobal filterPixelToShort_64x16, 3, 7, 5
561
+    mov         r3d, r3m
562
+    add         r3d, r3d
563
+    lea         r5, [r1 * 3]
564
+    lea         r6, [r3 * 3]
565
+
566
+    ; load constant
567
+    vpbroadcastd      m4, [pw_2000]
568
+
569
+%rep 3
570
+    PROCESS_P2S_64x4_AVX512
571
+    lea         r0, [r0 + r1 * 4]
572
+    lea         r2, [r2 + r3 * 4]
573
+%endrep
574
+    PROCESS_P2S_64x4_AVX512
575
+    RET
576
+
577
+INIT_ZMM avx512
578
+cglobal filterPixelToShort_aligned_64x64, 3, 7, 5
579
+    mov         r3d, r3m
580
+    add         r3d, r3d
581
+    lea         r5, [r1 * 3]
582
+    lea         r6, [r3 * 3]
583
+
584
+    ; load constant
585
+    vpbroadcastd      m4, [pw_2000]
586
+
587
+%rep 15
588
+    PROCESS_P2S_ALIGNED_64x4_AVX512
589
+    lea         r0, [r0 + r1 * 4]
590
+    lea         r2, [r2 + r3 * 4]
591
+%endrep
592
+    PROCESS_P2S_ALIGNED_64x4_AVX512
593
+    RET
594
+
595
+INIT_ZMM avx512
596
+cglobal filterPixelToShort_aligned_64x48, 3, 7, 5
597
+    mov         r3d, r3m
598
+    add         r3d, r3d
599
+    lea         r5, [r1 * 3]
600
+    lea         r6, [r3 * 3]
601
+
602
+    ; load constant
603
+    vpbroadcastd      m4, [pw_2000]
604
+
605
+%rep 11
606
+    PROCESS_P2S_ALIGNED_64x4_AVX512
607
+    lea         r0, [r0 + r1 * 4]
608
+    lea         r2, [r2 + r3 * 4]
609
+%endrep
610
+    PROCESS_P2S_ALIGNED_64x4_AVX512
611
+    RET
612
+
613
+INIT_ZMM avx512
614
+cglobal filterPixelToShort_aligned_64x32, 3, 7, 5
615
+    mov         r3d, r3m
616
+    add         r3d, r3d
617
+    lea         r5, [r1 * 3]
618
+    lea         r6, [r3 * 3]
619
+
620
+    ; load constant
621
+    vpbroadcastd      m4, [pw_2000]
622
+
623
+%rep 7
624
+    PROCESS_P2S_ALIGNED_64x4_AVX512
625
+    lea         r0, [r0 + r1 * 4]
626
+    lea         r2, [r2 + r3 * 4]
627
+%endrep
628
+    PROCESS_P2S_ALIGNED_64x4_AVX512
629
+    RET
630
+
631
+INIT_ZMM avx512
632
+cglobal filterPixelToShort_aligned_64x16, 3, 7, 5
633
+    mov         r3d, r3m
634
+    add         r3d, r3d
635
+    lea         r5, [r1 * 3]
636
+    lea         r6, [r3 * 3]
637
+
638
+    ; load constant
639
+    vpbroadcastd      m4, [pw_2000]
640
+
641
+%rep 3
642
+    PROCESS_P2S_ALIGNED_64x4_AVX512
643
+    lea         r0, [r0 + r1 * 4]
644
+    lea         r2, [r2 + r3 * 4]
645
+%endrep
646
+    PROCESS_P2S_ALIGNED_64x4_AVX512
647
+    RET
648
+%endif
649
+;-----------------------------------------------------------------------------
650
+;p2s and p2s_aligned 64xN avx512 code end
651
+;-----------------------------------------------------------------------------
652
+
653
+;-----------------------------------------------------------------------------
654
 ; void filterPixelToShort(pixel src, intptr_t srcStride, int16_t dst, int16_t dstStride)
655
 ;-----------------------------------------------------------------------------
656
 %macro P2S_H_12xN 1
657
@@ -2689,6 +3301,229 @@
658
     jnz        .loop
659
     RET
660
 
661
+;-----------------------------------------------------------------------------
662
+;p2s and p2s_aligned 48xN avx512 code start
663
+;-----------------------------------------------------------------------------
664
+%macro PROCESS_P2S_48x8_AVX512 0
665
+    pmovzxbw    m0, [r0]
666
+    pmovzxbw    m1, [r0 + r1]
667
+    pmovzxbw    m2, [r0 + r1 * 2]
668
+    pmovzxbw    m3, [r0 + r5]
669
+    psllw       m0, 6
670
+    psllw       m1, 6
671
+    psllw       m2, 6
672
+    psllw       m3, 6
673
+    psubw       m0, m4
674
+    psubw       m1, m4
675
+    psubw       m2, m4
676
+    psubw       m3, m4
677
+    movu        [r2],           m0
678
+    movu        [r2 + r3],      m1
679
+    movu        [r2 + r3 * 2],  m2
680
+    movu        [r2 + r6],      m3
681
+
682
+    pmovzxbw    ym0, [r0 + 32]
683
+    pmovzxbw    ym1, [r0 + r1 + 32]
684
+    pmovzxbw    ym2, [r0 + r1 * 2 + 32]
685
+    pmovzxbw    ym3, [r0 + r5 + 32]
686
+    psllw       ym0, 6
687
+    psllw       ym1, 6
688
+    psllw       ym2, 6
689
+    psllw       ym3, 6
690
+    psubw       ym0, ym4
691
+    psubw       ym1, ym4
692
+    psubw       ym2, ym4
693
+    psubw       ym3, ym4
694
+    movu        [r2 + 64],           ym0
695
+    movu        [r2 + r3 + 64],      ym1
696
+    movu        [r2 + r3 * 2 + 64],  ym2
697
+    movu        [r2 + r6 + 64],      ym3
698
+
699
+    lea         r0, [r0 + r1 * 4]
700
+    lea         r2, [r2 + r3 * 4]
701
+
702
+    pmovzxbw    m0, [r0]
703
+    pmovzxbw    m1, [r0 + r1]
704
+    pmovzxbw    m2, [r0 + r1 * 2]
705
+    pmovzxbw    m3, [r0 + r5]
706
+    psllw       m0, 6
707
+    psllw       m1, 6
708
+    psllw       m2, 6
709
+    psllw       m3, 6
710
+    psubw       m0, m4
711
+    psubw       m1, m4
712
+    psubw       m2, m4
713
+    psubw       m3, m4
714
+    movu        [r2],           m0
715
+    movu        [r2 + r3],      m1
716
+    movu        [r2 + r3 * 2],  m2
717
+    movu        [r2 + r6],      m3
718
+
719
+    pmovzxbw    ym0, [r0 + 32]
720
+    pmovzxbw    ym1, [r0 + r1 + 32]
721
+    pmovzxbw    ym2, [r0 + r1 * 2 + 32]
722
+    pmovzxbw    ym3, [r0 + r5 + 32]
723
+    psllw       ym0, 6
724
+    psllw       ym1, 6
725
+    psllw       ym2, 6
726
+    psllw       ym3, 6
727
+    psubw       ym0, ym4
728
+    psubw       ym1, ym4
729
+    psubw       ym2, ym4
730
+    psubw       ym3, ym4
731
+    movu        [r2 + 64],           ym0
732
+    movu        [r2 + r3 + 64],      ym1
733
+    movu        [r2 + r3 * 2 + 64],  ym2
734
+    movu        [r2 + r6 + 64],      ym3
735
+%endmacro
736
+
737
+%macro PROCESS_P2S_ALIGNED_48x8_AVX512 0
738
+    pmovzxbw    m0, [r0]
739
+    pmovzxbw    m1, [r0 + r1]
740
+    pmovzxbw    m2, [r0 + r1 * 2]
741
+    pmovzxbw    m3, [r0 + r5]
742
+    psllw       m0, 6
743
+    psllw       m1, 6
744
+    psllw       m2, 6
745
+    psllw       m3, 6
746
+    psubw       m0, m4
747
+    psubw       m1, m4
748
+    psubw       m2, m4
749
+    psubw       m3, m4
750
+    mova        [r2],           m0
751
+    mova        [r2 + r3],      m1
752
+    mova        [r2 + r3 * 2],  m2
753
+    mova        [r2 + r6],      m3
754
+
755
+    pmovzxbw    ym0, [r0 + 32]
756
+    pmovzxbw    ym1, [r0 + r1 + 32]
757
+    pmovzxbw    ym2, [r0 + r1 * 2 + 32]
758
+    pmovzxbw    ym3, [r0 + r5 + 32]
759
+    psllw       ym0, 6
760
+    psllw       ym1, 6
761
+    psllw       ym2, 6
762
+    psllw       ym3, 6
763
+    psubw       ym0, ym4
764
+    psubw       ym1, ym4
765
+    psubw       ym2, ym4
766
+    psubw       ym3, ym4
767
+    mova        [r2 + 64],           ym0
768
+    mova        [r2 + r3 + 64],      ym1
769
+    mova        [r2 + r3 * 2 + 64],  ym2
770
+    mova        [r2 + r6 + 64],      ym3
771
+
772
+    lea         r0, [r0 + r1 * 4]
773
+    lea         r2, [r2 + r3 * 4]
774
+
775
+    pmovzxbw    m0, [r0]
776
+    pmovzxbw    m1, [r0 + r1]
777
+    pmovzxbw    m2, [r0 + r1 * 2]
778
+    pmovzxbw    m3, [r0 + r5]
779
+    psllw       m0, 6
780
+    psllw       m1, 6
781
+    psllw       m2, 6
782
+    psllw       m3, 6
783
+    psubw       m0, m4
784
+    psubw       m1, m4
785
+    psubw       m2, m4
786
+    psubw       m3, m4
787
+    mova        [r2],           m0
788
+    mova        [r2 + r3],      m1
789
+    mova        [r2 + r3 * 2],  m2
790
+    mova        [r2 + r6],      m3
791
+
792
+    pmovzxbw    ym0, [r0 + 32]
793
+    pmovzxbw    ym1, [r0 + r1 + 32]
794
+    pmovzxbw    ym2, [r0 + r1 * 2 + 32]
795
+    pmovzxbw    ym3, [r0 + r5 + 32]
796
+    psllw       ym0, 6
797
+    psllw       ym1, 6
798
+    psllw       ym2, 6
799
+    psllw       ym3, 6
800
+    psubw       ym0, ym4
801
+    psubw       ym1, ym4
802
+    psubw       ym2, ym4
803
+    psubw       ym3, ym4
804
+    mova        [r2 + 64],           ym0
805
+    mova        [r2 + r3 + 64],      ym1
806
+    mova        [r2 + r3 * 2 + 64],  ym2
807
+    mova        [r2 + r6 + 64],      ym3
808
+%endmacro
809
+;-----------------------------------------------------------------------------
810
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
811
+;-----------------------------------------------------------------------------
812
+%if ARCH_X86_64
813
+INIT_ZMM avx512
814
+cglobal filterPixelToShort_48x64, 3,7,5
815
+    mov         r3d, r3m
816
+    add         r3d, r3d
817
+    lea         r5, [r1 * 3]
818
+    lea         r6, [r3 * 3]
819
+
820
+    ; load constant
821
+    vpbroadcastd m4, [pw_2000]
822
+
823
+    PROCESS_P2S_48x8_AVX512
824
+    lea         r0, [r0 + r1 * 4]
825
+    lea         r2, [r2 + r3 * 4]
826
+    PROCESS_P2S_48x8_AVX512
827
+    lea         r0, [r0 + r1 * 4]
828
+    lea         r2, [r2 + r3 * 4]
829
+    PROCESS_P2S_48x8_AVX512
830
+    lea         r0, [r0 + r1 * 4]
831
+    lea         r2, [r2 + r3 * 4]
832
+    PROCESS_P2S_48x8_AVX512
833
+    lea         r0, [r0 + r1 * 4]
834
+    lea         r2, [r2 + r3 * 4]
835
+    PROCESS_P2S_48x8_AVX512
836
+    lea         r0, [r0 + r1 * 4]
837
+    lea         r2, [r2 + r3 * 4]
838
+    PROCESS_P2S_48x8_AVX512
839
+    lea         r0, [r0 + r1 * 4]
840
+    lea         r2, [r2 + r3 * 4]
841
+    PROCESS_P2S_48x8_AVX512
842
+    lea         r0, [r0 + r1 * 4]
843
+    lea         r2, [r2 + r3 * 4]
844
+    PROCESS_P2S_48x8_AVX512
845
+    RET
846
+
847
+INIT_ZMM avx512
848
+cglobal filterPixelToShort_aligned_48x64, 3,7,5
849
+    mov         r3d, r3m
850
+    add         r3d, r3d
851
+    lea         r5, [r1 * 3]
852
+    lea         r6, [r3 * 3]
853
+
854
+    ; load constant
855
+    vpbroadcastd m4, [pw_2000]
856
+
857
+    PROCESS_P2S_ALIGNED_48x8_AVX512
858
+    lea         r0, [r0 + r1 * 4]
859
+    lea         r2, [r2 + r3 * 4]
860
+    PROCESS_P2S_ALIGNED_48x8_AVX512
861
+    lea         r0, [r0 + r1 * 4]
862
+    lea         r2, [r2 + r3 * 4]
863
+    PROCESS_P2S_ALIGNED_48x8_AVX512
864
+    lea         r0, [r0 + r1 * 4]
865
+    lea         r2, [r2 + r3 * 4]
866
+    PROCESS_P2S_ALIGNED_48x8_AVX512
867
+    lea         r0, [r0 + r1 * 4]
868
+    lea         r2, [r2 + r3 * 4]
869
+    PROCESS_P2S_ALIGNED_48x8_AVX512
870
+    lea         r0, [r0 + r1 * 4]
871
+    lea         r2, [r2 + r3 * 4]
872
+    PROCESS_P2S_ALIGNED_48x8_AVX512
873
+    lea         r0, [r0 + r1 * 4]
874
+    lea         r2, [r2 + r3 * 4]
875
+    PROCESS_P2S_ALIGNED_48x8_AVX512
876
+    lea         r0, [r0 + r1 * 4]
877
+    lea         r2, [r2 + r3 * 4]
878
+    PROCESS_P2S_ALIGNED_48x8_AVX512
879
+    RET
880
+%endif
881
+;-----------------------------------------------------------------------------
882
+;p2s and p2s_aligned 48xN avx512 code end
883
+;-----------------------------------------------------------------------------
884
 
885
 %macro PROCESS_LUMA_W4_4R 0
886
     movd        m0, [r0]
887
@@ -9353,3 +10188,4762 @@
888
 
889
     FILTER_VER_LUMA_S_AVX2_32x24 sp
890
     FILTER_VER_LUMA_S_AVX2_32x24 ss
891
+;-------------------------------------------------------------------------------------------------------------
892
+;ipfilter_chroma_avx512 code start
893
+;-------------------------------------------------------------------------------------------------------------
894
+%macro PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512 0
895
+    ; register map
896
+    ; m0 - interpolate coeff
897
+    ; m1, m2 - shuffle order table
898
+    ; m3 - constant word 1
899
+    ; m4 - constant word 512
900
+
901
+    movu               m5,           [r0]
902
+    pshufb             m6,           m5,       m2
903
+    pshufb             m5,           m5,       m1
904
+    pmaddubsw          m5,           m0
905
+    pmaddubsw          m6,           m0
906
+    pmaddwd            m5,           m3
907
+    pmaddwd            m6,           m3
908
+
909
+    movu               m7,           [r0 + 4]
910
+    pshufb             m8,           m7,       m2
911
+    pshufb             m7,           m7,       m1
912
+    pmaddubsw          m7,           m0
913
+    pmaddubsw          m8,           m0
914
+    pmaddwd            m7,           m3
915
+    pmaddwd            m8,           m3
916
+
917
+    packssdw           m5,           m7
918
+    packssdw           m6,           m8
919
+    pmulhrsw           m5,           m4
920
+    pmulhrsw           m6,           m4
921
+    packuswb           m5,           m6
922
+    movu              [r2],          m5
923
+%endmacro
924
+
925
+%macro PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512 0
926
+    ; register map
927
+    ; m0 - interpolate coeff
928
+    ; m1, m2 - shuffle order table
929
+    ; m3 - constant word 1
930
+    ; m4 - constant word 512
931
+    ; m9 - store shuffle order table
932
+
933
+    movu              ym5,           [r0]
934
+    vinserti32x8       m5,           [r0 + r1], 1
935
+    movu              ym7,           [r0 + 4]
936
+    vinserti32x8       m7,           [r0 + r1 + 4], 1
937
+
938
+    pshufb             m6,           m5,       m2
939
+    pshufb             m5,           m1
940
+    pshufb             m8,           m7,       m2
941
+    pshufb             m7,           m1
942
+
943
+    pmaddubsw          m5,           m0
944
+    pmaddubsw          m7,           m0
945
+    pmaddwd            m5,           m3
946
+    pmaddwd            m7,           m3
947
+
948
+    pmaddubsw          m6,           m0
949
+    pmaddubsw          m8,           m0
950
+    pmaddwd            m6,           m3
951
+    pmaddwd            m8,           m3
952
+
953
+    packssdw           m5,           m7
954
+    packssdw           m6,           m8
955
+    pmulhrsw           m5,           m4
956
+    pmulhrsw           m6,           m4
957
+    packuswb           m5,           m6
958
+    movu             [r2],          ym5
959
+    vextracti32x8    [r2 + r3],      m5,            1
960
+%endmacro
961
+
962
+%macro PROCESS_IPFILTER_CHROMA_PP_16x4_AVX512 0
963
+    ; register map
964
+    ; m0 - interpolate coeff
965
+    ; m1, m2 - shuffle order table
966
+    ; m3 - constant word 1
967
+    ; m4 - constant word 512
968
+
969
+    movu              xm5,           [r0]
970
+    vinserti32x4       m5,           [r0 + r1],            1
971
+    vinserti32x4       m5,           [r0 + 2 * r1],        2
972
+    vinserti32x4       m5,           [r0 + r6],            3
973
+    pshufb             m6,           m5,       m2
974
+    pshufb             m5,           m1
975
+
976
+    movu              xm7,           [r0 + 4]
977
+    vinserti32x4       m7,           [r0 + r1 + 4],        1
978
+    vinserti32x4       m7,           [r0 + 2 * r1 + 4],    2
979
+    vinserti32x4       m7,           [r0 + r6 + 4],        3
980
+    pshufb             m8,           m7,       m2
981
+    pshufb             m7,           m1
982
+
983
+    pmaddubsw          m5,           m0
984
+    pmaddubsw          m7,           m0
985
+    pmaddwd            m5,           m3
986
+    pmaddwd            m7,           m3
987
+
988
+    pmaddubsw          m6,           m0
989
+    pmaddubsw          m8,           m0
990
+    pmaddwd            m6,           m3
991
+    pmaddwd            m8,           m3
992
+
993
+    packssdw           m5,           m7
994
+    packssdw           m6,           m8
995
+    pmulhrsw           m5,           m4
996
+    pmulhrsw           m6,           m4
997
+    packuswb           m5,           m6
998
+    movu              [r2],          xm5
999
+    vextracti32x4     [r2 + r3],     m5,       1
1000
+    vextracti32x4     [r2 + 2 * r3], m5,       2
1001
+    vextracti32x4     [r2 + r7],     m5,       3
1002
+%endmacro
1003
+
1004
+%macro PROCESS_IPFILTER_CHROMA_PP_48x4_AVX512 0
1005
+    ; register map
1006
+    ; m0 - interpolate coeff
1007
+    ; m1, m2 - shuffle order table
1008
+    ; m3 - constant word 1
1009
+    ; m4 - constant word 512
1010
+    movu              ym5,           [r0]
1011
+    vinserti32x8       m5,           [r0 + r1], 1
1012
+    movu              ym7,           [r0 + 4]
1013
+    vinserti32x8       m7,           [r0 + r1 + 4], 1
1014
+
1015
+    pshufb             m6,           m5,           m2
1016
+    pshufb             m5,           m1
1017
+    pshufb             m8,           m7,           m2
1018
+    pshufb             m7,           m1
1019
+
1020
+    pmaddubsw          m5,           m0
1021
+    pmaddubsw          m7,           m0
1022
+    pmaddwd            m5,           m3
1023
+    pmaddwd            m7,           m3
1024
+
1025
+    pmaddubsw          m6,           m0
1026
+    pmaddubsw          m8,           m0
1027
+    pmaddwd            m6,           m3
1028
+    pmaddwd            m8,           m3
1029
+
1030
+    packssdw           m5,           m7
1031
+    packssdw           m6,           m8
1032
+    pmulhrsw           m5,           m4
1033
+    pmulhrsw           m6,           m4
1034
+    packuswb           m5,           m6
1035
+    movu             [r2],          ym5
1036
+    vextracti32x8    [r2 + r3],      m5,            1
1037
+
1038
+    movu              ym5,           [r0 + 2 * r1]
1039
+    vinserti32x8       m5,           [r0 + r6], 1
1040
+    movu              ym7,           [r0 + 2 * r1 + 4]
1041
+    vinserti32x8       m7,           [r0 + r6 + 4], 1
1042
+
1043
+    pshufb             m6,           m5,           m2
1044
+    pshufb             m5,           m1
1045
+    pshufb             m8,           m7,           m2
1046
+    pshufb             m7,           m1
1047
+
1048
+    pmaddubsw          m5,           m0
1049
+    pmaddubsw          m7,           m0
1050
+    pmaddwd            m5,           m3
1051
+    pmaddwd            m7,           m3
1052
+
1053
+    pmaddubsw          m6,           m0
1054
+    pmaddubsw          m8,           m0
1055
+    pmaddwd            m6,           m3
1056
+    pmaddwd            m8,           m3
1057
+
1058
+    packssdw           m5,           m7
1059
+    packssdw           m6,           m8
1060
+    pmulhrsw           m5,           m4
1061
+    pmulhrsw           m6,           m4
1062
+    packuswb           m5,           m6
1063
+    movu             [r2 + 2 * r3], ym5
1064
+    vextracti32x8    [r2 + r7],      m5,            1
1065
+
1066
+    movu              xm5,           [r0 + mmsize/2]
1067
+    vinserti32x4       m5,           [r0 + r1 + mmsize/2],            1
1068
+    vinserti32x4       m5,           [r0 + 2 * r1 + mmsize/2],        2
1069
+    vinserti32x4       m5,           [r0 + r6 + mmsize/2],            3
1070
+    pshufb             m6,           m5,       m2
1071
+    pshufb             m5,           m1
1072
+
1073
+    movu              xm7,           [r0 + 36]
1074
+    vinserti32x4       m7,           [r0 + r1 + 36],        1
1075
+    vinserti32x4       m7,           [r0 + 2 * r1 + 36],    2
1076
+    vinserti32x4       m7,           [r0 + r6 + 36],        3
1077
+    pshufb             m8,           m7,       m2
1078
+    pshufb             m7,           m1
1079
+
1080
+    pmaddubsw          m5,           m0
1081
+    pmaddubsw          m7,           m0
1082
+    pmaddwd            m5,           m3
1083
+    pmaddwd            m7,           m3
1084
+
1085
+    pmaddubsw          m6,           m0
1086
+    pmaddubsw          m8,           m0
1087
+    pmaddwd            m6,           m3
1088
+    pmaddwd            m8,           m3
1089
+
1090
+    packssdw           m5,           m7
1091
+    packssdw           m6,           m8
1092
+    pmulhrsw           m5,           m4
1093
+    pmulhrsw           m6,           m4
1094
+    packuswb           m5,           m6
1095
+    movu              [r2 + mmsize/2],          xm5
1096
+    vextracti32x4     [r2 + r3 + mmsize/2],     m5,       1
1097
+    vextracti32x4     [r2 + 2 * r3 + mmsize/2], m5,       2
1098
+    vextracti32x4     [r2 + r7 + mmsize/2],     m5,       3
1099
+%endmacro
1100
+
1101
+;-------------------------------------------------------------------------------------------------------------
1102
+; void interp_4tap_horiz_pp_64xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx
1103
+;-------------------------------------------------------------------------------------------------------------
1104
+%macro IPFILTER_CHROMA_PP_64xN_AVX512 1
1105
+INIT_ZMM avx512
1106
+cglobal interp_4tap_horiz_pp_64x%1, 4,6,9
1107
+    mov               r4d,               r4m
1108
+
1109
+%ifdef PIC
1110
+    lea               r5,           [tab_ChromaCoeff]
1111
+    vpbroadcastd      m0,           [r5 + r4 * 4]
1112
+%else
1113
+    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
1114
+%endif
1115
+
1116
+    vbroadcasti32x8   m1,           [interp4_horiz_shuf_load1_avx512]
1117
+    vbroadcasti32x8   m2,           [interp4_horiz_shuf_load2_avx512]
1118
+    vbroadcasti32x8   m3,           [pw_1]
1119
+    vbroadcasti32x8   m4,           [pw_512]
1120
+    dec               r0
1121
+
1122
+%rep %1 - 1
1123
+    PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512
1124
+    lea               r2,           [r2 + r3]
1125
+    lea               r0,           [r0 + r1]
1126
+%endrep
1127
+    PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512
1128
+    RET
1129
+%endmacro
1130
+
1131
+%if ARCH_X86_64
1132
+    IPFILTER_CHROMA_PP_64xN_AVX512  64
1133
+    IPFILTER_CHROMA_PP_64xN_AVX512  32
1134
+    IPFILTER_CHROMA_PP_64xN_AVX512  48
1135
+    IPFILTER_CHROMA_PP_64xN_AVX512  16
1136
+%endif
1137
+
1138
+%macro IPFILTER_CHROMA_PP_32xN_AVX512 1
1139
+INIT_ZMM avx512
1140
+cglobal interp_4tap_horiz_pp_32x%1, 4,6,9
1141
+    mov               r4d,               r4m
1142
+
1143
+%ifdef PIC
1144
+    lea               r5,           [tab_ChromaCoeff]
1145
+    vpbroadcastd      m0,           [r5 + r4 * 4]
1146
+%else
1147
+    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
1148
+%endif
1149
+
1150
+    vbroadcasti32x8   m1,           [interp4_horiz_shuf_load1_avx512]
1151
+    vbroadcasti32x8   m2,           [interp4_horiz_shuf_load2_avx512]
1152
+    vbroadcasti32x8   m3,           [pw_1]
1153
+    vbroadcasti32x8   m4,           [pw_512]
1154
+    dec               r0
1155
+
1156
+%rep %1/2 - 1
1157
+    PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512
1158
+    lea               r2,           [r2 + 2 * r3]
1159
+    lea               r0,           [r0 + 2 * r1]
1160
+%endrep
1161
+    PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512
1162
+    RET
1163
+%endmacro
1164
+
1165
+%if ARCH_X86_64
1166
+    IPFILTER_CHROMA_PP_32xN_AVX512 16
1167
+    IPFILTER_CHROMA_PP_32xN_AVX512 24
1168
+    IPFILTER_CHROMA_PP_32xN_AVX512 8
1169
+    IPFILTER_CHROMA_PP_32xN_AVX512 32
1170
+    IPFILTER_CHROMA_PP_32xN_AVX512 64
1171
+    IPFILTER_CHROMA_PP_32xN_AVX512 48
1172
+%endif
1173
+
1174
+%macro IPFILTER_CHROMA_PP_16xN_AVX512 1
1175
+INIT_ZMM avx512
1176
+cglobal interp_4tap_horiz_pp_16x%1, 4,8,9
1177
+    mov               r4d,          r4m
1178
+    lea               r6,           [3 * r1]
1179
+    lea               r7,           [3 * r3]
1180
+%ifdef PIC
1181
+    lea               r5,           [tab_ChromaCoeff]
1182
+    vpbroadcastd      m0,           [r5 + r4 * 4]
1183
+%else
1184
+    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
1185
+%endif
1186
+
1187
+    vbroadcasti32x8   m1,           [interp4_horiz_shuf_load1_avx512]
1188
+    vbroadcasti32x8   m2,           [interp4_horiz_shuf_load2_avx512]
1189
+    vbroadcasti32x8   m3,           [pw_1]
1190
+    vbroadcasti32x8   m4,           [pw_512]
1191
+    dec               r0
1192
+
1193
+%rep %1/4 - 1
1194
+    PROCESS_IPFILTER_CHROMA_PP_16x4_AVX512
1195
+    lea               r2,           [r2 + 4 * r3]
1196
+    lea               r0,           [r0 + 4 * r1]
1197
+%endrep
1198
+    PROCESS_IPFILTER_CHROMA_PP_16x4_AVX512
1199
+    RET
1200
+%endmacro
1201
+
1202
+%if ARCH_X86_64
1203
+    IPFILTER_CHROMA_PP_16xN_AVX512 4
1204
+    IPFILTER_CHROMA_PP_16xN_AVX512 8
1205
+    IPFILTER_CHROMA_PP_16xN_AVX512 12
1206
+    IPFILTER_CHROMA_PP_16xN_AVX512 16
1207
+    IPFILTER_CHROMA_PP_16xN_AVX512 24
1208
+    IPFILTER_CHROMA_PP_16xN_AVX512 32
1209
+    IPFILTER_CHROMA_PP_16xN_AVX512 64
1210
+%endif
1211
+
1212
+%if ARCH_X86_64
1213
+INIT_ZMM avx512
1214
+cglobal interp_4tap_horiz_pp_48x64, 4,8,9
1215
+    mov               r4d,          r4m
1216
+    lea               r6,           [3 * r1]
1217
+    lea               r7,           [3 * r3]
1218
+%ifdef PIC
1219
+    lea               r5,           [tab_ChromaCoeff]
1220
+    vpbroadcastd      m0,           [r5 + r4 * 4]
1221
+%else
1222
+    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
1223
+%endif
1224
+
1225
+    vbroadcasti32x8   m1,           [interp4_horiz_shuf_load1_avx512]
1226
+    vbroadcasti32x8   m2,           [interp4_horiz_shuf_load2_avx512]
1227
+    vbroadcasti32x8   m3,           [pw_1]
1228
+    vbroadcasti32x8   m4,           [pw_512]
1229
+    dec               r0
1230
+
1231
+%rep 15
1232
+    PROCESS_IPFILTER_CHROMA_PP_48x4_AVX512
1233
+    lea               r2,           [r2 + 4 * r3]
1234
+    lea               r0,           [r0 + 4 * r1]
1235
+%endrep
1236
+    PROCESS_IPFILTER_CHROMA_PP_48x4_AVX512
1237
+    RET
1238
+%endif
1239
+
1240
+%macro PROCESS_IPFILTER_CHROMA_PS_64x1_AVX512 0
1241
+    movu               ym6,          [r0]
1242
+    vinserti32x8       m6,           [r0 + 4], 1
1243
+    pshufb             m7,           m6,       m2
1244
+    pshufb             m6,           m1
1245
+    pmaddubsw          m6,           m0
1246
+    pmaddubsw          m7,           m0
1247
+    pmaddwd            m6,           m3
1248
+    pmaddwd            m7,           m3
1249
+
1250
+    movu               ym8,          [r0 + 32]
1251
+    vinserti32x8       m8,           [r0 + 36], 1
1252
+    pshufb             m9,           m8,       m2
1253
+    pshufb             m8,           m1
1254
+    pmaddubsw          m8,           m0
1255
+    pmaddubsw          m9,           m0
1256
+    pmaddwd            m8,           m3
1257
+    pmaddwd            m9,           m3
1258
+
1259
+    packssdw           m6,           m7
1260
+    packssdw           m8,           m9
1261
+    psubw              m6,           m4
1262
+    psubw              m8,           m4
1263
+    vpermq             m6,           m10,       m6
1264
+    vpermq             m8,           m10,       m8
1265
+    movu               [r2],         m6
1266
+    movu               [r2 + mmsize],m8
1267
+%endmacro
1268
+
1269
+;-------------------------------------------------------------------------------------------------------------
1270
+; void interp_horiz_ps_64xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
1271
+;-------------------------------------------------------------------------------------------------------------
1272
+%macro IPFILTER_CHROMA_PS_64xN_AVX512 1
1273
+INIT_ZMM avx512
1274
+cglobal interp_4tap_horiz_ps_64x%1, 4,7,11
1275
+    mov             r4d, r4m
1276
+    mov             r5d, r5m
1277
+
1278
+%ifdef PIC
1279
+    lea               r6,           [tab_ChromaCoeff]
1280
+    vpbroadcastd      m0,           [r6 + r4 * 4]
1281
+%else
1282
+    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
1283
+%endif
1284
+
1285
+    vbroadcasti32x8    m1,           [interp4_horiz_shuf_load1_avx512]
1286
+    vbroadcasti32x8    m2,           [interp4_horiz_shuf_load2_avx512]
1287
+    vbroadcasti32x8    m3,           [pw_1]
1288
+    vbroadcasti32x8    m4,           [pw_2000]
1289
+    mova               m10,          [interp4_hps_shuf_avx512]
1290
+
1291
+    ; register map
1292
+    ; m0    - interpolate coeff
1293
+    ; m1,m2 - load shuffle order table
1294
+    ; m3    - constant word 1
1295
+    ; m4    - constant word 2000
1296
+    ; m10   - store shuffle order table
1297
+
1298
+    mov               r6d,         %1
1299
+    dec               r0
1300
+    test              r5d,         r5d
1301
+    je                .loop
1302
+    sub               r0,          r1
1303
+    add               r6d,         3
1304
+
1305
+.loop:
1306
+    PROCESS_IPFILTER_CHROMA_PS_64x1_AVX512
1307
+    lea               r2,           [r2 + 2 * r3]
1308
+    lea               r0,           [r0 + r1]
1309
+    dec               r6d
1310
+    jnz               .loop
1311
+    RET
1312
+%endmacro
1313
+
1314
+%if ARCH_X86_64
1315
+    IPFILTER_CHROMA_PS_64xN_AVX512 64
1316
+    IPFILTER_CHROMA_PS_64xN_AVX512 32
1317
+    IPFILTER_CHROMA_PS_64xN_AVX512 48
1318
+    IPFILTER_CHROMA_PS_64xN_AVX512 16
1319
+%endif
1320
+
1321
+%macro PROCESS_IPFILTER_CHROMA_PS_32x1_AVX512 0
1322
+    movu               ym6,          [r0]
1323
+    vinserti32x8       m6,           [r0 + 4], 1
1324
+    pshufb             m7,           m6,       m2
1325
+    pshufb             m6,           m6,       m1
1326
+    pmaddubsw          m6,           m0
1327
+    pmaddubsw          m7,           m0
1328
+    pmaddwd            m6,           m3
1329
+    pmaddwd            m7,           m3
1330
+
1331
+    packssdw           m6,           m7
1332
+    psubw              m6,           m4
1333
+    vpermq             m6,           m8,       m6
1334
+    movu               [r2],         m6
1335
+%endmacro
1336
+
1337
+;-------------------------------------------------------------------------------------------------------------
1338
+; void interp_horiz_ps_32xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
1339
+;-------------------------------------------------------------------------------------------------------------
1340
+%macro IPFILTER_CHROMA_PS_32xN_AVX512 1
1341
+INIT_ZMM avx512
1342
+cglobal interp_4tap_horiz_ps_32x%1, 4,7,9
1343
+    mov             r4d, r4m
1344
+    mov             r5d, r5m
1345
+
1346
+%ifdef PIC
1347
+    lea               r6,           [tab_ChromaCoeff]
1348
+    vpbroadcastd      m0,           [r6 + r4 * 4]
1349
+%else
1350
+    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
1351
+%endif
1352
+
1353
+    vbroadcasti32x8    m1,           [interp4_horiz_shuf_load1_avx512]
1354
+    vbroadcasti32x8    m2,           [interp4_horiz_shuf_load2_avx512]
1355
+    vbroadcasti32x8    m3,           [pw_1]
1356
+    vbroadcasti32x8    m4,           [pw_2000]
1357
+    mova               m8,           [interp4_hps_shuf_avx512]
1358
+
1359
+    ; register map
1360
+    ; m0    - interpolate coeff
1361
+    ; m1,m2 - load shuffle order table
1362
+    ; m3    - constant word 1
1363
+    ; m4    - constant word 2000
1364
+    ; m8   - store shuffle order table
1365
+
1366
+    mov               r6d,         %1
1367
+    dec               r0
1368
+    test              r5d,         r5d
1369
+    je                .loop
1370
+    sub               r0,          r1
1371
+    add               r6d,         3
1372
+
1373
+.loop:
1374
+    PROCESS_IPFILTER_CHROMA_PS_32x1_AVX512
1375
+    lea               r2,           [r2 + 2 * r3]
1376
+    lea               r0,           [r0 + r1]
1377
+    dec               r6d
1378
+    jnz               .loop
1379
+    RET
1380
+%endmacro
1381
+
1382
+%if ARCH_X86_64
1383
+    IPFILTER_CHROMA_PS_32xN_AVX512 64
1384
+    IPFILTER_CHROMA_PS_32xN_AVX512 48
1385
+    IPFILTER_CHROMA_PS_32xN_AVX512 32
1386
+    IPFILTER_CHROMA_PS_32xN_AVX512 24
1387
+    IPFILTER_CHROMA_PS_32xN_AVX512 16
1388
+    IPFILTER_CHROMA_PS_32xN_AVX512 8
1389
+%endif
1390
+
1391
+%macro PROCESS_IPFILTER_CHROMA_PS_16x2_AVX512 0
1392
+    movu               xm6,         [r0]
1393
+    vinserti32x4       m6,          [r0 + 4],      1
1394
+    vinserti32x4       m6,          [r0 + r1],     2
1395
+    vinserti32x4       m6,          [r0 + r1 + 4], 3
1396
+
1397
+    pshufb             m7,          m6,            m2
1398
+    pshufb             m6,          m6,            m1
1399
+    pmaddubsw          m6,          m0
1400
+    pmaddubsw          m7,          m0
1401
+    pmaddwd            m6,          m3
1402
+    pmaddwd            m7,          m3
1403
+
1404
+    packssdw           m6,          m7
1405
+    psubw              m6,          m4
1406
+    vpermq             m6,          m8,            m6
1407
+    movu               [r2],        ym6
1408
+    vextracti32x8      [r2 + r3],   m6,            1
1409
+%endmacro
1410
+
1411
+%macro PROCESS_IPFILTER_CHROMA_PS_16x1_AVX512 0
1412
+    movu              xm6,          [r0]
1413
+    vinserti32x4      m6,           [r0 + 4],  1
1414
+
1415
+    pshufb            ym7,          ym6,       ym2
1416
+    pshufb            ym6,          ym6,       ym1
1417
+    pmaddubsw         ym6,          ym0
1418
+    pmaddubsw         ym7,          ym0
1419
+    pmaddwd           ym6,          ym3
1420
+    pmaddwd           ym7,          ym3
1421
+
1422
+    packssdw          ym6,          ym7
1423
+    psubw             ym6,          ym4
1424
+    vpermq            ym6,          ym8,       ym6
1425
+    movu              [r2],         ym6
1426
+%endmacro
1427
+
1428
+;-------------------------------------------------------------------------------------------------------------
1429
+; void interp_horiz_ps_16xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
1430
+;-------------------------------------------------------------------------------------------------------------
1431
+%macro IPFILTER_CHROMA_PS_16xN_AVX512 1
1432
+INIT_ZMM avx512
1433
+cglobal interp_4tap_horiz_ps_16x%1, 4,7,9
1434
+    mov             r4d, r4m
1435
+    mov             r5d, r5m
1436
+    add             r3,  r3
1437
+
1438
+%ifdef PIC
1439
+    lea               r6,           [tab_ChromaCoeff]
1440
+    vpbroadcastd      m0,           [r6 + r4 * 4]
1441
+%else
1442
+    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
1443
+%endif
1444
+
1445
+    vbroadcasti32x8    m1,          [interp4_horiz_shuf_load1_avx512]
1446
+    vbroadcasti32x8    m2,          [interp4_horiz_shuf_load2_avx512]
1447
+    vbroadcasti32x8    m3,          [pw_1]
1448
+    vbroadcasti32x8    m4,          [pw_2000]
1449
+    mova               m8,          [interp4_hps_store_16xN_avx512]
1450
+
1451
+    ; register map
1452
+    ; m0    - interpolate coeff
1453
+    ; m1,m2 - load shuffle order table
1454
+    ; m3    - constant word 1
1455
+    ; m4    - constant word 2000
1456
+    ; m8   - store shuffle order table
1457
+
1458
+    mov               r6d,          %1
1459
+    dec               r0
1460
+    test              r5d,          r5d
1461
+    je                .loop
1462
+    sub               r0,           r1
1463
+    add               r6d,          3
1464
+    PROCESS_IPFILTER_CHROMA_PS_16x1_AVX512
1465
+    lea               r2,           [r2 + r3]
1466
+    lea               r0,           [r0 + r1]
1467
+    dec               r6d
1468
+
1469
+.loop:
1470
+    PROCESS_IPFILTER_CHROMA_PS_16x2_AVX512
1471
+    lea               r2,           [r2 + 2 * r3]
1472
+    lea               r0,           [r0 + 2 * r1]
1473
+    sub               r6d,          2
1474
+    jnz               .loop
1475
+
1476
+    RET
1477
+%endmacro
1478
+
1479
+%if ARCH_X86_64 == 1
1480
+    IPFILTER_CHROMA_PS_16xN_AVX512 64
1481
+    IPFILTER_CHROMA_PS_16xN_AVX512 32
1482
+    IPFILTER_CHROMA_PS_16xN_AVX512 24
1483
+    IPFILTER_CHROMA_PS_16xN_AVX512 16
1484
+    IPFILTER_CHROMA_PS_16xN_AVX512 12
1485
+    IPFILTER_CHROMA_PS_16xN_AVX512 8
1486
+    IPFILTER_CHROMA_PS_16xN_AVX512 4
1487
+%endif
1488
+
1489
+%macro PROCESS_IPFILTER_CHROMA_PS_48x1_AVX512 0
1490
+    movu               ym6,          [r0]
1491
+    vinserti32x8       m6,           [r0 + 4], 1
1492
+    pshufb             m7,           m6,       m2
1493
+    pshufb             m6,           m6,       m1
1494
+    pmaddubsw          m6,           m0
1495
+    pmaddubsw          m7,           m0
1496
+    pmaddwd            m6,           m3
1497
+    pmaddwd            m7,           m3
1498
+
1499
+    packssdw           m6,           m7
1500
+    psubw              m6,           m4
1501
+    vpermq             m6,           m8,       m6
1502
+    movu               [r2],         m6
1503
+
1504
+    movu              xm6,          [r0 + 32]
1505
+    vinserti32x4      m6,           [r0 + 36], 1
1506
+    pshufb            ym7,          ym6,       ym2
1507
+    pshufb            ym6,          ym6,       ym1
1508
+    pmaddubsw         ym6,          ym0
1509
+    pmaddubsw         ym7,          ym0
1510
+    pmaddwd           ym6,          ym3
1511
+    pmaddwd           ym7,          ym3
1512
+
1513
+    packssdw          ym6,          ym7
1514
+    psubw             ym6,          ym4
1515
+    vpermq            ym6,          ym9,       ym6
1516
+    movu              [r2 + mmsize],ym6
1517
+%endmacro
1518
+
1519
+;-------------------------------------------------------------------------------------------------------------
1520
+; void interp_horiz_ps_48xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
1521
+;-------------------------------------------------------------------------------------------------------------
1522
+%macro IPFILTER_CHROMA_PS_48xN_AVX512 1
1523
+INIT_ZMM avx512
1524
+cglobal interp_4tap_horiz_ps_48x%1, 4,7,10
1525
+    mov             r4d, r4m
1526
+    mov             r5d, r5m
1527
+
1528
+%ifdef PIC
1529
+    lea               r6,           [tab_ChromaCoeff]
1530
+    vpbroadcastd      m0,           [r6 + r4 * 4]
1531
+%else
1532
+    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
1533
+%endif
1534
+
1535
+    vbroadcasti32x8    m1,           [interp4_horiz_shuf_load1_avx512]
1536
+    vbroadcasti32x8    m2,           [interp4_horiz_shuf_load2_avx512]
1537
+    vbroadcasti32x8    m3,           [pw_1]
1538
+    vbroadcasti32x8    m4,           [pw_2000]
1539
+    mova               m8,           [interp4_hps_shuf_avx512]
1540
+    mova               m9,           [interp4_hps_store_16xN_avx512]
1541
+
1542
+    ; register map
1543
+    ; m0    - interpolate coeff
1544
+    ; m1,m2 - load shuffle order table
1545
+    ; m3    - constant word 1
1546
+    ; m4    - constant word 2000
1547
+    ; m8   - store shuffle order table
1548
+
1549
+    mov               r6d,         %1
1550
+    dec               r0
1551
+    test              r5d,         r5d
1552
+    je                .loop
1553
+    sub               r0,          r1
1554
+    add               r6d,         3
1555
+
1556
+.loop:
1557
+    PROCESS_IPFILTER_CHROMA_PS_48x1_AVX512
1558
+    lea               r2,           [r2 + 2 * r3]
1559
+    lea               r0,           [r0 + r1]
1560
+    dec               r6d
1561
+    jnz               .loop
1562
+    RET
1563
+%endmacro
1564
+
1565
+%if ARCH_X86_64 == 1
1566
+    IPFILTER_CHROMA_PS_48xN_AVX512 64
1567
+%endif
1568
+
1569
+;-------------------------------------------------------------------------------------------------------------
1570
+;avx512 chroma_vpp and chroma_vps code start
1571
+;-------------------------------------------------------------------------------------------------------------
1572
+%macro PROCESS_CHROMA_VERT_16x4_AVX512 1
1573
+    lea                   r5,                 [r0 + 4 * r1]
1574
+    movu                  xm1,                [r0]
1575
+    movu                  xm3,                [r0 + r1]
1576
+    vinserti32x4          m1,                 [r0 + r1],           1
1577
+    vinserti32x4          m3,                 [r0 + 2 * r1],       1
1578
+    vinserti32x4          m1,                 [r0 + 2 * r1],       2
1579
+    vinserti32x4          m3,                 [r0 + r6],           2
1580
+    vinserti32x4          m1,                 [r0 + r6],           3
1581
+    vinserti32x4          m3,                 [r0 + 4 * r1],       3
1582
+
1583
+    punpcklbw             m0,                 m1,                  m3
1584
+    pmaddubsw             m0,                 m8
1585
+    punpckhbw             m1,                 m3
1586
+    pmaddubsw             m1,                 m8
1587
+
1588
+    movu                  xm4,                [r0 + 2 * r1]
1589
+    movu                  xm5,                [r0 + r6]
1590
+    vinserti32x4          m4,                 [r0 + r6],           1
1591
+    vinserti32x4          m5,                 [r5],                1
1592
+    vinserti32x4          m4,                 [r5],                2
1593
+    vinserti32x4          m5,                 [r5 + r1],           2
1594
+    vinserti32x4          m4,                 [r5 + r1],           3
1595
+    vinserti32x4          m5,                 [r5 + 2 * r1],       3
1596
+
1597
+    punpcklbw             m3,                 m4,                  m5
1598
+    pmaddubsw             m3,                 m9
1599
+    punpckhbw             m4,                 m5
1600
+    pmaddubsw             m4,                 m9
1601
+
1602
+    paddw                 m0,                 m3
1603
+    paddw                 m1,                 m4
1604
+%ifidn %1,pp
1605
+    pmulhrsw              m0,                 m7
1606
+    pmulhrsw              m1,                 m7
1607
+    packuswb              m0,                 m1
1608
+    movu                  [r2],               xm0
1609
+    vextracti32x4         [r2 + r3],          m0,                  1
1610
+    vextracti32x4         [r2 + 2 * r3],      m0,                  2
1611
+    vextracti32x4         [r2 + r7],          m0,                  3
1612
+%else
1613
+    psubw                 m0,                 m7
1614
+    psubw                 m1,                 m7
1615
+    mova                  m2,                 m10
1616
+    mova                  m3,                 m11
1617
+
1618
+    vpermi2q              m2,                 m0,                  m1
1619
+    vpermi2q              m3,                 m0,                  m1
1620
+    
1621
+    movu                  [r2],               ym2
1622
+    vextracti32x8         [r2 + r3],          m2,                   1
1623
+    movu                  [r2 + 2 * r3],      ym3
1624
+    vextracti32x8         [r2 + r7],          m3,                   1
1625
+%endif
1626
+%endmacro
1627
+
1628
;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
; Emit interp_4tap_vert_<%1>_16x<%2>: AVX-512 4-tap vertical chroma filter,
; 16 pixels wide.  %1 = pp (pixel->pixel) or ps (pixel->intermediate int16),
; %2 = block height (multiple of 4; each PROCESS_* call handles 4 rows).
%macro FILTER_VERT_CHROMA_16xN_AVX512 2
INIT_ZMM avx512
cglobal interp_4tap_vert_%1_16x%2, 4, 10, 12
    mov                   r4d,                r4m
    shl                   r4d,                7           ; coeffIdx * 128: each coeff entry is two zmm (2*mmsize) wide
    sub                   r0,                 r1          ; start one row above: 4-tap window spans rows -1..+2

%ifdef PIC
    lea                   r5,                 [tab_ChromaCoeffVer_32_avx512]
    mova                  m8,                 [r5 + r4]            ; taps 0/1 (byte coeffs, for pmaddubsw)
    mova                  m9,                 [r5 + r4 + mmsize]   ; taps 2/3
%else
    mova                  m8,                 [tab_ChromaCoeffVer_32_avx512 + r4]
    mova                  m9,                 [tab_ChromaCoeffVer_32_avx512 + r4 + mmsize]
%endif

%ifidn %1, pp
    vbroadcasti32x8       m7,                 [pw_512]    ; rounding factor for pmulhrsw (>>6 with round)
%else
    shl                   r3d,                1           ; ps writes int16 samples: double dst stride
    vbroadcasti32x8       m7,                 [pw_2000]   ; offset subtracted from ps output
    mova                  m10,                [interp4_vps_store1_avx512]  ; qword shuffle masks used to
    mova                  m11,                [interp4_vps_store2_avx512]  ; reorder lanes before the ps store
%endif
    lea                   r6,                 [3 * r1]    ; r6 = 3 * srcStride
    lea                   r7,                 [3 * r3]    ; r7 = 3 * dstStride

%rep %2/4 - 1
    PROCESS_CHROMA_VERT_16x4_AVX512 %1
    lea                   r0,                 [r0 + 4 * r1]   ; next 4 source rows (PROCESS_* presumably leaves r0 unchanged — verify against macro)
    lea                   r2,                 [r2 + 4 * r3]   ; next 4 destination rows
%endrep
    PROCESS_CHROMA_VERT_16x4_AVX512 %1
    RET
%endmacro
1666
+
1667
; Instantiate all HEVC 16-wide chroma block heights for both pp and ps.
; 64-bit only: the kernels use r7 and zmm8..zmm11.
%if ARCH_X86_64
    FILTER_VERT_CHROMA_16xN_AVX512 pp, 4
    FILTER_VERT_CHROMA_16xN_AVX512 pp, 8
    FILTER_VERT_CHROMA_16xN_AVX512 pp, 12
    FILTER_VERT_CHROMA_16xN_AVX512 pp, 16
    FILTER_VERT_CHROMA_16xN_AVX512 pp, 24
    FILTER_VERT_CHROMA_16xN_AVX512 pp, 32
    FILTER_VERT_CHROMA_16xN_AVX512 pp, 64

    FILTER_VERT_CHROMA_16xN_AVX512 ps, 4
    FILTER_VERT_CHROMA_16xN_AVX512 ps, 8
    FILTER_VERT_CHROMA_16xN_AVX512 ps, 12
    FILTER_VERT_CHROMA_16xN_AVX512 ps, 16
    FILTER_VERT_CHROMA_16xN_AVX512 ps, 24
    FILTER_VERT_CHROMA_16xN_AVX512 ps, 32
    FILTER_VERT_CHROMA_16xN_AVX512 ps, 64
%endif
1684
; Filter 4 output rows of a 32-wide chroma block vertically (4 taps).
; %1 = pp or ps.  Expects: m8/m9 = tap pairs, m7 = round/offset constant,
; r6 = 3*srcStride, r7 = 3*dstStride; ps additionally m10/m11 = store shuffles.
; NOTE: this macro advances r0 by 2*r1 itself; the caller adds the other 2 rows.
%macro PROCESS_CHROMA_VERT_32x4_AVX512 1
    ; Each zmm holds two 32-byte rows: row N in the low ymm, row N+2 in the high ymm.
    movu                  ym1,                [r0]
    movu                  ym3,                [r0 + r1]
    vinserti32x8          m1,                 [r0 + 2 * r1],       1
    vinserti32x8          m3,                 [r0 + r6],           1
    punpcklbw             m0,                 m1,                  m3   ; interleave adjacent rows for pmaddubsw
    pmaddubsw             m0,                 m8
    punpckhbw             m1,                 m3
    pmaddubsw             m1,                 m8

    movu                  ym4,                [r0 + 2 * r1]
    vinserti32x8          m4,                 [r0 + 4 * r1],       1
    punpcklbw             m2,                 m3,                  m4
    pmaddubsw             m2,                 m8
    punpckhbw             m3,                 m4
    pmaddubsw             m3,                 m8

    lea                   r0,                 [r0 + 2 * r1]             ; slide the source window down 2 rows

    movu                  ym5,                [r0 + r1]
    vinserti32x8          m5,                 [r0 + r6],           1
    punpcklbw             m6,                 m4,                  m5
    pmaddubsw             m6,                 m9                        ; taps 2/3 contribution
    paddw                 m0,                 m6
    punpckhbw             m4,                 m5
    pmaddubsw             m4,                 m9
    paddw                 m1,                 m4

    movu                  ym4,                [r0 + 2 * r1]
    vinserti32x8          m4,                 [r0 + 4 * r1],       1
    punpcklbw             m6,                 m5,                  m4
    pmaddubsw             m6,                 m9
    paddw                 m2,                 m6
    punpckhbw             m5,                 m4
    pmaddubsw             m5,                 m9
    paddw                 m3,                 m5

%ifidn %1,pp
    ; pp: round-shift to 8-bit and store 32 bytes per row.
    pmulhrsw              m0,                 m7
    pmulhrsw              m1,                 m7
    pmulhrsw              m2,                 m7
    pmulhrsw              m3,                 m7
    packuswb              m0,                 m1
    packuswb              m2,                 m3
    movu                  [r2],               ym0
    movu                  [r2 + r3],          ym2
    vextracti32x8         [r2 + 2 * r3],      m0,                  1
    vextracti32x8         [r2 + r7],          m2,                  1
%else
    ; ps: subtract offset, keep 16-bit, reorder qword lanes into row order before storing.
    psubw                 m0,                 m7
    psubw                 m1,                 m7
    psubw                 m2,                 m7
    psubw                 m3,                 m7

    mova                  m4,                 m10
    mova                  m5,                 m11
    vpermi2q              m4,                 m0,                m1
    vpermi2q              m5,                 m0,                m1
    mova                  m6,                 m10
    mova                  m12,                m11
    vpermi2q              m6,                 m2,                m3
    vpermi2q              m12,                m2,                m3

    movu                  [r2],               m4
    movu                  [r2 + r3],          m6
    movu                  [r2 + 2 * r3],      m5
    movu                  [r2 + r7],          m12
%endif
%endmacro
1753
+
1754
;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
; Emit interp_4tap_vert_<%1>_32x<%2>: AVX-512 4-tap vertical chroma filter,
; 32 pixels wide.  %1 = pp/ps, %2 = height (multiple of 4).
%macro FILTER_VERT_CHROMA_32xN_AVX512 2
INIT_ZMM avx512
cglobal interp_4tap_vert_%1_32x%2, 4, 8, 13
    mov                   r4d,                r4m
    shl                   r4d,                7           ; coeffIdx * 2*mmsize
    sub                   r0,                 r1          ; 4-tap window starts one row above

%ifdef PIC
    lea                   r5,                 [tab_ChromaCoeffVer_32_avx512]
    mova                  m8,                 [r5 + r4]
    mova                  m9,                 [r5 + r4 + mmsize]
%else
    mova                  m8,                 [tab_ChromaCoeffVer_32_avx512 + r4]
    mova                  m9,                 [tab_ChromaCoeffVer_32_avx512 + r4 + mmsize]
%endif

%ifidn %1,pp
    vbroadcasti32x8       m7,                [pw_512]     ; pmulhrsw rounding constant
%else
    shl                   r3d,                1           ; int16 output: double dst stride
    vbroadcasti32x8       m7,                [pw_2000]    ; ps output offset
    mova                  m10,                [interp4_vps_store1_avx512]
    mova                  m11,                [interp4_vps_store2_avx512]
%endif

    lea                   r6,                 [3 * r1]
    lea                   r7,                 [3 * r3]

%rep %2/4 - 1
    PROCESS_CHROMA_VERT_32x4_AVX512 %1
    ; The macro already moved r0 down 2 rows; add the remaining 2 here (total 4).
    lea                   r0,                 [r0 + 2 * r1]
    lea                   r2,                 [r2 + 4 * r3]
%endrep
    PROCESS_CHROMA_VERT_32x4_AVX512 %1
    RET
%endmacro
1793
+
1794
; Instantiate all 32-wide chroma heights for pp and ps (64-bit only).
%if ARCH_X86_64
    FILTER_VERT_CHROMA_32xN_AVX512 pp, 8
    FILTER_VERT_CHROMA_32xN_AVX512 pp, 16
    FILTER_VERT_CHROMA_32xN_AVX512 pp, 24
    FILTER_VERT_CHROMA_32xN_AVX512 pp, 32
    FILTER_VERT_CHROMA_32xN_AVX512 pp, 48
    FILTER_VERT_CHROMA_32xN_AVX512 pp, 64

    FILTER_VERT_CHROMA_32xN_AVX512 ps, 8
    FILTER_VERT_CHROMA_32xN_AVX512 ps, 16
    FILTER_VERT_CHROMA_32xN_AVX512 ps, 24
    FILTER_VERT_CHROMA_32xN_AVX512 ps, 32
    FILTER_VERT_CHROMA_32xN_AVX512 ps, 48
    FILTER_VERT_CHROMA_32xN_AVX512 ps, 64
%endif
1809
; Filter 4 output rows of a 48-wide chroma block: first the left 32 columns
; (two rows per zmm, as in the 32x4 macro), then the right 16 columns
; (four rows per zmm via xmm inserts).  %1 = pp or ps.
; Does NOT advance r0; clobbers r5 as a row-4 pointer.
%macro PROCESS_CHROMA_VERT_48x4_AVX512 1
    ; ---- left 32 columns ----
    movu                  ym1,                [r0]
    movu                  ym3,                [r0 + r1]
    vinserti32x8          m1,                 [r0 + 2 * r1],       1
    vinserti32x8          m3,                 [r0 + r6],           1
    punpcklbw             m0,                 m1,                  m3
    pmaddubsw             m0,                 m8
    punpckhbw             m1,                 m3
    pmaddubsw             m1,                 m8

    movu                  ym4,                [r0 + 2 * r1]
    vinserti32x8          m4,                 [r0 + 4 * r1],       1
    punpcklbw             m2,                 m3,                  m4
    pmaddubsw             m2,                 m8
    punpckhbw             m3,                 m4
    pmaddubsw             m3,                 m8

    lea                   r5,                 [r0 + 4 * r1]       ; r5 = row 4 base (r0 is not advanced here)

    movu                  ym5,                [r0 + r6]
    vinserti32x8          m5,                 [r5 + r1],           1
    punpcklbw             m6,                 m4,                  m5
    pmaddubsw             m6,                 m9
    paddw                 m0,                 m6
    punpckhbw             m4,                 m5
    pmaddubsw             m4,                 m9
    paddw                 m1,                 m4

    movu                  ym4,                [r0 + 4 * r1]
    vinserti32x8          m4,                 [r5 + 2 * r1],       1
    punpcklbw             m6,                 m5,                  m4
    pmaddubsw             m6,                 m9
    paddw                 m2,                 m6
    punpckhbw             m5,                 m4
    pmaddubsw             m5,                 m9
    paddw                 m3,                 m5
%ifidn %1, pp
    pmulhrsw              m0,                 m7
    pmulhrsw              m1,                 m7
    pmulhrsw              m2,                 m7
    pmulhrsw              m3,                 m7

    packuswb              m0,                 m1
    packuswb              m2,                 m3
    movu                  [r2],               ym0
    movu                  [r2 + r3],          ym2
    vextracti32x8         [r2 + 2 * r3],      m0,                  1
    vextracti32x8         [r2 + r7],          m2,                  1
%else
    psubw                 m0,                 m7
    psubw                 m1,                 m7
    psubw                 m2,                 m7
    psubw                 m3,                 m7

    mova                  m4,                 m10
    mova                  m5,                 m11
    vpermi2q              m4,                 m0,                m1
    vpermi2q              m5,                 m0,                m1
    mova                  m6,                 m10
    mova                  m12,                m11
    vpermi2q              m6,                 m2,                m3
    vpermi2q              m12,                m2,                m3

    movu                  [r2],               m4
    movu                  [r2 + r3],          m6
    movu                  [r2 + 2 * r3],      m5
    movu                  [r2 + r7],          m12
%endif
    ; ---- right 16 columns (source byte offset mmsize/2 = 32) ----
    movu                  xm1,                [r0 + mmsize/2]
    movu                  xm3,                [r0 + r1 + mmsize/2]
    vinserti32x4          m1,                 [r0 + r1 + mmsize/2],           1
    vinserti32x4          m3,                 [r0 + 2 * r1 + mmsize/2],       1
    vinserti32x4          m1,                 [r0 + 2 * r1 + mmsize/2],       2
    vinserti32x4          m3,                 [r0 + r6 + mmsize/2],           2
    vinserti32x4          m1,                 [r0 + r6 + mmsize/2],           3
    vinserti32x4          m3,                 [r0 + 4 * r1 + mmsize/2],       3

    punpcklbw             m0,                 m1,                  m3
    pmaddubsw             m0,                 m8
    punpckhbw             m1,                 m3
    pmaddubsw             m1,                 m8

    movu                  xm4,                [r0 + 2 * r1 + mmsize/2]
    movu                  xm5,                [r0 + r6 + mmsize/2]
    vinserti32x4          m4,                 [r0 + r6 + mmsize/2],           1
    vinserti32x4          m5,                 [r5 + mmsize/2],                1
    vinserti32x4          m4,                 [r5 + mmsize/2],                2
    vinserti32x4          m5,                 [r5 + r1 + mmsize/2],           2
    vinserti32x4          m4,                 [r5 + r1 + mmsize/2],           3
    vinserti32x4          m5,                 [r5 + 2 * r1 + mmsize/2],       3

    punpcklbw             m3,                 m4,                  m5
    pmaddubsw             m3,                 m9
    punpckhbw             m4,                 m5
    pmaddubsw             m4,                 m9
    paddw                 m0,                 m3
    paddw                 m1,                 m4
%ifidn %1, pp
    ; pp output is 8-bit, so the right half starts at byte offset mmsize/2 (32).
    pmulhrsw              m0,                 m7
    pmulhrsw              m1,                 m7
    packuswb              m0,                 m1
    movu                  [r2 + mmsize/2],               xm0
    vextracti32x4         [r2 + r3 + mmsize/2],          m0,                  1
    vextracti32x4         [r2 + 2 * r3 + mmsize/2],      m0,                  2
    vextracti32x4         [r2 + r7 + mmsize/2],          m0,                  3
%else
    ; ps output is 16-bit, so 32 pixels occupy mmsize (64) bytes: right half at +mmsize.
    psubw                 m0,                 m7
    psubw                 m1,                 m7
    mova                  m2,                m10
    mova                  m3,                m11

    vpermi2q              m2,  m0, m1
    vpermi2q              m3,  m0, m1

    movu                  [r2 + mmsize],               ym2
    vextracti32x8         [r2 + r3 + mmsize],          m2,                  1
    movu                  [r2 + 2 * r3 + mmsize],      ym3
    vextracti32x8         [r2 + r7 + mmsize],          m3,                  1
%endif
%endmacro
1929
;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
; (banner previously said interp_8tap_vert; this macro emits the 4-tap chroma kernel)
;-----------------------------------------------------------------------------------------------------------------
; Emit interp_4tap_vert_<%1>_48x64: AVX-512 4-tap vertical chroma filter for
; the single 48x64 block size.  %1 = pp or ps.
%macro FILTER_VERT_CHROMA_48x64_AVX512 1
INIT_ZMM avx512
cglobal interp_4tap_vert_%1_48x64, 4, 8, 13
    mov                   r4d,                r4m
    shl                   r4d,                7           ; coeffIdx * 2*mmsize
    sub                   r0,                 r1          ; window starts one row above

%ifdef PIC
    lea                   r5,                 [tab_ChromaCoeffVer_32_avx512]
    mova                  m8,                 [r5 + r4]
    mova                  m9,                 [r5 + r4 + mmsize]
%else
    mova                  m8,                 [tab_ChromaCoeffVer_32_avx512 + r4]
    mova                  m9,                 [tab_ChromaCoeffVer_32_avx512 + r4 + mmsize]
%endif

%ifidn %1, pp
    vbroadcasti32x8       m7,                 [pw_512]
%else
    shl                   r3d,                1
    vbroadcasti32x8       m7,                 [pw_2000]
    mova                  m10,                [interp4_vps_store1_avx512]
    mova                  m11,                [interp4_vps_store2_avx512]
%endif

    lea                   r6,                 [3 * r1]
    lea                   r7,                 [3 * r3]
%rep 15
    ; 16 iterations of 4 rows = 64 rows total.
    PROCESS_CHROMA_VERT_48x4_AVX512 %1
    lea                   r0,                 [r0 + 4 * r1]
    lea                   r2,                 [r2 + 4 * r3]
%endrep
    PROCESS_CHROMA_VERT_48x4_AVX512 %1
    RET
%endmacro
1967
+
1968
; 48x64 pp/ps instantiations (64-bit only).
%if ARCH_X86_64
    FILTER_VERT_CHROMA_48x64_AVX512 pp
    FILTER_VERT_CHROMA_48x64_AVX512 ps
%endif
1972
; Filter 4 output rows of a 64-wide chroma block (one full zmm row at a time).
; %1 = pp or ps.  Expects m10/m11 = tap pairs, m12 = round/offset constant,
; r4 = 3*srcStride, r5 = 3*dstStride; ps additionally m13/m14 = store shuffles.
; Advances r0 by 4*r1 itself; the caller only advances r2.
%macro PROCESS_CHROMA_VERT_64x4_AVX512 1
    movu              m0,              [r0]                        ; m0 = row 0
    movu              m1,              [r0 + r1]                   ; m1 = row 1
    punpcklbw         m2,              m0,                m1
    punpckhbw         m3,              m0,                m1
    pmaddubsw         m2,              m10
    pmaddubsw         m3,              m10
    movu              m0,              [r0 + r1 * 2]               ; m0 = row 2
    punpcklbw         m4,              m1,                m0
    punpckhbw         m5,              m1,                m0
    pmaddubsw         m4,              m10
    pmaddubsw         m5,              m10
    movu              m1,              [r0 + r4]                   ; m1 = row 3
    punpcklbw         m6,              m0,                m1
    punpckhbw         m7,              m0,                m1
    pmaddubsw         m8,              m6,                m11      ; rows 2/3 * taps 2/3 -> completes output row 0
    pmaddubsw         m9,              m7,                m11
    pmaddubsw         m6,              m10                         ; same data * taps 0/1 -> starts output row 2
    pmaddubsw         m7,              m10
    paddw             m2,              m8
    paddw             m3,              m9

%ifidn %1,pp
    pmulhrsw          m2,              m12
    pmulhrsw          m3,              m12
    packuswb          m2,              m3
    movu              [r2],            m2                          ; output row 0
%else
    psubw             m2, m12
    psubw             m3, m12
    movu              m8, m13
    movu              m9, m14
    vpermi2q          m8, m2, m3                                   ; reorder lanes into linear row order
    vpermi2q          m9, m2, m3
    movu              [r2], m8
    movu              [r2 + mmsize], m9
%endif

    lea               r0,              [r0 + r1 * 4]
    movu              m0,              [r0]                        ; m0 = row 4
    punpcklbw         m2,              m1,                m0
    punpckhbw         m3,              m1,                m0
    pmaddubsw         m8,              m2,                m11      ; rows 3/4 * taps 2/3 -> completes output row 1
    pmaddubsw         m9,              m3,                m11
    pmaddubsw         m2,              m10                         ; starts output row 3
    pmaddubsw         m3,              m10
    paddw             m4,              m8
    paddw             m5,              m9

%ifidn %1,pp
    pmulhrsw          m4,              m12
    pmulhrsw          m5,              m12
    packuswb          m4,              m5
    movu              [r2 + r3],       m4                          ; output row 1
%else
    psubw             m4, m12
    psubw             m5, m12
    movu              m8, m13
    movu              m9, m14
    vpermi2q          m8, m4, m5
    vpermi2q          m9, m4, m5
    movu              [r2 + r3], m8
    movu              [r2 + r3 + mmsize], m9
%endif

    movu              m1,              [r0 + r1]                   ; m1 = row 5
    punpcklbw         m4,              m0,                m1
    punpckhbw         m5,              m0,                m1
    pmaddubsw         m4,              m11
    pmaddubsw         m5,              m11
    paddw             m6,              m4                          ; completes output row 2
    paddw             m7,              m5

%ifidn %1,pp
    pmulhrsw          m6,              m12
    pmulhrsw          m7,              m12
    packuswb          m6,              m7
    movu              [r2 + r3 * 2],   m6                          ; output row 2
%else
    psubw             m6, m12
    psubw             m7, m12
    movu              m8, m13
    movu              m9, m14
    vpermi2q          m8, m6, m7
    vpermi2q          m9, m6, m7
    movu              [r2 + 2 * r3], m8
    movu              [r2 + 2 * r3 + mmsize], m9
%endif
    movu              m0,              [r0 + r1 * 2]               ; m0 = row 6
    punpcklbw         m6,              m1,                m0
    punpckhbw         m7,              m1,                m0
    pmaddubsw         m6,              m11
    pmaddubsw         m7,              m11
    paddw             m2,              m6                          ; completes output row 3
    paddw             m3,              m7

%ifidn %1,pp
    pmulhrsw          m2,              m12
    pmulhrsw          m3,              m12
    packuswb          m2,              m3
    movu              [r2 + r5],       m2                          ; output row 3 (r5 = 3*dstStride)
%else
    psubw             m2, m12
    psubw             m3, m12
    movu              m8, m13
    movu              m9, m14
    vpermi2q          m8, m2, m3
    vpermi2q          m9, m2, m3
    movu              [r2 + r5], m8
    movu              [r2 + r5 + mmsize], m9
%endif
%endmacro
2084
+
2085
; Emit interp_4tap_vert_<%1>_64x<%2>: AVX-512 4-tap vertical chroma filter,
; 64 pixels wide.  %1 = pp/ps, %2 = height (multiple of 4).
%macro FILTER_VER_CHROMA_AVX512_64xN 2
INIT_ZMM avx512
cglobal interp_4tap_vert_%1_64x%2, 4, 6, 15
    mov               r4d,             r4m
    shl               r4d,             7                 ; coeffIdx * 2*mmsize

%ifdef PIC
    lea               r5,              [tab_ChromaCoeffVer_32_avx512]
    mova              m10,             [r5 + r4]
    mova              m11,             [r5 + r4 + mmsize]
%else
    mova              m10,             [tab_ChromaCoeffVer_32_avx512 + r4]
    mova              m11,             [tab_ChromaCoeffVer_32_avx512 + r4 + mmsize]
%endif

%ifidn %1,pp
    vbroadcasti32x8            m12, [pw_512]              ; pmulhrsw rounding constant
%else
    shl                        r3d, 1                     ; int16 output: double dst stride
    vbroadcasti32x8            m12, [pw_2000]             ; ps output offset
    mova                       m13, [interp4_vps_store1_avx512]
    mova                       m14, [interp4_vps_store2_avx512]
%endif
    lea               r4,              [r1 * 3]           ; r4 reused: 3*srcStride (coeff offset no longer needed)
    sub               r0,              r1                 ; window starts one row above
    lea               r5,              [r3 * 3]           ; r5 reused: 3*dstStride

%rep %2/4 - 1
    PROCESS_CHROMA_VERT_64x4_AVX512 %1
    ; PROCESS_* already advanced r0 by 4 rows; only advance the destination.
    lea               r2, [r2 + r3 * 4]
%endrep
    PROCESS_CHROMA_VERT_64x4_AVX512 %1
    RET
%endmacro
2119
+
2120
; Instantiate all 64-wide chroma heights for pp and ps (64-bit only).
%if ARCH_X86_64 == 1
FILTER_VER_CHROMA_AVX512_64xN pp, 64
FILTER_VER_CHROMA_AVX512_64xN pp, 48
FILTER_VER_CHROMA_AVX512_64xN pp, 32
FILTER_VER_CHROMA_AVX512_64xN pp, 16

FILTER_VER_CHROMA_AVX512_64xN ps, 64
FILTER_VER_CHROMA_AVX512_64xN ps, 48
FILTER_VER_CHROMA_AVX512_64xN ps, 32
FILTER_VER_CHROMA_AVX512_64xN ps, 16
%endif
2131
+;-------------------------------------------------------------------------------------------------------------
2132
+;avx512 chroma_vpp and chroma_vps code end
2133
+;-------------------------------------------------------------------------------------------------------------
2134
+;-------------------------------------------------------------------------------------------------------------
2135
+;avx512 chroma_vss code start
2136
+;-------------------------------------------------------------------------------------------------------------
2137
; Filter 4 rows of an 8-wide chroma block, int16 in -> int16 out (ss):
; 4-tap vertical filter using word multiplies (pmaddwd) and a plain >>6.
; Expects m8/m9 = word tap pairs, r6 = 3*r1, r7 = 3*r3 (strides already in
; bytes for 16-bit samples).  Clobbers r5 as a row-4 pointer.
%macro PROCESS_CHROMA_VERT_SS_8x4_AVX512 0
    lea                   r5,                 [r0 + 4 * r1]
    ; Pack rows 0..3 into m1 (one 16-byte row per 128-bit lane) and rows 1..4 into m3.
    movu                  xm1,                [r0]
    movu                  xm3,                [r0 + r1]
    vinserti32x4          m1,                 [r0 + r1],           1
    vinserti32x4          m3,                 [r0 + 2 * r1],       1
    vinserti32x4          m1,                 [r0 + 2 * r1],       2
    vinserti32x4          m3,                 [r0 + r6],           2
    vinserti32x4          m1,                 [r0 + r6],           3
    vinserti32x4          m3,                 [r0 + 4 * r1],       3

    punpcklwd             m0,                 m1,                  m3
    pmaddwd               m0,                 m8                   ; taps 0/1, 32-bit accumulation
    punpckhwd             m1,                 m3
    pmaddwd               m1,                 m8

    ; Rows 2..5 / 3..6 for taps 2/3.
    movu                  xm4,                [r0 + 2 * r1]
    movu                  xm5,                [r0 + r6]
    vinserti32x4          m4,                 [r0 + r6],           1
    vinserti32x4          m5,                 [r5],                1
    vinserti32x4          m4,                 [r5],                2
    vinserti32x4          m5,                 [r5 + r1],           2
    vinserti32x4          m4,                 [r5 + r1],           3
    vinserti32x4          m5,                 [r5 + 2 * r1],       3

    punpcklwd             m3,                 m4,                  m5
    pmaddwd               m3,                 m9
    punpckhwd             m4,                 m5
    pmaddwd               m4,                 m9

    paddd                 m0,                 m3
    paddd                 m1,                 m4

    psrad                 m0,                 6                    ; ss: truncating shift, no rounding offset
    psrad                 m1,                 6
    packssdw              m0,                 m1
    movu                  [r2],               xm0
    vextracti32x4         [r2 + r3],          m0,                  1
    vextracti32x4         [r2 + 2 * r3],      m0,                  2
    vextracti32x4         [r2 + r7],          m0,                  3
%endmacro
2178
+
2179
;-----------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
; Emit interp_4tap_vert_ss_8x<%1>: int16->int16 4-tap vertical chroma filter,
; 8 wide.  %1 = height (multiple of 4).
%macro FILTER_VER_SS_CHROMA_8xN_AVX512 1
INIT_ZMM avx512
cglobal interp_4tap_vert_ss_8x%1, 5, 8, 10
    add                   r1d,                r1d         ; strides are element counts: convert to bytes (16-bit samples)
    add                   r3d,                r3d
    sub                   r0,                 r1          ; window starts one row above
    shl                   r4d,                7           ; coeffIdx * 2*mmsize
%ifdef PIC
    lea                   r5,                 [pw_ChromaCoeffVer_32_avx512]   ; word-coefficient table for ss/sp paths
    mova                  m8,                 [r5 + r4]
    mova                  m9,                 [r5 + r4 + mmsize]
%else
    lea                   r5,                 [pw_ChromaCoeffVer_32_avx512 + r4]
    mova                  m8,                 [r5]
    mova                  m9,                 [r5 + mmsize]
%endif
    lea                   r6,                 [3 * r1]
    lea                   r7,                 [3 * r3]

%rep %1/4 - 1
    PROCESS_CHROMA_VERT_SS_8x4_AVX512
    lea                   r0,                 [r0 + 4 * r1]
    lea                   r2,                 [r2 + 4 * r3]
%endrep
    PROCESS_CHROMA_VERT_SS_8x4_AVX512
    RET
%endmacro
2209
+
2210
; Instantiate all 8-wide ss chroma heights (64-bit only).
%if ARCH_X86_64
    FILTER_VER_SS_CHROMA_8xN_AVX512 4
    FILTER_VER_SS_CHROMA_8xN_AVX512 8
    FILTER_VER_SS_CHROMA_8xN_AVX512 12
    FILTER_VER_SS_CHROMA_8xN_AVX512 16
    FILTER_VER_SS_CHROMA_8xN_AVX512 32
    FILTER_VER_SS_CHROMA_8xN_AVX512 64
%endif
2218
+
2219
; Filter 4 rows of a 16-wide chroma block from int16 input (4 taps, pmaddwd).
; %1 = sp (int16 -> pixel: add pd_526336, >>12, clamp to 8-bit) or
;      ss (int16 -> int16: plain >>6).
; Expects m7/m8 = word tap pairs, r4 = 3*r1, r5 = 3*r3; sp additionally
; m9 = rounding constant and m10 = qword permute for the packed store.
; Clobbers r6 as a row-2 pointer; does not advance r0.
%macro PROCESS_CHROMA_VERT_S_16x4_AVX512 1
    movu                  ym1,                [r0]
    lea                   r6,                 [r0 + 2 * r1]
    vinserti32x8          m1,                 [r6],                1    ; rows 0/2
    movu                  ym3,                [r0 + r1]
    vinserti32x8          m3,                 [r6 + r1],           1    ; rows 1/3
    punpcklwd             m0,                 m1,                  m3
    pmaddwd               m0,                 m7                        ; taps 0/1
    punpckhwd             m1,                 m3
    pmaddwd               m1,                 m7

    movu                  ym4,                [r0 + 2 * r1]
    vinserti32x8          m4,                 [r6 + 2 * r1],       1    ; rows 2/4
    punpcklwd             m2,                 m3,                  m4
    pmaddwd               m2,                 m7
    punpckhwd             m3,                 m4
    pmaddwd               m3,                 m7

    movu                  ym5,                [r0 + r4]
    vinserti32x8          m5,                 [r6 + r4],           1    ; rows 3/5
    punpcklwd             m6,                 m4,                  m5
    pmaddwd               m6,                 m8                        ; taps 2/3
    paddd                 m0,                 m6
    punpckhwd             m4,                 m5
    pmaddwd               m4,                 m8
    paddd                 m1,                 m4

    movu                  ym4,                [r0 + 4 * r1]
    vinserti32x8          m4,                 [r6 + 4 * r1],       1    ; rows 4/6
    punpcklwd             m6,                 m5,                  m4
    pmaddwd               m6,                 m8
    paddd                 m2,                 m6
    punpckhwd             m5,                 m4
    pmaddwd               m5,                 m8
    paddd                 m3,                 m5

%ifidn %1, sp
    ; sp: round, shift by 12, saturate to 8-bit pixels.
    paddd                m0,                  m9
    paddd                m1,                  m9
    paddd                m2,                  m9
    paddd                m3,                  m9

    psrad                m0,                  12
    psrad                m1,                  12
    psrad                m2,                  12
    psrad                m3,                  12

    packssdw             m0,                  m1
    packssdw             m2,                  m3
    packuswb             m0,                  m2
    vpermq               m0,                  m10,                 m0   ; undo pack interleaving into row order
    movu                 [r2],                xm0
    vextracti32x4        [r2 + r3],           m0,                  2
    vextracti32x4        [r2 + 2 * r3],       m0,                  1
    vextracti32x4        [r2 + r5],           m0,                  3
%else
    ; ss: truncating >>6, keep 16-bit.
    psrad                 m0,                 6
    psrad                 m1,                 6
    psrad                 m2,                 6
    psrad                 m3,                 6
    packssdw              m0,                 m1
    packssdw              m2,                 m3

    movu                  [r2],               ym0
    movu                  [r2 + r3],          ym2
    vextracti32x8         [r2 + 2 * r3],      m0,                  1
    vextracti32x8         [r2 + r5],          m2,                  1
%endif
%endmacro
2288
+
2289
+;-----------------------------------------------------------------------------------------------------------------
2290
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
2291
+;-----------------------------------------------------------------------------------------------------------------
2292
+; Vertical 4-tap chroma interpolation entry point for 16xN blocks (AVX-512).
+; %1 = variant: 'ss' (16-bit in, 16-bit out) or 'sp' (16-bit in, 8-bit pel out).
+; %2 = block height N.  r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride,
+; r4m = coeffIdx.  Row work is done by PROCESS_CHROMA_VERT_S_16x4_AVX512.
+%macro FILTER_VER_S_CHROMA_16xN_AVX512 2
2293
+INIT_ZMM avx512
2294
+cglobal interp_4tap_vert_%1_16x%2, 4, 7, 11
2295
+    mov                   r4d,                r4m
2296
+    ; coeffIdx * 128: each coefficient set spans two 64-byte (mmsize) rows
+    shl                   r4d,                7
2297
+
2298
+%ifdef PIC
2299
+    ; PIC build: take the table address via a register (RIP-relative lea)
+    lea                   r5,                 [pw_ChromaCoeffVer_32_avx512]
2300
+    mova                  m7,                 [r5 + r4]
2301
+    mova                  m8,                 [r5 + r4 + mmsize]
2302
+%else
2303
+    mova                  m7,                 [pw_ChromaCoeffVer_32_avx512 + r4]
2304
+    mova                  m8,                 [pw_ChromaCoeffVer_32_avx512 + r4 + mmsize]
2305
+%endif
2306
+
2307
+%ifidn %1, sp
2308
+    ; sp only: m9 = rounding offset added before >>12, m10 = store permute table
+    vbroadcasti32x4       m9,                 [pd_526336]
2309
+    mova                  m10,                [interp8_vsp_store_avx512]
2310
+%else
2311
+    ; ss only: dst holds 16-bit samples, so double the destination stride
+    add                   r3d,                r3d
2312
+%endif
2313
+    add                   r1d,                r1d                    ; src is 16-bit: stride in bytes
2314
+    sub                   r0,                 r1                     ; back up one row for the 4-tap window
2315
+    lea                   r4,                 [r1 * 3]
2316
+    lea                   r5,                 [r3 * 3]
2317
+
2318
+%rep %2/4 - 1
2319
+    PROCESS_CHROMA_VERT_S_16x4_AVX512 %1
2320
+    lea                   r0,                 [r0 + 4 * r1]
2321
+    lea                   r2,                 [r2 + 4 * r3]
2322
+%endrep
2323
+    PROCESS_CHROMA_VERT_S_16x4_AVX512 %1
2324
+    RET
2325
+%endmacro
2326
+
2327
+; Instantiations; x86-64 only (the kernels use more registers than x86-32 offers)
+%if ARCH_X86_64
2328
+    FILTER_VER_S_CHROMA_16xN_AVX512 ss, 4
2329
+    FILTER_VER_S_CHROMA_16xN_AVX512 ss, 8
2330
+    FILTER_VER_S_CHROMA_16xN_AVX512 ss, 12
2331
+    FILTER_VER_S_CHROMA_16xN_AVX512 ss, 16
2332
+    FILTER_VER_S_CHROMA_16xN_AVX512 ss, 24
2333
+    FILTER_VER_S_CHROMA_16xN_AVX512 ss, 32
2334
+    FILTER_VER_S_CHROMA_16xN_AVX512 ss, 64
2335
+    FILTER_VER_S_CHROMA_16xN_AVX512 sp, 4
2336
+    FILTER_VER_S_CHROMA_16xN_AVX512 sp, 8
2337
+    FILTER_VER_S_CHROMA_16xN_AVX512 sp, 12
2338
+    FILTER_VER_S_CHROMA_16xN_AVX512 sp, 16
2339
+    FILTER_VER_S_CHROMA_16xN_AVX512 sp, 24
2340
+    FILTER_VER_S_CHROMA_16xN_AVX512 sp, 32
2341
+    FILTER_VER_S_CHROMA_16xN_AVX512 sp, 64
2342
+%endif
2343
+
2344
+; One 24x8 tile of the 4-tap vertical ss (16-bit in, 16-bit out) chroma filter.
+; Columns 0-15 are filtered for 8 rows using zmm pairs (two 16-wide row groups
+; per register); columns 16-23 (byte offset mmsize/2 into each row) are handled
+; afterwards with four xmm lanes stacked into one zmm.
+; Inputs: m16/m17 = coefficient pairs, r10 = 3*r1, r7 = 3*r3.
+; Clobbers r6/r8/r9/r11 as row pointers and leaves r2 advanced by 4*r3
+; (the caller adds another 4*r3 to step the full 8 rows).
+%macro PROCESS_CHROMA_VERT_SS_24x8_AVX512 0
2344
+    movu                  ym1,                [r0]
2345
+    ; r6/r8/r9 = rows +2, +4, +6 relative to r0
+    lea                   r6,                 [r0 + 2 * r1]
2346
+    lea                   r8,                 [r0 + 4 * r1]
2347
+    lea                   r9,                 [r8 + 2 * r1]
2348
+
2349
+    movu                  ym10,               [r8]
2350
+    movu                  ym3,                [r0 + r1]
2351
+    movu                  ym12,               [r8 + r1]
2352
+    vinserti32x8          m1,                 [r6],                1
2353
+    vinserti32x8          m10,                [r9],                1
2354
+    vinserti32x8          m3,                 [r6 + r1],           1
2355
+    vinserti32x8          m12,                [r9 + r1],           1
2356
+
2357
+    ; interleave adjacent rows and multiply-accumulate with the first tap pair
+    punpcklwd             m0,                 m1,                  m3
2358
+    punpcklwd             m9,                 m10,                 m12
2359
+    pmaddwd               m0,                 m16
2360
+    pmaddwd               m9,                 m16
2361
+    punpckhwd             m1,                 m3
2362
+    punpckhwd             m10,                m12
2363
+    pmaddwd               m1,                 m16
2364
+    pmaddwd               m10,                m16
2365
+
2366
+    movu                  ym4,                [r0 + 2 * r1]
2367
+    movu                  ym13,               [r8 + 2 * r1]
2368
+    vinserti32x8          m4,                 [r6 + 2 * r1],       1
2369
+    vinserti32x8          m13,                [r9 + 2 * r1],       1
2370
+    punpcklwd             m2,                 m3,                  m4
2371
+    punpcklwd             m11,                m12,                 m13
2372
+    pmaddwd               m2,                 m16
2373
+    pmaddwd               m11,                m16
2374
+    punpckhwd             m3,                 m4
2375
+    punpckhwd             m12,                m13
2376
+    pmaddwd               m3,                 m16
2377
+    pmaddwd               m12,                m16
2378
+
2379
+    ; second tap pair (m17) accumulated from rows +3 and +4
+    movu                  ym5,                [r0 + r10]
2380
+    vinserti32x8          m5,                 [r6 + r10],          1
2381
+    movu                  ym14,               [r8 + r10]
2382
+    vinserti32x8          m14,                [r9 + r10],          1
2383
+    punpcklwd             m6,                 m4,                  m5
2384
+    punpcklwd             m15,                m13,                 m14
2385
+    pmaddwd               m6,                 m17
2386
+    pmaddwd               m15,                m17
2387
+    paddd                 m0,                 m6
2388
+    paddd                 m9,                 m15
2389
+    punpckhwd             m4,                 m5
2390
+    punpckhwd             m13,                m14
2391
+    pmaddwd               m4,                 m17
2392
+    pmaddwd               m13,                m17
2393
+    paddd                 m1,                 m4
2394
+    paddd                 m10,                m13
2395
+
2396
+    movu                  ym4,                [r0 + 4 * r1]
2397
+    vinserti32x8          m4,                 [r6 + 4 * r1],       1
2398
+    movu                  ym13,               [r8 + 4 * r1]
2399
+    vinserti32x8          m13,                [r9 + 4 * r1],       1
2400
+    punpcklwd             m6,                 m5,                  m4
2401
+    punpcklwd             m15,                m14,                 m13
2402
+    pmaddwd               m6,                 m17
2403
+    pmaddwd               m15,                m17
2404
+    paddd                 m2,                 m6
2405
+    paddd                 m11,                m15
2406
+    punpckhwd             m5,                 m4
2407
+    punpckhwd             m14,                m13
2408
+    pmaddwd               m5,                 m17
2409
+    pmaddwd               m14,                m17
2410
+    paddd                 m3,                 m5
2411
+    paddd                 m12,                m14
2412
+
2413
+    ; ss path: scale the 32-bit sums back to 16-bit intermediates (>>6)
+    psrad                 m0,                 6
2414
+    psrad                 m1,                 6
2415
+    psrad                 m2,                 6
2416
+    psrad                 m3,                 6
2417
+    psrad                 m9,                 6
2418
+    psrad                 m10,                6
2419
+    psrad                 m11,                6
2420
+    psrad                 m12,                6
2421
+
2422
+    packssdw              m0,                 m1
2423
+    packssdw              m2,                 m3
2424
+    packssdw              m9,                 m10
2425
+    packssdw              m11,                m12
2426
+
2427
+    ; store the 16-column half for rows 0-7
+    movu                  [r2],               ym0
2428
+    movu                  [r2 + r3],          ym2
2429
+    vextracti32x8         [r2 + 2 * r3],      m0,                  1
2430
+    vextracti32x8         [r2 + r7],          m2,                  1
2431
+    lea                   r11,                [r2 + 4 * r3]
2432
+    movu                  [r11],              ym9
2433
+    movu                  [r11 + r3],         ym11
2434
+    vextracti32x8         [r11 + 2 * r3],     m9,                  1
2435
+    vextracti32x8         [r11 + r7],         m11,                 1
2436
+
2437
+    ; remaining 8 columns (offset mmsize/2 bytes = 16 shorts): four xmm rows per zmm
+    movu                  xm1,                [r0 + mmsize/2]
2438
+    vinserti32x4          m1,                 [r6 + mmsize/2],                1
2439
+    vinserti32x4          m1,                 [r8 + mmsize/2],                2
2440
+    vinserti32x4          m1,                 [r9 + mmsize/2],                3
2441
+    movu                  xm3,                [r0 + r1 + mmsize/2]
2442
+    vinserti32x4          m3,                 [r6 + r1 + mmsize/2],           1
2443
+    vinserti32x4          m3,                 [r8 + r1 + mmsize/2],           2
2444
+    vinserti32x4          m3,                 [r9 + r1 + mmsize/2],           3
2445
+    punpcklwd             m0,                 m1,                             m3
2446
+    pmaddwd               m0,                 m16
2447
+    punpckhwd             m1,                 m3
2448
+    pmaddwd               m1,                 m16
2449
+
2450
+    movu                  xm4,                [r0 + 2 * r1 + mmsize/2]
2451
+    vinserti32x4          m4,                 [r6 + 2 * r1 + mmsize/2],       1
2452
+    vinserti32x4          m4,                 [r8 + 2 * r1 + mmsize/2],       2
2453
+    vinserti32x4          m4,                 [r9 + 2 * r1 + mmsize/2],       3
2454
+    punpcklwd             m2,                 m3,                             m4
2455
+    pmaddwd               m2,                 m16
2456
+    punpckhwd             m3,                 m4
2457
+    pmaddwd               m3,                 m16
2458
+
2459
+    movu                  xm5,                [r0 + r10 + mmsize/2]
2460
+    vinserti32x4          m5,                 [r6 + r10 + mmsize/2],          1
2461
+    vinserti32x4          m5,                 [r8 + r10 + mmsize/2],          2
2462
+    vinserti32x4          m5,                 [r9 + r10 + mmsize/2],          3
2463
+    punpcklwd             m6,                 m4,                             m5
2464
+    pmaddwd               m6,                 m17
2465
+    paddd                 m0,                 m6
2466
+    punpckhwd             m4,                 m5
2467
+    pmaddwd               m4,                 m17
2468
+    paddd                 m1,                 m4
2469
+
2470
+    movu                  xm4,                [r0 + 4 * r1 + mmsize/2]
2471
+    vinserti32x4          m4,                 [r6 + 4 * r1 + mmsize/2],       1
2472
+    vinserti32x4          m4,                 [r8 + 4 * r1 + mmsize/2],       2
2473
+    vinserti32x4          m4,                 [r9 + 4 * r1 + mmsize/2],       3
2474
+    punpcklwd             m6,                 m5,                             m4
2475
+    pmaddwd               m6,                 m17
2476
+    paddd                 m2,                 m6
2477
+    punpckhwd             m5,                 m4
2478
+    pmaddwd               m5,                 m17
2479
+    paddd                 m3,                 m5
2480
+
2481
+    psrad                 m0,                 6
2482
+    psrad                 m1,                 6
2483
+    psrad                 m2,                 6
2484
+    psrad                 m3,                 6
2485
+
2486
+    packssdw              m0,                 m1
2487
+    packssdw              m2,                 m3
2488
+
2489
+    ; store the last 8 columns for rows 0-3, then advance r2 for rows 4-7
+    movu                  [r2 + mmsize/2],               xm0
2490
+    movu                  [r2 + r3 + mmsize/2],          xm2
2491
+    vextracti32x4         [r2 + 2 * r3 + mmsize/2],      m0,                  1
2492
+    vextracti32x4         [r2 + r7 + mmsize/2],          m2,                  1
2493
+    lea                   r2,                            [r2 + 4 * r3]
2494
+    vextracti32x4         [r2 + mmsize/2],               m0,                  2
2495
+    vextracti32x4         [r2 + r3 + mmsize/2],          m2,                  2
2496
+    vextracti32x4         [r2 + 2 * r3 + mmsize/2],      m0,                  3
2497
+    vextracti32x4         [r2 + r7 + mmsize/2],          m2,                  3
2498
+%endmacro
2500
+
2501
+; Vertical 4-tap chroma ss filter entry point for 24xN blocks (AVX-512).
+; %1 = block height N (multiple of 8).  r0 = src, r1 = srcStride, r2 = dst,
+; r3 = dstStride, r4 = coeffIdx.  Uses 12 GPRs / 18 vector regs (x86-64 only).
+%macro FILTER_VER_SS_CHROMA_24xN_AVX512 1
2502
+INIT_ZMM avx512
2503
+cglobal interp_4tap_vert_ss_24x%1, 5, 12, 18
2504
+    add                   r1d,                r1d                    ; 16-bit samples: strides in bytes
2505
+    add                   r3d,                r3d
2506
+    sub                   r0,                 r1                     ; back up one row for the 4-tap window
2507
+    shl                   r4d,                7                      ; coeffIdx * 128 (two mmsize rows per set)
2508
+%ifdef PIC
2509
+    lea                   r5,                 [pw_ChromaCoeffVer_32_avx512]
2510
+    mova                  m16,                [r5 + r4]
2511
+    mova                  m17,                [r5 + r4 + mmsize]
2512
+%else
2513
+    lea                   r5,                 [pw_ChromaCoeffVer_32_avx512 + r4]
2514
+    mova                  m16,                [r5]
2515
+    mova                  m17,                [r5 + mmsize]
2516
+%endif
2517
+    lea                   r10,                [3 * r1]
2518
+    lea                   r7,                 [3 * r3]
2519
+%rep %1/8 - 1
2520
+    PROCESS_CHROMA_VERT_SS_24x8_AVX512
2521
+    ; r8 = old r0 + 4*r1 (set inside the macro), so this advances src by 8 rows;
+    ; r2 was already advanced 4 rows inside the macro, this adds the other 4
+    lea                   r0,                 [r8 + 4 * r1]
2522
+    lea                   r2,                 [r2 + 4 * r3]
2523
+%endrep
2524
+    PROCESS_CHROMA_VERT_SS_24x8_AVX512
2525
+    RET
2526
+%endmacro
2527
+
2528
+%if ARCH_X86_64
2529
+    FILTER_VER_SS_CHROMA_24xN_AVX512 32
2530
+    FILTER_VER_SS_CHROMA_24xN_AVX512 64
2531
+%endif
2532
+; Two rows of 32 columns of the 4-tap vertical chroma filter (AVX-512).
+; %1 = 'ss' (store 16-bit, >>6) or 'sp' (round with m9, >>12, clamp to bytes).
+; Expects m7/m8 = coefficient pairs, r4 = 3*r1; sp additionally uses
+; m9 = rounding offset and m10 = permute table for the packed byte store.
+%macro PROCESS_CHROMA_VERT_S_32x2_AVX512 1
2533
+    movu                  m1,                 [r0]
2534
+    movu                  m3,                 [r0 + r1]
2535
+    ; interleave row pairs so pmaddwd applies one tap pair per 32-bit lane
+    punpcklwd             m0,                 m1,                  m3
2536
+    pmaddwd               m0,                 m7
2537
+    punpckhwd             m1,                 m3
2538
+    pmaddwd               m1,                 m7
2539
+    movu                  m4,                 [r0 + 2 * r1]
2540
+    punpcklwd             m2,                 m3,                  m4
2541
+    pmaddwd               m2,                 m7
2542
+    punpckhwd             m3,                 m4
2543
+    pmaddwd               m3,                 m7
2544
+    movu                  m5,                 [r0 + r4]
2545
+    punpcklwd             m6,                 m4,                  m5
2546
+    pmaddwd               m6,                 m8
2547
+    paddd                 m0,                 m6
2548
+    punpckhwd             m4,                 m5
2549
+    pmaddwd               m4,                 m8
2550
+    paddd                 m1,                 m4
2551
+    movu                  m4,                 [r0 + 4 * r1]
2552
+    punpcklwd             m6,                 m5,                  m4
2553
+    pmaddwd               m6,                 m8
2554
+    paddd                 m2,                 m6
2555
+    punpckhwd             m5,                 m4
2556
+    pmaddwd               m5,                 m8
2557
+    paddd                 m3,                 m5
2558
+%ifidn %1, sp
2559
+    ; sp: add rounding offset, shift to pel range, saturate to unsigned bytes
+    paddd                 m0,                 m9
2560
+    paddd                 m1,                 m9
2561
+    paddd                 m2,                 m9
2562
+    paddd                 m3,                 m9
2563
+
2564
+    psrad                 m0,                 12
2565
+    psrad                 m1,                 12
2566
+    psrad                 m2,                 12
2567
+    psrad                 m3,                 12
2568
+
2569
+    packssdw              m0,                 m1
2570
+    packssdw              m2,                 m3
2571
+    packuswb              m0,                 m2
2572
+    ; undo the lane interleaving introduced by pack* across 128-bit lanes
+    vpermq                m0,                 m10,                   m0
2573
+    movu                  [r2],               ym0
2574
+    vextracti32x8         [r2 + r3],          m0,                    1
2575
+%else
2576
+    ; ss: keep 16-bit intermediates, plain >>6
+    psrad                 m0,                 6
2577
+    psrad                 m1,                 6
2578
+    psrad                 m2,                 6
2579
+    psrad                 m3,                 6
2580
+    packssdw              m0,                 m1
2581
+    packssdw              m2,                 m3
2582
+    movu                  [r2],               m0
2583
+    movu                  [r2 + r3],          m2
2584
+%endif
2585
+%endmacro
2586
+
2587
+; Vertical 4-tap chroma filter entry point for 32xN blocks (AVX-512).
+; %1 = 'ss'/'sp' variant, %2 = block height N (even).
+; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4m = coeffIdx.
+%macro FILTER_VER_S_CHROMA_32xN_AVX512 2
2588
+INIT_ZMM avx512
2589
+cglobal interp_4tap_vert_%1_32x%2, 4, 6, 11
2590
+    mov               r4d,             r4m
2591
+    shl               r4d,             7                     ; coeffIdx * 128 (two mmsize rows per set)
2592
+%ifdef PIC
2593
+    lea               r5,              [pw_ChromaCoeffVer_32_avx512]
2594
+    mova              m7,              [r5 + r4]
2595
+    mova              m8,              [r5 + r4 + mmsize]
2596
+%else
2597
+    mova              m7,              [pw_ChromaCoeffVer_32_avx512 + r4]
2598
+    mova              m8,              [pw_ChromaCoeffVer_32_avx512 + r4 + mmsize]
2599
+%endif
2600
+%ifidn %1, sp
2601
+    ; sp only: rounding offset and byte-store permute table
+    vbroadcasti32x4   m9,              [pd_526336]
2602
+    mova              m10,             [interp8_vsp_store_avx512]
2603
+%else
2604
+    ; ss only: 16-bit output, double the destination stride
+    add               r3d,             r3d
2605
+%endif
2606
+    add               r1d,             r1d                    ; 16-bit source samples
2607
+    sub               r0,              r1                     ; back up one row for the 4-tap window
2608
+    lea               r4,              [r1 * 3]
2609
+    lea               r5,              [r3 * 3]
2610
+%rep %2/2 - 1
2611
+    PROCESS_CHROMA_VERT_S_32x2_AVX512 %1
2612
+    lea               r0,              [r0 + r1 * 2]
2613
+    lea               r2,              [r2 + r3 * 2]
2614
+%endrep
2615
+    PROCESS_CHROMA_VERT_S_32x2_AVX512 %1
2616
+    RET
2617
+%endmacro
2618
+
2619
+%if ARCH_X86_64
2620
+    FILTER_VER_S_CHROMA_32xN_AVX512 ss, 8
2621
+    FILTER_VER_S_CHROMA_32xN_AVX512 ss, 16
2622
+    FILTER_VER_S_CHROMA_32xN_AVX512 ss, 24
2623
+    FILTER_VER_S_CHROMA_32xN_AVX512 ss, 32
2624
+    FILTER_VER_S_CHROMA_32xN_AVX512 ss, 48
2625
+    FILTER_VER_S_CHROMA_32xN_AVX512 ss, 64
2626
+    FILTER_VER_S_CHROMA_32xN_AVX512 sp, 8
2627
+    FILTER_VER_S_CHROMA_32xN_AVX512 sp, 16
2628
+    FILTER_VER_S_CHROMA_32xN_AVX512 sp, 24
2629
+    FILTER_VER_S_CHROMA_32xN_AVX512 sp, 32
2630
+    FILTER_VER_S_CHROMA_32xN_AVX512 sp, 48
2631
+    FILTER_VER_S_CHROMA_32xN_AVX512 sp, 64
2632
+%endif
2633
+
2634
+; One 48x4 tile of the 4-tap vertical chroma filter (%1 = 'ss' or 'sp').
+; Layout: rows 0-1 of columns 0-31 via the 32x2 macro, then rows 2-3 of
+; columns 0-31 via r6 = r0 + 2*r1, then columns 32-47 for all four rows
+; (source offset mmsize bytes = 32 shorts) using ymm halves of one zmm.
+; Expects m7/m8 coeff pairs, r4 = 3*r1, r5 = 3*r3 (sp also m9/m10).
+%macro PROCESS_CHROMA_VERT_S_48x4_AVX512 1
2635
+    PROCESS_CHROMA_VERT_S_32x2_AVX512 %1
2636
+    lea                   r6,                 [r0 + 2 * r1]
2637
+
2638
+    ; rows 2-3 of the left 32 columns
+    movu                  m1,                 [r6]
2639
+    movu                  m3,                 [r6 + r1]
2640
+    punpcklwd             m0,                 m1,                  m3
2641
+    pmaddwd               m0,                 m7
2642
+    punpckhwd             m1,                 m3
2643
+    pmaddwd               m1,                 m7
2644
+    movu                  m4,                 [r6 + 2 * r1]
2645
+    punpcklwd             m2,                 m3,                  m4
2646
+    pmaddwd               m2,                 m7
2647
+    punpckhwd             m3,                 m4
2648
+    pmaddwd               m3,                 m7
2649
+
2650
+    movu                  m5,                 [r6 + r4]
2651
+    punpcklwd             m6,                 m4,                  m5
2652
+    pmaddwd               m6,                 m8
2653
+    paddd                 m0,                 m6
2654
+    punpckhwd             m4,                 m5
2655
+    pmaddwd               m4,                 m8
2656
+    paddd                 m1,                 m4
2657
+
2658
+    movu                  m4,                 [r6 + 4 * r1]
2659
+    punpcklwd             m6,                 m5,                  m4
2660
+    pmaddwd               m6,                 m8
2661
+    paddd                 m2,                 m6
2662
+    punpckhwd             m5,                 m4
2663
+    pmaddwd               m5,                 m8
2664
+    paddd                 m3,                 m5
2665
+
2666
+%ifidn %1, sp
2667
+    ; sp: round, >>12, clamp to bytes, restore lane order, store rows 2-3
+    paddd                 m0,                 m9
2668
+    paddd                 m1,                 m9
2669
+    paddd                 m2,                 m9
2670
+    paddd                 m3,                 m9
2671
+
2672
+    psrad                 m0,                 12
2673
+    psrad                 m1,                 12
2674
+    psrad                 m2,                 12
2675
+    psrad                 m3,                 12
2676
+
2677
+    packssdw              m0,                 m1
2678
+    packssdw              m2,                 m3
2679
+    packuswb              m0,                 m2
2680
+    vpermq                m0,                 m10,                   m0
2681
+    movu                  [r2 + 2 * r3],      ym0
2682
+    vextracti32x8         [r2 + r5],          m0,                    1
2683
+%else
2684
+    ; ss: >>6 and store 16-bit rows 2-3
+    psrad                 m0,                 6
2685
+    psrad                 m1,                 6
2686
+    psrad                 m2,                 6
2687
+    psrad                 m3,                 6
2688
+
2689
+    packssdw              m0,                 m1
2690
+    packssdw              m2,                 m3
2691
+    movu                  [r2 + 2 * r3],      m0
2692
+    movu                  [r2 + r5],          m2
2693
+%endif
2694
+
2695
+    ; columns 32-47, rows 0-3: two 16-wide rows per zmm (low = r0, high = r6)
+    movu                  ym1,                [r0 + mmsize]
2696
+    vinserti32x8          m1,                 [r6 + mmsize],                1
2697
+    movu                  ym3,                [r0 + r1 + mmsize]
2698
+    vinserti32x8          m3,                 [r6 + r1 + mmsize],           1
2699
+    punpcklwd             m0,                 m1,                  m3
2700
+    pmaddwd               m0,                 m7
2701
+    punpckhwd             m1,                 m3
2702
+    pmaddwd               m1,                 m7
2703
+
2704
+    movu                  ym4,                [r0 + 2 * r1 + mmsize]
2705
+    vinserti32x8          m4,                 [r6 + 2 * r1 + mmsize],       1
2706
+    punpcklwd             m2,                 m3,                  m4
2707
+    pmaddwd               m2,                 m7
2708
+    punpckhwd             m3,                 m4
2709
+    pmaddwd               m3,                 m7
2710
+
2711
+    movu                  ym5,                [r0 + r4 + mmsize]
2712
+    vinserti32x8          m5,                 [r6 + r4 + mmsize],           1
2713
+    punpcklwd             m6,                 m4,                  m5
2714
+    pmaddwd               m6,                 m8
2715
+    paddd                 m0,                 m6
2716
+    punpckhwd             m4,                 m5
2717
+    pmaddwd               m4,                 m8
2718
+    paddd                 m1,                 m4
2719
+
2720
+    movu                  ym4,                [r0 + 4 * r1 + mmsize]
2721
+    vinserti32x8          m4,                 [r6 + 4 * r1 + mmsize],       1
2722
+    punpcklwd             m6,                 m5,                  m4
2723
+    pmaddwd               m6,                 m8
2724
+    paddd                 m2,                 m6
2725
+    punpckhwd             m5,                 m4
2726
+    pmaddwd               m5,                 m8
2727
+    paddd                 m3,                 m5
2728
+
2729
+%ifidn %1, sp
2730
+    ; sp: byte output, columns 32-47 land at byte offset mmsize/2 in dst
+    paddd                m0,                  m9
2731
+    paddd                m1,                  m9
2732
+    paddd                m2,                  m9
2733
+    paddd                m3,                  m9
2734
+
2735
+    psrad                m0,                  12
2736
+    psrad                m1,                  12
2737
+    psrad                m2,                  12
2738
+    psrad                m3,                  12
2739
+
2740
+    packssdw             m0,                  m1
2741
+    packssdw             m2,                  m3
2742
+    packuswb             m0,                  m2
2743
+    vpermq               m0,                  m10,                 m0
2744
+    movu                 [r2 + mmsize/2],                xm0
2745
+    vextracti32x4        [r2 + r3 + mmsize/2],           m0,                  2
2746
+    vextracti32x4        [r2 + 2 * r3 + mmsize/2],       m0,                  1
2747
+    vextracti32x4        [r2 + r5 + mmsize/2],           m0,                  3
2748
+%else
2749
+    ; ss: 16-bit output, columns 32-47 land at byte offset mmsize in dst
+    psrad                 m0,                 6
2750
+    psrad                 m1,                 6
2751
+    psrad                 m2,                 6
2752
+    psrad                 m3,                 6
2753
+    packssdw              m0,                 m1
2754
+    packssdw              m2,                 m3
2755
+
2756
+    movu                  [r2 + mmsize],               ym0
2757
+    movu                  [r2 + r3 + mmsize],          ym2
2758
+    vextracti32x8         [r2 + 2 * r3 + mmsize],      m0,                  1
2759
+    vextracti32x8         [r2 + r5 + mmsize],          m2,                  1
2760
+%endif
2761
+%endmacro
2762
+
2763
+; Vertical 4-tap chroma filter entry point for the fixed 48x64 block size.
+; %1 = 'ss'/'sp' variant.  16 iterations of the 48x4 tile macro (15 in the
+; %rep plus one trailing call) cover all 64 rows.
+%macro FILTER_VER_S_CHROMA_48x64_AVX512 1
2764
+INIT_ZMM avx512
2765
+cglobal interp_4tap_vert_%1_48x64, 4, 7, 11
2766
+    mov                   r4d,                r4m
2767
+    shl                   r4d,                7                     ; coeffIdx * 128 (two mmsize rows per set)
2768
+
2769
+%ifdef PIC
2770
+    lea                   r5,                 [pw_ChromaCoeffVer_32_avx512]
2771
+    mova                  m7,                 [r5 + r4]
2772
+    mova                  m8,                 [r5 + r4 + mmsize]
2773
+%else
2774
+    mova                  m7,                 [pw_ChromaCoeffVer_32_avx512 + r4]
2775
+    mova                  m8,                 [pw_ChromaCoeffVer_32_avx512 + r4 + mmsize]
2776
+%endif
2777
+
2778
+%ifidn %1, sp
2779
+    ; sp only: rounding offset and byte-store permute table
+    vbroadcasti32x4       m9,                 [pd_526336]
2780
+    mova                  m10,                [interp8_vsp_store_avx512]
2781
+%else
2782
+    ; ss only: 16-bit output, double the destination stride
+    add                   r3d,                r3d
2783
+%endif
2784
+    add                   r1d,                r1d                    ; 16-bit source samples
2785
+    sub                   r0,                 r1                     ; back up one row for the 4-tap window
2786
+    lea                   r4,                 [r1 * 3]
2787
+    lea                   r5,                 [r3 * 3]
2788
+
2789
+%rep 15
2790
+    PROCESS_CHROMA_VERT_S_48x4_AVX512 %1
2791
+    lea                   r0,                 [r0 + 4 * r1]
2792
+    lea                   r2,                 [r2 + 4 * r3]
2793
+%endrep
2794
+    PROCESS_CHROMA_VERT_S_48x4_AVX512 %1
2795
+    RET
2796
+%endmacro
2797
+
2798
+%if ARCH_X86_64
2799
+    FILTER_VER_S_CHROMA_48x64_AVX512 ss
2800
+    FILTER_VER_S_CHROMA_48x64_AVX512 sp
2801
+%endif
2802
+
2803
+; Two rows of 64 columns of the 4-tap vertical chroma filter (%1 = 'ss'/'sp').
+; Columns 0-31 are handled by the 32x2 macro; this body filters columns 32-63
+; (source byte offset mmsize = 32 shorts) the same way.
+; Expects m7/m8 coeff pairs, r4 = 3*r1 (sp also m9 = round, m10 = permute).
+%macro PROCESS_CHROMA_VERT_S_64x2_AVX512 1
2804
+    PROCESS_CHROMA_VERT_S_32x2_AVX512 %1
2805
+    movu                  m1,                 [r0 + mmsize]
2806
+    movu                  m3,                 [r0 + r1 + mmsize]
2807
+    punpcklwd             m0,                 m1,                  m3
2808
+    pmaddwd               m0,                 m7
2809
+    punpckhwd             m1,                 m3
2810
+    pmaddwd               m1,                 m7
2811
+    movu                  m4,                 [r0 + 2 * r1 + mmsize]
2812
+    punpcklwd             m2,                 m3,                  m4
2813
+    pmaddwd               m2,                 m7
2814
+    punpckhwd             m3,                 m4
2815
+    pmaddwd               m3,                 m7
2816
+
2817
+    movu                  m5,                 [r0 + r4 + mmsize]
2818
+    punpcklwd             m6,                 m4,                  m5
2819
+    pmaddwd               m6,                 m8
2820
+    paddd                 m0,                 m6
2821
+    punpckhwd             m4,                 m5
2822
+    pmaddwd               m4,                 m8
2823
+    paddd                 m1,                 m4
2824
+
2825
+    movu                  m4,                 [r0 + 4 * r1 + mmsize]
2826
+    punpcklwd             m6,                 m5,                  m4
2827
+    pmaddwd               m6,                 m8
2828
+    paddd                 m2,                 m6
2829
+    punpckhwd             m5,                 m4
2830
+    pmaddwd               m5,                 m8
2831
+    paddd                 m3,                 m5
2832
+
2833
+%ifidn %1, sp
2834
+    ; sp: byte output, columns 32-63 land at byte offset mmsize/2 in dst
+    paddd                 m0,                 m9
2835
+    paddd                 m1,                 m9
2836
+    paddd                 m2,                 m9
2837
+    paddd                 m3,                 m9
2838
+
2839
+    psrad                 m0,                 12
2840
+    psrad                 m1,                 12
2841
+    psrad                 m2,                 12
2842
+    psrad                 m3,                 12
2843
+
2844
+    packssdw              m0,                 m1
2845
+    packssdw              m2,                 m3
2846
+    packuswb              m0,                 m2
2847
+    vpermq                m0,                 m10,                   m0
2848
+    movu                  [r2 + mmsize/2],    ym0
2849
+    vextracti32x8         [r2 + r3 + mmsize/2], m0,                    1
2850
+%else
2851
+    ; ss: 16-bit output, columns 32-63 land at byte offset mmsize in dst
+    psrad                 m0,                 6
2852
+    psrad                 m1,                 6
2853
+    psrad                 m2,                 6
2854
+    psrad                 m3,                 6
2855
+
2856
+    packssdw              m0,                 m1
2857
+    packssdw              m2,                 m3
2858
+    movu                  [r2 + mmsize],      m0
2859
+    movu                  [r2 + r3 + mmsize], m2
2860
+%endif
2861
+%endmacro
2862
+
2863
+; Vertical 4-tap chroma filter entry point for 64xN blocks (AVX-512).
+; %1 = 'ss'/'sp' variant, %2 = block height N (even).
+; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4m = coeffIdx.
+%macro FILTER_VER_S_CHROMA_64xN_AVX512 2
2865
+INIT_ZMM avx512
2866
+cglobal interp_4tap_vert_%1_64x%2, 4, 6, 11
2867
+    mov               r4d,             r4m
2868
+    shl               r4d,             7                     ; coeffIdx * 128 (two mmsize rows per set)
2869
+%ifdef PIC
2870
+    lea               r5,              [pw_ChromaCoeffVer_32_avx512]
2871
+    mova              m7,              [r5 + r4]
2872
+    mova              m8,              [r5 + r4 + mmsize]
2873
+%else
2874
+    mova              m7,              [pw_ChromaCoeffVer_32_avx512 + r4]
2875
+    mova              m8,              [pw_ChromaCoeffVer_32_avx512 + r4 + mmsize]
2876
+%endif
2877
+
2878
+%ifidn %1, sp
2879
+    ; sp only: rounding offset and byte-store permute table
+    vbroadcasti32x4   m9,              [pd_526336]
2880
+    mova              m10,             [interp8_vsp_store_avx512]
2881
+%else
2882
+    ; ss only: 16-bit output, double the destination stride
+    add               r3d,             r3d
2883
+%endif
2884
+    add               r1d,             r1d                    ; 16-bit source samples
2885
+    sub               r0,              r1                     ; back up one row for the 4-tap window
2886
+    lea               r4,              [r1 * 3]
2887
+    lea               r5,              [r3 * 3]
2888
+
2889
+%rep %2/2 - 1
2890
+    PROCESS_CHROMA_VERT_S_64x2_AVX512 %1
2891
+    lea               r0,              [r0 + r1 * 2]
2892
+    lea               r2,              [r2 + r3 * 2]
2893
+%endrep
2894
+    PROCESS_CHROMA_VERT_S_64x2_AVX512 %1
2895
+    RET
2896
+%endmacro
2897
+
2898
+%if ARCH_X86_64
2899
+    FILTER_VER_S_CHROMA_64xN_AVX512 ss, 16
2900
+    FILTER_VER_S_CHROMA_64xN_AVX512 ss, 32
2901
+    FILTER_VER_S_CHROMA_64xN_AVX512 ss, 48
2902
+    FILTER_VER_S_CHROMA_64xN_AVX512 ss, 64
2903
+    FILTER_VER_S_CHROMA_64xN_AVX512 sp, 16
2904
+    FILTER_VER_S_CHROMA_64xN_AVX512 sp, 32
2905
+    FILTER_VER_S_CHROMA_64xN_AVX512 sp, 48
2906
+    FILTER_VER_S_CHROMA_64xN_AVX512 sp, 64
2907
+%endif
2907
+;-------------------------------------------------------------------------------------------------------------
2908
+;avx512 chroma_vss code end
2909
+;-------------------------------------------------------------------------------------------------------------
2910
+;-------------------------------------------------------------------------------------------------------------
2911
+;ipfilter_chroma_avx512 code end
2912
+;-------------------------------------------------------------------------------------------------------------
2913
+;-------------------------------------------------------------------------------------------------------------
2914
+;ipfilter_luma_avx512 code start
2915
+;-------------------------------------------------------------------------------------------------------------
2916
+; One row of 64 pixels of the horizontal 8-tap luma pp filter (8-bit in/out).
+; pmaddubsw consumes byte pixels pre-shuffled by m2/m3/m4; the two partial
+; products are merged via pmaddwd against pw_1, then pmulhrsw by pw_512
+; performs the rounded >>6 before the final unsigned-byte pack.
+%macro PROCESS_IPFILTER_LUMA_PP_64x1_AVX512 0
2917
+    ; register map
2918
+    ; m0 , m1 interpolate coeff
2919
+    ; m2 , m3, m4  shuffle order table
2920
+    ; m5 - pw_1
2921
+    ; m6 - pw_512
2922
+
2923
+    movu              m7,        [r0]
2924
+    ; second load 8 bytes in covers the upper taps of the 8-tap window
+    movu              m9,        [r0 + 8]
2925
+
2926
+    pshufb            m8,        m7,        m3
2927
+    pshufb            m7,        m2
2928
+    pshufb            m10,       m9,        m3
2929
+    pshufb            m11,       m9,        m4
2930
+    pshufb            m9,        m2
2931
+
2932
+
2933
+    ; each output group: low-tap and high-tap products summed via pmaddwd/pw_1
+    pmaddubsw         m7,        m0
2934
+    pmaddubsw         m12,       m8,        m1
2935
+    pmaddwd           m7,        m5
2936
+    pmaddwd           m12,       m5
2937
+    paddd             m7,        m12
2938
+
2939
+    pmaddubsw         m8,        m0
2940
+    pmaddubsw         m12,       m9,        m1
2941
+    pmaddwd           m8,        m5
2942
+    pmaddwd           m12,       m5
2943
+    paddd             m8,        m12
2944
+
2945
+    pmaddubsw         m9,        m0
2946
+    pmaddubsw         m12,       m10,       m1
2947
+    pmaddwd           m9,        m5
2948
+    pmaddwd           m12,       m5
2949
+    paddd             m9,        m12
2950
+
2951
+    pmaddubsw         m10,       m0
2952
+    pmaddubsw         m12,      m11,        m1
2953
+    pmaddwd           m10,      m5
2954
+    pmaddwd           m12,      m5
2955
+    paddd             m10,      m12
2956
+
2957
+    ; narrow to words, rounded scale by 512/32768 (= >>6), pack to 64 bytes
+    packssdw          m7,       m8
2958
+    packssdw          m9,       m10
2959
+    pmulhrsw          m7,       m6
2960
+    pmulhrsw          m9,       m6
2961
+    packuswb          m7,       m9
2962
+    movu              [r2],     m7
2963
+%endmacro
2964
+
2965
; Filter two rows of 32 luma pixels (pp): each zmm holds row 0 in its low
; 256 bits and row 1 ([r0 + r1]) in its high 256 bits, so one pass of the
; 8-tap pipeline produces both output rows.
%macro PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 0
    ; register map
    ; m0 , m1 interpolate coeff
    ; m2 , m3, m4  shuffle order table
    ; m5 - pw_1
    ; m6 - pw_512

    movu             ym7,        [r0]
    vinserti32x8      m7,        [r0 + r1], 1
    movu             ym9,        [r0 + 8]
    vinserti32x8      m9,        [r0 + r1 + 8], 1

    pshufb            m8,        m7,        m3
    pshufb            m7,        m2
    pshufb            m10,       m9,        m3
    pshufb            m11,       m9,        m4
    pshufb            m9,        m2

    ; 8-tap multiply-accumulate: m0/m1 = coefficient halves, pw_1 folds
    ; word pairs into dwords
    pmaddubsw         m7,        m0
    pmaddubsw         m12,       m8,        m1
    pmaddwd           m7,        m5
    pmaddwd           m12,       m5
    paddd             m7,        m12

    pmaddubsw         m8,        m0
    pmaddubsw         m12,       m9,        m1
    pmaddwd           m8,        m5
    pmaddwd           m12,       m5
    paddd             m8,        m12

    pmaddubsw         m9,        m0
    pmaddubsw         m12,       m10,       m1
    pmaddwd           m9,        m5
    pmaddwd           m12,       m5
    paddd             m9,        m12

    pmaddubsw         m10,       m0
    pmaddubsw         m12,      m11,        m1
    pmaddwd           m10,      m5
    pmaddwd           m12,      m5
    paddd             m10,      m12

    ; round, pack to bytes, then split the two rows back out of the zmm
    packssdw          m7,       m8
    packssdw          m9,       m10
    pmulhrsw          m7,       m6
    pmulhrsw          m9,       m6
    packuswb          m7,       m9
    movu              [r2],     ym7
    vextracti32x8     [r2 + r3], m7, 1
%endmacro
3015
+
3016
; Filter four rows of 16 luma pixels (pp): one row per 128-bit lane of the
; zmm. Caller must preload r6 = 3 * srcStride and r7 = 3 * dstStride.
%macro PROCESS_IPFILTER_LUMA_PP_16x4_AVX512 0
    ; register map
    ; m0 , m1 interpolate coeff
    ; m2 , m3, m4  shuffle order table
    ; m5 - pw_1
    ; m6 - pw_512

    movu             xm7,        [r0]
    vinserti32x4      m7,        [r0 + r1],          1
    vinserti32x4      m7,        [r0 + 2 * r1],      2
    vinserti32x4      m7,        [r0 + r6],          3

    pshufb            m8,        m7,        m3
    pshufb            m7,        m2

    ; second load covers the 8-byte kernel margin of each row
    movu             xm9,        [r0 + 8]
    vinserti32x4      m9,        [r0 + r1 + 8],      1
    vinserti32x4      m9,        [r0 + 2 * r1 + 8],  2
    vinserti32x4      m9,        [r0 + r6 + 8],      3

    pshufb            m10,       m9,        m3
    pshufb            m11,       m9,        m4
    pshufb            m9,        m2

    ; 8-tap multiply-accumulate (m0/m1 = coefficient halves)
    pmaddubsw         m7,        m0
    pmaddubsw         m12,       m8,        m1
    pmaddwd           m7,        m5
    pmaddwd           m12,       m5
    paddd             m7,        m12

    pmaddubsw         m8,        m0
    pmaddubsw         m12,       m9,        m1
    pmaddwd           m8,        m5
    pmaddwd           m12,       m5
    paddd             m8,        m12

    pmaddubsw         m9,        m0
    pmaddubsw         m12,       m10,       m1
    pmaddwd           m9,        m5
    pmaddwd           m12,       m5
    paddd             m9,        m12

    pmaddubsw         m10,       m0
    pmaddubsw         m12,      m11,        m1
    pmaddwd           m10,      m5
    pmaddwd           m12,      m5
    paddd             m10,      m12

    ; round, pack, and scatter the four rows to their destinations
    packssdw          m7,       m8
    packssdw          m9,       m10
    pmulhrsw          m7,       m6
    pmulhrsw          m9,       m6
    packuswb          m7,       m9
    movu              [r2],         xm7
    vextracti32x4     [r2 + r3],     m7,    1
    vextracti32x4     [r2 + 2 * r3], m7,    2
    vextracti32x4     [r2 + r7],     m7,    3
%endmacro
3074
+
3075
; Filter four rows of 48 luma pixels (pp) in three passes:
;   1) left 32 pixels of rows 0-1 (zmm = two 256-bit row halves),
;   2) left 32 pixels of rows 2-3,
;   3) right 16 pixels of all four rows (one row per 128-bit lane,
;      source offset mmsize/2 = 32 bytes, margin load at +40).
; Caller must preload r6 = 3 * srcStride and r7 = 3 * dstStride.
%macro PROCESS_IPFILTER_LUMA_PP_48x4_AVX512 0
    ; register map
    ; m0 , m1 interpolate coeff
    ; m2 , m3, m4  shuffle order table
    ; m5 - pw_1
    ; m6 - pw_512

    ; --- pass 1: rows 0-1, columns 0-31 ---
    movu             ym7,        [r0]
    vinserti32x8      m7,        [r0 + r1], 1
    movu             ym9,        [r0 + 8]
    vinserti32x8      m9,        [r0 + r1 + 8], 1

    pshufb            m8,        m7,        m3
    pshufb            m7,        m2
    pshufb            m10,       m9,        m3
    pshufb            m11,       m9,        m4
    pshufb            m9,        m2

    pmaddubsw         m7,        m0
    pmaddubsw         m12,       m8,        m1
    pmaddwd           m7,        m5
    pmaddwd           m12,       m5
    paddd             m7,        m12

    pmaddubsw         m8,        m0
    pmaddubsw         m12,       m9,        m1
    pmaddwd           m8,        m5
    pmaddwd           m12,       m5
    paddd             m8,        m12

    pmaddubsw         m9,        m0
    pmaddubsw         m12,       m10,       m1
    pmaddwd           m9,        m5
    pmaddwd           m12,       m5
    paddd             m9,        m12

    pmaddubsw         m10,       m0
    pmaddubsw         m12,      m11,        m1
    pmaddwd           m10,      m5
    pmaddwd           m12,      m5
    paddd             m10,      m12

    packssdw          m7,       m8
    packssdw          m9,       m10
    pmulhrsw          m7,       m6
    pmulhrsw          m9,       m6
    packuswb          m7,       m9
    movu              [r2],     ym7
    vextracti32x8     [r2 + r3], m7, 1

    ; --- pass 2: rows 2-3, columns 0-31 ---
    movu             ym7,        [r0 + 2 * r1]
    vinserti32x8      m7,        [r0 + r6],          1
    movu             ym9,        [r0 + 2 * r1 + 8]
    vinserti32x8      m9,        [r0 + r6 + 8],      1

    pshufb            m8,        m7,        m3
    pshufb            m7,        m2
    pshufb            m10,       m9,        m3
    pshufb            m11,       m9,        m4
    pshufb            m9,        m2

    pmaddubsw         m7,        m0
    pmaddubsw         m12,       m8,        m1
    pmaddwd           m7,        m5
    pmaddwd           m12,       m5
    paddd             m7,        m12

    pmaddubsw         m8,        m0
    pmaddubsw         m12,       m9,        m1
    pmaddwd           m8,        m5
    pmaddwd           m12,       m5
    paddd             m8,        m12

    pmaddubsw         m9,        m0
    pmaddubsw         m12,       m10,       m1
    pmaddwd           m9,        m5
    pmaddwd           m12,       m5
    paddd             m9,        m12

    pmaddubsw         m10,       m0
    pmaddubsw         m12,      m11,        m1
    pmaddwd           m10,      m5
    pmaddwd           m12,      m5
    paddd             m10,      m12

    packssdw          m7,       m8
    packssdw          m9,       m10
    pmulhrsw          m7,       m6
    pmulhrsw          m9,       m6
    packuswb          m7,       m9
    movu              [r2 + 2 * r3],     ym7
    vextracti32x8     [r2 + r7],          m7,    1

    ; --- pass 3: all four rows, columns 32-47 ---
    movu             xm7,        [r0 + mmsize/2]
    vinserti32x4      m7,        [r0 + r1 + mmsize/2],          1
    vinserti32x4      m7,        [r0 + 2 * r1 + mmsize/2],      2
    vinserti32x4      m7,        [r0 + r6 + mmsize/2],          3

    pshufb            m8,        m7,        m3
    pshufb            m7,        m2

    movu             xm9,        [r0 + 40]
    vinserti32x4      m9,        [r0 + r1 + 40],      1
    vinserti32x4      m9,        [r0 + 2 * r1 + 40],  2
    vinserti32x4      m9,        [r0 + r6 + 40],      3

    pshufb            m10,       m9,        m3
    pshufb            m11,       m9,        m4
    pshufb            m9,        m2

    pmaddubsw         m7,        m0
    pmaddubsw         m12,       m8,        m1
    pmaddwd           m7,        m5
    pmaddwd           m12,       m5
    paddd             m7,        m12

    pmaddubsw         m8,        m0
    pmaddubsw         m12,       m9,        m1
    pmaddwd           m8,        m5
    pmaddwd           m12,       m5
    paddd             m8,        m12

    pmaddubsw         m9,        m0
    pmaddubsw         m12,       m10,       m1
    pmaddwd           m9,        m5
    pmaddwd           m12,       m5
    paddd             m9,        m12

    pmaddubsw         m10,       m0
    pmaddubsw         m12,      m11,        m1
    pmaddwd           m10,      m5
    pmaddwd           m12,      m5
    paddd             m10,      m12

    packssdw          m7,       m8
    packssdw          m9,       m10
    pmulhrsw          m7,       m6
    pmulhrsw          m9,       m6
    packuswb          m7,       m9
    movu              [r2 + mmsize/2],         xm7
    vextracti32x4     [r2 + r3 + mmsize/2],     m7,    1
    vextracti32x4     [r2 + 2 * r3 + mmsize/2], m7,    2
    vextracti32x4     [r2 + r7 + mmsize/2],     m7,    3
%endmacro
3219
+
3220
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_64xN(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
; %1 = block height N. Emits the full function body row by row.
;-------------------------------------------------------------------------------------------------------------
%macro IPFILTER_LUMA_64xN_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_horiz_pp_64x%1, 4,6,13
    sub               r0,    3                            ; back up 3 pixels: left support of the 8-tap kernel
    mov               r4d,   r4m                          ; r4d = coeffIdx
%ifdef PIC
    lea               r5,        [tab_LumaCoeff]
    vpbroadcastd      m0,        [r5 + r4 * 8]            ; first 4 taps, broadcast
    vpbroadcastd      m1,        [r5 + r4 * 8 + 4]        ; last 4 taps, broadcast
%else
    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]
%endif
    ; constant tables shared by every row (see register map in the
    ; PROCESS_IPFILTER_LUMA_PP_* macros)
    vbroadcasti32x8   m2,        [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m3,        [interp4_horiz_shuf_load3_avx512]
    vbroadcasti32x8   m4,        [interp4_horiz_shuf_load2_avx512]
    vpbroadcastd      m5,        [pw_1]
    vbroadcasti32x8   m6,        [pw_512]

    ; N-1 rows with pointer advance, then the last row without it
%rep %1-1
    PROCESS_IPFILTER_LUMA_PP_64x1_AVX512
    lea               r0,        [r0 + r1]
    lea               r2,        [r2 + r3]
%endrep
    PROCESS_IPFILTER_LUMA_PP_64x1_AVX512
    RET
%endmacro
3247
+
3248
; 64-wide pp entry points for every HEVC block height.
%if ARCH_X86_64
IPFILTER_LUMA_64xN_AVX512 16
IPFILTER_LUMA_64xN_AVX512 32
IPFILTER_LUMA_64xN_AVX512 48
IPFILTER_LUMA_64xN_AVX512 64
%endif
3254
+
3255
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_32xN(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
; %1 = block height N (even); processed two rows per iteration.
;-------------------------------------------------------------------------------------------------------------
%macro IPFILTER_LUMA_32xN_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_horiz_pp_32x%1, 4,6,13
    sub               r0,    3                            ; left support of the 8-tap kernel
    mov               r4d,   r4m                          ; r4d = coeffIdx
%ifdef PIC
    lea               r5,        [tab_LumaCoeff]
    vpbroadcastd      m0,        [r5 + r4 * 8]
    vpbroadcastd      m1,        [r5 + r4 * 8 + 4]
%else
    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]
%endif
    vbroadcasti32x8   m2,        [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m3,        [interp4_horiz_shuf_load3_avx512]
    vbroadcasti32x8   m4,        [interp4_horiz_shuf_load2_avx512]
    vpbroadcastd      m5,        [pw_1]
    vbroadcasti32x8   m6,        [pw_512]

    ; N/2 - 1 double-rows with pointer advance, then the final pair
%rep %1/2 -1
    PROCESS_IPFILTER_LUMA_PP_32x2_AVX512
    lea               r0,        [r0 + 2 * r1]
    lea               r2,        [r2 + 2 * r3]
%endrep
    PROCESS_IPFILTER_LUMA_PP_32x2_AVX512
    RET
%endmacro
3282
+
3283
; 32-wide pp entry points for every HEVC block height.
%if ARCH_X86_64
IPFILTER_LUMA_32xN_AVX512 8
IPFILTER_LUMA_32xN_AVX512 16
IPFILTER_LUMA_32xN_AVX512 24
IPFILTER_LUMA_32xN_AVX512 32
IPFILTER_LUMA_32xN_AVX512 64
%endif
3290
+
3291
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_16xN(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
; %1 = block height N (multiple of 4); processed four rows per iteration.
;-------------------------------------------------------------------------------------------------------------
%macro IPFILTER_LUMA_16xN_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_horiz_pp_16x%1, 4,8,14
    sub               r0,    3                            ; left support of the 8-tap kernel
    mov               r4d,   r4m                          ; r4d = coeffIdx
    lea               r6,    [3 * r1]                     ; r6 = 3 * srcStride (used by the 4-row macro)
    lea               r7,    [3 * r3]                     ; r7 = 3 * dstStride
%ifdef PIC
    lea               r5,        [tab_LumaCoeff]
    vpbroadcastd      m0,        [r5 + r4 * 8]
    vpbroadcastd      m1,        [r5 + r4 * 8 + 4]
%else
    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]
%endif
    vbroadcasti32x8   m2,        [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m3,        [interp4_horiz_shuf_load3_avx512]
    vbroadcasti32x8   m4,        [interp4_horiz_shuf_load2_avx512]
    vpbroadcastd      m5,        [pw_1]
    vbroadcasti32x8   m6,        [pw_512]

    ; N/4 - 1 quad-rows with pointer advance, then the final quad
%rep %1/4 -1
    PROCESS_IPFILTER_LUMA_PP_16x4_AVX512
    lea               r0,        [r0 + 4 * r1]
    lea               r2,        [r2 + 4 * r3]
%endrep
    PROCESS_IPFILTER_LUMA_PP_16x4_AVX512
    RET
%endmacro
3320
+
3321
; 16-wide pp entry points for every HEVC block height.
%if ARCH_X86_64
IPFILTER_LUMA_16xN_AVX512 4
IPFILTER_LUMA_16xN_AVX512 8
IPFILTER_LUMA_16xN_AVX512 12
IPFILTER_LUMA_16xN_AVX512 16
IPFILTER_LUMA_16xN_AVX512 32
IPFILTER_LUMA_16xN_AVX512 64
%endif
3329
+
3330
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_48x64(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
; The only 48-wide pp size: 16 iterations of the 4-row macro = 64 rows.
;-------------------------------------------------------------------------------------------------------------
%if ARCH_X86_64
INIT_ZMM avx512
cglobal interp_8tap_horiz_pp_48x64, 4,8,14
    sub               r0,    3                            ; left support of the 8-tap kernel
    mov               r4d,   r4m                          ; r4d = coeffIdx
    lea               r6,    [3 * r1]                     ; r6 = 3 * srcStride
    lea               r7,    [3 * r3]                     ; r7 = 3 * dstStride
%ifdef PIC
    lea               r5,        [tab_LumaCoeff]
    vpbroadcastd      m0,        [r5 + r4 * 8]
    vpbroadcastd      m1,        [r5 + r4 * 8 + 4]
%else
    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]
%endif
    vbroadcasti32x8   m2,        [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m3,        [interp4_horiz_shuf_load3_avx512]
    vbroadcasti32x8   m4,        [interp4_horiz_shuf_load2_avx512]
    vpbroadcastd      m5,        [pw_1]
    vbroadcasti32x8   m6,        [pw_512]

    ; 15 quad-rows with pointer advance, then the final quad (16 * 4 = 64 rows)
%rep 15
    PROCESS_IPFILTER_LUMA_PP_48x4_AVX512
    lea               r0,        [r0 + 4 * r1]
    lea               r2,        [r2 + 4 * r3]
%endrep
    PROCESS_IPFILTER_LUMA_PP_48x4_AVX512
    RET
%endif
3359
+
3360
; Filter one row of 64 luma pixels: 8-tap horizontal, pixel-to-short (ps).
; Output is 64 int16 values (two zmm stores); the DC offset pw_2000 is
; subtracted instead of rounding, keeping intermediate precision.
%macro PROCESS_IPFILTER_LUMA_PS_64x1_AVX512 0
    ; register map
    ; m0 , m1     - interpolate coeff
    ; m2 , m3, m4 - load shuffle order table
    ; m5          - pw_1
    ; m6          - pw_2000
    ; m7          - store shuffle order table

    ; low half: pixels 0-31 (and their 8-byte kernel margin in the high lane)
    movu              ym8,           [r0]
    vinserti32x8      m8,            [r0 + 8],            1
    pshufb            m9,            m8,                  m3
    pshufb            m10,           m8,                  m4
    pshufb            m8,             m2

    ; high half: pixels 32-63 (mmsize/2 = 32)
    movu              ym11,          [r0 + mmsize/2]
    vinserti32x8      m11,           [r0 + mmsize/2 + 8], 1
    pshufb            m12,           m11,                 m3
    pshufb            m13,           m11,                 m4
    pshufb            m11,           m2

    ; 8-tap multiply-accumulate (m0/m1 = coefficient halves, pw_1 folds
    ; word pairs into dword sums)
    pmaddubsw         m8,            m0
    pmaddubsw         m14,           m9,                  m1
    pmaddwd           m8,            m5
    pmaddwd           m14,           m5
    paddd             m8,            m14

    pmaddubsw         m9,            m0
    pmaddubsw         m14,           m10,                 m1
    pmaddwd           m9,            m5
    pmaddwd           m14,           m5
    paddd             m9,            m14

    pmaddubsw         m11,           m0
    pmaddubsw         m14,           m12,                 m1
    pmaddwd           m11,           m5
    pmaddwd           m14,           m5
    paddd             m11,           m14

    pmaddubsw         m12,           m0
    pmaddubsw         m14,           m13,                 m1
    pmaddwd           m12,           m5
    pmaddwd           m14,           m5
    paddd             m12,           m14

    ; narrow to words, subtract pw_2000, fix lane order via the store
    ; permutation table, and write 128 output bytes
    packssdw          m8,            m9
    packssdw          m11,           m12
    psubw             m8,            m6
    psubw             m11,           m6
    vpermq            m8,            m7,                  m8
    vpermq            m11,           m7,                  m11
    movu              [r2],          m8
    movu              [r2 + mmsize], m11
%endmacro
3414
+
3415
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_ps_64xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
; %1 = block height N. When isRowExt is set the source is rewound 3 rows
; and the height extended by 7 so the caller gets the extra rows needed
; by a following vertical pass.
;-------------------------------------------------------------------------------------------------------------
%macro IPFILTER_LUMA_PS_64xN_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_horiz_ps_64x%1, 4,7,15
    mov               r4d,   r4m                          ; r4d = coeffIdx
    mov               r5d,   r5m                          ; r5d = isRowExt

%ifdef PIC
    lea               r6,        [tab_LumaCoeff]
    vpbroadcastd      m0,        [r6 + r4 * 8]
    vpbroadcastd      m1,        [r6 + r4 * 8 + 4]
%else
    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]
%endif
    vbroadcasti32x8   m2,        [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m3,        [interp4_horiz_shuf_load3_avx512]
    vbroadcasti32x8   m4,        [interp4_horiz_shuf_load2_avx512]
    vpbroadcastd      m5,        [pw_1]
    vbroadcasti32x8   m6,        [pw_2000]
    mova              m7,        [interp8_hps_store_avx512]

    mov               r4d,       %1                       ; r4d = row counter
    sub               r0,        3                        ; left support of the 8-tap kernel
    test              r5d,       r5d
    jz                .loop
    lea               r6,        [r1 * 3]
    sub               r0,        r6                           ; r0(src)-r6
    add               r4d,       7                            ; blkheight += N - 1

.loop:
    PROCESS_IPFILTER_LUMA_PS_64x1_AVX512
    lea               r0,        [r0 + r1]
    lea               r2,        [r2 + 2 * r3]            ; dst is int16_t: advance by stride in words
    dec               r4d
    jnz               .loop
    RET
%endmacro
3452
+
3453
; 64-wide ps entry points for every HEVC block height.
%if ARCH_X86_64 == 1
    IPFILTER_LUMA_PS_64xN_AVX512 16
    IPFILTER_LUMA_PS_64xN_AVX512 32
    IPFILTER_LUMA_PS_64xN_AVX512 48
    IPFILTER_LUMA_PS_64xN_AVX512 64
%endif
3459
+
3460
; Filter one row of 32 luma pixels, pixel-to-short (ps): one zmm of int16
; output, pw_2000 subtracted, lane order fixed by the store table in m7.
%macro PROCESS_IPFILTER_LUMA_PS_32x1_AVX512 0
    ; register map
    ; m0 , m1     - interpolate coeff
    ; m2 , m3, m4 - load shuffle order table
    ; m5          - pw_1
    ; m6          - pw_2000
    ; m7          - store shuffle order table

    movu              ym8,           [r0]
    vinserti32x8      m8,            [r0 + 8],            1
    pshufb            m9,            m8,                  m3
    pshufb            m10,           m8,                  m4
    pshufb            m8,             m2

    ; 8-tap multiply-accumulate (m0/m1 = coefficient halves)
    pmaddubsw         m8,            m0
    pmaddubsw         m11,           m9,                  m1
    pmaddwd           m8,            m5
    pmaddwd           m11,           m5
    paddd             m8,            m11

    pmaddubsw         m9,            m0
    pmaddubsw         m11,           m10,                 m1
    pmaddwd           m9,            m5
    pmaddwd           m11,           m5
    paddd             m9,            m11

    packssdw          m8,            m9
    psubw             m8,            m6                   ; remove pw_2000 DC offset
    vpermq            m8,            m7,                  m8
    movu              [r2],          m8
%endmacro
3491
+
3492
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_ps_32xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
; %1 = block height N; isRowExt rewinds 3 source rows and adds 7 rows.
;-------------------------------------------------------------------------------------------------------------
%macro IPFILTER_LUMA_PS_32xN_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_horiz_ps_32x%1, 4,7,12
    mov               r4d,   r4m                          ; r4d = coeffIdx
    mov               r5d,   r5m                          ; r5d = isRowExt

%ifdef PIC
    lea               r6,        [tab_LumaCoeff]
    vpbroadcastd      m0,        [r6 + r4 * 8]
    vpbroadcastd      m1,        [r6 + r4 * 8 + 4]
%else
    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]
%endif
    vbroadcasti32x8   m2,        [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m3,        [interp4_horiz_shuf_load3_avx512]
    vbroadcasti32x8   m4,        [interp4_horiz_shuf_load2_avx512]
    vpbroadcastd      m5,        [pw_1]
    vbroadcasti32x8   m6,        [pw_2000]
    mova              m7,        [interp8_hps_store_avx512]

    mov               r4d,       %1                       ; r4d = row counter
    sub               r0,        3                        ; left support of the 8-tap kernel
    test              r5d,       r5d
    jz                .loop
    lea               r6,        [r1 * 3]
    sub               r0,        r6                           ; r0(src)-r6
    add               r4d,       7                            ; blkheight += N - 1

.loop:
    PROCESS_IPFILTER_LUMA_PS_32x1_AVX512
    lea               r0,        [r0 + r1]
    lea               r2,        [r2 + 2 * r3]            ; dst is int16_t: advance by stride in words
    dec               r4d
    jnz               .loop
    RET
%endmacro
3529
+
3530
; 32-wide ps entry points for every HEVC block height.
%if ARCH_X86_64 == 1
    IPFILTER_LUMA_PS_32xN_AVX512 8
    IPFILTER_LUMA_PS_32xN_AVX512 16
    IPFILTER_LUMA_PS_32xN_AVX512 24
    IPFILTER_LUMA_PS_32xN_AVX512 32
    IPFILTER_LUMA_PS_32xN_AVX512 64
%endif
3537
+
3538
; Filter two rows of 16 luma pixels (ps): the four 128-bit lanes hold
; {row0, row0 margin, row1, row1 margin}, producing 2 x 16 int16 outputs.
%macro PROCESS_IPFILTER_LUMA_PS_8TAP_16x2_AVX512 0
    movu              xm7,           [r0]
    vinserti32x4      m7,            [r0 + 8],            1
    vinserti32x4      m7,            [r0 + r1],           2
    vinserti32x4      m7,            [r0 + r1 + 8],       3
    pshufb            m8,            m7,                  m3
    pshufb            m9,            m7,                  m4
    pshufb            m7,            m2

    ; 8-tap multiply-accumulate (m0/m1 = coefficient halves, m5 = pw_1)
    pmaddubsw         m7,            m0
    pmaddubsw         m10,           m8,                  m1
    pmaddwd           m7,            m5
    pmaddwd           m10,           m5
    paddd             m7,            m10

    pmaddubsw         m8,            m0
    pmaddubsw         m10,           m9,                  m1
    pmaddwd           m8,            m5
    pmaddwd           m10,           m5
    paddd             m8,            m10

    ; narrow, remove pw_2000 offset (m6), store one row per 256-bit half
    packssdw          m7,            m8
    psubw             m7,            m6
    movu              [r2],          ym7
    vextracti32x8     [r2 + r3],     m7,                  1
%endmacro
3564
+
3565
; Filter a single row of 16 luma pixels (ps), ymm-width variant.
; Used once as a prologue when the 2-row loop would otherwise be given an
; odd number of rows (row-extension case in IPFILTER_LUMA_PS_8TAP_16xN).
%macro PROCESS_IPFILTER_LUMA_PS_8TAP_16x1_AVX512 0
    movu              xm7,            [r0]
    vinserti32x4      m7,             [r0 + 8],             1
    pshufb            ym8,            ym7,                  ym3
    pshufb            ym9,            ym7,                  ym4
    pshufb            ym7,            ym2

    ; 8-tap multiply-accumulate (ym0/ym1 = coefficient halves, ym5 = pw_1)
    pmaddubsw         ym7,            ym0
    pmaddubsw         ym10,           ym8,                  ym1
    pmaddwd           ym7,            ym5
    pmaddwd           ym10,           ym5
    paddd             ym7,            ym10

    pmaddubsw         ym8,            ym0
    pmaddubsw         ym10,           ym9,                  ym1
    pmaddwd           ym8,            ym5
    pmaddwd           ym10,           ym5
    paddd             ym8,            ym10

    ; narrow, remove pw_2000 offset, store 16 int16 results
    packssdw          ym7,            ym8
    psubw             ym7,            ym6
    movu              [r2],           ym7
%endmacro
3588
+
3589
+;-------------------------------------------------------------------------------------------------------------
3590
+; void interp_horiz_ps_16xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
3591
+;-------------------------------------------------------------------------------------------------------------
3592
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_ps_16xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
; %1 = block height N. The row-extension path adds 7 rows (odd count), so
; one row is peeled off with the 16x1 macro before the 2-row main loop.
;-------------------------------------------------------------------------------------------------------------
%macro IPFILTER_LUMA_PS_8TAP_16xN_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_horiz_ps_16x%1, 4,7,11
    mov               r4d,   r4m                          ; r4d = coeffIdx
    mov               r5d,   r5m                          ; r5d = isRowExt
    add               r3,    r3                           ; dst stride in bytes (dst is int16_t)

%ifdef PIC
    lea               r6,        [tab_LumaCoeff]
    vpbroadcastd      m0,        [r6 + r4 * 8]
    vpbroadcastd      m1,        [r6 + r4 * 8 + 4]
%else
    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]
%endif
    vbroadcasti32x8   m2,        [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m3,        [interp4_horiz_shuf_load3_avx512]
    vbroadcasti32x8   m4,        [interp4_horiz_shuf_load2_avx512]
    vpbroadcastd      m5,        [pw_1]
    vbroadcasti32x8   m6,        [pw_2000]

    ; register map
    ; m0 , m1     - interpolate coeff
    ; m2 , m3, m4 - load shuffle order table
    ; m5          - pw_1
    ; m6          - pw_2000

    mov               r4d,       %1                       ; r4d = row counter
    sub               r0,        3                        ; left support of the 8-tap kernel
    test              r5d,       r5d
    jz                .loop
    lea               r6,        [r1 * 3]
    sub               r0,        r6                           ; r0(src)-r6
    add               r4d,       7                            ; blkheight += N - 1
    ; peel one row so the remaining count is even for the 2-row loop
    PROCESS_IPFILTER_LUMA_PS_8TAP_16x1_AVX512
    lea               r0,        [r0 + r1]
    lea               r2,        [r2 + r3]
    dec               r4d

.loop:
    PROCESS_IPFILTER_LUMA_PS_8TAP_16x2_AVX512
    lea               r0,        [r0 + 2 * r1]
    lea               r2,        [r2 + 2 * r3]
    sub               r4d,       2
    jnz               .loop
    RET
%endmacro
3639
+
3640
; 16-wide ps entry points for every HEVC block height.
%if ARCH_X86_64 == 1
    IPFILTER_LUMA_PS_8TAP_16xN_AVX512 4
    IPFILTER_LUMA_PS_8TAP_16xN_AVX512 8
    IPFILTER_LUMA_PS_8TAP_16xN_AVX512 12
    IPFILTER_LUMA_PS_8TAP_16xN_AVX512 16
    IPFILTER_LUMA_PS_8TAP_16xN_AVX512 32
    IPFILTER_LUMA_PS_8TAP_16xN_AVX512 64
%endif
3648
+
3649
; Filter one row of 48 luma pixels (ps): the first 32 pixels use the zmm
; path (with the vpermq store fix-up), the remaining 16 use a ymm tail.
%macro PROCESS_IPFILTER_LUMA_PS_48x1_AVX512 0
    ; register map
    ; m0 , m1     - interpolate coeff
    ; m2 , m3, m4 - load shuffle order table
    ; m5          - pw_1
    ; m6          - pw_2000
    ; m7          - store shuffle order table

    ; --- pixels 0-31 ---
    movu              ym8,           [r0]
    vinserti32x8      m8,            [r0 + 8],            1
    pshufb            m9,            m8,                  m3
    pshufb            m10,           m8,                  m4
    pshufb            m8,             m2

    pmaddubsw         m8,            m0
    pmaddubsw         m11,           m9,                  m1
    pmaddwd           m8,            m5
    pmaddwd           m11,           m5
    paddd             m8,            m11

    pmaddubsw         m9,            m0
    pmaddubsw         m11,           m10,                 m1
    pmaddwd           m9,            m5
    pmaddwd           m11,           m5
    paddd             m9,            m11

    packssdw          m8,            m9
    psubw             m8,            m6                   ; remove pw_2000 DC offset
    vpermq            m8,            m7,                  m8
    movu              [r2],          m8

    ; --- pixels 32-47 (ymm tail) ---
    ; NOTE(review): the ym-wide load's upper 128 bits are immediately
    ; overwritten by the vinserti32x4 below; an xm load of [r0 + 32]
    ; would appear sufficient — confirm before changing.
    movu              ym8,           [r0 + 32]
    vinserti32x4      m8,            [r0 + 40],           1
    pshufb            ym9,           ym8,                 ym3
    pshufb            ym10,           ym8,                ym4
    pshufb            ym8,            ym2

    pmaddubsw         ym8,            ym0
    pmaddubsw         ym11,           ym9,                ym1
    pmaddwd           ym8,            ym5
    pmaddwd           ym11,           ym5
    paddd             ym8,            ym11

    pmaddubsw         ym9,            ym0
    pmaddubsw         ym11,           ym10,               ym1
    pmaddwd           ym9,            ym5
    pmaddwd           ym11,           ym5
    paddd             ym9,            ym11

    packssdw          ym8,            ym9
    psubw             ym8,            ym6
    movu              [r2 + mmsize],  ym8                 ; 64-byte offset = 32 int16 outputs already stored
%endmacro
3702
+
3703
+;-------------------------------------------------------------------------------------------------------------
3704
+; void interp_horiz_ps_48xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
3705
+;-------------------------------------------------------------------------------------------------------------
3706
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_ps_48xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
; %1 = block height N; isRowExt rewinds 3 source rows and adds 7 rows.
;-------------------------------------------------------------------------------------------------------------
%macro IPFILTER_LUMA_PS_48xN_AVX512 1
INIT_ZMM avx512
cglobal interp_8tap_horiz_ps_48x%1, 4,7,12
    mov               r4d,   r4m                          ; r4d = coeffIdx
    mov               r5d,   r5m                          ; r5d = isRowExt

%ifdef PIC
    lea               r6,        [tab_LumaCoeff]
    vpbroadcastd      m0,        [r6 + r4 * 8]
    vpbroadcastd      m1,        [r6 + r4 * 8 + 4]
%else
    vpbroadcastd      m0,        [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1,        [tab_LumaCoeff + r4 * 8 + 4]
%endif
    vbroadcasti32x8   m2,        [interp4_horiz_shuf_load1_avx512]
    vbroadcasti32x8   m3,        [interp4_horiz_shuf_load3_avx512]
    vbroadcasti32x8   m4,        [interp4_horiz_shuf_load2_avx512]
    vpbroadcastd      m5,        [pw_1]
    vbroadcasti32x8   m6,        [pw_2000]
    mova              m7,        [interp8_hps_store_avx512]

    mov               r4d,       %1                       ; r4d = row counter
    sub               r0,        3                        ; left support of the 8-tap kernel
    test              r5d,       r5d
    jz                .loop
    lea               r6,        [r1 * 3]
    sub               r0,        r6                           ; r0(src)-r6
    add               r4d,       7                            ; blkheight += N - 1

.loop:
    PROCESS_IPFILTER_LUMA_PS_48x1_AVX512
    lea               r0,        [r0 + r1]
    lea               r2,        [r2 + 2 * r3]            ; dst is int16_t: advance by stride in words
    dec               r4d
    jnz               .loop
    RET
%endmacro
3743
+
3744
+%if ARCH_X86_64 == 1
3745
+    IPFILTER_LUMA_PS_48xN_AVX512 64
3746
+%endif
3747
+
3748
+;-------------------------------------------------------------------------------------------------------------
3749
+;avx512 luma_vss code start
3750
+;-------------------------------------------------------------------------------------------------------------
3751
+%macro PROCESS_LUMA_VERT_SS_8x8_AVX512 0
3752
+; Vertical 8-tap short-to-short filter for one 8x8 block of 16-bit samples.
+; Expects: r0=src (already backed up 3 rows), r1=srcStride (bytes),
+;          r2=dst, r3=dstStride (bytes), r7=3*r1, r5=3*r3,
+;          m15-m18 = coefficient pairs for taps 0/1, 2/3, 4/5, 6/7.
+; The four 128-bit lanes of each zmm each filter a different output row:
+; m0/m1 accumulate even rows {0,2,4,6}, m2/m3 odd rows {1,3,5,7}
+; (punpcklwd/punpckhwd pair row r with row r+1 for each pmaddwd).
+; Side effects: r6=src+4*r1, r4=src+8*r1 (start of the next 8-row chunk;
+; the caller reloads r0 from r4), r8=src+12*r1; r2 is advanced by 4*r3
+; before the last four stores.
+    lea                  r6,                  [r0 + 4 * r1]
3753
+    movu                 xm1,                 [r0]                           ;0 row
3754
+    vinserti32x4         m1,                  [r0 + 2 * r1],          1
3755
+    vinserti32x4         m1,                  [r0 + 4 * r1],          2
3756
+    vinserti32x4         m1,                  [r6 + 2 * r1],          3
3757
+    movu                 xm3,                 [r0 + r1]                      ;1 row
3758
+    vinserti32x4         m3,                  [r0 + r7],              1
3759
+    vinserti32x4         m3,                  [r6 + r1],              2
3760
+    vinserti32x4         m3,                  [r6 + r7],              3
3761
+    punpcklwd            m0,                  m1,                     m3
3762
+    pmaddwd              m0,                  m15
3763
+    punpckhwd            m1,                  m3
3764
+    pmaddwd              m1,                  m15
3765
+
3766
+    movu                 xm4,                 [r0 + 2 * r1]                  ;2 row
3767
+    vinserti32x4         m4,                  [r0 + 4 * r1],          1
3768
+    vinserti32x4         m4,                  [r6 + 2 * r1],          2
3769
+    vinserti32x4         m4,                  [r6 + 4 * r1],          3
3770
+    punpcklwd            m2,                  m3,                     m4
3771
+    pmaddwd              m2,                  m15
3772
+    punpckhwd            m3,                  m4
3773
+    pmaddwd              m3,                  m15
3774
+
3775
+    lea                  r4,                  [r6 + 4 * r1]
3776
+    movu                 xm5,                 [r0 + r7]                      ;3 row
3777
+    vinserti32x4         m5,                  [r6 + r1],              1
3778
+    vinserti32x4         m5,                  [r6 + r7],              2
3779
+    vinserti32x4         m5,                  [r4 + r1],              3
3780
+    punpcklwd            m6,                  m4,                     m5
3781
+    pmaddwd              m6,                  m16
3782
+    punpckhwd            m4,                  m5
3783
+    pmaddwd              m4,                  m16
3784
+
3785
+    paddd                m0,                  m6
3786
+    paddd                m1,                  m4
3787
+
3788
+    movu                 xm4,                 [r0 + 4 * r1]                  ;4 row
3789
+    vinserti32x4         m4,                  [r6 + 2 * r1],              1
3790
+    vinserti32x4         m4,                  [r6 + 4 * r1],              2
3791
+    vinserti32x4         m4,                  [r4 + 2 * r1],              3
3792
+    punpcklwd            m6,                  m5,                     m4
3793
+    pmaddwd              m6,                  m16
3794
+    punpckhwd            m5,                  m4
3795
+    pmaddwd              m5,                  m16
3796
+
3797
+    paddd                m2,                  m6
3798
+    paddd                m3,                  m5
3799
+
3800
+    movu                 xm11,                [r6 + r1]                      ;5 row
3801
+    vinserti32x4         m11,                 [r6 + r7],              1
3802
+    vinserti32x4         m11,                 [r4 + r1],              2
3803
+    vinserti32x4         m11,                 [r4 + r7],              3
3804
+    punpcklwd            m8,                  m4,                     m11
3805
+    pmaddwd              m8,                  m17
3806
+    punpckhwd            m4,                  m11
3807
+    pmaddwd              m4,                  m17
3808
+
3809
+    movu                 xm12,                [r6 + 2 * r1]                  ;6 row
3810
+    vinserti32x4         m12,                 [r6 + 4 * r1],          1
3811
+    vinserti32x4         m12,                 [r4 + 2 * r1],          2
3812
+    vinserti32x4         m12,                 [r4 + 4 * r1],          3
3813
+    punpcklwd            m10,                 m11,                    m12
3814
+    pmaddwd              m10,                 m17
3815
+    punpckhwd            m11,                 m12
3816
+    pmaddwd              m11,                 m17
3817
+
3818
+    lea                  r8,                  [r4 + 4 * r1]
3819
+    movu                 xm13,                [r6 + r7]                      ;7 row
3820
+    vinserti32x4         m13,                 [r4 + r1],              1
3821
+    vinserti32x4         m13,                 [r4 + r7],              2
3822
+    vinserti32x4         m13,                 [r8 + r1],              3
3823
+    punpcklwd            m14,                 m12,                    m13
3824
+    pmaddwd              m14,                 m18
3825
+    punpckhwd            m12,                 m13
3826
+    pmaddwd              m12,                 m18
3827
+
3828
+    paddd                m8,                  m14
3829
+    paddd                m4,                  m12
3830
+    paddd                m0,                  m8
3831
+    paddd                m1,                  m4
3832
+
3833
+    movu                 xm12,                [r6 + 4 * r1]                 ; 8 row
3834
+    vinserti32x4         m12,                 [r4 + 2 * r1],          1
3835
+    vinserti32x4         m12,                 [r4 + 4 * r1],          2
3836
+    vinserti32x4         m12,                 [r8 + 2 * r1],          3
3837
+    punpcklwd            m14,                 m13,                    m12
3838
+    pmaddwd              m14,                 m18
3839
+    punpckhwd            m13,                 m12
3840
+    pmaddwd              m13,                 m18
3841
+
3842
+    paddd                m10,                 m14
3843
+    paddd                m11,                 m13
3844
+    paddd                m2,                  m10
3845
+    paddd                m3,                  m11
3846
+
3847
+    psrad                m0,                  6
3848
+    psrad                m1,                  6
3849
+    psrad                m2,                  6
3850
+    psrad                m3,                  6
3851
+
3852
+    packssdw             m0,                  m1
3853
+    packssdw             m2,                  m3
3854
+
3855
+; Stores: lanes 0/1 give rows 0-3; lanes 2/3 give rows 4-7 after r2 += 4*r3.
+    movu                 [r2],                xm0
3856
+    movu                 [r2 + r3],           xm2
3857
+    vextracti32x4        [r2 + 2 * r3],       m0,                  1
3858
+    vextracti32x4        [r2 + r5],           m2,                  1
3859
+    lea                  r2,                  [r2 + 4 * r3]
3860
+    vextracti32x4        [r2],                m0,                  2
3861
+    vextracti32x4        [r2 + r3],           m2,                  2
3862
+    vextracti32x4        [r2 + 2 * r3],       m0,                  3
3863
+    vextracti32x4        [r2 + r5],           m2,                  3
3864
+%endmacro
3865
+;-----------------------------------------------------------------------------------------------------------------
3866
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
3867
+;-----------------------------------------------------------------------------------------------------------------
3868
+%macro FILTER_VER_SS_LUMA_8xN_AVX512 1
3869
+; Emits interp_8tap_vert_ss_8x%1 (%1 = block height, multiple of 8).
+; Both src and dst are int16_t, so both strides are doubled below.
+; Each 8x8 chunk (PROCESS_LUMA_VERT_SS_8x8_AVX512) leaves
+; r4 = src + 8*srcStride, so 'lea r0, [r4]' steps to the next chunk.
+INIT_ZMM avx512
3870
+cglobal interp_8tap_vert_ss_8x%1, 5, 9, 19
3871
+    add                   r1d,                r1d
3872
+    add                   r3d,                r3d
3873
+    lea                   r7,                 [3 * r1]
3874
+    sub                   r0,                 r7                             ; back up 3 source rows for the 8-tap window
3875
+    shl                   r4d,                8                              ; coeffIdx * 256 = offset of 4 zmm coeff rows
3876
+%ifdef PIC
3877
+    lea                   r5,                 [pw_LumaCoeffVer_avx512]
3878
+    mova                  m15,                [r5 + r4]
3879
+    mova                  m16,                [r5 + r4 + 1 * mmsize]
3880
+    mova                  m17,                [r5 + r4 + 2 * mmsize]
3881
+    mova                  m18,                [r5 + r4 + 3 * mmsize]
3882
+%else
3883
+    lea                   r5,                 [pw_LumaCoeffVer_avx512 + r4]
3884
+    mova                  m15,                [r5]
3885
+    mova                  m16,                [r5 + 1 * mmsize]
3886
+    mova                  m17,                [r5 + 2 * mmsize]
3887
+    mova                  m18,                [r5 + 3 * mmsize]
3888
+%endif
3889
+
3890
+    lea                   r5,                 [3 * r3]
3891
+%rep %1/8 - 1
3892
+    PROCESS_LUMA_VERT_SS_8x8_AVX512
3893
+    lea                   r0,                 [r4]                           ; r4 = src + 8*srcStride (set inside the macro)
3894
+    lea                   r2,                 [r2 + 4 * r3]
3895
+%endrep
3896
+    PROCESS_LUMA_VERT_SS_8x8_AVX512
3897
+    RET
3898
+%endmacro
3899
+
3900
+%if ARCH_X86_64
3901
+    FILTER_VER_SS_LUMA_8xN_AVX512 8
3902
+    FILTER_VER_SS_LUMA_8xN_AVX512 16
3903
+    FILTER_VER_SS_LUMA_8xN_AVX512 32
3904
+%endif
3905
+%macro PROCESS_LUMA_VERT_S_16x4_AVX512 1
3906
+; Vertical 8-tap filter for one 16x4 block of 16-bit intermediates.
+; %1 selects the output mode: 'ss' keeps 16-bit output (>>6); 'sp' adds the
+; rounding term in m19, shifts >>12, packs to 8-bit pixels and reorders the
+; result through the permute table in m20.
+; Each zmm holds two 16-sample rows: low ymm = row r, high ymm = row r+2,
+; so m0/m1 accumulate output rows 0/2 and m2/m3 rows 1/3.
+; Expects r7=3*r1, r5=3*r3, m15-m18 = coefficient pairs.
+; Side effects: r6=src+4*r1 and r4=src+8*r1 (both reused by the 24x8 macro).
+    movu                 ym1,                 [r0]
3907
+    movu                 ym3,                 [r0 + r1]
3908
+    vinserti32x8         m1,                  [r0 + 2 * r1],          1
3909
+    vinserti32x8         m3,                  [r0 + r7],              1
3910
+    punpcklwd            m0,                  m1,                     m3
3911
+    pmaddwd              m0,                  m15
3912
+    punpckhwd            m1,                  m3
3913
+    pmaddwd              m1,                  m15
3914
+
3915
+    lea                  r6,                  [r0 + 4 * r1]
3916
+    movu                 ym4,                 [r0 + 2 * r1]
3917
+    vinserti32x8         m4,                  [r6],                   1
3918
+    punpcklwd            m2,                  m3,                     m4
3919
+    pmaddwd              m2,                  m15
3920
+    punpckhwd            m3,                  m4
3921
+    pmaddwd              m3,                  m15
3922
+
3923
+    movu                 ym5,                 [r0 + r7]
3924
+    vinserti32x8         m5,                  [r6 + r1],              1
3925
+    punpcklwd            m6,                  m4,                     m5
3926
+    pmaddwd              m6,                  m16
3927
+    punpckhwd            m4,                  m5
3928
+    pmaddwd              m4,                  m16
3929
+
3930
+    paddd                m0,                  m6
3931
+    paddd                m1,                  m4
3932
+
3933
+    movu                 ym4,                 [r6]
3934
+    vinserti32x8         m4,                  [r6 + 2 * r1],          1
3935
+    punpcklwd            m6,                  m5,                     m4
3936
+    pmaddwd              m6,                  m16
3937
+    punpckhwd            m5,                  m4
3938
+    pmaddwd              m5,                  m16
3939
+
3940
+    paddd                m2,                  m6
3941
+    paddd                m3,                  m5
3942
+
3943
+    movu                 ym11,                [r6 + r1]
3944
+    vinserti32x8         m11,                 [r6 + r7],              1
3945
+    punpcklwd            m8,                  m4,                     m11
3946
+    pmaddwd              m8,                  m17
3947
+    punpckhwd            m4,                  m11
3948
+    pmaddwd              m4,                  m17
3949
+
3950
+    movu                 ym12,                [r6 + 2 * r1]
3951
+    vinserti32x8         m12,                 [r6 + 4 * r1],          1
3952
+    punpcklwd            m10,                 m11,                    m12
3953
+    pmaddwd              m10,                 m17
3954
+    punpckhwd            m11,                 m12
3955
+    pmaddwd              m11,                 m17
3956
+
3957
+    lea                  r4,                  [r6 + 4 * r1]
3958
+    movu                 ym13,                [r6 + r7]
3959
+    vinserti32x8         m13,                 [r4 + r1],              1
3960
+    punpcklwd            m14,                 m12,                    m13
3961
+    pmaddwd              m14,                 m18
3962
+    punpckhwd            m12,                 m13
3963
+    pmaddwd              m12,                 m18
3964
+
3965
+    paddd                m8,                  m14
3966
+    paddd                m4,                  m12
3967
+    paddd                m0,                  m8
3968
+    paddd                m1,                  m4
3969
+
3970
+    movu                 ym12,                [r6 + 4 * r1]
3971
+    vinserti32x8         m12,                 [r4 + 2 * r1],          1
3972
+    punpcklwd            m14,                 m13,                    m12
3973
+    pmaddwd              m14,                 m18
3974
+    punpckhwd            m13,                 m12
3975
+    pmaddwd              m13,                 m18
3976
+
3977
+    paddd                m10,                 m14
3978
+    paddd                m11,                 m13
3979
+    paddd                m2,                  m10
3980
+    paddd                m3,                  m11
3981
+%ifidn %1, sp
3982
+; sp: round (m19), shift to pixel range, pack to bytes and reorder via m20
+    paddd                m0,                  m19
3983
+    paddd                m1,                  m19
3984
+    paddd                m2,                  m19
3985
+    paddd                m3,                  m19
3986
+
3987
+    psrad                m0,                  12
3988
+    psrad                m1,                  12
3989
+    psrad                m2,                  12
3990
+    psrad                m3,                  12
3991
+
3992
+    packssdw             m0,                  m1
3993
+    packssdw             m2,                  m3
3994
+    packuswb             m0,                  m2
3995
+    vpermq               m0,                  m20,                   m0
3996
+    movu                 [r2],                xm0
3997
+    vextracti32x4        [r2 + r3],           m0,                    2
3998
+    vextracti32x4        [r2 + 2 * r3],       m0,                    1
3999
+    vextracti32x4        [r2 + r5],           m0,                    3
4000
+%else
4001
+; ss: plain >>6, store four 16-bit rows
+    psrad                m0,                  6
4002
+    psrad                m1,                  6
4003
+    psrad                m2,                  6
4004
+    psrad                m3,                  6
4005
+
4006
+    packssdw             m0,                  m1
4007
+    packssdw             m2,                  m3
4008
+
4009
+    movu                 [r2],                ym0
4010
+    movu                 [r2 + r3],           ym2
4011
+    vextracti32x8        [r2 + 2 * r3],       m0,                1
4012
+    vextracti32x8        [r2 + r5],           m2,                1
4013
+%endif
4014
+%endmacro
4015
+;-----------------------------------------------------------------------------------------------------------------
4016
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
4017
+;-----------------------------------------------------------------------------------------------------------------
4018
+%macro FILTER_VER_S_LUMA_16xN_AVX512 2
4019
+; Emits interp_8tap_vert_%1_16x%2 (%1 in {ss, sp}, %2 = block height).
+; r0=src (int16_t), r1=srcStride, r2=dst, r3=dstStride, r4=coeffIdx.
+; srcStride is always doubled (16-bit input); dstStride is doubled only in
+; the ss path - the sp path writes 8-bit pixels.
+INIT_ZMM avx512
4020
+cglobal interp_8tap_vert_%1_16x%2, 5, 8, 21
4021
+    add                   r1d,                r1d
4022
+    lea                   r7,                 [3 * r1]
4023
+    sub                   r0,                 r7                             ; back up 3 source rows for the 8-tap window
4024
+    shl                   r4d,                8                              ; coeffIdx * 256 = offset of 4 zmm coeff rows
4025
+%ifdef PIC
4026
+    lea                   r5,                 [pw_LumaCoeffVer_avx512]
4027
+    mova                  m15,                [r5 + r4]
4028
+    mova                  m16,                [r5 + r4 + 1 * mmsize]
4029
+    mova                  m17,                [r5 + r4 + 2 * mmsize]
4030
+    mova                  m18,                [r5 + r4 + 3 * mmsize]
4031
+%else
4032
+    lea                   r5,                 [pw_LumaCoeffVer_avx512 + r4]
4033
+    mova                  m15,                [r5]
4034
+    mova                  m16,                [r5 + 1 * mmsize]
4035
+    mova                  m17,                [r5 + 2 * mmsize]
4036
+    mova                  m18,                [r5 + 3 * mmsize]
4037
+%endif
4038
+%ifidn %1, sp
4039
+; sp only: m19 = rounding term for the >>12, m20 = store permute table
+    vbroadcasti32x4       m19,                [pd_526336]
4040
+    mova                  m20,                [interp8_vsp_store_avx512]
4041
+%else
4042
+    add                   r3d,                r3d
4043
+%endif
4044
+
4045
+    lea                   r5,                 [3 * r3]
4046
+%rep %2/4 - 1
4047
+    PROCESS_LUMA_VERT_S_16x4_AVX512 %1
4048
+    lea                   r0,                 [r0 + 4 * r1]
4049
+    lea                   r2,                 [r2 + 4 * r3]
4050
+%endrep
4051
+    PROCESS_LUMA_VERT_S_16x4_AVX512 %1
4052
+    RET
4053
+%endmacro
4054
+
4055
+%if ARCH_X86_64
4056
+    FILTER_VER_S_LUMA_16xN_AVX512 ss, 4
4057
+    FILTER_VER_S_LUMA_16xN_AVX512 ss, 8
4058
+    FILTER_VER_S_LUMA_16xN_AVX512 ss, 12
4059
+    FILTER_VER_S_LUMA_16xN_AVX512 ss, 16
4060
+    FILTER_VER_S_LUMA_16xN_AVX512 ss, 32
4061
+    FILTER_VER_S_LUMA_16xN_AVX512 ss, 64
4062
+    FILTER_VER_S_LUMA_16xN_AVX512 sp, 4
4063
+    FILTER_VER_S_LUMA_16xN_AVX512 sp, 8
4064
+    FILTER_VER_S_LUMA_16xN_AVX512 sp, 12
4065
+    FILTER_VER_S_LUMA_16xN_AVX512 sp, 16
4066
+    FILTER_VER_S_LUMA_16xN_AVX512 sp, 32
4067
+    FILTER_VER_S_LUMA_16xN_AVX512 sp, 64
4068
+%endif
4069
+%macro PROCESS_LUMA_VERT_SS_24x8_AVX512 0
4070
+; Vertical 8-tap ss filter for one 24x8 block, composed of three parts:
+;   1) rows 0-3 of the left 16-wide half via PROCESS_LUMA_VERT_S_16x4 ss,
+;   2) rows 4-7 of the left 16-wide half (source base r6, dest via r9),
+;   3) the right 8-wide column, all 8 rows, at byte offset mmsize/2 (32).
+; Pointer lattice inherited from the 16x4 macro: r6=src+4*r1; this macro
+; adds r4=src+8*r1 (reused by the caller as the next src pointer) and
+; r8=src+12*r1; r9=dst+4*r3; finally r2 is advanced by 4*r3 for the lower
+; half of the 8-wide column stores.
+    PROCESS_LUMA_VERT_S_16x4_AVX512 ss
4071
+    lea                  r4,                  [r6 + 4 * r1]
4072
+    lea                  r8,                  [r4 + 4 * r1]
4073
+; Part 2: rows 4-7 of the left 16-wide half
+    movu                 ym1,                 [r6]
4074
+    movu                 ym3,                 [r6 + r1]
4075
+    vinserti32x8         m1,                  [r6 + 2 * r1],          1
4076
+    vinserti32x8         m3,                  [r6 + r7],              1
4077
+    punpcklwd            m0,                  m1,                     m3
4078
+    pmaddwd              m0,                  m15
4079
+    punpckhwd            m1,                  m3
4080
+    pmaddwd              m1,                  m15
4081
+
4082
+    movu                 ym4,                 [r6 + 2 * r1]
4083
+    vinserti32x8         m4,                  [r4],                   1
4084
+    punpcklwd            m2,                  m3,                     m4
4085
+    pmaddwd              m2,                  m15
4086
+    punpckhwd            m3,                  m4
4087
+    pmaddwd              m3,                  m15
4088
+
4089
+    movu                 ym5,                 [r6 + r7]
4090
+    vinserti32x8         m5,                  [r4 + r1],              1
4091
+    punpcklwd            m6,                  m4,                     m5
4092
+    pmaddwd              m6,                  m16
4093
+    punpckhwd            m4,                  m5
4094
+    pmaddwd              m4,                  m16
4095
+
4096
+    paddd                m0,                  m6
4097
+    paddd                m1,                  m4
4098
+
4099
+    movu                 ym4,                 [r4]
4100
+    vinserti32x8         m4,                  [r4 + 2 * r1],          1
4101
+    punpcklwd            m6,                  m5,                     m4
4102
+    pmaddwd              m6,                  m16
4103
+    punpckhwd            m5,                  m4
4104
+    pmaddwd              m5,                  m16
4105
+
4106
+    paddd                m2,                  m6
4107
+    paddd                m3,                  m5
4108
+
4109
+    movu                 ym11,                [r4 + r1]
4110
+    vinserti32x8         m11,                 [r4 + r7],              1
4111
+    punpcklwd            m8,                  m4,                     m11
4112
+    pmaddwd              m8,                  m17
4113
+    punpckhwd            m4,                  m11
4114
+    pmaddwd              m4,                  m17
4115
+
4116
+    movu                 ym12,                [r4 + 2 * r1]
4117
+    vinserti32x8         m12,                 [r4 + 4 * r1],          1
4118
+    punpcklwd            m10,                 m11,                    m12
4119
+    pmaddwd              m10,                 m17
4120
+    punpckhwd            m11,                 m12
4121
+    pmaddwd              m11,                 m17
4122
+
4123
+    movu                 ym13,                [r4 + r7]
4124
+    vinserti32x8         m13,                 [r8 + r1],              1
4125
+    punpcklwd            m14,                 m12,                    m13
4126
+    pmaddwd              m14,                 m18
4127
+    punpckhwd            m12,                 m13
4128
+    pmaddwd              m12,                 m18
4129
+
4130
+    paddd                m8,                  m14
4131
+    paddd                m4,                  m12
4132
+    paddd                m0,                  m8
4133
+    paddd                m1,                  m4
4134
+
4135
+    movu                 ym12,                [r4 + 4 * r1]
4136
+    vinserti32x8         m12,                 [r8 + 2 * r1],          1
4137
+    punpcklwd            m14,                 m13,                    m12
4138
+    pmaddwd              m14,                 m18
4139
+    punpckhwd            m13,                 m12
4140
+    pmaddwd              m13,                 m18
4141
+
4142
+    paddd                m10,                 m14
4143
+    paddd                m11,                 m13
4144
+    paddd                m2,                  m10
4145
+    paddd                m3,                  m11
4146
+
4147
+    psrad                m0,                  6
4148
+    psrad                m1,                  6
4149
+    psrad                m2,                  6
4150
+    psrad                m3,                  6
4151
+
4152
+    packssdw             m0,                  m1
4153
+    packssdw             m2,                  m3
4154
+
4155
+    lea                  r9,                  [r2 + 4 * r3]
4156
+    movu                 [r9],                ym0
4157
+    movu                 [r9 + r3],           ym2
4158
+    vextracti32x8        [r9 + 2 * r3],       m0,                1
4159
+    vextracti32x8        [r9 + r5],           m2,                1
4160
+
4161
+; Part 3: right-hand 8-wide column, rows 0-7 (byte offset mmsize/2 = 32)
+    movu                 xm1,                 [r0 + mmsize/2]
4162
+    vinserti32x4         m1,                  [r0 + 2 * r1 + mmsize/2],          1
4163
+    vinserti32x4         m1,                  [r0 + 4 * r1 + mmsize/2],          2
4164
+    vinserti32x4         m1,                  [r6 + 2 * r1 + mmsize/2],          3
4165
+    movu                 xm3,                 [r0 + r1 + mmsize/2]
4166
+    vinserti32x4         m3,                  [r0 + r7 + mmsize/2],              1
4167
+    vinserti32x4         m3,                  [r6 + r1 + mmsize/2],              2
4168
+    vinserti32x4         m3,                  [r6 + r7 + mmsize/2],              3
4169
+    punpcklwd            m0,                  m1,                     m3
4170
+    pmaddwd              m0,                  m15
4171
+    punpckhwd            m1,                  m3
4172
+    pmaddwd              m1,                  m15
4173
+
4174
+    movu                 xm4,                 [r0 + 2 * r1 + mmsize/2]
4175
+    vinserti32x4         m4,                  [r0 + 4 * r1 + mmsize/2],          1
4176
+    vinserti32x4         m4,                  [r6 + 2 * r1 + mmsize/2],          2
4177
+    vinserti32x4         m4,                  [r6 + 4 * r1 + mmsize/2],          3
4178
+    punpcklwd            m2,                  m3,                     m4
4179
+    pmaddwd              m2,                  m15
4180
+    punpckhwd            m3,                  m4
4181
+    pmaddwd              m3,                  m15
4182
+
4183
+    movu                 xm5,                 [r0 + r7 + mmsize/2]
4184
+    vinserti32x4         m5,                  [r6 + r1 + mmsize/2],              1
4185
+    vinserti32x4         m5,                  [r6 + r7 + mmsize/2],              2
4186
+    vinserti32x4         m5,                  [r4 + r1 + mmsize/2],              3
4187
+    punpcklwd            m6,                  m4,                     m5
4188
+    pmaddwd              m6,                  m16
4189
+    punpckhwd            m4,                  m5
4190
+    pmaddwd              m4,                  m16
4191
+
4192
+    paddd                m0,                  m6
4193
+    paddd                m1,                  m4
4194
+
4195
+    movu                 xm4,                 [r0 + 4 * r1 + mmsize/2]
4196
+    vinserti32x4         m4,                  [r6 + 2 * r1 + mmsize/2],              1
4197
+    vinserti32x4         m4,                  [r6 + 4 * r1 + mmsize/2],              2
4198
+    vinserti32x4         m4,                  [r4 + 2 * r1 + mmsize/2],              3
4199
+    punpcklwd            m6,                  m5,                     m4
4200
+    pmaddwd              m6,                  m16
4201
+    punpckhwd            m5,                  m4
4202
+    pmaddwd              m5,                  m16
4203
+
4204
+    paddd                m2,                  m6
4205
+    paddd                m3,                  m5
4206
+
4207
+    movu                 xm11,                [r6 + r1 + mmsize/2]
4208
+    vinserti32x4         m11,                 [r6 + r7 + mmsize/2],              1
4209
+    vinserti32x4         m11,                 [r4 + r1 + mmsize/2],              2
4210
+    vinserti32x4         m11,                 [r4 + r7 + mmsize/2],              3
4211
+    punpcklwd            m8,                  m4,                     m11
4212
+    pmaddwd              m8,                  m17
4213
+    punpckhwd            m4,                  m11
4214
+    pmaddwd              m4,                  m17
4215
+
4216
+    movu                 xm12,                [r6 + 2 * r1 + mmsize/2]
4217
+    vinserti32x4         m12,                 [r6 + 4 * r1 + mmsize/2],          1
4218
+    vinserti32x4         m12,                 [r4 + 2 * r1 + mmsize/2],          2
4219
+    vinserti32x4         m12,                 [r4 + 4 * r1 + mmsize/2],          3
4220
+    punpcklwd            m10,                 m11,                    m12
4221
+    pmaddwd              m10,                 m17
4222
+    punpckhwd            m11,                 m12
4223
+    pmaddwd              m11,                 m17
4224
+
4225
+    movu                 xm13,                [r6 + r7 + mmsize/2]
4226
+    vinserti32x4         m13,                 [r4 + r1 + mmsize/2],              1
4227
+    vinserti32x4         m13,                 [r4 + r7 + mmsize/2],              2
4228
+    vinserti32x4         m13,                 [r8 + r1 + mmsize/2],              3
4229
+    punpcklwd            m14,                 m12,                    m13
4230
+    pmaddwd              m14,                 m18
4231
+    punpckhwd            m12,                 m13
4232
+    pmaddwd              m12,                 m18
4233
+
4234
+    paddd                m8,                  m14
4235
+    paddd                m4,                  m12
4236
+    paddd                m0,                  m8
4237
+    paddd                m1,                  m4
4238
+
4239
+    movu                 xm12,                [r6 + 4 * r1 + mmsize/2]
4240
+    vinserti32x4         m12,                 [r4 + 2 * r1 + mmsize/2],          1
4241
+    vinserti32x4         m12,                 [r4 + 4 * r1 + mmsize/2],          2
4242
+    vinserti32x4         m12,                 [r8 + 2 * r1 + mmsize/2],          3
4243
+    punpcklwd            m14,                 m13,                    m12
4244
+    pmaddwd              m14,                 m18
4245
+    punpckhwd            m13,                 m12
4246
+    pmaddwd              m13,                 m18
4247
+
4248
+    paddd                m10,                 m14
4249
+    paddd                m11,                 m13
4250
+    paddd                m2,                  m10
4251
+    paddd                m3,                  m11
4252
+
4253
+    psrad                m0,                  6
4254
+    psrad                m1,                  6
4255
+    psrad                m2,                  6
4256
+    psrad                m3,                  6
4257
+
4258
+    packssdw             m0,                  m1
4259
+    packssdw             m2,                  m3
4260
+
4261
+    movu                 [r2 + mmsize/2],                xm0
4262
+    movu                 [r2 + r3 + mmsize/2],           xm2
4263
+    vextracti32x4        [r2 + 2 * r3 + mmsize/2],       m0,                  1
4264
+    vextracti32x4        [r2 + r5 + mmsize/2],           m2,                  1
4265
+    lea                  r2,                             [r2 + 4 * r3]
4266
+    vextracti32x4        [r2 + mmsize/2],                m0,                  2
4267
+    vextracti32x4        [r2 + r3 + mmsize/2],           m2,                  2
4268
+    vextracti32x4        [r2 + 2 * r3 + mmsize/2],       m0,                  3
4269
+    vextracti32x4        [r2 + r5 + mmsize/2],           m2,                  3
4270
+%endmacro
4271
+;-----------------------------------------------------------------------------------------------------------------
4272
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
4273
+;-----------------------------------------------------------------------------------------------------------------
4274
+%if ARCH_X86_64
4275
+; interp_8tap_vert_ss_24x32: four iterations of the 24x8 macro.  The macro
+; leaves r4 = src + 8*srcStride, which becomes the next r0; r2 is advanced
+; by 4*r3 inside the macro and by another 4*r3 here (8 dst rows total).
+INIT_ZMM avx512
4276
+cglobal interp_8tap_vert_ss_24x32, 5, 10, 19
4277
+    add                   r1d,                r1d
4278
+    add                   r3d,                r3d
4279
+    lea                   r7,                 [3 * r1]
4280
+    sub                   r0,                 r7                             ; back up 3 source rows for the 8-tap window
4281
+    shl                   r4d,                8                              ; coeffIdx * 256 = offset of 4 zmm coeff rows
4282
+%ifdef PIC
4283
+    lea                   r5,                 [pw_LumaCoeffVer_avx512]
4284
+    mova                  m15,                [r5 + r4]
4285
+    mova                  m16,                [r5 + r4 + 1 * mmsize]
4286
+    mova                  m17,                [r5 + r4 + 2 * mmsize]
4287
+    mova                  m18,                [r5 + r4 + 3 * mmsize]
4288
+%else
4289
+    lea                   r5,                 [pw_LumaCoeffVer_avx512 + r4]
4290
+    mova                  m15,                [r5]
4291
+    mova                  m16,                [r5 + 1 * mmsize]
4292
+    mova                  m17,                [r5 + 2 * mmsize]
4293
+    mova                  m18,                [r5 + 3 * mmsize]
4294
+%endif
4295
+
4296
+    lea                   r5,                 [3 * r3]
4297
+%rep 3
4298
+    PROCESS_LUMA_VERT_SS_24x8_AVX512
4299
+    lea                   r0,                 [r4]                           ; r4 = src + 8*srcStride (set inside the macro)
4300
+    lea                   r2,                 [r2 + 4 * r3]
4301
+%endrep
4302
+    PROCESS_LUMA_VERT_SS_24x8_AVX512
4303
+    RET
4304
+%endif
4305
+
4306
+%macro PROCESS_LUMA_VERT_S_32x2_AVX512 1   ; filter two 32-sample rows; %1 = sp (pack to bytes) or ss (keep int16)
4307
+    movu                 m1,                  [r0]                           ;0 row
4308
+    movu                 m3,                  [r0 + r1]                      ;1 row
4309
+    punpcklwd            m0,                  m1,                     m3     ; interleave rows 0/1, low words
4310
+    pmaddwd              m0,                  m15                            ; taps 0/1
4311
+    punpckhwd            m1,                  m3                             ; interleave rows 0/1, high words
4312
+    pmaddwd              m1,                  m15
4313
+
4314
+    movu                 m4,                  [r0 + 2 * r1]                  ;2 row
4315
+    punpcklwd            m2,                  m3,                     m4
4316
+    pmaddwd              m2,                  m15
4317
+    punpckhwd            m3,                  m4
4318
+    pmaddwd              m3,                  m15
4319
+
4320
+    movu                 m5,                  [r0 + r7]                      ;3 row
4321
+    punpcklwd            m6,                  m4,                     m5
4322
+    pmaddwd              m6,                  m16                            ; taps 2/3
4323
+    punpckhwd            m4,                  m5
4324
+    pmaddwd              m4,                  m16
4325
+
4326
+    paddd                m0,                  m6
4327
+    paddd                m1,                  m4
4328
+
4329
+    movu                 m4,                  [r0 + 4 * r1]                  ;4 row
4330
+    punpcklwd            m6,                  m5,                     m4
4331
+    pmaddwd              m6,                  m16
4332
+    punpckhwd            m5,                  m4
4333
+    pmaddwd              m5,                  m16
4334
+
4335
+    paddd                m2,                  m6
4336
+    paddd                m3,                  m5
4337
+
4338
+    lea                  r6,                  [r0 + 4 * r1]                  ; r6 -> row 4 (also used by 48/64-wide callers)
4339
+
4340
+    movu                 m11,                 [r6 + r1]                      ;5 row
4341
+    punpcklwd            m8,                  m4,                     m11
4342
+    pmaddwd              m8,                  m17                            ; taps 4/5
4343
+    punpckhwd            m4,                  m11
4344
+    pmaddwd              m4,                  m17
4345
+
4346
+    movu                 m12,                 [r6 + 2 * r1]                  ;6 row
4347
+    punpcklwd            m10,                 m11,                    m12
4348
+    pmaddwd              m10,                 m17
4349
+    punpckhwd            m11,                 m12
4350
+    pmaddwd              m11,                 m17
4351
+
4352
+    movu                 m13,                 [r6 + r7]                      ;7 row
4353
+    punpcklwd            m14,                 m12,                    m13
4354
+    pmaddwd              m14,                 m18                            ; taps 6/7
4355
+    punpckhwd            m12,                 m13
4356
+    pmaddwd              m12,                 m18
4357
+    paddd                m8,                  m14
4358
+    paddd                m4,                  m12
4359
+    movu                 m12,                 [r6 + 4 * r1]                 ; 8 row
4360
+    punpcklwd            m14,                 m13,                    m12
4361
+    pmaddwd              m14,                 m18
4362
+    punpckhwd            m13,                 m12
4363
+    pmaddwd              m13,                 m18
4364
+    paddd                m10,                 m14
4365
+    paddd                m11,                 m13
4366
+
4367
+    paddd                m0,                  m8                             ; final 32-bit sums: row 0 lo/hi, row 1 lo/hi
4368
+    paddd                m1,                  m4
4369
+    paddd                m2,                  m10
4370
+    paddd                m3,                  m11
4371
+%ifidn %1, sp
4372
+    paddd                m0,                  m19                            ; m19 = rounding/offset constant set up by the wrapper
4373
+    paddd                m1,                  m19
4374
+    paddd                m2,                  m19
4375
+    paddd                m3,                  m19
4376
+
4377
+    psrad                m0,                  12                             ; sp: shift 12
4378
+    psrad                m1,                  12
4379
+    psrad                m2,                  12
4380
+    psrad                m3,                  12
4381
+
4382
+    packssdw             m0,                  m1
4383
+    packssdw             m2,                  m3
4384
+    packuswb             m0,                  m2                             ; clip to unsigned 8-bit pixels
4385
+    vpermq               m0,                  m20,                   m0      ; m20 = lane-ordering permute set up by the wrapper
4386
+    movu                 [r2],                ym0                            ; row 0 (32 bytes)
4387
+    vextracti32x8        [r2 + r3],           m0,                    1       ; row 1
4388
+%else
4389
+    psrad                m0,                  6                              ; ss: shift 6, keep int16 intermediates
4390
+    psrad                m1,                  6
4391
+    psrad                m2,                  6
4392
+    psrad                m3,                  6
4393
+
4394
+    packssdw             m0,                  m1
4395
+    packssdw             m2,                  m3
4396
+    movu                 [r2],                m0                             ; row 0 (32 x int16)
4397
+    movu                 [r2 + r3],           m2                             ; row 1
4398
+%endif
4399
+%endmacro
4400
+;-----------------------------------------------------------------------------------------------------------------
4401
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
4402
+;-----------------------------------------------------------------------------------------------------------------
4403
+%macro FILTER_VER_S_LUMA_32xN_AVX512 2     ; %1 = ss/sp variant, %2 = block height
4404
+INIT_ZMM avx512
4405
+cglobal interp_8tap_vert_%1_32x%2, 5, 8, 21
4406
+    add                   r1d,                r1d             ; srcStride in bytes (int16 samples)
4407
+    lea                   r7,                 [3 * r1]
4408
+    sub                   r0,                 r7              ; rewind src 3 rows (first of the 8 taps)
4409
+    shl                   r4d,                8               ; coeffIdx * 256 = offset of one 4*mmsize coeff set
4410
+%ifdef PIC
4411
+    lea                   r5,                 [pw_LumaCoeffVer_avx512]
4412
+    mova                  m15,                [r5 + r4]                      ; taps 0/1
4413
+    mova                  m16,                [r5 + r4 + 1 * mmsize]         ; taps 2/3
4414
+    mova                  m17,                [r5 + r4 + 2 * mmsize]         ; taps 4/5
4415
+    mova                  m18,                [r5 + r4 + 3 * mmsize]         ; taps 6/7
4416
+%else
4417
+    lea                   r5,                 [pw_LumaCoeffVer_avx512 + r4]
4418
+    mova                  m15,                [r5]
4419
+    mova                  m16,                [r5 + 1 * mmsize]
4420
+    mova                  m17,                [r5 + 2 * mmsize]
4421
+    mova                  m18,                [r5 + 3 * mmsize]
4422
+%endif
4423
+%ifidn %1, sp
4424
+    vbroadcasti32x4       m19,                [pd_526336]     ; 526336 = (8192 << 6) + 2048: offset correction + rounding for >> 12
4425
+    mova                  m20,                [interp8_vsp_store_avx512]     ; permute pattern for the byte stores
4426
+%else
4427
+    add                   r3d,                r3d             ; ss only: dstStride in bytes (int16 output)
4428
+%endif
4429
+
4430
+%rep %2/2 - 1
4431
+    PROCESS_LUMA_VERT_S_32x2_AVX512 %1
4432
+    lea                   r0,                 [r0 + 2 * r1]
4433
+    lea                   r2,                 [r2 + 2 * r3]
4434
+%endrep
4435
+    PROCESS_LUMA_VERT_S_32x2_AVX512 %1      ; final 2-row pass without pointer advance
4436
+    RET
4437
+%endmacro
4438
+
4439
+%if ARCH_X86_64                            ; zmm16+ and r7 usage require x86-64
4440
+    FILTER_VER_S_LUMA_32xN_AVX512 ss, 8
4441
+    FILTER_VER_S_LUMA_32xN_AVX512 ss, 16
4442
+    FILTER_VER_S_LUMA_32xN_AVX512 ss, 32
4443
+    FILTER_VER_S_LUMA_32xN_AVX512 ss, 24
4444
+    FILTER_VER_S_LUMA_32xN_AVX512 ss, 64
4445
+    FILTER_VER_S_LUMA_32xN_AVX512 sp, 8
4446
+    FILTER_VER_S_LUMA_32xN_AVX512 sp, 16
4447
+    FILTER_VER_S_LUMA_32xN_AVX512 sp, 32
4448
+    FILTER_VER_S_LUMA_32xN_AVX512 sp, 24
4449
+    FILTER_VER_S_LUMA_32xN_AVX512 sp, 64
4450
+%endif
4451
+
4452
+%macro PROCESS_LUMA_VERT_S_48x4_AVX512 1   ; filter four 48-sample rows; %1 = sp or ss
4453
+    PROCESS_LUMA_VERT_S_32x2_AVX512 %1     ; rows 0-1 of the left 32 columns (also sets r6 = r0 + 4 * r1)
4454
+    movu                 m1,                  [r0 + 2 * r1]                  ; rows 2-3 of the left 32 columns start here
4455
+    movu                 m3,                  [r0 + r7]
4456
+    punpcklwd            m0,                  m1,                     m3
4457
+    pmaddwd              m0,                  m15                            ; taps 0/1
4458
+    punpckhwd            m1,                  m3
4459
+    pmaddwd              m1,                  m15
4460
+
4461
+    movu                 m4,                  [r0 + 4 * r1]
4462
+    punpcklwd            m2,                  m3,                     m4
4463
+    pmaddwd              m2,                  m15
4464
+    punpckhwd            m3,                  m4
4465
+    pmaddwd              m3,                  m15
4466
+
4467
+    movu                 m5,                  [r6 + r1]
4468
+    punpcklwd            m6,                  m4,                     m5
4469
+    pmaddwd              m6,                  m16                            ; taps 2/3
4470
+    punpckhwd            m4,                  m5
4471
+    pmaddwd              m4,                  m16
4472
+
4473
+    paddd                m0,                  m6
4474
+    paddd                m1,                  m4
4475
+
4476
+    lea                  r4,                  [r6 + 4 * r1]                  ; r4 -> row 8 (r6 = row 4 from the 32x2 macro)
4477
+
4478
+    movu                 m4,                  [r6 + 2 * r1]
4479
+    punpcklwd            m6,                  m5,                     m4
4480
+    pmaddwd              m6,                  m16
4481
+    punpckhwd            m5,                  m4
4482
+    pmaddwd              m5,                  m16
4483
+
4484
+    paddd                m2,                  m6
4485
+    paddd                m3,                  m5
4486
+
4487
+    movu                 m11,                 [r6 + r7]
4488
+    punpcklwd            m8,                  m4,                     m11
4489
+    pmaddwd              m8,                  m17                            ; taps 4/5
4490
+    punpckhwd            m4,                  m11
4491
+    pmaddwd              m4,                  m17
4492
+
4493
+    movu                 m12,                 [r4]
4494
+    punpcklwd            m10,                 m11,                    m12
4495
+    pmaddwd              m10,                 m17
4496
+    punpckhwd            m11,                 m12
4497
+    pmaddwd              m11,                 m17
4498
+
4499
+    movu                 m13,                 [r4 + r1]
4500
+    punpcklwd            m14,                 m12,                    m13
4501
+    pmaddwd              m14,                 m18                            ; taps 6/7
4502
+    punpckhwd            m12,                 m13
4503
+    pmaddwd              m12,                 m18
4504
+    paddd                m8,                  m14
4505
+    paddd                m4,                  m12
4506
+    movu                 m12,                 [r4 + 2 * r1]
4507
+    punpcklwd            m14,                 m13,                    m12
4508
+    pmaddwd              m14,                 m18
4509
+    punpckhwd            m13,                 m12
4510
+    pmaddwd              m13,                 m18
4511
+    paddd                m10,                 m14
4512
+    paddd                m11,                 m13
4513
+
4514
+    paddd                m0,                  m8
4515
+    paddd                m1,                  m4
4516
+    paddd                m2,                  m10
4517
+    paddd                m3,                  m11
4518
+%ifidn %1, sp
4519
+    paddd                m0,                  m19                            ; m19 = rounding/offset constant from the wrapper
4520
+    paddd                m1,                  m19
4521
+    paddd                m2,                  m19
4522
+    paddd                m3,                  m19
4523
+
4524
+    psrad                m0,                  12                             ; sp: shift 12
4525
+    psrad                m1,                  12
4526
+    psrad                m2,                  12
4527
+    psrad                m3,                  12
4528
+
4529
+    packssdw             m0,                  m1
4530
+    packssdw             m2,                  m3
4531
+    packuswb             m0,                  m2                             ; clip to unsigned 8-bit pixels
4532
+    vpermq               m0,                  m20,                   m0
4533
+    movu                 [r2 + 2 * r3],       ym0                            ; row 2, left 32 columns
4534
+    vextracti32x8        [r2 + r5],           m0,                    1       ; row 3 (r5 = 3 * dstStride)
4535
+%else
4536
+    psrad                m0,                  6                              ; ss: shift 6, keep int16
4537
+    psrad                m1,                  6
4538
+    psrad                m2,                  6
4539
+    psrad                m3,                  6
4540
+
4541
+    packssdw             m0,                  m1
4542
+    packssdw             m2,                  m3
4543
+    movu                 [r2 + 2 * r3],       m0                             ; row 2, left 32 columns
4544
+    movu                 [r2 + r5],           m2                             ; row 3
4545
+%endif
4546
+    movu                 ym1,                 [r0 + mmsize]                  ; right 16 columns: rows 0 and 2 share one zmm
4547
+    movu                 ym3,                 [r0 + r1 + mmsize]             ; rows 1 and 3
4548
+    vinserti32x8         m1,                  [r0 + 2 * r1 + mmsize], 1
4549
+    vinserti32x8         m3,                  [r0 + r7 + mmsize],     1
4550
+    punpcklwd            m0,                  m1,                     m3
4551
+    pmaddwd              m0,                  m15                            ; taps 0/1
4552
+    punpckhwd            m1,                  m3
4553
+    pmaddwd              m1,                  m15
4554
+
4555
+    movu                 ym4,                 [r0 + 2 * r1 + mmsize]         ; rows 2 and 4
4556
+    vinserti32x8         m4,                  [r6 + mmsize],          1
4557
+    punpcklwd            m2,                  m3,                     m4
4558
+    pmaddwd              m2,                  m15
4559
+    punpckhwd            m3,                  m4
4560
+    pmaddwd              m3,                  m15
4561
+
4562
+    movu                 ym5,                 [r0 + r7 + mmsize]             ; rows 3 and 5
4563
+    vinserti32x8         m5,                  [r6 + r1 + mmsize],     1
4564
+    punpcklwd            m6,                  m4,                     m5
4565
+    pmaddwd              m6,                  m16                            ; taps 2/3
4566
+    punpckhwd            m4,                  m5
4567
+    pmaddwd              m4,                  m16
4568
+
4569
+    paddd                m0,                  m6
4570
+    paddd                m1,                  m4
4571
+
4572
+    movu                 ym4,                 [r6 + mmsize]                  ; rows 4 and 6
4573
+    vinserti32x8         m4,                  [r6 + 2 * r1 + mmsize], 1
4574
+    punpcklwd            m6,                  m5,                     m4
4575
+    pmaddwd              m6,                  m16
4576
+    punpckhwd            m5,                  m4
4577
+    pmaddwd              m5,                  m16
4578
+
4579
+    paddd                m2,                  m6
4580
+    paddd                m3,                  m5
4581
+
4582
+    movu                 ym11,                [r6 + r1 + mmsize]             ; rows 5 and 7
4583
+    vinserti32x8         m11,                 [r6 + r7 + mmsize],     1
4584
+    punpcklwd            m8,                  m4,                     m11
4585
+    pmaddwd              m8,                  m17                            ; taps 4/5
4586
+    punpckhwd            m4,                  m11
4587
+    pmaddwd              m4,                  m17
4588
+
4589
+    movu                 ym12,                [r6 + 2 * r1 + mmsize]         ; rows 6 and 8
4590
+    vinserti32x8         m12,                 [r6 + 4 * r1 + mmsize], 1
4591
+    punpcklwd            m10,                 m11,                    m12
4592
+    pmaddwd              m10,                 m17
4593
+    punpckhwd            m11,                 m12
4594
+    pmaddwd              m11,                 m17
4595
+
4596
+    movu                 ym13,                [r6 + r7 + mmsize]             ; rows 7 and 9 (r4 = row 8)
4597
+    vinserti32x8         m13,                 [r4 + r1 + mmsize],     1
4598
+    punpcklwd            m14,                 m12,                    m13
4599
+    pmaddwd              m14,                 m18                            ; taps 6/7
4600
+    punpckhwd            m12,                 m13
4601
+    pmaddwd              m12,                 m18
4602
+    paddd                m8,                  m14
4603
+    paddd                m4,                  m12
4604
+    movu                 ym12,                [r6 + 4 * r1 + mmsize]         ; rows 8 and 10
4605
+    vinserti32x8         m12,                 [r4 + 2 * r1 + mmsize], 1
4606
+    punpcklwd            m14,                 m13,                    m12
4607
+    pmaddwd              m14,                 m18
4608
+    punpckhwd            m13,                 m12
4609
+    pmaddwd              m13,                 m18
4610
+    paddd                m10,                 m14
4611
+    paddd                m11,                 m13
4612
+
4613
+    paddd                m0,                  m8
4614
+    paddd                m1,                  m4
4615
+    paddd                m2,                  m10
4616
+    paddd                m3,                  m11
4617
+%ifidn %1, sp
4618
+    paddd                m0,                  m19
4619
+    paddd                m1,                  m19
4620
+    paddd                m2,                  m19
4621
+    paddd                m3,                  m19
4622
+
4623
+    psrad                m0,                  12
4624
+    psrad                m1,                  12
4625
+    psrad                m2,                  12
4626
+    psrad                m3,                  12
4627
+
4628
+    packssdw             m0,                  m1
4629
+    packssdw             m2,                  m3
4630
+    packuswb             m0,                  m2
4631
+    vpermq               m0,                  m20,                   m0
4632
+    movu                 [r2 + mmsize/2],                xm0                 ; right 16 columns, row 0 (byte offset 32)
4633
+    vextracti32x4        [r2 + r3 + mmsize/2], m0,                    2      ; NOTE(review): rows 1/2 come from lanes 2/1 after the m20 permute - confirm against interp8_vsp_store_avx512
4634
+    vextracti32x4        [r2 + 2 * r3 + mmsize/2],       m0,          1
4635
+    vextracti32x4        [r2 + r5 + mmsize/2],           m0,          3      ; row 3
4636
+%else
4637
+    psrad                m0,                  6
4638
+    psrad                m1,                  6
4639
+    psrad                m2,                  6
4640
+    psrad                m3,                  6
4641
+
4642
+    packssdw             m0,                  m1
4643
+    packssdw             m2,                  m3
4644
+
4645
+    movu                 [r2 + mmsize],                ym0                   ; right 16 columns, row 0 (int16, byte offset 64)
4646
+    movu                 [r2 + r3 + mmsize],           ym2                   ; row 1
4647
+    vextracti32x8        [r2 + 2 * r3 + mmsize],       m0,                1  ; row 2
4648
+    vextracti32x8        [r2 + r5 + mmsize],           m2,                1  ; row 3
4649
+%endif
4650
+%endmacro
4651
+;-----------------------------------------------------------------------------------------------------------------
4652
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
4653
+;-----------------------------------------------------------------------------------------------------------------
4654
+%macro FILTER_VER_S_LUMA_48x64_AVX512 1    ; %1 = ss/sp variant
4655
+INIT_ZMM avx512
4656
+cglobal interp_8tap_vert_%1_48x64, 5, 8, 21
4657
+    add                   r1d,                r1d             ; srcStride in bytes (int16 samples)
4658
+    lea                   r7,                 [3 * r1]
4659
+    sub                   r0,                 r7              ; rewind src 3 rows (first of the 8 taps)
4660
+    shl                   r4d,                8               ; coeffIdx * 256 = offset of one 4*mmsize coeff set
4661
+%ifdef PIC
4662
+    lea                   r5,                 [pw_LumaCoeffVer_avx512]
4663
+    mova                  m15,                [r5 + r4]                      ; taps 0/1
4664
+    mova                  m16,                [r5 + r4 + 1 * mmsize]         ; taps 2/3
4665
+    mova                  m17,                [r5 + r4 + 2 * mmsize]         ; taps 4/5
4666
+    mova                  m18,                [r5 + r4 + 3 * mmsize]         ; taps 6/7
4667
+%else
4668
+    lea                   r5,                 [pw_LumaCoeffVer_avx512 + r4]
4669
+    mova                  m15,                [r5]
4670
+    mova                  m16,                [r5 + 1 * mmsize]
4671
+    mova                  m17,                [r5 + 2 * mmsize]
4672
+    mova                  m18,                [r5 + 3 * mmsize]
4673
+%endif
4674
+%ifidn %1, sp
4675
+    vbroadcasti32x4       m19,                [pd_526336]     ; 526336 = (8192 << 6) + 2048: offset correction + rounding for >> 12
4676
+    mova                  m20,                [interp8_vsp_store_avx512]     ; permute pattern for the byte stores
4677
+%else
4678
+    add                   r3d,                r3d             ; ss only: dstStride in bytes (int16 output)
4679
+%endif
4680
+
4681
+    lea                   r5,                 [3 * r3]        ; r5 = 3 * dstStride (row-3 addressing in the macro)
4682
+%rep 15
4683
+    PROCESS_LUMA_VERT_S_48x4_AVX512 %1
4684
+    lea                   r0,                 [r0 + 4 * r1]
4685
+    lea                   r2,                 [r2 + 4 * r3]
4686
+%endrep
4687
+    PROCESS_LUMA_VERT_S_48x4_AVX512 %1      ; 16 passes x 4 rows = 64 rows
4688
+    RET
4689
+%endmacro
4690
+
4691
+%if ARCH_X86_64                            ; zmm16+ and r7 usage require x86-64
4692
+    FILTER_VER_S_LUMA_48x64_AVX512 ss
4693
+    FILTER_VER_S_LUMA_48x64_AVX512 sp
4694
+%endif
4695
+
4696
+%macro PROCESS_LUMA_VERT_S_64x2_AVX512 1   ; filter two 64-sample rows; left 32 via the 32x2 macro (which sets r6 = r0 + 4 * r1), right 32 at +mmsize below
4697
+    PROCESS_LUMA_VERT_S_32x2_AVX512 %1
4698
+    movu                 m1,                  [r0 + mmsize]                  ;0 row
4699
+    movu                 m3,                  [r0 + r1 + mmsize]             ;1 row
4700
+    punpcklwd            m0,                  m1,                     m3
4701
+    pmaddwd              m0,                  m15                            ; taps 0/1
4702
+    punpckhwd            m1,                  m3
4703
+    pmaddwd              m1,                  m15
4704
+
4705
+    movu                 m4,                  [r0 + 2 * r1 + mmsize]         ;2 row
4706
+    punpcklwd            m2,                  m3,                     m4
4707
+    pmaddwd              m2,                  m15
4708
+    punpckhwd            m3,                  m4
4709
+    pmaddwd              m3,                  m15
4710
+
4711
+    movu                 m5,                  [r0 + r7 + mmsize]             ;3 row
4712
+    punpcklwd            m6,                  m4,                     m5
4713
+    pmaddwd              m6,                  m16                            ; taps 2/3
4714
+    punpckhwd            m4,                  m5
4715
+    pmaddwd              m4,                  m16
4716
+
4717
+    paddd                m0,                  m6
4718
+    paddd                m1,                  m4
4719
+
4720
+    movu                 m4,                  [r0 + 4 * r1 + mmsize]         ;4 row
4721
+    punpcklwd            m6,                  m5,                     m4
4722
+    pmaddwd              m6,                  m16
4723
+    punpckhwd            m5,                  m4
4724
+    pmaddwd              m5,                  m16
4725
+
4726
+    paddd                m2,                  m6
4727
+    paddd                m3,                  m5
4728
+
4729
+    movu                 m11,                 [r6 + r1 + mmsize]             ;5 row
4730
+    punpcklwd            m8,                  m4,                     m11
4731
+    pmaddwd              m8,                  m17                            ; taps 4/5
4732
+    punpckhwd            m4,                  m11
4733
+    pmaddwd              m4,                  m17
4734
+
4735
+    movu                 m12,                 [r6 + 2 * r1 + mmsize]         ;6 row
4736
+    punpcklwd            m10,                 m11,                    m12
4737
+    pmaddwd              m10,                 m17
4738
+    punpckhwd            m11,                 m12
4739
+    pmaddwd              m11,                 m17
4740
+
4741
+    movu                 m13,                 [r6 + r7 + mmsize]             ;7 row
4742
+    punpcklwd            m14,                 m12,                    m13
4743
+    pmaddwd              m14,                 m18                            ; taps 6/7
4744
+    punpckhwd            m12,                 m13
4745
+    pmaddwd              m12,                 m18
4746
+    paddd                m8,                  m14
4747
+    paddd                m4,                  m12
4748
+    movu                 m12,                 [r6 + 4 * r1 + mmsize]         ; 8 row
4749
+    punpcklwd            m14,                 m13,                    m12
4750
+    pmaddwd              m14,                 m18
4751
+    punpckhwd            m13,                 m12
4752
+    pmaddwd              m13,                 m18
4753
+    paddd                m10,                 m14
4754
+    paddd                m11,                 m13
4755
+
4756
+    paddd                m0,                  m8
4757
+    paddd                m1,                  m4
4758
+    paddd                m2,                  m10
4759
+    paddd                m3,                  m11
4760
+%ifidn %1, sp
4761
+    paddd                m0,                  m19                            ; m19 = rounding/offset constant from the wrapper
4762
+    paddd                m1,                  m19
4763
+    paddd                m2,                  m19
4764
+    paddd                m3,                  m19
4765
+
4766
+    psrad                m0,                  12                             ; sp: shift 12
4767
+    psrad                m1,                  12
4768
+    psrad                m2,                  12
4769
+    psrad                m3,                  12
4770
+
4771
+    packssdw             m0,                  m1
4772
+    packssdw             m2,                  m3
4773
+    packuswb             m0,                  m2                             ; clip to unsigned 8-bit pixels
4774
+    vpermq               m0,                  m20,                   m0
4775
+    movu                 [r2 + mmsize/2],     ym0                            ; right half row 0 (byte offset 32)
4776
+    vextracti32x8        [r2 + r3 + mmsize/2], m0,                    1      ; right half row 1
4777
+%else
4778
+    psrad                m0,                  6                              ; ss: shift 6, keep int16
4779
+    psrad                m1,                  6
4780
+    psrad                m2,                  6
4781
+    psrad                m3,                  6
4782
+    packssdw             m0,                  m1
4783
+    packssdw             m2,                  m3
4784
+    movu                 [r2 + mmsize],       m0                             ; right half row 0 (int16, byte offset 64)
4785
+    movu                 [r2 + r3 + mmsize],  m2                             ; right half row 1
4786
+%endif
4787
+%endmacro
4788
+;-----------------------------------------------------------------------------------------------------------------
4789
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
4790
+;-----------------------------------------------------------------------------------------------------------------
4791
+%macro FILTER_VER_S_LUMA_64xN_AVX512 2     ; %1 = ss/sp variant, %2 = block height
4792
+INIT_ZMM avx512
4793
+cglobal interp_8tap_vert_%1_64x%2, 5, 8, 21
4794
+    add                   r1d,                r1d             ; srcStride in bytes (int16 samples)
4795
+    lea                   r7,                 [3 * r1]
4796
+    sub                   r0,                 r7              ; rewind src 3 rows (first of the 8 taps)
4797
+    shl                   r4d,                8               ; coeffIdx * 256 = offset of one 4*mmsize coeff set
4798
+%ifdef PIC
4799
+    lea                   r5,                 [pw_LumaCoeffVer_avx512]
4800
+    mova                  m15,                [r5 + r4]                      ; taps 0/1
4801
+    mova                  m16,                [r5 + r4 + 1 * mmsize]         ; taps 2/3
4802
+    mova                  m17,                [r5 + r4 + 2 * mmsize]         ; taps 4/5
4803
+    mova                  m18,                [r5 + r4 + 3 * mmsize]         ; taps 6/7
4804
+%else
4805
+    lea                   r5,                 [pw_LumaCoeffVer_avx512 + r4]
4806
+    mova                  m15,                [r5]
4807
+    mova                  m16,                [r5 + 1 * mmsize]
4808
+    mova                  m17,                [r5 + 2 * mmsize]
4809
+    mova                  m18,                [r5 + 3 * mmsize]
4810
+%endif
4811
+%ifidn %1, sp
4812
+    vbroadcasti32x4       m19,                [pd_526336]     ; 526336 = (8192 << 6) + 2048: offset correction + rounding for >> 12
4813
+    mova                  m20,                [interp8_vsp_store_avx512]     ; permute pattern for the byte stores
4814
+%else
4815
+    add                   r3d,                r3d             ; ss only: dstStride in bytes (int16 output)
4816
+%endif
4817
+
4818
+%rep %2/2 - 1
4819
+    PROCESS_LUMA_VERT_S_64x2_AVX512 %1
4820
+    lea                   r0,                 [r0 + 2 * r1]
4821
+    lea                   r2,                 [r2 + 2 * r3]
4822
+%endrep
4823
+    PROCESS_LUMA_VERT_S_64x2_AVX512 %1      ; final 2-row pass without pointer advance
4824
+    RET
4825
+%endmacro
4826
+
4827
+%if ARCH_X86_64                            ; zmm16+ and r7 usage require x86-64
4828
+    FILTER_VER_S_LUMA_64xN_AVX512 ss, 16
4829
+    FILTER_VER_S_LUMA_64xN_AVX512 ss, 32
4830
+    FILTER_VER_S_LUMA_64xN_AVX512 ss, 48
4831
+    FILTER_VER_S_LUMA_64xN_AVX512 ss, 64
4832
+    FILTER_VER_S_LUMA_64xN_AVX512 sp, 16
4833
+    FILTER_VER_S_LUMA_64xN_AVX512 sp, 32
4834
+    FILTER_VER_S_LUMA_64xN_AVX512 sp, 48
4835
+    FILTER_VER_S_LUMA_64xN_AVX512 sp, 64
4836
+%endif
4837
+;-------------------------------------------------------------------------------------------------------------
4838
+;avx512 luma_vss code end
4839
+;-------------------------------------------------------------------------------------------------------------
4840
+;-------------------------------------------------------------------------------------------------------------
4841
+;avx512 luma_vpp and luma_vps code start
4842
+;-------------------------------------------------------------------------------------------------------------
4843
+%macro PROCESS_LUMA_VERT_16x8_AVX512 1
4844
+    lea                   r5,                 [r0 + 4 * r1]
4845
+    lea                   r4,                 [r5 + 4 * r1]
4846
+    movu                  xm1,                [r0]
4847
+    vinserti32x4          m1,                 [r0 + 2 * r1],       1
4848
+    vinserti32x4          m1,                 [r5],                2
4849
+    vinserti32x4          m1,                 [r5 + 2 * r1],       3
4850
+    movu                  xm3,                [r0 + r1]
4851
+    vinserti32x4          m3,                 [r0 + r6],           1
4852
+    vinserti32x4          m3,                 [r5 + r1],           2
4853
+    vinserti32x4          m3,                 [r5 + r6],           3
4854
+    punpcklbw             m0,                 m1,                  m3
4855
+    pmaddubsw             m0,                 m8
4856
+    punpckhbw             m1,                 m3
4857
+    pmaddubsw             m1,                 m8
4858
+
4859
+    movu                  xm4,                [r0 + 2 * r1]
4860
+    vinserti32x4          m4,                 [r0 + 4 * r1],       1
4861
+    vinserti32x4          m4,                 [r5 + 2 * r1],       2
4862
+    vinserti32x4          m4,                 [r5 + 4 * r1],       3
4863
+    punpcklbw             m2,                 m3,                  m4
4864
+    pmaddubsw             m2,                 m8
4865
+    punpckhbw             m3,                 m4
4866
+    pmaddubsw             m3,                 m8
4867
+
4868
+    movu                  xm5,                [r0 + r6]
4869
+    vinserti32x4          m5,                 [r5 + r1],           1
4870
+    vinserti32x4          m5,                 [r5 + r6],           2
4871
+    vinserti32x4          m5,                 [r4 + r1],           3
4872
+    punpcklbw             m6,                 m4,                  m5
4873
+    pmaddubsw             m6,                 m9
4874
+    punpckhbw             m4,                 m5
4875
+    pmaddubsw             m4,                 m9
4876
+
4877
+    paddw                 m0,                 m6
4878
+    paddw                 m1,                 m4
4879
+
4880
+    movu                  xm4,                [r0 + 4 * r1]
4881
+    vinserti32x4          m4,                 [r5 + 2 * r1],       1
4882
+    vinserti32x4          m4,                 [r5 + 4 * r1],       2
4883
+    vinserti32x4          m4,                 [r4 + 2 * r1],       3
4884
+    punpcklbw             m6,                 m5,                  m4
4885
+    pmaddubsw             m6,                 m9
4886
+    punpckhbw             m5,                 m4
4887
+    pmaddubsw             m5,                 m9
4888
+
4889
+    paddw                 m2,                 m6
4890
+    paddw                 m3,                 m5
4891
+
4892
+    movu                  xm15,               [r5 + r1]
4893
+    vinserti32x4          m15,                [r5 + r6],           1
4894
+    vinserti32x4          m15,                [r4 + r1],           2
4895
+    vinserti32x4          m15,                [r4 + r6],           3
4896
+    punpcklbw             m12,                m4,                 m15
4897
+    pmaddubsw             m12,                m10
4898
+    punpckhbw             m13,                m4,                 m15
4899
+    pmaddubsw             m13,                m10
4900
+
4901
+    lea                   r8,                 [r4 + 4 * r1]
4902
+    movu                  xm4,                [r5 + 2 * r1]
4903
+    vinserti32x4          m4,                 [r5 + 4 * r1],       1
4904
+    vinserti32x4          m4,                 [r4 + 2 * r1],       2
4905
+    vinserti32x4          m4,                 [r4 + 4 * r1],       3
4906
+    punpcklbw             m14,                m15,                 m4
4907
+    pmaddubsw             m14,                m10
4908
+    punpckhbw             m15,                m4
4909
+    pmaddubsw             m15,                m10
4910
+
4911
+    movu                  xm5,                [r5 + r6]
4912
+    vinserti32x4          m5,                 [r4 + r1],           1
4913
+    vinserti32x4          m5,                 [r4 + r6],           2
4914
+    vinserti32x4          m5,                 [r8 + r1],           3
4915
+    punpcklbw             m6,                 m4,                  m5
4916
+    pmaddubsw             m6,                 m11
4917
+    punpckhbw             m4,                 m5
4918
+    pmaddubsw             m4,                 m11
4919
+
4920
+    paddw                 m12,                m6
4921
+    paddw                 m13,                m4
4922
+
4923
+    movu                  xm4,                [r5 + 4 * r1]
4924
+    vinserti32x4          m4,                 [r4 + 2 * r1],       1
4925
+    vinserti32x4          m4,                 [r4 + 4 * r1],       2
4926
+    vinserti32x4          m4,                 [r8 + 2 * r1],       3
4927
+    punpcklbw             m6,                 m5,                  m4
4928
+    pmaddubsw             m6,                 m11
4929
+    punpckhbw             m5,                 m4
4930
+    pmaddubsw             m5,                 m11
4931
+
4932
+    paddw                 m14,                m6
4933
+    paddw                 m15,                m5
4934
+
4935
+    paddw                 m0,                 m12
4936
+    paddw                 m1,                 m13
4937
+    paddw                 m2,                 m14
4938
+    paddw                 m3,                 m15
4939
+%ifidn %1,pp
4940
+    pmulhrsw              m0,                 m7
4941
+    pmulhrsw              m1,                 m7
4942
+    pmulhrsw              m2,                 m7
4943
+    pmulhrsw              m3,                 m7
4944
+
4945
+    packuswb              m0,                 m1
4946
+    packuswb              m2,                 m3
4947
+    movu                  [r2],               xm0
4948
+    movu                  [r2 + r3],          xm2
4949
+    vextracti32x4         [r2 + 2 * r3],      m0,                  1
4950
+    vextracti32x4         [r2 + r7],          m2,                  1
4951
+    lea                   r2,                 [r2 + 4 * r3]
4952
+    vextracti32x4         [r2],               m0,                  2
4953
+    vextracti32x4         [r2 + r3],          m2,                  2
4954
+    vextracti32x4         [r2 + 2 * r3],      m0,                  3
4955
+    vextracti32x4         [r2 + r7],          m2,                  3
4956
+%else
4957
+    psubw                 m0,                 m7
4958
+    psubw                 m1,                 m7
4959
+    mova                  m12,                 m16
4960
+    mova                  m13,                 m17
4961
+    vpermi2q              m12,                 m0,                m1
4962
+    vpermi2q              m13,                 m0,                m1
4963
+    movu                  [r2],               ym12
4964
+    vextracti32x8         [r2 + 2 * r3],      m12,                 1
4965
+
4966
+    psubw                 m2,                 m7
4967
+    psubw                 m3,                 m7
4968
+    mova                  m14,                 m16
4969
+    mova                  m15,                 m17
4970
+    vpermi2q              m14,                 m2,                m3
4971
+    vpermi2q              m15,                 m2,                m3
4972
+    movu                  [r2 + r3],          ym14
4973
+    vextracti32x8         [r2 + r7],          m14,                 1
4974
+    lea                   r2,                 [r2 + 4 * r3]
4975
+
4976
+    movu                  [r2],               ym13
4977
+    movu                  [r2 + r3],          ym15
4978
+    vextracti32x8         [r2 + 2 * r3],      m13,                 1
4979
+    vextracti32x8         [r2 + r7],          m15,                 1
4980
+%endif
4981
+%endmacro
4982
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
4985
+%macro FILTER_VERT_LUMA_16xN_AVX512 2
4986
+INIT_ZMM avx512
4987
+cglobal interp_8tap_vert_%1_16x%2, 5, 9, 18
4988
+    mov                   r4d,                r4m
4989
+    shl                   r4d,                8
4990
+%ifdef PIC
4991
+    lea                   r5,                 [tab_LumaCoeffVer_32_avx512]
4992
+    mova                  m8,                 [r5 + r4]
4993
+    mova                  m9,                 [r5 + r4 + 1 * mmsize]
4994
+    mova                  m10,                [r5 + r4 + 2 * mmsize]
4995
+    mova                  m11,                [r5 + r4 + 3 * mmsize]
4996
+%else
4997
+    mova                  m8,                 [tab_LumaCoeffVer_32_avx512 + r4]
4998
+    mova                  m9,                 [tab_LumaCoeffVer_32_avx512 + r4 + 1 * mmsize]
4999
+    mova                  m10,                [tab_LumaCoeffVer_32_avx512 + r4 + 2 * mmsize]
5000
+    mova                  m11,                [tab_LumaCoeffVer_32_avx512 + r4 + 3 * mmsize]
5001
+%endif
5002
+%ifidn %1, pp
5003
+    vbroadcasti32x8       m7,                 [pw_512]
5004
+%else
5005
+    shl                   r3d,                1
5006
+    vbroadcasti32x8       m7,                 [pw_2000]
5007
+    mova                  m16,                [interp4_vps_store1_avx512]
5008
+    mova                  m17,                [interp4_vps_store2_avx512]
5009
+%endif
5010
+
5011
+    lea                   r6,                 [3 * r1]
5012
+    lea                   r7,                 [3 * r3]
5013
+    sub                   r0,                 r6
5014
+
5015
+%rep %2/8 - 1
5016
+    PROCESS_LUMA_VERT_16x8_AVX512 %1
5017
+    lea                   r0,                 [r4]
5018
+    lea                   r2,                 [r2 + 4 * r3]
5019
+%endrep
5020
+    PROCESS_LUMA_VERT_16x8_AVX512 %1
5021
+    RET
5022
+%endmacro
5023
+
5024
+%if ARCH_X86_64
5025
+    FILTER_VERT_LUMA_16xN_AVX512 pp, 8
5026
+    FILTER_VERT_LUMA_16xN_AVX512 pp, 16
5027
+    FILTER_VERT_LUMA_16xN_AVX512 pp, 32
5028
+    FILTER_VERT_LUMA_16xN_AVX512 pp, 64
5029
+
5030
+    FILTER_VERT_LUMA_16xN_AVX512 ps, 8
5031
+    FILTER_VERT_LUMA_16xN_AVX512 ps, 16
5032
+    FILTER_VERT_LUMA_16xN_AVX512 ps, 32
5033
+    FILTER_VERT_LUMA_16xN_AVX512 ps, 64
5034
+%endif
5035
+%macro PROCESS_LUMA_VERT_32x4_AVX512 1
5036
+    lea                   r5,                 [r0 + 4 * r1]
5037
+    movu                  ym1,                [r0]
5038
+    vinserti32x8          m1,                 [r0 + 2 * r1],       1
5039
+    movu                  ym3,                [r0 + r1]
5040
+    vinserti32x8          m3,                 [r0 + r6],           1
5041
+    punpcklbw             m0,                 m1,                  m3
5042
+    pmaddubsw             m0,                 m8
5043
+    punpckhbw             m1,                 m3
5044
+    pmaddubsw             m1,                 m8
5045
+
5046
+    movu                  ym4,                [r0 + 2 * r1]
5047
+    vinserti32x8          m4,                 [r0 + 4 * r1],       1
5048
+    punpcklbw             m2,                 m3,                  m4
5049
+    pmaddubsw             m2,                 m8
5050
+    punpckhbw             m3,                 m4
5051
+    pmaddubsw             m3,                 m8
5052
+
5053
+    movu                  ym5,                [r0 + r6]
5054
+    vinserti32x8          m5,                 [r5 + r1],           1
5055
+    punpcklbw             m6,                 m4,                  m5
5056
+    pmaddubsw             m6,                 m9
5057
+    punpckhbw             m4,                 m5
5058
+    pmaddubsw             m4,                 m9
5059
+
5060
+    paddw                 m0,                 m6
5061
+    paddw                 m1,                 m4
5062
+
5063
+    movu                  ym4,                [r0 + 4 * r1]
5064
+    vinserti32x8          m4,                 [r5 + 2 * r1],       1
5065
+    punpcklbw             m6,                 m5,                  m4
5066
+    pmaddubsw             m6,                 m9
5067
+    punpckhbw             m5,                 m4
5068
+    pmaddubsw             m5,                 m9
5069
+
5070
+    paddw                 m2,                 m6
5071
+    paddw                 m3,                 m5
5072
+
5073
+    lea                   r4,                 [r5 + 4 * r1]
5074
+    movu                  ym15,               [r5 + r1]
5075
+    vinserti32x8          m15,                [r5 + r6],           1
5076
+    punpcklbw             m12,                m4,                 m15
5077
+    pmaddubsw             m12,                m10
5078
+    punpckhbw             m13,                m4,                 m15
5079
+    pmaddubsw             m13,                m10
5080
+
5081
+    movu                  ym4,                [r5 + 2 * r1]
5082
+    vinserti32x8          m4,                 [r5 + 4 * r1],       1
5083
+    punpcklbw             m14,                m15,                 m4
5084
+    pmaddubsw             m14,                m10
5085
+    punpckhbw             m15,                m4
5086
+    pmaddubsw             m15,                m10
5087
+
5088
+    movu                  ym5,                [r5 + r6]
5089
+    vinserti32x8          m5,                 [r4 + r1],           1
5090
+    punpcklbw             m6,                 m4,                  m5
5091
+    pmaddubsw             m6,                 m11
5092
+    punpckhbw             m4,                 m5
5093
+    pmaddubsw             m4,                 m11
5094
+
5095
+    paddw                 m12,                m6
5096
+    paddw                 m13,                m4
5097
+
5098
+    movu                  ym4,                [r5 + 4 * r1]
5099
+    vinserti32x8          m4,                 [r4 + 2 * r1],       1
5100
+    punpcklbw             m6,                 m5,                  m4
5101
+    pmaddubsw             m6,                 m11
5102
+    punpckhbw             m5,                 m4
5103
+    pmaddubsw             m5,                 m11
5104
+
5105
+    paddw                 m14,                m6
5106
+    paddw                 m15,                m5
5107
+
5108
+    paddw                 m0,                 m12
5109
+    paddw                 m1,                 m13
5110
+    paddw                 m2,                 m14
5111
+    paddw                 m3,                 m15
5112
+%ifidn %1,pp
5113
+    pmulhrsw              m0,                 m7
5114
+    pmulhrsw              m1,                 m7
5115
+    pmulhrsw              m2,                 m7
5116
+    pmulhrsw              m3,                 m7
5117
+
5118
+    packuswb              m0,                 m1
5119
+    packuswb              m2,                 m3
5120
+    movu                  [r2],               ym0
5121
+    movu                  [r2 + r3],          ym2
5122
+    vextracti32x8         [r2 + 2 * r3],      m0,                  1
5123
+    vextracti32x8         [r2 + r7],          m2,                  1
5124
+%else
5125
+    psubw                 m0,                 m7
5126
+    psubw                 m1,                 m7
5127
+    mova                  m12,                 m16
5128
+    mova                  m13,                 m17
5129
+    vpermi2q              m12,                 m0,                m1
5130
+    vpermi2q              m13,                 m0,                m1
5131
+    movu                  [r2],               m12
5132
+    movu                  [r2 + 2 * r3],      m13
5133
+
5134
+    psubw                 m2,                 m7
5135
+    psubw                 m3,                 m7
5136
+    mova                  m14,                 m16
5137
+    mova                  m15,                 m17
5138
+    vpermi2q              m14,                 m2,                m3
5139
+    vpermi2q              m15,                 m2,                m3
5140
+    movu                  [r2 + r3],          m14
5141
+    movu                  [r2 + r7],          m15
5142
+%endif
5143
+%endmacro
5144
;-----------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------------------------------------------
5147
+%macro FILTER_VERT_LUMA_32xN_AVX512 2
5148
+INIT_ZMM avx512
5149
+cglobal interp_8tap_vert_%1_32x%2, 5, 8, 18
5150
+    mov                   r4d,                r4m
5151
+    shl                   r4d,                8
5152
+%ifdef PIC
5153
+    lea                   r5,                 [tab_LumaCoeffVer_32_avx512]
5154
+    mova                  m8,                 [r5 + r4]
5155
+    mova                  m9,                 [r5 + r4 + 1 * mmsize]
5156
+    mova                  m10,                [r5 + r4 + 2 * mmsize]
5157
+    mova                  m11,                [r5 + r4 + 3 * mmsize]
5158
+%else
5159
+    mova                  m8,                 [tab_LumaCoeffVer_32_avx512 + r4]
5160
+    mova                  m9,                 [tab_LumaCoeffVer_32_avx512 + r4 + 1 * mmsize]
5161
+    mova                  m10,                [tab_LumaCoeffVer_32_avx512 + r4 + 2 * mmsize]
5162
+    mova                  m11,                [tab_LumaCoeffVer_32_avx512 + r4 + 3 * mmsize]
5163
+%endif
5164
+%ifidn %1, pp
5165
+    vbroadcasti32x8       m7,                 [pw_512]
5166
+%else
5167
+    shl                   r3d,                1
5168
+    vbroadcasti32x8       m7,                 [pw_2000]
5169
+    mova                  m16,                [interp4_vps_store1_avx512]
5170
+    mova                  m17,                [interp4_vps_store2_avx512]
5171
+%endif
5172
+
5173
+    lea                   r6,                 [3 * r1]
5174
+    lea                   r7,                 [3 * r3]
5175
+    sub                   r0,                 r6
5176
+
5177
+%rep %2/4 - 1
5178
+    PROCESS_LUMA_VERT_32x4_AVX512 %1
5179
+    lea                   r0,                 [r0 + 4 * r1]
5180
+    lea                   r2,                 [r2 + 4 * r3]
5181
+%endrep
5182
+    PROCESS_LUMA_VERT_32x4_AVX512 %1
5183
+    RET
5184
+%endmacro
5185
+
5186
+%if ARCH_X86_64
5187
+    FILTER_VERT_LUMA_32xN_AVX512 pp, 8
5188
+    FILTER_VERT_LUMA_32xN_AVX512 pp, 16
5189
+    FILTER_VERT_LUMA_32xN_AVX512 pp, 24
5190
+    FILTER_VERT_LUMA_32xN_AVX512 pp, 32
5191
+    FILTER_VERT_LUMA_32xN_AVX512 pp, 64
5192
+
5193
+    FILTER_VERT_LUMA_32xN_AVX512 ps, 8
5194
+    FILTER_VERT_LUMA_32xN_AVX512 ps, 16
5195
+    FILTER_VERT_LUMA_32xN_AVX512 ps, 24
5196
+    FILTER_VERT_LUMA_32xN_AVX512 ps, 32
5197
+    FILTER_VERT_LUMA_32xN_AVX512 ps, 64
5198
+%endif
5199
+%macro PROCESS_LUMA_VERT_48x8_AVX512 1
5200
+%ifidn %1, pp
5201
+    PROCESS_LUMA_VERT_32x4_AVX512 pp
5202
+%else
5203
+    PROCESS_LUMA_VERT_32x4_AVX512 ps
5204
+%endif
5205
+    lea                   r8,                 [r4 + 4 * r1]
5206
+    lea                   r9,                 [r2 + 4 * r3]
5207
+    movu                  ym1,                [r5]
5208
+    vinserti32x8          m1,                 [r5 + 2 * r1],       1
5209
+    movu                  ym3,                [r5 + r1]
5210
+    vinserti32x8          m3,                 [r5 + r6],           1
5211
+    punpcklbw             m0,                 m1,                  m3
5212
+    pmaddubsw             m0,                 m8
5213
+    punpckhbw             m1,                 m3
5214
+    pmaddubsw             m1,                 m8
5215
+
5216
+    movu                  ym4,                [r5 + 2 * r1]
5217
+    vinserti32x8          m4,                 [r5 + 4 * r1],       1
5218
+    punpcklbw             m2,                 m3,                  m4
5219
+    pmaddubsw             m2,                 m8
5220
+    punpckhbw             m3,                 m4
5221
+    pmaddubsw             m3,                 m8
5222
+
5223
+    movu                  ym5,                [r5 + r6]
5224
+    vinserti32x8          m5,                 [r4 + r1],           1
5225
+    punpcklbw             m6,                 m4,                  m5
5226
+    pmaddubsw             m6,                 m9
5227
+    punpckhbw             m4,                 m5
5228
+    pmaddubsw             m4,                 m9
5229
+
5230
+    paddw                 m0,                 m6
5231
+    paddw                 m1,                 m4
5232
+
5233
+    movu                  ym4,                [r5 + 4 * r1]
5234
+    vinserti32x8          m4,                 [r4 + 2 * r1],       1
5235
+    punpcklbw             m6,                 m5,                  m4
5236
+    pmaddubsw             m6,                 m9
5237
+    punpckhbw             m5,                 m4
5238
+    pmaddubsw             m5,                 m9
5239
+
5240
+    paddw                 m2,                 m6
5241
+    paddw                 m3,                 m5
5242
+
5243
+    movu                  ym15,               [r4 + r1]
5244
+    vinserti32x8          m15,                [r4 + r6],           1
5245
+    punpcklbw             m12,                m4,                 m15
5246
+    pmaddubsw             m12,                m10
5247
+    punpckhbw             m13,                m4,                 m15
5248
+    pmaddubsw             m13,                m10
5249
+
5250
+    movu                  ym4,                [r4 + 2 * r1]
5251
+    vinserti32x8          m4,                 [r4 + 4 * r1],       1
5252
+    punpcklbw             m14,                m15,                 m4
5253
+    pmaddubsw             m14,                m10
5254
+    punpckhbw             m15,                m4
5255
+    pmaddubsw             m15,                m10
5256
+
5257
+    movu                  ym5,                [r4 + r6]
5258
+    vinserti32x8          m5,                 [r8 + r1],           1
5259
+    punpcklbw             m6,                 m4,                  m5
5260
+    pmaddubsw             m6,                 m11
5261
+    punpckhbw             m4,                 m5
5262
+    pmaddubsw             m4,                 m11
5263
+
5264
+    paddw                 m12,                m6
5265
+    paddw                 m13,                m4
5266
+
5267
+    movu                  ym4,                [r4 + 4 * r1]
5268
+    vinserti32x8          m4,                 [r8 + 2 * r1],       1
5269
+    punpcklbw             m6,                 m5,                  m4
5270
+    pmaddubsw             m6,                 m11
5271
+    punpckhbw             m5,                 m4
5272
+    pmaddubsw             m5,                 m11
5273
+
5274
+    paddw                 m14,                m6
5275
+    paddw                 m15,                m5
5276
+
5277
+    paddw                 m0,                 m12
5278
+    paddw                 m1,                 m13
5279
+    paddw                 m2,                 m14
5280
+    paddw                 m3,                 m15
5281
+%ifidn %1,pp
5282
+    pmulhrsw              m0,                 m7
5283
+    pmulhrsw              m1,                 m7
5284
+    pmulhrsw              m2,                 m7
5285
+    pmulhrsw              m3,                 m7
5286
+    packuswb              m0,                 m1
5287
+    packuswb              m2,                 m3
5288
+
5289
+    movu                  [r9],               ym0
5290
+    movu                  [r9 + r3],          ym2
5291
+    vextracti32x8         [r9 + 2 * r3],      m0,                  1
5292
+    vextracti32x8         [r9 + r7],          m2,                  1
5293
+%else
5294
+    psubw                 m0,                 m7
5295
+    psubw                 m1,                 m7
5296
+    mova                  m12,                 m16
5297
+    mova                  m13,                 m17
5298
+    vpermi2q              m12,                 m0,                m1
5299
+    vpermi2q              m13,                 m0,                m1
5300
+    movu                  [r9],               m12
5301
+    movu                  [r9 + 2 * r3],      m13
5302
+
5303
+    psubw                 m2,                 m7
5304
+    psubw                 m3,                 m7
5305
+    mova                  m14,                 m16
5306
+    mova                  m15,                 m17
5307
+    vpermi2q              m14,                 m2,                m3
5308
+    vpermi2q              m15,                 m2,                m3
5309
+    movu                  [r9 + r3],          m14
5310
+    movu                  [r9 + r7],          m15
5311
+%endif
5312
+    movu                  xm1,                [r0 + mmsize/2]
5313
+    vinserti32x4          m1,                 [r0 + 2 * r1 + mmsize/2],       1
5314
+    vinserti32x4          m1,                 [r5 + mmsize/2],                2
5315
+    vinserti32x4          m1,                 [r5 + 2 * r1 + mmsize/2],       3
5316
+    movu                  xm3,                [r0 + r1 + mmsize/2]
5317
+    vinserti32x4          m3,                 [r0 + r6 + mmsize/2],           1
5318
+    vinserti32x4          m3,                 [r5 + r1 + mmsize/2],           2
5319
+    vinserti32x4          m3,                 [r5 + r6 + mmsize/2],           3
5320
+    punpcklbw             m0,                 m1,                  m3
5321
+    pmaddubsw             m0,                 m8
5322
+    punpckhbw             m1,                 m3
5323
+    pmaddubsw             m1,                 m8
5324
+
5325
+    movu                  xm4,                [r0 + 2 * r1 + mmsize/2]
5326
+    vinserti32x4          m4,                 [r0 + 4 * r1 + mmsize/2],       1
5327
+    vinserti32x4          m4,                 [r5 + 2 * r1 + mmsize/2],       2
5328
+    vinserti32x4          m4,                 [r5 + 4 * r1 + mmsize/2],       3
5329
+    punpcklbw             m2,                 m3,                  m4
5330
+    pmaddubsw             m2,                 m8
5331
+    punpckhbw             m3,                 m4
5332
+    pmaddubsw             m3,                 m8
5333
+
5334
+    movu                  xm5,                [r0 + r6 + mmsize/2]
5335
+    vinserti32x4          m5,                 [r5 + r1 + mmsize/2],           1
5336
+    vinserti32x4          m5,                 [r5 + r6 + mmsize/2],           2
5337
+    vinserti32x4          m5,                 [r4 + r1 + mmsize/2],           3
5338
+    punpcklbw             m6,                 m4,                  m5
5339
+    pmaddubsw             m6,                 m9
5340
+    punpckhbw             m4,                 m5
5341
+    pmaddubsw             m4,                 m9
5342
+
5343
+    paddw                 m0,                 m6
5344
+    paddw                 m1,                 m4
5345
+
5346
+    movu                  xm4,                [r0 + 4 * r1 + mmsize/2]
5347
+    vinserti32x4          m4,                 [r5 + 2 * r1 + mmsize/2],       1
5348
+    vinserti32x4          m4,                 [r5 + 4 * r1 + mmsize/2],       2
5349
+    vinserti32x4          m4,                 [r4 + 2 * r1 + mmsize/2],       3
5350
+    punpcklbw             m6,                 m5,                  m4
5351
+    pmaddubsw             m6,                 m9
5352
+    punpckhbw             m5,                 m4
5353
+    pmaddubsw             m5,                 m9
5354
+
5355
+    paddw                 m2,                 m6
5356
+    paddw                 m3,                 m5
5357
+
5358
+    movu                  xm15,               [r5 + r1 + mmsize/2]
5359
+    vinserti32x4          m15,                [r5 + r6 + mmsize/2],           1
5360
+    vinserti32x4          m15,                [r4 + r1 + mmsize/2],           2
5361
+    vinserti32x4          m15,                [r4 + r6 + mmsize/2],           3
5362
+    punpcklbw             m12,                m4,                 m15
5363
+    pmaddubsw             m12,                m10
5364
+    punpckhbw             m13,                m4,                 m15
5365
+    pmaddubsw             m13,                m10
5366
+
5367
+    movu                  xm4,                [r5 + 2 * r1 + mmsize/2]
5368
+    vinserti32x4          m4,                 [r5 + 4 * r1 + mmsize/2],       1
5369
+    vinserti32x4          m4,                 [r4 + 2 * r1 + mmsize/2],       2
5370
+    vinserti32x4          m4,                 [r4 + 4 * r1 + mmsize/2],       3
5371
+    punpcklbw             m14,                m15,                 m4
5372
+    pmaddubsw             m14,                m10
5373
+    punpckhbw             m15,                m4
5374
+    pmaddubsw             m15,                m10
5375
+
5376
+    movu                  xm5,                [r5 + r6 + mmsize/2]
5377
+    vinserti32x4          m5,                 [r4 + r1 + mmsize/2],           1
5378
+    vinserti32x4          m5,                 [r4 + r6 + mmsize/2],           2
5379
+    vinserti32x4          m5,                 [r8 + r1 + mmsize/2],           3
5380
+    punpcklbw             m6,                 m4,                  m5
5381
+    pmaddubsw             m6,                 m11
5382
+    punpckhbw             m4,                 m5
5383
+    pmaddubsw             m4,                 m11
5384
+
5385
+    paddw                 m12,                m6
5386
+    paddw                 m13,                m4
5387
+
5388
+    movu                  xm4,                [r5 + 4 * r1 + mmsize/2]
5389
+    vinserti32x4          m4,                 [r4 + 2 * r1 + mmsize/2],       1
5390
+    vinserti32x4          m4,                 [r4 + 4 * r1 + mmsize/2],       2
5391
+    vinserti32x4          m4,                 [r8 + 2 * r1 + mmsize/2],       3
5392
+    punpcklbw             m6,                 m5,                  m4
5393
+    pmaddubsw             m6,                 m11
5394
+    punpckhbw             m5,                 m4
5395
+    pmaddubsw             m5,                 m11
5396
+
5397
+    paddw                 m14,                m6
5398
+    paddw                 m15,                m5
5399
+
5400
+    paddw                 m0,                 m12
5401
+    paddw                 m1,                 m13
5402
+    paddw                 m2,                 m14
5403
+    paddw                 m3,                 m15
5404
+%ifidn %1, pp
5405
+    pmulhrsw              m0,                 m7
5406
+    pmulhrsw              m1,                 m7
5407
+    pmulhrsw              m2,                 m7
5408
+    pmulhrsw              m3,                 m7
5409
+
5410
+    packuswb              m0,                 m1
5411
+    packuswb              m2,                 m3
5412
+    movu                  [r2 + mmsize/2],               xm0
5413
+    movu                  [r2 + r3 + mmsize/2],          xm2
5414
+    vextracti32x4         [r2 + 2 * r3 + mmsize/2],      m0,                  1
5415
+    vextracti32x4         [r2 + r7 + mmsize/2],          m2,                  1
5416
+    lea                   r2,                 [r2 + 4 * r3]
5417
+    vextracti32x4         [r2 + mmsize/2],               m0,                  2
5418
+    vextracti32x4         [r2 + r3 + mmsize/2],          m2,                  2
5419
+    vextracti32x4         [r2 + 2 * r3 + mmsize/2],      m0,                  3
5420
+    vextracti32x4         [r2 + r7 + mmsize/2],          m2,                  3
5421
+%else
5422
+    psubw                 m0,                 m7
5423
+    psubw                 m1,                 m7
5424
+    mova                  m12,                 m16
5425
+    mova                  m13,                 m17
5426
+    vpermi2q              m12,                 m0,                m1
5427
+    vpermi2q              m13,                 m0,                m1
5428
+    movu                  [r2 + mmsize],               ym12
5429
+    vextracti32x8         [r2 + 2 * r3 + mmsize],      m12,                 1
5430
+
5431
+    psubw                 m2,                 m7
5432
+    psubw                 m3,                 m7
5433
+    mova                  m14,                 m16
5434
+    mova                  m15,                 m17
5435
+    vpermi2q              m14,                 m2,                m3
5436
+    vpermi2q              m15,                 m2,                m3
5437
+    movu                  [r2 + r3 + mmsize],          ym14
5438
+    vextracti32x8         [r2 + r7 + mmsize],          m14,                 1
5439
+    lea                   r2,                          [r2 + 4 * r3]
5440
+
5441
+    movu                  [r2 + mmsize],               ym13
5442
+    movu                  [r2 + r3 + mmsize],          ym15
5443
+    vextracti32x8         [r2 + 2 * r3 + mmsize],      m13,                 1
5444
+    vextracti32x8         [r2 + r7 + mmsize],          m15,                 1
5445
+%endif
5446
+%endmacro
5447
+;-----------------------------------------------------------------------------------------------------------------
5448
+; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
5449
+;-----------------------------------------------------------------------------------------------------------------
5450
+%macro FILTER_VERT_LUMA_48x64_AVX512 1
5451
+INIT_ZMM avx512
5452
+cglobal interp_8tap_vert_%1_48x64, 5, 10, 18
5453
+    mov                   r4d,                r4m
5454
+    shl                   r4d,                8
5455
+
5456
+%ifdef PIC
5457
+    lea                   r5,                 [tab_LumaCoeffVer_32_avx512]
5458
+    mova                  m8,                 [r5 + r4]
5459
+    mova                  m9,                 [r5 + r4 + 1 * mmsize]
5460
+    mova                  m10,                [r5 + r4 + 2 * mmsize]
5461
+    mova                  m11,                [r5 + r4 + 3 * mmsize]
5462
+%else
5463
+    mova                  m8,                 [tab_LumaCoeffVer_32_avx512 + r4]
5464
+    mova                  m9,                 [tab_LumaCoeffVer_32_avx512 + r4 + 1 * mmsize]
5465
+    mova                  m10,                [tab_LumaCoeffVer_32_avx512 + r4 + 2 * mmsize]
5466
+    mova                  m11,                [tab_LumaCoeffVer_32_avx512 + r4 + 3 * mmsize]
5467
+%endif
5468
+%ifidn %1, pp
5469
+    vbroadcasti32x8       m7,                 [pw_512]
5470
+%else
5471
+    shl                   r3d,                1
5472
+    vbroadcasti32x8       m7,                 [pw_2000]
5473
+    mova                  m16,                [interp4_vps_store1_avx512]
5474
+    mova                  m17,                [interp4_vps_store2_avx512]
5475
+%endif
5476
+
5477
+    lea                   r6,                 [3 * r1]
5478
+    lea                   r7,                 [3 * r3]
5479
+    sub                   r0,                 r6
5480
+
5481
+%rep 7
5482
+    PROCESS_LUMA_VERT_48x8_AVX512 %1
5483
+    lea                   r0,                 [r4]
5484
+    lea                   r2,                 [r2 + 4 * r3]
5485
+%endrep
5486
+    PROCESS_LUMA_VERT_48x8_AVX512 %1
5487
+    RET
5488
+%endmacro
5489
+
5490
+%if ARCH_X86_64
5491
+    FILTER_VERT_LUMA_48x64_AVX512 pp
5492
+    FILTER_VERT_LUMA_48x64_AVX512 ps
5493
+%endif
5494
+%macro PROCESS_LUMA_VERT_64x2_AVX512 1
5495
+    lea                   r5,                 [r0 + 4 * r1]
5496
+    movu                  m1,                 [r0]
5497
+    movu                  m3,                 [r0 + r1]
5498
+    punpcklbw             m0,                 m1,                  m3
5499
+    pmaddubsw             m0,                 m8
5500
+    punpckhbw             m1,                 m3
5501
+    pmaddubsw             m1,                 m8
5502
+
5503
+    movu                  m4,                 [r0 + 2 * r1]
5504
+    punpcklbw             m2,                 m3,                  m4
5505
+    pmaddubsw             m2,                 m8
5506
+    punpckhbw             m3,                 m4
5507
+    pmaddubsw             m3,                 m8
5508
+
5509
+    movu                  m5,                 [r0 + r6]
5510
+    punpcklbw             m6,                 m4,                  m5
5511
+    pmaddubsw             m6,                 m9
5512
+    punpckhbw             m4,                 m5
5513
+    pmaddubsw             m4,                 m9
5514
+
5515
+    paddw                 m0,                 m6
5516
+    paddw                 m1,                 m4
5517
+
5518
+    movu                  m4,                 [r0 + 4 * r1]
5519
+    punpcklbw             m6,                 m5,                  m4
5520
+    pmaddubsw             m6,                 m9
5521
+    punpckhbw             m5,                 m4
5522
+    pmaddubsw             m5,                 m9
5523
+
5524
+    paddw                 m2,                 m6
5525
+    paddw                 m3,                 m5
5526
+
5527
+    movu                  m15,                [r5 + r1]
5528
+    punpcklbw             m12,                m4,                  m15
5529
+    pmaddubsw             m12,                m10
5530
+    punpckhbw             m13,                m4,                  m15
5531
+    pmaddubsw             m13,                m10
5532
+
5533
+    movu                  m4,                 [r5 + 2 * r1]
5534
+    punpcklbw             m14,                m15,                 m4
5535
+    pmaddubsw             m14,                m10
5536
+    punpckhbw             m15,                m4
5537
+    pmaddubsw             m15,                m10
5538
+
5539
+    movu                  m5,                 [r5 + r6]
5540
+    punpcklbw             m6,                 m4,                  m5
5541
+    pmaddubsw             m6,                 m11
5542
+    punpckhbw             m4,                 m5
5543
+    pmaddubsw             m4,                 m11
5544
+
5545
+    paddw                 m12,                m6
5546
+    paddw                 m13,                m4
5547
+
5548
+    movu                  m4,                 [r5 + 4 * r1]
5549
+    punpcklbw             m6,                 m5,                  m4
5550
+    pmaddubsw             m6,                 m11
5551
+    punpckhbw             m5,                 m4
5552
+    pmaddubsw             m5,                 m11
5553
+
5554
+    paddw                 m14,                m6
5555
+    paddw                 m15,                m5
5556
+
5557
+    paddw                 m0,                 m12
5558
+    paddw                 m1,                 m13
5559
+    paddw                 m2,                 m14
5560
+    paddw                 m3,                 m15
5561
+%ifidn %1,pp
5562
+    pmulhrsw              m0,                 m7
5563
+    pmulhrsw              m1,                 m7
5564
+    pmulhrsw              m2,                 m7
5565
+    pmulhrsw              m3,                 m7
5566
+
5567
+    packuswb              m0,                 m1
5568
+    packuswb              m2,                 m3
5569
+    movu                  [r2],               m0
5570
+    movu                  [r2 + r3],          m2
5571
+%else
5572
+    psubw                 m0,                 m7
5573
+    psubw                 m1,                 m7
5574
+    mova                  m12,                 m16
5575
+    mova                  m13,                 m17
5576
+    vpermi2q              m12,                 m0,                m1
5577
+    vpermi2q              m13,                 m0,                m1
5578
+    movu                  [r2],               m12
5579
+    movu                  [r2 + mmsize],      m13
5580
+
5581
+    psubw                 m2,                 m7
5582
+    psubw                 m3,                 m7
5583
+    mova                  m14,                 m16
5584
+    mova                  m15,                 m17
5585
+    vpermi2q              m14,                 m2,                m3
5586
+    vpermi2q              m15,                 m2,                m3
5587
+    movu                  [r2 + r3],          m14
5588
+    movu                  [r2 + r3 + mmsize], m15
5589
+%endif
5590
+%endmacro
5591
+;-----------------------------------------------------------------------------------------------------------------
5592
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
5593
+;-----------------------------------------------------------------------------------------------------------------
5594
+%macro FILTER_VERT_LUMA_64xN_AVX512 2
5595
+INIT_ZMM avx512
5596
+cglobal interp_8tap_vert_%1_64x%2, 5, 8, 18
5597
+    mov                   r4d,                r4m
5598
+    shl                   r4d,                8
5599
+%ifdef PIC
5600
+    lea                   r5,                 [tab_LumaCoeffVer_32_avx512]
5601
+    mova                  m8,                 [r5 + r4]
5602
+    mova                  m9,                 [r5 + r4 + 1 * mmsize]
5603
+    mova                  m10,                [r5 + r4 + 2 * mmsize]
5604
+    mova                  m11,                [r5 + r4 + 3 * mmsize]
5605
+%else
5606
+    mova                  m8,                 [tab_LumaCoeffVer_32_avx512 + r4]
5607
+    mova                  m9,                 [tab_LumaCoeffVer_32_avx512 + r4 + 1 * mmsize]
5608
+    mova                  m10,                [tab_LumaCoeffVer_32_avx512 + r4 + 2 * mmsize]
5609
+    mova                  m11,                [tab_LumaCoeffVer_32_avx512 + r4 + 3 * mmsize]
5610
+%endif
5611
+%ifidn %1, pp
5612
+    vbroadcasti32x8       m7,                 [pw_512]
5613
+%else
5614
+    shl                   r3d,                1
5615
+    vbroadcasti32x8       m7,                 [pw_2000]
5616
+    mova                  m16,                [interp4_vps_store1_avx512]
5617
+    mova                  m17,                [interp4_vps_store2_avx512]
5618
+%endif
5619
+
5620
+    lea                   r6,                 [3 * r1]
5621
+    sub                   r0,                 r6
5622
+    lea                   r7,                 [3 * r3]
5623
+
5624
+%rep %2/2 - 1
5625
+    PROCESS_LUMA_VERT_64x2_AVX512 %1
5626
+    lea                   r0,                 [r0 + 2 * r1]
5627
+    lea                   r2,                 [r2 + 2 * r3]
5628
+%endrep
5629
+    PROCESS_LUMA_VERT_64x2_AVX512 %1
5630
+    RET
5631
+%endmacro
5632
+
5633
+%if ARCH_X86_64
5634
+FILTER_VERT_LUMA_64xN_AVX512 pp, 16
5635
+FILTER_VERT_LUMA_64xN_AVX512 pp, 32
5636
+FILTER_VERT_LUMA_64xN_AVX512 pp, 48
5637
+FILTER_VERT_LUMA_64xN_AVX512 pp, 64
5638
+
5639
+FILTER_VERT_LUMA_64xN_AVX512 ps, 16
5640
+FILTER_VERT_LUMA_64xN_AVX512 ps, 32
5641
+FILTER_VERT_LUMA_64xN_AVX512 ps, 48
5642
+FILTER_VERT_LUMA_64xN_AVX512 ps, 64
5643
+%endif
5644
+;-------------------------------------------------------------------------------------------------------------
5645
+;avx512 luma_vpp and luma_vps code end
5646
+;-------------------------------------------------------------------------------------------------------------
5647
+;-------------------------------------------------------------------------------------------------------------
5648
+;ipfilter_luma_avx512 code end
5649
+;-------------------------------------------------------------------------------------------------------------
5650
\ No newline at end of file
5651
x265_2.7.tar.gz/source/common/x86/ipfilter8.h -> x265_2.9.tar.gz/source/common/x86/ipfilter8.h Changed
16
 
1
@@ -33,6 +33,7 @@
2
     FUNCDEF_PU(void, interp_8tap_vert_ss, cpu, const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); \
3
     FUNCDEF_PU(void, interp_8tap_hv_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); \
4
     FUNCDEF_CHROMA_PU(void, filterPixelToShort, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
5
+    FUNCDEF_CHROMA_PU(void, filterPixelToShort_aligned, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride); \
6
     FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
7
     FUNCDEF_CHROMA_PU(void, interp_4tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
8
     FUNCDEF_CHROMA_PU(void, interp_4tap_vert_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
9
@@ -45,5 +46,6 @@
10
 SETUP_FUNC_DEF(sse3);
11
 SETUP_FUNC_DEF(sse4);
12
 SETUP_FUNC_DEF(avx2);
13
+SETUP_FUNC_DEF(avx512);
14
 
15
 #endif // ifndef X265_IPFILTER8_H
16
x265_2.7.tar.gz/source/common/x86/loopfilter.asm -> x265_2.9.tar.gz/source/common/x86/loopfilter.asm Changed
50
 
1
@@ -58,6 +58,7 @@
2
 ;============================================================================================================
3
 INIT_XMM sse4
4
 %if HIGH_BIT_DEPTH
5
+%if ARCH_X86_64
6
 cglobal saoCuOrgE0, 4,5,9
7
     mov         r4d, r4m
8
     movh        m6,  [r1]
9
@@ -157,7 +158,7 @@
10
     sub         r4d, 16
11
     jnz        .loopH
12
     RET
13
-
14
+%endif
15
 %else ; HIGH_BIT_DEPTH == 1
16
 
17
 cglobal saoCuOrgE0, 5, 5, 8, rec, offsetEo, lcuWidth, signLeft, stride
18
@@ -249,6 +250,7 @@
19
 
20
 INIT_YMM avx2
21
 %if HIGH_BIT_DEPTH
22
+%if ARCH_X86_64
23
 cglobal saoCuOrgE0, 4,4,9
24
     vbroadcasti128  m6, [r1]
25
     movzx           r1d, byte [r3]
26
@@ -308,6 +310,7 @@
27
     dec             r2d
28
     jnz             .loop
29
     RET
30
+%endif
31
 %else ; HIGH_BIT_DEPTH
32
 cglobal saoCuOrgE0, 5, 5, 7, rec, offsetEo, lcuWidth, signLeft, stride
33
 
34
@@ -1655,6 +1658,7 @@
35
     RET
36
 %endif
37
 
38
+%if ARCH_X86_64
39
 INIT_YMM avx2
40
 %if HIGH_BIT_DEPTH
41
 cglobal saoCuOrgB0, 5,7,8
42
@@ -1814,6 +1818,7 @@
43
 .end:
44
     RET
45
 %endif
46
+%endif
47
 
48
 ;============================================================================================================
49
 ; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int width)
50
x265_2.7.tar.gz/source/common/x86/mc-a.asm -> x265_2.9.tar.gz/source/common/x86/mc-a.asm Changed
1841
 
1
@@ -46,13 +46,10 @@
2
     %error Unsupport bit depth!
3
 %endif
4
 
5
-SECTION_RODATA 32
6
+SECTION_RODATA 64
7
 
8
-ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
9
-ch_shuf_adj: times 8 db 0
10
-             times 8 db 2
11
-             times 8 db 4
12
-             times 8 db 6
13
+ALIGN 64
14
+const shuf_avx512,  dq 0, 2, 4, 6, 1, 3, 5, 7
15
 
16
 SECTION .text
17
 
18
@@ -1037,6 +1034,7 @@
19
 ;------------------------------------------------------------------------------
20
 ; avx2 asm for addAvg high_bit_depth
21
 ;------------------------------------------------------------------------------
22
+%if ARCH_X86_64
23
 INIT_YMM avx2
24
 cglobal addAvg_8x2, 6,6,2, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
25
     movu        xm0,         [r0]
26
@@ -1114,6 +1112,7 @@
27
     movu        [r2],        xm0
28
     movu        [r2 + r5],   xm2
29
     RET
30
+%endif
31
 
32
 %macro ADDAVG_W8_H4_AVX2 1
33
 cglobal addAvg_8x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
34
@@ -1168,13 +1167,16 @@
35
     RET
36
 %endmacro
37
 
38
+%if ARCH_X86_64
39
 ADDAVG_W8_H4_AVX2 4
40
 ADDAVG_W8_H4_AVX2 8
41
 ADDAVG_W8_H4_AVX2 12
42
 ADDAVG_W8_H4_AVX2 16
43
 ADDAVG_W8_H4_AVX2 32
44
 ADDAVG_W8_H4_AVX2 64
45
+%endif
46
 
47
+%if ARCH_X86_64
48
 cglobal addAvg_12x16, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
49
     mova           m4,             [pw_ %+ ADDAVG_ROUND]
50
     mova           m5,             [pw_pixel_max]
51
@@ -1258,6 +1260,7 @@
52
     dec            r6d
53
     jnz            .loop
54
     RET
55
+%endif
56
 
57
 %macro ADDAVG_W16_H4_AVX2 1
58
 cglobal addAvg_16x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
59
@@ -1299,6 +1302,7 @@
60
     RET
61
 %endmacro
62
 
63
+%if ARCH_X86_64
64
 ADDAVG_W16_H4_AVX2 4
65
 ADDAVG_W16_H4_AVX2 8
66
 ADDAVG_W16_H4_AVX2 12
67
@@ -1306,7 +1310,9 @@
68
 ADDAVG_W16_H4_AVX2 24
69
 ADDAVG_W16_H4_AVX2 32
70
 ADDAVG_W16_H4_AVX2 64
71
+%endif
72
 
73
+%if ARCH_X86_64
74
 cglobal addAvg_24x32, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
75
     mova        m4,              [pw_ %+ ADDAVG_ROUND]
76
     mova        m5,              [pw_pixel_max]
77
@@ -1418,6 +1424,7 @@
78
     dec         r6d
79
     jnz         .loop
80
     RET
81
+%endif
82
 
83
 %macro ADDAVG_W32_H2_AVX2 1
84
 cglobal addAvg_32x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
85
@@ -1477,13 +1484,16 @@
86
     RET
87
 %endmacro
88
 
89
+%if ARCH_X86_64
90
 ADDAVG_W32_H2_AVX2 8
91
 ADDAVG_W32_H2_AVX2 16
92
 ADDAVG_W32_H2_AVX2 24
93
 ADDAVG_W32_H2_AVX2 32
94
 ADDAVG_W32_H2_AVX2 48
95
 ADDAVG_W32_H2_AVX2 64
96
+%endif
97
 
98
+%if ARCH_X86_64
99
 cglobal addAvg_48x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
100
     mova        m4,              [pw_ %+ ADDAVG_ROUND]
101
     mova        m5,              [pw_pixel_max]
102
@@ -1557,6 +1567,7 @@
103
     dec         r6d
104
     jnz        .loop
105
     RET
106
+%endif
107
 
108
 %macro ADDAVG_W64_H1_AVX2 1
109
 cglobal addAvg_64x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
110
@@ -1652,10 +1663,729 @@
111
     RET
112
 %endmacro
113
 
114
+%if ARCH_X86_64
115
 ADDAVG_W64_H1_AVX2 16
116
 ADDAVG_W64_H1_AVX2 32
117
 ADDAVG_W64_H1_AVX2 48
118
 ADDAVG_W64_H1_AVX2 64
119
+%endif
120
+;-----------------------------------------------------------------------------
121
+;addAvg avx512 high bit depth code start
122
+;-----------------------------------------------------------------------------
123
+%macro PROCESS_ADDAVG_16x4_HBD_AVX512 0
124
+    movu              ym0,              [r0]
125
+    vinserti32x8       m0,              [r0 + r3], 1
126
+    movu              ym1,              [r1]
127
+    vinserti32x8       m1,              [r1 + r4], 1
128
+
129
+    paddw              m0,              m1
130
+    pmulhrsw           m0,              m3
131
+    paddw              m0,              m4
132
+    pmaxsw             m0,              m2
133
+    pminsw             m0,              m5
134
+
135
+    movu             [r2],              ym0
136
+    vextracti32x8    [r2 + r5],         m0, 1
137
+
138
+    movu              ym0,              [r0 + 2 * r3]
139
+    vinserti32x8       m0,              [r0 + r6], 1
140
+    movu              ym1,              [r1 + 2 * r4]
141
+    vinserti32x8       m1,              [r1 + r7], 1
142
+
143
+    paddw              m0,              m1
144
+    pmulhrsw           m0,              m3
145
+    paddw              m0,              m4
146
+    pmaxsw             m0,              m2
147
+    pminsw             m0,              m5
148
+
149
+    movu             [r2 + 2 * r5],    ym0
150
+    vextracti32x8    [r2 + r8],         m0, 1
151
+%endmacro
152
+
153
+%macro PROCESS_ADDAVG_32x4_HBD_AVX512 0
154
+    movu        m0,              [r0]
155
+    movu        m1,              [r1]
156
+    paddw       m0,              m1
157
+    pmulhrsw    m0,              m3
158
+    paddw       m0,              m4
159
+    pmaxsw      m0,              m2
160
+    pminsw      m0,              m5
161
+    movu        [r2],            m0
162
+
163
+    movu        m0,              [r0 + r3]
164
+    movu        m1,              [r1 + r4]
165
+    paddw       m0,              m1
166
+    pmulhrsw    m0,              m3
167
+    paddw       m0,              m4
168
+    pmaxsw      m0,              m2
169
+    pminsw      m0,              m5
170
+    movu        [r2 + r5],       m0
171
+
172
+    movu        m0,              [r0 + 2 * r3]
173
+    movu        m1,              [r1 + 2 * r4]
174
+    paddw       m0,              m1
175
+    pmulhrsw    m0,              m3
176
+    paddw       m0,              m4
177
+    pmaxsw      m0,              m2
178
+    pminsw      m0,              m5
179
+    movu        [r2 + 2 * r5],   m0
180
+
181
+    movu        m0,              [r0 + r6]
182
+    movu        m1,              [r1 + r7]
183
+    paddw       m0,              m1
184
+    pmulhrsw    m0,              m3
185
+    paddw       m0,              m4
186
+    pmaxsw      m0,              m2
187
+    pminsw      m0,              m5
188
+    movu        [r2 + r8],       m0
189
+%endmacro
190
+
191
+%macro PROCESS_ADDAVG_64x4_HBD_AVX512 0
192
+    movu        m0,              [r0]
193
+    movu        m1,              [r1]
194
+    paddw       m0,              m1
195
+    pmulhrsw    m0,              m3
196
+    paddw       m0,              m4
197
+    pmaxsw      m0,              m2
198
+    pminsw      m0,              m5
199
+    movu        [r2],            m0
200
+
201
+    movu        m0,              [r0 + mmsize]
202
+    movu        m1,              [r1 + mmsize]
203
+    paddw       m0,              m1
204
+    pmulhrsw    m0,              m3
205
+    paddw       m0,              m4
206
+    pmaxsw      m0,              m2
207
+    pminsw      m0,              m5
208
+    movu        [r2 + mmsize],   m0
209
+
210
+    movu        m0,              [r0 + r3]
211
+    movu        m1,              [r1 + r4]
212
+    paddw       m0,              m1
213
+    pmulhrsw    m0,              m3
214
+    paddw       m0,              m4
215
+    pmaxsw      m0,              m2
216
+    pminsw      m0,              m5
217
+    movu        [r2 + r5],       m0
218
+
219
+    movu        m0,              [r0 + r3 + mmsize]
220
+    movu        m1,              [r1 + r4 + mmsize]
221
+    paddw       m0,              m1
222
+    pmulhrsw    m0,              m3
223
+    paddw       m0,              m4
224
+    pmaxsw      m0,              m2
225
+    pminsw      m0,              m5
226
+    movu        [r2 + r5 + mmsize],       m0
227
+
228
+    movu        m0,              [r0 + 2 * r3]
229
+    movu        m1,              [r1 + 2 * r4]
230
+    paddw       m0,              m1
231
+    pmulhrsw    m0,              m3
232
+    paddw       m0,              m4
233
+    pmaxsw      m0,              m2
234
+    pminsw      m0,              m5
235
+    movu        [r2 + 2 * r5],   m0
236
+
237
+    movu        m0,              [r0 + 2 * r3 + mmsize]
238
+    movu        m1,              [r1 + 2 * r4 + mmsize]
239
+    paddw       m0,              m1
240
+    pmulhrsw    m0,              m3
241
+    paddw       m0,              m4
242
+    pmaxsw      m0,              m2
243
+    pminsw      m0,              m5
244
+    movu        [r2 + 2 * r5 + mmsize],   m0
245
+
246
+    movu        m0,              [r0 + r6]
247
+    movu        m1,              [r1 + r7]
248
+    paddw       m0,              m1
249
+    pmulhrsw    m0,              m3
250
+    paddw       m0,              m4
251
+    pmaxsw      m0,              m2
252
+    pminsw      m0,              m5
253
+    movu        [r2 + r8],       m0
254
+
255
+    movu        m0,              [r0 + r6 + mmsize]
256
+    movu        m1,              [r1 + r7 + mmsize]
257
+    paddw       m0,              m1
258
+    pmulhrsw    m0,              m3
259
+    paddw       m0,              m4
260
+    pmaxsw      m0,              m2
261
+    pminsw      m0,              m5
262
+    movu        [r2 + r8 + mmsize],       m0
263
+%endmacro
264
+
265
+%macro PROCESS_ADDAVG_48x4_HBD_AVX512 0
266
+    movu        m0,              [r0]
267
+    movu        m1,              [r1]
268
+    paddw       m0,              m1
269
+    pmulhrsw    m0,              m3
270
+    paddw       m0,              m4
271
+    pmaxsw      m0,              m2
272
+    pminsw      m0,              m5
273
+    movu        [r2],            m0
274
+
275
+    movu        ym0,              [r0 + mmsize]
276
+    movu        ym1,              [r1 + mmsize]
277
+    paddw       ym0,              ym1
278
+    pmulhrsw    ym0,              ym3
279
+    paddw       ym0,              ym4
280
+    pmaxsw      ym0,              ym2
281
+    pminsw      ym0,              ym5
282
+    movu        [r2 + mmsize],    ym0
283
+
284
+    movu        m0,              [r0 + r3]
285
+    movu        m1,              [r1 + r4]
286
+    paddw       m0,              m1
287
+    pmulhrsw    m0,              m3
288
+    paddw       m0,              m4
289
+    pmaxsw      m0,              m2
290
+    pminsw      m0,              m5
291
+    movu        [r2 + r5],       m0
292
+
293
+    movu        ym0,              [r0 + r3 + mmsize]
294
+    movu        ym1,              [r1 + r4 + mmsize]
295
+    paddw       ym0,              ym1
296
+    pmulhrsw    ym0,              ym3
297
+    paddw       ym0,              ym4
298
+    pmaxsw      ym0,              ym2
299
+    pminsw      ym0,              ym5
300
+    movu        [r2 + r5 + mmsize],       ym0
301
+
302
+    movu        m0,              [r0 + 2 * r3]
303
+    movu        m1,              [r1 + 2 * r4]
304
+    paddw       m0,              m1
305
+    pmulhrsw    m0,              m3
306
+    paddw       m0,              m4
307
+    pmaxsw      m0,              m2
308
+    pminsw      m0,              m5
309
+    movu        [r2 + 2 * r5],   m0
310
+
311
+    movu        ym0,              [r0 + 2 * r3 + mmsize]
312
+    movu        ym1,              [r1 + 2 * r4 + mmsize]
313
+    paddw       ym0,              ym1
314
+    pmulhrsw    ym0,              ym3
315
+    paddw       ym0,              ym4
316
+    pmaxsw      ym0,              ym2
317
+    pminsw      ym0,              ym5
318
+    movu        [r2 + 2 * r5 + mmsize],   ym0
319
+
320
+    movu        m0,              [r0 + r6]
321
+    movu        m1,              [r1 + r7]
322
+    paddw       m0,              m1
323
+    pmulhrsw    m0,              m3
324
+    paddw       m0,              m4
325
+    pmaxsw      m0,              m2
326
+    pminsw      m0,              m5
327
+    movu        [r2 + r8],       m0
328
+
329
+    movu        ym0,              [r0 + r6 + mmsize]
330
+    movu        ym1,              [r1 + r7 + mmsize]
331
+    paddw       ym0,              ym1
332
+    pmulhrsw    ym0,              ym3
333
+    paddw       ym0,              ym4
334
+    pmaxsw      ym0,              ym2
335
+    pminsw      ym0,              ym5
336
+    movu        [r2 + r8 + mmsize],       ym0
337
+%endmacro
338
+;-----------------------------------------------------------------------------
339
+;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
340
+;-----------------------------------------------------------------------------
341
+%if ARCH_X86_64
342
+INIT_ZMM avx512
343
+cglobal addAvg_16x4, 6,9,6
344
+    vbroadcasti32x8        m4,              [pw_ %+ ADDAVG_ROUND]
345
+    vbroadcasti32x8        m5,              [pw_pixel_max]
346
+    vbroadcasti32x8        m3,              [pw_ %+ ADDAVG_FACTOR]
347
+    pxor        m2,        m2
348
+    add         r3,        r3
349
+    add         r4,        r4
350
+    add         r5,        r5
351
+    lea         r6,        [3 * r3]
352
+    lea         r7,        [3 * r4]
353
+    lea         r8,        [3 * r5]
354
+    PROCESS_ADDAVG_16x4_HBD_AVX512
355
+    RET
356
+%endif
357
+
358
+%macro ADDAVG_W16_HBD_AVX512 1
359
+INIT_ZMM avx512
360
+cglobal addAvg_16x%1, 6,9,6
361
+    vbroadcasti32x8        m4,              [pw_ %+ ADDAVG_ROUND]
362
+    vbroadcasti32x8        m5,              [pw_pixel_max]
363
+    vbroadcasti32x8        m3,              [pw_ %+ ADDAVG_FACTOR]
364
+    pxor        m2,        m2
365
+    add         r3,        r3
366
+    add         r4,        r4
367
+    add         r5,        r5
368
+    lea         r6,        [3 * r3]
369
+    lea         r7,        [3 * r4]
370
+    lea         r8,        [3 * r5]
371
+
372
+%rep %1/4 - 1
373
+    PROCESS_ADDAVG_16x4_HBD_AVX512
374
+    lea         r2,        [r2 + 4 * r5]
375
+    lea         r0,        [r0 + 4 * r3]
376
+    lea         r1,        [r1 + 4 * r4]
377
+%endrep
378
+    PROCESS_ADDAVG_16x4_HBD_AVX512
379
+    RET
380
+%endmacro
381
+
382
+%if ARCH_X86_64
383
+ADDAVG_W16_HBD_AVX512 8
384
+ADDAVG_W16_HBD_AVX512 12
385
+ADDAVG_W16_HBD_AVX512 16
386
+ADDAVG_W16_HBD_AVX512 24
387
+ADDAVG_W16_HBD_AVX512 32
388
+ADDAVG_W16_HBD_AVX512 64
389
+%endif
390
+
391
+%macro ADDAVG_W32_HBD_AVX512 1
392
+INIT_ZMM avx512
393
+cglobal addAvg_32x%1, 6,9,6
394
+    vbroadcasti32x8        m4,              [pw_ %+ ADDAVG_ROUND]
395
+    vbroadcasti32x8        m5,              [pw_pixel_max]
396
+    vbroadcasti32x8        m3,              [pw_ %+ ADDAVG_FACTOR]
397
+    pxor        m2,              m2
398
+    add         r3,              r3
399
+    add         r4,              r4
400
+    add         r5,              r5
401
+    lea         r6,              [3 * r3]
402
+    lea         r7,              [3 * r4]
403
+    lea         r8,              [3 * r5]
404
+
405
+%rep %1/4 - 1
406
+    PROCESS_ADDAVG_32x4_HBD_AVX512
407
+    lea         r2,              [r2 + 4 * r5]
408
+    lea         r0,              [r0 + 4 * r3]
409
+    lea         r1,              [r1 + 4 * r4]
410
+%endrep
411
+    PROCESS_ADDAVG_32x4_HBD_AVX512
412
+    RET
413
+%endmacro
414
+
415
+%if ARCH_X86_64
416
+ADDAVG_W32_HBD_AVX512 8
417
+ADDAVG_W32_HBD_AVX512 16
418
+ADDAVG_W32_HBD_AVX512 24
419
+ADDAVG_W32_HBD_AVX512 32
420
+ADDAVG_W32_HBD_AVX512 48
421
+ADDAVG_W32_HBD_AVX512 64
422
+%endif
423
+
424
+%macro ADDAVG_W64_HBD_AVX512 1
425
+INIT_ZMM avx512
426
+cglobal addAvg_64x%1, 6,9,6
427
+    vbroadcasti32x8        m4,              [pw_ %+ ADDAVG_ROUND]
428
+    vbroadcasti32x8        m5,              [pw_pixel_max]
429
+    vbroadcasti32x8        m3,              [pw_ %+ ADDAVG_FACTOR]
430
+    pxor        m2,              m2
431
+    add         r3,              r3
432
+    add         r4,              r4
433
+    add         r5,              r5
434
+    lea         r6,              [3 * r3]
435
+    lea         r7,              [3 * r4]
436
+    lea         r8,              [3 * r5]
437
+
438
+%rep %1/4 - 1
439
+    PROCESS_ADDAVG_64x4_HBD_AVX512
440
+    lea         r2,              [r2 + 4 * r5]
441
+    lea         r0,              [r0 + 4 * r3]
442
+    lea         r1,              [r1 + 4 * r4]
443
+%endrep
444
+    PROCESS_ADDAVG_64x4_HBD_AVX512
445
+    RET
446
+%endmacro
447
+
448
+%if ARCH_X86_64
449
+ADDAVG_W64_HBD_AVX512 16
450
+ADDAVG_W64_HBD_AVX512 32
451
+ADDAVG_W64_HBD_AVX512 48
452
+ADDAVG_W64_HBD_AVX512 64
453
+%endif
454
+
455
+%if ARCH_X86_64
456
+INIT_ZMM avx512
457
+cglobal addAvg_48x64, 6,9,6
458
+    vbroadcasti32x8        m4,              [pw_ %+ ADDAVG_ROUND]
459
+    vbroadcasti32x8        m5,              [pw_pixel_max]
460
+    vbroadcasti32x8        m3,              [pw_ %+ ADDAVG_FACTOR]
461
+    pxor        m2,              m2
462
+    add         r3,              r3
463
+    add         r4,              r4
464
+    add         r5,              r5
465
+    lea         r6,              [3 * r3]
466
+    lea         r7,              [3 * r4]
467
+    lea         r8,              [3 * r5]
468
+
469
+%rep 15
470
+    PROCESS_ADDAVG_48x4_HBD_AVX512
471
+    lea         r2,              [r2 + 4 * r5]
472
+    lea         r0,              [r0 + 4 * r3]
473
+    lea         r1,              [r1 + 4 * r4]
474
+%endrep
475
+    PROCESS_ADDAVG_48x4_HBD_AVX512
476
+    RET
477
+%endif
478
+
479
+%macro PROCESS_ADDAVG_ALIGNED_16x4_HBD_AVX512 0
480
+    movu              ym0,              [r0]
481
+    vinserti32x8       m0,              [r0 + r3], 1
482
+    movu              ym1,              [r1]
483
+    vinserti32x8       m1,              [r1 + r4], 1
484
+
485
+    paddw              m0,              m1
486
+    pmulhrsw           m0,              m3
487
+    paddw              m0,              m4
488
+    pmaxsw             m0,              m2
489
+    pminsw             m0,              m5
490
+
491
+    movu             [r2],              ym0
492
+    vextracti32x8    [r2 + r5],         m0, 1
493
+
494
+    movu              ym0,              [r0 + 2 * r3]
495
+    vinserti32x8       m0,              [r0 + r6], 1
496
+    movu              ym1,              [r1 + 2 * r4]
497
+    vinserti32x8       m1,              [r1 + r7], 1
498
+
499
+    paddw              m0,              m1
500
+    pmulhrsw           m0,              m3
501
+    paddw              m0,              m4
502
+    pmaxsw             m0,              m2
503
+    pminsw             m0,              m5
504
+
505
+    movu             [r2 + 2 * r5],    ym0
506
+    vextracti32x8    [r2 + r8],         m0, 1
507
+%endmacro
508
+
509
+%macro PROCESS_ADDAVG_ALIGNED_32x4_HBD_AVX512 0
510
+    movu        m0,              [r0]
511
+    movu        m1,              [r1]
512
+    paddw       m0,              m1
513
+    pmulhrsw    m0,              m3
514
+    paddw       m0,              m4
515
+    pmaxsw      m0,              m2
516
+    pminsw      m0,              m5
517
+    movu        [r2],            m0
518
+
519
+    movu        m0,              [r0 + r3]
520
+    movu        m1,              [r1 + r4]
521
+    paddw       m0,              m1
522
+    pmulhrsw    m0,              m3
523
+    paddw       m0,              m4
524
+    pmaxsw      m0,              m2
525
+    pminsw      m0,              m5
526
+    movu        [r2 + r5],       m0
527
+
528
+    movu        m0,              [r0 + 2 * r3]
529
+    movu        m1,              [r1 + 2 * r4]
530
+    paddw       m0,              m1
531
+    pmulhrsw    m0,              m3
532
+    paddw       m0,              m4
533
+    pmaxsw      m0,              m2
534
+    pminsw      m0,              m5
535
+    movu        [r2 + 2 * r5],   m0
536
+
537
+    movu        m0,              [r0 + r6]
538
+    movu        m1,              [r1 + r7]
539
+    paddw       m0,              m1
540
+    pmulhrsw    m0,              m3
541
+    paddw       m0,              m4
542
+    pmaxsw      m0,              m2
543
+    pminsw      m0,              m5
544
+    movu        [r2 + r8],       m0
545
+%endmacro
546
+
547
+%macro PROCESS_ADDAVG_ALIGNED_64x4_HBD_AVX512 0
548
+    movu        m0,              [r0]
549
+    movu        m1,              [r1]
550
+    paddw       m0,              m1
551
+    pmulhrsw    m0,              m3
552
+    paddw       m0,              m4
553
+    pmaxsw      m0,              m2
554
+    pminsw      m0,              m5
555
+    movu        [r2],            m0
556
+
557
+    movu        m0,              [r0 + mmsize]
558
+    movu        m1,              [r1 + mmsize]
559
+    paddw       m0,              m1
560
+    pmulhrsw    m0,              m3
561
+    paddw       m0,              m4
562
+    pmaxsw      m0,              m2
563
+    pminsw      m0,              m5
564
+    movu        [r2 + mmsize],   m0
565
+
566
+    movu        m0,              [r0 + r3]
567
+    movu        m1,              [r1 + r4]
568
+    paddw       m0,              m1
569
+    pmulhrsw    m0,              m3
570
+    paddw       m0,              m4
571
+    pmaxsw      m0,              m2
572
+    pminsw      m0,              m5
573
+    movu        [r2 + r5],       m0
574
+
575
+    movu        m0,              [r0 + r3 + mmsize]
576
+    movu        m1,              [r1 + r4 + mmsize]
577
+    paddw       m0,              m1
578
+    pmulhrsw    m0,              m3
579
+    paddw       m0,              m4
580
+    pmaxsw      m0,              m2
581
+    pminsw      m0,              m5
582
+    movu        [r2 + r5 + mmsize],       m0
583
+
584
+    movu        m0,              [r0 + 2 * r3]
585
+    movu        m1,              [r1 + 2 * r4]
586
+    paddw       m0,              m1
587
+    pmulhrsw    m0,              m3
588
+    paddw       m0,              m4
589
+    pmaxsw      m0,              m2
590
+    pminsw      m0,              m5
591
+    movu        [r2 + 2 * r5],   m0
592
+
593
+    movu        m0,              [r0 + 2 * r3 + mmsize]
594
+    movu        m1,              [r1 + 2 * r4 + mmsize]
595
+    paddw       m0,              m1
596
+    pmulhrsw    m0,              m3
597
+    paddw       m0,              m4
598
+    pmaxsw      m0,              m2
599
+    pminsw      m0,              m5
600
+    movu        [r2 + 2 * r5 + mmsize],   m0
601
+
602
+    movu        m0,              [r0 + r6]
603
+    movu        m1,              [r1 + r7]
604
+    paddw       m0,              m1
605
+    pmulhrsw    m0,              m3
606
+    paddw       m0,              m4
607
+    pmaxsw      m0,              m2
608
+    pminsw      m0,              m5
609
+    movu        [r2 + r8],       m0
610
+
611
+    movu        m0,              [r0 + r6 + mmsize]
612
+    movu        m1,              [r1 + r7 + mmsize]
613
+    paddw       m0,              m1
614
+    pmulhrsw    m0,              m3
615
+    paddw       m0,              m4
616
+    pmaxsw      m0,              m2
617
+    pminsw      m0,              m5
618
+    movu        [r2 + r8 + mmsize],       m0
619
+%endmacro
620
+
621
+%macro PROCESS_ADDAVG_ALIGNED_48x4_HBD_AVX512 0
622
+    movu        m0,              [r0]
623
+    movu        m1,              [r1]
624
+    paddw       m0,              m1
625
+    pmulhrsw    m0,              m3
626
+    paddw       m0,              m4
627
+    pmaxsw      m0,              m2
628
+    pminsw      m0,              m5
629
+    movu        [r2],            m0
630
+
631
+    movu        ym0,              [r0 + mmsize]
632
+    movu        ym1,              [r1 + mmsize]
633
+    paddw       ym0,              ym1
634
+    pmulhrsw    ym0,              ym3
635
+    paddw       ym0,              ym4
636
+    pmaxsw      ym0,              ym2
637
+    pminsw      ym0,              ym5
638
+    movu        [r2 + mmsize],    ym0
639
+
640
+    movu        m0,              [r0 + r3]
641
+    movu        m1,              [r1 + r4]
642
+    paddw       m0,              m1
643
+    pmulhrsw    m0,              m3
644
+    paddw       m0,              m4
645
+    pmaxsw      m0,              m2
646
+    pminsw      m0,              m5
647
+    movu        [r2 + r5],       m0
648
+
649
+    movu        ym0,              [r0 + r3 + mmsize]
650
+    movu        ym1,              [r1 + r4 + mmsize]
651
+    paddw       ym0,              ym1
652
+    pmulhrsw    ym0,              ym3
653
+    paddw       ym0,              ym4
654
+    pmaxsw      ym0,              ym2
655
+    pminsw      ym0,              ym5
656
+    movu        [r2 + r5 + mmsize],       ym0
657
+
658
+    movu        m0,              [r0 + 2 * r3]
659
+    movu        m1,              [r1 + 2 * r4]
660
+    paddw       m0,              m1
661
+    pmulhrsw    m0,              m3
662
+    paddw       m0,              m4
663
+    pmaxsw      m0,              m2
664
+    pminsw      m0,              m5
665
+    movu        [r2 + 2 * r5],   m0
666
+
667
+    movu        ym0,              [r0 + 2 * r3 + mmsize]
668
+    movu        ym1,              [r1 + 2 * r4 + mmsize]
669
+    paddw       ym0,              ym1
670
+    pmulhrsw    ym0,              ym3
671
+    paddw       ym0,              ym4
672
+    pmaxsw      ym0,              ym2
673
+    pminsw      ym0,              ym5
674
+    movu        [r2 + 2 * r5 + mmsize],   ym0
675
+
676
+    movu        m0,              [r0 + r6]
677
+    movu        m1,              [r1 + r7]
678
+    paddw       m0,              m1
679
+    pmulhrsw    m0,              m3
680
+    paddw       m0,              m4
681
+    pmaxsw      m0,              m2
682
+    pminsw      m0,              m5
683
+    movu        [r2 + r8],       m0
684
+
685
+    movu        ym0,              [r0 + r6 + mmsize]
686
+    movu        ym1,              [r1 + r7 + mmsize]
687
+    paddw       ym0,              ym1
688
+    pmulhrsw    ym0,              ym3
689
+    paddw       ym0,              ym4
690
+    pmaxsw      ym0,              ym2
691
+    pminsw      ym0,              ym5
692
+    movu        [r2 + r8 + mmsize],       ym0
693
+%endmacro
694
+;-----------------------------------------------------------------------------
695
+;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
696
+;-----------------------------------------------------------------------------
697
+%if ARCH_X86_64
698
+INIT_ZMM avx512
699
+cglobal addAvg_aligned_16x4, 6,9,6
700
+    vbroadcasti32x8        m4,              [pw_ %+ ADDAVG_ROUND]
701
+    vbroadcasti32x8        m5,              [pw_pixel_max]
702
+    vbroadcasti32x8        m3,              [pw_ %+ ADDAVG_FACTOR]
703
+    pxor        m2,        m2
704
+    add         r3,        r3
705
+    add         r4,        r4
706
+    add         r5,        r5
707
+    lea         r6,        [3 * r3]
708
+    lea         r7,        [3 * r4]
709
+    lea         r8,        [3 * r5]
710
+    PROCESS_ADDAVG_ALIGNED_16x4_HBD_AVX512
711
+    RET
712
+%endif
713
+
714
+%macro ADDAVG_ALIGNED_W16_HBD_AVX512 1
715
+INIT_ZMM avx512
716
+cglobal addAvg_aligned_16x%1, 6,9,6
717
+    vbroadcasti32x8        m4,              [pw_ %+ ADDAVG_ROUND]
718
+    vbroadcasti32x8        m5,              [pw_pixel_max]
719
+    vbroadcasti32x8        m3,              [pw_ %+ ADDAVG_FACTOR]
720
+    pxor        m2,        m2
721
+    add         r3,        r3
722
+    add         r4,        r4
723
+    add         r5,        r5
724
+    lea         r6,        [3 * r3]
725
+    lea         r7,        [3 * r4]
726
+    lea         r8,        [3 * r5]
727
+
728
+%rep %1/4 - 1
729
+    PROCESS_ADDAVG_ALIGNED_16x4_HBD_AVX512
730
+    lea         r2,        [r2 + 4 * r5]
731
+    lea         r0,        [r0 + 4 * r3]
732
+    lea         r1,        [r1 + 4 * r4]
733
+%endrep
734
+    PROCESS_ADDAVG_ALIGNED_16x4_HBD_AVX512
735
+    RET
736
+%endmacro
737
+
738
+%if ARCH_X86_64
739
+ADDAVG_ALIGNED_W16_HBD_AVX512 8
740
+ADDAVG_ALIGNED_W16_HBD_AVX512 12
741
+ADDAVG_ALIGNED_W16_HBD_AVX512 16
742
+ADDAVG_ALIGNED_W16_HBD_AVX512 24
743
+ADDAVG_ALIGNED_W16_HBD_AVX512 32
744
+ADDAVG_ALIGNED_W16_HBD_AVX512 64
745
+%endif
746
+
747
+%macro ADDAVG_ALIGNED_W32_HBD_AVX512 1
748
+INIT_ZMM avx512
749
+cglobal addAvg_aligned_32x%1, 6,9,6
750
+    vbroadcasti32x8        m4,              [pw_ %+ ADDAVG_ROUND]
751
+    vbroadcasti32x8        m5,              [pw_pixel_max]
752
+    vbroadcasti32x8        m3,              [pw_ %+ ADDAVG_FACTOR]
753
+    pxor        m2,              m2
754
+    add         r3,              r3
755
+    add         r4,              r4
756
+    add         r5,              r5
757
+    lea         r6,              [3 * r3]
758
+    lea         r7,              [3 * r4]
759
+    lea         r8,              [3 * r5]
760
+
761
+%rep %1/4 - 1
762
+    PROCESS_ADDAVG_ALIGNED_32x4_HBD_AVX512
763
+    lea         r2,              [r2 + 4 * r5]
764
+    lea         r0,              [r0 + 4 * r3]
765
+    lea         r1,              [r1 + 4 * r4]
766
+%endrep
767
+    PROCESS_ADDAVG_ALIGNED_32x4_HBD_AVX512
768
+    RET
769
+%endmacro
770
+
771
+%if ARCH_X86_64
772
+ADDAVG_ALIGNED_W32_HBD_AVX512 8
773
+ADDAVG_ALIGNED_W32_HBD_AVX512 16
774
+ADDAVG_ALIGNED_W32_HBD_AVX512 24
775
+ADDAVG_ALIGNED_W32_HBD_AVX512 32
776
+ADDAVG_ALIGNED_W32_HBD_AVX512 48
777
+ADDAVG_ALIGNED_W32_HBD_AVX512 64
778
+%endif
779
+
780
+%macro ADDAVG_ALIGNED_W64_HBD_AVX512 1
781
+INIT_ZMM avx512
782
+cglobal addAvg_aligned_64x%1, 6,9,6
783
+    vbroadcasti32x8        m4,              [pw_ %+ ADDAVG_ROUND]
784
+    vbroadcasti32x8        m5,              [pw_pixel_max]
785
+    vbroadcasti32x8        m3,              [pw_ %+ ADDAVG_FACTOR]
786
+    pxor        m2,              m2
787
+    add         r3,              r3
788
+    add         r4,              r4
789
+    add         r5,              r5
790
+    lea         r6,              [3 * r3]
791
+    lea         r7,              [3 * r4]
792
+    lea         r8,              [3 * r5]
793
+
794
+%rep %1/4 - 1
795
+    PROCESS_ADDAVG_ALIGNED_64x4_HBD_AVX512
796
+    lea         r2,              [r2 + 4 * r5]
797
+    lea         r0,              [r0 + 4 * r3]
798
+    lea         r1,              [r1 + 4 * r4]
799
+%endrep
800
+    PROCESS_ADDAVG_ALIGNED_64x4_HBD_AVX512
801
+    RET
802
+%endmacro
803
+
804
+%if ARCH_X86_64
805
+ADDAVG_ALIGNED_W64_HBD_AVX512 16
806
+ADDAVG_ALIGNED_W64_HBD_AVX512 32
807
+ADDAVG_ALIGNED_W64_HBD_AVX512 48
808
+ADDAVG_ALIGNED_W64_HBD_AVX512 64
809
+%endif
810
+
811
+%if ARCH_X86_64
812
+INIT_ZMM avx512
813
+cglobal addAvg_aligned_48x64, 6,9,6
814
+    vbroadcasti32x8        m4,              [pw_ %+ ADDAVG_ROUND]
815
+    vbroadcasti32x8        m5,              [pw_pixel_max]
816
+    vbroadcasti32x8        m3,              [pw_ %+ ADDAVG_FACTOR]
817
+    pxor        m2,              m2
818
+    add         r3,              r3
819
+    add         r4,              r4
820
+    add         r5,              r5
821
+    lea         r6,              [3 * r3]
822
+    lea         r7,              [3 * r4]
823
+    lea         r8,              [3 * r5]
824
+
825
+%rep 15
826
+    PROCESS_ADDAVG_ALIGNED_48x4_HBD_AVX512
827
+    lea         r2,              [r2 + 4 * r5]
828
+    lea         r0,              [r0 + 4 * r3]
829
+    lea         r1,              [r1 + 4 * r4]
830
+%endrep
831
+    PROCESS_ADDAVG_ALIGNED_48x4_HBD_AVX512
832
+    RET
833
+%endif
834
+;-----------------------------------------------------------------------------
835
+;addAvg avx512 high bit depth code end
836
+;-----------------------------------------------------------------------------
837
 ;-----------------------------------------------------------------------------
838
 %else ; !HIGH_BIT_DEPTH
839
 ;-----------------------------------------------------------------------------
840
@@ -2968,7 +3698,221 @@
841
 ;-----------------------------------------------------------------------------
842
 ; addAvg avx2 code end
843
 ;-----------------------------------------------------------------------------
844
+; addAvg avx512 code start
845
+;-----------------------------------------------------------------------------
846
+%macro PROCESS_ADDAVG_64x2_AVX512 0
847
+    movu            m0, [r0]
848
+    movu            m1, [r1]
849
+    movu            m2, [r0 + mmsize]
850
+    movu            m3, [r1 + mmsize]
851
+
852
+    paddw           m0, m1
853
+    pmulhrsw        m0, m4
854
+    paddw           m0, m5
855
+    paddw           m2, m3
856
+    pmulhrsw        m2, m4
857
+    paddw           m2, m5
858
 
859
+    packuswb        m0, m2
860
+    vpermq          m0, m6, m0
861
+    movu            [r2], m0
862
+
863
+    movu            m0, [r0 + r3]
864
+    movu            m1, [r1 + r4]
865
+    movu            m2, [r0 + r3 + mmsize]
866
+    movu            m3, [r1 + r4 + mmsize]
867
+
868
+    paddw           m0, m1
869
+    pmulhrsw        m0, m4
870
+    paddw           m0, m5
871
+    paddw           m2, m3
872
+    pmulhrsw        m2, m4
873
+    paddw           m2, m5
874
+
875
+    packuswb        m0, m2
876
+    vpermq          m0, m6, m0
877
+    movu            [r2 + r5], m0
878
+%endmacro
879
+
880
+%macro PROCESS_ADDAVG_32x2_AVX512 0
881
+    movu            m0,         [r0]
882
+    movu            m1,         [r1]
883
+    movu            m2,         [r0 + r3]
884
+    movu            m3,         [r1 + r4]
885
+
886
+    paddw           m0,         m1
887
+    pmulhrsw        m0,         m4
888
+    paddw           m0,         m5
889
+    paddw           m2,         m3
890
+    pmulhrsw        m2,         m4
891
+    paddw           m2,         m5
892
+
893
+    packuswb        m0,         m2
894
+    vpermq          m0,         m6,     m0
895
+    movu            [r2],       ym0
896
+    vextracti32x8   [r2 + r5],  m0, 1
897
+%endmacro
898
+;--------------------------------------------------------------------------------------------------------------------
899
+;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
900
+;--------------------------------------------------------------------------------------------------------------------
901
+%macro ADDAVG_W64_AVX512 1
902
+INIT_ZMM avx512
903
+cglobal addAvg_64x%1, 6,6,7
904
+    vbroadcasti32x8 m4, [pw_256]
905
+    vbroadcasti32x8 m5, [pw_128]
906
+    mova            m6, [shuf_avx512]
907
+
908
+    add             r3, r3
909
+    add             r4, r4
910
+
911
+%rep %1/2 - 1
912
+    PROCESS_ADDAVG_64x2_AVX512
913
+    lea             r2, [r2 + 2 * r5]
914
+    lea             r0, [r0 + 2 * r3]
915
+    lea             r1, [r1 + 2 * r4]
916
+%endrep
917
+    PROCESS_ADDAVG_64x2_AVX512
918
+    RET
919
+%endmacro
920
+
921
+ADDAVG_W64_AVX512 16
922
+ADDAVG_W64_AVX512 32
923
+ADDAVG_W64_AVX512 48
924
+ADDAVG_W64_AVX512 64
925
+
926
+%macro ADDAVG_W32_AVX512 1
927
+INIT_ZMM avx512
928
+cglobal addAvg_32x%1, 6,6,7
929
+    vbroadcasti32x8 m4, [pw_256]
930
+    vbroadcasti32x8 m5, [pw_128]
931
+    mova            m6, [shuf_avx512]
932
+    add             r3, r3
933
+    add             r4, r4
934
+
935
+%rep %1/2 - 1
936
+    PROCESS_ADDAVG_32x2_AVX512
937
+    lea             r2, [r2 + 2 * r5]
938
+    lea             r0, [r0 + 2 * r3]
939
+    lea             r1, [r1 + 2 * r4]
940
+%endrep
941
+    PROCESS_ADDAVG_32x2_AVX512
942
+    RET
943
+%endmacro
944
+
945
+ADDAVG_W32_AVX512 8
946
+ADDAVG_W32_AVX512 16
947
+ADDAVG_W32_AVX512 24
948
+ADDAVG_W32_AVX512 32
949
+ADDAVG_W32_AVX512 48
950
+ADDAVG_W32_AVX512 64
951
+
952
+%macro PROCESS_ADDAVG_ALIGNED_64x2_AVX512 0
953
+    mova            m0, [r0]
954
+    mova            m1, [r1]
955
+    mova            m2, [r0 + mmsize]
956
+    mova            m3, [r1 + mmsize]
957
+
958
+    paddw           m0, m1
959
+    pmulhrsw        m0, m4
960
+    paddw           m0, m5
961
+    paddw           m2, m3
962
+    pmulhrsw        m2, m4
963
+    paddw           m2, m5
964
+
965
+    packuswb        m0, m2
966
+    vpermq          m0, m6, m0
967
+    mova            [r2], m0
968
+
969
+    mova            m0, [r0 + r3]
970
+    mova            m1, [r1 + r4]
971
+    mova            m2, [r0 + r3 + mmsize]
972
+    mova            m3, [r1 + r4 + mmsize]
973
+
974
+    paddw           m0, m1
975
+    pmulhrsw        m0, m4
976
+    paddw           m0, m5
977
+    paddw           m2, m3
978
+    pmulhrsw        m2, m4
979
+    paddw           m2, m5
980
+
981
+    packuswb        m0, m2
982
+    vpermq          m0, m6, m0
983
+    mova            [r2 + r5], m0
984
+%endmacro
985
+
986
+%macro PROCESS_ADDAVG_ALIGNED_32x2_AVX512 0
987
+    mova            m0,         [r0]
988
+    mova            m1,         [r1]
989
+    mova            m2,         [r0 + r3]
990
+    mova            m3,         [r1 + r4]
991
+
992
+    paddw           m0,         m1
993
+    pmulhrsw        m0,         m4
994
+    paddw           m0,         m5
995
+    paddw           m2,         m3
996
+    pmulhrsw        m2,         m4
997
+    paddw           m2,         m5
998
+
999
+    packuswb        m0,         m2
1000
+    vpermq          m0,         m6,     m0
1001
+    mova            [r2],       ym0
1002
+    vextracti32x8   [r2 + r5],  m0, 1
1003
+%endmacro
1004
+;--------------------------------------------------------------------------------------------------------------------
1005
+;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
1006
+;--------------------------------------------------------------------------------------------------------------------
1007
+%macro ADDAVG_ALIGNED_W64_AVX512 1
1008
+INIT_ZMM avx512
1009
+cglobal addAvg_aligned_64x%1, 6,6,7
1010
+    vbroadcasti32x8 m4, [pw_256]
1011
+    vbroadcasti32x8 m5, [pw_128]
1012
+    mova            m6, [shuf_avx512]
1013
+
1014
+    add             r3, r3
1015
+    add             r4, r4
1016
+
1017
+%rep %1/2 - 1
1018
+    PROCESS_ADDAVG_ALIGNED_64x2_AVX512
1019
+    lea             r2, [r2 + 2 * r5]
1020
+    lea             r0, [r0 + 2 * r3]
1021
+    lea             r1, [r1 + 2 * r4]
1022
+%endrep
1023
+    PROCESS_ADDAVG_ALIGNED_64x2_AVX512
1024
+    RET
1025
+%endmacro
1026
+
1027
+ADDAVG_ALIGNED_W64_AVX512 16
1028
+ADDAVG_ALIGNED_W64_AVX512 32
1029
+ADDAVG_ALIGNED_W64_AVX512 48
1030
+ADDAVG_ALIGNED_W64_AVX512 64
1031
+
1032
+%macro ADDAVG_ALIGNED_W32_AVX512 1
1033
+INIT_ZMM avx512
1034
+cglobal addAvg_aligned_32x%1, 6,6,7
1035
+    vbroadcasti32x8 m4, [pw_256]
1036
+    vbroadcasti32x8 m5, [pw_128]
1037
+    mova            m6, [shuf_avx512]
1038
+    add             r3, r3
1039
+    add             r4, r4
1040
+
1041
+%rep %1/2 - 1
1042
+    PROCESS_ADDAVG_ALIGNED_32x2_AVX512
1043
+    lea             r2, [r2 + 2 * r5]
1044
+    lea             r0, [r0 + 2 * r3]
1045
+    lea             r1, [r1 + 2 * r4]
1046
+%endrep
1047
+    PROCESS_ADDAVG_ALIGNED_32x2_AVX512
1048
+    RET
1049
+%endmacro
1050
+
1051
+ADDAVG_ALIGNED_W32_AVX512 8
1052
+ADDAVG_ALIGNED_W32_AVX512 16
1053
+ADDAVG_ALIGNED_W32_AVX512 24
1054
+ADDAVG_ALIGNED_W32_AVX512 32
1055
+ADDAVG_ALIGNED_W32_AVX512 48
1056
+ADDAVG_ALIGNED_W32_AVX512 64
1057
+;-----------------------------------------------------------------------------
1058
+; addAvg avx512 code end
1059
 ;-----------------------------------------------------------------------------
1060
 %macro ADDAVG_W24_H2 2
1061
 INIT_XMM sse4
1062
@@ -3367,11 +4311,11 @@
1063
     %endmacro
1064
 %endif
1065
 
1066
-%macro AVG_END 0
1067
-    lea  t4, [t4+t5*2*SIZEOF_PIXEL]
1068
+%macro AVG_END 0-1 2;rows
1069
     lea  t2, [t2+t3*2*SIZEOF_PIXEL]
1070
+    lea  t4, [t4+t5*2*SIZEOF_PIXEL]
1071
     lea  t0, [t0+t1*2*SIZEOF_PIXEL]
1072
-    sub eax, 2
1073
+    sub eax, %1
1074
     jg .height_loop
1075
  %ifidn movu,movq ; detect MMX
1076
     EMMS
1077
@@ -3434,17 +4378,24 @@
1078
 %endmacro
1079
 
1080
 %macro BIWEIGHT_START_SSSE3 0
1081
-    movzx  t6d, byte r6m ; FIXME x86_64
1082
-    mov    t7d, 64
1083
-    sub    t7d, t6d
1084
-    shl    t7d, 8
1085
-    add    t6d, t7d
1086
-    mova    m4, [pw_512]
1087
-    movd   xm3, t6d
1088
+    movzx         t6d, byte r6m ; FIXME x86_64
1089
+%if mmsize > 16
1090
+    vbroadcasti128 m4, [pw_512]
1091
+%else
1092
+    mova           m4, [pw_512]
1093
+%endif
1094
+    lea           t7d, [t6+(64<<8)]
1095
+    shl           t6d, 8
1096
+    sub           t7d, t6d
1097
+%if cpuflag(avx512)
1098
+    vpbroadcastw   m3, t7d
1099
+%else
1100
+    movd          xm3, t7d
1101
 %if cpuflag(avx2)
1102
-    vpbroadcastw m3, xm3
1103
+    vpbroadcastw   m3, xm3
1104
 %else
1105
-    SPLATW  m3, m3   ; weight_dst,src
1106
+    SPLATW         m3, m3   ; weight_dst,src
1107
+%endif
1108
 %endif
1109
 %endmacro
1110
 
1111
@@ -3567,6 +4518,38 @@
1112
 AVG_WEIGHT 24, 7
1113
 AVG_WEIGHT 48, 7
1114
 
1115
+INIT_YMM avx512
1116
+cglobal pixel_avg_weight_w8
1117
+    BIWEIGHT_START
1118
+    kxnorb         k1, k1, k1
1119
+    kaddb          k1, k1, k1
1120
+    AVG_START 5
1121
+.height_loop:
1122
+    movq          xm0, [t2]
1123
+    movq          xm2, [t4]
1124
+    movq          xm1, [t2+t3]
1125
+    movq          xm5, [t4+t5]
1126
+    lea            t2, [t2+t3*2]
1127
+    lea            t4, [t4+t5*2]
1128
+    vpbroadcastq   m0 {k1}, [t2]
1129
+    vpbroadcastq   m2 {k1}, [t4]
1130
+    vpbroadcastq   m1 {k1}, [t2+t3]
1131
+    vpbroadcastq   m5 {k1}, [t4+t5]
1132
+    punpcklbw      m0, m2
1133
+    punpcklbw      m1, m5
1134
+    pmaddubsw      m0, m3
1135
+    pmaddubsw      m1, m3
1136
+    pmulhrsw       m0, m4
1137
+    pmulhrsw       m1, m4
1138
+    packuswb       m0, m1
1139
+    vextracti128 xmm1, m0, 1
1140
+    movq         [t0], xm0
1141
+    movhps    [t0+t1], xm0
1142
+    lea            t0, [t0+t1*2]
1143
+    movq         [t0], xmm1
1144
+    movhps    [t0+t1], xmm1
1145
+    AVG_END 4
1146
+
1147
 INIT_YMM avx2
1148
 cglobal pixel_avg_weight_w16
1149
     BIWEIGHT_START
1150
@@ -3586,6 +4569,35 @@
1151
     vextracti128 [t0+t1], m0, 1
1152
     AVG_END
1153
 
1154
+INIT_ZMM avx512
1155
+ cglobal pixel_avg_weight_w16
1156
+    BIWEIGHT_START
1157
+    AVG_START 5
1158
+.height_loop:
1159
+    movu        xm0, [t2]
1160
+    movu        xm1, [t4]
1161
+    vinserti128 ym0, [t2+t3], 1
1162
+    vinserti128 ym1, [t4+t5], 1
1163
+    lea          t2, [t2+t3*2]
1164
+    lea          t4, [t4+t5*2]
1165
+    vinserti32x4 m0, [t2], 2
1166
+    vinserti32x4 m1, [t4], 2
1167
+    vinserti32x4 m0, [t2+t3], 3
1168
+    vinserti32x4 m1, [t4+t5], 3
1169
+    SBUTTERFLY   bw, 0, 1, 2
1170
+    pmaddubsw    m0, m3
1171
+    pmaddubsw    m1, m3
1172
+    pmulhrsw     m0, m4
1173
+    pmulhrsw     m1, m4
1174
+    packuswb     m0, m1
1175
+    mova       [t0], xm0
1176
+    vextracti128 [t0+t1], ym0, 1
1177
+    lea          t0, [t0+t1*2]
1178
+    vextracti32x4 [t0], m0, 2
1179
+    vextracti32x4 [t0+t1], m0, 3
1180
+    AVG_END 4
1181
+
1182
+INIT_YMM avx2
1183
 cglobal pixel_avg_weight_w32
1184
     BIWEIGHT_START
1185
     AVG_START 5
1186
@@ -3601,6 +4613,7 @@
1187
     mova    [t0], m0
1188
     AVG_END
1189
 
1190
+INIT_YMM avx2
1191
 cglobal pixel_avg_weight_w64
1192
     BIWEIGHT_START
1193
     AVG_START 5
1194
@@ -4345,6 +5358,18 @@
1195
 AVGH 16, 8
1196
 AVGH 16, 4
1197
 
1198
+INIT_XMM avx512
1199
+AVGH 16, 64
1200
+AVGH 16, 32
1201
+AVGH 16, 16
1202
+AVGH 16, 12
1203
+AVGH 16,  8
1204
+AVGH 16,  4
1205
+AVGH  8, 32
1206
+AVGH  8, 16
1207
+AVGH  8,  8
1208
+AVGH  8,  4
1209
+
1210
 %endif ;HIGH_BIT_DEPTH
1211
 
1212
 ;-------------------------------------------------------------------------------------------------------------------------------
1213
@@ -4482,6 +5507,58 @@
1214
     RET
1215
 %endif
1216
 
1217
+;-----------------------------------------------------------------------------
1218
+;pixel_avg_pp avx512 code start
1219
+;-----------------------------------------------------------------------------
1220
+%macro PROCESS_PIXELAVG_64x4_AVX512 0
1221
+    movu        m0,             [r2]
1222
+    movu        m2,             [r2 + r3]
1223
+    movu        m1,             [r4]
1224
+    movu        m3,             [r4 + r5]
1225
+    pavgb       m0,             m1
1226
+    pavgb       m2,             m3
1227
+    movu        [r0],           m0
1228
+    movu        [r0 + r1],      m2
1229
+
1230
+    movu        m0,             [r2 + 2 * r3]
1231
+    movu        m2,             [r2 + r7]
1232
+    movu        m1,             [r4 + 2 * r5]
1233
+    movu        m3,             [r4 + r8]
1234
+    pavgb       m0,             m1
1235
+    pavgb       m2,             m3
1236
+    movu        [r0 + 2 * r1],  m0
1237
+    movu        [r0 + r6],      m2
1238
+%endmacro
1239
+
1240
+;-------------------------------------------------------------------------------------------------------------------------------
1241
+;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
1242
+;-------------------------------------------------------------------------------------------------------------------------------
1243
+%if ARCH_X86_64 && BIT_DEPTH == 8
1244
+%macro PIXEL_AVG_64xN_AVX512 1
1245
+INIT_ZMM avx512
1246
+cglobal pixel_avg_64x%1, 6, 9, 4
1247
+    lea         r6, [3 * r1]
1248
+    lea         r7, [3 * r3]
1249
+    lea         r8, [3 * r5]
1250
+
1251
+%rep %1/4 - 1
1252
+    PROCESS_PIXELAVG_64x4_AVX512
1253
+    lea         r2, [r2 + r3 * 4]
1254
+    lea         r4, [r4 + r5 * 4]
1255
+    lea         r0, [r0 + r1 * 4]
1256
+%endrep
1257
+    PROCESS_PIXELAVG_64x4_AVX512
1258
+    RET
1259
+%endmacro
1260
+
1261
+PIXEL_AVG_64xN_AVX512 16
1262
+PIXEL_AVG_64xN_AVX512 32
1263
+PIXEL_AVG_64xN_AVX512 48
1264
+PIXEL_AVG_64xN_AVX512 64
1265
+%endif
1266
+;-----------------------------------------------------------------------------
1267
+;pixel_avg_pp avx512 code end
1268
+;-----------------------------------------------------------------------------
1269
 ;=============================================================================
1270
 ; pixel avg2
1271
 ;=============================================================================
1272
@@ -5267,6 +6344,552 @@
1273
     RET
1274
 %endif
1275
 
1276
+;-----------------------------------------------------------------------------
1277
+;pixel_avg_pp avx512 high bit depth code start
1278
+;-----------------------------------------------------------------------------
1279
+%macro PROCESS_PIXELAVG_32x8_HBD_AVX512 0
1280
+    movu    m0, [r2]
1281
+    movu    m1, [r4]
1282
+    movu    m2, [r2 + r3]
1283
+    movu    m3, [r4 + r5]
1284
+    pavgw   m0, m1
1285
+    pavgw   m2, m3
1286
+    movu    [r0], m0
1287
+    movu    [r0 + r1], m2
1288
+
1289
+    movu    m0, [r2 + r3 * 2]
1290
+    movu    m1, [r4 + r5 * 2]
1291
+    movu    m2, [r2 + r6]
1292
+    movu    m3, [r4 + r7]
1293
+    pavgw   m0, m1
1294
+    pavgw   m2, m3
1295
+    movu    [r0 + r1 * 2], m0
1296
+    movu    [r0 + r8], m2
1297
+
1298
+    lea     r0, [r0 + 4 * r1]
1299
+    lea     r2, [r2 + 4 * r3]
1300
+    lea     r4, [r4 + 4 * r5]
1301
+
1302
+    movu    m0, [r2]
1303
+    movu    m1, [r4]
1304
+    movu    m2, [r2 + r3]
1305
+    movu    m3, [r4 + r5]
1306
+    pavgw   m0, m1
1307
+    pavgw   m2, m3
1308
+    movu    [r0], m0
1309
+    movu    [r0 + r1], m2
1310
+
1311
+    movu    m0, [r2 + r3 * 2]
1312
+    movu    m1, [r4 + r5 * 2]
1313
+    movu    m2, [r2 + r6]
1314
+    movu    m3, [r4 + r7]
1315
+    pavgw   m0, m1
1316
+    pavgw   m2, m3
1317
+    movu    [r0 + r1 * 2], m0
1318
+    movu    [r0 + r8], m2
1319
+%endmacro
1320
+%macro PROCESS_PIXELAVG_ALIGNED_32x8_HBD_AVX512 0
1321
+    mova    m0, [r2]
1322
+    mova    m1, [r4]
1323
+    mova    m2, [r2 + r3]
1324
+    mova    m3, [r4 + r5]
1325
+    pavgw   m0, m1
1326
+    pavgw   m2, m3
1327
+    mova    [r0], m0
1328
+    mova    [r0 + r1], m2
1329
+
1330
+    mova    m0, [r2 + r3 * 2]
1331
+    mova    m1, [r4 + r5 * 2]
1332
+    mova    m2, [r2 + r6]
1333
+    mova    m3, [r4 + r7]
1334
+    pavgw   m0, m1
1335
+    pavgw   m2, m3
1336
+    mova    [r0 + r1 * 2], m0
1337
+    mova    [r0 + r8], m2
1338
+
1339
+    lea     r0, [r0 + 4 * r1]
1340
+    lea     r2, [r2 + 4 * r3]
1341
+    lea     r4, [r4 + 4 * r5]
1342
+
1343
+    mova    m0, [r2]
1344
+    mova    m1, [r4]
1345
+    mova    m2, [r2 + r3]
1346
+    mova    m3, [r4 + r5]
1347
+    pavgw   m0, m1
1348
+    pavgw   m2, m3
1349
+    mova    [r0], m0
1350
+    mova    [r0 + r1], m2
1351
+
1352
+    mova    m0, [r2 + r3 * 2]
1353
+    mova    m1, [r4 + r5 * 2]
1354
+    mova    m2, [r2 + r6]
1355
+    mova    m3, [r4 + r7]
1356
+    pavgw   m0, m1
1357
+    pavgw   m2, m3
1358
+    mova    [r0 + r1 * 2], m0
1359
+    mova    [r0 + r8], m2
1360
+%endmacro
1361
+
1362
+%macro PROCESS_PIXELAVG_64x8_HBD_AVX512 0
1363
+    movu    m0, [r2]
1364
+    movu    m1, [r4]
1365
+    movu    m2, [r2 + r3]
1366
+    movu    m3, [r4 + r5]
1367
+    pavgw   m0, m1
1368
+    pavgw   m2, m3
1369
+    movu    [r0], m0
1370
+    movu    [r0 + r1], m2
1371
+
1372
+    movu    m0, [r2 + mmsize]
1373
+    movu    m1, [r4 + mmsize]
1374
+    movu    m2, [r2 + r3 + mmsize]
1375
+    movu    m3, [r4 + r5 + mmsize]
1376
+    pavgw   m0, m1
1377
+    pavgw   m2, m3
1378
+    movu    [r0 + mmsize], m0
1379
+    movu    [r0 + r1 + mmsize], m2
1380
+
1381
+    movu    m0, [r2 + r3 * 2]
1382
+    movu    m1, [r4 + r5 * 2]
1383
+    movu    m2, [r2 + r6]
1384
+    movu    m3, [r4 + r7]
1385
+    pavgw   m0, m1
1386
+    pavgw   m2, m3
1387
+    movu    [r0 + r1 * 2], m0
1388
+    movu    [r0 + r8], m2
1389
+
1390
+    movu    m0, [r2 + r3 * 2 + mmsize]
1391
+    movu    m1, [r4 + r5 * 2 + mmsize]
1392
+    movu    m2, [r2 + r6 + mmsize]
1393
+    movu    m3, [r4 + r7 + mmsize]
1394
+    pavgw   m0, m1
1395
+    pavgw   m2, m3
1396
+    movu    [r0 + r1 * 2 + mmsize], m0
1397
+    movu    [r0 + r8 + mmsize], m2
1398
+
1399
+    lea     r0, [r0 + 4 * r1]
1400
+    lea     r2, [r2 + 4 * r3]
1401
+    lea     r4, [r4 + 4 * r5]
1402
+
1403
+    movu    m0, [r2]
1404
+    movu    m1, [r4]
1405
+    movu    m2, [r2 + r3]
1406
+    movu    m3, [r4 + r5]
1407
+    pavgw   m0, m1
1408
+    pavgw   m2, m3
1409
+    movu    [r0], m0
1410
+    movu    [r0 + r1], m2
1411
+
1412
+    movu    m0, [r2 + mmsize]
1413
+    movu    m1, [r4 + mmsize]
1414
+    movu    m2, [r2 + r3 + mmsize]
1415
+    movu    m3, [r4 + r5 + mmsize]
1416
+    pavgw   m0, m1
1417
+    pavgw   m2, m3
1418
+    movu    [r0 + mmsize], m0
1419
+    movu    [r0 + r1 + mmsize], m2
1420
+
1421
+    movu    m0, [r2 + r3 * 2]
1422
+    movu    m1, [r4 + r5 * 2]
1423
+    movu    m2, [r2 + r6]
1424
+    movu    m3, [r4 + r7]
1425
+    pavgw   m0, m1
1426
+    pavgw   m2, m3
1427
+    movu    [r0 + r1 * 2], m0
1428
+    movu    [r0 + r8], m2
1429
+
1430
+    movu    m0, [r2 + r3 * 2 + mmsize]
1431
+    movu    m1, [r4 + r5 * 2 + mmsize]
1432
+    movu    m2, [r2 + r6 + mmsize]
1433
+    movu    m3, [r4 + r7 + mmsize]
1434
+    pavgw   m0, m1
1435
+    pavgw   m2, m3
1436
+    movu    [r0 + r1 * 2 + mmsize], m0
1437
+    movu    [r0 + r8 + mmsize], m2
1438
+%endmacro
1439
+%macro PROCESS_PIXELAVG_ALIGNED_64x8_HBD_AVX512 0
1440
+    mova    m0, [r2]
1441
+    mova    m1, [r4]
1442
+    mova    m2, [r2 + r3]
1443
+    mova    m3, [r4 + r5]
1444
+    pavgw   m0, m1
1445
+    pavgw   m2, m3
1446
+    mova    [r0], m0
1447
+    mova    [r0 + r1], m2
1448
+
1449
+    mova    m0, [r2 + mmsize]
1450
+    mova    m1, [r4 + mmsize]
1451
+    mova    m2, [r2 + r3 + mmsize]
1452
+    mova    m3, [r4 + r5 + mmsize]
1453
+    pavgw   m0, m1
1454
+    pavgw   m2, m3
1455
+    mova    [r0 + mmsize], m0
1456
+    mova    [r0 + r1 + mmsize], m2
1457
+
1458
+    mova    m0, [r2 + r3 * 2]
1459
+    mova    m1, [r4 + r5 * 2]
1460
+    mova    m2, [r2 + r6]
1461
+    mova    m3, [r4 + r7]
1462
+    pavgw   m0, m1
1463
+    pavgw   m2, m3
1464
+    mova    [r0 + r1 * 2], m0
1465
+    mova    [r0 + r8], m2
1466
+
1467
+    mova    m0, [r2 + r3 * 2 + mmsize]
1468
+    mova    m1, [r4 + r5 * 2 + mmsize]
1469
+    mova    m2, [r2 + r6 + mmsize]
1470
+    mova    m3, [r4 + r7 + mmsize]
1471
+    pavgw   m0, m1
1472
+    pavgw   m2, m3
1473
+    mova    [r0 + r1 * 2 + mmsize], m0
1474
+    mova    [r0 + r8 + mmsize], m2
1475
+
1476
+    lea     r0, [r0 + 4 * r1]
1477
+    lea     r2, [r2 + 4 * r3]
1478
+    lea     r4, [r4 + 4 * r5]
1479
+
1480
+    mova    m0, [r2]
1481
+    mova    m1, [r4]
1482
+    mova    m2, [r2 + r3]
1483
+    mova    m3, [r4 + r5]
1484
+    pavgw   m0, m1
1485
+    pavgw   m2, m3
1486
+    mova    [r0], m0
1487
+    mova    [r0 + r1], m2
1488
+
1489
+    mova    m0, [r2 + mmsize]
1490
+    mova    m1, [r4 + mmsize]
1491
+    mova    m2, [r2 + r3 + mmsize]
1492
+    mova    m3, [r4 + r5 + mmsize]
1493
+    pavgw   m0, m1
1494
+    pavgw   m2, m3
1495
+    mova    [r0 + mmsize], m0
1496
+    mova    [r0 + r1 + mmsize], m2
1497
+
1498
+    mova    m0, [r2 + r3 * 2]
1499
+    mova    m1, [r4 + r5 * 2]
1500
+    mova    m2, [r2 + r6]
1501
+    mova    m3, [r4 + r7]
1502
+    pavgw   m0, m1
1503
+    pavgw   m2, m3
1504
+    mova    [r0 + r1 * 2], m0
1505
+    mova    [r0 + r8], m2
1506
+
1507
+    mova    m0, [r2 + r3 * 2 + mmsize]
1508
+    mova    m1, [r4 + r5 * 2 + mmsize]
1509
+    mova    m2, [r2 + r6 + mmsize]
1510
+    mova    m3, [r4 + r7 + mmsize]
1511
+    pavgw   m0, m1
1512
+    pavgw   m2, m3
1513
+    mova    [r0 + r1 * 2 + mmsize], m0
1514
+    mova    [r0 + r8 + mmsize], m2
1515
+%endmacro
1516
+
1517
+%macro PROCESS_PIXELAVG_48x8_HBD_AVX512 0
1518
+    movu    m0, [r2]
1519
+    movu    m1, [r4]
1520
+    movu    m2, [r2 + r3]
1521
+    movu    m3, [r4 + r5]
1522
+    pavgw   m0, m1
1523
+    pavgw   m2, m3
1524
+    movu    [r0], m0
1525
+    movu    [r0 + r1], m2
1526
+
1527
+    movu    ym0, [r2 + mmsize]
1528
+    movu    ym1, [r4 + mmsize]
1529
+    movu    ym2, [r2 + r3 + mmsize]
1530
+    movu    ym3, [r4 + r5 + mmsize]
1531
+    pavgw   ym0, ym1
1532
+    pavgw   ym2, ym3
1533
+    movu    [r0 + mmsize], ym0
1534
+    movu    [r0 + r1 + mmsize], ym2
1535
+
1536
+    movu    m0, [r2 + r3 * 2]
1537
+    movu    m1, [r4 + r5 * 2]
1538
+    movu    m2, [r2 + r6]
1539
+    movu    m3, [r4 + r7]
1540
+    pavgw   m0, m1
1541
+    pavgw   m2, m3
1542
+    movu    [r0 + r1 * 2], m0
1543
+    movu    [r0 + r8], m2
1544
+
1545
+    movu    ym0, [r2 + r3 * 2 + mmsize]
1546
+    movu    ym1, [r4 + r5 * 2 + mmsize]
1547
+    movu    ym2, [r2 + r6 + mmsize]
1548
+    movu    ym3, [r4 + r7 + mmsize]
1549
+    pavgw   ym0, ym1
1550
+    pavgw   ym2, ym3
1551
+    movu    [r0 + r1 * 2 + mmsize], ym0
1552
+    movu    [r0 + r8 + mmsize], ym2
1553
+
1554
+    lea     r0, [r0 + 4 * r1]
1555
+    lea     r2, [r2 + 4 * r3]
1556
+    lea     r4, [r4 + 4 * r5]
1557
+
1558
+    movu    m0, [r2]
1559
+    movu    m1, [r4]
1560
+    movu    m2, [r2 + r3]
1561
+    movu    m3, [r4 + r5]
1562
+    pavgw   m0, m1
1563
+    pavgw   m2, m3
1564
+    movu    [r0], m0
1565
+    movu    [r0 + r1], m2
1566
+
1567
+    movu    ym0, [r2 + mmsize]
1568
+    movu    ym1, [r4 + mmsize]
1569
+    movu    ym2, [r2 + r3 + mmsize]
1570
+    movu    ym3, [r4 + r5 + mmsize]
1571
+    pavgw   ym0, ym1
1572
+    pavgw   ym2, ym3
1573
+    movu    [r0 + mmsize], ym0
1574
+    movu    [r0 + r1 + mmsize], ym2
1575
+
1576
+    movu    m0, [r2 + r3 * 2]
1577
+    movu    m1, [r4 + r5 * 2]
1578
+    movu    m2, [r2 + r6]
1579
+    movu    m3, [r4 + r7]
1580
+    pavgw   m0, m1
1581
+    pavgw   m2, m3
1582
+    movu    [r0 + r1 * 2], m0
1583
+    movu    [r0 + r8], m2
1584
+
1585
+    movu    ym0, [r2 + r3 * 2 + mmsize]
1586
+    movu    ym1, [r4 + r5 * 2 + mmsize]
1587
+    movu    ym2, [r2 + r6 + mmsize]
1588
+    movu    ym3, [r4 + r7 + mmsize]
1589
+    pavgw   ym0, ym1
1590
+    pavgw   ym2, ym3
1591
+    movu    [r0 + r1 * 2 + mmsize], ym0
1592
+    movu    [r0 + r8 + mmsize], ym2
1593
+%endmacro
1594
+%macro PROCESS_PIXELAVG_ALIGNED_48x8_HBD_AVX512 0
1595
+    mova    m0, [r2]
1596
+    mova    m1, [r4]
1597
+    mova    m2, [r2 + r3]
1598
+    mova    m3, [r4 + r5]
1599
+    pavgw   m0, m1
1600
+    pavgw   m2, m3
1601
+    mova    [r0], m0
1602
+    mova    [r0 + r1], m2
1603
+
1604
+    mova    ym0, [r2 + mmsize]
1605
+    mova    ym1, [r4 + mmsize]
1606
+    mova    ym2, [r2 + r3 + mmsize]
1607
+    mova    ym3, [r4 + r5 + mmsize]
1608
+    pavgw   ym0, ym1
1609
+    pavgw   ym2, ym3
1610
+    mova    [r0 + mmsize], ym0
1611
+    mova    [r0 + r1 + mmsize], ym2
1612
+
1613
+    mova    m0, [r2 + r3 * 2]
1614
+    mova    m1, [r4 + r5 * 2]
1615
+    mova    m2, [r2 + r6]
1616
+    mova    m3, [r4 + r7]
1617
+    pavgw   m0, m1
1618
+    pavgw   m2, m3
1619
+    mova    [r0 + r1 * 2], m0
1620
+    mova    [r0 + r8], m2
1621
+
1622
+    mova    ym0, [r2 + r3 * 2 + mmsize]
1623
+    mova    ym1, [r4 + r5 * 2 + mmsize]
1624
+    mova    ym2, [r2 + r6 + mmsize]
1625
+    mova    ym3, [r4 + r7 + mmsize]
1626
+    pavgw   ym0, ym1
1627
+    pavgw   ym2, ym3
1628
+    mova    [r0 + r1 * 2 + mmsize], ym0
1629
+    mova    [r0 + r8 + mmsize], ym2
1630
+
1631
+    lea     r0, [r0 + 4 * r1]
1632
+    lea     r2, [r2 + 4 * r3]
1633
+    lea     r4, [r4 + 4 * r5]
1634
+
1635
+    mova    m0, [r2]
1636
+    mova    m1, [r4]
1637
+    mova    m2, [r2 + r3]
1638
+    mova    m3, [r4 + r5]
1639
+    pavgw   m0, m1
1640
+    pavgw   m2, m3
1641
+    mova    [r0], m0
1642
+    mova    [r0 + r1], m2
1643
+
1644
+    mova    ym0, [r2 + mmsize]
1645
+    mova    ym1, [r4 + mmsize]
1646
+    mova    ym2, [r2 + r3 + mmsize]
1647
+    mova    ym3, [r4 + r5 + mmsize]
1648
+    pavgw   ym0, ym1
1649
+    pavgw   ym2, ym3
1650
+    mova    [r0 + mmsize], ym0
1651
+    mova    [r0 + r1 + mmsize], ym2
1652
+
1653
+    mova    m0, [r2 + r3 * 2]
1654
+    mova    m1, [r4 + r5 * 2]
1655
+    mova    m2, [r2 + r6]
1656
+    mova    m3, [r4 + r7]
1657
+    pavgw   m0, m1
1658
+    pavgw   m2, m3
1659
+    mova    [r0 + r1 * 2], m0
1660
+    mova    [r0 + r8], m2
1661
+
1662
+    mova    ym0, [r2 + r3 * 2 + mmsize]
1663
+    mova    ym1, [r4 + r5 * 2 + mmsize]
1664
+    mova    ym2, [r2 + r6 + mmsize]
1665
+    mova    ym3, [r4 + r7 + mmsize]
1666
+    pavgw   ym0, ym1
1667
+    pavgw   ym2, ym3
1668
+    mova    [r0 + r1 * 2 + mmsize], ym0
1669
+    mova    [r0 + r8 + mmsize], ym2
1670
+%endmacro
1671
+
1672
+%macro PIXEL_AVG_HBD_W32 1
1673
+INIT_ZMM avx512
1674
+cglobal pixel_avg_32x%1, 6,9,4
1675
+    shl     r1d, 1
1676
+    shl     r3d, 1
1677
+    shl     r5d, 1
1678
+    lea     r6, [r3 * 3]
1679
+    lea     r7, [r5 * 3]
1680
+    lea     r8, [r1 * 3]
1681
+
1682
+%rep %1/8 - 1
1683
+    PROCESS_PIXELAVG_32x8_HBD_AVX512
1684
+    lea     r0, [r0 + 4 * r1]
1685
+    lea     r2, [r2 + 4 * r3]
1686
+    lea     r4, [r4 + 4 * r5]
1687
+%endrep
1688
+    PROCESS_PIXELAVG_32x8_HBD_AVX512
1689
+    RET
1690
+%endmacro
1691
+
1692
+%if ARCH_X86_64
1693
+PIXEL_AVG_HBD_W32 8
1694
+PIXEL_AVG_HBD_W32 16
1695
+PIXEL_AVG_HBD_W32 24
1696
+PIXEL_AVG_HBD_W32 32
1697
+PIXEL_AVG_HBD_W32 64
1698
+%endif
1699
+%macro PIXEL_AVG_HBD_ALIGNED_W32 1
1700
+INIT_ZMM avx512
1701
+cglobal pixel_avg_aligned_32x%1, 6,9,4
1702
+    shl     r1d, 1
1703
+    shl     r3d, 1
1704
+    shl     r5d, 1
1705
+    lea     r6, [r3 * 3]
1706
+    lea     r7, [r5 * 3]
1707
+    lea     r8, [r1 * 3]
1708
+
1709
+%rep %1/8 - 1
1710
+    PROCESS_PIXELAVG_ALIGNED_32x8_HBD_AVX512
1711
+    lea     r0, [r0 + 4 * r1]
1712
+    lea     r2, [r2 + 4 * r3]
1713
+    lea     r4, [r4 + 4 * r5]
1714
+%endrep
1715
+    PROCESS_PIXELAVG_ALIGNED_32x8_HBD_AVX512
1716
+    RET
1717
+%endmacro
1718
+
1719
+%if ARCH_X86_64
1720
+PIXEL_AVG_HBD_ALIGNED_W32 8
1721
+PIXEL_AVG_HBD_ALIGNED_W32 16
1722
+PIXEL_AVG_HBD_ALIGNED_W32 24
1723
+PIXEL_AVG_HBD_ALIGNED_W32 32
1724
+PIXEL_AVG_HBD_ALIGNED_W32 64
1725
+%endif
1726
+
1727
+%macro PIXEL_AVG_HBD_W64 1
1728
+INIT_ZMM avx512
1729
+cglobal pixel_avg_64x%1, 6,9,4
1730
+    shl     r1d, 1
1731
+    shl     r3d, 1
1732
+    shl     r5d, 1
1733
+    lea     r6, [r3 * 3]
1734
+    lea     r7, [r5 * 3]
1735
+    lea     r8, [r1 * 3]
1736
+
1737
+%rep %1/8 - 1
1738
+    PROCESS_PIXELAVG_64x8_HBD_AVX512
1739
+    lea     r0, [r0 + 4 * r1]
1740
+    lea     r2, [r2 + 4 * r3]
1741
+    lea     r4, [r4 + 4 * r5]
1742
+%endrep
1743
+    PROCESS_PIXELAVG_64x8_HBD_AVX512
1744
+    RET
1745
+%endmacro
1746
+
1747
+%if ARCH_X86_64
1748
+PIXEL_AVG_HBD_W64 16
1749
+PIXEL_AVG_HBD_W64 32
1750
+PIXEL_AVG_HBD_W64 48
1751
+PIXEL_AVG_HBD_W64 64
1752
+%endif
1753
+%macro PIXEL_AVG_HBD_ALIGNED_W64 1
1754
+INIT_ZMM avx512
1755
+cglobal pixel_avg_aligned_64x%1, 6,9,4
1756
+    shl     r1d, 1
1757
+    shl     r3d, 1
1758
+    shl     r5d, 1
1759
+    lea     r6, [r3 * 3]
1760
+    lea     r7, [r5 * 3]
1761
+    lea     r8, [r1 * 3]
1762
+
1763
+%rep %1/8 - 1
1764
+    PROCESS_PIXELAVG_ALIGNED_64x8_HBD_AVX512
1765
+    lea     r0, [r0 + 4 * r1]
1766
+    lea     r2, [r2 + 4 * r3]
1767
+    lea     r4, [r4 + 4 * r5]
1768
+%endrep
1769
+    PROCESS_PIXELAVG_ALIGNED_64x8_HBD_AVX512
1770
+    RET
1771
+%endmacro
1772
+
1773
+%if ARCH_X86_64
1774
+PIXEL_AVG_HBD_ALIGNED_W64 16
1775
+PIXEL_AVG_HBD_ALIGNED_W64 32
1776
+PIXEL_AVG_HBD_ALIGNED_W64 48
1777
+PIXEL_AVG_HBD_ALIGNED_W64 64
1778
+%endif
1779
+
1780
+%if ARCH_X86_64
1781
+INIT_ZMM avx512
1782
+cglobal pixel_avg_48x64, 6,9,4
1783
+    shl     r1d, 1
1784
+    shl     r3d, 1
1785
+    shl     r5d, 1
1786
+    lea     r6, [r3 * 3]
1787
+    lea     r7, [r5 * 3]
1788
+    lea     r8, [r1 * 3]
1789
+
1790
+%rep 7
1791
+    PROCESS_PIXELAVG_48x8_HBD_AVX512
1792
+    lea     r0, [r0 + 4 * r1]
1793
+    lea     r2, [r2 + 4 * r3]
1794
+    lea     r4, [r4 + 4 * r5]
1795
+%endrep
1796
+    PROCESS_PIXELAVG_48x8_HBD_AVX512
1797
+    RET
1798
+%endif
1799
+
1800
+%if ARCH_X86_64
1801
+INIT_ZMM avx512
1802
+cglobal pixel_avg_aligned_48x64, 6,9,4
1803
+    shl     r1d, 1
1804
+    shl     r3d, 1
1805
+    shl     r5d, 1
1806
+    lea     r6, [r3 * 3]
1807
+    lea     r7, [r5 * 3]
1808
+    lea     r8, [r1 * 3]
1809
+
1810
+%rep 7
1811
+    PROCESS_PIXELAVG_ALIGNED_48x8_HBD_AVX512
1812
+    lea     r0, [r0 + 4 * r1]
1813
+    lea     r2, [r2 + 4 * r3]
1814
+    lea     r4, [r4 + 4 * r5]
1815
+%endrep
1816
+    PROCESS_PIXELAVG_ALIGNED_48x8_HBD_AVX512
1817
+    RET
1818
+%endif
1819
+;-----------------------------------------------------------------------------
1820
+;pixel_avg_pp avx512 high bit depth code end
1821
+;-----------------------------------------------------------------------------
1822
 %endif ; HIGH_BIT_DEPTH
1823
 
1824
 %if HIGH_BIT_DEPTH == 0
1825
@@ -5395,6 +7018,7 @@
1826
     jg .height_loop
1827
     RET
1828
 
1829
+%if ARCH_X86_64
1830
 INIT_YMM avx2
1831
 cglobal pixel_avg2_w20, 6,7
1832
     sub    r2, r4
1833
@@ -5411,6 +7035,7 @@
1834
     sub    r5d, 2
1835
     jg     .height_loop
1836
     RET
1837
+%endif
1838
 
1839
 ; Cacheline split code for processors with high latencies for loads
1840
 ; split over cache lines.  See sad-a.asm for a more detailed explanation.
1841
x265_2.7.tar.gz/source/common/x86/pixel-a.asm -> x265_2.9.tar.gz/source/common/x86/pixel-a.asm Changed
1567
 
1
@@ -45,6 +45,9 @@
2
            times 2 dw 1, -1
3
            times 4 dw 1
4
            times 2 dw 1, -1
5
+psy_pp_shuff1:   dq 0, 1, 8, 9, 4, 5, 12, 13
6
+psy_pp_shuff2:   dq 2, 3, 10, 11, 6, 7, 14, 15
7
+psy_pp_shuff3:   dq 0, 0, 8, 8, 1, 1, 9, 9
8
 
9
 ALIGN 32
10
 transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
11
@@ -8145,6 +8148,243 @@
12
 %endif ; ARCH_X86_64=1
13
 %endif ; HIGH_BIT_DEPTH
14
 
15
+%macro SATD_AVX512_LOAD4 2 ; size, opmask
16
+    vpbroadcast%1 m0, [r0]
17
+    vpbroadcast%1 m0 {%2}, [r0+2*r1]
18
+    vpbroadcast%1 m2, [r2]
19
+    vpbroadcast%1 m2 {%2}, [r2+2*r3]
20
+    add           r0, r1
21
+    add           r2, r3
22
+    vpbroadcast%1 m1, [r0]
23
+    vpbroadcast%1 m1 {%2}, [r0+2*r1]
24
+    vpbroadcast%1 m3, [r2]
25
+    vpbroadcast%1 m3 {%2}, [r2+2*r3]
26
+%endmacro
27
+
28
+%macro SATD_AVX512_LOAD8 5 ; size, halfreg, opmask1, opmask2, opmask3
29
+    vpbroadcast%1 %{2}0, [r0]
30
+    vpbroadcast%1 %{2}0 {%3}, [r0+2*r1]
31
+    vpbroadcast%1 %{2}2, [r2]
32
+    vpbroadcast%1 %{2}2 {%3}, [r2+2*r3]
33
+    vpbroadcast%1    m0 {%4}, [r0+4*r1]
34
+    vpbroadcast%1    m2 {%4}, [r2+4*r3]
35
+    vpbroadcast%1    m0 {%5}, [r0+2*r4]
36
+    vpbroadcast%1    m2 {%5}, [r2+2*r5]
37
+    vpbroadcast%1 %{2}1, [r0+r1]
38
+    vpbroadcast%1 %{2}1 {%3}, [r0+r4]
39
+    vpbroadcast%1 %{2}3, [r2+r3]
40
+    vpbroadcast%1 %{2}3 {%3}, [r2+r5]
41
+    lea              r0, [r0+4*r1]
42
+    lea              r2, [r2+4*r3]
43
+    vpbroadcast%1    m1 {%4}, [r0+r1]
44
+    vpbroadcast%1    m3 {%4}, [r2+r3]
45
+    vpbroadcast%1    m1 {%5}, [r0+r4]
46
+    vpbroadcast%1    m3 {%5}, [r2+r5]
47
+%endmacro
48
+
49
+%macro SATD_AVX512_PACKED 0
50
+    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
51
+    SUMSUB_BA      w, 0, 1, 2
52
+    SBUTTERFLY   qdq, 0, 1, 2
53
+    SUMSUB_BA      w, 0, 1, 2
54
+    HMAXABSW2         0, 1, 2, 3
55
+%endmacro
56
+
57
+%macro SATD_AVX512_END 0-1 0 ; sa8d
58
+    paddw          m0 {k1}{z}, m1 ; zero-extend to dwords
59
+%if ARCH_X86_64
60
+%if mmsize == 64
61
+    vextracti32x8 ym1, m0, 1
62
+    paddd         ym0, ym1
63
+%endif
64
+%if mmsize >= 32
65
+    vextracti128  xm1, ym0, 1
66
+    paddd        xmm0, xm0, xm1
67
+%endif
68
+    punpckhqdq   xmm1, xmm0, xmm0
69
+    paddd        xmm0, xmm1
70
+    movq          rax, xmm0
71
+    rorx          rdx, rax, 32
72
+%if %1
73
+    lea           eax, [rax+rdx+1]
74
+    shr           eax, 1
75
+%else
76
+    add           eax, edx
77
+%endif
78
+%else
79
+    HADDD          m0, m1
80
+    movd          eax, xm0
81
+%if %1
82
+    inc           eax
83
+    shr           eax, 1
84
+%endif
85
+%endif
86
+    RET
87
+%endmacro
88
+
89
+%macro HMAXABSW2 4 ; a, b, tmp1, tmp2
90
+    pabsw     m%1, m%1
91
+    pabsw     m%2, m%2
92
+    psrldq    m%3, m%1, 2
93
+    psrld     m%4, m%2, 16
94
+    pmaxsw    m%1, m%3
95
+    pmaxsw    m%2, m%4
96
+%endmacro
97
+%if HIGH_BIT_DEPTH==0
98
+INIT_ZMM avx512
99
+cglobal pixel_satd_16x8_internal
100
+    vbroadcasti64x4 m6, [hmul_16p]
101
+    kxnorb           k2, k2, k2
102
+    mov             r4d, 0x55555555
103
+    knotw            k2, k2
104
+    kmovd            k1, r4d
105
+    lea              r4, [3*r1]
106
+    lea              r5, [3*r3]
107
+satd_16x8_avx512:
108
+    vbroadcasti128  ym0,      [r0]
109
+    vbroadcasti32x4  m0 {k2}, [r0+4*r1] ; 0 0 4 4
110
+    vbroadcasti128  ym4,      [r2]
111
+    vbroadcasti32x4  m4 {k2}, [r2+4*r3]
112
+    vbroadcasti128  ym2,      [r0+2*r1]
113
+    vbroadcasti32x4  m2 {k2}, [r0+2*r4] ; 2 2 6 6
114
+    vbroadcasti128  ym5,      [r2+2*r3]
115
+    vbroadcasti32x4  m5 {k2}, [r2+2*r5]
116
+    DIFF_SUMSUB_SSSE3 0, 4, 2, 5, 6
117
+    vbroadcasti128  ym1,      [r0+r1]
118
+    vbroadcasti128  ym4,      [r2+r3]
119
+    vbroadcasti128  ym3,      [r0+r4]
120
+    vbroadcasti128  ym5,      [r2+r5]
121
+    lea              r0, [r0+4*r1]
122
+    lea              r2, [r2+4*r3]
123
+    vbroadcasti32x4  m1 {k2}, [r0+r1] ; 1 1 5 5
124
+    vbroadcasti32x4  m4 {k2}, [r2+r3]
125
+    vbroadcasti32x4  m3 {k2}, [r0+r4] ; 3 3 7 7
126
+    vbroadcasti32x4  m5 {k2}, [r2+r5]
127
+    DIFF_SUMSUB_SSSE3 1, 4, 3, 5, 6
128
+    HADAMARD4_V       0, 1, 2, 3, 4
129
+    HMAXABSW2         0, 2, 4, 5
130
+    HMAXABSW2         1, 3, 4, 5
131
+    paddw            m4, m0, m2 ; m1
132
+    paddw            m2, m1, m3 ; m0
133
+    ret
134
+
135
+cglobal pixel_satd_8x8_internal
136
+    vbroadcasti64x4 m4, [hmul_16p]
137
+    mov     r4d, 0x55555555
138
+    kmovd    k1, r4d   ; 01010101
139
+    kshiftlb k2, k1, 5 ; 10100000
140
+    kshiftlb k3, k1, 4 ; 01010000
141
+    lea      r4, [3*r1]
142
+    lea      r5, [3*r3]
143
+satd_8x8_avx512:
144
+    SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4
145
+    SATD_AVX512_PACKED                  ; 3 1 3 1 7 5 7 5
146
+    ret
147
+
148
+cglobal pixel_satd_16x8, 4,6
149
+    call pixel_satd_16x8_internal_avx512
150
+    jmp satd_zmm_avx512_end
151
+
152
+cglobal pixel_satd_16x16, 4,6
153
+    call pixel_satd_16x8_internal_avx512
154
+    lea      r0, [r0+4*r1]
155
+    lea      r2, [r2+4*r3]
156
+    paddw    m7, m0, m1
157
+    call satd_16x8_avx512
158
+    paddw    m1, m7
159
+    jmp satd_zmm_avx512_end
160
+
161
+cglobal pixel_satd_8x8, 4,6
162
+    call pixel_satd_8x8_internal_avx512
163
+satd_zmm_avx512_end:
164
+    SATD_AVX512_END
165
+
166
+cglobal pixel_satd_8x16, 4,6
167
+    call pixel_satd_8x8_internal_avx512
168
+    lea      r0, [r0+4*r1]
169
+    lea      r2, [r2+4*r3]
170
+    paddw    m5, m0, m1
171
+    call satd_8x8_avx512
172
+    paddw    m1, m5
173
+    jmp satd_zmm_avx512_end
174
+
175
+INIT_YMM avx512
176
+cglobal pixel_satd_4x8_internal
177
+    vbroadcasti128 m4, [hmul_4p]
178
+    mov     r4d, 0x55550c
179
+    kmovd    k2, r4d   ; 00001100
180
+    kshiftlb k3, k2, 2 ; 00110000
181
+    kshiftlb k4, k2, 4 ; 11000000
182
+    kshiftrd k1, k2, 8 ; 01010101
183
+    lea      r4, [3*r1]
184
+    lea      r5, [3*r3]
185
+satd_4x8_avx512:
186
+    SATD_AVX512_LOAD8 d, xm, k2, k3, k4 ; 0 0 2 2 4 4 6 6
187
+satd_ymm_avx512:                        ; 1 1 3 3 5 5 7 7
188
+    SATD_AVX512_PACKED
189
+    ret
190
+
191
+cglobal pixel_satd_8x4, 4,5
192
+    mova     m4, [hmul_16p]
193
+    mov     r4d, 0x5555
194
+    kmovw    k1, r4d
195
+    SATD_AVX512_LOAD4 q, k1 ; 2 0 2 0
196
+    call satd_ymm_avx512    ; 3 1 3 1
197
+    jmp satd_ymm_avx512_end2
198
+
199
+cglobal pixel_satd_4x8, 4,6
200
+    call pixel_satd_4x8_internal_avx512
201
+satd_ymm_avx512_end:
202
+%if ARCH_X86_64 == 0
203
+    pop     r5d
204
+    %assign regs_used 5
205
+%endif
206
+satd_ymm_avx512_end2:
207
+    SATD_AVX512_END
208
+
209
+cglobal pixel_satd_4x16, 4,6
210
+    call pixel_satd_4x8_internal_avx512
211
+    lea      r0, [r0+4*r1]
212
+    lea      r2, [r2+4*r3]
213
+    paddw    m5, m0, m1
214
+    call satd_4x8_avx512
215
+    paddw    m1, m5
216
+    jmp satd_ymm_avx512_end
217
+
218
+INIT_XMM avx512
219
+cglobal pixel_satd_4x4, 4,5
220
+    mova     m4, [hmul_4p]
221
+    mov     r4d, 0x550c
222
+    kmovw    k2, r4d
223
+    kshiftrw k1, k2, 8
224
+    SATD_AVX512_LOAD4 d, k2 ; 0 0 2 2
225
+    SATD_AVX512_PACKED      ; 1 1 3 3
226
+    SWAP      0, 1
227
+    SATD_AVX512_END
228
+
229
+INIT_ZMM avx512
230
+cglobal pixel_sa8d_8x8, 4,6
231
+    vbroadcasti64x4 m4, [hmul_16p]
232
+    mov     r4d, 0x55555555
233
+    kmovd    k1, r4d   ; 01010101
234
+    kshiftlb k2, k1, 5 ; 10100000
235
+    kshiftlb k3, k1, 4 ; 01010000
236
+    lea      r4, [3*r1]
237
+    lea      r5, [3*r3]
238
+    SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4
239
+    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4     ; 3 1 3 1 7 5 7 5
240
+    SUMSUB_BA      w, 0, 1, 2
241
+    SBUTTERFLY   qdq, 0, 1, 2
242
+    SUMSUB_BA      w, 0, 1, 2
243
+    shufps        m2, m0, m1, q2020
244
+    shufps        m1, m0, m1, q3131
245
+    SUMSUB_BA      w, 2, 1, 0
246
+    vshufi32x4    m0, m2, m1, q1010
247
+    vshufi32x4    m1, m2, m1, q3232
248
+    SUMSUB_BA      w, 0, 1, 2
249
+    HMAXABSW2      0, 1, 2, 3
250
+    SATD_AVX512_END 1
251
+%endif
252
 ; Input 10bit, Output 8bit
253
 ;------------------------------------------------------------------------------------------------------------------------
254
 ;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
255
@@ -8523,8 +8763,53 @@
256
 
257
 .end:
258
     RET
259
+INIT_ZMM avx512
260
+cglobal upShift_16, 4,7,4
261
+    mov         r4d, r4m
262
+    mov         r5d, r5m
263
+    movd        xm0, r6m        ; m0 = shift
264
+    vbroadcasti32x4 m3, [pw_pixel_max]
265
+    FIX_STRIDES r1d, r3d
266
+    dec         r5d
267
+.loopH:
268
+    xor         r6d, r6d
269
+.loopW:
270
+    movu        m1, [r0 + r6 * SIZEOF_PIXEL]
271
+    psllw       m1, xm0
272
+    pand        m1, m3
273
+    movu        [r2 + r6 * SIZEOF_PIXEL], m1
274
+
275
+    add         r6, mmsize / SIZEOF_PIXEL
276
+    cmp         r6d, r4d
277
+    jl         .loopW
278
+
279
+    ; move to next row
280
+    add         r0, r1
281
+    add         r2, r3
282
+    dec         r5d
283
+    jnz        .loopH
284
 
285
+    ; processing last row of every frame [To handle width which not a multiple of 32]
286
 
287
+.loop32:
288
+    movu        m1, [r0 + (r4 - mmsize/2) * 2]
289
+    psllw       m1, xm0
290
+    pand        m1, m3
291
+    movu        [r2 + (r4 - mmsize/2) * 2], m1
292
+
293
+    sub         r4d, mmsize/2
294
+    jz         .end
295
+    cmp         r4d, mmsize/2
296
+    jge        .loop32
297
+
298
+    ; process partial pixels
299
+    movu        m1, [r0]
300
+    psllw       m1, xm0
301
+    pand        m1, m3
302
+    movu        [r2], m1
303
+
304
+.end:
305
+    RET
306
 ;---------------------------------------------------------------------------------------------------------------------
307
 ;int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
308
 ;---------------------------------------------------------------------------------------------------------------------
309
@@ -10166,6 +10451,590 @@
310
     pabsd          m11, m11
311
 %endmacro
312
 
313
+%macro PSY_COST_PP_8x8_AVX512_MAIN12 0
314
+    ; load source and recon pixels
315
+    lea             r4, [r1 * 3]
316
+    pmovzxwd        ym0, [r0]
317
+    pmovzxwd        ym1, [r0 + r1]
318
+    pmovzxwd        ym2, [r0 + r1 * 2]
319
+    pmovzxwd        ym3, [r0 + r4]
320
+    lea             r5, [r0 + r1 * 4]
321
+    pmovzxwd        ym4, [r5]
322
+    pmovzxwd        ym5, [r5 + r1]
323
+    pmovzxwd        ym6, [r5 + r1 * 2]
324
+    pmovzxwd        ym7, [r5 + r4]
325
+
326
+    lea             r4, [r3 * 3]
327
+    pmovzxwd        ym16, [r2]
328
+    pmovzxwd        ym17, [r2 + r3]
329
+    pmovzxwd        ym18, [r2 + r3 * 2]
330
+    pmovzxwd        ym19, [r2 + r4]
331
+    lea               r5, [r2 + r3 * 4]
332
+    pmovzxwd        ym20, [r5]
333
+    pmovzxwd        ym21, [r5 + r3]
334
+    pmovzxwd        ym22, [r5 + r3 * 2]
335
+    pmovzxwd        ym23, [r5 + r4]
336
+
337
+    vinserti64x4    m0, m0, ym16, 1
338
+    vinserti64x4    m1, m1, ym17, 1
339
+    vinserti64x4    m2, m2, ym18, 1
340
+    vinserti64x4    m3, m3, ym19, 1
341
+    vinserti64x4    m4, m4, ym20, 1
342
+    vinserti64x4    m5, m5, ym21, 1
343
+    vinserti64x4    m6, m6, ym22, 1
344
+    vinserti64x4    m7, m7, ym23, 1
345
+
346
+    ; source +  recon SAD
347
+    paddd           m8, m0, m1
348
+    paddd           m8, m2
349
+    paddd           m8, m3
350
+    paddd           m8, m4
351
+    paddd           m8, m5
352
+    paddd           m8, m6
353
+    paddd           m8, m7
354
+
355
+    vextracti64x4   ym15, m8, 1
356
+
357
+    vextracti128    xm9, ym8, 1
358
+    paddd           ym8, ym9              ; sad_8x8
359
+    movhlps         xm9, xm8
360
+    paddd           xm8, xm9
361
+    pshuflw         xm9, xm8, 0Eh
362
+    paddd           xm8, xm9
363
+    psrld           ym8, 2
364
+
365
+    vextracti128    xm9, ym15, 1
366
+    paddd           ym15, ym9              ; sad_8x8
367
+    movhlps         xm9, xm15
368
+    paddd           xm15, xm9
369
+    pshuflw         xm9, xm15, 0Eh
370
+    paddd           xm15, xm9
371
+    psrld           ym15, 2
372
+
373
+    ; source and recon SA8D
374
+    psubd           m9, m1, m0
375
+    paddd           m0, m1
376
+    psubd           m1, m3, m2
377
+    paddd           m2, m3
378
+    punpckhdq       m3, m0, m9
379
+    punpckldq       m0, m9
380
+    psubd           m9, m3, m0
381
+    paddd           m0, m3
382
+    punpckhdq       m3, m2, m1
383
+    punpckldq       m2, m1
384
+    psubd           m10, m3, m2
385
+    paddd           m2, m3
386
+    psubd           m3, m5, m4
387
+    paddd           m4, m5
388
+    psubd           m5, m7, m6
389
+    paddd           m6, m7
390
+    punpckhdq       m1, m4, m3
391
+    punpckldq       m4, m3
392
+    psubd           m7, m1, m4
393
+    paddd           m4, m1
394
+    punpckhdq       m3, m6, m5
395
+    punpckldq       m6, m5
396
+    psubd           m1, m3, m6
397
+    paddd           m6, m3
398
+    psubd           m3, m2, m0
399
+    paddd           m0, m2
400
+    psubd           m2, m10, m9
401
+    paddd           m9, m10
402
+    punpckhqdq      m5, m0, m3
403
+    punpcklqdq      m0, m3
404
+    psubd           m10, m5, m0
405
+    paddd           m0, m5
406
+    punpckhqdq      m3, m9, m2
407
+    punpcklqdq      m9, m2
408
+    psubd           m5, m3, m9
409
+    paddd           m9, m3
410
+    psubd           m3, m6, m4
411
+    paddd           m4, m6
412
+    psubd           m6, m1, m7
413
+    paddd           m7, m1
414
+    punpckhqdq      m2, m4, m3
415
+    punpcklqdq      m4, m3
416
+    psubd           m1, m2, m4
417
+    paddd           m4, m2
418
+    punpckhqdq      m3, m7, m6
419
+    punpcklqdq      m7, m6
420
+
421
+    psubd           m2, m3, m7
422
+    paddd           m7, m3
423
+    psubd           m3, m4, m0
424
+    paddd           m0, m4
425
+    psubd           m4, m1, m10
426
+    paddd           m10, m1
427
+
428
+    mova       m16,    m13
429
+    mova       m17,    m14
430
+    vpermi2q   m16,    m0, m3
431
+    vpermi2q   m17,    m0, m3
432
+
433
+    pabsd           m17, m17
434
+    pabsd           m16, m16
435
+    pmaxsd          m17, m16
436
+
437
+    mova       m18,    m13
438
+    mova       m19,    m14
439
+    vpermi2q   m18,    m10, m4
440
+    vpermi2q   m19,    m10, m4
441
+
442
+    pabsd           m19, m19
443
+    pabsd           m18, m18
444
+    pmaxsd          m19, m18
445
+    psubd           m18, m7, m9
446
+    paddd           m9, m7
447
+    psubd           m7, m2, m5
448
+    paddd           m5, m2
449
+
450
+    mova       m20,    m13
451
+    mova       m21,    m14
452
+    vpermi2q   m20,    m9, m18
453
+    vpermi2q   m21,    m9, m18
454
+
455
+    pabsd           m21, m21
456
+    pabsd           m20, m20
457
+    pmaxsd          m21, m20
458
+
459
+    mova       m22,    m13
460
+    mova       m23,    m14
461
+    vpermi2q   m22,    m5, m7
462
+    vpermi2q   m23,    m5, m7
463
+
464
+    pabsd           m23, m23
465
+    pabsd           m22, m22
466
+    pmaxsd          m23, m22
467
+    paddd           m17, m21
468
+    paddd           m17, m19
469
+    paddd           m17, m23
470
+
471
+    vextracti64x4   ym26, m17, 1
472
+
473
+    vextracti128    xm9, m17, 1
474
+    paddd           ym17, ym9              ; sad_8x8
475
+    movhlps         xm9, xm17
476
+    paddd           xm17, xm9
477
+    pshuflw         xm9, xm17, 0Eh
478
+    paddd           xm17, xm9
479
+    paddd           ym17, [pd_1]
480
+    psrld           ym17, 1               ; sa8d_8x8
481
+
482
+    vextracti128    xm9, ym26, 1
483
+    paddd           ym26, ym9              ; sad_8x8
484
+    movhlps         xm9, xm26
485
+    paddd           xm26, xm9
486
+    pshuflw         xm9, xm26, 0Eh
487
+    paddd           xm26, xm9
488
+    paddd           ym26, [pd_1]
489
+    psrld           ym26, 1               ; sa8d_8x8
490
+
491
+
492
+
493
+    psubd           ym11, ym17, ym8         ; sa8d_8x8 - sad_8x8
494
+    psubd           ym12, ym26, ym15        ; sa8d_8x8 - sad_8x8
495
+
496
+    psubd          ym11, ym12
497
+    pabsd          ym11, ym11
498
+%endmacro
499
+
500
+%macro PSY_PP_INPUT_AVX512_MAIN10 0
501
+    lea             r4, [r1 * 3]
502
+    movu           xm0, [r0]
503
+    movu           xm1, [r0 + r1]
504
+    movu           xm2, [r0 + r1 * 2]
505
+    movu           xm3, [r0 + r4]
506
+    lea             r5, [r0 + r1 * 4]
507
+    movu           xm4, [r5]
508
+    movu           xm5, [r5 + r1]
509
+    movu           xm6, [r5 + r1 * 2]
510
+    movu           xm7, [r5 + r4]
511
+
512
+    lea             r4, [r3 * 3]
513
+    vinserti128     ym0, ym0, [r2], 1
514
+    vinserti128     ym1, ym1, [r2 + r3], 1
515
+    vinserti128     ym2, ym2, [r2 + r3 * 2], 1
516
+    vinserti128     ym3, ym3, [r2 + r4], 1
517
+    lea             r5, [r2 + r3 * 4]
518
+    vinserti128     ym4, ym4, [r5], 1
519
+    vinserti128     ym5, ym5, [r5 + r3], 1
520
+    vinserti128     ym6, ym6, [r5 + r3 * 2], 1
521
+    vinserti128     ym7, ym7, [r5 + r4], 1
522
+
523
+    add             r0, 16
524
+    add             r2, 16
525
+
526
+    lea             r4, [r1 * 3]
527
+    vinserti32x4    m0, m0, [r0], 2
528
+    vinserti32x4    m1, m1, [r0 + r1], 2
529
+    vinserti32x4    m2, m2, [r0 + r1 * 2], 2
530
+    vinserti32x4    m3, m3, [r0 + r4], 2
531
+    lea             r5, [r0 + r1 * 4]
532
+    vinserti32x4    m4, m4, [r5], 2
533
+    vinserti32x4    m5, m5, [r5 + r1], 2
534
+    vinserti32x4    m6, m6, [r5 + r1 * 2], 2
535
+    vinserti32x4    m7, m7, [r5 + r4], 2
536
+
537
+    lea             r4, [r3 * 3]
538
+    vinserti32x4    m0, m0, [r2], 3
539
+    vinserti32x4    m1, m1, [r2 + r3], 3
540
+    vinserti32x4    m2, m2, [r2 + r3 * 2], 3
541
+    vinserti32x4    m3, m3, [r2 + r4], 3
542
+    lea             r5, [r2 + r3 * 4]
543
+    vinserti32x4    m4, m4, [r5], 3
544
+    vinserti32x4    m5, m5, [r5 + r3], 3
545
+    vinserti32x4    m6, m6, [r5 + r3 * 2], 3
546
+    vinserti32x4    m7, m7, [r5 + r4], 3
547
+%endmacro
548
+
549
+
550
+%macro PSY_PP_16x8_AVX512_MAIN10 0
551
+    paddw           m8, m0, m1
552
+    paddw           m8, m2
553
+    paddw           m8, m3
554
+    paddw           m8, m4
555
+    paddw           m8, m5
556
+    paddw           m8, m6
557
+    paddw           m8, m7
558
+    pmaddwd         m8, m14
559
+
560
+    psrldq          m9, m8, 8
561
+    paddd           m8, m9
562
+    psrldq          m9, m8, 4
563
+    paddd           m8, m9
564
+    psrld           m8, 2
565
+
566
+    psubw           m9, m1, m0
567
+    paddw           m0, m1
568
+    psubw           m1, m3, m2
569
+    paddw           m2, m3
570
+    punpckhwd       m3, m0, m9
571
+    punpcklwd       m0, m9
572
+    psubw           m9, m3, m0
573
+    paddw           m0, m3
574
+    punpckhwd       m3, m2, m1
575
+    punpcklwd       m2, m1
576
+    psubw           m10, m3, m2
577
+    paddw           m2, m3
578
+
579
+    psubw           m3, m5, m4
580
+    paddw           m4, m5
581
+    psubw           m5, m7, m6
582
+    paddw           m6, m7
583
+    punpckhwd       m1, m4, m3
584
+    punpcklwd       m4, m3
585
+    psubw           m7, m1, m4
586
+    paddw           m4, m1
587
+    punpckhwd       m3, m6, m5
588
+    punpcklwd       m6, m5
589
+    psubw           m1, m3, m6
590
+    paddw           m6, m3
591
+
592
+    psubw           m3, m2, m0
593
+    paddw           m0, m2
594
+    psubw           m2, m10, m9
595
+    paddw           m9, m10
596
+    punpckhdq       m5, m0, m3
597
+    punpckldq       m0, m3
598
+    psubw           m10, m5, m0
599
+    paddw           m0, m5
600
+    punpckhdq       m3, m9, m2
601
+    punpckldq       m9, m2
602
+    psubw           m5, m3, m9
603
+    paddw           m9, m3
604
+
605
+    psubw           m3, m6, m4
606
+    paddw           m4, m6
607
+    psubw           m6, m1, m7
608
+    paddw           m7, m1
609
+    punpckhdq       m2, m4, m3
610
+    punpckldq       m4, m3
611
+    psubw           m1, m2, m4
612
+    paddw           m4, m2
613
+    punpckhdq       m3, m7, m6
614
+    punpckldq       m7, m6
615
+    psubw           m2, m3, m7
616
+    paddw           m7, m3
617
+
618
+    psubw           m3, m4, m0
619
+    paddw           m0, m4
620
+    psubw           m4, m1, m10
621
+    paddw           m10, m1
622
+    punpckhqdq      m6, m0, m3
623
+    punpcklqdq      m0, m3
624
+    pabsw           m0, m0
625
+    pabsw           m6, m6
626
+    pmaxsw          m0, m6
627
+    punpckhqdq      m3, m10, m4
628
+    punpcklqdq      m10, m4
629
+    pabsw           m10, m10
630
+    pabsw           m3, m3
631
+    pmaxsw          m10, m3
632
+
633
+    psubw           m3, m7, m9
634
+    paddw           m9, m7
635
+    psubw           m7, m2, m5
636
+    paddw           m5, m2
637
+    punpckhqdq      m4, m9, m3
638
+    punpcklqdq      m9, m3
639
+    pabsw           m9, m9
640
+    pabsw           m4, m4
641
+    pmaxsw          m9, m4
642
+    punpckhqdq      m3, m5, m7
643
+    punpcklqdq      m5, m7
644
+    pabsw           m5, m5
645
+    pabsw           m3, m3
646
+    pmaxsw          m5, m3
647
+
648
+    paddd           m0, m9
649
+    paddd           m0, m10
650
+    paddd           m0, m5
651
+    psrld           m9, m0, 16
652
+    pslld           m0, 16
653
+    psrld           m0, 16
654
+    paddd           m0, m9
655
+    psrldq          m9, m0, 8
656
+    paddd           m0, m9
657
+    psrldq          m9, m0, 4
658
+    paddd           m0, m9
659
+    paddd           m0, m15
660
+    psrld           m0, 1
661
+    psubd           m0, m8
662
+
663
+    vextracti64x4   ym2, m0, 1
664
+
665
+    vextracti128   xm3, ym2, 1
666
+    psubd          xm3, xm2
667
+    pabsd          xm3, xm3
668
+
669
+    vextracti128   xm1, ym0, 1
670
+    psubd          xm1, xm0
671
+    pabsd          xm1, xm1
672
+    paddd          xm1, xm3
673
+%endmacro
674
+
675
+%macro PSY_PP_INPUT_AVX512_MAIN 0
676
+    movu       xm16, [r0 + r1 * 0]
677
+    movu       xm17, [r0 + r1 * 1]
678
+    movu       xm18, [r0 + r1 * 2]
679
+    movu       xm19, [r0 + r4 * 1]
680
+
681
+    movu       xm20, [r2 + r3 * 0]
682
+    movu       xm21, [r2 + r3 * 1]
683
+    movu       xm22, [r2 + r3 * 2]
684
+    movu       xm23, [r2 + r7 * 1]
685
+
686
+    mova         m0, m26
687
+    vpermi2q     m0, m16, m20
688
+    mova         m1, m26
689
+    vpermi2q     m1, m17, m21
690
+    mova         m2, m26
691
+    vpermi2q     m2, m18, m22
692
+    mova         m3, m26
693
+    vpermi2q     m3, m19, m23
694
+
695
+
696
+    lea          r5, [r0 + r1 * 4]
697
+    lea          r6, [r2 + r3 * 4]
698
+
699
+    movu      xm16, [r5 + r1 * 0]
700
+    movu      xm17, [r5 + r1 * 1]
701
+    movu      xm18, [r5 + r1 * 2]
702
+    movu      xm19, [r5 + r4 * 1]
703
+
704
+    movu      xm20, [r6 + r3 * 0]
705
+    movu      xm21, [r6 + r3 * 1]
706
+    movu      xm22, [r6 + r3 * 2]
707
+    movu      xm23, [r6 + r7 * 1]
708
+
709
+    mova        m4, m26
710
+    vpermi2q    m4, m16, m20
711
+    mova        m5, m26
712
+    vpermi2q    m5, m17, m21
713
+    mova        m6, m26
714
+    vpermi2q    m6, m18, m22
715
+    mova        m7, m26
716
+    vpermi2q    m7, m19, m23
717
+%endmacro
718
+
719
+%macro PSY_PP_16x8_AVX512_MAIN 0
720
+    pmaddubsw       m0, m8
721
+    pmaddubsw       m1, m8
722
+    pmaddubsw       m2, m8
723
+    pmaddubsw       m3, m8
724
+    pmaddubsw       m4, m8
725
+    pmaddubsw       m5, m8
726
+    pmaddubsw       m6, m8
727
+    pmaddubsw       m7, m8
728
+
729
+    paddw           m11, m0, m1
730
+    paddw           m11, m2
731
+    paddw           m11, m3
732
+    paddw           m11, m4
733
+    paddw           m11, m5
734
+    paddw           m11, m6
735
+    paddw           m11, m7
736
+
737
+    pmaddwd         m11, m14
738
+    psrldq          m10, m11, 4
739
+    paddd           m11, m10
740
+    psrld           m11, 2
741
+
742
+    mova            m9, m0
743
+    paddw           m0, m1
744
+    psubw           m1, m9
745
+    mova            m9, m2
746
+    paddw           m2, m3
747
+    psubw           m3, m9
748
+    mova            m9, m0
749
+    paddw           m0, m2
750
+    psubw           m2, m9
751
+    mova            m9, m1
752
+    paddw           m1, m3
753
+    psubw           m3, m9
754
+
755
+    movdqa          m9, m4
756
+    paddw           m4, m5
757
+    psubw           m5, m9
758
+    movdqa          m9, m6
759
+    paddw           m6, m7
760
+    psubw           m7, m9
761
+    movdqa          m9, m4
762
+    paddw           m4, m6
763
+    psubw           m6, m9
764
+    movdqa          m9, m5
765
+    paddw           m5, m7
766
+    psubw           m7, m9
767
+
768
+    movdqa          m9, m0
769
+    paddw           m0, m4
770
+    psubw           m4, m9
771
+    movdqa          m9, m1
772
+    paddw           m1, m5
773
+    psubw           m5, m9
774
+
775
+    mova            m9, m0
776
+    vshufps         m9, m9, m4, 11011101b
777
+    vshufps         m0, m0, m4, 10001000b
778
+
779
+    movdqa          m4, m0
780
+    paddw           m16, m0, m9
781
+    psubw           m17, m9, m4
782
+
783
+    movaps          m4, m1
784
+    vshufps         m4, m4, m5, 11011101b
785
+    vshufps         m1, m1, m5, 10001000b
786
+
787
+    movdqa          m5, m1
788
+    paddw           m18, m1, m4
789
+    psubw           m19, m4, m5
790
+
791
+    movdqa          m5, m2
792
+    paddw           m2, m6
793
+    psubw           m6, m5
794
+    movdqa          m5, m3
795
+    paddw           m3, m7
796
+    psubw           m7, m5
797
+
798
+    movaps          m5, m2
799
+    vshufps         m5, m5, m6, 11011101b
800
+    vshufps         m2, m2, m6, 10001000b
801
+
802
+    movdqa          m6, m2
803
+    paddw           m20, m2, m5
804
+    psubw           m21, m5, m6
805
+
806
+    movaps          m6, m3
807
+
808
+    vshufps         m6, m6, m7, 11011101b
809
+    vshufps         m3, m3, m7, 10001000b
810
+
811
+    movdqa          m7, m3
812
+    paddw           m22, m3, m6
813
+    psubw           m23, m6, m7
814
+
815
+    movdqa          m7, m16
816
+
817
+    vextracti64x4    ym24,  m16, 1
818
+    vextracti64x4    ym25,  m17, 1
819
+    pblendw          ym16, ym17, 10101010b
820
+    pblendw          ym24, ym25, 10101010b
821
+    vinserti64x4     m16, m16, ym24, 1
822
+
823
+    pslld           m17, 10h
824
+    psrld           m7, 10h
825
+    por             m17, m7
826
+    pabsw           m16, m16
827
+    pabsw           m17, m17
828
+    pmaxsw          m16, m17
829
+    movdqa          m7, m18
830
+
831
+    vextracti64x4    ym24,  m18, 1
832
+    vextracti64x4    ym25,  m19, 1
833
+    pblendw          ym18,  ym19, 10101010b
834
+    pblendw          ym24,  ym25, 10101010b
835
+    vinserti64x4     m18, m18, ym24, 1
836
+
837
+    pslld           m19, 10h
838
+    psrld           m7, 10h
839
+    por             m19, m7
840
+    pabsw           m18, m18
841
+    pabsw           m19, m19
842
+    pmaxsw          m18, m19
843
+    movdqa          m7, m20
844
+
845
+    vextracti64x4    ym24,  m20, 1
846
+    vextracti64x4    ym25,  m21, 1
847
+    pblendw          ym20,  ym21, 10101010b
848
+    pblendw          ym24,  ym25, 10101010b
849
+    vinserti64x4     m20,   m20, ym24, 1
850
+
851
+    pslld           m21, 10h
852
+    psrld           m7, 10h
853
+    por             m21, m7
854
+    pabsw           m20, m20
855
+    pabsw           m21, m21
856
+    pmaxsw          m20, m21
857
+    mova            m7, m22
858
+
859
+    vextracti64x4    ym24,  m22, 1
860
+    vextracti64x4    ym25,  m23, 1
861
+    pblendw          ym22,  ym23, 10101010b
862
+    pblendw          ym24,  ym25, 10101010b
863
+    vinserti64x4     m22,   m22,  ym24, 1
864
+
865
+    pslld           m23, 10h
866
+    psrld           m7, 10h
867
+    por             m23, m7
868
+    pabsw           m22, m22
869
+    pabsw           m23, m23
870
+    pmaxsw          m22, m23
871
+    paddw           m16, m18
872
+    paddw           m16, m20
873
+    paddw           m16, m22
874
+    pmaddwd         m16, m14
875
+    psrldq          m1, m16, 8
876
+    paddd           m16, m1
877
+
878
+    pshuflw         m1, m16, 00001110b
879
+    paddd           m16, m1
880
+    paddd           m16, m15
881
+    psrld           m16, 1
882
+
883
+    psubd           m16, m11
884
+    vextracti64x4   ym2, m16, 1
885
+
886
+    vextracti128    xm1, ym16, 1
887
+    psubd           xm16, xm1
888
+    pabsd           xm16, xm16
889
+
890
+    vextracti128   xm3, ym2, 1
891
+    psubd          xm3, xm2
892
+    pabsd          xm3, xm3
893
+    paddd          xm16, xm3
894
+%endmacro
895
+
896
+
897
 %if ARCH_X86_64
898
 INIT_YMM avx2
899
 %if HIGH_BIT_DEPTH && BIT_DEPTH == 12
900
@@ -10435,6 +11304,257 @@
901
     RET
902
 %endif
903
 %endif
904
+%if ARCH_X86_64
905
+INIT_ZMM avx512
906
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
907
+cglobal psyCost_pp_16x16, 4, 10, 27
908
+    add            r1d, r1d
909
+    add            r3d, r3d
910
+    pxor           m24, m24
911
+    movu       m13,    [psy_pp_shuff1]
912
+    movu       m14,    [psy_pp_shuff2]
913
+
914
+    mov            r8d, 2
915
+.loopH:
916
+    mov            r9d, 2
917
+.loopW:
918
+    PSY_COST_PP_8x8_AVX512_MAIN12
919
+
920
+    paddd         xm24, xm11
921
+    add             r0, 16
922
+    add             r2, 16
923
+    dec            r9d
924
+    jnz            .loopW
925
+    lea             r0, [r0 + r1 * 8 - 32]
926
+    lea             r2, [r2 + r3 * 8 - 32]
927
+    dec            r8d
928
+    jnz            .loopH
929
+    movd           eax, xm24
930
+    RET
931
+%endif
932
+
933
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
934
+cglobal psyCost_pp_16x16, 4, 10, 16
935
+    add            r1d, r1d
936
+    add            r3d, r3d
937
+    pxor           m11, m11
938
+    vbroadcasti32x8 m14, [pw_1]
939
+    vbroadcasti32x8 m15, [pd_1]
940
+
941
+    mov            r8d, 2
942
+.loopH:
943
+    PSY_PP_INPUT_AVX512_MAIN10
944
+    PSY_PP_16x8_AVX512_MAIN10
945
+
946
+    paddd         xm11, xm1
947
+    lea             r0, [r0 + r1 * 8 - 16]
948
+    lea             r2, [r2 + r3 * 8 - 16]
949
+    dec            r8d
950
+    jnz            .loopH
951
+    movd           eax, xm11
952
+    RET
953
+%endif
954
+
955
+%if BIT_DEPTH == 8
956
+cglobal psyCost_pp_16x16, 4, 10, 27
957
+    lea             r4, [3 * r1]
958
+    lea             r7, [3 * r3]
959
+    vbroadcasti32x8  m8, [hmul_8p]
960
+    pxor            m13, m13
961
+    vbroadcasti32x8 m14, [pw_1]
962
+    vbroadcasti32x8 m15, [pd_1]
963
+    movu            m26, [psy_pp_shuff3]
964
+
965
+    mov             r8d, 2
966
+.loopH:
967
+    PSY_PP_INPUT_AVX512_MAIN
968
+    PSY_PP_16x8_AVX512_MAIN
969
+
970
+    paddd           m13, m16
971
+    lea             r0, [r0 + r1 * 8]
972
+    lea             r2, [r2 + r3 * 8]
973
+    dec             r8d
974
+    jnz             .loopH
975
+    movd            eax, xm13
976
+    RET
977
+%endif
978
+%endif
979
+
980
+%if ARCH_X86_64
981
+INIT_ZMM avx512
982
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
983
+cglobal psyCost_pp_32x32, 4, 10, 27
984
+    add            r1d, r1d
985
+    add            r3d, r3d
986
+    pxor           m24, m24
987
+    movu       m13,    [psy_pp_shuff1]
988
+    movu       m14,    [psy_pp_shuff2]
989
+
990
+    mov            r8d, 4
991
+.loopH:
992
+    mov            r9d, 4
993
+.loopW:
994
+    PSY_COST_PP_8x8_AVX512_MAIN12
995
+
996
+    paddd         xm24, xm11
997
+    add             r0, 16
998
+    add             r2, 16
999
+    dec            r9d
1000
+    jnz            .loopW
1001
+    lea             r0, [r0 + r1 * 8 - 64]
1002
+    lea             r2, [r2 + r3 * 8 - 64]
1003
+    dec            r8d
1004
+    jnz            .loopH
1005
+    movd           eax, xm24
1006
+    RET
1007
+%endif
1008
+
1009
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
1010
+cglobal psyCost_pp_32x32, 4, 10, 16
1011
+    add            r1d, r1d
1012
+    add            r3d, r3d
1013
+    pxor           m11, m11
1014
+    vbroadcasti32x8 m14, [pw_1]
1015
+    vbroadcasti32x8 m15, [pd_1]
1016
+
1017
+    mov            r8d, 4
1018
+.loopH:
1019
+    mov            r9d, 2
1020
+.loopW:
1021
+    PSY_PP_INPUT_AVX512_MAIN10
1022
+    PSY_PP_16x8_AVX512_MAIN10
1023
+
1024
+    paddd         xm11, xm1
1025
+    add             r0, 16
1026
+    add             r2, 16
1027
+    dec            r9d
1028
+    jnz            .loopW
1029
+    lea             r0, [r0 + r1 * 8 - 64]
1030
+    lea             r2, [r2 + r3 * 8 - 64]
1031
+    dec            r8d
1032
+    jnz            .loopH
1033
+    movd           eax, xm11
1034
+    RET
1035
+%endif
1036
+
1037
+%if BIT_DEPTH == 8
1038
+cglobal psyCost_pp_32x32, 4, 10, 27
1039
+    lea             r4, [3 * r1]
1040
+    lea             r7, [3 * r3]
1041
+    vbroadcasti32x8  m8, [hmul_8p]
1042
+    pxor            m13, m13
1043
+    vbroadcasti32x8 m14, [pw_1]
1044
+    vbroadcasti32x8 m15, [pd_1]
1045
+    movu            m26, [psy_pp_shuff3]
1046
+
1047
+    mov             r8d, 4
1048
+.loopH:
1049
+    mov             r9d, 2
1050
+.loopW:
1051
+    PSY_PP_INPUT_AVX512_MAIN
1052
+    PSY_PP_16x8_AVX512_MAIN
1053
+
1054
+    paddd           m13, m16
1055
+    add             r0, 16
1056
+    add             r2, 16
1057
+    dec             r9d
1058
+    jnz             .loopW
1059
+    lea             r0, [r0 + r1 * 8 - 32]
1060
+    lea             r2, [r2 + r3 * 8 - 32]
1061
+    dec             r8d
1062
+    jnz             .loopH
1063
+    movd            eax, xm13
1064
+    RET
1065
+%endif
1066
+%endif
1067
+
1068
+%if ARCH_X86_64
1069
+INIT_ZMM avx512
1070
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
1071
+cglobal psyCost_pp_64x64, 4, 10, 27
1072
+    add            r1d, r1d
1073
+    add            r3d, r3d
1074
+    pxor           m24, m24
1075
+    movu       m13,    [psy_pp_shuff1]
1076
+    movu       m14,    [psy_pp_shuff2]
1077
+
1078
+    mov            r8d, 8
1079
+.loopH:
1080
+    mov            r9d, 8
1081
+.loopW:
1082
+    PSY_COST_PP_8x8_AVX512_MAIN12
1083
+
1084
+    paddd         xm24, xm11
1085
+    add             r0, 16
1086
+    add             r2, 16
1087
+    dec            r9d
1088
+    jnz            .loopW
1089
+    lea             r0, [r0 + r1 * 8 - 128]
1090
+    lea             r2, [r2 + r3 * 8 - 128]
1091
+    dec            r8d
1092
+    jnz            .loopH
1093
+    movd           eax, xm24
1094
+    RET
1095
+%endif
1096
+
1097
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
1098
+cglobal psyCost_pp_64x64, 4, 10, 16
1099
+    add            r1d, r1d
1100
+    add            r3d, r3d
1101
+    pxor           m11, m11
1102
+    vbroadcasti32x8 m14, [pw_1]
1103
+    vbroadcasti32x8 m15, [pd_1]
1104
+
1105
+    mov            r8d, 8
1106
+.loopH:
1107
+    mov            r9d, 4
1108
+.loopW:
1109
+    PSY_PP_INPUT_AVX512_MAIN10
1110
+    PSY_PP_16x8_AVX512_MAIN10
1111
+
1112
+    paddd         xm11, xm1
1113
+    add             r0, 16
1114
+    add             r2, 16
1115
+    dec            r9d
1116
+    jnz            .loopW
1117
+    lea             r0, [r0 + r1 * 8 - 128]
1118
+    lea             r2, [r2 + r3 * 8 - 128]
1119
+    dec            r8d
1120
+    jnz            .loopH
1121
+    movd           eax, xm11
1122
+    RET
1123
+%endif
1124
+
1125
+%if BIT_DEPTH == 8
1126
+cglobal psyCost_pp_64x64, 4, 10, 27
1127
+    lea             r4, [3 * r1]
1128
+    lea             r7, [3 * r3]
1129
+    vbroadcasti32x8  m8, [hmul_8p]
1130
+    pxor            m13, m13
1131
+    vbroadcasti32x8 m14, [pw_1]
1132
+    vbroadcasti32x8 m15, [pd_1]
1133
+    movu            m26, [psy_pp_shuff3]
1134
+
1135
+    mov             r8d, 8
1136
+.loopH:
1137
+    mov             r9d, 4
1138
+.loopW:
1139
+    PSY_PP_INPUT_AVX512_MAIN
1140
+    PSY_PP_16x8_AVX512_MAIN
1141
+
1142
+    paddd           m13, m16
1143
+    add             r0, 16
1144
+    add             r2, 16
1145
+    dec             r9d
1146
+    jnz             .loopW
1147
+    lea             r0, [r0 + r1 * 8 - 64]
1148
+    lea             r2, [r2 + r3 * 8 - 64]
1149
+    dec             r8d
1150
+    jnz             .loopH
1151
+    movd            eax, xm13
1152
+    RET
1153
+%endif
1154
+%endif
1155
 
1156
 ;---------------------------------------------------------------------------------------------------------------------
1157
 ;int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
1158
@@ -12993,8 +14113,134 @@
1159
     paddd           xm0, xm1
1160
     movd            eax, xm0
1161
     RET
1162
-%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
1163
 
1164
+%macro PROCESS_SATD_32x4_AVX512 0        ; function to compute satd cost for 32 columns, 4 rows
1165
+    ; rows 0-3
1166
+    pmovzxbw         m0, [r0]
1167
+    pmovzxbw         m4, [r2]
1168
+    psubw           m0, m4
1169
+    pmovzxbw         m1, [r0 + r1]
1170
+    pmovzxbw         m5, [r2 + r3]
1171
+    psubw           m1, m5
1172
+    pmovzxbw         m2, [r0 + r1 * 2]
1173
+    pmovzxbw         m4, [r2 + r3 * 2]
1174
+    psubw           m2, m4
1175
+    pmovzxbw         m3, [r0 + r4]
1176
+    pmovzxbw         m5, [r2 + r5]
1177
+    psubw           m3, m5
1178
+    paddw           m4, m0, m1
1179
+    psubw           m1, m0
1180
+    paddw           m0, m2, m3
1181
+    psubw           m3, m2
1182
+    punpckhwd       m2, m4, m1
1183
+    punpcklwd       m4, m1
1184
+    punpckhwd       m1, m0, m3
1185
+    punpcklwd       m0, m3
1186
+    paddw           m3, m4, m0
1187
+    psubw           m0, m4
1188
+    paddw           m4, m2, m1
1189
+    psubw           m1, m2
1190
+    punpckhdq       m2, m3, m0
1191
+    punpckldq       m3, m0
1192
+    paddw           m0, m3, m2
1193
+    psubw           m2, m3
1194
+    punpckhdq       m3, m4, m1
1195
+    punpckldq       m4, m1
1196
+    paddw           m1, m4, m3
1197
+    psubw           m3, m4
1198
+    punpckhqdq      m4, m0, m1
1199
+    punpcklqdq      m0, m1
1200
+    pabsw           m0, m0
1201
+    pabsw           m4, m4
1202
+    pmaxsw          m0, m0, m4
1203
+    punpckhqdq      m1, m2, m3
1204
+    punpcklqdq      m2, m3
1205
+    pabsw           m2, m2
1206
+    pabsw           m1, m1
1207
+    pmaxsw          m2, m1
1208
+    pxor            m7, m7
1209
+    mova            m1, m0
1210
+    punpcklwd       m1, m7
1211
+    paddd           m6, m1
1212
+    mova            m1, m0
1213
+    punpckhwd       m1, m7
1214
+    paddd           m6, m1
1215
+    pxor            m7, m7
1216
+    mova            m1, m2
1217
+    punpcklwd       m1, m7
1218
+    paddd           m6, m1
1219
+    mova            m1, m2
1220
+    punpckhwd       m1, m7
1221
+    paddd           m6, m1
1222
+%endmacro
1223
+
1224
+%macro SATD_MAIN_AVX512_END 0
1225
+    vextracti32x8   ym7,   m6,   1
1226
+    paddd           ym6,   ym7
1227
+    vextracti128    xm7,   ym6,  1
1228
+    paddd           xm6,   xm6,  xm7
1229
+    punpckhqdq      xm7,   xm6,  xm6
1230
+    paddd           xm6,   xm7
1231
+    movq            rax,   xm6
1232
+    rorx            rdx,   rax,  32
1233
+    add             eax,   edx
1234
+%endmacro
1235
+
1236
+%macro SATD_32xN_AVX512 1
1237
+INIT_ZMM avx512
1238
+cglobal pixel_satd_32x%1, 4,6,8
1239
+    lea             r4, [3 * r1]
1240
+    lea             r5, [3 * r3]
1241
+    pxor            m6, m6
1242
+%rep %1/4 - 1
1243
+    PROCESS_SATD_32x4_AVX512
1244
+    lea             r0, [r0 + 4 * r1]
1245
+    lea             r2, [r2 + 4 * r3]
1246
+%endrep
1247
+    PROCESS_SATD_32x4_AVX512
1248
+    SATD_MAIN_AVX512_END
1249
+    RET
1250
+%endmacro
1251
+
1252
+SATD_32xN_AVX512 8
1253
+SATD_32xN_AVX512 16
1254
+SATD_32xN_AVX512 24
1255
+SATD_32xN_AVX512 32
1256
+SATD_32xN_AVX512 48
1257
+SATD_32xN_AVX512 64
1258
+
1259
+%macro SATD_64xN_AVX512 1
1260
+INIT_ZMM avx512
1261
+cglobal pixel_satd_64x%1, 4,8,8
1262
+    lea             r4, [3 * r1]
1263
+    lea             r5, [3 * r3]
1264
+    pxor            m6, m6
1265
+    mov             r6, r0
1266
+    mov             r7, r2
1267
+
1268
+%rep %1/4 - 1
1269
+    PROCESS_SATD_32x4_AVX512
1270
+    lea             r0, [r0 + 4 * r1]
1271
+    lea             r2, [r2 + 4 * r3]
1272
+%endrep
1273
+    PROCESS_SATD_32x4_AVX512
1274
+    lea             r0, [r6 + mmsize/2]
1275
+    lea             r2, [r7 + mmsize/2]
1276
+%rep %1/4 - 1
1277
+    PROCESS_SATD_32x4_AVX512
1278
+    lea             r0, [r0 + 4 * r1]
1279
+    lea             r2, [r2 + 4 * r3]
1280
+%endrep
1281
+    PROCESS_SATD_32x4_AVX512
1282
+    SATD_MAIN_AVX512_END
1283
+    RET
1284
+%endmacro
1285
+
1286
+SATD_64xN_AVX512 16
1287
+SATD_64xN_AVX512 32
1288
+SATD_64xN_AVX512 48
1289
+SATD_64xN_AVX512 64
1290
+%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
1291
 %if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1
1292
 INIT_YMM avx2
1293
 cglobal calc_satd_16x8    ; function to compute satd cost for 16 columns, 8 rows
1294
@@ -13721,6 +14967,257 @@
1295
     paddd           xm6, xm7
1296
     movd            eax, xm6
1297
     RET
1298
+
1299
+%macro SATD_HBD_AVX512_END 0
1300
+    vextracti32x8   ym7, m6, 1
1301
+    paddd           ym6, ym7
1302
+    vextracti128    xm7, ym6, 1
1303
+    paddd           xm6, xm7
1304
+    pxor            xm7, xm7
1305
+    movhlps         xm7, xm6
1306
+    paddd           xm6, xm7
1307
+    pshufd          xm7, xm6, 1
1308
+    paddd           xm6, xm7
1309
+    movd            eax, xm6
1310
+%endmacro
1311
+%macro PROCESS_SATD_16x8_HBD_AVX512 0        ; function to compute satd cost for 16 columns, 8 rows
1312
+    ; rows 0-3
1313
+    lea             r6, [r0 + r1 * 4]
1314
+    lea             r7, [r2 + r3 * 4]
1315
+    movu            ym0, [r0]
1316
+    movu            ym4, [r2]
1317
+    vinserti32x8    m0, [r6], 1
1318
+    vinserti32x8    m4, [r7], 1
1319
+    psubw           m0, m4
1320
+    movu            ym1, [r0 + r1]
1321
+    movu            ym5, [r2 + r3]
1322
+    vinserti32x8    m1, [r6 + r1], 1
1323
+    vinserti32x8    m5, [r7 + r3], 1
1324
+    psubw           m1, m5
1325
+    movu            ym2, [r0 + r1 * 2]
1326
+    movu            ym4, [r2 + r3 * 2]
1327
+    vinserti32x8    m2, [r6 + r1 * 2], 1
1328
+    vinserti32x8    m4, [r7 + r3 * 2], 1
1329
+    psubw           m2, m4
1330
+    movu            ym3, [r0 + r4]
1331
+    movu            ym5, [r2 + r5]
1332
+    vinserti32x8    m3, [r6 + r4], 1
1333
+    vinserti32x8    m5, [r7 + r5], 1
1334
+    psubw           m3, m5
1335
+
1336
+    paddw           m4, m0, m1
1337
+    psubw           m1, m0
1338
+    paddw           m0, m2, m3
1339
+    psubw           m3, m2
1340
+    punpckhwd       m2, m4, m1
1341
+    punpcklwd       m4, m1
1342
+    punpckhwd       m1, m0, m3
1343
+    punpcklwd       m0, m3
1344
+    paddw           m3, m4, m0
1345
+    psubw           m0, m4
1346
+    paddw           m4, m2, m1
1347
+    psubw           m1, m2
1348
+    punpckhdq       m2, m3, m0
1349
+    punpckldq       m3, m0
1350
+    paddw           m0, m3, m2
1351
+    psubw           m2, m3
1352
+    punpckhdq       m3, m4, m1
1353
+    punpckldq       m4, m1
1354
+    paddw           m1, m4, m3
1355
+    psubw           m3, m4
1356
+    punpckhqdq      m4, m0, m1
1357
+    punpcklqdq      m0, m1
1358
+    pabsw           m0, m0
1359
+    pabsw           m4, m4
1360
+    pmaxsw          m0, m0, m4
1361
+    punpckhqdq      m1, m2, m3
1362
+    punpcklqdq      m2, m3
1363
+    pabsw           m2, m2
1364
+    pabsw           m1, m1
1365
+    pmaxsw          m2, m1
1366
+    pxor            m7, m7
1367
+    mova            m1, m0
1368
+    punpcklwd       m1, m7
1369
+    paddd           m6, m1
1370
+    mova            m1, m0
1371
+    punpckhwd       m1, m7
1372
+    paddd           m6, m1
1373
+    pxor            m7, m7
1374
+    mova            m1, m2
1375
+    punpcklwd       m1, m7
1376
+    paddd           m6, m1
1377
+    mova            m1, m2
1378
+    punpckhwd       m1, m7
1379
+    paddd           m6, m1
1380
+%endmacro
1381
+%macro PROCESS_SATD_32x4_HBD_AVX512 0        ; function to compute satd cost for 32 columns, 4 rows
1382
+    ; rows 0-3
1383
+    movu            m0, [r0]
1384
+    movu            m4, [r2]
1385
+    psubw           m0, m4
1386
+    movu            m1, [r0 + r1]
1387
+    movu            m5, [r2 + r3]
1388
+    psubw           m1, m5
1389
+    movu            m2, [r0 + r1 * 2]
1390
+    movu            m4, [r2 + r3 * 2]
1391
+    psubw           m2, m4
1392
+    movu            m3, [r0 + r4]
1393
+    movu            m5, [r2 + r5]
1394
+    psubw           m3, m5
1395
+    paddw           m4, m0, m1
1396
+    psubw           m1, m0
1397
+    paddw           m0, m2, m3
1398
+    psubw           m3, m2
1399
+    punpckhwd       m2, m4, m1
1400
+    punpcklwd       m4, m1
1401
+    punpckhwd       m1, m0, m3
1402
+    punpcklwd       m0, m3
1403
+    paddw           m3, m4, m0
1404
+    psubw           m0, m4
1405
+    paddw           m4, m2, m1
1406
+    psubw           m1, m2
1407
+    punpckhdq       m2, m3, m0
1408
+    punpckldq       m3, m0
1409
+    paddw           m0, m3, m2
1410
+    psubw           m2, m3
1411
+    punpckhdq       m3, m4, m1
1412
+    punpckldq       m4, m1
1413
+    paddw           m1, m4, m3
1414
+    psubw           m3, m4
1415
+    punpckhqdq      m4, m0, m1
1416
+    punpcklqdq      m0, m1
1417
+    pabsw           m0, m0
1418
+    pabsw           m4, m4
1419
+    pmaxsw          m0, m0, m4
1420
+    punpckhqdq      m1, m2, m3
1421
+    punpcklqdq      m2, m3
1422
+    pabsw           m2, m2
1423
+    pabsw           m1, m1
1424
+    pmaxsw          m2, m1
1425
+    pxor            m7, m7
1426
+    mova            m1, m0
1427
+    punpcklwd       m1, m7
1428
+    paddd           m6, m1
1429
+    mova            m1, m0
1430
+    punpckhwd       m1, m7
1431
+    paddd           m6, m1
1432
+    pxor            m7, m7
1433
+    mova            m1, m2
1434
+    punpcklwd       m1, m7
1435
+    paddd           m6, m1
1436
+    mova            m1, m2
1437
+    punpckhwd       m1, m7
1438
+    paddd           m6, m1
1439
+%endmacro
1440
+
1441
+%macro SATD_16xN_HBD_AVX512 1
1442
+INIT_ZMM avx512
1443
+cglobal pixel_satd_16x%1, 4,8,8
1444
+    add             r1d, r1d
1445
+    add             r3d, r3d
1446
+    lea             r4, [3 * r1]
1447
+    lea             r5, [3 * r3]
1448
+    pxor            m6, m6
1449
+
1450
+%rep %1/8 - 1
1451
+    PROCESS_SATD_16x8_HBD_AVX512
1452
+    lea             r0, [r6 + 4 * r1]
1453
+    lea             r2, [r7 + 4 * r3]
1454
+%endrep
1455
+    PROCESS_SATD_16x8_HBD_AVX512
1456
+    SATD_HBD_AVX512_END
1457
+    RET
1458
+%endmacro
1459
+
1460
+SATD_16xN_HBD_AVX512 8
1461
+SATD_16xN_HBD_AVX512 16
1462
+SATD_16xN_HBD_AVX512 32
1463
+SATD_16xN_HBD_AVX512 64
1464
+
1465
+%macro SATD_32xN_HBD_AVX512 1
1466
+INIT_ZMM avx512
1467
+cglobal pixel_satd_32x%1, 4,8,8
1468
+    add             r1d, r1d
1469
+    add             r3d, r3d
1470
+    lea             r4, [3 * r1]
1471
+    lea             r5, [3 * r3]
1472
+    pxor            m6, m6
1473
+    mov             r6, r0
1474
+    mov             r7, r2
1475
+%rep %1/4 - 1
1476
+    PROCESS_SATD_32x4_HBD_AVX512
1477
+    lea             r0, [r0 + 4 * r1]
1478
+    lea             r2, [r2 + 4 * r3]
1479
+%endrep
1480
+    PROCESS_SATD_32x4_HBD_AVX512
1481
+    SATD_HBD_AVX512_END
1482
+    RET
1483
+%endmacro
1484
+
1485
+SATD_32xN_HBD_AVX512 8
1486
+SATD_32xN_HBD_AVX512 16
1487
+SATD_32xN_HBD_AVX512 24
1488
+SATD_32xN_HBD_AVX512 32
1489
+SATD_32xN_HBD_AVX512 64
1490
+INIT_ZMM avx512
1491
+cglobal pixel_satd_48x64, 4,10,8
1492
+    add             r1d, r1d
1493
+    add             r3d, r3d
1494
+    lea             r4, [3 * r1]
1495
+    lea             r5, [3 * r3]
1496
+    pxor            m6, m6
1497
+    mov             r8, r0
1498
+    mov             r9, r2
1499
+
1500
+%rep 15
1501
+    PROCESS_SATD_32x4_HBD_AVX512
1502
+    lea             r0, [r0 + 4 * r1]
1503
+    lea             r2, [r2 + 4 * r3]
1504
+%endrep
1505
+    PROCESS_SATD_32x4_HBD_AVX512
1506
+    lea             r0, [r8 + mmsize]
1507
+    lea             r2, [r9 + mmsize]
1508
+%rep 7
1509
+    PROCESS_SATD_16x8_HBD_AVX512
1510
+    lea             r0, [r6 + 4 * r1]
1511
+    lea             r2, [r7 + 4 * r3]
1512
+%endrep
1513
+    PROCESS_SATD_16x8_HBD_AVX512
1514
+    SATD_HBD_AVX512_END
1515
+    RET
1516
+
1517
+%macro SATD_64xN_HBD_AVX512 1
1518
+INIT_ZMM avx512
1519
+cglobal pixel_satd_64x%1, 4,8,8
1520
+    add             r1d, r1d
1521
+    add             r3d, r3d
1522
+    lea             r4, [3 * r1]
1523
+    lea             r5, [3 * r3]
1524
+    pxor            m6, m6
1525
+    mov             r6, r0
1526
+    mov             r7, r2
1527
+%rep %1/4 - 1
1528
+    PROCESS_SATD_32x4_HBD_AVX512
1529
+    lea             r0, [r0 + 4 * r1]
1530
+    lea             r2, [r2 + 4 * r3]
1531
+%endrep
1532
+    PROCESS_SATD_32x4_HBD_AVX512
1533
+    lea             r0, [r6 + mmsize]
1534
+    lea             r2, [r7 + mmsize]
1535
+%rep %1/4 - 1
1536
+    PROCESS_SATD_32x4_HBD_AVX512
1537
+    lea             r0, [r0 + 4 * r1]
1538
+    lea             r2, [r2 + 4 * r3]
1539
+%endrep
1540
+    PROCESS_SATD_32x4_HBD_AVX512
1541
+    SATD_HBD_AVX512_END
1542
+    RET
1543
+%endmacro
1544
+
1545
+SATD_64xN_HBD_AVX512 16
1546
+SATD_64xN_HBD_AVX512 32
1547
+SATD_64xN_HBD_AVX512 48
1548
+SATD_64xN_HBD_AVX512 64
1549
 %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1
1550
 
1551
 
1552
@@ -13818,6 +15315,7 @@
1553
     ;lea %8, [%8+4*r3]
1554
 %endmacro
1555
 
1556
+%if ARCH_X86_64
1557
 INIT_YMM avx2
1558
 cglobal pixel_satd_8x8, 4,4,7
1559
 
1560
@@ -14383,5 +15881,5 @@
1561
 
1562
     movd eax, xm0
1563
     RET
1564
-
1565
+%endif
1566
 %endif ; HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10
1567
x265_2.7.tar.gz/source/common/x86/pixel-util.h -> x265_2.9.tar.gz/source/common/x86/pixel-util.h Changed
33
 
1
@@ -27,6 +27,7 @@
2
 
3
 #define DEFINE_UTILS(cpu) \
4
     FUNCDEF_TU_S2(void, getResidual, cpu, const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); \
5
+    FUNCDEF_TU_S2(void, getResidual_aligned, cpu, const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); \
6
     FUNCDEF_TU_S2(void, transpose, cpu, pixel* dest, const pixel* src, intptr_t stride); \
7
     FUNCDEF_TU(int, count_nonzero, cpu, const int16_t* quantCoeff); \
8
     uint32_t PFX(quant_ ## cpu(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)); \
9
@@ -36,6 +37,7 @@
10
     void PFX(weight_pp_ ## cpu(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)); \
11
     void PFX(weight_sp_ ## cpu(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)); \
12
     void PFX(scale1D_128to64_ ## cpu(pixel*, const pixel*)); \
13
+    void PFX(scale1D_128to64_aligned_ ## cpu(pixel*, const pixel*)); \
14
     void PFX(scale2D_64to32_ ## cpu(pixel*, const pixel*, intptr_t)); \
15
     uint32_t PFX(costCoeffRemain_ ## cpu(uint16_t *absCoeff, int numNonZero, int idx)); \
16
     uint32_t PFX(costC1C2Flag_sse2(uint16_t *absCoeff, intptr_t numNonZero, uint8_t *baseCtxMod, intptr_t ctxOffset)); \
17
@@ -44,6 +46,7 @@
18
 DEFINE_UTILS(ssse3);
19
 DEFINE_UTILS(sse4);
20
 DEFINE_UTILS(avx2);
21
+DEFINE_UTILS(avx512);
22
 
23
 #undef DEFINE_UTILS
24
 
25
@@ -58,4 +61,7 @@
26
 uint32_t PFX(costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
27
 uint32_t PFX(costCoeffNxN_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
28
 
29
+int  PFX(count_nonzero_16x16_avx512(const int16_t* quantCoeff));
30
+int  PFX(count_nonzero_32x32_avx512(const int16_t* quantCoeff));
31
+
32
 #endif // ifndef X265_PIXEL_UTIL_H
33
x265_2.7.tar.gz/source/common/x86/pixel-util8.asm -> x265_2.9.tar.gz/source/common/x86/pixel-util8.asm Changed
1798
 
1
@@ -4,6 +4,7 @@
2
 ;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
3
 ;*          Nabajit Deka <nabajit@multicorewareinc.com>
4
 ;*          Rajesh Paulraj <rajesh@multicorewareinc.com>
5
+;*          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
6
 ;*
7
 ;* This program is free software; you can redistribute it and/or modify
8
 ;* it under the terms of the GNU General Public License as published by
9
@@ -26,7 +27,13 @@
10
 %include "x86inc.asm"
11
 %include "x86util.asm"
12
 
13
-SECTION_RODATA 32
14
+SECTION_RODATA 64
15
+
16
+var_shuf_avx512: db 0,-1, 1,-1, 2,-1, 3,-1, 4,-1, 5,-1, 6,-1, 7,-1
17
+                 db 8,-1, 9,-1,10,-1,11,-1,12,-1,13,-1,14,-1,15,-1
18
+ALIGN 64
19
+const dequant_shuf1_avx512,  dq 0, 2, 4, 6, 1, 3, 5, 7
20
+const dequant_shuf2_avx512,  dq 0, 4, 1, 5, 2, 6, 3, 7
21
 
22
 %if BIT_DEPTH == 12
23
 ssim_c1:   times 4 dd 107321.76    ; .01*.01*4095*4095*64
24
@@ -552,6 +559,262 @@
25
 %endrep
26
     RET
27
 %endif
28
+
29
+%macro PROCESS_GETRESIDUAL32_W4_HBD_AVX512 0
30
+    movu        m0, [r0]
31
+    movu        m1, [r0 + r3]
32
+    movu        m2, [r0 + r3 * 2]
33
+    movu        m3, [r0 + r4]
34
+    lea         r0, [r0 + r3 * 4]
35
+
36
+    movu        m4, [r1]
37
+    movu        m5, [r1 + r3]
38
+    movu        m6, [r1 + r3 * 2]
39
+    movu        m7, [r1 + r4]
40
+    lea         r1, [r1 + r3 * 4]
41
+
42
+    psubw       m0, m4
43
+    psubw       m1, m5
44
+    psubw       m2, m6
45
+    psubw       m3, m7
46
+
47
+    movu        [r2], m0
48
+    movu        [r2 + r3], m1
49
+    movu        [r2 + r3 * 2], m2
50
+    movu        [r2 + r4], m3
51
+    lea         r2, [r2 + r3 * 4]
52
+%endmacro
53
+
54
+%macro PROCESS_GETRESIDUAL32_W4_HBD_AVX512_END 0
55
+    movu        m0, [r0]
56
+    movu        m1, [r0 + r3]
57
+    movu        m2, [r0 + r3 * 2]
58
+    movu        m3, [r0 + r4]
59
+
60
+    movu        m4, [r1]
61
+    movu        m5, [r1 + r3]
62
+    movu        m6, [r1 + r3 * 2]
63
+    movu        m7, [r1 + r4]
64
+
65
+    psubw       m0, m4
66
+    psubw       m1, m5
67
+    psubw       m2, m6
68
+    psubw       m3, m7
69
+
70
+    movu        [r2], m0
71
+    movu        [r2 + r3], m1
72
+    movu        [r2 + r3 * 2], m2
73
+    movu        [r2 + r4], m3
74
+%endmacro
75
+
76
+%macro PROCESS_GETRESIDUAL32_W4_AVX512 0
77
+    pmovzxbw    m0, [r0]
78
+    pmovzxbw    m1, [r0 + r3]
79
+    pmovzxbw    m2, [r0 + r3 * 2]
80
+    pmovzxbw    m3, [r0 + r4]
81
+    lea         r0, [r0 + r3 * 4]
82
+
83
+    pmovzxbw    m4, [r1]
84
+    pmovzxbw    m5, [r1 + r3]
85
+    pmovzxbw    m6, [r1 + r3 * 2]
86
+    pmovzxbw    m7, [r1 + r4]
87
+    lea         r1, [r1 + r3 * 4]
88
+
89
+    psubw       m0, m4
90
+    psubw       m1, m5
91
+    psubw       m2, m6
92
+    psubw       m3, m7
93
+
94
+    movu        [r2], m0
95
+    movu        [r2 + r3 * 2], m1
96
+    lea         r2, [r2 + r3 * 4]
97
+    movu        [r2], m2
98
+    movu        [r2 + r3 * 2], m3
99
+    lea         r2, [r2 + r3 * 4]
100
+%endmacro
101
+
102
+%macro PROCESS_GETRESIDUAL32_W4_AVX512_END 0
103
+    pmovzxbw    m0, [r0]
104
+    pmovzxbw    m1, [r0 + r3]
105
+    pmovzxbw    m2, [r0 + r3 * 2]
106
+    pmovzxbw    m3, [r0 + r4]
107
+
108
+    pmovzxbw    m4, [r1]
109
+    pmovzxbw    m5, [r1 + r3]
110
+    pmovzxbw    m6, [r1 + r3 * 2]
111
+    pmovzxbw    m7, [r1 + r4]
112
+
113
+    psubw       m0, m4
114
+    psubw       m1, m5
115
+    psubw       m2, m6
116
+    psubw       m3, m7
117
+
118
+    movu        [r2], m0
119
+    movu        [r2 + r3 * 2], m1
120
+    lea         r2, [r2 + r3 * 4]
121
+    movu        [r2], m2
122
+    movu        [r2 + r3 * 2], m3
123
+%endmacro
124
+
125
+
126
+%if HIGH_BIT_DEPTH
127
+INIT_ZMM avx512
128
+cglobal getResidual32, 4,5,8
129
+    add         r3, r3
130
+    lea         r4, [r3 * 3]
131
+
132
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512
133
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512
134
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512
135
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512
136
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512
137
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512
138
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512
139
+    PROCESS_GETRESIDUAL32_W4_HBD_AVX512_END
140
+    RET
141
+%else
142
+INIT_ZMM avx512
143
+cglobal getResidual32, 4,5,8
144
+    lea         r4, [r3 * 3]
145
+
146
+    PROCESS_GETRESIDUAL32_W4_AVX512
147
+    PROCESS_GETRESIDUAL32_W4_AVX512
148
+    PROCESS_GETRESIDUAL32_W4_AVX512
149
+    PROCESS_GETRESIDUAL32_W4_AVX512
150
+    PROCESS_GETRESIDUAL32_W4_AVX512
151
+    PROCESS_GETRESIDUAL32_W4_AVX512
152
+    PROCESS_GETRESIDUAL32_W4_AVX512
153
+    PROCESS_GETRESIDUAL32_W4_AVX512_END
154
+    RET
155
+%endif
156
+
157
+%macro PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512 0
158
+    movu        m0, [r0]
159
+    movu        m1, [r0 + r3]
160
+    movu        m2, [r0 + r3 * 2]
161
+    movu        m3, [r0 + r4]
162
+    lea         r0, [r0 + r3 * 4]
163
+
164
+    movu        m4, [r1]
165
+    movu        m5, [r1 + r3]
166
+    movu        m6, [r1 + r3 * 2]
167
+    movu        m7, [r1 + r4]
168
+    lea         r1, [r1 + r3 * 4]
169
+
170
+    psubw       m0, m4
171
+    psubw       m1, m5
172
+    psubw       m2, m6
173
+    psubw       m3, m7
174
+
175
+    movu        [r2], m0
176
+    movu        [r2 + r3], m1
177
+    movu        [r2 + r3 * 2], m2
178
+    movu        [r2 + r4], m3
179
+    lea         r2, [r2 + r3 * 4]
180
+%endmacro
181
+
182
+%macro PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512_END 0
183
+    movu        m0, [r0]
184
+    movu        m1, [r0 + r3]
185
+    movu        m2, [r0 + r3 * 2]
186
+    movu        m3, [r0 + r4]
187
+
188
+    movu        m4, [r1]
189
+    movu        m5, [r1 + r3]
190
+    movu        m6, [r1 + r3 * 2]
191
+    movu        m7, [r1 + r4]
192
+
193
+    psubw       m0, m4
194
+    psubw       m1, m5
195
+    psubw       m2, m6
196
+    psubw       m3, m7
197
+
198
+    movu        [r2], m0
199
+    movu        [r2 + r3], m1
200
+    movu        [r2 + r3 * 2], m2
201
+    movu        [r2 + r4], m3
202
+%endmacro
203
+
204
+%macro PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512 0
205
+    pmovzxbw    m0, [r0]
206
+    pmovzxbw    m1, [r0 + r3]
207
+    pmovzxbw    m2, [r0 + r3 * 2]
208
+    pmovzxbw    m3, [r0 + r4]
209
+    lea         r0, [r0 + r3 * 4]
210
+
211
+    pmovzxbw    m4, [r1]
212
+    pmovzxbw    m5, [r1 + r3]
213
+    pmovzxbw    m6, [r1 + r3 * 2]
214
+    pmovzxbw    m7, [r1 + r4]
215
+    lea         r1, [r1 + r3 * 4]
216
+
217
+    psubw       m0, m4
218
+    psubw       m1, m5
219
+    psubw       m2, m6
220
+    psubw       m3, m7
221
+
222
+    movu        [r2], m0
223
+    movu        [r2 + r3 * 2], m1
224
+    lea         r2, [r2 + r3 * 4]
225
+    movu        [r2], m2
226
+    movu        [r2 + r3 * 2], m3
227
+    lea         r2, [r2 + r3 * 4]
228
+%endmacro
229
+
230
+%macro PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512_END 0
231
+    pmovzxbw    m0, [r0]
232
+    pmovzxbw    m1, [r0 + r3]
233
+    pmovzxbw    m2, [r0 + r3 * 2]
234
+    pmovzxbw    m3, [r0 + r4]
235
+
236
+    pmovzxbw    m4, [r1]
237
+    pmovzxbw    m5, [r1 + r3]
238
+    pmovzxbw    m6, [r1 + r3 * 2]
239
+    pmovzxbw    m7, [r1 + r4]
240
+
241
+    psubw       m0, m4
242
+    psubw       m1, m5
243
+    psubw       m2, m6
244
+    psubw       m3, m7
245
+
246
+    movu        [r2], m0
247
+    movu        [r2 + r3 * 2], m1
248
+    lea         r2, [r2 + r3 * 4]
249
+    movu        [r2], m2
250
+    movu        [r2 + r3 * 2], m3
251
+%endmacro
252
+
253
+
254
+%if HIGH_BIT_DEPTH
255
+INIT_ZMM avx512
256
+cglobal getResidual_aligned32, 4,5,8
257
+    add         r3, r3
258
+    lea         r4, [r3 * 3]
259
+
260
+    PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
261
+    PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
262
+    PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
263
+    PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
264
+    PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
265
+    PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
266
+    PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512
267
+    PROCESS_GETRESIDUAL32_ALIGNED_W4_HBD_AVX512_END
268
+    RET
269
+%else
270
+INIT_ZMM avx512
271
+cglobal getResidual_aligned32, 4,5,8
272
+    lea         r4, [r3 * 3]
273
+
274
+    PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
275
+    PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
276
+    PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
277
+    PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
278
+    PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
279
+    PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
280
+    PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512
281
+    PROCESS_GETRESIDUAL32_ALIGNED_W4_AVX512_END
282
+    RET
283
+%endif
284
 ;-----------------------------------------------------------------------------
285
 ; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
286
 ;-----------------------------------------------------------------------------
287
@@ -782,6 +1045,133 @@
288
 %endif ; ARCH_X86_64 == 1
289
 
290
 
291
+%if ARCH_X86_64 == 1
292
+INIT_ZMM avx512
293
+cglobal quant, 5, 6, 22
294
+    ; fill qbits
295
+    movd            xm4, r4d            ; m4 = qbits
296
+
297
+    ; fill qbits-8
298
+    sub             r4d, 8
299
+    movd            xm6, r4d            ; m6 = qbits8
300
+
301
+    ; fill offset
302
+%if UNIX64 == 0
303
+    vpbroadcastd    m5, r5m             ; m5 = add
304
+%else ; Mac
305
+    movd           xm5, r5m
306
+    vpbroadcastd    m5, xm5             ; m5 = add
307
+%endif
308
+
309
+    vbroadcasti32x8  m9, [pw_1]
310
+
311
+    mov             r4d, r6m
312
+    pxor             m7, m7
313
+    sub             r4d, 32
314
+    jl              .coeff16
315
+    add             r4d, 32
316
+    shr             r4d, 5
317
+    jmp             .loop
318
+
319
+.coeff16:
320
+    ; 16 coeff
321
+    pxor             m7,  m7
322
+    pmovsxwd        m16,  [r0]            ; m16 = level
323
+    pabsd            m1,  m16
324
+    pmulld           m1,  [r1]
325
+    paddd           m17,   m1, m5
326
+    psrad           m17,  xm4             ; m17 = level1
327
+
328
+    pslld            m3,    m17, 8
329
+    psrad            m1,    xm6
330
+    psubd            m1,     m3           ; m1 = deltaU1
331
+    movu             [r2],   m1
332
+    vextracti64x4    ym19,  m17, 1
333
+    vextracti64x4    ym20,  m16, 1
334
+    psignd           ym17, ym16
335
+    psignd          ym19,  ym20
336
+    packssdw        ym17,  ym19
337
+    vpermq          ym17,  ym17, q3120
338
+    movu            [r3],  ym17
339
+
340
+    pminuw          ym17,   ym9
341
+    paddw           ym7,   ym17
342
+
343
+    ; sum count
344
+    xorpd            m0,  m0
345
+    psadbw          ym7, ym0
346
+    vextracti128    xm1, ym7, 1
347
+    paddd           xm7, xm1
348
+    movhlps         xm0, xm7
349
+    paddd           xm7, xm0
350
+    movd            eax, xm7
351
+    RET
352
+
353
+.loop:
354
+    ; 16 coeff
355
+    pmovsxwd        m16,   [r0]            ; m16 = level
356
+    pabsd            m1,   m16
357
+    pmulld           m1,   [r1]
358
+    paddd           m17,   m1,  m5
359
+    psrad           m17,   xm4             ; m17 = level1
360
+
361
+    pslld            m3,   m17, 8
362
+    psrad            m1,   xm6
363
+    psubd            m1,    m3             ; m1 = deltaU1
364
+    movu            [r2],   m1
365
+    vextracti64x4   ym19,  m17, 1
366
+    vextracti64x4   ym20,  m16, 1
367
+    psignd          ym17, ym16
368
+    psignd          ym19, ym20
369
+    packssdw        ym17, ym19
370
+
371
+    ; 16 coeff
372
+    pmovsxwd        m16,  [r0 + mmsize/2]  ; m16 = level
373
+    pabsd            m1,  m16
374
+    pmulld           m1,  [r1 + mmsize]
375
+    paddd           m18,   m1, m5
376
+    psrad           m18,  xm4              ; m2 = level1
377
+
378
+    pslld            m8,  m18, 8
379
+    psrad            m1,  xm6
380
+    psubd            m1,  m8               ; m1 = deltaU1
381
+    movu             [r2 + mmsize], m1
382
+    vextracti64x4   ym21,  m18, 1
383
+    vextracti64x4   ym20,  m16, 1
384
+    psignd          ym18, ym16
385
+    psignd          ym21, ym20
386
+    packssdw        ym18, ym21
387
+    vinserti64x4     m17,  m17, ym18, 1
388
+    vpermq           m17,  m17, q3120
389
+
390
+    movu            [r3],  m17
391
+
392
+    pminuw          m17,   m9
393
+    paddw            m7,  m17
394
+
395
+    add              r0,  mmsize
396
+    add              r1,  mmsize * 2
397
+    add              r2,  mmsize * 2
398
+    add              r3,  mmsize
399
+
400
+    dec             r4d
401
+    jnz            .loop
402
+
403
+    ; sum count
404
+    xorpd            m0,  m0
405
+    psadbw           m7,  m0
406
+    vextracti32x8   ym1,  m7, 1
407
+    paddd           ym7, ym1
408
+    vextracti64x2   xm1,  m7, 1
409
+    paddd           xm7, xm1
410
+    pshufd          xm1, xm7, 2
411
+    paddd           xm7, xm1
412
+    movd            eax, xm7
413
+    RET
414
+%endif ; ARCH_X86_64 == 1
415
+
416
+
417
+
418
 ;-----------------------------------------------------------------------------
419
 ; uint32_t nquant(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
420
 ;-----------------------------------------------------------------------------
421
@@ -888,7 +1278,101 @@
422
     paddd       xm5, xm0
423
     movd        eax, xm5
424
     RET
425
+%if ARCH_X86_64 == 1
426
+INIT_ZMM avx512
427
+cglobal nquant, 3,5,22
428
+%if UNIX64 == 0
429
+    vpbroadcastd m4, r4m
430
+%else ; Mac
431
+    movd         xm4, r4m
432
+    vpbroadcastd  m4, xm4
433
+%endif
434
 
435
+    vbroadcasti32x8  m6, [pw_1]
436
+    mov         r4d, r5m
437
+    pxor         m5, m5
438
+    movd        xm3, r3m
439
+    sub         r4d, 16
440
+    je          .coeff16
441
+    add         r4d, 16
442
+    shr         r4d, 5
443
+    jmp         .loop
444
+
445
+.coeff16:
446
+    pmovsxwd         m16, [r0]
447
+    pabsd            m17, m16
448
+    pmulld           m17, [r1]
449
+    paddd            m17, m4
450
+    psrad            m17, xm3
451
+
452
+    vextracti64x4   ym19,  m17, 1
453
+    vextracti64x4   ym20,  m16, 1
454
+    psignd          ym17, ym16
455
+    psignd          ym19, ym20
456
+    packssdw        ym17, ym19
457
+    vpermq          ym17, ym17, q3120
458
+    pabsw           ym17, ym17
459
+    movu            [r2], ym17
460
+    pminuw          ym17, ym6
461
+    paddw           ym5,  ym17
462
+    pxor            m0,    m0
463
+    psadbw          ym5,  ym0
464
+    vextracti128    xm0,  ym5, 1
465
+    paddd           xm5,  xm0
466
+    pshufd          xm0,  xm5, 2
467
+    paddd           xm5,  xm0
468
+    movd            eax,  xm5
469
+    RET
470
+
471
+.loop:
472
+    pmovsxwd         m16,  [r0]
473
+    pabsd            m17,  m16
474
+    pmulld           m17,  [r1]
475
+    paddd            m17,  m4
476
+    psrad            m17,  xm3
477
+    vextracti64x4   ym19,  m17, 1
478
+    vextracti64x4   ym20,  m16, 1
479
+    psignd          ym17, ym16
480
+    psignd          ym19, ym20
481
+    packssdw        ym17, ym19
482
+
483
+    pmovsxwd         m16, [r0 + mmsize/2]
484
+    pabsd            m18, m16
485
+    pmulld           m18, [r1 + mmsize]
486
+    paddd            m18,  m4
487
+    psrad            m18, xm3
488
+    vextracti64x4   ym21,  m18, 1
489
+    vextracti64x4   ym20,  m16, 1
490
+    psignd          ym18, ym16
491
+    psignd          ym21, ym20
492
+    packssdw        ym18, ym21
493
+    vinserti64x4     m17,  m17, ym18, 1
494
+    vpermq           m17,  m17, q3120
495
+
496
+    pabsw            m17, m17
497
+    movu            [r2], m17
498
+
499
+    add               r0, mmsize
500
+    add               r1, mmsize * 2
501
+    add               r2, mmsize
502
+
503
+    pminuw           m17,  m6
504
+    paddw             m5, m17
505
+
506
+    dec         r4d
507
+    jnz         .loop
508
+
509
+    pxor             m0,  m0
510
+    psadbw           m5,  m0
511
+    vextracti32x8   ym1,  m5, 1
512
+    paddd           ym5, ym1
513
+    vextracti64x2   xm1,  m5, 1
514
+    paddd           xm5, xm1
515
+    pshufd          xm1, xm5, 2
516
+    paddd           xm5, xm1
517
+    movd            eax, xm5
518
+    RET
519
+%endif ; ARCH_X86_64 == 1
520
 
521
 ;-----------------------------------------------------------------------------
522
 ; void dequant_normal(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift)
523
@@ -1106,6 +1590,142 @@
524
     jnz            .loop
525
     RET
526
 
527
+;----------------------------------------------------------------------------------------------------------------------
528
+;void dequant_scaling(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)
529
+;----------------------------------------------------------------------------------------------------------------------
530
+INIT_ZMM avx512
531
+cglobal dequant_scaling, 6,7,8
532
+    mova        m6,  [dequant_shuf1_avx512]
533
+    mova        m7,  [dequant_shuf2_avx512]
534
+    add         r5d, 4
535
+    mov         r6d, r3d
536
+    shr         r3d, 5          ; num/32
537
+    cmp         r5d, r4d
538
+    jle         .skip
539
+    sub         r5d, r4d
540
+    vpbroadcastd m0, [pd_1]
541
+    movd        xm1, r5d         ; shift - per
542
+    dec         r5d
543
+    movd        xm2, r5d         ; shift - per - 1
544
+    pslld       m0, xm2          ; 1 << shift - per - 1
545
+
546
+.part0:
547
+    pmovsxwd    m2, [r0]
548
+    pmovsxwd    m4, [r0 + 32]
549
+    movu        m3, [r1]
550
+    movu        m5, [r1 + 64]
551
+    pmulld      m2, m3
552
+    pmulld      m4, m5
553
+    paddd       m2, m0
554
+    paddd       m4, m0
555
+    psrad       m2, xm1
556
+    psrad       m4, xm1
557
+    packssdw    m2, m4
558
+    vpermq      m2, m6, m2
559
+    cmp         r6d, 16
560
+    je          .num16part0
561
+    movu        [r2], m2
562
+
563
+    add         r0, 64
564
+    add         r1, 128
565
+    add         r2, 64
566
+    dec         r3d
567
+    jnz         .part0
568
+    jmp         .end
569
+
570
+.num16part0:
571
+    movu        [r2], ym2
572
+    jmp         .end
573
+
574
+.skip:
575
+    sub         r4d, r5d        ; per - shift
576
+    movd        xm0, r4d
577
+
578
+.part1:
579
+    pmovsxwd    m2, [r0]
580
+    pmovsxwd    m4, [r0 + 32]
581
+    movu        m3, [r1]
582
+    movu        m5, [r1 + 64]
583
+    pmulld      m2, m3
584
+    pmulld      m4, m5
585
+    packssdw    m2, m4
586
+
587
+    vextracti32x8 ym4, m2, 1
588
+    pmovsxwd    m1, ym2
589
+    pmovsxwd    m2, ym4
590
+    pslld       m1, xm0
591
+    pslld       m2, xm0
592
+    packssdw    m1, m2
593
+
594
+    vpermq      m1, m7, m1
595
+    cmp         r6d, 16
596
+    je          .num16part1
597
+    movu        [r2], m1
598
+
599
+    add         r0, 64
600
+    add         r1, 128
601
+    add         r2, 64
602
+    dec         r3d
603
+    jnz         .part1
604
+
605
+.num16part1:
606
+    movu        [r2], ym1
607
+
608
+.end:
609
+    RET
610
+
611
+INIT_ZMM avx512
612
+cglobal dequant_normal, 5,5,7
613
+    vpbroadcastd    m2, [pw_1]          ; m2 = word [1]
614
+    vpbroadcastd    m5, [pd_32767]      ; m5 = dword [32767]
615
+    vpbroadcastd    m6, [pd_n32768]     ; m6 = dword [-32768]
616
+%if HIGH_BIT_DEPTH
617
+    cmp             r3d, 32767
618
+    jle            .skip
619
+    shr             r3d, (BIT_DEPTH - 8)
620
+    sub             r4d, (BIT_DEPTH - 8)
621
+.skip:
622
+%endif
623
+    movd            xm0, r4d            ; m0 = shift
624
+    add             r4d, -1+16
625
+    bts             r3d, r4d
626
+
627
+    movd            xm1, r3d
628
+    vpbroadcastd    m1, xm1             ; m1 = dword [add scale]
629
+
630
+    ; m0 = shift
631
+    ; m1 = scale
632
+    ; m2 = word [1]
633
+    mov             r3d, r2d
634
+    shr             r2d, 5
635
+.loop:
636
+    movu            m3, [r0]
637
+    punpckhwd       m4, m3, m2
638
+    punpcklwd       m3, m2
639
+    pmaddwd         m3, m1              ; m3 = dword (clipQCoef * scale + add)
640
+    pmaddwd         m4, m1
641
+    psrad           m3, xm0
642
+    psrad           m4, xm0
643
+    pminsd          m3, m5
644
+    pmaxsd          m3, m6
645
+    pminsd          m4, m5
646
+    pmaxsd          m4, m6
647
+    packssdw        m3, m4
648
+
649
+    mova             [r1 + 0 * mmsize/2], ym3
650
+    cmp              r3d, 16
651
+    je               .num16
652
+    vextracti32x8    [r1 + 1 * mmsize/2], m3, 1
653
+
654
+    add             r0, mmsize
655
+    add             r1, mmsize
656
+
657
+    dec             r2d
658
+    jnz            .loop
659
+    RET
660
+.num16:
661
+    RET
662
+
663
 
664
 ;-----------------------------------------------------------------------------
665
 ; int x265_count_nonzero_4x4_sse2(const int16_t *quantCoeff);
666
@@ -1238,7 +1858,30 @@
667
     movd            eax, xm0
668
     RET
669
 
670
+;-----------------------------------------------------------------------------
671
+; int x265_count_nonzero_16x16_avx512(const int16_t *quantCoeff);
672
+;-----------------------------------------------------------------------------
673
+%if ARCH_X86_64
674
+INIT_ZMM avx512
675
+cglobal count_nonzero_16x16, 1,4,2
676
+    mov             r1, 0xFFFFFFFFFFFFFFFF
677
+    kmovq           k2, r1
678
+    xor             r3, r3
679
+    pxor            m0, m0
680
 
681
+%assign x 0
682
+%rep 4
683
+    movu            m1, [r0 + x]
684
+    vpacksswb       m1, [r0 + x + 64]
685
+%assign x x+128
686
+    vpcmpb          k1 {k2}, m1, m0, 00000100b
687
+    kmovq           r1, k1
688
+    popcnt          r2, r1
689
+    add             r3d, r2d
690
+%endrep
691
+    mov             eax, r3d
692
+    RET
693
+%endif
694
 ;-----------------------------------------------------------------------------
695
 ; int x265_count_nonzero_32x32_sse2(const int16_t *quantCoeff);
696
 ;-----------------------------------------------------------------------------
697
@@ -1288,6 +1931,30 @@
698
     RET
699
 
700
 
701
+;-----------------------------------------------------------------------------
702
+; int x265_count_nonzero_32x32_avx512(const int16_t *quantCoeff);
703
+;-----------------------------------------------------------------------------
704
+%if ARCH_X86_64
705
+INIT_ZMM avx512
706
+cglobal count_nonzero_32x32, 1,4,2
707
+    mov             r1, 0xFFFFFFFFFFFFFFFF
708
+    kmovq           k2, r1
709
+    xor             r3, r3
710
+    pxor            m0, m0
711
+
712
+%assign x 0
713
+%rep 16
714
+    movu            m1, [r0 + x]
715
+    vpacksswb       m1, [r0 + x + 64]
716
+%assign x x+128
717
+    vpcmpb          k1 {k2}, m1, m0, 00000100b
718
+    kmovq           r1, k1
719
+    popcnt          r2, r1
720
+    add             r3d, r2d
721
+%endrep
722
+    mov             eax, r3d
723
+    RET
724
+%endif
725
 ;-----------------------------------------------------------------------------------------------------------------------------------------------
726
 ;void weight_pp(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
727
 ;-----------------------------------------------------------------------------------------------------------------------------------------------
728
@@ -1531,6 +2198,116 @@
729
     jnz         .loopH
730
     RET
731
 %endif
732
+
733
+%if HIGH_BIT_DEPTH
734
+INIT_ZMM avx512
735
+cglobal weight_pp, 6, 7, 7
736
+%define correction      (14 - BIT_DEPTH)
737
+    mov          r6d, r6m
738
+    shl          r6d, 16 - correction
739
+    or           r6d, r5d
740
+
741
+    movd         xm0, r6d
742
+    vpbroadcastd  m0, xm0
743
+    mov          r5d, r7m
744
+    sub          r5d, correction
745
+    movd         xm1, r5d
746
+
747
+    vpbroadcastd    m2, r8m
748
+    vbroadcasti32x8 m5, [pw_1]
749
+    vbroadcasti32x8 m6, [pw_pixel_max]
750
+
751
+    add         r2d, r2d
752
+    add         r3d, r3d
753
+    sub         r2d, r3d
754
+    shr         r3d, 6
755
+
756
+.loopH:
757
+    mov          r5d, r3d
758
+
759
+.loopW:
760
+    movu        m4, [r0]
761
+    punpcklwd   m3, m4, m5
762
+    pmaddwd     m3, m0
763
+    psrad       m3, xm1
764
+    paddd       m3, m2
765
+
766
+    punpckhwd   m4, m5
767
+    pmaddwd     m4, m0
768
+    psrad       m4, xm1
769
+    paddd       m4, m2
770
+
771
+    packusdw    m3,   m4
772
+    pminuw      m3,   m6
773
+    movu        [r1], m3
774
+
775
+    add         r0, 64
776
+    add         r1, 64
777
+
778
+    dec         r5d
779
+    jnz         .loopW
780
+
781
+    lea         r0, [r0 + r2]
782
+    lea         r1, [r1 + r2]
783
+
784
+    dec         r4d
785
+    jnz         .loopH
786
+%undef correction
787
+    RET
788
+%else
789
+INIT_ZMM avx512
790
+cglobal weight_pp, 6, 7, 6
791
+
792
+    shl          r5d, 6
793
+    mov          r6d, r6m
794
+    shl          r6d, 16
795
+    or           r6d, r5d
796
+
797
+    movd         xm0, r6d
798
+    vpbroadcastd  m0, xm0
799
+    movd         xm1, r7m
800
+    vpbroadcastd  m2, r8m
801
+
802
+    vbroadcasti32x8 m5, [pw_1]
803
+
804
+    sub          r2d, r3d
805
+    shr          r3d, 5
806
+
807
+.loopH:
808
+    mov          r5d, r3d
809
+
810
+.loopW:
811
+    pmovzxbw    m4, [r0]
812
+    punpcklwd   m3, m4, m5
813
+    pmaddwd     m3, m0
814
+    psrad       m3, xm1
815
+    paddd       m3, m2
816
+
817
+    punpckhwd   m4, m5
818
+    pmaddwd     m4, m0
819
+    psrad       m4, xm1
820
+    paddd       m4, m2
821
+
822
+    packssdw       m3,  m4
823
+    vextracti64x4 ym4,  m3, 1
824
+    packuswb      ym3,  ym4
825
+    vpermq        ym3,  ym3, q3120
826
+    movu          [r1], ym3
827
+
828
+    add         r0, 32
829
+    add         r1, 32
830
+
831
+    dec         r5d
832
+    jnz         .loopW
833
+
834
+    lea         r0, [r0 + r2]
835
+    lea         r1, [r1 + r2]
836
+
837
+    dec         r4d
838
+    jnz         .loopH
839
+    RET
840
+%endif
841
+
842
 ;-------------------------------------------------------------------------------------------------------------------------------------------------
843
 ;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
844
 ;-------------------------------------------------------------------------------------------------------------------------------------------------
845
@@ -1892,6 +2669,149 @@
846
 %endif
847
 %endif
848
 
849
+%if ARCH_X86_64 == 1
850
+%if HIGH_BIT_DEPTH
851
+INIT_ZMM avx512
852
+cglobal weight_sp, 6,9,8
853
+    vbroadcasti32x8           m1, [pw_pixel_max]
854
+    vbroadcasti32x8           m2, [pw_1]
855
+
856
+    mov                       r6d, r7m
857
+    shl                       r6d, 16
858
+    or                        r6d, r6m
859
+    movd                      xm3, r6d
860
+    vpbroadcastd               m3, xm3      ; m3 = [round w0]
861
+    movd                      xm4, r8m      ; m4 = [shift]
862
+    vpbroadcastd               m5, r9m      ; m5 = [offset]
863
+
864
+    ; correct row stride
865
+    add                       r3d, r3d
866
+    add                       r2d, r2d
867
+    mov                       r6d, r4d
868
+    and                       r6d, ~(mmsize / SIZEOF_PIXEL - 1)
869
+    shl                       r6d, 1
870
+    sub                       r3d, r6d
871
+    sub                       r2d, r6d
872
+
873
+    mov                       r6d, r4d
874
+    and                       r6d, (mmsize / SIZEOF_PIXEL - 1)
875
+
876
+.loopH:
877
+    mov                       r6d, r4d
878
+
879
+.loopW:
880
+    movu                      m6, [r0]
881
+    vbroadcasti32x8           m8, [pw_2000]
882
+    paddw                     m6, m8
883
+
884
+    punpcklwd                 m7,  m6, m2
885
+    pmaddwd                   m7,  m3       ;(round w0)
886
+    psrad                     m7, xm4       ;(shift)
887
+    paddd                     m7,  m5        ;(offset)
888
+
889
+    punpckhwd                 m6,  m2
890
+    pmaddwd                   m6,  m3
891
+    psrad                     m6, xm4
892
+    paddd                     m6,  m5
893
+
894
+    packusdw                  m7, m6
895
+    pminuw                    m7, m1
896
+
897
+    sub                       r6d,  (mmsize / SIZEOF_PIXEL)
898
+    jl                        .widthLess30
899
+    movu                      [r1], m7
900
+    lea                       r0,   [r0 + mmsize]
901
+    lea                       r1,   [r1 + mmsize]
902
+    je                        .nextH
903
+    jmp                       .loopW
904
+
905
+.widthLess30:
906
+    mov             r8d, 0xFFFFFFFF
907
+    NEG             r6d
908
+    shrx            r8d, r8d, r6d
909
+    kmovd           k1, r8d
910
+    vmovdqu16       [r1] {k1}, m7
911
+    jmp                  .nextH
912
+
913
+.nextH:
914
+    add                       r0, r2
915
+    add                       r1, r3
916
+
917
+    dec                       r5d
918
+    jnz                       .loopH
919
+    RET
920
+
921
+%else
922
+INIT_ZMM avx512
923
+cglobal weight_sp, 6, 10, 7
924
+    mov                       r7d,       r7m
925
+    shl                       r7d,       16
926
+    or                        r7d,       r6m
927
+    movd                      xm0,       r7d
928
+    vpbroadcastd              m0,        xm0            ; m0 = times 8 dw w0, round
929
+    movd                      xm1,       r8m            ; m1 = [shift]
930
+    vpbroadcastd               m2,       r9m            ; m2 = times 16 dw offset
931
+    vpbroadcastw               m3,       [pw_1]
932
+    vpbroadcastw               m4,       [pw_2000]
933
+
934
+    add                       r2d,       r2d            ; 2 * srcstride
935
+
936
+    mov                       r7,        r0
937
+    mov                       r8,        r1
938
+.loopH:
939
+    mov                       r6d,       r4d            ; width
940
+
941
+    ; save old src and dst
942
+    mov                       r0,        r7              ; src
943
+    mov                       r1,        r8              ; dst
944
+
945
+.loopW:
946
+    movu                      m5,        [r0]
947
+    paddw                     m5,         m4
948
+
949
+    punpcklwd                 m6,         m5,  m3
950
+    pmaddwd                   m6,         m0
951
+    psrad                     m6,        xm1
952
+    paddd                     m6,         m2
953
+
954
+    punpckhwd                 m5,         m3
955
+    pmaddwd                   m5,         m0
956
+    psrad                     m5,        xm1
957
+    paddd                     m5,         m2
958
+
959
+    packssdw                  m6,         m5
960
+    vextracti64x4            ym5,         m6,  1
961
+    packuswb                 ym6,        ym5
962
+    vpermq                   ym6,        ym6,  q3120
963
+
964
+    sub                      r6d,         32
965
+    jl                       .widthLess30
966
+    movu                     [r1],       ym6
967
+    je                       .nextH
968
+    add                      r0,          64
969
+    add                      r1,          32
970
+    jmp                      .loopW
971
+
972
+
973
+.widthLess30:
974
+    mov             r9d, 0xFFFFFFFF
975
+    NEG             r6d
976
+    shrx            r9d, r9d, r6d
977
+    kmovd           k1, r9d
978
+    vmovdqu8        [r1] {k1}, ym6
979
+    jmp                  .nextH
980
+
981
+.nextH:
982
+    lea             r7, [r7 + r2]
983
+    lea             r8, [r8 + r3]
984
+
985
+    dec             r5d
986
+    jnz             .loopH
987
+    RET
988
+%endif
989
+%endif
990
+
991
+
992
 ;-----------------------------------------------------------------
993
 ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
994
 ;-----------------------------------------------------------------
995
@@ -4060,6 +4980,68 @@
996
     RET
997
 %endif
998
 
999
+%if HIGH_BIT_DEPTH == 0
1000
+INIT_ZMM avx512
1001
+cglobal scale1D_128to64, 2, 2, 7
1002
+    pxor            m4, m4
1003
+    mova            m6, [dequant_shuf1_avx512]
1004
+    vbroadcasti32x8 m5, [pb_1]
1005
+
1006
+    ;Top pixel
1007
+    movu            m0, [r1]
1008
+    movu            m1, [r1 + 1 * mmsize]
1009
+    movu            m2, [r1 + 2 * mmsize]
1010
+    movu            m3, [r1 + 3 * mmsize]
1011
+
1012
+    pmaddubsw       m0, m5
1013
+    pavgw           m0, m4
1014
+    pmaddubsw       m1, m5
1015
+    pavgw           m1, m4
1016
+    packuswb        m0, m1
1017
+    vpermq          m0, m6, m0
1018
+    movu            [r0], m0
1019
+
1020
+    ;Left pixel
1021
+    pmaddubsw       m2, m5
1022
+    pavgw           m2, m4
1023
+    pmaddubsw       m3, m5
1024
+    pavgw           m3, m4
1025
+    packuswb        m2, m3
1026
+    vpermq          m2, m6, m2
1027
+    movu            [r0 + mmsize], m2
1028
+    RET
1029
+
1030
+INIT_ZMM avx512
1031
+cglobal scale1D_128to64_aligned, 2, 2, 7
1032
+    pxor            m4, m4
1033
+    mova            m6, [dequant_shuf1_avx512]
1034
+    vbroadcasti32x8 m5, [pb_1]
1035
+
1036
+    ;Top pixel
1037
+    mova            m0, [r1]
1038
+    mova            m1, [r1 + 1 * mmsize]
1039
+    mova            m2, [r1 + 2 * mmsize]
1040
+    mova            m3, [r1 + 3 * mmsize]
1041
+
1042
+    pmaddubsw       m0, m5
1043
+    pavgw           m0, m4
1044
+    pmaddubsw       m1, m5
1045
+    pavgw           m1, m4
1046
+    packuswb        m0, m1
1047
+    vpermq          m0, m6, m0
1048
+    mova            [r0], m0
1049
+
1050
+    ;Left pixel
1051
+    pmaddubsw       m2, m5
1052
+    pavgw           m2, m4
1053
+    pmaddubsw       m3, m5
1054
+    pavgw           m3, m4
1055
+    packuswb        m2, m3
1056
+    vpermq          m2, m6, m2
1057
+    mova            [r0 + mmsize], m2
1058
+    RET
1059
+%endif
1060
+
1061
 ;-----------------------------------------------------------------
1062
 ; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
1063
 ;-----------------------------------------------------------------
1064
@@ -5323,6 +6305,226 @@
1065
 PIXELSUB_PS_W32_H8_avx2 32, 64
1066
 %endif
1067
 
1068
+%macro PROCESS_SUB_PS_32x8_AVX512 0
1069
+    pmovzxbw    m0,     [r2]
1070
+    pmovzxbw    m1,     [r3]
1071
+    pmovzxbw    m2,     [r2 + r4]
1072
+    pmovzxbw    m3,     [r3 + r5]
1073
+    pmovzxbw    m4,     [r2 + 2 * r4]
1074
+    pmovzxbw    m5,     [r3 + 2 * r5]
1075
+    pmovzxbw    m6,     [r2 + r7]
1076
+    pmovzxbw    m7,     [r3 + r8]
1077
+
1078
+    psubw       m0,     m1
1079
+    psubw       m2,     m3
1080
+    psubw       m4,     m5
1081
+    psubw       m6,     m7
1082
+
1083
+    movu        [r0],             m0
1084
+    movu        [r0 + r1],        m2
1085
+    movu        [r0 + r1 * 2 ],   m4
1086
+    movu        [r0 + r9],        m6
1087
+
1088
+    lea         r2,     [r2 + r4 * 4]
1089
+    lea         r3,     [r3 + r5 * 4]
1090
+    lea         r0,     [r0 + r1 * 4]
1091
+
1092
+    pmovzxbw    m0,     [r2]
1093
+    pmovzxbw    m1,     [r3]
1094
+    pmovzxbw    m2,     [r2 + r4]
1095
+    pmovzxbw    m3,     [r3 + r5]
1096
+    pmovzxbw    m4,     [r2 + 2 * r4]
1097
+    pmovzxbw    m5,     [r3 + 2 * r5]
1098
+    pmovzxbw    m6,     [r2 + r7]
1099
+    pmovzxbw    m7,     [r3 + r8]
1100
+
1101
+    psubw       m0,     m1
1102
+    psubw       m2,     m3
1103
+    psubw       m4,     m5
1104
+    psubw       m6,     m7
1105
+
1106
+    movu        [r0],             m0
1107
+    movu        [r0 + r1],        m2
1108
+    movu        [r0 + r1 * 2 ],   m4
1109
+    movu        [r0 + r9],        m6
1110
+%endmacro
1111
+
1112
+%macro PROCESS_SUB_PS_32x8_HBD_AVX512 0
1113
+    movu        m0,     [r2]
1114
+    movu        m1,     [r3]
1115
+    movu        m2,     [r2 + r4]
1116
+    movu        m3,     [r3 + r5]
1117
+    psubw       m0,     m1
1118
+    psubw       m2,     m3
1119
+
1120
+    movu        [r0],                 m0
1121
+    movu        [r0 + r1],            m2
1122
+
1123
+    movu        m0,     [r2 + r4 * 2]
1124
+    movu        m1,     [r3 + r5 * 2]
1125
+    movu        m2,     [r2 + r7]
1126
+    movu        m3,     [r3 + r8]
1127
+    psubw       m0,     m1
1128
+    psubw       m2,     m3
1129
+
1130
+    movu        [r0 + r1 * 2],        m0
1131
+    movu        [r0 + r6],            m2
1132
+
1133
+    lea         r0,     [r0 + r1 * 4]
1134
+    lea         r2,     [r2 + r4 * 4]
1135
+    lea         r3,     [r3 + r5 * 4]
1136
+
1137
+    movu        m0,     [r2]
1138
+    movu        m1,     [r3]
1139
+    movu        m2,     [r2 + r4]
1140
+    movu        m3,     [r3 + r5]
1141
+    psubw       m0,     m1
1142
+    psubw       m2,     m3
1143
+
1144
+    movu        [r0],                 m0
1145
+    movu        [r0 + r1],            m2
1146
+
1147
+    movu        m0,     [r2 + r4 * 2]
1148
+    movu        m1,     [r3 + r5 * 2]
1149
+    movu        m2,     [r2 + r7]
1150
+    movu        m3,     [r3 + r8]
1151
+    psubw       m0,     m1
1152
+    psubw       m2,     m3
1153
+
1154
+    movu        [r0 + r1 * 2],        m0
1155
+    movu        [r0 + r6],            m2
1156
+%endmacro
1157
+
1158
+;-----------------------------------------------------------------------------
1159
+; void pixel_sub_ps_32x32(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
1160
+;-----------------------------------------------------------------------------
1161
+%if HIGH_BIT_DEPTH
1162
+%if ARCH_X86_64
1163
+INIT_ZMM avx512
1164
+cglobal pixel_sub_ps_32x32, 6, 9, 4
1165
+    add         r1d,    r1d
1166
+    add         r4d,    r4d
1167
+    add         r5d,    r5d
1168
+    lea         r6,     [r1 * 3]
1169
+    lea         r7,     [r4 * 3]
1170
+    lea         r8,     [r5 * 3]
1171
+    PROCESS_SUB_PS_32x8_HBD_AVX512
1172
+    lea         r0,     [r0 + r1 * 4]
1173
+    lea         r2,     [r2 + r4 * 4]
1174
+    lea         r3,     [r3 + r5 * 4]
1175
+    PROCESS_SUB_PS_32x8_HBD_AVX512
1176
+    lea         r0,     [r0 + r1 * 4]
1177
+    lea         r2,     [r2 + r4 * 4]
1178
+    lea         r3,     [r3 + r5 * 4]
1179
+    PROCESS_SUB_PS_32x8_HBD_AVX512
1180
+    lea         r0,     [r0 + r1 * 4]
1181
+    lea         r2,     [r2 + r4 * 4]
1182
+    lea         r3,     [r3 + r5 * 4]
1183
+    PROCESS_SUB_PS_32x8_HBD_AVX512
1184
+    RET
1185
+
1186
+cglobal pixel_sub_ps_32x64, 6, 9, 4
1187
+    add         r1d,    r1d
1188
+    add         r4d,    r4d
1189
+    add         r5d,    r5d
1190
+    lea         r6,     [r1 * 3]
1191
+    lea         r7,     [r4 * 3]
1192
+    lea         r8,     [r5 * 3]
1193
+    PROCESS_SUB_PS_32x8_HBD_AVX512
1194
+    lea         r0,     [r0 + r1 * 4]
1195
+    lea         r2,     [r2 + r4 * 4]
1196
+    lea         r3,     [r3 + r5 * 4]
1197
+    PROCESS_SUB_PS_32x8_HBD_AVX512
1198
+    lea         r0,     [r0 + r1 * 4]
1199
+    lea         r2,     [r2 + r4 * 4]
1200
+    lea         r3,     [r3 + r5 * 4]
1201
+    PROCESS_SUB_PS_32x8_HBD_AVX512
1202
+    lea         r0,     [r0 + r1 * 4]
1203
+    lea         r2,     [r2 + r4 * 4]
1204
+    lea         r3,     [r3 + r5 * 4]
1205
+    PROCESS_SUB_PS_32x8_HBD_AVX512
1206
+    lea         r0,     [r0 + r1 * 4]
1207
+    lea         r2,     [r2 + r4 * 4]
1208
+    lea         r3,     [r3 + r5 * 4]
1209
+    PROCESS_SUB_PS_32x8_HBD_AVX512
1210
+    lea         r0,     [r0 + r1 * 4]
1211
+    lea         r2,     [r2 + r4 * 4]
1212
+    lea         r3,     [r3 + r5 * 4]
1213
+    PROCESS_SUB_PS_32x8_HBD_AVX512
1214
+    lea         r0,     [r0 + r1 * 4]
1215
+    lea         r2,     [r2 + r4 * 4]
1216
+    lea         r3,     [r3 + r5 * 4]
1217
+    PROCESS_SUB_PS_32x8_HBD_AVX512
1218
+    lea         r0,     [r0 + r1 * 4]
1219
+    lea         r2,     [r2 + r4 * 4]
1220
+    lea         r3,     [r3 + r5 * 4]
1221
+    PROCESS_SUB_PS_32x8_HBD_AVX512
1222
+    RET
1223
+%endif
1224
+%else
1225
+%if ARCH_X86_64
1226
+INIT_ZMM avx512
1227
+cglobal pixel_sub_ps_32x32, 6, 10, 8
1228
+    add         r1,     r1
1229
+    lea         r7,     [r4 * 3]
1230
+    lea         r8,     [r5 * 3]
1231
+    lea         r9,     [r1 * 3]
1232
+
1233
+    PROCESS_SUB_PS_32x8_AVX512
1234
+    lea         r2,     [r2 + r4 * 4]
1235
+    lea         r3,     [r3 + r5 * 4]
1236
+    lea         r0,     [r0 + r1 * 4]
1237
+    PROCESS_SUB_PS_32x8_AVX512
1238
+    lea         r2,     [r2 + r4 * 4]
1239
+    lea         r3,     [r3 + r5 * 4]
1240
+    lea         r0,     [r0 + r1 * 4]
1241
+    PROCESS_SUB_PS_32x8_AVX512
1242
+    lea         r2,     [r2 + r4 * 4]
1243
+    lea         r3,     [r3 + r5 * 4]
1244
+    lea         r0,     [r0 + r1 * 4]
1245
+    PROCESS_SUB_PS_32x8_AVX512
1246
+    RET
1247
+
1248
+INIT_ZMM avx512
1249
+cglobal pixel_sub_ps_32x64, 6, 10, 8
1250
+    add         r1,     r1
1251
+    lea         r7,     [r4 * 3]
1252
+    lea         r8,     [r5 * 3]
1253
+    lea         r9,     [r1 * 3]
1254
+
1255
+    PROCESS_SUB_PS_32x8_AVX512
1256
+    lea         r2,     [r2 + r4 * 4]
1257
+    lea         r3,     [r3 + r5 * 4]
1258
+    lea         r0,     [r0 + r1 * 4]
1259
+    PROCESS_SUB_PS_32x8_AVX512
1260
+    lea         r2,     [r2 + r4 * 4]
1261
+    lea         r3,     [r3 + r5 * 4]
1262
+    lea         r0,     [r0 + r1 * 4]
1263
+    PROCESS_SUB_PS_32x8_AVX512
1264
+    lea         r2,     [r2 + r4 * 4]
1265
+    lea         r3,     [r3 + r5 * 4]
1266
+    lea         r0,     [r0 + r1 * 4]
1267
+    PROCESS_SUB_PS_32x8_AVX512
1268
+    lea         r2,     [r2 + r4 * 4]
1269
+    lea         r3,     [r3 + r5 * 4]
1270
+    lea         r0,     [r0 + r1 * 4]
1271
+    PROCESS_SUB_PS_32x8_AVX512
1272
+    lea         r2,     [r2 + r4 * 4]
1273
+    lea         r3,     [r3 + r5 * 4]
1274
+    lea         r0,     [r0 + r1 * 4]
1275
+    PROCESS_SUB_PS_32x8_AVX512
1276
+    lea         r2,     [r2 + r4 * 4]
1277
+    lea         r3,     [r3 + r5 * 4]
1278
+    lea         r0,     [r0 + r1 * 4]
1279
+    PROCESS_SUB_PS_32x8_AVX512
1280
+    lea         r2,     [r2 + r4 * 4]
1281
+    lea         r3,     [r3 + r5 * 4]
1282
+    lea         r0,     [r0 + r1 * 4]
1283
+    PROCESS_SUB_PS_32x8_AVX512
1284
+    RET
1285
+%endif
1286
+%endif
1287
+
1288
 ;-----------------------------------------------------------------------------
1289
 ; void pixel_sub_ps_64x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
1290
 ;-----------------------------------------------------------------------------
1291
@@ -5747,6 +6949,251 @@
1292
     jnz         .loop
1293
     RET
1294
 %endif
1295
+
1296
+%macro PROCESS_SUB_PS_64x8_AVX512 0
1297
+    pmovzxbw    m0,     [r2]
1298
+    pmovzxbw    m1,     [r2 + 32]
1299
+    pmovzxbw    m2,     [r3]
1300
+    pmovzxbw    m3,     [r3 + 32]
1301
+    pmovzxbw    m4,     [r2 + r4]
1302
+    pmovzxbw    m5,     [r2 + r4 + 32]
1303
+    pmovzxbw    m6,     [r3 + r5]
1304
+    pmovzxbw    m7,     [r3 + r5 + 32]
1305
+
1306
+    psubw       m0,     m2
1307
+    psubw       m1,     m3
1308
+    psubw       m4,     m6
1309
+    psubw       m5,     m7
1310
+    movu        [r0],               m0
1311
+    movu        [r0 + 64],          m1
1312
+    movu        [r0 + 2 * r1],      m4
1313
+    movu        [r0 + 2 * r1 + 64], m5
1314
+
1315
+    lea         r0,     [r0 + 4 * r1]
1316
+    lea         r2,     [r2 + 2 * r4]
1317
+    lea         r3,     [r3 + 2 * r5]
1318
+
1319
+    pmovzxbw    m0,     [r2]
1320
+    pmovzxbw    m1,     [r2 + 32]
1321
+    pmovzxbw    m2,     [r3]
1322
+    pmovzxbw    m3,     [r3 + 32]
1323
+    pmovzxbw    m4,     [r2 + r4]
1324
+    pmovzxbw    m5,     [r2 + r4 + 32]
1325
+    pmovzxbw    m6,     [r3 + r5]
1326
+    pmovzxbw    m7,     [r3 + r5 + 32]
1327
+
1328
+    psubw       m0,     m2
1329
+    psubw       m1,     m3
1330
+    psubw       m4,     m6
1331
+    psubw       m5,     m7
1332
+    movu        [r0],               m0
1333
+    movu        [r0 + 64],          m1
1334
+    movu        [r0 + 2 * r1],      m4
1335
+    movu        [r0 + 2 * r1 + 64], m5
1336
+
1337
+    lea         r0,     [r0 + 4 * r1]
1338
+    lea         r2,     [r2 + 2 * r4]
1339
+    lea         r3,     [r3 + 2 * r5]
1340
+
1341
+    pmovzxbw    m0,     [r2]
1342
+    pmovzxbw    m1,     [r2 + 32]
1343
+    pmovzxbw    m2,     [r3]
1344
+    pmovzxbw    m3,     [r3 + 32]
1345
+    pmovzxbw    m4,     [r2 + r4]
1346
+    pmovzxbw    m5,     [r2 + r4 + 32]
1347
+    pmovzxbw    m6,     [r3 + r5]
1348
+    pmovzxbw    m7,     [r3 + r5 + 32]
1349
+
1350
+    psubw       m0,     m2
1351
+    psubw       m1,     m3
1352
+    psubw       m4,     m6
1353
+    psubw       m5,     m7
1354
+    movu        [r0],               m0
1355
+    movu        [r0 + 64],          m1
1356
+    movu        [r0 + 2 * r1],      m4
1357
+    movu        [r0 + 2 * r1 + 64], m5
1358
+
1359
+    lea         r0,     [r0 + 4 * r1]
1360
+    lea         r2,     [r2 + 2 * r4]
1361
+    lea         r3,     [r3 + 2 * r5]
1362
+
1363
+    pmovzxbw    m0,     [r2]
1364
+    pmovzxbw    m1,     [r2 + 32]
1365
+    pmovzxbw    m2,     [r3]
1366
+    pmovzxbw    m3,     [r3 + 32]
1367
+    pmovzxbw    m4,     [r2 + r4]
1368
+    pmovzxbw    m5,     [r2 + r4 + 32]
1369
+    pmovzxbw    m6,     [r3 + r5]
1370
+    pmovzxbw    m7,     [r3 + r5 + 32]
1371
+
1372
+    psubw       m0,     m2
1373
+    psubw       m1,     m3
1374
+    psubw       m4,     m6
1375
+    psubw       m5,     m7
1376
+    movu        [r0],               m0
1377
+    movu        [r0 + 64],          m1
1378
+    movu        [r0 + 2 * r1],      m4
1379
+    movu        [r0 + 2 * r1 + 64], m5
1380
+%endmacro
1381
+
1382
+%macro PROCESS_SUB_PS_64x8_HBD_AVX512 0
1383
+    movu        m0,     [r2]
1384
+    movu        m1,     [r2 + 64]
1385
+    movu        m4,     [r3]
1386
+    movu        m5,     [r3 + 64]
1387
+    psubw       m0,     m4
1388
+    psubw       m1,     m5
1389
+    movu        m2,     [r2 + r4]
1390
+    movu        m3,     [r2 + r4 + 64]
1391
+    movu        m6,     [r3 + r5]
1392
+    movu        m7,     [r3 + r5 + 64]
1393
+    psubw       m2,     m6
1394
+    psubw       m3,     m7
1395
+
1396
+    movu        [r0],                 m0
1397
+    movu        [r0 + 64],            m1
1398
+    movu        [r0 + r1],            m2
1399
+    movu        [r0 + r1 + 64],       m3
1400
+
1401
+    movu        m0,     [r2 + r4 * 2]
1402
+    movu        m1,     [r2 + r4 * 2 + 64]
1403
+    movu        m4,     [r3 + r5 * 2]
1404
+    movu        m5,     [r3 + r5 * 2 + 64]
1405
+    psubw       m0,     m4
1406
+    psubw       m1,     m5
1407
+    movu        m2,     [r2 + r7]
1408
+    movu        m3,     [r2 + r7 + 64]
1409
+    movu        m6,     [r3 + r8]
1410
+    movu        m7,     [r3 + r8 + 64]
1411
+    psubw       m2,     m6
1412
+    psubw       m3,     m7
1413
+
1414
+    movu        [r0 + r1 * 2],        m0
1415
+    movu        [r0 + r1 * 2 + 64],   m1
1416
+    movu        [r0 + r6],            m2
1417
+    movu        [r0 + r6 + 64],       m3
1418
+
1419
+    lea         r0,     [r0 + r1 * 4]
1420
+    lea         r2,     [r2 + r4 * 4]
1421
+    lea         r3,     [r3 + r5 * 4]
1422
+
1423
+    movu        m0,     [r2]
1424
+    movu        m1,     [r2 + 64]
1425
+    movu        m4,     [r3]
1426
+    movu        m5,     [r3 + 64]
1427
+    psubw       m0,     m4
1428
+    psubw       m1,     m5
1429
+    movu        m2,     [r2 + r4]
1430
+    movu        m3,     [r2 + r4 + 64]
1431
+    movu        m6,     [r3 + r5]
1432
+    movu        m7,     [r3 + r5 + 64]
1433
+    psubw       m2,     m6
1434
+    psubw       m3,     m7
1435
+
1436
+    movu        [r0],                 m0
1437
+    movu        [r0 + 64],            m1
1438
+    movu        [r0 + r1],            m2
1439
+    movu        [r0 + r1 + 64],       m3
1440
+
1441
+    movu        m0,     [r2 + r4 * 2]
1442
+    movu        m1,     [r2 + r4 * 2 + 64]
1443
+    movu        m4,     [r3 + r5 * 2]
1444
+    movu        m5,     [r3 + r5 * 2 + 64]
1445
+    psubw       m0,     m4
1446
+    psubw       m1,     m5
1447
+    movu        m2,     [r2 + r7]
1448
+    movu        m3,     [r2 + r7 + 64]
1449
+    movu        m6,     [r3 + r8]
1450
+    movu        m7,     [r3 + r8 + 64]
1451
+    psubw       m2,     m6
1452
+    psubw       m3,     m7
1453
+
1454
+    movu        [r0 + r1 * 2],        m0
1455
+    movu        [r0 + r1 * 2 + 64],   m1
1456
+    movu        [r0 + r6],            m2
1457
+    movu        [r0 + r6 + 64],       m3
1458
+%endmacro
1459
+;-----------------------------------------------------------------------------
1460
+; void pixel_sub_ps_64x64(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
1461
+;-----------------------------------------------------------------------------
1462
+%if HIGH_BIT_DEPTH
1463
+%if ARCH_X86_64
1464
+INIT_ZMM avx512
1465
+cglobal pixel_sub_ps_64x64, 6, 9, 8
1466
+    add         r1d,    r1d
1467
+    add         r4d,    r4d
1468
+    add         r5d,    r5d
1469
+    lea         r6,     [r1 * 3]
1470
+    lea         r7,     [r4 * 3]
1471
+    lea         r8,     [r5 * 3]
1472
+
1473
+    PROCESS_SUB_PS_64x8_HBD_AVX512
1474
+    lea         r0,     [r0 + r1 * 4]
1475
+    lea         r2,     [r2 + r4 * 4]
1476
+    lea         r3,     [r3 + r5 * 4]
1477
+    PROCESS_SUB_PS_64x8_HBD_AVX512
1478
+    lea         r0,     [r0 + r1 * 4]
1479
+    lea         r2,     [r2 + r4 * 4]
1480
+    lea         r3,     [r3 + r5 * 4]
1481
+    PROCESS_SUB_PS_64x8_HBD_AVX512
1482
+    lea         r0,     [r0 + r1 * 4]
1483
+    lea         r2,     [r2 + r4 * 4]
1484
+    lea         r3,     [r3 + r5 * 4]
1485
+    PROCESS_SUB_PS_64x8_HBD_AVX512
1486
+    lea         r0,     [r0 + r1 * 4]
1487
+    lea         r2,     [r2 + r4 * 4]
1488
+    lea         r3,     [r3 + r5 * 4]
1489
+    PROCESS_SUB_PS_64x8_HBD_AVX512
1490
+    lea         r0,     [r0 + r1 * 4]
1491
+    lea         r2,     [r2 + r4 * 4]
1492
+    lea         r3,     [r3 + r5 * 4]
1493
+    PROCESS_SUB_PS_64x8_HBD_AVX512
1494
+    lea         r0,     [r0 + r1 * 4]
1495
+    lea         r2,     [r2 + r4 * 4]
1496
+    lea         r3,     [r3 + r5 * 4]
1497
+    PROCESS_SUB_PS_64x8_HBD_AVX512
1498
+    lea         r0,     [r0 + r1 * 4]
1499
+    lea         r2,     [r2 + r4 * 4]
1500
+    lea         r3,     [r3 + r5 * 4]
1501
+    PROCESS_SUB_PS_64x8_HBD_AVX512
1502
+    RET
1503
+%endif
1504
+%else
1505
+%if ARCH_X86_64
1506
+INIT_ZMM avx512
1507
+cglobal pixel_sub_ps_64x64, 6, 7, 8
1508
+    PROCESS_SUB_PS_64x8_AVX512
1509
+    lea         r0,     [r0 + 4 * r1]
1510
+    lea         r2,     [r2 + 2 * r4]
1511
+    lea         r3,     [r3 + 2 * r5]
1512
+    PROCESS_SUB_PS_64x8_AVX512
1513
+    lea         r0,     [r0 + 4 * r1]
1514
+    lea         r2,     [r2 + 2 * r4]
1515
+    lea         r3,     [r3 + 2 * r5]
1516
+    PROCESS_SUB_PS_64x8_AVX512
1517
+    lea         r0,     [r0 + 4 * r1]
1518
+    lea         r2,     [r2 + 2 * r4]
1519
+    lea         r3,     [r3 + 2 * r5]
1520
+    PROCESS_SUB_PS_64x8_AVX512
1521
+    lea         r0,     [r0 + 4 * r1]
1522
+    lea         r2,     [r2 + 2 * r4]
1523
+    lea         r3,     [r3 + 2 * r5]
1524
+    PROCESS_SUB_PS_64x8_AVX512
1525
+    lea         r0,     [r0 + 4 * r1]
1526
+    lea         r2,     [r2 + 2 * r4]
1527
+    lea         r3,     [r3 + 2 * r5]
1528
+    PROCESS_SUB_PS_64x8_AVX512
1529
+    lea         r0,     [r0 + 4 * r1]
1530
+    lea         r2,     [r2 + 2 * r4]
1531
+    lea         r3,     [r3 + 2 * r5]
1532
+    PROCESS_SUB_PS_64x8_AVX512
1533
+    lea         r0,     [r0 + 4 * r1]
1534
+    lea         r2,     [r2 + 2 * r4]
1535
+    lea         r3,     [r3 + 2 * r5]
1536
+    PROCESS_SUB_PS_64x8_AVX512
1537
+    RET
1538
+%endif
1539
+%endif
1540
 ;=============================================================================
1541
 ; variance
1542
 ;=============================================================================
1543
@@ -5757,7 +7204,7 @@
1544
 %if HIGH_BIT_DEPTH == 0
1545
 %if %1
1546
     mova  m7, [pw_00ff]
1547
-%elif mmsize < 32
1548
+%elif mmsize == 16
1549
     pxor  m7, m7    ; zero
1550
 %endif
1551
 %endif ; !HIGH_BIT_DEPTH
1552
@@ -6476,6 +7923,245 @@
1553
     RET
1554
 %endif ; !HIGH_BIT_DEPTH
1555
 
1556
+%macro PROCESS_VAR_32x8_AVX512 0
1557
+    pmovzxbw        m0, [r0]
1558
+    pmovzxbw        m1, [r0 + r1]
1559
+    pmovzxbw        m2, [r0 + 2 * r1]
1560
+    pmovzxbw        m3, [r0 + r2]
1561
+
1562
+    paddw     m4, m0
1563
+    paddw     m4, m1
1564
+    paddw     m4, m2
1565
+    paddw     m4, m3
1566
+    pmaddwd   m0, m0
1567
+    pmaddwd   m1, m1
1568
+    pmaddwd   m2, m2
1569
+    pmaddwd   m3, m3
1570
+    paddd     m5, m0
1571
+    paddd     m5, m1
1572
+    paddd     m5, m2
1573
+    paddd     m5, m3
1574
+
1575
+    lea             r0, [r0 + r1 * 4]
1576
+
1577
+    pmovzxbw        m0, [r0]
1578
+    pmovzxbw        m1, [r0 + r1]
1579
+    pmovzxbw        m2, [r0 + 2 * r1]
1580
+    pmovzxbw        m3, [r0 + r2]
1581
+
1582
+    paddw     m4, m0
1583
+    paddw     m4, m1
1584
+    paddw     m4, m2
1585
+    paddw     m4, m3
1586
+    pmaddwd   m0, m0
1587
+    pmaddwd   m1, m1
1588
+    pmaddwd   m2, m2
1589
+    pmaddwd   m3, m3
1590
+    paddd     m5, m0
1591
+    paddd     m5, m1
1592
+    paddd     m5, m2
1593
+    paddd     m5, m3
1594
+%endmacro
1595
+
1596
+%macro PROCESS_VAR_AVX512_END 0
1597
+    vextracti32x8  ym0, m4, 1
1598
+    vextracti32x8  ym1, m5, 1
1599
+    paddw          ym4, ym0
1600
+    paddd          ym5, ym1
1601
+    vextracti32x4  xm0, m4, 1
1602
+    vextracti32x4  xm1, m5, 1
1603
+    paddw          xm4, xm0
1604
+    paddd          xm5, xm1
1605
+    HADDW          xm4, xm2
1606
+    HADDD          xm5, xm1
1607
+%if ARCH_X86_64
1608
+    punpckldq      xm4, xm5
1609
+    movq           rax, xm4
1610
+%else
1611
+    movd           eax, xm4
1612
+    movd           edx, xm5
1613
+%endif
1614
+%endmacro
1615
+%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
1616
+;-----------------------------------------------------------------------------
1617
+; int pixel_var_wxh( uint8_t *, intptr_t )
1618
+;-----------------------------------------------------------------------------
1619
+INIT_ZMM avx512
1620
+cglobal pixel_var_32x32, 2,4,6
1621
+    pxor  m4, m4    ; sum
1622
+    pxor  m5, m5    ; sum squared
1623
+    lea   r2, [3 * r1]
1624
+
1625
+    PROCESS_VAR_32x8_AVX512
1626
+    lea   r0, [r0 + r1 * 4]
1627
+    PROCESS_VAR_32x8_AVX512
1628
+    lea   r0, [r0 + r1 * 4]
1629
+    PROCESS_VAR_32x8_AVX512
1630
+    lea   r0, [r0 + r1 * 4]
1631
+    PROCESS_VAR_32x8_AVX512
1632
+    PROCESS_VAR_AVX512_END
1633
+    RET
1634
+
1635
+INIT_ZMM avx512
1636
+cglobal pixel_var_64x64, 2,4,7
1637
+    pxor            m5, m5    ; sum
1638
+    pxor            m6, m6    ; sum squared
1639
+    mov             r2d, 32
1640
+
1641
+.loop:
1642
+    pmovzxbw        m0, [r0]
1643
+    pmovzxbw        m3, [r0 + mmsize/2]
1644
+    pmovzxbw        m1, [r0 + r1]
1645
+    pmovzxbw        m4, [r0 + r1 + mmsize/2]
1646
+
1647
+    lea             r0, [r0 + 2 * r1]
1648
+
1649
+    paddw           m5, m0
1650
+    paddw           m5, m3
1651
+    paddw           m5, m1
1652
+    paddw           m5, m4
1653
+    pmaddwd         m0, m0
1654
+    pmaddwd         m3, m3
1655
+    pmaddwd         m1, m1
1656
+    pmaddwd         m4, m4
1657
+    paddd           m6, m0
1658
+    paddd           m6, m3
1659
+    paddd           m6, m1
1660
+    paddd           m6, m4
1661
+
1662
+    dec             r2d
1663
+    jg              .loop
1664
+
1665
+    pxor            m1, m1
1666
+    punpcklwd       m0, m5, m1
1667
+    punpckhwd       m5, m1
1668
+    paddd           m5, m0
1669
+    vextracti32x8  ym2, m5, 1
1670
+    vextracti32x8  ym1, m6, 1
1671
+    paddd          ym5, ym2
1672
+    paddd          ym6, ym1
1673
+    vextracti32x4  xm2, m5, 1
1674
+    vextracti32x4  xm1, m6, 1
1675
+    paddd          xm5, xm2
1676
+    paddd          xm6, xm1
1677
+    HADDD          xm5, xm2
1678
+    HADDD          xm6, xm1
1679
+    punpckldq      xm5, xm6
1680
+    movq           rax, xm5
1681
+    RET
1682
+%endif
1683
+%macro VAR_AVX512_CORE 1 ; accum
1684
+%if %1
1685
+    paddw    m0, m2
1686
+    pmaddwd  m2, m2
1687
+    paddw    m0, m3
1688
+    pmaddwd  m3, m3
1689
+    paddd    m1, m2
1690
+    paddd    m1, m3
1691
+%else
1692
+    paddw    m0, m2, m3
1693
+    pmaddwd  m2, m2
1694
+    pmaddwd  m3, m3
1695
+    paddd    m1, m2, m3
1696
+%endif
1697
+%endmacro
1698
+
1699
+%macro VAR_AVX512_CORE_16x16 1 ; accum
1700
+%if HIGH_BIT_DEPTH
1701
+    mova            ym2, [r0]
1702
+    vinserti64x4     m2, [r0+r1], 1
1703
+    mova            ym3, [r0+2*r1]
1704
+    vinserti64x4     m3, [r0+r3], 1
1705
+%else
1706
+    vbroadcasti64x2 ym2, [r0]
1707
+    vbroadcasti64x2  m2 {k1}, [r0+r1]
1708
+    vbroadcasti64x2 ym3, [r0+2*r1]
1709
+    vbroadcasti64x2  m3 {k1}, [r0+r3]
1710
+    pshufb           m2, m4
1711
+    pshufb           m3, m4
1712
+%endif
1713
+    VAR_AVX512_CORE %1
1714
+%endmacro
1715
+
1716
+%macro VAR_AVX512_CORE_8x8 1 ; accum
1717
+%if HIGH_BIT_DEPTH
1718
+    mova            xm2, [r0]
1719
+    mova            xm3, [r0+r1]
1720
+%else
1721
+    movq            xm2, [r0]
1722
+    movq            xm3, [r0+r1]
1723
+%endif
1724
+    vinserti128     ym2, [r0+2*r1], 1
1725
+    vinserti128     ym3, [r0+r2], 1
1726
+    lea              r0, [r0+4*r1]
1727
+    vinserti32x4     m2, [r0], 2
1728
+    vinserti32x4     m3, [r0+r1], 2
1729
+    vinserti32x4     m2, [r0+2*r1], 3
1730
+    vinserti32x4     m3, [r0+r2], 3
1731
+%if HIGH_BIT_DEPTH == 0
1732
+    punpcklbw        m2, m4
1733
+    punpcklbw        m3, m4
1734
+%endif
1735
+    VAR_AVX512_CORE %1
1736
+%endmacro
1737
+
1738
+INIT_ZMM avx512
1739
+cglobal pixel_var_16x16, 2,4
1740
+    FIX_STRIDES     r1
1741
+    mov            r2d, 0xf0
1742
+    lea             r3, [3*r1]
1743
+%if HIGH_BIT_DEPTH == 0
1744
+    vbroadcasti64x4 m4, [var_shuf_avx512]
1745
+    kmovb           k1, r2d
1746
+%endif
1747
+    VAR_AVX512_CORE_16x16 0
1748
+.loop:
1749
+    lea             r0, [r0+4*r1]
1750
+    VAR_AVX512_CORE_16x16 1
1751
+    sub            r2d, 0x50
1752
+    jg .loop
1753
+%if ARCH_X86_64 == 0
1754
+    pop            r3d
1755
+    %assign regs_used 3
1756
+%endif
1757
+var_avx512_end:
1758
+    vbroadcasti32x4 m2, [pw_1]
1759
+    pmaddwd         m0, m2
1760
+    SBUTTERFLY      dq, 0, 1, 2
1761
+    paddd           m0, m1
1762
+    vextracti32x8  ym1, m0, 1
1763
+    paddd          ym0, ym1
1764
+    vextracti128   xm1, ym0, 1
1765
+    paddd         xmm0, xm0, xm1
1766
+    punpckhqdq    xmm1, xmm0, xmm0
1767
+    paddd         xmm0, xmm1
1768
+%if ARCH_X86_64
1769
+    movq           rax, xmm0
1770
+%else
1771
+    movd           eax, xmm0
1772
+    pextrd         edx, xmm0, 1
1773
+ %endif
1774
+     RET
1775
+ 
1776
+%if HIGH_BIT_DEPTH == 0 ; 8x8 doesn't benefit from AVX-512 in high bit-depth
1777
+cglobal pixel_var_8x8, 2,3
1778
+    lea             r2, [3*r1]
1779
+    pxor           xm4, xm4
1780
+    VAR_AVX512_CORE_8x8 0
1781
+    jmp var_avx512_end
1782
+%endif
1783
+
1784
+cglobal pixel_var_8x16, 2,3
1785
+    FIX_STRIDES     r1
1786
+    lea             r2, [3*r1]
1787
+%if HIGH_BIT_DEPTH == 0
1788
+    pxor           xm4, xm4
1789
+%endif
1790
+    VAR_AVX512_CORE_8x8 0
1791
+    lea             r0, [r0+4*r1]
1792
+    VAR_AVX512_CORE_8x8 1
1793
+    jmp var_avx512_end
1794
+
1795
 %macro VAR2_END 3
1796
     HADDW   %2, xm1
1797
     movd   r1d, %2
1798
x265_2.7.tar.gz/source/common/x86/pixel.h -> x265_2.9.tar.gz/source/common/x86/pixel.h Changed
37
 
1
@@ -34,6 +34,7 @@
2
 void PFX(downShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
3
 void PFX(upShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
4
 void PFX(upShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
5
+void PFX(upShift_16_avx512)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
6
 void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
7
 void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
8
 pixel PFX(planeClipAndMax_avx2)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
9
@@ -44,14 +45,19 @@
10
     FUNCDEF_PU(void, pixel_sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
11
     FUNCDEF_PU(void, pixel_sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
12
     FUNCDEF_PU(void, pixel_avg, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
13
+    FUNCDEF_PU(void, pixel_avg_aligned, cpu, pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); \
14
     FUNCDEF_PU(void, pixel_add_ps, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
15
+    FUNCDEF_PU(void, pixel_add_ps_aligned, cpu, pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); \
16
     FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
17
     FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
18
     FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
19
     FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
20
     FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
21
+    FUNCDEF_CHROMA_PU(void, addAvg_aligned, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
22
     FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
23
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
24
     FUNCDEF_TU_S(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
25
+    FUNCDEF_TU_S(sse_t, pixel_ssd_s_aligned, cpu, const int16_t*, intptr_t); \
26
     FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \
27
     FUNCDEF_TU(int, psyCost_pp, cpu, const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); \
28
     FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
29
@@ -65,6 +71,7 @@
30
 DECL_PIXELS(avx);
31
 DECL_PIXELS(xop);
32
 DECL_PIXELS(avx2);
33
+DECL_PIXELS(avx512);
34
 
35
 #undef DECL_PIXELS
36
 
37
x265_2.7.tar.gz/source/common/x86/pixeladd8.asm -> x265_2.9.tar.gz/source/common/x86/pixeladd8.asm Changed
530
 
1
@@ -24,11 +24,11 @@
2
 
3
 %include "x86inc.asm"
4
 %include "x86util.asm"
5
+SECTION_RODATA 64
6
 
7
-SECTION_RODATA 32
8
-
9
+ALIGN 64
10
+const store_shuf1_avx512,  dq 0, 2, 4, 6, 1, 3, 5, 7
11
 SECTION .text
12
-
13
 cextern pw_pixel_max
14
 
15
 ;-----------------------------------------------------------------------------
16
@@ -768,7 +768,6 @@
17
 PIXEL_ADD_PS_W32_H4_avx2 32
18
 PIXEL_ADD_PS_W32_H4_avx2 64
19
 
20
-
21
 ;-----------------------------------------------------------------------------
22
 ; void pixel_add_ps_64x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
23
 ;-----------------------------------------------------------------------------
24
@@ -1145,3 +1144,505 @@
25
     RET
26
 
27
 %endif
28
+
29
+;-----------------------------------------------------------------------------
30
+; pixel_add_ps avx512 code start
31
+;-----------------------------------------------------------------------------
32
+%macro PROCESS_ADD_PS_64x4_AVX512 0
33
+    pmovzxbw    m0,         [r2]
34
+    pmovzxbw    m1,         [r2 + mmsize/2]
35
+    movu        m2,         [r3]
36
+    movu        m3,         [r3 + mmsize]
37
+    paddw       m0,         m2
38
+    paddw       m1,         m3
39
+    packuswb    m0,         m1
40
+    vpermq      m0,         m4,      m0
41
+    movu        [r0],       m0
42
+    pmovzxbw    m0,         [r2 + r4]
43
+    pmovzxbw    m1,         [r2 + r4 + mmsize/2]
44
+    movu        m2,         [r3 + r5]
45
+    movu        m3,         [r3 + r5 + mmsize]
46
+    paddw       m0,         m2
47
+    paddw       m1,         m3
48
+    packuswb    m0,         m1
49
+    vpermq      m0,         m4,      m0
50
+    movu        [r0 + r1],  m0
51
+    pmovzxbw    m0,         [r2 + 2 * r4]
52
+    pmovzxbw    m1,         [r2 + 2 * r4 + mmsize/2]
53
+    movu        m2,         [r3 + 2 * r5]
54
+    movu        m3,         [r3 + 2 * r5 + mmsize]
55
+    paddw       m0,         m2
56
+    paddw       m1,         m3
57
+    packuswb    m0,         m1
58
+    vpermq      m0,         m4,      m0
59
+    movu        [r0 + 2 * r1],       m0
60
+
61
+    pmovzxbw    m0,         [r2 + r7]
62
+    pmovzxbw    m1,         [r2 + r7 + mmsize/2]
63
+    movu        m2,         [r3 + r8]
64
+    movu        m3,         [r3 + r8 + mmsize]
65
+    paddw       m0,         m2
66
+    paddw       m1,         m3
67
+    packuswb    m0,         m1
68
+    vpermq      m0,         m4,      m0
69
+    movu        [r0 + r6],       m0
70
+%endmacro
71
+
72
+%macro PROCESS_ADD_PS_64x4_HBD_AVX512 0
73
+    movu    m0,     [r2]
74
+    movu    m1,     [r2 + mmsize]
75
+    movu    m2,     [r3]
76
+    movu    m3,     [r3 + mmsize]
77
+    paddw   m0,     m2
78
+    paddw   m1,     m3
79
+
80
+    CLIPW2  m0, m1, m4, m5
81
+    movu    [r0],                m0
82
+    movu    [r0 + mmsize],       m1
83
+
84
+    movu    m0,     [r2 + r4]
85
+    movu    m1,     [r2 + r4 + mmsize]
86
+    movu    m2,     [r3 + r5]
87
+    movu    m3,     [r3 + r5 + mmsize]
88
+    paddw   m0,     m2
89
+    paddw   m1,     m3
90
+
91
+    CLIPW2  m0, m1, m4, m5
92
+    movu    [r0 + r1],           m0
93
+    movu    [r0 + r1 + mmsize],  m1
94
+
95
+    movu    m0,     [r2 + r4 * 2]
96
+    movu    m1,     [r2 + r4 * 2 + mmsize]
97
+    movu    m2,     [r3 + r5 * 2]
98
+    movu    m3,     [r3 + r5 * 2 + mmsize]
99
+    paddw   m0,     m2
100
+    paddw   m1,     m3
101
+
102
+    CLIPW2  m0, m1, m4, m5
103
+    movu    [r0 + r1 * 2],           m0
104
+    movu    [r0 + r1 * 2 + mmsize],  m1
105
+
106
+    movu    m0,     [r2 + r6]
107
+    movu    m1,     [r2 + r6 + mmsize]
108
+    movu    m2,     [r3 + r7]
109
+    movu    m3,     [r3 + r7 + mmsize]
110
+    paddw   m0,     m2
111
+    paddw   m1,     m3
112
+
113
+    CLIPW2  m0, m1, m4, m5
114
+    movu    [r0 + r8],               m0
115
+    movu    [r0 + r8 + mmsize],      m1
116
+%endmacro
117
+
118
+%macro PROCESS_ADD_PS_64x4_ALIGNED_AVX512 0
119
+    pmovzxbw    m0,         [r2]
120
+    pmovzxbw    m1,         [r2 + mmsize/2]
121
+    mova        m2,         [r3]
122
+    mova        m3,         [r3 + mmsize]
123
+    paddw       m0,         m2
124
+    paddw       m1,         m3
125
+    packuswb    m0,         m1
126
+    vpermq      m0,         m4,      m0
127
+    mova        [r0],       m0
128
+    pmovzxbw    m0,         [r2 + r4]
129
+    pmovzxbw    m1,         [r2 + r4 + mmsize/2]
130
+    mova        m2,         [r3 + r5]
131
+    mova        m3,         [r3 + r5 + mmsize]
132
+    paddw       m0,         m2
133
+    paddw       m1,         m3
134
+    packuswb    m0,         m1
135
+    vpermq      m0,         m4,      m0
136
+    mova        [r0 + r1],  m0
137
+    pmovzxbw    m0,         [r2 + 2 * r4]
138
+    pmovzxbw    m1,         [r2 + 2 * r4 + mmsize/2]
139
+    mova        m2,         [r3 + 2 * r5]
140
+    mova        m3,         [r3 + 2 * r5 + mmsize]
141
+    paddw       m0,         m2
142
+    paddw       m1,         m3
143
+    packuswb    m0,         m1
144
+    vpermq      m0,         m4,      m0
145
+    mova        [r0 + 2 * r1],       m0
146
+
147
+    pmovzxbw    m0,         [r2 + r7]
148
+    pmovzxbw    m1,         [r2 + r7 + mmsize/2]
149
+    mova        m2,         [r3 + r8]
150
+    mova        m3,         [r3 + r8 + mmsize]
151
+    paddw       m0,         m2
152
+    paddw       m1,         m3
153
+    packuswb    m0,         m1
154
+    vpermq      m0,         m4,      m0
155
+    mova        [r0 + r6],       m0
156
+%endmacro
157
+
158
+%macro PROCESS_ADD_PS_64x4_HBD_ALIGNED_AVX512 0
159
+    mova    m0,     [r2]
160
+    mova    m1,     [r2 + mmsize]
161
+    mova    m2,     [r3]
162
+    mova    m3,     [r3 + mmsize]
163
+    paddw   m0,     m2
164
+    paddw   m1,     m3
165
+
166
+    CLIPW2  m0, m1, m4, m5
167
+    mova    [r0],                m0
168
+    mova    [r0 + mmsize],       m1
169
+
170
+    mova    m0,     [r2 + r4]
171
+    mova    m1,     [r2 + r4 + mmsize]
172
+    mova    m2,     [r3 + r5]
173
+    mova    m3,     [r3 + r5 + mmsize]
174
+    paddw   m0,     m2
175
+    paddw   m1,     m3
176
+
177
+    CLIPW2  m0, m1, m4, m5
178
+    mova    [r0 + r1],           m0
179
+    mova    [r0 + r1 + mmsize],  m1
180
+
181
+    mova    m0,     [r2 + r4 * 2]
182
+    mova    m1,     [r2 + r4 * 2 + mmsize]
183
+    mova    m2,     [r3 + r5 * 2]
184
+    mova    m3,     [r3 + r5 * 2 + mmsize]
185
+    paddw   m0,     m2
186
+    paddw   m1,     m3
187
+
188
+    CLIPW2  m0, m1, m4, m5
189
+    mova    [r0 + r1 * 2],           m0
190
+    mova    [r0 + r1 * 2 + mmsize],  m1
191
+
192
+    mova    m0,     [r2 + r6]
193
+    mova    m1,     [r2 + r6 + mmsize]
194
+    mova    m2,     [r3 + r7]
195
+    mova    m3,     [r3 + r7 + mmsize]
196
+    paddw   m0,     m2
197
+    paddw   m1,     m3
198
+
199
+    CLIPW2  m0, m1, m4, m5
200
+    mova    [r0 + r8],               m0
201
+    mova    [r0 + r8 + mmsize],      m1
202
+%endmacro
203
+
204
+;-----------------------------------------------------------------------------
205
+; void pixel_add_ps_64x64(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
206
+;-----------------------------------------------------------------------------
207
+%if HIGH_BIT_DEPTH
208
+%if ARCH_X86_64
209
+INIT_ZMM avx512
210
+cglobal pixel_add_ps_64x64, 6, 9, 6
211
+    vbroadcasti32x8  m5,     [pw_pixel_max]
212
+    pxor             m4,     m4
213
+    add             r4d,     r4d
214
+    add             r5d,     r5d
215
+    add             r1d,     r1d
216
+    lea              r6,     [r4 * 3]
217
+    lea              r7,     [r5 * 3]
218
+    lea              r8,     [r1 * 3]
219
+%rep 15
220
+    PROCESS_ADD_PS_64x4_HBD_AVX512
221
+    lea         r2,         [r2 + r4 * 4]
222
+    lea         r3,         [r3 + r5 * 4]
223
+    lea         r0,         [r0 + r1 * 4]
224
+%endrep
225
+    PROCESS_ADD_PS_64x4_HBD_AVX512
226
+    RET
227
+
228
+INIT_ZMM avx512
229
+cglobal pixel_add_ps_aligned_64x64, 6, 9, 6
230
+    vbroadcasti32x8  m5,     [pw_pixel_max]
231
+    pxor             m4,     m4
232
+    add             r4d,     r4d
233
+    add             r5d,     r5d
234
+    add             r1d,     r1d
235
+    lea              r6,     [r4 * 3]
236
+    lea              r7,     [r5 * 3]
237
+    lea              r8,     [r1 * 3]
238
+%rep 15
239
+    PROCESS_ADD_PS_64x4_HBD_ALIGNED_AVX512
240
+    lea         r2,         [r2 + r4 * 4]
241
+    lea         r3,         [r3 + r5 * 4]
242
+    lea         r0,         [r0 + r1 * 4]
243
+%endrep
244
+    PROCESS_ADD_PS_64x4_HBD_ALIGNED_AVX512
245
+    RET
246
+%endif
247
+%else
248
+%if ARCH_X86_64
249
+INIT_ZMM avx512
250
+cglobal pixel_add_ps_64x64, 6, 9, 4
251
+    add         r5,         r5
252
+    lea         r6,         [3 * r1]
253
+    lea         r7,         [3 * r4]
254
+    lea         r8,         [3 * r5]
255
+    mova        m4,         [store_shuf1_avx512]
256
+%rep 15
257
+    PROCESS_ADD_PS_64x4_AVX512
258
+    lea         r2,         [r2 + r4 * 4]
259
+    lea         r3,         [r3 + r5 * 4]
260
+    lea         r0,         [r0 + r1 * 4]
261
+%endrep
262
+    PROCESS_ADD_PS_64x4_AVX512
263
+    RET
264
+
265
+INIT_ZMM avx512
266
+cglobal pixel_add_ps_aligned_64x64, 6, 9, 4
267
+    add         r5,         r5
268
+    lea         r6,         [3 * r1]
269
+    lea         r7,         [3 * r4]
270
+    lea         r8,         [3 * r5]
271
+    mova        m4,         [store_shuf1_avx512]
272
+%rep 15
273
+    PROCESS_ADD_PS_64x4_ALIGNED_AVX512
274
+    lea         r2,         [r2 + r4 * 4]
275
+    lea         r3,         [r3 + r5 * 4]
276
+    lea         r0,         [r0 + r1 * 4]
277
+%endrep
278
+    PROCESS_ADD_PS_64x4_ALIGNED_AVX512
279
+    RET
280
+%endif
281
+%endif
282
+
283
+%macro PROCESS_ADD_PS_32x4_AVX512 0
284
+    pmovzxbw    m0,         [r2]
285
+    movu        m1,         [r3]
286
+    pmovzxbw    m2,         [r2 + r4]
287
+    movu        m3,         [r3 + r5]
288
+    paddw       m0,         m1
289
+    paddw       m2,         m3
290
+    packuswb    m0,         m2
291
+    vpermq      m0,         m4,      m0
292
+    movu           [r0],       ym0
293
+    vextracti32x8  [r0 + r1],   m0,    1
294
+    pmovzxbw    m0,         [r2 + r4 * 2]
295
+    movu        m1,         [r3 + r5 * 2]
296
+    pmovzxbw    m2,         [r2 + r6]
297
+    movu        m3,         [r3 + r7]
298
+    paddw       m0,         m1
299
+    paddw       m2,         m3
300
+    packuswb    m0,         m2
301
+    vpermq      m0,         m4,      m0
302
+    movu           [r0 + r1 * 2],   ym0
303
+    vextracti32x8  [r0 + r8],        m0,    1
304
+%endmacro
305
+
306
+%macro PROCESS_ADD_PS_32x4_HBD_AVX512 0
307
+    movu    m0,     [r2]
308
+    movu    m1,     [r2 + r4]
309
+    movu    m2,     [r3]
310
+    movu    m3,     [r3 + r5]
311
+    paddw   m0,     m2
312
+    paddw   m1,     m3
313
+
314
+    CLIPW2  m0, m1, m4, m5
315
+    movu    [r0],                m0
316
+    movu    [r0 + r1],           m1
317
+
318
+    movu    m0,     [r2 + r4 * 2]
319
+    movu    m1,     [r2 + r6]
320
+    movu    m2,     [r3 + r5 * 2]
321
+    movu    m3,     [r3 + r7]
322
+    paddw   m0,     m2
323
+    paddw   m1,     m3
324
+
325
+    CLIPW2  m0, m1, m4, m5
326
+    movu    [r0 + r1 * 2],           m0
327
+    movu    [r0 + r8],               m1
328
+%endmacro
329
+
330
+%macro PROCESS_ADD_PS_32x4_ALIGNED_AVX512 0
331
+    pmovzxbw    m0,         [r2]
332
+    mova        m1,         [r3]
333
+    pmovzxbw    m2,         [r2 + r4]
334
+    mova        m3,         [r3 + r5]
335
+    paddw       m0,         m1
336
+    paddw       m2,         m3
337
+    packuswb    m0,         m2
338
+    vpermq      m0,         m4,      m0
339
+    mova           [r0],       ym0
340
+    vextracti32x8  [r0 + r1],   m0,    1
341
+    pmovzxbw    m0,         [r2 + r4 * 2]
342
+    mova        m1,         [r3 + r5 * 2]
343
+    pmovzxbw    m2,         [r2 + r6]
344
+    mova        m3,         [r3 + r7]
345
+    paddw       m0,         m1
346
+    paddw       m2,         m3
347
+    packuswb    m0,         m2
348
+    vpermq      m0,         m4,      m0
349
+    mova           [r0 + r1 * 2],   ym0
350
+    vextracti32x8  [r0 + r8],        m0,    1
351
+%endmacro
352
+
353
+%macro PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512 0
354
+    mova    m0,     [r2]
355
+    mova    m1,     [r2 + r4]
356
+    mova    m2,     [r3]
357
+    mova    m3,     [r3 + r5]
358
+    paddw   m0,     m2
359
+    paddw   m1,     m3
360
+
361
+    CLIPW2  m0, m1, m4, m5
362
+    mova    [r0],                m0
363
+    mova    [r0 + r1],           m1
364
+
365
+    mova    m0,     [r2 + r4 * 2]
366
+    mova    m1,     [r2 + r6]
367
+    mova    m2,     [r3 + r5 * 2]
368
+    mova    m3,     [r3 + r7]
369
+    paddw   m0,     m2
370
+    paddw   m1,     m3
371
+
372
+    CLIPW2  m0, m1, m4, m5
373
+    mova    [r0 + r1 * 2],           m0
374
+    mova    [r0 + r8],               m1
375
+%endmacro
376
+
377
+;-----------------------------------------------------------------------------
378
+; void pixel_add_ps_32x32(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)
379
+;-----------------------------------------------------------------------------
380
+%if HIGH_BIT_DEPTH
381
+%if ARCH_X86_64
382
+INIT_ZMM avx512
383
+cglobal pixel_add_ps_32x32, 6, 9, 6
384
+    vbroadcasti32x8  m5,     [pw_pixel_max]
385
+    pxor             m4,     m4
386
+    add             r4d,     r4d
387
+    add             r5d,     r5d
388
+    add             r1d,     r1d
389
+    lea              r6,     [r4 * 3]
390
+    lea              r7,     [r5 * 3]
391
+    lea              r8,     [r1 * 3]
392
+%rep 7
393
+    PROCESS_ADD_PS_32x4_HBD_AVX512
394
+    lea         r2,         [r2 + r4 * 4]
395
+    lea         r3,         [r3 + r5 * 4]
396
+    lea         r0,         [r0 + r1 * 4]
397
+%endrep
398
+    PROCESS_ADD_PS_32x4_HBD_AVX512
399
+    RET
400
+
401
+INIT_ZMM avx512
402
+cglobal pixel_add_ps_32x64, 6, 9, 6
403
+    vbroadcasti32x8  m5,     [pw_pixel_max]
404
+    pxor             m4,     m4
405
+    add             r4d,     r4d
406
+    add             r5d,     r5d
407
+    add             r1d,     r1d
408
+    lea              r6,     [r4 * 3]
409
+    lea              r7,     [r5 * 3]
410
+    lea              r8,     [r1 * 3]
411
+%rep 15
412
+    PROCESS_ADD_PS_32x4_HBD_AVX512
413
+    lea         r2,         [r2 + r4 * 4]
414
+    lea         r3,         [r3 + r5 * 4]
415
+    lea         r0,         [r0 + r1 * 4]
416
+%endrep
417
+    PROCESS_ADD_PS_32x4_HBD_AVX512
418
+    RET
419
+
420
+INIT_ZMM avx512
421
+cglobal pixel_add_ps_aligned_32x32, 6, 9, 6
422
+    vbroadcasti32x8  m5,     [pw_pixel_max]
423
+    pxor             m4,     m4
424
+    add             r4d,     r4d
425
+    add             r5d,     r5d
426
+    add             r1d,     r1d
427
+    lea              r6,     [r4 * 3]
428
+    lea              r7,     [r5 * 3]
429
+    lea              r8,     [r1 * 3]
430
+%rep 7
431
+    PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512
432
+    lea         r2,         [r2 + r4 * 4]
433
+    lea         r3,         [r3 + r5 * 4]
434
+    lea         r0,         [r0 + r1 * 4]
435
+%endrep
436
+    PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512
437
+    RET
438
+
439
+INIT_ZMM avx512
440
+cglobal pixel_add_ps_aligned_32x64, 6, 9, 6
441
+    vbroadcasti32x8  m5,     [pw_pixel_max]
442
+    pxor             m4,     m4
443
+    add             r4d,     r4d
444
+    add             r5d,     r5d
445
+    add             r1d,     r1d
446
+    lea              r6,     [r4 * 3]
447
+    lea              r7,     [r5 * 3]
448
+    lea              r8,     [r1 * 3]
449
+%rep 15
450
+    PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512
451
+    lea         r2,         [r2 + r4 * 4]
452
+    lea         r3,         [r3 + r5 * 4]
453
+    lea         r0,         [r0 + r1 * 4]
454
+%endrep
455
+    PROCESS_ADD_PS_32x4_HBD_ALIGNED_AVX512
456
+    RET
457
+%endif
458
+%else
459
+%if ARCH_X86_64
460
+INIT_ZMM avx512
461
+cglobal pixel_add_ps_32x32, 6, 9, 5
462
+    add         r5,         r5
463
+    lea         r6,         [r4 * 3]
464
+    lea         r7,         [r5 * 3]
465
+    lea         r8,         [r1 * 3]
466
+    mova        m4,         [store_shuf1_avx512]
467
+%rep 7
468
+    PROCESS_ADD_PS_32x4_AVX512
469
+    lea         r2,         [r2 + r4 * 4]
470
+    lea         r3,         [r3 + r5 * 4]
471
+    lea         r0,         [r0 + r1 * 4]
472
+%endrep
473
+    PROCESS_ADD_PS_32x4_AVX512
474
+    RET
475
+
476
+INIT_ZMM avx512
477
+cglobal pixel_add_ps_32x64, 6, 9, 5
478
+    add         r5,         r5
479
+    lea         r6,         [r4 * 3]
480
+    lea         r7,         [r5 * 3]
481
+    lea         r8,         [r1 * 3]
482
+    mova        m4,         [store_shuf1_avx512]
483
+
484
+%rep 15
485
+    PROCESS_ADD_PS_32x4_AVX512
486
+    lea         r2,         [r2 + r4 * 4]
487
+    lea         r3,         [r3 + r5 * 4]
488
+    lea         r0,         [r0 + r1 * 4]
489
+%endrep
490
+    PROCESS_ADD_PS_32x4_AVX512
491
+    RET
492
+
493
+INIT_ZMM avx512
494
+cglobal pixel_add_ps_aligned_32x32, 6, 9, 5
495
+    add         r5,         r5
496
+    lea         r6,         [r4 * 3]
497
+    lea         r7,         [r5 * 3]
498
+    lea         r8,         [r1 * 3]
499
+    mova        m4,         [store_shuf1_avx512]
500
+%rep 7
501
+    PROCESS_ADD_PS_32x4_ALIGNED_AVX512
502
+    lea         r2,         [r2 + r4 * 4]
503
+    lea         r3,         [r3 + r5 * 4]
504
+    lea         r0,         [r0 + r1 * 4]
505
+%endrep
506
+    PROCESS_ADD_PS_32x4_ALIGNED_AVX512
507
+    RET
508
+
509
+INIT_ZMM avx512
510
+cglobal pixel_add_ps_aligned_32x64, 6, 9, 5
511
+    add         r5,         r5
512
+    lea         r6,         [r4 * 3]
513
+    lea         r7,         [r5 * 3]
514
+    lea         r8,         [r1 * 3]
515
+    mova        m4,         [store_shuf1_avx512]
516
+
517
+%rep 15
518
+    PROCESS_ADD_PS_32x4_ALIGNED_AVX512
519
+    lea         r2,         [r2 + r4 * 4]
520
+    lea         r3,         [r3 + r5 * 4]
521
+    lea         r0,         [r0 + r1 * 4]
522
+%endrep
523
+    PROCESS_ADD_PS_32x4_ALIGNED_AVX512
524
+    RET
525
+%endif
526
+%endif
527
+;-----------------------------------------------------------------------------
528
+; pixel_add_ps avx512 code end
529
+;-----------------------------------------------------------------------------
530
x265_2.7.tar.gz/source/common/x86/sad-a.asm -> x265_2.9.tar.gz/source/common/x86/sad-a.asm Changed
877
 
1
@@ -378,111 +378,63 @@
2
     lea     r0,  [r0 + r1]
3
 %endmacro
4
 
5
-%macro SAD_W16 0
6
-;-----------------------------------------------------------------------------
7
-; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
8
-;-----------------------------------------------------------------------------
9
-cglobal pixel_sad_16x16, 4,4,8
10
-    movu    m0, [r2]
11
-    movu    m1, [r2+r3]
12
-    lea     r2, [r2+2*r3]
13
-    movu    m2, [r2]
14
-    movu    m3, [r2+r3]
15
-    lea     r2, [r2+2*r3]
16
-    psadbw  m0, [r0]
17
-    psadbw  m1, [r0+r1]
18
-    lea     r0, [r0+2*r1]
19
-    movu    m4, [r2]
20
-    paddw   m0, m1
21
-    psadbw  m2, [r0]
22
-    psadbw  m3, [r0+r1]
23
-    lea     r0, [r0+2*r1]
24
-    movu    m5, [r2+r3]
25
-    lea     r2, [r2+2*r3]
26
-    paddw   m2, m3
27
-    movu    m6, [r2]
28
-    movu    m7, [r2+r3]
29
-    lea     r2, [r2+2*r3]
30
-    paddw   m0, m2
31
-    psadbw  m4, [r0]
32
-    psadbw  m5, [r0+r1]
33
-    lea     r0, [r0+2*r1]
34
-    movu    m1, [r2]
35
-    paddw   m4, m5
36
-    psadbw  m6, [r0]
37
-    psadbw  m7, [r0+r1]
38
-    lea     r0, [r0+2*r1]
39
-    movu    m2, [r2+r3]
40
-    lea     r2, [r2+2*r3]
41
-    paddw   m6, m7
42
-    movu    m3, [r2]
43
-    paddw   m0, m4
44
-    movu    m4, [r2+r3]
45
-    lea     r2, [r2+2*r3]
46
-    paddw   m0, m6
47
-    psadbw  m1, [r0]
48
-    psadbw  m2, [r0+r1]
49
-    lea     r0, [r0+2*r1]
50
-    movu    m5, [r2]
51
-    paddw   m1, m2
52
-    psadbw  m3, [r0]
53
-    psadbw  m4, [r0+r1]
54
-    lea     r0, [r0+2*r1]
55
-    movu    m6, [r2+r3]
56
-    lea     r2, [r2+2*r3]
57
-    paddw   m3, m4
58
-    movu    m7, [r2]
59
-    paddw   m0, m1
60
-    movu    m1, [r2+r3]
61
-    paddw   m0, m3
62
-    psadbw  m5, [r0]
63
-    psadbw  m6, [r0+r1]
64
-    lea     r0, [r0+2*r1]
65
-    paddw   m5, m6
66
-    psadbw  m7, [r0]
67
-    psadbw  m1, [r0+r1]
68
-    paddw   m7, m1
69
-    paddw   m0, m5
70
-    paddw   m0, m7
71
-    SAD_END_SSE2
72
+%macro SAD_W16 1 ; h
73
+cglobal pixel_sad_16x%1, 4,4
74
+%ifidn cpuname, sse2
75
+.skip_prologue:
76
+%endif
77
+%assign %%i 0
78
+%if ARCH_X86_64
79
+    lea  r6, [3*r1] ; r6 results in fewer REX prefixes than r4 and both are volatile
80
+    lea  r5, [3*r3]
81
+%rep %1/4
82
+    movu     m1, [r2]
83
+    psadbw   m1, [r0]
84
+    movu     m3, [r2+r3]
85
+    psadbw   m3, [r0+r1]
86
+    movu     m2, [r2+2*r3]
87
+    psadbw   m2, [r0+2*r1]
88
+    movu     m4, [r2+r5]
89
+    psadbw   m4, [r0+r6]
90
+%if %%i != %1/4-1
91
+    lea      r2, [r2+4*r3]
92
+    lea      r0, [r0+4*r1]
93
+%endif
94
+    paddw    m1, m3
95
+    paddw    m2, m4
96
+    ACCUM paddw, 0, 1, %%i
97
+    paddw    m0, m2
98
+    %assign %%i %%i+1
99
+%endrep
100
+%else     ; The cost of having to save and restore registers on x86-32
101
+%rep %1/2 ; nullifies the benefit of having 3*stride in registers.
102
+    movu     m1, [r2]
103
+    psadbw   m1, [r0]
104
+    movu     m2, [r2+r3]
105
+    psadbw   m2, [r0+r1]
106
+%if %%i != %1/2-1
107
+    lea      r2, [r2+2*r3]
108
+    lea      r0, [r0+2*r1]
109
+%endif
110
+    ACCUM paddw, 0, 1, %%i
111
+    paddw    m0, m2
112
+    %assign %%i %%i+1
113
+%endrep
114
+%endif
115
+     SAD_END_SSE2
116
+ %endmacro
117
 
118
-;-----------------------------------------------------------------------------
119
-; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
120
-;-----------------------------------------------------------------------------
121
-cglobal pixel_sad_16x8, 4,4
122
-    movu    m0, [r2]
123
-    movu    m2, [r2+r3]
124
-    lea     r2, [r2+2*r3]
125
-    movu    m3, [r2]
126
-    movu    m4, [r2+r3]
127
-    psadbw  m0, [r0]
128
-    psadbw  m2, [r0+r1]
129
-    lea     r0, [r0+2*r1]
130
-    psadbw  m3, [r0]
131
-    psadbw  m4, [r0+r1]
132
-    lea     r0, [r0+2*r1]
133
-    lea     r2, [r2+2*r3]
134
-    paddw   m0, m2
135
-    paddw   m3, m4
136
-    paddw   m0, m3
137
-    movu    m1, [r2]
138
-    movu    m2, [r2+r3]
139
-    lea     r2, [r2+2*r3]
140
-    movu    m3, [r2]
141
-    movu    m4, [r2+r3]
142
-    psadbw  m1, [r0]
143
-    psadbw  m2, [r0+r1]
144
-    lea     r0, [r0+2*r1]
145
-    psadbw  m3, [r0]
146
-    psadbw  m4, [r0+r1]
147
-    lea     r0, [r0+2*r1]
148
-    lea     r2, [r2+2*r3]
149
-    paddw   m1, m2
150
-    paddw   m3, m4
151
-    paddw   m0, m1
152
-    paddw   m0, m3
153
-    SAD_END_SSE2
154
+INIT_XMM sse2
155
+SAD_W16 8
156
+SAD_W16 16
157
+INIT_XMM sse3
158
+SAD_W16 8
159
+SAD_W16 16
160
+INIT_XMM sse2, aligned
161
+SAD_W16 8
162
+SAD_W16 16
163
 
164
+%macro SAD_Wx 0
165
 ;-----------------------------------------------------------------------------
166
 ; int pixel_sad_16x12( uint8_t *, intptr_t, uint8_t *, intptr_t )
167
 ;-----------------------------------------------------------------------------
168
@@ -808,11 +760,11 @@
169
 %endmacro
170
 
171
 INIT_XMM sse2
172
-SAD_W16
173
+SAD_Wx
174
 INIT_XMM sse3
175
-SAD_W16
176
+SAD_Wx
177
 INIT_XMM sse2, aligned
178
-SAD_W16
179
+SAD_Wx
180
 
181
 %macro SAD_INC_4x8P_SSE 1
182
     movq    m1, [r0]
183
@@ -841,7 +793,132 @@
184
     SAD_INC_4x8P_SSE 1
185
     SAD_INC_4x8P_SSE 1
186
     SAD_END_SSE2
187
+
188
+%macro SAD_W48_AVX512 3 ; w, h, d/q
189
+cglobal pixel_sad_%1x%2, 4,4
190
+    kxnorb        k1, k1, k1
191
+    kaddb         k1, k1, k1
192
+%assign %%i 0
193
+%if ARCH_X86_64 && %2 != 4
194
+    lea           r6, [3*r1]
195
+    lea           r5, [3*r3]
196
+%rep %2/4
197
+    mov%3         m1,      [r0]
198
+    vpbroadcast%3 m1 {k1}, [r0+r1]
199
+    mov%3         m3,      [r2]
200
+    vpbroadcast%3 m3 {k1}, [r2+r3]
201
+    mov%3         m2,      [r0+2*r1]
202
+    vpbroadcast%3 m2 {k1}, [r0+r6]
203
+    mov%3         m4,      [r2+2*r3]
204
+    vpbroadcast%3 m4 {k1}, [r2+r5]
205
+%if %%i != %2/4-1
206
+    lea           r0, [r0+4*r1]
207
+    lea           r2, [r2+4*r3]
208
+%endif
209
+    psadbw        m1, m3
210
+    psadbw        m2, m4
211
+    ACCUM      paddd, 0, 1, %%i
212
+    paddd         m0, m2
213
+    %assign %%i %%i+1
214
+%endrep
215
+%else
216
+%rep %2/2
217
+    mov%3         m1,      [r0]
218
+    vpbroadcast%3 m1 {k1}, [r0+r1]
219
+    mov%3         m2,      [r2]
220
+    vpbroadcast%3 m2 {k1}, [r2+r3]
221
+%if %%i != %2/2-1
222
+    lea           r0, [r0+2*r1]
223
+    lea           r2, [r2+2*r3]
224
+%endif
225
+    psadbw        m1, m2
226
+    ACCUM      paddd, 0, 1, %%i
227
+    %assign %%i %%i+1
228
+%endrep
229
+%endif
230
+%if %1 == 8
231
+    punpckhqdq    m1, m0, m0
232
+    paddd         m0, m1
233
+%endif
234
+    movd         eax, m0
235
+    RET
236
+%endmacro
237
+
238
+INIT_XMM avx512
239
+SAD_W48_AVX512 4,  4, d
240
+SAD_W48_AVX512 4,  8, d
241
+SAD_W48_AVX512 4, 16, d
242
+SAD_W48_AVX512 8,  4, q
243
+SAD_W48_AVX512 8,  8, q
244
+SAD_W48_AVX512 8, 16, q
245
+
246
+%macro SAD_W16_AVX512_START 1 ; h
247
+    cmp  r1d, 16                           ; optimized for width = 16, which has the
248
+    jne pixel_sad_16x%1_sse2.skip_prologue ; rows laid out contiguously in memory
249
+    lea   r1, [3*r3]
250
+%endmacro
251
+
252
+%macro SAD_W16_AVX512_END 0
253
+    paddd          m0, m1
254
+    paddd          m0, m2
255
+    paddd          m0, m3
256
+%if mmsize == 64
257
+    vextracti32x8 ym1, m0, 1
258
+    paddd         ym0, ym1
259
+%endif
260
+    vextracti128  xm1, ym0, 1
261
+    paddd        xmm0, xm0, xm1
262
+    punpckhqdq   xmm1, xmm0, xmm0
263
+    paddd        xmm0, xmm1
264
+    movd          eax, xmm0
265
     RET
266
+%endmacro
267
+
268
+INIT_YMM avx512
269
+cglobal pixel_sad_16x8, 4,4
270
+    SAD_W16_AVX512_START 8
271
+    movu         xm0, [r2]
272
+    vinserti128   m0, [r2+r3], 1
273
+    psadbw        m0, [r0+0*32]
274
+    movu         xm1, [r2+2*r3]
275
+    vinserti128   m1, [r2+r1], 1
276
+    lea           r2, [r2+4*r3]
277
+    psadbw        m1, [r0+1*32]
278
+    movu         xm2, [r2]
279
+    vinserti128   m2, [r2+r3], 1
280
+    psadbw        m2, [r0+2*32]
281
+    movu         xm3, [r2+2*r3]
282
+    vinserti128   m3, [r2+r1], 1
283
+    psadbw        m3, [r0+3*32]
284
+    SAD_W16_AVX512_END
285
+
286
+INIT_ZMM avx512
287
+cglobal pixel_sad_16x16, 4,4
288
+    SAD_W16_AVX512_START 16
289
+    movu          xm0, [r2]
290
+    vinserti128   ym0, [r2+r3],   1
291
+    movu          xm1, [r2+4*r3]
292
+    vinserti32x4   m0, [r2+2*r3], 2
293
+    vinserti32x4   m1, [r2+2*r1], 2
294
+    vinserti32x4   m0, [r2+r1],   3
295
+    lea            r2, [r2+4*r3]
296
+    vinserti32x4   m1, [r2+r3],   1
297
+    psadbw         m0, [r0+0*64]
298
+    vinserti32x4   m1, [r2+r1],   3
299
+    lea            r2, [r2+4*r3]
300
+    psadbw         m1, [r0+1*64]
301
+    movu          xm2, [r2]
302
+    vinserti128   ym2, [r2+r3],   1
303
+    movu          xm3, [r2+4*r3]
304
+    vinserti32x4   m2, [r2+2*r3], 2
305
+    vinserti32x4   m3, [r2+2*r1], 2
306
+    vinserti32x4   m2, [r2+r1],   3
307
+    lea            r2, [r2+4*r3]
308
+    vinserti32x4   m3, [r2+r3],   1
309
+    psadbw         m2, [r0+2*64]
310
+    vinserti32x4   m3, [r2+r1],   3
311
+    psadbw         m3, [r0+3*64]
312
+    SAD_W16_AVX512_END
313
 
314
 ;=============================================================================
315
 ; SAD x3/x4 MMX
316
@@ -4051,6 +4128,263 @@
317
     SAD_X4_48x8_AVX2
318
     PIXEL_SAD_X4_END_AVX2
319
     RET
320
+
321
+;------------------------------------------------------------
322
+;sad_x4 avx512 code start
323
+;------------------------------------------------------------
324
+%macro PROCESS_SAD_X4_64x4_AVX512 0
325
+    movu            m4, [r0]
326
+    movu            m5, [r1]
327
+    movu            m6, [r2]
328
+    movu            m7, [r3]
329
+    movu            m8, [r4]
330
+
331
+    psadbw          m9, m4, m5
332
+    psadbw          m5, m4, m6
333
+    psadbw          m6, m4, m7
334
+    psadbw          m4, m8
335
+
336
+    paddd           m0, m9
337
+    paddd           m1, m5
338
+    paddd           m2, m6
339
+    paddd           m3, m4
340
+
341
+    movu            m4, [r0 + FENC_STRIDE]
342
+    movu            m5, [r1 + r5]
343
+    movu            m6, [r2 + r5]
344
+    movu            m7, [r3 + r5]
345
+    movu            m8, [r4 + r5]
346
+
347
+    psadbw          m9, m4, m5
348
+    psadbw          m5, m4, m6
349
+    psadbw          m6, m4, m7
350
+    psadbw          m4, m8
351
+    paddd           m0, m9
352
+    paddd           m1, m5
353
+    paddd           m2, m6
354
+    paddd           m3, m4
355
+
356
+    movu            m4, [r0 + FENC_STRIDE * 2]
357
+    movu            m5, [r1 + r5 * 2]
358
+    movu            m6, [r2 + r5 * 2]
359
+    movu            m7, [r3 + r5 * 2]
360
+    movu            m8, [r4 + r5 * 2]
361
+
362
+    psadbw          m9, m4, m5
363
+    psadbw          m5, m4, m6
364
+    psadbw          m6, m4, m7
365
+    psadbw          m4, m8
366
+
367
+    paddd           m0, m9
368
+    paddd           m1, m5
369
+    paddd           m2, m6
370
+    paddd           m3, m4
371
+
372
+    movu            m4, [r0 + FENC_STRIDE * 3]
373
+    movu            m5, [r1 + r7]
374
+    movu            m6, [r2 + r7]
375
+    movu            m7, [r3 + r7]
376
+    movu            m8, [r4 + r7]
377
+
378
+    psadbw          m9, m4, m5
379
+    psadbw          m5, m4, m6
380
+    psadbw          m6, m4, m7
381
+    psadbw          m4, m8
382
+    paddd           m0, m9
383
+    paddd           m1, m5
384
+    paddd           m2, m6
385
+    paddd           m3, m4
386
+%endmacro
387
+
388
+%macro PROCESS_SAD_X4_32x4_AVX512 0
389
+    movu            ym4, [r0]
390
+    movu            ym5, [r1]
391
+    movu            ym6, [r2]
392
+    movu            ym7, [r3]
393
+    movu            ym8, [r4]
394
+
395
+    vinserti32x8    m4, [r0 + FENC_STRIDE], 1
396
+    vinserti32x8    m5, [r1 + r5], 1
397
+    vinserti32x8    m6, [r2 + r5], 1
398
+    vinserti32x8    m7, [r3 + r5], 1
399
+    vinserti32x8    m8, [r4 + r5], 1
400
+
401
+    psadbw          m9, m4, m5
402
+    psadbw          m5, m4, m6
403
+    psadbw          m6, m4, m7
404
+    psadbw          m4, m8
405
+
406
+    paddd           m0, m9
407
+    paddd           m1, m5
408
+    paddd           m2, m6
409
+    paddd           m3, m4
410
+
411
+    movu            ym4, [r0 + FENC_STRIDE * 2]
412
+    movu            ym5, [r1 + r5 * 2]
413
+    movu            ym6, [r2 + r5 * 2]
414
+    movu            ym7, [r3 + r5 * 2]
415
+    movu            ym8, [r4 + r5 * 2]
416
+
417
+    vinserti32x8     m4, [r0 + FENC_STRIDE * 3], 1
418
+    vinserti32x8     m5, [r1 + r7], 1
419
+    vinserti32x8     m6, [r2 + r7], 1
420
+    vinserti32x8     m7, [r3 + r7], 1
421
+    vinserti32x8     m8, [r4 + r7], 1
422
+
423
+    psadbw          m9, m4, m5
424
+    psadbw          m5, m4, m6
425
+    psadbw          m6, m4, m7
426
+    psadbw          m4, m8
427
+
428
+    paddd           m0, m9
429
+    paddd           m1, m5
430
+    paddd           m2, m6
431
+    paddd           m3, m4
432
+%endmacro
433
+
434
+%macro PROCESS_SAD_X4_48x4_AVX512 0
435
+    movu            ym4, [r0]
436
+    movu            ym5, [r1]
437
+    movu            ym6, [r2]
438
+    movu            ym7, [r3]
439
+    movu            ym8, [r4]
440
+
441
+    vinserti32x8    m4, [r0 + FENC_STRIDE], 1
442
+    vinserti32x8    m5, [r1 + r5], 1
443
+    vinserti32x8    m6, [r2 + r5], 1
444
+    vinserti32x8    m7, [r3 + r5], 1
445
+    vinserti32x8    m8, [r4 + r5], 1
446
+
447
+    psadbw          m9, m4, m5
448
+    psadbw          m5, m4, m6
449
+    psadbw          m6, m4, m7
450
+    psadbw          m4, m8
451
+
452
+    paddd           m0, m9
453
+    paddd           m1, m5
454
+    paddd           m2, m6
455
+    paddd           m3, m4
456
+
457
+    movu            ym4, [r0 + FENC_STRIDE * 2]
458
+    movu            ym5, [r1 + r5 * 2]
459
+    movu            ym6, [r2 + r5 * 2]
460
+    movu            ym7, [r3 + r5 * 2]
461
+    movu            ym8, [r4 + r5 * 2]
462
+
463
+    vinserti32x8     m4, [r0 + FENC_STRIDE * 3], 1
464
+    vinserti32x8     m5, [r1 + r7], 1
465
+    vinserti32x8     m6, [r2 + r7], 1
466
+    vinserti32x8     m7, [r3 + r7], 1
467
+    vinserti32x8     m8, [r4 + r7], 1
468
+
469
+    psadbw          m9, m4, m5
470
+    psadbw          m5, m4, m6
471
+    psadbw          m6, m4, m7
472
+    psadbw          m4, m8
473
+
474
+    paddd           m0, m9
475
+    paddd           m1, m5
476
+    paddd           m2, m6
477
+    paddd           m3, m4
478
+
479
+    movu           xm4, [r0 + mmsize/2]
480
+    movu           xm5, [r1 + mmsize/2]
481
+    movu           xm6, [r2 + mmsize/2]
482
+    movu           xm7, [r3 + mmsize/2]
483
+    movu           xm8, [r4 + mmsize/2]
484
+    vinserti32x4    m4, [r0 + FENC_STRIDE + mmsize/2], 1
485
+    vinserti32x4    m5, [r1 + r5 + mmsize/2], 1
486
+    vinserti32x4    m6, [r2 + r5 + mmsize/2], 1
487
+    vinserti32x4    m7, [r3 + r5 + mmsize/2], 1
488
+    vinserti32x4    m8, [r4 + r5 + mmsize/2], 1
489
+
490
+    vinserti32x4    m4, [r0 + FENC_STRIDE * 2 + mmsize/2], 2
491
+    vinserti32x4    m5, [r1 + r5 * 2 + mmsize/2], 2
492
+    vinserti32x4    m6, [r2 + r5 * 2 + mmsize/2], 2
493
+    vinserti32x4    m7, [r3 + r5 * 2 + mmsize/2], 2
494
+    vinserti32x4    m8, [r4 + r5 * 2 + mmsize/2], 2
495
+    vinserti32x4    m4, [r0 + FENC_STRIDE * 3 + mmsize/2], 3
496
+    vinserti32x4    m5, [r1 + r7 + mmsize/2], 3
497
+    vinserti32x4    m6, [r2 + r7 + mmsize/2], 3
498
+    vinserti32x4    m7, [r3 + r7 + mmsize/2], 3
499
+    vinserti32x4    m8, [r4 + r7 + mmsize/2], 3
500
+
501
+    psadbw          m9, m4, m5
502
+    psadbw          m5, m4, m6
503
+    psadbw          m6, m4, m7
504
+    psadbw          m4, m8
505
+    paddd           m0, m9
506
+    paddd           m1, m5
507
+    paddd           m2, m6
508
+    paddd           m3, m4
509
+%endmacro
510
+
511
+%macro PIXEL_SAD_X4_END_AVX512 0
512
+    vextracti32x8  ym4, m0, 1
513
+    vextracti32x8  ym5, m1, 1
514
+    vextracti32x8  ym6, m2, 1
515
+    vextracti32x8  ym7, m3, 1
516
+    paddd          ym0, ym4
517
+    paddd          ym1, ym5
518
+    paddd          ym2, ym6
519
+    paddd          ym3, ym7
520
+    vextracti64x2  xm4, m0, 1
521
+    vextracti64x2  xm5, m1, 1
522
+    vextracti64x2  xm6, m2, 1
523
+    vextracti64x2  xm7, m3, 1
524
+    paddd          xm0, xm4
525
+    paddd          xm1, xm5
526
+    paddd          xm2, xm6
527
+    paddd          xm3, xm7
528
+    pshufd         xm4, xm0, 2
529
+    pshufd         xm5, xm1, 2
530
+    pshufd         xm6, xm2, 2
531
+    pshufd         xm7, xm3, 2
532
+    paddd          xm0, xm4
533
+    paddd          xm1, xm5
534
+    paddd          xm2, xm6
535
+    paddd          xm3, xm7
536
+    movd           [r6 + 0], xm0
537
+    movd           [r6 + 4], xm1
538
+    movd           [r6 + 8], xm2
539
+    movd           [r6 + 12], xm3
540
+%endmacro
541
+
542
+%macro SAD_X4_AVX512 2
543
+INIT_ZMM avx512
544
+cglobal pixel_sad_x4_%1x%2, 7,8,10
545
+    pxor            m0, m0
546
+    pxor            m1, m1
547
+    pxor            m2, m2
548
+    pxor            m3, m3
549
+    lea             r7, [r5 * 3]
550
+
551
+%rep %2/4 - 1
552
+    PROCESS_SAD_X4_%1x4_AVX512
553
+    add             r0, FENC_STRIDE * 4
554
+    lea             r1, [r1 + r5 * 4]
555
+    lea             r2, [r2 + r5 * 4]
556
+    lea             r3, [r3 + r5 * 4]
557
+    lea             r4, [r4 + r5 * 4]
558
+%endrep
559
+    PROCESS_SAD_X4_%1x4_AVX512
560
+    PIXEL_SAD_X4_END_AVX512
561
+    RET
562
+%endmacro
563
+
564
+SAD_X4_AVX512 64, 64
565
+SAD_X4_AVX512 64, 48
566
+SAD_X4_AVX512 64, 32
567
+SAD_X4_AVX512 64, 16
568
+SAD_X4_AVX512 32, 64
569
+SAD_X4_AVX512 32, 32
570
+SAD_X4_AVX512 32, 24
571
+SAD_X4_AVX512 32, 16
572
+SAD_X4_AVX512 32, 8
573
+SAD_X4_AVX512 48, 64
574
+;------------------------------------------------------------
575
+;sad_x4 avx512 code end
576
+;------------------------------------------------------------
577
 %endif
578
 
579
 INIT_XMM sse2
580
@@ -5517,6 +5851,218 @@
581
     RET
582
 %endif
583
 
584
+;------------------------------------------------------------
585
+;sad_x3 avx512 code start
586
+;------------------------------------------------------------
587
+%macro PROCESS_SAD_X3_64x4_AVX512 0
588
+    movu            m3, [r0]
589
+    movu            m4, [r1]
590
+    movu            m5, [r2]
591
+    movu            m6, [r3]
592
+
593
+    psadbw          m7, m3, m4
594
+    psadbw          m4, m3, m5
595
+    psadbw          m3, m6
596
+
597
+    paddd           m0, m7
598
+    paddd           m1, m4
599
+    paddd           m2, m3
600
+
601
+    movu            m3, [r0 + FENC_STRIDE]
602
+    movu            m4, [r1 + r4]
603
+    movu            m5, [r2 + r4]
604
+    movu            m6, [r3 + r4]
605
+
606
+    psadbw          m7, m3, m4
607
+    psadbw          m4, m3, m5
608
+    psadbw          m3, m6
609
+
610
+    paddd           m0, m7
611
+    paddd           m1, m4
612
+    paddd           m2, m3
613
+
614
+    movu            m3, [r0 + FENC_STRIDE * 2]
615
+    movu            m4, [r1 + r4 * 2]
616
+    movu            m5, [r2 + r4 * 2]
617
+    movu            m6, [r3 + r4 * 2]
618
+
619
+    psadbw          m7, m3, m4
620
+    psadbw          m4, m3, m5
621
+    psadbw          m3, m6
622
+
623
+    paddd           m0, m7
624
+    paddd           m1, m4
625
+    paddd           m2, m3
626
+
627
+    movu            m3, [r0 + FENC_STRIDE * 3]
628
+    movu            m4, [r1 + r6]
629
+    movu            m5, [r2 + r6]
630
+    movu            m6, [r3 + r6]
631
+
632
+    psadbw          m7, m3, m4
633
+    psadbw          m4, m3, m5
634
+    psadbw          m3, m6
635
+
636
+    paddd           m0, m7
637
+    paddd           m1, m4
638
+    paddd           m2, m3
639
+%endmacro
640
+
641
+%macro PROCESS_SAD_X3_32x4_AVX512 0
642
+    movu            ym3, [r0]
643
+    movu            ym4, [r1]
644
+    movu            ym5, [r2]
645
+    movu            ym6, [r3]
646
+    vinserti32x8    m3, [r0 + FENC_STRIDE], 1
647
+    vinserti32x8    m4, [r1 + r4], 1
648
+    vinserti32x8    m5, [r2 + r4], 1
649
+    vinserti32x8    m6, [r3 + r4], 1
650
+
651
+    psadbw          m7, m3, m4
652
+    psadbw          m4, m3, m5
653
+    psadbw          m3, m6
654
+
655
+    paddd           m0, m7
656
+    paddd           m1, m4
657
+    paddd           m2, m3
658
+
659
+    movu            ym3, [r0 + FENC_STRIDE * 2]
660
+    movu            ym4, [r1 + r4 * 2]
661
+    movu            ym5, [r2 + r4 * 2]
662
+    movu            ym6, [r3 + r4 * 2]
663
+    vinserti32x8     m3, [r0 + FENC_STRIDE * 3], 1
664
+    vinserti32x8     m4, [r1 + r6], 1
665
+    vinserti32x8     m5, [r2 + r6], 1
666
+    vinserti32x8     m6, [r3 + r6], 1
667
+
668
+    psadbw          m7, m3, m4
669
+    psadbw          m4, m3, m5
670
+    psadbw          m3, m6
671
+
672
+    paddd           m0, m7
673
+    paddd           m1, m4
674
+    paddd           m2, m3
675
+%endmacro
676
+
677
+%macro PROCESS_SAD_X3_48x4_AVX512 0
678
+    movu            ym3, [r0]
679
+    movu            ym4, [r1]
680
+    movu            ym5, [r2]
681
+    movu            ym6, [r3]
682
+    vinserti32x8    m3, [r0 + FENC_STRIDE], 1
683
+    vinserti32x8    m4, [r1 + r4], 1
684
+    vinserti32x8    m5, [r2 + r4], 1
685
+    vinserti32x8    m6, [r3 + r4], 1
686
+
687
+    psadbw          m7, m3, m4
688
+    psadbw          m4, m3, m5
689
+    psadbw          m3, m6
690
+
691
+    paddd           m0, m7
692
+    paddd           m1, m4
693
+    paddd           m2, m3
694
+
695
+    movu            ym3, [r0 + FENC_STRIDE * 2]
696
+    movu            ym4, [r1 + r4 * 2]
697
+    movu            ym5, [r2 + r4 * 2]
698
+    movu            ym6, [r3 + r4 * 2]
699
+    vinserti32x8     m3, [r0 + FENC_STRIDE * 3], 1
700
+    vinserti32x8     m4, [r1 + r6], 1
701
+    vinserti32x8     m5, [r2 + r6], 1
702
+    vinserti32x8     m6, [r3 + r6], 1
703
+
704
+    psadbw          m7, m3, m4
705
+    psadbw          m4, m3, m5
706
+    psadbw          m3, m6
707
+
708
+    paddd           m0, m7
709
+    paddd           m1, m4
710
+    paddd           m2, m3
711
+
712
+    movu           xm3, [r0 + mmsize/2]
713
+    movu           xm4, [r1 + mmsize/2]
714
+    movu           xm5, [r2 + mmsize/2]
715
+    movu           xm6, [r3 + mmsize/2]
716
+    vinserti32x4    m3, [r0 + FENC_STRIDE + mmsize/2], 1
717
+    vinserti32x4    m4, [r1 + r4 + mmsize/2], 1
718
+    vinserti32x4    m5, [r2 + r4 + mmsize/2], 1
719
+    vinserti32x4    m6, [r3 + r4 + mmsize/2], 1
720
+
721
+    vinserti32x4    m3, [r0 + 2 * FENC_STRIDE + mmsize/2], 2
722
+    vinserti32x4    m4, [r1 + 2 * r4 + mmsize/2], 2
723
+    vinserti32x4    m5, [r2 + 2 * r4 + mmsize/2], 2
724
+    vinserti32x4    m6, [r3 + 2 * r4 + mmsize/2], 2
725
+    vinserti32x4    m3, [r0 + 3 * FENC_STRIDE + mmsize/2], 3
726
+    vinserti32x4    m4, [r1 + r6 + mmsize/2], 3
727
+    vinserti32x4    m5, [r2 + r6 + mmsize/2], 3
728
+    vinserti32x4    m6, [r3 + r6 + mmsize/2], 3
729
+
730
+    psadbw          m7, m3, m4
731
+    psadbw          m4, m3, m5
732
+    psadbw          m3, m6
733
+    paddd           m0, m7
734
+    paddd           m1, m4
735
+    paddd           m2, m3
736
+%endmacro
737
+
738
+%macro PIXEL_SAD_X3_END_AVX512 0
739
+    vextracti32x8   ym3, m0, 1
740
+    vextracti32x8   ym4, m1, 1
741
+    vextracti32x8   ym5, m2, 1
742
+    paddd           ym0, ym3
743
+    paddd           ym1, ym4
744
+    paddd           ym2, ym5
745
+    vextracti64x2   xm3, m0, 1
746
+    vextracti64x2   xm4, m1, 1
747
+    vextracti64x2   xm5, m2, 1
748
+    paddd           xm0, xm3
749
+    paddd           xm1, xm4
750
+    paddd           xm2, xm5
751
+    pshufd          xm3, xm0, 2
752
+    pshufd          xm4, xm1, 2
753
+    pshufd          xm5, xm2, 2
754
+    paddd           xm0, xm3
755
+    paddd           xm1, xm4
756
+    paddd           xm2, xm5
757
+    movd            [r5 + 0], xm0
758
+    movd            [r5 + 4], xm1
759
+    movd            [r5 + 8], xm2
760
+%endmacro
761
+
762
+%macro SAD_X3_AVX512 2
763
+INIT_ZMM avx512
764
+cglobal pixel_sad_x3_%1x%2, 6,7,8
765
+    pxor            m0, m0
766
+    pxor            m1, m1
767
+    pxor            m2, m2
768
+    lea             r6, [r4 * 3]
769
+
770
+%rep %2/4 - 1
771
+    PROCESS_SAD_X3_%1x4_AVX512
772
+    add             r0, FENC_STRIDE * 4
773
+    lea             r1, [r1 + r4 * 4]
774
+    lea             r2, [r2 + r4 * 4]
775
+    lea             r3, [r3 + r4 * 4]
776
+%endrep
777
+    PROCESS_SAD_X3_%1x4_AVX512
778
+    PIXEL_SAD_X3_END_AVX512
779
+    RET
780
+%endmacro
781
+
782
+SAD_X3_AVX512 64, 64
783
+SAD_X3_AVX512 64, 48
784
+SAD_X3_AVX512 64, 32
785
+SAD_X3_AVX512 64, 16
786
+SAD_X3_AVX512 32, 64
787
+SAD_X3_AVX512 32, 32
788
+SAD_X3_AVX512 32, 24
789
+SAD_X3_AVX512 32, 16
790
+SAD_X3_AVX512 32, 8
791
+SAD_X3_AVX512 48, 64
792
+;------------------------------------------------------------
793
+;sad_x3 avx512 code end
794
+;------------------------------------------------------------
795
+
796
 INIT_YMM avx2
797
 cglobal pixel_sad_x4_8x8, 7,7,5
798
     xorps           m0, m0
799
@@ -6138,4 +6684,77 @@
800
     movd            eax, xm0
801
     RET
802
 
803
+%macro PROCESS_SAD_64x4_AVX512 0
804
+    movu           m1, [r0]
805
+    movu           m2, [r2]
806
+    movu           m3, [r0 + r1]
807
+    movu           m4, [r2 + r3]
808
+    psadbw         m1, m2
809
+    psadbw         m3, m4
810
+    paddd          m0, m1
811
+    paddd          m0, m3
812
+    movu           m1, [r0 + 2 * r1]
813
+    movu           m2, [r2 + 2 * r3]
814
+    movu           m3, [r0 + r5]
815
+    movu           m4, [r2 + r6]
816
+    psadbw         m1, m2
817
+    psadbw         m3, m4
818
+    paddd          m0, m1
819
+    paddd          m0, m3
820
+%endmacro
821
+
822
+%macro PROCESS_SAD_32x4_AVX512 0
823
+    movu           ym1, [r0]
824
+    movu           ym2, [r2]
825
+    movu           ym3, [r0 + 2 * r1]
826
+    movu           ym4, [r2 + 2 * r3]
827
+    vinserti32x8    m1, [r0 + r1], 1
828
+    vinserti32x8    m2, [r2 + r3], 1
829
+    vinserti32x8    m3, [r0 + r5], 1
830
+    vinserti32x8    m4, [r2 + r6], 1
831
+
832
+    psadbw         m1, m2
833
+    psadbw         m3, m4
834
+    paddd          m0, m1
835
+    paddd          m0, m3
836
+%endmacro
837
+
838
+%macro PROCESS_SAD_AVX512_END 0
839
+    vextracti32x8  ym1, m0, 1
840
+    paddd          ym0, ym1
841
+    vextracti64x2  xm1, m0, 1
842
+    paddd          xm0, xm1
843
+    pshufd         xm1, xm0, 2
844
+    paddd          xm0, xm1
845
+    movd           eax, xm0
846
+%endmacro
847
+;-----------------------------------------------------------------------------
848
+; int pixel_sad_64x%1( uint8_t *, intptr_t, uint8_t *, intptr_t )
849
+;-----------------------------------------------------------------------------
850
+%macro SAD_MxN_AVX512 2
851
+INIT_ZMM avx512
852
+cglobal pixel_sad_%1x%2, 4, 7, 5
853
+    pxor            m0, m0
854
+    lea             r5, [3 * r1]
855
+    lea             r6, [3 * r3]
856
+
857
+%rep %2/4 - 1
858
+    PROCESS_SAD_%1x4_AVX512
859
+    lea            r2, [r2 + 4 * r3]
860
+    lea            r0, [r0 + 4 * r1]
861
+%endrep
862
+    PROCESS_SAD_%1x4_AVX512
863
+    PROCESS_SAD_AVX512_END
864
+    RET
865
+%endmacro
866
+
867
+SAD_MxN_AVX512 64, 16
868
+SAD_MxN_AVX512 64, 32
869
+SAD_MxN_AVX512 64, 48
870
+SAD_MxN_AVX512 64, 64
871
+SAD_MxN_AVX512 32, 8
872
+SAD_MxN_AVX512 32, 16
873
+SAD_MxN_AVX512 32, 24
874
+SAD_MxN_AVX512 32, 32
875
+SAD_MxN_AVX512 32, 64
876
 %endif
877
x265_2.7.tar.gz/source/common/x86/sad16-a.asm -> x265_2.9.tar.gz/source/common/x86/sad16-a.asm Changed
2819
 
1
@@ -1155,6 +1155,565 @@
2
 SAD_12  12, 16
3
 
4
 
5
+%macro PROCESS_SAD_64x8_AVX512 0
6
+    movu    m1, [r2]
7
+    movu    m2, [r2 + mmsize]
8
+    movu    m3, [r2 + r3]
9
+    movu    m4, [r2 + r3 + mmsize]
10
+    psubw   m1, [r0]
11
+    psubw   m2, [r0 + mmsize]
12
+    psubw   m3, [r0 + r1]
13
+    psubw   m4, [r0 + r1 + mmsize]
14
+    pabsw   m1, m1
15
+    pabsw   m2, m2
16
+    pabsw   m3, m3
17
+    pabsw   m4, m4
18
+    paddw   m1, m2
19
+    paddw   m3, m4
20
+    paddw   m5, m1, m3
21
+
22
+    movu    m1, [r2 + 2 * r3]
23
+    movu    m2, [r2 + 2 * r3 + mmsize]
24
+    movu    m3, [r2 + r5]
25
+    movu    m4, [r2 + r5 + mmsize]
26
+    psubw   m1, [r0 + 2 * r1]
27
+    psubw   m2, [r0 + 2 * r1 + mmsize]
28
+    psubw   m3, [r0 + r4]
29
+    psubw   m4, [r0 + r4 + mmsize]
30
+    pabsw   m1, m1
31
+    pabsw   m2, m2
32
+    pabsw   m3, m3
33
+    pabsw   m4, m4
34
+    paddw   m1, m2
35
+    paddw   m3, m4
36
+    paddw   m1, m3
37
+
38
+    lea     r0, [r0 + 4 * r1]
39
+    lea     r2, [r2 + 4 * r3]
40
+
41
+    pmaddwd m5, m6
42
+    paddd   m0, m5
43
+    pmaddwd m1, m6
44
+    paddd   m0, m1
45
+
46
+    movu    m1, [r2]
47
+    movu    m2, [r2 + mmsize]
48
+    movu    m3, [r2 + r3]
49
+    movu    m4, [r2 + r3 + mmsize]
50
+    psubw   m1, [r0]
51
+    psubw   m2, [r0 + mmsize]
52
+    psubw   m3, [r0 + r1]
53
+    psubw   m4, [r0 + r1 + mmsize]
54
+    pabsw   m1, m1
55
+    pabsw   m2, m2
56
+    pabsw   m3, m3
57
+    pabsw   m4, m4
58
+    paddw   m1, m2
59
+    paddw   m3, m4
60
+    paddw   m5, m1, m3
61
+
62
+    movu    m1, [r2 + 2 * r3]
63
+    movu    m2, [r2 + 2 * r3 + mmsize]
64
+    movu    m3, [r2 + r5]
65
+    movu    m4, [r2 + r5 + mmsize]
66
+    psubw   m1, [r0 + 2 * r1]
67
+    psubw   m2, [r0 + 2 * r1 + mmsize]
68
+    psubw   m3, [r0 + r4]
69
+    psubw   m4, [r0 + r4 + mmsize]
70
+    pabsw   m1, m1
71
+    pabsw   m2, m2
72
+    pabsw   m3, m3
73
+    pabsw   m4, m4
74
+    paddw   m1, m2
75
+    paddw   m3, m4
76
+    paddw   m1, m3
77
+
78
+    pmaddwd m5, m6
79
+    paddd   m0, m5
80
+    pmaddwd m1, m6
81
+    paddd   m0, m1
82
+%endmacro
83
+
84
+
85
+%macro PROCESS_SAD_32x8_AVX512 0
86
+    movu    m1, [r2]
87
+    movu    m2, [r2 + r3]
88
+    movu    m3, [r2 + 2 * r3]
89
+    movu    m4, [r2 + r5]
90
+    psubw   m1, [r0]
91
+    psubw   m2, [r0 + r1]
92
+    psubw   m3, [r0 + 2 * r1]
93
+    psubw   m4, [r0 + r4]
94
+    pabsw   m1, m1
95
+    pabsw   m2, m2
96
+    pabsw   m3, m3
97
+    pabsw   m4, m4
98
+    paddw   m1, m2
99
+    paddw   m3, m4
100
+    paddw   m5, m1, m3
101
+
102
+    lea     r0, [r0 + 4 * r1]
103
+    lea     r2, [r2 + 4 * r3]
104
+
105
+    movu    m1, [r2]
106
+    movu    m2, [r2 + r3]
107
+    movu    m3, [r2 + 2 * r3]
108
+    movu    m4, [r2 + r5]
109
+    psubw   m1, [r0]
110
+    psubw   m2, [r0 + r1]
111
+    psubw   m3, [r0 + 2 * r1]
112
+    psubw   m4, [r0 + r4]
113
+    pabsw   m1, m1
114
+    pabsw   m2, m2
115
+    pabsw   m3, m3
116
+    pabsw   m4, m4
117
+    paddw   m1, m2
118
+    paddw   m3, m4
119
+    paddw   m1, m3
120
+
121
+    pmaddwd m5, m6
122
+    paddd   m0, m5
123
+    pmaddwd m1, m6
124
+    paddd   m0, m1
125
+%endmacro
126
+
127
+%macro PROCESS_SAD_16x8_AVX512 0
128
+    movu            ym1, [r2]
129
+    vinserti64x4     m1, [r2 + r3],  1
130
+    movu            ym2, [r2 + 2 * r3]
131
+    vinserti64x4     m2, [r2 + r5],  1
132
+    movu            ym3, [r0]
133
+    vinserti64x4     m3, [r0 + r1],  1
134
+    movu            ym4, [r0 + 2 * r1]
135
+    vinserti64x4     m4, [r0 + r4],  1
136
+
137
+    psubw   m1, m3
138
+    psubw   m2, m4
139
+    pabsw   m1, m1
140
+    pabsw   m2, m2
141
+    paddw   m5, m1, m2
142
+
143
+    lea     r0, [r0 + 4 * r1]
144
+    lea     r2, [r2 + 4 * r3]
145
+
146
+    movu            ym1, [r2]
147
+    vinserti64x4     m1, [r2 + r3],  1
148
+    movu            ym2, [r2 + 2 * r3]
149
+    vinserti64x4     m2, [r2 + r5],  1
150
+    movu            ym3, [r0]
151
+    vinserti64x4     m3, [r0 + r1],  1
152
+    movu            ym4, [r0 + 2 * r1]
153
+    vinserti64x4     m4, [r0 + r4],  1
154
+
155
+    psubw   m1, m3
156
+    psubw   m2, m4
157
+    pabsw   m1, m1
158
+    pabsw   m2, m2
159
+    paddw   m1, m2
160
+
161
+    pmaddwd m5, m6
162
+    paddd   m0, m5
163
+    pmaddwd m1, m6
164
+    paddd   m0, m1
165
+%endmacro
166
+
167
+%macro PROCESS_SAD_AVX512_END 0
168
+    vextracti32x8  ym1, m0, 1
169
+    paddd          ym0, ym1
170
+    vextracti64x2  xm1, m0, 1
171
+    paddd          xm0, xm1
172
+    pshufd         xm1, xm0, 00001110b
173
+    paddd          xm0, xm1
174
+    pshufd         xm1, xm0, 00000001b
175
+    paddd          xm0, xm1
176
+    movd           eax, xm0
177
+%endmacro
178
+
179
+;-----------------------------------------------------------------------------
180
+; int pixel_sad_64x%1( uint16_t *, intptr_t, uint16_t *, intptr_t )
181
+;-----------------------------------------------------------------------------
182
+%if ARCH_X86_64
183
+INIT_ZMM avx512
184
+cglobal pixel_sad_64x16, 4,6,7
185
+    pxor    m0, m0
186
+
187
+    vbroadcasti32x8 m6, [pw_1]
188
+
189
+    add     r3d, r3d
190
+    add     r1d, r1d
191
+    lea     r4d, [r1 * 3]
192
+    lea     r5d, [r3 * 3]
193
+
194
+    PROCESS_SAD_64x8_AVX512
195
+    lea            r2, [r2 + 4 * r3]
196
+    lea            r0, [r0 + 4 * r1]
197
+    PROCESS_SAD_64x8_AVX512
198
+    PROCESS_SAD_AVX512_END
199
+    RET
200
+
201
+INIT_ZMM avx512
202
+cglobal pixel_sad_64x32, 4,6,7
203
+    pxor    m0, m0
204
+
205
+    vbroadcasti32x8 m6, [pw_1]
206
+
207
+    add     r3d, r3d
208
+    add     r1d, r1d
209
+    lea     r4d, [r1 * 3]
210
+    lea     r5d, [r3 * 3]
211
+
212
+    PROCESS_SAD_64x8_AVX512
213
+    lea            r2, [r2 + 4 * r3]
214
+    lea            r0, [r0 + 4 * r1]
215
+    PROCESS_SAD_64x8_AVX512
216
+    lea            r2, [r2 + 4 * r3]
217
+    lea            r0, [r0 + 4 * r1]
218
+    PROCESS_SAD_64x8_AVX512
219
+    lea            r2, [r2 + 4 * r3]
220
+    lea            r0, [r0 + 4 * r1]
221
+    PROCESS_SAD_64x8_AVX512
222
+    PROCESS_SAD_AVX512_END
223
+    RET
224
+
225
+INIT_ZMM avx512
226
+cglobal pixel_sad_64x48, 4,6,7
227
+    pxor    m0, m0
228
+
229
+    vbroadcasti32x8 m6, [pw_1]
230
+
231
+    add     r3d, r3d
232
+    add     r1d, r1d
233
+    lea     r4d, [r1 * 3]
234
+    lea     r5d, [r3 * 3]
235
+
236
+    PROCESS_SAD_64x8_AVX512
237
+    lea            r2, [r2 + 4 * r3]
238
+    lea            r0, [r0 + 4 * r1]
239
+    PROCESS_SAD_64x8_AVX512
240
+    lea            r2, [r2 + 4 * r3]
241
+    lea            r0, [r0 + 4 * r1]
242
+    PROCESS_SAD_64x8_AVX512
243
+    lea            r2, [r2 + 4 * r3]
244
+    lea            r0, [r0 + 4 * r1]
245
+    PROCESS_SAD_64x8_AVX512
246
+    lea            r2, [r2 + 4 * r3]
247
+    lea            r0, [r0 + 4 * r1]
248
+    PROCESS_SAD_64x8_AVX512
249
+    lea            r2, [r2 + 4 * r3]
250
+    lea            r0, [r0 + 4 * r1]
251
+    PROCESS_SAD_64x8_AVX512
252
+    PROCESS_SAD_AVX512_END
253
+    RET
254
+
255
+INIT_ZMM avx512
256
+cglobal pixel_sad_64x64, 4,6,7
257
+   pxor    m0, m0
258
+
259
+    vbroadcasti32x8 m6, [pw_1]
260
+
261
+    add     r3d, r3d
262
+    add     r1d, r1d
263
+    lea     r4d, [r1 * 3]
264
+    lea     r5d, [r3 * 3]
265
+
266
+    PROCESS_SAD_64x8_AVX512
267
+    lea            r2, [r2 + 4 * r3]
268
+    lea            r0, [r0 + 4 * r1]
269
+    PROCESS_SAD_64x8_AVX512
270
+    lea            r2, [r2 + 4 * r3]
271
+    lea            r0, [r0 + 4 * r1]
272
+    PROCESS_SAD_64x8_AVX512
273
+    lea            r2, [r2 + 4 * r3]
274
+    lea            r0, [r0 + 4 * r1]
275
+    PROCESS_SAD_64x8_AVX512
276
+    lea            r2, [r2 + 4 * r3]
277
+    lea            r0, [r0 + 4 * r1]
278
+    PROCESS_SAD_64x8_AVX512
279
+    lea            r2, [r2 + 4 * r3]
280
+    lea            r0, [r0 + 4 * r1]
281
+    PROCESS_SAD_64x8_AVX512
282
+    lea            r2, [r2 + 4 * r3]
283
+    lea            r0, [r0 + 4 * r1]
284
+    PROCESS_SAD_64x8_AVX512
285
+    lea            r2, [r2 + 4 * r3]
286
+    lea            r0, [r0 + 4 * r1]
287
+    PROCESS_SAD_64x8_AVX512
288
+    PROCESS_SAD_AVX512_END
289
+    RET
290
+%endif
291
+
292
+;-----------------------------------------------------------------------------
293
+; int pixel_sad_32x%1( uint16_t *, intptr_t, uint16_t *, intptr_t )
294
+;-----------------------------------------------------------------------------
295
+%if ARCH_X86_64
296
+INIT_ZMM avx512
297
+cglobal pixel_sad_32x8, 4,6,7
298
+    pxor    m0, m0
299
+
300
+    vbroadcasti32x8 m6, [pw_1]
301
+
302
+    add     r3d, r3d
303
+    add     r1d, r1d
304
+    lea     r4d, [r1 * 3]
305
+    lea     r5d, [r3 * 3]
306
+
307
+    PROCESS_SAD_32x8_AVX512
308
+    PROCESS_SAD_AVX512_END
309
+    RET
310
+
311
+
312
+INIT_ZMM avx512
313
+cglobal pixel_sad_32x16, 4,6,7
314
+    pxor    m0, m0
315
+
316
+    vbroadcasti32x8 m6, [pw_1]
317
+
318
+    add     r3d, r3d
319
+    add     r1d, r1d
320
+    lea     r4d, [r1 * 3]
321
+    lea     r5d, [r3 * 3]
322
+
323
+    PROCESS_SAD_32x8_AVX512
324
+    lea            r2, [r2 + 4 * r3]
325
+    lea            r0, [r0 + 4 * r1]
326
+    PROCESS_SAD_32x8_AVX512
327
+    PROCESS_SAD_AVX512_END
328
+    RET
329
+
330
+INIT_ZMM avx512
331
+cglobal pixel_sad_32x24, 4,6,7
332
+   pxor    m0, m0
333
+
334
+    vbroadcasti32x8 m6, [pw_1]
335
+
336
+    add     r3d, r3d
337
+    add     r1d, r1d
338
+    lea     r4d, [r1 * 3]
339
+    lea     r5d, [r3 * 3]
340
+
341
+    PROCESS_SAD_32x8_AVX512
342
+    lea            r2, [r2 + 4 * r3]
343
+    lea            r0, [r0 + 4 * r1]
344
+    PROCESS_SAD_32x8_AVX512
345
+    lea            r2, [r2 + 4 * r3]
346
+    lea            r0, [r0 + 4 * r1]
347
+    PROCESS_SAD_32x8_AVX512
348
+    PROCESS_SAD_AVX512_END
349
+    RET
350
+
351
+INIT_ZMM avx512
352
+cglobal pixel_sad_32x32, 4,6,7
353
+    pxor    m0, m0
354
+
355
+    vbroadcasti32x8 m6, [pw_1]
356
+
357
+    add     r3d, r3d
358
+    add     r1d, r1d
359
+    lea     r4d, [r1 * 3]
360
+    lea     r5d, [r3 * 3]
361
+
362
+    PROCESS_SAD_32x8_AVX512
363
+    lea            r2, [r2 + 4 * r3]
364
+    lea            r0, [r0 + 4 * r1]
365
+    PROCESS_SAD_32x8_AVX512
366
+    lea            r2, [r2 + 4 * r3]
367
+    lea            r0, [r0 + 4 * r1]
368
+    PROCESS_SAD_32x8_AVX512
369
+    lea            r2, [r2 + 4 * r3]
370
+    lea            r0, [r0 + 4 * r1]
371
+    PROCESS_SAD_32x8_AVX512
372
+    PROCESS_SAD_AVX512_END
373
+    RET
374
+
375
+INIT_ZMM avx512
376
+cglobal pixel_sad_32x64, 4,6,7
377
+   pxor    m0, m0
378
+
379
+    vbroadcasti32x8 m6, [pw_1]
380
+
381
+    add     r3d, r3d
382
+    add     r1d, r1d
383
+    lea     r4d, [r1 * 3]
384
+    lea     r5d, [r3 * 3]
385
+
386
+    PROCESS_SAD_32x8_AVX512
387
+    lea            r2, [r2 + 4 * r3]
388
+    lea            r0, [r0 + 4 * r1]
389
+    PROCESS_SAD_32x8_AVX512
390
+    lea            r2, [r2 + 4 * r3]
391
+    lea            r0, [r0 + 4 * r1]
392
+    PROCESS_SAD_32x8_AVX512
393
+    lea            r2, [r2 + 4 * r3]
394
+    lea            r0, [r0 + 4 * r1]
395
+    PROCESS_SAD_32x8_AVX512
396
+    lea            r2, [r2 + 4 * r3]
397
+    lea            r0, [r0 + 4 * r1]
398
+    PROCESS_SAD_32x8_AVX512
399
+    lea            r2, [r2 + 4 * r3]
400
+    lea            r0, [r0 + 4 * r1]
401
+    PROCESS_SAD_32x8_AVX512
402
+    lea            r2, [r2 + 4 * r3]
403
+    lea            r0, [r0 + 4 * r1]
404
+    PROCESS_SAD_32x8_AVX512
405
+    lea            r2, [r2 + 4 * r3]
406
+    lea            r0, [r0 + 4 * r1]
407
+    PROCESS_SAD_32x8_AVX512
408
+    PROCESS_SAD_AVX512_END
409
+    RET
410
+%endif
411
+
412
+;-----------------------------------------------------------------------------
413
+; int pixel_sad_16x%1( uint16_t *, intptr_t, uint16_t *, intptr_t )
414
+;-----------------------------------------------------------------------------
415
+%if ARCH_X86_64
416
+INIT_ZMM avx512
417
+cglobal pixel_sad_16x32, 4,6,7
418
+    pxor    m0, m0
419
+
420
+    vbroadcasti32x8 m6, [pw_1]
421
+
422
+    add     r3d, r3d
423
+    add     r1d, r1d
424
+    lea     r4d, [r1 * 3]
425
+    lea     r5d, [r3 * 3]
426
+
427
+    %rep 3
428
+        PROCESS_SAD_16x8_AVX512
429
+        lea            r2, [r2 + 4 * r3]
430
+        lea            r0, [r0 + 4 * r1]
431
+    %endrep
432
+    PROCESS_SAD_16x8_AVX512
433
+    PROCESS_SAD_AVX512_END
434
+    RET
435
+
436
+INIT_ZMM avx512
437
+cglobal pixel_sad_16x64, 4,6,7
438
+   pxor    m0, m0
439
+
440
+    vbroadcasti32x8 m6, [pw_1]
441
+
442
+    add     r3d, r3d
443
+    add     r1d, r1d
444
+    lea     r4d, [r1 * 3]
445
+    lea     r5d, [r3 * 3]
446
+
447
+    %rep 7
448
+        PROCESS_SAD_16x8_AVX512
449
+        lea            r2, [r2 + 4 * r3]
450
+        lea            r0, [r0 + 4 * r1]
451
+    %endrep
452
+    PROCESS_SAD_16x8_AVX512
453
+    PROCESS_SAD_AVX512_END
454
+    RET
455
+%endif
456
+
457
+;-----------------------------------------------------------------------------
458
+; int pixel_sad_48x64( uint16_t *, intptr_t, uint16_t *, intptr_t )
459
+;-----------------------------------------------------------------------------
460
+%if ARCH_X86_64
461
+INIT_ZMM avx512
462
+cglobal pixel_sad_48x64, 4, 7, 9
463
+    pxor    m0,  m0
464
+    mov     r6d, 64/8
465
+
466
+    vbroadcasti32x8 m8, [pw_1]
467
+
468
+    add     r3d, r3d
469
+    add     r1d, r1d
470
+    lea     r4d, [r1 * 3]
471
+    lea     r5d, [r3 * 3]
472
+.loop:
473
+    movu            m1,  [r2]
474
+    movu            m2,  [r2 + r3]
475
+    movu           ym3,  [r2 + mmsize]
476
+    vinserti32x8    m3,  [r2 + r3 + mmsize], 1
477
+    movu            m4,  [r0]
478
+    movu            m5,  [r0 + r1]
479
+    movu           ym6,  [r0 + mmsize]
480
+    vinserti32x8    m6,  [r0 + r1 + mmsize], 1
481
+
482
+    psubw   m1, m4
483
+    psubw   m2, m5
484
+    psubw   m3, m6
485
+    pabsw   m1, m1
486
+    pabsw   m2, m2
487
+    pabsw   m3, m3
488
+    paddw   m1, m2
489
+    paddw   m7, m3, m1
490
+
491
+    movu            m1,  [r2 + 2 * r3]
492
+    movu            m2,  [r2 + r5]
493
+    movu           ym3,  [r2 + 2 * r3 + mmsize]
494
+    vinserti32x8    m3,  [r2 + r5 + mmsize], 1
495
+    movu            m4,  [r0 + 2 * r1]
496
+    movu            m5,  [r0 + r4]
497
+    movu           ym6,  [r0 + 2 * r1 + mmsize]
498
+    vinserti32x8    m6,  [r0 + r4 + mmsize], 1
499
+    psubw   m1, m4
500
+    psubw   m2, m5
501
+    psubw   m3, m6
502
+    pabsw   m1, m1
503
+    pabsw   m2, m2
504
+    pabsw   m3, m3
505
+    paddw   m1, m2
506
+    paddw   m1, m3
507
+
508
+    pmaddwd m7, m8
509
+    paddd   m0, m7
510
+    pmaddwd m1, m8
511
+    paddd   m0, m1
512
+    lea     r0, [r0 + 4 * r1]
513
+    lea     r2, [r2 + 4 * r3]
514
+
515
+    movu            m1,  [r2]
516
+    movu            m2,  [r2 + r3]
517
+    movu           ym3,  [r2 + mmsize]
518
+    vinserti32x8    m3,  [r2 + r3 + mmsize], 1
519
+    movu            m4,  [r0]
520
+    movu            m5,  [r0 + r1]
521
+    movu           ym6,  [r0 + mmsize]
522
+    vinserti32x8    m6,  [r0 + r1 + mmsize], 1
523
+
524
+    psubw   m1, m4
525
+    psubw   m2, m5
526
+    psubw   m3, m6
527
+    pabsw   m1, m1
528
+    pabsw   m2, m2
529
+    pabsw   m3, m3
530
+    paddw   m1, m2
531
+    paddw   m7, m3, m1
532
+
533
+    movu            m1,  [r2 + 2 * r3]
534
+    movu            m2,  [r2 + r5]
535
+    movu           ym3,  [r2 + 2 * r3 + mmsize]
536
+    vinserti32x8    m3,  [r2 + r5 + mmsize], 1
537
+    movu            m4,  [r0 + 2 * r1]
538
+    movu            m5,  [r0 + r4]
539
+    movu           ym6,  [r0 + 2 * r1 + mmsize]
540
+    vinserti32x8    m6,  [r0 + r4 + mmsize], 1
541
+    psubw   m1, m4
542
+    psubw   m2, m5
543
+    psubw   m3, m6
544
+    pabsw   m1, m1
545
+    pabsw   m2, m2
546
+    pabsw   m3, m3
547
+    paddw   m1, m2
548
+    paddw   m1, m3
549
+
550
+    pmaddwd m7, m8
551
+    paddd   m0, m7
552
+    pmaddwd m1, m8
553
+    paddd   m0, m1
554
+    lea     r0, [r0 + 4 * r1]
555
+    lea     r2, [r2 + 4 * r3]
556
+
557
+    dec     r6d
558
+    jg      .loop
559
+
560
+    PROCESS_SAD_AVX512_END
561
+    RET
562
+%endif
563
+
564
 ;=============================================================================
565
 ; SAD x3/x4
566
 ;=============================================================================
567
@@ -1561,3 +2120,2251 @@
568
 SAD_X 4, 64, 48
569
 SAD_X 4, 64, 64
570
 
571
+;============================
572
+; SAD x3/x4 avx512 code start
573
+;============================
574
+
575
+%macro PROCESS_SAD_X4_16x4_AVX512 0
576
+    movu            ym8, [r0]
577
+    vinserti64x4     m8, [r0 + 2 * FENC_STRIDE],  1
578
+    movu            ym4, [r1]
579
+    vinserti64x4     m4, [r1 + r5],  1
580
+    movu            ym5, [r2]
581
+    vinserti64x4     m5, [r2 + r5],  1
582
+    movu            ym6, [r3]
583
+    vinserti64x4     m6, [r3 + r5],  1
584
+    movu            ym7, [r4]
585
+    vinserti64x4     m7, [r4 + r5],  1
586
+
587
+    
588
+    psubw   m4, m8
589
+    psubw   m5, m8
590
+    psubw   m6, m8
591
+    psubw   m7, m8
592
+    pabsw   m4, m4
593
+    pabsw   m5, m5
594
+    pabsw   m6, m6
595
+    pabsw   m7, m7
596
+
597
+    pmaddwd m4, m9
598
+    paddd   m0, m4
599
+    pmaddwd m5, m9
600
+    paddd   m1, m5
601
+    pmaddwd m6, m9
602
+    paddd   m2, m6
603
+    pmaddwd m7, m9
604
+    paddd   m3, m7
605
+
606
+    movu            ym8, [r0 + 4 * FENC_STRIDE]
607
+    vinserti64x4     m8, [r0 + 6 * FENC_STRIDE],  1
608
+    movu            ym4, [r1 + 2 * r5]
609
+    vinserti64x4     m4, [r1 + r7],  1
610
+    movu            ym5, [r2 + 2 * r5]
611
+    vinserti64x4     m5, [r2 + r7],  1
612
+    movu            ym6, [r3 +  2 * r5]
613
+    vinserti64x4     m6, [r3 + r7],  1
614
+    movu            ym7, [r4 +  2 * r5]
615
+    vinserti64x4     m7, [r4 + r7],  1
616
+
617
+    psubw   m4, m8
618
+    psubw   m5, m8
619
+    psubw   m6, m8
620
+    psubw   m7, m8
621
+    pabsw   m4, m4
622
+    pabsw   m5, m5
623
+    pabsw   m6, m6
624
+    pabsw   m7, m7
625
+
626
+    pmaddwd m4, m9
627
+    paddd   m0, m4
628
+    pmaddwd m5, m9
629
+    paddd   m1, m5
630
+    pmaddwd m6, m9
631
+    paddd   m2, m6
632
+    pmaddwd m7, m9
633
+    paddd   m3, m7
634
+%endmacro
635
+
636
+%macro PROCESS_SAD_X4_32x4_AVX512 0
637
+    movu    m8, [r0]
638
+    movu    m4, [r1]
639
+    movu    m5, [r2]
640
+    movu    m6, [r3]
641
+    movu    m7, [r4]
642
+
643
+    
644
+    psubw   m4, m8
645
+    psubw   m5, m8
646
+    psubw   m6, m8
647
+    psubw   m7, m8
648
+    pabsw   m4, m4
649
+    pabsw   m5, m5
650
+    pabsw   m6, m6
651
+    pabsw   m7, m7
652
+
653
+    pmaddwd m4, m9
654
+    paddd   m0, m4
655
+    pmaddwd m5, m9
656
+    paddd   m1, m5
657
+    pmaddwd m6, m9
658
+    paddd   m2, m6
659
+    pmaddwd m7, m9
660
+    paddd   m3, m7
661
+
662
+
663
+    movu    m8, [r0 + 2 * FENC_STRIDE]
664
+    movu    m4, [r1 + r5]
665
+    movu    m5, [r2 + r5]
666
+    movu    m6, [r3 + r5]
667
+    movu    m7, [r4 + r5]
668
+
669
+    
670
+    psubw   m4, m8
671
+    psubw   m5, m8
672
+    psubw   m6, m8
673
+    psubw   m7, m8
674
+    pabsw   m4, m4
675
+    pabsw   m5, m5
676
+    pabsw   m6, m6
677
+    pabsw   m7, m7
678
+
679
+    pmaddwd m4, m9
680
+    paddd   m0, m4
681
+    pmaddwd m5, m9
682
+    paddd   m1, m5
683
+    pmaddwd m6, m9
684
+    paddd   m2, m6
685
+    pmaddwd m7, m9
686
+    paddd   m3, m7
687
+
688
+    movu    m8, [r0 + 4 * FENC_STRIDE]
689
+    movu    m4, [r1 + 2 * r5]
690
+    movu    m5, [r2 + 2 * r5]
691
+    movu    m6, [r3 + 2 * r5]
692
+    movu    m7, [r4 + 2 * r5]
693
+
694
+    
695
+    psubw   m4, m8
696
+    psubw   m5, m8
697
+    psubw   m6, m8
698
+    psubw   m7, m8
699
+    pabsw   m4, m4
700
+    pabsw   m5, m5
701
+    pabsw   m6, m6
702
+    pabsw   m7, m7
703
+
704
+    pmaddwd m4, m9
705
+    paddd   m0, m4
706
+    pmaddwd m5, m9
707
+    paddd   m1, m5
708
+    pmaddwd m6, m9
709
+    paddd   m2, m6
710
+    pmaddwd m7, m9
711
+    paddd   m3, m7
712
+
713
+    movu    m8, [r0 + 6 * FENC_STRIDE]
714
+    movu    m4, [r1 + r7]
715
+    movu    m5, [r2 + r7]
716
+    movu    m6, [r3 + r7]
717
+    movu    m7, [r4 + r7]
718
+
719
+    
720
+    psubw   m4, m8
721
+    psubw   m5, m8
722
+    psubw   m6, m8
723
+    psubw   m7, m8
724
+    pabsw   m4, m4
725
+    pabsw   m5, m5
726
+    pabsw   m6, m6
727
+    pabsw   m7, m7
728
+
729
+    pmaddwd m4, m9
730
+    paddd   m0, m4
731
+    pmaddwd m5, m9
732
+    paddd   m1, m5
733
+    pmaddwd m6, m9
734
+    paddd   m2, m6
735
+    pmaddwd m7, m9
736
+    paddd   m3, m7
737
+%endmacro
738
+
739
+%macro PROCESS_SAD_X4_64x4_AVX512 0
740
+    movu    m8,  [r0]
741
+    movu    m10, [r0 + mmsize]
742
+    movu    m4,  [r1]
743
+    movu    m11, [r1 + mmsize]
744
+    movu    m5,  [r2]
745
+    movu    m12, [r2 + mmsize]
746
+    movu    m6,  [r3]
747
+    movu    m13, [r3 + mmsize]
748
+    movu    m7,  [r4]
749
+    movu    m14, [r4 + mmsize]
750
+
751
+    psubw   m4,  m8
752
+    psubw   m5,  m8
753
+    psubw   m6,  m8
754
+    psubw   m7,  m8
755
+    psubw   m11, m10
756
+    psubw   m12, m10
757
+    psubw   m13, m10
758
+    psubw   m14, m10
759
+    pabsw   m4,  m4
760
+    pabsw   m5,  m5
761
+    pabsw   m6,  m6
762
+    pabsw   m7,  m7
763
+    pabsw   m11, m11
764
+    pabsw   m12, m12
765
+    pabsw   m13, m13
766
+    pabsw   m14, m14
767
+    paddw   m4,  m11
768
+    paddw   m5,  m12
769
+    paddw   m6,  m13
770
+    paddw   m7,  m14
771
+
772
+    pmaddwd m4, m9
773
+    paddd   m0, m4
774
+    pmaddwd m5, m9
775
+    paddd   m1, m5
776
+    pmaddwd m6, m9
777
+    paddd   m2, m6
778
+    pmaddwd m7, m9
779
+    paddd   m3, m7
780
+
781
+
782
+    movu    m8,  [r0 + 2 * FENC_STRIDE]
783
+    movu    m10, [r0 + 2 * FENC_STRIDE + mmsize]
784
+    movu    m4,  [r1 + r5]
785
+    movu    m11, [r1 + r5 + mmsize]
786
+    movu    m5,  [r2 + r5]
787
+    movu    m12, [r2 + r5 + mmsize]
788
+    movu    m6,  [r3 + r5]
789
+    movu    m13, [r3 + r5 + mmsize]
790
+    movu    m7,  [r4 + r5]
791
+    movu    m14, [r4 + r5 + mmsize]
792
+
793
+    psubw   m4,  m8
794
+    psubw   m5,  m8
795
+    psubw   m6,  m8
796
+    psubw   m7,  m8
797
+    psubw   m11, m10
798
+    psubw   m12, m10
799
+    psubw   m13, m10
800
+    psubw   m14, m10
801
+    pabsw   m4,  m4
802
+    pabsw   m5,  m5
803
+    pabsw   m6,  m6
804
+    pabsw   m7,  m7
805
+    pabsw   m11, m11
806
+    pabsw   m12, m12
807
+    pabsw   m13, m13
808
+    pabsw   m14, m14
809
+    paddw   m4,  m11
810
+    paddw   m5,  m12
811
+    paddw   m6,  m13
812
+    paddw   m7,  m14
813
+
814
+    pmaddwd m4, m9
815
+    paddd   m0, m4
816
+    pmaddwd m5, m9
817
+    paddd   m1, m5
818
+    pmaddwd m6, m9
819
+    paddd   m2, m6
820
+    pmaddwd m7, m9
821
+    paddd   m3, m7
822
+
823
+    movu    m8,  [r0 + 4 * FENC_STRIDE]
824
+    movu    m10, [r0 + 4 * FENC_STRIDE + mmsize]
825
+    movu    m4,  [r1 + 2 * r5]
826
+    movu    m11, [r1 + 2 * r5 + mmsize]
827
+    movu    m5,  [r2 + 2 * r5]
828
+    movu    m12, [r2 + 2 * r5 + mmsize]
829
+    movu    m6,  [r3 + 2 * r5]
830
+    movu    m13, [r3 + 2 * r5 + mmsize]
831
+    movu    m7,  [r4 + 2 * r5]
832
+    movu    m14, [r4 + 2 * r5 + mmsize]
833
+
834
+    psubw   m4,  m8
835
+    psubw   m5,  m8
836
+    psubw   m6,  m8
837
+    psubw   m7,  m8
838
+    psubw   m11, m10
839
+    psubw   m12, m10
840
+    psubw   m13, m10
841
+    psubw   m14, m10
842
+    pabsw   m4,  m4
843
+    pabsw   m5,  m5
844
+    pabsw   m6,  m6
845
+    pabsw   m7,  m7
846
+    pabsw   m11, m11
847
+    pabsw   m12, m12
848
+    pabsw   m13, m13
849
+    pabsw   m14, m14
850
+    paddw   m4,  m11
851
+    paddw   m5,  m12
852
+    paddw   m6,  m13
853
+    paddw   m7,  m14
854
+
855
+    pmaddwd m4, m9
856
+    paddd   m0, m4
857
+    pmaddwd m5, m9
858
+    paddd   m1, m5
859
+    pmaddwd m6, m9
860
+    paddd   m2, m6
861
+    pmaddwd m7, m9
862
+    paddd   m3, m7
863
+
864
+    movu    m8,  [r0 + 6 * FENC_STRIDE]
865
+    movu    m10, [r0 + 6 * FENC_STRIDE + mmsize]
866
+    movu    m4,  [r1 + r7]
867
+    movu    m11, [r1 + r7 + mmsize]
868
+    movu    m5,  [r2 + r7]
869
+    movu    m12, [r2 + r7 + mmsize]
870
+    movu    m6,  [r3 + r7]
871
+    movu    m13, [r3 + r7 + mmsize]
872
+    movu    m7,  [r4 + r7]
873
+    movu    m14, [r4 + r7 + mmsize]
874
+
875
+    psubw   m4,  m8
876
+    psubw   m5,  m8
877
+    psubw   m6,  m8
878
+    psubw   m7,  m8
879
+    psubw   m11, m10
880
+    psubw   m12, m10
881
+    psubw   m13, m10
882
+    psubw   m14, m10
883
+    pabsw   m4,  m4
884
+    pabsw   m5,  m5
885
+    pabsw   m6,  m6
886
+    pabsw   m7,  m7
887
+    pabsw   m11, m11
888
+    pabsw   m12, m12
889
+    pabsw   m13, m13
890
+    pabsw   m14, m14
891
+    paddw   m4,  m11
892
+    paddw   m5,  m12
893
+    paddw   m6,  m13
894
+    paddw   m7,  m14
895
+
896
+    pmaddwd m4, m9
897
+    paddd   m0, m4
898
+    pmaddwd m5, m9
899
+    paddd   m1, m5
900
+    pmaddwd m6, m9
901
+    paddd   m2, m6
902
+    pmaddwd m7, m9
903
+    paddd   m3, m7
904
+%endmacro
905
+
906
+%macro PROCESS_SAD_X4_END_AVX512 0
907
+    vextracti32x8  ym4, m0, 1
908
+    vextracti32x8  ym5, m1, 1
909
+    vextracti32x8  ym6, m2, 1
910
+    vextracti32x8  ym7, m3, 1
911
+
912
+    paddd          ym0, ym4
913
+    paddd          ym1, ym5
914
+    paddd          ym2, ym6
915
+    paddd          ym3, ym7
916
+
917
+    vextracti64x2  xm4, m0, 1
918
+    vextracti64x2  xm5, m1, 1
919
+    vextracti64x2  xm6, m2, 1
920
+    vextracti64x2  xm7, m3, 1
921
+
922
+    paddd          xm0, xm4
923
+    paddd          xm1, xm5
924
+    paddd          xm2, xm6
925
+    paddd          xm3, xm7
926
+
927
+    pshufd         xm4, xm0, 00001110b
928
+    pshufd         xm5, xm1, 00001110b
929
+    pshufd         xm6, xm2, 00001110b
930
+    pshufd         xm7, xm3, 00001110b
931
+
932
+    paddd          xm0, xm4
933
+    paddd          xm1, xm5
934
+    paddd          xm2, xm6
935
+    paddd          xm3, xm7
936
+
937
+    pshufd         xm4, xm0, 00000001b
938
+    pshufd         xm5, xm1, 00000001b
939
+    pshufd         xm6, xm2, 00000001b
940
+    pshufd         xm7, xm3, 00000001b
941
+
942
+    paddd          xm0, xm4
943
+    paddd          xm1, xm5
944
+    paddd          xm2, xm6
945
+    paddd          xm3, xm7
946
+
947
+    mov                  r0,  r6mp
948
+    movd           [r0 + 0],  xm0
949
+    movd           [r0 + 4],  xm1
950
+    movd           [r0 + 8],  xm2
951
+    movd           [r0 + 12], xm3
952
+%endmacro
953
+
954
+
955
+%macro PROCESS_SAD_X3_16x4_AVX512 0
956
+    movu            ym6, [r0]
957
+    vinserti64x4     m6, [r0 + 2 * FENC_STRIDE],  1
958
+    movu            ym3, [r1]
959
+    vinserti64x4     m3, [r1 + r4],  1
960
+    movu            ym4, [r2]
961
+    vinserti64x4     m4, [r2 + r4],  1
962
+    movu            ym5, [r3]
963
+    vinserti64x4     m5, [r3 + r4],  1
964
+
965
+    psubw   m3, m6
966
+    psubw   m4, m6
967
+    psubw   m5, m6
968
+    pabsw   m3, m3
969
+    pabsw   m4, m4
970
+    pabsw   m5, m5
971
+
972
+    pmaddwd m3, m7
973
+    paddd   m0, m3
974
+    pmaddwd m4, m7
975
+    paddd   m1, m4
976
+    pmaddwd m5, m7
977
+    paddd   m2, m5
978
+
979
+    movu            ym6, [r0 + 4 * FENC_STRIDE]
980
+    vinserti64x4     m6, [r0 + 6 * FENC_STRIDE],  1
981
+    movu            ym3, [r1 + 2 * r4]
982
+    vinserti64x4     m3, [r1 + r6],  1
983
+    movu            ym4, [r2 + 2 * r4]
984
+    vinserti64x4     m4, [r2 + r6],  1
985
+    movu            ym5, [r3 + 2 * r4]
986
+    vinserti64x4     m5, [r3 + r6],  1
987
+
988
+    psubw   m3, m6
989
+    psubw   m4, m6
990
+    psubw   m5, m6
991
+    pabsw   m3, m3
992
+    pabsw   m4, m4
993
+    pabsw   m5, m5
994
+
995
+    pmaddwd m3, m7
996
+    paddd   m0, m3
997
+    pmaddwd m4, m7
998
+    paddd   m1, m4
999
+    pmaddwd m5, m7
1000
+    paddd   m2, m5
1001
+%endmacro
1002
+
1003
+
1004
+%macro PROCESS_SAD_X3_32x4_AVX512 0
1005
+    movu    m6, [r0]
1006
+    movu    m3, [r1]
1007
+    movu    m4, [r2]
1008
+    movu    m5, [r3]
1009
+
1010
+
1011
+    psubw   m3, m6
1012
+    psubw   m4, m6
1013
+    psubw   m5, m6
1014
+    pabsw   m3, m3
1015
+    pabsw   m4, m4
1016
+    pabsw   m5, m5
1017
+
1018
+    pmaddwd m3, m7
1019
+    paddd   m0, m3
1020
+    pmaddwd m4, m7
1021
+    paddd   m1, m4
1022
+    pmaddwd m5, m7
1023
+    paddd   m2, m5
1024
+
1025
+    movu    m6, [r0 + 2 * FENC_STRIDE]
1026
+    movu    m3, [r1 + r4]
1027
+    movu    m4, [r2 + r4]
1028
+    movu    m5, [r3 + r4]
1029
+
1030
+    psubw   m3, m6
1031
+    psubw   m4, m6
1032
+    psubw   m5, m6
1033
+    pabsw   m3, m3
1034
+    pabsw   m4, m4
1035
+    pabsw   m5, m5
1036
+
1037
+    pmaddwd m3, m7
1038
+    paddd   m0, m3
1039
+    pmaddwd m4, m7
1040
+    paddd   m1, m4
1041
+    pmaddwd m5, m7
1042
+    paddd   m2, m5
1043
+
1044
+    movu    m6, [r0 + 4 * FENC_STRIDE]
1045
+    movu    m3, [r1 + 2 * r4]
1046
+    movu    m4, [r2 + 2 * r4]
1047
+    movu    m5, [r3 + 2 * r4]
1048
+
1049
+    psubw   m3, m6
1050
+    psubw   m4, m6
1051
+    psubw   m5, m6
1052
+    pabsw   m3, m3
1053
+    pabsw   m4, m4
1054
+    pabsw   m5, m5
1055
+
1056
+    pmaddwd m3, m7
1057
+    paddd   m0, m3
1058
+    pmaddwd m4, m7
1059
+    paddd   m1, m4
1060
+    pmaddwd m5, m7
1061
+    paddd   m2, m5
1062
+
1063
+    movu    m6, [r0 + 6 * FENC_STRIDE]
1064
+    movu    m3, [r1 + r6]
1065
+    movu    m4, [r2 + r6]
1066
+    movu    m5, [r3 + r6]
1067
+
1068
+    psubw   m3, m6
1069
+    psubw   m4, m6
1070
+    psubw   m5, m6
1071
+    pabsw   m3, m3
1072
+    pabsw   m4, m4
1073
+    pabsw   m5, m5
1074
+
1075
+    pmaddwd m3, m7
1076
+    paddd   m0, m3
1077
+    pmaddwd m4, m7
1078
+    paddd   m1, m4
1079
+    pmaddwd m5, m7
1080
+    paddd   m2, m5
1081
+%endmacro
1082
+
1083
+%macro PROCESS_SAD_X3_64x4_AVX512 0
1084
+    movu    m6,  [r0]
1085
+    movu    m8,  [r0 + mmsize]
1086
+    movu    m3,  [r1]
1087
+    movu    m9,  [r1 + mmsize]
1088
+    movu    m4,  [r2]
1089
+    movu    m10, [r2 + mmsize]
1090
+    movu    m5,  [r3]
1091
+    movu    m11, [r3 + mmsize]
1092
+
1093
+    psubw   m3,  m6
1094
+    psubw   m9,  m8
1095
+    psubw   m4,  m6
1096
+    psubw   m10, m8
1097
+    psubw   m5,  m6
1098
+    psubw   m11, m8
1099
+    pabsw   m3,  m3
1100
+    pabsw   m4,  m4
1101
+    pabsw   m5,  m5
1102
+    pabsw   m9,  m9
1103
+    pabsw   m10, m10
1104
+    pabsw   m11, m11
1105
+    paddw   m3,  m9
1106
+    paddw   m4,  m10
1107
+    paddw   m5,  m11
1108
+
1109
+    pmaddwd m3, m7
1110
+    paddd   m0, m3
1111
+    pmaddwd m4, m7
1112
+    paddd   m1, m4
1113
+    pmaddwd m5, m7
1114
+    paddd   m2, m5
1115
+
1116
+    movu    m6,  [r0 + 2 * FENC_STRIDE]
1117
+    movu    m8,  [r0 + 2 * FENC_STRIDE + mmsize]
1118
+    movu    m3,  [r1 + r4]
1119
+    movu    m9,  [r1 + r4 + mmsize]
1120
+    movu    m4,  [r2 + r4]
1121
+    movu    m10, [r2 + r4 + mmsize]
1122
+    movu    m5,  [r3 + r4]
1123
+    movu    m11, [r3 + r4 + mmsize]
1124
+
1125
+    psubw   m3,  m6
1126
+    psubw   m9,  m8
1127
+    psubw   m4,  m6
1128
+    psubw   m10, m8
1129
+    psubw   m5,  m6
1130
+    psubw   m11, m8
1131
+    pabsw   m3,  m3
1132
+    pabsw   m4,  m4
1133
+    pabsw   m5,  m5
1134
+    pabsw   m9,  m9
1135
+    pabsw   m10, m10
1136
+    pabsw   m11, m11
1137
+    paddw   m3,  m9
1138
+    paddw   m4,  m10
1139
+    paddw   m5,  m11
1140
+
1141
+    pmaddwd m3, m7
1142
+    paddd   m0, m3
1143
+    pmaddwd m4, m7
1144
+    paddd   m1, m4
1145
+    pmaddwd m5, m7
1146
+    paddd   m2, m5
1147
+
1148
+    movu    m6,  [r0 + 4 * FENC_STRIDE]
1149
+    movu    m8,  [r0 + 4 * FENC_STRIDE + mmsize]
1150
+    movu    m3,  [r1 + 2 * r4]
1151
+    movu    m9,  [r1 + 2 * r4 + mmsize]
1152
+    movu    m4,  [r2 + 2 * r4]
1153
+    movu    m10, [r2 + 2 * r4 + mmsize]
1154
+    movu    m5,  [r3 + 2 * r4]
1155
+    movu    m11, [r3 + 2 * r4 + mmsize]
1156
+
1157
+    psubw   m3,  m6
1158
+    psubw   m9,  m8
1159
+    psubw   m4,  m6
1160
+    psubw   m10, m8
1161
+    psubw   m5,  m6
1162
+    psubw   m11, m8
1163
+    pabsw   m3,  m3
1164
+    pabsw   m4,  m4
1165
+    pabsw   m5,  m5
1166
+    pabsw   m9,  m9
1167
+    pabsw   m10, m10
1168
+    pabsw   m11, m11
1169
+    paddw   m3,  m9
1170
+    paddw   m4,  m10
1171
+    paddw   m5,  m11
1172
+
1173
+    pmaddwd m3, m7
1174
+    paddd   m0, m3
1175
+    pmaddwd m4, m7
1176
+    paddd   m1, m4
1177
+    pmaddwd m5, m7
1178
+    paddd   m2, m5
1179
+
1180
+    movu    m6,  [r0 + 6 * FENC_STRIDE]
1181
+    movu    m8,  [r0 + 6 * FENC_STRIDE + mmsize]
1182
+    movu    m3,  [r1 + r6]
1183
+    movu    m9,  [r1 + r6 + mmsize]
1184
+    movu    m4,  [r2 + r6]
1185
+    movu    m10, [r2 + r6 + mmsize]
1186
+    movu    m5,  [r3 + r6]
1187
+    movu    m11, [r3 + r6 + mmsize]
1188
+
1189
+    psubw   m3,  m6
1190
+    psubw   m9,  m8
1191
+    psubw   m4,  m6
1192
+    psubw   m10, m8
1193
+    psubw   m5,  m6
1194
+    psubw   m11, m8
1195
+    pabsw   m3,  m3
1196
+    pabsw   m4,  m4
1197
+    pabsw   m5,  m5
1198
+    pabsw   m9,  m9
1199
+    pabsw   m10, m10
1200
+    pabsw   m11, m11
1201
+    paddw   m3,  m9
1202
+    paddw   m4,  m10
1203
+    paddw   m5,  m11
1204
+
1205
+    pmaddwd m3, m7
1206
+    paddd   m0, m3
1207
+    pmaddwd m4, m7
1208
+    paddd   m1, m4
1209
+    pmaddwd m5, m7
1210
+    paddd   m2, m5
1211
+%endmacro
1212
+
1213
+%macro PROCESS_SAD_X3_END_AVX512 0
1214
+    vextracti32x8  ym3, m0, 1
1215
+    vextracti32x8  ym4, m1, 1
1216
+    vextracti32x8  ym5, m2, 1
1217
+
1218
+    paddd          ym0, ym3
1219
+    paddd          ym1, ym4
1220
+    paddd          ym2, ym5
1221
+
1222
+    vextracti64x2  xm3, m0, 1
1223
+    vextracti64x2  xm4, m1, 1
1224
+    vextracti64x2  xm5, m2, 1
1225
+
1226
+    paddd          xm0, xm3
1227
+    paddd          xm1, xm4
1228
+    paddd          xm2, xm5
1229
+
1230
+    pshufd         xm3, xm0, 00001110b
1231
+    pshufd         xm4, xm1, 00001110b
1232
+    pshufd         xm5, xm2, 00001110b
1233
+
1234
+    paddd          xm0, xm3
1235
+    paddd          xm1, xm4
1236
+    paddd          xm2, xm5
1237
+
1238
+    pshufd         xm3, xm0, 00000001b
1239
+    pshufd         xm4, xm1, 00000001b
1240
+    pshufd         xm5, xm2, 00000001b
1241
+
1242
+    paddd          xm0, xm3
1243
+    paddd          xm1, xm4
1244
+    paddd          xm2, xm5
1245
+
1246
+    %if UNIX64
1247
+        movd     [r5 + 0], xm0
1248
+        movd     [r5 + 4], xm1
1249
+        movd     [r5 + 8], xm2
1250
+    %else
1251
+        mov            r0, r5mp
1252
+        movd     [r0 + 0], xm0
1253
+        movd     [r0 + 4], xm1
1254
+        movd     [r0 + 8], xm2
1255
+%endif
1256
+%endmacro
1257
+
1258
+
1259
+;------------------------------------------------------------------------------------------------------------------------------------------
1260
+; void pixel_sad_x3_16x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res )
1261
+;------------------------------------------------------------------------------------------------------------------------------------------
1262
+%if ARCH_X86_64
1263
+INIT_ZMM avx512
1264
+cglobal pixel_sad_x3_16x8, 6,7,8
1265
+    pxor    m0,  m0
1266
+    pxor    m1,  m1
1267
+    pxor    m2,  m2
1268
+
1269
+    vbroadcasti32x8 m7, [pw_1]
1270
+
1271
+    add     r4d, r4d
1272
+    lea     r6d, [r4 * 3]
1273
+
1274
+    PROCESS_SAD_X3_16x4_AVX512
1275
+    add             r0, FENC_STRIDE * 8
1276
+    lea             r1, [r1 + r4 * 4]
1277
+    lea             r2, [r2 + r4 * 4]
1278
+    lea             r3, [r3 + r4 * 4]
1279
+    PROCESS_SAD_X3_16x4_AVX512
1280
+    PROCESS_SAD_X3_END_AVX512
1281
+    RET
1282
+
1283
+INIT_ZMM avx512
1284
+cglobal pixel_sad_x3_16x12, 6,7,8
1285
+    pxor    m0,  m0
1286
+    pxor    m1,  m1
1287
+    pxor    m2,  m2
1288
+
1289
+    vbroadcasti32x8 m7, [pw_1]
1290
+
1291
+    add     r4d, r4d
1292
+    lea     r6d, [r4 * 3]
1293
+    %rep 2
1294
+        PROCESS_SAD_X3_16x4_AVX512
1295
+        add             r0, FENC_STRIDE * 8
1296
+        lea             r1, [r1 + r4 * 4]
1297
+        lea             r2, [r2 + r4 * 4]
1298
+        lea             r3, [r3 + r4 * 4]
1299
+    %endrep
1300
+    PROCESS_SAD_X3_16x4_AVX512
1301
+    PROCESS_SAD_X3_END_AVX512
1302
+    RET
1303
+
1304
+INIT_ZMM avx512
1305
+cglobal pixel_sad_x3_16x16, 6,7,8
1306
+    pxor    m0,  m0
1307
+    pxor    m1,  m1
1308
+    pxor    m2,  m2
1309
+
1310
+    vbroadcasti32x8 m7, [pw_1]
1311
+
1312
+    add     r4d, r4d
1313
+    lea     r6d, [r4 * 3]
1314
+
1315
+    %rep 3
1316
+        PROCESS_SAD_X3_16x4_AVX512
1317
+        add             r0, FENC_STRIDE * 8
1318
+        lea             r1, [r1 + r4 * 4]
1319
+        lea             r2, [r2 + r4 * 4]
1320
+        lea             r3, [r3 + r4 * 4]
1321
+    %endrep
1322
+    PROCESS_SAD_X3_16x4_AVX512
1323
+    PROCESS_SAD_X3_END_AVX512
1324
+    RET
1325
+
1326
+INIT_ZMM avx512
1327
+cglobal pixel_sad_x3_16x32, 6,7,8
1328
+    pxor    m0,  m0
1329
+    pxor    m1,  m1
1330
+    pxor    m2,  m2
1331
+
1332
+    vbroadcasti32x8 m7, [pw_1]
1333
+
1334
+    add     r4d, r4d
1335
+    lea     r6d, [r4 * 3]
1336
+
1337
+    %rep 7
1338
+        PROCESS_SAD_X3_16x4_AVX512
1339
+        add             r0, FENC_STRIDE * 8
1340
+        lea             r1, [r1 + r4 * 4]
1341
+        lea             r2, [r2 + r4 * 4]
1342
+        lea             r3, [r3 + r4 * 4]
1343
+    %endrep
1344
+    PROCESS_SAD_X3_16x4_AVX512
1345
+    PROCESS_SAD_X3_END_AVX512
1346
+    RET
1347
+
1348
+INIT_ZMM avx512
1349
+cglobal pixel_sad_x3_16x64, 6,7,8
1350
+    pxor    m0,  m0
1351
+    pxor    m1,  m1
1352
+    pxor    m2,  m2
1353
+
1354
+    vbroadcasti32x8 m7, [pw_1]
1355
+
1356
+    add     r4d, r4d
1357
+    lea     r6d, [r4 * 3]
1358
+
1359
+    %rep 15
1360
+        PROCESS_SAD_X3_16x4_AVX512
1361
+        add             r0, FENC_STRIDE * 8
1362
+        lea             r1, [r1 + r4 * 4]
1363
+        lea             r2, [r2 + r4 * 4]
1364
+        lea             r3, [r3 + r4 * 4]
1365
+    %endrep
1366
+    PROCESS_SAD_X3_16x4_AVX512
1367
+    PROCESS_SAD_X3_END_AVX512
1368
+    RET
1369
+%endif
1370
+
1371
+;------------------------------------------------------------------------------------------------------------------------------------------
1372
+; void pixel_sad_x3_32x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res )
1373
+;------------------------------------------------------------------------------------------------------------------------------------------
1374
+%if ARCH_X86_64
1375
+INIT_ZMM avx512
1376
+cglobal pixel_sad_x3_32x8, 6,7,8
1377
+    pxor    m0,  m0
1378
+    pxor    m1,  m1
1379
+    pxor    m2,  m2
1380
+
1381
+    vbroadcasti32x8 m7, [pw_1]
1382
+
1383
+    add     r4d, r4d
1384
+    lea     r6d, [r4 * 3]
1385
+
1386
+    PROCESS_SAD_X3_32x4_AVX512
1387
+    add             r0, FENC_STRIDE * 8
1388
+    lea             r1, [r1 + r4 * 4]
1389
+    lea             r2, [r2 + r4 * 4]
1390
+    lea             r3, [r3 + r4 * 4]
1391
+    PROCESS_SAD_X3_32x4_AVX512
1392
+    PROCESS_SAD_X3_END_AVX512
1393
+    RET
1394
+
1395
+
1396
+INIT_ZMM avx512
1397
+cglobal pixel_sad_x3_32x16, 6,7,8
1398
+    pxor    m0,  m0
1399
+    pxor    m1,  m1
1400
+    pxor    m2,  m2
1401
+
1402
+    vbroadcasti32x8 m7, [pw_1]
1403
+
1404
+    add     r4d, r4d
1405
+    lea     r6d, [r4 * 3]
1406
+
1407
+    PROCESS_SAD_X3_32x4_AVX512
1408
+    add             r0, FENC_STRIDE * 8
1409
+    lea             r1, [r1 + r4 * 4]
1410
+    lea             r2, [r2 + r4 * 4]
1411
+    lea             r3, [r3 + r4 * 4]
1412
+    PROCESS_SAD_X3_32x4_AVX512
1413
+    add             r0, FENC_STRIDE * 8
1414
+    lea             r1, [r1 + r4 * 4]
1415
+    lea             r2, [r2 + r4 * 4]
1416
+    lea             r3, [r3 + r4 * 4]
1417
+    PROCESS_SAD_X3_32x4_AVX512
1418
+    add             r0, FENC_STRIDE * 8
1419
+    lea             r1, [r1 + r4 * 4]
1420
+    lea             r2, [r2 + r4 * 4]
1421
+    lea             r3, [r3 + r4 * 4]
1422
+    PROCESS_SAD_X3_32x4_AVX512
1423
+    PROCESS_SAD_X3_END_AVX512
1424
+    RET
1425
+
1426
+INIT_ZMM avx512
1427
+cglobal pixel_sad_x3_32x24, 6,7,8
1428
+    pxor    m0,  m0
1429
+    pxor    m1,  m1
1430
+    pxor    m2,  m2
1431
+
1432
+    vbroadcasti32x8 m7, [pw_1]
1433
+
1434
+    add     r4d, r4d
1435
+    lea     r6d, [r4 * 3]
1436
+
1437
+    PROCESS_SAD_X3_32x4_AVX512
1438
+    add             r0, FENC_STRIDE * 8
1439
+    lea             r1, [r1 + r4 * 4]
1440
+    lea             r2, [r2 + r4 * 4]
1441
+    lea             r3, [r3 + r4 * 4]
1442
+    PROCESS_SAD_X3_32x4_AVX512
1443
+    add             r0, FENC_STRIDE * 8
1444
+    lea             r1, [r1 + r4 * 4]
1445
+    lea             r2, [r2 + r4 * 4]
1446
+    lea             r3, [r3 + r4 * 4]
1447
+    PROCESS_SAD_X3_32x4_AVX512
1448
+    add             r0, FENC_STRIDE * 8
1449
+    lea             r1, [r1 + r4 * 4]
1450
+    lea             r2, [r2 + r4 * 4]
1451
+    lea             r3, [r3 + r4 * 4]
1452
+    PROCESS_SAD_X3_32x4_AVX512
1453
+    add             r0, FENC_STRIDE * 8
1454
+    lea             r1, [r1 + r4 * 4]
1455
+    lea             r2, [r2 + r4 * 4]
1456
+    lea             r3, [r3 + r4 * 4]
1457
+    PROCESS_SAD_X3_32x4_AVX512
1458
+    add             r0, FENC_STRIDE * 8
1459
+    lea             r1, [r1 + r4 * 4]
1460
+    lea             r2, [r2 + r4 * 4]
1461
+    lea             r3, [r3 + r4 * 4]
1462
+    PROCESS_SAD_X3_32x4_AVX512
1463
+    PROCESS_SAD_X3_END_AVX512
1464
+    RET
1465
+
1466
+
1467
+INIT_ZMM avx512
1468
+cglobal pixel_sad_x3_32x32, 6,7,8
1469
+    pxor    m0,  m0
1470
+    pxor    m1,  m1
1471
+    pxor    m2,  m2
1472
+
1473
+    vbroadcasti32x8 m7, [pw_1]
1474
+
1475
+    add     r4d, r4d
1476
+    lea     r6d, [r4 * 3]
1477
+
1478
+    PROCESS_SAD_X3_32x4_AVX512
1479
+    add             r0, FENC_STRIDE * 8
1480
+    lea             r1, [r1 + r4 * 4]
1481
+    lea             r2, [r2 + r4 * 4]
1482
+    lea             r3, [r3 + r4 * 4]
1483
+    PROCESS_SAD_X3_32x4_AVX512
1484
+    add             r0, FENC_STRIDE * 8
1485
+    lea             r1, [r1 + r4 * 4]
1486
+    lea             r2, [r2 + r4 * 4]
1487
+    lea             r3, [r3 + r4 * 4]
1488
+    PROCESS_SAD_X3_32x4_AVX512
1489
+    add             r0, FENC_STRIDE * 8
1490
+    lea             r1, [r1 + r4 * 4]
1491
+    lea             r2, [r2 + r4 * 4]
1492
+    lea             r3, [r3 + r4 * 4]
1493
+    PROCESS_SAD_X3_32x4_AVX512
1494
+    add             r0, FENC_STRIDE * 8
1495
+    lea             r1, [r1 + r4 * 4]
1496
+    lea             r2, [r2 + r4 * 4]
1497
+    lea             r3, [r3 + r4 * 4]
1498
+    PROCESS_SAD_X3_32x4_AVX512
1499
+    add             r0, FENC_STRIDE * 8
1500
+    lea             r1, [r1 + r4 * 4]
1501
+    lea             r2, [r2 + r4 * 4]
1502
+    lea             r3, [r3 + r4 * 4]
1503
+    PROCESS_SAD_X3_32x4_AVX512
1504
+    add             r0, FENC_STRIDE * 8
1505
+    lea             r1, [r1 + r4 * 4]
1506
+    lea             r2, [r2 + r4 * 4]
1507
+    lea             r3, [r3 + r4 * 4]
1508
+    PROCESS_SAD_X3_32x4_AVX512
1509
+    add             r0, FENC_STRIDE * 8
1510
+    lea             r1, [r1 + r4 * 4]
1511
+    lea             r2, [r2 + r4 * 4]
1512
+    lea             r3, [r3 + r4 * 4]
1513
+    PROCESS_SAD_X3_32x4_AVX512
1514
+    PROCESS_SAD_X3_END_AVX512
1515
+    RET
1516
+
1517
+INIT_ZMM avx512
1518
+cglobal pixel_sad_x3_32x64, 6,7,8
1519
+    pxor    m0,  m0
1520
+    pxor    m1,  m1
1521
+    pxor    m2,  m2
1522
+
1523
+    vbroadcasti32x8 m7, [pw_1]
1524
+
1525
+    add     r4d, r4d
1526
+    lea     r6d, [r4 * 3]
1527
+
1528
+    PROCESS_SAD_X3_32x4_AVX512
1529
+    add             r0, FENC_STRIDE * 8
1530
+    lea             r1, [r1 + r4 * 4]
1531
+    lea             r2, [r2 + r4 * 4]
1532
+    lea             r3, [r3 + r4 * 4]
1533
+    PROCESS_SAD_X3_32x4_AVX512
1534
+    add             r0, FENC_STRIDE * 8
1535
+    lea             r1, [r1 + r4 * 4]
1536
+    lea             r2, [r2 + r4 * 4]
1537
+    lea             r3, [r3 + r4 * 4]
1538
+    PROCESS_SAD_X3_32x4_AVX512
1539
+    add             r0, FENC_STRIDE * 8
1540
+    lea             r1, [r1 + r4 * 4]
1541
+    lea             r2, [r2 + r4 * 4]
1542
+    lea             r3, [r3 + r4 * 4]
1543
+    PROCESS_SAD_X3_32x4_AVX512
1544
+    add             r0, FENC_STRIDE * 8
1545
+    lea             r1, [r1 + r4 * 4]
1546
+    lea             r2, [r2 + r4 * 4]
1547
+    lea             r3, [r3 + r4 * 4]
1548
+    PROCESS_SAD_X3_32x4_AVX512
1549
+    add             r0, FENC_STRIDE * 8
1550
+    lea             r1, [r1 + r4 * 4]
1551
+    lea             r2, [r2 + r4 * 4]
1552
+    lea             r3, [r3 + r4 * 4]
1553
+    PROCESS_SAD_X3_32x4_AVX512
1554
+    add             r0, FENC_STRIDE * 8
1555
+    lea             r1, [r1 + r4 * 4]
1556
+    lea             r2, [r2 + r4 * 4]
1557
+    lea             r3, [r3 + r4 * 4]
1558
+    PROCESS_SAD_X3_32x4_AVX512
1559
+    add             r0, FENC_STRIDE * 8
1560
+    lea             r1, [r1 + r4 * 4]
1561
+    lea             r2, [r2 + r4 * 4]
1562
+    lea             r3, [r3 + r4 * 4]
1563
+    PROCESS_SAD_X3_32x4_AVX512
1564
+    add             r0, FENC_STRIDE * 8
1565
+    lea             r1, [r1 + r4 * 4]
1566
+    lea             r2, [r2 + r4 * 4]
1567
+    lea             r3, [r3 + r4 * 4]
1568
+    PROCESS_SAD_X3_32x4_AVX512
1569
+    add             r0, FENC_STRIDE * 8
1570
+    lea             r1, [r1 + r4 * 4]
1571
+    lea             r2, [r2 + r4 * 4]
1572
+    lea             r3, [r3 + r4 * 4]
1573
+    PROCESS_SAD_X3_32x4_AVX512
1574
+    add             r0, FENC_STRIDE * 8
1575
+    lea             r1, [r1 + r4 * 4]
1576
+    lea             r2, [r2 + r4 * 4]
1577
+    lea             r3, [r3 + r4 * 4]
1578
+    PROCESS_SAD_X3_32x4_AVX512
1579
+    add             r0, FENC_STRIDE * 8
1580
+    lea             r1, [r1 + r4 * 4]
1581
+    lea             r2, [r2 + r4 * 4]
1582
+    lea             r3, [r3 + r4 * 4]
1583
+    PROCESS_SAD_X3_32x4_AVX512
1584
+    add             r0, FENC_STRIDE * 8
1585
+    lea             r1, [r1 + r4 * 4]
1586
+    lea             r2, [r2 + r4 * 4]
1587
+    lea             r3, [r3 + r4 * 4]
1588
+    PROCESS_SAD_X3_32x4_AVX512
1589
+    add             r0, FENC_STRIDE * 8
1590
+    lea             r1, [r1 + r4 * 4]
1591
+    lea             r2, [r2 + r4 * 4]
1592
+    lea             r3, [r3 + r4 * 4]
1593
+    PROCESS_SAD_X3_32x4_AVX512
1594
+    add             r0, FENC_STRIDE * 8
1595
+    lea             r1, [r1 + r4 * 4]
1596
+    lea             r2, [r2 + r4 * 4]
1597
+    lea             r3, [r3 + r4 * 4]
1598
+    PROCESS_SAD_X3_32x4_AVX512
1599
+    add             r0, FENC_STRIDE * 8
1600
+    lea             r1, [r1 + r4 * 4]
1601
+    lea             r2, [r2 + r4 * 4]
1602
+    lea             r3, [r3 + r4 * 4]
1603
+    PROCESS_SAD_X3_32x4_AVX512
1604
+    PROCESS_SAD_X3_END_AVX512
1605
+    RET
1606
+
1607
+;----------------------------------------------------------------------------------------------------------------------------------------
1608
+; int pixel_sad_x3_48x64( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res )
1609
+;----------------------------------------------------------------------------------------------------------------------------------------
1610
+INIT_ZMM avx512
1611
+cglobal pixel_sad_x3_48x64, 4, 8, 17
1612
+    pxor    m0,  m0
1613
+    pxor    m1,  m1
1614
+    pxor    m2,  m2
1615
+    mov     r7d, 64/4
1616
+    vbroadcasti32x8 m16, [pw_1]
1617
+
1618
+    add     r4d, r4d
1619
+    lea     r6d, [r4 * 3]
1620
+.loop:
1621
+    movu            m4,   [r0]
1622
+    movu            m5,   [r0 + 2 * FENC_STRIDE]
1623
+    movu           ym6,   [r0 + mmsize]
1624
+    vinserti32x8    m6,   [r0 + 2 * FENC_STRIDE + mmsize], 1
1625
+    movu            m7,   [r1]
1626
+    movu            m8,   [r1 + r4]
1627
+    movu           ym9,   [r1 + mmsize]
1628
+    vinserti32x8    m9,   [r1 + r4 + mmsize], 1
1629
+    movu            m10,  [r2]
1630
+    movu            m11,  [r2 + r4]
1631
+    movu           ym12,  [r2 + mmsize]
1632
+    vinserti32x8    m12,  [r2 + r4 + mmsize], 1
1633
+    movu            m13,  [r3]
1634
+    movu            m14,  [r3 + r4]
1635
+    movu           ym15,  [r3 + mmsize]
1636
+    vinserti32x8    m15,  [r3 + r4 + mmsize], 1
1637
+
1638
+    psubw   m7,  m4
1639
+    psubw   m8,  m5
1640
+    psubw   m9,  m6
1641
+    psubw   m10, m4
1642
+    psubw   m11, m5
1643
+    psubw   m12, m6
1644
+    psubw   m13, m4
1645
+    psubw   m14, m5
1646
+    psubw   m15, m6
1647
+
1648
+    pabsw   m7,  m7
1649
+    pabsw   m8,  m8
1650
+    pabsw   m9,  m9
1651
+    pabsw   m10, m10
1652
+    pabsw   m11, m11
1653
+    pabsw   m12, m12
1654
+    pabsw   m13, m13
1655
+    pabsw   m14, m14
1656
+    pabsw   m15, m15
1657
+
1658
+    paddw   m7,  m8
1659
+    paddw   m7,  m9
1660
+    paddw   m10, m11
1661
+    paddw   m10, m12
1662
+    paddw   m13, m14
1663
+    paddw   m13, m15
1664
+
1665
+    pmaddwd m7,  m16
1666
+    paddd   m0,  m7
1667
+    pmaddwd m10, m16
1668
+    paddd   m1,  m10
1669
+    pmaddwd m13, m16
1670
+    paddd   m2,  m13
1671
+
1672
+    movu            m4,   [r0 + 4 * FENC_STRIDE]
1673
+    movu            m5,   [r0 + 6 * FENC_STRIDE]
1674
+    movu           ym6,   [r0 + 4 * FENC_STRIDE + mmsize]
1675
+    vinserti32x8    m6,   [r0 + 6 * FENC_STRIDE + mmsize], 1
1676
+    movu            m7,   [r1 + 2 * r4]
1677
+    movu            m8,   [r1 + r6]
1678
+    movu           ym9,   [r1 + 2 * r4 + mmsize]
1679
+    vinserti32x8    m9,   [r1 + r6 + mmsize], 1
1680
+    movu            m10,  [r2 + 2 * r4]
1681
+    movu            m11,  [r2 + r6]
1682
+    movu           ym12,  [r2 + 2 * r4 + mmsize]
1683
+    vinserti32x8    m12,  [r2 + r6 + mmsize], 1
1684
+    movu            m13,  [r3 + 2 * r4]
1685
+    movu            m14,  [r3 + r6]
1686
+    movu           ym15,  [r3 + 2 * r4 + mmsize]
1687
+    vinserti32x8    m15,  [r3 + r6 + mmsize], 1
1688
+
1689
+    psubw   m7,  m4
1690
+    psubw   m8,  m5
1691
+    psubw   m9,  m6
1692
+    psubw   m10, m4
1693
+    psubw   m11, m5
1694
+    psubw   m12, m6
1695
+    psubw   m13, m4
1696
+    psubw   m14, m5
1697
+    psubw   m15, m6
1698
+
1699
+    pabsw   m7,  m7
1700
+    pabsw   m8,  m8
1701
+    pabsw   m9,  m9
1702
+    pabsw   m10, m10
1703
+    pabsw   m11, m11
1704
+    pabsw   m12, m12
1705
+    pabsw   m13, m13
1706
+    pabsw   m14, m14
1707
+    pabsw   m15, m15
1708
+
1709
+    paddw   m7,  m8
1710
+    paddw   m7,  m9
1711
+    paddw   m10, m11
1712
+    paddw   m10, m12
1713
+    paddw   m13, m14
1714
+    paddw   m13, m15
1715
+
1716
+    pmaddwd m7,  m16
1717
+    paddd   m0,  m7
1718
+    pmaddwd m10, m16
1719
+    paddd   m1,  m10
1720
+    pmaddwd m13, m16
1721
+    paddd   m2,  m13
1722
+
1723
+    add             r0, FENC_STRIDE * 8
1724
+    lea             r1, [r1 + r4 * 4]
1725
+    lea             r2, [r2 + r4 * 4]
1726
+    lea             r3, [r3 + r4 * 4]
1727
+
1728
+    dec     r7d
1729
+    jg      .loop
1730
+
1731
+    PROCESS_SAD_X3_END_AVX512
1732
+    RET
1733
+%endif
1734
+
1735
+;------------------------------------------------------------------------------------------------------------------------------------------
1736
+; void pixel_sad_x3_64x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res )
1737
+;------------------------------------------------------------------------------------------------------------------------------------------
1738
+%if ARCH_X86_64
1739
+INIT_ZMM avx512
1740
+cglobal pixel_sad_x3_64x16, 6,7,12
1741
+    pxor    m0,  m0
1742
+    pxor    m1,  m1
1743
+    pxor    m2,  m2
1744
+
1745
+    vbroadcasti32x8 m7, [pw_1]
1746
+
1747
+    add     r4d, r4d
1748
+    lea     r6d, [r4 * 3]
1749
+
1750
+    PROCESS_SAD_X3_64x4_AVX512
1751
+    add             r0, FENC_STRIDE * 8
1752
+    lea             r1, [r1 + r4 * 4]
1753
+    lea             r2, [r2 + r4 * 4]
1754
+    lea             r3, [r3 + r4 * 4]
1755
+    PROCESS_SAD_X3_64x4_AVX512
1756
+    add             r0, FENC_STRIDE * 8
1757
+    lea             r1, [r1 + r4 * 4]
1758
+    lea             r2, [r2 + r4 * 4]
1759
+    lea             r3, [r3 + r4 * 4]
1760
+    PROCESS_SAD_X3_64x4_AVX512
1761
+    add             r0, FENC_STRIDE * 8
1762
+    lea             r1, [r1 + r4 * 4]
1763
+    lea             r2, [r2 + r4 * 4]
1764
+    lea             r3, [r3 + r4 * 4]
1765
+    PROCESS_SAD_X3_64x4_AVX512
1766
+    PROCESS_SAD_X3_END_AVX512
1767
+    RET
1768
+
1769
+INIT_ZMM avx512
1770
+cglobal pixel_sad_x3_64x32, 6,7,12
1771
+    pxor    m0,  m0
1772
+    pxor    m1,  m1
1773
+    pxor    m2,  m2
1774
+
1775
+    vbroadcasti32x8 m7, [pw_1]
1776
+
1777
+    add     r4d, r4d
1778
+    lea     r6d, [r4 * 3]
1779
+
1780
+    PROCESS_SAD_X3_64x4_AVX512
1781
+    add             r0, FENC_STRIDE * 8
1782
+    lea             r1, [r1 + r4 * 4]
1783
+    lea             r2, [r2 + r4 * 4]
1784
+    lea             r3, [r3 + r4 * 4]
1785
+    PROCESS_SAD_X3_64x4_AVX512
1786
+    add             r0, FENC_STRIDE * 8
1787
+    lea             r1, [r1 + r4 * 4]
1788
+    lea             r2, [r2 + r4 * 4]
1789
+    lea             r3, [r3 + r4 * 4]
1790
+    PROCESS_SAD_X3_64x4_AVX512
1791
+    add             r0, FENC_STRIDE * 8
1792
+    lea             r1, [r1 + r4 * 4]
1793
+    lea             r2, [r2 + r4 * 4]
1794
+    lea             r3, [r3 + r4 * 4]
1795
+    PROCESS_SAD_X3_64x4_AVX512
1796
+    add             r0, FENC_STRIDE * 8
1797
+    lea             r1, [r1 + r4 * 4]
1798
+    lea             r2, [r2 + r4 * 4]
1799
+    lea             r3, [r3 + r4 * 4]
1800
+    PROCESS_SAD_X3_64x4_AVX512
1801
+    add             r0, FENC_STRIDE * 8
1802
+    lea             r1, [r1 + r4 * 4]
1803
+    lea             r2, [r2 + r4 * 4]
1804
+    lea             r3, [r3 + r4 * 4]
1805
+    PROCESS_SAD_X3_64x4_AVX512
1806
+    add             r0, FENC_STRIDE * 8
1807
+    lea             r1, [r1 + r4 * 4]
1808
+    lea             r2, [r2 + r4 * 4]
1809
+    lea             r3, [r3 + r4 * 4]
1810
+    PROCESS_SAD_X3_64x4_AVX512
1811
+    add             r0, FENC_STRIDE * 8
1812
+    lea             r1, [r1 + r4 * 4]
1813
+    lea             r2, [r2 + r4 * 4]
1814
+    lea             r3, [r3 + r4 * 4]
1815
+    PROCESS_SAD_X3_64x4_AVX512
1816
+    PROCESS_SAD_X3_END_AVX512
1817
+    RET
1818
+
1819
+INIT_ZMM avx512
1820
+cglobal pixel_sad_x3_64x48, 6,7,12
1821
+    pxor    m0,  m0
1822
+    pxor    m1,  m1
1823
+    pxor    m2,  m2
1824
+
1825
+    vbroadcasti32x8 m7, [pw_1]
1826
+
1827
+    add     r4d, r4d
1828
+    lea     r6d, [r4 * 3]
1829
+
1830
+    PROCESS_SAD_X3_64x4_AVX512
1831
+    add             r0, FENC_STRIDE * 8
1832
+    lea             r1, [r1 + r4 * 4]
1833
+    lea             r2, [r2 + r4 * 4]
1834
+    lea             r3, [r3 + r4 * 4]
1835
+    PROCESS_SAD_X3_64x4_AVX512
1836
+    add             r0, FENC_STRIDE * 8
1837
+    lea             r1, [r1 + r4 * 4]
1838
+    lea             r2, [r2 + r4 * 4]
1839
+    lea             r3, [r3 + r4 * 4]
1840
+    PROCESS_SAD_X3_64x4_AVX512
1841
+    add             r0, FENC_STRIDE * 8
1842
+    lea             r1, [r1 + r4 * 4]
1843
+    lea             r2, [r2 + r4 * 4]
1844
+    lea             r3, [r3 + r4 * 4]
1845
+    PROCESS_SAD_X3_64x4_AVX512
1846
+    add             r0, FENC_STRIDE * 8
1847
+    lea             r1, [r1 + r4 * 4]
1848
+    lea             r2, [r2 + r4 * 4]
1849
+    lea             r3, [r3 + r4 * 4]
1850
+    PROCESS_SAD_X3_64x4_AVX512
1851
+    add             r0, FENC_STRIDE * 8
1852
+    lea             r1, [r1 + r4 * 4]
1853
+    lea             r2, [r2 + r4 * 4]
1854
+    lea             r3, [r3 + r4 * 4]
1855
+    PROCESS_SAD_X3_64x4_AVX512
1856
+    add             r0, FENC_STRIDE * 8
1857
+    lea             r1, [r1 + r4 * 4]
1858
+    lea             r2, [r2 + r4 * 4]
1859
+    lea             r3, [r3 + r4 * 4]
1860
+    PROCESS_SAD_X3_64x4_AVX512
1861
+    add             r0, FENC_STRIDE * 8
1862
+    lea             r1, [r1 + r4 * 4]
1863
+    lea             r2, [r2 + r4 * 4]
1864
+    lea             r3, [r3 + r4 * 4]
1865
+    PROCESS_SAD_X3_64x4_AVX512
1866
+    add             r0, FENC_STRIDE * 8
1867
+    lea             r1, [r1 + r4 * 4]
1868
+    lea             r2, [r2 + r4 * 4]
1869
+    lea             r3, [r3 + r4 * 4]
1870
+    PROCESS_SAD_X3_64x4_AVX512
1871
+    add             r0, FENC_STRIDE * 8
1872
+    lea             r1, [r1 + r4 * 4]
1873
+    lea             r2, [r2 + r4 * 4]
1874
+    lea             r3, [r3 + r4 * 4]
1875
+    PROCESS_SAD_X3_64x4_AVX512
1876
+    add             r0, FENC_STRIDE * 8
1877
+    lea             r1, [r1 + r4 * 4]
1878
+    lea             r2, [r2 + r4 * 4]
1879
+    lea             r3, [r3 + r4 * 4]
1880
+    PROCESS_SAD_X3_64x4_AVX512
1881
+    add             r0, FENC_STRIDE * 8
1882
+    lea             r1, [r1 + r4 * 4]
1883
+    lea             r2, [r2 + r4 * 4]
1884
+    lea             r3, [r3 + r4 * 4]
1885
+    PROCESS_SAD_X3_64x4_AVX512
1886
+    PROCESS_SAD_X3_END_AVX512
1887
+    RET
1888
+
1889
+INIT_ZMM avx512
1890
+cglobal pixel_sad_x3_64x64, 6,7,12
1891
+    pxor    m0,  m0
1892
+    pxor    m1,  m1
1893
+    pxor    m2,  m2
1894
+
1895
+    vbroadcasti32x8 m7, [pw_1]
1896
+
1897
+    add     r4d, r4d
1898
+    lea     r6d, [r4 * 3]
1899
+
1900
+    PROCESS_SAD_X3_64x4_AVX512
1901
+    add             r0, FENC_STRIDE * 8
1902
+    lea             r1, [r1 + r4 * 4]
1903
+    lea             r2, [r2 + r4 * 4]
1904
+    lea             r3, [r3 + r4 * 4]
1905
+    PROCESS_SAD_X3_64x4_AVX512
1906
+    add             r0, FENC_STRIDE * 8
1907
+    lea             r1, [r1 + r4 * 4]
1908
+    lea             r2, [r2 + r4 * 4]
1909
+    lea             r3, [r3 + r4 * 4]
1910
+    PROCESS_SAD_X3_64x4_AVX512
1911
+    add             r0, FENC_STRIDE * 8
1912
+    lea             r1, [r1 + r4 * 4]
1913
+    lea             r2, [r2 + r4 * 4]
1914
+    lea             r3, [r3 + r4 * 4]
1915
+    PROCESS_SAD_X3_64x4_AVX512
1916
+    add             r0, FENC_STRIDE * 8
1917
+    lea             r1, [r1 + r4 * 4]
1918
+    lea             r2, [r2 + r4 * 4]
1919
+    lea             r3, [r3 + r4 * 4]
1920
+    PROCESS_SAD_X3_64x4_AVX512
1921
+    add             r0, FENC_STRIDE * 8
1922
+    lea             r1, [r1 + r4 * 4]
1923
+    lea             r2, [r2 + r4 * 4]
1924
+    lea             r3, [r3 + r4 * 4]
1925
+    PROCESS_SAD_X3_64x4_AVX512
1926
+    add             r0, FENC_STRIDE * 8
1927
+    lea             r1, [r1 + r4 * 4]
1928
+    lea             r2, [r2 + r4 * 4]
1929
+    lea             r3, [r3 + r4 * 4]
1930
+    PROCESS_SAD_X3_64x4_AVX512
1931
+    add             r0, FENC_STRIDE * 8
1932
+    lea             r1, [r1 + r4 * 4]
1933
+    lea             r2, [r2 + r4 * 4]
1934
+    lea             r3, [r3 + r4 * 4]
1935
+    PROCESS_SAD_X3_64x4_AVX512
1936
+    add             r0, FENC_STRIDE * 8
1937
+    lea             r1, [r1 + r4 * 4]
1938
+    lea             r2, [r2 + r4 * 4]
1939
+    lea             r3, [r3 + r4 * 4]
1940
+    PROCESS_SAD_X3_64x4_AVX512
1941
+    add             r0, FENC_STRIDE * 8
1942
+    lea             r1, [r1 + r4 * 4]
1943
+    lea             r2, [r2 + r4 * 4]
1944
+    lea             r3, [r3 + r4 * 4]
1945
+    PROCESS_SAD_X3_64x4_AVX512
1946
+    add             r0, FENC_STRIDE * 8
1947
+    lea             r1, [r1 + r4 * 4]
1948
+    lea             r2, [r2 + r4 * 4]
1949
+    lea             r3, [r3 + r4 * 4]
1950
+    PROCESS_SAD_X3_64x4_AVX512
1951
+    add             r0, FENC_STRIDE * 8
1952
+    lea             r1, [r1 + r4 * 4]
1953
+    lea             r2, [r2 + r4 * 4]
1954
+    lea             r3, [r3 + r4 * 4]
1955
+    PROCESS_SAD_X3_64x4_AVX512
1956
+    add             r0, FENC_STRIDE * 8
1957
+    lea             r1, [r1 + r4 * 4]
1958
+    lea             r2, [r2 + r4 * 4]
1959
+    lea             r3, [r3 + r4 * 4]
1960
+    PROCESS_SAD_X3_64x4_AVX512
1961
+    add             r0, FENC_STRIDE * 8
1962
+    lea             r1, [r1 + r4 * 4]
1963
+    lea             r2, [r2 + r4 * 4]
1964
+    lea             r3, [r3 + r4 * 4]
1965
+    PROCESS_SAD_X3_64x4_AVX512
1966
+    add             r0, FENC_STRIDE * 8
1967
+    lea             r1, [r1 + r4 * 4]
1968
+    lea             r2, [r2 + r4 * 4]
1969
+    lea             r3, [r3 + r4 * 4]
1970
+    PROCESS_SAD_X3_64x4_AVX512
1971
+    add             r0, FENC_STRIDE * 8
1972
+    lea             r1, [r1 + r4 * 4]
1973
+    lea             r2, [r2 + r4 * 4]
1974
+    lea             r3, [r3 + r4 * 4]
1975
+    PROCESS_SAD_X3_64x4_AVX512
1976
+    PROCESS_SAD_X3_END_AVX512
1977
+    RET
1978
+%endif
1979
+
1980
+;------------------------------------------------------------------------------------------------------------------------------------------------------------
1981
+; void pixel_sad_x4_16x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res )
1982
+;------------------------------------------------------------------------------------------------------------------------------------------------------------
1983
+%if ARCH_X86_64
1984
+INIT_ZMM avx512
1985
+cglobal pixel_sad_x4_16x8, 6,8,10
1986
+    pxor    m0,  m0
1987
+    pxor    m1,  m1
1988
+    pxor    m2,  m2
1989
+    pxor    m3,  m3
1990
+
1991
+    vbroadcasti32x8 m9, [pw_1]
1992
+
1993
+    add     r5d, r5d
1994
+    lea     r7d, [r5 * 3]
1995
+
1996
+    PROCESS_SAD_X4_16x4_AVX512
1997
+    add             r0, FENC_STRIDE * 8
1998
+    lea             r1, [r1 + r5 * 4]
1999
+    lea             r2, [r2 + r5 * 4]
2000
+    lea             r3, [r3 + r5 * 4]
2001
+    lea             r4, [r4 + r5 * 4]
2002
+    PROCESS_SAD_X4_16x4_AVX512
2003
+    PROCESS_SAD_X4_END_AVX512
2004
+    RET
2005
+
2006
+INIT_ZMM avx512
2007
+cglobal pixel_sad_x4_16x12, 6,8,10
2008
+    pxor    m0,  m0
2009
+    pxor    m1,  m1
2010
+    pxor    m2,  m2
2011
+    pxor    m3,  m3
2012
+
2013
+    vbroadcasti32x8 m9, [pw_1]
2014
+
2015
+    add     r5d, r5d
2016
+    lea     r7d, [r5 * 3]
2017
+
2018
+    %rep 2
2019
+        PROCESS_SAD_X4_16x4_AVX512
2020
+        add             r0, FENC_STRIDE * 8
2021
+        lea             r1, [r1 + r5 * 4]
2022
+        lea             r2, [r2 + r5 * 4]
2023
+        lea             r3, [r3 + r5 * 4]
2024
+        lea             r4, [r4 + r5 * 4]
2025
+    %endrep
2026
+    PROCESS_SAD_X4_16x4_AVX512
2027
+    PROCESS_SAD_X4_END_AVX512
2028
+    RET
2029
+
2030
+INIT_ZMM avx512
2031
+cglobal pixel_sad_x4_16x16, 6,8,10
2032
+    pxor    m0,  m0
2033
+    pxor    m1,  m1
2034
+    pxor    m2,  m2
2035
+    pxor    m3,  m3
2036
+
2037
+    vbroadcasti32x8 m9, [pw_1]
2038
+
2039
+    add     r5d, r5d
2040
+    lea     r7d, [r5 * 3]
2041
+
2042
+    %rep 3
2043
+        PROCESS_SAD_X4_16x4_AVX512
2044
+        add             r0, FENC_STRIDE * 8
2045
+        lea             r1, [r1 + r5 * 4]
2046
+        lea             r2, [r2 + r5 * 4]
2047
+        lea             r3, [r3 + r5 * 4]
2048
+        lea             r4, [r4 + r5 * 4]
2049
+    %endrep
2050
+    PROCESS_SAD_X4_16x4_AVX512
2051
+    PROCESS_SAD_X4_END_AVX512
2052
+    RET
2053
+
2054
+INIT_ZMM avx512
2055
+cglobal pixel_sad_x4_16x32, 6,8,10
2056
+    pxor    m0,  m0
2057
+    pxor    m1,  m1
2058
+    pxor    m2,  m2
2059
+    pxor    m3,  m3
2060
+
2061
+    vbroadcasti32x8 m9, [pw_1]
2062
+
2063
+    add     r5d, r5d
2064
+    lea     r7d, [r5 * 3]
2065
+
2066
+    %rep 7
2067
+        PROCESS_SAD_X4_16x4_AVX512
2068
+        add             r0, FENC_STRIDE * 8
2069
+        lea             r1, [r1 + r5 * 4]
2070
+        lea             r2, [r2 + r5 * 4]
2071
+        lea             r3, [r3 + r5 * 4]
2072
+        lea             r4, [r4 + r5 * 4]
2073
+    %endrep
2074
+    PROCESS_SAD_X4_16x4_AVX512
2075
+    PROCESS_SAD_X4_END_AVX512
2076
+    RET
2077
+
2078
+INIT_ZMM avx512
2079
+cglobal pixel_sad_x4_16x64, 6,8,10
2080
+    pxor    m0,  m0
2081
+    pxor    m1,  m1
2082
+    pxor    m2,  m2
2083
+    pxor    m3,  m3
2084
+
2085
+    vbroadcasti32x8 m9, [pw_1]
2086
+
2087
+    add     r5d, r5d
2088
+    lea     r7d, [r5 * 3]
2089
+
2090
+    %rep 15
2091
+        PROCESS_SAD_X4_16x4_AVX512
2092
+        add             r0, FENC_STRIDE * 8
2093
+        lea             r1, [r1 + r5 * 4]
2094
+        lea             r2, [r2 + r5 * 4]
2095
+        lea             r3, [r3 + r5 * 4]
2096
+        lea             r4, [r4 + r5 * 4]
2097
+    %endrep
2098
+    PROCESS_SAD_X4_16x4_AVX512
2099
+    PROCESS_SAD_X4_END_AVX512
2100
+    RET
2101
+%endif
2102
+
2103
+;------------------------------------------------------------------------------------------------------------------------------------------------------------
2104
+; void pixel_sad_x4_32x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res )
2105
+;------------------------------------------------------------------------------------------------------------------------------------------------------------
2106
+%if ARCH_X86_64
2107
+INIT_ZMM avx512
2108
+cglobal pixel_sad_x4_32x8, 6,8,10
2109
+    pxor    m0,  m0
2110
+    pxor    m1,  m1
2111
+    pxor    m2,  m2
2112
+    pxor    m3,  m3
2113
+
2114
+    vbroadcasti32x8 m9, [pw_1]
2115
+
2116
+    add     r5d, r5d
2117
+    lea     r7d, [r5 * 3]
2118
+
2119
+    PROCESS_SAD_X4_32x4_AVX512
2120
+    add             r0, FENC_STRIDE * 8
2121
+    lea             r1, [r1 + r5 * 4]
2122
+    lea             r2, [r2 + r5 * 4]
2123
+    lea             r3, [r3 + r5 * 4]
2124
+    lea             r4, [r4 + r5 * 4]
2125
+    PROCESS_SAD_X4_32x4_AVX512
2126
+    PROCESS_SAD_X4_END_AVX512
2127
+    RET
2128
+
2129
+INIT_ZMM avx512
2130
+cglobal pixel_sad_x4_32x16, 6,8,10
2131
+    pxor    m0,  m0
2132
+    pxor    m1,  m1
2133
+    pxor    m2,  m2
2134
+    pxor    m3,  m3
2135
+
2136
+    vbroadcasti32x8 m9, [pw_1]
2137
+
2138
+    add     r5d, r5d
2139
+    lea     r7d, [r5 * 3]
2140
+
2141
+    PROCESS_SAD_X4_32x4_AVX512
2142
+    add             r0, FENC_STRIDE * 8
2143
+    lea             r1, [r1 + r5 * 4]
2144
+    lea             r2, [r2 + r5 * 4]
2145
+    lea             r3, [r3 + r5 * 4]
2146
+    lea             r4, [r4 + r5 * 4]
2147
+    PROCESS_SAD_X4_32x4_AVX512
2148
+    add             r0, FENC_STRIDE * 8
2149
+    lea             r1, [r1 + r5 * 4]
2150
+    lea             r2, [r2 + r5 * 4]
2151
+    lea             r3, [r3 + r5 * 4]
2152
+    lea             r4, [r4 + r5 * 4]
2153
+    PROCESS_SAD_X4_32x4_AVX512
2154
+    add             r0, FENC_STRIDE * 8
2155
+    lea             r1, [r1 + r5 * 4]
2156
+    lea             r2, [r2 + r5 * 4]
2157
+    lea             r3, [r3 + r5 * 4]
2158
+    lea             r4, [r4 + r5 * 4]
2159
+    PROCESS_SAD_X4_32x4_AVX512
2160
+    PROCESS_SAD_X4_END_AVX512
2161
+    RET
2162
+
2163
+INIT_ZMM avx512
2164
+cglobal pixel_sad_x4_32x24, 6,8,10
2165
+    pxor    m0,  m0
2166
+    pxor    m1,  m1
2167
+    pxor    m2,  m2
2168
+    pxor    m3,  m3
2169
+
2170
+    vbroadcasti32x8 m9, [pw_1]
2171
+
2172
+    add     r5d, r5d
2173
+    lea     r7d, [r5 * 3]
2174
+
2175
+    PROCESS_SAD_X4_32x4_AVX512
2176
+    add             r0, FENC_STRIDE * 8
2177
+    lea             r1, [r1 + r5 * 4]
2178
+    lea             r2, [r2 + r5 * 4]
2179
+    lea             r3, [r3 + r5 * 4]
2180
+    lea             r4, [r4 + r5 * 4]
2181
+    PROCESS_SAD_X4_32x4_AVX512
2182
+     add            r0, FENC_STRIDE * 8
2183
+    lea             r1, [r1 + r5 * 4]
2184
+    lea             r2, [r2 + r5 * 4]
2185
+    lea             r3, [r3 + r5 * 4]
2186
+    lea             r4, [r4 + r5 * 4]
2187
+    PROCESS_SAD_X4_32x4_AVX512
2188
+     add            r0, FENC_STRIDE * 8
2189
+    lea             r1, [r1 + r5 * 4]
2190
+    lea             r2, [r2 + r5 * 4]
2191
+    lea             r3, [r3 + r5 * 4]
2192
+    lea             r4, [r4 + r5 * 4]
2193
+    PROCESS_SAD_X4_32x4_AVX512
2194
+    add             r0, FENC_STRIDE * 8
2195
+    lea             r1, [r1 + r5 * 4]
2196
+    lea             r2, [r2 + r5 * 4]
2197
+    lea             r3, [r3 + r5 * 4]
2198
+    lea             r4, [r4 + r5 * 4]
2199
+    PROCESS_SAD_X4_32x4_AVX512
2200
+    add             r0, FENC_STRIDE * 8
2201
+    lea             r1, [r1 + r5 * 4]
2202
+    lea             r2, [r2 + r5 * 4]
2203
+    lea             r3, [r3 + r5 * 4]
2204
+    lea             r4, [r4 + r5 * 4]
2205
+    PROCESS_SAD_X4_32x4_AVX512
2206
+    PROCESS_SAD_X4_END_AVX512
2207
+    RET
2208
+
2209
+
2210
+INIT_ZMM avx512
2211
+cglobal pixel_sad_x4_32x32, 6,8,10
2212
+    pxor    m0,  m0
2213
+    pxor    m1,  m1
2214
+    pxor    m2,  m2
2215
+    pxor    m3,  m3
2216
+
2217
+    vbroadcasti32x8 m9, [pw_1]
2218
+
2219
+    add     r5d, r5d
2220
+    lea     r7d, [r5 * 3]
2221
+
2222
+    PROCESS_SAD_X4_32x4_AVX512
2223
+    add             r0, FENC_STRIDE * 8
2224
+    lea             r1, [r1 + r5 * 4]
2225
+    lea             r2, [r2 + r5 * 4]
2226
+    lea             r3, [r3 + r5 * 4]
2227
+    lea             r4, [r4 + r5 * 4]
2228
+    PROCESS_SAD_X4_32x4_AVX512
2229
+    add             r0, FENC_STRIDE * 8
2230
+    lea             r1, [r1 + r5 * 4]
2231
+    lea             r2, [r2 + r5 * 4]
2232
+    lea             r3, [r3 + r5 * 4]
2233
+    lea             r4, [r4 + r5 * 4]
2234
+    PROCESS_SAD_X4_32x4_AVX512
2235
+    add             r0, FENC_STRIDE * 8
2236
+    lea             r1, [r1 + r5 * 4]
2237
+    lea             r2, [r2 + r5 * 4]
2238
+    lea             r3, [r3 + r5 * 4]
2239
+    lea             r4, [r4 + r5 * 4]
2240
+    PROCESS_SAD_X4_32x4_AVX512
2241
+    add             r0, FENC_STRIDE * 8
2242
+    lea             r1, [r1 + r5 * 4]
2243
+    lea             r2, [r2 + r5 * 4]
2244
+    lea             r3, [r3 + r5 * 4]
2245
+    lea             r4, [r4 + r5 * 4]
2246
+    PROCESS_SAD_X4_32x4_AVX512
2247
+    add             r0, FENC_STRIDE * 8
2248
+    lea             r1, [r1 + r5 * 4]
2249
+    lea             r2, [r2 + r5 * 4]
2250
+    lea             r3, [r3 + r5 * 4]
2251
+    lea             r4, [r4 + r5 * 4]
2252
+    PROCESS_SAD_X4_32x4_AVX512
2253
+   add              r0, FENC_STRIDE * 8
2254
+    lea             r1, [r1 + r5 * 4]
2255
+    lea             r2, [r2 + r5 * 4]
2256
+    lea             r3, [r3 + r5 * 4]
2257
+    lea             r4, [r4 + r5 * 4]
2258
+    PROCESS_SAD_X4_32x4_AVX512
2259
+    add             r0, FENC_STRIDE * 8
2260
+    lea             r1, [r1 + r5 * 4]
2261
+    lea             r2, [r2 + r5 * 4]
2262
+    lea             r3, [r3 + r5 * 4]
2263
+    lea             r4, [r4 + r5 * 4]
2264
+    PROCESS_SAD_X4_32x4_AVX512
2265
+    PROCESS_SAD_X4_END_AVX512
2266
+    RET
2267
+
2268
+INIT_ZMM avx512
2269
+cglobal pixel_sad_x4_32x64, 6,8,10
2270
+    pxor    m0,  m0
2271
+    pxor    m1,  m1
2272
+    pxor    m2,  m2
2273
+    pxor    m3,  m3
2274
+
2275
+    vbroadcasti32x8 m9, [pw_1]
2276
+
2277
+    add     r5d, r5d
2278
+    lea     r7d, [r5 * 3]
2279
+
2280
+    PROCESS_SAD_X4_32x4_AVX512
2281
+    add             r0, FENC_STRIDE * 8
2282
+    lea             r1, [r1 + r5 * 4]
2283
+    lea             r2, [r2 + r5 * 4]
2284
+    lea             r3, [r3 + r5 * 4]
2285
+    lea             r4, [r4 + r5 * 4]
2286
+    PROCESS_SAD_X4_32x4_AVX512
2287
+    add             r0, FENC_STRIDE * 8
2288
+    lea             r1, [r1 + r5 * 4]
2289
+    lea             r2, [r2 + r5 * 4]
2290
+    lea             r3, [r3 + r5 * 4]
2291
+    lea             r4, [r4 + r5 * 4]
2292
+    PROCESS_SAD_X4_32x4_AVX512
2293
+    add             r0, FENC_STRIDE * 8
2294
+    lea             r1, [r1 + r5 * 4]
2295
+    lea             r2, [r2 + r5 * 4]
2296
+    lea             r3, [r3 + r5 * 4]
2297
+    lea             r4, [r4 + r5 * 4]
2298
+    PROCESS_SAD_X4_32x4_AVX512
2299
+    add             r0, FENC_STRIDE * 8
2300
+    lea             r1, [r1 + r5 * 4]
2301
+    lea             r2, [r2 + r5 * 4]
2302
+    lea             r3, [r3 + r5 * 4]
2303
+    lea             r4, [r4 + r5 * 4]
2304
+    PROCESS_SAD_X4_32x4_AVX512
2305
+    add             r0, FENC_STRIDE * 8
2306
+    lea             r1, [r1 + r5 * 4]
2307
+    lea             r2, [r2 + r5 * 4]
2308
+    lea             r3, [r3 + r5 * 4]
2309
+    lea             r4, [r4 + r5 * 4]
2310
+    PROCESS_SAD_X4_32x4_AVX512
2311
+    add             r0, FENC_STRIDE * 8
2312
+    lea             r1, [r1 + r5 * 4]
2313
+    lea             r2, [r2 + r5 * 4]
2314
+    lea             r3, [r3 + r5 * 4]
2315
+    lea             r4, [r4 + r5 * 4]
2316
+    PROCESS_SAD_X4_32x4_AVX512
2317
+    add             r0, FENC_STRIDE * 8
2318
+    lea             r1, [r1 + r5 * 4]
2319
+    lea             r2, [r2 + r5 * 4]
2320
+    lea             r3, [r3 + r5 * 4]
2321
+    lea             r4, [r4 + r5 * 4]
2322
+    PROCESS_SAD_X4_32x4_AVX512
2323
+    add             r0, FENC_STRIDE * 8
2324
+    lea             r1, [r1 + r5 * 4]
2325
+    lea             r2, [r2 + r5 * 4]
2326
+    lea             r3, [r3 + r5 * 4]
2327
+    lea             r4, [r4 + r5 * 4]
2328
+    PROCESS_SAD_X4_32x4_AVX512
2329
+    add             r0, FENC_STRIDE * 8
2330
+    lea             r1, [r1 + r5 * 4]
2331
+    lea             r2, [r2 + r5 * 4]
2332
+    lea             r3, [r3 + r5 * 4]
2333
+    lea             r4, [r4 + r5 * 4]
2334
+    PROCESS_SAD_X4_32x4_AVX512
2335
+    add             r0, FENC_STRIDE * 8
2336
+    lea             r1, [r1 + r5 * 4]
2337
+    lea             r2, [r2 + r5 * 4]
2338
+    lea             r3, [r3 + r5 * 4]
2339
+    lea             r4, [r4 + r5 * 4]
2340
+    PROCESS_SAD_X4_32x4_AVX512
2341
+    add             r0, FENC_STRIDE * 8
2342
+    lea             r1, [r1 + r5 * 4]
2343
+    lea             r2, [r2 + r5 * 4]
2344
+    lea             r3, [r3 + r5 * 4]
2345
+    lea             r4, [r4 + r5 * 4]
2346
+    PROCESS_SAD_X4_32x4_AVX512
2347
+    add             r0, FENC_STRIDE * 8
2348
+    lea             r1, [r1 + r5 * 4]
2349
+    lea             r2, [r2 + r5 * 4]
2350
+    lea             r3, [r3 + r5 * 4]
2351
+    lea             r4, [r4 + r5 * 4]
2352
+    PROCESS_SAD_X4_32x4_AVX512
2353
+    add             r0, FENC_STRIDE * 8
2354
+    lea             r1, [r1 + r5 * 4]
2355
+    lea             r2, [r2 + r5 * 4]
2356
+    lea             r3, [r3 + r5 * 4]
2357
+    lea             r4, [r4 + r5 * 4]
2358
+    PROCESS_SAD_X4_32x4_AVX512
2359
+    add             r0, FENC_STRIDE * 8
2360
+    lea             r1, [r1 + r5 * 4]
2361
+    lea             r2, [r2 + r5 * 4]
2362
+    lea             r3, [r3 + r5 * 4]
2363
+    lea             r4, [r4 + r5 * 4]
2364
+    PROCESS_SAD_X4_32x4_AVX512
2365
+    add             r0, FENC_STRIDE * 8
2366
+    lea             r1, [r1 + r5 * 4]
2367
+    lea             r2, [r2 + r5 * 4]
2368
+    lea             r3, [r3 + r5 * 4]
2369
+    lea             r4, [r4 + r5 * 4]
2370
+    PROCESS_SAD_X4_32x4_AVX512
2371
+    PROCESS_SAD_X4_END_AVX512
2372
+    RET
2373
+%endif
2374
+;------------------------------------------------------------------------------------------------------------------------------------------------------------
2375
+; void pixel_sad_x4_48x64( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res )
2376
+;------------------------------------------------------------------------------------------------------------------------------------------------------------
2377
+%if ARCH_X86_64
2378
+INIT_ZMM avx512
2379
+cglobal pixel_sad_x4_48x64, 4, 9, 20
2380
+    pxor    m0,  m0
2381
+    pxor    m1,  m1
2382
+    pxor    m2,  m2
2383
+    pxor    m3,  m3
2384
+    mov     r8d,  64/4
2385
+
2386
+    vbroadcasti32x8 m19, [pw_1]
2387
+
2388
+    add     r5d, r5d
2389
+    lea     r7d, [r5 * 3]
2390
+.loop:
2391
+    movu            m4,   [r0]
2392
+    movu            m5,   [r0 + 2 * FENC_STRIDE]
2393
+    movu           ym6,   [r0 + mmsize]
2394
+    vinserti32x8    m6,   [r0 + 2 * FENC_STRIDE + mmsize], 1
2395
+    movu            m7,   [r1]
2396
+    movu            m8,   [r1 + r5]
2397
+    movu           ym9,   [r1 + mmsize]
2398
+    vinserti32x8    m9,   [r1 + r5 + mmsize], 1
2399
+    movu            m10,  [r2]
2400
+    movu            m11,  [r2 + r5]
2401
+    movu           ym12,  [r2 + mmsize]
2402
+    vinserti32x8    m12,  [r2 + r5 + mmsize], 1
2403
+    movu            m13,  [r3]
2404
+    movu            m14,  [r3 + r5]
2405
+    movu           ym15,  [r3 + mmsize]
2406
+    vinserti32x8    m15,  [r3 + r5 + mmsize], 1
2407
+    movu            m16,  [r4]
2408
+    movu            m17,  [r4 + r5]
2409
+    movu           ym18,  [r4 + mmsize]
2410
+    vinserti32x8    m18,  [r4 + r5 + mmsize], 1
2411
+
2412
+    psubw   m7,  m4
2413
+    psubw   m8,  m5
2414
+    psubw   m9,  m6
2415
+    psubw   m10, m4
2416
+    psubw   m11, m5
2417
+    psubw   m12, m6
2418
+    psubw   m13, m4
2419
+    psubw   m14, m5
2420
+    psubw   m15, m6
2421
+    psubw   m16, m4
2422
+    psubw   m17, m5
2423
+    psubw   m18, m6
2424
+
2425
+    pabsw   m7,  m7
2426
+    pabsw   m8,  m8
2427
+    pabsw   m9,  m9
2428
+    pabsw   m10, m10
2429
+    pabsw   m11, m11
2430
+    pabsw   m12, m12
2431
+    pabsw   m13, m13
2432
+    pabsw   m14, m14
2433
+    pabsw   m15, m15
2434
+    pabsw   m16, m16
2435
+    pabsw   m17, m17
2436
+    pabsw   m18, m18
2437
+
2438
+    paddw   m7,  m8
2439
+    paddw   m7,  m9
2440
+    paddw   m10, m11
2441
+    paddw   m10, m12
2442
+    paddw   m13, m14
2443
+    paddw   m13, m15
2444
+    paddw   m16, m17
2445
+    paddw   m16, m18
2446
+
2447
+    pmaddwd m7,  m19
2448
+    paddd   m0,  m7
2449
+    pmaddwd m10, m19
2450
+    paddd   m1,  m10
2451
+    pmaddwd m13, m19
2452
+    paddd   m2,  m13
2453
+    pmaddwd m16, m19
2454
+    paddd   m3,  m16
2455
+
2456
+    movu            m4,   [r0 + 4 * FENC_STRIDE]
2457
+    movu            m5,   [r0 + 6 * FENC_STRIDE]
2458
+    movu           ym6,   [r0 + 4 * FENC_STRIDE + mmsize]
2459
+    vinserti32x8    m6,   [r0 + 6 * FENC_STRIDE + mmsize], 1
2460
+    movu            m7,   [r1 + 2 * r5]
2461
+    movu            m8,   [r1 + r7]
2462
+    movu           ym9,   [r1 + 2 * r5 + mmsize]
2463
+    vinserti32x8    m9,   [r1 + r7 + mmsize], 1
2464
+    movu            m10,  [r2 + 2 * r5]
2465
+    movu            m11,  [r2 + r7]
2466
+    movu           ym12,  [r2 + 2 * r5 + mmsize]
2467
+    vinserti32x8    m12,  [r2 + r7 + mmsize], 1
2468
+    movu            m13,  [r3 + 2 * r5]
2469
+    movu            m14,  [r3 + r7]
2470
+    movu           ym15,  [r3 + 2 * r5 + mmsize]
2471
+    vinserti32x8    m15,  [r3 + r7 + mmsize], 1
2472
+    movu            m16,  [r4 + 2 * r5]
2473
+    movu            m17,  [r4 + r7]
2474
+    movu           ym18,  [r4 + 2 * r5 + mmsize]
2475
+    vinserti32x8    m18,  [r4 + r7 + mmsize], 1
2476
+
2477
+
2478
+    psubw   m7,  m4
2479
+    psubw   m8,  m5
2480
+    psubw   m9,  m6
2481
+    psubw   m10, m4
2482
+    psubw   m11, m5
2483
+    psubw   m12, m6
2484
+    psubw   m13, m4
2485
+    psubw   m14, m5
2486
+    psubw   m15, m6
2487
+    psubw   m16, m4
2488
+    psubw   m17, m5
2489
+    psubw   m18, m6
2490
+
2491
+    pabsw   m7,  m7
2492
+    pabsw   m8,  m8
2493
+    pabsw   m9,  m9
2494
+    pabsw   m10, m10
2495
+    pabsw   m11, m11
2496
+    pabsw   m12, m12
2497
+    pabsw   m13, m13
2498
+    pabsw   m14, m14
2499
+    pabsw   m15, m15
2500
+    pabsw   m16, m16
2501
+    pabsw   m17, m17
2502
+    pabsw   m18, m18
2503
+
2504
+    paddw   m7,  m8
2505
+    paddw   m7,  m9
2506
+    paddw   m10, m11
2507
+    paddw   m10, m12
2508
+    paddw   m13, m14
2509
+    paddw   m13, m15
2510
+    paddw   m16, m17
2511
+    paddw   m16, m18
2512
+
2513
+    pmaddwd m7,  m19
2514
+    paddd   m0,  m7
2515
+    pmaddwd m10, m19
2516
+    paddd   m1,  m10
2517
+    pmaddwd m13, m19
2518
+    paddd   m2,  m13
2519
+    pmaddwd m16, m19
2520
+    paddd   m3,  m16
2521
+
2522
+    add             r0, FENC_STRIDE * 8
2523
+    lea             r1, [r1 + r5 * 4]
2524
+    lea             r2, [r2 + r5 * 4]
2525
+    lea             r3, [r3 + r5 * 4]
2526
+    lea             r4, [r4 + r5 * 4]
2527
+
2528
+    dec     r8d
2529
+    jg      .loop
2530
+
2531
+    PROCESS_SAD_X4_END_AVX512
2532
+    RET
2533
+%endif
2534
+
2535
+;------------------------------------------------------------------------------------------------------------------------------------------------------------
2536
+; void pixel_sad_x4_64x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res )
2537
+;------------------------------------------------------------------------------------------------------------------------------------------------------------
2538
+%if ARCH_X86_64
2539
+INIT_ZMM avx512
2540
+cglobal pixel_sad_x4_64x16, 6,8,15
2541
+    pxor    m0,  m0
2542
+    pxor    m1,  m1
2543
+    pxor    m2,  m2
2544
+    pxor    m3,  m3
2545
+
2546
+    vbroadcasti32x8 m9, [pw_1]
2547
+
2548
+    add     r5d, r5d
2549
+    lea     r7d, [r5 * 3]
2550
+
2551
+    PROCESS_SAD_X4_64x4_AVX512
2552
+    add             r0, FENC_STRIDE * 8
2553
+    lea             r1, [r1 + r5 * 4]
2554
+    lea             r2, [r2 + r5 * 4]
2555
+    lea             r3, [r3 + r5 * 4]
2556
+    lea             r4, [r4 + r5 * 4]
2557
+    PROCESS_SAD_X4_64x4_AVX512
2558
+    add             r0, FENC_STRIDE * 8
2559
+    lea             r1, [r1 + r5 * 4]
2560
+    lea             r2, [r2 + r5 * 4]
2561
+    lea             r3, [r3 + r5 * 4]
2562
+    lea             r4, [r4 + r5 * 4]
2563
+    PROCESS_SAD_X4_64x4_AVX512
2564
+    add             r0, FENC_STRIDE * 8
2565
+    lea             r1, [r1 + r5 * 4]
2566
+    lea             r2, [r2 + r5 * 4]
2567
+    lea             r3, [r3 + r5 * 4]
2568
+    lea             r4, [r4 + r5 * 4]
2569
+    PROCESS_SAD_X4_64x4_AVX512
2570
+    PROCESS_SAD_X4_END_AVX512
2571
+    RET
2572
+
2573
+INIT_ZMM avx512
2574
+cglobal pixel_sad_x4_64x32, 6,8,15
2575
+    pxor    m0,  m0
2576
+    pxor    m1,  m1
2577
+    pxor    m2,  m2
2578
+    pxor    m3,  m3
2579
+
2580
+    vbroadcasti32x8 m9, [pw_1]
2581
+
2582
+    add     r5d, r5d
2583
+    lea     r7d, [r5 * 3]
2584
+
2585
+    PROCESS_SAD_X4_64x4_AVX512
2586
+    add             r0, FENC_STRIDE * 8
2587
+    lea             r1, [r1 + r5 * 4]
2588
+    lea             r2, [r2 + r5 * 4]
2589
+    lea             r3, [r3 + r5 * 4]
2590
+    lea             r4, [r4 + r5 * 4]
2591
+    PROCESS_SAD_X4_64x4_AVX512
2592
+    add             r0, FENC_STRIDE * 8
2593
+    lea             r1, [r1 + r5 * 4]
2594
+    lea             r2, [r2 + r5 * 4]
2595
+    lea             r3, [r3 + r5 * 4]
2596
+    lea             r4, [r4 + r5 * 4]
2597
+    PROCESS_SAD_X4_64x4_AVX512
2598
+    add             r0, FENC_STRIDE * 8
2599
+    lea             r1, [r1 + r5 * 4]
2600
+    lea             r2, [r2 + r5 * 4]
2601
+    lea             r3, [r3 + r5 * 4]
2602
+    lea             r4, [r4 + r5 * 4]
2603
+    PROCESS_SAD_X4_64x4_AVX512
2604
+    add             r0, FENC_STRIDE * 8
2605
+    lea             r1, [r1 + r5 * 4]
2606
+    lea             r2, [r2 + r5 * 4]
2607
+    lea             r3, [r3 + r5 * 4]
2608
+    lea             r4, [r4 + r5 * 4]
2609
+    PROCESS_SAD_X4_64x4_AVX512
2610
+    add             r0, FENC_STRIDE * 8
2611
+    lea             r1, [r1 + r5 * 4]
2612
+    lea             r2, [r2 + r5 * 4]
2613
+    lea             r3, [r3 + r5 * 4]
2614
+    lea             r4, [r4 + r5 * 4]
2615
+    PROCESS_SAD_X4_64x4_AVX512
2616
+    add             r0, FENC_STRIDE * 8
2617
+    lea             r1, [r1 + r5 * 4]
2618
+    lea             r2, [r2 + r5 * 4]
2619
+    lea             r3, [r3 + r5 * 4]
2620
+    lea             r4, [r4 + r5 * 4]
2621
+    PROCESS_SAD_X4_64x4_AVX512
2622
+    add             r0, FENC_STRIDE * 8
2623
+    lea             r1, [r1 + r5 * 4]
2624
+    lea             r2, [r2 + r5 * 4]
2625
+    lea             r3, [r3 + r5 * 4]
2626
+    lea             r4, [r4 + r5 * 4]
2627
+    PROCESS_SAD_X4_64x4_AVX512
2628
+    PROCESS_SAD_X4_END_AVX512
2629
+    RET
2630
+
2631
+INIT_ZMM avx512
2632
+cglobal pixel_sad_x4_64x48, 6,8,15
2633
+    pxor    m0,  m0
2634
+    pxor    m1,  m1
2635
+    pxor    m2,  m2
2636
+    pxor    m3,  m3
2637
+
2638
+    vbroadcasti32x8 m9, [pw_1]
2639
+
2640
+    add     r5d, r5d
2641
+    lea     r7d, [r5 * 3]
2642
+
2643
+    PROCESS_SAD_X4_64x4_AVX512
2644
+    add             r0, FENC_STRIDE * 8
2645
+    lea             r1, [r1 + r5 * 4]
2646
+    lea             r2, [r2 + r5 * 4]
2647
+    lea             r3, [r3 + r5 * 4]
2648
+    lea             r4, [r4 + r5 * 4]
2649
+    PROCESS_SAD_X4_64x4_AVX512
2650
+    add             r0, FENC_STRIDE * 8
2651
+    lea             r1, [r1 + r5 * 4]
2652
+    lea             r2, [r2 + r5 * 4]
2653
+    lea             r3, [r3 + r5 * 4]
2654
+    lea             r4, [r4 + r5 * 4]
2655
+    PROCESS_SAD_X4_64x4_AVX512
2656
+    add             r0, FENC_STRIDE * 8
2657
+    lea             r1, [r1 + r5 * 4]
2658
+    lea             r2, [r2 + r5 * 4]
2659
+    lea             r3, [r3 + r5 * 4]
2660
+    lea             r4, [r4 + r5 * 4]
2661
+    PROCESS_SAD_X4_64x4_AVX512
2662
+    add             r0, FENC_STRIDE * 8
2663
+    lea             r1, [r1 + r5 * 4]
2664
+    lea             r2, [r2 + r5 * 4]
2665
+    lea             r3, [r3 + r5 * 4]
2666
+    lea             r4, [r4 + r5 * 4]
2667
+    PROCESS_SAD_X4_64x4_AVX512
2668
+    add             r0, FENC_STRIDE * 8
2669
+    lea             r1, [r1 + r5 * 4]
2670
+    lea             r2, [r2 + r5 * 4]
2671
+    lea             r3, [r3 + r5 * 4]
2672
+    lea             r4, [r4 + r5 * 4]
2673
+    PROCESS_SAD_X4_64x4_AVX512
2674
+    add             r0, FENC_STRIDE * 8
2675
+    lea             r1, [r1 + r5 * 4]
2676
+    lea             r2, [r2 + r5 * 4]
2677
+    lea             r3, [r3 + r5 * 4]
2678
+    lea             r4, [r4 + r5 * 4]
2679
+    PROCESS_SAD_X4_64x4_AVX512
2680
+    add             r0, FENC_STRIDE * 8
2681
+    lea             r1, [r1 + r5 * 4]
2682
+    lea             r2, [r2 + r5 * 4]
2683
+    lea             r3, [r3 + r5 * 4]
2684
+    lea             r4, [r4 + r5 * 4]
2685
+    PROCESS_SAD_X4_64x4_AVX512
2686
+    add             r0, FENC_STRIDE * 8
2687
+    lea             r1, [r1 + r5 * 4]
2688
+    lea             r2, [r2 + r5 * 4]
2689
+    lea             r3, [r3 + r5 * 4]
2690
+    lea             r4, [r4 + r5 * 4]
2691
+    PROCESS_SAD_X4_64x4_AVX512
2692
+    add             r0, FENC_STRIDE * 8
2693
+    lea             r1, [r1 + r5 * 4]
2694
+    lea             r2, [r2 + r5 * 4]
2695
+    lea             r3, [r3 + r5 * 4]
2696
+    lea             r4, [r4 + r5 * 4]
2697
+    PROCESS_SAD_X4_64x4_AVX512
2698
+    add             r0, FENC_STRIDE * 8
2699
+    lea             r1, [r1 + r5 * 4]
2700
+    lea             r2, [r2 + r5 * 4]
2701
+    lea             r3, [r3 + r5 * 4]
2702
+    lea             r4, [r4 + r5 * 4]
2703
+    PROCESS_SAD_X4_64x4_AVX512
2704
+    add             r0, FENC_STRIDE * 8
2705
+    lea             r1, [r1 + r5 * 4]
2706
+    lea             r2, [r2 + r5 * 4]
2707
+    lea             r3, [r3 + r5 * 4]
2708
+    lea             r4, [r4 + r5 * 4]
2709
+    PROCESS_SAD_X4_64x4_AVX512
2710
+    PROCESS_SAD_X4_END_AVX512
2711
+    RET
2712
+
2713
+INIT_ZMM avx512
2714
+cglobal pixel_sad_x4_64x64, 6,8,15
2715
+    pxor    m0,  m0
2716
+    pxor    m1,  m1
2717
+    pxor    m2,  m2
2718
+    pxor    m3,  m3
2719
+
2720
+    vbroadcasti32x8 m9, [pw_1]
2721
+
2722
+    add     r5d, r5d
2723
+    lea     r7d, [r5 * 3]
2724
+
2725
+    PROCESS_SAD_X4_64x4_AVX512
2726
+    add             r0, FENC_STRIDE * 8
2727
+    lea             r1, [r1 + r5 * 4]
2728
+    lea             r2, [r2 + r5 * 4]
2729
+    lea             r3, [r3 + r5 * 4]
2730
+    lea             r4, [r4 + r5 * 4]
2731
+    PROCESS_SAD_X4_64x4_AVX512
2732
+    add             r0, FENC_STRIDE * 8
2733
+    lea             r1, [r1 + r5 * 4]
2734
+    lea             r2, [r2 + r5 * 4]
2735
+    lea             r3, [r3 + r5 * 4]
2736
+    lea             r4, [r4 + r5 * 4]
2737
+    PROCESS_SAD_X4_64x4_AVX512
2738
+    add             r0, FENC_STRIDE * 8
2739
+    lea             r1, [r1 + r5 * 4]
2740
+    lea             r2, [r2 + r5 * 4]
2741
+    lea             r3, [r3 + r5 * 4]
2742
+    lea             r4, [r4 + r5 * 4]
2743
+    PROCESS_SAD_X4_64x4_AVX512
2744
+    add             r0, FENC_STRIDE * 8
2745
+    lea             r1, [r1 + r5 * 4]
2746
+    lea             r2, [r2 + r5 * 4]
2747
+    lea             r3, [r3 + r5 * 4]
2748
+    lea             r4, [r4 + r5 * 4]
2749
+    PROCESS_SAD_X4_64x4_AVX512
2750
+    add             r0, FENC_STRIDE * 8
2751
+    lea             r1, [r1 + r5 * 4]
2752
+    lea             r2, [r2 + r5 * 4]
2753
+    lea             r3, [r3 + r5 * 4]
2754
+    lea             r4, [r4 + r5 * 4]
2755
+    PROCESS_SAD_X4_64x4_AVX512
2756
+    add             r0, FENC_STRIDE * 8
2757
+    lea             r1, [r1 + r5 * 4]
2758
+    lea             r2, [r2 + r5 * 4]
2759
+    lea             r3, [r3 + r5 * 4]
2760
+    lea             r4, [r4 + r5 * 4]
2761
+    PROCESS_SAD_X4_64x4_AVX512
2762
+    add             r0, FENC_STRIDE * 8
2763
+    lea             r1, [r1 + r5 * 4]
2764
+    lea             r2, [r2 + r5 * 4]
2765
+    lea             r3, [r3 + r5 * 4]
2766
+    lea             r4, [r4 + r5 * 4]
2767
+    PROCESS_SAD_X4_64x4_AVX512
2768
+    add             r0, FENC_STRIDE * 8
2769
+    lea             r1, [r1 + r5 * 4]
2770
+    lea             r2, [r2 + r5 * 4]
2771
+    lea             r3, [r3 + r5 * 4]
2772
+    lea             r4, [r4 + r5 * 4]
2773
+    PROCESS_SAD_X4_64x4_AVX512
2774
+    add             r0, FENC_STRIDE * 8
2775
+    lea             r1, [r1 + r5 * 4]
2776
+    lea             r2, [r2 + r5 * 4]
2777
+    lea             r3, [r3 + r5 * 4]
2778
+    lea             r4, [r4 + r5 * 4]
2779
+    PROCESS_SAD_X4_64x4_AVX512
2780
+    add             r0, FENC_STRIDE * 8
2781
+    lea             r1, [r1 + r5 * 4]
2782
+    lea             r2, [r2 + r5 * 4]
2783
+    lea             r3, [r3 + r5 * 4]
2784
+    lea             r4, [r4 + r5 * 4]
2785
+    PROCESS_SAD_X4_64x4_AVX512
2786
+    add             r0, FENC_STRIDE * 8
2787
+    lea             r1, [r1 + r5 * 4]
2788
+    lea             r2, [r2 + r5 * 4]
2789
+    lea             r3, [r3 + r5 * 4]
2790
+    lea             r4, [r4 + r5 * 4]
2791
+    PROCESS_SAD_X4_64x4_AVX512
2792
+    add             r0, FENC_STRIDE * 8
2793
+    lea             r1, [r1 + r5 * 4]
2794
+    lea             r2, [r2 + r5 * 4]
2795
+    lea             r3, [r3 + r5 * 4]
2796
+    lea             r4, [r4 + r5 * 4]
2797
+    PROCESS_SAD_X4_64x4_AVX512
2798
+    add             r0, FENC_STRIDE * 8
2799
+    lea             r1, [r1 + r5 * 4]
2800
+    lea             r2, [r2 + r5 * 4]
2801
+    lea             r3, [r3 + r5 * 4]
2802
+    lea             r4, [r4 + r5 * 4]
2803
+    PROCESS_SAD_X4_64x4_AVX512
2804
+    add             r0, FENC_STRIDE * 8
2805
+    lea             r1, [r1 + r5 * 4]
2806
+    lea             r2, [r2 + r5 * 4]
2807
+    lea             r3, [r3 + r5 * 4]
2808
+    lea             r4, [r4 + r5 * 4]
2809
+    PROCESS_SAD_X4_64x4_AVX512
2810
+    add             r0, FENC_STRIDE * 8
2811
+    lea             r1, [r1 + r5 * 4]
2812
+    lea             r2, [r2 + r5 * 4]
2813
+    lea             r3, [r3 + r5 * 4]
2814
+    lea             r4, [r4 + r5 * 4]
2815
+    PROCESS_SAD_X4_64x4_AVX512
2816
+    PROCESS_SAD_X4_END_AVX512
2817
+    RET
2818
+%endif
2819
x265_2.7.tar.gz/source/common/x86/ssd-a.asm -> x265_2.9.tar.gz/source/common/x86/ssd-a.asm Changed
590
 
1
@@ -141,6 +141,8 @@
2
 
3
 ; Function to find ssd for 32x16 block, sse2, 12 bit depth
4
 ; Defined sepeartely to be called from SSD_ONE_32 macro
5
+%if ARCH_X86_64
6
+;This code is written for 64 bit architecture
7
 INIT_XMM sse2
8
 cglobal ssd_ss_32x16
9
     pxor        m8, m8
10
@@ -180,8 +182,10 @@
11
     paddq       m4, m5
12
     paddq       m9, m4
13
     ret
14
+%endif
15
 
16
 %macro SSD_ONE_32 0
17
+%if ARCH_X86_64
18
 cglobal pixel_ssd_ss_32x64, 4,7,10
19
     add         r1d, r1d
20
     add         r3d, r3d
21
@@ -193,7 +197,9 @@
22
     call        ssd_ss_32x16
23
     movq        rax, m9
24
     RET
25
+%endif
26
 %endmacro
27
+
28
 %macro SSD_ONE_SS_32 0
29
 cglobal pixel_ssd_ss_32x32, 4,5,8
30
     add         r1d, r1d
31
@@ -554,6 +560,7 @@
32
     RET
33
 %endmacro
34
 
35
+%if ARCH_X86_64
36
 INIT_YMM avx2
37
 cglobal pixel_ssd_16x16, 4,7,3
38
     FIX_STRIDES r1, r3
39
@@ -697,6 +704,108 @@
40
     movq            rax, xm3
41
     RET
42
 
43
+INIT_ZMM avx512
44
+cglobal pixel_ssd_32x2
45
+    pxor            m0, m0
46
+    movu            m1, [r0]
47
+    psubw           m1, [r2]
48
+    pmaddwd         m1, m1
49
+    paddd           m0, m1
50
+    movu            m1, [r0 + r1]
51
+    psubw           m1, [r2 + r3]
52
+    pmaddwd         m1, m1
53
+    paddd           m0, m1
54
+    lea             r0, [r0 + r1 * 2]
55
+    lea             r2, [r2 + r3 * 2]
56
+
57
+    mova            m1, m0
58
+    pxor            m2, m2
59
+    punpckldq       m0, m2
60
+    punpckhdq       m1, m2
61
+
62
+    paddq           m3, m0
63
+    paddq           m3, m1
64
+ret
65
+
66
+INIT_ZMM avx512
67
+cglobal pixel_ssd_32x32, 4,5,5
68
+    shl             r1d, 1
69
+    shl             r3d, 1
70
+    pxor            m3, m3
71
+    mov             r4, 16
72
+.iterate:
73
+    call            pixel_ssd_32x2
74
+    dec             r4d
75
+    jne             .iterate
76
+
77
+    vextracti32x8   ym4, m3, 1
78
+    paddq           ym3, ym4
79
+    vextracti32x4   xm4, m3, 1
80
+    paddq           xm3, xm4
81
+    movhlps         xm4, xm3
82
+    paddq           xm3, xm4
83
+    movq            rax, xm3
84
+RET
85
+
86
+INIT_ZMM avx512
87
+cglobal pixel_ssd_32x64, 4,5,5
88
+    shl             r1d, 1
89
+    shl             r3d, 1
90
+    pxor            m3, m3
91
+    mov             r4, 32
92
+.iterate:
93
+    call            pixel_ssd_32x2
94
+    dec             r4d
95
+    jne             .iterate
96
+
97
+    vextracti32x8   ym4, m3, 1
98
+    paddq           ym3, ym4
99
+    vextracti32x4   xm4, m3, 1
100
+    paddq           xm3, xm4
101
+    movhlps         xm4, xm3
102
+    paddq           xm3, xm4
103
+    movq            rax, xm3
104
+RET
105
+
106
+INIT_ZMM avx512
107
+cglobal pixel_ssd_64x64, 4,5,5
108
+    FIX_STRIDES     r1, r3
109
+    mov             r4d, 64
110
+    pxor            m3, m3
111
+
112
+.loop:
113
+    pxor            m0, m0
114
+    movu            m1, [r0]
115
+    psubw           m1, [r2]
116
+    pmaddwd         m1, m1
117
+    paddd           m0, m1
118
+    movu            m1, [r0 + mmsize]
119
+    psubw           m1, [r2 + mmsize]
120
+    pmaddwd         m1, m1
121
+    paddd           m0, m1
122
+
123
+    lea             r0, [r0 + r1]
124
+    lea             r2, [r2 + r3]
125
+
126
+    mova            m1, m0
127
+    pxor            m2, m2
128
+    punpckldq       m0, m2
129
+    punpckhdq       m1, m2
130
+    paddq           m3, m0
131
+    paddq           m3, m1
132
+
133
+    dec             r4d
134
+    jg              .loop
135
+
136
+    vextracti32x8   ym4, m3, 1
137
+    paddq           ym3, ym4
138
+    vextracti32x4   xm4, m3, 1
139
+    paddq           xm3, xm4
140
+    movhlps         xm4, xm3
141
+    paddq           xm3, xm4
142
+    movq            rax, xm3
143
+    RET
144
+%endif
145
 INIT_MMX mmx2
146
 SSD_ONE     4,  4
147
 SSD_ONE     4,  8
148
@@ -726,7 +835,9 @@
149
 %if BIT_DEPTH <= 10
150
     SSD_ONE    32, 64
151
     SSD_ONE    32, 32
152
+%if ARCH_X86_64
153
     SSD_TWO    64, 64
154
+%endif
155
 %else
156
     SSD_ONE_32
157
     SSD_ONE_SS_32
158
@@ -1377,7 +1488,126 @@
159
     HADDD       m2, m0
160
     movd        eax, xm2
161
     RET
162
+;-----------------------------------------------------------------------------
163
+; ssd_ss avx512 code start
164
+;-----------------------------------------------------------------------------
165
+%if ARCH_X86_64
166
+%macro PROCESS_SSD_SS_64x4_AVX512 0
167
+    movu        m0, [r0]
168
+    movu        m1, [r0 + mmsize]
169
+    movu        m2, [r0 + r1]
170
+    movu        m3, [r0 + r1 + mmsize]
171
+    movu        m4, [r2]
172
+    movu        m5, [r2 + mmsize]
173
+    movu        m6, [r2 + r3]
174
+    movu        m7, [r2 + r3 + mmsize]
175
+
176
+    psubw       m0, m4
177
+    psubw       m1, m5
178
+    psubw       m2, m6
179
+    psubw       m3, m7
180
+    pmaddwd     m0, m0
181
+    pmaddwd     m1, m1
182
+    pmaddwd     m2, m2
183
+    pmaddwd     m3, m3
184
+    paddd       m8, m0
185
+    paddd       m8, m1
186
+    paddd       m8, m2
187
+    paddd       m8, m3
188
 
189
+    movu        m0, [r0 + 2 * r1]
190
+    movu        m1, [r0 + 2 * r1 + mmsize]
191
+    movu        m2, [r0 + r5]
192
+    movu        m3, [r0 + r5 + mmsize]
193
+    movu        m4, [r2 + 2 * r3]
194
+    movu        m5, [r2 + 2 * r3 + mmsize]
195
+    movu        m6, [r2 + r6]
196
+    movu        m7, [r2 + r6 + mmsize]
197
+
198
+    psubw       m0, m4
199
+    psubw       m1, m5
200
+    psubw       m2, m6
201
+    psubw       m3, m7
202
+    pmaddwd     m0, m0
203
+    pmaddwd     m1, m1
204
+    pmaddwd     m2, m2
205
+    pmaddwd     m3, m3
206
+    paddd       m8, m0
207
+    paddd       m8, m1
208
+    paddd       m8, m2
209
+    paddd       m8, m3
210
+%endmacro
211
+
212
+%macro PROCESS_SSD_SS_32x4_AVX512 0
213
+    movu        m0, [r0]
214
+    movu        m1, [r0 + r1]
215
+    movu        m2, [r0 + 2 * r1]
216
+    movu        m3, [r0 + r5]
217
+    movu        m4, [r2]
218
+    movu        m5, [r2 + r3]
219
+    movu        m6, [r2 + 2 * r3]
220
+    movu        m7, [r2 + r6]
221
+
222
+    psubw       m0, m4
223
+    psubw       m1, m5
224
+    psubw       m2, m6
225
+    psubw       m3, m7
226
+    pmaddwd     m0, m0
227
+    pmaddwd     m1, m1
228
+    pmaddwd     m2, m2
229
+    pmaddwd     m3, m3
230
+    paddd       m8, m0
231
+    paddd       m8, m1
232
+    paddd       m8, m2
233
+    paddd       m8, m3
234
+%endmacro
235
+
236
+%macro PROCESS_SSD_SS_16x4_AVX512 0
237
+    movu           ym0, [r0]
238
+    vinserti32x8    m0, [r0 + r1],    1
239
+    movu           ym1, [r0 + 2 * r1]
240
+    vinserti32x8    m1, [r0 + r5],    1
241
+    movu           ym4, [r2]
242
+    vinserti32x8    m4, [r2 + r3],    1
243
+    movu           ym5, [r2 + 2 * r3]
244
+    vinserti32x8    m5, [r2 + r6],    1
245
+
246
+    psubw       m0, m4
247
+    psubw       m1, m5
248
+    pmaddwd     m0, m0
249
+    pmaddwd     m1, m1
250
+    paddd       m8, m0
251
+    paddd       m8, m1
252
+%endmacro
253
+
254
+%macro SSD_SS_AVX512 2
255
+INIT_ZMM avx512
256
+cglobal pixel_ssd_ss_%1x%2, 4,7,9
257
+    add         r1d, r1d
258
+    add         r3d, r3d
259
+    lea         r5, [r1 * 3]
260
+    lea         r6, [r3 * 3]
261
+    pxor        m8, m8
262
+
263
+%rep %2/4 - 1
264
+    PROCESS_SSD_SS_%1x4_AVX512
265
+    lea         r0, [r0 + 4 * r1]
266
+    lea         r2, [r2 + 4 * r3]
267
+%endrep
268
+    PROCESS_SSD_SS_%1x4_AVX512
269
+    HADDD       m8, m0
270
+    movd        eax, xm8
271
+    RET
272
+%endmacro
273
+
274
+
275
+SSD_SS_AVX512 64, 64
276
+SSD_SS_AVX512 32, 32
277
+SSD_SS_AVX512 16, 16
278
+%endif
279
+;-----------------------------------------------------------------------------
280
+; ssd_ss avx512 code end
281
+;-----------------------------------------------------------------------------
282
 %endif ; !HIGH_BIT_DEPTH
283
 
284
 %if HIGH_BIT_DEPTH == 0
285
@@ -3064,7 +3294,7 @@
286
     movd    eax, m0
287
     RET
288
 
289
-
290
+%if ARCH_X86_64 && BIT_DEPTH >= 10
291
 INIT_XMM sse2
292
 cglobal pixel_ssd_s_32, 2,3,5
293
     add     r1, r1
294
@@ -3105,7 +3335,6 @@
295
     dec     r2d
296
     jnz    .loop
297
 
298
-%if BIT_DEPTH >= 10
299
     movu            m1, m0
300
     pxor            m2, m2
301
     punpckldq       m0, m2
302
@@ -3114,13 +3343,56 @@
303
     movhlps         m1, m0
304
     paddq           m0, m1
305
     movq            rax, xm0
306
-%else
307
+    RET
308
+%endif
309
+
310
+%if BIT_DEPTH == 8
311
+INIT_XMM sse2
312
+cglobal pixel_ssd_s_32, 2,3,5
313
+    add     r1, r1
314
+
315
+    mov     r2d, 16
316
+    pxor    m0, m0
317
+.loop:
318
+    movu    m1, [r0 + 0 * mmsize]
319
+    movu    m2, [r0 + 1 * mmsize]
320
+    movu    m3, [r0 + 2 * mmsize]
321
+    movu    m4, [r0 + 3 * mmsize]
322
+    add     r0, r1
323
+
324
+    pmaddwd m1, m1
325
+    pmaddwd m2, m2
326
+    pmaddwd m3, m3
327
+    pmaddwd m4, m4
328
+    paddd   m1, m2
329
+    paddd   m3, m4
330
+    paddd   m1, m3
331
+    paddd   m0, m1
332
+
333
+    movu    m1, [r0 + 0 * mmsize]
334
+    movu    m2, [r0 + 1 * mmsize]
335
+    movu    m3, [r0 + 2 * mmsize]
336
+    movu    m4, [r0 + 3 * mmsize]
337
+    add     r0, r1
338
+
339
+    pmaddwd m1, m1
340
+    pmaddwd m2, m2
341
+    pmaddwd m3, m3
342
+    pmaddwd m4, m4
343
+    paddd   m1, m2
344
+    paddd   m3, m4
345
+    paddd   m1, m3
346
+    paddd   m0, m1
347
+
348
+    dec     r2d
349
+    jnz    .loop
350
     ; calculate sum and return
351
     HADDD   m0, m1
352
     movd    eax, m0
353
-%endif
354
     RET
355
+%endif
356
 
357
+%if ARCH_X86_64
358
 INIT_YMM avx2
359
 cglobal pixel_ssd_s_16, 2,4,5
360
     add     r1, r1
361
@@ -3207,3 +3479,227 @@
362
     movd    eax, xm0
363
 %endif
364
     RET
365
+%endif
366
+;-----------------------------------------------------------------------------
367
+; ssd_s avx512 code start
368
+;-----------------------------------------------------------------------------
369
+%macro PROCESS_SSD_S_32x8_AVX512 0
370
+    movu    m1, [r0]
371
+    movu    m2, [r0 + r1]
372
+    movu    m3, [r0 + 2 * r1]
373
+    movu    m4, [r0 + r3]
374
+
375
+    pmaddwd m1, m1
376
+    pmaddwd m2, m2
377
+    pmaddwd m3, m3
378
+    pmaddwd m4, m4
379
+    paddd   m1, m2
380
+    paddd   m3, m4
381
+    paddd   m1, m3
382
+    paddd   m0, m1
383
+
384
+    lea     r0, [r0 + 4 * r1]
385
+
386
+    movu    m1, [r0]
387
+    movu    m2, [r0 + r1]
388
+    movu    m3, [r0 + 2 * r1]
389
+    movu    m4, [r0 + r3]
390
+
391
+    pmaddwd m1, m1
392
+    pmaddwd m2, m2
393
+    pmaddwd m3, m3
394
+    pmaddwd m4, m4
395
+    paddd   m1, m2
396
+    paddd   m3, m4
397
+    paddd   m1, m3
398
+    paddd   m0, m1
399
+%endmacro
400
+
401
+%macro PROCESS_SSD_S_16x8_AVX512 0
402
+    movu             ym1,   [r0]
403
+    vinserti32x8     m1,    [r0 + r1],     1
404
+    movu             ym2,   [r0 + 2 * r1]
405
+    vinserti32x8     m2,    [r0 + r3],     1
406
+    lea              r0,    [r0 + 4 * r1]
407
+    movu             ym3,   [r0]
408
+    vinserti32x8     m3,    [r0 + r1],     1
409
+    movu             ym4,   [r0 + 2 * r1]
410
+    vinserti32x8     m4,    [r0 + r3],     1
411
+    pmaddwd m1, m1
412
+    pmaddwd m2, m2
413
+    pmaddwd m3, m3
414
+    pmaddwd m4, m4
415
+    paddd   m1, m2
416
+    paddd   m3, m4
417
+    paddd   m1, m3
418
+    paddd   m0, m1
419
+%endmacro
420
+;-----------------------------------------------------------------------------
421
+; int pixel_ssd_s( int16_t *ref, intptr_t i_stride )
422
+;-----------------------------------------------------------------------------
423
+%if ARCH_X86_64
424
+INIT_ZMM avx512
425
+cglobal pixel_ssd_s_32, 2,4,5
426
+    add     r1, r1
427
+    lea     r3, [r1 * 3]
428
+    pxor    m0, m0
429
+
430
+    PROCESS_SSD_S_32x8_AVX512
431
+    lea     r0, [r0 + 4 * r1]
432
+    PROCESS_SSD_S_32x8_AVX512
433
+    lea     r0, [r0 + 4 * r1]
434
+    PROCESS_SSD_S_32x8_AVX512
435
+    lea     r0, [r0 + 4 * r1]
436
+    PROCESS_SSD_S_32x8_AVX512
437
+
438
+    ; calculate sum and return
439
+%if BIT_DEPTH >= 10
440
+    movu            m1, m0
441
+    pxor            m2, m2
442
+    punpckldq       m0, m2
443
+    punpckhdq       m1, m2
444
+    paddq           m0, m1
445
+    vextracti32x8   ym2, m0, 1
446
+    paddq           ym0, ym2
447
+    vextracti32x4   xm2, m0, 1
448
+    paddq           xm2, xm0
449
+    movhlps         xm1, xm2
450
+    paddq           xm2, xm1
451
+    movq            rax, xm2
452
+%else
453
+    HADDD   m0, m1
454
+    movd    eax, xm0
455
+%endif
456
+    RET
457
+
458
+INIT_ZMM avx512
459
+cglobal pixel_ssd_s_16, 2,4,5
460
+    add     r1, r1
461
+    lea     r3, [r1 * 3]
462
+    pxor    m0, m0
463
+
464
+    PROCESS_SSD_S_16x8_AVX512
465
+    lea     r0, [r0 + 4 * r1]
466
+    PROCESS_SSD_S_16x8_AVX512
467
+
468
+    ; calculate sum and return
469
+    HADDD   m0, m1
470
+    movd    eax, xm0
471
+    RET
472
+%endif
473
+;-----------------------------------------------------------------------------
474
+; ssd_s avx512 code end
475
+;-----------------------------------------------------------------------------
476
+;-----------------------------------------------------------------------------
477
+;ALigned version of macro
478
+;-----------------------------------------------------------------------------
479
+%macro PROCESS_SSD_S_16x8_ALIGNED_AVX512 0
480
+    mova             ym1,   [r0]
481
+    vinserti32x8     m1,    [r0 + r1],     1
482
+    mova             ym2,   [r0 + 2 * r1]
483
+    vinserti32x8     m2,    [r0 + r3],     1
484
+    lea              r0,    [r0 + 4 * r1]
485
+    mova             ym3,   [r0]
486
+    vinserti32x8     m3,    [r0 + r1],     1
487
+    mova             ym4,   [r0 + 2 * r1]
488
+    vinserti32x8     m4,    [r0 + r3],     1
489
+    pmaddwd m1, m1
490
+    pmaddwd m2, m2
491
+    pmaddwd m3, m3
492
+    pmaddwd m4, m4
493
+    paddd   m1, m2
494
+    paddd   m3, m4
495
+    paddd   m1, m3
496
+    paddd   m0, m1
497
+%endmacro
498
+;---------------------------------------------------------------------------------
499
+;int pixel_ssd_s_aligned( int16_t *ref, intptr_t i_stride )
500
+;-----------------------------------------------------------------------------------
501
+%if ARCH_X86_64
502
+INIT_ZMM avx512
503
+
504
+INIT_ZMM avx512
505
+cglobal pixel_ssd_s_aligned_16, 2,4,5
506
+    add     r1, r1
507
+    lea     r3, [r1 * 3]
508
+    pxor    m0, m0
509
+
510
+    PROCESS_SSD_S_16x8_ALIGNED_AVX512
511
+    lea     r0, [r0 + 4 * r1]
512
+    PROCESS_SSD_S_16x8_ALIGNED_AVX512
513
+
514
+    ; calculate sum and return
515
+    HADDD   m0, m1
516
+    movd    eax, xm0
517
+    RET
518
+%endif
519
+;---------------------------------------------------------------------------------------------
520
+; aligned implementation for 32
521
+;---------------------------------------------------------------------------------------------
522
+%macro PROCESS_SSD_S_32x8_ALIGNED_AVX512 0
523
+    mova    m1, [r0]
524
+    mova    m2, [r0 + r1]
525
+    mova    m3, [r0 + 2 * r1]
526
+    mova    m4, [r0 + r3]
527
+
528
+    pmaddwd m1, m1
529
+    pmaddwd m2, m2
530
+    pmaddwd m3, m3
531
+    pmaddwd m4, m4
532
+    paddd   m1, m2
533
+    paddd   m3, m4
534
+    paddd   m1, m3
535
+    paddd   m0, m1
536
+
537
+    lea     r0, [r0 + 4 * r1]
538
+
539
+    mova    m1, [r0]
540
+    mova    m2, [r0 + r1]
541
+    mova    m3, [r0 + 2 * r1]
542
+    mova    m4, [r0 + r3]
543
+
544
+    pmaddwd m1, m1
545
+    pmaddwd m2, m2
546
+    pmaddwd m3, m3
547
+    pmaddwd m4, m4
548
+    paddd   m1, m2
549
+    paddd   m3, m4
550
+    paddd   m1, m3
551
+    paddd   m0, m1
552
+%endmacro
553
+
554
+%if ARCH_X86_64
555
+INIT_ZMM avx512
556
+cglobal pixel_ssd_s_aligned_32, 2,4,5
557
+    add     r1, r1
558
+    lea     r3, [r1 * 3]
559
+    pxor    m0, m0
560
+
561
+    PROCESS_SSD_S_32x8_AVX512
562
+    lea     r0, [r0 + 4 * r1]
563
+    PROCESS_SSD_S_32x8_ALIGNED_AVX512
564
+    lea     r0, [r0 + 4 * r1]
565
+    PROCESS_SSD_S_32x8_ALIGNED_AVX512
566
+    lea     r0, [r0 + 4 * r1]
567
+    PROCESS_SSD_S_32x8_ALIGNED_AVX512
568
+
569
+    ; calculate sum and return
570
+%if BIT_DEPTH >= 10
571
+    mova            m1, m0
572
+    pxor            m2, m2
573
+    punpckldq       m0, m2
574
+    punpckhdq       m1, m2
575
+    paddq           m0, m1
576
+    vextracti32x8   ym2, m0, 1
577
+    paddq           ym0, ym2
578
+    vextracti32x4   xm2, m0, 1
579
+    paddq           xm2, xm0
580
+    movhlps         xm1, xm2
581
+    paddq           xm2, xm1
582
+    movq            rax, xm2
583
+%else
584
+    HADDD   m0, m1
585
+    movd    eax, xm0
586
+%endif
587
+    RET
588
+%endif
589
\ No newline at end of file
590
x265_2.7.tar.gz/source/common/x86/v4-ipfilter16.asm -> x265_2.9.tar.gz/source/common/x86/v4-ipfilter16.asm Changed
17
 
1
@@ -2931,6 +2931,7 @@
2
     RET
3
 %endmacro
4
 
5
+%if ARCH_X86_64
6
 FILTER_VER_CHROMA_AVX2_4xN pp, 16, 1, 6
7
 FILTER_VER_CHROMA_AVX2_4xN ps, 16, 0, INTERP_SHIFT_PS
8
 FILTER_VER_CHROMA_AVX2_4xN sp, 16, 1, INTERP_SHIFT_SP
9
@@ -2939,6 +2940,7 @@
10
 FILTER_VER_CHROMA_AVX2_4xN ps, 32, 0, INTERP_SHIFT_PS
11
 FILTER_VER_CHROMA_AVX2_4xN sp, 32, 1, INTERP_SHIFT_SP
12
 FILTER_VER_CHROMA_AVX2_4xN ss, 32, 0, 6
13
+%endif
14
 
15
 %macro FILTER_VER_CHROMA_AVX2_8x8 3
16
 INIT_YMM avx2
17
x265_2.7.tar.gz/source/common/x86/v4-ipfilter8.asm -> x265_2.9.tar.gz/source/common/x86/v4-ipfilter8.asm Changed
359
 
1
@@ -43,7 +43,7 @@
2
 const v4_interp4_vpp_shuf1, dd 0, 1, 1, 2, 2, 3, 3, 4
3
                          dd 2, 3, 3, 4, 4, 5, 5, 6
4
 
5
-const tab_ChromaCoeff, db  0, 64,  0,  0
6
+const v4_tab_ChromaCoeff, db  0, 64,  0,  0
7
                        db -2, 58, 10, -2
8
                        db -4, 54, 16, -2
9
                        db -6, 46, 28, -4
10
@@ -1031,8 +1031,8 @@
11
     mova        m6,        [r5 + r4]
12
     mova        m5,        [r5 + r4 + 16]
13
 %else
14
-    mova        m6,        [tab_ChromaCoeff + r4]
15
-    mova        m5,        [tab_ChromaCoeff + r4 + 16]
16
+    mova        m6,        [v4_tab_ChromaCoeff + r4]
17
+    mova        m5,        [v4_tab_ChromaCoeff + r4 + 16]
18
 %endif
19
 
20
 %ifidn %1,pp
21
@@ -2114,10 +2114,10 @@
22
     sub         r0,        r1
23
 
24
 %ifdef PIC
25
-    lea         r5,        [tab_ChromaCoeff]
26
+    lea         r5,        [v4_tab_ChromaCoeff]
27
     movd        m0,        [r5 + r4 * 4]
28
 %else
29
-    movd        m0,        [tab_ChromaCoeff + r4 * 4]
30
+    movd        m0,        [v4_tab_ChromaCoeff + r4 * 4]
31
 %endif
32
     lea         r4,        [r1 * 3]
33
     lea         r5,        [r0 + 4 * r1]
34
@@ -2430,10 +2430,10 @@
35
     sub         r0,        r1
36
 
37
 %ifdef PIC
38
-    lea         r5,        [tab_ChromaCoeff]
39
+    lea         r5,        [v4_tab_ChromaCoeff]
40
     movd        m0,        [r5 + r4 * 4]
41
 %else
42
-    movd        m0,        [tab_ChromaCoeff + r4 * 4]
43
+    movd        m0,        [v4_tab_ChromaCoeff + r4 * 4]
44
 %endif
45
 
46
     pshufb      m0,        [tab_Cm]
47
@@ -2515,10 +2515,10 @@
48
     sub         r0,        r1
49
 
50
 %ifdef PIC
51
-    lea         r5,        [tab_ChromaCoeff]
52
+    lea         r5,        [v4_tab_ChromaCoeff]
53
     movd        m0,        [r5 + r4 * 4]
54
 %else
55
-    movd        m0,        [tab_ChromaCoeff + r4 * 4]
56
+    movd        m0,        [v4_tab_ChromaCoeff + r4 * 4]
57
 %endif
58
 
59
     pshufb      m0,        [tab_Cm]
60
@@ -2611,10 +2611,10 @@
61
     sub         r0,        r1
62
 
63
 %ifdef PIC
64
-    lea         r5,        [tab_ChromaCoeff]
65
+    lea         r5,        [v4_tab_ChromaCoeff]
66
     movd        m0,        [r5 + r4 * 4]
67
 %else
68
-    movd        m0,        [tab_ChromaCoeff + r4 * 4]
69
+    movd        m0,        [v4_tab_ChromaCoeff + r4 * 4]
70
 %endif
71
 
72
     pshufb      m0,        [tab_Cm]
73
@@ -2984,10 +2984,10 @@
74
     sub         r0,        r1
75
 
76
 %ifdef PIC
77
-    lea         r5,        [tab_ChromaCoeff]
78
+    lea         r5,        [v4_tab_ChromaCoeff]
79
     movd        m0,        [r5 + r4 * 4]
80
 %else
81
-    movd        m0,        [tab_ChromaCoeff + r4 * 4]
82
+    movd        m0,        [v4_tab_ChromaCoeff + r4 * 4]
83
 %endif
84
 
85
     pshufb      m0,        [tab_Cm]
86
@@ -3180,10 +3180,10 @@
87
     punpcklbw   m4,        m2,          m3
88
 
89
 %ifdef PIC
90
-    lea         r6,        [tab_ChromaCoeff]
91
+    lea         r6,        [v4_tab_ChromaCoeff]
92
     movd        m5,        [r6 + r4 * 4]
93
 %else
94
-    movd        m5,        [tab_ChromaCoeff + r4 * 4]
95
+    movd        m5,        [v4_tab_ChromaCoeff + r4 * 4]
96
 %endif
97
 
98
     pshufb      m6,        m5,       [tab_Vm]
99
@@ -3233,10 +3233,10 @@
100
     add         r3d, r3d
101
 
102
 %ifdef PIC
103
-    lea         r5, [tab_ChromaCoeff]
104
+    lea         r5, [v4_tab_ChromaCoeff]
105
     movd        m0, [r5 + r4 * 4]
106
 %else
107
-    movd        m0, [tab_ChromaCoeff + r4 * 4]
108
+    movd        m0, [v4_tab_ChromaCoeff + r4 * 4]
109
 %endif
110
 
111
     pshufb      m0, [tab_Cm]
112
@@ -3280,10 +3280,10 @@
113
     add        r3d, r3d
114
 
115
 %ifdef PIC
116
-    lea        r5, [tab_ChromaCoeff]
117
+    lea        r5, [v4_tab_ChromaCoeff]
118
     movd       m0, [r5 + r4 * 4]
119
 %else
120
-    movd       m0, [tab_ChromaCoeff + r4 * 4]
121
+    movd       m0, [v4_tab_ChromaCoeff + r4 * 4]
122
 %endif
123
 
124
     pshufb     m0, [tab_Cm]
125
@@ -3355,10 +3355,10 @@
126
     add        r3d, r3d
127
 
128
 %ifdef PIC
129
-    lea        r5, [tab_ChromaCoeff]
130
+    lea        r5, [v4_tab_ChromaCoeff]
131
     movd       m0, [r5 + r4 * 4]
132
 %else
133
-    movd       m0, [tab_ChromaCoeff + r4 * 4]
134
+    movd       m0, [v4_tab_ChromaCoeff + r4 * 4]
135
 %endif
136
 
137
     pshufb     m0, [tab_Cm]
138
@@ -3442,10 +3442,10 @@
139
     add        r3d, r3d
140
 
141
 %ifdef PIC
142
-    lea        r5, [tab_ChromaCoeff]
143
+    lea        r5, [v4_tab_ChromaCoeff]
144
     movd       m5, [r5 + r4 * 4]
145
 %else
146
-    movd       m5, [tab_ChromaCoeff + r4 * 4]
147
+    movd       m5, [v4_tab_ChromaCoeff + r4 * 4]
148
 %endif
149
 
150
     pshufb     m6, m5, [tab_Vm]
151
@@ -3513,10 +3513,10 @@
152
     add        r3d, r3d
153
 
154
 %ifdef PIC
155
-    lea        r5, [tab_ChromaCoeff]
156
+    lea        r5, [v4_tab_ChromaCoeff]
157
     movd       m5, [r5 + r4 * 4]
158
 %else
159
-    movd       m5, [tab_ChromaCoeff + r4 * 4]
160
+    movd       m5, [v4_tab_ChromaCoeff + r4 * 4]
161
 %endif
162
 
163
     pshufb     m6, m5, [tab_Vm]
164
@@ -3605,10 +3605,10 @@
165
     add        r3d, r3d
166
 
167
 %ifdef PIC
168
-    lea        r5, [tab_ChromaCoeff]
169
+    lea        r5, [v4_tab_ChromaCoeff]
170
     movd       m5, [r5 + r4 * 4]
171
 %else
172
-    movd       m5, [tab_ChromaCoeff + r4 * 4]
173
+    movd       m5, [v4_tab_ChromaCoeff + r4 * 4]
174
 %endif
175
 
176
     pshufb     m6, m5, [tab_Vm]
177
@@ -3700,10 +3700,10 @@
178
     add        r3d, r3d
179
 
180
 %ifdef PIC
181
-    lea        r5, [tab_ChromaCoeff]
182
+    lea        r5, [v4_tab_ChromaCoeff]
183
     movd       m0, [r5 + r4 * 4]
184
 %else
185
-    movd       m0, [tab_ChromaCoeff + r4 * 4]
186
+    movd       m0, [v4_tab_ChromaCoeff + r4 * 4]
187
 %endif
188
 
189
     pshufb     m1, m0, [tab_Vm]
190
@@ -3786,10 +3786,10 @@
191
     add        r3d, r3d
192
 
193
 %ifdef PIC
194
-    lea        r5, [tab_ChromaCoeff]
195
+    lea        r5, [v4_tab_ChromaCoeff]
196
     movd       m0, [r5 + r4 * 4]
197
 %else
198
-    movd       m0, [tab_ChromaCoeff + r4 * 4]
199
+    movd       m0, [v4_tab_ChromaCoeff + r4 * 4]
200
 %endif
201
 
202
     pshufb     m1, m0, [tab_Vm]
203
@@ -3877,10 +3877,10 @@
204
     add        r3d, r3d
205
 
206
 %ifdef PIC
207
-    lea        r5, [tab_ChromaCoeff]
208
+    lea        r5, [v4_tab_ChromaCoeff]
209
     movd       m0, [r5 + r4 * 4]
210
 %else
211
-    movd       m0, [tab_ChromaCoeff + r4 * 4]
212
+    movd       m0, [v4_tab_ChromaCoeff + r4 * 4]
213
 %endif
214
 
215
     pshufb     m1, m0, [tab_Vm]
216
@@ -3995,10 +3995,10 @@
217
     add        r3d, r3d
218
 
219
 %ifdef PIC
220
-    lea        r5, [tab_ChromaCoeff]
221
+    lea        r5, [v4_tab_ChromaCoeff]
222
     movd       m0, [r5 + r4 * 4]
223
 %else
224
-    movd       m0, [tab_ChromaCoeff + r4 * 4]
225
+    movd       m0, [v4_tab_ChromaCoeff + r4 * 4]
226
 %endif
227
 
228
     pshufb     m1, m0, [tab_Vm]
229
@@ -4091,10 +4091,10 @@
230
     sub         r0,        r1
231
 
232
 %ifdef PIC
233
-    lea         r5,        [tab_ChromaCoeff]
234
+    lea         r5,        [v4_tab_ChromaCoeff]
235
     movd        m5,        [r5 + r4 * 4]
236
 %else
237
-    movd        m5,        [tab_ChromaCoeff + r4 * 4]
238
+    movd        m5,        [v4_tab_ChromaCoeff + r4 * 4]
239
 %endif
240
 
241
     pshufb      m6,        m5,       [tab_Vm]
242
@@ -4942,10 +4942,10 @@
243
     sub         r0,        r1
244
 
245
 %ifdef PIC
246
-    lea         r5,        [tab_ChromaCoeff]
247
+    lea         r5,        [v4_tab_ChromaCoeff]
248
     movd        m5,        [r5 + r4 * 4]
249
 %else
250
-    movd        m5,        [tab_ChromaCoeff + r4 * 4]
251
+    movd        m5,        [v4_tab_ChromaCoeff + r4 * 4]
252
 %endif
253
 
254
     pshufb      m6,        m5,       [tab_Vm]
255
@@ -5040,10 +5040,10 @@
256
     sub         r0,        r1
257
 
258
 %ifdef PIC
259
-    lea         r5,        [tab_ChromaCoeff]
260
+    lea         r5,        [v4_tab_ChromaCoeff]
261
     movd        m0,        [r5 + r4 * 4]
262
 %else
263
-    movd        m0,        [tab_ChromaCoeff + r4 * 4]
264
+    movd        m0,        [v4_tab_ChromaCoeff + r4 * 4]
265
 %endif
266
 
267
     pshufb      m1,        m0,       [tab_Vm]
268
@@ -5130,10 +5130,10 @@
269
     sub         r0,        r1
270
 
271
 %ifdef PIC
272
-    lea         r5,        [tab_ChromaCoeff]
273
+    lea         r5,        [v4_tab_ChromaCoeff]
274
     movd        m0,        [r5 + r4 * 4]
275
 %else
276
-    movd        m0,        [tab_ChromaCoeff + r4 * 4]
277
+    movd        m0,        [v4_tab_ChromaCoeff + r4 * 4]
278
 %endif
279
 
280
     pshufb      m1,        m0,       [tab_Vm]
281
@@ -7543,10 +7543,10 @@
282
     sub         r0,        r1
283
 
284
 %ifdef PIC
285
-    lea         r5,        [tab_ChromaCoeff]
286
+    lea         r5,        [v4_tab_ChromaCoeff]
287
     movd        m0,        [r5 + r4 * 4]
288
 %else
289
-    movd        m0,        [tab_ChromaCoeff + r4 * 4]
290
+    movd        m0,        [v4_tab_ChromaCoeff + r4 * 4]
291
 %endif
292
 
293
     pshufb      m1,        m0,       [tab_Vm]
294
@@ -7666,10 +7666,10 @@
295
     sub         r0,        r1
296
 
297
 %ifdef PIC
298
-    lea         r5,        [tab_ChromaCoeff]
299
+    lea         r5,        [v4_tab_ChromaCoeff]
300
     movd        m0,        [r5 + r4 * 4]
301
 %else
302
-    movd        m0,        [tab_ChromaCoeff + r4 * 4]
303
+    movd        m0,        [v4_tab_ChromaCoeff + r4 * 4]
304
 %endif
305
 
306
     pshufb      m1,        m0,       [tab_Vm]
307
@@ -8267,10 +8267,10 @@
308
     sub         r0,        r1
309
 
310
 %ifdef PIC
311
-    lea         r5,        [tab_ChromaCoeff]
312
+    lea         r5,        [v4_tab_ChromaCoeff]
313
     movd        m0,        [r5 + r4 * 4]
314
 %else
315
-    movd        m0,        [tab_ChromaCoeff + r4 * 4]
316
+    movd        m0,        [v4_tab_ChromaCoeff + r4 * 4]
317
 %endif
318
 
319
     pshufb      m1,        m0,       [tab_Vm]
320
@@ -8808,10 +8808,10 @@
321
     add        r3d, r3d
322
 
323
 %ifdef PIC
324
-    lea        r5, [tab_ChromaCoeff]
325
+    lea        r5, [v4_tab_ChromaCoeff]
326
     movd       m0, [r5 + r4 * 4]
327
 %else
328
-    movd       m0, [tab_ChromaCoeff + r4 * 4]
329
+    movd       m0, [v4_tab_ChromaCoeff + r4 * 4]
330
 %endif
331
 
332
     pshufb     m1, m0, [tab_Vm]
333
@@ -8907,10 +8907,10 @@
334
     add         r3d, r3d
335
 
336
 %ifdef PIC
337
-    lea         r5, [tab_ChromaCoeff]
338
+    lea         r5, [v4_tab_ChromaCoeff]
339
     movd        m0, [r5 + r4 * 4]
340
 %else
341
-    movd        m0, [tab_ChromaCoeff + r4 * 4]
342
+    movd        m0, [v4_tab_ChromaCoeff + r4 * 4]
343
 %endif
344
 
345
     pshufb      m0, [tab_Cm]
346
@@ -8981,10 +8981,10 @@
347
     add        r3d, r3d
348
 
349
 %ifdef PIC
350
-    lea        r5, [tab_ChromaCoeff]
351
+    lea        r5, [v4_tab_ChromaCoeff]
352
     movd       m0, [r5 + r4 * 4]
353
 %else
354
-    movd       m0, [tab_ChromaCoeff + r4 * 4]
355
+    movd       m0, [v4_tab_ChromaCoeff + r4 * 4]
356
 %endif
357
 
358
     pshufb     m0, [tab_Cm]
359
x265_2.7.tar.gz/source/common/x86/x86inc.asm -> x265_2.9.tar.gz/source/common/x86/x86inc.asm Changed
502
 
1
@@ -82,7 +82,13 @@
2
 %endif
3
 
4
 %macro SECTION_RODATA 0-1 32
5
-    SECTION .rodata align=%1
6
+    %ifidn __OUTPUT_FORMAT__,win32
7
+        SECTION .rdata align=%1
8
+    %elif WIN64
9
+        SECTION .rdata align=%1
10
+    %else
11
+        SECTION .rodata align=%1
12
+    %endif
13
 %endmacro
14
 
15
 %if WIN64
16
@@ -325,6 +331,8 @@
17
 %endmacro
18
 
19
 %define required_stack_alignment ((mmsize + 15) & ~15)
20
+%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
21
+%define high_mm_regs (16*cpuflag(avx512))
22
 
23
 %macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
24
     %ifnum %1
25
@@ -438,15 +446,16 @@
26
 
27
 %macro WIN64_PUSH_XMM 0
28
     ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
29
-    %if xmm_regs_used > 6
30
+    %if xmm_regs_used > 6 + high_mm_regs
31
         movaps [rstk + stack_offset +  8], xmm6
32
     %endif
33
-    %if xmm_regs_used > 7
34
+    %if xmm_regs_used > 7 + high_mm_regs
35
         movaps [rstk + stack_offset + 24], xmm7
36
     %endif
37
-    %if xmm_regs_used > 8
38
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
39
+    %if %%xmm_regs_on_stack > 0
40
         %assign %%i 8
41
-        %rep xmm_regs_used-8
42
+        %rep %%xmm_regs_on_stack
43
             movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
44
             %assign %%i %%i+1
45
         %endrep
46
@@ -455,8 +464,9 @@
47
 
48
 %macro WIN64_SPILL_XMM 1
49
     %assign xmm_regs_used %1
50
-    ASSERT xmm_regs_used <= 16
51
-    %if xmm_regs_used > 8
52
+    ASSERT xmm_regs_used <= 16 + high_mm_regs
53
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
54
+    %if %%xmm_regs_on_stack > 0
55
         ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
56
         %assign %%pad (xmm_regs_used-8)*16 + 32
57
         %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
58
@@ -467,9 +477,10 @@
59
 
60
 %macro WIN64_RESTORE_XMM_INTERNAL 0
61
     %assign %%pad_size 0
62
-    %if xmm_regs_used > 8
63
-        %assign %%i xmm_regs_used
64
-        %rep xmm_regs_used-8
65
+    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
66
+    %if %%xmm_regs_on_stack > 0
67
+        %assign %%i xmm_regs_used - high_mm_regs
68
+        %rep %%xmm_regs_on_stack
69
             %assign %%i %%i-1
70
             movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
71
         %endrep
72
@@ -482,10 +493,10 @@
73
             %assign %%pad_size stack_size_padded
74
         %endif
75
     %endif
76
-    %if xmm_regs_used > 7
77
+    %if xmm_regs_used > 7 + high_mm_regs
78
         movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
79
     %endif
80
-    %if xmm_regs_used > 6
81
+    %if xmm_regs_used > 6 + high_mm_regs
82
         movaps xmm6, [rsp + stack_offset - %%pad_size +  8]
83
     %endif
84
 %endmacro
85
@@ -497,12 +508,12 @@
86
     %assign xmm_regs_used 0
87
 %endmacro
88
 
89
-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
90
+%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6 + high_mm_regs
91
 
92
 %macro RET 0
93
     WIN64_RESTORE_XMM_INTERNAL
94
     POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
95
-    %if mmsize == 32
96
+    %if vzeroupper_required
97
         vzeroupper
98
     %endif
99
     AUTO_REP_RET
100
@@ -526,9 +537,10 @@
101
 DECLARE_REG 13, R12, 64
102
 DECLARE_REG 14, R13, 72
103
 
104
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
105
+%macro PROLOGUE 2-5+ 0; #args, #regs, #xmm_regs, [stack_size,] arg_names...
106
     %assign num_args %1
107
     %assign regs_used %2
108
+    %assign xmm_regs_used %3
109
     ASSERT regs_used >= num_args
110
     SETUP_STACK_POINTER %4
111
     ASSERT regs_used <= 15
112
@@ -538,7 +550,7 @@
113
     DEFINE_ARGS_INTERNAL %0, %4, %5
114
 %endmacro
115
 
116
-%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
117
+%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
118
 
119
 %macro RET 0
120
     %if stack_size_padded > 0
121
@@ -549,7 +561,7 @@
122
         %endif
123
     %endif
124
     POP_IF_USED 14, 13, 12, 11, 10, 9
125
-    %if mmsize == 32
126
+    %if vzeroupper_required
127
         vzeroupper
128
     %endif
129
     AUTO_REP_RET
130
@@ -594,7 +606,7 @@
131
     DEFINE_ARGS_INTERNAL %0, %4, %5
132
 %endmacro
133
 
134
-%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
135
+%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
136
 
137
 %macro RET 0
138
     %if stack_size_padded > 0
139
@@ -605,7 +617,7 @@
140
         %endif
141
     %endif
142
     POP_IF_USED 6, 5, 4, 3
143
-    %if mmsize == 32
144
+    %if vzeroupper_required
145
         vzeroupper
146
     %endif
147
     AUTO_REP_RET
148
@@ -710,12 +722,22 @@
149
     %assign stack_offset 0      ; stack pointer offset relative to the return address
150
     %assign stack_size 0        ; amount of stack space that can be freely used inside a function
151
     %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
152
-    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
153
+    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
154
     %ifnidn %3, ""
155
         PROLOGUE %3
156
     %endif
157
 %endmacro
158
 
159
+; Create a global symbol from a local label with the correct name mangling and type
160
+%macro cglobal_label 1
161
+    %if FORMAT_ELF
162
+        global current_function %+ %1:function hidden
163
+    %else
164
+        global current_function %+ %1
165
+    %endif
166
+    %1:
167
+%endmacro
168
+
169
 %macro cextern 1
170
     %xdefine %1 mangle(private_prefix %+ _ %+ %1)
171
     CAT_XDEFINE cglobaled_, %1, 1
172
@@ -768,10 +790,10 @@
173
 %assign cpuflags_bmi1     (1<<16)| cpuflags_avx | cpuflags_lzcnt
174
 %assign cpuflags_bmi2     (1<<17)| cpuflags_bmi1
175
 %assign cpuflags_avx2     (1<<18)| cpuflags_fma3 | cpuflags_bmi2
176
+%assign cpuflags_avx512   (1<<19)| cpuflags_avx2 ; F, CD, BW, DQ, VL
177
 
178
-%assign cpuflags_cache32  (1<<19)
179
-%assign cpuflags_cache64  (1<<20)
180
-%assign cpuflags_slowctz  (1<<21)
181
+%assign cpuflags_cache32  (1<<20)
182
+%assign cpuflags_cache64  (1<<21)
183
 %assign cpuflags_aligned  (1<<22) ; not a cpu feature, but a function variant
184
 %assign cpuflags_atom     (1<<23)
185
 
186
@@ -829,11 +851,12 @@
187
     %endif
188
 %endmacro
189
 
190
-; Merge mmx and sse*
191
+; Merge mmx and sse*, and avx*
192
 ; m# is a simd register of the currently selected size
193
 ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
194
 ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
195
-; (All 3 remain in sync through SWAP.)
196
+; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
197
+; (All 4 remain in sync through SWAP.)
198
 
199
 %macro CAT_XDEFINE 3
200
     %xdefine %1%2 %3
201
@@ -843,69 +866,100 @@
202
     %undef %1%2
203
 %endmacro
204
 
205
+%macro DEFINE_MMREGS 1 ; mmtype
206
+    %assign %%prev_mmregs 0
207
+    %ifdef num_mmregs
208
+        %assign %%prev_mmregs num_mmregs
209
+    %endif
210
+
211
+    %assign num_mmregs 8
212
+    %if ARCH_X86_64 && mmsize >= 16
213
+        %assign num_mmregs 16
214
+        %if cpuflag(avx512) || mmsize == 64
215
+            %assign num_mmregs 32
216
+        %endif
217
+    %endif
218
+
219
+    %assign %%i 0
220
+    %rep num_mmregs
221
+        CAT_XDEFINE m, %%i, %1 %+ %%i
222
+        CAT_XDEFINE nn%1, %%i, %%i
223
+        %assign %%i %%i+1
224
+    %endrep
225
+    %if %%prev_mmregs > num_mmregs
226
+        %rep %%prev_mmregs - num_mmregs
227
+            CAT_UNDEF m, %%i
228
+            CAT_UNDEF nn %+ mmtype, %%i
229
+            %assign %%i %%i+1
230
+        %endrep
231
+    %endif
232
+    %xdefine mmtype %1
233
+%endmacro
234
+
235
+; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
236
+%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
237
+    %if ARCH_X86_64 && cpuflag(avx512)
238
+        %assign %%i %1
239
+        %rep 16-%1
240
+            %assign %%i_high %%i+16
241
+            SWAP %%i, %%i_high
242
+            %assign %%i %%i+1
243
+        %endrep
244
+    %endif
245
+%endmacro
246
+
247
 %macro INIT_MMX 0-1+
248
     %assign avx_enabled 0
249
     %define RESET_MM_PERMUTATION INIT_MMX %1
250
     %define mmsize 8
251
-    %define num_mmregs 8
252
     %define mova movq
253
     %define movu movq
254
     %define movh movd
255
     %define movnta movntq
256
-    %assign %%i 0
257
-    %rep 8
258
-        CAT_XDEFINE m, %%i, mm %+ %%i
259
-        CAT_XDEFINE nnmm, %%i, %%i
260
-        %assign %%i %%i+1
261
-    %endrep
262
-    %rep 8
263
-        CAT_UNDEF m, %%i
264
-        CAT_UNDEF nnmm, %%i
265
-        %assign %%i %%i+1
266
-    %endrep
267
     INIT_CPUFLAGS %1
268
+    DEFINE_MMREGS mm
269
 %endmacro
270
 
271
 %macro INIT_XMM 0-1+
272
     %assign avx_enabled 0
273
     %define RESET_MM_PERMUTATION INIT_XMM %1
274
     %define mmsize 16
275
-    %define num_mmregs 8
276
-    %if ARCH_X86_64
277
-        %define num_mmregs 16
278
-    %endif
279
     %define mova movdqa
280
     %define movu movdqu
281
     %define movh movq
282
     %define movnta movntdq
283
-    %assign %%i 0
284
-    %rep num_mmregs
285
-        CAT_XDEFINE m, %%i, xmm %+ %%i
286
-        CAT_XDEFINE nnxmm, %%i, %%i
287
-        %assign %%i %%i+1
288
-    %endrep
289
     INIT_CPUFLAGS %1
290
+    DEFINE_MMREGS xmm
291
+    %if WIN64
292
+        ; Swap callee-saved registers with volatile registers
293
+        AVX512_MM_PERMUTATION 6
294
+    %endif
295
 %endmacro
296
 
297
 %macro INIT_YMM 0-1+
298
     %assign avx_enabled 1
299
     %define RESET_MM_PERMUTATION INIT_YMM %1
300
     %define mmsize 32
301
-    %define num_mmregs 8
302
-    %if ARCH_X86_64
303
-        %define num_mmregs 16
304
-    %endif
305
     %define mova movdqa
306
     %define movu movdqu
307
     %undef movh
308
     %define movnta movntdq
309
-    %assign %%i 0
310
-    %rep num_mmregs
311
-        CAT_XDEFINE m, %%i, ymm %+ %%i
312
-        CAT_XDEFINE nnymm, %%i, %%i
313
-        %assign %%i %%i+1
314
-    %endrep
315
     INIT_CPUFLAGS %1
316
+    DEFINE_MMREGS ymm
317
+    AVX512_MM_PERMUTATION
318
+%endmacro
319
+
320
+%macro INIT_ZMM 0-1+
321
+    %assign avx_enabled 1
322
+    %define RESET_MM_PERMUTATION INIT_ZMM %1
323
+    %define mmsize 64
324
+    %define mova movdqa
325
+    %define movu movdqu
326
+    %undef movh
327
+    %define movnta movntdq
328
+    INIT_CPUFLAGS %1
329
+    DEFINE_MMREGS zmm
330
+    AVX512_MM_PERMUTATION
331
 %endmacro
332
 
333
 INIT_XMM
334
@@ -914,18 +968,26 @@
335
     %define  mmmm%1   mm%1
336
     %define  mmxmm%1  mm%1
337
     %define  mmymm%1  mm%1
338
+    %define  mmzmm%1  mm%1
339
     %define xmmmm%1   mm%1
340
     %define xmmxmm%1 xmm%1
341
     %define xmmymm%1 xmm%1
342
+    %define xmmzmm%1 xmm%1
343
     %define ymmmm%1   mm%1
344
     %define ymmxmm%1 xmm%1
345
     %define ymmymm%1 ymm%1
346
+    %define ymmzmm%1 ymm%1
347
+    %define zmmmm%1   mm%1
348
+    %define zmmxmm%1 xmm%1
349
+    %define zmmymm%1 ymm%1
350
+    %define zmmzmm%1 zmm%1
351
     %define xm%1 xmm %+ m%1
352
     %define ym%1 ymm %+ m%1
353
+    %define zm%1 zmm %+ m%1
354
 %endmacro
355
 
356
 %assign i 0
357
-%rep 16
358
+%rep 32
359
     DECLARE_MMCAST i
360
     %assign i i+1
361
 %endrep
362
@@ -1060,12 +1122,17 @@
363
 ;=============================================================================
364
 
365
 %assign i 0
366
-%rep 16
367
+%rep 32
368
     %if i < 8
369
         CAT_XDEFINE sizeofmm, i, 8
370
+        CAT_XDEFINE regnumofmm, i, i
371
     %endif
372
     CAT_XDEFINE sizeofxmm, i, 16
373
     CAT_XDEFINE sizeofymm, i, 32
374
+    CAT_XDEFINE sizeofzmm, i, 64
375
+    CAT_XDEFINE regnumofxmm, i, i
376
+    CAT_XDEFINE regnumofymm, i, i
377
+    CAT_XDEFINE regnumofzmm, i, i
378
     %assign i i+1
379
 %endrep
380
 %undef i
381
@@ -1182,7 +1249,7 @@
382
     %endmacro
383
 %endmacro
384
 
385
-; Instructions with both VEX and non-VEX encodings
386
+; Instructions with both VEX/EVEX and legacy encodings
387
 ; Non-destructive instructions are written without parameters
388
 AVX_INSTR addpd, sse2, 1, 0, 1
389
 AVX_INSTR addps, sse, 1, 0, 1
390
@@ -1190,12 +1257,12 @@
391
 AVX_INSTR addss, sse, 1, 0, 0
392
 AVX_INSTR addsubpd, sse3, 1, 0, 0
393
 AVX_INSTR addsubps, sse3, 1, 0, 0
394
-AVX_INSTR aesdec, fnord, 0, 0, 0
395
-AVX_INSTR aesdeclast, fnord, 0, 0, 0
396
-AVX_INSTR aesenc, fnord, 0, 0, 0
397
-AVX_INSTR aesenclast, fnord, 0, 0, 0
398
-AVX_INSTR aesimc
399
-AVX_INSTR aeskeygenassist
400
+AVX_INSTR aesdec, aesni, 0, 0, 0
401
+AVX_INSTR aesdeclast, aesni, 0, 0, 0
402
+AVX_INSTR aesenc, aesni, 0, 0, 0
403
+AVX_INSTR aesenclast, aesni, 0, 0, 0
404
+AVX_INSTR aesimc, aesni
405
+AVX_INSTR aeskeygenassist, aesni
406
 AVX_INSTR andnpd, sse2, 1, 0, 0
407
 AVX_INSTR andnps, sse, 1, 0, 0
408
 AVX_INSTR andpd, sse2, 1, 0, 1
409
@@ -1204,10 +1271,42 @@
410
 AVX_INSTR blendps, sse4, 1, 1, 0
411
 AVX_INSTR blendvpd, sse4 ; can't be emulated
412
 AVX_INSTR blendvps, sse4 ; can't be emulated
413
+AVX_INSTR cmpeqpd, sse2, 1, 0, 1
414
+AVX_INSTR cmpeqps, sse, 1, 0, 1
415
+AVX_INSTR cmpeqsd, sse2, 1, 0, 0
416
+AVX_INSTR cmpeqss, sse, 1, 0, 0
417
+AVX_INSTR cmplepd, sse2, 1, 0, 0
418
+AVX_INSTR cmpleps, sse, 1, 0, 0
419
+AVX_INSTR cmplesd, sse2, 1, 0, 0
420
+AVX_INSTR cmpless, sse, 1, 0, 0
421
+AVX_INSTR cmpltpd, sse2, 1, 0, 0
422
+AVX_INSTR cmpltps, sse, 1, 0, 0
423
+AVX_INSTR cmpltsd, sse2, 1, 0, 0
424
+AVX_INSTR cmpltss, sse, 1, 0, 0
425
+AVX_INSTR cmpneqpd, sse2, 1, 0, 1
426
+AVX_INSTR cmpneqps, sse, 1, 0, 1
427
+AVX_INSTR cmpneqsd, sse2, 1, 0, 0
428
+AVX_INSTR cmpneqss, sse, 1, 0, 0
429
+AVX_INSTR cmpnlepd, sse2, 1, 0, 0
430
+AVX_INSTR cmpnleps, sse, 1, 0, 0
431
+AVX_INSTR cmpnlesd, sse2, 1, 0, 0
432
+AVX_INSTR cmpnless, sse, 1, 0, 0
433
+AVX_INSTR cmpnltpd, sse2, 1, 0, 0
434
+AVX_INSTR cmpnltps, sse, 1, 0, 0
435
+AVX_INSTR cmpnltsd, sse2, 1, 0, 0
436
+AVX_INSTR cmpnltss, sse, 1, 0, 0
437
+AVX_INSTR cmpordpd, sse2 1, 0, 1
438
+AVX_INSTR cmpordps, sse 1, 0, 1
439
+AVX_INSTR cmpordsd, sse2 1, 0, 0
440
+AVX_INSTR cmpordss, sse 1, 0, 0
441
 AVX_INSTR cmppd, sse2, 1, 1, 0
442
 AVX_INSTR cmpps, sse, 1, 1, 0
443
 AVX_INSTR cmpsd, sse2, 1, 1, 0
444
 AVX_INSTR cmpss, sse, 1, 1, 0
445
+AVX_INSTR cmpunordpd, sse2, 1, 0, 1
446
+AVX_INSTR cmpunordps, sse, 1, 0, 1
447
+AVX_INSTR cmpunordsd, sse2, 1, 0, 0
448
+AVX_INSTR cmpunordss, sse, 1, 0, 0
449
 AVX_INSTR comisd, sse2
450
 AVX_INSTR comiss, sse
451
 AVX_INSTR cvtdq2pd, sse2
452
@@ -1513,3 +1612,49 @@
453
 FMA4_INSTR fmsubadd, pd, ps
454
 FMA4_INSTR fnmadd,   pd, ps, sd, ss
455
 FMA4_INSTR fnmsub,   pd, ps, sd, ss
456
+
457
+; Macros for converting VEX instructions to equivalent EVEX ones.
458
+%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
459
+    %macro %1 2-7 fnord, fnord, %1, %2, %3
460
+        %ifidn %3, fnord
461
+            %define %%args %1, %2
462
+        %elifidn %4, fnord
463
+            %define %%args %1, %2, %3
464
+        %else
465
+            %define %%args %1, %2, %3, %4
466
+        %endif
467
+        %assign %%evex_required cpuflag(avx512) & %7
468
+        %ifnum regnumof%1
469
+            %if regnumof%1 >= 16 || sizeof%1 > 32
470
+                %assign %%evex_required 1
471
+            %endif
472
+        %endif
473
+        %ifnum regnumof%2
474
+            %if regnumof%2 >= 16 || sizeof%2 > 32
475
+                %assign %%evex_required 1
476
+            %endif
477
+        %endif
478
+        %if %%evex_required
479
+            %6 %%args
480
+        %else
481
+            %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
482
+        %endif
483
+    %endmacro
484
+%endmacro
485
+
486
+EVEX_INSTR vbroadcastf128, vbroadcastf32x4
487
+EVEX_INSTR vbroadcasti128, vbroadcasti32x4
488
+EVEX_INSTR vextractf128,   vextractf32x4
489
+EVEX_INSTR vextracti128,   vextracti32x4
490
+EVEX_INSTR vinsertf128,    vinsertf32x4
491
+EVEX_INSTR vinserti128,    vinserti32x4
492
+EVEX_INSTR vmovdqa,        vmovdqa32
493
+EVEX_INSTR vmovdqu,        vmovdqu32
494
+EVEX_INSTR vpand,          vpandd
495
+EVEX_INSTR vpandn,         vpandnd
496
+EVEX_INSTR vpor,           vpord
497
+EVEX_INSTR vpxor,          vpxord
498
+EVEX_INSTR vrcpps,         vrcp14ps,   1 ; EVEX versions have higher precision
499
+EVEX_INSTR vrcpss,         vrcp14ss,   1
500
+EVEX_INSTR vrsqrtps,       vrsqrt14ps, 1
501
+EVEX_INSTR vrsqrtss,       vrsqrt14ss, 1
502
x265_2.7.tar.gz/source/common/x86/x86util.asm -> x265_2.9.tar.gz/source/common/x86/x86util.asm Changed
101
 
1
@@ -299,32 +299,44 @@
2
     pminsw %2, %4
3
 %endmacro
4
 
5
+%macro MOVHL 2 ; dst, src
6
+%ifidn %1, %2
7
+    punpckhqdq %1, %2
8
+%elif cpuflag(avx)
9
+    punpckhqdq %1, %2, %2
10
+%elif cpuflag(sse4)
11
+    pshufd     %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones
12
+%else
13
+    movhlps    %1, %2        ; may cause an int/float domain transition and has a dependency on dst
14
+%endif
15
+%endmacro
16
+
17
 %macro HADDD 2 ; sum junk
18
-%if sizeof%1 == 32
19
-%define %2 xmm%2
20
-    vextracti128 %2, %1, 1
21
-%define %1 xmm%1
22
-    paddd   %1, %2
23
+%if sizeof%1 >= 64
24
+    vextracti32x8 ymm%2, zmm%1, 1
25
+    paddd         ymm%1, ymm%2
26
 %endif
27
-%if mmsize >= 16
28
-%if cpuflag(xop) && sizeof%1 == 16
29
-    vphadddq %1, %1
30
+%if sizeof%1 >= 32
31
+    vextracti128  xmm%2, ymm%1, 1
32
+    paddd         xmm%1, xmm%2
33
+%endif
34
+%if sizeof%1 >= 16
35
+    MOVHL         xmm%2, xmm%1
36
+    paddd         xmm%1, xmm%2
37
 %endif
38
-    movhlps %2, %1
39
-    paddd   %1, %2
40
+%if cpuflag(xop) && sizeof%1 == 16
41
+    vphadddq xmm%1, xmm%1
42
 %endif
43
 %if notcpuflag(xop)
44
-    PSHUFLW %2, %1, q0032
45
-    paddd   %1, %2
46
+    PSHUFLW xmm%2, xmm%1, q1032
47
+    paddd   xmm%1, xmm%2
48
 %endif
49
-%undef %1
50
-%undef %2
51
 %endmacro
52
 
53
 %macro HADDW 2 ; reg, tmp
54
 %if cpuflag(xop) && sizeof%1 == 16
55
     vphaddwq  %1, %1
56
-    movhlps   %2, %1
57
+    MOVHL     %2, %1
58
     paddd     %1, %2
59
 %else
60
     pmaddwd %1, [pw_1]
61
@@ -346,7 +358,7 @@
62
 %macro HADDUW 2
63
 %if cpuflag(xop) && sizeof%1 == 16
64
     vphadduwq %1, %1
65
-    movhlps   %2, %1
66
+    MOVHL     %2, %1
67
     paddd     %1, %2
68
 %else
69
     HADDUWD   %1, %2
70
@@ -739,25 +751,25 @@
71
 %if %6 ; %5 aligned?
72
     mova       %1, %4
73
     psubw      %1, %5
74
+%elif cpuflag(avx)
75
+    movu       %1, %4
76
+    psubw      %1, %5
77
 %else
78
     movu       %1, %4
79
     movu       %2, %5
80
     psubw      %1, %2
81
 %endif
82
 %else ; !HIGH_BIT_DEPTH
83
-%ifidn %3, none
84
     movh       %1, %4
85
     movh       %2, %5
86
+%ifidn %3, none
87
     punpcklbw  %1, %2
88
     punpcklbw  %2, %2
89
-    psubw      %1, %2
90
 %else
91
-    movh       %1, %4
92
     punpcklbw  %1, %3
93
-    movh       %2, %5
94
     punpcklbw  %2, %3
95
-    psubw      %1, %2
96
 %endif
97
+    psubw      %1, %2
98
 %endif ; HIGH_BIT_DEPTH
99
 %endmacro
100
 
101
x265_2.7.tar.gz/source/common/yuv.cpp -> x265_2.9.tar.gz/source/common/yuv.cpp Changed
39
 
1
@@ -170,11 +170,14 @@
2
 
3
 void Yuv::addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL, int picCsp)
4
 {
5
-    primitives.cu[log2SizeL - 2].add_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
6
+    primitives.cu[log2SizeL - 2].add_ps[(m_size % 64 == 0) && (srcYuv0.m_size % 64 == 0) && (srcYuv1.m_size % 64 == 0)](m_buf[0],
7
+                                         m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
8
     if (m_csp != X265_CSP_I400 && picCsp != X265_CSP_I400)
9
     {
10
-        primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
11
-        primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
12
+        primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps[(m_csize % 64 == 0) && (srcYuv0.m_csize % 64 ==0) && (srcYuv1.m_csize % 64 == 0)](m_buf[1],
13
+                                                           m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
14
+        primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps[(m_csize % 64 == 0) && (srcYuv0.m_csize % 64 == 0) && (srcYuv1.m_csize % 64 == 0)](m_buf[2],
15
+                                                           m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
16
     }
17
     if (picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
18
     {
19
@@ -192,7 +195,7 @@
20
         const int16_t* srcY0 = srcYuv0.getLumaAddr(absPartIdx);
21
         const int16_t* srcY1 = srcYuv1.getLumaAddr(absPartIdx);
22
         pixel* dstY = getLumaAddr(absPartIdx);
23
-        primitives.pu[part].addAvg(srcY0, srcY1, dstY, srcYuv0.m_size, srcYuv1.m_size, m_size);
24
+        primitives.pu[part].addAvg[(srcYuv0.m_size % 64 == 0) && (srcYuv1.m_size % 64 == 0) && (m_size % 64 == 0)](srcY0, srcY1, dstY, srcYuv0.m_size, srcYuv1.m_size, m_size);
25
     }
26
     if (bChroma)
27
     {
28
@@ -202,8 +205,8 @@
29
         const int16_t* srcV1 = srcYuv1.getCrAddr(absPartIdx);
30
         pixel* dstU = getCbAddr(absPartIdx);
31
         pixel* dstV = getCrAddr(absPartIdx);
32
-        primitives.chroma[m_csp].pu[part].addAvg(srcU0, srcU1, dstU, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
33
-        primitives.chroma[m_csp].pu[part].addAvg(srcV0, srcV1, dstV, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
34
+        primitives.chroma[m_csp].pu[part].addAvg[(srcYuv0.m_csize % 64 == 0) && (srcYuv1.m_csize % 64 == 0) && (m_csize % 64 == 0)](srcU0, srcU1, dstU, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
35
+        primitives.chroma[m_csp].pu[part].addAvg[(srcYuv0.m_csize % 64 == 0) && (srcYuv1.m_csize % 64 == 0) && (m_csize % 64 == 0)](srcV0, srcV1, dstV, srcYuv0.m_csize, srcYuv1.m_csize, m_csize);
36
     }
37
 }
38
 
39
x265_2.7.tar.gz/source/common/yuv.h -> x265_2.9.tar.gz/source/common/yuv.h Changed
9
 
1
@@ -38,7 +38,6 @@
2
 class Yuv
3
 {
4
 public:
5
-
6
     pixel*   m_buf[3];
7
 
8
     uint32_t m_size;
9
x265_2.7.tar.gz/source/dynamicHDR10/SeiMetadataDictionary.cpp -> x265_2.9.tar.gz/source/dynamicHDR10/SeiMetadataDictionary.cpp Changed
28
 
1
@@ -34,6 +34,7 @@
2
 const std::string BezierCurveNames::NumberOfAnchors = std::string("NumberOfAnchors");
3
 const std::string BezierCurveNames::KneePointX = std::string("KneePointX");
4
 const std::string BezierCurveNames::KneePointY = std::string("KneePointY");
5
+const std::string BezierCurveNames::AnchorsTag = std::string("Anchors");
6
 const std::string BezierCurveNames::Anchors[] = {std::string("Anchor0"),
7
                                                  std::string("Anchor1"),
8
                                                  std::string("Anchor2"),
9
@@ -69,6 +70,8 @@
10
 
11
 const std::string PercentileNames::TagName = std::string("PercentileLuminance");
12
 const std::string PercentileNames::NumberOfPercentiles = std::string("NumberOfPercentiles");
13
+const std::string PercentileNames::DistributionIndex = std::string("DistributionIndex");
14
+const std::string PercentileNames::DistributionValues = std::string("DistributionValues");
15
 const std::string PercentileNames::PercentilePercentageValue[] = {std::string("PercentilePercentage0"),
16
                                                                   std::string("PercentilePercentage1"),
17
                                                                   std::string("PercentilePercentage2"),
18
@@ -104,7 +107,9 @@
19
 
20
 
21
 const std::string LuminanceNames::TagName = std::string("LuminanceParameters");
22
+const std::string LuminanceNames::LlcTagName = std::string("LuminanceDistributions");
23
 const std::string LuminanceNames::AverageRGB = std::string("AverageRGB");
24
+const std::string LuminanceNames::MaxSCL = std::string("MaxScl");
25
 const std::string LuminanceNames::MaxSCL0 = std::string("MaxScl0");
26
 const std::string LuminanceNames::MaxSCL1 = std::string("MaxScl1");
27
 const std::string LuminanceNames::MaxSCL2 = std::string("MaxScl2");
28
x265_2.7.tar.gz/source/dynamicHDR10/SeiMetadataDictionary.h -> x265_2.9.tar.gz/source/dynamicHDR10/SeiMetadataDictionary.h Changed
28
 
1
@@ -48,6 +48,7 @@
2
         static const std::string NumberOfAnchors;
3
         static const std::string KneePointX;
4
         static const std::string KneePointY;
5
+        static const std::string AnchorsTag;
6
         static const std::string Anchors[14];
7
     };
8
     //Ellipse Selection Data
9
@@ -79,6 +80,8 @@
10
         public:
11
         static const std::string TagName;
12
         static const std::string NumberOfPercentiles;
13
+        static const std::string DistributionIndex;
14
+        static const std::string DistributionValues;
15
         static const std::string PercentilePercentageValue[15];
16
         static const std::string PercentileLuminanceValue[15];
17
     };
18
@@ -87,7 +90,9 @@
19
     {
20
         public:
21
         static const std::string TagName;
22
+        static const std::string LlcTagName;
23
         static const std::string AverageRGB;
24
+        static const std::string MaxSCL;
25
         static const std::string MaxSCL0;
26
         static const std::string MaxSCL1;
27
         static const std::string MaxSCL2;
28
x265_2.7.tar.gz/source/dynamicHDR10/metadataFromJson.cpp -> x265_2.9.tar.gz/source/dynamicHDR10/metadataFromJson.cpp Changed
534
 
1
@@ -46,89 +46,133 @@
2
     int mCurrentStreamBit;
3
     int mCurrentStreamByte;
4
 
5
-    bool luminanceParamFromJson(const Json &data, LuminanceParameters &obj)
6
+    bool luminanceParamFromJson(const Json &data, LuminanceParameters &obj, const JsonType jsonType)
7
     {
8
         JsonObject lumJsonData = data.object_items();
9
         if(!lumJsonData.empty())
10
         {
11
-            JsonObject percentileData = lumJsonData[PercentileNames::TagName].object_items();
12
-            obj.order = percentileData[PercentileNames::NumberOfPercentiles].int_value();
13
-
14
-            obj.averageLuminance = static_cast<float>(lumJsonData[LuminanceNames::AverageRGB].number_value());
15
-            obj.maxRLuminance = static_cast<float>(lumJsonData[LuminanceNames::MaxSCL0].number_value());
16
-            obj.maxGLuminance = static_cast<float>(lumJsonData[LuminanceNames::MaxSCL1].number_value());
17
-            obj.maxBLuminance = static_cast<float>(lumJsonData[LuminanceNames::MaxSCL2].number_value());
18
-
19
-            if(!percentileData.empty())
20
-            {
21
-                obj.percentiles.resize(obj.order);
22
-                for(int i = 0; i < obj.order; ++i)
23
-                {
24
-                    std::string percentileTag = PercentileNames::TagName;
25
-                    percentileTag += std::to_string(i);
26
-                    obj.percentiles[i] = static_cast<unsigned int>(percentileData[percentileTag].int_value());
27
-                }
28
-            }
29
-
30
-            return true;
31
-        }
32
-        return false;
33
-    }
34
-
35
-    bool percentagesFromJson(const Json &data, std::vector<unsigned int> &percentages)
36
-    {
37
-        JsonObject jsonData = data.object_items();
38
-        if(!jsonData.empty())
39
-        {
40
-            JsonObject percentileData = jsonData[PercentileNames::TagName].object_items();
41
-            int order = percentileData[PercentileNames::NumberOfPercentiles].int_value();
42
-
43
-            percentages.resize(order);
44
-            for(int i = 0; i < order; ++i)
45
-            {
46
-                std::string percentileTag = PercentileNames::PercentilePercentageValue[i];
47
-                percentages[i] = static_cast<unsigned int>(percentileData[percentileTag].int_value());
48
-            }
49
-
50
-            return true;
51
-        }
52
+           switch(jsonType)
53
+           {
54
+               case LEGACY:
55
+               {
56
+                   obj.averageLuminance = static_cast<float>(lumJsonData[LuminanceNames::AverageRGB].number_value());
57
+                   obj.maxRLuminance = static_cast<float>(lumJsonData[LuminanceNames::MaxSCL0].number_value());
58
+                   obj.maxGLuminance = static_cast<float>(lumJsonData[LuminanceNames::MaxSCL1].number_value());
59
+                   obj.maxBLuminance = static_cast<float>(lumJsonData[LuminanceNames::MaxSCL2].number_value());
60
+
61
+                   JsonObject percentileData = lumJsonData[PercentileNames::TagName].object_items();
62
+                   obj.order = percentileData[PercentileNames::NumberOfPercentiles].int_value();
63
+                   if(!percentileData.empty())
64
+                   {
65
+                       obj.percentiles.resize(obj.order);
66
+                       for(int i = 0; i < obj.order; ++i)
67
+                       {
68
+                           std::string percentileTag = PercentileNames::TagName;
69
+                           percentileTag += std::to_string(i);
70
+                           obj.percentiles[i] = static_cast<unsigned int>(percentileData[percentileTag].int_value());
71
+                       }
72
+                   }
73
+                   return true;
74
+               } break;
75
+               case LLC:
76
+               {
77
+                   obj.averageLuminance = static_cast<float>(lumJsonData[LuminanceNames::AverageRGB].number_value());
78
+                   JsonArray maxScl = lumJsonData[LuminanceNames::MaxSCL].array_items();
79
+                   obj.maxRLuminance = static_cast<float>(maxScl[0].number_value());
80
+                   obj.maxGLuminance = static_cast<float>(maxScl[1].number_value());
81
+                   obj.maxBLuminance = static_cast<float>(maxScl[2].number_value());
82
+
83
+                   JsonObject percentileData = lumJsonData[LuminanceNames::LlcTagName].object_items();
84
+                   if(!percentileData.empty())
85
+                   {
86
+                       JsonArray distributionValues = percentileData[PercentileNames::DistributionValues].array_items();
87
+                       obj.order = static_cast<int>(distributionValues.size());
88
+                       obj.percentiles.resize(obj.order);
89
+                       for(int i = 0; i < obj.order; ++i)
90
+                       {
91
+                           obj.percentiles[i] = static_cast<unsigned int>(distributionValues[i].int_value());
92
+                       }
93
+                   }
94
+                   return true;
95
+               } break;
96
+           }
97
+       }
98
         return false;
99
     }
100
 
101
-    bool percentagesFromJson(const Json &data, unsigned int *percentages)
102
+    bool percentagesFromJson(const Json &data, std::vector<unsigned int> &percentages, const JsonType jsonType)
103
     {
104
         JsonObject jsonData = data.object_items();
105
         if(!jsonData.empty())
106
         {
107
-            JsonObject percentileData = jsonData[PercentileNames::TagName].object_items();
108
-            int order = percentileData[PercentileNames::NumberOfPercentiles].int_value();
109
-
110
-            for(int i = 0; i < order; ++i)
111
-            {
112
-                std::string percentileTag = PercentileNames::PercentilePercentageValue[i];
113
-                percentages[i] = static_cast<unsigned int>(percentileData[percentileTag].int_value());
114
-            }
115
+           switch(jsonType)
116
+           {
117
+               case LEGACY:
118
+               {
119
+                   JsonObject percentileData = jsonData[PercentileNames::TagName].object_items();
120
+                   int order = percentileData[PercentileNames::NumberOfPercentiles].int_value();
121
+                   percentages.resize(order);
122
+                   for(int i = 0; i < order; ++i)
123
+                   {
124
+                       std::string percentileTag = PercentileNames::PercentilePercentageValue[i];
125
+                       percentages[i] = static_cast<unsigned int>(percentileData[percentileTag].int_value());
126
+                   }
127
+                   return true;
128
+               } break;
129
+               case LLC:
130
+               {
131
+                   JsonObject percentileData = jsonData[LuminanceNames::LlcTagName].object_items();
132
+                   if(!percentileData.empty())
133
+                   {
134
+                       JsonArray percentageValues = percentileData[PercentileNames::DistributionIndex].array_items();
135
+                       int order = static_cast<int>(percentageValues.size());
136
+                       percentages.resize(order);
137
+                       for(int i = 0; i < order; ++i)
138
+                       {
139
+                           percentages[i] = static_cast<unsigned int>(percentageValues[i].int_value());
140
+                       }
141
+                   } 
142
+                   return true;
143
+               } break;
144
+           }
145
 
146
-            return true;
147
         }
148
         return false;
149
     }
150
 
151
-    bool bezierCurveFromJson(const Json &data, BezierCurveData &obj)
152
+    bool bezierCurveFromJson(const Json &data, BezierCurveData &obj, const JsonType jsonType)
153
     {
154
         JsonObject jsonData = data.object_items();
155
         if(!jsonData.empty())
156
         {
157
-            obj.order = jsonData[BezierCurveNames::NumberOfAnchors].int_value();
158
-            obj.coeff.resize(obj.order);
159
-            obj.sPx = jsonData[BezierCurveNames::KneePointX].int_value();
160
-            obj.sPy = jsonData[BezierCurveNames::KneePointY].int_value();
161
-            for(int i = 0; i < obj.order; ++i)
162
-            {
163
-                obj.coeff[i] = jsonData[BezierCurveNames::Anchors[i]].int_value();
164
-            }
165
-
166
-            return true;
167
+           switch(jsonType)
168
+           {
169
+               case LEGACY:
170
+               {
171
+                   obj.sPx = jsonData[BezierCurveNames::KneePointX].int_value();
172
+                   obj.sPy = jsonData[BezierCurveNames::KneePointY].int_value();
173
+                   obj.order = jsonData[BezierCurveNames::NumberOfAnchors].int_value();
174
+                   obj.coeff.resize(obj.order);
175
+                   for(int i = 0; i < obj.order; ++i)
176
+                   {
177
+                       obj.coeff[i] = jsonData[BezierCurveNames::Anchors[i]].int_value();
178
+                   }
179
+                   return true;    
180
+               } break;
181
+               case LLC:
182
+               {
183
+                   obj.sPx = jsonData[BezierCurveNames::KneePointX].int_value();
184
+                   obj.sPy = jsonData[BezierCurveNames::KneePointY].int_value();
185
+                   JsonArray anchorValues = data[BezierCurveNames::AnchorsTag].array_items();
186
+                   obj.order = static_cast<int>(anchorValues.size());
187
+                   obj.coeff.resize(obj.order);
188
+                   for(int i = 0; i < obj.order; ++i)
189
+                   {
190
+                       obj.coeff[i] = anchorValues[i].int_value();
191
+                   }
192
+                   return true;
193
+               } break;
194
+           }
195
         }
196
         return false;
197
     }
198
@@ -162,9 +206,7 @@
199
     void setPayloadSize(uint8_t *dataStream, int positionOnStream, int payload)
200
     {
201
         int payloadBytes = 1;
202
-
203
         for(;payload >= 0xFF; payload -= 0xFF, ++payloadBytes);
204
-
205
         if(payloadBytes > 1)
206
         {
207
             shiftData(dataStream, payloadBytes-1, mCurrentStreamByte, positionOnStream);
208
@@ -196,8 +238,6 @@
209
         }
210
     }
211
 
212
-//    const std::string LocalParameters = std::string("LocalParameters");
213
-//    const std::string TargetDisplayLuminance = std::string("TargetedSystemDisplayMaximumLuminance");
214
 };
215
 
216
 metadataFromJson::metadataFromJson() :
217
@@ -211,17 +251,17 @@
218
     delete mPimpl;
219
 }
220
 
221
-
222
 bool metadataFromJson::frameMetadataFromJson(const char* filePath,
223
                                               int frame,
224
                                               uint8_t *&metadata)
225
 {
226
     std::string path(filePath);
227
     JsonArray fileData = JsonHelper::readJsonArray(path);
228
-
229
+   JsonType jsonType = LEGACY;
230
     if(fileData.empty())
231
     {
232
-        return false;
233
+       jsonType = LLC;
234
+        fileData = JsonHelper::readJson(filePath).at("SceneInfo").array_items();
235
     }
236
 
237
 //    frame = frame + 1; //index on the array start at 0 frames starts at 1
238
@@ -233,7 +273,6 @@
239
     }
240
 
241
     int mSEIBytesToRead = 509;
242
-
243
     if(metadata)
244
     {
245
         delete(metadata);
246
@@ -241,13 +280,9 @@
247
     metadata = new uint8_t[mSEIBytesToRead];
248
     mPimpl->mCurrentStreamBit = 8;
249
     mPimpl->mCurrentStreamByte = 1;
250
+    memset(metadata, 0, mSEIBytesToRead);
251
 
252
-    for(int j = 0; j < mSEIBytesToRead; ++j)
253
-    {
254
-        (metadata)[j] = 0;
255
-    }
256
-
257
-    fillMetadataArray(fileData, frame, metadata);
258
+    fillMetadataArray(fileData, frame, jsonType, metadata);
259
     mPimpl->setPayloadSize(metadata, 0, mPimpl->mCurrentStreamByte);
260
     return true;
261
 }
262
@@ -256,9 +291,11 @@
263
 {
264
     std::string path(filePath);
265
     JsonArray fileData = JsonHelper::readJsonArray(path);
266
+   JsonType jsonType = LEGACY;
267
     if (fileData.empty())
268
     {
269
-        return -1;
270
+       jsonType = LLC;
271
+        fileData = JsonHelper::readJson(filePath).at("SceneInfo").array_items();
272
     }
273
 
274
     int numFrames = static_cast<int>(fileData.size());
275
@@ -266,17 +303,12 @@
276
     for (int frame = 0; frame < numFrames; ++frame)
277
     {
278
         metadata[frame] = new uint8_t[509];
279
-        for (int i = 0; i < 509; ++i)
280
-        {
281
-            metadata[frame][i] = 0;
282
-        }
283
+        memset(metadata[frame], 0, 509);
284
         mPimpl->mCurrentStreamBit = 8;
285
         mPimpl->mCurrentStreamByte = 1;
286
 
287
-        fillMetadataArray(fileData, frame, metadata[frame]);
288
-
289
+        fillMetadataArray(fileData, frame, jsonType, metadata[frame]);
290
         mPimpl->setPayloadSize(metadata[frame], 0, mPimpl->mCurrentStreamByte);
291
-
292
     }
293
 
294
     return numFrames;
295
@@ -321,7 +353,7 @@
296
     /* NOTE: We leave TWO BYTES of space for the payload */
297
     mPimpl->mCurrentStreamByte += 2;
298
 
299
-    fillMetadataArray(fileData, frame, metadata);
300
+    fillMetadataArray(fileData, frame, LEGACY, metadata);
301
 
302
     /* Set payload in bytes 2 & 3 as indicated in Extended InfoFrame Type syntax */
303
     metadata[2] = (mPimpl->mCurrentStreamByte & 0xFF00) >> 8;
304
@@ -331,7 +363,7 @@
305
 
306
 int metadataFromJson::movieExtendedInfoFrameMetadataFromJson(const char* filePath, uint8_t **&metadata)
307
 {
308
-   std::string path(filePath);
309
+    std::string path(filePath);
310
     JsonArray fileData = JsonHelper::readJsonArray(path);
311
     if(fileData.empty())
312
     {
313
@@ -344,9 +376,9 @@
314
     {
315
         metadata[frame] = new uint8_t[509];
316
         for(int i = 0; i < 509; ++i) 
317
-       {
318
-           metadata[frame][i] = 0;
319
-       }
320
+        {
321
+            metadata[frame][i] = 0;
322
+        }
323
         mPimpl->mCurrentStreamBit = 8;
324
         mPimpl->mCurrentStreamByte = 0;
325
 
326
@@ -356,7 +388,7 @@
327
         /* NOTE: We leave TWO BYTES of space for the payload */
328
         mPimpl->mCurrentStreamByte += 2;
329
 
330
-        fillMetadataArray(fileData, frame, metadata[frame]);
331
+        fillMetadataArray(fileData, frame, LEGACY, metadata[frame]);
332
 
333
         /* Set payload in bytes 2 & 3 as indicated in Extended InfoFrame Type syntax */
334
         metadata[frame][2] = (mPimpl->mCurrentStreamByte & 0xFF00) >> 8;
335
@@ -366,7 +398,7 @@
336
     return numFrames;
337
 }
338
 
339
-void metadataFromJson::fillMetadataArray(const JsonArray &fileData, int frame, uint8_t *&metadata)
340
+void metadataFromJson::fillMetadataArray(const JsonArray &fileData, int frame, const JsonType jsonType, uint8_t *&metadata)
341
 {
342
     const uint8_t countryCode = 0xB5;
343
     const uint16_t terminalProviderCode = 0x003C;
344
@@ -381,57 +413,68 @@
345
     mPimpl->appendBits(metadata, applicationIdentifier, 8);
346
     mPimpl->appendBits(metadata, applicationVersion, 8);
347
 
348
-    //Note: Validated only add up to two local selections, ignore the rest
349
-    JsonArray jsonArray = fileData[frame][JsonDataKeys::LocalParameters].array_items();
350
-    int ellipsesNum = static_cast<int>(jsonArray.size() > 2 ? 2 : jsonArray.size());
351
-    uint16_t numWindows = (uint16_t)fileData[frame][JsonDataKeys::NumberOfWindows].int_value();
352
-    mPimpl->appendBits(metadata, numWindows, 2);
353
-    for (int i = 0; i < ellipsesNum; ++i)
354
+    uint16_t numWindows = 0;
355
+    /* HDR10+ LLC doesn't consider local windows */
356
+    if(jsonType & LLC)
357
+    {
358
+        numWindows = 1;
359
+        mPimpl->appendBits(metadata, numWindows, 2);
360
+    }
361
+    else
362
     {
363
-        mPimpl->appendBits(metadata, jsonArray[i][EllipseSelectionNames::WindowData]
364
-            [EllipseSelectionNames::WindowUpperLeftCornerX].int_value(), 16);
365
-        mPimpl->appendBits(metadata, jsonArray[i][EllipseSelectionNames::WindowData]
366
-            [EllipseSelectionNames::WindowUpperLeftCornerY].int_value(), 16);
367
-        mPimpl->appendBits(metadata, jsonArray[i][EllipseSelectionNames::WindowData]
368
-            [EllipseSelectionNames::WindowLowerRightCornerX].int_value(), 16);
369
-        mPimpl->appendBits(metadata, jsonArray[i][EllipseSelectionNames::WindowData]
370
-            [EllipseSelectionNames::WindowLowerRightCornerY].int_value(), 16);
371
+        //Note: Validated only add up to two local selections, ignore the rest
372
+        JsonArray jsonArray = fileData[frame][JsonDataKeys::LocalParameters].array_items();
373
+        int ellipsesNum = static_cast<int>(jsonArray.size() > 2 ? 2 : jsonArray.size());
374
+        numWindows = (uint16_t)fileData[frame][JsonDataKeys::NumberOfWindows].int_value();
375
+        mPimpl->appendBits(metadata, numWindows, 2);
376
+        for (int i = 0; i < ellipsesNum; ++i)
377
+        {
378
+            mPimpl->appendBits(metadata, jsonArray[i][EllipseSelectionNames::WindowData]
379
+                    [EllipseSelectionNames::WindowUpperLeftCornerX].int_value(), 16);
380
+            mPimpl->appendBits(metadata, jsonArray[i][EllipseSelectionNames::WindowData]
381
+                    [EllipseSelectionNames::WindowUpperLeftCornerY].int_value(), 16);
382
+            mPimpl->appendBits(metadata, jsonArray[i][EllipseSelectionNames::WindowData]
383
+                    [EllipseSelectionNames::WindowLowerRightCornerX].int_value(), 16);
384
+            mPimpl->appendBits(metadata, jsonArray[i][EllipseSelectionNames::WindowData]
385
+                    [EllipseSelectionNames::WindowLowerRightCornerY].int_value(), 16);
386
 
387
-        JsonObject ellipseJsonObject = jsonArray[i][EllipseNames::TagName].object_items();
388
+            JsonObject ellipseJsonObject = jsonArray[i][EllipseNames::TagName].object_items();
389
 
390
-        mPimpl->appendBits(metadata,
391
-            static_cast<uint16_t>(ellipseJsonObject[EllipseNames::CenterOfEllipseX].int_value()),
392
-            16);
393
+            mPimpl->appendBits(metadata,
394
+                               static_cast<uint16_t>(ellipseJsonObject[EllipseNames::CenterOfEllipseX].int_value()),
395
+                    16);
396
 
397
-        mPimpl->appendBits(metadata,
398
-            static_cast<uint16_t>(ellipseJsonObject[EllipseNames::CenterOfEllipseY].int_value()),
399
-            16);
400
+            mPimpl->appendBits(metadata,
401
+                               static_cast<uint16_t>(ellipseJsonObject[EllipseNames::CenterOfEllipseY].int_value()),
402
+                    16);
403
 
404
-        int angle = ellipseJsonObject[EllipseNames::RotationAngle].int_value();
405
-        uint8_t rotationAngle = static_cast<uint8_t>((angle > 180.0) ? angle - 180.0 : angle);
406
-        mPimpl->appendBits(metadata, rotationAngle, 8);
407
+            int angle = ellipseJsonObject[EllipseNames::RotationAngle].int_value();
408
+            uint8_t rotationAngle = static_cast<uint8_t>((angle > 180.0) ? angle - 180.0 : angle);
409
+            mPimpl->appendBits(metadata, rotationAngle, 8);
410
 
411
-        uint16_t semimajorExternalAxis =
412
-            static_cast<uint16_t>(ellipseJsonObject[EllipseNames::SemiMajorAxisExternalEllipse].int_value());
413
+            uint16_t semimajorExternalAxis =
414
+                    static_cast<uint16_t>(ellipseJsonObject[EllipseNames::SemiMajorAxisExternalEllipse].int_value());
415
 
416
-        uint16_t semiminorExternalAxis =
417
-            static_cast<uint16_t>(ellipseJsonObject[EllipseNames::SemiMinorAxisExternalEllipse].int_value());
418
+            uint16_t semiminorExternalAxis =
419
+                    static_cast<uint16_t>(ellipseJsonObject[EllipseNames::SemiMinorAxisExternalEllipse].int_value());
420
 
421
-        uint16_t semimajorInternalEllipse =
422
-            static_cast<uint16_t>(ellipseJsonObject[EllipseNames::SemiMajorAxisInternalEllipse].int_value());
423
+            uint16_t semimajorInternalEllipse =
424
+                    static_cast<uint16_t>(ellipseJsonObject[EllipseNames::SemiMajorAxisInternalEllipse].int_value());
425
 
426
-        mPimpl->appendBits(metadata, semimajorInternalEllipse, 16);
427
+            mPimpl->appendBits(metadata, semimajorInternalEllipse, 16);
428
 
429
-        mPimpl->appendBits(metadata, semimajorExternalAxis, 16);
430
-        mPimpl->appendBits(metadata, semiminorExternalAxis, 16);
431
-        uint8_t overlapProcessOption = static_cast<uint8_t>(ellipseJsonObject[EllipseNames::OverlapProcessOption].int_value());
432
-        //TODO: Uses Layering method, the value is "1"
433
-        mPimpl->appendBits(metadata, overlapProcessOption, 1);
434
+            mPimpl->appendBits(metadata, semimajorExternalAxis, 16);
435
+            mPimpl->appendBits(metadata, semiminorExternalAxis, 16);
436
+            uint8_t overlapProcessOption = static_cast<uint8_t>(ellipseJsonObject[EllipseNames::OverlapProcessOption].int_value());
437
+            //TODO: Uses Layering method, the value is "1"
438
+            mPimpl->appendBits(metadata, overlapProcessOption, 1);
439
+        }
440
     }
441
+
442
     /* Targeted System Display Data */
443
-    uint32_t monitorPeak = fileData[frame][JsonDataKeys::TargetDisplayLuminance].int_value();     //500;
444
+    uint32_t monitorPeak = fileData[frame][JsonDataKeys::TargetDisplayLuminance].int_value();
445
     mPimpl->appendBits(metadata, monitorPeak, 27);
446
-    //NOTE: Set as false for now, as requested
447
+
448
     uint8_t targetedSystemDisplayActualPeakLuminanceFlag = 0;
449
     mPimpl->appendBits(metadata, targetedSystemDisplayActualPeakLuminanceFlag, 1);
450
     if (targetedSystemDisplayActualPeakLuminanceFlag)
451
@@ -439,21 +482,20 @@
452
         //TODO
453
     }
454
 
455
-    /* Max rgb values (maxScl)*/
456
+    /* Max RGB values (maxScl)*/
457
     /* Luminance values/percentile for each window */
458
     for (int w = 0; w < numWindows; ++w)
459
     {
460
         Json lumObj = fileData[frame][LuminanceNames::TagName];
461
         LuminanceParameters luminanceData;
462
-        if (!mPimpl->luminanceParamFromJson(lumObj, luminanceData))
463
+        if(!mPimpl->luminanceParamFromJson(lumObj, luminanceData, jsonType))
464
         {
465
             std::cout << "error parsing luminance parameters frame: " << w << std::endl;
466
         }
467
 
468
-        /* NOTE: Maxscl from 0 t 100,000 based on data that says in values of 0.00001
469
+        /* NOTE: Maxscl from 0 to 100,000 based on data that says in values of 0.00001
470
         * one for each channel R,G,B
471
         */
472
-
473
         mPimpl->appendBits(metadata, static_cast<uint8_t>(((int)luminanceData.maxRLuminance & 0x10000) >> 16), 1);
474
         mPimpl->appendBits(metadata, static_cast<uint16_t>((int)luminanceData.maxRLuminance & 0xFFFF), 16);
475
         mPimpl->appendBits(metadata, static_cast<uint8_t>(((int)luminanceData.maxGLuminance & 0x10000) >> 16), 1);
476
@@ -467,11 +509,12 @@
477
         uint8_t numDistributionMaxrgbPercentiles = static_cast<uint8_t>(luminanceData.order);
478
         mPimpl->appendBits(metadata, numDistributionMaxrgbPercentiles, 4);
479
 
480
-        std::vector<unsigned int>percentilPercentages;
481
-        mPimpl->percentagesFromJson(lumObj, percentilPercentages);
482
+        std::vector<unsigned int>percentilePercentages;
483
+        mPimpl->percentagesFromJson(lumObj, percentilePercentages, jsonType);
484
+
485
         for (int i = 0; i < numDistributionMaxrgbPercentiles; ++i)
486
         {
487
-            uint8_t distributionMaxrgbPercentage = static_cast<uint8_t>(percentilPercentages.at(i));
488
+            uint8_t distributionMaxrgbPercentage = static_cast<uint8_t>(percentilePercentages.at(i));
489
             mPimpl->appendBits(metadata, distributionMaxrgbPercentage, 7);
490
 
491
             /* 17bits: 1bit then 16 */
492
@@ -483,7 +526,7 @@
493
         }
494
 
495
         /* 10bits: Fraction bright pixels */
496
-        uint16_t fractionBrightPixels = 1;
497
+        uint16_t fractionBrightPixels = 0;
498
         mPimpl->appendBits(metadata, fractionBrightPixels, 10);
499
 
500
     }
501
@@ -498,24 +541,24 @@
502
     /* Bezier Curve Data */
503
     for (int w = 0; w < numWindows; ++w)
504
     {
505
-        uint8_t toneMappingFlag = 1;
506
+        uint8_t toneMappingFlag = 0;
507
        /* Check if the window contains tone mapping bezier curve data and set toneMappingFlag appropriately */
508
-       //Json bezierData = fileData[frame][BezierCurveNames::TagName];
509
         BezierCurveData curveData;
510
        /* Select curve data based on global window */
511
         if (w == 0)
512
-        {
513
-            if (!mPimpl->bezierCurveFromJson(fileData[frame][BezierCurveNames::TagName], curveData))
514
+        {          
515
+            if (mPimpl->bezierCurveFromJson(fileData[frame][BezierCurveNames::TagName], curveData, jsonType))
516
             {
517
-               toneMappingFlag = 0;
518
+                toneMappingFlag = 1;
519
             }
520
         }
521
-       /* Select curve data based on local window */
522
+        /* Select curve data based on local window */
523
         else
524
         {
525
-            if (!mPimpl->bezierCurveFromJson(jsonArray[w - 1][BezierCurveNames::TagName], curveData))
526
+            JsonArray jsonArray = fileData[frame][JsonDataKeys::LocalParameters].array_items();
527
+            if (mPimpl->bezierCurveFromJson(jsonArray[w - 1][BezierCurveNames::TagName], curveData, jsonType))
528
             {
529
-               toneMappingFlag = 0;
530
+                toneMappingFlag = 1;
531
             }
532
         }      
533
         mPimpl->appendBits(metadata, toneMappingFlag, 1);
534
x265_2.7.tar.gz/source/dynamicHDR10/metadataFromJson.h -> x265_2.9.tar.gz/source/dynamicHDR10/metadataFromJson.h Changed
31
 
1
@@ -26,7 +26,7 @@
2
 #define METADATAFROMJSON_H
3
 
4
 #include<stdint.h>
5
-#include "string"
6
+#include<cstring>
7
 #include "JsonHelper.h"
8
 
9
 class metadataFromJson
10
@@ -36,6 +36,11 @@
11
     metadataFromJson();
12
     ~metadataFromJson();
13
 
14
+   enum JsonType{
15
+       LEGACY,
16
+       LLC
17
+   };
18
+       
19
 
20
     /**
21
      * @brief frameMetadataFromJson: Generates a sigle frame metadata array from Json file with all
22
@@ -98,7 +103,7 @@
23
 
24
     class DynamicMetaIO;
25
     DynamicMetaIO *mPimpl;
26
-    void fillMetadataArray(const JsonArray &fileData, int frame, uint8_t *&metadata);
27
+    void fillMetadataArray(const JsonArray &fileData, int frame, const JsonType jsonType, uint8_t *&metadata);
28
 };
29
 
30
 #endif // METADATAFROMJSON_H
31
x265_2.7.tar.gz/source/encoder/analysis.cpp -> x265_2.9.tar.gz/source/encoder/analysis.cpp Changed
634
 
1
@@ -37,7 +37,7 @@
2
 using namespace X265_NS;
3
 
4
 /* An explanation of rate distortion levels (--rd-level)
5
- * 
6
+ *
7
  * rd-level 0 generates no recon per CU (NO RDO or Quant)
8
  *
9
  *   sa8d selection between merge / skip / inter / intra and split
10
@@ -187,27 +187,24 @@
11
         for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
12
             ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
13
     }
14
-    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead)
15
+    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && (m_slice->m_sliceType != I_SLICE))
16
     {
17
-        m_multipassAnalysis = (analysis2PassFrameData*)m_frame->m_analysis2Pass.analysisFramedata;
18
-        m_multipassDepth = &m_multipassAnalysis->depth[ctu.m_cuAddr * ctu.m_numPartitions];
19
-        if (m_slice->m_sliceType != I_SLICE)
20
+        int numPredDir = m_slice->isInterP() ? 1 : 2;
21
+        m_reuseInterDataCTU = m_frame->m_analysisData.interData;
22
+        for (int dir = 0; dir < numPredDir; dir++)
23
         {
24
-            int numPredDir = m_slice->isInterP() ? 1 : 2;
25
-            for (int dir = 0; dir < numPredDir; dir++)
26
-            {
27
-                m_multipassMv[dir] = &m_multipassAnalysis->m_mv[dir][ctu.m_cuAddr * ctu.m_numPartitions];
28
-                m_multipassMvpIdx[dir] = &m_multipassAnalysis->mvpIdx[dir][ctu.m_cuAddr * ctu.m_numPartitions];
29
-                m_multipassRef[dir] = &m_multipassAnalysis->ref[dir][ctu.m_cuAddr * ctu.m_numPartitions];
30
-            }
31
-            m_multipassModes = &m_multipassAnalysis->modes[ctu.m_cuAddr * ctu.m_numPartitions];
32
+            m_reuseMv[dir] = &m_reuseInterDataCTU->mv[dir][ctu.m_cuAddr * ctu.m_numPartitions];
33
+            m_reuseMvpIdx[dir] = &m_reuseInterDataCTU->mvpIdx[dir][ctu.m_cuAddr * ctu.m_numPartitions];
34
         }
35
+        m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * ctu.m_numPartitions];
36
+        m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
37
+        m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
38
     }
39
-
40
+    
41
     if ((m_param->analysisSave || m_param->analysisLoad) && m_slice->m_sliceType != I_SLICE && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel < 10)
42
     {
43
         int numPredDir = m_slice->isInterP() ? 1 : 2;
44
-        m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
45
+        m_reuseInterDataCTU = m_frame->m_analysisData.interData;
46
         m_reuseRef = &m_reuseInterDataCTU->ref [ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
47
         m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
48
         m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
49
@@ -224,7 +221,7 @@
50
 
51
     if (m_slice->m_sliceType == I_SLICE)
52
     {
53
-        analysis_intra_data* intraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
54
+        x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
55
         if (m_param->analysisLoad && m_param->analysisReuseLevel > 1)
56
         {
57
             memcpy(ctu.m_cuDepth, &intraDataCTU->depth[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
58
@@ -243,7 +240,7 @@
59
 
60
         if (bCopyAnalysis)
61
         {
62
-            analysis_inter_data* interDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
63
+            x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
64
             int posCTU = ctu.m_cuAddr * numPartition;
65
             memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
66
             memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
67
@@ -253,7 +250,7 @@
68
 
69
             if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !m_param->bMVType)
70
             {
71
-                analysis_intra_data* intraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
72
+                x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
73
                 memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
74
                 memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
75
             }
76
@@ -279,14 +276,14 @@
77
         }
78
         else if ((m_param->analysisLoad && m_param->analysisReuseLevel == 10) || ((m_param->bMVType == AVC_INFO) && m_param->analysisReuseLevel >= 7 && ctu.m_numPartitions <= 16))
79
         {
80
-            analysis_inter_data* interDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
81
+            x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
82
             int posCTU = ctu.m_cuAddr * numPartition;
83
             memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
84
             memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
85
             memcpy(ctu.m_partSize, &interDataCTU->partSize[posCTU], sizeof(uint8_t) * numPartition);
86
             if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !(m_param->bMVType == AVC_INFO))
87
             {
88
-                analysis_intra_data* intraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
89
+                x265_analysis_intra_data* intraDataCTU = m_frame->m_analysisData.intraData;
90
                 memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
91
                 memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
92
             }
93
@@ -518,19 +515,20 @@
94
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
95
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
96
 
97
-    bool bAlreadyDecided = parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX;
98
-    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
99
+    bool bAlreadyDecided = m_param->intraRefine != 4 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX;
100
+    bool bDecidedDepth = m_param->intraRefine != 4 && parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
101
     int split = 0;
102
-    if (m_param->intraRefine)
103
+    if (m_param->intraRefine && m_param->intraRefine != 4)
104
     {
105
-        split = ((cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1)) && bDecidedDepth);
106
+        split = m_param->scaleFactor && bDecidedDepth && (!mightNotSplit || 
107
+            ((cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
108
         if (cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize]) && !bDecidedDepth)
109
             bAlreadyDecided = false;
110
     }
111
 
112
     if (bAlreadyDecided)
113
     {
114
-        if (bDecidedDepth)
115
+        if (bDecidedDepth && mightNotSplit)
116
         {
117
             Mode& mode = md.pred[0];
118
             md.bestMode = &mode;
119
@@ -1184,7 +1182,7 @@
120
 
121
         if (m_evaluateInter)
122
         {
123
-            if (m_param->interRefine == 2)
124
+            if (m_refineLevel == 2)
125
             {
126
                 if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)
127
                     skipModes = true;
128
@@ -1283,11 +1281,11 @@
129
                 }
130
             }
131
         }
132
-        if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_multipassAnalysis)
133
+        if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
134
         {
135
-            if (mightNotSplit && depth == m_multipassDepth[cuGeom.absPartIdx])
136
+            if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
137
             {
138
-                if (m_multipassModes[cuGeom.absPartIdx] == MODE_SKIP)
139
+                if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
140
                 {
141
                     md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
142
                     md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
143
@@ -1307,7 +1305,7 @@
144
             md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
145
             checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
146
             if (m_param->rdLevel)
147
-                skipModes = (m_param->bEnableEarlySkip || m_param->interRefine == 2)
148
+                skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2)
149
                 && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
150
         }
151
         if (md.bestMode && m_param->bEnableRecursionSkip && !bCtuInfoCheck && !(m_param->bMVType && m_param->analysisReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
152
@@ -1874,7 +1872,7 @@
153
 
154
         if (m_evaluateInter)
155
         {
156
-            if (m_param->interRefine == 2)
157
+            if (m_refineLevel == 2)
158
             {
159
                 if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)
160
                     skipModes = true;
161
@@ -1976,11 +1974,11 @@
162
             }
163
         }
164
 
165
-        if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_multipassAnalysis)
166
+        if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
167
         {
168
-            if (mightNotSplit && depth == m_multipassDepth[cuGeom.absPartIdx])
169
+            if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
170
             {
171
-                if (m_multipassModes[cuGeom.absPartIdx] == MODE_SKIP)
172
+                if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
173
                 {
174
                     md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
175
                     md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
176
@@ -2004,7 +2002,7 @@
177
             md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
178
             md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
179
             checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
180
-            skipModes = (m_param->bEnableEarlySkip || m_param->interRefine == 2) &&
181
+            skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2) &&
182
                 md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
183
             refMasks[0] = allSplitRefs;
184
             md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
185
@@ -2413,9 +2411,18 @@
186
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
187
     bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
188
 
189
-    int split = (m_param->interRefine && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1) && bDecidedDepth);
190
+    TrainingData td;
191
+    td.init(parentCTU, cuGeom);
192
 
193
-    if (bDecidedDepth)
194
+    if (!m_param->bDynamicRefine)
195
+        m_refineLevel = m_param->interRefine;
196
+    else
197
+        m_refineLevel = m_frame->m_classifyFrame ? 1 : 3;
198
+    int split = (m_param->scaleFactor && bDecidedDepth && (!mightNotSplit || 
199
+        (m_refineLevel && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
200
+    td.split = split;
201
+
202
+    if (bDecidedDepth && mightNotSplit)
203
     {
204
         setLambdaFromQP(parentCTU, qp, lqp);
205
 
206
@@ -2423,39 +2430,44 @@
207
         md.bestMode = &mode;
208
         mode.cu.initSubCU(parentCTU, cuGeom, qp);
209
         PartSize size = (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx];
210
-        if (parentCTU.isIntra(cuGeom.absPartIdx) && m_param->interRefine < 2)
211
+        if (parentCTU.isIntra(cuGeom.absPartIdx) && m_refineLevel < 2)
212
         {
213
-            bool reuseModes = !((m_param->intraRefine == 3) ||
214
-                                (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX));
215
-            if (reuseModes)
216
+            if (m_param->intraRefine == 4)
217
+                compressIntraCU(parentCTU, cuGeom, qp);
218
+            else
219
             {
220
-                memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
221
-                memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
222
+                bool reuseModes = !((m_param->intraRefine == 3) ||
223
+                    (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX));
224
+                if (reuseModes)
225
+                {
226
+                    memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
227
+                    memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
228
+                }
229
+                checkIntra(mode, cuGeom, size);
230
             }
231
-            checkIntra(mode, cuGeom, size);
232
         }
233
-        else if (!parentCTU.isIntra(cuGeom.absPartIdx) && m_param->interRefine < 2)
234
+        else if (!parentCTU.isIntra(cuGeom.absPartIdx) && m_refineLevel < 2)
235
         {
236
             mode.cu.copyFromPic(parentCTU, cuGeom, m_csp, false);
237
             uint32_t numPU = parentCTU.getNumPartInter(cuGeom.absPartIdx);
238
             for (uint32_t part = 0; part < numPU; part++)
239
             {
240
                 PredictionUnit pu(mode.cu, cuGeom, part);
241
-                if (m_param->analysisReuseLevel >= 7)
242
+                if ((m_param->analysisLoad && m_param->analysisReuseLevel == 10) || (m_param->bMVType == AVC_INFO && m_param->analysisReuseLevel >= 7))
243
                 {
244
-                    analysis_inter_data* interDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
245
+                    x265_analysis_inter_data* interDataCTU = m_frame->m_analysisData.interData;
246
                     int cuIdx = (mode.cu.m_cuAddr * parentCTU.m_numPartitions) + cuGeom.absPartIdx;
247
                     mode.cu.m_mergeFlag[pu.puAbsPartIdx] = interDataCTU->mergeFlag[cuIdx + part];
248
                     mode.cu.setPUInterDir(interDataCTU->interDir[cuIdx + part], pu.puAbsPartIdx, part);
249
                     for (int list = 0; list < m_slice->isInterB() + 1; list++)
250
                     {
251
-                        mode.cu.setPUMv(list, interDataCTU->mv[list][cuIdx + part], pu.puAbsPartIdx, part);
252
+                        mode.cu.setPUMv(list, interDataCTU->mv[list][cuIdx + part].word, pu.puAbsPartIdx, part);
253
                         mode.cu.setPURefIdx(list, interDataCTU->refIdx[list][cuIdx + part], pu.puAbsPartIdx, part);
254
                         mode.cu.m_mvpIdx[list][pu.puAbsPartIdx] = interDataCTU->mvpIdx[list][cuIdx + part];
255
                     }
256
                     if (!mode.cu.m_mergeFlag[pu.puAbsPartIdx])
257
                     {
258
-                        if (m_param->mvRefine)
259
+                        if (m_param->mvRefine || m_param->interRefine == 1)
260
                             m_me.setSourcePU(*mode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, false);
261
                         //AMVP
262
                         MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];
263
@@ -2465,23 +2477,37 @@
264
                             int ref = mode.cu.m_refIdx[list][pu.puAbsPartIdx];
265
                             if (ref == -1)
266
                                 continue;
267
-                            mode.cu.getPMV(mode.interNeighbours, list, ref, mode.amvpCand[list][ref], mvc);
268
-                            MV mvp = mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]];
269
-                            if (m_param->mvRefine)
270
+                            MV mvp;
271
+
272
+                            int numMvc = mode.cu.getPMV(mode.interNeighbours, list, ref, mode.amvpCand[list][ref], mvc);
273
+                            if (m_param->interRefine != 1)
274
+                                mvp = mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]];
275
+                            else
276
+                                mvp = interDataCTU->mv[list][cuIdx + part].word;
277
+                            if (m_param->mvRefine || m_param->interRefine == 1)
278
                             {
279
                                 MV outmv;
280
-                                searchMV(mode, pu, list, ref, outmv);
281
+                                searchMV(mode, pu, list, ref, outmv, mvp, numMvc, mvc);
282
                                 mode.cu.setPUMv(list, outmv, pu.puAbsPartIdx, part);
283
                             }
284
-                            mode.cu.m_mvd[list][pu.puAbsPartIdx] = mode.cu.m_mv[list][pu.puAbsPartIdx] - mvp;
285
+                            mode.cu.m_mvd[list][pu.puAbsPartIdx] = mode.cu.m_mv[list][pu.puAbsPartIdx] - mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]]/*mvp*/;
286
                         }
287
                     }
288
-                    else if(m_param->scaleFactor)
289
+                    else
290
                     {
291
                         MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
292
                         uint8_t candDir[MRG_MAX_NUM_CANDS];
293
                         mode.cu.getInterMergeCandidates(pu.puAbsPartIdx, part, candMvField, candDir);
294
                         uint8_t mvpIdx = mode.cu.m_mvpIdx[0][pu.puAbsPartIdx];
295
+                        if (mode.cu.isBipredRestriction())
296
+                        {
297
+                            /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */
298
+                            if (candDir[mvpIdx] == 3)
299
+                            {
300
+                                candDir[mvpIdx] = 1;
301
+                                candMvField[mvpIdx][1].refIdx = REF_NOT_VALID;
302
+                            }
303
+                        }
304
                         mode.cu.setPUInterDir(candDir[mvpIdx], pu.puAbsPartIdx, part);
305
                         mode.cu.setPUMv(0, candMvField[mvpIdx][0].mv, pu.puAbsPartIdx, part);
306
                         mode.cu.setPUMv(1, candMvField[mvpIdx][1].mv, pu.puAbsPartIdx, part);
307
@@ -2491,7 +2517,7 @@
308
                 }
309
                 motionCompensation(mode.cu, pu, mode.predYuv, true, (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));
310
             }
311
-            if (!m_param->interRefine && parentCTU.isSkipped(cuGeom.absPartIdx))
312
+            if (!m_param->interRefine && !m_param->bDynamicRefine && parentCTU.isSkipped(cuGeom.absPartIdx))
313
                 encodeResAndCalcRdSkipCU(mode);
314
             else
315
                 encodeResAndCalcRdInterCU(mode, cuGeom);
316
@@ -2502,7 +2528,7 @@
317
                 checkDQP(mode, cuGeom);
318
         }
319
 
320
-        if (m_param->interRefine < 2)
321
+        if (m_refineLevel < 2)
322
         {
323
             if (m_bTryLossless)
324
                 tryLossless(cuGeom);
325
@@ -2530,7 +2556,10 @@
326
             }
327
         }
328
 
329
-        if (m_param->interRefine > 1 || (m_param->interRefine && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP  && !mode.cu.isSkipped(0)))
330
+        if (m_param->bDynamicRefine)
331
+            classifyCU(parentCTU,cuGeom, *md.bestMode, td);
332
+
333
+        if (m_refineLevel > 1 || (m_refineLevel && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP  && !mode.cu.isSkipped(0)))
334
         {
335
             m_evaluateInter = 1;
336
             m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, cuGeom, qp) : compressInterCU_rd0_4(parentCTU, cuGeom, qp);
337
@@ -2589,7 +2618,7 @@
338
         else
339
             updateModeCost(*splitPred);
340
 
341
-        if (m_param->interRefine)
342
+        if (m_refineLevel)
343
         {
344
             if (m_param->rdLevel > 1)
345
                 checkBestMode(*splitPred, cuGeom.depth);
346
@@ -2603,6 +2632,89 @@
347
         md.bestMode->cu.copyToPic(depth);
348
         md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
349
     }
350
+    if (m_param->bDynamicRefine && bDecidedDepth)
351
+        trainCU(parentCTU, cuGeom, *md.bestMode, td);
352
+}
353
+
354
+void Analysis::classifyCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData)
355
+{
356
+    uint32_t depth = cuGeom.depth;
357
+    trainData.cuVariance = calculateCUVariance(ctu, cuGeom);
358
+    if (m_frame->m_classifyFrame)
359
+    {
360
+        uint64_t diffRefine[X265_REFINE_INTER_LEVELS];
361
+        uint64_t diffRefineRd[X265_REFINE_INTER_LEVELS];
362
+        float probRefine[X265_REFINE_INTER_LEVELS] = { 0 };
363
+        uint8_t varRefineLevel = 1;
364
+        uint8_t rdRefineLevel = 1;
365
+        uint64_t cuCost = bestMode.rdCost;
366
+        int offset = (depth * X265_REFINE_INTER_LEVELS);
367
+        if (cuCost < m_frame->m_classifyRd[offset])
368
+            m_refineLevel = 1;
369
+        else
370
+        {
371
+            uint64_t trainingCount = 0;
372
+            for (uint8_t i = 0; i < X265_REFINE_INTER_LEVELS; i++)
373
+            {
374
+                offset = (depth * X265_REFINE_INTER_LEVELS) + i;
375
+                trainingCount += m_frame->m_classifyCount[offset];
376
+            }
377
+            for (uint8_t i = 0; i < X265_REFINE_INTER_LEVELS; i++)
378
+            {
379
+                offset = (depth * X265_REFINE_INTER_LEVELS) + i;
380
+                /* Calculate distance values */
381
+                diffRefine[i] = abs((int64_t)(trainData.cuVariance - m_frame->m_classifyVariance[offset]));
382
+                diffRefineRd[i] = abs((int64_t)(cuCost - m_frame->m_classifyRd[offset]));
383
+
384
+                /* Calculate prior probability - ranges between 0 and 1 */
385
+                if (trainingCount)
386
+                    probRefine[i] = ((float)m_frame->m_classifyCount[offset] / (float)trainingCount);
387
+
388
+                /* Bayesian classification - P(c|x)P(x) = P(x|c)P(c)
389
+                P(c|x) is the posterior probability of class given predictor.
390
+                P(c) is the prior probability of class.
391
+                P(x|c) is the likelihood which is the probability of predictor given class.
392
+                P(x) is the prior probability of predictor.*/
393
+                int curRefineLevel = m_refineLevel - 1;
394
+                if ((diffRefine[i] * probRefine[curRefineLevel]) < (diffRefine[curRefineLevel] * probRefine[i]))
395
+                    varRefineLevel = i + 1;
396
+                if ((diffRefineRd[i] * probRefine[curRefineLevel]) < (diffRefineRd[curRefineLevel] * probRefine[i]))
397
+                    rdRefineLevel = i + 1;
398
+            }
399
+            m_refineLevel = X265_MAX(varRefineLevel, rdRefineLevel);
400
+        }
401
+    }
402
+}
403
+
404
+void Analysis::trainCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData)
405
+{
406
+    uint32_t depth = cuGeom.depth;
407
+    int classify = 1;
408
+    if (!m_frame->m_classifyFrame)
409
+    {
410
+        /* classify = 1 : CUs for which the save data matches with that after encoding with refine-inter 3
411
+                          and CUs that has split.
412
+           classify = 2 : CUs which are encoded as simple modes (Skip/Merge/2Nx2N).
413
+           classify = 3 : CUs encoded as any other mode. */
414
+
415
+        bool refineInter0 = (trainData.predMode == ctu.m_predMode[cuGeom.absPartIdx] &&
416
+            trainData.partSize == ctu.m_partSize[cuGeom.absPartIdx] &&
417
+            trainData.mergeFlag == ctu.m_mergeFlag[cuGeom.absPartIdx]);
418
+        bool refineInter1 = (depth == m_param->maxCUDepth - 1) && trainData.split;
419
+        if (refineInter0 || refineInter1)
420
+            classify = 1;
421
+        else if (trainData.partSize == SIZE_2Nx2N && trainData.partSize == ctu.m_partSize[cuGeom.absPartIdx])
422
+            classify = 2;
423
+        else
424
+            classify = 3;
425
+    }
426
+    else
427
+        classify = m_refineLevel;
428
+    uint64_t cuCost = bestMode.rdCost;
429
+    int offset = (depth * X265_REFINE_INTER_LEVELS) + classify - 1;
430
+    ctu.m_collectCURd[offset] += cuCost;
431
+    ctu.m_collectCUVariance[offset] += trainData.cuVariance;
432
+    ctu.m_collectCUCount[offset]++;
433
 }
434
 
435
 /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
436
@@ -2900,7 +3012,7 @@
437
         }
438
     }
439
 
440
-    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_multipassAnalysis)
441
+    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
442
     {
443
         uint32_t numPU = interMode.cu.getNumPartInter(0);
444
         for (uint32_t part = 0; part < numPU; part++)
445
@@ -2908,9 +3020,10 @@
446
             MotionData* bestME = interMode.bestME[part];
447
             for (int32_t i = 0; i < numPredDir; i++)
448
             {
449
-                bestME[i].ref = m_multipassRef[i][cuGeom.absPartIdx];
450
-                bestME[i].mv = m_multipassMv[i][cuGeom.absPartIdx];
451
-                bestME[i].mvpIdx = m_multipassMvpIdx[i][cuGeom.absPartIdx];
452
+                int* ref = &m_reuseRef[i * m_frame->m_analysisData.numPartitions * m_frame->m_analysisData.numCUsInFrame];
453
+                bestME[i].ref = ref[cuGeom.absPartIdx];
454
+                bestME[i].mv = m_reuseMv[i][cuGeom.absPartIdx].word;
455
+                bestME[i].mvpIdx = m_reuseMvpIdx[i][cuGeom.absPartIdx];
456
             }
457
         }
458
     }
459
@@ -2964,7 +3077,7 @@
460
         }
461
     }
462
 
463
-    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_multipassAnalysis)
464
+    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU)
465
     {
466
         uint32_t numPU = interMode.cu.getNumPartInter(0);
467
         for (uint32_t part = 0; part < numPU; part++)
468
@@ -2972,9 +3085,10 @@
469
             MotionData* bestME = interMode.bestME[part];
470
             for (int32_t i = 0; i < numPredDir; i++)
471
             {
472
-                bestME[i].ref = m_multipassRef[i][cuGeom.absPartIdx];
473
-                bestME[i].mv = m_multipassMv[i][cuGeom.absPartIdx];
474
-                bestME[i].mvpIdx = m_multipassMvpIdx[i][cuGeom.absPartIdx];
475
+                int* ref = &m_reuseRef[i * m_frame->m_analysisData.numPartitions * m_frame->m_analysisData.numCUsInFrame];
476
+                bestME[i].ref = ref[cuGeom.absPartIdx];
477
+                bestME[i].mv = m_reuseMv[i][cuGeom.absPartIdx].word;
478
+                bestME[i].mvpIdx = m_reuseMvpIdx[i][cuGeom.absPartIdx];
479
             }
480
         }
481
     }
482
@@ -3092,11 +3206,9 @@
483
             pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
484
             pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
485
             intptr_t refStride = m_slice->m_mref[0][0].lumaStride;
486
-
487
-            primitives.pu[partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
488
+            primitives.pu[partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
489
             zsa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
490
         }
491
-
492
         uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
493
         uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);
494
         uint32_t zcost = zsa8d + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);
495
@@ -3221,8 +3333,12 @@
496
          * resiYuv. Generate the recon pixels by adding it to the prediction */
497
 
498
         if (cu.m_cbf[0][0])
499
-            primitives.cu[sizeIdx].add_ps(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
500
-                                          predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
501
+        {
502
+            bool reconPicAlign = (reconPic.m_cuOffsetY[cu.m_cuAddr] + reconPic.m_buOffsetY[absPartIdx]) % 64 == 0;
503
+            bool predYalign = predYuv.getAddrOffset(absPartIdx, predYuv.m_size) % 64 == 0;
504
+            primitives.cu[sizeIdx].add_ps[reconPicAlign && predYalign && (reconPic.m_stride % 64 == 0) && (predYuv.m_size % 64 == 0) &&
505
+                (resiYuv.m_size % 64 == 0)](reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride, predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
506
+        }
507
         else
508
             primitives.cu[sizeIdx].copy_pp(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
509
                                            predY, predYuv.m_size);
510
@@ -3230,16 +3346,24 @@
511
         {
512
              pixel* predU = predYuv.getCbAddr(absPartIdx);
513
              pixel* predV = predYuv.getCrAddr(absPartIdx);
514
-            if (cu.m_cbf[1][0])
515
-                primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
516
-                                                        predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
517
+             if (cu.m_cbf[1][0])
518
+             {
519
+                 bool reconPicAlign = (reconPic.m_cuOffsetC[cu.m_cuAddr] + reconPic.m_buOffsetC[absPartIdx]) % 64 == 0;
520
+                 bool predUalign = predYuv.getChromaAddrOffset(absPartIdx) % 64 == 0;
521
+                 primitives.chroma[m_csp].cu[sizeIdx].add_ps[reconPicAlign && predUalign && (reconPic.m_strideC % 64 == 0) && (predYuv.m_csize % 64 == 0) &&
522
+                     (resiYuv.m_csize % 64 == 0)](reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
523
+             }
524
             else
525
                 primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
526
                                                          predU, predYuv.m_csize);
527
 
528
             if (cu.m_cbf[2][0])
529
-                primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
530
-                                                        predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
531
+            {
532
+                bool reconPicAlign = (reconPic.m_cuOffsetC[cu.m_cuAddr] + reconPic.m_buOffsetC[absPartIdx]) % 64 == 0;
533
+                bool predValign = predYuv.getChromaAddrOffset(absPartIdx) % 64 == 0;
534
+                primitives.chroma[m_csp].cu[sizeIdx].add_ps[reconPicAlign && predValign && (reconPic.m_strideC % 64 == 0) && (predYuv.m_csize % 64 == 0) &&
535
+                    (resiYuv.m_csize % 64 == 0)](reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC, predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
536
+            }
537
             else
538
                 primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
539
                                                          predV, predYuv.m_csize);
540
@@ -3404,6 +3528,33 @@
541
     return false;
542
 }
543
 
544
+uint32_t Analysis::calculateCUVariance(const CUData& ctu, const CUGeom& cuGeom)
545
+{
546
+    uint32_t cuVariance = 0;
547
+    uint32_t *blockVariance = m_frame->m_lowres.blockVariance;
548
+    int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
549
+
550
+    uint32_t width = m_frame->m_fencPic->m_picWidth;
551
+    uint32_t height = m_frame->m_fencPic->m_picHeight;
552
+    uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
553
+    uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
554
+    uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
555
+    uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
556
+    uint32_t cnt = 0; 
557
+
558
+    for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
559
+    {
560
+        for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
561
+        {
562
+            uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
563
+            cuVariance += blockVariance[idx];
564
+            cnt++;
565
+        }
566
+    }
567
+    
568
+    return cuVariance / cnt;
569
+}
570
+
571
 int Analysis::calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, int32_t complexCheck, double baseQp)
572
 {
573
     FrameData& curEncData = *m_frame->m_encData;
574
@@ -3411,24 +3562,18 @@
575
 
576
     if (m_param->analysisMultiPassDistortion && m_param->rc.bStatRead)
577
     {
578
-        m_multipassAnalysis = (analysis2PassFrameData*)m_frame->m_analysis2Pass.analysisFramedata;
579
-        if ((m_multipassAnalysis->threshold[ctu.m_cuAddr] < 0.9 || m_multipassAnalysis->threshold[ctu.m_cuAddr] > 1.1)
580
-            && m_multipassAnalysis->highDistortionCtuCount && m_multipassAnalysis->lowDistortionCtuCount)
581
-            qp += m_multipassAnalysis->offset[ctu.m_cuAddr];
582
+        x265_analysis_distortion_data* distortionData = m_frame->m_analysisData.distortionData;
583
+        if ((distortionData->threshold[ctu.m_cuAddr] < 0.9 || distortionData->threshold[ctu.m_cuAddr] > 1.1)
584
+            && distortionData->highDistortionCtuCount && distortionData->lowDistortionCtuCount)
585
+            qp += distortionData->offset[ctu.m_cuAddr];
586
     }
587
 
588
-    int loopIncr;
589
-    if (m_param->rc.qgSize == 8)
590
-        loopIncr = 8;
591
-    else
592
-        loopIncr = 16;
593
+    int loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
594
+
595
     /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
596
     bool isReferenced = IS_REFERENCED(m_frame);
597
-    double *qpoffs;
598
-    if (complexCheck)
599
-        qpoffs = m_frame->m_lowres.qpAqOffset;
600
-    else
601
-        qpoffs = (isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
602
+    double *qpoffs = (isReferenced && m_param->rc.cuTree && !complexCheck) ? m_frame->m_lowres.qpCuTreeOffset :
603
+                                                                             m_frame->m_lowres.qpAqOffset;
604
     if (qpoffs)
605
     {
606
         uint32_t width = m_frame->m_fencPic->m_picWidth;
607
@@ -3439,13 +3584,11 @@
608
         uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth;
609
         double qp_offset = 0;
610
         uint32_t cnt = 0;
611
-        uint32_t idx;
612
-
613
         for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr)
614
         {
615
             for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr)
616
             {
617
-                idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
618
+                uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr);
619
                 qp_offset += qpoffs[idx];
620
                 cnt++;
621
             }
622
@@ -3458,10 +3601,7 @@
623
             int32_t offset = (int32_t)(qp_offset * 100 + .5);
624
             double threshold = (1 - ((x265_ADAPT_RD_STRENGTH - m_param->dynamicRd) * 0.5));
625
             int32_t max_threshold = (int32_t)(threshold * 100 + .5);
626
-            if (offset < max_threshold)
627
-                return 1;
628
-            else
629
-                return 0;
630
+            return (offset < max_threshold);
631
         }
632
     }
633
 
634
x265_2.7.tar.gz/source/encoder/analysis.h -> x265_2.9.tar.gz/source/encoder/analysis.h Changed
69
 
1
@@ -123,27 +123,42 @@
2
 
3
 protected:
4
     /* Analysis data for save/load mode, writes/reads data based on absPartIdx */
5
-    analysis_inter_data* m_reuseInterDataCTU;
6
-    int32_t*             m_reuseRef;
7
-    uint8_t*             m_reuseDepth;
8
-    uint8_t*             m_reuseModes;
9
-    uint8_t*             m_reusePartSize;
10
-    uint8_t*             m_reuseMergeFlag;
11
+    x265_analysis_inter_data*  m_reuseInterDataCTU;
12
+    int32_t*                   m_reuseRef;
13
+    uint8_t*                   m_reuseDepth;
14
+    uint8_t*                   m_reuseModes;
15
+    uint8_t*                   m_reusePartSize;
16
+    uint8_t*                   m_reuseMergeFlag;
17
+    x265_analysis_MV*          m_reuseMv[2];
18
+    uint8_t*             m_reuseMvpIdx[2];
19
 
20
     uint32_t             m_splitRefIdx[4];
21
     uint64_t*            cacheCost;
22
 
23
-
24
-    analysis2PassFrameData* m_multipassAnalysis;
25
-    uint8_t*                m_multipassDepth;
26
-    MV*                     m_multipassMv[2];
27
-    int*                    m_multipassMvpIdx[2];
28
-    int32_t*                m_multipassRef[2];
29
-    uint8_t*                m_multipassModes;
30
-
31
     uint8_t                 m_evaluateInter;
32
+    int32_t                 m_refineLevel;
33
+
34
     uint8_t*                m_additionalCtuInfo;
35
     int*                    m_prevCtuInfoChange;
36
+
37
+    struct TrainingData
38
+    {
39
+        uint32_t cuVariance;
40
+        uint8_t predMode;
41
+        uint8_t partSize;
42
+        uint8_t mergeFlag;
43
+        int split;
44
+
45
+        void init(const CUData& parentCTU, const CUGeom& cuGeom)
46
+        {
47
+            cuVariance = 0;
48
+            predMode = parentCTU.m_predMode[cuGeom.absPartIdx];
49
+            partSize = parentCTU.m_partSize[cuGeom.absPartIdx];
50
+            mergeFlag = parentCTU.m_mergeFlag[cuGeom.absPartIdx];
51
+            split = 0;
52
+        }
53
+    };
54
+
55
     /* refine RD based on QP for rd-levels 5 and 6 */
56
     void qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp);
57
 
58
@@ -182,6 +197,10 @@
59
     void encodeResidue(const CUData& parentCTU, const CUGeom& cuGeom);
60
 
61
     int calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, int32_t complexCheck = 0, double baseQP = -1);
62
+    uint32_t calculateCUVariance(const CUData& ctu, const CUGeom& cuGeom);
63
+
64
+    void classifyCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData);
65
+    void trainCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData);
66
 
67
     void calculateNormFactor(CUData& ctu, int qp);
68
     void normFactor(const pixel* src, uint32_t blockSize, CUData& ctu, int qp, TextType ttype);
69
x265_2.7.tar.gz/source/encoder/api.cpp -> x265_2.9.tar.gz/source/encoder/api.cpp Changed
674
 
1
@@ -31,6 +31,10 @@
2
 #include "nal.h"
3
 #include "bitcost.h"
4
 
5
+#if ENABLE_LIBVMAF
6
+#include "libvmaf.h"
7
+#endif
8
+
9
 /* multilib namespace reflectors */
10
 #if LINKED_8BIT
11
 namespace x265_8bit {
12
@@ -274,10 +278,10 @@
13
         pic_in->analysisData.wt = NULL;
14
         pic_in->analysisData.intraData = NULL;
15
         pic_in->analysisData.interData = NULL;
16
-        pic_in->analysis2Pass.analysisFramedata = NULL;
17
+        pic_in->analysisData.distortionData = NULL;
18
     }
19
 
20
-    if (pp_nal && numEncoded > 0)
21
+    if (pp_nal && numEncoded > 0 && encoder->m_outputCount >= encoder->m_latestParam->chunkStart)
22
     {
23
         *pp_nal = &encoder->m_nalList.m_nal[0];
24
         if (pi_nal) *pi_nal = encoder->m_nalList.m_numNal;
25
@@ -285,7 +289,7 @@
26
     else if (pi_nal)
27
         *pi_nal = 0;
28
 
29
-    if (numEncoded && encoder->m_param->csvLogLevel)
30
+    if (numEncoded && encoder->m_param->csvLogLevel && encoder->m_outputCount >= encoder->m_latestParam->chunkStart)
31
         x265_csvlog_frame(encoder->m_param, pic_out);
32
 
33
     if (numEncoded < 0)
34
@@ -302,13 +306,34 @@
35
         encoder->fetchStats(outputStats, statsSizeBytes);
36
     }
37
 }
38
+#if ENABLE_LIBVMAF
39
+void x265_vmaf_encoder_log(x265_encoder* enc, int argc, char **argv, x265_param *param, x265_vmaf_data *vmafdata)
40
+{
41
+    if (enc)
42
+    {
43
+        Encoder *encoder = static_cast<Encoder*>(enc);
44
+        x265_stats stats;       
45
+        stats.aggregateVmafScore = x265_calculate_vmafscore(param, vmafdata);
46
+        if(vmafdata->reference_file)
47
+            fclose(vmafdata->reference_file);
48
+        if(vmafdata->distorted_file)
49
+            fclose(vmafdata->distorted_file);
50
+        if(vmafdata)
51
+            x265_free(vmafdata);
52
+        encoder->fetchStats(&stats, sizeof(stats));
53
+        int padx = encoder->m_sps.conformanceWindow.rightOffset;
54
+        int pady = encoder->m_sps.conformanceWindow.bottomOffset;
55
+        x265_csvlog_encode(encoder->m_param, &stats, padx, pady, argc, argv);
56
+    }
57
+}
58
+#endif
59
 
60
 void x265_encoder_log(x265_encoder* enc, int argc, char **argv)
61
 {
62
     if (enc)
63
     {
64
         Encoder *encoder = static_cast<Encoder*>(enc);
65
-        x265_stats stats;
66
+        x265_stats stats;       
67
         encoder->fetchStats(&stats, sizeof(stats));
68
         int padx = encoder->m_sps.conformanceWindow.rightOffset;
69
         int pady = encoder->m_sps.conformanceWindow.bottomOffset;
70
@@ -378,6 +403,181 @@
71
     return -1;
72
 }
73
 
74
+void x265_alloc_analysis_data(x265_param *param, x265_analysis_data* analysis)
75
+{
76
+    x265_analysis_inter_data *interData = analysis->interData = NULL;
77
+    x265_analysis_intra_data *intraData = analysis->intraData = NULL;
78
+    x265_analysis_distortion_data *distortionData = analysis->distortionData = NULL;
79
+    bool isVbv = param->rc.vbvMaxBitrate > 0 && param->rc.vbvBufferSize > 0;
80
+    int numDir = 2; //irrespective of P or B slices set direction as 2
81
+    uint32_t numPlanes = param->internalCsp == X265_CSP_I400 ? 1 : 3;
82
+
83
+#if X265_DEPTH < 10 && (LINKED_10BIT || LINKED_12BIT)
84
+    uint32_t numCUs_sse_t = param->internalBitDepth > 8 ? analysis->numCUsInFrame << 1 : analysis->numCUsInFrame;
85
+#elif X265_DEPTH >= 10 && LINKED_8BIT
86
+    uint32_t numCUs_sse_t = param->internalBitDepth > 8 ? analysis->numCUsInFrame : (analysis->numCUsInFrame + 1U) >> 1;
87
+#else
88
+    uint32_t numCUs_sse_t = analysis->numCUsInFrame;
89
+#endif
90
+
91
+    //Allocate memory for distortionData pointer
92
+    CHECKED_MALLOC_ZERO(distortionData, x265_analysis_distortion_data, 1);
93
+    CHECKED_MALLOC_ZERO(distortionData->distortion, sse_t, analysis->numPartitions * numCUs_sse_t);
94
+    if (param->rc.bStatRead)
95
+    {
96
+        CHECKED_MALLOC_ZERO(distortionData->ctuDistortion, sse_t, numCUs_sse_t);
97
+        CHECKED_MALLOC_ZERO(distortionData->scaledDistortion, double, analysis->numCUsInFrame);
98
+        CHECKED_MALLOC_ZERO(distortionData->offset, double, analysis->numCUsInFrame);
99
+        CHECKED_MALLOC_ZERO(distortionData->threshold, double, analysis->numCUsInFrame);
100
+    }
101
+    analysis->distortionData = distortionData;
102
+
103
+    if (param->bDisableLookahead && isVbv)
104
+    {
105
+        CHECKED_MALLOC_ZERO(analysis->lookahead.intraSatdForVbv, uint32_t, analysis->numCuInHeight);
106
+        CHECKED_MALLOC_ZERO(analysis->lookahead.satdForVbv, uint32_t, analysis->numCuInHeight);
107
+        CHECKED_MALLOC_ZERO(analysis->lookahead.intraVbvCost, uint32_t, analysis->numCUsInFrame);
108
+        CHECKED_MALLOC_ZERO(analysis->lookahead.vbvCost, uint32_t, analysis->numCUsInFrame);
109
+    }
110
+
111
+    //Allocate memory for weightParam pointer
112
+    if (!(param->bMVType == AVC_INFO))
113
+        CHECKED_MALLOC_ZERO(analysis->wt, x265_weight_param, numPlanes * numDir);
114
+
115
+    if (param->analysisReuseLevel < 2)
116
+        return;
117
+
118
+    //Allocate memory for intraData pointer
119
+    CHECKED_MALLOC_ZERO(intraData, x265_analysis_intra_data, 1);
120
+    CHECKED_MALLOC(intraData->depth, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
121
+    CHECKED_MALLOC(intraData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
122
+    CHECKED_MALLOC(intraData->partSizes, char, analysis->numPartitions * analysis->numCUsInFrame);
123
+    CHECKED_MALLOC(intraData->chromaModes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
124
+    analysis->intraData = intraData;
125
+
126
+    //Allocate memory for interData pointer based on ReuseLevels
127
+    CHECKED_MALLOC_ZERO(interData, x265_analysis_inter_data, 1);
128
+    CHECKED_MALLOC(interData->depth, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
129
+    CHECKED_MALLOC(interData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
130
+
131
+    CHECKED_MALLOC_ZERO(interData->mvpIdx[0], uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
132
+    CHECKED_MALLOC_ZERO(interData->mvpIdx[1], uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
133
+    CHECKED_MALLOC_ZERO(interData->mv[0], x265_analysis_MV, analysis->numPartitions * analysis->numCUsInFrame);
134
+    CHECKED_MALLOC_ZERO(interData->mv[1], x265_analysis_MV, analysis->numPartitions * analysis->numCUsInFrame);
135
+
136
+    if (param->analysisReuseLevel > 4)
137
+    {
138
+        CHECKED_MALLOC(interData->partSize, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
139
+        CHECKED_MALLOC_ZERO(interData->mergeFlag, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
140
+    }
141
+    if (param->analysisReuseLevel >= 7)
142
+    {
143
+        CHECKED_MALLOC(interData->interDir, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
144
+        CHECKED_MALLOC(interData->sadCost, int64_t, analysis->numPartitions * analysis->numCUsInFrame);
145
+        for (int dir = 0; dir < numDir; dir++)
146
+        {
147
+            CHECKED_MALLOC(interData->refIdx[dir], int8_t, analysis->numPartitions * analysis->numCUsInFrame);
148
+            CHECKED_MALLOC_ZERO(analysis->modeFlag[dir], uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
149
+        }
150
+    }
151
+    else
152
+    {
153
+        if (param->analysisMultiPassRefine || param->analysisMultiPassDistortion){
154
+            CHECKED_MALLOC_ZERO(interData->ref, int32_t, 2 * analysis->numPartitions * analysis->numCUsInFrame);
155
+        }
156
+        else
157
+            CHECKED_MALLOC_ZERO(interData->ref, int32_t, analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir);
158
+    }
159
+    analysis->interData = interData;
160
+
161
+    return;
162
+
163
+fail:
164
+    x265_free_analysis_data(param, analysis);
165
+}
166
+
167
+void x265_free_analysis_data(x265_param *param, x265_analysis_data* analysis)
168
+{
169
+    bool isVbv = param->rc.vbvMaxBitrate > 0 && param->rc.vbvBufferSize > 0;
170
+
171
+    //Free memory for Lookahead pointers
172
+    if (param->bDisableLookahead && isVbv)
173
+    {
174
+        X265_FREE(analysis->lookahead.satdForVbv);
175
+        X265_FREE(analysis->lookahead.intraSatdForVbv);
176
+        X265_FREE(analysis->lookahead.vbvCost);
177
+        X265_FREE(analysis->lookahead.intraVbvCost);
178
+    }
179
+
180
+    //Free memory for distortionData pointers
181
+    if (analysis->distortionData)
182
+    {
183
+        X265_FREE((analysis->distortionData)->distortion);
184
+        if (param->rc.bStatRead)
185
+        {
186
+            X265_FREE((analysis->distortionData)->ctuDistortion);
187
+            X265_FREE((analysis->distortionData)->scaledDistortion);
188
+            X265_FREE((analysis->distortionData)->offset);
189
+            X265_FREE((analysis->distortionData)->threshold);
190
+        }
191
+        X265_FREE(analysis->distortionData);
192
+    }
193
+
194
+    /* Early exit freeing weights alone if level is 1 (when there is no analysis inter/intra) */
195
+    if (analysis->wt && !(param->bMVType == AVC_INFO))
196
+        X265_FREE(analysis->wt);
197
+
198
+    if (param->analysisReuseLevel < 2)
199
+        return;
200
+
201
+    //Free memory for intraData pointers
202
+    if (analysis->intraData)
203
+    {
204
+        X265_FREE((analysis->intraData)->depth);
205
+        X265_FREE((analysis->intraData)->modes);
206
+        X265_FREE((analysis->intraData)->partSizes);
207
+        X265_FREE((analysis->intraData)->chromaModes);
208
+        X265_FREE(analysis->intraData);
209
+        analysis->intraData = NULL;
210
+    }
211
+
212
+    //Free interData pointers
213
+    if (analysis->interData)
214
+    {
215
+        X265_FREE((analysis->interData)->depth);
216
+        X265_FREE((analysis->interData)->modes);
217
+        X265_FREE((analysis->interData)->mvpIdx[0]);
218
+        X265_FREE((analysis->interData)->mvpIdx[1]);
219
+        X265_FREE((analysis->interData)->mv[0]);
220
+        X265_FREE((analysis->interData)->mv[1]);
221
+
222
+        if (param->analysisReuseLevel > 4)
223
+        {
224
+            X265_FREE((analysis->interData)->mergeFlag);
225
+            X265_FREE((analysis->interData)->partSize);
226
+        }
227
+        if (param->analysisReuseLevel >= 7)
228
+        {
229
+            int numDir = 2;
230
+            X265_FREE((analysis->interData)->interDir);
231
+            X265_FREE((analysis->interData)->sadCost);
232
+            for (int dir = 0; dir < numDir; dir++)
233
+            {
234
+                X265_FREE((analysis->interData)->refIdx[dir]);
235
+                if (analysis->modeFlag[dir] != NULL)
236
+                {
237
+                    X265_FREE(analysis->modeFlag[dir]);
238
+                    analysis->modeFlag[dir] = NULL;
239
+                }
240
+            }
241
+        }
242
+        else
243
+            X265_FREE((analysis->interData)->ref);
244
+        X265_FREE(analysis->interData);
245
+        analysis->interData = NULL;
246
+    }
247
+}
248
+
249
 void x265_cleanup(void)
250
 {
251
     BitCost::destroy();
252
@@ -457,7 +657,13 @@
253
     &x265_csvlog_frame,
254
     &x265_csvlog_encode,
255
     &x265_dither_image,
256
-    &x265_set_analysis_data
257
+    &x265_set_analysis_data,
258
+#if ENABLE_LIBVMAF
259
+    &x265_calculate_vmafscore,
260
+    &x265_calculate_vmaf_framelevelscore,
261
+    &x265_vmaf_encoder_log
262
+#endif
263
+
264
 };
265
 
266
 typedef const x265_api* (*api_get_func)(int bitDepth);
267
@@ -675,7 +881,7 @@
268
                 if (param->rc.rateControlMode == X265_RC_CRF)
269
                     fprintf(csvfp, "RateFactor, ");
270
                 if (param->rc.vbvBufferSize)
271
-                    fprintf(csvfp, "BufferFill, ");
272
+                    fprintf(csvfp, "BufferFill, BufferFillFinal, ");
273
                 if (param->bEnablePsnr)
274
                     fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, ");
275
                 if (param->bEnableSsim)
276
@@ -751,6 +957,9 @@
277
                     /* detailed performance statistics */
278
                     fprintf(csvfp, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms),"
279
                         "Stall Time (ms), Total frame time (ms), Avg WPP, Row Blocks");
280
+#if ENABLE_LIBVMAF
281
+                    fprintf(csvfp, ", VMAF Frame Score");
282
+#endif
283
                 }
284
                 fprintf(csvfp, "\n");
285
             }
286
@@ -759,6 +968,9 @@
287
                 fputs(summaryCSVHeader, csvfp);
288
                 if (param->csvLogLevel >= 2 || param->maxCLL || param->maxFALL)
289
                     fputs("MaxCLL, MaxFALL,", csvfp);
290
+#if ENABLE_LIBVMAF
291
+                fputs(" Aggregate VMAF Score,", csvfp);
292
+#endif
293
                 fputs(" Version\n", csvfp);
294
             }
295
         }
296
@@ -780,7 +992,7 @@
297
     if (param->rc.rateControlMode == X265_RC_CRF)
298
         fprintf(param->csvfpt, "%.3lf,", frameStats->rateFactor);
299
     if (param->rc.vbvBufferSize)
300
-        fprintf(param->csvfpt, "%.3lf,", frameStats->bufferFill);
301
+        fprintf(param->csvfpt, "%.3lf, %.3lf,", frameStats->bufferFill, frameStats->bufferFillFinal);
302
     if (param->bEnablePsnr)
303
         fprintf(param->csvfpt, "%.3lf, %.3lf, %.3lf, %.3lf,", frameStats->psnrY, frameStats->psnrU, frameStats->psnrV, frameStats->psnr);
304
     if (param->bEnableSsim)
305
@@ -868,6 +1080,9 @@
306
                                                                                      frameStats->totalFrameTime);
307
 
308
         fprintf(param->csvfpt, " %.3lf, %d", frameStats->avgWPP, frameStats->countRowBlocks);
309
+#if ENABLE_LIBVMAF
310
+        fprintf(param->csvfpt, ", %lf", frameStats->vmafFrameScore);
311
+#endif
312
     }
313
     fprintf(param->csvfpt, "\n");
314
     fflush(stderr);
315
@@ -886,7 +1101,11 @@
316
             fputs(summaryCSVHeader, p->csvfpt);
317
             if (p->csvLogLevel >= 2 || p->maxCLL || p->maxFALL)
318
                 fputs("MaxCLL, MaxFALL,", p->csvfpt);
319
+#if ENABLE_LIBVMAF
320
+            fputs(" Aggregate VMAF score,", p->csvfpt);
321
+#endif
322
             fputs(" Version\n",p->csvfpt);
323
+
324
         }
325
         // CLI arguments or other
326
         if (argc)
327
@@ -907,6 +1126,7 @@
328
                 fputc('"', p->csvfpt);
329
                 fputs(opts, p->csvfpt);
330
                 fputc('"', p->csvfpt);
331
+                X265_FREE(opts);
332
             }
333
         }
334
 
335
@@ -918,7 +1138,6 @@
336
         char buffer[200];
337
         strftime(buffer, 128, "%c", timeinfo);
338
         fprintf(p->csvfpt, ", %s, ", buffer);
339
-
340
         // elapsed time, fps, bitrate
341
         fprintf(p->csvfpt, "%.2f, %.2f, %.2f,",
342
             stats->elapsedEncodeTime, stats->encodedPictureCount / stats->elapsedEncodeTime, stats->bitrate);
343
@@ -980,7 +1199,11 @@
344
             fprintf(p->csvfpt, " -, -, -, -, -, -, -,");
345
         if (p->csvLogLevel >= 2 || p->maxCLL || p->maxFALL)
346
             fprintf(p->csvfpt, " %-6u, %-6u,", stats->maxCLL, stats->maxFALL);
347
+#if ENABLE_LIBVMAF
348
+        fprintf(p->csvfpt, " %lf,", stats->aggregateVmafScore);
349
+#endif
350
         fprintf(p->csvfpt, " %s\n", api->version_str);
351
+
352
     }
353
 }
354
 
355
@@ -1071,4 +1294,318 @@
356
     }
357
 }
358
 
359
+#if ENABLE_LIBVMAF
360
+/* Read y values of single frame for 8-bit input */
361
+int read_image_byte(FILE *file, float *buf, int width, int height, int stride)
362
+{
363
+    char *byte_ptr = (char *)buf;
364
+    unsigned char *tmp_buf = 0;
365
+    int i, j;
366
+    int ret = 1;
367
+
368
+    if (width <= 0 || height <= 0)
369
+    {
370
+        goto fail_or_end;
371
+    }
372
+
373
+    if (!(tmp_buf = (unsigned char*)malloc(width)))
374
+    {
375
+        goto fail_or_end;
376
+    }
377
+
378
+    for (i = 0; i < height; ++i)
379
+    {
380
+        float *row_ptr = (float *)byte_ptr;
381
+
382
+        if (fread(tmp_buf, 1, width, file) != (size_t)width)
383
+        {
384
+            goto fail_or_end;
385
+        }
386
+
387
+        for (j = 0; j < width; ++j)
388
+        {
389
+            row_ptr[j] = tmp_buf[j];
390
+        }
391
+
392
+        byte_ptr += stride;
393
+    }
394
+
395
+    ret = 0;
396
+
397
+fail_or_end:
398
+    free(tmp_buf);
399
+    return ret;
400
+}
401
+/* Read y values of single frame for 10-bit input */
402
+int read_image_word(FILE *file, float *buf, int width, int height, int stride)
403
+{
404
+    char *byte_ptr = (char *)buf;
405
+    unsigned short *tmp_buf = 0;
406
+    int i, j;
407
+    int ret = 1;
408
+
409
+    if (width <= 0 || height <= 0)
410
+    {
411
+        goto fail_or_end;
412
+    }
413
+
414
+    if (!(tmp_buf = (unsigned short*)malloc(width * 2))) // '*2' to accommodate words
415
+    {
416
+        goto fail_or_end;
417
+    }
418
+
419
+    for (i = 0; i < height; ++i)
420
+    {
421
+        float *row_ptr = (float *)byte_ptr;
422
+
423
+        if (fread(tmp_buf, 2, width, file) != (size_t)width) // '2' for word
424
+        {
425
+            goto fail_or_end;
426
+        }
427
+
428
+        for (j = 0; j < width; ++j)
429
+        {
430
+            row_ptr[j] = tmp_buf[j] / 4.0; // '/4' to convert from 10 to 8-bit
431
+        }
432
+
433
+        byte_ptr += stride;
434
+    }
435
+
436
+    ret = 0;
437
+
438
+fail_or_end:
439
+    free(tmp_buf);
440
+    return ret;
441
+}
442
+
443
+int read_frame(float *reference_data, float *distorted_data, float *temp_data, int stride_byte, void *s)
444
+{
445
+    x265_vmaf_data *user_data = (x265_vmaf_data *)s;
446
+    int ret;
447
+
448
+    // read reference y
449
+    if (user_data->internalBitDepth == 8)
450
+    {
451
+        ret = read_image_byte(user_data->reference_file, reference_data, user_data->width, user_data->height, stride_byte);
452
+    }
453
+    else if (user_data->internalBitDepth == 10)
454
+    {
455
+        ret = read_image_word(user_data->reference_file, reference_data, user_data->width, user_data->height, stride_byte);
456
+    }
457
+    else
458
+    {
459
+        x265_log(NULL, X265_LOG_ERROR, "Invalid bitdepth\n");
460
+        return 1;
461
+    }
462
+    if (ret)
463
+    {
464
+        if (feof(user_data->reference_file))
465
+        {
466
+            ret = 2; // OK if end of file
467
+        }
468
+        return ret;
469
+    }
470
+
471
+    // read distorted y
472
+    if (user_data->internalBitDepth == 8)
473
+    {
474
+        ret = read_image_byte(user_data->distorted_file, distorted_data, user_data->width, user_data->height, stride_byte);
475
+    }
476
+    else if (user_data->internalBitDepth == 10)
477
+    {
478
+        ret = read_image_word(user_data->distorted_file, distorted_data, user_data->width, user_data->height, stride_byte);
479
+    }
480
+    else
481
+    {
482
+        x265_log(NULL, X265_LOG_ERROR, "Invalid bitdepth\n");
483
+        return 1;
484
+    }
485
+    if (ret)
486
+    {
487
+        if (feof(user_data->distorted_file))
488
+        {
489
+            ret = 2; // OK if end of file
490
+        }
491
+        return ret;
492
+    }
493
+
494
+    // reference skip u and v
495
+    if (user_data->internalBitDepth == 8)
496
+    {
497
+        if (fread(temp_data, 1, user_data->offset, user_data->reference_file) != (size_t)user_data->offset)
498
+        {
499
+            x265_log(NULL, X265_LOG_ERROR, "reference fread to skip u and v failed.\n");
500
+            goto fail_or_end;
501
+        }
502
+    }
503
+    else if (user_data->internalBitDepth == 10)
504
+    {
505
+        if (fread(temp_data, 2, user_data->offset, user_data->reference_file) != (size_t)user_data->offset)
506
+        {
507
+            x265_log(NULL, X265_LOG_ERROR, "reference fread to skip u and v failed.\n");
508
+            goto fail_or_end;
509
+        }
510
+    }
511
+    else
512
+    {
513
+        x265_log(NULL, X265_LOG_ERROR, "Invalid format\n");
514
+        goto fail_or_end;
515
+    }
516
+
517
+    // distorted skip u and v
518
+    if (user_data->internalBitDepth == 8)
519
+    {
520
+        if (fread(temp_data, 1, user_data->offset, user_data->distorted_file) != (size_t)user_data->offset)
521
+        {
522
+            x265_log(NULL, X265_LOG_ERROR, "distorted fread to skip u and v failed.\n");
523
+            goto fail_or_end;
524
+        }
525
+    }
526
+    else if (user_data->internalBitDepth == 10)
527
+    {
528
+        if (fread(temp_data, 2, user_data->offset, user_data->distorted_file) != (size_t)user_data->offset)
529
+        {
530
+            x265_log(NULL, X265_LOG_ERROR, "distorted fread to skip u and v failed.\n");
531
+            goto fail_or_end;
532
+        }
533
+    }
534
+    else
535
+    {
536
+        x265_log(NULL, X265_LOG_ERROR, "Invalid format\n");
537
+        goto fail_or_end;
538
+    }
539
+
540
+
541
+fail_or_end:
542
+    return ret;
543
+}
544
+
545
+double x265_calculate_vmafscore(x265_param *param, x265_vmaf_data *data)
546
+{
547
+    double score;
548
+    
549
+    data->width = param->sourceWidth;
550
+    data->height = param->sourceHeight;
551
+    data->internalBitDepth = param->internalBitDepth;
552
+   
553
+    if (param->internalCsp == X265_CSP_I420)
554
+    {
555
+        if ((param->sourceWidth * param->sourceHeight) % 2 != 0)
556
+            x265_log(NULL, X265_LOG_ERROR, "Invalid file size\n");
557
+        data->offset = param->sourceWidth * param->sourceHeight / 2;
558
+    }
559
+    else if (param->internalCsp == X265_CSP_I422)
560
+        data->offset = param->sourceWidth * param->sourceHeight;
561
+    else if (param->internalCsp == X265_CSP_I444)
562
+        data->offset = param->sourceWidth * param->sourceHeight * 2;
563
+    else
564
+        x265_log(NULL, X265_LOG_ERROR, "Invalid format\n");
565
+  
566
+    compute_vmaf(&score, vcd->format, data->width, data->height, read_frame, data, vcd->model_path, vcd->log_path, vcd->log_fmt, vcd->disable_clip, vcd->disable_avx, vcd->enable_transform, vcd->phone_model, vcd->psnr, vcd->ssim, vcd->ms_ssim, vcd->pool); 
567
+
568
+    return score;
569
+}
570
+
571
+int read_frame_10bit(float *reference_data, float *distorted_data, float *temp_data, int stride, void *s)
572
+{
573
+    x265_vmaf_framedata *user_data = (x265_vmaf_framedata *)s;
574
+
575
+    PicYuv *reference_frame = (PicYuv *)user_data->reference_frame;
576
+    PicYuv *distorted_frame = (PicYuv *)user_data->distorted_frame;
577
+
578
+    if(!user_data->frame_set) {
579
+ 
580
+        int reference_stride = reference_frame->m_stride;
581
+        int distorted_stride = distorted_frame->m_stride;
582
+
583
+        const uint16_t *reference_ptr = (const uint16_t *)reference_frame->m_picOrg[0]; 
584
+        const uint16_t *distorted_ptr = (const uint16_t *)distorted_frame->m_picOrg[0];
585
+
586
+        temp_data = reference_data;
587
+
588
+        int height = user_data->height;
589
+        int width = user_data->width; 
590
+
591
+        int i,j;
592
+        for (i = 0; i < height; i++) {
593
+            for ( j = 0; j < width; j++) {
594
+                temp_data[j] = ((float)reference_ptr[j] / 4.0);
595
+            }
596
+            reference_ptr += reference_stride;
597
+            temp_data += stride / sizeof(*temp_data);
598
+        }
599
+        
600
+        temp_data = distorted_data;
601
+        for (i = 0; i < height; i++) {
602
+            for (j = 0; j < width; j++) {
603
+                 temp_data[j] = ((float)distorted_ptr[j] / 4.0);
604
+            }
605
+            distorted_ptr += distorted_stride;
606
+            temp_data += stride / sizeof(*temp_data);
607
+        }
608
+
609
+        user_data->frame_set = 1;
610
+        return 0;
611
+    }                                                             
612
+    return 2;                                                               
613
+}
614
+
615
+int read_frame_8bit(float *reference_data, float *distorted_data, float *temp_data, int stride, void *s)
616
+{
617
+    x265_vmaf_framedata *user_data = (x265_vmaf_framedata *)s;
618
+
619
+    PicYuv *reference_frame = (PicYuv *)user_data->reference_frame;
620
+    PicYuv *distorted_frame = (PicYuv *)user_data->distorted_frame;
621
+
622
+    if(!user_data->frame_set) {
623
+
624
+        int reference_stride = reference_frame->m_stride;
625
+        int distorted_stride = distorted_frame->m_stride;
626
+
627
+        const uint8_t *reference_ptr = (const uint8_t *)reference_frame->m_picOrg[0]; 
628
+        const uint8_t *distorted_ptr = (const uint8_t *)distorted_frame->m_picOrg[0];
629
+
630
+        temp_data = reference_data;
631
+
632
+        int height = user_data->height;
633
+        int width = user_data->width; 
634
+
635
+        int i,j;
636
+        for (i = 0; i < height; i++) {
637
+            for ( j = 0; j < width; j++) {
638
+                temp_data[j] = (float)reference_ptr[j];
639
+            }
640
+            reference_ptr += reference_stride;
641
+            temp_data += stride / sizeof(*temp_data);
642
+        }
643
+        
644
+        temp_data = distorted_data;
645
+        for (i = 0; i < height; i++) {
646
+            for (j = 0; j < width; j++) {
647
+                 temp_data[j] = (float)distorted_ptr[j];
648
+            }
649
+            distorted_ptr += distorted_stride;
650
+            temp_data += stride / sizeof(*temp_data);
651
+        }
652
+
653
+        user_data->frame_set = 1;
654
+        return 0;
655
+    }                                                             
656
+    return 2;                                                               
657
+}
658
+
659
+double x265_calculate_vmaf_framelevelscore(x265_vmaf_framedata *vmafframedata)
660
+{
661
+    double score; 
662
+    int (*read_frame)(float *reference_data, float *distorted_data, float *temp_data,
663
+                      int stride, void *s);
664
+    if (vmafframedata->internalBitDepth == 8)
665
+        read_frame = read_frame_8bit;
666
+    else
667
+        read_frame = read_frame_10bit;
668
+    compute_vmaf(&score, vcd->format, vmafframedata->width, vmafframedata->height, read_frame, vmafframedata, vcd->model_path, vcd->log_path, vcd->log_fmt, vcd->disable_clip, vcd->disable_avx, vcd->enable_transform, vcd->phone_model, vcd->psnr, vcd->ssim, vcd->ms_ssim, vcd->pool);
669
+ 
670
+    return score;
671
+}
672
+#endif
673
 } /* end namespace or extern "C" */
674
x265_2.7.tar.gz/source/encoder/dpb.cpp -> x265_2.9.tar.gz/source/encoder/dpb.cpp Changed
35
 
1
@@ -131,9 +131,8 @@
2
     int pocCurr = slice->m_poc;
3
     int type = newFrame->m_lowres.sliceType;
4
     bool bIsKeyFrame = newFrame->m_lowres.bKeyframe;
5
-
6
     slice->m_nalUnitType = getNalUnitType(pocCurr, bIsKeyFrame);
7
-    if (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL)
8
+    if (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL || slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_N_LP)
9
         m_lastIDR = pocCurr;
10
     slice->m_lastIDR = m_lastIDR;
11
     slice->m_sliceType = IS_X265_TYPE_B(type) ? B_SLICE : (type == X265_TYPE_P) ? P_SLICE : I_SLICE;
12
@@ -250,7 +249,7 @@
13
 /* Marking reference pictures when an IDR/CRA is encountered. */
14
 void DPB::decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType)
15
 {
16
-    if (nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL)
17
+    if (nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL || nalUnitType == NAL_UNIT_CODED_SLICE_IDR_N_LP)
18
     {
19
         /* If the nal_unit_type is IDR, all pictures in the reference picture
20
          * list are marked as "unused for reference" */
21
@@ -326,11 +325,9 @@
22
 NalUnitType DPB::getNalUnitType(int curPOC, bool bIsKeyFrame)
23
 {
24
     if (!curPOC)
25
-        return NAL_UNIT_CODED_SLICE_IDR_W_RADL;
26
-
27
+        return NAL_UNIT_CODED_SLICE_IDR_N_LP;
28
     if (bIsKeyFrame)
29
-        return m_bOpenGOP ? NAL_UNIT_CODED_SLICE_CRA : NAL_UNIT_CODED_SLICE_IDR_W_RADL;
30
-
31
+        return m_bOpenGOP ? NAL_UNIT_CODED_SLICE_CRA : m_bhasLeadingPicture ? NAL_UNIT_CODED_SLICE_IDR_W_RADL : NAL_UNIT_CODED_SLICE_IDR_N_LP;
32
     if (m_pocCRA && curPOC < m_pocCRA)
33
         // All leading pictures are being marked as TFD pictures here since
34
         // current encoder uses all reference pictures while encoding leading
35
x265_2.7.tar.gz/source/encoder/dpb.h -> x265_2.9.tar.gz/source/encoder/dpb.h Changed
17
 
1
@@ -40,6 +40,7 @@
2
     int                m_lastIDR;
3
     int                m_pocCRA;
4
     int                m_bOpenGOP;
5
+    int                m_bhasLeadingPicture;
6
     bool               m_bRefreshPending;
7
     bool               m_bTemporalSublayer;
8
     PicList            m_picList;
9
@@ -50,6 +51,7 @@
10
     {
11
         m_lastIDR = 0;
12
         m_pocCRA = 0;
13
+        m_bhasLeadingPicture = param->radl;
14
         m_bRefreshPending = false;
15
         m_frameDataFreeList = NULL;
16
         m_bOpenGOP = param->bOpenGOP;
17
x265_2.7.tar.gz/source/encoder/encoder.cpp -> x265_2.9.tar.gz/source/encoder/encoder.cpp Changed
2234
 
1
@@ -79,6 +79,7 @@
2
     m_threadPool = NULL;
3
     m_analysisFileIn = NULL;
4
     m_analysisFileOut = NULL;
5
+    m_naluFile = NULL;
6
     m_offsetEmergency = NULL;
7
     m_iFrameNum = 0;
8
     m_iPPSQpMinus26 = 0;
9
@@ -96,6 +97,8 @@
10
 #endif
11
 
12
     m_prevTonemapPayload.payload = NULL;
13
+    m_startPoint = 0;
14
+    m_saveCTUSize = 0;
15
 }
16
 inline char *strcatFilename(const char *input, const char *suffix)
17
 {
18
@@ -337,10 +340,12 @@
19
 
20
     if (m_param->bEmitHRDSEI)
21
         m_rateControl->initHRD(m_sps);
22
+
23
     if (!m_rateControl->init(m_sps))
24
         m_aborted = true;
25
     if (!m_lookahead->create())
26
         m_aborted = true;
27
+
28
     initRefIdx();
29
     if (m_param->analysisSave && m_param->bUseAnalysisFile)
30
     {
31
@@ -408,10 +413,35 @@
32
 
33
     m_emitCLLSEI = p->maxCLL || p->maxFALL;
34
 
35
+    if (m_param->naluFile)
36
+    {
37
+        m_naluFile = x265_fopen(m_param->naluFile, "r");
38
+        if (!m_naluFile)
39
+        {
40
+            x265_log_file(NULL, X265_LOG_ERROR, "%s file not found or Failed to open\n", m_param->naluFile);
41
+            m_aborted = true;
42
+        }
43
+        else
44
+             m_enableNal = 1;
45
+    }
46
+    else
47
+         m_enableNal = 0;
48
+
49
 #if ENABLE_HDR10_PLUS
50
     if (m_bToneMap)
51
         m_numCimInfo = m_hdr10plus_api->hdr10plus_json_to_movie_cim(m_param->toneMapFile, m_cim);
52
 #endif
53
+    if (m_param->bDynamicRefine)
54
+    {
55
+        /* Allocate memory for 1 GOP and reuse it for the subsequent GOPs */
56
+        int size = (m_param->keyframeMax + m_param->lookaheadDepth) * m_param->maxCUDepth * X265_REFINE_INTER_LEVELS;
57
+        CHECKED_MALLOC_ZERO(m_variance, uint64_t, size);
58
+        CHECKED_MALLOC_ZERO(m_rdCost, uint64_t, size);
59
+        CHECKED_MALLOC_ZERO(m_trainingCount, uint32_t, size);
60
+        return;
61
+    fail:
62
+        m_aborted = true;
63
+    }
64
 }
65
 
66
 void Encoder::stopJobs()
67
@@ -516,8 +546,8 @@
68
         curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
69
         int num16x16inCUWidth = m_param->maxCUSize >> 4;
70
         uint32_t ctuAddr, offset, cuPos;
71
-        analysis_intra_data * intraData = (analysis_intra_data *)curFrame->m_analysisData.intraData;
72
-        analysis_intra_data * srcIntraData = (analysis_intra_data *)analysis_data->intraData;
73
+        x265_analysis_intra_data * intraData = curFrame->m_analysisData.intraData;
74
+        x265_analysis_intra_data * srcIntraData = analysis_data->intraData;
75
         for (int i = 0; i < mbImageHeight; i++)
76
         {
77
             for (int j = 0; j < mbImageWidth; j++)
78
@@ -546,8 +576,8 @@
79
         curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
80
         int num16x16inCUWidth = m_param->maxCUSize >> 4;
81
         uint32_t ctuAddr, offset, cuPos;
82
-        analysis_inter_data * interData = (analysis_inter_data *)curFrame->m_analysisData.interData;
83
-        analysis_inter_data * srcInterData = (analysis_inter_data*)analysis_data->interData;
84
+        x265_analysis_inter_data * interData = curFrame->m_analysisData.interData;
85
+        x265_analysis_inter_data * srcInterData = analysis_data->interData;
86
         for (int i = 0; i < mbImageHeight; i++)
87
         {
88
             for (int j = 0; j < mbImageWidth; j++)
89
@@ -611,7 +641,7 @@
90
         curFrame->m_analysisData = (*analysis_data);
91
         curFrame->m_analysisData.numCUsInFrame = widthInCU * heightInCU;
92
         curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
93
-        allocAnalysis(&curFrame->m_analysisData);
94
+        x265_alloc_analysis_data(m_param, &curFrame->m_analysisData);
95
         if (m_param->maxCUSize == 16)
96
         {
97
             if (analysis_data->sliceType == X265_TYPE_IDR || analysis_data->sliceType == X265_TYPE_I)
98
@@ -622,8 +652,8 @@
99
 
100
                 curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
101
                 size_t count = 0;
102
-                analysis_intra_data * currIntraData = (analysis_intra_data *)curFrame->m_analysisData.intraData;
103
-                analysis_intra_data * intraData = (analysis_intra_data *)analysis_data->intraData;
104
+                x265_analysis_intra_data * currIntraData = curFrame->m_analysisData.intraData;
105
+                x265_analysis_intra_data * intraData = analysis_data->intraData;
106
                 for (uint32_t d = 0; d < cuBytes; d++)
107
                 {
108
                     int bytes = curFrame->m_analysisData.numPartitions >> ((intraData)->depth[d] * 2);
109
@@ -643,14 +673,14 @@
110
 
111
                 curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
112
                 size_t count = 0;
113
-                analysis_inter_data * currInterData = (analysis_inter_data *)curFrame->m_analysisData.interData;
114
-                analysis_inter_data * interData = (analysis_inter_data *)analysis_data->interData;
115
+                x265_analysis_inter_data * currInterData = curFrame->m_analysisData.interData;
116
+                x265_analysis_inter_data * interData = analysis_data->interData;
117
                 for (uint32_t d = 0; d < cuBytes; d++)
118
                 {
119
                     int bytes = curFrame->m_analysisData.numPartitions >> ((interData)->depth[d] * 2);
120
                     memset(&(currInterData)->depth[count], (interData)->depth[d], bytes);
121
                     memset(&(currInterData)->modes[count], (interData)->modes[d], bytes);
122
-                    memcpy(&(currInterData)->sadCost[count], &((analysis_inter_data*)analysis_data->interData)->sadCost[d], bytes);
123
+                    memcpy(&(currInterData)->sadCost[count], &(analysis_data->interData)->sadCost[d], bytes);
124
                     if (m_param->analysisReuseLevel > 4)
125
                     {
126
                         memset(&(currInterData)->partSize[count], (interData)->partSize[d], bytes);
127
@@ -697,7 +727,13 @@
128
     if (m_bToneMap)
129
         m_hdr10plus_api->hdr10plus_clear_movie(m_cim, m_numCimInfo);
130
 #endif
131
-        
132
+
133
+    if (m_param->bDynamicRefine)
134
+    {
135
+        X265_FREE(m_variance);
136
+        X265_FREE(m_rdCost);
137
+        X265_FREE(m_trainingCount);
138
+    }
139
     if (m_exportedPic)
140
     {
141
         ATOMIC_DEC(&m_exportedPic->m_countRefEncoders);
142
@@ -761,6 +797,8 @@
143
         }
144
         X265_FREE(temp);
145
      }
146
+    if (m_naluFile)
147
+        fclose(m_naluFile);
148
     if (m_param)
149
     {
150
         if (m_param->csvfpt)
151
@@ -837,6 +875,77 @@
152
     }
153
 }
154
 
155
+void Encoder::copyUserSEIMessages(Frame *frame, const x265_picture* pic_in)
156
+{
157
+    x265_sei_payload toneMap;
158
+    toneMap.payload = NULL;
159
+    int toneMapPayload = 0;
160
+
161
+#if ENABLE_HDR10_PLUS
162
+    if (m_bToneMap)
163
+    {
164
+        int currentPOC = m_pocLast;
165
+        if (currentPOC < m_numCimInfo)
166
+        {
167
+            int32_t i = 0;
168
+            toneMap.payloadSize = 0;
169
+            while (m_cim[currentPOC][i] == 0xFF)
170
+                toneMap.payloadSize += m_cim[currentPOC][i++];
171
+            toneMap.payloadSize += m_cim[currentPOC][i];
172
+
173
+            toneMap.payload = (uint8_t*)x265_malloc(sizeof(uint8_t) * toneMap.payloadSize);
174
+            toneMap.payloadType = USER_DATA_REGISTERED_ITU_T_T35;
175
+            memcpy(toneMap.payload, &m_cim[currentPOC][i + 1], toneMap.payloadSize);
176
+            toneMapPayload = 1;
177
+        }
178
+    }
179
+#endif
180
+    /* seiMsg will contain SEI messages specified in a fixed file format in POC order.
181
+    * Format of the file : <POC><space><PREFIX><space><NAL UNIT TYPE>/<SEI TYPE><space><SEI Payload> */
182
+    x265_sei_payload seiMsg;
183
+    seiMsg.payload = NULL;
184
+    int userPayload = 0;
185
+    if (m_enableNal)
186
+    {
187
+        readUserSeiFile(seiMsg, m_pocLast);
188
+        if (seiMsg.payload)
189
+            userPayload = 1;;
190
+    }
191
+
192
+    int numPayloads = pic_in->userSEI.numPayloads + toneMapPayload + userPayload;
193
+    frame->m_userSEI.numPayloads = numPayloads;
194
+
195
+    if (frame->m_userSEI.numPayloads)
196
+    {
197
+        if (!frame->m_userSEI.payloads)
198
+        {
199
+            frame->m_userSEI.payloads = new x265_sei_payload[numPayloads];
200
+            for (int i = 0; i < numPayloads; i++)
201
+                frame->m_userSEI.payloads[i].payload = NULL;
202
+        }
203
+        for (int i = 0; i < numPayloads; i++)
204
+        {
205
+            x265_sei_payload input;
206
+            if ((i == (numPayloads - 1)) && toneMapPayload)
207
+                input = toneMap;
208
+            else if (m_enableNal)
209
+                input = seiMsg;
210
+            else
211
+                input = pic_in->userSEI.payloads[i];
212
+
213
+            if (!frame->m_userSEI.payloads[i].payload)
214
+                frame->m_userSEI.payloads[i].payload = new uint8_t[input.payloadSize];
215
+            memcpy(frame->m_userSEI.payloads[i].payload, input.payload, input.payloadSize);
216
+            frame->m_userSEI.payloads[i].payloadSize = input.payloadSize;
217
+            frame->m_userSEI.payloads[i].payloadType = input.payloadType;
218
+        }
219
+        if (toneMap.payload)
220
+            x265_free(toneMap.payload);
221
+        if (seiMsg.payload)
222
+            x265_free(seiMsg.payload);
223
+    }
224
+}
225
+
226
 /**
227
  * Feed one new input frame into the encoder, get one frame out. If pic_in is
228
  * NULL, a flush condition is implied and pic_in must be NULL for all subsequent
229
@@ -863,12 +972,12 @@
230
     if (m_exportedPic)
231
     {
232
         if (!m_param->bUseAnalysisFile && m_param->analysisSave)
233
-            freeAnalysis(&m_exportedPic->m_analysisData);
234
+            x265_free_analysis_data(m_param, &m_exportedPic->m_analysisData);
235
         ATOMIC_DEC(&m_exportedPic->m_countRefEncoders);
236
         m_exportedPic = NULL;
237
         m_dpb->recycleUnreferenced();
238
     }
239
-    if (pic_in)
240
+    if (pic_in && (!m_param->chunkEnd || (m_encodedFrameNum < m_param->chunkEnd)))
241
     {
242
         if (m_latestParam->forceFlush == 1)
243
         {
244
@@ -881,27 +990,6 @@
245
             m_latestParam->forceFlush = 0;
246
         }
247
 
248
-        x265_sei_payload toneMap;
249
-        toneMap.payload = NULL;
250
-#if ENABLE_HDR10_PLUS
251
-        if (m_bToneMap)
252
-        {
253
-            int currentPOC = m_pocLast + 1;
254
-            if (currentPOC < m_numCimInfo)
255
-            {
256
-                int32_t i = 0;
257
-                toneMap.payloadSize = 0;
258
-                while (m_cim[currentPOC][i] == 0xFF)
259
-                    toneMap.payloadSize += m_cim[currentPOC][i++];
260
-                toneMap.payloadSize += m_cim[currentPOC][i];
261
-
262
-                toneMap.payload = (uint8_t*)x265_malloc(sizeof(uint8_t) * toneMap.payloadSize);
263
-                toneMap.payloadType = USER_DATA_REGISTERED_ITU_T_T35;
264
-                memcpy(toneMap.payload, &m_cim[currentPOC][i+1], toneMap.payloadSize);
265
-            }
266
-        }
267
-#endif
268
-
269
         if (pic_in->bitDepth < 8 || pic_in->bitDepth > 16)
270
         {
271
             x265_log(m_param, X265_LOG_ERROR, "Input bit depth (%d) must be between 8 and 16\n",
272
@@ -983,36 +1071,7 @@
273
         inFrame->m_forceqp   = pic_in->forceqp;
274
         inFrame->m_param     = (m_reconfigure || m_reconfigureRc) ? m_latestParam : m_param;
275
 
276
-        int toneMapEnable = 0;
277
-        if (m_bToneMap && toneMap.payload)
278
-            toneMapEnable = 1;
279
-        int numPayloads = pic_in->userSEI.numPayloads + toneMapEnable;
280
-        inFrame->m_userSEI.numPayloads = numPayloads;
281
-
282
-        if (inFrame->m_userSEI.numPayloads)
283
-        {
284
-            if (!inFrame->m_userSEI.payloads)
285
-            {
286
-                inFrame->m_userSEI.payloads = new x265_sei_payload[numPayloads];
287
-                for (int i = 0; i < numPayloads; i++)
288
-                    inFrame->m_userSEI.payloads[i].payload = NULL;
289
-            }
290
-            for (int i = 0; i < numPayloads; i++)
291
-            {
292
-                x265_sei_payload input;
293
-                if ((i == (numPayloads - 1)) && toneMapEnable)
294
-                    input = toneMap;
295
-                else
296
-                    input = pic_in->userSEI.payloads[i];
297
-                int size = inFrame->m_userSEI.payloads[i].payloadSize = input.payloadSize;
298
-                inFrame->m_userSEI.payloads[i].payloadType = input.payloadType;
299
-                if (!inFrame->m_userSEI.payloads[i].payload)
300
-                    inFrame->m_userSEI.payloads[i].payload = new uint8_t[size];
301
-                memcpy(inFrame->m_userSEI.payloads[i].payload, input.payload, size);
302
-            }
303
-            if (toneMap.payload)
304
-                x265_free(toneMap.payload);
305
-        }
306
+        copyUserSEIMessages(inFrame, pic_in);
307
 
308
         if (pic_in->quantOffsets != NULL)
309
         {
310
@@ -1049,8 +1108,35 @@
311
         /* Load analysis data before lookahead->addPicture, since sliceType has been decided */
312
         if (m_param->analysisLoad)
313
         {
314
-            /* readAnalysisFile reads analysis data for the frame and allocates memory based on slicetype */
315
-            readAnalysisFile(&inFrame->m_analysisData, inFrame->m_poc, pic_in);
316
+            /* reads analysis data for the frame and allocates memory based on slicetype */
317
+            static int paramBytes = 0;
318
+            if (!inFrame->m_poc)
319
+            {
320
+                x265_analysis_data analysisData = pic_in->analysisData;
321
+                paramBytes = validateAnalysisData(&analysisData, 0);
322
+                if (paramBytes == -1)
323
+                {
324
+                    m_aborted = true;
325
+                    return -1;
326
+                }
327
+            }
328
+            if (m_saveCTUSize)
329
+            {
330
+                cuLocation cuLocInFrame;
331
+                cuLocInFrame.init(m_param);
332
+                /* Set skipWidth/skipHeight flags when the out of bound pixels in lowRes is greater than half of maxCUSize */
333
+                int extendedWidth = ((m_param->sourceWidth / 2 + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize) * m_param->maxCUSize;
334
+                int extendedHeight = ((m_param->sourceHeight / 2 + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize) * m_param->maxCUSize;
335
+                uint32_t outOfBoundaryLowres = extendedWidth - m_param->sourceWidth / 2;
336
+                if (outOfBoundaryLowres * 2 >= m_param->maxCUSize)
337
+                    cuLocInFrame.skipWidth = true;
338
+                uint32_t outOfBoundaryLowresH = extendedHeight - m_param->sourceHeight / 2;
339
+                if (outOfBoundaryLowresH * 2 >= m_param->maxCUSize)
340
+                    cuLocInFrame.skipHeight = true;
341
+                readAnalysisFile(&inFrame->m_analysisData, inFrame->m_poc, pic_in, paramBytes, cuLocInFrame);
342
+            }
343
+            else
344
+                readAnalysisFile(&inFrame->m_analysisData, inFrame->m_poc, pic_in, paramBytes);
345
             inFrame->m_poc = inFrame->m_analysisData.poc;
346
             sliceType = inFrame->m_analysisData.sliceType;
347
             inFrame->m_lowres.bScenecut = !!inFrame->m_analysisData.bScenecut;
348
@@ -1133,7 +1219,7 @@
349
 
350
             /* Free up pic_in->analysisData since it has already been used */
351
             if ((m_param->analysisLoad && !m_param->analysisSave) || (m_param->bMVType && slice->m_sliceType != I_SLICE))
352
-                freeAnalysis(&outFrame->m_analysisData);
353
+                x265_free_analysis_data(m_param, &outFrame->m_analysisData);
354
 
355
             if (pic_out)
356
             {
357
@@ -1146,6 +1232,7 @@
358
 
359
                 pic_out->pts = outFrame->m_pts;
360
                 pic_out->dts = outFrame->m_dts;
361
+                pic_out->reorderedPts = outFrame->m_reorderedPts;
362
                 pic_out->sliceType = outFrame->m_lowres.sliceType;
363
                 pic_out->planes[0] = recpic->m_picOrg[0];
364
                 pic_out->stride[0] = (int)(recpic->m_stride * sizeof(pixel));
365
@@ -1171,6 +1258,7 @@
366
                     pic_out->analysisData.intraData = outFrame->m_analysisData.intraData;
367
                     pic_out->analysisData.modeFlag[0] = outFrame->m_analysisData.modeFlag[0];
368
                     pic_out->analysisData.modeFlag[1] = outFrame->m_analysisData.modeFlag[1];
369
+                    pic_out->analysisData.distortionData = outFrame->m_analysisData.distortionData;
370
                     if (m_param->bDisableLookahead)
371
                     {
372
                         int factor = 1;
373
@@ -1178,6 +1266,7 @@
374
                             factor = m_param->scaleFactor * 2;
375
                         pic_out->analysisData.numCuInHeight = outFrame->m_analysisData.numCuInHeight;
376
                         pic_out->analysisData.lookahead.dts = outFrame->m_dts;
377
+                        pic_out->analysisData.lookahead.reorderedPts = outFrame->m_reorderedPts;
378
                         pic_out->analysisData.satdCost *= factor;
379
                         pic_out->analysisData.lookahead.keyframe = outFrame->m_lowres.bKeyframe;
380
                         pic_out->analysisData.lookahead.lastMiniGopBFrame = outFrame->m_lowres.bLastMiniGopBFrame;
381
@@ -1186,46 +1275,49 @@
382
                             int vbvCount = m_param->lookaheadDepth + m_param->bframes + 2;
383
                             for (int index = 0; index < vbvCount; index++)
384
                             {
385
-                                pic_out->analysisData.lookahead.plannedSatd[index] = outFrame->m_lowres.plannedSatd[index] * factor;
386
+                                pic_out->analysisData.lookahead.plannedSatd[index] = outFrame->m_lowres.plannedSatd[index];
387
                                 pic_out->analysisData.lookahead.plannedType[index] = outFrame->m_lowres.plannedType[index];
388
                             }
389
                             for (uint32_t index = 0; index < pic_out->analysisData.numCuInHeight; index++)
390
                             {
391
-                                outFrame->m_analysisData.lookahead.intraSatdForVbv[index] = outFrame->m_encData->m_rowStat[index].intraSatdForVbv * factor;
392
-                                outFrame->m_analysisData.lookahead.satdForVbv[index] = outFrame->m_encData->m_rowStat[index].satdForVbv * factor;
393
+                                outFrame->m_analysisData.lookahead.intraSatdForVbv[index] = outFrame->m_encData->m_rowStat[index].intraSatdForVbv;
394
+                                outFrame->m_analysisData.lookahead.satdForVbv[index] = outFrame->m_encData->m_rowStat[index].satdForVbv;
395
                             }
396
                             pic_out->analysisData.lookahead.intraSatdForVbv = outFrame->m_analysisData.lookahead.intraSatdForVbv;
397
                             pic_out->analysisData.lookahead.satdForVbv = outFrame->m_analysisData.lookahead.satdForVbv;
398
                             for (uint32_t index = 0; index < pic_out->analysisData.numCUsInFrame; index++)
399
                             {
400
-                                outFrame->m_analysisData.lookahead.intraVbvCost[index] = outFrame->m_encData->m_cuStat[index].intraVbvCost * factor;
401
-                                outFrame->m_analysisData.lookahead.vbvCost[index] = outFrame->m_encData->m_cuStat[index].vbvCost * factor;
402
+                                outFrame->m_analysisData.lookahead.intraVbvCost[index] = outFrame->m_encData->m_cuStat[index].intraVbvCost;
403
+                                outFrame->m_analysisData.lookahead.vbvCost[index] = outFrame->m_encData->m_cuStat[index].vbvCost;
404
                             }
405
                             pic_out->analysisData.lookahead.intraVbvCost = outFrame->m_analysisData.lookahead.intraVbvCost;
406
                             pic_out->analysisData.lookahead.vbvCost = outFrame->m_analysisData.lookahead.vbvCost;
407
                         }
408
                     }
409
                     writeAnalysisFile(&pic_out->analysisData, *outFrame->m_encData);
410
+                    pic_out->analysisData.saveParam = pic_out->analysisData.saveParam;
411
                     if (m_param->bUseAnalysisFile)
412
-                        freeAnalysis(&pic_out->analysisData);
413
+                        x265_free_analysis_data(m_param, &pic_out->analysisData);
414
                 }
415
             }
416
             if (m_param->rc.bStatWrite && (m_param->analysisMultiPassRefine || m_param->analysisMultiPassDistortion))
417
             {
418
                 if (pic_out)
419
                 {
420
-                    pic_out->analysis2Pass.poc = pic_out->poc;
421
-                    pic_out->analysis2Pass.analysisFramedata = outFrame->m_analysis2Pass.analysisFramedata;
422
+                    pic_out->analysisData.poc = pic_out->poc;
423
+                    pic_out->analysisData.interData = outFrame->m_analysisData.interData;
424
+                    pic_out->analysisData.intraData = outFrame->m_analysisData.intraData;
425
+                    pic_out->analysisData.distortionData = outFrame->m_analysisData.distortionData;
426
                 }
427
-                writeAnalysis2PassFile(&outFrame->m_analysis2Pass, *outFrame->m_encData, outFrame->m_lowres.sliceType);
428
+                writeAnalysisFileRefine(&outFrame->m_analysisData, *outFrame->m_encData);
429
             }
430
             if (m_param->analysisMultiPassRefine || m_param->analysisMultiPassDistortion)
431
-                freeAnalysis2Pass(&outFrame->m_analysis2Pass, outFrame->m_lowres.sliceType);
432
+                x265_free_analysis_data(m_param, &outFrame->m_analysisData);
433
             if (m_param->internalCsp == X265_CSP_I400)
434
             {
435
                 if (slice->m_sliceType == P_SLICE)
436
                 {
437
-                    if (slice->m_weightPredTable[0][0][0].bPresentFlag)
438
+                    if (slice->m_weightPredTable[0][0][0].wtPresent)
439
                         m_numLumaWPFrames++;
440
                 }
441
                 else if (slice->m_sliceType == B_SLICE)
442
@@ -1233,7 +1325,7 @@
443
                     bool bLuma = false;
444
                     for (int l = 0; l < 2; l++)
445
                     {
446
-                        if (slice->m_weightPredTable[l][0][0].bPresentFlag)
447
+                        if (slice->m_weightPredTable[l][0][0].wtPresent)
448
                             bLuma = true;
449
                     }
450
                     if (bLuma)
451
@@ -1244,10 +1336,10 @@
452
             {
453
                 if (slice->m_sliceType == P_SLICE)
454
                 {
455
-                    if (slice->m_weightPredTable[0][0][0].bPresentFlag)
456
+                    if (slice->m_weightPredTable[0][0][0].wtPresent)
457
                         m_numLumaWPFrames++;
458
-                    if (slice->m_weightPredTable[0][0][1].bPresentFlag ||
459
-                        slice->m_weightPredTable[0][0][2].bPresentFlag)
460
+                    if (slice->m_weightPredTable[0][0][1].wtPresent ||
461
+                        slice->m_weightPredTable[0][0][2].wtPresent)
462
                         m_numChromaWPFrames++;
463
                 }
464
                 else if (slice->m_sliceType == B_SLICE)
465
@@ -1255,10 +1347,10 @@
466
                     bool bLuma = false, bChroma = false;
467
                     for (int l = 0; l < 2; l++)
468
                     {
469
-                        if (slice->m_weightPredTable[l][0][0].bPresentFlag)
470
+                        if (slice->m_weightPredTable[l][0][0].wtPresent)
471
                             bLuma = true;
472
-                        if (slice->m_weightPredTable[l][0][1].bPresentFlag ||
473
-                            slice->m_weightPredTable[l][0][2].bPresentFlag)
474
+                        if (slice->m_weightPredTable[l][0][1].wtPresent ||
475
+                            slice->m_weightPredTable[l][0][2].wtPresent)
476
                             bChroma = true;
477
                     }
478
 
479
@@ -1271,7 +1363,8 @@
480
             if (m_aborted)
481
                 return -1;
482
 
483
-            finishFrameStats(outFrame, curEncoder, frameData, m_pocLast);
484
+            if ((m_outputCount + 1)  >= m_param->chunkStart)
485
+                finishFrameStats(outFrame, curEncoder, frameData, m_pocLast);
486
 
487
             /* Write RateControl Frame level stats in multipass encodes */
488
             if (m_param->rc.bStatWrite)
489
@@ -1306,8 +1399,12 @@
490
             }
491
             else
492
                 m_exportedPic = outFrame;
493
-
494
-            m_numDelayedPic--;
495
+            
496
+            m_outputCount++;
497
+            if (m_param->chunkEnd == m_outputCount)
498
+                m_numDelayedPic = 0;
499
+            else 
500
+                m_numDelayedPic--;
501
 
502
             ret = 1;
503
         }
504
@@ -1316,14 +1413,18 @@
505
          * curEncoder is guaranteed to be idle at this point */
506
         if (!pass)
507
             frameEnc = m_lookahead->getDecidedPicture();
508
-        if (frameEnc && !pass)
509
+        if (frameEnc && !pass && (!m_param->chunkEnd || (m_encodedFrameNum < m_param->chunkEnd)))
510
         {
511
             if (m_param->analysisMultiPassRefine || m_param->analysisMultiPassDistortion)
512
             {
513
-                allocAnalysis2Pass(&frameEnc->m_analysis2Pass, frameEnc->m_lowres.sliceType);
514
-                frameEnc->m_analysis2Pass.poc = frameEnc->m_poc;
515
+                uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
516
+                uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
517
+                frameEnc->m_analysisData.numCUsInFrame = widthInCU * heightInCU;
518
+                frameEnc->m_analysisData.numPartitions = m_param->num4x4Partitions;
519
+                x265_alloc_analysis_data(m_param, &frameEnc->m_analysisData);
520
+                frameEnc->m_analysisData.poc = frameEnc->m_poc;
521
                 if (m_param->rc.bStatRead)
522
-                    readAnalysis2PassFile(&frameEnc->m_analysis2Pass, frameEnc->m_poc, frameEnc->m_lowres.sliceType);
523
+                    readAnalysisFile(&frameEnc->m_analysisData, frameEnc->m_poc, frameEnc->m_lowres.sliceType);
524
              }
525
 
526
             if (frameEnc->m_reconfigureRc && m_reconfigureRc)
527
@@ -1370,6 +1471,7 @@
528
             if (m_param->analysisLoad && m_param->bDisableLookahead)
529
             {
530
                 frameEnc->m_dts = frameEnc->m_analysisData.lookahead.dts;
531
+                frameEnc->m_reorderedPts = frameEnc->m_analysisData.lookahead.reorderedPts;
532
                 if (m_rateControl->m_isVbv)
533
                 {
534
                     for (uint32_t index = 0; index < frameEnc->m_analysisData.numCuInHeight; index++)
535
@@ -1436,6 +1538,7 @@
536
             frameEnc->m_encData->m_slice->m_iNumRPSInSPS = m_sps.spsrpsNum;
537
 
538
             curEncoder->m_rce.encodeOrder = frameEnc->m_encodeOrder = m_encodedFrameNum++;
539
+
540
             if (!m_param->analysisLoad || !m_param->bDisableLookahead)
541
             {
542
                 if (m_bframeDelay)
543
@@ -1463,7 +1566,7 @@
544
                 analysis->numCUsInFrame  = numCUsInFrame;
545
                 analysis->numCuInHeight = heightInCU;
546
                 analysis->numPartitions  = m_param->num4x4Partitions;
547
-                allocAnalysis(analysis);
548
+                x265_alloc_analysis_data(m_param, analysis);
549
             }
550
             /* determine references, setup RPS, etc */
551
             m_dpb->prepareEncode(frameEnc);
552
@@ -2074,7 +2177,7 @@
553
     {
554
         const int picOrderCntLSB = slice->m_poc - slice->m_lastIDR;
555
 
556
-        frameStats->encoderOrder = m_outputCount++;
557
+        frameStats->encoderOrder = m_outputCount;
558
         frameStats->sliceType = c;
559
         frameStats->poc = picOrderCntLSB;
560
         frameStats->qp = curEncData.m_avgQpAq;
561
@@ -2083,6 +2186,7 @@
562
         if (m_param->csvLogLevel >= 2)
563
             frameStats->ipCostRatio = curFrame->m_lowres.ipCostRatio;
564
         frameStats->bufferFill = m_rateControl->m_bufferFillActual;
565
+        frameStats->bufferFillFinal = m_rateControl->m_bufferFillFinal;
566
         frameStats->frameLatency = inPoc - poc;
567
         if (m_param->rc.rateControlMode == X265_RC_CRF)
568
             frameStats->rateFactor = curEncData.m_rateFactor;
569
@@ -2106,6 +2210,9 @@
570
 #define ELAPSED_MSEC(start, end) (((double)(end) - (start)) / 1000)
571
         if (m_param->csvLogLevel >= 2)
572
         {
573
+#if ENABLE_LIBVMAF
574
+            frameStats->vmafFrameScore = curFrame->m_fencPic->m_vmafScore;
575
+#endif
576
             frameStats->decideWaitTime = ELAPSED_MSEC(0, curEncoder->m_slicetypeWaitTime);
577
             frameStats->row0WaitTime = ELAPSED_MSEC(curEncoder->m_startCompressTime, curEncoder->m_row0WaitTime);
578
             frameStats->wallTime = ELAPSED_MSEC(curEncoder->m_row0WaitTime, curEncoder->m_endCompressTime);
579
@@ -2265,30 +2372,25 @@
580
     list.serialize(NAL_UNIT_SPS, bs);
581
 
582
     bs.resetBits();
583
-    sbacCoder.codePPS( m_pps, (m_param->maxSlices <= 1), m_iPPSQpMinus26);
584
+    sbacCoder.codePPS(m_pps, (m_param->maxSlices <= 1), m_iPPSQpMinus26);
585
     bs.writeByteAlignment();
586
     list.serialize(NAL_UNIT_PPS, bs);
587
 
588
+    if (m_param->bSingleSeiNal)
589
+        bs.resetBits();
590
+
591
     if (m_param->bEmitHDRSEI)
592
     {
593
         SEIContentLightLevel cllsei;
594
         cllsei.max_content_light_level = m_param->maxCLL;
595
         cllsei.max_pic_average_light_level = m_param->maxFALL;
596
-        bs.resetBits();
597
-        cllsei.write(bs, m_sps);
598
-        bs.writeByteAlignment();
599
-        list.serialize(NAL_UNIT_PREFIX_SEI, bs);
600
+        cllsei.writeSEImessages(bs, m_sps, NAL_UNIT_PREFIX_SEI, list, m_param->bSingleSeiNal);
601
 
602
         if (m_param->masteringDisplayColorVolume)
603
         {
604
             SEIMasteringDisplayColorVolume mdsei;
605
             if (mdsei.parse(m_param->masteringDisplayColorVolume))
606
-            {
607
-                bs.resetBits();
608
-                mdsei.write(bs, m_sps);
609
-                bs.writeByteAlignment();
610
-                list.serialize(NAL_UNIT_PREFIX_SEI, bs);
611
-            }
612
+                mdsei.writeSEImessages(bs, m_sps, NAL_UNIT_PREFIX_SEI, list, m_param->bSingleSeiNal);
613
             else
614
                 x265_log(m_param, X265_LOG_WARNING, "unable to parse mastering display color volume info\n");
615
         }
616
@@ -2300,21 +2402,18 @@
617
         if (opts)
618
         {
619
             char *buffer = X265_MALLOC(char, strlen(opts) + strlen(PFX(version_str)) +
620
-                                             strlen(PFX(build_info_str)) + 200);
621
+                strlen(PFX(build_info_str)) + 200);
622
             if (buffer)
623
             {
624
                 sprintf(buffer, "x265 (build %d) - %s:%s - H.265/HEVC codec - "
625
-                        "Copyright 2013-2018 (c) Multicoreware, Inc - "
626
-                        "http://x265.org - options: %s",
627
-                        X265_BUILD, PFX(version_str), PFX(build_info_str), opts);
628
-                
629
-                bs.resetBits();
630
+                    "Copyright 2013-2018 (c) Multicoreware, Inc - "
631
+                    "http://x265.org - options: %s",
632
+                    X265_BUILD, PFX(version_str), PFX(build_info_str), opts);
633
+
634
                 SEIuserDataUnregistered idsei;
635
                 idsei.m_userData = (uint8_t*)buffer;
636
                 idsei.setSize((uint32_t)strlen(buffer));
637
-                idsei.write(bs, m_sps);
638
-                bs.writeByteAlignment();
639
-                list.serialize(NAL_UNIT_PREFIX_SEI, bs);
640
+                idsei.writeSEImessages(bs, m_sps, NAL_UNIT_PREFIX_SEI, list, m_param->bSingleSeiNal);
641
 
642
                 X265_FREE(buffer);
643
             }
644
@@ -2329,11 +2428,7 @@
645
         SEIActiveParameterSets sei;
646
         sei.m_selfContainedCvsFlag = true;
647
         sei.m_noParamSetUpdateFlag = true;
648
-
649
-        bs.resetBits();
650
-        sei.write(bs, m_sps);
651
-        bs.writeByteAlignment();
652
-        list.serialize(NAL_UNIT_PREFIX_SEI, bs);
653
+        sei.writeSEImessages(bs, m_sps, NAL_UNIT_PREFIX_SEI, list, m_param->bSingleSeiNal);
654
     }
655
 }
656
 
657
@@ -2416,7 +2511,7 @@
658
     vui.defaultDisplayWindow.bottomOffset = m_param->vui.defDispWinBottomOffset;
659
     vui.defaultDisplayWindow.leftOffset = m_param->vui.defDispWinLeftOffset;
660
 
661
-    vui.frameFieldInfoPresentFlag = !!m_param->interlaceMode;
662
+    vui.frameFieldInfoPresentFlag = !!m_param->interlaceMode || (m_param->pictureStructure >= 0);
663
     vui.fieldSeqFlag = !!m_param->interlaceMode;
664
 
665
     vui.hrdParametersPresentFlag = m_param->bEmitHRDSEI;
666
@@ -2428,6 +2523,7 @@
667
 void Encoder::initPPS(PPS *pps)
668
 {
669
     bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
670
+    bool bEnableDistOffset = m_param->analysisMultiPassDistortion && m_param->rc.bStatRead;
671
 
672
     if (!m_param->bLossless && (m_param->rc.aqMode || bIsVbv || m_param->bAQMotion))
673
     {
674
@@ -2435,6 +2531,11 @@
675
         pps->maxCuDQPDepth = g_log2Size[m_param->maxCUSize] - g_log2Size[m_param->rc.qgSize];
676
         X265_CHECK(pps->maxCuDQPDepth <= 3, "max CU DQP depth cannot be greater than 3\n");
677
     }
678
+    else if (!m_param->bLossless && bEnableDistOffset)
679
+    {
680
+        pps->bUseDQP = true;
681
+        pps->maxCuDQPDepth = 0;
682
+    }
683
     else
684
     {
685
         pps->bUseDQP = false;
686
@@ -2660,32 +2761,51 @@
687
         {
688
             p->scaleFactor = 0;
689
         }
690
-        else if ((!p->analysisLoad && !p->analysisSave) || p->analysisReuseLevel < 10)
691
+        else if ((!p->analysisLoad && !p->analysisSave) || (p->analysisReuseLevel > 6 && p->analysisReuseLevel != 10))
692
         {
693
-            x265_log(p, X265_LOG_WARNING, "Input scaling works with analysis load/save, analysis-reuse-level 10. Disabling scale-factor.\n");
694
+            x265_log(p, X265_LOG_WARNING, "Input scaling works with analysis load/save and analysis-reuse-level 1-6 and 10. Disabling scale-factor.\n");
695
             p->scaleFactor = 0;
696
         }
697
     }
698
 
699
     if (p->intraRefine)
700
     {
701
-        if (!p->analysisLoad || p->analysisReuseLevel < 10 || !p->scaleFactor)
702
+        if (!p->analysisLoad || p->analysisReuseLevel < 10)
703
         {
704
-            x265_log(p, X265_LOG_WARNING, "Intra refinement requires analysis load, analysis-reuse-level 10, scale factor. Disabling intra refine.\n");
705
+            x265_log(p, X265_LOG_WARNING, "Intra refinement requires analysis load, analysis-reuse-level 10. Disabling intra refine.\n");
706
             p->intraRefine = 0;
707
         }
708
     }
709
 
710
     if (p->interRefine)
711
     {
712
-        if (!p->analysisLoad || p->analysisReuseLevel < 10 || !p->scaleFactor)
713
+        if (!p->analysisLoad || p->analysisReuseLevel < 10)
714
+        {
715
+            x265_log(p, X265_LOG_WARNING, "Inter refinement requires analysis load, analysis-reuse-level 10. Disabling inter refine.\n");
716
+            p->interRefine = 0;
717
+        }
718
+    }
719
+
720
+    if (p->bDynamicRefine)
721
+    {
722
+        if (!p->analysisLoad || p->analysisReuseLevel < 10)
723
+        {
724
+            x265_log(p, X265_LOG_WARNING, "Dynamic refinement requires analysis load, analysis-reuse-level 10. Disabling dynamic refine.\n");
725
+            p->bDynamicRefine = 0;
726
+        }
727
+        if (p->interRefine)
728
         {
729
-            x265_log(p, X265_LOG_WARNING, "Inter refinement requires analysis load, analysis-reuse-level 10, scale factor. Disabling inter refine.\n");
730
+            x265_log(p, X265_LOG_WARNING, "Inter refine cannot be used with dynamic refine. Disabling refine-inter.\n");
731
             p->interRefine = 0;
732
         }
733
     }
734
+    if (p->scaleFactor && p->analysisLoad && !p->interRefine && !p->bDynamicRefine && p->analysisReuseLevel == 10)
735
+    {
736
+        x265_log(p, X265_LOG_WARNING, "Inter refinement 0 is not supported with scaling and analysis-reuse-level=10. Enabling refine-inter 1.\n");
737
+        p->interRefine = 1;
738
+    }
739
 
740
-    if (p->limitTU && p->interRefine)
741
+    if (p->limitTU && (p->interRefine || p->bDynamicRefine))
742
     {
743
         x265_log(p, X265_LOG_WARNING, "Inter refinement does not support limitTU. Disabling limitTU.\n");
744
         p->limitTU = 0;
745
@@ -2693,9 +2813,9 @@
746
 
747
     if (p->mvRefine)
748
     {
749
-        if (!p->analysisLoad || p->analysisReuseLevel < 10 || !p->scaleFactor)
750
+        if (!p->analysisLoad || p->analysisReuseLevel < 10)
751
         {
752
-            x265_log(p, X265_LOG_WARNING, "MV refinement requires analysis load, analysis-reuse-level 10, scale factor. Disabling MV refine.\n");
753
+            x265_log(p, X265_LOG_WARNING, "MV refinement requires analysis load, analysis-reuse-level 10. Disabling MV refine.\n");
754
             p->mvRefine = 0;
755
         }
756
         else if (p->interRefine >= 2)
757
@@ -2711,13 +2831,6 @@
758
         p->bDistributeMotionEstimation = p->bDistributeModeAnalysis = 0;
759
     }
760
 
761
-    if (p->rc.bEnableGrain)
762
-    {
763
-        x265_log(p, X265_LOG_WARNING, "Rc Grain removes qp fluctuations caused by aq/cutree, Disabling aq,cu-tree\n");
764
-        p->rc.cuTree = 0;
765
-        p->rc.aqMode = 0;
766
-    }
767
-
768
     if (p->bDistributeModeAnalysis && (p->limitReferences >> 1) && 1)
769
     {
770
         x265_log(p, X265_LOG_WARNING, "Limit reference options 2 and 3 are not supported with pmode. Disabling limit reference\n");
771
@@ -3054,231 +3167,32 @@
772
         p->radl = 0;
773
         x265_log(p, X265_LOG_WARNING, "Radl requires fixed gop-length (keyint == min-keyint). Disabling radl.\n");
774
     }
775
-}
776
-
777
-void Encoder::allocAnalysis(x265_analysis_data* analysis)
778
-{
779
-    X265_CHECK(analysis->sliceType, "invalid slice type\n");
780
-    analysis->interData = analysis->intraData = NULL;
781
-    if (m_param->bDisableLookahead && m_rateControl->m_isVbv)
782
-    {
783
-        CHECKED_MALLOC_ZERO(analysis->lookahead.intraSatdForVbv, uint32_t, analysis->numCuInHeight);
784
-        CHECKED_MALLOC_ZERO(analysis->lookahead.satdForVbv, uint32_t, analysis->numCuInHeight);
785
-        CHECKED_MALLOC_ZERO(analysis->lookahead.intraVbvCost, uint32_t, analysis->numCUsInFrame);
786
-        CHECKED_MALLOC_ZERO(analysis->lookahead.vbvCost, uint32_t, analysis->numCUsInFrame);
787
-    }
788
-    if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
789
-    {
790
-        if (m_param->analysisReuseLevel < 2)
791
-            return;
792
-
793
-        analysis_intra_data *intraData = (analysis_intra_data*)analysis->intraData;
794
-        CHECKED_MALLOC_ZERO(intraData, analysis_intra_data, 1);
795
-        CHECKED_MALLOC(intraData->depth, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
796
-        CHECKED_MALLOC(intraData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
797
-        CHECKED_MALLOC(intraData->partSizes, char, analysis->numPartitions * analysis->numCUsInFrame);
798
-        CHECKED_MALLOC(intraData->chromaModes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
799
-        analysis->intraData = intraData;
800
-    }
801
-    else
802
-    {
803
-        int numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2;
804
-        uint32_t numPlanes = m_param->internalCsp == X265_CSP_I400 ? 1 : 3;
805
-        if (!(m_param->bMVType == AVC_INFO))
806
-            CHECKED_MALLOC_ZERO(analysis->wt, WeightParam, numPlanes * numDir);
807
-        if (m_param->analysisReuseLevel < 2)
808
-            return;
809
-
810
-        analysis_inter_data *interData = (analysis_inter_data*)analysis->interData;
811
-        CHECKED_MALLOC_ZERO(interData, analysis_inter_data, 1);
812
-        CHECKED_MALLOC(interData->depth, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
813
-        CHECKED_MALLOC(interData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
814
-        if (m_param->analysisReuseLevel > 4)
815
-        {
816
-            CHECKED_MALLOC(interData->partSize, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
817
-            CHECKED_MALLOC(interData->mergeFlag, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
818
-        }
819
-
820
-        if (m_param->analysisReuseLevel >= 7)
821
-        {
822
-            CHECKED_MALLOC(interData->interDir, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
823
-            CHECKED_MALLOC(interData->sadCost, int64_t, analysis->numPartitions * analysis->numCUsInFrame);
824
-            for (int dir = 0; dir < numDir; dir++)
825
-            {
826
-                CHECKED_MALLOC(interData->mvpIdx[dir], uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
827
-                CHECKED_MALLOC(interData->refIdx[dir], int8_t, analysis->numPartitions * analysis->numCUsInFrame);
828
-                CHECKED_MALLOC(interData->mv[dir], MV, analysis->numPartitions * analysis->numCUsInFrame);
829
-                CHECKED_MALLOC_ZERO(analysis->modeFlag[dir], uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
830
-            }
831
-            /* Allocate intra in inter */
832
-            if (analysis->sliceType == X265_TYPE_P || m_param->bIntraInBFrames)
833
-            {
834
-                analysis_intra_data *intraData = (analysis_intra_data*)analysis->intraData;
835
-                CHECKED_MALLOC_ZERO(intraData, analysis_intra_data, 1);
836
-                CHECKED_MALLOC(intraData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
837
-                CHECKED_MALLOC(intraData->chromaModes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
838
-                analysis->intraData = intraData;
839
-            }
840
-        }
841
-        else
842
-            CHECKED_MALLOC_ZERO(interData->ref, int32_t, analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir);
843
-
844
-        analysis->interData = interData;
845
-    }
846
-    return;
847
-
848
-fail:
849
-    freeAnalysis(analysis);
850
-    m_aborted = true;
851
-}
852
-void Encoder::freeAnalysis(x265_analysis_data* analysis)
853
-{
854
-    if (m_param->bDisableLookahead && m_rateControl->m_isVbv)
855
-    {
856
-        X265_FREE(analysis->lookahead.satdForVbv);
857
-        X265_FREE(analysis->lookahead.intraSatdForVbv);
858
-        X265_FREE(analysis->lookahead.vbvCost);
859
-        X265_FREE(analysis->lookahead.intraVbvCost);
860
-    }
861
-    /* Early exit freeing weights alone if level is 1 (when there is no analysis inter/intra) */
862
-    if (analysis->sliceType > X265_TYPE_I && analysis->wt && !(m_param->bMVType == AVC_INFO))
863
-        X265_FREE(analysis->wt);
864
-    if (m_param->analysisReuseLevel < 2)
865
-        return;
866
 
867
-    if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
868
+    if ((p->chunkStart || p->chunkEnd) && p->bOpenGOP)
869
     {
870
-        if (analysis->intraData)
871
-        {
872
-            X265_FREE(((analysis_intra_data*)analysis->intraData)->depth);
873
-            X265_FREE(((analysis_intra_data*)analysis->intraData)->modes);
874
-            X265_FREE(((analysis_intra_data*)analysis->intraData)->partSizes);
875
-            X265_FREE(((analysis_intra_data*)analysis->intraData)->chromaModes);
876
-            X265_FREE(analysis->intraData);
877
-            analysis->intraData = NULL;
878
-        }
879
+        p->chunkStart = p->chunkEnd = 0;
880
+        x265_log(p, X265_LOG_WARNING, "Chunking requires closed gop structure. Disabling chunking.\n");
881
     }
882
-    else
883
-    {
884
-        if (analysis->intraData)
885
-        {
886
-            X265_FREE(((analysis_intra_data*)analysis->intraData)->modes);
887
-            X265_FREE(((analysis_intra_data*)analysis->intraData)->chromaModes);
888
-            X265_FREE(analysis->intraData);
889
-            analysis->intraData = NULL;
890
-        }
891
-        if (analysis->interData)
892
-        {
893
-            X265_FREE(((analysis_inter_data*)analysis->interData)->depth);
894
-            X265_FREE(((analysis_inter_data*)analysis->interData)->modes);
895
-            if (m_param->analysisReuseLevel > 4)
896
-            {
897
-                X265_FREE(((analysis_inter_data*)analysis->interData)->mergeFlag);
898
-                X265_FREE(((analysis_inter_data*)analysis->interData)->partSize);
899
-            }
900
-            if (m_param->analysisReuseLevel >= 7)
901
-            {
902
-                X265_FREE(((analysis_inter_data*)analysis->interData)->interDir);
903
-                X265_FREE(((analysis_inter_data*)analysis->interData)->sadCost);
904
-                int numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2;
905
-                for (int dir = 0; dir < numDir; dir++)
906
-                {
907
-                    X265_FREE(((analysis_inter_data*)analysis->interData)->mvpIdx[dir]);
908
-                    X265_FREE(((analysis_inter_data*)analysis->interData)->refIdx[dir]);
909
-                    X265_FREE(((analysis_inter_data*)analysis->interData)->mv[dir]);
910
-                    if (analysis->modeFlag[dir] != NULL)
911
-                    {
912
-                        X265_FREE(analysis->modeFlag[dir]);
913
-                        analysis->modeFlag[dir] = NULL;
914
-                    }
915
-                }
916
-            }
917
-            else
918
-                X265_FREE(((analysis_inter_data*)analysis->interData)->ref);
919
-
920
-            X265_FREE(analysis->interData);
921
-            analysis->interData = NULL;
922
-        }
923
-    }
924
-}
925
-
926
-void Encoder::allocAnalysis2Pass(x265_analysis_2Pass* analysis, int sliceType)
927
-{
928
-    analysis->analysisFramedata = NULL;
929
-    analysis2PassFrameData *analysisFrameData = (analysis2PassFrameData*)analysis->analysisFramedata;
930
-    uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
931
-    uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
932
 
933
-    uint32_t numCUsInFrame = widthInCU * heightInCU;
934
-    CHECKED_MALLOC_ZERO(analysisFrameData, analysis2PassFrameData, 1);
935
-    CHECKED_MALLOC_ZERO(analysisFrameData->depth, uint8_t, m_param->num4x4Partitions * numCUsInFrame);
936
-    CHECKED_MALLOC_ZERO(analysisFrameData->distortion, sse_t, m_param->num4x4Partitions * numCUsInFrame);
937
-    if (m_param->rc.bStatRead)
938
-    {
939
-        CHECKED_MALLOC_ZERO(analysisFrameData->ctuDistortion, sse_t, numCUsInFrame);
940
-        CHECKED_MALLOC_ZERO(analysisFrameData->scaledDistortion, double, numCUsInFrame);
941
-        CHECKED_MALLOC_ZERO(analysisFrameData->offset, double, numCUsInFrame);
942
-        CHECKED_MALLOC_ZERO(analysisFrameData->threshold, double, numCUsInFrame);
943
-    }
944
-    if (!IS_X265_TYPE_I(sliceType))
945
+    if (p->chunkEnd < p->chunkStart)
946
     {
947
-        CHECKED_MALLOC_ZERO(analysisFrameData->m_mv[0], MV, m_param->num4x4Partitions * numCUsInFrame);
948
-        CHECKED_MALLOC_ZERO(analysisFrameData->m_mv[1], MV, m_param->num4x4Partitions * numCUsInFrame);
949
-        CHECKED_MALLOC_ZERO(analysisFrameData->mvpIdx[0], int, m_param->num4x4Partitions * numCUsInFrame);
950
-        CHECKED_MALLOC_ZERO(analysisFrameData->mvpIdx[1], int, m_param->num4x4Partitions * numCUsInFrame);
951
-        CHECKED_MALLOC_ZERO(analysisFrameData->ref[0], int32_t, m_param->num4x4Partitions * numCUsInFrame);
952
-        CHECKED_MALLOC_ZERO(analysisFrameData->ref[1], int32_t, m_param->num4x4Partitions * numCUsInFrame);
953
-        CHECKED_MALLOC(analysisFrameData->modes, uint8_t, m_param->num4x4Partitions * numCUsInFrame);
954
+        p->chunkStart = p->chunkEnd = 0;
955
+        x265_log(p, X265_LOG_WARNING, "chunk-end cannot be less than chunk-start. Disabling chunking.\n");
956
     }
957
 
958
-    analysis->analysisFramedata = analysisFrameData;
959
-
960
-    return;
961
-
962
-fail:
963
-    freeAnalysis2Pass(analysis, sliceType);
964
-    m_aborted = true;
965
-}
966
-
967
-void Encoder::freeAnalysis2Pass(x265_analysis_2Pass* analysis, int sliceType)
968
-{
969
-    if (analysis->analysisFramedata)
970
-    {
971
-        X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->depth);
972
-        X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->distortion);
973
-        if (m_param->rc.bStatRead)
974
-        {
975
-            X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->ctuDistortion);
976
-            X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->scaledDistortion);
977
-            X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->offset);
978
-            X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->threshold);
979
-        }
980
-        if (!IS_X265_TYPE_I(sliceType))
981
-        {
982
-            X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->m_mv[0]);
983
-            X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->m_mv[1]);
984
-            X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->mvpIdx[0]);
985
-            X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->mvpIdx[1]);
986
-            X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->ref[0]);
987
-            X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->ref[1]);
988
-            X265_FREE(((analysis2PassFrameData*)analysis->analysisFramedata)->modes);
989
-        }
990
-        X265_FREE(analysis->analysisFramedata);
991
-    }
992
 }
993
 
994
-void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x265_picture* picIn)
995
+void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x265_picture* picIn, int paramBytes)
996
 {
997
-
998
 #define X265_FREAD(val, size, readSize, fileOffset, src)\
999
     if (!m_param->bUseAnalysisFile)\
1000
-    {\
1001
+        {\
1002
         memcpy(val, src, (size * readSize));\
1003
-    }\
1004
-    else if (fread(val, size, readSize, fileOffset) != readSize)\
1005
+        }\
1006
+        else if (fread(val, size, readSize, fileOffset) != readSize)\
1007
     {\
1008
         x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data\n");\
1009
-        freeAnalysis(analysis);\
1010
+        x265_free_analysis_data(m_param, analysis);\
1011
         m_aborted = true;\
1012
         return;\
1013
     }\
1014
@@ -3287,10 +3201,10 @@
1015
     static uint64_t totalConsumedBytes = 0;
1016
     uint32_t depthBytes = 0;
1017
     if (m_param->bUseAnalysisFile)
1018
-        fseeko(m_analysisFileIn, totalConsumedBytes, SEEK_SET);
1019
+        fseeko(m_analysisFileIn, totalConsumedBytes + paramBytes, SEEK_SET);
1020
     const x265_analysis_data *picData = &(picIn->analysisData);
1021
-    analysis_intra_data *intraPic = (analysis_intra_data *)picData->intraData;
1022
-    analysis_inter_data *interPic = (analysis_inter_data *)picData->interData;
1023
+    x265_analysis_intra_data *intraPic = picData->intraData;
1024
+    x265_analysis_inter_data *interPic = picData->interData;
1025
 
1026
     int poc; uint32_t frameRecordSize;
1027
     X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->frameRecordSize));
1028
@@ -3305,7 +3219,7 @@
1029
         while (poc != curPoc && !feof(m_analysisFileIn))
1030
         {
1031
             currentOffset += frameRecordSize;
1032
-            fseeko(m_analysisFileIn, currentOffset, SEEK_SET);
1033
+            fseeko(m_analysisFileIn, currentOffset + paramBytes, SEEK_SET);
1034
             X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->frameRecordSize));
1035
             X265_FREAD(&depthBytes, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->depthBytes));
1036
             X265_FREAD(&poc, sizeof(int), 1, m_analysisFileIn, &(picData->poc));
1037
@@ -3313,7 +3227,7 @@
1038
         if (poc != curPoc || feof(m_analysisFileIn))
1039
         {
1040
             x265_log(NULL, X265_LOG_WARNING, "Error reading analysis data: Cannot find POC %d\n", curPoc);
1041
-            freeAnalysis(analysis);
1042
+            x265_free_analysis_data(m_param, analysis);
1043
             return;
1044
         }
1045
     }
1046
@@ -3337,13 +3251,32 @@
1047
     if (m_param->scaleFactor)
1048
         analysis->numPartitions *= factor;
1049
     /* Memory is allocated for inter and intra analysis data based on the slicetype */
1050
-    allocAnalysis(analysis);
1051
+    x265_alloc_analysis_data(m_param, analysis);
1052
     if (m_param->bDisableLookahead && m_rateControl->m_isVbv)
1053
     {
1054
+        size_t vbvCount = m_param->lookaheadDepth + m_param->bframes + 2;
1055
         X265_FREAD(analysis->lookahead.intraVbvCost, sizeof(uint32_t), analysis->numCUsInFrame, m_analysisFileIn, picData->lookahead.intraVbvCost);
1056
         X265_FREAD(analysis->lookahead.vbvCost, sizeof(uint32_t), analysis->numCUsInFrame, m_analysisFileIn, picData->lookahead.vbvCost);
1057
         X265_FREAD(analysis->lookahead.satdForVbv, sizeof(uint32_t), analysis->numCuInHeight, m_analysisFileIn, picData->lookahead.satdForVbv);
1058
         X265_FREAD(analysis->lookahead.intraSatdForVbv, sizeof(uint32_t), analysis->numCuInHeight, m_analysisFileIn, picData->lookahead.intraSatdForVbv);
1059
+        X265_FREAD(analysis->lookahead.plannedSatd, sizeof(int64_t), vbvCount, m_analysisFileIn, picData->lookahead.plannedSatd);
1060
+
1061
+        if (m_param->scaleFactor)
1062
+        {
1063
+            for (uint64_t index = 0; index < vbvCount; index++)
1064
+                analysis->lookahead.plannedSatd[index] *= factor;
1065
+
1066
+            for (uint32_t i = 0; i < analysis->numCuInHeight; i++)
1067
+            {
1068
+                analysis->lookahead.satdForVbv[i] *= factor;
1069
+                analysis->lookahead.intraSatdForVbv[i] *= factor;
1070
+            }
1071
+            for (uint32_t i = 0; i < analysis->numCUsInFrame; i++)
1072
+            {
1073
+                analysis->lookahead.vbvCost[i] *= factor;
1074
+                analysis->lookahead.intraVbvCost[i] *= factor;
1075
+            }
1076
+        }
1077
     }
1078
     if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
1079
     {
1080
@@ -3372,22 +3305,22 @@
1081
                 if (partSizes[d] == SIZE_NxN)
1082
                     partSizes[d] = SIZE_2Nx2N;
1083
             }
1084
-            memset(&((analysis_intra_data *)analysis->intraData)->depth[count], depthBuf[d], bytes);
1085
-            memset(&((analysis_intra_data *)analysis->intraData)->chromaModes[count], modeBuf[d], bytes);
1086
-            memset(&((analysis_intra_data *)analysis->intraData)->partSizes[count], partSizes[d], bytes);
1087
+            memset(&(analysis->intraData)->depth[count], depthBuf[d], bytes);
1088
+            memset(&(analysis->intraData)->chromaModes[count], modeBuf[d], bytes);
1089
+            memset(&(analysis->intraData)->partSizes[count], partSizes[d], bytes);
1090
             count += bytes;
1091
         }
1092
 
1093
         if (!m_param->scaleFactor)
1094
         {
1095
-            X265_FREAD(((analysis_intra_data *)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileIn, intraPic->modes);
1096
+            X265_FREAD((analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileIn, intraPic->modes);
1097
         }
1098
         else
1099
         {
1100
             uint8_t *tempLumaBuf = X265_MALLOC(uint8_t, analysis->numCUsInFrame * scaledNumPartition);
1101
             X265_FREAD(tempLumaBuf, sizeof(uint8_t), analysis->numCUsInFrame * scaledNumPartition, m_analysisFileIn, intraPic->modes);
1102
             for (uint32_t ctu32Idx = 0, cnt = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++, cnt += factor)
1103
-                memset(&((analysis_intra_data *)analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor);
1104
+                memset(&(analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor);
1105
             X265_FREE(tempLumaBuf);
1106
         }
1107
         X265_FREE(tempBuf);
1108
@@ -3456,37 +3389,37 @@
1109
         {
1110
             int bytes = analysis->numPartitions >> (depthBuf[d] * 2);
1111
             if (m_param->scaleFactor && modeBuf[d] == MODE_INTRA && depthBuf[d] == 0)
1112
-                 depthBuf[d] = 1;
1113
-            memset(&((analysis_inter_data *)analysis->interData)->depth[count], depthBuf[d], bytes);
1114
-            memset(&((analysis_inter_data *)analysis->interData)->modes[count], modeBuf[d], bytes);
1115
+                depthBuf[d] = 1;
1116
+            memset(&(analysis->interData)->depth[count], depthBuf[d], bytes);
1117
+            memset(&(analysis->interData)->modes[count], modeBuf[d], bytes);
1118
             if (m_param->analysisReuseLevel > 4)
1119
             {
1120
                 if (m_param->scaleFactor && modeBuf[d] == MODE_INTRA && partSize[d] == SIZE_NxN)
1121
-                     partSize[d] = SIZE_2Nx2N;
1122
-                memset(&((analysis_inter_data *)analysis->interData)->partSize[count], partSize[d], bytes);
1123
+                    partSize[d] = SIZE_2Nx2N;
1124
+                memset(&(analysis->interData)->partSize[count], partSize[d], bytes);
1125
                 int numPU = (modeBuf[d] == MODE_INTRA) ? 1 : nbPartsTable[(int)partSize[d]];
1126
                 for (int pu = 0; pu < numPU; pu++)
1127
                 {
1128
                     if (pu) d++;
1129
-                    ((analysis_inter_data *)analysis->interData)->mergeFlag[count + pu] = mergeFlag[d];
1130
+                    (analysis->interData)->mergeFlag[count + pu] = mergeFlag[d];
1131
                     if (m_param->analysisReuseLevel == 10)
1132
                     {
1133
-                        ((analysis_inter_data *)analysis->interData)->interDir[count + pu] = interDir[d];
1134
+                        (analysis->interData)->interDir[count + pu] = interDir[d];
1135
                         for (uint32_t i = 0; i < numDir; i++)
1136
                         {
1137
-                            ((analysis_inter_data *)analysis->interData)->mvpIdx[i][count + pu] = mvpIdx[i][d];
1138
-                            ((analysis_inter_data *)analysis->interData)->refIdx[i][count + pu] = refIdx[i][d];
1139
+                            (analysis->interData)->mvpIdx[i][count + pu] = mvpIdx[i][d];
1140
+                            (analysis->interData)->refIdx[i][count + pu] = refIdx[i][d];
1141
                             if (m_param->scaleFactor)
1142
                             {
1143
                                 mv[i][d].x *= (int16_t)m_param->scaleFactor;
1144
                                 mv[i][d].y *= (int16_t)m_param->scaleFactor;
1145
                             }
1146
-                            memcpy(&((analysis_inter_data *)analysis->interData)->mv[i][count + pu], &mv[i][d], sizeof(MV));
1147
+                            memcpy(&(analysis->interData)->mv[i][count + pu], &mv[i][d], sizeof(MV));
1148
                         }
1149
                     }
1150
                 }
1151
                 if (m_param->analysisReuseLevel == 10 && bIntraInInter)
1152
-                    memset(&((analysis_intra_data *)analysis->intraData)->chromaModes[count], chromaDir[d], bytes);
1153
+                    memset(&(analysis->intraData)->chromaModes[count], chromaDir[d], bytes);
1154
             }
1155
             count += bytes;
1156
         }
1157
@@ -3505,20 +3438,20 @@
1158
             {
1159
                 if (!m_param->scaleFactor)
1160
                 {
1161
-                    X265_FREAD(((analysis_intra_data *)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileIn, intraPic->modes);
1162
+                    X265_FREAD((analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileIn, intraPic->modes);
1163
                 }
1164
                 else
1165
                 {
1166
                     uint8_t *tempLumaBuf = X265_MALLOC(uint8_t, analysis->numCUsInFrame * scaledNumPartition);
1167
                     X265_FREAD(tempLumaBuf, sizeof(uint8_t), analysis->numCUsInFrame * scaledNumPartition, m_analysisFileIn, intraPic->modes);
1168
                     for (uint32_t ctu32Idx = 0, cnt = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++, cnt += factor)
1169
-                        memset(&((analysis_intra_data *)analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor);
1170
+                        memset(&(analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor);
1171
                     X265_FREE(tempLumaBuf);
1172
                 }
1173
             }
1174
         }
1175
         else
1176
-            X265_FREAD(((analysis_inter_data *)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFileIn, interPic->ref);
1177
+            X265_FREAD((analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFileIn, interPic->ref);
1178
 
1179
         consumedBytes += frameRecordSize;
1180
         if (numDir == 1)
1181
@@ -3527,23 +3460,602 @@
1182
 #undef X265_FREAD
1183
 }
1184
 
1185
-void Encoder::readAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, int curPoc, int sliceType)
1186
+void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x265_picture* picIn, int paramBytes, cuLocation cuLoc)
1187
+{
1188
+#define X265_FREAD(val, size, readSize, fileOffset, src)\
1189
+    if (!m_param->bUseAnalysisFile)\
1190
+    {\
1191
+        memcpy(val, src, (size * readSize));\
1192
+    }\
1193
+    else if (fread(val, size, readSize, fileOffset) != readSize)\
1194
+    {\
1195
+        x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data\n");\
1196
+        x265_free_analysis_data(m_param, analysis);\
1197
+        m_aborted = true;\
1198
+        return;\
1199
+    }\
1200
+
1201
+    static uint64_t consumedBytes = 0;
1202
+    static uint64_t totalConsumedBytes = 0;
1203
+    uint32_t depthBytes = 0;
1204
+    if (m_param->bUseAnalysisFile)
1205
+        fseeko(m_analysisFileIn, totalConsumedBytes + paramBytes, SEEK_SET);
1206
+
1207
+    const x265_analysis_data *picData = &(picIn->analysisData);
1208
+    x265_analysis_intra_data *intraPic = picData->intraData;
1209
+    x265_analysis_inter_data *interPic = picData->interData;
1210
+
1211
+    int poc; uint32_t frameRecordSize;
1212
+    X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->frameRecordSize));
1213
+    X265_FREAD(&depthBytes, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->depthBytes));
1214
+    X265_FREAD(&poc, sizeof(int), 1, m_analysisFileIn, &(picData->poc));
1215
+
1216
+    if (m_param->bUseAnalysisFile)
1217
+    {
1218
+        uint64_t currentOffset = totalConsumedBytes;
1219
+
1220
+        /* Seeking to the right frame Record */
1221
+        while (poc != curPoc && !feof(m_analysisFileIn))
1222
+        {
1223
+            currentOffset += frameRecordSize;
1224
+            fseeko(m_analysisFileIn, currentOffset + paramBytes, SEEK_SET);
1225
+            X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->frameRecordSize));
1226
+            X265_FREAD(&depthBytes, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->depthBytes));
1227
+            X265_FREAD(&poc, sizeof(int), 1, m_analysisFileIn, &(picData->poc));
1228
+        }
1229
+        if (poc != curPoc || feof(m_analysisFileIn))
1230
+        {
1231
+            x265_log(NULL, X265_LOG_WARNING, "Error reading analysis data: Cannot find POC %d\n", curPoc);
1232
+            x265_free_analysis_data(m_param, analysis);
1233
+            return;
1234
+        }
1235
+    }
1236
+
1237
+    /* Now arrived at the right frame, read the record */
1238
+    analysis->poc = poc;
1239
+    analysis->frameRecordSize = frameRecordSize;
1240
+    X265_FREAD(&analysis->sliceType, sizeof(int), 1, m_analysisFileIn, &(picData->sliceType));
1241
+    X265_FREAD(&analysis->bScenecut, sizeof(int), 1, m_analysisFileIn, &(picData->bScenecut));
1242
+    X265_FREAD(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFileIn, &(picData->satdCost));
1243
+    X265_FREAD(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFileIn, &(picData->numCUsInFrame));
1244
+    X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFileIn, &(picData->numPartitions));
1245
+    if (m_param->bDisableLookahead)
1246
+    {
1247
+        X265_FREAD(&analysis->numCuInHeight, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->numCuInHeight));
1248
+        X265_FREAD(&analysis->lookahead, sizeof(x265_lookahead_data), 1, m_analysisFileIn, &(picData->lookahead));
1249
+    }
1250
+    int scaledNumPartition = analysis->numPartitions;
1251
+    int factor = 1 << m_param->scaleFactor;
1252
+
1253
+    int numPartitions = analysis->numPartitions;
1254
+    int numCUsInFrame = analysis->numCUsInFrame;
1255
+    int numCuInHeight = analysis->numCuInHeight;
1256
+    /* Allocate memory for scaled resoultion's numPartitions and numCUsInFrame*/
1257
+    analysis->numPartitions = m_param->num4x4Partitions;
1258
+    analysis->numCUsInFrame = cuLoc.heightInCU * cuLoc.widthInCU;
1259
+    analysis->numCuInHeight = cuLoc.heightInCU;
1260
+
1261
+    /* Memory is allocated for inter and intra analysis data based on the slicetype */
1262
+    x265_alloc_analysis_data(m_param, analysis);
1263
+
1264
+    analysis->numPartitions = numPartitions * factor;
1265
+    analysis->numCUsInFrame = numCUsInFrame;
1266
+    analysis->numCuInHeight = numCuInHeight;
1267
+    if (m_param->bDisableLookahead && m_rateControl->m_isVbv)
1268
+    {
1269
+        uint32_t width = analysis->numCUsInFrame / analysis->numCuInHeight;
1270
+        bool skipLastRow = (analysis->numCuInHeight * 2) > cuLoc.heightInCU;
1271
+        bool skipLastCol = (width * 2) > cuLoc.widthInCU;
1272
+        uint32_t *intraVbvCostBuf = NULL, *vbvCostBuf = NULL, *satdForVbvBuf = NULL, *intraSatdForVbvBuf = NULL;
1273
+        intraVbvCostBuf = X265_MALLOC(uint32_t, analysis->numCUsInFrame);
1274
+        vbvCostBuf = X265_MALLOC(uint32_t, analysis->numCUsInFrame);
1275
+        satdForVbvBuf = X265_MALLOC(uint32_t, analysis->numCuInHeight);
1276
+        intraSatdForVbvBuf = X265_MALLOC(uint32_t, analysis->numCuInHeight);
1277
+
1278
+        X265_FREAD(intraVbvCostBuf, sizeof(uint32_t), analysis->numCUsInFrame, m_analysisFileIn, picData->lookahead.intraVbvCost);
1279
+        X265_FREAD(vbvCostBuf, sizeof(uint32_t), analysis->numCUsInFrame, m_analysisFileIn, picData->lookahead.vbvCost);
1280
+        X265_FREAD(satdForVbvBuf, sizeof(uint32_t), analysis->numCuInHeight, m_analysisFileIn, picData->lookahead.satdForVbv);
1281
+        X265_FREAD(intraSatdForVbvBuf, sizeof(uint32_t), analysis->numCuInHeight, m_analysisFileIn, picData->lookahead.intraSatdForVbv);
1282
+
1283
+        int k = 0;
1284
+        for (uint32_t i = 0; i < analysis->numCuInHeight; i++)
1285
+        {
1286
+            analysis->lookahead.satdForVbv[m_param->scaleFactor * i] = satdForVbvBuf[i] * m_param->scaleFactor;
1287
+            analysis->lookahead.intraSatdForVbv[m_param->scaleFactor * i] = intraSatdForVbvBuf[i] * m_param->scaleFactor;
1288
+            if (!(i == (analysis->numCuInHeight - 1) && skipLastRow))
1289
+            {
1290
+                analysis->lookahead.satdForVbv[(m_param->scaleFactor * i) + 1] = satdForVbvBuf[i] * m_param->scaleFactor;
1291
+                analysis->lookahead.intraSatdForVbv[(m_param->scaleFactor * i) + 1] = intraSatdForVbvBuf[i] * m_param->scaleFactor;
1292
+            }
1293
+
1294
+            for (uint32_t j = 0; j < width; j++, k++)
1295
+            {
1296
+                analysis->lookahead.vbvCost[(i * m_param->scaleFactor * cuLoc.widthInCU) + (j * m_param->scaleFactor)] = vbvCostBuf[k];
1297
+                analysis->lookahead.intraVbvCost[(i * m_param->scaleFactor * cuLoc.widthInCU) + (j * m_param->scaleFactor)] = intraVbvCostBuf[k];
1298
+
1299
+                if (!(j == (width - 1) && skipLastCol))
1300
+                {
1301
+                    analysis->lookahead.vbvCost[(i * m_param->scaleFactor * cuLoc.widthInCU) + (j * m_param->scaleFactor) + 1] = vbvCostBuf[k];
1302
+                    analysis->lookahead.intraVbvCost[(i * m_param->scaleFactor * cuLoc.widthInCU) + (j * m_param->scaleFactor) + 1] = intraVbvCostBuf[k];
1303
+                }
1304
+                if (!(i == (analysis->numCuInHeight - 1) && skipLastRow))
1305
+                {
1306
+                    analysis->lookahead.vbvCost[(i * m_param->scaleFactor * cuLoc.widthInCU) + cuLoc.widthInCU + (j * m_param->scaleFactor)] = vbvCostBuf[k];
1307
+                    analysis->lookahead.intraVbvCost[(i * m_param->scaleFactor * cuLoc.widthInCU) + cuLoc.widthInCU + (j * m_param->scaleFactor)] = intraVbvCostBuf[k];
1308
+                    if (!(j == (width - 1) && skipLastCol))
1309
+                    {
1310
+                        analysis->lookahead.vbvCost[(i * m_param->scaleFactor * cuLoc.widthInCU) + cuLoc.widthInCU + (j * m_param->scaleFactor) + 1] = vbvCostBuf[k];
1311
+                        analysis->lookahead.intraVbvCost[(i * m_param->scaleFactor * cuLoc.widthInCU) + cuLoc.widthInCU + (j * m_param->scaleFactor) + 1] = intraVbvCostBuf[k];
1312
+                    }
1313
+                }
1314
+            }
1315
+        }
1316
+        X265_FREE(satdForVbvBuf);
1317
+        X265_FREE(intraSatdForVbvBuf);
1318
+        X265_FREE(intraVbvCostBuf);
1319
+        X265_FREE(vbvCostBuf);
1320
+    }
1321
+
1322
+    if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
1323
+    {
1324
+        if (m_param->analysisReuseLevel < 2)
1325
+            return;
1326
+
1327
+        uint8_t *tempBuf = NULL, *depthBuf = NULL, *modeBuf = NULL, *partSizes = NULL;
1328
+
1329
+        tempBuf = X265_MALLOC(uint8_t, depthBytes * 3);
1330
+        depthBuf = tempBuf;
1331
+        modeBuf = tempBuf + depthBytes;
1332
+        partSizes = tempBuf + 2 * depthBytes;
1333
+
1334
+        X265_FREAD(depthBuf, sizeof(uint8_t), depthBytes, m_analysisFileIn, intraPic->depth);
1335
+        X265_FREAD(modeBuf, sizeof(uint8_t), depthBytes, m_analysisFileIn, intraPic->chromaModes);
1336
+        X265_FREAD(partSizes, sizeof(uint8_t), depthBytes, m_analysisFileIn, intraPic->partSizes);
1337
+
1338
+        uint32_t count = 0;
1339
+        for (uint32_t d = 0; d < depthBytes; d++)
1340
+        {
1341
+            int bytes = analysis->numPartitions >> (depthBuf[d] * 2);
1342
+            int numCTUCopied = 1;
1343
+            if (!depthBuf[d]) //copy data of one 64x64 to four scaled 64x64 CTUs.
1344
+            {
1345
+                bytes /= 4;
1346
+                numCTUCopied = 4;
1347
+            }
1348
+            if (partSizes[d] == SIZE_NxN)
1349
+                partSizes[d] = SIZE_2Nx2N;
1350
+            if ((depthBuf[d] > 1 && m_param->maxCUSize == 64) || (depthBuf[d] && m_param->maxCUSize != 64))
1351
+                depthBuf[d]--;
1352
+
1353
+            for (int numCTU = 0; numCTU < numCTUCopied; numCTU++)
1354
+            {
1355
+                memset(&(analysis->intraData)->depth[count], depthBuf[d], bytes);
1356
+                memset(&(analysis->intraData)->chromaModes[count], modeBuf[d], bytes);
1357
+                memset(&(analysis->intraData)->partSizes[count], partSizes[d], bytes);
1358
+                count += bytes;
1359
+                d += getCUIndex(&cuLoc, &count, bytes, 1);
1360
+            }
1361
+        }
1362
+
1363
+        cuLoc.evenRowIndex = 0;
1364
+        cuLoc.oddRowIndex = m_param->num4x4Partitions * cuLoc.widthInCU;
1365
+        cuLoc.switchCondition = 0;
1366
+        uint8_t *tempLumaBuf = X265_MALLOC(uint8_t, analysis->numCUsInFrame * scaledNumPartition);
1367
+        X265_FREAD(tempLumaBuf, sizeof(uint8_t), analysis->numCUsInFrame * scaledNumPartition, m_analysisFileIn, intraPic->modes);
1368
+        uint32_t cnt = 0;
1369
+        for (uint32_t ctu32Idx = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++)
1370
+        {
1371
+            memset(&(analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor);
1372
+            cnt += factor;
1373
+            ctu32Idx += getCUIndex(&cuLoc, &cnt, factor, 0);
1374
+        }
1375
+        X265_FREE(tempLumaBuf);
1376
+        X265_FREE(tempBuf);
1377
+        consumedBytes += frameRecordSize;
1378
+    }
1379
+
1380
+    else
1381
+    {
1382
+        uint32_t numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2;
1383
+        uint32_t numPlanes = m_param->internalCsp == X265_CSP_I400 ? 1 : 3;
1384
+        X265_FREAD((WeightParam*)analysis->wt, sizeof(WeightParam), numPlanes * numDir, m_analysisFileIn, (picIn->analysisData.wt));
1385
+        if (m_param->analysisReuseLevel < 2)
1386
+            return;
1387
+
1388
+        uint8_t *tempBuf = NULL, *depthBuf = NULL, *modeBuf = NULL, *partSize = NULL, *mergeFlag = NULL;
1389
+        uint8_t *interDir = NULL, *chromaDir = NULL, *mvpIdx[2];
1390
+        MV* mv[2];
1391
+        int8_t* refIdx[2];
1392
+
1393
+        int numBuf = m_param->analysisReuseLevel > 4 ? 4 : 2;
1394
+        bool bIntraInInter = false;
1395
+        if (m_param->analysisReuseLevel == 10)
1396
+        {
1397
+            numBuf++;
1398
+            bIntraInInter = (analysis->sliceType == X265_TYPE_P || m_param->bIntraInBFrames);
1399
+            if (bIntraInInter) numBuf++;
1400
+        }
1401
+
1402
+        tempBuf = X265_MALLOC(uint8_t, depthBytes * numBuf);
1403
+        depthBuf = tempBuf;
1404
+        modeBuf = tempBuf + depthBytes;
1405
+
1406
+        X265_FREAD(depthBuf, sizeof(uint8_t), depthBytes, m_analysisFileIn, interPic->depth);
1407
+        X265_FREAD(modeBuf, sizeof(uint8_t), depthBytes, m_analysisFileIn, interPic->modes);
1408
+        if (m_param->analysisReuseLevel > 4)
1409
+        {
1410
+            partSize = modeBuf + depthBytes;
1411
+            mergeFlag = partSize + depthBytes;
1412
+            X265_FREAD(partSize, sizeof(uint8_t), depthBytes, m_analysisFileIn, interPic->partSize);
1413
+            X265_FREAD(mergeFlag, sizeof(uint8_t), depthBytes, m_analysisFileIn, interPic->mergeFlag);
1414
+            if (m_param->analysisReuseLevel == 10)
1415
+            {
1416
+                interDir = mergeFlag + depthBytes;
1417
+                X265_FREAD(interDir, sizeof(uint8_t), depthBytes, m_analysisFileIn, interPic->interDir);
1418
+                if (bIntraInInter)
1419
+                {
1420
+                    chromaDir = interDir + depthBytes;
1421
+                    X265_FREAD(chromaDir, sizeof(uint8_t), depthBytes, m_analysisFileIn, intraPic->chromaModes);
1422
+                }
1423
+                for (uint32_t i = 0; i < numDir; i++)
1424
+                {
1425
+                    mvpIdx[i] = X265_MALLOC(uint8_t, depthBytes);
1426
+                    refIdx[i] = X265_MALLOC(int8_t, depthBytes);
1427
+                    mv[i] = X265_MALLOC(MV, depthBytes);
1428
+                    X265_FREAD(mvpIdx[i], sizeof(uint8_t), depthBytes, m_analysisFileIn, interPic->mvpIdx[i]);
1429
+                    X265_FREAD(refIdx[i], sizeof(int8_t), depthBytes, m_analysisFileIn, interPic->refIdx[i]);
1430
+                    X265_FREAD(mv[i], sizeof(MV), depthBytes, m_analysisFileIn, interPic->mv[i]);
1431
+                }
1432
+            }
1433
+        }
1434
+
1435
+        uint32_t count = 0;
1436
+        cuLoc.switchCondition = 0;
1437
+        for (uint32_t d = 0; d < depthBytes; d++)
1438
+        {
1439
+            int bytes = analysis->numPartitions >> (depthBuf[d] * 2);
1440
+            bool isScaledMaxCUSize = false;
1441
+            int numCTUCopied = 1;
1442
+            int writeDepth = depthBuf[d];
1443
+            if (!depthBuf[d]) //copy data of one 64x64 to four scaled 64x64 CTUs.
1444
+            {
1445
+                isScaledMaxCUSize = true;
1446
+                bytes /= 4;
1447
+                numCTUCopied = 4;
1448
+            }
1449
+            if ((modeBuf[d] != MODE_INTRA && depthBuf[d] != 0) || (modeBuf[d] == MODE_INTRA && depthBuf[d] > 1))
1450
+                writeDepth--;
1451
+
1452
+            for (int numCTU = 0; numCTU < numCTUCopied; numCTU++)
1453
+            {
1454
+                memset(&(analysis->interData)->depth[count], writeDepth, bytes);
1455
+                memset(&(analysis->interData)->modes[count], modeBuf[d], bytes);
1456
+                if (m_param->analysisReuseLevel == 10 && bIntraInInter)
1457
+                    memset(&(analysis->intraData)->chromaModes[count], chromaDir[d], bytes);
1458
+
1459
+                if (m_param->analysisReuseLevel > 4)
1460
+                {
1461
+                    puOrientation puOrient;
1462
+                    puOrient.init();
1463
+                    if (modeBuf[d] == MODE_INTRA && partSize[d] == SIZE_NxN)
1464
+                        partSize[d] = SIZE_2Nx2N;
1465
+                    int partitionSize = partSize[d];
1466
+                    if (isScaledMaxCUSize && partSize[d] != SIZE_2Nx2N)
1467
+                        partitionSize = getPuShape(&puOrient, partSize[d], numCTU);
1468
+                    memset(&(analysis->interData)->partSize[count], partitionSize, bytes);
1469
+                    int numPU = (modeBuf[d] == MODE_INTRA) ? 1 : nbPartsTable[(int)partSize[d]];
1470
+                    for (int pu = 0; pu < numPU; pu++)
1471
+                    {
1472
+                        if (!isScaledMaxCUSize && pu)
1473
+                            d++;
1474
+                        int restoreD = d;
1475
+                        /* Adjust d value when the current CTU takes data from 2nd PU */
1476
+                        if (puOrient.isRect || (puOrient.isAmp && partitionSize == SIZE_2Nx2N))
1477
+                        {
1478
+                            if ((numCTU > 1 && !puOrient.isVert) || ((numCTU % 2 == 1) && puOrient.isVert))
1479
+                                d++;
1480
+                        }
1481
+                        if (puOrient.isAmp && pu)
1482
+                            d++;
1483
+
1484
+                        (analysis->interData)->mergeFlag[count + pu] = mergeFlag[d];
1485
+                        if (m_param->analysisReuseLevel == 10)
1486
+                        {
1487
+                            (analysis->interData)->interDir[count + pu] = interDir[d];
1488
+                            MV mvCopy[2];
1489
+                            for (uint32_t i = 0; i < numDir; i++)
1490
+                            {
1491
+                                (analysis->interData)->mvpIdx[i][count + pu] = mvpIdx[i][d];
1492
+                                (analysis->interData)->refIdx[i][count + pu] = refIdx[i][d];
1493
+                                mvCopy[i].x = mv[i][d].x * (int16_t)m_param->scaleFactor;
1494
+                                mvCopy[i].y = mv[i][d].y * (int16_t)m_param->scaleFactor;
1495
+                                memcpy(&(analysis->interData)->mv[i][count + pu], &mvCopy[i], sizeof(MV));
1496
+                            }
1497
+                        }
1498
+                        d = restoreD; // Restore d value after copying each of the 4 64x64 CTUs
1499
+
1500
+                        if (isScaledMaxCUSize && (puOrient.isRect || puOrient.isAmp))
1501
+                        {
1502
+                            /* Skip PU index when current CTU is a 2Nx2N */
1503
+                            if (partitionSize == SIZE_2Nx2N)
1504
+                                pu++;
1505
+                            /* Adjust d after completion of all 4 CTU copies */
1506
+                            if (numCTU == 3 && (pu == (numPU - 1)))
1507
+                                d++;
1508
+                        }
1509
+                    }
1510
+                }
1511
+                count += bytes;
1512
+                d += getCUIndex(&cuLoc, &count, bytes, 1);
1513
+            }
1514
+        }
1515
+
1516
+        X265_FREE(tempBuf);
1517
+
1518
+        if (m_param->analysisReuseLevel == 10)
1519
+        {
1520
+            for (uint32_t i = 0; i < numDir; i++)
1521
+            {
1522
+                X265_FREE(mvpIdx[i]);
1523
+                X265_FREE(refIdx[i]);
1524
+                X265_FREE(mv[i]);
1525
+            }
1526
+            if (bIntraInInter)
1527
+            {
1528
+                cuLoc.evenRowIndex = 0;
1529
+                cuLoc.oddRowIndex = m_param->num4x4Partitions * cuLoc.widthInCU;
1530
+                cuLoc.switchCondition = 0;
1531
+                uint8_t *tempLumaBuf = X265_MALLOC(uint8_t, analysis->numCUsInFrame * scaledNumPartition);
1532
+                X265_FREAD(tempLumaBuf, sizeof(uint8_t), analysis->numCUsInFrame * scaledNumPartition, m_analysisFileIn, intraPic->modes);
1533
+                uint32_t cnt = 0;
1534
+                for (uint32_t ctu32Idx = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++)
1535
+                {
1536
+                    memset(&(analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor);
1537
+                    cnt += factor;
1538
+                    ctu32Idx += getCUIndex(&cuLoc, &cnt, factor, 0);
1539
+                }
1540
+                X265_FREE(tempLumaBuf);
1541
+            }
1542
+        }
1543
+        else
1544
+            X265_FREAD((analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFileIn, interPic->ref);
1545
+
1546
+        consumedBytes += frameRecordSize;
1547
+        if (numDir == 1)
1548
+            totalConsumedBytes = consumedBytes;
1549
+    }
1550
+
1551
+    /* Restore to the current encode's numPartitions and numCUsInFrame */
1552
+    analysis->numPartitions = m_param->num4x4Partitions;
1553
+    analysis->numCUsInFrame = cuLoc.heightInCU * cuLoc.widthInCU;
1554
+    analysis->numCuInHeight = cuLoc.heightInCU;
1555
+#undef X265_FREAD
1556
+}
1557
+
1558
+
1559
+int Encoder::validateAnalysisData(x265_analysis_data* analysis, int writeFlag)
1560
+{
1561
+#define X265_PARAM_VALIDATE(analysisParam, size, bytes, param, errorMsg)\
1562
+    if(!writeFlag)\
1563
+    {\
1564
+        fileOffset = m_analysisFileIn;\
1565
+        if ((!m_param->bUseAnalysisFile && analysisParam != (int)*param) || \
1566
+            (m_param->bUseAnalysisFile && (fread(&readValue, size, bytes, fileOffset) != bytes || (readValue != (int)*param))))\
1567
+        {\
1568
+            x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data. Incompatible option : <%s> \n", #errorMsg);\
1569
+            m_aborted = true;\
1570
+            return -1;\
1571
+        }\
1572
+    }\
1573
+    if(writeFlag)\
1574
+    {\
1575
+        fileOffset = m_analysisFileOut;\
1576
+        if(!m_param->bUseAnalysisFile)\
1577
+            analysisParam = *param;\
1578
+        else if(fwrite(param, size, bytes, fileOffset) < bytes)\
1579
+        {\
1580
+            x265_log(NULL, X265_LOG_ERROR, "Error writing analysis data\n"); \
1581
+            m_aborted = true;\
1582
+            return -1; \
1583
+        }\
1584
+    }\
1585
+    count++;
1586
+
1587
+#define X265_FREAD(val, size, readSize, fileOffset, src)\
1588
+    if (!m_param->bUseAnalysisFile)\
1589
+    {\
1590
+        memcpy(val, src, (size * readSize));\
1591
+    }\
1592
+    else if (fread(val, size, readSize, fileOffset) != readSize)\
1593
+    {\
1594
+        x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data\n");\
1595
+        m_aborted = true;\
1596
+        return -1;\
1597
+    }\
1598
+    count++;
1599
+
1600
+    x265_analysis_validate *saveParam = &analysis->saveParam;
1601
+    FILE*     fileOffset = NULL;
1602
+    int       readValue = 0;
1603
+    int       count = 0;
1604
+
1605
+    X265_PARAM_VALIDATE(saveParam->intraRefresh, sizeof(int), 1, &m_param->bIntraRefresh, intra-refresh);
1606
+    X265_PARAM_VALIDATE(saveParam->maxNumReferences, sizeof(int), 1, &m_param->maxNumReferences, ref);
1607
+    X265_PARAM_VALIDATE(saveParam->analysisReuseLevel, sizeof(int), 1, &m_param->analysisReuseLevel, analysis-reuse-level);
1608
+    X265_PARAM_VALIDATE(saveParam->keyframeMax, sizeof(int), 1, &m_param->keyframeMax, keyint);
1609
+    X265_PARAM_VALIDATE(saveParam->keyframeMin, sizeof(int), 1, &m_param->keyframeMin, min-keyint);
1610
+    X265_PARAM_VALIDATE(saveParam->openGOP, sizeof(int), 1, &m_param->bOpenGOP, open-gop);
1611
+    X265_PARAM_VALIDATE(saveParam->bframes, sizeof(int), 1, &m_param->bframes, bframes);
1612
+    X265_PARAM_VALIDATE(saveParam->bPyramid, sizeof(int), 1, &m_param->bBPyramid, bPyramid);
1613
+    X265_PARAM_VALIDATE(saveParam->minCUSize, sizeof(int), 1, &m_param->minCUSize, min - cu - size);
1614
+    X265_PARAM_VALIDATE(saveParam->lookaheadDepth, sizeof(int), 1, &m_param->lookaheadDepth, rc - lookahead);
1615
+    X265_PARAM_VALIDATE(saveParam->chunkStart, sizeof(int), 1, &m_param->chunkStart, chunk-start);
1616
+    X265_PARAM_VALIDATE(saveParam->chunkEnd, sizeof(int), 1, &m_param->chunkEnd, chunk-end);
1617
+
1618
+    int sourceHeight, sourceWidth;
1619
+    if (writeFlag)
1620
+    {
1621
+        sourceHeight = m_param->sourceHeight - m_conformanceWindow.bottomOffset;
1622
+        sourceWidth = m_param->sourceWidth - m_conformanceWindow.rightOffset;
1623
+        X265_PARAM_VALIDATE(saveParam->sourceWidth, sizeof(int), 1, &sourceWidth, res-width);
1624
+        X265_PARAM_VALIDATE(saveParam->sourceHeight, sizeof(int), 1, &sourceHeight, res-height);
1625
+        X265_PARAM_VALIDATE(saveParam->maxCUSize, sizeof(int), 1, &m_param->maxCUSize, ctu);
1626
+    }
1627
+    else
1628
+    {
1629
+        fileOffset = m_analysisFileIn;
1630
+        bool error = false;
1631
+        int curSourceHeight = m_param->sourceHeight - m_conformanceWindow.bottomOffset;
1632
+        int curSourceWidth = m_param->sourceWidth - m_conformanceWindow.rightOffset;
1633
+
1634
+        X265_FREAD(&sourceWidth, sizeof(int), 1, m_analysisFileIn, &(saveParam->sourceWidth));
1635
+        X265_FREAD(&sourceHeight, sizeof(int), 1, m_analysisFileIn, &(saveParam->sourceHeight));
1636
+        X265_FREAD(&readValue, sizeof(int), 1, m_analysisFileIn, &(saveParam->maxCUSize));
1637
+
1638
+        bool isScaledRes = (2 * sourceHeight == curSourceHeight) && (2 * sourceWidth == curSourceWidth);
1639
+        if (!isScaledRes && (sourceHeight != curSourceHeight || sourceWidth != curSourceWidth 
1640
+                            || readValue != (int)m_param->maxCUSize || m_param->scaleFactor))
1641
+            error = true;
1642
+        else if (isScaledRes && !m_param->scaleFactor)
1643
+            error = true;
1644
+        else if (isScaledRes && (int)m_param->maxCUSize == readValue)
1645
+            m_saveCTUSize = 1;
1646
+        else if (isScaledRes && (g_log2Size[m_param->maxCUSize] - g_log2Size[readValue]) != 1)
1647
+            error = true;
1648
+
1649
+        if (error)
1650
+        {
1651
+            x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data. Incompatible option : <input-res / scale-factor / ctu> \n");
1652
+            m_aborted = true;
1653
+            return -1;
1654
+        }
1655
+    }
1656
+    return (count * sizeof(int));
1657
+
1658
+#undef X265_FREAD
1659
+#undef X265_PARAM_VALIDATE
1660
+}
1661
+
1662
+/* Toggle between two consecutive CTU rows. The save's CTU is copied
1663
+twice consecutively in the first and second CTU row of load*/
1664
+
1665
+int Encoder::getCUIndex(cuLocation* cuLoc, uint32_t* count, int bytes, int flag)
1666
+{
1667
+    int index = 0;
1668
+    cuLoc->switchCondition += bytes;
1669
+    int isBoundaryW = (*count % (m_param->num4x4Partitions * cuLoc->widthInCU) == 0);
1670
+
1671
+    /* Width boundary case :
1672
+    Skip to appropriate index when out of boundary cases occur
1673
+    Out of boundary may occur when the out of bound pixels along
1674
+    the width in low resoultion is greater than half of the maxCUSize */
1675
+    if (cuLoc->skipWidth && isBoundaryW)
1676
+    {
1677
+        if (flag)
1678
+            index++;
1679
+        else
1680
+        {
1681
+            /* Number of 4x4 blocks in out of bound region */
1682
+            int outOfBound = m_param->maxCUSize / 2;
1683
+            uint32_t sum = (uint32_t)pow((outOfBound >> 2), 2);
1684
+            index += sum;
1685
+        }
1686
+        cuLoc->switchCondition += m_param->num4x4Partitions;
1687
+    }
1688
+
1689
+    /* Completed writing 2 CTUs - move to the last remembered index of the next CTU row*/
1690
+    if (cuLoc->switchCondition == 2 * m_param->num4x4Partitions)
1691
+    {
1692
+        if (isBoundaryW)
1693
+            cuLoc->evenRowIndex = *count + (m_param->num4x4Partitions * cuLoc->widthInCU); // end of row - skip to the next even row
1694
+        else
1695
+            cuLoc->evenRowIndex = *count;
1696
+        *count = cuLoc->oddRowIndex;
1697
+
1698
+        /* Height boundary case :
1699
+        Skip to appropriate index when out of boundary cases occur
1700
+        Out of boundary may occur when the out of bound pixels along
1701
+        the height in low resoultion is greater than half of the maxCUSize */
1702
+        int isBoundaryH = (*count >= (m_param->num4x4Partitions * cuLoc->heightInCU * cuLoc->widthInCU));
1703
+        if (cuLoc->skipHeight && isBoundaryH)
1704
+        {
1705
+            if (flag)
1706
+                index += 2;
1707
+            else
1708
+            {
1709
+                int outOfBound = m_param->maxCUSize / 2;
1710
+                uint32_t sum = (uint32_t)(2 * pow((abs(outOfBound) >> 2), 2));
1711
+                index += sum;
1712
+            }
1713
+            *count = cuLoc->evenRowIndex;
1714
+            cuLoc->switchCondition = 0;
1715
+        }
1716
+    }
1717
+    /* Completed writing 4 CTUs - move to the last remembered index of
1718
+    the previous CTU row to copy the next save CTU's data*/
1719
+    else if (cuLoc->switchCondition == 4 * m_param->num4x4Partitions)
1720
+    {
1721
+        if (isBoundaryW)
1722
+            cuLoc->oddRowIndex = *count + (m_param->num4x4Partitions * cuLoc->widthInCU); // end of row - skip to the next odd row
1723
+        else
1724
+            cuLoc->oddRowIndex = *count;
1725
+        *count = cuLoc->evenRowIndex;
1726
+        cuLoc->switchCondition = 0;
1727
+    }
1728
+    return index;
1729
+}
1730
+
1731
+/*      save                        load
1732
+                       CTU0    CTU1    CTU2    CTU3
1733
+        2NxN          2Nx2N   2Nx2N   2Nx2N   2Nx2N
1734
+        NX2N          2Nx2N   2Nx2N   2Nx2N   2Nx2N
1735
+        2NxnU          2NxN    2NxN   2Nx2N   2Nx2N
1736
+        2NxnD         2Nx2N   2Nx2N    2NxN    2NxN
1737
+        nLx2N          Nx2N   2Nx2N    Nx2N   2Nx2N
1738
+        nRx2N         2Nx2N    Nx2N    2Nx2N   Nx2N
1739
+*/
1740
+int Encoder::getPuShape(puOrientation* puOrient, int partSize, int numCTU)
1741
+{
1742
+    puOrient->isRect = true;
1743
+    if (partSize == SIZE_Nx2N)
1744
+        puOrient->isVert = true;
1745
+    if (partSize >= SIZE_2NxnU) // All AMP modes
1746
+    {
1747
+        puOrient->isAmp = true;
1748
+        puOrient->isRect = false;
1749
+        if (partSize == SIZE_2NxnD && numCTU > 1)
1750
+            return SIZE_2NxN;
1751
+        else if (partSize == SIZE_2NxnU && numCTU < 2)
1752
+            return SIZE_2NxN;
1753
+        else if (partSize == SIZE_nLx2N)
1754
+        {
1755
+            puOrient->isVert = true;
1756
+            if (!(numCTU % 2))
1757
+                return SIZE_Nx2N;
1758
+        }
1759
+        else if (partSize == SIZE_nRx2N)
1760
+        {
1761
+            puOrient->isVert = true;
1762
+            if (numCTU % 2)
1763
+                return SIZE_Nx2N;
1764
+        }
1765
+    }
1766
+    return SIZE_2Nx2N;
1767
+}
1768
+
1769
+void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, int sliceType)
1770
 {
1771
 
1772
 #define X265_FREAD(val, size, readSize, fileOffset)\
1773
     if (fread(val, size, readSize, fileOffset) != readSize)\
1774
     {\
1775
     x265_log(NULL, X265_LOG_ERROR, "Error reading analysis 2 pass data\n"); \
1776
-    freeAnalysis2Pass(analysis2Pass, sliceType); \
1777
+    x265_alloc_analysis_data(m_param, analysis); \
1778
     m_aborted = true; \
1779
     return; \
1780
 }\
1781
 
1782
     uint32_t depthBytes = 0;
1783
-    uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
1784
-    uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
1785
-    uint32_t numCUsInFrame = widthInCU * heightInCU;
1786
-
1787
     int poc; uint32_t frameRecordSize;
1788
     X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFileIn);
1789
     X265_FREAD(&depthBytes, sizeof(uint32_t), 1, m_analysisFileIn);
1790
@@ -3552,11 +4064,11 @@
1791
     if (poc != curPoc || feof(m_analysisFileIn))
1792
     {
1793
         x265_log(NULL, X265_LOG_WARNING, "Error reading analysis 2 pass data: Cannot find POC %d\n", curPoc);
1794
-        freeAnalysis2Pass(analysis2Pass, sliceType);
1795
+        x265_free_analysis_data(m_param, analysis);
1796
         return;
1797
     }
1798
     /* Now arrived at the right frame, read the record */
1799
-    analysis2Pass->frameRecordSize = frameRecordSize;
1800
+    analysis->frameRecordSize = frameRecordSize;
1801
     uint8_t* tempBuf = NULL, *depthBuf = NULL;
1802
     sse_t *tempdistBuf = NULL, *distortionBuf = NULL;
1803
     tempBuf = X265_MALLOC(uint8_t, depthBytes);
1804
@@ -3565,76 +4077,85 @@
1805
     X265_FREAD(tempdistBuf, sizeof(sse_t), depthBytes, m_analysisFileIn);
1806
     depthBuf = tempBuf;
1807
     distortionBuf = tempdistBuf;
1808
-    analysis2PassFrameData* analysisFrameData = (analysis2PassFrameData*)analysis2Pass->analysisFramedata;
1809
+    x265_analysis_data *analysisData = (x265_analysis_data*)analysis;
1810
+    x265_analysis_intra_data *intraData = analysisData->intraData;
1811
+    x265_analysis_inter_data *interData = analysisData->interData;
1812
+    x265_analysis_distortion_data *distortionData = analysisData->distortionData;
1813
+
1814
     size_t count = 0;
1815
     uint32_t ctuCount = 0;
1816
     double sum = 0, sqrSum = 0;
1817
     for (uint32_t d = 0; d < depthBytes; d++)
1818
     {
1819
-        int bytes = m_param->num4x4Partitions >> (depthBuf[d] * 2);
1820
-        memset(&analysisFrameData->depth[count], depthBuf[d], bytes);
1821
-        analysisFrameData->distortion[count] = distortionBuf[d];
1822
-        analysisFrameData->ctuDistortion[ctuCount] += analysisFrameData->distortion[count];
1823
+        int bytes = analysis->numPartitions >> (depthBuf[d] * 2);
1824
+        if (IS_X265_TYPE_I(sliceType))
1825
+            memset(&intraData->depth[count], depthBuf[d], bytes);
1826
+        else
1827
+            memset(&interData->depth[count], depthBuf[d], bytes);
1828
+        distortionData->distortion[count] = distortionBuf[d];
1829
+        distortionData->ctuDistortion[ctuCount] += distortionData->distortion[count];
1830
         count += bytes;
1831
-        if ((count % (unsigned)m_param->num4x4Partitions) == 0)
1832
+        if ((count % (unsigned)analysis->numPartitions) == 0)
1833
         {
1834
-            analysisFrameData->scaledDistortion[ctuCount] = X265_LOG2(X265_MAX(analysisFrameData->ctuDistortion[ctuCount], 1));
1835
-            sum += analysisFrameData->scaledDistortion[ctuCount];
1836
-            sqrSum += analysisFrameData->scaledDistortion[ctuCount] * analysisFrameData->scaledDistortion[ctuCount];
1837
+            distortionData->scaledDistortion[ctuCount] = X265_LOG2(X265_MAX(distortionData->ctuDistortion[ctuCount], 1));
1838
+            sum += distortionData->scaledDistortion[ctuCount];
1839
+            sqrSum += distortionData->scaledDistortion[ctuCount] * distortionData->scaledDistortion[ctuCount];
1840
             ctuCount++;
1841
         }
1842
     }
1843
-    double avg = sum / numCUsInFrame;
1844
-    analysisFrameData->sdDistortion = pow(((sqrSum / numCUsInFrame) - (avg * avg)), 0.5);
1845
-    analysisFrameData->averageDistortion = avg;
1846
-    analysisFrameData->highDistortionCtuCount = analysisFrameData->lowDistortionCtuCount = 0;
1847
-    for (uint32_t i = 0; i < numCUsInFrame; ++i)
1848
-    {
1849
-        analysisFrameData->threshold[i] = analysisFrameData->scaledDistortion[i] / analysisFrameData->averageDistortion;
1850
-        analysisFrameData->offset[i] = (analysisFrameData->averageDistortion - analysisFrameData->scaledDistortion[i]) / analysisFrameData->sdDistortion;
1851
-        if (analysisFrameData->threshold[i] < 0.9 && analysisFrameData->offset[i] >= 1)
1852
-            analysisFrameData->lowDistortionCtuCount++;
1853
-        else if (analysisFrameData->threshold[i] > 1.1 && analysisFrameData->offset[i] <= -1)
1854
-            analysisFrameData->highDistortionCtuCount++;
1855
+    double avg = sum / analysis->numCUsInFrame;
1856
+    distortionData->sdDistortion = pow(((sqrSum / analysis->numCUsInFrame) - (avg * avg)), 0.5);
1857
+    distortionData->averageDistortion = avg;
1858
+    distortionData->highDistortionCtuCount = distortionData->lowDistortionCtuCount = 0;
1859
+    for (uint32_t i = 0; i < analysis->numCUsInFrame; ++i)
1860
+    {
1861
+        distortionData->threshold[i] = distortionData->scaledDistortion[i] / distortionData->averageDistortion;
1862
+        distortionData->offset[i] = (distortionData->averageDistortion - distortionData->scaledDistortion[i]) / distortionData->sdDistortion;
1863
+        if (distortionData->threshold[i] < 0.9 && distortionData->offset[i] >= 1)
1864
+            distortionData->lowDistortionCtuCount++;
1865
+        else if (distortionData->threshold[i] > 1.1 && distortionData->offset[i] <= -1)
1866
+            distortionData->highDistortionCtuCount++;
1867
     }
1868
     if (!IS_X265_TYPE_I(sliceType))
1869
     {
1870
         MV *tempMVBuf[2], *MVBuf[2];
1871
-        int32_t *tempRefBuf[2], *refBuf[2];
1872
-        int *tempMvpBuf[2], *mvpBuf[2];
1873
+        int32_t *tempRefBuf, *refBuf;
1874
+        uint8_t *tempMvpBuf[2], *mvpBuf[2];
1875
         uint8_t* tempModeBuf = NULL, *modeBuf = NULL;
1876
-
1877
         int numDir = sliceType == X265_TYPE_P ? 1 : 2;
1878
+        tempRefBuf = X265_MALLOC(int32_t, numDir * depthBytes);
1879
+
1880
         for (int i = 0; i < numDir; i++)
1881
         {
1882
             tempMVBuf[i] = X265_MALLOC(MV, depthBytes);
1883
             X265_FREAD(tempMVBuf[i], sizeof(MV), depthBytes, m_analysisFileIn);
1884
             MVBuf[i] = tempMVBuf[i];
1885
-            tempMvpBuf[i] = X265_MALLOC(int, depthBytes);
1886
-            X265_FREAD(tempMvpBuf[i], sizeof(int), depthBytes, m_analysisFileIn);
1887
+            tempMvpBuf[i] = X265_MALLOC(uint8_t, depthBytes);
1888
+            X265_FREAD(tempMvpBuf[i], sizeof(uint8_t), depthBytes, m_analysisFileIn);
1889
             mvpBuf[i] = tempMvpBuf[i];
1890
-            tempRefBuf[i] = X265_MALLOC(int32_t, depthBytes);
1891
-            X265_FREAD(tempRefBuf[i], sizeof(int32_t), depthBytes, m_analysisFileIn);
1892
-            refBuf[i] = tempRefBuf[i];
1893
+            X265_FREAD(&tempRefBuf[i*depthBytes], sizeof(int32_t), depthBytes, m_analysisFileIn);
1894
         }
1895
+        refBuf = tempRefBuf;
1896
         tempModeBuf = X265_MALLOC(uint8_t, depthBytes);
1897
         X265_FREAD(tempModeBuf, sizeof(uint8_t), depthBytes, m_analysisFileIn);
1898
         modeBuf = tempModeBuf;
1899
-
1900
+        
1901
         count = 0;
1902
+
1903
         for (uint32_t d = 0; d < depthBytes; d++)
1904
         {
1905
-            size_t bytes = m_param->num4x4Partitions >> (depthBuf[d] * 2);
1906
+            size_t bytes = analysis->numPartitions >> (depthBuf[d] * 2);
1907
             for (int i = 0; i < numDir; i++)
1908
             {
1909
+                int32_t* ref = &(analysis->interData)->ref[i * analysis->numPartitions * analysis->numCUsInFrame];
1910
                 for (size_t j = count, k = 0; k < bytes; j++, k++)
1911
                 {
1912
-                    memcpy(&((analysis2PassFrameData*)analysis2Pass->analysisFramedata)->m_mv[i][j], MVBuf[i] + d, sizeof(MV));
1913
-                    memcpy(&((analysis2PassFrameData*)analysis2Pass->analysisFramedata)->mvpIdx[i][j], mvpBuf[i] + d, sizeof(int));
1914
-                    memcpy(&((analysis2PassFrameData*)analysis2Pass->analysisFramedata)->ref[i][j], refBuf[i] + d, sizeof(int32_t));
1915
+                    memcpy(&(analysis->interData)->mv[i][j], MVBuf[i] + d, sizeof(MV));
1916
+                    memcpy(&(analysis->interData)->mvpIdx[i][j], mvpBuf[i] + d, sizeof(uint8_t));
1917
+                    memcpy(&ref[j], refBuf + (i * depthBytes) + d, sizeof(int32_t));
1918
                 }
1919
             }
1920
-            memset(&((analysis2PassFrameData *)analysis2Pass->analysisFramedata)->modes[count], modeBuf[d], bytes);
1921
+            memset(&(analysis->interData)->modes[count], modeBuf[d], bytes);
1922
             count += bytes;
1923
         }
1924
 
1925
@@ -3642,8 +4163,8 @@
1926
         {
1927
             X265_FREE(tempMVBuf[i]);
1928
             X265_FREE(tempMvpBuf[i]);
1929
-            X265_FREE(tempRefBuf[i]);
1930
         }
1931
+        X265_FREE(tempRefBuf);
1932
         X265_FREE(tempModeBuf);
1933
     }
1934
     X265_FREE(tempBuf);
1935
@@ -3659,7 +4180,7 @@
1936
     if (fwrite(val, size, writeSize, fileOffset) < writeSize)\
1937
     {\
1938
         x265_log(NULL, X265_LOG_ERROR, "Error writing analysis data\n");\
1939
-        freeAnalysis(analysis);\
1940
+        x265_free_analysis_data(m_param, analysis);\
1941
         m_aborted = true;\
1942
         return;\
1943
     }\
1944
@@ -3668,6 +4189,15 @@
1945
     uint32_t numDir, numPlanes;
1946
     bool bIntraInInter = false;
1947
 
1948
+    if (!analysis->poc)
1949
+    {
1950
+        if (validateAnalysisData(analysis, 1) == -1)
1951
+        {
1952
+            m_aborted = true;
1953
+            return;
1954
+        }
1955
+    }
1956
+
1957
     /* calculate frameRecordSize */
1958
     analysis->frameRecordSize = sizeof(analysis->frameRecordSize) + sizeof(depthBytes) + sizeof(analysis->poc) + sizeof(analysis->sliceType) +
1959
                       sizeof(analysis->numCUsInFrame) + sizeof(analysis->numPartitions) + sizeof(analysis->bScenecut) + sizeof(analysis->satdCost);
1960
@@ -3689,7 +4219,7 @@
1961
                 uint8_t partSize = 0;
1962
 
1963
                 CUData* ctu = curEncData.getPicCTU(cuAddr);
1964
-                analysis_intra_data* intraDataCTU = (analysis_intra_data*)analysis->intraData;
1965
+                x265_analysis_intra_data* intraDataCTU = analysis->intraData;
1966
 
1967
                 for (uint32_t absPartIdx = 0; absPartIdx < ctu->m_numPartitions; depthBytes++)
1968
                 {
1969
@@ -3717,8 +4247,8 @@
1970
                 uint8_t partSize = 0;
1971
 
1972
                 CUData* ctu = curEncData.getPicCTU(cuAddr);
1973
-                analysis_inter_data* interDataCTU = (analysis_inter_data*)analysis->interData;
1974
-                analysis_intra_data* intraDataCTU = (analysis_intra_data*)analysis->intraData;
1975
+                x265_analysis_inter_data* interDataCTU = analysis->interData;
1976
+                x265_analysis_intra_data* intraDataCTU = analysis->intraData;
1977
 
1978
                 for (uint32_t absPartIdx = 0; absPartIdx < ctu->m_numPartitions; depthBytes++)
1979
                 {
1980
@@ -3751,7 +4281,7 @@
1981
                                 {
1982
                                     interDataCTU->mvpIdx[dir][depthBytes] = ctu->m_mvpIdx[dir][puabsPartIdx];
1983
                                     interDataCTU->refIdx[dir][depthBytes] = ctu->m_refIdx[dir][puabsPartIdx];
1984
-                                    interDataCTU->mv[dir][depthBytes] = ctu->m_mv[dir][puabsPartIdx];
1985
+                                    interDataCTU->mv[dir][depthBytes].word = ctu->m_mv[dir][puabsPartIdx].word;
1986
                                 }
1987
                             }
1988
                         }
1989
@@ -3809,58 +4339,58 @@
1990
 
1991
     if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
1992
     {
1993
-        X265_FWRITE(((analysis_intra_data*)analysis->intraData)->depth, sizeof(uint8_t), depthBytes, m_analysisFileOut);
1994
-        X265_FWRITE(((analysis_intra_data*)analysis->intraData)->chromaModes, sizeof(uint8_t), depthBytes, m_analysisFileOut);
1995
-        X265_FWRITE(((analysis_intra_data*)analysis->intraData)->partSizes, sizeof(char), depthBytes, m_analysisFileOut);
1996
-        X265_FWRITE(((analysis_intra_data*)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileOut);
1997
+        X265_FWRITE((analysis->intraData)->depth, sizeof(uint8_t), depthBytes, m_analysisFileOut);
1998
+        X265_FWRITE((analysis->intraData)->chromaModes, sizeof(uint8_t), depthBytes, m_analysisFileOut);
1999
+        X265_FWRITE((analysis->intraData)->partSizes, sizeof(char), depthBytes, m_analysisFileOut);
2000
+        X265_FWRITE((analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileOut);
2001
     }
2002
     else
2003
     {
2004
-        X265_FWRITE(((analysis_inter_data*)analysis->interData)->depth, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2005
-        X265_FWRITE(((analysis_inter_data*)analysis->interData)->modes, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2006
+        X265_FWRITE((analysis->interData)->depth, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2007
+        X265_FWRITE((analysis->interData)->modes, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2008
         if (m_param->analysisReuseLevel > 4)
2009
         {
2010
-            X265_FWRITE(((analysis_inter_data*)analysis->interData)->partSize, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2011
-            X265_FWRITE(((analysis_inter_data*)analysis->interData)->mergeFlag, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2012
+            X265_FWRITE((analysis->interData)->partSize, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2013
+            X265_FWRITE((analysis->interData)->mergeFlag, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2014
             if (m_param->analysisReuseLevel == 10)
2015
             {
2016
-                X265_FWRITE(((analysis_inter_data*)analysis->interData)->interDir, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2017
-                if (bIntraInInter) X265_FWRITE(((analysis_intra_data*)analysis->intraData)->chromaModes, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2018
+                X265_FWRITE((analysis->interData)->interDir, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2019
+                if (bIntraInInter) X265_FWRITE((analysis->intraData)->chromaModes, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2020
                 for (uint32_t dir = 0; dir < numDir; dir++)
2021
                 {
2022
-                    X265_FWRITE(((analysis_inter_data*)analysis->interData)->mvpIdx[dir], sizeof(uint8_t), depthBytes, m_analysisFileOut);
2023
-                    X265_FWRITE(((analysis_inter_data*)analysis->interData)->refIdx[dir], sizeof(int8_t), depthBytes, m_analysisFileOut);
2024
-                    X265_FWRITE(((analysis_inter_data*)analysis->interData)->mv[dir], sizeof(MV), depthBytes, m_analysisFileOut);
2025
+                    X265_FWRITE((analysis->interData)->mvpIdx[dir], sizeof(uint8_t), depthBytes, m_analysisFileOut);
2026
+                    X265_FWRITE((analysis->interData)->refIdx[dir], sizeof(int8_t), depthBytes, m_analysisFileOut);
2027
+                    X265_FWRITE((analysis->interData)->mv[dir], sizeof(MV), depthBytes, m_analysisFileOut);
2028
                 }
2029
                 if (bIntraInInter)
2030
-                    X265_FWRITE(((analysis_intra_data*)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileOut);
2031
+                    X265_FWRITE((analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileOut);
2032
             }
2033
         }
2034
         if (m_param->analysisReuseLevel != 10)
2035
-            X265_FWRITE(((analysis_inter_data*)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFileOut);
2036
+            X265_FWRITE((analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFileOut);
2037
 
2038
     }
2039
 #undef X265_FWRITE
2040
 }
2041
 
2042
-void Encoder::writeAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, FrameData &curEncData, int slicetype)
2043
+void Encoder::writeAnalysisFileRefine(x265_analysis_data* analysis, FrameData &curEncData)
2044
 {
2045
 #define X265_FWRITE(val, size, writeSize, fileOffset)\
2046
     if (fwrite(val, size, writeSize, fileOffset) < writeSize)\
2047
     {\
2048
     x265_log(NULL, X265_LOG_ERROR, "Error writing analysis 2 pass data\n"); \
2049
-    freeAnalysis2Pass(analysis2Pass, slicetype); \
2050
+    x265_free_analysis_data(m_param, analysis); \
2051
     m_aborted = true; \
2052
     return; \
2053
 }\
2054
 
2055
     uint32_t depthBytes = 0;
2056
-    uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
2057
-    uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
2058
-    uint32_t numCUsInFrame = widthInCU * heightInCU;
2059
-    analysis2PassFrameData* analysisFrameData = (analysis2PassFrameData*)analysis2Pass->analysisFramedata;
2060
+    x265_analysis_data *analysisData = (x265_analysis_data*)analysis;
2061
+    x265_analysis_intra_data *intraData = analysisData->intraData;
2062
+    x265_analysis_inter_data *interData = analysisData->interData;
2063
+    x265_analysis_distortion_data *distortionData = analysisData->distortionData;
2064
 
2065
-    for (uint32_t cuAddr = 0; cuAddr < numCUsInFrame; cuAddr++)
2066
+    for (uint32_t cuAddr = 0; cuAddr < analysis->numCUsInFrame; cuAddr++)
2067
     {
2068
         uint8_t depth = 0;
2069
 
2070
@@ -3869,37 +4399,42 @@
2071
         for (uint32_t absPartIdx = 0; absPartIdx < ctu->m_numPartitions; depthBytes++)
2072
         {
2073
             depth = ctu->m_cuDepth[absPartIdx];
2074
-            analysisFrameData->depth[depthBytes] = depth;
2075
-            analysisFrameData->distortion[depthBytes] = ctu->m_distortion[absPartIdx];
2076
+            if (curEncData.m_slice->m_sliceType == I_SLICE)
2077
+                intraData->depth[depthBytes] = depth;
2078
+            else
2079
+                interData->depth[depthBytes] = depth;
2080
+            distortionData->distortion[depthBytes] = ctu->m_distortion[absPartIdx];
2081
             absPartIdx += ctu->m_numPartitions >> (depth * 2);
2082
         }
2083
     }
2084
 
2085
     if (curEncData.m_slice->m_sliceType != I_SLICE)
2086
     {
2087
+        int32_t* ref[2];
2088
+        ref[0] = (analysis->interData)->ref;
2089
+        ref[1] = &(analysis->interData)->ref[analysis->numPartitions * analysis->numCUsInFrame];
2090
         depthBytes = 0;
2091
-        for (uint32_t cuAddr = 0; cuAddr < numCUsInFrame; cuAddr++)
2092
+        for (uint32_t cuAddr = 0; cuAddr < analysis->numCUsInFrame; cuAddr++)
2093
         {
2094
             uint8_t depth = 0;
2095
             uint8_t predMode = 0;
2096
 
2097
             CUData* ctu = curEncData.getPicCTU(cuAddr);
2098
-
2099
             for (uint32_t absPartIdx = 0; absPartIdx < ctu->m_numPartitions; depthBytes++)
2100
             {
2101
                 depth = ctu->m_cuDepth[absPartIdx];
2102
-                analysisFrameData->m_mv[0][depthBytes] = ctu->m_mv[0][absPartIdx];
2103
-                analysisFrameData->mvpIdx[0][depthBytes] = ctu->m_mvpIdx[0][absPartIdx];
2104
-                analysisFrameData->ref[0][depthBytes] = ctu->m_refIdx[0][absPartIdx];
2105
+                interData->mv[0][depthBytes].word = ctu->m_mv[0][absPartIdx].word;
2106
+                interData->mvpIdx[0][depthBytes] = ctu->m_mvpIdx[0][absPartIdx];
2107
+                ref[0][depthBytes] = ctu->m_refIdx[0][absPartIdx];
2108
                 predMode = ctu->m_predMode[absPartIdx];
2109
                 if (ctu->m_refIdx[1][absPartIdx] != -1)
2110
                 {
2111
-                    analysisFrameData->m_mv[1][depthBytes] = ctu->m_mv[1][absPartIdx];
2112
-                    analysisFrameData->mvpIdx[1][depthBytes] = ctu->m_mvpIdx[1][absPartIdx];
2113
-                    analysisFrameData->ref[1][depthBytes] = ctu->m_refIdx[1][absPartIdx];
2114
+                    interData->mv[1][depthBytes].word = ctu->m_mv[1][absPartIdx].word;
2115
+                    interData->mvpIdx[1][depthBytes] = ctu->m_mvpIdx[1][absPartIdx];
2116
+                    ref[1][depthBytes] = ctu->m_refIdx[1][absPartIdx];
2117
                     predMode = 4; // used as indiacator if the block is coded as bidir
2118
                 }
2119
-                analysisFrameData->modes[depthBytes] = predMode;
2120
+                interData->modes[depthBytes] = predMode;
2121
 
2122
                 absPartIdx += ctu->m_numPartitions >> (depth * 2);
2123
             }
2124
@@ -3907,34 +4442,40 @@
2125
     }
2126
 
2127
     /* calculate frameRecordSize */
2128
-    analysis2Pass->frameRecordSize = sizeof(analysis2Pass->frameRecordSize) + sizeof(depthBytes) + sizeof(analysis2Pass->poc);
2129
-
2130
-    analysis2Pass->frameRecordSize += depthBytes * sizeof(uint8_t);
2131
-    analysis2Pass->frameRecordSize += depthBytes * sizeof(sse_t);
2132
+    analysis->frameRecordSize = sizeof(analysis->frameRecordSize) + sizeof(depthBytes) + sizeof(analysis->poc);
2133
+    analysis->frameRecordSize += depthBytes * sizeof(uint8_t);
2134
+    analysis->frameRecordSize += depthBytes * sizeof(sse_t);
2135
     if (curEncData.m_slice->m_sliceType != I_SLICE)
2136
     {
2137
         int numDir = (curEncData.m_slice->m_sliceType == P_SLICE) ? 1 : 2;
2138
-        analysis2Pass->frameRecordSize += depthBytes * sizeof(MV) * numDir;
2139
-        analysis2Pass->frameRecordSize += depthBytes * sizeof(int32_t) * numDir;
2140
-        analysis2Pass->frameRecordSize += depthBytes * sizeof(int) * numDir;
2141
-        analysis2Pass->frameRecordSize += depthBytes * sizeof(uint8_t);
2142
+        analysis->frameRecordSize += depthBytes * sizeof(MV) * numDir;
2143
+        analysis->frameRecordSize += depthBytes * sizeof(int32_t) * numDir;
2144
+        analysis->frameRecordSize += depthBytes * sizeof(uint8_t) * numDir;
2145
+        analysis->frameRecordSize += depthBytes * sizeof(uint8_t);
2146
     }
2147
-    X265_FWRITE(&analysis2Pass->frameRecordSize, sizeof(uint32_t), 1, m_analysisFileOut);
2148
+    X265_FWRITE(&analysis->frameRecordSize, sizeof(uint32_t), 1, m_analysisFileOut);
2149
     X265_FWRITE(&depthBytes, sizeof(uint32_t), 1, m_analysisFileOut);
2150
-    X265_FWRITE(&analysis2Pass->poc, sizeof(uint32_t), 1, m_analysisFileOut);
2151
-
2152
-    X265_FWRITE(analysisFrameData->depth, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2153
-    X265_FWRITE(analysisFrameData->distortion, sizeof(sse_t), depthBytes, m_analysisFileOut);
2154
+    X265_FWRITE(&analysis->poc, sizeof(uint32_t), 1, m_analysisFileOut);
2155
+    if (curEncData.m_slice->m_sliceType == I_SLICE)
2156
+    {
2157
+        X265_FWRITE((analysis->intraData)->depth, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2158
+    }
2159
+    else
2160
+    {
2161
+        X265_FWRITE((analysis->interData)->depth, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2162
+    }
2163
+    X265_FWRITE(distortionData->distortion, sizeof(sse_t), depthBytes, m_analysisFileOut);
2164
     if (curEncData.m_slice->m_sliceType != I_SLICE)
2165
     {
2166
         int numDir = curEncData.m_slice->m_sliceType == P_SLICE ? 1 : 2;
2167
         for (int i = 0; i < numDir; i++)
2168
         {
2169
-            X265_FWRITE(analysisFrameData->m_mv[i], sizeof(MV), depthBytes, m_analysisFileOut);
2170
-            X265_FWRITE(analysisFrameData->mvpIdx[i], sizeof(int), depthBytes, m_analysisFileOut);
2171
-            X265_FWRITE(analysisFrameData->ref[i], sizeof(int32_t), depthBytes, m_analysisFileOut);
2172
+            int32_t* ref = &(analysis->interData)->ref[i * analysis->numPartitions * analysis->numCUsInFrame];
2173
+            X265_FWRITE(interData->mv[i], sizeof(MV), depthBytes, m_analysisFileOut);
2174
+            X265_FWRITE(interData->mvpIdx[i], sizeof(uint8_t), depthBytes, m_analysisFileOut);
2175
+            X265_FWRITE(ref, sizeof(int32_t), depthBytes, m_analysisFileOut);
2176
         }
2177
-        X265_FWRITE(analysisFrameData->modes, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2178
+        X265_FWRITE((analysis->interData)->modes, sizeof(uint8_t), depthBytes, m_analysisFileOut);
2179
     }
2180
 #undef X265_FWRITE
2181
 }
2182
@@ -3969,6 +4510,51 @@
2183
     TOOLCMP(oldParam->rc.rfConstant, newParam->rc.rfConstant, "crf=%f to %f\n");
2184
 }
2185
 
2186
+void Encoder::readUserSeiFile(x265_sei_payload& seiMsg, int curPoc)
2187
+{
2188
+    char line[1024];
2189
+    while (fgets(line, sizeof(line), m_naluFile))
2190
+    {
2191
+        int poc = atoi(strtok(line, " "));
2192
+        char *prefix = strtok(NULL, " ");
2193
+        int nalType = atoi(strtok(NULL, "/"));
2194
+        int payloadType = atoi(strtok(NULL, " "));
2195
+        char *base64Encode = strtok(NULL, "\n");
2196
+        int base64EncodeLength = (int)strlen(base64Encode);
2197
+        char *base64Decode = SEI::base64Decode(base64Encode, base64EncodeLength);
2198
+        if (nalType == NAL_UNIT_PREFIX_SEI && (!strcmp(prefix, "PREFIX")))
2199
+        {
2200
+            int currentPOC = curPoc;
2201
+            if (currentPOC == poc)
2202
+            {
2203
+                seiMsg.payloadSize = (base64EncodeLength / 4) * 3;
2204
+                seiMsg.payload = (uint8_t*)x265_malloc(sizeof(uint8_t) * seiMsg.payloadSize);
2205
+                if (!seiMsg.payload)
2206
+                {
2207
+                    x265_log(m_param, X265_LOG_ERROR, "Unable to allocate memory for SEI payload\n");
2208
+                    break;
2209
+                }
2210
+                if (payloadType == 4)
2211
+                    seiMsg.payloadType = USER_DATA_REGISTERED_ITU_T_T35;
2212
+                else if (payloadType == 5)
2213
+                    seiMsg.payloadType = USER_DATA_UNREGISTERED;
2214
+                else
2215
+                {
2216
+                    x265_log(m_param, X265_LOG_WARNING, "Unsupported SEI payload Type for frame %d\n", poc);
2217
+                    break;
2218
+                }
2219
+                memcpy(seiMsg.payload, base64Decode, seiMsg.payloadSize);
2220
+                break;
2221
+            }
2222
+        }
2223
+        else
2224
+        {
2225
+            x265_log(m_param, X265_LOG_WARNING, "SEI message for frame %d is not inserted. Will support only PREFIX SEI messages.\n", poc);
2226
+            break;
2227
+        }
2228
+    }
2229
+}
2230
+
2231
 bool Encoder::computeSPSRPSIndex()
2232
 {
2233
     RPS* rpsInSPS = m_sps.spsrps;
2234
x265_2.7.tar.gz/source/encoder/encoder.h -> x265_2.9.tar.gz/source/encoder/encoder.h Changed
121
 
1
@@ -90,6 +90,43 @@
2
     RPSListNode* prior;
3
 };
4
 
5
+struct cuLocation
6
+{
7
+    bool skipWidth;
8
+    bool skipHeight;
9
+    uint32_t heightInCU;
10
+    uint32_t widthInCU;
11
+    uint32_t oddRowIndex;
12
+    uint32_t evenRowIndex;
13
+    uint32_t switchCondition;
14
+
15
+    void init(x265_param* param)
16
+    {
17
+        skipHeight = false;
18
+        skipWidth = false;
19
+        heightInCU = (param->sourceHeight + param->maxCUSize - 1) >> param->maxLog2CUSize;
20
+        widthInCU = (param->sourceWidth + param->maxCUSize - 1) >> param->maxLog2CUSize;
21
+        evenRowIndex = 0;
22
+        oddRowIndex = param->num4x4Partitions * widthInCU;
23
+        switchCondition = 0; // To switch between odd and even rows
24
+    }
25
+};
26
+
27
+struct puOrientation
28
+{
29
+    bool isVert;
30
+    bool isRect;
31
+    bool isAmp;
32
+
33
+    void init()
34
+    {
35
+        isRect = false;
36
+        isAmp = false;
37
+        isVert = false;
38
+    }
39
+};
40
+
41
+
42
 class FrameEncoder;
43
 class DPB;
44
 class Lookahead;
45
@@ -132,6 +169,7 @@
46
     Frame*             m_exportedPic;
47
     FILE*              m_analysisFileIn;
48
     FILE*              m_analysisFileOut;
49
+    FILE*              m_naluFile;
50
     x265_param*        m_param;
51
     x265_param*        m_latestParam;     // Holds latest param during a reconfigure
52
     RateControl*       m_rateControl;
53
@@ -175,6 +213,7 @@
54
     double                m_cR;
55
 
56
     int                     m_bToneMap; // Enables tone-mapping
57
+    int                     m_enableNal;
58
 
59
 #ifdef ENABLE_HDR10_PLUS
60
     const hdr10plus_api     *m_hdr10plus_api;
61
@@ -184,6 +223,15 @@
62
 
63
     x265_sei_payload        m_prevTonemapPayload;
64
 
65
+    /* Collect frame level feature data */
66
+    uint64_t*               m_rdCost;
67
+    uint64_t*               m_variance;
68
+    uint32_t*               m_trainingCount;
69
+    int32_t                 m_startPoint;
70
+    Lock                    m_dynamicRefineLock;
71
+
72
+    bool                    m_saveCTUSize;
73
+
74
     Encoder();
75
     ~Encoder()
76
     {
77
@@ -227,21 +275,26 @@
78
 
79
     void updateVbvPlan(RateControl* rc);
80
 
81
-    void allocAnalysis(x265_analysis_data* analysis);
82
+    void readAnalysisFile(x265_analysis_data* analysis, int poc, int sliceType);
83
+
84
+    void readAnalysisFile(x265_analysis_data* analysis, int poc, const x265_picture* picIn, int paramBytes);
85
 
86
-    void freeAnalysis(x265_analysis_data* analysis);
87
+    void readAnalysisFile(x265_analysis_data* analysis, int poc, const x265_picture* picIn, int paramBytes, cuLocation cuLoc);
88
 
89
-    void allocAnalysis2Pass(x265_analysis_2Pass* analysis, int sliceType);
90
+    int getCUIndex(cuLocation* cuLoc, uint32_t* count, int bytes, int flag);
91
 
92
-    void freeAnalysis2Pass(x265_analysis_2Pass* analysis, int sliceType);
93
+    int getPuShape(puOrientation* puOrient, int partSize, int numCTU);
94
 
95
-    void readAnalysisFile(x265_analysis_data* analysis, int poc, const x265_picture* picIn);
96
+    void writeAnalysisFile(x265_analysis_data* analysis, FrameData &curEncData);
97
+
98
+    void writeAnalysisFileRefine(x265_analysis_data* analysis, FrameData &curEncData);
99
 
100
-    void writeAnalysisFile(x265_analysis_data* pic, FrameData &curEncData);
101
-    void readAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, int poc, int sliceType);
102
-    void writeAnalysis2PassFile(x265_analysis_2Pass* analysis2Pass, FrameData &curEncData, int slicetype);
103
     void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, x265_frame_stats* frameStats, int inPoc);
104
 
105
+    int validateAnalysisData(x265_analysis_data* analysis, int readWriteFlag);
106
+
107
+    void readUserSeiFile(x265_sei_payload& seiMsg, int poc);
108
+
109
     void calcRefreshInterval(Frame* frameEnc);
110
 
111
     void initRefIdx();
112
@@ -249,6 +302,8 @@
113
     void updateRefIdx();
114
     bool computeSPSRPSIndex();
115
 
116
+    void copyUserSEIMessages(Frame *frame, const x265_picture* pic_in);
117
+
118
 protected:
119
 
120
     void initVPS(VPS *vps);
121
x265_2.7.tar.gz/source/encoder/entropy.cpp -> x265_2.9.tar.gz/source/encoder/entropy.cpp Changed
40
 
1
@@ -1369,8 +1369,8 @@
2
                     }
3
                     bDenomCoded = true;
4
                 }
5
-                WRITE_FLAG(wp[0].bPresentFlag, "luma_weight_lX_flag");
6
-                totalSignalledWeightFlags += wp[0].bPresentFlag;
7
+                WRITE_FLAG(!!wp[0].wtPresent, "luma_weight_lX_flag");
8
+                totalSignalledWeightFlags += wp[0].wtPresent;
9
             }
10
 
11
             if (bChroma)
12
@@ -1378,15 +1378,15 @@
13
                 for (int ref = 0; ref < slice.m_numRefIdx[list]; ref++)
14
                 {
15
                     wp = slice.m_weightPredTable[list][ref];
16
-                    WRITE_FLAG(wp[1].bPresentFlag, "chroma_weight_lX_flag");
17
-                    totalSignalledWeightFlags += 2 * wp[1].bPresentFlag;
18
+                    WRITE_FLAG(!!wp[1].wtPresent, "chroma_weight_lX_flag");
19
+                    totalSignalledWeightFlags += 2 * wp[1].wtPresent;
20
                 }
21
             }
22
 
23
             for (int ref = 0; ref < slice.m_numRefIdx[list]; ref++)
24
             {
25
                 wp = slice.m_weightPredTable[list][ref];
26
-                if (wp[0].bPresentFlag)
27
+                if (wp[0].wtPresent)
28
                 {
29
                     int deltaWeight = (wp[0].inputWeight - (1 << wp[0].log2WeightDenom));
30
                     WRITE_SVLC(deltaWeight, "delta_luma_weight_lX");
31
@@ -1395,7 +1395,7 @@
32
 
33
                 if (bChroma)
34
                 {
35
-                    if (wp[1].bPresentFlag)
36
+                    if (wp[1].wtPresent)
37
                     {
38
                         for (int plane = 1; plane < 3; plane++)
39
                         {
40
x265_2.7.tar.gz/source/encoder/frameencoder.cpp -> x265_2.9.tar.gz/source/encoder/frameencoder.cpp Changed
695
 
1
@@ -179,7 +179,7 @@
2
         ok &= m_rce.picTimingSEI && m_rce.hrdTiming;
3
     }
4
 
5
-    if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
6
+    if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
7
         m_nr = X265_MALLOC(NoiseReduction, 1);
8
     if (m_nr)
9
         memset(m_nr, 0, sizeof(NoiseReduction));
10
@@ -365,6 +365,65 @@
11
     return length;
12
 }
13
 
14
+bool FrameEncoder::writeToneMapInfo(x265_sei_payload *payload)
15
+{
16
+    bool payloadChange = false;
17
+    if (m_top->m_prevTonemapPayload.payload != NULL && payload->payloadSize == m_top->m_prevTonemapPayload.payloadSize)
18
+    {
19
+        if (memcmp(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize) != 0)
20
+            payloadChange = true;
21
+    }
22
+    else
23
+    {
24
+        payloadChange = true;
25
+        if (m_top->m_prevTonemapPayload.payload != NULL)
26
+            x265_free(m_top->m_prevTonemapPayload.payload);
27
+        m_top->m_prevTonemapPayload.payload = (uint8_t*)x265_malloc(sizeof(uint8_t)* payload->payloadSize);
28
+    }
29
+
30
+    if (payloadChange)
31
+    {
32
+        m_top->m_prevTonemapPayload.payloadType = payload->payloadType;
33
+        m_top->m_prevTonemapPayload.payloadSize = payload->payloadSize;
34
+        memcpy(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize);
35
+    }
36
+
37
+    bool isIDR = m_frame->m_lowres.sliceType == X265_TYPE_IDR;
38
+    return (payloadChange || isIDR);
39
+}
40
+
41
+void FrameEncoder::writeTrailingSEIMessages()
42
+{
43
+    Slice* slice = m_frame->m_encData->m_slice;
44
+    int planes = (m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
45
+    int32_t payloadSize = 0;
46
+
47
+    if (m_param->decodedPictureHashSEI == 1)
48
+    {
49
+        m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::MD5;
50
+        for (int i = 0; i < planes; i++)
51
+            MD5Final(&m_seiReconPictureDigest.m_state[i], m_seiReconPictureDigest.m_digest[i]);
52
+        payloadSize = 1 + 16 * planes;
53
+    }
54
+    else if (m_param->decodedPictureHashSEI == 2)
55
+    {
56
+        m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CRC;
57
+        for (int i = 0; i < planes; i++)
58
+            crcFinish(m_seiReconPictureDigest.m_crc[i], m_seiReconPictureDigest.m_digest[i]);
59
+        payloadSize = 1 + 2 * planes;
60
+    }
61
+    else if (m_param->decodedPictureHashSEI == 3)
62
+    {
63
+        m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CHECKSUM;
64
+        for (int i = 0; i < planes; i++)
65
+            checksumFinish(m_seiReconPictureDigest.m_checksum[i], m_seiReconPictureDigest.m_digest[i]);
66
+        payloadSize = 1 + 4 * planes;
67
+    }
68
+
69
+    m_seiReconPictureDigest.setSize(payloadSize);
70
+    m_seiReconPictureDigest.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_SUFFIX_SEI, m_nalList, false);
71
+}
72
+
73
 void FrameEncoder::compressFrame()
74
 {
75
     ProfileScopeEvent(frameThread);
76
@@ -393,6 +452,7 @@
77
      * not repeating headers (since AUD is supposed to be the first NAL in the access
78
      * unit) */
79
     Slice* slice = m_frame->m_encData->m_slice;
80
+
81
     if (m_param->bEnableAccessUnitDelimiters && (m_frame->m_poc || m_param->bRepeatHeaders))
82
     {
83
         m_bs.resetBits();
84
@@ -400,6 +460,8 @@
85
         m_entropyCoder.codeAUD(*slice);
86
         m_bs.writeByteAlignment();
87
         m_nalList.serialize(NAL_UNIT_ACCESS_UNIT_DELIMITER, m_bs);
88
+        if (m_param->bSingleSeiNal)
89
+            m_bs.resetBits();
90
     }
91
     if (m_frame->m_lowres.bKeyframe && m_param->bRepeatHeaders)
92
     {
93
@@ -459,9 +521,7 @@
94
                 wa.waitForExit();
95
             else
96
                 weightAnalyse(*slice, *m_frame, *m_param);
97
-
98
         }
99
-
100
     }
101
     else
102
         slice->disableWeights();
103
@@ -475,7 +535,7 @@
104
         for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
105
         {
106
             WeightParam *w = NULL;
107
-            if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].bPresentFlag)
108
+            if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].wtPresent)
109
                 w = slice->m_weightPredTable[l][ref];
110
             slice->m_refReconPicList[l][ref] = slice->m_refFrameList[l][ref]->m_reconPic;
111
             m_mref[l][ref].init(slice->m_refReconPicList[l][ref], w, *m_param);
112
@@ -496,41 +556,6 @@
113
 
114
     /* Get the QP for this frame from rate control. This call may block until
115
      * frames ahead of it in encode order have called rateControlEnd() */
116
-    m_rce.encodeOrder = m_frame->m_encodeOrder;
117
-    bool payloadChange = false;
118
-    bool writeSei = true;
119
-    if (m_param->bDhdr10opt)
120
-    {
121
-        for (int i = 0; i < m_frame->m_userSEI.numPayloads; i++)
122
-        {
123
-            x265_sei_payload *payload = &m_frame->m_userSEI.payloads[i];
124
-            if(payload->payloadType == USER_DATA_REGISTERED_ITU_T_T35)
125
-            {
126
-                if (m_top->m_prevTonemapPayload.payload != NULL && payload->payloadSize == m_top->m_prevTonemapPayload.payloadSize)
127
-                {
128
-                    if (memcmp(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize) != 0)
129
-                        payloadChange = true;
130
-                }
131
-                else
132
-                {
133
-                    payloadChange = true;
134
-                    if (m_top->m_prevTonemapPayload.payload != NULL)
135
-                        x265_free(m_top->m_prevTonemapPayload.payload);
136
-                    m_top->m_prevTonemapPayload.payload = (uint8_t*)x265_malloc(sizeof(uint8_t) * payload->payloadSize);
137
-                }
138
-
139
-                if (payloadChange)
140
-                {
141
-                    m_top->m_prevTonemapPayload.payloadType = payload->payloadType;
142
-                    m_top->m_prevTonemapPayload.payloadSize = payload->payloadSize;
143
-                    memcpy(m_top->m_prevTonemapPayload.payload, payload->payload, payload->payloadSize);
144
-                }
145
-
146
-                bool isIDR = m_frame->m_lowres.sliceType == X265_TYPE_IDR;
147
-                writeSei = payloadChange || isIDR;
148
-            }
149
-        }
150
-    }
151
     int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
152
     m_rce.newQp = qp;
153
 
154
@@ -594,7 +619,6 @@
155
 
156
     /* reset entropy coders and compute slice id */
157
     m_entropyCoder.load(m_initSliceContext);
158
-   
159
     for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)   
160
         for (uint32_t row = m_sliceBaseRow[sliceId]; row < m_sliceBaseRow[sliceId + 1]; row++)
161
             m_rows[row].init(m_initSliceContext, sliceId);   
162
@@ -620,6 +644,7 @@
163
             m_outStreams[i].resetBits();
164
     }
165
 
166
+    m_rce.encodeOrder = m_frame->m_encodeOrder;
167
     int prevBPSEI = m_rce.encodeOrder ? m_top->m_lastBPSEI : 0;
168
 
169
     if (m_frame->m_lowres.bKeyframe)
170
@@ -632,18 +657,22 @@
171
             bpSei->m_auCpbRemovalDelayDelta = 1;
172
             bpSei->m_cpbDelayOffset = 0;
173
             bpSei->m_dpbDelayOffset = 0;
174
-
175
             // hrdFullness() calculates the initial CPB removal delay and offset
176
             m_top->m_rateControl->hrdFullness(bpSei);
177
-
178
-            m_bs.resetBits();
179
-            bpSei->write(m_bs, *slice->m_sps);
180
-            m_bs.writeByteAlignment();
181
-
182
-            m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
183
+            bpSei->writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
184
 
185
             m_top->m_lastBPSEI = m_rce.encodeOrder;
186
         }
187
+
188
+        if (m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_param->bEmitIDRRecoverySEI)
189
+        {
190
+            /* Recovery Point SEI require the SPS to be "activated" */
191
+            SEIRecoveryPoint sei;
192
+            sei.m_recoveryPocCnt = 0;
193
+            sei.m_exactMatchingFlag = true;
194
+            sei.m_brokenLinkFlag = false;
195
+            sei.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
196
+        }
197
     }
198
 
199
     if ((m_param->bEmitHRDSEI || !!m_param->interlaceMode))
200
@@ -660,8 +689,10 @@
201
             else if (m_param->interlaceMode == 1)
202
                 sei->m_picStruct = (poc & 1) ? 2 /* bottom */ : 1 /* top */;
203
             else
204
-                sei->m_picStruct = 0;
205
-            sei->m_sourceScanType = 0;
206
+                sei->m_picStruct = m_param->pictureStructure;
207
+
208
+            sei->m_sourceScanType = m_param->interlaceMode ? 0 : 1;
209
+
210
             sei->m_duplicateFlag = false;
211
         }
212
 
213
@@ -675,10 +706,14 @@
214
             sei->m_picDpbOutputDelay = slice->m_sps->numReorderPics + poc - m_rce.encodeOrder;
215
         }
216
 
217
-        m_bs.resetBits();
218
-        sei->write(m_bs, *slice->m_sps);
219
-        m_bs.writeByteAlignment();
220
-        m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
221
+        sei->writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
222
+    }
223
+
224
+    if (m_param->preferredTransferCharacteristics > -1 && slice->isIRAP())
225
+    {
226
+        SEIAlternativeTC m_seiAlternativeTC;
227
+        m_seiAlternativeTC.m_preferredTransferCharacteristics = m_param->preferredTransferCharacteristics;
228
+        m_seiAlternativeTC.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
229
     }
230
 
231
     /* Write user SEI */
232
@@ -689,28 +724,33 @@
233
         {
234
             SEIuserDataUnregistered sei;
235
             sei.m_userData = payload->payload;
236
-            m_bs.resetBits();
237
             sei.setSize(payload->payloadSize);
238
-            sei.write(m_bs, *slice->m_sps);
239
-            m_bs.writeByteAlignment();
240
-            m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
241
+            sei.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
242
         }
243
         else if (payload->payloadType == USER_DATA_REGISTERED_ITU_T_T35)
244
         {
245
+            bool writeSei = m_param->bDhdr10opt ? writeToneMapInfo(payload) : true;
246
             if (writeSei)
247
             {
248
-                SEICreativeIntentMeta sei;
249
-                sei.m_payload = payload->payload;
250
-                m_bs.resetBits();
251
+                SEIuserDataRegistered sei;
252
+                sei.m_userData = payload->payload;
253
                 sei.setSize(payload->payloadSize);
254
-                sei.write(m_bs, *slice->m_sps);
255
-                m_bs.writeByteAlignment();
256
-                m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
257
+                sei.writeSEImessages(m_bs, *slice->m_sps, NAL_UNIT_PREFIX_SEI, m_nalList, m_param->bSingleSeiNal);
258
             }
259
         }
260
         else
261
             x265_log(m_param, X265_LOG_ERROR, "Unrecognized SEI type\n");
262
     }
263
+
264
+    bool isSei = ((m_frame->m_lowres.bKeyframe && m_param->bRepeatHeaders) || m_param->bEmitHRDSEI ||
265
+                 !!m_param->interlaceMode || (m_frame->m_lowres.sliceType == X265_TYPE_IDR && m_param->bEmitIDRRecoverySEI) ||
266
+                   m_frame->m_userSEI.numPayloads);
267
+
268
+    if (isSei && m_param->bSingleSeiNal)
269
+    {
270
+        m_bs.writeByteAlignment();
271
+        m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
272
+    }
273
     /* CQP and CRF (without capped VBV) doesn't use mid-frame statistics to 
274
      * tune RateControl parameters for other frames.
275
      * Hence, for these modes, update m_startEndOrder and unlock RC for previous threads waiting in
276
@@ -724,6 +764,9 @@
277
             m_top->m_rateControl->m_startEndOrder.incr(); // faked rateControlEnd calls for negative frames
278
     }
279
 
280
+    if (m_param->bDynamicRefine)
281
+        computeAvgTrainingData();
282
+
283
     /* Analyze CTU rows, most of the hard work is done here.  Frame is
284
      * compressed in a wave-front pattern if WPP is enabled. Row based loop
285
      * filters runs behind the CTU compression and reconstruction */
286
@@ -835,76 +878,19 @@
287
                 m_frameFilter.processRow(i - m_filterRowDelay);
288
         }
289
     }
290
+#if ENABLE_LIBVMAF
291
+    vmafFrameLevelScore();
292
+#endif
293
 
294
     if (m_param->maxSlices > 1)
295
     {
296
         PicYuv *reconPic = m_frame->m_reconPic;
297
         uint32_t height = reconPic->m_picHeight;
298
-        uint32_t width = reconPic->m_picWidth;
299
-        intptr_t stride = reconPic->m_stride;
300
-        const uint32_t hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp);
301
-        const uint32_t vChromaShift = CHROMA_V_SHIFT(m_param->internalCsp);
302
+        initDecodedPictureHashSEI(0, 0, height);
303
+    } 
304
 
305
-        if (m_param->decodedPictureHashSEI == 1)
306
-        {
307
-
308
-            MD5Init(&m_state[0]);
309
-
310
-            updateMD5Plane(m_state[0], reconPic->m_picOrg[0], width, height, stride);
311
-
312
-            if (m_param->internalCsp != X265_CSP_I400)
313
-            {
314
-                MD5Init(&m_state[1]);
315
-                MD5Init(&m_state[2]);
316
-
317
-                width >>= hChromaShift;
318
-                height >>= vChromaShift;
319
-                stride = reconPic->m_strideC;
320
-
321
-                updateMD5Plane(m_state[1], reconPic->m_picOrg[1], width, height, stride);
322
-                updateMD5Plane(m_state[2], reconPic->m_picOrg[2], width, height, stride);
323
-            }
324
-        }
325
-        // TODO: NOT verify code in below mode
326
-        else if (m_param->decodedPictureHashSEI == 2)
327
-        {
328
-            m_crc[0] = 0xffff;
329
-
330
-            updateCRC(reconPic->m_picOrg[0], m_crc[0], height, width, stride);
331
-
332
-            if (m_param->internalCsp != X265_CSP_I400)
333
-            {
334
-                width >>= hChromaShift;
335
-                height >>= vChromaShift;
336
-                stride = reconPic->m_strideC;
337
-                m_crc[1] = m_crc[2] = 0xffff;
338
-
339
-                updateCRC(reconPic->m_picOrg[1], m_crc[1], height, width, stride);
340
-                updateCRC(reconPic->m_picOrg[2], m_crc[2], height, width, stride);
341
-            }
342
-        }
343
-        else if (m_param->decodedPictureHashSEI == 3)
344
-        {
345
-            uint32_t cuHeight = m_param->maxCUSize;
346
-
347
-            m_checksum[0] = 0;
348
-
349
-            updateChecksum(reconPic->m_picOrg[0], m_checksum[0], height, width, stride, 0, cuHeight);
350
-
351
-            if (m_param->internalCsp != X265_CSP_I400)
352
-            {
353
-                width >>= hChromaShift;
354
-                height >>= vChromaShift;
355
-                stride = reconPic->m_strideC;
356
-                cuHeight >>= vChromaShift;
357
-
358
-                m_checksum[1] = m_checksum[2] = 0;
359
-
360
-                updateChecksum(reconPic->m_picOrg[1], m_checksum[1], height, width, stride, 0, cuHeight);
361
-                updateChecksum(reconPic->m_picOrg[2], m_checksum[2], height, width, stride, 0, cuHeight);
362
-            }
363
-        }
364
-    } // end of (m_param->maxSlices > 1)
365
+    if (m_param->bDynamicRefine && m_top->m_startPoint <= m_frame->m_encodeOrder) //Avoid collecting data that will not be used by future frames.
366
+        collectDynDataFrame();
367
 
368
     if (m_param->rc.bStatWrite)
369
     {
370
@@ -992,8 +978,6 @@
371
             m_bs.resetBits();
372
 
373
             const uint32_t sliceAddr = nextSliceRow * m_numCols;
374
-            //CUData* ctu = m_frame->m_encData->getPicCTU(sliceAddr);
375
-            //const int sliceQp = ctu->m_qp[0];
376
             if (m_param->bOptRefListLengthPPS)
377
             {
378
                 ScopedLock refIdxLock(m_top->m_sliceRefIdxLock);
379
@@ -1040,38 +1024,8 @@
380
         m_nalList.serialize(slice->m_nalUnitType, m_bs);
381
     }
382
 
383
-
384
     if (m_param->decodedPictureHashSEI)
385
-    {
386
-        int planes = (m_frame->m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
387
-        int32_t payloadSize = 0;
388
-        if (m_param->decodedPictureHashSEI == 1)
389
-        {
390
-            m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::MD5;
391
-            for (int i = 0; i < planes; i++)
392
-                MD5Final(&m_state[i], m_seiReconPictureDigest.m_digest[i]);
393
-            payloadSize = 1 + 16 * planes;
394
-        }
395
-        else if (m_param->decodedPictureHashSEI == 2)
396
-        {
397
-            m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CRC;
398
-            for (int i = 0; i < planes; i++)
399
-                crcFinish(m_crc[i], m_seiReconPictureDigest.m_digest[i]);
400
-            payloadSize = 1 + 2 * planes;
401
-        }
402
-        else if (m_param->decodedPictureHashSEI == 3)
403
-        {
404
-            m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CHECKSUM;
405
-            for (int i = 0; i < planes; i++)
406
-                checksumFinish(m_checksum[i], m_seiReconPictureDigest.m_digest[i]);
407
-            payloadSize = 1 + 4 * planes;
408
-        }
409
-        m_bs.resetBits();
410
-        m_seiReconPictureDigest.setSize(payloadSize);
411
-        m_seiReconPictureDigest.write(m_bs, *slice->m_sps);
412
-        m_bs.writeByteAlignment();
413
-        m_nalList.serialize(NAL_UNIT_SUFFIX_SEI, m_bs);
414
-    }
415
+        writeTrailingSEIMessages();
416
 
417
     uint64_t bytes = 0;
418
     for (uint32_t i = 0; i < m_nalList.m_numNal; i++)
419
@@ -1160,7 +1114,79 @@
420
         m_cuStats.accumulate(m_tld[i].analysis.m_stats[m_jpId], *m_param);
421
 #endif
422
 
423
-    m_endFrameTime = x265_mdate();
424
+    m_endFrameTime = x265_mdate();  
425
+}
426
+
427
+void FrameEncoder::initDecodedPictureHashSEI(int row, int cuAddr, int height)
428
+{
429
+    PicYuv *reconPic = m_frame->m_reconPic;
430
+    uint32_t width = reconPic->m_picWidth; 
431
+    intptr_t stride = reconPic->m_stride;
432
+    uint32_t maxCUHeight = m_param->maxCUSize;
433
+
434
+    const uint32_t hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp);
435
+    const uint32_t vChromaShift = CHROMA_V_SHIFT(m_param->internalCsp);
436
+
437
+    if (m_param->decodedPictureHashSEI == 1)
438
+    {
439
+        if (!row)
440
+            MD5Init(&m_seiReconPictureDigest.m_state[0]);
441
+
442
+        updateMD5Plane(m_seiReconPictureDigest.m_state[0], reconPic->getLumaAddr(cuAddr), width, height, stride);
443
+        if (m_param->internalCsp != X265_CSP_I400)
444
+        {
445
+            if (!row)
446
+            {
447
+                MD5Init(&m_seiReconPictureDigest.m_state[1]);
448
+                MD5Init(&m_seiReconPictureDigest.m_state[2]);
449
+            }
450
+
451
+            width >>= hChromaShift;
452
+            height >>= vChromaShift;
453
+            stride = reconPic->m_strideC;
454
+
455
+            updateMD5Plane(m_seiReconPictureDigest.m_state[1], reconPic->getCbAddr(cuAddr), width, height, stride);
456
+            updateMD5Plane(m_seiReconPictureDigest.m_state[2], reconPic->getCrAddr(cuAddr), width, height, stride);
457
+        }
458
+    }
459
+    else if (m_param->decodedPictureHashSEI == 2)
460
+    {
461
+
462
+        if (!row)
463
+            m_seiReconPictureDigest.m_crc[0] = 0xffff;
464
+
465
+        updateCRC(reconPic->getLumaAddr(cuAddr), m_seiReconPictureDigest.m_crc[0], height, width, stride);
466
+        if (m_param->internalCsp != X265_CSP_I400)
467
+        {
468
+            width >>= hChromaShift;
469
+            height >>= vChromaShift;
470
+            stride = reconPic->m_strideC;
471
+            m_seiReconPictureDigest.m_crc[1] = m_seiReconPictureDigest.m_crc[2] = 0xffff;
472
+
473
+            updateCRC(reconPic->getCbAddr(cuAddr), m_seiReconPictureDigest.m_crc[1], height, width, stride);
474
+            updateCRC(reconPic->getCrAddr(cuAddr), m_seiReconPictureDigest.m_crc[2], height, width, stride);
475
+        }
476
+    }
477
+    else if (m_param->decodedPictureHashSEI == 3)
478
+    {
479
+        if (!row)
480
+            m_seiReconPictureDigest.m_checksum[0] = 0;
481
+
482
+        updateChecksum(reconPic->m_picOrg[0], m_seiReconPictureDigest.m_checksum[0], height, width, stride, row, maxCUHeight);
483
+        if (m_param->internalCsp != X265_CSP_I400)
484
+        {
485
+            width >>= hChromaShift;
486
+            height >>= vChromaShift;
487
+            stride = reconPic->m_strideC;
488
+            maxCUHeight >>= vChromaShift;
489
+
490
+            if (!row)
491
+                m_seiReconPictureDigest.m_checksum[1] = m_seiReconPictureDigest.m_checksum[2] = 0;
492
+
493
+            updateChecksum(reconPic->m_picOrg[1], m_seiReconPictureDigest.m_checksum[1], height, width, stride, row, maxCUHeight);
494
+            updateChecksum(reconPic->m_picOrg[2], m_seiReconPictureDigest.m_checksum[2], height, width, stride, row, maxCUHeight);
495
+        }
496
+    }
497
 }
498
 
499
 void FrameEncoder::encodeSlice(uint32_t sliceAddr)
500
@@ -1367,7 +1393,7 @@
501
             }
502
             curRow.avgQPComputed = 1;
503
         }
504
-    }    
505
+    }
506
 
507
     // Initialize restrict on MV range in slices
508
     tld.analysis.m_sliceMinY = -(int16_t)(rowInSlice * m_param->maxCUSize * 4) + 3 * 4;
509
@@ -1445,6 +1471,12 @@
510
         // Does all the CU analysis, returns best top level mode decision
511
         Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder);
512
 
513
+        /* startPoint > encodeOrder is true when the start point changes for
514
+        a new GOP but few frames from the previous GOP is still incomplete.
515
+        The data of frames in this interval will not be used by any future frames. */
516
+        if (m_param->bDynamicRefine && m_top->m_startPoint <= m_frame->m_encodeOrder)
517
+            collectDynDataRow(*ctu, &curRow.rowStats);
518
+
519
         // take a sample of the current active worker count
520
         ATOMIC_ADD(&m_totalActiveWorkerCount, m_activeWorkerCount);
521
         ATOMIC_INC(&m_activeWorkerCountSamples);
522
@@ -1466,7 +1498,7 @@
523
         {
524
             // NOTE: in VBV mode, we may reencode anytime, so we can't do Deblock stage-Horizon and SAO
525
             if (!bIsVbv)
526
-            {                
527
+            {
528
                 // Delay one row to avoid intra prediction conflict
529
                 if (m_pool && !bFirstRowInSlice)
530
                 {                    
531
@@ -1743,24 +1775,24 @@
532
         else if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom))
533
             rowCount = X265_MIN((maxRows + 1) / 2, maxRows - 1);
534
         else
535
-           rowCount = X265_MIN(m_refLagRows / m_param->maxSlices, maxRows - 1);
536
+            rowCount = X265_MIN(m_refLagRows / m_param->maxSlices, maxRows - 1);
537
 
538
         if (rowInSlice == rowCount)
539
         {
540
             m_rowSliceTotalBits[sliceId] = 0;
541
             if (bIsVbv && !(m_param->rc.bEnableConstVbv && m_param->bEnableWavefront))
542
-            {          
543
+            {
544
                 for (uint32_t i = m_sliceBaseRow[sliceId]; i < rowCount + m_sliceBaseRow[sliceId]; i++)
545
                     m_rowSliceTotalBits[sliceId] += curEncData.m_rowStat[i].encodedBits;
546
             }
547
             else
548
             {
549
                 uint32_t startAddr = m_sliceBaseRow[sliceId] * numCols;
550
-               uint32_t finishAddr = startAddr + rowCount * numCols;
551
+                uint32_t finishAddr = startAddr + rowCount * numCols;
552
                 
553
-               for (uint32_t cuAddr = startAddr; cuAddr < finishAddr; cuAddr++)
554
+                for (uint32_t cuAddr = startAddr; cuAddr < finishAddr; cuAddr++)
555
                     m_rowSliceTotalBits[sliceId] += curEncData.m_cuStat[cuAddr].totalBits;
556
-            }            
557
+            }
558
 
559
             if (ATOMIC_INC(&m_sliceCnt) == (int)m_param->maxSlices)
560
             {
561
@@ -1827,6 +1859,101 @@
562
         m_completionEvent.trigger();
563
 }
564
 
565
+void FrameEncoder::collectDynDataRow(CUData& ctu, FrameStats* rowStats)
566
+{
567
+    for (uint32_t i = 0; i < X265_REFINE_INTER_LEVELS; i++)
568
+    {
569
+        for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++)
570
+        {
571
+            int offset = (depth * X265_REFINE_INTER_LEVELS) + i;
572
+            if (ctu.m_collectCUCount[offset])
573
+            {
574
+                rowStats->rowVarDyn[offset] += ctu.m_collectCUVariance[offset];
575
+                rowStats->rowRdDyn[offset] += ctu.m_collectCURd[offset];
576
+                rowStats->rowCntDyn[offset] += ctu.m_collectCUCount[offset];
577
+            }
578
+        }
579
+    }
580
+}
581
+
582
+void FrameEncoder::collectDynDataFrame()
583
+{
584
+    for (uint32_t row = 0; row < m_numRows; row++)
585
+    {
586
+        for (uint32_t refLevel = 0; refLevel < X265_REFINE_INTER_LEVELS; refLevel++)
587
+        {
588
+            for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++)
589
+            {
590
+                int offset = (depth * X265_REFINE_INTER_LEVELS) + refLevel;
591
+                int curFrameIndex = m_frame->m_encodeOrder - m_top->m_startPoint;
592
+                int index = (curFrameIndex * X265_REFINE_INTER_LEVELS * m_param->maxCUDepth) + offset;
593
+                if (m_rows[row].rowStats.rowCntDyn[offset])
594
+                {
595
+                    m_top->m_variance[index] += m_rows[row].rowStats.rowVarDyn[offset];
596
+                    m_top->m_rdCost[index] += m_rows[row].rowStats.rowRdDyn[offset];
597
+                    m_top->m_trainingCount[index] += m_rows[row].rowStats.rowCntDyn[offset];
598
+                }
599
+            }
600
+        }
601
+    }
602
+}
603
+
604
+void FrameEncoder::computeAvgTrainingData()
605
+{
606
+    if (m_frame->m_lowres.bScenecut || m_frame->m_lowres.bKeyframe)
607
+    {
608
+        m_top->m_startPoint = m_frame->m_encodeOrder;
609
+        int size = (m_param->keyframeMax + m_param->lookaheadDepth) * m_param->maxCUDepth * X265_REFINE_INTER_LEVELS;
610
+        memset(m_top->m_variance, 0, size * sizeof(uint64_t));
611
+        memset(m_top->m_rdCost, 0, size * sizeof(uint64_t));
612
+        memset(m_top->m_trainingCount, 0, size * sizeof(uint32_t));
613
+    }
614
+    if (m_frame->m_encodeOrder - m_top->m_startPoint < 2 * m_param->frameNumThreads)
615
+        m_frame->m_classifyFrame = false;
616
+    else
617
+        m_frame->m_classifyFrame = true;
618
+
619
+    int size = m_param->maxCUDepth * X265_REFINE_INTER_LEVELS;
620
+    memset(m_frame->m_classifyRd, 0, size * sizeof(uint64_t));
621
+    memset(m_frame->m_classifyVariance, 0, size * sizeof(uint64_t));
622
+    memset(m_frame->m_classifyCount, 0, size * sizeof(uint32_t));
623
+    if (m_frame->m_classifyFrame)
624
+    {
625
+        uint32_t limit = m_frame->m_encodeOrder - m_top->m_startPoint - m_param->frameNumThreads;
626
+        for (uint32_t i = 1; i < limit; i++)
627
+        {
628
+            for (uint32_t j = 0; j < X265_REFINE_INTER_LEVELS; j++)
629
+            {
630
+                for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++)
631
+                {
632
+                    int offset = (depth * X265_REFINE_INTER_LEVELS) + j;
633
+                    int index = (i* X265_REFINE_INTER_LEVELS * m_param->maxCUDepth) + offset;
634
+                    if (m_top->m_trainingCount[index])
635
+                    {
636
+                        m_frame->m_classifyRd[offset] += m_top->m_rdCost[index] / m_top->m_trainingCount[index];
637
+                        m_frame->m_classifyVariance[offset] += m_top->m_variance[index] / m_top->m_trainingCount[index];
638
+                        m_frame->m_classifyCount[offset] += m_top->m_trainingCount[index];
639
+                    }
640
+                }
641
+            }
642
+        }
643
+        /* Calculates the average feature values of historic frames that are being considered for the current frame */
644
+        int historyCount = m_frame->m_encodeOrder - m_param->frameNumThreads - m_top->m_startPoint - 1;
645
+        if (historyCount)
646
+        {
647
+            for (uint32_t j = 0; j < X265_REFINE_INTER_LEVELS; j++)
648
+            {
649
+                for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++)
650
+                {
651
+                    int offset = (depth * X265_REFINE_INTER_LEVELS) + j;
652
+                    m_frame->m_classifyRd[offset] /= historyCount;
653
+                    m_frame->m_classifyVariance[offset] /= historyCount;
654
+                }
655
+            }
656
+        }
657
+    }
658
+}
659
+
660
 /* collect statistics about CU coding decisions, return total QP */
661
 int FrameEncoder::collectCTUStatistics(const CUData& ctu, FrameStats* log)
662
 {
663
@@ -1949,6 +2076,31 @@
664
         m_nr->nrOffsetDenoise[cat][0] = 0;
665
     }
666
 }
667
+#if ENABLE_LIBVMAF
668
+void FrameEncoder::vmafFrameLevelScore()
669
+{
670
+    PicYuv *fenc = m_frame->m_fencPic;
671
+    PicYuv *recon = m_frame->m_reconPic;
672
+
673
+    x265_vmaf_framedata *vmafframedata = (x265_vmaf_framedata*)x265_malloc(sizeof(x265_vmaf_framedata));
674
+    if (!vmafframedata)
675
+    {
676
+        x265_log(NULL, X265_LOG_ERROR, "vmaf frame data alloc failed\n");
677
+    }
678
+
679
+    vmafframedata->height = fenc->m_picHeight;
680
+    vmafframedata->width = fenc->m_picWidth;
681
+    vmafframedata->frame_set = 0;
682
+    vmafframedata->internalBitDepth = m_param->internalBitDepth;
683
+    vmafframedata->reference_frame = fenc;
684
+    vmafframedata->distorted_frame = recon;
685
+
686
+    fenc->m_vmafScore = x265_calculate_vmaf_framelevelscore(vmafframedata);
687
+
688
+    if (vmafframedata)
689
+    x265_free(vmafframedata);
690
+}
691
+#endif
692
 
693
 Frame *FrameEncoder::getEncodedPicture(NALList& output)
694
 {
695
x265_2.7.tar.gz/source/encoder/frameencoder.h -> x265_2.9.tar.gz/source/encoder/frameencoder.h Changed
42
 
1
@@ -129,6 +129,8 @@
2
     /* blocks until worker thread is done, returns access unit */
3
     Frame *getEncodedPicture(NALList& list);
4
 
5
+    void initDecodedPictureHashSEI(int row, int cuAddr, int height);
6
+
7
     Event                    m_enable;
8
     Event                    m_done;
9
     Event                    m_completionEvent;
10
@@ -161,9 +163,6 @@
11
     double                   m_ssim;
12
     uint64_t                 m_accessUnitBits;
13
     uint32_t                 m_ssimCnt;
14
-    MD5Context               m_state[3];
15
-    uint32_t                 m_crc[3];
16
-    uint32_t                 m_checksum[3];
17
 
18
     volatile int             m_activeWorkerCount;        // count of workers currently encoding or filtering CTUs
19
     volatile int             m_totalActiveWorkerCount;   // sum of m_activeWorkerCount sampled at end of each CTU
20
@@ -230,6 +229,8 @@
21
     void threadMain();
22
     int  collectCTUStatistics(const CUData& ctu, FrameStats* frameLog);
23
     void noiseReductionUpdate();
24
+    void writeTrailingSEIMessages();
25
+    bool writeToneMapInfo(x265_sei_payload *payload);
26
 
27
     /* Called by WaveFront::findJob() */
28
     virtual void processRow(int row, int threadId);
29
@@ -239,6 +240,12 @@
30
     void enqueueRowFilter(int row)  { WaveFront::enqueueRow(row * 2 + 1); }
31
     void enableRowEncoder(int row)  { WaveFront::enableRow(row * 2 + 0); }
32
     void enableRowFilter(int row)   { WaveFront::enableRow(row * 2 + 1); }
33
+#if ENABLE_LIBVMAF
34
+    void vmafFrameLevelScore();
35
+#endif
36
+    void collectDynDataFrame();
37
+    void computeAvgTrainingData();
38
+    void collectDynDataRow(CUData& ctu, FrameStats* rowStats);    
39
 };
40
 }
41
 
42
x265_2.7.tar.gz/source/encoder/framefilter.cpp -> x265_2.9.tar.gz/source/encoder/framefilter.cpp Changed
82
 
1
@@ -712,78 +712,8 @@
2
 
3
     if (m_param->maxSlices == 1)
4
     {
5
-        if (m_param->decodedPictureHashSEI == 1)
6
-        {
7
-            uint32_t height = m_parallelFilter[row].getCUHeight();
8
-            uint32_t width = reconPic->m_picWidth;
9
-            intptr_t stride = reconPic->m_stride;
10
-
11
-            if (!row)
12
-                MD5Init(&m_frameEncoder->m_state[0]);
13
-
14
-            updateMD5Plane(m_frameEncoder->m_state[0], reconPic->getLumaAddr(cuAddr), width, height, stride);
15
-            if (m_param->internalCsp != X265_CSP_I400)
16
-            {
17
-                if (!row)
18
-                {
19
-                    MD5Init(&m_frameEncoder->m_state[1]);
20
-                    MD5Init(&m_frameEncoder->m_state[2]);
21
-                }
22
-
23
-                width >>= m_hChromaShift;
24
-                height >>= m_vChromaShift;
25
-                stride = reconPic->m_strideC;
26
-
27
-                updateMD5Plane(m_frameEncoder->m_state[1], reconPic->getCbAddr(cuAddr), width, height, stride);
28
-                updateMD5Plane(m_frameEncoder->m_state[2], reconPic->getCrAddr(cuAddr), width, height, stride);
29
-            }
30
-        }
31
-        else if (m_param->decodedPictureHashSEI == 2)
32
-        {
33
-            uint32_t height = m_parallelFilter[row].getCUHeight();
34
-            uint32_t width = reconPic->m_picWidth;
35
-            intptr_t stride = reconPic->m_stride;
36
-
37
-            if (!row)
38
-                m_frameEncoder->m_crc[0] = 0xffff;
39
-
40
-            updateCRC(reconPic->getLumaAddr(cuAddr), m_frameEncoder->m_crc[0], height, width, stride);
41
-            if (m_param->internalCsp != X265_CSP_I400)
42
-            {
43
-                width >>= m_hChromaShift;
44
-                height >>= m_vChromaShift;
45
-                stride = reconPic->m_strideC;
46
-                m_frameEncoder->m_crc[1] = m_frameEncoder->m_crc[2] = 0xffff;
47
-
48
-                updateCRC(reconPic->getCbAddr(cuAddr), m_frameEncoder->m_crc[1], height, width, stride);
49
-                updateCRC(reconPic->getCrAddr(cuAddr), m_frameEncoder->m_crc[2], height, width, stride);
50
-            }
51
-        }
52
-        else if (m_param->decodedPictureHashSEI == 3)
53
-        {
54
-            uint32_t width = reconPic->m_picWidth;
55
-            uint32_t height = m_parallelFilter[row].getCUHeight();
56
-            intptr_t stride = reconPic->m_stride;
57
-            uint32_t cuHeight = m_param->maxCUSize;
58
-
59
-            if (!row)
60
-                m_frameEncoder->m_checksum[0] = 0;
61
-
62
-            updateChecksum(reconPic->m_picOrg[0], m_frameEncoder->m_checksum[0], height, width, stride, row, cuHeight);
63
-            if (m_param->internalCsp != X265_CSP_I400)
64
-            {
65
-                width >>= m_hChromaShift;
66
-                height >>= m_vChromaShift;
67
-                stride = reconPic->m_strideC;
68
-                cuHeight >>= m_vChromaShift;
69
-
70
-                if (!row)
71
-                    m_frameEncoder->m_checksum[1] = m_frameEncoder->m_checksum[2] = 0;
72
-
73
-                updateChecksum(reconPic->m_picOrg[1], m_frameEncoder->m_checksum[1], height, width, stride, row, cuHeight);
74
-                updateChecksum(reconPic->m_picOrg[2], m_frameEncoder->m_checksum[2], height, width, stride, row, cuHeight);
75
-            }
76
-        }
77
+        uint32_t height = m_parallelFilter[row].getCUHeight();
78
+        m_frameEncoder->initDecodedPictureHashSEI(row, cuAddr, height);
79
     } // end of (m_param->maxSlices == 1)
80
 
81
     if (ATOMIC_INC(&m_frameEncoder->m_completionCount) == 2 * (int)m_frameEncoder->m_numRows)
82
x265_2.7.tar.gz/source/encoder/ratecontrol.cpp -> x265_2.9.tar.gz/source/encoder/ratecontrol.cpp Changed
114
 
1
@@ -1282,6 +1282,12 @@
2
         m_predictedBits = m_totalBits;
3
         updateVbvPlan(enc);
4
         rce->bufferFill = m_bufferFill;
5
+        rce->vbvEndAdj = false;
6
+        if (m_param->vbvBufferEnd && rce->encodeOrder >= m_param->vbvEndFrameAdjust * m_param->totalFrames)
7
+        {
8
+            rce->vbvEndAdj = true;
9
+            rce->targetFill = 0;
10
+        }
11
 
12
         int mincr = enc->m_vps.ptl.minCrForLevel;
13
         /* Profiles above Main10 don't require maxAU size check, so just set the maximum to a large value. */
14
@@ -1290,7 +1296,7 @@
15
         else
16
         {
17
             /* The spec has a special case for the first frame. */
18
-            if (rce->encodeOrder == 0)
19
+            if (curFrame->m_lowres.bKeyframe)
20
             {
21
                 /* 1.5 * (Max( PicSizeInSamplesY, fR * MaxLumaSr) + MaxLumaSr * (AuCpbRemovalTime[ 0 ] -AuNominalRemovalTime[ 0 ])) ? MinCr */
22
                 double fr = 1. / 300;
23
@@ -1302,6 +1308,7 @@
24
                 /* 1.5 * MaxLumaSr * (AuCpbRemovalTime[ n ] - AuCpbRemovalTime[ n - 1 ]) / MinCr */
25
                 rce->frameSizeMaximum = 8 * 1.5 * enc->m_vps.ptl.maxLumaSrForLevel * m_frameDuration / mincr;
26
             }
27
+            rce->frameSizeMaximum *= m_param->maxAUSizeFactor;
28
         }
29
     }
30
     if (!m_isAbr && m_2pass && m_param->rc.rateControlMode == X265_RC_CRF)
31
@@ -2172,12 +2179,12 @@
32
                     curBits = predictSize(&m_pred[predType], frameQ[type], (double)satd);
33
                     bufferFillCur -= curBits;
34
                 }
35
-                if (m_param->vbvBufferEnd && rce->encodeOrder >= m_param->vbvEndFrameAdjust * m_param->totalFrames)
36
+                if (rce->vbvEndAdj)
37
                 {
38
                     bool loopBreak = false;
39
                     double bufferDiff = m_param->vbvBufferEnd - (m_bufferFill / m_bufferSize);
40
-                    targetFill = m_bufferFill + m_bufferSize * (bufferDiff / (m_param->totalFrames - rce->encodeOrder));
41
-                    if (bufferFillCur < targetFill)
42
+                    rce->targetFill = m_bufferFill + m_bufferSize * (bufferDiff / (m_param->totalFrames - rce->encodeOrder));
43
+                    if (bufferFillCur < rce->targetFill)
44
                     {
45
                         q *= 1.01;
46
                         loopTerminate |= 1;
47
@@ -2420,6 +2427,7 @@
48
         double rcTol = bufferLeftPlanned / m_param->frameNumThreads * m_rateTolerance;
49
         int32_t encodedBitsSoFar = 0;
50
         double accFrameBits = predictRowsSizeSum(curFrame, rce, qpVbv, encodedBitsSoFar);
51
+        double vbvEndBias = 0.95;
52
 
53
         /* * Don't increase the row QPs until a sufficent amount of the bits of
54
          * the frame have been processed, in case a flat area at the top of the
55
@@ -2441,7 +2449,8 @@
56
         while (qpVbv < qpMax
57
                && (((accFrameBits > rce->frameSizePlanned + rcTol) ||
58
                    (rce->bufferFill - accFrameBits < bufferLeftPlanned * 0.5) ||
59
-                   (accFrameBits > rce->frameSizePlanned && qpVbv < rce->qpNoVbv))
60
+                   (accFrameBits > rce->frameSizePlanned && qpVbv < rce->qpNoVbv) ||
61
+                   (rce->vbvEndAdj && ((rce->bufferFill - accFrameBits) < (rce->targetFill * vbvEndBias))))
62
                    && (!m_param->rc.bStrictCbr ? 1 : abrOvershoot > 0.1)))
63
         {
64
             qpVbv += stepSize;
65
@@ -2452,7 +2461,8 @@
66
         while (qpVbv > qpMin
67
                && (qpVbv > curEncData.m_rowStat[0].rowQp || m_singleFrameVbv)
68
                && (((accFrameBits < rce->frameSizePlanned * 0.8f && qpVbv <= prevRowQp)
69
-                   || accFrameBits < (rce->bufferFill - m_bufferSize + m_bufferRate) * 1.1)
70
+                   || accFrameBits < (rce->bufferFill - m_bufferSize + m_bufferRate) * 1.1
71
+                   || (rce->vbvEndAdj && ((rce->bufferFill - accFrameBits) > (rce->targetFill * vbvEndBias))))
72
                    && (!m_param->rc.bStrictCbr ? 1 : abrOvershoot < 0)))
73
         {
74
             qpVbv -= stepSize;
75
@@ -2630,8 +2640,9 @@
76
     FrameData& curEncData = *curFrame->m_encData;
77
     int64_t actualBits = bits;
78
     Slice *slice = curEncData.m_slice;
79
+    bool bEnableDistOffset = m_param->analysisMultiPassDistortion && m_param->rc.bStatRead;
80
 
81
-    if (m_param->rc.aqMode || m_isVbv || m_param->bAQMotion)
82
+    if (m_param->rc.aqMode || m_isVbv || m_param->bAQMotion || bEnableDistOffset)
83
     {
84
         if (m_isVbv && !(m_2pass && m_param->rc.rateControlMode == X265_RC_CRF))
85
         {
86
@@ -2645,10 +2656,10 @@
87
             rce->qpaRc = curEncData.m_avgQpRc;
88
         }
89
 
90
-        if (m_param->rc.aqMode || m_param->bAQMotion)
91
+        if (m_param->rc.aqMode || m_param->bAQMotion || bEnableDistOffset)
92
         {
93
             double avgQpAq = 0;
94
-            /* determine actual avg encoded QP, after AQ/cutree adjustments */
95
+            /* determine actual avg encoded QP, after AQ/cutree/distortion adjustments */
96
             for (uint32_t i = 0; i < slice->m_sps->numCuInHeight; i++)
97
                 avgQpAq += curEncData.m_rowStat[i].sumQpAq;
98
 
99
@@ -2792,12 +2803,8 @@
100
 /* called to write out the rate control frame stats info in multipass encodes */
101
 int RateControl::writeRateControlFrameStats(Frame* curFrame, RateControlEntry* rce)
102
 {
103
-    FrameData& curEncData = *curFrame->m_encData;
104
-    int ncu;
105
-    if (m_param->rc.qgSize == 8)
106
-        ncu = m_ncu * 4;
107
-    else
108
-        ncu = m_ncu;
109
+    FrameData& curEncData = *curFrame->m_encData;    
110
+    int ncu = (m_param->rc.qgSize == 8) ? m_ncu * 4 : m_ncu;
111
     char cType = rce->sliceType == I_SLICE ? (curFrame->m_lowres.sliceType == X265_TYPE_IDR ? 'I' : 'i')
112
         : rce->sliceType == P_SLICE ? 'P'
113
         : IS_REFERENCED(curFrame) ? 'B' : 'b';
114
x265_2.7.tar.gz/source/encoder/ratecontrol.h -> x265_2.9.tar.gz/source/encoder/ratecontrol.h Changed
10
 
1
@@ -82,6 +82,8 @@
2
     double  rowCplxrSum;
3
     double  qpNoVbv;
4
     double  bufferFill;
5
+    double  targetFill;
6
+    bool    vbvEndAdj;
7
     double  frameDuration;
8
     double  clippedDuration;
9
     double  frameSizeEstimated; /* hold frameSize, updated from cu level vbv rc */
10
x265_2.7.tar.gz/source/encoder/reference.cpp -> x265_2.9.tar.gz/source/encoder/reference.cpp Changed
24
 
1
@@ -89,7 +89,7 @@
2
                 cuHeight >>= reconPic->m_vChromaShift;
3
             }
4
 
5
-            if (wp[c].bPresentFlag)
6
+            if (wp[c].wtPresent)
7
             {
8
                 if (!weightBuffer[c])
9
                 {
10
@@ -155,12 +155,10 @@
11
 
12
         const pixel* src = reconPic->m_picOrg[c] + numWeightedRows * cuHeight * stride;
13
         pixel* dst = fpelPlane[c] + numWeightedRows * cuHeight * stride;
14
-
15
         // Computing weighted CU rows
16
         int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
17
-        int padwidth = (width + 15) & ~15;              // weightp assembly needs even 16 byte widths
18
+        int padwidth = (width + 31) & ~31;              // weightp assembly needs even 32 byte widths
19
         primitives.weight_pp(src, dst, stride, padwidth, height, w[c].weight, w[c].round << correction, w[c].shift + correction, w[c].offset);
20
-
21
         // Extending Left & Right
22
         primitives.extendRowBorder(dst, stride, width, height, marginX);
23
 
24
x265_2.7.tar.gz/source/encoder/search.cpp -> x265_2.9.tar.gz/source/encoder/search.cpp Changed
409
 
1
@@ -82,7 +82,7 @@
2
     m_me.init(param.internalCsp);
3
 
4
     bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder);
5
-    if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
6
+    if (m_param->noiseReductionIntra || m_param->noiseReductionInter )
7
         ok &= m_quant.allocNoiseReduction(param);
8
 
9
     ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
10
@@ -354,14 +354,17 @@
11
         // store original entropy coding status
12
         if (bEnableRDOQ)
13
             m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
14
-
15
-        primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
16
+        primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
17
 
18
         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
19
         if (numSig)
20
         {
21
             m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
22
-            primitives.cu[sizeIdx].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
23
+            bool reconQtYuvAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
24
+            bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
25
+            bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
26
+            bool bufferAlignCheck = (reconQtStride % 64 == 0) && (stride % 64 == 0) && reconQtYuvAlign && predAlign && residualAlign;
27
+            primitives.cu[sizeIdx].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride);
28
         }
29
         else
30
             // no coded residual, recon = pred
31
@@ -559,15 +562,19 @@
32
 
33
         coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffY);
34
         pixel*   tmpRecon = (useTSkip ? m_tsRecon : reconQt);
35
+        bool tmpReconAlign = (useTSkip ? 1 : (m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0));
36
         uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
37
 
38
-        primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
39
+        primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
40
 
41
         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSize, TEXT_LUMA, absPartIdx, useTSkip);
42
         if (numSig)
43
         {
44
             m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSize, TEXT_LUMA, true, useTSkip, numSig);
45
-            primitives.cu[sizeIdx].add_ps(tmpRecon, tmpReconStride, pred, residual, stride, stride);
46
+            bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size) % 64 == 0;
47
+            bool predAlign = predYuv->getAddrOffset(absPartIdx, predYuv->m_size) % 64 == 0;
48
+            bool bufferAlignCheck = (stride % 64 == 0) && (tmpReconStride % 64 == 0) && tmpReconAlign && residualAlign && predAlign;
49
+            primitives.cu[sizeIdx].add_ps[bufferAlignCheck](tmpRecon, tmpReconStride, pred, residual, stride, stride);
50
         }
51
         else if (useTSkip)
52
         {
53
@@ -714,7 +721,7 @@
54
         coeff_t* coeffY       = cu.m_trCoeff[0] + coeffOffsetY;
55
 
56
         uint32_t sizeIdx   = log2TrSize - 2;
57
-        primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
58
+        primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
59
 
60
         PicYuv*  reconPic = m_frame->m_reconPic;
61
         pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
62
@@ -724,7 +731,11 @@
63
         if (numSig)
64
         {
65
             m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
66
-            primitives.cu[sizeIdx].add_ps(picReconY, picStride, pred, residual, stride, stride);
67
+            bool picReconYAlign = (reconPic->m_cuOffsetY[cu.m_cuAddr] + reconPic->m_buOffsetY[cuGeom.absPartIdx + absPartIdx]) % 64 == 0;
68
+            bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
69
+            bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, m_rqt[cuGeom.depth].tmpResiYuv.m_size)% 64 == 0;
70
+            bool bufferAlignCheck = (picStride % 64 == 0) && (stride % 64 == 0) && picReconYAlign && predAlign && residualAlign;
71
+            primitives.cu[sizeIdx].add_ps[bufferAlignCheck](picReconY, picStride, pred, residual, stride, stride);
72
             cu.setCbfSubParts(1 << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
73
         }
74
         else
75
@@ -893,12 +904,17 @@
76
             predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
77
             cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
78
 
79
-            primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
80
+            primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
81
+
82
             uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
83
             if (numSig)
84
             {
85
                 m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
86
-                primitives.cu[sizeIdxC].add_ps(reconQt, reconQtStride, pred, residual, stride, stride);
87
+                bool reconQtAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
88
+                bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
89
+                bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
90
+                bool bufferAlignCheck = reconQtAlign && predAlign && residualAlign && (reconQtStride % 64 == 0) && (stride % 64 == 0);
91
+                primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride);
92
                 cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
93
             }
94
             else
95
@@ -992,13 +1008,17 @@
96
                 pixel*   recon = (useTSkip ? m_tsRecon : reconQt);
97
                 uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
98
 
99
-                primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
100
+                primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
101
 
102
                 uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeff, log2TrSizeC, ttype, absPartIdxC, useTSkip);
103
                 if (numSig)
104
                 {
105
                     m_quant.invtransformNxN(cu, residual, stride, coeff, log2TrSizeC, ttype, true, useTSkip, numSig);
106
-                    primitives.cu[sizeIdxC].add_ps(recon, reconStride, pred, residual, stride, stride);
107
+                    bool reconAlign = (useTSkip ? 1 : m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC)) % 64 == 0;
108
+                    bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
109
+                    bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
110
+                    bool bufferAlignCheck = reconAlign && predYuvAlign && residualAlign && (reconStride % 64 == 0) && (stride % 64 == 0);
111
+                    primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](recon, reconStride, pred, residual, stride, stride);
112
                     cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
113
                 }
114
                 else if (useTSkip)
115
@@ -1183,12 +1203,17 @@
116
 
117
             X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");
118
 
119
-            primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
120
+            primitives.cu[sizeIdxC].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
121
+
122
             uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffC, log2TrSizeC, ttype, absPartIdxC, false);
123
             if (numSig)
124
             {
125
                 m_quant.invtransformNxN(cu, residual, stride, coeffC, log2TrSizeC, ttype, true, false, numSig);
126
-                primitives.cu[sizeIdxC].add_ps(picReconC, picStride, pred, residual, stride, stride);
127
+                bool picReconCAlign = (reconPic->m_cuOffsetC[cu.m_cuAddr] + reconPic->m_buOffsetC[cuGeom.absPartIdx + absPartIdxC]) % 64 == 0;
128
+                bool predAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
129
+                bool residualAlign = resiYuv.getChromaAddrOffset(absPartIdxC)% 64 == 0;
130
+                bool bufferAlignCheck = picReconCAlign && predAlign && residualAlign && (picStride % 64 == 0) && (stride % 64 == 0);
131
+                primitives.cu[sizeIdxC].add_ps[bufferAlignCheck](picReconC, picStride, pred, residual, stride, stride);
132
                 cu.setCbfPartRange(1 << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
133
             }
134
             else
135
@@ -1304,7 +1329,7 @@
136
 
137
         pixel nScale[129];
138
         intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0];
139
-        primitives.scale1D_128to64(nScale + 1, intraNeighbourBuf[0] + 1);
140
+        primitives.scale1D_128to64[NONALIGNED](nScale + 1, intraNeighbourBuf[0] + 1);
141
 
142
         // we do not estimate filtering for downscaled samples
143
         memcpy(&intraNeighbourBuf[0][1], &nScale[1], 2 * 64 * sizeof(pixel));   // Top & Left pixels
144
@@ -2107,18 +2132,24 @@
145
         bestME[list].mvCost  = mvCost;
146
     }
147
 }
148
-
149
-void Search::searchMV(Mode& interMode, const PredictionUnit& pu, int list, int ref, MV& outmv)
150
+void Search::searchMV(Mode& interMode, const PredictionUnit& pu, int list, int ref, MV& outmv, MV mvp, int numMvc, MV* mvc)
151
 {
152
     CUData& cu = interMode.cu;
153
     const Slice *slice = m_slice;
154
-    MV mv = cu.m_mv[list][pu.puAbsPartIdx];
155
+    MV mv;
156
+    if (m_param->interRefine == 1)
157
+        mv = mvp;
158
+    else
159
+        mv = cu.m_mv[list][pu.puAbsPartIdx];
160
     cu.clipMv(mv);
161
     MV mvmin, mvmax;
162
     setSearchRange(cu, mv, m_param->searchRange, mvmin, mvmax);
163
-    m_me.refineMV(&slice->m_mref[list][ref], mvmin, mvmax, mv, outmv);
164
+    if (m_param->interRefine == 1)
165
+        m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mv, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
166
+        m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
167
+    else
168
+        m_me.refineMV(&slice->m_mref[list][ref], mvmin, mvmax, mv, outmv);
169
 }
170
-
171
 /* find the best inter prediction for each PU of specified mode */
172
 void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2])
173
 {
174
@@ -2138,20 +2169,29 @@
175
     int      totalmebits = 0;
176
     MV       mvzero(0, 0);
177
     Yuv&     tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
178
-
179
     MergeData merge;
180
     memset(&merge, 0, sizeof(merge));
181
-
182
+    bool useAsMVP = false;
183
     for (int puIdx = 0; puIdx < numPart; puIdx++)
184
     {
185
         MotionData* bestME = interMode.bestME[puIdx];
186
         PredictionUnit pu(cu, cuGeom, puIdx);
187
-
188
         m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
189
-
190
+        useAsMVP = false;
191
+        x265_analysis_inter_data* interDataCTU = NULL;
192
+        int cuIdx;
193
+        cuIdx = (interMode.cu.m_cuAddr * m_param->num4x4Partitions) + cuGeom.absPartIdx;
194
+        if (m_param->analysisReuseLevel == 10 && m_param->interRefine > 1)
195
+        {
196
+            interDataCTU = m_frame->m_analysisData.interData;
197
+            if ((cu.m_predMode[pu.puAbsPartIdx] == interDataCTU->modes[cuIdx + pu.puAbsPartIdx])
198
+                && (cu.m_partSize[pu.puAbsPartIdx] == interDataCTU->partSize[cuIdx + pu.puAbsPartIdx])
199
+                && !(interDataCTU->mergeFlag[cuIdx + puIdx])
200
+                && (cu.m_cuDepth[0] == interDataCTU->depth[cuIdx]))
201
+                useAsMVP = true;
202
+        }
203
         /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
204
         uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge);
205
-
206
         bestME[0].cost = MAX_UINT;
207
         bestME[1].cost = MAX_UINT;
208
 
209
@@ -2159,26 +2199,37 @@
210
         bool bDoUnidir = true;
211
 
212
         cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours);
213
-
214
         /* Uni-directional prediction */
215
         if ((m_param->analysisLoad && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10)
216
-            || (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) || (m_param->bMVType == AVC_INFO))
217
+            || (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) || (m_param->bMVType == AVC_INFO) || (useAsMVP))
218
         {
219
             for (int list = 0; list < numPredDir; list++)
220
             {
221
-                int ref = bestME[list].ref;
222
+
223
+                int ref = -1;
224
+                if (useAsMVP)
225
+                    ref = interDataCTU->refIdx[list][cuIdx + puIdx];
226
+
227
+                else
228
+                    ref = bestME[list].ref;
229
                 if (ref < 0)
230
+                {
231
                     continue;
232
-
233
+                }
234
                 uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
235
                 bits += getTUBits(ref, numRefIdx[list]);
236
 
237
                 int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);
238
-
239
                 const MV* amvp = interMode.amvpCand[list][ref];
240
                 int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
241
-                MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
242
-
243
+                MV mvmin, mvmax, outmv, mvp;
244
+                if (useAsMVP)
245
+                {
246
+                    mvp = interDataCTU->mv[list][cuIdx + puIdx].word;
247
+                    mvpIdx = interDataCTU->mvpIdx[list][cuIdx + puIdx];
248
+                }
249
+                else
250
+                    mvp = amvp[mvpIdx];
251
                 if (m_param->searchMethod == X265_SEA)
252
                 {
253
                     int puX = puIdx & 1;
254
@@ -2198,9 +2249,8 @@
255
                 bits += m_me.bitcost(outmv);
256
                 uint32_t mvCost = m_me.mvcost(outmv);
257
                 uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
258
-
259
                 /* Refine MVP selection, updates: mvpIdx, bits, cost */
260
-                if (!m_param->analysisMultiPassRefine)
261
+                if (!(m_param->analysisMultiPassRefine || useAsMVP))
262
                     mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
263
                 else
264
                 {
265
@@ -2225,6 +2275,7 @@
266
                     bestME[list].cost = cost;
267
                     bestME[list].bits = bits;
268
                     bestME[list].mvCost  = mvCost;
269
+                    bestME[list].ref = ref;
270
                 }
271
                 bDoUnidir = false;
272
             }            
273
@@ -2372,8 +2423,7 @@
274
                 /* Generate reference subpels */
275
                 predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);
276
                 predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);
277
-
278
-                primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
279
+                primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (bidirYuv[0].m_size % 64 == 0) && (bidirYuv[1].m_size % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
280
                                                                                                  bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);
281
                 satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
282
             }
283
@@ -2415,11 +2465,9 @@
284
                     const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
285
                     const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
286
                     intptr_t refStride = slice->m_mref[0][0].lumaStride;
287
-
288
-                    primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
289
+                    primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
290
                     satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
291
                 }
292
-
293
                 MV mvp0 = bestME[0].mvp;
294
                 int mvpIdx0 = bestME[0].mvpIdx;
295
                 uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);
296
@@ -2888,7 +2936,7 @@
297
         }
298
         else
299
         {
300
-            primitives.cu[sizeIdx].blockfill_s(curResiY, strideResiY, 0);
301
+            primitives.cu[sizeIdx].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
302
             cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth);
303
         }
304
 
305
@@ -2921,7 +2969,7 @@
306
                 }
307
                 else
308
                 {
309
-                    primitives.cu[sizeIdxC].blockfill_s(curResiU, strideResiC, 0);
310
+                    primitives.cu[sizeIdxC].blockfill_s[strideResiC % 64 == 0](curResiU, strideResiC, 0);
311
                     cu.setCbfPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.absPartIdxStep);
312
                 }
313
 
314
@@ -2935,7 +2983,7 @@
315
                 }
316
                 else
317
                 {
318
-                    primitives.cu[sizeIdxC].blockfill_s(curResiV, strideResiC, 0);
319
+                    primitives.cu[sizeIdxC].blockfill_s[strideResiC % 64 == 0](curResiV, strideResiC, 0);
320
                     cu.setCbfPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.absPartIdxStep);
321
                 }
322
             }
323
@@ -3168,8 +3216,12 @@
324
             // non-zero cost calculation for luma - This is an approximation
325
             // finally we have to encode correct cbf after comparing with null cost
326
             pixel* curReconY = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
327
+            bool curReconYAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].reconQtYuv.m_size) % 64 == 0;
328
             uint32_t strideReconY = m_rqt[qtLayer].reconQtYuv.m_size;
329
-            primitives.cu[partSize].add_ps(curReconY, strideReconY, mode.predYuv.getLumaAddr(absPartIdx), curResiY, mode.predYuv.m_size, strideResiY);
330
+            bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
331
+            bool curResiYAlign = m_rqt[qtLayer].resiQtYuv.getAddrOffset(absPartIdx, m_rqt[qtLayer].resiQtYuv.m_size) % 64 == 0;
332
+            bool bufferAlignCheck = curReconYAlign && predYuvAlign && curResiYAlign && (strideReconY % 64 == 0) && (mode.predYuv.m_size % 64 == 0) && (strideResiY % 64 == 0);
333
+            primitives.cu[partSize].add_ps[bufferAlignCheck](curReconY, strideReconY, mode.predYuv.getLumaAddr(absPartIdx), curResiY, mode.predYuv.m_size, strideResiY);
334
 
335
             const sse_t nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, curReconY, strideReconY);
336
             uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
337
@@ -3203,7 +3255,7 @@
338
                 {
339
                     cbfFlag[TEXT_LUMA][0] = 0;
340
                     singleBits[TEXT_LUMA][0] = 0;
341
-                    primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
342
+                    primitives.cu[partSize].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
343
 #if CHECKED_BUILD || _DEBUG
344
                     uint32_t numCoeffY = 1 << (log2TrSize << 1);
345
                     memset(coeffCurY, 0, sizeof(coeff_t)* numCoeffY);
346
@@ -3226,7 +3278,7 @@
347
         {
348
             if (checkTransformSkipY)
349
                 minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroEnergyY, tuDepth, TEXT_LUMA);
350
-            primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
351
+            primitives.cu[partSize].blockfill_s[strideResiY % 64 == 0](curResiY, strideResiY, 0);
352
             singleDist[TEXT_LUMA][0] = zeroDistY;
353
             singleBits[TEXT_LUMA][0] = 0;
354
             singleEnergy[TEXT_LUMA][0] = zeroEnergyY;
355
@@ -3284,7 +3336,11 @@
356
                         // finally we have to encode correct cbf after comparing with null cost
357
                         pixel* curReconC      = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
358
                         uint32_t strideReconC = m_rqt[qtLayer].reconQtYuv.m_csize;
359
-                        primitives.cu[partSizeC].add_ps(curReconC, strideReconC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), curResiC, mode.predYuv.m_csize, strideResiC);
360
+                        bool curReconCAlign = m_rqt[qtLayer].reconQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
361
+                        bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
362
+                        bool curResiCAlign = m_rqt[qtLayer].resiQtYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
363
+                        bool bufferAlignCheck = curReconCAlign && predYuvAlign && curResiCAlign && (strideReconC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (strideResiC % 64 == 0);
364
+                        primitives.cu[partSizeC].add_ps[bufferAlignCheck](curReconC, strideReconC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), curResiC, mode.predYuv.m_csize, strideResiC);
365
                         sse_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, curReconC, strideReconC));
366
                         uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
367
                         uint32_t nonZeroEnergyC = 0; uint64_t singleCostC = 0;
368
@@ -3315,7 +3371,7 @@
369
                             {
370
                                 cbfFlag[chromaId][tuIterator.section] = 0;
371
                                 singleBits[chromaId][tuIterator.section] = 0;
372
-                                primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
373
+                                primitives.cu[partSizeC].blockfill_s[strideResiC % 64 == 0](curResiC, strideResiC, 0);
374
 #if CHECKED_BUILD || _DEBUG
375
                                 uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
376
                                 memset(coeffCurC + subTUOffset, 0, sizeof(coeff_t) * numCoeffC);
377
@@ -3338,7 +3394,7 @@
378
                     {
379
                         if (checkTransformSkipC)
380
                             minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroEnergyC, tuDepthC, (TextType)chromaId);
381
-                        primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
382
+                        primitives.cu[partSizeC].blockfill_s[strideResiC % 64 == 0](curResiC, strideResiC, 0);
383
                         singleBits[chromaId][tuIterator.section] = 0;
384
                         singleDist[chromaId][tuIterator.section] = zeroDistC;
385
                         singleEnergy[chromaId][tuIterator.section] = zeroEnergyC;
386
@@ -3388,8 +3444,10 @@
387
                 const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();
388
 
389
                 m_quant.invtransformNxN(cu, m_tsResidual, trSize, m_tsCoeff, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
390
+                bool predYuvAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
391
 
392
-                primitives.cu[partSize].add_ps(m_tsRecon, trSize, mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, trSize);
393
+                bool bufferAlignCheck = predYuvAlign && (trSize % 64 == 0) && (mode.predYuv.m_size % 64 == 0);
394
+                primitives.cu[partSize].add_ps[bufferAlignCheck](m_tsRecon, trSize, mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, trSize);
395
                 nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, m_tsRecon, trSize);
396
 
397
                 if (m_rdCost.m_psyRd)
398
@@ -3466,7 +3524,9 @@
399
 
400
                         m_quant.invtransformNxN(cu, m_tsResidual, trSizeC, m_tsCoeff,
401
                                                 log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
402
-                        primitives.cu[partSizeC].add_ps(m_tsRecon, trSizeC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, mode.predYuv.m_csize, trSizeC);
403
+                        bool predYuvAlign = mode.predYuv.getChromaAddrOffset(absPartIdxC) % 64 == 0;
404
+                        bool bufferAlignCheck = predYuvAlign && (trSizeC % 64 == 0) && (mode.predYuv.m_csize % 64 == 0) && (trSizeC % 64 == 0);
405
+                        primitives.cu[partSizeC].add_ps[bufferAlignCheck](m_tsRecon, trSizeC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, mode.predYuv.m_csize, trSizeC);
406
                         nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, m_tsRecon, trSizeC));
407
                         if (m_rdCost.m_psyRd)
408
                         {
409
x265_2.7.tar.gz/source/encoder/search.h -> x265_2.9.tar.gz/source/encoder/search.h Changed
11
 
1
@@ -310,8 +310,7 @@
2
 
3
     // estimation inter prediction (non-skip)
4
     void     predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks[2]);
5
-
6
-    void     searchMV(Mode& interMode, const PredictionUnit& pu, int list, int ref, MV& outmv);
7
+    void     searchMV(Mode& interMode, const PredictionUnit& pu, int list, int ref, MV& outmv, MV mvp, int numMvc, MV* mvc);
8
     // encode residual and compute rd-cost for inter mode
9
     void     encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom);
10
     void     encodeResAndCalcRdSkipCU(Mode& interMode);
11
x265_2.7.tar.gz/source/encoder/sei.cpp -> x265_2.9.tar.gz/source/encoder/sei.cpp Changed
141
 
1
@@ -35,45 +35,40 @@
2
 };
3
 
4
 /* marshal a single SEI message sei, storing the marshalled representation
5
- * in bitstream bs */
6
-void SEI::write(Bitstream& bs, const SPS& sps)
7
+* in bitstream bs */
8
+void SEI::writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested)
9
 {
10
-    uint32_t type = m_payloadType;
11
+    if (!isNested)
12
+        bs.resetBits();
13
+
14
+    BitCounter counter;
15
+    m_bitIf = &counter;
16
+    writeSEI(sps);
17
+    /* count the size of the payload and return the size in bits */
18
+    X265_CHECK(0 == (counter.getNumberOfWrittenBits() & 7), "payload unaligned\n");
19
+    uint32_t payloadData = counter.getNumberOfWrittenBits() >> 3;
20
+
21
+    // set bitstream
22
     m_bitIf = &bs;
23
-    BitCounter count;
24
-    bool hrdTypes = (m_payloadType == ACTIVE_PARAMETER_SETS || m_payloadType == PICTURE_TIMING || m_payloadType == BUFFERING_PERIOD);
25
-    if (hrdTypes)
26
-    {
27
-        m_bitIf = &count;
28
-        /* virtual writeSEI method, write to bit counter to determine size */
29
-        writeSEI(sps);
30
-        m_bitIf = &bs;
31
-        uint32_t payloadType = m_payloadType;
32
-        for (; payloadType >= 0xff; payloadType -= 0xff)
33
-            WRITE_CODE(0xff, 8, "payload_type");
34
-    }
35
-    WRITE_CODE(type, 8, "payload_type");
36
-    uint32_t payloadSize;
37
-    if (hrdTypes || m_payloadType == USER_DATA_UNREGISTERED || m_payloadType == USER_DATA_REGISTERED_ITU_T_T35)
38
+
39
+    uint32_t payloadType = m_payloadType;
40
+    for (; payloadType >= 0xff; payloadType -= 0xff)
41
+        WRITE_CODE(0xff, 8, "payload_type");
42
+    WRITE_CODE(payloadType, 8, "payload_type");
43
+
44
+    uint32_t payloadSize = payloadData;
45
+    for (; payloadSize >= 0xff; payloadSize -= 0xff)
46
+        WRITE_CODE(0xff, 8, "payload_size");
47
+    WRITE_CODE(payloadSize, 8, "payload_size");
48
+
49
+    // virtual writeSEI method, write to bs 
50
+    writeSEI(sps);
51
+
52
+    if (!isNested)
53
     {
54
-        if (hrdTypes)
55
-        {
56
-            X265_CHECK(0 == (count.getNumberOfWrittenBits() & 7), "payload unaligned\n");
57
-            payloadSize = count.getNumberOfWrittenBits() >> 3;
58
-        }
59
-        else if (m_payloadType == USER_DATA_UNREGISTERED)
60
-            payloadSize = m_payloadSize + 16;
61
-        else
62
-            payloadSize = m_payloadSize;
63
-
64
-        for (; payloadSize >= 0xff; payloadSize -= 0xff)
65
-            WRITE_CODE(0xff, 8, "payload_size");
66
-        WRITE_CODE(payloadSize, 8, "payload_size");
67
+        bs.writeByteAlignment();
68
+        list.serialize(nalUnitType, bs);
69
     }
70
-    else
71
-        WRITE_CODE(m_payloadSize, 8, "payload_size");
72
-    /* virtual writeSEI method, write to bs */
73
-    writeSEI(sps);
74
 }
75
 
76
 void SEI::writeByteAlign()
77
@@ -93,3 +88,63 @@
78
 {
79
     m_payloadSize = size;
80
 }
81
+
82
+/* charSet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" */
83
+
84
+char* SEI::base64Decode(char encodedString[], int base64EncodeLength)
85
+{
86
+    char* decodedString;
87
+    decodedString = (char*)malloc(sizeof(char) * ((base64EncodeLength / 4) * 3));
88
+    int i, j, k = 0;
89
+    // stores the bitstream
90
+    int bitstream = 0;
91
+    // countBits stores current number of bits in bitstream
92
+    int countBits = 0;
93
+    // selects 4 characters from encodedString at a time. Find the position of each encoded character in charSet and stores in bitstream
94
+    for (i = 0; i < base64EncodeLength; i += 4)
95
+    {
96
+        bitstream = 0, countBits = 0;
97
+        for (j = 0; j < 4; j++)
98
+        {
99
+            // make space for 6 bits
100
+            if (encodedString[i + j] != '=')
101
+            {
102
+                bitstream = bitstream << 6;
103
+                countBits += 6;
104
+            }
105
+            // Finding the position of each encoded character in charSet and storing in bitstream, use OR '|' operator to store bits
106
+
107
+            if (encodedString[i + j] >= 'A' && encodedString[i + j] <= 'Z')
108
+                bitstream = bitstream | (encodedString[i + j] - 'A');
109
+
110
+            else if (encodedString[i + j] >= 'a' && encodedString[i + j] <= 'z')
111
+                bitstream = bitstream | (encodedString[i + j] - 'a' + 26);
112
+            
113
+            else if (encodedString[i + j] >= '0' && encodedString[i + j] <= '9')
114
+                bitstream = bitstream | (encodedString[i + j] - '0' + 52);
115
+            
116
+            // '+' occurs in 62nd position in charSet
117
+            else if (encodedString[i + j] == '+')
118
+                bitstream = bitstream | 62;
119
+            
120
+            // '/' occurs in 63rd position in charSet
121
+            else if (encodedString[i + j] == '/')
122
+                bitstream = bitstream | 63;
123
+            
124
+            // to delete appended bits during encoding
125
+            else
126
+            {
127
+                bitstream = bitstream >> 2;
128
+                countBits -= 2;
129
+            }
130
+        }
131
+    
132
+        while (countBits != 0)
133
+        {
134
+            countBits -= 8;
135
+            decodedString[k++] = (bitstream >> countBits) & 255;
136
+        }
137
+    }
138
+    return decodedString;
139
+}
140
+
141
x265_2.7.tar.gz/source/encoder/sei.h -> x265_2.9.tar.gz/source/encoder/sei.h Changed
135
 
1
@@ -27,6 +27,8 @@
2
 #include "common.h"
3
 #include "bitstream.h"
4
 #include "slice.h"
5
+#include "nal.h"
6
+#include "md5.h"
7
 
8
 namespace X265_NS {
9
 // private namespace
10
@@ -34,11 +36,11 @@
11
 class SEI : public SyntaxElementWriter
12
 {
13
 public:
14
-    /* SEI users call write() to marshal an SEI to a bitstream.
15
-     * The write() method calls writeSEI() which encodes the header */
16
-    void write(Bitstream& bs, const SPS& sps);
17
-
18
+    /* SEI users call writeSEImessages() to marshal an SEI to a bitstream.
19
+    * The writeSEImessages() method calls writeSEI() which encodes the header */
20
+    void writeSEImessages(Bitstream& bs, const SPS& sps, NalUnitType nalUnitType, NALList& list, int isNested);
21
     void setSize(uint32_t size);
22
+    static char* base64Decode(char encodedString[], int base64EncodeLength);
23
     virtual ~SEI() {}
24
 protected:
25
     SEIPayloadType  m_payloadType;
26
@@ -47,6 +49,32 @@
27
     void writeByteAlign();
28
 };
29
 
30
+//seongnam.oh@samsung.com :: for the Creative Intent Meta Data Encoding
31
+class SEIuserDataRegistered : public SEI
32
+{
33
+public:
34
+    SEIuserDataRegistered()
35
+    {
36
+        m_payloadType = USER_DATA_REGISTERED_ITU_T_T35;
37
+        m_payloadSize = 0;
38
+    }
39
+
40
+    uint8_t *m_userData;
41
+
42
+    // daniel.vt@samsung.com :: for the Creative Intent Meta Data Encoding ( seongnam.oh@samsung.com )
43
+    void writeSEI(const SPS&)
44
+    {
45
+        if (!m_userData)
46
+            return;
47
+
48
+        uint32_t i = 0;
49
+        for (; i < m_payloadSize; ++i)
50
+            WRITE_CODE(m_userData[i], 8, "creative_intent_metadata");
51
+    }
52
+};
53
+
54
+static const uint32_t ISO_IEC_11578_LEN = 16;
55
+
56
 class SEIuserDataUnregistered : public SEI
57
 {
58
 public:
59
@@ -55,11 +83,11 @@
60
         m_payloadType = USER_DATA_UNREGISTERED;
61
         m_payloadSize = 0;
62
     }
63
-    static const uint8_t m_uuid_iso_iec_11578[16];
64
+    static const uint8_t m_uuid_iso_iec_11578[ISO_IEC_11578_LEN];
65
     uint8_t *m_userData;
66
     void writeSEI(const SPS&)
67
     {
68
-        for (uint32_t i = 0; i < 16; i++)
69
+        for (uint32_t i = 0; i < ISO_IEC_11578_LEN; i++)
70
             WRITE_CODE(m_uuid_iso_iec_11578[i], 8, "sei.uuid_iso_iec_11578[i]");
71
         for (uint32_t i = 0; i < m_payloadSize; i++)
72
             WRITE_CODE(m_userData[i], 8, "user_data");
73
@@ -133,7 +161,12 @@
74
         CRC,
75
         CHECKSUM,
76
     } m_method;
77
-    uint8_t m_digest[3][16];
78
+
79
+    MD5Context m_state[3];
80
+    uint32_t   m_crc[3];
81
+    uint32_t   m_checksum[3];
82
+    uint8_t    m_digest[3][16];
83
+
84
     void writeSEI(const SPS& sps)
85
     {
86
         int planes = (sps.chromaFormatIdc != X265_CSP_I400) ? 3 : 1;
87
@@ -253,6 +286,11 @@
88
 class SEIRecoveryPoint : public SEI
89
 {
90
 public:
91
+    SEIRecoveryPoint()
92
+    {
93
+        m_payloadType = RECOVERY_POINT;
94
+        m_payloadSize = 0;
95
+    }
96
     int  m_recoveryPocCnt;
97
     bool m_exactMatchingFlag;
98
     bool m_brokenLinkFlag;
99
@@ -266,28 +304,22 @@
100
     }
101
 };
102
 
103
-//seongnam.oh@samsung.com :: for the Creative Intent Meta Data Encoding
104
-class SEICreativeIntentMeta : public SEI
105
+class SEIAlternativeTC : public SEI
106
 {
107
 public:
108
-    SEICreativeIntentMeta()
109
+    int m_preferredTransferCharacteristics;
110
+    SEIAlternativeTC()
111
     {
112
-        m_payloadType = USER_DATA_REGISTERED_ITU_T_T35;
113
+        m_payloadType = ALTERNATIVE_TRANSFER_CHARACTERISTICS;
114
         m_payloadSize = 0;
115
+        m_preferredTransferCharacteristics = -1;
116
     }
117
 
118
-    uint8_t *m_payload;
119
-
120
-    // daniel.vt@samsung.com :: for the Creative Intent Meta Data Encoding ( seongnam.oh@samsung.com )
121
     void writeSEI(const SPS&)
122
     {
123
-        if (!m_payload)
124
-            return;
125
-
126
-        uint32_t i = 0;
127
-        for (; i < m_payloadSize; ++i)
128
-            WRITE_CODE(m_payload[i], 8, "creative_intent_metadata");
129
+        WRITE_CODE(m_preferredTransferCharacteristics, 8, "Preferred transfer characteristics");
130
     }
131
 };
132
+
133
 }
134
 #endif // ifndef X265_SEI_H
135
x265_2.7.tar.gz/source/encoder/slicetype.cpp -> x265_2.9.tar.gz/source/encoder/slicetype.cpp Changed
400
 
1
@@ -150,20 +150,14 @@
2
         curFrame->m_lowres.wp_sum[y] = 0;
3
     }
4
 
5
-    /* Calculate Qp offset for each 16x16 or 8x8 block in the frame */
6
-    int blockXY = 0;
7
-    int blockX = 0, blockY = 0;
8
-    double strength = 0.f;
9
+    /* Calculate Qp offset for each 16x16 or 8x8 block in the frame */    
10
     if ((param->rc.aqMode == X265_AQ_NONE || param->rc.aqStrength == 0) || (param->rc.bStatRead && param->rc.cuTree && IS_REFERENCED(curFrame)))
11
     {
12
-        /* Need to init it anyways for CU tree */
13
-        int cuCount = blockCount;
14
-
15
         if (param->rc.aqMode && param->rc.aqStrength == 0)
16
         {
17
             if (quantOffsets)
18
             {
19
-                for (int cuxy = 0; cuxy < cuCount; cuxy++)
20
+                for (int cuxy = 0; cuxy < blockCount; cuxy++)
21
                 {
22
                     curFrame->m_lowres.qpCuTreeOffset[cuxy] = curFrame->m_lowres.qpAqOffset[cuxy] = quantOffsets[cuxy];
23
                     curFrame->m_lowres.invQscaleFactor[cuxy] = x265_exp2fix8(curFrame->m_lowres.qpCuTreeOffset[cuxy]);
24
@@ -171,61 +165,55 @@
25
             }
26
             else
27
             {
28
-                memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
29
-                memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
30
-                for (int cuxy = 0; cuxy < cuCount; cuxy++)
31
-                    curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
32
+               memset(curFrame->m_lowres.qpCuTreeOffset, 0, blockCount * sizeof(double));
33
+               memset(curFrame->m_lowres.qpAqOffset, 0, blockCount * sizeof(double));
34
+               for (int cuxy = 0; cuxy < blockCount; cuxy++)
35
+                   curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
36
             }
37
         }
38
 
39
-        /* Need variance data for weighted prediction */
40
+        /* Need variance data for weighted prediction and dynamic refinement*/
41
         if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
42
         {
43
-            for (blockY = 0; blockY < maxRow; blockY += loopIncr)
44
-                for (blockX = 0; blockX < maxCol; blockX += loopIncr)
45
-                    acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
46
+            for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
47
+                for (int blockX = 0; blockX < maxCol; blockX += loopIncr)                
48
+                    acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);                
49
         }
50
     }
51
     else
52
     {
53
-        blockXY = 0;
54
-        double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
55
-        double bias_strength = 0.f;
56
+        int blockXY = 0;
57
+        double avg_adj_pow2 = 0.f, avg_adj = 0.f, qp_adj = 0.f;
58
+        double bias_strength = 0.f, strength = 0.f;
59
         if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
60
         {
61
-            double bit_depth_correction = 1.f / (1 << (2*(X265_DEPTH-8)));
62
-            curFrame->m_lowres.frameVariance = 0;
63
-            uint64_t rowVariance = 0;
64
-            for (blockY = 0; blockY < maxRow; blockY += loopIncr)
65
-            {
66
-                rowVariance = 0;
67
-                for (blockX = 0; blockX < maxCol; blockX += loopIncr)
68
-                {
69
-                    uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
70
-                    curFrame->m_lowres.blockVariance[blockXY] = energy;
71
-                    rowVariance += energy;
72
+            double bit_depth_correction = 1.f / (1 << (2*(X265_DEPTH-8)));            
73
+            
74
+            for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
75
+            {                
76
+                for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
77
+                {
78
+                    uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);                    
79
                     qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
80
                     curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
81
                     avg_adj += qp_adj;
82
                     avg_adj_pow2 += qp_adj * qp_adj;
83
                     blockXY++;
84
                 }
85
-                curFrame->m_lowres.frameVariance += (rowVariance / maxCol);
86
             }
87
-            curFrame->m_lowres.frameVariance /= maxRow;
88
             avg_adj /= blockCount;
89
             avg_adj_pow2 /= blockCount;
90
             strength = param->rc.aqStrength * avg_adj;
91
-            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (modeTwoConst)) / avg_adj;
92
+            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - modeTwoConst) / avg_adj;
93
             bias_strength = param->rc.aqStrength;
94
         }
95
         else
96
             strength = param->rc.aqStrength * 1.0397f;
97
 
98
         blockXY = 0;
99
-        for (blockY = 0; blockY < maxRow; blockY += loopIncr)
100
+        for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
101
         {
102
-            for (blockX = 0; blockX < maxCol; blockX += loopIncr)
103
+            for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
104
             {
105
                 if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
106
                 {
107
@@ -240,7 +228,7 @@
108
                 else
109
                 {
110
                     uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp,param->rc.qgSize);
111
-                    qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8)));
112
+                    qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8)));                    
113
                 }
114
 
115
                 if (param->bHDROpt)
116
@@ -308,6 +296,17 @@
117
             curFrame->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]);
118
         }
119
     }
120
+
121
+    if (param->bDynamicRefine)
122
+    {
123
+        int blockXY = 0;
124
+        for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
125
+            for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
126
+            {
127
+                curFrame->m_lowres.blockVariance[blockXY] = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
128
+                blockXY++;
129
+            }
130
+    }
131
 }
132
 
133
 void LookaheadTLD::lowresIntraEstimate(Lowres& fenc, uint32_t qgSize)
134
@@ -426,7 +425,7 @@
135
     pixel *src = ref.fpelPlane[0];
136
     intptr_t stride = fenc.lumaStride;
137
 
138
-    if (wp.bPresentFlag)
139
+    if (wp.wtPresent)
140
     {
141
         int offset = wp.inputOffset << (X265_DEPTH - 8);
142
         int scale = wp.inputWeight;
143
@@ -480,7 +479,7 @@
144
     int deltaIndex = fenc.frameNum - ref.frameNum;
145
 
146
     WeightParam wp;
147
-    wp.bPresentFlag = false;
148
+    wp.wtPresent = 0;
149
 
150
     if (!wbuffer[0])
151
     {
152
@@ -1078,85 +1077,97 @@
153
     }
154
 
155
     int bframes, brefs;
156
-    for (bframes = 0, brefs = 0;; bframes++)
157
+    if (!m_param->analysisLoad)
158
     {
159
-        Lowres& frm = list[bframes]->m_lowres;
160
-
161
-        if (frm.sliceType == X265_TYPE_BREF && !m_param->bBPyramid && brefs == m_param->bBPyramid)
162
+        for (bframes = 0, brefs = 0;; bframes++)
163
         {
164
-            frm.sliceType = X265_TYPE_B;
165
-            x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid\n",
166
-                     frm.frameNum);
167
-        }
168
+            Lowres& frm = list[bframes]->m_lowres;
169
 
170
-        /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available.
171
-         * smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it. */
172
-        else if (frm.sliceType == X265_TYPE_BREF && m_param->bBPyramid && brefs &&
173
-                 m_param->maxNumReferences <= (brefs + 3))
174
-        {
175
-            frm.sliceType = X265_TYPE_B;
176
-            x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid and %d reference frames\n",
177
-                     frm.sliceType, m_param->maxNumReferences);
178
-        }
179
-        if ((!m_param->bIntraRefresh || frm.frameNum == 0) && frm.frameNum - m_lastKeyframe >= m_param->keyframeMax &&
180
-            (!m_extendGopBoundary || frm.frameNum - m_lastKeyframe >= m_param->keyframeMax + m_param->gopLookahead))
181
-        {
182
-            if (frm.sliceType == X265_TYPE_AUTO || frm.sliceType == X265_TYPE_I)
183
-                frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
184
-            bool warn = frm.sliceType != X265_TYPE_IDR;
185
-            if (warn && m_param->bOpenGOP)
186
-                warn &= frm.sliceType != X265_TYPE_I;
187
-            if (warn)
188
+            if (frm.sliceType == X265_TYPE_BREF && !m_param->bBPyramid && brefs == m_param->bBPyramid)
189
             {
190
-                x265_log(m_param, X265_LOG_WARNING, "specified frame type (%d) at %d is not compatible with keyframe interval\n",
191
-                         frm.sliceType, frm.frameNum);
192
-                frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
193
+                frm.sliceType = X265_TYPE_B;
194
+                x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid\n",
195
+                    frm.frameNum);
196
             }
197
-        }
198
-        if (frm.sliceType == X265_TYPE_I && frm.frameNum - m_lastKeyframe >= m_param->keyframeMin)
199
-        {
200
-            if (m_param->bOpenGOP)
201
+
202
+            /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available.
203
+             * smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it. */
204
+            else if (frm.sliceType == X265_TYPE_BREF && m_param->bBPyramid && brefs &&
205
+                m_param->maxNumReferences <= (brefs + 3))
206
+            {
207
+                frm.sliceType = X265_TYPE_B;
208
+                x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid and %d reference frames\n",
209
+                    frm.sliceType, m_param->maxNumReferences);
210
+            }
211
+            if (((!m_param->bIntraRefresh || frm.frameNum == 0) && frm.frameNum - m_lastKeyframe >= m_param->keyframeMax &&
212
+                (!m_extendGopBoundary || frm.frameNum - m_lastKeyframe >= m_param->keyframeMax + m_param->gopLookahead)) ||
213
+                (frm.frameNum == (m_param->chunkStart - 1)) || (frm.frameNum == m_param->chunkEnd))
214
+            {
215
+                if (frm.sliceType == X265_TYPE_AUTO || frm.sliceType == X265_TYPE_I)
216
+                    frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
217
+                bool warn = frm.sliceType != X265_TYPE_IDR;
218
+                if (warn && m_param->bOpenGOP)
219
+                    warn &= frm.sliceType != X265_TYPE_I;
220
+                if (warn)
221
+                {
222
+                    x265_log(m_param, X265_LOG_WARNING, "specified frame type (%d) at %d is not compatible with keyframe interval\n",
223
+                        frm.sliceType, frm.frameNum);
224
+                    frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
225
+                }
226
+            }
227
+            if ((frm.sliceType == X265_TYPE_I && frm.frameNum - m_lastKeyframe >= m_param->keyframeMin) || (frm.frameNum == (m_param->chunkStart - 1)) || (frm.frameNum == m_param->chunkEnd))
228
             {
229
+                if (m_param->bOpenGOP)
230
+                {
231
+                    m_lastKeyframe = frm.frameNum;
232
+                    frm.bKeyframe = true;
233
+                }
234
+                else
235
+                    frm.sliceType = X265_TYPE_IDR;
236
+            }
237
+            if (frm.sliceType == X265_TYPE_IDR)
238
+            {
239
+                /* Closed GOP */
240
                 m_lastKeyframe = frm.frameNum;
241
                 frm.bKeyframe = true;
242
+                if (bframes > 0 && !m_param->radl)
243
+                {
244
+                    list[bframes - 1]->m_lowres.sliceType = X265_TYPE_P;
245
+                    bframes--;
246
+                }
247
             }
248
-            else
249
-                frm.sliceType = X265_TYPE_IDR;
250
-        }
251
-        if (frm.sliceType == X265_TYPE_IDR)
252
-        {
253
-            /* Closed GOP */
254
-            m_lastKeyframe = frm.frameNum;
255
-            frm.bKeyframe = true;
256
-            if (bframes > 0 && !m_param->radl)
257
+            if (bframes == m_param->bframes || !list[bframes + 1])
258
             {
259
-                list[bframes - 1]->m_lowres.sliceType = X265_TYPE_P;
260
-                bframes--;
261
+                if (IS_X265_TYPE_B(frm.sliceType))
262
+                    x265_log(m_param, X265_LOG_WARNING, "specified frame type is not compatible with max B-frames\n");
263
+                if (frm.sliceType == X265_TYPE_AUTO || IS_X265_TYPE_B(frm.sliceType))
264
+                    frm.sliceType = X265_TYPE_P;
265
             }
266
-        }
267
-        if (m_param->radl && !m_param->bOpenGOP && list[bframes + 1])
268
-        {
269
-            if ((frm.frameNum - m_lastKeyframe) >  (m_param->keyframeMax - m_param->radl - 1) && (frm.frameNum - m_lastKeyframe) <  m_param->keyframeMax)
270
+            if (frm.sliceType == X265_TYPE_BREF)
271
+                brefs++;
272
+            if (frm.sliceType == X265_TYPE_AUTO)
273
                 frm.sliceType = X265_TYPE_B;
274
-            if ((frm.frameNum - m_lastKeyframe) == (m_param->keyframeMax - m_param->radl - 1))
275
-                frm.sliceType = X265_TYPE_P;
276
+            else if (!IS_X265_TYPE_B(frm.sliceType))
277
+                break;
278
         }
279
-
280
-        if (bframes == m_param->bframes || !list[bframes + 1])
281
+    }
282
+    else
283
+    {
284
+        for (bframes = 0, brefs = 0;; bframes++)
285
         {
286
-            if (IS_X265_TYPE_B(frm.sliceType))
287
-                x265_log(m_param, X265_LOG_WARNING, "specified frame type is not compatible with max B-frames\n");
288
-            if (frm.sliceType == X265_TYPE_AUTO || IS_X265_TYPE_B(frm.sliceType))
289
-                frm.sliceType = X265_TYPE_P;
290
-        }
291
-        if (frm.sliceType == X265_TYPE_BREF)
292
-            brefs++;
293
-        if (frm.sliceType == X265_TYPE_AUTO)
294
-            frm.sliceType = X265_TYPE_B;
295
-        else if (!IS_X265_TYPE_B(frm.sliceType))
296
-            break;
297
+            Lowres& frm = list[bframes]->m_lowres;
298
+            if (frm.sliceType == X265_TYPE_BREF)
299
+                brefs++;
300
+            if ((IS_X265_TYPE_I(frm.sliceType) && frm.frameNum - m_lastKeyframe >= m_param->keyframeMin)
301
+                || (frm.frameNum == (m_param->chunkStart - 1)) || (frm.frameNum == m_param->chunkEnd))
302
+            {
303
+                m_lastKeyframe = frm.frameNum;
304
+                frm.bKeyframe = true;
305
+            }
306
+            if (!IS_X265_TYPE_B(frm.sliceType))
307
+                break;
308
+        }
309
     }
310
-
311
     if (bframes)
312
         list[bframes - 1]->m_lowres.bLastMiniGopBFrame = true;
313
     list[bframes]->m_lowres.leadingBframes = bframes;
314
@@ -1406,7 +1417,19 @@
315
         return;
316
     }
317
     frames[framecnt + 1] = NULL;
318
-    int keyFrameLimit = m_param->keyframeMax + m_lastKeyframe - frames[0]->frameNum - 1;
319
+
320
+    int keylimit = m_param->keyframeMax;
321
+    if (frames[0]->frameNum < m_param->chunkEnd)
322
+    {
323
+        int chunkStart = (m_param->chunkStart - m_lastKeyframe - 1);
324
+        int chunkEnd = (m_param->chunkEnd - m_lastKeyframe);
325
+        if ((chunkStart > 0) && (chunkStart < m_param->keyframeMax))
326
+            keylimit = chunkStart;
327
+        else if ((chunkEnd > 0) && (chunkEnd < m_param->keyframeMax))
328
+            keylimit = chunkEnd;
329
+    }
330
+
331
+    int keyFrameLimit = keylimit + m_lastKeyframe - frames[0]->frameNum - 1;
332
     if (m_param->gopLookahead && keyFrameLimit <= m_param->bframes + 1)
333
         keyintLimit = keyFrameLimit + m_param->gopLookahead;
334
     else
335
@@ -1496,6 +1519,7 @@
336
     int numBFrames = 0;
337
     int numAnalyzed = numFrames;
338
     bool isScenecut = scenecut(frames, 0, 1, true, origNumFrames);
339
+
340
     /* When scenecut threshold is set, use scenecut detection for I frame placements */
341
     if (m_param->scenecutThreshold && isScenecut)
342
     {
343
@@ -1603,14 +1627,28 @@
344
             frames[numFrames]->sliceType = X265_TYPE_P;
345
         }
346
 
347
-        /* Check scenecut on the first minigop. */
348
-        for (int j = 1; j < numBFrames + 1; j++)
349
+        bool bForceRADL = m_param->radl && !m_param->bOpenGOP;
350
+        bool bLastMiniGop = (framecnt >= m_param->bframes + 1) ? false : true;
351
+        int preRADL = m_lastKeyframe + m_param->keyframeMax - m_param->radl - 1; /*Frame preceeding RADL in POC order*/
352
+        if (bForceRADL && (frames[0]->frameNum == preRADL) && !bLastMiniGop)
353
+        {
354
+            int j = 1;
355
+            numBFrames = m_param->radl;
356
+            for (; j <= m_param->radl; j++)
357
+                frames[j]->sliceType = X265_TYPE_B;
358
+            frames[j]->sliceType = X265_TYPE_I;
359
+        }
360
+        else /* Check scenecut and RADL on the first minigop. */
361
         {
362
-            if (scenecut(frames, j, j + 1, false, origNumFrames))
363
+            for (int j = 1; j < numBFrames + 1; j++)
364
             {
365
-                frames[j]->sliceType = X265_TYPE_P;
366
-                numAnalyzed = j;
367
-                break;
368
+                if (scenecut(frames, j, j + 1, false, origNumFrames) || 
369
+                    (bForceRADL && (frames[j]->frameNum == preRADL)))
370
+                {
371
+                    frames[j]->sliceType = X265_TYPE_P;
372
+                    numAnalyzed = j;
373
+                    break;
374
+                }
375
             }
376
         }
377
         resetStart = bKeyframe ? 1 : X265_MIN(numBFrames + 2, numAnalyzed + 1);
378
@@ -2513,19 +2551,16 @@
379
         intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
380
         pixel *src0 = fref0->lowresMC(pelOffset, fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0);
381
         pixel *src1 = fref1->lowresMC(pelOffset, fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1);
382
-
383
         ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
384
-        primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
385
+        primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
386
         int bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
387
         COPY2_IF_LT(bcost, bicost, listused, 3);
388
-
389
         /* coloc candidate */
390
         src0 = fref0->lowresPlane[0] + pelOffset;
391
         src1 = fref1->lowresPlane[0] + pelOffset;
392
-        primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, fref0->lumaStride, src1, fref1->lumaStride, 32);
393
+        primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, X265_LOWRES_CU_SIZE, src0, fref0->lumaStride, src1, fref1->lumaStride, 32);
394
         bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
395
         COPY2_IF_LT(bcost, bicost, listused, 3);
396
-
397
         bcost += lowresPenalty;
398
     }
399
     else /* P, also consider intra */
400
x265_2.7.tar.gz/source/encoder/weightPrediction.cpp -> x265_2.9.tar.gz/source/encoder/weightPrediction.cpp Changed
56
 
1
@@ -184,8 +184,7 @@
2
         int denom = w->log2WeightDenom;
3
         int round = denom ? 1 << (denom - 1) : 0;
4
         int correction = IF_INTERNAL_PREC - X265_DEPTH; /* intermediate interpolation depth */
5
-        int pwidth = ((width + 15) >> 4) << 4;
6
-
7
+        int pwidth = ((width + 31) >> 5) << 5;
8
         primitives.weight_pp(ref, weightTemp, stride, pwidth, height,
9
                              weight, round << correction, denom + correction, offset);
10
         ref = weightTemp;
11
@@ -294,7 +293,7 @@
12
         for (int plane = 0; plane < (param.internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
13
         {
14
             denom = plane ? chromaDenom : lumaDenom;
15
-            if (plane && !weights[0].bPresentFlag)
16
+            if (plane && !weights[0].wtPresent)
17
                 break;
18
 
19
             /* Early termination */
20
@@ -477,12 +476,12 @@
21
             }
22
         }
23
 
24
-        if (weights[0].bPresentFlag)
25
+        if (weights[0].wtPresent)
26
         {
27
             // Make sure both chroma channels match
28
-            if (weights[1].bPresentFlag != weights[2].bPresentFlag)
29
+            if (weights[1].wtPresent != weights[2].wtPresent)
30
             {
31
-                if (weights[1].bPresentFlag)
32
+                if (weights[1].wtPresent)
33
                     weights[2] = weights[1];
34
                 else
35
                     weights[1] = weights[2];
36
@@ -516,15 +515,15 @@
37
         for (int list = 0; list < numPredDir; list++)
38
         {
39
             WeightParam* w = &wp[list][0][0];
40
-            if (w[0].bPresentFlag || w[1].bPresentFlag || w[2].bPresentFlag)
41
+            if (w[0].wtPresent || w[1].wtPresent || w[2].wtPresent)
42
             {
43
                 bWeighted = true;
44
                 p += sprintf(buf + p, " [L%d:R0 ", list);
45
-                if (w[0].bPresentFlag)
46
+                if (w[0].wtPresent)
47
                     p += sprintf(buf + p, "Y{%d/%d%+d}", w[0].inputWeight, 1 << w[0].log2WeightDenom, w[0].inputOffset);
48
-                if (w[1].bPresentFlag)
49
+                if (w[1].wtPresent)
50
                     p += sprintf(buf + p, "U{%d/%d%+d}", w[1].inputWeight, 1 << w[1].log2WeightDenom, w[1].inputOffset);
51
-                if (w[2].bPresentFlag)
52
+                if (w[2].wtPresent)
53
                     p += sprintf(buf + p, "V{%d/%d%+d}", w[2].inputWeight, 1 << w[2].log2WeightDenom, w[2].inputOffset);
54
                 p += sprintf(buf + p, "]");
55
             }
56
x265_2.7.tar.gz/source/test/ipfilterharness.cpp -> x265_2.9.tar.gz/source/test/ipfilterharness.cpp Changed
268
 
1
@@ -489,6 +489,26 @@
2
     return true;
3
 }
4
 
5
+bool IPFilterHarness::check_IPFilterLumaP2S_aligned_primitive(filter_p2s_t ref, filter_p2s_t opt)
6
+{
7
+    for (int i = 0; i < TEST_CASES; i++)
8
+    {
9
+        int index = i % TEST_CASES;
10
+        intptr_t rand_srcStride[] = { 128, 192, 256, 512 };
11
+        intptr_t dstStride[] = { 192, 256, 512, 576 };
12
+        for (int p = 0; p < 4; p++)
13
+        {
14
+            ref(pixel_test_buff[index], rand_srcStride[p], IPF_C_output_s, dstStride[p]);
15
+            checked(opt, pixel_test_buff[index] + (64 * i), rand_srcStride[p], IPF_vec_output_s, dstStride[p]);
16
+            if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t)))
17
+                return false;
18
+        }
19
+        reportfail();
20
+    }
21
+
22
+    return true;
23
+}
24
+
25
 bool IPFilterHarness::check_IPFilterChromaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt)
26
 {
27
     for (int i = 0; i < ITERS; i++)
28
@@ -510,6 +530,29 @@
29
     return true;
30
 }
31
 
32
+bool IPFilterHarness::check_IPFilterChromaP2S_aligned_primitive(filter_p2s_t ref, filter_p2s_t opt)
33
+{
34
+    for (int i = 0; i < TEST_CASES; i++)
35
+    {
36
+        int index = i % TEST_CASES;
37
+        intptr_t rand_srcStride[] = { 128, 192, 256, 512};
38
+        intptr_t dstStride[] = { 192, 256, 512, 576 };
39
+
40
+        for (int p = 0; p < 4; p++)
41
+        {
42
+            ref(pixel_test_buff[index], rand_srcStride[p], IPF_C_output_s, dstStride[p]);
43
+
44
+            checked(opt, pixel_test_buff[index], rand_srcStride[p], IPF_vec_output_s, dstStride[p]);
45
+
46
+            if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t)))
47
+                return false;
48
+        }
49
+        reportfail();
50
+    }
51
+
52
+    return true;
53
+}
54
+
55
 bool IPFilterHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
56
 {
57
 
58
@@ -571,14 +614,22 @@
59
                 return false;
60
             }
61
         }
62
-        if (opt.pu[value].convert_p2s)
63
+        if (opt.pu[value].convert_p2s[NONALIGNED])
64
         {
65
-            if (!check_IPFilterLumaP2S_primitive(ref.pu[value].convert_p2s, opt.pu[value].convert_p2s))
66
+            if (!check_IPFilterLumaP2S_primitive(ref.pu[value].convert_p2s[NONALIGNED], opt.pu[value].convert_p2s[NONALIGNED]))
67
             {
68
                 printf("convert_p2s[%s]", lumaPartStr[value]);
69
                 return false;
70
             }
71
         }
72
+        if (opt.pu[value].convert_p2s[ALIGNED])
73
+        {
74
+            if (!check_IPFilterLumaP2S_aligned_primitive(ref.pu[value].convert_p2s[ALIGNED], opt.pu[value].convert_p2s[ALIGNED]))
75
+            {
76
+                printf("convert_p2s_aligned[%s]", lumaPartStr[value]);
77
+                return false;
78
+            }
79
+        }
80
     }
81
 
82
     for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++)
83
@@ -633,9 +684,17 @@
84
                     return false;
85
                 }
86
             }
87
-            if (opt.chroma[csp].pu[value].p2s)
88
+            if (opt.chroma[csp].pu[value].p2s[ALIGNED])
89
+            {
90
+                if (!check_IPFilterChromaP2S_aligned_primitive(ref.chroma[csp].pu[value].p2s[ALIGNED], opt.chroma[csp].pu[value].p2s[ALIGNED]))
91
+                {
92
+                    printf("chroma_p2s_aligned[%s]", chromaPartStr[csp][value]);
93
+                    return false;
94
+                }
95
+            }
96
+            if (opt.chroma[csp].pu[value].p2s[NONALIGNED])
97
             {
98
-                if (!check_IPFilterChromaP2S_primitive(ref.chroma[csp].pu[value].p2s, opt.chroma[csp].pu[value].p2s))
99
+                if (!check_IPFilterChromaP2S_primitive(ref.chroma[csp].pu[value].p2s[NONALIGNED], opt.chroma[csp].pu[value].p2s[NONALIGNED]))
100
                 {
101
                     printf("chroma_p2s[%s]", chromaPartStr[csp][value]);
102
                     return false;
103
@@ -649,8 +708,8 @@
104
 
105
 void IPFilterHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
106
 {
107
-    int16_t srcStride = 96;
108
-    int16_t dstStride = 96;
109
+    int16_t srcStride = 192;  /* Multiple of 64 */
110
+    int16_t dstStride = 192;
111
     int maxVerticalfilterHalfDistance = 3;
112
 
113
     for (int value = 0; value < NUM_PU_SIZES; value++)
114
@@ -659,62 +718,70 @@
115
         {
116
             printf("luma_hpp[%s]\t", lumaPartStr[value]);
117
             REPORT_SPEEDUP(opt.pu[value].luma_hpp, ref.pu[value].luma_hpp,
118
-                           pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
119
+                pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
120
         }
121
 
122
         if (opt.pu[value].luma_hps)
123
         {
124
             printf("luma_hps[%s]\t", lumaPartStr[value]);
125
             REPORT_SPEEDUP(opt.pu[value].luma_hps, ref.pu[value].luma_hps,
126
-                           pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
127
-                           IPF_vec_output_s, dstStride, 1, 1);
128
+                pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
129
+                IPF_vec_output_s, dstStride, 1, 1);
130
         }
131
 
132
         if (opt.pu[value].luma_vpp)
133
         {
134
             printf("luma_vpp[%s]\t", lumaPartStr[value]);
135
             REPORT_SPEEDUP(opt.pu[value].luma_vpp, ref.pu[value].luma_vpp,
136
-                           pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
137
-                           IPF_vec_output_p, dstStride, 1);
138
+                pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
139
+                IPF_vec_output_p, dstStride, 1);
140
         }
141
 
142
         if (opt.pu[value].luma_vps)
143
         {
144
             printf("luma_vps[%s]\t", lumaPartStr[value]);
145
             REPORT_SPEEDUP(opt.pu[value].luma_vps, ref.pu[value].luma_vps,
146
-                           pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
147
-                           IPF_vec_output_s, dstStride, 1);
148
+                pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
149
+                IPF_vec_output_s, dstStride, 1);
150
         }
151
 
152
         if (opt.pu[value].luma_vsp)
153
         {
154
             printf("luma_vsp[%s]\t", lumaPartStr[value]);
155
             REPORT_SPEEDUP(opt.pu[value].luma_vsp, ref.pu[value].luma_vsp,
156
-                           short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
157
-                           IPF_vec_output_p, dstStride, 1);
158
+                short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
159
+                IPF_vec_output_p, dstStride, 1);
160
         }
161
 
162
         if (opt.pu[value].luma_vss)
163
         {
164
             printf("luma_vss[%s]\t", lumaPartStr[value]);
165
             REPORT_SPEEDUP(opt.pu[value].luma_vss, ref.pu[value].luma_vss,
166
-                           short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
167
-                           IPF_vec_output_s, dstStride, 1);
168
+                short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
169
+                IPF_vec_output_s, dstStride, 1);
170
         }
171
 
172
         if (opt.pu[value].luma_hvpp)
173
         {
174
             printf("luma_hv [%s]\t", lumaPartStr[value]);
175
             REPORT_SPEEDUP(opt.pu[value].luma_hvpp, ref.pu[value].luma_hvpp,
176
-                           pixel_buff + 3 * srcStride, srcStride, IPF_vec_output_p, srcStride, 1, 3);
177
+                pixel_buff + 3 * srcStride, srcStride, IPF_vec_output_p, srcStride, 1, 3);
178
         }
179
 
180
-        if (opt.pu[value].convert_p2s)
181
+        if (opt.pu[value].convert_p2s[NONALIGNED])
182
         {
183
             printf("convert_p2s[%s]\t", lumaPartStr[value]);
184
-            REPORT_SPEEDUP(opt.pu[value].convert_p2s, ref.pu[value].convert_p2s,
185
-                               pixel_buff, srcStride,
186
-                               IPF_vec_output_s, dstStride);
187
+            REPORT_SPEEDUP(opt.pu[value].convert_p2s[NONALIGNED], ref.pu[value].convert_p2s[NONALIGNED],
188
+                pixel_buff, srcStride,
189
+                IPF_vec_output_s, dstStride);
190
+        }
191
+
192
+        if (opt.pu[value].convert_p2s[ALIGNED])
193
+        {
194
+            printf("convert_p2s_aligned[%s]\t", lumaPartStr[value]);
195
+            REPORT_SPEEDUP(opt.pu[value].convert_p2s[ALIGNED], ref.pu[value].convert_p2s[ALIGNED],
196
+                pixel_buff, srcStride,
197
+                IPF_vec_output_s, dstStride);
198
         }
199
     }
200
 
201
@@ -727,47 +794,53 @@
202
             {
203
                 printf("chroma_hpp[%s]", chromaPartStr[csp][value]);
204
                 REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_hpp, ref.chroma[csp].pu[value].filter_hpp,
205
-                               pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
206
+                    pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
207
             }
208
             if (opt.chroma[csp].pu[value].filter_hps)
209
             {
210
                 printf("chroma_hps[%s]", chromaPartStr[csp][value]);
211
                 REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_hps, ref.chroma[csp].pu[value].filter_hps,
212
-                               pixel_buff + srcStride, srcStride, IPF_vec_output_s, dstStride, 1, 1);
213
+                    pixel_buff + srcStride, srcStride, IPF_vec_output_s, dstStride, 1, 1);
214
             }
215
             if (opt.chroma[csp].pu[value].filter_vpp)
216
             {
217
                 printf("chroma_vpp[%s]", chromaPartStr[csp][value]);
218
                 REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_vpp, ref.chroma[csp].pu[value].filter_vpp,
219
-                               pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
220
-                               IPF_vec_output_p, dstStride, 1);
221
+                    pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
222
+                    IPF_vec_output_p, dstStride, 1);
223
             }
224
             if (opt.chroma[csp].pu[value].filter_vps)
225
             {
226
                 printf("chroma_vps[%s]", chromaPartStr[csp][value]);
227
                 REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_vps, ref.chroma[csp].pu[value].filter_vps,
228
-                               pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
229
-                               IPF_vec_output_s, dstStride, 1);
230
+                    pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
231
+                    IPF_vec_output_s, dstStride, 1);
232
             }
233
             if (opt.chroma[csp].pu[value].filter_vsp)
234
             {
235
                 printf("chroma_vsp[%s]", chromaPartStr[csp][value]);
236
                 REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_vsp, ref.chroma[csp].pu[value].filter_vsp,
237
-                               short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
238
-                               IPF_vec_output_p, dstStride, 1);
239
+                    short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
240
+                    IPF_vec_output_p, dstStride, 1);
241
             }
242
             if (opt.chroma[csp].pu[value].filter_vss)
243
             {
244
                 printf("chroma_vss[%s]", chromaPartStr[csp][value]);
245
                 REPORT_SPEEDUP(opt.chroma[csp].pu[value].filter_vss, ref.chroma[csp].pu[value].filter_vss,
246
-                               short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
247
-                               IPF_vec_output_s, dstStride, 1);
248
+                    short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
249
+                    IPF_vec_output_s, dstStride, 1);
250
             }
251
-            if (opt.chroma[csp].pu[value].p2s)
252
+            if (opt.chroma[csp].pu[value].p2s[NONALIGNED])
253
             {
254
                 printf("chroma_p2s[%s]\t", chromaPartStr[csp][value]);
255
-                REPORT_SPEEDUP(opt.chroma[csp].pu[value].p2s, ref.chroma[csp].pu[value].p2s,
256
-                               pixel_buff, srcStride, IPF_vec_output_s, dstStride);
257
+                REPORT_SPEEDUP(opt.chroma[csp].pu[value].p2s[NONALIGNED], ref.chroma[csp].pu[value].p2s[NONALIGNED],
258
+                    pixel_buff, srcStride, IPF_vec_output_s, dstStride);
259
+            }
260
+            if (opt.chroma[csp].pu[value].p2s[ALIGNED])
261
+            {
262
+                printf("chroma_p2s_aligned[%s]\t", chromaPartStr[csp][value]);
263
+                REPORT_SPEEDUP(opt.chroma[csp].pu[value].p2s[ALIGNED], ref.chroma[csp].pu[value].p2s[ALIGNED],
264
+                    pixel_buff, srcStride, IPF_vec_output_s, dstStride);
265
             }
266
         }
267
     }
268
x265_2.7.tar.gz/source/test/ipfilterharness.h -> x265_2.9.tar.gz/source/test/ipfilterharness.h Changed
35
 
1
@@ -40,15 +40,15 @@
2
     enum { TEST_CASES = 3 };
3
     enum { SMAX = 1 << 12 };
4
     enum { SMIN = (unsigned)-1 << 12 };
5
-    ALIGN_VAR_32(pixel, pixel_buff[TEST_BUF_SIZE]);
6
-    int16_t short_buff[TEST_BUF_SIZE];
7
-    int16_t IPF_vec_output_s[TEST_BUF_SIZE];
8
-    int16_t IPF_C_output_s[TEST_BUF_SIZE];
9
-    pixel   IPF_vec_output_p[TEST_BUF_SIZE];
10
-    pixel   IPF_C_output_p[TEST_BUF_SIZE];
11
+    ALIGN_VAR_64(pixel, pixel_buff[TEST_BUF_SIZE]);
12
+    ALIGN_VAR_64(int16_t, short_buff[TEST_BUF_SIZE]);
13
+    ALIGN_VAR_64(int16_t, IPF_vec_output_s[TEST_BUF_SIZE]);
14
+    ALIGN_VAR_64(int16_t, IPF_C_output_s[TEST_BUF_SIZE]);
15
+    ALIGN_VAR_64(pixel,   IPF_vec_output_p[TEST_BUF_SIZE]);
16
+    ALIGN_VAR_64(pixel,   IPF_C_output_p[TEST_BUF_SIZE]);
17
 
18
-    pixel   pixel_test_buff[TEST_CASES][TEST_BUF_SIZE];
19
-    int16_t short_test_buff[TEST_CASES][TEST_BUF_SIZE];
20
+    ALIGN_VAR_64(pixel,   pixel_test_buff[TEST_CASES][TEST_BUF_SIZE]);
21
+    ALIGN_VAR_64(int16_t, short_test_buff[TEST_CASES][TEST_BUF_SIZE]);
22
 
23
     bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt);
24
     bool check_IPFilterChroma_ps_primitive(filter_ps_t ref, filter_ps_t opt);
25
@@ -62,7 +62,9 @@
26
     bool check_IPFilterLuma_ss_primitive(filter_ss_t ref, filter_ss_t opt);
27
     bool check_IPFilterLumaHV_primitive(filter_hv_pp_t ref, filter_hv_pp_t opt);
28
     bool check_IPFilterLumaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt);
29
+    bool check_IPFilterLumaP2S_aligned_primitive(filter_p2s_t ref, filter_p2s_t opt);
30
     bool check_IPFilterChromaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt);
31
+    bool check_IPFilterChromaP2S_aligned_primitive(filter_p2s_t ref, filter_p2s_t opt);
32
 
33
 public:
34
 
35
x265_2.7.tar.gz/source/test/mbdstharness.cpp -> x265_2.9.tar.gz/source/test/mbdstharness.cpp Changed
256
 
1
@@ -61,16 +61,17 @@
2
     for (int i = 0; i < TEST_BUF_SIZE; i++)
3
     {
4
         short_test_buff[0][i]    = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
5
+        short_test_buff1[0][i]   = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
6
         int_test_buff[0][i]      = rand() % PIXEL_MAX;
7
         int_idct_test_buff[0][i] = (rand() % (SHORT_MAX - SHORT_MIN)) - SHORT_MAX;
8
         short_denoise_test_buff1[0][i] = short_denoise_test_buff2[0][i] = (rand() & SHORT_MAX) - (rand() & SHORT_MAX);
9
-
10
         short_test_buff[1][i]    = -PIXEL_MAX;
11
+        short_test_buff1[1][i]   = -PIXEL_MAX;
12
         int_test_buff[1][i]      = -PIXEL_MAX;
13
         int_idct_test_buff[1][i] = SHORT_MIN;
14
         short_denoise_test_buff1[1][i] = short_denoise_test_buff2[1][i] = -SHORT_MAX;
15
-
16
         short_test_buff[2][i]    = PIXEL_MAX;
17
+        short_test_buff1[2][i]   = PIXEL_MAX;
18
         int_test_buff[2][i]      = PIXEL_MAX;
19
         int_idct_test_buff[2][i] = SHORT_MAX;
20
         short_denoise_test_buff1[2][i] = short_denoise_test_buff2[2][i] = SHORT_MAX;
21
@@ -252,12 +253,10 @@
22
 bool MBDstHarness::check_nquant_primitive(nquant_t ref, nquant_t opt)
23
 {
24
     int j = 0;
25
-
26
     for (int i = 0; i < ITERS; i++)
27
     {
28
-        int width = (rand() % 4 + 1) * 4;
29
+        int width = 1 << (rand() % 4 + 2);
30
         int height = width;
31
-
32
         uint32_t optReturnValue = 0;
33
         uint32_t refReturnValue = 0;
34
 
35
@@ -281,6 +280,136 @@
36
         reportfail();
37
         j += INCR;
38
     }
39
+    return true;
40
+}
41
+
42
+bool MBDstHarness::check_nonPsyRdoQuant_primitive(nonPsyRdoQuant_t ref, nonPsyRdoQuant_t opt)
43
+{
44
+    int j = 0;
45
+    int trSize[4] = { 16, 64, 256, 1024 };
46
+
47
+    ALIGN_VAR_32(int64_t, ref_dest[4 * MAX_TU_SIZE]);
48
+    ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
49
+
50
+    for (int i = 0; i < ITERS; i++)
51
+    {
52
+        int64_t totalRdCostRef = rand();
53
+        int64_t totalUncodedCostRef = rand();
54
+        int64_t totalRdCostOpt = totalRdCostRef;
55
+        int64_t totalUncodedCostOpt = totalUncodedCostRef;
56
+
57
+        int index = rand() % 4;
58
+        uint32_t blkPos = trSize[index];
59
+        int cmp_size = 4 * MAX_TU_SIZE;
60
+
61
+        memset(ref_dest, 0, MAX_TU_SIZE * sizeof(int64_t));
62
+        memset(opt_dest, 0, MAX_TU_SIZE * sizeof(int64_t));
63
+
64
+        int index1 = rand() % TEST_CASES;
65
+
66
+        ref(short_test_buff[index1] + j, ref_dest, &totalUncodedCostRef, &totalRdCostRef, blkPos);
67
+        checked(opt, short_test_buff[index1] + j, opt_dest, &totalUncodedCostOpt, &totalRdCostOpt, blkPos);
68
+
69
+        if (memcmp(ref_dest, opt_dest, cmp_size))
70
+            return false;
71
+
72
+        if (totalUncodedCostRef != totalUncodedCostOpt)
73
+            return false;
74
+
75
+        if (totalRdCostRef != totalRdCostOpt)
76
+            return false;
77
+
78
+        reportfail();
79
+        j += INCR;
80
+    }
81
+
82
+    return true;
83
+}
84
+bool MBDstHarness::check_psyRdoQuant_primitive(psyRdoQuant_t ref, psyRdoQuant_t opt)
85
+{
86
+    int j = 0;
87
+    int trSize[4] = { 16, 64, 256, 1024 };
88
+
89
+    ALIGN_VAR_32(int64_t, ref_dest[4 * MAX_TU_SIZE]);
90
+    ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
91
+
92
+    for (int i = 0; i < ITERS; i++)
93
+    {
94
+        int64_t totalRdCostRef = rand();
95
+        int64_t totalUncodedCostRef = rand();
96
+        int64_t totalRdCostOpt = totalRdCostRef;
97
+        int64_t totalUncodedCostOpt = totalUncodedCostRef;
98
+        int64_t *psyScale = X265_MALLOC(int64_t, 1);
99
+        *psyScale = rand();
100
+
101
+        int index = rand() % 4;
102
+        uint32_t blkPos = trSize[index];
103
+        int cmp_size = 4 * MAX_TU_SIZE;
104
+
105
+        memset(ref_dest, 0, MAX_TU_SIZE * sizeof(int64_t));
106
+        memset(opt_dest, 0, MAX_TU_SIZE * sizeof(int64_t));
107
+
108
+        int index1 = rand() % TEST_CASES;
109
+
110
+        ref(short_test_buff[index1] + j, short_test_buff1[index1] + j, ref_dest, &totalUncodedCostRef, &totalRdCostRef, psyScale, blkPos);
111
+        checked(opt, short_test_buff[index1] + j, short_test_buff1[index1] + j, opt_dest, &totalUncodedCostOpt, &totalRdCostOpt, psyScale, blkPos);
112
+
113
+        X265_FREE(psyScale);
114
+        if (memcmp(ref_dest, opt_dest, cmp_size))
115
+            return false;
116
+
117
+        if (totalUncodedCostRef != totalUncodedCostOpt)
118
+            return false;
119
+
120
+        if (totalRdCostRef != totalRdCostOpt)
121
+            return false;
122
+
123
+        reportfail();
124
+        j += INCR;
125
+    }
126
+
127
+    return true;
128
+}
129
+bool MBDstHarness::check_psyRdoQuant_primitive_avx2(psyRdoQuant_t1 ref, psyRdoQuant_t1 opt)
130
+{
131
+    int j = 0;
132
+    int trSize[4] = { 16, 64, 256, 1024 };
133
+
134
+    ALIGN_VAR_32(int64_t, ref_dest[4 * MAX_TU_SIZE]);
135
+    ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
136
+
137
+    for (int i = 0; i < ITERS; i++)
138
+    {
139
+        int64_t totalRdCostRef = rand();
140
+        int64_t totalUncodedCostRef = rand();
141
+        int64_t totalRdCostOpt = totalRdCostRef;
142
+        int64_t totalUncodedCostOpt = totalUncodedCostRef;
143
+
144
+        int index = rand() % 4;
145
+        uint32_t blkPos =  trSize[index];
146
+        int cmp_size = 4 * MAX_TU_SIZE;
147
+
148
+        memset(ref_dest, 0, MAX_TU_SIZE * sizeof(int64_t));
149
+        memset(opt_dest, 0, MAX_TU_SIZE * sizeof(int64_t));
150
+
151
+        int index1 = rand() % TEST_CASES;
152
+
153
+        ref(short_test_buff[index1] + j, ref_dest, &totalUncodedCostRef, &totalRdCostRef, blkPos);
154
+        checked(opt, short_test_buff[index1] + j, opt_dest, &totalUncodedCostOpt, &totalRdCostOpt, blkPos);
155
+
156
+        
157
+        if (memcmp(ref_dest, opt_dest, cmp_size))
158
+            return false;
159
+
160
+        if (totalUncodedCostRef != totalUncodedCostOpt)
161
+            return false;
162
+
163
+        if (totalRdCostRef != totalRdCostOpt)
164
+            return false;
165
+
166
+        reportfail();
167
+        j += INCR;
168
+    }
169
 
170
     return true;
171
 }
172
@@ -420,6 +549,40 @@
173
             return false;
174
         }
175
     }
176
+
177
+    for (int i = 0; i < NUM_TR_SIZE; i++)
178
+    {
179
+        if (opt.cu[i].nonPsyRdoQuant)
180
+        {
181
+            if (!check_nonPsyRdoQuant_primitive(ref.cu[i].nonPsyRdoQuant, opt.cu[i].nonPsyRdoQuant))
182
+            {
183
+                printf("nonPsyRdoQuant[%dx%d]: Failed!\n", 4 << i, 4 << i);
184
+                return false;
185
+            }
186
+        }
187
+    }
188
+    for (int i = 0; i < NUM_TR_SIZE; i++)
189
+    {
190
+        if (opt.cu[i].psyRdoQuant)
191
+        {
192
+            if (!check_psyRdoQuant_primitive(ref.cu[i].psyRdoQuant, opt.cu[i].psyRdoQuant))
193
+            {
194
+                printf("psyRdoQuant[%dx%d]: Failed!\n", 4 << i, 4 << i);
195
+                return false;
196
+            }
197
+        }
198
+    }
199
+    for (int i = 0; i < NUM_TR_SIZE; i++)
200
+    {
201
+        if (opt.cu[i].psyRdoQuant_1p)
202
+        {
203
+            if (!check_psyRdoQuant_primitive_avx2(ref.cu[i].psyRdoQuant_1p, opt.cu[i].psyRdoQuant_1p))
204
+            {
205
+                printf("psyRdoQuant_1p[%dx%d]: Failed!\n", 4 << i, 4 << i);
206
+                return false;
207
+            }
208
+        }
209
+    }
210
     for (int i = 0; i < NUM_TR_SIZE; i++)
211
     {
212
         if (opt.cu[i].count_nonzero)
213
@@ -507,6 +670,42 @@
214
         printf("nquant\t\t");
215
         REPORT_SPEEDUP(opt.nquant, ref.nquant, short_test_buff[0], int_test_buff[1], mshortbuf2, 23, 23785, 32 * 32);
216
     }
217
+
218
+    for (int value = 0; value < NUM_TR_SIZE; value++)
219
+    {
220
+        if (opt.cu[value].nonPsyRdoQuant)
221
+        {
222
+            ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
223
+            int64_t totalRdCost = 0;
224
+            int64_t totalUncodedCost = 0;
225
+            printf("nonPsyRdoQuant[%dx%d]", 4 << value, 4 << value);
226
+            REPORT_SPEEDUP(opt.cu[value].nonPsyRdoQuant, ref.cu[value].nonPsyRdoQuant, short_test_buff[0], opt_dest, &totalUncodedCost, &totalRdCost, 0);
227
+        }
228
+    }
229
+    for (int value = 0; value < NUM_TR_SIZE; value++)
230
+    {
231
+        if (opt.cu[value].psyRdoQuant)
232
+        {
233
+            ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
234
+            int64_t totalRdCost = 0;
235
+            int64_t totalUncodedCost = 0;
236
+            int64_t *psyScale = X265_MALLOC(int64_t, 1);
237
+            *psyScale = 0;
238
+            printf("psyRdoQuant[%dx%d]", 4 << value, 4 << value);
239
+            REPORT_SPEEDUP(opt.cu[value].psyRdoQuant, ref.cu[value].psyRdoQuant, short_test_buff[0], short_test_buff1[0], opt_dest, &totalUncodedCost, &totalRdCost, psyScale, 0);
240
+        }
241
+    }
242
+    for (int value = 0; value < NUM_TR_SIZE; value++)
243
+    {
244
+        if (opt.cu[value].psyRdoQuant_1p)
245
+        {
246
+            ALIGN_VAR_32(int64_t, opt_dest[4 * MAX_TU_SIZE]);
247
+            int64_t totalRdCost = 0;
248
+            int64_t totalUncodedCost = 0;
249
+            printf("psyRdoQuant_1p[%dx%d]", 4 << value, 4 << value);
250
+            REPORT_SPEEDUP(opt.cu[value].psyRdoQuant_1p, ref.cu[value].psyRdoQuant_1p, short_test_buff[0], opt_dest, &totalUncodedCost, &totalRdCost, 0);
251
+        }
252
+    }
253
     for (int value = 0; value < NUM_TR_SIZE; value++)
254
     {
255
         if (opt.cu[value].count_nonzero)
256
x265_2.7.tar.gz/source/test/mbdstharness.h -> x265_2.9.tar.gz/source/test/mbdstharness.h Changed
32
 
1
@@ -51,26 +51,27 @@
2
     int     mintbuf2[MAX_TU_SIZE];
3
     int     mintbuf3[MAX_TU_SIZE];
4
     int     mintbuf4[MAX_TU_SIZE];
5
-
6
     int16_t short_test_buff[TEST_CASES][TEST_BUF_SIZE];
7
+    int16_t short_test_buff1[TEST_CASES][TEST_BUF_SIZE];
8
     int     int_test_buff[TEST_CASES][TEST_BUF_SIZE];
9
     int     int_idct_test_buff[TEST_CASES][TEST_BUF_SIZE];
10
-
11
     uint32_t mubuf1[MAX_TU_SIZE];
12
     uint32_t mubuf2[MAX_TU_SIZE];
13
     uint16_t mushortbuf1[MAX_TU_SIZE];
14
 
15
     int16_t short_denoise_test_buff1[TEST_CASES][TEST_BUF_SIZE];
16
     int16_t short_denoise_test_buff2[TEST_CASES][TEST_BUF_SIZE];
17
-
18
     bool check_dequant_primitive(dequant_scaling_t ref, dequant_scaling_t opt);
19
     bool check_dequant_primitive(dequant_normal_t ref, dequant_normal_t opt);
20
+    bool check_nonPsyRdoQuant_primitive(nonPsyRdoQuant_t ref, nonPsyRdoQuant_t opt);
21
+    bool check_psyRdoQuant_primitive(psyRdoQuant_t ref, psyRdoQuant_t opt);
22
     bool check_quant_primitive(quant_t ref, quant_t opt);
23
     bool check_nquant_primitive(nquant_t ref, nquant_t opt);
24
     bool check_dct_primitive(dct_t ref, dct_t opt, intptr_t width);
25
     bool check_idct_primitive(idct_t ref, idct_t opt, intptr_t width);
26
     bool check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt);
27
     bool check_denoise_dct_primitive(denoiseDct_t ref, denoiseDct_t opt);
28
+    bool check_psyRdoQuant_primitive_avx2(psyRdoQuant_t1 ref, psyRdoQuant_t1 opt);
29
 
30
 public:
31
 
32
x265_2.7.tar.gz/source/test/pixelharness.cpp -> x265_2.9.tar.gz/source/test/pixelharness.cpp Changed
775
 
1
@@ -226,6 +226,31 @@
2
     return true;
3
 }
4
 
5
+bool PixelHarness::check_calresidual_aligned(calcresidual_t ref, calcresidual_t opt)
6
+{
7
+    ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
8
+    ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
9
+    memset(ref_dest, 0, 64 * 64 * sizeof(int16_t));
10
+    memset(opt_dest, 0, 64 * 64 * sizeof(int16_t));
11
+
12
+    int j = 0;
13
+    intptr_t stride = STRIDE;
14
+    for (int i = 0; i < ITERS; i++)
15
+    {
16
+        int index = i % TEST_CASES;
17
+        checked(opt, pbuf1 + j, pixel_test_buff[index] + j, opt_dest, stride);
18
+        ref(pbuf1 + j, pixel_test_buff[index] + j, ref_dest, stride);
19
+
20
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
21
+            return false;
22
+
23
+        reportfail();
24
+        j += INCR;
25
+    }
26
+
27
+    return true;
28
+}
29
+
30
 bool PixelHarness::check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt)
31
 {
32
     int j = 0;
33
@@ -242,10 +267,27 @@
34
         reportfail();
35
         j += INCR;
36
     }
37
-
38
     return true;
39
 }
40
+bool PixelHarness::check_ssd_s_aligned(pixel_ssd_s_t ref, pixel_ssd_s_t opt)
41
+{
42
+    int j = 0;
43
+    for (int i = 0; i < ITERS; i++)
44
+    {
45
+               // NOTE: stride must be multiple of 16, because minimum block is 4x4
46
+        int stride = STRIDE;
47
+        sse_t cres = ref(sbuf1 + j, stride);
48
+        sse_t vres = (sse_t)checked(opt, sbuf1 + j, (intptr_t)stride);
49
+
50
+        if (cres != vres)
51
+            return false;
52
+
53
+        reportfail();
54
+        j += INCR+32;
55
+    }
56
 
57
+    return true;
58
+}
59
 bool PixelHarness::check_weightp(weightp_sp_t ref, weightp_sp_t opt)
60
 {
61
     ALIGN_VAR_16(pixel, ref_dest[64 * (64 + 1)]);
62
@@ -290,7 +332,11 @@
63
     memset(ref_dest, 0, 64 * 64 * sizeof(pixel));
64
     memset(opt_dest, 0, 64 * 64 * sizeof(pixel));
65
     int j = 0;
66
+    bool enableavx512 = true;
67
     int width = 16 * (rand() % 4 + 1);
68
+    int cpuid = X265_NS::cpu_detect(enableavx512);
69
+    if (cpuid & X265_CPU_AVX512)
70
+        width = 32 * (rand() % 2 + 1);
71
     int height = 8;
72
     int w0 = rand() % 128;
73
     int shift = rand() % 8; // maximum is 7, see setFromWeightAndOffset()
74
@@ -441,12 +487,10 @@
75
 
76
     return true;
77
 }
78
-
79
 bool PixelHarness::check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt)
80
 {
81
-    ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
82
-    ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
83
-
84
+    ALIGN_VAR_64(int16_t, ref_dest[64 * 64]);
85
+    ALIGN_VAR_64(int16_t, opt_dest[64 * 64]);
86
     memset(ref_dest, 0xCD, sizeof(ref_dest));
87
     memset(opt_dest, 0xCD, sizeof(opt_dest));
88
 
89
@@ -469,6 +513,33 @@
90
 
91
     return true;
92
 }
93
+bool PixelHarness::check_cpy1Dto2D_shl_aligned_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt)
94
+{
95
+    ALIGN_VAR_64(int16_t, ref_dest[64 * 64]);
96
+    ALIGN_VAR_64(int16_t, opt_dest[64 * 64]);
97
+
98
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
99
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
100
+
101
+    int j = 0;
102
+    intptr_t stride = STRIDE;
103
+    for (int i = 0; i < ITERS; i++)
104
+    {
105
+        int shift = (rand() % 7 + 1);
106
+
107
+        int index = i % TEST_CASES;
108
+        checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
109
+        ref(ref_dest, short_test_buff[index] + j, stride, shift);
110
+
111
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
112
+            return false;
113
+
114
+        reportfail();
115
+        j += INCR + 32;
116
+    }
117
+
118
+    return true;
119
+}
120
 
121
 bool PixelHarness::check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt)
122
 {
123
@@ -497,11 +568,37 @@
124
 
125
     return true;
126
 }
127
-
128
 bool PixelHarness::check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt)
129
 {
130
-    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
131
-    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
132
+    ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
133
+    ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
134
+    int j = 0;
135
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
136
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
137
+
138
+    intptr_t stride = STRIDE;
139
+    for (int i = 0; i < ITERS; i++)
140
+    {
141
+        int index1 = rand() % TEST_CASES;
142
+        int index2 = rand() % TEST_CASES;
143
+        checked(ref, ref_dest, stride, pixel_test_buff[index1] + j,
144
+                stride, pixel_test_buff[index2] + j, stride, 32);
145
+        opt(opt_dest, stride, pixel_test_buff[index1] + j,
146
+            stride, pixel_test_buff[index2] + j, stride, 32);
147
+
148
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
149
+            return false;
150
+
151
+        reportfail();
152
+        j += INCR;
153
+    }
154
+
155
+    return true;
156
+}
157
+bool PixelHarness::check_pixelavg_pp_aligned(pixelavg_pp_t ref, pixelavg_pp_t opt)
158
+{
159
+    ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
160
+    ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
161
 
162
     int j = 0;
163
 
164
@@ -522,7 +619,7 @@
165
             return false;
166
 
167
         reportfail();
168
-        j += INCR;
169
+        j += INCR + 32;
170
     }
171
 
172
     return true;
173
@@ -642,8 +739,33 @@
174
 
175
 bool PixelHarness::check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt)
176
 {
177
-    ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
178
-    ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
179
+    ALIGN_VAR_64(int16_t, ref_dest[64 * 64]);
180
+    ALIGN_VAR_64(int16_t, opt_dest[64 * 64]);
181
+
182
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
183
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
184
+
185
+    intptr_t stride = 64;
186
+    for (int i = 0; i < ITERS; i++)
187
+    {
188
+        int16_t value = (rand() % SHORT_MAX) + 1;
189
+
190
+        checked(opt, opt_dest, stride, value);
191
+        ref(ref_dest, stride, value);
192
+
193
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
194
+            return false;
195
+
196
+        reportfail();
197
+    }
198
+
199
+    return true;
200
+}
201
+
202
+bool PixelHarness::check_blockfill_s_aligned(blockfill_s_t ref, blockfill_s_t opt)
203
+{
204
+    ALIGN_VAR_64(int16_t, ref_dest[64 * 64]);
205
+    ALIGN_VAR_64(int16_t, opt_dest[64 * 64]);
206
 
207
     memset(ref_dest, 0xCD, sizeof(ref_dest));
208
     memset(opt_dest, 0xCD, sizeof(opt_dest));
209
@@ -696,8 +818,8 @@
210
 
211
 bool PixelHarness::check_scale1D_pp(scale1D_t ref, scale1D_t opt)
212
 {
213
-    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
214
-    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
215
+    ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
216
+    ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
217
 
218
     memset(ref_dest, 0, sizeof(ref_dest));
219
     memset(opt_dest, 0, sizeof(opt_dest));
220
@@ -719,6 +841,31 @@
221
     return true;
222
 }
223
 
224
+bool PixelHarness::check_scale1D_pp_aligned(scale1D_t ref, scale1D_t opt)
225
+{
226
+    ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
227
+    ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
228
+
229
+    memset(ref_dest, 0, sizeof(ref_dest));
230
+    memset(opt_dest, 0, sizeof(opt_dest));
231
+
232
+    int j = 0;
233
+    for (int i = 0; i < ITERS; i++)
234
+    {
235
+        int index = i % TEST_CASES;
236
+        checked(opt, opt_dest, pixel_test_buff[index] + j);
237
+        ref(ref_dest, pixel_test_buff[index] + j);
238
+
239
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
240
+            return false;
241
+
242
+        reportfail();
243
+        j += INCR * 2;
244
+    }
245
+
246
+    return true;
247
+}
248
+
249
 bool PixelHarness::check_scale2D_pp(scale2D_t ref, scale2D_t opt)
250
 {
251
     ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
252
@@ -798,6 +945,31 @@
253
     return true;
254
 }
255
 
256
+bool PixelHarness::check_pixel_add_ps_aligned(pixel_add_ps_t ref, pixel_add_ps_t opt)
257
+{
258
+    ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
259
+    ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
260
+
261
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
262
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
263
+
264
+    int j = 0;
265
+    intptr_t stride2 = 64, stride = STRIDE;
266
+    for (int i = 0; i < ITERS; i++)
267
+    {
268
+        int index1 = rand() % TEST_CASES;
269
+        int index2 = rand() % TEST_CASES;
270
+        checked(opt, opt_dest, stride2, pixel_test_buff[index1] + j, short_test_buff[index2] + j, stride, stride);
271
+        ref(ref_dest, stride2, pixel_test_buff[index1] + j, short_test_buff[index2] + j, stride, stride);
272
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
273
+            return false;
274
+
275
+        reportfail();
276
+        j += 2 * INCR;
277
+    }
278
+    return true;
279
+}
280
+
281
 bool PixelHarness::check_pixel_var(var_t ref, var_t opt)
282
 {
283
     int j = 0;
284
@@ -870,8 +1042,8 @@
285
 
286
 bool PixelHarness::check_addAvg(addAvg_t ref, addAvg_t opt)
287
 {
288
-    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
289
-    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
290
+    ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
291
+    ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
292
 
293
     int j = 0;
294
 
295
@@ -895,6 +1067,32 @@
296
     return true;
297
 }
298
 
299
+bool PixelHarness::check_addAvg_aligned(addAvg_t ref, addAvg_t opt)
300
+{
301
+    ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
302
+    ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
303
+
304
+    int j = 0;
305
+
306
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
307
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
308
+    intptr_t stride = STRIDE;
309
+
310
+    for (int i = 0; i < ITERS; i++)
311
+    {
312
+        int index1 = rand() % TEST_CASES;
313
+        int index2 = rand() % TEST_CASES;
314
+        ref(short_test_buff2[index1] + j, short_test_buff2[index2] + j, ref_dest, stride, stride, stride);
315
+        checked(opt, short_test_buff2[index1] + j, short_test_buff2[index2] + j, opt_dest, stride, stride, stride);
316
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
317
+            return false;
318
+
319
+            reportfail();
320
+            j += INCR * 2;
321
+        }
322
+
323
+    return true;
324
+}
325
 bool PixelHarness::check_calSign(sign_t ref, sign_t opt)
326
 {
327
     ALIGN_VAR_16(int8_t, ref_dest[64 * 2]);
328
@@ -2109,15 +2307,22 @@
329
             return false;
330
         }
331
     }
332
-
333
-    if (opt.pu[part].pixelavg_pp)
334
+    if (opt.pu[part].pixelavg_pp[NONALIGNED])
335
     {
336
-        if (!check_pixelavg_pp(ref.pu[part].pixelavg_pp, opt.pu[part].pixelavg_pp))
337
+        if (!check_pixelavg_pp(ref.pu[part].pixelavg_pp[NONALIGNED], opt.pu[part].pixelavg_pp[NONALIGNED]))
338
         {
339
             printf("pixelavg_pp[%s]: failed!\n", lumaPartStr[part]);
340
             return false;
341
         }
342
     }
343
+    if (opt.pu[part].pixelavg_pp[ALIGNED])
344
+    {
345
+        if (!check_pixelavg_pp_aligned(ref.pu[part].pixelavg_pp[ALIGNED], opt.pu[part].pixelavg_pp[ALIGNED]))
346
+        {
347
+            printf("pixelavg_pp_aligned[%s]: failed!\n", lumaPartStr[part]);
348
+            return false;
349
+        }
350
+    }
351
 
352
     if (opt.pu[part].copy_pp)
353
     {
354
@@ -2128,15 +2333,24 @@
355
         }
356
     }
357
 
358
-    if (opt.pu[part].addAvg)
359
+    if (opt.pu[part].addAvg[NONALIGNED])
360
     {
361
-        if (!check_addAvg(ref.pu[part].addAvg, opt.pu[part].addAvg))
362
+        if (!check_addAvg(ref.pu[part].addAvg[NONALIGNED], opt.pu[part].addAvg[NONALIGNED]))
363
         {
364
             printf("addAvg[%s] failed\n", lumaPartStr[part]);
365
             return false;
366
         }
367
     }
368
 
369
+    if (opt.pu[part].addAvg[ALIGNED])
370
+    {
371
+        if (!check_addAvg_aligned(ref.pu[part].addAvg[ALIGNED], opt.pu[part].addAvg[ALIGNED]))
372
+        {
373
+            printf("addAvg_aligned[%s] failed\n", lumaPartStr[part]);
374
+            return false;
375
+        }
376
+    }
377
+
378
     if (part < NUM_CU_SIZES)
379
     {
380
         if (opt.cu[part].sse_pp)
381
@@ -2166,15 +2380,24 @@
382
             }
383
         }
384
 
385
-        if (opt.cu[part].add_ps)
386
+        if (opt.cu[part].add_ps[NONALIGNED])
387
         {
388
-            if (!check_pixel_add_ps(ref.cu[part].add_ps, opt.cu[part].add_ps))
389
+            if (!check_pixel_add_ps(ref.cu[part].add_ps[NONALIGNED], opt.cu[part].add_ps[NONALIGNED]))
390
             {
391
                 printf("add_ps[%s] failed\n", lumaPartStr[part]);
392
                 return false;
393
             }
394
         }
395
 
396
+        if (opt.cu[part].add_ps[ALIGNED])
397
+        {
398
+            if (!check_pixel_add_ps_aligned(ref.cu[part].add_ps[ALIGNED], opt.cu[part].add_ps[ALIGNED]))
399
+            {
400
+                printf("add_ps_aligned[%s] failed\n", lumaPartStr[part]);
401
+                return false;
402
+            }
403
+        }
404
+
405
         if (opt.cu[part].copy_ss)
406
         {
407
             if (!check_copy_ss(ref.cu[part].copy_ss, opt.cu[part].copy_ss))
408
@@ -2213,14 +2436,22 @@
409
                 return false;
410
             }
411
         }
412
-        if (opt.chroma[i].pu[part].addAvg)
413
+        if (opt.chroma[i].pu[part].addAvg[NONALIGNED])
414
         {
415
-            if (!check_addAvg(ref.chroma[i].pu[part].addAvg, opt.chroma[i].pu[part].addAvg))
416
+            if (!check_addAvg(ref.chroma[i].pu[part].addAvg[NONALIGNED], opt.chroma[i].pu[part].addAvg[NONALIGNED]))
417
             {
418
                 printf("chroma_addAvg[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
419
                 return false;
420
             }
421
         }
422
+        if (opt.chroma[i].pu[part].addAvg[ALIGNED])
423
+        {
424
+            if (!check_addAvg_aligned(ref.chroma[i].pu[part].addAvg[ALIGNED], opt.chroma[i].pu[part].addAvg[ALIGNED]))
425
+            {
426
+                printf("chroma_addAvg_aligned[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
427
+                return false;
428
+            }
429
+        }
430
         if (opt.chroma[i].pu[part].satd)
431
         {
432
             if (!check_pixelcmp(ref.chroma[i].pu[part].satd, opt.chroma[i].pu[part].satd))
433
@@ -2247,14 +2478,22 @@
434
                     return false;
435
                 }
436
             }
437
-            if (opt.chroma[i].cu[part].add_ps)
438
+            if (opt.chroma[i].cu[part].add_ps[NONALIGNED])
439
             {
440
-                if (!check_pixel_add_ps(ref.chroma[i].cu[part].add_ps, opt.chroma[i].cu[part].add_ps))
441
+                if (!check_pixel_add_ps(ref.chroma[i].cu[part].add_ps[NONALIGNED], opt.chroma[i].cu[part].add_ps[NONALIGNED]))
442
                 {
443
                     printf("chroma_add_ps[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
444
                     return false;
445
                 }
446
             }
447
+            if (opt.chroma[i].cu[part].add_ps[ALIGNED])
448
+            {
449
+                if (!check_pixel_add_ps_aligned(ref.chroma[i].cu[part].add_ps[ALIGNED], opt.chroma[i].cu[part].add_ps[ALIGNED]))
450
+                {
451
+                    printf("chroma_add_ps_aligned[%s][%s] failed\n", x265_source_csp_names[i], chromaPartStr[i][part]);
452
+                    return false;
453
+                }
454
+            }
455
             if (opt.chroma[i].cu[part].copy_sp)
456
             {
457
                 if (!check_copy_sp(ref.chroma[i].cu[part].copy_sp, opt.chroma[i].cu[part].copy_sp))
458
@@ -2333,15 +2572,23 @@
459
             }
460
         }
461
 
462
-        if (opt.cu[i].blockfill_s)
463
+        if (opt.cu[i].blockfill_s[NONALIGNED])
464
         {
465
-            if (!check_blockfill_s(ref.cu[i].blockfill_s, opt.cu[i].blockfill_s))
466
+            if (!check_blockfill_s(ref.cu[i].blockfill_s[NONALIGNED], opt.cu[i].blockfill_s[NONALIGNED]))
467
             {
468
                 printf("blockfill_s[%dx%d]: failed!\n", 4 << i, 4 << i);
469
                 return false;
470
             }
471
         }
472
 
473
+        if (opt.cu[i].blockfill_s[ALIGNED])
474
+        {
475
+            if (!check_blockfill_s_aligned(ref.cu[i].blockfill_s[ALIGNED], opt.cu[i].blockfill_s[ALIGNED]))
476
+            {
477
+                printf("blockfill_s_aligned[%dx%d]: failed!\n", 4 << i, 4 << i);
478
+                return false;
479
+            }
480
+        }
481
         if (opt.cu[i].var)
482
         {
483
             if (!check_pixel_var(ref.cu[i].var, opt.cu[i].var))
484
@@ -2364,15 +2611,24 @@
485
         {
486
             /* TU only primitives */
487
 
488
-            if (opt.cu[i].calcresidual)
489
+            if (opt.cu[i].calcresidual[NONALIGNED])
490
             {
491
-                if (!check_calresidual(ref.cu[i].calcresidual, opt.cu[i].calcresidual))
492
+                if (!check_calresidual(ref.cu[i].calcresidual[NONALIGNED], opt.cu[i].calcresidual[NONALIGNED]))
493
                 {
494
                     printf("calcresidual width: %d failed!\n", 4 << i);
495
                     return false;
496
                 }
497
             }
498
 
499
+            if (opt.cu[i].calcresidual[ALIGNED])
500
+            {
501
+                if (!check_calresidual_aligned(ref.cu[i].calcresidual[ALIGNED], opt.cu[i].calcresidual[ALIGNED]))
502
+                {
503
+                    printf("calcresidual_aligned width: %d failed!\n", 4 << i);
504
+                    return false;
505
+                }
506
+            }
507
+
508
             if (opt.cu[i].transpose)
509
             {
510
                 if (!check_transpose(ref.cu[i].transpose, opt.cu[i].transpose))
511
@@ -2381,16 +2637,22 @@
512
                     return false;
513
                 }
514
             }
515
-
516
-            if (opt.cu[i].ssd_s)
517
+            if (opt.cu[i].ssd_s[NONALIGNED])
518
             {
519
-                if (!check_ssd_s(ref.cu[i].ssd_s, opt.cu[i].ssd_s))
520
+                if (!check_ssd_s(ref.cu[i].ssd_s[NONALIGNED], opt.cu[i].ssd_s[NONALIGNED]))
521
                 {
522
                     printf("ssd_s[%dx%d]: failed!\n", 4 << i, 4 << i);
523
                     return false;
524
                 }
525
             }
526
-
527
+            if (opt.cu[i].ssd_s[ALIGNED])
528
+            {
529
+                if (!check_ssd_s_aligned(ref.cu[i].ssd_s[ALIGNED], opt.cu[i].ssd_s[ALIGNED]))
530
+                {
531
+                    printf("ssd_s_aligned[%dx%d]: failed!\n", 4 << i, 4 << i);
532
+                    return false;
533
+                }
534
+            }
535
             if (opt.cu[i].copy_cnt)
536
             {
537
                 if (!check_copy_cnt_t(ref.cu[i].copy_cnt, opt.cu[i].copy_cnt))
538
@@ -2417,15 +2679,22 @@
539
                     return false;
540
                 }
541
             }
542
-
543
-            if (opt.cu[i].cpy1Dto2D_shl)
544
+            if (opt.cu[i].cpy1Dto2D_shl[NONALIGNED])
545
             {
546
-                if (!check_cpy1Dto2D_shl_t(ref.cu[i].cpy1Dto2D_shl, opt.cu[i].cpy1Dto2D_shl))
547
+                if (!check_cpy1Dto2D_shl_t(ref.cu[i].cpy1Dto2D_shl[NONALIGNED], opt.cu[i].cpy1Dto2D_shl[NONALIGNED]))
548
                 {
549
                     printf("cpy1Dto2D_shl[%dx%d] failed!\n", 4 << i, 4 << i);
550
                     return false;
551
                 }
552
             }
553
+            if (opt.cu[i].cpy1Dto2D_shl[ALIGNED])
554
+            {
555
+                if (!check_cpy1Dto2D_shl_aligned_t(ref.cu[i].cpy1Dto2D_shl[ALIGNED], opt.cu[i].cpy1Dto2D_shl[ALIGNED]))
556
+                {
557
+                    printf("cpy1Dto2D_shl_aligned[%dx%d] failed!\n", 4 << i, 4 << i);
558
+                    return false;
559
+                }
560
+            }
561
 
562
             if (opt.cu[i].cpy1Dto2D_shr)
563
             {
564
@@ -2465,15 +2734,24 @@
565
         }
566
     }
567
 
568
-    if (opt.scale1D_128to64)
569
+    if (opt.scale1D_128to64[NONALIGNED])
570
     {
571
-        if (!check_scale1D_pp(ref.scale1D_128to64, opt.scale1D_128to64))
572
+        if (!check_scale1D_pp(ref.scale1D_128to64[NONALIGNED], opt.scale1D_128to64[NONALIGNED]))
573
         {
574
             printf("scale1D_128to64 failed!\n");
575
             return false;
576
         }
577
     }
578
 
579
+    if (opt.scale1D_128to64[ALIGNED])
580
+    {
581
+        if (!check_scale1D_pp_aligned(ref.scale1D_128to64[ALIGNED], opt.scale1D_128to64[ALIGNED]))
582
+        {
583
+            printf("scale1D_128to64_aligned failed!\n");
584
+            return false;
585
+        }
586
+    }
587
+
588
     if (opt.scale2D_64to32)
589
     {
590
         if (!check_scale2D_pp(ref.scale2D_64to32, opt.scale2D_64to32))
591
@@ -2830,13 +3108,17 @@
592
         HEADER("satd[%s]", lumaPartStr[part]);
593
         REPORT_SPEEDUP(opt.pu[part].satd, ref.pu[part].satd, pbuf1, STRIDE, fref, STRIDE);
594
     }
595
-
596
-    if (opt.pu[part].pixelavg_pp)
597
+    if (opt.pu[part].pixelavg_pp[NONALIGNED])
598
     {
599
         HEADER("avg_pp[%s]", lumaPartStr[part]);
600
-        REPORT_SPEEDUP(opt.pu[part].pixelavg_pp, ref.pu[part].pixelavg_pp, pbuf1, STRIDE, pbuf2, STRIDE, pbuf3, STRIDE, 32);
601
+        REPORT_SPEEDUP(opt.pu[part].pixelavg_pp[NONALIGNED], ref.pu[part].pixelavg_pp[NONALIGNED], pbuf1, STRIDE, pbuf2, STRIDE, pbuf3, STRIDE, 32);
602
     }
603
 
604
+    if (opt.pu[part].pixelavg_pp[ALIGNED])
605
+    {
606
+        HEADER("avg_pp_aligned[%s]", lumaPartStr[part]);
607
+        REPORT_SPEEDUP(opt.pu[part].pixelavg_pp[ALIGNED], ref.pu[part].pixelavg_pp[ALIGNED], pbuf1, STRIDE, pbuf2, STRIDE, pbuf3, STRIDE, 32);
608
+    }
609
     if (opt.pu[part].sad)
610
     {
611
         HEADER("sad[%s]", lumaPartStr[part]);
612
@@ -2861,10 +3143,15 @@
613
         REPORT_SPEEDUP(opt.pu[part].copy_pp, ref.pu[part].copy_pp, pbuf1, 64, pbuf2, 64);
614
     }
615
 
616
-    if (opt.pu[part].addAvg)
617
+    if (opt.pu[part].addAvg[NONALIGNED])
618
     {
619
         HEADER("addAvg[%s]", lumaPartStr[part]);
620
-        REPORT_SPEEDUP(opt.pu[part].addAvg, ref.pu[part].addAvg, sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
621
+        REPORT_SPEEDUP(opt.pu[part].addAvg[NONALIGNED], ref.pu[part].addAvg[NONALIGNED], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
622
+    }
623
+    if (opt.pu[part].addAvg[ALIGNED])
624
+    {
625
+        HEADER("addAvg_aligned[%s]", lumaPartStr[part]);
626
+        REPORT_SPEEDUP(opt.pu[part].addAvg[ALIGNED], ref.pu[part].addAvg[ALIGNED], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
627
     }
628
 
629
     if (part < NUM_CU_SIZES)
630
@@ -2885,10 +3172,15 @@
631
             HEADER("sub_ps[%s]", lumaPartStr[part]);
632
             REPORT_SPEEDUP(opt.cu[part].sub_ps, ref.cu[part].sub_ps, (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
633
         }
634
-        if (opt.cu[part].add_ps)
635
+        if (opt.cu[part].add_ps[NONALIGNED])
636
         {
637
             HEADER("add_ps[%s]", lumaPartStr[part]);
638
-            REPORT_SPEEDUP(opt.cu[part].add_ps, ref.cu[part].add_ps, pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
639
+            REPORT_SPEEDUP(opt.cu[part].add_ps[NONALIGNED], ref.cu[part].add_ps[NONALIGNED], pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
640
+        }
641
+        if (opt.cu[part].add_ps[ALIGNED])
642
+        {
643
+            HEADER("add_ps_aligned[%s]", lumaPartStr[part]);
644
+            REPORT_SPEEDUP(opt.cu[part].add_ps[ALIGNED], ref.cu[part].add_ps[ALIGNED], pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
645
         }
646
         if (opt.cu[part].copy_ss)
647
         {
648
@@ -2914,10 +3206,15 @@
649
             HEADER("[%s] copy_pp[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
650
             REPORT_SPEEDUP(opt.chroma[i].pu[part].copy_pp, ref.chroma[i].pu[part].copy_pp, pbuf1, 64, pbuf2, 128);
651
         }
652
-        if (opt.chroma[i].pu[part].addAvg)
653
+        if (opt.chroma[i].pu[part].addAvg[NONALIGNED])
654
         {
655
             HEADER("[%s]  addAvg[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
656
-            REPORT_SPEEDUP(opt.chroma[i].pu[part].addAvg, ref.chroma[i].pu[part].addAvg, sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
657
+            REPORT_SPEEDUP(opt.chroma[i].pu[part].addAvg[NONALIGNED], ref.chroma[i].pu[part].addAvg[NONALIGNED], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
658
+        }
659
+        if (opt.chroma[i].pu[part].addAvg[ALIGNED])
660
+        {
661
+            HEADER("[%s]  addAvg_aligned[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
662
+            REPORT_SPEEDUP(opt.chroma[i].pu[part].addAvg[ALIGNED], ref.chroma[i].pu[part].addAvg[ALIGNED], sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
663
         }
664
         if (opt.chroma[i].pu[part].satd)
665
         {
666
@@ -2951,10 +3248,15 @@
667
                 HEADER("[%s]  sub_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
668
                 REPORT_SPEEDUP(opt.chroma[i].cu[part].sub_ps, ref.chroma[i].cu[part].sub_ps, (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
669
             }
670
-            if (opt.chroma[i].cu[part].add_ps)
671
+            if (opt.chroma[i].cu[part].add_ps[NONALIGNED])
672
             {
673
                 HEADER("[%s]  add_ps[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
674
-                REPORT_SPEEDUP(opt.chroma[i].cu[part].add_ps, ref.chroma[i].cu[part].add_ps, pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
675
+                REPORT_SPEEDUP(opt.chroma[i].cu[part].add_ps[NONALIGNED], ref.chroma[i].cu[part].add_ps[NONALIGNED], pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
676
+            }
677
+            if (opt.chroma[i].cu[part].add_ps[ALIGNED])
678
+            {
679
+                HEADER("[%s]  add_ps_aligned[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
680
+                REPORT_SPEEDUP(opt.chroma[i].cu[part].add_ps[ALIGNED], ref.chroma[i].cu[part].add_ps[ALIGNED], pbuf1, FENC_STRIDE, pbuf2, sbuf1, STRIDE, STRIDE);
681
             }
682
             if (opt.chroma[i].cu[part].sa8d)
683
             {
684
@@ -3000,29 +3302,42 @@
685
             measurePartition(part, ref, opt);
686
         }
687
     }
688
-
689
     for (int i = 0; i < NUM_CU_SIZES; i++)
690
     {
691
-        if ((i <= BLOCK_32x32) && opt.cu[i].ssd_s)
692
+        if ((i <= BLOCK_32x32) && opt.cu[i].ssd_s[NONALIGNED])
693
         {
694
             HEADER("ssd_s[%dx%d]", 4 << i, 4 << i);
695
-            REPORT_SPEEDUP(opt.cu[i].ssd_s, ref.cu[i].ssd_s, sbuf1, STRIDE);
696
+            REPORT_SPEEDUP(opt.cu[i].ssd_s[NONALIGNED], ref.cu[i].ssd_s[NONALIGNED], sbuf1, STRIDE);
697
+        }
698
+        if ((i <= BLOCK_32x32) && opt.cu[i].ssd_s[ALIGNED])
699
+        {
700
+            HEADER("ssd_s_aligned[%dx%d]", 4 << i, 4 << i);
701
+            REPORT_SPEEDUP(opt.cu[i].ssd_s[ALIGNED], ref.cu[i].ssd_s[ALIGNED], sbuf1, STRIDE);
702
         }
703
         if (opt.cu[i].sa8d)
704
         {
705
             HEADER("sa8d[%dx%d]", 4 << i, 4 << i);
706
             REPORT_SPEEDUP(opt.cu[i].sa8d, ref.cu[i].sa8d, pbuf1, STRIDE, pbuf2, STRIDE);
707
         }
708
-        if (opt.cu[i].calcresidual)
709
+        if (opt.cu[i].calcresidual[NONALIGNED])
710
         {
711
             HEADER("residual[%dx%d]", 4 << i, 4 << i);
712
-            REPORT_SPEEDUP(opt.cu[i].calcresidual, ref.cu[i].calcresidual, pbuf1, pbuf2, sbuf1, 64);
713
+            REPORT_SPEEDUP(opt.cu[i].calcresidual[NONALIGNED], ref.cu[i].calcresidual[NONALIGNED], pbuf1, pbuf2, sbuf1, 64);
714
         }
715
-
716
-        if (opt.cu[i].blockfill_s)
717
+        if (opt.cu[i].calcresidual[ALIGNED])
718
+        {
719
+            HEADER("residual_aligned[%dx%d]", 4 << i, 4 << i);
720
+            REPORT_SPEEDUP(opt.cu[i].calcresidual[ALIGNED], ref.cu[i].calcresidual[ALIGNED], pbuf1, pbuf2, sbuf1, 64);
721
+        }
722
+        if (opt.cu[i].blockfill_s[NONALIGNED])
723
         {
724
             HEADER("blkfill[%dx%d]", 4 << i, 4 << i);
725
-            REPORT_SPEEDUP(opt.cu[i].blockfill_s, ref.cu[i].blockfill_s, sbuf1, 64, SHORT_MAX);
726
+            REPORT_SPEEDUP(opt.cu[i].blockfill_s[NONALIGNED], ref.cu[i].blockfill_s[NONALIGNED], sbuf1, 64, SHORT_MAX);
727
+        }
728
+        if (opt.cu[i].blockfill_s[ALIGNED])
729
+        {
730
+            HEADER("blkfill_aligned[%dx%d]", 4 << i, 4 << i);
731
+            REPORT_SPEEDUP(opt.cu[i].blockfill_s[ALIGNED], ref.cu[i].blockfill_s[ALIGNED], sbuf1, 64, SHORT_MAX);
732
         }
733
 
734
         if (opt.cu[i].transpose)
735
@@ -3049,13 +3364,17 @@
736
             HEADER("cpy2Dto1D_shr[%dx%d]", 4 << i, 4 << i);
737
             REPORT_SPEEDUP(opt.cu[i].cpy2Dto1D_shr, ref.cu[i].cpy2Dto1D_shr, sbuf1, sbuf2, STRIDE, 3);
738
         }
739
-
740
-        if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shl)
741
+        if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shl[NONALIGNED])
742
         {
743
             HEADER("cpy1Dto2D_shl[%dx%d]", 4 << i, 4 << i);
744
-            REPORT_SPEEDUP(opt.cu[i].cpy1Dto2D_shl, ref.cu[i].cpy1Dto2D_shl, sbuf1, sbuf2, STRIDE, 64);
745
+            REPORT_SPEEDUP(opt.cu[i].cpy1Dto2D_shl[NONALIGNED], ref.cu[i].cpy1Dto2D_shl[NONALIGNED], sbuf1, sbuf2, STRIDE, 64);
746
         }
747
 
748
+        if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shl[ALIGNED])
749
+        {
750
+            HEADER("cpy1Dto2D_shl_aligned[%dx%d]", 4 << i, 4 << i);
751
+            REPORT_SPEEDUP(opt.cu[i].cpy1Dto2D_shl[ALIGNED], ref.cu[i].cpy1Dto2D_shl[ALIGNED], sbuf1, sbuf2, STRIDE, 64);
752
+        }
753
         if ((i < BLOCK_64x64) && opt.cu[i].cpy1Dto2D_shr)
754
         {
755
             HEADER("cpy1Dto2D_shr[%dx%d]", 4 << i, 4 << i);
756
@@ -3093,10 +3412,16 @@
757
         REPORT_SPEEDUP(opt.frameInitLowres, ref.frameInitLowres, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64);
758
     }
759
 
760
-    if (opt.scale1D_128to64)
761
+    if (opt.scale1D_128to64[NONALIGNED])
762
     {
763
         HEADER0("scale1D_128to64");
764
-        REPORT_SPEEDUP(opt.scale1D_128to64, ref.scale1D_128to64, pbuf2, pbuf1);
765
+        REPORT_SPEEDUP(opt.scale1D_128to64[NONALIGNED], ref.scale1D_128to64[NONALIGNED], pbuf2, pbuf1);
766
+    }
767
+
768
+    if (opt.scale1D_128to64[ALIGNED])
769
+    {
770
+        HEADER0("scale1D_128to64_aligned");
771
+        REPORT_SPEEDUP(opt.scale1D_128to64[ALIGNED], ref.scale1D_128to64[ALIGNED], pbuf2, pbuf1);
772
     }
773
 
774
     if (opt.scale2D_64to32)
775
x265_2.7.tar.gz/source/test/pixelharness.h -> x265_2.9.tar.gz/source/test/pixelharness.h Changed
89
 
1
@@ -44,30 +44,30 @@
2
     enum { RMAX = PIXEL_MAX - PIXEL_MIN }; //The maximum value obtained by subtracting pixel values (residual max)
3
     enum { RMIN = PIXEL_MIN - PIXEL_MAX }; //The minimum value obtained by subtracting pixel values (residual min)
4
 
5
-    ALIGN_VAR_32(pixel, pbuf1[BUFFSIZE]);
6
-    pixel    pbuf2[BUFFSIZE];
7
-    pixel    pbuf3[BUFFSIZE];
8
-    pixel    pbuf4[BUFFSIZE];
9
-    int      ibuf1[BUFFSIZE];
10
-    int8_t   psbuf1[BUFFSIZE];
11
-    int8_t   psbuf2[BUFFSIZE];
12
-    int8_t   psbuf3[BUFFSIZE];
13
-    int8_t   psbuf4[BUFFSIZE];
14
-    int8_t   psbuf5[BUFFSIZE];
15
+    ALIGN_VAR_64(pixel, pbuf1[BUFFSIZE]);
16
+    ALIGN_VAR_64(pixel,    pbuf2[BUFFSIZE]);
17
+    ALIGN_VAR_64(pixel,    pbuf3[BUFFSIZE]);
18
+    ALIGN_VAR_64(pixel,    pbuf4[BUFFSIZE]);
19
+    ALIGN_VAR_64(int,      ibuf1[BUFFSIZE]);
20
+    ALIGN_VAR_64(int8_t,   psbuf1[BUFFSIZE]);
21
+    ALIGN_VAR_64(int8_t,   psbuf2[BUFFSIZE]);
22
+    ALIGN_VAR_64(int8_t,   psbuf3[BUFFSIZE]);
23
+    ALIGN_VAR_64(int8_t,   psbuf4[BUFFSIZE]);
24
+    ALIGN_VAR_64(int8_t,   psbuf5[BUFFSIZE]);
25
 
26
-    int16_t  sbuf1[BUFFSIZE];
27
-    int16_t  sbuf2[BUFFSIZE];
28
-    int16_t  sbuf3[BUFFSIZE];
29
+    ALIGN_VAR_64(int16_t,  sbuf1[BUFFSIZE]);
30
+    ALIGN_VAR_64(int16_t,  sbuf2[BUFFSIZE]);
31
+    ALIGN_VAR_64(int16_t,  sbuf3[BUFFSIZE]);
32
 
33
-    pixel    pixel_test_buff[TEST_CASES][BUFFSIZE];
34
-    int16_t  short_test_buff[TEST_CASES][BUFFSIZE];
35
-    int16_t  short_test_buff1[TEST_CASES][BUFFSIZE];
36
-    int16_t  short_test_buff2[TEST_CASES][BUFFSIZE];
37
-    int      int_test_buff[TEST_CASES][BUFFSIZE];
38
-    uint16_t ushort_test_buff[TEST_CASES][BUFFSIZE];
39
-    uint8_t  uchar_test_buff[TEST_CASES][BUFFSIZE];
40
-    double   double_test_buff[TEST_CASES][BUFFSIZE];
41
-    int16_t  residual_test_buff[TEST_CASES][BUFFSIZE];
42
+    ALIGN_VAR_64(pixel,    pixel_test_buff[TEST_CASES][BUFFSIZE]);
43
+    ALIGN_VAR_64(int16_t,  short_test_buff[TEST_CASES][BUFFSIZE]);
44
+    ALIGN_VAR_64(int16_t,  short_test_buff1[TEST_CASES][BUFFSIZE]);
45
+    ALIGN_VAR_64(int16_t,  short_test_buff2[TEST_CASES][BUFFSIZE]);
46
+    ALIGN_VAR_64(int,      int_test_buff[TEST_CASES][BUFFSIZE]);
47
+    ALIGN_VAR_64(uint16_t, ushort_test_buff[TEST_CASES][BUFFSIZE]);
48
+    ALIGN_VAR_64(uint8_t,  uchar_test_buff[TEST_CASES][BUFFSIZE]);
49
+    ALIGN_VAR_64(double,   double_test_buff[TEST_CASES][BUFFSIZE]);
50
+    ALIGN_VAR_64(int16_t,  residual_test_buff[TEST_CASES][BUFFSIZE]);
51
 
52
     bool check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt);
53
     bool check_pixel_sse(pixel_sse_t ref, pixel_sse_t opt);
54
@@ -79,13 +79,19 @@
55
     bool check_copy_ps(copy_ps_t ref, copy_ps_t opt);
56
     bool check_copy_ss(copy_ss_t ref, copy_ss_t opt);
57
     bool check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt);
58
+    bool check_pixelavg_pp_aligned(pixelavg_pp_t ref, pixelavg_pp_t opt);
59
     bool check_pixel_sub_ps(pixel_sub_ps_t ref, pixel_sub_ps_t opt);
60
     bool check_pixel_add_ps(pixel_add_ps_t ref, pixel_add_ps_t opt);
61
+    bool check_pixel_add_ps_aligned(pixel_add_ps_t ref, pixel_add_ps_t opt);
62
     bool check_scale1D_pp(scale1D_t ref, scale1D_t opt);
63
+    bool check_scale1D_pp_aligned(scale1D_t ref, scale1D_t opt);
64
     bool check_scale2D_pp(scale2D_t ref, scale2D_t opt);
65
     bool check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt);
66
+    bool check_ssd_s_aligned(pixel_ssd_s_t ref, pixel_ssd_s_t opt);
67
     bool check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt);
68
+    bool check_blockfill_s_aligned(blockfill_s_t ref, blockfill_s_t opt);
69
     bool check_calresidual(calcresidual_t ref, calcresidual_t opt);
70
+    bool check_calresidual_aligned(calcresidual_t ref, calcresidual_t opt);
71
     bool check_transpose(transpose_t ref, transpose_t opt);
72
     bool check_weightp(weightp_pp_t ref, weightp_pp_t opt);
73
     bool check_weightp(weightp_sp_t ref, weightp_sp_t opt);
74
@@ -93,12 +99,14 @@
75
     bool check_cpy2Dto1D_shl_t(cpy2Dto1D_shl_t ref, cpy2Dto1D_shl_t opt);
76
     bool check_cpy2Dto1D_shr_t(cpy2Dto1D_shr_t ref, cpy2Dto1D_shr_t opt);
77
     bool check_cpy1Dto2D_shl_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt);
78
+    bool check_cpy1Dto2D_shl_aligned_t(cpy1Dto2D_shl_t ref, cpy1Dto2D_shl_t opt);
79
     bool check_cpy1Dto2D_shr_t(cpy1Dto2D_shr_t ref, cpy1Dto2D_shr_t opt);
80
     bool check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt);
81
     bool check_pixel_var(var_t ref, var_t opt);
82
     bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt);
83
     bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
84
     bool check_addAvg(addAvg_t, addAvg_t);
85
+    bool check_addAvg_aligned(addAvg_t, addAvg_t);
86
     bool check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt);
87
     bool check_saoCuOrgE1_t(saoCuOrgE1_t ref, saoCuOrgE1_t opt);
88
     bool check_saoCuOrgE2_t(saoCuOrgE2_t ref[], saoCuOrgE2_t opt[]);
89
x265_2.7.tar.gz/source/test/regression-tests.txt -> x265_2.9.tar.gz/source/test/regression-tests.txt Changed
63
 
1
@@ -23,12 +23,12 @@
2
 BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0 --limit-tu 4
3
 BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 10 --bitrate 7000 --limit-tu 0::--preset slower --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 10 --bitrate 7000 --limit-tu 0
4
 BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3 --limit-tu 3
5
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-save x265_analysis.dat --bitrate 7000 --tskip-fast --limit-tu 2::--preset veryslow --no-cutree --analysis-load x265_analysis.dat --bitrate 7000  --tskip-fast --limit-tu 2
6
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-save x265_analysis.dat --crf 18 --tskip-fast --limit-tu 2::--preset veryslow --no-cutree --analysis-load x265_analysis.dat --crf 18 --tskip-fast --limit-tu 2
7
 BasketballDrive_1920x1080_50.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
8
 Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
9
 Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
10
 Coastguard-4k.y4m,--preset superfast --tune grain --pme --aq-strength 2 --merange 190
11
-Coastguard-4k.y4m,--preset veryfast --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 1 --bitrate 15000::--preset veryfast --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 1 --bitrate 15000
12
+Coastguard-4k.y4m,--preset veryfast --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 1 --qp 35::--preset veryfast --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 1 --qp 35
13
 Coastguard-4k.y4m,--preset medium --rdoq-level 1 --tune ssim --no-signhide --me umh --slices 2
14
 Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1
15
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
16
@@ -69,12 +69,11 @@
17
 KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0 --limit-modes --limit-tu 1
18
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset superfast --tune psnr
19
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain --limit-refs 2
20
-NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-save x265_analysis.dat --rd 5 --analysis-reuse-level 10 --bitrate 9000::--preset slow --no-cutree --analysis-load x265_analysis.dat --rd 5 --analysis-reuse-level 10 --bitrate 9000
21
+NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-save x265_analysis.dat --rd 5 --analysis-reuse-level 10 --bitrate 9000 --vbv-maxrate 9000 --vbv-bufsize 9000::--preset slow --no-cutree --analysis-load x265_analysis.dat --rd 5 --analysis-reuse-level 10 --bitrate 9000 --vbv-maxrate 9000 --vbv-bufsize 9000
22
 News-4k.y4m,--preset ultrafast --no-cutree --analysis-save x265_analysis.dat --analysis-reuse-level 2 --bitrate 15000::--preset ultrafast --no-cutree --analysis-load x265_analysis.dat --analysis-reuse-level 2 --bitrate 15000
23
 News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0
24
 News-4k.y4m,--preset superfast --slices 4 --aq-mode 0 
25
 News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
26
-News-4k.y4m,--preset slower --opt-cu-delta-qp
27
 News-4k.y4m,--preset veryslow --no-rskip
28
 News-4k.y4m,--preset veryslow --pme --crf 40
29
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
30
@@ -104,7 +103,6 @@
31
 city_4cif_60fps.y4m,--preset superfast --rdpenalty 1 --tu-intra-depth 2
32
 city_4cif_60fps.y4m,--preset medium --crf 4 --cu-lossless --sao-non-deblock
33
 city_4cif_60fps.y4m,--preset slower --scaling-list default
34
-city_4cif_60fps.y4m,--preset veryslow --opt-cu-delta-qp
35
 city_4cif_60fps.y4m,--preset veryslow --rdpenalty 2 --sao-non-deblock --no-b-intra --limit-refs 0
36
 ducks_take_off_420_720p50.y4m,--preset ultrafast --constrained-intra --rd 1
37
 ducks_take_off_444_720p50.y4m,--preset superfast --weightp --limit-refs 2
38
@@ -151,7 +149,7 @@
39
 Kimono1_1920x1080_24_400.yuv,--preset veryslow --crf 4 --cu-lossless --slices 2 --limit-refs 3 --limit-modes
40
 Kimono1_1920x1080_24_400.yuv,--preset placebo --ctu 32 --max-tu-size 8 --limit-tu 2
41
 big_buck_bunny_360p24.y4m, --keyint 60 --min-keyint 40 --gop-lookahead 14
42
-BasketballDrive_1920x1080_50.y4m, --preset medium --no-open-gop --keyint 50 --min-keyint 50 --radl 2
43
+BasketballDrive_1920x1080_50.y4m, --preset medium --no-open-gop --keyint 50 --min-keyint 50 --radl 2 --vbv-maxrate 5000 --vbv-bufsize 5000
44
 
45
 # Main12 intraCost overflow bug test
46
 720p50_parkrun_ter.y4m,--preset medium
47
@@ -167,4 +165,15 @@
48
 #low-pass dct test
49
 720p50_parkrun_ter.y4m,--preset medium --lowpass-dct
50
 
51
+#scaled save/load test
52
+crowd_run_1080p50.y4m,--preset ultrafast --no-cutree --analysis-save x265_analysis.dat  --analysis-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 8000 --vbv-bufsize 8000::crowd_run_2160p50.y4m, --preset ultrafast --no-cutree --analysis-load x265_analysis.dat  --analysis-reuse-level 1 --scale-factor 2 --crf 26 --vbv-maxrate 12000 --vbv-bufsize 12000 
53
+crowd_run_1080p50.y4m,--preset superfast --no-cutree --analysis-save x265_analysis.dat  --analysis-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 5000 --vbv-bufsize 5000::crowd_run_2160p50.y4m, --preset superfast --no-cutree --analysis-load x265_analysis.dat  --analysis-reuse-level 2 --scale-factor 2 --crf 22 --vbv-maxrate 10000 --vbv-bufsize 10000 
54
+crowd_run_1080p50.y4m,--preset fast --no-cutree --analysis-save x265_analysis.dat  --analysis-reuse-level 5 --scale-factor 2 --qp 18::crowd_run_2160p50.y4m, --preset fast --no-cutree --analysis-load x265_analysis.dat  --analysis-reuse-level 5 --scale-factor 2 --qp 18
55
+crowd_run_1080p50.y4m,--preset medium --no-cutree --analysis-save x265_analysis.dat  --analysis-reuse-level 10 --scale-factor 2 --bitrate 5000  --vbv-maxrate 5000 --vbv-bufsize 5000 --early-skip --tu-inter-depth 3::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 4 --dynamic-refine::crowd_run_2160p50.y4m, --preset medium --no-cutree --analysis-load x265_analysis.dat  --analysis-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 10000 --early-skip --tu-inter-depth 3 --refine-intra 3 --refine-inter 3
56
+RaceHorses_416x240_30.y4m,--preset slow --no-cutree --ctu 16 --analysis-save x265_analysis.dat --analysis-reuse-level 10 --scale-factor 2 --crf 22  --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m, --preset slow --no-cutree --ctu 32 --analysis-load x265_analysis.dat  --analysis-save x265_analysis_2.dat --analysis-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,--preset slow --no-cutree --ctu 64 --analysis-load x265_analysis_2.dat  --analysis-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
57
+ElFunete_960x540_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune psnr --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500 --analysis-reuse-level 10 --analysis-save elfuente_960x540.dat --scale-factor 2::ElFunete_1920x1080_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune psnr --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500 --analysis-reuse-level 10 --analysis-save elfuente_1920x1080.dat --limit-tu 0 --scale-factor 2 --analysis-load elfuente_960x540.dat --refine-intra 4 --refine-inter 2::ElFuente_3840x2160_60.yuv,--colorprim bt709 --transfer bt709 --chromaloc 2 --aud --repeat-headers --no-opt-qp-pps --no-opt-ref-list-length-pps --wpp --no-interlace --sar 1:1 --min-keyint 60 --no-open-gop --rc-lookahead 180 --bframes 5 --b-intra --ref 4 --cbqpoffs -2 --crqpoffs -2 --lookahead-threads 0 --weightb --qg-size 8 --me star --preset veryslow --frame-threads 1 --b-adapt 2 --aq-mode 3 --rd 6 --pools 15 --colormatrix bt709 --keyint 120 --high-tier --ctu 64 --tune=psnr --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000 --analysis-reuse-level 10 --limit-tu 0 --scale-factor 2 --analysis-load elfuente_1920x1080.dat --refine-intra 4 --refine-inter 2
58
+
59
+#segment encoding
60
+BasketballDrive_1920x1080_50.y4m, --preset ultrafast --no-open-gop --chunk-start 100 --chunk-end 200
61
+
62
 # vim: tw=200
63
x265_2.7.tar.gz/source/test/smoke-tests.txt -> x265_2.9.tar.gz/source/test/smoke-tests.txt Changed
10
 
1
@@ -13,7 +13,7 @@
2
 old_town_cross_444_720p50.y4m,--preset=fast --keyint 20 --min-cu-size 16
3
 old_town_cross_444_720p50.y4m,--preset=slow --sao-non-deblock --pmode --qg-size 32
4
 RaceHorses_416x240_30_10bit.yuv,--preset=veryfast --max-tu-size 8
5
-RaceHorses_416x240_30_10bit.yuv,--preset=slower --bitrate 500 -F4 --rdoq-level 1 --opt-cu-delta-qp
6
+RaceHorses_416x240_30_10bit.yuv,--preset=slower --bitrate 500 -F4 --rdoq-level 1
7
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset=ultrafast --constrained-intra --min-keyint 5 --keyint 10
8
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset=medium --max-tu-size 16 --tu-inter-depth 2 --limit-tu 3
9
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryfast --min-cu 16
10
x265_2.7.tar.gz/source/test/testbench.cpp -> x265_2.9.tar.gz/source/test/testbench.cpp Changed
28
 
1
@@ -96,7 +96,8 @@
2
 
3
 int main(int argc, char *argv[])
4
 {
5
-    int cpuid = X265_NS::cpu_detect();
6
+    bool enableavx512 = true;
7
+    int cpuid = X265_NS::cpu_detect(enableavx512);
8
     const char *testname = 0;
9
 
10
     if (!(argc & 1))
11
@@ -117,7 +118,7 @@
12
         if (!strncmp(name, "cpuid", strlen(name)))
13
         {
14
             bool bError = false;
15
-            cpuid = parseCpuName(value, bError);
16
+            cpuid = parseCpuName(value, bError, enableavx512);
17
             if (bError)
18
             {
19
                 printf("Invalid CPU name: %s\n", value);
20
@@ -169,6 +170,7 @@
21
         { "XOP", X265_CPU_XOP },
22
         { "AVX2", X265_CPU_AVX2 },
23
         { "BMI2", X265_CPU_AVX2 | X265_CPU_BMI1 | X265_CPU_BMI2 },
24
+        { "AVX512", X265_CPU_AVX512 },
25
         { "ARMv6", X265_CPU_ARMV6 },
26
         { "NEON", X265_CPU_NEON },
27
         { "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
28
x265_2.7.tar.gz/source/test/testharness.h -> x265_2.9.tar.gz/source/test/testharness.h Changed
19
 
1
@@ -72,7 +72,7 @@
2
 #include <x86intrin.h>
3
 #elif ( !defined(__APPLE__) && defined (__GNUC__) && defined(__ARM_NEON__))
4
 #include <arm_neon.h>
5
-#elif defined(__GNUC__)
6
+#elif defined(__GNUC__) && (!defined(__clang__) || __clang_major__ < 4)
7
 /* fallback for older GCC/MinGW */
8
 static inline uint32_t __rdtsc(void)
9
 {
10
@@ -91,7 +91,7 @@
11
 }
12
 #endif // ifdef _MSC_VER
13
 
14
-#define BENCH_RUNS 1000
15
+#define BENCH_RUNS 2000
16
 
17
 // Adapted from checkasm.c, runs each optimized primitive four times, measures rdtsc
18
 // and discards invalid times.  Repeats 1000 times to get a good average.  Then measures
19
x265_2.7.tar.gz/source/x265.cpp -> x265_2.9.tar.gz/source/x265.cpp Changed
121
 
1
@@ -75,6 +75,7 @@
2
     const char* reconPlayCmd;
3
     const x265_api* api;
4
     x265_param* param;
5
+    x265_vmaf_data* vmafData;
6
     bool bProgress;
7
     bool bForceY4m;
8
     bool bDither;
9
@@ -96,6 +97,7 @@
10
         reconPlayCmd = NULL;
11
         api = NULL;
12
         param = NULL;
13
+        vmafData = NULL;
14
         framesToBeEncoded = seek = 0;
15
         totalbytes = 0;
16
         bProgress = true;
17
@@ -142,7 +144,7 @@
18
     {
19
         int eta = (int)(elapsed * (framesToBeEncoded - frameNum) / ((int64_t)frameNum * 1000000));
20
         sprintf(buf, "x265 [%.1f%%] %d/%d frames, %.2f fps, %.2f kb/s, eta %d:%02d:%02d",
21
-                100. * frameNum / framesToBeEncoded, frameNum, framesToBeEncoded, fps, bitrate,
22
+            100. * frameNum / (param->chunkEnd ? param->chunkEnd : param->totalFrames), frameNum, (param->chunkEnd ? param->chunkEnd : param->totalFrames), fps, bitrate,
23
                 eta / 3600, (eta / 60) % 60, eta % 60);
24
     }
25
     else
26
@@ -216,6 +218,14 @@
27
         x265_log(NULL, X265_LOG_ERROR, "param alloc failed\n");
28
         return true;
29
     }
30
+#if ENABLE_LIBVMAF
31
+    vmafData = (x265_vmaf_data*)x265_malloc(sizeof(x265_vmaf_data));
32
+    if(!vmafData)
33
+    {
34
+        x265_log(NULL, X265_LOG_ERROR, "vmaf data alloc failed\n");
35
+        return true;
36
+    }
37
+#endif
38
 
39
     if (api->param_default_preset(param, preset, tune) < 0)
40
     {
41
@@ -363,6 +373,7 @@
42
     info.frameCount = 0;
43
     getParamAspectRatio(param, info.sarWidth, info.sarHeight);
44
 
45
+
46
     this->input = InputFile::open(info, this->bForceY4m);
47
     if (!this->input || this->input->isFail())
48
     {
49
@@ -392,7 +403,7 @@
50
     if (this->framesToBeEncoded == 0 && info.frameCount > (int)seek)
51
         this->framesToBeEncoded = info.frameCount - seek;
52
     param->totalFrames = this->framesToBeEncoded;
53
-
54
+    
55
     /* Force CFR until we have support for VFR */
56
     info.timebaseNum = param->fpsDenom;
57
     info.timebaseDenom = param->fpsNum;
58
@@ -439,7 +450,30 @@
59
                     param->sourceWidth, param->sourceHeight, param->fpsNum, param->fpsDenom,
60
                     x265_source_csp_names[param->internalCsp]);
61
     }
62
+#if ENABLE_LIBVMAF
63
+    if (!reconfn)
64
+    {
65
+        x265_log(param, X265_LOG_ERROR, "recon file must be specified to get VMAF score, try --help for help\n");
66
+        return true;
67
+    }
68
+    const char *str = strrchr(info.filename, '.');
69
 
70
+    if (!strcmp(str, ".y4m"))
71
+    {
72
+        x265_log(param, X265_LOG_ERROR, "VMAF supports YUV file format only.\n");
73
+        return true; 
74
+    }
75
+    if(param->internalCsp == X265_CSP_I420 || param->internalCsp == X265_CSP_I422 || param->internalCsp == X265_CSP_I444)
76
+    {
77
+        vmafData->reference_file = x265_fopen(inputfn, "rb");
78
+        vmafData->distorted_file = x265_fopen(reconfn, "rb");
79
+    }
80
+    else
81
+    {
82
+        x265_log(param, X265_LOG_ERROR, "VMAF will support only yuv420p, yu422p, yu444p, yuv420p10le, yuv422p10le, yuv444p10le formats.\n");
83
+        return true;
84
+    }
85
+#endif
86
     this->output = OutputFile::open(outputfn, info);
87
     if (this->output->isFail())
88
     {
89
@@ -555,7 +589,9 @@
90
 
91
     x265_param* param = cliopt.param;
92
     const x265_api* api = cliopt.api;
93
-
94
+#if ENABLE_LIBVMAF
95
+    x265_vmaf_data* vmafdata = cliopt.vmafData;
96
+#endif
97
     /* This allows muxers to modify bitstream format */
98
     cliopt.output->setParam(param);
99
 
100
@@ -712,7 +748,7 @@
101
         if (!numEncoded)
102
             break;
103
     }
104
-
105
+  
106
     /* clear progress report */
107
     if (cliopt.bProgress)
108
         fprintf(stderr, "%*s\r", 80, " ");
109
@@ -723,7 +759,11 @@
110
 
111
     api->encoder_get_stats(encoder, &stats, sizeof(stats));
112
     if (param->csvfn && !b_ctrl_c)
113
+#if ENABLE_LIBVMAF
114
+        api->vmaf_encoder_log(encoder, argc, argv, param, vmafdata);
115
+#else
116
         api->encoder_log(encoder, argc, argv);
117
+#endif
118
     api->encoder_close(encoder);
119
 
120
     int64_t second_largest_pts = 0;
121
x265_2.7.tar.gz/source/x265.h -> x265_2.9.tar.gz/source/x265.h Changed
425
 
1
@@ -31,6 +31,10 @@
2
 extern "C" {
3
 #endif
4
 
5
+#if _MSC_VER
6
+#pragma warning(disable: 4201) // non-standard extension used (nameless struct/union)
7
+#endif
8
+
9
 /* x265_encoder:
10
  *      opaque handler for encoder */
11
 typedef struct x265_encoder x265_encoder;
12
@@ -105,25 +109,107 @@
13
     int       lastMiniGopBFrame;
14
     int       plannedType[X265_LOOKAHEAD_MAX + 1];
15
     int64_t   dts;
16
+    int64_t   reorderedPts;
17
 } x265_lookahead_data;
18
 
19
+typedef struct x265_analysis_validate
20
+{
21
+    int     maxNumReferences;
22
+    int     analysisReuseLevel;
23
+    int     sourceWidth;
24
+    int     sourceHeight;
25
+    int     keyframeMax;
26
+    int     keyframeMin;
27
+    int     openGOP;
28
+    int     bframes;
29
+    int     bPyramid;
30
+    int     maxCUSize;
31
+    int     minCUSize;
32
+    int     intraRefresh;
33
+    int     lookaheadDepth;
34
+    int     chunkStart;
35
+    int     chunkEnd;
36
+}x265_analysis_validate;
37
+
38
+/* Stores intra analysis data for a single frame. This struct needs better packing */
39
+typedef struct x265_analysis_intra_data
40
+{
41
+    uint8_t*  depth;
42
+    uint8_t*  modes;
43
+    char*     partSizes;
44
+    uint8_t*  chromaModes;
45
+}x265_analysis_intra_data;
46
+
47
+typedef struct x265_analysis_MV
48
+{
49
+    union{
50
+        struct { int16_t x, y; };
51
+
52
+        int32_t word;
53
+    };
54
+}x265_analysis_MV;
55
+
56
+/* Stores inter analysis data for a single frame */
57
+typedef struct x265_analysis_inter_data
58
+{
59
+    int32_t*    ref;
60
+    uint8_t*    depth;
61
+    uint8_t*    modes;
62
+    uint8_t*    partSize;
63
+    uint8_t*    mergeFlag;
64
+    uint8_t*    interDir;
65
+    uint8_t*    mvpIdx[2];
66
+    int8_t*     refIdx[2];
67
+    x265_analysis_MV*         mv[2];
68
+    int64_t*     sadCost;
69
+}x265_analysis_inter_data;
70
+
71
+typedef struct x265_weight_param
72
+{
73
+    uint32_t log2WeightDenom;
74
+    int      inputWeight;
75
+    int      inputOffset;
76
+    int      wtPresent;
77
+}x265_weight_param;
78
+
79
+#if X265_DEPTH < 10
80
+typedef uint32_t sse_t;
81
+#else
82
+typedef uint64_t sse_t;
83
+#endif
84
+
85
+typedef struct x265_analysis_distortion_data
86
+{
87
+    sse_t*        distortion;
88
+    sse_t*        ctuDistortion;
89
+    double*       scaledDistortion;
90
+    double        averageDistortion;
91
+    double        sdDistortion;
92
+    uint32_t      highDistortionCtuCount;
93
+    uint32_t      lowDistortionCtuCount;
94
+    double*       offset;
95
+    double*       threshold;
96
+}x265_analysis_distortion_data;
97
+
98
 /* Stores all analysis data for a single frame */
99
 typedef struct x265_analysis_data
100
 {
101
-    int64_t          satdCost;
102
-    uint32_t         frameRecordSize;
103
-    uint32_t         poc;
104
-    uint32_t         sliceType;
105
-    uint32_t         numCUsInFrame;
106
-    uint32_t         numPartitions;
107
-    uint32_t         depthBytes;
108
-    int              bScenecut;
109
-    void*            wt;
110
-    void*            interData;
111
-    void*            intraData;
112
-    uint32_t         numCuInHeight;
113
-    x265_lookahead_data lookahead;
114
-    uint8_t*         modeFlag[2];
115
+    int64_t                           satdCost;
116
+    uint32_t                          frameRecordSize;
117
+    uint32_t                          poc;
118
+    uint32_t                          sliceType;
119
+    uint32_t                          numCUsInFrame;
120
+    uint32_t                          numPartitions;
121
+    uint32_t                          depthBytes;
122
+    int                               bScenecut;
123
+    x265_weight_param*                wt;
124
+    x265_analysis_inter_data*         interData;
125
+    x265_analysis_intra_data*         intraData;
126
+    uint32_t                          numCuInHeight;
127
+    x265_lookahead_data               lookahead;
128
+    uint8_t*                          modeFlag[2];
129
+    x265_analysis_validate            saveParam;
130
+    x265_analysis_distortion_data*    distortionData;
131
 } x265_analysis_data;
132
 
133
 /* cu statistics */
134
@@ -152,14 +238,6 @@
135
     /* All the above values will add up to 100%. */
136
 } x265_pu_stats;
137
 
138
-
139
-typedef struct x265_analysis_2Pass
140
-{
141
-    uint32_t      poc;
142
-    uint32_t      frameRecordSize;
143
-    void*         analysisFramedata;
144
-}x265_analysis_2Pass;
145
-
146
 /* Frame level statistics */
147
 typedef struct x265_frame_stats
148
 {
149
@@ -208,6 +286,8 @@
150
     x265_cu_stats    cuStats;
151
     x265_pu_stats    puStats;
152
     double           totalFrameTime;
153
+    double           vmafFrameScore;
154
+    double           bufferFillFinal;
155
 } x265_frame_stats;
156
 
157
 typedef struct x265_ctu_info_t
158
@@ -264,6 +344,7 @@
159
     REGION_REFRESH_INFO                  = 134,
160
     MASTERING_DISPLAY_INFO               = 137,
161
     CONTENT_LIGHT_LEVEL_INFO             = 144,
162
+    ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147,
163
 } SEIPayloadType;
164
 
165
 typedef struct x265_sei_payload
166
@@ -362,7 +443,8 @@
167
 
168
     int    height;
169
 
170
-    x265_analysis_2Pass analysis2Pass;
171
+    // pts is reordered in the order of encoding.
172
+    int64_t reorderedPts;
173
 } x265_picture;
174
 
175
 typedef enum
176
@@ -378,39 +460,38 @@
177
 /* CPU flags */
178
 
179
 /* x86 */
180
-#define X265_CPU_CMOV            0x0000001
181
-#define X265_CPU_MMX             0x0000002
182
-#define X265_CPU_MMX2            0x0000004  /* MMX2 aka MMXEXT aka ISSE */
183
+#define X265_CPU_MMX             (1 << 0)
184
+#define X265_CPU_MMX2            (1 << 1)  /* MMX2 aka MMXEXT aka ISSE */
185
 #define X265_CPU_MMXEXT          X265_CPU_MMX2
186
-#define X265_CPU_SSE             0x0000008
187
-#define X265_CPU_SSE2            0x0000010
188
-#define X265_CPU_SSE3            0x0000020
189
-#define X265_CPU_SSSE3           0x0000040
190
-#define X265_CPU_SSE4            0x0000080  /* SSE4.1 */
191
-#define X265_CPU_SSE42           0x0000100  /* SSE4.2 */
192
-#define X265_CPU_LZCNT           0x0000200  /* Phenom support for "leading zero count" instruction. */
193
-#define X265_CPU_AVX             0x0000400  /* AVX support: requires OS support even if YMM registers aren't used. */
194
-#define X265_CPU_XOP             0x0000800  /* AMD XOP */
195
-#define X265_CPU_FMA4            0x0001000  /* AMD FMA4 */
196
-#define X265_CPU_AVX2            0x0002000  /* AVX2 */
197
-#define X265_CPU_FMA3            0x0004000  /* Intel FMA3 */
198
-#define X265_CPU_BMI1            0x0008000  /* BMI1 */
199
-#define X265_CPU_BMI2            0x0010000  /* BMI2 */
200
+#define X265_CPU_SSE             (1 << 2)
201
+#define X265_CPU_SSE2            (1 << 3)
202
+#define X265_CPU_LZCNT           (1 << 4)
203
+#define X265_CPU_SSE3            (1 << 5)
204
+#define X265_CPU_SSSE3           (1 << 6)
205
+#define X265_CPU_SSE4            (1 << 7)  /* SSE4.1 */
206
+#define X265_CPU_SSE42           (1 << 8)  /* SSE4.2 */
207
+#define X265_CPU_AVX             (1 << 9)  /* Requires OS support even if YMM registers aren't used. */
208
+#define X265_CPU_XOP             (1 << 10)  /* AMD XOP */
209
+#define X265_CPU_FMA4            (1 << 11)  /* AMD FMA4 */
210
+#define X265_CPU_FMA3            (1 << 12)  /* Intel FMA3 */
211
+#define X265_CPU_BMI1            (1 << 13)  /* BMI1 */
212
+#define X265_CPU_BMI2            (1 << 14)  /* BMI2 */
213
+#define X265_CPU_AVX2            (1 << 15)  /* AVX2 */
214
+#define X265_CPU_AVX512          (1 << 16)  /* AVX-512 {F, CD, BW, DQ, VL}, requires OS support */
215
 /* x86 modifiers */
216
-#define X265_CPU_CACHELINE_32    0x0020000  /* avoid memory loads that span the border between two cachelines */
217
-#define X265_CPU_CACHELINE_64    0x0040000  /* 32/64 is the size of a cacheline in bytes */
218
-#define X265_CPU_SSE2_IS_SLOW    0x0080000  /* avoid most SSE2 functions on Athlon64 */
219
-#define X265_CPU_SSE2_IS_FAST    0x0100000  /* a few functions are only faster on Core2 and Phenom */
220
-#define X265_CPU_SLOW_SHUFFLE    0x0200000  /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
221
-#define X265_CPU_STACK_MOD4      0x0400000  /* if stack is only mod4 and not mod16 */
222
-#define X265_CPU_SLOW_CTZ        0x0800000  /* BSR/BSF x86 instructions are really slow on some CPUs */
223
-#define X265_CPU_SLOW_ATOM       0x1000000  /* The Atom is terrible: slow SSE unaligned loads, slow
224
+#define X265_CPU_CACHELINE_32    (1 << 17)  /* avoid memory loads that span the border between two cachelines */
225
+#define X265_CPU_CACHELINE_64    (1 << 18)  /* 32/64 is the size of a cacheline in bytes */
226
+#define X265_CPU_SSE2_IS_SLOW    (1 << 19)  /* avoid most SSE2 functions on Athlon64 */
227
+#define X265_CPU_SSE2_IS_FAST    (1 << 20)  /* a few functions are only faster on Core2 and Phenom */
228
+#define X265_CPU_SLOW_SHUFFLE    (1 << 21)  /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
229
+#define X265_CPU_STACK_MOD4      (1 << 22)  /* if stack is only mod4 and not mod16 */
230
+#define X265_CPU_SLOW_ATOM       (1 << 23)  /* The Atom is terrible: slow SSE unaligned loads, slow
231
                                              * SIMD multiplies, slow SIMD variable shifts, slow pshufb,
232
                                              * cacheline split penalties -- gather everything here that
233
                                              * isn't shared by other CPUs to avoid making half a dozen
234
                                              * new SLOW flags. */
235
-#define X265_CPU_SLOW_PSHUFB     0x2000000  /* such as on the Intel Atom */
236
-#define X265_CPU_SLOW_PALIGNR    0x4000000  /* such as on the AMD Bobcat */
237
+#define X265_CPU_SLOW_PSHUFB     (1 << 24)  /* such as on the Intel Atom */
238
+#define X265_CPU_SLOW_PALIGNR    (1 << 25)  /* such as on the AMD Bobcat */
239
 
240
 /* ARM */
241
 #define X265_CPU_ARMV6           0x0000001
242
@@ -459,11 +540,9 @@
243
 #define X265_AQ_VARIANCE             1
244
 #define X265_AQ_AUTO_VARIANCE        2
245
 #define X265_AQ_AUTO_VARIANCE_BIASED 3
246
-
247
 #define x265_ADAPT_RD_STRENGTH   4
248
-
249
+#define X265_REFINE_INTER_LEVELS 3
250
 /* NOTE! For this release only X265_CSP_I420 and X265_CSP_I444 are supported */
251
-
252
 /* Supported internal color space types (according to semantics of chroma_format_idc) */
253
 #define X265_CSP_I400           0  /* yuv 4:0:0 planar */
254
 #define X265_CSP_I420           1  /* yuv 4:2:0 planar */
255
@@ -535,6 +614,7 @@
256
     double                elapsedEncodeTime;    /* wall time since encoder was opened */
257
     double                elapsedVideoTime;     /* encoded picture count / frame rate */
258
     double                bitrate;              /* accBits / elapsed video time */
259
+    double                aggregateVmafScore;   /* aggregate VMAF score for input video*/
260
     uint64_t              accBits;              /* total bits output thus far */
261
     uint32_t              encodedPictureCount;  /* number of output pictures thus far */
262
     uint32_t              totalWPFrames;        /* number of uni-directional weighted frames used */
263
@@ -571,6 +651,47 @@
264
     float bitrateFactor;
265
 } x265_zone;
266
     
267
+/* data to calculate aggregate VMAF score */
268
+typedef struct x265_vmaf_data
269
+{
270
+    int width;
271
+    int height;
272
+    size_t offset; 
273
+    int internalBitDepth;
274
+    FILE *reference_file; /* FILE pointer for input file */
275
+    FILE *distorted_file; /* FILE pointer for recon file generated*/
276
+}x265_vmaf_data;
277
+
278
+/* data to calculate frame level VMAF score */
279
+typedef struct x265_vmaf_framedata
280
+{
281
+    int width;
282
+    int height;
283
+    int frame_set; 
284
+    int internalBitDepth; 
285
+    void *reference_frame; /* points to fenc of particular frame */
286
+    void *distorted_frame; /* points to recon of particular frame */
287
+}x265_vmaf_framedata;
288
+
289
+/* common data needed to calculate both frame level and video level VMAF scores */
290
+typedef struct x265_vmaf_commondata
291
+{
292
+    char *format;
293
+    char *model_path;
294
+    char *log_path;
295
+    char *log_fmt;
296
+    int disable_clip;
297
+    int disable_avx;
298
+    int enable_transform;
299
+    int phone_model;
300
+    int psnr;
301
+    int ssim;
302
+    int ms_ssim;
303
+    char *pool;
304
+}x265_vmaf_commondata;
305
+
306
+static const x265_vmaf_commondata vcd[] = { { NULL, (char *)"/usr/local/share/model/vmaf_v0.6.1.pkl", NULL, NULL, 0, 0, 0, 0, 0, 0, 0, NULL } };
307
+
308
 /* x265 input parameters
309
  *
310
  * For version safety you may use x265_param_alloc/free() to manage the
311
@@ -584,7 +705,6 @@
312
      * somehow flawed on your target hardware. The asm function tables are
313
      * process global, the first encoder configures them for all encoders */
314
     int       cpuid;
315
-
316
     /*== Parallelism Features ==*/
317
 
318
     /* Number of concurrently encoded frames between 1 and X265_MAX_FRAME_THREADS
319
@@ -1153,6 +1273,18 @@
320
      * Default is 0, which is recommended */
321
     int       crQpOffset;
322
 
323
+   /* Specifies the preferred transfer characteristics syntax element in the
324
+    * alternative transfer characteristics SEI message (see. D.2.38 and D.3.38 of
325
+    * JCTVC-W1005 http://phenix.it-sudparis.eu/jct/doc_end_user/documents/23_San%20Diego/wg11/JCTVC-W1005-v4.zip
326
+    * */
327
+   int       preferredTransferCharacteristics;
328
+   
329
+   /*
330
+    * Specifies the value for the pic_struc syntax element of the picture timing SEI message (See D2.3 and D3.3)
331
+    * of the HEVC spec. for a detailed explanation
332
+    * */
333
+   int       pictureStructure; 
334
+
335
     struct
336
     {
337
         /* Explicit mode of rate-control, necessary for API users. It must
338
@@ -1548,6 +1680,36 @@
339
 
340
     /*Number of RADL pictures allowed in front of IDR*/
341
     int radl;
342
+
343
+    /* This value controls the maximum AU size defined in specification
344
+     * It represents the percentage of maximum AU size used.
345
+     * Default is 1 (which is 100%). Range is 0.5 to 1. */
346
+    double maxAUSizeFactor;
347
+
348
+    /* Enables the emission of a Recovery Point SEI with the stream headers
349
+    * at each IDR frame describing poc of the recovery point, exact matching flag
350
+    * and broken link flag. Default is disabled. */
351
+    int       bEmitIDRRecoverySEI;
352
+
353
+    /* Dynamically change refine-inter at block level*/
354
+    int       bDynamicRefine;
355
+
356
+    /* Enable writing all SEI messgaes in one single NAL instead of mul*/
357
+    int       bSingleSeiNal;
358
+
359
+
360
+    /* First frame of the chunk. Frames preceeding this in display order will
361
+    * be encoded, however, they will be discarded in the bitstream.
362
+    * Default 0 (disabled). */
363
+    int       chunkStart;
364
+
365
+    /* Last frame of the chunk. Frames following this in display order will be
366
+    * used in taking lookahead decisions, but, they will not be encoded.
367
+    * Default 0 (disabled). */
368
+    int       chunkEnd;
369
+    /* File containing base64 encoded SEI messages in POC order */
370
+    const char*    naluFile;
371
+
372
 } x265_param;
373
 
374
 /* x265_param_alloc:
375
@@ -1660,6 +1822,14 @@
376
  *      A static string describing the compiler and target architecture */
377
 X265_API extern const char *x265_build_info_str;
378
 
379
+/* x265_alloc_analysis_data:
380
+*     Allocate memory for the x265_analysis_data object's internal structures. */
381
+void x265_alloc_analysis_data(x265_param *param, x265_analysis_data* analysis);
382
+
383
+/*
384
+*    Free the allocated memory for x265_analysis_data object's internal structures. */
385
+void x265_free_analysis_data(x265_param *param, x265_analysis_data* analysis);
386
+
387
 /* Force a link error in the case of linking against an incompatible API version.
388
  * Glue #defines exist to force correct macro expansion; the final output of the macro
389
  * is x265_encoder_open_##X265_BUILD (for purposes of dlopen). */
390
@@ -1787,6 +1957,22 @@
391
 /* In-place downshift from a bit-depth greater than 8 to a bit-depth of 8, using
392
  * the residual bits to dither each row. */
393
 void x265_dither_image(x265_picture *, int picWidth, int picHeight, int16_t *errorBuf, int bitDepth);
394
+#if ENABLE_LIBVMAF
395
+/* x265_calculate_vmafScore:
396
+ *    returns VMAF score for the input video.
397
+ *    This api must be called only after encoding was done. */
398
+double x265_calculate_vmafscore(x265_param*, x265_vmaf_data*);
399
+
400
+/* x265_calculate_vmaf_framelevelscore:
401
+ *    returns VMAF score for each frame in a given input video. */
402
+double x265_calculate_vmaf_framelevelscore(x265_vmaf_framedata*);
403
+/* x265_vmaf_encoder_log:
404
+ *       write a line to the configured CSV file.  If a CSV filename was not
405
+ *       configured, or file open failed, this function will perform no write.
406
+ *       This api will be called only when ENABLE_LIBVMAF cmake option is set */
407
+void x265_vmaf_encoder_log(x265_encoder *encoder, int argc, char **argv, x265_param*, x265_vmaf_data*);
408
+
409
+#endif
410
 
411
 #define X265_MAJOR_VERSION 1
412
 
413
@@ -1840,6 +2026,11 @@
414
     void          (*csvlog_encode)(const x265_param*, const x265_stats *, int, int, int, char**);
415
     void          (*dither_image)(x265_picture*, int, int, int16_t*, int);
416
     int           (*set_analysis_data)(x265_encoder *encoder, x265_analysis_data *analysis_data, int poc, uint32_t cuBytes);
417
+#if ENABLE_LIBVMAF
418
+    double        (*calculate_vmafscore)(x265_param *, x265_vmaf_data *);
419
+    double        (*calculate_vmaf_framelevelscore)(x265_vmaf_framedata *);
420
+    void          (*vmaf_encoder_log)(x265_encoder*, int, char**, x265_param *, x265_vmaf_data *);
421
+#endif
422
     /* add new pointers to the end, or increment X265_MAJOR_VERSION */
423
 } x265_api;
424
 
425
x265_2.7.tar.gz/source/x265cli.h -> x265_2.9.tar.gz/source/x265cli.h Changed
104
 
1
@@ -152,6 +152,8 @@
2
     { "vbv-init",       required_argument, NULL, 0 },
3
     { "vbv-end",        required_argument, NULL, 0 },
4
     { "vbv-end-fr-adj", required_argument, NULL, 0 },
5
+    { "chunk-start",    required_argument, NULL, 0 },
6
+    { "chunk-end",      required_argument, NULL, 0 },
7
     { "bitrate",        required_argument, NULL, 0 },
8
     { "qp",             required_argument, NULL, 'q' },
9
     { "aq-mode",        required_argument, NULL, 0 },
10
@@ -263,6 +265,8 @@
11
     { "scale-factor",   required_argument, NULL, 0 },
12
     { "refine-intra",   required_argument, NULL, 0 },
13
     { "refine-inter",   required_argument, NULL, 0 },
14
+    { "dynamic-refine",       no_argument, NULL, 0 },
15
+    { "no-dynamic-refine",    no_argument, NULL, 0 },
16
     { "strict-cbr",           no_argument, NULL, 0 },
17
     { "temporal-layers",      no_argument, NULL, 0 },
18
     { "no-temporal-layers",   no_argument, NULL, 0 },
19
@@ -293,6 +297,14 @@
20
     { "refine-mv-type", required_argument, NULL, 0 },
21
     { "copy-pic",             no_argument, NULL, 0 },
22
     { "no-copy-pic",          no_argument, NULL, 0 },
23
+    { "max-ausize-factor", required_argument, NULL, 0 },
24
+    { "idr-recovery-sei",     no_argument, NULL, 0 },
25
+    { "no-idr-recovery-sei",  no_argument, NULL, 0 },
26
+    { "single-sei", no_argument, NULL, 0 },
27
+    { "no-single-sei", no_argument, NULL, 0 },
28
+    { "atc-sei", required_argument, NULL, 0 },
29
+    { "pic-struct", required_argument, NULL, 0 },
30
+    { "nalu-file", required_argument, NULL, 0 },
31
     { 0, 0, 0, 0 },
32
     { 0, 0, 0, 0 },
33
     { 0, 0, 0, 0 },
34
@@ -343,6 +355,7 @@
35
     H0("   --dhdr10-info <filename>      JSON file containing the Creative Intent Metadata to be encoded as Dynamic Tone Mapping\n");
36
     H0("   --[no-]dhdr10-opt             Insert tone mapping SEI only for IDR frames and when the tone mapping information changes. Default disabled\n");
37
 #endif
38
+    H0("   --nalu-file <filename>        Text file containing SEI messages in the following format : <POC><space><PREFIX><space><NAL UNIT TYPE>/<SEI TYPE><space><SEI Payload>\n");
39
     H0("-f/--frames <integer>            Maximum number of frames to encode. Default all\n");
40
     H0("   --seek <integer>              First frame to encode\n");
41
     H1("   --[no-]interlace <bff|tff>    Indicate input pictures are interlace fields in temporal order. Default progressive\n");
42
@@ -389,7 +402,7 @@
43
     H0("   --[no-]early-skip             Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
44
     H0("   --[no-]rskip                  Enable early exit from recursion. Default %s\n", OPT(param->bEnableRecursionSkip));
45
     H1("   --[no-]tskip-fast             Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
46
-    H1("   --[no-]splitrd-skip           Enable skipping split RD analysis when sum of split CU rdCost larger than none split CU rdCost for Intra CU. Default %s\n", OPT(param->bEnableSplitRdSkip));
47
+    H1("   --[no-]splitrd-skip           Enable skipping split RD analysis when sum of split CU rdCost larger than one split CU rdCost for Intra CU. Default %s\n", OPT(param->bEnableSplitRdSkip));
48
     H1("   --nr-intra <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n");
49
     H1("   --nr-inter <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in inter CUs. Default 0\n");
50
     H0("   --ctu-info <integer>          Enable receiving ctu information asynchronously and determine reaction to the CTU information (0, 1, 2, 4, 6) Default 0\n"
51
@@ -459,6 +472,8 @@
52
     H0("   --vbv-init <float>            Initial VBV buffer occupancy (fraction of bufsize or in kbits). Default %.2f\n", param->rc.vbvBufferInit);
53
     H0("   --vbv-end <float>             Final VBV buffer emptiness (fraction of bufsize or in kbits). Default 0 (disabled)\n");
54
     H0("   --vbv-end-fr-adj <float>      Frame from which qp has to be adjusted to achieve final decode buffer emptiness. Default 0\n");
55
+    H0("   --chunk-start <integer>       First frame of the chunk. Default 0 (disabled)\n");
56
+    H0("   --chunk-end <integer>         Last frame of the chunk. Default 0 (disabled)\n");
57
     H0("   --pass                        Multi pass rate control.\n"
58
        "                                   - 1 : First pass, creates stats file\n"
59
        "                                   - 2 : Last pass, does not overwrite stats file\n"
60
@@ -475,11 +490,12 @@
61
     H0("   --analysis-reuse-level <1..10>      Level of analysis reuse indicates amount of info stored/reused in save/load mode, 1:least..10:most. Default %d\n", param->analysisReuseLevel);
62
     H0("   --refine-mv-type <string>     Reuse MV information received through API call. Supported option is avc. Default disabled - %d\n", param->bMVType);
63
     H0("   --scale-factor <int>          Specify factor by which input video is scaled down for analysis save mode. Default %d\n", param->scaleFactor);
64
-    H0("   --refine-intra <0..3>         Enable intra refinement for encode that uses analysis-load.\n"
65
+    H0("   --refine-intra <0..4>         Enable intra refinement for encode that uses analysis-load.\n"
66
         "                                    - 0 : Forces both mode and depth from the save encode.\n"
67
         "                                    - 1 : Functionality of (0) + evaluate all intra modes at min-cu-size's depth when current depth is one smaller than min-cu-size's depth.\n"
68
         "                                    - 2 : Functionality of (1) + irrespective of size evaluate all angular modes when the save encode decides the best mode as angular.\n"
69
         "                                    - 3 : Functionality of (1) + irrespective of size evaluate all intra modes.\n"
70
+        "                                    - 4 : Re-evaluate all intra blocks, does not reuse data from save encode.\n"
71
         "                                Default:%d\n", param->intraRefine);
72
     H0("   --refine-inter <0..3>         Enable inter refinement for encode that uses analysis-load.\n"
73
         "                                    - 0 : Forces both mode and depth from the save encode.\n"
74
@@ -488,6 +504,7 @@
75
         "                                    - 2 : Functionality of (1) + irrespective of size restrict the modes evaluated when specific modes are decided as the best mode by the save encode.\n"
76
         "                                    - 3 : Functionality of (1) + irrespective of size evaluate all inter modes.\n"
77
         "                                Default:%d\n", param->interRefine);
78
+    H0("   --[no-]dynamic-refine         Dynamically changes refine-inter level for each CU. Default %s\n", OPT(param->bDynamicRefine));
79
     H0("   --[no-]refine-mv              Enable mv refinement for load mode. Default %s\n", OPT(param->mvRefine));
80
     H0("   --aq-mode <integer>           Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance 3:auto variance with bias to dark scenes. Default %d\n", param->rc.aqMode);
81
     H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
82
@@ -515,6 +532,8 @@
83
     H1("                                 MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n");
84
     H1("                                 Blank lines and lines starting with hash(#) are ignored\n");
85
     H1("                                 Comma is considered to be white-space\n");
86
+    H0("   --max-ausize-factor <float>   This value controls the maximum AU size defined in specification.\n");
87
+    H0("                                 It represents the percentage of maximum AU size used. Default %.1f\n", param->maxAUSizeFactor);
88
     H0("\nLoop filters (deblock and SAO):\n");
89
     H0("   --[no-]deblock                Enable Deblocking Loop Filter, optionally specify tC:Beta offsets Default %s\n", OPT(param->bEnableLoopFilter));
90
     H0("   --[no-]sao                    Enable Sample Adaptive Offset. Default %s\n", OPT(param->bEnableSAO));
91
@@ -548,9 +567,12 @@
92
     H0("   --[no-]repeat-headers         Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
93
     H0("   --[no-]info                   Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));
94
     H0("   --[no-]hrd                    Enable HRD parameters signaling. Default %s\n", OPT(param->bEmitHRDSEI));
95
+    H0("   --[no-]idr-recovery-sei      Emit recovery point infor SEI at each IDR frame \n");
96
     H0("   --[no-]temporal-layers        Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
97
     H0("   --[no-]aud                    Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
98
     H1("   --hash <integer>              Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
99
+    H0("   --atc-sei <integer>           Emit the alternative transfer characteristics SEI message where the integer is the preferred transfer characteristics. Default disabled\n");
100
+    H0("   --pic-struct <integer>        Set the picture structure and emits it in the picture timing SEI message. Values in the range 0..12. See D.3.3 of the HEVC spec. for a detailed explanation.\n");
101
     H0("   --log2-max-poc-lsb <integer>  Maximum of the picture order count\n");
102
     H0("   --[no-]vui-timing-info        Emit VUI timing information in the bistream. Default %s\n", OPT(param->bEmitVUITimingInfo));
103
     H0("   --[no-]vui-hrd-info           Emit VUI HRD information in the bistream. Default %s\n", OPT(param->bEmitVUIHRDInfo));
104
Refresh

No build results available

Refresh

No rpmlint results available

Request History
Luigi Baldoni's avatar

Aloysius created request over 6 years ago


enzokiel's avatar

enzokiel accepted request over 6 years ago