Changes of Revision 12

x265.changes Changed
 
@@ -1,4 +1,30 @@
 -------------------------------------------------------------------
+Wed Feb  3 13:22:42 UTC 2016 - idonmez@suse.com
+
+- Update to version 1.9
+  API Changes:
+  * x265_frame_stats returns many additional fields: maxCLL, maxFALL,
+    residual energy, scenecut and latency logging
+  * --qpfile now supports frametype 'K'
+  * x265 now allows CRF ratecontrol in pass N (N greater than or equal to 2)
+  * Chroma subsampling format YUV 4:0:0 is now fully supported and tested
+  New Features:
+  * Quant offsets: This feature allows block-level quantization offsets
+    to be specified for every frame. An API-only feature.
+  * --intra-refresh: Keyframes can be replaced by a moving column
+    of intra blocks in non-keyframes.
+  * --limit-modes: Intelligently restricts mode analysis.
+  * --max-luma and --min-luma for luma clipping, optional for HDR use-cases
+  * Emergency denoising is now enabled by default in very low bitrate
+    VBV encodes
+  Presets and Performance:
+  * Recently added features lookahead-slices, limit-modes, limit-refs
+    have been enabled by default for applicable presets.
+  * The default psy-rd strength has been increased to 2.0
+  * Multi-socket machines now use a single pool of threads that can
+    work cross-socket.
+
+-------------------------------------------------------------------
 Fri Nov 27 18:21:04 UTC 2015 - aloisio@gmx.com
 
 - Update to version 1.8:
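
The quant offsets feature in the 1.9 changelog is API-only, so it has no CLI
switch. A minimal sketch of how a client could feed per-CTU offsets through
the public API, assuming the x265_picture::quantOffsets field documented in
the 1.9 x265.h (one float offset per CTU in raster order); the helper name
and the chosen offset value are illustrative only:

    #include <cstdlib>
    #include <x265.h>

    // Hypothetical helper: attach one QP offset per CTU to a picture
    // before it is passed to x265_encoder_encode().
    static void attachQuantOffsets(const x265_param* param, x265_picture* pic)
    {
        int ctu = (int)param->maxCUSize;  // CTU edge length, e.g. 64
        int numCtus = ((param->sourceWidth + ctu - 1) / ctu) *
                      ((param->sourceHeight + ctu - 1) / ctu);
        float* offsets = (float*)std::calloc(numCtus, sizeof(float));
        offsets[0] = -2.0f;           /* spend more bits on the first CTU */
        pic->quantOffsets = offsets;  /* the caller owns and frees this array */
    }
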
x265.spec Changed
 
@@ -1,10 +1,10 @@
 # based on the spec file from https://build.opensuse.org/package/view_file/home:Simmphonie/libx265/
 
 Name:           x265
-%define soname  68
+%define soname  79
 %define libname lib%{name}
 %define libsoname %{libname}-%{soname}
-Version:        1.8
+Version:        1.9
 Release:        0
 License:        GPL-2.0+
 Summary:        A free h265/HEVC encoder - encoder binary
@@ -43,35 +43,34 @@
 streams.
 
 %prep
-%setup -q -n "%{name}_11047/build/linux"
-cd ../..
+%setup -q -n x265_%{version}
 %patch0 -p1
-cd -
+
 %define FAKE_BUILDDATE %(LC_ALL=C date -u -r %{_sourcedir}/%{name}.changes '+%%b %%e %%Y')
-sed -i -e "s/0.0/%{soname}.0/g" ../../source/cmake/version.cmake
+sed -i -e "s/0.0/%{soname}.0/g" source/cmake/version.cmake
 
 
 %build
-export CXXFLAGS="%optflags"
-export CFLAGS="%optflags"
-cmake  -DCMAKE_INSTALL_PREFIX=/usr -DENABLE_TESTS=ON -G "Unix Makefiles" ../../source
-cmake -DCMAKE_INSTALL_PREFIX=/usr ../../source
-#./make-Makefiles.bash
+export CXXFLAGS="%{optflags}"
+export CFLAGS="%{optflags}"
+
+cd build/linux
+cmake  -DCMAKE_INSTALL_PREFIX=%{_prefix} \
+       -DLIB_INSTALL_DIR=%{_lib} \
+       -DENABLE_TESTS=ON \
+       -G "Unix Makefiles" \
+       ../../source
+
 make %{?_smp_mflags} VERBOSE=1
 
 %install
+cd build/linux
 %makeinstall
-%ifarch x86_64
-  mv "%{buildroot}/usr/lib" "%{buildroot}%{_libdir}"
-%endif
 
 rm -f %{buildroot}%{_libdir}/%{libname}.a
 
 echo "%{libname}-%{soname}" > %{_sourcedir}/baselibs.conf
 
-%clean
-%{?buildroot:%__rm -rf "%{buildroot}"}
-
 %post -n %{libsoname} -p /sbin/ldconfig
 %postun -n %{libsoname} -p /sbin/ldconfig
 
x265_1.8.tar.gz/.hg_archival.txt -> x265_1.9.tar.gz/.hg_archival.txt Changed
 
@@ -1,5 +1,4 @@
 repo: 09fe40627f03a0f9c3e6ac78b22ac93da23f9fdf
-node: 5dcc9d3a928c400b41a3547d7bfee10340519e56
+node: 1d3b6e448e01ec40b392ef78b7e55a86249fbe68
 branch: stable
-latesttag: 1.8
-latesttagdistance: 1
+tag: 1.9
x265_1.8.tar.gz/doc/reST/cli.rst -> x265_1.9.tar.gz/doc/reST/cli.rst Changed
 
@@ -84,8 +84,8 @@
    it adds one line per run. If :option:`--csv-log-level` is greater than
    0, it writes one line per frame. Default none
 
-   When frame level logging is enabled, several frame performance
-   statistics are listed:
+   Several frame performance statistics are available when
+   :option:`--csv-log-level` is greater than or equal to 2:
 
    **DecideWait ms** number of milliseconds the frame encoder had to
    wait, since the previous frame was retrieved by the API thread,
@@ -202,15 +202,29 @@
    "-"       - same as "none"
    "10"      - allocate one pool, using up to 10 cores on node 0
    "-,+"     - allocate one pool, using all cores on node 1
-   "+,-,+"   - allocate two pools, using all cores on nodes 0 and 2
-   "+,-,+,-" - allocate two pools, using all cores on nodes 0 and 2
-   "-,*"     - allocate three pools, using all cores on nodes 1, 2 and 3
+   "+,-,+"   - allocate one pool, using only cores on nodes 0 and 2
+   "+,-,+,-" - allocate one pool, using only cores on nodes 0 and 2
+   "-,*"     - allocate one pool, using all cores on nodes 1, 2 and 3
    "8,8,8,8" - allocate four pools with up to 8 threads in each pool
-
-   The total number of threads will be determined by the number of threads
-   assigned to all nodes. The worker threads will each be given affinity for
-   their node, they will not be allowed to migrate between nodes, but they
-   will be allowed to move between CPU cores within their node.
+   "8,+,+,+" - allocate two pools, the first with 8 threads on node 0, and the second with all cores on nodes 1, 2 and 3
+
+   A thread pool dedicated to a given NUMA node is enabled only when the
+   number of threads to be created on that NUMA node is explicitly mentioned
+   in that corresponding position with the --pools option. Else, all threads
+   are spawned from a single pool. The total number of threads will be
+   determined by the number of threads assigned to the enabled NUMA nodes for
+   that pool. The worker threads are given affinity to all the enabled
+   NUMA nodes for that pool and may migrate between them, unless explicitly
+   specified as described above.
+
+   In the case that any thread pool has more than 64 threads, the thread pool
+   may be broken down into multiple pools of 64 threads each; on 32-bit
+   machines, this number is 32. All pools are given affinity to the NUMA
+   nodes on which the original pool had affinity. For performance reasons,
+   the last thread pool is spawned only if it has more than 32 threads for
+   64-bit machines, or 16 for 32-bit machines. If the total number of threads
+   in the system doesn't obey this constraint, we may spawn fewer threads
+   than cores, which has been empirically shown to be better for performance.
 
    If the four pool features: :option:`--wpp`, :option:`--pmode`,
    :option:`--pme` and :option:`--lookahead-slices` are all disabled,
@@ -219,10 +233,6 @@
    If "none" is specified, then all four of the thread pool features are
    implicitly disabled.
 
-   Multiple thread pools will be allocated for any NUMA node with more than
-   64 logical CPU cores. But any given thread pool will always use at most
-   one NUMA node.
-
    Frame encoders are distributed between the available thread pools,
    and the encoder will never generate more thread pools than
    :option:`--frame-threads`.  The pools are used for WPP and for
@@ -238,8 +248,12 @@
    system, a POSIX build of libx265 without libnuma will be less work
    efficient. See :ref:`thread pools <pools>` for more detail.
 
-   Default "", one thread is allocated per detected hardware thread
-   (logical CPU cores) and one thread pool per NUMA node.
+   Default "", one pool is created across all available NUMA nodes, with
+   one thread allocated per detected hardware thread
+   (logical CPU cores). In the case that the total number of threads is more
+   than the maximum size that ATOMIC operations can handle (32 for 32-bit
+   compiles, and 64 for 64-bit compiles), multiple thread pools may be
+   spawned subject to the performance constraint described above.
 
    Note that the string value will need to be escaped or quoted to
    protect against shell expansion on many platforms
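
For API users, the pool strings documented above can also be set
programmatically; a short sketch using the public x265_param_parse() call
(behaviour as documented here, error handling abbreviated):

    #include <x265.h>

    // Request one 8-thread pool on node 0 plus a second pool spanning
    // nodes 1-3, matching the "8,+,+,+" example in the list above.
    static x265_param* poolsExample(void)
    {
        x265_param* p = x265_param_alloc();
        x265_param_default(p);
        if (x265_param_parse(p, "pools", "8,+,+,+")) /* non-zero = bad value */
        {
            x265_param_free(p);
            return NULL;
        }
        return p;
    }
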
 
@@ -353,7 +367,7 @@
 
    **CLI ONLY**
 
-.. option:: --total-frames <integer>
+.. option:: --frames <integer>
 
    The number of frames intended to be encoded.  It may be left
    unspecified, but when it is specified rate control can make use of
@@ -377,15 +391,15 @@
 
 .. option:: --input-csp <integer|string>
 
-   YUV only: Source color space. Only i420, i422, and i444 are
-   supported at this time. The internal color space is always the
-   same as the source color space (libx265 does not support any color
-   space conversions).
+   Chroma subsampling (YUV only): Only 4:0:0 (monochrome), 4:2:0, 4:2:2, and 4:4:4 are supported at this time.
+   The chroma subsampling format of your input must match your desired output chroma subsampling format
+   (libx265 will not perform any chroma subsampling conversion), and it must be supported by the
+   HEVC profile you have specified.
 
-   0. i400
-   1. i420 **(default)**
-   2. i422
-   3. i444
+   0. i400 (4:0:0 monochrome) - Not supported by Main or Main10 profiles
+   1. i420 (4:2:0 default)    - Supported by all HEVC profiles
+   2. i422 (4:2:2)            - Not supported by Main, Main10 and Main12 profiles
+   3. i444 (4:4:4)            - Supported by Main 4:4:4, Main 4:4:4 10, Main 4:4:4 12, Main 4:4:4 16 Intra profiles
    4. nv12
    5. nv16
 
@@ -436,8 +450,8 @@
    depth of the encoder. If the requested bit depth is not the bit
    depth of the linked libx265, it will attempt to bind libx265_main
    for an 8bit encoder, libx265_main10 for a 10bit encoder, or
-   libx265_main12 for a 12bit encoder (EXPERIMENTAL), with the
-   same API version as the linked libx265.
+   libx265_main12 for a 12bit encoder, with the same API version as the
+   linked libx265.
 
    If the output depth is not specified but :option:`--profile` is
    specified, the output depth will be derived from the profile name.
@@ -486,13 +500,6 @@
    The CLI application will derive the output bit depth from the
    profile name if :option:`--output-depth` is not specified.
 
-.. note::
-
-   All 12bit presets are extremely unstable, do not use them yet.
-   16bit is not supported at all, but those profiles are included
-   because it is possible for libx265 to make bitstreams compatible
-   with them.
-
 .. option:: --level-idc <integer|float>
 
    Minimum decoder requirement level. Defaults to 0, which implies
@@ -606,7 +613,8 @@
    +-------+---------------------------------------------------------------+
    | Level | Description                                                   |
    +=======+===============================================================+
-   | 0     | sa8d mode and split decisions, intra w/ source pixels         |
+   | 0     | sa8d mode and split decisions, intra w/ source pixels,        |
+   |       | currently not supported                                       |
    +-------+---------------------------------------------------------------+
    | 1     | recon generated (better intra), RDO merge/skip selection      |
    +-------+---------------------------------------------------------------+
@@ -677,7 +685,16 @@
    (within your decoder level limits) if you enable one or
    both of these flags.
 
-   This feature is EXPERIMENTAL and functional at all RD levels.
+   Default 3.
+
+.. option:: --limit-modes, --no-limit-modes
+
+   When enabled, limit-modes will limit the modes analyzed for each CU using cost
+   metrics from the 4 sub-CUs. When multiple inter modes like :option:`--rect`
+   and/or :option:`--amp` are enabled, this feature will use motion cost
+   heuristics from the 4 sub-CUs to bypass modes that are unlikely to be the
+   best choice. This can significantly improve performance when :option:`--rect`
+   and/or :option:`--amp` are enabled, at minimal compression efficiency loss.
 
 .. option:: --rect, --no-rect
 
@@ -1049,9 +1066,9 @@
    energy of the source image in the encoded image at the expense of
    compression efficiency. It only has effect on presets which use
    RDO-based mode decisions (:option:`--rd` 3 and above). 1.0 is a
-   typical value. Default 0.3
+   typical value. Default 2.0
 
-   **Range of values:** 0 .. 2.0
+   **Range of values:** 0 .. 5.0
 
 .. option:: --psy-rdoq <float>
 
@@ -1076,7 +1093,8 @@
 
    Max intra period in frames. A special case of infinite-gop (single
    keyframe at the beginning of the stream) can be triggered with
-   argument -1. Use 1 to force all-intra. Default 250
+   argument -1. Use 1 to force all-intra. When intra-refresh is enabled,
+   it specifies the interval between refresh sweeps. Default 250
 
 .. option:: --min-keyint, -i <integer>
 
@@ -1095,6 +1113,14 @@
    :option:`--scenecut` 0 or :option:`--no-scenecut` disables adaptive
    I frame placement. Default 40
 
+.. option:: --intra-refresh
+
+   Enables Periodic Intra Refresh (PIR) instead of keyframe insertion.
+   PIR can replace keyframes by inserting a column of intra blocks in
+   non-keyframes that moves across the video from one side to the other,
+   refreshing the image over a period of multiple frames instead of
+   with a single keyframe.
+
 .. option:: --rc-lookahead <integer>
 
    Number of frames for slice-type decision lookahead (a key
@@ -1108,21 +1134,31 @@
 
 .. option:: --lookahead-slices <0..16>
 
-   Use multiple worker threads to measure the estimated cost of each
-   frame within the lookahead. When :option:`--b-adapt` is 2, most
-   frame cost estimates will be performed in batch mode, many cost
-   estimates at the same time, and lookahead-slices is ignored for
-   batched estimates. The effect on performance can be quite small.
-   The higher this parameter, the less accurate the frame costs will be
-   (since context is lost across slice boundaries) which will result in
-   less accurate B-frame and scene-cut decisions.
-
-   The encoder may internally lower the number of slices to ensure
-   each slice codes at least 10 16x16 rows of lowres blocks. If slices
-   are used in lookahead, they are logged in the list of tools as
-   *lslices*.
-
-   **Values:** 0 - disabled (default). 1 is the same as 0. Max 16
+   Use multiple worker threads to measure the estimated cost of each frame
+   within the lookahead. The frame is divided into the specified number of
+   slices, and one thread is launched per slice. When :option:`--b-adapt` is
+   2, most frame cost estimates will be performed in batch mode (many cost
+   estimates at the same time) and lookahead-slices is ignored for batched
+   estimates; it may still be used for single cost estimations. The higher this
+   parameter, the less accurate the frame costs will be (since context is lost
+   across slice boundaries), which will result in less accurate B-frame and
+   scene-cut decisions. The effect on performance can be significant, especially
+   on systems with many threads.
+
+   The encoder may internally lower the number of slices or disable
+   slicing to ensure each slice codes at least 10 16x16 rows of lowres
+   blocks, to minimize the impact on quality. For example, for 720p and
+   1080p videos, the number of slices is capped to 4 and 6, respectively.
+   For resolutions less than 720p, slicing is auto-disabled.
+
+   If slices are used in lookahead, they are logged in the list of tools
+   as *lslices*.
+
+   **Values:** 0 - disabled. 1 is the same as 0. Max 16.
+   Default: 8 for ultrafast, superfast, faster, fast, medium;
+            4 for slow, slower;
+            disabled for veryslow, placebo.
+
 
 .. option:: --b-adapt <integer>
 
@@ -1198,6 +1234,13 @@
    is also non-zero. Both vbv-bufsize and vbv-maxrate are required to
    enable VBV in CRF mode. Default 0 (disabled)
 
+   Note that when VBV is enabled (with a valid :option:`--vbv-bufsize`),
+   VBV emergency denoising is turned on. This applies aggressive
+   denoising at the frame level when frame QP > QP_MAX_SPEC (51), drastically
+   reducing bitrate and allowing ratecontrol to assign lower QPs for
+   the following frames. The visual effect is blurring, but it removes
+   significant blocking/displacement artifacts.
+
 .. option:: --vbv-init <float>
 
    Initial buffer occupancy. The portion of the decode buffer which
@@ -1405,10 +1448,11 @@
 
    framenumber frametype QP
 
-   Frametype can be one of [I,i,P,B,b]. **B** is a referenced B frame,
+   Frametype can be one of [I,i,K,P,B,b]. **B** is a referenced B frame,
    **b** is an unreferenced B frame.  **I** is a keyframe (random
-   access point) while **i** is a I frame that is not a keyframe
-   (references are not broken).
+   access point) while **i** is an I frame that is not a keyframe
+   (references are not broken). **K** implies **I** if the closed_gop option
+   is enabled, and **i** otherwise.
 
    Specifying QP (integer) is optional, and if specified it is
    clamped within the encoder to qpmin/qpmax.
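
For illustration, a qpfile exercising the new **K** type could look like
this (frame numbers and QPs are arbitrary; each line is the
framenumber/frametype/QP triplet described above, with QP optional):

    0 K
    30 B 25
    31 b
    60 i 30
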
 
@@ -1551,7 +1595,7 @@
 
 .. option:: --colorprim <integer|string>
 
-   Specify color primitive to use when converting to RGB. Default
+   Specify color primaries to use when converting to RGB. Default
    undefined (not signaled)
 
    1. bt709
@@ -1621,7 +1665,7 @@
 
    Example for D65P3 1000-nits:
 
-       G(13200,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(10000000,1)
+       G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(10000000,1)
 
    Note that this string value will need to be escaped or quoted to
    protect against shell expansion on many platforms. No default.
@@ -1640,6 +1684,16 @@
    Note that this string value will need to be escaped or quoted to
    protect against shell expansion on many platforms. No default.
 
+.. option:: --min-luma <integer>
+
+   Minimum luma value allowed for input pictures. Any values below min-luma
+   are clipped. Experimental. No default.
+
+.. option:: --max-luma <integer>
+
+   Maximum luma value allowed for input pictures. Any values above max-luma
+   are clipped. Experimental. No default.
+
 Bitstream options
 =================
 
x265_1.8.tar.gz/doc/reST/presets.rst -> x265_1.9.tar.gz/doc/reST/presets.rst Changed
 
@@ -6,76 +6,83 @@
 Presets
 =======
 
-x265 has a number of predefined :option:`--preset` options that make
-trade-offs between encode speed (encoded frames per second) and
+x265 has ten predefined :option:`--preset` options that optimize the
+trade-off between encoding speed (encoded frames per second) and
 compression efficiency (quality per bit in the bitstream).  The default
-preset is medium, it does a reasonably good job of finding the best
-possible quality without spending enormous CPU cycles looking for the
-absolute most efficient way to achieve that quality.  As you go higher
-than medium, the encoder takes shortcuts to improve performance at the
-expense of quality and compression efficiency.  As you go lower than
-medium, the encoder tries harder and harder to achieve the best quailty
-per bit compression ratio.
-
-The presets adjust encoder parameters to affect these trade-offs.
-
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-|              | ultrafast | superfast | veryfast | faster | fast | medium | slow | slower | veryslow | placebo |
-+==============+===========+===========+==========+========+======+========+======+========+==========+=========+
-| ctu          |   32      |    32     |   32     |  64    |  64  |   64   |  64  |  64    |   64     |   64    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| min-cu-size  |   16      |     8     |    8     |   8    |   8  |    8   |   8  |   8    |    8     |    8    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| bframes      |    3      |     3     |    4     |   4    |  4   |    4   |  4   |   8    |    8     |    8    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| b-adapt      |    0      |     0     |    0     |   0    |  0   |    2   |  2   |   2    |    2     |    2    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| rc-lookahead |    5      |    10     |   15     |  15    |  15  |   20   |  25  |   30   |   40     |   60    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| scenecut     |    0      |    40     |   40     |  40    |  40  |   40   |  40  |   40   |   40     |   40    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| refs         |    1      |     1     |    1     |   1    |  2   |    3   |  3   |   3    |    5     |    5    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| me           |   dia     |   hex     |   hex    |  hex   | hex  |   hex  | star |  star  |   star   |   star  |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| merange      |   57      |    57     |   57     |  57    |  57  |   57   | 57   |  57    |   57     |   92    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| subme        |    0      |     1     |    1     |   2    |  2   |    2   |  3   |   3    |    4     |    5    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| rect         |    0      |     0     |    0     |   0    |  0   |    0   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| amp          |    0      |     0     |    0     |   0    |  0   |    0   |  0   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| max-merge    |    2      |     2     |    2     |   2    |  2   |    2   |  3   |   3    |    4     |    5    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| early-skip   |    1      |     1     |    1     |   1    |  0   |    0   |  0   |   0    |    0     |    0    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| fast-intra   |    1      |     1     |    1     |   1    |  1   |    0   |  0   |   0    |    0     |    0    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| b-intra      |    0      |     0     |    0     |   0    |  0   |    0   |  0   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| sao          |    0      |     0     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| signhide     |    0      |     1     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| weightp      |    0      |     0     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| weightb      |    0      |     0     |    0     |   0    |  0   |    0   |  0   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| aq-mode      |    0      |     0     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| cuTree       |    0      |     0     |    0     |   0    |  1   |    1   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| rdLevel      |    2      |     2     |    2     |   2    |  2   |    3   |  4   |   6    |    6     |    6    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| rdoq-level   |    0      |     0     |    0     |   0    |  0   |    0   |  2   |   2    |    2     |    2    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| tu-intra     |    1      |     1     |    1     |   1    |  1   |    1   |  1   |   2    |    3     |    4    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| tu-inter     |    1      |     1     |    1     |   1    |  1   |    1   |  1   |   2    |    3     |    4    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-
-Placebo mode enables transform-skip prediction evaluation.
+preset is medium.  It does a reasonably good job of finding the best
+possible quality without spending excessive CPU cycles looking for the
+absolute most efficient way to achieve that quality.  When you use
+faster presets, the encoder takes shortcuts to improve performance at
+the expense of quality and compression efficiency.  When you use slower
+presets, x265 tests more encoding options, using more computations to
+achieve the best quality at your selected bit rate (or in the case of
+--crf rate control, the lowest bit rate at the selected quality).
+
+The presets adjust encoder parameters as shown in the following table.
+Any of the parameters below that you specify on your command line will
+override the value set by the preset.
+
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+|                 |ultrafast |superfast |veryfast |faster |fast |medium |slow |slower |veryslow |placebo |
++=================+==========+==========+=========+=======+=====+=======+=====+=======+=========+========+
+| ctu             |    32    |    32    |   64    |  64   | 64  |  64   | 64  |  64   |   64    |  64    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| min-cu-size     |    16    |     8    |    8    |   8   |  8  |   8   |  8  |   8   |    8    |   8    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| bframes         |     3    |     3    |    4    |   4   |  4  |   4   |  4  |   8   |    8    |   8    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| b-adapt         |     0    |     0    |    0    |   0   |  0  |   2   |  2  |   2   |    2    |   2    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| rc-lookahead    |     5    |    10    |   15    |  15   | 15  |  20   | 25  |  30   |   40    |  60    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| lookahead-slices|     8    |     8    |    8    |   8   |  8  |   8   |  4  |   4   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| scenecut        |     0    |    40    |   40    |  40   | 40  |  40   | 40  |  40   |   40    |  40    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| ref             |     1    |     1    |    2    |   2   |  3  |   3   |  4  |   4   |    5    |   5    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| limit-refs      |     0    |     0    |    3    |   3   |  3  |   3   |  3  |   2   |    1    |   0    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| me              |    dia   |   hex    |   hex   |  hex  |hex  |  hex  |star | star  |   star  |  star  |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| merange         |    57    |    57    |   57    |  57   | 57  |  57   | 57  |  57   |   57    |  92    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| subme           |     0    |     1    |    1    |   2   |  2  |   2   |  3  |   3   |    4    |   5    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| rect            |     0    |     0    |    0    |   0   |  0  |   0   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| amp             |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| limit-modes     |     0    |     0    |    0    |   0   |  0  |   0   |  1  |   1   |    1    |   0    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| max-merge       |     2    |     2    |    2    |   2   |  2  |   2   |  3  |   3   |    4    |   5    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| early-skip      |     1    |     1    |    1    |   1   |  0  |   0   |  0  |   0   |    0    |   0    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| fast-intra      |     1    |     1    |    1    |   1   |  1  |   0   |  0  |   0   |    0    |   0    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| b-intra         |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| sao             |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| signhide        |     0    |     1    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| weightp         |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| weightb         |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| aq-mode         |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| cuTree          |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| rdLevel         |     2    |     2    |    2    |   2   |  2  |   3   |  4  |   6   |    6    |   6    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| rdoq-level      |     0    |     0    |    0    |   0   |  0  |   0   |  2  |   2   |    2    |   2    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| tu-intra        |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   2   |    3    |   4    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| tu-inter        |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   2   |    3    |   4    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
 
 .. _tunings:
 
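
For API users the same order applies: the preset is loaded first, then any
explicit setting overrides it. A short sketch using the public
x265_param_default_preset() entry point (the tune argument may be NULL):

    #include <x265.h>

    // Load the medium preset, then override a single parameter,
    // mirroring "x265 --preset medium --bframes 8" on the command line.
    static x265_param* mediumWithMoreBframes(void)
    {
        x265_param* p = x265_param_alloc();
        if (x265_param_default_preset(p, "medium", NULL))
        {
            x265_param_free(p);  /* unknown preset or tune name */
            return NULL;
        }
        x265_param_parse(p, "bframes", "8");  /* explicit setting wins */
        return p;
    }
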
x265_1.8.tar.gz/source/CMakeLists.txt -> x265_1.9.tar.gz/source/CMakeLists.txt Changed
 
@@ -30,7 +30,7 @@
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 68)
+set(X265_BUILD 79)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -45,12 +45,14 @@
 set(POWER_ALIASES ppc64 ppc64le)
 list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
 if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
-    message(STATUS "Detected x86 target processor")
     set(X86 1)
     add_definitions(-DX265_ARCH_X86=1)
     if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
         set(X64 1)
         add_definitions(-DX86_64=1)
+        message(STATUS "Detected x86_64 target processor")
+    else()
+        message(STATUS "Detected x86 target processor")
     endif()
 elseif(POWERMATCH GREATER "-1")
     message(STATUS "Detected POWER target processor")
@@ -71,23 +73,27 @@
     if(LIBRT)
         list(APPEND PLATFORM_LIBS rt)
     endif()
+    mark_as_advanced(LIBRT)
     find_library(LIBDL dl)
     if(LIBDL)
         list(APPEND PLATFORM_LIBS dl)
     endif()
-    find_package(Numa)
-    if(NUMA_FOUND)
-        link_directories(${NUMA_LIBRARY_DIR})
-        list(APPEND CMAKE_REQUIRED_LIBRARIES numa)
-        check_symbol_exists(numa_node_of_cpu numa.h NUMA_V2)
-        if(NUMA_V2)
-            add_definitions(-DHAVE_LIBNUMA)
-            message(STATUS "libnuma found, building with support for NUMA nodes")
-            list(APPEND PLATFORM_LIBS numa)
-            include_directories(${NUMA_INCLUDE_DIR})
+    option(ENABLE_LIBNUMA "Enable libnuma usage (Linux only)" ON)
+    if(ENABLE_LIBNUMA)
+        find_package(Numa)
+        if(NUMA_FOUND)
+            link_directories(${NUMA_LIBRARY_DIR})
+            list(APPEND CMAKE_REQUIRED_LIBRARIES numa)
+            check_symbol_exists(numa_node_of_cpu numa.h NUMA_V2)
+            if(NUMA_V2)
+                add_definitions(-DHAVE_LIBNUMA)
+                message(STATUS "libnuma found, building with support for NUMA nodes")
+                list(APPEND PLATFORM_LIBS numa)
+                include_directories(${NUMA_INCLUDE_DIR})
+            endif()
         endif()
-    endif()
-    mark_as_advanced(LIBRT NUMA_FOUND)
+        mark_as_advanced(NUMA_FOUND)
+    endif(ENABLE_LIBNUMA)
     option(NO_ATOMICS "Use a slow mutex to replace atomics" OFF)
     if(NO_ATOMICS)
        add_definitions(-DNO_ATOMICS=1)
@@ -157,6 +163,7 @@
 if(GCC)
     add_definitions(-Wall -Wextra -Wshadow)
     add_definitions(-D__STDC_LIMIT_MACROS=1)
+    add_definitions(-std=gnu++98)
     if(ENABLE_PIC)
          add_definitions(-fPIC)
     endif(ENABLE_PIC)
@@ -379,16 +386,19 @@
 
 option(ENABLE_VTUNE "Enable Vtune profiling instrumentation" OFF)
 if(ENABLE_VTUNE)
-    add_definitions(-DENABLE_VTUNE)
-    include_directories($ENV{VTUNE_AMPLIFIER_XE_2015_DIR}/include)
-    list(APPEND PLATFORM_LIBS vtune)
-    link_directories($ENV{VTUNE_AMPLIFIER_XE_2015_DIR}/lib64)
-    if(WIN32)
-        list(APPEND PLATFORM_LIBS libittnotify.lib)
-    else()
-        list(APPEND PLATFORM_LIBS libittnotify.a dl)
-    endif()
-    add_subdirectory(profile/vtune)
+    find_package(Vtune)
+    if(VTUNE_FOUND)
+        add_definitions(-DENABLE_VTUNE)
+        include_directories(${VTUNE_INCLUDE_DIR})
+        list(APPEND PLATFORM_LIBS vtune)
+        link_directories(${VTUNE_LIBRARY_DIR})
+        if(WIN32)
+            list(APPEND PLATFORM_LIBS libittnotify.lib)
+        else()
+            list(APPEND PLATFORM_LIBS libittnotify.a dl)
+        endif()
+        add_subdirectory(profile/vtune)
+    endif(VTUNE_FOUND)
 endif(ENABLE_VTUNE)
 
 option(DETAILED_CU_STATS "Enable internal profiling of encoder work" OFF)
@@ -455,6 +465,9 @@
 if(ENABLE_SHARED)
     add_library(x265-shared SHARED "${PROJECT_BINARY_DIR}/x265.def" ${YASM_OBJS}
                 ${X265_RC_FILE} $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common>)
+    if(EXTRA_LIB)
+        target_link_libraries(x265-shared ${EXTRA_LIB})
+    endif()
     target_link_libraries(x265-shared ${PLATFORM_LIBS})
     if(MSVC)
         set_target_properties(x265-shared PROPERTIES OUTPUT_NAME libx265)
@@ -465,6 +478,8 @@
         set_target_properties(x265-shared PROPERTIES VERSION ${X265_BUILD})
         if(APPLE)
             set_target_properties(x265-shared PROPERTIES MACOSX_RPATH 1)
+        elseif(CYGWIN)
+            # Cygwin is not officially supported or tested. MinGW with msys is recommended.
         else()
             list(APPEND LINKER_OPTIONS "-Wl,-Bsymbolic,-znoexecstack")
         endif()
@@ -480,9 +495,6 @@
                 ARCHIVE DESTINATION ${LIB_INSTALL_DIR}
                 RUNTIME DESTINATION ${BIN_INSTALL_DIR})
     endif()
-    if(EXTRA_LIB)
-        target_link_libraries(x265-shared ${EXTRA_LIB})
-    endif()
     if(LINKER_OPTIONS)
         # set_target_properties can't do list expansion
         string(REPLACE ";" " " LINKER_OPTION_STR "${LINKER_OPTIONS}")
x265_1.9.tar.gz/source/cmake/FindVtune.cmake Added
 
@@ -0,0 +1,25 @@
+# Module for locating Vtune
+#
+# Read-only variables
+#   VTUNE_FOUND: Indicates that the library has been found
+#   VTUNE_INCLUDE_DIR: Points to the vtunes include dir
+#   VTUNE_LIBRARY_DIR: Points to the directory with libraries
+#
+# Copyright (c) 2015 Pradeep Ramachandran
+
+include(FindPackageHandleStandardArgs)
+
+find_path(VTUNE_DIR
+    if(UNIX)
+        NAMES amplxe-vars.sh
+    else()
+        NAMES amplxe-vars.bat
+    endif(UNIX)
+    HINTS $ENV{VTUNE_AMPLIFIER_XE_2016_DIR} $ENV{VTUNE_AMPLIFIER_XE_2015_DIR}
+    DOC "Vtune root directory")
+
+set (VTUNE_INCLUDE_DIR ${VTUNE_DIR}/include)
+set (VTUNE_LIBRARY_DIR ${VTUNE_DIR}/lib64)
+
+mark_as_advanced(VTUNE_DIR)
+find_package_handle_standard_args(VTUNE REQUIRED_VARS VTUNE_DIR VTUNE_INCLUDE_DIR VTUNE_LIBRARY_DIR)
x265_1.8.tar.gz/source/common/bitstream.cpp -> x265_1.9.tar.gz/source/common/bitstream.cpp Changed
 
@@ -1,5 +1,6 @@
 #include "common.h"
 #include "bitstream.h"
+#include "threading.h"
 
 using namespace X265_NS;
 
@@ -112,16 +113,13 @@
 
 void SyntaxElementWriter::writeUvlc(uint32_t code)
 {
-    uint32_t length = 1;
-    uint32_t temp = ++code;
+    ++code;
 
-    X265_CHECK(temp, "writing -1 code, will cause infinite loop\n");
+    X265_CHECK(code, "writing -1 code, will cause infinite loop\n");
 
-    while (1 != temp)
-    {
-        temp >>= 1;
-        length += 2;
-    }
+    unsigned long idx;
+    CLZ(idx, code);
+    uint32_t length = (uint32_t)idx * 2 + 1;
 
     // Take care of cases where length > 32
     m_bitIf->write(0, length >> 1);
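
The rewrite above replaces the shift loop with a count-leading-zeros step:
for an Exp-Golomb codeword, the bit length is 2 * floor(log2(code)) + 1,
which is exactly what CLZ yields for code >= 1. A standalone check of that
equivalence (using the GCC/Clang __builtin_clz rather than x265's internal
CLZ macro):

    #include <cassert>
    #include <cstdint>

    // The loop from the removed code: shift code down to 1,
    // adding two bits per step.
    static uint32_t lengthLoop(uint32_t code)  // code = value + 1, nonzero
    {
        uint32_t length = 1;
        while (code != 1) { code >>= 1; length += 2; }
        return length;
    }

    // The closed form computed by the new code: 2 * floor(log2(code)) + 1.
    static uint32_t lengthClz(uint32_t code)
    {
        uint32_t idx = 31 - (uint32_t)__builtin_clz(code); // floor(log2(code))
        return idx * 2 + 1;
    }

    int main()
    {
        for (uint32_t v = 0; v < (1u << 20); v++)
            assert(lengthLoop(v + 1) == lengthClz(v + 1));
        return 0;
    }
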
x265_1.8.tar.gz/source/common/bitstream.h -> x265_1.9.tar.gz/source/common/bitstream.h Changed
 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Author: Steve Borho <steve@borho.org>
+ *         Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
x265_1.8.tar.gz/source/common/common.h -> x265_1.9.tar.gz/source/common/common.h Changed
 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -134,10 +135,10 @@
 typedef int32_t  ssum2_t; // Signed sum
 #endif // if HIGH_BIT_DEPTH
 
-#if X265_DEPTH <= 10
-typedef uint32_t sse_ret_t;
+#if X265_DEPTH < 10
+typedef uint32_t sse_t;
 #else
-typedef uint64_t sse_ret_t;
+typedef uint64_t sse_t;
 #endif
 
 #ifndef NULL
@@ -214,6 +215,7 @@
 
 #define X265_MALLOC(type, count)    (type*)x265_malloc(sizeof(type) * (count))
 #define X265_FREE(ptr)              x265_free(ptr)
+#define X265_FREE_ZERO(ptr)         x265_free(ptr); (ptr) = NULL
 #define CHECKED_MALLOC(var, type, count) \
     { \
         var = (type*)x265_malloc(sizeof(type) * (count)); \
@@ -317,6 +319,9 @@
 #define CHROMA_V_SHIFT(x) (x == X265_CSP_I420)
 #define X265_MAX_PRED_MODE_PER_CTU 85 * 2 * 8
 
+#define MAX_NUM_TR_COEFFS           MAX_TR_SIZE * MAX_TR_SIZE // Maximum number of transform coefficients, for a 32x32 transform
+#define MAX_NUM_TR_CATEGORIES       16                        // 32, 16, 8, 4 transform categories each for luma and chroma
+
 namespace X265_NS {
 
 enum { SAO_NUM_OFFSET = 4 };
@@ -366,25 +371,6 @@
         delete[] ctuParam[2];
     }
 };
-
-/* Stores inter analysis data for a single frame */
-struct analysis_inter_data
-{
-    int32_t*    ref;
-    uint8_t*    depth;
-    uint8_t*    modes;
-    uint32_t*   bestMergeCand;
-};
-
-/* Stores intra analysis data for a single frame. This struct needs better packing */
-struct analysis_intra_data
-{
-    uint8_t*  depth;
-    uint8_t*  modes;
-    char*     partSizes;
-    uint8_t*  chromaModes;
-};
-
 enum TextType
 {
     TEXT_LUMA     = 0,  // luma
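
One caution on the new X265_FREE_ZERO macro above: it expands to two
statements, so under an unbraced if only the call to x265_free() would be
conditional, while the pointer is nulled unconditionally. The usual guard is
the do { } while (0) idiom; a small standalone illustration, with plain
malloc/free standing in for x265_malloc/x265_free:

    #include <cstdlib>

    // Two-statement macro, shaped like the one added above:
    // unsafe under an unbraced if.
    #define FREE_ZERO_UNSAFE(ptr) std::free(ptr); (ptr) = NULL
    // Single-statement variant: safe in every statement context.
    #define FREE_ZERO_SAFE(ptr)   do { std::free(ptr); (ptr) = NULL; } while (0)

    int main()
    {
        void* p = std::malloc(16);
        bool done = false;
        if (done)
            FREE_ZERO_SAFE(p);  // with the unsafe form, "(p) = NULL" would run
                                // unconditionally here and leak the block
        std::free(p);           // p is still valid: done was false
        return 0;
    }
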
x265_1.8.tar.gz/source/common/constants.cpp -> x265_1.9.tar.gz/source/common/constants.cpp Changed
 
@@ -2,6 +2,7 @@
 * Copyright (C) 2015 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
x265_1.8.tar.gz/source/common/constants.h -> x265_1.9.tar.gz/source/common/constants.h Changed
 
@@ -2,6 +2,7 @@
  * Copyright (C) 2015 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
x265_1.8.tar.gz/source/common/contexts.h -> x265_1.9.tar.gz/source/common/contexts.h Changed
 
@@ -2,6 +2,7 @@
 * Copyright (C) 2015 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
x265_1.8.tar.gz/source/common/cudata.cpp -> x265_1.9.tar.gz/source/common/cudata.cpp Changed
 
@@ -2,6 +2,7 @@
  * Copyright (C) 2015 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -192,44 +193,82 @@
         break;
     }
 
-    /* Each CU's data is layed out sequentially within the charMemBlock */
-    uint8_t *charBuf = dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * instance;
-
-    m_qp        = (int8_t*)charBuf; charBuf += m_numPartitions;
-    m_log2CUSize         = charBuf; charBuf += m_numPartitions;
-    m_lumaIntraDir       = charBuf; charBuf += m_numPartitions;
-    m_tqBypass           = charBuf; charBuf += m_numPartitions;
-    m_refIdx[0] = (int8_t*)charBuf; charBuf += m_numPartitions;
-    m_refIdx[1] = (int8_t*)charBuf; charBuf += m_numPartitions;
-    m_cuDepth            = charBuf; charBuf += m_numPartitions;
-    m_predMode           = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
-    m_partSize           = charBuf; charBuf += m_numPartitions;
-    m_mergeFlag          = charBuf; charBuf += m_numPartitions;
-    m_interDir           = charBuf; charBuf += m_numPartitions;
-    m_mvpIdx[0]          = charBuf; charBuf += m_numPartitions;
-    m_mvpIdx[1]          = charBuf; charBuf += m_numPartitions;
-    m_tuDepth            = charBuf; charBuf += m_numPartitions;
-    m_transformSkip[0]   = charBuf; charBuf += m_numPartitions;
-    m_transformSkip[1]   = charBuf; charBuf += m_numPartitions;
-    m_transformSkip[2]   = charBuf; charBuf += m_numPartitions;
-    m_cbf[0]             = charBuf; charBuf += m_numPartitions;
-    m_cbf[1]             = charBuf; charBuf += m_numPartitions;
-    m_cbf[2]             = charBuf; charBuf += m_numPartitions;
-    m_chromaIntraDir     = charBuf; charBuf += m_numPartitions;
-
-    X265_CHECK(charBuf == dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * (instance + 1), "CU data layout is broken\n");
-
-    m_mv[0]  = dataPool.mvMemBlock + (instance * 4) * m_numPartitions;
-    m_mv[1]  = m_mv[0] +  m_numPartitions;
-    m_mvd[0] = m_mv[1] +  m_numPartitions;
-    m_mvd[1] = m_mvd[0] + m_numPartitions;
-
-    uint32_t cuSize = g_maxCUSize >> depth;
-    uint32_t sizeL = cuSize * cuSize;
-    uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
-    m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (sizeL + sizeC * 2);
-    m_trCoeff[1] = m_trCoeff[0] + sizeL;
-    m_trCoeff[2] = m_trCoeff[0] + sizeL + sizeC;
+    if (csp == X265_CSP_I400)
+    {
+        /* Each CU's data is layed out sequentially within the charMemBlock */
+        uint8_t *charBuf = dataPool.charMemBlock + (m_numPartitions * (BytesPerPartition - 4)) * instance;
+
+        m_qp        = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_log2CUSize         = charBuf; charBuf += m_numPartitions;
+        m_lumaIntraDir       = charBuf; charBuf += m_numPartitions;
+        m_tqBypass           = charBuf; charBuf += m_numPartitions;
+        m_refIdx[0] = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_refIdx[1] = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_cuDepth            = charBuf; charBuf += m_numPartitions;
+        m_predMode           = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
+        m_partSize           = charBuf; charBuf += m_numPartitions;
+        m_mergeFlag          = charBuf; charBuf += m_numPartitions;
+        m_interDir           = charBuf; charBuf += m_numPartitions;
+        m_mvpIdx[0]          = charBuf; charBuf += m_numPartitions;
+        m_mvpIdx[1]          = charBuf; charBuf += m_numPartitions;
+        m_tuDepth            = charBuf; charBuf += m_numPartitions;
+        m_transformSkip[0]   = charBuf; charBuf += m_numPartitions;
+        m_cbf[0]             = charBuf; charBuf += m_numPartitions;
+        m_chromaIntraDir     = charBuf; charBuf += m_numPartitions;
+
+        X265_CHECK(charBuf == dataPool.charMemBlock + (m_numPartitions * (BytesPerPartition - 4)) * (instance + 1), "CU data layout is broken\n"); //BytesPerPartition
+
+        m_mv[0]  = dataPool.mvMemBlock + (instance * 4) * m_numPartitions;
+        m_mv[1]  = m_mv[0] +  m_numPartitions;
+        m_mvd[0] = m_mv[1] +  m_numPartitions;
+        m_mvd[1] = m_mvd[0] + m_numPartitions;
+
+        uint32_t cuSize = g_maxCUSize >> depth;
+        m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (cuSize * cuSize);
+        m_trCoeff[1] = m_trCoeff[2] = 0;
+        m_transformSkip[1] = m_transformSkip[2] = m_cbf[1] = m_cbf[2] = 0;
+    }
+    else
+    {
+        /* Each CU's data is layed out sequentially within the charMemBlock */
+        uint8_t *charBuf = dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * instance;
+
+        m_qp        = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_log2CUSize         = charBuf; charBuf += m_numPartitions;
+        m_lumaIntraDir       = charBuf; charBuf += m_numPartitions;
+        m_tqBypass           = charBuf; charBuf += m_numPartitions;
+        m_refIdx[0] = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_refIdx[1] = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_cuDepth            = charBuf; charBuf += m_numPartitions;
+        m_predMode           = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
+        m_partSize           = charBuf; charBuf += m_numPartitions;
+        m_mergeFlag          = charBuf; charBuf += m_numPartitions;
+        m_interDir           = charBuf; charBuf += m_numPartitions;
+        m_mvpIdx[0]          = charBuf; charBuf += m_numPartitions;
+        m_mvpIdx[1]          = charBuf; charBuf += m_numPartitions;
+        m_tuDepth            = charBuf; charBuf += m_numPartitions;
+        m_transformSkip[0]   = charBuf; charBuf += m_numPartitions;
+        m_transformSkip[1]   = charBuf; charBuf += m_numPartitions;
+        m_transformSkip[2]   = charBuf; charBuf += m_numPartitions;
+        m_cbf[0]             = charBuf; charBuf += m_numPartitions;
+        m_cbf[1]             = charBuf; charBuf += m_numPartitions;
+        m_cbf[2]             = charBuf; charBuf += m_numPartitions;
+        m_chromaIntraDir     = charBuf; charBuf += m_numPartitions;
+
+        X265_CHECK(charBuf == dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * (instance + 1), "CU data layout is broken\n");
+
+        m_mv[0]  = dataPool.mvMemBlock + (instance * 4) * m_numPartitions;
+        m_mv[1]  = m_mv[0] +  m_numPartitions;
+        m_mvd[0] = m_mv[1] +  m_numPartitions;
+        m_mvd[1] = m_mvd[0] + m_numPartitions;
+
+        uint32_t cuSize = g_maxCUSize >> depth;
+        uint32_t sizeL = cuSize * cuSize;
+        uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift); // block chroma part
+        m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (sizeL + sizeC * 2);
+        m_trCoeff[1] = m_trCoeff[0] + sizeL;
+        m_trCoeff[2] = m_trCoeff[0] + sizeL + sizeC;
+    }
 }
 
 void CUData::initCTU(const Frame& frame, uint32_t cuAddr, int qp)
@@ -245,7 +284,8 @@
     /* sequential memsets */
     m_partSet((uint8_t*)m_qp, (uint8_t)qp);
     m_partSet(m_log2CUSize,   (uint8_t)g_maxLog2CUSize);
-    m_partSet(m_lumaIntraDir, (uint8_t)DC_IDX);
+    m_partSet(m_lumaIntraDir, (uint8_t)ALL_IDX);
+    m_partSet(m_chromaIntraDir, (uint8_t)ALL_IDX);
     m_partSet(m_tqBypass,     (uint8_t)frame.m_encData->m_param->bLossless);
     if (m_slice->m_sliceType != I_SLICE)
     {
@@ -256,7 +296,7 @@
     X265_CHECK(!(frame.m_encData->m_param->bLossless && !m_slice->m_pps->bTransquantBypassEnabled), "lossless enabled without TQbypass in PPS\n");
 
     /* initialize the remaining CU data in one memset */
-    memset(m_cuDepth, 0, (BytesPerPartition - 6) * m_numPartitions);
+    memset(m_cuDepth, 0, (frame.m_param->internalCsp == X265_CSP_I400 ? BytesPerPartition - 11 : BytesPerPartition - 7) * m_numPartitions);
 
     uint32_t widthInCU = m_slice->m_sps->numCuInWidth;
     m_cuLeft = (m_cuAddr % widthInCU) ? m_encData->getPicCTU(m_cuAddr - 1) : NULL;
@@ -283,14 +323,15 @@
     m_partSet((uint8_t*)m_qp, (uint8_t)qp);
 
     m_partSet(m_log2CUSize,   (uint8_t)cuGeom.log2CUSize);
-    m_partSet(m_lumaIntraDir, (uint8_t)DC_IDX);
+    m_partSet(m_lumaIntraDir, (uint8_t)ALL_IDX);
+    m_partSet(m_chromaIntraDir, (uint8_t)ALL_IDX);
     m_partSet(m_tqBypass,     (uint8_t)m_encData->m_param->bLossless);
     m_partSet((uint8_t*)m_refIdx[0], (uint8_t)REF_NOT_VALID);
     m_partSet((uint8_t*)m_refIdx[1], (uint8_t)REF_NOT_VALID);
     m_partSet(m_cuDepth,      (uint8_t)cuGeom.depth);
 
     /* initialize the remaining CU data in one memset */
-    memset(m_predMode, 0, (BytesPerPartition - 7) * m_numPartitions);
+    memset(m_predMode, 0, (ctu.m_chromaFormat == X265_CSP_I400 ? BytesPerPartition - 12 : BytesPerPartition - 8) * m_numPartitions);
 }
 
 /* Copy the results of a sub-part (split) CU to the parent CU */
@@ -314,13 +355,9 @@
     m_subPartCopy(m_mvpIdx[0] + offset, subCU.m_mvpIdx[0]);
     m_subPartCopy(m_mvpIdx[1] + offset, subCU.m_mvpIdx[1]);
     m_subPartCopy(m_tuDepth + offset, subCU.m_tuDepth);
+
     m_subPartCopy(m_transformSkip[0] + offset, subCU.m_transformSkip[0]);
-    m_subPartCopy(m_transformSkip[1] + offset, subCU.m_transformSkip[1]);
-    m_subPartCopy(m_transformSkip[2] + offset, subCU.m_transformSkip[2]);
     m_subPartCopy(m_cbf[0] + offset, subCU.m_cbf[0]);
-    m_subPartCopy(m_cbf[1] + offset, subCU.m_cbf[1]);
-    m_subPartCopy(m_cbf[2] + offset, subCU.m_cbf[2]);
-    m_subPartCopy(m_chromaIntraDir + offset, subCU.m_chromaIntraDir);
 
     memcpy(m_mv[0] + offset, subCU.m_mv[0], childGeom.numPartitions * sizeof(MV));
     memcpy(m_mv[1] + offset, subCU.m_mv[1], childGeom.numPartitions * sizeof(MV));
@@ -329,12 +366,21 @@
 
     uint32_t tmp = 1 << ((g_maxLog2CUSize - childGeom.depth) * 2);
     uint32_t tmp2 = subPartIdx * tmp;
-    memcpy(m_trCoeff[0] + tmp2, subCU.m_trCoeff[0], sizeof(coeff_t) * tmp);
+    memcpy(m_trCoeff[0] + tmp2, subCU.m_trCoeff[0], sizeof(coeff_t)* tmp);
 
-    uint32_t tmpC = tmp >> (m_hChromaShift + m_vChromaShift);
-    uint32_t tmpC2 = tmp2 >> (m_hChromaShift + m_vChromaShift);
-    memcpy(m_trCoeff[1] + tmpC2, subCU.m_trCoeff[1], sizeof(coeff_t) * tmpC);
-    memcpy(m_trCoeff[2] + tmpC2, subCU.m_trCoeff[2], sizeof(coeff_t) * tmpC);
+    if (subCU.m_chromaFormat != X265_CSP_I400)
+    {
+        m_subPartCopy(m_transformSkip[1] + offset, subCU.m_transformSkip[1]);
+        m_subPartCopy(m_transformSkip[2] + offset, subCU.m_transformSkip[2]);
+        m_subPartCopy(m_cbf[1] + offset, subCU.m_cbf[1]);
+        m_subPartCopy(m_cbf[2] + offset, subCU.m_cbf[2]);
+        m_subPartCopy(m_chromaIntraDir + offset, subCU.m_chromaIntraDir);
+
+        uint32_t tmpC = tmp >> (m_hChromaShift + m_vChromaShift);
+        uint32_t tmpC2 = tmp2 >> (m_hChromaShift + m_vChromaShift);
+        memcpy(m_trCoeff[1] + tmpC2, subCU.m_trCoeff[1], sizeof(coeff_t) * tmpC);
+        memcpy(m_trCoeff[2] + tmpC2, subCU.m_trCoeff[2], sizeof(coeff_t) * tmpC);
+    }
 }
 
 /* If a sub-CU part is not present (off the edge of the picture) its depth and
@@ -374,12 +420,17 @@
     /* clear residual coding flags */
     m_partSet(m_predMode, cu.m_predMode[0] & (MODE_INTRA | MODE_INTER));
     m_partSet(m_tuDepth, 0);
-    m_partSet(m_transformSkip[0], 0);
-    m_partSet(m_transformSkip[1], 0);
-    m_partSet(m_transformSkip[2], 0);
     m_partSet(m_cbf[0], 0);
-    m_partSet(m_cbf[1], 0);
-    m_partSet(m_cbf[2], 0);
+    m_partSet(m_transformSkip[0], 0);
+
+    if (cu.m_chromaFormat != X265_CSP_I400)
+    {
+        m_partSet(m_chromaIntraDir, (uint8_t)ALL_IDX);
+        m_partSet(m_cbf[1], 0);
+        m_partSet(m_cbf[2], 0);
+        m_partSet(m_transformSkip[1], 0);
+        m_partSet(m_transformSkip[2], 0);
+    }
 }
 
 /* Copy completed predicted CU to CTU in picture */
@@ -402,30 +453,34 @@
     m_partCopy(ctu.m_mvpIdx[1] + m_absIdxInCTU, m_mvpIdx[1]);
     m_partCopy(ctu.m_tuDepth + m_absIdxInCTU, m_tuDepth);
     m_partCopy(ctu.m_transformSkip[0] + m_absIdxInCTU, m_transformSkip[0]);
-    m_partCopy(ctu.m_transformSkip[1] + m_absIdxInCTU, m_transformSkip[1]);
-    m_partCopy(ctu.m_transformSkip[2] + m_absIdxInCTU, m_transformSkip[2]);
     m_partCopy(ctu.m_cbf[0] + m_absIdxInCTU, m_cbf[0]);
-    m_partCopy(ctu.m_cbf[1] + m_absIdxInCTU, m_cbf[1]);
-    m_partCopy(ctu.m_cbf[2] + m_absIdxInCTU, m_cbf[2]);
-    m_partCopy(ctu.m_chromaIntraDir + m_absIdxInCTU, m_chromaIntraDir);
 
-    memcpy(ctu.m_mv[0] + m_absIdxInCTU,  m_mv[0],  m_numPartitions * sizeof(MV));
-    memcpy(ctu.m_mv[1] + m_absIdxInCTU,  m_mv[1],  m_numPartitions * sizeof(MV));
+    memcpy(ctu.m_mv[0] + m_absIdxInCTU, m_mv[0], m_numPartitions * sizeof(MV));
+    memcpy(ctu.m_mv[1] + m_absIdxInCTU, m_mv[1], m_numPartitions * sizeof(MV));
     memcpy(ctu.m_mvd[0] + m_absIdxInCTU, m_mvd[0], m_numPartitions * sizeof(MV));
    memcpy(ctu.m_mvd[1] + m_absIdxInCTU, m_mvd[1], m_numPartitions * sizeof(MV));
 
     uint32_t tmpY = 1 << ((g_maxLog2CUSize - depth) * 2);
     uint32_t tmpY2 = m_absIdxInCTU << (LOG2_UNIT_SIZE * 2);
-    memcpy(ctu.m_trCoeff[0] + tmpY2, m_trCoeff[0], sizeof(coeff_t) * tmpY);
+    memcpy(ctu.m_trCoeff[0] + tmpY2, m_trCoeff[0], sizeof(coeff_t)* tmpY);
 
-    uint32_t tmpC = tmpY >> (m_hChromaShift + m_vChromaShift);
-    uint32_t tmpC2 = tmpY2 >> (m_hChromaShift + m_vChromaShift);
-    memcpy(ctu.m_trCoeff[1] + tmpC2, m_trCoeff[1], sizeof(coeff_t) * tmpC);
-    memcpy(ctu.m_trCoeff[2] + tmpC2, m_trCoeff[2], sizeof(coeff_t) * tmpC);
+    if (ctu.m_chromaFormat != X265_CSP_I400)
+    {
+        m_partCopy(ctu.m_transformSkip[1] + m_absIdxInCTU, m_transformSkip[1]);
262
+        m_partCopy(ctu.m_transformSkip[2] + m_absIdxInCTU, m_transformSkip[2]);
263
+        m_partCopy(ctu.m_cbf[1] + m_absIdxInCTU, m_cbf[1]);
264
+        m_partCopy(ctu.m_cbf[2] + m_absIdxInCTU, m_cbf[2]);
265
+        m_partCopy(ctu.m_chromaIntraDir + m_absIdxInCTU, m_chromaIntraDir);
266
+
267
+        uint32_t tmpC = tmpY >> (m_hChromaShift + m_vChromaShift);
268
+        uint32_t tmpC2 = tmpY2 >> (m_hChromaShift + m_vChromaShift);
269
+        memcpy(ctu.m_trCoeff[1] + tmpC2, m_trCoeff[1], sizeof(coeff_t) * tmpC);
270
+        memcpy(ctu.m_trCoeff[2] + tmpC2, m_trCoeff[2], sizeof(coeff_t) * tmpC);
271
+    }
272
 }
273
 
274
 /* The reverse of copyToPic, called only by encodeResidue */
275
-void CUData::copyFromPic(const CUData& ctu, const CUGeom& cuGeom)
276
+void CUData::copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp)
277
 {
278
     m_encData       = ctu.m_encData;
279
     m_slice         = ctu.m_slice;
280
@@ -451,19 +506,23 @@
281
     m_partCopy(m_mvpIdx[1],    ctu.m_mvpIdx[1] + m_absIdxInCTU);
282
     m_partCopy(m_chromaIntraDir, ctu.m_chromaIntraDir + m_absIdxInCTU);
283
 
284
-    memcpy(m_mv[0],  ctu.m_mv[0] + m_absIdxInCTU,  m_numPartitions * sizeof(MV));
285
-    memcpy(m_mv[1],  ctu.m_mv[1] + m_absIdxInCTU,  m_numPartitions * sizeof(MV));
286
+    memcpy(m_mv[0], ctu.m_mv[0] + m_absIdxInCTU, m_numPartitions * sizeof(MV));
287
+    memcpy(m_mv[1], ctu.m_mv[1] + m_absIdxInCTU, m_numPartitions * sizeof(MV));
288
     memcpy(m_mvd[0], ctu.m_mvd[0] + m_absIdxInCTU, m_numPartitions * sizeof(MV));
289
     memcpy(m_mvd[1], ctu.m_mvd[1] + m_absIdxInCTU, m_numPartitions * sizeof(MV));
290
 
291
     /* clear residual coding flags */
292
     m_partSet(m_tuDepth, 0);
293
     m_partSet(m_transformSkip[0], 0);
294
-    m_partSet(m_transformSkip[1], 0);
295
-    m_partSet(m_transformSkip[2], 0);
296
     m_partSet(m_cbf[0], 0);
297
-    m_partSet(m_cbf[1], 0);
298
-    m_partSet(m_cbf[2], 0);
299
+
300
+    if (csp != X265_CSP_I400)
301
+    {        
302
+        m_partSet(m_transformSkip[1], 0);
303
+        m_partSet(m_transformSkip[2], 0);
304
+        m_partSet(m_cbf[1], 0);
305
+        m_partSet(m_cbf[2], 0);
306
+    }
307
 }
308
 
309
 /* Only called by encodeResidue, these fields can be modified during inter/intra coding */
310
@@ -473,22 +532,28 @@
311
 
312
     m_partCopy((uint8_t*)ctu.m_qp + m_absIdxInCTU, (uint8_t*)m_qp);
313
     m_partCopy(ctu.m_transformSkip[0] + m_absIdxInCTU, m_transformSkip[0]);
314
-    m_partCopy(ctu.m_transformSkip[1] + m_absIdxInCTU, m_transformSkip[1]);
315
-    m_partCopy(ctu.m_transformSkip[2] + m_absIdxInCTU, m_transformSkip[2]);
316
     m_partCopy(ctu.m_predMode + m_absIdxInCTU, m_predMode);
317
     m_partCopy(ctu.m_tuDepth + m_absIdxInCTU, m_tuDepth);
318
     m_partCopy(ctu.m_cbf[0] + m_absIdxInCTU, m_cbf[0]);
319
-    m_partCopy(ctu.m_cbf[1] + m_absIdxInCTU, m_cbf[1]);
320
-    m_partCopy(ctu.m_cbf[2] + m_absIdxInCTU, m_cbf[2]);
321
-    m_partCopy(ctu.m_chromaIntraDir + m_absIdxInCTU, m_chromaIntraDir);
322
 
323
     uint32_t tmpY = 1 << ((g_maxLog2CUSize - depth) * 2);
324
     uint32_t tmpY2 = m_absIdxInCTU << (LOG2_UNIT_SIZE * 2);
325
-    memcpy(ctu.m_trCoeff[0] + tmpY2, m_trCoeff[0], sizeof(coeff_t) * tmpY);
326
-    tmpY  >>= m_hChromaShift + m_vChromaShift;
327
-    tmpY2 >>= m_hChromaShift + m_vChromaShift;
328
-    memcpy(ctu.m_trCoeff[1] + tmpY2, m_trCoeff[1], sizeof(coeff_t) * tmpY);
329
-    memcpy(ctu.m_trCoeff[2] + tmpY2, m_trCoeff[2], sizeof(coeff_t) * tmpY);
330
+    memcpy(ctu.m_trCoeff[0] + tmpY2, m_trCoeff[0], sizeof(coeff_t)* tmpY);
331
+
332
+    if (ctu.m_chromaFormat != X265_CSP_I400)
333
+    {
334
+        m_partCopy(ctu.m_transformSkip[1] + m_absIdxInCTU, m_transformSkip[1]);
335
+        m_partCopy(ctu.m_transformSkip[2] + m_absIdxInCTU, m_transformSkip[2]);
336
+
337
+        m_partCopy(ctu.m_cbf[1] + m_absIdxInCTU, m_cbf[1]);
338
+        m_partCopy(ctu.m_cbf[2] + m_absIdxInCTU, m_cbf[2]);
339
+        m_partCopy(ctu.m_chromaIntraDir + m_absIdxInCTU, m_chromaIntraDir);
340
+
341
+        tmpY  >>= m_hChromaShift + m_vChromaShift;
342
+        tmpY2 >>= m_hChromaShift + m_vChromaShift;
343
+        memcpy(ctu.m_trCoeff[1] + tmpY2, m_trCoeff[1], sizeof(coeff_t) * tmpY);
344
+        memcpy(ctu.m_trCoeff[2] + tmpY2, m_trCoeff[2], sizeof(coeff_t) * tmpY);
345
+    }
346
 }
347
 
348
 const CUData* CUData::getPULeft(uint32_t& lPartUnitIdx, uint32_t curPartUnitIdx) const
349
@@ -1676,7 +1741,7 @@
350
         if (tempRefIdx != -1)
351
         {
352
             uint32_t cuAddr = neighbours[MD_COLLOCATED].cuAddr[picList];
353
-            const Frame* colPic = m_slice->m_refPicList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx];
354
+            const Frame* colPic = m_slice->m_refFrameList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx];
355
             const CUData* colCU = colPic->m_encData->getPicCTU(cuAddr);
356
 
357
             // Scale the vector
358
@@ -1857,7 +1922,7 @@
359
 
360
 bool CUData::getColMVP(MV& outMV, int& outRefIdx, int picList, int cuAddr, int partUnitIdx) const
361
 {
362
-    const Frame* colPic = m_slice->m_refPicList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx];
363
+    const Frame* colPic = m_slice->m_refFrameList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx];
364
     const CUData* colCU = colPic->m_encData->getPicCTU(cuAddr);
365
 
366
     uint32_t absPartAddr = partUnitIdx & TMVP_UNIT_MASK;
367
@@ -1892,7 +1957,7 @@
368
 // Cache the collocated MV.
369
 bool CUData::getCollocatedMV(int cuAddr, int partUnitIdx, InterNeighbourMV *neighbour) const
370
 {
371
-    const Frame* colPic = m_slice->m_refPicList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx];
372
+    const Frame* colPic = m_slice->m_refFrameList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx];
373
     const CUData* colCU = colPic->m_encData->getPicCTU(cuAddr);
374
 
375
     uint32_t absPartAddr = partUnitIdx & TMVP_UNIT_MASK;
376
@@ -1951,7 +2016,7 @@
377
     bool bIsIntra = isIntra(absPartIdx);
378
 
379
     // set the group layout
380
-    result.log2TrSizeCG = log2TrSize - 2;
381
+    const uint32_t log2TrSizeCG = log2TrSize - 2;
382
 
383
     // set the scan orders
384
     if (bIsIntra)
385
@@ -1979,7 +2044,7 @@
386
         result.scanType = SCAN_DIAG;
387
 
388
     result.scan     = g_scanOrder[result.scanType][log2TrSize - 2];
389
-    result.scanCG   = g_scanOrderCG[result.scanType][result.log2TrSizeCG];
390
+    result.scanCG   = g_scanOrderCG[result.scanType][log2TrSizeCG];
391
 
392
     if (log2TrSize == 2)
393
         result.firstSignificanceMapContext = 0;
394
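
The cudata.cpp hunks above all apply the same transformation for 4:0:0 support: chroma bookkeeping (m_transformSkip[1]/[2], m_cbf[1]/[2], m_chromaIntraDir, m_trCoeff[1]/[2]) is only copied, cleared, or set when chroma planes actually exist. A minimal, self-contained sketch of that gating pattern, with illustrative names rather than x265's real fields:

    #include <cstring>

    enum { CSP_I400 = 0 };            // same value as X265_CSP_I400 in x265.h

    struct CUState
    {
        int           chromaFormat;
        unsigned char lumaCbf[16];    // stand-ins for the per-partition arrays
        unsigned char cbCbf[16];      // moved inside the csp guard above
        unsigned char crCbf[16];
    };

    void copyState(CUState& dst, const CUState& src)
    {
        memcpy(dst.lumaCbf, src.lumaCbf, sizeof(src.lumaCbf)); // luma: always
        if (src.chromaFormat != CSP_I400)                      // chroma: only
        {                                                      // when present
            memcpy(dst.cbCbf, src.cbCbf, sizeof(src.cbCbf));
            memcpy(dst.crCbf, src.crCbf, sizeof(src.crCbf));
        }
        dst.chromaFormat = src.chromaFormat;
    }

The adjusted memset sizes in the init paths appear to follow the same logic: fewer trailing per-partition arrays are bulk-cleared once the chroma arrays are either set explicitly (m_chromaIntraDir) or absent entirely in 4:0:0.
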
x265_1.8.tar.gz/source/common/cudata.h -> x265_1.9.tar.gz/source/common/cudata.h Changed
51
 
1
@@ -222,12 +222,12 @@
2
     void     copyToPic(uint32_t depth) const;
3
 
4
     /* RD-0 methods called only from encodeResidue */
5
-    void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom);
6
+    void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp);
7
     void     updatePic(uint32_t depth) const;
8
 
9
     void     setPartSizeSubParts(PartSize size)    { m_partSet(m_partSize, (uint8_t)size); }
10
     void     setPredModeSubParts(PredMode mode)    { m_partSet(m_predMode, (uint8_t)mode); }
11
-    void     clearCbf()                            { m_partSet(m_cbf[0], 0); m_partSet(m_cbf[1], 0); m_partSet(m_cbf[2], 0); }
12
+    void     clearCbf()                            { m_partSet(m_cbf[0], 0); if (m_chromaFormat != X265_CSP_I400) { m_partSet(m_cbf[1], 0); m_partSet(m_cbf[2], 0);} }
13
 
14
     /* these functions all take depth as an absolute depth from CTU, it is used to calculate the number of parts to copy */
15
     void     setQPSubParts(int8_t qp, uint32_t absPartIdx, uint32_t depth)                    { s_partSet[depth]((uint8_t*)m_qp + absPartIdx, (uint8_t)qp); }
16
@@ -246,7 +246,7 @@
17
     void     setPURefIdx(int list, int8_t refIdx, int absPartIdx, int puIdx);
18
 
19
     uint8_t  getCbf(uint32_t absPartIdx, TextType ttype, uint32_t tuDepth) const { return (m_cbf[ttype][absPartIdx] >> tuDepth) & 0x1; }
20
-    uint8_t  getQtRootCbf(uint32_t absPartIdx) const                             { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx]; }
21
+    uint8_t  getQtRootCbf(uint32_t absPartIdx) const                             { if (m_chromaFormat == X265_CSP_I400) return m_cbf[0][absPartIdx] || false; else { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx];} }
22
     int8_t   getRefQP(uint32_t currAbsIdxInCTU) const;
23
     uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*candMvField)[2], uint8_t* candDir) const;
24
     void     clipMv(MV& outMV) const;
25
@@ -323,7 +323,6 @@
26
     const uint16_t *scan;
27
     const uint16_t *scanCG;
28
     ScanType        scanType;
29
-    uint32_t        log2TrSizeCG;
30
     uint32_t        firstSignificanceMapContext;
31
 };
32
 
33
@@ -340,8 +339,15 @@
34
         uint32_t numPartition = NUM_4x4_PARTITIONS >> (depth * 2);
35
         uint32_t cuSize = g_maxCUSize >> depth;
36
         uint32_t sizeL = cuSize * cuSize;
37
-        uint32_t sizeC = sizeL >> (CHROMA_H_SHIFT(csp) + CHROMA_V_SHIFT(csp));
38
-        CHECKED_MALLOC(trCoeffMemBlock, coeff_t, (sizeL + sizeC * 2) * numInstances);
39
+        if (csp == X265_CSP_I400)
40
+        {
41
+            CHECKED_MALLOC(trCoeffMemBlock, coeff_t, (sizeL) * numInstances);
42
+        }
43
+        else
44
+        {            
45
+            uint32_t sizeC = sizeL >> (CHROMA_H_SHIFT(csp) + CHROMA_V_SHIFT(csp));
46
+            CHECKED_MALLOC(trCoeffMemBlock, coeff_t, (sizeL + sizeC * 2) * numInstances);
47
+        }
48
         CHECKED_MALLOC(charMemBlock, uint8_t, numPartition * numInstances * CUData::BytesPerPartition);
49
         CHECKED_MALLOC(mvMemBlock, MV, numPartition * 4 * numInstances);
50
         return true;
51
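
The allocation change above sizes the shared coefficient block per plane: luma only for 4:0:0, luma plus two subsampled chroma planes otherwise. A worked sketch of the arithmetic (not x265 code; the shift values are the standard chroma subsampling shifts):

    #include <cstdint>

    uint32_t coeffBlockSize(uint32_t cuSize, int hShift, int vShift, bool mono)
    {
        uint32_t sizeL = cuSize * cuSize;
        if (mono)
            return sizeL;                          // 4:0:0: no chroma planes
        uint32_t sizeC = sizeL >> (hShift + vShift);
        return sizeL + 2 * sizeC;                  // Y + Cb + Cr
    }

    // For a 64x64 CU (sizeL = 4096 coefficients):
    //   4:2:0 (shifts 1,1): 4096 + 2*1024 = 6144
    //   4:2:2 (shifts 1,0): 4096 + 2*2048 = 8192
    //   4:4:4 (shifts 0,0): 4096 + 2*4096 = 12288
    //   4:0:0             : 4096
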
x265_1.8.tar.gz/source/common/dct.cpp -> x265_1.9.tar.gz/source/common/dct.cpp Changed
46
 
1
@@ -703,7 +703,10 @@
2
         if (level)
3
             ++numSig;
4
         level *= sign;
5
-        qCoef[blockpos] = (int16_t)x265_clip3(-32768, 32767, level);
6
+
7
+        // TODO: limiting the range to [-32767, 32767] would allow a faster output path,
8
+        //       but nquant is a small fraction of rdoQuant time, so the old dynamic range is kept for compatibility
9
+        qCoef[blockpos] = (int16_t)abs(x265_clip3(-32768, 32767, level));
10
     }
11
 
12
     return numSig;
13
@@ -784,11 +787,12 @@
14
     return scanPosLast - 1;
15
 }
16
 
17
+// NOTE: lastNZPosInCG and absSumSign are undefined when the input block is all zeros
18
 static uint32_t findPosFirstLast_c(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
19
 {
20
     int n;
21
 
22
-    for (n = SCAN_SET_SIZE - 1; n >= 0; --n)
23
+    for (n = SCAN_SET_SIZE - 1; n >= 0; n--)
24
     {
25
         const uint32_t idx = scanTbl[n];
26
         const uint32_t idxY = idx / MLS_CG_SIZE;
27
@@ -812,8 +816,17 @@
28
 
29
     uint32_t firstNZPosInCG = (uint32_t)n;
30
 
31
+    uint32_t absSumSign = 0;
32
+    for (n = firstNZPosInCG; n <= (int)lastNZPosInCG; n++)
33
+    {
34
+        const uint32_t idx = scanTbl[n];
35
+        const uint32_t idxY = idx / MLS_CG_SIZE;
36
+        const uint32_t idxX = idx % MLS_CG_SIZE;
37
+        absSumSign += dstCoeff[idxY * trSize + idxX];
38
+    }
39
+
40
     // NOTE: when coeff block all ZERO, the lastNZPosInCG is undefined and firstNZPosInCG is 16
41
-    return ((lastNZPosInCG << 16) | firstNZPosInCG);
42
+    return ((absSumSign << 31) | (lastNZPosInCG << 8) | firstNZPosInCG);
43
 }
44
 
45
 
46
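
Two related changes meet in this file: nquant now stores absolute levels, and findPosFirstLast_c packs a third result into its return value. The packed word carries the first and last nonzero scan positions of the coding group plus, in bit 31, the low bit of the summed absolute levels -- the parity that sign-bit hiding consumes. A sketch of how a caller can unpack it; the struct and names are illustrative, not x265's:

    #include <cstdint>

    struct CGPos
    {
        uint32_t firstNZ;    // first nonzero scan position (16 => all-zero CG)
        uint32_t lastNZ;     // last nonzero position (undefined if all zero)
        uint32_t sumParity;  // low bit of the summed absolute levels
    };

    CGPos unpackPosFirstLast(uint32_t packed)
    {
        CGPos p;
        p.firstNZ   = packed & 0xFF;          // bits 0..7
        p.lastNZ    = (packed >> 8) & 0xFF;   // bits 8..15
        p.sumParity = packed >> 31;           // bit 31
        return p;
    }
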
x265_1.8.tar.gz/source/common/deblock.cpp -> x265_1.9.tar.gz/source/common/deblock.cpp Changed
86
 
1
@@ -2,6 +2,7 @@
2
 * Copyright (C) 2013 x265 project
3
 *
4
 * Author: Gopu Govindaswamy <gopu@multicorewareinc.com>
5
+*         Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -108,7 +109,7 @@
10
     for (uint32_t e = 0; e < numUnits; e += partIdxIncr)
11
     {
12
         edgeFilterLuma(cu, absPartIdx, depth, dir, e, blockStrength);
13
-        if (!((e0 + e) & chromaMask))
14
+        if (!((e0 + e) & chromaMask) && cu->m_chromaFormat != X265_CSP_I400)
15
             edgeFilterChroma(cu, absPartIdx, depth, dir, e, blockStrength);
16
     }
17
 }
18
@@ -209,8 +210,8 @@
19
     const Slice* const sliceQ = cuQ->m_slice;
20
     const Slice* const sliceP = cuP->m_slice;
21
 
22
-    const Frame* refP0 = sliceP->getRefPic(0, cuP->m_refIdx[0][partP]);
23
-    const Frame* refQ0 = sliceQ->getRefPic(0, cuQ->m_refIdx[0][partQ]);
24
+    const Frame* refP0 = sliceP->m_refFrameList[0][cuP->m_refIdx[0][partP]];
25
+    const Frame* refQ0 = sliceQ->m_refFrameList[0][cuQ->m_refIdx[0][partQ]];
26
     const MV& mvP0 = refP0 ? cuP->m_mv[0][partP] : zeroMv;
27
     const MV& mvQ0 = refQ0 ? cuQ->m_mv[0][partQ] : zeroMv;
28
 
29
@@ -221,8 +222,8 @@
30
     }
31
 
32
     // (sliceQ->isInterB() || sliceP->isInterB())
33
-    const Frame* refP1 = sliceP->getRefPic(1, cuP->m_refIdx[1][partP]);
34
-    const Frame* refQ1 = sliceQ->getRefPic(1, cuQ->m_refIdx[1][partQ]);
35
+    const Frame* refP1 = sliceP->m_refFrameList[1][cuP->m_refIdx[1][partP]];
36
+    const Frame* refQ1 = sliceQ->m_refFrameList[1][cuQ->m_refIdx[1][partQ]];
37
     const MV& mvP1 = refP1 ? cuP->m_mv[1][partP] : zeroMv;
38
     const MV& mvQ1 = refQ1 ? cuQ->m_mv[1][partQ] : zeroMv;
39
 
40
@@ -279,31 +280,6 @@
41
  * \param maskQ   indicator to enable filtering on partQ
42
  * \param maskP1  decision weak filter/no filter for partP
43
  * \param maskQ1  decision weak filter/no filter for partQ */
44
-static inline void pelFilterLumaStrong(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
45
-{
46
-    int32_t tc2 = 2 * tc;
47
-    int32_t tcP = (tc2 & maskP);
48
-    int32_t tcQ = (tc2 & maskQ);
49
-    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
50
-    {
51
-        int16_t m4  = (int16_t)src[0];
52
-        int16_t m3  = (int16_t)src[-offset];
53
-        int16_t m5  = (int16_t)src[offset];
54
-        int16_t m2  = (int16_t)src[-offset * 2];
55
-        int16_t m6  = (int16_t)src[offset * 2];
56
-        int16_t m1  = (int16_t)src[-offset * 3];
57
-        int16_t m7  = (int16_t)src[offset * 3];
58
-        int16_t m0  = (int16_t)src[-offset * 4];
59
-        src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1);
60
-        src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2);
61
-        src[-offset]     = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
62
-        src[0]           = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
63
-        src[offset]      = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
64
-        src[offset * 2]  = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
65
-    }
66
-}
67
-
68
-/* Weak filter */
69
 static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ,
70
                                  int32_t maskP1, int32_t maskQ1)
71
 {
72
@@ -445,7 +421,12 @@
73
                    useStrongFiltering(offset, beta, tc, src + unitOffset + srcStep * 3));
74
 
75
         if (sw)
76
-            pelFilterLumaStrong(src + unitOffset, srcStep, offset, tc, maskP, maskQ);
77
+        {
78
+            int32_t tc2 = 2 * tc;
79
+            int32_t tcP = (tc2 & maskP);
80
+            int32_t tcQ = (tc2 & maskQ);
81
+            primitives.pelFilterLumaStrong[dir](src + unitOffset, srcStep, offset, tcP, tcQ);
82
+        }
83
         else
84
         {
85
             int32_t sideThreshold = (beta + (beta >> 1)) >> 3;
86
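
The strong luma filter is no longer an inline helper here: the caller now precomputes the masked clip bounds tcP/tcQ and dispatches through a two-entry primitive table indexed by edge direction, which lets assembly kernels specialize the vertical and horizontal memory-access patterns. A sketch of the table shape, with the signature taken from the call site above (the C fallback registered for both slots appears in loopfilter.cpp below):

    #include <cstdint>

    typedef uint8_t pixel;   // x265: 8- or 16-bit depending on build depth

    typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep,
                                          intptr_t offset,
                                          int32_t tcP, int32_t tcQ);

    struct Primitives
    {
        pelFilterLumaStrong_t pelFilterLumaStrong[2]; // [0] EDGE_VER, [1] EDGE_HOR
    };
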
x265_1.8.tar.gz/source/common/deblock.h -> x265_1.9.tar.gz/source/common/deblock.h Changed
42
 
1
@@ -2,6 +2,7 @@
2
 * Copyright (C) 2013 x265 project
3
 *
4
 * Author: Gopu Govindaswamy <gopu@multicorewareinc.com>
5
+*         Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -37,24 +38,24 @@
10
 public:
11
     enum { EDGE_VER, EDGE_HOR };
12
 
13
-    void deblockCTU(const CUData* ctu, const CUGeom& cuGeom, int32_t dir);
14
+    static void deblockCTU(const CUData* ctu, const CUGeom& cuGeom, int32_t dir);
15
 
16
 protected:
17
 
18
     // CU-level deblocking function
19
-    void deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[]);
20
+    static void deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[]);
21
 
22
     // set filtering functions
23
-    void setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t tuDepth, int32_t dir, uint8_t blockStrength[]);
24
-    void setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits);
25
-    void setEdgefilterMultiple(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits);
26
+    static void setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t tuDepth, int32_t dir, uint8_t blockStrength[]);
27
+    static void setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits);
28
+    static void setEdgefilterMultiple(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits);
29
 
30
     // get filtering functions
31
-    uint8_t getBoundaryStrength(const CUData* cuQ, int32_t dir, uint32_t partQ, const uint8_t blockStrength[]);
32
+    static uint8_t getBoundaryStrength(const CUData* cuQ, int32_t dir, uint32_t partQ, const uint8_t blockStrength[]);
33
 
34
     // filter luma/chroma functions
35
-    void edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
36
-    void edgeFilterChroma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
37
+    static void edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
38
+    static void edgeFilterChroma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
39
 
40
     static const uint8_t s_tcTable[54];
41
     static const uint8_t s_betaTable[52];
42
x265_1.8.tar.gz/source/common/frame.cpp -> x265_1.9.tar.gz/source/common/frame.cpp Changed
91
 
1
@@ -33,22 +33,37 @@
2
     m_bChromaExtended = false;
3
     m_lowresInit = false;
4
     m_reconRowCount.set(0);
5
+    m_reconColCount = NULL;
6
     m_countRefEncoders = 0;
7
     m_encData = NULL;
8
     m_reconPic = NULL;
9
+    m_quantOffsets = NULL;
10
     m_next = NULL;
11
     m_prev = NULL;
12
     m_param = NULL;
13
     memset(&m_lowres, 0, sizeof(m_lowres));
14
 }
15
 
16
-bool Frame::create(x265_param *param)
17
+bool Frame::create(x265_param *param, float* quantOffsets)
18
 {
19
     m_fencPic = new PicYuv;
20
     m_param = param;
21
 
22
-    return m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
23
-           m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode);
24
+    if (m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
25
+        m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode))
26
+    {
27
+        X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized");
28
+        m_numRows = (m_fencPic->m_picHeight + g_maxCUSize - 1)  / g_maxCUSize;
29
+        m_reconColCount = new ThreadSafeInteger[m_numRows];
30
+
31
+        if (quantOffsets)
32
+        {
33
+            int32_t cuCount = m_lowres.maxBlocksInRow * m_lowres.maxBlocksInCol;
34
+            m_quantOffsets = new float[cuCount];
35
+        }
36
+        return true;
37
+    }
38
+    return false;
39
 }
40
 
41
 bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
42
@@ -56,15 +71,27 @@
43
     m_encData = new FrameData;
44
     m_reconPic = new PicYuv;
45
     m_encData->m_reconPic = m_reconPic;
46
-    bool ok = m_encData->create(param, sps) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
47
+    bool ok = m_encData->create(*param, sps) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
48
     if (ok)
49
     {
50
         /* initialize right border of m_reconpicYuv as SAO may read beyond the
51
          * end of the picture accessing uninitialized pixels */
52
         int maxHeight = sps.numCuInHeight * g_maxCUSize;
53
-        memset(m_reconPic->m_picOrg[0], 0, sizeof(pixel) * m_reconPic->m_stride * maxHeight);
54
-        memset(m_reconPic->m_picOrg[1], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
55
-        memset(m_reconPic->m_picOrg[2], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
56
+        memset(m_reconPic->m_picOrg[0], 0, sizeof(pixel)* m_reconPic->m_stride * maxHeight);
57
+
58
+        /* use pre-calculated cu/pu offsets cached in the SPS structure */
59
+        m_reconPic->m_cuOffsetY = sps.cuOffsetY;
60
+        m_reconPic->m_buOffsetY = sps.buOffsetY;
61
+
62
+        if (param->internalCsp != X265_CSP_I400)
63
+        {
64
+            memset(m_reconPic->m_picOrg[1], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
65
+            memset(m_reconPic->m_picOrg[2], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
66
+
67
+            /* use pre-calculated cu/pu offsets cached in the SPS structure */
68
+            m_reconPic->m_cuOffsetC = sps.cuOffsetC;
69
+            m_reconPic->m_buOffsetC = sps.buOffsetC;
70
+        }
71
     }
72
     return ok;
73
 }
74
@@ -100,5 +127,16 @@
75
         m_reconPic = NULL;
76
     }
77
 
78
+    if (m_reconColCount)
79
+    {
80
+        delete[] m_reconColCount;
81
+        m_reconColCount = NULL;
82
+    }
83
+
84
+    if (m_quantOffsets)
85
+    {
86
+        delete[] m_quantOffsets;
87
+    }
88
+
89
     m_lowres.destroy();
90
 }
91
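
Frame::create's new quantOffsets argument is the landing point for the block-level quantizer-offset feature: when the caller attaches an offset array to the input picture, the frame allocates a matching buffer with one float per 16x16 lowres block. A hedged usage sketch against the public API -- the quantOffsets field on x265_picture arrives with this release, and encoder/param/pic_in/pic_out are assumed to be already set up:

    x265_nal* nal = NULL;
    uint32_t  nalCount = 0;

    int blocksW = (param->sourceWidth  + 15) / 16;
    int blocksH = (param->sourceHeight + 15) / 16;
    float* offsets = new float[blocksW * blocksH];
    for (int i = 0; i < blocksW * blocksH; i++)
        offsets[i] = -2.0f;            // negative offset: spend more bits here

    pic_in->quantOffsets = offsets;    // read per frame; Frame::create above
                                       // allocates m_quantOffsets to hold a copy
    x265_encoder_encode(encoder, &nal, &nalCount, pic_in, pic_out);
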
x265_1.8.tar.gz/source/common/frame.h -> x265_1.9.tar.gz/source/common/frame.h Changed
32
 
1
@@ -35,7 +35,7 @@
2
 class PicYuv;
3
 struct SPS;
4
 
5
-#define IS_REFERENCED(frame) (frame->m_lowres.sliceType != X265_TYPE_B) 
6
+#define IS_REFERENCED(frame) (frame->m_lowres.sliceType != X265_TYPE_B)
7
 
8
 class Frame
9
 {
10
@@ -59,8 +59,12 @@
11
     bool                   m_lowresInit;         // lowres init complete (pre-analysis)
12
     bool                   m_bChromaExtended;    // orig chroma planes motion extended for weight analysis
13
 
14
+    float*                 m_quantOffsets;       // points to quantOffsets in x265_picture
15
+
16
     /* Frame Parallelism - notification between FrameEncoders of available motion reference rows */
17
     ThreadSafeInteger      m_reconRowCount;      // count of CTU rows completely reconstructed and extended for motion reference
18
+    ThreadSafeInteger*     m_reconColCount;      // count of CTU cols completely reconstructed and extended for motion reference
19
+    int32_t                m_numRows;
20
     volatile uint32_t      m_countRefEncoders;   // count of FrameEncoder threads monitoring m_reconRowCount
21
 
22
     Frame*                 m_next;               // PicList doubly linked list pointers
23
@@ -69,7 +73,7 @@
24
     x265_analysis_data     m_analysisData;
25
     Frame();
26
 
27
-    bool create(x265_param *param);
28
+    bool create(x265_param *param, float* quantOffsets);
29
     bool allocEncodeData(x265_param *param, const SPS& sps);
30
     void reinit(const SPS& sps);
31
     void destroy();
32
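
m_reconColCount refines the existing row-granularity handshake: the reference frame's encoder publishes, per CTU row, how many columns have finished reconstruction, so a dependent frame encoder can start motion search into a row before the whole row is done. A generic sketch of that handshake using a standard atomic in place of x265's ThreadSafeInteger (names here are illustrative):

    #include <atomic>

    struct ReconProgress
    {
        std::atomic<int> colCount[256];              // one counter per CTU row

        // producer: reference frame, after CTU (row, col) is reconstructed
        void publish(int row, int col)
        { colCount[row].store(col + 1, std::memory_order_release); }

        // consumer: dependent frame, before reading reference pixels
        // around (row, colNeeded)
        bool ready(int row, int colNeeded) const
        { return colCount[row].load(std::memory_order_acquire) > colNeeded; }
    };
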
x265_1.8.tar.gz/source/common/framedata.cpp -> x265_1.9.tar.gz/source/common/framedata.cpp Changed
21
 
1
@@ -31,15 +31,15 @@
2
     memset(this, 0, sizeof(*this));
3
 }
4
 
5
-bool FrameData::create(x265_param *param, const SPS& sps)
6
+bool FrameData::create(const x265_param& param, const SPS& sps)
7
 {
8
-    m_param = param;
9
+    m_param = &param;
10
     m_slice  = new Slice;
11
     m_picCTU = new CUData[sps.numCUsInFrame];
12
 
13
-    m_cuMemPool.create(0, param->internalCsp, sps.numCUsInFrame);
14
+    m_cuMemPool.create(0, param.internalCsp, sps.numCUsInFrame);
15
     for (uint32_t ctuAddr = 0; ctuAddr < sps.numCUsInFrame; ctuAddr++)
16
-        m_picCTU[ctuAddr].initialize(m_cuMemPool, 0, param->internalCsp, ctuAddr);
17
+        m_picCTU[ctuAddr].initialize(m_cuMemPool, 0, param.internalCsp, ctuAddr);
18
 
19
     CHECKED_MALLOC(m_cuStat, RCStatCU, sps.numCUsInFrame);
20
     CHECKED_MALLOC(m_rowStat, RCStatRow, sps.numCuInHeight);
21
x265_1.8.tar.gz/source/common/framedata.h -> x265_1.9.tar.gz/source/common/framedata.h Changed
83
 
1
@@ -55,8 +55,7 @@
2
     double      avgLumaDistortion;
3
     double      avgChromaDistortion;
4
     double      avgPsyEnergy;
5
-    double      avgLumaLevel;
6
-    double      lumaLevel;
7
+    double      avgResEnergy;
8
     double      percentIntraNxN;
9
     double      percentSkipCu[NUM_CU_DEPTH];
10
     double      percentMergeCu[NUM_CU_DEPTH];
11
@@ -69,13 +68,13 @@
12
     uint64_t    lumaDistortion;
13
     uint64_t    chromaDistortion;
14
     uint64_t    psyEnergy;
15
+    uint64_t    resEnergy;
16
     uint64_t    cntSkipCu[NUM_CU_DEPTH];
17
     uint64_t    cntMergeCu[NUM_CU_DEPTH];
18
     uint64_t    cntInter[NUM_CU_DEPTH];
19
     uint64_t    cntIntra[NUM_CU_DEPTH];
20
     uint64_t    cuInterDistribution[NUM_CU_DEPTH][INTER_MODES];
21
     uint64_t    cuIntraDistribution[NUM_CU_DEPTH][INTRA_MODES];
22
-    uint16_t    maxLumaLevel;
23
 
24
     FrameStats()
25
     {
26
@@ -96,7 +95,7 @@
27
 
28
     Slice*         m_slice;
29
     SAOParam*      m_saoParam;
30
-    x265_param*    m_param;
31
+    const x265_param* m_param;
32
 
33
     FrameData*     m_freeListNext;
34
     PicYuv*        m_reconPic;
35
@@ -135,19 +134,44 @@
36
     RCStatCU*      m_cuStat;
37
     RCStatRow*     m_rowStat;
38
     FrameStats     m_frameStats; // stats of current frame for multi-pass encodes
39
+    /* data needed for periodic intra refresh */
40
+    struct PeriodicIR
41
+    {
42
+        uint32_t   pirStartCol;
43
+        uint32_t   pirEndCol;
44
+        int        framesSinceLastPir;
45
+    };
46
 
47
+    PeriodicIR     m_pir;
48
     double         m_avgQpRc;    /* avg QP as decided by rate-control */
49
     double         m_avgQpAq;    /* avg QP as decided by AQ in addition to rate-control */
50
     double         m_rateFactor; /* calculated based on the Frame QP */
51
 
52
     FrameData();
53
 
54
-    bool create(x265_param *param, const SPS& sps);
55
+    bool create(const x265_param& param, const SPS& sps);
56
     void reinit(const SPS& sps);
57
     void destroy();
58
+    inline CUData* getPicCTU(uint32_t ctuAddr) { return &m_picCTU[ctuAddr]; }
59
+};
60
+
61
+/* Stores intra analysis data for a single frame. This struct needs better packing */
62
+struct analysis_intra_data
63
+{
64
+    uint8_t*  depth;
65
+    uint8_t*  modes;
66
+    char*     partSizes;
67
+    uint8_t*  chromaModes;
68
+};
69
 
70
-    CUData* getPicCTU(uint32_t ctuAddr) { return &m_picCTU[ctuAddr]; }
71
+/* Stores inter analysis data for a single frame */
72
+struct analysis_inter_data
73
+{
74
+    MV*         mv;
75
+    int32_t*    ref;
76
+    uint8_t*    depth;
77
+    uint8_t*    modes;
78
+    uint32_t*   bestMergeCand;
79
 };
80
 }
81
-
82
 #endif // ifndef X265_FRAMEDATA_H
83
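
The new PeriodicIR block carries per-frame state for --intra-refresh: the CTU-column window [pirStartCol, pirEndCol) that must be intra-coded, and how far into the refresh cycle the encoder is. The scheduling itself is not part of this diff; the sketch below shows only one plausible way such a window could sweep across the frame, and both the uniform step and the function name are assumptions:

    #include <algorithm>
    #include <cstdint>

    struct PeriodicIR
    {
        uint32_t pirStartCol;
        uint32_t pirEndCol;
        int      framesSinceLastPir;
    };

    // Hypothetical schedule: sweep the intra column band left to right,
    // covering all CTU columns once every `period` frames.
    void advancePir(PeriodicIR& pir, uint32_t ctuCols, int period)
    {
        if (pir.framesSinceLastPir >= period)
            pir.framesSinceLastPir = 0;                    // restart the sweep
        uint32_t step   = (ctuCols + period - 1) / period; // columns per frame
        pir.pirStartCol = pir.framesSinceLastPir * step;
        pir.pirEndCol   = std::min(pir.pirStartCol + step, ctuCols);
        pir.framesSinceLastPir++;
    }
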
x265_1.8.tar.gz/source/common/ipfilter.cpp -> x265_1.9.tar.gz/source/common/ipfilter.cpp Changed
9
 
1
@@ -4,6 +4,7 @@
2
  * Authors: Deepthi Devaki <deepthidevaki@multicorewareinc.com>,
3
  *          Rajesh Paulraj <rajesh@multicorewareinc.com>
4
  *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/loopfilter.cpp -> x265_1.9.tar.gz/source/common/loopfilter.cpp Changed
47
 
1
@@ -3,6 +3,7 @@
2
 *
3
 * Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
4
 *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -136,6 +137,27 @@
10
         rec += stride;
11
     }
12
 }
13
+
14
+static void pelFilterLumaStrong_c(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ)
15
+{
16
+    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
17
+    {
18
+        int16_t m4  = (int16_t)src[0];
19
+        int16_t m3  = (int16_t)src[-offset];
20
+        int16_t m5  = (int16_t)src[offset];
21
+        int16_t m2  = (int16_t)src[-offset * 2];
22
+        int16_t m6  = (int16_t)src[offset * 2];
23
+        int16_t m1  = (int16_t)src[-offset * 3];
24
+        int16_t m7  = (int16_t)src[offset * 3];
25
+        int16_t m0  = (int16_t)src[-offset * 4];
26
+        src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1);
27
+        src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2);
28
+        src[-offset]     = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
29
+        src[0]           = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
30
+        src[offset]      = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
31
+        src[offset * 2]  = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
32
+    }
33
+}
34
 }
35
 
36
 namespace X265_NS {
37
@@ -150,5 +172,9 @@
38
     p.saoCuOrgE3[1] = processSaoCUE3;
39
     p.saoCuOrgB0 = processSaoCUB0;
40
     p.sign = calSign;
41
+
42
+    // C code is the same for EDGE_VER and EDGE_HOR; only the asm code differs
43
+    p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c;
44
+    p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c;
45
 }
46
 }
47
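
Each output line in pelFilterLumaStrong_c is a tap average pulled back toward the original sample by at most tc: x265_clip3 is the usual three-operand clamp, so even the strong filter can move a sample by no more than the (masked) tcP/tcQ bound. For reference, a minimal clip3 and the shape of one output sample:

    template<typename T>
    inline T clip3(T lo, T hi, T v) { return v < lo ? lo : (v > hi ? hi : v); }

    // The src[0] line above has the shape
    //   q0' = q0 + clip3(-tcQ, tcQ, avg - q0)
    // with avg = (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3, so the filtered
    // average never displaces q0 by more than tcQ in either direction.
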
x265_1.8.tar.gz/source/common/lowres.cpp -> x265_1.9.tar.gz/source/common/lowres.cpp Changed
29
 
1
@@ -52,6 +52,7 @@
2
         CHECKED_MALLOC(qpAqOffset, double, cuCount);
3
         CHECKED_MALLOC(invQscaleFactor, int, cuCount);
4
         CHECKED_MALLOC(qpCuTreeOffset, double, cuCount);
5
+        CHECKED_MALLOC(blockVariance, uint32_t, cuCount);
6
     }
7
     CHECKED_MALLOC(propagateCost, uint16_t, cuCount);
8
 
9
@@ -120,18 +121,17 @@
10
     X265_FREE(invQscaleFactor);
11
     X265_FREE(qpCuTreeOffset);
12
     X265_FREE(propagateCost);
13
+    X265_FREE(blockVariance);
14
 }
15
 
16
 // (re) initialize lowres state
17
 void Lowres::init(PicYuv *origPic, int poc)
18
 {
19
     bLastMiniGopBFrame = false;
20
-    bScenecut = false;  // could be a scene-cut, until ruled out by flash detection
21
     bKeyframe = false; // Not a keyframe unless identified by lookahead
22
     frameNum = poc;
23
     leadingBframes = 0;
24
     indB = 0;
25
-    satdCost = (int64_t)-1;
26
     memset(costEst, -1, sizeof(costEst));
27
     memset(weightedCostDelta, 0, sizeof(weightedCostDelta));
28
 
29
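
blockVariance adds one variance value per 16x16 lowres block, next to the new per-frame frameVariance in lowres.h below; the wp_ssd comment there ("sum(pixel^2) - sum(pixel)^2") names the same one-pass form. A self-contained sketch of that computation (not the actual x265 kernel, which runs on the lowres planes):

    #include <cstdint>

    typedef uint8_t pixel;

    uint32_t blockVariance16x16(const pixel* blk, intptr_t stride)
    {
        uint32_t sum = 0, ssd = 0;
        for (int y = 0; y < 16; y++, blk += stride)
            for (int x = 0; x < 16; x++)
            {
                sum += blk[x];
                ssd += (uint32_t)blk[x] * blk[x];
            }
        const uint32_t n = 16 * 16;
        return (ssd - sum * sum / n) / n;   // E[X^2] - E[X]^2
    }
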
x265_1.8.tar.gz/source/common/lowres.h -> x265_1.9.tar.gz/source/common/lowres.h Changed
17
 
1
@@ -143,12 +143,15 @@
2
     double*   qpAqOffset;      // AQ QP offset values for each 16x16 CU
3
     double*   qpCuTreeOffset;  // cuTree QP offset values for each 16x16 CU
4
     int*      invQscaleFactor; // qScale values for qp Aq Offsets
5
+    uint32_t* blockVariance;
6
     uint64_t  wp_ssd[3];       // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
7
     uint64_t  wp_sum[3];
8
+    uint64_t  frameVariance;
9
 
10
     /* cutree intermediate data */
11
     uint16_t* propagateCost;
12
     double    weightedCostDelta[X265_BFRAME_MAX + 2];
13
+    ReferencePlanes weightedRef[X265_BFRAME_MAX + 2];
14
 
15
     bool create(PicYuv *origPic, int _bframes, bool bAqEnabled);
16
     void destroy();
17
x265_1.8.tar.gz/source/common/param.cpp -> x265_1.9.tar.gz/source/common/param.cpp Changed
246
 
1
@@ -147,7 +147,7 @@
2
     param->bFrameAdaptive = X265_B_ADAPT_TRELLIS;
3
     param->bBPyramid = 1;
4
     param->scenecutThreshold = 40; /* Magic number pulled in from x264 */
5
-    param->lookaheadSlices = 0;
6
+    param->lookaheadSlices = 8;
7
 
8
     /* Intra Coding Tools */
9
     param->bEnableConstrainedIntra = 0;
10
@@ -159,7 +159,8 @@
11
     param->subpelRefine = 2;
12
     param->searchRange = 57;
13
     param->maxNumMergeCand = 2;
14
-    param->limitReferences = 0;
15
+    param->limitReferences = 3;
16
+    param->limitModes = 0;
17
     param->bEnableWeightedPred = 1;
18
     param->bEnableWeightedBiPred = 0;
19
     param->bEnableEarlySkip = 0;
20
@@ -184,7 +185,7 @@
21
     param->cbQpOffset = 0;
22
     param->crQpOffset = 0;
23
     param->rdPenalty = 0;
24
-    param->psyRd = 0.3;
25
+    param->psyRd = 2.0;
26
     param->psyRdoq = 0.0;
27
     param->analysisMode = 0;
28
     param->analysisFileName = NULL;
29
@@ -241,6 +242,10 @@
30
     param->vui.defDispWinRightOffset = 0;
31
     param->vui.defDispWinTopOffset = 0;
32
     param->vui.defDispWinBottomOffset = 0;
33
+    param->maxCLL = 0;
34
+    param->maxFALL = 0;
35
+    param->minLuma = 0;
36
+    param->maxLuma = (1 << X265_DEPTH) - 1;
37
 }
38
 
39
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
40
@@ -274,9 +279,9 @@
41
             param->bEnableWeightedPred = 0;
42
             param->rdLevel = 2;
43
             param->maxNumReferences = 1;
44
+            param->limitReferences = 0;
45
             param->rc.aqStrength = 0.0;
46
             param->rc.aqMode = X265_AQ_NONE;
47
-            param->rc.cuTree = 0;
48
             param->rc.qgSize = 32;
49
             param->bEnableFastIntra = 1;
50
         }
51
@@ -291,9 +296,9 @@
52
             param->bEnableWeightedPred = 0;
53
             param->rdLevel = 2;
54
             param->maxNumReferences = 1;
55
+            param->limitReferences = 0;
56
             param->rc.aqStrength = 0.0;
57
             param->rc.aqMode = X265_AQ_NONE;
58
-            param->rc.cuTree = 0;
59
             param->rc.qgSize = 32;
60
             param->bEnableSAO = 0;
61
             param->bEnableFastIntra = 1;
62
@@ -301,13 +306,11 @@
63
         else if (!strcmp(preset, "veryfast"))
64
         {
65
             param->lookaheadDepth = 15;
66
-            param->maxCUSize = 32;
67
             param->bFrameAdaptive = 0;
68
             param->subpelRefine = 1;
69
             param->bEnableEarlySkip = 1;
70
             param->rdLevel = 2;
71
-            param->maxNumReferences = 1;
72
-            param->rc.cuTree = 0;
73
+            param->maxNumReferences = 2;
74
             param->rc.qgSize = 32;
75
             param->bEnableFastIntra = 1;
76
         }
77
@@ -317,8 +320,7 @@
78
             param->bFrameAdaptive = 0;
79
             param->bEnableEarlySkip = 1;
80
             param->rdLevel = 2;
81
-            param->maxNumReferences = 1;
82
-            param->rc.cuTree = 0;
83
+            param->maxNumReferences = 2;
84
             param->bEnableFastIntra = 1;
85
         }
86
         else if (!strcmp(preset, "fast"))
87
@@ -326,7 +328,7 @@
88
             param->lookaheadDepth = 15;
89
             param->bFrameAdaptive = 0;
90
             param->rdLevel = 2;
91
-            param->maxNumReferences = 2;
92
+            param->maxNumReferences = 3;
93
             param->bEnableFastIntra = 1;
94
         }
95
         else if (!strcmp(preset, "medium"))
96
@@ -343,6 +345,9 @@
97
             param->subpelRefine = 3;
98
             param->maxNumMergeCand = 3;
99
             param->searchMethod = X265_STAR_SEARCH;
100
+            param->maxNumReferences = 4;
101
+            param->limitModes = 1;
102
+            param->lookaheadSlices = 4; // limit parallelism as already enough work exists
103
         }
104
         else if (!strcmp(preset, "slower"))
105
         {
106
@@ -359,7 +364,11 @@
107
             param->subpelRefine = 3;
108
             param->maxNumMergeCand = 3;
109
             param->searchMethod = X265_STAR_SEARCH;
110
+            param->maxNumReferences = 4;
111
+            param->limitReferences = 2;
112
+            param->limitModes = 1;
113
             param->bIntraInBFrames = 1;
114
+            param->lookaheadSlices = 4; // limit parallelism as already enough work exists
115
         }
116
         else if (!strcmp(preset, "veryslow"))
117
         {
118
@@ -377,7 +386,10 @@
119
             param->maxNumMergeCand = 4;
120
             param->searchMethod = X265_STAR_SEARCH;
121
             param->maxNumReferences = 5;
122
+            param->limitReferences = 1;
123
+            param->limitModes = 1;
124
             param->bIntraInBFrames = 1;
125
+            param->lookaheadSlices = 0; // disabled for best quality
126
         }
127
         else if (!strcmp(preset, "placebo"))
128
         {
129
@@ -397,8 +409,10 @@
130
             param->searchMethod = X265_STAR_SEARCH;
131
             param->bEnableTransformSkip = 1;
132
             param->maxNumReferences = 5;
133
+            param->limitReferences = 0;
134
             param->rc.bEnableSlowFirstPass = 1;
135
             param->bIntraInBFrames = 1;
136
+            param->lookaheadSlices = 0; // disabled for best quality
137
             // TODO: optimized esa
138
         }
139
         else
140
@@ -565,10 +579,14 @@
141
     OPT2("level-idc", "level")
142
     {
143
         /* allow "5.1" or "51", both converted to integer 51 */
144
-        if (atof(value) < 7)
145
+        /* if level-idc specifies an obviously wrong value in either float or int form,
146
+        throw an error consistently. Stronger level checking will be done in encoder_open() */
147
+        if (atof(value) < 10)
148
             p->levelIdc = (int)(10 * atof(value) + .5);
149
-        else
150
+        else if (atoi(value) < 100)
151
             p->levelIdc = atoi(value);
152
+        else 
153
+            bError = true;
154
     }
155
     OPT("high-tier") p->bHighTier = atobool(value);
156
     OPT("allow-non-conformance") p->bAllowNonConformance = atobool(value);
157
@@ -608,6 +626,7 @@
158
     OPT2("constrained-intra", "cip") p->bEnableConstrainedIntra = atobool(value);
159
     OPT("fast-intra") p->bEnableFastIntra = atobool(value);
160
     OPT("open-gop") p->bOpenGOP = atobool(value);
161
+    OPT("intra-refresh") p->bIntraRefresh = atobool(value);
162
     OPT("lookahead-slices") p->lookaheadSlices = atoi(value);
163
     OPT("scenecut")
164
     {
165
@@ -644,6 +663,7 @@
166
     }
167
     OPT("ref") p->maxNumReferences = atoi(value);
168
     OPT("limit-refs") p->limitReferences = atoi(value);
169
+    OPT("limit-modes") p->limitModes = atobool(value);
170
     OPT("weightp") p->bEnableWeightedPred = atobool(value);
171
     OPT("weightb") p->bEnableWeightedBiPred = atobool(value);
172
     OPT("cbqpoffs") p->cbQpOffset = atoi(value);
173
@@ -854,7 +874,9 @@
174
     OPT("analysis-file") p->analysisFileName = strdup(value);
175
     OPT("qg-size") p->rc.qgSize = atoi(value);
176
     OPT("master-display") p->masteringDisplayColorVolume = strdup(value);
177
-    OPT("max-cll") p->contentLightLevelInfo = strdup(value);
178
+    OPT("max-cll") bError |= sscanf(value, "%hu,%hu", &p->maxCLL, &p->maxFALL) != 2;
179
+    OPT("min-luma") p->minLuma = (uint16_t)atoi(value);
180
+    OPT("max-luma") p->maxLuma = (uint16_t)atoi(value);
181
     else
182
         return X265_PARAM_BAD_NAME;
183
 #undef OPT
184
@@ -1035,6 +1057,8 @@
185
           "subme must be greater than or equal to 0");
186
     CHECK(param->limitReferences > 3,
187
           "limitReferences must be 0, 1, 2 or 3");
188
+    CHECK(param->limitModes > 1,
189
+          "limitRectAmp must be 0, 1");
190
     CHECK(param->frameNumThreads < 0 || param->frameNumThreads > X265_MAX_FRAME_THREADS,
191
           "frameNumThreads (--frame-threads) must be [0 .. X265_MAX_FRAME_THREADS)");
192
     CHECK(param->cbQpOffset < -12, "Min. Chroma Cb QP Offset is -12");
193
@@ -1063,8 +1087,8 @@
194
 
195
     CHECK(param->sourceWidth < (int)param->maxCUSize || param->sourceHeight < (int)param->maxCUSize,
196
           "Picture size must be at least one CTU");
197
-    CHECK(param->internalCsp < X265_CSP_I420 || X265_CSP_I444 < param->internalCsp,
198
-          "Color space must be i420, i422, or i444");
199
+    CHECK(param->internalCsp < X265_CSP_I400 || X265_CSP_I444 < param->internalCsp,
200
+          "chroma subsampling must be i400 (4:0:0 monochrome), i420 (4:2:0 default), i422 (4:2:0), i444 (4:4:4)");
201
     CHECK(param->sourceWidth & !!CHROMA_H_SHIFT(param->internalCsp),
202
           "Picture width must be an integer multiple of the specified chroma subsampling");
203
     CHECK(param->sourceHeight & !!CHROMA_V_SHIFT(param->internalCsp),
204
@@ -1094,7 +1118,7 @@
205
           "deblocking filter tC offset must be in the range of -6 to +6");
206
     CHECK(param->deblockingFilterBetaOffset < -6 || param->deblockingFilterBetaOffset > 6,
207
           "deblocking filter Beta offset must be in the range of -6 to +6");
208
-    CHECK(param->psyRd < 0 || 2.0 < param->psyRd, "Psy-rd strength must be between 0 and 2.0");
209
+    CHECK(param->psyRd < 0 || 5.0 < param->psyRd, "Psy-rd strength must be between 0 and 5.0");
210
     CHECK(param->psyRdoq < 0 || 50.0 < param->psyRdoq, "Psy-rdoq strength must be between 0 and 50.0");
211
     CHECK(param->bEnableWavefront < 0, "WaveFrontSynchro cannot be negative");
212
     CHECK((param->vui.aspectRatioIdc < 0
213
@@ -1170,7 +1194,7 @@
214
         CHECK(0 > param->noiseReductionIntra || param->noiseReductionIntra > 2000, "Valid noise reduction range 0 - 2000");
215
     if (param->noiseReductionInter)
216
         CHECK(0 > param->noiseReductionInter || param->noiseReductionInter > 2000, "Valid noise reduction range 0 - 2000");
217
-    CHECK(param->rc.rateControlMode == X265_RC_CRF && param->rc.bStatRead,
218
+    CHECK(param->rc.rateControlMode == X265_RC_CRF && param->rc.bStatRead && param->rc.vbvMaxBitrate == 0,
219
           "Constant rate-factor is incompatible with 2pass");
220
     CHECK(param->rc.rateControlMode == X265_RC_CQP && param->rc.bStatRead,
221
           "Constant QP is incompatible with 2pass");
222
@@ -1307,6 +1331,7 @@
223
 #define TOOLVAL(VAL, STR)  if (VAL) { sprintf(tmp, STR, VAL); appendtool(param, buf, sizeof(buf), tmp); }
224
     TOOLOPT(param->bEnableRectInter, "rect");
225
     TOOLOPT(param->bEnableAMP, "amp");
226
+    TOOLOPT(param->limitModes, "limit-modes");
227
     TOOLVAL(param->rdLevel, "rd=%d");
228
     TOOLVAL(param->psyRd, "psy-rd=%.2lf");
229
     TOOLVAL(param->rdoqLevel, "rdoq=%d");
230
@@ -1428,6 +1453,7 @@
231
     s += sprintf(s, " b-adapt=%d", p->bFrameAdaptive);
232
     s += sprintf(s, " ref=%d", p->maxNumReferences);
233
     s += sprintf(s, " limit-refs=%d", p->limitReferences);
234
+    BOOL(p->limitModes, "limit-modes");
235
     BOOL(p->bEnableWeightedPred, "weightp");
236
     BOOL(p->bEnableWeightedBiPred, "weightb");
237
     s += sprintf(s, " aq-mode=%d", p->rc.aqMode);
238
@@ -1447,6 +1473,7 @@
239
     BOOL(p->bSaoNonDeblocked, "sao-non-deblock");
240
     BOOL(p->bBPyramid, "b-pyramid");
241
     BOOL(p->rc.cuTree, "cutree");
242
+    BOOL(p->bIntraRefresh, "intra-refresh");
243
     s += sprintf(s, " rc=%s", p->rc.rateControlMode == X265_RC_ABR ? (
244
          p->rc.bStatRead ? "2 pass" : p->rc.bitrate == p->rc.vbvMaxBitrate ? "cbr" : "abr")
245
          : p->rc.rateControlMode == X265_RC_CRF ? "crf" : "cqp");
246
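
All of the new switches above are reachable through the existing x265_param_parse entry point, which is also what the CLI option table feeds. A short sketch exercising the 1.9 additions; the values are arbitrary examples:

    x265_param* p = x265_param_alloc();
    x265_param_default_preset(p, "medium", NULL);

    x265_param_parse(p, "limit-modes", "1");     // new mode-analysis limiter
    x265_param_parse(p, "intra-refresh", "1");   // periodic intra column
    x265_param_parse(p, "max-cll", "1000,400");  // parsed as maxCLL,maxFALL
    x265_param_parse(p, "min-luma", "64");       // HDR luma clipping range
    x265_param_parse(p, "max-luma", "940");
    x265_param_parse(p, "level-idc", "5.1");     // "5.1" and "51" both -> 51

    x265_param_free(p);
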
x265_1.8.tar.gz/source/common/picyuv.cpp -> x265_1.9.tar.gz/source/common/picyuv.cpp Changed
302
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2015 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -42,6 +43,9 @@
10
     m_cuOffsetC = NULL;
11
     m_buOffsetY = NULL;
12
     m_buOffsetC = NULL;
13
+
14
+    m_maxLumaLevel = 0;
15
+    m_avgLumaLevel = 0;
16
 }
17
 
18
 bool PicYuv::create(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp)
19
@@ -59,20 +63,27 @@
20
     m_lumaMarginY = g_maxCUSize + 16; // margin for 8-tap filter and infinite padding
21
     m_stride = (numCuInWidth * g_maxCUSize) + (m_lumaMarginX << 1);
22
 
23
-    m_chromaMarginX = m_lumaMarginX;  // keep 16-byte alignment for chroma CTUs
24
-    m_chromaMarginY = m_lumaMarginY >> m_vChromaShift;
25
-
26
-    m_strideC = ((numCuInWidth * g_maxCUSize) >> m_hChromaShift) + (m_chromaMarginX * 2);
27
     int maxHeight = numCuInHeight * g_maxCUSize;
28
-
29
     CHECKED_MALLOC(m_picBuf[0], pixel, m_stride * (maxHeight + (m_lumaMarginY * 2)));
30
-    CHECKED_MALLOC(m_picBuf[1], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
31
-    CHECKED_MALLOC(m_picBuf[2], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
32
+    m_picOrg[0] = m_picBuf[0] + m_lumaMarginY * m_stride + m_lumaMarginX;
33
+
34
+    if (picCsp != X265_CSP_I400)
35
+    {
36
+        m_chromaMarginX = m_lumaMarginX;  // keep 16-byte alignment for chroma CTUs
37
+        m_chromaMarginY = m_lumaMarginY >> m_vChromaShift;
38
+        m_strideC = ((numCuInWidth * g_maxCUSize) >> m_hChromaShift) + (m_chromaMarginX * 2);
39
 
40
-    m_picOrg[0] = m_picBuf[0] + m_lumaMarginY   * m_stride  + m_lumaMarginX;
41
-    m_picOrg[1] = m_picBuf[1] + m_chromaMarginY * m_strideC + m_chromaMarginX;
42
-    m_picOrg[2] = m_picBuf[2] + m_chromaMarginY * m_strideC + m_chromaMarginX;
43
+        CHECKED_MALLOC(m_picBuf[1], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
44
+        CHECKED_MALLOC(m_picBuf[2], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
45
 
46
+        m_picOrg[1] = m_picBuf[1] + m_chromaMarginY * m_strideC + m_chromaMarginX;
47
+        m_picOrg[2] = m_picBuf[2] + m_chromaMarginY * m_strideC + m_chromaMarginX;
48
+    }
49
+    else
50
+    {
51
+        m_picBuf[1] = m_picBuf[2] = NULL;
52
+        m_picOrg[1] = m_picOrg[2] = NULL;
53
+    }
54
     return true;
55
 
56
 fail:
57
@@ -85,27 +96,45 @@
58
 bool PicYuv::createOffsets(const SPS& sps)
59
 {
60
     uint32_t numPartitions = 1 << (g_unitSizeDepth * 2);
61
-    CHECKED_MALLOC(m_cuOffsetY, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
62
-    CHECKED_MALLOC(m_cuOffsetC, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
63
-    for (uint32_t cuRow = 0; cuRow < sps.numCuInHeight; cuRow++)
64
+
65
+    if (m_picCsp != X265_CSP_I400)
66
     {
67
-        for (uint32_t cuCol = 0; cuCol < sps.numCuInWidth; cuCol++)
68
+        CHECKED_MALLOC(m_cuOffsetY, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
69
+        CHECKED_MALLOC(m_cuOffsetC, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
70
+        for (uint32_t cuRow = 0; cuRow < sps.numCuInHeight; cuRow++)
71
         {
72
-            m_cuOffsetY[cuRow * sps.numCuInWidth + cuCol] = m_stride * cuRow * g_maxCUSize + cuCol * g_maxCUSize;
73
-            m_cuOffsetC[cuRow * sps.numCuInWidth + cuCol] = m_strideC * cuRow * (g_maxCUSize >> m_vChromaShift) + cuCol * (g_maxCUSize >> m_hChromaShift);
74
+            for (uint32_t cuCol = 0; cuCol < sps.numCuInWidth; cuCol++)
75
+            {
76
+                m_cuOffsetY[cuRow * sps.numCuInWidth + cuCol] = m_stride * cuRow * g_maxCUSize + cuCol * g_maxCUSize;
77
+                m_cuOffsetC[cuRow * sps.numCuInWidth + cuCol] = m_strideC * cuRow * (g_maxCUSize >> m_vChromaShift) + cuCol * (g_maxCUSize >> m_hChromaShift);
78
+            }
79
         }
80
-    }
81
 
82
-    CHECKED_MALLOC(m_buOffsetY, intptr_t, (size_t)numPartitions);
83
-    CHECKED_MALLOC(m_buOffsetC, intptr_t, (size_t)numPartitions);
84
-    for (uint32_t idx = 0; idx < numPartitions; ++idx)
85
-    {
86
-        intptr_t x = g_zscanToPelX[idx];
87
-        intptr_t y = g_zscanToPelY[idx];
88
-        m_buOffsetY[idx] = m_stride * y + x;
89
-        m_buOffsetC[idx] = m_strideC * (y >> m_vChromaShift) + (x >> m_hChromaShift);
90
+        CHECKED_MALLOC(m_buOffsetY, intptr_t, (size_t)numPartitions);
91
+        CHECKED_MALLOC(m_buOffsetC, intptr_t, (size_t)numPartitions);
92
+        for (uint32_t idx = 0; idx < numPartitions; ++idx)
93
+        {
94
+            intptr_t x = g_zscanToPelX[idx];
95
+            intptr_t y = g_zscanToPelY[idx];
96
+            m_buOffsetY[idx] = m_stride * y + x;
97
+            m_buOffsetC[idx] = m_strideC * (y >> m_vChromaShift) + (x >> m_hChromaShift);
98
+        }
99
     }
100
+    else
101
+    {
102
+        CHECKED_MALLOC(m_cuOffsetY, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
103
+        for (uint32_t cuRow = 0; cuRow < sps.numCuInHeight; cuRow++)
104
+        for (uint32_t cuCol = 0; cuCol < sps.numCuInWidth; cuCol++)
105
+            m_cuOffsetY[cuRow * sps.numCuInWidth + cuCol] = m_stride * cuRow * g_maxCUSize + cuCol * g_maxCUSize;
106
 
107
+        CHECKED_MALLOC(m_buOffsetY, intptr_t, (size_t)numPartitions);
108
+        for (uint32_t idx = 0; idx < numPartitions; ++idx)
109
+        {
110
+            intptr_t x = g_zscanToPelX[idx];
111
+            intptr_t y = g_zscanToPelY[idx];
112
+            m_buOffsetY[idx] = m_stride * y + x;
113
+        }
114
+    }
115
     return true;
116
 
117
 fail:
118
@@ -121,7 +150,7 @@
119
 
120
 /* Copy pixels from an x265_picture into internal PicYuv instance.
121
  * Shift pixels as necessary, mask off bits above X265_DEPTH for safety. */
122
-void PicYuv::copyFromPicture(const x265_picture& pic, int padx, int pady)
123
+void PicYuv::copyFromPicture(const x265_picture& pic, const x265_param& param, int padx, int pady)
124
 {
125
     /* m_picWidth is the width that is being encoded, padx indicates how many
126
      * of those pixels are padding to reach multiple of MinCU(4) size.
127
@@ -155,28 +184,29 @@
128
 #if (X265_DEPTH > 8)
129
         {
130
             pixel *yPixel = m_picOrg[0];
131
-            pixel *uPixel = m_picOrg[1];
132
-            pixel *vPixel = m_picOrg[2];
133
 
134
             uint8_t *yChar = (uint8_t*)pic.planes[0];
135
-            uint8_t *uChar = (uint8_t*)pic.planes[1];
136
-            uint8_t *vChar = (uint8_t*)pic.planes[2];
137
             int shift = (X265_DEPTH - 8);
138
 
139
             primitives.planecopy_cp(yChar, pic.stride[0] / sizeof(*yChar), yPixel, m_stride, width, height, shift);
140
-            primitives.planecopy_cp(uChar, pic.stride[1] / sizeof(*uChar), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
141
-            primitives.planecopy_cp(vChar, pic.stride[2] / sizeof(*vChar), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
142
+
143
+            if (pic.colorSpace != X265_CSP_I400)
144
+            {
145
+                pixel *uPixel = m_picOrg[1];
146
+                pixel *vPixel = m_picOrg[2];
147
+
148
+                uint8_t *uChar = (uint8_t*)pic.planes[1];
149
+                uint8_t *vChar = (uint8_t*)pic.planes[2];
150
+
151
+                primitives.planecopy_cp(uChar, pic.stride[1] / sizeof(*uChar), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
152
+                primitives.planecopy_cp(vChar, pic.stride[2] / sizeof(*vChar), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
153
+            }
154
         }
155
 #else /* Case for (X265_DEPTH == 8) */
156
         // TODO: Do we need this path? It may be merged into the one above in the future
157
         {
             pixel *yPixel = m_picOrg[0];
-            pixel *uPixel = m_picOrg[1];
-            pixel *vPixel = m_picOrg[2];
-
             uint8_t *yChar = (uint8_t*)pic.planes[0];
-            uint8_t *uChar = (uint8_t*)pic.planes[1];
-            uint8_t *vChar = (uint8_t*)pic.planes[2];

             for (int r = 0; r < height; r++)
             {
@@ -186,15 +216,24 @@
                 yChar += pic.stride[0] / sizeof(*yChar);
             }

-            for (int r = 0; r < height >> m_vChromaShift; r++)
+            if (pic.colorSpace != X265_CSP_I400)
             {
-                memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
-                memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
+                pixel *uPixel = m_picOrg[1];
+                pixel *vPixel = m_picOrg[2];
+
+                uint8_t *uChar = (uint8_t*)pic.planes[1];
+                uint8_t *vChar = (uint8_t*)pic.planes[2];
+
+                for (int r = 0; r < height >> m_vChromaShift; r++)
+                {
+                    memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
+                    memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));

-                uPixel += m_strideC;
-                vPixel += m_strideC;
-                uChar += pic.stride[1] / sizeof(*uChar);
-                vChar += pic.stride[2] / sizeof(*vChar);
+                    uPixel += m_strideC;
+                    vPixel += m_strideC;
+                    uChar += pic.stride[1] / sizeof(*uChar);
+                    vChar += pic.stride[2] / sizeof(*vChar);
+                }
             }
         }
 #endif /* (X265_DEPTH > 8) */
@@ -205,43 +244,63 @@
         uint16_t mask = (1 << X265_DEPTH) - 1;
         int shift = abs(pic.bitDepth - X265_DEPTH);
         pixel *yPixel = m_picOrg[0];
-        pixel *uPixel = m_picOrg[1];
-        pixel *vPixel = m_picOrg[2];

         uint16_t *yShort = (uint16_t*)pic.planes[0];
-        uint16_t *uShort = (uint16_t*)pic.planes[1];
-        uint16_t *vShort = (uint16_t*)pic.planes[2];

         if (pic.bitDepth > X265_DEPTH)
         {
             /* shift right and mask pixels to final size */
             primitives.planecopy_sp(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
-            primitives.planecopy_sp(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
-            primitives.planecopy_sp(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
         }
         else /* Case for (pic.bitDepth <= X265_DEPTH) */
         {
             /* shift left and mask pixels to final size */
             primitives.planecopy_sp_shl(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
-            primitives.planecopy_sp_shl(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
-            primitives.planecopy_sp_shl(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+        }
+
+        if (pic.colorSpace != X265_CSP_I400)
+        {
+            pixel *uPixel = m_picOrg[1];
+            pixel *vPixel = m_picOrg[2];
+
+            uint16_t *uShort = (uint16_t*)pic.planes[1];
+            uint16_t *vShort = (uint16_t*)pic.planes[2];
+
+            if (pic.bitDepth > X265_DEPTH)
+            {
+                primitives.planecopy_sp(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+                primitives.planecopy_sp(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+            }
+            else /* Case for (pic.bitDepth <= X265_DEPTH) */
+            {
+                primitives.planecopy_sp_shl(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+                primitives.planecopy_sp_shl(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+            }
         }
     }

     /* extend the right edge if width was not multiple of the minimum CU size */
-    if (padx)
+    uint64_t sumLuma;
+    pixel *Y = m_picOrg[0];
+    m_maxLumaLevel = primitives.planeClipAndMax(Y, m_stride, width, height, &sumLuma, (pixel)param.minLuma, (pixel)param.maxLuma);
+    m_avgLumaLevel = (double)(sumLuma) / (m_picHeight * m_picWidth);
+
+    for (int r = 0; r < height; r++)
     {
-        pixel *Y = m_picOrg[0];
-        pixel *U = m_picOrg[1];
-        pixel *V = m_picOrg[2];
+        for (int x = 0; x < padx; x++)
+            Y[width + x] = Y[width - 1];
+        Y += m_stride;
+    }

-        for (int r = 0; r < height; r++)
-        {
-            for (int x = 0; x < padx; x++)
-                Y[width + x] = Y[width - 1];
+    /* extend the bottom if height was not multiple of the minimum CU size */
+    Y = m_picOrg[0] + (height - 1) * m_stride;
+    for (int i = 1; i <= pady; i++)
+        memcpy(Y + i * m_stride, Y, (width + padx) * sizeof(pixel));

-            Y += m_stride;
-        }
+    if (pic.colorSpace != X265_CSP_I400)
+    {
+        pixel *U = m_picOrg[1];
+        pixel *V = m_picOrg[2];

         for (int r = 0; r < height >> m_vChromaShift; r++)
         {
@@ -254,17 +313,9 @@
             U += m_strideC;
             V += m_strideC;
         }
-    }
-
-    /* extend the bottom if height was not multiple of the minimum CU size */
-    if (pady)
-    {
-        pixel *Y = m_picOrg[0] + (height - 1) * m_stride;
-        pixel *U = m_picOrg[1] + ((height >> m_vChromaShift) - 1) * m_strideC;
-        pixel *V = m_picOrg[2] + ((height >> m_vChromaShift) - 1) * m_strideC;

-        for (int i = 1; i <= pady; i++)
-            memcpy(Y + i * m_stride, Y, (width + padx) * sizeof(pixel));
+        U = m_picOrg[1] + ((height >> m_vChromaShift) - 1) * m_strideC;
+        V = m_picOrg[2] + ((height >> m_vChromaShift) - 1) * m_strideC;

         for (int j = 1; j <= pady >> m_vChromaShift; j++)
         {
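The padding rewrite above also drops the old `if (padx)` / `if (pady)` guards: a zero pad count simply makes the loops execute zero iterations, so the guards were dead weight. A minimal standalone sketch of the same bottom/right extension; the helper name and 8-bit pixel typedef are illustrative, not encoder code:

    #include <cstring>
    #include <cstdint>

    typedef uint8_t pixel; // stands in for x265's build-time pixel type

    // Replicate the last real column padx times, then the last real row pady
    // times. With padx == 0 and pady == 0 both loops are no-ops, so no guard
    // is needed, which is exactly the simplification made in copyFromPicture().
    static void extendPicBorder(pixel* plane, intptr_t stride,
                                int width, int height, int padx, int pady)
    {
        pixel* Y = plane;
        for (int r = 0; r < height; r++)
        {
            for (int x = 0; x < padx; x++)
                Y[width + x] = Y[width - 1];
            Y += stride;
        }

        Y = plane + (height - 1) * stride;
        for (int i = 1; i <= pady; i++)
            memcpy(Y + i * stride, Y, (width + padx) * sizeof(pixel));
    }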
x265_1.8.tar.gz/source/common/picyuv.h -> x265_1.9.tar.gz/source/common/picyuv.h Changed

@@ -60,13 +60,16 @@
     uint32_t m_chromaMarginX;
     uint32_t m_chromaMarginY;

+    uint16_t m_maxLumaLevel;
+    double   m_avgLumaLevel;
+
     PicYuv();

     bool  create(uint32_t picWidth, uint32_t picHeight, uint32_t csp);
     bool  createOffsets(const SPS& sps);
     void  destroy();

-    void  copyFromPicture(const x265_picture&, int padx, int pady);
+    void  copyFromPicture(const x265_picture&, const x265_param& param, int padx, int pady);

     intptr_t getChromaAddrOffset(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }

x265_1.8.tar.gz/source/common/pixel.cpp -> x265_1.9.tar.gz/source/common/pixel.cpp Changed

@@ -25,6 +25,7 @@
  *****************************************************************************/

 #include "common.h"
+#include "slicetype.h"      // LOWRES_COST_MASK
 #include "primitives.h"
 #include "x265.h"

@@ -117,9 +118,9 @@
 }

 template<int lx, int ly, class T1, class T2>
-sse_ret_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
+sse_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
 {
-    sse_ret_t sum = 0;
+    sse_t sum = 0;
     int tmp;

     for (int y = 0; y < ly; y++)
@@ -187,37 +188,6 @@
     return (int)(sum >> 1);
 }

-static int satd_4x4(const int16_t* pix1, intptr_t stride_pix1)
-{
-    int32_t tmp[4][4];
-    int32_t s01, s23, d01, d23;
-    int32_t satd = 0;
-    int d;
-
-    for (d = 0; d < 4; d++, pix1 += stride_pix1)
-    {
-        s01 = pix1[0] + pix1[1];
-        s23 = pix1[2] + pix1[3];
-        d01 = pix1[0] - pix1[1];
-        d23 = pix1[2] - pix1[3];
-
-        tmp[d][0] = s01 + s23;
-        tmp[d][1] = s01 - s23;
-        tmp[d][2] = d01 - d23;
-        tmp[d][3] = d01 + d23;
-    }
-
-    for (d = 0; d < 4; d++)
-    {
-        s01 = tmp[0][d] + tmp[1][d];
-        s23 = tmp[2][d] + tmp[3][d];
-        d01 = tmp[0][d] - tmp[1][d];
-        d23 = tmp[2][d] - tmp[3][d];
-        satd += abs(s01 + s23) + abs(s01 - s23) + abs(d01 - d23) + abs(d01 + d23);
-    }
-    return (int)(satd / 2);
-}
-
 // x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once
 static int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
 {
@@ -313,57 +283,6 @@
     return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
 }

-inline int _sa8d_8x8(const int16_t* pix1, intptr_t i_pix1)
-{
-    int32_t tmp[8][8];
-    int32_t a0, a1, a2, a3, a4, a5, a6, a7;
-    int32_t sum = 0;
-
-    for (int i = 0; i < 8; i++, pix1 += i_pix1)
-    {
-        a0 = pix1[0] + pix1[1];
-        a1 = pix1[2] + pix1[3];
-        a2 = pix1[4] + pix1[5];
-        a3 = pix1[6] + pix1[7];
-        a4 = pix1[0] - pix1[1];
-        a5 = pix1[2] - pix1[3];
-        a6 = pix1[4] - pix1[5];
-        a7 = pix1[6] - pix1[7];
-        tmp[i][0] = (a0 + a1) + (a2 + a3);
-        tmp[i][1] = (a0 + a1) - (a2 + a3);
-        tmp[i][2] = (a0 - a1) + (a2 - a3);
-        tmp[i][3] = (a0 - a1) - (a2 - a3);
-        tmp[i][4] = (a4 + a5) + (a6 + a7);
-        tmp[i][5] = (a4 + a5) - (a6 + a7);
-        tmp[i][6] = (a4 - a5) + (a6 - a7);
-        tmp[i][7] = (a4 - a5) - (a6 - a7);
-    }
-
-    for (int i = 0; i < 8; i++)
-    {
-        a0 = (tmp[0][i] + tmp[1][i]) + (tmp[2][i] + tmp[3][i]);
-        a2 = (tmp[0][i] + tmp[1][i]) - (tmp[2][i] + tmp[3][i]);
-        a1 = (tmp[0][i] - tmp[1][i]) + (tmp[2][i] - tmp[3][i]);
-        a3 = (tmp[0][i] - tmp[1][i]) - (tmp[2][i] - tmp[3][i]);
-        a4 = (tmp[4][i] + tmp[5][i]) + (tmp[6][i] + tmp[7][i]);
-        a6 = (tmp[4][i] + tmp[5][i]) - (tmp[6][i] + tmp[7][i]);
-        a5 = (tmp[4][i] - tmp[5][i]) + (tmp[6][i] - tmp[7][i]);
-        a7 = (tmp[4][i] - tmp[5][i]) - (tmp[6][i] - tmp[7][i]);
-        a0 = abs(a0 + a4) + abs(a0 - a4);
-        a0 += abs(a1 + a5) + abs(a1 - a5);
-        a0 += abs(a2 + a6) + abs(a2 - a6);
-        a0 += abs(a3 + a7) + abs(a3 - a7);
-        sum += a0;
-    }
-
-    return (int)sum;
-}
-
-static int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1)
-{
-    return (int)((_sa8d_8x8(pix1, i_pix1) + 2) >> 2);
-}
-
 static int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
 {
     int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
@@ -403,9 +322,9 @@
 }

 template<int size>
-int pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
+sse_t pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
 {
-    int sum = 0;
+    sse_t sum = 0;
     for (int y = 0; y < size; y++)
     {
         for (int x = 0; x < size; x++)
@@ -783,39 +702,6 @@
     }
 }

-template<int size>
-int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
-{
-    static int16_t zeroBuf[8] /* = { 0 } */;
-
-    if (size)
-    {
-        int dim = 1 << (size + 2);
-        uint32_t totEnergy = 0;
-        for (int i = 0; i < dim; i += 8)
-        {
-            for (int j = 0; j < dim; j+= 8)
-            {
-                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
-                int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride) -
-                                   (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
-                int reconEnergy =  sa8d_8x8(recon + i * rstride + j, rstride) -
-                                   (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);
-
-                totEnergy += abs(sourceEnergy - reconEnergy);
-            }
-        }
-        return totEnergy;
-    }
-    else
-    {
-        /* 4x4 is too small for sa8d */
-        int sourceEnergy = satd_4x4(source, sstride) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
-        int reconEnergy = satd_4x4(recon, rstride) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
-        return abs(sourceEnergy - reconEnergy);
-    }
-}
-
 template<int bx, int by>
 void blockcopy_pp_c(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb)
 {
@@ -960,19 +846,57 @@
 /* Estimate the total amount of influence on future quality that could be had if we
  * were to improve the reference samples used to inter predict any given CU. */
 static void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
-                             const int32_t* invQscales, const double* fpsFactor, int len)
+                                    const int32_t* invQscales, const double* fpsFactor, int len)
 {
-    double fps = *fpsFactor / 256;
+    double fps = *fpsFactor / 256;  // range[0.01, 1.00]

     for (int i = 0; i < len; i++)
     {
-        double intraCost       = intraCosts[i] * invQscales[i];
-        double propagateAmount = (double)propagateIn[i] + intraCost * fps;
-        double propagateNum    = (double)intraCosts[i] - (interCosts[i] & ((1 << 14) - 1));
-        double propagateDenom  = (double)intraCosts[i];
+        int intraCost = intraCosts[i];
+        int interCost = X265_MIN(intraCosts[i], interCosts[i] & LOWRES_COST_MASK);
+        double propagateIntra  = intraCost * invQscales[i]; // Q16 x Q8.8 = Q24.8
+        double propagateAmount = (double)propagateIn[i] + propagateIntra * fps; // Q16.0 + Q24.8 x Q0.x = Q25.0
+        double propagateNum    = (double)(intraCost - interCost); // Q32 - Q32 = Q33.0
+
+#if 0
+        // algorithm whose output matches the asm
+        float intraRcp = (float)1.0f / intraCost;   // VC can't map this onto RCPPS
+        float intraRcpError1 = (float)intraCost * (float)intraRcp;
+        intraRcpError1 *= (float)intraRcp;
+        float intraRcpError2 = intraRcp + intraRcp;
+        float propagateDenom = intraRcpError2 - intraRcpError1;
+        dst[i] = (int)(propagateAmount * propagateNum * (double)propagateDenom + 0.5);
+#else
+        double propagateDenom  = (double)intraCost;             // Q32
         dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
+#endif
     }
 }
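To make the fixed-point annotations concrete, here is the reference branch evaluated by hand for one block; all values are invented for illustration (invQscale is Q8.8, so 256 represents 1.0):

    #include <cstdio>

    int main()
    {
        double fps = 0.5;                  // *fpsFactor / 256, documented range [0.01, 1.00]
        int    intraCost = 5000;
        int    interCost = 4000;           // interCosts[i] & LOWRES_COST_MASK, capped at intraCost
        double propagateIntra  = intraCost * 256.0;               // invQscale = 256 -> 1280000
        double propagateAmount = 100.0 + propagateIntra * fps;    // propagateIn = 100 -> 640100
        double propagateNum    = (double)(intraCost - interCost); // 1000
        printf("%d\n", (int)(propagateAmount * propagateNum / intraCost + 0.5)); // prints 128020
        return 0;
    }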
+
+static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix)
+{
+    pixel maxLumaLevel = 0;
+    uint64_t sumLuma = 0;
+
+    for (int r = 0; r < height; r++)
+    {
+        for (int c = 0; c < width; c++)
+        {
+            /* Clip luma of source picture to max and min values before extending edges of picYuv */
+            src[c] = x265_clip3((pixel)minPix, (pixel)maxPix, src[c]);
+
+            /* Determine maximum and average luma level in a picture */
+            maxLumaLevel = X265_MAX(src[c], maxLumaLevel);
+            sumLuma += src[c];
+        }
+
+        src += stride;
+    }
+
+    *outsum = sumLuma;
+    return maxLumaLevel;
+}
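The routine both clips in place and gathers the statistics that copyFromPicture stores in m_maxLumaLevel / m_avgLumaLevel. A self-contained restatement over a tiny 4x2 plane, with std::min/std::max standing in for x265_clip3 and X265_MAX:

    #include <algorithm>
    #include <cstdint>

    typedef uint8_t pixel;

    int main()
    {
        pixel plane[8] = { 0, 50, 100, 150, 200, 250, 30, 240 };
        const pixel minPix = 16, maxPix = 235;   // param.minLuma / param.maxLuma in the real call
        uint64_t sum = 0;
        pixel maxLevel = 0;
        for (int i = 0; i < 8; i++)
        {
            plane[i] = std::min(maxPix, std::max(minPix, plane[i])); // clip3
            maxLevel = std::max(maxLevel, plane[i]);                 // post-clip maximum
            sum += plane[i];
        }
        double avg = (double)sum / 8; // 0 clamps to 16; 250 and 240 clamp to 235
        (void)maxLevel; (void)avg;
        return 0;
    }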
+
 }  // end anonymous namespace

 namespace X265_NS {
@@ -1020,7 +944,6 @@
     p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl = cpy1Dto2D_shl<W>; \
     p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shr = cpy1Dto2D_shr<W>; \
     p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp   = psyCost_pp<BLOCK_ ## W ## x ## H>; \
-    p.cu[BLOCK_ ## W ## x ## H].psy_cost_ss   = psyCost_ss<BLOCK_ ## W ## x ## H>; \
     p.cu[BLOCK_ ## W ## x ## H].transpose     = transpose<W>; \
     p.cu[BLOCK_ ## W ## x ## H].ssd_s         = pixel_ssd_s_c<W>; \
     p.cu[BLOCK_ ## W ## x ## H].var           = pixel_var<W>; \
@@ -1258,6 +1181,7 @@
     p.planecopy_cp = planecopy_cp_c;
     p.planecopy_sp = planecopy_sp_c;
     p.planecopy_sp_shl = planecopy_sp_shl_c;
+    p.planeClipAndMax = planeClipAndMax_c;
     p.propagateCost = estimateCUPropagateCost;
 }
 }
x265_1.8.tar.gz/source/common/predict.cpp -> x265_1.9.tar.gz/source/common/predict.cpp Changed

@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -98,7 +99,7 @@

         if (cu.m_slice->m_pps->bUseWeightPred && wp0->bPresentFlag)
         {
-            for (int plane = 0; plane < 3; plane++)
+            for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
             {
                 wv0[plane].w      = wp0[plane].inputWeight;
                 wv0[plane].offset = wp0[plane].inputOffset * (1 << (X265_DEPTH - 8));
@@ -109,18 +110,18 @@
             ShortYuv& shortYuv = m_predShortYuv[0];

             if (bLuma)
-                predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
             if (bChroma)
-                predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);

             addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
         }
         else
         {
             if (bLuma)
-                predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
             if (bChroma)
-                predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
         }
     }
     else
@@ -141,7 +142,7 @@
             if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
             {
                 /* biprediction weighting */
-                for (int plane = 0; plane < 3; plane++)
+                for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
                 {
                     wv0[plane].w = pwp0[plane].inputWeight;
                     wv0[plane].o = pwp0[plane].inputOffset * (1 << (X265_DEPTH - 8));
@@ -158,7 +159,7 @@
             {
                 /* uniprediction weighting, always outputs to wv0 */
                 const WeightParam* pwp = (refIdx0 >= 0) ? pwp0 : pwp1;
-                for (int plane = 0; plane < 3; plane++)
+                for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
                 {
                     wv0[plane].w = pwp[plane].inputWeight;
                     wv0[plane].offset = pwp[plane].inputOffset * (1 << (X265_DEPTH - 8));
@@ -179,13 +180,13 @@

             if (bLuma)
             {
-                predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
-                predInterLumaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
+                predInterLumaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
             }
             if (bChroma)
             {
-                predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
-                predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
+                predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
             }

             if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
@@ -203,18 +204,18 @@
                 ShortYuv& shortYuv = m_predShortYuv[0];

                 if (bLuma)
-                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
                 if (bChroma)
-                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);

                 addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
             }
             else
             {
                 if (bLuma)
-                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
                 if (bChroma)
-                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
             }
         }
         else
@@ -230,18 +231,18 @@
                 ShortYuv& shortYuv = m_predShortYuv[0];

                 if (bLuma)
-                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
                 if (bChroma)
-                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);

                 addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
             }
             else
             {
                 if (bLuma)
-                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
                 if (bChroma)
-                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
             }
         }
     }
@@ -600,8 +601,9 @@
     int tuSize = 1 << intraNeighbors.log2TrSize;
     int tuSize2 = tuSize << 1;

-    pixel* adiOrigin = cu.m_encData->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
-    intptr_t picStride = cu.m_encData->m_reconPic->m_stride;
+    PicYuv* reconPic = cu.m_encData->m_reconPic;
+    pixel* adiOrigin = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
+    intptr_t picStride = reconPic->m_stride;

     fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);

@@ -648,8 +650,9 @@

 void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId)
 {
-    const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
-    intptr_t picStride = cu.m_encData->m_reconPic->m_strideC;
+    PicYuv* reconPic = cu.m_encData->m_reconPic;
+    const pixel* adiOrigin = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
+    intptr_t picStride = reconPic->m_strideC;

     fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);

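The recurring `(bChroma ? 3 : 1)` bound is what lets monochrome (i400) encodes skip weight setup for the two chroma planes. Reduced to its essence, with types simplified for illustration:

    struct WeightValues { int w, o, offset, shift, round; };

    void setupWeights(WeightValues (&wv)[3], const int (&inputWeight)[3], bool bChroma)
    {
        const int numPlanes = bChroma ? 3 : 1; // plane 0 (luma) only for i400
        for (int plane = 0; plane < numPlanes; plane++)
            wv[plane].w = inputWeight[plane];
    }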
x265_1.8.tar.gz/source/common/predict.h -> x265_1.9.tar.gz/source/common/predict.h Changed

@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
x265_1.8.tar.gz/source/common/primitives.h -> x265_1.9.tar.gz/source/common/primitives.h Changed

@@ -112,9 +112,9 @@

 typedef int  (*pixelcmp_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
 typedef int  (*pixelcmp_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
-typedef sse_ret_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
-typedef sse_ret_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
-typedef int  (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
+typedef sse_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
+typedef sse_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
+typedef sse_t (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
 typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
 typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
 typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val);
@@ -176,15 +176,16 @@
 typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
 typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);

-typedef void (*saoCuStatsBO_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE0_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsBO_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE0_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE1_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE2_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE3_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);

 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+typedef pixel (*planeClipAndMax_t)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);

 typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);

@@ -195,6 +196,8 @@
 typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero, int idx);
 typedef uint32_t (*costC1C2Flag_t)(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset);

+typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
+
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
 struct EncoderPrimitives
@@ -259,7 +262,6 @@
         pixel_sse_t     sse_pp;        // Sum of Square Error (pixel, pixel) fenc alignment not assumed
         pixel_sse_ss_t  sse_ss;        // Sum of Square Error (short, short) fenc alignment not assumed
         pixelcmp_t      psy_cost_pp;   // difference in AC energy between two pixel blocks
-        pixelcmp_ss_t   psy_cost_ss;   // difference in AC energy between two signed residual blocks
         pixel_ssd_s_t   ssd_s;         // Sum of Square Error (residual coeff to self)
         pixelcmp_t      sa8d;          // Sum of Transformed Differences (8x8 Hadamard), uses satd for 4x4 intra TU

@@ -316,6 +318,7 @@
     planecopy_cp_t        planecopy_cp;
     planecopy_sp_t        planecopy_sp;
     planecopy_sp_t        planecopy_sp_shl;
+    planeClipAndMax_t     planeClipAndMax;

     weightp_sp_t          weight_sp;
     weightp_pp_t          weight_pp;
@@ -328,6 +331,7 @@
     costCoeffRemain_t     costCoeffRemain;
     costC1C2Flag_t        costC1C2Flag;

+    pelFilterLumaStrong_t pelFilterLumaStrong[2]; // EDGE_VER = 0, EDGE_HOR = 1

     /* There is one set of chroma primitives per color space. An encoder will
      * have just a single color space and thus it will only ever use one entry
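Keeping pelFilterLumaStrong as a two-entry array means the deblocking code can index by edge direction rather than branch. A sketch of that dispatch shape with stub filters; all names and argument values here are placeholders, not the encoder's:

    #include <cstdint>

    typedef uint8_t pixel;
    typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep, intptr_t offset,
                                          int32_t tcP, int32_t tcQ);

    enum { EDGE_VER = 0, EDGE_HOR = 1 };

    static void filterVer(pixel*, intptr_t, intptr_t, int32_t, int32_t) {}
    static void filterHor(pixel*, intptr_t, intptr_t, int32_t, int32_t) {}

    // one slot per edge direction; the deblock loop indexes instead of branching
    static pelFilterLumaStrong_t pelFilterLumaStrong[2] = { filterVer, filterHor };

    void deblockLumaEdge(pixel* src, int dir) // dir is EDGE_VER or EDGE_HOR
    {
        pelFilterLumaStrong[dir](src, 1, 8, 2, 2);
    }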
x265_1.8.tar.gz/source/common/quant.cpp -> x265_1.9.tar.gz/source/common/quant.cpp Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2015 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -50,9 +51,8 @@
     return y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // min(x, y)
 }

-inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, const uint32_t absGoRice, const uint32_t maxVlc, uint32_t c1c2Idx)
+inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, const uint32_t absGoRice, const uint32_t maxVlc, const uint32_t c1c2Rate)
 {
-    X265_CHECK(c1c2Idx <= 3, "c1c2Idx check failure\n");
     X265_CHECK(absGoRice <= 4, "absGoRice check failure\n");
     if (!absLevel)
     {
@@ -94,12 +94,7 @@
         uint32_t numBins = fastMin(prefLen + absGoRice, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */);

         rate += numBins << 15;
-
-        if (c1c2Idx & 1)
-            rate += greaterOneBits[1];
-
-        if (c1c2Idx == 3)
-            rate += levelAbsBits[1];
+        rate += c1c2Rate;
     }
     return rate;
 }
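The motivation for the new c1c2Rate parameter: the two conditional escape-bit additions depended only on c1c2Idx, which is fixed for a given coefficient, so the caller can fold them into one precomputed term instead of re-branching on every rate query. The equivalence in isolation, with the arrays standing in for the CABAC estimate tables:

    // what getICRate used to recompute per call, and what the caller now
    // derives once per coefficient; the two agree for every c1c2Idx in [0, 3]
    static uint32_t c1c2RateOf(uint32_t c1c2Idx, const int* greaterOneBits, const int* levelAbsBits)
    {
        return ((c1c2Idx & 1) ? greaterOneBits[1] : 0)
             + ((c1c2Idx == 3) ? levelAbsBits[1] : 0);
    }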
@@ -140,7 +135,7 @@
 }

 /* Calculates the cost for specific absolute transform level */
-inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
+inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, const uint32_t c1c2Rate)
 {
     X265_CHECK(absLevel, "absLevel should not be zero\n");

@@ -175,16 +170,15 @@

             rate = (COEF_REMAIN_BIN_REDUCTION + length + absGoRice + 1 + length) << 15;
         }
-        if (c1c2Idx & 1)
-            rate += greaterOneBits[1];
-        if (c1c2Idx == 3)
-            rate += levelAbsBits[1];
+        rate += c1c2Rate;
         return rate;
     }
 }

 }

+Quant::rdoQuant_t Quant::rdoQuant_func[NUM_CU_DEPTH] = {&Quant::rdoQuant<2>, &Quant::rdoQuant<3>, &Quant::rdoQuant<4>, &Quant::rdoQuant<5>};
+
 Quant::Quant()
 {
     m_resiDctCoeff = NULL;
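rdoQuant_func is the dispatch half of turning log2TrSize into a template parameter: each transform size gets its own instantiation with size-dependent values folded to compile-time constants, and a member-function-pointer table selects among them at runtime. The pattern in miniature:

    #include <cstdint>

    struct Quantizer
    {
        template <uint32_t log2TrSize>
        uint32_t rdoQuant() { return 1u << (log2TrSize * 2); } // numCoeff is now a constant

        typedef uint32_t (Quantizer::*rdoQuant_t)();
        static const rdoQuant_t funcs[4];
    };

    const Quantizer::rdoQuant_t Quantizer::funcs[4] =
        { &Quantizer::rdoQuant<2>, &Quantizer::rdoQuant<3>,
          &Quantizer::rdoQuant<4>, &Quantizer::rdoQuant<5> };

    uint32_t dispatch(Quantizer& q, uint32_t log2TrSize) // 2..5, as with NUM_CU_DEPTH
    {
        return (q.*Quantizer::funcs[log2TrSize - 2])();   // mirrors (this->*rdoQuant_func[...])(...)
    }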
@@ -229,8 +223,11 @@
 {
     m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL;
     m_qpParam[TEXT_LUMA].setQpParam(qp + QP_BD_OFFSET);
-    setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
-    setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat);
+    if (ctu.m_chromaFormat != X265_CSP_I400)
+    {
+        setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
+        setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat);
+    }
 }

 void Quant::setChromaQP(int qpin, TextType ttype, int chFmt)
@@ -444,18 +441,18 @@
             primitives.cu[sizeIdx].dct(m_fencShortBuf, m_fencDctCoeff, trSize);
         }

-        if (m_nr)
+        if (m_nr && m_nr->offset)
         {
             /* denoise is not applied to intra residual, so DST can be ignored */
             int cat = sizeIdx + 4 * !isLuma + 8 * !isIntra;
             int numCoeff = 1 << (log2TrSize * 2);
-            primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff);
+            primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offset[cat], numCoeff);
             m_nr->count[cat]++;
         }
     }

     if (m_rdoqLevel)
-        return rdoQuant(cu, coeff, log2TrSize, ttype, absPartIdx, usePsy);
+        return (this->*rdoQuant_func[log2TrSize - 2])(cu, coeff, ttype, absPartIdx, usePsy);
     else
     {
         int deltaU[32 * 32];
@@ -550,9 +547,10 @@

 /* Rate distortion optimized quantization for entropy coding engines using
  * probability models like CABAC */
-uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy)
+template<uint32_t log2TrSize>
+uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy)
 {
-    int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
     int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
     const uint32_t usePsyMask = usePsy ? -1 : 0;

@@ -564,13 +562,13 @@
     int add = (1 << (qbits - 1));
     const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];

-    int numCoeff = 1 << (log2TrSize * 2);
+    const int numCoeff = 1 << (log2TrSize * 2);
     uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff);
     X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n");
     if (!numSig)
         return 0;

-    uint32_t trSize = 1 << log2TrSize;
+    const uint32_t trSize = 1 << log2TrSize;
     int64_t lambda2 = m_qpParam[ttype].lambda2;
     const int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);

@@ -580,20 +578,20 @@
     const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
     int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0);
     int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0;
-    int scaleBits = SCALE_BITS - 2 * transformShift;
+    const int scaleBits = SCALE_BITS - 2 * transformShift;

 #define UNQUANT(lvl)    (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift)
 #define SIGCOST(bits)   ((lambda2 * (bits)) >> 8)
 #define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits))
 #define PSYVALUE(rec)   ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1)))

-    int64_t costCoeff[32 * 32];   /* d*d + lambda * bits */
-    int64_t costUncoded[32 * 32]; /* d*d + lambda * 0    */
-    int64_t costSig[32 * 32];     /* lambda * bits       */
+    int64_t costCoeff[trSize * trSize];   /* d*d + lambda * bits */
+    int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0    */
+    int64_t costSig[trSize * trSize];     /* lambda * bits       */

-    int rateIncUp[32 * 32];      /* signal overhead of increasing level */
-    int rateIncDown[32 * 32];    /* signal overhead of decreasing level */
-    int sigRateDelta[32 * 32];   /* signal difference between zero and non-zero */
+    int rateIncUp[trSize * trSize];      /* signal overhead of increasing level */
+    int rateIncDown[trSize * trSize];    /* signal overhead of decreasing level */
+    int sigRateDelta[trSize * trSize];   /* signal difference between zero and non-zero */

     int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */
     uint64_t sigCoeffGroupFlag64 = 0;
@@ -611,7 +609,8 @@

     TUEntropyCodingParameters codeParams;
     cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma);
-    const uint32_t cgNum = 1 << (codeParams.log2TrSizeCG * 2);
+    const uint32_t log2TrSizeCG = log2TrSize - 2;
+    const uint32_t cgNum = 1 << (log2TrSizeCG * 2);
     const uint32_t cgStride = (trSize >> MLS_CG_LOG2_SIZE);

     uint8_t coeffNum[MLS_GRP_NUM];      // value range[0, 16]
@@ -742,8 +741,8 @@
     {
         uint32_t ctxSet = (cgScanPos && bIsLuma) ? 2 : 0;
         const uint32_t cgBlkPos = codeParams.scanCG[cgScanPos];
-        const uint32_t cgPosY   = cgBlkPos >> codeParams.log2TrSizeCG;
-        const uint32_t cgPosX   = cgBlkPos - (cgPosY << codeParams.log2TrSizeCG);
+        const uint32_t cgPosY   = cgBlkPos >> log2TrSizeCG;
+        const uint32_t cgPosX   = cgBlkPos & ((1 << log2TrSizeCG) - 1);
         const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos);
         const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
         const int ctxSigOffset = codeParams.firstSignificanceMapContext + (cgScanPos && bIsLuma ? 3 : 0);
@@ -829,6 +828,7 @@
         uint32_t subFlagMask = coeffFlag[cgScanPos];
         int    c2            = 0;
         uint32_t goRiceParam = 0;
+        uint32_t levelThreshold = 3;
         uint32_t c1Idx       = 0;
         uint32_t c2Idx       = 0;
         /* iterate over coefficients in each group in reverse scan order */
@@ -836,7 +836,7 @@
         {
             scanPos              = (cgScanPos << MLS_CG_SIZE) + scanPosinCG;
             uint32_t blkPos      = codeParams.scan[scanPos];
-            uint32_t maxAbsLevel = abs(dstCoeff[blkPos]);             /* abs(quantized coeff) */
+            uint32_t maxAbsLevel = dstCoeff[blkPos];                  /* abs(quantized coeff) */
             int signCoef         = m_resiDctCoeff[blkPos];            /* pre-quantization DCT coeff */
             int predictedCoef    = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/

@@ -855,7 +855,11 @@

             // coefficient level estimation
             const int* greaterOneBits = estBitsSbac.greaterOneBits[4 * ctxSet + c1];
-            const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset;
+            //const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset;
+            static const uint64_t table_cnt64[4] = {0x0000000100110112ULL, 0x0000000011112222ULL, 0x0012001200120012ULL, 0x2222222222222222ULL};
+            uint64_t ctxCnt = (trSize == 4) ? 0x8877886654325410ULL : table_cnt64[patternSigCtx];
+            const uint32_t ctxSig = (blkPos == 0) ? 0 : ((ctxCnt >> (4 * g_scan4x4[codeParams.scanType][scanPosinCG])) & 0xF) + ctxSigOffset;
+            // NOTE: the above is equivalent to 'table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset'
             X265_CHECK(ctxSig == getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codeParams.firstSignificanceMapContext), "sigCtx check failure\n");

             // before finding the last non-zero coeff
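table_cnt64 packs each 16-entry context row into a single 64-bit constant at four bits per entry, so the lookup collapses to a shift and a mask. A self-contained check of the packing scheme; the row values here are arbitrary, not the codec's tables:

    #include <cassert>
    #include <cstdint>

    int main()
    {
        static const uint8_t row[16] = { 0, 1, 4, 5, 2, 3, 4, 5,
                                         6, 6, 8, 8, 7, 7, 8, 8 }; // illustrative only
        uint64_t packed = 0;
        for (int i = 15; i >= 0; i--)          // entry i lives in bits [4*i, 4*i + 3]
            packed = (packed << 4) | row[i];

        for (uint32_t i = 0; i < 16; i++)
            assert(((packed >> (4 * i)) & 0xF) == row[i]);
        return 0;
    }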
@@ -886,15 +890,17 @@
             {
                 subFlagMask >>= 1;

-                const uint32_t c1c2Idx = ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)) + (((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1) * 2;
-                const uint32_t baseLevel = ((uint32_t)0xD9 >> (c1c2Idx * 2)) & 3;  // {1, 2, 1, 3}
+                const uint32_t c1c2idx = ((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)) + (((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1) * 2;
+                const uint32_t baseLevel = ((uint32_t)0xD9 >> (c1c2idx * 2)) & 3;  // {1, 2, 1, 3}

                 X265_CHECK(!!((int)c1Idx < C1FLAG_NUMBER) == (int)((c1Idx - 8) >> (sizeof(int) * CHAR_BIT - 1)), "scan validation 1\n");
                 X265_CHECK(!!(c2Idx == 0) == ((-(int)c2Idx) >> (sizeof(int) * CHAR_BIT - 1)) + 1, "scan validation 2\n");
                 X265_CHECK((int)baseLevel == ((c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx == 0)) : 1), "scan validation 3\n");
+                X265_CHECK(c1c2idx <= 3, "c1c2Idx check failure\n");

                 // coefficient level estimation
                 const int* levelAbsBits = estBitsSbac.levelAbsBits[ctxSet + c2];
+                const uint32_t c1c2Rate = ((c1c2idx & 1) ?  greaterOneBits[1] : 0) + ((c1c2idx == 3) ? levelAbsBits[1] : 0);

                 uint32_t level = 0;
                 uint32_t sigCoefBits = 0;
@@ -914,13 +920,15 @@
                     sigCoefBits = estBitsSbac.significantBits[1][ctxSig];
                 }

+                const uint32_t unQuantLevel = (maxAbsLevel * (unquantScale[blkPos] << per) + unquantRound);
                 // NOTE: X265_MAX(maxAbsLevel - 1, 1) ==> (X>=2 -> X-1), (X<2 -> 1)  | (0 < X < 2 ==> X=1)
                 if (maxAbsLevel == 1)
                 {
-                    uint32_t levelBits = (c1c2Idx & 1) ? greaterOneBits[0] + IEP_RATE : ((1 + goRiceParam) << 15) + IEP_RATE;
-                    X265_CHECK(levelBits == getICRateCost(1, 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx) + IEP_RATE, "levelBits mistake\n");
+                    uint32_t levelBits = (c1c2idx & 1) ? greaterOneBits[0] + IEP_RATE : ((1 + goRiceParam) << 15) + IEP_RATE;
+                    X265_CHECK(levelBits == getICRateCost(1, 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE, "levelBits mistake\n");

-                    int unquantAbsLevel = UNQUANT(1);
+                    int unquantAbsLevel = unQuantLevel >> unquantShift;
+                    X265_CHECK(UNQUANT(1) == unquantAbsLevel, "DQuant check failed\n");
                     int d = abs(signCoef) - unquantAbsLevel;
                     int64_t curCost = RDCOST(d, sigCoefBits + levelBits);

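The unQuantLevel/preDQuantLevelDiff pair used below is a strength reduction: since UNQUANT is affine in the level, the reconstructions for neighbouring levels share one multiply and differ by the scale term, which the in-line X265_CHECKs verify against the macro. The identity, spelled out with stand-in names:

    #include <cstdint>

    static inline uint32_t unquant(uint32_t level, uint32_t scale, uint32_t round, int shift)
    {
        return (level * scale + round) >> shift;   // the UNQUANT macro's shape
    }
    // Because (level - 1) * scale + round == (level * scale + round) - scale,
    // a cached base = level * scale + round yields neighbour levels with one
    // add/sub instead of a fresh multiply:
    //   unquant(level)     == base >> shift
    //   unquant(level - 1) == (base - scale) >> shift   // scale == preDQuantLevelDiff
    //   unquant(level + 1) == (base + scale) >> shift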
@@ -940,14 +948,18 @@
                 }
                 else if (maxAbsLevel)
                 {
-                    uint32_t levelBits0 = getICRateCost(maxAbsLevel,     maxAbsLevel     - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx) + IEP_RATE;
-                    uint32_t levelBits1 = getICRateCost(maxAbsLevel - 1, maxAbsLevel - 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx) + IEP_RATE;
+                    uint32_t levelBits0 = getICRateCost(maxAbsLevel,     maxAbsLevel     - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE;
+                    uint32_t levelBits1 = getICRateCost(maxAbsLevel - 1, maxAbsLevel - 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Rate) + IEP_RATE;
+
+                    const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per);

-                    int unquantAbsLevel0 = UNQUANT(maxAbsLevel);
+                    const int unquantAbsLevel0 = unQuantLevel >> unquantShift;
+                    X265_CHECK(UNQUANT(maxAbsLevel) == (uint32_t)unquantAbsLevel0, "DQuant check failed\n");
                     int d0 = abs(signCoef) - unquantAbsLevel0;
                     int64_t curCost0 = RDCOST(d0, sigCoefBits + levelBits0);

-                    int unquantAbsLevel1 = UNQUANT(maxAbsLevel - 1);
+                    const int unquantAbsLevel1 = (unQuantLevel - preDQuantLevelDiff) >> unquantShift;
+                    X265_CHECK(UNQUANT(maxAbsLevel - 1) == (uint32_t)unquantAbsLevel1, "DQuant check failed\n");
                     int d1 = abs(signCoef) - unquantAbsLevel1;
                     int64_t curCost1 = RDCOST(d1, sigCoefBits + levelBits1);

@@ -1012,9 +1024,9 @@
                     }
                     else
                     {
-                        rate1 = getICRate(level + 0, diff0 + 1, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Idx);
-                        rate2 = getICRate(level + 1, diff0 + 2, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Idx);
-                        rate0 = getICRate(level - 1, diff0 + 0, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Idx);
+                        rate1 = getICRate(level + 0, diff0 + 1, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
+                        rate2 = getICRate(level + 1, diff0 + 2, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
+                        rate0 = getICRate(level - 1, diff0 + 0, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Rate);
                     }
                     rateIncUp[blkPos] = rate2 - rate1;
                     rateIncDown[blkPos] = rate0 - rate1;
@@ -1026,10 +1038,14 @@
                 }

                 /* Update CABAC estimation state */
-                if (level >= baseLevel && goRiceParam < 4 && level > (3U << goRiceParam))
+                if ((level >= baseLevel) && (goRiceParam < 4) && (level > levelThreshold))
+                {
                     goRiceParam++;
+                    levelThreshold <<= 1;
+                }

-                c1Idx -= (-(int32_t)level) >> 31;
+                const uint32_t isNonZero = (uint32_t)(-(int32_t)level) >> 31;
+                c1Idx += isNonZero;

                 /* update bin model */
                 if (level > 1)
@@ -1038,7 +1054,7 @@
                     c2 += (uint32_t)(c2 - 2) >> 31;
                     c2Idx++;
                 }
-                else if ((c1 < 3) && (c1 > 0) && level)
+                else if (((c1 == 1) | (c1 == 2)) & isNonZero)
                     c1++;

                 if (dstCoeff[blkPos])
@@ -1219,7 +1235,8 @@

     // Average 49.62 pixels
     /* clean uncoded coefficients */
-    for (int pos = bestLastIdx; pos <= fastMin(lastScanPos, (bestLastIdx | (SCAN_SET_SIZE - 1))); pos++)
+    X265_CHECK((uint32_t)(fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)) < trSize * trSize, "array beyond bound\n");
+    for (int pos = bestLastIdx; pos <= (fastMin(lastScanPos, bestLastIdx) | (SCAN_SET_SIZE - 1)); pos++)
     {
         dstCoeff[codeParams.scan[pos]] = 0;
     }
@@ -1236,7 +1253,8 @@
     if (cu.m_slice->m_pps->bSignHideEnabled && numSig >= 2)
     {
         const int realLastScanPos = (bestLastIdx - 1) >> LOG2_SCAN_SET_SIZE;
-        int lastCG = true;
+        int lastCG = 1;
+
         for (int subSet = realLastScanPos; subSet >= 0; subSet--)
         {
             int subPos = subSet << LOG2_SCAN_SET_SIZE;
@@ -1248,69 +1266,72 @@
             /* measure distance between first and last non-zero coef in this
              * coding group */
             const uint32_t posFirstLast = primitives.findPosFirstLast(&dstCoeff[codeParams.scan[subPos]], trSize, g_scan4x4[codeParams.scanType]);
-            int firstNZPosInCG = (uint16_t)posFirstLast;
-            int lastNZPosInCG = posFirstLast >> 16;
-
+            const int firstNZPosInCG = (uint8_t)posFirstLast;
+            const int lastNZPosInCG = (int8_t)(posFirstLast >> 8);
+            const uint32_t absSumSign = posFirstLast;

             if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD)
             {
-                uint32_t signbit = (dstCoeff[codeParams.scan[subPos + firstNZPosInCG]] > 0 ? 0 : 1);
-                int absSum = 0;
+                const int32_t signbit = ((int32_t)dstCoeff[codeParams.scan[subPos + firstNZPosInCG]]);

+#if CHECKED_BUILD || _DEBUG
+                int32_t absSum_dummy = 0;
                 for (n = firstNZPosInCG; n <= lastNZPosInCG; n++)
-                    absSum += dstCoeff[codeParams.scan[n + subPos]];
+                    absSum_dummy += dstCoeff[codeParams.scan[n + subPos]];
+                X265_CHECK(((uint32_t)absSum_dummy & 1) == (absSumSign >> 31), "absSumSign check failure\n");
+#endif

-                if (signbit != (absSum & 1U))
+                //if (signbit != absSumSign)
+                if (((int32_t)(signbit ^ absSumSign)) < 0)
                 {
                     /* We must find a coeff to toggle up or down so the sign bit of the first non-zero coeff
                      * is properly implied. Note dstCoeff[] are signed by this point but curChange and
                      * finalChange imply absolute levels (+1 is away from zero, -1 is towards zero) */

                     int64_t minCostInc = MAX_INT64, curCost = MAX_INT64;
-                    int minPos = -1;
-                    int16_t finalChange = 0, curChange = 0;
+                    uint32_t minPos = 0;
+                    int8_t finalChange = 0;
+                    int curChange = 0;
+                    uint32_t lastCoeffAdjust = (lastCG & (abs(dstCoeff[codeParams.scan[lastNZPosInCG + subPos]]) == 1)) * 4 * IEP_RATE;

                     for (n = (lastCG ? lastNZPosInCG : SCAN_SET_SIZE - 1); n >= 0; --n)
                     {
-                        uint32_t blkPos = codeParams.scan[n + subPos];
-                        int signCoef    = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */
-                        int absLevel    = abs(dstCoeff[blkPos]);
+                        const uint32_t blkPos = codeParams.scan[n + subPos];
+                        const int32_t signCoef = m_resiDctCoeff[blkPos]; /* pre-quantization DCT coeff */
+                        const int absLevel = abs(dstCoeff[blkPos]);
+                        // TODO: this is constant in non-scaling mode
+                        const uint32_t preDQuantLevelDiff = (unquantScale[blkPos] << per);
+                        const uint32_t unQuantLevel = (absLevel * (unquantScale[blkPos] << per) + unquantRound);
+
+                        int d = abs(signCoef) - (unQuantLevel >> unquantShift);
+                        X265_CHECK((uint32_t)UNQUANT(absLevel) == (unQuantLevel >> unquantShift), "dquant check failed\n");

-                        int d = abs(signCoef) - UNQUANT(absLevel);
-                        int64_t origDist = (((int64_t)d * d)) << scaleBits;
+                        const int64_t origDist = (((int64_t)d * d));

-#define DELTARDCOST(d, deltabits) ((((int64_t)d * d) << scaleBits) - origDist + ((lambda2 * (int64_t)(deltabits)) >> 8))
+#define DELTARDCOST(d0, d, deltabits) ((((int64_t)d * d - d0) << scaleBits) + ((lambda2 * (int64_t)(deltabits)) >> 8))

+                        const uint32_t isOne = (absLevel == 1);
                         if (dstCoeff[blkPos])
                         {
-                            d = abs(signCoef) - UNQUANT(absLevel + 1);
-                            int64_t costUp = DELTARDCOST(d, rateIncUp[blkPos]);
+                            d = abs(signCoef) - ((unQuantLevel + preDQuantLevelDiff) >> unquantShift);
+                            X265_CHECK((uint32_t)UNQUANT(absLevel + 1) == ((unQuantLevel + preDQuantLevelDiff) >> unquantShift), "dquant check failed\n");
+                            int64_t costUp = DELTARDCOST(origDist, d, rateIncUp[blkPos]);

                             /* if decrementing would make the coeff 0, we can include the
                              * significant coeff flag cost savings */
-                            d = abs(signCoef) - UNQUANT(absLevel - 1);
-                            bool isOne = abs(dstCoeff[blkPos]) == 1;
+                            d = abs(signCoef) - ((unQuantLevel - preDQuantLevelDiff) >> unquantShift);
+                            X265_CHECK((uint32_t)UNQUANT(absLevel - 1) == ((unQuantLevel - preDQuantLevelDiff) >> unquantShift), "dquant check failed\n");
                             int downBits = rateIncDown[blkPos] - (isOne ? (IEP_RATE + sigRateDelta[blkPos]) : 0);
-                            int64_t costDown = DELTARDCOST(d, downBits);
+                            int64_t costDown = DELTARDCOST(origDist, d, downBits);

-                            if (lastCG && lastNZPosInCG == n && isOne)
-                                costDown -= 4 * IEP_RATE;
+                            costDown -= lastCoeffAdjust;
+                            curCost = ((n == firstNZPosInCG) & isOne) ? MAX_INT64 : costDown;

-                            if (costUp < costDown)
-                            {
-                                curCost = costUp;
-                                curChange =  1;
-                            }
-                            else
-                            {
-                                curChange = -1;
-                                if (n == firstNZPosInCG && isOne)
-                                    curCost = MAX_INT64;
-                                else
-                                    curCost = costDown;
-                            }
+                            curChange = 2 * (costUp < costDown) - 1;
+                            curCost = (costUp < costDown) ? costUp : curCost;
                         }
-                        else if (n < firstNZPosInCG && signbit != (signCoef >= 0 ? 0 : 1U))
+                        //else if ((n < firstNZPosInCG) & (signbit != ((uint32_t)signCoef >> 31)))
+                        else if ((n < firstNZPosInCG) & ((signbit ^ signCoef) < 0))
                         {
                             /* don't try to make a new coded coeff before the first coeff if its
                              * sign would be different than the first coeff, the inferred sign would
@@ -1320,36 +1341,48 @@
                         else
                         {
                             /* evaluate changing an uncoded coeff 0 to a coded coeff +/-1 */
-                            d = abs(signCoef) - UNQUANT(1);
-                            curCost = DELTARDCOST(d, rateIncUp[blkPos] + IEP_RATE + sigRateDelta[blkPos]);
+                            d = abs(signCoef) - ((preDQuantLevelDiff + unquantRound) >> unquantShift);
+                            X265_CHECK((uint32_t)UNQUANT(1) == ((preDQuantLevelDiff + unquantRound) >> unquantShift), "dquant check failed\n");
+                            curCost = DELTARDCOST(origDist, d, rateIncUp[blkPos] + IEP_RATE + sigRateDelta[blkPos]);
                             curChange = 1;
                         }

                         if (curCost < minCostInc)
                         {
                             minCostInc = curCost;
-                            finalChange = curChange;
-                            minPos = blkPos;
+                            finalChange = (int8_t)curChange;
+                            minPos = blkPos + (absLevel << 16);
                         }
+                        lastCoeffAdjust = 0;
                     }

-                    if (dstCoeff[minPos] == 32767 || dstCoeff[minPos] == -32768)
+                    const int absInMinPos = (minPos >> 16);
+                    minPos = (uint16_t)minPos;
+
+                    // if (dstCoeff[minPos] == 32767 || dstCoeff[minPos] == -32768)
+                    if (absInMinPos >= 32767)
                         /* don't allow sign hiding to violate the SPEC range */
                         finalChange = -1;

-                    if (dstCoeff[minPos] == 0)
-                        numSig++;
-                    else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1)
-                        numSig--;
-
-                    if (m_resiDctCoeff[minPos] >= 0)
-                        dstCoeff[minPos] += finalChange;
-                    else
-                        dstCoeff[minPos] -= finalChange;
+                    // NOTE: Reference code
+                    //if (dstCoeff[minPos] == 0)
+                    //    numSig++;
+                    //else if (finalChange == -1 && abs(dstCoeff[minPos]) == 1)
+                    //    numSig--;
+                    numSig += (absInMinPos == 0) - ((finalChange == -1) & (absInMinPos == 1));
+
+
+                    // NOTE: Reference code
+                    //if (m_resiDctCoeff[minPos] >= 0)
+                    //    dstCoeff[minPos] += finalChange;
+                    //else
+                    //    dstCoeff[minPos] -= finalChange;
+                    const int16_t resiCoeffSign = ((int16_t)m_resiDctCoeff[minPos] >> 16);
                    dstCoeff[minPos] += (((int16_t)finalChange ^ resiCoeffSign) - resiCoeffSign);
                 }
             }
495
 
496
-            lastCG = false;
497
+            lastCG = 0;
498
         }
499
     }
500
 
501
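
Editor's note: the rewritten tail of this rdoQuant() hunk replaces the sign-dependent branch with the classic mask-based conditional negation — resiCoeffSign is 0 for a non-negative residual and -1 (all ones) for a negative one, so (finalChange ^ sign) - sign adds or subtracts finalChange without branching. A minimal standalone sketch of the idiom follows; names are illustrative, and x265 itself derives the mask from bit 15 of the residual coefficient rather than bit 31 as here.

    #include <cassert>

    // Negate 'v' iff 'mask' is -1 (all ones); leave it unchanged iff mask is 0.
    static inline int conditionalNegate(int v, int mask)
    {
        return (v ^ mask) - mask;
    }

    int main()
    {
        int coeff  = -7;            // stand-in for m_resiDctCoeff[minPos]
        int change = 1;             // stand-in for finalChange
        int mask   = coeff >> 31;   // 0 if coeff >= 0, -1 if coeff < 0
                                    // (arithmetic shift on signed int)
        assert(conditionalNegate(change,  0) ==  change);
        assert(conditionalNegate(change, -1) == -change);

        int dst = 0;
        dst += conditionalNegate(change, mask);
        assert(dst == -1);          // same as: coeff >= 0 ? dst += change : dst -= change
        return 0;
    }
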
x265_1.8.tar.gz/source/common/quant.h -> x265_1.9.tar.gz/source/common/quant.h Changed
72
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2015 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -59,18 +60,18 @@
10
     }
11
 };
12
 
13
-#define MAX_NUM_TR_COEFFS        MAX_TR_SIZE * MAX_TR_SIZE /* Maximum number of transform coefficients, for a 32x32 transform */
14
-#define MAX_NUM_TR_CATEGORIES    16                        /* 32, 16, 8, 4 transform categories each for luma and chroma */
15
-
16
 // NOTE: MUST be 16-byte aligned for asm code
17
 struct NoiseReduction
18
 {
19
     /* 0 = luma 4x4,   1 = luma 8x8,   2 = luma 16x16,   3 = luma 32x32
20
      * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32
21
      * Intra 0..7 - Inter 8..15 */
22
-    ALIGN_VAR_16(uint32_t, residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]);
23
-    uint32_t count[MAX_NUM_TR_CATEGORIES];
24
-    uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
25
+    ALIGN_VAR_16(uint32_t, nrResidualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]);
26
+    uint32_t nrCount[MAX_NUM_TR_CATEGORIES];
27
+    uint16_t nrOffsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
28
+    uint16_t (*offset)[MAX_NUM_TR_COEFFS];
29
+    uint32_t (*residualSum)[MAX_NUM_TR_COEFFS];
30
+    uint32_t *count;
31
 };
32
 
33
 class Quant
34
@@ -125,8 +126,8 @@
35
         const uint32_t sigPos = (uint32_t)(sigCoeffGroupFlag64 >> (cgBlkPos + 1)); // just need lowest 7-bits valid
36
 
37
         // TODO: instruction BT is faster, but _bittest64 still generate instruction 'BT m, r' in VS2012
38
-        const uint32_t sigRight = ((uint32_t)(cgPosX - (trSizeCG - 1)) >> 31) & sigPos;
39
-        const uint32_t sigLower = ((uint32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 1));
40
+        const uint32_t sigRight = (cgPosX != (trSizeCG - 1)) & sigPos;
41
+        const uint32_t sigLower = (cgPosY != (trSizeCG - 1)) & (sigPos >> (trSizeCG - 1));
42
         return sigRight + sigLower * 2;
43
     }
44
 
45
@@ -136,8 +137,8 @@
46
         X265_CHECK(cgBlkPos < 64, "cgBlkPos is too large\n");
47
         // NOTE: unsafe shift operator, see NOTE in calcPatternSigCtx
48
         const uint32_t sigPos = (uint32_t)(cgGroupMask >> (cgBlkPos + 1)); // just need lowest 8-bits valid
49
-        const uint32_t sigRight = ((uint32_t)(cgPosX - (trSizeCG - 1)) >> 31) & sigPos;
50
-        const uint32_t sigLower = ((uint32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 1));
51
+        const uint32_t sigRight = (cgPosX != (trSizeCG - 1)) & sigPos;
52
+        const uint32_t sigLower = (cgPosY != (trSizeCG - 1)) & (sigPos >> (trSizeCG - 1));
53
 
54
         return (sigRight | sigLower);
55
     }
56
@@ -151,7 +152,14 @@
57
 
58
     uint32_t signBitHidingHDQ(int16_t* qcoeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters, uint32_t log2TrSize);
59
 
60
-    uint32_t rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy);
61
+    template<uint32_t log2TrSize>
62
+    uint32_t rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy);
63
+
64
+public:
65
+    typedef uint32_t (Quant::*rdoQuant_t)(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy);
66
+
67
+private:
68
+    static rdoQuant_t rdoQuant_func[NUM_CU_DEPTH];
69
 };
70
 }
71
 
72
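
Editor's note: quant.h above turns rdoQuant into a template on log2TrSize, dispatched through a per-depth table of member-function pointers (rdoQuant_func[NUM_CU_DEPTH]), so the transform size is a compile-time constant inside the hot loop. A sketch of that pattern, with an illustrative class and body — the table indexing by log2TrSize - 2 matches the four transform sizes 4x4 through 32x32, though the exact call site in x265 is assumed here:

    #include <cstdint>

    class Q
    {
    public:
        template<uint32_t log2TrSize>
        uint32_t rdoQuant(int16_t* dstCoeff)
        {
            // a real body would exploit the constant size; this is a placeholder
            return log2TrSize * (dstCoeff != nullptr);
        }

        typedef uint32_t (Q::*rdoQuant_t)(int16_t*);
        static const rdoQuant_t rdoQuantFunc[4];
    };

    const Q::rdoQuant_t Q::rdoQuantFunc[4] =
    {
        &Q::rdoQuant<2>, &Q::rdoQuant<3>, &Q::rdoQuant<4>, &Q::rdoQuant<5>
    };

    uint32_t dispatch(Q& q, uint32_t log2TrSize, int16_t* coeffs)
    {
        return (q.*Q::rdoQuantFunc[log2TrSize - 2])(coeffs);   // 4x4 .. 32x32
    }
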
x265_1.8.tar.gz/source/common/shortyuv.cpp -> x265_1.9.tar.gz/source/common/shortyuv.cpp Changed
51
 
1
@@ -40,19 +40,26 @@
2
 bool ShortYuv::create(uint32_t size, int csp)
3
 {
4
     m_csp = csp;
5
+    m_size = size;
6
     m_hChromaShift = CHROMA_H_SHIFT(csp);
7
     m_vChromaShift = CHROMA_V_SHIFT(csp);
8
-
9
-    m_size = size;
10
-    m_csize = size >> m_hChromaShift;
11
-
12
     size_t sizeL = size * size;
13
-    size_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
14
-    X265_CHECK((sizeC & 15) == 0, "invalid size");
15
 
16
-    CHECKED_MALLOC(m_buf[0], int16_t, sizeL + sizeC * 2);
17
-    m_buf[1] = m_buf[0] + sizeL;
18
-    m_buf[2] = m_buf[0] + sizeL + sizeC;
19
+    if (csp != X265_CSP_I400)
20
+    {
21
+        m_csize = size >> m_hChromaShift;
22
+        size_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
23
+        X265_CHECK((sizeC & 15) == 0, "invalid size");
24
+
25
+        CHECKED_MALLOC(m_buf[0], int16_t, sizeL + sizeC * 2);
26
+        m_buf[1] = m_buf[0] + sizeL;
27
+        m_buf[2] = m_buf[0] + sizeL + sizeC;
28
+    }
29
+    else
30
+    {
31
+        CHECKED_MALLOC(m_buf[0], int16_t, sizeL);
32
+        m_buf[1] = m_buf[2] = NULL;
33
+    }
34
     return true;
35
 
36
 fail:
37
@@ -75,8 +82,11 @@
38
 {
39
     const int sizeIdx = log2Size - 2;
40
     primitives.cu[sizeIdx].sub_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
41
-    primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
42
-    primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
43
+    if (m_csp != X265_CSP_I400)
44
+    {
45
+        primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
46
+        primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
47
+    }
48
 }
49
 
50
 void ShortYuv::copyPartToPartLuma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const
51
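
Editor's note: the ShortYuv::create() change above is part of the new 4:0:0 (monochrome) support announced in the changelog — for I400 only the luma plane is allocated and the chroma pointers stay null, which every chroma code path must then check. A hedged sketch of the same shape, using plain malloc in place of x265's CHECKED_MALLOC and assuming 4:2:0 subsampling for the chroma case:

    #include <cstdint>
    #include <cstdlib>

    struct PlaneBuf
    {
        int16_t* buf[3];

        bool create(uint32_t size, bool monochrome)
        {
            size_t sizeL = (size_t)size * size;
            if (!monochrome)
            {
                size_t sizeC = sizeL >> 2;   // assumes 4:2:0 subsampling
                buf[0] = (int16_t*)malloc((sizeL + 2 * sizeC) * sizeof(int16_t));
                if (!buf[0])
                    return false;
                buf[1] = buf[0] + sizeL;
                buf[2] = buf[0] + sizeL + sizeC;
            }
            else
            {
                buf[0] = (int16_t*)malloc(sizeL * sizeof(int16_t));
                buf[1] = buf[2] = nullptr;   // chroma paths must test for this
            }
            return buf[0] != nullptr;
        }
    };
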
x265_1.8.tar.gz/source/common/slice.cpp -> x265_1.9.tar.gz/source/common/slice.cpp Changed
44
 
1
@@ -33,7 +33,9 @@
2
 {
3
     if (m_sliceType == I_SLICE)
4
     {
5
-        memset(m_refPicList, 0, sizeof(m_refPicList));
6
+        memset(m_refFrameList, 0, sizeof(m_refFrameList));
7
+        memset(m_refReconPicList, 0, sizeof(m_refReconPicList));
8
+        memset(m_refPOCList, 0, sizeof(m_refPOCList));
9
         m_numRefIdx[1] = m_numRefIdx[0] = 0;
10
         return;
11
     }
12
@@ -106,13 +108,13 @@
13
     {
14
         cIdx = rIdx % numPocTotalCurr;
15
         X265_CHECK(cIdx >= 0 && cIdx < numPocTotalCurr, "RPS index check fail\n");
16
-        m_refPicList[0][rIdx] = rpsCurrList0[cIdx];
17
+        m_refFrameList[0][rIdx] = rpsCurrList0[cIdx];
18
     }
19
 
20
     if (m_sliceType != B_SLICE)
21
     {
22
         m_numRefIdx[1] = 0;
23
-        memset(m_refPicList[1], 0, sizeof(m_refPicList[1]));
24
+        memset(m_refFrameList[1], 0, sizeof(m_refFrameList[1]));
25
     }
26
     else
27
     {
28
@@ -120,13 +122,13 @@
29
         {
30
             cIdx = rIdx % numPocTotalCurr;
31
             X265_CHECK(cIdx >= 0 && cIdx < numPocTotalCurr, "RPS index check fail\n");
32
-            m_refPicList[1][rIdx] = rpsCurrList1[cIdx];
33
+            m_refFrameList[1][rIdx] = rpsCurrList1[cIdx];
34
         }
35
     }
36
 
37
     for (int dir = 0; dir < 2; dir++)
38
         for (int numRefIdx = 0; numRefIdx < m_numRefIdx[dir]; numRefIdx++)
39
-            m_refPOCList[dir][numRefIdx] = m_refPicList[dir][numRefIdx]->m_poc;
40
+            m_refPOCList[dir][numRefIdx] = m_refFrameList[dir][numRefIdx]->m_poc;
41
 }
42
 
43
 void Slice::disableWeights()
44
x265_1.8.tar.gz/source/common/slice.h -> x265_1.9.tar.gz/source/common/slice.h Changed
243
 
1
@@ -31,6 +31,7 @@
2
 
3
 class Frame;
4
 class PicList;
5
+class PicYuv;
6
 class MotionReference;
7
 
8
 enum SliceType
9
@@ -104,6 +105,12 @@
10
 
11
 struct ProfileTierLevel
12
 {
13
+    int      profileIdc;
14
+    int      levelIdc;
15
+    uint32_t minCrForLevel;
16
+    uint32_t maxLumaSrForLevel;
17
+    uint32_t bitDepthConstraint;
18
+    int      chromaFormatConstraint;
19
     bool     tierFlag;
20
     bool     progressiveSourceFlag;
21
     bool     interlacedSourceFlag;
22
@@ -113,12 +120,6 @@
23
     bool     intraConstraintFlag;
24
     bool     onePictureOnlyConstraintFlag;
25
     bool     lowerBitRateConstraintFlag;
26
-    int      profileIdc;
27
-    int      levelIdc;
28
-    uint32_t minCrForLevel;
29
-    uint32_t maxLumaSrForLevel;
30
-    uint32_t bitDepthConstraint;
31
-    int      chromaFormatConstraint;
32
 };
33
 
34
 struct HRDInfo
35
@@ -151,21 +152,21 @@
36
 
37
 struct VPS
38
 {
39
+    HRDInfo          hrdParameters;
40
+    ProfileTierLevel ptl;
41
     uint32_t         maxTempSubLayers;
42
     uint32_t         numReorderPics;
43
     uint32_t         maxDecPicBuffering;
44
     uint32_t         maxLatencyIncrease;
45
-    HRDInfo          hrdParameters;
46
-    ProfileTierLevel ptl;
47
 };
48
 
49
 struct Window
50
 {
51
-    bool bEnabled;
52
     int  leftOffset;
53
     int  rightOffset;
54
     int  topOffset;
55
     int  bottomOffset;
56
+    bool bEnabled;
57
 
58
     Window()
59
     {
60
@@ -175,40 +176,41 @@
61
 
62
 struct VUI
63
 {
64
-    bool       aspectRatioInfoPresentFlag;
65
     int        aspectRatioIdc;
66
     int        sarWidth;
67
     int        sarHeight;
68
-
69
-    bool       overscanInfoPresentFlag;
70
-    bool       overscanAppropriateFlag;
71
-
72
-    bool       videoSignalTypePresentFlag;
73
     int        videoFormat;
74
-    bool       videoFullRangeFlag;
75
-
76
-    bool       colourDescriptionPresentFlag;
77
     int        colourPrimaries;
78
     int        transferCharacteristics;
79
     int        matrixCoefficients;
80
-
81
-    bool       chromaLocInfoPresentFlag;
82
     int        chromaSampleLocTypeTopField;
83
     int        chromaSampleLocTypeBottomField;
84
 
85
-    Window     defaultDisplayWindow;
86
-
87
+    bool       aspectRatioInfoPresentFlag;
88
+    bool       overscanInfoPresentFlag;
89
+    bool       overscanAppropriateFlag;
90
+    bool       videoSignalTypePresentFlag;
91
+    bool       videoFullRangeFlag;
92
+    bool       colourDescriptionPresentFlag;
93
+    bool       chromaLocInfoPresentFlag;
94
     bool       frameFieldInfoPresentFlag;
95
     bool       fieldSeqFlag;
96
-
97
     bool       hrdParametersPresentFlag;
98
-    HRDInfo    hrdParameters;
99
 
100
+    HRDInfo    hrdParameters;
101
+    Window     defaultDisplayWindow;
102
     TimingInfo timingInfo;
103
 };
104
 
105
 struct SPS
106
 {
107
+    /* cached PicYuv offset arrays, shared by all instances of
108
+     * PicYuv created by this encoder */
109
+    intptr_t* cuOffsetY;
110
+    intptr_t* cuOffsetC;
111
+    intptr_t* buOffsetY;
112
+    intptr_t* buOffsetC;
113
+
114
     int      chromaFormatIdc;        // use param
115
     uint32_t picWidthInLumaSamples;  // use param
116
     uint32_t picHeightInLumaSamples; // use param
117
@@ -228,8 +230,6 @@
118
     uint32_t quadtreeTUMaxDepthInter; // use param
119
     uint32_t quadtreeTUMaxDepthIntra; // use param
120
 
121
-    bool     bUseSAO; // use param
122
-    bool     bUseAMP; // use param
123
     uint32_t maxAMPDepth;
124
 
125
     uint32_t maxTempSubLayers;   // max number of Temporal Sub layers
126
@@ -237,11 +237,26 @@
127
     uint32_t maxLatencyIncrease;
128
     int      numReorderPics;
129
 
130
+    bool     bUseSAO; // use param
131
+    bool     bUseAMP; // use param
132
     bool     bUseStrongIntraSmoothing; // use param
133
     bool     bTemporalMVPEnabled;
134
 
135
     Window   conformanceWindow;
136
     VUI      vuiParameters;
137
+
138
+    SPS()
139
+    {
140
+        memset(this, 0, sizeof(*this));
141
+    }
142
+
143
+    ~SPS()
144
+    {
145
+        X265_FREE(cuOffsetY);
146
+        X265_FREE(cuOffsetC);
147
+        X265_FREE(buOffsetY);
148
+        X265_FREE(buOffsetC);
149
+    }
150
 };
151
 
152
 struct PPS
153
@@ -249,6 +264,8 @@
154
     uint32_t maxCuDQPDepth;
155
 
156
     int      chromaQpOffset[2];      // use param
157
+    int      deblockingFilterBetaOffsetDiv2;
158
+    int      deblockingFilterTcOffsetDiv2;
159
 
160
     bool     bUseWeightPred;         // use param
161
     bool     bUseWeightedBiPred;     // use param
162
@@ -262,17 +279,15 @@
163
 
164
     bool     bDeblockingFilterControlPresent;
165
     bool     bPicDisableDeblockingFilter;
166
-    int      deblockingFilterBetaOffsetDiv2;
167
-    int      deblockingFilterTcOffsetDiv2;
168
 };
169
 
170
 struct WeightParam
171
 {
172
     // Explicit weighted prediction parameters parsed in slice header,
173
-    bool     bPresentFlag;
174
     uint32_t log2WeightDenom;
175
     int      inputWeight;
176
     int      inputOffset;
177
+    bool     bPresentFlag;
178
 
179
     /* makes a non-h265 weight (i.e. fix7), into an h265 weight */
180
     void setFromWeightAndOffset(int w, int o, int denom, bool bNormalize)
181
@@ -304,6 +319,9 @@
182
 
183
     const SPS*  m_sps;
184
     const PPS*  m_pps;
185
+    Frame*      m_refFrameList[2][MAX_NUM_REF + 1];
186
+    PicYuv*     m_refReconPicList[2][MAX_NUM_REF + 1];
187
+
188
     WeightParam m_weightPredTable[2][MAX_NUM_REF][3]; // [list][refIdx][0:Y, 1:U, 2:V]
189
     MotionReference (*m_mref)[MAX_NUM_REF + 1];
190
     RPS         m_rps;
191
@@ -312,34 +330,28 @@
192
     SliceType   m_sliceType;
193
     int         m_sliceQp;
194
     int         m_poc;
195
-    
196
     int         m_lastIDR;
197
 
198
-    bool        m_bCheckLDC;       // TODO: is this necessary?
199
-    bool        m_sLFaseFlag;      // loop filter boundary flag
200
-    bool        m_colFromL0Flag;   // collocated picture from List0 or List1 flag
201
     uint32_t    m_colRefIdx;       // never modified
202
-    
203
+
204
     int         m_numRefIdx[2];
205
-    Frame*      m_refPicList[2][MAX_NUM_REF + 1];
206
     int         m_refPOCList[2][MAX_NUM_REF + 1];
207
 
208
     uint32_t    m_maxNumMergeCand; // use param
209
     uint32_t    m_endCUAddr;
210
 
211
+    bool        m_bCheckLDC;       // TODO: is this necessary?
212
+    bool        m_sLFaseFlag;      // loop filter boundary flag
213
+    bool        m_colFromL0Flag;   // collocated picture from List0 or List1 flag
214
+
215
     Slice()
216
     {
217
         m_lastIDR = 0;
218
         m_sLFaseFlag = true;
219
         m_numRefIdx[0] = m_numRefIdx[1] = 0;
220
-        for (int i = 0; i < MAX_NUM_REF; i++)
221
-        {
222
-            m_refPicList[0][i] = NULL;
223
-            m_refPicList[1][i] = NULL;
224
-            m_refPOCList[0][i] = 0;
225
-            m_refPOCList[1][i] = 0;
226
-        }
227
-
228
+        memset(m_refFrameList, 0, sizeof(m_refFrameList));
229
+        memset(m_refReconPicList, 0, sizeof(m_refReconPicList));
230
+        memset(m_refPOCList, 0, sizeof(m_refPOCList));
231
         disableWeights();
232
     }
233
 
234
@@ -347,8 +359,6 @@
235
 
236
     void setRefPicList(PicList& picList);
237
 
238
-    const Frame* getRefPic(int list, int refIdx) const { return refIdx >= 0 ? m_refPicList[list][refIdx] : NULL; }
239
-
240
     bool getRapPicFlag() const
241
     {
242
         return m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL
243
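
Editor's note: the reshuffling of ProfileTierLevel, VPS, Window, VUI, SPS, PPS, WeightParam and Slice above moves bools after wider members. Grouping members by alignment removes padding holes, shrinking the structs without changing any field. A toy illustration (sizes shown are typical for an LP64 ABI, not guaranteed by the standard):

    #include <cstdio>

    struct Mixed   { bool a; int x; bool b; int y; bool c; };  // bools interleaved
    struct Grouped { int x; int y; bool a; bool b; bool c; };  // ints first, bools last

    int main()
    {
        // typically prints: mixed=20 grouped=12
        printf("mixed=%zu grouped=%zu\n", sizeof(Mixed), sizeof(Grouped));
        return 0;
    }
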
x265_1.8.tar.gz/source/common/threading.h -> x265_1.9.tar.gz/source/common/threading.h Changed
41
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -204,6 +205,15 @@
10
         return ret;
11
     }
12
 
13
+    int getIncr(int n = 1)
14
+    {
15
+        EnterCriticalSection(&m_cs);
16
+        int ret = m_val;
17
+        m_val += n;
18
+        LeaveCriticalSection(&m_cs);
19
+        return ret;
20
+    }
21
+
22
     void set(int newval)
23
     {
24
         EnterCriticalSection(&m_cs);
25
@@ -393,6 +403,15 @@
26
         return ret;
27
     }
28
 
29
+    int getIncr(int n = 1)
30
+    {
31
+        pthread_mutex_lock(&m_mutex);
32
+        int ret = m_val;
33
+        m_val += n;
34
+        pthread_mutex_unlock(&m_mutex);
35
+        return ret;
36
+    }
37
+
38
     void set(int newval)
39
     {
40
         pthread_mutex_lock(&m_mutex);
41
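
Editor's note: getIncr(), added to both the Windows and pthreads lock flavours above, is a mutex-guarded fetch-and-add that returns the old value. A standalone equivalent using std::mutex (a sketch, not the x265 class itself):

    #include <mutex>

    class Counter
    {
        std::mutex m_mutex;
        int        m_val = 0;

    public:
        int getIncr(int n = 1)
        {
            std::lock_guard<std::mutex> lock(m_mutex);
            int ret = m_val;    // return the value *before* the increment
            m_val += n;
            return ret;
        }
    };

    // Typical use: handing out unique, monotonically increasing indices to
    // worker threads, e.g.  int myJob = counter.getIncr();
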
x265_1.8.tar.gz/source/common/threadpool.cpp -> x265_1.9.tar.gz/source/common/threadpool.cpp Changed
240
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -59,6 +60,9 @@
10
 #if HAVE_LIBNUMA
11
 #include <numa.h>
12
 #endif
13
+#if defined(_MSC_VER)
14
+# define strcasecmp _stricmp
15
+#endif
16
 
17
 namespace X265_NS {
18
 // x265 private namespace
19
@@ -226,8 +230,13 @@
20
 {
21
     enum { MAX_NODE_NUM = 127 };
22
     int cpusPerNode[MAX_NODE_NUM + 1];
23
+    int threadsPerPool[MAX_NODE_NUM + 2];
24
+    uint64_t nodeMaskPerPool[MAX_NODE_NUM + 2];
25
 
26
     memset(cpusPerNode, 0, sizeof(cpusPerNode));
27
+    memset(threadsPerPool, 0, sizeof(threadsPerPool));
28
+    memset(nodeMaskPerPool, 0, sizeof(nodeMaskPerPool));
29
+
30
     int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
31
     int cpuCount = getCpuCount();
32
     bool bNumaSupport = false;
33
@@ -258,7 +267,7 @@
34
         for (int i = 0; i < numNumaNodes; i++)
35
             x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]);
36
 
37
-    /* limit nodes based on param->numaPools */
38
+    /* limit threads based on param->numaPools */
39
     if (p->numaPools && *p->numaPools)
40
     {
41
         const char *nodeStr = p->numaPools;
42
@@ -266,19 +275,30 @@
43
         {
44
             if (!*nodeStr)
45
             {
46
-                cpusPerNode[i] = 0;
47
+                threadsPerPool[i] = 0;
48
                 continue;
49
             }
50
             else if (*nodeStr == '-')
51
-                cpusPerNode[i] = 0;
52
-            else if (*nodeStr == '*')
53
+                threadsPerPool[i] = 0;
54
+            else if (*nodeStr == '*' || !strcasecmp(nodeStr, "NULL"))
55
+            {
56
+                for (int j = i; j < numNumaNodes; j++)
57
+                {
58
+                    threadsPerPool[numNumaNodes] += cpusPerNode[j];
59
+                    nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << j);
60
+                }
61
                 break;
62
+            }
63
             else if (*nodeStr == '+')
64
-                ;
65
+            {
66
+                threadsPerPool[numNumaNodes] += cpusPerNode[i];
67
+                nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << i);
68
+            }
69
             else
70
             {
71
                 int count = atoi(nodeStr);
72
-                cpusPerNode[i] = X265_MIN(count, cpusPerNode[i]);
73
+                threadsPerPool[i] = X265_MIN(count, cpusPerNode[i]);
74
+                nodeMaskPerPool[i] = ((uint64_t)1 << i);
75
             }
76
 
77
             /* consume current node string, comma, and white-space */
78
@@ -288,14 +308,31 @@
79
                ++nodeStr;
80
         }
81
     }
82
+    else
83
+    {
84
+        for (int i = 0; i < numNumaNodes; i++)
85
+        {
86
+            threadsPerPool[numNumaNodes]  += cpusPerNode[i];
87
+            nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << i);
88
+        }
89
+    }
90
+ 
91
+    // If the last pool size is > MAX_POOL_THREADS, clip it to spawn thread pools only of size >= 1/2 max (heuristic)
92
+    if ((threadsPerPool[numNumaNodes] > MAX_POOL_THREADS) &&
93
+        ((threadsPerPool[numNumaNodes] % MAX_POOL_THREADS) < (MAX_POOL_THREADS / 2)))
94
+    {
95
+        threadsPerPool[numNumaNodes] -= (threadsPerPool[numNumaNodes] % MAX_POOL_THREADS);
96
+        x265_log(p, X265_LOG_DEBUG,
97
+                 "Creating only %d worker threads beyond specified numbers with --pools (if specified) to prevent asymmetry in pools; may not use all HW contexts\n", threadsPerPool[numNumaNodes]);
98
+    }
99
 
100
     numPools = 0;
101
-    for (int i = 0; i < numNumaNodes; i++)
102
+    for (int i = 0; i < numNumaNodes + 1; i++)
103
     {
104
         if (bNumaSupport)
105
             x265_log(p, X265_LOG_DEBUG, "NUMA node %d may use %d logical cores\n", i, cpusPerNode[i]);
106
-        if (cpusPerNode[i])
107
-            numPools += (cpusPerNode[i] + MAX_POOL_THREADS - 1) / MAX_POOL_THREADS;
108
+        if (threadsPerPool[i])
109
+            numPools += (threadsPerPool[i] + MAX_POOL_THREADS - 1) / MAX_POOL_THREADS;
110
     }
111
 
112
     if (!numPools)
113
@@ -314,20 +351,27 @@
114
         int node = 0;
115
         for (int i = 0; i < numPools; i++)
116
         {
117
-            while (!cpusPerNode[node])
118
+            while (!threadsPerPool[node])
119
                 node++;
120
-            int cores = X265_MIN(MAX_POOL_THREADS, cpusPerNode[node]);
121
-            if (!pools[i].create(cores, maxProviders, node))
122
+            int numThreads = X265_MIN(MAX_POOL_THREADS, threadsPerPool[node]);
123
+            if (!pools[i].create(numThreads, maxProviders, nodeMaskPerPool[node]))
124
             {
125
                 X265_FREE(pools);
126
                 numPools = 0;
127
                 return NULL;
128
             }
129
             if (numNumaNodes > 1)
130
-                x265_log(p, X265_LOG_INFO, "Thread pool %d using %d threads on NUMA node %d\n", i, cores, node);
131
+            {
132
+                char *nodesstr = new char[64 * strlen(",63") + 1];
133
+                int len = 0;
134
+                for (int j = 0; j < 64; j++)
135
+                    if ((nodeMaskPerPool[node] >> j) & 1)
136
+                        len += sprintf(nodesstr + len, ",%d", j);
137
+                x265_log(p, X265_LOG_INFO, "Thread pool %d using %d threads on numa nodes %s\n", i, numThreads, nodesstr + 1);
138
+            }
139
             else
140
-                x265_log(p, X265_LOG_INFO, "Thread pool created using %d threads\n", cores);
141
-            cpusPerNode[node] -= cores;
142
+                x265_log(p, X265_LOG_INFO, "Thread pool created using %d threads\n", numThreads);
143
+            threadsPerPool[node] -= numThreads;
144
         }
145
     }
146
     else
147
@@ -340,11 +384,37 @@
148
     memset(this, 0, sizeof(*this));
149
 }
150
 
151
-bool ThreadPool::create(int numThreads, int maxProviders, int node)
152
+bool ThreadPool::create(int numThreads, int maxProviders, uint64_t nodeMask)
153
 {
154
     X265_CHECK(numThreads <= MAX_POOL_THREADS, "a single thread pool cannot have more than MAX_POOL_THREADS threads\n");
155
 
156
-    m_numaNode = node;
157
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
158
+    m_winCpuMask = 0x0;
159
+    GROUP_AFFINITY groupAffinity;
160
+    for (int i = 0; i < getNumaNodeCount(); i++)
161
+    {
162
+        int numaNode = ((nodeMask >> i) & 0x1U) ? i : -1;
163
+        if (numaNode != -1)
164
+            if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
165
+                m_winCpuMask |= groupAffinity.Mask;
166
+    }
167
+    m_numaMask = &m_winCpuMask;
168
+#elif HAVE_LIBNUMA
169
+    if (numa_available() >= 0)
170
+    {
171
+        struct bitmask* nodemask = numa_allocate_nodemask();
172
+        if (nodemask)
173
+        {
174
+            *(nodemask->maskp) = nodeMask;
175
+            m_numaMask = nodemask;
176
+        }
177
+        else
178
+            x265_log(NULL, X265_LOG_ERROR, "unable to get NUMA node mask for %lx\n", nodeMask);
179
+    }
180
+#else
181
+    (void)nodeMask;
182
+#endif
183
+
184
     m_numWorkers = numThreads;
185
 
186
     m_workers = X265_MALLOC(WorkerThread, numThreads);
187
@@ -398,36 +468,39 @@
188
 
189
     X265_FREE(m_workers);
190
     X265_FREE(m_jpTable);
191
+
192
+#if HAVE_LIBNUMA
193
+    if(m_numaMask)
194
+        numa_free_nodemask((struct bitmask*)m_numaMask);
195
+#endif
196
 }
197
 
198
 void ThreadPool::setCurrentThreadAffinity()
199
 {
200
-    setThreadNodeAffinity(m_numaNode);
201
+    setThreadNodeAffinity(m_numaMask);
202
 }
203
 
204
 /* static */
205
-void ThreadPool::setThreadNodeAffinity(int numaNode)
206
+void ThreadPool::setThreadNodeAffinity(void *numaMask)
207
 {
208
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
209
-    GROUP_AFFINITY groupAffinity;
210
-    if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
211
-    {
212
-        if (SetThreadAffinityMask(GetCurrentThread(), (DWORD_PTR)groupAffinity.Mask))
213
-            return;
214
-    }
215
-    x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity to NUMA node %d\n", numaNode);
216
+    if (SetThreadAffinityMask(GetCurrentThread(), *((DWORD_PTR*)numaMask)))
217
+        return;
218
+    else
219
+        x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity for NUMA node mask\n");
220
 #elif HAVE_LIBNUMA
221
     if (numa_available() >= 0)
222
     {
223
-        numa_run_on_node(numaNode);
224
-        numa_set_preferred(numaNode);
225
+        numa_run_on_node_mask((struct bitmask*)numaMask);
226
+        numa_set_interleave_mask((struct bitmask*)numaMask);
227
         numa_set_localalloc();
228
         return;
229
     }
230
-    x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity to NUMA node %d\n", numaNode);
231
+    x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity for NUMA node mask\n");
232
 #else
233
-    (void)numaNode;
234
+    (void)numaMask;
235
 #endif
236
+    return;
237
 }
238
 
239
 /* static */
240
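
Editor's note: the threadpool.cpp rework above implements the changelog's "single pool of threads that can work cross-socket" — nodes selected with '+' or '*' in --pools are merged into one pool whose 64-bit mask carries one bit per participating NUMA node, and the mask is later turned into a thread-affinity set. A small sketch of building and logging such a mask (machine shape and names are illustrative):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const int cpusPerNode[4] = { 8, 8, 8, 8 };   // pretend 4-node machine
        int      numNodes = 4;
        uint64_t nodeMask = 0;
        int      threads  = 0;

        for (int i = 0; i < numNodes; i++)           // the "*" / "+,+,+,+" case
        {
            threads  += cpusPerNode[i];
            nodeMask |= (uint64_t)1 << i;
        }

        // walk the set bits, like the sprintf loop in the new createPools()
        printf("pool of %d threads on nodes:", threads);
        for (int j = 0; j < 64; j++)
            if ((nodeMask >> j) & 1)
                printf(" %d", j);
        printf("\n");                                // -> nodes: 0 1 2 3
        return 0;
    }
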
x265_1.8.tar.gz/source/common/threadpool.h -> x265_1.9.tar.gz/source/common/threadpool.h Changed
31
 
1
@@ -83,7 +83,10 @@
2
     sleepbitmap_t m_sleepBitmap;
3
     int           m_numProviders;
4
     int           m_numWorkers;
5
-    int           m_numaNode;
6
+    void*         m_numaMask; // node mask in linux, cpu mask in windows
7
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
8
+    DWORD_PTR     m_winCpuMask;
9
+#endif
10
     bool          m_isActive;
11
 
12
     JobProvider** m_jpTable;
13
@@ -92,7 +95,7 @@
14
     ThreadPool();
15
     ~ThreadPool();
16
 
17
-    bool create(int numThreads, int maxProviders, int node);
18
+    bool create(int numThreads, int maxProviders, uint64_t nodeMask);
19
     bool start();
20
     void stopWorkers();
21
     void setCurrentThreadAffinity();
22
@@ -103,7 +106,7 @@
23
 
24
     static int  getCpuCount();
25
     static int  getNumaNodeCount();
26
-    static void setThreadNodeAffinity(int node);
27
+    static void setThreadNodeAffinity(void *numaMask);
28
 };
29
 
30
 /* Any worker thread may enlist the help of idle worker threads from the same
31
x265_1.8.tar.gz/source/common/version.cpp -> x265_1.9.tar.gz/source/common/version.cpp Changed
9
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/wavefront.cpp -> x265_1.9.tar.gz/source/common/wavefront.cpp Changed
9
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/wavefront.h -> x265_1.9.tar.gz/source/common/wavefront.h Changed
9
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/x86/asm-primitives.cpp -> x265_1.9.tar.gz/source/common/x86/asm-primitives.cpp Changed
475
 
1
@@ -962,11 +962,8 @@
2
 
3
         p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse2);
4
         p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse2);
5
-
6
-#if X265_DEPTH <= 10
7
         p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse2);
8
         p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_sse2);
9
-#endif /* X265_DEPTH <= 10 */
10
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2);
11
 
12
         p.cu[BLOCK_4x4].intra_pred[2] = PFX(intra_pred_ang4_2_sse2);
13
@@ -1003,13 +1000,12 @@
14
         p.cu[BLOCK_4x4].intra_pred[33] = PFX(intra_pred_ang4_33_sse2);
15
 
16
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_sse2);
17
-#if X265_DEPTH <= 10
18
-        p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
19
-        ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
20
-
21
         p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_4x8_mmx2);
22
         p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_8x16_sse2);
23
         p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_sse2);
24
+#if X265_DEPTH <= 10
25
+        p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
26
+        ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
27
 #endif
28
         p.cu[BLOCK_4x4].dct = PFX(dct4_sse2);
29
         p.cu[BLOCK_8x8].dct = PFX(dct8_sse2);
30
@@ -1031,6 +1027,7 @@
31
         ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
32
         ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
33
         ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
34
+        p.propagateCost = PFX(mbtree_propagate_cost_sse2);
35
     }
36
     if (cpuMask & X265_CPU_SSE3)
37
     {
38
@@ -1144,11 +1141,8 @@
39
 
40
         p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse4);
41
         p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse4);
42
-
43
-#if X265_DEPTH <= 10
44
         p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse4);
45
         p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_sse4);
46
-#endif
47
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
48
         INTRA_ANG_SSE4_COMMON(sse4);
49
         INTRA_ANG_SSE4_HIGH(sse4);
50
@@ -1158,14 +1152,12 @@
51
         p.weight_sp = PFX(weight_sp_sse4);
52
 
53
         p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_sse4);
54
-        p.cu[BLOCK_4x4].psy_cost_ss = PFX(psyCost_ss_4x4_sse4);
55
 
56
         // TODO: check POPCNT flag!
57
         ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
58
 #if X265_DEPTH <= 10
59
         ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
60
 #endif
61
-        ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
62
 
63
         p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s = PFX(filterPixelToShort_2x4_sse4);
64
         p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
65
@@ -1173,6 +1165,7 @@
66
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
67
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s = PFX(filterPixelToShort_2x16_sse4);
68
         p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
69
+        p.costCoeffRemain = PFX(costCoeffRemain_sse4);
70
     }
71
     if (cpuMask & X265_CPU_AVX)
72
     {
73
@@ -1306,6 +1299,7 @@
74
         p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx);
75
         p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
76
         p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
77
+        p.propagateCost = PFX(mbtree_propagate_cost_avx);
78
     }
79
     if (cpuMask & X265_CPU_XOP)
80
     {
81
@@ -1319,6 +1313,9 @@
82
     }
83
     if (cpuMask & X265_CPU_AVX2)
84
     {
85
+#if X265_DEPTH == 12
86
+        ASSIGN_SA8D(avx2);
87
+#endif
88
         p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
89
 
90
         // TODO: the planecopy_sp is really planecopy_SC now, must be fix it
91
@@ -1479,20 +1476,14 @@
92
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_avx2);
93
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_avx2);
94
 
95
-        p.cu[BLOCK_4x4].psy_cost_ss = PFX(psyCost_ss_4x4_avx2);
96
-        p.cu[BLOCK_8x8].psy_cost_ss = PFX(psyCost_ss_8x8_avx2);
97
-        p.cu[BLOCK_16x16].psy_cost_ss = PFX(psyCost_ss_16x16_avx2);
98
-        p.cu[BLOCK_32x32].psy_cost_ss = PFX(psyCost_ss_32x32_avx2);
99
-        p.cu[BLOCK_64x64].psy_cost_ss = PFX(psyCost_ss_64x64_avx2);
100
         p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_avx2);
101
-#if X265_DEPTH <= 10
102
+        p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
103
+        p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2);
104
+
105
         p.cu[BLOCK_8x8].psy_cost_pp = PFX(psyCost_pp_8x8_avx2);
106
         p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx2);
107
         p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx2);
108
         p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx2);
109
-        p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
110
-        p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2);
111
-#endif
112
 
113
         p.cu[BLOCK_16x16].intra_pred[DC_IDX] = PFX(intra_pred_dc16_avx2);
114
         p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2);
115
@@ -1536,20 +1527,13 @@
116
         p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
117
         p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
118
 
119
-#if X265_DEPTH <= 10
120
-        p.cu[BLOCK_16x16].sse_ss = PFX(pixel_ssd_ss_16x16_avx2);
121
-        p.cu[BLOCK_32x32].sse_ss = PFX(pixel_ssd_ss_32x32_avx2);
122
-        p.cu[BLOCK_64x64].sse_ss = PFX(pixel_ssd_ss_64x64_avx2);
123
-
124
-        p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
125
-        p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
126
-        p.cu[BLOCK_64x64].sse_pp = PFX(pixel_ssd_64x64_avx2);
127
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
128
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
129
+        p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_16x16_avx2);
130
+        p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_32x32_avx2);
131
+        p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_64x64_avx2);
132
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_16x16_avx2);
133
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x32_avx2);
134
         p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_avx2);
135
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_avx2);
136
-#endif
137
-
138
         p.quant = PFX(quant_avx2);
139
         p.nquant = PFX(nquant_avx2);
140
         p.dequant_normal  = PFX(dequant_normal_avx2);
141
@@ -1588,21 +1572,16 @@
142
         p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16_avx2);
143
         p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx2);
144
 
145
-#if X265_DEPTH <= 10
146
-        ALL_LUMA_TU_S(dct, dct, avx2);
147
         ALL_LUMA_TU_S(idct, idct, avx2);
148
-#endif
149
+        ALL_LUMA_TU_S(dct, dct, avx2);
150
+
151
         ALL_LUMA_CU_S(transpose, transpose, avx2);
152
 
153
         ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, avx2);
154
         ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, avx2);
155
-#if X265_DEPTH <= 10
156
         ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, avx2);
157
-#endif
158
         ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, avx2);
159
-#if X265_DEPTH <= 10
160
         p.pu[LUMA_4x4].luma_vsp = PFX(interp_8tap_vert_sp_4x4_avx2);               // since ALL_LUMA_PU didn't declare 4x4 size, calling separately luma_vsp function to use 
161
-#endif
162
 
163
         p.cu[BLOCK_16x16].add_ps = PFX(pixel_add_ps_16x16_avx2);
164
         p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
165
@@ -1625,7 +1604,6 @@
166
         p.pu[LUMA_16x12].sad = PFX(pixel_sad_16x12_avx2);
167
         p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx2);
168
         p.pu[LUMA_16x32].sad = PFX(pixel_sad_16x32_avx2);
169
-#if X265_DEPTH <= 10
170
         p.pu[LUMA_16x64].sad = PFX(pixel_sad_16x64_avx2);
171
         p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx2);
172
         p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx2);
173
@@ -1637,7 +1615,6 @@
174
         p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx2);
175
         p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx2);
176
         p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx2);
177
-#endif
178
 
179
         p.pu[LUMA_16x4].sad_x3 = PFX(pixel_sad_x3_16x4_avx2);
180
         p.pu[LUMA_16x8].sad_x3 = PFX(pixel_sad_x3_16x8_avx2);
181
@@ -1712,7 +1689,6 @@
182
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx2);
183
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx2);
184
 
185
-#if X265_DEPTH <= 10
186
         p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_avx2);
187
         p.pu[LUMA_4x8].luma_hps = PFX(interp_8tap_horiz_ps_4x8_avx2);
188
         p.pu[LUMA_4x16].luma_hps = PFX(interp_8tap_horiz_ps_4x16_avx2);
189
@@ -1738,7 +1714,6 @@
190
         p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx2);
191
         p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_avx2);
192
         p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_avx2);
193
-#endif
194
 
195
         p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_avx2);
196
         p.pu[LUMA_4x8].luma_hpp = PFX(interp_8tap_horiz_pp_4x8_avx2);
197
@@ -1766,7 +1741,6 @@
198
         p.pu[LUMA_24x32].luma_hpp = PFX(interp_8tap_horiz_pp_24x32_avx2);
199
         p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx2);
200
 
201
-#if X265_DEPTH <= 10
202
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hps = PFX(interp_4tap_horiz_ps_8x8_avx2);
203
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_hps = PFX(interp_4tap_horiz_ps_8x4_avx2);
204
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_hps = PFX(interp_4tap_horiz_ps_8x16_avx2);
205
@@ -2164,18 +2138,19 @@
206
         p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vsp = PFX(interp_4tap_vert_sp_64x32_avx2);
207
         p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = PFX(interp_4tap_vert_sp_64x48_avx2);
208
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx2);
209
-#endif
210
 
211
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
212
+        p.propagateCost = PFX(mbtree_propagate_cost_avx2);
213
 
214
-#if X265_DEPTH <= 10
215
         // TODO: depends on hps and vsp
216
         ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);                        // calling luma_hvpp for all sizes
217
         p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;             // ALL_LUMA_PU_T has declared all sizes except 4x4, hence calling luma_hvpp[4x4] 
218
-#endif
219
 
220
         if (cpuMask & X265_CPU_BMI2)
221
+        {
222
             p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
223
+            p.costCoeffNxN = PFX(costCoeffNxN_avx2_bmi2);
224
+        }
225
     }
226
 }
227
 #else // if HIGH_BIT_DEPTH
228
@@ -2345,7 +2320,7 @@
229
         p.cu[BLOCK_8x8].idct = PFX(idct8_sse2);
230
 
231
         // TODO: it is passed smoke test, but we need testbench, so temporary disable
232
-        //p.costC1C2Flag = x265_costC1C2Flag_sse2;
233
+        p.costC1C2Flag = PFX(costC1C2Flag_sse2);
234
 #endif
235
         p.idst4x4 = PFX(idst4_sse2);
236
         p.dst4x4 = PFX(dst4_sse2);
237
@@ -2356,6 +2331,7 @@
238
         ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
239
         ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
240
         ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
241
+        p.propagateCost = PFX(mbtree_propagate_cost_sse2);
242
     }
243
     if (cpuMask & X265_CPU_SSE3)
244
     {
245
@@ -2530,7 +2506,6 @@
246
         INTRA_ANG_SSE4(sse4);
247
 
248
         p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_sse4);
249
-        p.cu[BLOCK_4x4].psy_cost_ss = PFX(psyCost_ss_4x4_sse4);
250
 
251
         p.pu[LUMA_4x4].convert_p2s = PFX(filterPixelToShort_4x4_sse4);
252
         p.pu[LUMA_4x8].convert_p2s = PFX(filterPixelToShort_4x8_sse4);
253
@@ -2552,6 +2527,9 @@
254
         p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
255
 
256
 #if X86_64
257
+        p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
258
+        p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4);
259
+
260
         p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
261
         p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
262
         p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4);
263
@@ -2559,7 +2537,6 @@
264
         p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
265
 
266
         ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
267
-        ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
268
 
269
         p.costCoeffNxN = PFX(costCoeffNxN_sse4);
270
 #endif
271
@@ -2664,6 +2641,7 @@
272
         p.pu[LUMA_48x64].copy_pp = PFX(blockcopy_pp_48x64_avx);
273
 
274
         p.frameInitLowres = PFX(frame_init_lowres_core_avx);
275
+        p.propagateCost = PFX(mbtree_propagate_cost_avx);
276
     }
277
     if (cpuMask & X265_CPU_XOP)
278
     {
279
@@ -2678,6 +2656,14 @@
280
 #if X86_64
281
     if (cpuMask & X265_CPU_AVX2)
282
     {
283
+        p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_16x16_avx2);
284
+        p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx2);
285
+        p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx2);
286
+
287
+        p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx2);
288
+        p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx2);
289
+        p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_avx2);
290
+
291
         p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
292
 
293
         p.planecopy_sp = PFX(downShift_16_avx2);
294
@@ -2700,12 +2686,6 @@
295
         p.saoCuOrgB0 = PFX(saoCuOrgB0_avx2);
296
         p.sign = PFX(calSign_avx2);
297
 
298
-        p.cu[BLOCK_4x4].psy_cost_ss = PFX(psyCost_ss_4x4_avx2);
299
-        p.cu[BLOCK_8x8].psy_cost_ss = PFX(psyCost_ss_8x8_avx2);
300
-        p.cu[BLOCK_16x16].psy_cost_ss = PFX(psyCost_ss_16x16_avx2);
301
-        p.cu[BLOCK_32x32].psy_cost_ss = PFX(psyCost_ss_32x32_avx2);
302
-        p.cu[BLOCK_64x64].psy_cost_ss = PFX(psyCost_ss_64x64_avx2);
303
-
304
         p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_avx2);
305
         p.cu[BLOCK_8x8].psy_cost_pp = PFX(psyCost_pp_8x8_avx2);
306
         p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx2);
307
@@ -2811,7 +2791,7 @@
308
         p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_avx2);
309
         p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_avx2);
310
         p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_avx2);
311
-
312
+        p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx2);
313
         p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx2);
314
         p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx2);
315
         p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx2);
316
@@ -2863,6 +2843,11 @@
317
         p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx2);
318
         p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx2);
319
         p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx2);
320
+        p.pu[LUMA_48x64].sad_x4 = PFX(pixel_sad_x4_48x64_avx2);
321
+        p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx2);
322
+        p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx2);
323
+        p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx2);
324
+        p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx2);
325
 
326
         p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
327
         p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
328
@@ -2935,31 +2920,31 @@
329
         p.cu[BLOCK_4x4].intra_pred[32] = PFX(intra_pred_ang4_32_avx2);
330
         p.cu[BLOCK_4x4].intra_pred[33] = PFX(intra_pred_ang4_33_avx2);
331
         p.cu[BLOCK_8x8].intra_pred[3] = PFX(intra_pred_ang8_3_avx2);
332
-        p.cu[BLOCK_8x8].intra_pred[33] = PFX(intra_pred_ang8_33_avx2);
333
         p.cu[BLOCK_8x8].intra_pred[4] = PFX(intra_pred_ang8_4_avx2);
334
-        p.cu[BLOCK_8x8].intra_pred[32] = PFX(intra_pred_ang8_32_avx2);
335
         p.cu[BLOCK_8x8].intra_pred[5] = PFX(intra_pred_ang8_5_avx2);
336
-        p.cu[BLOCK_8x8].intra_pred[31] = PFX(intra_pred_ang8_31_avx2);
337
-        p.cu[BLOCK_8x8].intra_pred[30] = PFX(intra_pred_ang8_30_avx2);
338
         p.cu[BLOCK_8x8].intra_pred[6] = PFX(intra_pred_ang8_6_avx2);
339
         p.cu[BLOCK_8x8].intra_pred[7] = PFX(intra_pred_ang8_7_avx2);
340
-        p.cu[BLOCK_8x8].intra_pred[29] = PFX(intra_pred_ang8_29_avx2);
341
         p.cu[BLOCK_8x8].intra_pred[8] = PFX(intra_pred_ang8_8_avx2);
342
-        p.cu[BLOCK_8x8].intra_pred[28] = PFX(intra_pred_ang8_28_avx2);
343
         p.cu[BLOCK_8x8].intra_pred[9] = PFX(intra_pred_ang8_9_avx2);
344
-        p.cu[BLOCK_8x8].intra_pred[27] = PFX(intra_pred_ang8_27_avx2);
345
-        p.cu[BLOCK_8x8].intra_pred[25] = PFX(intra_pred_ang8_25_avx2);
346
-        p.cu[BLOCK_8x8].intra_pred[12] = PFX(intra_pred_ang8_12_avx2);
347
-        p.cu[BLOCK_8x8].intra_pred[24] = PFX(intra_pred_ang8_24_avx2);
348
         p.cu[BLOCK_8x8].intra_pred[11] = PFX(intra_pred_ang8_11_avx2);
349
+        p.cu[BLOCK_8x8].intra_pred[12] = PFX(intra_pred_ang8_12_avx2);
350
         p.cu[BLOCK_8x8].intra_pred[13] = PFX(intra_pred_ang8_13_avx2);
351
+        p.cu[BLOCK_8x8].intra_pred[14] = PFX(intra_pred_ang8_14_avx2);
352
+        p.cu[BLOCK_8x8].intra_pred[15] = PFX(intra_pred_ang8_15_avx2);
353
+        p.cu[BLOCK_8x8].intra_pred[16] = PFX(intra_pred_ang8_16_avx2);
354
         p.cu[BLOCK_8x8].intra_pred[20] = PFX(intra_pred_ang8_20_avx2);
355
         p.cu[BLOCK_8x8].intra_pred[21] = PFX(intra_pred_ang8_21_avx2);
356
         p.cu[BLOCK_8x8].intra_pred[22] = PFX(intra_pred_ang8_22_avx2);
357
         p.cu[BLOCK_8x8].intra_pred[23] = PFX(intra_pred_ang8_23_avx2);
358
-        p.cu[BLOCK_8x8].intra_pred[14] = PFX(intra_pred_ang8_14_avx2);
359
-        p.cu[BLOCK_8x8].intra_pred[15] = PFX(intra_pred_ang8_15_avx2);
360
-        p.cu[BLOCK_8x8].intra_pred[16] = PFX(intra_pred_ang8_16_avx2);
361
+        p.cu[BLOCK_8x8].intra_pred[24] = PFX(intra_pred_ang8_24_avx2);
362
+        p.cu[BLOCK_8x8].intra_pred[25] = PFX(intra_pred_ang8_25_avx2);
363
+        p.cu[BLOCK_8x8].intra_pred[27] = PFX(intra_pred_ang8_27_avx2);
364
+        p.cu[BLOCK_8x8].intra_pred[28] = PFX(intra_pred_ang8_28_avx2);
365
+        p.cu[BLOCK_8x8].intra_pred[29] = PFX(intra_pred_ang8_29_avx2);
366
+        p.cu[BLOCK_8x8].intra_pred[30] = PFX(intra_pred_ang8_30_avx2);
367
+        p.cu[BLOCK_8x8].intra_pred[31] = PFX(intra_pred_ang8_31_avx2);
368
+        p.cu[BLOCK_8x8].intra_pred[32] = PFX(intra_pred_ang8_32_avx2);
369
+        p.cu[BLOCK_8x8].intra_pred[33] = PFX(intra_pred_ang8_33_avx2);
370
         p.cu[BLOCK_16x16].intra_pred[3] = PFX(intra_pred_ang16_3_avx2);
371
         p.cu[BLOCK_16x16].intra_pred[4] = PFX(intra_pred_ang16_4_avx2);
372
         p.cu[BLOCK_16x16].intra_pred[5] = PFX(intra_pred_ang16_5_avx2);
373
@@ -2970,6 +2955,10 @@
374
         p.cu[BLOCK_16x16].intra_pred[12] = PFX(intra_pred_ang16_12_avx2);
375
         p.cu[BLOCK_16x16].intra_pred[11] = PFX(intra_pred_ang16_11_avx2);
376
         p.cu[BLOCK_16x16].intra_pred[13] = PFX(intra_pred_ang16_13_avx2);
377
+        p.cu[BLOCK_16x16].intra_pred[14] = PFX(intra_pred_ang16_14_avx2);
378
+        p.cu[BLOCK_16x16].intra_pred[15] = PFX(intra_pred_ang16_15_avx2);
379
+        p.cu[BLOCK_16x16].intra_pred[16] = PFX(intra_pred_ang16_16_avx2);
380
+        p.cu[BLOCK_16x16].intra_pred[17] = PFX(intra_pred_ang16_17_avx2);
381
         p.cu[BLOCK_16x16].intra_pred[25] = PFX(intra_pred_ang16_25_avx2);
382
         p.cu[BLOCK_16x16].intra_pred[28] = PFX(intra_pred_ang16_28_avx2);
383
         p.cu[BLOCK_16x16].intra_pred[27] = PFX(intra_pred_ang16_27_avx2);
384
@@ -2981,6 +2970,21 @@
385
         p.cu[BLOCK_16x16].intra_pred[24] = PFX(intra_pred_ang16_24_avx2);
386
         p.cu[BLOCK_16x16].intra_pred[23] = PFX(intra_pred_ang16_23_avx2);
387
         p.cu[BLOCK_16x16].intra_pred[22] = PFX(intra_pred_ang16_22_avx2);
388
+        p.cu[BLOCK_32x32].intra_pred[5]  = PFX(intra_pred_ang32_5_avx2);
389
+        p.cu[BLOCK_32x32].intra_pred[6]  = PFX(intra_pred_ang32_6_avx2);
390
+        p.cu[BLOCK_32x32].intra_pred[7]  = PFX(intra_pred_ang32_7_avx2);
391
+        p.cu[BLOCK_32x32].intra_pred[8]  = PFX(intra_pred_ang32_8_avx2);
392
+        p.cu[BLOCK_32x32].intra_pred[9]  = PFX(intra_pred_ang32_9_avx2);
393
+        p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx2);
394
+        p.cu[BLOCK_32x32].intra_pred[11] = PFX(intra_pred_ang32_11_avx2);
395
+        p.cu[BLOCK_32x32].intra_pred[12] = PFX(intra_pred_ang32_12_avx2);
396
+        p.cu[BLOCK_32x32].intra_pred[13] = PFX(intra_pred_ang32_13_avx2);
397
+        p.cu[BLOCK_32x32].intra_pred[14] = PFX(intra_pred_ang32_14_avx2);
398
+        p.cu[BLOCK_32x32].intra_pred[15] = PFX(intra_pred_ang32_15_avx2);
399
+        p.cu[BLOCK_32x32].intra_pred[16] = PFX(intra_pred_ang32_16_avx2);
400
+        p.cu[BLOCK_32x32].intra_pred[17] = PFX(intra_pred_ang32_17_avx2);
401
+        p.cu[BLOCK_32x32].intra_pred[19] = PFX(intra_pred_ang32_19_avx2);
402
+        p.cu[BLOCK_32x32].intra_pred[20] = PFX(intra_pred_ang32_20_avx2);
403
         p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2);
404
         p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2);
405
         p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2);
406
@@ -3309,6 +3313,12 @@
407
         ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);
408
         p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;
409
 
410
+        p.pu[LUMA_16x4].convert_p2s = PFX(filterPixelToShort_16x4_avx2);
411
+        p.pu[LUMA_16x8].convert_p2s = PFX(filterPixelToShort_16x8_avx2);
412
+        p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_avx2);
413
+        p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_avx2);
414
+        p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_avx2);
415
+        p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_avx2);
416
         p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_avx2);
417
         p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_avx2);
418
         p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx2);
419
@@ -3321,11 +3331,21 @@
420
         p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_avx2);
421
         p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_avx2);
422
 
423
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].p2s = PFX(filterPixelToShort_16x4_avx2);
424
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].p2s = PFX(filterPixelToShort_16x8_avx2);
425
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].p2s = PFX(filterPixelToShort_16x12_avx2);
426
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].p2s = PFX(filterPixelToShort_16x16_avx2);
427
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].p2s = PFX(filterPixelToShort_16x32_avx2);
428
         p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].p2s = PFX(filterPixelToShort_24x32_avx2);
429
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx2);
430
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx2);
431
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx2);
432
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_avx2);
433
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].p2s = PFX(filterPixelToShort_16x8_avx2);
434
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].p2s = PFX(filterPixelToShort_16x16_avx2);
435
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].p2s = PFX(filterPixelToShort_16x24_avx2);
436
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].p2s = PFX(filterPixelToShort_16x32_avx2);
437
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].p2s = PFX(filterPixelToShort_16x64_avx2);
438
         p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].p2s = PFX(filterPixelToShort_24x64_avx2);
439
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_avx2);
440
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx2);
441
@@ -3616,13 +3636,33 @@
442
         p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx2);
443
 
444
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
445
+        p.propagateCost = PFX(mbtree_propagate_cost_avx2);
446
+        p.saoCuStatsE0 = PFX(saoCuStatsE0_avx2);
447
+        p.saoCuStatsE1 = PFX(saoCuStatsE1_avx2);
448
+        p.saoCuStatsE2 = PFX(saoCuStatsE2_avx2);
449
+        p.saoCuStatsE3 = PFX(saoCuStatsE3_avx2);
450
 
451
         if (cpuMask & X265_CPU_BMI2)
452
+        {
453
             p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
454
+            p.costCoeffNxN = PFX(costCoeffNxN_avx2_bmi2);
455
+        }
456
         p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx2);
457
         p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx2);
458
         p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx2);
459
         p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx2);
460
+        p.planeClipAndMax = PFX(planeClipAndMax_avx2);
461
+
462
+        p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx2);
463
+        p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx2);
464
+        p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx2);
465
+        p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx2);
466
+        p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx2);
467
+        p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_avx2);
468
+        p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx2);
469
+        p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx2);
470
+        p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx2);
471
+        p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx2);
472
 
473
     }
474
 #endif
475
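
Note on the asm-primitives.cpp hunks above: they extend x265's runtime dispatch table, which is keyed by block size (and, for intra prediction, by angular mode), overwriting C fallbacks with AVX2 kernels only where one exists (mode 18 gets no entry here), and gating scanPosLast/costCoeffNxN behind an extra BMI2 check. A minimal sketch of that pattern follows; the type names, table sizes, and flag values are assumptions for illustration, not x265's real headers.

    #include <cstdint>

    // Stand-ins for x265's real types; names, sizes, and flag values
    // here are assumed for illustration only.
    enum { BLOCK_4x4, BLOCK_8x8, BLOCK_16x16, BLOCK_32x32, NUM_CU_SIZES };
    enum { CPU_AVX2 = 1 << 0, CPU_BMI2 = 1 << 1 };

    typedef void (*intra_pred_t)(uint16_t* dst, intptr_t dstStride,
                                 const uint16_t* srcPix, int dirMode, int bFilter);

    void intra_pred_ang32_11_c(uint16_t*, intptr_t, const uint16_t*, int, int);
    void intra_pred_ang32_11_avx2(uint16_t*, intptr_t, const uint16_t*, int, int);

    struct EncoderPrimitives {
        intra_pred_t intra_pred[NUM_CU_SIZES][35];   // 35 HEVC intra modes
    };

    void setupPrimitives(EncoderPrimitives& p, uint32_t cpuMask)
    {
        // C fallback is installed first; SIMD kernels overwrite only the
        // entries they implement, so modes without an AVX2 version (such
        // as mode 18 above) keep the portable code.
        p.intra_pred[BLOCK_32x32][11] = intra_pred_ang32_11_c;

        if (cpuMask & CPU_AVX2)
            p.intra_pred[BLOCK_32x32][11] = intra_pred_ang32_11_avx2;
    }

Per-entry registration keeps the hot path a single indirect call and lets each ISA level override only what it actually accelerates.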
x265_1.8.tar.gz/source/common/x86/blockcopy8.asm -> x265_1.9.tar.gz/source/common/x86/blockcopy8.asm Changed
9
 
1
@@ -3,6 +3,7 @@
2
 ;*
3
 ;* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
4
 ;*          Murugan Vairavel <murugan@multicorewareinc.com>
5
+;*          Min Chen <chenm003@163.com>
6
 ;*
7
 ;* This program is free software; you can redistribute it and/or modify
8
 ;* it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/x86/blockcopy8.h -> x265_1.9.tar.gz/source/common/x86/blockcopy8.h Changed
9
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+;*          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/x86/const-a.asm -> x265_1.9.tar.gz/source/common/x86/const-a.asm Changed
90
 
1
@@ -2,6 +2,7 @@
2
 ;* const-a.asm: x86 global constants
3
 ;*****************************************************************************
4
 ;* Copyright (C) 2010-2013 x264 project
5
+;* Copyright (C) 2013-2015 x265 project
6
 ;*
7
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8
 ;*          Fiona Glaser <fiona@x264.com>
9
@@ -31,10 +32,10 @@
10
 
11
 ;; 8-bit constants
12
 
13
-const pb_0,                 times 16 db 0
14
+const pb_0,                 times 32 db 0
15
 const pb_1,                 times 32 db 1
16
 const pb_2,                 times 32 db 2
17
-const pb_3,                 times 16 db 3
18
+const pb_3,                 times 32 db 3
19
 const pb_4,                 times 32 db 4
20
 const pb_8,                 times 32 db 8
21
 const pb_15,                times 32 db 15
22
@@ -54,6 +55,11 @@
23
 const pb_shuf8x8c,          times  1 db   0,   0,   0,   0,   2,   2,   2,   2,   4,   4,   4,   4,   6,   6,   6,   6
24
 const pb_movemask,          times 16 db 0x00
25
                             times 16 db 0xFF
26
+
27
+const pb_movemask_32,       times 32 db 0x00
28
+                            times 32 db 0xFF
29
+                            times 32 db 0x00
30
+
31
 const pb_0000000000000F0F,  times  2 db 0xff, 0x00
32
                             times 12 db 0x00
33
 const pb_000000000000000F,           db 0xff
34
@@ -61,6 +67,7 @@
35
 
36
 ;; 16-bit constants
37
 
38
+const pw_n1,                times 16 dw -1
39
 const pw_1,                 times 16 dw 1
40
 const pw_2,                 times 16 dw 2
41
 const pw_3,                 times 16 dw 3
42
@@ -86,12 +93,12 @@
43
 const pw_ff00,              times  8 dw 0xff00
44
 const pw_2000,              times 16 dw 0x2000
45
 const pw_8000,              times  8 dw 0x8000
46
-const pw_3fff,              times  8 dw 0x3fff
47
+const pw_3fff,              times 16 dw 0x3fff
48
 const pw_32_0,              times  4 dw 32,
49
                             times  4 dw 0
50
 const pw_pixel_max,         times 16 dw ((1 << BIT_DEPTH)-1)
51
 
52
-const pw_0_15,              times  2 dw   0,   1,   2,   3,   4,   5,   6,   7
53
+const pw_0_7,               times  2 dw   0,   1,   2,   3,   4,   5,   6,   7
54
 const pw_ppppmmmm,          times  1 dw   1,   1,   1,   1,  -1,  -1,  -1,  -1
55
 const pw_ppmmppmm,          times  1 dw   1,   1,  -1,  -1,   1,   1,  -1,  -1
56
 const pw_pmpmpmpm,          times 16 dw   1,  -1,   1,  -1,   1,  -1,   1,  -1
57
@@ -107,6 +114,7 @@
58
                             times  7 dw 0xff
59
 const hmul_16p,             times 16 db   1
60
                             times  8 db   1,  -1
61
+const pw_exp2_0_15,                  dw 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768
62
 
63
 
64
 ;; 32-bit constants
65
@@ -115,8 +123,9 @@
66
 const pd_2,                 times  8 dd 2
67
 const pd_4,                 times  4 dd 4
68
 const pd_8,                 times  4 dd 8
69
+const pd_15,                times  8 dd 15
70
 const pd_16,                times  8 dd 16
71
-const pd_31,                times  4 dd 31
72
+const pd_31,                times  8 dd 31
73
 const pd_32,                times  8 dd 32
74
 const pd_64,                times  4 dd 64
75
 const pd_128,               times  4 dd 128
76
@@ -129,7 +138,12 @@
77
 const pd_524416,            times  4 dd 524416
78
 const pd_n32768,            times  8 dd 0xffff8000
79
 const pd_n131072,           times  4 dd 0xfffe0000
80
-
81
+const pd_0000ffff,          times  8 dd 0x0000FFFF
82
+const pd_planar16_mul0,     times  1 dd  15,  14,  13,  12,  11,  10,   9,   8,    7,   6,   5,   4,   3,   2,   1,   0
83
+const pd_planar16_mul1,     times  1 dd   1,   2,   3,   4,   5,   6,   7,   8,    9,  10,  11,  12,  13,  14,  15,  16
84
+const pd_planar32_mul1,     times  1 dd  31,  30,  29,  28,  27,  26,  25,  24,   23,  22,  21,  20,  19,  18,  17,  16
85
+const pd_planar32_mul2,     times  1 dd  17,  18,  19,  20,  21,  22,  23,  24,   25,  26,  27,  28,  29,  30,  31,  32
86
+const pd_planar16_mul2,     times  1 dd  15,  14,  13,  12,  11,  10,   9,   8,    7,   6,   5,   4,   3,   2,   1,   0
87
 const trans8_shuf,          times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
88
 
89
 const popcnt_table
90
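
The const-a.asm hunk widens several constants (pb_0, pb_3, pw_3fff) from 16 to 32 bytes and adds full 32-byte tables (pb_movemask_32, pw_exp2_0_15, the pd_planar*_mul rows) because AVX2 code loads them with full-width YMM reads, which always cover 32 bytes; a 16-byte constant would let the load run into whatever data follows it. A hedged intrinsics equivalent, illustrative rather than actual x265 code:

    #include <immintrin.h>
    #include <cstdint>

    // 32 bytes and 32-byte alignment so a whole-register YMM load stays
    // inside the table (the asm equivalent of "times 32 db 3").
    alignas(32) static const uint8_t pb_3[32] = {
        3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
        3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
    };

    __m256i load_pb_3()
    {
        return _mm256_load_si256(reinterpret_cast<const __m256i*>(pb_3));
    }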
x265_1.8.tar.gz/source/common/x86/cpu-a.asm -> x265_1.9.tar.gz/source/common/x86/cpu-a.asm Changed
9
 
1
@@ -2,6 +2,7 @@
2
 ;* cpu-a.asm: x86 cpu utilities
3
 ;*****************************************************************************
4
 ;* Copyright (C) 2003-2013 x264 project
5
+;* Copyright (C) 2013-2015 x265 project
6
 ;*
7
 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
8
 ;*          Loren Merritt <lorenm@u.washington.edu>
9
x265_1.8.tar.gz/source/common/x86/dct8.asm -> x265_1.9.tar.gz/source/common/x86/dct8.asm Changed
113
 
1
@@ -2115,15 +2115,15 @@
2
     mova     m0, [r0]
3
     pabsw    m1, m0
4
 
5
-    mova     m2, [r1]
6
+    movu     m2, [r1]
7
     pmovsxwd m3, m1
8
     paddd    m2, m3
9
-    mova     [r1], m2
10
-    mova     m2, [r1 + 16]
11
+    movu     [r1], m2
12
+    movu     m2, [r1 + 16]
13
     psrldq   m3, m1, 8
14
     pmovsxwd m4, m3
15
     paddd    m2, m4
16
-    mova     [r1 + 16], m2
17
+    movu     [r1 + 16], m2
18
 
19
     movu     m3, [r2]
20
     psubusw  m1, m3
21
@@ -2174,7 +2174,7 @@
22
     pmaddwd         m0,                 m%4
23
     phaddd          m2,                 m0
24
     paddd           m2,                 m5
25
-    psrad           m2,                 DCT_SHIFT
26
+    psrad           m2,                 DCT8_SHIFT1
27
     packssdw        m2,                 m2
28
     vpermq          m2,                 m2, 0x08
29
     mova            [r5 + %2],          xm2
30
@@ -2190,7 +2190,7 @@
31
     phaddd          m8,                 m9
32
     phaddd          m6,                 m8
33
     paddd           m6,                 m5
34
-    psrad           m6,                 DCT_SHIFT2
35
+    psrad           m6,                 DCT8_SHIFT2
36
 
37
     vbroadcasti128  m4,                 [r6 + %2]
38
     pmaddwd         m10,                m0, m4
39
@@ -2201,7 +2201,7 @@
40
     phaddd          m8,                 m9
41
     phaddd          m10,                m8
42
     paddd           m10,                m5
43
-    psrad           m10,                DCT_SHIFT2
44
+    psrad           m10,                DCT8_SHIFT2
45
 
46
     packssdw        m6,                 m10
47
     vpermq          m10,                m6, 0xD8
48
@@ -2210,18 +2210,7 @@
49
 
50
 INIT_YMM avx2
51
 cglobal dct8, 3, 7, 11, 0-8*16
52
-%if BIT_DEPTH == 12
53
-    %define         DCT_SHIFT          6
54
-    vbroadcasti128  m5,                [pd_16]
55
-%elif BIT_DEPTH == 10
56
-    %define         DCT_SHIFT          4
57
-    vbroadcasti128  m5,                [pd_8]
58
-%elif BIT_DEPTH == 8
59
-    %define         DCT_SHIFT          2
60
-    vbroadcasti128  m5,                [pd_2]
61
-%else
62
-    %error Unsupported BIT_DEPTH!
63
-%endif
64
+vbroadcasti128      m5,                [pd_ %+ DCT8_ROUND1]
65
 %define             DCT_SHIFT2         9
66
 
67
     add             r2d,               r2d
68
@@ -2265,7 +2254,7 @@
69
     DCT8_PASS_1     7 * 16,             7 * 16, 4, 1
70
 
71
     ;pass2
72
-    vbroadcasti128  m5,                [pd_256]
73
+    vbroadcasti128  m5,                [pd_ %+ DCT8_ROUND2]
74
 
75
     mova            m0,                [r5]
76
     mova            m1,                [r5 + 32]
77
@@ -2904,7 +2893,7 @@
78
 cglobal idct8, 3, 7, 13, 0-8*16
79
 %if BIT_DEPTH == 12
80
     %define         IDCT_SHIFT2        8
81
-    vpbroadcastd    m12,                [pd_256]
82
+    vpbroadcastd    m12,                [pd_128]
83
 %elif BIT_DEPTH == 10
84
     %define         IDCT_SHIFT2        10
85
     vpbroadcastd    m12,                [pd_512]
86
@@ -3065,7 +3054,7 @@
87
 cglobal idct16, 3, 7, 16, 0-16*mmsize
88
 %if BIT_DEPTH == 12
89
     %define         IDCT_SHIFT2        8
90
-    vpbroadcastd    m15,                [pd_256]
91
+    vpbroadcastd    m15,                [pd_128]
92
 %elif BIT_DEPTH == 10
93
     %define         IDCT_SHIFT2        10
94
     vpbroadcastd    m15,                [pd_512]
95
@@ -3487,7 +3476,7 @@
96
 
97
 %if BIT_DEPTH == 12
98
     %define         IDCT_SHIFT2        8
99
-    vpbroadcastd    m15,                [pd_256]
100
+    vpbroadcastd    m15,                [pd_128]
101
 %elif BIT_DEPTH == 10
102
     %define         IDCT_SHIFT2        10
103
     vpbroadcastd    m15,                [pd_512]
104
@@ -3651,7 +3640,7 @@
105
 %define             IDCT_SHIFT1         7
106
 %if BIT_DEPTH == 12
107
     %define         IDCT_SHIFT2        8
108
-    vpbroadcastd    m5,                [pd_256]
109
+    vpbroadcastd    m5,                [pd_128]
110
 %elif BIT_DEPTH == 10
111
     %define         IDCT_SHIFT2        10
112
     vpbroadcastd    m5,                [pd_512]
113
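
Two things happen in the dct8.asm hunk. The forward dct8 drops its per-bit-depth %if ladder and derives the first-pass round constant by token pasting (pd_ %+ DCT8_ROUND1), and the inverse-transform paths for BIT_DEPTH == 12 switch their round constant from pd_256 to pd_128. The latter is a rounding fix: HEVC's inverse-transform second pass shifts by 20 - bitDepth, and the offset added before the shift must be half the divisor, 1 << (shift - 1), which is 128 when the shift is 8. A small check of that arithmetic in plain C++, with the values visible in the code above:

    #include <cassert>

    static int idctShift2(int bitDepth) { return 20 - bitDepth; }
    static int roundOffset(int shift)   { return 1 << (shift - 1); }

    int main()
    {
        assert(roundOffset(idctShift2(10)) == 512);  // matches pd_512 above
        assert(roundOffset(idctShift2(12)) == 128);  // pd_128, the corrected value
        return 0;
    }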
x265_1.8.tar.gz/source/common/x86/dct8.h -> x265_1.9.tar.gz/source/common/x86/dct8.h Changed
9
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Nabajit Deka <nabajit@multicorewareinc.com>
5
+;*          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/x86/intrapred16.asm -> x265_1.9.tar.gz/source/common/x86/intrapred16.asm Changed
700
 
1
@@ -109,9 +109,11 @@
2
 cextern pw_16
3
 cextern pw_31
4
 cextern pw_32
5
+cextern pd_15
6
 cextern pd_16
7
 cextern pd_31
8
 cextern pd_32
9
+cextern pd_0000ffff
10
 cextern pw_4096
11
 cextern pw_pixel_max
12
 cextern multiL
13
@@ -123,7 +125,12 @@
14
 cextern pb_unpackwq1
15
 cextern pb_unpackwq2
16
 cextern pw_planar16_mul
17
+cextern pd_planar16_mul0
18
+cextern pd_planar16_mul1
19
 cextern pw_planar32_mul
20
+cextern pd_planar32_mul1
21
+cextern pd_planar32_mul2
22
+cextern pd_planar16_mul2
23
 
24
 ;-----------------------------------------------------------------------------------
25
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
26
@@ -731,6 +738,117 @@
27
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
28
 ;---------------------------------------------------------------------------------------
29
 INIT_XMM sse2
30
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
31
+cglobal intra_pred_planar16, 3,5,13
32
+    add             r1d, r1d
33
+    pxor            m12, m12
34
+
35
+    movu            m2, [r2 + 2]
36
+    movu            m10, [r2 + 18]
37
+
38
+    punpckhwd       m7, m2, m12
39
+    punpcklwd       m2, m12
40
+    punpckhwd       m0, m10, m12
41
+    punpcklwd       m10, m12
42
+
43
+    movzx           r3d, word [r2 + 34]                     ; topRight   = above[16]
44
+    lea             r4, [pd_planar16_mul1]
45
+
46
+    movd            m3, r3d
47
+    pshufd          m3, m3, 0                               ; topRight
48
+
49
+    pmaddwd         m8, m3, [r4 + 3*mmsize]                 ; (x + 1) * topRight
50
+    pmaddwd         m4, m3, [r4 + 2*mmsize]                 ; (x + 1) * topRight
51
+    pmaddwd         m9, m3, [r4 + 1*mmsize]                 ; (x + 1) * topRight
52
+    pmaddwd         m3, m3, [r4 + 0*mmsize]                 ; (x + 1) * topRight
53
+
54
+    mova            m11, [pd_15]
55
+    pmaddwd         m1, m2,  m11                            ; (blkSize - 1 - y) * above[x]
56
+    pmaddwd         m6, m7,  m11                            ; (blkSize - 1 - y) * above[x]
57
+    pmaddwd         m5, m10, m11                            ; (blkSize - 1 - y) * above[x]
58
+    pmaddwd         m11, m0                                 ; (blkSize - 1 - y) * above[x]
59
+
60
+    paddd           m4, m5
61
+    paddd           m3, m1
62
+    paddd           m8, m11
63
+    paddd           m9, m6
64
+
65
+    mova            m5, [pd_16]
66
+    paddd           m3, m5
67
+    paddd           m9, m5
68
+    paddd           m4, m5
69
+    paddd           m8, m5
70
+
71
+    movzx           r4d, word [r2 + 98]                     ; bottomLeft = left[16]
72
+    movd            m6, r4d
73
+    pshufd          m6, m6, 0                               ; bottomLeft
74
+
75
+    paddd           m4, m6
76
+    paddd           m3, m6
77
+    paddd           m8, m6
78
+    paddd           m9, m6
79
+
80
+    psubd           m1, m6, m0                              ; column 12-15
81
+    psubd           m11, m6, m10                            ; column 8-11
82
+    psubd           m10, m6, m7                             ; column 4-7
83
+    psubd           m6, m2                                  ; column 0-3
84
+
85
+    add             r2, 66
86
+    lea             r4, [pd_planar16_mul0]
87
+
88
+%macro INTRA_PRED_PLANAR16_sse2 1
89
+    movzx           r3d, word [r2 + %1*2]
90
+    movd            m5, r3d
91
+    pshufd          m5, m5, 0
92
+
93
+    pmaddwd         m0, m5, [r4 + 3*mmsize]                 ; column 12-15
94
+    pmaddwd         m2, m5, [r4 + 2*mmsize]                 ; column 8-11
95
+    pmaddwd         m7, m5, [r4 + 1*mmsize]                 ; column 4-7
96
+    pmaddwd         m5, m5, [r4 + 0*mmsize]                 ; column 0-3
97
+
98
+    paddd           m0, m8
99
+    paddd           m2, m4
100
+    paddd           m7, m9
101
+    paddd           m5, m3
102
+
103
+    paddd           m8, m1
104
+    paddd           m4, m11
105
+    paddd           m9, m10
106
+    paddd           m3, m6
107
+
108
+    psrad           m0, 5
109
+    psrad           m2, 5
110
+    psrad           m7, 5
111
+    psrad           m5, 5
112
+
113
+    packssdw        m2, m0
114
+    packssdw        m5, m7
115
+    movu            [r0], m5
116
+    movu            [r0 + mmsize], m2
117
+
118
+    add             r0, r1
119
+%endmacro
120
+
121
+    INTRA_PRED_PLANAR16_sse2 0
122
+    INTRA_PRED_PLANAR16_sse2 1
123
+    INTRA_PRED_PLANAR16_sse2 2
124
+    INTRA_PRED_PLANAR16_sse2 3
125
+    INTRA_PRED_PLANAR16_sse2 4
126
+    INTRA_PRED_PLANAR16_sse2 5
127
+    INTRA_PRED_PLANAR16_sse2 6
128
+    INTRA_PRED_PLANAR16_sse2 7
129
+    INTRA_PRED_PLANAR16_sse2 8
130
+    INTRA_PRED_PLANAR16_sse2 9
131
+    INTRA_PRED_PLANAR16_sse2 10
132
+    INTRA_PRED_PLANAR16_sse2 11
133
+    INTRA_PRED_PLANAR16_sse2 12
134
+    INTRA_PRED_PLANAR16_sse2 13
135
+    INTRA_PRED_PLANAR16_sse2 14
136
+    INTRA_PRED_PLANAR16_sse2 15
137
+    RET
138
+
139
+%else
140
+; code for BIT_DEPTH == 10
141
 cglobal intra_pred_planar16, 3,3,8
142
     movu            m2, [r2 + 2]
143
     movu            m7, [r2 + 18]
144
@@ -809,7 +927,180 @@
145
     INTRA_PRED_PLANAR_16 14
146
     INTRA_PRED_PLANAR_16 15
147
     RET
148
+%endif
149
+
150
+;---------------------------------------------------------------------------------------
151
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
152
+;---------------------------------------------------------------------------------------
153
+INIT_XMM sse2
154
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
155
+cglobal intra_pred_planar32, 3,7,16
156
+    ; NOTE: align stack to 64 bytes so all of the local data sits in one cache line
157
+    mov             r6, rsp
158
+    sub             rsp, 4*mmsize
159
+    and             rsp, ~63
160
+    %define         m16 [rsp + 0 * mmsize]
161
+    %define         m17 [rsp + 1 * mmsize]
162
+    %define         m18 [rsp + 2 * mmsize]
163
+    %define         m19 [rsp + 3 * mmsize]
164
+
165
+    add             r1, r1
166
+    pxor            m12, m12
167
+
168
+    movzx           r3d, word [r2 + 66]
169
+    lea             r4, [planar32_table1]
170
+
171
+    movd            m0, r3d
172
+    pshufd          m0, m0, 0
173
+
174
+    pmaddwd         m8, m0, [r4 + 0]
175
+    pmaddwd         m9, m0, [r4 + 16]
176
+    pmaddwd         m10, m0, [r4 + 32]
177
+    pmaddwd         m11, m0, [r4 + 48]
178
+    pmaddwd         m7, m0, [r4 + 64]
179
+    pmaddwd         m13, m0, [r4 + 80]
180
+    pmaddwd         m14, m0, [r4 + 96]
181
+    pmaddwd         m15, m0, [r4 + 112]
182
+
183
+    movzx           r3d, word [r2 + 194]
184
+    movd            m0, r3d
185
+    pshufd          m0, m0, 0
186
+
187
+    paddd           m8, m0
188
+    paddd           m9, m0
189
+    paddd           m10, m0
190
+    paddd           m11, m0
191
+    paddd           m7, m0
192
+    paddd           m13, m0
193
+    paddd           m14, m0
194
+    paddd           m15, m0
195
+
196
+    paddd           m8, [pd_32]
197
+    paddd           m9, [pd_32]
198
+    paddd           m10, [pd_32]
199
+    paddd           m11, [pd_32]
200
+    paddd           m7, [pd_32]
201
+    paddd           m13, [pd_32]
202
+    paddd           m14, [pd_32]
203
+    paddd           m15, [pd_32]
204
+
205
+    movu            m1, [r2 + 2]
206
+    punpckhwd       m5, m1, m12
207
+    pmaddwd         m2, m5, [pd_31]
208
+    paddd           m9, m2
209
+    psubd           m2, m0, m5
210
+
211
+    punpcklwd       m1, m12
212
+    pmaddwd         m5, m1, [pd_31]
213
+    paddd           m8, m5
214
+    psubd           m3, m0, m1
215
+
216
+    movu            m1, [r2 + 18]
217
+    punpckhwd       m5, m1, m12
218
+    pmaddwd         m4, m5, [pd_31]
219
+    paddd           m11, m4
220
+    psubd           m4, m0, m5
221
+
222
+    punpcklwd       m1, m12
223
+    pmaddwd         m5, m1, [pd_31]
224
+    paddd           m10, m5
225
+    psubd           m5, m0, m1
226
+    mova            m16, m5
227
+
228
+    movu            m1, [r2 + 34]
229
+    punpckhwd       m6, m1, m12
230
+    psubd           m5, m0, m6
231
+    pmaddwd         m6, [pd_31]
232
+    paddd           m13, m6
233
+
234
+    punpcklwd       m6, m1, m12
235
+    psubd           m1, m0, m6
236
+    mova            m17, m1
237
+    pmaddwd         m6, [pd_31]
238
+    paddd           m7, m6
239
+
240
+    movu            m1, [r2 + 50]
241
+    mova            m18, m1
242
+    punpckhwd       m6, m1, m12
243
+    psubd           m1, m0, m6
244
+    pmaddwd         m6, [pd_31]
245
+    paddd           m15, m6
246
+
247
+    punpcklwd       m6, m18, m12
248
+    psubd           m12, m0, m6
249
+    mova            m19, m12
250
+    pmaddwd         m6, [pd_31]
251
+    paddd           m14, m6
252
+
253
+    add             r2, 130
254
+    lea             r5, [planar32_table]
255
+
256
+%macro INTRA_PRED_PLANAR32_sse2 0
257
+    movzx           r3d, word [r2]
258
+    movd            m0, r3d
259
+    pshufd          m0, m0, 0
260
+
261
+    pmaddwd         m6, m0, [r5]
262
+    pmaddwd         m12, m0, [r5 + 16]
263
+    paddd           m6, m8
264
+    paddd           m12, m9
265
+    paddd           m8, m3
266
+    paddd           m9, m2
267
+    psrad           m6, 6
268
+    psrad           m12, 6
269
+    packssdw        m6, m12
270
+    movu            [r0], m6
271
+
272
+    pmaddwd         m6, m0, [r5 + 32]
273
+    pmaddwd         m12, m0, [r5 + 48]
274
+    paddd           m6, m10
275
+    paddd           m12, m11
276
+    paddd           m10, m16
277
+    paddd           m11, m4
278
+    psrad           m6, 6
279
+    psrad           m12, 6
280
+    packssdw        m6, m12
281
+    movu            [r0 + 16], m6
282
+
283
+    pmaddwd         m6, m0, [r5 + 64]
284
+    pmaddwd         m12, m0, [r5 + 80]
285
+    paddd           m6, m7
286
+    paddd           m12, m13
287
+    paddd           m7, m17
288
+    paddd           m13, m5
289
+    psrad           m6, 6
290
+    psrad           m12, 6
291
+    packssdw        m6, m12
292
+    movu            [r0 + 32], m6
293
+
294
+    pmaddwd         m6, m0, [r5 + 96]
295
+    pmaddwd         m12, m0, [r5 + 112]
296
+    paddd           m6, m14
297
+    paddd           m12, m15
298
+    paddd           m14, m19
299
+    paddd           m15, m1
300
+    psrad           m6, 6
301
+    psrad           m12, 6
302
+    packssdw        m6, m12
303
+    movu            [r0 + 48], m6
304
+
305
+    lea             r0, [r0 + r1]
306
+    add             r2, 2
307
+%endmacro
308
+
309
+    mov             r4, 8
310
+.loop:
311
+    INTRA_PRED_PLANAR32_sse2
312
+    INTRA_PRED_PLANAR32_sse2
313
+    INTRA_PRED_PLANAR32_sse2
314
+    INTRA_PRED_PLANAR32_sse2
315
+    dec             r4
316
+    jnz             .loop
317
+    mov             rsp, r6
318
+    RET
319
 
320
+%else
321
+; code for BIT_DEPTH == 10
322
 ;---------------------------------------------------------------------------------------
323
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
324
 ;---------------------------------------------------------------------------------------
325
@@ -917,11 +1208,132 @@
326
 %assign x x+1
327
 %endrep
328
     RET
329
+%endif
330
 
331
 ;---------------------------------------------------------------------------------------
332
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
333
 ;---------------------------------------------------------------------------------------
334
 INIT_YMM avx2
335
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
336
+cglobal intra_pred_planar32, 3,4,16
337
+    pmovzxwd        m1, [r2 + 2]
338
+    pmovzxwd        m4, [r2 + 34]
339
+    pmovzxwd        m2, [r2 + 18]
340
+    pmovzxwd        m3, [r2 + 50]
341
+    lea             r2, [r2 + 66]
342
+
343
+    movzx           r3d, word [r2]
344
+    movd            xm5, r3d
345
+    vpbroadcastd    m5, xm5
346
+
347
+    pslld           m8, m5, 3
348
+    pmulld          m7, m5, [pd_planar16_mul1 + 32]
349
+    psubd           m6, m7, m8
350
+    pmulld          m9, m5, [pd_planar32_mul2 + 32]
351
+    psubd           m8, m9, m8
352
+
353
+    movzx           r3d, word [r2 + 128]
354
+    movd            xm10, r3d
355
+    vpbroadcastd    m10, xm10
356
+
357
+    mova            m11, m10
358
+    paddd           m11, [pd_32]
359
+
360
+    paddd           m6, m11
361
+    paddd           m7, m11
362
+    paddd           m8, m11
363
+    paddd           m9, m11
364
+
365
+    psubd           m0, m10, m1
366
+    mova            m13, m0
367
+    pslld           m5, m1, 5
368
+    psubd           m1, m5, m1
369
+    paddd           m12, m6, m1
370
+
371
+    psubd           m5, m10, m4
372
+    mova            m6, m5
373
+    pslld           m1, m4, 5
374
+    psubd           m4, m1, m4
375
+    paddd           m14, m8, m4
376
+
377
+    psubd           m1, m10, m2
378
+    mova            m8, m1
379
+    pslld           m4, m2, 5
380
+    psubd           m2, m4, m2
381
+    paddd           m7, m2
382
+
383
+    psubd           m11, m10, m3
384
+    mova            m15, m11
385
+    pslld           m4, m3, 5
386
+    psubd           m3, m4, m3
387
+    paddd           m9, m3
388
+
389
+    mova            m2, [pd_planar32_mul1 + 32]
390
+    mova            m4, [pd_planar16_mul2 + 32]
391
+
392
+    add             r1, r1
393
+
394
+%macro PROCESS_AVX2 1
395
+    movzx           r3d, word [r2 + %1 * 2]
396
+    movd            xm0, r3d
397
+    vpbroadcastd    m0, xm0
398
+
399
+    pmulld          m1, m0, m2
400
+    pslld           m3, m0, 3
401
+    paddd           m5, m1, m3
402
+    pmulld          m0, m4
403
+    paddd           m11, m0, m3
404
+
405
+    paddd           m5, m12
406
+    paddd           m1, m7
407
+    paddd           m11, m14
408
+    paddd           m0, m9
409
+
410
+    psrad           m5, 6
411
+    psrad           m1, 6
412
+    psrad           m11, 6
413
+    psrad           m0, 6
414
+
415
+    packssdw        m5, m1
416
+    packssdw        m11, m0
417
+
418
+    vpermq          m5, m5, q3120
419
+    vpermq          m11, m11, q3120
420
+
421
+    movu            [r0], m5
422
+    movu            [r0 + mmsize], m11
423
+%endmacro
424
+
425
+%macro  INCREMENT_AVX2 0
426
+    paddd           m12, m13
427
+    paddd           m14, m6
428
+    paddd           m7, m8
429
+    paddd           m9, m15
430
+
431
+    add             r0, r1
432
+%endmacro
433
+
434
+    add             r2, mmsize*2
435
+%assign x 0
436
+%rep 4
437
+%assign y 0
438
+%rep 8
439
+    PROCESS_AVX2 y
440
+%if x + y < 10
441
+    INCREMENT_AVX2
442
+%endif
443
+%assign y y+1
444
+%endrep
445
+lea                 r2, [r2 + 16]
446
+%assign x x+1
447
+%endrep
448
+    RET
449
+
450
+%else
451
+; code for BIT_DEPTH == 10
452
+;---------------------------------------------------------------------------------------
453
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
454
+;---------------------------------------------------------------------------------------
455
 cglobal intra_pred_planar32, 3,3,8
456
     movu            m1, [r2 + 2]
457
     movu            m4, [r2 + 34]
458
@@ -980,11 +1392,106 @@
459
 %assign x x+1
460
 %endrep
461
     RET
462
+%endif
463
 
464
 ;---------------------------------------------------------------------------------------
465
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
466
 ;---------------------------------------------------------------------------------------
467
 INIT_YMM avx2
468
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
469
+cglobal intra_pred_planar16, 3,3,11
470
+    add             r1d, r1d
471
+
472
+    movzx           r4d, word [r2 + 34]
473
+    movd            xm3, r4d
474
+    vpbroadcastd    m3, xm3
475
+
476
+    movzx           r4d, word [r2 + 98]
477
+    movd            xm4, r4d
478
+    vpbroadcastd    m4, xm4
479
+
480
+    pmovzxwd        m2, [r2 + 2]
481
+    pmovzxwd        m5, [r2 + 18]
482
+
483
+    pmulld          m10, m3, [pd_planar16_mul1]
484
+    pmulld          m7, m3, [pd_planar16_mul1 + 32]
485
+
486
+    psubd           m10, m2
487
+    pslld           m1, m2, 4
488
+    paddd           m10, m1
489
+
490
+    psubd           m7, m5
491
+    pslld           m6, m5, 4
492
+    paddd           m9, m6, m7
493
+
494
+    paddd           m10, [pd_16]
495
+    paddd           m9, [pd_16]
496
+    paddd           m7, m10, m4
497
+    paddd           m9, m4
498
+
499
+    psubd           m0, m4, m2
500
+    psubd           m8, m4, m5
501
+
502
+    add             r2, 66
503
+    mova            m5, [pd_planar16_mul0]
504
+    mova            m6, [pd_planar16_mul0 + 32]
505
+    mova            m10, [pd_0000ffff]
506
+
507
+%macro INTRA_PRED_PLANAR16_AVX2 1
508
+    vpbroadcastd    m2, [r2 + %1]
509
+    pand            m1, m2, m10
510
+    psrld           m2, 16
511
+
512
+    pmulld          m3, m1, m5
513
+    pmulld          m4, m1, m6
514
+    pmulld          m1, m2, m5
515
+    pmulld          m2, m2, m6
516
+
517
+    paddd           m3, m7
518
+    paddd           m4, m9
519
+    paddd           m7, m0
520
+    paddd           m9, m8
521
+
522
+    psrad           m3, 5
523
+    psrad           m4, 5
524
+
525
+    paddd           m1, m7
526
+    paddd           m2, m9
527
+
528
+    psrad           m1, 5
529
+    psrad           m2, 5
530
+
531
+    paddd           m7, m0
532
+    paddd           m9, m8
533
+
534
+    packssdw        m3, m4
535
+    packssdw        m1, m2
536
+
537
+    vpermq          m3, m3, q3120
538
+    vpermq          m1, m1, q3120
539
+
540
+    movu            [r0], m3
541
+    movu            [r0 + r1], m1
542
+%if %1 <= 24
543
+    lea             r0, [r0 + r1 * 2]
544
+%endif
545
+%endmacro
546
+    INTRA_PRED_PLANAR16_AVX2 0
547
+    INTRA_PRED_PLANAR16_AVX2 4
548
+    INTRA_PRED_PLANAR16_AVX2 8
549
+    INTRA_PRED_PLANAR16_AVX2 12
550
+    INTRA_PRED_PLANAR16_AVX2 16
551
+    INTRA_PRED_PLANAR16_AVX2 20
552
+    INTRA_PRED_PLANAR16_AVX2 24
553
+    INTRA_PRED_PLANAR16_AVX2 28
554
+%undef INTRA_PRED_PLANAR16_AVX2
555
+    RET
556
+
557
+%else
558
+; code for BIT_DEPTH == 10
559
+;---------------------------------------------------------------------------------------
560
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
561
+;---------------------------------------------------------------------------------------
562
 cglobal intra_pred_planar16, 3,3,4
563
     add             r1d, r1d
564
     vpbroadcastw    m3, [r2 + 34]
565
@@ -1028,6 +1535,7 @@
566
     INTRA_PRED_PLANAR16_AVX2 28
567
 %undef INTRA_PRED_PLANAR16_AVX2
568
     RET
569
+%endif
570
 
571
 %macro TRANSPOSE_4x4 0
572
     punpckhwd    m0, m1, m3
573
@@ -2216,6 +2724,118 @@
574
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
575
 ;---------------------------------------------------------------------------------------
576
 INIT_XMM sse4
577
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
578
+cglobal intra_pred_planar16, 3,5,12
579
+    add             r1d, r1d
580
+
581
+    pmovzxwd        m2, [r2 + 2]
582
+    pmovzxwd        m7, [r2 + 10]
583
+    pmovzxwd        m10, [r2 + 18]
584
+    pmovzxwd        m0, [r2 + 26]
585
+
586
+    movzx           r3d, word [r2 + 34]                     ; topRight   = above[16]
587
+    lea             r4, [pd_planar16_mul1]
588
+
589
+    movd            m3, r3d
590
+    pshufd          m3, m3, 0                               ; topRight
591
+
592
+    pslld           m8, m3, 2
593
+    pmulld          m3, m3, [r4 + 0*mmsize]                 ; (x + 1) * topRight
594
+    paddd           m9, m3, m8
595
+    paddd           m4, m9, m8
596
+    paddd           m8, m4
597
+
598
+    pslld           m1, m2, 4
599
+    pslld           m6, m7, 4
600
+    pslld           m5, m10, 4
601
+    pslld           m11, m0, 4
602
+    psubd           m1, m2
603
+    psubd           m6, m7
604
+    psubd           m5, m10
605
+    psubd           m11, m0
606
+
607
+    paddd           m4, m5
608
+    paddd           m3, m1
609
+    paddd           m8, m11
610
+    paddd           m9, m6
611
+
612
+    mova            m5, [pd_16]
613
+    paddd           m3, m5
614
+    paddd           m9, m5
615
+    paddd           m4, m5
616
+    paddd           m8, m5
617
+
618
+    movzx           r4d, word [r2 + 98]                     ; bottomLeft = left[16]
619
+    movd            m6, r4d
620
+    pshufd          m6, m6, 0                               ; bottomLeft
621
+
622
+    paddd           m4, m6
623
+    paddd           m3, m6
624
+    paddd           m8, m6
625
+    paddd           m9, m6
626
+
627
+    psubd           m1, m6, m0                              ; column 12-15
628
+    psubd           m11, m6, m10                            ; column 8-11
629
+    psubd           m10, m6, m7                             ; column 4-7
630
+    psubd           m6, m2                                  ; column 0-3
631
+
632
+    add             r2, 66
633
+    lea             r4, [pd_planar16_mul0]
634
+
635
+%macro INTRA_PRED_PLANAR16 1
636
+    movzx           r3d, word [r2]
637
+    movd            m5, r3d
638
+    pshufd          m5, m5, 0
639
+
640
+    pmulld          m0, m5, [r4 + 3*mmsize]                 ; column 12-15
641
+    pmulld          m2, m5, [r4 + 2*mmsize]                 ; column 8-11
642
+    pmulld          m7, m5, [r4 + 1*mmsize]                 ; column 4-7
643
+    pmulld          m5, m5, [r4 + 0*mmsize]                 ; column 0-3
644
+
645
+    paddd           m0, m8
646
+    paddd           m2, m4
647
+    paddd           m7, m9
648
+    paddd           m5, m3
649
+
650
+    paddd           m8, m1
651
+    paddd           m4, m11
652
+    paddd           m9, m10
653
+    paddd           m3, m6
654
+
655
+    psrad           m0, 5
656
+    psrad           m2, 5
657
+    psrad           m7, 5
658
+    psrad           m5, 5
659
+
660
+    packusdw        m2, m0
661
+    packusdw        m5, m7
662
+    movu            [r0], m5
663
+    movu            [r0 + mmsize], m2
664
+
665
+    add             r2, 2
666
+    lea             r0, [r0 + r1]
667
+%endmacro
668
+
669
+    INTRA_PRED_PLANAR16 0
670
+    INTRA_PRED_PLANAR16 1
671
+    INTRA_PRED_PLANAR16 2
672
+    INTRA_PRED_PLANAR16 3
673
+    INTRA_PRED_PLANAR16 4
674
+    INTRA_PRED_PLANAR16 5
675
+    INTRA_PRED_PLANAR16 6
676
+    INTRA_PRED_PLANAR16 7
677
+    INTRA_PRED_PLANAR16 8
678
+    INTRA_PRED_PLANAR16 9
679
+    INTRA_PRED_PLANAR16 10
680
+    INTRA_PRED_PLANAR16 11
681
+    INTRA_PRED_PLANAR16 12
682
+    INTRA_PRED_PLANAR16 13
683
+    INTRA_PRED_PLANAR16 14
684
+    INTRA_PRED_PLANAR16 15
685
+    RET
686
+
687
+%else
688
+; code for BIT_DEPTH == 10
689
 cglobal intra_pred_planar16, 3,3,8
690
     add             r1, r1
691
     movu            m2, [r2 + 2]
692
@@ -2293,6 +2913,7 @@
693
     INTRA_PRED_PLANAR16 14
694
     INTRA_PRED_PLANAR16 15
695
     RET
696
+%endif
697
 
698
 ;---------------------------------------------------------------------------------------
699
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
700
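
All of the planar16/planar32 variants added above (SSE2, SSE4, AVX2, for both bit-depth builds) vectorize the same scalar recurrence; the asm comments name its terms ((x + 1) * topRight, (blkSize - 1 - y) * above[x], bottomLeft), and the pd_16/pd_32 adds paired with the >> 5 / >> 6 shifts are the N and log2(N) + 1 rounding for N = 16 and 32. A scalar reference sketch follows, assuming a simplified interface with separate above/left arrays rather than x265's packed srcPix buffer:

    #include <cstdint>

    typedef uint16_t pixel;   // 10/12-bit samples in the HIGH_BIT_DEPTH build

    // left[] and above[] each hold N + 1 reference samples (hypothetical
    // layout; x265 packs them into one srcPix array instead).
    void intra_pred_planar_c(pixel* dst, intptr_t dstStride,
                             const pixel* above, const pixel* left, int log2N)
    {
        const int N = 1 << log2N;
        const int topRight   = above[N];   // word [r2 + 34] when N == 16
        const int bottomLeft = left[N];    // word [r2 + 98] when N == 16
        for (int y = 0; y < N; y++)
            for (int x = 0; x < N; x++)
                dst[y * dstStride + x] = (pixel)(
                    ((N - 1 - x) * left[y] + (x + 1) * topRight +
                     (N - 1 - y) * above[x] + (y + 1) * bottomLeft + N)
                    >> (log2N + 1));
    }

The per-row kernels above hoist everything that depends only on x into vector accumulators before the row loop, so each output row costs one broadcast of left[y], a multiply-add, and the shift.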
x265_1.8.tar.gz/source/common/x86/intrapred8.asm -> x265_1.9.tar.gz/source/common/x86/intrapred8.asm Changed
11439
 
1
@@ -27,7 +27,9 @@
2
 
3
 SECTION_RODATA 32
4
 
5
-intra_pred_shuff_0_8:    times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
6
+const intra_pred_shuff_0_8,     times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
7
+                                        db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
8
+
9
 intra_pred_shuff_15_0:   times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
10
 
11
 intra_filter4_shuf0:  times 2 db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
12
@@ -54,13 +56,13 @@
13
 c_shuf8_0:            db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
14
 c_deinterval8:        db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
15
 pb_unpackbq:          db  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1
16
-c_mode16_12:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
17
-c_mode16_13:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
18
-c_mode16_14:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
19
+c_mode16_12:          db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
20
+c_mode16_13:          db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
21
+c_mode16_14:          db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
22
 c_mode16_15:          db  0,  0,  0,  0,  0,  0,  0,  0, 15, 13, 11,  9,  8,  6,  4,  2
23
 c_mode16_16:          db  8,  6,  5,  3,  2,  0, 15, 14, 12, 11,  9,  8,  6,  5,  3,  2
24
 c_mode16_17:          db  4,  2,  1,  0, 15, 14, 12, 11, 10,  9,  7,  6,  5,  4,  2,  1
25
-c_mode16_18:    db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
26
+c_mode16_18:          db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
27
 
28
 ALIGN 32
29
 c_ang8_src1_9_2_10:   db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
30
@@ -259,235 +261,6 @@
31
                      db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
32
                      db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
33
 
34
-
35
-ALIGN 32
36
-c_ang32_mode_27:    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
37
-                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
38
-                    db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
39
-                    db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
40
-                    db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
41
-                    db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
42
-                    db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
43
-                    db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
44
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
45
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
46
-                    db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
47
-                    db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
48
-                    db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
49
-                    db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
50
-                    db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
51
-                    db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
52
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
53
-
54
-
55
-ALIGN 32
56
-c_ang32_mode_28:    db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
57
-                    db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
58
-                    db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
59
-                    db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
60
-                    db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
61
-                    db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
62
-                    db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
63
-                    db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
64
-                    db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
65
-                    db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
66
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9
67
-                    db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
68
-                    db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
69
-                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7
70
-                    db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
71
-                    db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
72
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
73
-
74
-ALIGN 32
75
-c_ang32_mode_29:    db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
76
-                    db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
77
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
78
-                    db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
79
-                    db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
80
-                    db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
81
-                    db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
82
-                    db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
83
-                    db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
84
-                    db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
85
-                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
86
-                    db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
87
-                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
88
-                    db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
89
-                    db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
90
-                    db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
91
-                    db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
92
-                    db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
93
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
94
-
95
-
96
-ALIGN 32
97
-c_ang32_mode_30:    db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
98
-                    db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
99
-                    db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
100
-                    db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
101
-                    db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
102
-                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
103
-                    db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
104
-                    db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
105
-                    db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
106
-                    db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29,  3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
107
-                    db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
108
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
109
-                    db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
110
-                    db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
111
-                    db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
112
-                    db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
113
-                    db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
114
-                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
115
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
116
-
117
-
118
-ALIGN 32
119
-c_ang32_mode_31:    db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
120
-                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
121
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
122
-                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
123
-                    db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
124
-                    db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
125
-                    db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
126
-                    db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
127
-                    db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
128
-                    db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
129
-                    db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
130
-                    db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
131
-                    db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
132
-                    db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
133
-                    db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
134
-                    db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
135
-                    db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
136
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
137
-
138
-
139
-ALIGN 32
140
-c_ang32_mode_32:   db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
141
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
142
-                   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
143
-                   db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
144
-                   db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
145
-                   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
146
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
147
-                   db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
148
-                   db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
149
-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
150
-                   db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
151
-                   db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
152
-                   db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
153
-                   db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
154
-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
155
-                   db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
156
-                   db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
157
-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
158
-                   db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
159
-                   db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
160
-                   db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
161
-                   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
162
-
163
-ALIGN 32
164
-c_ang32_mode_25:   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
165
-                   db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
166
-                   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
167
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
168
-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
169
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
170
-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
171
-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
172
-                   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
173
-                   db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
174
-                   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
175
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
176
-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
177
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
178
-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
179
-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
180
-
181
-ALIGN 32
182
-c_ang32_mode_24:   db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
183
-                   db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
184
-                   db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
185
-                   db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
186
-                   db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
187
-                   db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
188
-                   db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
189
-                   db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
190
-                   db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
191
-                   db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
192
-                   db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
193
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
194
-                   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3
195
-                   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
196
-                   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
197
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
198
-                   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
199
-
200
-
201
-ALIGN 32
202
-c_ang32_mode_23:  db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
203
-                  db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
204
-                  db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
205
-                  db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
206
-                  db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
207
-                  db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
208
-                  db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
209
-                  db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
210
-                  db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
211
-                  db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7
212
-                  db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
213
-                  db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3
214
-                  db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
215
-                  db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
216
-                  db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
217
-                  db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
218
-                  db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
219
-                  db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
220
-
221
-
222
-ALIGN 32
223
-c_ang32_mode_22: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
224
-                 db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
225
-                 db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
226
-                 db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
227
-                 db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
228
-                 db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
229
-                 db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
230
-                 db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
231
-                 db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
232
-                 db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3
233
-                 db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9
234
-                 db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
235
-                 db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
236
-                 db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
237
-                 db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
238
-                 db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
239
-                 db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7
240
-                 db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
241
-                 db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
242
-
243
-ALIGN 32
244
-c_ang32_mode_21: db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
245
-                 db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
246
-                 db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
247
-                 db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9
248
-                 db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7
249
-                 db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
250
-                 db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3
251
-                 db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
252
-                 db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
253
-                 db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
254
-                 db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
255
-                 db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
256
-                 db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
257
-                 db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
258
-                 db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
259
-                 db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
260
-                 db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
261
-
262
-
263
 ALIGN 32
264
 intra_pred_shuff_0_4:    times 4 db 0, 1, 1, 2, 2, 3, 3, 4
265
 intra_pred4_shuff1:      db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5
266
@@ -560,11 +333,139 @@
267
                       db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
268
                       db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
269
 
270
-ALIGN 32
271
-c_ang8_mode_20:       db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
272
-                      db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
273
-                      db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
274
-                      db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
275
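+; 8x8 mode-16 tables (a reading of the data, assuming both feed pshufb):
+; c_ang8_mode_16 appears to gather the projected reference (left samples
+; 8..1 reversed, the corner, then above samples at the inverse-angle
+; positions 9, 10, 12, 13, 15), while intra_pred8_shuff16 gives each row's
+; per-column integer sample index, shifting by one as the angle's integer
+; part accumulates.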
+const c_ang8_mode_16,       db 8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 10, 12, 13, 15, 0, 0
276
+
277
+const intra_pred8_shuff16,  db 0, 1, 1, 2, 3, 3, 4, 5
278
+                            db 1, 2, 2, 3, 4, 4, 5, 6
279
+                            db 2, 3, 3, 4, 5, 5, 6, 7
280
+                            db 3, 4, 4, 5, 6, 6, 7, 8
281
+                            db 4, 5, 5, 6, 7, 7, 8, 9
282
+
283
+const angHor8_tab_16,       db (32-11), 11, (32-22), 22, (32- 1),  1, (32-12), 12, (32-23), 23, (32- 2),  2, (32-13), 13, (32-24), 24
284
+
285
+const c_ang8_mode_20,       db 15, 13, 12, 10, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0
286
+
287
+; NOTE: this big table improves speed by ~10%; if a broadcast instruction that works on the high 128 bits becomes available in the future, we can remove it
288
+const angHor8_tab_20,       times 8 db (32-24), 24
289
+                            times 8 db (32-13), 13
290
+                            times 8 db (32- 2),  2
291
+                            times 8 db (32-23), 23
292
+                            times 8 db (32-12), 12
293
+                            times 8 db (32- 1),  1
294
+                            times 8 db (32-22), 22
295
+                            times 8 db (32-11), 11
296
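+; The (32-f), f byte pairs in these angHor tables are the two taps of the
+; angular interpolation filter: pmaddubsw applies both taps to a packed
+; (ref[i], ref[i+1]) pair and pmulhrsw with pw_1024 does the rounding
+; shift, i.e. per pixel
+;
+;     pred[x] = ((32 - f) * ref[i] + f * ref[i + 1] + 16) >> 5
+;
+; since pmulhrsw by 1024 computes (v * 1024 + 0x4000) >> 15 == (v + 16) >> 5.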
+
297
+const ang16_shuf_mode9,    times 8 db 0, 1
298
+                           times 8 db 1, 2
299
+
300
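+; In the angHor tables below the fraction advances by the mode's
+; intraPredAngle per column, f = (x + 1) * angle mod 32: +2 for mode 9
+; (2, 4, ..., keeping 32 instead of wrapping to 0 so the same base index
+; is reused) and -2 for mode 11 (30, 28, ..., 0).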
+const angHor_tab_9,  db (32-2), 2, (32-4), 4, (32-6), 6, (32-8), 8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16
301
+                     db (32-18), 18, (32-20), 20, (32-22), 22, (32-24),  24, (32-26),  26, (32-28), 28, (32-30), 30, (32-32), 32
302
+
303
+const angHor_tab_11, db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
304
+                     db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8),  8, (32- 6),  6, (32- 4),  4, (32- 2),  2, (32- 0),  0
305
+
306
+const ang16_shuf_mode12,   db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3
307
+                           db 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2
308
+
309
+const angHor_tab_12, db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32-7), 7, (32-2), 2, (32-29), 29, (32-24), 24
310
+                     db (32-19), 19, (32-14), 14, (32-9), 9, (32-4), 4, (32-31), 31, (32-26),  26, (32-21), 21, (32-16), 16
311
+
312
+const ang16_shuf_mode13,   db 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 5, 6, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4
313
+                           db 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2
314
+                           db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0
315
+
316
+const angHor_tab_13, db (32-23), 23, (32-14), 14, (32-5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32-1), 1, (32-24), 24
317
+                     db (32-15), 15, (32-6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32-2), 2, (32-25), 25, (32-16), 16
318
+
319
+const ang16_shuf_mode14,   db 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 3, 4, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 5, 6, 4, 5
320
+                           db 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1, 4, 5, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2
321
+                           db 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0
322
+
323
+const angHor_tab_14, db (32-19), 19, (32-6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32-5), 5, (32-24), 24
324
+                     db (32-11), 11, (32-30), 30, (32-17), 17, (32-4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16
325
+
326
+const ang16_shuf_mode15,   db 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 9, 10, 8, 9, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6
327
+                           db 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2
328
+                           db 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0
329
+
330
+const angHor_tab_15, db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32-9), 9, (32-24), 24
331
+                     db (32-7), 7, (32-22), 22, (32-5), 5, (32-20), 20, (32-3), 3, (32-18), 18, (32-1), 1, (32-16), 16
332
+
333
+const ang16_shuf_mode16,   db 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7
334
+                           db 5, 6, 4, 5, 3, 4, 3, 4, 2, 3, 1, 2, 1, 2, 0, 1, 6, 7, 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 2, 3, 1, 2
335
+                           db 0, 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0
336
+
337
+const angHor_tab_16, db (32-11), 11, (32-22), 22, (32-1), 1, (32-12), 12, (32-23), 23, (32-2), 2, (32-13), 13, (32-24), 24
338
+                     db (32-3), 3, (32-14), 14, (32-25), 25, (32-4), 4, (32-15), 15, (32-26), 26, (32-5), 5, (32-16), 16
339
+
340
+const ang16_shuf_mode17,   db 12, 13, 11, 12, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 13, 14, 12, 13, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8
341
+                           db 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 1, 2, 0, 1, 0, 1, 6, 7, 5, 6, 5, 6, 4, 5, 3, 4, 2, 3, 1, 2, 1, 2
342
+                           db 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0
343
+
344
+const angHor_tab_17, db (32- 6),  6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4),  4, (32-10), 10, (32-16), 16
345
+                     db (32-22), 22, (32-28), 28, (32- 2),  2, (32- 8),  8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0),  0
346
+
347
+; Intrapred_angle32x32: constants for modes 1 to 33
348
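+; Each mode pairs an ang32_fact_* table (the (32-f), f filter weights for
+; all 32 columns) with an ang32_shuf_* table (pshufb indices picking the
+; two source bytes each weight pair applies to); the interleaved shuf
+; tables for modes 24, 23, ..., 19 appear to serve the mirrored vertical
+; modes and to carry the inverse-angle offsets used to extend the
+; reference row.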
+const ang32_shuf_mode9,         times 8 db 0, 1
349
+                                times 8 db 1, 2
350
+
351
+const ang32_shuf_mode11,        times 8 db 1, 2
352
+                                times 8 db 0, 1
353
+
354
+const ang32_fact_mode12,        db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32- 7),  7, (32- 2),  2, (32-29), 29, (32-24), 24
355
+                                db (32-11), 11, (32- 6),  6, (32- 1),  1, (32-28), 28, (32-23), 23, (32-18), 18, (32-13), 13, (32- 8),  8
356
+                                db (32-19), 19, (32-14), 14, (32- 9),  9, (32- 4),  4, (32-31), 31, (32-26), 26, (32-21), 21, (32-16), 16
357
+                                db (32- 3),  3, (32-30), 30, (32-25), 25, (32-20), 20, (32-15), 15, (32-10), 10, (32- 5),  5, (32- 0),  0
358
+const ang32_shuf_mode12,        db  4,  5,  4,  5,  4,  5,  4,  5,  4,  5,  4,  5,  3,  4,  3,  4,  2,  3,  2,  3,  2,  3,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2
359
+                                db  3,  4,  3,  4,  3,  4,  3,  4,  2,  3,  2,  3,  2,  3,  2,  3,  1,  2,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1
360
+const ang32_shuf_mode24,        db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 13, 13,  6,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 10, 10,  3,  3
361
+                                dd  0,  0,  7,  3,  0,  0,  7,  3
362
+
363
+const ang32_fact_mode13,        db (32-23), 23, (32-14), 14, (32- 5),  5, (32-28), 28, (32-19), 19, (32-10), 10, (32- 1),  1, (32-24), 24
364
+                                db (32- 7),  7, (32-30), 30, (32-21), 21, (32-12), 12, (32- 3),  3, (32-26), 26, (32-17), 17, (32- 8),  8
365
+                                db (32-15), 15, (32- 6),  6, (32-29), 29, (32-20), 20, (32-11), 11, (32- 2),  2, (32-25), 25, (32-16), 16
366
+                                db (32-31), 31, (32-22), 22, (32-13), 13, (32- 4),  4, (32-27), 27, (32-18), 18, (32- 9),  9, (32- 0),  0
367
+const ang32_shuf_mode13,        db 14, 15, 14, 15, 14, 15, 13, 14, 13, 14, 13, 14, 13, 14, 12, 13, 10, 11,  9, 10,  9, 10,  9, 10,  9, 10,  8,  9,  8,  9,  8,  9
368
+                                db 12, 13, 12, 13, 11, 12, 11, 12, 11, 12, 11, 12, 10, 11, 10, 11,  7,  8,  7,  8,  7,  8,  7,  8,  6,  7,  6,  7,  6,  7,  6,  7
369
+                                db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14, 11,  7,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 12,  9,  5,  2
370
+const ang32_shuf_mode23,        db  0,  0,  0,  0,  0,  0,  0,  0, 14, 14, 11, 11,  7,  7,  4,  4,  0,  0,  0,  0,  0,  0,  0,  0, 12, 12,  9,  9,  5,  5,  2,  2
371
+
372
+const ang32_fact_mode14,        db (32-19), 19, (32- 6),  6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32- 5),  5, (32-24), 24
373
+                                db (32- 3),  3, (32-22), 22, (32- 9),  9, (32-28), 28, (32-15), 15, (32- 2),  2, (32-21), 21, (32- 8),  8
374
+                                db (32-11), 11, (32-30), 30, (32-17), 17, (32- 4),  4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16
375
+                                db (32-27), 27, (32-14), 14, (32- 1),  1, (32-20), 20, (32- 7),  7, (32-26), 26, (32-13), 13, (32- 0),  0
376
+const ang32_shuf_mode14,        db 14, 15, 14, 15, 13, 14, 13, 14, 12, 13, 12, 13, 12, 13, 11, 12,  8,  9,  7,  8,  7,  8,  6,  7,  6,  7,  6,  7,  5,  6,  5,  6
377
+                                db 11, 12, 10, 11, 10, 11, 10, 11,  9, 10,  9, 10,  8,  9,  8,  9,  4,  5,  4,  5,  4,  5,  3,  4,  3,  4,  2,  3,  2,  3,  2,  3
378
+                                db  0,  0,  0,  0,  0,  0,  0,  0, 15, 12, 10,  7,  5,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 14, 11,  9,  6,  4,  1
379
+const ang32_shuf_mode22,        db  0,  0, 15, 15, 13, 13, 10, 10,  8,  8,  5,  5,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0, 12, 12,  9,  9,  7,  7,  4,  4,  2
380
+
381
+const ang32_fact_mode15,        db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32- 9),  9, (32-24), 24
382
+                                db (32-31), 31, (32-14), 14, (32-29), 29, (32-12), 12, (32-27), 27, (32-10), 10, (32-25), 25, (32- 8),  8
383
+                                db (32- 7),  7, (32-22), 22, (32- 5),  5, (32-20), 20, (32- 3),  3, (32-18), 18, (32- 1),  1, (32-16), 16
384
+                                db (32-23), 23, (32- 6),  6, (32-21), 21, (32- 4),  4, (32-19), 19, (32- 2),  2, (32-17), 17, (32- 0),  0
385
+const ang32_shuf_mode15,        db 14, 15, 13, 14, 13, 14, 12, 13, 12, 13, 11, 12, 11, 12, 10, 11,  5,  6,  5,  6,  4,  5,  4,  5,  3,  4,  3,  4,  2,  3,  2,  3
386
+                                db 12, 13, 11, 12, 11, 12, 10, 11, 10, 11,  9, 10,  9, 10,  8,  9,  3,  4,  3,  4,  2,  3,  2,  3,  1,  2,  1,  2,  0,  1,  0,  1
387
+                                db  0,  0,  0,  0,  0,  0,  0,  0, 15, 13, 11,  9,  8,  6,  4,  2,  0,  0,  0,  0,  0,  0,  0,  0, 14, 12, 10,  8,  7,  5,  3,  1
388
+const ang32_shuf_mode21,        db 15, 15, 13, 13, 11, 11,  9,  9,  8,  8,  6,  6,  4,  4,  2,  2, 14, 14, 12, 12, 10, 10,  8,  8,  7,  7,  5,  5,  3,  3,  1,  1
389
+
390
+const ang32_fact_mode16,        db (32-11), 11, (32-22), 22, (32- 1),  1, (32-12), 12, (32-23), 23, (32- 2),  2, (32-13), 13, (32-24), 24
391
+                                db (32- 3),  3, (32-14), 14, (32-25), 25, (32- 4),  4, (32-15), 15, (32-26), 26, (32- 5),  5, (32-16), 16
392
+                                db (32-27), 27, (32- 6),  6, (32-17), 17, (32-28), 28, (32- 7),  7, (32-18), 18, (32-29), 29, (32- 8),  8
393
+                                db (32-19), 19, (32-30), 30, (32- 9),  9, (32-20), 20, (32-31), 31, (32-10), 10, (32-21), 21, (32- 0),  0
394
+const ang32_shuf_mode16,        db 14, 15, 13, 14, 13, 14, 12, 13, 11, 12, 11, 12, 10, 11,  9, 10,  9, 10,  8,  9,  7,  8,  7,  8,  6,  7,  5,  6,  5,  6,  4,  5
395
+                                db 14, 15, 14, 15, 13, 14, 12, 13, 12, 13, 11, 12, 10, 11, 10, 11,  9, 10,  8,  9,  8,  9,  7,  8,  6,  7,  6,  7,  5,  6,  5,  6
396
+                                db  0,  0,  0,  0, 15, 14, 12, 11,  9,  8,  6,  5,  3,  2,  0,  0,  0,  0,  0,  0,  0,  0, 14, 13, 11, 10,  8,  7,  5,  4,  2,  1
397
+                                dd  7,  1,  2,  3,  7,  1,  2,  3
398
+const ang32_shuf_mode20,        db 12, 11,  9,  8,  6,  5,  3,  2,  0,  0,  0,  0,  0,  0, 14, 15,  8,  7,  5,  4,  2,  1,  0,  0, 14, 13, 13, 11, 11, 10, 10,  8
399
+                                db  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  1,  1,  0,  0
400
+
401
+const ang32_fact_mode17,        db (32- 6),  6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4),  4, (32-10), 10, (32-16), 16
402
+                                db (32-22), 22, (32-28), 28, (32- 2),  2, (32- 8),  8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0),  0
403
+const ang32_shuf_mode17,        db 14, 15, 13, 14, 12, 13, 11, 12, 10, 11, 10, 11,  9, 10,  8,  9,  7,  8,  6,  7,  6,  7,  5,  6,  4,  5,  3,  4,  2,  3,  2,  3
404
+                                db  0,  0,  0,  0, 15, 14, 12, 11, 10,  9,  7,  6,  5,  4,  2,  1,  0,  0,  0, 15, 14, 12, 11, 10,  9,  7,  6,  5,  4,  2,  1,  0
405
+const ang32_shuf_mode19,        db  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15
406
+                                dd  0,  0,  2,  3,  0,  0,  7,  1
407
+                                dd  0,  0,  5,  6,  0,  0,  0,  0
408
 
409
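+; ang_table holds one row of repeated (32-x), x weight pairs per fraction
+; x = 0..31 (ang_table_avx2, used by the 32x32 code, evidently follows the
+; same layout with 32-byte rows: r3 is pointed at ang_table_avx2 + 32 * 16,
+; so [r3 + k * 32] is the row for fraction 16 + k, matching the [21], [10],
+; [31], ... comments in the mode functions).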
 const ang_table
410
 %assign x 0
411
@@ -588,6 +489,7 @@
412
 %endrep
413
 
414
 SECTION .text
415
+cextern pb_1
416
 cextern pw_2
417
 cextern pw_3
418
 cextern pw_4
419
@@ -11966,6 +11868,6711 @@
420
 
421
     call ang32_mode_3_33_row_0_15
422
     RET
423
+
424
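+; Shared pattern of the ang32_mode_* row helpers below: load the reference,
+; interleave neighbouring bytes with punpck{l,h}bw so every word holds a
+; (ref[i], ref[i+1]) pair, then for each output row select the weight row
+; [r3 + k * 32] (fraction 16 + k), filter with pmaddubsw + pmulhrsw(pw_1024)
+; and pack back to bytes; TRANSPOSE_32x8_AVX2 stores each batch of 8 rows.
+; The test r7d, r7d at entry sets ZF, which the SIMD-only body never
+; clobbers, presumably so the store macro can branch between the transposed
+; write needed for the horizontal mode (r7d = 0) and the direct write for
+; its vertical mirror (r7d = 1).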
+cglobal ang32_mode_4_32_row_0_15
425
+    test        r7d,        r7d
426
+    ; rows 0 to 7
427
+    movu        m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
428
+    movu        m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
429
+    movu        m3,         [r2 + 17]           ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
430
+    movu        m4,         [r2 + 18]           ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
431
+
432
+    punpckhbw   m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
433
+    punpcklbw   m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
434
+    punpcklbw   m3,         m4                  ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
435
+
436
+    pmaddubsw   m4,         m0, [r3 + 5 * 32]   ; [21]
437
+    pmulhrsw    m4,         m7
438
+    pmaddubsw   m1,         m2, [r3 + 5 * 32]
439
+    pmulhrsw    m1,         m7
440
+    packuswb    m4,         m1
441
+
442
+    palignr     m6,         m2, m0, 2
443
+    palignr     m1,         m3, m2, 2
444
+    pmaddubsw   m5,         m6, [r3 - 6 * 32]   ; [10]
445
+    pmulhrsw    m5,         m7
446
+    pmaddubsw   m8,         m1, [r3 - 6 * 32]
447
+    pmulhrsw    m8,         m7
448
+    packuswb    m5,         m8
449
+
450
+    pmaddubsw   m6,         [r3 + 15 * 32]      ; [31]
451
+    pmulhrsw    m6,         m7
452
+    pmaddubsw   m1,         [r3 + 15 * 32]
453
+    pmulhrsw    m1,         m7
454
+    packuswb    m6,         m1
455
+
456
+    palignr     m8,         m2, m0, 4
457
+    palignr     m1,         m3, m2, 4
458
+    pmaddubsw   m8,         [r3 + 4 * 32]       ; [20]
459
+    pmulhrsw    m8,         m7
460
+    pmaddubsw   m1,         [r3 + 4 * 32]
461
+    pmulhrsw    m1,         m7
462
+    packuswb    m8,         m1
463
+
464
+    palignr     m10,        m2, m0, 6
465
+    palignr     m11,        m3, m2, 6
466
+    pmaddubsw   m9,         m10, [r3 - 7 * 32]  ; [9]
467
+    pmulhrsw    m9,         m7
468
+    pmaddubsw   m1,         m11, [r3 - 7 * 32]
469
+    pmulhrsw    m1,         m7
470
+    packuswb    m9,         m1
471
+
472
+    pmaddubsw   m10,        [r3 + 14 * 32]      ; [30]
473
+    pmulhrsw    m10,        m7
474
+    pmaddubsw   m11,        [r3 + 14 * 32]
475
+    pmulhrsw    m11,        m7
476
+    packuswb    m10,        m11
477
+
478
+    palignr     m11,        m2, m0, 8
479
+    palignr     m1,         m3, m2, 8
480
+    pmaddubsw   m11,        [r3 + 3 * 32]       ; [19]
481
+    pmulhrsw    m11,        m7
482
+    pmaddubsw   m1,         [r3 + 3 * 32]
483
+    pmulhrsw    m1,         m7
484
+    packuswb    m11,        m1
485
+
486
+    palignr     m12,        m2, m0, 10
487
+    palignr     m1,         m3, m2, 10
488
+    pmaddubsw   m12,        [r3 - 8 * 32]       ; [8]
489
+    pmulhrsw    m12,        m7
490
+    pmaddubsw   m1,         [r3 - 8 * 32]
491
+    pmulhrsw    m1,         m7
492
+    packuswb    m12,        m1
493
+
494
+    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
495
+
496
+    ; rows 8 to 15
497
+    palignr     m4,         m2, m0, 10
498
+    palignr     m1,         m3, m2, 10
499
+    pmaddubsw   m4,         [r3 + 13 * 32]      ; [29]
500
+    pmulhrsw    m4,         m7
501
+    pmaddubsw   m1,         [r3 + 13 * 32]
502
+    pmulhrsw    m1,         m7
503
+    packuswb    m4,         m1
504
+
505
+    palignr     m5,         m2, m0, 12
506
+    palignr     m1,         m3, m2, 12
507
+    pmaddubsw   m5,         [r3 + 2 * 32]       ; [18]
508
+    pmulhrsw    m5,         m7
509
+    pmaddubsw   m1,         [r3 + 2 * 32]
510
+    pmulhrsw    m1,         m7
511
+    packuswb    m5,         m1
512
+
513
+    palignr     m8,         m2, m0, 14
514
+    palignr     m1,         m3, m2, 14
515
+    pmaddubsw   m6,         m8, [r3 - 9 * 32]   ; [7]
516
+    pmulhrsw    m6,         m7
517
+    pmaddubsw   m9,         m1, [r3 - 9 * 32]
518
+    pmulhrsw    m9,         m7
519
+    packuswb    m6,         m9
520
+
521
+    pmaddubsw   m8,         [r3 + 12 * 32]      ; [28]
522
+    pmulhrsw    m8,         m7
523
+    pmaddubsw   m1,         [r3 + 12 * 32]
524
+    pmulhrsw    m1,         m7
525
+    packuswb    m8,         m1
526
+
527
+    pmaddubsw   m9,         m2, [r3 + 1 * 32]   ; [17]
528
+    pmulhrsw    m9,         m7
529
+    pmaddubsw   m1,         m3, [r3 + 1 * 32]
530
+    pmulhrsw    m1,         m7
531
+    packuswb    m9,         m1
532
+
533
+    movu        m0,         [r2 + 25]
534
+    movu        m1,         [r2 + 26]
535
+    punpcklbw   m0,         m1
536
+
537
+    palignr     m11,        m3, m2, 2
538
+    palignr     m1,         m0, m3, 2
539
+    pmaddubsw   m10,        m11, [r3 - 10 * 32] ; [6]
540
+    pmulhrsw    m10,        m7
541
+    pmaddubsw   m12,        m1, [r3 - 10 * 32]
542
+    pmulhrsw    m12,        m7
543
+    packuswb    m10,        m12
544
+
545
+    pmaddubsw   m11,        [r3 + 11 * 32]      ; [27]
546
+    pmulhrsw    m11,        m7
547
+    pmaddubsw   m1,         [r3 + 11 * 32]
548
+    pmulhrsw    m1,         m7
549
+    packuswb    m11,        m1
550
+
551
+    palignr     m0,         m3, 4
552
+    palignr     m3,         m2, 4
553
+    pmaddubsw   m3,         [r3]                ; [16]
554
+    pmulhrsw    m3,         m7
555
+    pmaddubsw   m0,         [r3]
556
+    pmulhrsw    m0,         m7
557
+    packuswb    m3,         m0
558
+
559
+    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 3, 0, 8
560
+    ret
561
+
562
+cglobal ang32_mode_4_32_row_16_31
563
+    test        r7d,        r7d
564
+    ; rows 0 to 7
565
+    movu        m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
566
+    movu        m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
567
+    movu        m3,         [r2 + 17]           ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
568
+    movu        m4,         [r2 + 18]           ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
569
+
570
+    punpckhbw   m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
571
+    punpcklbw   m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
572
+    punpcklbw   m3,         m4                  ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
573
+
574
+    pmaddubsw   m4,         m0, [r3 - 11 * 32]  ; [5]
575
+    pmulhrsw    m4,         m7
576
+    pmaddubsw   m1,         m2, [r3 - 11 * 32]
577
+    pmulhrsw    m1,         m7
578
+    packuswb    m4,         m1
579
+
580
+    pmaddubsw   m5,         m0, [r3 + 10 * 32]  ; [26]
581
+    pmulhrsw    m5,         m7
582
+    pmaddubsw   m1,         m2, [r3 + 10 * 32]
583
+    pmulhrsw    m1,         m7
584
+    packuswb    m5,         m1
585
+
586
+    palignr     m6,         m2, m0, 2
587
+    palignr     m1,         m3, m2, 2
588
+    pmaddubsw   m6,         [r3 - 1 * 32]       ; [15]
589
+    pmulhrsw    m6,         m7
590
+    pmaddubsw   m1,         [r3 - 1 * 32]
591
+    pmulhrsw    m1,         m7
592
+    packuswb    m6,         m1
593
+
594
+    palignr     m9,         m2, m0, 4
595
+    palignr     m10,        m3, m2, 4
596
+    pmaddubsw   m8,         m9, [r3 - 12 * 32]  ; [4]
597
+    pmulhrsw    m8,         m7
598
+    pmaddubsw   m1,         m10, [r3 - 12 * 32]
599
+    pmulhrsw    m1,         m7
600
+    packuswb    m8,         m1
601
+
602
+    pmaddubsw   m9,         [r3 + 9 * 32]       ; [25]
603
+    pmulhrsw    m9,         m7
604
+    pmaddubsw   m10,        [r3 + 9 * 32]
605
+    pmulhrsw    m10,        m7
606
+    packuswb    m9,         m10
607
+
608
+    palignr     m10,         m2, m0, 6
609
+    palignr     m11,        m3, m2, 6
610
+    pmaddubsw   m10,        [r3 - 2 * 32]       ; [14]
611
+    pmulhrsw    m10,        m7
612
+    pmaddubsw   m11,        [r3 - 2 * 32]
613
+    pmulhrsw    m11,        m7
614
+    packuswb    m10,        m11
615
+
616
+    palignr     m12,        m2, m0, 8
617
+    palignr     m1,         m3, m2, 8
618
+    pmaddubsw   m11,        m12, [r3 - 13 * 32] ; [3]
619
+    pmulhrsw    m11,        m7
620
+    pmaddubsw   m1,         [r3 - 13 * 32]
621
+    pmulhrsw    m1,         m7
622
+    packuswb    m11,        m1
623
+
624
+    palignr     m1,         m3, m2, 8
625
+    pmaddubsw   m12,        [r3 + 8 * 32]       ; [24]
626
+    pmulhrsw    m12,        m7
627
+    pmaddubsw   m1,         [r3 + 8 * 32]
628
+    pmulhrsw    m1,         m7
629
+    packuswb    m12,        m1
630
+
631
+    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
632
+
633
+    ; rows 8 to 15
634
+    palignr     m4,         m2, m0, 10
635
+    palignr     m1,         m3, m2, 10
636
+    pmaddubsw   m4,         [r3 - 3 * 32]       ; [13]
637
+    pmulhrsw    m4,         m7
638
+    pmaddubsw   m1,         [r3 - 3 * 32]
639
+    pmulhrsw    m1,         m7
640
+    packuswb    m4,         m1
641
+
642
+    palignr     m6,         m2, m0, 12
643
+    palignr     m8,         m3, m2, 12
644
+    pmaddubsw   m5,         m6, [r3 - 14 * 32]  ; [2]
645
+    pmulhrsw    m5,         m7
646
+    pmaddubsw   m1,         m8, [r3 - 14 * 32]
647
+    pmulhrsw    m1,         m7
648
+    packuswb    m5,         m1
649
+
650
+    pmaddubsw   m6,         [r3 + 7 * 32]       ; [23]
651
+    pmulhrsw    m6,         m7
652
+    pmaddubsw   m8,         [r3 + 7 * 32]
653
+    pmulhrsw    m8,         m7
654
+    packuswb    m6,         m8
655
+
656
+    palignr     m8,         m2, m0, 14
657
+    palignr     m1,         m3, m2, 14
658
+    pmaddubsw   m8,         [r3 - 4 * 32]       ; [12]
659
+    pmulhrsw    m8,         m7
660
+    pmaddubsw   m1,         [r3 - 4 * 32]
661
+    pmulhrsw    m1,         m7
662
+    packuswb    m8,         m1
663
+
664
+    pmaddubsw   m9,         m2, [r3 - 15 * 32]  ; [1]
665
+    pmulhrsw    m9,         m7
666
+    pmaddubsw   m1,         m3, [r3 - 15 * 32]
667
+    pmulhrsw    m1,         m7
668
+    packuswb    m9,         m1
669
+
670
+    pmaddubsw   m10,        m2, [r3 + 6 * 32]   ; [22]
671
+    pmulhrsw    m10,        m7
672
+    pmaddubsw   m1,         m3, [r3 + 6 * 32]
673
+    pmulhrsw    m1,         m7
674
+    packuswb    m10,        m1
675
+
676
+    movu        m0,         [r2 + 25]
677
+    movu        m1,         [r2 + 26]
678
+    punpcklbw   m0,         m1
679
+
680
+    palignr     m11,        m3, m2, 2
681
+    palignr     m1,         m0, m3, 2
682
+    pmaddubsw   m11,        [r3 - 5 * 32]       ; [11]
683
+    pmulhrsw    m11,        m7
684
+    pmaddubsw   m1,         [r3 - 5 * 32]
685
+    pmulhrsw    m1,         m7
686
+    packuswb    m11,        m1
687
+
688
+    movu        m12,        [r2 + 11]
689
+
690
+    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 8
691
+    ret
692
+
693
+INIT_YMM avx2
694
+cglobal intra_pred_ang32_4, 3,8,13
695
+    add         r2, 64
696
+    lea         r3, [ang_table_avx2 + 32 * 16]
697
+    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
698
+    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
699
+    mova        m7, [pw_1024]
700
+    mov         r4, r0
701
+    xor         r7d, r7d
702
+
703
+    call ang32_mode_4_32_row_0_15
704
+
705
+    add         r4, 16
706
+    mov         r0, r4
707
+    add         r2, 11
708
+
709
+    call ang32_mode_4_32_row_16_31
710
+    RET
711
+
712
+INIT_YMM avx2
713
+cglobal intra_pred_ang32_32, 3,8,13
714
+    lea         r3, [ang_table_avx2 + 32 * 16]
715
+    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
716
+    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
717
+    mova        m7, [pw_1024]
718
+    xor         r7d, r7d
719
+    inc         r7d
720
+
721
+    call ang32_mode_4_32_row_0_15
722
+
723
+    add         r2, 11
724
+
725
+    call ang32_mode_4_32_row_16_31
726
+    RET
727
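+; Only the setup differs between the two wrappers: intra_pred_ang32_4 is
+; the horizontal mode, so it starts from the left neighbours (add r2, 64
+; appears to select the left half of the reference buffer) and moves the
+; output pointer on by 16 columns via r4, while intra_pred_ang32_32
+; predicts from the top row in place.  The add r2, 11 between the halves is
+; the integer pel offset at row 16 for intraPredAngle 21
+; ((17 * 21) >> 5 = 11), leaving fraction 357 & 31 = 5; hence row_16_31
+; opens with weight row [5].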
+
728
+cglobal ang32_mode_5_31_row_0_15
729
+    test        r7d,        r7d
730
+    ; rows 0 to 7
731
+    movu        m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
732
+    movu        m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
733
+    movu        m3,         [r2 + 17]           ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
734
+    movu        m4,         [r2 + 18]           ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
735
+
736
+    punpckhbw   m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
737
+    punpcklbw   m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
738
+    punpcklbw   m3,         m4                  ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
739
+
740
+    pmaddubsw   m4,         m0, [r3 + 1 * 32]   ; [17]
741
+    pmulhrsw    m4,         m7
742
+    pmaddubsw   m1,         m2, [r3 + 1 * 32]
743
+    pmulhrsw    m1,         m7
744
+    packuswb    m4,         m1
745
+
746
+    palignr     m6,         m2, m0, 2
747
+    palignr     m1,         m3, m2, 2
748
+    pmaddubsw   m5,         m6, [r3 - 14 * 32]  ; [2]
749
+    pmulhrsw    m5,         m7
750
+    pmaddubsw   m8,         m1, [r3 - 14 * 32]
751
+    pmulhrsw    m8,         m7
752
+    packuswb    m5,         m8
753
+
754
+    pmaddubsw   m6,         [r3 + 3 * 32]       ; [19]
755
+    pmulhrsw    m6,         m7
756
+    pmaddubsw   m1,         [r3 + 3 * 32]
757
+    pmulhrsw    m1,         m7
758
+    packuswb    m6,         m1
759
+
760
+    palignr     m9,         m2, m0, 4
761
+    palignr     m10,        m3, m2, 4
762
+    pmaddubsw   m8,         m9, [r3 - 12 * 32]  ; [4]
763
+    pmulhrsw    m8,         m7
764
+    pmaddubsw   m1,         m10, [r3 - 12 * 32]
765
+    pmulhrsw    m1,         m7
766
+    packuswb    m8,         m1
767
+
768
+    pmaddubsw   m9,         [r3 + 5 * 32]       ; [21]
769
+    pmulhrsw    m9,         m7
770
+    pmaddubsw   m10,        [r3 + 5 * 32]
771
+    pmulhrsw    m10,        m7
772
+    packuswb    m9,         m10
773
+
774
+    palignr     m11,        m2, m0, 6
775
+    palignr     m12,        m3, m2, 6
776
+    pmaddubsw   m10,        m11, [r3 - 10 * 32] ; [6]
777
+    pmulhrsw    m10,        m7
778
+    pmaddubsw   m1,         m12, [r3 - 10 * 32]
779
+    pmulhrsw    m1,         m7
780
+    packuswb    m10,        m1
781
+
782
+    pmaddubsw   m11,        [r3 + 7 * 32]       ; [23]
783
+    pmulhrsw    m11,        m7
784
+    pmaddubsw   m12,        [r3 + 7 * 32]
785
+    pmulhrsw    m12,        m7
786
+    packuswb    m11,        m12
787
+
788
+    palignr     m12,        m2, m0, 8
789
+    palignr     m1,         m3, m2, 8
790
+    pmaddubsw   m12,        [r3 - 8 * 32]       ; [8]
791
+    pmulhrsw    m12,        m7
792
+    pmaddubsw   m1,         [r3 - 8 * 32]
793
+    pmulhrsw    m1,         m7
794
+    packuswb    m12,        m1
795
+
796
+    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
797
+
798
+    ; rows 8 to 15
799
+    palignr     m4,         m2, m0, 8
800
+    palignr     m1,         m3, m2, 8
801
+    pmaddubsw   m4,         [r3 + 9 * 32]       ; [25]
802
+    pmulhrsw    m4,         m7
803
+    pmaddubsw   m1,         [r3 + 9 * 32]
804
+    pmulhrsw    m1,         m7
805
+    packuswb    m4,         m1
806
+
807
+    palignr     m6,         m2, m0, 10
808
+    palignr     m1,         m3, m2, 10
809
+    pmaddubsw   m5,         m6, [r3 - 6 * 32]   ; [10]
810
+    pmulhrsw    m5,         m7
811
+    pmaddubsw   m8,         m1, [r3 - 6 * 32]
812
+    pmulhrsw    m8,         m7
813
+    packuswb    m5,         m8
814
+
815
+    pmaddubsw   m6,         [r3 + 11 * 32]      ; [27]
816
+    pmulhrsw    m6,         m7
817
+    pmaddubsw   m1,         [r3 + 11 * 32]
818
+    pmulhrsw    m1,         m7
819
+    packuswb    m6,         m1
820
+
821
+    palignr     m9,         m2, m0, 12
822
+    palignr     m1,         m3, m2, 12
823
+    pmaddubsw   m8,         m9, [r3 - 4 * 32]   ; [12]
824
+    pmulhrsw    m8,         m7
825
+    pmaddubsw   m10,        m1, [r3 - 4 * 32]
826
+    pmulhrsw    m10,        m7
827
+    packuswb    m8,         m10
828
+
829
+    pmaddubsw   m9,         [r3 + 13 * 32]      ; [29]
830
+    pmulhrsw    m9,         m7
831
+    pmaddubsw   m1,         [r3 + 13 * 32]
832
+    pmulhrsw    m1,         m7
833
+    packuswb    m9,         m1
834
+
835
+    palignr     m11,        m2, m0, 14
836
+    palignr     m1,         m3, m2, 14
837
+    pmaddubsw   m10,        m11, [r3 - 2 * 32]  ; [14]
838
+    pmulhrsw    m10,        m7
839
+    pmaddubsw   m12,        m1, [r3 - 2 * 32]
840
+    pmulhrsw    m12,        m7
841
+    packuswb    m10,        m12
842
+
843
+    pmaddubsw   m11,        [r3 + 15 * 32]      ; [31]
844
+    pmulhrsw    m11,        m7
845
+    pmaddubsw   m1,         [r3 + 15 * 32]
846
+    pmulhrsw    m1,         m7
847
+    packuswb    m11,        m1
848
+
849
+    pmaddubsw   m2,         [r3]                ; [16]
850
+    pmulhrsw    m2,         m7
851
+    pmaddubsw   m3,         [r3]
852
+    pmulhrsw    m3,         m7
853
+    packuswb    m2,         m3
854
+
855
+    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
856
+    ret
857
+
858
+cglobal ang32_mode_5_31_row_16_31
859
+    test        r7d,        r7d
860
+    ; rows 0 to 7
861
+    movu        m0,         [r2 +  1]               ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
862
+    movu        m1,         [r2 +  2]               ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
863
+    movu        m3,         [r2 + 17]               ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
864
+    movu        m4,         [r2 + 18]               ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
865
+
866
+    punpckhbw   m2,         m0, m1                  ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
867
+    punpcklbw   m0,         m1                      ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
868
+    punpcklbw   m3,         m4                      ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
869
+
870
+    pmaddubsw   m4,         m0, [r3 - 15 * 32]      ; [1]
871
+    pmulhrsw    m4,         m7
872
+    pmaddubsw   m1,         m2, [r3 - 15 * 32]
873
+    pmulhrsw    m1,         m7
874
+    packuswb    m4,         m1
875
+
876
+    pmaddubsw   m5,         m0, [r3 + 2 * 32]       ; [18]
877
+    pmulhrsw    m5,         m7
878
+    pmaddubsw   m8,         m2, [r3 + 2 * 32]
879
+    pmulhrsw    m8,         m7
880
+    packuswb    m5,         m8
881
+
882
+    palignr     m8,         m2, m0, 2
883
+    palignr     m9,         m3, m2, 2
884
+    pmaddubsw   m6,         m8, [r3 - 13 * 32]      ; [3]
885
+    pmulhrsw    m6,         m7
886
+    pmaddubsw   m1,         m9, [r3 - 13 * 32]
887
+    pmulhrsw    m1,         m7
888
+    packuswb    m6,         m1
889
+
890
+    pmaddubsw   m8,         [r3 + 4 * 32]           ; [20]
891
+    pmulhrsw    m8,         m7
892
+    pmaddubsw   m9,         [r3 + 4 * 32]
893
+    pmulhrsw    m9,         m7
894
+    packuswb    m8,         m9
895
+
896
+    palignr     m10,        m2, m0, 4
897
+    palignr     m1,         m3, m2, 4
898
+    pmaddubsw   m9,         m10, [r3 - 11 * 32]     ; [5]
899
+    pmulhrsw    m9,         m7
900
+    pmaddubsw   m11,        m1, [r3 - 11 * 32]
901
+    pmulhrsw    m11,        m7
902
+    packuswb    m9,         m11
903
+
904
+    pmaddubsw   m10,        [r3 + 6 * 32]           ; [22]
905
+    pmulhrsw    m10,        m7
906
+    pmaddubsw   m1,         [r3 + 6 * 32]
907
+    pmulhrsw    m1,         m7
908
+    packuswb    m10,        m1
909
+
910
+    palignr     m12,        m2, m0, 6
911
+    palignr     m1,         m3, m2, 6
912
+    pmaddubsw   m11,        m12, [r3 - 9 * 32]      ; [7]
913
+    pmulhrsw    m11,        m7
914
+    pmaddubsw   m1,         [r3 - 9 * 32]
915
+    pmulhrsw    m1,         m7
916
+    packuswb    m11,        m1
917
+
918
+    palignr     m1,         m3, m2, 6
919
+    pmaddubsw   m12,        [r3 + 8 * 32]           ; [24]
920
+    pmulhrsw    m12,        m7
921
+    pmaddubsw   m1,         [r3 + 8 * 32]
922
+    pmulhrsw    m1,         m7
923
+    packuswb    m12,        m1
924
+
925
+    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
926
+
927
+    ; rows 8 to 15
928
+    palignr     m5,         m2, m0, 8
929
+    palignr     m8,         m3, m2, 8
930
+    pmaddubsw   m4,         m5, [r3 - 7 * 32]       ; [9]
931
+    pmulhrsw    m4,         m7
932
+    pmaddubsw   m1,         m8, [r3 - 7 * 32]
933
+    pmulhrsw    m1,         m7
934
+    packuswb    m4,         m1
935
+
936
+    pmaddubsw   m5,         [r3 + 10 * 32]          ; [26]
937
+    pmulhrsw    m5,         m7
938
+    pmaddubsw   m8,         [r3 + 10 * 32]
939
+    pmulhrsw    m8,         m7
940
+    packuswb    m5,         m8
941
+
942
+    palignr     m8,         m2, m0, 10
943
+    palignr     m9,         m3, m2, 10
944
+    pmaddubsw   m6,         m8, [r3 - 5 * 32]       ; [11]
945
+    pmulhrsw    m6,         m7
946
+    pmaddubsw   m1,         m9, [r3 - 5 * 32]
947
+    pmulhrsw    m1,         m7
948
+    packuswb    m6,         m1
949
+
950
+    pmaddubsw   m8,         [r3 + 12 * 32]          ; [28]
951
+    pmulhrsw    m8,         m7
952
+    pmaddubsw   m9,         [r3 + 12 * 32]
953
+    pmulhrsw    m9,         m7
954
+    packuswb    m8,         m9
955
+
956
+    palignr     m10,        m2, m0, 12
957
+    palignr     m11,        m3, m2, 12
958
+    pmaddubsw   m9,         m10, [r3 - 3 * 32]      ; [13]
959
+    pmulhrsw    m9,         m7
960
+    pmaddubsw   m1,         m11, [r3 - 3 * 32]
961
+    pmulhrsw    m1,         m7
962
+    packuswb    m9,         m1
963
+
964
+    pmaddubsw   m10,        [r3 + 14 * 32]          ; [30]
965
+    pmulhrsw    m10,        m7
966
+    pmaddubsw   m11,        [r3 + 14 * 32]
967
+    pmulhrsw    m11,        m7
968
+    packuswb    m10,        m11
969
+
970
+    palignr     m11,        m2, m0, 14
971
+    palignr     m1,         m3, m2, 14
972
+    pmaddubsw   m11,        [r3 - 1 * 32]           ; [15]
973
+    pmulhrsw    m11,        m7
974
+    pmaddubsw   m1,         [r3 - 1 * 32]
975
+    pmulhrsw    m1,         m7
976
+    packuswb    m11,        m1
977
+
978
+    movu        m2,         [r2 + 9]
979
+    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
980
+    ret
981
+
982
+INIT_YMM avx2
983
+cglobal intra_pred_ang32_5, 3,8,13
984
+    add         r2, 64
985
+    lea         r3, [ang_table_avx2 + 32 * 16]
986
+    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
987
+    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
988
+    mova        m7, [pw_1024]
989
+    mov         r4, r0
990
+    xor         r7d, r7d
991
+
992
+    call ang32_mode_5_31_row_0_15
993
+
994
+    add         r4, 16
995
+    mov         r0, r4
996
+    add         r2, 9
997
+
998
+    call ang32_mode_5_31_row_16_31
999
+    RET
1000
+
1001
+INIT_YMM avx2
1002
+cglobal intra_pred_ang32_31, 3,8,13
1003
+    lea         r3, [ang_table_avx2 + 32 * 16]
1004
+    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
1005
+    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
1006
+    mova        m7, [pw_1024]
1007
+    xor         r7d, r7d
1008
+    inc         r7d
1009
+
1010
+    call ang32_mode_5_31_row_0_15
1011
+
1012
+    add         r2, 9
1013
+
1014
+    call ang32_mode_5_31_row_16_31
1015
+    RET
1016
+
1017
+cglobal ang32_mode_6_30_row_0_15
1018
+    test        r7d,        r7d
1019
+    ; rows 0 to 7
1020
+    movu        m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
1021
+    movu        m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
1022
+    movu        m3,         [r2 + 17]           ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
1023
+    movu        m4,         [r2 + 18]           ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
1024
+
1025
+    punpckhbw   m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
1026
+    punpcklbw   m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
1027
+    punpcklbw   m3,         m4                  ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
1028
+
1029
+    pmaddubsw   m4,         m0, [r3 - 3 * 32]   ; [13]
1030
+    pmulhrsw    m4,         m7
1031
+    pmaddubsw   m1,         m2, [r3 - 3 * 32]
1032
+    pmulhrsw    m1,         m7
1033
+    packuswb    m4,         m1
1034
+
1035
+    pmaddubsw   m5,         m0, [r3 + 10 * 32]  ; [26]
1036
+    pmulhrsw    m5,         m7
1037
+    pmaddubsw   m8,         m2, [r3 + 10 * 32]
1038
+    pmulhrsw    m8,         m7
1039
+    packuswb    m5,         m8
1040
+
1041
+    palignr     m8,         m2, m0, 2
1042
+    palignr     m1,         m3, m2, 2
1043
+    pmaddubsw   m6,         m8, [r3 - 9 * 32]   ; [7]
1044
+    pmulhrsw    m6,         m7
1045
+    pmaddubsw   m9,         m1, [r3 - 9 * 32]
1046
+    pmulhrsw    m9,         m7
1047
+    packuswb    m6,         m9
1048
+
1049
+    pmaddubsw   m8,         [r3 + 4 * 32]       ; [20]
1050
+    pmulhrsw    m8,         m7
1051
+    pmaddubsw   m1,         [r3 + 4 * 32]
1052
+    pmulhrsw    m1,         m7
1053
+    packuswb    m8,         m1
1054
+
1055
+    palignr     m11,        m2, m0, 4
1056
+    palignr     m1,         m3, m2, 4
1057
+    pmaddubsw   m9,         m11, [r3 - 15 * 32] ; [1]
1058
+    pmulhrsw    m9,         m7
1059
+    pmaddubsw   m12,        m1, [r3 - 15 * 32]
1060
+    pmulhrsw    m12,        m7
1061
+    packuswb    m9,         m12
1062
+
1063
+    pmaddubsw   m10,        m11, [r3 - 2 * 32]  ; [14]
1064
+    pmulhrsw    m10,        m7
1065
+    pmaddubsw   m12,        m1, [r3 - 2 * 32]
1066
+    pmulhrsw    m12,        m7
1067
+    packuswb    m10,        m12
1068
+
1069
+    pmaddubsw   m11,        [r3 + 11 * 32]      ; [27]
1070
+    pmulhrsw    m11,        m7
1071
+    pmaddubsw   m1,         [r3 + 11 * 32]
1072
+    pmulhrsw    m1,         m7
1073
+    packuswb    m11,        m1
1074
+
1075
+    palignr     m12,        m2, m0, 6
1076
+    palignr     m1,         m3, m2, 6
1077
+    pmaddubsw   m12,        [r3 - 8 * 32]       ; [8]
1078
+    pmulhrsw    m12,        m7
1079
+    pmaddubsw   m1,         [r3 - 8 * 32]
1080
+    pmulhrsw    m1,         m7
1081
+    packuswb    m12,        m1
1082
+
1083
+    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
1084
+
1085
+    ; rows 8 to 15
1086
+    palignr     m4,         m2, m0, 6
1087
+    palignr     m1,         m3, m2, 6
1088
+    pmaddubsw   m4,         [r3 + 5 * 32]       ; [21]
1089
+    pmulhrsw    m4,         m7
1090
+    pmaddubsw   m1,         [r3 + 5 * 32]
1091
+    pmulhrsw    m1,         m7
1092
+    packuswb    m4,         m1
1093
+
1094
+    palignr     m8,         m2, m0, 8
1095
+    palignr     m1,         m3, m2, 8
1096
+    pmaddubsw   m5,         m8, [r3 - 14 * 32]  ; [2]
1097
+    pmulhrsw    m5,         m7
1098
+    pmaddubsw   m9,         m1, [r3 - 14 * 32]
1099
+    pmulhrsw    m9,         m7
1100
+    packuswb    m5,         m9
1101
+
1102
+    pmaddubsw   m6,         m8, [r3 - 1 * 32]   ; [15]
1103
+    pmulhrsw    m6,         m7
1104
+    pmaddubsw   m9,         m1, [r3 - 1 * 32]
1105
+    pmulhrsw    m9,         m7
1106
+    packuswb    m6,         m9
1107
+
1108
+    pmaddubsw   m8,         [r3 + 12 * 32]      ; [28]
1109
+    pmulhrsw    m8,         m7
1110
+    pmaddubsw   m1,         [r3 + 12 * 32]
1111
+    pmulhrsw    m1,         m7
1112
+    packuswb    m8,         m1
1113
+
1114
+    palignr     m10,        m2, m0, 10
1115
+    palignr     m1,         m3, m2, 10
1116
+    pmaddubsw   m9,         m10, [r3 - 7 * 32]  ; [9]
1117
+    pmulhrsw    m9,         m7
1118
+    pmaddubsw   m11,        m1, [r3 - 7 * 32]
1119
+    pmulhrsw    m11,        m7
1120
+    packuswb    m9,         m11
1121
+
1122
+    pmaddubsw   m10,        [r3 + 6 * 32]       ; [22]
1123
+    pmulhrsw    m10,        m7
1124
+    pmaddubsw   m1,         [r3 + 6 * 32]
1125
+    pmulhrsw    m1,         m7
1126
+    packuswb    m10,        m1
1127
+
1128
+    palignr     m3,         m2, 12
1129
+    palignr     m2,         m0, 12
1130
+    pmaddubsw   m11,        m2, [r3 - 13 * 32]  ; [3]
1131
+    pmulhrsw    m11,        m7
1132
+    pmaddubsw   m1,         m3, [r3 - 13 * 32]
1133
+    pmulhrsw    m1,         m7
1134
+    packuswb    m11,        m1
1135
+
1136
+    pmaddubsw   m2,         [r3]                ; [16]
1137
+    pmulhrsw    m2,         m7
1138
+    pmaddubsw   m3,         [r3]
1139
+    pmulhrsw    m3,         m7
1140
+    packuswb    m2,         m3
1141
+
1142
+    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
1143
+    ret
1144
+
1145
+cglobal ang32_mode_6_30_row_16_31
1146
+    test        r7d,        r7d
1147
+    ; rows 0 to 7
1148
+    movu        m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
1149
+    movu        m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
1150
+    movu        m3,         [r2 + 17]           ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
1151
+    movu        m4,         [r2 + 18]           ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
1152
+
1153
+    punpckhbw   m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
1154
+    punpcklbw   m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
1155
+    punpcklbw   m3,         m4                  ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
1156
+
1157
+    pmaddubsw   m4,         m0, [r3 + 13 * 32]  ; [29]
1158
+    pmulhrsw    m4,         m7
1159
+    pmaddubsw   m1,         m2, [r3 + 13 * 32]
1160
+    pmulhrsw    m1,         m7
1161
+    packuswb    m4,         m1
1162
+
1163
+    palignr     m6,         m2, m0, 2
1164
+    palignr     m1,         m3, m2, 2
1165
+    pmaddubsw   m5,         m6, [r3 - 6 * 32]   ; [10]
1166
+    pmulhrsw    m5,         m7
1167
+    pmaddubsw   m8,         m1, [r3 - 6 * 32]
1168
+    pmulhrsw    m8,         m7
1169
+    packuswb    m5,         m8
1170
+
1171
+    pmaddubsw   m6,         [r3 + 7 * 32]       ; [23]
1172
+    pmulhrsw    m6,         m7
1173
+    pmaddubsw   m1,         [r3 + 7 * 32]
1174
+    pmulhrsw    m1,         m7
1175
+    packuswb    m6,         m1
1176
+
1177
+    palignr     m10,        m2, m0, 4
1178
+    palignr     m1,         m3, m2, 4
1179
+    pmaddubsw   m8,         m10, [r3 - 12 * 32] ; [4]
1180
+    pmulhrsw    m8,         m7
1181
+    pmaddubsw   m11,        m1, [r3 - 12 * 32]
1182
+    pmulhrsw    m11,        m7
1183
+    packuswb    m8,         m11
1184
+
1185
+    pmaddubsw   m9,         m10, [r3 + 1 * 32]  ; [17]
1186
+    pmulhrsw    m9,         m7
1187
+    pmaddubsw   m11,        m1, [r3 + 1 * 32]
1188
+    pmulhrsw    m11,        m7
1189
+    packuswb    m9,         m11
1190
+
1191
+    pmaddubsw   m10,        [r3 + 14 * 32]      ; [30]
1192
+    pmulhrsw    m10,        m7
1193
+    pmaddubsw   m1,         [r3 + 14 * 32]
1194
+    pmulhrsw    m1,         m7
1195
+    packuswb    m10,        m1
1196
+
1197
+    palignr     m12,        m2, m0, 6
1198
+    palignr     m1,         m3, m2, 6
1199
+    pmaddubsw   m11,        m12, [r3 - 5 * 32]  ; [11]
1200
+    pmulhrsw    m11,        m7
1201
+    pmaddubsw   m1,         [r3 - 5 * 32]
1202
+    pmulhrsw    m1,         m7
1203
+    packuswb    m11,        m1
1204
+
1205
+    palignr     m1,         m3, m2, 6
1206
+    pmaddubsw   m12,        [r3 + 8 * 32]       ; [24]
1207
+    pmulhrsw    m12,        m7
1208
+    pmaddubsw   m1,         [r3 + 8 * 32]
1209
+    pmulhrsw    m1,         m7
1210
+    packuswb    m12,        m1
1211
+
1212
+    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
1213
+
1214
+    ; rows 8 to 15
1215
+    palignr     m6,         m2, m0, 8
1216
+    palignr     m1,         m3, m2, 8
1217
+    pmaddubsw   m4,         m6, [r3 - 11 * 32]  ; [5]
1218
+    pmulhrsw    m4,         m7
1219
+    pmaddubsw   m8,         m1, [r3 - 11 * 32]
1220
+    pmulhrsw    m8,         m7
1221
+    packuswb    m4,         m8
1222
+
1223
+    pmaddubsw   m5,         m6, [r3 + 2 * 32]   ; [18]
1224
+    pmulhrsw    m5,         m7
1225
+    pmaddubsw   m9,         m1, [r3 + 2 * 32]
1226
+    pmulhrsw    m9,         m7
1227
+    packuswb    m5,         m9
1228
+
1229
+    pmaddubsw   m6,         [r3 + 15 * 32]      ; [31]
1230
+    pmulhrsw    m6,         m7
1231
+    pmaddubsw   m1,         [r3 + 15 * 32]
1232
+    pmulhrsw    m1,         m7
1233
+    packuswb    m6,         m1
1234
+
1235
+    palignr     m9,         m2, m0, 10
1236
+    palignr     m1,         m3, m2, 10
1237
+    pmaddubsw   m8,         m9, [r3 - 4 * 32]   ; [12]
1238
+    pmulhrsw    m8,         m7
1239
+    pmaddubsw   m10,        m1, [r3 - 4 * 32]
1240
+    pmulhrsw    m10,        m7
1241
+    packuswb    m8,         m10
1242
+
1243
+    pmaddubsw   m9,         [r3 + 9 * 32]       ; [25]
1244
+    pmulhrsw    m9,         m7
1245
+    pmaddubsw   m1,         [r3 + 9 * 32]
1246
+    pmulhrsw    m1,         m7
1247
+    packuswb    m9,         m1
1248
+
1249
+    palignr     m3,         m2, 12
1250
+    palignr     m2,         m0, 12
1251
+    pmaddubsw   m10,        m2, [r3 - 10 * 32]  ; [6]
1252
+    pmulhrsw    m10,        m7
1253
+    pmaddubsw   m1,         m3, [r3 - 10 * 32]
1254
+    pmulhrsw    m1,         m7
1255
+    packuswb    m10,        m1
1256
+
1257
+    pmaddubsw   m2,         [r3 + 3 * 32]        ; [19]
1258
+    pmulhrsw    m2,         m7
1259
+    pmaddubsw   m3,         [r3 + 3 * 32]
1260
+    pmulhrsw    m3,         m7
1261
+    packuswb    m2,         m3
1262
+
1263
+    movu        m3,         [r2 + 8]             ; [0]
1264
+
1265
+    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 3, 0, 8
1266
+    ret
1267
+
1268
+INIT_YMM avx2
1269
+cglobal intra_pred_ang32_6, 3,8,13
1270
+    add         r2, 64
1271
+    lea         r3, [ang_table_avx2 + 32 * 16]
1272
+    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
1273
+    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
1274
+    mova        m7, [pw_1024]
1275
+    mov         r4, r0
1276
+    xor         r7d, r7d
1277
+
1278
+    call ang32_mode_6_30_row_0_15
1279
+
1280
+    add         r4, 16
1281
+    mov         r0, r4
1282
+    add         r2, 6
1283
+
1284
+    call ang32_mode_6_30_row_16_31
1285
+    RET
1286
+
1287
+INIT_YMM avx2
1288
+cglobal intra_pred_ang32_30, 3,8,13
1289
+    lea         r3, [ang_table_avx2 + 32 * 16]
1290
+    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
1291
+    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
1292
+    mova        m7, [pw_1024]
1293
+    xor         r7d, r7d
1294
+    inc         r7d
1295
+
1296
+    call ang32_mode_6_30_row_0_15
1297
+
1298
+    add         r2, 6
1299
+
1300
+    call ang32_mode_6_30_row_16_31
1301
+    RET
1302
+
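intra_pred_ang32_6 and intra_pred_ang32_30 are the horizontal and vertical variants of the same HEVC prediction angle (+13), which is why both drivers share the two row helpers above: r7d selects the variant, TRANSPOSE_32x8_AVX2 flips the horizontal result on store, and mode 6's `add r2, 64` presumably re-bases onto the left-neighbour half of the packed reference buffer (the region modes 9 to 11 later read at r2 + mmsize*2 + 1). A hedged scalar sketch of what both modes compute under those assumptions, with ref[1] being the first neighbour pixel as in the [r2 + 1] loads:

    #include <stdint.h>

    /* Scalar model of the 32x32 angle-13 prediction (modes 6 and 30). */
    static void ang32_angle13(uint8_t dst[32][32], const uint8_t *ref,
                              int transpose /* 1 for mode 6 */)
    {
        for (int y = 0; y < 32; y++) {
            int pos  = (y + 1) * 13;
            int idx  = pos >> 5;   /* reference step; add r2,6 absorbs it */
            int frac = pos & 31;   /* the bracketed [n] coefficient row   */
            for (int x = 0; x < 32; x++) {
                int v = ((32 - frac) * ref[idx + x + 1]
                        +      frac  * ref[idx + x + 2] + 16) >> 5;
                if (transpose) dst[x][y] = (uint8_t)v; /* mode 6  */
                else           dst[y][x] = (uint8_t)v; /* mode 30 */
            }
        }
    }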
1303
+cglobal ang32_mode_7_29_row_0_15
1304
+    test        r7d,        r7d
1305
+    ; rows 0 to 7
1306
+    movu        m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
1307
+    movu        m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
1308
+    movu        m3,         [r2 + 17]           ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
1309
+    movu        m4,         [r2 + 18]           ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
1310
+
1311
+    punpckhbw   m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
1312
+    punpcklbw   m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
1313
+    punpcklbw   m3,         m4                  ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
1314
+
1315
+    pmaddubsw   m4,         m0, [r3 - 7 * 32]   ; [9]
1316
+    pmulhrsw    m4,         m7
1317
+    pmaddubsw   m1,         m2, [r3 - 7 * 32]
1318
+    pmulhrsw    m1,         m7
1319
+    packuswb    m4,         m1
1320
+
1321
+    pmaddubsw   m5,         m0, [r3 + 2 * 32]   ; [18]
1322
+    pmulhrsw    m5,         m7
1323
+    pmaddubsw   m8,         m2, [r3 + 2 * 32]
1324
+    pmulhrsw    m8,         m7
1325
+    packuswb    m5,         m8
1326
+
1327
+    pmaddubsw   m6,         m0, [r3 + 11 * 32]  ; [27]
1328
+    pmulhrsw    m6,         m7
1329
+    pmaddubsw   m9,         m2, [r3 + 11 * 32]
1330
+    pmulhrsw    m9,         m7
1331
+    packuswb    m6,         m9
1332
+
1333
+    palignr     m11,        m2, m0, 2
1334
+    palignr     m1,         m3, m2, 2
1335
+    pmaddubsw   m8,         m11, [r3 - 12 * 32] ; [4]
1336
+    pmulhrsw    m8,         m7
1337
+    pmaddubsw   m12,        m1, [r3 - 12 * 32]
1338
+    pmulhrsw    m12,        m7
1339
+    packuswb    m8,         m12
1340
+
1341
+    pmaddubsw   m9,         m11, [r3 - 3 * 32]  ; [13]
1342
+    pmulhrsw    m9,         m7
1343
+    pmaddubsw   m12,        m1, [r3 - 3 * 32]
1344
+    pmulhrsw    m12,        m7
1345
+    packuswb    m9,         m12
1346
+
1347
+    pmaddubsw   m10,        m11, [r3 + 6 * 32]  ; [22]
1348
+    pmulhrsw    m10,        m7
1349
+    pmaddubsw   m12,        m1, [r3 + 6 * 32]
1350
+    pmulhrsw    m12,        m7
1351
+    packuswb    m10,        m12
1352
+
1353
+    pmaddubsw   m11,        [r3 + 15 * 32]      ; [31]
1354
+    pmulhrsw    m11,        m7
1355
+    pmaddubsw   m1,         [r3 + 15 * 32]
1356
+    pmulhrsw    m1,         m7
1357
+    packuswb    m11,        m1
1358
+
1359
+    palignr     m12,        m2, m0, 4
1360
+    palignr     m1,         m3, m2, 4
1361
+    pmaddubsw   m12,        [r3 - 8 * 32]       ; [8]
1362
+    pmulhrsw    m12,        m7
1363
+    pmaddubsw   m1,         [r3 - 8 * 32]
1364
+    pmulhrsw    m1,         m7
1365
+    packuswb    m12,        m1
1366
+
1367
+    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
1368
+
1369
+    ; rows 8 to 15
1370
+    palignr     m5,         m2, m0, 4
1371
+    palignr     m1,         m3, m2, 4
1372
+    pmaddubsw   m4,         m5, [r3 + 1 * 32]   ; [17]
1373
+    pmulhrsw    m4,         m7
1374
+    pmaddubsw   m8,         m1, [r3 + 1 * 32]
1375
+    pmulhrsw    m8,         m7
1376
+    packuswb    m4,         m8
1377
+
1378
+    pmaddubsw   m5,         [r3 + 10 * 32]      ; [26]
1379
+    pmulhrsw    m5,         m7
1380
+    pmaddubsw   m1,         [r3 + 10 * 32]
1381
+    pmulhrsw    m1,         m7
1382
+    packuswb    m5,         m1
1383
+
1384
+    palignr     m10,        m2, m0, 6
1385
+    palignr     m1,         m3, m2, 6
1386
+    pmaddubsw   m6,         m10, [r3 - 13 * 32] ; [3]
1387
+    pmulhrsw    m6,         m7
1388
+    pmaddubsw   m9,         m1, [r3 - 13 * 32]
1389
+    pmulhrsw    m9,         m7
1390
+    packuswb    m6,         m9
1391
+
1392
+    pmaddubsw   m8,         m10, [r3 - 4 * 32]  ; [12]
1393
+    pmulhrsw    m8,         m7
1394
+    pmaddubsw   m11,        m1, [r3 - 4 * 32]
1395
+    pmulhrsw    m11,        m7
1396
+    packuswb    m8,         m11
1397
+
1398
+    pmaddubsw   m9,         m10, [r3 + 5 * 32]  ; [21]
1399
+    pmulhrsw    m9,         m7
1400
+    pmaddubsw   m11,        m1, [r3 + 5 * 32]
1401
+    pmulhrsw    m11,        m7
1402
+    packuswb    m9,         m11
1403
+
1404
+    pmaddubsw   m10,        [r3 + 14 * 32]      ; [30]
1405
+    pmulhrsw    m10,        m7
1406
+    pmaddubsw   m1,         [r3 + 14 * 32]
1407
+    pmulhrsw    m1,         m7
1408
+    packuswb    m10,        m1
1409
+
1410
+    palignr     m3,         m2, 8
1411
+    palignr     m2,         m0, 8
1412
+    pmaddubsw   m11,        m2, [r3 - 9 * 32]   ; [7]
1413
+    pmulhrsw    m11,        m7
1414
+    pmaddubsw   m1,         m3, [r3 - 9 * 32]
1415
+    pmulhrsw    m1,         m7
1416
+    packuswb    m11,        m1
1417
+
1418
+    pmaddubsw   m2,         [r3]                ; [16]
1419
+    pmulhrsw    m2,         m7
1420
+    pmaddubsw   m3,         [r3]
1421
+    pmulhrsw    m3,         m7
1422
+    packuswb    m2,         m3
1423
+
1424
+    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 2, 0, 8
1425
+    ret
1426
+
1427
+cglobal ang32_mode_7_29_row_16_31
1428
+    test        r7d,        r7d
1429
+    ; rows 0 to 7
1430
+    movu        m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
1431
+    movu        m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
1432
+    movu        m3,         [r2 + 17]           ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
1433
+    movu        m4,         [r2 + 18]           ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
1434
+
1435
+    punpckhbw   m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
1436
+    punpcklbw   m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
1437
+    punpcklbw   m3,         m4                  ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
1438
+
1439
+    pmaddubsw   m4,         m0, [r3 + 9 * 32]   ; [25]
1440
+    pmulhrsw    m4,         m7
1441
+    pmaddubsw   m1,         m2, [r3 + 9 * 32]
1442
+    pmulhrsw    m1,         m7
1443
+    packuswb    m4,         m1
1444
+
1445
+    palignr     m9,         m2, m0, 2
1446
+    palignr     m1,         m3, m2, 2
1447
+    pmaddubsw   m5,         m9, [r3 - 14 * 32]  ; [2]
1448
+    pmulhrsw    m5,         m7
1449
+    pmaddubsw   m8,         m1, [r3 - 14 * 32]
1450
+    pmulhrsw    m8,         m7
1451
+    packuswb    m5,         m8
1452
+
1453
+    pmaddubsw   m6,         m9, [r3 - 5 * 32]   ; [11]
1454
+    pmulhrsw    m6,         m7
1455
+    pmaddubsw   m10,        m1, [r3 - 5 * 32]
1456
+    pmulhrsw    m10,        m7
1457
+    packuswb    m6,         m10
1458
+
1459
+    pmaddubsw   m8,         m9, [r3 + 4 * 32]   ; [20]
1460
+    pmulhrsw    m8,         m7
1461
+    pmaddubsw   m10,        m1, [r3 + 4 * 32]
1462
+    pmulhrsw    m10,        m7
1463
+    packuswb    m8,         m10
1464
+
1465
+    pmaddubsw   m9,         [r3 + 13 * 32]      ; [29]
1466
+    pmulhrsw    m9,         m7
1467
+    pmaddubsw   m1,         [r3 + 13 * 32]
1468
+    pmulhrsw    m1,         m7
1469
+    packuswb    m9,         m1
1470
+
1471
+    palignr     m12,        m2, m0, 4
1472
+    palignr     m1,         m3, m2, 4
1473
+    pmaddubsw   m10,        m12, [r3 - 10 * 32] ; [6]
1474
+    pmulhrsw    m10,        m7
1475
+    pmaddubsw   m11,        m1, [r3 - 10 * 32]
1476
+    pmulhrsw    m11,        m7
1477
+    packuswb    m10,        m11
1478
+
1479
+    pmaddubsw   m11,        m12, [r3 - 1 * 32]  ; [15]
1480
+    pmulhrsw    m11,        m7
1481
+    pmaddubsw   m1,         [r3 - 1 * 32]
1482
+    pmulhrsw    m1,         m7
1483
+    packuswb    m11,        m1
1484
+
1485
+    palignr     m1,         m3, m2, 4
1486
+    pmaddubsw   m12,        [r3 + 8 * 32]       ; [24]
1487
+    pmulhrsw    m12,        m7
1488
+    pmaddubsw   m1,         [r3 + 8 * 32]
1489
+    pmulhrsw    m1,         m7
1490
+    packuswb    m12,        m1
1491
+
1492
+    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
1493
+
1494
+    ; rows 8 to 15
1495
+    palignr     m8,         m2, m0, 6
1496
+    palignr     m1,         m3, m2, 6
1497
+    pmaddubsw   m4,         m8, [r3 - 15 * 32]  ; [1]
1498
+    pmulhrsw    m4,         m7
1499
+    pmaddubsw   m9,         m1, [r3 - 15 * 32]
1500
+    pmulhrsw    m9,         m7
1501
+    packuswb    m4,         m9
1502
+
1503
+    pmaddubsw   m5,         m8, [r3 - 6 * 32]   ; [10]
1504
+    pmulhrsw    m5,         m7
1505
+    pmaddubsw   m9,         m1, [r3 - 6 * 32]
1506
+    pmulhrsw    m9,         m7
1507
+    packuswb    m5,         m9
1508
+
1509
+    pmaddubsw   m6,         m8, [r3 + 3 * 32]   ; [19]
1510
+    pmulhrsw    m6,         m7
1511
+    pmaddubsw   m9,         m1, [r3 + 3 * 32]
1512
+    pmulhrsw    m9,         m7
1513
+    packuswb    m6,         m9
1514
+
1515
+    pmaddubsw   m8,         [r3 + 12 * 32]      ; [28]
1516
+    pmulhrsw    m8,         m7
1517
+    pmaddubsw   m1,         [r3 + 12 * 32]
1518
+    pmulhrsw    m1,         m7
1519
+    packuswb    m8,         m1
1520
+
1521
+    palignr     m3,         m2, 8
1522
+    palignr     m2,         m0, 8
1523
+    pmaddubsw   m9,         m2, [r3 - 11 * 32]  ; [5]
1524
+    pmulhrsw    m9,         m7
1525
+    pmaddubsw   m1,         m3, [r3 - 11 * 32]
1526
+    pmulhrsw    m1,         m7
1527
+    packuswb    m9,         m1
1528
+
1529
+    pmaddubsw   m10,        m2, [r3 - 2 * 32]   ; [14]
1530
+    pmulhrsw    m10,        m7
1531
+    pmaddubsw   m1,         m3, [r3 - 2 * 32]
1532
+    pmulhrsw    m1,         m7
1533
+    packuswb    m10,        m1
1534
+
1535
+    pmaddubsw   m2,        [r3 + 7 * 32]        ; [23]
1536
+    pmulhrsw    m2,        m7
1537
+    pmaddubsw   m3,        [r3 + 7 * 32]
1538
+    pmulhrsw    m3,        m7
1539
+    packuswb    m2,        m3
1540
+
1541
+    movu        m1,         [r2 + 6]            ; [0]
1542
+
1543
+    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 1, 0, 8
1544
+    ret
1545
+
1546
+INIT_YMM avx2
1547
+cglobal intra_pred_ang32_7, 3,8,13
1548
+    add         r2, 64
1549
+    lea         r3, [ang_table_avx2 + 32 * 16]
1550
+    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
1551
+    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
1552
+    mova        m7, [pw_1024]
1553
+    mov         r4, r0
1554
+    xor         r7d, r7d
1555
+
1556
+    call ang32_mode_7_29_row_0_15
1557
+
1558
+    add         r4, 16
1559
+    mov         r0, r4
1560
+    add         r2, 4
1561
+
1562
+    call ang32_mode_7_29_row_16_31
1563
+    RET
1564
+
1565
+INIT_YMM avx2
1566
+cglobal intra_pred_ang32_29, 3,8,13
1567
+    lea         r3, [ang_table_avx2 + 32 * 16]
1568
+    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
1569
+    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
1570
+    mova        m7, [pw_1024]
1571
+    xor         r7d, r7d
1572
+    inc         r7d
1573
+
1574
+    call ang32_mode_7_29_row_0_15
1575
+
1576
+    add         r2, 4
1577
+
1578
+    call ang32_mode_7_29_row_16_31
1579
+    RET
1580
+
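The `add r2, 6` (modes 6/30) and `add r2, 4` (modes 7/29) between the two helper calls re-base the reference by the integer part of the angular position at row 16, which keeps the palignr offsets inside the *_row_16_31 helpers small; the horizontal drivers also step the destination 16 pixels right (`add r4, 16` / `mov r0, r4`) because their output tiles are stored transposed. A quick check of the two constants:

    #include <assert.h>

    /* Reference re-base before the rows 16..31 helper: integer part of
       the 1/32-pel position of row 16, i.e. (17 * angle) >> 5. */
    static int rebase(int angle) { return (17 * angle) >> 5; }

    int main(void)
    {
        assert(rebase(13) == 6); /* modes 6/30: add r2, 6 */
        assert(rebase(9)  == 4); /* modes 7/29: add r2, 4 */
        return 0;
    }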
1581
+cglobal ang32_mode_8_28_avx2
1582
+    test        r7d,        r7d
1583
+    ; rows 0 to 7
1584
+    movu        m0,         [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
1585
+    movu        m1,         [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
1586
+    movu        m3,         [r2 + 17]           ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
1587
+    movu        m4,         [r2 + 18]           ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
1588
+
1589
+    punpckhbw   m2,         m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
1590
+    punpcklbw   m0,         m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
1591
+    punpcklbw   m3,         m4                  ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
1592
+
1593
+    pmaddubsw   m4,         m0, [r3 - 11 * 32]  ; [5]
1594
+    pmulhrsw    m4,         m7
1595
+    pmaddubsw   m1,         m2, [r3 - 11 * 32]
1596
+    pmulhrsw    m1,         m7
1597
+    packuswb    m4,         m1
1598
+
1599
+    pmaddubsw   m5,         m0, [r3 - 6 * 32]   ; [10]
1600
+    pmulhrsw    m5,         m7
1601
+    pmaddubsw   m8,         m2, [r3 - 6 * 32]
1602
+    pmulhrsw    m8,         m7
1603
+    packuswb    m5,         m8
1604
+
1605
+    pmaddubsw   m6,         m0, [r3 - 1 * 32]   ; [15]
1606
+    pmulhrsw    m6,         m7
1607
+    pmaddubsw   m9,         m2, [r3 - 1 * 32]
1608
+    pmulhrsw    m9,         m7
1609
+    packuswb    m6,         m9
1610
+
1611
+    pmaddubsw   m8,         m0, [r3 + 4 * 32]   ; [20]
1612
+    pmulhrsw    m8,         m7
1613
+    pmaddubsw   m12,        m2, [r3 + 4 * 32]
1614
+    pmulhrsw    m12,        m7
1615
+    packuswb    m8,         m12
1616
+
1617
+    pmaddubsw   m9,         m0, [r3 + 9 * 32]   ; [25]
1618
+    pmulhrsw    m9,         m7
1619
+    pmaddubsw   m12,        m2, [r3 + 9 * 32]
1620
+    pmulhrsw    m12,        m7
1621
+    packuswb    m9,         m12
1622
+
1623
+    pmaddubsw   m10,        m0, [r3 + 14 * 32]  ; [30]
1624
+    pmulhrsw    m10,        m7
1625
+    pmaddubsw   m12,        m2, [r3 + 14 * 32]
1626
+    pmulhrsw    m12,        m7
1627
+    packuswb    m10,        m12
1628
+
1629
+    palignr     m12,        m2, m0, 2
1630
+    palignr     m1,         m3, m2, 2
1631
+    pmaddubsw   m11,        m12, [r3 - 13 * 32] ; [3]
1632
+    pmulhrsw    m11,        m7
1633
+    pmaddubsw   m1,         [r3 - 13 * 32]
1634
+    pmulhrsw    m1,         m7
1635
+    packuswb    m11,        m1
1636
+
1637
+    palignr     m1,         m3, m2, 2
1638
+    pmaddubsw   m12,        [r3 - 8 * 32]       ; [8]
1639
+    pmulhrsw    m12,        m7
1640
+    pmaddubsw   m1,         [r3 - 8 * 32]
1641
+    pmulhrsw    m1,         m7
1642
+    packuswb    m12,        m1
1643
+
1644
+    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 0
1645
+
1646
+    ; rows 8 to 15
1647
+
1648
+    palignr     m8,         m2, m0, 2
1649
+    palignr     m1,         m3, m2, 2
1650
+    pmaddubsw   m4,         m8, [r3 - 3 * 32]   ; [13]
1651
+    pmulhrsw    m4,         m7
1652
+    pmaddubsw   m9,         m1, [r3 - 3 * 32]
1653
+    pmulhrsw    m9,         m7
1654
+    packuswb    m4,         m9
1655
+
1656
+    pmaddubsw   m5,         m8, [r3 + 2 * 32]   ; [18]
1657
+    pmulhrsw    m5,         m7
1658
+    pmaddubsw   m9,         m1, [r3 + 2 * 32]
1659
+    pmulhrsw    m9,         m7
1660
+    packuswb    m5,         m9
1661
+
1662
+    pmaddubsw   m6,         m8, [r3 + 7 * 32]   ; [23]
1663
+    pmulhrsw    m6,         m7
1664
+    pmaddubsw   m9,         m1, [r3 + 7 * 32]
1665
+    pmulhrsw    m9,         m7
1666
+    packuswb    m6,         m9
1667
+
1668
+    pmaddubsw   m8,         [r3 + 12 * 32]      ; [28]
1669
+    pmulhrsw    m8,         m7
1670
+    pmaddubsw   m1,         [r3 + 12 * 32]
1671
+    pmulhrsw    m1,         m7
1672
+    packuswb    m8,         m1
1673
+
1674
+    palignr     m12,        m2, m0, 4
1675
+    palignr     m1,         m3, m2, 4
1676
+    pmaddubsw   m9,         m12, [r3 - 15 * 32] ; [1]
1677
+    pmulhrsw    m9,         m7
1678
+    pmaddubsw   m11,        m1, [r3 - 15 * 32]
1679
+    pmulhrsw    m11,        m7
1680
+    packuswb    m9,         m11
1681
+
1682
+    pmaddubsw   m10,        m12, [r3 - 10 * 32] ; [6]
1683
+    pmulhrsw    m10,        m7
1684
+    pmaddubsw   m11,        m1, [r3 - 10 * 32]
1685
+    pmulhrsw    m11,        m7
1686
+    packuswb    m10,        m11
1687
+
1688
+    pmaddubsw   m11,        m12, [r3 - 5 * 32]  ; [11]
1689
+    pmulhrsw    m11,        m7
1690
+    pmaddubsw   m1,         [r3 - 5 * 32]
1691
+    pmulhrsw    m1,         m7
1692
+    packuswb    m11,        m1
1693
+
1694
+    palignr     m1,         m3, m2, 4
1695
+    pmaddubsw   m12,        [r3]                ; [16]
1696
+    pmulhrsw    m12,        m7
1697
+    pmaddubsw   m1,         [r3]
1698
+    pmulhrsw    m1,         m7
1699
+    packuswb    m12,        m1
1700
+
1701
+    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 8
1702
+
1703
+    ; rows 16 to 23
1704
+
1705
+    jnz         .doNotAdjustBufferPtr
1706
+    lea         r4,         [r4 + mmsize/2]
1707
+    mov         r0,         r4
1708
+.doNotAdjustBufferPtr:
1709
+
1710
+    palignr     m6,         m2, m0, 4
1711
+    palignr     m1,         m3, m2, 4
1712
+    pmaddubsw   m4,         m6, [r3 + 5 * 32]   ; [21]
1713
+    pmulhrsw    m4,         m7
1714
+    pmaddubsw   m8,         m1, [r3 + 5 * 32]
1715
+    pmulhrsw    m8,         m7
1716
+    packuswb    m4,         m8
1717
+
1718
+    pmaddubsw   m5,         m6, [r3 + 10 * 32]  ; [26]
1719
+    pmulhrsw    m5,         m7
1720
+    pmaddubsw   m8,         m1, [r3 + 10 * 32]
1721
+    pmulhrsw    m8,         m7
1722
+    packuswb    m5,         m8
1723
+
1724
+    pmaddubsw   m6,         [r3 + 15 * 32]      ; [31]
1725
+    pmulhrsw    m6,         m7
1726
+    pmaddubsw   m1,         [r3 + 15 * 32]
1727
+    pmulhrsw    m1,         m7
1728
+    packuswb    m6,         m1
1729
+
1730
+    palignr     m12,        m2, m0, 6
1731
+    palignr     m1,         m3, m2, 6
1732
+    pmaddubsw   m8,         m12, [r3 - 12 * 32] ; [4]
1733
+    pmulhrsw    m8,         m7
1734
+    pmaddubsw   m11,        m1, [r3 - 12 * 32]
1735
+    pmulhrsw    m11,        m7
1736
+    packuswb    m8,         m11
1737
+
1738
+    pmaddubsw   m9,         m12, [r3 - 7 * 32]  ; [9]
1739
+    pmulhrsw    m9,         m7
1740
+    pmaddubsw   m11,        m1, [r3 - 7 * 32]
1741
+    pmulhrsw    m11,        m7
1742
+    packuswb    m9,         m11
1743
+
1744
+    pmaddubsw   m10,        m12, [r3 - 2 * 32]  ; [14]
1745
+    pmulhrsw    m10,        m7
1746
+    pmaddubsw   m11,        m1, [r3 - 2 * 32]
1747
+    pmulhrsw    m11,        m7
1748
+    packuswb    m10,        m11
1749
+
1750
+    pmaddubsw   m11,        m12, [r3 + 3 * 32]  ; [19]
1751
+    pmulhrsw    m11,        m7
1752
+    pmaddubsw   m1,         [r3 + 3 * 32]
1753
+    pmulhrsw    m1,         m7
1754
+    packuswb    m11,        m1
1755
+
1756
+    palignr     m1,         m3, m2, 6
1757
+    pmaddubsw   m12,        [r3 + 8 * 32]       ; [24]
1758
+    pmulhrsw    m12,        m7
1759
+    pmaddubsw   m1,         [r3 + 8 * 32]
1760
+    pmulhrsw    m1,         m7
1761
+    packuswb    m12,        m1
1762
+
1763
+    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 11, 12, 1, 16
1764
+
1765
+    ; rows 24 to 31
1766
+    palignr     m4,         m2, m0, 6
1767
+    palignr     m1,         m3, m2, 6
1768
+    pmaddubsw   m4,         [r3 + 13 * 32]      ; [29]
1769
+    pmulhrsw    m4,         m7
1770
+    pmaddubsw   m1,         [r3 + 13 * 32]
1771
+    pmulhrsw    m1,         m7
1772
+    packuswb    m4,         m1
1773
+
1774
+    palignr     m3,         m2, 8
1775
+    palignr     m2,         m0, 8
1776
+    pmaddubsw   m5,         m2, [r3 - 14 * 32]  ; [2]
1777
+    pmulhrsw    m5,         m7
1778
+    pmaddubsw   m9,         m3, [r3 - 14 * 32]
1779
+    pmulhrsw    m9,         m7
1780
+    packuswb    m5,         m9
1781
+
1782
+    pmaddubsw   m6,         m2, [r3 - 9 * 32]   ; [7]
1783
+    pmulhrsw    m6,         m7
1784
+    pmaddubsw   m9,         m3, [r3 - 9 * 32]
1785
+    pmulhrsw    m9,         m7
1786
+    packuswb    m6,         m9
1787
+
1788
+    pmaddubsw   m8,         m2, [r3 - 4 * 32]   ; [12]
1789
+    pmulhrsw    m8,         m7
1790
+    pmaddubsw   m1,         m3, [r3 - 4 * 32]
1791
+    pmulhrsw    m1,         m7
1792
+    packuswb    m8,         m1
1793
+
1794
+    pmaddubsw   m9,         m2, [r3 + 1 * 32]   ; [17]
1795
+    pmulhrsw    m9,         m7
1796
+    pmaddubsw   m11,        m3, [r3 + 1 * 32]
1797
+    pmulhrsw    m11,        m7
1798
+    packuswb    m9,         m11
1799
+
1800
+    pmaddubsw   m10,        m2, [r3 + 6 * 32]   ; [22]
1801
+    pmulhrsw    m10,        m7
1802
+    pmaddubsw   m1,         m3, [r3 + 6 * 32]
1803
+    pmulhrsw    m1,         m7
1804
+    packuswb    m10,        m1
1805
+
1806
+    pmaddubsw   m2,         [r3 + 11 * 32]      ; [27]
1807
+    pmulhrsw    m2,         m7
1808
+    pmaddubsw   m3,         [r3 + 11 * 32]
1809
+    pmulhrsw    m3,         m7
1810
+    packuswb    m2,         m3
1811
+
1812
+    movu        m3,         [r2 + 6]            ; [0]
1813
+
1814
+    TRANSPOSE_32x8_AVX2 4, 5, 6, 8, 9, 10, 2, 3, 0, 24
1815
+    ret
1816
+
1817
+INIT_YMM avx2
1818
+cglobal intra_pred_ang32_8, 3,8,13
1819
+    add         r2, 64
1820
+    lea         r3, [ang_table_avx2 + 32 * 16]
1821
+    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
1822
+    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
1823
+    mova        m7, [pw_1024]
1824
+    mov         r4, r0
1825
+    xor         r7d, r7d
1826
+
1827
+    call ang32_mode_8_28_avx2
1828
+    RET
1829
+
1830
+INIT_YMM avx2
1831
+cglobal intra_pred_ang32_28, 3,8,13
1832
+    lea         r3, [ang_table_avx2 + 32 * 16]
1833
+    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
1834
+    lea         r6, [r1 * 4]            ; r6 -> 4 * stride
1835
+    mova        m7, [pw_1024]
1836
+    xor         r7d, r7d
1837
+    inc         r7d
1838
+
1839
+    call ang32_mode_8_28_avx2
1840
+    RET
1841
+
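Modes 8/28 use angle 5, shallow enough that the integer reference step over 32 rows only reaches (32 * 5) >> 5 = 5, so one helper covers the whole block without re-basing r2 and ends in the frac-0 copy row `movu m3, [r2 + 6]`. The `jnz .doNotAdjustBufferPtr` in the middle still consumes the flags set by `test r7d, r7d` at entry, which requires everything in between, TRANSPOSE_32x8_AVX2 included, to leave EFLAGS untouched; only horizontal mode 8 falls through and moves the destination 16 pixels right (`lea r4, [r4 + mmsize/2]`), since its tiles land transposed. A hedged scalar sketch in the mode 28 orientation:

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar model of angle-5 prediction (mode 28; mode 8 is the same
       block transposed).  ref[1] is the first neighbour pixel. */
    static void ang32_angle5(uint8_t *dst, ptrdiff_t stride, const uint8_t *ref)
    {
        for (int y = 0; y < 32; y++) {
            int pos = (y + 1) * 5, idx = pos >> 5, frac = pos & 31;
            for (int x = 0; x < 32; x++) {
                int a = ref[idx + x + 1], b = ref[idx + x + 2];
                dst[y * stride + x] =
                    (uint8_t)(((32 - frac) * a + frac * b + 16) >> 5);
            }
        }
    }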
1842
+INIT_YMM avx2
1843
+cglobal intra_pred_ang32_9, 3,5,8
1844
+    vbroadcasti128      m0, [angHor_tab_9]
1845
+    vbroadcasti128      m1, [angHor_tab_9 + mmsize/2]
1846
+    mova                m2, [pw_1024]
1847
+    mova                m7, [ang32_shuf_mode9]
1848
+    lea                 r3, [r1 * 3]
1849
+
1850
+    vbroadcasti128      m3, [r2 + mmsize*2 +  1]
1851
+    vbroadcasti128      m6, [r2 + mmsize*2 + 17]
1852
+
1853
+    pshufb              m5, m3, m7
1854
+    pmaddubsw           m4, m5, m0
1855
+    pmaddubsw           m5, m1
1856
+    pmulhrsw            m4, m2
1857
+    pmulhrsw            m5, m2
1858
+    packuswb            m4, m5
1859
+    movu                [r0], m4
1860
+
1861
+    palignr             m5, m6, m3, 1
1862
+    pshufb              m5, m7
1863
+    pmaddubsw           m4, m5, m0
1864
+    pmaddubsw           m5, m1
1865
+    pmulhrsw            m4, m2
1866
+    pmulhrsw            m5, m2
1867
+    packuswb            m4, m5
1868
+    movu                [r0 + r1], m4
1869
+
1870
+    palignr             m5, m6, m3, 2
1871
+    pshufb              m5, m7
1872
+    pmaddubsw           m4, m5, m0
1873
+    pmaddubsw           m5, m1
1874
+    pmulhrsw            m4, m2
1875
+    pmulhrsw            m5, m2
1876
+    packuswb            m4, m5
1877
+    movu                [r0 + r1*2], m4
1878
+
1879
+    palignr             m5, m6, m3, 3
1880
+    pshufb              m5, m7
1881
+    pmaddubsw           m4, m5, m0
1882
+    pmaddubsw           m5, m1
1883
+    pmulhrsw            m4, m2
1884
+    pmulhrsw            m5, m2
1885
+    packuswb            m4, m5
1886
+    movu                [r0 + r3], m4
1887
+
1888
+    lea                 r0, [r0 + r1 * 4]
1889
+
1890
+    palignr             m5, m6, m3, 4
1891
+    pshufb              m5, m7
1892
+    pmaddubsw           m4, m5, m0
1893
+    pmaddubsw           m5, m1
1894
+    pmulhrsw            m4, m2
1895
+    pmulhrsw            m5, m2
1896
+    packuswb            m4, m5
1897
+    movu                [r0], m4
1898
+
1899
+    palignr             m5, m6, m3, 5
1900
+    pshufb              m5, m7
1901
+    pmaddubsw           m4, m5, m0
1902
+    pmaddubsw           m5, m1
1903
+    pmulhrsw            m4, m2
1904
+    pmulhrsw            m5, m2
1905
+    packuswb            m4, m5
1906
+    movu                [r0 + r1], m4
1907
+
1908
+    palignr             m5, m6, m3, 6
1909
+    pshufb              m5, m7
1910
+    pmaddubsw           m4, m5, m0
1911
+    pmaddubsw           m5, m1
1912
+    pmulhrsw            m4, m2
1913
+    pmulhrsw            m5, m2
1914
+    packuswb            m4, m5
1915
+    movu                [r0 + r1*2], m4
1916
+
1917
+    palignr             m5, m6, m3, 7
1918
+    pshufb              m5, m7
1919
+    pmaddubsw           m4, m5, m0
1920
+    pmaddubsw           m5, m1
1921
+    pmulhrsw            m4, m2
1922
+    pmulhrsw            m5, m2
1923
+    packuswb            m4, m5
1924
+    movu                [r0 + r3], m4
1925
+
1926
+    lea                 r0, [r0 + r1 * 4]
1927
+
1928
+    palignr             m5, m6, m3, 8
1929
+    pshufb              m5, m7
1930
+    pmaddubsw           m4, m5, m0
1931
+    pmaddubsw           m5, m1
1932
+    pmulhrsw            m4, m2
1933
+    pmulhrsw            m5, m2
1934
+    packuswb            m4, m5
1935
+    movu                [r0], m4
1936
+
1937
+    palignr             m5, m6, m3, 9
1938
+    pshufb              m5, m7
1939
+    pmaddubsw           m4, m5, m0
1940
+    pmaddubsw           m5, m1
1941
+    pmulhrsw            m4, m2
1942
+    pmulhrsw            m5, m2
1943
+    packuswb            m4, m5
1944
+    movu                [r0 + r1], m4
1945
+
1946
+    palignr             m5, m6, m3, 10
1947
+    pshufb              m5, m7
1948
+    pmaddubsw           m4, m5, m0
1949
+    pmaddubsw           m5, m1
1950
+    pmulhrsw            m4, m2
1951
+    pmulhrsw            m5, m2
1952
+    packuswb            m4, m5
1953
+    movu                [r0 + r1*2], m4
1954
+
1955
+    palignr             m5, m6, m3, 11
1956
+    pshufb              m5, m7
1957
+    pmaddubsw           m4, m5, m0
1958
+    pmaddubsw           m5, m1
1959
+    pmulhrsw            m4, m2
1960
+    pmulhrsw            m5, m2
1961
+    packuswb            m4, m5
1962
+    movu                [r0 + r3], m4
1963
+
1964
+    lea                 r0, [r0 + r1 * 4]
1965
+
1966
+    palignr             m5, m6, m3, 12
1967
+    pshufb              m5, m7
1968
+    pmaddubsw           m4, m5, m0
1969
+    pmaddubsw           m5, m1
1970
+    pmulhrsw            m4, m2
1971
+    pmulhrsw            m5, m2
1972
+    packuswb            m4, m5
1973
+    movu                [r0], m4
1974
+
1975
+    palignr             m5, m6, m3, 13
1976
+    pshufb              m5, m7
1977
+    pmaddubsw           m4, m5, m0
1978
+    pmaddubsw           m5, m1
1979
+    pmulhrsw            m4, m2
1980
+    pmulhrsw            m5, m2
1981
+    packuswb            m4, m5
1982
+    movu                [r0 + r1], m4
1983
+
1984
+    palignr             m5, m6, m3, 14
1985
+    pshufb              m5, m7
1986
+    pmaddubsw           m4, m5, m0
1987
+    pmaddubsw           m5, m1
1988
+    pmulhrsw            m4, m2
1989
+    pmulhrsw            m5, m2
1990
+    packuswb            m4, m5
1991
+    movu                [r0 + r1*2], m4
1992
+
1993
+    palignr             m5, m6, m3, 15
1994
+    pshufb              m5, m7
1995
+    pmaddubsw           m4, m5, m0
1996
+    pmaddubsw           m5, m1
1997
+    pmulhrsw            m4, m2
1998
+    pmulhrsw            m5, m2
1999
+    packuswb            m4, m5
2000
+    movu                [r0 + r3], m4
2001
+
2002
+    lea                 r0, [r0 + r1 * 4]
2003
+
2004
+    vbroadcasti128      m3, [r2 + mmsize*2 + 33]
2005
+
2006
+    pshufb              m5, m6, m7
2007
+    pmaddubsw           m4, m5, m0
2008
+    pmaddubsw           m5, m1
2009
+    pmulhrsw            m4, m2
2010
+    pmulhrsw            m5, m2
2011
+    packuswb            m4, m5
2012
+    movu                [r0], m4
2013
+
2014
+    palignr             m5, m3, m6, 1
2015
+    pshufb              m5, m7
2016
+    pmaddubsw           m4, m5, m0
2017
+    pmaddubsw           m5, m1
2018
+    pmulhrsw            m4, m2
2019
+    pmulhrsw            m5, m2
2020
+    packuswb            m4, m5
2021
+    movu                [r0 + r1], m4
2022
+
2023
+    palignr             m5, m3, m6, 2
2024
+    pshufb              m5, m7
2025
+    pmaddubsw           m4, m5, m0
2026
+    pmaddubsw           m5, m1
2027
+    pmulhrsw            m4, m2
2028
+    pmulhrsw            m5, m2
2029
+    packuswb            m4, m5
2030
+    movu                [r0 + r1*2], m4
2031
+
2032
+    palignr             m5, m3, m6, 3
2033
+    pshufb              m5, m7
2034
+    pmaddubsw           m4, m5, m0
2035
+    pmaddubsw           m5, m1
2036
+    pmulhrsw            m4, m2
2037
+    pmulhrsw            m5, m2
2038
+    packuswb            m4, m5
2039
+    movu                [r0 + r3], m4
2040
+
2041
+    lea                 r0, [r0 + r1 * 4]
2042
+
2043
+    palignr             m5, m3, m6, 4
2044
+    pshufb              m5, m7
2045
+    pmaddubsw           m4, m5, m0
2046
+    pmaddubsw           m5, m1
2047
+    pmulhrsw            m4, m2
2048
+    pmulhrsw            m5, m2
2049
+    packuswb            m4, m5
2050
+    movu                [r0], m4
2051
+
2052
+    palignr             m5, m3, m6, 5
2053
+    pshufb              m5, m7
2054
+    pmaddubsw           m4, m5, m0
2055
+    pmaddubsw           m5, m1
2056
+    pmulhrsw            m4, m2
2057
+    pmulhrsw            m5, m2
2058
+    packuswb            m4, m5
2059
+    movu                [r0 + r1], m4
2060
+
2061
+    palignr             m5, m3, m6, 6
2062
+    pshufb              m5, m7
2063
+    pmaddubsw           m4, m5, m0
2064
+    pmaddubsw           m5, m1
2065
+    pmulhrsw            m4, m2
2066
+    pmulhrsw            m5, m2
2067
+    packuswb            m4, m5
2068
+    movu                [r0 + r1*2], m4
2069
+
2070
+    palignr             m5, m3, m6, 7
2071
+    pshufb              m5, m7
2072
+    pmaddubsw           m4, m5, m0
2073
+    pmaddubsw           m5, m1
2074
+    pmulhrsw            m4, m2
2075
+    pmulhrsw            m5, m2
2076
+    packuswb            m4, m5
2077
+    movu                [r0 + r3], m4
2078
+
2079
+    lea                 r0, [r0 + r1 * 4]
2080
+
2081
+    palignr             m5, m3, m6, 8
2082
+    pshufb              m5, m7
2083
+    pmaddubsw           m4, m5, m0
2084
+    pmaddubsw           m5, m1
2085
+    pmulhrsw            m4, m2
2086
+    pmulhrsw            m5, m2
2087
+    packuswb            m4, m5
2088
+    movu                [r0], m4
2089
+
2090
+    palignr             m5, m3, m6, 9
2091
+    pshufb              m5, m7
2092
+    pmaddubsw           m4, m5, m0
2093
+    pmaddubsw           m5, m1
2094
+    pmulhrsw            m4, m2
2095
+    pmulhrsw            m5, m2
2096
+    packuswb            m4, m5
2097
+    movu                [r0 + r1], m4
2098
+
2099
+    palignr             m5, m3, m6, 10
2100
+    pshufb              m5, m7
2101
+    pmaddubsw           m4, m5, m0
2102
+    pmaddubsw           m5, m1
2103
+    pmulhrsw            m4, m2
2104
+    pmulhrsw            m5, m2
2105
+    packuswb            m4, m5
2106
+    movu                [r0 + r1*2], m4
2107
+
2108
+    palignr             m5, m3, m6, 11
2109
+    pshufb              m5, m7
2110
+    pmaddubsw           m4, m5, m0
2111
+    pmaddubsw           m5, m1
2112
+    pmulhrsw            m4, m2
2113
+    pmulhrsw            m5, m2
2114
+    packuswb            m4, m5
2115
+    movu                [r0 + r3], m4
2116
+
2117
+    lea                 r0, [r0 + r1 * 4]
2118
+
2119
+    palignr             m5, m3, m6, 12
2120
+    pshufb              m5, m7
2121
+    pmaddubsw           m4, m5, m0
2122
+    pmaddubsw           m5, m1
2123
+    pmulhrsw            m4, m2
2124
+    pmulhrsw            m5, m2
2125
+    packuswb            m4, m5
2126
+    movu                [r0], m4
2127
+
2128
+    palignr             m5, m3, m6, 13
2129
+    pshufb              m5, m7
2130
+    pmaddubsw           m4, m5, m0
2131
+    pmaddubsw           m5, m1
2132
+    pmulhrsw            m4, m2
2133
+    pmulhrsw            m5, m2
2134
+    packuswb            m4, m5
2135
+    movu                [r0 + r1], m4
2136
+
2137
+    palignr             m5, m3, m6, 14
2138
+    pshufb              m5, m7
2139
+    pmaddubsw           m4, m5, m0
2140
+    pmaddubsw           m5, m1
2141
+    pmulhrsw            m4, m2
2142
+    pmulhrsw            m5, m2
2143
+    packuswb            m4, m5
2144
+    movu                [r0 + r1*2], m4
2145
+
2146
+    palignr             m5, m3, m6, 15
2147
+    pshufb              m5, m7
2148
+    pmaddubsw           m4, m5, m0
2149
+    pmaddubsw           m5, m1
2150
+    pmulhrsw            m4, m2
2151
+    pmulhrsw            m5, m2
2152
+    packuswb            m4, m5
2153
+    movu                [r0 + r3], m4
2154
+    RET
2155
+
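Mode 9 (angle 2) avoids the transpose machinery altogether: in a horizontal mode the 1/32-pel fraction depends only on the output column, so one broadcast coefficient row (angHor_tab_9) serves every line, ang32_shuf_mode9 forms the overlapping byte pairs, and the growing palignr count simply steps one reference byte per row down the left column (read from r2 + mmsize*2 + 1). A hedged scalar equivalent:

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar model of mode 9 (angle 2); left[0] is the first left neighbour. */
    static void ang32_mode9(uint8_t *dst, ptrdiff_t stride, const uint8_t *left)
    {
        for (int y = 0; y < 32; y++)
            for (int x = 0; x < 32; x++) {
                int pos = (x + 1) * 2, idx = pos >> 5, frac = pos & 31;
                int a = left[y + idx], b = left[y + idx + 1];
                dst[y * stride + x] =
                    (uint8_t)(((32 - frac) * a + frac * b + 16) >> 5);
            }
    }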
2156
+cglobal intra_pred_ang32_27, 3,5,6
2157
+    lea                 r3, [ang_table_avx2 + 32 * 16]
2158
+    lea                 r4, [r1 * 3]            ; r4 -> 3 * stride
2159
+    mova                m5, [pw_1024]
2160
+
2161
+    ; rows 0 to 7
2162
+    movu                m0, [r2 +  1]           ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
2163
+    movu                m1, [r2 +  2]           ; [33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
2164
+    movu                m3, [r2 + 17]           ; [48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]
2165
+    movu                m4, [r2 + 18]           ; [49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18]
2166
+
2167
+    punpckhbw           m2, m0, m1              ; [33 32 32 31 31 30 30 29 29 28 28 27 27 26 26 25 17 16 16 15 15 14 14 13 13 12 12 11 11 10 10  9]
2168
+    punpcklbw           m0, m1                  ; [25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17  9  8  8  7  7  6  6  5  5  4  4  3  3  2  2  1]
2169
+    punpcklbw           m3, m4                  ; [41 40 40 39 39 38 38 37 37 36 36 35 35 34 34 33 25 24 24 23 23 22 22 21 21 20 20 19 19 18 18 17]
2170
+
2171
+    pmaddubsw           m4, m0, [r3 - 14 * 32]  ; [2]
2172
+    pmulhrsw            m4, m5
2173
+    pmaddubsw           m1, m2, [r3 - 14 * 32]
2174
+    pmulhrsw            m1, m5
2175
+    packuswb            m4, m1
2176
+    movu                [r0], m4
2177
+
2178
+    pmaddubsw           m4, m0, [r3 - 12 * 32]  ; [4]
2179
+    pmulhrsw            m4, m5
2180
+    pmaddubsw           m1, m2, [r3 - 12 * 32]
2181
+    pmulhrsw            m1, m5
2182
+    packuswb            m4, m1
2183
+    movu                [r0 + r1], m4
2184
+
2185
+    pmaddubsw           m4, m0, [r3 - 10 * 32]  ; [6]
2186
+    pmulhrsw            m4, m5
2187
+    pmaddubsw           m1, m2, [r3 - 10 * 32]
2188
+    pmulhrsw            m1, m5
2189
+    packuswb            m4, m1
2190
+    movu                [r0 + r1*2], m4
2191
+
2192
+    pmaddubsw           m4, m0, [r3 - 8 * 32]   ; [8]
2193
+    pmulhrsw            m4, m5
2194
+    pmaddubsw           m1, m2, [r3 - 8 * 32]
2195
+    pmulhrsw            m1, m5
2196
+    packuswb            m4, m1
2197
+    movu                [r0 + r4], m4
2198
+
2199
+    lea                 r0, [r0 + r1 * 4]
2200
+
2201
+    pmaddubsw           m4, m0, [r3 - 6 * 32]   ; [10]
2202
+    pmulhrsw            m4, m5
2203
+    pmaddubsw           m1, m2, [r3 - 6 * 32]
2204
+    pmulhrsw            m1, m5
2205
+    packuswb            m4, m1
2206
+    movu                [r0], m4
2207
+
2208
+    pmaddubsw           m4, m0, [r3 - 4 * 32]   ; [12]
2209
+    pmulhrsw            m4, m5
2210
+    pmaddubsw           m1, m2, [r3 - 4 * 32]
2211
+    pmulhrsw            m1, m5
2212
+    packuswb            m4, m1
2213
+    movu                [r0 + r1], m4
2214
+
2215
+    pmaddubsw           m4, m0, [r3 - 2 * 32]   ; [14]
2216
+    pmulhrsw            m4, m5
2217
+    pmaddubsw           m1, m2, [r3 - 2 * 32]
2218
+    pmulhrsw            m1, m5
2219
+    packuswb            m4, m1
2220
+    movu                [r0 + r1*2], m4
2221
+
2222
+    pmaddubsw           m4, m0, [r3]            ; [16]
2223
+    pmulhrsw            m4, m5
2224
+    pmaddubsw           m1, m2, [r3]
2225
+    pmulhrsw            m1, m5
2226
+    packuswb            m4, m1
2227
+    movu                [r0 + r4], m4
2228
+
2229
+    lea                 r0, [r0 + r1 * 4]
2230
+
2231
+    ; rows 8 to 15
2232
+    pmaddubsw           m4, m0, [r3 + 2 * 32]   ; [18]
2233
+    pmulhrsw            m4, m5
2234
+    pmaddubsw           m1, m2, [r3 + 2 * 32]
2235
+    pmulhrsw            m1, m5
2236
+    packuswb            m4, m1
2237
+    movu                [r0], m4
2238
+
2239
+    pmaddubsw           m4, m0, [r3 + 4 * 32]   ; [20]
2240
+    pmulhrsw            m4, m5
2241
+    pmaddubsw           m1, m2, [r3 + 4 * 32]
2242
+    pmulhrsw            m1, m5
2243
+    packuswb            m4, m1
2244
+    movu                [r0 + r1], m4
2245
+
2246
+    pmaddubsw           m4, m0, [r3 + 6 * 32]   ; [22]
2247
+    pmulhrsw            m4, m5
2248
+    pmaddubsw           m1, m2, [r3 + 6 * 32]
2249
+    pmulhrsw            m1, m5
2250
+    packuswb            m4, m1
2251
+    movu                [r0 + r1*2], m4
2252
+
2253
+    pmaddubsw           m4, m0, [r3 + 8 * 32]   ; [24]
2254
+    pmulhrsw            m4, m5
2255
+    pmaddubsw           m1, m2, [r3 + 8 * 32]
2256
+    pmulhrsw            m1, m5
2257
+    packuswb            m4, m1
2258
+    movu                [r0 + r4], m4
2259
+
2260
+    lea                 r0, [r0 + r1 * 4]
2261
+
2262
+    pmaddubsw           m4, m0, [r3 + 10 * 32]  ; [26]
2263
+    pmulhrsw            m4, m5
2264
+    pmaddubsw           m1, m2, [r3 + 10 * 32]
2265
+    pmulhrsw            m1, m5
2266
+    packuswb            m4, m1
2267
+    movu                [r0], m4
2268
+
2269
+    pmaddubsw           m4, m0, [r3 + 12 * 32]  ; [28]
2270
+    pmulhrsw            m4, m5
2271
+    pmaddubsw           m1, m2, [r3 + 12 * 32]
2272
+    pmulhrsw            m1, m5
2273
+    packuswb            m4, m1
2274
+    movu                [r0 + r1], m4
2275
+
2276
+    pmaddubsw           m4, m0, [r3 + 14 * 32]  ; [30]
2277
+    pmulhrsw            m4, m5
2278
+    pmaddubsw           m1, m2, [r3 + 14 * 32]
2279
+    pmulhrsw            m1, m5
2280
+    packuswb            m4, m1
2281
+    movu                [r0 + r1*2], m4
2282
+
2283
+    palignr             m3, m2, 2
2284
+    palignr             m2, m0, 2
2285
+    movu                m1, [r2 + 2]            ; [0]
2286
+    movu                [r0 + r4], m1
2287
+
2288
+    lea                 r0, [r0 + r1 * 4]
2289
+
2290
+    ; rows 16 to 23
2291
+    pmaddubsw           m4, m2, [r3 - 14 * 32]  ; [2]
2292
+    pmulhrsw            m4, m5
2293
+    pmaddubsw           m1, m3, [r3 - 14 * 32]
2294
+    pmulhrsw            m1, m5
2295
+    packuswb            m4, m1
2296
+    movu                [r0], m4
2297
+
2298
+    pmaddubsw           m4, m2, [r3 - 12 * 32]  ; [4]
2299
+    pmulhrsw            m4, m5
2300
+    pmaddubsw           m1, m3, [r3 - 12 * 32]
2301
+    pmulhrsw            m1, m5
2302
+    packuswb            m4, m1
2303
+    movu                [r0 + r1], m4
2304
+
2305
+    pmaddubsw           m4, m2, [r3 - 10 * 32]  ; [6]
2306
+    pmulhrsw            m4, m5
2307
+    pmaddubsw           m1, m3, [r3 - 10 * 32]
2308
+    pmulhrsw            m1, m5
2309
+    packuswb            m4, m1
2310
+    movu                [r0 + r1*2], m4
2311
+
2312
+    pmaddubsw           m4, m2, [r3 - 8 * 32]   ; [8]
2313
+    pmulhrsw            m4, m5
2314
+    pmaddubsw           m1, m3, [r3 - 8 * 32]
2315
+    pmulhrsw            m1, m5
2316
+    packuswb            m4, m1
2317
+    movu                [r0 + r4], m4
2318
+
2319
+    lea                 r0, [r0 + r1 * 4]
2320
+
2321
+    pmaddubsw           m4, m2, [r3 - 6 * 32]   ; [10]
2322
+    pmulhrsw            m4, m5
2323
+    pmaddubsw           m1, m3, [r3 - 6 * 32]
2324
+    pmulhrsw            m1, m5
2325
+    packuswb            m4, m1
2326
+    movu                [r0], m4
2327
+
2328
+    pmaddubsw           m4, m2, [r3 - 4 * 32]   ; [12]
2329
+    pmulhrsw            m4, m5
2330
+    pmaddubsw           m1, m3, [r3 - 4 * 32]
2331
+    pmulhrsw            m1, m5
2332
+    packuswb            m4, m1
2333
+    movu                [r0 + r1],  m4
2334
+
2335
+    pmaddubsw           m4, m2, [r3 - 2 * 32]   ; [14]
2336
+    pmulhrsw            m4, m5
2337
+    pmaddubsw           m1, m3, [r3 - 2 * 32]
2338
+    pmulhrsw            m1, m5
2339
+    packuswb            m4, m1
2340
+    movu                [r0 + r1*2], m4
2341
+
2342
+    pmaddubsw           m4, m2, [r3]            ; [16]
2343
+    pmulhrsw            m4, m5
2344
+    pmaddubsw           m1, m3, [r3]
2345
+    pmulhrsw            m1, m5
2346
+    packuswb            m4, m1
2347
+    movu                [r0 + r4], m4
2348
+
2349
+    lea                 r0,         [r0 + r1 * 4]
2350
+
2351
+    ; rows 24 to 31
2352
+    pmaddubsw           m4, m2, [r3 + 2 * 32]   ; [18]
2353
+    pmulhrsw            m4, m5
2354
+    pmaddubsw           m1, m3, [r3 + 2 * 32]
2355
+    pmulhrsw            m1, m5
2356
+    packuswb            m4, m1
2357
+    movu                [r0], m4
2358
+
2359
+    pmaddubsw           m4, m2, [r3 + 4 * 32]   ; [20]
2360
+    pmulhrsw            m4, m5
2361
+    pmaddubsw           m1, m3, [r3 + 4 * 32]
2362
+    pmulhrsw            m1, m5
2363
+    packuswb            m4, m1
2364
+    movu                [r0 + r1],  m4
2365
+
2366
+    pmaddubsw           m4, m2, [r3 + 6 * 32]   ; [22]
2367
+    pmulhrsw            m4, m5
2368
+    pmaddubsw           m1, m3, [r3 + 6 * 32]
2369
+    pmulhrsw            m1, m5
2370
+    packuswb            m4, m1
2371
+    movu                [r0 + r1*2], m4
2372
+
2373
+    pmaddubsw           m4, m2, [r3 + 8 * 32]   ; [24]
2374
+    pmulhrsw            m4, m5
2375
+    pmaddubsw           m1, m3, [r3 + 8 * 32]
2376
+    pmulhrsw            m1, m5
2377
+    packuswb            m4, m1
2378
+    movu                [r0 + r4],  m4
2379
+
2380
+    lea                 r0, [r0 + r1 * 4]
2381
+
2382
+    pmaddubsw           m4, m2, [r3 + 10 * 32]  ; [26]
2383
+    pmulhrsw            m4, m5
2384
+    pmaddubsw           m1, m3, [r3 + 10 * 32]
2385
+    pmulhrsw            m1, m5
2386
+    packuswb            m4, m1
2387
+    movu                [r0], m4
2388
+
2389
+    pmaddubsw           m4, m2, [r3 + 12 * 32]  ; [28]
2390
+    pmulhrsw            m4, m5
2391
+    pmaddubsw           m1, m3, [r3 + 12 * 32]
2392
+    pmulhrsw            m1, m5
2393
+    packuswb            m4, m1
2394
+    movu                [r0 + r1],  m4
2395
+
2396
+    pmaddubsw           m2, [r3 + 14 * 32]      ; [30]
2397
+    pmulhrsw            m2, m5
2398
+    pmaddubsw           m3, [r3 + 14 * 32]
2399
+    pmulhrsw            m3, m5
2400
+    packuswb            m2, m3
2401
+    movu                [r0 + r1*2], m2
2402
+
2403
+    movu                m1, [r2 + 3]            ; [0]
2404
+    movu                [r0 + r4], m1
2405
+    RET
2406
+
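Mode 27 is the vertical twin of mode 9 (angle 2) and needs neither transpose nor shuffle: each output row is the above row filtered with a row-constant fraction growing by 2, so rows 15 and 31 reach fraction 0 and collapse to the plain copies stored from [r2 + 2] and [r2 + 3] above. Scalar sketch:

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar model of mode 27 (angle 2); above[1] is the first pixel
       above the block, matching the [r2 + 1] load. */
    static void ang32_mode27(uint8_t *dst, ptrdiff_t stride, const uint8_t *above)
    {
        for (int y = 0; y < 32; y++) {
            int pos = (y + 1) * 2, idx = pos >> 5, frac = pos & 31;
            for (int x = 0; x < 32; x++) {
                int a = above[idx + x + 1], b = above[idx + x + 2];
                dst[y * stride + x] =
                    (uint8_t)(((32 - frac) * a + frac * b + 16) >> 5);
            }
        }
    }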
2407
+cglobal intra_pred_ang32_10, 5,5,4
2408
+    pxor                m0, m0
2409
+    mova                m1, [pb_1]
2410
+    lea                 r4, [r1 * 3]
2411
+
2412
+    vbroadcasti128      m2, [r2 + mmsize*2 + 1]
2413
+
2414
+    pshufb              m3, m2, m0
2415
+    movu                [r0], m3
2416
+    paddb               m0, m1
2417
+    pshufb              m3, m2, m0
2418
+    movu                [r0 + r1], m3
2419
+    paddb               m0, m1
2420
+    pshufb              m3, m2, m0
2421
+    movu                [r0 + r1 * 2], m3
2422
+    paddb               m0, m1
2423
+    pshufb              m3, m2, m0
2424
+    movu                [r0 + r4], m3
2425
+
2426
+    lea                 r0, [r0 + r1 * 4]
2427
+
2428
+    paddb               m0, m1
2429
+    pshufb              m3, m2, m0
2430
+    movu                [r0], m3
2431
+    paddb               m0, m1
2432
+    pshufb              m3, m2, m0
2433
+    movu                [r0 + r1], m3
2434
+    paddb               m0, m1
2435
+    pshufb              m3, m2, m0
2436
+    movu                [r0 + r1 * 2], m3
2437
+    paddb               m0, m1
2438
+    pshufb              m3, m2, m0
2439
+    movu                [r0 + r4], m3
2440
+
2441
+    lea                 r0, [r0 + r1 * 4]
2442
+
2443
+    paddb               m0, m1
2444
+    pshufb              m3, m2, m0
2445
+    movu                [r0], m3
2446
+    paddb               m0, m1
2447
+    pshufb              m3, m2, m0
2448
+    movu                [r0 + r1], m3
2449
+    paddb               m0, m1
2450
+    pshufb              m3, m2, m0
2451
+    movu                [r0 + r1 * 2], m3
2452
+    paddb               m0, m1
2453
+    pshufb              m3, m2, m0
2454
+    movu                [r0 + r4], m3
2455
+
2456
+    lea                 r0, [r0 + r1 * 4]
2457
+
2458
+    paddb               m0, m1
2459
+    pshufb              m3, m2, m0
2460
+    movu                [r0], m3
2461
+    paddb               m0, m1
2462
+    pshufb              m3, m2, m0
2463
+    movu                [r0 + r1], m3
2464
+    paddb               m0, m1
2465
+    pshufb              m3, m2, m0
2466
+    movu                [r0 + r1 * 2], m3
2467
+    paddb               m0, m1
2468
+    pshufb              m3, m2, m0
2469
+    movu                [r0 + r4], m3
2470
+
2471
+    lea                 r0, [r0 + r1 * 4]
2472
+    pxor                m0, m0
2473
+    vbroadcasti128      m2, [r2 + mmsize*2 + mmsize/2 + 1]
2474
+
2475
+    pshufb              m3, m2, m0
2476
+    movu                [r0], m3
2477
+    paddb               m0, m1
2478
+    pshufb              m3, m2, m0
2479
+    movu                [r0 + r1], m3
2480
+    paddb               m0, m1
2481
+    pshufb              m3, m2, m0
2482
+    movu                [r0 + r1 * 2], m3
2483
+    paddb               m0, m1
2484
+    pshufb              m3, m2, m0
2485
+    movu                [r0 + r4], m3
2486
+
2487
+    lea                 r0, [r0 + r1 * 4]
2488
+
2489
+    paddb               m0, m1
2490
+    pshufb              m3, m2, m0
2491
+    movu                [r0], m3
2492
+    paddb               m0, m1
2493
+    pshufb              m3, m2, m0
2494
+    movu                [r0 + r1], m3
2495
+    paddb               m0, m1
2496
+    pshufb              m3, m2, m0
2497
+    movu                [r0 + r1 * 2], m3
2498
+    paddb               m0, m1
2499
+    pshufb              m3, m2, m0
2500
+    movu                [r0 + r4], m3
2501
+
2502
+    lea                 r0, [r0 + r1 * 4]
2503
+
2504
+    paddb               m0, m1
2505
+    pshufb              m3, m2, m0
2506
+    movu                [r0], m3
2507
+    paddb               m0, m1
2508
+    pshufb              m3, m2, m0
2509
+    movu                [r0 + r1], m3
2510
+    paddb               m0, m1
2511
+    pshufb              m3, m2, m0
2512
+    movu                [r0 + r1 * 2], m3
2513
+    paddb               m0, m1
2514
+    pshufb              m3, m2, m0
2515
+    movu                [r0 + r4], m3
2516
+
2517
+    lea                 r0, [r0 + r1 * 4]
2518
+
2519
+    paddb               m0, m1
2520
+    pshufb              m3, m2, m0
2521
+    movu                [r0], m3
2522
+    paddb               m0, m1
2523
+    pshufb              m3, m2, m0
2524
+    movu                [r0 + r1], m3
2525
+    paddb               m0, m1
2526
+    pshufb              m3, m2, m0
2527
+    movu                [r0 + r1 * 2], m3
2528
+    paddb               m0, m1
2529
+    pshufb              m3, m2, m0
2530
+    movu                [r0 + r4], m3
2531
+    RET
2532
+
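Mode 10 is the pure horizontal predictor: row y is the y-th left neighbour replicated, done here with pshufb broadcast masks driven by the pxor/pb_1/paddb index counter; no filtering is involved, since HEVC's horizontal edge smoothing does not apply to 32x32 blocks. The whole routine reduces to:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Scalar model of mode 10; left[0] is the first left neighbour
       (r2 + mmsize*2 + 1 in the asm). */
    static void ang32_mode10(uint8_t *dst, ptrdiff_t stride, const uint8_t *left)
    {
        for (int y = 0; y < 32; y++)
            memset(dst + y * stride, left[y], 32); /* replicate left[y] */
    }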
2533
+cglobal intra_pred_ang32_11, 3,4,8
2534
+    vbroadcasti128      m0, [angHor_tab_11]
2535
+    vbroadcasti128      m1, [angHor_tab_11 + mmsize/2]
2536
+    mova                m2, [pw_1024]
2537
+    mova                m7, [ang32_shuf_mode11]
2538
+    lea                 r3, [r1 * 3]
2539
+
2540
+    ; prepare for [16 0 -1 -2 ...]
2541
+    movu               xm3, [r2 + mmsize*2 -  1]
2542
+    vbroadcasti128      m6, [r2 + mmsize*2 + 15]
2543
+
2544
+    pinsrb             xm3, [r2 +  0], 1
2545
+    pinsrb             xm3, [r2 + 16], 0
2546
+    vinserti128         m3, m3, xm3, 1          ; [16  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 16  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
2547
+
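This prologue builds the extended reference that mode 11's negative angle (-2) needs: the left column must be extended one sample upward, and with invAngle = 4096 only ref[-1] is ever reached, so byte 0 is the sample projected from the above row ([r2 + 16]) and byte 1 the top-left corner ([r2 + 0]), ahead of the left pixels starting at r2 + 65. A hedged scalar sketch under that layout (the caller must make ref[-1] addressable, e.g. ref = &buf[1]):

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar model of mode 11 (angle -2): ref[-1] = projected above
       sample, ref[0] = top-left corner, ref[1..32] = left column. */
    static void ang32_mode11(uint8_t *dst, ptrdiff_t stride, const uint8_t *ref)
    {
        for (int y = 0; y < 32; y++)
            for (int x = 0; x < 32; x++) {
                int pos  = (x + 1) * -2;
                int idx  = pos >> 5;  /* arithmetic shift: -1 or -2 */
                int frac = pos & 31;  /* two's-complement mod 32    */
                int a = ref[y + idx + 1], b = ref[y + idx + 2];
                dst[y * stride + x] =
                    (uint8_t)(((32 - frac) * a + frac * b + 16) >> 5);
            }
    }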
2548
+    pshufb              m5, m3, m7              ; [ 0  1  0  1  0  1  0  1  0  1  0  1  0  1  0  1 16  0 16  0 16  0 16  0 16  0 16  0 16  0 16  0]
2549
+    pmaddubsw           m4, m5, m0
2550
+    pmaddubsw           m5, m1
2551
+    pmulhrsw            m4, m2
2552
+    pmulhrsw            m5, m2
2553
+    packuswb            m4, m5
2554
+    movu                [r0], m4
2555
+
2556
+    palignr             m5, m6, m3, 1
2557
+    pshufb              m5, m7
2558
+    pmaddubsw           m4, m5, m0
2559
+    pmaddubsw           m5, m1
2560
+    pmulhrsw            m4, m2
2561
+    pmulhrsw            m5, m2
2562
+    packuswb            m4, m5
2563
+    movu                [r0 + r1], m4
2564
+
2565
+    palignr             m5, m6, m3, 2
2566
+    pshufb              m5, m7
2567
+    pmaddubsw           m4, m5, m0
2568
+    pmaddubsw           m5, m1
2569
+    pmulhrsw            m4, m2
2570
+    pmulhrsw            m5, m2
2571
+    packuswb            m4, m5
2572
+    movu                [r0 + r1 * 2], m4
2573
+
2574
+    palignr             m5, m6, m3, 3
2575
+    pshufb              m5, m7
2576
+    pmaddubsw           m4, m5, m0
2577
+    pmaddubsw           m5, m1
2578
+    pmulhrsw            m4, m2
2579
+    pmulhrsw            m5, m2
2580
+    packuswb            m4, m5
2581
+    movu                [r0 + r3], m4
2582
+
2583
+    lea                 r0, [r0 + r1 * 4]
2584
+
2585
+    palignr             m5, m6, m3, 4
2586
+    pshufb              m5, m7
2587
+    pmaddubsw           m4, m5, m0
2588
+    pmaddubsw           m5, m1
2589
+    pmulhrsw            m4, m2
2590
+    pmulhrsw            m5, m2
2591
+    packuswb            m4, m5
2592
+    movu                [r0], m4
2593
+
2594
+    palignr             m5, m6, m3, 5
2595
+    pshufb              m5, m7
2596
+    pmaddubsw           m4, m5, m0
2597
+    pmaddubsw           m5, m1
2598
+    pmulhrsw            m4, m2
2599
+    pmulhrsw            m5, m2
2600
+    packuswb            m4, m5
2601
+    movu                [r0 + r1], m4
2602
+
2603
+    palignr             m5, m6, m3, 6
2604
+    pshufb              m5, m7
2605
+    pmaddubsw           m4, m5, m0
2606
+    pmaddubsw           m5, m1
2607
+    pmulhrsw            m4, m2
2608
+    pmulhrsw            m5, m2
2609
+    packuswb            m4, m5
2610
+    movu                [r0 + r1 * 2], m4
2611
+
2612
+    palignr             m5, m6, m3, 7
2613
+    pshufb              m5, m7
2614
+    pmaddubsw           m4, m5, m0
2615
+    pmaddubsw           m5, m1
2616
+    pmulhrsw            m4, m2
2617
+    pmulhrsw            m5, m2
2618
+    packuswb            m4, m5
2619
+    movu                [r0 + r3], m4
2620
+
2621
+    lea                 r0, [r0 + r1 * 4]
2622
+
2623
+    palignr             m5, m6, m3, 8
2624
+    pshufb              m5, m7
2625
+    pmaddubsw           m4, m5, m0
2626
+    pmaddubsw           m5, m1
2627
+    pmulhrsw            m4, m2
2628
+    pmulhrsw            m5, m2
2629
+    packuswb            m4, m5
2630
+    movu                [r0], m4
2631
+
2632
+    palignr             m5, m6, m3, 9
2633
+    pshufb              m5, m7
2634
+    pmaddubsw           m4, m5, m0
2635
+    pmaddubsw           m5, m1
2636
+    pmulhrsw            m4, m2
2637
+    pmulhrsw            m5, m2
2638
+    packuswb            m4, m5
2639
+    movu                [r0 + r1], m4
2640
+
2641
+    palignr             m5, m6, m3, 10
2642
+    pshufb              m5, m7
2643
+    pmaddubsw           m4, m5, m0
2645
+    pmaddubsw           m5, m1
2646
+    pmulhrsw            m4, m2
2647
+    pmulhrsw            m5, m2
2648
+    packuswb            m4, m5
2649
+    movu                [r0 + r1 * 2], m4
2650
+
2651
+    palignr             m5, m6, m3, 11
2652
+    pshufb              m5, m7
2653
+    pmaddubsw           m4, m5, m0
2654
+    pmaddubsw           m5, m1
2655
+    pmulhrsw            m4, m2
2656
+    pmulhrsw            m5, m2
2657
+    packuswb            m4, m5
2658
+    movu                [r0 + r3], m4
2659
+
2660
+    lea                 r0, [r0 + r1 * 4]
2661
+
2662
+    palignr             m5, m6, m3, 12
2663
+    pshufb              m5, m7
2664
+    pmaddubsw           m4, m5, m0
2665
+    pmaddubsw           m5, m1
2666
+    pmulhrsw            m4, m2
2667
+    pmulhrsw            m5, m2
2668
+    packuswb            m4, m5
2669
+    movu                [r0], m4
2670
+
2671
+    palignr             m5, m6, m3, 13
2672
+    pshufb              m5, m7
2673
+    pmaddubsw           m4, m5, m0
2674
+    pmaddubsw           m5, m1
2675
+    pmulhrsw            m4, m2
2676
+    pmulhrsw            m5, m2
2677
+    packuswb            m4, m5
2678
+    movu                [r0 + r1], m4
2679
+
2680
+    palignr             m5, m6, m3, 14
2681
+    pshufb              m5, m7
2682
+    pmaddubsw           m4, m5, m0
2683
+    pmaddubsw           m5, m1
2684
+    pmulhrsw            m4, m2
2685
+    pmulhrsw            m5, m2
2686
+    packuswb            m4, m5
2687
+    movu                [r0 + r1 * 2], m4
2688
+
2689
+    palignr             m5, m6, m3, 15
2690
+    pshufb              m5, m7
2691
+    pmaddubsw           m4, m5, m0
2692
+    pmaddubsw           m5, m1
2693
+    pmulhrsw            m4, m2
2694
+    pmulhrsw            m5, m2
2695
+    packuswb            m4, m5
2696
+    movu                [r0 + r3], m4
2697
+
2698
+    lea                 r0, [r0 + r1 * 4]
2699
+
2700
+    mova                m3, m6
2701
+    vbroadcasti128      m6, [r2 + mmsize*2 + 15 + 16]
2702
+    pshufb              m5, m3, m7
2703
+    pmaddubsw           m4, m5, m0
2704
+    pmaddubsw           m5, m1
2705
+    pmulhrsw            m4, m2
2706
+    pmulhrsw            m5, m2
2707
+    packuswb            m4, m5
2708
+    movu                [r0], m4
2709
+
2710
+    palignr             m5, m6, m3, 1
2711
+    pshufb              m5, m7
2712
+    pmaddubsw           m4, m5, m0
2713
+    pmaddubsw           m5, m1
2714
+    pmulhrsw            m4, m2
2715
+    pmulhrsw            m5, m2
2716
+    packuswb            m4, m5
2717
+    movu                [r0 + r1], m4
2718
+
2719
+    palignr             m5, m6, m3, 2
2720
+    pshufb              m5, m7
2721
+    pmaddubsw           m4, m5, m0
2722
+    pmaddubsw           m5, m1
2723
+    pmulhrsw            m4, m2
2724
+    pmulhrsw            m5, m2
2725
+    packuswb            m4, m5
2726
+    movu                [r0 + r1 * 2], m4
2727
+
2728
+    palignr             m5, m6, m3, 3
2729
+    pshufb              m5, m7
2730
+    pmaddubsw           m4, m5, m0
2731
+    pmaddubsw           m5, m1
2732
+    pmulhrsw            m4, m2
2733
+    pmulhrsw            m5, m2
2734
+    packuswb            m4, m5
2735
+    movu                [r0 + r3], m4
2736
+
2737
+    lea                 r0, [r0 + r1 * 4]
2738
+
2739
+    palignr             m5, m6, m3, 4
2740
+    pshufb              m5, m7
2741
+    pmaddubsw           m4, m5, m0
2742
+    pmaddubsw           m5, m1
2743
+    pmulhrsw            m4, m2
2744
+    pmulhrsw            m5, m2
2745
+    packuswb            m4, m5
2746
+    movu                [r0], m4
2747
+
2748
+    palignr             m5, m6, m3, 5
2749
+    pshufb              m5, m7
2750
+    pmaddubsw           m4, m5, m0
2751
+    pmaddubsw           m5, m1
2752
+    pmulhrsw            m4, m2
2753
+    pmulhrsw            m5, m2
2754
+    packuswb            m4, m5
2755
+    movu                [r0 + r1], m4
2756
+
2757
+    palignr             m5, m6, m3, 6
2758
+    pshufb              m5, m7
2759
+    pmaddubsw           m4, m5, m0
2760
+    pmaddubsw           m5, m1
2761
+    pmulhrsw            m4, m2
2762
+    pmulhrsw            m5, m2
2763
+    packuswb            m4, m5
2764
+    movu                [r0 + r1 * 2], m4
2765
+
2766
+    palignr             m5, m6, m3, 7
2767
+    pshufb              m5, m7
2768
+    pmaddubsw           m4, m5, m0
2769
+    pmaddubsw           m5, m1
2770
+    pmulhrsw            m4, m2
2771
+    pmulhrsw            m5, m2
2772
+    packuswb            m4, m5
2773
+    movu                [r0 + r3], m4
2774
+
2775
+    lea                 r0, [r0 + r1 * 4]
2776
+
2777
+    palignr             m5, m6, m3, 8
2778
+    pshufb              m5, m7
2779
+    pmaddubsw           m4, m5, m0
2780
+    pmaddubsw           m5, m1
2781
+    pmulhrsw            m4, m2
2782
+    pmulhrsw            m5, m2
2783
+    packuswb            m4, m5
2784
+    movu                [r0], m4
2785
+
2786
+    palignr             m5, m6, m3, 9
2787
+    pshufb              m5, m7
2788
+    pmaddubsw           m4, m5, m0
2789
+    pmaddubsw           m5, m1
2790
+    pmulhrsw            m4, m2
2791
+    pmulhrsw            m5, m2
2792
+    packuswb            m4, m5
2793
+    movu                [r0 + r1], m4
2794
+
2795
+    palignr             m5, m6, m3, 10
2796
+    pshufb              m5, m7
2797
+    pmaddubsw           m4, m5, m0
2798
+    pmaddubsw           m5, m1
2799
+    pmulhrsw            m4, m2
2800
+    pmulhrsw            m5, m2
2801
+    packuswb            m4, m5
2802
+    movu                [r0 + r1 * 2], m4
2803
+
2804
+    palignr             m5, m6, m3, 11
2805
+    pshufb              m5, m7
2806
+    pmaddubsw           m4, m5, m0
2807
+    pmaddubsw           m5, m1
2808
+    pmulhrsw            m4, m2
2809
+    pmulhrsw            m5, m2
2810
+    packuswb            m4, m5
2811
+    movu                [r0 + r3], m4
2812
+
2813
+    lea                 r0, [r0 + r1 * 4]
2814
+
2815
+    palignr             m5, m6, m3, 12
2816
+    pshufb              m5, m7
2817
+    pmaddubsw           m4, m5, m0
2818
+    pmaddubsw           m5, m1
2819
+    pmulhrsw            m4, m2
2820
+    pmulhrsw            m5, m2
2821
+    packuswb            m4, m5
2822
+    movu                [r0], m4
2823
+
2824
+    palignr             m5, m6, m3, 13
2825
+    pshufb              m5, m7
2826
+    pmaddubsw           m4, m5, m0
2827
+    pmaddubsw           m5, m1
2828
+    pmulhrsw            m4, m2
2829
+    pmulhrsw            m5, m2
2830
+    packuswb            m4, m5
2831
+    movu                [r0 + r1], m4
2832
+
2833
+    palignr             m5, m6, m3, 14
2834
+    pshufb              m5, m7
2835
+    pmaddubsw           m4, m5, m0
2836
+    pmaddubsw           m5, m1
2837
+    pmulhrsw            m4, m2
2838
+    pmulhrsw            m5, m2
2839
+    packuswb            m4, m5
2840
+    movu                [r0 + r1 * 2], m4
2841
+
2842
+    palignr             m5, m6, m3, 15
2843
+    pshufb              m5, m7
2844
+    pmaddubsw           m4, m5, m0
2845
+    pmaddubsw           m5, m1
2846
+    pmulhrsw            m4, m2
2847
+    pmulhrsw            m5, m2
2848
+    packuswb            m4, m5
2849
+    movu                [r0 + r3], m4
2850
+    RET
2851
+
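+    ; Each unrolled row in the 32x32 angular kernels below evaluates the HEVC
+    ; interpolation pred[x] = ((32 - f) * ref[i] + f * ref[i+1] + 16) >> 5:
+    ; pshufb gathers (ref[i], ref[i+1]) byte pairs, pmaddubsw multiplies them
+    ; by packed (32 - f, f) weight pairs, pmulhrsw against pw_1024 performs
+    ; the rounding shift ((v * 1024 + 0x4000) >> 15 == (v + 16) >> 5 for the
+    ; non-negative products here), and packuswb saturates back to bytes.
+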
2852
+cglobal intra_pred_ang32_25, 3,5,7
2853
+    lea                 r3, [ang_table_avx2 + 32 * 16]
2854
+    lea                 r4, [r1 * 3]
2855
+    mova                m5, [pw_1024]
2856
+
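+    ; mode 25 has intraPredAngle -2, so the interpolation fraction steps down
+    ; by 2 each row (30, 28, ..., 2, 0); rows 15 and 31 reduce to straight
+    ; copies of the (shifted) reference samples.
+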
2857
+    ; rows 0 to 7
2858
+    movu                m0, [r2 +  0]               ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0]
2859
+    movu                m1, [r2 +  1]               ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
2860
+
2861
+    pinsrb              xm3, [r2], 15
2862
+    pinsrb              xm3, [r2 + mmsize*2 + 16], 14
2863
+
2864
+    punpckhbw           m2, m0, m1                  ; [32 31 31 30 30 29 29 28 28 27 27 26 26 25 25 24 16 15 15 14 14 13 13 12 12 11 11 10 10  9  9  8]
2865
+    punpcklbw           m0, m1                      ; [24 23 23 22 22 21 21 20 20 19 19 18 18 17 17 16  8  7  7  6  6  5  5  4  4  3  3  2  2  1  1  0]
2866
+    vinserti128         m3, m3, xm2, 1              ; [16 15 15 14 14 13 13 12 12 11 11 10 10  9  9  8  0 16  x  x  x  x  x  x  x  x  x  x  x  x  x  x]
2867
+
2868
+    pmaddubsw           m4, m0, [r3 + 14 * 32]      ; [30]
2869
+    pmulhrsw            m4, m5
2870
+    pmaddubsw           m1, m2, [r3 + 14 * 32]
2871
+    pmulhrsw            m1, m5
2872
+    packuswb            m4, m1
2873
+    movu                [r0], m4
2874
+
2875
+    pmaddubsw           m4, m0, [r3 + 12 * 32]      ; [28]
2876
+    pmulhrsw            m4, m5
2877
+    pmaddubsw           m1, m2, [r3 + 12 * 32]
2878
+    pmulhrsw            m1, m5
2879
+    packuswb            m4, m1
2880
+    movu                [r0 + r1], m4
2881
+
2882
+    pmaddubsw           m4, m0, [r3 + 10 * 32]      ; [26]
2883
+    pmulhrsw            m4, m5
2884
+    pmaddubsw           m1, m2, [r3 + 10 * 32]
2885
+    pmulhrsw            m1, m5
2886
+    packuswb            m4, m1
2887
+    movu                [r0 + r1*2], m4
2888
+
2889
+    pmaddubsw           m4, m0, [r3 + 8 * 32]       ; [24]
2890
+    pmulhrsw            m4, m5
2891
+    pmaddubsw           m1, m2, [r3 + 8 * 32]
2892
+    pmulhrsw            m1, m5
2893
+    packuswb            m4, m1
2894
+    movu                [r0 + r4], m4
2895
+
2896
+    lea                 r0, [r0 + r1 * 4]
2897
+
2898
+    pmaddubsw           m4, m0, [r3 + 6 * 32]       ; [22]
2899
+    pmulhrsw            m4, m5
2900
+    pmaddubsw           m1, m2, [r3 + 6 * 32]
2901
+    pmulhrsw            m1, m5
2902
+    packuswb            m4, m1
2903
+    movu                [r0], m4
2904
+
2905
+    pmaddubsw           m4, m0, [r3 + 4 * 32]       ; [20]
2906
+    pmulhrsw            m4, m5
2907
+    pmaddubsw           m1, m2, [r3 + 4 * 32]
2908
+    pmulhrsw            m1, m5
2909
+    packuswb            m4, m1
2910
+    movu                [r0 + r1], m4
2911
+
2912
+    pmaddubsw           m4, m0, [r3 + 2 * 32]       ; [18]
2913
+    pmulhrsw            m4, m5
2914
+    pmaddubsw           m1, m2, [r3 + 2 * 32]
2915
+    pmulhrsw            m1, m5
2916
+    packuswb            m4, m1
2917
+    movu                [r0 + r1*2], m4
2918
+
2919
+    pmaddubsw           m4, m0, [r3]                ; [16]
2920
+    pmulhrsw            m4, m5
2921
+    pmaddubsw           m1, m2, [r3]
2922
+    pmulhrsw            m1, m5
2923
+    packuswb            m4, m1
2924
+    movu                [r0 + r4], m4
2925
+
2926
+    lea                 r0, [r0 + r1 * 4]
2927
+
2928
+    ; rows 8 to 15
2929
+    pmaddubsw           m4, m0, [r3 - 2 * 32]       ; [14]
2930
+    pmulhrsw            m4, m5
2931
+    pmaddubsw           m1, m2, [r3 - 2 * 32]
2932
+    pmulhrsw            m1, m5
2933
+    packuswb            m4, m1
2934
+    movu                [r0], m4
2935
+
2936
+    pmaddubsw           m4, m0, [r3 - 4 * 32]       ; [12]
2937
+    pmulhrsw            m4, m5
2938
+    pmaddubsw           m1, m2, [r3 - 4 * 32]
2939
+    pmulhrsw            m1, m5
2940
+    packuswb            m4, m1
2941
+    movu                [r0 + r1], m4
2942
+
2943
+    pmaddubsw           m4, m0, [r3 - 6 * 32]       ; [10]
2944
+    pmulhrsw            m4, m5
2945
+    pmaddubsw           m1, m2, [r3 - 6 * 32]
2946
+    pmulhrsw            m1, m5
2947
+    packuswb            m4, m1
2948
+    movu                [r0 + r1*2], m4
2949
+
2950
+    pmaddubsw           m4, m0, [r3 - 8 * 32]       ; [8]
2951
+    pmulhrsw            m4, m5
2952
+    pmaddubsw           m1, m2, [r3 - 8 * 32]
2953
+    pmulhrsw            m1, m5
2954
+    packuswb            m4, m1
2955
+    movu                [r0 + r4], m4
2956
+
2957
+    lea                 r0, [r0 + r1 * 4]
2958
+
2959
+    pmaddubsw           m4, m0, [r3 - 10 * 32]      ; [6]
2960
+    pmulhrsw            m4, m5
2961
+    pmaddubsw           m1, m2, [r3 - 10 * 32]
2962
+    pmulhrsw            m1, m5
2963
+    packuswb            m4, m1
2964
+    movu                [r0], m4
2965
+
2966
+    pmaddubsw           m4, m0, [r3 - 12 * 32]      ; [4]
2967
+    pmulhrsw            m4, m5
2968
+    pmaddubsw           m1, m2, [r3 - 12 * 32]
2969
+    pmulhrsw            m1, m5
2970
+    packuswb            m4, m1
2971
+    movu                [r0 + r1], m4
2972
+
2973
+    pmaddubsw           m4, m0, [r3 - 14 * 32]      ; [2]
2974
+    pmulhrsw            m4, m5
2975
+    pmaddubsw           m1, m2, [r3 - 14 * 32]
2976
+    pmulhrsw            m1, m5
2977
+    packuswb            m4, m1
2978
+    movu                [r0 + r1 * 2], m4
2979
+
2980
+    movu                m1, [r2]                    ; [0]
2981
+    movu                [r0 + r4],  m1
2982
+
2983
+    lea                 r0, [r0 + r1 * 4]
2984
+    palignr             m2, m0, 14
2985
+    palignr             m0, m3, 14
2986
+
2987
+    ; rows 16 to 23
2988
+    pmaddubsw           m4, m0, [r3 + 14 * 32]      ; [30]
2989
+    pmulhrsw            m4, m5
2990
+    pmaddubsw           m1, m2, [r3 + 14 * 32]
2991
+    pmulhrsw            m1, m5
2992
+    packuswb            m4, m1
2993
+    movu                [r0], m4
2994
+
2995
+    pmaddubsw           m4, m0, [r3 + 12 * 32]      ; [28]
2996
+    pmulhrsw            m4, m5
2997
+    pmaddubsw           m1, m2, [r3 + 12 * 32]
2998
+    pmulhrsw            m1, m5
2999
+    packuswb            m4, m1
3000
+    movu                [r0 + r1], m4
3001
+
3002
+    pmaddubsw           m4, m0, [r3 + 10 * 32]      ; [26]
3003
+    pmulhrsw            m4, m5
3004
+    pmaddubsw           m1, m2, [r3 + 10 * 32]
3005
+    pmulhrsw            m1, m5
3006
+    packuswb            m4, m1
3007
+    movu                [r0 + r1*2], m4
3008
+
3009
+    pmaddubsw           m4, m0, [r3 + 8 * 32]       ; [24]
3010
+    pmulhrsw            m4, m5
3011
+    pmaddubsw           m1, m2, [r3 + 8 * 32]
3012
+    pmulhrsw            m1, m5
3013
+    packuswb            m4, m1
3014
+    movu                [r0 + r4], m4
3015
+
3016
+    lea                 r0, [r0 + r1 * 4]
3017
+
3018
+    pmaddubsw           m4, m0, [r3 + 6 * 32]       ; [22]
3019
+    pmulhrsw            m4, m5
3020
+    pmaddubsw           m1, m2, [r3 + 6 * 32]
3021
+    pmulhrsw            m1, m5
3022
+    packuswb            m4, m1
3023
+    movu                [r0], m4
3024
+
3025
+    pmaddubsw           m4, m0, [r3 + 4 * 32]       ; [20]
3026
+    pmulhrsw            m4, m5
3027
+    pmaddubsw           m1, m2, [r3 + 4 * 32]
3028
+    pmulhrsw            m1, m5
3029
+    packuswb            m4, m1
3030
+    movu                [r0 + r1], m4
3031
+
3032
+    pmaddubsw           m4, m0, [r3 + 2 * 32]       ; [18]
3033
+    pmulhrsw            m4, m5
3034
+    pmaddubsw           m1, m2, [r3 + 2 * 32]
3035
+    pmulhrsw            m1, m5
3036
+    packuswb            m4, m1
3037
+    movu                [r0 + r1*2], m4
3038
+
3039
+    pmaddubsw           m4, m0, [r3]                ; [16]
3040
+    pmulhrsw            m4, m5
3041
+    pmaddubsw           m1, m2, [r3]
3042
+    pmulhrsw            m1, m5
3043
+    packuswb            m4, m1
3044
+    movu                [r0 + r4], m4
3045
+
3046
+    lea                 r0, [r0 + r1 * 4]
3047
+
3048
+    ; rows 24 to 31
3049
+    pmaddubsw           m4, m0, [r3 - 2 * 32]       ; [14]
3050
+    pmulhrsw            m4, m5
3051
+    pmaddubsw           m1, m2, [r3 - 2 * 32]
3052
+    pmulhrsw            m1, m5
3053
+    packuswb            m4, m1
3054
+    movu                [r0], m4
3055
+
3056
+    pmaddubsw           m4, m0, [r3 - 4 * 32]       ; [12]
3057
+    pmulhrsw            m4, m5
3058
+    pmaddubsw           m1, m2, [r3 - 4 * 32]
3059
+    pmulhrsw            m1, m5
3060
+    packuswb            m4, m1
3061
+    movu                [r0 + r1], m4
3062
+
3063
+    pmaddubsw           m4, m0, [r3 - 6 * 32]       ; [10]
3064
+    pmulhrsw            m4, m5
3065
+    pmaddubsw           m1, m2, [r3 - 6 * 32]
3066
+    pmulhrsw            m1, m5
3067
+    packuswb            m4, m1
3068
+    movu                [r0 + r1 * 2], m4
3069
+
3070
+    pmaddubsw           m4, m0, [r3 - 8 * 32]       ; [8]
3071
+    pmulhrsw            m4, m5
3072
+    pmaddubsw           m1, m2, [r3 - 8 * 32]
3073
+    pmulhrsw            m1, m5
3074
+    packuswb            m4, m1
3075
+    movu                [r0 + r4], m4
3076
+
3077
+    lea                 r0, [r0 + r1 * 4]
3078
+
3079
+    pmaddubsw           m4, m0, [r3 - 10 * 32]      ; [6]
3080
+    pmulhrsw            m4, m5
3081
+    pmaddubsw           m1, m2, [r3 - 10 * 32]
3082
+    pmulhrsw            m1, m5
3083
+    packuswb            m4, m1
3084
+    movu                [r0], m4
3085
+
3086
+    pmaddubsw           m4, m0, [r3 - 12 * 32]      ; [4]
3087
+    pmulhrsw            m4, m5
3088
+    pmaddubsw           m1, m2, [r3 - 12 * 32]
3089
+    pmulhrsw            m1, m5
3090
+    packuswb            m4, m1
3091
+    movu                [r0 + r1], m4
3092
+
3093
+    pmaddubsw           m0, [r3 - 14 * 32]          ; [2]
3094
+    pmulhrsw            m0, m5
3095
+    pmaddubsw           m2, [r3 - 14 * 32]
3096
+    pmulhrsw            m2, m5
3097
+    packuswb            m0, m2
3098
+    movu                [r0 + r1*2], m0
3099
+
3100
+    movu                m1, [r2 + 1]                ; [0]
3101
+    palignr             m1, m3, 14
3102
+    movu                [r0 + r4], m1
3103
+    RET
3104
+
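+    ; modes 12 and 13 below are the horizontal counterparts of modes 24/23:
+    ; the weights are fixed per column (the ang32_fact_mode12/13 tables are
+    ; reused every row) while the reference window slides one byte per row
+    ; through the palignr chain.
+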
3105
+cglobal intra_pred_ang32_12, 3,4,9
3106
+    movu                m0, [ang32_fact_mode12]
3107
+    movu                m1, [ang32_fact_mode12 + mmsize]
3108
+    mova                m2, [pw_1024]
3109
+    mova                m7, [ang32_shuf_mode12]
3110
+    mova                m8, [ang32_shuf_mode12 + mmsize]
3111
+    lea                 r3, [r1 * 3]
3112
+
3113
+    ; prepare for [26, 19, 13,  6,  0, -1, -2....]
3114
+
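+    ; intraPredAngle -5 gives invAngle 8192/5 ~= 1638, so the projected side
+    ; samples land at offsets (k * 1638 + 128) >> 8 = 6, 13, 19, 26; the
+    ; pinsrb chain below packs them in front of ref[0].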
3115
+    movu               xm4, [r2 + mmsize*2 - 4]
3116
+    vbroadcasti128      m6, [r2 + mmsize*2 + 12]
3117
+
3118
+    pinsrb             xm4, [r2 +  0], 4
3119
+    pinsrb             xm4, [r2 +  6], 3
3120
+    pinsrb             xm4, [r2 + 13], 2
3121
+    pinsrb             xm4, [r2 + 19], 1
3122
+    pinsrb             xm4, [r2 + 26], 0
3123
+    vinserti128         m3, m4, xm4, 1      ; [26, 19, 13,  6,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 26, 19, 13,  6,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]
3124
+
3125
+    pshufb              m4, m3, m7          ; [ 0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  6,  0,  6,  0, 13,  6, 13,  6, 13,  6, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13]
3126
+    pshufb              m5, m3, m8          ; [ 6,  0,  6,  0,  6,  0,  6,  0, 13,  6, 13,  6, 13,  6, 13,  6, 19, 13, 16, 19, 16, 19, 16, 19, 16, 19, 16, 19, 16, 19, 16, 19]
3127
+    pmaddubsw           m4, m0
3128
+    pmaddubsw           m5, m1
3129
+    pmulhrsw            m4, m2
3130
+    pmulhrsw            m5, m2
3131
+    packuswb            m4, m5
3132
+    movu                [r0], m4
3133
+
3134
+    palignr             m4, m6, m3, 1
3135
+    pshufb              m5, m4, m8
3136
+    pshufb              m4, m7
3137
+    pmaddubsw           m4, m0
3138
+    pmaddubsw           m5, m1
3139
+    pmulhrsw            m4, m2
3140
+    pmulhrsw            m5, m2
3141
+    packuswb            m4, m5
3142
+    movu                [r0 + r1], m4
3143
+
3144
+    palignr             m4, m6, m3, 2
3145
+    pshufb              m5, m4, m8
3146
+    pshufb              m4, m7
3147
+    pmaddubsw           m4, m0
3148
+    pmaddubsw           m5, m1
3149
+    pmulhrsw            m4, m2
3150
+    pmulhrsw            m5, m2
3151
+    packuswb            m4, m5
3152
+    movu                [r0 + r1 * 2], m4
3153
+
3154
+    palignr             m4, m6, m3, 3
3155
+    pshufb              m5, m4, m8
3156
+    pshufb              m4, m7
3157
+    pmaddubsw           m4, m0
3158
+    pmaddubsw           m5, m1
3159
+    pmulhrsw            m4, m2
3160
+    pmulhrsw            m5, m2
3161
+    packuswb            m4, m5
3162
+    movu                [r0 + r3], m4
3163
+
3164
+    lea                 r0, [r0 + r1 * 4]
3165
+
3166
+    palignr             m4, m6, m3, 4
3167
+    pshufb              m5, m4, m8
3168
+    pshufb              m4, m7
3169
+    pmaddubsw           m4, m0
3170
+    pmaddubsw           m5, m1
3171
+    pmulhrsw            m4, m2
3172
+    pmulhrsw            m5, m2
3173
+    packuswb            m4, m5
3174
+    movu                [r0], m4
3175
+
3176
+    palignr             m4, m6, m3, 5
3177
+    pshufb              m5, m4, m8
3178
+    pshufb              m4, m7
3179
+    pmaddubsw           m4, m0
3180
+    pmaddubsw           m5, m1
3181
+    pmulhrsw            m4, m2
3182
+    pmulhrsw            m5, m2
3183
+    packuswb            m4, m5
3184
+    movu                [r0 + r1], m4
3185
+
3186
+    palignr             m4, m6, m3, 6
3187
+    pshufb              m5, m4, m8
3188
+    pshufb              m4, m7
3189
+    pmaddubsw           m4, m0
3190
+    pmaddubsw           m5, m1
3191
+    pmulhrsw            m4, m2
3192
+    pmulhrsw            m5, m2
3193
+    packuswb            m4, m5
3194
+    movu                [r0 + r1 * 2], m4
3195
+
3196
+    palignr             m4, m6, m3, 7
3197
+    pshufb              m5, m4, m8
3198
+    pshufb              m4, m7
3199
+    pmaddubsw           m4, m0
3200
+    pmaddubsw           m5, m1
3201
+    pmulhrsw            m4, m2
3202
+    pmulhrsw            m5, m2
3203
+    packuswb            m4, m5
3204
+    movu                [r0 + r3], m4
3205
+
3206
+    lea                 r0, [r0 + r1 * 4]
3207
+
3208
+    palignr             m4, m6, m3, 8
3209
+    pshufb              m5, m4, m8
3210
+    pshufb              m4, m7
3211
+    pmaddubsw           m4, m0
3212
+    pmaddubsw           m5, m1
3213
+    pmulhrsw            m4, m2
3214
+    pmulhrsw            m5, m2
3215
+    packuswb            m4, m5
3216
+    movu                [r0], m4
3217
+
3218
+    palignr             m4, m6, m3, 9
3219
+    pshufb              m5, m4, m8
3220
+    pshufb              m4, m7
3221
+    pmaddubsw           m4, m0
3222
+    pmaddubsw           m5, m1
3223
+    pmulhrsw            m4, m2
3224
+    pmulhrsw            m5, m2
3225
+    packuswb            m4, m5
3226
+    movu                [r0 + r1], m4
3227
+
3228
+    palignr             m4, m6, m3, 10
3229
+    pshufb              m5, m4, m8
3230
+    pshufb              m4, m7
3231
+    pmaddubsw           m4, m0
3232
+    pmaddubsw           m5, m1
3233
+    pmulhrsw            m4, m2
3234
+    pmulhrsw            m5, m2
3235
+    packuswb            m4, m5
3236
+    movu                [r0 + r1 * 2], m4
3237
+
3238
+    palignr             m4, m6, m3, 11
3239
+    pshufb              m5, m4, m8
3240
+    pshufb              m4, m7
3241
+    pmaddubsw           m4, m0
3242
+    pmaddubsw           m5, m1
3243
+    pmulhrsw            m4, m2
3244
+    pmulhrsw            m5, m2
3245
+    packuswb            m4, m5
3246
+    movu                [r0 + r3], m4
3247
+
3248
+    lea                 r0, [r0 + r1 * 4]
3249
+
3250
+    palignr             m4, m6, m3, 12
3251
+    pshufb              m5, m4, m8
3252
+    pshufb              m4, m7
3253
+    pmaddubsw           m4, m0
3254
+    pmaddubsw           m5, m1
3255
+    pmulhrsw            m4, m2
3256
+    pmulhrsw            m5, m2
3257
+    packuswb            m4, m5
3258
+    movu                [r0], m4
3259
+
3260
+    palignr             m4, m6, m3, 13
3261
+    pshufb              m5, m4, m8
3262
+    pshufb              m4, m7
3263
+    pmaddubsw           m4, m0
3264
+    pmaddubsw           m5, m1
3265
+    pmulhrsw            m4, m2
3266
+    pmulhrsw            m5, m2
3267
+    packuswb            m4, m5
3268
+    movu                [r0 + r1], m4
3269
+
3270
+    palignr             m4, m6, m3, 14
3271
+    pshufb              m5, m4, m8
3272
+    pshufb              m4, m7
3273
+    pmaddubsw           m4, m0
3274
+    pmaddubsw           m5, m1
3275
+    pmulhrsw            m4, m2
3276
+    pmulhrsw            m5, m2
3277
+    packuswb            m4, m5
3278
+    movu                [r0 + r1 * 2], m4
3279
+
3280
+    palignr             m4, m6, m3, 15
3281
+    pshufb              m5, m4, m8
3282
+    pshufb              m4, m7
3283
+    pmaddubsw           m4, m0
3284
+    pmaddubsw           m5, m1
3285
+    pmulhrsw            m4, m2
3286
+    pmulhrsw            m5, m2
3287
+    packuswb            m4, m5
3288
+    movu                [r0 + r3], m4
3289
+
3290
+    lea                 r0, [r0 + r1 * 4]
3291
+    mova                m3, m6
3292
+    vbroadcasti128      m6, [r2 + mmsize*2 + 12 + 16]
3293
+
3294
+    pshufb              m4, m3, m7
3295
+    pshufb              m5, m3, m8
3296
+    pmaddubsw           m4, m0
3297
+    pmaddubsw           m5, m1
3298
+    pmulhrsw            m4, m2
3299
+    pmulhrsw            m5, m2
3300
+    packuswb            m4, m5
3301
+    movu                [r0], m4
3302
+
3303
+    palignr             m4, m6, m3, 1
3304
+    pshufb              m5, m4, m8
3305
+    pshufb              m4, m7
3306
+    pmaddubsw           m4, m0
3307
+    pmaddubsw           m5, m1
3308
+    pmulhrsw            m4, m2
3309
+    pmulhrsw            m5, m2
3310
+    packuswb            m4, m5
3311
+    movu                [r0 + r1], m4
3312
+
3313
+    palignr             m4, m6, m3, 2
3314
+    pshufb              m5, m4, m8
3315
+    pshufb              m4, m7
3316
+    pmaddubsw           m4, m0
3317
+    pmaddubsw           m5, m1
3318
+    pmulhrsw            m4, m2
3319
+    pmulhrsw            m5, m2
3320
+    packuswb            m4, m5
3321
+    movu                [r0 + r1 * 2], m4
3322
+
3323
+    palignr             m4, m6, m3, 3
3324
+    pshufb              m5, m4, m8
3325
+    pshufb              m4, m7
3326
+    pmaddubsw           m4, m0
3327
+    pmaddubsw           m5, m1
3328
+    pmulhrsw            m4, m2
3329
+    pmulhrsw            m5, m2
3330
+    packuswb            m4, m5
3331
+    movu                [r0 + r3], m4
3332
+
3333
+    lea                 r0, [r0 + r1 * 4]
3334
+
3335
+    palignr             m4, m6, m3, 4
3336
+    pshufb              m5, m4, m8
3337
+    pshufb              m4, m7
3338
+    pmaddubsw           m4, m0
3339
+    pmaddubsw           m5, m1
3340
+    pmulhrsw            m4, m2
3341
+    pmulhrsw            m5, m2
3342
+    packuswb            m4, m5
3343
+    movu                [r0], m4
3344
+
3345
+    palignr             m4, m6, m3, 5
3346
+    pshufb              m5, m4, m8
3347
+    pshufb              m4, m7
3348
+    pmaddubsw           m4, m0
3349
+    pmaddubsw           m5, m1
3350
+    pmulhrsw            m4, m2
3351
+    pmulhrsw            m5, m2
3352
+    packuswb            m4, m5
3353
+    movu                [r0 + r1], m4
3354
+
3355
+    palignr             m4, m6, m3, 6
3356
+    pshufb              m5, m4, m8
3357
+    pshufb              m4, m7
3358
+    pmaddubsw           m4, m0
3359
+    pmaddubsw           m5, m1
3360
+    pmulhrsw            m4, m2
3361
+    pmulhrsw            m5, m2
3362
+    packuswb            m4, m5
3363
+    movu                [r0 + r1 * 2], m4
3364
+
3365
+    palignr             m4, m6, m3, 7
3366
+    pshufb              m5, m4, m8
3367
+    pshufb              m4, m7
3368
+    pmaddubsw           m4, m0
3369
+    pmaddubsw           m5, m1
3370
+    pmulhrsw            m4, m2
3371
+    pmulhrsw            m5, m2
3372
+    packuswb            m4, m5
3373
+    movu                [r0 + r3], m4
3374
+
3375
+    lea                 r0, [r0 + r1 * 4]
3376
+
3377
+    palignr             m4, m6, m3, 8
3378
+    pshufb              m5, m4, m8
3379
+    pshufb              m4, m7
3380
+    pmaddubsw           m4, m0
3381
+    pmaddubsw           m5, m1
3382
+    pmulhrsw            m4, m2
3383
+    pmulhrsw            m5, m2
3384
+    packuswb            m4, m5
3385
+    movu                [r0], m4
3386
+
3387
+    palignr             m4, m6, m3, 9
3388
+    pshufb              m5, m4, m8
3389
+    pshufb              m4, m7
3390
+    pmaddubsw           m4, m0
3391
+    pmaddubsw           m5, m1
3392
+    pmulhrsw            m4, m2
3393
+    pmulhrsw            m5, m2
3394
+    packuswb            m4, m5
3395
+    movu                [r0 + r1], m4
3396
+
3397
+    palignr             m4, m6, m3, 10
3398
+    pshufb              m5, m4, m8
3399
+    pshufb              m4, m7
3400
+    pmaddubsw           m4, m0
3401
+    pmaddubsw           m5, m1
3402
+    pmulhrsw            m4, m2
3403
+    pmulhrsw            m5, m2
3404
+    packuswb            m4, m5
3405
+    movu                [r0 + r1 * 2], m4
3406
+
3407
+    palignr             m4, m6, m3, 11
3408
+    pshufb              m5, m4, m8
3409
+    pshufb              m4, m7
3410
+    pmaddubsw           m4, m0
3411
+    pmaddubsw           m5, m1
3412
+    pmulhrsw            m4, m2
3413
+    pmulhrsw            m5, m2
3414
+    packuswb            m4, m5
3415
+    movu                [r0 + r3], m4
3416
+
3417
+    lea                 r0, [r0 + r1 * 4]
3418
+
3419
+    palignr             m4, m6, m3, 12
3420
+    pshufb              m5, m4, m8
3421
+    pshufb              m4, m7
3422
+    pmaddubsw           m4, m0
3423
+    pmaddubsw           m5, m1
3424
+    pmulhrsw            m4, m2
3425
+    pmulhrsw            m5, m2
3426
+    packuswb            m4, m5
3427
+    movu                [r0], m4
3428
+
3429
+    palignr             m4, m6, m3, 13
3430
+    pshufb              m5, m4, m8
3431
+    pshufb              m4, m7
3432
+    pmaddubsw           m4, m0
3433
+    pmaddubsw           m5, m1
3434
+    pmulhrsw            m4, m2
3435
+    pmulhrsw            m5, m2
3436
+    packuswb            m4, m5
3437
+    movu                [r0 + r1], m4
3438
+
3439
+    palignr             m4, m6, m3, 14
3440
+    pshufb              m5, m4, m8
3441
+    pshufb              m4, m7
3442
+    pmaddubsw           m4, m0
3443
+    pmaddubsw           m5, m1
3444
+    pmulhrsw            m4, m2
3445
+    pmulhrsw            m5, m2
3446
+    packuswb            m4, m5
3447
+    movu                [r0 + r1 * 2], m4
3448
+
3449
+    palignr             m4, m6, m3, 15
3450
+    pshufb              m5, m4, m8
3451
+    pshufb              m4, m7
3452
+    pmaddubsw           m4, m0
3453
+    pmaddubsw           m5, m1
3454
+    pmulhrsw            m4, m2
3455
+    pmulhrsw            m5, m2
3456
+    packuswb            m4, m5
3457
+    movu                [r0 + r3], m4
3458
+    RET
3459
+
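+    ; mode 24 is the vertical twin of mode 12 (intraPredAngle -5): the same
+    ; projected samples 26/19/13/6 extend the reference, and the fractions
+    ; walk (-5 * k) mod 32 = 27, 22, 17, 12, 7, 2, 29, 24, ... down the rows.
+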
3460
+cglobal intra_pred_ang32_24, 3,5,8
3461
+    lea                 r3, [ang_table_avx2 + 32 * 16]
3462
+    lea                 r4, [r1 * 3]
3463
+    mova                m5, [pw_1024]
3464
+
3465
+    ; rows 0 to 7
3466
+    movu                m0, [r2 + 0]
3467
+    movu                m1, [r2 + 1]
3468
+    punpckhbw           m2, m0, m1
3469
+    punpcklbw           m0, m1
3470
+
3471
+    movu                m4, [r2 + mmsize*2]
3472
+    pshufb              m4, [ang32_shuf_mode24]
3473
+    mova                m3, [ang32_shuf_mode24 + mmsize]
3474
+    vpermd              m4, m3, m4                      ; [6  6 13 13 19 19 26 26 x x x...]
3475
+    palignr             m3, m0, m4, 1
3476
+    vinserti128         m3, m3, xm2, 1
3477
+
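+    ; m3 now holds the interleaved reference pairs extended to the left with
+    ; the projected samples, stepped through by the palignr m6/m7 loads below.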
3478
+    pmaddubsw           m4, m0, [r3 + 11 * 32]          ; [27]
3479
+    pmulhrsw            m4, m5
3480
+    pmaddubsw           m1, m2, [r3 + 11 * 32]
3481
+    pmulhrsw            m1, m5
3482
+    packuswb            m4, m1
3483
+    movu                [r0], m4
3484
+
3485
+    pmaddubsw           m4, m0, [r3 + 6 * 32]           ; [22]
3486
+    pmulhrsw            m4, m5
3487
+    pmaddubsw           m1, m2, [r3 + 6 * 32]
3488
+    pmulhrsw            m1, m5
3489
+    packuswb            m4, m1
3490
+    movu                [r0 + r1], m4
3491
+
3492
+    pmaddubsw           m4, m0, [r3 + 1 * 32]           ; [17]
3493
+    pmulhrsw            m4, m5
3494
+    pmaddubsw           m1, m2, [r3 + 1 * 32]
3495
+    pmulhrsw            m1, m5
3496
+    packuswb            m4, m1
3497
+    movu                [r0 + r1*2], m4
3498
+
3499
+    pmaddubsw           m4, m0, [r3 - 4 * 32]           ; [12]
3500
+    pmulhrsw            m4, m5
3501
+    pmaddubsw           m1, m2, [r3 - 4 * 32]
3502
+    pmulhrsw            m1, m5
3503
+    packuswb            m4, m1
3504
+    movu                [r0 + r4], m4
3505
+
3506
+    lea                 r0, [r0 + r1 * 4]
3507
+
3508
+    pmaddubsw           m4, m0, [r3 - 9 * 32]           ; [7]
3509
+    pmulhrsw            m4, m5
3510
+    pmaddubsw           m1, m2, [r3 - 9 * 32]
3511
+    pmulhrsw            m1, m5
3512
+    packuswb            m4, m1
3513
+    movu                [r0], m4
3514
+
3515
+    pmaddubsw           m4, m0, [r3 - 14 * 32]          ; [2]
3516
+    pmulhrsw            m4, m5
3517
+    pmaddubsw           m1, m2, [r3 - 14 * 32]
3518
+    pmulhrsw            m1, m5
3519
+    packuswb            m4, m1
3520
+    movu                [r0 + r1], m4
3521
+
3522
+    palignr             m6, m0, m3, 14
3523
+    palignr             m7, m2, m0, 14
3524
+
3525
+    pmaddubsw           m4, m6, [r3 + 13 * 32]          ; [29]
3526
+    pmulhrsw            m4, m5
3527
+    pmaddubsw           m1, m7, [r3 + 13 * 32]
3528
+    pmulhrsw            m1, m5
3529
+    packuswb            m4, m1
3530
+    movu                [r0 + r1*2], m4
3531
+
3532
+    pmaddubsw           m4, m6, [r3 + 8 * 32]           ; [24]
3533
+    pmulhrsw            m4, m5
3534
+    pmaddubsw           m1, m7, [r3 + 8 * 32]
3535
+    pmulhrsw            m1, m5
3536
+    packuswb            m4, m1
3537
+    movu                [r0 + r4], m4
3538
+
3539
+    lea                 r0, [r0 + r1 * 4]
3540
+
3541
+    ; rows 8 to 15
3542
+    pmaddubsw           m4, m6, [r3 + 3 * 32]           ; [19]
3543
+    pmulhrsw            m4, m5
3544
+    pmaddubsw           m1, m7, [r3 + 3 * 32]
3545
+    pmulhrsw            m1, m5
3546
+    packuswb            m4, m1
3547
+    movu                [r0], m4
3548
+
3549
+    pmaddubsw           m4, m6, [r3 - 2 * 32]           ; [14]
3550
+    pmulhrsw            m4, m5
3551
+    pmaddubsw           m1, m7, [r3 - 2 * 32]
3552
+    pmulhrsw            m1, m5
3553
+    packuswb            m4, m1
3554
+    movu                [r0 + r1], m4
3555
+
3556
+    pmaddubsw           m4, m6, [r3 - 7 * 32]           ; [9]
3557
+    pmulhrsw            m4, m5
3558
+    pmaddubsw           m1, m7, [r3 - 7 * 32]
3559
+    pmulhrsw            m1, m5
3560
+    packuswb            m4, m1
3561
+    movu                [r0 + r1*2], m4
3562
+
3563
+    pmaddubsw           m4, m6, [r3 - 12 * 32]          ; [4]
3564
+    pmulhrsw            m4, m5
3565
+    pmaddubsw           m1, m7, [r3 - 12 * 32]
3566
+    pmulhrsw            m1, m5
3567
+    packuswb            m4, m1
3568
+    movu                [r0 + r4], m4
3569
+
3570
+    lea                 r0, [r0 + r1 * 4]
3571
+
3572
+    palignr             m6, m0, m3, 12
3573
+    palignr             m7, m2, m0, 12
3574
+
3575
+    pmaddubsw           m4, m6, [r3 + 15 * 32]          ; [31]
3576
+    pmulhrsw            m4, m5
3577
+    pmaddubsw           m1, m7, [r3 + 15 * 32]
3578
+    pmulhrsw            m1, m5
3579
+    packuswb            m4, m1
3580
+    movu                [r0], m4
3581
+
3582
+    pmaddubsw           m4, m6, [r3 + 10 * 32]          ; [26]
3583
+    pmulhrsw            m4, m5
3584
+    pmaddubsw           m1, m7, [r3 + 10 * 32]
3585
+    pmulhrsw            m1, m5
3586
+    packuswb            m4, m1
3587
+    movu                [r0 + r1], m4
3588
+
3589
+    pmaddubsw           m4, m6, [r3 + 5 * 32]           ; [21]
3590
+    pmulhrsw            m4, m5
3591
+    pmaddubsw           m1, m7, [r3 + 5 * 32]
3592
+    pmulhrsw            m1, m5
3593
+    packuswb            m4, m1
3594
+    movu                [r0 + r1 * 2], m4
3595
+
3596
+    pmaddubsw           m4, m6, [r3]                    ; [16]
3597
+    pmulhrsw            m4, m5
3598
+    pmaddubsw           m1, m7, [r3]
3599
+    pmulhrsw            m1, m5
3600
+    packuswb            m4, m1
3601
+    movu                [r0 + r4], m4
3602
+
3603
+    lea                 r0, [r0 + r1 * 4]
3604
+
3605
+    ; rows 16 to 23
3606
+    pmaddubsw           m4, m6, [r3 - 5 * 32]           ; [11]
3607
+    pmulhrsw            m4, m5
3608
+    pmaddubsw           m1, m7, [r3 - 5 * 32]
3609
+    pmulhrsw            m1, m5
3610
+    packuswb            m4, m1
3611
+    movu                [r0], m4
3612
+
3613
+    pmaddubsw           m4, m6, [r3 - 10 * 32]          ; [6]
3614
+    pmulhrsw            m4, m5
3615
+    pmaddubsw           m1, m7, [r3 - 10 * 32]
3616
+    pmulhrsw            m1, m5
3617
+    packuswb            m4, m1
3618
+    movu                [r0 + r1], m4
3619
+
3620
+    pmaddubsw           m4, m6, [r3 - 15 * 32]          ; [1]
3621
+    pmulhrsw            m4, m5
3622
+    pmaddubsw           m1, m7, [r3 - 15 * 32]
3623
+    pmulhrsw            m1, m5
3624
+    packuswb            m4, m1
3625
+    movu                [r0 + r1*2], m4
3626
+
3627
+    palignr             m6, m0, m3, 10
3628
+    palignr             m7, m2, m0, 10
3629
+
3630
+    pmaddubsw           m4, m6, [r3 + 12 * 32]          ; [28]
3631
+    pmulhrsw            m4, m5
3632
+    pmaddubsw           m1, m7, [r3 + 12 * 32]
3633
+    pmulhrsw            m1, m5
3634
+    packuswb            m4, m1
3635
+    movu                [r0 + r4], m4
3636
+
3637
+    lea                 r0, [r0 + r1 * 4]
3638
+
3639
+    pmaddubsw           m4, m6, [r3 + 7 * 32]           ; [23]
3640
+    pmulhrsw            m4, m5
3641
+    pmaddubsw           m1, m7, [r3 + 7 * 32]
3642
+    pmulhrsw            m1, m5
3643
+    packuswb            m4, m1
3644
+    movu                [r0], m4
3645
+
3646
+    pmaddubsw           m4, m6, [r3 + 2 * 32]           ; [18]
3647
+    pmulhrsw            m4, m5
3648
+    pmaddubsw           m1, m7, [r3 + 2 * 32]
3649
+    pmulhrsw            m1, m5
3650
+    packuswb            m4, m1
3651
+    movu                [r0 + r1], m4
3652
+
3653
+    pmaddubsw           m4, m6, [r3 - 3 * 32]           ; [13]
3654
+    pmulhrsw            m4, m5
3655
+    pmaddubsw           m1, m7, [r3 - 3 * 32]
3656
+    pmulhrsw            m1, m5
3657
+    packuswb            m4, m1
3658
+    movu                [r0 + r1*2], m4
3659
+
3660
+    pmaddubsw           m4, m6, [r3 - 8 * 32]           ; [8]
3661
+    pmulhrsw            m4, m5
3662
+    pmaddubsw           m1, m7, [r3 - 8 * 32]
3663
+    pmulhrsw            m1, m5
3664
+    packuswb            m4, m1
3665
+    movu                [r0 + r4], m4
3666
+
3667
+    lea                 r0, [r0 + r1 * 4]
3668
+
3669
+    ; rows 24 to 31
3670
+    pmaddubsw           m4, m6, [r3 - 13 * 32]          ; [3]
3671
+    pmulhrsw            m4, m5
3672
+    pmaddubsw           m1, m7, [r3 - 13 * 32]
3673
+    pmulhrsw            m1, m5
3674
+    packuswb            m4, m1
3675
+    movu                [r0], m4
3676
+
3677
+    palignr             m6, m0, m3, 8
3678
+    palignr             m7, m2, m0, 8
3679
+
3680
+    pmaddubsw           m4, m6, [r3 + 14 * 32]          ; [30]
3681
+    pmulhrsw            m4, m5
3682
+    pmaddubsw           m1, m7, [r3 + 14 * 32]
3683
+    pmulhrsw            m1, m5
3684
+    packuswb            m4, m1
3685
+    movu                [r0 + r1], m4
3686
+
3687
+    pmaddubsw           m4, m6, [r3 + 9 * 32]           ; [25]
3688
+    pmulhrsw            m4, m5
3689
+    pmaddubsw           m1, m7, [r3 + 9 * 32]
3690
+    pmulhrsw            m1, m5
3691
+    packuswb            m4, m1
3692
+    movu                [r0 + r1 * 2], m4
3693
+
3694
+    pmaddubsw           m4, m6, [r3 + 4 * 32]           ; [20]
3695
+    pmulhrsw            m4, m5
3696
+    pmaddubsw           m1, m7, [r3 + 4 * 32]
3697
+    pmulhrsw            m1, m5
3698
+    packuswb            m4, m1
3699
+    movu                [r0 + r4], m4
3700
+
3701
+    lea                 r0, [r0 + r1 * 4]
3702
+
3703
+    pmaddubsw           m4, m6, [r3 - 1 * 32]           ; [15]
3704
+    pmulhrsw            m4, m5
3705
+    pmaddubsw           m1, m7, [r3 - 1 * 32]
3706
+    pmulhrsw            m1, m5
3707
+    packuswb            m4, m1
3708
+    movu                [r0], m4
3709
+
3710
+    pmaddubsw           m4, m6, [r3 - 6 * 32]           ; [10]
3711
+    pmulhrsw            m4, m5
3712
+    pmaddubsw           m1, m7, [r3 - 6 * 32]
3713
+    pmulhrsw            m1, m5
3714
+    packuswb            m4, m1
3715
+    movu                [r0 + r1], m4
3716
+
3717
+    pmaddubsw           m4, m6, [r3 - 11 * 32]          ; [5]
3718
+    pmulhrsw            m4, m5
3719
+    pmaddubsw           m1, m7, [r3 - 11 * 32]
3720
+    pmulhrsw            m1, m5
3721
+    packuswb            m4, m1
3722
+    movu                [r0 + r1*2], m4
3723
+
3724
+    pand                m6, [pw_00ff]
3725
+    pand                m7, [pw_00ff]
3726
+    packuswb            m6, m7
3727
+    movu                [r0 + r4], m6
3728
+    RET
3729
+
3730
+cglobal intra_pred_ang32_13, 3,4,9
3731
+    movu                m0, [ang32_fact_mode13]
3732
+    movu                m1, [ang32_fact_mode13 + mmsize]
3733
+    mova                m2, [pw_1024]
3734
+    mova                m7, [ang32_shuf_mode13]
3735
+    mova                m8, [ang32_shuf_mode13 + mmsize]
3736
+    lea                 r3, [r1 * 3]
3737
+
3738
+    ; prepare for [28, 25, 21, 18, 14, 11,  7,  4,  0, -1, -2....]
3739
+
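+    ; intraPredAngle -9 gives invAngle 910, so the projected side samples sit
+    ; at offsets (k * 910 + 128) >> 8 = 4, 7, 11, 14, 18, 21, 25, 28, gathered
+    ; here by the ang32_shuf_mode13 shuffle and vpermd.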
3740
+    movu                m6, [r2]
3741
+    pshufb              m6, [ang32_shuf_mode13 + mmsize*2]
3742
+    mova                m3, [ang32_shuf_mode24 + mmsize*1]
3743
+    vpermd              m6, m3, m6
3744
+    palignr             m6, m6, 1
3745
+    vbroadcasti128      m3, [r2 + mmsize*2 + 1]
3746
+
3747
+    palignr             m5, m3, m6, 1
3748
+    pshufb              m4, m5, m7
3749
+    pshufb              m5, m8
3750
+    pmaddubsw           m4, m0
3751
+    pmaddubsw           m5, m1
3752
+    pmulhrsw            m4, m2
3753
+    pmulhrsw            m5, m2
3754
+    packuswb            m4, m5
3755
+    movu                [r0], m4
3756
+
3757
+    palignr             m5, m3, m6, 2
3758
+    pshufb              m4, m5, m7
3759
+    pshufb              m5, m8
3760
+    pmaddubsw           m4, m0
3761
+    pmaddubsw           m5, m1
3762
+    pmulhrsw            m4, m2
3763
+    pmulhrsw            m5, m2
3764
+    packuswb            m4, m5
3765
+    movu                [r0 + r1], m4
3766
+
3767
+    palignr             m5, m3, m6, 3
3768
+    pshufb              m4, m5, m7
3769
+    pshufb              m5, m8
3770
+    pmaddubsw           m4, m0
3771
+    pmaddubsw           m5, m1
3772
+    pmulhrsw            m4, m2
3773
+    pmulhrsw            m5, m2
3774
+    packuswb            m4, m5
3775
+    movu                [r0 + r1 * 2], m4
3776
+
3777
+    palignr             m5, m3, m6, 4
3778
+    pshufb              m4, m5, m7
3779
+    pshufb              m5, m8
3780
+    pmaddubsw           m4, m0
3781
+    pmaddubsw           m5, m1
3782
+    pmulhrsw            m4, m2
3783
+    pmulhrsw            m5, m2
3784
+    packuswb            m4, m5
3785
+    movu                [r0 + r3], m4
3786
+
3787
+    lea                 r0, [r0 + r1 * 4]
3788
+
3789
+    palignr             m5, m3, m6, 5
3790
+    pshufb              m4, m5, m7
3791
+    pshufb              m5, m8
3792
+    pmaddubsw           m4, m0
3793
+    pmaddubsw           m5, m1
3794
+    pmulhrsw            m4, m2
3795
+    pmulhrsw            m5, m2
3796
+    packuswb            m4, m5
3797
+    movu                [r0], m4
3798
+
3799
+    palignr             m5, m3, m6, 6
3800
+    pshufb              m4, m5, m7
3801
+    pshufb              m5, m8
3802
+    pmaddubsw           m4, m0
3803
+    pmaddubsw           m5, m1
3804
+    pmulhrsw            m4, m2
3805
+    pmulhrsw            m5, m2
3806
+    packuswb            m4, m5
3807
+    movu                [r0 + r1], m4
3808
+
3809
+    palignr             m5, m3, m6, 7
3810
+    pshufb              m4, m5, m7
3811
+    pshufb              m5, m8
3812
+    pmaddubsw           m4, m0
3813
+    pmaddubsw           m5, m1
3814
+    pmulhrsw            m4, m2
3815
+    pmulhrsw            m5, m2
3816
+    packuswb            m4, m5
3817
+    movu                [r0 + r1 * 2], m4
3818
+
3819
+    palignr             m5, m3, m6, 8
3820
+    pshufb              m4, m5, m7
3821
+    pshufb              m5, m8
3822
+    pmaddubsw           m4, m0
3823
+    pmaddubsw           m5, m1
3824
+    pmulhrsw            m4, m2
3825
+    pmulhrsw            m5, m2
3826
+    packuswb            m4, m5
3827
+    movu                [r0 + r3], m4
3828
+
3829
+    lea                 r0, [r0 + r1 * 4]
3830
+
3831
+    palignr             m5, m3, m6, 9
3832
+    pshufb              m4, m5, m7
3833
+    pshufb              m5, m8
3834
+    pmaddubsw           m4, m0
3835
+    pmaddubsw           m5, m1
3836
+    pmulhrsw            m4, m2
3837
+    pmulhrsw            m5, m2
3838
+    packuswb            m4, m5
3839
+    movu                [r0], m4
3840
+
3841
+    palignr             m5, m3, m6, 10
3842
+    pshufb              m4, m5, m7
3843
+    pshufb              m5, m8
3844
+    pmaddubsw           m4, m0
3845
+    pmaddubsw           m5, m1
3846
+    pmulhrsw            m4, m2
3847
+    pmulhrsw            m5, m2
3848
+    packuswb            m4, m5
3849
+    movu                [r0 + r1], m4
3850
+
3851
+    palignr             m5, m3, m6, 11
3852
+    pshufb              m4, m5, m7
3853
+    pshufb              m5, m8
3854
+    pmaddubsw           m4, m0
3855
+    pmaddubsw           m5, m1
3856
+    pmulhrsw            m4, m2
3857
+    pmulhrsw            m5, m2
3858
+    packuswb            m4, m5
3859
+    movu                [r0 + r1 * 2], m4
3860
+
3861
+    palignr             m5, m3, m6, 12
3862
+    pshufb              m4, m5, m7
3863
+    pshufb              m5, m8
3864
+    pmaddubsw           m4, m0
3865
+    pmaddubsw           m5, m1
3866
+    pmulhrsw            m4, m2
3867
+    pmulhrsw            m5, m2
3868
+    packuswb            m4, m5
3869
+    movu                [r0 + r3], m4
3870
+
3871
+    lea                 r0, [r0 + r1 * 4]
3872
+
3873
+    palignr             m5, m3, m6, 13
3874
+    pshufb              m4, m5, m7
3875
+    pshufb              m5, m8
3876
+    pmaddubsw           m4, m0
3877
+    pmaddubsw           m5, m1
3878
+    pmulhrsw            m4, m2
3879
+    pmulhrsw            m5, m2
3880
+    packuswb            m4, m5
3881
+    movu                [r0], m4
3882
+
3883
+    palignr             m5, m3, m6, 14
3884
+    pshufb              m4, m5, m7
3885
+    pshufb              m5, m8
3886
+    pmaddubsw           m4, m0
3887
+    pmaddubsw           m5, m1
3888
+    pmulhrsw            m4, m2
3889
+    pmulhrsw            m5, m2
3890
+    packuswb            m4, m5
3891
+    movu                [r0 + r1], m4
3892
+
3893
+    palignr             m5, m3, m6, 15
3894
+    pshufb              m4, m5, m7
3895
+    pshufb              m5, m8
3896
+    pmaddubsw           m4, m0
3897
+    pmaddubsw           m5, m1
3898
+    pmulhrsw            m4, m2
3899
+    pmulhrsw            m5, m2
3900
+    packuswb            m4, m5
3901
+    movu                [r0 + r1 * 2], m4
3902
+
3903
+    pshufb              m4, m3, m7
3904
+    pshufb              m5, m3, m8
3905
+    pmaddubsw           m4, m0
3906
+    pmaddubsw           m5, m1
3907
+    pmulhrsw            m4, m2
3908
+    pmulhrsw            m5, m2
3909
+    packuswb            m4, m5
3910
+    movu                [r0 + r3], m4
3911
+
3912
+    lea                 r0, [r0 + r1 * 4]
3913
+
3914
+    mova                m6, m3
3915
+    vbroadcasti128      m3, [r2 + mmsize*2 + 17]
3916
+    palignr             m5, m3, m6, 1
3917
+    pshufb              m4, m5, m7
3918
+    pshufb              m5, m8
3919
+    pmaddubsw           m4, m0
3920
+    pmaddubsw           m5, m1
3921
+    pmulhrsw            m4, m2
3922
+    pmulhrsw            m5, m2
3923
+    packuswb            m4, m5
3924
+    movu                [r0], m4
3925
+
3926
+    palignr             m5, m3, m6, 2
3927
+    pshufb              m4, m5, m7
3928
+    pshufb              m5, m8
3929
+    pmaddubsw           m4, m0
3930
+    pmaddubsw           m5, m1
3931
+    pmulhrsw            m4, m2
3932
+    pmulhrsw            m5, m2
3933
+    packuswb            m4, m5
3934
+    movu                [r0 + r1], m4
3935
+
3936
+    palignr             m5, m3, m6, 3
3937
+    pshufb              m4, m5, m7
3938
+    pshufb              m5, m8
3939
+    pmaddubsw           m4, m0
3940
+    pmaddubsw           m5, m1
3941
+    pmulhrsw            m4, m2
3942
+    pmulhrsw            m5, m2
3943
+    packuswb            m4, m5
3944
+    movu                [r0 + r1 * 2], m4
3945
+
3946
+    palignr             m5, m3, m6, 4
3947
+    pshufb              m4, m5, m7
3948
+    pshufb              m5, m8
3949
+    pmaddubsw           m4, m0
3950
+    pmaddubsw           m5, m1
3951
+    pmulhrsw            m4, m2
3952
+    pmulhrsw            m5, m2
3953
+    packuswb            m4, m5
3954
+    movu                [r0 + r3], m4
3955
+
3956
+    lea                 r0, [r0 + r1 * 4]
3957
+
3958
+    palignr             m5, m3, m6, 5
3959
+    pshufb              m4, m5, m7
3960
+    pshufb              m5, m8
3961
+    pmaddubsw           m4, m0
3962
+    pmaddubsw           m5, m1
3963
+    pmulhrsw            m4, m2
3964
+    pmulhrsw            m5, m2
3965
+    packuswb            m4, m5
3966
+    movu                [r0], m4
3967
+
3968
+    palignr             m5, m3, m6, 6
3969
+    pshufb              m4, m5, m7
3970
+    pshufb              m5, m8
3971
+    pmaddubsw           m4, m0
3972
+    pmaddubsw           m5, m1
3973
+    pmulhrsw            m4, m2
3974
+    pmulhrsw            m5, m2
3975
+    packuswb            m4, m5
3976
+    movu                [r0 + r1], m4
3977
+
3978
+    palignr             m5, m3, m6, 7
3979
+    pshufb              m4, m5, m7
3980
+    pshufb              m5, m8
3981
+    pmaddubsw           m4, m0
3982
+    pmaddubsw           m5, m1
3983
+    pmulhrsw            m4, m2
3984
+    pmulhrsw            m5, m2
3985
+    packuswb            m4, m5
3986
+    movu                [r0 + r1 * 2], m4
3987
+
3988
+    palignr             m5, m3, m6, 8
3989
+    pshufb              m4, m5, m7
3990
+    pshufb              m5, m8
3991
+    pmaddubsw           m4, m0
3992
+    pmaddubsw           m5, m1
3993
+    pmulhrsw            m4, m2
3994
+    pmulhrsw            m5, m2
3995
+    packuswb            m4, m5
3996
+    movu                [r0 + r3], m4
3997
+
3998
+    lea                 r0, [r0 + r1 * 4]
3999
+
4000
+    palignr             m5, m3, m6, 9
4001
+    pshufb              m4, m5, m7
4002
+    pshufb              m5, m8
4003
+    pmaddubsw           m4, m0
4004
+    pmaddubsw           m5, m1
4005
+    pmulhrsw            m4, m2
4006
+    pmulhrsw            m5, m2
4007
+    packuswb            m4, m5
4008
+    movu                [r0], m4
4009
+
4010
+    palignr             m5, m3, m6, 10
4011
+    pshufb              m4, m5, m7
4012
+    pshufb              m5, m8
4013
+    pmaddubsw           m4, m0
4014
+    pmaddubsw           m5, m1
4015
+    pmulhrsw            m4, m2
4016
+    pmulhrsw            m5, m2
4017
+    packuswb            m4, m5
4018
+    movu                [r0 + r1], m4
4019
+
4020
+    palignr             m5, m3, m6, 11
4021
+    pshufb              m4, m5, m7
4022
+    pshufb              m5, m8
4023
+    pmaddubsw           m4, m0
4024
+    pmaddubsw           m5, m1
4025
+    pmulhrsw            m4, m2
4026
+    pmulhrsw            m5, m2
4027
+    packuswb            m4, m5
4028
+    movu                [r0 + r1 * 2], m4
4029
+
4030
+    palignr             m5, m3, m6, 12
4031
+    pshufb              m4, m5, m7
4032
+    pshufb              m5, m8
4033
+    pmaddubsw           m4, m0
4034
+    pmaddubsw           m5, m1
4035
+    pmulhrsw            m4, m2
4036
+    pmulhrsw            m5, m2
4037
+    packuswb            m4, m5
4038
+    movu                [r0 + r3], m4
4039
+
4040
+    lea                 r0, [r0 + r1 * 4]
4041
+
4042
+    palignr             m5, m3, m6, 13
4043
+    pshufb              m4, m5, m7
4044
+    pshufb              m5, m8
4045
+    pmaddubsw           m4, m0
4046
+    pmaddubsw           m5, m1
4047
+    pmulhrsw            m4, m2
4048
+    pmulhrsw            m5, m2
4049
+    packuswb            m4, m5
4050
+    movu                [r0], m4
4051
+
4052
+    palignr             m5, m3, m6, 14
4053
+    pshufb              m4, m5, m7
4054
+    pshufb              m5, m8
4055
+    pmaddubsw           m4, m0
4056
+    pmaddubsw           m5, m1
4057
+    pmulhrsw            m4, m2
4058
+    pmulhrsw            m5, m2
4059
+    packuswb            m4, m5
4060
+    movu                [r0 + r1], m4
4061
+
4062
+    palignr             m5, m3, m6, 15
4063
+    pshufb              m4, m5, m7
4064
+    pshufb              m5, m8
4065
+    pmaddubsw           m4, m0
4066
+    pmaddubsw           m5, m1
4067
+    pmulhrsw            m4, m2
4068
+    pmulhrsw            m5, m2
4069
+    packuswb            m4, m5
4070
+    movu                [r0 + r1 * 2], m4
4071
+
4072
+    pshufb              m4, m3, m7
4073
+    pshufb              m5, m3, m8
4074
+    pmaddubsw           m4, m0
4075
+    pmaddubsw           m5, m1
4076
+    pmulhrsw            m4, m2
4077
+    pmulhrsw            m5, m2
4078
+    packuswb            m4, m5
4079
+    movu                [r0 + r3], m4
4080
+    RET
4081
+
4082
+cglobal intra_pred_ang32_23, 3,5,8
4083
+    lea                 r3, [ang_table_avx2 + 32 * 16]
4084
+    lea                 r4, [r1 * 3]
4085
+    mova                m5, [pw_1024]
4086
+
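+    ; mode 23 is the vertical twin of mode 13 (intraPredAngle -9): fractions
+    ; walk (-9 * k) mod 32 = 23, 14, 5, 28, 19, 10, 1, 24, ... and the
+    ; projected samples at offsets 4, 7, 11, 14, ... extend the reference.
+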
4087
+    ; rows 0 to 7
4088
+    movu                m0, [r2 + 0]
4089
+    movu                m1, [r2 + 1]
4090
+    punpckhbw           m2, m0, m1
4091
+    punpcklbw           m0, m1
4092
+
4093
+    movu                m4, [r2 + mmsize*2]
4094
+    pshufb              m4, [ang32_shuf_mode23]
4095
+    vpermq              m4, m4, q1313
4096
+    palignr             m3, m0, m4, 1
4097
+    vinserti128         m3, m3, xm2, 1
4098
+
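+    ; as in mode 24, m3 extends the interleaved reference pairs to the left
+    ; with the projected samples for the palignr m6/m7 steps below.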
4099
+    pmaddubsw           m4, m0, [r3 + 7 * 32]           ; [23]
4100
+    pmulhrsw            m4, m5
4101
+    pmaddubsw           m1, m2, [r3 + 7 * 32]
4102
+    pmulhrsw            m1, m5
4103
+    packuswb            m4, m1
4104
+    movu                [r0], m4
4105
+
4106
+    pmaddubsw           m4, m0, [r3 - 2 * 32]           ; [14]
4107
+    pmulhrsw            m4, m5
4108
+    pmaddubsw           m1, m2, [r3 - 2 * 32]
4109
+    pmulhrsw            m1, m5
4110
+    packuswb            m4, m1
4111
+    movu                [r0 + r1], m4
4112
+
4113
+    pmaddubsw           m4, m0, [r3 - 11 * 32]          ; [5]
4114
+    pmulhrsw            m4, m5
4115
+    pmaddubsw           m1, m2, [r3 - 11 * 32]
4116
+    pmulhrsw            m1, m5
4117
+    packuswb            m4, m1
4118
+    movu                [r0 + r1*2], m4
4119
+
4120
+    palignr             m6, m0, m3, 14
4121
+    palignr             m7, m2, m0, 14
4122
+
4123
+    pmaddubsw           m4, m6, [r3 + 12 * 32]          ; [28]
4124
+    pmulhrsw            m4, m5
4125
+    pmaddubsw           m1, m7, [r3 + 12 * 32]
4126
+    pmulhrsw            m1, m5
4127
+    packuswb            m4, m1
4128
+    movu                [r0 + r4], m4
4129
+
4130
+    lea                 r0, [r0 + r1 * 4]
4131
+
4132
+    pmaddubsw           m4, m6, [r3 + 3 * 32]           ; [19]
4133
+    pmulhrsw            m4, m5
4134
+    pmaddubsw           m1, m7, [r3 + 3 * 32]
4135
+    pmulhrsw            m1, m5
4136
+    packuswb            m4, m1
4137
+    movu                [r0], m4
4138
+
4139
+    pmaddubsw           m4, m6, [r3 - 6 * 32]           ; [10]
4140
+    pmulhrsw            m4, m5
4141
+    pmaddubsw           m1, m7, [r3 - 6 * 32]
4142
+    pmulhrsw            m1, m5
4143
+    packuswb            m4, m1
4144
+    movu                [r0 + r1], m4
4145
+
4146
+    pmaddubsw           m4, m6, [r3 - 15 * 32]          ; [1]
4147
+    pmulhrsw            m4, m5
4148
+    pmaddubsw           m1, m7, [r3 - 15 * 32]
4149
+    pmulhrsw            m1, m5
4150
+    packuswb            m4, m1
4151
+    movu                [r0 + r1*2], m4
4152
+
4153
+    palignr             m6, m0, m3, 12
4154
+    palignr             m7, m2, m0, 12
4155
+
4156
+    pmaddubsw           m4, m6, [r3 + 8 * 32]           ; [24]
4157
+    pmulhrsw            m4, m5
4158
+    pmaddubsw           m1, m7, [r3 + 8 * 32]
4159
+    pmulhrsw            m1, m5
4160
+    packuswb            m4, m1
4161
+    movu                [r0 + r4], m4
4162
+
4163
+    lea                 r0, [r0 + r1 * 4]
4164
+
4165
+    ; rows 8 to 15
4166
+    pmaddubsw           m4, m6, [r3 - 1 * 32]           ; [15]
4167
+    pmulhrsw            m4, m5
4168
+    pmaddubsw           m1, m7, [r3 - 1 * 32]
4169
+    pmulhrsw            m1, m5
4170
+    packuswb            m4, m1
4171
+    movu                [r0], m4
4172
+
4173
+    pmaddubsw           m4, m6, [r3 - 10 * 32]          ; [6]
4174
+    pmulhrsw            m4, m5
4175
+    pmaddubsw           m1, m7, [r3 - 10 * 32]
4176
+    pmulhrsw            m1, m5
4177
+    packuswb            m4, m1
4178
+    movu                [r0 + r1], m4
4179
+
4180
+    palignr             m6, m0, m3, 10
4181
+    palignr             m7, m2, m0, 10
4182
+
4183
+    pmaddubsw           m4, m6, [r3 + 13 * 32]          ; [29]
4184
+    pmulhrsw            m4, m5
4185
+    pmaddubsw           m1, m7, [r3 + 13 * 32]
4186
+    pmulhrsw            m1, m5
4187
+    packuswb            m4, m1
4188
+    movu                [r0 + r1*2], m4
4189
+
4190
+    pmaddubsw           m4, m6, [r3 + 4 * 32]           ; [20]
4191
+    pmulhrsw            m4, m5
4192
+    pmaddubsw           m1, m7, [r3 + 4 * 32]
4193
+    pmulhrsw            m1, m5
4194
+    packuswb            m4, m1
4195
+    movu                [r0 + r4], m4
4196
+
4197
+    lea                 r0, [r0 + r1 * 4]
4198
+
4199
+    pmaddubsw           m4, m6, [r3 - 5 * 32]           ; [11]
4200
+    pmulhrsw            m4, m5
4201
+    pmaddubsw           m1, m7, [r3 - 5 * 32]
4202
+    pmulhrsw            m1, m5
4203
+    packuswb            m4, m1
4204
+    movu                [r0], m4
4205
+
4206
+    pmaddubsw           m4, m6, [r3 - 14 * 32]          ; [2]
4207
+    pmulhrsw            m4, m5
4208
+    pmaddubsw           m1, m7, [r3 - 14 * 32]
4209
+    pmulhrsw            m1, m5
4210
+    packuswb            m4, m1
4211
+    movu                [r0 + r1], m4
4212
+
4213
+    palignr             m6, m0, m3, 8
4214
+    palignr             m7, m2, m0, 8
4215
+
4216
+    pmaddubsw           m4, m6, [r3 + 9 * 32]           ; [25]
4217
+    pmulhrsw            m4, m5
4218
+    pmaddubsw           m1, m7, [r3 + 9 * 32]
4219
+    pmulhrsw            m1, m5
4220
+    packuswb            m4, m1
4221
+    movu                [r0 + r1 * 2], m4
4222
+
4223
+    pmaddubsw           m4, m6, [r3]                    ; [16]
4224
+    pmulhrsw            m4, m5
4225
+    pmaddubsw           m1, m7, [r3]
4226
+    pmulhrsw            m1, m5
4227
+    packuswb            m4, m1
4228
+    movu                [r0 + r4], m4
4229
+
4230
+    lea                 r0, [r0 + r1 * 4]
4231
+
4232
+    ; rows 16 to 23
4233
+    pmaddubsw           m4, m6, [r3 - 9 * 32]           ; [7]
4234
+    pmulhrsw            m4, m5
4235
+    pmaddubsw           m1, m7, [r3 - 9 * 32]
4236
+    pmulhrsw            m1, m5
4237
+    packuswb            m4, m1
4238
+    movu                [r0], m4
4239
+
4240
+    palignr             m6, m0, m3, 6
4241
+    palignr             m7, m2, m0, 6
4242
+
4243
+    pmaddubsw           m4, m6, [r3 + 14 * 32]          ; [30]
4244
+    pmulhrsw            m4, m5
4245
+    pmaddubsw           m1, m7, [r3 + 14 * 32]
4246
+    pmulhrsw            m1, m5
4247
+    packuswb            m4, m1
4248
+    movu                [r0 + r1], m4
4249
+
4250
+    pmaddubsw           m4, m6, [r3 + 5 * 32]           ; [21]
4251
+    pmulhrsw            m4, m5
4252
+    pmaddubsw           m1, m7, [r3 + 5 * 32]
4253
+    pmulhrsw            m1, m5
4254
+    packuswb            m4, m1
4255
+    movu                [r0 + r1*2], m4
4256
+
4257
+    pmaddubsw           m4, m6, [r3 - 4 * 32]           ; [12]
4258
+    pmulhrsw            m4, m5
4259
+    pmaddubsw           m1, m7, [r3 - 4 * 32]
4260
+    pmulhrsw            m1, m5
4261
+    packuswb            m4, m1
4262
+    movu                [r0 + r4], m4
4263
+
4264
+    lea                 r0, [r0 + r1 * 4]
4265
+
4266
+    pmaddubsw           m4, m6, [r3 - 13 * 32]          ; [3]
4267
+    pmulhrsw            m4, m5
4268
+    pmaddubsw           m1, m7, [r3 - 13 * 32]
4269
+    pmulhrsw            m1, m5
4270
+    packuswb            m4, m1
4271
+    movu                [r0], m4
4272
+
4273
+    palignr             m6, m0, m3, 4
4274
+    palignr             m7, m2, m0, 4
4275
+    pmaddubsw           m4, m6, [r3 + 10 * 32]          ; [26]
4276
+    pmulhrsw            m4, m5
4277
+    pmaddubsw           m1, m7, [r3 + 10 * 32]
4278
+    pmulhrsw            m1, m5
4279
+    packuswb            m4, m1
4280
+    movu                [r0 + r1], m4
4281
+
4282
+    pmaddubsw           m4, m6, [r3 + 1 * 32]           ; [17]
4283
+    pmulhrsw            m4, m5
4284
+    pmaddubsw           m1, m7, [r3 + 1 * 32]
4285
+    pmulhrsw            m1, m5
4286
+    packuswb            m4, m1
4287
+    movu                [r0 + r1*2], m4
4288
+
4289
+    pmaddubsw           m4, m6, [r3 - 8 * 32]           ; [8]
4290
+    pmulhrsw            m4, m5
4291
+    pmaddubsw           m1, m7, [r3 - 8 * 32]
4292
+    pmulhrsw            m1, m5
4293
+    packuswb            m4, m1
4294
+    movu                [r0 + r4], m4
4295
+
4296
+    lea                 r0, [r0 + r1 * 4]
4297
+
4298
+    ; rows 24 to 31
4299
+    palignr             m6, m0, m3, 2
4300
+    palignr             m7, m2, m0, 2
4301
+    pmaddubsw           m4, m6, [r3 + 15 * 32]          ; [31]
4302
+    pmulhrsw            m4, m5
4303
+    pmaddubsw           m1, m7, [r3 + 15 * 32]
4304
+    pmulhrsw            m1, m5
4305
+    packuswb            m4, m1
4306
+    movu                [r0], m4
4307
+
4308
+    pmaddubsw           m4, m6, [r3 + 6 * 32]           ; [22]
4309
+    pmulhrsw            m4, m5
4310
+    pmaddubsw           m1, m7, [r3 + 6 * 32]
4311
+    pmulhrsw            m1, m5
4312
+    packuswb            m4, m1
4313
+    movu                [r0 + r1], m4
4314
+
4315
+    pmaddubsw           m4, m6, [r3 - 3 * 32]           ; [13]
4316
+    pmulhrsw            m4, m5
4317
+    pmaddubsw           m1, m7, [r3 - 3 * 32]
4318
+    pmulhrsw            m1, m5
4319
+    packuswb            m4, m1
4320
+    movu                [r0 + r1 * 2], m4
4321
+
4322
+    pmaddubsw           m4, m6, [r3 - 12 * 32]          ; [4]
4323
+    pmulhrsw            m4, m5
4324
+    pmaddubsw           m1, m7, [r3 - 12 * 32]
4325
+    pmulhrsw            m1, m5
4326
+    packuswb            m4, m1
4327
+    movu                [r0 + r4], m4
4328
+
4329
+    lea                 r0, [r0 + r1 * 4]
4330
+
4331
+    pmaddubsw           m4, m3, [r3 + 11 * 32]          ; [27]
4332
+    pmulhrsw            m4, m5
4333
+    pmaddubsw           m1, m0, [r3 + 11 * 32]
4334
+    pmulhrsw            m1, m5
4335
+    packuswb            m4, m1
4336
+    movu                [r0], m4
4337
+
4338
+    pmaddubsw           m4, m3, [r3 + 2 * 32]           ; [18]
4339
+    pmulhrsw            m4, m5
4340
+    pmaddubsw           m1, m0, [r3 + 2 * 32]
4341
+    pmulhrsw            m1, m5
4342
+    packuswb            m4, m1
4343
+    movu                [r0 + r1], m4
4344
+
4345
+    pmaddubsw           m4, m3, [r3 - 7 * 32]           ; [9]
4346
+    pmulhrsw            m4, m5
4347
+    pmaddubsw           m1, m0, [r3 - 7 * 32]
4348
+    pmulhrsw            m1, m5
4349
+    packuswb            m4, m1
4350
+    movu                [r0 + r1*2], m4
4351
+
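+    ; the last row falls on a whole-sample position (fraction 0), so no
+    ; interpolation is needed: mask the low byte of each interleaved word
+    ; and repack to copy the reference samples through unchanged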
4352
+    pand                m3, [pw_00ff]
4353
+    pand                m0, [pw_00ff]
4354
+    packuswb            m3, m0
4355
+    movu                [r0 + r4], m3
4356
+    RET
4357
+
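+; annotation (added for clarity): predicts a 32x32 block for HEVC angular
+; mode 14 (intraPredAngle -13). as a horizontal-class mode the interpolation
+; fraction varies with the column rather than the row, so per-pixel weights
+; come from ang32_fact_mode14 and every row repeats the same
+; shuffle/madd/round sequence on a reference window that palignr slides
+; forward one byte per row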
4358
+cglobal intra_pred_ang32_14, 3,4,9
4359
+    movu                m0, [ang32_fact_mode14]
4360
+    movu                m1, [ang32_fact_mode14 + mmsize]
4361
+    mova                m2, [pw_1024]
4362
+    mova                m7, [ang32_shuf_mode14]
4363
+    mova                m8, [ang32_shuf_mode14 + mmsize]
4364
+    lea                 r3, [r1 * 3]
4365
+
4366
+    ; prepare for [30, 27, 25, 22, 20, 17, 15, 12, 10, 7, 5, 2, 0, -1, -2...]
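+    ; the index list above is the inverse-angle projection (invAngle = 630
+    ; for angle -13): pshufb gathers those samples from the perpendicular
+    ; reference so the main reference can be extended below index 0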
4367
+
4368
+    movu                m6, [r2]
4369
+    pshufb              m6, [ang32_shuf_mode14 + mmsize*2]
4370
+    vpermq              m6, m6, 01110111b
4371
+    pslldq              m6, 1
4372
+    vbroadcasti128      m3, [r2 + mmsize*2 + 1]
4373
+
4374
+    palignr             m5, m3, m6, 1
4375
+    pshufb              m4, m5, m7
4376
+    pshufb              m5, m8
4377
+    pmaddubsw           m4, m0
4378
+    pmaddubsw           m5, m1
4379
+    pmulhrsw            m4, m2
4380
+    pmulhrsw            m5, m2
4381
+    packuswb            m4, m5
4382
+    movu                [r0], m4
4383
+
4384
+    palignr             m5, m3, m6, 2
4385
+    pshufb              m4, m5, m7
4386
+    pshufb              m5, m8
4387
+    pmaddubsw           m4, m0
4388
+    pmaddubsw           m5, m1
4389
+    pmulhrsw            m4, m2
4390
+    pmulhrsw            m5, m2
4391
+    packuswb            m4, m5
4392
+    movu                [r0 + r1], m4
4393
+
4394
+    palignr             m5, m3, m6, 3
4395
+    pshufb              m4, m5, m7
4396
+    pshufb              m5, m8
4397
+    pmaddubsw           m4, m0
4398
+    pmaddubsw           m5, m1
4399
+    pmulhrsw            m4, m2
4400
+    pmulhrsw            m5, m2
4401
+    packuswb            m4, m5
4402
+    movu                [r0 + r1 * 2], m4
4403
+
4404
+    palignr             m5, m3, m6, 4
4405
+    pshufb              m4, m5, m7
4406
+    pshufb              m5, m8
4407
+    pmaddubsw           m4, m0
4408
+    pmaddubsw           m5, m1
4409
+    pmulhrsw            m4, m2
4410
+    pmulhrsw            m5, m2
4411
+    packuswb            m4, m5
4412
+    movu                [r0 + r3], m4
4413
+
4414
+    lea                 r0, [r0 + r1 * 4]
4415
+
4416
+    palignr             m5, m3, m6, 5
4417
+    pshufb              m4, m5, m7
4418
+    pshufb              m5, m8
4419
+    pmaddubsw           m4, m0
4420
+    pmaddubsw           m5, m1
4421
+    pmulhrsw            m4, m2
4422
+    pmulhrsw            m5, m2
4423
+    packuswb            m4, m5
4424
+    movu                [r0], m4
4425
+
4426
+    palignr             m5, m3, m6, 6
4427
+    pshufb              m4, m5, m7
4428
+    pshufb              m5, m8
4429
+    pmaddubsw           m4, m0
4430
+    pmaddubsw           m5, m1
4431
+    pmulhrsw            m4, m2
4432
+    pmulhrsw            m5, m2
4433
+    packuswb            m4, m5
4434
+    movu                [r0 + r1], m4
4435
+
4436
+    palignr             m5, m3, m6, 7
4437
+    pshufb              m4, m5, m7
4438
+    pshufb              m5, m8
4439
+    pmaddubsw           m4, m0
4440
+    pmaddubsw           m5, m1
4441
+    pmulhrsw            m4, m2
4442
+    pmulhrsw            m5, m2
4443
+    packuswb            m4, m5
4444
+    movu                [r0 + r1 * 2], m4
4445
+
4446
+    palignr             m5, m3, m6, 8
4447
+    pshufb              m4, m5, m7
4448
+    pshufb              m5, m8
4449
+    pmaddubsw           m4, m0
4450
+    pmaddubsw           m5, m1
4451
+    pmulhrsw            m4, m2
4452
+    pmulhrsw            m5, m2
4453
+    packuswb            m4, m5
4454
+    movu                [r0 + r3], m4
4455
+
4456
+    lea                 r0, [r0 + r1 * 4]
4457
+
4458
+    palignr             m5, m3, m6, 9
4459
+    pshufb              m4, m5, m7
4460
+    pshufb              m5, m8
4461
+    pmaddubsw           m4, m0
4462
+    pmaddubsw           m5, m1
4463
+    pmulhrsw            m4, m2
4464
+    pmulhrsw            m5, m2
4465
+    packuswb            m4, m5
4466
+    movu                [r0], m4
4467
+
4468
+    palignr             m5, m3, m6, 10
4469
+    pshufb              m4, m5, m7
4470
+    pshufb              m5, m8
4471
+    pmaddubsw           m4, m0
4472
+    pmaddubsw           m5, m1
4473
+    pmulhrsw            m4, m2
4474
+    pmulhrsw            m5, m2
4475
+    packuswb            m4, m5
4476
+    movu                [r0 + r1], m4
4477
+
4478
+    palignr             m5, m3, m6, 11
4479
+    pshufb              m4, m5, m7
4480
+    pshufb              m5, m8
4481
+    pmaddubsw           m4, m0
4482
+    pmaddubsw           m5, m1
4483
+    pmulhrsw            m4, m2
4484
+    pmulhrsw            m5, m2
4485
+    packuswb            m4, m5
4486
+    movu                [r0 + r1 * 2], m4
4487
+
4488
+    palignr             m5, m3, m6, 12
4489
+    pshufb              m4, m5, m7
4490
+    pshufb              m5, m8
4491
+    pmaddubsw           m4, m0
4492
+    pmaddubsw           m5, m1
4493
+    pmulhrsw            m4, m2
4494
+    pmulhrsw            m5, m2
4495
+    packuswb            m4, m5
4496
+    movu                [r0 + r3], m4
4497
+
4498
+    lea                 r0, [r0 + r1 * 4]
4499
+
4500
+    palignr             m5, m3, m6, 13
4501
+    pshufb              m4, m5, m7
4502
+    pshufb              m5, m8
4503
+    pmaddubsw           m4, m0
4504
+    pmaddubsw           m5, m1
4505
+    pmulhrsw            m4, m2
4506
+    pmulhrsw            m5, m2
4507
+    packuswb            m4, m5
4508
+    movu                [r0], m4
4509
+
4510
+    palignr             m5, m3, m6, 14
4511
+    pshufb              m4, m5, m7
4512
+    pshufb              m5, m8
4513
+    pmaddubsw           m4, m0
4514
+    pmaddubsw           m5, m1
4515
+    pmulhrsw            m4, m2
4516
+    pmulhrsw            m5, m2
4517
+    packuswb            m4, m5
4518
+    movu                [r0 + r1], m4
4519
+
4520
+    palignr             m5, m3, m6, 15
4521
+    pshufb              m4, m5, m7
4522
+    pshufb              m5, m8
4523
+    pmaddubsw           m4, m0
4524
+    pmaddubsw           m5, m1
4525
+    pmulhrsw            m4, m2
4526
+    pmulhrsw            m5, m2
4527
+    packuswb            m4, m5
4528
+    movu                [r0 + r1 * 2], m4
4529
+
4530
+    pshufb              m4, m3, m7
4531
+    pshufb              m5, m3, m8
4532
+    pmaddubsw           m4, m0
4533
+    pmaddubsw           m5, m1
4534
+    pmulhrsw            m4, m2
4535
+    pmulhrsw            m5, m2
4536
+    packuswb            m4, m5
4537
+    movu                [r0 + r3], m4
4538
+
4539
+    lea                 r0, [r0 + r1 * 4]
4540
+
4541
+    mova                m6, m3
4542
+    vbroadcasti128      m3, [r2 + mmsize*2 + 17]
4543
+    palignr             m5, m3, m6, 1
4544
+    pshufb              m4, m5, m7
4545
+    pshufb              m5, m8
4546
+    pmaddubsw           m4, m0
4547
+    pmaddubsw           m5, m1
4548
+    pmulhrsw            m4, m2
4549
+    pmulhrsw            m5, m2
4550
+    packuswb            m4, m5
4551
+    movu                [r0], m4
4552
+
4553
+    palignr             m5, m3, m6, 2
4554
+    pshufb              m4, m5, m7
4555
+    pshufb              m5, m8
4556
+    pmaddubsw           m4, m0
4557
+    pmaddubsw           m5, m1
4558
+    pmulhrsw            m4, m2
4559
+    pmulhrsw            m5, m2
4560
+    packuswb            m4, m5
4561
+    movu                [r0 + r1], m4
4562
+
4563
+    palignr             m5, m3, m6, 3
4564
+    pshufb              m4, m5, m7
4565
+    pshufb              m5, m8
4566
+    pmaddubsw           m4, m0
4567
+    pmaddubsw           m5, m1
4568
+    pmulhrsw            m4, m2
4569
+    pmulhrsw            m5, m2
4570
+    packuswb            m4, m5
4571
+    movu                [r0 + r1 * 2], m4
4572
+
4573
+    palignr             m5, m3, m6, 4
4574
+    pshufb              m4, m5, m7
4575
+    pshufb              m5, m8
4576
+    pmaddubsw           m4, m0
4577
+    pmaddubsw           m5, m1
4578
+    pmulhrsw            m4, m2
4579
+    pmulhrsw            m5, m2
4580
+    packuswb            m4, m5
4581
+    movu                [r0 + r3], m4
4582
+
4583
+    lea                 r0, [r0 + r1 * 4]
4584
+
4585
+    palignr             m5, m3, m6, 5
4586
+    pshufb              m4, m5, m7
4587
+    pshufb              m5, m8
4588
+    pmaddubsw           m4, m0
4589
+    pmaddubsw           m5, m1
4590
+    pmulhrsw            m4, m2
4591
+    pmulhrsw            m5, m2
4592
+    packuswb            m4, m5
4593
+    movu                [r0], m4
4594
+
4595
+    palignr             m5, m3, m6, 6
4596
+    pshufb              m4, m5, m7
4597
+    pshufb              m5, m8
4598
+    pmaddubsw           m4, m0
4599
+    pmaddubsw           m5, m1
4600
+    pmulhrsw            m4, m2
4601
+    pmulhrsw            m5, m2
4602
+    packuswb            m4, m5
4603
+    movu                [r0 + r1], m4
4604
+
4605
+    palignr             m5, m3, m6, 7
4606
+    pshufb              m4, m5, m7
4607
+    pshufb              m5, m8
4608
+    pmaddubsw           m4, m0
4609
+    pmaddubsw           m5, m1
4610
+    pmulhrsw            m4, m2
4611
+    pmulhrsw            m5, m2
4612
+    packuswb            m4, m5
4613
+    movu                [r0 + r1 * 2], m4
4614
+
4615
+    palignr             m5, m3, m6, 8
4616
+    pshufb              m4, m5, m7
4617
+    pshufb              m5, m8
4618
+    pmaddubsw           m4, m0
4619
+    pmaddubsw           m5, m1
4620
+    pmulhrsw            m4, m2
4621
+    pmulhrsw            m5, m2
4622
+    packuswb            m4, m5
4623
+    movu                [r0 + r3], m4
4624
+
4625
+    lea                 r0, [r0 + r1 * 4]
4626
+
4627
+    palignr             m5, m3, m6, 9
4628
+    pshufb              m4, m5, m7
4629
+    pshufb              m5, m8
4630
+    pmaddubsw           m4, m0
4631
+    pmaddubsw           m5, m1
4632
+    pmulhrsw            m4, m2
4633
+    pmulhrsw            m5, m2
4634
+    packuswb            m4, m5
4635
+    movu                [r0], m4
4636
+
4637
+    palignr             m5, m3, m6, 10
4638
+    pshufb              m4, m5, m7
4639
+    pshufb              m5, m8
4640
+    pmaddubsw           m4, m0
4641
+    pmaddubsw           m5, m1
4642
+    pmulhrsw            m4, m2
4643
+    pmulhrsw            m5, m2
4644
+    packuswb            m4, m5
4645
+    movu                [r0 + r1], m4
4646
+
4647
+    palignr             m5, m3, m6, 11
4648
+    pshufb              m4, m5, m7
4649
+    pshufb              m5, m8
4650
+    pmaddubsw           m4, m0
4651
+    pmaddubsw           m5, m1
4652
+    pmulhrsw            m4, m2
4653
+    pmulhrsw            m5, m2
4654
+    packuswb            m4, m5
4655
+    movu                [r0 + r1 * 2], m4
4656
+
4657
+    palignr             m5, m3, m6, 12
4658
+    pshufb              m4, m5, m7
4659
+    pshufb              m5, m8
4660
+    pmaddubsw           m4, m0
4661
+    pmaddubsw           m5, m1
4662
+    pmulhrsw            m4, m2
4663
+    pmulhrsw            m5, m2
4664
+    packuswb            m4, m5
4665
+    movu                [r0 + r3], m4
4666
+
4667
+    lea                 r0, [r0 + r1 * 4]
4668
+
4669
+    palignr             m5, m3, m6, 13
4670
+    pshufb              m4, m5, m7
4671
+    pshufb              m5, m8
4672
+    pmaddubsw           m4, m0
4673
+    pmaddubsw           m5, m1
4674
+    pmulhrsw            m4, m2
4675
+    pmulhrsw            m5, m2
4676
+    packuswb            m4, m5
4677
+    movu                [r0], m4
4678
+
4679
+    palignr             m5, m3, m6, 14
4680
+    pshufb              m4, m5, m7
4681
+    pshufb              m5, m8
4682
+    pmaddubsw           m4, m0
4683
+    pmaddubsw           m5, m1
4684
+    pmulhrsw            m4, m2
4685
+    pmulhrsw            m5, m2
4686
+    packuswb            m4, m5
4687
+    movu                [r0 + r1], m4
4688
+
4689
+    palignr             m5, m3, m6, 15
4690
+    pshufb              m4, m5, m7
4691
+    pshufb              m5, m8
4692
+    pmaddubsw           m4, m0
4693
+    pmaddubsw           m5, m1
4694
+    pmulhrsw            m4, m2
4695
+    pmulhrsw            m5, m2
4696
+    packuswb            m4, m5
4697
+    movu                [r0 + r1 * 2], m4
4698
+
4699
+    pshufb              m4, m3, m7
4700
+    pshufb              m5, m3, m8
4701
+    pmaddubsw           m4, m0
4702
+    pmaddubsw           m5, m1
4703
+    pmulhrsw            m4, m2
4704
+    pmulhrsw            m5, m2
4705
+    packuswb            m4, m5
4706
+    movu                [r0 + r3], m4
4707
+    RET
4708
+
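+; annotation (added for clarity): predicts a 32x32 block for HEVC angular
+; mode 22, the vertical-class twin of mode 14 (intraPredAngle -13). the
+; fraction is constant across a row, so per-row weight pairs are read from
+; ang_table_avx2 as in mode 23; the fractions step by -13 (mod 32):
+; 19, 6, 25, 12, ...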
4709
+cglobal intra_pred_ang32_22, 3,5,9
4710
+    lea                 r3, [ang_table_avx2 + 32 * 16]
4711
+    lea                 r4, [r1 * 3]
4712
+    mova                m5, [pw_1024]
4713
+
4714
+    ; rows 0 to 7
4715
+    movu                m0, [r2 + 0]
4716
+    movu                m1, [r2 + 1]
4717
+    punpckhbw           m2, m0, m1
4718
+    punpcklbw           m0, m1
4719
+
4720
+    movu                m4, [r2 + mmsize*2 + 2]
4721
+    pshufb              m4, [ang32_shuf_mode22]
4722
+    vextracti128        xm8, m4, 1
4723
+
4724
+    palignr             m3, m0, m4, 2
4725
+    palignr             m3, m8, 15
4726
+    vinserti128         m3, m3, xm2, 1
4727
+    vinserti128         m8, m8, xm0, 1
4728
+
4729
+    pmaddubsw           m4, m0, [r3 + 3 * 32]           ; [19]
4730
+    pmulhrsw            m4, m5
4731
+    pmaddubsw           m1, m2, [r3 + 3 * 32]
4732
+    pmulhrsw            m1, m5
4733
+    packuswb            m4, m1
4734
+    movu                [r0], m4
4735
+
4736
+    pmaddubsw           m4, m0, [r3 - 10 * 32]          ; [6]
4737
+    pmulhrsw            m4, m5
4738
+    pmaddubsw           m1, m2, [r3 - 10 * 32]
4739
+    pmulhrsw            m1, m5
4740
+    packuswb            m4, m1
4741
+    movu                [r0 + r1], m4
4742
+
4743
+    palignr             m6, m0, m3, 14
4744
+    palignr             m7, m2, m0, 14
4745
+
4746
+    pmaddubsw           m4, m6, [r3 + 9 * 32]           ; [25]
4747
+    pmulhrsw            m4, m5
4748
+    pmaddubsw           m1, m7, [r3 + 9 * 32]
4749
+    pmulhrsw            m1, m5
4750
+    packuswb            m4, m1
4751
+    movu                [r0 + r1*2], m4
4752
+
4753
+    pmaddubsw           m4, m6, [r3 - 4 * 32]           ; [12]
4754
+    pmulhrsw            m4, m5
4755
+    pmaddubsw           m1, m7, [r3 - 4 * 32]
4756
+    pmulhrsw            m1, m5
4757
+    packuswb            m4, m1
4758
+    movu                [r0 + r4], m4
4759
+
4760
+    lea                 r0, [r0 + r1 * 4]
4761
+
4762
+    palignr             m6, m0, m3, 12
4763
+    palignr             m7, m2, m0, 12
4764
+
4765
+    pmaddubsw           m4, m6, [r3 + 15 * 32]          ; [31]
4766
+    pmulhrsw            m4, m5
4767
+    pmaddubsw           m1, m7, [r3 + 15 * 32]
4768
+    pmulhrsw            m1, m5
4769
+    packuswb            m4, m1
4770
+    movu                [r0], m4
4771
+
4772
+    pmaddubsw           m4, m6, [r3 + 2 * 32]           ; [18]
4773
+    pmulhrsw            m4, m5
4774
+    pmaddubsw           m1, m7, [r3 + 2 * 32]
4775
+    pmulhrsw            m1, m5
4776
+    packuswb            m4, m1
4777
+    movu                [r0 + r1], m4
4778
+
4779
+    pmaddubsw           m4, m6, [r3 - 11 * 32]          ; [5]
4780
+    pmulhrsw            m4, m5
4781
+    pmaddubsw           m1, m7, [r3 - 11 * 32]
4782
+    pmulhrsw            m1, m5
4783
+    packuswb            m4, m1
4784
+    movu                [r0 + r1*2], m4
4785
+
4786
+    palignr             m6, m0, m3, 10
4787
+    palignr             m7, m2, m0, 10
4788
+
4789
+    pmaddubsw           m4, m6, [r3 + 8 * 32]           ; [24]
4790
+    pmulhrsw            m4, m5
4791
+    pmaddubsw           m1, m7, [r3 + 8 * 32]
4792
+    pmulhrsw            m1, m5
4793
+    packuswb            m4, m1
4794
+    movu                [r0 + r4], m4
4795
+
4796
+    lea                 r0, [r0 + r1 * 4]
4797
+
4798
+    ; rows 8 to 15
4799
+    pmaddubsw           m4, m6, [r3 - 5 * 32]           ; [11]
4800
+    pmulhrsw            m4, m5
4801
+    pmaddubsw           m1, m7, [r3 - 5 * 32]
4802
+    pmulhrsw            m1, m5
4803
+    packuswb            m4, m1
4804
+    movu                [r0], m4
4805
+
4806
+    palignr             m6, m0, m3, 8
4807
+    palignr             m7, m2, m0, 8
4808
+
4809
+    pmaddubsw           m4, m6, [r3 + 14 * 32]          ; [30]
4810
+    pmulhrsw            m4, m5
4811
+    pmaddubsw           m1, m7, [r3 + 14 * 32]
4812
+    pmulhrsw            m1, m5
4813
+    packuswb            m4, m1
4814
+    movu                [r0 + r1], m4
4815
+
4816
+    pmaddubsw           m4, m6, [r3 + 1 * 32]           ; [17]
4817
+    pmulhrsw            m4, m5
4818
+    pmaddubsw           m1, m7, [r3 + 1 * 32]
4819
+    pmulhrsw            m1, m5
4820
+    packuswb            m4, m1
4821
+    movu                [r0 + r1*2], m4
4822
+
4823
+    pmaddubsw           m4, m6, [r3 - 12 * 32]          ; [4]
4824
+    pmulhrsw            m4, m5
4825
+    pmaddubsw           m1, m7, [r3 - 12 * 32]
4826
+    pmulhrsw            m1, m5
4827
+    packuswb            m4, m1
4828
+    movu                [r0 + r4], m4
4829
+
4830
+    lea                 r0, [r0 + r1 * 4]
4831
+
4832
+    palignr             m6, m0, m3, 6
4833
+    palignr             m7, m2, m0, 6
4834
+
4835
+    pmaddubsw           m4, m6, [r3 + 7 * 32]           ; [23]
4836
+    pmulhrsw            m4, m5
4837
+    pmaddubsw           m1, m7, [r3 + 7 * 32]
4838
+    pmulhrsw            m1, m5
4839
+    packuswb            m4, m1
4840
+    movu                [r0], m4
4841
+
4842
+    pmaddubsw           m4, m6, [r3 - 6 * 32]           ; [10]
4843
+    pmulhrsw            m4, m5
4844
+    pmaddubsw           m1, m7, [r3 - 6 * 32]
4845
+    pmulhrsw            m1, m5
4846
+    packuswb            m4, m1
4847
+    movu                [r0 + r1], m4
4848
+
4849
+    palignr             m6, m0, m3, 4
4850
+    palignr             m7, m2, m0, 4
4851
+
4852
+    pmaddubsw           m4, m6, [r3 + 13 * 32]          ; [29]
4853
+    pmulhrsw            m4, m5
4854
+    pmaddubsw           m1, m7, [r3 + 13 * 32]
4855
+    pmulhrsw            m1, m5
4856
+    packuswb            m4, m1
4857
+    movu                [r0 + r1 * 2], m4
4858
+
4859
+    pmaddubsw           m4, m6, [r3]                    ; [16]
4860
+    pmulhrsw            m4, m5
4861
+    pmaddubsw           m1, m7, [r3]
4862
+    pmulhrsw            m1, m5
4863
+    packuswb            m4, m1
4864
+    movu                [r0 + r4], m4
4865
+
4866
+    lea                 r0, [r0 + r1 * 4]
4867
+
4868
+    ; rows 16 to 23
4869
+    pmaddubsw           m4, m6, [r3 - 13 * 32]          ; [3]
4870
+    pmulhrsw            m4, m5
4871
+    pmaddubsw           m1, m7, [r3 - 13 * 32]
4872
+    pmulhrsw            m1, m5
4873
+    packuswb            m4, m1
4874
+    movu                [r0], m4
4875
+
4876
+    palignr             m6, m0, m3, 2
4877
+    palignr             m7, m2, m0, 2
4878
+
4879
+    pmaddubsw           m4, m6, [r3 + 6 * 32]           ; [22]
4880
+    pmulhrsw            m4, m5
4881
+    pmaddubsw           m1, m7, [r3 + 6 * 32]
4882
+    pmulhrsw            m1, m5
4883
+    packuswb            m4, m1
4884
+    movu                [r0 + r1], m4
4885
+
4886
+    pmaddubsw           m4, m6, [r3 - 7 * 32]           ; [9]
4887
+    pmulhrsw            m4, m5
4888
+    pmaddubsw           m1, m7, [r3 - 7 * 32]
4889
+    pmulhrsw            m1, m5
4890
+    packuswb            m4, m1
4891
+    movu                [r0 + r1*2], m4
4892
+
4893
+    pmaddubsw           m4, m3, [r3 + 12 * 32]          ; [28]
4894
+    pmulhrsw            m4, m5
4895
+    pmaddubsw           m1, m0, [r3 + 12 * 32]
4896
+    pmulhrsw            m1, m5
4897
+    packuswb            m4, m1
4898
+    movu                [r0 + r4], m4
4899
+
4900
+    lea                 r0, [r0 + r1 * 4]
4901
+
4902
+    pmaddubsw           m4, m3, [r3 - 1 * 32]           ; [15]
4903
+    pmulhrsw            m4, m5
4904
+    pmaddubsw           m1, m0, [r3 - 1 * 32]
4905
+    pmulhrsw            m1, m5
4906
+    packuswb            m4, m1
4907
+    movu                [r0], m4
4908
+
4909
+    pmaddubsw           m4, m3, [r3 - 14 * 32]          ; [2]
4910
+    pmulhrsw            m4, m5
4911
+    pmaddubsw           m1, m0, [r3 - 14 * 32]
4912
+    pmulhrsw            m1, m5
4913
+    packuswb            m4, m1
4914
+    movu                [r0 + r1], m4
4915
+
4916
+    palignr             m6, m3, m8, 14
4917
+    palignr             m7, m0, m3, 14
4918
+
4919
+    pmaddubsw           m4, m6, [r3 + 5 * 32]           ; [21]
4920
+    pmulhrsw            m4, m5
4921
+    pmaddubsw           m1, m7, [r3 + 5 * 32]
4922
+    pmulhrsw            m1, m5
4923
+    packuswb            m4, m1
4924
+    movu                [r0 + r1*2], m4
4925
+
4926
+    pmaddubsw           m4, m6, [r3 - 8 * 32]           ; [8]
4927
+    pmulhrsw            m4, m5
4928
+    pmaddubsw           m1, m7, [r3 - 8 * 32]
4929
+    pmulhrsw            m1, m5
4930
+    packuswb            m4, m1
4931
+    movu                [r0 + r4], m4
4932
+
4933
+    lea                 r0, [r0 + r1 * 4]
4934
+
4935
+    ; rows 24 to 31
4936
+    palignr             m6, m3, m8, 12
4937
+    palignr             m7, m0, m3, 12
4938
+    pmaddubsw           m4, m6, [r3 + 11 * 32]          ; [27]
4939
+    pmulhrsw            m4, m5
4940
+    pmaddubsw           m1, m7, [r3 + 11 * 32]
4941
+    pmulhrsw            m1, m5
4942
+    packuswb            m4, m1
4943
+    movu                [r0], m4
4944
+
4945
+    pmaddubsw           m4, m6, [r3 - 2 * 32]           ; [14]
4946
+    pmulhrsw            m4, m5
4947
+    pmaddubsw           m1, m7, [r3 - 2 * 32]
4948
+    pmulhrsw            m1, m5
4949
+    packuswb            m4, m1
4950
+    movu                [r0 + r1], m4
4951
+
4952
+    pmaddubsw           m4, m6, [r3 - 15 * 32]          ; [1]
4953
+    pmulhrsw            m4, m5
4954
+    pmaddubsw           m1, m7, [r3 - 15 * 32]
4955
+    pmulhrsw            m1, m5
4956
+    packuswb            m4, m1
4957
+    movu                [r0 + r1 * 2], m4
4958
+
4959
+    palignr             m6, m3, m8, 10
4960
+    palignr             m7, m0, m3, 10
4961
+    pmaddubsw           m4, m6, [r3 + 4 * 32]           ; [20]
4962
+    pmulhrsw            m4, m5
4963
+    pmaddubsw           m1, m7, [r3 + 4 * 32]
4964
+    pmulhrsw            m1, m5
4965
+    packuswb            m4, m1
4966
+    movu                [r0 + r4], m4
4967
+
4968
+    lea                 r0, [r0 + r1 * 4]
4969
+
4970
+    pmaddubsw           m4, m6, [r3 - 9 * 32]           ; [7]
4971
+    pmulhrsw            m4, m5
4972
+    pmaddubsw           m1, m7, [r3 - 9 * 32]
4973
+    pmulhrsw            m1, m5
4974
+    packuswb            m4, m1
4975
+    movu                [r0], m4
4976
+
4977
+    palignr             m0, m3, 8
4978
+    palignr             m3, m8, 8
4979
+    pmaddubsw           m4, m3, [r3 + 10 * 32]          ; [26]
4980
+    pmulhrsw            m4, m5
4981
+    pmaddubsw           m1, m0, [r3 + 10 * 32]
4982
+    pmulhrsw            m1, m5
4983
+    packuswb            m4, m1
4984
+    movu                [r0 + r1], m4
4985
+
4986
+    pmaddubsw           m4, m3, [r3 - 3 * 32]           ; [13]
4987
+    pmulhrsw            m4, m5
4988
+    pmaddubsw           m1, m0, [r3 - 3 * 32]
4989
+    pmulhrsw            m1, m5
4990
+    packuswb            m4, m1
4991
+    movu                [r0 + r1*2], m4
4992
+
4993
+    pand                m3, [pw_00ff]
4994
+    pand                m0, [pw_00ff]
4995
+    packuswb            m3, m0
4996
+    movu                [r0 + r4], m3
4997
+    RET
4998
+
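+; annotation (added for clarity): predicts a 32x32 block for HEVC angular
+; mode 15 (intraPredAngle -17), horizontal-class like mode 14 but with its
+; own weight/shuffle tables (ang32_fact_mode15, ang32_shuf_mode15), since
+; the per-pixel fraction pattern along a row is specific to the angle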
4999
+cglobal intra_pred_ang32_15, 3,4,9
5000
+    movu                m0, [ang32_fact_mode15]
5001
+    movu                m1, [ang32_fact_mode15 + mmsize]
5002
+    mova                m2, [pw_1024]
5003
+    mova                m7, [ang32_shuf_mode15]
5004
+    mova                m8, [ang32_shuf_mode15 + mmsize]
5005
+    lea                 r3, [r1 * 3]
5006
+
5007
+    ; prepare for [30, 28, 26, 24, 23, 21, 19, 17, 15, 13, 11,  9,  8,  6,  4,  2,  0, -1, -2...]
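+    ; inverse-angle projection again, with invAngle = 482 for angle -17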
5008
+
5009
+    movu                m6, [r2]
5010
+    pshufb              m6, [ang32_shuf_mode15 + mmsize*2]
5011
+    vpermq              m6, m6, 01110111b
5012
+
5013
+    movu                xm3, [r2 + mmsize*2]
5014
+    pinsrb              xm3, [r2], 0
5015
+    vpermq              m3, m3, 01000100b
5016
+
5017
+    palignr             m4, m3, m6, 2
5018
+    pshufb              m4, m7
5019
+    pshufb              m5, m6, m8
5020
+    pmaddubsw           m4, m0
5021
+    pmaddubsw           m5, m1
5022
+    pmulhrsw            m4, m2
5023
+    pmulhrsw            m5, m2
5024
+    packuswb            m4, m5
5025
+    movu                [r0], m4
5026
+
5027
+    palignr             m4, m3, m6, 3
5028
+    pshufb              m4, m7
5029
+    palignr             m5, m3, m6, 1
5030
+    pshufb              m5, m8
5031
+    pmaddubsw           m4, m0
5032
+    pmaddubsw           m5, m1
5033
+    pmulhrsw            m4, m2
5034
+    pmulhrsw            m5, m2
5035
+    packuswb            m4, m5
5036
+    movu                [r0 + r1], m4
5037
+
5038
+    palignr             m4, m3, m6, 4
5039
+    pshufb              m4, m7
5040
+    palignr             m5, m3, m6, 2
5041
+    pshufb              m5, m8
5042
+    pmaddubsw           m4, m0
5043
+    pmaddubsw           m5, m1
5044
+    pmulhrsw            m4, m2
5045
+    pmulhrsw            m5, m2
5046
+    packuswb            m4, m5
5047
+    movu                [r0 + r1 * 2], m4
5048
+
5049
+    palignr             m4, m3, m6, 5
5050
+    pshufb              m4, m7
5051
+    palignr             m5, m3, m6, 3
5052
+    pshufb              m5, m8
5053
+    pmaddubsw           m4, m0
5054
+    pmaddubsw           m5, m1
5055
+    pmulhrsw            m4, m2
5056
+    pmulhrsw            m5, m2
5057
+    packuswb            m4, m5
5058
+    movu                [r0 + r3], m4
5059
+
5060
+    lea                 r0, [r0 + r1 * 4]
5061
+
5062
+    palignr             m4, m3, m6, 6
5063
+    pshufb              m4, m7
5064
+    palignr             m5, m3, m6, 4
5065
+    pshufb              m5, m8
5066
+    pmaddubsw           m4, m0
5067
+    pmaddubsw           m5, m1
5068
+    pmulhrsw            m4, m2
5069
+    pmulhrsw            m5, m2
5070
+    packuswb            m4, m5
5071
+    movu                [r0], m4
5072
+
5073
+    palignr             m4, m3, m6, 7
5074
+    pshufb              m4, m7
5075
+    palignr             m5, m3, m6, 5
5076
+    pshufb              m5, m8
5077
+    pmaddubsw           m4, m0
5078
+    pmaddubsw           m5, m1
5079
+    pmulhrsw            m4, m2
5080
+    pmulhrsw            m5, m2
5081
+    packuswb            m4, m5
5082
+    movu                [r0 + r1], m4
5083
+
5084
+    palignr             m4, m3, m6, 8
5085
+    pshufb              m4, m7
5086
+    palignr             m5, m3, m6, 6
5087
+    pshufb              m5, m8
5088
+    pmaddubsw           m4, m0
5089
+    pmaddubsw           m5, m1
5090
+    pmulhrsw            m4, m2
5091
+    pmulhrsw            m5, m2
5092
+    packuswb            m4, m5
5093
+    movu                [r0 + r1 * 2], m4
5094
+
5095
+    palignr             m4, m3, m6, 9
5096
+    pshufb              m4, m7
5097
+    palignr             m5, m3, m6, 7
5098
+    pshufb              m5, m8
5099
+    pmaddubsw           m4, m0
5100
+    pmaddubsw           m5, m1
5101
+    pmulhrsw            m4, m2
5102
+    pmulhrsw            m5, m2
5103
+    packuswb            m4, m5
5104
+    movu                [r0 + r3], m4
5105
+
5106
+    lea                 r0, [r0 + r1 * 4]
5107
+
5108
+    palignr             m4, m3, m6, 10
5109
+    pshufb              m4, m7
5110
+    palignr             m5, m3, m6, 8
5111
+    pshufb              m5, m8
5112
+    pmaddubsw           m4, m0
5113
+    pmaddubsw           m5, m1
5114
+    pmulhrsw            m4, m2
5115
+    pmulhrsw            m5, m2
5116
+    packuswb            m4, m5
5117
+    movu                [r0], m4
5118
+
5119
+    palignr             m4, m3, m6, 11
5120
+    pshufb              m4, m7
5121
+    palignr             m5, m3, m6, 9
5122
+    pshufb              m5, m8
5123
+    pmaddubsw           m4, m0
5124
+    pmaddubsw           m5, m1
5125
+    pmulhrsw            m4, m2
5126
+    pmulhrsw            m5, m2
5127
+    packuswb            m4, m5
5128
+    movu                [r0 + r1], m4
5129
+
5130
+    palignr             m4, m3, m6, 12
5131
+    pshufb              m4, m7
5132
+    palignr             m5, m3, m6, 10
5133
+    pshufb              m5, m8
5134
+    pmaddubsw           m4, m0
5135
+    pmaddubsw           m5, m1
5136
+    pmulhrsw            m4, m2
5137
+    pmulhrsw            m5, m2
5138
+    packuswb            m4, m5
5139
+    movu                [r0 + r1 * 2], m4
5140
+
5141
+    palignr             m4, m3, m6, 13
5142
+    pshufb              m4, m7
5143
+    palignr             m5, m3, m6, 11
5144
+    pshufb              m5, m8
5145
+    pmaddubsw           m4, m0
5146
+    pmaddubsw           m5, m1
5147
+    pmulhrsw            m4, m2
5148
+    pmulhrsw            m5, m2
5149
+    packuswb            m4, m5
5150
+    movu                [r0 + r3], m4
5151
+
5152
+    lea                 r0, [r0 + r1 * 4]
5153
+
5154
+    palignr             m4, m3, m6, 14
5155
+    pshufb              m4, m7
5156
+    palignr             m5, m3, m6, 12
5157
+    pshufb              m5, m8
5158
+    pmaddubsw           m4, m0
5159
+    pmaddubsw           m5, m1
5160
+    pmulhrsw            m4, m2
5161
+    pmulhrsw            m5, m2
5162
+    packuswb            m4, m5
5163
+    movu                [r0], m4
5164
+
5165
+    palignr             m4, m3, m6, 15
5166
+    pshufb              m4, m7
5167
+    palignr             m5, m3, m6, 13
5168
+    pshufb              m5, m8
5169
+    pmaddubsw           m4, m0
5170
+    pmaddubsw           m5, m1
5171
+    pmulhrsw            m4, m2
5172
+    pmulhrsw            m5, m2
5173
+    packuswb            m4, m5
5174
+    movu                [r0 + r1], m4
5175
+
5176
+    pshufb              m4, m3, m7
5177
+    palignr             m5, m3, m6, 14
5178
+    pshufb              m5, m8
5179
+    pmaddubsw           m4, m0
5180
+    pmaddubsw           m5, m1
5181
+    pmulhrsw            m4, m2
5182
+    pmulhrsw            m5, m2
5183
+    packuswb            m4, m5
5184
+    movu                [r0 + r1 * 2], m4
5185
+
5186
+    palignr             m5, m3, m6, 15
5187
+    mova                m6, m3
5188
+    vbroadcasti128      m3, [r2 + mmsize*2 + 16]
5189
+
5190
+    palignr             m4, m3, m6, 1
5191
+    pshufb              m4, m7
5192
+    pshufb              m5, m8
5193
+    pmaddubsw           m4, m0
5194
+    pmaddubsw           m5, m1
5195
+    pmulhrsw            m4, m2
5196
+    pmulhrsw            m5, m2
5197
+    packuswb            m4, m5
5198
+    movu                [r0 + r3], m4
5199
+
5200
+    lea                 r0, [r0 + r1 * 4]
5201
+
5202
+    palignr             m4, m3, m6, 2
5203
+    pshufb              m4, m7
5204
+    pshufb              m5, m6, m8
5205
+    pmaddubsw           m4, m0
5206
+    pmaddubsw           m5, m1
5207
+    pmulhrsw            m4, m2
5208
+    pmulhrsw            m5, m2
5209
+    packuswb            m4, m5
5210
+    movu                [r0], m4
5211
+
5212
+    palignr             m4, m3, m6, 3
5213
+    pshufb              m4, m7
5214
+    palignr             m5, m3, m6, 1
5215
+    pshufb              m5, m8
5216
+    pmaddubsw           m4, m0
5217
+    pmaddubsw           m5, m1
5218
+    pmulhrsw            m4, m2
5219
+    pmulhrsw            m5, m2
5220
+    packuswb            m4, m5
5221
+    movu                [r0 + r1], m4
5222
+
5223
+    palignr             m4, m3, m6, 4
5224
+    pshufb              m4, m7
5225
+    palignr             m5, m3, m6, 2
5226
+    pshufb              m5, m8
5227
+    pmaddubsw           m4, m0
5228
+    pmaddubsw           m5, m1
5229
+    pmulhrsw            m4, m2
5230
+    pmulhrsw            m5, m2
5231
+    packuswb            m4, m5
5232
+    movu                [r0 + r1 * 2], m4
5233
+
5234
+    palignr             m4, m3, m6, 5
5235
+    pshufb              m4, m7
5236
+    palignr             m5, m3, m6, 3
5237
+    pshufb              m5, m8
5238
+    pmaddubsw           m4, m0
5239
+    pmaddubsw           m5, m1
5240
+    pmulhrsw            m4, m2
5241
+    pmulhrsw            m5, m2
5242
+    packuswb            m4, m5
5243
+    movu                [r0 + r3], m4
5244
+
5245
+    lea                 r0, [r0 + r1 * 4]
5246
+
5247
+    palignr             m4, m3, m6, 6
5248
+    pshufb              m4, m7
5249
+    palignr             m5, m3, m6, 4
5250
+    pshufb              m5, m8
5251
+    pmaddubsw           m4, m0
5252
+    pmaddubsw           m5, m1
5253
+    pmulhrsw            m4, m2
5254
+    pmulhrsw            m5, m2
5255
+    packuswb            m4, m5
5256
+    movu                [r0], m4
5257
+
5258
+    palignr             m4, m3, m6, 7
5259
+    pshufb              m4, m7
5260
+    palignr             m5, m3, m6, 5
5261
+    pshufb              m5, m8
5262
+    pmaddubsw           m4, m0
5263
+    pmaddubsw           m5, m1
5264
+    pmulhrsw            m4, m2
5265
+    pmulhrsw            m5, m2
5266
+    packuswb            m4, m5
5267
+    movu                [r0 + r1], m4
5268
+
5269
+    palignr             m4, m3, m6, 8
5270
+    pshufb              m4, m7
5271
+    palignr             m5, m3, m6, 6
5272
+    pshufb              m5, m8
5273
+    pmaddubsw           m4, m0
5274
+    pmaddubsw           m5, m1
5275
+    pmulhrsw            m4, m2
5276
+    pmulhrsw            m5, m2
5277
+    packuswb            m4, m5
5278
+    movu                [r0 + r1 * 2], m4
5279
+
5280
+    palignr             m4, m3, m6, 9
5281
+    pshufb              m4, m7
5282
+    palignr             m5, m3, m6, 7
5283
+    pshufb              m5, m8
5284
+    pmaddubsw           m4, m0
5285
+    pmaddubsw           m5, m1
5286
+    pmulhrsw            m4, m2
5287
+    pmulhrsw            m5, m2
5288
+    packuswb            m4, m5
5289
+    movu                [r0 + r3], m4
5290
+
5291
+    lea                 r0, [r0 + r1 * 4]
5292
+
5293
+    palignr             m4, m3, m6, 10
5294
+    pshufb              m4, m7
5295
+    palignr             m5, m3, m6, 8
5296
+    pshufb              m5, m8
5297
+    pmaddubsw           m4, m0
5298
+    pmaddubsw           m5, m1
5299
+    pmulhrsw            m4, m2
5300
+    pmulhrsw            m5, m2
5301
+    packuswb            m4, m5
5302
+    movu                [r0], m4
5303
+
5304
+    palignr             m4, m3, m6, 11
5305
+    pshufb              m4, m7
5306
+    palignr             m5, m3, m6, 9
5307
+    pshufb              m5, m8
5308
+    pmaddubsw           m4, m0
5309
+    pmaddubsw           m5, m1
5310
+    pmulhrsw            m4, m2
5311
+    pmulhrsw            m5, m2
5312
+    packuswb            m4, m5
5313
+    movu                [r0 + r1], m4
5314
+
5315
+    palignr             m4, m3, m6, 12
5316
+    pshufb              m4, m7
5317
+    palignr             m5, m3, m6, 10
5318
+    pshufb              m5, m8
5319
+    pmaddubsw           m4, m0
5320
+    pmaddubsw           m5, m1
5321
+    pmulhrsw            m4, m2
5322
+    pmulhrsw            m5, m2
5323
+    packuswb            m4, m5
5324
+    movu                [r0 + r1 * 2], m4
5325
+
5326
+    palignr             m4, m3, m6, 13
5327
+    pshufb              m4, m7
5328
+    palignr             m5, m3, m6, 11
5329
+    pshufb              m5, m8
5330
+    pmaddubsw           m4, m0
5331
+    pmaddubsw           m5, m1
5332
+    pmulhrsw            m4, m2
5333
+    pmulhrsw            m5, m2
5334
+    packuswb            m4, m5
5335
+    movu                [r0 + r3], m4
5336
+
5337
+    lea                 r0, [r0 + r1 * 4]
5338
+
5339
+    palignr             m4, m3, m6, 14
5340
+    pshufb              m4, m7
5341
+    palignr             m5, m3, m6, 12
5342
+    pshufb              m5, m8
5343
+    pmaddubsw           m4, m0
5344
+    pmaddubsw           m5, m1
5345
+    pmulhrsw            m4, m2
5346
+    pmulhrsw            m5, m2
5347
+    packuswb            m4, m5
5348
+    movu                [r0], m4
5349
+
5350
+    palignr             m4, m3, m6, 15
5351
+    pshufb              m4, m7
5352
+    palignr             m5, m3, m6, 13
5353
+    pshufb              m5, m8
5354
+    pmaddubsw           m4, m0
5355
+    pmaddubsw           m5, m1
5356
+    pmulhrsw            m4, m2
5357
+    pmulhrsw            m5, m2
5358
+    packuswb            m4, m5
5359
+    movu                [r0 + r1], m4
5360
+
5361
+    pshufb              m4, m3, m7
5362
+    palignr             m5, m3, m6, 14
5363
+    pshufb              m5, m8
5364
+    pmaddubsw           m4, m0
5365
+    pmaddubsw           m5, m1
5366
+    pmulhrsw            m4, m2
5367
+    pmulhrsw            m5, m2
5368
+    packuswb            m4, m5
5369
+    movu                [r0 + r1 * 2], m4
5370
+
5371
+    palignr             m5, m3, m6, 15
5372
+    vbroadcasti128      m6, [r2 + mmsize*2 + 32]
5373
+
5374
+    palignr             m4, m6, m3, 1
5375
+    pshufb              m4, m7
5376
+    pshufb              m5, m8
5377
+    pmaddubsw           m4, m0
5378
+    pmaddubsw           m5, m1
5379
+    pmulhrsw            m4, m2
5380
+    pmulhrsw            m5, m2
5381
+    packuswb            m4, m5
5382
+    movu                [r0 + r3], m4
5383
+    RET
5384
+
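+; annotation (added for clarity): predicts a 32x32 block for HEVC angular
+; mode 21, the vertical-class twin of mode 15 (intraPredAngle -17); the
+; per-row fractions step by -17 (mod 32): 15, 30, 13, 28, ...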
5385
+cglobal intra_pred_ang32_21, 3,5,9
5386
+    lea                 r3, [ang_table_avx2 + 32 * 16]
5387
+    lea                 r4, [r1 * 3]
5388
+    mova                m5, [pw_1024]
5389
+
5390
+    ; rows 0 to 7
5391
+    movu                m0, [r2 + 0]
5392
+    movu                m1, [r2 + 1]
5393
+    punpckhbw           m2, m0, m1
5394
+    punpcklbw           m0, m1
5395
+
5396
+    movu                m4, [r2 + mmsize*2]
5397
+    pshufb              m4, [ang32_shuf_mode21]
5398
+    vextracti128        xm6, m4, 1
5399
+
5400
+    palignr             m3, m0, m4, 1
5401
+    palignr             m8, m3, m6, 1
5402
+    vinserti128         m3, m3, xm2, 1
5403
+    vinserti128         m8, m8, xm0, 1
5404
+
5405
+    pmaddubsw           m4, m0, [r3 - 1 * 32]           ; [15]
5406
+    pmulhrsw            m4, m5
5407
+    pmaddubsw           m1, m2, [r3 - 1 * 32]
5408
+    pmulhrsw            m1, m5
5409
+    packuswb            m4, m1
5410
+    movu                [r0], m4
5411
+
5412
+    palignr             m6, m0, m3, 14
5413
+    palignr             m7, m2, m0, 14
5414
+    pmaddubsw           m4, m6, [r3 + 14 * 32]          ; [30]
5415
+    pmulhrsw            m4, m5
5416
+    pmaddubsw           m1, m7, [r3 + 14 * 32]
5417
+    pmulhrsw            m1, m5
5418
+    packuswb            m4, m1
5419
+    movu                [r0 + r1], m4
5420
+
5421
+    pmaddubsw           m4, m6, [r3 - 3 * 32]           ; [13]
5422
+    pmulhrsw            m4, m5
5423
+    pmaddubsw           m1, m7, [r3 - 3 * 32]
5424
+    pmulhrsw            m1, m5
5425
+    packuswb            m4, m1
5426
+    movu                [r0 + r1*2], m4
5427
+
5428
+    palignr             m6, m0, m3, 12
5429
+    palignr             m7, m2, m0, 12
5430
+    pmaddubsw           m4, m6, [r3 + 12 * 32]          ; [28]
5431
+    pmulhrsw            m4, m5
5432
+    pmaddubsw           m1, m7, [r3 + 12 * 32]
5433
+    pmulhrsw            m1, m5
5434
+    packuswb            m4, m1
5435
+    movu                [r0 + r4], m4
5436
+
5437
+    lea                 r0, [r0 + r1 * 4]
5438
+
5439
+    pmaddubsw           m4, m6, [r3 - 5 * 32]           ; [11]
5440
+    pmulhrsw            m4, m5
5441
+    pmaddubsw           m1, m7, [r3 - 5 * 32]
5442
+    pmulhrsw            m1, m5
5443
+    packuswb            m4, m1
5444
+    movu                [r0], m4
5445
+
5446
+    palignr             m6, m0, m3, 10
5447
+    palignr             m7, m2, m0, 10
5448
+    pmaddubsw           m4, m6, [r3 + 10 * 32]          ; [26]
5449
+    pmulhrsw            m4, m5
5450
+    pmaddubsw           m1, m7, [r3 + 10 * 32]
5451
+    pmulhrsw            m1, m5
5452
+    packuswb            m4, m1
5453
+    movu                [r0 + r1], m4
5454
+
5455
+    pmaddubsw           m4, m6, [r3 - 7 * 32]           ; [9]
5456
+    pmulhrsw            m4, m5
5457
+    pmaddubsw           m1, m7, [r3 - 7 * 32]
5458
+    pmulhrsw            m1, m5
5459
+    packuswb            m4, m1
5460
+    movu                [r0 + r1*2], m4
5461
+
5462
+    palignr             m6, m0, m3, 8
5463
+    palignr             m7, m2, m0, 8
5464
+
5465
+    pmaddubsw           m4, m6, [r3 + 8 * 32]           ; [24]
5466
+    pmulhrsw            m4, m5
5467
+    pmaddubsw           m1, m7, [r3 + 8 * 32]
5468
+    pmulhrsw            m1, m5
5469
+    packuswb            m4, m1
5470
+    movu                [r0 + r4], m4
5471
+
5472
+    lea                 r0, [r0 + r1 * 4]
5473
+
5474
+    ; rows 8 to 15
5475
+    pmaddubsw           m4, m6, [r3 - 9 * 32]           ; [7]
5476
+    pmulhrsw            m4, m5
5477
+    pmaddubsw           m1, m7, [r3 - 9 * 32]
5478
+    pmulhrsw            m1, m5
5479
+    packuswb            m4, m1
5480
+    movu                [r0], m4
5481
+
5482
+    palignr             m6, m0, m3, 6
5483
+    palignr             m7, m2, m0, 6
5484
+    pmaddubsw           m4, m6, [r3 + 6 * 32]           ; [22]
5485
+    pmulhrsw            m4, m5
5486
+    pmaddubsw           m1, m7, [r3 + 6 * 32]
5487
+    pmulhrsw            m1, m5
5488
+    packuswb            m4, m1
5489
+    movu                [r0 + r1], m4
5490
+
5491
+    pmaddubsw           m4, m6, [r3 - 11 * 32]          ; [5]
5492
+    pmulhrsw            m4, m5
5493
+    pmaddubsw           m1, m7, [r3 - 11 * 32]
5494
+    pmulhrsw            m1, m5
5495
+    packuswb            m4, m1
5496
+    movu                [r0 + r1*2], m4
5497
+
5498
+    palignr             m6, m0, m3, 4
5499
+    palignr             m7, m2, m0, 4
5500
+    pmaddubsw           m4, m6, [r3 + 4 * 32]           ; [20]
5501
+    pmulhrsw            m4, m5
5502
+    pmaddubsw           m1, m7, [r3 + 4 * 32]
5503
+    pmulhrsw            m1, m5
5504
+    packuswb            m4, m1
5505
+    movu                [r0 + r4], m4
5506
+
5507
+    lea                 r0, [r0 + r1 * 4]
5508
+
5509
+    pmaddubsw           m4, m6, [r3 - 13 * 32]          ; [3]
5510
+    pmulhrsw            m4, m5
5511
+    pmaddubsw           m1, m7, [r3 - 13 * 32]
5512
+    pmulhrsw            m1, m5
5513
+    packuswb            m4, m1
5514
+    movu                [r0], m4
5515
+
5516
+    palignr             m6, m0, m3, 2
5517
+    palignr             m7, m2, m0, 2
5518
+    pmaddubsw           m4, m6, [r3 + 2 * 32]           ; [18]
5519
+    pmulhrsw            m4, m5
5520
+    pmaddubsw           m1, m7, [r3 + 2 * 32]
5521
+    pmulhrsw            m1, m5
5522
+    packuswb            m4, m1
5523
+    movu                [r0 + r1], m4
5524
+
5525
+    pmaddubsw           m4, m6, [r3 - 15 * 32]          ; [1]
5526
+    pmulhrsw            m4, m5
5527
+    pmaddubsw           m1, m7, [r3 - 15 * 32]
5528
+    pmulhrsw            m1, m5
5529
+    packuswb            m4, m1
5530
+    movu                [r0 + r1 * 2], m4
5531
+
5532
+    pmaddubsw           m4, m3, [r3]                    ; [16]
5533
+    pmulhrsw            m4, m5
5534
+    pmaddubsw           m1, m0, [r3]
5535
+    pmulhrsw            m1, m5
5536
+    packuswb            m4, m1
5537
+    movu                [r0 + r4], m4
5538
+
5539
+    lea                 r0, [r0 + r1 * 4]
5540
+
5541
+    ; rows 16 to 23
5542
+    palignr             m6, m3, m8, 14
5543
+    palignr             m7, m0, m3, 14
5544
+    pmaddubsw           m4, m6, [r3 + 15 * 32]          ; [31]
5545
+    pmulhrsw            m4, m5
5546
+    pmaddubsw           m1, m7, [r3 + 15 * 32]
5547
+    pmulhrsw            m1, m5
5548
+    packuswb            m4, m1
5549
+    movu                [r0], m4
5550
+
5551
+    pmaddubsw           m4, m6, [r3 - 2 * 32]           ; [14]
5552
+    pmulhrsw            m4, m5
5553
+    pmaddubsw           m1, m7, [r3 - 2 * 32]
5554
+    pmulhrsw            m1, m5
5555
+    packuswb            m4, m1
5556
+    movu                [r0 + r1], m4
5557
+
5558
+    palignr             m6, m3, m8, 12
5559
+    palignr             m7, m0, m3, 12
5560
+    pmaddubsw           m4, m6, [r3 + 13 * 32]          ; [29]
5561
+    pmulhrsw            m4, m5
5562
+    pmaddubsw           m1, m7, [r3 + 13 * 32]
5563
+    pmulhrsw            m1, m5
5564
+    packuswb            m4, m1
5565
+    movu                [r0 + r1*2], m4
5566
+
5567
+    pmaddubsw           m4, m6, [r3 - 4 * 32]           ; [12]
5568
+    pmulhrsw            m4, m5
5569
+    pmaddubsw           m1, m7, [r3 - 4 * 32]
5570
+    pmulhrsw            m1, m5
5571
+    packuswb            m4, m1
5572
+    movu                [r0 + r4], m4
5573
+
5574
+    lea                 r0, [r0 + r1 * 4]
5575
+
5576
+    palignr             m6, m3, m8, 10
5577
+    palignr             m7, m0, m3, 10
5578
+    pmaddubsw           m4, m6, [r3 + 11 * 32]          ; [27]
5579
+    pmulhrsw            m4, m5
5580
+    pmaddubsw           m1, m7, [r3 + 11 * 32]
5581
+    pmulhrsw            m1, m5
5582
+    packuswb            m4, m1
5583
+    movu                [r0], m4
5584
+
5585
+    pmaddubsw           m4, m6, [r3 - 6 * 32]           ; [10]
5586
+    pmulhrsw            m4, m5
5587
+    pmaddubsw           m1, m7, [r3 - 6 * 32]
5588
+    pmulhrsw            m1, m5
5589
+    packuswb            m4, m1
5590
+    movu                [r0 + r1], m4
5591
+
5592
+    palignr             m6, m3, m8, 8
5593
+    palignr             m7, m0, m3, 8
5594
+    pmaddubsw           m4, m6, [r3 + 9 * 32]           ; [25]
5595
+    pmulhrsw            m4, m5
5596
+    pmaddubsw           m1, m7, [r3 + 9 * 32]
5597
+    pmulhrsw            m1, m5
5598
+    packuswb            m4, m1
5599
+    movu                [r0 + r1*2], m4
5600
+
5601
+    pmaddubsw           m4, m6, [r3 - 8 * 32]           ; [8]
5602
+    pmulhrsw            m4, m5
5603
+    pmaddubsw           m1, m7, [r3 - 8 * 32]
5604
+    pmulhrsw            m1, m5
5605
+    packuswb            m4, m1
5606
+    movu                [r0 + r4], m4
5607
+
5608
+    lea                 r0, [r0 + r1 * 4]
5609
+
5610
+    ; rows 24 to 31
5611
+    palignr             m6, m3, m8, 6
5612
+    palignr             m7, m0, m3, 6
5613
+    pmaddubsw           m4, m6, [r3 + 7 * 32]           ; [23]
5614
+    pmulhrsw            m4, m5
5615
+    pmaddubsw           m1, m7, [r3 + 7 * 32]
5616
+    pmulhrsw            m1, m5
5617
+    packuswb            m4, m1
5618
+    movu                [r0], m4
5619
+
5620
+    pmaddubsw           m4, m6, [r3 - 10 * 32]          ; [6]
5621
+    pmulhrsw            m4, m5
5622
+    pmaddubsw           m1, m7, [r3 - 10 * 32]
5623
+    pmulhrsw            m1, m5
5624
+    packuswb            m4, m1
5625
+    movu                [r0 + r1], m4
5626
+
5627
+    palignr             m6, m3, m8, 4
5628
+    palignr             m7, m0, m3, 4
5629
+    pmaddubsw           m4, m6, [r3 + 5 * 32]           ; [21]
5630
+    pmulhrsw            m4, m5
5631
+    pmaddubsw           m1, m7, [r3 + 5 * 32]
5632
+    pmulhrsw            m1, m5
5633
+    packuswb            m4, m1
5634
+    movu                [r0 + r1 * 2], m4
5635
+
5636
+    pmaddubsw           m4, m6, [r3 - 12 * 32]          ; [4]
5637
+    pmulhrsw            m4, m5
5638
+    pmaddubsw           m1, m7, [r3 - 12 * 32]
5639
+    pmulhrsw            m1, m5
5640
+    packuswb            m4, m1
5641
+    movu                [r0 + r4], m4
5642
+
5643
+    lea                 r0, [r0 + r1 * 4]
5644
+
5645
+    palignr             m6, m3, m8, 2
5646
+    palignr             m7, m0, m3, 2
5647
+    pmaddubsw           m4, m6, [r3 + 3 * 32]           ; [19]
5648
+    pmulhrsw            m4, m5
5649
+    pmaddubsw           m1, m7, [r3 + 3 * 32]
5650
+    pmulhrsw            m1, m5
5651
+    packuswb            m4, m1
5652
+    movu                [r0], m4
5653
+
5654
+    pmaddubsw           m4, m6, [r3 - 14 * 32]          ; [2]
5655
+    pmulhrsw            m4, m5
5656
+    pmaddubsw           m1, m7, [r3 - 14 * 32]
5657
+    pmulhrsw            m1, m5
5658
+    packuswb            m4, m1
5659
+    movu                [r0 + r1], m4
5660
+
5661
+    pmaddubsw           m4, m8, [r3 + 1 * 32]           ; [17]
5662
+    pmulhrsw            m4, m5
5663
+    pmaddubsw           m1, m3, [r3 + 1 * 32]
5664
+    pmulhrsw            m1, m5
5665
+    packuswb            m4, m1
5666
+    movu                [r0 + r1*2], m4
5667
+
5668
+    pand                m8, [pw_00ff]
5669
+    pand                m3, [pw_00ff]
5670
+    packuswb            m8, m3
5671
+    movu                [r0 + r4], m8
5672
+    RET
5673
+
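Each intra_pred_ang32_NN kernel below produces one 32x32 HEVC angular intra prediction block. As a reading aid, here is a minimal scalar sketch of the interpolation these AVX2 kernels vectorize; it is illustrative only (not x265 code), shows the positive-angle case, and assumes ref[] is the extended reference array with the corner sample at ref[0]. The horizontal modes handled below (16, 17, 19, 20) additionally swap the roles of x and y and read from a projected reference.

    #include <stdint.h>

    /* Scalar model of HEVC angular interpolation (illustrative sketch).
     * angle is intraPredAngle in 1/32-sample units. */
    static void intra_ang_scalar(uint8_t *dst, int stride, const uint8_t *ref,
                                 int width, int angle)
    {
        for (int y = 0; y < width; y++)
        {
            int pos  = (y + 1) * angle;   /* accumulated angle          */
            int idx  = pos >> 5;          /* integer reference offset   */
            int frac = pos & 31;          /* 5-bit fractional weight    */
            for (int x = 0; x < width; x++)
                dst[y * stride + x] = (uint8_t)(((32 - frac) * ref[idx + x + 1] +
                                                 frac * ref[idx + x + 2] + 16) >> 5);
        }
    }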
5674
+cglobal intra_pred_ang32_16, 3,4,10
5675
+    movu                m0, [ang32_fact_mode16]
5676
+    movu                m1, [ang32_fact_mode16 + mmsize]
5677
+    mova                m2, [pw_1024]
5678
+    mova                m7, [ang32_shuf_mode16]
5679
+    mova                m8, [ang32_shuf_mode16 + mmsize]
5680
+    lea                 r3, [r1 * 3]
5681
+
5682
+    ; prepare for [30, 29, 27, 26, 24, 23, 21, 20, 18, 17, 15, 14, 12, 11,  9,  8,  6,  5,  3,  2,  0, -1, -2...]
5683
+
5684
+    movu                m6, [r2]
5685
+    pshufb              m6, [ang32_shuf_mode16 + mmsize*2]
5686
+    mova                m9, m6
5687
+    mova                m3, [ang32_shuf_mode16 + mmsize*3]
5688
+    vpermd              m6, m3, m6
5689
+    vpermq              m9, m9, q3232
5690
+    pslldq              m9, 4
5691
+    palignr             m6, m9, 15
5692
+    pslldq              m9, 1
5693
+
5694
+    vbroadcasti128      m3, [r2 + mmsize*2 + 1]
5695
+
5696
+    palignr             m4, m3, m6, 1
5697
+    palignr             m5, m6, m9, 6
5698
+    pshufb              m4, m7
5699
+    pshufb              m5, m8
5700
+    pmaddubsw           m4, m0
5701
+    pmaddubsw           m5, m1
5702
+    pmulhrsw            m4, m2
5703
+    pmulhrsw            m5, m2
5704
+    packuswb            m4, m5
5705
+    vpermq              m4, m4, q3120
5706
+    movu                [r0], m4
5707
+
5708
+    palignr             m4, m3, m6, 2
5709
+    palignr             m5, m6, m9, 7
5710
+    pshufb              m4, m7
5711
+    pshufb              m5, m8
5712
+    pmaddubsw           m4, m0
5713
+    pmaddubsw           m5, m1
5714
+    pmulhrsw            m4, m2
5715
+    pmulhrsw            m5, m2
5716
+    packuswb            m4, m5
5717
+    vpermq              m4, m4, q3120
5718
+    movu                [r0 + r1], m4
5719
+
5720
+    palignr             m4, m3, m6, 3
5721
+    palignr             m5, m6, m9, 8
5722
+    pshufb              m4, m7
5723
+    pshufb              m5, m8
5724
+    pmaddubsw           m4, m0
5725
+    pmaddubsw           m5, m1
5726
+    pmulhrsw            m4, m2
5727
+    pmulhrsw            m5, m2
5728
+    packuswb            m4, m5
5729
+    vpermq              m4, m4, q3120
5730
+    movu                [r0 + r1 * 2], m4
5731
+
5732
+    palignr             m4, m3, m6, 4
5733
+    palignr             m5, m6, m9, 9
5734
+    pshufb              m4, m7
5735
+    pshufb              m5, m8
5736
+    pmaddubsw           m4, m0
5737
+    pmaddubsw           m5, m1
5738
+    pmulhrsw            m4, m2
5739
+    pmulhrsw            m5, m2
5740
+    packuswb            m4, m5
5741
+    vpermq              m4, m4, q3120
5742
+    movu                [r0 + r3], m4
5743
+
5744
+    lea                 r0, [r0 + r1 * 4]
5745
+
5746
+    palignr             m4, m3, m6, 5
5747
+    palignr             m5, m6, m9, 10
5748
+    pshufb              m4, m7
5749
+    pshufb              m5, m8
5750
+    pmaddubsw           m4, m0
5751
+    pmaddubsw           m5, m1
5752
+    pmulhrsw            m4, m2
5753
+    pmulhrsw            m5, m2
5754
+    packuswb            m4, m5
5755
+    vpermq              m4, m4, q3120
5756
+    movu                [r0], m4
5757
+
5758
+    palignr             m4, m3, m6, 6
5759
+    palignr             m5, m6, m9, 11
5760
+    pshufb              m4, m7
5761
+    pshufb              m5, m8
5762
+    pmaddubsw           m4, m0
5763
+    pmaddubsw           m5, m1
5764
+    pmulhrsw            m4, m2
5765
+    pmulhrsw            m5, m2
5766
+    packuswb            m4, m5
5767
+    vpermq              m4, m4, q3120
5768
+    movu                [r0 + r1], m4
5769
+
5770
+    palignr             m4, m3, m6, 7
5771
+    palignr             m5, m6, m9, 12
5772
+    pshufb              m4, m7
5773
+    pshufb              m5, m8
5774
+    pmaddubsw           m4, m0
5775
+    pmaddubsw           m5, m1
5776
+    pmulhrsw            m4, m2
5777
+    pmulhrsw            m5, m2
5778
+    packuswb            m4, m5
5779
+    vpermq              m4, m4, q3120
5780
+    movu                [r0 + r1 * 2], m4
5781
+
5782
+    palignr             m4, m3, m6, 8
5783
+    palignr             m5, m6, m9, 13
5784
+    pshufb              m4, m7
5785
+    pshufb              m5, m8
5786
+    pmaddubsw           m4, m0
5787
+    pmaddubsw           m5, m1
5788
+    pmulhrsw            m4, m2
5789
+    pmulhrsw            m5, m2
5790
+    packuswb            m4, m5
5791
+    vpermq              m4, m4, q3120
5792
+    movu                [r0 + r3], m4
5793
+
5794
+    lea                 r0, [r0 + r1 * 4]
5795
+
5796
+    palignr             m4, m3, m6, 9
5797
+    palignr             m5, m6, m9, 14
5798
+    pshufb              m4, m7
5799
+    pshufb              m5, m8
5800
+    pmaddubsw           m4, m0
5801
+    pmaddubsw           m5, m1
5802
+    pmulhrsw            m4, m2
5803
+    pmulhrsw            m5, m2
5804
+    packuswb            m4, m5
5805
+    vpermq              m4, m4, q3120
5806
+    movu                [r0], m4
5807
+
5808
+    palignr             m4, m3, m6, 10
5809
+    palignr             m5, m6, m9, 15
5810
+    pshufb              m4, m7
5811
+    pshufb              m5, m8
5812
+    pmaddubsw           m4, m0
5813
+    pmaddubsw           m5, m1
5814
+    pmulhrsw            m4, m2
5815
+    pmulhrsw            m5, m2
5816
+    packuswb            m4, m5
5817
+    vpermq              m4, m4, q3120
5818
+    movu                [r0 + r1], m4
5819
+
5820
+    palignr             m4, m3, m6, 11
5821
+    pshufb              m4, m7
5822
+    pshufb              m5, m6, m8
5823
+    pmaddubsw           m4, m0
5824
+    pmaddubsw           m5, m1
5825
+    pmulhrsw            m4, m2
5826
+    pmulhrsw            m5, m2
5827
+    packuswb            m4, m5
5828
+    vpermq              m4, m4, q3120
5829
+    movu                [r0 + r1 * 2], m4
5830
+
5831
+    palignr             m4, m3, m6, 12
5832
+    palignr             m5, m3, m6, 1
5833
+    pshufb              m4, m7
5834
+    pshufb              m5, m8
5835
+    pmaddubsw           m4, m0
5836
+    pmaddubsw           m5, m1
5837
+    pmulhrsw            m4, m2
5838
+    pmulhrsw            m5, m2
5839
+    packuswb            m4, m5
5840
+    vpermq              m4, m4, q3120
5841
+    movu                [r0 + r3], m4
5842
+
5843
+    lea                 r0, [r0 + r1 * 4]
5844
+
5845
+    palignr             m4, m3, m6, 13
5846
+    palignr             m5, m3, m6, 2
5847
+    pshufb              m4, m7
5848
+    pshufb              m5, m8
5849
+    pmaddubsw           m4, m0
5850
+    pmaddubsw           m5, m1
5851
+    pmulhrsw            m4, m2
5852
+    pmulhrsw            m5, m2
5853
+    packuswb            m4, m5
5854
+    vpermq              m4, m4, q3120
5855
+    movu                [r0], m4
5856
+
5857
+    palignr             m4, m3, m6, 14
5858
+    palignr             m5, m3, m6, 3
5859
+    pshufb              m4, m7
5860
+    pshufb              m5, m8
5861
+    pmaddubsw           m4, m0
5862
+    pmaddubsw           m5, m1
5863
+    pmulhrsw            m4, m2
5864
+    pmulhrsw            m5, m2
5865
+    packuswb            m4, m5
5866
+    vpermq              m4, m4, q3120
5867
+    movu                [r0 + r1], m4
5868
+
5869
+    palignr             m4, m3, m6, 15
5870
+    palignr             m5, m3, m6, 4
5871
+    pshufb              m4, m7
5872
+    pshufb              m5, m8
5873
+    pmaddubsw           m4, m0
5874
+    pmaddubsw           m5, m1
5875
+    pmulhrsw            m4, m2
5876
+    pmulhrsw            m5, m2
5877
+    packuswb            m4, m5
5878
+    vpermq              m4, m4, q3120
5879
+    movu                [r0 + r1 * 2], m4
5880
+
5881
+    palignr             m5, m3, m6, 5
5882
+    pshufb              m4, m3, m7
5883
+    pshufb              m5, m8
5884
+    pmaddubsw           m4, m0
5885
+    pmaddubsw           m5, m1
5886
+    pmulhrsw            m4, m2
5887
+    pmulhrsw            m5, m2
5888
+    packuswb            m4, m5
5889
+    vpermq              m4, m4, q3120
5890
+    movu                [r0 + r3], m4
5891
+
5892
+    lea                 r0, [r0 + r1 * 4]
5893
+
5894
+    vbroadcasti128      m9, [r2 + mmsize*2 + 17]
5895
+
5896
+    palignr             m4, m9, m3, 1
5897
+    palignr             m5, m3, m6, 6
5898
+    pshufb              m4, m7
5899
+    pshufb              m5, m8
5900
+    pmaddubsw           m4, m0
5901
+    pmaddubsw           m5, m1
5902
+    pmulhrsw            m4, m2
5903
+    pmulhrsw            m5, m2
5904
+    packuswb            m4, m5
5905
+    vpermq              m4, m4, q3120
5906
+    movu                [r0], m4
5907
+
5908
+    palignr             m4, m9, m3, 2
5909
+    palignr             m5, m3, m6, 7
5910
+    pshufb              m4, m7
5911
+    pshufb              m5, m8
5912
+    pmaddubsw           m4, m0
5913
+    pmaddubsw           m5, m1
5914
+    pmulhrsw            m4, m2
5915
+    pmulhrsw            m5, m2
5916
+    packuswb            m4, m5
5917
+    vpermq              m4, m4, q3120
5918
+    movu                [r0 + r1], m4
5919
+
5920
+    palignr             m4, m9, m3, 3
5921
+    palignr             m5, m3, m6, 8
5922
+    pshufb              m4, m7
5923
+    pshufb              m5, m8
5924
+    pmaddubsw           m4, m0
5925
+    pmaddubsw           m5, m1
5926
+    pmulhrsw            m4, m2
5927
+    pmulhrsw            m5, m2
5928
+    packuswb            m4, m5
5929
+    vpermq              m4, m4, q3120
5930
+    movu                [r0 + r1 * 2], m4
5931
+
5932
+    palignr             m4, m9, m3, 4
5933
+    palignr             m5, m3, m6, 9
5934
+    pshufb              m4, m7
5935
+    pshufb              m5, m8
5936
+    pmaddubsw           m4, m0
5937
+    pmaddubsw           m5, m1
5938
+    pmulhrsw            m4, m2
5939
+    pmulhrsw            m5, m2
5940
+    packuswb            m4, m5
5941
+    vpermq              m4, m4, q3120
5942
+    movu                [r0 + r3], m4
5943
+
5944
+    lea                 r0, [r0 + r1 * 4]
5945
+
5946
+    palignr             m4, m9, m3, 5
5947
+    palignr             m5, m3, m6, 10
5948
+    pshufb              m4, m7
5949
+    pshufb              m5, m8
5950
+    pmaddubsw           m4, m0
5951
+    pmaddubsw           m5, m1
5952
+    pmulhrsw            m4, m2
5953
+    pmulhrsw            m5, m2
5954
+    packuswb            m4, m5
5955
+    vpermq              m4, m4, q3120
5956
+    movu                [r0], m4
5957
+
5958
+    palignr             m4, m9, m3, 6
5959
+    palignr             m5, m3, m6, 11
5960
+    pshufb              m4, m7
5961
+    pshufb              m5, m8
5962
+    pmaddubsw           m4, m0
5963
+    pmaddubsw           m5, m1
5964
+    pmulhrsw            m4, m2
5965
+    pmulhrsw            m5, m2
5966
+    packuswb            m4, m5
5967
+    vpermq              m4, m4, q3120
5968
+    movu                [r0 + r1], m4
5969
+
5970
+    palignr             m4, m9, m3, 7
5971
+    palignr             m5, m3, m6, 12
5972
+    pshufb              m4, m7
5973
+    pshufb              m5, m8
5974
+    pmaddubsw           m4, m0
5975
+    pmaddubsw           m5, m1
5976
+    pmulhrsw            m4, m2
5977
+    pmulhrsw            m5, m2
5978
+    packuswb            m4, m5
5979
+    vpermq              m4, m4, q3120
5980
+    movu                [r0 + r1 * 2], m4
5981
+
5982
+    palignr             m4, m9, m3, 8
5983
+    palignr             m5, m3, m6, 13
5984
+    pshufb              m4, m7
5985
+    pshufb              m5, m8
5986
+    pmaddubsw           m4, m0
5987
+    pmaddubsw           m5, m1
5988
+    pmulhrsw            m4, m2
5989
+    pmulhrsw            m5, m2
5990
+    packuswb            m4, m5
5991
+    vpermq              m4, m4, q3120
5992
+    movu                [r0 + r3], m4
5993
+
5994
+    lea                 r0, [r0 + r1 * 4]
5995
+
5996
+    palignr             m4, m9, m3, 9
5997
+    palignr             m5, m3, m6, 14
5998
+    pshufb              m4, m7
5999
+    pshufb              m5, m8
6000
+    pmaddubsw           m4, m0
6001
+    pmaddubsw           m5, m1
6002
+    pmulhrsw            m4, m2
6003
+    pmulhrsw            m5, m2
6004
+    packuswb            m4, m5
6005
+    vpermq              m4, m4, q3120
6006
+    movu                [r0], m4
6007
+
6008
+    palignr             m4, m9, m3, 10
6009
+    palignr             m5, m3, m6, 15
6010
+    pshufb              m4, m7
6011
+    pshufb              m5, m8
6012
+    pmaddubsw           m4, m0
6013
+    pmaddubsw           m5, m1
6014
+    pmulhrsw            m4, m2
6015
+    pmulhrsw            m5, m2
6016
+    packuswb            m4, m5
6017
+    vpermq              m4, m4, q3120
6018
+    movu                [r0 + r1], m4
6019
+
6020
+    palignr             m4, m9, m3, 11
6021
+    pshufb              m4, m7
6022
+    pshufb              m5, m3, m8
6023
+    pmaddubsw           m4, m0
6024
+    pmaddubsw           m5, m1
6025
+    pmulhrsw            m4, m2
6026
+    pmulhrsw            m5, m2
6027
+    packuswb            m4, m5
6028
+    vpermq              m4, m4, q3120
6029
+    movu                [r0 + r1 * 2], m4
6030
+
6031
+    palignr             m4, m9, m3, 12
6032
+    palignr             m5, m9, m3, 1
6033
+    pshufb              m4, m7
6034
+    pshufb              m5, m8
6035
+    pmaddubsw           m4, m0
6036
+    pmaddubsw           m5, m1
6037
+    pmulhrsw            m4, m2
6038
+    pmulhrsw            m5, m2
6039
+    packuswb            m4, m5
6040
+    vpermq              m4, m4, q3120
6041
+    movu                [r0 + r3], m4
6042
+
6043
+    lea                 r0, [r0 + r1 * 4]
6044
+
6045
+    palignr             m4, m9, m3, 13
6046
+    palignr             m5, m9, m3, 2
6047
+    pshufb              m4, m7
6048
+    pshufb              m5, m8
6049
+    pmaddubsw           m4, m0
6050
+    pmaddubsw           m5, m1
6051
+    pmulhrsw            m4, m2
6052
+    pmulhrsw            m5, m2
6053
+    packuswb            m4, m5
6054
+    vpermq              m4, m4, q3120
6055
+    movu                [r0], m4
6056
+
6057
+    palignr             m4, m9, m3, 14
6058
+    palignr             m5, m9, m3, 3
6059
+    pshufb              m4, m7
6060
+    pshufb              m5, m8
6061
+    pmaddubsw           m4, m0
6062
+    pmaddubsw           m5, m1
6063
+    pmulhrsw            m4, m2
6064
+    pmulhrsw            m5, m2
6065
+    packuswb            m4, m5
6066
+    vpermq              m4, m4, q3120
6067
+    movu                [r0 + r1], m4
6068
+
6069
+    palignr             m4, m9, m3, 15
6070
+    palignr             m5, m9, m3, 4
6071
+    pshufb              m4, m7
6072
+    pshufb              m5, m8
6073
+    pmaddubsw           m4, m0
6074
+    pmaddubsw           m5, m1
6075
+    pmulhrsw            m4, m2
6076
+    pmulhrsw            m5, m2
6077
+    packuswb            m4, m5
6078
+    vpermq              m4, m4, q3120
6079
+    movu                [r0 + r1 * 2], m4
6080
+
6081
+    palignr             m5, m9, m3, 5
6082
+    pshufb              m4, m9, m7
6083
+    pshufb              m5, m8
6084
+    pmaddubsw           m4, m0
6085
+    pmaddubsw           m5, m1
6086
+    pmulhrsw            m4, m2
6087
+    pmulhrsw            m5, m2
6088
+    packuswb            m4, m5
6089
+    vpermq              m4, m4, q3120
6090
+    movu                [r0 + r3], m4
6091
+    RET
6092
+
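Throughout these kernels each row is computed as pmaddubsw against a (32 - frac, frac) byte-pair weight followed by pmulhrsw with pw_1024. The sketch below (illustrative C, with a scalar model of pmulhrsw) checks the identity this relies on: multiplying a non-negative 16-bit word by 1024 under pmulhrsw's round-and-shift equals the rounded 5-bit shift (w + 16) >> 5, and the pmaddubsw sum (32 - f) * a + f * b is at most 8160, so its 16-bit saturation never triggers.

    #include <assert.h>
    #include <stdint.h>

    /* Scalar model of pmulhrsw: ((x * y * 2) + 0x8000) >> 16 */
    static int16_t pmulhrsw_scalar(int16_t x, int16_t y)
    {
        return (int16_t)(((int32_t)x * y * 2 + 0x8000) >> 16);
    }

    int main(void)
    {
        for (int f = 0; f < 32; f++)
            for (int a = 0; a < 256; a++)
                for (int b = 0; b < 256; b++)
                {
                    int16_t w = (int16_t)((32 - f) * a + f * b); /* pmaddubsw pair */
                    assert(pmulhrsw_scalar(w, 1024) == ((w + 16) >> 5));
                }
        return 0;
    }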
6093
+cglobal intra_pred_ang32_20, 3,5,10
6094
+    lea                 r3, [ang_table_avx2 + 32 * 16]
6095
+    lea                 r4, [r1 * 3]
6096
+    mova                m5, [pw_1024]
6097
+
6098
+    ; rows 0 to 7
6099
+    movu                m0, [r2 + 0]
6100
+    movu                m1, [r2 + 1]
6101
+    punpckhbw           m2, m0, m1
6102
+    punpcklbw           m0, m1
6103
+
6104
+    movu                m4, [r2 + mmsize*2]
6105
+    pshufb              m4, [ang32_shuf_mode20]
6106
+    mova                m9, m4
6107
+    vpermq              m9, m9, q3333
6108
+    mova                m7, m4
6109
+    vpermq              m7, m7, q1111
6110
+    palignr             m4, m7, 14
6111
+    pshufb              m4, [ang32_shuf_mode20 + mmsize*1]
6112
+
6113
+    vextracti128       xm6, m4, 1
6114
+    palignr             m3, m0, m4, 1
6115
+    palignr             m8, m3, m6, 1
6116
+    vinserti128         m3, m3, xm2, 1
6117
+    vinserti128         m8, m8, xm0, 1
6118
+    vinserti128         m9, m9, xm3, 1
6119
+
6120
+    pmaddubsw           m4, m0, [r3 - 5 * 32]           ; [11]
6121
+    pmulhrsw            m4, m5
6122
+    pmaddubsw           m1, m2, [r3 - 5 * 32]
6123
+    pmulhrsw            m1, m5
6124
+    packuswb            m4, m1
6125
+    movu                [r0], m4
6126
+
6127
+    palignr             m6, m0, m3, 14
6128
+    palignr             m7, m2, m0, 14
6129
+    pmaddubsw           m4, m6, [r3 + 6 * 32]           ; [22]
6130
+    pmulhrsw            m4, m5
6131
+    pmaddubsw           m1, m7, [r3 + 6 * 32]
6132
+    pmulhrsw            m1, m5
6133
+    packuswb            m4, m1
6134
+    movu                [r0 + r1], m4
6135
+
6136
+    pmaddubsw           m4, m6, [r3 - 15 * 32]          ; [1]
6137
+    pmulhrsw            m4, m5
6138
+    pmaddubsw           m1, m7, [r3 - 15 * 32]
6139
+    pmulhrsw            m1, m5
6140
+    packuswb            m4, m1
6141
+    movu                [r0 + r1*2], m4
6142
+
6143
+    palignr             m6, m0, m3, 12
6144
+    palignr             m7, m2, m0, 12
6145
+    pmaddubsw           m4, m6, [r3 - 4 * 32]           ; [12]
6146
+    pmulhrsw            m4, m5
6147
+    pmaddubsw           m1, m7, [r3 - 4 * 32]
6148
+    pmulhrsw            m1, m5
6149
+    packuswb            m4, m1
6150
+    movu                [r0 + r4], m4
6151
+
6152
+    lea                 r0, [r0 + r1 * 4]
6153
+
6154
+    palignr             m6, m0, m3, 10
6155
+    palignr             m7, m2, m0, 10
6156
+    pmaddubsw           m4, m6, [r3 + 7 * 32]           ; [23]
6157
+    pmulhrsw            m4, m5
6158
+    pmaddubsw           m1, m7, [r3 + 7 * 32]
6159
+    pmulhrsw            m1, m5
6160
+    packuswb            m4, m1
6161
+    movu                [r0], m4
6162
+
6163
+    pmaddubsw           m4, m6, [r3 - 14 * 32]          ; [2]
6164
+    pmulhrsw            m4, m5
6165
+    pmaddubsw           m1, m7, [r3 - 14 * 32]
6166
+    pmulhrsw            m1, m5
6167
+    packuswb            m4, m1
6168
+    movu                [r0 + r1], m4
6169
+
6170
+    palignr             m6, m0, m3, 8
6171
+    palignr             m7, m2, m0, 8
6172
+    pmaddubsw           m4, m6, [r3 - 3 * 32]           ; [13]
6173
+    pmulhrsw            m4, m5
6174
+    pmaddubsw           m1, m7, [r3 - 3 * 32]
6175
+    pmulhrsw            m1, m5
6176
+    packuswb            m4, m1
6177
+    movu                [r0 + r1*2], m4
6178
+
6179
+    palignr             m6, m0, m3, 6
6180
+    palignr             m7, m2, m0, 6
6181
+    pmaddubsw           m4, m6, [r3 + 8 * 32]           ; [24]
6182
+    pmulhrsw            m4, m5
6183
+    pmaddubsw           m1, m7, [r3 + 8 * 32]
6184
+    pmulhrsw            m1, m5
6185
+    packuswb            m4, m1
6186
+    movu                [r0 + r4], m4
6187
+
6188
+    lea                 r0, [r0 + r1 * 4]
6189
+
6190
+    ; rows 8 to 15
6191
+    pmaddubsw           m4, m6, [r3 - 13 * 32]          ; [3]
6192
+    pmulhrsw            m4, m5
6193
+    pmaddubsw           m1, m7, [r3 - 13 * 32]
6194
+    pmulhrsw            m1, m5
6195
+    packuswb            m4, m1
6196
+    movu                [r0], m4
6197
+
6198
+    palignr             m6, m0, m3, 4
6199
+    palignr             m7, m2, m0, 4
6200
+    pmaddubsw           m4, m6, [r3 - 2 * 32]           ; [14]
6201
+    pmulhrsw            m4, m5
6202
+    pmaddubsw           m1, m7, [r3 - 2 * 32]
6203
+    pmulhrsw            m1, m5
6204
+    packuswb            m4, m1
6205
+    movu                [r0 + r1], m4
6206
+
6207
+    palignr             m6, m0, m3, 2
6208
+    palignr             m7, m2, m0, 2
6209
+    pmaddubsw           m4, m6, [r3 + 9 * 32]           ; [25]
6210
+    pmulhrsw            m4, m5
6211
+    pmaddubsw           m1, m7, [r3 + 9 * 32]
6212
+    pmulhrsw            m1, m5
6213
+    packuswb            m4, m1
6214
+    movu                [r0 + r1*2], m4
6215
+
6216
+    pmaddubsw           m4, m6, [r3 - 12 * 32]          ; [4]
6217
+    pmulhrsw            m4, m5
6218
+    pmaddubsw           m1, m7, [r3 - 12 * 32]
6219
+    pmulhrsw            m1, m5
6220
+    packuswb            m4, m1
6221
+    movu                [r0 + r4], m4
6222
+
6223
+    lea                 r0, [r0 + r1 * 4]
6224
+
6225
+    pmaddubsw           m4, m3, [r3 - 1 * 32]           ; [15]
6226
+    pmulhrsw            m4, m5
6227
+    pmaddubsw           m1, m0, [r3 - 1 * 32]
6228
+    pmulhrsw            m1, m5
6229
+    packuswb            m4, m1
6230
+    movu                [r0], m4
6231
+
6232
+    palignr             m6, m3, m8, 14
6233
+    palignr             m7, m0, m3, 14
6234
+    pmaddubsw           m4, m6, [r3 + 10 * 32]          ; [26]
6235
+    pmulhrsw            m4, m5
6236
+    pmaddubsw           m1, m7, [r3 + 10 * 32]
6237
+    pmulhrsw            m1, m5
6238
+    packuswb            m4, m1
6239
+    movu                [r0 + r1], m4
6240
+
6241
+    pmaddubsw           m4, m6, [r3 - 11 * 32]          ; [5]
6242
+    pmulhrsw            m4, m5
6243
+    pmaddubsw           m1, m7, [r3 - 11 * 32]
6244
+    pmulhrsw            m1, m5
6245
+    packuswb            m4, m1
6246
+    movu                [r0 + r1 * 2], m4
6247
+
6248
+    palignr             m6, m3, m8, 12
6249
+    palignr             m7, m0, m3, 12
6250
+    pmaddubsw           m4, m6, [r3]                    ; [16]
6251
+    pmulhrsw            m4, m5
6252
+    pmaddubsw           m1, m7, [r3]
6253
+    pmulhrsw            m1, m5
6254
+    packuswb            m4, m1
6255
+    movu                [r0 + r4], m4
6256
+
6257
+    lea                 r0, [r0 + r1 * 4]
6258
+
6259
+    ; rows 16 to 23
6260
+    palignr             m6, m3, m8, 10
6261
+    palignr             m7, m0, m3, 10
6262
+    pmaddubsw           m4, m6, [r3 + 11 * 32]          ; [27]
6263
+    pmulhrsw            m4, m5
6264
+    pmaddubsw           m1, m7, [r3 + 11 * 32]
6265
+    pmulhrsw            m1, m5
6266
+    packuswb            m4, m1
6267
+    movu                [r0], m4
6268
+
6269
+    pmaddubsw           m4, m6, [r3 - 10 * 32]          ; [6]
6270
+    pmulhrsw            m4, m5
6271
+    pmaddubsw           m1, m7, [r3 - 10 * 32]
6272
+    pmulhrsw            m1, m5
6273
+    packuswb            m4, m1
6274
+    movu                [r0 + r1], m4
6275
+
6276
+    palignr             m6, m3, m8, 8
6277
+    palignr             m7, m0, m3, 8
6278
+    pmaddubsw           m4, m6, [r3 + 1 * 32]           ; [17]
6279
+    pmulhrsw            m4, m5
6280
+    pmaddubsw           m1, m7, [r3 + 1 * 32]
6281
+    pmulhrsw            m1, m5
6282
+    packuswb            m4, m1
6283
+    movu                [r0 + r1*2], m4
6284
+
6285
+    palignr             m6, m3, m8, 6
6286
+    palignr             m7, m0, m3, 6
6287
+    pmaddubsw           m4, m6, [r3 + 12 * 32]          ; [28]
6288
+    pmulhrsw            m4, m5
6289
+    pmaddubsw           m1, m7, [r3 + 12 * 32]
6290
+    pmulhrsw            m1, m5
6291
+    packuswb            m4, m1
6292
+    movu                [r0 + r4], m4
6293
+
6294
+    lea                 r0, [r0 + r1 * 4]
6295
+
6296
+    pmaddubsw           m4, m6, [r3 - 9 * 32]           ; [7]
6297
+    pmulhrsw            m4, m5
6298
+    pmaddubsw           m1, m7, [r3 - 9 * 32]
6299
+    pmulhrsw            m1, m5
6300
+    packuswb            m4, m1
6301
+    movu                [r0], m4
6302
+
6303
+    palignr             m6, m3, m8, 4
6304
+    palignr             m7, m0, m3, 4
6305
+    pmaddubsw           m4, m6, [r3 + 2 * 32]           ; [18]
6306
+    pmulhrsw            m4, m5
6307
+    pmaddubsw           m1, m7, [r3 + 2 * 32]
6308
+    pmulhrsw            m1, m5
6309
+    packuswb            m4, m1
6310
+    movu                [r0 + r1], m4
6311
+
6312
+    palignr             m6, m3, m8, 2
6313
+    palignr             m7, m0, m3, 2
6314
+    pmaddubsw           m4, m6, [r3 + 13 * 32]          ; [29]
6315
+    pmulhrsw            m4, m5
6316
+    pmaddubsw           m1, m7, [r3 + 13 * 32]
6317
+    pmulhrsw            m1, m5
6318
+    packuswb            m4, m1
6319
+    movu                [r0 + r1*2], m4
6320
+
6321
+    pmaddubsw           m4, m6, [r3 - 8 * 32]           ; [8]
6322
+    pmulhrsw            m4, m5
6323
+    pmaddubsw           m1, m7, [r3 - 8 * 32]
6324
+    pmulhrsw            m1, m5
6325
+    packuswb            m4, m1
6326
+    movu                [r0 + r4], m4
6327
+
6328
+    lea                 r0, [r0 + r1 * 4]
6329
+
6330
+    ; rows 24 to 31
6331
+    pmaddubsw           m4, m8, [r3 + 3 * 32]           ; [19]
6332
+    pmulhrsw            m4, m5
6333
+    pmaddubsw           m1, m3, [r3 + 3 * 32]
6334
+    pmulhrsw            m1, m5
6335
+    packuswb            m4, m1
6336
+    movu                [r0], m4
6337
+
6338
+    palignr             m6, m8, m9, 14
6339
+    palignr             m7, m3, m8, 14
6340
+    pmaddubsw           m4, m6, [r3 + 14 * 32]          ; [30]
6341
+    pmulhrsw            m4, m5
6342
+    pmaddubsw           m1, m7, [r3 + 14 * 32]
6343
+    pmulhrsw            m1, m5
6344
+    packuswb            m4, m1
6345
+    movu                [r0 + r1], m4
6346
+
6347
+    pmaddubsw           m4, m6, [r3 - 7 * 32]           ; [9]
6348
+    pmulhrsw            m4, m5
6349
+    pmaddubsw           m1, m7, [r3 - 7 * 32]
6350
+    pmulhrsw            m1, m5
6351
+    packuswb            m4, m1
6352
+    movu                [r0 + r1 * 2], m4
6353
+
6354
+    palignr             m6, m8, m9, 12
6355
+    palignr             m7, m3, m8, 12
6356
+    pmaddubsw           m4, m6, [r3 + 4 * 32]           ; [20]
6357
+    pmulhrsw            m4, m5
6358
+    pmaddubsw           m1, m7, [r3 + 4 * 32]
6359
+    pmulhrsw            m1, m5
6360
+    packuswb            m4, m1
6361
+    movu                [r0 + r4], m4
6362
+
6363
+    lea                 r0, [r0 + r1 * 4]
6364
+
6365
+    palignr             m6, m8, m9, 10
6366
+    palignr             m7, m3, m8, 10
6367
+    pmaddubsw           m4, m6, [r3 + 15 * 32]          ; [31]
6368
+    pmulhrsw            m4, m5
6369
+    pmaddubsw           m1, m7, [r3 + 15 * 32]
6370
+    pmulhrsw            m1, m5
6371
+    packuswb            m4, m1
6372
+    movu                [r0], m4
6373
+
6374
+    pmaddubsw           m4, m6, [r3 - 6 * 32]           ; [10]
6375
+    pmulhrsw            m4, m5
6376
+    pmaddubsw           m1, m7, [r3 - 6 * 32]
6377
+    pmulhrsw            m1, m5
6378
+    packuswb            m4, m1
6379
+    movu                [r0 + r1], m4
6380
+
6381
+    palignr             m6, m8, m9, 8
6382
+    palignr             m7, m3, m8, 8
6383
+    pmaddubsw           m4, m6, [r3 + 5 * 32]           ; [21]
6384
+    pmulhrsw            m4, m5
6385
+    pmaddubsw           m1, m7, [r3 + 5 * 32]
6386
+    pmulhrsw            m1, m5
6387
+    packuswb            m4, m1
6388
+    movu                [r0 + r1*2], m4
6389
+
6390
+    pand                m6, [pw_00ff]
6391
+    pand                m7, [pw_00ff]
6392
+    packuswb            m6, m7
6393
+    movu                [r0 + r4], m6
6394
+    RET
6395
+
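The bracketed numbers in the comments above ([11], [22], [1], [12], ...) are the per-row fractional weights: r3 points at the fraction-16 row of ang_table_avx2, so a displacement of k * 32 selects fraction 16 + k (e.g. [r3 - 5 * 32] is fraction 11). Assuming the standard HEVC angle table, mode 20 has intraPredAngle -21, and the sequence falls out of the accumulated angle, as this illustrative program shows:

    #include <stdio.h>

    int main(void)
    {
        const int angle = -21;              /* intraPredAngle for mode 20 */
        for (int y = 0; y < 32; y++)
        {
            int pos = (y + 1) * angle;
            /* pos >> 5 assumes arithmetic shift, as the HEVC spec does */
            printf("row %2d: ref idx %3d, frac %2d\n", y, pos >> 5, pos & 31);
        }
        return 0;                           /* fracs: 11 22 1 12 23 2 ... */
    }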
6396
+cglobal intra_pred_ang32_17, 3,4,8
6397
+    movu                m0, [ang32_fact_mode17]
6398
+    mova                m2, [pw_1024]
6399
+    mova                m7, [ang32_shuf_mode17]
6400
+    lea                 r3, [r1 * 3]
6401
+
6402
+    ; prepare for [31, 30, 28, 27, 26, 25, 23, 22, 21, 20, 18, 17, 16, 15, 14, 12, 11, 10,  9,  7,  6,  5,  4,  2,  1,  0, -1, -2...]
6403
+
6404
+    movu                m6, [r2]
6405
+    pshufb              m6, [ang32_shuf_mode17 + mmsize]
6406
+    mova                m1, m6
6407
+    mova                m3, [ang32_shuf_mode16 + mmsize*3]
6408
+    vpermd              m6, m3, m6
6409
+    vpermq              m1, m1, q3232
6410
+    pslldq              m1, 4
6411
+
6412
+    movu               xm4, [r2 + mmsize*2]
6413
+    pinsrb             xm4, [r2], 0
6414
+    vinserti128         m3, m4, xm4, 1
6415
+
6416
+    palignr             m4, m3, m6, 2
6417
+    palignr             m5, m6, m1, 5
6418
+    pshufb              m4, m7
6419
+    pshufb              m5, m7
6420
+    pmaddubsw           m4, m0
6421
+    pmaddubsw           m5, m0
6422
+    pmulhrsw            m4, m2
6423
+    pmulhrsw            m5, m2
6424
+    packuswb            m4, m5
6425
+    vpermq              m4, m4, q3120
6426
+    movu                [r0], m4
6427
+
6428
+    palignr             m4, m3, m6, 3
6429
+    palignr             m5, m6, m1, 6
6430
+    pshufb              m4, m7
6431
+    pshufb              m5, m7
6432
+    pmaddubsw           m4, m0
6433
+    pmaddubsw           m5, m0
6434
+    pmulhrsw            m4, m2
6435
+    pmulhrsw            m5, m2
6436
+    packuswb            m4, m5
6437
+    vpermq              m4, m4, q3120
6438
+    movu                [r0 + r1], m4
6439
+
6440
+    palignr             m4, m3, m6, 4
6441
+    palignr             m5, m6, m1, 7
6442
+    pshufb              m4, m7
6443
+    pshufb              m5, m7
6444
+    pmaddubsw           m4, m0
6445
+    pmaddubsw           m5, m0
6446
+    pmulhrsw            m4, m2
6447
+    pmulhrsw            m5, m2
6448
+    packuswb            m4, m5
6449
+    vpermq              m4, m4, q3120
6450
+    movu                [r0 + r1 * 2], m4
6451
+
6452
+    palignr             m4, m3, m6, 5
6453
+    palignr             m5, m6, m1, 8
6454
+    pshufb              m4, m7
6455
+    pshufb              m5, m7
6456
+    pmaddubsw           m4, m0
6457
+    pmaddubsw           m5, m0
6458
+    pmulhrsw            m4, m2
6459
+    pmulhrsw            m5, m2
6460
+    packuswb            m4, m5
6461
+    vpermq              m4, m4, q3120
6462
+    movu                [r0 + r3], m4
6463
+
6464
+    lea                 r0, [r0 + r1 * 4]
6465
+
6466
+    palignr             m4, m3, m6, 6
6467
+    palignr             m5, m6, m1, 9
6468
+    pshufb              m4, m7
6469
+    pshufb              m5, m7
6470
+    pmaddubsw           m4, m0
6471
+    pmaddubsw           m5, m0
6472
+    pmulhrsw            m4, m2
6473
+    pmulhrsw            m5, m2
6474
+    packuswb            m4, m5
6475
+    vpermq              m4, m4, q3120
6476
+    movu                [r0], m4
6477
+
6478
+    palignr             m4, m3, m6, 7
6479
+    palignr             m5, m6, m1, 10
6480
+    pshufb              m4, m7
6481
+    pshufb              m5, m7
6482
+    pmaddubsw           m4, m0
6483
+    pmaddubsw           m5, m0
6484
+    pmulhrsw            m4, m2
6485
+    pmulhrsw            m5, m2
6486
+    packuswb            m4, m5
6487
+    vpermq              m4, m4, q3120
6488
+    movu                [r0 + r1], m4
6489
+
6490
+    palignr             m4, m3, m6, 8
6491
+    palignr             m5, m6, m1, 11
6492
+    pshufb              m4, m7
6493
+    pshufb              m5, m7
6494
+    pmaddubsw           m4, m0
6495
+    pmaddubsw           m5, m0
6496
+    pmulhrsw            m4, m2
6497
+    pmulhrsw            m5, m2
6498
+    packuswb            m4, m5
6499
+    vpermq              m4, m4, q3120
6500
+    movu                [r0 + r1 * 2], m4
6501
+
6502
+    palignr             m4, m3, m6, 9
6503
+    palignr             m5, m6, m1, 12
6504
+    pshufb              m4, m7
6505
+    pshufb              m5, m7
6506
+    pmaddubsw           m4, m0
6507
+    pmaddubsw           m5, m0
6508
+    pmulhrsw            m4, m2
6509
+    pmulhrsw            m5, m2
6510
+    packuswb            m4, m5
6511
+    vpermq              m4, m4, q3120
6512
+    movu                [r0 + r3], m4
6513
+
6514
+    lea                 r0, [r0 + r1 * 4]
6515
+
6516
+    palignr             m4, m3, m6, 10
6517
+    palignr             m5, m6, m1, 13
6518
+    pshufb              m4, m7
6519
+    pshufb              m5, m7
6520
+    pmaddubsw           m4, m0
6521
+    pmaddubsw           m5, m0
6522
+    pmulhrsw            m4, m2
6523
+    pmulhrsw            m5, m2
6524
+    packuswb            m4, m5
6525
+    vpermq              m4, m4, q3120
6526
+    movu                [r0], m4
6527
+
6528
+    palignr             m4, m3, m6, 11
6529
+    palignr             m5, m6, m1, 14
6530
+    pshufb              m4, m7
6531
+    pshufb              m5, m7
6532
+    pmaddubsw           m4, m0
6533
+    pmaddubsw           m5, m0
6534
+    pmulhrsw            m4, m2
6535
+    pmulhrsw            m5, m2
6536
+    packuswb            m4, m5
6537
+    vpermq              m4, m4, q3120
6538
+    movu                [r0 + r1], m4
6539
+
6540
+    palignr             m4, m3, m6, 12
6541
+    palignr             m5, m6, m1, 15
6542
+    pshufb              m4, m7
6543
+    pshufb              m5, m7
6544
+    pmaddubsw           m4, m0
6545
+    pmaddubsw           m5, m0
6546
+    pmulhrsw            m4, m2
6547
+    pmulhrsw            m5, m2
6548
+    packuswb            m4, m5
6549
+    vpermq              m4, m4, q3120
6550
+    movu                [r0 + r1 * 2], m4
6551
+
6552
+    palignr             m4, m3, m6, 13
6553
+    pshufb              m4, m7
6554
+    pshufb              m5, m6, m7
6555
+    pmaddubsw           m4, m0
6556
+    pmaddubsw           m5, m0
6557
+    pmulhrsw            m4, m2
6558
+    pmulhrsw            m5, m2
6559
+    packuswb            m4, m5
6560
+    vpermq              m4, m4, q3120
6561
+    movu                [r0 + r3], m4
6562
+
6563
+    lea                 r0, [r0 + r1 * 4]
6564
+
6565
+    palignr             m4, m3, m6, 14
6566
+    palignr             m5, m3, m6, 1
6567
+    pshufb              m4, m7
6568
+    pshufb              m5, m7
6569
+    pmaddubsw           m4, m0
6570
+    pmaddubsw           m5, m0
6571
+    pmulhrsw            m4, m2
6572
+    pmulhrsw            m5, m2
6573
+    packuswb            m4, m5
6574
+    vpermq              m4, m4, q3120
6575
+    movu                [r0], m4
6576
+
6577
+    palignr             m4, m3, m6, 15
6578
+    palignr             m5, m3, m6, 2
6579
+    pshufb              m4, m7
6580
+    pshufb              m5, m7
6581
+    pmaddubsw           m4, m0
6582
+    pmaddubsw           m5, m0
6583
+    pmulhrsw            m4, m2
6584
+    pmulhrsw            m5, m2
6585
+    packuswb            m4, m5
6586
+    vpermq              m4, m4, q3120
6587
+    movu                [r0 + r1], m4
6588
+
6589
+    palignr             m5, m3, m6, 3
6590
+    pshufb              m4, m3, m7
6591
+    pshufb              m5, m7
6592
+    pmaddubsw           m4, m0
6593
+    pmaddubsw           m5, m0
6594
+    pmulhrsw            m4, m2
6595
+    pmulhrsw            m5, m2
6596
+    packuswb            m4, m5
6597
+    vpermq              m4, m4, q3120
6598
+    movu                [r0 + r1 * 2], m4
6599
+
6600
+    vbroadcasti128      m1, [r2 + mmsize*2 + 16]
6601
+    palignr             m4, m1, m3, 1
6602
+    palignr             m5, m3, m6, 4
6603
+    pshufb              m4, m7
6604
+    pshufb              m5, m7
6605
+    pmaddubsw           m4, m0
6606
+    pmaddubsw           m5, m0
6607
+    pmulhrsw            m4, m2
6608
+    pmulhrsw            m5, m2
6609
+    packuswb            m4, m5
6610
+    vpermq              m4, m4, q3120
6611
+    movu                [r0 + r3], m4
6612
+
6613
+    lea                 r0, [r0 + r1 * 4]
6614
+
6615
+    palignr             m4, m1, m3, 2
6616
+    palignr             m5, m3, m6, 5
6617
+    pshufb              m4, m7
6618
+    pshufb              m5, m7
6619
+    pmaddubsw           m4, m0
6620
+    pmaddubsw           m5, m0
6621
+    pmulhrsw            m4, m2
6622
+    pmulhrsw            m5, m2
6623
+    packuswb            m4, m5
6624
+    vpermq              m4, m4, q3120
6625
+    movu                [r0], m4
6626
+
6627
+    palignr             m4, m1, m3, 3
6628
+    palignr             m5, m3, m6, 6
6629
+    pshufb              m4, m7
6630
+    pshufb              m5, m7
6631
+    pmaddubsw           m4, m0
6632
+    pmaddubsw           m5, m0
6633
+    pmulhrsw            m4, m2
6634
+    pmulhrsw            m5, m2
6635
+    packuswb            m4, m5
6636
+    vpermq              m4, m4, q3120
6637
+    movu                [r0 + r1], m4
6638
+
6639
+    palignr             m4, m1, m3, 4
6640
+    palignr             m5, m3, m6, 7
6641
+    pshufb              m4, m7
6642
+    pshufb              m5, m7
6643
+    pmaddubsw           m4, m0
6644
+    pmaddubsw           m5, m0
6645
+    pmulhrsw            m4, m2
6646
+    pmulhrsw            m5, m2
6647
+    packuswb            m4, m5
6648
+    vpermq              m4, m4, q3120
6649
+    movu                [r0 + r1 * 2], m4
6650
+
6651
+    palignr             m4, m1, m3, 5
6652
+    palignr             m5, m3, m6, 8
6653
+    pshufb              m4, m7
6654
+    pshufb              m5, m7
6655
+    pmaddubsw           m4, m0
6656
+    pmaddubsw           m5, m0
6657
+    pmulhrsw            m4, m2
6658
+    pmulhrsw            m5, m2
6659
+    packuswb            m4, m5
6660
+    vpermq              m4, m4, q3120
6661
+    movu                [r0 + r3], m4
6662
+
6663
+    lea                 r0, [r0 + r1 * 4]
6664
+
6665
+    palignr             m4, m1, m3, 6
6666
+    palignr             m5, m3, m6, 9
6667
+    pshufb              m4, m7
6668
+    pshufb              m5, m7
6669
+    pmaddubsw           m4, m0
6670
+    pmaddubsw           m5, m0
6671
+    pmulhrsw            m4, m2
6672
+    pmulhrsw            m5, m2
6673
+    packuswb            m4, m5
6674
+    vpermq              m4, m4, q3120
6675
+    movu                [r0], m4
6676
+
6677
+    palignr             m4, m1, m3, 7
6678
+    palignr             m5, m3, m6, 10
6679
+    pshufb              m4, m7
6680
+    pshufb              m5, m7
6681
+    pmaddubsw           m4, m0
6682
+    pmaddubsw           m5, m0
6683
+    pmulhrsw            m4, m2
6684
+    pmulhrsw            m5, m2
6685
+    packuswb            m4, m5
6686
+    vpermq              m4, m4, q3120
6687
+    movu                [r0 + r1], m4
6688
+
6689
+    palignr             m4, m1, m3, 8
6690
+    palignr             m5, m3, m6, 11
6691
+    pshufb              m4, m7
6692
+    pshufb              m5, m7
6693
+    pmaddubsw           m4, m0
6694
+    pmaddubsw           m5, m0
6695
+    pmulhrsw            m4, m2
6696
+    pmulhrsw            m5, m2
6697
+    packuswb            m4, m5
6698
+    vpermq              m4, m4, q3120
6699
+    movu                [r0 + r1 * 2], m4
6700
+
6701
+    palignr             m4, m1, m3, 9
6702
+    palignr             m5, m3, m6, 12
6703
+    pshufb              m4, m7
6704
+    pshufb              m5, m7
6705
+    pmaddubsw           m4, m0
6706
+    pmaddubsw           m5, m0
6707
+    pmulhrsw            m4, m2
6708
+    pmulhrsw            m5, m2
6709
+    packuswb            m4, m5
6710
+    vpermq              m4, m4, q3120
6711
+    movu                [r0 + r3], m4
6712
+
6713
+    lea                 r0, [r0 + r1 * 4]
6714
+
6715
+    palignr             m4, m1, m3, 10
6716
+    palignr             m5, m3, m6, 13
6717
+    pshufb              m4, m7
6718
+    pshufb              m5, m7
6719
+    pmaddubsw           m4, m0
6720
+    pmaddubsw           m5, m0
6721
+    pmulhrsw            m4, m2
6722
+    pmulhrsw            m5, m2
6723
+    packuswb            m4, m5
6724
+    vpermq              m4, m4, q3120
6725
+    movu                [r0], m4
6726
+
6727
+    palignr             m4, m1, m3, 11
6728
+    palignr             m5, m3, m6, 14
6729
+    pshufb              m4, m7
6730
+    pshufb              m5, m7
6731
+    pmaddubsw           m4, m0
6732
+    pmaddubsw           m5, m0
6733
+    pmulhrsw            m4, m2
6734
+    pmulhrsw            m5, m2
6735
+    packuswb            m4, m5
6736
+    vpermq              m4, m4, q3120
6737
+    movu                [r0 + r1], m4
6738
+
6739
+    palignr             m4, m1, m3, 12
6740
+    palignr             m5, m3, m6, 15
6741
+    pshufb              m4, m7
6742
+    pshufb              m5, m7
6743
+    pmaddubsw           m4, m0
6744
+    pmaddubsw           m5, m0
6745
+    pmulhrsw            m4, m2
6746
+    pmulhrsw            m5, m2
6747
+    packuswb            m4, m5
6748
+    vpermq              m4, m4, q3120
6749
+    movu                [r0 + r1 * 2], m4
6750
+
6751
+    palignr             m4, m1, m3, 13
6752
+    pshufb              m4, m7
6753
+    pshufb              m5, m3, m7
6754
+    pmaddubsw           m4, m0
6755
+    pmaddubsw           m5, m0
6756
+    pmulhrsw            m4, m2
6757
+    pmulhrsw            m5, m2
6758
+    packuswb            m4, m5
6759
+    vpermq              m4, m4, q3120
6760
+    movu                [r0 + r3], m4
6761
+
6762
+    lea                 r0, [r0 + r1 * 4]
6763
+
6764
+    palignr             m4, m1, m3, 14
6765
+    palignr             m5, m1, m3, 1
6766
+    pshufb              m4, m7
6767
+    pshufb              m5, m7
6768
+    pmaddubsw           m4, m0
6769
+    pmaddubsw           m5, m0
6770
+    pmulhrsw            m4, m2
6771
+    pmulhrsw            m5, m2
6772
+    packuswb            m4, m5
6773
+    vpermq              m4, m4, q3120
6774
+    movu                [r0], m4
6775
+
6776
+    palignr             m4, m1, m3, 15
6777
+    palignr             m5, m1, m3, 2
6778
+    pshufb              m4, m7
6779
+    pshufb              m5, m7
6780
+    pmaddubsw           m4, m0
6781
+    pmaddubsw           m5, m0
6782
+    pmulhrsw            m4, m2
6783
+    pmulhrsw            m5, m2
6784
+    packuswb            m4, m5
6785
+    vpermq              m4, m4, q3120
6786
+    movu                [r0 + r1], m4
6787
+
6788
+    vbroadcasti128      m6, [r2 + mmsize*2 + mmsize]
6789
+    palignr             m5, m1, m3, 3
6790
+    pshufb              m4, m1, m7
6791
+    pshufb              m5, m7
6792
+    pmaddubsw           m4, m0
6793
+    pmaddubsw           m5, m0
6794
+    pmulhrsw            m4, m2
6795
+    pmulhrsw            m5, m2
6796
+    packuswb            m4, m5
6797
+    vpermq              m4, m4, q3120
6798
+    movu                [r0 + r1 * 2], m4
6799
+
6800
+    palignr             m4, m6, m1, 1
6801
+    palignr             m5, m1, m3, 4
6802
+    pshufb              m4, m7
6803
+    pshufb              m5, m7
6804
+    pmaddubsw           m4, m0
6805
+    pmaddubsw           m5, m0
6806
+    pmulhrsw            m4, m2
6807
+    pmulhrsw            m5, m2
6808
+    packuswb            m4, m5
6809
+    vpermq              m4, m4, q3120
6810
+    movu                [r0 + r3], m4
6811
+    RET
6812
+
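The "prepare for [31, 30, 28, ...]" shuffle block at the top of intra_pred_ang32_17 builds the projected reference array that negative-angle modes require: side-reference samples are mapped onto the main reference through the inverse angle. Below is a scalar sketch of that projection, assuming the standard HEVC scheme (for mode 17, intraPredAngle is -26 and invAngle is -315; side_ref[0] is the corner sample; arithmetic right shift).

    #include <stdint.h>

    /* Illustrative sketch: ref is sized 3 * width + 1 with the block
     * origin at ref[width]; side_ref holds the corner plus width samples. */
    static void project_ref(uint8_t *ref, const uint8_t *main_ref,
                            const uint8_t *side_ref, int width,
                            int angle, int inv_angle)
    {
        for (int x = 0; x <= 2 * width; x++)      /* main direction  */
            ref[width + x] = main_ref[x];
        int last = (width * angle) >> 5;          /* most negative x */
        for (int x = -1; x >= last; x--)          /* projected side  */
            ref[width + x] = side_ref[(x * inv_angle + 128) >> 8];
    }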
6813
+cglobal intra_pred_ang32_19, 3,5,10
6814
+    lea                 r3, [ang_table_avx2 + 32 * 16]
6815
+    lea                 r4, [r1 * 3]
6816
+    mova                m5, [pw_1024]
6817
+
6818
+    ; rows 0 to 7
6819
+    movu                m0, [r2 + 0]
6820
+    movu                m1, [r2 + 1]
6821
+    punpckhbw           m2, m0, m1
6822
+    punpcklbw           m0, m1
6823
+
6824
+    movu                m4, [r2 + mmsize*2]
6825
+    pshufb              m4, [ang32_shuf_mode17 + mmsize*1]
6826
+    mova                m3, [ang32_shuf_mode19 + mmsize*1]
6827
+    mova                m6, [ang32_shuf_mode19 + mmsize*2]
6828
+    mova                m9, m4
6829
+    vpermd              m4, m3, m4
6830
+    vpermd              m9, m6, m9
6831
+    pshufb              m4, [ang32_shuf_mode19]
6832
+    pshufb              m9, [ang32_shuf_mode19]
6833
+
6834
+    vextracti128       xm6, m4, 1
6835
+    palignr             m3, m0, m4, 1
6836
+    palignr             m8, m3, m6, 1
6837
+    palignr             m7, m8, m9, 1
6838
+    vinserti128         m3, m3, xm2, 1
6839
+    vinserti128         m8, m8, xm0, 1
6840
+    vinserti128         m9, m7, xm3, 1
6841
+
6842
+    pmaddubsw           m4, m0, [r3 - 10 * 32]          ; [6]
6843
+    pmulhrsw            m4, m5
6844
+    pmaddubsw           m1, m2, [r3 - 10 * 32]
6845
+    pmulhrsw            m1, m5
6846
+    packuswb            m4, m1
6847
+    movu                [r0], m4
6848
+
6849
+    palignr             m6, m0, m3, 14
6850
+    palignr             m7, m2, m0, 14
6851
+    pmaddubsw           m4, m6, [r3 - 4 * 32]           ; [12]
6852
+    pmulhrsw            m4, m5
6853
+    pmaddubsw           m1, m7, [r3 - 4 * 32]
6854
+    pmulhrsw            m1, m5
6855
+    packuswb            m4, m1
6856
+    movu                [r0 + r1], m4
6857
+
6858
+    palignr             m6, m0, m3, 12
6859
+    palignr             m7, m2, m0, 12
6860
+    pmaddubsw           m4, m6, [r3 + 2 * 32]           ; [18]
6861
+    pmulhrsw            m4, m5
6862
+    pmaddubsw           m1, m7, [r3 + 2 * 32]
6863
+    pmulhrsw            m1, m5
6864
+    packuswb            m4, m1
6865
+    movu                [r0 + r1*2], m4
6866
+
6867
+    palignr             m6, m0, m3, 10
6868
+    palignr             m7, m2, m0, 10
6869
+    pmaddubsw           m4, m6, [r3 + 8 * 32]           ; [24]
6870
+    pmulhrsw            m4, m5
6871
+    pmaddubsw           m1, m7, [r3 + 8 * 32]
6872
+    pmulhrsw            m1, m5
6873
+    packuswb            m4, m1
6874
+    movu                [r0 + r4], m4
6875
+
6876
+    lea                 r0, [r0 + r1 * 4]
6877
+
6878
+    palignr             m6, m0, m3, 8
6879
+    palignr             m7, m2, m0, 8
6880
+    pmaddubsw           m4, m6, [r3 + 14 * 32]          ; [30]
6881
+    pmulhrsw            m4, m5
6882
+    pmaddubsw           m1, m7, [r3 + 14 * 32]
6883
+    pmulhrsw            m1, m5
6884
+    packuswb            m4, m1
6885
+    movu                [r0], m4
6886
+
6887
+    pmaddubsw           m4, m6, [r3 - 12 * 32]          ; [4]
6888
+    pmulhrsw            m4, m5
6889
+    pmaddubsw           m1, m7, [r3 - 12 * 32]
6890
+    pmulhrsw            m1, m5
6891
+    packuswb            m4, m1
6892
+    movu                [r0 + r1], m4
6893
+
6894
+    palignr             m6, m0, m3, 6
6895
+    palignr             m7, m2, m0, 6
6896
+    pmaddubsw           m4, m6, [r3 - 6 * 32]           ; [10]
6897
+    pmulhrsw            m4, m5
6898
+    pmaddubsw           m1, m7, [r3 - 6 * 32]
6899
+    pmulhrsw            m1, m5
6900
+    packuswb            m4, m1
6901
+    movu                [r0 + r1*2], m4
6902
+
6903
+    palignr             m6, m0, m3, 4
6904
+    palignr             m7, m2, m0, 4
6905
+    pmaddubsw           m4, m6, [r3]                    ; [16]
6906
+    pmulhrsw            m4, m5
6907
+    pmaddubsw           m1, m7, [r3]
6908
+    pmulhrsw            m1, m5
6909
+    packuswb            m4, m1
6910
+    movu                [r0 + r4], m4
6911
+
6912
+    lea                 r0, [r0 + r1 * 4]
6913
+
6914
+    ; rows 8 to 15
6915
+    palignr             m6, m0, m3, 2
6916
+    palignr             m7, m2, m0, 2
6917
+    pmaddubsw           m4, m6, [r3 + 6 * 32]           ; [22]
6918
+    pmulhrsw            m4, m5
6919
+    pmaddubsw           m1, m7, [r3 + 6 * 32]
6920
+    pmulhrsw            m1, m5
6921
+    packuswb            m4, m1
6922
+    movu                [r0], m4
6923
+
6924
+    pmaddubsw           m4, m3, [r3 + 12 * 32]          ; [28]
6925
+    pmulhrsw            m4, m5
6926
+    pmaddubsw           m1, m0, [r3 + 12 * 32]
6927
+    pmulhrsw            m1, m5
6928
+    packuswb            m4, m1
6929
+    movu                [r0 + r1], m4
6930
+
6931
+    pmaddubsw           m4, m3, [r3 - 14 * 32]          ; [2]
6932
+    pmulhrsw            m4, m5
6933
+    pmaddubsw           m1, m0, [r3 - 14 * 32]
6934
+    pmulhrsw            m1, m5
6935
+    packuswb            m4, m1
6936
+    movu                [r0 + r1*2], m4
6937
+
6938
+    palignr             m6, m3, m8, 14
6939
+    palignr             m7, m0, m3, 14
6940
+    pmaddubsw           m4, m6, [r3 - 8 * 32]           ; [8]
6941
+    pmulhrsw            m4, m5
6942
+    pmaddubsw           m1, m7, [r3 - 8 * 32]
6943
+    pmulhrsw            m1, m5
6944
+    packuswb            m4, m1
6945
+    movu                [r0 + r4], m4
6946
+
6947
+    lea                 r0, [r0 + r1 * 4]
6948
+
6949
+    palignr             m6, m3, m8, 12
6950
+    palignr             m7, m0, m3, 12
6951
+    pmaddubsw           m4, m6, [r3 - 2 * 32]           ; [14]
6952
+    pmulhrsw            m4, m5
6953
+    pmaddubsw           m1, m7, [r3 - 2 * 32]
6954
+    pmulhrsw            m1, m5
6955
+    packuswb            m4, m1
6956
+    movu                [r0], m4
6957
+
6958
+    palignr             m6, m3, m8, 10
6959
+    palignr             m7, m0, m3, 10
6960
+    pmaddubsw           m4, m6, [r3 + 4 * 32]           ; [20]
6961
+    pmulhrsw            m4, m5
6962
+    pmaddubsw           m1, m7, [r3 + 4 * 32]
6963
+    pmulhrsw            m1, m5
6964
+    packuswb            m4, m1
6965
+    movu                [r0 + r1], m4
6966
+
6967
+    palignr             m6, m3, m8, 8
6968
+    palignr             m7, m0, m3, 8
6969
+    pmaddubsw           m4, m6, [r3 + 10 * 32]          ; [26]
6970
+    pmulhrsw            m4, m5
6971
+    pmaddubsw           m1, m7, [r3 + 10 * 32]
6972
+    pmulhrsw            m1, m5
6973
+    packuswb            m4, m1
6974
+    movu                [r0 + r1 * 2], m4
6975
+
6976
+    pand                m6, [pw_00ff]
6977
+    pand                m7, [pw_00ff]
6978
+    packuswb            m6, m7
6979
+    movu                [r0 + r4], m6
6980
+
6981
+    lea                 r0, [r0 + r1 * 4]
6982
+
6983
+    ; rows 16 to 23
6984
+    palignr             m6, m3, m8, 6
6985
+    palignr             m7, m0, m3, 6
6986
+    pmaddubsw           m4, m6, [r3 - 10 * 32]          ; [6]
6987
+    pmulhrsw            m4, m5
6988
+    pmaddubsw           m1, m7, [r3 - 10 * 32]
6989
+    pmulhrsw            m1, m5
6990
+    packuswb            m4, m1
6991
+    movu                [r0], m4
6992
+
6993
+    palignr             m6, m3, m8, 4
6994
+    palignr             m7, m0, m3, 4
6995
+    pmaddubsw           m4, m6, [r3 - 4 * 32]           ; [12]
6996
+    pmulhrsw            m4, m5
6997
+    pmaddubsw           m1, m7, [r3 - 4 * 32]
6998
+    pmulhrsw            m1, m5
6999
+    packuswb            m4, m1
7000
+    movu                [r0 + r1], m4
7001
+
7002
+    palignr             m6, m3, m8, 2
7003
+    palignr             m7, m0, m3, 2
7004
+    pmaddubsw           m4, m6, [r3 + 2 * 32]           ; [18]
7005
+    pmulhrsw            m4, m5
7006
+    pmaddubsw           m1, m7, [r3 + 2 * 32]
7007
+    pmulhrsw            m1, m5
7008
+    packuswb            m4, m1
7009
+    movu                [r0 + r1*2], m4
7010
+
7011
+    pmaddubsw           m4, m8, [r3 + 8 * 32]           ; [24]
7012
+    pmulhrsw            m4, m5
7013
+    pmaddubsw           m1, m3, [r3 + 8 * 32]
7014
+    pmulhrsw            m1, m5
7015
+    packuswb            m4, m1
7016
+    movu                [r0 + r4], m4
7017
+
7018
+    lea                 r0, [r0 + r1 * 4]
7019
+
7020
+    palignr             m6, m8, m9, 14
7021
+    palignr             m7, m3, m8, 14
7022
+    pmaddubsw           m4, m6, [r3 + 14 * 32]          ; [30]
7023
+    pmulhrsw            m4, m5
7024
+    pmaddubsw           m1, m7, [r3 + 14 * 32]
7025
+    pmulhrsw            m1, m5
7026
+    packuswb            m4, m1
7027
+    movu                [r0], m4
7028
+
7029
+    pmaddubsw           m4, m6, [r3 - 12 * 32]          ; [4]
7030
+    pmulhrsw            m4, m5
7031
+    pmaddubsw           m1, m7, [r3 - 12 * 32]
7032
+    pmulhrsw            m1, m5
7033
+    packuswb            m4, m1
7034
+    movu                [r0 + r1], m4
7035
+
7036
+    palignr             m6, m8, m9, 12
7037
+    palignr             m7, m3, m8, 12
7038
+    pmaddubsw           m4, m6, [r3 - 6 * 32]           ; [10]
7039
+    pmulhrsw            m4, m5
7040
+    pmaddubsw           m1, m7, [r3 - 6 * 32]
7041
+    pmulhrsw            m1, m5
7042
+    packuswb            m4, m1
7043
+    movu                [r0 + r1*2], m4
7044
+
7045
+    palignr             m6, m8, m9, 10
7046
+    palignr             m7, m3, m8, 10
7047
+    pmaddubsw           m4, m6, [r3]                    ; [16]
7048
+    pmulhrsw            m4, m5
7049
+    pmaddubsw           m1, m7, [r3]
7050
+    pmulhrsw            m1, m5
7051
+    packuswb            m4, m1
7052
+    movu                [r0 + r4], m4
7053
+
7054
+    lea                 r0, [r0 + r1 * 4]
7055
+
7056
+    ; rows 24 to 31
7057
+    palignr             m6, m8, m9, 8
7058
+    palignr             m7, m3, m8, 8
7059
+    pmaddubsw           m4, m6, [r3 + 6 * 32]           ; [22]
7060
+    pmulhrsw            m4, m5
7061
+    pmaddubsw           m1, m7, [r3 + 6 * 32]
7062
+    pmulhrsw            m1, m5
7063
+    packuswb            m4, m1
7064
+    movu                [r0], m4
7065
+
7066
+    palignr             m6, m8, m9, 6
7067
+    palignr             m7, m3, m8, 6
7068
+    pmaddubsw           m4, m6, [r3 + 12 * 32]          ; [28]
7069
+    pmulhrsw            m4, m5
7070
+    pmaddubsw           m1, m7, [r3 + 12 * 32]
7071
+    pmulhrsw            m1, m5
7072
+    packuswb            m4, m1
7073
+    movu                [r0 + r1], m4
7074
+
7075
+    pmaddubsw           m4, m6, [r3 - 14 * 32]          ; [2]
7076
+    pmulhrsw            m4, m5
7077
+    pmaddubsw           m1, m7, [r3 - 14 * 32]
7078
+    pmulhrsw            m1, m5
7079
+    packuswb            m4, m1
7080
+    movu                [r0 + r1*2], m4
7081
+
7082
+    palignr             m6, m8, m9, 4
7083
+    palignr             m7, m3, m8, 4
7084
+    pmaddubsw           m4, m6, [r3 - 8 * 32]           ; [8]
7085
+    pmulhrsw            m4, m5
7086
+    pmaddubsw           m1, m7, [r3 - 8 * 32]
7087
+    pmulhrsw            m1, m5
7088
+    packuswb            m4, m1
7089
+    movu                [r0 + r4], m4
7090
+
7091
+    lea                 r0, [r0 + r1 * 4]
7092
+
7093
+    vpbroadcastb        m0, [r2 + mmsize*2 + 31]
7094
+    palignr             m1, m9, m0, 1
7095
+    vinserti128         m0, m1, xm8, 1
7096
+
7097
+    palignr             m6, m8, m9, 2
7098
+    palignr             m7, m3, m8, 2
7099
+    pmaddubsw           m4, m6, [r3 - 2 * 32]           ; [14]
7100
+    pmulhrsw            m4, m5
7101
+    pmaddubsw           m1, m7, [r3 - 2 * 32]
7102
+    pmulhrsw            m1, m5
7103
+    packuswb            m4, m1
7104
+    movu                [r0], m4
7105
+
7106
+    pmaddubsw           m4, m9, [r3 + 4 * 32]           ; [20]
7107
+    pmulhrsw            m4, m5
7108
+    pmaddubsw           m1, m8, [r3 + 4 * 32]
7109
+    pmulhrsw            m1, m5
7110
+    packuswb            m4, m1
7111
+    movu                [r0 + r1], m4
7112
+
7113
+    palignr             m6, m9, m0, 14
7114
+    palignr             m7, m8, m9, 14
7115
+    pmaddubsw           m4, m6, [r3 + 10 * 32]          ; [26]
7116
+    pmulhrsw            m4, m5
7117
+    pmaddubsw           m1, m7, [r3 + 10 * 32]
7118
+    pmulhrsw            m1, m5
7119
+    packuswb            m4, m1
7120
+    movu                [r0 + r1 * 2], m4
7121
+
7122
+    pand                m6, [pw_00ff]
7123
+    pand                m7, [pw_00ff]
7124
+    packuswb            m6, m7
7125
+    movu                [r0 + r4], m6
7126
+    RET
7127
+
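The pand/packuswb tail that closes intra_pred_ang32_19 (and ang32_20 above) handles the one row whose fraction is 0: (32 * a + 0 * b + 16) >> 5 reduces exactly to a, so rather than multiplying, the code masks the low byte of each interleaved (a, b) word with pw_00ff and repacks. A scalar equivalent (illustrative):

    #include <stdint.h>

    /* pairs[] holds interleaved (a, b) bytes per word, as punpcklbw
     * produces them; with frac == 0 the prediction is just a, the low byte. */
    static void copy_row_frac0(uint8_t *dst, const uint16_t *pairs, int n)
    {
        for (int i = 0; i < n; i++)
            dst[i] = (uint8_t)(pairs[i] & 0x00ff);  /* == (32*a + 16) >> 5 */
    }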
7128
 %endif  ; ARCH_X86_64
7129
 ;-----------------------------------------------------------------------------------------
7130
 ; end of intra_pred_ang32 angular modes avx2 asm
7131
@@ -12679,70 +19286,113 @@
     RET
 
 INIT_YMM avx2
-cglobal intra_pred_ang8_16, 3, 6, 6
-    mova              m3, [pw_1024]
-    movu              xm5, [r2 + 16]
-    pinsrb            xm5, [r2], 0
-    lea               r5, [intra_pred_shuff_0_8]
-    mova              xm0, xm5
-    pslldq            xm5, 1
-    pinsrb            xm5, [r2 + 2], 0
-    vinserti128       m0, m0, xm5, 1
-    pshufb            m0, [r5]
-
-    lea               r4, [c_ang8_mode_20]
-    pmaddubsw         m1, m0, [r4]
-    pmulhrsw          m1, m3
-    mova              xm0, xm5
-    pslldq            xm5, 1
-    pinsrb            xm5, [r2 + 3], 0
-    vinserti128       m0, m0, xm5, 1
-    pshufb            m0, [r5]
-    pmaddubsw         m2, m0, [r4 + mmsize]
-    pmulhrsw          m2, m3
-    pslldq            xm5, 1
-    pinsrb            xm5, [r2 + 5], 0
-    vinserti128       m0, m5, xm5, 1
-    pshufb            m0, [r5]
-    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
-    pmulhrsw          m4, m3
-    pslldq            xm5, 1
-    pinsrb            xm5, [r2 + 6], 0
-    mova              xm0, xm5
-    pslldq            xm5, 1
-    pinsrb            xm5, [r2 + 8], 0
-    vinserti128       m0, m0, xm5, 1
-    pshufb            m0, [r5]
-    pmaddubsw         m0, [r4 + 3 * mmsize]
-    pmulhrsw          m0, m3
-
-    packuswb          m1, m2
-    packuswb          m4, m0
-
-    vperm2i128        m2, m1, m4, 00100000b
-    vperm2i128        m1, m1, m4, 00110001b
-    punpcklbw         m4, m2, m1
-    punpckhbw         m2, m1
-    punpcklwd         m1, m4, m2
-    punpckhwd         m4, m2
-    mova              m0, [trans8_shuf]
-    vpermd            m1, m0, m1
-    vpermd            m4, m0, m4
+cglobal intra_pred_ang8_16, 3,4,7
+    lea                 r0, [r0 + r1 * 8]
+    sub                 r0, r1
+    neg                 r1
+    lea                 r3, [r1 * 3]
+    vbroadcasti128      m0, [angHor8_tab_16]            ; m0 = factor
+    mova                m1, [intra_pred8_shuff16]       ; m1 = row shuffle for 4 rows
+    movu                m2, [intra_pred8_shuff16 + 8]   ; m2 = row shuffle for 4 rows
+
+    ; prepare reference pixel
+    movq                xm3, [r2 + 16 + 1]              ; m3 = [-1 -2 -3 -4 -5 -6 -7 -8 x x x x x x x x]
+    movhps              xm3, [r2 + 2]                   ; m3 = [-1 -2 -3 -4 -5 -6 -7 -8 2 3 x 5 6 x 8 x]
+    pslldq              xm3, 1
+    pinsrb              xm3, [r2], 0                    ; m3 = [ 0 -1 -2 -3 -4 -5 -6 -7 -8 2 3 x 5 6 x 8]
+    pshufb              xm3, [c_ang8_mode_16]
+    vinserti128         m3, m3, xm3, 1                  ; m3 = [-8 -7 -6 -5 -4 -3 -2 -1  0 2 3 5 6 8]
+
+    ; process 4 rows
+    pshufb              m4, m3, m1
+    pshufb              m5, m3, m2
+    psrldq              m3, 4
+    punpcklbw           m6, m5, m4
+    punpckhbw           m5, m4
+    pmaddubsw           m6, m0
+    pmulhrsw            m6, [pw_1024]
+    pmaddubsw           m5, m0
+    pmulhrsw            m5, [pw_1024]
+    packuswb            m6, m5
+    vextracti128        xm5, m6, 1
+    movq                [r0], xm6
+    movhps              [r0 + r1], xm6
+    movq                [r0 + r1 * 2], xm5
+    movhps              [r0 + r3], xm5
+
+    ; process 4 rows
+    lea                 r0, [r0 + r1 * 4]
+    pshufb              m4, m3, m1
+    pshufb              m5, m3, m2
+    punpcklbw           m6, m5, m4
+    punpckhbw           m5, m4
+    pmaddubsw           m6, m0
+    pmulhrsw            m6, [pw_1024]
+    pmaddubsw           m5, m0
+    pmulhrsw            m5, [pw_1024]
+    packuswb            m6, m5
+    vextracti128        xm5, m6, 1
+    movq                [r0], xm6
+    movhps              [r0 + r1], xm6
+    movq                [r0 + r1 * 2], xm5
+    movhps              [r0 + r3], xm5
+    RET
 
-    lea               r3, [3 * r1]
-    movq              [r0], xm1
-    movhps            [r0 + r1], xm1
-    vextracti128      xm2, m1, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
-    lea               r0, [r0 + 4 * r1]
-    movq              [r0], xm4
-    movhps            [r0 + r1], xm4
-    vextracti128      xm2, m4, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
+%if 1
+INIT_YMM avx2
+cglobal intra_pred_ang8_20, 3,5,6
+    lea                 r0, [r0 + r1 * 8]
+    sub                 r0, r1
+    neg                 r1
+    lea                 r3, [angHor8_tab_20]
+    lea                 r4, [r1 * 3]
+    movu                m5, [intra_pred_shuff_0_8 + 16]
+
+    ; prepare reference pixel
+    movq                xm1, [r2 + 1]                   ; m1 = [ 1  2  3  4  5  6  7  8  x  x x  x  x  x  x  x]
+    movhps              xm1, [r2 + 16 + 2]              ; m1 = [ 1  2  3  4  5  6  7  8 -2 -3 x -5 -6  x -8  x]
+    palignr             xm1, xm1, [r2 - 15], 15         ; m1 = [ 0  1  2  3  4  5  6  7  8 -2 -3 x -5 -6  x -8]
+    pshufb              xm1, [c_ang8_mode_20]
+    vinserti128         m1, m1, xm1, 1
+
+    ; process 4 rows
+    pshufb              m3, m1, m5
+    psrldq              m1, 2
+    pmaddubsw           m3, [r3 + 0 * 16]
+    pmulhrsw            m3, [pw_1024]
+
+    pshufb              m4, m1, [intra_pred_shuff_0_8]
+    psrldq              m1, 1
+    pmaddubsw           m4, [r3 + 2 * 16]
+    pmulhrsw            m4, [pw_1024]
+
+    packuswb            m3, m4
+    vextracti128        xm4, m3, 1
+    movq                [r0], xm3
+    movq                [r0 + r1], xm4
+    movhps              [r0 + r1 * 2], xm3
+    movhps              [r0 + r4], xm4
+
+    ; process 4 rows
+    lea                 r0, [r0 + r1 * 4]
+    pshufb              m3, m1, m5
+    psrldq              m1, 1
+    pmaddubsw           m3, [r3 + 4 * 16]
+    pmulhrsw            m3, [pw_1024]
+
+    pshufb              m4, m1, m5
+    pmaddubsw           m4, [r3 + 6 * 16]
+    pmulhrsw            m4, [pw_1024]
+
+    packuswb            m3, m4
+    vextracti128        xm4, m3, 1
+    movq                [r0], xm3
+    movq                [r0 + r1], xm4
+    movhps              [r0 + r1 * 2], xm3
+    movhps              [r0 + r4], xm4
     RET
 
+%else
 INIT_YMM avx2
 cglobal intra_pred_ang8_20, 3, 6, 6
     mova              m3, [pw_1024]
@@ -12796,6 +19446,7 @@
     movhps            [r0 + 2 * r1], xm4
     movhps            [r0 + r3], xm2
     RET
+%endif
 
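
Both intra_pred_ang8_20 variants kept above (the new one under %if 1, the old transpose-based one under %else) rely on the same fixed-point trick: after pmaddubsw the per-pixel sum is at most 255 * 32, which fits in a signed 16-bit lane, and multiplying by pw_1024 with pmulhrsw yields exactly the rounded shift (x + 16) >> 5. A small self-check of that identity (an editorial sketch; pmulhrsw_lane models the SSSE3 per-lane semantics for the non-negative inputs used here):

    #include <assert.h>
    #include <stdint.h>

    /* pmulhrsw per 16-bit lane: round(x * c / 2^15). */
    static int16_t pmulhrsw_lane(int16_t x, int16_t c)
    {
        return (int16_t)(((int32_t)x * c + (1 << 14)) >> 15);
    }

    int main(void)
    {
        /* 255 * 32 is the maximum pmaddubsw result in these kernels */
        for (int32_t x = 0; x <= 255 * 32; x++)
            assert(pmulhrsw_lane((int16_t)x, 1024) == ((x + 16) >> 5));
        return 0;
    }
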
 INIT_YMM avx2
 cglobal intra_pred_ang8_21, 3, 6, 6
@@ -13275,173 +19926,787 @@
 
 
 INIT_YMM avx2
-cglobal intra_pred_ang16_12, 3, 6, 13
-    mova              m11, [pw_1024]
-    lea               r5, [intra_pred_shuff_0_8]
-
-    movu              xm9, [r2 + 32]
-    pinsrb            xm9, [r2], 0
-    pslldq            xm7, xm9, 1
-    pinsrb            xm7, [r2 + 6], 0
-    vinserti128       m9, m9, xm7, 1
-    pshufb            m9, [r5]
-
-    movu              xm12, [r2 + 6 + 32]
-
-    psrldq            xm10, xm12, 2
-    psrldq            xm8, xm12, 1
-    vinserti128       m10, m10, xm8, 1
-    pshufb            m10, [r5]
-
-    lea               r3, [3 * r1]
-    lea               r4, [c_ang16_mode_12]
-
-    INTRA_PRED_ANG16_CAL_ROW m0, m1, 0
-    INTRA_PRED_ANG16_CAL_ROW m1, m2, 1
-    INTRA_PRED_ANG16_CAL_ROW m2, m3, 2
-    INTRA_PRED_ANG16_CAL_ROW m3, m4, 3
-
-    add               r4, 4 * mmsize
+cglobal intra_pred_ang16_12, 3,4,9
+    vbroadcasti128    m0, [angHor_tab_12]
+    vbroadcasti128    m1, [angHor_tab_12 + mmsize/2]
+    mova              m2, [pw_1024]
+    mova              m7, [ang16_shuf_mode12]
+    mova              m8, [ang16_shuf_mode12 + mmsize]
+    lea               r3, [r1 * 3]
+
+    movu              xm4, [r2 + mmsize - 2]
+    pinsrb            xm4, [r2 +  0], 2
+    pinsrb            xm4, [r2 +  6], 1
+    pinsrb            xm4, [r2 + 13], 0
+    vbroadcasti128    m6, [r2 + mmsize + 14]
+    vinserti128       m3, m4, xm4, 1
+
+    pshufb            m4, m3, m7
+    pshufb            m5, m3, m8
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 2
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
 
-    pslldq            xm7, 1
-    pinsrb            xm7, [r2 + 13], 0
-    pshufb            xm7, [r5]
-    vinserti128       m9, m9, xm7, 1
-
-    mova              xm8, xm12
-    pshufb            xm8, [r5]
-    vinserti128       m10, m10, xm8, 1
-
-    INTRA_PRED_ANG16_CAL_ROW m4, m5, 0
-    INTRA_PRED_ANG16_CAL_ROW m5, m6, 1
-
-    movu              xm9, [r2 + 31]
-    pinsrb            xm9, [r2 + 6], 0
-    pinsrb            xm9, [r2 + 0], 1
-    pshufb            xm9, [r5]
-    vinserti128       m9, m9, xm7, 1
-
-    psrldq            xm10, xm12, 1
-    vinserti128       m10, m10, xm12, 1
-    pshufb            m10, [r5]
+    palignr           m5, m6, m3, 4
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 6
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
 
-    INTRA_PRED_ANG16_CAL_ROW m6, m7, 2
-    INTRA_PRED_ANG16_CAL_ROW m7, m8, 3
+    palignr           m5, m6, m3, 8
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 10
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
 
-    ; transpose and store
-    INTRA_PRED_TRANS_STORE_16x16
+    palignr           m5, m6, m3, 12
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 14
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
     RET
 
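
intra_pred_ang16_12 above, and the mode 13-17 kernels that follow, replace the old compute-then-transpose scheme (INTRA_PRED_TRANS_STORE_16x16) with a direct formulation: for a horizontal mode each column keeps one weight pair and one reference offset for the whole block, so stepping down a row only slides the reference window by one byte, which is what the palignr-by-2 steps provide for each pair of rows packed into one YMM register. A hypothetical scalar model of that layout, written for a positive-angle mode (the negative-angle modes 12-17 additionally project above-row samples into the low end of the left reference, which is what the pinsrb/pshufb prologues do):

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical scalar model of the transpose-free horizontal kernels:
     * `left` holds the left reference column, `angle` the HEVC mode angle. */
    static void intra_ang_hor(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *left, int angle, int size)
    {
        for (int x = 0; x < size; x++)
        {
            int pos  = (x + 1) * angle;
            int idx  = pos >> 5;    /* per-column reference offset */
            int fact = pos & 31;    /* per-column filter weights   */
            for (int y = 0; y < size; y++)
                dst[y * stride + x] =
                    (uint8_t)(((32 - fact) * left[y + idx + 1] +
                               fact * left[y + idx + 2] + 16) >> 5);
        }
    }
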
 INIT_YMM avx2
-cglobal intra_pred_ang16_13, 3, 6, 14
-    mova              m11, [pw_1024]
-    lea               r5, [intra_pred_shuff_0_8]
-
-    movu              xm13, [r2 + 32]
-    pinsrb            xm13, [r2], 0
-    pslldq            xm7, xm13, 2
-    pinsrb            xm7, [r2 + 7], 0
-    pinsrb            xm7, [r2 + 4], 1
-    vinserti128       m9, m13, xm7, 1
-    pshufb            m9, [r5]
-
-    movu              xm12, [r2 + 4 + 32]
-
-    psrldq            xm10, xm12, 4
-    psrldq            xm8, xm12, 2
-    vinserti128       m10, m10, xm8, 1
-    pshufb            m10, [r5]
-
-    lea               r3, [3 * r1]
-    lea               r4, [c_ang16_mode_13]
-
-    INTRA_PRED_ANG16_CAL_ROW m0, m1, 0
-    INTRA_PRED_ANG16_CAL_ROW m1, m2, 1
+cglobal intra_pred_ang16_13, 3,4,9
+    vbroadcasti128    m0, [angHor_tab_13]
+    vbroadcasti128    m1, [angHor_tab_13 + mmsize/2]
+    mova              m2, [pw_1024]
+    mova              m7, [ang16_shuf_mode13]
+    mova              m8, [ang16_shuf_mode13 + mmsize]
+    lea               r3, [r1 * 3]
+
+    vbroadcasti128    m3, [r2 + mmsize + 1]
+    vbroadcasti128    m4, [r2]
+    pshufb            m4, [ang16_shuf_mode13 + mmsize * 2]
+
+    palignr           m3, m4, 11
+    vbroadcasti128    m6, [r2 + mmsize + 12]
+
+    pshufb            m4, m3, m7
+    pshufb            m5, m3, m8
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 2
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
 
-    pslldq            xm7, 1
-    pinsrb            xm7, [r2 + 11], 0
-    pshufb            xm2, xm7, [r5]
-    vinserti128       m9, m9, xm2, 1
+    palignr           m5, m6, m3, 4
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 6
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
 
-    psrldq            xm8, xm12, 1
-    pshufb            xm8, [r5]
-    vinserti128       m10, m10, xm8, 1
+    palignr           m5, m6, m3, 8
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 10
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
 
-    INTRA_PRED_ANG16_CAL_ROW m2, m3, 2
+    palignr           m5, m6, m3, 12
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 14
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    RET
 
-    pslldq            xm13, 1
-    pinsrb            xm13, [r2 + 4], 0
-    pshufb            xm3, xm13, [r5]
-    vinserti128       m9, m9, xm3, 0
+INIT_YMM avx2
+cglobal intra_pred_ang16_14, 3,4,9
+    vbroadcasti128    m0, [angHor_tab_14]
+    vbroadcasti128    m1, [angHor_tab_14 + mmsize/2]
+    mova              m2, [pw_1024]
+    mova              m7, [ang16_shuf_mode14]
+    mova              m8, [ang16_shuf_mode14 + mmsize]
+    lea               r3, [r1 * 3]
+
+    vbroadcasti128    m3, [r2 + mmsize + 1]
+    vbroadcasti128    m4, [r2]
+    pshufb            m4, [ang16_shuf_mode14 + mmsize * 2]
+    palignr           m3, m4, 9
+    vbroadcasti128    m6, [r2 + mmsize + 10]
+
+    pshufb            m4, m3, m7
+    pshufb            m5, m3, m8
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 2
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
 
-    psrldq            xm8, xm12, 3
-    pshufb            xm8, [r5]
-    vinserti128       m10, m10, xm8, 0
+    palignr           m5, m6, m3, 4
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 6
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
 
-    INTRA_PRED_ANG16_CAL_ROW m3, m4, 3
+    palignr           m5, m6, m3, 8
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 10
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
 
-    add               r4, 4 * mmsize
+    palignr           m5, m6, m3, 12
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 14
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    RET
 
-    INTRA_PRED_ANG16_CAL_ROW m4, m5, 0
-    INTRA_PRED_ANG16_CAL_ROW m5, m6, 1
+INIT_YMM avx2
+cglobal intra_pred_ang16_15, 3,4,9
+    vbroadcasti128    m0, [angHor_tab_15]
+    vbroadcasti128    m1, [angHor_tab_15 + mmsize/2]
+    mova              m2, [pw_1024]
+    mova              m7, [ang16_shuf_mode15]
+    mova              m8, [ang16_shuf_mode15 + mmsize]
+    lea               r3, [r1 * 3]
+
+    vbroadcasti128    m3, [r2 + mmsize + 1]
+    vbroadcasti128    m4, [r2]
+    pshufb            m4, [ang16_shuf_mode15 + mmsize * 2]
+    palignr           m3, m3, m4, 7
+    vbroadcasti128    m6, [r2 + mmsize + 8]
+
+    pshufb            m4, m3, m7
+    pshufb            m5, m3, m8
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 2
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
 
-    pslldq            xm7, 1
-    pinsrb            xm7, [r2 + 14], 0
-    pshufb            xm7, [r5]
-    vinserti128       m9, m9, xm7, 1
+    palignr           m5, m6, m3, 4
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 6
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
 
-    mova              xm8, xm12
-    pshufb            xm8, [r5]
-    vinserti128       m10, m10, xm8, 1
+    palignr           m5, m6, m3, 8
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 10
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
 
-    INTRA_PRED_ANG16_CAL_ROW m6, m7, 2
+    palignr           m5, m6, m3, 12
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 14
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    RET
 
-    pslldq            xm13, 1
-    pinsrb            xm13, [r2 + 7], 0
-    pshufb            xm13, [r5]
-    vinserti128       m9, m9, xm13, 0
+INIT_YMM avx2
+cglobal intra_pred_ang16_16, 3,4,9
+    vbroadcasti128    m0, [angHor_tab_16]
+    vbroadcasti128    m1, [angHor_tab_16 + mmsize/2]
+    mova              m2, [pw_1024]
+    mova              m7, [ang16_shuf_mode16]
+    mova              m8, [ang16_shuf_mode16 + mmsize]
+    lea               r3, [r1 * 3]
+
+    vbroadcasti128    m3, [r2 + mmsize + 1]
+    vbroadcasti128    m4, [r2]
+    pshufb            m4, [ang16_shuf_mode16 + mmsize * 2]
+    palignr           m3, m4, 5
+    vbroadcasti128    m6, [r2 + mmsize + 6]
+
+    pshufb            m4, m3, m7
+    pshufb            m5, m3, m8
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 2
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
 
-    psrldq            xm12, 2
-    pshufb            xm12, [r5]
-    vinserti128       m10, m10, xm12, 0
+    palignr           m5, m6, m3, 4
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 6
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
 
-    INTRA_PRED_ANG16_CAL_ROW m7, m8, 3
+    palignr           m5, m6, m3, 8
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 10
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
 
-    ; transpose and store
-    INTRA_PRED_TRANS_STORE_16x16
+    palignr           m5, m6, m3, 12
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 14
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
     RET
 
 INIT_YMM avx2
-cglobal intra_pred_ang16_11, 3, 5, 12
-    mova              m11, [pw_1024]
-
-    movu              xm9, [r2 + 32]
-    pinsrb            xm9, [r2], 0
-    pshufb            xm9, [intra_pred_shuff_0_8]
-    vinserti128       m9, m9, xm9, 1
-
-    vbroadcasti128    m10, [r2 + 8 + 32]
-    pshufb            m10, [intra_pred_shuff_0_8]
-
-    lea               r3, [3 * r1]
-    lea               r4, [c_ang16_mode_11]
+cglobal intra_pred_ang16_17, 3,4,9
+    vbroadcasti128    m0, [angHor_tab_17]
+    vbroadcasti128    m1, [angHor_tab_17 + mmsize/2]
+    mova              m2, [pw_1024]
+    mova              m7, [ang16_shuf_mode17]
+    mova              m8, [ang16_shuf_mode17 + mmsize]
+    lea               r3, [r1 * 3]
+
+    vbroadcasti128    m3, [r2 + mmsize + 1]
+    vbroadcasti128    m4, [r2]
+    pshufb            m4, [ang16_shuf_mode17 + mmsize * 2]
+    palignr           m3, m4, 3
+    vbroadcasti128    m6, [r2 + mmsize + 4]
+
+    pshufb            m4, m3, m7
+    pshufb            m5, m3, m8
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 2
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
 
-    INTRA_PRED_ANG16_CAL_ROW m0, m1, 0
-    INTRA_PRED_ANG16_CAL_ROW m1, m2, 1
-    INTRA_PRED_ANG16_CAL_ROW m2, m3, 2
-    INTRA_PRED_ANG16_CAL_ROW m3, m4, 3
+    palignr           m5, m6, m3, 4
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 6
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
 
-    add               r4, 4 * mmsize
+    palignr           m5, m6, m3, 8
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 10
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
 
-    INTRA_PRED_ANG16_CAL_ROW m4, m5, 0
-    INTRA_PRED_ANG16_CAL_ROW m5, m6, 1
-    INTRA_PRED_ANG16_CAL_ROW m6, m7, 2
-    INTRA_PRED_ANG16_CAL_ROW m7, m8, 3
+    palignr           m5, m6, m3, 12
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 14
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    RET
 
-    ; transpose and store
-    INTRA_PRED_TRANS_STORE_16x16
+INIT_YMM avx2
+cglobal intra_pred_ang16_11, 3,4,8
+    vbroadcasti128      m0, [angHor_tab_11]
+    vbroadcasti128      m1, [angHor_tab_11 + mmsize/2]
+    mova                m2, [pw_1024]
+    mova                m7, [ang32_shuf_mode9]
+    lea                 r3, [r1 * 3]
+
+    ; prepare for [0 -1 -2...]
+
+    movu               xm3, [r2 + mmsize]
+    pinsrb             xm3, [r2], 0
+    vbroadcasti128      m6, [r2 + mmsize + 16]
+    vinserti128         m3, m3, xm3, 1
+
+    pshufb              m5, m3, m7              ; [ 0  1  0  1  0  1  0  1  0  1  0  1  0  1  0  1  1  2  1  2  1  2  1  2  1  2  1  2  1  2  1  2]
+    pmaddubsw           m4, m5, m0
+    pmaddubsw           m5, m1
+    pmulhrsw            m4, m2
+    pmulhrsw            m5, m2
+    packuswb            m4, m5
+    movu                [r0], xm4
+    vextracti128        [r0 + r1], m4, 1
+
+    palignr             m5, m6, m3, 2
+    pshufb              m5, m7
+    pmaddubsw           m4, m5, m0
+    pmaddubsw           m5, m1
+    pmulhrsw            m4, m2
+    pmulhrsw            m5, m2
+    packuswb            m4, m5
+    movu                [r0 + r1 * 2], xm4
+    vextracti128        [r0 + r3], m4, 1
+
+    lea                 r0, [r0 + r1 * 4]
+
+    palignr             m5, m6, m3, 4
+    pshufb              m5, m7
+    pmaddubsw           m4, m5, m0
+    pmaddubsw           m5, m1
+    pmulhrsw            m4, m2
+    pmulhrsw            m5, m2
+    packuswb            m4, m5
+    movu                [r0], xm4
+    vextracti128        [r0 + r1], m4, 1
+
+    palignr             m5, m6, m3, 6
+    pshufb              m5, m7
+    pmaddubsw           m4, m5, m0
+    pmaddubsw           m5, m1
+    pmulhrsw            m4, m2
+    pmulhrsw            m5, m2
+    packuswb            m4, m5
+    movu                [r0 + r1 * 2], xm4
+    vextracti128        [r0 + r3], m4, 1
+
+    lea                 r0, [r0 + r1 * 4]
+
+    palignr             m5, m6, m3, 8
+    pshufb              m5, m7
+    pmaddubsw           m4, m5, m0
+    pmaddubsw           m5, m1
+    pmulhrsw            m4, m2
+    pmulhrsw            m5, m2
+    packuswb            m4, m5
+    movu                [r0], xm4
+    vextracti128        [r0 + r1], m4, 1
+
+    palignr             m5, m6, m3, 10
+    pshufb              m5, m7
+    pmaddubsw           m4, m5, m0
+    pmaddubsw           m5, m1
+    pmulhrsw            m4, m2
+    pmulhrsw            m5, m2
+    packuswb            m4, m5
+    movu                [r0 + r1 * 2], xm4
+    vextracti128        [r0 + r3], m4, 1
+
+    lea                 r0, [r0 + r1 * 4]
+
+    palignr             m5, m6, m3, 12
+    pshufb              m5, m7
+    pmaddubsw           m4, m5, m0
+    pmaddubsw           m5, m1
+    pmulhrsw            m4, m2
+    pmulhrsw            m5, m2
+    packuswb            m4, m5
+    movu                [r0], xm4
+    vextracti128        [r0 + r1], m4, 1
+
+    palignr             m5, m6, m3, 14
+    pshufb              m5, m7
+    pmaddubsw           m4, m5, m0
+    pmaddubsw           m5, m1
+    pmulhrsw            m4, m2
+    pmulhrsw            m5, m2
+    packuswb            m4, m5
+    movu                [r0 + r1 * 2], xm4
+    vextracti128        [r0 + r3], m4, 1
     RET
 
+
 ; transpose 8x32 to 16x16, used for intra_ang16x16 avx2 asm
 %if ARCH_X86_64 == 1
 INIT_YMM avx2
@@ -13493,21 +20758,21 @@
     movu            [r0 + r1 * 2], xm%2
     movu            [r0 + r5 * 1], xm%11
 
-    lea             r0, [r0 + r6]
+    add             r0, r6
 
     movu            [r0 + r1 * 0], xm%7
     movu            [r0 + r1 * 1], xm%8
     movu            [r0 + r1 * 2], xm%4
     movu            [r0 + r5 * 1], xm%9
 
-    lea             r0, [r0 + r6]
+    add             r0, r6
 
     vextracti128    [r0 + r1 * 0], m%5, 1
     vextracti128    [r0 + r1 * 1], m%6, 1
     vextracti128    [r0 + r1 * 2], m%2, 1
     vextracti128    [r0 + r5 * 1], m%11, 1
 
-    lea             r0, [r0 + r6]
+    add             r0, r6
 
     vextracti128    [r0 + r1 * 0], m%7, 1
     vextracti128    [r0 + r1 * 1], m%8, 1
@@ -13530,21 +20795,21 @@
     movu            [r0 + r1 * 2], xm%3
     movu            [r0 + r5 * 1], xm%4
 
-    lea             r0, [r0 + r6]
+    add             r0, r6
 
     movu            [r0 + r1 * 0], xm%5
     movu            [r0 + r1 * 1], xm%6
     movu            [r0 + r1 * 2], xm%7
     movu            [r0 + r5 * 1], xm%8
 
-    lea             r0, [r0 + r6]
+    add             r0, r6
 
     vextracti128    [r0 + r1 * 0], m%1, 1
     vextracti128    [r0 + r1 * 1], m%2, 1
     vextracti128    [r0 + r1 * 2], m%3, 1
     vextracti128    [r0 + r5 * 1], m%4, 1
 
-    lea             r0, [r0 + r6]
+    add             r0, r6
 
     vextracti128    [r0 + r1 * 0], m%5, 1
     vextracti128    [r0 + r1 * 1], m%6, 1
@@ -14110,41 +21375,100 @@
 %endif  ; ARCH_X86_64
 
 INIT_YMM avx2
-cglobal intra_pred_ang16_9, 3, 6, 12
-    mova              m11, [pw_1024]
-    lea               r5, [intra_pred_shuff_0_8]
+cglobal intra_pred_ang16_9, 3,4,8
+    vbroadcasti128  m0, [angHor_tab_9]
+    vbroadcasti128  m1, [angHor_tab_9 + mmsize/2]
+    mova            m2, [pw_1024]
+    lea             r3, [r1 * 3]
+    mova            m7, [ang16_shuf_mode9]
 
-    vbroadcasti128    m9, [r2 + 1 + 32]
-    pshufb            m9, [r5]
-    vbroadcasti128    m10, [r2 + 9 + 32]
-    pshufb            m10, [r5]
+    vbroadcasti128  m6, [r2 + mmsize + 17]
+    vbroadcasti128  m3, [r2 + mmsize + 1]
 
-    lea               r3, [3 * r1]
-    lea               r4, [c_ang16_mode_9]
+    pshufb          m5, m3, m7
+    pmaddubsw       m4, m5, m0
+    pmaddubsw       m5, m1
+    pmulhrsw        m4, m2
+    pmulhrsw        m5, m2
+    packuswb        m4, m5
+    movu            [r0], xm4
+    vextracti128    [r0 + r1], m4, 1
+
+    palignr         m5, m6, m3, 2
+    pshufb          m5, m7
+    pmaddubsw       m4, m5, m0
+    pmaddubsw       m5, m1
+    pmulhrsw        m4, m2
+    pmulhrsw        m5, m2
+    packuswb        m4, m5
+    movu            [r0 + r1 * 2], xm4
+    vextracti128    [r0 + r3], m4, 1
+
+    lea             r0, [r0 + r1 * 4]
 
-    INTRA_PRED_ANG16_CAL_ROW m0, m1, 0
-    INTRA_PRED_ANG16_CAL_ROW m1, m2, 1
-    INTRA_PRED_ANG16_CAL_ROW m2, m3, 2
-    INTRA_PRED_ANG16_CAL_ROW m3, m4, 3
+    palignr         m5, m6, m3, 4
+    pshufb          m5, m7
+    pmaddubsw       m4, m5, m0
+    pmaddubsw       m5, m1
+    pmulhrsw        m4, m2
+    pmulhrsw        m5, m2
+    packuswb        m4, m5
+    movu            [r0], xm4
+    vextracti128    [r0 + r1], m4, 1
+
+    palignr         m5, m6, m3, 6
+    pshufb          m5, m7
+    pmaddubsw       m4, m5, m0
+    pmaddubsw       m5, m1
+    pmulhrsw        m4, m2
+    pmulhrsw        m5, m2
+    packuswb        m4, m5
+    movu            [r0 + r1 * 2], xm4
+    vextracti128    [r0 + r3], m4, 1
 
-    add               r4, 4 * mmsize
+    lea             r0, [r0 + r1 * 4]
 
-    INTRA_PRED_ANG16_CAL_ROW m4, m5, 0
-    INTRA_PRED_ANG16_CAL_ROW m5, m6, 1
-    INTRA_PRED_ANG16_CAL_ROW m6, m7, 2
-
-    movu              xm7, [r2 + 2 + 32]
-    pshufb            xm7, [r5]
-    vinserti128       m9, m9, xm7, 1
-
-    movu              xm7, [r2 + 10 + 32]
-    pshufb            xm7, [r5]
-    vinserti128       m10, m10, xm7, 1
+    palignr         m5, m6, m3, 8
+    pshufb          m5, m7
+    pmaddubsw       m4, m5, m0
+    pmaddubsw       m5, m1
+    pmulhrsw        m4, m2
+    pmulhrsw        m5, m2
+    packuswb        m4, m5
+    movu            [r0], xm4
+    vextracti128    [r0 + r1], m4, 1
+
+    palignr         m5, m6, m3, 10
+    pshufb          m5, m7
+    pmaddubsw       m4, m5, m0
+    pmaddubsw       m5, m1
+    pmulhrsw        m4, m2
+    pmulhrsw        m5, m2
+    packuswb        m4, m5
+    movu            [r0 + r1 * 2], xm4
+    vextracti128    [r0 + r3], m4, 1
 
-    INTRA_PRED_ANG16_CAL_ROW m7, m8, 3
+    lea             r0, [r0 + r1 * 4]
 
-    ; transpose and store
-    INTRA_PRED_TRANS_STORE_16x16
+    palignr         m5, m6, m3, 12
+    pshufb          m5, m7
+    pmaddubsw       m4, m5, m0
+    pmaddubsw       m5, m1
+    pmulhrsw        m4, m2
+    pmulhrsw        m5, m2
+    packuswb        m4, m5
+    movu            [r0], xm4
+    vextracti128    [r0 + r1], m4, 1
+
+    palignr         m5, m6, m3, 14
+    pshufb          m5, m7
+    pmaddubsw       m4, m5, m0
+    pmaddubsw       m5, m1
+    pmulhrsw        m4, m2
+    pmulhrsw        m5, m2
+    packuswb        m4, m5
+    movu            [r0 + r1 * 2], xm4
+    vextracti128    [r0 + r3], m4, 1
     RET
 %endif
 
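
intra_pred_ang16_9 above is the simplest of these horizontal kernels because mode 9 has angle 2: the per-column fractional weights count up in steps of 2 and the integer reference offset stays 0 until the last column, so one shuffle table and a palignr per row pair suffice. A hypothetical generator for the (32 - fact, fact) byte pairs that the angHor_tab_* constants appear to encode (the table name and layout are an assumption; only the arithmetic follows the HEVC angle definition):

    #include <stdio.h>

    int main(void)
    {
        const int angle = 2;                   /* HEVC mode 9 */
        for (int x = 0; x < 16; x++)
        {
            int fact = ((x + 1) * angle) & 31; /* fractional weight */
            printf("db %2d, %2d\n", 32 - fact, fact);
        }
        return 0;
    }
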
@@ -14587,3020 +21911,6 @@
     INTRA_PRED_ANG32_STORE
     RET
 
-%if ARCH_X86_64 == 1
-%macro INTRA_PRED_ANG32_CAL_ROW 0
-    pmaddubsw         m6, m2, m10
-    pmulhrsw          m6, m0
-    pmaddubsw         m7, m3, m10
-    pmulhrsw          m7, m0
-    pmaddubsw         m8, m4, m10
-    pmulhrsw          m8, m0
-    pmaddubsw         m9, m5, m10
-    pmulhrsw          m9, m0
-    packuswb          m6, m7
-    packuswb          m8, m9
-    vperm2i128        m7, m6, m8, 00100000b
-    vperm2i128        m6, m6, m8, 00110001b
-%endmacro
-
-
-INIT_YMM avx2
-cglobal intra_pred_ang32_27, 3, 5, 11
-    mova              m0, [pw_1024]
-    mova              m1, [intra_pred_shuff_0_8]
-    lea               r3, [3 * r1]
-    lea               r4, [c_ang32_mode_27]
-
-    vbroadcasti128    m2, [r2 + 1]
-    pshufb            m2, m1
-    vbroadcasti128    m3, [r2 + 9]
-    pshufb            m3, m1
-    vbroadcasti128    m4, [r2 + 17]
-    pshufb            m4, m1
-    vbroadcasti128    m5, [r2 + 25]
-    pshufb            m5, m1
-
-    ;row [0, 1]
-    mova              m10, [r4 + 0 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0], m7
-    movu              [r0 + r1], m6
-
-    ;row [2, 3]
-    mova              m10, [r4 + 1 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + 2 * r1], m7
-    movu              [r0 + r3], m6
-
-    ;row [4, 5]
-    mova              m10, [r4 + 2 * mmsize]
-    lea               r0, [r0 + 4 * r1]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0], m7
-    movu              [r0 + r1], m6
-
-    ;row [6, 7]
-    mova              m10, [r4 + 3 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + 2 * r1], m7
-    movu              [r0 + r3], m6
-
-    ;row [8, 9]
-    lea               r0, [r0 + 4 * r1]
-    add               r4, 4 * mmsize
-    mova              m10, [r4 + 0 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0], m7
-    movu              [r0 + r1], m6
-
-   ;row [10, 11]
-    mova              m10, [r4 + 1 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + 2 * r1], m7
-    movu              [r0 + r3], m6
-
-   ;row [12, 13]
-    lea               r0, [r0 + 4 * r1]
-    mova              m10, [r4 + 2 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0], m7
-    movu              [r0 + r1], m6
-
-   ;row [14]
-    mova              m10, [r4 + 3 * mmsize]
-    vperm2i128        m6, m2, m3, 00100000b
-    pmaddubsw         m6, m10
-    pmulhrsw          m6, m0
-    vperm2i128        m7, m4, m5, 00100000b
-    pmaddubsw         m7, m10
-    pmulhrsw          m7, m0
-    packuswb          m6, m7
-    vpermq            m6, m6, 11011000b
-    movu              [r0 + 2 * r1], m6
-
-    vbroadcasti128    m2, [r2 + 2]
-    pshufb            m2, m1
-    vbroadcasti128    m3, [r2 + 10]
-    pshufb            m3, m1
-    vbroadcasti128    m4, [r2 + 18]
-    pshufb            m4, m1
-    vbroadcasti128    m5, [r2 + 26]
-    pshufb            m5, m1
-
-    ;row [15, 16]
-    add               r4, 4 * mmsize
-    mova              m10, [r4 + 0 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + r3], m7
-    lea               r0, [r0 + 4 * r1]
-    movu              [r0], m6
-
-    ;row [17, 18]
-    mova              m10, [r4 + 1 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + r1], m7
-    movu              [r0 + 2 * r1], m6
-
-    ;row [19, 20]
-    mova              m10, [r4 + 2 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + r3], m7
-    lea               r0, [r0 + 4 * r1]
-    movu              [r0], m6
-
-    ;row [21, 22]
-    mova              m10, [r4 + 3 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + r1], m7
-    movu              [r0 + 2 * r1], m6
-
-    ;row [23, 24]
-    add               r4, 4 * mmsize
-    mova              m10, [r4 + 0 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + r3], m7
-    lea               r0, [r0 + 4 * r1]
-    movu              [r0], m6
-
-    ;row [25, 26]
-    mova              m10, [r4 + 1 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + r1], m7
-    movu              [r0 + 2 * r1], m6
-
-    ;row [27, 28]
-    mova              m10, [r4 + 2 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + r3], m7
-    lea               r0, [r0 + 4 * r1]
-    movu              [r0], m6
-
-    ;row [29, 30]
-    mova              m10, [r4 + 3 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + r1], m7
-    movu              [r0 + 2 * r1], m6
-
-    ;row [31]
-    vbroadcasti128    m2, [r2 + 3]
-    pshufb            m2, m1
-    vbroadcasti128    m3, [r2 + 11]
-    pshufb            m3, m1
-    vbroadcasti128    m4, [r2 + 19]
-    pshufb            m4, m1
-    vbroadcasti128    m5, [r2 + 27]
-    pshufb            m5, m1
-
-    mova              m10, [r4 + 4 * mmsize]
-    vperm2i128        m6, m2, m3, 00100000b
-    pmaddubsw         m6, m10
-    pmulhrsw          m6, m0
-    vperm2i128        m7, m4, m5, 00100000b
-    pmaddubsw         m7, m10
-    pmulhrsw          m7, m0
-    packuswb          m6, m7
-    vpermq            m6, m6, 11011000b
-    movu              [r0 + r3], m6
-    RET
-
-INIT_YMM avx2
-cglobal intra_pred_ang32_28, 3, 5, 11
-    mova              m0, [pw_1024]
-    mova              m1, [intra_pred_shuff_0_8]
-    lea               r3, [3 * r1]
-    lea               r4, [c_ang32_mode_28]
-
-    vbroadcasti128    m2, [r2 + 1]
-    pshufb            m2, m1
-    vbroadcasti128    m3, [r2 + 9]
-    pshufb            m3, m1
-    vbroadcasti128    m4, [r2 + 17]
-    pshufb            m4, m1
-    vbroadcasti128    m5, [r2 + 25]
-    pshufb            m5, m1
-
-    ;row [0, 1]
-    mova              m10, [r4 + 0 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0], m7
-    movu              [r0 + r1], m6
-
-    ;row [2, 3]
-    mova              m10, [r4 + 1 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + 2 * r1], m7
-    movu              [r0 + r3], m6
-
-    ;row [4, 5]
-    mova              m10, [r4 + 2 * mmsize]
-    lea               r0, [r0 + 4 * r1]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0], m7
-    movu              [r0 + r1], m6
-
-    vbroadcasti128    m2, [r2 + 2]
-    pshufb            m2, m1
-    vbroadcasti128    m3, [r2 + 10]
-    pshufb            m3, m1
-    vbroadcasti128    m4, [r2 + 18]
-    pshufb            m4, m1
-    vbroadcasti128    m5, [r2 + 26]
-    pshufb            m5, m1
-
-    ;row [6, 7]
-    mova              m10, [r4 + 3 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + 2 * r1], m7
-    movu              [r0 + r3], m6
-
-    ;row [8, 9]
-    lea               r0, [r0 + 4 * r1]
-    add               r4, 4 * mmsize
-    mova              m10, [r4 + 0 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0], m7
-    movu              [r0 + r1], m6
-
-   ;row [10, 11]
-    mova              m10, [r4 + 1 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + 2 * r1], m7
-    movu              [r0 + r3], m6
-
-    vbroadcasti128    m2, [r2 + 3]
-    pshufb            m2, m1
-    vbroadcasti128    m3, [r2 + 11]
-    pshufb            m3, m1
-    vbroadcasti128    m4, [r2 + 19]
-    pshufb            m4, m1
-    vbroadcasti128    m5, [r2 + 27]
-    pshufb            m5, m1
-
-    ;row [12, 13]
-    lea               r0, [r0 + 4 * r1]
-    mova              m10, [r4 + 2 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0], m7
-    movu              [r0 + r1], m6
-
-    ;row [14, 15]
-    mova              m10, [r4 + 3 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + 2 * r1], m7
-    movu              [r0 + r3], m6
-
-    ;row [16, 17]
-    lea               r0, [r0 + 4 * r1]
-    add               r4, 4 * mmsize
-    mova              m10, [r4 + 0 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0], m7
-    movu              [r0 + r1], m6
-
-    ;row [18]
-    mova              m10, [r4 + 1 * mmsize]
-    vperm2i128        m6, m2, m3, 00100000b
-    pmaddubsw         m6, m10
-    pmulhrsw          m6, m0
-    vperm2i128        m7, m4, m5, 00100000b
-    pmaddubsw         m7, m10
-    pmulhrsw          m7, m0
-    packuswb          m6, m7
-    vpermq            m6, m6, 11011000b
-    movu              [r0 + 2 * r1], m6
-
-    ;row [19, 20]
-    vbroadcasti128    m2, [r2 + 4]
-    pshufb            m2, m1
-    vbroadcasti128    m3, [r2 + 12]
-    pshufb            m3, m1
-    vbroadcasti128    m4, [r2 + 20]
-    pshufb            m4, m1
-    vbroadcasti128    m5, [r2 + 28]
-    pshufb            m5, m1
-
-    mova              m10, [r4 + 2 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + r3], m7
-    lea               r0, [r0 + 4 * r1]
-    movu              [r0], m6
-
-    ;row[21, 22]
-    mova              m10, [r4 + 3 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + r1], m7
-    movu              [r0 + 2 * r1], m6
-
-    ;row[23, 24]
-    add               r4, 4 * mmsize
-    mova              m10, [r4 + 0 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + r3], m7
-    lea               r0, [r0 + 4 * r1]
-    movu              [r0], m6
-
-    ;row [25, 26]
-    vbroadcasti128    m2, [r2 + 5]
-    pshufb            m2, m1
-    vbroadcasti128    m3, [r2 + 13]
-    pshufb            m3, m1
-    vbroadcasti128    m4, [r2 + 21]
-    pshufb            m4, m1
-    vbroadcasti128    m5, [r2 + 29]
-    pshufb            m5, m1
-
-    mova              m10, [r4 + 1 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + r1], m7
-    movu              [r0 + 2 * r1], m6
-
-    ;row [27, 28]
-    mova              m10, [r4 + 2 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + r3], m7
-    lea               r0, [r0 + 4 * r1]
-    movu              [r0], m6
-
-    ;row [29, 30]
-    mova              m10, [r4 + 3 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + r1], m7
-    movu              [r0 + 2 * r1], m6
-
-    ;row [31]
-    vbroadcasti128    m2, [r2 + 6]
-    pshufb            m2, m1
-    vbroadcasti128    m3, [r2 + 14]
-    pshufb            m3, m1
-    vbroadcasti128    m4, [r2 + 22]
-    pshufb            m4, m1
-    vbroadcasti128    m5, [r2 + 30]
-    pshufb            m5, m1
-
-    mova              m10, [r4 + 4 * mmsize]
-    vperm2i128        m6, m2, m3, 00100000b
-    pmaddubsw         m6, m10
-    pmulhrsw          m6, m0
-    vperm2i128        m7, m4, m5, 00100000b
-    pmaddubsw         m7, m10
-    pmulhrsw          m7, m0
-    packuswb          m6, m7
-    vpermq            m6, m6, 11011000b
-    movu              [r0 + r3], m6
-    RET
-
-INIT_YMM avx2
-cglobal intra_pred_ang32_29, 3, 5, 11
-    mova              m0, [pw_1024]
-    mova              m1, [intra_pred_shuff_0_8]
-    lea               r3, [3 * r1]
-    lea               r4, [c_ang32_mode_29]
-
-    ;row [0, 1]
-    vbroadcasti128    m2, [r2 + 1]
-    pshufb            m2, m1
-    vbroadcasti128    m3, [r2 + 9]
-    pshufb            m3, m1
-    vbroadcasti128    m4, [r2 + 17]
-    pshufb            m4, m1
-    vbroadcasti128    m5, [r2 + 25]
-    pshufb            m5, m1
-
-    mova              m10, [r4 + 0 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0], m7
-    movu              [r0 + r1], m6
-
-    ;row [2]
-    mova              m10, [r4 + 1 * mmsize]
-    vperm2i128        m6, m2, m3, 00100000b
-    pmaddubsw         m6, m10
-    pmulhrsw          m6, m0
-    vperm2i128        m7, m4, m5, 00100000b
-    pmaddubsw         m7, m10
-    pmulhrsw          m7, m0
-    packuswb          m6, m7
-    vpermq            m6, m6, 11011000b
-    movu              [r0 + 2 * r1], m6
-
-    ;row [3, 4]
-    vbroadcasti128    m2, [r2 + 2]
-    pshufb            m2, m1
-    vbroadcasti128    m3, [r2 + 10]
-    pshufb            m3, m1
-    vbroadcasti128    m4, [r2 + 18]
-    pshufb            m4, m1
-    vbroadcasti128    m5, [r2 + 26]
-    pshufb            m5, m1
-
-    mova              m10, [r4 + 2 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + r3], m7
-    lea               r0, [r0 + 4 * r1]
-    movu              [r0], m6
-
-    ;row [5, 6]
-    mova              m10, [r4 + 3 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + r1], m7
-    movu              [r0 + 2 * r1], m6
-
-    ;row [7, 8]
-    vbroadcasti128    m2, [r2 + 3]
-    pshufb            m2, m1
-    vbroadcasti128    m3, [r2 + 11]
-    pshufb            m3, m1
-    vbroadcasti128    m4, [r2 + 19]
-    pshufb            m4, m1
-    vbroadcasti128    m5, [r2 + 27]
-    pshufb            m5, m1
-
-    add               r4, 4 * mmsize
-    mova              m10, [r4 + 0 * mmsize]
-
-    INTRA_PRED_ANG32_CAL_ROW
-    movu              [r0 + r3], m7
-    lea               r0, [r0 + 4 * r1]
-    movu              [r0], m6
-
-    ;row [9]
-    mova              m10, [r4 + 1 * mmsize]
-    vperm2i128        m6, m2, m3, 00100000b
-    pmaddubsw         m6, m10
-    pmulhrsw          m6, m0
-    vperm2i128        m7, m4, m5, 00100000b
-    pmaddubsw         m7, m10
8898
-    pmulhrsw          m7, m0
8899
-    packuswb          m6, m7
8900
-    vpermq            m6, m6, 11011000b
8901
-    movu              [r0 + r1], m6
8902
-
8903
-    ;row [10, 11]
8904
-    vbroadcasti128    m2, [r2 + 4]
8905
-    pshufb            m2, m1
8906
-    vbroadcasti128    m3, [r2 + 12]
8907
-    pshufb            m3, m1
8908
-    vbroadcasti128    m4, [r2 + 20]
8909
-    pshufb            m4, m1
8910
-    vbroadcasti128    m5, [r2 + 28]
8911
-    pshufb            m5, m1
8912
-
8913
-    mova              m10, [r4 + 2 * mmsize]
8914
-
8915
-    INTRA_PRED_ANG32_CAL_ROW
8916
-    movu              [r0 + 2 * r1], m7
8917
-    movu              [r0 + r3], m6
8918
-
8919
-    ;row [12, 13]
8920
-    lea               r0, [r0 + 4 * r1]
8921
-    mova              m10, [r4 + 3 * mmsize]
8922
-
8923
-    INTRA_PRED_ANG32_CAL_ROW
8924
-    movu              [r0], m7
8925
-    movu              [r0 + r1], m6
8926
-
8927
-    ;row [14, 15]
8928
-    vbroadcasti128    m2, [r2 + 5]
8929
-    pshufb            m2, m1
8930
-    vbroadcasti128    m3, [r2 + 13]
8931
-    pshufb            m3, m1
8932
-    vbroadcasti128    m4, [r2 + 21]
8933
-    pshufb            m4, m1
8934
-    vbroadcasti128    m5, [r2 + 29]
8935
-    pshufb            m5, m1
8936
-
8937
-    add               r4, 4 * mmsize
8938
-    mova              m10, [r4 + 0 * mmsize]
8939
-
8940
-    INTRA_PRED_ANG32_CAL_ROW
8941
-    movu              [r0 + 2 * r1], m7
8942
-    movu              [r0 + r3], m6
8943
-
8944
-    ;row [16]
8945
-    lea               r0, [r0 + 4 * r1]
8946
-    mova              m10, [r4 + 1 * mmsize]
8947
-    vperm2i128        m6, m2, m3, 00100000b
8948
-    pmaddubsw         m6, m10
8949
-    pmulhrsw          m6, m0
8950
-    vperm2i128        m7, m4, m5, 00100000b
8951
-    pmaddubsw         m7, m10
8952
-    pmulhrsw          m7, m0
8953
-    packuswb          m6, m7
8954
-    vpermq            m6, m6, 11011000b
8955
-    movu              [r0], m6
8956
-
8957
-    ;row [17, 18]
8958
-    vbroadcasti128    m2, [r2 + 6]
8959
-    pshufb            m2, m1
8960
-    vbroadcasti128    m3, [r2 + 14]
8961
-    pshufb            m3, m1
8962
-    vbroadcasti128    m4, [r2 + 22]
8963
-    pshufb            m4, m1
8964
-    vbroadcasti128    m5, [r2 + 30]
8965
-    pshufb            m5, m1
8966
-
8967
-    mova              m10, [r4 + 2 * mmsize]
8968
-
8969
-    INTRA_PRED_ANG32_CAL_ROW
8970
-    movu              [r0 + r1], m7
8971
-    movu              [r0 + 2 * r1], m6
8972
-
8973
-    ;row [19, 20]
8974
-    mova              m10, [r4 + 3 * mmsize]
8975
-
8976
-    INTRA_PRED_ANG32_CAL_ROW
8977
-    movu              [r0 + r3], m7
8978
-    lea               r0, [r0 + 4 * r1]
8979
-    movu              [r0], m6
8980
-
8981
-    ;row [21, 22]
8982
-    vbroadcasti128    m2, [r2 + 7]
8983
-    pshufb            m2, m1
8984
-    vbroadcasti128    m3, [r2 + 15]
8985
-    pshufb            m3, m1
8986
-    vbroadcasti128    m4, [r2 + 23]
8987
-    pshufb            m4, m1
8988
-    vbroadcasti128    m5, [r2 + 31]
8989
-    pshufb            m5, m1
8990
-
8991
-    add               r4, 4 * mmsize
8992
-    mova              m10, [r4 + 0 * mmsize]
8993
-
8994
-    INTRA_PRED_ANG32_CAL_ROW
8995
-    movu              [r0 + r1], m7
8996
-    movu              [r0 + 2 * r1], m6
8997
-
8998
-    ;row [23]
8999
-    mova              m10, [r4 + 1 * mmsize]
9000
-    vperm2i128        m6, m2, m3, 00100000b
9001
-    pmaddubsw         m6, m10
9002
-    pmulhrsw          m6, m0
9003
-    vperm2i128        m7, m4, m5, 00100000b
9004
-    pmaddubsw         m7, m10
9005
-    pmulhrsw          m7, m0
9006
-    packuswb          m6, m7
9007
-    vpermq            m6, m6, 11011000b
9008
-    movu              [r0 + r3], m6
9009
-
9010
-    ;row [24, 25]
9011
-    vbroadcasti128    m2, [r2 + 8]
9012
-    pshufb            m2, m1
9013
-    vbroadcasti128    m3, [r2 + 16]
9014
-    pshufb            m3, m1
9015
-    vbroadcasti128    m4, [r2 + 24]
9016
-    pshufb            m4, m1
9017
-    vbroadcasti128    m5, [r2 + 32]
9018
-    pshufb            m5, m1
9019
-
9020
-    lea               r0, [r0 + 4 * r1]
9021
-    mova              m10, [r4 + 2 * mmsize]
9022
-
9023
-    INTRA_PRED_ANG32_CAL_ROW
9024
-    movu              [r0], m7
9025
-    movu              [r0 + r1], m6
9026
-
9027
-    ;row [26, 27]
9028
-    mova              m10, [r4 + 3 * mmsize]
9029
-
9030
-    INTRA_PRED_ANG32_CAL_ROW
9031
-    movu              [r0 + 2 * r1], m7
9032
-    movu              [r0 + r3], m6
9033
-
9034
-    ;row [28, 29]
9035
-    vbroadcasti128    m2, [r2 + 9]
9036
-    pshufb            m2, m1
9037
-    vbroadcasti128    m3, [r2 + 17]
9038
-    pshufb            m3, m1
9039
-    vbroadcasti128    m4, [r2 + 25]
9040
-    pshufb            m4, m1
9041
-    vbroadcasti128    m5, [r2 + 33]
9042
-    pshufb            m5, m1
9043
-
9044
-    lea               r0, [r0 + 4 * r1]
9045
-    add               r4, 4 * mmsize
9046
-    mova              m10, [r4 + 0 * mmsize]
9047
-
9048
-    INTRA_PRED_ANG32_CAL_ROW
9049
-    movu              [r0], m7
9050
-    movu              [r0 + r1], m6
9051
-
9052
-    ;row [30]
9053
-    mova              m10, [r4 + 1 * mmsize]
9054
-    vperm2i128        m6, m2, m3, 00100000b
9055
-    pmaddubsw         m6, m10
9056
-    pmulhrsw          m6, m0
9057
-    vperm2i128        m7, m4, m5, 00100000b
9058
-    pmaddubsw         m7, m10
9059
-    pmulhrsw          m7, m0
9060
-    packuswb          m6, m7
9061
-    vpermq            m6, m6, 11011000b
9062
-    movu              [r0 + 2 * r1], m6
9063
-
9064
-    ;row [31]
9065
-    vbroadcasti128    m2, [r2 + 10]
9066
-    pshufb            m2, m1
9067
-    vbroadcasti128    m3, [r2 + 18]
9068
-    pshufb            m3, m1
9069
-    vbroadcasti128    m4, [r2 + 26]
9070
-    pshufb            m4, m1
9071
-    vbroadcasti128    m5, [r2 + 34]
9072
-    pshufb            m5, m1
9073
-
9074
-    mova              m10, [r4 + 2 * mmsize]
9075
-    vperm2i128        m6, m2, m3, 00100000b
9076
-    pmaddubsw         m6, m10
9077
-    pmulhrsw          m6, m0
9078
-    vperm2i128        m7, m4, m5, 00100000b
9079
-    pmaddubsw         m7, m10
9080
-    pmulhrsw          m7, m0
9081
-    packuswb          m6, m7
9082
-    vpermq            m6, m6, 11011000b
9083
-    movu              [r0 + r3], m6
9084
-    RET
9085
-
9086
-INIT_YMM avx2
9087
-cglobal intra_pred_ang32_30, 3, 5, 11
9088
-    mova              m0, [pw_1024]
9089
-    mova              m1, [intra_pred_shuff_0_8]
9090
-    lea               r3, [3 * r1]
9091
-    lea               r4, [c_ang32_mode_30]
9092
-
9093
-    ;row [0, 1]
9094
-    vbroadcasti128    m2, [r2 + 1]
9095
-    pshufb            m2, m1
9096
-    vbroadcasti128    m3, [r2 + 9]
9097
-    pshufb            m3, m1
9098
-    vbroadcasti128    m4, [r2 + 17]
9099
-    pshufb            m4, m1
9100
-    vbroadcasti128    m5, [r2 + 25]
9101
-    pshufb            m5, m1
9102
-
9103
-    mova              m10, [r4 + 0 * mmsize]
9104
-
9105
-    INTRA_PRED_ANG32_CAL_ROW
9106
-    movu              [r0], m7
9107
-    movu              [r0 + r1], m6
9108
-
9109
-    ;row [2, 3]
9110
-    vbroadcasti128    m2, [r2 + 2]
9111
-    pshufb            m2, m1
9112
-    vbroadcasti128    m3, [r2 + 10]
9113
-    pshufb            m3, m1
9114
-    vbroadcasti128    m4, [r2 + 18]
9115
-    pshufb            m4, m1
9116
-    vbroadcasti128    m5, [r2 + 26]
9117
-    pshufb            m5, m1
9118
-
9119
-    mova              m10, [r4 + 1 * mmsize]
9120
-
9121
-    INTRA_PRED_ANG32_CAL_ROW
9122
-    movu              [r0 + 2 * r1], m7
9123
-    movu              [r0 + r3], m6
9124
-
9125
-    ;row [4, 5]
9126
-    vbroadcasti128    m2, [r2 + 3]
9127
-    pshufb            m2, m1
9128
-    vbroadcasti128    m3, [r2 + 11]
9129
-    pshufb            m3, m1
9130
-    vbroadcasti128    m4, [r2 + 19]
9131
-    pshufb            m4, m1
9132
-    vbroadcasti128    m5, [r2 + 27]
9133
-    pshufb            m5, m1
9134
-
9135
-    mova              m10, [r4 + 2 * mmsize]
9136
-    lea               r0, [r0 + 4 * r1]
9137
-
9138
-    INTRA_PRED_ANG32_CAL_ROW
9139
-    movu              [r0], m7
9140
-    movu              [r0 + r1], m6
9141
-
9142
-    ;row [6]
9143
-    mova              m10, [r4 + 3 * mmsize]
9144
-    vperm2i128        m6, m2, m3, 00100000b
9145
-    pmaddubsw         m6, m10
9146
-    pmulhrsw          m6, m0
9147
-    vperm2i128        m7, m4, m5, 00100000b
9148
-    pmaddubsw         m7, m10
9149
-    pmulhrsw          m7, m0
9150
-    packuswb          m6, m7
9151
-    vpermq            m6, m6, 11011000b
9152
-    movu              [r0 + 2 * r1], m6
9153
-
9154
-    ;row [7, 8]
9155
-    vbroadcasti128    m2, [r2 + 4]
9156
-    pshufb            m2, m1
9157
-    vbroadcasti128    m3, [r2 + 12]
9158
-    pshufb            m3, m1
9159
-    vbroadcasti128    m4, [r2 + 20]
9160
-    pshufb            m4, m1
9161
-    vbroadcasti128    m5, [r2 + 28]
9162
-    pshufb            m5, m1
9163
-
9164
-    add               r4, 4 * mmsize
9165
-    mova              m10, [r4 + 0 * mmsize]
9166
-
9167
-    INTRA_PRED_ANG32_CAL_ROW
9168
-    movu              [r0 + r3], m7
9169
-    lea               r0, [r0 + 4 * r1]
9170
-    movu              [r0], m6
9171
-
9172
-    ;row [9, 10]
9173
-    vbroadcasti128    m2, [r2 + 5]
9174
-    pshufb            m2, m1
9175
-    vbroadcasti128    m3, [r2 + 13]
9176
-    pshufb            m3, m1
9177
-    vbroadcasti128    m4, [r2 + 21]
9178
-    pshufb            m4, m1
9179
-    vbroadcasti128    m5, [r2 + 29]
9180
-    pshufb            m5, m1
9181
-
9182
-    mova              m10, [r4 + 1 * mmsize]
9183
-
9184
-    INTRA_PRED_ANG32_CAL_ROW
9185
-    movu              [r0 + r1], m7
9186
-    movu              [r0 + 2 * r1], m6
9187
-
9188
-    ;row [11]
9189
-    mova              m10, [r4 + 2 * mmsize]
9190
-    vperm2i128        m6, m2, m3, 00100000b
9191
-    pmaddubsw         m6, m10
9192
-    pmulhrsw          m6, m0
9193
-    vperm2i128        m7, m4, m5, 00100000b
9194
-    pmaddubsw         m7, m10
9195
-    pmulhrsw          m7, m0
9196
-    packuswb          m6, m7
9197
-    vpermq            m6, m6, 11011000b
9198
-    movu              [r0 + r3], m6
9199
-
9200
-    ;row [12, 13]
9201
-    vbroadcasti128    m2, [r2 + 6]
9202
-    pshufb            m2, m1
9203
-    vbroadcasti128    m3, [r2 + 14]
9204
-    pshufb            m3, m1
9205
-    vbroadcasti128    m4, [r2 + 22]
9206
-    pshufb            m4, m1
9207
-    vbroadcasti128    m5, [r2 + 30]
9208
-    pshufb            m5, m1
9209
-
9210
-    mova              m10, [r4 + 3 * mmsize]
9211
-
9212
-    lea               r0, [r0 + 4 * r1]
9213
-
9214
-    INTRA_PRED_ANG32_CAL_ROW
9215
-    movu              [r0], m7
9216
-    movu              [r0 + r1], m6
9217
-
9218
-    ;row [14, 15]
9219
-    vbroadcasti128    m2, [r2 + 7]
9220
-    pshufb            m2, m1
9221
-    vbroadcasti128    m3, [r2 + 15]
9222
-    pshufb            m3, m1
9223
-    vbroadcasti128    m4, [r2 + 23]
9224
-    pshufb            m4, m1
9225
-    vbroadcasti128    m5, [r2 + 31]
9226
-    pshufb            m5, m1
9227
-
9228
-    add               r4, 4 * mmsize
9229
-    mova              m10, [r4 + 0 * mmsize]
9230
-
9231
-    INTRA_PRED_ANG32_CAL_ROW
9232
-    movu              [r0 + 2 * r1], m7
9233
-    movu              [r0 + r3], m6
9234
-
9235
-    ;row [16]
9236
-    mova              m10, [r4 + 1 * mmsize]
9237
-    vperm2i128        m6, m2, m3, 00100000b
9238
-    pmaddubsw         m6, m10
9239
-    pmulhrsw          m6, m0
9240
-    vperm2i128        m7, m4, m5, 00100000b
9241
-    pmaddubsw         m7, m10
9242
-    pmulhrsw          m7, m0
9243
-    packuswb          m6, m7
9244
-    vpermq            m6, m6, 11011000b
9245
-    lea               r0, [r0 + 4 * r1]
9246
-    movu              [r0], m6
9247
-
9248
-    ;row [17, 18]
9249
-    vbroadcasti128    m2, [r2 + 8]
9250
-    pshufb            m2, m1
9251
-    vbroadcasti128    m3, [r2 + 16]
9252
-    pshufb            m3, m1
9253
-    vbroadcasti128    m4, [r2 + 24]
9254
-    pshufb            m4, m1
9255
-    vbroadcasti128    m5, [r2 + 32]
9256
-    pshufb            m5, m1
9257
-
9258
-    mova              m10, [r4 + 2 * mmsize]
9259
-
9260
-    INTRA_PRED_ANG32_CAL_ROW
9261
-    movu              [r0 + r1], m7
9262
-    movu              [r0 + 2 * r1], m6
9263
-
9264
-    ;row [19, 20]
9265
-    vbroadcasti128    m2, [r2 + 9]
9266
-    pshufb            m2, m1
9267
-    vbroadcasti128    m3, [r2 + 17]
9268
-    pshufb            m3, m1
9269
-    vbroadcasti128    m4, [r2 + 25]
9270
-    pshufb            m4, m1
9271
-    vbroadcasti128    m5, [r2 + 33]
9272
-    pshufb            m5, m1
9273
-
9274
-    mova              m10, [r4 + 3 * mmsize]
9275
-
9276
-    INTRA_PRED_ANG32_CAL_ROW
9277
-    movu              [r0 + r3], m7
9278
-    lea               r0, [r0 + 4 * r1]
9279
-    movu              [r0], m6
9280
-
9281
-    add               r4, 4 * mmsize
9282
-
9283
-    ;row [21]
9284
-    mova              m10, [r4 + 0 * mmsize]
9285
-    vperm2i128        m6, m2, m3, 00100000b
9286
-    pmaddubsw         m6, m10
9287
-    pmulhrsw          m6, m0
9288
-    vperm2i128        m7, m4, m5, 00100000b
9289
-    pmaddubsw         m7, m10
9290
-    pmulhrsw          m7, m0
9291
-    packuswb          m6, m7
9292
-    vpermq            m6, m6, 11011000b
9293
-    movu              [r0 + r1], m6
9294
-
9295
-    ;row [22, 23]
9296
-    vbroadcasti128    m2, [r2 + 10]
9297
-    pshufb            m2, m1
9298
-    vbroadcasti128    m3, [r2 + 18]
9299
-    pshufb            m3, m1
9300
-    vbroadcasti128    m4, [r2 + 26]
9301
-    pshufb            m4, m1
9302
-    vbroadcasti128    m5, [r2 + 34]
9303
-    pshufb            m5, m1
9304
-
9305
-    mova              m10, [r4 + 1 * mmsize]
9306
-
9307
-    INTRA_PRED_ANG32_CAL_ROW
9308
-    movu              [r0 + 2 * r1], m7
9309
-    movu              [r0 + r3], m6
9310
-
9311
-    ;row [24, 25]
9312
-    vbroadcasti128    m2, [r2 + 11]
9313
-    pshufb            m2, m1
9314
-    vbroadcasti128    m3, [r2 + 19]
9315
-    pshufb            m3, m1
9316
-    vbroadcasti128    m4, [r2 + 27]
9317
-    pshufb            m4, m1
9318
-    vbroadcasti128    m5, [r2 + 35]
9319
-    pshufb            m5, m1
9320
-
9321
-    mova              m10, [r4 + 2 * mmsize]
9322
-    lea               r0, [r0 + 4 * r1]
9323
-
9324
-    INTRA_PRED_ANG32_CAL_ROW
9325
-    movu              [r0], m7
9326
-    movu              [r0 + r1], m6
9327
-
9328
-    ;row [26]
9329
-    mova              m10, [r4 + 3 * mmsize]
9330
-    vperm2i128        m6, m2, m3, 00100000b
9331
-    pmaddubsw         m6, m10
9332
-    pmulhrsw          m6, m0
9333
-    vperm2i128        m7, m4, m5, 00100000b
9334
-    pmaddubsw         m7, m10
9335
-    pmulhrsw          m7, m0
9336
-    packuswb          m6, m7
9337
-    vpermq            m6, m6, 11011000b
9338
-    movu              [r0 + 2 * r1], m6
9339
-
9340
-    ;row [27, 28]
9341
-    vbroadcasti128    m2, [r2 + 12]
9342
-    pshufb            m2, m1
9343
-    vbroadcasti128    m3, [r2 + 20]
9344
-    pshufb            m3, m1
9345
-    vbroadcasti128    m4, [r2 + 28]
9346
-    pshufb            m4, m1
9347
-    vbroadcasti128    m5, [r2 + 36]
9348
-    pshufb            m5, m1
9349
-
9350
-    add               r4, 4 * mmsize
9351
-    mova              m10, [r4 + 0 * mmsize]
9352
-
9353
-    INTRA_PRED_ANG32_CAL_ROW
9354
-    movu              [r0 + r3], m7
9355
-    lea               r0, [r0 + 4 * r1]
9356
-    movu              [r0], m6
9357
-
9358
-    ;row [29, 30]
9359
-    vbroadcasti128    m2, [r2 + 13]
9360
-    pshufb            m2, m1
9361
-    vbroadcasti128    m3, [r2 + 21]
9362
-    pshufb            m3, m1
9363
-    vbroadcasti128    m4, [r2 + 29]
9364
-    pshufb            m4, m1
9365
-    vbroadcasti128    m5, [r2 + 37]
9366
-    pshufb            m5, m1
9367
-
9368
-    mova              m10, [r4 + 1 * mmsize]
9369
-
9370
-    INTRA_PRED_ANG32_CAL_ROW
9371
-    movu              [r0 + r1], m7
9372
-    movu              [r0 + 2 * r1], m6
9373
-
9374
-    ;row [31]
9375
-    vbroadcasti128    m2, [r2 + 14]
9376
-    pshufb            m2, m1
9377
-    vbroadcasti128    m3, [r2 + 22]
9378
-    pshufb            m3, m1
9379
-    vbroadcasti128    m4, [r2 + 30]
9380
-    pshufb            m4, m1
9381
-    vbroadcasti128    m5, [r2 + 38]
9382
-    pshufb            m5, m1
9383
-
9384
-    mova              m10, [r4 + 2 * mmsize]
9385
-    vperm2i128        m6, m2, m3, 00100000b
9386
-    pmaddubsw         m6, m10
9387
-    pmulhrsw          m6, m0
9388
-    vperm2i128        m7, m4, m5, 00100000b
9389
-    pmaddubsw         m7, m10
9390
-    pmulhrsw          m7, m0
9391
-    packuswb          m6, m7
9392
-    vpermq            m6, m6, 11011000b
9393
-    movu              [r0 + r3], m6
9394
-    RET
9395
-
9396
-INIT_YMM avx2
9397
-cglobal intra_pred_ang32_31, 3, 5, 11
9398
-    mova              m0, [pw_1024]
9399
-    mova              m1, [intra_pred_shuff_0_8]
9400
-    lea               r3, [3 * r1]
9401
-    lea               r4, [c_ang32_mode_31]
9402
-
9403
-    ;row [0]
9404
-    vbroadcasti128    m2, [r2 + 1]
9405
-    pshufb            m2, m1
9406
-    vbroadcasti128    m3, [r2 + 9]
9407
-    pshufb            m3, m1
9408
-    vbroadcasti128    m4, [r2 + 17]
9409
-    pshufb            m4, m1
9410
-    vbroadcasti128    m5, [r2 + 25]
9411
-    pshufb            m5, m1
9412
-
9413
-    mova              m10, [r4 + 0 * mmsize]
9414
-    vperm2i128        m6, m2, m3, 00100000b
9415
-    pmaddubsw         m6, m10
9416
-    pmulhrsw          m6, m0
9417
-    vperm2i128        m7, m4, m5, 00100000b
9418
-    pmaddubsw         m7, m10
9419
-    pmulhrsw          m7, m0
9420
-    packuswb          m6, m7
9421
-    vpermq            m6, m6, 11011000b
9422
-    movu              [r0], m6
9423
-
9424
-    ;row [1, 2]
9425
-    vbroadcasti128    m2, [r2 + 2]
9426
-    pshufb            m2, m1
9427
-    vbroadcasti128    m3, [r2 + 10]
9428
-    pshufb            m3, m1
9429
-    vbroadcasti128    m4, [r2 + 18]
9430
-    pshufb            m4, m1
9431
-    vbroadcasti128    m5, [r2 + 26]
9432
-    pshufb            m5, m1
9433
-
9434
-    mova              m10, [r4 + 1 * mmsize]
9435
-
9436
-    INTRA_PRED_ANG32_CAL_ROW
9437
-    movu              [r0 + r1], m7
9438
-    movu              [r0 + 2 * r1], m6
9439
-
9440
-    ;row [3, 4]
9441
-    vbroadcasti128    m2, [r2 + 3]
9442
-    pshufb            m2, m1
9443
-    vbroadcasti128    m3, [r2 + 11]
9444
-    pshufb            m3, m1
9445
-    vbroadcasti128    m4, [r2 + 19]
9446
-    pshufb            m4, m1
9447
-    vbroadcasti128    m5, [r2 + 27]
9448
-    pshufb            m5, m1
9449
-
9450
-    mova              m10, [r4 + 2 * mmsize]
9451
-
9452
-    INTRA_PRED_ANG32_CAL_ROW
9453
-    movu              [r0 + r3], m7
9454
-    lea               r0, [r0 + 4 * r1]
9455
-    movu              [r0], m6
9456
-
9457
-    ;row [5, 6]
9458
-    vbroadcasti128    m2, [r2 + 4]
9459
-    pshufb            m2, m1
9460
-    vbroadcasti128    m3, [r2 + 12]
9461
-    pshufb            m3, m1
9462
-    vbroadcasti128    m4, [r2 + 20]
9463
-    pshufb            m4, m1
9464
-    vbroadcasti128    m5, [r2 + 28]
9465
-    pshufb            m5, m1
9466
-
9467
-    mova              m10, [r4 + 3 * mmsize]
9468
-
9469
-    INTRA_PRED_ANG32_CAL_ROW
9470
-    movu              [r0 + r1], m7
9471
-    movu              [r0 + 2 * r1], m6
9472
-
9473
-    ;row [7, 8]
9474
-    vbroadcasti128    m2, [r2 + 5]
9475
-    pshufb            m2, m1
9476
-    vbroadcasti128    m3, [r2 + 13]
9477
-    pshufb            m3, m1
9478
-    vbroadcasti128    m4, [r2 + 21]
9479
-    pshufb            m4, m1
9480
-    vbroadcasti128    m5, [r2 + 29]
9481
-    pshufb            m5, m1
9482
-
9483
-    add               r4, 4 * mmsize
9484
-    mova              m10, [r4 + 0 * mmsize]
9485
-
9486
-    INTRA_PRED_ANG32_CAL_ROW
9487
-    movu              [r0 + r3], m7
9488
-    lea               r0, [r0 + 4 * r1]
9489
-    movu              [r0], m6
9490
-
9491
-    ;row [9, 10]
9492
-    vbroadcasti128    m2, [r2 + 6]
9493
-    pshufb            m2, m1
9494
-    vbroadcasti128    m3, [r2 + 14]
9495
-    pshufb            m3, m1
9496
-    vbroadcasti128    m4, [r2 + 22]
9497
-    pshufb            m4, m1
9498
-    vbroadcasti128    m5, [r2 + 30]
9499
-    pshufb            m5, m1
9500
-
9501
-    mova              m10, [r4 + 1 * mmsize]
9502
-
9503
-    INTRA_PRED_ANG32_CAL_ROW
9504
-    movu              [r0 + r1], m7
9505
-    movu              [r0 + 2 * r1], m6
9506
-
9507
-    ;row [11, 12]
9508
-    vbroadcasti128    m2, [r2 + 7]
9509
-    pshufb            m2, m1
9510
-    vbroadcasti128    m3, [r2 + 15]
9511
-    pshufb            m3, m1
9512
-    vbroadcasti128    m4, [r2 + 23]
9513
-    pshufb            m4, m1
9514
-    vbroadcasti128    m5, [r2 + 31]
9515
-    pshufb            m5, m1
9516
-
9517
-    mova              m10, [r4 + 2 * mmsize]
9518
-
9519
-    INTRA_PRED_ANG32_CAL_ROW
9520
-    movu              [r0 + r3], m7
9521
-    lea               r0, [r0 + 4 * r1]
9522
-    movu              [r0], m6
9523
-
9524
-    ;row [13, 14]
9525
-    vbroadcasti128    m2, [r2 + 8]
9526
-    pshufb            m2, m1
9527
-    vbroadcasti128    m3, [r2 + 16]
9528
-    pshufb            m3, m1
9529
-    vbroadcasti128    m4, [r2 + 24]
9530
-    pshufb            m4, m1
9531
-    vbroadcasti128    m5, [r2 + 32]
9532
-    pshufb            m5, m1
9533
-
9534
-    mova              m10, [r4 + 3 * mmsize]
9535
-
9536
-    INTRA_PRED_ANG32_CAL_ROW
9537
-    movu              [r0 + r1], m7
9538
-    movu              [r0 + 2 * r1], m6
9539
-
9540
-    ;row [15]
9541
-    vbroadcasti128    m2, [r2 + 9]
9542
-    pshufb            m2, m1
9543
-    vbroadcasti128    m3, [r2 + 17]
9544
-    pshufb            m3, m1
9545
-    vbroadcasti128    m4, [r2 + 25]
9546
-    pshufb            m4, m1
9547
-    vbroadcasti128    m5, [r2 + 33]
9548
-    pshufb            m5, m1
9549
-
9550
-    add               r4, 4 * mmsize
9551
-    mova              m10, [r4 + 0 * mmsize]
9552
-    vperm2i128        m6, m2, m3, 00100000b
9553
-    pmaddubsw         m6, m10
9554
-    pmulhrsw          m6, m0
9555
-    vperm2i128        m7, m4, m5, 00100000b
9556
-    pmaddubsw         m7, m10
9557
-    pmulhrsw          m7, m0
9558
-    packuswb          m6, m7
9559
-    vpermq            m6, m6, 11011000b
9560
-    movu              [r0 + r3], m6
9561
-
9562
-    ;row [16, 17]
9563
-    vbroadcasti128    m2, [r2 + 10]
9564
-    pshufb            m2, m1
9565
-    vbroadcasti128    m3, [r2 + 18]
9566
-    pshufb            m3, m1
9567
-    vbroadcasti128    m4, [r2 + 26]
9568
-    pshufb            m4, m1
9569
-    vbroadcasti128    m5, [r2 + 34]
9570
-    pshufb            m5, m1
9571
-
9572
-    lea               r0, [r0 + 4 * r1]
9573
-    mova              m10, [r4 + 1 * mmsize]
9574
-
9575
-    INTRA_PRED_ANG32_CAL_ROW
9576
-    movu              [r0], m7
9577
-    movu              [r0 + r1], m6
9578
-
9579
-    ;row [18, 19]
9580
-    vbroadcasti128    m2, [r2 + 11]
9581
-    pshufb            m2, m1
9582
-    vbroadcasti128    m3, [r2 + 19]
9583
-    pshufb            m3, m1
9584
-    vbroadcasti128    m4, [r2 + 27]
9585
-    pshufb            m4, m1
9586
-    vbroadcasti128    m5, [r2 + 35]
9587
-    pshufb            m5, m1
9588
-
9589
-    mova              m10, [r4 + 2 * mmsize]
9590
-
9591
-    INTRA_PRED_ANG32_CAL_ROW
9592
-    movu              [r0 + 2 * r1], m7
9593
-    movu              [r0 + r3], m6
9594
-
9595
-    ;row [20, 21]
9596
-    vbroadcasti128    m2, [r2 + 12]
9597
-    pshufb            m2, m1
9598
-    vbroadcasti128    m3, [r2 + 20]
9599
-    pshufb            m3, m1
9600
-    vbroadcasti128    m4, [r2 + 28]
9601
-    pshufb            m4, m1
9602
-    vbroadcasti128    m5, [r2 + 36]
9603
-    pshufb            m5, m1
9604
-
9605
-    mova              m10, [r4 + 3 * mmsize]
9606
-    lea               r0, [r0 + 4 * r1]
9607
-
9608
-    INTRA_PRED_ANG32_CAL_ROW
9609
-    movu              [r0], m7
9610
-    movu              [r0 + r1], m6
9611
-
9612
-    ;row [22, 23]
9613
-    vbroadcasti128    m2, [r2 + 13]
9614
-    pshufb            m2, m1
9615
-    vbroadcasti128    m3, [r2 + 21]
9616
-    pshufb            m3, m1
9617
-    vbroadcasti128    m4, [r2 + 29]
9618
-    pshufb            m4, m1
9619
-    vbroadcasti128    m5, [r2 + 37]
9620
-    pshufb            m5, m1
9621
-
9622
-    add               r4, 4 * mmsize
9623
-    mova              m10, [r4 + 0 * mmsize]
9624
-
9625
-    INTRA_PRED_ANG32_CAL_ROW
9626
-    movu              [r0 + 2 * r1], m7
9627
-    movu              [r0 + r3], m6
9628
-
9629
-    ;row [24, 25]
9630
-    vbroadcasti128    m2, [r2 + 14]
9631
-    pshufb            m2, m1
9632
-    vbroadcasti128    m3, [r2 + 22]
9633
-    pshufb            m3, m1
9634
-    vbroadcasti128    m4, [r2 + 30]
9635
-    pshufb            m4, m1
9636
-    vbroadcasti128    m5, [r2 + 38]
9637
-    pshufb            m5, m1
9638
-
9639
-    mova              m10, [r4 + 1 * mmsize]
9640
-    lea               r0, [r0 + 4 * r1]
9641
-
9642
-    INTRA_PRED_ANG32_CAL_ROW
9643
-    movu              [r0], m7
9644
-    movu              [r0 + r1], m6
9645
-
9646
-    ;row [26, 27]
9647
-    vbroadcasti128    m2, [r2 + 15]
9648
-    pshufb            m2, m1
9649
-    vbroadcasti128    m3, [r2 + 23]
9650
-    pshufb            m3, m1
9651
-    vbroadcasti128    m4, [r2 + 31]
9652
-    pshufb            m4, m1
9653
-    vbroadcasti128    m5, [r2 + 39]
9654
-    pshufb            m5, m1
9655
-
9656
-    mova              m10, [r4 + 2 * mmsize]
9657
-
9658
-    INTRA_PRED_ANG32_CAL_ROW
9659
-    movu              [r0 + 2 * r1], m7
9660
-    movu              [r0 + r3], m6
9661
-
9662
-    ;row [28, 29]
9663
-    vbroadcasti128    m2, [r2 + 16]
9664
-    pshufb            m2, m1
9665
-    vbroadcasti128    m3, [r2 + 24]
9666
-    pshufb            m3, m1
9667
-    vbroadcasti128    m4, [r2 + 32]
9668
-    pshufb            m4, m1
9669
-    vbroadcasti128    m5, [r2 + 40]
9670
-    pshufb            m5, m1
9671
-
9672
-    mova              m10, [r4 + 3 * mmsize]
9673
-    lea               r0, [r0 + 4 * r1]
9674
-
9675
-    INTRA_PRED_ANG32_CAL_ROW
9676
-    movu              [r0], m7
9677
-    movu              [r0 + r1], m6
9678
-
9679
-    ;row [30]
9680
-    vbroadcasti128    m2, [r2 + 17]
9681
-    pshufb            m2, m1
9682
-    vbroadcasti128    m3, [r2 + 25]
9683
-    pshufb            m3, m1
9684
-    vbroadcasti128    m4, [r2 + 33]
9685
-    pshufb            m4, m1
9686
-    vbroadcasti128    m5, [r2 + 41]
9687
-    pshufb            m5, m1
9688
-
9689
-    add               r4, 4 * mmsize
9690
-    mova              m10, [r4 + 0 * mmsize]
9691
-    vperm2i128        m6, m2, m3, 00100000b
9692
-    pmaddubsw         m6, m10
9693
-    pmulhrsw          m6, m0
9694
-    vperm2i128        m7, m4, m5, 00100000b
9695
-    pmaddubsw         m7, m10
9696
-    pmulhrsw          m7, m0
9697
-    packuswb          m6, m7
9698
-    vpermq            m6, m6, 11011000b
9699
-    movu              [r0 + 2 * r1], m6
9700
-
9701
-    ;row [31]
9702
-    vbroadcasti128    m2, [r2 + 18]
9703
-    pshufb            m2, m1
9704
-    vbroadcasti128    m3, [r2 + 26]
9705
-    pshufb            m3, m1
9706
-    vbroadcasti128    m4, [r2 + 34]
9707
-    pshufb            m4, m1
9708
-    vbroadcasti128    m5, [r2 + 42]
9709
-    pshufb            m5, m1
9710
-
9711
-    mova              m10, [r4 + 1 * mmsize]
9712
-    vperm2i128        m6, m2, m3, 00100000b
9713
-    pmaddubsw         m6, m10
9714
-    pmulhrsw          m6, m0
9715
-    vperm2i128        m7, m4, m5, 00100000b
9716
-    pmaddubsw         m7, m10
9717
-    pmulhrsw          m7, m0
9718
-    packuswb          m6, m7
9719
-    vpermq            m6, m6, 11011000b
9720
-    movu              [r0 + r3], m6
9721
-    RET
9722
-
9723
-INIT_YMM avx2
9724
-cglobal intra_pred_ang32_32, 3, 5, 11
9725
-    mova              m0, [pw_1024]
9726
-    mova              m1, [intra_pred_shuff_0_8]
9727
-    lea               r3, [3 * r1]
9728
-    lea               r4, [c_ang32_mode_32]
9729
-
9730
-    ;row [0]
9731
-    vbroadcasti128    m2, [r2 + 1]
9732
-    pshufb            m2, m1
9733
-    vbroadcasti128    m3, [r2 + 9]
9734
-    pshufb            m3, m1
9735
-    vbroadcasti128    m4, [r2 + 17]
9736
-    pshufb            m4, m1
9737
-    vbroadcasti128    m5, [r2 + 25]
9738
-    pshufb            m5, m1
9739
-
9740
-    mova              m10, [r4 + 0 * mmsize]
9741
-    vperm2i128        m6, m2, m3, 00100000b
9742
-    pmaddubsw         m6, m10
9743
-    pmulhrsw          m6, m0
9744
-    vperm2i128        m7, m4, m5, 00100000b
9745
-    pmaddubsw         m7, m10
9746
-    pmulhrsw          m7, m0
9747
-    packuswb          m6, m7
9748
-    vpermq            m6, m6, 11011000b
9749
-    movu              [r0], m6
9750
-
9751
-    ;row [1, 2]
9752
-    vbroadcasti128    m2, [r2 + 2]
9753
-    pshufb            m2, m1
9754
-    vbroadcasti128    m3, [r2 + 10]
9755
-    pshufb            m3, m1
9756
-    vbroadcasti128    m4, [r2 + 18]
9757
-    pshufb            m4, m1
9758
-    vbroadcasti128    m5, [r2 + 26]
9759
-    pshufb            m5, m1
9760
-
9761
-    mova              m10, [r4 + 1 * mmsize]
9762
-
9763
-    INTRA_PRED_ANG32_CAL_ROW
9764
-    movu              [r0 + r1], m7
9765
-    movu              [r0 + 2 * r1], m6
9766
-
9767
-    ;row [3]
9768
-    vbroadcasti128    m2, [r2 + 3]
9769
-    pshufb            m2, m1
9770
-    vbroadcasti128    m3, [r2 + 11]
9771
-    pshufb            m3, m1
9772
-    vbroadcasti128    m4, [r2 + 19]
9773
-    pshufb            m4, m1
9774
-    vbroadcasti128    m5, [r2 + 27]
9775
-    pshufb            m5, m1
9776
-
9777
-    mova              m10, [r4 + 2 * mmsize]
9778
-    vperm2i128        m6, m2, m3, 00100000b
9779
-    pmaddubsw         m6, m10
9780
-    pmulhrsw          m6, m0
9781
-    vperm2i128        m7, m4, m5, 00100000b
9782
-    pmaddubsw         m7, m10
9783
-    pmulhrsw          m7, m0
9784
-    packuswb          m6, m7
9785
-    vpermq            m6, m6, 11011000b
9786
-    movu              [r0 + r3], m6
9787
-
9788
-    ;row [4, 5]
9789
-    vbroadcasti128    m2, [r2 + 4]
9790
-    pshufb            m2, m1
9791
-    vbroadcasti128    m3, [r2 + 12]
9792
-    pshufb            m3, m1
9793
-    vbroadcasti128    m4, [r2 + 20]
9794
-    pshufb            m4, m1
9795
-    vbroadcasti128    m5, [r2 + 28]
9796
-    pshufb            m5, m1
9797
-
9798
-    mova              m10, [r4 + 3 * mmsize]
9799
-    lea               r0, [r0 + 4 * r1]
9800
-
9801
-    INTRA_PRED_ANG32_CAL_ROW
9802
-    movu              [r0], m7
9803
-    movu              [r0 + r1], m6
9804
-
9805
-    ;row [6]
9806
-    vbroadcasti128    m2, [r2 + 5]
9807
-    pshufb            m2, m1
9808
-    vbroadcasti128    m3, [r2 + 13]
9809
-    pshufb            m3, m1
9810
-    vbroadcasti128    m4, [r2 + 21]
9811
-    pshufb            m4, m1
9812
-    vbroadcasti128    m5, [r2 + 29]
9813
-    pshufb            m5, m1
9814
-
9815
-    add               r4, 4 * mmsize
9816
-    mova              m10, [r4 + 0 * mmsize]
9817
-    vperm2i128        m6, m2, m3, 00100000b
9818
-    pmaddubsw         m6, m10
9819
-    pmulhrsw          m6, m0
9820
-    vperm2i128        m7, m4, m5, 00100000b
9821
-    pmaddubsw         m7, m10
9822
-    pmulhrsw          m7, m0
9823
-    packuswb          m6, m7
9824
-    vpermq            m6, m6, 11011000b
9825
-    movu              [r0 + 2 * r1], m6
9826
-
9827
-    ;row [7, 8]
9828
-    vbroadcasti128    m2, [r2 + 6]
9829
-    pshufb            m2, m1
9830
-    vbroadcasti128    m3, [r2 + 14]
9831
-    pshufb            m3, m1
9832
-    vbroadcasti128    m4, [r2 + 22]
9833
-    pshufb            m4, m1
9834
-    vbroadcasti128    m5, [r2 + 30]
9835
-    pshufb            m5, m1
9836
-
9837
-    mova              m10, [r4 + 1 * mmsize]
9838
-
9839
-    INTRA_PRED_ANG32_CAL_ROW
9840
-    movu              [r0 + r3], m7
9841
-    lea               r0, [r0 + 4 * r1]
9842
-    movu              [r0], m6
9843
-
9844
-    ;row [9]
9845
-    vbroadcasti128    m2, [r2 + 7]
9846
-    pshufb            m2, m1
9847
-    vbroadcasti128    m3, [r2 + 15]
9848
-    pshufb            m3, m1
9849
-    vbroadcasti128    m4, [r2 + 23]
9850
-    pshufb            m4, m1
9851
-    vbroadcasti128    m5, [r2 + 31]
9852
-    pshufb            m5, m1
9853
-
9854
-    mova              m10, [r4 + 2 * mmsize]
9855
-    vperm2i128        m6, m2, m3, 00100000b
9856
-    pmaddubsw         m6, m10
9857
-    pmulhrsw          m6, m0
9858
-    vperm2i128        m7, m4, m5, 00100000b
9859
-    pmaddubsw         m7, m10
9860
-    pmulhrsw          m7, m0
9861
-    packuswb          m6, m7
9862
-    vpermq            m6, m6, 11011000b
9863
-    movu              [r0 + r1], m6
9864
-
9865
-    ;row [10, 11]
9866
-    vbroadcasti128    m2, [r2 + 8]
9867
-    pshufb            m2, m1
9868
-    vbroadcasti128    m3, [r2 + 16]
9869
-    pshufb            m3, m1
9870
-    vbroadcasti128    m4, [r2 + 24]
9871
-    pshufb            m4, m1
9872
-    vbroadcasti128    m5, [r2 + 32]
9873
-    pshufb            m5, m1
9874
-
9875
-    mova              m10, [r4 + 3 * mmsize]
9876
-
9877
-    INTRA_PRED_ANG32_CAL_ROW
9878
-    movu              [r0 + 2 * r1], m7
9879
-    movu              [r0 + r3], m6
9880
-
9881
-    ;row [12]
9882
-    vbroadcasti128    m2, [r2 + 9]
9883
-    pshufb            m2, m1
9884
-    vbroadcasti128    m3, [r2 + 17]
9885
-    pshufb            m3, m1
9886
-    vbroadcasti128    m4, [r2 + 25]
9887
-    pshufb            m4, m1
9888
-    vbroadcasti128    m5, [r2 + 33]
9889
-    pshufb            m5, m1
9890
-
9891
-    add               r4, 4 * mmsize
9892
-    mova              m10, [r4 + 0 * mmsize]
9893
-    lea               r0, [r0 + 4 * r1]
9894
-
9895
-    vperm2i128        m6, m2, m3, 00100000b
9896
-    pmaddubsw         m6, m10
9897
-    pmulhrsw          m6, m0
9898
-    vperm2i128        m7, m4, m5, 00100000b
9899
-    pmaddubsw         m7, m10
9900
-    pmulhrsw          m7, m0
9901
-    packuswb          m6, m7
9902
-    vpermq            m6, m6, 11011000b
9903
-    movu              [r0], m6
9904
-
9905
-    ;row [13, 14]
9906
-    vbroadcasti128    m2, [r2 + 10]
9907
-    pshufb            m2, m1
9908
-    vbroadcasti128    m3, [r2 + 18]
9909
-    pshufb            m3, m1
9910
-    vbroadcasti128    m4, [r2 + 26]
9911
-    pshufb            m4, m1
9912
-    vbroadcasti128    m5, [r2 + 34]
9913
-    pshufb            m5, m1
9914
-
9915
-    mova              m10, [r4 + 1 * mmsize]
9916
-
9917
-    INTRA_PRED_ANG32_CAL_ROW
9918
-    movu              [r0 + r1], m7
9919
-    movu              [r0 + 2 * r1], m6
9920
-
9921
-    ;row [15]
9922
-    vbroadcasti128    m2, [r2 + 11]
9923
-    pshufb            m2, m1
9924
-    vbroadcasti128    m3, [r2 + 19]
9925
-    pshufb            m3, m1
9926
-    vbroadcasti128    m4, [r2 + 27]
9927
-    pshufb            m4, m1
9928
-    vbroadcasti128    m5, [r2 + 35]
9929
-    pshufb            m5, m1
9930
-
9931
-    mova              m10, [r4 + 2 * mmsize]
9932
-    vperm2i128        m6, m2, m3, 00100000b
9933
-    pmaddubsw         m6, m10
9934
-    pmulhrsw          m6, m0
9935
-    vperm2i128        m7, m4, m5, 00100000b
9936
-    pmaddubsw         m7, m10
9937
-    pmulhrsw          m7, m0
9938
-    packuswb          m6, m7
9939
-    vpermq            m6, m6, 11011000b
9940
-    movu              [r0 + r3], m6
9941
-
9942
-    ;row [16, 17]
9943
-    vbroadcasti128    m2, [r2 + 12]
9944
-    pshufb            m2, m1
9945
-    vbroadcasti128    m3, [r2 + 20]
9946
-    pshufb            m3, m1
9947
-    vbroadcasti128    m4, [r2 + 28]
9948
-    pshufb            m4, m1
9949
-    vbroadcasti128    m5, [r2 + 36]
9950
-    pshufb            m5, m1
9951
-
9952
-    mova              m10, [r4 + 3 * mmsize]
9953
-    lea               r0, [r0 + 4 * r1]
9954
-
9955
-    INTRA_PRED_ANG32_CAL_ROW
9956
-    movu              [r0], m7
9957
-    movu              [r0 + r1], m6
9958
-
9959
-    ;row [18]
9960
-    vbroadcasti128    m2, [r2 + 13]
9961
-    pshufb            m2, m1
9962
-    vbroadcasti128    m3, [r2 + 21]
9963
-    pshufb            m3, m1
9964
-    vbroadcasti128    m4, [r2 + 29]
9965
-    pshufb            m4, m1
9966
-    vbroadcasti128    m5, [r2 + 37]
9967
-    pshufb            m5, m1
9968
-
9969
-    add               r4, 4 * mmsize
9970
-    mova              m10, [r4 + 0 * mmsize]
9971
-    vperm2i128        m6, m2, m3, 00100000b
9972
-    pmaddubsw         m6, m10
9973
-    pmulhrsw          m6, m0
9974
-    vperm2i128        m7, m4, m5, 00100000b
9975
-    pmaddubsw         m7, m10
9976
-    pmulhrsw          m7, m0
9977
-    packuswb          m6, m7
9978
-    vpermq            m6, m6, 11011000b
9979
-    movu              [r0 + 2 * r1], m6
9980
-
9981
-    ;row [19, 20]
9982
-    vbroadcasti128    m2, [r2 + 14]
9983
-    pshufb            m2, m1
9984
-    vbroadcasti128    m3, [r2 + 22]
9985
-    pshufb            m3, m1
9986
-    vbroadcasti128    m4, [r2 + 30]
9987
-    pshufb            m4, m1
9988
-    vbroadcasti128    m5, [r2 + 38]
9989
-    pshufb            m5, m1
9990
-
9991
-    mova              m10, [r4 + 1 * mmsize]
9992
-
9993
-    INTRA_PRED_ANG32_CAL_ROW
9994
-    movu              [r0 + r3], m7
9995
-    lea               r0, [r0 + 4 * r1]
9996
-    movu              [r0], m6
9997
-
9998
-    ;row [21]
9999
-    vbroadcasti128    m2, [r2 + 15]
10000
-    pshufb            m2, m1
10001
-    vbroadcasti128    m3, [r2 + 23]
10002
-    pshufb            m3, m1
10003
-    vbroadcasti128    m4, [r2 + 31]
10004
-    pshufb            m4, m1
10005
-    vbroadcasti128    m5, [r2 + 39]
10006
-    pshufb            m5, m1
10007
-
10008
-    mova              m10, [r4 + 2 * mmsize]
10009
-    vperm2i128        m6, m2, m3, 00100000b
10010
-    pmaddubsw         m6, m10
10011
-    pmulhrsw          m6, m0
10012
-    vperm2i128        m7, m4, m5, 00100000b
10013
-    pmaddubsw         m7, m10
10014
-    pmulhrsw          m7, m0
10015
-    packuswb          m6, m7
10016
-    vpermq            m6, m6, 11011000b
10017
-    movu              [r0 + r1], m6
10018
-
10019
-    ;row [22, 23]
10020
-    vbroadcasti128    m2, [r2 + 16]
10021
-    pshufb            m2, m1
10022
-    vbroadcasti128    m3, [r2 + 24]
10023
-    pshufb            m3, m1
10024
-    vbroadcasti128    m4, [r2 + 32]
10025
-    pshufb            m4, m1
10026
-    vbroadcasti128    m5, [r2 + 40]
10027
-    pshufb            m5, m1
10028
-
10029
-    mova              m10, [r4 + 3 * mmsize]
10030
-
10031
-    INTRA_PRED_ANG32_CAL_ROW
10032
-    movu              [r0 + 2 * r1], m7
10033
-    movu              [r0 + r3], m6
10034
-
10035
-    ;row [24]
10036
-    vbroadcasti128    m2, [r2 + 17]
10037
-    pshufb            m2, m1
10038
-    vbroadcasti128    m3, [r2 + 25]
10039
-    pshufb            m3, m1
10040
-    vbroadcasti128    m4, [r2 + 33]
10041
-    pshufb            m4, m1
10042
-    vbroadcasti128    m5, [r2 + 41]
10043
-    pshufb            m5, m1
10044
-
10045
-    lea               r0, [r0 + 4 * r1]
10046
-    add               r4, 4 * mmsize
10047
-    mova              m10, [r4 + 0 * mmsize]
10048
-    vperm2i128        m6, m2, m3, 00100000b
10049
-    pmaddubsw         m6, m10
10050
-    pmulhrsw          m6, m0
10051
-    vperm2i128        m7, m4, m5, 00100000b
10052
-    pmaddubsw         m7, m10
10053
-    pmulhrsw          m7, m0
10054
-    packuswb          m6, m7
10055
-    vpermq            m6, m6, 11011000b
10056
-    movu              [r0], m6
10057
-
10058
-    ;row [25, 26]
10059
-    vbroadcasti128    m2, [r2 + 18]
10060
-    pshufb            m2, m1
10061
-    vbroadcasti128    m3, [r2 + 26]
10062
-    pshufb            m3, m1
10063
-    vbroadcasti128    m4, [r2 + 34]
10064
-    pshufb            m4, m1
10065
-    vbroadcasti128    m5, [r2 + 42]
10066
-    pshufb            m5, m1
10067
-
10068
-    mova              m10, [r4 + 1 * mmsize]
10069
-
10070
-    INTRA_PRED_ANG32_CAL_ROW
10071
-    movu              [r0 + r1], m7
10072
-    movu              [r0 + 2 * r1], m6
10073
-
10074
-    ;row [27]
10075
-    vbroadcasti128    m2, [r2 + 19]
10076
-    pshufb            m2, m1
10077
-    vbroadcasti128    m3, [r2 + 27]
10078
-    pshufb            m3, m1
10079
-    vbroadcasti128    m4, [r2 + 35]
10080
-    pshufb            m4, m1
10081
-    vbroadcasti128    m5, [r2 + 43]
10082
-    pshufb            m5, m1
10083
-
10084
-    mova              m10, [r4 + 2 * mmsize]
10085
-    vperm2i128        m6, m2, m3, 00100000b
10086
-    pmaddubsw         m6, m10
10087
-    pmulhrsw          m6, m0
10088
-    vperm2i128        m7, m4, m5, 00100000b
10089
-    pmaddubsw         m7, m10
10090
-    pmulhrsw          m7, m0
10091
-    packuswb          m6, m7
10092
-    vpermq            m6, m6, 11011000b
10093
-    movu              [r0 + r3], m6
10094
-
10095
-    ;row [28, 29]
10096
-    vbroadcasti128    m2, [r2 + 20]
10097
-    pshufb            m2, m1
10098
-    vbroadcasti128    m3, [r2 + 28]
10099
-    pshufb            m3, m1
10100
-    vbroadcasti128    m4, [r2 + 36]
10101
-    pshufb            m4, m1
10102
-    vbroadcasti128    m5, [r2 + 44]
10103
-    pshufb            m5, m1
10104
-
10105
-    mova              m10, [r4 + 3 * mmsize]
10106
-    lea               r0, [r0 + 4 * r1]
10107
-
10108
-    INTRA_PRED_ANG32_CAL_ROW
10109
-    movu              [r0], m7
10110
-    movu              [r0 + r1], m6
10111
-
10112
-    ;row [30]
10113
-    vbroadcasti128    m2, [r2 + 21]
10114
-    pshufb            m2, m1
10115
-    vbroadcasti128    m3, [r2 + 29]
10116
-    pshufb            m3, m1
10117
-    vbroadcasti128    m4, [r2 + 37]
10118
-    pshufb            m4, m1
10119
-    vbroadcasti128    m5, [r2 + 45]
10120
-    pshufb            m5, m1
10121
-
10122
-    add               r4, 4 * mmsize
10123
-    mova              m10, [r4 + 0 * mmsize]
10124
-    vperm2i128        m6, m2, m3, 00100000b
10125
-    pmaddubsw         m6, m10
10126
-    pmulhrsw          m6, m0
10127
-    vperm2i128        m7, m4, m5, 00100000b
10128
-    pmaddubsw         m7, m10
10129
-    pmulhrsw          m7, m0
10130
-    packuswb          m6, m7
10131
-    vpermq            m6, m6, 11011000b
10132
-    movu              [r0 + 2 * r1], m6
10133
-
10134
-    ;row [31]
10135
-    vbroadcasti128    m2, [r2 + 22]
10136
-    pshufb            m2, m1
10137
-    vbroadcasti128    m3, [r2 + 30]
10138
-    pshufb            m3, m1
10139
-    vbroadcasti128    m4, [r2 + 38]
10140
-    pshufb            m4, m1
10141
-    vbroadcasti128    m5, [r2 + 46]
10142
-    pshufb            m5, m1
10143
-
10144
-    mova              m10, [r4 + 1 * mmsize]
10145
-    vperm2i128        m6, m2, m3, 00100000b
10146
-    pmaddubsw         m6, m10
10147
-    pmulhrsw          m6, m0
10148
-    vperm2i128        m7, m4, m5, 00100000b
10149
-    pmaddubsw         m7, m10
10150
-    pmulhrsw          m7, m0
10151
-    packuswb          m6, m7
10152
-    vpermq            m6, m6, 11011000b
10153
-    movu              [r0 + r3], m6
10154
-    RET
10155
-
10156
-INIT_YMM avx2
10157
-cglobal intra_pred_ang32_25, 3, 5, 11
10158
-    mova              m0, [pw_1024]
10159
-    mova              m1, [intra_pred_shuff_0_8]
10160
-    lea               r3, [3 * r1]
10161
-    lea               r4, [c_ang32_mode_25]
10162
-
10163
-    ;row[0, 1]
10164
-    vbroadcasti128    m2, [r2 + 0]
10165
-    pshufb            m2, m1
10166
-    vbroadcasti128    m3, [r2 + 8]
10167
-    pshufb            m3, m1
10168
-    vbroadcasti128    m4, [r2 + 16]
10169
-    pshufb            m4, m1
10170
-    vbroadcasti128    m5, [r2 + 24]
10171
-    pshufb            m5, m1
10172
-
10173
-    mova              m10, [r4 + 0 * mmsize]
10174
-
10175
-    INTRA_PRED_ANG32_CAL_ROW
10176
-    movu              [r0], m7
10177
-    movu              [r0 + r1], m6
10178
-
10179
-    ;row[2, 3]
10180
-    mova              m10, [r4 + 1 * mmsize]
10181
-
10182
-    INTRA_PRED_ANG32_CAL_ROW
10183
-    movu              [r0 + 2 * r1], m7
10184
-    movu              [r0 + r3], m6
10185
-
10186
-    ;row[4, 5]
10187
-    mova              m10, [r4 + 2 * mmsize]
10188
-    lea               r0, [r0 + 4 * r1]
10189
-
10190
-    INTRA_PRED_ANG32_CAL_ROW
10191
-    movu              [r0], m7
10192
-    movu              [r0 + r1], m6
10193
-
10194
-    ;row[6, 7]
10195
-    mova              m10, [r4 + 3 * mmsize]
10196
-
10197
-    INTRA_PRED_ANG32_CAL_ROW
10198
-    movu              [r0 + 2 * r1], m7
10199
-    movu              [r0 + r3], m6
10200
-
10201
-    ;row[8, 9]
10202
-    add               r4, 4 * mmsize
10203
-    lea               r0, [r0 + 4 * r1]
10204
-    mova              m10, [r4 + 0 * mmsize]
10205
-
10206
-    INTRA_PRED_ANG32_CAL_ROW
10207
-    movu              [r0], m7
10208
-    movu              [r0 + r1], m6
10209
-
10210
-    ;row[10, 11]
10211
-    mova              m10, [r4 + 1 * mmsize]
10212
-
10213
-    INTRA_PRED_ANG32_CAL_ROW
10214
-    movu              [r0 + 2 * r1], m7
10215
-    movu              [r0 + r3], m6
10216
-
10217
-    ;row[12, 13]
10218
-    mova              m10, [r4 + 2 * mmsize]
10219
-    lea               r0, [r0 + 4 * r1]
10220
-
10221
-    INTRA_PRED_ANG32_CAL_ROW
10222
-    movu              [r0], m7
10223
-    movu              [r0 + r1], m6
10224
-
10225
-    ;row[14, 15]
10226
-    mova              m10, [r4 + 3 * mmsize]
10227
-
10228
-    INTRA_PRED_ANG32_CAL_ROW
10229
-    movu              [r0 + 2 * r1], m7
10230
-    movu              [r0 + r3], m6
10231
-
10232
-    ;row[16, 17]
10233
-    movu              xm2, [r2 - 1]
10234
-    pinsrb            xm2, [r2 + 80], 0
10235
-    vinserti128       m2, m2, xm2, 1
10236
-    pshufb            m2, m1
10237
-    vbroadcasti128    m3, [r2 + 7]
10238
-    pshufb            m3, m1
10239
-    vbroadcasti128    m4, [r2 + 15]
10240
-    pshufb            m4, m1
10241
-    vbroadcasti128    m5, [r2 + 23]
10242
-    pshufb            m5, m1
10243
-
10244
-    add               r4, 4 * mmsize
10245
-    lea               r0, [r0 + 4 * r1]
10246
-    mova              m10, [r4 + 0 * mmsize]
10247
-
10248
-    INTRA_PRED_ANG32_CAL_ROW
10249
-    movu              [r0], m7
10250
-    movu              [r0 + r1], m6
10251
-
10252
-    ;row[18, 19]
10253
-    mova              m10, [r4 + 1 * mmsize]
10254
-
10255
-    INTRA_PRED_ANG32_CAL_ROW
10256
-    movu              [r0 + 2 * r1], m7
10257
-    movu              [r0 + r3], m6
10258
-
10259
-    ;row[20, 21]
10260
-    mova              m10, [r4 + 2 * mmsize]
10261
-    lea               r0, [r0 + 4 * r1]
10262
-
10263
-    INTRA_PRED_ANG32_CAL_ROW
10264
-    movu              [r0], m7
10265
-    movu              [r0 + r1], m6
10266
-
10267
-    ;row[22, 23]
10268
-    mova              m10, [r4 + 3 * mmsize]
10269
-
10270
-    INTRA_PRED_ANG32_CAL_ROW
10271
-    movu              [r0 + 2 * r1], m7
10272
-    movu              [r0 + r3], m6
10273
-
10274
-    ;row[24, 25]
10275
-    add               r4, 4 * mmsize
10276
-    lea               r0, [r0 + 4 * r1]
10277
-    mova              m10, [r4 + 0 * mmsize]
10278
-
10279
-    INTRA_PRED_ANG32_CAL_ROW
10280
-    movu              [r0], m7
10281
-    movu              [r0 + r1], m6
10282
-
10283
-    ;row[26, 27]
10284
-    mova              m10, [r4 + 1 * mmsize]
10285
-
10286
-    INTRA_PRED_ANG32_CAL_ROW
10287
-    movu              [r0 + 2 * r1], m7
10288
-    movu              [r0 + r3], m6
10289
-
10290
-    ;row[28, 29]
10291
-    mova              m10, [r4 + 2 * mmsize]
10292
-    lea               r0, [r0 + 4 * r1]
10293
-
10294
-    INTRA_PRED_ANG32_CAL_ROW
10295
-    movu              [r0], m7
10296
-    movu              [r0 + r1], m6
10297
-
10298
-    ;row[30, 31]
10299
-    mova              m10, [r4 + 3 * mmsize]
10300
-
10301
-    INTRA_PRED_ANG32_CAL_ROW
10302
-    movu              [r0 + 2 * r1], m7
10303
-    movu              [r0 + r3], m6
10304
-    RET
10305
-
10306
-INIT_YMM avx2
10307
-cglobal intra_pred_ang32_24, 3, 5, 12
10308
-    mova              m0, [pw_1024]
10309
-    mova              m1, [intra_pred_shuff_0_8]
10310
-    lea               r3, [3 * r1]
10311
-    lea               r4, [c_ang32_mode_24]
10312
-
10313
-    ;row[0, 1]
10314
-    vbroadcasti128    m11, [r2 + 0]
10315
-    pshufb            m2, m11, m1
10316
-    vbroadcasti128    m3, [r2 + 8]
10317
-    pshufb            m3, m1
10318
-    vbroadcasti128    m4, [r2 + 16]
10319
-    pshufb            m4, m1
10320
-    vbroadcasti128    m5, [r2 + 24]
10321
-    pshufb            m5, m1
10322
-
10323
-    mova              m10, [r4 + 0 * mmsize]
10324
-
10325
-    INTRA_PRED_ANG32_CAL_ROW
10326
-    movu              [r0], m7
10327
-    movu              [r0 + r1], m6
10328
-
10329
-    ;row[2, 3]
10330
-    mova              m10, [r4 + 1 * mmsize]
10331
-
10332
-    INTRA_PRED_ANG32_CAL_ROW
10333
-    movu              [r0 + 2 * r1], m7
10334
-    movu              [r0 + r3], m6
10335
-
10336
-    ;row[4, 5]
10337
-    mova              m10, [r4 + 2 * mmsize]
10338
-    lea               r0, [r0 + 4 * r1]
10339
-
10340
-    INTRA_PRED_ANG32_CAL_ROW
10341
-    movu              [r0], m7
10342
-    movu              [r0 + r1], m6
10343
-
10344
-    ;row[6, 7]
10345
-    pslldq            xm11, 1
10346
-    pinsrb            xm11, [r2 + 70], 0
10347
-    vinserti128       m2, m11, xm11, 1
10348
-    pshufb            m2, m1
10349
-    vbroadcasti128    m3, [r2 + 7]
10350
-    pshufb            m3, m1
10351
-    vbroadcasti128    m4, [r2 + 15]
10352
-    pshufb            m4, m1
10353
-    vbroadcasti128    m5, [r2 + 23]
10354
-    pshufb            m5, m1
10355
-
10356
-    mova              m10, [r4 + 3 * mmsize]
10357
-
10358
-    INTRA_PRED_ANG32_CAL_ROW
10359
-    movu              [r0 + 2 * r1], m7
10360
-    movu              [r0 + r3], m6
10361
-
10362
-    ;row[8, 9]
10363
-    add               r4, 4 * mmsize
10364
-    lea               r0, [r0 + 4 * r1]
10365
-    mova              m10, [r4 + 0 * mmsize]
10366
-
10367
-    INTRA_PRED_ANG32_CAL_ROW
10368
-    movu              [r0], m7
10369
-    movu              [r0 + r1], m6
10370
-
10371
-    ;row[10, 11]
10372
-    mova              m10, [r4 + 1 * mmsize]
10373
-
10374
-    INTRA_PRED_ANG32_CAL_ROW
10375
-    movu              [r0 + 2 * r1], m7
10376
-    movu              [r0 + r3], m6
10377
-
10378
-    ;row[12, 13]
10379
-    pslldq            xm11, 1
10380
-    pinsrb            xm11, [r2 + 77], 0
10381
-    vinserti128       m2, m11, xm11, 1
10382
-    pshufb            m2, m1
10383
-    vbroadcasti128    m3, [r2 + 6]
10384
-    pshufb            m3, m1
10385
-    vbroadcasti128    m4, [r2 + 14]
10386
-    pshufb            m4, m1
10387
-    vbroadcasti128    m5, [r2 + 22]
10388
-    pshufb            m5, m1
10389
-
10390
-    mova              m10, [r4 + 2 * mmsize]
10391
-    lea               r0, [r0 + 4 * r1]
10392
-
10393
-    INTRA_PRED_ANG32_CAL_ROW
10394
-    movu              [r0], m7
10395
-    movu              [r0 + r1], m6
10396
-
10397
-    ;row[14, 15]
10398
-    mova              m10, [r4 + 3 * mmsize]
10399
-
10400
-    INTRA_PRED_ANG32_CAL_ROW
10401
-    movu              [r0 + 2 * r1], m7
10402
-    movu              [r0 + r3], m6
10403
-
10404
-    ;row[16, 17]
10405
-    add               r4, 4 * mmsize
10406
-    lea               r0, [r0 + 4 * r1]
10407
-    mova              m10, [r4 + 0 * mmsize]
10408
-
10409
-    INTRA_PRED_ANG32_CAL_ROW
10410
-    movu              [r0], m7
10411
-    movu              [r0 + r1], m6
10412
-
10413
-    ;row[18]
10414
-    mova              m10, [r4 + 1 * mmsize]
10415
-    vperm2i128        m6, m2, m3, 00100000b
10416
-    pmaddubsw         m6, m10
10417
-    pmulhrsw          m6, m0
10418
-    vperm2i128        m7, m4, m5, 00100000b
10419
-    pmaddubsw         m7, m10
10420
-    pmulhrsw          m7, m0
10421
-    packuswb          m6, m7
10422
-    vpermq            m6, m6, 11011000b
10423
-    movu              [r0 + 2 * r1], m6
10424
-
10425
-    ;row[19, 20]
10426
-    pslldq            xm11, 1
10427
-    pinsrb            xm11, [r2 + 83], 0
10428
-    vinserti128       m2, m11, xm11, 1
10429
-    pshufb            m2, m1
10430
-    vbroadcasti128    m3, [r2 + 5]
10431
-    pshufb            m3, m1
10432
-    vbroadcasti128    m4, [r2 + 13]
10433
-    pshufb            m4, m1
10434
-    vbroadcasti128    m5, [r2 + 21]
10435
-    pshufb            m5, m1
10436
-
10437
-    mova              m10, [r4 + 2 * mmsize]
10438
-
10439
-    INTRA_PRED_ANG32_CAL_ROW
10440
-    movu              [r0 + r3], m7
10441
-    lea               r0, [r0 + 4 * r1]
10442
-    movu              [r0], m6
10443
-
10444
-    ;row[21, 22]
10445
-    mova              m10, [r4 + 3 * mmsize]
10446
-
10447
-    INTRA_PRED_ANG32_CAL_ROW
10448
-    movu              [r0 + r1], m7
10449
-    movu              [r0 + 2 * r1], m6
10450
-
10451
-    ;row[23, 24]
10452
-    add               r4, 4 * mmsize
10453
-    mova              m10, [r4 + 0 * mmsize]
10454
-
10455
-    INTRA_PRED_ANG32_CAL_ROW
10456
-    movu              [r0 + r3], m7
10457
-    lea               r0, [r0 + 4 * r1]
10458
-    movu              [r0], m6
10459
-
10460
-    ;row[25, 26]
10461
-    pslldq            xm11, 1
10462
-    pinsrb            xm11, [r2 + 90], 0
10463
-    vinserti128       m2, m11, xm11, 1
10464
-    pshufb            m2, m1
10465
-    vbroadcasti128    m3, [r2 + 4]
10466
-    pshufb            m3, m1
10467
-    vbroadcasti128    m4, [r2 + 12]
10468
-    pshufb            m4, m1
10469
-    vbroadcasti128    m5, [r2 + 20]
10470
-    pshufb            m5, m1
10471
-
10472
-    mova              m10, [r4 + 1 * mmsize]
10473
-
10474
-    INTRA_PRED_ANG32_CAL_ROW
10475
-    movu              [r0 + r1], m7
10476
-    movu              [r0 + 2 * r1], m6
10477
-
10478
-    ;row[27, 28]
10479
-    mova              m10, [r4 + 2 * mmsize]
10480
-
10481
-    INTRA_PRED_ANG32_CAL_ROW
10482
-    movu              [r0 + r3], m7
10483
-    lea               r0, [r0 + 4 * r1]
10484
-    movu              [r0], m6
10485
-
10486
-    ;row[29, 30]
10487
-    mova              m10, [r4 + 3 * mmsize]
10488
-
10489
-    INTRA_PRED_ANG32_CAL_ROW
10490
-    movu              [r0 + r1], m7
10491
-    movu              [r0 + 2 * r1], m6
10492
-
10493
-    ;row[31]
10494
-    mova              m10, [r4 + 4 * mmsize]
10495
-    vperm2i128        m6, m2, m3, 00100000b
10496
-    pmaddubsw         m6, m10
10497
-    pmulhrsw          m6, m0
10498
-    vperm2i128        m7, m4, m5, 00100000b
10499
-    pmaddubsw         m7, m10
10500
-    pmulhrsw          m7, m0
10501
-    packuswb          m6, m7
10502
-    vpermq            m6, m6, 11011000b
10503
-    movu              [r0 + r3], m6
10504
-    RET
10505
-
10506
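Every one of these deleted ang32 kernels funnels through the INTRA_PRED_ANG32_CAL_ROW macro: pmaddubsw blends pairs of adjacent reference bytes with the per-row weights held in m10, pmulhrsw against pw_1024 performs the rounding shift, and packuswb/vpermq reassemble the 32-pixel row. A scalar sketch of the per-row filter being vectorized (predAngularRow is an illustrative name, not an x265 symbol; the formula is the standard HEVC angular interpolation):

    #include <cstdint>

    // Scalar model of one INTRA_PRED_ANG32_CAL_ROW output row: each pixel
    // blends two adjacent reference samples with weights (32 - fract) and
    // fract, then rounds and shifts right by 5.
    static void predAngularRow(uint8_t* dst, const uint8_t* ref, int fract, int width)
    {
        for (int x = 0; x < width; x++)
            dst[x] = (uint8_t)(((32 - fract) * ref[x] + fract * ref[x + 1] + 16) >> 5);
    }

The c_ang32_mode_NN tables hold these (32 - fract, fract) pairs interleaved per byte, so a single pmaddubsw computes both products and their sum.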
-INIT_YMM avx2
10507
-cglobal intra_pred_ang32_23, 3, 5, 12
10508
-    mova              m0, [pw_1024]
10509
-    mova              m1, [intra_pred_shuff_0_8]
10510
-    lea               r3, [3 * r1]
10511
-    lea               r4, [c_ang32_mode_23]
10512
-
10513
-    ;row[0, 1]
10514
-    vbroadcasti128    m11, [r2 + 0]
10515
-    pshufb            m2, m11, m1
10516
-    vbroadcasti128    m3, [r2 + 8]
10517
-    pshufb            m3, m1
10518
-    vbroadcasti128    m4, [r2 + 16]
10519
-    pshufb            m4, m1
10520
-    vbroadcasti128    m5, [r2 + 24]
10521
-    pshufb            m5, m1
10522
-
10523
-    mova              m10, [r4 + 0 * mmsize]
10524
-
10525
-    INTRA_PRED_ANG32_CAL_ROW
10526
-    movu              [r0], m7
10527
-    movu              [r0 + r1], m6
10528
-
10529
-    ;row[2]
10530
-    vperm2i128        m6, m2, m3, 00100000b
10531
-    pmaddubsw         m6, [r4 + 1 * mmsize]
10532
-    pmulhrsw          m6, m0
10533
-    vperm2i128        m7, m4, m5, 00100000b
10534
-    pmaddubsw         m7, [r4 + 1 * mmsize]
10535
-    pmulhrsw          m7, m0
10536
-    packuswb          m6, m7
10537
-    vpermq            m6, m6, 11011000b
10538
-    movu              [r0 + 2 * r1], m6
10539
-
10540
-    ;row[3, 4]
10541
-    pslldq            xm11, 1
10542
-    pinsrb            xm11, [r2 + 68], 0
10543
-    vinserti128       m2, m11, xm11, 1
10544
-    pshufb            m2, m1
10545
-    vbroadcasti128    m3, [r2 + 7]
10546
-    pshufb            m3, m1
10547
-    vbroadcasti128    m4, [r2 + 15]
10548
-    pshufb            m4, m1
10549
-    vbroadcasti128    m5, [r2 + 23]
10550
-    pshufb            m5, m1
10551
-
10552
-    mova              m10, [r4 + 2 * mmsize]
10553
-
10554
-    INTRA_PRED_ANG32_CAL_ROW
10555
-    movu              [r0 + r3], m7
10556
-    lea               r0, [r0 + 4 * r1]
10557
-    movu              [r0], m6
10558
-
10559
-    ;row[5, 6]
10560
-    mova              m10, [r4 + 3 * mmsize]
10561
-
10562
-    INTRA_PRED_ANG32_CAL_ROW
10563
-    movu              [r0 + r1], m7
10564
-    movu              [r0 + 2 * r1], m6
10565
-
10566
-    ;row[7, 8]
10567
-    pslldq            xm11, 1
10568
-    pinsrb            xm11, [r2 + 71], 0
10569
-    vinserti128       m2, m11, xm11, 1
10570
-    pshufb            m2, m1
10571
-    vbroadcasti128    m3, [r2 + 6]
10572
-    pshufb            m3, m1
10573
-    vbroadcasti128    m4, [r2 + 14]
10574
-    pshufb            m4, m1
10575
-    vbroadcasti128    m5, [r2 + 22]
10576
-    pshufb            m5, m1
10577
-
10578
-    add               r4, 4 * mmsize
10579
-    mova              m10, [r4 + 0 * mmsize]
10580
-
10581
-    INTRA_PRED_ANG32_CAL_ROW
10582
-    movu              [r0 + r3], m7
10583
-    lea               r0, [r0 + 4 * r1]
10584
-    movu              [r0], m6
10585
-
10586
-    ;row[9]
10587
-    vperm2i128        m6, m2, m3, 00100000b
10588
-    pmaddubsw         m6, [r4 + 1 * mmsize]
10589
-    pmulhrsw          m6, m0
10590
-    vperm2i128        m7, m4, m5, 00100000b
10591
-    pmaddubsw         m7, [r4 + 1 * mmsize]
10592
-    pmulhrsw          m7, m0
10593
-    packuswb          m6, m7
10594
-    vpermq            m6, m6, 11011000b
10595
-    movu              [r0 + r1], m6
10596
-
10597
-    ;row[10, 11]
10598
-    pslldq            xm11, 1
10599
-    pinsrb            xm11, [r2 + 75], 0
10600
-    vinserti128       m2, m11, xm11, 1
10601
-    pshufb            m2, m1
10602
-    vbroadcasti128    m3, [r2 + 5]
10603
-    pshufb            m3, m1
10604
-    vbroadcasti128    m4, [r2 + 13]
10605
-    pshufb            m4, m1
10606
-    vbroadcasti128    m5, [r2 + 21]
10607
-    pshufb            m5, m1
10608
-
10609
-    mova              m10, [r4 + 2 * mmsize]
10610
-
10611
-    INTRA_PRED_ANG32_CAL_ROW
10612
-    movu              [r0 + 2 * r1], m7
10613
-    movu              [r0 + r3], m6
10614
-
10615
-    ;row[12, 13]
10616
-    lea               r0, [r0 + 4 * r1]
10617
-    mova              m10, [r4 + 3 * mmsize]
10618
-
10619
-    INTRA_PRED_ANG32_CAL_ROW
10620
-    movu              [r0], m7
10621
-    movu              [r0 + r1], m6
10622
-
10623
-    ;row[14, 15]
10624
-    pslldq            xm11, 1
10625
-    pinsrb            xm11, [r2 + 78], 0
10626
-    vinserti128       m2, m11, xm11, 1
10627
-    pshufb            m2, m1
10628
-    vbroadcasti128    m3, [r2 + 4]
10629
-    pshufb            m3, m1
10630
-    vbroadcasti128    m4, [r2 + 12]
10631
-    pshufb            m4, m1
10632
-    vbroadcasti128    m5, [r2 + 20]
10633
-    pshufb            m5, m1
10634
-
10635
-    add               r4, 4 * mmsize
10636
-    mova              m10, [r4 + 0 * mmsize]
10637
-
10638
-    INTRA_PRED_ANG32_CAL_ROW
10639
-    movu              [r0 + 2 * r1], m7
10640
-    movu              [r0 + r3], m6
10641
-
10642
-    ;row[16]
10643
-    lea               r0, [r0 + 4 * r1]
10644
-    vperm2i128        m6, m2, m3, 00100000b
10645
-    pmaddubsw         m6, [r4 + 1 * mmsize]
10646
-    pmulhrsw          m6, m0
10647
-    vperm2i128        m7, m4, m5, 00100000b
10648
-    pmaddubsw         m7, [r4 + 1 * mmsize]
10649
-    pmulhrsw          m7, m0
10650
-    packuswb          m6, m7
10651
-    vpermq            m6, m6, 11011000b
10652
-    movu              [r0], m6
10653
-
10654
-    ;row[17, 18]
10655
-    pslldq            xm11, 1
10656
-    pinsrb            xm11, [r2 + 82], 0
10657
-    vinserti128       m2, m11, xm11, 1
10658
-    pshufb            m2, m1
10659
-    vbroadcasti128    m3, [r2 + 3]
10660
-    pshufb            m3, m1
10661
-    vbroadcasti128    m4, [r2 + 11]
10662
-    pshufb            m4, m1
10663
-    vbroadcasti128    m5, [r2 + 19]
10664
-    pshufb            m5, m1
10665
-
10666
-    mova              m10, [r4 + 2 * mmsize]
10667
-
10668
-    INTRA_PRED_ANG32_CAL_ROW
10669
-    movu              [r0 + r1], m7
10670
-    movu              [r0 + 2 * r1], m6
10671
-
10672
-    ;row[19, 20]
10673
-    mova              m10, [r4 + 3 * mmsize]
10674
-
10675
-    INTRA_PRED_ANG32_CAL_ROW
10676
-    movu              [r0 + r3], m7
10677
-    lea               r0, [r0 + 4 * r1]
10678
-    movu              [r0], m6
10679
-
10680
-    ;row[21, 22]
10681
-    pslldq            xm11, 1
10682
-    pinsrb            xm11, [r2 + 85], 0
10683
-    vinserti128       m2, m11, xm11, 1
10684
-    pshufb            m2, m1
10685
-    vbroadcasti128    m3, [r2 + 2]
10686
-    pshufb            m3, m1
10687
-    vbroadcasti128    m4, [r2 + 10]
10688
-    pshufb            m4, m1
10689
-    vbroadcasti128    m5, [r2 + 18]
10690
-    pshufb            m5, m1
10691
-
10692
-    add               r4, 4 * mmsize
10693
-    mova              m10, [r4 + 0 * mmsize]
10694
-
10695
-    INTRA_PRED_ANG32_CAL_ROW
10696
-    movu              [r0 + r1], m7
10697
-    movu              [r0 + 2 * r1], m6
10698
-
10699
-    ;row[23]
10700
-    vperm2i128        m6, m2, m3, 00100000b
10701
-    pmaddubsw         m6, [r4 + 1 * mmsize]
10702
-    pmulhrsw          m6, m0
10703
-    vperm2i128        m7, m4, m5, 00100000b
10704
-    pmaddubsw         m7, [r4 + 1 * mmsize]
10705
-    pmulhrsw          m7, m0
10706
-    packuswb          m6, m7
10707
-    vpermq            m6, m6, 11011000b
10708
-    movu              [r0 + r3], m6
10709
-
10710
-    ;row[24, 25]
10711
-    pslldq            xm11, 1
10712
-    pinsrb            xm11, [r2 + 89], 0
10713
-    vinserti128       m2, m11, xm11, 1
10714
-    pshufb            m2, m1
10715
-    vbroadcasti128    m3, [r2 + 1]
10716
-    pshufb            m3, m1
10717
-    vbroadcasti128    m4, [r2 + 9]
10718
-    pshufb            m4, m1
10719
-    vbroadcasti128    m5, [r2 + 17]
10720
-    pshufb            m5, m1
10721
-
10722
-    mova              m10, [r4 + 2 * mmsize]
10723
-    lea               r0, [r0 + 4 * r1]
10724
-
10725
-    INTRA_PRED_ANG32_CAL_ROW
10726
-    movu              [r0], m7
10727
-    movu              [r0 + r1], m6
10728
-
10729
-    ;row[26, 27]
10730
-    mova              m10, [r4 + 3 * mmsize]
10731
-
10732
-    INTRA_PRED_ANG32_CAL_ROW
10733
-    movu              [r0 + 2 * r1], m7
10734
-    movu              [r0 + r3], m6
10735
-
10736
-    ;row[28, 29]
10737
-    pslldq            xm11, 1
10738
-    pinsrb            xm11, [r2 + 92], 0
10739
-    vinserti128       m2, m11, xm11, 1
10740
-    pshufb            m2, m1
10741
-    vbroadcasti128    m3, [r2 + 0]
10742
-    pshufb            m3, m1
10743
-    vbroadcasti128    m4, [r2 + 8]
10744
-    pshufb            m4, m1
10745
-    vbroadcasti128    m5, [r2 + 16]
10746
-    pshufb            m5, m1
10747
-
10748
-    add               r4, 4 * mmsize
10749
-    mova              m10, [r4 + 0 * mmsize]
10750
-    lea               r0, [r0 + 4 * r1]
10751
-
10752
-    INTRA_PRED_ANG32_CAL_ROW
10753
-    movu              [r0], m7
10754
-    movu              [r0 + r1], m6
10755
-
10756
-    ;row[30, 31]
10757
-    mova              m10, [r4 + 1 * mmsize]
10758
-
10759
-    INTRA_PRED_ANG32_CAL_ROW
10760
-    movu              [r0 + 2 * r1], m7
10761
-    movu              [r0 + r3], m6
10762
-    RET
10763
-
10764
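The pw_1024 constant kept in m0 is what turns pmulhrsw into the rounding shift above: pmulhrsw computes (a * b + 0x4000) >> 15 per lane, so with b = 1024 this collapses to (a + 16) >> 5, and the pmaddubsw sums (at most 32 * 255 = 8160) stay well inside int16_t. A small self-check of that identity (mulhrs is an illustrative model of the instruction, not an intrinsic):

    #include <cassert>
    #include <cstdint>

    // Per-lane model of pmulhrsw.
    static int16_t mulhrs(int16_t a, int16_t b)
    {
        return (int16_t)(((int32_t)a * b + 0x4000) >> 15);
    }

    int main()
    {
        for (int a = 0; a <= 32 * 255; a++)   // full range of the pmaddubsw sums
            assert(mulhrs((int16_t)a, 1024) == (a + 16) >> 5);
        return 0;
    }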
-INIT_YMM avx2
10765
-cglobal intra_pred_ang32_22, 3, 5, 13
10766
-    mova              m0, [pw_1024]
10767
-    mova              m1, [intra_pred_shuff_0_8]
10768
-    lea               r3, [3 * r1]
10769
-    lea               r4, [c_ang32_mode_22]
10770
-
10771
-    ;row[0, 1]
10772
-    vbroadcasti128    m11, [r2 + 0]
10773
-    pshufb            m2, m11, m1
10774
-    vbroadcasti128    m3, [r2 + 8]
10775
-    pshufb            m3, m1
10776
-    vbroadcasti128    m4, [r2 + 16]
10777
-    pshufb            m4, m1
10778
-    vbroadcasti128    m5, [r2 + 24]
10779
-    pshufb            m5, m1
10780
-
10781
-    mova              m10, [r4 + 0 * mmsize]
10782
-
10783
-    INTRA_PRED_ANG32_CAL_ROW
10784
-    movu              [r0], m7
10785
-    movu              [r0 + r1], m6
10786
-
10787
-    ;row[2, 3]
10788
-    pslldq            xm11, 1
10789
-    pinsrb            xm11, [r2 + 66], 0
10790
-    vinserti128       m2, m11, xm11, 1
10791
-    pshufb            m2, m1
10792
-    vbroadcasti128    m3, [r2 + 7]
10793
-    pshufb            m3, m1
10794
-    vbroadcasti128    m4, [r2 + 15]
10795
-    pshufb            m4, m1
10796
-    vbroadcasti128    m5, [r2 + 23]
10797
-    pshufb            m5, m1
10798
-
10799
-    mova              m10, [r4 + 1 * mmsize]
10800
-
10801
-    INTRA_PRED_ANG32_CAL_ROW
10802
-    movu              [r0 + 2 * r1], m7
10803
-    movu              [r0 + r3], m6
10804
-
10805
-    ;row[4, 5]
10806
-    pslldq            xm11, 1
10807
-    pinsrb            xm11, [r2 + 69], 0
10808
-    vinserti128       m2, m11, xm11, 1
10809
-    pshufb            m2, m1
10810
-    vbroadcasti128    m3, [r2 + 6]
10811
-    pshufb            m3, m1
10812
-    vbroadcasti128    m4, [r2 + 14]
10813
-    pshufb            m4, m1
10814
-    vbroadcasti128    m5, [r2 + 22]
10815
-    pshufb            m5, m1
10816
-
10817
-    lea               r0, [r0 + 4 * r1]
10818
-    mova              m10, [r4 + 2 * mmsize]
10819
-
10820
-    INTRA_PRED_ANG32_CAL_ROW
10821
-    movu              [r0], m7
10822
-    movu              [r0 + r1], m6
10823
-
10824
-    ;row[6]
10825
-    vperm2i128        m6, m2, m3, 00100000b
10826
-    pmaddubsw         m6, [r4 + 3 * mmsize]
10827
-    pmulhrsw          m6, m0
10828
-    vperm2i128        m7, m4, m5, 00100000b
10829
-    pmaddubsw         m7, [r4 + 3 * mmsize]
10830
-    pmulhrsw          m7, m0
10831
-    packuswb          m6, m7
10832
-    vpermq            m6, m6, 11011000b
10833
-    movu              [r0 + 2 * r1], m6
10834
-
10835
-    ;row[7, 8]
10836
-    pslldq            xm11, 1
10837
-    pinsrb            xm11, [r2 + 71], 0
10838
-    vinserti128       m2, m11, xm11, 1
10839
-    pshufb            m2, m1
10840
-    vbroadcasti128    m3, [r2 + 5]
10841
-    pshufb            m3, m1
10842
-    vbroadcasti128    m4, [r2 + 13]
10843
-    pshufb            m4, m1
10844
-    vbroadcasti128    m5, [r2 + 21]
10845
-    pshufb            m5, m1
10846
-
10847
-    add               r4, 4 * mmsize
10848
-    mova              m10, [r4 + 0 * mmsize]
10849
-
10850
-    INTRA_PRED_ANG32_CAL_ROW
10851
-    movu              [r0 + r3], m7
10852
-    lea               r0, [r0 + 4 * r1]
10853
-    movu              [r0], m6
10854
-
10855
-    ;row[9, 10]
10856
-    pslldq            xm11, 1
10857
-    pinsrb            xm11, [r2 + 74], 0
10858
-    vinserti128       m2, m11, xm11, 1
10859
-    vinserti128       m2, m2, xm2, 1
10860
-    pshufb            m2, m1
10861
-    vbroadcasti128    m3, [r2 + 4]
10862
-    pshufb            m3, m1
10863
-    vbroadcasti128    m4, [r2 + 12]
10864
-    pshufb            m4, m1
10865
-    vbroadcasti128    m5, [r2 + 20]
10866
-    pshufb            m5, m1
10867
-
10868
-    mova              m10, [r4 + 1 * mmsize]
10869
-
10870
-    INTRA_PRED_ANG32_CAL_ROW
10871
-    movu              [r0 + r1], m7
10872
-    movu              [r0 + 2 * r1], m6
10873
-
10874
-    ;row[11]
10875
-    vperm2i128        m6, m2, m3, 00100000b
10876
-    pmaddubsw         m6, [r4 + 2 * mmsize]
10877
-    pmulhrsw          m6, m0
10878
-    vperm2i128        m7, m4, m5, 00100000b
10879
-    pmaddubsw         m7, [r4 + 2 * mmsize]
10880
-    pmulhrsw          m7, m0
10881
-    packuswb          m6, m7
10882
-    vpermq            m6, m6, 11011000b
10883
-    movu              [r0 + r3], m6
10884
-
10885
-    ;row[12, 13]
10886
-    pslldq            xm11, 1
10887
-    pinsrb            xm11, [r2 + 76], 0
10888
-    vinserti128       m2, m11, xm11, 1
10889
-    pshufb            m2, m1
10890
-    vbroadcasti128    m3, [r2 + 3]
10891
-    pshufb            m3, m1
10892
-    vbroadcasti128    m4, [r2 + 11]
10893
-    pshufb            m4, m1
10894
-    vbroadcasti128    m5, [r2 + 19]
10895
-    pshufb            m5, m1
10896
-
10897
-    mova              m10, [r4 + 3 * mmsize]
10898
-    lea               r0, [r0 + 4 * r1]
10899
-
10900
-    INTRA_PRED_ANG32_CAL_ROW
10901
-    movu              [r0], m7
10902
-    movu              [r0 + r1], m6
10903
-
10904
-    ;row[14, 15]
10905
-    pslldq            xm11, 1
10906
-    pinsrb            xm11, [r2 + 79], 0
10907
-    vinserti128       m2, m11, xm11, 1
10908
-    pshufb            m2, m1
10909
-    vbroadcasti128    m3, [r2 + 2]
10910
-    pshufb            m3, m1
10911
-    vbroadcasti128    m4, [r2 + 10]
10912
-    pshufb            m4, m1
10913
-    vbroadcasti128    m5, [r2 + 18]
10914
-    pshufb            m5, m1
10915
-
10916
-    add               r4, 4 * mmsize
10917
-    mova              m10, [r4 + 0 * mmsize]
10918
-
10919
-    INTRA_PRED_ANG32_CAL_ROW
10920
-    movu              [r0 + 2 * r1], m7
10921
-    movu              [r0 + r3], m6
10922
-
10923
-    ;row[16]
10924
-    lea               r0, [r0 + 4 * r1]
10925
-    vperm2i128        m6, m2, m3, 00100000b
10926
-    pmaddubsw         m6, [r4 + 1 * mmsize]
10927
-    pmulhrsw          m6, m0
10928
-    vperm2i128        m7, m4, m5, 00100000b
10929
-    pmaddubsw         m7, [r4 + 1 * mmsize]
10930
-    pmulhrsw          m7, m0
10931
-    packuswb          m6, m7
10932
-    vpermq            m6, m6, 11011000b
10933
-    movu              [r0], m6
10934
-
10935
-    ;row[17, 18]
10936
-    pslldq            xm11, 1
10937
-    pinsrb            xm11, [r2 + 81], 0
10938
-    vinserti128       m2, m11, xm11, 1
10939
-    pshufb            m2, m1
10940
-    vbroadcasti128    m3, [r2 + 1]
10941
-    pshufb            m3, m1
10942
-    vbroadcasti128    m4, [r2 + 9]
10943
-    pshufb            m4, m1
10944
-    vbroadcasti128    m5, [r2 + 17]
10945
-    pshufb            m5, m1
10946
-
10947
-    mova              m10, [r4 + 2 * mmsize]
10948
-
10949
-    INTRA_PRED_ANG32_CAL_ROW
10950
-    movu              [r0 + r1], m7
10951
-    movu              [r0 + 2 * r1], m6
10952
-
10953
-    ;row[19, 20]
10954
-    pslldq            xm11, 1
10955
-    pinsrb            xm11, [r2 + 84], 0
10956
-    vinserti128       m2, m11, xm11, 1
10957
-    pshufb            m2, m1
10958
-    vbroadcasti128    m12, [r2 + 0]
10959
-    pshufb            m3, m12, m1
10960
-    vbroadcasti128    m4, [r2 + 8]
10961
-    pshufb            m4, m1
10962
-    vbroadcasti128    m5, [r2 + 16]
10963
-    pshufb            m5, m1
10964
-
10965
-    mova              m10, [r4 + 3 * mmsize]
10966
-
10967
-    INTRA_PRED_ANG32_CAL_ROW
10968
-    movu              [r0 + r3], m7
10969
-    lea               r0, [r0 + 4 * r1]
10970
-    movu              [r0], m6
10971
-
10972
-    ;row[21]
10973
-    add               r4, 4 * mmsize
10974
-    vperm2i128        m6, m2, m3, 00100000b
10975
-    pmaddubsw         m6, [r4 + 0 * mmsize]
10976
-    pmulhrsw          m6, m0
10977
-    vperm2i128        m7, m4, m5, 00100000b
10978
-    pmaddubsw         m7, [r4 + 0 * mmsize]
10979
-    pmulhrsw          m7, m0
10980
-    packuswb          m6, m7
10981
-    vpermq            m6, m6, 11011000b
10982
-    movu              [r0 + r1], m6
10983
-
10984
-    ;row[22, 23]
10985
-    pslldq            xm11, 1
10986
-    pinsrb            xm11, [r2 + 86], 0
10987
-    vinserti128       m2, m11, xm11, 1
10988
-    pshufb            m2, m1
10989
-    pslldq            xm12, 1
10990
-    pinsrb            xm12, [r2 + 66], 0
10991
-    vinserti128       m3, m12, xm12, 1
10992
-    pshufb            m3, m1
10993
-    vbroadcasti128    m4, [r2 + 7]
10994
-    pshufb            m4, m1
10995
-    vbroadcasti128    m5, [r2 + 15]
10996
-    pshufb            m5, m1
10997
-
10998
-    mova              m10, [r4 + 1 * mmsize]
10999
-
11000
-    INTRA_PRED_ANG32_CAL_ROW
11001
-    movu              [r0 + 2 * r1], m7
11002
-    movu              [r0 + r3], m6
11003
-
11004
-    ;row[24, 25]
11005
-    pslldq            xm11, 1
11006
-    pinsrb            xm11, [r2 + 89], 0
11007
-    vinserti128       m2, m11, xm11, 1
11008
-    pshufb            m2, m1
11009
-    pslldq            xm12, 1
11010
-    pinsrb            xm12, [r2 + 69], 0
11011
-    vinserti128       m3, m12, xm12, 1
11012
-    pshufb            m3, m1
11013
-    vbroadcasti128    m4, [r2 + 6]
11014
-    pshufb            m4, m1
11015
-    vbroadcasti128    m5, [r2 + 14]
11016
-    pshufb            m5, m1
11017
-
11018
-    mova              m10, [r4 + 2 * mmsize]
11019
-    lea               r0, [r0 + 4 * r1]
11020
-
11021
-    INTRA_PRED_ANG32_CAL_ROW
11022
-    movu              [r0], m7
11023
-    movu              [r0 + r1], m6
11024
-
11025
-    ;row[26]
11026
-    vperm2i128        m6, m2, m3, 00100000b
11027
-    pmaddubsw         m6, [r4 + 3 * mmsize]
11028
-    pmulhrsw          m6, m0
11029
-    vperm2i128        m7, m4, m5, 00100000b
11030
-    pmaddubsw         m7, [r4 + 3 * mmsize]
11031
-    pmulhrsw          m7, m0
11032
-    packuswb          m6, m7
11033
-    vpermq            m6, m6, 11011000b
11034
-    movu              [r0 + 2 * r1], m6
11035
-
11036
-    ;row[27, 28]
11037
-    pslldq            xm11, 1
11038
-    pinsrb            xm11, [r2 + 91], 0
11039
-    vinserti128       m2, m11, xm11, 1
11040
-    pshufb            m2, m1
11041
-    pslldq            xm12, 1
11042
-    pinsrb            xm12, [r2 + 71], 0
11043
-    vinserti128       m3, m12, xm12, 1
11044
-    pshufb            m3, m1
11045
-    vbroadcasti128    m4, [r2 + 5]
11046
-    pshufb            m4, m1
11047
-    vbroadcasti128    m5, [r2 + 13]
11048
-    pshufb            m5, m1
11049
-
11050
-    add               r4, 4 * mmsize
11051
-    mova              m10, [r4 + 0 * mmsize]
11052
-
11053
-    INTRA_PRED_ANG32_CAL_ROW
11054
-    movu              [r0 + r3], m7
11055
-    lea               r0, [r0 + 4 * r1]
11056
-    movu              [r0], m6
11057
-
11058
-    ;row[29, 30]
11059
-    pslldq            xm11, 1
11060
-    pinsrb            xm11, [r2 + 94], 0
11061
-    vinserti128       m2, m11, xm11, 1
11062
-    pshufb            m2, m1
11063
-    pslldq            xm12, 1
11064
-    pinsrb            xm12, [r2 + 74], 0
11065
-    vinserti128       m3, m12, xm12, 1
11066
-    pshufb            m3, m1
11067
-    vbroadcasti128    m4, [r2 + 4]
11068
-    pshufb            m4, m1
11069
-    vbroadcasti128    m5, [r2 + 12]
11070
-    pshufb            m5, m1
11071
-
11072
-    mova              m10, [r4 + 1 * mmsize]
11073
-
11074
-    INTRA_PRED_ANG32_CAL_ROW
11075
-    movu              [r0 + r1], m7
11076
-    movu              [r0 + 2 * r1], m6
11077
-
11078
-    ;row[31]
11079
-    vperm2i128        m6, m2, m3, 00100000b
11080
-    pmaddubsw         m6, [r4 + 2 * mmsize]
11081
-    pmulhrsw          m6, m0
11082
-    vperm2i128        m7, m4, m5, 00100000b
11083
-    pmaddubsw         m7, [r4 + 2 * mmsize]
11084
-    pmulhrsw          m7, m0
11085
-    packuswb          m6, m7
11086
-    vpermq            m6, m6, 11011000b
11087
-    movu              [r0 + r3], m6
11088
-    RET
11089
-
11090
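The recurring pslldq xm11, 1 / pinsrb xm11, [r2 + NN], 0 pairs slide the reference window back one sample while pre-pending a left-column sample (the offsets in the 60s to 90s index the left-neighbour half of the buffer). For these negative-angle modes HEVC builds that prefix by projecting left samples through the inverse angle; a sketch modeled on the reference-encoder behaviour (names and buffer layout are illustrative assumptions, not taken from this diff):

    #include <cstdint>

    // refMain[0] holds the top-left corner and refMain[1..2*size] the above
    // row; refSide[1..2*size] holds the left column. Negative indices of
    // refMain receive projected left samples, which is what the pinsrb
    // instructions feed in front of each row's reference window.
    static void extendMainRef(uint8_t* refMain, const uint8_t* refSide,
                              int size, int intraPredAngle, int invAngle)
    {
        int invAngleSum = 128;                       // rounding offset
        for (int k = -1; k > (size * intraPredAngle) >> 5; k--)
        {
            invAngleSum += invAngle;
            refMain[k] = refSide[invAngleSum >> 8];  // project one left sample
        }
    }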
-INIT_YMM avx2
11091
-cglobal intra_pred_ang32_21, 3, 5, 13
11092
-    mova              m0, [pw_1024]
11093
-    mova              m1, [intra_pred_shuff_0_8]
11094
-    lea               r3, [3 * r1]
11095
-    lea               r4, [c_ang32_mode_21]
11096
-
11097
-    ;row[0]
11098
-    vbroadcasti128    m11, [r2 + 0]
11099
-    pshufb            m2, m11, m1
11100
-    vbroadcasti128    m3, [r2 + 8]
11101
-    pshufb            m3, m1
11102
-    vbroadcasti128    m4, [r2 + 16]
11103
-    pshufb            m4, m1
11104
-    vbroadcasti128    m5, [r2 + 24]
11105
-    pshufb            m5, m1
11106
-
11107
-    vperm2i128        m6, m2, m3, 00100000b
11108
-    pmaddubsw         m6, [r4 + 0 * mmsize]
11109
-    pmulhrsw          m6, m0
11110
-    vperm2i128        m7, m4, m5, 00100000b
11111
-    pmaddubsw         m7, [r4 + 0 * mmsize]
11112
-    pmulhrsw          m7, m0
11113
-    packuswb          m6, m7
11114
-    vpermq            m6, m6, 11011000b
11115
-    movu              [r0], m6
11116
-
11117
-    ;row[1, 2]
11118
-    pslldq            xm11, 1
11119
-    pinsrb            xm11, [r2 + 66], 0
11120
-    vinserti128       m2, m11, xm11, 1
11121
-    pshufb            m2, m1
11122
-    vbroadcasti128    m3, [r2 + 7]
11123
-    pshufb            m3, m1
11124
-    vbroadcasti128    m4, [r2 + 15]
11125
-    pshufb            m4, m1
11126
-    vbroadcasti128    m5, [r2 + 23]
11127
-    pshufb            m5, m1
11128
-
11129
-    mova              m10, [r4 + 1 * mmsize]
11130
-
11131
-    INTRA_PRED_ANG32_CAL_ROW
11132
-    movu              [r0 + r1], m7
11133
-    movu              [r0 + 2 * r1], m6
11134
-
11135
-    ;row[3, 4]
11136
-    pslldq            xm11, 1
11137
-    pinsrb            xm11, [r2 + 68], 0
11138
-    vinserti128       m2, m11, xm11, 1
11139
-    pshufb            m2, m1
11140
-    vbroadcasti128    m3, [r2 + 6]
11141
-    pshufb            m3, m1
11142
-    vbroadcasti128    m4, [r2 + 14]
11143
-    pshufb            m4, m1
11144
-    vbroadcasti128    m5, [r2 + 22]
11145
-    pshufb            m5, m1
11146
-
11147
-    mova              m10, [r4 + 2 * mmsize]
11148
-
11149
-    INTRA_PRED_ANG32_CAL_ROW
11150
-    movu              [r0 + r3], m7
11151
-    lea               r0, [r0 + 4 * r1]
11152
-    movu              [r0], m6
11153
-
11154
-    ;row[5, 6]
11155
-    pslldq            xm11, 1
11156
-    pinsrb            xm11, [r2 + 70], 0
11157
-    vinserti128       m2, m11, xm11, 1
11158
-    pshufb            m2, m1
11159
-    vbroadcasti128    m3, [r2 + 5]
11160
-    pshufb            m3, m1
11161
-    vbroadcasti128    m4, [r2 + 13]
11162
-    pshufb            m4, m1
11163
-    vbroadcasti128    m5, [r2 + 21]
11164
-    pshufb            m5, m1
11165
-
11166
-    mova              m10, [r4 + 3 * mmsize]
11167
-
11168
-    INTRA_PRED_ANG32_CAL_ROW
11169
-    movu              [r0 + r1], m7
11170
-    movu              [r0 + 2 * r1], m6
11171
-
11172
-    ;row[7, 8]
11173
-    pslldq            xm11, 1
11174
-    pinsrb            xm11, [r2 + 72], 0
11175
-    vinserti128       m2, m11, xm11, 1
11176
-    pshufb            m2, m1
11177
-    vbroadcasti128    m3, [r2 + 4]
11178
-    pshufb            m3, m1
11179
-    vbroadcasti128    m4, [r2 + 12]
11180
-    pshufb            m4, m1
11181
-    vbroadcasti128    m5, [r2 + 20]
11182
-    pshufb            m5, m1
11183
-
11184
-    add               r4, 4 * mmsize
11185
-    mova              m10, [r4 + 0 * mmsize]
11186
-
11187
-    INTRA_PRED_ANG32_CAL_ROW
11188
-    movu              [r0 + r3], m7
11189
-    lea               r0, [r0 + 4 * r1]
11190
-    movu              [r0], m6
11191
-
11192
-    ;row[9, 10]
11193
-    pslldq            xm11, 1
11194
-    pinsrb            xm11, [r2 + 73], 0
11195
-    vinserti128       m2, m11, xm11, 1
11196
-    pshufb            m2, m1
11197
-    vbroadcasti128    m3, [r2 + 3]
11198
-    pshufb            m3, m1
11199
-    vbroadcasti128    m4, [r2 + 11]
11200
-    pshufb            m4, m1
11201
-    vbroadcasti128    m5, [r2 + 19]
11202
-    pshufb            m5, m1
11203
-
11204
-    mova              m10, [r4 + 1 * mmsize]
11205
-
11206
-    INTRA_PRED_ANG32_CAL_ROW
11207
-    movu              [r0 + r1], m7
11208
-    movu              [r0 + 2 * r1], m6
11209
-
11210
-    ;row[11, 12]
11211
-    pslldq            xm11, 1
11212
-    pinsrb            xm11, [r2 + 75], 0
11213
-    vinserti128       m2, m11, xm11, 1
11214
-    pshufb            m2, m1
11215
-    vbroadcasti128    m3, [r2 + 2]
11216
-    pshufb            m3, m1
11217
-    vbroadcasti128    m4, [r2 + 10]
11218
-    pshufb            m4, m1
11219
-    vbroadcasti128    m5, [r2 + 18]
11220
-    pshufb            m5, m1
11221
-
11222
-    mova              m10, [r4 + 2 * mmsize]
11223
-
11224
-    INTRA_PRED_ANG32_CAL_ROW
11225
-    movu              [r0 + r3], m7
11226
-    lea               r0, [r0 + 4 * r1]
11227
-    movu              [r0], m6
11228
-
11229
-    ;row[13, 14]
11230
-    pslldq            xm11, 1
11231
-    pinsrb            xm11, [r2 + 77], 0
11232
-    vinserti128       m2, m11, xm11, 1
11233
-    pshufb            m2, m1
11234
-    vbroadcasti128    m3, [r2 + 1]
11235
-    pshufb            m3, m1
11236
-    vbroadcasti128    m4, [r2 + 9]
11237
-    pshufb            m4, m1
11238
-    vbroadcasti128    m5, [r2 + 17]
11239
-    pshufb            m5, m1
11240
-
11241
-    mova              m10, [r4 + 3 * mmsize]
11242
-
11243
-    INTRA_PRED_ANG32_CAL_ROW
11244
-    movu              [r0 + r1], m7
11245
-    movu              [r0 + 2 * r1], m6
11246
-
11247
-    ;row[15]
11248
-    pslldq            xm11, 1
11249
-    pinsrb            xm11, [r2 + 79], 0
11250
-    vinserti128       m2, m11, xm11, 1
11251
-    pshufb            m2, m1
11252
-    vbroadcasti128    m12, [r2 + 0]
11253
-    pshufb            m3, m12, m1
11254
-    vbroadcasti128    m4, [r2 + 8]
11255
-    pshufb            m4, m1
11256
-    vbroadcasti128    m5, [r2 + 16]
11257
-    pshufb            m5, m1
11258
-    vperm2i128        m6, m2, m3, 00100000b
11259
-    add               r4, 4 * mmsize
11260
-    pmaddubsw         m6, [r4 + 0 * mmsize]
11261
-    pmulhrsw          m6, m0
11262
-    vperm2i128        m7, m4, m5, 00100000b
11263
-    pmaddubsw         m7, [r4 + 0 * mmsize]
11264
-    pmulhrsw          m7, m0
11265
-    packuswb          m6, m7
11266
-    vpermq            m6, m6, 11011000b
11267
-    movu              [r0 + r3], m6
11268
-
11269
-    ;row[16, 17]
11270
-    pslldq            xm11, 1
11271
-    pinsrb            xm11, [r2 + 81], 0
11272
-    vinserti128       m2, m11, xm11, 1
11273
-    pshufb            m2, m1
11274
-    pslldq            xm12, 1
11275
-    pinsrb            xm12, [r2 + 66], 0
11276
-    vinserti128       m3, m12, xm12, 1
11277
-    pshufb            m3, m1
11278
-    vbroadcasti128    m4, [r2 + 7]
11279
-    pshufb            m4, m1
11280
-    vbroadcasti128    m5, [r2 + 15]
11281
-    pshufb            m5, m1
11282
-
11283
-    mova              m10, [r4 + 1 * mmsize]
11284
-
11285
-    INTRA_PRED_ANG32_CAL_ROW
11286
-    lea               r0, [r0 + 4 * r1]
11287
-    movu              [r0], m7
11288
-    movu              [r0 + r1], m6
11289
-
11290
-    ;row[18, 19]
11291
-    pslldq            xm11, 1
11292
-    pinsrb            xm11, [r2 + 83], 0
11293
-    vinserti128       m2, m11, xm11, 1
11294
-    pshufb            m2, m1
11295
-    pslldq            xm12, 1
11296
-    pinsrb            xm12, [r2 + 68], 0
11297
-    vinserti128       m3, m12, xm12, 1
11298
-    pshufb            m3, m1
11299
-    vbroadcasti128    m4, [r2 + 6]
11300
-    pshufb            m4, m1
11301
-    vbroadcasti128    m5, [r2 + 14]
11302
-    pshufb            m5, m1
11303
-
11304
-    mova              m10, [r4 + 2 * mmsize]
11305
-
11306
-    INTRA_PRED_ANG32_CAL_ROW
11307
-    movu              [r0 + 2 * r1], m7
11308
-    movu              [r0 + r3], m6
11309
-
11310
-    ;row[20, 21]
11311
-    pslldq            xm11, 1
11312
-    pinsrb            xm11, [r2 + 85], 0
11313
-    vinserti128       m2, m11, xm11, 1
11314
-    pshufb            m2, m1
11315
-    pslldq            xm12, 1
11316
-    pinsrb            xm12, [r2 + 70], 0
11317
-    vinserti128       m3, m12, xm12, 1
11318
-    pshufb            m3, m1
11319
-    vbroadcasti128    m4, [r2 + 5]
11320
-    pshufb            m4, m1
11321
-    vbroadcasti128    m5, [r2 + 13]
11322
-    pshufb            m5, m1
11323
-
11324
-    mova              m10, [r4 + 3 * mmsize]
11325
-
11326
-    INTRA_PRED_ANG32_CAL_ROW
11327
-    lea               r0, [r0 + 4 * r1]
11328
-    movu              [r0], m7
11329
-    movu              [r0 + r1], m6
11330
-
11331
-    ;row[22, 23]
11332
-    pslldq            xm11, 1
11333
-    pinsrb            xm11, [r2 + 87], 0
11334
-    vinserti128       m2, m11, xm11, 1
11335
-    pshufb            m2, m1
11336
-    pslldq            xm12, 1
11337
-    pinsrb            xm12, [r2 + 72], 0
11338
-    vinserti128       m3, m12, xm12, 1
11339
-    pshufb            m3, m1
11340
-    vbroadcasti128    m4, [r2 + 4]
11341
-    pshufb            m4, m1
11342
-    vbroadcasti128    m5, [r2 + 12]
11343
-    pshufb            m5, m1
11344
-
11345
-    add               r4, 4 * mmsize
11346
-    mova              m10, [r4 + 0 * mmsize]
11347
-
11348
-    INTRA_PRED_ANG32_CAL_ROW
11349
-    movu              [r0 + 2 * r1], m7
11350
-    movu              [r0 + r3], m6
11351
-
11352
-    ;row[24, 25]
11353
-    pslldq            xm11, 1
11354
-    pinsrb            xm11, [r2 + 88], 0
11355
-    vinserti128       m2, m11, xm11, 1
11356
-    pshufb            m2, m1
11357
-    pslldq            xm12, 1
11358
-    pinsrb            xm12, [r2 + 73], 0
11359
-    vinserti128       m3, m12, xm12, 1
11360
-    pshufb            m3, m1
11361
-    vbroadcasti128    m4, [r2 + 3]
11362
-    pshufb            m4, m1
11363
-    vbroadcasti128    m5, [r2 + 11]
11364
-    pshufb            m5, m1
11365
-
11366
-    mova              m10, [r4 + 1 * mmsize]
11367
-
11368
-    INTRA_PRED_ANG32_CAL_ROW
11369
-    lea               r0, [r0 + 4 * r1]
11370
-    movu              [r0], m7
11371
-    movu              [r0 + r1], m6
11372
-
11373
-    ;row[26, 27]
11374
-    pslldq            xm11, 1
11375
-    pinsrb            xm11, [r2 + 90], 0
11376
-    vinserti128       m2, m11, xm11, 1
11377
-    pshufb            m2, m1
11378
-    pslldq            xm12, 1
11379
-    pinsrb            xm12, [r2 + 75], 0
11380
-    vinserti128       m3, m12, xm12, 1
11381
-    pshufb            m3, m1
11382
-    vbroadcasti128    m4, [r2 + 2]
11383
-    pshufb            m4, m1
11384
-    vbroadcasti128    m5, [r2 + 10]
11385
-    pshufb            m5, m1
11386
-
11387
-    mova              m10, [r4 + 2 * mmsize]
11388
-
11389
-    INTRA_PRED_ANG32_CAL_ROW
11390
-    movu              [r0 + 2 * r1], m7
11391
-    movu              [r0 + r3], m6
11392
-
11393
-    ;row[28, 29]
11394
-    pslldq            xm11, 1
11395
-    pinsrb            xm11, [r2 + 92], 0
11396
-    vinserti128       m2, m11, xm11, 1
11397
-    pshufb            m2, m1
11398
-    pslldq            xm12, 1
11399
-    pinsrb            xm12, [r2 + 77], 0
11400
-    vinserti128       m3, m12, xm12, 1
11401
-    pshufb            m3, m1
11402
-    vbroadcasti128    m4, [r2 + 1]
11403
-    pshufb            m4, m1
11404
-    vbroadcasti128    m5, [r2 + 9]
11405
-    pshufb            m5, m1
11406
-
11407
-    mova              m10, [r4 + 3 * mmsize]
11408
-
11409
-    INTRA_PRED_ANG32_CAL_ROW
11410
-    lea               r0, [r0 + 4 * r1]
11411
-    movu              [r0], m7
11412
-    movu              [r0 + r1], m6
11413
-
11414
-    ;row[30, 31]
11415
-    pslldq            xm11, 1
11416
-    pinsrb            xm11, [r2 + 94], 0
11417
-    vinserti128       m2, m11, xm11, 1
11418
-    pshufb            m2, m1
11419
-    pslldq            xm12, 1
11420
-    pinsrb            xm12, [r2 + 79], 0
11421
-    vinserti128       m3, m12, xm12, 1
11422
-    pshufb            m3, m1
11423
-    vbroadcasti128    m4, [r2 + 0]
11424
-    pshufb            m4, m1
11425
-    vbroadcasti128    m5, [r2 + 8]
11426
-    pshufb            m5, m1
11427
-
11428
-    mova              m10, [r4 + 4 * mmsize]
11429
-
11430
-    INTRA_PRED_ANG32_CAL_ROW
11431
-    movu              [r0 + 2 * r1], m7
11432
-    movu              [r0 + r3], m6
11433
-    RET
11434
-%endif
11435
-
11436
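For contrast with the deleted kernels above, the surviving INTRA_PRED_STORE_4x4 macro below writes a 4x4 block whose four rows are packed as the four dwords of xm0. A scalar model (store4x4 is an illustrative name; the dword-per-row packing is read off the movd/pextrd sequence):

    #include <cstdint>
    #include <cstring>

    // One 4x4 block occupies 16 bytes, 4 bytes per row; each row is written
    // to the destination at the picture stride, mirroring movd/pextrd.
    static void store4x4(uint8_t* dst, intptr_t stride, const uint8_t block[16])
    {
        for (int y = 0; y < 4; y++)
            memcpy(dst + y * stride, block + y * 4, 4);
    }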
 %macro INTRA_PRED_STORE_4x4 0
11437
     movd              [r0], xm0
11438
     pextrd            [r0 + r1], xm0, 1
11439
x265_1.8.tar.gz/source/common/x86/intrapred8_allangs.asm -> x265_1.9.tar.gz/source/common/x86/intrapred8_allangs.asm Changed
473
 
1
@@ -27,62 +27,63 @@
2
 
3
 SECTION_RODATA 32
4
 
5
-all_ang4_shuff: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
6
-                db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7
7
-                db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6
8
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
9
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5
10
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4
11
-                db 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
12
-                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12
13
-                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 4, 0, 0, 9, 9, 10, 10, 11
14
-                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11
15
-                db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 4, 2, 2, 0, 0, 9, 9, 10
16
-                db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 3, 2, 2, 0, 0, 9, 9, 10
17
-                db 0, 9, 9, 10, 10, 11, 11, 12, 1, 0, 0, 9, 9, 10, 10, 11, 2, 1, 1, 0, 0, 9, 9, 10, 4, 2, 2, 1, 1, 0, 0, 9
18
-                db 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0, 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0
19
-                db 0, 1, 1, 2, 2, 3, 3, 4, 9, 0, 0, 1, 1, 2, 2, 3, 10, 9, 9, 0, 0, 1, 1, 2, 12, 10, 10, 9, 9, 0, 0, 1
20
-                db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 11, 10, 10, 0, 0, 1, 1, 2
21
-                db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 12, 10, 10, 0, 0, 1, 1, 2
22
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3
23
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 12, 0, 0, 1, 1, 2, 2, 3
24
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4
25
-                db 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4
26
-                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
27
-                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6
28
-                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6
29
-                db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7
30
-                db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7, 4, 5, 5, 6, 6, 7, 7, 8
31
-                db 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8
32
-
33
-all_ang4: db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8
34
-          db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20
35
-          db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
36
-          db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
37
-          db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
38
-          db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
39
-          db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
40
-          db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
41
-          db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
42
-          db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
43
-          db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
44
-          db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
45
-          db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
46
-          db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
47
-          db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
48
-          db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
49
-          db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
50
-          db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
51
-          db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
52
-          db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
53
-          db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
54
-          db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
55
-          db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
56
-          db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
57
-          db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
58
-          db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
59
-          db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20
60
-          db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8
61
+const allAng4_shuf_mode2,       db  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7
62
+const allAng4_shuf_mode3_4,     db  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5,  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5
63
+                                db  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7,  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6
64
+const allAng4_shuf_mode5_6,     db  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
65
+                                db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5
66
+const allAng4_shuf_mode7_8,     db  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
67
+                                db  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
68
+const allAng4_shuf_mode10,      db  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3
69
+const allAng4_shuf_mode11_12,   db  0,  9,  9, 10, 10, 11, 11, 12,  0,  9,  9, 10, 10, 11, 11, 12,  0,  9,  9, 10, 10, 11, 11, 12,  0,  9,  9, 10, 10, 11, 11, 12
70
+const allAng4_shuf_mode13_14,   db  0,  9,  9, 10, 10, 11, 11, 12,  4,  0,  0,  9,  9, 10, 10, 11,  2,  0,  0,  9,  9, 10, 10, 11,  2,  0,  0,  9,  9, 10, 10, 11
71
+const allAng4_shuf_mode15_16,   db  0,  9,  9, 10, 10, 11, 11, 12,  2,  0,  0,  9,  9, 10, 10, 11,  0,  9,  9, 10, 10, 11, 11, 12,  2,  0,  0,  9,  9, 10, 10, 11
72
+                                db  2,  0,  0,  9,  9, 10, 10, 11,  4,  2,  2,  0,  0,  9,  9, 10,  2,  0,  0,  9,  9, 10, 10, 11,  3,  2,  2,  0,  0,  9,  9, 10
73
+const allAng4_shuf_mode17,      db  0,  9,  9, 10, 10, 11, 11, 12,  1,  0,  0,  9,  9, 10, 10, 11,  2,  1,  1,  0,  0,  9,  9, 10,  4,  2,  2,  1,  1,  0,  0,  9
74
+                                db  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0,  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0
75
+const allAng4_shuf_mode18,      db  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0,  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0
76
+const allAng4_shuf_mode19_20,   db  0,  1,  1,  2,  2,  3,  3,  4,  9,  0,  0,  1,  1,  2,  2,  3,  0,  1,  1,  2,  2,  3,  3,  4, 10,  0,  0,  1,  1,  2,  2,  3
77
+                                db 10,  9,  9,  0,  0,  1,  1,  2, 12, 10, 10,  9,  9,  0,  0,  1, 10,  0,  0,  1,  1,  2,  2,  3, 11, 10, 10,  0,  0,  1,  1,  2
78
+const allAng4_shuf_mode21_22,   db  0,  1,  1,  2,  2,  3,  3,  4, 10,  0,  0,  1,  1,  2,  2,  3,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
79
+                                db 10,  0,  0,  1,  1,  2,  2,  3, 12, 10, 10,  0,  0,  1,  1,  2, 10,  0,  0,  1,  1,  2,  2,  3, 10,  0,  0,  1,  1,  2,  2,  3
80
+const allAng4_shuf_mode23_24,   db  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
81
+                                db  0,  1,  1,  2,  2,  3,  3,  4, 12,  0,  0,  1,  1,  2,  2,  3,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
82
+const allAng4_shuf_mode26,      db  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4
83
+const allAng4_shuf_mode27_28,   db  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5
84
+const allAng4_shuf_mode29_30,   db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  2,  3,  3,  4,  4,  5,  5,  6,  2,  3,  3,  4,  4,  5,  5,  6
85
+const allAng4_shuf_mode31_32,   db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6
86
+                                db  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7,  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7
87
+const allAng4_shuf_mode33,      db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7,  4,  5,  5,  6,  6,  7,  7,  8
88
+const allAng4_shuf_mode34,      db  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7,  5,  6,  7,  8,  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7,  5,  6,  7,  8
89
+
90
+const allAng4_fact_mode3_4,     db  6, 26,  6, 26,  6, 26,  6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10
91
+                                db 18, 14, 18, 14, 18, 14, 18, 14, 24,  8, 24,  8, 24,  8, 24,  8,  1, 31,  1, 31,  1, 31,  1, 31, 12, 20, 12, 20, 12, 20, 12, 20
92
+const allAng4_fact_mode5_6,     db 15, 17, 15, 17, 15, 17, 15, 17, 30,  2, 30,  2, 30,  2, 30,  2, 19, 13, 19, 13, 19, 13, 19, 13,  6, 26,  6, 26,  6, 26,  6, 26
93
+                                db 13, 19, 13, 19, 13, 19, 13, 19, 28,  4, 28,  4, 28,  4, 28,  4, 25,  7, 25,  7, 25,  7, 25,  7, 12, 20, 12, 20, 12, 20, 12, 20
94
+const allAng4_fact_mode7_8,     db 23,  9, 23,  9, 23,  9, 23,  9, 14, 18, 14, 18, 14, 18, 14, 18, 27,  5, 27,  5, 27,  5, 27,  5, 22, 10, 22, 10, 22, 10, 22, 10
95
+                                db  5, 27,  5, 27,  5, 27,  5, 27, 28,  4, 28,  4, 28,  4, 28,  4, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
96
+const allAng4_fact_mode9,       db 30,  2, 30,  2, 30,  2, 30,  2, 28,  4, 28,  4, 28,  4, 28,  4, 26,  6, 26,  6, 26,  6, 26,  6, 24,  8, 24,  8, 24,  8, 24,  8
97
+const allAng4_fact_mode11_12,   db  2, 30,  2, 30,  2, 30,  2, 30,  4, 28,  4, 28,  4, 28,  4, 28,  5, 27,  5, 27,  5, 27,  5, 27, 10, 22, 10, 22, 10, 22, 10, 22
98
+                                db  6, 26,  6, 26,  6, 26,  6, 26,  8, 24,  8, 24,  8, 24,  8, 24, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
99
+const allAng4_fact_mode13_14,   db  9, 23,  9, 23,  9, 23,  9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 13, 19, 13, 19, 13, 19, 13, 19, 26,  6, 26,  6, 26,  6, 26,  6
100
+                                db 27,  5, 27,  5, 27,  5, 27,  5,  4, 28,  4, 28,  4, 28,  4, 28,  7, 25,  7, 25,  7, 25,  7, 25, 20, 12, 20, 12, 20, 12, 20, 12
101
+const allAng4_fact_mode15_16,   db 17, 15, 17, 15, 17, 15, 17, 15,  2, 30,  2, 30,  2, 30,  2, 30, 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22
102
+                                db 19, 13, 19, 13, 19, 13, 19, 13,  4, 28,  4, 28,  4, 28,  4, 28, 31,  1, 31,  1, 31,  1, 31,  1, 20, 12, 20, 12, 20, 12, 20, 12
103
+const allAng4_fact_mode17,      db 26,  6, 26,  6, 26,  6, 26,  6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18,  8, 24,  8, 24,  8, 24,  8, 24
104
+const allAng4_fact_mode19_20,   db 26,  6, 26,  6, 26,  6, 26,  6, 20, 12, 20, 12, 20, 12, 20, 12, 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22
105
+                                db 14, 18, 14, 18, 14, 18, 14, 18,  8, 24,  8, 24,  8, 24,  8, 24, 31,  1, 31,  1, 31,  1, 31,  1, 20, 12, 20, 12, 20, 12, 20, 12
106
+const allAng4_fact_mode21_22,   db 17, 15, 17, 15, 17, 15, 17, 15,  2, 30,  2, 30,  2, 30,  2, 30, 13, 19, 13, 19, 13, 19, 13, 19, 26,  6, 26,  6, 26,  6, 26,  6
107
+                                db 19, 13, 19, 13, 19, 13, 19, 13,  4, 28,  4, 28,  4, 28,  4, 28,  7, 25,  7, 25,  7, 25,  7, 25, 20, 12, 20, 12, 20, 12, 20, 12
108
+const allAng4_fact_mode23_24,   db  9, 23,  9, 23,  9, 23,  9, 23, 18, 14, 18, 14, 18, 14, 18, 14,  5, 27,  5, 27,  5, 27,  5, 27, 10, 22, 10, 22, 10, 22, 10, 22
109
+                                db 27,  5, 27,  5, 27,  5, 27,  5,  4, 28,  4, 28,  4, 28,  4, 28, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
110
+const allAng4_fact_mode25,      db  2, 30,  2, 30,  2, 30,  2, 30,  4, 28,  4, 28,  4, 28,  4, 28,  6, 26,  6, 26,  6, 26,  6, 26,  8, 24,  8, 24,  8, 24,  8, 24
111
+const allAng4_fact_mode27_28,   db 30,  2, 30,  2, 30,  2, 30,  2, 28,  4, 28,  4, 28,  4, 28,  4, 27,  5, 27,  5, 27,  5, 27,  5, 22, 10, 22, 10, 22, 10, 22, 10
112
+                                db 26,  6, 26,  6, 26,  6, 26,  6, 24,  8, 24,  8, 24,  8, 24,  8, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
113
+const allAng4_fact_mode29_30,   db 23,  9, 23,  9, 23,  9, 23,  9, 14, 18, 14, 18, 14, 18, 14, 18, 19, 13, 19, 13, 19, 13, 19, 13,  6, 26,  6, 26,  6, 26,  6, 26
114
+                                db  5, 27,  5, 27,  5, 27,  5, 27, 28,  4, 28,  4, 28,  4, 28,  4, 25,  7, 25,  7, 25,  7, 25,  7, 12, 20, 12, 20, 12, 20, 12, 20
115
+const allAng4_fact_mode31_32,   db 15, 17, 15, 17, 15, 17, 15, 17, 30,  2, 30,  2, 30,  2, 30,  2, 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10
116
+                                db 13, 19, 13, 19, 13, 19, 13, 19, 28,  4, 28,  4, 28,  4, 28,  4,  1, 31,  1, 31,  1, 31,  1, 31, 12, 20, 12, 20, 12, 20, 12, 20
117
+const allAng4_fact_mode33,      db  6, 26,  6, 26,  6, 26,  6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24,  8, 24,  8, 24,  8, 24,  8
118
 
119
 
120
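The renamed constants now come in per-mode pairs: each allAng4_shuf_* mask gathers, for every output row, the two adjacent reference bytes each pixel blends, and the matching allAng4_fact_* table holds the interleaved (32 - fract, fract) byte weights consumed by pmaddubsw. A sketch of how one mode's factor rows can be derived (the mode-to-angle mapping, e.g. intraPredAngle 26 for mode 3, is the standard HEVC table and an assumption about how these rows were generated):

    #include <cstdio>

    // Emit the interleaved pmaddubsw weights for the four rows of one 4x4
    // angular mode.
    static void printFactRows(int angle)
    {
        for (int y = 0; y < 4; y++)
        {
            int fract = ((y + 1) * angle) & 31;  // fractional offset of row y
            for (int x = 0; x < 4; x++)          // one (32 - f, f) pair per pixel
                printf("%2d, %2d, ", 32 - fract, fract);
            printf("\n");
        }
    }

For angle 26 this yields the 6,26 / 12,20 / 18,14 / 24,8 rows visible in allAng4_fact_mode3_4 above, where they sit interleaved with the rows of mode 4.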
 SECTION .text
121
@@ -23075,80 +23076,69 @@
122
 ; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
123
 ;-----------------------------------------------------------------------------
124
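all_angs_pred_4x4 packs the 4x4 prediction of every angular mode into one contiguous buffer at dest + (mode - 2) * 16, which is why each store below uses a (N - 2) * 16 offset; and since the shuffle and factor tables are now addressed as named constants rather than through lea'd pointers, the rewrite drops the used GPR count in cglobal from four to two. A usage sketch (uint8_t matches the pixel type of 8-bit builds; the 33-mode buffer sizing is an assumption for illustration):

    #include <cstdint>

    extern "C" void all_angs_pred_4x4(uint8_t* dest, uint8_t* refPix,
                                      uint8_t* filtPix, int bLuma);

    // Modes 2..34 are stored back to back, 16 bytes (one 4x4 block) each,
    // so dest must hold at least 33 * 16 = 528 bytes.
    static const uint8_t* predForMode(const uint8_t* dest, int mode)
    {
        return dest + (mode - 2) * 16;
    }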
 INIT_YMM avx2
125
-cglobal all_angs_pred_4x4, 4, 4, 6
126
+cglobal all_angs_pred_4x4, 2, 2, 6
127
 
128
     mova           m5, [pw_1024]
129
-    lea            r2, [all_ang4]
130
-    lea            r3, [all_ang4_shuff]
131
 
132
 ; mode 2
133
 
134
     vbroadcasti128 m0, [r1 + 9]
135
-    mova           xm1, xm0
136
-    psrldq         xm1, 1
137
-    pshufb         xm1, [r3]
138
+    pshufb         m1, m0, [allAng4_shuf_mode2]
139
     movu           [r0], xm1
140
 
141
 ; mode 3
142
 
143
-    pshufb         m1, m0, [r3 + 1 * mmsize]
144
-    pmaddubsw      m1, [r2]
145
+    pshufb         m1, m0, [allAng4_shuf_mode3_4]
146
+    pmaddubsw      m1, [allAng4_fact_mode3_4]
147
     pmulhrsw       m1, m5
148
 
149
 ; mode 4
150
 
151
-    pshufb         m2, m0, [r3 + 2 * mmsize]
152
-    pmaddubsw      m2, [r2 + 1 * mmsize]
153
+    pshufb         m2, m0, [allAng4_shuf_mode3_4 + mmsize]
154
+    pmaddubsw      m2, [allAng4_fact_mode3_4 + mmsize]
155
     pmulhrsw       m2, m5
156
     packuswb       m1, m2
157
-    vpermq         m1, m1, 11011000b
158
     movu           [r0 + (3 - 2) * 16], m1
159
 
160
 ; mode 5
161
 
162
-    pshufb         m1, m0, [r3 + 2 * mmsize]
163
-    pmaddubsw      m1, [r2 + 2 * mmsize]
164
+    pshufb         m1, m0, [allAng4_shuf_mode5_6]
165
+    pmaddubsw      m1, [allAng4_fact_mode5_6]
166
     pmulhrsw       m1, m5
167
 
168
 ; mode 6
169
 
170
-    pshufb         m2, m0, [r3 + 3 * mmsize]
171
-    pmaddubsw      m2, [r2 + 3 * mmsize]
172
+    pshufb         m2, m0, [allAng4_shuf_mode5_6 + mmsize]
173
+    pmaddubsw      m2, [allAng4_fact_mode5_6 + mmsize]
174
     pmulhrsw       m2, m5
175
     packuswb       m1, m2
176
-    vpermq         m1, m1, 11011000b
177
     movu           [r0 + (5 - 2) * 16], m1
178
 
179
-    add            r3, 4 * mmsize
180
-    add            r2, 4 * mmsize
181
-
182
 ; mode 7
183
 
184
-    pshufb         m1, m0, [r3 + 0 * mmsize]
185
-    pmaddubsw      m1, [r2 + 0 * mmsize]
186
+    pshufb         m3, m0, [allAng4_shuf_mode7_8]
187
+    pmaddubsw      m1, m3, [allAng4_fact_mode7_8]
188
     pmulhrsw       m1, m5
189
 
190
 ; mode 8
191
 
192
-    pshufb         m2, m0, [r3 + 1 * mmsize]
193
-    pmaddubsw      m2, [r2 + 1 * mmsize]
194
+    pshufb         m2, m0, [allAng4_shuf_mode7_8 + mmsize]
195
+    pmaddubsw      m2, [allAng4_fact_mode7_8 + mmsize]
196
     pmulhrsw       m2, m5
197
     packuswb       m1, m2
198
-    vpermq         m1, m1, 11011000b
199
     movu           [r0 + (7 - 2) * 16], m1
200
 
201
 ; mode 9
202
 
203
-    pshufb         m1, m0, [r3 + 1 * mmsize]
204
-    pmaddubsw      m1, [r2 + 2 * mmsize]
205
-    pmulhrsw       m1, m5
206
-    packuswb       m1, m1
207
-    vpermq         m1, m1, 11011000b
208
-    movu           [r0 + (9 - 2) * 16], xm1
209
+    pmaddubsw      m3, [allAng4_fact_mode9]
210
+    pmulhrsw       m3, m5
211
+    packuswb       m3, m3
212
+    vpermq         m3, m3, 11011000b
213
+    movu           [r0 + (9 - 2) * 16], xm3
214
 
215
 ; mode 10
216
 
217
-    pshufb         xm1, xm0, [r3 + 2 * mmsize]
218
+    pshufb         xm1, xm0, [allAng4_shuf_mode10]
219
     movu           [r0 + (10 - 2) * 16], xm1
220
 
221
     pxor           xm1, xm1
222
@@ -23173,135 +23163,111 @@
223
 ; mode 11
224
 
225
     vbroadcasti128 m0, [r1]
226
-    pshufb         m1, m0, [r3 + 3 * mmsize]
227
-    pmaddubsw      m1, [r2 + 3 * mmsize]
228
+    pshufb         m3, m0, [allAng4_shuf_mode11_12]
229
+    pmaddubsw      m1, m3, [allAng4_fact_mode11_12]
230
     pmulhrsw       m1, m5
231
 
232
 ; mode 12
233
 
234
-    add            r2, 4 * mmsize
235
-
236
-    pshufb         m2, m0, [r3 + 3 * mmsize]
237
-    pmaddubsw      m2, [r2 + 0 * mmsize]
238
+    pmaddubsw      m2, m3, [allAng4_fact_mode11_12 + mmsize]
239
     pmulhrsw       m2, m5
240
     packuswb       m1, m2
241
-    vpermq         m1, m1, 11011000b
242
     movu           [r0 + (11 - 2) * 16], m1
243
 
244
 ; mode 13
245
 
246
-    add            r3, 4 * mmsize
247
-
248
-    pshufb         m1, m0, [r3 + 0 * mmsize]
249
-    pmaddubsw      m1, [r2 + 1 * mmsize]
250
-    pmulhrsw       m1, m5
251
+    pmaddubsw      m3, [allAng4_fact_mode13_14]
252
+    pmulhrsw       m3, m5
253
 
254
 ; mode 14
255
 
256
-    pshufb         m2, m0, [r3 + 1 * mmsize]
257
-    pmaddubsw      m2, [r2 + 2 * mmsize]
258
+    pshufb         m2, m0, [allAng4_shuf_mode13_14]
259
+    pmaddubsw      m2, [allAng4_fact_mode13_14 + mmsize]
260
     pmulhrsw       m2, m5
261
-    packuswb       m1, m2
262
-    vpermq         m1, m1, 11011000b
263
-    movu           [r0 + (13 - 2) * 16], m1
264
+    packuswb       m3, m2
265
+    movu           [r0 + (13 - 2) * 16], m3
266
 
267
 ; mode 15
268
 
269
-    pshufb         m1, m0, [r3 + 2 * mmsize]
270
-    pmaddubsw      m1, [r2 + 3 * mmsize]
271
+    pshufb         m1, m0, [allAng4_shuf_mode15_16]
272
+    pmaddubsw      m1, [allAng4_fact_mode15_16]
273
     pmulhrsw       m1, m5
274
 
275
 ; mode 16
276
 
277
-    add            r2, 4 * mmsize
278
-
279
-    pshufb         m2, m0, [r3 + 3 * mmsize]
280
-    pmaddubsw      m2, [r2 + 0 * mmsize]
281
+    pshufb         m2, m0, [allAng4_shuf_mode15_16 + mmsize]
282
+    pmaddubsw      m2, [allAng4_fact_mode15_16 + mmsize]
283
     pmulhrsw       m2, m5
284
     packuswb       m1, m2
285
-    vpermq         m1, m1, 11011000b
286
     movu           [r0 + (15 - 2) * 16], m1
287
 
288
 ; mode 17
289
 
290
-    add            r3, 4 * mmsize
291
-
292
-    pshufb         m1, m0, [r3 + 0 * mmsize]
293
-    pmaddubsw      m1, [r2 + 1 * mmsize]
294
+    pshufb         m1, m0, [allAng4_shuf_mode17]
295
+    pmaddubsw      m1, [allAng4_fact_mode17]
296
     pmulhrsw       m1, m5
297
     packuswb       m1, m1
298
     vpermq         m1, m1, 11011000b
299
 
300
 ; mode 18
301
 
302
-    pshufb         m2, m0, [r3 + 1 * mmsize]
303
+    pshufb         m2, m0, [allAng4_shuf_mode18]
304
     vinserti128    m1, m1, xm2, 1
305
     movu           [r0 + (17 - 2) * 16], m1
306
 
307
 ; mode 19
308
 
309
-    pshufb         m1, m0, [r3 + 2 * mmsize]
310
-    pmaddubsw      m1, [r2 + 2 * mmsize]
311
+    pshufb         m1, m0, [allAng4_shuf_mode19_20]
312
+    pmaddubsw      m1, [allAng4_fact_mode19_20]
313
     pmulhrsw       m1, m5
314
 
315
 ; mode 20
316
 
317
-    pshufb         m2, m0, [r3 + 3 * mmsize]
318
-    pmaddubsw      m2, [r2 + 3 * mmsize]
319
+    pshufb         m2, m0, [allAng4_shuf_mode19_20 + mmsize]
320
+    pmaddubsw      m2, [allAng4_fact_mode19_20 + mmsize]
321
     pmulhrsw       m2, m5
322
     packuswb       m1, m2
323
-    vpermq         m1, m1, 11011000b
324
     movu           [r0 + (19 - 2) * 16], m1
325
 
326
 ; mode 21
327
 
328
-    add            r2, 4 * mmsize
329
-    add            r3, 4 * mmsize
330
-
331
-    pshufb         m1, m0, [r3 + 0 * mmsize]
332
-    pmaddubsw      m1, [r2 + 0 * mmsize]
333
+    pshufb         m1, m0, [allAng4_shuf_mode21_22]
334
+    pmaddubsw      m1, [allAng4_fact_mode21_22]
335
     pmulhrsw       m1, m5
336
 
337
 ; mode 22
338
 
339
-    pshufb         m2, m0, [r3 + 1 * mmsize]
340
-    pmaddubsw      m2, [r2 + 1 * mmsize]
341
+    pshufb         m2, m0, [allAng4_shuf_mode21_22 + mmsize]
342
+    pmaddubsw      m2, [allAng4_fact_mode21_22 + mmsize]
343
     pmulhrsw       m2, m5
344
     packuswb       m1, m2
345
-    vpermq         m1, m1, 11011000b
346
     movu           [r0 + (21 - 2) * 16], m1
347
 
348
 ; mode 23
349
 
350
-    pshufb         m1, m0, [r3 + 2 * mmsize]
351
-    pmaddubsw      m1, [r2 + 2 * mmsize]
352
+    pshufb         m3, m0, [allAng4_shuf_mode23_24]
353
+    pmaddubsw      m1, m3, [allAng4_fact_mode23_24]
354
     pmulhrsw       m1, m5
355
 
356
 ; mode 24
357
 
358
-    pshufb         m2, m0, [r3 + 3 * mmsize]
359
-    pmaddubsw      m2, [r2 + 3 * mmsize]
360
+    pshufb         m2, m0, [allAng4_shuf_mode23_24 + mmsize]
361
+    pmaddubsw      m2, [allAng4_fact_mode23_24 + mmsize]
362
     pmulhrsw       m2, m5
363
     packuswb       m1, m2
364
-    vpermq         m1, m1, 11011000b
365
     movu           [r0 + (23 - 2) * 16], m1
366
 
367
 ; mode 25
368
 
369
-    add            r2, 4 * mmsize
370
-
371
-    pshufb         m1, m0, [r3 + 3 * mmsize]
372
-    pmaddubsw      m1, [r2 + 0 * mmsize]
373
-    pmulhrsw       m1, m5
374
-    packuswb       m1, m1
375
-    vpermq         m1, m1, 11011000b
376
-    movu           [r0 + (25 - 2) * 16], xm1
377
+    pmaddubsw      m3, [allAng4_fact_mode25]
378
+    pmulhrsw       m3, m5
379
+    packuswb       m3, m3
380
+    vpermq         m3, m3, 11011000b
381
+    movu           [r0 + (25 - 2) * 16], xm3
382
 
383
 ; mode 26
384
 
385
-    add            r3, 4 * mmsize
386
-
387
-    pshufb         xm1, xm0, [r3 + 0 * mmsize]
388
+    pshufb         m1, m0, [allAng4_shuf_mode26]
389
     movu           [r0 + (26 - 2) * 16], xm1
390
 
391
     pxor           xm1, xm1
392
@@ -23326,64 +23292,55 @@
393
 
394
 ; mode 27
395
 
396
-    pshufb        m1, m0, [r3 + 1 * mmsize]
397
-    pmaddubsw     m1, [r2 + 1 * mmsize]
398
+    pshufb        m3, m0, [allAng4_shuf_mode27_28]
399
+    pmaddubsw     m1, m3, [allAng4_fact_mode27_28]
400
     pmulhrsw      m1, m5
401
 
402
 ; mode 28
403
 
404
-    pshufb        m2, m0, [r3 + 1 * mmsize]
405
-    pmaddubsw     m2, [r2 + 2 * mmsize]
406
+    pmaddubsw     m2, m3, [allAng4_fact_mode27_28 + mmsize]
407
     pmulhrsw      m2, m5
408
     packuswb      m1, m2
409
-    vpermq        m1, m1, 11011000b
410
     movu          [r0 + (27 - 2) * 16], m1
411
 
412
 ; mode 29
413
 
414
-    pshufb        m1, m0, [r3 + 2 * mmsize]
415
-    pmaddubsw     m1, [r2 + 3 * mmsize]
416
-    pmulhrsw      m1, m5
417
+    pmaddubsw     m3, [allAng4_fact_mode29_30]
418
+    pmulhrsw      m3, m5
419
 
420
 ; mode 30
421
 
422
-    add           r2, 4 * mmsize
423
-
424
-    pshufb        m2, m0, [r3 + 3 * mmsize]
425
-    pmaddubsw     m2, [r2 + 0 * mmsize]
426
+    pshufb        m2, m0, [allAng4_shuf_mode29_30]
427
+    pmaddubsw     m2, [allAng4_fact_mode29_30 + mmsize]
428
     pmulhrsw      m2, m5
429
-    packuswb      m1, m2
430
-    vpermq        m1, m1, 11011000b
431
-    movu          [r0 + (29 - 2) * 16], m1
432
+    packuswb      m3, m2
433
+    movu          [r0 + (29 - 2) * 16], m3
434
 
435
 ; mode 31
436
 
437
-    add           r3, 4 * mmsize
438
-
439
-    pshufb        m1, m0, [r3 + 0 * mmsize]
440
-    pmaddubsw     m1, [r2 + 1 * mmsize]
441
+    pshufb        m1, m0, [allAng4_shuf_mode31_32]
442
+    pmaddubsw     m1, [allAng4_fact_mode31_32]
443
     pmulhrsw      m1, m5
444
 
445
 ; mode 32
446
 
447
-    pshufb        m2, m0, [r3 + 0 * mmsize]
448
-    pmaddubsw     m2, [r2 + 2 * mmsize]
449
+    pshufb        m2, m0, [allAng4_shuf_mode31_32 + mmsize]
450
+    pmaddubsw     m2, [allAng4_fact_mode31_32 + mmsize]
451
     pmulhrsw      m2, m5
452
     packuswb      m1, m2
453
-    vpermq        m1, m1, 11011000b
454
     movu          [r0 + (31 - 2) * 16], m1
455
 
456
 ; mode 33
457
 
458
-    pshufb        m1, m0, [r3 + 1 * mmsize]
459
-    pmaddubsw     m1, [r2 + 3 * mmsize]
460
+    pshufb        m1, m0, [allAng4_shuf_mode33]
461
+    pmaddubsw     m1, [allAng4_fact_mode33]
462
     pmulhrsw      m1, m5
463
     packuswb      m1, m2
464
     vpermq        m1, m1, 11011000b
465
 
466
 ; mode 34
467
 
468
-    pshufb        m0, [r3 + 2 * mmsize]
469
+    pshufb        m0, [allAng4_shuf_mode34]
470
     vinserti128   m1, m1, xm0, 1
471
     movu          [r0 + (33 - 2) * 16], m1
472
     RET
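
The rewrite above replaces register-relative table addressing ([r3 + n * mmsize], with periodic add r2/r3 pointer bumps) by directly named shuffle and factor tables (allAng4_shuf_mode*, allAng4_fact_mode*). Because the new tables can be laid out in store order, most of the vpermq lane fix-ups disappear, and reusing one shuffle result across a mode pair (m3) saves a pshufb per pair. Each mode is still the standard HEVC two-tap angular blend; the scalar sketch below (function and variable names are illustrative, not from the source) shows the computation the allAng4_fact_* weights presumably encode:

    #include <stdint.h>

    /* Illustrative scalar model of one row of 4x4 angular prediction.
     * The SIMD code performs this (32 - f) / f two-tap blend for all
     * modes at once, with f baked per mode into allAng4_fact_*. */
    static void intra_ang4_row(uint8_t *dst, const uint8_t *ref,
                               int angle, int y)
    {
        int pos = (y + 1) * angle;    /* per-mode projection step */
        int idx = pos >> 5;           /* integer sample offset    */
        int f   = pos & 31;           /* 5-bit fractional weight  */
        for (int x = 0; x < 4; x++)
            dst[x] = (uint8_t)(((32 - f) * ref[idx + x] +
                                f * ref[idx + x + 1] + 16) >> 5);
    }
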
473
x265_1.8.tar.gz/source/common/x86/ipfilter16.asm -> x265_1.9.tar.gz/source/common/x86/ipfilter16.asm Changed
860
 
1
@@ -4869,7 +4869,7 @@
2
 %ifidn %2,pp
3
     vbroadcasti128  m8, [INTERP_OFFSET_PP]
4
 %elifidn %2, sp
5
-    mova            m8, [INTERP_OFFSET_SP]
6
+    vbroadcasti128  m8, [INTERP_OFFSET_SP]
7
 %else
8
     vbroadcasti128  m8, [INTERP_OFFSET_PS]
9
 %endif
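
This hunk (and the many like it below) looks like a correctness fix rather than an optimisation: m8 is a 256-bit ymm register here, while the INTERP_OFFSET_* rounding tables appear to be only 128 bits wide, so a plain mova would read 16 bytes past the table and leave the upper lane holding unrelated data. vbroadcasti128 instead loads the 128-bit constant and duplicates it into both lanes. A minimal intrinsics sketch of the same idea (the table layout is an assumption for illustration):

    #include <immintrin.h>
    #include <stdint.h>

    /* Broadcast a 16-byte rounding-offset table into both 128-bit
     * lanes of a ymm register; equivalent to the vbroadcasti128 fix. */
    static __m256i load_interp_offset(const int32_t table[4])
    {
        __m128i lo = _mm_loadu_si128((const __m128i *)table);
        return _mm256_broadcastsi128_si256(lo);
    }
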
10
@@ -5011,11 +5011,11 @@
11
     mov       r4d, %1/2
12
 
13
 %ifidn %2, pp
14
-    mova      m7, [INTERP_OFFSET_PP]
15
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
16
 %elifidn %2, sp
17
-    mova      m7, [INTERP_OFFSET_SP]
18
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
19
 %elifidn %2, ps
20
-    mova      m7, [INTERP_OFFSET_PS]
21
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
22
 %endif
23
 
24
 .loopH:
25
@@ -5183,11 +5183,11 @@
26
     mov       r4d, %1/2
27
 
28
 %ifidn %2, pp
29
-    mova      m7, [INTERP_OFFSET_PP]
30
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
31
 %elifidn %2, sp
32
-    mova      m7, [INTERP_OFFSET_SP]
33
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
34
 %elifidn %2, ps
35
-    mova      m7, [INTERP_OFFSET_PS]
36
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
37
 %endif
38
 
39
 .loopH:
40
@@ -5325,11 +5325,11 @@
41
     mov       r4d, %1/2
42
 
43
 %ifidn %2, pp
44
-    mova      m7, [INTERP_OFFSET_PP]
45
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
46
 %elifidn %2, sp
47
-    mova      m7, [INTERP_OFFSET_SP]
48
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
49
 %elifidn %2, ps
50
-    mova      m7, [INTERP_OFFSET_PS]
51
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
52
 %endif
53
 
54
 .loopH:
55
@@ -5456,11 +5456,11 @@
56
     mov       r4d, %1/2
57
 
58
 %ifidn %2, pp
59
-    mova      m7, [INTERP_OFFSET_PP]
60
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
61
 %elifidn %2, sp
62
-    mova      m7, [INTERP_OFFSET_SP]
63
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
64
 %elifidn %2, ps
65
-    mova      m7, [INTERP_OFFSET_PS]
66
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
67
 %endif
68
 
69
 .loopH:
70
@@ -5609,11 +5609,11 @@
71
     mov       r4d, %1/2
72
 
73
 %ifidn %2, pp
74
-    mova      m7, [INTERP_OFFSET_PP]
75
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
76
 %elifidn %2, sp
77
-    mova      m7, [INTERP_OFFSET_SP]
78
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
79
 %elifidn %2, ps
80
-    mova      m7, [INTERP_OFFSET_PS]
81
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
82
 %endif
83
 
84
 .loopH:
85
@@ -5732,11 +5732,11 @@
86
     mov       r4d, 32
87
 
88
 %ifidn %1, pp
89
-    mova      m7, [INTERP_OFFSET_PP]
90
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
91
 %elifidn %1, sp
92
-    mova      m7, [INTERP_OFFSET_SP]
93
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
94
 %elifidn %1, ps
95
-    mova      m7, [INTERP_OFFSET_PS]
96
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
97
 %endif
98
 
99
 .loopH:
100
@@ -6068,7 +6068,7 @@
101
 %ifidn %1,pp
102
     vbroadcasti128  m6, [pd_32]
103
 %elifidn %1, sp
104
-    mova            m6, [pd_524800]
105
+    vbroadcasti128  m6, [INTERP_OFFSET_SP]
106
 %else
107
     vbroadcasti128  m6, [INTERP_OFFSET_PS]
108
 %endif
109
@@ -6178,7 +6178,7 @@
110
 %ifidn %1,pp
111
     vbroadcasti128  m11, [pd_32]
112
 %elifidn %1, sp
113
-    mova            m11, [pd_524800]
114
+    vbroadcasti128  m11, [INTERP_OFFSET_SP]
115
 %else
116
     vbroadcasti128  m11, [INTERP_OFFSET_PS]
117
 %endif
118
@@ -6816,7 +6816,7 @@
119
 %ifidn %1,pp
120
     vbroadcasti128  m14, [pd_32]
121
 %elifidn %1, sp
122
-    mova            m14, [INTERP_OFFSET_SP]
123
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
124
 %else
125
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
126
 %endif
127
@@ -6867,7 +6867,7 @@
128
 %ifidn %3,pp
129
     vbroadcasti128  m14, [pd_32]
130
 %elifidn %3, sp
131
-    mova            m14, [INTERP_OFFSET_SP]
132
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
133
 %else
134
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
135
 %endif
136
@@ -6950,7 +6950,7 @@
137
 %ifidn %1,pp
138
     vbroadcasti128  m14, [pd_32]
139
 %elifidn %1, sp
140
-    mova            m14, [INTERP_OFFSET_SP]
141
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
142
 %else
143
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
144
 %endif
145
@@ -7597,7 +7597,7 @@
146
 %ifidn %1,pp
147
     vbroadcasti128  m11, [pd_32]
148
 %elifidn %1, sp
149
-    mova            m11, [INTERP_OFFSET_SP]
150
+    vbroadcasti128  m11, [INTERP_OFFSET_SP]
151
 %else
152
     vbroadcasti128  m11, [INTERP_OFFSET_PS]
153
 %endif
154
@@ -7644,7 +7644,7 @@
155
 %ifidn %1,pp
156
     vbroadcasti128  m14, [pd_32]
157
 %elifidn %1, sp
158
-    mova            m14, [INTERP_OFFSET_SP]
159
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
160
 %else
161
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
162
 %endif
163
@@ -7816,7 +7816,7 @@
164
 %ifidn %1,pp
165
     vbroadcasti128  m7, [pd_32]
166
 %elifidn %1, sp
167
-    mova            m7, [INTERP_OFFSET_SP]
168
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
169
 %else
170
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
171
 %endif
172
@@ -7861,7 +7861,7 @@
173
 %ifidn %1,pp
174
     vbroadcasti128  m7, [pd_32]
175
 %elifidn %1, sp
176
-    mova            m7, [INTERP_OFFSET_SP]
177
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
178
 %else
179
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
180
 %endif
181
@@ -7901,7 +7901,7 @@
182
 %ifidn %1,pp
183
     vbroadcasti128  m14, [pd_32]
184
 %elifidn %1, sp
185
-    mova            m14, [INTERP_OFFSET_SP]
186
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
187
 %else
188
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
189
 %endif
190
@@ -8248,7 +8248,7 @@
191
 %ifidn %1,pp
192
     vbroadcasti128  m7, [pd_32]
193
 %elifidn %1, sp
194
-    mova            m7, [INTERP_OFFSET_SP]
195
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
196
 %else
197
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
198
 %endif
199
@@ -8668,7 +8668,7 @@
200
 %ifidn %1,pp
201
     vbroadcasti128  m7, [pd_32]
202
 %elifidn %1, sp
203
-    mova            m7, [INTERP_OFFSET_SP]
204
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
205
 %else
206
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
207
 %endif
208
@@ -8703,7 +8703,7 @@
209
 %ifidn %1,pp
210
     vbroadcasti128  m14, [pd_32]
211
 %elifidn %1, sp
212
-    mova            m14, [INTERP_OFFSET_SP]
213
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
214
 %else
215
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
216
 %endif
217
@@ -10342,8 +10342,8 @@
218
     vpermd                      m3,                m5,                  m3
219
     paddd                       m3,                m2
220
     vextracti128                xm4,               m3,                  1
221
-    psrad                       xm3,               2
222
-    psrad                       xm4,               2
223
+    psrad                       xm3,               INTERP_SHIFT_PS
224
+    psrad                       xm4,               INTERP_SHIFT_PS
225
     packssdw                    xm3,               xm3
226
     packssdw                    xm4,               xm4
227
 
228
@@ -10375,8 +10375,8 @@
229
     vpermd                      m3,                m5,                  m3
230
     paddd                       m3,                m2
231
     vextracti128                xm4,               m3,                  1
232
-    psrad                       xm3,               2
233
-    psrad                       xm4,               2
234
+    psrad                       xm3,               INTERP_SHIFT_PS
235
+    psrad                       xm4,               INTERP_SHIFT_PS
236
     packssdw                    xm3,               xm3
237
     packssdw                    xm4,               xm4
238
 
239
@@ -10441,8 +10441,8 @@
240
     vpermq              m4, m4, q3120
241
     paddd               m4, m2
242
     vextracti128        xm5,m4, 1
243
-    psrad               xm4, 2
244
-    psrad               xm5, 2
245
+    psrad               xm4, INTERP_SHIFT_PS
246
+    psrad               xm5, INTERP_SHIFT_PS
247
     packssdw            xm4, xm5
248
 
249
     movu                [r2], xm4
250
@@ -10511,8 +10511,8 @@
251
     vpermq              m4, m4, q3120
252
     paddd               m4, m2
253
     vextracti128        xm5,m4, 1
254
-    psrad               xm4, 2
255
-    psrad               xm5, 2
256
+    psrad               xm4, INTERP_SHIFT_PS
257
+    psrad               xm5, INTERP_SHIFT_PS
258
     packssdw            xm4, xm5
259
 
260
     movu                [r2 + x], xm4
261
@@ -10583,8 +10583,8 @@
262
     vpermq              m4, m4, q3120
263
     paddd               m4, m2
264
     vextracti128        xm5,m4, 1
265
-    psrad               xm4, 2
266
-    psrad               xm5, 2
267
+    psrad               xm4, INTERP_SHIFT_PS
268
+    psrad               xm5, INTERP_SHIFT_PS
269
     packssdw            xm4, xm5
270
 
271
     movu                [r2 + x], xm4
272
@@ -10609,8 +10609,8 @@
273
     vpermq              m6, m6, q3120
274
     paddd               m6, m2
275
     vextracti128        xm5,m6, 1
276
-    psrad               xm6, 2
277
-    psrad               xm5, 2
278
+    psrad               xm6, INTERP_SHIFT_PS
279
+    psrad               xm5, INTERP_SHIFT_PS
280
     packssdw            xm6, xm5
281
 
282
     movu                [r2 + 16 + x], xm6
283
@@ -10690,8 +10690,8 @@
284
     vpermq              m4, m4, q3120
285
     paddd               m4, m2
286
     vextracti128        xm5, m4, 1
287
-    psrad               xm4, 2
288
-    psrad               xm5, 2
289
+    psrad               xm4, INTERP_SHIFT_PS
290
+    psrad               xm5, INTERP_SHIFT_PS
291
     packssdw            xm4, xm5
292
     movu                [r2], xm4
293
 
294
@@ -10713,8 +10713,8 @@
295
     vpermq              m6, m6, q3120
296
     paddd               m6, m2
297
     vextracti128        xm5,m6, 1
298
-    psrad               xm6, 2
299
-    psrad               xm5, 2
300
+    psrad               xm6, INTERP_SHIFT_PS
301
+    psrad               xm5, INTERP_SHIFT_PS
302
     packssdw            xm6, xm5
303
     movu                [r2 + 16], xm6
304
 
305
@@ -10783,8 +10783,8 @@
306
     vpermq              m4, m4, q3120
307
     paddd               m4, m2
308
     vextracti128        xm5,m4, 1
309
-    psrad               xm4, 2
310
-    psrad               xm5, 2
311
+    psrad               xm4, INTERP_SHIFT_PS
312
+    psrad               xm5, INTERP_SHIFT_PS
313
     packssdw            xm4, xm5
314
     movu                [r2], xm4
315
 
316
@@ -10798,7 +10798,7 @@
317
     phaddd              m6, m6
318
     vpermq              m6, m6, q3120
319
     paddd               xm6, xm2
320
-    psrad               xm6, 2
321
+    psrad               xm6, INTERP_SHIFT_PS
322
     packssdw            xm6, xm6
323
     movq                [r2 + 16], xm6
324
 
325
@@ -10847,7 +10847,7 @@
326
     phaddd              m4, m5
327
     paddd               m4, m2
328
     vpermq              m4, m4, q3120
329
-    psrad               m4, 2
330
+    psrad               m4, INTERP_SHIFT_PS
331
     vextracti128        xm5, m4, 1
332
     packssdw            xm4, xm5
333
     movu                [r2], xm4
334
@@ -10906,7 +10906,7 @@
335
     phaddd              m4, m5
336
     paddd               m4, m2
337
     vpermq              m4, m4, q3120
338
-    psrad               m4, 2
339
+    psrad               m4, INTERP_SHIFT_PS
340
     vextracti128        xm5, m4, 1
341
     packssdw            xm4, xm5
342
     movu                [r2], xm4
343
@@ -10920,7 +10920,7 @@
344
     phaddd              m4, m5
345
     paddd               m4, m2
346
     vpermq              m4, m4, q3120
347
-    psrad               m4, 2
348
+    psrad               m4, INTERP_SHIFT_PS
349
     vextracti128        xm5, m4, 1
350
     packssdw            xm4, xm5
351
     movu                [r2 + 16], xm4
352
@@ -10979,7 +10979,7 @@
353
     phaddd              m4, m5
354
     paddd               m4, m2
355
     vpermq              m4, m4, q3120
356
-    psrad               m4, 2
357
+    psrad               m4, INTERP_SHIFT_PS
358
     vextracti128        xm5, m4, 1
359
     packssdw            xm4, xm5
360
     movu                [r2], xm4
361
@@ -10993,7 +10993,7 @@
362
     phaddd              m4, m5
363
     paddd               m4, m2
364
     vpermq              m4, m4, q3120
365
-    psrad               m4, 2
366
+    psrad               m4, INTERP_SHIFT_PS
367
     vextracti128        xm5, m4, 1
368
     packssdw            xm4, xm5
369
     movu                [r2 + 16], xm4
370
@@ -11007,7 +11007,7 @@
371
     phaddd              m4, m5
372
     paddd               m4, m2
373
     vpermq              m4, m4, q3120
374
-    psrad               m4, 2
375
+    psrad               m4, INTERP_SHIFT_PS
376
     vextracti128        xm5, m4, 1
377
     packssdw            xm4, xm5
378
     movu                [r2 + 32], xm4
379
@@ -11061,7 +11061,7 @@
380
     phaddd              m4, m5
381
     paddd               m4, m2
382
     vpermq              m4, m4, q3120
383
-    psrad               m4, 2
384
+    psrad               m4, INTERP_SHIFT_PS
385
     vextracti128        xm5, m4, 1
386
     packssdw            xm4, xm5
387
     movu                [r2], xm4
388
@@ -11072,7 +11072,7 @@
389
     phaddd              m4, m4
390
     paddd               m4, m2
391
     vpermq              m4, m4, q3120
392
-    psrad               m4, 2
393
+    psrad               m4, INTERP_SHIFT_PS
394
     vextracti128        xm5, m4, 1
395
     packssdw            xm4, xm5
396
     movq                [r2 + 16], xm4
397
@@ -11126,7 +11126,7 @@
398
     phaddd              m4, m5
399
     paddd               m4, m2
400
     vpermq              m4, m4, q3120
401
-    psrad               m4, 2
402
+    psrad               m4, INTERP_SHIFT_PS
403
     vextracti128        xm5, m4, 1
404
     packssdw            xm4, xm5
405
     movu                [r2], xm4
406
@@ -11140,7 +11140,7 @@
407
     phaddd              m4, m5
408
     paddd               m4, m2
409
     vpermq              m4, m4, q3120
410
-    psrad               m4, 2
411
+    psrad               m4, INTERP_SHIFT_PS
412
     vextracti128        xm5, m4, 1
413
     packssdw            xm4, xm5
414
     movu                [r2 + 16], xm4
415
@@ -11154,7 +11154,7 @@
416
     phaddd              m4, m5
417
     paddd               m4, m2
418
     vpermq              m4, m4, q3120
419
-    psrad               m4, 2
420
+    psrad               m4, INTERP_SHIFT_PS
421
     vextracti128        xm5, m4, 1
422
     packssdw            xm4, xm5
423
     movu                [r2 + 32], xm4
424
@@ -11168,7 +11168,7 @@
425
     phaddd              m4, m5
426
     paddd               m4, m2
427
     vpermq              m4, m4, q3120
428
-    psrad               m4, 2
429
+    psrad               m4, INTERP_SHIFT_PS
430
     vextracti128        xm5, m4, 1
431
     packssdw            xm4, xm5
432
     movu                [r2 + 48], xm4
433
@@ -11227,7 +11227,7 @@
434
     phaddd              m4, m5
435
     paddd               m4, m2
436
     vpermq              m4, m4, q3120
437
-    psrad               m4, 2
438
+    psrad               m4, INTERP_SHIFT_PS
439
     vextracti128        xm5, m4, 1
440
     packssdw            xm4, xm5
441
     movu                [r2], xm4
442
@@ -11241,7 +11241,7 @@
443
     phaddd              m4, m5
444
     paddd               m4, m2
445
     vpermq              m4, m4, q3120
446
-    psrad               m4, 2
447
+    psrad               m4, INTERP_SHIFT_PS
448
     vextracti128        xm5, m4, 1
449
     packssdw            xm4, xm5
450
     movu                [r2 + 16], xm4
451
@@ -11255,7 +11255,7 @@
452
     phaddd              m4, m5
453
     paddd               m4, m2
454
     vpermq              m4, m4, q3120
455
-    psrad               m4, 2
456
+    psrad               m4, INTERP_SHIFT_PS
457
     vextracti128        xm5, m4, 1
458
     packssdw            xm4, xm5
459
     movu                [r2 + 32], xm4
460
@@ -11269,7 +11269,7 @@
461
     phaddd              m4, m5
462
     paddd               m4, m2
463
     vpermq              m4, m4, q3120
464
-    psrad               m4, 2
465
+    psrad               m4, INTERP_SHIFT_PS
466
     vextracti128        xm5, m4, 1
467
     packssdw            xm4, xm5
468
     movu                [r2 + 48], xm4
469
@@ -11283,7 +11283,7 @@
470
     phaddd              m4, m5
471
     paddd               m4, m2
472
     vpermq              m4, m4, q3120
473
-    psrad               m4, 2
474
+    psrad               m4, INTERP_SHIFT_PS
475
     vextracti128        xm5, m4, 1
476
     packssdw            xm4, xm5
477
     movu                [r2 + 64], xm4
478
@@ -11297,7 +11297,7 @@
479
     phaddd              m4, m5
480
     paddd               m4, m2
481
     vpermq              m4, m4, q3120
482
-    psrad               m4, 2
483
+    psrad               m4, INTERP_SHIFT_PS
484
     vextracti128        xm5, m4, 1
485
     packssdw            xm4, xm5
486
     movu                [r2 + 80], xm4
487
@@ -11311,7 +11311,7 @@
488
     phaddd              m4, m5
489
     paddd               m4, m2
490
     vpermq              m4, m4, q3120
491
-    psrad               m4, 2
492
+    psrad               m4, INTERP_SHIFT_PS
493
     vextracti128        xm5, m4, 1
494
     packssdw            xm4, xm5
495
     movu                [r2 + 96], xm4
496
@@ -11325,7 +11325,7 @@
497
     phaddd              m4, m5
498
     paddd               m4, m2
499
     vpermq              m4, m4, q3120
500
-    psrad               m4, 2
501
+    psrad               m4, INTERP_SHIFT_PS
502
     vextracti128        xm5, m4, 1
503
     packssdw            xm4, xm5
504
     movu                [r2 + 112], xm4
505
@@ -11380,7 +11380,7 @@
506
     phaddd              m4, m5
507
     paddd               m4, m2
508
     vpermq              m4, m4, q3120
509
-    psrad               m4, 2
510
+    psrad               m4, INTERP_SHIFT_PS
511
     vextracti128        xm5, m4, 1
512
     packssdw            xm4, xm5
513
     movu                [r2], xm4
514
@@ -11394,7 +11394,7 @@
515
     phaddd              m4, m5
516
     paddd               m4, m2
517
     vpermq              m4, m4, q3120
518
-    psrad               m4, 2
519
+    psrad               m4, INTERP_SHIFT_PS
520
     vextracti128        xm5, m4, 1
521
     packssdw            xm4, xm5
522
     movu                [r2 + 16], xm4
523
@@ -11408,7 +11408,7 @@
524
     phaddd              m4, m5
525
     paddd               m4, m2
526
     vpermq              m4, m4, q3120
527
-    psrad               m4, 2
528
+    psrad               m4, INTERP_SHIFT_PS
529
     vextracti128        xm5, m4, 1
530
     packssdw            xm4, xm5
531
     movu                [r2 + 32], xm4
532
@@ -11422,7 +11422,7 @@
533
     phaddd              m4, m5
534
     paddd               m4, m2
535
     vpermq              m4, m4, q3120
536
-    psrad               m4, 2
537
+    psrad               m4, INTERP_SHIFT_PS
538
     vextracti128        xm5, m4, 1
539
     packssdw            xm4, xm5
540
     movu                [r2 + 48], xm4
541
@@ -11436,7 +11436,7 @@
542
     phaddd              m4, m5
543
     paddd               m4, m2
544
     vpermq              m4, m4, q3120
545
-    psrad               m4, 2
546
+    psrad               m4, INTERP_SHIFT_PS
547
     vextracti128        xm5, m4, 1
548
     packssdw            xm4, xm5
549
     movu                [r2 + 64], xm4
550
@@ -11450,7 +11450,7 @@
551
     phaddd              m4, m5
552
     paddd               m4, m2
553
     vpermq              m4, m4, q3120
554
-    psrad               m4, 2
555
+    psrad               m4, INTERP_SHIFT_PS
556
     vextracti128        xm5, m4, 1
557
     packssdw            xm4, xm5
558
     movu                [r2 + 80], xm4
559
@@ -11500,7 +11500,7 @@
560
     phaddd              m4, m5
561
     paddd               m4, m2
562
     vpermq              m4, m4, q3120
563
-    psrad               m4, 2
564
+    psrad               m4, INTERP_SHIFT_PS
565
     vextracti128        xm5, m4, 1
566
     packssdw            xm4, xm5
567
     movq                [r2], xm4
568
@@ -11537,7 +11537,7 @@
569
 %ifidn %1,pp
570
     vbroadcasti128  m14, [pd_32]
571
 %elifidn %1, sp
572
-    mova            m14, [pd_524800]
573
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
574
 %else
575
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
576
 %endif
577
@@ -11665,19 +11665,19 @@
578
     psrad           m4, 6
579
     psrad           m5, 6
580
 %elifidn %1, sp
581
-    psrad           m0, 10
582
-    psrad           m1, 10
583
-    psrad           m2, 10
584
-    psrad           m3, 10
585
-    psrad           m4, 10
586
-    psrad           m5, 10
587
-%else
588
-    psrad           m0, 2
589
-    psrad           m1, 2
590
-    psrad           m2, 2
591
-    psrad           m3, 2
592
-    psrad           m4, 2
593
-    psrad           m5, 2
594
+    psrad           m0, INTERP_SHIFT_SP
595
+    psrad           m1, INTERP_SHIFT_SP
596
+    psrad           m2, INTERP_SHIFT_SP
597
+    psrad           m3, INTERP_SHIFT_SP
598
+    psrad           m4, INTERP_SHIFT_SP
599
+    psrad           m5, INTERP_SHIFT_SP
600
+%else
601
+    psrad           m0, INTERP_SHIFT_PS
602
+    psrad           m1, INTERP_SHIFT_PS
603
+    psrad           m2, INTERP_SHIFT_PS
604
+    psrad           m3, INTERP_SHIFT_PS
605
+    psrad           m4, INTERP_SHIFT_PS
606
+    psrad           m5, INTERP_SHIFT_PS
607
 %endif
608
 %endif
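
The literal shift counts in this and the surrounding hunks (psrad by 10 on the sp path, by 2 on the ps path) were only valid for 10-bit builds; INTERP_SHIFT_SP and INTERP_SHIFT_PS evaluate to those same values at 10 bits but track the configured bit depth. Likewise, pd_524800 is the 10-bit sp rounding offset ((8192 << 6) + (1 << 9) = 524800), which is why those mova loads are rewritten to the named INTERP_OFFSET_SP as well. A hedged sketch of the depth-dependent quantities (the exact macro definitions live in the x265 asm headers; these formulas merely reproduce the 10-bit literals seen in the diff):

    /* HEVC interpolation precision constants. */
    enum { IF_FILTER_PREC = 6, IF_INTERNAL_PREC = 14, IF_INTERNAL_OFFS = 8192 };

    static int interp_shift_ps(int bitDepth)    /* 10-bit -> 2  */
    {
        return IF_FILTER_PREC - (IF_INTERNAL_PREC - bitDepth);
    }

    static int interp_shift_sp(int bitDepth)    /* 10-bit -> 10 */
    {
        return IF_FILTER_PREC + (IF_INTERNAL_PREC - bitDepth);
    }

    static int interp_offset_sp(int bitDepth)   /* 10-bit -> 524800 */
    {
        return (IF_INTERNAL_OFFS << IF_FILTER_PREC)
             + (1 << (interp_shift_sp(bitDepth) - 1));
    }
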
609
 
610
@@ -11736,11 +11736,11 @@
611
     psrad           m6, 6
612
     psrad           m7, 6
613
 %elifidn %1, sp
614
-    psrad           m6, 10
615
-    psrad           m7, 10
616
+    psrad           m6, INTERP_SHIFT_SP
617
+    psrad           m7, INTERP_SHIFT_SP
618
 %else
619
-    psrad           m6, 2
620
-    psrad           m7, 2
621
+    psrad           m6, INTERP_SHIFT_PS
622
+    psrad           m7, INTERP_SHIFT_PS
623
 %endif
624
 %endif
625
 
626
@@ -11814,23 +11814,23 @@
627
     psrad           m0, 6
628
     psrad           m1, 6
629
 %elifidn %1, sp
630
-    psrad           m8, 10
631
-    psrad           m9, 10
632
-    psrad           m10, 10
633
-    psrad           m11, 10
634
-    psrad           m12, 10
635
-    psrad           m13, 10
636
-    psrad           m0, 10
637
-    psrad           m1, 10
638
-%else
639
-    psrad           m8, 2
640
-    psrad           m9, 2
641
-    psrad           m10, 2
642
-    psrad           m11, 2
643
-    psrad           m12, 2
644
-    psrad           m13, 2
645
-    psrad           m0, 2
646
-    psrad           m1, 2
647
+    psrad           m8, INTERP_SHIFT_SP
648
+    psrad           m9, INTERP_SHIFT_SP
649
+    psrad           m10, INTERP_SHIFT_SP
650
+    psrad           m11, INTERP_SHIFT_SP
651
+    psrad           m12, INTERP_SHIFT_SP
652
+    psrad           m13, INTERP_SHIFT_SP
653
+    psrad           m0, INTERP_SHIFT_SP
654
+    psrad           m1, INTERP_SHIFT_SP
655
+%else
656
+    psrad           m8, INTERP_SHIFT_PS
657
+    psrad           m9, INTERP_SHIFT_PS
658
+    psrad           m10, INTERP_SHIFT_PS
659
+    psrad           m11, INTERP_SHIFT_PS
660
+    psrad           m12, INTERP_SHIFT_PS
661
+    psrad           m13, INTERP_SHIFT_PS
662
+    psrad           m0, INTERP_SHIFT_PS
663
+    psrad           m1, INTERP_SHIFT_PS
664
 %endif
665
 %endif
666
 
667
@@ -11954,7 +11954,7 @@
668
 %ifidn %1,pp
669
     vbroadcasti128  m7, [pd_32]
670
 %elifidn %1, sp
671
-    mova            m7, [pd_524800]
672
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
673
 %else
674
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
675
 %endif
676
@@ -11966,8 +11966,8 @@
677
 %endmacro
678
 
679
 FILTER_VER_CHROMA_AVX2_8x2 pp, 1, 6
680
-FILTER_VER_CHROMA_AVX2_8x2 ps, 0, 2
681
-FILTER_VER_CHROMA_AVX2_8x2 sp, 1, 10
682
+FILTER_VER_CHROMA_AVX2_8x2 ps, 0, INTERP_SHIFT_PS
683
+FILTER_VER_CHROMA_AVX2_8x2 sp, 1, INTERP_SHIFT_SP
684
 FILTER_VER_CHROMA_AVX2_8x2 ss, 0, 6
685
 
686
 %macro FILTER_VER_CHROMA_AVX2_4x2 3
687
@@ -11991,7 +11991,7 @@
688
 %ifidn %1,pp
689
     vbroadcasti128  m6, [pd_32]
690
 %elifidn %1, sp
691
-    mova            m6, [pd_524800]
692
+    vbroadcasti128  m6, [INTERP_OFFSET_SP]
693
 %else
694
     vbroadcasti128  m6, [INTERP_OFFSET_PS]
695
 %endif
696
@@ -12033,8 +12033,8 @@
697
 %endmacro
698
 
699
 FILTER_VER_CHROMA_AVX2_4x2 pp, 1, 6
700
-FILTER_VER_CHROMA_AVX2_4x2 ps, 0, 2
701
-FILTER_VER_CHROMA_AVX2_4x2 sp, 1, 10
702
+FILTER_VER_CHROMA_AVX2_4x2 ps, 0, INTERP_SHIFT_PS
703
+FILTER_VER_CHROMA_AVX2_4x2 sp, 1, INTERP_SHIFT_SP
704
 FILTER_VER_CHROMA_AVX2_4x2 ss, 0, 6
705
 
706
 %macro FILTER_VER_CHROMA_AVX2_4x4 3
707
@@ -12058,7 +12058,7 @@
708
 %ifidn %1,pp
709
    vbroadcasti128  m6, [pd_32]
710
 %elifidn %1, sp
711
-    mova            m6, [pd_524800]
712
+   vbroadcasti128  m6, [INTERP_OFFSET_SP]
713
 %else
714
     vbroadcasti128  m6, [INTERP_OFFSET_PS]
715
 %endif
716
@@ -12112,8 +12112,8 @@
717
 %endmacro
718
 
719
 FILTER_VER_CHROMA_AVX2_4x4 pp, 1, 6
720
-FILTER_VER_CHROMA_AVX2_4x4 ps, 0, 2
721
-FILTER_VER_CHROMA_AVX2_4x4 sp, 1, 10
722
+FILTER_VER_CHROMA_AVX2_4x4 ps, 0, INTERP_SHIFT_PS
723
+FILTER_VER_CHROMA_AVX2_4x4 sp, 1, INTERP_SHIFT_SP
724
 FILTER_VER_CHROMA_AVX2_4x4 ss, 0, 6
725
 
726
 
727
@@ -12138,7 +12138,7 @@
728
 %ifidn %1,pp
729
     vbroadcasti128  m7, [pd_32]
730
 %elifidn %1, sp
731
-    mova            m7, [pd_524800]
732
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
733
 %else
734
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
735
 %endif
736
@@ -12225,8 +12225,8 @@
737
 %endmacro
738
 
739
 FILTER_VER_CHROMA_AVX2_4x8 pp, 1, 6
740
-FILTER_VER_CHROMA_AVX2_4x8 ps, 0, 2
741
-FILTER_VER_CHROMA_AVX2_4x8 sp, 1, 10
742
+FILTER_VER_CHROMA_AVX2_4x8 ps, 0, INTERP_SHIFT_PS
743
+FILTER_VER_CHROMA_AVX2_4x8 sp, 1, INTERP_SHIFT_SP
744
 FILTER_VER_CHROMA_AVX2_4x8 ss, 0 , 6
745
 
746
 %macro PROCESS_LUMA_AVX2_W4_16R_4TAP 3
747
@@ -12396,7 +12396,7 @@
748
 %ifidn %1,pp
749
     vbroadcasti128  m7, [pd_32]
750
 %elifidn %1, sp
751
-    mova            m7, [pd_524800]
752
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
753
 %else
754
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
755
 %endif
756
@@ -12410,12 +12410,12 @@
757
 %endmacro
758
 
759
 FILTER_VER_CHROMA_AVX2_4xN pp, 16, 1, 6
760
-FILTER_VER_CHROMA_AVX2_4xN ps, 16, 0, 2
761
-FILTER_VER_CHROMA_AVX2_4xN sp, 16, 1, 10
762
+FILTER_VER_CHROMA_AVX2_4xN ps, 16, 0, INTERP_SHIFT_PS
763
+FILTER_VER_CHROMA_AVX2_4xN sp, 16, 1, INTERP_SHIFT_SP
764
 FILTER_VER_CHROMA_AVX2_4xN ss, 16, 0, 6
765
 FILTER_VER_CHROMA_AVX2_4xN pp, 32, 1, 6
766
-FILTER_VER_CHROMA_AVX2_4xN ps, 32, 0, 2
767
-FILTER_VER_CHROMA_AVX2_4xN sp, 32, 1, 10
768
+FILTER_VER_CHROMA_AVX2_4xN ps, 32, 0, INTERP_SHIFT_PS
769
+FILTER_VER_CHROMA_AVX2_4xN sp, 32, 1, INTERP_SHIFT_SP
770
 FILTER_VER_CHROMA_AVX2_4xN ss, 32, 0, 6
771
 
772
 %macro FILTER_VER_CHROMA_AVX2_8x8 3
773
@@ -12429,7 +12429,7 @@
774
 
775
 %ifdef PIC
776
     lea             r5, [tab_ChromaCoeffVer]
777
-   add             r5, r4
778
+    add             r5, r4
779
 %else
780
     lea             r5, [tab_ChromaCoeffVer + r4]
781
 %endif
782
@@ -12440,7 +12440,7 @@
783
 %ifidn %1,pp
784
     vbroadcasti128  m11, [pd_32]
785
 %elifidn %1, sp
786
-    mova            m11, [pd_524800]
787
+    vbroadcasti128  m11, [INTERP_OFFSET_SP]
788
 %else
789
     vbroadcasti128  m11, [INTERP_OFFSET_PS]
790
 %endif
791
@@ -12569,8 +12569,8 @@
792
 %endmacro
793
 
794
 FILTER_VER_CHROMA_AVX2_8x8 pp, 1, 6
795
-FILTER_VER_CHROMA_AVX2_8x8 ps, 0, 2
796
-FILTER_VER_CHROMA_AVX2_8x8 sp, 1, 10
797
+FILTER_VER_CHROMA_AVX2_8x8 ps, 0, INTERP_SHIFT_PS
798
+FILTER_VER_CHROMA_AVX2_8x8 sp, 1, INTERP_SHIFT_SP
799
 FILTER_VER_CHROMA_AVX2_8x8 ss, 0, 6
800
 
801
 %macro FILTER_VER_CHROMA_AVX2_8x6 3
802
@@ -12595,7 +12595,7 @@
803
 %ifidn %1,pp
804
     vbroadcasti128  m11, [pd_32]
805
 %elifidn %1, sp
806
-    mova            m11, [pd_524800]
807
+    vbroadcasti128  m11, [INTERP_OFFSET_SP]
808
 %else
809
     vbroadcasti128  m11, [INTERP_OFFSET_PS]
810
 %endif
811
@@ -12700,8 +12700,8 @@
812
 %endmacro
813
 
814
 FILTER_VER_CHROMA_AVX2_8x6 pp, 1, 6
815
-FILTER_VER_CHROMA_AVX2_8x6 ps, 0, 2
816
-FILTER_VER_CHROMA_AVX2_8x6 sp, 1, 10
817
+FILTER_VER_CHROMA_AVX2_8x6 ps, 0, INTERP_SHIFT_PS
818
+FILTER_VER_CHROMA_AVX2_8x6 sp, 1, INTERP_SHIFT_SP
819
 FILTER_VER_CHROMA_AVX2_8x6 ss, 0, 6
820
 
821
 %macro PROCESS_CHROMA_AVX2 3
822
@@ -12785,7 +12785,7 @@
823
 %ifidn %1,pp
824
     vbroadcasti128  m7, [pd_32]
825
 %elifidn %1, sp
826
-    mova            m7, [pd_524800]
827
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
828
 %else
829
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
830
 %endif
831
@@ -12799,8 +12799,8 @@
832
 %endmacro
833
 
834
 FILTER_VER_CHROMA_AVX2_8x4 pp, 1, 6
835
-FILTER_VER_CHROMA_AVX2_8x4 ps, 0, 2
836
-FILTER_VER_CHROMA_AVX2_8x4 sp, 1, 10
837
+FILTER_VER_CHROMA_AVX2_8x4 ps, 0, INTERP_SHIFT_PS
838
+FILTER_VER_CHROMA_AVX2_8x4 sp, 1, INTERP_SHIFT_SP
839
 FILTER_VER_CHROMA_AVX2_8x4 ss, 0, 6
840
 
841
 %macro FILTER_VER_CHROMA_AVX2_8x12 3
842
@@ -12824,7 +12824,7 @@
843
 %ifidn %1,pp
844
     vbroadcasti128  m14, [pd_32]
845
 %elifidn %1, sp
846
-    mova            m14, [pd_524800]
847
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
848
 %else
849
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
850
 %endif
851
@@ -13002,6 +13002,6 @@
852
 %endmacro
853
 
854
 FILTER_VER_CHROMA_AVX2_8x12 pp, 1, 6
855
-FILTER_VER_CHROMA_AVX2_8x12 ps, 0, 2
856
-FILTER_VER_CHROMA_AVX2_8x12 sp, 1, 10
857
+FILTER_VER_CHROMA_AVX2_8x12 ps, 0, INTERP_SHIFT_PS
858
+FILTER_VER_CHROMA_AVX2_8x12 sp, 1, INTERP_SHIFT_SP
859
 FILTER_VER_CHROMA_AVX2_8x12 ss, 0, 6
860
x265_1.8.tar.gz/source/common/x86/ipfilter8.asm -> x265_1.9.tar.gz/source/common/x86/ipfilter8.asm Changed
571
 
1
@@ -12541,6 +12541,459 @@
2
 ;-----------------------------------------------------------------------------
3
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
4
 ;-----------------------------------------------------------------------------
5
+INIT_YMM avx2
6
+cglobal filterPixelToShort_16x4, 3, 4, 2
7
+    mov             r3d, r3m
8
+    add             r3d, r3d
9
+
10
+    ; load constant
11
+    vbroadcasti128  m1, [pw_2000]
12
+
13
+    pmovzxbw        m0, [r0]
14
+    psllw           m0, 6
15
+    psubw           m0, m1
16
+    movu            [r2], m0
17
+
18
+    pmovzxbw        m0, [r0 + r1]
19
+    psllw           m0, 6
20
+    psubw           m0, m1
21
+    movu            [r2 + r3], m0
22
+
23
+    pmovzxbw        m0, [r0 + r1 * 2]
24
+    psllw           m0, 6
25
+    psubw           m0, m1
26
+    movu            [r2 + r3 * 2], m0
27
+
28
+    lea             r1, [r1 * 3]
29
+    lea             r3, [r3 * 3]
30
+
31
+    pmovzxbw        m0, [r0 + r1]
32
+    psllw           m0, 6
33
+    psubw           m0, m1
34
+    movu            [r2 + r3], m0
35
+    RET
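+
+; All of the new filterPixelToShort bodies, including the P2S_H_16xN_avx2
+; macro further down (which wraps the same 16-row body in a loop of
+; height/16 iterations), perform one conversion per row: zero-extend
+; 16 pixels to words, scale to the 14-bit internal precision, and
+; re-centre by pw_2000 (0x2000 = 8192).  A scalar C equivalent of one
+; row, for reference:
+;
+;     #include <stdint.h>
+;
+;     /* Scalar model of one row of the filterPixelToShort kernels:
+;      * scale 8-bit pixels by 64, subtract the pw_2000 offset. */
+;     static void p2s_row(const uint8_t *src, int16_t *dst, int width)
+;     {
+;         for (int x = 0; x < width; x++)
+;             dst[x] = (int16_t)((src[x] << 6) - 0x2000);
+;     }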
36
+
37
+;-----------------------------------------------------------------------------
38
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
39
+;-----------------------------------------------------------------------------
40
+INIT_YMM avx2
41
+cglobal filterPixelToShort_16x8, 3, 6, 2
42
+    mov             r3d, r3m
43
+    add             r3d, r3d
44
+    lea             r4, [r1 * 3]
45
+    lea             r5, [r3 * 3]
46
+
47
+    ; load constant
48
+    vbroadcasti128  m1, [pw_2000]
49
+
50
+    pmovzxbw        m0, [r0]
51
+    psllw           m0, 6
52
+    psubw           m0, m1
53
+    movu            [r2], m0
54
+
55
+    pmovzxbw        m0, [r0 + r1]
56
+    psllw           m0, 6
57
+    psubw           m0, m1
58
+    movu            [r2 + r3], m0
59
+
60
+    pmovzxbw        m0, [r0 + r1 * 2]
61
+    psllw           m0, 6
62
+    psubw           m0, m1
63
+    movu            [r2 + r3 * 2], m0
64
+
65
+    pmovzxbw        m0, [r0 + r4]
66
+    psllw           m0, 6
67
+    psubw           m0, m1
68
+    movu            [r2 + r5], m0
69
+
70
+    lea             r0, [r0 + r1 * 4]
71
+    lea             r2, [r2 + r3 * 4]
72
+
73
+    pmovzxbw        m0, [r0]
74
+    psllw           m0, 6
75
+    psubw           m0, m1
76
+    movu            [r2], m0
77
+
78
+    pmovzxbw        m0, [r0 + r1]
79
+    psllw           m0, 6
80
+    psubw           m0, m1
81
+    movu            [r2 + r3], m0
82
+
83
+    pmovzxbw        m0, [r0 + r1 * 2]
84
+    psllw           m0, 6
85
+    psubw           m0, m1
86
+    movu            [r2 + r3 * 2], m0
87
+
88
+    pmovzxbw        m0, [r0 + r4]
89
+    psllw           m0, 6
90
+    psubw           m0, m1
91
+    movu            [r2 + r5], m0
92
+    RET
93
+
94
+;-----------------------------------------------------------------------------
95
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
96
+;-----------------------------------------------------------------------------
97
+INIT_YMM avx2
98
+cglobal filterPixelToShort_16x12, 3, 6, 2
99
+    mov             r3d, r3m
100
+    add             r3d, r3d
101
+    lea             r4, [r1 * 3]
102
+    lea             r5, [r3 * 3]
103
+
104
+    ; load constant
105
+    vbroadcasti128  m1, [pw_2000]
106
+
107
+    pmovzxbw        m0, [r0]
108
+    psllw           m0, 6
109
+    psubw           m0, m1
110
+    movu            [r2], m0
111
+
112
+    pmovzxbw        m0, [r0 + r1]
113
+    psllw           m0, 6
114
+    psubw           m0, m1
115
+    movu            [r2 + r3], m0
116
+
117
+    pmovzxbw        m0, [r0 + r1 * 2]
118
+    psllw           m0, 6
119
+    psubw           m0, m1
120
+    movu            [r2 + r3 * 2], m0
121
+
122
+    pmovzxbw        m0, [r0 + r4]
123
+    psllw           m0, 6
124
+    psubw           m0, m1
125
+    movu            [r2 + r5], m0
126
+
127
+    lea             r0, [r0 + r1 * 4]
128
+    lea             r2, [r2 + r3 * 4]
129
+
130
+    pmovzxbw        m0, [r0]
131
+    psllw           m0, 6
132
+    psubw           m0, m1
133
+    movu            [r2], m0
134
+
135
+    pmovzxbw        m0, [r0 + r1]
136
+    psllw           m0, 6
137
+    psubw           m0, m1
138
+    movu            [r2 + r3], m0
139
+
140
+    pmovzxbw        m0, [r0 + r1 * 2]
141
+    psllw           m0, 6
142
+    psubw           m0, m1
143
+    movu            [r2 + r3 * 2], m0
144
+
145
+    pmovzxbw        m0, [r0 + r4]
146
+    psllw           m0, 6
147
+    psubw           m0, m1
148
+    movu            [r2 + r5], m0
149
+
150
+    lea             r0, [r0 + r1 * 4]
151
+    lea             r2, [r2 + r3 * 4]
152
+
153
+    pmovzxbw        m0, [r0]
154
+    psllw           m0, 6
155
+    psubw           m0, m1
156
+    movu            [r2], m0
157
+
158
+    pmovzxbw        m0, [r0 + r1]
159
+    psllw           m0, 6
160
+    psubw           m0, m1
161
+    movu            [r2 + r3], m0
162
+
163
+    pmovzxbw        m0, [r0 + r1 * 2]
164
+    psllw           m0, 6
165
+    psubw           m0, m1
166
+    movu            [r2 + r3 * 2], m0
167
+
168
+    pmovzxbw        m0, [r0 + r4]
169
+    psllw           m0, 6
170
+    psubw           m0, m1
171
+    movu            [r2 + r5], m0
172
+    RET
173
+
174
+;-----------------------------------------------------------------------------
175
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
176
+;-----------------------------------------------------------------------------
177
+INIT_YMM avx2
178
+cglobal filterPixelToShort_16x16, 3, 6, 2
179
+    mov             r3d, r3m
180
+    add             r3d, r3d
181
+    lea             r4, [r1 * 3]
182
+    lea             r5, [r3 * 3]
183
+
184
+    ; load constant
185
+    vbroadcasti128  m1, [pw_2000]
186
+
187
+    pmovzxbw        m0, [r0]
188
+    psllw           m0, 6
189
+    psubw           m0, m1
190
+    movu            [r2], m0
191
+
192
+    pmovzxbw        m0, [r0 + r1]
193
+    psllw           m0, 6
194
+    psubw           m0, m1
195
+    movu            [r2 + r3], m0
196
+
197
+    pmovzxbw        m0, [r0 + r1 * 2]
198
+    psllw           m0, 6
199
+    psubw           m0, m1
200
+    movu            [r2 + r3 * 2], m0
201
+
202
+    pmovzxbw        m0, [r0 + r4]
203
+    psllw           m0, 6
204
+    psubw           m0, m1
205
+    movu            [r2 + r5], m0
206
+
207
+    lea             r0, [r0 + r1 * 4]
208
+    lea             r2, [r2 + r3 * 4]
209
+
210
+    pmovzxbw        m0, [r0]
211
+    psllw           m0, 6
212
+    psubw           m0, m1
213
+    movu            [r2], m0
214
+
215
+    pmovzxbw        m0, [r0 + r1]
216
+    psllw           m0, 6
217
+    psubw           m0, m1
218
+    movu            [r2 + r3], m0
219
+
220
+    pmovzxbw        m0, [r0 + r1 * 2]
221
+    psllw           m0, 6
222
+    psubw           m0, m1
223
+    movu            [r2 + r3 * 2], m0
224
+
225
+    pmovzxbw        m0, [r0 + r4]
226
+    psllw           m0, 6
227
+    psubw           m0, m1
228
+    movu            [r2 + r5], m0
229
+
230
+    lea             r0, [r0 + r1 * 4]
231
+    lea             r2, [r2 + r3 * 4]
232
+
233
+    pmovzxbw        m0, [r0]
234
+    psllw           m0, 6
235
+    psubw           m0, m1
236
+    movu            [r2], m0
237
+
238
+    pmovzxbw        m0, [r0 + r1]
239
+    psllw           m0, 6
240
+    psubw           m0, m1
241
+    movu            [r2 + r3], m0
242
+
243
+    pmovzxbw        m0, [r0 + r1 * 2]
244
+    psllw           m0, 6
245
+    psubw           m0, m1
246
+    movu            [r2 + r3 * 2], m0
247
+
248
+    pmovzxbw        m0, [r0 + r4]
249
+    psllw           m0, 6
250
+    psubw           m0, m1
251
+    movu            [r2 + r5], m0
252
+
253
+    lea             r0, [r0 + r1 * 4]
254
+    lea             r2, [r2 + r3 * 4]
255
+
256
+    pmovzxbw        m0, [r0]
257
+    psllw           m0, 6
258
+    psubw           m0, m1
259
+    movu            [r2], m0
260
+
261
+    pmovzxbw        m0, [r0 + r1]
262
+    psllw           m0, 6
263
+    psubw           m0, m1
264
+    movu            [r2 + r3], m0
265
+
266
+    pmovzxbw        m0, [r0 + r1 * 2]
267
+    psllw           m0, 6
268
+    psubw           m0, m1
269
+    movu            [r2 + r3 * 2], m0
270
+
271
+    pmovzxbw        m0, [r0 + r4]
272
+    psllw           m0, 6
273
+    psubw           m0, m1
274
+    movu            [r2 + r5], m0
275
+    RET
276
+
277
+;-----------------------------------------------------------------------------
278
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
279
+;-----------------------------------------------------------------------------
280
+INIT_YMM avx2
281
+cglobal filterPixelToShort_16x24, 3, 7, 2
282
+    mov             r3d, r3m
283
+    add             r3d, r3d
284
+    lea             r4, [r1 * 3]
285
+    lea             r5, [r3 * 3]
286
+    mov             r6d, 3
287
+
288
+    ; load constant
289
+    vbroadcasti128  m1, [pw_2000]
290
+.loop:
291
+    pmovzxbw        m0, [r0]
292
+    psllw           m0, 6
293
+    psubw           m0, m1
294
+    movu            [r2], m0
295
+
296
+    pmovzxbw        m0, [r0 + r1]
297
+    psllw           m0, 6
298
+    psubw           m0, m1
299
+    movu            [r2 + r3], m0
300
+
301
+    pmovzxbw        m0, [r0 + r1 * 2]
302
+    psllw           m0, 6
303
+    psubw           m0, m1
304
+    movu            [r2 + r3 * 2], m0
305
+
306
+    pmovzxbw        m0, [r0 + r4]
307
+    psllw           m0, 6
308
+    psubw           m0, m1
309
+    movu            [r2 + r5], m0
310
+
311
+    lea             r0, [r0 + r1 * 4]
312
+    lea             r2, [r2 + r3 * 4]
313
+
314
+    pmovzxbw        m0, [r0]
315
+    psllw           m0, 6
316
+    psubw           m0, m1
317
+    movu            [r2], m0
318
+
319
+    pmovzxbw        m0, [r0 + r1]
320
+    psllw           m0, 6
321
+    psubw           m0, m1
322
+    movu            [r2 + r3], m0
323
+
324
+    pmovzxbw        m0, [r0 + r1 * 2]
325
+    psllw           m0, 6
326
+    psubw           m0, m1
327
+    movu            [r2 + r3 * 2], m0
328
+
329
+    pmovzxbw        m0, [r0 + r4]
330
+    psllw           m0, 6
331
+    psubw           m0, m1
332
+    movu            [r2 + r5], m0
333
+
334
+    lea             r0, [r0 + r1 * 4]
335
+    lea             r2, [r2 + r3 * 4]
336
+
337
+    dec             r6d
338
+    jnz             .loop
339
+    RET
340
+
341
+;-----------------------------------------------------------------------------
342
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
343
+;-----------------------------------------------------------------------------
344
+%macro P2S_H_16xN_avx2 1
345
+INIT_YMM avx2
346
+cglobal filterPixelToShort_16x%1, 3, 7, 2
347
+    mov             r3d, r3m
348
+    add             r3d, r3d
349
+    lea             r4, [r1 * 3]
350
+    lea             r5, [r3 * 3]
351
+    mov             r6d, %1/16
352
+
353
+    ; load constant
354
+    vbroadcasti128  m1, [pw_2000]
355
+.loop:
356
+    pmovzxbw        m0, [r0]
357
+    psllw           m0, 6
358
+    psubw           m0, m1
359
+    movu            [r2], m0
360
+
361
+    pmovzxbw        m0, [r0 + r1]
362
+    psllw           m0, 6
363
+    psubw           m0, m1
364
+    movu            [r2 + r3], m0
365
+
366
+    pmovzxbw        m0, [r0 + r1 * 2]
367
+    psllw           m0, 6
368
+    psubw           m0, m1
369
+    movu            [r2 + r3 * 2], m0
370
+
371
+    pmovzxbw        m0, [r0 + r4]
372
+    psllw           m0, 6
373
+    psubw           m0, m1
374
+    movu            [r2 + r5], m0
375
+
376
+    lea             r0, [r0 + r1 * 4]
377
+    lea             r2, [r2 + r3 * 4]
378
+
379
+    pmovzxbw        m0, [r0]
380
+    psllw           m0, 6
381
+    psubw           m0, m1
382
+    movu            [r2], m0
383
+
384
+    pmovzxbw        m0, [r0 + r1]
385
+    psllw           m0, 6
386
+    psubw           m0, m1
387
+    movu            [r2 + r3], m0
388
+
389
+    pmovzxbw        m0, [r0 + r1 * 2]
390
+    psllw           m0, 6
391
+    psubw           m0, m1
392
+    movu            [r2 + r3 * 2], m0
393
+
394
+    pmovzxbw        m0, [r0 + r4]
395
+    psllw           m0, 6
396
+    psubw           m0, m1
397
+    movu            [r2 + r5], m0
398
+
399
+    lea             r0, [r0 + r1 * 4]
400
+    lea             r2, [r2 + r3 * 4]
401
+
402
+    pmovzxbw        m0, [r0]
403
+    psllw           m0, 6
404
+    psubw           m0, m1
405
+    movu            [r2], m0
406
+
407
+    pmovzxbw        m0, [r0 + r1]
408
+    psllw           m0, 6
409
+    psubw           m0, m1
410
+    movu            [r2 + r3], m0
411
+
412
+    pmovzxbw        m0, [r0 + r1 * 2]
413
+    psllw           m0, 6
414
+    psubw           m0, m1
415
+    movu            [r2 + r3 * 2], m0
416
+
417
+    pmovzxbw        m0, [r0 + r4]
418
+    psllw           m0, 6
419
+    psubw           m0, m1
420
+    movu            [r2 + r5], m0
421
+
422
+    lea             r0, [r0 + r1 * 4]
423
+    lea             r2, [r2 + r3 * 4]
424
+
425
+    pmovzxbw        m0, [r0]
426
+    psllw           m0, 6
427
+    psubw           m0, m1
428
+    movu            [r2], m0
429
+
430
+    pmovzxbw        m0, [r0 + r1]
431
+    psllw           m0, 6
432
+    psubw           m0, m1
433
+    movu            [r2 + r3], m0
434
+
435
+    pmovzxbw        m0, [r0 + r1 * 2]
436
+    psllw           m0, 6
437
+    psubw           m0, m1
438
+    movu            [r2 + r3 * 2], m0
439
+
440
+    pmovzxbw        m0, [r0 + r4]
441
+    psllw           m0, 6
442
+    psubw           m0, m1
443
+    movu            [r2 + r5], m0
444
+
445
+    lea             r0, [r0 + r1 * 4]
446
+    lea             r2, [r2 + r3 * 4]
447
+
448
+    dec             r6d
449
+    jnz             .loop
450
+    RET
451
+%endmacro
452
+P2S_H_16xN_avx2 32
453
+P2S_H_16xN_avx2 64
454
+
455
+;-----------------------------------------------------------------------------
456
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
457
+;-----------------------------------------------------------------------------
458
 %macro P2S_H_32xN 1
459
 INIT_XMM ssse3
460
 cglobal filterPixelToShort_32x%1, 3, 7, 6
461
@@ -25016,67 +25469,57 @@
462
 ; void interp_4tap_horiz_ps_32x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
463
 ;-----------------------------------------------------------------------------------------------------------------------------;
464
 INIT_YMM avx2
465
-cglobal interp_4tap_horiz_ps_32x32, 4,7,6
466
+cglobal interp_4tap_horiz_ps_32x32, 4,6,8
467
     mov             r4d, r4m
468
-    mov             r5d, r5m
469
     add             r3d, r3d
470
+    dec             r0
471
 
472
-%ifdef PIC
473
-    lea               r6,           [tab_ChromaCoeff]
474
-    vpbroadcastd      m0,           [r6 + r4 * 4]
475
-%else
476
-    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
477
-%endif
478
+    ; check isRowExt
479
+    cmp             r5m, byte 0
480
 
481
-    vbroadcasti128     m2,           [pw_1]
482
-    vbroadcasti128     m5,           [pw_2000]
483
-    mova               m1,           [tab_Tm]
484
+    lea             r5, [tab_ChromaCoeff]
485
+    vpbroadcastw    m0, [r5 + r4 * 4 + 0]
486
+    vpbroadcastw    m1, [r5 + r4 * 4 + 2]
487
+    mova            m7, [pw_2000]
488
 
489
     ; register map
490
-    ; m0 - interpolate coeff
491
-    ; m1 - shuffle order table
492
-    ; m2 - constant word 1
493
-    mov                r6d,         32
494
-    dec                r0
495
-    test                r5d,      r5d
496
-    je                 .loop
497
-    sub                r0 ,         r1
498
-    add                r6d ,        3
499
+    ; m0 - interpolate coeff Low
500
+    ; m1 - interpolate coeff High
501
+    ; m7 - constant pw_2000
502
+    mov             r4d, 32
503
+    je             .loop
504
+    sub             r0, r1
505
+    add             r4d, 3
506
 
507
 .loop
508
     ; Row 0
509
-    vbroadcasti128    m3,           [r0]                        ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
510
-    pshufb            m3,           m1
511
-    pmaddubsw         m3,           m0
512
-    pmaddwd           m3,           m2
513
-    vbroadcasti128    m4,           [r0 + 8]                      ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
514
-    pshufb            m4,           m1
515
-    pmaddubsw         m4,           m0
516
-    pmaddwd           m4,           m2
517
-
518
-    packssdw          m3,           m4
519
-    psubw             m3,           m5
520
-    vpermq            m3,           m3,          11011000b
521
-    movu             [r2],         m3
522
-
523
-    vbroadcasti128    m3,           [r0 + 16]                        ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
524
-    pshufb            m3,           m1
525
-    pmaddubsw         m3,           m0
526
-    pmaddwd           m3,           m2
527
-    vbroadcasti128    m4,           [r0 + 24]                      ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
528
-    pshufb            m4,           m1
529
-    pmaddubsw         m4,           m0
530
-    pmaddwd           m4,           m2
531
-
532
-    packssdw          m3,           m4
533
-    psubw             m3,           m5
534
-    vpermq            m3,              m3,          11011000b
535
-    movu             [r2 + 32],         m3
536
-
537
-    add                r2,           r3
538
-    add                r0,           r1
539
-    dec               r6d
540
-    jnz                .loop
541
+    movu            m2, [r0]
542
+    movu            m3, [r0 + 1]
543
+    punpckhbw       m4, m2, m3
544
+    punpcklbw       m2, m3
545
+    pmaddubsw       m4, m0
546
+    pmaddubsw       m2, m0
547
+
548
+    movu            m3, [r0 + 2]
549
+    movu            m5, [r0 + 3]
550
+    punpckhbw       m6, m3, m5
551
+    punpcklbw       m3, m5
552
+    pmaddubsw       m6, m1
553
+    pmaddubsw       m3, m1
554
+
555
+    paddw           m4, m6
556
+    paddw           m2, m3
557
+    psubw           m4, m7
558
+    psubw           m2, m7
559
+    vperm2i128      m3, m2, m4, 0x20
560
+    vperm2i128      m5, m2, m4, 0x31
561
+    movu            [r2], m3
562
+    movu            [r2 + mmsize], m5
563
+
564
+    add             r2, r3
565
+    add             r0, r1
566
+    dec             r4d
567
+    jnz            .loop
568
     RET
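
The rewritten interp_4tap_horiz_ps_32x32 abandons the pshufb/tab_Tm plus pmaddwd pipeline (which accumulated in dwords and needed packssdw and vpermq fix-ups) in favour of splitting the 4-tap filter into two coefficient pairs: punpcklbw/punpckhbw interleaves each pixel with its right-hand neighbour, one pmaddubsw applies (c0, c1), a second applies (c2, c3), and the two word sums are added. Staying in 16-bit precision throughout is what removes the repack and permute steps, at the cost of two more vector registers (the cglobal line goes from 7 GPRs / 6 vector registers to 6 / 8). A scalar model of what each ps output sample computes (coeff stands in for one tab_ChromaCoeff row; src is pre-decremented as the asm's dec r0 does):

    #include <stdint.h>

    /* Scalar model of the 4-tap horizontal chroma 'ps' filter:
     * word-precision tap sum minus the pw_2000 offset (8-bit input,
     * so the PS shift is zero). */
    static void chroma_horiz_ps_row(const uint8_t *src, int16_t *dst,
                                    const int8_t coeff[4], int width)
    {
        for (int x = 0; x < width; x++)
        {
            int sum = coeff[0] * src[x]     + coeff[1] * src[x + 1]
                    + coeff[2] * src[x + 2] + coeff[3] * src[x + 3];
            dst[x] = (int16_t)(sum - 0x2000);
        }
    }
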
569
 
570
 ;-----------------------------------------------------------------------------------------------------------------------------
571
x265_1.8.tar.gz/source/common/x86/loopfilter.asm -> x265_1.9.tar.gz/source/common/x86/loopfilter.asm Changed
1676
 
1
@@ -26,24 +26,28 @@
2
 ;*****************************************************************************/
3
 
4
 %include "x86inc.asm"
5
+%include "x86util.asm"
6
 
7
 SECTION_RODATA 32
8
 pb_31:      times 32 db 31
9
 pb_124:     times 32 db 124
10
 pb_15:      times 32 db 15
11
-pb_movemask_32:  times 32 db 0x00
12
-                 times 32 db 0xFF
13
 
14
 SECTION .text
15
 cextern pb_1
16
-cextern pb_128
17
 cextern pb_2
18
+cextern pb_3
19
+cextern pb_4
20
+cextern pb_01
21
+cextern pb_128
22
+cextern pw_1
23
+cextern pw_n1
24
 cextern pw_2
25
+cextern pw_4
26
 cextern pw_pixel_max
27
 cextern pb_movemask
28
-cextern pw_1
29
+cextern pb_movemask_32
30
 cextern hmul_16p
31
-cextern pb_4
32
 
33
 
34
 ;============================================================================================================
35
@@ -1989,79 +1993,94 @@
36
 %endif
37
 
38
 ;--------------------------------------------------------------------------------------------------------------------------
39
-; saoCuStatsBO_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
40
+; saoCuStatsBO_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
41
 ;--------------------------------------------------------------------------------------------------------------------------
42
 %if ARCH_X86_64
43
 INIT_XMM sse4
44
-cglobal saoCuStatsBO, 7,12,6
45
-    mova        m3, [hmul_16p + 16]
46
-    mova        m4, [pb_124]
47
-    mova        m5, [pb_4]
48
-    xor         r7d, r7d
49
+cglobal saoCuStatsBO, 7,13,2
50
+    mova        m0, [pb_124]
51
+    add         r5, 4
52
+    add         r6, 4
53
 
54
 .loopH:
55
-    mov         r10, r0
56
+    mov         r12, r0
57
     mov         r11, r1
58
     mov         r9d, r3d
59
+
60
 .loopL:
61
     movu        m1, [r11]
62
-    movu        m0, [r10]
63
+    psrlw       m1, 1                   ; rec[x] >> boShift
64
+    pand        m1, m0
65
 
66
-    punpckhbw   m2, m0, m1
67
-    punpcklbw   m0, m1
68
-    psrlw       m1, 1               ; rec[x] >> boShift
69
-    pmaddubsw   m2, m3
70
-    pmaddubsw   m0, m3
71
-    pand        m1, m4
72
-    paddb       m1, m5
73
+    cmp         r9d, 8
74
+    jle        .proc8
75
 
76
+    movq        r10, m1
77
 %assign x 0
78
-%rep 16
79
-    pextrb      r7d, m1, x
80
+%rep 8
81
+    movzx       r7d, r10b
82
+    shr         r10, 8
83
 
84
-%if (x < 8)
85
-    pextrw      r8d, m0, (x % 8)
86
-%else
87
-    pextrw      r8d, m2, (x % 8)
88
-%endif
89
-    movsx       r8d, r8w
90
-    inc         dword  [r6 + r7]    ; count[classIdx]++
91
-    add         [r5 + r7], r8d      ; stats[classIdx] += (fenc[x] - rec[x]);
92
+    movsx       r8d, word [r12 + x*2]   ; diff[x]
93
+    inc         dword  [r6 + r7]        ; count[classIdx]++
94
+    add         [r5 + r7], r8d          ; stats[classIdx] += (fenc[x] - rec[x]);
95
+%assign x x+1
96
+%endrep
97
+    movhlps     m1, m1
98
+    sub         r9d, 8
99
+    add         r12, 8*2
100
+
101
+.proc8:
102
+    movq        r10, m1
103
+%assign x 0
104
+%rep 8
105
+    movzx       r7d, r10b
106
+    shr         r10, 8
107
+
108
+    movsx       r8d, word [r12 + x*2]   ; diff[x]
109
+    inc         dword  [r6 + r7]        ; count[classIdx]++
110
+    add         [r5 + r7], r8d          ; stats[classIdx] += (fenc[x] - rec[x]);
111
     dec         r9d
112
-    jz          .next
113
+    jz         .next
114
 %assign x x+1
115
 %endrep
116
 
117
-    add         r10, 16
118
+    add         r12, 8*2
119
     add         r11, 16
120
-    jmp         .loopL
121
+    jmp        .loopL
122
 
123
 .next:
124
-    add         r0, r2
125
+    add         r0, 64*2                ; MAX_CU_SIZE
126
     add         r1, r2
127
     dec         r4d
128
-    jnz         .loopH
129
+    jnz        .loopH
130
     RET
131
 %endif
132
 
133
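In C terms, the reworked saoCuStatsBO above reduces to the sketch below. It assumes boShift = X265_DEPTH - 5 (3 for 8-bit pixels), so the psrlw/pand pair yields 4 * classIdx as a byte offset into the int32 tables, and the `add r5, 4` / `add r6, 4` lines bias both tables by one element:

    for (int y = 0; y < endY; y++)
    {
        for (int x = 0; x < endX; x++)
        {
            int classIdx = 1 + (rec[x] >> boShift);
            stats[classIdx] += diff[x];          /* diff[x] == fenc[x] - rec[x] */
            count[classIdx]++;
        }
        rec  += stride;
        diff += MAX_CU_SIZE;                     /* diff rows are 64 samples apart */
    }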
 ;-----------------------------------------------------------------------------------------------------------------------
134
-; saoCuStatsE0(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
135
+; saoCuStatsE0(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
136
 ;-----------------------------------------------------------------------------------------------------------------------
137
 %if ARCH_X86_64
138
 INIT_XMM sse4
139
-cglobal saoCuStatsE0, 5,9,8, 0-32
140
+cglobal saoCuStatsE0, 3,10,6, 0-32
141
     mov         r3d, r3m
142
-    mov         r8, r5mp
143
+    mov         r4d, r4m
144
+    mov         r9, r5mp
145
 
146
     ; clear internal temporary buffer
147
     pxor        m0, m0
148
     mova        [rsp], m0
149
     mova        [rsp + mmsize], m0
150
     mova        m4, [pb_128]
151
-    mova        m5, [hmul_16p + 16]
152
-    mova        m6, [pb_2]
153
+    mova        m5, [pb_2]
154
     xor         r7d, r7d
155
 
156
+    ; correct stride for diff[] and rec
157
+    mov         r6d, r3d
158
+    and         r6d, ~15
159
+    sub         r2, r6
160
+    lea         r8, [(r6 - 64) * 2]             ; 64 = MAX_CU_SIZE
161
+
162
 .loopH:
163
     mov         r5d, r3d
164
 
165
@@ -2075,100 +2094,257 @@
166
     pinsrb      m0, r7d, 15
167
 
168
 .loopL:
169
-    movu        m7, [r1]
170
+    movu        m3, [r1]
171
     movu        m2, [r1 + 1]
172
 
173
-    pxor        m1, m7, m4
174
-    pxor        m3, m2, m4
175
-    pcmpgtb     m2, m1, m3
176
-    pcmpgtb     m3, m1
177
-    pand        m2, [pb_1]
178
-    por         m2, m3              ; signRight
179
+    pxor        m1, m3, m4
180
+    pxor        m2, m4
181
+    pcmpgtb     m3, m1, m2
182
+    pcmpgtb     m2, m1
183
+    pand        m3, [pb_1]
184
+    por         m2, m3                          ; signRight
185
 
186
     palignr     m3, m2, m0, 15
187
-    psignb      m3, m4              ; signLeft
188
+    psignb      m3, m4                          ; signLeft
189
 
190
     mova        m0, m2
191
     paddb       m2, m3
192
-    paddb       m2, m6              ; edgeType
193
+    paddb       m2, m5                          ; edgeType
194
 
195
     ; stats[edgeType]
196
-    movu        m3, [r0]            ; fenc[0-15]
197
-    punpckhbw   m1, m3, m7
198
-    punpcklbw   m3, m7
199
-    pmaddubsw   m1, m5
200
-    pmaddubsw   m3, m5
201
-
202
 %assign x 0
203
 %rep 16
204
     pextrb      r7d, m2, x
205
 
206
-%if (x < 8)
207
-    pextrw      r6d, m3, (x % 8)
208
-%else
209
-    pextrw      r6d, m1, (x % 8)
210
-%endif
211
-    movsx       r6d, r6w
212
+    movsx       r6d, word [r0 + x * 2]
213
     inc         word [rsp + r7 * 2]             ; tmp_count[edgeType]++
214
     add         [rsp + 5 * 2 + r7 * 4], r6d     ; tmp_stats[edgeType] += (fenc[x] - rec[x])
215
     dec         r5d
216
-    jz          .next
217
+    jz         .next
218
 %assign x x+1
219
 %endrep
220
 
221
-    add         r0q, 16
222
-    add         r1q, 16
223
-    jmp         .loopL
224
+    add         r0, 16*2
225
+    add         r1, 16
226
+    jmp        .loopL
227
 
228
 .next:
229
-    mov         r6d, r3d
230
-    and         r6d, 15
231
-
232
-    sub         r6, r3
233
-    add         r6, r2
234
-    add         r0, r6
235
-    add         r1, r6
236
+    sub         r0, r8
237
+    add         r1, r2
238
 
239
     dec         r4d
240
-    jnz         .loopH
241
+    jnz        .loopH
242
 
243
     ; sum to global buffer
244
     mov         r0, r6mp
245
 
246
     ; s_eoTable = {1, 2, 0, 3, 4}
247
-    movzx       r5d, word [rsp + 0 * 2]
248
-    add         [r0 + 1 * 4], r5d
249
-    movzx       r6d, word [rsp + 1 * 2]
250
-    add         [r0 + 2 * 4], r6d
251
-    movzx       r5d, word [rsp + 2 * 2]
252
-    add         [r0 + 0 * 4], r5d
253
-    movzx       r6d, word [rsp + 3 * 2]
254
-    add         [r0 + 3 * 4], r6d
255
+    pmovzxwd    m0, [rsp + 0 * 2]
256
+    pshufd      m0, m0, q3102
257
+    movu        m1, [r0]
258
+    paddd       m0, m1
259
+    movu        [r0], m0
260
     movzx       r5d, word [rsp + 4 * 2]
261
     add         [r0 + 4 * 4], r5d
262
 
263
-    mov         r6d, [rsp + 5 * 2 + 0 * 4]
264
-    add         [r8 + 1 * 4], r6d
265
-    mov         r5d, [rsp + 5 * 2 + 1 * 4]
266
-    add         [r8 + 2 * 4], r5d
267
-    mov         r6d, [rsp + 5 * 2 + 2 * 4]
268
-    add         [r8 + 0 * 4], r6d
269
-    mov         r5d, [rsp + 5 * 2 + 3 * 4]
270
-    add         [r8 + 3 * 4], r5d
271
+    movu        m0, [rsp + 5 * 2 + 0 * 4]
272
+    pshufd      m0, m0, q3102
273
+    movu        m1, [r9]
274
+    paddd       m0, m1
275
+    movu        [r9], m0
276
     mov         r6d, [rsp + 5 * 2 + 4 * 4]
277
-    add         [r8 + 4 * 4], r6d
278
+    add         [r9 + 4 * 4], r6d
279
+    RET
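The statistics gathered by the routine above correspond to this per-row recurrence (a C sketch in the style of the E2/E3 pseudocode further down; signOf returns -1/0/+1, and tmp_count/tmp_stats are remapped through s_eoTable = {1, 2, 0, 3, 4} at the end):

    for (int y = 0; y < endY; y++)
    {
        int signLeft = signOf(rec[0] - rec[-1]);
        for (int x = 0; x < endX; x++)
        {
            int signRight = signOf(rec[x] - rec[x + 1]);
            int edgeType  = signRight + signLeft + 2;    /* 0..4 */
            signLeft      = -signRight;
            tmp_stats[edgeType] += diff[x];
            tmp_count[edgeType]++;
        }
        rec  += stride;
        diff += MAX_CU_SIZE;
    }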
280
+
281
+
282
+;-----------------------------------------------------------------------------------------------------------------------
283
+; saoCuStatsE0(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
284
+;-----------------------------------------------------------------------------------------------------------------------
285
+INIT_YMM avx2
286
+; spends the rbp register to avoid the x86inc stack alignment problem
287
+cglobal saoCuStatsE0, 3,11,16
288
+    mov         r3d, r3m
289
+    mov         r4d, r4m
290
+    mov         r9, r5mp
291
+
292
+    ; clear internal temporary buffer
293
+    pxor        xm6, xm6                        ; count[0]
294
+    pxor        xm7, xm7                        ; count[1]
295
+    pxor        xm8, xm8                        ; count[2]
296
+    pxor        xm9, xm9                        ; count[3]
297
+    pxor        xm10, xm10                      ; count[4]
298
+    pxor        xm11, xm11                      ; stats[0]
299
+    pxor        xm12, xm12                      ; stats[1]
300
+    pxor        xm13, xm13                      ; stats[2]
301
+    pxor        xm14, xm14                      ; stats[3]
302
+    pxor        xm15, xm15                      ; stats[4]
303
+    xor         r7d, r7d
304
+
305
+    ; correct stride for diff[] and rec
306
+    mov         r6d, r3d
307
+    and         r6d, ~15
308
+    sub         r2, r6
309
+    lea         r8, [(r6 - 64) * 2]             ; 64 = MAX_CU_SIZE
310
+    lea         r10, [pb_movemask_32 + 32]
311
+
312
+.loopH:
313
+    mov         r5d, r3d
314
+
315
+    ; calculate signLeft
316
+    mov         r7b, [r1]
317
+    sub         r7b, [r1 - 1]
318
+    seta        r7b
319
+    setb        r6b
320
+    sub         r7b, r6b
321
+    neg         r7b
322
+    pinsrb      xm0, r7d, 15
323
+
324
+.loopL:
325
+    mova        m4, [pb_128]                    ; lower performance, but we don't have enough registers for stats[]
326
+    movu        xm3, [r1]
327
+    movu        xm2, [r1 + 1]
328
+
329
+    pxor        xm1, xm3, xm4
330
+    pxor        xm2, xm4
331
+    pcmpgtb     xm3, xm1, xm2
332
+    pcmpgtb     xm2, xm1
333
+    pand        xm3, [pb_1]
334
+    por         xm2, xm3                        ; signRight
335
+
336
+    palignr     xm3, xm2, xm0, 15
337
+    psignb      xm3, xm4                        ; signLeft
338
+
339
+    mova        xm0, xm2
340
+    paddb       xm2, xm3
341
+    paddb       xm2, [pb_2]                     ; edgeType
342
+
343
+    ; get current process mask
344
+    mov         r7d, 16
345
+    mov         r6d, r5d
346
+    cmp         r5d, r7d
347
+    cmovge      r6d, r7d
348
+    neg         r6
349
+    movu        xm1, [r10 + r6]
350
+
351
+    ; tmp_count[edgeType]++
352
+    ; tmp_stats[edgeType] += (fenc[x] - rec[x])
353
+    pxor        xm3, xm3
354
+    por         xm1, xm2                        ; apply unavailable pixel mask
355
+    movu        m5, [r0]                        ; up to 14 bits
356
+
357
+    pcmpeqb     xm3, xm1, xm3
358
+    psubb       xm6, xm3
359
+    pmovsxbw    m2, xm3
360
+    pmaddwd     m4, m5, m2
361
+    paddd       m11, m4
362
+
363
+    pcmpeqb     xm3, xm1, [pb_1]
364
+    psubb       xm7, xm3
365
+    pmovsxbw    m2, xm3
366
+    pmaddwd     m4, m5, m2
367
+    paddd       m12, m4
368
+
369
+    pcmpeqb     xm3, xm1, [pb_2]
370
+    psubb       xm8, xm3
371
+    pmovsxbw    m2, xm3
372
+    pmaddwd     m4, m5, m2
373
+    paddd       m13, m4
374
+
375
+    pcmpeqb     xm3, xm1, [pb_3]
376
+    psubb       xm9, xm3
377
+    pmovsxbw    m2, xm3
378
+    pmaddwd     m4, m5, m2
379
+    paddd       m14, m4
380
+
381
+    pcmpeqb     xm3, xm1, [pb_4]
382
+    psubb       xm10, xm3
383
+    pmovsxbw    m2, xm3
384
+    pmaddwd     m4, m5, m2
385
+    paddd       m15, m4
386
+
387
+    sub         r5d, r7d
388
+    jle        .next
389
+
390
+    add         r0, 16*2
391
+    add         r1, 16
392
+    jmp        .loopL
393
+
394
+.next:
395
+    sub         r0, r8
396
+    add         r1, r2
397
+
398
+    dec         r4d
399
+    jnz        .loopH
400
+
401
+    ; sum to global buffer
402
+    mov         r0, r6mp
403
+
404
+    ; sum into word
405
+    ; WARNING: there is an overflow bug when a Block64x64 has ALL pixels of the SAME type (the HM algorithm never passes Block64x64 in here)
406
+    pxor        xm0, xm0
407
+    psadbw      xm1, xm6, xm0
408
+    psadbw      xm2, xm7, xm0
409
+    psadbw      xm3, xm8, xm0
410
+    psadbw      xm4, xm9, xm0
411
+    psadbw      xm5, xm10, xm0
412
+    pshufd      xm1, xm1, q3120
413
+    pshufd      xm2, xm2, q3120
414
+    pshufd      xm3, xm3, q3120
415
+    pshufd      xm4, xm4, q3120
416
+
417
+    ; sum count[4] only
418
+    movhlps     xm6, xm5
419
+    paddd       xm5, xm6
420
+
421
+    ; sum count[s_eoTable]
422
+    ; s_eoTable = {1, 2, 0, 3, 4}
423
+    punpcklqdq  xm3, xm1
424
+    punpcklqdq  xm2, xm4
425
+    phaddd      xm3, xm2
426
+    movu        xm1, [r0]
427
+    paddd       xm3, xm1
428
+    movu        [r0], xm3
429
+    movd        r5d, xm5
430
+    add         [r0 + 4 * 4], r5d
431
+
432
+    ; sum stats[s_eoTable]
433
+    vextracti128 xm1, m11, 1
434
+    paddd       xm1, xm11
435
+    vextracti128 xm2, m12, 1
436
+    paddd       xm2, xm12
437
+    vextracti128 xm3, m13, 1
438
+    paddd       xm3, xm13
439
+    vextracti128 xm4, m14, 1
440
+    paddd       xm4, xm14
441
+    vextracti128 xm5, m15, 1
442
+    paddd       xm5, xm15
443
+
444
+    ; s_eoTable = {1, 2, 0, 3, 4}
445
+    phaddd      xm3, xm1
446
+    phaddd      xm2, xm4
447
+    phaddd      xm3, xm2
448
+    psubd       xm3, xm0, xm3               ; negate to compensate for the PMADDWD sign algorithm problem
449
+
450
+    ; sum stats[4] only
451
+    HADDD       xm5, xm6
452
+    psubd       xm5, xm0, xm5
453
+
454
+    movu        xm1, [r9]
455
+    paddd       xm3, xm1
456
+    movu        [r9], xm3
457
+    movd        r6d, xm5
458
+    add         [r9 + 4 * 4], r6d
459
     RET
460
 %endif
461
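The AVX2 variant above (and the AVX2 E1/E2/E3 routines below) replaces the per-pixel pextrb/inc chain with branchless per-class accumulation: pcmpeqb builds a 0/0xFF byte mask for each of the five edge types, psubb turns that mask into a running byte counter, and pmovsxbw/pmaddwd gather the masked diffs. Per lane the idea is roughly:

    /* sketch: per 16-pixel group, for each class k = 0..4 */
    int8_t mask = (edgeType == k) ? -1 : 0;     /* pcmpeqb                    */
    count8[k]  -= mask;                         /* psubb: adds 1 on a match   */
    stats32[k] += (int16_t)mask * diff;         /* pmaddwd: accumulates -diff */

Because the mask is -1, the stats sums come out negated (undone by the final psubd), the byte counters are widened with psadbw before being added to the global buffers, and a byte lane can only count to 255 before wrapping, which appears to be the 64x64 overflow case flagged in the WARNING comments.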
 
462
 ;-------------------------------------------------------------------------------------------------------------------------------------------
463
-; saoCuStatsE1_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
464
+; saoCuStatsE1_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
465
 ;-------------------------------------------------------------------------------------------------------------------------------------------
466
 %if ARCH_X86_64
467
 INIT_XMM sse4
468
-cglobal saoCuStatsE1, 4,12,9,0-32    ; Stack: 5 of stats and 5 of count
469
+cglobal saoCuStatsE1, 4,12,8,0-32    ; Stack: 5 of stats and 5 of count
470
     mov         r5d, r5m
471
     mov         r4d, r4m
472
-    mov         r11d, r5d
473
 
474
     ; clear internal temporary buffer
475
     pxor        m0, m0
476
@@ -2177,7 +2353,6 @@
477
     mova        m0, [pb_128]
478
     mova        m5, [pb_1]
479
     mova        m6, [pb_2]
480
-    mova        m8, [hmul_16p + 16]
481
     movh        m7, [r3 + r4]
482
 
483
 .loopH:
484
@@ -2194,11 +2369,11 @@
485
     pxor        m1, m0
486
     pxor        m2, m0
487
     pcmpgtb     m3, m1, m2
488
-    pand        m3, m5
489
     pcmpgtb     m2, m1
490
+    pand        m3, m5
491
     por         m2, m3
492
     pxor        m3, m3
493
-    psubb       m3, m2      ; -signDown
494
+    psubb       m3, m2                          ; -signDown
495
 
496
     ; edgeType
497
     movu        m4, [r11]
498
@@ -2208,26 +2383,14 @@
499
     ; update upBuff1
500
     movu        [r11], m3
501
 
502
-    ; stats[edgeType]
503
-    pxor        m1, m0
504
-    movu        m3, [r9]
505
-    punpckhbw   m4, m3, m1
506
-    punpcklbw   m3, m1
507
-    pmaddubsw   m3, m8
508
-    pmaddubsw   m4, m8
509
-
510
     ; 16 pixels
511
 %assign x 0
512
 %rep 16
513
     pextrb      r7d, m2, x
514
     inc         word [rsp + r7 * 2]
515
 
516
-  %if (x < 8)
517
-    pextrw      r8d, m3, (x % 8)
518
-  %else
519
-    pextrw      r8d, m4, (x % 8)
520
-  %endif
521
-    movsx       r8d, r8w
522
+    ; stats[edgeType]
523
+    movsx       r8d, word [r9 + x * 2]
524
     add         [rsp + 5 * 2 + r7 * 4], r8d
525
 
526
     dec         r6d
527
@@ -2235,15 +2398,678 @@
528
 %assign x x+1
529
 %endrep
530
 
531
-    add         r9, 16
532
+    add         r9, 16*2
533
+    add         r10, 16
534
+    add         r11, 16
535
+    jmp        .loopW
536
+
537
+.next:
538
+    ; restore pointer upBuff1
539
+    add         r0, 64*2                        ; MAX_CU_SIZE
540
+    add         r1, r2
541
+
542
+    dec         r5d
543
+    jg         .loopH
544
+
545
+    ; restore unavailable pixels
546
+    movh        [r3 + r4], m7
547
+
548
+    ; sum to global buffer
549
+    mov         r1, r6m
550
+    mov         r0, r7m
551
+
552
+    ; s_eoTable = {1,2,0,3,4}
553
+    pmovzxwd    m0, [rsp + 0 * 2]
554
+    pshufd      m0, m0, q3102
555
+    movu        m1, [r0]
556
+    paddd       m0, m1
557
+    movu        [r0], m0
558
+    movzx       r5d, word [rsp + 4 * 2]
559
+    add         [r0 + 4 * 4], r5d
560
+
561
+    movu        m0, [rsp + 5 * 2 + 0 * 4]
562
+    pshufd      m0, m0, q3102
563
+    movu        m1, [r1]
564
+    paddd       m0, m1
565
+    movu        [r1], m0
566
+    mov         r6d, [rsp + 5 * 2 + 4 * 4]
567
+    add         [r1 + 4 * 4], r6d
568
+    RET
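For the vertical-edge (E1) statistics above, the equivalent C recurrence carries the sign between rows through upBuff1 (a sketch; signOf as in the pseudocode below, with diff rows again MAX_CU_SIZE apart):

    for (int y = 0; y < endY; y++)
    {
        for (int x = 0; x < endX; x++)
        {
            int signDown = signOf(rec[x] - rec[x + stride]);
            int edgeType = signDown + upBuff1[x] + 2;
            upBuff1[x]   = (int8_t)(-signDown);
            tmp_stats[edgeType] += diff[x];
            tmp_count[edgeType]++;
        }
        rec  += stride;
        diff += MAX_CU_SIZE;
    }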
569
+
570
+
571
+INIT_YMM avx2
572
+cglobal saoCuStatsE1, 4,13,16       ; Stack: 5 of stats and 5 of count
573
+    mov         r5d, r5m
574
+    mov         r4d, r4m
575
+
576
+    ; clear internal temporary buffer
577
+    pxor        xm6, xm6                            ; count[0]
578
+    pxor        xm7, xm7                            ; count[1]
579
+    pxor        xm8, xm8                            ; count[2]
580
+    pxor        xm9, xm9                            ; count[3]
581
+    pxor        xm10, xm10                          ; count[4]
582
+    pxor        xm11, xm11                          ; stats[0]
583
+    pxor        xm12, xm12                          ; stats[1]
584
+    pxor        xm13, xm13                          ; stats[2]
585
+    pxor        xm14, xm14                          ; stats[3]
586
+    pxor        xm15, xm15                          ; stats[4]
587
+    mova        m0, [pb_128]
588
+    mova        m5, [pb_1]
589
+
590
+    ; save unavailable bound pixel
591
+    push  qword [r3 + r4]
592
+
593
+    ; unavailable mask
594
+    lea         r12, [pb_movemask_32 + 32]
595
+
596
+.loopH:
597
+    mov         r6d, r4d
598
+    mov         r9, r0
599
+    mov         r10, r1
600
+    mov         r11, r3
601
+
602
+.loopW:
603
+    movu        xm1, [r10]
604
+    movu        xm2, [r10 + r2]
605
+
606
+    ; signDown
607
+    pxor        xm1, xm0
608
+    pxor        xm2, xm0
609
+    pcmpgtb     xm3, xm1, xm2
610
+    pcmpgtb     xm2, xm1
611
+    pand        xm3, xm5
612
+    por         xm2, xm3
613
+    psignb      xm3, xm2, xm0                       ; -signDown
614
+
615
+    ; edgeType
616
+    movu        xm4, [r11]
617
+    paddb       xm4, [pb_2]
618
+    paddb       xm2, xm4
619
+
620
+    ; update upBuff1 (must be delayed; the code above modifies memory[r11])
621
+    movu        [r11], xm3
622
+
623
+    ; m[1-4] free in here
624
+
625
+    ; get current process group mask
626
+    mov         r7d, 16
627
+    mov         r8d, r6d
628
+    cmp         r6d, r7d
629
+    cmovge      r8d, r7d
630
+    neg         r8
631
+    movu        xm1, [r12 + r8]
632
+
633
+    ; tmp_count[edgeType]++
634
+    ; tmp_stats[edgeType] += (fenc[x] - rec[x])
635
+    pxor        xm3, xm3
636
+    por         xm1, xm2                            ; apply unavailable pixel mask
637
+    movu        m4, [r9]                            ; up to 14 bits
638
+
639
+    pcmpeqb     xm3, xm1, xm3
640
+    psubb       xm6, xm3
641
+    pmovsxbw    m2, xm3
642
+    pmaddwd     m3, m4, m2
643
+    paddd       m11, m3
644
+
645
+    pcmpeqb     xm3, xm1, xm5
646
+    psubb       xm7, xm3
647
+    pmovsxbw    m2, xm3
648
+    pmaddwd     m3, m4, m2
649
+    paddd       m12, m3
650
+
651
+    pcmpeqb     xm3, xm1, [pb_2]
652
+    psubb       xm8, xm3
653
+    pmovsxbw    m2, xm3
654
+    pmaddwd     m3, m4, m2
655
+    paddd       m13, m3
656
+
657
+    pcmpeqb     xm3, xm1, [pb_3]
658
+    psubb       xm9, xm3
659
+    pmovsxbw    m2, xm3
660
+    pmaddwd     m3, m4, m2
661
+    paddd       m14, m3
662
+
663
+    pcmpeqb     xm3, xm1, [pb_4]
664
+    psubb       xm10, xm3
665
+    pmovsxbw    m2, xm3
666
+    pmaddwd     m3, m4, m2
667
+    paddd       m15, m3
668
+
669
+    sub         r6d, r7d
670
+    jle        .next
671
+
672
+    add         r9, 16*2
673
     add         r10, 16
674
     add         r11, 16
675
+    jmp        .loopW
676
+
677
+.next:
678
+    ; restore pointer upBuff1
679
+    add         r0, 64*2                            ; MAX_CU_SIZE
680
+    add         r1, r2
681
+
682
+    dec         r5d
683
+    jg         .loopH
684
+
685
+    ; restore unavailable pixels
686
+    pop   qword [r3 + r4]
687
+
688
+    ; sum to global buffer
689
+    mov         r1, r6m
690
+    mov         r0, r7m
691
+
692
+    ; sum into word
693
+    ; WARNING: there is an overflow bug when a Block64x64 has ALL pixels of the SAME type (the HM algorithm never passes Block64x64 in here)
694
+    pxor        xm0, xm0
695
+    psadbw      xm1, xm6, xm0
696
+    psadbw      xm2, xm7, xm0
697
+    psadbw      xm3, xm8, xm0
698
+    psadbw      xm4, xm9, xm0
699
+    psadbw      xm5, xm10, xm0
700
+    pshufd      xm1, xm1, q3120
701
+    pshufd      xm2, xm2, q3120
702
+    pshufd      xm3, xm3, q3120
703
+    pshufd      xm4, xm4, q3120
704
+
705
+    ; sum count[4] only
706
+    movhlps     xm6, xm5
707
+    paddd       xm5, xm6
708
+
709
+    ; sum count[s_eoTable]
710
+    ; s_eoTable = {1, 2, 0, 3, 4}
711
+    punpcklqdq  xm3, xm1
712
+    punpcklqdq  xm2, xm4
713
+    phaddd      xm3, xm2
714
+    movu        xm1, [r0]
715
+    paddd       xm3, xm1
716
+    movu        [r0], xm3
717
+    movd        r5d, xm5
718
+    add         [r0 + 4 * 4], r5d
719
+
720
+    ; sum stats[s_eoTable]
721
+    vextracti128 xm1, m11, 1
722
+    paddd       xm1, xm11
723
+    vextracti128 xm2, m12, 1
724
+    paddd       xm2, xm12
725
+    vextracti128 xm3, m13, 1
726
+    paddd       xm3, xm13
727
+    vextracti128 xm4, m14, 1
728
+    paddd       xm4, xm14
729
+    vextracti128 xm5, m15, 1
730
+    paddd       xm5, xm15
731
+
732
+    ; s_eoTable = {1, 2, 0, 3, 4}
733
+    phaddd      xm3, xm1
734
+    phaddd      xm2, xm4
735
+    phaddd      xm3, xm2
736
+    psubd       xm3, xm0, xm3               ; negate to compensate for the PMADDWD sign algorithm problem
737
+
738
+    ; sum stats[4] only
739
+    HADDD       xm5, xm6
740
+    psubd       xm5, xm0, xm5
741
+
742
+    movu        xm1, [r1]
743
+    paddd       xm3, xm1
744
+    movu        [r1], xm3
745
+    movd        r6d, xm5
746
+    add         [r1 + 4 * 4], r6d
747
+    RET
748
+%endif ; ARCH_X86_64
749
+
750
+
751
+;void saoCuStatsE2_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
752
+;{
753
+;    X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
754
+;    X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
755
+;    int x, y;
756
+;    int32_t tmp_stats[SAO::NUM_EDGETYPE];
757
+;    int32_t tmp_count[SAO::NUM_EDGETYPE];
758
+;    memset(tmp_stats, 0, sizeof(tmp_stats));
759
+;    memset(tmp_count, 0, sizeof(tmp_count));
760
+;    for (y = 0; y < endY; y++)
761
+;    {
762
+;        upBufft[0] = signOf(rec[stride] - rec[-1]);
763
+;        for (x = 0; x < endX; x++)
764
+;        {
765
+;            int signDown = signOf2(rec[x], rec[x + stride + 1]);
766
+;            X265_CHECK(signDown == signOf(rec[x] - rec[x + stride + 1]), "signDown check failure\n");
767
+;            uint32_t edgeType = signDown + upBuff1[x] + 2;
768
+;            upBufft[x + 1] = (int8_t)(-signDown);
769
+;            tmp_stats[edgeType] += diff[x];
770
+;            tmp_count[edgeType]++;
771
+;        }
772
+;        std::swap(upBuff1, upBufft);
773
+;        rec += stride;
774
+;        diff += MAX_CU_SIZE;
775
+;    }
776
+;    for (x = 0; x < SAO::NUM_EDGETYPE; x++)
777
+;    {
778
+;        stats[SAO::s_eoTable[x]] += tmp_stats[x];
779
+;        count[SAO::s_eoTable[x]] += tmp_count[x];
780
+;    }
781
+;}
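The helpers this pseudocode assumes can be sketched as follows (the X265_CHECK above asserts that signOf2 agrees with signOf of the difference, so this form is equivalent even if the real implementation avoids the subtraction):

    static inline int signOf(int x)         { return (x > 0) - (x < 0); }
    static inline int signOf2(int a, int b) { return signOf(a - b); }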
782
+
783
+%if ARCH_X86_64
784
+; TODO: x64 only because I need temporary registers r7,r8; easily portable to x86
785
+INIT_XMM sse4
786
+cglobal saoCuStatsE2, 5,9,8,0-32    ; Stack: 5 of stats and 5 of count
787
+    mov         r5d, r5m
788
+
789
+    ; clear internal temporary buffer
790
+    pxor        m0, m0
791
+    mova        [rsp], m0
792
+    mova        [rsp + mmsize], m0
793
+    mova        m0, [pb_128]
794
+    mova        m5, [pb_1]
795
+    mova        m6, [pb_2]
796
+
797
+.loopH:
798
+    ; TODO: merge into SIMD in below
799
+    ; get upBuffX[0]
800
+    mov         r6b, [r1 + r2]
801
+    sub         r6b, [r1 -  1]
802
+    seta        r6b
803
+    setb        r7b
804
+    sub         r6b, r7b
805
+    mov         [r4], r6b
806
+
807
+    ; backup unavailable pixels
808
+    movh        m7, [r4 + r5 + 1]
809
+
810
+    mov         r6d, r5d
811
+.loopW:
812
+    movu        m1, [r1]
813
+    movu        m2, [r1 + r2 + 1]
814
+
815
+    ; signDown
816
+    ; stats[edgeType]
817
+    pxor        m1, m0
818
+    pxor        m2, m0
819
+    pcmpgtb     m3, m1, m2
820
+    pand        m3, m5
821
+    pcmpgtb     m2, m1
822
+    por         m2, m3
823
+    pxor        m3, m3
824
+    psubb       m3, m2
825
+
826
+    ; edgeType
827
+    movu        m4, [r3]
828
+    paddb       m4, m6
829
+    paddb       m2, m4
830
+
831
+    ; update upBuff1
832
+    movu        [r4 + 1], m3
833
+
834
+    ; 16 pixels
835
+%assign x 0
836
+%rep 16
837
+    pextrb      r7d, m2, x
838
+    inc    word [rsp + r7 * 2]
839
+
840
+    movsx       r8d, word [r0 + x * 2]
841
+    add         [rsp + 5 * 2 + r7 * 4], r8d
842
+
843
+    dec         r6d
844
+    jz         .next
845
+%assign x x+1
846
+%endrep
847
+
848
+    add         r0, 16*2
849
+    add         r1, 16
850
+    add         r3, 16
851
+    add         r4, 16
852
+    jmp        .loopW
853
+
854
+.next:
855
+    xchg        r3, r4
856
+
857
+    ; restore pointer upBuff1
858
+    mov         r6d, r5d
859
+    and         r6d, ~15
860
+    neg         r6                              ; MUST BE 64-bit, it is negative
861
+
862
+    ; move to next row
863
+
864
+    ; move back to start point
865
+    add         r3, r6
866
+    add         r4, r6
867
+
868
+    ; adjust with stride
869
+    lea         r0, [r0 + (r6 + 64) * 2]        ; 64 = MAX_CU_SIZE
870
+    add         r1, r2
871
+    add         r1, r6
872
+
873
+    ; restore unavailable pixels
874
+    movh        [r3 + r5 + 1], m7
875
+
876
+    dec    byte r6m
877
+    jg         .loopH
878
+
879
+    ; sum to global buffer
880
+    mov         r1, r7m
881
+    mov         r0, r8m
882
+
883
+    ; s_eoTable = {1,2,0,3,4}
884
+    pmovzxwd    m0, [rsp + 0 * 2]
885
+    pshufd      m0, m0, q3102
886
+    movu        m1, [r0]
887
+    paddd       m0, m1
888
+    movu        [r0], m0
889
+    movzx       r5d, word [rsp + 4 * 2]
890
+    add         [r0 + 4 * 4], r5d
891
+
892
+    movu        m0, [rsp + 5 * 2 + 0 * 4]
893
+    pshufd      m0, m0, q3102
894
+    movu        m1, [r1]
895
+    paddd       m0, m1
896
+    movu        [r1], m0
897
+    mov         r6d, [rsp + 5 * 2 + 4 * 4]
898
+    add         [r1 + 4 * 4], r6d
899
+    RET
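The .next pointer arithmetic above (shared by the E3 kernels below) undoes the whole 16-pixel-group advances before stepping to the next row; in C terms (a sketch matching the `64 = MAX_CU_SIZE` comments):

    intptr_t rewind = endX & ~15;       /* distance covered by full groups */
    upBuff1 -= rewind;                  /* after the xchg / std::swap      */
    upBufft -= rewind;
    rec     += stride - rewind;
    diff    += MAX_CU_SIZE - rewind;    /* diff rows are 64 int16_t wide   */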
900
+
901
+
902
+INIT_YMM avx2
903
+cglobal saoCuStatsE2, 5,10,16                        ; Stack: 5 of stats and 5 of count
904
+    mov         r5d, r5m
905
+
906
+    ; clear internal temporary buffer
907
+    pxor        xm6, xm6                            ; count[0]
908
+    pxor        xm7, xm7                            ; count[1]
909
+    pxor        xm8, xm8                            ; count[2]
910
+    pxor        xm9, xm9                            ; count[3]
911
+    pxor        xm10, xm10                          ; count[4]
912
+    pxor        xm11, xm11                          ; stats[0]
913
+    pxor        xm12, xm12                          ; stats[1]
914
+    pxor        xm13, xm13                          ; stats[2]
915
+    pxor        xm14, xm14                          ; stats[3]
916
+    pxor        xm15, xm15                          ; stats[4]
917
+    mova        m0, [pb_128]
918
+
919
+    ; unavailable mask
920
+    lea         r9, [pb_movemask_32 + 32]
921
+
922
+.loopH:
923
+    ; TODO: merge into SIMD in below
924
+    ; get upBuffX[0]
925
+    mov         r6b, [r1 + r2]
926
+    sub         r6b, [r1 -  1]
927
+    seta        r6b
928
+    setb        r7b
929
+    sub         r6b, r7b
930
+    mov         [r4], r6b
931
+
932
+    ; backup unavailable pixels
933
+    movq        xm5, [r4 + r5 + 1]
934
+
935
+    mov         r6d, r5d
936
+.loopW:
937
+    movu        m1, [r1]
938
+    movu        m2, [r1 + r2 + 1]
939
+
940
+    ; signDown
941
+    ; stats[edgeType]
942
+    pxor        xm1, xm0
943
+    pxor        xm2, xm0
944
+    pcmpgtb     xm3, xm1, xm2
945
+    pand        xm3, [pb_1]
946
+    pcmpgtb     xm2, xm1
947
+    por         xm2, xm3
948
+    psignb      xm3, xm2, xm0
949
+
950
+    ; edgeType
951
+    movu        xm4, [r3]
952
+    paddb       xm4, [pb_2]
953
+    paddb       xm2, xm4
954
+
955
+    ; update upBuff1
956
+    movu        [r4 + 1], xm3
957
+
958
+    ; m[1-4] free in here
959
+
960
+    ; get current process group mask
961
+    mov         r7d, 16
962
+    mov         r8d, r6d
963
+    cmp         r6d, r7d
964
+    cmovge      r8d, r7d
965
+    neg         r8
966
+    movu        xm1, [r9 + r8]
967
+
968
+    ; tmp_count[edgeType]++
969
+    ; tmp_stats[edgeType] += (fenc[x] - rec[x])
970
+    pxor        xm3, xm3
971
+    por         xm1, xm2                            ; apply unavailable pixel mask
972
+    movu        m4, [r0]                            ; up to 14 bits
973
+
974
+    pcmpeqb     xm3, xm1, xm3
975
+    psubb       xm6, xm3
976
+    pmovsxbw    m2, xm3
977
+    pmaddwd     m3, m4, m2
978
+    paddd       m11, m3
979
+
980
+    pcmpeqb     xm3, xm1, [pb_1]
981
+    psubb       xm7, xm3
982
+    pmovsxbw    m2, xm3
983
+    pmaddwd     m3, m4, m2
984
+    paddd       m12, m3
985
+
986
+    pcmpeqb     xm3, xm1, [pb_2]
987
+    psubb       xm8, xm3
988
+    pmovsxbw    m2, xm3
989
+    pmaddwd     m3, m4, m2
990
+    paddd       m13, m3
991
+
992
+    pcmpeqb     xm3, xm1, [pb_3]
993
+    psubb       xm9, xm3
994
+    pmovsxbw    m2, xm3
995
+    pmaddwd     m3, m4, m2
996
+    paddd       m14, m3
997
+
998
+    pcmpeqb     xm3, xm1, [pb_4]
999
+    psubb       xm10, xm3
1000
+    pmovsxbw    m2, xm3
1001
+    pmaddwd     m3, m4, m2
1002
+    paddd       m15, m3
1003
+
1004
+    sub         r6d, r7d
1005
+    jle        .next
1006
+
1007
+    add         r0, 16*2
1008
+    add         r1, 16
1009
+    add         r3, 16
1010
+    add         r4, 16
1011
+    jmp        .loopW
1012
+
1013
+.next:
1014
+    xchg        r3, r4
1015
+
1016
+    ; restore pointer upBuff1
1017
+    ; TODO: BZHI
1018
+    mov         r6d, r5d
1019
+    and         r6d, ~15
1020
+    neg         r6                              ; MUST BE 64-bit, it is negative
1021
+
1022
+    ; move to next row
1023
+
1024
+    ; move back to start point
1025
+    add         r3, r6
1026
+    add         r4, r6
1027
+
1028
+    ; adjust with stride
1029
+    lea         r0, [r0 + (r6 + 64) * 2]        ; 64 = MAX_CU_SIZE
1030
+    add         r1, r2
1031
+    add         r1, r6
1032
+
1033
+    ; restore unavailable pixels
1034
+    movq        [r3 + r5 + 1], xm5
1035
+
1036
+    dec    byte r6m
1037
+    jg         .loopH
1038
+
1039
+    ; sum to global buffer
1040
+    mov         r1, r7m
1041
+    mov         r0, r8m
1042
+
1043
+    ; sum into word
1044
+    ; WARNING: there is an overflow bug when a Block64x64 has ALL pixels of the SAME type (the HM algorithm never passes Block64x64 in here)
1045
+    pxor        xm0, xm0
1046
+    psadbw      xm1, xm6, xm0
1047
+    psadbw      xm2, xm7, xm0
1048
+    psadbw      xm3, xm8, xm0
1049
+    psadbw      xm4, xm9, xm0
1050
+    psadbw      xm5, xm10, xm0
1051
+    pshufd      xm1, xm1, q3120
1052
+    pshufd      xm2, xm2, q3120
1053
+    pshufd      xm3, xm3, q3120
1054
+    pshufd      xm4, xm4, q3120
1055
+
1056
+    ; sum count[4] only
1057
+    movhlps     xm6, xm5
1058
+    paddd       xm5, xm6
1059
+
1060
+    ; sum count[s_eoTable]
1061
+    ; s_eoTable = {1, 2, 0, 3, 4}
1062
+    punpcklqdq  xm3, xm1
1063
+    punpcklqdq  xm2, xm4
1064
+    phaddd      xm3, xm2
1065
+    movu        xm1, [r0]
1066
+    paddd       xm3, xm1
1067
+    movu        [r0], xm3
1068
+    movd        r5d, xm5
1069
+    add         [r0 + 4 * 4], r5d
1070
+
1071
+    ; sum stats[s_eoTable]
1072
+    vextracti128 xm1, m11, 1
1073
+    paddd       xm1, xm11
1074
+    vextracti128 xm2, m12, 1
1075
+    paddd       xm2, xm12
1076
+    vextracti128 xm3, m13, 1
1077
+    paddd       xm3, xm13
1078
+    vextracti128 xm4, m14, 1
1079
+    paddd       xm4, xm14
1080
+    vextracti128 xm5, m15, 1
1081
+    paddd       xm5, xm15
1082
+
1083
+    ; s_eoTable = {1, 2, 0, 3, 4}
1084
+    phaddd      xm3, xm1
1085
+    phaddd      xm2, xm4
1086
+    phaddd      xm3, xm2
1087
+    psubd       xm3, xm0, xm3               ; negate to compensate for the PMADDWD sign algorithm problem
1088
+
1089
+    ; sum stats[4] only
1090
+    HADDD       xm5, xm6
1091
+    psubd       xm5, xm0, xm5
1092
+
1093
+    movu        xm1, [r1]
1094
+    paddd       xm3, xm1
1095
+    movu        [r1], xm3
1096
+    movd        r6d, xm5
1097
+    add         [r1 + 4 * 4], r6d
1098
+    RET
1099
+%endif ; ARCH_X86_64
1100
+
1101
+
1102
+;void saoCuStatsE3_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
1103
+;{
1104
+;    memset(tmp_stats, 0, sizeof(tmp_stats));
1105
+;    memset(tmp_count, 0, sizeof(tmp_count));
1106
+;    for (y = startY; y < endY; y++)
1107
+;    {
1108
+;        for (x = startX; x < endX; x++)
1109
+;        {
1110
+;            int signDown = signOf2(rec[x], rec[x + stride - 1]);
1111
+;            uint32_t edgeType = signDown + upBuff1[x] + 2;
1112
+;            upBuff1[x - 1] = (int8_t)(-signDown);
1113
+;            tmp_stats[edgeType] += diff[x];
1114
+;            tmp_count[edgeType]++;
1115
+;        }
1116
+;        upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
1117
+;        rec += stride;
1118
+;        diff += MAX_CU_SIZE;
1119
+;    }
1120
+;    for (x = 0; x < NUM_EDGETYPE; x++)
1121
+;    {
1122
+;        stats[s_eoTable[x]] += tmp_stats[x];
1123
+;        count[s_eoTable[x]] += tmp_count[x];
1124
+;    }
1125
+;}
1126
+
1127
+%if ARCH_X86_64
1128
+INIT_XMM sse4
1129
+cglobal saoCuStatsE3, 4,9,8,0-32    ; Stack: 5 of stats and 5 of count
1130
+    mov         r4d, r4m
1131
+    mov         r5d, r5m
1132
+
1133
+    ; clear internal temporary buffer
1134
+    pxor        m0, m0
1135
+    mova        [rsp], m0
1136
+    mova        [rsp + mmsize], m0
1137
+    mova        m0, [pb_128]
1138
+    mova        m5, [pb_1]
1139
+    mova        m6, [pb_2]
1140
+    movh        m7, [r3 + r4]
1141
+
1142
+.loopH:
1143
+    mov         r6d, r4d
1144
+
1145
+.loopW:
1146
+    movu        m1, [r1]
1147
+    movu        m2, [r1 + r2 - 1]
1148
+
1149
+    ; signDown
1150
+    pxor        m1, m0
1151
+    pxor        m2, m0
1152
+    pcmpgtb     m3, m1, m2
1153
+    pand        m3, m5
1154
+    pcmpgtb     m2, m1
1155
+    por         m2, m3
1156
+    pxor        m3, m3
1157
+    psubb       m3, m2
1158
+
1159
+    ; edgeType
1160
+    movu        m4, [r3]
1161
+    paddb       m4, m6
1162
+    paddb       m2, m4
1163
+
1164
+    ; update upBuff1
1165
+    movu        [r3 - 1], m3
1166
+
1167
+    ; stats[edgeType]
1168
+    pxor        m1, m0
1169
+
1170
+    ; 16 pixels
1171
+%assign x 0
1172
+%rep 16
1173
+    pextrb      r7d, m2, x
1174
+    inc    word [rsp + r7 * 2]
1175
+
1176
+    movsx       r8d, word [r0 + x * 2]
1177
+    add         [rsp + 5 * 2 + r7 * 4], r8d
1178
+
1179
+    dec         r6d
1180
+    jz         .next
1181
+%assign x x+1
1182
+%endrep
1183
+
1184
+    add         r0, 16*2
1185
+    add         r1, 16
1186
+    add         r3, 16
1187
     jmp         .loopW
1188
 
1189
 .next:
1190
     ; restore pointer upBuff1
1191
-    add         r0, r2
1192
+    mov         r6d, r4d
1193
+    and         r6d, ~15
1194
+    neg         r6                              ; MUST BE 64-bit, it is negative
1195
+
1196
+    ; move to next row
1197
+
1198
+    ; move back to start point
1199
+    add         r3, r6
1200
+
1201
+    ; adjust with stride
1202
+    lea         r0, [r0 + (r6 + 64) * 2]        ; 64 = MAX_CU_SIZE
1203
     add         r1, r2
1204
+    add         r1, r6
1205
 
1206
     dec         r5d
1207
     jg         .loopH
1208
@@ -2256,26 +3082,448 @@
1209
     mov         r0, r7m
1210
 
1211
     ; s_eoTable = {1,2,0,3,4}
1212
-    movzx       r6d, word [rsp + 0 * 2]
1213
-    add         [r0 + 1 * 4], r6d
1214
-    movzx       r6d, word [rsp + 1 * 2]
1215
-    add         [r0 + 2 * 4], r6d
1216
-    movzx       r6d, word [rsp + 2 * 2]
1217
-    add         [r0 + 0 * 4], r6d
1218
-    movzx       r6d, word [rsp + 3 * 2]
1219
-    add         [r0 + 3 * 4], r6d
1220
-    movzx       r6d, word [rsp + 4 * 2]
1221
-    add         [r0 + 4 * 4], r6d
1222
-
1223
-    mov         r6d, [rsp + 5 * 2 + 0 * 4]
1224
-    add         [r1 + 1 * 4], r6d
1225
-    mov         r6d, [rsp + 5 * 2 + 1 * 4]
1226
-    add         [r1 + 2 * 4], r6d
1227
-    mov         r6d, [rsp + 5 * 2 + 2 * 4]
1228
-    add         [r1 + 0 * 4], r6d
1229
-    mov         r6d, [rsp + 5 * 2 + 3 * 4]
1230
-    add         [r1 + 3 * 4], r6d
1231
+    pmovzxwd    m0, [rsp + 0 * 2]
1232
+    pshufd      m0, m0, q3102
1233
+    movu        m1, [r0]
1234
+    paddd       m0, m1
1235
+    movu        [r0], m0
1236
+    movzx       r5d, word [rsp + 4 * 2]
1237
+    add         [r0 + 4 * 4], r5d
1238
+
1239
+    movu        m0, [rsp + 5 * 2 + 0 * 4]
1240
+    pshufd      m0, m0, q3102
1241
+    movu        m1, [r1]
1242
+    paddd       m0, m1
1243
+    movu        [r1], m0
1244
     mov         r6d, [rsp + 5 * 2 + 4 * 4]
1245
     add         [r1 + 4 * 4], r6d
1246
     RET
1247
+
1248
+
1249
+INIT_YMM avx2
1250
+cglobal saoCuStatsE3, 4,10,16           ; Stack: 5 of stats and 5 of count
1251
+    mov         r4d, r4m
1252
+    mov         r5d, r5m
1253
+
1254
+    ; clear internal temporary buffer
1255
+    pxor        xm6, xm6                            ; count[0]
1256
+    pxor        xm7, xm7                            ; count[1]
1257
+    pxor        xm8, xm8                            ; count[2]
1258
+    pxor        xm9, xm9                            ; count[3]
1259
+    pxor        xm10, xm10                          ; count[4]
1260
+    pxor        xm11, xm11                          ; stats[0]
1261
+    pxor        xm12, xm12                          ; stats[1]
1262
+    pxor        xm13, xm13                          ; stats[2]
1263
+    pxor        xm14, xm14                          ; stats[3]
1264
+    pxor        xm15, xm15                          ; stats[4]
1265
+    mova        m0, [pb_128]
1266
+
1267
+    ; unavailable mask
1268
+    lea         r9, [pb_movemask_32 + 32]
1269
+    push  qword [r3 + r4]
1270
+
1271
+.loopH:
1272
+    mov         r6d, r4d
1273
+
1274
+.loopW:
1275
+    movu        m1, [r1]
1276
+    movu        m2, [r1 + r2 - 1]
1277
+
1278
+    ; signDown
1279
+    ; stats[edgeType]
1280
+    pxor        xm1, xm0
1281
+    pxor        xm2, xm0
1282
+    pcmpgtb     xm3, xm1, xm2
1283
+    pand        xm3, [pb_1]
1284
+    pcmpgtb     xm2, xm1
1285
+    por         xm2, xm3
1286
+    pxor        xm3, xm3
1287
+    psubb       xm3, xm2
1288
+
1289
+    ; edgeType
1290
+    movu        xm4, [r3]
1291
+    paddb       xm4, [pb_2]
1292
+    paddb       xm2, xm4
1293
+
1294
+    ; update upBuff1
1295
+    movu        [r3 - 1], xm3
1296
+
1297
+    ; m[1-4] free in here
1298
+
1299
+    ; get current process group mask
1300
+    mov         r7d, 16
1301
+    mov         r8d, r6d
1302
+    cmp         r6d, r7d
1303
+    cmovge      r8d, r7d
1304
+    neg         r8
1305
+    movu        xm1, [r9 + r8]
1306
+
1307
+    ; tmp_count[edgeType]++
1308
+    ; tmp_stats[edgeType] += (fenc[x] - rec[x])
1309
+    pxor        xm3, xm3
1310
+    por         xm1, xm2                            ; apply unavailable pixel mask
1311
+    movu        m4, [r0]                            ; up to 14 bits
1312
+
1313
+    pcmpeqb     xm3, xm1, xm3
1314
+    psubb       xm6, xm3
1315
+    pmovsxbw    m2, xm3
1316
+    pmaddwd     m3, m4, m2
1317
+    paddd       m11, m3
1318
+
1319
+    pcmpeqb     xm3, xm1, [pb_1]
1320
+    psubb       xm7, xm3
1321
+    pmovsxbw    m2, xm3
1322
+    pmaddwd     m3, m4, m2
1323
+    paddd       m12, m3
1324
+
1325
+    pcmpeqb     xm3, xm1, [pb_2]
1326
+    psubb       xm8, xm3
1327
+    pmovsxbw    m2, xm3
1328
+    pmaddwd     m3, m4, m2
1329
+    paddd       m13, m3
1330
+
1331
+    pcmpeqb     xm3, xm1, [pb_3]
1332
+    psubb       xm9, xm3
1333
+    pmovsxbw    m2, xm3
1334
+    pmaddwd     m3, m4, m2
1335
+    paddd       m14, m3
1336
+
1337
+    pcmpeqb     xm3, xm1, [pb_4]
1338
+    psubb       xm10, xm3
1339
+    pmovsxbw    m2, xm3
1340
+    pmaddwd     m3, m4, m2
1341
+    paddd       m15, m3
1342
+
1343
+    sub         r6d, r7d
1344
+    jle        .next
1345
+
1346
+    add         r0, 16*2
1347
+    add         r1, 16
1348
+    add         r3, 16
1349
+    jmp        .loopW
1350
+
1351
+.next:
1352
+    ; restore pointer upBuff1
1353
+    mov         r6d, r4d
1354
+    and         r6d, ~15
1355
+    neg         r6                              ; MUST BE 64-bit, it is negative
1356
+
1357
+    ; move to next row
1358
+
1359
+    ; move back to start point
1360
+    add         r3, r6
1361
+
1362
+    ; adjust with stride
1363
+    lea         r0, [r0 + (r6 + 64) * 2]        ; 64 = MAX_CU_SIZE
1364
+    add         r1, r2
1365
+    add         r1, r6
1366
+
1367
+    dec         r5d
1368
+    jg         .loopH
1369
+
1370
+    ; restore unavailable pixels
1371
+    pop   qword [r3 + r4]
1372
+
1373
+    ; sum to global buffer
1374
+    mov         r1, r6m
1375
+    mov         r0, r7m
1376
+
1377
+    ; sum into word
1378
+    ; WARNING: there is an overflow bug when a Block64x64 has ALL pixels of the SAME type (the HM algorithm never passes Block64x64 in here)
1379
+    pxor        xm0, xm0
1380
+    psadbw      xm1, xm6, xm0
1381
+    psadbw      xm2, xm7, xm0
1382
+    psadbw      xm3, xm8, xm0
1383
+    psadbw      xm4, xm9, xm0
1384
+    psadbw      xm5, xm10, xm0
1385
+    pshufd      xm1, xm1, q3120
1386
+    pshufd      xm2, xm2, q3120
1387
+    pshufd      xm3, xm3, q3120
1388
+    pshufd      xm4, xm4, q3120
1389
+
1390
+    ; sum count[4] only
1391
+    movhlps     xm6, xm5
1392
+    paddd       xm5, xm6
1393
+
1394
+    ; sum count[s_eoTable]
1395
+    ; s_eoTable = {1, 2, 0, 3, 4}
1396
+    punpcklqdq  xm3, xm1
1397
+    punpcklqdq  xm2, xm4
1398
+    phaddd      xm3, xm2
1399
+    movu        xm1, [r0]
1400
+    paddd       xm3, xm1
1401
+    movu        [r0], xm3
1402
+    movd        r5d, xm5
1403
+    add         [r0 + 4 * 4], r5d
1404
+
1405
+    ; sum stats[s_eoTable]
1406
+    vextracti128 xm1, m11, 1
1407
+    paddd       xm1, xm11
1408
+    vextracti128 xm2, m12, 1
1409
+    paddd       xm2, xm12
1410
+    vextracti128 xm3, m13, 1
1411
+    paddd       xm3, xm13
1412
+    vextracti128 xm4, m14, 1
1413
+    paddd       xm4, xm14
1414
+    vextracti128 xm5, m15, 1
1415
+    paddd       xm5, xm15
1416
+
1417
+    ; s_eoTable = {1, 2, 0, 3, 4}
1418
+    phaddd      xm3, xm1
1419
+    phaddd      xm2, xm4
1420
+    phaddd      xm3, xm2
1421
+    psubd       xm3, xm0, xm3               ; negate to compensate for the PMADDWD sign algorithm problem
1422
+
1423
+    ; sum stats[4] only
1424
+    HADDD       xm5, xm6
1425
+    psubd       xm5, xm0, xm5
1426
+
1427
+    movu        xm1, [r1]
1428
+    paddd       xm3, xm1
1429
+    movu        [r1], xm3
1430
+    movd        r6d, xm5
1431
+    add         [r1 + 4 * 4], r6d
1432
+    RET
1433
+%endif ; ARCH_X86_64
1434
+
1435
+
1436
+%if ARCH_X86_64
1437
+;; argument registers used -
1438
+; r0    - src
1439
+; r1    - srcStep
1440
+; r2    - offset
1441
+; r3    - tcP
1442
+; r4    - tcQ
1443
+
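Both pelFilterLumaStrong variants evaluate the standard HEVC strong deblocking filter and clamp each correction into the tc range passed by the caller. A C sketch of the P side (the Q side is symmetric with tcQ; p0..p3 sit at src[-offset]..src[-4*offset] and q0..q3 at src[0]..src[3*offset]):

    static inline int clip3(int lo, int hi, int v) { return v < lo ? lo : (v > hi ? hi : v); }

    p0out = p0 + clip3(-tcP, tcP, ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3) - p0);
    p1out = p1 + clip3(-tcP, tcP, ((p2 + p1 + p0 + q0 + 2)            >> 2) - p1);
    p2out = p2 + clip3(-tcP, tcP, ((2*p3 + 3*p2 + p1 + p0 + q0 + 4)   >> 3) - p2);

The _V variant transposes the narrow block in registers first, so the same arithmetic serves both edge orientations.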
1444
+INIT_XMM sse4
1445
+cglobal pelFilterLumaStrong_H, 5,7,10
1446
+    mov             r1, r2
1447
+    neg             r3d
1448
+    neg             r4d
1449
+    neg             r1
1450
+
1451
+    lea             r5, [r2 * 3]
1452
+    lea             r6, [r1 * 3]
1453
+
1454
+    pmovzxbw        m4, [r0]                ; src[0]
1455
+    pmovzxbw        m3, [r0 + r1]           ; src[-offset]
1456
+    pmovzxbw        m2, [r0 + r1 * 2]       ; src[-offset * 2]
1457
+    pmovzxbw        m1, [r0 + r6]           ; src[-offset * 3]
1458
+    pmovzxbw        m0, [r0 + r1 * 4]       ; src[-offset * 4]
1459
+    pmovzxbw        m5, [r0 + r2]           ; src[offset]
1460
+    pmovzxbw        m6, [r0 + r2 * 2]       ; src[offset * 2]
1461
+    pmovzxbw        m7, [r0 + r5]           ; src[offset * 3]
1462
+
1463
+    paddw           m0, m0                  ; m0*2
1464
+    mova            m8, m2
1465
+    paddw           m8, m3                  ; m2 + m3
1466
+    paddw           m8, m4                  ; m2 + m3 + m4
1467
+    mova            m9, m8
1468
+    paddw           m9, m9                  ; 2*m2 + 2*m3 + 2*m4
1469
+    paddw           m8, m1                  ; m2 + m3 + m4 + m1
1470
+    paddw           m0, m8                  ; 2*m0 + m2+ m3 + m4 + m1
1471
+    paddw           m9, m1
1472
+    paddw           m0, m1
1473
+    paddw           m9, m5                  ; m1 + 2*m2 + 2*m3 + 2*m4 + m5
1474
+    paddw           m0, m1                  ; 2*m0 + 3*m1 + m2 + m3 + m4
1475
+
1476
+    punpcklqdq      m0, m9
1477
+    punpcklqdq      m1, m3
1478
+
1479
+    paddw           m3, m4
1480
+    mova            m9, m5
1481
+    paddw           m9, m6
1482
+    paddw           m7, m7                  ; 2*m7
1483
+    paddw           m9, m3                  ; m3 + m4 + m5 + m6
1484
+    mova            m3, m9
1485
+    paddw           m3, m3                  ; 2*m3 + 2*m4 + 2*m5 + 2*m6
1486
+    paddw           m7, m9                  ; 2*m7 + m3 + m4 + m5 + m6
1487
+    paddw           m7, m6
1488
+    psubw           m3, m6                  ; 2*m3 + 2*m4 + 2*m5 + m6
1489
+    paddw           m7, m6                  ; m3 + m4 + m5 + 3*m6 + 2*m7
1490
+    paddw           m3, m2                  ; m2 + 2*m3 + 2*m4 + 2*m5 + m6
1491
+
1492
+    punpcklqdq      m9, m8
1493
+    punpcklqdq      m3, m7
1494
+    punpcklqdq      m5, m2
1495
+    punpcklqdq      m4, m6
1496
+
1497
+    movd            m7, r3d                 ; -tcP
1498
+    movd            m2, r4d                 ; -tcQ
1499
+    pshufb          m7, [pb_01]
1500
+    pshufb          m2, [pb_01]
1501
+    mova            m6, m2
1502
+    punpcklqdq      m6, m7
1503
+
1504
+    paddw           m0, [pw_4]
1505
+    paddw           m3, [pw_4]
1506
+    paddw           m9, [pw_2]
1507
+
1508
+    psraw           m0, 3
1509
+    psraw           m3, 3
1510
+    psraw           m9, 2
1511
+
1512
+    psubw           m0, m1
1513
+    psubw           m3, m4
1514
+    psubw           m9, m5
1515
+
1516
+    pmaxsw          m0, m7
1517
+    pmaxsw          m3, m2
1518
+    pmaxsw          m9, m6
1519
+    psignw          m7, [pw_n1]
1520
+    psignw          m2, [pw_n1]
1521
+    psignw          m6, [pw_n1]
1522
+    pminsw          m0, m7
1523
+    pminsw          m3, m2
1524
+    pminsw          m9, m6
1525
+
1526
+    paddw           m0, m1
1527
+    paddw           m3, m4
1528
+    paddw           m9, m5
1529
+    packuswb        m0, m0
1530
+    packuswb        m3, m9
1531
+
1532
+    movd            [r0 + r6], m0
1533
+    pextrd          [r0 + r1], m0, 1
1534
+    movd            [r0], m3
1535
+    pextrd          [r0 + r2 * 2], m3, 1
1536
+    pextrd          [r0 + r2 * 1], m3, 2
1537
+    pextrd          [r0 + r1 * 2], m3, 3
1538
+    RET
1539
+
1540
+INIT_XMM sse4
1541
+cglobal pelFilterLumaStrong_V, 5,5,10
1542
+    neg             r3d
1543
+    neg             r4d
1544
+    lea             r2, [r1 * 3]
1545
+
1546
+    movh            m0, [r0 - 4]            ; src[-offset * 4] row 0
1547
+    movh            m1, [r0 + r1 * 1 - 4]   ; src[-offset * 4] row 1
1548
+    movh            m2, [r0 + r1 * 2 - 4]   ; src[-offset * 4] row 2
1549
+    movh            m3, [r0 + r2 * 1 - 4]   ; src[-offset * 4] row 3
1550
+
1551
+    punpcklbw       m0, m1
1552
+    punpcklbw       m2, m3
1553
+    mova            m4, m0
1554
+    punpcklwd       m0, m2
1555
+    punpckhwd       m4, m2
1556
+    mova            m1, m0
1557
+    mova            m2, m0
1558
+    mova            m3, m0
1559
+    pshufd          m0, m0, 0
1560
+    pshufd          m1, m1, 1
1561
+    pshufd          m2, m2, 2
1562
+    pshufd          m3, m3, 3
1563
+    mova            m5, m4
1564
+    mova            m6, m4
1565
+    mova            m7, m4
1566
+    pshufd          m4, m4, 0
1567
+    pshufd          m5, m5, 1
1568
+    pshufd          m6, m6, 2
1569
+    pshufd          m7, m7, 3
1570
+    pmovzxbw        m0, m0
1571
+    pmovzxbw        m1, m1
1572
+    pmovzxbw        m2, m2
1573
+    pmovzxbw        m3, m3
1574
+    pmovzxbw        m4, m4
1575
+    pmovzxbw        m5, m5
1576
+    pmovzxbw        m6, m6
1577
+    pmovzxbw        m7, m7
1578
+
1579
+    paddw           m0, m0                  ; m0*2
1580
+    mova            m8, m2
1581
+    paddw           m8, m3                  ; m2 + m3
1582
+    paddw           m8, m4                  ; m2 + m3 + m4
1583
+    mova            m9, m8
1584
+    paddw           m9, m9                  ; 2*m2 + 2*m3 + 2*m4
1585
+    paddw           m8, m1                  ; m2 + m3 + m4 + m1
1586
+    paddw           m0, m8                  ; 2*m0 + m2+ m3 + m4 + m1
1587
+    paddw           m9, m1
1588
+    paddw           m0, m1
1589
+    paddw           m9, m5                  ; m1 + 2*m2 + 2*m3 + 2*m4 + m5
1590
+    paddw           m0, m1                  ; 2*m0 + 3*m1 + m2 + m3 + m4
1591
+
1592
+    punpcklqdq      m0, m9
1593
+    punpcklqdq      m1, m3
1594
+
1595
+    paddw           m3, m4
1596
+    mova            m9, m5
1597
+    paddw           m9, m6
1598
+    paddw           m7, m7                  ; 2*m7
1599
+    paddw           m9, m3                  ; m3 + m4 + m5 + m6
1600
+    mova            m3, m9
1601
+    paddw           m3, m3                  ; 2*m3 + 2*m4 + 2*m5 + 2*m6
1602
+    paddw           m7, m9                  ; 2*m7 + m3 + m4 + m5 + m6
1603
+    paddw           m7, m6
1604
+    psubw           m3, m6                  ; 2*m3 + 2*m4 + 2*m5 + m6
1605
+    paddw           m7, m6                  ; m3 + m4 + m5 + 3*m6 + 2*m7
1606
+    paddw           m3, m2                  ; m2 + 2*m3 + 2*m4 + 2*m5 + m6
1607
+
1608
+    punpcklqdq      m9, m8
1609
+    punpcklqdq      m3, m7
1610
+    punpcklqdq      m5, m2
1611
+    punpcklqdq      m4, m6
1612
+
1613
+    movd            m7, r3d                 ; -tcP
1614
+    movd            m2, r4d                 ; -tcQ
1615
+    pshufb          m7, [pb_01]
1616
+    pshufb          m2, [pb_01]
1617
+    mova            m6, m2
1618
+    punpcklqdq      m6, m7
1619
+
1620
+    paddw           m0, [pw_4]
1621
+    paddw           m3, [pw_4]
1622
+    paddw           m9, [pw_2]
1623
+
1624
+    psraw           m0, 3
1625
+    psraw           m3, 3
1626
+    psraw           m9, 2
1627
+
1628
+    psubw           m0, m1
1629
+    psubw           m3, m4
1630
+    psubw           m9, m5
1631
+
1632
+    pmaxsw          m0, m7
1633
+    pmaxsw          m3, m2
1634
+    pmaxsw          m9, m6
1635
+    psignw          m7, [pw_n1]
1636
+    psignw          m2, [pw_n1]
1637
+    psignw          m6, [pw_n1]
1638
+    pminsw          m0, m7
1639
+    pminsw          m3, m2
1640
+    pminsw          m9, m6
1641
+
1642
+    paddw           m0, m1
1643
+    paddw           m3, m4
1644
+    paddw           m9, m5
1645
+    packuswb        m0, m0
1646
+    packuswb        m3, m9
1647
+
1648
+    ; 4x6 output rows -
1649
+    ; m0 - col 0
1650
+    ; m3 - col 3
1651
+    mova            m1, m0
1652
+    mova            m2, m3
1653
+    mova            m4, m3
1654
+    mova            m5, m3
1655
+    pshufd          m1, m1, 1               ; col 2
1656
+    pshufd          m2, m2, 1               ; col 5
1657
+    pshufd          m4, m4, 2               ; col 4
1658
+    pshufd          m5, m5, 3               ; col 1
1659
+
1660
+    ; transpose 4x6 to 6x4
1661
+    punpcklbw       m0, m5
1662
+    punpcklbw       m1, m3
1663
+    punpcklbw       m4, m2
1664
+    punpcklwd       m0, m1
1665
+
1666
+    movd            [r0 + r1 * 0 - 3], m0
1667
+    pextrd          [r0 + r1 * 1 - 3], m0, 1
1668
+    pextrd          [r0 + r1 * 2 - 3], m0, 2
1669
+    pextrd          [r0 + r2 * 1 - 3], m0, 3
1670
+    pextrw          [r0 + r1 * 0 + 1], m4, 0
1671
+    pextrw          [r0 + r1 * 1 + 1], m4, 1
1672
+    pextrw          [r0 + r1 * 2 + 1], m4, 2
1673
+    pextrw          [r0 + r2 * 1 + 1], m4, 3
1674
+    RET
1675
 %endif ; ARCH_X86_64
1676
x265_1.8.tar.gz/source/common/x86/loopfilter.h -> x265_1.9.tar.gz/source/common/x86/loopfilter.h Changed
32
 
1
@@ -3,6 +3,7 @@
2
  *
3
  * Authors: Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
4
  *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -35,14 +36,17 @@
10
     void PFX(saoCuOrgE3_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
11
     void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
12
     void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
13
-    void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
14
-    void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
15
-    void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
16
-    void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
17
-    void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
18
+    void PFX(saoCuStatsBO_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
19
+    void PFX(saoCuStatsE0_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
20
+    void PFX(saoCuStatsE1_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
21
+    void PFX(saoCuStatsE2_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
22
+    void PFX(saoCuStatsE3_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
23
     void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
24
 
25
 DECL_SAO(sse4);
26
 DECL_SAO(avx2);
27
 
28
+void PFX(pelFilterLumaStrong_V_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
29
+void PFX(pelFilterLumaStrong_H_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
30
+
31
 #endif // ifndef X265_LOOPFILTER_H
32
x265_1.8.tar.gz/source/common/x86/mc-a.asm -> x265_1.9.tar.gz/source/common/x86/mc-a.asm Changed
119
 
1
@@ -2,6 +2,7 @@
2
 ;* mc-a.asm: x86 motion compensation
3
 ;*****************************************************************************
4
 ;* Copyright (C) 2003-2013 x264 project
5
+;* Copyright (C) 2013-2015 x265 project
6
 ;*
7
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8
 ;*          Fiona Glaser <fiona@x264.com>
9
@@ -3989,8 +3990,12 @@
10
     test dword r4m, 15
11
     jz pixel_avg_w%1_sse2
12
 %endif
13
+%if (%1 == 8)
14
+    jmp pixel_avg_w8_unaligned_sse2
15
+%else
16
     jmp pixel_avg_w%1_mmx2
17
 %endif
18
+%endif
19
 %endmacro
20
 
21
 ;-----------------------------------------------------------------------------
22
@@ -4049,6 +4054,32 @@
23
     lea     r4, [r4 + 4 * r5]
24
 %endmacro
25
 
26
+INIT_XMM sse2
27
+cglobal pixel_avg_w8_unaligned
28
+    AVG_START
29
+.height_loop:
30
+%if HIGH_BIT_DEPTH
31
+    ; NO TEST BRANCH!
32
+    movu    m0, [t2]
33
+    movu    m1, [t2+SIZEOF_PIXEL*t3]
34
+    movu    m2, [t4]
35
+    movu    m3, [t4+SIZEOF_PIXEL*t5]
36
+    pavgw   m0, m2
37
+    pavgw   m1, m3
38
+    movu    [t0], m0
39
+    movu    [t0+SIZEOF_PIXEL*t1], m1
40
+%else ;!HIGH_BIT_DEPTH
41
+    movq    m0, [t2]
42
+    movhps  m0, [t2+SIZEOF_PIXEL*t3]
43
+    movq    m1, [t4]
44
+    movhps  m1, [t4+SIZEOF_PIXEL*t5]
45
+    pavgb   m0, m1
46
+    movq    [t0], m0
47
+    movhps  [t0+SIZEOF_PIXEL*t1], m0
48
+%endif
49
+    AVG_END
50
+
51
+
52
 ;-------------------------------------------------------------------------------------------------------------------------------
53
 ;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
54
 ;-------------------------------------------------------------------------------------------------------------------------------
55
@@ -4115,11 +4146,11 @@
56
 AVGH 4, 4
57
 AVGH 4, 2
58
 
59
-AVG_FUNC 8, movq, movq
60
-AVGH 8, 32
61
-AVGH 8, 16
62
-AVGH 8,  8
63
-AVGH 8,  4
64
+;AVG_FUNC 8, movq, movq
65
+;AVGH 8, 32
66
+;AVGH 8, 16
67
+;AVGH 8,  8
68
+;AVGH 8,  4
69
 
70
 AVG_FUNC 16, movq, movq
71
 AVGH 16, 64
72
@@ -4197,7 +4228,7 @@
73
 AVGH 4, 4
74
 AVGH 4, 2
75
 
76
-AVG_FUNC 8, movq, movq
77
+;AVG_FUNC 8, movq, movq
78
 AVGH 8, 32
79
 AVGH 8, 16
80
 AVGH 8,  8
81
@@ -4418,6 +4449,37 @@
82
     call pixel_avg_16x64_8bit
83
     call pixel_avg_16x64_8bit
84
     RET
85
+
86
+cglobal pixel_avg_48x64, 6,7,4
87
+    mov         r6d, 4
88
+.loop:
89
+%rep 8
90
+    movu        m0, [r2]
91
+    movu        xm2, [r2 + mmsize]
92
+    movu        m1, [r4]
93
+    movu        xm3, [r4 + mmsize]
94
+    pavgb       m0, m1
95
+    pavgb       xm2, xm3
96
+    movu        [r0], m0
97
+    movu        [r0 + mmsize], xm2
98
+
99
+    movu        m0, [r2 + r3]
100
+    movu        xm2, [r2 + r3 + mmsize]
101
+    movu        m1, [r4 + r5]
102
+    movu        xm3, [r4 + r5 + mmsize]
103
+    pavgb       m0, m1
104
+    pavgb       xm2, xm3
105
+    movu        [r0 + r1], m0
106
+    movu        [r0 + r1 + mmsize], xm2
107
+
108
+    lea         r2, [r2 + r3 * 2]
109
+    lea         r4, [r4 + r5 * 2]
110
+    lea         r0, [r0 + r1 * 2]
111
+%endrep
112
+
113
+    dec         r6d
114
+    jnz         .loop
115
+    RET
116
 %endif
117
 
118
 ;=============================================================================
119
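
Both kernels added above (pixel_avg_w8_unaligned and pixel_avg_48x64) are
straight applications of the pavgb/pavgw rounding average, (a + b + 1) >> 1
per element. A scalar model for the 8-bit build (a sketch, not x265's actual
C fallback):

    #include <stdint.h>

    static void pixel_avg_sketch(uint8_t *dst, intptr_t dstride,
                                 const uint8_t *src0, intptr_t sstride0,
                                 const uint8_t *src1, intptr_t sstride1,
                                 int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (uint8_t)((src0[x] + src1[x] + 1) >> 1);  /* pavgb */
            dst += dstride; src0 += sstride0; src1 += sstride1;
        }
    }

The 48x64 version unrolls this over a 32+16 byte split per row (one ymm load
plus one xmm load), eight rows per %rep iteration.
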
x265_1.8.tar.gz/source/common/x86/mc-a2.asm -> x265_1.9.tar.gz/source/common/x86/mc-a2.asm Changed
386
 
1
@@ -2,12 +2,14 @@
2
 ;* mc-a2.asm: x86 motion compensation
3
 ;*****************************************************************************
4
 ;* Copyright (C) 2005-2013 x264 project
5
+;* Copyright (C) 2013-2015 x265 project
6
 ;*
7
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8
 ;*          Fiona Glaser <fiona@x264.com>
9
 ;*          Holger Lubitz <holger@lubitz.org>
10
 ;*          Mathieu Monnier <manao@melix.net>
11
 ;*          Oskar Arvidsson <oskar@irock.se>
12
+;*          Min Chen <chenm003@163.com>
13
 ;*
14
 ;* This program is free software; you can redistribute it and/or modify
15
 ;* it under the terms of the GNU General Public License as published by
16
@@ -46,6 +48,8 @@
17
 pd_16: times 4 dd 16
18
 pd_0f: times 4 dd 0xffff
19
 pf_inv256: times 8 dd 0.00390625
20
+const pd_inv256,    times 4 dq 0.00390625
21
+const pd_0_5,       times 4 dq 0.5
22
 
23
 SECTION .text
24
 
25
@@ -987,151 +991,227 @@
26
 %endif
27
 
28
 ;-----------------------------------------------------------------------------
29
-; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
30
-;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
31
+; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs,
32
+;                             uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len )
33
 ;-----------------------------------------------------------------------------
34
-%macro MBTREE 0
35
+INIT_XMM sse2
36
 cglobal mbtree_propagate_cost, 7,7,7
37
-    add        r6d, r6d
38
-    lea         r0, [r0+r6*2]
39
-    add         r1, r6
40
-    add         r2, r6
41
-    add         r3, r6
42
-    add         r4, r6
43
-    neg         r6
44
-    pxor      xmm4, xmm4
45
-    movss     xmm6, [r5]
46
-    shufps    xmm6, xmm6, 0
47
-    mulps     xmm6, [pf_inv256]
48
-    movdqa    xmm5, [pw_3fff]
49
+    dec         r6d
50
+    movsd       m6, [r5]
51
+    mulpd       m6, [pd_inv256]
52
+    xor         r5d, r5d
53
+    lea         r0, [r0+r5*2]
54
+    pxor        m4, m4
55
+    movlhps     m6, m6
56
+    mova        m5, [pw_3fff]
57
+
58
 .loop:
59
-    movq      xmm2, [r2+r6] ; intra
60
-    movq      xmm0, [r4+r6] ; invq
61
-    movq      xmm3, [r3+r6] ; inter
62
-    movq      xmm1, [r1+r6] ; prop
63
-    punpcklwd xmm2, xmm4
64
-    punpcklwd xmm0, xmm4
65
-    pmaddwd   xmm0, xmm2
66
-    pand      xmm3, xmm5
67
-    punpcklwd xmm1, xmm4
68
-    punpcklwd xmm3, xmm4
69
-%if cpuflag(fma4)
70
-    cvtdq2ps  xmm0, xmm0
71
-    cvtdq2ps  xmm1, xmm1
72
-    fmaddps   xmm0, xmm0, xmm6, xmm1
73
-    cvtdq2ps  xmm1, xmm2
74
-    psubd     xmm2, xmm3
75
-    cvtdq2ps  xmm2, xmm2
76
-    rcpps     xmm3, xmm1
77
-    mulps     xmm1, xmm3
78
-    mulps     xmm0, xmm2
79
-    addps     xmm2, xmm3, xmm3
80
-    fnmaddps  xmm3, xmm1, xmm3, xmm2
81
-    mulps     xmm0, xmm3
82
-%else
83
-    cvtdq2ps  xmm0, xmm0
84
-    mulps     xmm0, xmm6    ; intra*invq*fps_factor>>8
85
-    cvtdq2ps  xmm1, xmm1    ; prop
86
-    addps     xmm0, xmm1    ; prop + (intra*invq*fps_factor>>8)
87
-    cvtdq2ps  xmm1, xmm2    ; intra
88
-    psubd     xmm2, xmm3    ; intra - inter
89
-    cvtdq2ps  xmm2, xmm2    ; intra - inter
90
-    rcpps     xmm3, xmm1    ; 1 / intra 1st approximation
91
-    mulps     xmm1, xmm3    ; intra * (1/intra 1st approx)
92
-    mulps     xmm1, xmm3    ; intra * (1/intra 1st approx)^2
93
-    mulps     xmm0, xmm2    ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
94
-    addps     xmm3, xmm3    ; 2 * (1/intra 1st approx)
95
-    subps     xmm3, xmm1    ; 2nd approximation for 1/intra
96
-    mulps     xmm0, xmm3    ; / intra
97
-%endif
98
-    cvtps2dq  xmm0, xmm0
99
-    movdqa [r0+r6*2], xmm0
100
-    add         r6, 8
101
-    jl .loop
102
+    movh        m2, [r2+r5*4]       ; intra
103
+    movh        m0, [r4+r5*4]       ; invq
104
+    movd        m3, [r3+r5*2]       ; inter
105
+    pand        m3, m5
106
+    punpcklwd   m3, m4
107
+
108
+    ; PMINSD
109
+    pcmpgtd     m1, m2, m3
110
+    pand        m3, m1
111
+    pandn       m1, m2
112
+    por         m3, m1
113
+
114
+    movd        m1, [r1+r5*2]       ; prop
115
+    punpckldq   m2, m2
116
+    punpckldq   m0, m0
117
+    pmuludq     m0, m2
118
+    pshufd      m2, m2, q3120
119
+    pshufd      m0, m0, q3120
120
+
121
+    punpcklwd   m1, m4
122
+    cvtdq2pd    m0, m0
123
+    mulpd       m0, m6              ; intra*invq*fps_factor>>8
124
+    cvtdq2pd    m1, m1              ; prop
125
+    addpd       m0, m1              ; prop + (intra*invq*fps_factor>>8)
126
+    ;cvtdq2ps    m1, m2              ; intra
127
+    cvtdq2pd    m1, m2              ; intra
128
+    psubd       m2, m3              ; intra - inter
129
+    cvtdq2pd    m2, m2              ; intra - inter
130
+    ;rcpps       m3, m1
131
+    ;mulps       m1, m3              ; intra * (1/intra 1st approx)
132
+    ;mulps       m1, m3              ; intra * (1/intra 1st approx)^2
133
+    ;addps       m3, m3              ; 2 * (1/intra 1st approx)
134
+    ;subps       m3, m1              ; 2nd approximation for 1/intra
135
+    ;cvtps2pd    m3, m3              ; 1 / intra 1st approximation
136
+    mulpd       m0, m2              ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
137
+    ;mulpd       m0, m3              ; / intra
138
+
139
+    ; TODO: DIVPD is very slow, but it matches the C model output exactly; since this is not a bottleneck function, the faster code above is left commented out
140
+    divpd       m0, m1
141
+    addpd       m0, [pd_0_5]
142
+    cvttpd2dq    m0, m0
143
+
144
+    movh        [r0+r5*4], m0
145
+    add         r5d, 2
146
+    cmp         r5d, r6d
147
+    jl         .loop
148
+
149
+    xor         r6d, r5d
150
+    jnz         .even
151
+    movd        m2, [r2+r5*4]       ; intra
152
+    movd        m0, [r4+r5*4]       ; invq
153
+    movd        m3, [r3+r5*2]       ; inter
154
+    pand        m3, m5
155
+    punpcklwd   m3, m4
156
+
157
+    ; PMINSD
158
+    pcmpgtd     m1, m2, m3
159
+    pand        m3, m1
160
+    pandn       m1, m2
161
+    por         m3, m1
162
+
163
+    movd        m1, [r1+r5*2]       ; prop
164
+    punpckldq   m2, m2              ; DWORD [_ 1 _ 0]
165
+    punpckldq   m0, m0
166
+    pmuludq     m0, m2              ; QWORD [m1 m0]
167
+    pshufd      m2, m2, q3120
168
+    pshufd      m0, m0, q3120
169
+    punpcklwd   m1, m4
170
+    cvtdq2pd    m0, m0
171
+    mulpd       m0, m6              ; intra*invq*fps_factor>>8
172
+    cvtdq2pd    m1, m1              ; prop
173
+    addpd       m0, m1              ; prop + (intra*invq*fps_factor>>8)
174
+    cvtdq2pd    m1, m2              ; intra
175
+    psubd       m2, m3              ; intra - inter
176
+    cvtdq2pd    m2, m2              ; intra - inter
177
+    mulpd       m0, m2              ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
178
+
179
+    divpd       m0, m1
180
+    addpd       m0, [pd_0_5]
181
+    cvttpd2dq    m0, m0
182
+    movd        [r0+r5*4], m0
183
+.even:
184
     RET
185
-%endmacro
186
 
187
-INIT_XMM sse2
188
-MBTREE
189
-; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
190
-INIT_XMM fma4
191
-MBTREE
192
-
193
-%macro INT16_UNPACK 1
194
-    vpunpckhwd   xm4, xm%1, xm7
195
-    vpunpcklwd  xm%1, xm7
196
-    vinsertf128  m%1, m%1, xm4, 1
197
-%endmacro
198
 
199
+;-----------------------------------------------------------------------------
200
+; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs,
201
+;                             uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len )
202
+;-----------------------------------------------------------------------------
203
 ; FIXME: align loads/stores to 16 bytes
204
 %macro MBTREE_AVX 0
205
-cglobal mbtree_propagate_cost, 7,7,8
206
-    add          r6d, r6d
207
-    lea           r0, [r0+r6*2]
208
-    add           r1, r6
209
-    add           r2, r6
210
-    add           r3, r6
211
-    add           r4, r6
212
-    neg           r6
213
-    mova         xm5, [pw_3fff]
214
-    vbroadcastss  m6, [r5]
215
-    mulps         m6, [pf_inv256]
216
-%if notcpuflag(avx2)
217
-    pxor         xm7, xm7
218
-%endif
219
+cglobal mbtree_propagate_cost, 7,7,7
220
+    sub             r6d, 3
221
+    vbroadcastsd    m6, [r5]
222
+    mulpd           m6, [pd_inv256]
223
+    xor             r5d, r5d
224
+    mova            m5, [pw_3fff]
225
+
226
 .loop:
227
-%if cpuflag(avx2)
228
-    pmovzxwd     m0, [r2+r6]      ; intra
229
-    pmovzxwd     m1, [r4+r6]      ; invq
230
-    pmovzxwd     m2, [r1+r6]      ; prop
231
-    pand        xm3, xm5, [r3+r6] ; inter
232
-    pmovzxwd     m3, xm3
233
-    pmaddwd      m1, m0
234
-    psubd        m4, m0, m3
235
-    cvtdq2ps     m0, m0
236
-    cvtdq2ps     m1, m1
237
-    cvtdq2ps     m2, m2
238
-    cvtdq2ps     m4, m4
239
-    fmaddps      m1, m1, m6, m2
240
-    rcpps        m3, m0
241
-    mulps        m2, m0, m3
242
-    mulps        m1, m4
243
-    addps        m4, m3, m3
244
-    fnmaddps     m4, m2, m3, m4
245
-    mulps        m1, m4
246
-%else
247
-    movu        xm0, [r2+r6]
248
-    movu        xm1, [r4+r6]
249
-    movu        xm2, [r1+r6]
250
-    pand        xm3, xm5, [r3+r6]
251
-    INT16_UNPACK 0
252
-    INT16_UNPACK 1
253
-    INT16_UNPACK 2
254
-    INT16_UNPACK 3
255
-    cvtdq2ps     m0, m0
256
-    cvtdq2ps     m1, m1
257
-    cvtdq2ps     m2, m2
258
-    cvtdq2ps     m3, m3
259
-    mulps        m1, m0
260
-    subps        m4, m0, m3
261
-    mulps        m1, m6         ; intra*invq*fps_factor>>8
262
-    addps        m1, m2         ; prop + (intra*invq*fps_factor>>8)
263
-    rcpps        m3, m0         ; 1 / intra 1st approximation
264
-    mulps        m2, m0, m3     ; intra * (1/intra 1st approx)
265
-    mulps        m2, m3         ; intra * (1/intra 1st approx)^2
266
-    mulps        m1, m4         ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
267
-    addps        m3, m3         ; 2 * (1/intra 1st approx)
268
-    subps        m3, m2         ; 2nd approximation for 1/intra
269
-    mulps        m1, m3         ; / intra
270
-%endif
271
-    vcvtps2dq    m1, m1
272
-    movu  [r0+r6*2], m1
273
-    add          r6, 16
274
-    jl .loop
275
+    movu            xm2, [r2+r5*4]      ; intra
276
+    movu            xm0, [r4+r5*4]      ; invq
277
+    pmovzxwd        xm3, [r3+r5*2]      ; inter
278
+    pand            xm3, xm5
279
+    pminsd          xm3, xm2
280
+
281
+    pmovzxwd        xm1, [r1+r5*2]      ; prop
282
+    pmulld          xm0, xm2
283
+    cvtdq2pd        m0, xm0
284
+    cvtdq2pd        m1, xm1             ; prop
285
+;%if cpuflag(avx2)
286
+;    fmaddpd         m0, m0, m6, m1
287
+;%else
288
+    mulpd           m0, m6              ; intra*invq*fps_factor>>8
289
+    addpd           m0, m1              ; prop + (intra*invq*fps_factor>>8)
290
+;%endif
291
+    cvtdq2pd        m1, xm2             ; intra
292
+    psubd           xm2, xm3            ; intra - inter
293
+    cvtdq2pd        m2, xm2             ; intra - inter
294
+    mulpd           m0, m2              ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
295
+
296
+    ; TODO: DIVPD is very slow, but it matches the C model output exactly; since this is not a bottleneck function, the faster code above is left commented out
297
+    divpd           m0, m1
298
+    addpd           m0, [pd_0_5]
299
+    cvttpd2dq       xm0, m0
300
+
301
+    movu            [r0+r5*4], xm0
302
+    add             r5d, 4              ; process 4 values in one iteration
303
+    cmp             r5d, r6d
304
+    jl             .loop
305
+
306
+    add             r6d, 3
307
+    xor             r6d, r5d
308
+    jz              .even               ; if the loop counter is a multiple of 4, all values have been processed
309
+
310
+    and             r6d, 3              ; otherwise, remaining unprocessed values must be 1, 2 or 3
311
+    cmp             r6d, 1
312
+    je              .process1           ; if only 1 value is unprocessed
313
+
314
+    ; process 2 values here
315
+    movq            xm2, [r2+r5*4]      ; intra
316
+    movq            xm0, [r4+r5*4]      ; invq
317
+    movd            xm3, [r3+r5*2]      ; inter
318
+    pmovzxwd        xm3, xm3
319
+    pand            xm3, xm5
320
+    pminsd          xm3, xm2
321
+
322
+    movd            xm1, [r1+r5*2]      ; prop
323
+    pmovzxwd        xm1, xm1
324
+    pmulld          xm0, xm2
325
+    cvtdq2pd        m0, xm0
326
+    cvtdq2pd        m1, xm1             ; prop
327
+;%if cpuflag(avx2)
328
+;    fmaddpd         m0, m0, m6, m1
329
+;%else
330
+    mulpd           m0, m6              ; intra*invq*fps_factor>>8
331
+    addpd           m0, m1              ; prop + (intra*invq*fps_factor>>8)
332
+;%endif
333
+    cvtdq2pd        m1, xm2             ; intra
334
+    psubd           xm2, xm3            ; intra - inter
335
+    cvtdq2pd        m2, xm2             ; intra - inter
336
+    mulpd           m0, m2              ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
337
+
338
+    divpd           m0, m1
339
+    addpd           m0, [pd_0_5]
340
+    cvttpd2dq       xm0, m0
341
+    movq            [r0+r5*4], xm0
342
+
343
+    xor             r6d, 2
344
+    jz              .even
345
+    add             r5d, 2
346
+
347
+    ; process 1 value here
348
+.process1:
349
+    movd            xm2, [r2+r5*4]      ; intra
350
+    movd            xm0, [r4+r5*4]      ; invq
351
+    movzx           r6d, word [r3+r5*2] ; inter
352
+    movd            xm3, r6d
353
+    pand            xm3, xm5
354
+    pminsd          xm3, xm2
355
+
356
+    movzx           r6d, word [r1+r5*2] ; prop
357
+    movd            xm1, r6d
358
+    pmulld          xm0, xm2
359
+    cvtdq2pd        m0, xm0
360
+    cvtdq2pd        m1, xm1             ; prop
361
+;%if cpuflag(avx2)
362
+;    fmaddpd         m0, m0, m6, m1
363
+;%else
364
+    mulpd           m0, m6              ; intra*invq*fps_factor>>8
365
+    addpd           m0, m1              ; prop + (intra*invq*fps_factor>>8)
366
+;%endif
367
+    cvtdq2pd        m1, xm2             ; intra
368
+    psubd           xm2, xm3            ; intra - inter
369
+    cvtdq2pd        m2, xm2             ; intra - inter
370
+    mulpd           m0, m2              ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
371
+
372
+    divpd           m0, m1
373
+    addpd           m0, [pd_0_5]
374
+    cvttpd2dq       xm0, m0
375
+    movd            [r0+r5*4], xm0
376
+.even:
377
     RET
378
 %endmacro
379
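
The control flow above is the standard vector-width tail pattern: the main
loop runs while at least four values remain (sub r6d, 3 up front, so the
compare is i < len - 3), then the 1, 2 or 3 leftover elements are finished
with narrower loads. The same structure in scalar form (hypothetical
stand-in kernels, sketch only):

    /* stand-ins for a 4-wide SIMD body and its narrower fallback */
    static void kernel4(int *v) { for (int k = 0; k < 4; k++) v[k] += 1; }
    static void kernel1(int *v) { v[0] += 1; }

    static void run_with_tail(int *v, int len)
    {
        int i = 0;
        for (; i + 4 <= len; i += 4)   /* main loop: full 4-wide groups */
            kernel4(v + i);
        for (; i < len; i++)           /* tail: at most 3 elements */
            kernel1(v + i);
    }
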
 
380
 INIT_YMM avx
381
 MBTREE_AVX
382
-INIT_YMM avx2,fma3
383
+
384
+INIT_YMM avx2
385
 MBTREE_AVX
386
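
For reference, the computation both the SSE2 and AVX/AVX2 versions implement,
reconstructed from the inline comments above (a sketch of the C model, not
code copied from x265; the final + 0.5 then truncate is what the
addpd [pd_0_5] / cvttpd2dq pair performs):

    #include <stdint.h>

    static void mbtree_propagate_cost_sketch(int *dst, const uint16_t *propagateIn,
                                             const int32_t *intraCosts,
                                             const uint16_t *interCosts,
                                             const int32_t *invQscales,
                                             const double *fpsFactor, int len)
    {
        double fps = *fpsFactor / 256.0;           /* pd_inv256 */
        for (int i = 0; i < len; i++)
        {
            int intra = intraCosts[i];
            int inter = interCosts[i] & 0x3fff;    /* pw_3fff mask */
            if (inter > intra)
                inter = intra;                     /* pminsd */
            double amount = propagateIn[i] + intra * (double)invQscales[i] * fps;
            /* exact divpd, kept so the SIMD output bit-matches the C model */
            dst[i] = (int)(amount * (intra - inter) / intra + 0.5);
        }
    }

This is also why the float rcpps/Newton-Raphson path inherited from 1.8 is
commented out rather than deleted: it is faster, but it no longer bit-matches
the C model now that the model computes in double precision.
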
x265_1.8.tar.gz/source/common/x86/mc.h -> x265_1.9.tar.gz/source/common/x86/mc.h Changed
16
 
1
@@ -36,4 +36,14 @@
2
 
3
 #undef LOWRES
4
 
5
+#define PROPAGATE_COST(cpu) \
6
+    void PFX(mbtree_propagate_cost_ ## cpu)(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, \
7
+                                              const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
8
+
9
+PROPAGATE_COST(sse2)
10
+PROPAGATE_COST(avx)
11
+PROPAGATE_COST(avx2)
12
+
13
+#undef PROPAGATE_COST
14
+
15
 #endif // ifndef X265_MC_H
16
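
PROPAGATE_COST is only a declaration generator. Assuming PFX prepends the
build's namespace prefix (x265_ by default), each instantiation expands to one
prototype, e.g. PROPAGATE_COST(sse2) declares:

    void x265_mbtree_propagate_cost_sse2(int* dst, const uint16_t* propagateIn,
                                         const int32_t* intraCosts,
                                         const uint16_t* interCosts,
                                         const int32_t* invQscales,
                                         const double* fpsFactor, int len);
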
x265_1.8.tar.gz/source/common/x86/pixel-a.asm -> x265_1.9.tar.gz/source/common/x86/pixel-a.asm Changed
2441
 
1
@@ -2,6 +2,7 @@
2
 ;* pixel.asm: x86 pixel metrics
3
 ;*****************************************************************************
4
 ;* Copyright (C) 2003-2013 x264 project
5
+;* Copyright (C) 2013-2015 x265 project
6
 ;*
7
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8
 ;*          Holger Lubitz <holger@lubitz.org>
9
@@ -70,6 +71,7 @@
10
 cextern pd_2
11
 cextern hmul_16p
12
 cextern pb_movemask
13
+cextern pb_movemask_32
14
 cextern pw_pixel_max
15
 
16
 ;=============================================================================
17
@@ -6497,6 +6499,1357 @@
18
 %endif ; !ARCH_X86_64
19
 %endmacro ; SA8D
20
 
21
+
22
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
23
+INIT_YMM avx2
24
+cglobal sa8d_8x8_12bit
25
+    pmovzxwd        m0, [r0]
26
+    pmovzxwd        m9, [r2]
27
+    psubd           m0, m9
28
+
29
+    pmovzxwd        m1, [r0 + r1]
30
+    pmovzxwd        m9, [r2 + r3]
31
+    psubd           m1, m9
32
+
33
+    pmovzxwd        m2, [r0 + r1 * 2]
34
+    pmovzxwd        m9, [r2 + r3 * 2]
35
+    psubd           m2, m9
36
+
37
+    pmovzxwd        m8, [r0 + r4]
38
+    pmovzxwd        m9, [r2 + r5]
39
+    psubd           m8, m9
40
+
41
+    lea             r0, [r0 + r1 * 4]
42
+    lea             r2, [r2 + r3 * 4]
43
+
44
+    pmovzxwd        m4, [r0]
45
+    pmovzxwd        m9, [r2]
46
+    psubd           m4, m9
47
+
48
+    pmovzxwd        m5, [r0 + r1]
49
+    pmovzxwd        m9, [r2 + r3]
50
+    psubd           m5, m9
51
+
52
+    pmovzxwd        m3, [r0 + r1 * 2]
53
+    pmovzxwd        m9, [r2 + r3 * 2]
54
+    psubd           m3, m9
55
+
56
+    pmovzxwd        m7, [r0 + r4]
57
+    pmovzxwd        m9, [r2 + r5]
58
+    psubd           m7, m9
59
+
60
+    mova            m6, m0
61
+    paddd           m0, m1
62
+    psubd           m1, m6
63
+    mova            m6, m2
64
+    paddd           m2, m8
65
+    psubd           m8, m6
66
+    mova            m6, m0
67
+
68
+    punpckldq       m0, m1
69
+    punpckhdq       m6, m1
70
+
71
+    mova            m1, m0
72
+    paddd           m0, m6
73
+    psubd           m6, m1
74
+    mova            m1, m2
75
+
76
+    punpckldq       m2, m8
77
+    punpckhdq       m1, m8
78
+
79
+    mova            m8, m2
80
+    paddd           m2, m1
81
+    psubd           m1, m8
82
+    mova            m8, m4
83
+    paddd           m4, m5
84
+    psubd           m5, m8
85
+    mova            m8, m3
86
+    paddd           m3, m7
87
+    psubd           m7, m8
88
+    mova            m8, m4
89
+
90
+    punpckldq       m4, m5
91
+    punpckhdq       m8, m5
92
+
93
+    mova            m5, m4
94
+    paddd           m4, m8
95
+    psubd           m8, m5
96
+    mova            m5, m3
97
+    punpckldq       m3, m7
98
+    punpckhdq       m5, m7
99
+
100
+    mova            m7, m3
101
+    paddd           m3, m5
102
+    psubd           m5, m7
103
+    mova            m7, m0
104
+    paddd           m0, m2
105
+    psubd           m2, m7
106
+    mova            m7, m6
107
+    paddd           m6, m1
108
+    psubd           m1, m7
109
+    mova            m7, m0
110
+
111
+    punpcklqdq      m0, m2
112
+    punpckhqdq      m7, m2
113
+
114
+    mova            m2, m0
115
+    paddd           m0, m7
116
+    psubd           m7, m2
117
+    mova            m2, m6
118
+
119
+    punpcklqdq      m6, m1
120
+    punpckhqdq      m2, m1
121
+
122
+    mova            m1, m6
123
+    paddd           m6, m2
124
+    psubd           m2, m1
125
+    mova            m1, m4
126
+    paddd           m4, m3
127
+    psubd           m3, m1
128
+    mova            m1, m8
129
+    paddd           m8, m5
130
+    psubd           m5, m1
131
+    mova            m1, m4
132
+
133
+    punpcklqdq      m4, m3
134
+    punpckhqdq      m1, m3
135
+
136
+    mova            m3, m4
137
+    paddd           m4, m1
138
+    psubd           m1, m3
139
+    mova            m3, m8
140
+
141
+    punpcklqdq      m8, m5
142
+    punpckhqdq      m3, m5
143
+
144
+    mova            m5, m8
145
+    paddd           m8, m3
146
+    psubd           m3, m5
147
+    mova            m5, m0
148
+    paddd           m0, m4
149
+    psubd           m4, m5
150
+    mova            m5, m7
151
+    paddd           m7, m1
152
+    psubd           m1, m5
153
+    mova            m5, m0
154
+
155
+    vinserti128     m0, m0, xm4, 1
156
+    vperm2i128      m5, m5, m4, 00110001b
157
+
158
+    pxor            m4, m4
159
+    psubd           m4, m0
160
+    pmaxsd          m0, m4
161
+    pxor            m4, m4
162
+    psubd           m4, m5
163
+    pmaxsd          m5, m4
164
+    pmaxsd          m0, m5
165
+    mova            m4, m7
166
+
167
+    vinserti128     m7, m7, xm1, 1
168
+    vperm2i128      m4, m4, m1, 00110001b
169
+
170
+    pxor            m1, m1
171
+    psubd           m1, m7
172
+    pmaxsd          m7, m1
173
+    pxor            m1, m1
174
+    psubd           m1, m4
175
+    pmaxsd          m4, m1
176
+    pmaxsd          m7, m4
177
+    mova            m1, m6
178
+    paddd           m6, m8
179
+    psubd           m8, m1
180
+    mova            m1, m2
181
+    paddd           m2, m3
182
+    psubd           m3, m1
183
+    mova            m1, m6
184
+
185
+    vinserti128     m6, m6, xm8, 1
186
+    vperm2i128      m1, m1, m8, 00110001b
187
+
188
+    pxor            m8, m8
189
+    psubd           m8, m6
190
+    pmaxsd          m6, m8
191
+    pxor            m8, m8
192
+    psubd           m8, m1
193
+    pmaxsd          m1, m8
194
+    pmaxsd          m6, m1
195
+    mova            m8, m2
196
+
197
+    vinserti128     m2, m2, xm3, 1
198
+    vperm2i128      m8, m8, m3, 00110001b
199
+
200
+    pxor            m3, m3
201
+    psubd           m3, m2
202
+    pmaxsd          m2, m3
203
+    pxor            m3, m3
204
+    psubd           m3, m8
205
+    pmaxsd          m8, m3
206
+    pmaxsd          m2, m8
207
+    paddd           m0, m6
208
+    paddd           m0, m7
209
+    paddd           m0, m2
210
+    ret
211
+
212
+cglobal pixel_sa8d_8x8, 4,6,10
213
+    add             r1d, r1d
214
+    add             r3d, r3d
215
+    lea             r4, [r1 + r1 * 2]
216
+    lea             r5, [r3 + r3 * 2]
217
+
218
+    call            sa8d_8x8_12bit
219
+
220
+    vextracti128    xm6, m0, 1
221
+    paddd           xm0, xm6
222
+
223
+    movhlps         xm6, xm0
224
+    paddd           xm0, xm6
225
+
226
+    pshuflw         xm6, xm0, 0Eh
227
+    paddd           xm0, xm6
228
+    movd            eax, xm0
229
+    add             eax, 1
230
+    shr             eax, 1
231
+    RET
232
+
233
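
The four-instruction ladder that closes every block in this file
(vextracti128/paddd, movhlps/paddd, pshuflw 0Eh/paddd, then movd) is a
horizontal sum of the eight 32-bit lanes of a ymm register. An illustrative
intrinsics equivalent (not code from x265):

    #include <immintrin.h>

    static int hsum8_epi32(__m256i v)
    {
        __m128i lo = _mm256_castsi256_si128(v);
        __m128i hi = _mm256_extracti128_si256(v, 1);           /* vextracti128 */
        lo = _mm_add_epi32(lo, hi);
        lo = _mm_add_epi32(lo, _mm_unpackhi_epi64(lo, lo));    /* movhlps + paddd */
        lo = _mm_add_epi32(lo, _mm_shufflelo_epi16(lo, 0x0E)); /* pshuflw 0Eh + paddd */
        return _mm_cvtsi128_si32(lo);                          /* movd */
    }
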
+cglobal pixel_sa8d_8x16, 4,7,11
234
+    add             r1d, r1d
235
+    add             r3d, r3d
236
+    lea             r4, [r1 + r1 * 2]
237
+    lea             r5, [r3 + r3 * 2]
238
+    pxor            m10, m10
239
+
240
+    call            sa8d_8x8_12bit
241
+
242
+    vextracti128    xm6, m0, 1
243
+    paddd           xm0, xm6
244
+
245
+    movhlps         xm6, xm0
246
+    paddd           xm0, xm6
247
+
248
+    pshuflw         xm6, xm0, 0Eh
249
+    paddd           xm0, xm6
250
+    paddd           xm0, [pd_1]
251
+    psrld           xm0, 1
252
+    paddd           xm10, xm0
253
+
254
+    lea             r0, [r0 + r1 * 4]
255
+    lea             r2, [r2 + r3 * 4]
256
+    call            sa8d_8x8_12bit
257
+
258
+    vextracti128    xm6, m0, 1
259
+    paddd           xm0, xm6
260
+
261
+    movhlps         xm6, xm0
262
+    paddd           xm0, xm6
263
+
264
+    pshuflw         xm6, xm0, 0Eh
265
+    paddd           xm0, xm6
266
+    paddd           xm0, [pd_1]
267
+    psrld           xm0, 1
268
+    paddd           xm0, xm10
269
+    movd            eax, xm0
270
+    RET
271
+
272
+cglobal pixel_sa8d_16x16, 4,8,11
273
+    add             r1d, r1d
274
+    add             r3d, r3d
275
+    lea             r4, [r1 + r1 * 2]
276
+    lea             r5, [r3 + r3 * 2]
277
+    mov             r6, r0
278
+    mov             r7, r2
279
+    pxor            m10, m10
280
+
281
+    call            sa8d_8x8_12bit
282
+    paddd           m10, m0
283
+
284
+    lea             r0, [r0 + r1 * 4]
285
+    lea             r2, [r2 + r3 * 4]
286
+    call            sa8d_8x8_12bit
287
+    paddd           m10, m0
288
+
289
+    lea             r0, [r6 + 16]
290
+    lea             r2, [r7 + 16]
291
+    call            sa8d_8x8_12bit
292
+    paddd           m10, m0
293
+
294
+    lea             r0, [r0 + r1 * 4]
295
+    lea             r2, [r2 + r3 * 4]
296
+    call            sa8d_8x8_12bit
297
+    paddd           m0, m10
298
+
299
+    vextracti128    xm6, m0, 1
300
+    paddd           xm0, xm6
301
+
302
+    movhlps         xm6, xm0
303
+    paddd           xm0, xm6
304
+
305
+    pshuflw         xm6, xm0, 0Eh
306
+    paddd           xm0, xm6
307
+    movd            eax, xm0
308
+    add             eax, 1
309
+    shr             eax, 1
310
+    RET
311
+
312
+cglobal pixel_sa8d_16x32, 4,8,12
313
+    add             r1d, r1d
314
+    add             r3d, r3d
315
+    lea             r4, [r1 + r1 * 2]
316
+    lea             r5, [r3 + r3 * 2]
317
+    mov             r6, r0
318
+    mov             r7, r2
319
+    pxor            m10, m10
320
+    pxor            m11, m11
321
+
322
+    call            sa8d_8x8_12bit
323
+    paddd           m10, m0
324
+
325
+    lea             r0, [r0 + r1 * 4]
326
+    lea             r2, [r2 + r3 * 4]
327
+    call            sa8d_8x8_12bit
328
+    paddd           m10, m0
329
+
330
+    lea             r0, [r6 + 16]
331
+    lea             r2, [r7 + 16]
332
+    call            sa8d_8x8_12bit
333
+    paddd           m10, m0
334
+
335
+    lea             r0, [r0 + r1 * 4]
336
+    lea             r2, [r2 + r3 * 4]
337
+    call            sa8d_8x8_12bit
338
+    paddd           m0, m10
339
+
340
+    vextracti128    xm6, m0, 1
341
+    paddd           xm0, xm6
342
+
343
+    movhlps         xm6, xm0
344
+    paddd           xm0, xm6
345
+
346
+    pshuflw         xm6, xm0, 0Eh
347
+    paddd           xm0, xm6
348
+    paddd           xm0, [pd_1]
349
+    psrld           xm0, 1
350
+    paddd           xm11, xm0
351
+
352
+    lea             r6, [r6 + r1 * 8]
353
+    lea             r6, [r6 + r1 * 8]
354
+    lea             r7, [r7 + r3 * 8]
355
+    lea             r7, [r7 + r3 * 8]
356
+    pxor            m10, m10
357
+    mov             r0, r6
358
+    mov             r2, r7
359
+    call            sa8d_8x8_12bit
360
+    paddd           m10, m0
361
+
362
+    lea             r0, [r0 + r1 * 4]
363
+    lea             r2, [r2 + r3 * 4]
364
+    call            sa8d_8x8_12bit
365
+    paddd           m10, m0
366
+
367
+    lea             r0, [r6 + 16]
368
+    lea             r2, [r7 + 16]
369
+    call            sa8d_8x8_12bit
370
+    paddd           m10, m0
371
+
372
+    lea             r0, [r0 + r1 * 4]
373
+    lea             r2, [r2 + r3 * 4]
374
+    call            sa8d_8x8_12bit
375
+    paddd           m0, m10
376
+
377
+    vextracti128    xm6, m0, 1
378
+    paddd           xm0, xm6
379
+
380
+    movhlps         xm6, xm0
381
+    paddd           xm0, xm6
382
+
383
+    pshuflw         xm6, xm0, 0Eh
384
+    paddd           xm0, xm6
385
+    paddd           xm0, [pd_1]
386
+    psrld           xm0, 1
387
+    paddd           xm11, xm0
388
+    movd            eax, xm11
389
+    RET
390
+
391
+cglobal pixel_sa8d_32x32, 4,8,12
392
+    add             r1d, r1d
393
+    add             r3d, r3d
394
+    lea             r4, [r1 + r1 * 2]
395
+    lea             r5, [r3 + r3 * 2]
396
+    mov             r6, r0
397
+    mov             r7, r2
398
+    pxor            m10, m10
399
+    pxor            m11, m11
400
+
401
+    call            sa8d_8x8_12bit
402
+    paddd           m10, m0
403
+
404
+    lea             r0, [r0 + r1 * 4]
405
+    lea             r2, [r2 + r3 * 4]
406
+    call            sa8d_8x8_12bit
407
+    paddd           m10, m0
408
+
409
+    lea             r0, [r6 + 16]
410
+    lea             r2, [r7 + 16]
411
+    call            sa8d_8x8_12bit
412
+    paddd           m10, m0
413
+
414
+    lea             r0, [r0 + r1 * 4]
415
+    lea             r2, [r2 + r3 * 4]
416
+    call            sa8d_8x8_12bit
417
+    paddd           m0, m10
418
+
419
+    vextracti128    xm6, m0, 1
420
+    paddd           xm0, xm6
421
+
422
+    movhlps         xm6, xm0
423
+    paddd           xm0, xm6
424
+
425
+    pshuflw         xm6, xm0, 0Eh
426
+    paddd           xm0, xm6
427
+    paddd           xm0, [pd_1]
428
+    psrld           xm0, 1
429
+    paddd           xm11, xm0
430
+
431
+    pxor            m10, m10
432
+    lea             r0, [r6 + 32]
433
+    lea             r2, [r7 + 32]
434
+    call            sa8d_8x8_12bit
435
+    paddd           m10, m0
436
+
437
+    lea             r0, [r0 + r1 * 4]
438
+    lea             r2, [r2 + r3 * 4]
439
+    call            sa8d_8x8_12bit
440
+    paddd           m10, m0
441
+
442
+    lea             r0, [r6 + 48]
443
+    lea             r2, [r7 + 48]
444
+    call            sa8d_8x8_12bit
445
+    paddd           m10, m0
446
+
447
+    lea             r0, [r0 + r1 * 4]
448
+    lea             r2, [r2 + r3 * 4]
449
+    call            sa8d_8x8_12bit
450
+    paddd           m0, m10
451
+
452
+    vextracti128    xm6, m0, 1
453
+    paddd           xm0, xm6
454
+
455
+    movhlps         xm6, xm0
456
+    paddd           xm0, xm6
457
+
458
+    pshuflw         xm6, xm0, 0Eh
459
+    paddd           xm0, xm6
460
+    paddd           xm0, [pd_1]
461
+    psrld           xm0, 1
462
+    paddd           xm11, xm0
463
+
464
+    lea             r6, [r6 + r1 * 8]
465
+    lea             r6, [r6 + r1 * 8]
466
+    lea             r7, [r7 + r3 * 8]
467
+    lea             r7, [r7 + r3 * 8]
468
+    pxor            m10, m10
469
+    mov             r0, r6
470
+    mov             r2, r7
471
+    call            sa8d_8x8_12bit
472
+    paddd           m10, m0
473
+
474
+    lea             r0, [r0 + r1 * 4]
475
+    lea             r2, [r2 + r3 * 4]
476
+    call            sa8d_8x8_12bit
477
+    paddd           m10, m0
478
+
479
+    lea             r0, [r6 + 16]
480
+    lea             r2, [r7 + 16]
481
+    call            sa8d_8x8_12bit
482
+    paddd           m10, m0
483
+
484
+    lea             r0, [r0 + r1 * 4]
485
+    lea             r2, [r2 + r3 * 4]
486
+    call            sa8d_8x8_12bit
487
+    paddd           m0, m10
488
+
489
+    vextracti128    xm6, m0, 1
490
+    paddd           xm0, xm6
491
+
492
+    movhlps         xm6, xm0
493
+    paddd           xm0, xm6
494
+
495
+    pshuflw         xm6, xm0, 0Eh
496
+    paddd           xm0, xm6
497
+    paddd           xm0, [pd_1]
498
+    psrld           xm0, 1
499
+    paddd           xm11, xm0
500
+
501
+    pxor            m10, m10
502
+    lea             r0, [r6 + 32]
503
+    lea             r2, [r7 + 32]
504
+    call            sa8d_8x8_12bit
505
+    paddd           m10, m0
506
+
507
+    lea             r0, [r0 + r1 * 4]
508
+    lea             r2, [r2 + r3 * 4]
509
+    call            sa8d_8x8_12bit
510
+    paddd           m10, m0
511
+
512
+    lea             r0, [r6 + 48]
513
+    lea             r2, [r7 + 48]
514
+    call            sa8d_8x8_12bit
515
+    paddd           m10, m0
516
+
517
+    lea             r0, [r0 + r1 * 4]
518
+    lea             r2, [r2 + r3 * 4]
519
+    call            sa8d_8x8_12bit
520
+    paddd           m0, m10
521
+
522
+    vextracti128    xm6, m0, 1
523
+    paddd           xm0, xm6
524
+
525
+    movhlps         xm6, xm0
526
+    paddd           xm0, xm6
527
+
528
+    pshuflw         xm6, xm0, 0Eh
529
+    paddd           xm0, xm6
530
+    paddd           xm0, [pd_1]
531
+    psrld           xm0, 1
532
+    paddd           xm11, xm0
533
+    movd            eax, xm11
534
+    RET
535
+
536
+cglobal pixel_sa8d_32x64, 4,8,12
537
+    add             r1d, r1d
538
+    add             r3d, r3d
539
+    lea             r4, [r1 + r1 * 2]
540
+    lea             r5, [r3 + r3 * 2]
541
+    mov             r6, r0
542
+    mov             r7, r2
543
+    pxor            m10, m10
544
+    pxor            m11, m11
545
+
546
+    call            sa8d_8x8_12bit
547
+    paddd           m10, m0
548
+
549
+    lea             r0, [r0 + r1 * 4]
550
+    lea             r2, [r2 + r3 * 4]
551
+    call            sa8d_8x8_12bit
552
+    paddd           m10, m0
553
+
554
+    lea             r0, [r6 + 16]
555
+    lea             r2, [r7 + 16]
556
+    call            sa8d_8x8_12bit
557
+    paddd           m10, m0
558
+
559
+    lea             r0, [r0 + r1 * 4]
560
+    lea             r2, [r2 + r3 * 4]
561
+    call            sa8d_8x8_12bit
562
+    paddd           m0, m10
563
+
564
+    vextracti128    xm6, m0, 1
565
+    paddd           xm0, xm6
566
+
567
+    movhlps         xm6, xm0
568
+    paddd           xm0, xm6
569
+
570
+    pshuflw         xm6, xm0, 0Eh
571
+    paddd           xm0, xm6
572
+    paddd           xm0, [pd_1]
573
+    psrld           xm0, 1
574
+    paddd           xm11, xm0
575
+
576
+    pxor            m10, m10
577
+    lea             r0, [r6 + 32]
578
+    lea             r2, [r7 + 32]
579
+    call            sa8d_8x8_12bit
580
+    paddd           m10, m0
581
+
582
+    lea             r0, [r0 + r1 * 4]
583
+    lea             r2, [r2 + r3 * 4]
584
+    call            sa8d_8x8_12bit
585
+    paddd           m10, m0
586
+
587
+    lea             r0, [r6 + 48]
588
+    lea             r2, [r7 + 48]
589
+    call            sa8d_8x8_12bit
590
+    paddd           m10, m0
591
+
592
+    lea             r0, [r0 + r1 * 4]
593
+    lea             r2, [r2 + r3 * 4]
594
+    call            sa8d_8x8_12bit
595
+    paddd           m0, m10
596
+
597
+    vextracti128    xm6, m0, 1
598
+    paddd           xm0, xm6
599
+
600
+    movhlps         xm6, xm0
601
+    paddd           xm0, xm6
602
+
603
+    pshuflw         xm6, xm0, 0Eh
604
+    paddd           xm0, xm6
605
+    paddd           xm0, [pd_1]
606
+    psrld           xm0, 1
607
+    paddd           xm11, xm0
608
+
609
+    lea             r6, [r6 + r1 * 8]
610
+    lea             r6, [r6 + r1 * 8]
611
+    lea             r7, [r7 + r3 * 8]
612
+    lea             r7, [r7 + r3 * 8]
613
+    pxor            m10, m10
614
+    mov             r0, r6
615
+    mov             r2, r7
616
+    call            sa8d_8x8_12bit
617
+    paddd           m10, m0
618
+
619
+    lea             r0, [r0 + r1 * 4]
620
+    lea             r2, [r2 + r3 * 4]
621
+    call            sa8d_8x8_12bit
622
+    paddd           m10, m0
623
+
624
+    lea             r0, [r6 + 16]
625
+    lea             r2, [r7 + 16]
626
+    call            sa8d_8x8_12bit
627
+    paddd           m10, m0
628
+
629
+    lea             r0, [r0 + r1 * 4]
630
+    lea             r2, [r2 + r3 * 4]
631
+    call            sa8d_8x8_12bit
632
+    paddd           m0, m10
633
+
634
+    vextracti128    xm6, m0, 1
635
+    paddd           xm0, xm6
636
+
637
+    movhlps         xm6, xm0
638
+    paddd           xm0, xm6
639
+
640
+    pshuflw         xm6, xm0, 0Eh
641
+    paddd           xm0, xm6
642
+    paddd           xm0, [pd_1]
643
+    psrld           xm0, 1
644
+    paddd           xm11, xm0
645
+
646
+    pxor            m10, m10
647
+    lea             r0, [r6 + 32]
648
+    lea             r2, [r7 + 32]
649
+    call            sa8d_8x8_12bit
650
+    paddd           m10, m0
651
+
652
+    lea             r0, [r0 + r1 * 4]
653
+    lea             r2, [r2 + r3 * 4]
654
+    call            sa8d_8x8_12bit
655
+    paddd           m10, m0
656
+
657
+    lea             r0, [r6 + 48]
658
+    lea             r2, [r7 + 48]
659
+    call            sa8d_8x8_12bit
660
+    paddd           m10, m0
661
+
662
+    lea             r0, [r0 + r1 * 4]
663
+    lea             r2, [r2 + r3 * 4]
664
+    call            sa8d_8x8_12bit
665
+    paddd           m0, m10
666
+
667
+    vextracti128    xm6, m0, 1
668
+    paddd           xm0, xm6
669
+
670
+    movhlps         xm6, xm0
671
+    paddd           xm0, xm6
672
+
673
+    pshuflw         xm6, xm0, 0Eh
674
+    paddd           xm0, xm6
675
+    paddd           xm0, [pd_1]
676
+    psrld           xm0, 1
677
+    paddd           xm11, xm0
678
+
679
+    lea             r6, [r6 + r1 * 8]
680
+    lea             r6, [r6 + r1 * 8]
681
+    lea             r7, [r7 + r3 * 8]
682
+    lea             r7, [r7 + r3 * 8]
683
+    pxor            m10, m10
684
+    mov             r0, r6
685
+    mov             r2, r7
686
+    call            sa8d_8x8_12bit
687
+    paddd           m10, m0
688
+
689
+    lea             r0, [r0 + r1 * 4]
690
+    lea             r2, [r2 + r3 * 4]
691
+    call            sa8d_8x8_12bit
692
+    paddd           m10, m0
693
+
694
+    lea             r0, [r6 + 16]
695
+    lea             r2, [r7 + 16]
696
+    call            sa8d_8x8_12bit
697
+    paddd           m10, m0
698
+
699
+    lea             r0, [r0 + r1 * 4]
700
+    lea             r2, [r2 + r3 * 4]
701
+    call            sa8d_8x8_12bit
702
+    paddd           m0, m10
703
+
704
+    vextracti128    xm6, m0, 1
705
+    paddd           xm0, xm6
706
+
707
+    movhlps         xm6, xm0
708
+    paddd           xm0, xm6
709
+
710
+    pshuflw         xm6, xm0, 0Eh
711
+    paddd           xm0, xm6
712
+    paddd           xm0, [pd_1]
713
+    psrld           xm0, 1
714
+    paddd           xm11, xm0
715
+
716
+    pxor            m10, m10
717
+    lea             r0, [r6 + 32]
718
+    lea             r2, [r7 + 32]
719
+    call            sa8d_8x8_12bit
720
+    paddd           m10, m0
721
+
722
+    lea             r0, [r0 + r1 * 4]
723
+    lea             r2, [r2 + r3 * 4]
724
+    call            sa8d_8x8_12bit
725
+    paddd           m10, m0
726
+
727
+    lea             r0, [r6 + 48]
728
+    lea             r2, [r7 + 48]
729
+    call            sa8d_8x8_12bit
730
+    paddd           m10, m0
731
+
732
+    lea             r0, [r0 + r1 * 4]
733
+    lea             r2, [r2 + r3 * 4]
734
+    call            sa8d_8x8_12bit
735
+    paddd           m0, m10
736
+
737
+    vextracti128    xm6, m0, 1
738
+    paddd           xm0, xm6
739
+
740
+    movhlps         xm6, xm0
741
+    paddd           xm0, xm6
742
+
743
+    pshuflw         xm6, xm0, 0Eh
744
+    paddd           xm0, xm6
745
+    paddd           xm0, [pd_1]
746
+    psrld           xm0, 1
747
+    paddd           xm11, xm0
748
+
749
+    lea             r6, [r6 + r1 * 8]
750
+    lea             r6, [r6 + r1 * 8]
751
+    lea             r7, [r7 + r3 * 8]
752
+    lea             r7, [r7 + r3 * 8]
753
+    pxor            m10, m10
754
+    mov             r0, r6
755
+    mov             r2, r7
756
+    call            sa8d_8x8_12bit
757
+    paddd           m10, m0
758
+
759
+    lea             r0, [r0 + r1 * 4]
760
+    lea             r2, [r2 + r3 * 4]
761
+    call            sa8d_8x8_12bit
762
+    paddd           m10, m0
763
+
764
+    lea             r0, [r6 + 16]
765
+    lea             r2, [r7 + 16]
766
+    call            sa8d_8x8_12bit
767
+    paddd           m10, m0
768
+
769
+    lea             r0, [r0 + r1 * 4]
770
+    lea             r2, [r2 + r3 * 4]
771
+    call            sa8d_8x8_12bit
772
+    paddd           m0, m10
773
+
774
+    vextracti128    xm6, m0, 1
775
+    paddd           xm0, xm6
776
+
777
+    movhlps         xm6, xm0
778
+    paddd           xm0, xm6
779
+
780
+    pshuflw         xm6, xm0, 0Eh
781
+    paddd           xm0, xm6
782
+    paddd           xm0, [pd_1]
783
+    psrld           xm0, 1
784
+    paddd           xm11, xm0
785
+
786
+    pxor            m10, m10
787
+    lea             r0, [r6 + 32]
788
+    lea             r2, [r7 + 32]
789
+    call            sa8d_8x8_12bit
790
+    paddd           m10, m0
791
+
792
+    lea             r0, [r0 + r1 * 4]
793
+    lea             r2, [r2 + r3 * 4]
794
+    call            sa8d_8x8_12bit
795
+    paddd           m10, m0
796
+
797
+    lea             r0, [r6 + 48]
798
+    lea             r2, [r7 + 48]
799
+    call            sa8d_8x8_12bit
800
+    paddd           m10, m0
801
+
802
+    lea             r0, [r0 + r1 * 4]
803
+    lea             r2, [r2 + r3 * 4]
804
+    call            sa8d_8x8_12bit
805
+    paddd           m0, m10
806
+
807
+    vextracti128    xm6, m0, 1
808
+    paddd           xm0, xm6
809
+
810
+    movhlps         xm6, xm0
811
+    paddd           xm0, xm6
812
+
813
+    pshuflw         xm6, xm0, 0Eh
814
+    paddd           xm0, xm6
815
+    paddd           xm0, [pd_1]
816
+    psrld           xm0, 1
817
+    paddd           xm11, xm0
818
+    movd            eax, xm11
819
+    RET
820
+
821
+cglobal pixel_sa8d_64x64, 4,8,12
822
+    add             r1d, r1d
823
+    add             r3d, r3d
824
+    lea             r4, [r1 + r1 * 2]
825
+    lea             r5, [r3 + r3 * 2]
826
+    mov             r6, r0
827
+    mov             r7, r2
828
+    pxor            m10, m10
829
+    pxor            m11, m11
830
+
831
+    call            sa8d_8x8_12bit
832
+    paddd           m10, m0
833
+
834
+    lea             r0, [r0 + r1 * 4]
835
+    lea             r2, [r2 + r3 * 4]
836
+    call            sa8d_8x8_12bit
837
+    paddd           m10, m0
838
+
839
+    lea             r0, [r6 + 16]
840
+    lea             r2, [r7 + 16]
841
+    call            sa8d_8x8_12bit
842
+    paddd           m10, m0
843
+
844
+    lea             r0, [r0 + r1 * 4]
845
+    lea             r2, [r2 + r3 * 4]
846
+    call            sa8d_8x8_12bit
847
+    paddd           m0, m10
848
+
849
+    vextracti128    xm6, m0, 1
850
+    paddd           xm0, xm6
851
+
852
+    movhlps         xm6, xm0
853
+    paddd           xm0, xm6
854
+
855
+    pshuflw         xm6, xm0, 0Eh
856
+    paddd           xm0, xm6
857
+    paddd           xm0, [pd_1]
858
+    psrld           xm0, 1
859
+    paddd           xm11, xm0
860
+
861
+    pxor            m10, m10
862
+    lea             r0, [r6 + 32]
863
+    lea             r2, [r7 + 32]
864
+    call            sa8d_8x8_12bit
865
+    paddd           m10, m0
866
+
867
+    lea             r0, [r0 + r1 * 4]
868
+    lea             r2, [r2 + r3 * 4]
869
+    call            sa8d_8x8_12bit
870
+    paddd           m10, m0
871
+
872
+    lea             r0, [r6 + 48]
873
+    lea             r2, [r7 + 48]
874
+    call            sa8d_8x8_12bit
875
+    paddd           m10, m0
876
+
877
+    lea             r0, [r0 + r1 * 4]
878
+    lea             r2, [r2 + r3 * 4]
879
+    call            sa8d_8x8_12bit
880
+    paddd           m0, m10
881
+
882
+    vextracti128    xm6, m0, 1
883
+    paddd           xm0, xm6
884
+
885
+    movhlps         xm6, xm0
886
+    paddd           xm0, xm6
887
+
888
+    pshuflw         xm6, xm0, 0Eh
889
+    paddd           xm0, xm6
890
+    paddd           xm0, [pd_1]
891
+    psrld           xm0, 1
892
+    paddd           xm11, xm0
893
+
894
+    pxor            m10, m10
895
+    lea             r0, [r6 + 64]
896
+    lea             r2, [r7 + 64]
897
+    call            sa8d_8x8_12bit
898
+    paddd           m10, m0
899
+
900
+    lea             r0, [r0 + r1 * 4]
901
+    lea             r2, [r2 + r3 * 4]
902
+    call            sa8d_8x8_12bit
903
+    paddd           m10, m0
904
+
905
+    lea             r0, [r6 + 80]
906
+    lea             r2, [r7 + 80]
907
+    call            sa8d_8x8_12bit
908
+    paddd           m10, m0
909
+
910
+    lea             r0, [r0 + r1 * 4]
911
+    lea             r2, [r2 + r3 * 4]
912
+    call            sa8d_8x8_12bit
913
+    paddd           m0, m10
914
+
915
+    vextracti128    xm6, m0, 1
916
+    paddd           xm0, xm6
917
+
918
+    movhlps         xm6, xm0
919
+    paddd           xm0, xm6
920
+
921
+    pshuflw         xm6, xm0, 0Eh
922
+    paddd           xm0, xm6
923
+    paddd           xm0, [pd_1]
924
+    psrld           xm0, 1
925
+    paddd           xm11, xm0
926
+
927
+    pxor            m10, m10
928
+    lea             r0, [r6 + 96]
929
+    lea             r2, [r7 + 96]
930
+    call            sa8d_8x8_12bit
931
+    paddd           m10, m0
932
+
933
+    lea             r0, [r0 + r1 * 4]
934
+    lea             r2, [r2 + r3 * 4]
935
+    call            sa8d_8x8_12bit
936
+    paddd           m10, m0
937
+
938
+    lea             r0, [r6 + 112]
939
+    lea             r2, [r7 + 112]
940
+    call            sa8d_8x8_12bit
941
+    paddd           m10, m0
942
+
943
+    lea             r0, [r0 + r1 * 4]
944
+    lea             r2, [r2 + r3 * 4]
945
+    call            sa8d_8x8_12bit
946
+    paddd           m0, m10
947
+
948
+    vextracti128    xm6, m0, 1
949
+    paddd           xm0, xm6
950
+
951
+    movhlps         xm6, xm0
952
+    paddd           xm0, xm6
953
+
954
+    pshuflw         xm6, xm0, 0Eh
955
+    paddd           xm0, xm6
956
+    paddd           xm0, [pd_1]
957
+    psrld           xm0, 1
958
+    paddd           xm11, xm0
959
+
960
+    lea             r6, [r6 + r1 * 8]
961
+    lea             r6, [r6 + r1 * 8]
962
+    lea             r7, [r7 + r3 * 8]
963
+    lea             r7, [r7 + r3 * 8]
964
+    pxor            m10, m10
965
+    mov             r0, r6
966
+    mov             r2, r7
967
+    call            sa8d_8x8_12bit
968
+    paddd           m10, m0
969
+
970
+    lea             r0, [r0 + r1 * 4]
971
+    lea             r2, [r2 + r3 * 4]
972
+    call            sa8d_8x8_12bit
973
+    paddd           m10, m0
974
+
975
+    lea             r0, [r6 + 16]
976
+    lea             r2, [r7 + 16]
977
+    call            sa8d_8x8_12bit
978
+    paddd           m10, m0
979
+
980
+    lea             r0, [r0 + r1 * 4]
981
+    lea             r2, [r2 + r3 * 4]
982
+    call            sa8d_8x8_12bit
983
+    paddd           m0, m10
984
+
985
+    vextracti128    xm6, m0, 1
986
+    paddd           xm0, xm6
987
+
988
+    movhlps         xm6, xm0
989
+    paddd           xm0, xm6
990
+
991
+    pshuflw         xm6, xm0, 0Eh
992
+    paddd           xm0, xm6
993
+    paddd           xm0, [pd_1]
994
+    psrld           xm0, 1
995
+    paddd           xm11, xm0
996
+
997
+    pxor            m10, m10
998
+    lea             r0, [r6 + 32]
999
+    lea             r2, [r7 + 32]
1000
+    call            sa8d_8x8_12bit
1001
+    paddd           m10, m0
1002
+
1003
+    lea             r0, [r0 + r1 * 4]
1004
+    lea             r2, [r2 + r3 * 4]
1005
+    call            sa8d_8x8_12bit
1006
+    paddd           m10, m0
1007
+
1008
+    lea             r0, [r6 + 48]
1009
+    lea             r2, [r7 + 48]
1010
+    call            sa8d_8x8_12bit
1011
+    paddd           m10, m0
1012
+
1013
+    lea             r0, [r0 + r1 * 4]
1014
+    lea             r2, [r2 + r3 * 4]
1015
+    call            sa8d_8x8_12bit
1016
+    paddd           m0, m10
1017
+
1018
+    vextracti128    xm6, m0, 1
1019
+    paddd           xm0, xm6
1020
+
1021
+    movhlps         xm6, xm0
1022
+    paddd           xm0, xm6
1023
+
1024
+    pshuflw         xm6, xm0, 0Eh
1025
+    paddd           xm0, xm6
1026
+    paddd           xm0, [pd_1]
1027
+    psrld           xm0, 1
1028
+    paddd           xm11, xm0
1029
+
1030
+    pxor            m10, m10
1031
+    lea             r0, [r6 + 64]
1032
+    lea             r2, [r7 + 64]
1033
+    call            sa8d_8x8_12bit
1034
+    paddd           m10, m0
1035
+
1036
+    lea             r0, [r0 + r1 * 4]
1037
+    lea             r2, [r2 + r3 * 4]
1038
+    call            sa8d_8x8_12bit
1039
+    paddd           m10, m0
1040
+
1041
+    lea             r0, [r6 + 80]
1042
+    lea             r2, [r7 + 80]
1043
+    call            sa8d_8x8_12bit
1044
+    paddd           m10, m0
1045
+
1046
+    lea             r0, [r0 + r1 * 4]
1047
+    lea             r2, [r2 + r3 * 4]
1048
+    call            sa8d_8x8_12bit
1049
+    paddd           m0, m10
1050
+
1051
+    vextracti128    xm6, m0, 1
1052
+    paddd           xm0, xm6
1053
+
1054
+    movhlps         xm6, xm0
1055
+    paddd           xm0, xm6
1056
+
1057
+    pshuflw         xm6, xm0, 0Eh
1058
+    paddd           xm0, xm6
1059
+    paddd           xm0, [pd_1]
1060
+    psrld           xm0, 1
1061
+    paddd           xm11, xm0
1062
+
1063
+    pxor            m10, m10
1064
+    lea             r0, [r6 + 96]
1065
+    lea             r2, [r7 + 96]
1066
+    call            sa8d_8x8_12bit
1067
+    paddd           m10, m0
1068
+
1069
+    lea             r0, [r0 + r1 * 4]
1070
+    lea             r2, [r2 + r3 * 4]
1071
+    call            sa8d_8x8_12bit
1072
+    paddd           m10, m0
1073
+
1074
+    lea             r0, [r6 + 112]
1075
+    lea             r2, [r7 + 112]
1076
+    call            sa8d_8x8_12bit
1077
+    paddd           m10, m0
1078
+
1079
+    lea             r0, [r0 + r1 * 4]
1080
+    lea             r2, [r2 + r3 * 4]
1081
+    call            sa8d_8x8_12bit
1082
+    paddd           m0, m10
1083
+
1084
+    vextracti128    xm6, m0, 1
1085
+    paddd           xm0, xm6
1086
+
1087
+    movhlps         xm6, xm0
1088
+    paddd           xm0, xm6
1089
+
1090
+    pshuflw         xm6, xm0, 0Eh
1091
+    paddd           xm0, xm6
1092
+    paddd           xm0, [pd_1]
1093
+    psrld           xm0, 1
1094
+    paddd           xm11, xm0
1095
+
1096
+    lea             r6, [r6 + r1 * 8]
1097
+    lea             r6, [r6 + r1 * 8]
1098
+    lea             r7, [r7 + r3 * 8]
1099
+    lea             r7, [r7 + r3 * 8]
1100
+    pxor            m10, m10
1101
+    mov             r0, r6
1102
+    mov             r2, r7
1103
+    call            sa8d_8x8_12bit
1104
+    paddd           m10, m0
1105
+
1106
+    lea             r0, [r0 + r1 * 4]
1107
+    lea             r2, [r2 + r3 * 4]
1108
+    call            sa8d_8x8_12bit
1109
+    paddd           m10, m0
1110
+
1111
+    lea             r0, [r6 + 16]
1112
+    lea             r2, [r7 + 16]
1113
+    call            sa8d_8x8_12bit
1114
+    paddd           m10, m0
1115
+
1116
+    lea             r0, [r0 + r1 * 4]
1117
+    lea             r2, [r2 + r3 * 4]
1118
+    call            sa8d_8x8_12bit
1119
+    paddd           m0, m10
1120
+
1121
+    vextracti128    xm6, m0, 1
1122
+    paddd           xm0, xm6
1123
+
1124
+    movhlps         xm6, xm0
1125
+    paddd           xm0, xm6
1126
+
1127
+    pshuflw         xm6, xm0, 0Eh
1128
+    paddd           xm0, xm6
1129
+    paddd           xm0, [pd_1]
1130
+    psrld           xm0, 1
1131
+    paddd           xm11, xm0
1132
+
1133
+    pxor            m10, m10
1134
+    lea             r0, [r6 + 32]
1135
+    lea             r2, [r7 + 32]
1136
+    call            sa8d_8x8_12bit
1137
+    paddd           m10, m0
1138
+
1139
+    lea             r0, [r0 + r1 * 4]
1140
+    lea             r2, [r2 + r3 * 4]
1141
+    call            sa8d_8x8_12bit
1142
+    paddd           m10, m0
1143
+
1144
+    lea             r0, [r6 + 48]
1145
+    lea             r2, [r7 + 48]
1146
+    call            sa8d_8x8_12bit
1147
+    paddd           m10, m0
1148
+
1149
+    lea             r0, [r0 + r1 * 4]
1150
+    lea             r2, [r2 + r3 * 4]
1151
+    call            sa8d_8x8_12bit
1152
+    paddd           m0, m10
1153
+
1154
+    vextracti128    xm6, m0, 1
1155
+    paddd           xm0, xm6
1156
+
1157
+    movhlps         xm6, xm0
1158
+    paddd           xm0, xm6
1159
+
1160
+    pshuflw         xm6, xm0, 0Eh
1161
+    paddd           xm0, xm6
1162
+    paddd           xm0, [pd_1]
1163
+    psrld           xm0, 1
1164
+    paddd           xm11, xm0
1165
+
1166
+    pxor            m10, m10
1167
+    lea             r0, [r6 + 64]
1168
+    lea             r2, [r7 + 64]
1169
+    call            sa8d_8x8_12bit
1170
+    paddd           m10, m0
1171
+
1172
+    lea             r0, [r0 + r1 * 4]
1173
+    lea             r2, [r2 + r3 * 4]
1174
+    call            sa8d_8x8_12bit
1175
+    paddd           m10, m0
1176
+
1177
+    lea             r0, [r6 + 80]
1178
+    lea             r2, [r7 + 80]
1179
+    call            sa8d_8x8_12bit
1180
+    paddd           m10, m0
1181
+
1182
+    lea             r0, [r0 + r1 * 4]
1183
+    lea             r2, [r2 + r3 * 4]
1184
+    call            sa8d_8x8_12bit
1185
+    paddd           m0, m10
1186
+
1187
+    vextracti128    xm6, m0, 1
1188
+    paddd           xm0, xm6
1189
+
1190
+    movhlps         xm6, xm0
1191
+    paddd           xm0, xm6
1192
+
1193
+    pshuflw         xm6, xm0, 0Eh
1194
+    paddd           xm0, xm6
1195
+    paddd           xm0, [pd_1]
1196
+    psrld           xm0, 1
1197
+    paddd           xm11, xm0
1198
+
1199
+    pxor            m10, m10
1200
+    lea             r0, [r6 + 96]
1201
+    lea             r2, [r7 + 96]
1202
+    call            sa8d_8x8_12bit
1203
+    paddd           m10, m0
1204
+
1205
+    lea             r0, [r0 + r1 * 4]
1206
+    lea             r2, [r2 + r3 * 4]
1207
+    call            sa8d_8x8_12bit
1208
+    paddd           m10, m0
1209
+
1210
+    lea             r0, [r6 + 112]
1211
+    lea             r2, [r7 + 112]
1212
+    call            sa8d_8x8_12bit
1213
+    paddd           m10, m0
1214
+
1215
+    lea             r0, [r0 + r1 * 4]
1216
+    lea             r2, [r2 + r3 * 4]
1217
+    call            sa8d_8x8_12bit
1218
+    paddd           m0, m10
1219
+
1220
+    vextracti128    xm6, m0, 1
1221
+    paddd           xm0, xm6
1222
+
1223
+    movhlps         xm6, xm0
1224
+    paddd           xm0, xm6
1225
+
1226
+    pshuflw         xm6, xm0, 0Eh
1227
+    paddd           xm0, xm6
1228
+    paddd           xm0, [pd_1]
1229
+    psrld           xm0, 1
1230
+    paddd           xm11, xm0
1231
+
1232
+    lea             r6, [r6 + r1 * 8]
1233
+    lea             r6, [r6 + r1 * 8]
1234
+    lea             r7, [r7 + r3 * 8]
1235
+    lea             r7, [r7 + r3 * 8]
1236
+    pxor            m10, m10
1237
+    mov             r0, r6
1238
+    mov             r2, r7
1239
+    call            sa8d_8x8_12bit
1240
+    paddd           m10, m0
1241
+
1242
+    lea             r0, [r0 + r1 * 4]
1243
+    lea             r2, [r2 + r3 * 4]
1244
+    call            sa8d_8x8_12bit
1245
+    paddd           m10, m0
1246
+
1247
+    lea             r0, [r6 + 16]
1248
+    lea             r2, [r7 + 16]
1249
+    call            sa8d_8x8_12bit
1250
+    paddd           m10, m0
1251
+
1252
+    lea             r0, [r0 + r1 * 4]
1253
+    lea             r2, [r2 + r3 * 4]
1254
+    call            sa8d_8x8_12bit
1255
+    paddd           m0, m10
1256
+
1257
+    vextracti128    xm6, m0, 1
1258
+    paddd           xm0, xm6
1259
+
1260
+    movhlps         xm6, xm0
1261
+    paddd           xm0, xm6
1262
+
1263
+    pshuflw         xm6, xm0, 0Eh
1264
+    paddd           xm0, xm6
1265
+    paddd           xm0, [pd_1]
1266
+    psrld           xm0, 1
1267
+    paddd           xm11, xm0
1268
+
1269
+    pxor            m10, m10
1270
+    lea             r0, [r6 + 32]
1271
+    lea             r2, [r7 + 32]
1272
+    call            sa8d_8x8_12bit
1273
+    paddd           m10, m0
1274
+
1275
+    lea             r0, [r0 + r1 * 4]
1276
+    lea             r2, [r2 + r3 * 4]
1277
+    call            sa8d_8x8_12bit
1278
+    paddd           m10, m0
1279
+
1280
+    lea             r0, [r6 + 48]
1281
+    lea             r2, [r7 + 48]
1282
+    call            sa8d_8x8_12bit
1283
+    paddd           m10, m0
1284
+
1285
+    lea             r0, [r0 + r1 * 4]
1286
+    lea             r2, [r2 + r3 * 4]
1287
+    call            sa8d_8x8_12bit
1288
+    paddd           m0, m10
1289
+
1290
+    vextracti128    xm6, m0, 1
1291
+    paddd           xm0, xm6
1292
+
1293
+    movhlps         xm6, xm0
1294
+    paddd           xm0, xm6
1295
+
1296
+    pshuflw         xm6, xm0, 0Eh
1297
+    paddd           xm0, xm6
1298
+    paddd           xm0, [pd_1]
1299
+    psrld           xm0, 1
1300
+    paddd           xm11, xm0
1301
+
1302
+    pxor            m10, m10
1303
+    lea             r0, [r6 + 64]
1304
+    lea             r2, [r7 + 64]
1305
+    call            sa8d_8x8_12bit
1306
+    paddd           m10, m0
1307
+
1308
+    lea             r0, [r0 + r1 * 4]
1309
+    lea             r2, [r2 + r3 * 4]
1310
+    call            sa8d_8x8_12bit
1311
+    paddd           m10, m0
1312
+
1313
+    lea             r0, [r6 + 80]
1314
+    lea             r2, [r7 + 80]
1315
+    call            sa8d_8x8_12bit
1316
+    paddd           m10, m0
1317
+
1318
+    lea             r0, [r0 + r1 * 4]
1319
+    lea             r2, [r2 + r3 * 4]
1320
+    call            sa8d_8x8_12bit
1321
+    paddd           m0, m10
1322
+
1323
+    vextracti128    xm6, m0, 1
1324
+    paddd           xm0, xm6
1325
+
1326
+    movhlps         xm6, xm0
1327
+    paddd           xm0, xm6
1328
+
1329
+    pshuflw         xm6, xm0, 0Eh
1330
+    paddd           xm0, xm6
1331
+    paddd           xm0, [pd_1]
1332
+    psrld           xm0, 1
1333
+    paddd           xm11, xm0
1334
+
1335
+    pxor            m10, m10
1336
+    lea             r0, [r6 + 96]
1337
+    lea             r2, [r7 + 96]
1338
+    call            sa8d_8x8_12bit
1339
+    paddd           m10, m0
1340
+
1341
+    lea             r0, [r0 + r1 * 4]
1342
+    lea             r2, [r2 + r3 * 4]
1343
+    call            sa8d_8x8_12bit
1344
+    paddd           m10, m0
1345
+
1346
+    lea             r0, [r6 + 112]
1347
+    lea             r2, [r7 + 112]
1348
+    call            sa8d_8x8_12bit
1349
+    paddd           m10, m0
1350
+
1351
+    lea             r0, [r0 + r1 * 4]
1352
+    lea             r2, [r2 + r3 * 4]
1353
+    call            sa8d_8x8_12bit
1354
+    paddd           m0, m10
1355
+
1356
+    vextracti128    xm6, m0, 1
1357
+    paddd           xm0, xm6
1358
+
1359
+    movhlps         xm6, xm0
1360
+    paddd           xm0, xm6
1361
+
1362
+    pshuflw         xm6, xm0, 0Eh
1363
+    paddd           xm0, xm6
1364
+    paddd           xm0, [pd_1]
1365
+    psrld           xm0, 1
1366
+    paddd           xm11, xm0
1367
+    movd            eax, xm11
1368
+    RET
1369
+%endif
1370
+
1371
+
1372
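Every pass in the routine above repeats one pattern: four sa8d_8x8_12bit results are accumulated in m10, the 256-bit sum is folded down to a single dword (vextracti128 / movhlps / pshuflw), rounded with (sum + 1) >> 1 via pd_1 and psrld, and added to the running total in xm11. A minimal C sketch of that per-group step, assuming a hypothetical sa8d_8x8() helper standing in for the asm subroutine (block offsets are illustrative, and the helper's normalization is assumed, not taken from the source):

    #include <stdint.h>

    /* Hypothetical stand-in for the asm's sa8d_8x8_12bit subroutine. */
    uint32_t sa8d_8x8(const uint16_t *pix1, intptr_t stride1,
                      const uint16_t *pix2, intptr_t stride2);

    /* One accumulation group, mirroring the asm: sum four 8x8 partial
     * costs, then round-halve before adding to the running total
     * ("paddd m0, [pd_1]; psrld m0, 1; paddd xm11, xm0"). */
    static void sa8d_group_sketch(const uint16_t *p1, intptr_t s1,
                                  const uint16_t *p2, intptr_t s2,
                                  uint32_t *total)
    {
        uint32_t group = 0;
        group += sa8d_8x8(p1,              s1, p2,              s2);
        group += sa8d_8x8(p1 + 8,          s1, p2 + 8,          s2);
        group += sa8d_8x8(p1 + 8 * s1,     s1, p2 + 8 * s2,     s2);
        group += sa8d_8x8(p1 + 8 * s1 + 8, s1, p2 + 8 * s2 + 8, s2);
        *total += (group + 1) >> 1;    /* rounded halving, as in the asm */
    }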
 ;=============================================================================
1373
 ; INTRA SATD
1374
 ;=============================================================================
1375
@@ -6508,7 +7861,9 @@
1376
 %define movdqu movups
1377
 %define punpcklqdq movlhps
1378
 INIT_XMM sse2
1379
+%if BIT_DEPTH <= 10
1380
 SA8D
1381
+%endif
1382
 SATDS_SSE2
1383
 
1384
 %if HIGH_BIT_DEPTH == 0
1385
@@ -6524,8 +7879,10 @@
1386
 %define LOAD_SUMSUB_16P  LOAD_SUMSUB_16P_SSSE3
1387
 %endif
1388
 INIT_XMM ssse3
1389
-SATDS_SSE2
1390
+%if BIT_DEPTH <= 10
1391
 SA8D
1392
+%endif
1393
+SATDS_SSE2
1394
 %undef movdqa ; nehalem doesn't like movaps
1395
 %undef movdqu ; movups
1396
 %undef punpcklqdq ; or movlhps
1397
@@ -6533,21 +7890,24 @@
1398
 %define TRANS TRANS_SSE4
1399
 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
1400
 INIT_XMM sse4
1401
-SATDS_SSE2
1402
+%if BIT_DEPTH <= 10
1403
 SA8D
1404
+%endif
1405
+SATDS_SSE2
1406
 
1407
 ; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so
1408
 ; it's effectively free.
1409
 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
1410
 INIT_XMM avx
1411
-SATDS_SSE2
1412
 SA8D
1413
+SATDS_SSE2
1414
 
1415
 %define TRANS TRANS_XOP
1416
 INIT_XMM xop
1417
-SATDS_SSE2
1418
+%if BIT_DEPTH <= 10
1419
 SA8D
1420
-
1421
+%endif
1422
+SATDS_SSE2
1423
 
1424
 %if HIGH_BIT_DEPTH == 0
1425
 %define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2
1426
@@ -6555,34 +7915,39 @@
1427
 %define TRANS TRANS_SSE4
1428
 
1429
 %macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul
1430
-    movq   xm%1, [r0]
1431
-    movq   xm%3, [r2]
1432
-    movq   xm%2, [r0+r1]
1433
-    movq   xm%4, [r2+r3]
1434
-    vinserti128 m%1, m%1, [r0+4*r1], 1
1435
-    vinserti128 m%3, m%3, [r2+4*r3], 1
1436
-    vinserti128 m%2, m%2, [r0+r4], 1
1437
-    vinserti128 m%4, m%4, [r2+r5], 1
1438
-    punpcklqdq m%1, m%1
1439
-    punpcklqdq m%3, m%3
1440
-    punpcklqdq m%2, m%2
1441
-    punpcklqdq m%4, m%4
1442
+    movddup xm%1, [r0]
1443
+    movddup xm%3, [r2]
1444
+    movddup xm%2, [r0+4*r1]
1445
+    movddup xm%5, [r2+4*r3]
1446
+    vinserti128 m%1, m%1, xm%2, 1
1447
+    vinserti128 m%3, m%3, xm%5, 1
1448
+
1449
+    movddup xm%2, [r0+r1]
1450
+    movddup xm%4, [r2+r3]
1451
+    movddup xm%5, [r0+r4]
1452
+    movddup xm%6, [r2+r5]
1453
+    vinserti128 m%2, m%2, xm%5, 1
1454
+    vinserti128 m%4, m%4, xm%6, 1
1455
+
1456
     DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7
1457
     lea      r0, [r0+2*r1]
1458
     lea      r2, [r2+2*r3]
1459
 
1460
-    movq   xm%3, [r0]
1461
-    movq   xm%5, [r2]
1462
-    movq   xm%4, [r0+r1]
1463
+    movddup xm%3, [r0]
1464
+    movddup xm%5, [r0+4*r1]
1465
+    vinserti128 m%3, m%3, xm%5, 1
1466
+
1467
+    movddup xm%5, [r2]
1468
+    movddup xm%4, [r2+4*r3]
1469
+    vinserti128 m%5, m%5, xm%4, 1
1470
+
1471
+    movddup xm%4, [r0+r1]
1472
+    movddup xm%6, [r0+r4]
1473
+    vinserti128 m%4, m%4, xm%6, 1
1474
+
1475
     movq   xm%6, [r2+r3]
1476
-    vinserti128 m%3, m%3, [r0+4*r1], 1
1477
-    vinserti128 m%5, m%5, [r2+4*r3], 1
1478
-    vinserti128 m%4, m%4, [r0+r4], 1
1479
-    vinserti128 m%6, m%6, [r2+r5], 1
1480
-    punpcklqdq m%3, m%3
1481
-    punpcklqdq m%5, m%5
1482
-    punpcklqdq m%4, m%4
1483
-    punpcklqdq m%6, m%6
1484
+    movhps xm%6, [r2+r5]
1485
+    vpermq m%6, m%6, q1100
1486
     DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7
1487
 %endmacro
1488
 
1489
@@ -6789,92 +8154,57 @@
1490
 ;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
1491
 ;------------------------------------------------------------------------------------------------------------------------
1492
 INIT_XMM sse2
1493
-cglobal downShift_16, 7,7,3
1494
-    movd        m0, r6d        ; m0 = shift
1495
+cglobal downShift_16, 4,7,3
1496
+    mov         r4d, r4m
1497
+    mov         r5d, r5m
1498
+    movd        m0, r6m        ; m0 = shift
1499
     add         r1, r1
1500
+
1501
     dec         r5d
1502
 .loopH:
1503
     xor         r6, r6
1504
+
1505
 .loopW:
1506
     movu        m1, [r0 + r6 * 2]
1507
-    movu        m2, [r0 + r6 * 2 + 16]
1508
+    movu        m2, [r0 + r6 * 2 + mmsize]
1509
     psrlw       m1, m0
1510
     psrlw       m2, m0
1511
     packuswb    m1, m2
1512
     movu        [r2 + r6], m1
1513
 
1514
-    add         r6, 16
1515
+    add         r6, mmsize
1516
     cmp         r6d, r4d
1517
-    jl          .loopW
1518
+    jl         .loopW
1519
 
1520
     ; move to next row
1521
     add         r0, r1
1522
     add         r2, r3
1523
     dec         r5d
1524
-    jnz         .loopH
1525
-
1526
-;processing last row of every frame [To handle width which not a multiple of 16]
1527
+    jnz        .loopH
1528
 
1529
+    ; processing the last row of every frame [to handle widths that are not a multiple of 16]
1530
+    ; r4d must be at least mmsize (16) here
1531
 .loop16:
1532
-    movu        m1, [r0]
1533
-    movu        m2, [r0 + 16]
1534
+    movu        m1, [r0 + (r4 - mmsize) * 2]
1535
+    movu        m2, [r0 + (r4 - mmsize) * 2 + mmsize]
1536
     psrlw       m1, m0
1537
     psrlw       m2, m0
1538
     packuswb    m1, m2
1539
-    movu        [r2], m1
1540
+    movu        [r2 + r4 - mmsize], m1
1541
 
1542
-    add         r0, 2 * mmsize
1543
-    add         r2, mmsize
1544
-    sub         r4d, 16
1545
-    jz          .end
1546
-    cmp         r4d, 15
1547
-    jg          .loop16
1548
+    sub         r4d, mmsize
1549
+    jz         .end
1550
+    cmp         r4d, mmsize
1551
+    jge        .loop16
1552
 
1553
-    cmp         r4d, 8
1554
-    jl          .process4
1555
+    ; process partial pixels
1556
     movu        m1, [r0]
1557
+    movu        m2, [r0 + mmsize]
1558
     psrlw       m1, m0
1559
-    packuswb    m1, m1
1560
-    movh        [r2], m1
1561
-
1562
-    add         r0, mmsize
1563
-    add         r2, 8
1564
-    sub         r4d, 8
1565
-    jz          .end
1566
-
1567
-.process4:
1568
-    cmp         r4d, 4
1569
-    jl          .process2
1570
-    movh        m1,[r0]
1571
-    psrlw       m1, m0
1572
-    packuswb    m1, m1
1573
-    movd        [r2], m1
1574
-
1575
-    add         r0, 8
1576
-    add         r2, 4
1577
-    sub         r4d, 4
1578
-    jz          .end
1579
-
1580
-.process2:
1581
-    cmp         r4d, 2
1582
-    jl          .process1
1583
-    movd        m1, [r0]
1584
-    psrlw       m1, m0
1585
-    packuswb    m1, m1
1586
-    movd        r6, m1
1587
-    mov         [r2], r6w
1588
-
1589
-    add         r0, 4
1590
-    add         r2, 2
1591
-    sub         r4d, 2
1592
-    jz          .end
1593
+    psrlw       m2, m0
1594
+    packuswb    m1, m2
1595
+    movu        [r2], m1
1596
 
1597
-.process1:
1598
-    movd        m1, [r0]
1599
-    psrlw       m1, m0
1600
-    packuswb    m1, m1
1601
-    movd        r3, m1
1602
-    mov         [r2], r3b
1603
 .end:
1604
     RET
1605
 
1606
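The tail handling above replaces the old 8/4/2/1-pixel peeling: the last row is processed in full 16-pixel (mmsize) chunks anchored at the right edge, and whatever remains is covered by one extra chunk anchored at column 0 that overlaps pixels already written. A scalar sketch of the scheme, under the asm's stated assumption that width >= 16 (sat_u8 mirrors the saturation of packuswb):

    #include <stdint.h>

    enum { CHUNK = 16 };   /* mmsize pixels per step */

    static inline uint8_t sat_u8(uint32_t v) { return v > 255 ? 255 : (uint8_t)v; }

    /* Overlapped-tail scheme of the asm above: full chunks anchored at
     * (w - CHUNK) walking leftward; any remainder is one more chunk at
     * offset 0 that overlaps pixels already converted. */
    static void downshift_last_row(const uint16_t *src, uint8_t *dst,
                                   int width, int shift)
    {
        int w = width;                     /* precondition: width >= CHUNK */
        while (w >= CHUNK) {
            for (int i = 0; i < CHUNK; i++)
                dst[w - CHUNK + i] = sat_u8((uint32_t)src[w - CHUNK + i] >> shift);
            w -= CHUNK;
            if (w == 0)
                return;
        }
        for (int i = 0; i < CHUNK; i++)    /* overlapping final chunk */
            dst[i] = sat_u8((uint32_t)src[i] >> shift);
    }

Every store stays full-width; the only cost is re-converting up to 15 pixels, which is cheaper than four scalar fix-up branches.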
@@ -6883,12 +8213,16 @@
1607
 ;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
1608
 ;-------------------------------------------------------------------------------------------------------------------------------------
1609
 INIT_YMM avx2
1610
-cglobal downShift_16, 6,7,3
1611
+cglobal downShift_16, 4,7,3
1612
+    mov         r4d, r4m
1613
+    mov         r5d, r5m
1614
     movd        xm0, r6m        ; m0 = shift
1615
     add         r1d, r1d
1616
+
1617
     dec         r5d
1618
 .loopH:
1619
     xor         r6, r6
1620
+
1621
 .loopW:
1622
     movu        m1, [r0 + r6 * 2 +  0]
1623
     movu        m2, [r0 + r6 * 2 + 32]
1624
@@ -6900,92 +8234,39 @@
1625
 
1626
     add         r6d, mmsize
1627
     cmp         r6d, r4d
1628
-    jl          .loopW
1629
+    jl         .loopW
1630
 
1631
     ; move to next row
1632
     add         r0, r1
1633
     add         r2, r3
1634
     dec         r5d
1635
-    jnz         .loopH
1636
+    jnz        .loopH
1637
 
1638
-; processing last row of every frame [To handle width which not a multiple of 32]
1639
-    mov         r6d, r4d
1640
-    and         r4d, 31
1641
-    shr         r6d, 5
1642
+    ; processing the last row of every frame [to handle widths that are not a multiple of 32]
1643
 
1644
 .loop32:
1645
-    movu        m1, [r0]
1646
-    movu        m2, [r0 + 32]
1647
+    movu        m1, [r0 + (r4 - mmsize) * 2]
1648
+    movu        m2, [r0 + (r4 - mmsize) * 2 + mmsize]
1649
     psrlw       m1, xm0
1650
     psrlw       m2, xm0
1651
     packuswb    m1, m2
1652
-    vpermq      m1, m1, 11011000b
1653
-    movu        [r2], m1
1654
+    vpermq      m1, m1, q3120
1655
+    movu        [r2 + r4 - mmsize], m1
1656
 
1657
-    add         r0, 2*mmsize
1658
-    add         r2, mmsize
1659
-    dec         r6d
1660
-    jnz         .loop32
1661
+    sub         r4d, mmsize
1662
+    jz         .end
1663
+    cmp         r4d, mmsize
1664
+    jge        .loop32
1665
 
1666
-    cmp         r4d, 16
1667
-    jl          .process8
1668
+    ; process partial pixels
1669
     movu        m1, [r0]
1670
+    movu        m2, [r0 + mmsize]
1671
     psrlw       m1, xm0
1672
-    packuswb    m1, m1
1673
-    vpermq      m1, m1, 10001000b
1674
-    movu        [r2], xm1
1675
-
1676
-    add         r0, mmsize
1677
-    add         r2, 16
1678
-    sub         r4d, 16
1679
-    jz          .end
1680
-
1681
-.process8:
1682
-    cmp         r4d, 8
1683
-    jl          .process4
1684
-    movu        m1, [r0]
1685
-    psrlw       m1, xm0
1686
-    packuswb    m1, m1
1687
-    movq        [r2], xm1
1688
-
1689
-    add         r0, 16
1690
-    add         r2, 8
1691
-    sub         r4d, 8
1692
-    jz          .end
1693
-
1694
-.process4:
1695
-    cmp         r4d, 4
1696
-    jl          .process2
1697
-    movq        xm1,[r0]
1698
-    psrlw       m1, xm0
1699
-    packuswb    m1, m1
1700
-    movd        [r2], xm1
1701
-
1702
-    add         r0, 8
1703
-    add         r2, 4
1704
-    sub         r4d, 4
1705
-    jz          .end
1706
-
1707
-.process2:
1708
-    cmp         r4d, 2
1709
-    jl          .process1
1710
-    movd        xm1, [r0]
1711
-    psrlw       m1, xm0
1712
-    packuswb    m1, m1
1713
-    movd        r6d, xm1
1714
-    mov         [r2], r6w
1715
-
1716
-    add         r0, 4
1717
-    add         r2, 2
1718
-    sub         r4d, 2
1719
-    jz          .end
1720
+    psrlw       m2, xm0
1721
+    packuswb    m1, m2
1722
+    vpermq      m1, m1, q3120
1723
+    movu        [r2], m1
1724
 
1725
-.process1:
1726
-    movd        xm1, [r0]
1727
-    psrlw       m1, xm0
1728
-    packuswb    m1, m1
1729
-    movd        r3d, xm1
1730
-    mov         [r2], r3b
1731
 .end:
1732
     RET
1733
 
1734
@@ -7122,7 +8403,9 @@
1735
 ;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
1736
 ;------------------------------------------------------------------------------------------------------------------------
1737
 INIT_XMM sse2
1738
-cglobal upShift_16, 6,7,4
1739
+cglobal upShift_16, 4,7,4
1740
+    mov         r4d, r4m
1741
+    mov         r5d, r5m
1742
     movd        m0, r6m        ; m0 = shift
1743
     mova        m3, [pw_pixel_max]
1744
     FIX_STRIDES r1d, r3d
1745
@@ -7150,68 +8433,34 @@
1746
     dec         r5d
1747
     jnz        .loopH
1748
 
1749
-;processing last row of every frame [To handle width which not a multiple of 16]
1750
+    ; processing the last row of every frame [to handle widths that are not a multiple of 16]
1751
 
1752
+    ; WARNING: width (r4d) MUST be at least mmsize (16) here
1753
 .loop16:
1754
-    movu        m1, [r0]
1755
-    movu        m2, [r0 + mmsize]
1756
+    movu        m1, [r0 + (r4 - mmsize) * 2]
1757
+    movu        m2, [r0 + (r4 - mmsize) * 2 + mmsize]
1758
     psllw       m1, m0
1759
     psllw       m2, m0
1760
     pand        m1, m3
1761
     pand        m2, m3
1762
-    movu        [r2], m1
1763
-    movu        [r2 + mmsize], m2
1764
+    movu        [r2 + (r4 - mmsize) * 2], m1
1765
+    movu        [r2 + (r4 - mmsize) * 2 + mmsize], m2
1766
 
1767
-    add         r0, 2 * mmsize
1768
-    add         r2, 2 * mmsize
1769
-    sub         r4d, 16
1770
+    sub         r4d, mmsize
1771
     jz         .end
1772
-    jg         .loop16
1773
+    cmp         r4d, mmsize
1774
+    jge        .loop16
1775
 
1776
-    cmp         r4d, 8
1777
-    jl         .process4
1778
+    ; process partial pixels
1779
     movu        m1, [r0]
1780
-    psrlw       m1, m0
1781
-    pand        m1, m3
1782
-    movu        [r2], m1
1783
-
1784
-    add         r0, mmsize
1785
-    add         r2, mmsize
1786
-    sub         r4d, 8
1787
-    jz          .end
1788
-
1789
-.process4:
1790
-    cmp         r4d, 4
1791
-    jl         .process2
1792
-    movh        m1,[r0]
1793
-    psllw       m1, m0
1794
-    pand        m1, m3
1795
-    movh        [r2], m1
1796
-
1797
-    add         r0, 8
1798
-    add         r2, 8
1799
-    sub         r4d, 4
1800
-    jz         .end
1801
-
1802
-.process2:
1803
-    cmp         r4d, 2
1804
-    jl         .process1
1805
-    movd        m1, [r0]
1806
+    movu        m2, [r0 + mmsize]
1807
     psllw       m1, m0
1808
+    psllw       m2, m0
1809
     pand        m1, m3
1810
-    movd        [r2], m1
1811
-
1812
-    add         r0, 4
1813
-    add         r2, 4
1814
-    sub         r4d, 2
1815
-    jz         .end
1816
+    pand        m2, m3
1817
+    movu        [r2], m1
1818
+    movu        [r2 + mmsize], m2
1819
 
1820
-.process1:
1821
-    movd        m1, [r0]
1822
-    psllw       m1, m0
1823
-    pand        m1, m3
1824
-    movd        r3, m1
1825
-    mov         [r2], r3w
1826
 .end:
1827
     RET
1828
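upShift_16 gets the same overlapped-tail rewrite; per pixel it is just a left shift masked back into the valid range. A one-line sketch, assuming mask is the (1 << bitDepth) - 1 value behind pw_pixel_max:

    #include <stdint.h>

    /* Per-pixel work of upShift_16 above: widen by 'shift' bits and keep
     * the result inside the valid pixel range (pand with pw_pixel_max). */
    static inline uint16_t upshift_px(uint16_t v, int shift, uint16_t mask)
    {
        return (uint16_t)((v << shift) & mask);
    }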
 
1829
@@ -7219,9 +8468,10 @@
1830
 ;-------------------------------------------------------------------------------------------------------------------------------------
1831
 ;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
1832
 ;-------------------------------------------------------------------------------------------------------------------------------------
1833
-; TODO: NO TEST CODE!
1834
 INIT_YMM avx2
1835
-cglobal upShift_16, 6,7,4
1836
+cglobal upShift_16, 4,7,4
1837
+    mov         r4d, r4m
1838
+    mov         r5d, r5m
1839
     movd        xm0, r6m        ; m0 = shift
1840
     vbroadcasti128 m3, [pw_pixel_max]
1841
     FIX_STRIDES r1d, r3d
1842
@@ -7248,83 +8498,33 @@
1843
     dec         r5d
1844
     jnz        .loopH
1845
 
1846
-; processing last row of every frame [To handle width which not a multiple of 32]
1847
-    mov         r6d, r4d
1848
-    and         r4d, 31
1849
-    shr         r6d, 5
1850
+    ; processing the last row of every frame [to handle widths that are not a multiple of 32]
1851
 
1852
 .loop32:
1853
-    movu        m1, [r0]
1854
-    movu        m2, [r0 + mmsize]
1855
+    movu        m1, [r0 + (r4 - mmsize) * 2]
1856
+    movu        m2, [r0 + (r4 - mmsize) * 2 + mmsize]
1857
     psllw       m1, xm0
1858
     psllw       m2, xm0
1859
     pand        m1, m3
1860
     pand        m2, m3
1861
-    movu        [r2], m1
1862
-    movu        [r2 + mmsize], m2
1863
+    movu        [r2 + (r4 - mmsize) * 2], m1
1864
+    movu        [r2 + (r4 - mmsize) * 2 + mmsize], m2
1865
 
1866
-    add         r0, 2*mmsize
1867
-    add         r2, 2*mmsize
1868
-    dec         r6d
1869
-    jnz        .loop32
1870
+    sub         r4d, mmsize
1871
+    jz         .end
1872
+    cmp         r4d, mmsize
1873
+    jge        .loop32
1874
 
1875
-    cmp         r4d, 16
1876
-    jl         .process8
1877
+    ; process partial pixels
1878
     movu        m1, [r0]
1879
+    movu        m2, [r0 + mmsize]
1880
     psllw       m1, xm0
1881
+    psllw       m2, xm0
1882
     pand        m1, m3
1883
+    pand        m2, m3
1884
     movu        [r2], m1
1885
+    movu        [r2 + mmsize], m2
1886
 
1887
-    add         r0, mmsize
1888
-    add         r2, mmsize
1889
-    sub         r4d, 16
1890
-    jz         .end
1891
-
1892
-.process8:
1893
-    cmp         r4d, 8
1894
-    jl         .process4
1895
-    movu        xm1, [r0]
1896
-    psllw       xm1, xm0
1897
-    pand        xm1, xm3
1898
-    movu        [r2], xm1
1899
-
1900
-    add         r0, 16
1901
-    add         r2, 16
1902
-    sub         r4d, 8
1903
-    jz         .end
1904
-
1905
-.process4:
1906
-    cmp         r4d, 4
1907
-    jl          .process2
1908
-    movq        xm1,[r0]
1909
-    psllw       xm1, xm0
1910
-    pand        xm1, xm3
1911
-    movq        [r2], xm1
1912
-
1913
-    add         r0, 8
1914
-    add         r2, 8
1915
-    sub         r4d, 4
1916
-    jz         .end
1917
-
1918
-.process2:
1919
-    cmp         r4d, 2
1920
-    jl         .process1
1921
-    movd        xm1, [r0]
1922
-    psllw       xm1, xm0
1923
-    pand        xm1, xm3
1924
-    movd        [r2], xm1
1925
-
1926
-    add         r0, 4
1927
-    add         r2, 4
1928
-    sub         r4d, 2
1929
-    jz         .end
1930
-
1931
-.process1:
1932
-    movd        xm1, [r0]
1933
-    psllw       xm1, xm0
1934
-    pand        xm1, xm3
1935
-    movd        r3d, xm1
1936
-    mov         [r2], r3w
1937
 .end:
1938
     RET
1939
 
1940
@@ -8725,16 +9925,272 @@
1941
     pabsd          xm1, xm1
1942
 %endmacro
1943
 
1944
+%macro PSY_COST_PP_8x8_MAIN12 0
1945
+    ; load source pixels
1946
+    lea             r4, [r1 * 3]
1947
+    pmovzxwd        m0, [r0]
1948
+    pmovzxwd        m1, [r0 + r1]
1949
+    pmovzxwd        m2, [r0 + r1 * 2]
1950
+    pmovzxwd        m3, [r0 + r4]
1951
+    lea             r5, [r0 + r1 * 4]
1952
+    pmovzxwd        m4, [r5]
1953
+    pmovzxwd        m5, [r5 + r1]
1954
+    pmovzxwd        m6, [r5 + r1 * 2]
1955
+    pmovzxwd        m7, [r5 + r4]
1956
+
1957
+    ; source SAD
1958
+    paddd           m8, m0, m1
1959
+    paddd           m8, m2
1960
+    paddd           m8, m3
1961
+    paddd           m8, m4
1962
+    paddd           m8, m5
1963
+    paddd           m8, m6
1964
+    paddd           m8, m7
1965
+
1966
+    vextracti128    xm9, m8, 1
1967
+    paddd           m8, m9              ; sad_8x8
1968
+    movhlps         xm9, xm8
1969
+    paddd           xm8, xm9
1970
+    pshuflw         xm9, xm8, 0Eh
1971
+    paddd           xm8, xm9
1972
+    psrld           m8, 2
1973
+
1974
+    ; source SA8D
1975
+    psubd           m9, m1, m0
1976
+    paddd           m0, m1
1977
+    psubd           m1, m3, m2
1978
+    paddd           m2, m3
1979
+    punpckhdq       m3, m0, m9
1980
+    punpckldq       m0, m9
1981
+    psubd           m9, m3, m0
1982
+    paddd           m0, m3
1983
+    punpckhdq       m3, m2, m1
1984
+    punpckldq       m2, m1
1985
+    psubd           m10, m3, m2
1986
+    paddd           m2, m3
1987
+    psubd           m3, m5, m4
1988
+    paddd           m4, m5
1989
+    psubd           m5, m7, m6
1990
+    paddd           m6, m7
1991
+    punpckhdq       m1, m4, m3
1992
+    punpckldq       m4, m3
1993
+    psubd           m7, m1, m4
1994
+    paddd           m4, m1
1995
+    punpckhdq       m3, m6, m5
1996
+    punpckldq       m6, m5
1997
+    psubd           m1, m3, m6
1998
+    paddd           m6, m3
1999
+    psubd           m3, m2, m0
2000
+    paddd           m0, m2
2001
+    psubd           m2, m10, m9
2002
+    paddd           m9, m10
2003
+    punpckhqdq      m5, m0, m3
2004
+    punpcklqdq      m0, m3
2005
+    psubd           m10, m5, m0
2006
+    paddd           m0, m5
2007
+    punpckhqdq      m3, m9, m2
2008
+    punpcklqdq      m9, m2
2009
+    psubd           m5, m3, m9
2010
+    paddd           m9, m3
2011
+    psubd           m3, m6, m4
2012
+    paddd           m4, m6
2013
+    psubd           m6, m1, m7
2014
+    paddd           m7, m1
2015
+    punpckhqdq      m2, m4, m3
2016
+    punpcklqdq      m4, m3
2017
+    psubd           m1, m2, m4
2018
+    paddd           m4, m2
2019
+    punpckhqdq      m3, m7, m6
2020
+    punpcklqdq      m7, m6
2021
+    psubd           m2, m3, m7
2022
+    paddd           m7, m3
2023
+    psubd           m3, m4, m0
2024
+    paddd           m0, m4
2025
+    psubd           m4, m1, m10
2026
+    paddd           m10, m1
2027
+    vinserti128     m6, m0, xm3, 1
2028
+    vperm2i128      m0, m0, m3, 00110001b
2029
+    pabsd           m0, m0
2030
+    pabsd           m6, m6
2031
+    pmaxsd          m0, m6
2032
+    vinserti128     m3, m10, xm4, 1
2033
+    vperm2i128      m10, m10, m4, 00110001b
2034
+    pabsd           m10, m10
2035
+    pabsd           m3, m3
2036
+    pmaxsd          m10, m3
2037
+    psubd           m3, m7, m9
2038
+    paddd           m9, m7
2039
+    psubd           m7, m2, m5
2040
+    paddd           m5, m2
2041
+    vinserti128     m4, m9, xm3, 1
2042
+    vperm2i128      m9, m9, m3, 00110001b
2043
+    pabsd           m9, m9
2044
+    pabsd           m4, m4
2045
+    pmaxsd          m9, m4
2046
+    vinserti128     m3, m5, xm7, 1
2047
+    vperm2i128      m5, m5, m7, 00110001b
2048
+    pabsd           m5, m5
2049
+    pabsd           m3, m3
2050
+    pmaxsd          m5, m3
2051
+    paddd           m0, m9
2052
+    paddd           m0, m10
2053
+    paddd           m0, m5
2054
+
2055
+    vextracti128    xm9, m0, 1
2056
+    paddd           m0, m9              ; horizontal sum for sa8d
2057
+    movhlps         xm9, xm0
2058
+    paddd           xm0, xm9
2059
+    pshuflw         xm9, xm0, 0Eh
2060
+    paddd           xm0, xm9
2061
+    paddd           m0, [pd_1]
2062
+    psrld           m0, 1               ; sa8d_8x8
2063
+    psubd           m11, m0, m8         ; sa8d_8x8 - sad_8x8
2064
+
2065
+    ; load recon pixels
2066
+    lea             r4, [r3 * 3]
2067
+    pmovzxwd        m0, [r2]
2068
+    pmovzxwd        m1, [r2 + r3]
2069
+    pmovzxwd        m2, [r2 + r3 * 2]
2070
+    pmovzxwd        m3, [r2 + r4]
2071
+    lea             r5, [r2 + r3 * 4]
2072
+    pmovzxwd        m4, [r5]
2073
+    pmovzxwd        m5, [r5 + r3]
2074
+    pmovzxwd        m6, [r5 + r3 * 2]
2075
+    pmovzxwd        m7, [r5 + r4]
2076
+
2077
+    ; recon SAD
2078
+    paddd           m8, m0, m1
2079
+    paddd           m8, m2
2080
+    paddd           m8, m3
2081
+    paddd           m8, m4
2082
+    paddd           m8, m5
2083
+    paddd           m8, m6
2084
+    paddd           m8, m7
2085
+
2086
+    vextracti128    xm9, m8, 1
2087
+    paddd           m8, m9              ; sad_8x8
2088
+    movhlps         xm9, xm8
2089
+    paddd           xm8, xm9
2090
+    pshuflw         xm9, xm8, 0Eh
2091
+    paddd           xm8, xm9
2092
+    psrld           m8, 2
2093
+
2094
+    ; recon SA8D
2095
+    psubd           m9, m1, m0
2096
+    paddd           m0, m1
2097
+    psubd           m1, m3, m2
2098
+    paddd           m2, m3
2099
+    punpckhdq       m3, m0, m9
2100
+    punpckldq       m0, m9
2101
+    psubd           m9, m3, m0
2102
+    paddd           m0, m3
2103
+    punpckhdq       m3, m2, m1
2104
+    punpckldq       m2, m1
2105
+    psubd           m10, m3, m2
2106
+    paddd           m2, m3
2107
+    psubd           m3, m5, m4
2108
+    paddd           m4, m5
2109
+    psubd           m5, m7, m6
2110
+    paddd           m6, m7
2111
+    punpckhdq       m1, m4, m3
2112
+    punpckldq       m4, m3
2113
+    psubd           m7, m1, m4
2114
+    paddd           m4, m1
2115
+    punpckhdq       m3, m6, m5
2116
+    punpckldq       m6, m5
2117
+    psubd           m1, m3, m6
2118
+    paddd           m6, m3
2119
+    psubd           m3, m2, m0
2120
+    paddd           m0, m2
2121
+    psubd           m2, m10, m9
2122
+    paddd           m9, m10
2123
+    punpckhqdq      m5, m0, m3
2124
+    punpcklqdq      m0, m3
2125
+    psubd           m10, m5, m0
2126
+    paddd           m0, m5
2127
+    punpckhqdq      m3, m9, m2
2128
+    punpcklqdq      m9, m2
2129
+    psubd           m5, m3, m9
2130
+    paddd           m9, m3
2131
+    psubd           m3, m6, m4
2132
+    paddd           m4, m6
2133
+    psubd           m6, m1, m7
2134
+    paddd           m7, m1
2135
+    punpckhqdq      m2, m4, m3
2136
+    punpcklqdq      m4, m3
2137
+    psubd           m1, m2, m4
2138
+    paddd           m4, m2
2139
+    punpckhqdq      m3, m7, m6
2140
+    punpcklqdq      m7, m6
2141
+    psubd           m2, m3, m7
2142
+    paddd           m7, m3
2143
+    psubd           m3, m4, m0
2144
+    paddd           m0, m4
2145
+    psubd           m4, m1, m10
2146
+    paddd           m10, m1
2147
+    vinserti128     m6, m0, xm3, 1
2148
+    vperm2i128      m0, m0, m3, 00110001b
2149
+    pabsd           m0, m0
2150
+    pabsd           m6, m6
2151
+    pmaxsd          m0, m6
2152
+    vinserti128     m3, m10, xm4, 1
2153
+    vperm2i128      m10, m10, m4, 00110001b
2154
+    pabsd           m10, m10
2155
+    pabsd           m3, m3
2156
+    pmaxsd          m10, m3
2157
+    psubd           m3, m7, m9
2158
+    paddd           m9, m7
2159
+    psubd           m7, m2, m5
2160
+    paddd           m5, m2
2161
+    vinserti128     m4, m9, xm3, 1
2162
+    vperm2i128      m9, m9, m3, 00110001b
2163
+    pabsd           m9, m9
2164
+    pabsd           m4, m4
2165
+    pmaxsd          m9, m4
2166
+    vinserti128     m3, m5, xm7, 1
2167
+    vperm2i128      m5, m5, m7, 00110001b
2168
+    pabsd           m5, m5
2169
+    pabsd           m3, m3
2170
+    pmaxsd          m5, m3
2171
+    paddd           m0, m9
2172
+    paddd           m0, m10
2173
+    paddd           m0, m5
2174
+
2175
+    vextracti128    xm9, m0, 1
2176
+    paddd           m0, m9              ; horizontal sum for sa8d
2177
+    movhlps         xm9, xm0
2178
+    paddd           xm0, xm9
2179
+    pshuflw         xm9, xm0, 0Eh
2180
+    paddd           xm0, xm9
2181
+    paddd           m0, [pd_1]
2182
+    psrld           m0, 1               ; sa8d_8x8
2183
+    psubd           m0, m8              ; sa8d_8x8 - sad_8x8
2184
+
2185
+    psubd          m11, m0
2186
+    pabsd          m11, m11
2187
+%endmacro
2188
+
2189
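The macro above computes, for both source and recon, a "texture energy" for the 8x8 block: the rounded Hadamard (SA8D) cost minus a quarter of the plain pixel sum (the part the comments call SAD), and leaves the absolute difference of the two energies in m11. A sketch with hypothetical helpers for the two sums (names are illustrative, not x265 API):

    #include <stdint.h>
    #include <stdlib.h>

    /* Hypothetical helpers mirroring the sums formed by the macro above:
     * block_sum  - plain sum of the 64 pixels ("source SAD" in the comments),
     * block_sa8d - 8x8 Hadamard cost of the block itself, rounded (h + 1) >> 1. */
    uint32_t block_sum(const uint16_t *pix, intptr_t stride);
    uint32_t block_sa8d(const uint16_t *pix, intptr_t stride);

    /* What PSY_COST_PP_8x8_MAIN12 computes: |energy(src) - energy(recon)|,
     * where energy = Hadamard cost minus a quarter of the pixel sum (the
     * DC part), so flat blocks score near zero. */
    int psy_cost_8x8_sketch(const uint16_t *src, intptr_t sstride,
                            const uint16_t *rec, intptr_t rstride)
    {
        int srcEnergy = (int)block_sa8d(src, sstride) - (int)(block_sum(src, sstride) >> 2);
        int recEnergy = (int)block_sa8d(rec, rstride) - (int)(block_sum(rec, rstride) >> 2);
        return abs(srcEnergy - recEnergy);   /* the final psubd/pabsd pair */
    }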
 %if ARCH_X86_64
2190
-%if HIGH_BIT_DEPTH
2191
+INIT_YMM avx2
2192
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
2193
+cglobal psyCost_pp_8x8, 4, 8, 12
2194
+    add             r1d, r1d
2195
+    add             r3d, r3d
2196
+    PSY_COST_PP_8x8_MAIN12
2197
+    movd           eax, xm11
2198
+    RET
2199
+%endif
2200
+
2201
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
2202
 cglobal psyCost_pp_8x8, 4, 8, 11
2203
     add            r1d, r1d
2204
     add            r3d, r3d
2205
     PSY_PP_8x8_AVX2
2206
     movd           eax, xm1
2207
     RET
2208
-%else ; !HIGH_BIT_DEPTH
2209
-INIT_YMM avx2
2210
+%endif
2211
+
2212
+%if BIT_DEPTH == 8
2213
 cglobal psyCost_pp_8x8, 4, 8, 13
2214
     lea             r4, [3 * r1]
2215
     lea             r7, [3 * r3]
2216
@@ -8746,9 +10202,35 @@
2217
     RET
2218
 %endif
2219
 %endif
2220
+
2221
 %if ARCH_X86_64
2222
 INIT_YMM avx2
2223
-%if HIGH_BIT_DEPTH
2224
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
2225
+cglobal psyCost_pp_16x16, 4, 10, 13
2226
+    add            r1d, r1d
2227
+    add            r3d, r3d
2228
+    pxor           m12, m12
2229
+
2230
+    mov            r8d, 2
2231
+.loopH:
2232
+    mov            r9d, 2
2233
+.loopW:
2234
+    PSY_COST_PP_8x8_MAIN12
2235
+
2236
+    paddd         xm12, xm11
2237
+    add             r0, 16
2238
+    add             r2, 16
2239
+    dec            r9d
2240
+    jnz            .loopW
2241
+    lea             r0, [r0 + r1 * 8 - 32]
2242
+    lea             r2, [r2 + r3 * 8 - 32]
2243
+    dec            r8d
2244
+    jnz            .loopH
2245
+    movd           eax, xm12
2246
+    RET
2247
+%endif
2248
+
2249
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
2250
 cglobal psyCost_pp_16x16, 4, 10, 12
2251
     add            r1d, r1d
2252
     add            r3d, r3d
2253
@@ -8771,7 +10253,9 @@
2254
     jnz            .loopH
2255
     movd           eax, xm11
2256
     RET
2257
-%else ; !HIGH_BIT_DEPTH
2258
+%endif
2259
+
2260
+%if BIT_DEPTH == 8
2261
 cglobal psyCost_pp_16x16, 4, 10, 14
2262
     lea             r4, [3 * r1]
2263
     lea             r7, [3 * r3]
2264
@@ -8797,9 +10281,35 @@
2265
     RET
2266
 %endif
2267
 %endif
2268
+
2269
 %if ARCH_X86_64
2270
 INIT_YMM avx2
2271
-%if HIGH_BIT_DEPTH
2272
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
2273
+cglobal psyCost_pp_32x32, 4, 10, 13
2274
+    add            r1d, r1d
2275
+    add            r3d, r3d
2276
+    pxor           m12, m12
2277
+
2278
+    mov            r8d, 4
2279
+.loopH:
2280
+    mov            r9d, 4
2281
+.loopW:
2282
+    PSY_COST_PP_8x8_MAIN12
2283
+
2284
+    paddd         xm12, xm11
2285
+    add             r0, 16
2286
+    add             r2, 16
2287
+    dec            r9d
2288
+    jnz            .loopW
2289
+    lea             r0, [r0 + r1 * 8 - 64]
2290
+    lea             r2, [r2 + r3 * 8 - 64]
2291
+    dec            r8d
2292
+    jnz            .loopH
2293
+    movd           eax, xm12
2294
+    RET
2295
+%endif
2296
+
2297
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
2298
 cglobal psyCost_pp_32x32, 4, 10, 12
2299
     add            r1d, r1d
2300
     add            r3d, r3d
2301
@@ -8822,7 +10332,9 @@
2302
     jnz            .loopH
2303
     movd           eax, xm11
2304
     RET
2305
-%else ; !HIGH_BIT_DEPTH
2306
+%endif
2307
+
2308
+%if BIT_DEPTH == 8
2309
 cglobal psyCost_pp_32x32, 4, 10, 14
2310
     lea             r4, [3 * r1]
2311
     lea             r7, [3 * r3]
2312
@@ -8848,9 +10360,35 @@
2313
     RET
2314
 %endif
2315
 %endif
2316
+
2317
 %if ARCH_X86_64
2318
 INIT_YMM avx2
2319
-%if HIGH_BIT_DEPTH
2320
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 12
2321
+cglobal psyCost_pp_64x64, 4, 10, 13
2322
+    add            r1d, r1d
2323
+    add            r3d, r3d
2324
+    pxor           m12, m12
2325
+
2326
+    mov            r8d, 8
2327
+.loopH:
2328
+    mov            r9d, 8
2329
+.loopW:
2330
+    PSY_COST_PP_8x8_MAIN12
2331
+
2332
+    paddd         xm12, xm11
2333
+    add             r0, 16
2334
+    add             r2, 16
2335
+    dec            r9d
2336
+    jnz            .loopW
2337
+    lea             r0, [r0 + r1 * 8 - 128]
2338
+    lea             r2, [r2 + r3 * 8 - 128]
2339
+    dec            r8d
2340
+    jnz            .loopH
2341
+    movd           eax, xm12
2342
+    RET
2343
+%endif
2344
+
2345
+%if HIGH_BIT_DEPTH && BIT_DEPTH == 10
2346
 cglobal psyCost_pp_64x64, 4, 10, 12
2347
     add            r1d, r1d
2348
     add            r3d, r3d
2349
@@ -8873,7 +10411,9 @@
2350
     jnz            .loopH
2351
     movd           eax, xm11
2352
     RET
2353
-%else ; !HIGH_BIT_DEPTH
2354
+%endif
2355
+
2356
+%if BIT_DEPTH == 8
2357
 cglobal psyCost_pp_64x64, 4, 10, 14
2358
     lea             r4, [3 * r1]
2359
     lea             r7, [3 * r3]
2360
@@ -12186,3 +13726,80 @@
2361
     movd            eax, xm6
2362
     RET
2363
 %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 1
2364
+
2365
+
2366
+;-------------------------------------------------------------------------------------------------------------------------------------
2367
+; pixel planeClipAndMax(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix)
2368
+;-------------------------------------------------------------------------------------------------------------------------------------
2369
+%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
2370
+INIT_YMM avx2
2371
+cglobal planeClipAndMax, 5,7,8
2372
+    movd            xm0, r5m
2373
+    vpbroadcastb    m0, xm0                 ; m0 = [min]
2374
+    vpbroadcastb    m1, r6m                 ; m1 = [max]
2375
+    pxor            m2, m2                  ; m2 = sumLuma
2376
+    pxor            m3, m3                  ; m3 = maxLumaLevel
2377
+    pxor            m4, m4                  ; m4 = zero
2378
+
2379
+    ; get mask for the partial (tail) pixels
2380
+    mov             r5d, r2d
2381
+    and             r2d, ~(mmsize - 1)
2382
+    sub             r5d, r2d
2383
+    lea             r6, [pb_movemask_32 + mmsize]
2384
+    sub             r6, r5
2385
+    movu            m5, [r6]                ; m5 = mask for the last partial columns
2386
+
2387
+.loopH:
2388
+    lea             r5d, [r2 - mmsize]
2389
+
2390
+.loopW:
2391
+    movu            m6, [r0 + r5]
2392
+    pmaxub          m6, m0
2393
+    pminub          m6, m1
2394
+    movu            [r0 + r5], m6           ; store back
2395
+    pmaxub          m3, m6                  ; update maxLumaLevel
2396
+    psadbw          m6, m4
2397
+    paddq           m2, m6
2398
+
2399
+    sub             r5d, mmsize
2400
+    jge            .loopW
2401
+
2402
+    ; partial pixels
2403
+    movu            m7, [r0 + r2]
2404
+    pmaxub          m6, m7, m0
2405
+    pminub          m6, m1
2406
+
2407
+    pand            m7, m5                  ; keep invalid/unchanged pixels
2408
+    pandn           m6, m5, m6              ; clear invalid pixels
2409
+    por             m7, m6                  ; combine valid & invalid pixels
2410
+    movu            [r0 + r2], m7           ; store back
2411
+    pmaxub          m3, m6                  ; update maxLumaLevel
2412
+    psadbw          m6, m4
2413
+    paddq           m2, m6
2414
+
2415
+.next:
2416
+    add             r0, r1
2417
+    dec             r3d
2418
+    jg             .loopH
2419
+
2420
+    ; sumLuma
2421
+    vextracti128    xm0, m2, 1
2422
+    paddq           xm0, xm2
2423
+    movhlps         xm1, xm0
2424
+    paddq           xm0, xm1
2425
+    movq            [r4], xm0
2426
+
2427
+    ; maxLumaLevel
2428
+    vextracti128    xm0, m3, 1
2429
+    pmaxub          xm0, xm3
2430
+    movhlps         xm3, xm0
2431
+    pmaxub          xm0, xm3
2432
+    pmovzxbw        xm0, xm0
2433
+    pxor            xm0, [pb_movemask + 16]
2434
+    phminposuw      xm0, xm0
2435
+
2436
+    movd            eax, xm0
2437
+    not             al
2438
+    movzx           eax, al
2439
+    RET
2440
+%endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
2441
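A scalar reference for the routine above, matching the signature in its comment: clamp each pixel into [minPix, maxPix] in place while accumulating the clamped sum and tracking the maximum (pixel is uint8_t here since the %if guards HIGH_BIT_DEPTH == 0). A sketch, not necessarily identical to x265's own C fallback:

    #include <stdint.h>

    typedef uint8_t pixel;   /* HIGH_BIT_DEPTH == 0 build */

    static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width, int height,
                                   uint64_t *outsum, const pixel minPix, const pixel maxPix)
    {
        pixel maxLumaLevel = 0;
        uint64_t sumLuma = 0;
        for (int r = 0; r < height; r++, src += stride)
            for (int c = 0; c < width; c++) {
                pixel v = src[c];
                v = v < minPix ? minPix : (v > maxPix ? maxPix : v);
                src[c] = v;                    /* clamp in place */
                if (v > maxLumaLevel)
                    maxLumaLevel = v;
                sumLuma += v;                  /* psadbw accumulation */
            }
        *outsum = sumLuma;
        return maxLumaLevel;
    }

The asm extracts the byte maximum with a trick: widen to words, XOR against a constant, run phminposuw (a minimum search), then undo the complement with "not al" - the minimum of the complements is the complement of the maximum.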
x265_1.8.tar.gz/source/common/x86/pixel-util.h -> x265_1.9.tar.gz/source/common/x86/pixel-util.h Changed
16
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -55,5 +56,6 @@
10
 int PFX(scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize));
11
 uint32_t PFX(findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]));
12
 uint32_t PFX(costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
13
+uint32_t PFX(costCoeffNxN_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
14
 
15
 #endif // ifndef X265_PIXEL_UTIL_H
16
x265_1.8.tar.gz/source/common/x86/pixel-util8.asm -> x265_1.9.tar.gz/source/common/x86/pixel-util8.asm Changed
706
 
1
@@ -49,6 +49,7 @@
2
 mask_ff:                times 16 db 0xff
3
                         times 16 db 0
4
 deinterleave_shuf:      times  2 db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
5
+interleave_shuf:        times  2 db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
6
 deinterleave_word_shuf: times  2 db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
7
 hmulw_16p:              times  8 dw 1
8
                         times  4 dw 1, -1
9
@@ -56,7 +57,7 @@
10
 SECTION .text
11
 
12
 cextern pw_1
13
-cextern pw_0_15
14
+cextern pw_0_7
15
 cextern pb_1
16
 cextern pb_128
17
 cextern pw_00ff
18
@@ -78,6 +79,7 @@
19
 cextern trans8_shuf
20
 cextern_naked private_prefix %+ _entropyStateBits
21
 cextern pb_movemask
22
+cextern pw_exp2_0_15
23
 
24
 ;-----------------------------------------------------------------------------
25
 ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
26
@@ -792,6 +794,7 @@
27
     pshufd      m6, m6, 0       ; m6 = add
28
     mov         r3d, r4d        ; r3 = numCoeff
29
     shr         r4d, 3
30
+    pxor        m4, m4
31
 
32
 .loop:
33
     pmovsxwd    m0, [r0]        ; m0 = level
34
@@ -810,13 +813,13 @@
35
     psignd      m3, m1
36
 
37
     packssdw    m2, m3
38
+    pabsw       m2, m2
39
 
40
     movu        [r2], m2
41
     add         r0, 16
42
     add         r1, 32
43
     add         r2, 16
44
 
45
-    pxor        m4, m4
46
     pcmpeqw     m2, m4
47
     psubw       m7, m2
48
 
49
@@ -862,9 +865,11 @@
50
     psignd      m2, m0
51
 
52
     packssdw    m1, m2
53
-    vpermq      m2, m1, q3120
54
+    pabsw       m1, m1
55
 
56
+    vpermq      m2, m1, q3120
57
     movu        [r2], m2
58
+
59
     add         r0, mmsize
60
     add         r1, mmsize * 2
61
     add         r2, mmsize
62
@@ -1560,7 +1565,7 @@
63
     movd        m0, r6d
64
     pshuflw     m0, m0, 0
65
     punpcklqdq  m0, m0
66
-    pcmpgtw     m0, [pw_0_15]
67
+    pcmpgtw     m0, [pw_0_7]
68
 
69
 .loopH:
70
     mov         r6d, r4d
71
@@ -1718,7 +1723,7 @@
72
     pshuflw                   m0, m0, 0
73
     punpcklqdq                m0, m0
74
     vinserti128               m0, m0, xm0, 1
75
-    pcmpgtw                   m0, [pw_0_15]
76
+    pcmpgtw                   m0, [pw_0_7]
77
 
78
 .loopH:
79
     mov                       r6d, r4d
80
@@ -6397,6 +6402,78 @@
81
     movd   edx, xm6
82
 %endif
83
     RET
84
+
85
+INIT_YMM avx2
86
+cglobal pixel_var_32x32, 2,4,7
87
+    VAR_START 0
88
+    mov             r2d, 16
89
+
90
+.loop:
91
+    pmovzxbw        m0, [r0]
92
+    pmovzxbw        m3, [r0 + 16]
93
+    pmovzxbw        m1, [r0 + r1]
94
+    pmovzxbw        m4, [r0 + r1 + 16]
95
+
96
+    lea             r0, [r0 + r1 * 2]
97
+
98
+    VAR_CORE
99
+
100
+    dec             r2d
101
+    jg              .loop
102
+
103
+    vextracti128   xm0, m5, 1
104
+    vextracti128   xm1, m6, 1
105
+    paddw          xm5, xm0
106
+    paddd          xm6, xm1
107
+    HADDW          xm5, xm2
108
+    HADDD          xm6, xm1
109
+
110
+%if ARCH_X86_64
111
+    punpckldq      xm5, xm6
112
+    movq           rax, xm5
113
+%else
114
+    movd           eax, xm5
115
+    movd           edx, xm6
116
+%endif
117
+    RET
118
+
119
+INIT_YMM avx2
120
+cglobal pixel_var_64x64, 2,4,7
121
+    VAR_START 0
122
+    mov             r2d, 64
123
+
124
+.loop:
125
+    pmovzxbw        m0, [r0]
126
+    pmovzxbw        m3, [r0 + 16]
127
+    pmovzxbw        m1, [r0 + mmsize]
128
+    pmovzxbw        m4, [r0 + mmsize + 16]
129
+
130
+    lea             r0, [r0 + r1]
131
+
132
+    VAR_CORE
133
+
134
+    dec             r2d
135
+    jg              .loop
136
+
137
+    pxor            m1, m1
138
+    punpcklwd       m0, m5, m1
139
+    punpckhwd       m5, m1
140
+    paddd           m5, m0
141
+    vextracti128   xm2, m5, 1
142
+    vextracti128   xm1, m6, 1
143
+    paddd          xm5, xm2
144
+    paddd          xm6, xm1
145
+    HADDD          xm5, xm2
146
+    HADDD          xm6, xm1
147
+
148
+%if ARCH_X86_64
149
+    punpckldq      xm5, xm6
150
+    movq           rax, xm5
151
+%else
152
+    movd           eax, xm5
153
+    movd           edx, xm6
154
+%endif
155
+    RET
156
 %endif ; !HIGH_BIT_DEPTH
157
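Both new var kernels return the two statistics packed in one value: the pixel sum in the low 32 bits and the sum of squares in the high 32 bits (punpckldq + movq rax on x86-64; eax/edx on 32-bit). A scalar sketch of the contract:

    #include <stdint.h>

    /* Return convention of pixel_var_NxN above: sum in the low dword,
     * sum of squares in the high dword of one 64-bit value. */
    uint64_t pixel_var_sketch(const uint8_t *pix, intptr_t stride, int w, int h)
    {
        uint32_t sum = 0, sqr = 0;
        for (int y = 0; y < h; y++, pix += stride)
            for (int x = 0; x < w; x++) {
                sum += pix[x];
                sqr += (uint32_t)pix[x] * pix[x];
            }
        return sum | ((uint64_t)sqr << 32);
    }

The caller derives the variance as sqr/N - (sum/N)^2.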
 
158
 %macro VAR2_END 3
159
@@ -6578,10 +6655,10 @@
160
 
161
 
162
 ;-----------------------------------------------------------------------------
163
-; uint32_t[last first] findPosFirstAndLast(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
164
+; uint32_t[sumSign last first] findPosFirstLast(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16], uint32_t *absSum)
165
 ;-----------------------------------------------------------------------------
166
 INIT_XMM ssse3
167
-cglobal findPosFirstLast, 3,3,3
168
+cglobal findPosFirstLast, 3,3,4
169
     ; convert stride to int16_t
170
     add         r1d, r1d
171
 
172
@@ -6593,10 +6670,22 @@
173
     movh        m1, [r0]
174
     movhps      m1, [r0 + r1]
175
     movh        m2, [r0 + r1 * 2]
176
-    lea         r1, [r1 * 3]
177
+    lea         r1d, [r1 * 3]
178
     movhps      m2, [r0 + r1]
179
+    pxor        m3, m1, m2
180
     packsswb    m1, m2
181
 
182
+    ; get absSum
183
+    movhlps     m2, m3
184
+    pxor        m3, m2
185
+    pshufd      m2, m3, q2301
186
+    pxor        m3, m2
187
+    movd        r0d, m3
188
+    mov         r2d, r0d
189
+    shr         r2d, 16
190
+    xor         r2d, r0d
191
+    shl         r2d, 31
192
+
193
     ; get non-zero mask
194
     pxor        m2, m2
195
     pcmpeqb     m1, m2
196
@@ -6609,319 +6698,10 @@
197
     not         r0d
198
     bsr         r1w, r0w
199
     bsf         eax, r0d    ; side effect: clear AH to Zero
200
-    shl         r1d, 16
201
-    or          eax, r1d
202
-    RET
203
-
204
-
205
-;void saoCuStatsE2_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
206
-;{
207
-;    X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
208
-;    X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
209
-;    int x, y;
210
-;    int32_t tmp_stats[SAO::NUM_EDGETYPE];
211
-;    int32_t tmp_count[SAO::NUM_EDGETYPE];
212
-;    memset(tmp_stats, 0, sizeof(tmp_stats));
213
-;    memset(tmp_count, 0, sizeof(tmp_count));
214
-;    for (y = 0; y < endY; y++)
215
-;    {
216
-;        upBufft[0] = signOf(rec[stride] - rec[-1]);
217
-;        for (x = 0; x < endX; x++)
218
-;        {
219
-;            int signDown = signOf2(rec[x], rec[x + stride + 1]);
220
-;            X265_CHECK(signDown == signOf(rec[x] - rec[x + stride + 1]), "signDown check failure\n");
221
-;            uint32_t edgeType = signDown + upBuff1[x] + 2;
222
-;            upBufft[x + 1] = (int8_t)(-signDown);
223
-;            tmp_stats[edgeType] += (fenc[x] - rec[x]);
224
-;            tmp_count[edgeType]++;
225
-;        }
226
-;        std::swap(upBuff1, upBufft);
227
-;        rec += stride;
228
-;        fenc += stride;
229
-;    }
230
-;    for (x = 0; x < SAO::NUM_EDGETYPE; x++)
231
-;    {
232
-;        stats[SAO::s_eoTable[x]] += tmp_stats[x];
233
-;        count[SAO::s_eoTable[x]] += tmp_count[x];
234
-;    }
235
-;}
236
-
237
-%if ARCH_X86_64
238
-; TODO: x64 only because I need temporary register r7,r8, easy portab to x86
239
-INIT_XMM sse4
240
-cglobal saoCuStatsE2, 5,9,8,0-32    ; Stack: 5 of stats and 5 of count
241
-    mov         r5d, r5m
242
-
243
-    ; clear internal temporary buffer
244
-    pxor        m0, m0
245
-    mova        [rsp], m0
246
-    mova        [rsp + mmsize], m0
247
-    mova        m0, [pb_128]
248
-    mova        m5, [pb_1]
249
-    mova        m6, [pb_2]
250
-
251
-.loopH:
252
-    ; TODO: merge into below SIMD
253
-    ; get upBuffX[0]
254
-    mov         r6b, [r1 + r2]
255
-    sub         r6b, [r1 -  1]
256
-    seta        r6b
257
-    setb        r7b
258
-    sub         r6b, r7b
259
-    mov         [r4], r6b
260
-
261
-    ; backup unavailable pixels
262
-    movh        m7, [r4 + r5 + 1]
263
-
264
-    mov         r6d, r5d
265
-.loopW:
266
-    movu        m1, [r1]
267
-    movu        m2, [r1 + r2 + 1]
268
-
269
-    ; signDown
270
-    pxor        m1, m0
271
-    pxor        m2, m0
272
-    pcmpgtb     m3, m1, m2
273
-    pand        m3, m5
274
-    pcmpgtb     m2, m1
275
-    por         m2, m3
276
-    pxor        m3, m3
277
-    psubb       m3, m2
278
-
279
-    ; edgeType
280
-    movu        m4, [r3]
281
-    paddb       m4, m6
282
-    paddb       m2, m4
283
-
284
-    ; update upBuff1
285
-    movu        [r4 + 1], m3
286
-
287
-    ; stats[edgeType]
288
-    pxor        m1, m0
289
-    movu        m3, [r0]
290
-    punpckhbw   m4, m3, m1
291
-    punpcklbw   m3, m1
292
-    pmaddubsw   m3, [hmul_16p + 16]
293
-    pmaddubsw   m4, [hmul_16p + 16]
294
-
295
-    ; 16 pixels
296
-%assign x 0
297
-%rep 16
298
-    pextrb      r7d, m2, x
299
-    inc    word [rsp + r7 * 2]
300
-
301
-  %if (x < 8)
302
-    pextrw      r8d, m3, (x % 8)
303
-  %else
304
-    pextrw      r8d, m4, (x % 8)
305
-  %endif
306
-    movsx       r8d, r8w
307
-    add         [rsp + 5 * 2 + r7 * 4], r8d
308
-
309
-    dec         r6d
310
-    jz         .next
311
-%assign x x+1
312
-%endrep
313
-
314
-    add         r0, 16
315
-    add         r1, 16
316
-    add         r3, 16
317
-    add         r4, 16
318
-    jmp         .loopW
319
-
320
-.next:
321
-    xchg        r3, r4
322
-
323
-    ; restore pointer upBuff1
324
-    mov         r6d, r5d
325
-    and         r6d, 15
326
-
327
-    ; move to next row
328
-    sub         r6, r5
329
-    add         r3, r6
330
-    add         r4, r6
331
-    add         r6, r2
332
-    add         r0, r6
333
-    add         r1, r6
334
-
335
-    ; restore unavailable pixels
336
-    movh        [r3 + r5 + 1], m7
337
-
338
-    dec    byte r6m
339
-    jg         .loopH
340
-
341
-    ; sum to global buffer
342
-    mov         r1, r7m
343
-    mov         r0, r8m
344
-
345
-    ; s_eoTable = {1,2,0,3,4}
346
-    movzx       r6d, word [rsp + 0 * 2]
347
-    add         [r0 + 1 * 4], r6d
348
-    movzx       r6d, word [rsp + 1 * 2]
349
-    add         [r0 + 2 * 4], r6d
350
-    movzx       r6d, word [rsp + 2 * 2]
351
-    add         [r0 + 0 * 4], r6d
352
-    movzx       r6d, word [rsp + 3 * 2]
353
-    add         [r0 + 3 * 4], r6d
354
-    movzx       r6d, word [rsp + 4 * 2]
355
-    add         [r0 + 4 * 4], r6d
356
-
357
-    mov         r6d, [rsp + 5 * 2 + 0 * 4]
358
-    add         [r1 + 1 * 4], r6d
359
-    mov         r6d, [rsp + 5 * 2 + 1 * 4]
360
-    add         [r1 + 2 * 4], r6d
361
-    mov         r6d, [rsp + 5 * 2 + 2 * 4]
362
-    add         [r1 + 0 * 4], r6d
363
-    mov         r6d, [rsp + 5 * 2 + 3 * 4]
364
-    add         [r1 + 3 * 4], r6d
365
-    mov         r6d, [rsp + 5 * 2 + 4 * 4]
366
-    add         [r1 + 4 * 4], r6d
367
-    RET
368
-%endif ; ARCH_X86_64
369
-
370
-
371
-;void saoStatE3(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
372
-;{
373
-;    memset(tmp_stats, 0, sizeof(tmp_stats));
374
-;    memset(tmp_count, 0, sizeof(tmp_count));
375
-;    for (y = startY; y < endY; y++)
376
-;    {
377
-;        for (x = startX; x < endX; x++)
378
-;        {
379
-;            int signDown = signOf2(rec[x], rec[x + stride - 1]);
380
-;            uint32_t edgeType = signDown + upBuff1[x] + 2;
381
-;            upBuff1[x - 1] = (int8_t)(-signDown);
382
-;            tmp_stats[edgeType] += (fenc[x] - rec[x]);
383
-;            tmp_count[edgeType]++;
384
-;        }
385
-;        upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
386
-;        rec += stride;
387
-;        fenc += stride;
388
-;    }
389
-;    for (x = 0; x < NUM_EDGETYPE; x++)
390
-;    {
391
-;        stats[s_eoTable[x]] += tmp_stats[x];
392
-;        count[s_eoTable[x]] += tmp_count[x];
393
-;    }
394
-;}
395
-
396
-%if ARCH_X86_64
397
-INIT_XMM sse4
398
-cglobal saoCuStatsE3, 4,9,8,0-32    ; Stack: 5 of stats and 5 of count
399
-    mov         r4d, r4m
400
-    mov         r5d, r5m
401
-
402
-    ; clear internal temporary buffer
403
-    pxor        m0, m0
404
-    mova        [rsp], m0
405
-    mova        [rsp + mmsize], m0
406
-    mova        m0, [pb_128]
407
-    mova        m5, [pb_1]
408
-    mova        m6, [pb_2]
409
-    movh        m7, [r3 + r4]
410
-
411
-.loopH:
412
-    mov         r6d, r4d
413
-
414
-.loopW:
415
-    movu        m1, [r1]
416
-    movu        m2, [r1 + r2 - 1]
417
-
418
-    ; signDown
419
-    pxor        m1, m0
420
-    pxor        m2, m0
421
-    pcmpgtb     m3, m1, m2
422
-    pand        m3, m5
423
-    pcmpgtb     m2, m1
424
-    por         m2, m3
425
-    pxor        m3, m3
426
-    psubb       m3, m2
427
-
428
-    ; edgeType
429
-    movu        m4, [r3]
430
-    paddb       m4, m6
431
-    paddb       m2, m4
432
-
433
-    ; update upBuff1
434
-    movu        [r3 - 1], m3
435
-
436
-    ; stats[edgeType]
437
-    pxor        m1, m0
438
-    movu        m3, [r0]
439
-    punpckhbw   m4, m3, m1
440
-    punpcklbw   m3, m1
441
-    pmaddubsw   m3, [hmul_16p + 16]
442
-    pmaddubsw   m4, [hmul_16p + 16]
443
-
444
-    ; 16 pixels
445
-%assign x 0
446
-%rep 16
447
-    pextrb      r7d, m2, x
448
-    inc    word [rsp + r7 * 2]
449
-
450
-  %if (x < 8)
451
-    pextrw      r8d, m3, (x % 8)
452
-  %else
453
-    pextrw      r8d, m4, (x % 8)
454
-  %endif
455
-    movsx       r8d, r8w
456
-    add         [rsp + 5 * 2 + r7 * 4], r8d
457
-
458
-    dec         r6d
459
-    jz         .next
460
-%assign x x+1
461
-%endrep
462
-
463
-    add         r0, 16
464
-    add         r1, 16
465
-    add         r3, 16
466
-    jmp         .loopW
467
-
468
-.next:
469
-    ; restore pointer upBuff1
470
-    mov         r6d, r4d
471
-    and         r6d, 15
472
-
473
-    ; move to next row
474
-    sub         r6, r4
475
-    add         r3, r6
476
-    add         r6, r2
477
-    add         r0, r6
478
-    add         r1, r6
479
-    dec         r5d
480
-    jg         .loopH
481
-
482
-    ; restore unavailable pixels
483
-    movh        [r3 + r4], m7
484
-
485
-    ; sum to global buffer
486
-    mov         r1, r6m
487
-    mov         r0, r7m
488
-
489
-    ; s_eoTable = {1,2,0,3,4}
490
-    movzx       r6d, word [rsp + 0 * 2]
491
-    add         [r0 + 1 * 4], r6d
492
-    movzx       r6d, word [rsp + 1 * 2]
493
-    add         [r0 + 2 * 4], r6d
494
-    movzx       r6d, word [rsp + 2 * 2]
495
-    add         [r0 + 0 * 4], r6d
496
-    movzx       r6d, word [rsp + 3 * 2]
497
-    add         [r0 + 3 * 4], r6d
498
-    movzx       r6d, word [rsp + 4 * 2]
499
-    add         [r0 + 4 * 4], r6d
500
-
501
-    mov         r6d, [rsp + 5 * 2 + 0 * 4]
502
-    add         [r1 + 1 * 4], r6d
503
-    mov         r6d, [rsp + 5 * 2 + 1 * 4]
504
-    add         [r1 + 2 * 4], r6d
505
-    mov         r6d, [rsp + 5 * 2 + 2 * 4]
506
-    add         [r1 + 0 * 4], r6d
507
-    mov         r6d, [rsp + 5 * 2 + 3 * 4]
508
-    add         [r1 + 3 * 4], r6d
509
-    mov         r6d, [rsp + 5 * 2 + 4 * 4]
510
-    add         [r1 + 4 * 4], r6d
511
+    shl         r1d, 8
512
+    or          eax, r2d    ; merge absSumSign
513
+    or          eax, r1d    ; merge lastNZPosInCG
514
     RET
515
-%endif ; ARCH_X86_64
516
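The rewritten findPosFirstLast now returns three packed fields: bits 7..0 carry the first nonzero scan position, bits 15..8 the last one, and bit 31 the parity of the coefficient sum (the xor-folding above), which the caller uses for sign-bit hiding. A sketch of the contract, with a hypothetical coef_at() accessor standing in for the pshufb gather against scanTbl:

    #include <stdint.h>

    /* Hypothetical accessor: the coefficient at scan position i of the
     * 4x4 group; the real code gathers this via the scan table. */
    int16_t coef_at(const int16_t *coeff, intptr_t trSize,
                    const uint16_t scanTbl[16], int i);

    /* Packed result built above. Assumes at least one nonzero
     * coefficient, as the caller guarantees. */
    static uint32_t findPosFirstLast_sketch(const int16_t *coeff, intptr_t trSize,
                                            const uint16_t scanTbl[16])
    {
        uint32_t first = 0, last = 0, sum = 0, seen = 0;
        for (uint32_t i = 0; i < 16; i++) {
            int16_t c = coef_at(coeff, trSize, scanTbl, (int)i);
            sum += (uint32_t)c;          /* only bit 0 (parity) is used */
            if (c != 0) {
                if (!seen) { first = i; seen = 1; }
                last = i;
            }
        }
        /* bit 31: sum parity | bits 15..8: last | bits 7..0: first */
        return ((sum & 1u) << 31) | (last << 8) | first;
    }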
 
517
 
518
 ; uint32_t costCoeffNxN(uint16_t *scan, coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, uint8_t *tabSigCtx, uint16_t scanFlagMask, uint8_t *baseCtx, int offset, int subPosBase)
519
@@ -6963,7 +6743,7 @@
520
 %if ARCH_X86_64
521
 ; uint32_t costCoeffNxN(uint16_t *scan, coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, uint8_t *tabSigCtx, uint16_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)
522
 INIT_XMM sse4
523
-cglobal costCoeffNxN, 6,11,5
524
+cglobal costCoeffNxN, 6,11,6
525
     add         r2d, r2d
526
 
527
     ; abs(coeff)
528
@@ -7096,6 +6876,177 @@
529
 %endif
530
     and         eax, 0xFFFFFF
531
     RET
532
+
533
+
534
+; uint32_t costCoeffNxN(uint16_t *scan, coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, uint8_t *tabSigCtx, uint16_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)
535
+INIT_YMM avx2,bmi2
536
+cglobal costCoeffNxN, 6,10,5
537
+    add             r2d, r2d
538
+
539
+    ; abs(coeff)
540
+    movq            xm1, [r1]
541
+    movhps          xm1, [r1 + r2]
542
+    movq            xm2, [r1 + r2 * 2]
543
+    lea             r2, [r2 * 3]
544
+    movhps          xm2, [r1 + r2]
545
+    vinserti128     m1, m1, xm2, 1
546
+    pabsw           m1, m1
547
+    ; r[1-2] free here
548
+
549
+    ; loading tabSigCtx
550
+    mova            xm2, [r4]
551
+    ; r[4] free here
552
+
553
+    ; WARNING: out-of-bounds read here!
554
+    ; loading scan table
555
+    mov             r2d, r8m
556
+    bzhi            r4d, r5d, r2d                   ; clear non-scan mask bits
557
+    mov             r6d, r2d
558
+    xor             r2d, 15
559
+    movu            m0, [r0 + r2 * 2]
560
+    packuswb        m0, m0
561
+    pxor            m0, [pb_15]
562
+    vpermq          m0, m0, q3120
563
+    add             r4d, r2d                        ; r4d = (scanPosSigOff == 15) -> (numNonZero == 0)
564
+    mov             r2d, r6d
565
+
566
+    ; reorder tabSigCtx (+offset)
567
+    pshufb          xm2, xm0
568
+    vpbroadcastb    xm3, r7m
569
+    paddb           xm2, xm3
570
+    ; r[0-1] free here
571
+
572
+    ; reorder coeff
573
+    pshufb          m1, [deinterleave_shuf]
574
+    vpermq          m1, m1, q3120
575
+    pshufb          m1, m0
576
+    vpermq          m1, m1, q3120
577
+    pshufb          m1, [interleave_shuf]
578
+    ; r[0-1], m[2-3] free here
579
+
580
+    ; sig mask
581
+    pxor            xm3, xm3
582
+    movd            xm4, r5d
583
+    vpbroadcastw    m4, xm4
584
+    pandn           m4, m4, [pw_exp2_0_15]
585
+    pcmpeqw         m4, m3
586
+
587
+    ; absCoeff[numNonZero] = tmpCoeff[blkPos]
588
+    ; [0-3]
589
+    movq            r0, xm4
590
+    movq            r1, xm1
591
+    pext            r6, r1, r0
592
+    mov       qword [r3], r6
593
+    popcnt          r0, r0
594
+    shr             r0, 3
595
+    add             r3, r0
596
+
597
+    ; [4-7]
598
+    pextrq          r0, xm4, 1
599
+    pextrq          r1, xm1, 1
600
+    pext            r6, r1, r0
601
+    mov       qword [r3], r6
602
+    popcnt          r0, r0
603
+    shr             r0, 3
604
+    add             r3, r0
605
+
606
+    ; [8-B]
607
+    vextracti128    xm4, m4, 1
608
+    movq            r0, xm4
609
+    vextracti128    xm1, m1, 1
610
+    movq            r1, xm1
611
+    pext            r6, r1, r0
612
+    mov       qword [r3], r6
613
+    popcnt          r0, r0
614
+    shr             r0, 3
615
+    add             r3, r0
616
+
617
+    ; [C-F]
618
+    pextrq          r0, xm4, 1
619
+    pextrq          r1, xm1, 1
620
+    pext            r6, r1, r0
621
+    mov       qword [r3], r6
622
+    ; r[0-1,3] free here
623
+
624
+    ; register mapping
625
+    ; m0 - Zigzag
626
+    ; m2 - sigCtx
627
+    ; r0 - x265_entropyStateBits
628
+    ; r1 - baseCtx
629
+    ; r2 - scanPosSigOff
630
+    ; r5 - scanFlagMask
631
+    ; r6 - sum
632
+    ; {r3,r4} - ctxSig[15-0]
633
+    ; r8m - (numNonZero != 0) || (subPosBase == 0)
634
+    lea             r0, [private_prefix %+ _entropyStateBits]
635
+    mov             r1, r6mp
636
+    xor             r6d, r6d
637
+    xor             r8d, r8d
638
+
639
+    test            r2d, r2d
640
+    jz             .idx_zero
641
+
642
+;   {
643
+;        const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset;
644
+;        ctxSig = cnt & posZeroMask;
645
+;        const uint32_t mstate = baseCtx[ctxSig];
646
+;        const uint32_t mps = mstate & 1;
647
+;        const uint32_t stateBits = x265_entropyStateBits[mstate ^ sig];
648
+;        uint32_t nextState = (stateBits >> 24) + mps;
649
+;        if ((mstate ^ sig) == 1)
650
+;            nextState = sig;
651
+;        baseCtx[ctxSig] = (uint8_t)nextState;
652
+;        sum += stateBits;
653
+;    }
654
+;    absCoeff[numNonZero] = tmpCoeff[blkPos];
655
+;    numNonZero += sig;
656
+;    scanPosSigOff--;
657
+.loop:
658
+    shr             r5d, 1
659
+    setc            r8b                             ; r8 = sig
660
+    movd            r7d, xm2                        ; r7 = ctxSig
661
+    movzx           r7d, r7b
662
+    psrldq          xm2, 1
663
+    movzx           r9d, byte [r1 + r7]             ; mstate = baseCtx[ctxSig]
664
+    mov             r3d, r9d
665
+    and             r3b, 1                          ; mps = mstate & 1
666
+    xor             r9d, r8d                        ; r9 = mstate ^ sig
667
+    add             r6d, [r0 + r9 * 4]              ; sum += entropyStateBits[mstate ^ sig]
668
+    add             r3b, byte [r0 + r9 * 4 + 3]     ; nextState = (stateBits >> 24) + mps
669
+    cmp             r9d, 1
670
+    cmove           r3d, r8d
671
+    mov        byte [r1 + r7], r3b
672
+
673
+    dec             r2d
674
+    jg             .loop
675
+
676
+.idx_zero:
677
+    xor             r2d, r2d
678
+    cmp        word r9m, 0
679
+    sete            r2b
680
+    add             r4d, r2d                        ; (numNonZero != 0) || (subPosBase == 0)
681
+    jz             .exit
682
+
683
+    dec             r2b
684
+    movd            r3d, xm2
685
+    and             r2d, r3d
686
+
687
+    movzx           r3d, byte [r1 + r2]             ; mstate = baseCtx[ctxSig]
688
+    mov             r4d, r5d
689
+    xor             r5d, r3d                        ; r5 = mstate ^ sig
690
+    and             r3b, 1                          ; mps = mstate & 1
691
+    add             r6d, [r0 + r5 * 4]              ; sum += x265_entropyStateBits[mstate ^ sig]
692
+    add             r3b, [r0 + r5 * 4 + 3]          ; nextState = (stateBits >> 24) + mps
693
+    cmp             r5b, 1
694
+    cmove           r3d, r4d
695
+    mov        byte [r1 + r2], r3b
696
+
697
+.exit:
698
+%ifnidn eax,r6d
699
+    mov             eax, r6d
700
+%endif
701
+    and             eax, 0xFFFFFF
702
+    RET
703
 %endif ; ARCH_X86_64
704
 
705
 
706
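
The commented C pseudocode inside the AVX2 costCoeffNxN body above describes the per-coefficient CABAC state update that the .loop section implements. For readers not fluent in the asm, here is a minimal scalar C sketch of the same update, assuming (as the asm comments state) that entropyStateBits[] packs the fractional bit cost in its low 24 bits and the next-state increment in its top byte; the ctxSig array stands in for the byte stream the asm shifts out of xm2, so the names are illustrative rather than the exact x265 internals.

    #include <stdint.h>

    /* Scalar sketch of the significance-flag cost loop in costCoeffNxN.
     * Assumptions: entropyStateBits[] packs cost (low 24 bits) and the
     * next-state increment (top byte); baseCtx[] holds context models
     * with the MPS in bit 0, matching the asm comments above. */
    static uint32_t costSigPass(const uint32_t *entropyStateBits,
                                uint8_t *baseCtx,
                                const uint8_t *ctxSig,  /* reordered tabSigCtx + offset */
                                uint32_t scanFlagMask,
                                int scanPosSigOff)
    {
        uint32_t sum = 0;
        for (int i = 0; i < scanPosSigOff; i++)
        {
            uint32_t sig = scanFlagMask & 1;          /* shr r5d, 1 / setc r8b */
            scanFlagMask >>= 1;
            uint32_t mstate = baseCtx[ctxSig[i]];     /* mstate = baseCtx[ctxSig] */
            uint32_t mps = mstate & 1;
            uint32_t stateBits = entropyStateBits[mstate ^ sig];
            uint32_t nextState = (stateBits >> 24) + mps;
            if ((mstate ^ sig) == 1)                  /* LPS from state 0 flips MPS */
                nextState = sig;
            baseCtx[ctxSig[i]] = (uint8_t)nextState;
            sum += stateBits;
        }
        return sum & 0xFFFFFF;                        /* and eax, 0xFFFFFF */
    }
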
x265_1.8.tar.gz/source/common/x86/pixel.h -> x265_1.9.tar.gz/source/common/x86/pixel.h Changed
40
 
1
@@ -2,10 +2,12 @@
2
  * pixel.h: x86 pixel metrics
3
  *****************************************************************************
4
  * Copyright (C) 2003-2013 x264 project
5
+ * Copyright (C) 2013-2015 x265 project
6
  *
7
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
8
  *          Loren Merritt <lorenm@u.washington.edu>
9
  *          Fiona Glaser <fiona@x264.com>
10
+ *          Min Chen <chenm003@163.com>
11
  *
12
  * This program is free software; you can redistribute it and/or modify
13
  * it under the terms of the GNU General Public License as published by
14
@@ -34,9 +36,10 @@
15
 void PFX(upShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
16
 void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
17
 void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
18
+pixel PFX(planeClipAndMax_avx2)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
19
 
20
 #define DECL_PIXELS(cpu) \
21
-    FUNCDEF_PU(uint32_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
22
+    FUNCDEF_PU(sse_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
23
     FUNCDEF_PU(int, pixel_sa8d, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
24
     FUNCDEF_PU(void, pixel_sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
25
     FUNCDEF_PU(void, pixel_sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
26
@@ -45,10 +48,10 @@
27
     FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
28
     FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
29
     FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
30
-    FUNCDEF_CHROMA_PU(uint32_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
31
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
32
     FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
33
-    FUNCDEF_CHROMA_PU(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
34
-    FUNCDEF_TU_S(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
35
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
36
+    FUNCDEF_TU_S(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
37
     FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \
38
     FUNCDEF_TU(int, psyCost_pp, cpu, const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); \
39
     FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
40
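
The uint32_t/int → sse_t changes in DECL_PIXELS above widen the return type of the SSD primitives. The motivation is accumulator range: squared differences grow quadratically with bit depth, and a 32-bit sum is only barely adequate at 10 bits. A back-of-the-envelope check, assuming sse_t maps to a 64-bit type in high-bit-depth builds (an assumption; the typedef itself is not shown in this hunk):

    #include <stdint.h>
    #include <stdio.h>

    /* Worst-case SSD of a single 64x64 block: maxDiff^2 * pixelCount.
     *  8-bit:  255^2  * 4096 =    266,342,400  -> fits in uint32_t
     * 10-bit: 1023^2  * 4096 =  4,286,582,784  -> barely fits (max 4,294,967,295)
     * 12-bit: 4095^2  * 4096 = 68,685,926,400  -> needs 64 bits */
    int main(void)
    {
        for (int depth = 8; depth <= 12; depth += 2)
        {
            uint64_t maxDiff = (1u << depth) - 1;
            uint64_t worst   = maxDiff * maxDiff * 64 * 64;
            printf("%2d-bit worst-case 64x64 SSD: %llu (%s)\n", depth,
                   (unsigned long long)worst,
                   worst > UINT32_MAX ? "overflows 32 bits" : "fits in 32 bits");
        }
        return 0;
    }
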
x265_1.8.tar.gz/source/common/x86/pixeladd8.asm -> x265_1.9.tar.gz/source/common/x86/pixeladd8.asm Changed
9
 
1
@@ -2,6 +2,7 @@
2
 ;* Copyright (C) 2013 x265 project
3
 ;*
4
 ;* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
5
+;*          Min Chen <chenm003@163.com>
6
 ;*
7
 ;* This program is free software; you can redistribute it and/or modify
8
 ;* it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/x86/sad-a.asm -> x265_1.9.tar.gz/source/common/x86/sad-a.asm Changed
1590
 
1
@@ -2,6 +2,7 @@
2
 ;* sad-a.asm: x86 sad functions
3
 ;*****************************************************************************
4
 ;* Copyright (C) 2003-2013 x264 project
5
+;* Copyright (C) 2013-2015 x265 project
6
 ;*
7
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8
 ;*          Fiona Glaser <fiona@x264.com>
9
@@ -3328,6 +3329,730 @@
10
     SAD_X4_END_SSE2 1
11
 %endmacro
12
 
13
+%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
14
+INIT_YMM avx2
15
+%macro SAD_X4_64x8_AVX2 0
16
+    movu            m4, [r0]
17
+    movu            m5, [r1]
18
+    movu            m6, [r2]
19
+    movu            m7, [r3]
20
+    movu            m8, [r4]
21
+
22
+    psadbw          m9, m4, m5
23
+    paddd           m0, m9
24
+    psadbw          m5, m4, m6
25
+    paddd           m1, m5
26
+    psadbw          m6, m4, m7
27
+    paddd           m2, m6
28
+    psadbw          m4, m8
29
+    paddd           m3, m4
30
+
31
+    movu            m4, [r0 + mmsize]
32
+    movu            m5, [r1 + mmsize]
33
+    movu            m6, [r2 + mmsize]
34
+    movu            m7, [r3 + mmsize]
35
+    movu            m8, [r4 + mmsize]
36
+
37
+    psadbw          m9, m4, m5
38
+    paddd           m0, m9
39
+    psadbw          m5, m4, m6
40
+    paddd           m1, m5
41
+    psadbw          m6, m4, m7
42
+    paddd           m2, m6
43
+    psadbw          m4, m8
44
+    paddd           m3, m4
45
+
46
+    movu            m4, [r0 + FENC_STRIDE]
47
+    movu            m5, [r1 + r5]
48
+    movu            m6, [r2 + r5]
49
+    movu            m7, [r3 + r5]
50
+    movu            m8, [r4 + r5]
51
+
52
+    psadbw          m9, m4, m5
53
+    paddd           m0, m9
54
+    psadbw          m5, m4, m6
55
+    paddd           m1, m5
56
+    psadbw          m6, m4, m7
57
+    paddd           m2, m6
58
+    psadbw          m4, m8
59
+    paddd           m3, m4
60
+
61
+    movu            m4, [r0 + FENC_STRIDE + mmsize]
62
+    movu            m5, [r1 + r5 + mmsize]
63
+    movu            m6, [r2 + r5 + mmsize]
64
+    movu            m7, [r3 + r5 + mmsize]
65
+    movu            m8, [r4 + r5 + mmsize]
66
+
67
+    psadbw          m9, m4, m5
68
+    paddd           m0, m9
69
+    psadbw          m5, m4, m6
70
+    paddd           m1, m5
71
+    psadbw          m6, m4, m7
72
+    paddd           m2, m6
73
+    psadbw          m4, m8
74
+    paddd           m3, m4
75
+
76
+    movu            m4, [r0 + FENC_STRIDE * 2]
77
+    movu            m5, [r1 + r5 * 2]
78
+    movu            m6, [r2 + r5 * 2]
79
+    movu            m7, [r3 + r5 * 2]
80
+    movu            m8, [r4 + r5 * 2]
81
+
82
+    psadbw          m9, m4, m5
83
+    paddd           m0, m9
84
+    psadbw          m5, m4, m6
85
+    paddd           m1, m5
86
+    psadbw          m6, m4, m7
87
+    paddd           m2, m6
88
+    psadbw          m4, m8
89
+    paddd           m3, m4
90
+
91
+    movu            m4, [r0 + FENC_STRIDE * 2 + mmsize]
92
+    movu            m5, [r1 + r5 * 2 + mmsize]
93
+    movu            m6, [r2 + r5 * 2 + mmsize]
94
+    movu            m7, [r3 + r5 * 2 + mmsize]
95
+    movu            m8, [r4 + r5 * 2 + mmsize]
96
+
97
+    psadbw          m9, m4, m5
98
+    paddd           m0, m9
99
+    psadbw          m5, m4, m6
100
+    paddd           m1, m5
101
+    psadbw          m6, m4, m7
102
+    paddd           m2, m6
103
+    psadbw          m4, m8
104
+    paddd           m3, m4
105
+
106
+    movu            m4, [r0 + FENC_STRIDE * 3]
107
+    movu            m5, [r1 + r7]
108
+    movu            m6, [r2 + r7]
109
+    movu            m7, [r3 + r7]
110
+    movu            m8, [r4 + r7]
111
+
112
+    psadbw          m9, m4, m5
113
+    paddd           m0, m9
114
+    psadbw          m5, m4, m6
115
+    paddd           m1, m5
116
+    psadbw          m6, m4, m7
117
+    paddd           m2, m6
118
+    psadbw          m4, m8
119
+    paddd           m3, m4
120
+
121
+    movu            m4, [r0 + FENC_STRIDE * 3 + mmsize]
122
+    movu            m5, [r1 + r7 + mmsize]
123
+    movu            m6, [r2 + r7 + mmsize]
124
+    movu            m7, [r3 + r7 + mmsize]
125
+    movu            m8, [r4 + r7 + mmsize]
126
+
127
+    psadbw          m9, m4, m5
128
+    paddd           m0, m9
129
+    psadbw          m5, m4, m6
130
+    paddd           m1, m5
131
+    psadbw          m6, m4, m7
132
+    paddd           m2, m6
133
+    psadbw          m4, m8
134
+    paddd           m3, m4
135
+
136
+    add             r0, FENC_STRIDE * 4
137
+    lea             r1, [r1 + r5 * 4]
138
+    lea             r2, [r2 + r5 * 4]
139
+    lea             r3, [r3 + r5 * 4]
140
+    lea             r4, [r4 + r5 * 4]
141
+
142
+    movu            m4, [r0]
143
+    movu            m5, [r1]
144
+    movu            m6, [r2]
145
+    movu            m7, [r3]
146
+    movu            m8, [r4]
147
+
148
+    psadbw          m9, m4, m5
149
+    paddd           m0, m9
150
+    psadbw          m5, m4, m6
151
+    paddd           m1, m5
152
+    psadbw          m6, m4, m7
153
+    paddd           m2, m6
154
+    psadbw          m4, m8
155
+    paddd           m3, m4
156
+
157
+    movu            m4, [r0 + mmsize]
158
+    movu            m5, [r1 + mmsize]
159
+    movu            m6, [r2 + mmsize]
160
+    movu            m7, [r3 + mmsize]
161
+    movu            m8, [r4 + mmsize]
162
+
163
+    psadbw          m9, m4, m5
164
+    paddd           m0, m9
165
+    psadbw          m5, m4, m6
166
+    paddd           m1, m5
167
+    psadbw          m6, m4, m7
168
+    paddd           m2, m6
169
+    psadbw          m4, m8
170
+    paddd           m3, m4
171
+
172
+    movu            m4, [r0 + FENC_STRIDE]
173
+    movu            m5, [r1 + r5]
174
+    movu            m6, [r2 + r5]
175
+    movu            m7, [r3 + r5]
176
+    movu            m8, [r4 + r5]
177
+
178
+    psadbw          m9, m4, m5
179
+    paddd           m0, m9
180
+    psadbw          m5, m4, m6
181
+    paddd           m1, m5
182
+    psadbw          m6, m4, m7
183
+    paddd           m2, m6
184
+    psadbw          m4, m8
185
+    paddd           m3, m4
186
+
187
+    movu            m4, [r0 + FENC_STRIDE + mmsize]
188
+    movu            m5, [r1 + r5 + mmsize]
189
+    movu            m6, [r2 + r5 + mmsize]
190
+    movu            m7, [r3 + r5 + mmsize]
191
+    movu            m8, [r4 + r5 + mmsize]
192
+
193
+    psadbw          m9, m4, m5
194
+    paddd           m0, m9
195
+    psadbw          m5, m4, m6
196
+    paddd           m1, m5
197
+    psadbw          m6, m4, m7
198
+    paddd           m2, m6
199
+    psadbw          m4, m8
200
+    paddd           m3, m4
201
+
202
+    movu            m4, [r0 + FENC_STRIDE * 2]
203
+    movu            m5, [r1 + r5 * 2]
204
+    movu            m6, [r2 + r5 * 2]
205
+    movu            m7, [r3 + r5 * 2]
206
+    movu            m8, [r4 + r5 * 2]
207
+
208
+    psadbw          m9, m4, m5
209
+    paddd           m0, m9
210
+    psadbw          m5, m4, m6
211
+    paddd           m1, m5
212
+    psadbw          m6, m4, m7
213
+    paddd           m2, m6
214
+    psadbw          m4, m8
215
+    paddd           m3, m4
216
+
217
+    movu            m4, [r0 + FENC_STRIDE * 2 + mmsize]
218
+    movu            m5, [r1 + r5 * 2 + mmsize]
219
+    movu            m6, [r2 + r5 * 2 + mmsize]
220
+    movu            m7, [r3 + r5 * 2 + mmsize]
221
+    movu            m8, [r4 + r5 * 2 + mmsize]
222
+
223
+    psadbw          m9, m4, m5
224
+    paddd           m0, m9
225
+    psadbw          m5, m4, m6
226
+    paddd           m1, m5
227
+    psadbw          m6, m4, m7
228
+    paddd           m2, m6
229
+    psadbw          m4, m8
230
+    paddd           m3, m4
231
+
232
+    movu            m4, [r0 + FENC_STRIDE * 3]
233
+    movu            m5, [r1 + r7]
234
+    movu            m6, [r2 + r7]
235
+    movu            m7, [r3 + r7]
236
+    movu            m8, [r4 + r7]
237
+
238
+    psadbw          m9, m4, m5
239
+    paddd           m0, m9
240
+    psadbw          m5, m4, m6
241
+    paddd           m1, m5
242
+    psadbw          m6, m4, m7
243
+    paddd           m2, m6
244
+    psadbw          m4, m8
245
+    paddd           m3, m4
246
+
247
+    movu            m4, [r0 + FENC_STRIDE * 3 + mmsize]
248
+    movu            m5, [r1 + r7 + mmsize]
249
+    movu            m6, [r2 + r7 + mmsize]
250
+    movu            m7, [r3 + r7 + mmsize]
251
+    movu            m8, [r4 + r7 + mmsize]
252
+
253
+    psadbw          m9, m4, m5
254
+    paddd           m0, m9
255
+    psadbw          m5, m4, m6
256
+    paddd           m1, m5
257
+    psadbw          m6, m4, m7
258
+    paddd           m2, m6
259
+    psadbw          m4, m8
260
+    paddd           m3, m4
261
+%endmacro
262
+
263
+%macro PIXEL_SAD_X4_END_AVX2 0
264
+    vextracti128   xm4, m0, 1
265
+    vextracti128   xm5, m1, 1
266
+    vextracti128   xm6, m2, 1
267
+    vextracti128   xm7, m3, 1
268
+    paddd           m0, m4
269
+    paddd           m1, m5
270
+    paddd           m2, m6
271
+    paddd           m3, m7
272
+    pshufd         xm4, xm0, 2
273
+    pshufd         xm5, xm1, 2
274
+    pshufd         xm6, xm2, 2
275
+    pshufd         xm7, xm3, 2
276
+    paddd           m0, m4
277
+    paddd           m1, m5
278
+    paddd           m2, m6
279
+    paddd           m3, m7
280
+
281
+    movd            [r6 + 0], xm0
282
+    movd            [r6 + 4], xm1
283
+    movd            [r6 + 8], xm2
284
+    movd            [r6 + 12], xm3
285
+%endmacro
286
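+
+; Each load/psadbw/paddd group in SAD_X4_64x8_AVX2 scores one 32-byte slice of
+; the encode row against four candidates: psadbw sums absolute byte differences
+; per 8-byte group into four 64-bit lanes, so paddd is safe because each partial
+; sum stays small. PIXEL_SAD_X4_END_AVX2 then folds the lanes down to four
+; scalars. The C intrinsics sketch below shows the same two steps for a single
+; row; it illustrates the pattern rather than the generated code (the asm keeps
+; accumulating across all rows before reducing once).
+;
+;    #include <immintrin.h>
+;    #include <stdint.h>
+;
+;    /* Horizontal reduction used by PIXEL_SAD_X4_END_AVX2:
+;     * vextracti128 + paddd, then pshufd imm=2 + paddd, then movd. */
+;    static inline uint32_t reduce_sad(__m256i acc)
+;    {
+;        __m128i lo = _mm256_castsi256_si128(acc);
+;        __m128i hi = _mm256_extracti128_si256(acc, 1);
+;        __m128i s  = _mm_add_epi32(lo, hi);
+;        s = _mm_add_epi32(s, _mm_shuffle_epi32(s, 2));
+;        return (uint32_t)_mm_cvtsi128_si32(s);
+;    }
+;
+;    /* One 32-byte row against four candidates (one psadbw per candidate). */
+;    static void sad_x4_row32(const uint8_t *fenc, const uint8_t *p1,
+;                             const uint8_t *p2, const uint8_t *p3,
+;                             const uint8_t *p4, uint32_t res[4])
+;    {
+;        __m256i f = _mm256_loadu_si256((const __m256i *)fenc);
+;        res[0] = reduce_sad(_mm256_sad_epu8(f, _mm256_loadu_si256((const __m256i *)p1)));
+;        res[1] = reduce_sad(_mm256_sad_epu8(f, _mm256_loadu_si256((const __m256i *)p2)));
+;        res[2] = reduce_sad(_mm256_sad_epu8(f, _mm256_loadu_si256((const __m256i *)p3)));
+;        res[3] = reduce_sad(_mm256_sad_epu8(f, _mm256_loadu_si256((const __m256i *)p4)));
+;    }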
+
287
+cglobal pixel_sad_x4_64x16, 7,8,10
288
+    pxor            m0, m0
289
+    pxor            m1, m1
290
+    pxor            m2, m2
291
+    pxor            m3, m3
292
+    lea             r7, [r5 * 3]
293
+
294
+    SAD_X4_64x8_AVX2
295
+
296
+    add             r0, FENC_STRIDE * 4
297
+    lea             r1, [r1 + r5 * 4]
298
+    lea             r2, [r2 + r5 * 4]
299
+    lea             r3, [r3 + r5 * 4]
300
+    lea             r4, [r4 + r5 * 4]
301
+
302
+    SAD_X4_64x8_AVX2
303
+    PIXEL_SAD_X4_END_AVX2
304
+    RET
305
+
306
+cglobal pixel_sad_x4_64x32, 7,8,10
307
+    pxor            m0, m0
308
+    pxor            m1, m1
309
+    pxor            m2, m2
310
+    pxor            m3, m3
311
+    lea             r7, [r5 * 3]
312
+
313
+    SAD_X4_64x8_AVX2
314
+
315
+    add             r0, FENC_STRIDE * 4
316
+    lea             r1, [r1 + r5 * 4]
317
+    lea             r2, [r2 + r5 * 4]
318
+    lea             r3, [r3 + r5 * 4]
319
+    lea             r4, [r4 + r5 * 4]
320
+
321
+    SAD_X4_64x8_AVX2
322
+
323
+    add             r0, FENC_STRIDE * 4
324
+    lea             r1, [r1 + r5 * 4]
325
+    lea             r2, [r2 + r5 * 4]
326
+    lea             r3, [r3 + r5 * 4]
327
+    lea             r4, [r4 + r5 * 4]
328
+
329
+    SAD_X4_64x8_AVX2
330
+
331
+    add             r0, FENC_STRIDE * 4
332
+    lea             r1, [r1 + r5 * 4]
333
+    lea             r2, [r2 + r5 * 4]
334
+    lea             r3, [r3 + r5 * 4]
335
+    lea             r4, [r4 + r5 * 4]
336
+
337
+    SAD_X4_64x8_AVX2
338
+    PIXEL_SAD_X4_END_AVX2
339
+    RET
340
+
341
+cglobal pixel_sad_x4_64x48, 7,8,10
342
+    pxor            m0, m0
343
+    pxor            m1, m1
344
+    pxor            m2, m2
345
+    pxor            m3, m3
346
+    lea             r7, [r5 * 3]
347
+
348
+    SAD_X4_64x8_AVX2
349
+
350
+    add             r0, FENC_STRIDE * 4
351
+    lea             r1, [r1 + r5 * 4]
352
+    lea             r2, [r2 + r5 * 4]
353
+    lea             r3, [r3 + r5 * 4]
354
+    lea             r4, [r4 + r5 * 4]
355
+
356
+    SAD_X4_64x8_AVX2
357
+
358
+    add             r0, FENC_STRIDE * 4
359
+    lea             r1, [r1 + r5 * 4]
360
+    lea             r2, [r2 + r5 * 4]
361
+    lea             r3, [r3 + r5 * 4]
362
+    lea             r4, [r4 + r5 * 4]
363
+
364
+    SAD_X4_64x8_AVX2
365
+
366
+    add             r0, FENC_STRIDE * 4
367
+    lea             r1, [r1 + r5 * 4]
368
+    lea             r2, [r2 + r5 * 4]
369
+    lea             r3, [r3 + r5 * 4]
370
+    lea             r4, [r4 + r5 * 4]
371
+
372
+    SAD_X4_64x8_AVX2
373
+
374
+    add             r0, FENC_STRIDE * 4
375
+    lea             r1, [r1 + r5 * 4]
376
+    lea             r2, [r2 + r5 * 4]
377
+    lea             r3, [r3 + r5 * 4]
378
+    lea             r4, [r4 + r5 * 4]
379
+
380
+    SAD_X4_64x8_AVX2
381
+
382
+    add             r0, FENC_STRIDE * 4
383
+    lea             r1, [r1 + r5 * 4]
384
+    lea             r2, [r2 + r5 * 4]
385
+    lea             r3, [r3 + r5 * 4]
386
+    lea             r4, [r4 + r5 * 4]
387
+
388
+    SAD_X4_64x8_AVX2
389
+    PIXEL_SAD_X4_END_AVX2
390
+    RET
391
+
392
+cglobal pixel_sad_x4_64x64, 7,8,10
393
+    pxor            m0, m0
394
+    pxor            m1, m1
395
+    pxor            m2, m2
396
+    pxor            m3, m3
397
+    lea             r7, [r5 * 3]
398
+
399
+    SAD_X4_64x8_AVX2
400
+
401
+    add             r0, FENC_STRIDE * 4
402
+    lea             r1, [r1 + r5 * 4]
403
+    lea             r2, [r2 + r5 * 4]
404
+    lea             r3, [r3 + r5 * 4]
405
+    lea             r4, [r4 + r5 * 4]
406
+
407
+    SAD_X4_64x8_AVX2
408
+
409
+    add             r0, FENC_STRIDE * 4
410
+    lea             r1, [r1 + r5 * 4]
411
+    lea             r2, [r2 + r5 * 4]
412
+    lea             r3, [r3 + r5 * 4]
413
+    lea             r4, [r4 + r5 * 4]
414
+
415
+    SAD_X4_64x8_AVX2
416
+
417
+    add             r0, FENC_STRIDE * 4
418
+    lea             r1, [r1 + r5 * 4]
419
+    lea             r2, [r2 + r5 * 4]
420
+    lea             r3, [r3 + r5 * 4]
421
+    lea             r4, [r4 + r5 * 4]
422
+
423
+    SAD_X4_64x8_AVX2
424
+
425
+    add             r0, FENC_STRIDE * 4
426
+    lea             r1, [r1 + r5 * 4]
427
+    lea             r2, [r2 + r5 * 4]
428
+    lea             r3, [r3 + r5 * 4]
429
+    lea             r4, [r4 + r5 * 4]
430
+
431
+    SAD_X4_64x8_AVX2
432
+
433
+    add             r0, FENC_STRIDE * 4
434
+    lea             r1, [r1 + r5 * 4]
435
+    lea             r2, [r2 + r5 * 4]
436
+    lea             r3, [r3 + r5 * 4]
437
+    lea             r4, [r4 + r5 * 4]
438
+
439
+    SAD_X4_64x8_AVX2
440
+
441
+    add             r0, FENC_STRIDE * 4
442
+    lea             r1, [r1 + r5 * 4]
443
+    lea             r2, [r2 + r5 * 4]
444
+    lea             r3, [r3 + r5 * 4]
445
+    lea             r4, [r4 + r5 * 4]
446
+
447
+    SAD_X4_64x8_AVX2
448
+
449
+    add             r0, FENC_STRIDE * 4
450
+    lea             r1, [r1 + r5 * 4]
451
+    lea             r2, [r2 + r5 * 4]
452
+    lea             r3, [r3 + r5 * 4]
453
+    lea             r4, [r4 + r5 * 4]
454
+
455
+    SAD_X4_64x8_AVX2
456
+    PIXEL_SAD_X4_END_AVX2
457
+    RET
458
+
459
+%macro SAD_X4_48x8_AVX2 0
460
+    movu            m4, [r0]
461
+    movu            m5, [r1]
462
+    movu            m6, [r2]
463
+    movu            m7, [r3]
464
+    movu            m8, [r4]
465
+
466
+    psadbw          m9, m4, m5
467
+    paddd           m0, m9
468
+    psadbw          m5, m4, m6
469
+    paddd           m1, m5
470
+    psadbw          m6, m4, m7
471
+    paddd           m2, m6
472
+    psadbw          m4, m8
473
+    paddd           m3, m4
474
+
475
+    movu            xm4, [r0 + mmsize]
476
+    movu            xm5, [r1 + mmsize]
477
+    movu            xm6, [r2 + mmsize]
478
+    movu            xm7, [r3 + mmsize]
479
+    movu            xm8, [r4 + mmsize]
480
+
481
+    vinserti128     m4, m4, [r0 + FENC_STRIDE], 1
482
+    vinserti128     m5, m5, [r1 + r5], 1
483
+    vinserti128     m6, m6, [r2 + r5], 1
484
+    vinserti128     m7, m7, [r3 + r5], 1
485
+    vinserti128     m8, m8, [r4 + r5], 1
486
+
487
+    psadbw          m9, m4, m5
488
+    paddd           m0, m9
489
+    psadbw          m5, m4, m6
490
+    paddd           m1, m5
491
+    psadbw          m6, m4, m7
492
+    paddd           m2, m6
493
+    psadbw          m4, m8
494
+    paddd           m3, m4
495
+
496
+    movu            m4, [r0 + FENC_STRIDE + mmsize/2]
497
+    movu            m5, [r1 + r5 + mmsize/2]
498
+    movu            m6, [r2 + r5 + mmsize/2]
499
+    movu            m7, [r3 + r5 + mmsize/2]
500
+    movu            m8, [r4 + r5 + mmsize/2]
501
+
502
+    psadbw          m9, m4, m5
503
+    paddd           m0, m9
504
+    psadbw          m5, m4, m6
505
+    paddd           m1, m5
506
+    psadbw          m6, m4, m7
507
+    paddd           m2, m6
508
+    psadbw          m4, m8
509
+    paddd           m3, m4
510
+
511
+    movu            m4, [r0 + FENC_STRIDE * 2]
512
+    movu            m5, [r1 + r5 * 2]
513
+    movu            m6, [r2 + r5 * 2]
514
+    movu            m7, [r3 + r5 * 2]
515
+    movu            m8, [r4 + r5 * 2]
516
+
517
+    psadbw          m9, m4, m5
518
+    paddd           m0, m9
519
+    psadbw          m5, m4, m6
520
+    paddd           m1, m5
521
+    psadbw          m6, m4, m7
522
+    paddd           m2, m6
523
+    psadbw          m4, m8
524
+    paddd           m3, m4
525
+
526
+    movu            xm4, [r0 + FENC_STRIDE * 2 + mmsize]
527
+    movu            xm5, [r1 + r5 * 2 + mmsize]
528
+    movu            xm6, [r2 + r5 * 2 + mmsize]
529
+    movu            xm7, [r3 + r5 * 2 + mmsize]
530
+    movu            xm8, [r4 + r5 * 2 + mmsize]
531
+    vinserti128     m4, m4, [r0 + FENC_STRIDE * 3], 1
532
+    vinserti128     m5, m5, [r1 + r7], 1
533
+    vinserti128     m6, m6, [r2 + r7], 1
534
+    vinserti128     m7, m7, [r3 + r7], 1
535
+    vinserti128     m8, m8, [r4 + r7], 1
536
+
537
+    psadbw          m9, m4, m5
538
+    paddd           m0, m9
539
+    psadbw          m5, m4, m6
540
+    paddd           m1, m5
541
+    psadbw          m6, m4, m7
542
+    paddd           m2, m6
543
+    psadbw          m4, m8
544
+    paddd           m3, m4
545
+
546
+    movu            m4, [r0 + FENC_STRIDE * 3 + mmsize/2]
547
+    movu            m5, [r1 + r7 + mmsize/2]
548
+    movu            m6, [r2 + r7 + mmsize/2]
549
+    movu            m7, [r3 + r7 + mmsize/2]
550
+    movu            m8, [r4 + r7 + mmsize/2]
551
+
552
+    psadbw          m9, m4, m5
553
+    paddd           m0, m9
554
+    psadbw          m5, m4, m6
555
+    paddd           m1, m5
556
+    psadbw          m6, m4, m7
557
+    paddd           m2, m6
558
+    psadbw          m4, m8
559
+    paddd           m3, m4
560
+
561
+    add             r0, FENC_STRIDE * 4
562
+    lea             r1, [r1 + r5 * 4]
563
+    lea             r2, [r2 + r5 * 4]
564
+    lea             r3, [r3 + r5 * 4]
565
+    lea             r4, [r4 + r5 * 4]
566
+
567
+    movu            m4, [r0]
568
+    movu            m5, [r1]
569
+    movu            m6, [r2]
570
+    movu            m7, [r3]
571
+    movu            m8, [r4]
572
+
573
+    psadbw          m9, m4, m5
574
+    paddd           m0, m9
575
+    psadbw          m5, m4, m6
576
+    paddd           m1, m5
577
+    psadbw          m6, m4, m7
578
+    paddd           m2, m6
579
+    psadbw          m4, m8
580
+    paddd           m3, m4
581
+
582
+    movu            xm4, [r0 + mmsize]
583
+    movu            xm5, [r1 + mmsize]
584
+    movu            xm6, [r2 + mmsize]
585
+    movu            xm7, [r3 + mmsize]
586
+    movu            xm8, [r4 + mmsize]
587
+    vinserti128     m4, m4, [r0 + FENC_STRIDE], 1
588
+    vinserti128     m5, m5, [r1 + r5], 1
589
+    vinserti128     m6, m6, [r2 + r5], 1
590
+    vinserti128     m7, m7, [r3 + r5], 1
591
+    vinserti128     m8, m8, [r4 + r5], 1
592
+
593
+    psadbw          m9, m4, m5
594
+    paddd           m0, m9
595
+    psadbw          m5, m4, m6
596
+    paddd           m1, m5
597
+    psadbw          m6, m4, m7
598
+    paddd           m2, m6
599
+    psadbw          m4, m8
600
+    paddd           m3, m4
601
+
602
+    movu            m4, [r0 + FENC_STRIDE + mmsize/2]
603
+    movu            m5, [r1 + r5 + mmsize/2]
604
+    movu            m6, [r2 + r5 + mmsize/2]
605
+    movu            m7, [r3 + r5 + mmsize/2]
606
+    movu            m8, [r4 + r5 + mmsize/2]
607
+
608
+    psadbw          m9, m4, m5
609
+    paddd           m0, m9
610
+    psadbw          m5, m4, m6
611
+    paddd           m1, m5
612
+    psadbw          m6, m4, m7
613
+    paddd           m2, m6
614
+    psadbw          m4, m8
615
+    paddd           m3, m4
616
+
617
+    movu            m4, [r0 + FENC_STRIDE * 2]
618
+    movu            m5, [r1 + r5 * 2]
619
+    movu            m6, [r2 + r5 * 2]
620
+    movu            m7, [r3 + r5 * 2]
621
+    movu            m8, [r4 + r5 * 2]
622
+
623
+    psadbw          m9, m4, m5
624
+    paddd           m0, m9
625
+    psadbw          m5, m4, m6
626
+    paddd           m1, m5
627
+    psadbw          m6, m4, m7
628
+    paddd           m2, m6
629
+    psadbw          m4, m8
630
+    paddd           m3, m4
631
+
632
+    movu            xm4, [r0 + FENC_STRIDE * 2 + mmsize]
633
+    movu            xm5, [r1 + r5 * 2 + mmsize]
634
+    movu            xm6, [r2 + r5 * 2 + mmsize]
635
+    movu            xm7, [r3 + r5 * 2 + mmsize]
636
+    movu            xm8, [r4 + r5 * 2 + mmsize]
637
+    vinserti128     m4, m4, [r0 + FENC_STRIDE * 3], 1
638
+    vinserti128     m5, m5, [r1 + r7], 1
639
+    vinserti128     m6, m6, [r2 + r7], 1
640
+    vinserti128     m7, m7, [r3 + r7], 1
641
+    vinserti128     m8, m8, [r4 + r7], 1
642
+
643
+    psadbw          m9, m4, m5
644
+    paddd           m0, m9
645
+    psadbw          m5, m4, m6
646
+    paddd           m1, m5
647
+    psadbw          m6, m4, m7
648
+    paddd           m2, m6
649
+    psadbw          m4, m8
650
+    paddd           m3, m4
651
+
652
+    movu            m4, [r0 + FENC_STRIDE * 3 + mmsize/2]
653
+    movu            m5, [r1 + r7 + mmsize/2]
654
+    movu            m6, [r2 + r7 + mmsize/2]
655
+    movu            m7, [r3 + r7 + mmsize/2]
656
+    movu            m8, [r4 + r7 + mmsize/2]
657
+
658
+    psadbw          m9, m4, m5
659
+    paddd           m0, m9
660
+    psadbw          m5, m4, m6
661
+    paddd           m1, m5
662
+    psadbw          m6, m4, m7
663
+    paddd           m2, m6
664
+    psadbw          m4, m8
665
+    paddd           m3, m4
666
+%endmacro
667
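+
+; A 48-byte row does not split into full 32-byte loads, so SAD_X4_48x8_AVX2
+; above bridges rows: it loads the 16-byte tail of one row into the low lane
+; and the 16-byte head of the next row into the high lane with vinserti128,
+; so every psadbw still operates on a full ymm. The remaining 32 bytes of the
+; second row are then covered by one unaligned load at offset mmsize/2, which
+; is the [r + stride + mmsize/2] addressing seen in the macro. A minimal C
+; intrinsics sketch of the bridge load (illustrative; unaligned loads as in
+; the asm):
+;
+;    #include <immintrin.h>
+;    #include <stddef.h>
+;    #include <stdint.h>
+;
+;    /* Mirror of "movu xm4, [r + mmsize] / vinserti128 m4, m4, [r + stride], 1":
+;     * low lane = bytes 32..47 of this row, high lane = bytes 0..15 of the next. */
+;    static inline __m256i load_48_bridge(const uint8_t *row, ptrdiff_t stride)
+;    {
+;        __m128i tail = _mm_loadu_si128((const __m128i *)(row + 32));
+;        __m128i head = _mm_loadu_si128((const __m128i *)(row + stride));
+;        return _mm256_inserti128_si256(_mm256_castsi128_si256(tail), head, 1);
+;    }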
+
668
+INIT_YMM avx2
669
+cglobal pixel_sad_x4_48x64, 7,8,10
670
+    pxor            m0, m0
671
+    pxor            m1, m1
672
+    pxor            m2, m2
673
+    pxor            m3, m3
674
+    lea             r7, [r5 * 3]
675
+
676
+    SAD_X4_48x8_AVX2
677
+
678
+    add             r0, FENC_STRIDE * 4
679
+    lea             r1, [r1 + r5 * 4]
680
+    lea             r2, [r2 + r5 * 4]
681
+    lea             r3, [r3 + r5 * 4]
682
+    lea             r4, [r4 + r5 * 4]
683
+
684
+    SAD_X4_48x8_AVX2
685
+
686
+    add             r0, FENC_STRIDE * 4
687
+    lea             r1, [r1 + r5 * 4]
688
+    lea             r2, [r2 + r5 * 4]
689
+    lea             r3, [r3 + r5 * 4]
690
+    lea             r4, [r4 + r5 * 4]
691
+
692
+    SAD_X4_48x8_AVX2
693
+
694
+    add             r0, FENC_STRIDE * 4
695
+    lea             r1, [r1 + r5 * 4]
696
+    lea             r2, [r2 + r5 * 4]
697
+    lea             r3, [r3 + r5 * 4]
698
+    lea             r4, [r4 + r5 * 4]
699
+
700
+    SAD_X4_48x8_AVX2
701
+
702
+    add             r0, FENC_STRIDE * 4
703
+    lea             r1, [r1 + r5 * 4]
704
+    lea             r2, [r2 + r5 * 4]
705
+    lea             r3, [r3 + r5 * 4]
706
+    lea             r4, [r4 + r5 * 4]
707
+
708
+    SAD_X4_48x8_AVX2
709
+
710
+    add             r0, FENC_STRIDE * 4
711
+    lea             r1, [r1 + r5 * 4]
712
+    lea             r2, [r2 + r5 * 4]
713
+    lea             r3, [r3 + r5 * 4]
714
+    lea             r4, [r4 + r5 * 4]
715
+
716
+    SAD_X4_48x8_AVX2
717
+
718
+    add             r0, FENC_STRIDE * 4
719
+    lea             r1, [r1 + r5 * 4]
720
+    lea             r2, [r2 + r5 * 4]
721
+    lea             r3, [r3 + r5 * 4]
722
+    lea             r4, [r4 + r5 * 4]
723
+
724
+    SAD_X4_48x8_AVX2
725
+
726
+    add             r0, FENC_STRIDE * 4
727
+    lea             r1, [r1 + r5 * 4]
728
+    lea             r2, [r2 + r5 * 4]
729
+    lea             r3, [r3 + r5 * 4]
730
+    lea             r4, [r4 + r5 * 4]
731
+
732
+    SAD_X4_48x8_AVX2
733
+    PIXEL_SAD_X4_END_AVX2
734
+    RET
735
+%endif
736
+
737
 INIT_XMM sse2
738
 SAD_X_SSE2 3, 16, 16, 7
739
 SAD_X_SSE2 3, 16,  8, 7
740
@@ -3949,6 +4674,849 @@
741
     movd            [r5 + 8], xm1
742
     RET
743
 
744
+%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
745
+INIT_YMM avx2
746
+%macro SAD_X3_32x8_AVX2 0
747
+    movu            m3, [r0]
748
+    movu            m4, [r1]
749
+    movu            m5, [r2]
750
+    movu            m6, [r3]
751
+
752
+    psadbw          m7, m3, m4
753
+    paddd           m0, m7
754
+    psadbw          m7, m3, m5
755
+    paddd           m1, m7
756
+    psadbw          m3, m6
757
+    paddd           m2, m3
758
+
759
+    movu            m3, [r0 + FENC_STRIDE]
760
+    movu            m4, [r1 + r4]
761
+    movu            m5, [r2 + r4]
762
+    movu            m6, [r3 + r4]
763
+
764
+    psadbw          m7, m3, m4
765
+    paddd           m0, m7
766
+    psadbw          m4, m3, m5
767
+    paddd           m1, m4
768
+    psadbw          m3, m6
769
+    paddd           m2, m3
770
+
771
+    movu            m3, [r0 + FENC_STRIDE * 2]
772
+    movu            m4, [r1 + r4 * 2]
773
+    movu            m5, [r2 + r4 * 2]
774
+    movu            m6, [r3 + r4 * 2]
775
+
776
+    psadbw          m7, m3, m4
777
+    paddd           m0, m7
778
+    psadbw          m4, m3, m5
779
+    paddd           m1, m4
780
+    psadbw          m3, m6
781
+    paddd           m2, m3
782
+
783
+    movu            m3, [r0 + FENC_STRIDE * 3]
784
+    movu            m4, [r1 + r6]
785
+    movu            m5, [r2 + r6]
786
+    movu            m6, [r3 + r6]
787
+
788
+    psadbw          m7, m3, m4
789
+    paddd           m0, m7
790
+    psadbw          m4, m3, m5
791
+    paddd           m1, m4
792
+    psadbw          m3, m6
793
+    paddd           m2, m3
794
+
795
+    add             r0, FENC_STRIDE * 4
796
+    lea             r1, [r1 + r4 * 4]
797
+    lea             r2, [r2 + r4 * 4]
798
+    lea             r3, [r3 + r4 * 4]
799
+
800
+    movu            m3, [r0]
801
+    movu            m4, [r1]
802
+    movu            m5, [r2]
803
+    movu            m6, [r3]
804
+
805
+    psadbw          m7, m3, m4
806
+    paddd           m0, m7
807
+    psadbw          m4, m3, m5
808
+    paddd           m1, m4
809
+    psadbw          m3, m6
810
+    paddd           m2, m3
811
+
812
+    movu            m3, [r0 + FENC_STRIDE]
813
+    movu            m4, [r1 + r4]
814
+    movu            m5, [r2 + r4]
815
+    movu            m6, [r3 + r4]
816
+
817
+    psadbw          m7, m3, m4
818
+    paddd           m0, m7
819
+    psadbw          m4, m3, m5
820
+    paddd           m1, m4
821
+    psadbw          m3, m6
822
+    paddd           m2, m3
823
+
824
+    movu            m3, [r0 + FENC_STRIDE * 2]
825
+    movu            m4, [r1 + r4 * 2]
826
+    movu            m5, [r2 + r4 * 2]
827
+    movu            m6, [r3 + r4 * 2]
828
+
829
+    psadbw          m7, m3, m4
830
+    paddd           m0, m7
831
+    psadbw          m4, m3, m5
832
+    paddd           m1, m4
833
+    psadbw          m3, m6
834
+    paddd           m2, m3
835
+
836
+    movu            m3, [r0 + FENC_STRIDE * 3]
837
+    movu            m4, [r1 + r6]
838
+    movu            m5, [r2 + r6]
839
+    movu            m6, [r3 + r6]
840
+
841
+    psadbw          m7, m3, m4
842
+    paddd           m0, m7
843
+    psadbw          m4, m3, m5
844
+    paddd           m1, m4
845
+    psadbw          m3, m6
846
+    paddd           m2, m3
847
+%endmacro
848
+
849
+%macro SAD_X3_64x8_AVX2 0
850
+    movu            m3, [r0]
851
+    movu            m4, [r1]
852
+    movu            m5, [r2]
853
+    movu            m6, [r3]
854
+
855
+    psadbw          m7, m3, m4
856
+    paddd           m0, m7
857
+    psadbw          m4, m3, m5
858
+    paddd           m1, m4
859
+    psadbw          m3, m6
860
+    paddd           m2, m3
861
+
862
+    movu            m3, [r0 + mmsize]
863
+    movu            m4, [r1 + mmsize]
864
+    movu            m5, [r2 + mmsize]
865
+    movu            m6, [r3 + mmsize]
866
+
867
+    psadbw          m7, m3, m4
868
+    paddd           m0, m7
869
+    psadbw          m4, m3, m5
870
+    paddd           m1, m4
871
+    psadbw          m3, m6
872
+    paddd           m2, m3
873
+
874
+    movu            m3, [r0 + FENC_STRIDE]
875
+    movu            m4, [r1 + r4]
876
+    movu            m5, [r2 + r4]
877
+    movu            m6, [r3 + r4]
878
+
879
+    psadbw          m7, m3, m4
880
+    paddd           m0, m7
881
+    psadbw          m4, m3, m5
882
+    paddd           m1, m4
883
+    psadbw          m3, m6
884
+    paddd           m2, m3
885
+
886
+    movu            m3, [r0 + FENC_STRIDE + mmsize]
887
+    movu            m4, [r1 + r4 + mmsize]
888
+    movu            m5, [r2 + r4 + mmsize]
889
+    movu            m6, [r3 + r4 + mmsize]
890
+
891
+    psadbw          m7, m3, m4
892
+    paddd           m0, m7
893
+    psadbw          m4, m3, m5
894
+    paddd           m1, m4
895
+    psadbw          m3, m6
896
+    paddd           m2, m3
897
+
898
+    movu            m3, [r0 + FENC_STRIDE * 2]
899
+    movu            m4, [r1 + r4 * 2]
900
+    movu            m5, [r2 + r4 * 2]
901
+    movu            m6, [r3 + r4 * 2]
902
+
903
+    psadbw          m7, m3, m4
904
+    paddd           m0, m7
905
+    psadbw          m4, m3, m5
906
+    paddd           m1, m4
907
+    psadbw          m3, m6
908
+    paddd           m2, m3
909
+
910
+    movu            m3, [r0 + FENC_STRIDE * 2 + mmsize]
911
+    movu            m4, [r1 + r4 * 2 + mmsize]
912
+    movu            m5, [r2 + r4 * 2 + mmsize]
913
+    movu            m6, [r3 + r4 * 2 + mmsize]
914
+
915
+    psadbw          m7, m3, m4
916
+    paddd           m0, m7
917
+    psadbw          m4, m3, m5
918
+    paddd           m1, m4
919
+    psadbw          m3, m6
920
+    paddd           m2, m3
921
+
922
+    movu            m3, [r0 + FENC_STRIDE * 3]
923
+    movu            m4, [r1 + r6]
924
+    movu            m5, [r2 + r6]
925
+    movu            m6, [r3 + r6]
926
+
927
+    psadbw          m7, m3, m4
928
+    paddd           m0, m7
929
+    psadbw          m4, m3, m5
930
+    paddd           m1, m4
931
+    psadbw          m3, m6
932
+    paddd           m2, m3
933
+
934
+    movu            m3, [r0 + FENC_STRIDE * 3 + mmsize]
935
+    movu            m4, [r1 + r6 + mmsize]
936
+    movu            m5, [r2 + r6 + mmsize]
937
+    movu            m6, [r3 + r6 + mmsize]
938
+
939
+    psadbw          m7, m3, m4
940
+    paddd           m0, m7
941
+    psadbw          m4, m3, m5
942
+    paddd           m1, m4
943
+    psadbw          m3, m6
944
+    paddd           m2, m3
945
+
946
+    add             r0, FENC_STRIDE * 4
947
+    lea             r1, [r1 + r4 * 4]
948
+    lea             r2, [r2 + r4 * 4]
949
+    lea             r3, [r3 + r4 * 4]
950
+
951
+    movu            m3, [r0]
952
+    movu            m4, [r1]
953
+    movu            m5, [r2]
954
+    movu            m6, [r3]
955
+
956
+    psadbw          m7, m3, m4
957
+    paddd           m0, m7
958
+    psadbw          m4, m3, m5
959
+    paddd           m1, m4
960
+    psadbw          m3, m6
961
+    paddd           m2, m3
962
+
963
+    movu            m3, [r0 + mmsize]
964
+    movu            m4, [r1 + mmsize]
965
+    movu            m5, [r2 + mmsize]
966
+    movu            m6, [r3 + mmsize]
967
+
968
+    psadbw          m7, m3, m4
969
+    paddd           m0, m7
970
+    psadbw          m4, m3, m5
971
+    paddd           m1, m4
972
+    psadbw          m3, m6
973
+    paddd           m2, m3
974
+
975
+    movu            m3, [r0 + FENC_STRIDE]
976
+    movu            m4, [r1 + r4]
977
+    movu            m5, [r2 + r4]
978
+    movu            m6, [r3 + r4]
979
+
980
+    psadbw          m7, m3, m4
981
+    paddd           m0, m7
982
+    psadbw          m4, m3, m5
983
+    paddd           m1, m4
984
+    psadbw          m3, m6
985
+    paddd           m2, m3
986
+
987
+    movu            m3, [r0 + FENC_STRIDE + mmsize]
988
+    movu            m4, [r1 + r4 + mmsize]
989
+    movu            m5, [r2 + r4 + mmsize]
990
+    movu            m6, [r3 + r4 + mmsize]
991
+
992
+    psadbw          m7, m3, m4
993
+    paddd           m0, m7
994
+    psadbw          m4, m3, m5
995
+    paddd           m1, m4
996
+    psadbw          m3, m6
997
+    paddd           m2, m3
998
+
999
+    movu            m3, [r0 + FENC_STRIDE * 2]
1000
+    movu            m4, [r1 + r4 * 2]
1001
+    movu            m5, [r2 + r4 * 2]
1002
+    movu            m6, [r3 + r4 * 2]
1003
+
1004
+    psadbw          m7, m3, m4
1005
+    paddd           m0, m7
1006
+    psadbw          m4, m3, m5
1007
+    paddd           m1, m4
1008
+    psadbw          m3, m6
1009
+    paddd           m2, m3
1010
+
1011
+    movu            m3, [r0 + FENC_STRIDE * 2 + mmsize]
1012
+    movu            m4, [r1 + r4 * 2 + mmsize]
1013
+    movu            m5, [r2 + r4 * 2 + mmsize]
1014
+    movu            m6, [r3 + r4 * 2 + mmsize]
1015
+
1016
+    psadbw          m7, m3, m4
1017
+    paddd           m0, m7
1018
+    psadbw          m4, m3, m5
1019
+    paddd           m1, m4
1020
+    psadbw          m3, m6
1021
+    paddd           m2, m3
1022
+
1023
+    movu            m3, [r0 + FENC_STRIDE * 3]
1024
+    movu            m4, [r1 + r6]
1025
+    movu            m5, [r2 + r6]
1026
+    movu            m6, [r3 + r6]
1027
+
1028
+    psadbw          m7, m3, m4
1029
+    paddd           m0, m7
1030
+    psadbw          m4, m3, m5
1031
+    paddd           m1, m4
1032
+    psadbw          m3, m6
1033
+    paddd           m2, m3
1034
+
1035
+    movu            m3, [r0 + FENC_STRIDE * 3 + mmsize]
1036
+    movu            m4, [r1 + r6 + mmsize]
1037
+    movu            m5, [r2 + r6 + mmsize]
1038
+    movu            m6, [r3 + r6 + mmsize]
1039
+
1040
+    psadbw          m7, m3, m4
1041
+    paddd           m0, m7
1042
+    psadbw          m4, m3, m5
1043
+    paddd           m1, m4
1044
+    psadbw          m3, m6
1045
+    paddd           m2, m3
1046
+%endmacro
1047
+
1048
+%macro SAD_X3_48x8_AVX2 0
1049
+    movu            m3, [r0]
1050
+    movu            m4, [r1]
1051
+    movu            m5, [r2]
1052
+    movu            m6, [r3]
1053
+
1054
+    psadbw          m7, m3, m4
1055
+    paddd           m0, m7
1056
+    psadbw          m4, m3, m5
1057
+    paddd           m1, m4
1058
+    psadbw          m3, m6
1059
+    paddd           m2, m3
1060
+
1061
+    movu            xm3, [r0 + mmsize]
1062
+    movu            xm4, [r1 + mmsize]
1063
+    movu            xm5, [r2 + mmsize]
1064
+    movu            xm6, [r3 + mmsize]
1065
+    vinserti128     m3, m3, [r0 + FENC_STRIDE], 1
1066
+    vinserti128     m4, m4, [r1 + r4], 1
1067
+    vinserti128     m5, m5, [r2 + r4], 1
1068
+    vinserti128     m6, m6, [r3 + r4], 1
1069
+
1070
+    psadbw          m7, m3, m4
1071
+    paddd           m0, m7
1072
+    psadbw          m4, m3, m5
1073
+    paddd           m1, m4
1074
+    psadbw          m3, m6
1075
+    paddd           m2, m3
1076
+
1077
+    movu            m3, [r0 + FENC_STRIDE + mmsize/2]
1078
+    movu            m4, [r1 + r4 + mmsize/2]
1079
+    movu            m5, [r2 + r4 + mmsize/2]
1080
+    movu            m6, [r3 + r4 + mmsize/2]
1081
+
1082
+    psadbw          m7, m3, m4
1083
+    paddd           m0, m7
1084
+    psadbw          m4, m3, m5
1085
+    paddd           m1, m4
1086
+    psadbw          m3, m6
1087
+    paddd           m2, m3
1088
+
1089
+    movu            m3, [r0 + FENC_STRIDE * 2]
1090
+    movu            m4, [r1 + r4 * 2]
1091
+    movu            m5, [r2 + r4 * 2]
1092
+    movu            m6, [r3 + r4 * 2]
1093
+
1094
+    psadbw          m7, m3, m4
1095
+    paddd           m0, m7
1096
+    psadbw          m4, m3, m5
1097
+    paddd           m1, m4
1098
+    psadbw          m3, m6
1099
+    paddd           m2, m3
1100
+
1101
+    movu            xm3, [r0 + FENC_STRIDE * 2 + mmsize]
1102
+    movu            xm4, [r1 + r4 * 2 + mmsize]
1103
+    movu            xm5, [r2 + r4 * 2 + mmsize]
1104
+    movu            xm6, [r3 + r4 * 2 + mmsize]
1105
+    vinserti128     m3, m3, [r0 + FENC_STRIDE * 3], 1
1106
+    vinserti128     m4, m4, [r1 + r6], 1
1107
+    vinserti128     m5, m5, [r2 + r6], 1
1108
+    vinserti128     m6, m6, [r3 + r6], 1
1109
+
1110
+    psadbw          m7, m3, m4
1111
+    paddd           m0, m7
1112
+    psadbw          m4, m3, m5
1113
+    paddd           m1, m4
1114
+    psadbw          m3, m6
1115
+    paddd           m2, m3
1116
+
1117
+    movu            m3, [r0 + FENC_STRIDE * 3 + mmsize/2]
1118
+    movu            m4, [r1 + r6 + mmsize/2]
1119
+    movu            m5, [r2 + r6 + mmsize/2]
1120
+    movu            m6, [r3 + r6 + mmsize/2]
1121
+
1122
+    psadbw          m7, m3, m4
1123
+    paddd           m0, m7
1124
+    psadbw          m4, m3, m5
1125
+    paddd           m1, m4
1126
+    psadbw          m3, m6
1127
+    paddd           m2, m3
1128
+
1129
+    add             r0, FENC_STRIDE * 4
1130
+    lea             r1, [r1 + r4 * 4]
1131
+    lea             r2, [r2 + r4 * 4]
1132
+    lea             r3, [r3 + r4 * 4]
1133
+
1134
+    movu            m3, [r0]
1135
+    movu            m4, [r1]
1136
+    movu            m5, [r2]
1137
+    movu            m6, [r3]
1138
+
1139
+    psadbw          m7, m3, m4
1140
+    paddd           m0, m7
1141
+    psadbw          m4, m3, m5
1142
+    paddd           m1, m4
1143
+    psadbw          m3, m6
1144
+    paddd           m2, m3
1145
+
1146
+    movu            xm3, [r0 + mmsize]
1147
+    movu            xm4, [r1 + mmsize]
1148
+    movu            xm5, [r2 + mmsize]
1149
+    movu            xm6, [r3 + mmsize]
1150
+    vinserti128     m3, m3, [r0 + FENC_STRIDE], 1
1151
+    vinserti128     m4, m4, [r1 + r4], 1
1152
+    vinserti128     m5, m5, [r2 + r4], 1
1153
+    vinserti128     m6, m6, [r3 + r4], 1
1154
+
1155
+    psadbw          m7, m3, m4
1156
+    paddd           m0, m7
1157
+    psadbw          m4, m3, m5
1158
+    paddd           m1, m4
1159
+    psadbw          m3, m6
1160
+    paddd           m2, m3
1161
+
1162
+    movu            m3, [r0 + FENC_STRIDE + mmsize/2]
1163
+    movu            m4, [r1 + r4 + mmsize/2]
1164
+    movu            m5, [r2 + r4 + mmsize/2]
1165
+    movu            m6, [r3 + r4 + mmsize/2]
1166
+
1167
+    psadbw          m7, m3, m4
1168
+    paddd           m0, m7
1169
+    psadbw          m4, m3, m5
1170
+    paddd           m1, m4
1171
+    psadbw          m3, m6
1172
+    paddd           m2, m3
1173
+
1174
+    movu            m3, [r0 + FENC_STRIDE * 2]
1175
+    movu            m4, [r1 + r4 * 2]
1176
+    movu            m5, [r2 + r4 * 2]
1177
+    movu            m6, [r3 + r4 * 2]
1178
+
1179
+    psadbw          m7, m3, m4
1180
+    paddd           m0, m7
1181
+    psadbw          m4, m3, m5
1182
+    paddd           m1, m4
1183
+    psadbw          m3, m6
1184
+    paddd           m2, m3
1185
+
1186
+    movu            xm3, [r0 + FENC_STRIDE * 2 + mmsize]
1187
+    movu            xm4, [r1 + r4 * 2 + mmsize]
1188
+    movu            xm5, [r2 + r4 * 2 + mmsize]
1189
+    movu            xm6, [r3 + r4 * 2 + mmsize]
1190
+    vinserti128     m3, m3, [r0 + FENC_STRIDE * 3], 1
1191
+    vinserti128     m4, m4, [r1 + r6], 1
1192
+    vinserti128     m5, m5, [r2 + r6], 1
1193
+    vinserti128     m6, m6, [r3 + r6], 1
1194
+
1195
+    psadbw          m7, m3, m4
1196
+    paddd           m0, m7
1197
+    psadbw          m4, m3, m5
1198
+    paddd           m1, m4
1199
+    psadbw          m3, m6
1200
+    paddd           m2, m3
1201
+
1202
+    movu            m3, [r0 + FENC_STRIDE * 3 + mmsize/2]
1203
+    movu            m4, [r1 + r6 + mmsize/2]
1204
+    movu            m5, [r2 + r6 + mmsize/2]
1205
+    movu            m6, [r3 + r6 + mmsize/2]
1206
+
1207
+    psadbw          m7, m3, m4
1208
+    paddd           m0, m7
1209
+    psadbw          m4, m3, m5
1210
+    paddd           m1, m4
1211
+    psadbw          m3, m6
1212
+    paddd           m2, m3
1213
+%endmacro
1214
+
1215
+%macro PIXEL_SAD_X3_END_AVX2 0
1216
+    vextracti128   xm3, m0, 1
1217
+    vextracti128   xm4, m1, 1
1218
+    vextracti128   xm5, m2, 1
1219
+    paddd           m0, m3
1220
+    paddd           m1, m4
1221
+    paddd           m2, m5
1222
+    pshufd         xm3, xm0, 2
1223
+    pshufd         xm4, xm1, 2
1224
+    pshufd         xm5, xm2, 2
1225
+    paddd           m0, m3
1226
+    paddd           m1, m4
1227
+    paddd           m2, m5
1228
+
1229
+    movd            [r5 + 0], xm0
1230
+    movd            [r5 + 4], xm1
1231
+    movd            [r5 + 8], xm2
1232
+%endmacro
1233
+
1234
+cglobal pixel_sad_x3_32x8, 6,7,8
1235
+    pxor            m0, m0
1236
+    pxor            m1, m1
1237
+    pxor            m2, m2
1238
+    lea             r6, [r4 * 3]
1239
+
1240
+    SAD_X3_32x8_AVX2
1241
+    PIXEL_SAD_X3_END_AVX2
1242
+    RET
1243
+
1244
+cglobal pixel_sad_x3_32x16, 6,7,8
1245
+    pxor            m0, m0
1246
+    pxor            m1, m1
1247
+    pxor            m2, m2
1248
+    lea             r6, [r4 * 3]
1249
+
1250
+    SAD_X3_32x8_AVX2
1251
+
1252
+    add             r0, FENC_STRIDE * 4
1253
+    lea             r1, [r1 + r4 * 4]
1254
+    lea             r2, [r2 + r4 * 4]
1255
+    lea             r3, [r3 + r4 * 4]
1256
+
1257
+    SAD_X3_32x8_AVX2
1258
+    PIXEL_SAD_X3_END_AVX2
1259
+    RET
1260
+
1261
+cglobal pixel_sad_x3_32x24, 6,7,8
1262
+    pxor            m0, m0
1263
+    pxor            m1, m1
1264
+    pxor            m2, m2
1265
+    lea             r6, [r4 * 3]
1266
+
1267
+    SAD_X3_32x8_AVX2
1268
+
1269
+    add             r0, FENC_STRIDE * 4
1270
+    lea             r1, [r1 + r4 * 4]
1271
+    lea             r2, [r2 + r4 * 4]
1272
+    lea             r3, [r3 + r4 * 4]
1273
+
1274
+    SAD_X3_32x8_AVX2
1275
+
1276
+    add             r0, FENC_STRIDE * 4
1277
+    lea             r1, [r1 + r4 * 4]
1278
+    lea             r2, [r2 + r4 * 4]
1279
+    lea             r3, [r3 + r4 * 4]
1280
+
1281
+    SAD_X3_32x8_AVX2
1282
+    PIXEL_SAD_X3_END_AVX2
1283
+    RET
1284
+
1285
+cglobal pixel_sad_x3_32x32, 6,7,8
1286
+    pxor            m0, m0
1287
+    pxor            m1, m1
1288
+    pxor            m2, m2
1289
+    lea             r6, [r4 * 3]
1290
+
1291
+    SAD_X3_32x8_AVX2
1292
+
1293
+    add             r0, FENC_STRIDE * 4
1294
+    lea             r1, [r1 + r4 * 4]
1295
+    lea             r2, [r2 + r4 * 4]
1296
+    lea             r3, [r3 + r4 * 4]
1297
+
1298
+    SAD_X3_32x8_AVX2
1299
+
1300
+    add             r0, FENC_STRIDE * 4
1301
+    lea             r1, [r1 + r4 * 4]
1302
+    lea             r2, [r2 + r4 * 4]
1303
+    lea             r3, [r3 + r4 * 4]
1304
+
1305
+    SAD_X3_32x8_AVX2
1306
+
1307
+    add             r0, FENC_STRIDE * 4
1308
+    lea             r1, [r1 + r4 * 4]
1309
+    lea             r2, [r2 + r4 * 4]
1310
+    lea             r3, [r3 + r4 * 4]
1311
+
1312
+    SAD_X3_32x8_AVX2
1313
+    PIXEL_SAD_X3_END_AVX2
1314
+    RET
1315
+
1316
+cglobal pixel_sad_x3_32x64, 6,7,8
1317
+    pxor            m0, m0
1318
+    pxor            m1, m1
1319
+    pxor            m2, m2
1320
+    lea             r6, [r4 * 3]
1321
+
1322
+    SAD_X3_32x8_AVX2
1323
+
1324
+    add             r0, FENC_STRIDE * 4
1325
+    lea             r1, [r1 + r4 * 4]
1326
+    lea             r2, [r2 + r4 * 4]
1327
+    lea             r3, [r3 + r4 * 4]
1328
+
1329
+    SAD_X3_32x8_AVX2
1330
+
1331
+    add             r0, FENC_STRIDE * 4
1332
+    lea             r1, [r1 + r4 * 4]
1333
+    lea             r2, [r2 + r4 * 4]
1334
+    lea             r3, [r3 + r4 * 4]
1335
+
1336
+    SAD_X3_32x8_AVX2
1337
+
1338
+    add             r0, FENC_STRIDE * 4
1339
+    lea             r1, [r1 + r4 * 4]
1340
+    lea             r2, [r2 + r4 * 4]
1341
+    lea             r3, [r3 + r4 * 4]
1342
+
1343
+    SAD_X3_32x8_AVX2
1344
+
1345
+    add             r0, FENC_STRIDE * 4
1346
+    lea             r1, [r1 + r4 * 4]
1347
+    lea             r2, [r2 + r4 * 4]
1348
+    lea             r3, [r3 + r4 * 4]
1349
+
1350
+    SAD_X3_32x8_AVX2
1351
+
1352
+    add             r0, FENC_STRIDE * 4
1353
+    lea             r1, [r1 + r4 * 4]
1354
+    lea             r2, [r2 + r4 * 4]
1355
+    lea             r3, [r3 + r4 * 4]
1356
+
1357
+    SAD_X3_32x8_AVX2
1358
+
1359
+    add             r0, FENC_STRIDE * 4
1360
+    lea             r1, [r1 + r4 * 4]
1361
+    lea             r2, [r2 + r4 * 4]
1362
+    lea             r3, [r3 + r4 * 4]
1363
+
1364
+    SAD_X3_32x8_AVX2
1365
+
1366
+    add             r0, FENC_STRIDE * 4
1367
+    lea             r1, [r1 + r4 * 4]
1368
+    lea             r2, [r2 + r4 * 4]
1369
+    lea             r3, [r3 + r4 * 4]
1370
+
1371
+    SAD_X3_32x8_AVX2
1372
+    PIXEL_SAD_X3_END_AVX2
1373
+    RET
1374
+
1375
+cglobal pixel_sad_x3_64x16, 6,7,8
1376
+    pxor            m0, m0
1377
+    pxor            m1, m1
1378
+    pxor            m2, m2
1379
+    lea             r6, [r4 * 3]
1380
+
1381
+    SAD_X3_64x8_AVX2
1382
+
1383
+    add             r0, FENC_STRIDE * 4
1384
+    lea             r1, [r1 + r4 * 4]
1385
+    lea             r2, [r2 + r4 * 4]
1386
+    lea             r3, [r3 + r4 * 4]
1387
+
1388
+    SAD_X3_64x8_AVX2
1389
+    PIXEL_SAD_X3_END_AVX2
1390
+    RET
1391
+
1392
+cglobal pixel_sad_x3_64x32, 6,7,8
1393
+    pxor            m0, m0
1394
+    pxor            m1, m1
1395
+    pxor            m2, m2
1396
+    lea             r6, [r4 * 3]
1397
+
1398
+    SAD_X3_64x8_AVX2
1399
+
1400
+    add             r0, FENC_STRIDE * 4
1401
+    lea             r1, [r1 + r4 * 4]
1402
+    lea             r2, [r2 + r4 * 4]
1403
+    lea             r3, [r3 + r4 * 4]
1404
+
1405
+    SAD_X3_64x8_AVX2
1406
+
1407
+    add             r0, FENC_STRIDE * 4
1408
+    lea             r1, [r1 + r4 * 4]
1409
+    lea             r2, [r2 + r4 * 4]
1410
+    lea             r3, [r3 + r4 * 4]
1411
+
1412
+    SAD_X3_64x8_AVX2
1413
+
1414
+    add             r0, FENC_STRIDE * 4
1415
+    lea             r1, [r1 + r4 * 4]
1416
+    lea             r2, [r2 + r4 * 4]
1417
+    lea             r3, [r3 + r4 * 4]
1418
+
1419
+    SAD_X3_64x8_AVX2
1420
+    PIXEL_SAD_X3_END_AVX2
1421
+    RET
1422
+
1423
+cglobal pixel_sad_x3_64x48, 6,7,8
1424
+    pxor            m0, m0
1425
+    pxor            m1, m1
1426
+    pxor            m2, m2
1427
+    lea             r6, [r4 * 3]
1428
+
1429
+    SAD_X3_64x8_AVX2
1430
+
1431
+    add             r0, FENC_STRIDE * 4
1432
+    lea             r1, [r1 + r4 * 4]
1433
+    lea             r2, [r2 + r4 * 4]
1434
+    lea             r3, [r3 + r4 * 4]
1435
+
1436
+    SAD_X3_64x8_AVX2
1437
+
1438
+    add             r0, FENC_STRIDE * 4
1439
+    lea             r1, [r1 + r4 * 4]
1440
+    lea             r2, [r2 + r4 * 4]
1441
+    lea             r3, [r3 + r4 * 4]
1442
+
1443
+    SAD_X3_64x8_AVX2
1444
+
1445
+    add             r0, FENC_STRIDE * 4
1446
+    lea             r1, [r1 + r4 * 4]
1447
+    lea             r2, [r2 + r4 * 4]
1448
+    lea             r3, [r3 + r4 * 4]
1449
+
1450
+    SAD_X3_64x8_AVX2
1451
+
1452
+    add             r0, FENC_STRIDE * 4
1453
+    lea             r1, [r1 + r4 * 4]
1454
+    lea             r2, [r2 + r4 * 4]
1455
+    lea             r3, [r3 + r4 * 4]
1456
+
1457
+    SAD_X3_64x8_AVX2
1458
+
1459
+    add             r0, FENC_STRIDE * 4
1460
+    lea             r1, [r1 + r4 * 4]
1461
+    lea             r2, [r2 + r4 * 4]
1462
+    lea             r3, [r3 + r4 * 4]
1463
+
1464
+    SAD_X3_64x8_AVX2
1465
+    PIXEL_SAD_X3_END_AVX2
1466
+    RET
1467
+
1468
+cglobal pixel_sad_x3_64x64, 6,7,8
1469
+    pxor            m0, m0
1470
+    pxor            m1, m1
1471
+    pxor            m2, m2
1472
+    lea             r6, [r4 * 3]
1473
+
1474
+    SAD_X3_64x8_AVX2
1475
+
1476
+    add             r0, FENC_STRIDE * 4
1477
+    lea             r1, [r1 + r4 * 4]
1478
+    lea             r2, [r2 + r4 * 4]
1479
+    lea             r3, [r3 + r4 * 4]
1480
+
1481
+    SAD_X3_64x8_AVX2
1482
+
1483
+    add             r0, FENC_STRIDE * 4
1484
+    lea             r1, [r1 + r4 * 4]
1485
+    lea             r2, [r2 + r4 * 4]
1486
+    lea             r3, [r3 + r4 * 4]
1487
+
1488
+    SAD_X3_64x8_AVX2
1489
+
1490
+    add             r0, FENC_STRIDE * 4
1491
+    lea             r1, [r1 + r4 * 4]
1492
+    lea             r2, [r2 + r4 * 4]
1493
+    lea             r3, [r3 + r4 * 4]
1494
+
1495
+    SAD_X3_64x8_AVX2
1496
+
1497
+    add             r0, FENC_STRIDE * 4
1498
+    lea             r1, [r1 + r4 * 4]
1499
+    lea             r2, [r2 + r4 * 4]
1500
+    lea             r3, [r3 + r4 * 4]
1501
+
1502
+    SAD_X3_64x8_AVX2
1503
+
1504
+    add             r0, FENC_STRIDE * 4
1505
+    lea             r1, [r1 + r4 * 4]
1506
+    lea             r2, [r2 + r4 * 4]
1507
+    lea             r3, [r3 + r4 * 4]
1508
+
1509
+    SAD_X3_64x8_AVX2
1510
+
1511
+    add             r0, FENC_STRIDE * 4
1512
+    lea             r1, [r1 + r4 * 4]
1513
+    lea             r2, [r2 + r4 * 4]
1514
+    lea             r3, [r3 + r4 * 4]
1515
+
1516
+    SAD_X3_64x8_AVX2
1517
+
1518
+    add             r0, FENC_STRIDE * 4
1519
+    lea             r1, [r1 + r4 * 4]
1520
+    lea             r2, [r2 + r4 * 4]
1521
+    lea             r3, [r3 + r4 * 4]
1522
+
1523
+    SAD_X3_64x8_AVX2
1524
+    PIXEL_SAD_X3_END_AVX2
1525
+    RET
1526
+
1527
+cglobal pixel_sad_x3_48x64, 6,7,8
1528
+    pxor            m0, m0
1529
+    pxor            m1, m1
1530
+    pxor            m2, m2
1531
+    lea             r6, [r4 * 3]
1532
+
1533
+    SAD_X3_48x8_AVX2
1534
+
1535
+    add             r0, FENC_STRIDE * 4
1536
+    lea             r1, [r1 + r4 * 4]
1537
+    lea             r2, [r2 + r4 * 4]
1538
+    lea             r3, [r3 + r4 * 4]
1539
+
1540
+    SAD_X3_48x8_AVX2
1541
+
1542
+    add             r0, FENC_STRIDE * 4
1543
+    lea             r1, [r1 + r4 * 4]
1544
+    lea             r2, [r2 + r4 * 4]
1545
+    lea             r3, [r3 + r4 * 4]
1546
+
1547
+    SAD_X3_48x8_AVX2
1548
+
1549
+    add             r0, FENC_STRIDE * 4
1550
+    lea             r1, [r1 + r4 * 4]
1551
+    lea             r2, [r2 + r4 * 4]
1552
+    lea             r3, [r3 + r4 * 4]
1553
+
1554
+    SAD_X3_48x8_AVX2
1555
+
1556
+    add             r0, FENC_STRIDE * 4
1557
+    lea             r1, [r1 + r4 * 4]
1558
+    lea             r2, [r2 + r4 * 4]
1559
+    lea             r3, [r3 + r4 * 4]
1560
+
1561
+    SAD_X3_48x8_AVX2
1562
+
1563
+    add             r0, FENC_STRIDE * 4
1564
+    lea             r1, [r1 + r4 * 4]
1565
+    lea             r2, [r2 + r4 * 4]
1566
+    lea             r3, [r3 + r4 * 4]
1567
+
1568
+    SAD_X3_48x8_AVX2
1569
+
1570
+    add             r0, FENC_STRIDE * 4
1571
+    lea             r1, [r1 + r4 * 4]
1572
+    lea             r2, [r2 + r4 * 4]
1573
+    lea             r3, [r3 + r4 * 4]
1574
+
1575
+    SAD_X3_48x8_AVX2
1576
+
1577
+    add             r0, FENC_STRIDE * 4
1578
+    lea             r1, [r1 + r4 * 4]
1579
+    lea             r2, [r2 + r4 * 4]
1580
+    lea             r3, [r3 + r4 * 4]
1581
+
1582
+    SAD_X3_48x8_AVX2
1583
+    PIXEL_SAD_X3_END_AVX2
1584
+    RET
1585
+%endif
1586
+
1587
 INIT_YMM avx2
1588
 cglobal pixel_sad_x4_8x8, 7,7,5
1589
     xorps           m0, m0
1590
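
The pixel_sad_x3 kernels above score one encode block against three motion-candidate references in a single pass: each SAD_X3_64x8_AVX2 (or SAD_X3_48x8_AVX2) invocation covers eight rows, the glue code between invocations advances the fenc pointer (fixed FENC_STRIDE) and the three reference pointers (shared stride in r4), and PIXEL_SAD_X3_END_AVX2 reduces the three accumulators m0..m2. A minimal scalar sketch of the same contract, assuming the usual x265 x3-SAD signature (one fenc block, three references, one reference stride, three results):

    #include <cstdint>
    #include <cstdlib>

    typedef uint8_t pixel;           // sad-a.asm is the 8-bit path
    enum { FENC_STRIDE = 64 };       // fenc blocks are packed at a fixed stride

    static void sad_x3_WxH(const pixel* fenc, const pixel* fref0,
                           const pixel* fref1, const pixel* fref2,
                           intptr_t frefstride, int width, int height,
                           int32_t res[3])
    {
        res[0] = res[1] = res[2] = 0;
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                res[0] += abs(fenc[x] - fref0[x]);
                res[1] += abs(fenc[x] - fref1[x]);
                res[2] += abs(fenc[x] - fref2[x]);
            }
            fenc  += FENC_STRIDE;    // the asm steps all four pointers together
            fref0 += frefstride;
            fref1 += frefstride;
            fref2 += frefstride;
        }
    }
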
x265_1.8.tar.gz/source/common/x86/sad16-a.asm -> x265_1.9.tar.gz/source/common/x86/sad16-a.asm Changed
624
 
1
@@ -413,77 +413,50 @@
2
 SAD  16, 32
3
 
4
 INIT_YMM avx2
5
-cglobal pixel_sad_16x64, 4,7,4
6
+cglobal pixel_sad_16x64, 4,5,5
7
     pxor    m0, m0
8
-    pxor    m3, m3
9
-    mov     r4d, 64 / 8
10
-    add     r3d, r3d
11
-    add     r1d, r1d
12
-    lea     r5,     [r1 * 3]
13
-    lea     r6,     [r3 * 3]
14
+    mov     r4d, 16
15
+    mova    m4, [pw_1]
16
 .loop:
17
     movu    m1, [r2]
18
-    movu    m2, [r2 + r3]
19
+    movu    m2, [r2 + r3 * 2]
20
     psubw   m1, [r0]
21
-    psubw   m2, [r0 + r1]
22
-    pabsw   m1, m1
23
-    pabsw   m2, m2
24
-    paddw   m0, m1
25
-    paddw   m3, m2
26
-
27
-    movu    m1, [r2 + 2 * r3]
28
-    movu    m2, [r2 + r6]
29
-    psubw   m1, [r0 + 2 * r1]
30
-    psubw   m2, [r0 + r5]
31
+    psubw   m2, [r0 + r1 * 2]
32
     pabsw   m1, m1
33
     pabsw   m2, m2
34
-    paddw   m0, m1
35
-    paddw   m3, m2
36
-
37
+    paddw   m3, m1, m2
38
     lea     r0, [r0 + 4 * r1]
39
     lea     r2, [r2 + 4 * r3]
40
 
41
     movu    m1, [r2]
42
-    movu    m2, [r2 + r3]
43
+    movu    m2, [r2 + r3 * 2]
44
     psubw   m1, [r0]
45
-    psubw   m2, [r0 + r1]
46
+    psubw   m2, [r0 + r1 * 2]
47
     pabsw   m1, m1
48
     pabsw   m2, m2
49
-    paddw   m0, m1
50
-    paddw   m3, m2
51
-
52
-    movu    m1, [r2 + 2 * r3]
53
-    movu    m2, [r2 + r6]
54
-    psubw   m1, [r0 + 2 * r1]
55
-    psubw   m2, [r0 + r5]
56
-    pabsw   m1, m1
57
-    pabsw   m2, m2
58
-    paddw   m0, m1
59
-    paddw   m3, m2
60
-
61
-    lea     r0, [r0 + 4 * r1]
62
-    lea     r2, [r2 + 4 * r3]
63
-
64
-    dec    r4d
65
-    jg .loop
66
-
67
-    HADDUWD m0, m1
68
-    HADDUWD m3, m1
69
-    HADDD   m0, m1
70
-    HADDD   m3, m1
71
+    paddw   m1, m2
72
+    pmaddwd m3, m4
73
     paddd   m0, m3
74
+    pmaddwd m1, m4
75
+    paddd   m0, m1
76
+    lea     r0, [r0+4*r1]
77
+    lea     r2, [r2+4*r3]
78
+    dec     r4d
79
+    jg      .loop
80
 
81
+    HADDD   m0, m1
82
     movd    eax, xm0
83
     RET
84
 
85
 INIT_YMM avx2
86
-cglobal pixel_sad_32x8, 4,7,5
87
+cglobal pixel_sad_32x8, 4,7,7
88
     pxor    m0, m0
89
     mov     r4d, 8/4
90
+    mova    m6, [pw_1]
91
     add     r3d, r3d
92
     add     r1d, r1d
93
-    lea     r5,     [r1 * 3]
94
-    lea     r6,     [r3 * 3]
95
+    lea     r5d,     [r1 * 3]
96
+    lea     r6d,     [r3 * 3]
97
 .loop:
98
     movu    m1, [r2]
99
     movu    m2, [r2 + 32]
100
@@ -499,8 +472,7 @@
101
     pabsw   m4, m4
102
     paddw   m1, m2
103
     paddw   m3, m4
104
-    paddw   m0, m1
105
-    paddw   m0, m3
106
+    paddw   m5, m1, m3
107
 
108
     movu    m1, [r2 + 2 * r3]
109
     movu    m2, [r2 + 2 * r3 + 32]
110
@@ -518,24 +490,28 @@
111
     pabsw   m4, m4
112
     paddw   m1, m2
113
     paddw   m3, m4
114
-    paddw   m0, m1
115
-    paddw   m0, m3
116
+    paddw   m1, m3
117
 
118
+    pmaddwd m5, m6
119
+    paddd   m0, m5
120
+    pmaddwd m1, m6
121
+    paddd   m0, m1
122
     dec    r4d
123
     jg .loop
124
 
125
-    HADDW   m0, m1
126
+    HADDD   m0, m1
127
     movd    eax, xm0
128
     RET
129
 
130
 INIT_YMM avx2
131
-cglobal pixel_sad_32x16, 4,7,5
132
+cglobal pixel_sad_32x16, 4,7,7
133
     pxor    m0, m0
134
     mov     r4d, 16/8
135
+    mova    m6, [pw_1]
136
     add     r3d, r3d
137
     add     r1d, r1d
138
-    lea     r5,     [r1 * 3]
139
-    lea     r6,     [r3 * 3]
140
+    lea     r5d,     [r1 * 3]
141
+    lea     r6d,     [r3 * 3]
142
 .loop:
143
     movu    m1, [r2]
144
     movu    m2, [r2 + 32]
145
@@ -551,8 +527,7 @@
146
     pabsw   m4, m4
147
     paddw   m1, m2
148
     paddw   m3, m4
149
-    paddw   m0, m1
150
-    paddw   m0, m3
151
+    paddw   m5, m1, m3
152
 
153
     movu    m1, [r2 + 2 * r3]
154
     movu    m2, [r2 + 2 * r3 + 32]
155
@@ -570,8 +545,12 @@
156
     pabsw   m4, m4
157
     paddw   m1, m2
158
     paddw   m3, m4
159
-    paddw   m0, m1
160
-    paddw   m0, m3
161
+    paddw   m1, m3
162
+
163
+    pmaddwd m5, m6
164
+    paddd   m0, m5
165
+    pmaddwd m1, m6
166
+    paddd   m0, m1
167
 
168
     movu    m1, [r2]
169
     movu    m2, [r2 + 32]
170
@@ -587,8 +566,7 @@
171
     pabsw   m4, m4
172
     paddw   m1, m2
173
     paddw   m3, m4
174
-    paddw   m0, m1
175
-    paddw   m0, m3
176
+    paddw   m5, m1, m3
177
 
178
     movu    m1, [r2 + 2 * r3]
179
     movu    m2, [r2 + 2 * r3 + 32]
180
@@ -606,24 +584,28 @@
181
     pabsw   m4, m4
182
     paddw   m1, m2
183
     paddw   m3, m4
184
-    paddw   m0, m1
185
-    paddw   m0, m3
186
+    paddw   m1, m3
187
 
188
+    pmaddwd m5, m6
189
+    paddd   m0, m5
190
+    pmaddwd m1, m6
191
+    paddd   m0, m1
192
     dec    r4d
193
     jg .loop
194
 
195
-    HADDW   m0, m1
196
+    HADDD   m0, m1
197
     movd    eax, xm0
198
     RET
199
 
200
 INIT_YMM avx2
201
-cglobal pixel_sad_32x24, 4,7,5
202
+cglobal pixel_sad_32x24, 4,7,7
203
     pxor    m0, m0
204
     mov     r4d, 24/4
205
+    mova    m6, [pw_1]
206
     add     r3d, r3d
207
     add     r1d, r1d
208
-    lea     r5,     [r1 * 3]
209
-    lea     r6,     [r3 * 3]
210
+    lea     r5d,     [r1 * 3]
211
+    lea     r6d,     [r3 * 3]
212
 .loop:
213
     movu    m1, [r2]
214
     movu    m2, [r2 + 32]
215
@@ -639,8 +621,7 @@
216
     pabsw   m4, m4
217
     paddw   m1, m2
218
     paddw   m3, m4
219
-    paddw   m0, m1
220
-    paddw   m0, m3
221
+    paddw   m5, m1, m3
222
 
223
     movu    m1, [r2 + 2 * r3]
224
     movu    m2, [r2 + 2 * r3 + 32]
225
@@ -656,29 +637,30 @@
226
     pabsw   m4, m4
227
     paddw   m1, m2
228
     paddw   m3, m4
229
-    paddw   m0, m1
230
-    paddw   m0, m3
231
-
232
+    paddw   m1, m3
233
+    pmaddwd m5, m6
234
+    paddd   m0, m5
235
+    pmaddwd m1, m6
236
+    paddd   m0, m1
237
     lea     r0, [r0 + 4 * r1]
238
     lea     r2, [r2 + 4 * r3]
239
 
240
     dec    r4d
241
     jg .loop
242
 
243
-    HADDUWD m0, m1
244
     HADDD   m0, m1
245
     movd    eax, xm0
246
     RET
247
 
248
-
249
 INIT_YMM avx2
250
-cglobal pixel_sad_32x32, 4,7,5
251
+cglobal pixel_sad_32x32, 4,7,7
252
     pxor    m0, m0
253
     mov     r4d, 32/4
254
+    mova    m6, [pw_1]
255
     add     r3d, r3d
256
     add     r1d, r1d
257
-    lea     r5,     [r1 * 3]
258
-    lea     r6,     [r3 * 3]
259
+    lea     r5d,     [r1 * 3]
260
+    lea     r6d,     [r3 * 3]
261
 .loop:
262
     movu    m1, [r2]
263
     movu    m2, [r2 + 32]
264
@@ -694,8 +676,7 @@
265
     pabsw   m4, m4
266
     paddw   m1, m2
267
     paddw   m3, m4
268
-    paddw   m0, m1
269
-    paddw   m0, m3
270
+    paddw   m5, m1, m3
271
 
272
     movu    m1, [r2 + 2 * r3]
273
     movu    m2, [r2 + 2 * r3 + 32]
274
@@ -711,8 +692,12 @@
275
     pabsw   m4, m4
276
     paddw   m1, m2
277
     paddw   m3, m4
278
-    paddw   m0, m1
279
-    paddw   m0, m3
280
+    paddw   m1, m3
281
+
282
+    pmaddwd m5, m6
283
+    paddd   m0, m5
284
+    pmaddwd m1, m6
285
+    paddd   m0, m1
286
 
287
     lea     r0, [r0 + 4 * r1]
288
     lea     r2, [r2 + 4 * r3]
289
@@ -720,20 +705,19 @@
290
     dec    r4d
291
     jg .loop
292
 
293
-    HADDUWD m0, m1
294
     HADDD   m0, m1
295
     movd    eax, xm0
296
     RET
297
 
298
 INIT_YMM avx2
299
-cglobal pixel_sad_32x64, 4,7,6
300
+cglobal pixel_sad_32x64, 4,7,7
301
     pxor    m0, m0
302
-    pxor    m5, m5
303
     mov     r4d, 64 / 4
304
+    mova    m6, [pw_1]
305
     add     r3d, r3d
306
     add     r1d, r1d
307
-    lea     r5,     [r1 * 3]
308
-    lea     r6,     [r3 * 3]
309
+    lea     r5d,     [r1 * 3]
310
+    lea     r6d,     [r3 * 3]
311
 .loop:
312
     movu    m1, [r2]
313
     movu    m2, [r2 + 32]
314
@@ -749,8 +733,7 @@
315
     pabsw   m4, m4
316
     paddw   m1, m2
317
     paddw   m3, m4
318
-    paddw   m0, m1
319
-    paddw   m5, m3
320
+    paddw   m5, m1, m3
321
 
322
     movu    m1, [r2 + 2 * r3]
323
     movu    m2, [r2 + 2 * r3 + 32]
324
@@ -766,29 +749,28 @@
325
     pabsw   m4, m4
326
     paddw   m1, m2
327
     paddw   m3, m4
328
-    paddw   m0, m1
329
-    paddw   m5, m3
330
+    paddw   m1, m3
331
+
332
+    pmaddwd m5, m6
333
+    paddd   m0, m5
334
+    pmaddwd m1, m6
335
+    paddd   m0, m1
336
+
337
     lea     r0, [r0 + 4 * r1]
338
     lea     r2, [r2 + 4 * r3]
339
 
340
-    dec    r4d
341
+    dec     r4d
342
     jg .loop
343
 
344
-    HADDUWD m0, m1
345
-    HADDUWD m5, m1
346
     HADDD   m0, m1
347
-    HADDD   m5, m1
348
-    paddd   m0, m5
349
-
350
     movd    eax, xm0
351
     RET
352
 
353
 INIT_YMM avx2
354
 cglobal pixel_sad_48x64, 4, 5, 7
355
     pxor    m0, m0
356
-    pxor    m5, m5
357
-    pxor    m6, m6
358
     mov     r4d, 64/2
359
+    mova    m6, [pw_1]
360
     add     r3d, r3d
361
     add     r1d, r1d
362
 .loop:
363
@@ -801,9 +783,8 @@
364
     pabsw   m1, m1
365
     pabsw   m2, m2
366
     pabsw   m3, m3
367
-    paddw   m0, m1
368
-    paddw   m5, m2
369
-    paddw   m6, m3
370
+    paddw   m1, m2
371
+    paddw   m5, m3, m1
372
 
373
     movu    m1, [r2 + r3 + 0 * mmsize]
374
     movu    m2, [r2 + r3 + 1 * mmsize]
375
@@ -814,29 +795,28 @@
376
     pabsw   m1, m1
377
     pabsw   m2, m2
378
     pabsw   m3, m3
379
-    paddw   m0, m1
380
-    paddw   m5, m2
381
-    paddw   m6, m3
382
+    paddw   m1, m2
383
+    paddw   m3, m1
384
 
385
+    pmaddwd m5, m6
386
+    paddd   m0, m5
387
+    pmaddwd m3, m6
388
+    paddd   m0, m3
389
     lea     r0, [r0 + 2 * r1]
390
     lea     r2, [r2 + 2 * r3]
391
 
392
     dec     r4d
393
     jg      .loop
394
 
395
-    HADDUWD m0, m1
396
-    HADDUWD m5, m1
397
-    HADDUWD m6, m1
398
-    paddd   m0, m5
399
-    paddd   m0, m6
400
-    HADDD   m0, m1
401
+    HADDD   m0, m3
402
     movd    eax, xm0
403
     RET
404
 
405
 INIT_YMM avx2
406
-cglobal pixel_sad_64x16, 4, 5, 5
407
+cglobal pixel_sad_64x16, 4, 5, 7
408
     pxor    m0, m0
409
     mov     r4d, 16 / 2
410
+    mova    m6, [pw_1]
411
     add     r3d, r3d
412
     add     r1d, r1d
413
 .loop:
414
@@ -854,8 +834,8 @@
415
     pabsw   m4, m4
416
     paddw   m1, m2
417
     paddw   m3, m4
418
-    paddw   m0, m1
419
-    paddw   m0, m3
420
+    paddw   m5, m1, m3
421
+
422
     movu    m1, [r2 + r3]
423
     movu    m2, [r2 + r3 + 32]
424
     movu    m3, [r2 + r3 + 64]
425
@@ -870,24 +850,28 @@
426
     pabsw   m4, m4
427
     paddw   m1, m2
428
     paddw   m3, m4
429
-    paddw   m0, m1
430
-    paddw   m0, m3
431
+    paddw   m1, m3
432
+
433
+    pmaddwd m5, m6
434
+    paddd   m0, m5
435
+    pmaddwd m1, m6
436
+    paddd   m0, m1
437
+
438
     lea     r0, [r0 + 2 * r1]
439
     lea     r2, [r2 + 2 * r3]
440
 
441
-    dec    r4d
442
-    jg     .loop
443
+    dec     r4d
444
+    jg      .loop
445
 
446
-    HADDUWD m0, m1
447
     HADDD   m0, m1
448
     movd    eax, xm0
449
     RET
450
 
451
 INIT_YMM avx2
452
-cglobal pixel_sad_64x32, 4, 5, 6
453
+cglobal pixel_sad_64x32, 4, 5, 7
454
     pxor    m0, m0
455
-    pxor    m5, m5
456
     mov     r4d, 32 / 2
457
+    mova    m6, [pw_1]
458
     add     r3d, r3d
459
     add     r1d, r1d
460
 .loop:
461
@@ -905,8 +889,7 @@
462
     pabsw   m4, m4
463
     paddw   m1, m2
464
     paddw   m3, m4
465
-    paddw   m0, m1
466
-    paddw   m5, m3
467
+    paddw   m5, m1, m3
468
 
469
     movu    m1, [r2 + r3]
470
     movu    m2, [r2 + r3 + 32]
471
@@ -922,29 +905,27 @@
472
     pabsw   m4, m4
473
     paddw   m1, m2
474
     paddw   m3, m4
475
-    paddw   m0, m1
476
-    paddw   m5, m3
477
+    paddw   m1, m3
478
+
479
+    pmaddwd m5, m6
480
+    paddd   m0, m5
481
+    pmaddwd m1, m6
482
+    paddd   m0, m1
483
     lea     r0, [r0 + 2 * r1]
484
     lea     r2, [r2 + 2 * r3]
485
 
486
-    dec    r4d
487
-    jg     .loop
488
+    dec     r4d
489
+    jg      .loop
490
 
491
-    HADDUWD m0, m1
492
-    HADDUWD m5, m1
493
-    paddd   m0, m5
494
     HADDD   m0, m1
495
-    
496
     movd    eax, xm0
497
     RET
498
 
499
 INIT_YMM avx2
500
-cglobal pixel_sad_64x48, 4, 5, 8
501
+cglobal pixel_sad_64x48, 4, 5, 7
502
     pxor    m0, m0
503
-    pxor    m5, m5
504
-    pxor    m6, m6
505
-    pxor    m7, m7
506
     mov     r4d, 48 / 2
507
+    mova    m6, [pw_1]
508
     add     r3d, r3d
509
     add     r1d, r1d
510
 .loop:
511
@@ -960,10 +941,9 @@
512
     pabsw   m2, m2
513
     pabsw   m3, m3
514
     pabsw   m4, m4
515
-    paddw   m0, m1
516
-    paddw   m5, m2
517
-    paddw   m6, m3
518
-    paddw   m7, m4
519
+    paddw   m1, m2
520
+    paddw   m3, m4
521
+    paddw   m5, m1, m3
522
 
523
     movu    m1, [r2 + r3]
524
     movu    m2, [r2 + r3 + 32]
525
@@ -977,35 +957,30 @@
526
     pabsw   m2, m2
527
     pabsw   m3, m3
528
     pabsw   m4, m4
529
-    paddw   m0, m1
530
-    paddw   m5, m2
531
-    paddw   m6, m3
532
-    paddw   m7, m4
533
+    paddw   m1, m2
534
+    paddw   m3, m4
535
+    paddw   m1, m3
536
+
537
+    pmaddwd m5, m6
538
+    paddd   m0, m5
539
+    pmaddwd m1, m6
540
+    paddd   m0, m1
541
 
542
     lea     r0, [r0 + 2 * r1]
543
     lea     r2, [r2 + 2 * r3]
544
 
545
-    dec    r4d
546
-    jg     .loop
547
+    dec     r4d
548
+    jg      .loop
549
 
550
-    HADDUWD m0, m1
551
-    HADDUWD m5, m1
552
-    HADDUWD m6, m1
553
-    HADDUWD m7, m1
554
-    paddd   m0, m5
555
-    paddd   m0, m6
556
-    paddd   m0, m7
557
     HADDD   m0, m1
558
     movd    eax, xm0
559
     RET
560
 
561
 INIT_YMM avx2
562
-cglobal pixel_sad_64x64, 4, 5, 8
563
+cglobal pixel_sad_64x64, 4, 5, 7
564
     pxor    m0, m0
565
-    pxor    m5, m5
566
-    pxor    m6, m6
567
-    pxor    m7, m7
568
     mov     r4d, 64 / 2
569
+    mova    m6, [pw_1]
570
     add     r3d, r3d
571
     add     r1d, r1d
572
 .loop:
573
@@ -1021,10 +996,9 @@
574
     pabsw   m2, m2
575
     pabsw   m3, m3
576
     pabsw   m4, m4
577
-    paddw   m0, m1
578
-    paddw   m5, m2
579
-    paddw   m6, m3
580
-    paddw   m7, m4
581
+    paddw   m1, m2
582
+    paddw   m3, m4
583
+    paddw   m5, m1, m3
584
 
585
     movu    m1, [r2 + r3]
586
     movu    m2, [r2 + r3 + 32]
587
@@ -1038,25 +1012,22 @@
588
     pabsw   m2, m2
589
     pabsw   m3, m3
590
     pabsw   m4, m4
591
-    paddw   m0, m1
592
-    paddw   m5, m2
593
-    paddw   m6, m3
594
-    paddw   m7, m4
595
+    paddw   m1, m2
596
+    paddw   m3, m4
597
+    paddw   m1, m3
598
+
599
+    pmaddwd m5, m6
600
+    paddd   m0, m5
601
+    pmaddwd m1, m6
602
+    paddd   m0, m1
603
 
604
     lea     r0, [r0 + 2 * r1]
605
     lea     r2, [r2 + 2 * r3]
606
 
607
-    dec    r4d
608
-    jg     .loop
609
+    dec     r4d
610
+    jg      .loop
611
 
612
-    HADDUWD m0, m1
613
-    HADDUWD m5, m1
614
-    HADDUWD m6, m1
615
-    HADDUWD m7, m1
616
-    paddd   m0, m5
617
-    paddd   m0, m6
618
-    paddd   m0, m7
619
-    HADDD   m0, m1    
620
+    HADDD   m0, m1
621
     movd    eax, xm0
622
     RET
623
 
624
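
In the 16-bit SAD rewrite above (sad16-a.asm), the word-lane accumulation that previously needed several parallel accumulators and a final HADDUWD widening is replaced by pmaddwd against the pw_1 constant: multiplying the absolute differences by 1 and summing adjacent pairs widens each partial sum into 32-bit lanes inside the loop, so word-lane overflow cannot occur even for 64-wide blocks at 10/12-bit depth, where a single |a - b| can already be 4095. One accumulation step, sketched with AVX2 intrinsics under that reading (register roles mirror the asm only loosely):

    #include <immintrin.h>

    // acc32 plays the role of m0 (dword sums); cur/ref hold 16 samples of
    // 10/12-bit data in word lanes, as loaded by movu in the asm.
    static inline __m256i sad16_step(__m256i acc32, __m256i cur, __m256i ref)
    {
        const __m256i pw_1 = _mm256_set1_epi16(1);                    // [pw_1]
        __m256i d = _mm256_abs_epi16(_mm256_sub_epi16(cur, ref));     // psubw + pabsw
        return _mm256_add_epi32(acc32, _mm256_madd_epi16(d, pw_1));   // pmaddwd + paddd
    }
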
x265_1.8.tar.gz/source/common/x86/ssd-a.asm -> x265_1.9.tar.gz/source/common/x86/ssd-a.asm Changed
724
 
1
@@ -2,11 +2,13 @@
2
 ;* ssd-a.asm: x86 ssd functions
3
 ;*****************************************************************************
4
 ;* Copyright (C) 2003-2013 x264 project
5
+;* Copyright (C) 2013-2015 x265 project
6
 ;*
7
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8
 ;*          Fiona Glaser <fiona@x264.com>
9
 ;*          Laurent Aimar <fenrir@via.ecp.fr>
10
 ;*          Alex Izvorski <aizvorksi@gmail.com>
11
+;*          Min Chen <chenm003@163.com>
12
 ;*
13
 ;* This program is free software; you can redistribute it and/or modify
14
 ;* it under the terms of the GNU General Public License as published by
15
@@ -105,8 +107,32 @@
16
     dec    r4d
17
     jg .loop
18
 %endif
19
+%if BIT_DEPTH == 12 && %1 >= 16 && %2 >=16
20
+%if  mmsize == 16
21
+    movu            m5, m0
22
+    pxor            m6, m6
23
+    punpckldq       m0, m6
24
+    punpckhdq       m5, m6
25
+    paddq           m0, m5
26
+    movhlps         m5, m0
27
+    paddq           m0, m5
28
+    movq            r6, xm0
29
+%elif mmsize == 32
30
+    movu            m1, m0
31
+    pxor            m2, m2
32
+    punpckldq       m0, m2
33
+    punpckhdq       m1, m2
34
+    paddq           m0, m1
35
+    vextracti128    xm2, m0, 1
36
+    paddq           xm2, xm0
37
+    movhlps         xm1, xm2
38
+    paddq           xm2, xm1
39
+    movq            rax, xm2
40
+%endif
41
+%else 
42
     HADDD   m0, m5
43
-    movd   eax, xm0
44
+    movd    eax,xm0
45
+%endif
46
 %ifidn movu,movq ; detect MMX
47
     EMMS
48
 %endif
49
@@ -168,6 +194,154 @@
50
     movq        rax, m9
51
     RET
52
 %endmacro
53
+%macro SSD_ONE_SS_32 0
54
+cglobal pixel_ssd_ss_32x32, 4,5,8
55
+    add         r1d, r1d
56
+    add         r3d, r3d
57
+    pxor        m5, m5
58
+    pxor        m6, m6
59
+    mov         r4d, 2
60
+
61
+.iterate:
62
+    mov         r5d, 16
63
+    pxor        m4, m4
64
+    pxor        m7, m7
65
+.loop:
66
+    movu        m0, [r0]
67
+    movu        m1, [r0 + mmsize]
68
+    movu        m2, [r2]
69
+    movu        m3, [r2 + mmsize]
70
+    psubw       m0, m2
71
+    psubw       m1, m3
72
+    pmaddwd     m0, m0
73
+    pmaddwd     m1, m1
74
+    paddd       m4, m0
75
+    paddd       m7, m1
76
+    movu        m0, [r0 + 2 * mmsize]
77
+    movu        m1, [r0 + 3 * mmsize]
78
+    movu        m2, [r2 + 2 * mmsize]
79
+    movu        m3, [r2 + 3 * mmsize]
80
+    psubw       m0, m2
81
+    psubw       m1, m3
82
+    pmaddwd     m0, m0
83
+    pmaddwd     m1, m1
84
+    paddd       m4, m0
85
+    paddd       m7, m1
86
+
87
+    add         r0, r1
88
+    add         r2, r3
89
+
90
+    dec         r5d
91
+    jnz         .loop
92
+
93
+    mova        m0, m4
94
+    pxor        m1, m1
95
+    punpckldq   m0, m1
96
+    punpckhdq   m4, m1
97
+    paddq       m5, m0
98
+    paddq       m6, m4
99
+
100
+    mova        m0, m7
101
+    punpckldq   m0, m1
102
+    punpckhdq   m7, m1
103
+    paddq       m5, m0
104
+    paddq       m6, m7
105
+
106
+    dec         r4d
107
+    jnz         .iterate
108
+
109
+    paddq       m5, m6
110
+    movhlps     m2, m5
111
+    paddq       m5, m2
112
+    movq        rax, m5
113
+    RET
114
+%endmacro
115
+
116
+%macro SSD_ONE_SS_64 0
117
+cglobal pixel_ssd_ss_64x64, 4,6,8
118
+    add         r1d, r1d
119
+    add         r3d, r3d
120
+    pxor        m5, m5
121
+    pxor        m6, m6
122
+    mov         r5d, 8
123
+
124
+.iterate:
125
+    pxor        m4, m4
126
+    pxor        m7, m7
127
+    mov         r4d, 8
128
+
129
+.loop:
130
+    ;----process 1st half a row----
131
+    movu        m0, [r0]
132
+    movu        m1, [r0 + mmsize]
133
+    movu        m2, [r2]
134
+    movu        m3, [r2 + mmsize]
135
+    psubw       m0, m2
136
+    psubw       m1, m3
137
+    pmaddwd     m0, m0
138
+    pmaddwd     m1, m1
139
+    paddd       m4, m0
140
+    paddd       m7, m1
141
+    movu        m0, [r0 + 2 * mmsize]
142
+    movu        m1, [r0 + 3 * mmsize]
143
+    movu        m2, [r2 + 2 * mmsize]
144
+    movu        m3, [r2 + 3 * mmsize]
145
+    psubw       m0, m2
146
+    psubw       m1, m3
147
+    pmaddwd     m0, m0
148
+    pmaddwd     m1, m1
149
+    paddd       m4, m0
150
+    paddd       m7, m1
151
+    ;----process 2nd half a row----
152
+    movu        m0, [r0 + 4 * mmsize]
153
+    movu        m1, [r0 + 5 * mmsize]
154
+    movu        m2, [r2 + 4 * mmsize]
155
+    movu        m3, [r2 + 5 * mmsize]
156
+    psubw       m0, m2
157
+    psubw       m1, m3
158
+    pmaddwd     m0, m0
159
+    pmaddwd     m1, m1
160
+    paddd       m4, m0
161
+    paddd       m7, m1
162
+    movu        m0, [r0 + 6 * mmsize]
163
+    movu        m1, [r0 + 7 * mmsize]
164
+    movu        m2, [r2 + 6 * mmsize]
165
+    movu        m3, [r2 + 7 * mmsize]
166
+    psubw       m0, m2
167
+    psubw       m1, m3
168
+    pmaddwd     m0, m0
169
+    pmaddwd     m1, m1
170
+    paddd       m4, m0
171
+    paddd       m7, m1
172
+
173
+    add         r0, r1
174
+    add         r2, r3
175
+
176
+    dec         r4d
177
+    jnz         .loop
178
+
179
+    mova        m0, m4
180
+    pxor        m1, m1
181
+    punpckldq   m0, m1
182
+    punpckhdq   m4, m1
183
+    paddq       m5, m0
184
+    paddq       m6, m4
185
+
186
+    mova        m0, m7
187
+    punpckldq   m0, m1
188
+    punpckhdq   m7, m1
189
+    paddq       m5, m0
190
+    paddq       m6, m7
191
+
192
+    dec         r5
193
+    jne         .iterate
194
+
195
+    paddq       m5, m6
196
+    movhlps     m2, m5
197
+    paddq       m5, m2
198
+    movq        rax, m5
199
+    RET
200
+%endmacro
201
 
202
 %macro SSD_TWO 2
203
 cglobal pixel_ssd_ss_%1x%2, 4,7,8
204
@@ -265,8 +439,19 @@
205
     lea     r2,  [r2 + r6]
206
     dec     r4d
207
     jnz  .loop
208
+%if BIT_DEPTH == 10 && %1 == 64 && %2 ==64
209
+    movu        m5, m0
210
+    pxor        m6, m6
211
+    punpckldq   m0, m6
212
+    punpckhdq   m5, m6
213
+    paddq       m0, m5
214
+    movhlps     m5, m0
215
+    paddq       m0, m5
216
+    movq        rax, xm0
217
+%else 
218
     HADDD   m0, m5
219
     movd   eax, xm0
220
+%endif
221
     RET
222
 %endmacro
223
 %macro SSD_24 2
224
@@ -370,120 +555,146 @@
225
 %endmacro
226
 
227
 INIT_YMM avx2
228
-cglobal pixel_ssd_16x16, 4,7,8
229
+cglobal pixel_ssd_16x16, 4,7,3
230
     FIX_STRIDES r1, r3
231
-    lea     r5, [3 * r1]
232
-    lea     r6, [3 * r3]
233
-    mov    r4d, 4
234
-    pxor    m0, m0
235
+    lea             r5, [3 * r1]
236
+    lea             r6, [3 * r3]
237
+    mov             r4d, 4
238
+    pxor            m0, m0
239
 .loop:
240
-    movu    m1, [r0]
241
-    movu    m2, [r0 + r1]
242
-    movu    m3, [r0 + r1 * 2]
243
-    movu    m4, [r0 + r5]
244
-    movu    m6, [r2]
245
-    movu    m7, [r2 + r3]
246
-    psubw   m1, m6
247
-    psubw   m2, m7
248
-    movu    m6, [r2 + r3 * 2]
249
-    movu    m7, [r2 + r6]
250
-    psubw   m3, m6
251
-    psubw   m4, m7
252
-
253
-    lea     r0, [r0 + r1 * 4]
254
-    lea     r2, [r2 + r3 * 4]
255
-
256
-    pmaddwd m1, m1
257
-    pmaddwd m2, m2
258
-    pmaddwd m3, m3
259
-    pmaddwd m4, m4
260
-    paddd   m1, m2
261
-    paddd   m3, m4
262
-    paddd   m0, m1
263
-    paddd   m0, m3
264
-
265
-    dec    r4d
266
-    jg .loop
267
-
268
-    HADDD   m0, m5
269
-    movd   eax, xm0
270
-    RET
271
+    movu            m1, [r0]
272
+    movu            m2, [r0 + r1] 
273
+    psubw           m1, [r2]
274
+    psubw           m2, [r2 + r3]
275
+    pmaddwd         m1, m1
276
+    pmaddwd         m2, m2
277
+    paddd           m0, m1
278
+    paddd           m0, m2
279
+    movu            m1, [r0 + r1 * 2]
280
+    movu            m2, [r0 + r5]
281
+    psubw           m1, [r2 + r3 * 2]
282
+    psubw           m2, [r2 + r6]
283
+    pmaddwd         m1, m1
284
+    pmaddwd         m2, m2
285
+    paddd           m0, m1
286
+    paddd           m0, m2
287
+    lea             r0, [r0 + r1 * 4]
288
+    lea             r2, [r2 + r3 * 4]
289
+
290
+    dec             r4d
291
+    jg              .loop
292
+
293
+    mova            m1, m0
294
+    pxor            m2, m2
295
+    punpckldq       m0, m2
296
+    punpckhdq       m1, m2
297
+    paddq           m0, m1
298
+    vextracti128    xm2, m0, 1
299
+    paddq           xm2, xm0
300
+    movhlps         xm1, xm2
301
+    paddq           xm2, xm1
302
+    movq            rax, xm2
303
+    ret
304
 
305
 INIT_YMM avx2
306
-cglobal pixel_ssd_32x32, 4,7,8
307
-    add     r1, r1
308
-    add     r3, r3
309
-    mov     r4d, 16
310
-    pxor    m0, m0
311
-.loop:
312
-    movu    m1, [r0]
313
-    movu    m2, [r0 + 32]
314
-    movu    m3, [r0 + r1]
315
-    movu    m4, [r0 + r1 + 32]
316
-    movu    m6, [r2]
317
-    movu    m7, [r2 + 32]
318
-    psubw   m1, m6
319
-    psubw   m2, m7
320
-    movu    m6, [r2 + r3]
321
-    movu    m7, [r2 + r3 + 32]
322
-    psubw   m3, m6
323
-    psubw   m4, m7
324
-
325
-    lea     r0, [r0 + r1 * 2]
326
-    lea     r2, [r2 + r3 * 2]
327
-
328
-    pmaddwd m1, m1
329
-    pmaddwd m2, m2
330
-    pmaddwd m3, m3
331
-    pmaddwd m4, m4
332
-    paddd   m1, m2
333
-    paddd   m3, m4
334
-    paddd   m0, m1
335
-    paddd   m0, m3
336
+cglobal pixel_ssd_32x2
337
+    pxor            m0, m0
338
+
339
+    movu            m1, [r0]
340
+    movu            m2, [r0 + 32]
341
+    psubw           m1, [r2]
342
+    psubw           m2, [r2 + 32]
343
+    pmaddwd         m1, m1
344
+    pmaddwd         m2, m2
345
+    paddd           m0, m1
346
+    paddd           m0, m2
347
+    movu            m1, [r0 + r1]
348
+    movu            m2, [r0 + r1 + 32]
349
+    psubw           m1, [r2 + r3]
350
+    psubw           m2, [r2 + r3 + 32]
351
+    pmaddwd         m1, m1
352
+    pmaddwd         m2, m2
353
+    paddd           m0, m1
354
+    paddd           m0, m2
355
+
356
+    lea             r0, [r0 + r1 * 2]
357
+    lea             r2, [r2 + r3 * 2]
358
+
359
+
360
+    mova            m1, m0
361
+    pxor            m2, m2
362
+    punpckldq       m0, m2
363
+    punpckhdq       m1, m2
364
+
365
+    paddq           m3, m0
366
+    paddq           m4, m1
367
+ret
368
 
369
-    dec    r4d
370
-    jg .loop
371
-
372
-    HADDD   m0, m5
373
-    movd   eax, xm0
374
-    RET
375
+INIT_YMM avx2
376
+cglobal pixel_ssd_32x32, 4,5,5
377
+    add             r1, r1
378
+    add             r3, r3
379
+    pxor            m3, m3
380
+    pxor            m4, m4
381
+    mov             r4, 16
382
+.iterate:
383
+    call            pixel_ssd_32x2
384
+    dec             r4d
385
+    jne             .iterate
386
+
387
+    paddq           m3, m4
388
+    vextracti128    xm4, m3, 1
389
+    paddq           xm3, xm4
390
+    movhlps         xm4, xm3
391
+    paddq           xm3, xm4
392
+    movq            rax, xm3
393
+RET
394
 
395
 INIT_YMM avx2
396
-cglobal pixel_ssd_64x64, 4,7,8
397
-    FIX_STRIDES r1, r3
398
-    mov    r4d, 64
399
-    pxor    m0, m0
400
+cglobal pixel_ssd_64x64, 4,5,5
401
+    FIX_STRIDES     r1, r3
402
+    mov             r4d, 64
403
+    pxor            m3, m3
404
+    pxor            m4, m4
405
 .loop:
406
-    movu    m1, [r0]
407
-    movu    m2, [r0+32]
408
-    movu    m3, [r0+32*2]
409
-    movu    m4, [r0+32*3]
410
-    movu    m6, [r2]
411
-    movu    m7, [r2+32]
412
-    psubw   m1, m6
413
-    psubw   m2, m7
414
-    movu    m6, [r2+32*2]
415
-    movu    m7, [r2+32*3]
416
-    psubw   m3, m6
417
-    psubw   m4, m7
418
-
419
-    lea     r0, [r0+r1]
420
-    lea     r2, [r2+r3]
421
-
422
-    pmaddwd m1, m1
423
-    pmaddwd m2, m2
424
-    pmaddwd m3, m3
425
-    pmaddwd m4, m4
426
-    paddd   m1, m2
427
-    paddd   m3, m4
428
-    paddd   m0, m1
429
-    paddd   m0, m3
430
-
431
-    dec    r4d
432
-    jg .loop
433
-
434
-    HADDD   m0, m5
435
-    movd   eax, xm0
436
+    pxor            m0, m0
437
+    movu            m1, [r0]
438
+    movu            m2, [r0+32]
439
+    psubw           m1, [r2]
440
+    psubw           m2, [r2+32]
441
+    pmaddwd         m1, m1
442
+    pmaddwd         m2, m2
443
+    paddd           m0, m1
444
+    paddd           m0, m2
445
+    movu            m1, [r0+32*2]
446
+    movu            m2, [r0+32*3]
447
+    psubw           m1, [r2+32*2]
448
+    psubw           m2, [r2+32*3]
449
+    pmaddwd         m1, m1
450
+    pmaddwd         m2, m2
451
+    paddd           m0, m1
452
+    paddd           m0, m2
453
+
454
+    lea             r0, [r0+r1]
455
+    lea             r2, [r2+r3]
456
+
457
+    mova            m1, m0
458
+    pxor            m2, m2
459
+    punpckldq       m0, m2
460
+    punpckhdq       m1, m2
461
+
462
+    paddq           m3, m0
463
+    paddq           m4, m1
464
+
465
+    dec             r4d
466
+    jg              .loop
467
+
468
+    paddq           m3, m4
469
+    vextracti128    xm4, m3, 1
470
+    paddq           xm3, xm4
471
+    movhlps         xm4, xm3
472
+    paddq           xm3, xm4
473
+    movq            rax, xm3
474
     RET
475
 
476
 INIT_MMX mmx2
477
@@ -511,24 +722,23 @@
478
 SSD_ONE    32,  8
479
 SSD_ONE    32, 16
480
 SSD_ONE    32, 24
481
-SSD_ONE    32, 32
482
 
483
 %if BIT_DEPTH <= 10
484
     SSD_ONE    32, 64
485
+    SSD_ONE    32, 32
486
+    SSD_TWO    64, 64
487
 %else
488
     SSD_ONE_32
489
+    SSD_ONE_SS_32
490
+    SSD_ONE_SS_64
491
 %endif
492
-
493
 SSD_TWO    48, 64
494
 SSD_TWO    64, 16
495
 SSD_TWO    64, 32
496
 SSD_TWO    64, 48
497
-SSD_TWO    64, 64
498
+
499
 INIT_YMM avx2
500
-SSD_ONE    16,  8
501
-SSD_ONE    16, 16
502
-SSD_ONE    32, 32
503
-SSD_ONE    64, 64
504
+SSD_ONE    16, 8
505
 SSD_ONE    16, 32
506
 SSD_ONE    32, 64
507
 %endif ; HIGH_BIT_DEPTH
508
@@ -1002,6 +1212,172 @@
509
 SSD_SS_32xN
510
 SSD_SS_48
511
 SSD_SS_64xN
512
+
513
+INIT_YMM avx2
514
+cglobal pixel_ssd_ss_16x16, 4,6,4
515
+    add         r1d, r1d
516
+    add         r3d, r3d
517
+    pxor        m2, m2
518
+    pxor        m3, m3
519
+    lea         r4, [3 * r1]
520
+    lea         r5, [3 * r3]
521
+
522
+    movu        m0, [r0]
523
+    movu        m1, [r0 + r1]
524
+    psubw       m0, [r2]
525
+    psubw       m1, [r2 + r3]
526
+    pmaddwd     m0, m0
527
+    pmaddwd     m1, m1
528
+    paddd       m2, m0
529
+    paddd       m3, m1
530
+
531
+    movu        m0, [r0 + 2 * r1]
532
+    movu        m1, [r0 + r4]
533
+    psubw       m0, [r2 + 2 * r3]
534
+    psubw       m1, [r2 + r5]
535
+    pmaddwd     m0, m0
536
+    pmaddwd     m1, m1
537
+    paddd       m2, m0
538
+    paddd       m3, m1
539
+
540
+    lea         r0, [r0 + 4 * r1]
541
+    lea         r2, [r2 + 4 * r3]
542
+
543
+    movu        m0, [r0]
544
+    movu        m1, [r0 + r1]
545
+    psubw       m0, [r2]
546
+    psubw       m1, [r2 + r3]
547
+    pmaddwd     m0, m0
548
+    pmaddwd     m1, m1
549
+    paddd       m2, m0
550
+    paddd       m3, m1
551
+
552
+    movu        m0, [r0 + 2 * r1]
553
+    movu        m1, [r0 + r4]
554
+    psubw       m0, [r2 + 2 * r3]
555
+    psubw       m1, [r2 + r5]
556
+    pmaddwd     m0, m0
557
+    pmaddwd     m1, m1
558
+    paddd       m2, m0
559
+    paddd       m3, m1
560
+
561
+    lea         r0, [r0 + 4 * r1]
562
+    lea         r2, [r2 + 4 * r3]
563
+
564
+    movu        m0, [r0]
565
+    movu        m1, [r0 + r1]
566
+    psubw       m0, [r2]
567
+    psubw       m1, [r2 + r3]
568
+    pmaddwd     m0, m0
569
+    pmaddwd     m1, m1
570
+    paddd       m2, m0
571
+    paddd       m3, m1
572
+
573
+    movu        m0, [r0 + 2 * r1]
574
+    movu        m1, [r0 + r4]
575
+    psubw       m0, [r2 + 2 * r3]
576
+    psubw       m1, [r2 + r5]
577
+    pmaddwd     m0, m0
578
+    pmaddwd     m1, m1
579
+    paddd       m2, m0
580
+    paddd       m3, m1
581
+
582
+    lea         r0, [r0 + 4 * r1]
583
+    lea         r2, [r2 + 4 * r3]
584
+
585
+    movu        m0, [r0]
586
+    movu        m1, [r0 + r1]
587
+    psubw       m0, [r2]
588
+    psubw       m1, [r2 + r3]
589
+    pmaddwd     m0, m0
590
+    pmaddwd     m1, m1
591
+    paddd       m2, m0
592
+    paddd       m3, m1
593
+
594
+    movu        m0, [r0 + 2 * r1]
595
+    movu        m1, [r0 + r4]
596
+    psubw       m0, [r2 + 2 * r3]
597
+    psubw       m1, [r2 + r5]
598
+    pmaddwd     m0, m0
599
+    pmaddwd     m1, m1
600
+    paddd       m2, m0
601
+    paddd       m3, m1
602
+
603
+    paddd       m2, m3
604
+    HADDD       m2, m0
605
+    movd        eax, xm2
606
+    RET
607
+
608
+INIT_YMM avx2
609
+cglobal pixel_ssd_ss_32x32, 4,5,4
610
+    add         r1d, r1d
611
+    add         r3d, r3d
612
+    pxor        m2, m2
613
+    pxor        m3, m3
614
+    mov         r4d, 16
615
+.loop:
616
+    movu        m0, [r0]
617
+    movu        m1, [r0 + mmsize] 
618
+    psubw       m0, [r2]
619
+    psubw       m1, [r2 + mmsize]
620
+    pmaddwd     m0, m0
621
+    pmaddwd     m1, m1
622
+    paddd       m2, m0
623
+    paddd       m3, m1
624
+    movu        m0, [r0 + r1]
625
+    movu        m1, [r0 + r1 + mmsize]
626
+    psubw       m0, [r2 + r3]
627
+    psubw       m1, [r2 + r3 + mmsize]
628
+    pmaddwd     m0, m0
629
+    pmaddwd     m1, m1
630
+    paddd       m2, m0
631
+    paddd       m3, m1
632
+    lea         r0, [r0 + 2 * r1]
633
+    lea         r2, [r2 + 2 * r3]
634
+    dec         r4d
635
+    jne         .loop
636
+
637
+    paddd       m2, m3
638
+    HADDD       m2, m0
639
+    movd        eax, xm2
640
+    RET
641
+
642
+INIT_YMM avx2
643
+cglobal pixel_ssd_ss_64x64, 4,5,4
644
+    add         r1d, r1d
645
+    add         r3d, r3d
646
+    pxor        m2, m2
647
+    pxor        m3, m3
648
+    mov         r4d,64
649
+.loop:
650
+    movu        m0, [r0]
651
+    movu        m1, [r0 + mmsize]
652
+    psubw       m0, [r2]
653
+    psubw       m1, [r2 + mmsize]
654
+    pmaddwd     m0, m0
655
+    pmaddwd     m1, m1
656
+    paddd       m2, m0
657
+    paddd       m3, m1
658
+    movu        m0, [r0 + 2 * mmsize]
659
+    movu        m1, [r0 + 3 * mmsize]
660
+    psubw       m0, [r2 + 2 * mmsize]
661
+    psubw       m1, [r2 + 3 * mmsize]
662
+    pmaddwd     m0, m0
663
+    pmaddwd     m1, m1
664
+    paddd       m2, m0
665
+    paddd       m3, m1
666
+
667
+    add         r0, r1
668
+    add         r2, r3
669
+
670
+    dec         r4d
671
+    jne         .loop
672
+
673
+    paddd       m2, m3
674
+    HADDD       m2, m0
675
+    movd        eax, xm2
676
+    RET
677
+
678
 %endif ; !HIGH_BIT_DEPTH
679
 
680
 %if HIGH_BIT_DEPTH == 0
681
@@ -2729,9 +3105,20 @@
682
     dec     r2d
683
     jnz    .loop
684
 
685
+%if BIT_DEPTH >= 10
686
+    movu            m1, m0
687
+    pxor            m2, m2
688
+    punpckldq       m0, m2
689
+    punpckhdq       m1, m2
690
+    paddq           m0, m1
691
+    movhlps         m1, m0
692
+    paddq           m0, m1
693
+    movq            rax, xm0
694
+%else
695
     ; calculate sum and return
696
     HADDD   m0, m1
697
     movd    eax, m0
698
+%endif
699
     RET
700
 
701
 INIT_YMM avx2
702
@@ -2803,8 +3190,20 @@
703
 
704
     dec     r2d
705
     jnz    .loop
706
-
707
+%if BIT_DEPTH >= 10
708
+    movu            m1, m0
709
+    pxor            m2, m2
710
+    punpckldq       m0, m2
711
+    punpckhdq       m1, m2
712
+    paddq           m0, m1
713
+    vextracti128    xm2, m0, 1
714
+    paddq           xm2, xm0
715
+    movhlps         xm1, xm2
716
+    paddq           xm2, xm1
717
+    movq            rax, xm2
718
+%else
719
     ; calculate sum and return
720
     HADDD   m0, m1
721
     movd    eax, xm0
722
+%endif
723
     RET
724
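
The ssd-a.asm changes above address the same overflow problem one power higher: a sum of squared differences over a 64x64 block of 12-bit samples can reach 4095^2 * 4096, roughly 6.9e10, which no longer fits in 32 bits. The 12-bit (and large 10-bit) paths therefore fold their dword partial sums into qword lanes with punpckldq/punpckhdq plus paddq and return the total in rax instead of eax. The fold itself, as an SSE2-level sketch (the AVX2 variants add a vextracti128 reduction at the end):

    #include <emmintrin.h>

    // Zero-extend four 32-bit partial sums to 64 bits and add them into a
    // pair of 64-bit lane accumulators, mirroring the punpck/paddq sequence.
    static inline __m128i fold_dwords_to_qwords(__m128i acc64, __m128i sum32)
    {
        const __m128i zero = _mm_setzero_si128();
        __m128i lo = _mm_unpacklo_epi32(sum32, zero);        // punpckldq
        __m128i hi = _mm_unpackhi_epi32(sum32, zero);        // punpckhdq
        return _mm_add_epi64(_mm_add_epi64(acc64, lo), hi);  // paddq x2
    }
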
x265_1.8.tar.gz/source/common/x86/x86util.asm -> x265_1.9.tar.gz/source/common/x86/x86util.asm Changed
9
 
1
@@ -5,6 +5,7 @@
2
 ;*
3
 ;* Authors: Holger Lubitz <holger@lubitz.org>
4
 ;*          Loren Merritt <lorenm@u.washington.edu>
5
+;*          Min Chen <chenm003@163.com>
6
 ;*
7
 ;* This program is free software; you can redistribute it and/or modify
8
 ;* it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/yuv.cpp -> x265_1.9.tar.gz/source/common/yuv.cpp Changed
123
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2015 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -50,7 +51,7 @@
10
     {
11
         CHECKED_MALLOC(m_buf[0], pixel, size * size + 8);
12
         m_buf[1] = m_buf[2] = 0;
13
-        m_csize = MAX_INT;
14
+        m_csize = 0;
15
         return true;
16
     }
17
     else
18
@@ -82,22 +83,26 @@
19
 {
20
     pixel* dstY = dstPic.getLumaAddr(cuAddr, absPartIdx);
21
     primitives.cu[m_part].copy_pp(dstY, dstPic.m_stride, m_buf[0], m_size);
22
-
23
-    pixel* dstU = dstPic.getCbAddr(cuAddr, absPartIdx);
24
-    pixel* dstV = dstPic.getCrAddr(cuAddr, absPartIdx);
25
-    primitives.chroma[m_csp].cu[m_part].copy_pp(dstU, dstPic.m_strideC, m_buf[1], m_csize);
26
-    primitives.chroma[m_csp].cu[m_part].copy_pp(dstV, dstPic.m_strideC, m_buf[2], m_csize);
27
+    if (m_csp != X265_CSP_I400)
28
+    {
29
+        pixel* dstU = dstPic.getCbAddr(cuAddr, absPartIdx);
30
+        pixel* dstV = dstPic.getCrAddr(cuAddr, absPartIdx);
31
+        primitives.chroma[m_csp].cu[m_part].copy_pp(dstU, dstPic.m_strideC, m_buf[1], m_csize);
32
+        primitives.chroma[m_csp].cu[m_part].copy_pp(dstV, dstPic.m_strideC, m_buf[2], m_csize);
33
+    }
34
 }
35
 
36
 void Yuv::copyFromPicYuv(const PicYuv& srcPic, uint32_t cuAddr, uint32_t absPartIdx)
37
 {
38
     const pixel* srcY = srcPic.getLumaAddr(cuAddr, absPartIdx);
39
     primitives.cu[m_part].copy_pp(m_buf[0], m_size, srcY, srcPic.m_stride);
40
-
41
-    const pixel* srcU = srcPic.getCbAddr(cuAddr, absPartIdx);
42
-    const pixel* srcV = srcPic.getCrAddr(cuAddr, absPartIdx);
43
-    primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcU, srcPic.m_strideC);
44
-    primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcV, srcPic.m_strideC);
45
+    if (m_csp != X265_CSP_I400)
46
+    {
47
+        const pixel* srcU = srcPic.getCbAddr(cuAddr, absPartIdx);
48
+        const pixel* srcV = srcPic.getCrAddr(cuAddr, absPartIdx);
49
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcU, srcPic.m_strideC);
50
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcV, srcPic.m_strideC);
51
+    }
52
 }
53
 
54
 void Yuv::copyFromYuv(const Yuv& srcYuv)
55
@@ -105,8 +110,11 @@
56
     X265_CHECK(m_size >= srcYuv.m_size, "invalid size\n");
57
 
58
     primitives.cu[m_part].copy_pp(m_buf[0], m_size, srcYuv.m_buf[0], srcYuv.m_size);
59
-    primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcYuv.m_buf[1], srcYuv.m_csize);
60
-    primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcYuv.m_buf[2], srcYuv.m_csize);
61
+    if (m_csp != X265_CSP_I400)
62
+    {
63
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcYuv.m_buf[1], srcYuv.m_csize);
64
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcYuv.m_buf[2], srcYuv.m_csize);
65
+    }
66
 }
67
 
68
 /* This version is intended for use by ME, which required FENC_STRIDE for luma fenc pixels */
69
@@ -130,11 +138,13 @@
70
 {
71
     pixel* dstY = dstYuv.getLumaAddr(absPartIdx);
72
     primitives.cu[m_part].copy_pp(dstY, dstYuv.m_size, m_buf[0], m_size);
73
-
74
-    pixel* dstU = dstYuv.getCbAddr(absPartIdx);
75
-    pixel* dstV = dstYuv.getCrAddr(absPartIdx);
76
-    primitives.chroma[m_csp].cu[m_part].copy_pp(dstU, dstYuv.m_csize, m_buf[1], m_csize);
77
-    primitives.chroma[m_csp].cu[m_part].copy_pp(dstV, dstYuv.m_csize, m_buf[2], m_csize);
78
+    if (m_csp != X265_CSP_I400)
79
+    {
80
+        pixel* dstU = dstYuv.getCbAddr(absPartIdx);
81
+        pixel* dstV = dstYuv.getCrAddr(absPartIdx);
82
+        primitives.chroma[m_csp].cu[m_part].copy_pp(dstU, dstYuv.m_csize, m_buf[1], m_csize);
83
+        primitives.chroma[m_csp].cu[m_part].copy_pp(dstV, dstYuv.m_csize, m_buf[2], m_csize);
84
+    }
85
 }
86
 
87
 void Yuv::copyPartToYuv(Yuv& dstYuv, uint32_t absPartIdx) const
88
@@ -142,20 +152,25 @@
89
     pixel* srcY = m_buf[0] + getAddrOffset(absPartIdx, m_size);
90
     pixel* dstY = dstYuv.m_buf[0];
91
     primitives.cu[dstYuv.m_part].copy_pp(dstY, dstYuv.m_size, srcY, m_size);
92
-
93
-    pixel* srcU = m_buf[1] + getChromaAddrOffset(absPartIdx);
94
-    pixel* srcV = m_buf[2] + getChromaAddrOffset(absPartIdx);
95
-    pixel* dstU = dstYuv.m_buf[1];
96
-    pixel* dstV = dstYuv.m_buf[2];
97
-    primitives.chroma[m_csp].cu[dstYuv.m_part].copy_pp(dstU, dstYuv.m_csize, srcU, m_csize);
98
-    primitives.chroma[m_csp].cu[dstYuv.m_part].copy_pp(dstV, dstYuv.m_csize, srcV, m_csize);
99
+    if (m_csp != X265_CSP_I400)
100
+    {
101
+        pixel* srcU = m_buf[1] + getChromaAddrOffset(absPartIdx);
102
+        pixel* srcV = m_buf[2] + getChromaAddrOffset(absPartIdx);
103
+        pixel* dstU = dstYuv.m_buf[1];
104
+        pixel* dstV = dstYuv.m_buf[2];
105
+        primitives.chroma[m_csp].cu[dstYuv.m_part].copy_pp(dstU, dstYuv.m_csize, srcU, m_csize);
106
+        primitives.chroma[m_csp].cu[dstYuv.m_part].copy_pp(dstV, dstYuv.m_csize, srcV, m_csize);
107
+    }
108
 }
109
 
110
 void Yuv::addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL)
111
 {
112
     primitives.cu[log2SizeL - 2].add_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
113
-    primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
114
-    primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
115
+    if (m_csp != X265_CSP_I400)
116
+    {
117
+        primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
118
+        primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
119
+    }
120
 }
121
 
122
 void Yuv::addAvg(const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t absPartIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma)
123
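
The yuv.cpp hunks above belong to the 4:0:0 (monochrome) support added in 1.9: for X265_CSP_I400 only the luma buffer is allocated, m_buf[1] and m_buf[2] stay null, and m_csize is now initialized to 0 rather than the MAX_INT poison value, so every chroma copy/add is guarded on the color space before dereferencing those pointers. The guard, condensed to its shape (the X265_CSP_I400 == 0 value is restated here from x265.h as an assumption):

    static const int X265_CSP_I400 = 0;   // monochrome color space id

    // Chroma work is skipped entirely for 4:0:0 input; this predicate is
    // what keeps the copy_pp/add_ps paths off the null chroma buffers.
    inline bool hasChroma(int csp) { return csp != X265_CSP_I400; }
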
x265_1.8.tar.gz/source/encoder/analysis.cpp -> x265_1.9.tar.gz/source/encoder/analysis.cpp Changed
1638
 
1
@@ -3,6 +3,7 @@
2
 *
3
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
4
 *          Steve Borho <steve@borho.org>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -71,12 +72,11 @@
10
 
11
 Analysis::Analysis()
12
 {
13
-    m_reuseIntraDataCTU = NULL;
14
     m_reuseInterDataCTU = NULL;
15
     m_reuseRef = NULL;
16
     m_reuseBestMergeCand = NULL;
17
+    m_reuseMv = NULL;
18
 }
19
-
20
 bool Analysis::create(ThreadLocalData *tld)
21
 {
22
     m_tld = tld;
23
@@ -127,9 +127,6 @@
24
     m_frame = &frame;
25
 
26
 #if _DEBUG || CHECKED_BUILD
27
-    for (uint32_t i = 0; i <= g_maxCUDepth; i++)
28
-        for (uint32_t j = 0; j < MAX_PRED_TYPES; j++)
29
-            m_modeDepth[i].pred[j].invalidate();
30
     invalidateContexts(0);
31
 #endif
32
 
33
@@ -140,40 +137,46 @@
34
     m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);
35
 
36
     uint32_t numPartition = ctu.m_numPartitions;
37
-    if (m_param->analysisMode)
38
+    if (m_param->analysisMode && m_slice->m_sliceType != I_SLICE)
39
     {
40
-        if (m_slice->m_sliceType == I_SLICE)
41
-            m_reuseIntraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
42
-        else
43
-        {
44
-            int numPredDir = m_slice->isInterP() ? 1 : 2;
45
-            m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
46
-            m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
47
-            m_reuseBestMergeCand = &m_reuseInterDataCTU->bestMergeCand[ctu.m_cuAddr * CUGeom::MAX_GEOMS];
48
-        }
49
+        int numPredDir = m_slice->isInterP() ? 1 : 2;
50
+        m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
51
+        m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
52
+        m_reuseBestMergeCand = &m_reuseInterDataCTU->bestMergeCand[ctu.m_cuAddr * CUGeom::MAX_GEOMS];
53
+        m_reuseMv = &m_reuseInterDataCTU->mv[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
54
     }
55
-
56
     ProfileCUScope(ctu, totalCTUTime, totalCTUs);
57
 
58
-    uint32_t zOrder = 0;
59
     if (m_slice->m_sliceType == I_SLICE)
60
     {
61
-        compressIntraCU(ctu, cuGeom, zOrder, qp);
62
-        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.intraData)
63
+        analysis_intra_data* intraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
64
+        if (m_param->analysisMode == X265_ANALYSIS_LOAD)
65
+        {
66
+            memcpy(ctu.m_cuDepth, &intraDataCTU->depth[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
67
+            memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
68
+            memcpy(ctu.m_partSize, &intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], sizeof(char) * numPartition);
69
+            memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
70
+        }
71
+        compressIntraCU(ctu, cuGeom, qp);
72
+        if (m_param->analysisMode == X265_ANALYSIS_SAVE && intraDataCTU)
73
         {
74
             CUData* bestCU = &m_modeDepth[0].bestMode->cu;
75
-            memcpy(&m_reuseIntraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
76
-            memcpy(&m_reuseIntraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
77
-            memcpy(&m_reuseIntraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
78
-            memcpy(&m_reuseIntraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], bestCU->m_chromaIntraDir, sizeof(uint8_t) * numPartition);
79
+            memcpy(&intraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
80
+            memcpy(&intraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
81
+            memcpy(&intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
82
+            memcpy(&intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], bestCU->m_chromaIntraDir, sizeof(uint8_t) * numPartition);
83
         }
84
     }
85
     else
86
     {
87
-        if (!m_param->rdLevel)
88
+        if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
89
+            ctu.m_cuPelX / g_maxCUSize >= frame.m_encData->m_pir.pirStartCol
90
+            && ctu.m_cuPelX / g_maxCUSize < frame.m_encData->m_pir.pirEndCol)
91
+            compressIntraCU(ctu, cuGeom, qp);
92
+        else if (!m_param->rdLevel)
93
         {
94
             /* In RD Level 0/1, copy source pixels into the reconstructed block so
95
-            * they are available for intra predictions */
96
+             * they are available for intra predictions */
97
             m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);
98
 
99
             compressInterCU_rd0_4(ctu, cuGeom, qp);
100
@@ -187,6 +190,7 @@
101
             compressInterCU_rd0_4(ctu, cuGeom, qp);
102
         else
103
         {
104
+            uint32_t zOrder = 0;
105
             compressInterCU_rd5_6(ctu, cuGeom, zOrder, qp);
106
             if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.interData)
107
             {
108
@@ -212,8 +216,7 @@
109
         md.pred[PRED_LOSSLESS].initCosts();
110
         md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
111
         PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
112
-        uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir;
113
-        checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes, NULL);
114
+        checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size);
115
         checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
116
     }
117
     else
118
@@ -226,7 +229,7 @@
119
     }
120
 }
121
 
122
-void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t& zOrder, int32_t qp)
123
+void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
124
 {
125
     uint32_t depth = cuGeom.depth;
126
     ModeDepth& md = m_modeDepth[depth];
127
@@ -235,42 +238,37 @@
128
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
129
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
130
 
131
-    if (m_param->analysisMode == X265_ANALYSIS_LOAD)
132
-    {
133
-        uint8_t* reuseDepth  = &m_reuseIntraDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
134
-        uint8_t* reuseModes  = &m_reuseIntraDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
135
-        char* reusePartSizes = &m_reuseIntraDataCTU->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
136
-        uint8_t* reuseChromaModes = &m_reuseIntraDataCTU->chromaModes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
137
+    bool bAlreadyDecided = parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX;
138
+    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
139
 
140
-        if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.absPartIdx)
141
+    if (bAlreadyDecided)
142
+    {
143
+        if (bDecidedDepth)
144
         {
145
-            PartSize size = (PartSize)reusePartSizes[zOrder];
146
-            Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
147
+            Mode& mode = md.pred[0];
148
+            md.bestMode = &mode;
149
             mode.cu.initSubCU(parentCTU, cuGeom, qp);
150
-            checkIntra(mode, cuGeom, size, &reuseModes[zOrder], &reuseChromaModes[zOrder]);
151
-            checkBestMode(mode, depth);
152
+            memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
153
+            memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
154
+            checkIntra(mode, cuGeom, (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx]);
155
 
156
             if (m_bTryLossless)
157
                 tryLossless(cuGeom);
158
 
159
             if (mightSplit)
160
                 addSplitFlagCost(*md.bestMode, cuGeom.depth);
161
-
162
-            // increment zOrder offset to point to next best depth in sharedDepth buffer
163
-            zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]];
164
-            mightSplit = false;
165
         }
166
     }
167
-    else if (mightNotSplit)
168
+    else if (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && mightNotSplit)
169
     {
170
         md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
171
-        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL, NULL);
172
+        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
173
         checkBestMode(md.pred[PRED_INTRA], depth);
174
 
175
         if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
176
         {
177
             md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
178
-            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL, NULL);
179
+            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
180
             checkBestMode(md.pred[PRED_INTRA_NxN], depth);
181
         }
182
 
183
@@ -281,6 +279,9 @@
184
             addSplitFlagCost(*md.bestMode, cuGeom.depth);
185
     }
186
 
187
+    // stop recursion if we reach the depth of previous analysis decision
188
+    mightSplit &= !(bAlreadyDecided && bDecidedDepth);
189
+
190
     if (mightSplit)
191
     {
192
         Mode* splitPred = &md.pred[PRED_SPLIT];
193
@@ -305,7 +306,7 @@
194
                 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
195
                     nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
196
 
197
-                compressIntraCU(parentCTU, childGeom, zOrder, nextQP);
198
+                compressIntraCU(parentCTU, childGeom, nextQP);
199
 
200
                 // Save best CU and pred data for this sub CU
201
                 splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
202
@@ -317,7 +318,10 @@
203
             {
204
                 /* record the depth of this non-present sub-CU */
205
                 splitCU->setEmptyPart(childGeom, subPartIdx);
206
-                zOrder += g_depthInc[g_maxCUDepth - 1][nextDepth];
207
+
208
+                /* Set depth of non-present CU to 0 to ensure that correct CU is fetched as reference to code deltaQP */
209
+                if (bAlreadyDecided)
210
+                    memset(parentCTU.m_cuDepth + childGeom.absPartIdx, 0, childGeom.numPartitions);
211
             }
212
         }
213
         nextContext->store(splitPred->contexts);
214
@@ -394,32 +398,52 @@
215
                 break;
216
 
217
             case PRED_2Nx2N:
218
+                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3];
219
+
220
                 slave.checkInter_rd0_4(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
221
                 if (m_slice->m_sliceType == B_SLICE)
222
                     slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
223
                 break;
224
 
225
             case PRED_Nx2N:
226
+                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* left */
227
+                refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* right */
228
+
229
                 slave.checkInter_rd0_4(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
230
                 break;
231
 
232
             case PRED_2NxN:
233
+                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* top */
234
+                refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* bot */
235
+
236
                 slave.checkInter_rd0_4(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
237
                 break;
238
 
239
             case PRED_2NxnU:
240
+                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* 25% top */
241
+                refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% bot */
242
+
243
                 slave.checkInter_rd0_4(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
244
                 break;
245
 
246
             case PRED_2NxnD:
247
+                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% top */
248
+                refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* 25% bot */
249
+
250
                 slave.checkInter_rd0_4(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
251
                 break;
252
 
253
             case PRED_nLx2N:
254
+                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* 25% left */
255
+                refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% right */
256
+
257
                 slave.checkInter_rd0_4(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
258
                 break;
259
 
260
             case PRED_nRx2N:
261
+                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% left */
262
+                refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* 25% right */
263
+
264
                 slave.checkInter_rd0_4(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
265
                 break;
266
 
267
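
Each PMODE worker now seeds its motion search with reference masks derived from the four sub-CU results: a rectangular or asymmetric partition ORs together the splitRefs of the quadrants it overlaps, the two quadrants on its own side for a half or 25% part, and all four for a 75% part. A sketch under the assumption that each mask bit flags one reference index (quadrants numbered 0 1 / 2 3, as in the comments above):

    #include <cstdint>

    struct PartMasks { uint32_t m[2]; };   // one mask per prediction part

    PartMasks masksForNx2N(const uint32_t q[4])    // vertical halves
    {
        return { { q[0] | q[2] /* left */, q[1] | q[3] /* right */ } };
    }

    PartMasks masksFor2NxnU(const uint32_t q[4])   // 25% top / 75% bottom
    {
        return { { q[0] | q[1], q[0] | q[1] | q[2] | q[3] } };
    }
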
@@ -433,12 +457,14 @@
268
             switch (pmode.modes[task])
269
             {
270
             case PRED_INTRA:
271
-                slave.checkIntra(md.pred[PRED_INTRA], pmode.cuGeom, SIZE_2Nx2N, NULL, NULL);
272
+                slave.checkIntra(md.pred[PRED_INTRA], pmode.cuGeom, SIZE_2Nx2N);
273
                 if (pmode.cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
274
-                    slave.checkIntra(md.pred[PRED_INTRA_NxN], pmode.cuGeom, SIZE_NxN, NULL, NULL);
275
+                    slave.checkIntra(md.pred[PRED_INTRA_NxN], pmode.cuGeom, SIZE_NxN);
276
                 break;
277
 
278
             case PRED_2Nx2N:
279
+                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3];
280
+
281
                 slave.checkInter_rd5_6(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, refMasks);
282
                 md.pred[PRED_BIDIR].rdCost = MAX_INT64;
283
                 if (m_slice->m_sliceType == B_SLICE)
284
@@ -450,26 +476,42 @@
285
                 break;
286
 
287
             case PRED_Nx2N:
288
+                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* left */
289
+                refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* right */
290
+
291
                 slave.checkInter_rd5_6(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, refMasks);
292
                 break;
293
 
294
             case PRED_2NxN:
295
+                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* top */
296
+                refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* bot */
297
+
298
                 slave.checkInter_rd5_6(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, refMasks);
299
                 break;
300
 
301
             case PRED_2NxnU:
302
+                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1]; /* 25% top */
303
+                refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% bot */
304
+
305
                 slave.checkInter_rd5_6(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, refMasks);
306
                 break;
307
 
308
             case PRED_2NxnD:
309
+                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% top */
310
+                refMasks[1] = m_splitRefIdx[2] | m_splitRefIdx[3]; /* 25% bot */
311
                 slave.checkInter_rd5_6(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, refMasks);
312
                 break;
313
 
314
             case PRED_nLx2N:
315
+                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[2]; /* 25% left */
316
+                refMasks[1] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% right */
317
+
318
                 slave.checkInter_rd5_6(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, refMasks);
319
                 break;
320
 
321
             case PRED_nRx2N:
322
+                refMasks[0] = m_splitRefIdx[0] | m_splitRefIdx[1] | m_splitRefIdx[2] | m_splitRefIdx[3]; /* 75% left */
323
+                refMasks[1] = m_splitRefIdx[1] | m_splitRefIdx[3]; /* 25% right */
324
                 slave.checkInter_rd5_6(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, refMasks);
325
                 break;
326
 
327
@@ -488,7 +530,7 @@
328
     while (task >= 0);
329
 }
330
 
331
-void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
332
+uint32_t Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
333
 {
334
     uint32_t depth = cuGeom.depth;
335
     uint32_t cuAddr = parentCTU.m_cuAddr;
336
@@ -498,19 +540,89 @@
337
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
338
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
339
     uint32_t minDepth = m_param->rdLevel <= 4 ? topSkipMinDepth(parentCTU, cuGeom) : 0;
340
+    uint32_t splitRefs[4] = { 0, 0, 0, 0 };
341
 
342
     X265_CHECK(m_param->rdLevel >= 2, "compressInterCU_dist does not support RD 0 or 1\n");
343
 
344
+    PMODE pmode(*this, cuGeom);
345
+
346
     if (mightNotSplit && depth >= minDepth)
347
     {
348
-        int bTryAmp = m_slice->m_sps->maxAMPDepth > depth;
349
-        int bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;
350
-
351
-        PMODE pmode(*this, cuGeom);
352
-
353
         /* Initialize all prediction CUs based on parentCTU */
354
         md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
355
         md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
356
+
357
+        if (m_param->rdLevel <= 4)
358
+            checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
359
+        else
360
+            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom, false);
361
+    }
362
+
363
+    bool bNoSplit = false;
364
+    bool splitIntra = true;
365
+    if (md.bestMode)
366
+    {
367
+        bNoSplit = md.bestMode->cu.isSkipped(0);
368
+        if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
369
+            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
370
+    }
371
+
372
+    if (mightSplit && !bNoSplit)
373
+    {
374
+        Mode* splitPred = &md.pred[PRED_SPLIT];
375
+        splitPred->initCosts();
376
+        CUData* splitCU = &splitPred->cu;
377
+        splitCU->initSubCU(parentCTU, cuGeom, qp);
378
+
379
+        uint32_t nextDepth = depth + 1;
380
+        ModeDepth& nd = m_modeDepth[nextDepth];
381
+        invalidateContexts(nextDepth);
382
+        Entropy* nextContext = &m_rqt[depth].cur;
383
+        int nextQP = qp;
384
+        splitIntra = false;
385
+
386
+        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
387
+        {
388
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
389
+            if (childGeom.flags & CUGeom::PRESENT)
390
+            {
391
+                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
392
+                m_rqt[nextDepth].cur.load(*nextContext);
393
+
394
+                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
395
+                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
396
+
397
+                splitRefs[subPartIdx] = compressInterCU_dist(parentCTU, childGeom, nextQP);
398
+
399
+                // Save best CU and pred data for this sub CU
400
+                splitIntra |= nd.bestMode->cu.isIntra(0);
401
+                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
402
+                splitPred->addSubCosts(*nd.bestMode);
403
+
404
+                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
405
+                nextContext = &nd.bestMode->contexts;
406
+            }
407
+            else
408
+                splitCU->setEmptyPart(childGeom, subPartIdx);
409
+        }
410
+        nextContext->store(splitPred->contexts);
411
+
412
+        if (mightNotSplit)
413
+            addSplitFlagCost(*splitPred, cuGeom.depth);
414
+        else
415
+            updateModeCost(*splitPred);
416
+
417
+        checkDQPForSplitPred(*splitPred, cuGeom);
418
+    }
419
+
420
+    if (mightNotSplit && depth >= minDepth)
421
+    {
422
+        int bTryAmp = m_slice->m_sps->maxAMPDepth > depth;
423
+        int bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && (!m_param->limitReferences || splitIntra) && (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE);
424
+
425
+        if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
426
+            setLambdaFromQP(parentCTU, qp);
427
+
428
         if (bTryIntra)
429
         {
430
             md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
431
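
compressInterCU_dist is reorganized around the same idea: the split recursion now runs before the mode analysis at the current depth, so the children's reference masks (and intra usage, via splitIntra) can gate what the parent tries, and the function returns a mask to its own caller. A heavily simplified sketch of the new control flow, not the real signatures; the real code returns the references of whichever mode won, as the later hunks show:

    #include <cstdint>

    // 'maxDepth' and the mode-analysis body are placeholders.
    uint32_t compressDistSketch(int depth, int maxDepth)
    {
        uint32_t splitRefs[4] = { 0, 0, 0, 0 };
        if (depth < maxDepth)                   // mightSplit
            for (int i = 0; i < 4; i++)         // quadrants 0 1 / 2 3
                splitRefs[i] = compressDistSketch(depth + 1, maxDepth);

        // ... evaluate merge/2Nx2N/rect/AMP/intra here, seeding each
        // prediction's reference masks from splitRefs ...

        return splitRefs[0] | splitRefs[1] | splitRefs[2] | splitRefs[3];
    }
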
@@ -533,6 +645,8 @@
432
             md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_nRx2N;
433
         }
434
 
435
+        m_splitRefIdx[0] = splitRefs[0]; m_splitRefIdx[1] = splitRefs[1]; m_splitRefIdx[2] = splitRefs[2]; m_splitRefIdx[3] = splitRefs[3];
436
+
437
         pmode.tryBondPeers(*m_frame->m_encData->m_jobProvider, pmode.m_jobTotal);
438
 
439
         /* participate in processing jobs, until all are distributed */
440
@@ -544,8 +658,6 @@
441
 
442
         if (m_param->rdLevel <= 4)
443
         {
444
-            checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
445
-
446
             {
447
                 ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
448
                 pmode.waitForExit();
449
@@ -577,7 +689,7 @@
450
             if (m_param->rdLevel > 2)
451
             {
452
                 /* RD selection between merge, inter, bidir and intra */
453
-                if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
454
+                if (!m_bChromaSa8d && (m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
455
                 {
456
                     uint32_t numPU = bestInter->cu.getNumPartInter(0);
457
                     for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
458
@@ -628,14 +740,13 @@
459
         }
460
         else
461
         {
462
-            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom, false);
463
             {
464
                 ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
465
                 pmode.waitForExit();
466
             }
467
 
468
             checkBestMode(md.pred[PRED_2Nx2N], depth);
469
-            if (m_slice->m_sliceType == B_SLICE)
470
+            if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
471
                 checkBestMode(md.pred[PRED_BIDIR], depth);
472
 
473
             if (m_param->bEnableRectInter)
474
@@ -660,14 +771,6 @@
475
             }
476
         }
477
 
478
-        if (md.bestMode->rdCost == MAX_INT64 && !bTryIntra)
479
-        {
480
-            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
481
-            checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
482
-            encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
483
-            checkBestMode(md.pred[PRED_INTRA], depth);
484
-        }
485
-
486
         if (m_bTryLossless)
487
             tryLossless(cuGeom);
488
 
489
@@ -675,59 +778,24 @@
490
             addSplitFlagCost(*md.bestMode, cuGeom.depth);
491
     }
492
 
493
-    bool bNoSplit = false;
494
-    if (md.bestMode)
495
-    {
496
-        bNoSplit = md.bestMode->cu.isSkipped(0);
497
-        if (mightSplit && depth && depth >= minDepth && !bNoSplit && m_param->rdLevel <= 4)
498
-            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
499
-    }
500
-
501
+    /* compare split RD cost against best cost */
502
     if (mightSplit && !bNoSplit)
503
-    {
504
-        Mode* splitPred = &md.pred[PRED_SPLIT];
505
-        splitPred->initCosts();
506
-        CUData* splitCU = &splitPred->cu;
507
-        splitCU->initSubCU(parentCTU, cuGeom, qp);
508
-
509
-        uint32_t nextDepth = depth + 1;
510
-        ModeDepth& nd = m_modeDepth[nextDepth];
511
-        invalidateContexts(nextDepth);
512
-        Entropy* nextContext = &m_rqt[depth].cur;
513
-        int nextQP = qp;
514
-
515
-        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
516
-        {
517
-            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
518
-            if (childGeom.flags & CUGeom::PRESENT)
519
-            {
520
-                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
521
-                m_rqt[nextDepth].cur.load(*nextContext);
522
-
523
-                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
524
-                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
525
-
526
-                compressInterCU_dist(parentCTU, childGeom, nextQP);
527
-
528
-                // Save best CU and pred data for this sub CU
529
-                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
530
-                splitPred->addSubCosts(*nd.bestMode);
531
-
532
-                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
533
-                nextContext = &nd.bestMode->contexts;
534
-            }
535
-            else
536
-                splitCU->setEmptyPart(childGeom, subPartIdx);
537
-        }
538
-        nextContext->store(splitPred->contexts);
539
-
540
-        if (mightNotSplit)
541
-            addSplitFlagCost(*splitPred, cuGeom.depth);
542
-        else
543
-            updateModeCost(*splitPred);
544
+        checkBestMode(md.pred[PRED_SPLIT], depth);
545
 
546
-        checkDQPForSplitPred(*splitPred, cuGeom);
547
-        checkBestMode(*splitPred, depth);
548
+    /* determine which motion references the parent CU should search */
549
+    uint32_t refMask;
550
+    if (!(m_param->limitReferences & X265_REF_LIMIT_DEPTH))
551
+        refMask = 0;
552
+    else if (md.bestMode == &md.pred[PRED_SPLIT])
553
+        refMask = splitRefs[0] | splitRefs[1] | splitRefs[2] | splitRefs[3];
554
+    else
555
+    {
556
+        /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
557
+        CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
558
+        uint32_t numPU = cu.getNumPartInter(0);
559
+        refMask = 0;
560
+        for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
561
+            refMask |= cu.getBestRefIdx(subPartIdx);
562
     }
563
 
564
     if (mightNotSplit)
565
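
The returned mask is chosen in three steps: zero when X265_REF_LIMIT_DEPTH is off (no limiting), the OR of all child masks when the split won, and otherwise the OR of the best reference index of each PU of the winning CU (or of the 2Nx2N CU when intra won). A compact restatement with placeholder accessors:

    #include <cstdint>

    // 'puBestRef' stands in for CUData::getBestRefIdx over each PU.
    uint32_t chooseRefMask(bool limitByDepth, bool bestIsSplit,
                           const uint32_t splitRefs[4],
                           const uint32_t* puBestRef, uint32_t numPU)
    {
        if (!limitByDepth)
            return 0;        // caller searches every reference
        if (bestIsSplit)
            return splitRefs[0] | splitRefs[1] | splitRefs[2] | splitRefs[3];
        uint32_t mask = 0;
        for (uint32_t pu = 0; pu < numPU; pu++)
            mask |= puBestRef[pu];
        return mask;
    }
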
@@ -742,23 +810,40 @@
566
 
567
     /* Copy best data to encData CTU and recon */
568
     md.bestMode->cu.copyToPic(depth);
569
-    if (md.bestMode != &md.pred[PRED_SPLIT])
570
-        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.absPartIdx);
571
+    md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.absPartIdx);
572
+
573
+    return refMask;
574
 }
575
 
576
-uint32_t Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
577
+SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
578
 {
579
     uint32_t depth = cuGeom.depth;
580
     uint32_t cuAddr = parentCTU.m_cuAddr;
581
     ModeDepth& md = m_modeDepth[depth];
582
     md.bestMode = NULL;
583
 
584
+    PicYuv& reconPic = *m_frame->m_reconPic;
585
+
586
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
587
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
588
     uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);
589
     bool earlyskip = false;
590
     bool splitIntra = true;
591
-    uint32_t splitRefs[4] = { 0, 0, 0, 0 };
592
+
593
+    SplitData splitData[4];
594
+    splitData[0].initSplitCUData();
595
+    splitData[1].initSplitCUData();
596
+    splitData[2].initSplitCUData();
597
+    splitData[3].initSplitCUData();
598
+
599
+    // avoid reading uninitialized values in the references below
600
+    if (m_param->limitModes)
601
+    {
602
+        md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0
603
+        md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1
604
+        md.pred[PRED_2Nx2N].sa8dCost = 0;
605
+    }
606
+
607
     /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */
608
     if (mightNotSplit && depth >= minDepth)
609
     {
610
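
compressInterCU_rd0_4 now returns a SplitData record instead of a bare bitmask, bundling the reference mask with the 2Nx2N MV costs and SA8D cost that the parent's limit-modes thresholds need; when --limit-modes is active, the 2Nx2N fields are zero-seeded first so the parent never reads uninitialized values. A plausible shape for the record, inferred only from the fields this patch uses (the real definition lives elsewhere in the x265 tree and may differ):

    #include <cstdint>

    struct SplitDataSketch
    {
        uint32_t splitRefs;   // OR of reference indices used by sub-CUs
        uint32_t mvCost[2];   // 2Nx2N MV cost, list 0 and list 1
        uint64_t sa8dCost;    // 2Nx2N SA8D (rd5/6: RD) cost of the sub-CU

        void initSplitCUData()
        {
            splitRefs = 0;
            mvCost[0] = mvCost[1] = 0;
            sa8dCost = 0;
        }
    };
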
@@ -804,7 +889,7 @@
611
                 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
612
                     nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
613
 
614
-                splitRefs[subPartIdx] = compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
615
+                splitData[subPartIdx] = compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
616
 
617
                 // Save best CU and pred data for this sub CU
618
                 splitIntra |= nd.bestMode->cu.isIntra(0);
619
@@ -834,7 +919,7 @@
620
     /* Split CUs
621
      *   0  1
622
      *   2  3 */
623
-    uint32_t allSplitRefs = splitRefs[0] | splitRefs[1] | splitRefs[2] | splitRefs[3];
624
+    uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
625
     /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
626
     if (mightNotSplit && depth >= minDepth)
627
     {
628
@@ -852,7 +937,7 @@
629
             {
630
                 CUData& cu = md.pred[PRED_2Nx2N].cu;
631
                 uint32_t refMask = cu.getBestRefIdx(0);
632
-                allSplitRefs = splitRefs[0] = splitRefs[1] = splitRefs[2] = splitRefs[3] = refMask;
633
+                allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask;
634
             }
635
 
636
             if (m_slice->m_sliceType == B_SLICE)
637
@@ -864,23 +949,80 @@
638
             Mode *bestInter = &md.pred[PRED_2Nx2N];
639
             if (m_param->bEnableRectInter)
640
             {
641
-                refMasks[0] = splitRefs[0] | splitRefs[2]; /* left */
642
-                refMasks[1] = splitRefs[1] | splitRefs[3]; /* right */
643
-                md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
644
-                checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
645
-                if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
646
-                    bestInter = &md.pred[PRED_Nx2N];
647
+                uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
648
+                uint32_t threshold_2NxN, threshold_Nx2N;
649
 
650
-                refMasks[0] = splitRefs[0] | splitRefs[1]; /* top */
651
-                refMasks[1] = splitRefs[2] | splitRefs[3]; /* bot */
652
-                md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
653
-                checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
654
-                if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
655
-                    bestInter = &md.pred[PRED_2NxN];
656
+                if (m_slice->m_sliceType == P_SLICE)
657
+                {
658
+                    threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
659
+                    threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
660
+                }
661
+                else
662
+                {
663
+                    threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0] 
664
+                                    + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
665
+                    threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0] 
666
+                                    + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
667
+                }
668
+
669
+                int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
670
+                if (try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
671
+                {
672
+                    refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
673
+                    refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
674
+                    md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
675
+                    checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
676
+                    if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
677
+                        bestInter = &md.pred[PRED_2NxN];
678
+                }
679
+
680
+                if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_Nx2N)
681
+                {
682
+                    refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
683
+                    refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
684
+                    md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
685
+                    checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
686
+                    if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
687
+                        bestInter = &md.pred[PRED_Nx2N];
688
+                }
689
+
690
+                if (!try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
691
+                {
692
+                    refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
693
+                    refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
694
+                    md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
695
+                    checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
696
+                    if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
697
+                        bestInter = &md.pred[PRED_2NxN];
698
+                }
699
             }
700
 
701
             if (m_slice->m_sps->maxAMPDepth > depth)
702
             {
703
+                uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
704
+                uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;
705
+
706
+                if (m_slice->m_sliceType == P_SLICE)
707
+                {
708
+                    threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
709
+                    threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
710
+
711
+                    threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
712
+                    threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
713
+                }
714
+                else
715
+                {
716
+                    threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0] 
717
+                                       + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
718
+                    threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0] 
719
+                                       + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
720
+
721
+                    threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0] 
722
+                                       + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
723
+                    threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0] 
724
+                                       + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
725
+                }
726
+
727
                 bool bHor = false, bVer = false;
728
                 if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
729
                     bHor = true;
730
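
The limit-modes gate compares costs that are already known: a rectangular (and, below, asymmetric) partition is only searched when the summed sub-CU cost undercuts the 2Nx2N cost by less than a threshold built from the children's MV costs, the top pair for 2NxN and the left pair for Nx2N, using L0 alone on P slices and the rounded L0/L1 average on B slices, with the cheaper direction tried first. A sketch of the threshold arithmetic:

    #include <cstdint>

    // mv0/mv1: per-quadrant 2Nx2N MV costs for list 0 and list 1.
    uint32_t threshold2NxN(const uint32_t mv0[4], const uint32_t mv1[4],
                           bool isPSlice)
    {
        if (isPSlice)
            return mv0[0] + mv0[1];                            // top pair, L0
        return (mv0[0] + mv0[1] + mv1[0] + mv1[1] + 1) >> 1;   // round(avg)
    }

    // Gate (illustrative): try SIZE_2NxN only when
    //     splitCost < cost2Nx2N + threshold2NxN(mv0, mv1, isP)
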
@@ -895,42 +1037,76 @@
731
 
732
                 if (bHor)
733
                 {
734
-                    refMasks[0] = splitRefs[0] | splitRefs[1]; /* 25% top */
735
-                    refMasks[1] = allSplitRefs;                /* 75% bot */
736
-                    md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
737
-                    checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
738
-                    if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
739
-                        bestInter = &md.pred[PRED_2NxnU];
740
-
741
-                    refMasks[0] = allSplitRefs;                /* 75% top */
742
-                    refMasks[1] = splitRefs[2] | splitRefs[3]; /* 25% bot */
743
-                    md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
744
-                    checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
745
-                    if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
746
-                        bestInter = &md.pred[PRED_2NxnD];
747
+                    int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
748
+                    if (try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
749
+                    {
750
+                        refMasks[0] = allSplitRefs;                                    /* 75% top */
751
+                        refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
752
+                        md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
753
+                        checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
754
+                        if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
755
+                            bestInter = &md.pred[PRED_2NxnD];
756
+                    }
757
+
758
+                    if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnU)
759
+                    {
760
+                        refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
761
+                        refMasks[1] = allSplitRefs;                                    /* 75% bot */
762
+                        md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
763
+                        checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
764
+                        if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
765
+                            bestInter = &md.pred[PRED_2NxnU];
766
+                    }
767
+
768
+                    if (!try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
769
+                    {
770
+                        refMasks[0] = allSplitRefs;                                    /* 75% top */
771
+                        refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
772
+                        md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
773
+                        checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
774
+                        if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
775
+                            bestInter = &md.pred[PRED_2NxnD];
776
+                    }
777
                 }
778
                 if (bVer)
779
                 {
780
-                    refMasks[0] = splitRefs[0] | splitRefs[2]; /* 25% left */
781
-                    refMasks[1] = allSplitRefs;                /* 75% right */
782
-                    md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
783
-                    checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
784
-                    if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
785
-                        bestInter = &md.pred[PRED_nLx2N];
786
-
787
-                    refMasks[0] = allSplitRefs;                /* 75% left */
788
-                    refMasks[1] = splitRefs[1] | splitRefs[3]; /* 25% right */
789
-                    md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
790
-                    checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
791
-                    if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
792
-                        bestInter = &md.pred[PRED_nRx2N];
793
+                    int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
794
+                    if (try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
795
+                    {
796
+                        refMasks[0] = allSplitRefs;                                    /* 75% left  */
797
+                        refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
798
+                        md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
799
+                        checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
800
+                        if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
801
+                            bestInter = &md.pred[PRED_nRx2N];
802
+                    }
803
+
804
+                    if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nLx2N)
805
+                    {
806
+                        refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left  */
807
+                        refMasks[1] = allSplitRefs;                                    /* 75% right */
808
+                        md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
809
+                        checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
810
+                        if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
811
+                            bestInter = &md.pred[PRED_nLx2N];
812
+                    }
813
+
814
+                    if (!try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
815
+                    {
816
+                        refMasks[0] = allSplitRefs;                                    /* 75% left  */
817
+                        refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
818
+                        md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
819
+                        checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
820
+                        if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
821
+                            bestInter = &md.pred[PRED_nRx2N];
822
+                    }
823
                 }
824
             }
825
-            bool bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;
826
+            bool bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && cuGeom.log2CUSize != MAX_LOG2_CU_SIZE;
827
             if (m_param->rdLevel >= 3)
828
             {
829
                 /* Calculate RD cost of best inter option */
830
-                if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
831
+                if (!m_bChromaSa8d && (m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
832
                 {
833
                     uint32_t numPU = bestInter->cu.getNumPartInter(0);
834
                     for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
835
@@ -1005,10 +1181,13 @@
836
                 else if (md.bestMode->cu.isInter(0))
837
                 {
838
                     uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
839
-                    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
840
+                    if (m_csp != X265_CSP_I400)
841
                     {
842
-                        PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
843
-                        motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
844
+                        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
845
+                        {
846
+                            PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
847
+                            motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
848
+                        }
849
                     }
850
                     if (m_param->rdLevel == 2)
851
                         encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
852
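
A second theme of this patch is full 4:0:0 support: every chroma-touching stage, SA8D, motion compensation, intra chroma mode search, is now gated on the chroma sampling format. A minimal sketch of the pattern; the constant is defined locally here for the sketch, while the real code uses X265_CSP_I400 from x265.h:

    // Skip all chroma work for monochrome (4:0:0) encodes.
    template <typename ChromaStage>
    void runChromaStage(int csp, ChromaStage stage)
    {
        const int kCspI400 = 0;   // stand-in for X265_CSP_I400 from x265.h
        if (csp != kCspI400)      // 4:0:0 has no chroma planes
            stage();
    }

    // e.g. runChromaStage(csp, [&]{ /* chroma MC for each PU */ });
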
@@ -1019,7 +1198,6 @@
853
 
854
                         uint32_t tuDepthRange[2];
855
                         cu.getInterTUQtDepthRange(tuDepthRange, 0);
856
-
857
                         m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
858
                         residualTransformQuantInter(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
859
                         if (cu.getQtRootCbf(0))
860
@@ -1045,9 +1223,12 @@
861
                         cu.getIntraTUQtDepthRange(tuDepthRange, 0);
862
 
863
                         residualTransformQuantIntra(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
864
-                        getBestIntraModeChroma(*md.bestMode, cuGeom);
865
-                        residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
866
-                        md.bestMode->reconYuv.copyFromPicYuv(*m_frame->m_reconPic, cu.m_cuAddr, cuGeom.absPartIdx); // TODO:
867
+                        if (m_csp != X265_CSP_I400)
868
+                        {
869
+                            getBestIntraModeChroma(*md.bestMode, cuGeom);
870
+                            residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
871
+                        }
872
+                        md.bestMode->reconYuv.copyFromPicYuv(reconPic, cu.m_cuAddr, cuGeom.absPartIdx); // TODO:
873
                     }
874
                 }
875
             }
876
@@ -1074,19 +1255,28 @@
877
     }
878
 
879
     /* determine which motion references the parent CU should search */
880
-    uint32_t refMask;
881
-    if (!(m_param->limitReferences & X265_REF_LIMIT_DEPTH))
882
-        refMask = 0;
883
-    else if (md.bestMode == &md.pred[PRED_SPLIT])
884
-        refMask = allSplitRefs;
885
-    else
886
+    SplitData splitCUData;
887
+    splitCUData.initSplitCUData();
888
+
889
+    if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
890
     {
891
-        /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
892
-        CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
893
-        uint32_t numPU = cu.getNumPartInter(0);
894
-        refMask = 0;
895
-        for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
896
-            refMask |= cu.getBestRefIdx(subPartIdx);
897
+        if (md.bestMode == &md.pred[PRED_SPLIT])
898
+            splitCUData.splitRefs = allSplitRefs;
899
+        else 
900
+        {
901
+            /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
902
+            CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
903
+            uint32_t numPU = cu.getNumPartInter(0);
904
+            for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
905
+                splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
906
+        }
907
+    }
908
+
909
+    if (m_param->limitModes)
910
+    {
911
+        splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
912
+        splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
913
+        splitCUData.sa8dCost    = md.pred[PRED_2Nx2N].sa8dCost;
914
     }
915
     
916
     if (mightNotSplit)
917
@@ -1100,15 +1290,14 @@
918
     }
919
 
920
     /* Copy best data to encData CTU and recon */
921
-    X265_CHECK(md.bestMode->ok(), "best mode is not ok");
922
     md.bestMode->cu.copyToPic(depth);
923
     if (m_param->rdLevel)
924
-        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.absPartIdx);
925
+        md.bestMode->reconYuv.copyToPicYuv(reconPic, cuAddr, cuGeom.absPartIdx);
926
 
927
-    return refMask;
928
+    return splitCUData;
929
 }
930
 
931
-uint32_t Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp)
932
+SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp)
933
 {
934
     uint32_t depth = cuGeom.depth;
935
     ModeDepth& md = m_modeDepth[depth];
936
@@ -1116,6 +1305,16 @@
937
 
938
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
939
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
940
+    bool foundSkip = false;
941
+    bool splitIntra = true;
942
+
943
+    // avoid reading uninitialized values in the references below
944
+    if (m_param->limitModes)
945
+    {
946
+        md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0
947
+        md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1
948
+        md.pred[PRED_2Nx2N].rdCost = 0;
949
+    }
950
 
951
     if (m_param->analysisMode == X265_ANALYSIS_LOAD)
952
     {
953
@@ -1127,25 +1326,21 @@
954
             md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
955
             checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom, true);
956
 
957
-            if (m_bTryLossless)
958
-                tryLossless(cuGeom);
959
-
960
-            if (mightSplit)
961
-                addSplitFlagCost(*md.bestMode, cuGeom.depth);
962
-
963
             // increment zOrder offset to point to next best depth in sharedDepth buffer
964
             zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]];
965
 
966
-            mightSplit = false;
967
-            mightNotSplit = false;
968
+            foundSkip = true;
969
         }
970
-    }
971
+    }  
972
+
973
+    SplitData splitData[4];
974
+    splitData[0].initSplitCUData();
975
+    splitData[1].initSplitCUData();
976
+    splitData[2].initSplitCUData();
977
+    splitData[3].initSplitCUData();
978
 
979
-    bool foundSkip = false;
980
-    bool splitIntra = true;
981
-    uint32_t splitRefs[4] = { 0, 0, 0, 0 };
982
     /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */
983
-    if (mightNotSplit)
984
+    if (mightNotSplit && !foundSkip)
985
     {
986
         md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
987
         md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
988
@@ -1180,7 +1375,7 @@
989
                 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
990
                     nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
991
 
992
-                splitRefs[subPartIdx] = compressInterCU_rd5_6(parentCTU, childGeom, zOrder, nextQP);
993
+                splitData[subPartIdx] = compressInterCU_rd5_6(parentCTU, childGeom, zOrder, nextQP);
994
 
995
                 // Save best CU and pred data for this sub CU
996
                 splitIntra |= nd.bestMode->cu.isIntra(0);
997
@@ -1207,7 +1402,7 @@
998
     /* Split CUs
999
      *   0  1
1000
      *   2  3 */
1001
-    uint32_t allSplitRefs = splitRefs[0] | splitRefs[1] | splitRefs[2] | splitRefs[3];
1002
+    uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
1003
     /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
1004
     if (mightNotSplit)
1005
     {
1006
@@ -1226,7 +1421,7 @@
1007
             {
1008
                 CUData& cu = md.pred[PRED_2Nx2N].cu;
1009
                 uint32_t refMask = cu.getBestRefIdx(0);
1010
-                allSplitRefs = splitRefs[0] = splitRefs[1] = splitRefs[2] = splitRefs[3] = refMask;
1011
+                allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask;
1012
             }
1013
 
1014
             if (m_slice->m_sliceType == B_SLICE)
1015
@@ -1242,22 +1437,78 @@
1016
 
1017
             if (m_param->bEnableRectInter)
1018
             {
1019
-                refMasks[0] = splitRefs[0] | splitRefs[2]; /* left */
1020
-                refMasks[1] = splitRefs[1] | splitRefs[3]; /* right */
1021
-                md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1022
-                checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
1023
-                checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
1024
-
1025
-                refMasks[0] = splitRefs[0] | splitRefs[1]; /* top */
1026
-                refMasks[1] = splitRefs[2] | splitRefs[3]; /* bot */
1027
-                md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
1028
-                checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
1029
-                checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
1030
+                uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
1031
+                uint32_t threshold_2NxN, threshold_Nx2N;
1032
+
1033
+                if (m_slice->m_sliceType == P_SLICE)
1034
+                {
1035
+                    threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
1036
+                    threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
1037
+                }
1038
+                else
1039
+                {
1040
+                    threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0] 
1041
+                                    + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
1042
+                    threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0] 
1043
+                                    + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
1044
+                }
1045
+
1046
+                int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
1047
+                if (try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)
1048
+                {
1049
+                    refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
1050
+                    refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
1051
+                    md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
1052
+                    checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
1053
+                    checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
1054
+                }
1055
+
1056
+                if (splitCost < md.bestMode->rdCost + threshold_Nx2N)
1057
+                {
1058
+                    refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
1059
+                    refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
1060
+                    md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1061
+                    checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
1062
+                    checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
1063
+                }
1064
+
1065
+                if (!try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)
1066
+                {
1067
+                    refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
1068
+                    refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
1069
+                    md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
1070
+                    checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
1071
+                    checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
1072
+                }
1073
             }
1074
 
1075
             // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N)
1076
             if (m_slice->m_sps->maxAMPDepth > depth)
1077
             {
1078
+                uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
1079
+                uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;
1080
+
1081
+                if (m_slice->m_sliceType == P_SLICE)
1082
+                {
1083
+                    threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
1084
+                    threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
1085
+
1086
+                    threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
1087
+                    threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
1088
+                }
1089
+                else
1090
+                {
1091
+                    threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0] 
1092
+                                       + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
1093
+                    threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0] 
1094
+                                       + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
1095
+
1096
+                    threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0] 
1097
+                                       + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
1098
+                    threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0] 
1099
+                                       + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
1100
+                }
1101
+
1102
                 bool bHor = false, bVer = false;
1103
                 if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN)
1104
                     bHor = true;
1105
@@ -1271,47 +1522,80 @@
1106
 
1107
                 if (bHor)
1108
                 {
1109
-                    refMasks[0] = splitRefs[0] | splitRefs[1]; /* 25% top */
1110
-                    refMasks[1] = allSplitRefs;                /* 75% bot */
1111
-                    md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
1112
-                    checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
1113
-                    checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
1114
-
1115
-                    refMasks[0] = allSplitRefs;                /* 75% top */
1116
-                    refMasks[1] = splitRefs[2] | splitRefs[3]; /* 25% bot */
1117
-                    md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
1118
-                    checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
1119
-                    checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
1120
+                    int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
1121
+                    if (try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)
1122
+                    {
1123
+                        refMasks[0] = allSplitRefs;                                    /* 75% top */
1124
+                        refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
1125
+                        md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
1126
+                        checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
1127
+                        checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
1128
+                    }
1129
+
1130
+                    if (splitCost < md.bestMode->rdCost + threshold_2NxnU)
1131
+                    {
1132
+                        refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
1133
+                        refMasks[1] = allSplitRefs;                                    /* 75% bot */
1134
+                        md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
1135
+                        checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
1136
+                        checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
1137
+                    }
1138
+
1139
+                    if (!try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)
1140
+                    {
1141
+                        refMasks[0] = allSplitRefs;                                    /* 75% top */
1142
+                        refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
1143
+                        md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
1144
+                        checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
1145
+                        checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
1146
+                    }
1147
                 }
1148
+
1149
                 if (bVer)
1150
                 {
1151
-                    refMasks[0] = splitRefs[0] | splitRefs[2]; /* 25% left */
1152
-                    refMasks[1] = allSplitRefs;                /* 75% right */
1153
-                    md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1154
-                    checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
1155
-                    checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
1156
+                    int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
1157
+                    if (try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)
1158
+                    {
1159
+                        refMasks[0] = allSplitRefs;                                    /* 75% left  */
1160
+                        refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
1161
+                        md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1162
+                        checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
1163
+                        checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
1164
+                    }
1165
+
1166
+                    if (splitCost < md.bestMode->rdCost + threshold_nLx2N)
1167
+                    {
1168
+                        refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left  */
1169
+                        refMasks[1] = allSplitRefs;                                    /* 75% right */
1170
+                        md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1171
+                        checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
1172
+                        checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
1173
+                    }
1174
 
1175
-                    refMasks[0] = allSplitRefs;                /* 75% left */
1176
-                    refMasks[1] = splitRefs[1] | splitRefs[3]; /* 25% right */
1177
-                    md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1178
-                    checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
1179
-                    checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
1180
+                    if (!try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)
1181
+                    {
1182
+                        refMasks[0] = allSplitRefs;                                    /* 75% left  */
1183
+                        refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
1184
+                        md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
1185
+                        checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
1186
+                        checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
1187
+                    }
1188
                 }
1189
             }
1190
 
1191
-            if (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames)
1192
+            if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && cuGeom.log2CUSize != MAX_LOG2_CU_SIZE)
1193
             {
1194
                 if (!m_param->limitReferences || splitIntra)
1195
                 {
1196
                     ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
1197
                     md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
1198
-                    checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL, NULL);
1199
+                    checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
1200
                     checkBestMode(md.pred[PRED_INTRA], depth);
1201
 
1202
                     if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
1203
                     {
1204
                         md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
1205
-                        checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL, NULL);
1206
+                        checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
1207
                         checkBestMode(md.pred[PRED_INTRA_NxN], depth);
1208
                     }
1209
                 }
1210
@@ -1334,27 +1618,34 @@
1211
         checkBestMode(md.pred[PRED_SPLIT], depth);
1212
 
1213
        /* determine which motion references the parent CU should search */
1214
-    uint32_t refMask;
1215
-    if (!(m_param->limitReferences & X265_REF_LIMIT_DEPTH))
1216
-        refMask = 0;
1217
-    else if (md.bestMode == &md.pred[PRED_SPLIT])
1218
-        refMask = allSplitRefs;
1219
-    else
1220
+    SplitData splitCUData;
1221
+    splitCUData.initSplitCUData();
1222
+    if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
1223
     {
1224
-        /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
1225
-        CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
1226
-        uint32_t numPU = cu.getNumPartInter(0);
1227
-        refMask = 0;
1228
-        for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
1229
-            refMask |= cu.getBestRefIdx(subPartIdx);
1230
+        if (md.bestMode == &md.pred[PRED_SPLIT])
1231
+            splitCUData.splitRefs = allSplitRefs;
1232
+        else
1233
+        {
1234
+            /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
1235
+            CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
1236
+            uint32_t numPU = cu.getNumPartInter(0);
1237
+            for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
1238
+                splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
1239
+        }
1240
+    }
1241
+
1242
+    if (m_param->limitModes)
1243
+    {
1244
+        splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
1245
+        splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
1246
+        splitCUData.sa8dCost    = md.pred[PRED_2Nx2N].rdCost;
1247
     }
1248
 
1249
     /* Copy best data to encData CTU and recon */
1250
-    X265_CHECK(md.bestMode->ok(), "best mode is not ok");
1251
     md.bestMode->cu.copyToPic(depth);
1252
     md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
1253
 
1254
-    return refMask;
1255
+    return splitCUData;
1256
 }
1257
 
1258
 /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
1259
@@ -1389,13 +1680,23 @@
1260
     bestPred->sa8dCost = MAX_INT64;
1261
     int bestSadCand = -1;
1262
     int sizeIdx = cuGeom.log2CUSize - 2;
1263
-
1264
+    int safeX, maxSafeMv;
1265
+    if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE)
1266
+    {
1267
+        safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * g_maxCUSize - 3;
1268
+        maxSafeMv = (safeX - tempPred->cu.m_cuPelX) * 4;
1269
+    }
1270
     for (uint32_t i = 0; i < numMergeCand; ++i)
1271
     {
1272
         if (m_bFrameParallel &&
1273
             (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
1274
             candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4))
1275
             continue;
1276
+        if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
1277
+            tempPred->cu.m_cuPelX / g_maxCUSize < m_frame->m_encData->m_pir.pirEndCol &&
1278
+            candMvField[i][0].mv.x > maxSafeMv)
1279
+            // skip merge candidates which reference beyond the safe reference area
1280
+            continue;
1281
 
1282
         tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx
1283
         X265_CHECK(m_slice->m_sliceType == B_SLICE || !(candDir[i] & 0x10), " invalid merge for P slice\n");
1284
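
For --intra-refresh, P-slice merge candidates must not read reference pixels to the right of the refreshed column, since that area has not yet been cleaned by the moving intra column. safeX is the last safe luma column (the refresh front at pirEndCol CTUs, minus 3 pixels of interpolation margin) and maxSafeMv converts it to a quarter-pel horizontal MV bound relative to the CU's position. A sketch of the arithmetic:

    #include <cstdint>

    int32_t maxSafeMergeMvX(uint32_t pirEndCol, uint32_t maxCUSize,
                            uint32_t cuPelX)
    {
        int32_t safeX = (int32_t)(pirEndCol * maxCUSize) - 3;  // pels
        return (safeX - (int32_t)cuPelX) * 4;                  // quarter-pel
    }

    // Any candidate whose mv.x exceeds this bound is skipped, as the
    // loops in both the rd0_4 and rd5_6 merge paths do below.
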
@@ -1404,12 +1705,11 @@
1285
         tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
1286
         tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
1287
         tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
1288
-
1289
-        motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_bChromaSa8d);
1290
+        motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400));
1291
 
1292
         tempPred->sa8dBits = getTUBits(i, numMergeCand);
1293
         tempPred->distortion = primitives.cu[sizeIdx].sa8d(fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
1294
-        if (m_bChromaSa8d)
1295
+        if (m_bChromaSa8d && (m_csp != X265_CSP_I400))
1296
         {
1297
             tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
1298
             tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
1299
@@ -1428,7 +1728,7 @@
1300
         return;
1301
 
1302
     /* calculate the motion compensation for chroma for the best mode selected */
1303
-    if (!m_bChromaSa8d) /* Chroma MC was done above */
1304
+    if (!m_bChromaSa8d && (m_csp != X265_CSP_I400)) /* Chroma MC was done above */
1305
         motionCompensation(bestPred->cu, pu, bestPred->predYuv, false, true);
1306
 
1307
     if (m_param->rdLevel)
1308
@@ -1463,7 +1763,6 @@
1309
     md.bestMode->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);
1310
     md.bestMode->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);
1311
     checkDQP(*md.bestMode, cuGeom);
1312
-    X265_CHECK(md.bestMode->ok(), "Merge mode not ok\n");
1313
 }
1314
 
1315
 /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
1316
@@ -1501,7 +1800,12 @@
1317
         first = *m_reuseBestMergeCand;
1318
         last = first + 1;
1319
     }
1320
-
1321
+    int safeX, maxSafeMv;
1322
+    if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE)
1323
+    {
1324
+        safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * g_maxCUSize - 3;
1325
+        maxSafeMv = (safeX - tempPred->cu.m_cuPelX) * 4;
1326
+    }
1327
     for (uint32_t i = first; i < last; i++)
1328
     {
1329
         if (m_bFrameParallel &&
1330
@@ -1524,7 +1828,11 @@
1331
                 continue;
1332
             triedBZero = true;
1333
         }
1334
-
1335
+        if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
1336
+            tempPred->cu.m_cuPelX / g_maxCUSize < m_frame->m_encData->m_pir.pirEndCol &&
1337
+            candMvField[i][0].mv.x > maxSafeMv)
1338
+            // skip merge candidates which reference beyond safe reference area
1339
+            continue;
1340
         tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;    /* merge candidate ID is stored in L0 MVP idx */
1341
         tempPred->cu.m_interDir[0] = candDir[i];
1342
         tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
1343
@@ -1533,11 +1841,12 @@
1344
         tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
1345
         tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */
1346
 
1347
-        motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, true);
1348
+        motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_csp != X265_CSP_I400);
1349
 
1350
         uint8_t hasCbf = true;
1351
         bool swapped = false;
1352
-        if (!foundCbf0Merge)
1353
+        /* bypass encoding merge with residual if analysis-mode = load, since only SKIP CUs enter this function */
1354
+        if (!foundCbf0Merge && !isShareMergeCand)
1355
         {
1356
             /* if the best prediction has CBF (not a skip) then try merge with residual */
1357
 
1358
@@ -1586,14 +1895,13 @@
1359
         bestPred->cu.setPURefIdx(0, (int8_t)candMvField[bestCand][0].refIdx, 0, 0);
1360
         bestPred->cu.setPURefIdx(1, (int8_t)candMvField[bestCand][1].refIdx, 0, 0);
1361
         checkDQP(*bestPred, cuGeom);
1362
-        X265_CHECK(bestPred->ok(), "merge mode is not ok");
1363
     }
1364
 
1365
     if (m_param->analysisMode)
1366
     {
1367
-        m_reuseBestMergeCand++;
1368
         if (m_param->analysisMode == X265_ANALYSIS_SAVE)
1369
             *m_reuseBestMergeCand = bestPred->cu.m_mvpIdx[0][0];
1370
+        m_reuseBestMergeCand++;
1371
     }
1372
 }
1373
 
1374
@@ -1614,18 +1922,20 @@
1375
             {
1376
                 bestME[i].ref = *m_reuseRef;
1377
                 m_reuseRef++;
1378
+
1379
+                bestME[i].mv = *m_reuseMv;
1380
+                m_reuseMv++;
1381
             }
1382
         }
1383
     }
1384
-
1385
-    predInterSearch(interMode, cuGeom, m_bChromaSa8d, refMask);
1386
+    predInterSearch(interMode, cuGeom, m_bChromaSa8d && (m_csp != X265_CSP_I400), refMask);
1387
 
1388
     /* predInterSearch sets interMode.sa8dBits */
1389
     const Yuv& fencYuv = *interMode.fencYuv;
1390
     Yuv& predYuv = interMode.predYuv;
1391
     int part = partitionFromLog2Size(cuGeom.log2CUSize);
1392
     interMode.distortion = primitives.cu[part].sa8d(fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
1393
-    if (m_bChromaSa8d)
1394
+    if (m_bChromaSa8d && (m_csp != X265_CSP_I400))
1395
     {
1396
         interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
1397
         interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
1398
@@ -1637,11 +1947,16 @@
1399
         uint32_t numPU = interMode.cu.getNumPartInter(0);
1400
         for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1401
         {
1402
+            PredictionUnit pu(interMode.cu, cuGeom, puIdx);
1403
             MotionData* bestME = interMode.bestME[puIdx];
1404
             for (int32_t i = 0; i < numPredDir; i++)
1405
             {
1406
+                if (bestME[i].ref >= 0)
1407
+                    *m_reuseMv = getLowresMV(interMode.cu, pu, i, bestME[i].ref);
1408
+
1409
                 *m_reuseRef = bestME[i].ref;
1410
                 m_reuseRef++;
1411
+                m_reuseMv++;
1412
             }
1413
         }
1414
     }
1415
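In save mode the hunk above records one reference index and one lowres MV per prediction direction per PU, advancing m_reuseRef and m_reuseMv in lockstep; the load path earlier in the diff consumes them in the same order. A cut-down illustration of the pattern (ReuseBuffers and its members are invented for this sketch, not x265 types):

    #include <cstdint>
    #include <vector>

    struct Mv { int16_t x, y; };

    // Writer and reader must visit PUs and directions in exactly the same
    // order, or the second pass attributes data to the wrong block.
    struct ReuseBuffers
    {
        std::vector<int32_t> refs;
        std::vector<Mv>      mvs;
        size_t readPos = 0;

        void save(int32_t ref, Mv mv)   { refs.push_back(ref); mvs.push_back(mv); }
        void load(int32_t& ref, Mv& mv) { ref = refs[readPos]; mv = mvs[readPos]; readPos++; }
    };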
@@ -1664,11 +1979,13 @@
1416
             {
1417
                 bestME[i].ref = *m_reuseRef;
1418
                 m_reuseRef++;
1419
+
1420
+                bestME[i].mv = *m_reuseMv;
1421
+                m_reuseMv++;
1422
             }
1423
         }
1424
     }
1425
-
1426
-    predInterSearch(interMode, cuGeom, true, refMask);
1427
+    predInterSearch(interMode, cuGeom, m_csp != X265_CSP_I400, refMask);
1428
 
1429
     /* predInterSearch sets interMode.sa8dBits, but this is ignored */
1430
     encodeResAndCalcRdInterCU(interMode, cuGeom);
1431
@@ -1678,11 +1995,16 @@
1432
         uint32_t numPU = interMode.cu.getNumPartInter(0);
1433
         for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
1434
         {
1435
+            PredictionUnit pu(interMode.cu, cuGeom, puIdx);
1436
             MotionData* bestME = interMode.bestME[puIdx];
1437
             for (int32_t i = 0; i < numPredDir; i++)
1438
             {
1439
+                if (bestME[i].ref >= 0)
1440
+                    *m_reuseMv = getLowresMV(interMode.cu, pu, i, bestME[i].ref);
1441
+
1442
                 *m_reuseRef = bestME[i].ref;
1443
                 m_reuseRef++;
1444
+                m_reuseMv++;
1445
             }
1446
         }
1447
     }
1448
@@ -1731,10 +2053,10 @@
1449
     cu.m_mvd[1][0] = bestME[1].mv - mvp1;
1450
 
1451
     PredictionUnit pu(cu, cuGeom, 0);
1452
-    motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_bChromaSa8d);
1453
+    motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400));
1454
 
1455
     int sa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
1456
-    if (m_bChromaSa8d)
1457
+    if (m_bChromaSa8d && (m_csp != X265_CSP_I400))
1458
     {
1459
         /* Add in chroma distortion */
1460
         sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
1461
@@ -1765,16 +2087,16 @@
1462
 
1463
         int zsa8d;
1464
 
1465
-        if (m_bChromaSa8d)
1466
+        if (m_bChromaSa8d && (m_csp != X265_CSP_I400))
1467
         {
1468
             cu.m_mv[0][0] = mvzero;
1469
             cu.m_mv[1][0] = mvzero;
1470
 
1471
             motionCompensation(cu, pu, tmpPredYuv, true, true);
1472
-
1473
             zsa8d  = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
1474
             zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
1475
             zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize);
1476
+
1477
         }
1478
         else
1479
         {
1480
@@ -1810,13 +2132,12 @@
1481
             cu.m_mvd[1][0] = mvzero - mvp1;
1482
             cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
1483
 
1484
-            if (m_bChromaSa8d)
1485
-                /* real MC was already performed */
1486
+            if (m_bChromaSa8d) /* real MC was already performed */
1487
                 bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
1488
             else
1489
-                motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, true);
1490
+                motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_csp != X265_CSP_I400);
1491
         }
1492
-        else if (m_bChromaSa8d)
1493
+        else if (m_bChromaSa8d && (m_csp != X265_CSP_I400))
1494
         {
1495
             /* recover overwritten motion vectors */
1496
             cu.m_mv[0][0] = bestME[0].mv;
1497
@@ -1845,7 +2166,9 @@
1498
     Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode;
1499
     CUData& cu = bestMode->cu;
1500
 
1501
-    cu.copyFromPic(ctu, cuGeom);
1502
+    cu.copyFromPic(ctu, cuGeom, m_csp);
1503
+
1504
+    PicYuv& reconPic = *m_frame->m_reconPic;
1505
 
1506
     Yuv& fencYuv = m_modeDepth[cuGeom.depth].fencYuv;
1507
     if (cuGeom.depth)
1508
@@ -1860,8 +2183,11 @@
1509
         cu.getIntraTUQtDepthRange(tuDepthRange, 0);
1510
 
1511
         residualTransformQuantIntra(*bestMode, cuGeom, 0, 0, tuDepthRange);
1512
-        getBestIntraModeChroma(*bestMode, cuGeom);
1513
-        residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
1514
+        if (m_csp != X265_CSP_I400)
1515
+        {
1516
+            getBestIntraModeChroma(*bestMode, cuGeom);
1517
+            residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
1518
+        }
1519
     }
1520
     else // if (cu.isInter(0))
1521
     {
1522
@@ -1876,20 +2202,23 @@
1523
         /* at RD 0, the prediction pixels are accumulated into the top depth predYuv */
1524
         Yuv& predYuv = m_modeDepth[0].bestMode->predYuv;
1525
         pixel* predY = predYuv.getLumaAddr(absPartIdx);
1526
-        pixel* predU = predYuv.getCbAddr(absPartIdx);
1527
-        pixel* predV = predYuv.getCrAddr(absPartIdx);
1528
 
1529
         primitives.cu[sizeIdx].sub_ps(resiYuv.m_buf[0], resiYuv.m_size,
1530
                                       fencYuv.m_buf[0], predY,
1531
                                       fencYuv.m_size, predYuv.m_size);
1532
 
1533
-        primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[1], resiYuv.m_csize,
1534
+        if (m_csp != X265_CSP_I400)
1535
+        {
1536
+            pixel* predU = predYuv.getCbAddr(absPartIdx);
1537
+            pixel* predV = predYuv.getCrAddr(absPartIdx);
1538
+            primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[1], resiYuv.m_csize,
1539
                                                  fencYuv.m_buf[1], predU,
1540
                                                  fencYuv.m_csize, predYuv.m_csize);
1541
 
1542
-        primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[2], resiYuv.m_csize,
1543
+            primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[2], resiYuv.m_csize,
1544
                                                  fencYuv.m_buf[2], predV,
1545
                                                  fencYuv.m_csize, predYuv.m_csize);
1546
+        }
1547
 
1548
         uint32_t tuDepthRange[2];
1549
         cu.getInterTUQtDepthRange(tuDepthRange, 0);
1550
@@ -1902,27 +2231,30 @@
1551
         /* residualTransformQuantInter() wrote transformed residual back into
1552
          * resiYuv. Generate the recon pixels by adding it to the prediction */
1553
 
1554
-        PicYuv& reconPic = *m_frame->m_reconPic;
1555
         if (cu.m_cbf[0][0])
1556
             primitives.cu[sizeIdx].add_ps(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
1557
                                           predY, resiYuv.m_buf[0], predYuv.m_size, resiYuv.m_size);
1558
         else
1559
             primitives.cu[sizeIdx].copy_pp(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
1560
                                            predY, predYuv.m_size);
1561
-
1562
-        if (cu.m_cbf[1][0])
1563
-            primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
1564
+        if (m_csp != X265_CSP_I400)
1565
+        {
1566
+            pixel* predU = predYuv.getCbAddr(absPartIdx);
1567
+            pixel* predV = predYuv.getCrAddr(absPartIdx);
1568
+            if (cu.m_cbf[1][0])
1569
+                primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
1570
                                                         predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
1571
-        else
1572
-            primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
1573
+            else
1574
+                primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
1575
                                                          predU, predYuv.m_csize);
1576
 
1577
-        if (cu.m_cbf[2][0])
1578
-            primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
1579
+            if (cu.m_cbf[2][0])
1580
+                primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
1581
                                                         predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
1582
-        else
1583
-            primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
1584
+            else
1585
+                primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
1586
                                                          predV, predYuv.m_csize);
1587
+        }
1588
     }
1589
 
1590
     cu.updatePic(cuGeom.depth);
1591
@@ -1936,7 +2268,6 @@
1592
         mode.contexts.resetBits();
1593
         mode.contexts.codeSplitFlag(mode.cu, 0, depth);
1594
         uint32_t bits = mode.contexts.getNumberOfWrittenBits();
1595
-        mode.mvBits += bits;
1596
         mode.totalBits += bits;
1597
         updateModeCost(mode);
1598
     }
1599
@@ -1947,7 +2278,6 @@
1600
     }
1601
     else
1602
     {
1603
-        mode.mvBits++;
1604
         mode.totalBits++;
1605
         updateModeCost(mode);
1606
     }
1607
@@ -1965,7 +2295,7 @@
1608
     if (m_slice->m_numRefIdx[0])
1609
     {
1610
         numRefs++;
1611
-        const CUData& cu = *m_slice->m_refPicList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
1612
+        const CUData& cu = *m_slice->m_refFrameList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
1613
         previousQP = cu.m_qp[0];
1614
         if (!cu.m_cuDepth[cuGeom.absPartIdx])
1615
             return 0;
1616
@@ -1979,7 +2309,7 @@
1617
     if (m_slice->m_numRefIdx[1])
1618
     {
1619
         numRefs++;
1620
-        const CUData& cu = *m_slice->m_refPicList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
1621
+        const CUData& cu = *m_slice->m_refFrameList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
1622
         if (!cu.m_cuDepth[cuGeom.absPartIdx])
1623
             return 0;
1624
         for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
1625
@@ -2061,10 +2391,10 @@
1626
     return false;
1627
 }
1628
 
1629
-int Analysis::calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom)
1630
+int Analysis::calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, double baseQp)
1631
 {
1632
     FrameData& curEncData = *m_frame->m_encData;
1633
-    double qp = curEncData.m_cuStat[ctu.m_cuAddr].baseQp;
1634
+    double qp = baseQp >= 0 ? baseQp : curEncData.m_cuStat[ctu.m_cuAddr].baseQp;
1635
 
1636
     /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
1637
     bool isReferenced = IS_REFERENCED(m_frame);
1638
x265_1.8.tar.gz/source/encoder/analysis.h -> x265_1.9.tar.gz/source/encoder/analysis.h Changed
73
 
1
@@ -3,6 +3,7 @@
2
 *
3
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
4
 *          Steve Borho <steve@borho.org>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -40,6 +41,21 @@
10
 
11
 class Entropy;
12
 
13
+struct SplitData
14
+{
15
+    uint32_t splitRefs;
16
+    uint32_t mvCost[2];
17
+    uint64_t sa8dCost;
18
+
19
+    void initSplitCUData()
20
+    {
21
+        splitRefs = 0;
22
+        mvCost[0] = 0; // L0
23
+        mvCost[1] = 0; // L1
24
+        sa8dCost    = 0;
25
+    }
26
+};
27
+
28
 class Analysis : public Search
29
 {
30
 public:
31
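compressInterCU_rd0_4/rd5_6 now return this struct instead of a bare reference mask, so each recursion level can combine its children's results for the --limit-refs and --limit-modes decisions. A toy illustration of the aggregation, reusing the SplitData struct defined above (the OR/sum policy shown is a plausible reading of the new fields, not a copy of the encoder's exact logic):

    SplitData aggregateChildren(const SplitData child[4])
    {
        SplitData out;
        out.initSplitCUData();
        for (int i = 0; i < 4; i++)
        {
            out.splitRefs |= child[i].splitRefs;  // union of references the children used
            out.mvCost[0] += child[i].mvCost[0];  // accumulated L0 MV cost of the split
            out.mvCost[1] += child[i].mvCost[1];  // accumulated L1 MV cost of the split
            out.sa8dCost  += child[i].sa8dCost;   // compared against the parent 2Nx2N cost
        }
        return out;
    }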
@@ -101,20 +117,20 @@
32
     Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
33
 
34
 protected:
35
-
36
     /* Analysis data for load/save modes, keeps getting incremented as CTU analysis proceeds and data is consumed or read */
37
-    analysis_intra_data* m_reuseIntraDataCTU;
38
     analysis_inter_data* m_reuseInterDataCTU;
39
+    MV*                  m_reuseMv;
40
     int32_t*             m_reuseRef;
41
     uint32_t*            m_reuseBestMergeCand;
42
+    uint32_t m_splitRefIdx[4];
43
 
44
     /* full analysis for an I-slice CU */
45
-    void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
46
+    void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
47
 
48
     /* full analysis for a P or B slice CU */
49
-    void compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
50
-    uint32_t compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
51
-    uint32_t compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
52
+    uint32_t compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
53
+    SplitData compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
54
+    SplitData compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
55
 
56
     /* measure merge and skip */
57
     void checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom);
58
@@ -139,13 +155,11 @@
59
     /* generate residual and recon pixels for an entire CTU recursively (RD0) */
60
     void encodeResidue(const CUData& parentCTU, const CUGeom& cuGeom);
61
 
62
-    int calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom);
63
+    int calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, double baseQP = -1);
64
 
65
     /* check whether current mode is the new best */
66
     inline void checkBestMode(Mode& mode, uint32_t depth)
67
     {
68
-        X265_CHECK(mode.ok(), "mode costs are uninitialized\n");
69
-
70
         ModeDepth& md = m_modeDepth[depth];
71
         if (md.bestMode)
72
         {
73
x265_1.8.tar.gz/source/encoder/api.cpp -> x265_1.9.tar.gz/source/encoder/api.cpp Changed
45
 
1
@@ -72,9 +72,7 @@
2
 #endif
3
 
4
 #if HIGH_BIT_DEPTH
5
-    if (X265_DEPTH == 12)
6
-        x265_log(p, X265_LOG_WARNING, "Main12 is HIGHLY experimental, do not use!\n");
7
-    else if (X265_DEPTH != 10 && X265_DEPTH != 12)
8
+    if (X265_DEPTH != 10 && X265_DEPTH != 12)
9
 #else
10
     if (X265_DEPTH != 8)
11
 #endif
12
@@ -247,6 +245,16 @@
13
     }
14
 }
15
 
16
+int x265_encoder_intra_refresh(x265_encoder *enc)
17
+{
18
+    if (!enc)
19
+        return -1;
20
+
21
+    Encoder *encoder = static_cast<Encoder*>(enc);
22
+    encoder->m_bQueuedIntraRefresh = 1;
23
+    return 0;
24
+}
25
+
26
 void x265_cleanup(void)
27
 {
28
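The new entry point only sets m_bQueuedIntraRefresh; the next applicable P-frame then starts a refresh cycle (see calcRefreshInterval() in encoder.cpp below). A hedged usage sketch from the application side (onPacketLoss is an invented caller; the API call itself is the one added above):

    #include <stdio.h>
    #include <x265.h>

    /* Request a new intra-refresh cycle, e.g. when the receiver reports loss.
     * x265_encoder_intra_refresh() returns 0 on success, -1 for a NULL handle. */
    static void onPacketLoss(x265_encoder* enc)
    {
        if (x265_encoder_intra_refresh(enc) < 0)
            fprintf(stderr, "intra refresh request failed\n");
    }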
     if (!g_ctuSizeConfigured)
29
@@ -268,6 +276,7 @@
30
     pic->bitDepth = param->internalBitDepth;
31
     pic->colorSpace = param->internalCsp;
32
     pic->forceqp = X265_QP_AUTO;
33
+    pic->quantOffsets = NULL;
34
     if (param->analysisMode)
35
     {
36
         uint32_t widthInCU       = (param->sourceWidth  + g_maxCUSize - 1) >> g_maxLog2CUSize;
37
@@ -318,6 +327,7 @@
38
     &x265_cleanup,
39
 
40
     sizeof(x265_frame_stats),
41
+    &x265_encoder_intra_refresh,
42
 };
43
 
44
 typedef const x265_api* (*api_get_func)(int bitDepth);
45
x265_1.8.tar.gz/source/encoder/bitcost.cpp -> x265_1.9.tar.gz/source/encoder/bitcost.cpp Changed
62
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -40,7 +41,12 @@
10
             x265_emms(); // just to be safe
11
 
12
             CalculateLogs();
13
-            s_costs[qp] = new uint16_t[4 * BC_MAX_MV + 1] + 2 * BC_MAX_MV;
14
+            s_costs[qp] = X265_MALLOC(uint16_t, 4 * BC_MAX_MV + 1) + 2 * BC_MAX_MV;
15
+            if (!s_costs[qp])
16
+            {
17
+                x265_log(NULL, X265_LOG_ERROR, "BitCost s_costs buffer allocation failure\n");
18
+                return;
19
+            }
20
             double lambda = x265_lambda_tab[qp];
21
 
22
             // estimate same cost for negative and positive MVD
23
@@ -66,11 +72,16 @@
24
 {
25
     if (!s_bitsizes)
26
     {
27
-        s_bitsizes = new float[2 * BC_MAX_MV + 1];
28
+        s_bitsizes = X265_MALLOC(float, 4 * BC_MAX_MV + 1) + 2 * BC_MAX_MV;
29
+        if (!s_bitsizes)
30
+        {
31
+            x265_log(NULL, X265_LOG_ERROR, "BitCost s_bitsizes buffer allocation failure\n");
32
+            return;
33
+        }
34
         s_bitsizes[0] = 0.718f;
35
         float log2_2 = 2.0f / log(2.0f);  // 2 x 1/log(2)
36
         for (int i = 1; i <= 2 * BC_MAX_MV; i++)
37
-            s_bitsizes[i] = log((float)(i + 1)) * log2_2 + 1.718f;
38
+            s_bitsizes[i] = s_bitsizes[-i] = log((float)(i + 1)) * log2_2 + 1.718f;
39
     }
40
 }
41
 
42
@@ -80,12 +91,15 @@
43
     {
44
         if (s_costs[i])
45
         {
46
-            delete [] (s_costs[i] - 2 * BC_MAX_MV);
47
+            X265_FREE(s_costs[i] - 2 * BC_MAX_MV);
48
 
49
-            s_costs[i] = 0;
50
+            s_costs[i] = NULL;
51
         }
52
     }
53
 
54
-    delete [] s_bitsizes;
55
-    s_bitsizes = 0;
56
+    if (s_bitsizes)
57
+    {
58
+        X265_FREE(s_bitsizes - 2 * BC_MAX_MV);
59
+        s_bitsizes = NULL;
60
+    }
61
 }
62
x265_1.8.tar.gz/source/encoder/bitcost.h -> x265_1.9.tar.gz/source/encoder/bitcost.h Changed
20
 
1
@@ -47,14 +47,14 @@
2
     // return bit cost of motion vector difference, without lambda
3
     inline uint32_t bitcost(const MV& mv) const
4
     {
5
-        return (uint32_t)(s_bitsizes[abs(mv.x - m_mvp.x)] +
6
-                          s_bitsizes[abs(mv.y - m_mvp.y)] + 0.5f);
7
+        return (uint32_t)(s_bitsizes[mv.x - m_mvp.x] +
8
+                          s_bitsizes[mv.y - m_mvp.y] + 0.5f);
9
     }
10
 
11
     static inline uint32_t bitcost(const MV& mv, const MV& mvp)
12
     {
13
-        return (uint32_t)(s_bitsizes[abs(mv.x - mvp.x)] +
14
-                          s_bitsizes[abs(mv.y - mvp.y)] + 0.5f);
15
+        return (uint32_t)(s_bitsizes[mv.x - mvp.x] +
16
+                          s_bitsizes[mv.y - mvp.y] + 0.5f);
17
     }
18
 
19
     static void destroy();
20
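The bitcost.cpp and bitcost.h hunks together switch s_bitsizes from an abs()-indexed array to a table centered on zero: 4 * BC_MAX_MV + 1 entries are allocated and the base pointer is advanced by 2 * BC_MAX_MV, so a signed MV difference indexes directly and the abs() leaves the per-candidate hot path. A minimal sketch of the idiom with a toy size (allocation error handling trimmed; these are not the x265 constants):

    #include <cmath>
    #include <cstdlib>

    static const int MAX_MV = 4;  // stand-in for BC_MAX_MV
    static float* s_sizes;        // valid indices: [-2*MAX_MV, 2*MAX_MV]

    void initSizes()
    {
        s_sizes = (float*)malloc(sizeof(float) * (4 * MAX_MV + 1)) + 2 * MAX_MV;
        s_sizes[0] = 0.718f;
        for (int i = 1; i <= 2 * MAX_MV; i++)   // mirror the cost for +/- deltas
            s_sizes[i] = s_sizes[-i] = logf((float)(i + 1)) * (2.0f / logf(2.0f)) + 1.718f;
    }

    void destroySizes()
    {
        free(s_sizes - 2 * MAX_MV);  // rewind to the true allocation start
        s_sizes = NULL;
    }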
x265_1.8.tar.gz/source/encoder/dpb.cpp -> x265_1.9.tar.gz/source/encoder/dpb.cpp Changed
56
 
1
@@ -47,16 +47,16 @@
2
         delete curFrame;
3
     }
4
 
5
-    while (m_picSymFreeList)
6
+    while (m_frameDataFreeList)
7
     {
8
-        FrameData* next = m_picSymFreeList->m_freeListNext;
9
-        m_picSymFreeList->destroy();
10
+        FrameData* next = m_frameDataFreeList->m_freeListNext;
11
+        m_frameDataFreeList->destroy();
12
 
13
-        m_picSymFreeList->m_reconPic->destroy();
14
-        delete m_picSymFreeList->m_reconPic;
15
+        m_frameDataFreeList->m_reconPic->destroy();
16
+        delete m_frameDataFreeList->m_reconPic;
17
 
18
-        delete m_picSymFreeList;
19
-        m_picSymFreeList = next;
20
+        delete m_frameDataFreeList;
21
+        m_frameDataFreeList = next;
22
     }
23
 }
24
 
25
@@ -74,13 +74,19 @@
26
             curFrame->m_reconRowCount.set(0);
27
             curFrame->m_bChromaExtended = false;
28
 
29
+            // Reset column counter
30
+            X265_CHECK(curFrame->m_reconColCount != NULL, "curFrame->m_reconColCount check failure");
31
+            X265_CHECK(curFrame->m_numRows > 0, "curFrame->m_numRows check failure");
32
+            for(int32_t col = 0; col < curFrame->m_numRows; col++)
33
+                curFrame->m_reconColCount[col].set(0);
34
+
35
             // iterator is invalidated by remove, restart scan
36
             m_picList.remove(*curFrame);
37
             iterFrame = m_picList.first();
38
 
39
             m_freeList.pushBack(*curFrame);
40
-            curFrame->m_encData->m_freeListNext = m_picSymFreeList;
41
-            m_picSymFreeList = curFrame->m_encData;
42
+            curFrame->m_encData->m_freeListNext = m_frameDataFreeList;
43
+            m_frameDataFreeList = curFrame->m_encData;
44
             curFrame->m_encData = NULL;
45
             curFrame->m_reconPic = NULL;
46
         }
47
@@ -171,7 +177,7 @@
48
     {
49
         for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
50
         {
51
-            Frame *refpic = slice->m_refPicList[l][ref];
52
+            Frame *refpic = slice->m_refFrameList[l][ref];
53
             ATOMIC_INC(&refpic->m_countRefEncoders);
54
         }
55
     }
56
x265_1.8.tar.gz/source/encoder/dpb.h -> x265_1.9.tar.gz/source/encoder/dpb.h Changed
18
 
1
@@ -46,14 +46,14 @@
2
     bool               m_bTemporalSublayer;
3
     PicList            m_picList;
4
     PicList            m_freeList;
5
-    FrameData*         m_picSymFreeList;
6
+    FrameData*         m_frameDataFreeList;
7
 
8
     DPB(x265_param *param)
9
     {
10
         m_lastIDR = 0;
11
         m_pocCRA = 0;
12
         m_bRefreshPending = false;
13
-        m_picSymFreeList = NULL;
14
+        m_frameDataFreeList = NULL;
15
         m_maxRefL0 = param->maxNumReferences;
16
         m_maxRefL1 = param->bBPyramid ? 2 : 1;
17
         m_bOpenGOP = param->bOpenGOP;
18
x265_1.8.tar.gz/source/encoder/encoder.cpp -> x265_1.9.tar.gz/source/encoder/encoder.cpp Changed
736
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -39,6 +40,10 @@
10
 
11
 #include "x265.h"
12
 
13
+#if _MSC_VER
14
+#pragma warning(disable: 4996) // POSIX functions are just fine, thanks
15
+#endif
16
+
17
 namespace X265_NS {
18
 const char g_sliceTypeToChar[] = {'B', 'P', 'I'};
19
 }
20
@@ -66,12 +71,9 @@
21
     m_outputCount = 0;
22
     m_param = NULL;
23
     m_latestParam = NULL;
24
-    m_cuOffsetY = NULL;
25
-    m_cuOffsetC = NULL;
26
-    m_buOffsetY = NULL;
27
-    m_buOffsetC = NULL;
28
     m_threadPool = NULL;
29
     m_analysisFile = NULL;
30
+    m_offsetEmergency = NULL;
31
     for (int i = 0; i < X265_MAX_FRAME_THREADS; i++)
32
         m_frameEncoder[i] = NULL;
33
 
34
@@ -191,6 +193,7 @@
35
     {
36
         x265_log(m_param, X265_LOG_ERROR, "Unable to allocate scaling list arrays\n");
37
         m_aborted = true;
38
+        return;
39
     }
40
     else if (!m_param->scalingLists || !strcmp(m_param->scalingLists, "off"))
41
         m_scalingList.m_bEnabled = false;
42
@@ -198,7 +201,6 @@
43
         m_scalingList.setDefaultScalingList();
44
     else if (m_scalingList.parseScalingList(m_param->scalingLists))
45
         m_aborted = true;
46
-    m_scalingList.setupQuantMatrices();
47
 
48
     m_lookahead = new Lookahead(m_param, m_threadPool);
49
     if (m_numPools)
50
@@ -213,6 +215,82 @@
51
     initVPS(&m_vps);
52
     initSPS(&m_sps);
53
     initPPS(&m_pps);
54
+   
55
+    if (m_param->rc.vbvBufferSize)
56
+    {
57
+        m_offsetEmergency = (uint16_t(*)[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS])X265_MALLOC(uint16_t, MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS * (QP_MAX_MAX - QP_MAX_SPEC));
58
+        if (!m_offsetEmergency)
59
+        {
60
+            x265_log(m_param, X265_LOG_ERROR, "Unable to allocate memory\n");
61
+            m_aborted = true;
62
+            return;
63
+        }
64
+
65
+        bool scalingEnabled = m_scalingList.m_bEnabled;
66
+        if (!scalingEnabled)
67
+        {
68
+            m_scalingList.setDefaultScalingList();
69
+            m_scalingList.setupQuantMatrices();
70
+        }
71
+        else
72
+            m_scalingList.setupQuantMatrices();
73
+
74
+        for (int q = 0; q < QP_MAX_MAX - QP_MAX_SPEC; q++)
75
+        {
76
+            for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
77
+            {
78
+                uint16_t *nrOffset = m_offsetEmergency[q][cat];
79
+
80
+                int trSize = cat & 3;
81
+
82
+                int coefCount = 1 << ((trSize + 2) * 2);
83
+
84
+                /* Denoise chroma first then luma, then DC. */
85
+                int dcThreshold = (QP_MAX_MAX - QP_MAX_SPEC) * 2 / 3;
86
+                int lumaThreshold = (QP_MAX_MAX - QP_MAX_SPEC) * 2 / 3;
87
+                int chromaThreshold = 0;
88
+
89
+                int thresh = (cat < 4 || (cat >= 8 && cat < 12)) ? lumaThreshold : chromaThreshold;
90
+
91
+                double quantF = (double)(1ULL << (q / 6 + 16 + 8));
92
+
93
+                for (int i = 0; i < coefCount; i++)
94
+                {
95
+                    /* True "emergency mode": remove all DCT coefficients */
96
+                    if (q == QP_MAX_MAX - QP_MAX_SPEC - 1)
97
+                    {
98
+                        nrOffset[i] = INT16_MAX;
99
+                        continue;
100
+                    }
101
+
102
+                    int iThresh = i == 0 ? dcThreshold : thresh;
103
+                    if (q < iThresh)
104
+                    {
105
+                        nrOffset[i] = 0;
106
+                        continue;
107
+                    }
108
+
109
+                    int numList = (cat >= 8) * 3 + ((int)!iThresh);
110
+
111
+                    double pos = (double)(q - iThresh + 1) / (QP_MAX_MAX - QP_MAX_SPEC - iThresh);
112
+                    double start = quantF / (m_scalingList.m_quantCoef[trSize][numList][QP_MAX_SPEC % 6][i]);
113
+
114
+                    // Formula chosen as an exponential scale to vaguely mimic the effects of a higher quantizer.
115
+                    double bias = (pow(2, pos * (QP_MAX_MAX - QP_MAX_SPEC)) * 0.003 - 0.003) * start;
116
+                    nrOffset[i] = (uint16_t)X265_MIN(bias + 0.5, INT16_MAX);
117
+                }
118
+            }
119
+        }
120
+
121
+        if (!scalingEnabled)
122
+        {
123
+            m_scalingList.m_bEnabled = false;
124
+            m_scalingList.m_bDataPresent = false;
125
+            m_scalingList.setupQuantMatrices();
126
+        }
127
+    }
128
+    else
129
+        m_scalingList.setupQuantMatrices();
130
 
131
     int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
132
     int numCols = (m_param->sourceWidth  + g_maxCUSize - 1) / g_maxCUSize;
133
@@ -259,6 +337,8 @@
134
     m_encodeStartTime = x265_mdate();
135
 
136
     m_nalList.m_annexB = !!m_param->bAnnexB;
137
+
138
+    m_emitCLLSEI = p->maxCLL || p->maxFALL;
139
 }
140
 
141
 void Encoder::stopJobs()
142
@@ -318,10 +398,7 @@
143
         delete m_rateControl;
144
     }
145
 
146
-    X265_FREE(m_cuOffsetY);
147
-    X265_FREE(m_cuOffsetC);
148
-    X265_FREE(m_buOffsetY);
149
-    X265_FREE(m_buOffsetC);
150
+    X265_FREE(m_offsetEmergency);
151
 
152
     if (m_analysisFile)
153
         fclose(m_analysisFile);
154
@@ -335,7 +412,6 @@
155
         free((char*)m_param->scalingLists);
156
         free((char*)m_param->numaPools);
157
         free((char*)m_param->masteringDisplayColorVolume);
158
-        free((char*)m_param->contentLightLevelInfo);
159
 
160
         PARAM_NS::x265_param_free(m_param);
161
     }
162
@@ -361,6 +437,45 @@
163
     }
164
 }
165
 
166
+void Encoder::calcRefreshInterval(Frame* frameEnc)
167
+{
168
+    Slice* slice = frameEnc->m_encData->m_slice;
169
+    uint32_t numBlocksInRow = slice->m_sps->numCuInWidth;
170
+    FrameData::PeriodicIR* pir = &frameEnc->m_encData->m_pir;
171
+    if (slice->m_sliceType == I_SLICE)
172
+    {
173
+        pir->framesSinceLastPir = 0;
174
+        m_bQueuedIntraRefresh = 0;
175
+        /* PIR is currently only supported with ref == 1, so any intra frame effectively refreshes
176
+         * the whole frame and counts as an intra refresh. */
177
+        pir->pirEndCol = numBlocksInRow;
178
+    }
179
+    else if (slice->m_sliceType == P_SLICE)
180
+    {
181
+        Frame* ref = frameEnc->m_encData->m_slice->m_refFrameList[0][0];
182
+        int pocdiff = frameEnc->m_poc - ref->m_poc;
183
+        int numPFramesInGOP = m_param->keyframeMax / pocdiff;
184
+        int increment = (numBlocksInRow + numPFramesInGOP - 1) / numPFramesInGOP;
185
+        pir->pirEndCol = ref->m_encData->m_pir.pirEndCol;
186
+        pir->framesSinceLastPir = ref->m_encData->m_pir.framesSinceLastPir + pocdiff;
187
+        if (pir->framesSinceLastPir >= m_param->keyframeMax ||
188
+            (m_bQueuedIntraRefresh && pir->pirEndCol >= numBlocksInRow))
189
+        {
190
+            pir->pirEndCol = 0;
191
+            pir->framesSinceLastPir = 0;
192
+            m_bQueuedIntraRefresh = 0;
193
+            frameEnc->m_lowres.bKeyframe = 1;
194
+        }
195
+        pir->pirStartCol = pir->pirEndCol;
196
+        pir->pirEndCol += increment;
197
+        /* If our intra refresh has reached the right side of the frame, we're done. */
198
+        if (pir->pirEndCol >= numBlocksInRow)
199
+        {
200
+            pir->pirEndCol = numBlocksInRow;
201
+        }
202
+    }
203
+}
204
+
205
 /**
206
  * Feed one new input frame into the encoder, get one frame out. If pic_in is
207
  * NULL, a flush condition is implied and pic_in must be NULL for all subsequent
208
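calcRefreshInterval() advances the refreshed band by ceil(numBlocksInRow / numPFramesInGOP) CTU columns per P-frame, so one full left-to-right sweep completes within a keyframe interval. A quick simulation of the sweep, using the same arithmetic on invented numbers:

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const int numBlocksInRow = 30;            // e.g. a 1920-wide frame with 64x64 CTUs
        const int keyframeMax = 60, pocdiff = 1;  // one P-frame per POC step
        const int numPFramesInGOP = keyframeMax / pocdiff;
        const int increment = (numBlocksInRow + numPFramesInGOP - 1) / numPFramesInGOP;

        int endCol = 0;
        for (int frame = 1; endCol < numBlocksInRow; frame++)
        {
            int startCol = endCol;
            endCol = std::min(endCol + increment, numBlocksInRow);
            std::printf("P-frame %2d refreshes CTU columns [%d, %d)\n", frame, startCol, endCol);
        }
        return 0;
    }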
@@ -395,7 +510,7 @@
209
     {
210
         if (pic_in->colorSpace != m_param->internalCsp)
211
         {
212
-            x265_log(m_param, X265_LOG_ERROR, "Unsupported color space (%d) on input\n",
213
+            x265_log(m_param, X265_LOG_ERROR, "Unsupported chroma subsampling (%d) on input\n",
214
                      pic_in->colorSpace);
215
             return -1;
216
         }
217
@@ -411,17 +526,20 @@
218
         {
219
             inFrame = new Frame;
220
             x265_param* p = m_reconfigured? m_latestParam : m_param;
221
-            if (inFrame->create(p))
222
+            if (inFrame->create(p, pic_in->quantOffsets))
223
             {
224
                 /* the first PicYuv created is asked to generate the CU and block unit offset
225
                  * arrays which are then shared with all subsequent PicYuv (orig and recon) 
226
                  * allocated by this top level encoder */
227
-                if (m_cuOffsetY)
228
+                if (m_sps.cuOffsetY)
229
                 {
230
-                    inFrame->m_fencPic->m_cuOffsetC = m_cuOffsetC;
231
-                    inFrame->m_fencPic->m_cuOffsetY = m_cuOffsetY;
232
-                    inFrame->m_fencPic->m_buOffsetC = m_buOffsetC;
233
-                    inFrame->m_fencPic->m_buOffsetY = m_buOffsetY;
234
+                    inFrame->m_fencPic->m_cuOffsetY = m_sps.cuOffsetY;
235
+                    inFrame->m_fencPic->m_buOffsetY = m_sps.buOffsetY;
236
+                    if (pic_in->colorSpace != X265_CSP_I400)
237
+                    {
238
+                        inFrame->m_fencPic->m_cuOffsetC = m_sps.cuOffsetC;
239
+                        inFrame->m_fencPic->m_buOffsetC = m_sps.buOffsetC;
240
+                    }
241
                 }
242
                 else
243
                 {
244
@@ -435,10 +553,15 @@
245
                     }
246
                     else
247
                     {
248
-                        m_cuOffsetC = inFrame->m_fencPic->m_cuOffsetC;
249
-                        m_cuOffsetY = inFrame->m_fencPic->m_cuOffsetY;
250
-                        m_buOffsetC = inFrame->m_fencPic->m_buOffsetC;
251
-                        m_buOffsetY = inFrame->m_fencPic->m_buOffsetY;
252
+                        m_sps.cuOffsetY = inFrame->m_fencPic->m_cuOffsetY;
253
+                        m_sps.buOffsetY = inFrame->m_fencPic->m_buOffsetY;
254
+                        if (pic_in->colorSpace != X265_CSP_I400)
255
+                        {
256
+                            m_sps.cuOffsetC = inFrame->m_fencPic->m_cuOffsetC;
257
+                            m_sps.cuOffsetY = inFrame->m_fencPic->m_cuOffsetY;
258
+                            m_sps.buOffsetC = inFrame->m_fencPic->m_buOffsetC;
259
+                            m_sps.buOffsetY = inFrame->m_fencPic->m_buOffsetY;
260
+                        }
261
                     }
262
                 }
263
             }
264
@@ -454,17 +577,27 @@
265
         else
266
         {
267
             inFrame = m_dpb->m_freeList.popBack();
268
+            /* Set lowres scenecut and satdCost here to avoid overwriting the ANALYSIS_READ
270
+               decision by lowres init */
270
+            inFrame->m_lowres.bScenecut = false;
271
+            inFrame->m_lowres.satdCost = (int64_t)-1;
272
             inFrame->m_lowresInit = false;
273
         }
274
 
275
         /* Copy input picture into a Frame and PicYuv, send to lookahead */
276
-        inFrame->m_fencPic->copyFromPicture(*pic_in, m_sps.conformanceWindow.rightOffset, m_sps.conformanceWindow.bottomOffset);
277
+        inFrame->m_fencPic->copyFromPicture(*pic_in, *m_param, m_sps.conformanceWindow.rightOffset, m_sps.conformanceWindow.bottomOffset);
278
 
279
         inFrame->m_poc       = ++m_pocLast;
280
         inFrame->m_userData  = pic_in->userData;
281
         inFrame->m_pts       = pic_in->pts;
282
         inFrame->m_forceqp   = pic_in->forceqp;
283
         inFrame->m_param     = m_reconfigured ? m_latestParam : m_param;
284
+        
285
+        if (pic_in->quantOffsets != NULL)
286
+        {
287
+            int cuCount = inFrame->m_lowres.maxBlocksInRow * inFrame->m_lowres.maxBlocksInCol;
288
+            memcpy(inFrame->m_quantOffsets, pic_in->quantOffsets, cuCount * sizeof(float));
289
+        }
290
 
291
         if (m_pocLast == 0)
292
             m_firstPts = inFrame->m_pts;
293
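Quant offsets are consumed as one float per lookahead block; judging from the memcpy above, the expected count is maxBlocksInRow * maxBlocksInCol on the lowres grid (a 16x16 full-resolution unit). A hedged sketch of feeding a per-block QP offset map through the public picture struct (the allocation size and block geometry are the caller's responsibility and assumptions here; pic.quantOffsets is the field added in the 1.9 API):

    // blocksX/blocksY: lookahead block dimensions, e.g. (width + 15) / 16 etc.
    void attachQuantOffsets(x265_picture& pic, float* offsets, int blocksX, int blocksY)
    {
        // Example policy: bias QP down (higher quality) in the top-left quadrant.
        for (int y = 0; y < blocksY; y++)
            for (int x = 0; x < blocksX; x++)
                offsets[y * blocksX + x] = (x < blocksX / 2 && y < blocksY / 2) ? -2.0f : 0.0f;

        pic.quantOffsets = offsets;  // consumed per frame by the encoder
    }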
@@ -496,11 +629,15 @@
294
             readAnalysisFile(&inputPic->analysisData, inFrame->m_poc);
295
             inFrame->m_analysisData.poc = inFrame->m_poc;
296
             inFrame->m_analysisData.sliceType = inputPic->analysisData.sliceType;
297
+            inFrame->m_analysisData.bScenecut = inputPic->analysisData.bScenecut;
298
+            inFrame->m_analysisData.satdCost = inputPic->analysisData.satdCost;
299
             inFrame->m_analysisData.numCUsInFrame = inputPic->analysisData.numCUsInFrame;
300
             inFrame->m_analysisData.numPartitions = inputPic->analysisData.numPartitions;
301
             inFrame->m_analysisData.interData = inputPic->analysisData.interData;
302
             inFrame->m_analysisData.intraData = inputPic->analysisData.intraData;
303
             sliceType = inputPic->analysisData.sliceType;
304
+            inFrame->m_lowres.bScenecut = !!inFrame->m_analysisData.bScenecut;
305
+            inFrame->m_lowres.satdCost = inFrame->m_analysisData.satdCost;
306
         }
307
 
308
         m_lookahead->addPicture(*inFrame, sliceType);
309
@@ -563,16 +700,21 @@
310
 
311
                 pic_out->planes[0] = recpic->m_picOrg[0];
312
                 pic_out->stride[0] = (int)(recpic->m_stride * sizeof(pixel));
313
-                pic_out->planes[1] = recpic->m_picOrg[1];
314
-                pic_out->stride[1] = (int)(recpic->m_strideC * sizeof(pixel));
315
-                pic_out->planes[2] = recpic->m_picOrg[2];
316
-                pic_out->stride[2] = (int)(recpic->m_strideC * sizeof(pixel));
317
+                if (m_param->internalCsp != X265_CSP_I400)
318
+                {
319
+                    pic_out->planes[1] = recpic->m_picOrg[1];
320
+                    pic_out->stride[1] = (int)(recpic->m_strideC * sizeof(pixel));
321
+                    pic_out->planes[2] = recpic->m_picOrg[2];
322
+                    pic_out->stride[2] = (int)(recpic->m_strideC * sizeof(pixel));
323
+                }
324
 
325
                 /* Dump analysis data from pic_out to file in save mode and free */
326
                 if (m_param->analysisMode == X265_ANALYSIS_SAVE)
327
                 {
328
                     pic_out->analysisData.poc = pic_out->poc;
329
                     pic_out->analysisData.sliceType = pic_out->sliceType;
330
+                    pic_out->analysisData.bScenecut = outFrame->m_lowres.bScenecut;
331
+                    pic_out->analysisData.satdCost  = outFrame->m_lowres.satdCost;                    
332
                     pic_out->analysisData.numCUsInFrame = outFrame->m_analysisData.numCUsInFrame;
333
                     pic_out->analysisData.numPartitions = outFrame->m_analysisData.numPartitions;
334
                     pic_out->analysisData.interData = outFrame->m_analysisData.interData;
335
@@ -581,36 +723,57 @@
336
                     freeAnalysis(&pic_out->analysisData);
337
                 }
338
             }
339
-            if (slice->m_sliceType == P_SLICE)
340
+            if (m_param->internalCsp == X265_CSP_I400)
341
             {
342
-                if (slice->m_weightPredTable[0][0][0].bPresentFlag)
343
-                    m_numLumaWPFrames++;
344
-                if (slice->m_weightPredTable[0][0][1].bPresentFlag ||
345
-                    slice->m_weightPredTable[0][0][2].bPresentFlag)
346
-                    m_numChromaWPFrames++;
347
+                if (slice->m_sliceType == P_SLICE)
348
+                {
349
+                    if (slice->m_weightPredTable[0][0][0].bPresentFlag)
350
+                        m_numLumaWPFrames++;
351
+                }
352
+                else if (slice->m_sliceType == B_SLICE)
353
+                {
354
+                    bool bLuma = false;
355
+                    for (int l = 0; l < 2; l++)
356
+                    {
357
+                        if (slice->m_weightPredTable[l][0][0].bPresentFlag)
358
+                            bLuma = true;
359
+                    }
360
+                    if (bLuma)
361
+                        m_numLumaWPBiFrames++;
362
+                }
363
             }
364
-            else if (slice->m_sliceType == B_SLICE)
365
+            else
366
             {
367
-                bool bLuma = false, bChroma = false;
368
-                for (int l = 0; l < 2; l++)
369
+                if (slice->m_sliceType == P_SLICE)
370
                 {
371
-                    if (slice->m_weightPredTable[l][0][0].bPresentFlag)
372
-                        bLuma = true;
373
-                    if (slice->m_weightPredTable[l][0][1].bPresentFlag ||
374
-                        slice->m_weightPredTable[l][0][2].bPresentFlag)
375
-                        bChroma = true;
376
+                    if (slice->m_weightPredTable[0][0][0].bPresentFlag)
377
+                        m_numLumaWPFrames++;
378
+                    if (slice->m_weightPredTable[0][0][1].bPresentFlag ||
379
+                        slice->m_weightPredTable[0][0][2].bPresentFlag)
380
+                        m_numChromaWPFrames++;
381
                 }
382
+                else if (slice->m_sliceType == B_SLICE)
383
+                {
384
+                    bool bLuma = false, bChroma = false;
385
+                    for (int l = 0; l < 2; l++)
386
+                    {
387
+                        if (slice->m_weightPredTable[l][0][0].bPresentFlag)
388
+                            bLuma = true;
389
+                        if (slice->m_weightPredTable[l][0][1].bPresentFlag ||
390
+                            slice->m_weightPredTable[l][0][2].bPresentFlag)
391
+                            bChroma = true;
392
+                    }
393
 
394
-                if (bLuma)
395
-                    m_numLumaWPBiFrames++;
396
-                if (bChroma)
397
-                    m_numChromaWPBiFrames++;
398
+                    if (bLuma)
399
+                        m_numLumaWPBiFrames++;
400
+                    if (bChroma)
401
+                        m_numChromaWPBiFrames++;
402
+                }
403
             }
404
-
405
             if (m_aborted)
406
                 return -1;
407
 
408
-            finishFrameStats(outFrame, curEncoder, curEncoder->m_accessUnitBits, frameData);
409
+            finishFrameStats(outFrame, curEncoder, frameData, m_pocLast);
410
 
411
             /* Write RateControl Frame level stats in multipass encodes */
412
             if (m_param->rc.bStatWrite)
413
@@ -638,10 +801,10 @@
414
         if (frameEnc && !pass)
415
         {
416
             /* give this frame a FrameData instance before encoding */
417
-            if (m_dpb->m_picSymFreeList)
418
+            if (m_dpb->m_frameDataFreeList)
419
             {
420
-                frameEnc->m_encData = m_dpb->m_picSymFreeList;
421
-                m_dpb->m_picSymFreeList = m_dpb->m_picSymFreeList->m_freeListNext;
422
+                frameEnc->m_encData = m_dpb->m_frameDataFreeList;
423
+                m_dpb->m_frameDataFreeList = m_dpb->m_frameDataFreeList->m_freeListNext;
424
                 frameEnc->reinit(m_sps);
425
             }
426
             else
427
@@ -652,10 +815,6 @@
428
                 slice->m_pps = &m_pps;
429
                 slice->m_maxNumMergeCand = m_param->maxNumMergeCand;
430
                 slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * NUM_4x4_PARTITIONS);
431
-                frameEnc->m_reconPic->m_cuOffsetC = m_cuOffsetC;
432
-                frameEnc->m_reconPic->m_cuOffsetY = m_cuOffsetY;
433
-                frameEnc->m_reconPic->m_buOffsetC = m_buOffsetC;
434
-                frameEnc->m_reconPic->m_buOffsetY = m_buOffsetY;
435
             }
436
 
437
             curEncoder->m_rce.encodeOrder = m_encodedFrameNum++;
438
@@ -690,13 +849,15 @@
439
 
440
             if (m_param->rc.rateControlMode != X265_RC_CQP)
441
                 m_lookahead->getEstimatedPictureCost(frameEnc);
442
+            if (m_param->bIntraRefresh)
443
+                 calcRefreshInterval(frameEnc);
444
 
445
             /* Allow FrameEncoder::compressFrame() to start in the frame encoder thread */
446
             if (!curEncoder->startCompressFrame(frameEnc))
447
                 m_aborted = true;
448
         }
449
         else if (m_encodedFrameNum)
450
-            m_rateControl->setFinalFrameCount(m_encodedFrameNum); 
451
+            m_rateControl->setFinalFrameCount(m_encodedFrameNum);
452
     }
453
     while (m_bZeroLatency && ++pass < 2);
454
 
455
@@ -708,7 +869,7 @@
456
     encParam->maxNumReferences = param->maxNumReferences; // never uses more refs than specified in stream headers
457
     encParam->bEnableLoopFilter = param->bEnableLoopFilter;
458
     encParam->deblockingFilterTCOffset = param->deblockingFilterTCOffset;
459
-    encParam->deblockingFilterBetaOffset = param->deblockingFilterBetaOffset; 
460
+    encParam->deblockingFilterBetaOffset = param->deblockingFilterBetaOffset;
461
     encParam->bEnableFastIntra = param->bEnableFastIntra;
462
     encParam->bEnableEarlySkip = param->bEnableEarlySkip;
463
     encParam->bEnableTemporalMvp = param->bEnableTemporalMvp;
464
@@ -943,7 +1104,7 @@
465
                  (double)cuStats.countPModeMasters / cuStats.totalCTUs,
466
                  (double)cuStats.pmodeBlockTime / cuStats.countPModeMasters);
467
         x265_log(m_param, X265_LOG_INFO, "CU:       %.3lf slaves per PMODE master, each took average of %.3lf ms\n",
468
-                 (double)cuStats.countPModeTasks / cuStats.countPModeMasters, 
469
+                 (double)cuStats.countPModeTasks / cuStats.countPModeMasters,
470
                  ELAPSED_MSEC(cuStats.pmodeTime) / cuStats.countPModeTasks);
471
     }
472
 
473
@@ -1050,6 +1211,15 @@
474
         stats->statsB.psnrU   = m_analyzeB.m_psnrSumU / (double)m_analyzeB.m_numPics;
475
         stats->statsB.psnrV   = m_analyzeB.m_psnrSumV / (double)m_analyzeB.m_numPics;
476
         stats->statsB.ssim    = x265_ssim2dB(m_analyzeB.m_globalSsim / (double)m_analyzeB.m_numPics);
477
+
478
+        stats->maxCLL         = m_analyzeAll.m_maxCLL;
479
+        stats->maxFALL        = (uint16_t)(m_analyzeAll.m_maxFALL / m_analyzeAll.m_numPics);
480
+
481
+        if (m_emitCLLSEI)
482
+        {
483
+            m_param->maxCLL = stats->maxCLL;
484
+            m_param->maxFALL = stats->maxFALL;
485
+        }
486
     }
487
 
488
     /* If new statistics are added to x265_stats, we must check here whether the
489
@@ -1057,9 +1227,10 @@
490
      * future safety) */
491
 }
492
 
493
-void Encoder::finishFrameStats(Frame* curFrame, FrameEncoder *curEncoder, uint64_t bits, x265_frame_stats* frameStats)
494
+void Encoder::finishFrameStats(Frame* curFrame, FrameEncoder *curEncoder, x265_frame_stats* frameStats, int inPoc)
495
 {
496
     PicYuv* reconPic = curFrame->m_reconPic;
497
+    uint64_t bits = curEncoder->m_accessUnitBits;
498
 
499
     //===== calculate PSNR =====
500
     int width  = reconPic->m_picWidth - m_sps.conformanceWindow.rightOffset;
501
@@ -1123,6 +1294,9 @@
502
             m_analyzeB.addSsim(ssim);
503
     }
504
 
505
+    m_analyzeAll.m_maxFALL += curFrame->m_fencPic->m_avgLumaLevel;
506
+    m_analyzeAll.m_maxCLL = X265_MAX(m_analyzeAll.m_maxCLL, curFrame->m_fencPic->m_maxLumaLevel);
507
+
508
     char c = (slice->isIntra() ? 'I' : slice->isInterP() ? 'P' : 'B');
509
     int poc = slice->m_poc;
510
     if (!IS_REFERENCED(curFrame))
511
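Consistent with the accumulation above, maxCLL is the maximum luma sample level over the whole sequence and maxFALL the mean of per-frame average luma (m_maxFALL is divided by m_numPics when stats are reported). A small self-contained illustration of that aggregation:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    struct FrameLuma { uint16_t maxLevel; double avgLevel; };

    void contentLightLevel(const std::vector<FrameLuma>& frames,
                           uint16_t& maxCLL, uint16_t& maxFALL)
    {
        maxCLL = 0;
        double fallSum = 0;
        for (const FrameLuma& f : frames)
        {
            maxCLL = std::max(maxCLL, f.maxLevel);  // brightest sample anywhere
            fallSum += f.avgLevel;                  // per-frame average light level
        }
        maxFALL = frames.empty() ? 0 : (uint16_t)(fallSum / frames.size());
    }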
@@ -1130,11 +1304,15 @@
512
 
513
     if (frameStats)
514
     {
515
+        const int picOrderCntLSB = (slice->m_poc - slice->m_lastIDR + (1 << BITS_FOR_POC)) % (1 << BITS_FOR_POC);
516
+
517
         frameStats->encoderOrder = m_outputCount++;
518
         frameStats->sliceType = c;
519
-        frameStats->poc = poc;
520
+        frameStats->poc = picOrderCntLSB;
521
         frameStats->qp = curEncData.m_avgQpAq;
522
         frameStats->bits = bits;
523
+        frameStats->bScenecut = curFrame->m_lowres.bScenecut;
524
+        frameStats->frameLatency = inPoc - poc;
525
         if (m_param->rc.rateControlMode == X265_RC_CRF)
526
             frameStats->rateFactor = curEncData.m_rateFactor;
527
         frameStats->psnrY = psnrY;
528
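The logged POC is now the picOrderCntLSB actually coded in the slice header: the IDR-relative POC reduced modulo 2^BITS_FOR_POC, with a +2^BITS_FOR_POC term keeping the dividend non-negative. A worked sketch of the wraparound (BITS_FOR_POC assumed to be 8 here purely for the example):

    #include <cstdio>

    int pocLsb(int poc, int lastIDR, int bitsForPoc = 8)
    {
        int wrap = 1 << bitsForPoc;          // 256 for 8 bits
        return (poc - lastIDR + wrap) % wrap;
    }

    int main()
    {
        std::printf("%d\n", pocLsb(300, 0));   // 44: 300 wraps past 255
        std::printf("%d\n", pocLsb(5, 260));   // 1: negative difference handled by +wrap
        return 0;
    }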
@@ -1173,8 +1351,9 @@
529
         frameStats->avgChromaDistortion     = curFrame->m_encData->m_frameStats.avgChromaDistortion;
530
         frameStats->avgLumaDistortion       = curFrame->m_encData->m_frameStats.avgLumaDistortion;
531
         frameStats->avgPsyEnergy            = curFrame->m_encData->m_frameStats.avgPsyEnergy;
532
-        frameStats->avgLumaLevel            = curFrame->m_encData->m_frameStats.avgLumaLevel;
533
-        frameStats->maxLumaLevel            = curFrame->m_encData->m_frameStats.maxLumaLevel;
534
+        frameStats->avgResEnergy            = curFrame->m_encData->m_frameStats.avgResEnergy;
535
+        frameStats->avgLumaLevel            = curFrame->m_fencPic->m_avgLumaLevel;
536
+        frameStats->maxLumaLevel            = curFrame->m_fencPic->m_maxLumaLevel;
537
         for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
538
         {
539
             frameStats->cuStats.percentSkipCu[depth]  = curFrame->m_encData->m_frameStats.percentSkipCu[depth];
540
@@ -1227,18 +1406,15 @@
541
             x265_log(m_param, X265_LOG_WARNING, "unable to parse mastering display color volume info\n");
542
     }
543
 
544
-    if (m_param->contentLightLevelInfo)
545
+    if (m_emitCLLSEI)
546
     {
547
         SEIContentLightLevel cllsei;
548
-        if (cllsei.parse(m_param->contentLightLevelInfo))
549
-        {
550
-            bs.resetBits();
551
-            cllsei.write(bs, m_sps);
552
-            bs.writeByteAlignment();
553
-            list.serialize(NAL_UNIT_PREFIX_SEI, bs);
554
-        }
555
-        else
556
-            x265_log(m_param, X265_LOG_WARNING, "unable to parse content light level info\n");
557
+        cllsei.max_content_light_level = m_param->maxCLL;
558
+        cllsei.max_pic_average_light_level = m_param->maxFALL;
559
+        bs.resetBits();
560
+        cllsei.write(bs, m_sps);
561
+        bs.writeByteAlignment();
562
+        list.serialize(NAL_UNIT_PREFIX_SEI, bs);
563
     }
564
 
565
     if (m_param->bEmitInfoSEI)
566
@@ -1425,6 +1601,7 @@
567
         p->rc.cuTree = 0;
568
         p->bEnableWeightedPred = 0;
569
         p->bEnableWeightedBiPred = 0;
570
+        p->bIntraRefresh = 0;
571
 
572
         /* SPSs shall have sps_max_dec_pic_buffering_minus1[ sps_max_sub_layers_minus1 ] equal to 0 only */
573
         p->maxNumReferences = 1;
574
@@ -1515,10 +1692,38 @@
575
 
576
     if (p->totalFrames && p->totalFrames <= 2 * ((float)p->fpsNum) / p->fpsDenom && p->rc.bStrictCbr)
577
         p->lookaheadDepth = p->totalFrames;
578
+    if (p->bIntraRefresh)
579
+    {
580
+        int numCuInWidth = (m_param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize;
581
+        if (p->maxNumReferences > 1)
582
+        {
583
+            x265_log(p,  X265_LOG_WARNING, "Max References > 1 + intra-refresh is not supported , setting max num references = 1\n");
584
+            p->maxNumReferences = 1;
585
+        }
586
+
587
+        if (p->bBPyramid && p->bframes)
588
+            x265_log(p,  X265_LOG_WARNING, "B pyramid cannot be enabled when max references is 1, Disabling B pyramid\n");
589
+        p->bBPyramid = 0;
590
+
591
+
592
+        if (p->bOpenGOP)
593
+        {
594
+            x265_log(p,  X265_LOG_WARNING, "Open Gop disabled, Intra Refresh is not compatible with openGop\n");
595
+            p->bOpenGOP = 0;
596
+        }
597
+
598
+        x265_log(p,  X265_LOG_WARNING, "Scenecut is disabled when Intra Refresh is enabled\n");
599
+
600
+        if (((float)numCuInWidth - 1) / m_param->keyframeMax > 1)
601
+            x265_log(p,  X265_LOG_WARNING, "Keyint value is very low.It leads to frequent intra refreshes, can be almost every frame."
602
+                     "Prefered use case would be high keyint value or an API call to refresh when necessary\n");
603
+
604
+    }
605
+
606
 
607
     if (p->scalingLists && p->internalCsp == X265_CSP_I444)
608
     {
609
-        x265_log(p, X265_LOG_WARNING, "Scaling lists are not yet supported for 4:4:4 color space\n");
610
+        x265_log(p, X265_LOG_WARNING, "Scaling lists are not yet supported for 4:4:4 chroma subsampling\n");
611
         p->scalingLists = 0;
612
     }
613
 
614
@@ -1536,6 +1741,17 @@
615
         x265_log(p, X265_LOG_WARNING, "Analysis load/save options incompatible with pmode/pme, Disabling pmode/pme\n");
616
         p->bDistributeMotionEstimation = p->bDistributeModeAnalysis = 0;
617
     }
618
+    if (p->analysisMode && p->rc.cuTree)
619
+    {
620
+        x265_log(p, X265_LOG_WARNING, "Analysis load/save options works only with cu-tree off, Disabling cu-tree\n");
621
+        p->rc.cuTree = 0;
622
+    }
623
+
624
+    if (p->bDistributeModeAnalysis && (p->limitReferences >> 1) && 1)
625
+    {
626
+        x265_log(p, X265_LOG_WARNING, "Limit reference options 2 and 3 are not supported with pmode. Disabling limit reference\n");
627
+        p->limitReferences = 0;
628
+    }
629
 
630
     if (p->bEnableTemporalSubLayers && !p->bframes)
631
     {
632
@@ -1641,6 +1857,7 @@
633
 
634
 void Encoder::allocAnalysis(x265_analysis_data* analysis)
635
 {
636
+    X265_CHECK(analysis->sliceType, "invalid slice type\n");
637
     analysis->interData = analysis->intraData = NULL;
638
     if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
639
     {
640
@@ -1654,12 +1871,14 @@
641
     }
642
     else
643
     {
644
+        int numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2;
645
         analysis_inter_data *interData = (analysis_inter_data*)analysis->interData;
646
         CHECKED_MALLOC_ZERO(interData, analysis_inter_data, 1);
647
-        CHECKED_MALLOC_ZERO(interData->ref, int32_t, analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2);
648
+        CHECKED_MALLOC_ZERO(interData->ref, int32_t, analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir);
649
         CHECKED_MALLOC(interData->depth, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
650
         CHECKED_MALLOC(interData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
651
         CHECKED_MALLOC_ZERO(interData->bestMergeCand, uint32_t, analysis->numCUsInFrame * CUGeom::MAX_GEOMS);
652
+        CHECKED_MALLOC_ZERO(interData->mv, MV, analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir);
653
         analysis->interData = interData;
654
     }
655
     return;
656
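numDir above sizes the inter analysis buffers by prediction direction: P slices carry only list-0 references while B slices carry list 0 and list 1, so the ref array and the new mv array halve for P frames compared to the fixed *2 that 1.8 allocated. A stand-alone illustration of the byte count involved (MV and X265_MAX_PRED_MODE_PER_CTU are x265 types/constants; stand-ins are used here):

    #include <cstddef>
    #include <cstdint>

    struct MV16 { int16_t x, y; };   // stand-in for x265's 4-byte MV

    size_t interMvBytes(bool isBSlice, size_t numCUsInFrame, size_t maxPredModesPerCtu)
    {
        size_t numDir = isBSlice ? 2 : 1;    // L0 only vs. L0 + L1
        return numCUsInFrame * maxPredModesPerCtu * numDir * sizeof(MV16);
    }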
@@ -1685,6 +1904,7 @@
657
         X265_FREE(((analysis_inter_data*)analysis->interData)->depth);
658
         X265_FREE(((analysis_inter_data*)analysis->interData)->modes);
659
         X265_FREE(((analysis_inter_data*)analysis->interData)->bestMergeCand);
660
+        X265_FREE(((analysis_inter_data*)analysis->interData)->mv);
661
         X265_FREE(analysis->interData);
662
     }
663
 }
664
@@ -1731,6 +1951,8 @@
665
     analysis->poc = poc;
666
     analysis->frameRecordSize = frameRecordSize;
667
     X265_FREAD(&analysis->sliceType, sizeof(int), 1, m_analysisFile);
668
+    X265_FREAD(&analysis->bScenecut, sizeof(int), 1, m_analysisFile);
669
+    X265_FREAD(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFile);
670
     X265_FREAD(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFile);
671
     X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFile);
672
 
673
@@ -1752,6 +1974,7 @@
674
         X265_FREAD(((analysis_inter_data *)analysis->interData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
675
         X265_FREAD(((analysis_inter_data *)analysis->interData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
676
         X265_FREAD(((analysis_inter_data *)analysis->interData)->bestMergeCand, sizeof(uint32_t), analysis->numCUsInFrame * CUGeom::MAX_GEOMS, m_analysisFile);
677
+        X265_FREAD(((analysis_inter_data *)analysis->interData)->mv, sizeof(MV), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU, m_analysisFile);
678
         consumedBytes += frameRecordSize;
679
         totalConsumedBytes = consumedBytes;
680
     }
681
@@ -1761,6 +1984,7 @@
682
         X265_FREAD(((analysis_inter_data *)analysis->interData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
683
         X265_FREAD(((analysis_inter_data *)analysis->interData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
684
         X265_FREAD(((analysis_inter_data *)analysis->interData)->bestMergeCand, sizeof(uint32_t), analysis->numCUsInFrame * CUGeom::MAX_GEOMS, m_analysisFile);
685
+        X265_FREAD(((analysis_inter_data *)analysis->interData)->mv, sizeof(MV), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2, m_analysisFile);
686
         consumedBytes += frameRecordSize;
687
     }
688
 #undef X265_FREAD
689
@@ -1780,7 +2004,7 @@
690
 
691
     /* calculate frameRecordSize */
692
     analysis->frameRecordSize = sizeof(analysis->frameRecordSize) + sizeof(analysis->poc) + sizeof(analysis->sliceType) +
693
-                      sizeof(analysis->numCUsInFrame) + sizeof(analysis->numPartitions);
694
+                      sizeof(analysis->numCUsInFrame) + sizeof(analysis->numPartitions) + sizeof(analysis->bScenecut) + sizeof(analysis->satdCost);
695
     if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
696
         analysis->frameRecordSize += sizeof(uint8_t) * analysis->numCUsInFrame * analysis->numPartitions * 4;
697
     else if (analysis->sliceType == X265_TYPE_P)
698
@@ -1788,17 +2012,20 @@
699
         analysis->frameRecordSize += sizeof(int32_t) * analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU;
700
         analysis->frameRecordSize += sizeof(uint8_t) * analysis->numCUsInFrame * analysis->numPartitions * 2;
701
         analysis->frameRecordSize += sizeof(uint32_t) * analysis->numCUsInFrame * CUGeom::MAX_GEOMS;
702
+        analysis->frameRecordSize += sizeof(MV) * analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU;
703
     }
704
     else
705
     {
706
         analysis->frameRecordSize += sizeof(int32_t) * analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2;
707
         analysis->frameRecordSize += sizeof(uint8_t) * analysis->numCUsInFrame * analysis->numPartitions * 2;
708
         analysis->frameRecordSize += sizeof(uint32_t) * analysis->numCUsInFrame * CUGeom::MAX_GEOMS;
709
+        analysis->frameRecordSize += sizeof(MV) * analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2;
710
     }
711
-
712
     X265_FWRITE(&analysis->frameRecordSize, sizeof(uint32_t), 1, m_analysisFile);
713
     X265_FWRITE(&analysis->poc, sizeof(int), 1, m_analysisFile);
714
     X265_FWRITE(&analysis->sliceType, sizeof(int), 1, m_analysisFile);
715
+    X265_FWRITE(&analysis->bScenecut, sizeof(int), 1, m_analysisFile);
716
+    X265_FWRITE(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFile);
717
     X265_FWRITE(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFile);
718
     X265_FWRITE(&analysis->numPartitions, sizeof(int), 1, m_analysisFile);
719
 
720
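Taken together, the read and write paths above imply this per-frame record layout for the analysis file (P-slice case; B slices double the ref and mv payloads, and I slices instead store four uint8_t planes of numCUsInFrame * numPartitions entries):

    uint32_t frameRecordSize
    int      poc
    int      sliceType
    int      bScenecut                                         (new in 1.9)
    int64_t  satdCost                                          (new in 1.9)
    int      numCUsInFrame
    int      numPartitions
    int32_t  ref[numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU]
    uint8_t  depth[numCUsInFrame * numPartitions]
    uint8_t  modes[numCUsInFrame * numPartitions]
    uint32_t bestMergeCand[numCUsInFrame * CUGeom::MAX_GEOMS]
    MV       mv[numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU]    (new in 1.9)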
@@ -1815,6 +2042,7 @@
721
         X265_FWRITE(((analysis_inter_data*)analysis->interData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
722
         X265_FWRITE(((analysis_inter_data*)analysis->interData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
723
         X265_FWRITE(((analysis_inter_data*)analysis->interData)->bestMergeCand, sizeof(uint32_t), analysis->numCUsInFrame * CUGeom::MAX_GEOMS, m_analysisFile);
724
+        X265_FWRITE(((analysis_inter_data*)analysis->interData)->mv, sizeof(MV), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU, m_analysisFile);
725
     }
726
     else
727
     {
728
@@ -1822,6 +2050,7 @@
729
         X265_FWRITE(((analysis_inter_data*)analysis->interData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
730
         X265_FWRITE(((analysis_inter_data*)analysis->interData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
731
         X265_FWRITE(((analysis_inter_data*)analysis->interData)->bestMergeCand, sizeof(uint32_t), analysis->numCUsInFrame * CUGeom::MAX_GEOMS, m_analysisFile);
732
+        X265_FWRITE(((analysis_inter_data*)analysis->interData)->mv, sizeof(MV), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2, m_analysisFile);
733
     }
734
 #undef X265_FWRITE
735
 }
736
x265_1.8.tar.gz/source/encoder/encoder.h -> x265_1.9.tar.gz/source/encoder/encoder.h Changed
126
 
1
@@ -45,8 +45,10 @@
2
     double        m_psnrSumV;
3
     double        m_globalSsim;
4
     double        m_totalQp;
5
+    double        m_maxFALL;
6
     uint64_t      m_accBits;
7
     uint32_t      m_numPics;
8
+    uint16_t      m_maxCLL;
9
 
10
     EncStats()
11
     {
12
@@ -54,6 +56,8 @@
13
         m_accBits = 0;
14
         m_numPics = 0;
15
         m_totalQp = 0;
16
+        m_maxCLL = 0;
17
+        m_maxFALL = 0;
18
     }
19
 
20
     void addQP(double aveQp);
21
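m_maxCLL and m_maxFALL back the new maxCLL/maxFALL frame statistics mentioned in the 1.9 changelog. In CEA-861.3 terms, MaxCLL is the largest individual pixel light level in the sequence and MaxFALL the largest frame-average level; a conceptual sketch of the per-frame update follows (the exact accumulation x265 performs across its I/P/B EncStats instances is not shown in this hunk):

    #include <algorithm>
    #include <cstdint>

    struct LightLevelStats
    {
        uint16_t maxCLL  = 0;    // peak pixel level seen so far
        double   maxFALL = 0;    // peak frame-average level seen so far

        void addFrame(uint16_t framePeak, double frameAverage)
        {
            maxCLL  = std::max(maxCLL, framePeak);
            maxFALL = std::max(maxFALL, frameAverage);
        }
    };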
@@ -75,64 +79,62 @@
22
 {
23
 public:
24
 
25
-    int                m_pocLast;         // time index (POC)
26
-    int                m_encodedFrameNum;
27
-    int                m_outputCount;
28
+    uint32_t           m_residualSumEmergency[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
29
+    uint32_t           m_countEmergency[MAX_NUM_TR_CATEGORIES];
30
+    uint16_t           (*m_offsetEmergency)[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
31
 
32
-    int                m_bframeDelay;
33
     int64_t            m_firstPts;
34
     int64_t            m_bframeDelayTime;
35
     int64_t            m_prevReorderedPts[2];
36
+    int64_t            m_encodeStartTime;
37
 
38
-    ThreadPool*        m_threadPool;
39
-    FrameEncoder*      m_frameEncoder[X265_MAX_FRAME_THREADS];
40
-    DPB*               m_dpb;
41
-
42
-    Frame*             m_exportedPic;
43
-
44
+    int                m_pocLast;         // time index (POC)
45
+    int                m_encodedFrameNum;
46
+    int                m_outputCount;
47
+    int                m_bframeDelay;
48
     int                m_numPools;
49
     int                m_curEncoder;
50
 
51
-    /* cached PicYuv offset arrays, shared by all instances of
52
-     * PicYuv created by this encoder */
53
-    intptr_t*          m_cuOffsetY;
54
-    intptr_t*          m_cuOffsetC;
55
-    intptr_t*          m_buOffsetY;
56
-    intptr_t*          m_buOffsetC;
57
-
58
-    /* Collect statistics globally */
59
-    EncStats           m_analyzeAll;
60
-    EncStats           m_analyzeI;
61
-    EncStats           m_analyzeP;
62
-    EncStats           m_analyzeB;
63
-    int64_t            m_encodeStartTime;
64
-
65
     // weighted prediction
66
     int                m_numLumaWPFrames;    // number of P frames with weighted luma reference
67
     int                m_numChromaWPFrames;  // number of P frames with weighted chroma reference
68
     int                m_numLumaWPBiFrames;  // number of B frames with weighted luma reference
69
     int                m_numChromaWPBiFrames; // number of B frames with weighted chroma reference
70
-    FILE*              m_analysisFile;
71
     int                m_conformanceMode;
72
-    VPS                m_vps;
73
-    SPS                m_sps;
74
-    PPS                m_pps;
75
-    NALList            m_nalList;
76
-    ScalingList        m_scalingList;      // quantization matrix information
77
-
78
     int                m_lastBPSEI;
79
     uint32_t           m_numDelayedPic;
80
 
81
+    ThreadPool*        m_threadPool;
82
+    FrameEncoder*      m_frameEncoder[X265_MAX_FRAME_THREADS];
83
+    DPB*               m_dpb;
84
+    Frame*             m_exportedPic;
85
+    FILE*              m_analysisFile;
86
     x265_param*        m_param;
87
     x265_param*        m_latestParam;
88
     RateControl*       m_rateControl;
89
     Lookahead*         m_lookahead;
90
+
91
+    /* Collect statistics globally */
92
+    EncStats           m_analyzeAll;
93
+    EncStats           m_analyzeI;
94
+    EncStats           m_analyzeP;
95
+    EncStats           m_analyzeB;
96
+    VPS                m_vps;
97
+    SPS                m_sps;
98
+    PPS                m_pps;
99
+    NALList            m_nalList;
100
+    ScalingList        m_scalingList;      // quantization matrix information
101
     Window             m_conformanceWindow;
102
 
103
+    bool               m_emitCLLSEI;
104
     bool               m_bZeroLatency;     // x265_encoder_encode() returns NALs for the input picture, zero lag
105
     bool               m_aborted;          // fatal error detected
106
     bool               m_reconfigured;      // reconfigure of encoder detected
107
 
108
+    /* Begin an intra refresh if one is not in progress, otherwise queue one to begin as soon as the current
109
+     * one is done. Requires bIntraRefresh to be set. */
110
+    int                m_bQueuedIntraRefresh;
111
+
112
     Encoder();
113
     ~Encoder() {}
114
 
115
@@ -164,7 +166,9 @@
116
 
117
     void writeAnalysisFile(x265_analysis_data* pic);
118
 
119
-    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, uint64_t bits, x265_frame_stats* frameStats);
120
+    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, x265_frame_stats* frameStats, int inPoc);
121
+
122
+    void calcRefreshInterval(Frame* frameEnc);
123
 
124
 protected:
125
 
126
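m_bQueuedIntraRefresh records a refresh request that arrives while a sweep is already running, matching the "API call to refresh when necessary" wording in encoder.cpp above. A hedged usage sketch, assuming the public entry point is x265_encoder_intra_refresh() (the API name itself is not shown in this diff):

    #include <x265.h>

    // Request a refresh at a scene change; if a sweep is already in progress
    // the encoder queues it to start as soon as the current one completes.
    void onSceneChange(x265_encoder* enc)
    {
        x265_encoder_intra_refresh(enc);   // assumed 1.9 API, per the queued-refresh flag
    }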
x265_1.8.tar.gz/source/encoder/entropy.cpp -> x265_1.9.tar.gz/source/encoder/entropy.cpp Changed
252
 
1
@@ -2,6 +2,7 @@
2
 * Copyright (C) 2013 x265 project
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -429,7 +430,8 @@
10
     if (slice.m_sps->bUseSAO)
11
     {
12
         WRITE_FLAG(saoParam->bSaoFlag[0], "slice_sao_luma_flag");
13
-        WRITE_FLAG(saoParam->bSaoFlag[1], "slice_sao_chroma_flag");
14
+        if (encData.m_param->internalCsp != X265_CSP_I400)
15
+            WRITE_FLAG(saoParam->bSaoFlag[1], "slice_sao_chroma_flag");
16
     }
17
 
18
     // check if numRefIdx match the defaults (1, hard-coded in PPS). If not, override
19
@@ -828,6 +830,79 @@
20
     }
21
 }
22
 
23
+void Entropy::encodeTransformLuma(const CUData& cu, uint32_t absPartIdx, uint32_t curDepth, uint32_t log2CurSize,
24
+                              bool& bCodeDQP, const uint32_t depthRange[2])
25
+{
26
+    const bool subdiv = cu.m_tuDepth[absPartIdx] > curDepth;
27
+
28
+    /* in each of these conditions, the subdiv flag is implied and not signaled,
29
+     * so we have checks to make sure the implied value matches our intentions */
30
+    if (cu.isIntra(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N && log2CurSize == MIN_LOG2_CU_SIZE)
31
+    {
32
+        X265_CHECK(subdiv, "intra NxN requires TU depth below CU depth\n");
33
+    }
34
+    else if (cu.isInter(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N &&
35
+             !curDepth && cu.m_slice->m_sps->quadtreeTUMaxDepthInter == 1)
36
+    {
37
+        X265_CHECK(subdiv, "inter TU must be smaller than CU when not 2Nx2N part size: log2CurSize %d, depthRange[0] %d\n", log2CurSize, depthRange[0]);
38
+    }
39
+    else if (log2CurSize > depthRange[1])
40
+    {
41
+        X265_CHECK(subdiv, "TU is larger than the max allowed, it should have been split\n");
42
+    }
43
+    else if (log2CurSize == cu.m_slice->m_sps->quadtreeTULog2MinSize || log2CurSize == depthRange[0])
44
+    {
45
+        X265_CHECK(!subdiv, "min sized TU cannot be subdivided\n");
46
+    }
47
+    else
48
+    {
49
+        X265_CHECK(log2CurSize > depthRange[0], "transform size failure\n");
50
+        codeTransformSubdivFlag(subdiv, 5 - log2CurSize);
51
+    }
52
+
53
+    if (subdiv)
54
+    {
55
+        --log2CurSize;
56
+        ++curDepth;
57
+
58
+        uint32_t qNumParts = 1 << (log2CurSize - LOG2_UNIT_SIZE) * 2;
59
+
60
+        encodeTransformLuma(cu, absPartIdx + 0 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange);
61
+        encodeTransformLuma(cu, absPartIdx + 1 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange);
62
+        encodeTransformLuma(cu, absPartIdx + 2 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange);
63
+        encodeTransformLuma(cu, absPartIdx + 3 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange);
64
+        return;
65
+    }
66
+
67
+    if (!cu.isIntra(absPartIdx) && !curDepth)
68
+    {
69
+        X265_CHECK(cu.getCbf(absPartIdx, TEXT_LUMA, 0), "CBF should have been set\n");
70
+    }
71
+    else
72
+        codeQtCbfLuma(cu, absPartIdx, curDepth);
73
+
74
+    uint32_t cbfY = cu.getCbf(absPartIdx, TEXT_LUMA, curDepth);
75
+
76
+    if (!cbfY)
77
+        return;
78
+
79
+    // dQP: only for CTU once
80
+    if (cu.m_slice->m_pps->bUseDQP && bCodeDQP)
81
+    {
82
+        uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx];
83
+        uint32_t absPartIdxLT = absPartIdx & (0xFF << (log2CUSize - LOG2_UNIT_SIZE) * 2);
84
+        codeDeltaQP(cu, absPartIdxLT);
85
+        bCodeDQP = false;
86
+    }
87
+
88
+    if (cbfY)
89
+    {
90
+        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2);
91
+        codeCoeffNxN(cu, cu.m_trCoeff[0] + coeffOffset, absPartIdx, log2CurSize, TEXT_LUMA);
92
+    }
93
+}
94
+
95
+
96
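A worked example of the qNumParts step in encodeTransformLuma above: with x265's 4x4 partition units (LOG2_UNIT_SIZE == 2), splitting a 32x32 TU decrements log2CurSize to 4, so qNumParts = 1 << ((4 - 2) * 2) = 16 and the four recursive calls advance absPartIdx by 0, 16, 32 and 48, one 16x16 quadrant of sixteen 4x4 units each. The luma-only variant exists because in 4:0:0 there are no chroma CBFs or coefficients to signal; the dispatch a few hunks below selects it when m_chromaFormat is X265_CSP_I400.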
 void Entropy::codePredInfo(const CUData& cu, uint32_t absPartIdx)
97
 {
98
     if (cu.isIntra(absPartIdx)) // If it is intra mode, encode intra prediction mode.
99
@@ -908,7 +983,10 @@
100
     }
101
 
102
     uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx];
103
-    encodeTransform(cu, absPartIdx, 0, log2CUSize, bCodeDQP, depthRange);
104
+    if (cu.m_chromaFormat == X265_CSP_I400)
105
+        encodeTransformLuma(cu, absPartIdx, 0, log2CUSize, bCodeDQP, depthRange);
106
+    else
107
+        encodeTransform(cu, absPartIdx, 0, log2CUSize, bCodeDQP, depthRange);
108
 }
109
 
110
 void Entropy::codeSaoOffset(const SaoCtuParam& ctuParam, int plane)
111
@@ -1010,7 +1088,7 @@
112
 void Entropy::codePredWeightTable(const Slice& slice)
113
 {
114
     const WeightParam *wp;
115
-    bool            bChroma      = true; // 4:0:0 not yet supported
116
+    bool            bChroma = slice.m_sps->chromaFormatIdc != X265_CSP_I400;
117
     bool            bDenomCoded  = false;
118
     int             numRefDirs   = slice.m_sliceType == B_SLICE ? 2 : 1;
119
     uint32_t        totalSignalledWeightFlags = 0;
120
@@ -1565,11 +1643,16 @@
121
     uint8_t * const baseCtx = bIsLuma ? &m_contextState[OFF_SIG_FLAG_CTX] : &m_contextState[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
122
     uint32_t c1 = 1;
123
     int scanPosSigOff = scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1;
124
-    ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE)]);
125
+    ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE) + 1]);   // +1 entry (2 bytes) of headroom for the AVX2 assembly; the costCoeffNxN path writes from absCoeff + numNonZero with numNonZero <= 1
126
     uint32_t numNonZero = 1;
127
     unsigned long lastNZPosInCG;
128
     unsigned long firstNZPosInCG;
129
 
130
+#if _DEBUG
131
+    // Not strictly necessary; only silences uninitialized-read reports from Valgrind 3.10.0
132
+    memset(absCoeff, 0, sizeof(absCoeff));
133
+#endif
134
+
135
     absCoeff[0] = (uint16_t)abs(coeff[posLast]);
136
 
137
     for (int subSet = lastScanSet; subSet >= 0; subSet--)
138
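The +1 slot on absCoeff (and the debug-only memset above) exists because the vectorized costCoeffNxN path writes starting at absCoeff + numNonZero with numNonZero <= 1, while the assembly moves whole SIMD registers. A scalar model of the padding requirement, assuming MLS_CG_SIZE == 4 (one 4x4 coding group of 16 coefficients):

    #include <cstdint>

    enum { CG_COEFFS = 1 << 4 };            // 1 << MLS_CG_SIZE
    // one extra element keeps a full-width vector load/store in bounds when
    // the writer has been advanced by one element past the start
    alignas(32) static uint16_t absCoeff[CG_COEFFS + 1];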
@@ -1715,6 +1798,7 @@
139
             {
140
                 // g_entropyBits values are at most 18 bits and at most 16 of them are summed, so the intermediate sum fits in 22 bits
141
                 const uint8_t *tabSigCtx = table_cnt[(log2TrSize == 2) ? 4 : (uint32_t)patternSigCtx];
142
+                X265_CHECK(numNonZero <= 1, "numNonZero check failure\n");
143
                 uint32_t sum = primitives.costCoeffNxN(g_scan4x4[codingParameters.scanType], &coeff[blkPosBase], (intptr_t)trSize, absCoeff + numNonZero, tabSigCtx, scanFlagMask, baseCtx, offset + posOffset, scanPosSigOff, subPosBase);
144
 
145
 #if CHECKED_BUILD || _DEBUG
146
@@ -1919,43 +2003,78 @@
147
         numCtx = bIsLuma ? 12 : 3;
148
     }
149
 
150
-    if (bIsLuma)
151
-    {
152
-        for (uint32_t bin = 0; bin < 2; bin++)
153
-            estBitsSbac.significantBits[bin][0] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX], bin);
154
+    const int ctxSigOffset = OFF_SIG_FLAG_CTX + (bIsLuma ? 0 : NUM_SIG_FLAG_CTX_LUMA);
155
+
156
+    estBitsSbac.significantBits[0][0] = sbacGetEntropyBits(m_contextState[ctxSigOffset], 0);
157
+    estBitsSbac.significantBits[1][0] = sbacGetEntropyBits(m_contextState[ctxSigOffset], 1);
158
 
159
-        for (int ctxIdx = firstCtx; ctxIdx < firstCtx + numCtx; ctxIdx++)
160
-            for (uint32_t bin = 0; bin < 2; bin++)
161
-                estBitsSbac.significantBits[bin][ctxIdx] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX + ctxIdx], bin);
162
+    for (int ctxIdx = firstCtx; ctxIdx < firstCtx + numCtx; ctxIdx++)
163
+    {
164
+        estBitsSbac.significantBits[0][ctxIdx] = sbacGetEntropyBits(m_contextState[ctxSigOffset + ctxIdx], 0);
165
+        estBitsSbac.significantBits[1][ctxIdx] = sbacGetEntropyBits(m_contextState[ctxSigOffset + ctxIdx], 1);
166
     }
167
-    else
168
+
169
+    const uint32_t maxGroupIdx = log2TrSize * 2 - 1;
170
+    if (bIsLuma)
171
     {
172
-        for (uint32_t bin = 0; bin < 2; bin++)
173
-            estBitsSbac.significantBits[bin][0] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX + (NUM_SIG_FLAG_CTX_LUMA + 0)], bin);
174
+        if (log2TrSize == 2)
175
+        {
176
+            for (int i = 0, ctxIdx = 0; i < 2; i++, ctxIdx += NUM_CTX_LAST_FLAG_XY)
177
+            {
178
+                int bits = 0;
179
+                const uint8_t *ctxState = &m_contextState[OFF_CTX_LAST_FLAG_X + ctxIdx];
180
 
181
-        for (int ctxIdx = firstCtx; ctxIdx < firstCtx + numCtx; ctxIdx++)
182
-            for (uint32_t bin = 0; bin < 2; bin++)
183
-                estBitsSbac.significantBits[bin][ctxIdx] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX + (NUM_SIG_FLAG_CTX_LUMA + ctxIdx)], bin);
184
-    }
185
+                for (uint32_t ctx = 0; ctx < 3; ctx++)
186
+                {
187
+                    estBitsSbac.lastBits[i][ctx] = bits + sbacGetEntropyBits(ctxState[ctx], 0);
188
+                    bits += sbacGetEntropyBits(ctxState[ctx], 1);
189
+                }
190
 
191
-    int blkSizeOffset = bIsLuma ? ((log2TrSize - 2) * 3 + ((log2TrSize - 1) >> 2)) : NUM_CTX_LAST_FLAG_XY_LUMA;
192
-    int ctxShift = bIsLuma ? ((log2TrSize + 1) >> 2) : log2TrSize - 2;
193
-    uint32_t maxGroupIdx = log2TrSize * 2 - 1;
194
+                estBitsSbac.lastBits[i][maxGroupIdx] = bits;
195
+            }
196
+        }
197
+        else
198
+        {
199
+            const int blkSizeOffset = ((log2TrSize - 2) * 3 + (log2TrSize == 5));
200
 
201
-    uint32_t ctx;
202
-    for (int i = 0, ctxIdx = 0; i < 2; i++, ctxIdx += NUM_CTX_LAST_FLAG_XY)
203
+            for (int i = 0, ctxIdx = 0; i < 2; i++, ctxIdx += NUM_CTX_LAST_FLAG_XY)
204
+            {
205
+                int bits = 0;
206
+                const uint8_t *ctxState = &m_contextState[OFF_CTX_LAST_FLAG_X + ctxIdx];
207
+                X265_CHECK(maxGroupIdx & 1, "maxGroupIdx check failure\n");
208
+
209
+                for (uint32_t ctx = 0; ctx < (maxGroupIdx >> 1) + 1; ctx++)
210
+                {
211
+                    const int cost0 = sbacGetEntropyBits(ctxState[blkSizeOffset + ctx], 0);
212
+                    const int cost1 = sbacGetEntropyBits(ctxState[blkSizeOffset + ctx], 1);
213
+                    estBitsSbac.lastBits[i][ctx * 2 + 0] = bits + cost0;
214
+                    estBitsSbac.lastBits[i][ctx * 2 + 1] = bits + cost1 + cost0;
215
+                    bits += 2 * cost1;
216
+                }
217
+                // the maximum group index has no terminating zero bin, so remove the cost0 the loop added to the last entry
218
+                estBitsSbac.lastBits[i][maxGroupIdx] -= sbacGetEntropyBits(ctxState[blkSizeOffset + (maxGroupIdx >> 1)], 0);
219
+            }
220
+        }
221
+    }
222
+    else
223
     {
224
-        int bits = 0;
225
-        const uint8_t *ctxState = &m_contextState[OFF_CTX_LAST_FLAG_X + ctxIdx];
226
+        const int blkSizeOffset = NUM_CTX_LAST_FLAG_XY_LUMA;
227
+        const int ctxShift = log2TrSize - 2;
228
 
229
-        for (ctx = 0; ctx < maxGroupIdx; ctx++)
230
+        for (int i = 0, ctxIdx = 0; i < 2; i++, ctxIdx += NUM_CTX_LAST_FLAG_XY)
231
         {
232
-            int ctxOffset = blkSizeOffset + (ctx >> ctxShift);
233
-            estBitsSbac.lastBits[i][ctx] = bits + sbacGetEntropyBits(ctxState[ctxOffset], 0);
234
-            bits += sbacGetEntropyBits(ctxState[ctxOffset], 1);
235
-        }
236
+            int bits = 0;
237
+            const uint8_t *ctxState = &m_contextState[OFF_CTX_LAST_FLAG_X + ctxIdx];
238
+
239
+            for (uint32_t ctx = 0; ctx < maxGroupIdx; ctx++)
240
+            {
241
+                int ctxOffset = blkSizeOffset + (ctx >> ctxShift);
242
+                estBitsSbac.lastBits[i][ctx] = bits + sbacGetEntropyBits(ctxState[ctxOffset], 0);
243
+                bits += sbacGetEntropyBits(ctxState[ctxOffset], 1);
244
+            }
245
 
246
-        estBitsSbac.lastBits[i][ctx] = bits;
247
+            estBitsSbac.lastBits[i][maxGroupIdx] = bits;
248
+        }
249
     }
250
 }
251
 
252
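The rewritten estimation logic above tabulates the cost of the truncated-unary last-significant-coefficient prefix: group index g is signalled as g one-bins followed by a terminating zero-bin, and the terminator is omitted for the maximum group. In the general (chroma) loop this gives

    lastBits[g]           = sum over k < g of bits(ctx_k, 1)  +  bits(ctx_g, 0)
    lastBits[maxGroupIdx] = sum over all k  of bits(ctx_k, 1)

The new luma fast path exploits the fact that for log2TrSize > 2 two consecutive group indices share one context (ctxShift is 1), so the loop fills entries in pairs, accumulating 2 * cost1 per context, and then strips the stray cost0 from the final entry.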
x265_1.8.tar.gz/source/encoder/entropy.h -> x265_1.9.tar.gz/source/encoder/entropy.h Changed
18
 
1
@@ -2,6 +2,7 @@
2
 * Copyright (C) 2013 x265 project
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -246,6 +247,8 @@
10
 
11
     void encodeTransform(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, uint32_t log2TrSize,
12
                          bool& bCodeDQP, const uint32_t depthRange[2]);
13
+    void encodeTransformLuma(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, uint32_t log2TrSize,
14
+                         bool& bCodeDQP, const uint32_t depthRange[2]);
15
 
16
     void copyFrom(const Entropy& src);
17
     void copyContextsFrom(const Entropy& src);
18
x265_1.8.tar.gz/source/encoder/frameencoder.cpp -> x265_1.9.tar.gz/source/encoder/frameencoder.cpp Changed
454
 
1
@@ -104,7 +104,8 @@
2
     m_param = top->m_param;
3
     m_numRows = numRows;
4
     m_numCols = numCols;
5
-    m_filterRowDelay = (m_param->bEnableSAO && m_param->bSaoNonDeblocked) ?
6
+    m_filterRowDelay = ((m_param->bEnableSAO && m_param->bSaoNonDeblocked)
7
+                        || (!m_param->bEnableLoopFilter && m_param->bEnableSAO)) ?
8
                         2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 1 : 0);
9
     m_filterRowDelayCus = m_filterRowDelay * numCols;
10
     m_rows = new CTURow[m_numRows];
11
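In table form, the new m_filterRowDelay selection above:

    bEnableSAO && bSaoNonDeblocked        -> 2 rows
    bEnableSAO && !bEnableLoopFilter      -> 2 rows   (the 1.9 addition)
    bEnableSAO || bEnableLoopFilter       -> 1 row
    neither                               -> 0 rows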
@@ -124,7 +125,7 @@
12
         m_pool = NULL;
13
     }
14
 
15
-    m_frameFilter.init(top, this, numRows);
16
+    m_frameFilter.init(top, this, numRows, numCols);
17
 
18
     // initialize HRD parameters of SPS
19
     if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
20
@@ -135,7 +136,7 @@
21
         ok &= m_rce.picTimingSEI && m_rce.hrdTiming;
22
     }
23
 
24
-    if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
25
+    if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
26
         m_nr = X265_MALLOC(NoiseReduction, 1);
27
     if (m_nr)
28
         memset(m_nr, 0, sizeof(NoiseReduction));
29
@@ -275,7 +276,7 @@
30
         m_localTldIdx = 0;
31
     }
32
 
33
-    m_done.trigger();     /* signal that thread is initialized */ 
34
+    m_done.trigger();     /* signal that thread is initialized */
35
     m_enable.wait();      /* Encoder::encode() triggers this event */
36
 
37
     while (m_threadActive)
38
@@ -357,15 +358,52 @@
39
             WeightParam *w = NULL;
40
             if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].bPresentFlag)
41
                 w = slice->m_weightPredTable[l][ref];
42
-            m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPic, w, *m_param);
43
+            slice->m_refReconPicList[l][ref] = slice->m_refFrameList[l][ref]->m_reconPic;
44
+            m_mref[l][ref].init(slice->m_refReconPicList[l][ref], w, *m_param);
45
         }
46
     }
47
 
48
+    int numTLD;
49
+    if (m_pool)
50
+        numTLD = m_param->bEnableWavefront ? m_pool->m_numWorkers : m_pool->m_numWorkers + m_pool->m_numProviders;
51
+    else
52
+        numTLD = 1;
53
+
54
     /* Get the QP for this frame from rate control. This call may block until
55
      * frames ahead of it in encode order have called rateControlEnd() */
56
     int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
57
     m_rce.newQp = qp;
58
 
59
+    if (m_nr)
60
+    {
61
+        if (qp > QP_MAX_SPEC && m_frame->m_param->rc.vbvBufferSize)
62
+        {
63
+            for (int i = 0; i < numTLD; i++)
64
+            {
65
+                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = m_top->m_offsetEmergency[qp - QP_MAX_SPEC - 1];
66
+                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].residualSum = m_top->m_residualSumEmergency;
67
+                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].count = m_top->m_countEmergency;
68
+            }
69
+        }
70
+        else
71
+        {
72
+            if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
73
+            {
74
+                for (int i = 0; i < numTLD; i++)
75
+                {
76
+                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrOffsetDenoise;
77
+                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].residualSum = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrResidualSum;
78
+                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].count = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrCount;
79
+                }
80
+            }
81
+            else
82
+            {
83
+                for (int i = 0; i < numTLD; i++)
84
+                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = NULL;
85
+            }
86
+        }
87
+    }
88
+
89
     /* Clip slice QP to 0-51 spec range before encoding */
90
     slice->m_sliceQp = x265_clip3(-QP_BD_OFFSET, QP_MAX_SPEC, qp);
91
 
92
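The block above rewires each worker's quantizer when rate control pushes past the legal QP range: under VBV, once qp exceeds QP_MAX_SPEC (51) the per-thread NoiseReduction pointers are aimed at the encoder-global emergency tables, indexed by how far past the limit the QP went; otherwise they point at the normal denoise state, or at nothing when NR is off. A reduced sketch of that selection (the array extents are stand-ins for MAX_NUM_TR_CATEGORIES and MAX_NUM_TR_COEFFS):

    #include <cstdint>

    enum { TR_CATS = 16, TR_COEFFS = 1024 };                  // stand-in extents
    typedef uint16_t OffsetTable[TR_CATS][TR_COEFFS];

    const OffsetTable* selectOffsets(int qp, bool vbv,
                                     const OffsetTable* emergency,   // one table per out-of-spec QP step
                                     const OffsetTable* normal)      // may be NULL when NR is off
    {
        const int QP_MAX_SPEC = 51;
        return (vbv && qp > QP_MAX_SPEC) ? &emergency[qp - QP_MAX_SPEC - 1] : normal;
    }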
@@ -458,7 +496,7 @@
93
     /* CQP and CRF (without capped VBV) don't use mid-frame statistics to
94
      * tune RateControl parameters for other frames.
95
      * Hence, for these modes, update m_startEndOrder and unlock RC for previous threads waiting in
96
-     * RateControlEnd here, after the slicecontexts are initialized. For the rest - ABR
97
+     * RateControlEnd here, after the slice contexts are initialized. For the rest - ABR
98
      * and VBV, unlock only after rateControlUpdateStats of this frame is called */
99
     if (m_param->rc.rateControlMode != X265_RC_ABR && !m_top->m_rateControl->m_isVbv)
100
     {
101
@@ -482,7 +520,7 @@
102
             {
103
                 for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
104
                 {
105
-                    Frame *refpic = slice->m_refPicList[l][ref];
106
+                    Frame *refpic = slice->m_refFrameList[l][ref];
107
 
108
                     uint32_t reconRowCount = refpic->m_reconRowCount.get();
109
                     while ((reconRowCount != m_numRows) && (reconRowCount < row + m_refLagRows))
110
@@ -521,7 +559,7 @@
111
                     int list = l;
112
                     for (int ref = 0; ref < slice->m_numRefIdx[list]; ref++)
113
                     {
114
-                        Frame *refpic = slice->m_refPicList[list][ref];
115
+                        Frame *refpic = slice->m_refFrameList[list][ref];
116
 
117
                         uint32_t reconRowCount = refpic->m_reconRowCount.get();
118
                         while ((reconRowCount != m_numRows) && (reconRowCount < i + m_refLagRows))
119
@@ -572,10 +610,7 @@
120
         m_frame->m_encData->m_frameStats.lumaDistortion   += m_rows[i].rowStats.lumaDistortion;
121
         m_frame->m_encData->m_frameStats.chromaDistortion += m_rows[i].rowStats.chromaDistortion;
122
         m_frame->m_encData->m_frameStats.psyEnergy        += m_rows[i].rowStats.psyEnergy;
123
-        m_frame->m_encData->m_frameStats.lumaLevel        += m_rows[i].rowStats.lumaLevel;
124
-
125
-        if (m_rows[i].rowStats.maxLumaLevel > m_frame->m_encData->m_frameStats.maxLumaLevel)
126
-            m_frame->m_encData->m_frameStats.maxLumaLevel = m_rows[i].rowStats.maxLumaLevel;
127
+        m_frame->m_encData->m_frameStats.resEnergy        += m_rows[i].rowStats.resEnergy;
128
         for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
129
         {
130
             m_frame->m_encData->m_frameStats.cntSkipCu[depth] += m_rows[i].rowStats.cntSkipCu[depth];
131
@@ -589,7 +624,7 @@
132
     m_frame->m_encData->m_frameStats.avgLumaDistortion   = (double)(m_frame->m_encData->m_frameStats.lumaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
133
     m_frame->m_encData->m_frameStats.avgChromaDistortion = (double)(m_frame->m_encData->m_frameStats.chromaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
134
     m_frame->m_encData->m_frameStats.avgPsyEnergy        = (double)(m_frame->m_encData->m_frameStats.psyEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
135
-    m_frame->m_encData->m_frameStats.avgLumaLevel        = m_frame->m_encData->m_frameStats.lumaLevel / m_frame->m_encData->m_frameStats.totalCtu;
136
+    m_frame->m_encData->m_frameStats.avgResEnergy        = (double)(m_frame->m_encData->m_frameStats.resEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
137
     m_frame->m_encData->m_frameStats.percentIntraNxN     = (double)(m_frame->m_encData->m_frameStats.cntIntraNxN * 100) / m_frame->m_encData->m_frameStats.totalCu;
138
     for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
139
     {
140
@@ -626,22 +661,23 @@
141
 
142
     if (m_param->decodedPictureHashSEI)
143
     {
144
+        int planes = (m_frame->m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
145
         if (m_param->decodedPictureHashSEI == 1)
146
         {
147
             m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::MD5;
148
-            for (int i = 0; i < 3; i++)
149
+            for (int i = 0; i < planes; i++)
150
                 MD5Final(&m_state[i], m_seiReconPictureDigest.m_digest[i]);
151
         }
152
         else if (m_param->decodedPictureHashSEI == 2)
153
         {
154
             m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CRC;
155
-            for (int i = 0; i < 3; i++)
156
+            for (int i = 0; i < planes; i++)
157
                 crcFinish(m_crc[i], m_seiReconPictureDigest.m_digest[i]);
158
         }
159
         else if (m_param->decodedPictureHashSEI == 3)
160
         {
161
             m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CHECKSUM;
162
-            for (int i = 0; i < 3; i++)
163
+            for (int i = 0; i < planes; i++)
164
                 checksumFinish(m_checksum[i], m_seiReconPictureDigest.m_digest[i]);
165
         }
166
 
167
@@ -678,41 +714,40 @@
168
     {
169
         for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
170
         {
171
-            Frame *refpic = slice->m_refPicList[l][ref];
172
+            Frame *refpic = slice->m_refFrameList[l][ref];
173
             ATOMIC_DEC(&refpic->m_countRefEncoders);
174
         }
175
     }
176
 
177
-    int numTLD;
178
-    if (m_pool)
179
-        numTLD = m_param->bEnableWavefront ? m_pool->m_numWorkers : m_pool->m_numWorkers + m_pool->m_numProviders;
180
-    else
181
-        numTLD = 1;
182
-
183
     if (m_nr)
184
     {
185
-        /* Accumulate NR statistics from all worker threads */
186
-        for (int i = 0; i < numTLD; i++)
187
+        bool nrEnabled = (m_rce.newQp < QP_MAX_SPEC || !m_param->rc.vbvBufferSize) && (m_param->noiseReductionIntra || m_param->noiseReductionInter);
188
+
189
+        if (nrEnabled)
190
         {
191
-            NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
192
-            for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
193
+            /* Accumulate NR statistics from all worker threads */
194
+            for (int i = 0; i < numTLD; i++)
195
             {
196
-                for (int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++)
197
-                    m_nr->residualSum[cat][coeff] += nr->residualSum[cat][coeff];
198
-            
199
-                m_nr->count[cat] += nr->count[cat];
200
+                NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
201
+                for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
202
+                {
203
+                    for (int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++)
204
+                        m_nr->nrResidualSum[cat][coeff] += nr->nrResidualSum[cat][coeff];
205
+
206
+                    m_nr->nrCount[cat] += nr->nrCount[cat];
207
+                }
208
             }
209
-        }
210
 
211
-        noiseReductionUpdate();
212
+            noiseReductionUpdate();
213
 
214
-        /* Copy updated NR coefficients back to all worker threads */
215
-        for (int i = 0; i < numTLD; i++)
216
-        {
217
-            NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
218
-            memcpy(nr->offsetDenoise, m_nr->offsetDenoise, sizeof(uint16_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
219
-            memset(nr->count, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES);
220
-            memset(nr->residualSum, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
221
+            /* Copy updated NR coefficients back to all worker threads */
222
+            for (int i = 0; i < numTLD; i++)
223
+            {
224
+                NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
225
+                memcpy(nr->nrOffsetDenoise, m_nr->nrOffsetDenoise, sizeof(uint16_t)* MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
226
+                memset(nr->nrCount, 0, sizeof(uint32_t)* MAX_NUM_TR_CATEGORIES);
227
+                memset(nr->nrResidualSum, 0, sizeof(uint32_t)* MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
228
+            }
229
         }
230
     }
231
 
232
@@ -773,7 +808,7 @@
233
             }
234
             else
235
             {
236
-                for (int i = 0; i < 3; i++)
237
+                for (int i = 0; i < (m_param->internalCsp != X265_CSP_I400 ? 3 : 1); i++)
238
                     saoParam->ctuParam[i][cuAddr].reset();
239
             }
240
         }
241
@@ -824,7 +859,7 @@
242
 // Called by worker threads
243
 void FrameEncoder::processRowEncoder(int intRow, ThreadLocalData& tld)
244
 {
245
-    uint32_t row = (uint32_t)intRow;
246
+    const uint32_t row = (uint32_t)intRow;
247
     CTURow& curRow = m_rows[row];
248
 
249
     tld.analysis.m_param = m_param;
250
@@ -858,11 +893,15 @@
251
     const uint32_t lineStartCUAddr = row * numCols;
252
     bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
253
 
254
+    uint32_t maxBlockCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16;
255
+    uint32_t maxBlockRows = (m_frame->m_fencPic->m_picHeight + (16 - 1)) / 16;
256
+    uint32_t noOfBlocks = g_maxCUSize / 16;
257
+
258
     while (curRow.completed < numCols)
259
     {
260
         ProfileScopeEvent(encodeCTU);
261
 
262
-        uint32_t col = curRow.completed;
263
+        const uint32_t col = curRow.completed;
264
         const uint32_t cuAddr = lineStartCUAddr + col;
265
         CUData* ctu = curEncData.getPicCTU(cuAddr);
266
         ctu->initCTU(*m_frame, cuAddr, slice->m_sliceQp);
267
@@ -882,11 +921,8 @@
268
                 cuStat.baseQp = curEncData.m_rowStat[row].diagQp;
269
 
270
             /* TODO: use defines from slicetype.h for lowres block size */
271
-            uint32_t maxBlockCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16;
272
-            uint32_t maxBlockRows = (m_frame->m_fencPic->m_picHeight + (16 - 1)) / 16;
273
-            uint32_t noOfBlocks = g_maxCUSize / 16;
274
-            uint32_t block_y = (cuAddr / curEncData.m_slice->m_sps->numCuInWidth) * noOfBlocks;
275
-            uint32_t block_x = (cuAddr * noOfBlocks) - block_y * curEncData.m_slice->m_sps->numCuInWidth;
276
+            uint32_t block_y = (ctu->m_cuPelY >> g_maxLog2CUSize) * noOfBlocks;
277
+            uint32_t block_x = (ctu->m_cuPelX >> g_maxLog2CUSize) * noOfBlocks;
278
             
279
             cuStat.vbvCost = 0;
280
             cuStat.intraVbvCost = 0;
281
@@ -926,6 +962,58 @@
282
             // Save CABAC state for next row
283
             curRow.bufferedEntropy.loadContexts(rowCoder);
284
 
285
+        /* SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas */
286
+        if (m_param->bEnableSAO && m_param->bSaoNonDeblocked)
287
+            m_frameFilter.m_parallelFilter[row].m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row);
288
+
289
+        /* Deblock using otherwise idle worker threads */
290
+        if (m_param->bEnableLoopFilter | m_param->bEnableSAO)
291
+        {
292
+            // TODO: Multiple Threading
293
+            // Delay by ONE row to avoid conflicting with intra prediction
294
+            if (m_pool && (row >= 1))
295
+            {
296
+                // Wait for the previous row's filter job to finish
297
+                m_frameFilter.m_parallelFilter[row - 1].waitForExit();
298
+
299
+                // Processing new group
300
+                int allowCol = col;
301
+
302
+                // avoid race condition on last column
303
+                if (row >= 2)
304
+                {
305
+                    allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get()
306
+                                                              : m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get()), (int)col);
307
+                }
308
+                m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol);
309
+                m_frameFilter.m_parallelFilter[row - 1].tryBondPeers(*this, 1);
310
+            }
311
+
312
+            // The last row may start early
313
+            if (m_pool && (row == m_numRows - 1))
314
+            {
315
+                // Waiting for the last thread to finish
316
+                m_frameFilter.m_parallelFilter[row].waitForExit();
317
+
318
+                // Deblocking last row
319
+                int allowCol = col;
320
+
321
+                // avoid race condition on last column
322
+                if (row >= 2)
323
+                {
324
+                    allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get()
325
+                                                              : m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get()), (int)col);
326
+                }
327
+                m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol);
328
+                m_frameFilter.m_parallelFilter[row].tryBondPeers(*this, 1);
329
+            }
330
+        }
331
+        // Both loop filter and SAO disabled
332
+        else
333
+        {
334
+            m_frameFilter.m_parallelFilter[row].processPostCu(col);
335
+        }
336
+
337
         // Completed CU processing
338
         curRow.completed++;
339
 
340
@@ -958,6 +1046,7 @@
341
         curRow.rowStats.lumaDistortion   += best.lumaDistortion;
342
         curRow.rowStats.chromaDistortion += best.chromaDistortion;
343
         curRow.rowStats.psyEnergy        += best.psyEnergy;
344
+        curRow.rowStats.resEnergy        += best.resEnergy;
345
         curRow.rowStats.cntIntraNxN      += frameLog.cntIntraNxN;
346
         curRow.rowStats.totalCu          += frameLog.totalCu;
347
         for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
348
@@ -970,17 +1059,6 @@
349
                 curRow.rowStats.cuIntraDistribution[depth][n] += frameLog.cuIntraDistribution[depth][n];
350
         }
351
 
352
-        /* calculate maximum and average luma levels */
353
-        uint32_t ctuLumaLevel = 0;
354
-        uint32_t ctuNoOfPixels = best.fencYuv->m_size * best.fencYuv->m_size;
355
-        for (uint32_t i = 0; i < ctuNoOfPixels; i++)
356
-        {
357
-            pixel p = best.fencYuv->m_buf[0][i];
358
-            ctuLumaLevel += p;
359
-            curRow.rowStats.maxLumaLevel = X265_MAX(p, curRow.rowStats.maxLumaLevel);
360
-        }
361
-        curRow.rowStats.lumaLevel += (double)(ctuLumaLevel) / ctuNoOfPixels;
362
-
363
         curEncData.m_cuStat[cuAddr].totalBits = best.totalBits;
364
         x265_emms();
365
 
366
@@ -1065,10 +1143,6 @@
367
             }
368
         }
369
 
370
-        /* SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas */
371
-        if (m_param->bEnableSAO && m_param->bSaoNonDeblocked)
372
-            m_frameFilter.m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row);
373
-
374
         if (m_param->bEnableWavefront && curRow.completed >= 2 && row < m_numRows - 1 &&
375
             (!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow))
376
         {
377
@@ -1085,7 +1159,7 @@
378
 
379
         ScopedLock self(curRow.lock);
380
         if ((m_bAllRowsStop && intRow > m_vbvResetTriggerRow) ||
381
-            (row > 0 && curRow.completed < numCols - 1 && m_rows[row - 1].completed < m_rows[row].completed + 2))
382
+            (row > 0 && ((curRow.completed < numCols - 1) || (m_rows[row - 1].completed < numCols)) && m_rows[row - 1].completed < m_rows[row].completed + 2))
383
         {
384
             curRow.active = false;
385
             curRow.busy = false;
386
@@ -1127,9 +1201,24 @@
387
     if (!m_param->bEnableSAO && (m_param->bEnableWavefront || row == m_numRows - 1))
388
         rowCoder.finishSlice();
389
 
390
+    /* Finish leftover deblock work for the previous row on the current thread */
391
+    if ((m_param->bEnableLoopFilter | m_param->bEnableSAO) & (row >= 2))
392
+    {
393
+        /* TODO: Multiple Threading */
394
+
395
+        /* Check whether the previous row can be completed on the current thread */
396
+        if (m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get() == (int)numCols)
397
+        {
398
+            /* stop the row's filter worker and finish its remaining work inline */
399
+            m_frameFilter.m_parallelFilter[row - 1].waitForExit();
400
+            m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(numCols);
401
+            m_frameFilter.m_parallelFilter[row - 1].processTasks(-1);
402
+        }
403
+    }
404
+
405
+    /* trigger row-wise loop filters */
406
     if (m_param->bEnableWavefront)
407
     {
408
-        /* trigger row-wise loop filters */
409
         if (row >= m_filterRowDelay)
410
         {
411
             enableRowFilter(row - m_filterRowDelay);
412
@@ -1139,6 +1228,7 @@
413
                 enqueueRowFilter(0);
414
             tryWakeOne();
415
         }
416
+
417
         if (row == m_numRows - 1)
418
         {
419
             for (uint32_t i = m_numRows - m_filterRowDelay; i < m_numRows; i++)
420
@@ -1247,25 +1337,25 @@
421
         int trSize = cat & 3;
422
         int coefCount = 1 << ((trSize + 2) * 2);
423
 
424
-        if (m_nr->count[cat] > maxBlocksPerTrSize[trSize])
425
+        if (m_nr->nrCount[cat] > maxBlocksPerTrSize[trSize])
426
         {
427
             for (int i = 0; i < coefCount; i++)
428
-                m_nr->residualSum[cat][i] >>= 1;
429
-            m_nr->count[cat] >>= 1;
430
+                m_nr->nrResidualSum[cat][i] >>= 1;
431
+            m_nr->nrCount[cat] >>= 1;
432
         }
433
 
434
         int nrStrength = cat < 8 ? m_param->noiseReductionIntra : m_param->noiseReductionInter;
435
-        uint64_t scaledCount = (uint64_t)nrStrength * m_nr->count[cat];
436
+        uint64_t scaledCount = (uint64_t)nrStrength * m_nr->nrCount[cat];
437
 
438
         for (int i = 0; i < coefCount; i++)
439
         {
440
-            uint64_t value = scaledCount + m_nr->residualSum[cat][i] / 2;
441
-            uint64_t denom = m_nr->residualSum[cat][i] + 1;
442
-            m_nr->offsetDenoise[cat][i] = (uint16_t)(value / denom);
443
+            uint64_t value = scaledCount + m_nr->nrResidualSum[cat][i] / 2;
444
+            uint64_t denom = m_nr->nrResidualSum[cat][i] + 1;
445
+            m_nr->nrOffsetDenoise[cat][i] = (uint16_t)(value / denom);
446
         }
447
 
448
         // Don't denoise DC coefficients
449
-        m_nr->offsetDenoise[cat][0] = 0;
450
+        m_nr->nrOffsetDenoise[cat][0] = 0;
451
     }
452
 }
453
 
454
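A worked example of the renamed offset formula in the hunk above: with noiseReductionInter = 500, nrCount[cat] = 1000 blocks and nrResidualSum[cat][i] = 50000 for some coefficient position i, the offset is (500 * 1000 + 50000 / 2) / (50000 + 1) = 525000 / 50001, which truncates to 10. Positions whose accumulated residual energy is large relative to the configured strength thus receive a small dead-zone offset, and the DC entry is always forced back to zero.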
x265_1.8.tar.gz/source/encoder/framefilter.cpp -> x265_1.9.tar.gz/source/encoder/framefilter.cpp Changed
811
 
1
@@ -35,177 +35,486 @@
2
 static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
3
 static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
4
 
5
-FrameFilter::FrameFilter()
6
-    : m_param(NULL)
7
-    , m_frame(NULL)
8
-    , m_frameEncoder(NULL)
9
-    , m_ssimBuf(NULL)
10
-{
11
-}
12
-
13
 void FrameFilter::destroy()
14
 {
15
-    if (m_param->bEnableSAO)
16
-        m_sao.destroy();
17
-
18
     X265_FREE(m_ssimBuf);
19
+
20
+    if (m_parallelFilter)
21
+    {
22
+        if (m_param->bEnableSAO)
23
+        {
24
+            for(int row = 0; row < m_numRows; row++)
25
+                m_parallelFilter[row].m_sao.destroy((row == 0 ? 1 : 0));
26
+        }
27
+
28
+        delete[] m_parallelFilter;
29
+        m_parallelFilter = NULL;
30
+    }
31
 }
32
 
33
-void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows)
34
+void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols)
35
 {
36
     m_param = top->m_param;
37
     m_frameEncoder = frame;
38
     m_numRows = numRows;
39
+    m_numCols = numCols;
40
     m_hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp);
41
     m_vChromaShift = CHROMA_V_SHIFT(m_param->internalCsp);
42
     m_pad[0] = top->m_sps.conformanceWindow.rightOffset;
43
     m_pad[1] = top->m_sps.conformanceWindow.bottomOffset;
44
     m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0;
45
-    m_lastHeight = m_param->sourceHeight % g_maxCUSize ? m_param->sourceHeight % g_maxCUSize : g_maxCUSize;
46
-
47
-    if (m_param->bEnableSAO)
48
-        if (!m_sao.create(m_param))
49
-            m_param->bEnableSAO = 0;
50
+    m_lastHeight = (m_param->sourceHeight % g_maxCUSize) ? (m_param->sourceHeight % g_maxCUSize) : g_maxCUSize;
51
+    m_lastWidth = (m_param->sourceWidth % g_maxCUSize) ? (m_param->sourceWidth % g_maxCUSize) : g_maxCUSize;
52
 
53
     if (m_param->bEnableSsim)
54
         m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
55
+
56
+    m_parallelFilter = new ParallelFilter[numRows];
57
+
58
+    if (m_parallelFilter)
59
+    {
60
+        if (m_param->bEnableSAO)
61
+        {
62
+            for(int row = 0; row < numRows; row++)
63
+            {
64
+                if (!m_parallelFilter[row].m_sao.create(m_param, (row == 0 ? 1 : 0)))
65
+                    m_param->bEnableSAO = 0;
66
+                else
67
+                {
68
+                    if (row != 0)
69
+                        m_parallelFilter[row].m_sao.createFromRootNode(&m_parallelFilter[0].m_sao);
70
+                }
71
+
72
+            }
73
+        }
74
+
75
+        for(int row = 0; row < numRows; row++)
76
+        {
77
+            // Set per-row bounds and back-references
78
+            m_parallelFilter[row].m_rowHeight = (row == numRows - 1) ? m_lastHeight : g_maxCUSize;
79
+            m_parallelFilter[row].m_row = row;
80
+            m_parallelFilter[row].m_rowAddr = row * numCols;
81
+            m_parallelFilter[row].m_frameFilter = this;
82
+
83
+            if (row > 0)
84
+                m_parallelFilter[row].m_prevRow = &m_parallelFilter[row - 1];
85
+        }
86
+    }
87
+
88
 }
89
 
90
 void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
91
 {
92
     m_frame = frame;
93
 
94
-    if (m_param->bEnableSAO)
95
-        m_sao.startSlice(frame, initState, qp);
96
+    // Reset per-row filter state
97
+    if (m_parallelFilter)
98
+    {
99
+        for(int row = 0; row < m_numRows; row++)
100
+        {
101
+            if (m_param->bEnableSAO)
102
+                m_parallelFilter[row].m_sao.startSlice(frame, initState, qp);
103
+
104
+            m_parallelFilter[row].m_lastCol.set(0);
105
+            m_parallelFilter[row].m_allowedCol.set(0);
106
+            m_parallelFilter[row].m_lastDeblocked.set(-1);
107
+            m_parallelFilter[row].m_encData = frame->m_encData;
108
+        }
109
+
110
+        // Reset SAO common statistics
111
+        if (m_param->bEnableSAO)
112
+            m_parallelFilter[0].m_sao.resetStats();
113
+    }
114
 }
115
 
116
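The three atomics reset in start() above (m_lastCol, m_allowedCol, m_lastDeblocked) carry the row-to-row wavefront dependency that frameencoder.cpp feeds: when (row, col) finishes encoding, row - 1 is allowed to filter no further than what row - 2 has already processed, with the fully deblocked cursor used at the last column. A mirror of that bound computation, as it appears in the encoder hunks earlier in this diff:

    #include <algorithm>

    // Bound for row-1's m_allowedCol when the encoder finishes (row, col);
    // prevPrev* are row-2's m_lastCol / m_lastDeblocked values.
    int allowedColForPrevRow(int col, int numCols, int prevPrevLastCol, int prevPrevLastDeblocked)
    {
        int bound = (col == numCols - 1) ? prevPrevLastDeblocked : prevPrevLastCol;
        return std::min(bound, col);
    }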
-void FrameFilter::processRow(int row)
117
+/* restore original YUV samples to recon after SAO (if lossless) */
118
+static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx)
119
 {
120
-    ProfileScopeEvent(filterCTURow);
121
+    const int size = cu->m_log2CUSize[absPartIdx] - 2;
122
+    const uint32_t cuAddr = cu->m_cuAddr;
123
 
124
-#if DETAILED_CU_STATS
125
-    ScopedElapsedTime filterPerfScope(m_frameEncoder->m_cuStats.loopFilterElapsedTime);
126
-    m_frameEncoder->m_cuStats.countLoopFilter++;
127
-#endif
128
+    PicYuv* reconPic = frame.m_reconPic;
129
+    PicYuv* fencPic  = frame.m_fencPic;
130
 
131
-    if (!m_param->bEnableLoopFilter && !m_param->bEnableSAO)
132
+    pixel* dst = reconPic->getLumaAddr(cuAddr, absPartIdx);
133
+    pixel* src = fencPic->getLumaAddr(cuAddr, absPartIdx);
134
+
135
+    primitives.cu[size].copy_pp(dst, reconPic->m_stride, src, fencPic->m_stride);
136
+
137
+    if (cu->m_chromaFormat != X265_CSP_I400)
138
     {
139
-        processRowPost(row);
140
+        pixel* dstCb = reconPic->getCbAddr(cuAddr, absPartIdx);
141
+        pixel* srcCb = fencPic->getCbAddr(cuAddr, absPartIdx);
142
+        pixel* dstCr = reconPic->getCrAddr(cuAddr, absPartIdx);
143
+        pixel* srcCr = fencPic->getCrAddr(cuAddr, absPartIdx);
144
+
145
+        const int csp = fencPic->m_picCsp;
146
+        primitives.chroma[csp].cu[size].copy_pp(dstCb, reconPic->m_strideC, srcCb, fencPic->m_strideC);
147
+        primitives.chroma[csp].cu[size].copy_pp(dstCr, reconPic->m_strideC, srcCr, fencPic->m_strideC);
148
+    }
149
+}
150
+
151
+/* Original YUV restoration for CU in lossless coding */
152
+static void origCUSampleRestoration(const CUData* cu, const CUGeom& cuGeom, Frame& frame)
153
+{
154
+    uint32_t absPartIdx = cuGeom.absPartIdx;
155
+    if (cu->m_cuDepth[absPartIdx] > cuGeom.depth)
156
+    {
157
+        for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++)
158
+        {
159
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
160
+            if (childGeom.flags & CUGeom::PRESENT)
161
+                origCUSampleRestoration(cu, childGeom, frame);
162
+        }
163
         return;
164
     }
165
-    FrameData& encData = *m_frame->m_encData;
166
-    const uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
167
-    const uint32_t lineStartCUAddr = row * numCols;
168
 
169
-    if (m_param->bEnableLoopFilter)
170
+    // restore original YUV samples
171
+    if (cu->m_tqBypass[absPartIdx])
172
+        restoreOrigLosslessYuv(cu, frame, absPartIdx);
173
+}
174
+
175
+void FrameFilter::ParallelFilter::copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col)
176
+{
177
+    // Copy SAO Top Reference Pixels
178
+    int ctuWidth  = g_maxCUSize;
179
+    const pixel* recY = reconPic->getPlaneAddr(0, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_stride);
180
+
181
+    // Luma
182
+    memcpy(&m_sao.m_tmpU[0][col * ctuWidth], recY, ctuWidth * sizeof(pixel));
183
+    X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected");
184
+
185
+    // Chroma
186
+    if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
187
+    {
188
+        ctuWidth  >>= m_sao.m_hChromaShift;
189
+
190
+        const pixel* recU = reconPic->getPlaneAddr(1, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC);
191
+        const pixel* recV = reconPic->getPlaneAddr(2, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC);
192
+        memcpy(&m_sao.m_tmpU[1][col * ctuWidth], recU, ctuWidth * sizeof(pixel));
193
+        memcpy(&m_sao.m_tmpU[2][col * ctuWidth], recV, ctuWidth * sizeof(pixel));
194
+
195
+        X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected");
196
+    }
197
+}
198
+
199
+void FrameFilter::ParallelFilter::processSaoUnitCu(SAOParam *saoParam, int col)
200
+{
201
+    // TODO: apply SAO per CU and copy back immediately; is that necessary?
202
+    if (saoParam->bSaoFlag[0])
203
+        m_sao.processSaoUnitCuLuma(saoParam->ctuParam[0], m_row, col);
204
+
205
+    if (saoParam->bSaoFlag[1])
206
+        m_sao.processSaoUnitCuChroma(saoParam->ctuParam, m_row, col);
207
+
208
+    if (m_encData->m_slice->m_pps->bTransquantBypassEnabled)
209
     {
210
-        const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
211
-        const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
212
+        const CUGeom* cuGeoms = m_frameFilter->m_frameEncoder->m_cuGeoms;
213
+        const uint32_t* ctuGeomMap = m_frameFilter->m_frameEncoder->m_ctuGeomMap;
214
 
215
-        for (uint32_t col = 0; col < numCols; col++)
216
+        uint32_t cuAddr = m_rowAddr + col;
217
+        const CUData* ctu = m_encData->getPicCTU(cuAddr);
218
+        assert(m_frameFilter->m_frame->m_reconPic == m_encData->m_reconPic);
219
+        origCUSampleRestoration(ctu, cuGeoms[ctuGeomMap[cuAddr]], *m_frameFilter->m_frame);
220
+    }
221
+}
222
+
223
+// NOTE: must be delayed by one row when deblocking is enabled, since the horizontal deblock pass modifies pixels in the row above
224
+void FrameFilter::ParallelFilter::processPostCu(int col) const
225
+{
226
+    // Update finished CU cursor
227
+    m_frameFilter->m_frame->m_reconColCount[m_row].set(col);
228
+
229
+    // shortcut path for non-border area
230
+    if ((col != 0) & (col != m_frameFilter->m_numCols - 1) & (m_row != 0) & (m_row != m_frameFilter->m_numRows - 1))
231
+        return;
232
+
233
+    PicYuv *reconPic = m_frameFilter->m_frame->m_reconPic;
234
+    const uint32_t lineStartCUAddr = m_rowAddr + col;
235
+    const int realH = getCUHeight();
236
+    const int realW = m_frameFilter->getCUWidth(col);
237
+
238
+    const uint32_t lumaMarginX = reconPic->m_lumaMarginX;
239
+    const uint32_t lumaMarginY = reconPic->m_lumaMarginY;
240
+    const uint32_t chromaMarginX = reconPic->m_chromaMarginX;
241
+    const uint32_t chromaMarginY = reconPic->m_chromaMarginY;
242
+    const int hChromaShift = reconPic->m_hChromaShift;
243
+    const int vChromaShift = reconPic->m_vChromaShift;
244
+    const intptr_t stride = reconPic->m_stride;
245
+    const intptr_t strideC = reconPic->m_strideC;
246
+    pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr);
247
+    // // MUST BE check I400 since m_picOrg uninitialize in that case
248
+    pixel *pixU = (m_frameFilter->m_param->internalCsp != X265_CSP_I400) ? reconPic->getCbAddr(lineStartCUAddr) : NULL;
249
+    pixel *pixV = (m_frameFilter->m_param->internalCsp != X265_CSP_I400) ? reconPic->getCrAddr(lineStartCUAddr) : NULL;
250
+    int copySizeY = realW;
251
+    int copySizeC = (realW >> hChromaShift);
252
+
253
+    if ((col == 0) | (col == m_frameFilter->m_numCols - 1))
254
+    {
255
+        // TODO: improve by process on Left or Right only
256
+        primitives.extendRowBorder(reconPic->getLumaAddr(m_rowAddr), stride, reconPic->m_picWidth, realH, reconPic->m_lumaMarginX);
257
+
258
+        if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
259
         {
260
-            uint32_t cuAddr = lineStartCUAddr + col;
261
-            const CUData* ctu = encData.getPicCTU(cuAddr);
262
-            deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER);
263
+            primitives.extendRowBorder(reconPic->getCbAddr(m_rowAddr), strideC, reconPic->m_picWidth >> hChromaShift, realH >> vChromaShift, reconPic->m_chromaMarginX);
264
+            primitives.extendRowBorder(reconPic->getCrAddr(m_rowAddr), strideC, reconPic->m_picWidth >> hChromaShift, realH >> vChromaShift, reconPic->m_chromaMarginX);
265
+        }
266
+    }
267
 
268
-            if (col > 0)
269
+    // Extra Left and Right border on first and last CU
270
+    if ((col == 0) | (col == m_frameFilter->m_numCols - 1))
271
+    {
272
+        copySizeY += lumaMarginX;
273
+        copySizeC += chromaMarginX;
274
+    }
275
+
276
+    // First column need extension left padding area and first CU
277
+    if (col == 0)
278
+    {
279
+        pixY -= lumaMarginX;
280
+        pixU -= chromaMarginX;
281
+        pixV -= chromaMarginX;
282
+    }
283
+
284
+    // Border extend Top
285
+    if (m_row == 0)
286
+    {
287
+        for (uint32_t y = 0; y < lumaMarginY; y++)
288
+            memcpy(pixY - (y + 1) * stride, pixY, copySizeY * sizeof(pixel));
289
+
290
+        if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
291
+        {
292
+            for (uint32_t y = 0; y < chromaMarginY; y++)
293
             {
294
-                const CUData* ctuPrev = encData.getPicCTU(cuAddr - 1);
295
-                deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);
296
+                memcpy(pixU - (y + 1) * strideC, pixU, copySizeC * sizeof(pixel));
297
+                memcpy(pixV - (y + 1) * strideC, pixV, copySizeC * sizeof(pixel));
298
             }
299
         }
300
+    }
301
 
302
-        const CUData* ctuPrev = encData.getPicCTU(lineStartCUAddr + numCols - 1);
303
-        deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[lineStartCUAddr + numCols - 1]], Deblock::EDGE_HOR);
304
+    // Border extend Bottom
305
+    if (m_row == m_frameFilter->m_numRows - 1)
306
+    {
307
+        pixY += (realH - 1) * stride;
308
+        pixU += ((realH >> vChromaShift) - 1) * strideC;
309
+        pixV += ((realH >> vChromaShift) - 1) * strideC;
310
+        for (uint32_t y = 0; y < lumaMarginY; y++)
311
+            memcpy(pixY + (y + 1) * stride, pixY, copySizeY * sizeof(pixel));
312
+
313
+        if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
314
+        {
315
+            for (uint32_t y = 0; y < chromaMarginY; y++)
316
+            {
317
+                memcpy(pixU + (y + 1) * strideC, pixU, copySizeC * sizeof(pixel));
318
+                memcpy(pixV + (y + 1) * strideC, pixV, copySizeC * sizeof(pixel));
319
+            }
320
+        }
321
     }
322
+}
323
 
324
-    // SAO
325
-    SAOParam* saoParam = encData.m_saoParam;
326
-    if (m_param->bEnableSAO)
327
+// NOTE: Single Threading only
328
+void FrameFilter::ParallelFilter::processTasks(int /*workerThreadId*/)
329
+{
330
+    SAOParam* saoParam = m_encData->m_saoParam;
331
+    const CUGeom* cuGeoms = m_frameFilter->m_frameEncoder->m_cuGeoms;
332
+    const uint32_t* ctuGeomMap = m_frameFilter->m_frameEncoder->m_ctuGeomMap;
333
+    PicYuv* reconPic = m_encData->m_reconPic;
334
+    const int colStart = m_lastCol.get();
335
+    // TODO: Waiting previous row finish or simple clip on it?
336
+    const int colEnd = m_allowedCol.get();
337
+    const int numCols = m_frameFilter->m_numCols;
338
+
339
+    // Avoid threading conflict
340
+    if (colStart >= colEnd)
341
+        return;
342
+
343
+    for (uint32_t col = (uint32_t)colStart; col < (uint32_t)colEnd; col++)
344
     {
345
-        m_sao.m_entropyCoder.load(m_frameEncoder->m_initSliceContext);
346
-        m_sao.m_rdContexts.next.load(m_frameEncoder->m_initSliceContext);
347
-        m_sao.m_rdContexts.cur.load(m_frameEncoder->m_initSliceContext);
348
+        const uint32_t cuAddr = m_rowAddr + col;
349
 
350
-        m_sao.rdoSaoUnitRow(saoParam, row);
351
+        if (m_frameFilter->m_param->bEnableLoopFilter)
352
+        {
353
+            const CUData* ctu = m_encData->getPicCTU(cuAddr);
354
+            deblockCTU(ctu, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_VER);
355
+        }
356
 
357
-        // NOTE: Delay a row because SAO decide need top row pixels at next row, is it HM's bug?
358
-        if (row >= m_saoRowDelay)
359
-            processSao(row - m_saoRowDelay);
360
-    }
361
+        if (col >= 1)
362
+        {
363
+            if (m_frameFilter->m_param->bEnableLoopFilter)
364
+            {
365
+                const CUData* ctuPrev = m_encData->getPicCTU(cuAddr - 1);
366
+                deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);
367
 
368
-    // this row of CTUs has been encoded
369
+                // When SAO Disable, setting column counter here
370
+                if ((!m_frameFilter->m_param->bEnableSAO) & (m_row >= 1))
371
+                    m_prevRow->processPostCu(col - 1);
372
+            }
373
 
374
-    if (row > 0)
375
-        processRowPost(row - 1);
376
+            if (m_frameFilter->m_param->bEnableSAO)
377
+            {
378
+                // Save SAO bottom row reference pixels
379
+                copySaoAboveRef(reconPic, cuAddr - 1, col - 1);
380
+
381
+                // SAO Decide
382
+                if (col >= 2)
383
+                {
384
+                    // NOTE: Delay 2 column to avoid mistake on below case, it is Deblock sync logic issue, less probability but still alive
385
+                    //       ... H V |
386
+                    //       ..S H V |
387
+                    m_sao.rdoSaoUnitCu(saoParam, m_rowAddr, col - 2, cuAddr - 2);
388
+                }
389
+
390
+                // Process Previous Row SAO CU
391
+                if (m_row >= 1 && col >= 3)
392
+                {
393
+                    // Must delay 1 row to avoid thread data race conflict
394
+                    m_prevRow->processSaoUnitCu(saoParam, col - 3);
395
+                    m_prevRow->processPostCu(col - 3);
396
+                }
397
+            }
398
 
399
-    if (row == m_numRows - 1)
400
+            m_lastDeblocked.set(col);
401
+        }
402
+        m_lastCol.incr();
403
+    }
404
+
405
+    if (colEnd == numCols)
406
     {
407
-        if (m_param->bEnableSAO)
408
+        const uint32_t cuAddr = m_rowAddr + numCols - 1;
409
+
410
+        if (m_frameFilter->m_param->bEnableLoopFilter)
411
         {
412
-            m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame);
413
+            const CUData* ctuPrev = m_encData->getPicCTU(cuAddr);
414
+            deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_HOR);
415
 
416
-            for (int i = m_numRows - m_saoRowDelay; i < m_numRows; i++)
417
-                processSao(i);
418
+            // When SAO Disable, setting column counter here
419
+            if ((!m_frameFilter->m_param->bEnableSAO) & (m_row >= 1))
420
+                m_prevRow->processPostCu(numCols - 1);
421
         }
422
 
423
-        processRowPost(row);
424
+        // TODO: move processPostCu() into processSaoUnitCu()
425
+        if (m_frameFilter->m_param->bEnableSAO)
426
+        {
427
+            // Save SAO bottom row reference pixels
428
+            copySaoAboveRef(reconPic, cuAddr, numCols - 1);
429
+
430
+            // SAO Decide
431
+            // NOTE: reduce condition check for 1 CU only video, Why someone play with it?
432
+            if (numCols >= 2)
433
+                m_sao.rdoSaoUnitCu(saoParam, m_rowAddr, numCols - 2, cuAddr - 1);
434
+
435
+            if (numCols >= 1)
436
+                m_sao.rdoSaoUnitCu(saoParam, m_rowAddr, numCols - 1, cuAddr);
437
+
438
+            // Process Previous Rows SAO CU
439
+            if (m_row >= 1 && numCols >= 3)
440
+            {
441
+                m_prevRow->processSaoUnitCu(saoParam, numCols - 3);
442
+                m_prevRow->processPostCu(numCols - 3);
443
+            }
444
+
445
+            if (m_row >= 1 && numCols >= 2)
446
+            {
447
+                m_prevRow->processSaoUnitCu(saoParam, numCols - 2);
448
+                m_prevRow->processPostCu(numCols - 2);
449
+            }
450
+
451
+            if (m_row >= 1 && numCols >= 1)
452
+            {
453
+                m_prevRow->processSaoUnitCu(saoParam, numCols - 1);
454
+                m_prevRow->processPostCu(numCols - 1);
455
+            }
456
+
457
+            // Setting column sync counter
458
+            if (m_row >= 1)
459
+                m_frameFilter->m_frame->m_reconColCount[m_row - 1].set(numCols - 1);
460
+        }
461
+        m_lastDeblocked.set(numCols);
462
     }
463
 }
464
 
465
-uint32_t FrameFilter::getCUHeight(int rowNum) const
466
+void FrameFilter::processRow(int row)
467
 {
468
-    return rowNum == m_numRows - 1 ? m_lastHeight : g_maxCUSize;
469
-}
470
+    ProfileScopeEvent(filterCTURow);
471
 
472
-void FrameFilter::processRowPost(int row)
473
-{
474
-    PicYuv *reconPic = m_frame->m_reconPic;
475
-    const uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
476
-    const uint32_t lineStartCUAddr = row * numCols;
477
-    const int realH = getCUHeight(row);
478
+#if DETAILED_CU_STATS
479
+    ScopedElapsedTime filterPerfScope(m_frameEncoder->m_cuStats.loopFilterElapsedTime);
480
+    m_frameEncoder->m_cuStats.countLoopFilter++;
481
+#endif
482
 
483
-    // Border extend Left and Right
484
-    primitives.extendRowBorder(reconPic->getLumaAddr(lineStartCUAddr), reconPic->m_stride, reconPic->m_picWidth, realH, reconPic->m_lumaMarginX);
485
-    primitives.extendRowBorder(reconPic->getCbAddr(lineStartCUAddr), reconPic->m_strideC, reconPic->m_picWidth >> m_hChromaShift, realH >> m_vChromaShift, reconPic->m_chromaMarginX);
486
-    primitives.extendRowBorder(reconPic->getCrAddr(lineStartCUAddr), reconPic->m_strideC, reconPic->m_picWidth >> m_hChromaShift, realH >> m_vChromaShift, reconPic->m_chromaMarginX);
487
+    if (!m_param->bEnableLoopFilter && !m_param->bEnableSAO)
488
+    {
489
+        processPostRow(row);
490
+        return;
491
+    }
492
+    FrameData& encData = *m_frame->m_encData;
493
 
494
-    // Border extend Top
495
-    if (!row)
496
+    // SAO: was integrate into encode loop
497
+    SAOParam* saoParam = encData.m_saoParam;
498
+
499
+    /* Processing left block Deblock with current threading */
500
     {
501
-        const intptr_t stride = reconPic->m_stride;
502
-        const intptr_t strideC = reconPic->m_strideC;
503
-        pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr) - reconPic->m_lumaMarginX;
504
-        pixel *pixU = reconPic->getCbAddr(lineStartCUAddr) - reconPic->m_chromaMarginX;
505
-        pixel *pixV = reconPic->getCrAddr(lineStartCUAddr) - reconPic->m_chromaMarginX;
506
+        /* stop threading on current row */
507
+        m_parallelFilter[row].waitForExit();
508
+
509
+        /* Check to avoid previous row process slower than current row */
510
+        X265_CHECK((row < 1) || m_parallelFilter[row - 1].m_lastDeblocked.get() == m_numCols, "previous row not finish");
511
 
512
-        for (uint32_t y = 0; y < reconPic->m_lumaMarginY; y++)
513
-            memcpy(pixY - (y + 1) * stride, pixY, stride * sizeof(pixel));
514
+        m_parallelFilter[row].m_allowedCol.set(m_numCols);
515
+        m_parallelFilter[row].processTasks(-1);
516
 
517
-        for (uint32_t y = 0; y < reconPic->m_chromaMarginY; y++)
518
+        if (row == m_numRows - 1)
519
         {
520
-            memcpy(pixU - (y + 1) * strideC, pixU, strideC * sizeof(pixel));
521
-            memcpy(pixV - (y + 1) * strideC, pixV, strideC * sizeof(pixel));
522
+            /* TODO: Early start last row */
523
+            if ((row >= 1) && (m_parallelFilter[row - 1].m_lastDeblocked.get() != m_numCols))
524
+                x265_log(m_param, X265_LOG_WARNING, "detected ParallelFilter race condition on last row\n");
525
+
526
+            /* Apply SAO on last row of CUs, because we always apply SAO on row[X-1] */
527
+            if (m_param->bEnableSAO)
528
+            {
529
+                for(int col = 0; col < m_numCols; col++)
530
+                {
531
+                    // NOTE: must use processSaoUnitCu(), it include TQBypass logic
532
+                    m_parallelFilter[row].processSaoUnitCu(saoParam, col);
533
+                }
534
+            }
535
+
536
+            // Process border extension on last row
537
+            for(int col = 0; col < m_numCols; col++)
538
+            {
539
+                // m_reconColCount will be set in processPostCu()
540
+                m_parallelFilter[row].processPostCu(col);
541
+            }
542
         }
543
     }
544
 
545
-    // Border extend Bottom
546
+    // this row of CTUs has been encoded
547
+
548
+    if (row > 0)
549
+        processPostRow(row - 1);
550
+
551
     if (row == m_numRows - 1)
552
     {
553
-        const intptr_t stride = reconPic->m_stride;
554
-        const intptr_t strideC = reconPic->m_strideC;
555
-        pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr) - reconPic->m_lumaMarginX + (realH - 1) * stride;
556
-        pixel *pixU = reconPic->getCbAddr(lineStartCUAddr) - reconPic->m_chromaMarginX + ((realH >> m_vChromaShift) - 1) * strideC;
557
-        pixel *pixV = reconPic->getCrAddr(lineStartCUAddr) - reconPic->m_chromaMarginX + ((realH >> m_vChromaShift) - 1) * strideC;
558
-        for (uint32_t y = 0; y < reconPic->m_lumaMarginY; y++)
559
-            memcpy(pixY + (y + 1) * stride, pixY, stride * sizeof(pixel));
560
-
561
-        for (uint32_t y = 0; y < reconPic->m_chromaMarginY; y++)
562
+        if (m_param->bEnableSAO)
563
         {
564
-            memcpy(pixU + (y + 1) * strideC, pixU, strideC * sizeof(pixel));
565
-            memcpy(pixV + (y + 1) * strideC, pixV, strideC * sizeof(pixel));
566
+            // Merge numNoSao into RootNode (Node0)
567
+            for(int i = 1; i < m_numRows; i++)
568
+            {
569
+                m_parallelFilter[0].m_sao.m_numNoSao[0] += m_parallelFilter[i].m_sao.m_numNoSao[0];
570
+                m_parallelFilter[0].m_sao.m_numNoSao[1] += m_parallelFilter[i].m_sao.m_numNoSao[1];
571
+            }
572
+
573
+            m_parallelFilter[0].m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame);
574
         }
575
+        processPostRow(row);
576
     }
577
+}
578
+
579
+void FrameFilter::processPostRow(int row)
580
+{
581
+    PicYuv *reconPic = m_frame->m_reconPic;
582
+    const uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
583
+    const uint32_t lineStartCUAddr = row * numCols;
584
 
585
     // Notify other FrameEncoders that this row of reconstructed pixels is available
586
     m_frame->m_reconRowCount.incr();
587
@@ -217,26 +526,30 @@
588
 
589
         intptr_t stride = reconPic->m_stride;
590
         uint32_t width  = reconPic->m_picWidth - m_pad[0];
591
-        uint32_t height = getCUHeight(row);
592
+        uint32_t height = m_parallelFilter[row].getCUHeight();
593
 
594
         uint64_t ssdY = computeSSD(fencPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height);
595
-        height >>= m_vChromaShift;
596
-        width  >>= m_hChromaShift;
597
-        stride = reconPic->m_strideC;
598
+        m_frameEncoder->m_SSDY += ssdY;
599
 
600
-        uint64_t ssdU = computeSSD(fencPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height);
601
-        uint64_t ssdV = computeSSD(fencPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height);
602
+        if (m_param->internalCsp != X265_CSP_I400)
603
+        {
604
+            height >>= m_vChromaShift;
605
+            width >>= m_hChromaShift;
606
+            stride = reconPic->m_strideC;
607
 
608
-        m_frameEncoder->m_SSDY += ssdY;
609
-        m_frameEncoder->m_SSDU += ssdU;
610
-        m_frameEncoder->m_SSDV += ssdV;
611
+            uint64_t ssdU = computeSSD(fencPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height);
612
+            uint64_t ssdV = computeSSD(fencPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height);
613
+
614
+            m_frameEncoder->m_SSDU += ssdU;
615
+            m_frameEncoder->m_SSDV += ssdV;
616
+        }
617
     }
618
     if (m_param->bEnableSsim && m_ssimBuf)
619
     {
620
-        pixel *rec = m_frame->m_reconPic->m_picOrg[0];
621
+        pixel *rec = reconPic->m_picOrg[0];
622
         pixel *fenc = m_frame->m_fencPic->m_picOrg[0];
623
-        intptr_t stride1 = m_frame->m_fencPic->m_stride;
624
-        intptr_t stride2 = m_frame->m_reconPic->m_stride;
625
+        intptr_t stride1 = reconPic->m_stride;
626
+        intptr_t stride2 = m_frame->m_fencPic->m_stride;
627
         uint32_t bEnd = ((row + 1) == (this->m_numRows - 1));
628
         uint32_t bStart = (row == 0);
629
         uint32_t minPixY = row * g_maxCUSize - 4 * !bStart;
630
@@ -253,55 +566,75 @@
631
     }
632
     if (m_param->decodedPictureHashSEI == 1)
633
     {
634
-        uint32_t height = getCUHeight(row);
635
+        uint32_t height = m_parallelFilter[row].getCUHeight();
636
         uint32_t width = reconPic->m_picWidth;
637
         intptr_t stride = reconPic->m_stride;
638
 
639
         if (!row)
640
-        {
641
-            for (int i = 0; i < 3; i++)
642
-                MD5Init(&m_frameEncoder->m_state[i]);
643
-        }
644
+            MD5Init(&m_frameEncoder->m_state[0]);
645
 
646
         updateMD5Plane(m_frameEncoder->m_state[0], reconPic->getLumaAddr(cuAddr), width, height, stride);
647
-        width  >>= m_hChromaShift;
648
-        height >>= m_vChromaShift;
649
-        stride = reconPic->m_strideC;
650
+        if (m_param->internalCsp != X265_CSP_I400)
651
+        {
652
+            if (!row)
653
+            {
654
+                MD5Init(&m_frameEncoder->m_state[1]);
655
+                MD5Init(&m_frameEncoder->m_state[2]);
656
+            }
657
 
658
-        updateMD5Plane(m_frameEncoder->m_state[1], reconPic->getCbAddr(cuAddr), width, height, stride);
659
-        updateMD5Plane(m_frameEncoder->m_state[2], reconPic->getCrAddr(cuAddr), width, height, stride);
660
+            width >>= m_hChromaShift;
661
+            height >>= m_vChromaShift;
662
+            stride = reconPic->m_strideC;
663
+
664
+            updateMD5Plane(m_frameEncoder->m_state[1], reconPic->getCbAddr(cuAddr), width, height, stride);
665
+            updateMD5Plane(m_frameEncoder->m_state[2], reconPic->getCrAddr(cuAddr), width, height, stride);
666
+        }
667
     }
668
     else if (m_param->decodedPictureHashSEI == 2)
669
     {
670
-        uint32_t height = getCUHeight(row);
671
+        uint32_t height = m_parallelFilter[row].getCUHeight();
672
         uint32_t width = reconPic->m_picWidth;
673
         intptr_t stride = reconPic->m_stride;
674
+
675
         if (!row)
676
-            m_frameEncoder->m_crc[0] = m_frameEncoder->m_crc[1] = m_frameEncoder->m_crc[2] = 0xffff;
677
+            m_frameEncoder->m_crc[0] = 0xffff;
678
+
679
         updateCRC(reconPic->getLumaAddr(cuAddr), m_frameEncoder->m_crc[0], height, width, stride);
680
-        width  >>= m_hChromaShift;
681
-        height >>= m_vChromaShift;
682
-        stride = reconPic->m_strideC;
683
+        if (m_param->internalCsp != X265_CSP_I400)
684
+        {
685
+            width >>= m_hChromaShift;
686
+            height >>= m_vChromaShift;
687
+            stride = reconPic->m_strideC;
688
+            m_frameEncoder->m_crc[1] = m_frameEncoder->m_crc[2] = 0xffff;
689
 
690
-        updateCRC(reconPic->getCbAddr(cuAddr), m_frameEncoder->m_crc[1], height, width, stride);
691
-        updateCRC(reconPic->getCrAddr(cuAddr), m_frameEncoder->m_crc[2], height, width, stride);
692
+            updateCRC(reconPic->getCbAddr(cuAddr), m_frameEncoder->m_crc[1], height, width, stride);
693
+            updateCRC(reconPic->getCrAddr(cuAddr), m_frameEncoder->m_crc[2], height, width, stride);
694
+        }
695
     }
696
     else if (m_param->decodedPictureHashSEI == 3)
697
     {
698
         uint32_t width = reconPic->m_picWidth;
699
-        uint32_t height = getCUHeight(row);
700
+        uint32_t height = m_parallelFilter[row].getCUHeight();
701
         intptr_t stride = reconPic->m_stride;
702
         uint32_t cuHeight = g_maxCUSize;
703
+
704
         if (!row)
705
-            m_frameEncoder->m_checksum[0] = m_frameEncoder->m_checksum[1] = m_frameEncoder->m_checksum[2] = 0;
706
+            m_frameEncoder->m_checksum[0] = 0;
707
+
708
         updateChecksum(reconPic->m_picOrg[0], m_frameEncoder->m_checksum[0], height, width, stride, row, cuHeight);
709
-        width  >>= m_hChromaShift;
710
-        height >>= m_vChromaShift;
711
-        stride = reconPic->m_strideC;
712
-        cuHeight >>= m_vChromaShift;
713
+        if (m_param->internalCsp != X265_CSP_I400)
714
+        {
715
+            width >>= m_hChromaShift;
716
+            height >>= m_vChromaShift;
717
+            stride = reconPic->m_strideC;
718
+            cuHeight >>= m_vChromaShift;
719
+
720
+            if (!row)
721
+                m_frameEncoder->m_checksum[1] = m_frameEncoder->m_checksum[2] = 0;
722
 
723
-        updateChecksum(reconPic->m_picOrg[1], m_frameEncoder->m_checksum[1], height, width, stride, row, cuHeight);
724
-        updateChecksum(reconPic->m_picOrg[2], m_frameEncoder->m_checksum[2], height, width, stride, row, cuHeight);
725
+            updateChecksum(reconPic->m_picOrg[1], m_frameEncoder->m_checksum[1], height, width, stride, row, cuHeight);
726
+            updateChecksum(reconPic->m_picOrg[2], m_frameEncoder->m_checksum[2], height, width, stride, row, cuHeight);
727
+        }
728
     }
729
 
730
     if (ATOMIC_INC(&m_frameEncoder->m_completionCount) == 2 * (int)m_frameEncoder->m_numRows)
731
@@ -400,79 +733,3 @@
732
     cnt = (height - 1) * (width - 1);
733
     return ssim;
734
 }
735
-
736
-/* restore original YUV samples to recon after SAO (if lossless) */
737
-static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx)
738
-{
739
-    int size = cu->m_log2CUSize[absPartIdx] - 2;
740
-    uint32_t cuAddr = cu->m_cuAddr;
741
-
742
-    PicYuv* reconPic = frame.m_reconPic;
743
-    PicYuv* fencPic  = frame.m_fencPic;
744
-
745
-    pixel* dst = reconPic->getLumaAddr(cuAddr, absPartIdx);
746
-    pixel* src = fencPic->getLumaAddr(cuAddr, absPartIdx);
747
-
748
-    primitives.cu[size].copy_pp(dst, reconPic->m_stride, src, fencPic->m_stride);
749
-   
750
-    pixel* dstCb = reconPic->getCbAddr(cuAddr, absPartIdx);
751
-    pixel* srcCb = fencPic->getCbAddr(cuAddr, absPartIdx);
752
-
753
-    pixel* dstCr = reconPic->getCrAddr(cuAddr, absPartIdx);
754
-    pixel* srcCr = fencPic->getCrAddr(cuAddr, absPartIdx);
755
-
756
-    int csp = fencPic->m_picCsp;
757
-    primitives.chroma[csp].cu[size].copy_pp(dstCb, reconPic->m_strideC, srcCb, fencPic->m_strideC);
758
-    primitives.chroma[csp].cu[size].copy_pp(dstCr, reconPic->m_strideC, srcCr, fencPic->m_strideC);
759
-}
760
-
761
-/* Original YUV restoration for CU in lossless coding */
762
-static void origCUSampleRestoration(const CUData* cu, const CUGeom& cuGeom, Frame& frame)
763
-{
764
-    uint32_t absPartIdx = cuGeom.absPartIdx;
765
-    if (cu->m_cuDepth[absPartIdx] > cuGeom.depth)
766
-    {
767
-        for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++)
768
-        {
769
-            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
770
-            if (childGeom.flags & CUGeom::PRESENT)
771
-                origCUSampleRestoration(cu, childGeom, frame);
772
-        }
773
-        return;
774
-    }
775
-
776
-    // restore original YUV samples
777
-    if (cu->m_tqBypass[absPartIdx])
778
-        restoreOrigLosslessYuv(cu, frame, absPartIdx);
779
-}
780
-
781
-void FrameFilter::processSao(int row)
782
-{
783
-    FrameData& encData = *m_frame->m_encData;
784
-    SAOParam* saoParam = encData.m_saoParam;
785
-
786
-    if (saoParam->bSaoFlag[0])
787
-        m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0);
788
-
789
-    if (saoParam->bSaoFlag[1])
790
-    {
791
-        m_sao.processSaoUnitRow(saoParam->ctuParam[1], row, 1);
792
-        m_sao.processSaoUnitRow(saoParam->ctuParam[2], row, 2);
793
-    }
794
-
795
-    if (encData.m_slice->m_pps->bTransquantBypassEnabled)
796
-    {
797
-        uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
798
-        uint32_t lineStartCUAddr = row * numCols;
799
-
800
-        const CUGeom* cuGeoms = m_frameEncoder->m_cuGeoms;
801
-        const uint32_t* ctuGeomMap = m_frameEncoder->m_ctuGeomMap;
802
-
803
-        for (uint32_t col = 0; col < numCols; col++)
804
-        {
805
-            uint32_t cuAddr = lineStartCUAddr + col;
806
-            const CUData* ctu = encData.getPicCTU(cuAddr);
807
-            origCUSampleRestoration(ctu, cuGeoms[ctuGeomMap[cuAddr]], *m_frame);
808
-        }
809
-    }
810
-}
811
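For orientation, the framefilter.cpp rewrite above turns the old row-at-a-time deblock/SAO pass into a per-CTU pipeline: within a row, vertical deblock runs on the newest column, horizontal deblock trails by one column, the SAO decision by two, and the previous row's SAO apply plus border post-processing by three. A minimal sketch of that staggering follows; the helper names are hypothetical stand-ins, not the actual x265 functions:

    #include <cstdio>

    // Hypothetical stand-ins for the per-CTU stages (illustration only)
    static void deblockVertical(int row, int col)   { std::printf("V %d,%d\n", row, col); }
    static void deblockHorizontal(int row, int col) { std::printf("H %d,%d\n", row, col); }
    static void saoDecide(int row, int col)         { std::printf("S %d,%d\n", row, col); }
    static void saoApplyPrevRow(int row, int col)   { std::printf("A %d,%d\n", row, col); }

    // One CTU row as a staggered pipeline, mirroring ParallelFilter::processTasks()
    void filterRowSketch(int row, int numCols)
    {
        for (int col = 0; col < numCols; col++)
        {
            deblockVertical(row, col);             // EDGE_VER on the current CTU
            if (col >= 1)
                deblockHorizontal(row, col - 1);   // EDGE_HOR must lag one column
            if (col >= 2)
                saoDecide(row, col - 2);           // SAO RDO lags two columns (deblock sync)
            if (row >= 1 && col >= 3)
                saoApplyPrevRow(row - 1, col - 3); // previous row lags one row, three columns
        }
        // the real code flushes the trailing columns once the whole row is released
    }

The staggering exists because each stage reads pixels that the preceding stage may still modify; the delays guarantee a stage only ever touches CTUs that its predecessor has fully finished.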
x265_1.8.tar.gz/source/encoder/framefilter.h -> x265_1.9.tar.gz/source/encoder/framefilter.h Changed
 
@@ -29,6 +29,7 @@
 #include "frame.h"
 #include "deblock.h"
 #include "sao.h"
+#include "threadpool.h" // class BondedTaskGroup
 
 namespace X265_NS {
 // private x265 namespace
@@ -39,7 +40,7 @@
 struct ThreadLocalData;
 
 // Manages the processing of a single frame loopfilter
-class FrameFilter : public Deblock
+class FrameFilter
 {
 public:
 
@@ -50,24 +51,86 @@
     int           m_vChromaShift;
     int           m_pad[2];
 
-    SAO           m_sao;
     int           m_numRows;
+    int           m_numCols;
     int           m_saoRowDelay;
     int           m_lastHeight;
+    int           m_lastWidth;
     
-    void*         m_ssimBuf; /* Temp storage for ssim computation */
+    void*         m_ssimBuf;        /* Temp storage for ssim computation */
 
-    FrameFilter();
+#define MAX_PFILTER_CUS     (4) /* maximum CUs per thread */
+    class ParallelFilter : public BondedTaskGroup, public Deblock
+    {
+    public:
+        uint32_t            m_rowHeight;
+        int                 m_row;
+        uint32_t            m_rowAddr;
+        FrameFilter*        m_frameFilter;
+        FrameData*          m_encData;
+        ParallelFilter*     m_prevRow;
+        SAO                 m_sao;
+        ThreadSafeInteger   m_lastCol;          /* The next column to process */
+        ThreadSafeInteger   m_allowedCol;       /* The highest column released by the encode pipeline */
+        ThreadSafeInteger   m_lastDeblocked;    /* The last column that finished all deblock stages */
 
-    void init(Encoder *top, FrameEncoder *frame, int numRows);
+        ParallelFilter()
+            : m_rowHeight(0)
+            , m_row(0)
+            , m_rowAddr(0)
+            , m_frameFilter(NULL)
+            , m_encData(NULL)
+            , m_prevRow(NULL)
+        {
+        }
+
+        ~ParallelFilter()
+        { }
+
+        void processTasks(int workerThreadId);
+
+        // Apply SAO on a CU in current row
+        void processSaoUnitCu(SAOParam *saoParam, int col);
+
+        // Copy and save the SAO reference pixels for the SAO RDO decision
+        void copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col);
+
+        // Post-process (border extension)
+        void processPostCu(int col) const;
+
+        uint32_t getCUHeight() const
+        {
+            return m_rowHeight;
+        }
+
+    protected:
+
+        ParallelFilter operator=(const ParallelFilter&);
+    };
+
+    ParallelFilter*     m_parallelFilter;
+
+    FrameFilter()
+        : m_param(NULL)
+        , m_frame(NULL)
+        , m_frameEncoder(NULL)
+        , m_ssimBuf(NULL)
+        , m_parallelFilter(NULL)
+    {
+    }
+
+    uint32_t getCUWidth(int colNum) const
+    {
+        return (colNum == (int)m_numCols - 1) ? m_lastWidth : g_maxCUSize;
+    }
+
+    void init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols);
     void destroy();
 
     void start(Frame *pic, Entropy& initState, int qp);
 
     void processRow(int row);
-    void processRowPost(int row);
-    void processSao(int row);
-    uint32_t getCUHeight(int rowNum) const;
+    void processPostRow(int row);
 };
 }
 
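The three ThreadSafeInteger counters declared above form a simple producer/consumer hand-off: the frame encoder raises m_allowedCol as CTUs are reconstructed, and processTasks() consumes the half-open range [m_lastCol, m_allowedCol). A reduced sketch of the idea, using std::atomic in place of x265's ThreadSafeInteger (illustration only, under that substitution):

    #include <atomic>

    struct RowFilterSketch
    {
        std::atomic<int> lastCol{0};     // next column to filter (consumer cursor)
        std::atomic<int> allowedCol{0};  // columns released by the encode pipeline

        void processTasks()
        {
            int colStart = lastCol.load(std::memory_order_acquire);
            int colEnd   = allowedCol.load(std::memory_order_acquire);
            if (colStart >= colEnd)
                return;                  // nothing released yet; avoid a race
            for (int col = colStart; col < colEnd; col++)
            {
                /* per-CTU deblock/SAO work would go here */
                lastCol.fetch_add(1, std::memory_order_release); // publish progress
            }
        }
    };

The producer side is one line per reconstructed CTU, e.g. filter.allowedCol.store(col + 1), after which a worker can be woken to call processTasks(); because only one worker runs per row (the class is documented single-threaded), the plain load/advance loop is safe.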
x265_1.8.tar.gz/source/encoder/level.cpp -> x265_1.9.tar.gz/source/encoder/level.cpp Changed
 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -462,7 +463,7 @@
     {
         if (param->internalCsp != X265_CSP_I420)
         {
-            x265_log(param, X265_LOG_ERROR, "%s profile not compatible with %s input color space.\n",
+            x265_log(param, X265_LOG_ERROR, "%s profile not compatible with %s input chroma subsampling.\n",
                      profile, x265_source_csp_names[param->internalCsp]);
             return -1;
         }
@@ -472,7 +473,7 @@
     {
         if (param->internalCsp != X265_CSP_I420 && param->internalCsp != X265_CSP_I422)
         {
-            x265_log(param, X265_LOG_ERROR, "%s profile not compatible with %s input color space.\n",
+            x265_log(param, X265_LOG_ERROR, "%s profile not compatible with %s input chroma subsampling.\n",
                      profile, x265_source_csp_names[param->internalCsp]);
             return -1;
         }
x265_1.8.tar.gz/source/encoder/motion.cpp -> x265_1.9.tar.gz/source/encoder/motion.cpp Changed
 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -188,11 +189,12 @@
     satd = primitives.pu[partEnum].satd;
     sad_x3 = primitives.pu[partEnum].sad_x3;
     sad_x4 = primitives.pu[partEnum].sad_x4;
+
     chromaSatd = primitives.chroma[fencPUYuv.m_csp].pu[partEnum].satd;
 
     /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size
      * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */
-    bChromaSATD = subpelRefine > 2 && chromaSatd;
+    bChromaSATD = subpelRefine > 2 && chromaSatd && (srcFencYuv.m_csp != X265_CSP_I400);
     X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n");
 
     ctuAddr = _ctuAddr;
@@ -1214,8 +1216,11 @@
         const pixel* refCb = ref->getCbAddr(ctuAddr, absPartIdx) + refOffset;
         const pixel* refCr = ref->getCrAddr(ctuAddr, absPartIdx) + refOffset;
 
-        xFrac = qmv.x & ((1 << shiftHor) - 1);
-        yFrac = qmv.y & ((1 << shiftVer) - 1);
+        X265_CHECK((hshift == 0) || (hshift == 1), "hshift must be 0 or 1\n");
+        X265_CHECK((vshift == 0) || (vshift == 1), "vshift must be 0 or 1\n");
+
+        xFrac = qmv.x & (hshift ? 7 : 3);
+        yFrac = qmv.y & (vshift ? 7 : 3);
 
         if (!(yFrac | xFrac))
        {
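In the motion.cpp hunk above, qmv is stored in quarter-pel units relative to luma. When the chroma plane is subsampled (hshift or vshift of 1, as in 4:2:0), the same vector addresses eighth-pel positions in chroma, so the fractional part needs three bits instead of two. A small standalone worked example of the masking (not x265 code):

    int hshift = 1;                           // 4:2:0 horizontal chroma subsampling
    int qmvX   = 13;                          // motion vector x, quarter-pel units
    int lumaFrac   = qmvX & 3;                // = 1 -> luma phase 1/4 pel
    int chromaFrac = qmvX & (hshift ? 7 : 3); // = 5 -> chroma phase 5/8 pel

The old expression (1 << shiftHor) - 1 computed the same mask; the new form makes the two legal shift values explicit and asserts them.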
x265_1.8.tar.gz/source/encoder/motion.h -> x265_1.9.tar.gz/source/encoder/motion.h Changed
 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
x265_1.8.tar.gz/source/encoder/nal.cpp -> x265_1.9.tar.gz/source/encoder/nal.cpp Changed
 
@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
x265_1.8.tar.gz/source/encoder/ratecontrol.cpp -> x265_1.9.tar.gz/source/encoder/ratecontrol.cpp Changed
 
@@ -23,6 +23,10 @@
  * For more information, contact us at license @ x265.com.
  *****************************************************************************/
 
+#if _MSC_VER
+#pragma warning(disable: 4127) // conditional expression is constant, yes I know
+#endif
+
 #include "common.h"
 #include "param.h"
 #include "frame.h"
@@ -142,6 +146,9 @@
     rce->expectedVbv = rce2Pass->expectedVbv;
     rce->blurredComplexity = rce2Pass->blurredComplexity;
     rce->sliceType = rce2Pass->sliceType;
+    rce->qpNoVbv = rce2Pass->qpNoVbv;
+    rce->newQp = rce2Pass->newQp;
+    rce->qRceq = rce2Pass->qRceq;
 }
 
 }  // end anonymous namespace
@@ -205,7 +212,7 @@
             m_rateFactorMaxDecrement = m_param->rc.rfConstant - m_param->rc.rfConstantMin;
     }
     m_isAbr = m_param->rc.rateControlMode != X265_RC_CQP && !m_param->rc.bStatRead;
-    m_2pass = m_param->rc.rateControlMode == X265_RC_ABR && m_param->rc.bStatRead;
+    m_2pass = (m_param->rc.rateControlMode == X265_RC_ABR || m_param->rc.vbvMaxBitrate > 0) && m_param->rc.bStatRead;
     m_bitrate = m_param->rc.bitrate * 1000;
     m_frameDuration = (double)m_param->fpsDenom / m_param->fpsNum;
     m_qp = m_param->rc.qp;
@@ -219,6 +226,7 @@
     m_cutreeStatFileOut = m_cutreeStatFileIn = NULL;
     m_rce2Pass = NULL;
     m_lastBsliceSatdCost = 0;
+    m_movingAvgSum = 0.0;
 
     // vbv initialization
     m_param->rc.vbvBufferSize = x265_clip3(0, 2000000, m_param->rc.vbvBufferSize);
@@ -444,6 +452,7 @@
                 CMP_OPT_FIRST_PASS("open-gop", m_param->bOpenGOP);
                 CMP_OPT_FIRST_PASS("keyint", m_param->keyframeMax);
                 CMP_OPT_FIRST_PASS("scenecut", m_param->scenecutThreshold);
+                CMP_OPT_FIRST_PASS("intra-refresh", m_param->bIntraRefresh);
 
                 if ((p = strstr(opts, "b-adapt=")) != 0 && sscanf(p, "b-adapt=%d", &i) && i >= X265_B_ADAPT_NONE && i <= X265_B_ADAPT_TRELLIS)
                 {
@@ -488,6 +497,12 @@
                  x265_log(m_param, X265_LOG_ERROR, "Rce Entries for 2 pass cannot be allocated\n");
                  return false;
             }
+            m_encOrder = X265_MALLOC(int, m_numEntries);
+            if (!m_encOrder)
+            {
+                x265_log(m_param, X265_LOG_ERROR, "Encode order for 2 pass cannot be allocated\n");
+                return false;
+            }
             /* init all to skipped p frames */
             for (int i = 0; i < m_numEntries; i++)
             {
@@ -504,22 +519,24 @@
             {
                 RateControlEntry *rce;
                 int frameNumber;
+                int encodeOrder;
                 char picType;
                 int e;
                 char *next;
-                double qpRc, qpAq;
+                double qpRc, qpAq, qNoVbv, qRceq;
                 next = strstr(p, ";");
                 if (next)
                     *next++ = 0;
-                e = sscanf(p, " in:%d ", &frameNumber);
+                e = sscanf(p, " in:%d out:%d", &frameNumber, &encodeOrder);
                 if (frameNumber < 0 || frameNumber >= m_numEntries)
                 {
                     x265_log(m_param, X265_LOG_ERROR, "bad frame number (%d) at stats line %d\n", frameNumber, i);
                     return false;
                 }
-                rce = &m_rce2Pass[frameNumber];
-                e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf",
-                       &picType, &qpRc, &qpAq, &rce->coeffBits,
+                rce = &m_rce2Pass[encodeOrder];
+                m_encOrder[frameNumber] = encodeOrder;
+                e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf",
+                       &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
                        &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
                        &rce->skipCuCount);
                 rce->keptAsRef = true;
@@ -538,13 +555,16 @@
                     x265_log(m_param, X265_LOG_ERROR, "statistics are damaged at line %d, parser out=%d\n", i, e);
                     return false;
                 }
-                rce->qScale = x265_qp2qScale(qpRc);
+                rce->qScale = rce->newQScale = x265_qp2qScale(qpRc);
                 totalQpAq += qpAq;
+                rce->qpNoVbv = qNoVbv;
+                rce->qpaRc = qpRc;
+                rce->qpAq = qpAq;
+                rce->qRceq = qRceq;
                 p = next;
             }
             X265_FREE(statsBuf);
-
-            if (m_param->rc.rateControlMode == X265_RC_ABR)
+            if (m_param->rc.rateControlMode == X265_RC_ABR || m_param->rc.vbvMaxBitrate > 0)
             {
                 if (!initPass2())
                     return false;
@@ -627,11 +647,8 @@
 
     #undef MAX_DURATION
 }
-
-bool RateControl::initPass2()
+bool RateControl::analyseABR2Pass(int startIndex, int endIndex, uint64_t allAvailableBits)
 {
-    uint64_t allConstBits = 0;
-    uint64_t allAvailableBits = uint64_t(m_param->rc.bitrate * 1000. * m_numEntries * m_frameDuration);
     double rateFactor, stepMult;
     double qBlur = m_param->rc.qblur;
     double cplxBlur = m_param->rc.complexityBlur;
     double *qScale, *blurredQscale;
     double baseCplx = m_ncu * (m_param->bframes ? 120 : 80);
     double clippedDuration = CLIP_DURATION(m_frameDuration) / BASE_FRAME_DURATION;
-
-    /* find total/average complexity & const_bits */
-    for (int i = 0; i < m_numEntries; i++)
-        allConstBits += m_rce2Pass[i].miscBits;
-
-    if (allAvailableBits < allConstBits)
-    {
-        x265_log(m_param, X265_LOG_ERROR, "requested bitrate is too low. estimated minimum is %d kbps\n",
-                 (int)(allConstBits * m_fps / m_numEntries * 1000.));
-        return false;
-    }
-
+    int framesCount = endIndex - startIndex + 1;
     /* Blur complexities, to reduce local fluctuation of QP.
      * We don't blur the QPs directly, because then one very simple frame
      * could drag down the QP of a nearby complex frame and give it more
      * bits than intended. */
-    for (int i = 0; i < m_numEntries; i++)
+    for (int i = startIndex; i <= endIndex; i++)
     {
         double weightSum = 0;
         double cplxSum = 0;
         double weight = 1.0;
         double gaussianWeight;
         /* weighted average of cplx of future frames */
-        for (int j = 1; j < cplxBlur * 2 && j < m_numEntries - i; j++)
+        for (int j = 1; j < cplxBlur * 2 && j <= endIndex - i; j++)
         {
             RateControlEntry *rcj = &m_rce2Pass[i + j];
             weight *= 1 - pow(rcj->iCuCount / m_ncu, 2);
@@ -687,11 +693,10 @@
         }
         m_rce2Pass[i].blurredComplexity = cplxSum / weightSum;
     }
-
-    CHECKED_MALLOC(qScale, double, m_numEntries);
+    CHECKED_MALLOC(qScale, double, framesCount);
     if (filterSize > 1)
     {
-        CHECKED_MALLOC(blurredQscale, double, m_numEntries);
+        CHECKED_MALLOC(blurredQscale, double, framesCount);
     }
     else
         blurredQscale = qScale;
@@ -702,9 +707,8 @@
      * because qscale2bits is not invertible, but we can start with the simple
     * approximation of scaling the 1st pass by the ratio of bitrates.
     * The search range is probably overkill, but speed doesn't matter here. */
-
     expectedBits = 1;
-    for (int i = 0; i < m_numEntries; i++)
+    for (int i = startIndex; i <= endIndex; i++)
     {
         RateControlEntry* rce = &m_rce2Pass[i];
         double q = getQScale(rce, 1.0);
@@ -781,12 +785,10 @@
     X265_FREE(qScale);
     if (filterSize > 1)
         X265_FREE(blurredQscale);
-
     if (m_isVbv)
-        if (!vbv2Pass(allAvailableBits))
+        if (!vbv2Pass(allAvailableBits, endIndex, startIndex))
             return false;
-    expectedBits = countExpectedBits();
-
+    expectedBits = countExpectedBits(startIndex, endIndex);
     if (fabs(expectedBits / allAvailableBits - 1.0) > 0.01)
     {
         double avgq = 0;
@@ -819,7 +821,123 @@
     return false;
 }
 
-bool RateControl::vbv2Pass(uint64_t allAvailableBits)
+bool RateControl::initPass2()
+{
+    uint64_t allConstBits = 0, allCodedBits = 0;
+    uint64_t allAvailableBits = uint64_t(m_param->rc.bitrate * 1000. * m_numEntries * m_frameDuration);
+    int startIndex, framesCount, endIndex;
+    int fps = (int)(m_fps + 0.5);
+    startIndex = endIndex = framesCount = 0;
+    bool isQpModified = true;
+    int diffQp = 0;
+    double targetBits = 0;
+    double expectedBits = 0;
+    for (startIndex = 0, endIndex = 0; endIndex < m_numEntries; endIndex++)
+    {
+        allConstBits += m_rce2Pass[endIndex].miscBits;
+        allCodedBits += m_rce2Pass[endIndex].coeffBits + m_rce2Pass[endIndex].mvBits;
+        if (m_param->rc.rateControlMode == X265_RC_CRF)
+        {
+            framesCount = endIndex - startIndex + 1;
+            diffQp += int (m_rce2Pass[endIndex].qpaRc - m_rce2Pass[endIndex].qpNoVbv);
+            if (framesCount > fps)
+                diffQp -= int (m_rce2Pass[endIndex - fps].qpaRc - m_rce2Pass[endIndex - fps].qpNoVbv);
+            if (framesCount >= fps)
+            {
+                if (diffQp >= 1)
+                {
+                    if (!isQpModified && endIndex > fps)
+                    {
+                        double factor = 2;
+                        double step = 0;
+                        for (int start = endIndex; start <= endIndex + fps - 1 && start < m_numEntries; start++)
+                        {
+                            RateControlEntry *rce = &m_rce2Pass[start];
+                            targetBits += qScale2bits(rce, x265_qp2qScale(rce->qpNoVbv));
+                            expectedBits += qScale2bits(rce, rce->qScale);
+                        }
+                        if (expectedBits < 0.95 * targetBits)
+                        {
+                            isQpModified = true;
+                            while (endIndex + fps < m_numEntries)
+                            {
+                                step = pow(2, factor / 6.0);
+                                expectedBits = 0;
+                                for (int start = endIndex; start <= endIndex + fps - 1; start++)
+                                {
+                                    RateControlEntry *rce = &m_rce2Pass[start];
+                                    rce->newQScale = rce->qScale / step;
+                                    X265_CHECK(rce->newQScale >= 0, "new Qscale is negative\n");
+                                    expectedBits += qScale2bits(rce, rce->newQScale);
+                                    rce->newQp = x265_qScale2qp(rce->newQScale);
+                                }
+                                if (expectedBits >= targetBits && step > 1)
+                                    factor *= 0.90;
+                                else
+                                    break;
+                            }
+
+                            if (m_isVbv && endIndex + fps < m_numEntries)
+                                if (!vbv2Pass((uint64_t)targetBits, endIndex + fps - 1, endIndex))
+                                    return false;
+
+                            targetBits = 0;
+                            expectedBits = 0;
+
+                            for (int start = endIndex - fps; start <= endIndex - 1; start++)
+                            {
+                                RateControlEntry *rce = &m_rce2Pass[start];
+                                targetBits += qScale2bits(rce, x265_qp2qScale(rce->qpNoVbv));
+                            }
+                            while (1)
+                            {
+                                step = pow(2, factor / 6.0);
+                                expectedBits = 0;
+                                for (int start = endIndex - fps; start <= endIndex - 1; start++)
+                                {
+                                    RateControlEntry *rce = &m_rce2Pass[start];
+                                    rce->newQScale = rce->qScale * step;
+                                    X265_CHECK(rce->newQScale >= 0, "new Qscale is negative\n");
+                                    expectedBits += qScale2bits(rce, rce->newQScale);
+                                    rce->newQp = x265_qScale2qp(rce->newQScale);
+                                }
+                                if (expectedBits > targetBits && step > 1)
+                                    factor *= 1.1;
+                                else
+                                    break;
+                            }
+                            if (m_isVbv)
+                                if (!vbv2Pass((uint64_t)targetBits, endIndex - 1, endIndex - fps))
+                                    return false;
+                            diffQp = 0;
+                            startIndex = endIndex + 1;
+                            targetBits = expectedBits = 0;
+                        }
+                        else
+                            targetBits = expectedBits = 0;
+                    }
+                }
+                else
+                    isQpModified = false;
+            }
+        }
+    }
+
+    if (m_param->rc.rateControlMode == X265_RC_ABR)
+    {
+        if (allAvailableBits < allConstBits)
+        {
+            x265_log(m_param, X265_LOG_ERROR, "requested bitrate is too low. estimated minimum is %d kbps\n",
                     (int)(allConstBits * m_fps / framesCount * 1000.));
+            return false;
+        }
+        if (!analyseABR2Pass(0, m_numEntries - 1, allAvailableBits))
+            return false;
+    }
+    return true;
+}
+
+bool RateControl::vbv2Pass(uint64_t allAvailableBits, int endPos, int startPos)
 {
     /* for each interval of bufferFull .. underflow, uniformly increase the qp of all
      * frames in the interval until either buffer is full at some intermediate frame or the
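The CRF correction loop in initPass2() above nudges qScale in multiplicative steps of pow(2, factor / 6.0). Because qScale doubles for every 6 QP, the factor is effectively a QP delta: dividing qScale by the step lowers QP by exactly factor. A small self-contained check of that relationship (the 0.85 * 2^((qp-12)/6) mapping is the conventional x264/x265 definition of qp2qScale, assumed here):

    #include <cmath>

    double qp2qScale(double qp) { return 0.85 * std::pow(2.0, (qp - 12.0) / 6.0); }
    double qScale2qp(double qs) { return 12.0 + 6.0 * std::log2(qs / 0.85); }

    // factor = 2.0 -> step = 2^(2/6) ~= 1.26
    // qScale2qp(qp2qScale(30.0) / std::pow(2.0, 2.0 / 6.0)) == 28.0 (a 2-QP drop)

This is why the loop multiplies factor by 0.90 or 1.1 rather than adding to it: it is searching for the QP shift, in log-of-qScale space, that brings expectedBits back to the first pass's target for that one-second window.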
@@ -845,10 +963,10 @@
323
         {   /* not first iteration */
324
             adjustment = X265_MAX(X265_MIN(expectedBits / allAvailableBits, 0.999), 0.9);
325
             fills[-1] = m_bufferSize * m_param->rc.vbvBufferInit;
326
-            t0 = 0;
327
+            t0 = startPos;
328
             /* fix overflows */
329
             adjMin = 1;
330
-            while (adjMin && findUnderflow(fills, &t0, &t1, 1))
331
+            while (adjMin && findUnderflow(fills, &t0, &t1, 1, endPos))
332
             {
333
                 adjMin = fixUnderflow(t0, t1, adjustment, MIN_QPSCALE, MAX_MAX_QPSCALE);
334
                 t0 = t1;
335
@@ -859,20 +977,16 @@
336
         t0 = 0;
337
         /* fix underflows -- should be done after overflow, as we'd better undersize target than underflowing VBV */
338
         adjMax = 1;
339
-        while (adjMax && findUnderflow(fills, &t0, &t1, 0))
340
+        while (adjMax && findUnderflow(fills, &t0, &t1, 0, endPos))
341
             adjMax = fixUnderflow(t0, t1, 1.001, MIN_QPSCALE, MAX_MAX_QPSCALE );
342
-
343
-        expectedBits = countExpectedBits();
344
+        expectedBits = countExpectedBits(startPos, endPos);
345
     }
346
-    while ((expectedBits < .995 * allAvailableBits) && ((int64_t)(expectedBits+.5) > (int64_t)(prevBits+.5)));
347
-
348
+    while ((expectedBits < .995 * allAvailableBits) && ((int64_t)(expectedBits+.5) > (int64_t)(prevBits+.5)) && !(m_param->rc.rateControlMode == X265_RC_CRF));
349
     if (!adjMax)
350
         x265_log(m_param, X265_LOG_WARNING, "vbv-maxrate issue, qpmax or vbv-maxrate too low\n");
351
-
352
     /* store expected vbv filling values for tracking when encoding */
353
-    for (int i = 0; i < m_numEntries; i++)
354
+    for (int i = startPos; i <= endPos; i++)
355
         m_rce2Pass[i].expectedVbv = m_bufferSize - fills[i];
356
-
357
     X265_FREE(fills - 1);
358
     return true;
359
 
360
@@ -912,9 +1026,10 @@
361
                 m_param->bframes = 1;
362
             return X265_TYPE_AUTO;
363
         }
364
-        int frameType = m_rce2Pass[frameNum].sliceType == I_SLICE ? (frameNum > 0 && m_param->bOpenGOP ? X265_TYPE_I : X265_TYPE_IDR)
365
-                            : m_rce2Pass[frameNum].sliceType == P_SLICE ? X265_TYPE_P
366
-                            : (m_rce2Pass[frameNum].sliceType == B_SLICE && m_rce2Pass[frameNum].keptAsRef? X265_TYPE_BREF : X265_TYPE_B);
367
+        int index = m_encOrder[frameNum];
368
+        int frameType = m_rce2Pass[index].sliceType == I_SLICE ? (frameNum > 0 && m_param->bOpenGOP ? X265_TYPE_I : X265_TYPE_IDR)
369
+                        : m_rce2Pass[index].sliceType == P_SLICE ? X265_TYPE_P
370
+                        : (m_rce2Pass[index].sliceType == B_SLICE && m_rce2Pass[index].keptAsRef ? X265_TYPE_BREF : X265_TYPE_B);
371
         return frameType;
372
     }
373
     else
374
@@ -926,16 +1041,20 @@
375
     /* Frame Predictors used in vbv */
376
     for (int i = 0; i < 4; i++)
377
     {
378
+        m_pred[i].coeffMin = 1.0 / 4;
379
         m_pred[i].coeff = 1.0;
380
         m_pred[i].count = 1.0;
381
         m_pred[i].decay = 0.5;
382
         m_pred[i].offset = 0.0;
383
     }
384
     m_pred[0].coeff = m_pred[3].coeff = 0.75;
385
+    m_pred[0].coeffMin = m_pred[3].coeffMin = 0.75 / 4;
386
     if (m_param->rc.qCompress >= 0.8) // when tuned for grain 
387
     {
388
+        m_pred[1].coeffMin = 0.75 / 4;
389
         m_pred[1].coeff = 0.75;
390
-        m_pred[0].coeff = m_pred[3].coeff = 0.50;
391
+        m_pred[0].coeff = m_pred[3].coeff = 0.5;
392
+        m_pred[0].coeffMin = m_pred[3].coeffMin = 0.5 / 4;
393
     }
394
 }
395
 
396
@@ -965,10 +1084,11 @@
397
     if (m_param->rc.bStatRead)
398
     {
399
         X265_CHECK(rce->poc >= 0 && rce->poc < m_numEntries, "bad encode ordinal\n");
400
-        copyRceData(rce, &m_rce2Pass[rce->poc]);
401
+        int index = m_encOrder[rce->poc];
402
+        copyRceData(rce, &m_rce2Pass[index]);
403
     }
404
     rce->isActive = true;
405
-    bool isRefFrameScenecut = m_sliceType!= I_SLICE && m_curSlice->m_refPicList[0][0]->m_lowres.bScenecut == 1;
406
+    bool isRefFrameScenecut = m_sliceType!= I_SLICE && m_curSlice->m_refFrameList[0][0]->m_lowres.bScenecut;
407
     if (curFrame->m_lowres.bScenecut)
408
     {
409
         m_isSceneTransition = true;
410
@@ -995,6 +1115,7 @@
411
             {
412
                 for (int j = 0; j < 2; j++)
413
                 {
414
+                    rce->rowPreds[i][j].coeffMin = 0.25 / 4;
415
                     rce->rowPreds[i][j].coeff = 0.25;
416
                     rce->rowPreds[i][j].count = 1.0;
417
                     rce->rowPreds[i][j].decay = 0.5;
418
@@ -1029,6 +1150,17 @@
419
             }
420
         }
421
     }
422
+    if (!m_isAbr && m_2pass && m_param->rc.rateControlMode == X265_RC_CRF)
423
+    {
424
+        rce->qpPrev = x265_qScale2qp(rce->qScale);
425
+        rce->qScale = rce->newQScale;
426
+        rce->qpaRc = curEncData.m_avgQpRc = curEncData.m_avgQpAq = x265_qScale2qp(rce->newQScale);
427
+        m_qp = int(rce->qpaRc + 0.5);
428
+        rce->frameSizePlanned = qScale2bits(rce, rce->qScale);
429
+        m_framesDone++;
430
+        return m_qp;
431
+    }
432
+
433
     if (m_isAbr || m_2pass) // ABR,CRF
434
     {
435
         if (m_isAbr || m_isVbv)
436
@@ -1200,11 +1332,10 @@
437
     }
438
     return q;
439
 }
440
-
441
-double RateControl::countExpectedBits()
442
+double RateControl::countExpectedBits(int startPos, int endPos)
443
 {
444
     double expectedBits = 0;
445
-    for( int i = 0; i < m_numEntries; i++ )
446
+    for (int i = startPos; i <= endPos; i++)
447
     {
448
         RateControlEntry *rce = &m_rce2Pass[i];
449
         rce->expectedBits = (uint64_t)expectedBits;
450
@@ -1212,8 +1343,7 @@
451
     }
452
     return expectedBits;
453
 }
454
-
455
-bool RateControl::findUnderflow(double *fills, int *t0, int *t1, int over)
456
+bool RateControl::findUnderflow(double *fills, int *t0, int *t1, int over, int endPos)
457
 {
458
     /* find an interval ending on an overflow or underflow (depending on whether
459
      * we're adding or removing bits), and starting on the earliest frame that
460
@@ -1223,7 +1353,7 @@
461
     double fill = fills[*t0 - 1];
462
     double parity = over ? 1. : -1.;
463
     int start = -1, end = -1;
464
-    for (int i = *t0; i < m_numEntries; i++)
465
+    for (int i = *t0; i <= endPos; i++)
466
     {
467
         fill += (m_frameDuration * m_vbvMaxRate -
468
                  qScale2bits(&m_rce2Pass[i], m_rce2Pass[i].newQScale)) * parity;
469
@@ -1260,12 +1390,11 @@
470
     }
471
     return adjusted;
472
 }
473
-
474
 bool RateControl::cuTreeReadFor2Pass(Frame* frame)
475
 {
476
-    uint8_t sliceTypeActual = (uint8_t)m_rce2Pass[frame->m_poc].sliceType;
477
-
478
-    if (m_rce2Pass[frame->m_poc].keptAsRef)
479
+    int index = m_encOrder[frame->m_poc];
480
+    uint8_t sliceTypeActual = (uint8_t)m_rce2Pass[index].sliceType;
481
+    if (m_rce2Pass[index].keptAsRef)
482
     {
483
         /* TODO: We don't need pre-lookahead to measure AQ offsets, but there is currently
484
          * no way to signal this */
485
@@ -1347,18 +1476,28 @@
486
     {
487
         if (m_isAbr)
488
         {
489
-            double slidingWindowCplxSum = 0;
490
-            int start = m_sliderPos > s_slidingWindowFrames ?  m_sliderPos : 0;
491
-            for (int cnt = 0; cnt < s_slidingWindowFrames; cnt++, start++)
492
-            {
493
-                int pos = start % s_slidingWindowFrames;
494
-                slidingWindowCplxSum *= 0.5;
495
-                if (!m_satdCostWindow[pos])
496
-                    break;
497
-                slidingWindowCplxSum += m_satdCostWindow[pos];
498
+            int pos = m_sliderPos % s_slidingWindowFrames;
499
+            int addPos = (pos + s_slidingWindowFrames - 1) % s_slidingWindowFrames;
500
+            if (m_sliderPos > s_slidingWindowFrames)
501
+            {
502
+                const static double base = pow(0.5, s_slidingWindowFrames - 1);
503
+                m_movingAvgSum -= m_lastRemovedSatdCost * base;
504
+                m_movingAvgSum *= 0.5;
505
+                m_movingAvgSum += m_satdCostWindow[addPos];
506
             }
507
-            rce->movingAvgSum = slidingWindowCplxSum;
508
-            m_satdCostWindow[m_sliderPos % s_slidingWindowFrames] = rce->lastSatd;
509
+            else if (m_sliderPos == s_slidingWindowFrames)
510
+            {
511
+                m_movingAvgSum += m_satdCostWindow[addPos];
512
+            }
513
+            else if (m_sliderPos > 0)
514
+            {
515
+                m_movingAvgSum += m_satdCostWindow[addPos];
516
+                m_movingAvgSum *= 0.5;
517
+            }
518
+
519
+            rce->movingAvgSum = m_movingAvgSum;
520
+            m_lastRemovedSatdCost = m_satdCostWindow[pos];
521
+            m_satdCostWindow[pos] = rce->lastSatd;
522
             m_sliderPos++;
523
         }
524
     }
525
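
The rewritten window replaces an O(window) rescan per frame with an O(1) update of an exponentially decayed running sum: once the 50-entry window is full, the recurrence is sum' = (sum - oldest * 0.5^(N-1)) * 0.5 + newest. A standalone model of the same state machine:

    #include <cmath>
    #include <cstdint>

    const int N = 50;               // s_slidingWindowFrames in x265
    int64_t window[N] = {0};
    int64_t lastRemoved = 0;
    double  movingSum = 0;
    int     slider = 0;

    // Push one frame's SATD cost; return the decayed sum over the window.
    double pushSatd(int64_t satd)
    {
        int pos    = slider % N;
        int addPos = (pos + N - 1) % N;
        if (slider > N)
        {
            static const double base = std::pow(0.5, N - 1);
            movingSum -= lastRemoved * base; // oldest term is now fully decayed
            movingSum *= 0.5;                // decay every remaining term once
            movingSum += window[addPos];     // admit the previous frame's cost
        }
        else if (slider == N)
            movingSum += window[addPos];
        else if (slider > 0)
        {
            movingSum += window[addPos];
            movingSum *= 0.5;
        }
        lastRemoved = window[pos];           // remember what we overwrite
        window[pos] = satd;
        slider++;
        return movingSum;
    }
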
@@ -1367,10 +1506,10 @@
526
     {
527
         /* B-frames don't have independent rate control, but rather get the
528
          * average QP of the two adjacent P-frames + an offset */
529
-        Slice* prevRefSlice = m_curSlice->m_refPicList[0][0]->m_encData->m_slice;
530
-        Slice* nextRefSlice = m_curSlice->m_refPicList[1][0]->m_encData->m_slice;
531
-        double q0 = m_curSlice->m_refPicList[0][0]->m_encData->m_avgQpRc;
532
-        double q1 = m_curSlice->m_refPicList[1][0]->m_encData->m_avgQpRc;
533
+        Slice* prevRefSlice = m_curSlice->m_refFrameList[0][0]->m_encData->m_slice;
534
+        Slice* nextRefSlice = m_curSlice->m_refFrameList[1][0]->m_encData->m_slice;
535
+        double q0 = m_curSlice->m_refFrameList[0][0]->m_encData->m_avgQpRc;
536
+        double q1 = m_curSlice->m_refFrameList[1][0]->m_encData->m_avgQpRc;
537
         bool i0 = prevRefSlice->m_sliceType == I_SLICE;
538
         bool i1 = nextRefSlice->m_sliceType == I_SLICE;
539
         int dt0 = abs(m_curSlice->m_poc - prevRefSlice->m_poc);
540
@@ -1386,9 +1525,9 @@
541
                 q0 = q1;
542
             }
543
         }
544
-        if (prevRefSlice->m_sliceType == B_SLICE && IS_REFERENCED(m_curSlice->m_refPicList[0][0]))
545
+        if (prevRefSlice->m_sliceType == B_SLICE && IS_REFERENCED(m_curSlice->m_refFrameList[0][0]))
546
             q0 -= m_pbOffset / 2;
547
-        if (nextRefSlice->m_sliceType == B_SLICE && IS_REFERENCED(m_curSlice->m_refPicList[1][0]))
548
+        if (nextRefSlice->m_sliceType == B_SLICE && IS_REFERENCED(m_curSlice->m_refFrameList[1][0]))
549
             q1 -= m_pbOffset / 2;
550
         if (i0 && i1)
551
             q = (q0 + q1) / 2 + m_ipOffset;
552
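
This hunk only renames m_refPicList to m_refFrameList; the underlying rule stands: a B-frame takes a blend of the QPs of its two neighbouring reference frames plus an offset, with each neighbour pulled down by half the PB offset if it is itself a referenced B. A condensed sketch of the whole blend; only the I/I branch and the neighbour adjustments are visible in the hunk, the remaining branches follow the x264-derived logic this code inherits:

    // QP blend for a B-frame from its two reference frames.
    double bFrameQp(double q0, double q1, int dt0, int dt1,
                    bool i0, bool i1, bool curIsReferenced,
                    double ipOffset, double pbOffset)
    {
        double q;
        if (i0 && i1)
            q = (q0 + q1) / 2 + ipOffset;             // between two I-frames
        else if (i1)
            q = q0;                                   // lean on the non-I side
        else if (i0)
            q = q1;
        else
            q = (q0 * dt1 + q1 * dt0) / (dt0 + dt1);  // weight by POC distance
        return q + (curIsReferenced ? pbOffset / 2 : pbOffset);
    }
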
@@ -1512,7 +1651,7 @@
553
              * Then bias the quant up or down if total size so far was far from
554
              * the target.
555
              * Result: Depending on the value of rate_tolerance, there is a
556
-             * tradeoff between quality and bitrate precision. But at large
557
+             * trade-off between quality and bitrate precision. But at large
558
              * tolerances, the bit distribution approaches that of 2pass. */
559
 
560
             double overflow = 1;
561
@@ -1584,7 +1723,7 @@
562
             q = clipQscale(curFrame, rce, q);
563
             /*  clip qp to permissible range after vbv-lookahead estimation to avoid possible
564
              * mispredictions by initial frame size predictors, after each scenecut */
565
-            bool isFrameAfterScenecut = m_sliceType!= I_SLICE && m_curSlice->m_refPicList[0][0]->m_lowres.bScenecut;
566
+            bool isFrameAfterScenecut = m_sliceType!= I_SLICE && m_curSlice->m_refFrameList[0][0]->m_lowres.bScenecut;
567
             if (!m_2pass && m_isVbv && isFrameAfterScenecut)
568
                 q = x265_clip3(lqmin, lqmax, q);
569
         }
570
@@ -1714,7 +1853,7 @@
571
     }
572
 
573
     seiBP->m_initialCpbRemovalDelay = (uint32_t)(num * cpbState + denom) / denom;
574
-    seiBP->m_initialCpbRemovalDelayOffset = (uint32_t)(num * cpbSize + denom) / denom - seiBP->m_initialCpbRemovalDelay;
575
+    seiBP->m_initialCpbRemovalDelayOffset = (uint32_t)((num * cpbSize + denom) / denom - seiBP->m_initialCpbRemovalDelay);
576
 }
577
 
578
 void RateControl::updateVbvPlan(Encoder* enc)
579
@@ -1869,7 +2008,7 @@
580
     double qScale = x265_qp2qScale(qpVbv);
581
     FrameData& curEncData = *curFrame->m_encData;
582
     int picType = curEncData.m_slice->m_sliceType;
583
-    Frame* refFrame = curEncData.m_slice->m_refPicList[0][0];
584
+    Frame* refFrame = curEncData.m_slice->m_refFrameList[0][0];
585
 
586
     uint32_t maxRows = curEncData.m_slice->m_sps->numCuInHeight;
587
     uint32_t maxCols = curEncData.m_slice->m_sps->numCuInWidth;
588
@@ -1945,6 +2084,8 @@
589
 
590
 int RateControl::rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv)
591
 {
592
+    if (m_param->rc.bStatRead && m_param->rc.rateControlMode == X265_RC_CRF)
593
+        return 0;
594
     FrameData& curEncData = *curFrame->m_encData;
595
     double qScaleVbv = x265_qp2qScale(qpVbv);
596
     uint64_t rowSatdCost = curEncData.m_rowStat[row].diagSatd;
597
@@ -1957,9 +2098,9 @@
598
     }
599
     rowSatdCost >>= X265_DEPTH - 8;
600
     updatePredictor(rce->rowPred[0], qScaleVbv, (double)rowSatdCost, encodedBits);
601
-    if (curEncData.m_slice->m_sliceType == P_SLICE)
602
+    if (curEncData.m_slice->m_sliceType != I_SLICE)
603
     {
604
-        Frame* refFrame = curEncData.m_slice->m_refPicList[0][0];
605
+        Frame* refFrame = curEncData.m_slice->m_refFrameList[0][0];
606
         if (qpVbv < refFrame->m_encData->m_rowStat[row].diagQp)
607
         {
608
             uint64_t intraRowSatdCost = curEncData.m_rowStat[row].diagIntraSatd;
609
@@ -2137,7 +2278,8 @@
610
         return;
611
     const double range = 2;
612
     double old_coeff = p->coeff / p->count;
613
-    double new_coeff = bits * q / var;
614
+    double old_offset = p->offset / p->count;
615
+    double new_coeff = X265_MAX((bits * q - old_offset) / var, p->coeffMin );
616
     double new_coeff_clipped = x265_clip3(old_coeff / range, old_coeff * range, new_coeff);
617
     double new_offset = bits * q - new_coeff_clipped * var;
618
     if (new_offset >= 0)
619
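
updatePredictor() fits the linear model bits*q ≈ coeff*var + offset; the patch subtracts the learned offset before solving for the new coefficient and floors the result at the new coeffMin field, so a single degenerate sample cannot drive the coefficient toward zero. A sketch of the full update; the decayed-accumulation tail is not visible in the hunk and follows the x264-style predictor this code descends from:

    #include <algorithm>

    struct Predictor { double coeffMin, coeff, count, decay, offset; };

    // bits*q ~= coeff*var + offset; learn coeff/offset with exponential decay.
    void updatePredictor(Predictor& p, double q, double var, double bits)
    {
        if (var < 10)
            return;                                   // too little signal to learn from
        const double range = 2;
        double oldCoeff  = p.coeff / p.count;
        double oldOffset = p.offset / p.count;
        double newCoeff  = std::max((bits * q - oldOffset) / var, p.coeffMin);
        double clipped   = std::min(std::max(newCoeff, oldCoeff / range), oldCoeff * range);
        double newOffset = bits * q - clipped * var;
        if (newOffset < 0)                            // assumed tail, as in x264
            newOffset = 0;
        p.count  = p.count * p.decay + 1;
        p.coeff  = p.coeff * p.decay + clipped;
        p.offset = p.offset * p.decay + newOffset;
    }
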
@@ -2192,7 +2334,7 @@
620
 
621
     if (m_param->rc.aqMode || m_isVbv)
622
     {
623
-        if (m_isVbv)
624
+        if (m_isVbv && !(m_2pass && m_param->rc.rateControlMode == X265_RC_CRF))
625
         {
626
             /* determine avg QP decided by VBV rate control */
627
             for (uint32_t i = 0; i < slice->m_sps->numCuInHeight; i++)
628
@@ -2218,20 +2360,31 @@
629
     {
630
         if (m_param->rc.rateControlMode == X265_RC_ABR && !m_param->rc.bStatRead)
631
             checkAndResetABR(rce, true);
632
-
633
-        if (m_param->rc.rateControlMode == X265_RC_CRF)
634
+    }
635
+    if (m_param->rc.rateControlMode == X265_RC_CRF)
636
+    {
637
+        double crfVal, qpRef = curEncData.m_avgQpRc;
638
+        bool is2passCrfChange = false;
639
+        if (m_2pass)
640
         {
641
-            if (int(curEncData.m_avgQpRc + 0.5) == slice->m_sliceQp)
642
-                curEncData.m_rateFactor = m_rateFactorConstant;
643
-            else
644
+            if (abs(curEncData.m_avgQpRc - rce->qpPrev) > 0.1)
645
             {
646
-                /* If vbv changed the frame QP recalculate the rate-factor */
647
-                double baseCplx = m_ncu * (m_param->bframes ? 120 : 80);
648
-                double mbtree_offset = m_param->rc.cuTree ? (1.0 - m_param->rc.qCompress) * 13.5 : 0;
649
-                curEncData.m_rateFactor = pow(baseCplx, 1 - m_qCompress) /
650
-                    x265_qp2qScale(int(curEncData.m_avgQpRc + 0.5) + mbtree_offset);
651
+                qpRef = rce->qpPrev;
652
+                is2passCrfChange = true;
653
             }
654
         }
655
+        if (is2passCrfChange || abs(qpRef - rce->qpNoVbv) > 0.5)
656
+        {
657
+            double crfFactor = rce->qRceq /x265_qp2qScale(qpRef);
658
+            double baseCplx = m_ncu * (m_param->bframes ? 120 : 80);
659
+            double mbtree_offset = m_param->rc.cuTree ? (1.0 - m_param->rc.qCompress) * 13.5 : 0;
660
+            crfVal = x265_qScale2qp(pow(baseCplx, 1 - m_qCompress) / crfFactor) - mbtree_offset;
661
+        }
662
+        else
663
+            crfVal = rce->sliceType == I_SLICE ? m_param->rc.rfConstant - m_ipOffset : 
664
+            (rce->sliceType == B_SLICE ? m_param->rc.rfConstant + m_pbOffset : m_param->rc.rfConstant);
665
+
666
+        curEncData.m_rateFactor = crfVal;
667
     }
668
 
669
     if (m_isAbr && !m_isAbrReset)
670
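
When VBV or two-pass replay moves the delivered QP away from the planned one, the effective rate factor is recomputed by inverting qScale = baseCplx^(1 - qCompress) / crfFactor, using the same constants as the hunk (120 or 80 complexity units per CU, 13.5 cuTree offset). A worked version of that inversion, with the standard qp/qScale mapping assumed:

    #include <cmath>

    static double qp2qScale(double qp)     { return 0.85 * std::pow(2.0, (qp - 12.0) / 6.0); }
    static double qScale2qp(double qScale) { return 12.0 + 6.0 * std::log2(qScale / 0.85); }

    // Invert qScale = baseCplx^(1 - qCompress) / crfFactor to report the CRF
    // that the delivered QP (qpRef) effectively corresponds to.
    double effectiveCrf(double qpRef, double qRceq, int numCu, bool hasBframes,
                        bool cuTree, double qCompress)
    {
        double crfFactor    = qRceq / qp2qScale(qpRef);
        double baseCplx     = numCu * (hasBframes ? 120 : 80);
        double mbtreeOffset = cuTree ? (1.0 - qCompress) * 13.5 : 0;
        return qScale2qp(std::pow(baseCplx, 1 - qCompress) / crfFactor) - mbtreeOffset;
    }
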
@@ -2325,9 +2478,10 @@
671
         : rce->sliceType == P_SLICE ? 'P'
672
         : IS_REFERENCED(curFrame) ? 'B' : 'b';
673
     if (fprintf(m_statFileOut,
674
-                "in:%d out:%d type:%c q:%.2f q-aq:%.2f tex:%d mv:%d misc:%d icu:%.2f pcu:%.2f scu:%.2f ;\n",
675
+                "in:%d out:%d type:%c q:%.2f q-aq:%.2f q-noVbv:%.2f q-Rceq:%.2f tex:%d mv:%d misc:%d icu:%.2f pcu:%.2f scu:%.2f ;\n",
676
                 rce->poc, rce->encodeOrder,
677
                 cType, curEncData.m_avgQpRc, curEncData.m_avgQpAq,
678
+                rce->qpNoVbv, rce->qRceq,
679
                 curFrame->m_encData->m_frameStats.coeffBits,
680
                 curFrame->m_encData->m_frameStats.mvBits,
681
                 curFrame->m_encData->m_frameStats.miscBits,
682
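
The stats-file record grows two fields, q-noVbv and q-Rceq, which the pass-2 CRF path above needs in order to replay decisions. A minimal reader for the leading fields of the extended format, matching the fprintf format string in the hunk (the tex/mv/misc/icu/pcu/scu tail is ignored here):

    #include <cstdio>

    struct StatLine { int in, out; char type; double q, qAq, qNoVbv, qRceq; };

    // Parse the leading fields of one 1.9-format stats record.
    bool parseStatLine(const char* line, StatLine& s)
    {
        return std::sscanf(line,
            "in:%d out:%d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf",
            &s.in, &s.out, &s.type, &s.q, &s.qAq, &s.qNoVbv, &s.qRceq) == 7;
    }
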
x265_1.8.tar.gz/source/encoder/ratecontrol.h -> x265_1.9.tar.gz/source/encoder/ratecontrol.h Changed
52
 
1
@@ -48,6 +48,7 @@
2
 
3
 struct Predictor
4
 {
5
+    double coeffMin;
6
     double coeff;
7
     double count;
8
     double decay;
9
@@ -74,6 +75,7 @@
10
     double  qpaRc;
11
     double  qpAq;
12
     double  qRceq;
13
+    double  qpPrev;
14
     double  frameSizePlanned;  /* frame size decided by RateControl before encoding the frame */
15
     double  bufferRate;
16
     double  movingAvgSum;
17
@@ -167,6 +169,8 @@
18
     int64_t m_satdCostWindow[50];
19
     int64_t m_encodedBitsWindow[50];
20
     int     m_sliderPos;
21
+    int64_t m_lastRemovedSatdCost;
22
+    double  m_movingAvgSum;
23
 
24
     /* To detect a pattern of low detailed static frames in single pass ABR using satdcosts */
25
     int64_t m_lastBsliceSatdCost;
26
@@ -205,8 +209,8 @@
27
     double  m_lastAccumPNorm;
28
     double  m_expectedBitsSum;   /* sum of qscale2bits after rceq, ratefactor, and overflow, only includes finished frames */
29
     int64_t m_predictedBits;
30
+    int     *m_encOrder;
31
     RateControlEntry* m_rce2Pass;
32
-
33
     struct
34
     {
35
         uint16_t *qpBuffer[2]; /* Global buffers for converting MB-tree quantizer data. */
36
@@ -258,11 +262,12 @@
37
     void   checkAndResetABR(RateControlEntry* rce, bool isFrameDone);
38
     double predictRowsSizeSum(Frame* pic, RateControlEntry* rce, double qpm, int32_t& encodedBits);
39
     bool   initPass2();
40
+    bool   analyseABR2Pass(int startPoc, int endPoc, uint64_t allAvailableBits);
41
     void   initFramePredictors();
42
     double getDiffLimitedQScale(RateControlEntry *rce, double q);
43
-    double countExpectedBits();
44
-    bool   vbv2Pass(uint64_t allAvailableBits);
45
-    bool   findUnderflow(double *fills, int *t0, int *t1, int over);
46
+    double countExpectedBits(int startPos, int framesCount);
47
+    bool   vbv2Pass(uint64_t allAvailableBits, int frameCount, int startPos);
48
+    bool   findUnderflow(double *fills, int *t0, int *t1, int over, int framesCount);
49
     bool   fixUnderflow(int t0, int t1, double adjustment, double qscaleMin, double qscaleMax);
50
 };
51
 }
52
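
The new m_encOrder table exists because m_rce2Pass is now indexed by encode order rather than by POC, as the cuTreeReadFor2Pass() hunk shows (index = m_encOrder[frame->m_poc]). A sketch of building such a lookup while records are read in encode order, assuming the POCs form a 0..n-1 permutation as in a complete stats file:

    #include <vector>

    // Build a poc -> encode-order lookup from records seen in encode order.
    std::vector<int> buildEncOrder(const std::vector<int>& pocInEncodeOrder)
    {
        std::vector<int> encOrder(pocInEncodeOrder.size());
        for (int idx = 0; idx < (int)pocInEncodeOrder.size(); idx++)
            encOrder[pocInEncodeOrder[idx]] = idx;   // m_encOrder[poc] = index
        return encOrder;
    }
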
x265_1.8.tar.gz/source/encoder/rdcost.h -> x265_1.9.tar.gz/source/encoder/rdcost.h Changed
99
 
1
@@ -2,6 +2,7 @@
2
 * Copyright (C) 2013 x265 project
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -73,13 +74,18 @@
10
             qpCr = x265_clip3(QP_MIN, QP_MAX_SPEC, qp + slice.m_pps->chromaQpOffset[1]);
11
         }
12
 
13
-        int chroma_offset_idx = X265_MIN(qp - qpCb + 12, MAX_CHROMA_LAMBDA_OFFSET);
14
-        uint16_t lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
15
-        m_chromaDistWeight[0] = lambdaOffset;
16
+        if (slice.m_sps->chromaFormatIdc == X265_CSP_I444)
17
+        {
18
+            int chroma_offset_idx = X265_MIN(qp - qpCb + 12, MAX_CHROMA_LAMBDA_OFFSET);
19
+            uint16_t lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
20
+            m_chromaDistWeight[0] = lambdaOffset;
21
 
22
-        chroma_offset_idx = X265_MIN(qp - qpCr + 12, MAX_CHROMA_LAMBDA_OFFSET);
23
-        lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
24
-        m_chromaDistWeight[1] = lambdaOffset;
25
+            chroma_offset_idx = X265_MIN(qp - qpCr + 12, MAX_CHROMA_LAMBDA_OFFSET);
26
+            lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
27
+            m_chromaDistWeight[1] = lambdaOffset;
28
+        }
29
+        else
30
+            m_chromaDistWeight[0] = m_chromaDistWeight[1] = 256;
31
     }
32
 
33
     void setLambda(double lambda2, double lambda)
34
@@ -88,9 +94,9 @@
35
         m_lambda = (uint64_t)floor(256.0 * lambda);
36
     }
37
 
38
-    inline uint64_t calcRdCost(sse_ret_t distortion, uint32_t bits) const
39
+    inline uint64_t calcRdCost(sse_t distortion, uint32_t bits) const
40
     {
41
-#if X265_DEPTH <= 10
42
+#if X265_DEPTH < 10
43
         X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
44
                    "calcRdCost wrap detected dist: %u, bits %u, lambda: " X265_LL "\n",
45
                    distortion, bits, m_lambda2);
46
@@ -108,15 +114,18 @@
47
         return primitives.cu[size].psy_cost_pp(source, sstride, recon, rstride);
48
     }
49
 
50
-    /* return the difference in energy between the source block and the recon block */
51
-    inline int psyCost(int size, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride) const
52
-    {
53
-        return primitives.cu[size].psy_cost_ss(source, sstride, recon, rstride);
54
-    }
55
-
56
     /* return the RD cost of this prediction, including the effect of psy-rd */
57
-    inline uint64_t calcPsyRdCost(sse_ret_t distortion, uint32_t bits, uint32_t psycost) const
58
+    inline uint64_t calcPsyRdCost(sse_t distortion, uint32_t bits, uint32_t psycost) const
59
     {
60
+#if X265_DEPTH < 10
61
+        X265_CHECK((bits <= (UINT64_MAX / m_lambda2)) && (psycost <= UINT64_MAX / (m_lambda * m_psyRd)),
62
+                   "calcPsyRdCost wrap detected dist: %u, bits: %u, lambda: " X265_LL ", lambda2: " X265_LL "\n",
63
+                   distortion, bits, m_lambda, m_lambda2);
64
+#else
65
+        X265_CHECK((bits <= (UINT64_MAX / m_lambda2)) && (psycost <= UINT64_MAX / (m_lambda * m_psyRd)),
66
+                   "calcPsyRdCost wrap detected dist: " X265_LL ", bits: %u, lambda: " X265_LL ", lambda2: " X265_LL "\n",
67
+                   distortion, bits, m_lambda, m_lambda2);
68
+#endif
69
         return distortion + ((m_lambda * m_psyRd * psycost) >> 24) + ((bits * m_lambda2) >> 8);
70
     }
71
 
72
@@ -127,9 +136,9 @@
73
         return sadCost + ((bits * m_lambda + 128) >> 8);
74
     }
75
 
76
-    inline sse_ret_t scaleChromaDist(uint32_t plane, sse_ret_t dist) const
77
+    inline sse_t scaleChromaDist(uint32_t plane, sse_t dist) const
78
     {
79
-#if X265_DEPTH <= 10
80
+#if X265_DEPTH < 10
81
         X265_CHECK(dist <= (UINT64_MAX - 128) / m_chromaDistWeight[plane - 1],
82
                    "scaleChromaDist wrap detected dist: %u, lambda: %u\n",
83
                    dist, m_chromaDistWeight[plane - 1]);
84
@@ -138,11 +147,13 @@
85
                    "scaleChromaDist wrap detected dist: " X265_LL " lambda: %u\n",
86
                    dist, m_chromaDistWeight[plane - 1]);
87
 #endif
88
-        return (sse_ret_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
89
+        return (sse_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
90
     }
91
 
92
     inline uint32_t getCost(uint32_t bits) const
93
     {
94
+        X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda,
95
+                   "getCost wrap detected bits: %u, lambda: " X265_LL "\n", bits, m_lambda);
96
         return (uint32_t)((bits * m_lambda + 128) >> 8);
97
     }
98
 };
99
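
All the RdCost formulas above are Q8 fixed point: setLambda() stores floor(256·λ) and floor(256·λ²), so costs come out as dist + (bits·λ² >> 8), and the added X265_CHECKs assert that the 64-bit products cannot wrap. The arithmetic in isolation:

    #include <cmath>
    #include <cstdint>

    // setLambda() stores floor(256 * lambda^2): Q8 fixed point.
    uint64_t makeLambda2(double lambda2) { return (uint64_t)std::floor(256.0 * lambda2); }

    // dist + bits*lambda2/256, all in integers; callers must keep
    // bits * lambda2 below UINT64_MAX (what the added checks assert).
    uint64_t calcRdCost(uint64_t distortion, uint32_t bits, uint64_t lambda2)
    {
        return distortion + ((bits * lambda2) >> 8);
    }
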
x265_1.8.tar.gz/source/encoder/reference.cpp -> x265_1.9.tar.gz/source/encoder/reference.cpp Changed
10
 
1
@@ -68,7 +68,7 @@
2
         intptr_t stride = reconPic->m_stride;
3
         int cuHeight = g_maxCUSize;
4
 
5
-        for (int c = 0; c < numInterpPlanes; c++)
6
+        for (int c = 0; c < (p.internalCsp != X265_CSP_I400 ? numInterpPlanes : 1); c++)
7
         {
8
             if (c == 1)
9
             {
10
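
This one-line change is an instance of a convention applied throughout the 1.9 diff: when the internal colorspace is 4:0:0, only the luma plane is processed. Stated as a helper (the enum value matches x265.h, where X265_CSP_I400 is 0):

    enum { X265_CSP_I400 = 0 };   // value as declared in x265.h

    // Plane count convention used throughout this patch:
    // luma only for 4:0:0, otherwise Y + Cb + Cr.
    static inline int numPlanes(int csp)
    {
        return (csp != X265_CSP_I400) ? 3 : 1;
    }
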
x265_1.8.tar.gz/source/encoder/sao.cpp -> x265_1.9.tar.gz/source/encoder/sao.cpp Changed
1293
 
1
@@ -73,9 +73,6 @@
2
 
3
 SAO::SAO()
4
 {
5
-    m_count = NULL;
6
-    m_offset = NULL;
7
-    m_offsetOrg = NULL;
8
     m_countPreDblk = NULL;
9
     m_offsetOrgPreDblk = NULL;
10
     m_refDepth = 0;
11
@@ -84,28 +81,22 @@
12
     m_param = NULL;
13
     m_clipTable = NULL;
14
     m_clipTableBase = NULL;
15
-    m_tmpU1[0] = NULL;
16
-    m_tmpU1[1] = NULL;
17
-    m_tmpU1[2] = NULL;
18
-    m_tmpU2[0] = NULL;
19
-    m_tmpU2[1] = NULL;
20
-    m_tmpU2[2] = NULL;
21
-    m_tmpL1 = NULL;
22
-    m_tmpL2 = NULL;
23
-
24
-    m_depthSaoRate[0][0] = 0;
25
-    m_depthSaoRate[0][1] = 0;
26
-    m_depthSaoRate[0][2] = 0;
27
-    m_depthSaoRate[0][3] = 0;
28
-    m_depthSaoRate[1][0] = 0;
29
-    m_depthSaoRate[1][1] = 0;
30
-    m_depthSaoRate[1][2] = 0;
31
-    m_depthSaoRate[1][3] = 0;
32
+    m_tmpU[0] = NULL;
33
+    m_tmpU[1] = NULL;
34
+    m_tmpU[2] = NULL;
35
+    m_tmpL1[0] = NULL;
36
+    m_tmpL1[1] = NULL;
37
+    m_tmpL1[2] = NULL;
38
+    m_tmpL2[0] = NULL;
39
+    m_tmpL2[1] = NULL;
40
+    m_tmpL2[2] = NULL;
41
+    m_depthSaoRate = NULL;
42
 }
43
 
44
-bool SAO::create(x265_param* param)
45
+bool SAO::create(x265_param* param, int initCommon)
46
 {
47
     m_param = param;
48
+    m_chromaFormat = param->internalCsp;
49
     m_hChromaShift = CHROMA_H_SHIFT(param->internalCsp);
50
     m_vChromaShift = CHROMA_V_SHIFT(param->internalCsp);
51
 
52
@@ -116,37 +107,56 @@
53
     const pixel rangeExt = maxY >> 1;
54
     int numCtu = m_numCuInWidth * m_numCuInHeight;
55
 
56
-    CHECKED_MALLOC(m_clipTableBase,  pixel, maxY + 2 * rangeExt);
57
-
58
-    CHECKED_MALLOC(m_tmpL1, pixel, g_maxCUSize + 1);
59
-    CHECKED_MALLOC(m_tmpL2, pixel, g_maxCUSize + 1);
60
-
61
-    for (int i = 0; i < 3; i++)
62
+    for (int i = 0; i < (param->internalCsp != X265_CSP_I400 ? 3 : 1); i++)
63
     {
64
+        CHECKED_MALLOC(m_tmpL1[i], pixel, g_maxCUSize + 1);
65
+        CHECKED_MALLOC(m_tmpL2[i], pixel, g_maxCUSize + 1);
66
+
67
         // SAO asm code will read 1 pixel before and after, so pad by 2
68
-        CHECKED_MALLOC(m_tmpU1[i], pixel, m_param->sourceWidth + 2);
69
-        m_tmpU1[i] += 1;
70
-        CHECKED_MALLOC(m_tmpU2[i], pixel, m_param->sourceWidth + 2);
71
-        m_tmpU2[i] += 1;
72
+        // NOTE: m_param->sourceWidth + 2 would be enough, but to avoid a boundary check in copySaoAboveRef() we allocate up to 63 extra bytes here
73
+        CHECKED_MALLOC(m_tmpU[i], pixel, m_numCuInWidth * g_maxCUSize + 2 + 32);
74
+        m_tmpU[i] += 1;
75
     }
76
 
77
-    CHECKED_MALLOC(m_count, PerClass, NUM_PLANE);
78
-    CHECKED_MALLOC(m_offset, PerClass, NUM_PLANE);
79
-    CHECKED_MALLOC(m_offsetOrg, PerClass, NUM_PLANE);
80
-
81
-    CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
82
-    CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
83
-
84
-    m_clipTable = &(m_clipTableBase[rangeExt]);
85
-
86
-    for (int i = 0; i < rangeExt; i++)
87
-        m_clipTableBase[i] = 0;
88
+    if (initCommon)
89
+    {
90
+        CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
91
+        CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
92
+        CHECKED_MALLOC(m_depthSaoRate, double, 2 * SAO_DEPTHRATE_SIZE);
93
+
94
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 0] = 0;
95
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 1] = 0;
96
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 2] = 0;
97
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 3] = 0;
98
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 0] = 0;
99
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 1] = 0;
100
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 2] = 0;
101
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 3] = 0;
102
+
103
+        CHECKED_MALLOC(m_clipTableBase,  pixel, maxY + 2 * rangeExt);
104
+        m_clipTable = &(m_clipTableBase[rangeExt]);
105
+
106
+        // Share with fast clip lookup table
107
+        if (initCommon)
108
+        {
109
+            for (int i = 0; i < rangeExt; i++)
110
+                m_clipTableBase[i] = 0;
111
 
112
-    for (int i = 0; i < maxY; i++)
113
-        m_clipTable[i] = (pixel)i;
114
+            for (int i = 0; i < maxY; i++)
115
+                m_clipTable[i] = (pixel)i;
116
 
117
-    for (int i = maxY; i < maxY + rangeExt; i++)
118
-        m_clipTable[i] = maxY;
119
+            for (int i = maxY; i < maxY + rangeExt; i++)
120
+                m_clipTable[i] = maxY;
121
+        }
122
+    }
123
+    else
124
+    {
125
+        // these shared pointers must be initialized outside of this function
126
+        m_countPreDblk = NULL;
127
+        m_offsetOrgPreDblk = NULL;
128
+        m_clipTableBase = NULL;
129
+        m_clipTable = NULL;
130
+    }
131
 
132
     return true;
133
 
134
@@ -154,34 +164,61 @@
135
     return false;
136
 }
137
 
138
-void SAO::destroy()
139
+void SAO::createFromRootNode(SAO* root)
140
 {
141
-    X265_FREE(m_clipTableBase);
142
-
143
-    X265_FREE(m_tmpL1);
144
-    X265_FREE(m_tmpL2);
145
+    X265_CHECK(m_countPreDblk == NULL, "duplicate initialize on m_countPreDblk");
146
+    X265_CHECK(m_offsetOrgPreDblk == NULL, "duplicate initialize on m_offsetOrgPreDblk");
147
+    X265_CHECK(m_depthSaoRate == NULL, "duplicate initialize on m_depthSaoRate");
148
+    X265_CHECK(m_clipTableBase == NULL, "duplicate initialize on m_clipTableBase");
149
+    X265_CHECK(m_clipTable == NULL, "duplicate initialize on m_clipTable");
150
+
151
+    m_countPreDblk = root->m_countPreDblk;
152
+    m_offsetOrgPreDblk = root->m_offsetOrgPreDblk;
153
+    m_depthSaoRate = root->m_depthSaoRate;
154
+    m_clipTableBase = root->m_clipTableBase; // Unnecessary
155
+    m_clipTable = root->m_clipTable;
156
+}
157
 
158
+void SAO::destroy(int destoryCommon)
159
+{
160
     for (int i = 0; i < 3; i++)
161
     {
162
-        if (m_tmpU1[i]) X265_FREE(m_tmpU1[i] - 1);
163
-        if (m_tmpU2[i]) X265_FREE(m_tmpU2[i] - 1);
164
+        if (m_tmpL1[i])
165
+        {
166
+            X265_FREE(m_tmpL1[i]);
167
+            m_tmpL1[i] = NULL;
168
+        }
169
+
170
+        if (m_tmpL2[i])
171
+        {
172
+            X265_FREE(m_tmpL2[i]);
173
+            m_tmpL2[i] = NULL;
174
+        }
175
+
176
+        if (m_tmpU[i])
177
+        {
178
+            X265_FREE(m_tmpU[i] - 1);
179
+            m_tmpU[i] = NULL;
180
+        }
181
     }
182
 
183
-    X265_FREE(m_count);
184
-    X265_FREE(m_offset);
185
-    X265_FREE(m_offsetOrg);
186
-    X265_FREE(m_countPreDblk);
187
-    X265_FREE(m_offsetOrgPreDblk);
188
+    if (destoryCommon)
189
+    {
190
+        X265_FREE_ZERO(m_countPreDblk);
191
+        X265_FREE_ZERO(m_offsetOrgPreDblk);
192
+        X265_FREE_ZERO(m_depthSaoRate);
193
+        X265_FREE_ZERO(m_clipTableBase);
194
+    }
195
 }
196
 
197
 /* allocate memory for SAO parameters */
198
 void SAO::allocSaoParam(SAOParam* saoParam) const
199
 {
200
+    int planes = (m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
201
     saoParam->numCuInWidth  = m_numCuInWidth;
202
 
203
-    saoParam->ctuParam[0] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth];
204
-    saoParam->ctuParam[1] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth];
205
-    saoParam->ctuParam[2] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth];
206
+    for (int i = 0; i < planes; i++)
207
+        saoParam->ctuParam[i] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth];
208
 }
209
 
210
 void SAO::startSlice(Frame* frame, Entropy& initState, int qp)
211
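
create() now takes an initCommon flag: the root SAO instance allocates the tables shared by every worker (m_countPreDblk, m_offsetOrgPreDblk, m_depthSaoRate, the clip table), while worker instances borrow those pointers via createFromRootNode() and only free their per-thread buffers. A schematic of that ownership split, reduced to one shared table; names and the size are illustrative only (SAO_DEPTHRATE_SIZE assumed to be 4, matching the depths initialized above):

    // Ownership split between the root SAO instance and its workers.
    struct SaoSketch
    {
        double* depthSaoRate = nullptr;   // shared table, owned by the root
        bool    ownsCommon   = false;

        bool create(bool initCommon)
        {
            if (initCommon)
            {
                depthSaoRate = new double[2 * 4]();   // zero-initialized
                ownsCommon = true;
            }
            return true;
        }
        void createFromRootNode(const SaoSketch& root)
        {
            depthSaoRate = root.depthSaoRate;         // borrow, never free
        }
        void destroy()
        {
            if (ownsCommon)
                delete[] depthSaoRate;
            depthSaoRate = nullptr;
        }
    };
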
@@ -209,8 +246,6 @@
212
         break;
213
     }
214
 
215
-    resetStats();
216
-
217
     m_entropyCoder.load(initState);
218
     m_rdContexts.next.load(initState);
219
     m_rdContexts.cur.load(initState);
220
@@ -224,7 +259,7 @@
221
     }
222
 
223
     saoParam->bSaoFlag[0] = true;
224
-    saoParam->bSaoFlag[1] = true;
225
+    saoParam->bSaoFlag[1] = m_param->internalCsp != X265_CSP_I400;
226
 
227
     m_numNoSao[0] = 0; // Luma
228
     m_numNoSao[1] = 0; // Chroma
229
@@ -232,9 +267,9 @@
230
     // NOTE: Allow SAO automatic turn-off only when frame parallelism is disabled.
231
     if (m_param->frameNumThreads == 1)
232
     {
233
-        if (m_refDepth > 0 && m_depthSaoRate[0][m_refDepth - 1] > SAO_ENCODING_RATE)
234
+        if (m_refDepth > 0 && m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth - 1] > SAO_ENCODING_RATE)
235
             saoParam->bSaoFlag[0] = false;
236
-        if (m_refDepth > 0 && m_depthSaoRate[1][m_refDepth - 1] > SAO_ENCODING_RATE_CHROMA)
237
+        if (m_refDepth > 0 && m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth - 1] > SAO_ENCODING_RATE_CHROMA)
238
             saoParam->bSaoFlag[1] = false;
239
     }
240
 }
241
@@ -243,12 +278,13 @@
242
 void SAO::processSaoCu(int addr, int typeIdx, int plane)
243
 {
244
     int x, y;
245
-    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
246
-    pixel* rec = m_frame->m_reconPic->getPlaneAddr(plane, addr);
247
-    intptr_t stride = plane ? m_frame->m_reconPic->m_strideC : m_frame->m_reconPic->m_stride;
248
+    PicYuv* reconPic = m_frame->m_reconPic;
249
+    pixel* rec = reconPic->getPlaneAddr(plane, addr);
250
+    intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
251
     uint32_t picWidth  = m_param->sourceWidth;
252
     uint32_t picHeight = m_param->sourceHeight;
253
-    int ctuWidth  = g_maxCUSize;
254
+    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
255
+    int ctuWidth = g_maxCUSize;
256
     int ctuHeight = g_maxCUSize;
257
     uint32_t lpelx = cu->m_cuPelX;
258
     uint32_t tpely = cu->m_cuPelY;
259
@@ -278,17 +314,10 @@
260
 
261
     memset(_upBuff1 + MAX_CU_SIZE, 0, 2 * sizeof(int8_t)); /* avoid valgrind uninit warnings */
262
 
263
-    {
264
-        const pixel* recR = &rec[ctuWidth - 1];
265
-        for (int i = 0; i < ctuHeight + 1; i++)
266
-        {
267
-            m_tmpL2[i] = *recR;
268
-            recR += stride;
269
-        }
270
+    tmpL = m_tmpL1[plane];
271
+    tmpU = &(m_tmpU[plane][lpelx]);
272
 
273
-        tmpL = m_tmpL1;
274
-        tmpU = &(m_tmpU1[plane][lpelx]);
275
-    }
276
+    int8_t* offsetEo = m_offsetEo[plane];
277
 
278
     switch (typeIdx)
279
     {
280
@@ -308,7 +337,7 @@
281
                     int edgeType = signRight + signLeft + 2;
282
                     signLeft = -signRight;
283
 
284
-                    rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
285
+                    rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
286
                 }
287
 
288
                 rec += stride;
289
@@ -333,7 +362,7 @@
290
                     row1LastPxl = rec[stride + ctuWidth - 1];
291
                 }
292
 
293
-                primitives.saoCuOrgE0(rec, m_offsetEo, ctuWidth, signLeft1, stride);
294
+                primitives.saoCuOrgE0(rec, offsetEo, ctuWidth, signLeft1, stride);
295
 
296
                 if (!lpelx)
297
                 {
298
@@ -372,7 +401,7 @@
299
                     int edgeType = signDown + upBuff1[x] + 2;
300
                     upBuff1[x] = -signDown;
301
 
302
-                    rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
303
+                    rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
304
                 }
305
 
306
                 rec += stride;
307
@@ -385,11 +414,11 @@
308
             int diff = (endY - startY) % 2;
309
             for (y = startY; y < endY - diff; y += 2)
310
             {
311
-                primitives.saoCuOrgE1_2Rows(rec, upBuff1, m_offsetEo, stride, ctuWidth);
312
+                primitives.saoCuOrgE1_2Rows(rec, upBuff1, offsetEo, stride, ctuWidth);
313
                 rec += 2 * stride;
314
             }
315
             if (diff & 1)
316
-                primitives.saoCuOrgE1(rec, upBuff1, m_offsetEo, stride, ctuWidth);
317
+                primitives.saoCuOrgE1(rec, upBuff1, offsetEo, stride, ctuWidth);
318
         }
319
 
320
         break;
321
@@ -439,7 +468,7 @@
322
                      int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
323
                      int edgeType = signDown + upBuff1[x] + 2;
324
                      upBufft[x + 1] = -signDown;
325
-                     rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
326
+                     rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
327
                  }
328
 
329
                  std::swap(upBuff1, upBufft);
330
@@ -453,7 +482,7 @@
331
             {
332
                 int8_t iSignDown2 = signOf(rec[stride + startX] - tmpL[y]);
333
 
334
-                primitives.saoCuOrgE2[endX > 16](rec + startX, upBufft + startX, upBuff1 + startX, m_offsetEo, endX - startX, stride);
335
+                primitives.saoCuOrgE2[endX > 16](rec + startX, upBufft + startX, upBuff1 + startX, offsetEo, endX - startX, stride);
336
 
337
                 upBufft[startX] = iSignDown2;
338
 
339
@@ -485,14 +514,14 @@
340
                 int8_t signDown = signOf(rec[x] - tmpL[y + 1]);
341
                 int edgeType = signDown + upBuff1[x] + 2;
342
                 upBuff1[x - 1] = -signDown;
343
-                rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
344
+                rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
345
 
346
                 for (x = startX + 1; x < endX; x++)
347
                 {
348
                     signDown = signOf(rec[x] - rec[x + stride - 1]);
349
                     edgeType = signDown + upBuff1[x] + 2;
350
                     upBuff1[x - 1] = -signDown;
351
-                    rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
352
+                    rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
353
                 }
354
 
355
                 upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
356
@@ -522,9 +551,9 @@
357
                 int8_t signDown = signOf(rec[x] - tmpL[y + 1]);
358
                 int edgeType = signDown + upBuff1[x] + 2;
359
                 upBuff1[x - 1] = -signDown;
360
-                rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
361
+                rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
362
 
363
-                primitives.saoCuOrgE3[endX > 16](rec, upBuff1, m_offsetEo, stride - 1, startX, endX);
364
+                primitives.saoCuOrgE3[endX > 16](rec, upBuff1, offsetEo, stride - 1, startX, endX);
365
 
366
                 upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
367
 
368
@@ -536,7 +565,7 @@
369
     }
370
     case SAO_BO:
371
     {
372
-        const int8_t* offsetBo = m_offsetBo;
373
+        const int8_t* offsetBo = m_offsetBo[plane];
374
 
375
         if (ctuWidth & 15)
376
         {
377
@@ -564,98 +593,169 @@
378
     }
379
     default: break;
380
     }
381
-
382
-//   if (iSaoType!=SAO_BO_0 || iSaoType!=SAO_BO_1)
383
-    std::swap(m_tmpL1, m_tmpL2);
384
 }
385
 
386
-/* Process SAO all units */
387
-void SAO::processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane)
388
+/* Process SAO unit */
389
+void SAO::processSaoUnitCuLuma(SaoCtuParam* ctuParam, int idxY, int idxX)
390
 {
391
-    intptr_t stride = plane ? m_frame->m_reconPic->m_strideC : m_frame->m_reconPic->m_stride;
392
-    uint32_t picWidth  = m_param->sourceWidth;
393
+    PicYuv* reconPic = m_frame->m_reconPic;
394
+    intptr_t stride = reconPic->m_stride;
395
     int ctuWidth  = g_maxCUSize;
396
     int ctuHeight = g_maxCUSize;
397
-    if (plane)
398
+
399
+    int addr = idxY * m_numCuInWidth + idxX;
400
+    pixel* rec = reconPic->getLumaAddr(addr);
401
+
402
+    if (idxX == 0)
403
     {
404
-        picWidth  >>= m_hChromaShift;
405
-        ctuWidth  >>= m_hChromaShift;
406
-        ctuHeight >>= m_vChromaShift;
407
+        for (int i = 0; i < ctuHeight + 1; i++)
408
+        {
409
+            m_tmpL1[0][i] = rec[0];
410
+            rec += stride;
411
+        }
412
     }
413
 
414
-    if (!idxY)
415
+    bool mergeLeftFlag = (ctuParam[addr].mergeMode == SAO_MERGE_LEFT);
416
+    int typeIdx = ctuParam[addr].typeIdx;
417
+
418
+    if (idxX != (m_numCuInWidth - 1))
419
     {
420
-        pixel* rec = m_frame->m_reconPic->m_picOrg[plane];
421
-        memcpy(m_tmpU1[plane], rec, sizeof(pixel) * picWidth);
422
+        rec = reconPic->getLumaAddr(addr);
423
+        for (int i = 0; i < ctuHeight + 1; i++)
424
+        {
425
+            m_tmpL2[0][i] = rec[ctuWidth - 1];
426
+            rec += stride;
427
+        }
428
     }
429
 
430
-    int addr = idxY * m_numCuInWidth;
431
-    pixel* rec = plane ? m_frame->m_reconPic->getChromaAddr(plane, addr) : m_frame->m_reconPic->getLumaAddr(addr);
432
-
433
-    for (int i = 0; i < ctuHeight + 1; i++)
434
+    if (typeIdx >= 0)
435
     {
436
-        m_tmpL1[i] = rec[0];
437
-        rec += stride;
438
+        if (!mergeLeftFlag)
439
+        {
440
+            if (typeIdx == SAO_BO)
441
+            {
442
+                memset(m_offsetBo[0], 0, sizeof(m_offsetBo[0]));
443
+
444
+                for (int i = 0; i < SAO_NUM_OFFSET; i++)
445
+                    m_offsetBo[0][((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC);
446
+            }
447
+            else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
448
+            {
449
+                int offset[NUM_EDGETYPE];
450
+                offset[0] = 0;
451
+                for (int i = 0; i < SAO_NUM_OFFSET; i++)
452
+                    offset[i + 1] = ctuParam[addr].offset[i] << SAO_BIT_INC;
453
+
454
+                for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++)
455
+                    m_offsetEo[0][edgeType] = (int8_t)offset[s_eoTable[edgeType]];
456
+            }
457
+        }
458
+        processSaoCu(addr, typeIdx, 0);
459
     }
460
+    std::swap(m_tmpL1[0], m_tmpL2[0]);
461
+}
462
+
463
+/* Process SAO unit (Chroma only) */
464
+void SAO::processSaoUnitCuChroma(SaoCtuParam* ctuParam[3], int idxY, int idxX)
465
+{
466
+    PicYuv* reconPic = m_frame->m_reconPic;
467
+    intptr_t stride = reconPic->m_strideC;
468
+    int ctuWidth  = g_maxCUSize;
469
+    int ctuHeight = g_maxCUSize;
470
 
471
-    rec -= (stride << 1);
472
+    {
473
+        ctuWidth  >>= m_hChromaShift;
474
+        ctuHeight >>= m_vChromaShift;
475
+    }
476
 
477
-    memcpy(m_tmpU2[plane], rec, sizeof(pixel) * picWidth);
478
+    int addr = idxY * m_numCuInWidth + idxX;
479
+    pixel* recCb = reconPic->getCbAddr(addr);
480
+    pixel* recCr = reconPic->getCrAddr(addr);
481
 
482
-    for (int idxX = 0; idxX < m_numCuInWidth; idxX++)
483
+    if (idxX == 0)
484
     {
485
-        addr = idxY * m_numCuInWidth + idxX;
486
+        for (int i = 0; i < ctuHeight + 1; i++)
487
+        {
488
+            m_tmpL1[1][i] = recCb[0];
489
+            m_tmpL1[2][i] = recCr[0];
490
+            recCb += stride;
491
+            recCr += stride;
492
+        }
493
+    }
494
 
495
-        bool mergeLeftFlag = ctuParam[addr].mergeMode == SAO_MERGE_LEFT;
496
-        int typeIdx = ctuParam[addr].typeIdx;
497
+    bool mergeLeftFlagCb = (ctuParam[1][addr].mergeMode == SAO_MERGE_LEFT);
498
+    int typeIdxCb = ctuParam[1][addr].typeIdx;
499
+
500
+    bool mergeLeftFlagCr = (ctuParam[2][addr].mergeMode == SAO_MERGE_LEFT);
501
+    int typeIdxCr = ctuParam[2][addr].typeIdx;
502
+
503
+    if (idxX != (m_numCuInWidth - 1))
504
+    {
505
+        recCb = reconPic->getCbAddr(addr);
506
+        recCr = reconPic->getCrAddr(addr);
507
+        for (int i = 0; i < ctuHeight + 1; i++)
508
+        {
509
+            m_tmpL2[1][i] = recCb[ctuWidth - 1];
510
+            m_tmpL2[2][i] = recCr[ctuWidth - 1];
511
+            recCb += stride;
512
+            recCr += stride;
513
+        }
514
+    }
515
 
516
-        if (typeIdx >= 0)
517
+    // Process U
518
+    if (typeIdxCb >= 0)
519
+    {
520
+        if (!mergeLeftFlagCb)
521
         {
522
-            if (!mergeLeftFlag)
523
+            if (typeIdxCb == SAO_BO)
524
             {
525
-                if (typeIdx == SAO_BO)
526
-                {
527
-                    memset(m_offsetBo, 0, sizeof(m_offsetBo));
528
+                memset(m_offsetBo[1], 0, sizeof(m_offsetBo[0]));
529
 
530
-                    for (int i = 0; i < SAO_NUM_OFFSET; i++)
531
-                        m_offsetBo[((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC);
532
-                }
533
-                else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
534
-                {
535
-                    int offset[NUM_EDGETYPE];
536
-                    offset[0] = 0;
537
-                    for (int i = 0; i < SAO_NUM_OFFSET; i++)
538
-                        offset[i + 1] = ctuParam[addr].offset[i] << SAO_BIT_INC;
539
+                for (int i = 0; i < SAO_NUM_OFFSET; i++)
540
+                    m_offsetBo[1][((ctuParam[1][addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[1][addr].offset[i] << SAO_BIT_INC);
541
+            }
542
+            else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
543
+            {
544
+                int offset[NUM_EDGETYPE];
545
+                offset[0] = 0;
546
+                for (int i = 0; i < SAO_NUM_OFFSET; i++)
547
+                    offset[i + 1] = ctuParam[1][addr].offset[i] << SAO_BIT_INC;
548
 
549
-                    for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++)
550
-                        m_offsetEo[edgeType] = (int8_t)offset[s_eoTable[edgeType]];
551
-                }
552
+                for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++)
553
+                    m_offsetEo[1][edgeType] = (int8_t)offset[s_eoTable[edgeType]];
554
             }
555
-            processSaoCu(addr, typeIdx, plane);
556
         }
557
-        else if (idxX != (m_numCuInWidth - 1))
558
+        processSaoCu(addr, typeIdxCb, 1);
559
+    }
560
+
561
+    // Process V
562
+    if (typeIdxCr >= 0)
563
+    {
564
+        if (!mergeLeftFlagCr)
565
         {
566
-            rec = plane ? m_frame->m_reconPic->getChromaAddr(plane, addr) : m_frame->m_reconPic->getLumaAddr(addr);
567
+            if (typeIdxCr == SAO_BO)
568
+            {
569
+                memset(m_offsetBo[2], 0, sizeof(m_offsetBo[0]));
570
 
571
-            for (int i = 0; i < ctuHeight + 1; i++)
572
+                for (int i = 0; i < SAO_NUM_OFFSET; i++)
573
+                    m_offsetBo[2][((ctuParam[2][addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[2][addr].offset[i] << SAO_BIT_INC);
574
+            }
575
+            else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
576
             {
577
-                m_tmpL1[i] = rec[ctuWidth - 1];
578
-                rec += stride;
579
+                int offset[NUM_EDGETYPE];
580
+                offset[0] = 0;
581
+                for (int i = 0; i < SAO_NUM_OFFSET; i++)
582
+                    offset[i + 1] = ctuParam[2][addr].offset[i] << SAO_BIT_INC;
583
+
584
+                for (int edgeType = 0; edgeType < NUM_EDGETYPE; edgeType++)
585
+                    m_offsetEo[2][edgeType] = (int8_t)offset[s_eoTable[edgeType]];
586
             }
587
         }
588
+        processSaoCu(addr, typeIdxCr, 2);
589
     }
590
 
591
-    std::swap(m_tmpU1[plane], m_tmpU2[plane]);
592
-}
593
-
594
-void SAO::resetSaoUnit(SaoCtuParam* saoUnit)
595
-{
596
-    saoUnit->mergeMode  = SAO_MERGE_NONE;
597
-    saoUnit->typeIdx    = -1;
598
-    saoUnit->bandPos    = 0;
599
-
600
-    for (int i = 0; i < SAO_NUM_OFFSET; i++)
601
-        saoUnit->offset[i] = 0;
602
+    std::swap(m_tmpL1[1], m_tmpL2[1]);
603
+    std::swap(m_tmpL1[2], m_tmpL2[2]);
604
 }
605
 
606
 void SAO::copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc)
607
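
Row-at-a-time filtering is replaced by per-CTU entry points so SAO can run inside the frame encoder's wavefront; each plane therefore keeps its own left-column buffers, snapshotting the CTU's right edge before filtering and swapping pointers afterwards. The buffer discipline in miniature (8-bit pixels assumed):

    #include <cstddef>
    #include <cstdint>
    #include <utility>

    struct PlaneCols { uint8_t* tmpL1; uint8_t* tmpL2; };

    // Snapshot the CTU's right edge before filtering mutates it, filter using
    // tmpL1 as the unfiltered left column, then swap pointers so tmpL2
    // becomes the next CTU's tmpL1.
    void processCtuColumns(PlaneCols& c, uint8_t* rec, ptrdiff_t stride,
                           int ctuWidth, int ctuHeight, bool lastInRow)
    {
        if (!lastInRow)
            for (int i = 0; i < ctuHeight + 1; i++)
                c.tmpL2[i] = rec[i * stride + ctuWidth - 1];
        // ... SAO filtering of this CTU happens here ...
        std::swap(c.tmpL1, c.tmpL2);
    }
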
@@ -671,12 +771,13 @@
608
 /* Calculate SAO statistics for current CTU without non-crossing slice */
609
 void SAO::calcSaoStatsCu(int addr, int plane)
610
 {
611
+    const PicYuv* reconPic = m_frame->m_reconPic;
612
     const CUData* cu = m_frame->m_encData->getPicCTU(addr);
613
     const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
614
-    const pixel* rec0  = m_frame->m_reconPic->getPlaneAddr(plane, addr);
615
+    const pixel* rec0  = reconPic->getPlaneAddr(plane, addr);
616
     const pixel* fenc;
617
     const pixel* rec;
618
-    intptr_t stride = plane ? m_frame->m_reconPic->m_strideC : m_frame->m_reconPic->m_stride;
619
+    intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
620
     uint32_t picWidth  = m_param->sourceWidth;
621
     uint32_t picHeight = m_param->sourceHeight;
622
     int ctuWidth  = g_maxCUSize;
623
@@ -702,24 +803,48 @@
624
     int endX;
625
     int endY;
626
 
627
-    int skipB = plane ? 2 : 4;
628
-    int skipR = plane ? 3 : 5;
629
+    const int plane_offset = plane ? 2 : 0;
630
+    int skipB = 4;
631
+    int skipR = 5;
632
 
633
-    int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
634
-    int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
635
+    int8_t _upBuff[2 * (MAX_CU_SIZE + 16 + 16)], *upBuff1 = _upBuff + 16, *upBufft = upBuff1 + (MAX_CU_SIZE + 16 + 16);
636
+
637
+    ALIGN_VAR_32(int16_t, diff[MAX_CU_SIZE * MAX_CU_SIZE]);
638
+
639
+    // Calculate (fenc - frec) and put into diff[]
640
+    if ((lpelx + ctuWidth <  picWidth) & (tpely + ctuHeight < picHeight))
641
+    {
642
+        // WARNING: may read beyond bounds when ctuWidth or ctuHeight is NOT a multiple of cuSize
643
+        X265_CHECK((ctuWidth == ctuHeight) || (m_chromaFormat != X265_CSP_I420), "video size check failure\n");
644
+        if (plane)
645
+            primitives.chroma[m_chromaFormat].cu[g_maxLog2CUSize - 2].sub_ps(diff, MAX_CU_SIZE, fenc0, rec0, stride, stride);
646
+        else
647
+           primitives.cu[g_maxLog2CUSize - 2].sub_ps(diff, MAX_CU_SIZE, fenc0, rec0, stride, stride);
648
+    }
649
+    else
650
+    {
651
+        // fallback path for partial CTUs (mostly at picture edges)
652
+        for(int y = 0; y < ctuHeight; y++)
653
+        {
654
+            for(int x = 0; x < ctuWidth; x++)
655
+            {
656
+                diff[y * MAX_CU_SIZE + x] = (fenc0[y * stride + x] - rec0[y * stride + x]);
657
+            }
658
+        }
659
+    }
660
 
661
     // SAO_BO:
662
     {
663
         if (m_param->bSaoNonDeblocked)
664
         {
665
-            skipB = plane ? 1 : 3;
666
-            skipR = plane ? 2 : 4;
667
+            skipB = 3;
668
+            skipR = 4;
669
         }
670
 
671
-        endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
672
-        endY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB;
673
+        endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR + plane_offset;
674
+        endY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB + plane_offset;
675
 
676
-        primitives.saoCuStatsBO(fenc0, rec0, stride, endX, endY, m_offsetOrg[plane][SAO_BO], m_count[plane][SAO_BO]);
677
+        primitives.saoCuStatsBO(diff, rec0, stride, endX, endY, m_offsetOrg[plane][SAO_BO], m_count[plane][SAO_BO]);
678
     }
679
 
680
     {
681
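
calcSaoStatsCu() now computes (fenc - rec) once into a 16-bit diff[] buffer and hands that to every saoCuStats kernel, instead of each kernel re-reading both planes; full-size CTUs go through the optimized sub_ps primitive, clipped CTUs through a scalar loop. That fallback loop, stated on its own (8-bit pixels assumed):

    #include <cstddef>
    #include <cstdint>

    const int MAX_CU = 64;   // MAX_CU_SIZE for 64x64 CTUs

    // Scalar residual precompute used when the CTU is clipped at the picture
    // edge; full-size CTUs use the optimized sub_ps primitive instead.
    void buildDiff(int16_t* diff, const uint8_t* fenc, const uint8_t* rec,
                   ptrdiff_t stride, int width, int height)
    {
        for (int y = 0; y < height; y++)
            for (int x = 0; x < width; x++)
                diff[y * MAX_CU + x] = (int16_t)(fenc[y * stride + x] - rec[y * stride + x]);
    }
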
@@ -727,84 +852,82 @@
682
         {
683
             if (m_param->bSaoNonDeblocked)
684
             {
685
-                skipB = plane ? 1 : 3;
686
-                skipR = plane ? 3 : 5;
687
+                skipB = 3;
688
+                skipR = 5;
689
             }
690
 
691
             startX = !lpelx;
692
-            endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
693
+            endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
694
 
695
-            primitives.saoCuStatsE0(fenc0 + startX, rec0 + startX, stride, endX - startX, ctuHeight - skipB, m_offsetOrg[plane][SAO_EO_0], m_count[plane][SAO_EO_0]);
696
+            primitives.saoCuStatsE0(diff + startX, rec0 + startX, stride, endX - startX, ctuHeight - skipB + plane_offset, m_offsetOrg[plane][SAO_EO_0], m_count[plane][SAO_EO_0]);
697
         }
698
 
699
         // SAO_EO_1: // dir: |
700
         {
701
             if (m_param->bSaoNonDeblocked)
702
             {
703
-                skipB = plane ? 2 : 4;
704
-                skipR = plane ? 2 : 4;
705
+                skipB = 4;
706
+                skipR = 4;
707
             }
708
 
709
-            fenc = fenc0;
710
             rec  = rec0;
711
 
712
             startY = !tpely;
713
-            endX   = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR;
714
-            endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
715
+            endX   = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR + plane_offset;
716
+            endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
717
             if (!tpely)
718
             {
719
-                fenc += stride;
720
                 rec += stride;
721
             }
722
 
723
             primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth);
724
 
725
-            primitives.saoCuStatsE1(fenc0 + startY * stride, rec0 + startY * stride, stride, upBuff1, endX, endY - startY, m_offsetOrg[plane][SAO_EO_1], m_count[plane][SAO_EO_1]);
726
+            primitives.saoCuStatsE1(diff + startY * MAX_CU_SIZE, rec0 + startY * stride, stride, upBuff1, endX, endY - startY, m_offsetOrg[plane][SAO_EO_1], m_count[plane][SAO_EO_1]);
727
         }
728
 
729
         // SAO_EO_2: // dir: 135
730
         {
731
             if (m_param->bSaoNonDeblocked)
732
             {
733
-                skipB = plane ? 2 : 4;
734
-                skipR = plane ? 3 : 5;
735
+                skipB = 4;
736
+                skipR = 5;
737
             }
738
 
739
             fenc = fenc0;
740
             rec  = rec0;
741
 
742
             startX = !lpelx;
743
-            endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
744
+            endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
745
 
746
             startY = !tpely;
747
-            endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
748
+            endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
749
             if (!tpely)
750
             {
751
                 fenc += stride;
752
                 rec += stride;
753
             }
754
 
755
-            primitives.sign(&upBuff1[startX], &rec[startX], &rec[startX - stride - 1], (endX - startX));
756
+            primitives.sign(upBuff1, &rec[startX], &rec[startX - stride - 1], (endX - startX));
757
 
758
-            primitives.saoCuStatsE2(fenc0 + startX + startY * stride, rec0  + startX + startY * stride, stride, upBuff1 + startX, upBufft + startX, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_2], m_count[plane][SAO_EO_2]);
759
+            primitives.saoCuStatsE2(diff + startX + startY * MAX_CU_SIZE, rec0  + startX + startY * stride, stride, upBuff1, upBufft, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_2], m_count[plane][SAO_EO_2]);
760
         }
761
 
762
         // SAO_EO_3: // dir: 45
763
         {
764
             if (m_param->bSaoNonDeblocked)
765
             {
766
-                skipB = plane ? 2 : 4;
767
-                skipR = plane ? 3 : 5;
768
+                skipB = 4;
769
+                skipR = 5;
770
             }
771
 
772
             fenc = fenc0;
773
             rec  = rec0;
774
 
775
             startX = !lpelx;
776
-            endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR;
777
+            endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
778
 
779
             startY = !tpely;
780
-            endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB;
781
+            endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
782
 
783
             if (!tpely)
784
             {
785
@@ -812,9 +935,9 @@
786
                 rec += stride;
787
             }
788
 
789
-            primitives.sign(&upBuff1[startX - 1], &rec[startX - 1], &rec[startX - 1 - stride + 1], (endX - startX + 1));
790
+            primitives.sign(upBuff1, &rec[startX - 1], &rec[startX - 1 - stride + 1], (endX - startX + 1));
791
 
792
-            primitives.saoCuStatsE3(fenc0 + startX + startY * stride, rec0  + startX + startY * stride, stride, upBuff1 + startX, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_3], m_count[plane][SAO_EO_3]);
793
+            primitives.saoCuStatsE3(diff + startX + startY * MAX_CU_SIZE, rec0  + startX + startY * stride, stride, upBuff1 + 1, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_3], m_count[plane][SAO_EO_3]);
794
         }
795
     }
796
 }
797
@@ -825,9 +948,10 @@
798
 
799
     int x, y;
800
     const CUData* cu = frame->m_encData->getPicCTU(addr);
801
+    const PicYuv* reconPic = m_frame->m_reconPic;
802
     const pixel* fenc;
803
     const pixel* rec;
804
-    intptr_t stride = m_frame->m_reconPic->m_stride;
805
+    intptr_t stride = reconPic->m_stride;
806
     uint32_t picWidth  = m_param->sourceWidth;
807
     uint32_t picHeight = m_param->sourceHeight;
808
     int ctuWidth  = g_maxCUSize;
809
@@ -857,11 +981,12 @@
810
     memset(m_countPreDblk[addr], 0, sizeof(PerPlane));
811
     memset(m_offsetOrgPreDblk[addr], 0, sizeof(PerPlane));
812
 
813
-    for (int plane = 0; plane < NUM_PLANE; plane++)
814
+    int plane_offset = 0;
815
+    for (int plane = 0; plane < (frame->m_param->internalCsp != X265_CSP_I400 ? NUM_PLANE : 1); plane++)
816
     {
817
         if (plane == 1)
818
         {
819
-            stride = frame->m_reconPic->m_strideC;
820
+            stride = reconPic->m_strideC;
821
             picWidth  >>= m_hChromaShift;
822
             picHeight >>= m_vChromaShift;
823
             ctuWidth  >>= m_hChromaShift;
824
@@ -874,14 +999,14 @@
825
 
826
         // SAO_BO:
827
 
828
-        skipB = plane ? 1 : 3;
829
-        skipR = plane ? 2 : 4;
830
+        skipB = 3 - plane_offset;
831
+        skipR = 4 - plane_offset;
832
 
833
         stats = m_offsetOrgPreDblk[addr][plane][SAO_BO];
834
         count = m_countPreDblk[addr][plane][SAO_BO];
835
 
836
         const pixel* fenc0 = m_frame->m_fencPic->getPlaneAddr(plane, addr);
837
-        const pixel* rec0  = m_frame->m_reconPic->getPlaneAddr(plane, addr);
838
+        const pixel* rec0 = reconPic->getPlaneAddr(plane, addr);
839
         fenc = fenc0;
840
         rec  = rec0;
841
 
842
@@ -903,8 +1028,8 @@
843
 
844
         // SAO_EO_0: // dir: -
845
         {
846
-            skipB = plane ? 1 : 3;
847
-            skipR = plane ? 3 : 5;
848
+            skipB = 3 - plane_offset;
849
+            skipR = 5 - plane_offset;
850
 
851
             stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_0];
852
             count = m_countPreDblk[addr][plane][SAO_EO_0];
853
@@ -939,8 +1064,8 @@
854
 
855
         // SAO_EO_1: // dir: |
856
         {
857
-            skipB = plane ? 2 : 4;
858
-            skipR = plane ? 2 : 4;
859
+            skipB = 4 - plane_offset;
860
+            skipR = 4 - plane_offset;
861
 
862
             stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_1];
863
             count = m_countPreDblk[addr][plane][SAO_EO_1];
864
@@ -984,8 +1109,8 @@
865
 
866
         // SAO_EO_2: // dir: 135
867
         {
868
-            skipB = plane ? 2 : 4;
869
-            skipR = plane ? 3 : 5;
870
+            skipB = 4 - plane_offset;
871
+            skipR = 5 - plane_offset;
872
 
873
             stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_2];
874
             count = m_countPreDblk[addr][plane][SAO_EO_2];
875
@@ -1036,8 +1161,8 @@
876
 
877
         // SAO_EO_3: // dir: 45
878
         {
879
-            skipB = plane ? 2 : 4;
880
-            skipR = plane ? 3 : 5;
881
+            skipB = 4 - plane_offset;
882
+            skipR = 5 - plane_offset;
883
 
884
             stats = m_offsetOrgPreDblk[addr][plane][SAO_EO_3];
885
             count = m_countPreDblk[addr][plane][SAO_EO_3];
886
@@ -1083,28 +1208,29 @@
887
                 fenc += stride;
888
             }
889
         }
890
+        plane_offset = 2;
891
     }
892
 }
893
 
894
 /* reset offset statistics */
895
 void SAO::resetStats()
896
 {
897
-    memset(m_count, 0, sizeof(PerClass) * NUM_PLANE);
898
-    memset(m_offset, 0, sizeof(PerClass) * NUM_PLANE);
899
-    memset(m_offsetOrg, 0, sizeof(PerClass) * NUM_PLANE);
900
+    memset(m_count, 0, sizeof(m_count));
901
+    memset(m_offset, 0, sizeof(m_offset));
902
+    memset(m_offsetOrg, 0, sizeof(m_offsetOrg));
903
 }
904
 
905
 void SAO::rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus)
906
 {
907
     if (!saoParam->bSaoFlag[0])
908
-        m_depthSaoRate[0][m_refDepth] = 1.0;
909
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth] = 1.0;
910
     else
911
-        m_depthSaoRate[0][m_refDepth] = m_numNoSao[0] / ((double)numctus);
912
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth] = m_numNoSao[0] / ((double)numctus);
913
 
914
     if (!saoParam->bSaoFlag[1])
915
-        m_depthSaoRate[1][m_refDepth] = 1.0;
916
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth] = 1.0;
917
     else
918
-        m_depthSaoRate[1][m_refDepth] = m_numNoSao[1] / ((double)numctus);
919
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth] = m_numNoSao[1] / ((double)numctus);
920
 }
921
 
922
 void SAO::rdoSaoUnitRow(SAOParam* saoParam, int idxY)
923
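
m_depthSaoRate changes from a fixed [2][4] member array to a heap block shared through the root instance, addressed as component * SAO_DEPTHRATE_SIZE + depth. A tiny accessor making the layout explicit; SAO_DEPTHRATE_SIZE is assumed to be 4, matching the four depths initialized in create():

    const int SAO_DEPTHRATE_SIZE = 4;   // assumed: depths 0..3, as used above

    // comp: 0 = luma, 1 = chroma
    inline double& depthSaoRate(double* table, int comp, int depth)
    {
        return table[comp * SAO_DEPTHRATE_SIZE + depth];
    }
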
@@ -1127,37 +1253,38 @@
924
         if (allowMerge[1])
925
             m_entropyCoder.codeSaoMerge(0);
926
         m_entropyCoder.store(m_rdContexts.temp);
927
+
928
         // reset stats Y, Cb, Cr
929
-        for (int plane = 0; plane < 3; plane++)
930
+        X265_CHECK(sizeof(PerPlane) == (sizeof(int32_t) * (NUM_PLANE * MAX_NUM_SAO_TYPE * MAX_NUM_SAO_CLASS)), "Found Padding space in struct PerPlane");
931
+
932
+        // TODO: Confirm the address space is continuous
933
+        if (m_param->bSaoNonDeblocked)
934
         {
935
-            for (int j = 0; j < MAX_NUM_SAO_TYPE; j++)
936
-            {
937
-                for (int k = 0; k < MAX_NUM_SAO_CLASS; k++)
938
-                {
939
-                    m_offset[plane][j][k] = 0;
940
-                    if (m_param->bSaoNonDeblocked)
941
-                    {
942
-                        m_count[plane][j][k] = m_countPreDblk[addr][plane][j][k];
943
-                        m_offsetOrg[plane][j][k] = m_offsetOrgPreDblk[addr][plane][j][k];
944
-                    }
945
-                    else
946
-                    {
947
-                        m_count[plane][j][k] = 0;
948
-                        m_offsetOrg[plane][j][k] = 0;
949
-                    }
950
-                }
951
-            }
952
+            memcpy(m_count, m_countPreDblk[addr], sizeof(m_count));
953
+            memcpy(m_offsetOrg, m_offsetOrgPreDblk[addr], sizeof(m_offsetOrg));
954
+        }
955
+        else
956
+        {
957
+            memset(m_count, 0, sizeof(m_count));
958
+            memset(m_offsetOrg, 0, sizeof(m_offsetOrg));
959
+        }
960
 
961
-            saoParam->ctuParam[plane][addr].mergeMode = SAO_MERGE_NONE;
962
-            saoParam->ctuParam[plane][addr].typeIdx   = -1;
963
-            saoParam->ctuParam[plane][addr].bandPos   = 0;
964
-            if (saoParam->bSaoFlag[plane > 0])
965
-                calcSaoStatsCu(addr, plane);
966
+        saoParam->ctuParam[0][addr].reset();
967
+        saoParam->ctuParam[1][addr].reset();
968
+        saoParam->ctuParam[2][addr].reset();
969
+
970
+        if (saoParam->bSaoFlag[0])
971
+            calcSaoStatsCu(addr, 0);
972
+
973
+        if (saoParam->bSaoFlag[1])
974
+        {
975
+            calcSaoStatsCu(addr, 1);
976
+            calcSaoStatsCu(addr, 2);
977
         }
978
 
979
         saoComponentParamDist(saoParam, addr, addrUp, addrLeft, &mergeSaoParam[0][0], mergeDist);
980
-
981
-        sao2ChromaParamDist(saoParam, addr, addrUp, addrLeft, mergeSaoParam, mergeDist);
982
+        if (m_chromaFormat != X265_CSP_I400)
983
+            sao2ChromaParamDist(saoParam, addr, addrUp, addrLeft, mergeSaoParam, mergeDist);
984
 
985
         if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
986
         {
987
@@ -1209,14 +1336,122 @@
988
 
989
             if (saoParam->ctuParam[0][addr].typeIdx < 0)
990
                 m_numNoSao[0]++;
991
-            if (saoParam->ctuParam[1][addr].typeIdx < 0)
992
+            if (m_chromaFormat != X265_CSP_I400 && saoParam->ctuParam[1][addr].typeIdx < 0)
993
                 m_numNoSao[1]++;
994
+
995
             m_entropyCoder.load(m_rdContexts.temp);
996
             m_entropyCoder.store(m_rdContexts.cur);
997
         }
998
     }
999
 }
1000
 
1001
+void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr)
1002
+{
1003
+    SaoCtuParam mergeSaoParam[NUM_MERGE_MODE][2];
1004
+    double mergeDist[NUM_MERGE_MODE];
1005
+    const bool allowMerge[2] = {(idxX != 0), (rowBaseAddr != 0)}; // left, up
1006
+
1007
+    const int addrUp   = rowBaseAddr ? addr - m_numCuInWidth : -1;
1008
+    const int addrLeft = idxX ? addr - 1 : -1;
1009
+
1010
+    bool chroma = m_param->internalCsp != X265_CSP_I400;
1011
+    int planes = chroma ? 3 : 1;
1012
+
1013
+    m_entropyCoder.load(m_rdContexts.cur);
1014
+    if (allowMerge[0])
1015
+        m_entropyCoder.codeSaoMerge(0);
1016
+    if (allowMerge[1])
1017
+        m_entropyCoder.codeSaoMerge(0);
1018
+    m_entropyCoder.store(m_rdContexts.temp);
1019
+
1020
+    // reset stats Y, Cb, Cr
1021
+    X265_CHECK(sizeof(PerPlane) == (sizeof(int32_t) * (NUM_PLANE * MAX_NUM_SAO_TYPE * MAX_NUM_SAO_CLASS)), "Found Padding space in struct PerPlane");
1022
+
1023
+    // TODO: Confirm the address space is continuous
1024
+    if (m_param->bSaoNonDeblocked)
1025
+    {
1026
+        memcpy(m_count, m_countPreDblk[addr], sizeof(m_count));
1027
+        memcpy(m_offsetOrg, m_offsetOrgPreDblk[addr], sizeof(m_offsetOrg));
1028
+    }
1029
+    else
1030
+    {
1031
+        memset(m_count, 0, sizeof(m_count));
1032
+        memset(m_offsetOrg, 0, sizeof(m_offsetOrg));
1033
+    }
1034
+
1035
+    for (int i = 0; i < planes; i++)
1036
+        saoParam->ctuParam[i][addr].reset();
1037
+
1038
+    if (saoParam->bSaoFlag[0])
1039
+        calcSaoStatsCu(addr, 0);
1040
+
1041
+    if (saoParam->bSaoFlag[1])
1042
+    {
1043
+        calcSaoStatsCu(addr, 1);
1044
+        calcSaoStatsCu(addr, 2);
1045
+    }
1046
+
1047
+    saoComponentParamDist(saoParam, addr, addrUp, addrLeft, &mergeSaoParam[0][0], mergeDist);
1048
+    if (chroma)
1049
+        sao2ChromaParamDist(saoParam, addr, addrUp, addrLeft, mergeSaoParam, mergeDist);
1050
+
1051
+    if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
1052
+    {
1053
+        // Cost of new SAO_params
1054
+        m_entropyCoder.load(m_rdContexts.cur);
1055
+        m_entropyCoder.resetBits();
1056
+        if (allowMerge[0])
1057
+            m_entropyCoder.codeSaoMerge(0);
1058
+        if (allowMerge[1])
1059
+            m_entropyCoder.codeSaoMerge(0);
1060
+        for (int plane = 0; plane < planes; plane++)
1061
+        {
1062
+            if (saoParam->bSaoFlag[plane > 0])
1063
+                m_entropyCoder.codeSaoOffset(saoParam->ctuParam[plane][addr], plane);
1064
+        }
1065
+
1066
+        uint32_t rate = m_entropyCoder.getNumberOfWrittenBits();
1067
+        double bestCost = mergeDist[0] + (double)rate;
1068
+        m_entropyCoder.store(m_rdContexts.temp);
1069
+
1070
+        // Cost of Merge
1071
+        for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx)
1072
+        {
1073
+            if (!allowMerge[mergeIdx])
1074
+                continue;
1075
+
1076
+            m_entropyCoder.load(m_rdContexts.cur);
1077
+            m_entropyCoder.resetBits();
1078
+            if (allowMerge[0])
1079
+                m_entropyCoder.codeSaoMerge(1 - mergeIdx);
1080
+            if (allowMerge[1] && (mergeIdx == 1))
1081
+                m_entropyCoder.codeSaoMerge(1);
1082
+
1083
+            rate = m_entropyCoder.getNumberOfWrittenBits();
1084
+            double mergeCost = mergeDist[mergeIdx + 1] + (double)rate;
1085
+            if (mergeCost < bestCost)
1086
+            {
1087
+                SaoMergeMode mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;
1088
+                bestCost = mergeCost;
1089
+                m_entropyCoder.store(m_rdContexts.temp);
1090
+                for (int plane = 0; plane < planes; plane++)
1091
+                {
1092
+                    mergeSaoParam[plane][mergeIdx].mergeMode = mergeMode;
1093
+                    if (saoParam->bSaoFlag[plane > 0])
1094
+                        copySaoUnit(&saoParam->ctuParam[plane][addr], &mergeSaoParam[plane][mergeIdx]);
1095
+                }
1096
+            }
1097
+        }
1098
+
1099
+        if (saoParam->ctuParam[0][addr].typeIdx < 0)
1100
+            m_numNoSao[0]++;
1101
+        if (chroma && saoParam->ctuParam[1][addr].typeIdx < 0)
1102
+            m_numNoSao[1]++;
1103
+        m_entropyCoder.load(m_rdContexts.temp);
1104
+        m_entropyCoder.store(m_rdContexts.cur);
1105
+    }
1106
+}
1107
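Note: rdoSaoUnitCu is the new per-CTU entry point; it repeats the per-CTU body of rdoSaoUnitRow for a single addr, which is what allows SAO decisions to be taken CTU by CTU instead of a row at a time. The decision compares the rate-distortion cost of freshly coded offsets against the merge-left and merge-up candidates. The core comparison in isolation, with rates already folded into the costs for clarity (in the real code they come from the entropy coder's written-bit counter):

    struct SaoChoice { double cost; int mergeIdx; }; // mergeIdx: -1 = code new params

    inline SaoChoice pickSaoMode(double newCost,
                                 const double mergeCost[2],  // [0] = left, [1] = up
                                 const bool allowMerge[2])
    {
        SaoChoice best = { newCost, -1 };
        for (int m = 0; m < 2; m++)
        {
            if (!allowMerge[m])
                continue;
            if (mergeCost[m] < best.cost)
                best = { mergeCost[m], m }; // SAO_MERGE_LEFT / SAO_MERGE_UP
        }
        return best;
    }

allowMerge is derived purely from position (idxX != 0, rowBaseAddr != 0), so the top-left CTU can never merge.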
+
1108
 /** rate distortion optimization of SAO unit */
1109
 inline int64_t SAO::estSaoTypeDist(int plane, int typeIdx, double lambda, int32_t* currentDistortionTableBo, double* currentRdCostTableBo)
1110
 {
1111
@@ -1302,7 +1537,6 @@
1112
     int    currentDistortionTableBo[MAX_NUM_SAO_CLASS];
1113
     double currentRdCostTableBo[MAX_NUM_SAO_CLASS];
1114
 
1115
-    resetSaoUnit(lclCtuParam);
1116
     m_entropyCoder.load(m_rdContexts.temp);
1117
     m_entropyCoder.resetBits();
1118
     m_entropyCoder.codeSaoOffset(*lclCtuParam, 0);
1119
@@ -1362,7 +1596,6 @@
1120
     m_entropyCoder.store(m_rdContexts.temp);
1121
 
1122
     // merge left or merge up
1123
-
1124
     for (int mergeIdx = 0; mergeIdx < 2; mergeIdx++)
1125
     {
1126
         SaoCtuParam* mergeSrcParam = NULL;
1127
@@ -1389,8 +1622,6 @@
1128
 
1129
             mergeDist[mergeIdx + 1] = ((double)estDist / m_lumaLambda);
1130
         }
1131
-        else
1132
-            resetSaoUnit(&mergeSaoParam[mergeIdx]);
1133
     }
1134
 }
1135
 
1136
@@ -1404,8 +1635,6 @@
1137
     int    bestClassTableBo[2] = { 0, 0 };
1138
     int    currentDistortionTableBo[MAX_NUM_SAO_CLASS];
1139
 
1140
-    resetSaoUnit(lclCtuParam[0]);
1141
-    resetSaoUnit(lclCtuParam[1]);
1142
     m_entropyCoder.load(m_rdContexts.temp);
1143
     m_entropyCoder.resetBits();
1144
     m_entropyCoder.codeSaoOffset(*lclCtuParam[0], 1);
1145
@@ -1483,7 +1712,6 @@
1146
     m_entropyCoder.store(m_rdContexts.temp);
1147
 
1148
     // merge left or merge up
1149
-
1150
     for (int mergeIdx = 0; mergeIdx < 2; mergeIdx++)
1151
     {
1152
         for (int compIdx = 0; compIdx < 2; compIdx++)
1153
@@ -1512,14 +1740,12 @@
1154
                 mergeSaoParam[plane][mergeIdx].mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;
1155
                 mergeDist[mergeIdx + 1] += ((double)estDist / m_chromaLambda);
1156
             }
1157
-            else
1158
-                resetSaoUnit(&mergeSaoParam[plane][mergeIdx]);
1159
         }
1160
     }
1161
 }
1162
 
1163
 // NOTE: must put in namespace X265_NS since we need class SAO
1164
-void saoCuStatsBO_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
1165
+void saoCuStatsBO_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
1166
 {
1167
     int x, y;
1168
     const int boShift = X265_DEPTH - SAO_BO_BITS;
1169
@@ -1529,21 +1755,23 @@
1170
         for (x = 0; x < endX; x++)
1171
         {
1172
             int classIdx = 1 + (rec[x] >> boShift);
1173
-            stats[classIdx] += (fenc[x] - rec[x]);
1174
+            stats[classIdx] += diff[x];
1175
             count[classIdx]++;
1176
         }
1177
 
1178
-        fenc += stride;
1179
+        diff += MAX_CU_SIZE;
1180
         rec += stride;
1181
     }
1182
 }
1183
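Note: from here on, all five saoCuStats kernels take a precomputed int16_t difference plane instead of the source frame: the fenc - rec subtraction is hoisted out and done once per CTU, and the diff buffer uses a fixed MAX_CU_SIZE stride so SIMD implementations can assume a compile-time pitch. A hypothetical helper showing how such a plane would be filled (the function name is illustrative; x265 computes this elsewhere in the SAO path):

    #include <cstdint>

    enum { MAX_CU_SIZE = 64 };
    typedef uint8_t pixel; // 8-bit build; 10/12-bit builds use uint16_t

    static void computeSaoDiff(const pixel* fenc, const pixel* rec, intptr_t stride,
                               int width, int height, int16_t* diff)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                diff[x] = (int16_t)(fenc[x] - rec[x]); // |diff| < 2^bitDepth <= 4096, fits int16_t
            fenc += stride;
            rec  += stride;
            diff += MAX_CU_SIZE; // fixed pitch, independent of the picture stride
        }
    }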
 
1184
-void saoCuStatsE0_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
1185
+void saoCuStatsE0_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
1186
 {
1187
     int x, y;
1188
     int32_t tmp_stats[SAO::NUM_EDGETYPE];
1189
     int32_t tmp_count[SAO::NUM_EDGETYPE];
1190
 
1191
+    X265_CHECK(endX <= MAX_CU_SIZE, "endX too big\n");
1192
+
1193
     memset(tmp_stats, 0, sizeof(tmp_stats));
1194
     memset(tmp_count, 0, sizeof(tmp_count));
1195
 
1196
@@ -1558,11 +1786,11 @@
1197
             signLeft = -signRight;
1198
 
1199
             X265_CHECK(edgeType <= 4, "edgeType check failure\n");
1200
-            tmp_stats[edgeType] += (fenc[x] - rec[x]);
1201
+            tmp_stats[edgeType] += diff[x];
1202
             tmp_count[edgeType]++;
1203
         }
1204
 
1205
-        fenc += stride;
1206
+        diff += MAX_CU_SIZE;
1207
         rec += stride;
1208
     }
1209
 
1210
@@ -1573,7 +1801,7 @@
1211
     }
1212
 }
1213
 
1214
-void saoCuStatsE1_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
1215
+void saoCuStatsE1_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
1216
 {
1217
     X265_CHECK(endX <= MAX_CU_SIZE, "endX check failure\n");
1218
     X265_CHECK(endY <= MAX_CU_SIZE, "endY check failure\n");
1219
@@ -1585,6 +1813,7 @@
1220
     memset(tmp_stats, 0, sizeof(tmp_stats));
1221
     memset(tmp_count, 0, sizeof(tmp_count));
1222
 
1223
+    X265_CHECK(endX * endY <= (4096 - 16), "Assembly of saoE1 may overflow with this block size\n");
1224
     for (y = 0; y < endY; y++)
1225
     {
1226
         for (x = 0; x < endX; x++)
1227
@@ -1594,10 +1823,11 @@
1228
             uint32_t edgeType = signDown + upBuff1[x] + 2;
1229
             upBuff1[x] = (int8_t)(-signDown);
1230
 
1231
-            tmp_stats[edgeType] += (fenc[x] - rec[x]);
1232
+            X265_CHECK(edgeType <= 4, "edgeType check failure\n");
1233
+            tmp_stats[edgeType] += diff[x];
1234
             tmp_count[edgeType]++;
1235
         }
1236
-        fenc += stride;
1237
+        diff += MAX_CU_SIZE;
1238
         rec += stride;
1239
     }
1240
 
1241
@@ -1608,7 +1838,7 @@
1242
     }
1243
 }
1244
 
1245
-void saoCuStatsE2_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
1246
+void saoCuStatsE2_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
1247
 {
1248
     X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
1249
     X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
1250
@@ -1629,14 +1859,14 @@
1251
             X265_CHECK(signDown == signOf(rec[x] - rec[x + stride + 1]), "signDown check failure\n");
1252
             uint32_t edgeType = signDown + upBuff1[x] + 2;
1253
             upBufft[x + 1] = (int8_t)(-signDown);
1254
-            tmp_stats[edgeType] += (fenc[x] - rec[x]);
1255
+            tmp_stats[edgeType] += diff[x];
1256
             tmp_count[edgeType]++;
1257
         }
1258
 
1259
         std::swap(upBuff1, upBufft);
1260
 
1261
         rec += stride;
1262
-        fenc += stride;
1263
+        diff += MAX_CU_SIZE;
1264
     }
1265
 
1266
     for (x = 0; x < SAO::NUM_EDGETYPE; x++)
1267
@@ -1646,7 +1876,7 @@
1268
     }
1269
 }
1270
 
1271
-void saoCuStatsE3_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
1272
+void saoCuStatsE3_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
1273
 {
1274
     X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
1275
     X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
1276
@@ -1668,14 +1898,14 @@
1277
 
1278
             uint32_t edgeType = signDown + upBuff1[x] + 2;
1279
             upBuff1[x - 1] = (int8_t)(-signDown);
1280
-            tmp_stats[edgeType] += (fenc[x] - rec[x]);
1281
+            tmp_stats[edgeType] += diff[x];
1282
             tmp_count[edgeType]++;
1283
         }
1284
 
1285
         upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
1286
 
1287
         rec += stride;
1288
-        fenc += stride;
1289
+        diff += MAX_CU_SIZE;
1290
     }
1291
 
1292
     for (x = 0; x < SAO::NUM_EDGETYPE; x++)
1293
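Note: the four edge-offset kernels above share one classification rule: each sample is bucketed by the signs of its differences with two neighbours along the filter direction, yielding edge types 0..4 (valley, concave corner, none, convex corner, peak). A compact restatement for the horizontal (EO_0) case; the real kernels additionally carry signLeft = -signRight from the previous column instead of recomputing it:

    #include <cstdint>

    static inline int signOf(int a) { return (a > 0) - (a < 0); }

    // Classify sample x of a reconstructed row; valid for 0 < x < width - 1.
    static inline uint32_t edgeTypeE0(const uint8_t* rec, int x)
    {
        int signLeft  = signOf(rec[x] - rec[x - 1]);
        int signRight = signOf(rec[x] - rec[x + 1]);
        return (uint32_t)(signLeft + signRight + 2); // 0..4
    }

The new X265_CHECK on endX * endY in saoCuStatsE1_c documents an assembly-side limit: the SIMD build evidently accumulates partial sums in narrow lanes, so a single call must not cover more than 4096 - 16 samples.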
x265_1.8.tar.gz/source/encoder/sao.h -> x265_1.9.tar.gz/source/encoder/sao.h Changed
80
 
1
@@ -62,6 +62,7 @@
2
     enum { NUM_EDGETYPE = 5 };
3
     enum { NUM_PLANE = 3 };
4
     enum { NUM_MERGE_MODE = 3 };
5
+    enum { SAO_DEPTHRATE_SIZE = 4 };
6
 
7
     static const uint32_t s_eoTable[NUM_EDGETYPE];
8
 
9
@@ -71,18 +72,19 @@
10
 protected:
11
 
12
     /* allocated per part */
13
-    PerClass*   m_count;
14
-    PerClass*   m_offset;
15
-    PerClass*   m_offsetOrg;
16
+    PerPlane    m_count;
17
+    PerPlane    m_offset;
18
+    PerPlane    m_offsetOrg;
19
 
20
     /* allocated per CTU */
21
     PerPlane*   m_countPreDblk;
22
     PerPlane*   m_offsetOrgPreDblk;
23
 
24
-    double      m_depthSaoRate[2][4];
25
-    int8_t      m_offsetBo[SAO_NUM_BO_CLASSES];
26
-    int8_t      m_offsetEo[NUM_EDGETYPE];
27
+    double*     m_depthSaoRate;
28
+    int8_t      m_offsetBo[NUM_PLANE][SAO_NUM_BO_CLASSES];
29
+    int8_t      m_offsetEo[NUM_PLANE][NUM_EDGETYPE];
30
 
31
+    int         m_chromaFormat;
32
     int         m_numCuInWidth;
33
     int         m_numCuInHeight;
34
     int         m_hChromaShift;
35
@@ -91,10 +93,9 @@
36
     pixel*      m_clipTable;
37
     pixel*      m_clipTableBase;
38
 
39
-    pixel*      m_tmpU1[3];
40
-    pixel*      m_tmpU2[3];
41
-    pixel*      m_tmpL1;
42
-    pixel*      m_tmpL2;
43
+    pixel*      m_tmpU[3];
44
+    pixel*      m_tmpL1[3];
45
+    pixel*      m_tmpL2[3];
46
 
47
 public:
48
 
49
@@ -119,8 +120,9 @@
50
 
51
     SAO();
52
 
53
-    bool create(x265_param* param);
54
-    void destroy();
55
+    bool create(x265_param* param, int initCommon);
56
+    void createFromRootNode(SAO *root);
57
+    void destroy(int destoryCommon);
58
 
59
     void allocSaoParam(SAOParam* saoParam) const;
60
 
61
@@ -131,6 +133,8 @@
62
     // CTU-based SAO process without slice granularity
63
     void processSaoCu(int addr, int typeIdx, int plane);
64
     void processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane);
65
+    void processSaoUnitCuLuma(SaoCtuParam* ctuParam, int idxY, int idxX);
66
+    void processSaoUnitCuChroma(SaoCtuParam* ctuParam[3], int idxY, int idxX);
67
 
68
     void copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc);
69
 
70
@@ -146,6 +150,9 @@
71
 
72
     void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);
73
     void rdoSaoUnitRow(SAOParam* saoParam, int idxY);
74
+    void rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr);
75
+
76
+    friend class FrameFilter;
77
 };
78
 
79
 }
80
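Note: the header changes show the same redesign from the API side. m_count/m_offset/m_offsetOrg become value members (enabling the sizeof-based memset/memcpy seen earlier), m_depthSaoRate becomes a pointer-backed shared table, the per-plane m_tmpU/m_tmpL1/m_tmpL2 line buffers replace the single-plane ones, and create/destroy grow initCommon/destoryCommon flags next to the new createFromRootNode (the "destory" spelling is upstream's). The flags suggest a root-and-worker lifecycle; a hedged sketch of that pattern, illustrative only and not the x265 implementation:

    // One root instance owns the tables common to all threads; per-thread
    // copies borrow them via createFromRootNode and never free them.
    struct SaoSketch
    {
        int* commonTable = nullptr;
        bool ownsCommon  = false;

        bool create(bool initCommon)
        {
            if (initCommon)
            {
                commonTable = new int[256](); // shared, zero-initialized
                ownsCommon  = true;
            }
            return true;
        }
        void createFromRootNode(SaoSketch* root) { commonTable = root->commonTable; }
        void destroy(bool destroyCommon)
        {
            if (destroyCommon && ownsCommon)
                delete[] commonTable;
            commonTable = nullptr;
        }
    };

Together with the new per-CTU rdoSaoUnitCu and processSaoUnitCuLuma/Chroma entry points, this supports running SAO from multiple frame-filter threads.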
x265_1.8.tar.gz/source/encoder/search.cpp -> x265_1.9.tar.gz/source/encoder/search.cpp Changed
1343
 
1
@@ -2,6 +2,7 @@
2
 * Copyright (C) 2013 x265 project
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -80,7 +81,7 @@
10
     m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp);
11
 
12
     bool ok = m_quant.init(param.rdoqLevel, param.psyRdoq, scalingList, m_entropyCoder);
13
-    if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
14
+    if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
15
         ok &= m_quant.allocNoiseReduction(param);
16
 
17
     ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
18
@@ -97,13 +98,27 @@
19
      * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
20
      * which are reconstructed at each depth are valid. At the end, the transform depth table
21
      * is walked and the coeff and recon at the correct depths are collected */
22
-    for (uint32_t i = 0; i <= m_numLayers; i++)
23
+
24
+    if (param.internalCsp != X265_CSP_I400)
25
+    {
26
+        for (uint32_t i = 0; i <= m_numLayers; i++)
27
+        {
28
+            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
29
+            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
30
+            m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
31
+            ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
32
+            ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
33
+        }
34
+    }
35
+    else
36
     {
37
-        CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
38
-        m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
39
-        m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
40
-        ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
41
-        ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
42
+        for (uint32_t i = 0; i <= m_numLayers; i++)
43
+        {
44
+            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL);
45
+            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[2] = NULL;
46
+            ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
47
+            ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
48
+        }
49
     }
50
 
51
     /* the rest of these buffers are indexed per-depth */
52
@@ -116,12 +131,22 @@
53
         ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp);
54
     }
55
 
56
-    CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
57
-    m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
58
-    m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
59
-    CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
60
-    m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
61
-    m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
62
+    if (param.internalCsp != X265_CSP_I400)
63
+    {
64
+        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
65
+        m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
66
+        m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
67
+        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
68
+        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
69
+        m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
70
+    }
71
+    else
72
+    {
73
+        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions);
74
+        m_qtTempCbf[1] = m_qtTempCbf[2] = NULL;
75
+        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions);
76
+        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[2] = NULL;
77
+    }
78
 
79
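Note: monochrome (X265_CSP_I400) support starts here. When there are no chroma planes, the chroma thirds of the RQT coefficient buffers and of the CBF / transform-skip flag arrays are never allocated, and the chroma pointers are left NULL so downstream code can branch on them. The allocation split in isolation, assuming sizeL and sizeC are the per-layer luma and chroma coefficient counts:

    #include <cstdint>
    #include <cstdlib>

    typedef int16_t coeff_t;

    static coeff_t* allocCoeffRQT(coeff_t* planes[3], size_t sizeL, size_t sizeC, bool chroma)
    {
        size_t total = chroma ? sizeL + 2 * sizeC : sizeL;
        planes[0] = (coeff_t*)calloc(total, sizeof(coeff_t));
        planes[1] = chroma ? planes[0] + sizeL : NULL;
        planes[2] = chroma ? planes[0] + sizeL + sizeC : NULL;
        return planes[0];
    }

The same m_csp != X265_CSP_I400 guard recurs around every chroma code path in the rest of this file.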
     CHECKED_MALLOC(m_intraPred, pixel, (32 * 32) * (33 + 3));
80
     m_fencScaled = m_intraPred + 32 * 32;
81
@@ -163,12 +188,12 @@
82
     X265_FREE(m_tsRecon);
83
 }
84
 
85
-int Search::setLambdaFromQP(const CUData& ctu, int qp)
86
+int Search::setLambdaFromQP(const CUData& ctu, int qp, int lambdaQp)
87
 {
88
     X265_CHECK(qp >= QP_MIN && qp <= QP_MAX_MAX, "QP used for lambda is out of range\n");
89
 
90
     m_me.setQP(qp);
91
-    m_rdCost.setQP(*m_slice, qp);
92
+    m_rdCost.setQP(*m_slice, lambdaQp < 0 ? qp : lambdaQp);
93
 
94
     int quantQP = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
95
     m_quant.setQPforQuant(ctu, quantQP);
96
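Note: setLambdaFromQP now takes a lambdaQp argument so the RD lambda can be derived from a different QP than the one used for quantization; a negative value (presumably the default argument in the header) keeps the old behaviour. The selection rule on its own:

    // lambdaQp < 0: lambda follows the coding QP (old behaviour);
    // lambdaQp >= 0: caller decouples the RD lambda from the quantizer QP.
    inline int effectiveLambdaQp(int qp, int lambdaQp)
    {
        return lambdaQp < 0 ? qp : lambdaQp;
    }

The motion-estimation QP (m_me.setQP) and the quantizer QP still follow qp unconditionally; only m_rdCost is redirected.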
@@ -446,8 +471,9 @@
97
     }
98
 
99
     // set reconstruction for next intra prediction blocks if full TU prediction won
100
-    pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
101
-    intptr_t picStride = m_frame->m_reconPic->m_stride;
102
+    PicYuv*  reconPic = m_frame->m_reconPic;
103
+    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
104
+    intptr_t picStride = reconPic->m_stride;
105
     primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
106
 
107
     outCost.rdcost     += fullCost.rdcost;
108
@@ -530,7 +556,7 @@
109
             // no residual coded, recon = pred
110
             primitives.cu[sizeIdx].copy_pp(tmpRecon, tmpReconStride, pred, stride);
111
 
112
-        sse_ret_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
113
+        sse_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
114
 
115
         cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
116
         cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
117
@@ -611,8 +637,9 @@
118
     }
119
 
120
     // set reconstruction for next intra prediction blocks
121
-    pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
122
-    intptr_t picStride = m_frame->m_reconPic->m_stride;
123
+    PicYuv*  reconPic = m_frame->m_reconPic;
124
+    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
125
+    intptr_t picStride = reconPic->m_stride;
126
     primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
127
 
128
     outCost.rdcost += fullCost.rdcost;
129
@@ -661,8 +688,9 @@
130
         uint32_t sizeIdx   = log2TrSize - 2;
131
         primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
132
 
133
-        pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
134
-        intptr_t picStride = m_frame->m_reconPic->m_stride;
135
+        PicYuv*  reconPic = m_frame->m_reconPic;
136
+        pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
137
+        intptr_t picStride = reconPic->m_stride;
138
 
139
         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
140
         if (numSig)
141
@@ -750,7 +778,7 @@
142
 }
143
 
144
 /* returns distortion */
145
-uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
146
+void Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
147
 {
148
     CUData& cu = mode.cu;
149
     uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
150
@@ -758,10 +786,10 @@
151
     if (tuDepth < cu.m_tuDepth[absPartIdx])
152
     {
153
         uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
154
-        uint32_t outDist = 0, splitCbfU = 0, splitCbfV = 0;
155
+        uint32_t splitCbfU = 0, splitCbfV = 0;
156
         for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
157
         {
158
-            outDist += codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, psyEnergy);
159
+            codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, outCost);
160
             splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
161
             splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
162
         }
163
@@ -770,8 +798,7 @@
164
             cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
165
             cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
166
         }
167
-
168
-        return outDist;
169
+        return;
170
     }
171
 
172
     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
173
@@ -780,7 +807,7 @@
174
     {
175
         X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
176
         if (absPartIdx & 3)
177
-            return 0;
178
+            return;
179
         log2TrSizeC = 2;
180
         tuDepthC--;
181
     }
182
@@ -791,13 +818,15 @@
183
     bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
184
     checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
185
     if (checkTransformSkip)
186
-        return codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, psyEnergy);
187
+    {
188
+        codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, outCost);
189
+        return;
190
+    }
191
 
192
     ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
193
     uint32_t qtLayer = log2TrSize - 2;
194
     uint32_t stride = mode.fencYuv->m_csize;
195
     const uint32_t sizeIdxC = log2TrSizeC - 2;
196
-    sse_ret_t outDist = 0;
197
 
198
     uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
199
     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
200
@@ -821,8 +850,9 @@
201
             coeff_t* coeffC        = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
202
             pixel*   reconQt       = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
203
             uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
204
-            pixel*   picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
205
-            intptr_t picStride = m_frame->m_reconPic->m_strideC;
206
+            PicYuv*  reconPic = m_frame->m_reconPic;
207
+            pixel*   picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
208
+            intptr_t picStride = reconPic->m_strideC;
209
 
210
             uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
211
             if (chromaPredMode == DM_CHROMA_IDX)
212
@@ -852,10 +882,10 @@
213
                 cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
214
             }
215
 
216
-            outDist += m_rdCost.scaleChromaDist(chromaId, primitives.cu[sizeIdxC].sse_pp(reconQt, reconQtStride, fenc, stride));
217
+            outCost.distortion += m_rdCost.scaleChromaDist(chromaId, primitives.cu[sizeIdxC].sse_pp(reconQt, reconQtStride, fenc, stride));
218
 
219
             if (m_rdCost.m_psyRd)
220
-                psyEnergy += m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
221
+                outCost.energy += m_rdCost.psyCost(sizeIdxC, fenc, stride, reconQt, reconQtStride);
222
 
223
             primitives.cu[sizeIdxC].copy_pp(picReconC, picStride, reconQt, reconQtStride);
224
         }
225
@@ -867,19 +897,16 @@
226
         offsetSubTUCBFs(cu, TEXT_CHROMA_U, tuDepth, absPartIdx);
227
         offsetSubTUCBFs(cu, TEXT_CHROMA_V, tuDepth, absPartIdx);
228
     }
229
-
230
-    return outDist;
231
 }
232
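Note: codeIntraChromaQt above, and codeIntraChromaTSkip below, switch from returning a distortion value with psy-energy passed by reference to accumulating both into a single Cost out-parameter. The shape of that aggregate, with field names inferred from the usage in this diff (illustrative; the real struct is defined in x265's search headers):

    #include <cstdint>
    typedef uint64_t sse_t; // distortion accumulator type, assumed 64-bit

    struct CostSketch
    {
        uint64_t rdcost     = 0;
        uint32_t bits       = 0;
        sse_t    distortion = 0;
        uint32_t energy     = 0; // psy-RD energy
    };

The "/* returns distortion */" comments left above the new void signatures are stale in upstream; the distortion now arrives through outCost.distortion.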
 
233
 /* returns distortion */
234
-uint32_t Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy)
235
+void Search::codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, Cost& outCost)
236
 {
237
     CUData& cu = mode.cu;
238
     uint32_t fullDepth  = cuGeom.depth + tuDepth;
239
     uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
240
     const uint32_t log2TrSizeC = 2;
241
     uint32_t qtLayer = log2TrSize - 2;
242
-    uint32_t outDist = 0;
243
 
244
     /* At the TU layers above this one, no RDO is performed, only distortion is being measured,
245
      * so the entropy coder is not very accurate. The best we can do is return it in the same
246
@@ -925,7 +952,7 @@
247
             predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
248
 
249
             uint64_t bCost = MAX_INT64;
250
-            uint32_t bDist = 0;
251
+            sse_t bDist = 0;
252
             uint32_t bCbf = 0;
253
             uint32_t bEnergy = 0;
254
             int      bTSkip = 0;
255
@@ -956,7 +983,7 @@
256
                     primitives.cu[sizeIdxC].copy_pp(recon, reconStride, pred, stride);
257
                     cu.setCbfPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
258
                 }
259
-                sse_ret_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
260
+                sse_t tmpDist = primitives.cu[sizeIdxC].sse_pp(recon, reconStride, fenc, stride);
261
                 tmpDist = m_rdCost.scaleChromaDist(chromaId, tmpDist);
262
 
263
                 cu.setTransformSkipPartRange(useTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
264
@@ -998,12 +1025,13 @@
265
             cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
266
             cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
267
 
268
-            pixel*   reconPicC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
269
-            intptr_t picStride = m_frame->m_reconPic->m_strideC;
270
+            PicYuv*  reconPic = m_frame->m_reconPic;
271
+            pixel*   reconPicC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
272
+            intptr_t picStride = reconPic->m_strideC;
273
             primitives.cu[sizeIdxC].copy_pp(reconPicC, picStride, reconQt, reconQtStride);
274
 
275
-            outDist += bDist;
276
-            psyEnergy += bEnergy;
277
+            outCost.distortion += bDist;
278
+            outCost.energy += bEnergy;
279
         }
280
     }
281
     while (tuIterator.isNextSection());
282
@@ -1015,7 +1043,6 @@
283
     }
284
 
285
     m_entropyCoder.load(m_rqt[fullDepth].rqtRoot);
286
-    return outDist;
287
 }
288
 
289
 void Search::extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth)
290
@@ -1108,8 +1135,9 @@
291
             int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
292
             uint32_t coeffOffsetC  = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
293
             coeff_t* coeffC        = cu.m_trCoeff[ttype] + coeffOffsetC;
294
-            pixel*   picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
295
-            intptr_t picStride = m_frame->m_reconPic->m_strideC;
296
+            PicYuv*  reconPic = m_frame->m_reconPic;
297
+            pixel*   picReconC = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
298
+            intptr_t picStride = reconPic->m_strideC;
299
 
300
             uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
301
             if (chromaPredMode == DM_CHROMA_IDX)
302
@@ -1150,7 +1178,7 @@
303
     }
304
 }
305
 
306
-void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes, uint8_t* sharedChromaModes)
307
+void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize)
308
 {
309
     CUData& cu = intraMode.cu;
310
 
311
@@ -1161,34 +1189,43 @@
312
     cu.getIntraTUQtDepthRange(tuDepthRange, 0);
313
 
314
     intraMode.initCosts();
315
-    intraMode.lumaDistortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange, sharedModes);
316
-    intraMode.chromaDistortion += estIntraPredChromaQT(intraMode, cuGeom, sharedChromaModes);
317
-    intraMode.distortion += intraMode.lumaDistortion + intraMode.chromaDistortion;
318
+    intraMode.lumaDistortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange);
319
+    if (m_csp != X265_CSP_I400)
320
+    {
321
+        intraMode.chromaDistortion += estIntraPredChromaQT(intraMode, cuGeom);
322
+        intraMode.distortion += intraMode.lumaDistortion + intraMode.chromaDistortion;
323
+    }
324
+    else
325
+        intraMode.distortion += intraMode.lumaDistortion;
326
 
327
     m_entropyCoder.resetBits();
328
     if (m_slice->m_pps->bTransquantBypassEnabled)
329
         m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
330
 
331
+    int skipFlagBits = 0;
332
     if (!m_slice->isIntra())
333
     {
334
         m_entropyCoder.codeSkipFlag(cu, 0);
335
+        skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
336
         m_entropyCoder.codePredMode(cu.m_predMode[0]);
337
     }
338
 
339
     m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
340
     m_entropyCoder.codePredInfo(cu, 0);
341
-    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();
342
+    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
343
 
344
     bool bCodeDQP = m_slice->m_pps->bUseDQP;
345
     m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
346
     m_entropyCoder.store(intraMode.contexts);
347
     intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
348
-    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
349
+    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits;
350
     if (m_rdCost.m_psyRd)
351
     {
352
         const Yuv* fencYuv = intraMode.fencYuv;
353
         intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);
354
     }
355
+    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);
356
+
357
     updateModeCost(intraMode);
358
     checkDQP(intraMode, cuGeom);
359
 }
360
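Note: two threads run through this checkIntra hunk. First, skip-flag bits are measured separately so that mvBits and coeffBits no longer include them; totalBits remains the full tally. Second, resEnergy records the SSE between source and prediction, feeding the new per-frame residual-energy statistic added in 1.9. The bit bookkeeping as a worked example, assuming the entropy coder's written-bit counter is cumulative within one resetBits() scope:

    struct BitSplit { int skip, mv, coeff; };

    // Counters sampled after codeSkipFlag, after codePredInfo, and after
    // codeCoeff respectively, exactly as in the hunk above.
    inline BitSplit splitBits(int afterSkip, int afterPredInfo, int afterCoeff)
    {
        BitSplit b;
        b.skip  = afterSkip;
        b.mv    = afterPredInfo - afterSkip;
        b.coeff = afterCoeff - afterPredInfo;
        return b; // b.skip + b.mv + b.coeff == afterCoeff == totalBits
    }

This matches coeffBits = totalBits - mvBits - skipFlagBits in the diff.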
@@ -1356,7 +1393,6 @@
361
     intraMode.distortion = bsad;
362
     intraMode.sa8dCost = bcost;
363
     intraMode.sa8dBits = bbits;
364
-    X265_CHECK(intraMode.ok(), "intra mode is not ok");
365
 }
366
 
367
 void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom)
368
@@ -1379,35 +1415,41 @@
369
     extractIntraResultQT(cu, *reconYuv, 0, 0);
370
 
371
     intraMode.lumaDistortion = icosts.distortion;
372
-    intraMode.chromaDistortion = estIntraPredChromaQT(intraMode, cuGeom, NULL);
373
-    intraMode.distortion = intraMode.lumaDistortion + intraMode.chromaDistortion;
374
+    if (m_csp != X265_CSP_I400)
375
+    {
376
+        intraMode.chromaDistortion = estIntraPredChromaQT(intraMode, cuGeom);
377
+        intraMode.distortion = intraMode.lumaDistortion + intraMode.chromaDistortion;
378
+    }
379
+    else
380
+        intraMode.distortion = intraMode.lumaDistortion;
381
 
382
     m_entropyCoder.resetBits();
383
     if (m_slice->m_pps->bTransquantBypassEnabled)
384
         m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
385
     m_entropyCoder.codeSkipFlag(cu, 0);
386
+    int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
387
     m_entropyCoder.codePredMode(cu.m_predMode[0]);
388
     m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
389
     m_entropyCoder.codePredInfo(cu, 0);
390
-    intraMode.mvBits += m_entropyCoder.getNumberOfWrittenBits();
391
+    intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
392
 
393
     bool bCodeDQP = m_slice->m_pps->bUseDQP;
394
     m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
395
 
396
     intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
397
-    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits;
398
+    intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits;
399
     if (m_rdCost.m_psyRd)
400
     {
401
         const Yuv* fencYuv = intraMode.fencYuv;
402
         intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
403
     }
404
-
405
+    intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);
406
     m_entropyCoder.store(intraMode.contexts);
407
     updateModeCost(intraMode);
408
     checkDQP(intraMode, cuGeom);
409
 }
410
 
411
-uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes)
412
+sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2])
413
 {
414
     CUData& cu = intraMode.cu;
415
     Yuv* reconYuv = &intraMode.reconYuv;
416
@@ -1422,7 +1464,7 @@
417
     uint32_t qNumParts    = cuGeom.numPartitions >> 2;
418
     uint32_t sizeIdx      = log2TrSize - 2;
419
     uint32_t absPartIdx   = 0;
420
-    uint32_t totalDistortion = 0;
421
+    sse_t totalDistortion = 0;
422
 
423
     int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N;
424
 
425
@@ -1431,8 +1473,8 @@
426
     {
427
         uint32_t bmode = 0;
428
 
429
-        if (sharedModes)
430
-            bmode = sharedModes[puIdx];
431
+        if (intraMode.cu.m_lumaIntraDir[puIdx] != (uint8_t)ALL_IDX)
432
+            bmode = intraMode.cu.m_lumaIntraDir[puIdx];
433
         else
434
         {
435
             uint64_t candCostList[MAX_RD_INTRA_MODES];
436
@@ -1456,25 +1498,6 @@
437
                 int scaleStride = stride;
438
                 int costShift = 0;
439
 
440
-                if (tuSize > 32)
441
-                {
442
-                    // origin is 64x64, we scale to 32x32 and setup required parameters
443
-                    primitives.scale2D_64to32(m_fencScaled, fenc, stride);
444
-                    fenc = m_fencScaled;
445
-
446
-                    pixel nScale[129];
447
-                    intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0];
448
-                    primitives.scale1D_128to64(nScale + 1, intraNeighbourBuf[0] + 1);
449
-
450
-                    memcpy(&intraNeighbourBuf[0][1], &nScale[1], 2 * 64 * sizeof(pixel));
451
-                    memcpy(&intraNeighbourBuf[1][1], &nScale[1], 2 * 64 * sizeof(pixel));
452
-
453
-                    scaleTuSize = 32;
454
-                    scaleStride = 32;
455
-                    costShift = 2;
456
-                    sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
457
-                }
458
-
459
                 m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
460
 
461
                 /* there are three cost tiers for intra modes:
462
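Note: the removed block above was a shortcut for 64x64 intra PUs: SATD was measured on a 32x32 downscale of the source and its neighbour samples, with the cost shifted up to approximate full size. Dropping it means 64x64 candidates are now costed at native size, trading some analysis speed for more accurate mode decisions. In outline, what the deleted code computed (reconstructed from the removed lines; illustrative):

    #include <cstdint>

    // cost64 was approximated as SATD on the 32x32 downscale, scaled back up.
    inline uint64_t approxCost64(uint64_t satd32)
    {
        const int costShift = 2;    // as in the removed block
        return satd32 << costShift; // quarter of the samples, cost times four
    }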
@@ -1541,9 +1564,10 @@
463
                 for (int i = 0; i < maxCandCount; i++)
464
                     candCostList[i] = MAX_INT64;
465
 
466
-                uint64_t paddedBcost = bcost + (bcost >> 3); // 1.12%
467
+                uint64_t paddedBcost = bcost + (bcost >> 2); // 1.25%
468
                 for (int mode = 0; mode < 35; mode++)
469
-                    if (modeCosts[mode] < paddedBcost || (mpms & ((uint64_t)1 << mode)))
470
+                    if ((modeCosts[mode] < paddedBcost) || ((uint32_t)mode == mpmModes[0])) 
471
+                        /* choose for R-D analysis only if this mode passes cost threshold or matches MPM[0] */
472
                         updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList);
473
             }
474
 
475
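Note: the candidate gate for full rate-distortion analysis is loosened here. A mode now survives if its SATD-based cost is within 25% of the best (bcost >> 2 adds a quarter), where it previously had to be within 12.5%, and the MPM condition tightens from "any most-probable mode" to "the first most-probable mode". The gate in isolation:

    #include <cstdint>

    // Keep a mode for full RD if it is near-best by SATD cost or is MPM[0].
    inline bool keepForRd(uint64_t modeCost, uint64_t bcost, uint32_t mode, uint32_t mpm0)
    {
        uint64_t paddedBcost = bcost + (bcost >> 2); // bcost * 1.25
        return modeCost < paddedBcost || mode == mpm0;
    }

(The inline "// 1.12%" and "// 1.25%" comments track the shift amounts loosely; the actual multipliers are 1.125 and 1.25.)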
@@ -1590,10 +1614,11 @@
476
              * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
477
              * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
478
              * that the contexts should be tracked through each PU */
479
-            pixel*   dst         = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
480
-            uint32_t dststride   = m_frame->m_reconPic->m_stride;
481
-            const pixel*   src   = reconYuv->getLumaAddr(absPartIdx);
482
-            uint32_t srcstride   = reconYuv->m_size;
483
+            PicYuv*  reconPic = m_frame->m_reconPic;
484
+            pixel*   dst       = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
485
+            uint32_t dststride = reconPic->m_stride;
486
+            const pixel*   src = reconYuv->getLumaAddr(absPartIdx);
487
+            uint32_t srcstride = reconYuv->m_size;
488
             primitives.cu[log2TrSize - 2].copy_pp(dst, dststride, src, srcstride);
489
         }
490
     }
491
@@ -1670,7 +1695,7 @@
492
     cu.setChromIntraDirSubParts(bestMode, 0, cuGeom.depth);
493
 }
494
 
495
-uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom, uint8_t* sharedChromaModes)
496
+sse_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom)
497
 {
498
     CUData& cu = intraMode.cu;
499
     Yuv& reconYuv = intraMode.reconYuv;
500
@@ -1679,7 +1704,7 @@
501
     uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N && m_csp == X265_CSP_I444;
502
     uint32_t log2TrSize  = cuGeom.log2CUSize - initTuDepth;
503
     uint32_t absPartStep = cuGeom.numPartitions;
504
-    uint32_t totalDistortion = 0;
505
+    sse_t totalDistortion = 0;
506
 
507
     int size = partitionFromLog2Size(log2TrSize);
508
 
509
@@ -1690,7 +1715,7 @@
510
         uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
511
 
512
         uint32_t bestMode = 0;
513
-        uint32_t bestDist = 0;
514
+        sse_t bestDist = 0;
515
         uint64_t bestCost = MAX_INT64;
516
 
517
         // init mode list
518
@@ -1698,10 +1723,10 @@
519
         uint32_t maxMode = NUM_CHROMA_MODE;
520
         uint32_t modeList[NUM_CHROMA_MODE];
521
 
522
-        if (sharedChromaModes && !initTuDepth)
523
+        if (intraMode.cu.m_chromaIntraDir[0] != (uint8_t)ALL_IDX && !initTuDepth)
524
         {
525
             for (uint32_t l = 0; l < NUM_CHROMA_MODE; l++)
526
-                modeList[l] = sharedChromaModes[0];
527
+                modeList[l] = intraMode.cu.m_chromaIntraDir[0];
528
             maxMode = 1;
529
         }
530
         else
531
@@ -1714,8 +1739,8 @@
532
             m_entropyCoder.load(m_rqt[depth].cur);
533
 
534
             cu.setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTuDepth);
535
-            uint32_t psyEnergy = 0;
536
-            uint32_t dist = codeIntraChromaQt(intraMode, cuGeom, initTuDepth, absPartIdxC, psyEnergy);
537
+            Cost outCost;
538
+            codeIntraChromaQt(intraMode, cuGeom, initTuDepth, absPartIdxC, outCost);
539
 
540
             if (m_slice->m_pps->bTransformSkipEnabled)
541
                 m_entropyCoder.load(m_rqt[depth].cur);
542
@@ -1738,12 +1763,13 @@
543
             codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_U);
544
             codeCoeffQTChroma(cu, initTuDepth, absPartIdxC, TEXT_CHROMA_V);
545
             uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
546
-            uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(dist, bits, psyEnergy) : m_rdCost.calcRdCost(dist, bits);
547
+            uint64_t cost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(outCost.distortion, bits, outCost.energy)
548
+                                             : m_rdCost.calcRdCost(outCost.distortion, bits);
549
 
550
             if (cost < bestCost)
551
             {
552
                 bestCost = cost;
553
-                bestDist = dist;
554
+                bestDist = outCost.distortion;
555
                 bestMode = modeList[mode];
556
                 extractIntraResultChromaQT(cu, reconYuv, absPartIdxC, initTuDepth);
557
                 memcpy(m_qtTempCbf[1], cu.m_cbf[1] + absPartIdxC, tuIterator.absPartIdxStep * sizeof(uint8_t));
558
@@ -1756,15 +1782,16 @@
559
         if (!tuIterator.isLastSection())
560
         {
561
             uint32_t zorder    = cuGeom.absPartIdx + absPartIdxC;
562
-            uint32_t dststride = m_frame->m_reconPic->m_strideC;
563
+            PicYuv*  reconPic  = m_frame->m_reconPic;
564
+            uint32_t dststride = reconPic->m_strideC;
565
             const pixel* src;
566
             pixel* dst;
567
 
568
-            dst = m_frame->m_reconPic->getCbAddr(cu.m_cuAddr, zorder);
569
+            dst = reconPic->getCbAddr(cu.m_cuAddr, zorder);
570
             src = reconYuv.getCbAddr(absPartIdxC);
571
             primitives.chroma[m_csp].cu[size].copy_pp(dst, dststride, src, reconYuv.m_csize);
572
 
573
-            dst = m_frame->m_reconPic->getCrAddr(cu.m_cuAddr, zorder);
574
+            dst = reconPic->getCrAddr(cu.m_cuAddr, zorder);
575
             src = reconYuv.getCrAddr(absPartIdxC);
576
             primitives.chroma[m_csp].cu[size].copy_pp(dst, dststride, src, reconYuv.m_csize);
577
         }
578
@@ -1865,7 +1892,7 @@
579
 /* find the lowres motion vector from lookahead in middle of current PU */
580
 MV Search::getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref)
581
 {
582
-    int diffPoc = abs(m_slice->m_poc - m_slice->m_refPicList[list][ref]->m_poc);
583
+    int diffPoc = abs(m_slice->m_poc - m_slice->m_refPOCList[list][ref]);
584
     if (diffPoc > m_param->bframes + 1)
585
         /* poc difference is out of range for lookahead */
586
         return 0;
587
@@ -1905,7 +1932,7 @@
588
         else
589
         {
590
             cu.clipMv(mvCand);
591
-            predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPic, mvCand);
592
+            predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refReconPicList[list][ref], mvCand);
593
             costs[i] = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size);
594
         }
595
     }
596
@@ -1998,7 +2025,8 @@
597
 
598
     /* Get total cost of partition, but only include MV bit cost once */
599
     bits += m_me.bitcost(outmv);
600
-    uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
601
+    uint32_t mvCost = m_me.mvcost(outmv);
602
+    uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
603
 
604
     /* Refine MVP selection, updates: mvpIdx, bits, cost */
605
     mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
606
@@ -2014,6 +2042,7 @@
607
         bestME[list].ref = ref;
608
         bestME[list].cost = cost;
609
         bestME[list].bits = bits;
610
+        bestME[list].mvCost  = mvCost;
611
     }
612
 }
613
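Note: bestME[list] now caches mvCost, the rate term m_me.mvcost charged for the motion vector, next to cost and bits. Since cost already folds the MV rate in, keeping mvCost lets later consumers recover the rate-free SATD part without re-running the estimator; presumably this serves the analysis-load path a few hunks below, which now reuses stored motion data instead of calling getLowresMV. The cached record as a minimal struct (field names follow the diff; the struct itself is illustrative):

    #include <cstdint>

    struct MotionCandidate
    {
        int32_t  mvx, mvy;  // best vector, quarter-pel
        int      ref;       // reference index, -1 = unset
        uint32_t cost;      // (satdCost - mvCost) + rate(bits)
        uint32_t bits;      // signalling bits, MV rate included once
        uint32_t mvCost;    // m_me.mvcost(outmv) at decision time
    };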
 
614
@@ -2059,11 +2088,14 @@
615
         cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours);
616
 
617
         /* Uni-directional prediction */
618
-        if (m_param->analysisMode == X265_ANALYSIS_LOAD && bestME[0].ref >= 0)
619
+        if (m_param->analysisMode == X265_ANALYSIS_LOAD)
620
         {
621
             for (int list = 0; list < numPredDir; list++)
622
             {
623
                 int ref = bestME[list].ref;
624
+                if (ref < 0)
625
+                    continue;
626
+
627
                 uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;
628
                 bits += getTUBits(ref, numRefIdx[list]);
629
 
630
@@ -2072,8 +2104,7 @@
631
                 const MV* amvp = interMode.amvpCand[list][ref];
632
                 int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
633
                 MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
634
-
635
-                MV lmv = getLowresMV(cu, pu, list, ref);
636
+                MV lmv = bestME[list].mv;
637
                 if (lmv.notZero())
638
                     mvc[numMvc++] = lmv;
639
 
640
@@ -2082,7 +2113,8 @@
641
 
642
                 /* Get total cost of partition, but only include MV bit cost once */
643
                 bits += m_me.bitcost(outmv);
644
-                uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
645
+                uint32_t mvCost = m_me.mvcost(outmv);
646
+                uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
647
 
648
                 /* Refine MVP selection, updates: mvpIdx, bits, cost */
649
                 mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
650
@@ -2094,6 +2126,7 @@
651
                     bestME[list].mvpIdx = mvpIdx;
652
                     bestME[list].cost = cost;
653
                     bestME[list].bits = bits;
654
+                    bestME[list].mvCost  = mvCost;
655
                 }
656
             }
657
             bDoUnidir = false;
658
@@ -2142,6 +2175,7 @@
659
         }
660
         if (bDoUnidir)
661
         {
662
+            interMode.bestME[puIdx][0].ref = interMode.bestME[puIdx][1].ref = -1;
663
             uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;
664
 
665
             for (int list = 0; list < numPredDir; list++)
666
@@ -2174,19 +2208,21 @@
667
 
668
                     /* Get total cost of partition, but only include MV bit cost once */
669
                     bits += m_me.bitcost(outmv);
670
-                    uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits);
671
+                    uint32_t mvCost = m_me.mvcost(outmv);
672
+                    uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
673
 
674
                     /* Refine MVP selection, updates: mvpIdx, bits, cost */
675
                     mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
676
 
677
                     if (cost < bestME[list].cost)
678
                     {
679
-                        bestME[list].mv = outmv;
680
-                        bestME[list].mvp = mvp;
681
-                        bestME[list].mvpIdx = mvpIdx;
682
-                        bestME[list].ref = ref;
683
-                        bestME[list].cost = cost;
684
-                        bestME[list].bits = bits;
685
+                        bestME[list].mv      = outmv;
686
+                        bestME[list].mvp     = mvp;
687
+                        bestME[list].mvpIdx  = mvpIdx;
688
+                        bestME[list].ref     = ref;
689
+                        bestME[list].cost    = cost;
690
+                        bestME[list].bits    = bits;
691
+                        bestME[list].mvCost  = mvCost;
692
                     }
693
                 }
694
                 /* the second list ref bits start at bit 16 */
695
@@ -2221,8 +2257,8 @@
696
             }
697
             else
698
             {
699
-                PicYuv* refPic0 = slice->m_refPicList[0][bestME[0].ref]->m_reconPic;
700
-                PicYuv* refPic1 = slice->m_refPicList[1][bestME[1].ref]->m_reconPic;
701
+                PicYuv* refPic0 = slice->m_refReconPicList[0][bestME[0].ref];
702
+                PicYuv* refPic1 = slice->m_refReconPicList[1][bestME[1].ref];
703
                 Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
704
 
705
                 /* Generate reference subpels */
706
@@ -2370,7 +2406,6 @@
707
 
708
         motionCompensation(cu, pu, *predYuv, true, bChromaMC);
709
     }
710
-    X265_CHECK(interMode.ok(), "inter mode is not ok");
711
     interMode.sa8dBits += totalmebits;
712
 }
713
 
714
@@ -2449,6 +2484,17 @@
715
     cu.clipMv(mvmin);
716
     cu.clipMv(mvmax);
717
 
718
+    if (cu.m_encData->m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
719
+          cu.m_cuPelX / g_maxCUSize < m_frame->m_encData->m_pir.pirStartCol &&
720
+          m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol < m_slice->m_sps->numCuInWidth)
721
+    {
722
+        int safeX, maxSafeMv;
723
+        safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * g_maxCUSize - 3;
724
+        maxSafeMv = (safeX - cu.m_cuPelX) * 4;
725
+        mvmax.x = X265_MIN(mvmax.x, maxSafeMv);
726
+        mvmin.x = X265_MIN(mvmin.x, maxSafeMv);
727
+    }
728
+
729
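Note: this block is the motion-vector fence for the new --intra-refresh mode. In P slices, a CU left of the current frame's refresh column must not predict from reference pixels at or beyond the reference frame's first not-yet-refreshed column, otherwise corruption could leak back across the refreshed boundary. The clamp arithmetic on its own, assuming quarter-pel MV units and a 64-pixel CTU; the -3 backs off for the rightward reach of the luma interpolation filter:

    inline int maxSafeMvX(int refPirEndCol, int cuPelX, int ctuSize = 64)
    {
        int safeX = refPirEndCol * ctuSize - 3; // last safe full-pel column in the reference
        return (safeX - cuPelX) * 4;            // convert to quarter-pel units
    }

Clamping mvmin.x with X265_MIN as well keeps the search window non-inverted when the safe limit falls below the normal lower bound.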
     /* Clip search range to signaled maximum MV length.
730
      * We do not support this VUI field being changed from the default */
731
     const int maxMvLen = (1 << 15) - 1;
732
@@ -2471,9 +2517,8 @@
733
     CUData& cu = interMode.cu;
734
     Yuv* reconYuv = &interMode.reconYuv;
735
     const Yuv* fencYuv = interMode.fencYuv;
736
-
737
+    Yuv* predYuv = &interMode.predYuv;
738
     X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");
739
-
740
     uint32_t depth  = cu.m_cuDepth[0];
741
 
742
     // No residual coding : SKIP mode
743
@@ -2487,24 +2532,27 @@
744
     // Luma
745
     int part = partitionFromLog2Size(cu.m_log2CUSize[0]);
746
     interMode.lumaDistortion = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
747
+    interMode.distortion = interMode.lumaDistortion;
748
     // Chroma
749
-    interMode.chromaDistortion = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
750
-    interMode.chromaDistortion += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
751
-    interMode.distortion = interMode.lumaDistortion + interMode.chromaDistortion;
752
-
753
+    if (m_csp != X265_CSP_I400)
754
+    {
755
+        interMode.chromaDistortion = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
756
+        interMode.chromaDistortion += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
757
+        interMode.distortion += interMode.chromaDistortion;
758
+    }
759
     m_entropyCoder.load(m_rqt[depth].cur);
760
     m_entropyCoder.resetBits();
761
     if (m_slice->m_pps->bTransquantBypassEnabled)
762
         m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
763
     m_entropyCoder.codeSkipFlag(cu, 0);
764
+    int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
765
     m_entropyCoder.codeMergeIndex(cu, 0);
766
-
767
-    interMode.mvBits = m_entropyCoder.getNumberOfWrittenBits();
768
+    interMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
769
     interMode.coeffBits = 0;
770
-    interMode.totalBits = interMode.mvBits;
771
+    interMode.totalBits = interMode.mvBits + skipFlagBits;
772
     if (m_rdCost.m_psyRd)
773
         interMode.psyEnergy = m_rdCost.psyCost(part, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
774
-
775
+    interMode.resEnergy = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
776
     updateModeCost(interMode);
777
     m_entropyCoder.store(interMode.contexts);
778
 }
779
@@ -2540,9 +2588,12 @@
780
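Note: skip-mode evaluation follows the same pattern as checkIntra: the skip flag is counted once, luma distortion is accumulated first with scaled chroma terms added only when the format has chroma, and resEnergy (SSE of source versus prediction) is recorded for frame-stats reporting. The distortion variables also move from sse_ret_t to sse_t throughout this file, presumably a type wide enough that high-bit-depth SSE sums cannot overflow. The accumulation rule restated:

    #include <cstdint>
    typedef uint64_t sse_t; // assumed width

    struct PlanarDist { sse_t luma, cb, cr; };

    inline sse_t totalDistortion(const PlanarDist& d, bool hasChroma)
    {
        return hasChroma ? d.luma + d.cb + d.cr : d.luma; // chroma terms pre-scaled
    }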
     uint32_t tqBypass = cu.m_tqBypass[0];
781
     if (!tqBypass)
782
     {
783
-        sse_ret_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
784
-        cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
785
-        cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
786
+        sse_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
787
+        if (m_csp != X265_CSP_I400)
788
+        {
789
+            cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
790
+            cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
791
+        }
792
 
793
         /* Consider the RD cost of not signaling any residual */
794
         m_entropyCoder.load(m_rqt[depth].cur);
795
@@ -2577,30 +2628,33 @@
796
     if (m_slice->m_pps->bTransquantBypassEnabled)
797
         m_entropyCoder.codeCUTransquantBypassFlag(tqBypass);
798
 
799
-    uint32_t coeffBits, bits;
800
+    uint32_t coeffBits, bits, mvBits;
801
     if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0))
802
     {
803
         cu.setPredModeSubParts(MODE_SKIP);
804
 
805
         /* Merge/Skip */
806
+        coeffBits = mvBits = 0;
807
         m_entropyCoder.codeSkipFlag(cu, 0);
808
+        int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
809
         m_entropyCoder.codeMergeIndex(cu, 0);
810
-        coeffBits = 0;
811
-        bits = m_entropyCoder.getNumberOfWrittenBits();
812
+        mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
813
+        bits = mvBits + skipFlagBits;
814
     }
815
     else
816
     {
817
         m_entropyCoder.codeSkipFlag(cu, 0);
818
+        int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
819
         m_entropyCoder.codePredMode(cu.m_predMode[0]);
820
         m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
821
         m_entropyCoder.codePredInfo(cu, 0);
822
-        uint32_t mvBits = m_entropyCoder.getNumberOfWrittenBits();
823
+        mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
824
 
825
         bool bCodeDQP = m_slice->m_pps->bUseDQP;
826
         m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
827
         bits = m_entropyCoder.getNumberOfWrittenBits();
828
 
829
-        coeffBits = bits - mvBits;
830
+        coeffBits = bits - mvBits - skipFlagBits;
831
     }
832
 
833
     m_entropyCoder.store(interMode.contexts);
834
@@ -2611,18 +2665,22 @@
835
         reconYuv->copyFromYuv(*predYuv);
836
 
837
     // update with clipped distortion and cost (qp estimation loop uses unclipped values)
838
-    sse_ret_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
839
-    sse_ret_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
840
-    bestChromaDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
841
+    sse_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
842
+    interMode.distortion = bestLumaDist;
843
+    if (m_csp != X265_CSP_I400)
844
+    {
845
+        sse_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
846
+        bestChromaDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
847
+        interMode.chromaDistortion = bestChromaDist;
848
+        interMode.distortion += bestChromaDist;
849
+    }
850
     if (m_rdCost.m_psyRd)
851
         interMode.psyEnergy = m_rdCost.psyCost(sizeIdx, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
852
-
853
+    interMode.resEnergy = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
854
     interMode.totalBits = bits;
855
     interMode.lumaDistortion = bestLumaDist;
856
-    interMode.chromaDistortion = bestChromaDist;
857
-    interMode.distortion = bestLumaDist + bestChromaDist;
858
     interMode.coeffBits = coeffBits;
859
-    interMode.mvBits = bits - coeffBits;
860
+    interMode.mvBits = mvBits;
861
     updateModeCost(interMode);
862
     checkDQP(interMode, cuGeom);
863
 }
864
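
resEnergy, filled in by both paths above, is the SSE between the source block
and its motion-compensated prediction, i.e. the energy of the residual before
transform and quantization. A scalar reference of what the sse_pp primitive
computes (pixel and sse_t as defined in the tree):

    static sse_t residualEnergy(const pixel* fenc, intptr_t fencStride,
                                const pixel* pred, intptr_t predStride, int size)
    {
        sse_t sum = 0;
        for (int y = 0; y < size; y++, fenc += fencStride, pred += predStride)
            for (int x = 0; x < size; x++)
            {
                int d = fenc[x] - pred[x]; // residual sample
                sum += (sse_t)(d * d);
            }
        return sum;
    }
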
@@ -2641,14 +2699,15 @@
865
     {
866
         // code full block
867
         uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
868
-        bool bCodeChroma = true;
869
+        uint32_t codeChroma = (m_csp != X265_CSP_I400) ? 1 : 0;
870
+
871
         uint32_t tuDepthC = tuDepth;
872
         if (log2TrSizeC < 2)
873
         {
874
             X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
875
             log2TrSizeC = 2;
876
             tuDepthC--;
877
-            bCodeChroma = !(absPartIdx & 3);
878
+            codeChroma &= !(absPartIdx & 3);
879
         }
880
 
881
         uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
882
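
Replacing the bool with a small mask lets two independent conditions compose
with a single &=: chroma must exist in the stream, and when the chroma half of
the TU would fall below 4x4, only the first of the four luma parts carries the
shared chroma transform. Condensed from the hunk:

    uint32_t codeChroma = (m_csp != X265_CSP_I400) ? 1 : 0; // (a) planes exist
    if (log2TrSizeC < 2)
        codeChroma &= !(absPartIdx & 3); // (b) first part owns the shared chroma TU
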
@@ -2682,7 +2741,7 @@
883
             cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth);
884
         }
885
 
886
-        if (bCodeChroma)
887
+        if (codeChroma)
888
         {
889
             uint32_t sizeIdxC = log2TrSizeC - 2;
890
             uint32_t strideResiC = resiYuv.m_csize;
891
@@ -2748,19 +2807,25 @@
892
         {
893
             residualTransformQuantInter(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
894
             ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
895
-            ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
896
-            vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
897
+            if (m_csp != X265_CSP_I400)
898
+            {
899
+                ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
900
+                vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
901
+            }
902
         }
903
         for (uint32_t i = 0; i < 4 * qNumParts; ++i)
904
         {
905
             cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
906
-            cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
907
-            cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
908
+            if (m_csp != X265_CSP_I400)
909
+            {
910
+                cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
911
+                cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
912
+            }
913
         }
914
     }
915
 }
916
 
917
-uint64_t Search::estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId)
918
+uint64_t Search::estimateNullCbfCost(sse_t dist, uint32_t psyEnergy, uint32_t tuDepth, TextType compId)
919
 {
920
     uint32_t nullBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);
921
 
922
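
Only the first line of estimateNullCbfCost() is visible in this hunk; passing
dist and psyEnergy by value suffices now that the zero-residual numbers are no
longer written back through references. The remainder plausibly reduces to the
following (a sketch consistent with the callers, not verified against the full
source):

    uint64_t Search::estimateNullCbfCost(sse_t dist, uint32_t psyEnergy,
                                         uint32_t tuDepth, TextType compId)
    {
        // Cost of signaling "no coded residual" (cbf = 0) for this component
        uint32_t nullBits = m_entropyCoder.estimateCbfBits(0, compId, tuDepth);
        return m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(dist, nullBits, psyEnergy)
                                : m_rdCost.calcRdCost(dist, nullBits);
    }
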
@@ -2786,14 +2851,14 @@
923
     X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
924
 
925
     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
926
-    bool bCodeChroma = true;
927
+    uint32_t codeChroma = (m_csp != X265_CSP_I400) ? 1 : 0;
928
     uint32_t tuDepthC = tuDepth;
929
     if (log2TrSizeC < 2)
930
     {
931
         X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
932
         log2TrSizeC = 2;
933
         tuDepthC--;
934
-        bCodeChroma = !(absPartIdx & 3);
935
+        codeChroma &= !(absPartIdx & 3);
936
     }
937
 
938
     // code full block
939
@@ -2803,7 +2868,7 @@
940
     uint8_t  cbfFlag[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
941
     uint32_t numSig[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, {0, 0}, {0, 0} };
942
     uint32_t singleBits[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
943
-    uint32_t singleDist[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
944
+    sse_t singleDist[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
945
     uint32_t singlePsyEnergy[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
946
     uint32_t bestTransformMode[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { 0, 0 }, { 0, 0 }, { 0, 0 } };
947
     uint64_t minCost[MAX_NUM_COMPONENT][2 /*0 = top (or whole TU for non-4:2:2) sub-TU, 1 = bottom sub-TU*/] = { { MAX_INT64, MAX_INT64 }, {MAX_INT64, MAX_INT64}, {MAX_INT64, MAX_INT64} };
948
@@ -2819,14 +2884,14 @@
949
     if (bCheckFull)
950
     {
951
         uint32_t trSizeC = 1 << log2TrSizeC;
952
-        int partSize  = partitionFromLog2Size(log2TrSize);
953
+        int partSize = partitionFromLog2Size(log2TrSize);
954
         int partSizeC = partitionFromLog2Size(log2TrSizeC);
955
         const uint32_t qtLayer = log2TrSize - 2;
956
         uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
957
         coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
958
 
959
-        bool checkTransformSkip   = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0];
960
-        bool checkTransformSkipY  = checkTransformSkip && log2TrSize  <= MAX_LOG2_TS_SIZE;
961
+        bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0];
962
+        bool checkTransformSkipY = checkTransformSkip && log2TrSize <= MAX_LOG2_TS_SIZE;
963
         bool checkTransformSkipC = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE;
964
 
965
         cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
966
@@ -2844,24 +2909,20 @@
967
 
968
         if (bSplitPresentFlag && log2TrSize > depthRange[0])
969
             m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
970
-        fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
971
 
972
-        // Coding luma cbf flag has been removed from here. The context for cbf flag is different for each depth.
973
-        // So it is valid if we encode coefficients and then cbfs at least for analysis.
974
-//        m_entropyCoder.codeQtCbfLuma(cbfFlag[TEXT_LUMA][0], tuDepth);
975
         if (cbfFlag[TEXT_LUMA][0])
976
             m_entropyCoder.codeCoeffNxN(cu, coeffCurY, absPartIdx, log2TrSize, TEXT_LUMA);
977
-
978
-        uint32_t singleBitsPrev = m_entropyCoder.getNumberOfWrittenBits();
979
-        singleBits[TEXT_LUMA][0] = singleBitsPrev - fullCost.bits;
980
+        singleBits[TEXT_LUMA][0] = m_entropyCoder.getNumberOfWrittenBits();
981
 
982
         X265_CHECK(log2TrSize <= 5, "log2TrSize is too large\n");
983
-        uint32_t distY = primitives.cu[partSize].ssd_s(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size);
984
-        uint32_t psyEnergyY = 0;
985
+
986
+        // Assuming zero residual
987
+        sse_t zeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size);
988
+        uint32_t zeroPsyEnergyY = 0;
989
         if (m_rdCost.m_psyRd)
990
-            psyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, (int16_t*)zeroShort, 0);
991
+            zeroPsyEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size);
992
 
993
-        int16_t* curResiY    = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
994
+        int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
995
         uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size;
996
 
997
         if (cbfFlag[TEXT_LUMA][0])
998
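
The zero-cbf luma distortion switches from the residual domain to the pixel
domain. The two agree exactly: with resi = fenc - pred and no clipping,

    //   ssd_s(resi)        == sum over the block of resi[i]^2
    //   sse_pp(fenc, pred) == sum over the block of (fenc[i] - pred[i])^2

so the quantity is unchanged; measuring against the prediction keeps the
zero-residual path symmetrical with the non-zero path below, which now also
scores the source against a full reconstruction (add_ps of prediction plus
dequantized residual, then sse_pp) instead of comparing residual buffers.
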
@@ -2870,12 +2931,16 @@
999
 
1000
             // non-zero cost calculation for luma - This is an approximation
1001
             // finally we have to encode correct cbf after comparing with null cost
1002
-            const uint32_t nonZeroDistY = primitives.cu[partSize].sse_ss(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
1003
+            pixel* curReconY = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
1004
+            uint32_t strideReconY = m_rqt[qtLayer].reconQtYuv.m_size;
1005
+            primitives.cu[partSize].add_ps(curReconY, strideReconY, mode.predYuv.getLumaAddr(absPartIdx), curResiY, mode.predYuv.m_size, strideResiY);
1006
+
1007
+            const sse_t nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, curReconY, strideReconY);
1008
             uint32_t nzCbfBitsY = m_entropyCoder.estimateCbfBits(cbfFlag[TEXT_LUMA][0], TEXT_LUMA, tuDepth);
1009
             uint32_t nonZeroPsyEnergyY = 0; uint64_t singleCostY = 0;
1010
             if (m_rdCost.m_psyRd)
1011
             {
1012
-                nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, curResiY, strideResiY);
1013
+                nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, curReconY, strideReconY);
1014
                 singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroPsyEnergyY);
1015
             }
1016
             else
1017
@@ -2891,7 +2956,7 @@
1018
                 // zero-cost calculation for luma. This is an approximation
1019
                 // Initial cost calculation was also an approximation. First resetting the bit counter and then encoding zero cbf.
1020
                 // Now encoding the zero cbf without writing into bitstream, keeping m_fracBits unchanged. The same is valid for chroma.
1021
-                uint64_t nullCostY = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
1022
+                uint64_t nullCostY = estimateNullCbfCost(zeroDistY, zeroPsyEnergyY, tuDepth, TEXT_LUMA);
1023
 
1024
                 if (nullCostY < singleCostY)
1025
                 {
1026
@@ -2900,12 +2965,12 @@
1027
                     primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
1028
 #if CHECKED_BUILD || _DEBUG
1029
                     uint32_t numCoeffY = 1 << (log2TrSize << 1);
1030
-                    memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);
1031
+                    memset(coeffCurY, 0, sizeof(coeff_t)* numCoeffY);
1032
 #endif
1033
                     if (checkTransformSkipY)
1034
                         minCost[TEXT_LUMA][0] = nullCostY;
1035
-                    singleDist[TEXT_LUMA][0] = distY;
1036
-                    singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
1037
+                    singleDist[TEXT_LUMA][0] = zeroDistY;
1038
+                    singlePsyEnergy[TEXT_LUMA][0] = zeroPsyEnergyY;
1039
                 }
1040
                 else
1041
                 {
1042
@@ -2919,21 +2984,23 @@
1043
         else
1044
         {
1045
             if (checkTransformSkipY)
1046
-                minCost[TEXT_LUMA][0] = estimateNullCbfCost(distY, psyEnergyY, tuDepth, TEXT_LUMA);
1047
+                minCost[TEXT_LUMA][0] = estimateNullCbfCost(zeroDistY, zeroPsyEnergyY, tuDepth, TEXT_LUMA);
1048
             primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
1049
-            singleDist[TEXT_LUMA][0] = distY;
1050
-            singlePsyEnergy[TEXT_LUMA][0] = psyEnergyY;
1051
+            singleDist[TEXT_LUMA][0] = zeroDistY;
1052
+            singleBits[TEXT_LUMA][0] = 0;
1053
+            singlePsyEnergy[TEXT_LUMA][0] = zeroPsyEnergyY;
1054
         }
1055
 
1056
         cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
1057
 
1058
-        if (bCodeChroma)
1059
+        if (codeChroma)
1060
         {
1061
             uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
1062
             uint32_t strideResiC  = m_rqt[qtLayer].resiQtYuv.m_csize;
1063
             for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
1064
             {
1065
-                uint32_t distC = 0, psyEnergyC = 0;
1066
+                sse_t zeroDistC = 0;
1067
+                uint32_t zeroPsyEnergyC = 0;
1068
                 coeff_t* coeffCurC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
1069
                 TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
1070
 
1071
@@ -2952,14 +3019,18 @@
1072
                     numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false);
1073
                     cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section];
1074
 
1075
+                    uint32_t latestBitCount = m_entropyCoder.getNumberOfWrittenBits();
1076
                     if (cbfFlag[chromaId][tuIterator.section])
1077
                         m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId);
1078
-                    uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits();
1079
-                    singleBits[chromaId][tuIterator.section] = newBits - singleBitsPrev;
1080
-                    singleBitsPrev = newBits;
1081
+
1082
+                    singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits() - latestBitCount;
1083
 
1084
                     int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
1085
-                    distC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[log2TrSizeC - 2].ssd_s(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize));
1086
+                    zeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[log2TrSizeC - 2].sse_pp(fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize));
1087
+
1088
+                    // Assuming zero residual
1089
+                    if (m_rdCost.m_psyRd)
1090
+                        zeroPsyEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), mode.predYuv.m_csize);
1091
 
1092
                     if (cbfFlag[chromaId][tuIterator.section])
1093
                     {
1094
@@ -2968,13 +3039,15 @@
1095
 
1096
                         // non-zero cost calculation for chroma, same as for luma - This is an approximation
1097
                         // finally we have to encode correct cbf after comparing with null cost
1098
-                        uint32_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
1099
+                        pixel* curReconC      = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
1100
+                        uint32_t strideReconC = m_rqt[qtLayer].reconQtYuv.m_csize;
1101
+                        primitives.cu[partSizeC].add_ps(curReconC, strideReconC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), curResiC, mode.predYuv.m_csize, strideResiC);
1102
+                        sse_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, curReconC, strideReconC));
1103
                         uint32_t nzCbfBitsC = m_entropyCoder.estimateCbfBits(cbfFlag[chromaId][tuIterator.section], (TextType)chromaId, tuDepth);
1104
-                        uint32_t nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
1105
                         uint32_t nonZeroPsyEnergyC = 0; uint64_t singleCostC = 0;
1106
                         if (m_rdCost.m_psyRd)
1107
                         {
1108
-                            nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, curResiC, strideResiC);
1109
+                            nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, curReconC, strideReconC);
1110
                             singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, nzCbfBitsC + singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
1111
                         }
1112
                         else
1113
@@ -2988,7 +3061,7 @@
1114
                         else
1115
                         {
1116
                             //zero-cost calculation for chroma. This is an approximation
1117
-                            uint64_t nullCostC = estimateNullCbfCost(distC, psyEnergyC, tuDepth, (TextType)chromaId);
1118
+                            uint64_t nullCostC = estimateNullCbfCost(zeroDistC, zeroPsyEnergyC, tuDepth, (TextType)chromaId);
1119
 
1120
                             if (nullCostC < singleCostC)
1121
                             {
1122
@@ -3001,8 +3074,8 @@
1123
 #endif
1124
                                 if (checkTransformSkipC)
1125
                                     minCost[chromaId][tuIterator.section] = nullCostC;
1126
-                                singleDist[chromaId][tuIterator.section] = distC;
1127
-                                singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
1128
+                                singleDist[chromaId][tuIterator.section] = zeroDistC;
1129
+                                singlePsyEnergy[chromaId][tuIterator.section] = zeroPsyEnergyC;
1130
                             }
1131
                             else
1132
                             {
1133
@@ -3016,10 +3089,11 @@
1134
                     else
1135
                     {
1136
                         if (checkTransformSkipC)
1137
-                            minCost[chromaId][tuIterator.section] = estimateNullCbfCost(distC, psyEnergyC, tuDepthC, (TextType)chromaId);
1138
+                            minCost[chromaId][tuIterator.section] = estimateNullCbfCost(zeroDistC, zeroPsyEnergyC, tuDepthC, (TextType)chromaId);
1139
                         primitives.cu[partSizeC].blockfill_s(curResiC, strideResiC, 0);
1140
-                        singleDist[chromaId][tuIterator.section] = distC;
1141
-                        singlePsyEnergy[chromaId][tuIterator.section] = psyEnergyC;
1142
+                        singleBits[chromaId][tuIterator.section] = 0;
1143
+                        singleDist[chromaId][tuIterator.section] = zeroDistC;
1144
+                        singlePsyEnergy[chromaId][tuIterator.section] = zeroPsyEnergyC;
1145
                     }
1146
 
1147
                     cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
1148
@@ -3030,7 +3104,7 @@
1149
 
1150
         if (checkTransformSkipY)
1151
         {
1152
-            uint32_t nonZeroDistY = 0;
1153
+            sse_t nonZeroDistY = 0;
1154
             uint32_t nonZeroPsyEnergyY = 0;
1155
             uint64_t singleCostY = MAX_INT64;
1156
 
1157
@@ -3054,11 +3128,12 @@
1158
 
1159
                 m_quant.invtransformNxN(cu, m_tsResidual, trSize, m_tsCoeff, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
1160
 
1161
-                nonZeroDistY = primitives.cu[partSize].sse_ss(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, m_tsResidual, trSize);
1162
+                primitives.cu[partSize].add_ps(m_tsRecon, trSize, mode.predYuv.getLumaAddr(absPartIdx), m_tsResidual, mode.predYuv.m_size, trSize);
1163
+                nonZeroDistY = primitives.cu[partSize].sse_pp(fenc, fencYuv->m_size, m_tsRecon, trSize);
1164
 
1165
                 if (m_rdCost.m_psyRd)
1166
                 {
1167
-                    nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, m_tsResidual, trSize);
1168
+                    nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, m_tsRecon, trSize);
1169
                     singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroPsyEnergyY);
1170
                 }
1171
                 else
1172
@@ -3081,9 +3156,10 @@
1173
             cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
1174
         }
1175
 
1176
-        if (bCodeChroma && checkTransformSkipC)
1177
+        if (codeChroma && checkTransformSkipC)
1178
         {
1179
-            uint32_t nonZeroDistC = 0, nonZeroPsyEnergyC = 0;
1180
+            sse_t nonZeroDistC = 0;
1181
+            uint32_t nonZeroPsyEnergyC = 0;
1182
             uint64_t singleCostC = MAX_INT64;
1183
             uint32_t strideResiC = m_rqt[qtLayer].resiQtYuv.m_csize;
1184
             uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
1185
@@ -3122,11 +3198,12 @@
1186
 
1187
                         m_quant.invtransformNxN(cu, m_tsResidual, trSizeC, m_tsCoeff,
1188
                                                 log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
1189
-                        uint32_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, m_tsResidual, trSizeC);
1190
-                        nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
1191
+                        primitives.cu[partSizeC].add_ps(m_tsRecon, trSizeC, mode.predYuv.getChromaAddr(chromaId, absPartIdxC), m_tsResidual, mode.predYuv.m_csize, trSizeC);
1192
+                        nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, primitives.cu[partSizeC].sse_pp(fenc, fencYuv->m_csize, m_tsRecon, trSizeC));
1193
                         if (m_rdCost.m_psyRd)
1194
                         {
1195
-                            nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, m_tsResidual, trSizeC);
1196
+
1197
+                            nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, fenc, fencYuv->m_csize, m_tsRecon, trSizeC);
1198
                             singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
1199
                         }
1200
                         else
1201
@@ -3160,7 +3237,7 @@
1202
         m_entropyCoder.resetBits();
1203
 
1204
         //Encode cbf flags
1205
-        if (bCodeChroma)
1206
+        if (codeChroma)
1207
         {
1208
             if (!splitIntoSubTUs)
1209
             {
1210
@@ -3234,14 +3311,20 @@
1211
         {
1212
             estimateResidualQT(mode, cuGeom, qPartIdx, tuDepth + 1, resiYuv, splitCost, depthRange);
1213
             ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
1214
-            ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
1215
-            vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
1216
+            if (m_csp != X265_CSP_I400)
1217
+            {
1218
+                ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
1219
+                vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
1220
+            }
1221
         }
1222
         for (uint32_t i = 0; i < 4 * qNumParts; ++i)
1223
         {
1224
             cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
1225
-            cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
1226
-            cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
1227
+            if (m_csp != X265_CSP_I400)
1228
+            {
1229
+                cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
1230
+                cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
1231
+            }
1232
         }
1233
 
1234
         // Here we were encoding cbfs and coefficients for split blocks. Since I have collected coefficient bits
1235
@@ -3275,7 +3358,7 @@
1236
         }
1237
 
1238
         cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth);
1239
-        if (bCodeChroma)
1240
+        if (codeChroma)
1241
         {
1242
             if (!splitIntoSubTUs)
1243
             {
1244
@@ -3298,7 +3381,7 @@
1245
     cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
1246
     cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
1247
 
1248
-    if (bCodeChroma)
1249
+    if (codeChroma)
1250
     {
1251
         if (!splitIntoSubTUs)
1252
         {
1253
@@ -3330,18 +3413,20 @@
1254
 
1255
     const bool bSubdiv  = tuDepth < cu.m_tuDepth[absPartIdx];
1256
     uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
1257
-
1258
-    if (!(log2TrSize - m_hChromaShift < 2))
1259
+    if (m_csp != X265_CSP_I400)
1260
     {
1261
-        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
1262
-            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !bSubdiv);
1263
-        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
1264
-            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !bSubdiv);
1265
-    }
1266
-    else
1267
-    {
1268
-        X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma CBF not matching\n");
1269
-        X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma CBF not matching\n");
1270
+        if (!(log2TrSize - m_hChromaShift < 2))
1271
+        {
1272
+            if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
1273
+                m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !bSubdiv);
1274
+            if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
1275
+                m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !bSubdiv);
1276
+        }
1277
+        else
1278
+        {
1279
+            X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma CBF not matching\n");
1280
+            X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma CBF not matching\n");
1281
+        }
1282
     }
1283
 
1284
     if (!bSubdiv)
1285
@@ -3371,14 +3456,14 @@
1286
     const uint32_t qtLayer = log2TrSize - 2;
1287
 
1288
     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
1289
-    bool bCodeChroma = true;
1290
+    uint32_t codeChroma = (m_csp != X265_CSP_I400) ? 1 : 0;
1291
     uint32_t tuDepthC = tuDepth;
1292
     if (log2TrSizeC < 2)
1293
     {
1294
         X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
1295
         log2TrSizeC = 2;
1296
         tuDepthC--;
1297
-        bCodeChroma = !(absPartIdx & 3);
1298
+        codeChroma &= !(absPartIdx & 3);
1299
     }
1300
 
1301
     m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);
1302
@@ -3389,7 +3474,7 @@
1303
     coeff_t* coeffDstY = cu.m_trCoeff[0] + coeffOffsetY;
1304
     memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
1305
 
1306
-    if (bCodeChroma)
1307
+    if (codeChroma)
1308
     {
1309
         m_rqt[qtLayer].resiQtYuv.copyPartToPartChroma(resiYuv, absPartIdx, log2TrSizeC + m_hChromaShift);
1310
 
1311
@@ -3453,7 +3538,6 @@
1312
                 mode.contexts.resetBits();
1313
                 mode.contexts.codeDeltaQP(cu, 0);
1314
                 uint32_t bits = mode.contexts.getNumberOfWrittenBits();
1315
-                mode.mvBits += bits;
1316
                 mode.totalBits += bits;
1317
                 updateModeCost(mode);
1318
             }
1319
@@ -3464,7 +3548,6 @@
1320
             }
1321
             else
1322
             {
1323
-                mode.mvBits++;
1324
                 mode.totalBits++;
1325
                 updateModeCost(mode);
1326
             }
1327
@@ -3498,7 +3581,6 @@
1328
                 mode.contexts.resetBits();
1329
                 mode.contexts.codeDeltaQP(cu, 0);
1330
                 uint32_t bits = mode.contexts.getNumberOfWrittenBits();
1331
-                mode.mvBits += bits;
1332
                 mode.totalBits += bits;
1333
                 updateModeCost(mode);
1334
             }
1335
@@ -3509,7 +3591,6 @@
1336
             }
1337
             else
1338
             {
1339
-                mode.mvBits++;
1340
                 mode.totalBits++;
1341
                 updateModeCost(mode);
1342
             }
1343
x265_1.8.tar.gz/source/encoder/search.h -> x265_1.9.tar.gz/source/encoder/search.h Changed
181
 
1
@@ -2,6 +2,7 @@
2
 * Copyright (C) 2013 x265 project
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -84,8 +85,14 @@
10
     MV       mvp;
11
     int      mvpIdx;
12
     int      ref;
13
-    uint32_t cost;
14
     int      bits;
15
+    uint32_t mvCost;
16
+    uint32_t cost;
17
+
18
+    MotionData()
19
+    {
20
+        memset(this, 0, sizeof(MotionData));
21
+    }
22
 };
23
 
24
 struct Mode
25
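
The new MotionData constructor zero-fills the object with memset, which is
well-defined only while every member stays trivially copyable. A compile-time
guard would make that assumption explicit (a suggestion, not present in the
source):

    #include <type_traits>
    // Hypothetical guard for the memset-based constructor:
    static_assert(std::is_trivially_copyable<MotionData>::value,
                  "MotionData is zero-filled with memset; keep it POD");
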
@@ -105,16 +112,17 @@
26
     // temporal candidate.
27
     InterNeighbourMV interNeighbours[6];
28
 
29
-    uint64_t   rdCost;     // sum of partition (psy) RD costs          (sse(fenc, recon) + lambda2 * bits)
30
-    uint64_t   sa8dCost;   // sum of partition sa8d distortion costs   (sa8d(fenc, pred) + lambda * bits)
31
-    uint32_t   sa8dBits;   // signal bits used in sa8dCost calculation
32
-    uint32_t   psyEnergy;  // sum of partition psycho-visual energy difference
33
-    sse_ret_t  lumaDistortion;
34
-    sse_ret_t  chromaDistortion;
35
-    sse_ret_t  distortion; // sum of partition SSE distortion
36
-    uint32_t   totalBits;  // sum of partition bits (mv + coeff)
37
-    uint32_t   mvBits;     // Mv bits + Ref + block type (or intra mode)
38
-    uint32_t   coeffBits;  // Texture bits (DCT Coeffs)
39
+    uint64_t    rdCost;     // sum of partition (psy) RD costs          (sse(fenc, recon) + lambda2 * bits)
40
+    uint64_t    sa8dCost;   // sum of partition sa8d distortion costs   (sa8d(fenc, pred) + lambda * bits)
41
+    uint32_t    sa8dBits;   // signal bits used in sa8dCost calculation
42
+    uint32_t    psyEnergy;  // sum of partition psycho-visual energy difference
43
+    sse_t       resEnergy;  // sum of partition residual energy after motion prediction
44
+    sse_t       lumaDistortion;
45
+    sse_t       chromaDistortion;
46
+    sse_t       distortion; // sum of partition SSE distortion
47
+    uint32_t    totalBits;  // sum of partition bits (mv + coeff)
48
+    uint32_t    mvBits;     // Mv bits + Ref + block type (or intra mode)
49
+    uint32_t    coeffBits;  // Texture bits (DCT Coeffs)
50
 
51
     void initCosts()
52
     {
53
@@ -122,6 +130,7 @@
54
         sa8dCost = 0;
55
         sa8dBits = 0;
56
         psyEnergy = 0;
57
+        resEnergy = 0;
58
         lumaDistortion = 0;
59
         chromaDistortion = 0;
60
         distortion = 0;
61
@@ -130,62 +139,13 @@
62
         coeffBits = 0;
63
     }
64
 
65
-    void invalidate()
66
-    {
67
-        /* set costs to invalid data, catch uninitialized re-use */
68
-        rdCost = UINT64_MAX / 2;
69
-        sa8dCost = UINT64_MAX / 2;
70
-        sa8dBits = MAX_UINT / 2;
71
-        psyEnergy = MAX_UINT / 2;
72
-#if X265_DEPTH <= 10
73
-        lumaDistortion = MAX_UINT / 2;
74
-        chromaDistortion = MAX_UINT / 2;
75
-        distortion = MAX_UINT / 2;
76
-#else
77
-        lumaDistortion = UINT64_MAX / 2;
78
-        chromaDistortion = UINT64_MAX / 2;
79
-        distortion = UINT64_MAX / 2;
80
-#endif
81
-        totalBits = MAX_UINT / 2;
82
-        mvBits = MAX_UINT / 2;
83
-        coeffBits = MAX_UINT / 2;
84
-    }
85
-
86
-    bool ok() const
87
-    {
88
-#if X265_DEPTH <= 10
89
-        return !(rdCost >= UINT64_MAX / 2 ||
90
-            sa8dCost >= UINT64_MAX / 2 ||
91
-            sa8dBits >= MAX_UINT / 2 ||
92
-            psyEnergy >= MAX_UINT / 2 ||
93
-            lumaDistortion >= MAX_UINT / 2 ||
94
-            chromaDistortion >= MAX_UINT / 2 ||
95
-            distortion >= MAX_UINT / 2 ||
96
-            totalBits >= MAX_UINT / 2 ||
97
-            mvBits >= MAX_UINT / 2 ||
98
-            coeffBits >= MAX_UINT / 2);
99
-#else
100
-        return !(rdCost >= UINT64_MAX / 2 ||
101
-                 sa8dCost >= UINT64_MAX / 2 ||
102
-                 sa8dBits >= MAX_UINT / 2 ||
103
-                 psyEnergy >= MAX_UINT / 2 ||
104
-                 lumaDistortion >= UINT64_MAX / 2 ||
105
-                 chromaDistortion >= UINT64_MAX / 2 ||
106
-                 distortion >= UINT64_MAX / 2 ||
107
-                 totalBits >= MAX_UINT / 2 ||
108
-                 mvBits >= MAX_UINT / 2 ||
109
-                 coeffBits >= MAX_UINT / 2);
110
-#endif
111
-    }
112
-
113
     void addSubCosts(const Mode& subMode)
114
     {
115
-        X265_CHECK(subMode.ok(), "sub-mode not initialized");
116
-
117
         rdCost += subMode.rdCost;
118
         sa8dCost += subMode.sa8dCost;
119
         sa8dBits += subMode.sa8dBits;
120
         psyEnergy += subMode.psyEnergy;
121
+        resEnergy += subMode.resEnergy;
122
         lumaDistortion += subMode.lumaDistortion;
123
         chromaDistortion += subMode.chromaDistortion;
124
         distortion += subMode.distortion;
125
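
Dropping invalidate()/ok() follows from the type change: sse_ret_t's width,
and therefore its sentinel values, depended on X265_DEPTH, while the new sse_t
is used uniformly, so the depth-conditional guard code no longer earns its
keep. A plausible definition consistent with these hunks (an assumption; the
real typedef lives elsewhere in the tree):

    // Assumed: wide enough for the worst-case SSE of a 64x64 CTU at the
    // configured bit depth, one type regardless of depth.
    #if X265_DEPTH <= 10
    typedef uint32_t sse_t;
    #else
    typedef uint64_t sse_t;
    #endif
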
@@ -325,13 +285,13 @@
126
     ~Search();
127
 
128
     bool     initSearch(const x265_param& param, ScalingList& scalingList);
129
-    int      setLambdaFromQP(const CUData& ctu, int qp); /* returns real quant QP in valid spec range */
130
+    int      setLambdaFromQP(const CUData& ctu, int qp, int lambdaQP = -1); /* returns real quant QP in valid spec range */
131
 
132
     // mark temp RD entropy contexts as uninitialized; useful for finding loads without stores
133
     void     invalidateContexts(int fromDepth);
134
 
135
-    // full RD search of intra modes. if sharedModes is not NULL, it directly uses them
136
-    void     checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes, uint8_t* sharedChromaModes);
137
+    // full RD search of intra modes
138
+    void     checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSizes);
139
 
140
     // select best intra mode using only sa8d costs, cannot measure NxN intra
141
     void     checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
142
@@ -397,10 +357,10 @@
143
     void     saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth);
144
 
145
     // RDO search of luma intra modes; result is fully encoded luma. luma distortion is returned
146
-    uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes);
147
+    sse_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2]);
148
 
149
     // RDO select best chroma mode from luma; result is fully encode chroma. chroma distortion is returned
150
-    uint32_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom, uint8_t* sharedChromaModes);
151
+    sse_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom);
152
 
153
     void     codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx);
154
     void     codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2]);
155
@@ -410,12 +370,12 @@
156
     {
157
         uint64_t rdcost;
158
         uint32_t bits;
159
-        sse_ret_t distortion;
160
+        sse_t distortion;
161
         uint32_t energy;
162
         Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
163
     };
164
 
165
-    uint64_t estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId);
166
+    uint64_t estimateNullCbfCost(sse_t dist, uint32_t psyEnergy, uint32_t tuDepth, TextType compId);
167
     void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2]);
168
 
169
     // generate prediction, generate residual and recon. if bAllowSplit, find optimal RQT splits
170
@@ -424,8 +384,8 @@
171
     void     extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx);
172
 
173
     // generate chroma prediction, generate residual and recon
174
-    uint32_t codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy);
175
-    uint32_t codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy);
176
+    void     codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost);
177
+    void     codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, Cost& outCost);
178
     void     extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth);
179
 
180
     // reshuffle CBF flags after coding a pair of 4:2:2 chroma blocks
181
x265_1.8.tar.gz/source/encoder/sei.h -> x265_1.9.tar.gz/source/encoder/sei.h Changed
51
 
1
@@ -163,12 +163,6 @@
2
 
3
     PayloadType payloadType() const { return CONTENT_LIGHT_LEVEL_INFO; }
4
 
5
-    bool parse(const char* value)
6
-    {
7
-        return sscanf(value, "%hu,%hu",
8
-                      &max_content_light_level, &max_pic_average_light_level) == 2;
9
-    }
10
-
11
     void write(Bitstream& bs, const SPS&)
12
     {
13
         m_bitIf = &bs;
14
@@ -195,29 +189,31 @@
15
 
16
     uint8_t m_digest[3][16];
17
 
18
-    void write(Bitstream& bs, const SPS&)
19
+    void write(Bitstream& bs, const SPS& sps)
20
     {
21
         m_bitIf = &bs;
22
 
23
+        int planes = (sps.chromaFormatIdc != X265_CSP_I400) ? 3 : 1;
24
+
25
         WRITE_CODE(DECODED_PICTURE_HASH, 8, "payload_type");
26
 
27
         switch (m_method)
28
         {
29
         case MD5:
30
-            WRITE_CODE(1 + 16 * 3, 8, "payload_size");
31
+            WRITE_CODE(1 + 16 * planes, 8, "payload_size");
32
             WRITE_CODE(MD5, 8, "hash_type");
33
             break;
34
         case CRC:
35
-            WRITE_CODE(1 + 2 * 3, 8, "payload_size");
36
+            WRITE_CODE(1 + 2 * planes, 8, "payload_size");
37
             WRITE_CODE(CRC, 8, "hash_type");
38
             break;
39
         case CHECKSUM:
40
-            WRITE_CODE(1 + 4 * 3, 8, "payload_size");
41
+            WRITE_CODE(1 + 4 * planes, 8, "payload_size");
42
             WRITE_CODE(CHECKSUM, 8, "hash_type");
43
             break;
44
         }
45
 
46
-        for (int yuvIdx = 0; yuvIdx < 3; yuvIdx++)
47
+        for (int yuvIdx = 0; yuvIdx < planes; yuvIdx++)
48
         {
49
             if (m_method == MD5)
50
             {
51
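
The decoded-picture-hash SEI now sizes its payload by the number of coded
planes instead of hard-coding three. The arithmetic, pulled out into a
hypothetical helper (method constants as in the hunk):

    enum HashMethod { MD5, CRC, CHECKSUM }; // mirrors m_method above
    static int dphPayloadSize(HashMethod method, int csp)
    {
        int planes = (csp != X265_CSP_I400) ? 3 : 1;   // 4:0:0 hashes luma only
        int digestBytes = (method == MD5) ? 16 : (method == CRC) ? 2 : 4;
        return 1 + digestBytes * planes; // e.g. MD5: 49 bytes for 4:2:0, 17 for 4:0:0
    }
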
x265_1.8.tar.gz/source/encoder/slicetype.cpp -> x265_1.9.tar.gz/source/encoder/slicetype.cpp Changed
396
 
1
@@ -83,8 +83,11 @@
2
     uint32_t var;
3
 
4
     var  = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, csp);
5
-    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp);
6
-    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp);
7
+    if (csp != X265_CSP_I400)
8
+    {
9
+        var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp);
10
+        var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp);
11
+    }
12
     x265_emms();
13
     return var;
14
 }
15
@@ -96,6 +99,7 @@
16
     int maxRow = curFrame->m_fencPic->m_picHeight;
17
     int blockCount = curFrame->m_lowres.maxBlocksInRow * curFrame->m_lowres.maxBlocksInCol;
18
 
19
+    float* quantOffsets = curFrame->m_quantOffsets;
20
     for (int y = 0; y < 3; y++)
21
     {
22
         curFrame->m_lowres.wp_ssd[y] = 0;
23
@@ -113,10 +117,21 @@
24
 
25
         if (param->rc.aqMode && param->rc.aqStrength == 0)
26
         {
27
-            memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
28
-            memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
29
-            for (int cuxy = 0; cuxy < cuCount; cuxy++)
30
-                curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
31
+            if (quantOffsets)
32
+            {
33
+                for (int cuxy = 0; cuxy < cuCount; cuxy++)
34
+                {
35
+                    curFrame->m_lowres.qpCuTreeOffset[cuxy] = curFrame->m_lowres.qpAqOffset[cuxy] = quantOffsets[cuxy];
36
+                    curFrame->m_lowres.invQscaleFactor[cuxy] = x265_exp2fix8(curFrame->m_lowres.qpCuTreeOffset[cuxy]);
37
+                }
38
+            }
39
+            else
40
+            {
41
+                memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
42
+                memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
43
+                for (int cuxy = 0; cuxy < cuCount; cuxy++)
44
+                    curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
45
+            }
46
         }
47
 
48
         /* Need variance data for weighted prediction */
49
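
With AQ nominally off (aqStrength == 0) but caller-supplied quant offsets
present (the new API-level quant-offset feature), the offsets now flow into
both qpAqOffset and qpCuTreeOffset and are converted to the Q8 inverse-qscale
factor the lookahead uses to weight SATD costs. A scalar sketch of the assumed
x265_exp2fix8() mapping (the real routine is table-driven):

    #include <math.h>
    // 2^(-qpOffset/6) in Q8 fixed point: 0 -> 256 (x1.0), +6 QP -> 128, -6 -> 512
    static int exp2fix8(double qpOffset)
    {
        return (int)(pow(2.0, -qpOffset / 6.0) * 256.0 + 0.5);
    }
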
@@ -135,19 +150,25 @@
50
         if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
51
         {
52
             double bit_depth_correction = 1.f / (1 << (2*(X265_DEPTH-8)));
53
+            curFrame->m_lowres.frameVariance = 0;
54
+            uint64_t rowVariance = 0;
55
             for (blockY = 0; blockY < maxRow; blockY += 16)
56
             {
57
+                rowVariance = 0;
58
                 for (blockX = 0; blockX < maxCol; blockX += 16)
59
                 {
60
                     uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
61
+                    curFrame->m_lowres.blockVariance[blockXY] = energy;
62
+                    rowVariance += energy;
63
                     qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
64
                     curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
65
                     avg_adj += qp_adj;
66
                     avg_adj_pow2 += qp_adj * qp_adj;
67
                     blockXY++;
68
                 }
69
+                curFrame->m_lowres.frameVariance += (rowVariance / maxCol);
70
             }
71
-
72
+            curFrame->m_lowres.frameVariance /= maxRow;
73
             avg_adj /= blockCount;
74
             avg_adj_pow2 /= blockCount;
75
             strength = param->rc.aqStrength * avg_adj;
76
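
frameVariance is reduced in two stages, per row first and across rows second,
which keeps the running accumulator small. Note that maxCol and maxRow are
pixel dimensions while the loops step by 16, so each division is by the pixel
count rather than the block count; the sketch below mirrors the hunk:

    uint64_t frameVariance = 0;
    for (int blockY = 0; blockY < maxRow; blockY += 16)
    {
        uint64_t rowVariance = 0;
        for (int blockX = 0; blockX < maxCol; blockX += 16)
            rowVariance += acEnergyCu(curFrame, blockX, blockY, csp);
        frameVariance += rowVariance / maxCol; // per-row reduction first
    }
    frameVariance /= maxRow;
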
@@ -177,6 +198,8 @@
77
                     uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
78
                     qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (14.427f + 2 * (X265_DEPTH - 8)));
79
                 }
80
+                if (quantOffsets != NULL)
81
+                    qp_adj += quantOffsets[blockXY];
82
                 curFrame->m_lowres.qpAqOffset[blockXY] = qp_adj;
83
                 curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
84
                 curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(qp_adj);
85
@@ -328,7 +351,7 @@
86
 
87
         primitives.weight_pp(ref.buffer[0], wbuffer[0], stride, widthHeight, paddedLines,
88
             scale, round << correction, denom + correction, offset);
89
-        src = weightedRef.fpelPlane[0];
90
+        src = fenc.weightedRef[fenc.frameNum - ref.frameNum].fpelPlane[0];
91
     }
92
 
93
     uint32_t cost = 0;
94
@@ -350,7 +373,6 @@
95
 bool LookaheadTLD::allocWeightedRef(Lowres& fenc)
96
 {
97
     intptr_t planesize = fenc.buffer[1] - fenc.buffer[0];
98
-    intptr_t padoffset = fenc.lowresPlane[0] - fenc.buffer[0];
99
     paddedLines = (int)(planesize / fenc.lumaStride);
100
 
101
     wbuffer[0] = X265_MALLOC(pixel, 4 * planesize);
102
@@ -363,14 +385,6 @@
103
     else
104
         return false;
105
 
106
-    for (int i = 0; i < 4; i++)
107
-        weightedRef.lowresPlane[i] = wbuffer[i] + padoffset;
108
-
109
-    weightedRef.fpelPlane[0] = weightedRef.lowresPlane[0];
110
-    weightedRef.lumaStride = fenc.lumaStride;
111
-    weightedRef.isLowres = true;
112
-    weightedRef.isWeighted = false;
113
-
114
     return true;
115
 }
116
 
117
@@ -388,6 +402,16 @@
118
             return;
119
     }
120
 
121
+    ReferencePlanes& weightedRef = fenc.weightedRef[deltaIndex];
122
+    intptr_t padoffset = fenc.lowresPlane[0] - fenc.buffer[0];
123
+    for (int i = 0; i < 4; i++)
124
+        weightedRef.lowresPlane[i] = wbuffer[i] + padoffset;
125
+
126
+    weightedRef.fpelPlane[0] = weightedRef.lowresPlane[0];
127
+    weightedRef.lumaStride = fenc.lumaStride;
128
+    weightedRef.isLowres = true;
129
+    weightedRef.isWeighted = false;
130
+
131
     /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
132
     float guessScale, fencMean, refMean;
133
     x265_emms();
134
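
Weighted lowres reference planes move out of per-thread storage
(tld.weightedRef) into the frame itself, keyed by the POC distance to the
reference, so several references to one source frame can carry weights
concurrently. The association, condensed from the hunks:

    // One weighted-ref slot per temporal distance; replaces tld.weightedRef.
    int deltaIndex = fenc.frameNum - ref.frameNum;       // POC distance
    ReferencePlanes& wref = fenc.weightedRef[deltaIndex];
    wref.isWeighted = false; // set true once weighted planes are filled in
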
@@ -478,7 +502,13 @@
135
 
136
     m_8x8Height = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
137
     m_8x8Width = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
138
-    m_8x8Blocks = m_8x8Width > 2 && m_8x8Height > 2 ? (m_8x8Width - 2) * (m_8x8Height - 2) : m_8x8Width * m_8x8Height;
139
+    m_cuCount = m_8x8Width * m_8x8Height;
140
+    m_8x8Blocks = m_8x8Width > 2 && m_8x8Height > 2 ? (m_cuCount + 4 - 2 * (m_8x8Width + m_8x8Height)) : m_cuCount;
141
+
142
+    /* Allow the strength to be adjusted via qcompress, since the two concepts
143
+     * are very similar. */
144
+
145
+    m_cuTreeStrength = 5.0 * (1.0 - m_param->rc.qCompress);
146
 
147
     m_lastKeyframe = -m_param->keyframeMax;
148
     m_sliceTypeBusy = false;
149
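
Hoisting the strength to init-time also makes the qcomp coupling explicit:
with the default qCompress of 0.6, m_cuTreeStrength = 5.0 * (1.0 - 0.6) = 2.0,
while qCompress = 1.0 zeroes the strength and with it the cuTree QP offsets.
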
@@ -502,7 +532,16 @@
150
     m_bBatchFrameCosts = m_bBatchMotionSearch;
151
 
152
     if (m_param->lookaheadSlices && !m_pool)
153
+    {
154
+        x265_log(param, X265_LOG_WARNING, "No pools found; disabling lookahead-slices\n");
155
+        m_param->lookaheadSlices = 0;
156
+    }
157
+
158
+    if (m_param->lookaheadSlices && (m_param->sourceHeight < 720))
159
+    {
160
+        x265_log(param, X265_LOG_WARNING, "Source height < 720p; disabling lookahead-slices\n");
161
         m_param->lookaheadSlices = 0;
162
+    }
163
 
164
     if (m_param->lookaheadSlices > 1)
165
     {
166
@@ -715,16 +754,16 @@
167
 
168
     case P_SLICE:
169
         b = p1 = poc - l0poc;
170
-        frames[p0] = &slice->m_refPicList[0][0]->m_lowres;
171
+        frames[p0] = &slice->m_refFrameList[0][0]->m_lowres;
172
         frames[b] = &curFrame->m_lowres;
173
         break;
174
 
175
     case B_SLICE:
176
         b = poc - l0poc;
177
         p1 = b + l1poc - poc;
178
-        frames[p0] = &slice->m_refPicList[0][0]->m_lowres;
179
+        frames[p0] = &slice->m_refFrameList[0][0]->m_lowres;
180
         frames[b] = &curFrame->m_lowres;
181
-        frames[p1] = &slice->m_refPicList[1][0]->m_lowres;
182
+        frames[p1] = &slice->m_refFrameList[1][0]->m_lowres;
183
         break;
184
 
185
     default:
186
@@ -736,10 +775,13 @@
187
     if (m_param->rc.cuTree && !m_param->rc.bStatRead)
188
         /* update row satds based on cutree offsets */
189
         curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b);
190
-    else if (m_param->rc.aqMode)
191
-        curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b];
192
-    else
193
-        curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b];
194
+    else if (m_param->analysisMode != X265_ANALYSIS_LOAD)
195
+    {
196
+        if (m_param->rc.aqMode)
197
+            curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b];
198
+        else
199
+            curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b];
200
+    }
201
 
202
     if (m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate)
203
     {
204
@@ -760,6 +802,7 @@
205
             for (uint32_t cnt = 0; cnt < scale && lowresRow < heightInLowresCu; lowresRow++, cnt++)
206
             {
207
                 sum = 0; intraSum = 0;
208
+                int diff = 0;
209
                 lowresCuIdx = lowresRow * widthInLowresCu;
210
                 for (lowresCol = 0; lowresCol < widthInLowresCu; lowresCol++, lowresCuIdx++)
211
                 {
212
@@ -767,14 +810,18 @@
213
                     if (qp_offset)
214
                     {
215
                         lowresCuCost = (uint16_t)((lowresCuCost * x265_exp2fix8(qp_offset[lowresCuIdx]) + 128) >> 8);
216
-                        int32_t intraCuCost = curFrame->m_lowres.intraCost[lowresCuIdx]; 
217
+                        int32_t intraCuCost = curFrame->m_lowres.intraCost[lowresCuIdx];
218
                         curFrame->m_lowres.intraCost[lowresCuIdx] = (intraCuCost * x265_exp2fix8(qp_offset[lowresCuIdx]) + 128) >> 8;
219
                     }
220
+                    if (m_param->bIntraRefresh && slice->m_sliceType == X265_TYPE_P)
221
+                        for (uint32_t x = curFrame->m_encData->m_pir.pirStartCol; x <= curFrame->m_encData->m_pir.pirEndCol; x++)
222
+                            diff += curFrame->m_lowres.intraCost[lowresCuIdx] - lowresCuCost;
223
                     curFrame->m_lowres.lowresCostForRc[lowresCuIdx] = lowresCuCost;
224
                     sum += lowresCuCost;
225
                     intraSum += curFrame->m_lowres.intraCost[lowresCuIdx];
226
                 }
227
                 curFrame->m_encData->m_rowStat[row].satdForVbv += sum;
228
+                curFrame->m_encData->m_rowStat[row].satdForVbv += diff;
229
                 curFrame->m_encData->m_rowStat[row].intraSatdForVbv += intraSum;
230
             }
231
         }
232
@@ -886,8 +933,7 @@
233
             x265_log(m_param, X265_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid and %d reference frames\n",
234
                      frm.sliceType, m_param->maxNumReferences);
235
         }
236
-
237
-        if (/* (!param->intraRefresh || frm.frameNum == 0) && */ frm.frameNum - m_lastKeyframe >= m_param->keyframeMax)
238
+        if ((!m_param->bIntraRefresh || frm.frameNum == 0) && frm.frameNum - m_lastKeyframe >= m_param->keyframeMax)
239
         {
240
             if (frm.sliceType == X265_TYPE_AUTO || frm.sliceType == X265_TYPE_I)
241
                 frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
242
@@ -1170,7 +1216,7 @@
243
     frames[framecnt + 1] = NULL;
244
 
245
     keyintLimit = m_param->keyframeMax - frames[0]->frameNum + m_lastKeyframe - 1;
246
-    origNumFrames = numFrames = X265_MIN(framecnt, keyintLimit);
247
+    origNumFrames = numFrames = m_param->bIntraRefresh ? framecnt : X265_MIN(framecnt, keyintLimit);
248
 
249
     if (bIsVbvLookahead)
250
         numFrames = framecnt;
251
@@ -1366,12 +1412,12 @@
252
     if (m_param->rc.cuTree)
253
         cuTree(frames, X265_MIN(numFrames, m_param->keyframeMax), bKeyframe);
254
 
255
-    // if (!param->bIntraRefresh)
256
-    for (int j = keyintLimit + 1; j <= numFrames; j += m_param->keyframeMax)
257
-    {
258
-        frames[j]->sliceType = X265_TYPE_I;
259
-        resetStart = X265_MIN(resetStart, j + 1);
260
-    }
261
+    if (!m_param->bIntraRefresh)
262
+        for (int j = keyintLimit + 1; j <= numFrames; j += m_param->keyframeMax)
263
+        {
264
+            frames[j]->sliceType = X265_TYPE_I;
265
+            resetStart = X265_MIN(resetStart, j + 1);
266
+        }
267
 
268
     if (bIsVbvLookahead)
269
         vbvLookahead(frames, numFrames, bKeyframe);
270
@@ -1493,7 +1539,7 @@
271
     {
272
         if (m_param->keyframeMin == m_param->keyframeMax)
273
             threshMin = threshMax;
274
-        if (gopSize <= m_param->keyframeMin / 4)
275
+        if (gopSize <= m_param->keyframeMin / 4 || m_param->bIntraRefresh)
276
             bias = threshMin / 4;
277
         else if (gopSize <= m_param->keyframeMin)
278
             bias = threshMin * gopSize / m_param->keyframeMin;
279
@@ -1606,7 +1652,6 @@
280
     double averageDuration = totalDuration / (numframes + 1);
281
 
282
     int i = numframes;
283
-    int cuCount = m_8x8Width * m_8x8Height;
284
 
285
     while (i > 0 && frames[i]->sliceType == X265_TYPE_B)
286
         i--;
287
@@ -1620,18 +1665,18 @@
288
     {
289
         if (bIntra)
290
         {
291
-            memset(frames[0]->propagateCost, 0, cuCount * sizeof(uint16_t));
292
-            memcpy(frames[0]->qpCuTreeOffset, frames[0]->qpAqOffset, cuCount * sizeof(double));
293
+            memset(frames[0]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
294
+            memcpy(frames[0]->qpCuTreeOffset, frames[0]->qpAqOffset, m_cuCount * sizeof(double));
295
             return;
296
         }
297
         std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
298
-        memset(frames[0]->propagateCost, 0, cuCount * sizeof(uint16_t));
299
+        memset(frames[0]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
300
     }
301
     else
302
     {
303
         if (lastnonb < idx)
304
             return;
305
-        memset(frames[lastnonb]->propagateCost, 0, cuCount * sizeof(uint16_t));
306
+        memset(frames[lastnonb]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
307
     }
308
 
309
     CostEstimateGroup estGroup(*this, frames);
310
@@ -1647,13 +1692,13 @@
311
 
312
         estGroup.singleCost(curnonb, lastnonb, lastnonb);
313
 
314
-        memset(frames[curnonb]->propagateCost, 0, cuCount * sizeof(uint16_t));
315
+        memset(frames[curnonb]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
316
         bframes = lastnonb - curnonb - 1;
317
         if (m_param->bBPyramid && bframes > 1)
318
         {
319
             int middle = (bframes + 1) / 2 + curnonb;
320
             estGroup.singleCost(curnonb, lastnonb, middle);
321
-            memset(frames[middle]->propagateCost, 0, cuCount * sizeof(uint16_t));
322
+            memset(frames[middle]->propagateCost, 0, m_cuCount * sizeof(uint16_t));
323
             while (i > curnonb)
324
             {
325
                 int p0 = i > middle ? middle : curnonb;
326
@@ -1804,20 +1849,14 @@
327
     if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0)
328
         weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]);
329
 
330
-    /* Allow the strength to be adjusted via qcompress, since the two concepts
331
-     * are very similar. */
332
-
333
-    int cuCount = m_8x8Width * m_8x8Height;
334
-    double strength = 5.0 * (1.0 - m_param->rc.qCompress);
335
-
336
-    for (int cuIndex = 0; cuIndex < cuCount; cuIndex++)
337
+    for (int cuIndex = 0; cuIndex < m_cuCount; cuIndex++)
338
     {
339
         int intracost = (frame->intraCost[cuIndex] * frame->invQscaleFactor[cuIndex] + 128) >> 8;
340
         if (intracost)
341
         {
342
             int propagateCost = (frame->propagateCost[cuIndex] * fpsFactor + 128) >> 8;
343
             double log2_ratio = X265_LOG2(intracost + propagateCost) - X265_LOG2(intracost) + weightdelta;
344
-            frame->qpCuTreeOffset[cuIndex] = frame->qpAqOffset[cuIndex] - strength * log2_ratio;
345
+            frame->qpCuTreeOffset[cuIndex] = frame->qpAqOffset[cuIndex] - m_cuTreeStrength * log2_ratio;
346
         }
347
     }
348
 }
349
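
The cuTree hunks above also hoist the propagate-cost strength out of the per-CU loop: the removed lines recomputed strength = 5.0 * (1.0 - qCompress) on every call, while the new code reads the precomputed m_cuTreeStrength (alongside the shared m_cuCount). A worked example of that formula, taken verbatim from the removed lines; the default --qcomp value of 0.6 is an assumption stated here only for illustration:

    // strength derived from qcompress; with --qcomp 0.6 this yields
    // 5.0 * (1.0 - 0.6) = 2.0, and each CU offset then becomes
    // qpAqOffset - strength * log2_ratio, as in the hunk above
    static double cuTreeStrength(double qCompress)
    {
        return 5.0 * (1.0 - qCompress);
    }
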
@@ -1958,7 +1997,7 @@
350
         if (bDoSearch[1]) fenc->lowresMvs[1][p1 - b - 1][0].x = 0x7FFE;
351
 #endif
352
 
353
-        tld.weightedRef.isWeighted = false;
354
+        fenc->weightedRef[b - p0].isWeighted = false;
355
         if (param->bEnableWeightedPred && bDoSearch[0])
356
             tld.weightsAnalyse(*m_frames[b], *m_frames[p0]);
357
 
358
@@ -2032,7 +2071,7 @@
359
     Lowres *fref1 = m_frames[p1];
360
     Lowres *fenc  = m_frames[b];
361
 
362
-    ReferencePlanes *wfref0 = tld.weightedRef.isWeighted ? &tld.weightedRef : fref0;
363
+    ReferencePlanes *wfref0 = fenc->weightedRef[b - p0].isWeighted ? &fenc->weightedRef[b - p0] : fref0;
364
 
365
     const int widthInCU = m_lookahead.m_8x8Width;
366
     const int heightInCU = m_lookahead.m_8x8Height;
367
@@ -2061,6 +2100,7 @@
368
     for (int i = 0; i < 1 + bBidir; i++)
369
     {
370
         int& fencCost = fenc->lowresMvCosts[i][listDist[i]][cuXY];
371
+        int skipCost = INT_MAX;
372
 
373
         if (!bDoSearch[i])
374
         {
375
@@ -2103,12 +2143,20 @@
376
                 pixel *src = fref->lowresMC(pelOffset, mvc[idx], subpelbuf, stride);
377
                 int cost = tld.me.bufSATD(src, stride);
378
                 COPY2_IF_LT(mvpcost, cost, mvp, mvc[idx]);
379
+                /* Except for the mv0 case, everything else is likely to have enough residual to not trigger the skip. */
380
+                if (!mvp.notZero() && bBidir)
381
+                    skipCost = cost;
382
             }
383
         }
384
 
385
         /* ME will never return a cost larger than the cost @MVP, so we do not
386
          * have to check that ME cost is more than the estimated merge cost */
387
         fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, s_merange, *fencMV);
388
+        if (skipCost < 64 && skipCost < fencCost && bBidir)
389
+        {
390
+            fencCost = skipCost;
391
+            *fencMV = 0;
392
+        }
393
         COPY2_IF_LT(bcost, fencCost, listused, i + 1);
394
     }
395
 
396
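
Taken together, the slicetype.cpp hunks above wire the new --intra-refresh mode into slice-type decision (periodic I/IDR insertion is suppressed, the lookahead window is no longer clamped to the keyframe limit, and the scenecut bias is relaxed) and add a cheap skip-cost early-out to the bidirectional lowres motion search. A minimal sketch of the resulting keyframe predicate, with simplified fields mirroring the patch; isKeyframeDue is a hypothetical helper, not x265 API:

    struct KeyframeCtx
    {
        int  lastKeyframe;   // POC of the most recent keyframe
        int  keyframeMax;    // --keyint
        bool bIntraRefresh;  // --intra-refresh
    };

    // With intra-refresh, periodic keyframes are suppressed except for the
    // very first frame; recovery points come from the moving intra column.
    static bool isKeyframeDue(const KeyframeCtx& ctx, int frameNum)
    {
        if (ctx.bIntraRefresh && frameNum != 0)
            return false;
        return frameNum - ctx.lastKeyframe >= ctx.keyframeMax;
    }
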
x265_1.8.tar.gz/source/encoder/slicetype.h -> x265_1.9.tar.gz/source/encoder/slicetype.h Changed
59
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -44,7 +45,6 @@
10
 struct LookaheadTLD
11
 {
12
     MotionEstimate  me;
13
-    ReferencePlanes weightedRef;
14
     pixel*          wbuffer[4];
15
     int             widthInCU;
16
     int             heightInCU;
17
@@ -103,29 +103,30 @@
18
     PicList       m_outputQueue;     // pictures to be encoded, in encode order
19
     Lock          m_inputLock;
20
     Lock          m_outputLock;
21
-
22
-    /* pre-lookahead */
23
-    int           m_fullQueueSize;
24
-    bool          m_isActive;
25
-    bool          m_sliceTypeBusy;
26
-    bool          m_bAdaptiveQuant;
27
-    bool          m_outputSignalRequired;
28
-    bool          m_bBatchMotionSearch;
29
-    bool          m_bBatchFrameCosts;
30
     Event         m_outputSignal;
31
-
32
     LookaheadTLD* m_tld;
33
     x265_param*   m_param;
34
     Lowres*       m_lastNonB;
35
     int*          m_scratch;         // temp buffer for cutree propagate
36
-    
37
+
38
+    /* pre-lookahead */
39
+    int           m_fullQueueSize;
40
     int           m_histogram[X265_BFRAME_MAX + 1];
41
     int           m_lastKeyframe;
42
     int           m_8x8Width;
43
     int           m_8x8Height;
44
     int           m_8x8Blocks;
45
+    int           m_cuCount;
46
     int           m_numCoopSlices;
47
     int           m_numRowsPerSlice;
48
+    double        m_cuTreeStrength;
49
+
50
+    bool          m_isActive;
51
+    bool          m_sliceTypeBusy;
52
+    bool          m_bAdaptiveQuant;
53
+    bool          m_outputSignalRequired;
54
+    bool          m_bBatchMotionSearch;
55
+    bool          m_bBatchFrameCosts;
56
     bool          m_filled;
57
     bool          m_isSceneTransition;
58
     Lookahead(x265_param *param, ThreadPool *pool);
59
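
The slicetype.h hunk above moves the weighted reference planes out of the per-thread LookaheadTLD and into per-frame state indexed by reference distance (b - p0), so concurrent cost estimates against different references no longer share one thread-local buffer; it also regroups the Lookahead members by type. A simplified sketch of the new ownership, using stand-in types rather than x265's; X265_BFRAME_MAX mirrors the constant from x265.h:

    enum { X265_BFRAME_MAX = 16 }; // value from x265.h

    struct ReferencePlanesLite { bool isWeighted; /* plane pointers elided */ };

    struct LowresLite
    {
        // one slot per possible reference distance (b - p0)
        ReferencePlanesLite weightedRef[X265_BFRAME_MAX + 2];
    };
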
x265_1.8.tar.gz/source/encoder/weightPrediction.cpp -> x265_1.9.tar.gz/source/encoder/weightPrediction.cpp Changed
43
 
1
@@ -4,6 +4,7 @@
2
  * Author: Shazeb Nawaz Khan <shazeb@multicorewareinc.com>
3
  *         Steve Borho <steve@borho.org>
4
  *         Kavitha Sampas <kavitha@multicorewareinc.com>
5
+ *         Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -259,13 +260,13 @@
10
     for (int list = 0; list < cache.numPredDir; list++)
11
     {
12
         WeightParam *weights = wp[list][0];
13
-        Frame *refFrame = slice.m_refPicList[list][0];
14
+        Frame *refFrame = slice.m_refFrameList[list][0];
15
         Lowres& refLowres = refFrame->m_lowres;
16
         int diffPoc = abs(curPoc - refFrame->m_poc);
17
 
18
         /* prepare estimates */
19
         float guessScale[3], fencMean[3], refMean[3];
20
-        for (int plane = 0; plane < 3; plane++)
21
+        for (int plane = 0; plane < (param.internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
22
         {
23
             SET_WEIGHT(weights[plane], false, 1, 0, 0);
24
             uint64_t fencVar = fenc.wp_ssd[plane] + !refLowres.wp_ssd[plane];
25
@@ -289,7 +290,7 @@
26
 
27
         MV *mvs = NULL;
28
 
29
-        for (int plane = 0; plane < 3; plane++)
30
+        for (int plane = 0; plane < (param.internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
31
         {
32
             denom = plane ? chromaDenom : lumaDenom;
33
             if (plane && !weights[0].bPresentFlag)
34
@@ -328,7 +329,7 @@
35
                 {
36
                     /* reference chroma planes must be extended prior to being
37
                      * used as motion compensation sources */
38
-                    if (!refFrame->m_bChromaExtended)
39
+                    if (!refFrame->m_bChromaExtended && param.internalCsp != X265_CSP_I400)
40
                     {
41
                         refFrame->m_bChromaExtended = true;
42
                         PicYuv *refPic = refFrame->m_fencPic;
43
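
The weightPrediction.cpp hunks above implement the newly supported 4:0:0 path: with a monochrome internal colorspace there are no chroma planes to analyse or extend, so the plane loops run over luma only. A minimal sketch of the rule the patch applies inline; numAnalysisPlanes is a hypothetical helper, while X265_CSP_I400 is the real constant from x265.h (its value repeated here only for self-containment):

    enum { X265_CSP_I400 = 0 }; // from x265.h

    // 4:0:0 carries a single (luma) plane; everything else carries three
    static int numAnalysisPlanes(int internalCsp)
    {
        return internalCsp == X265_CSP_I400 ? 1 : 3;
    }

    // usage mirroring the patch:
    //   for (int plane = 0; plane < numAnalysisPlanes(param.internalCsp); plane++)
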
x265_1.8.tar.gz/source/output/y4m.cpp -> x265_1.9.tar.gz/source/output/y4m.cpp Changed
10
 
1
@@ -70,7 +70,7 @@
2
         x265_log(NULL, X265_LOG_WARNING, "y4m: forcing reconstructed pixels to 8 bits\n");
3
 #endif
4
 
5
-    X265_CHECK(pic.colorSpace == colorSpace, "invalid color space\n");
6
+    X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
7
 
8
 #if HIGH_BIT_DEPTH
9
 
10
x265_1.8.tar.gz/source/output/yuv.cpp -> x265_1.9.tar.gz/source/output/yuv.cpp Changed
10
 
1
@@ -53,7 +53,7 @@
2
     uint64_t fileOffset = pic.poc;
3
     fileOffset *= frameSize;
4
 
5
-    X265_CHECK(pic.colorSpace == colorSpace, "invalid color space\n");
6
+    X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
7
     X265_CHECK(pic.bitDepth == (int)depth, "invalid bit depth\n");
8
 
9
 #if HIGH_BIT_DEPTH
10
x265_1.8.tar.gz/source/profile/vtune/CMakeLists.txt -> x265_1.9.tar.gz/source/profile/vtune/CMakeLists.txt Changed
5
 
1
@@ -1,2 +1,2 @@
2
-include_directories($ENV{VTUNE_AMPLIFIER_XE_2015_DIR}/include)
3
+include_directories(${VTUNE_INCLUDE_DIR})
4
 add_library(vtune vtune.h vtune.cpp ../cpuEvents.h)
5
x265_1.8.tar.gz/source/profile/vtune/vtune.cpp -> x265_1.9.tar.gz/source/profile/vtune/vtune.cpp Changed
19
 
1
@@ -30,7 +30,6 @@
2
 const char *stringNames[] =
3
 {
4
 #include "../cpuEvents.h"
5
-    ""
6
 };
7
 #undef CPU_EVENT
8
 
9
@@ -44,7 +43,8 @@
10
 void vtuneInit()
11
 {
12
     domain = __itt_domain_create("x265");
13
-    for (size_t i = 0; i < sizeof(stringNames) / sizeof(const char *); i++)
14
+    size_t length = sizeof(stringNames) / sizeof(const char *);
15
+    for (size_t i = 0; i < length; i++)
16
         taskHandle[i] = __itt_string_handle_create(stringNames[i]);
17
 }
18
 
19
x265_1.8.tar.gz/source/test/checkasm-a.asm -> x265_1.9.tar.gz/source/test/checkasm-a.asm Changed
13
 
1
@@ -2,9 +2,11 @@
2
 ;* checkasm-a.asm: assembly check tool
3
 ;*****************************************************************************
4
 ;* Copyright (C) 2008-2014 x264 project
5
+;* Copyright (C) 2013-2015 x265 project
6
 ;*
7
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8
 ;*          Henrik Gramner <henrik@gramner.com>
9
+;*          Min Chen <chenm003@163.com>
10
 ;*
11
 ;* This program is free software; you can redistribute it and/or modify
12
 ;* it under the terms of the GNU General Public License as published by
13
x265_1.8.tar.gz/source/test/intrapredharness.cpp -> x265_1.9.tar.gz/source/test/intrapredharness.cpp Changed
10
 
1
@@ -130,6 +130,8 @@
2
                 if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width * sizeof(pixel)))
3
                 {
4
                     printf("ang_%dx%d, Mode = %d, Row = %d failed !!\n", width, width, pmode, k);
5
+                    ref[pmode](pixel_out_c, stride, pixel_buff + j, pmode, bFilter);
6
+                    opt[pmode](pixel_out_vec, stride, pixel_buff + j, pmode, bFilter);
7
                     return false;
8
                 }
9
             }
10
x265_1.8.tar.gz/source/test/ipfilterharness.h -> x265_1.9.tar.gz/source/test/ipfilterharness.h Changed
9
 
1
@@ -4,6 +4,7 @@
2
  * Authors: Deepthi Devaki <deepthidevaki@multicorewareinc.com>,
3
  *          Rajesh Paulraj <rajesh@multicorewareinc.com>
4
  *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/test/pixelharness.cpp -> x265_1.9.tar.gz/source/test/pixelharness.cpp Changed
671
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -41,6 +42,7 @@
10
         int_test_buff[0][i]     = rand() % SHORT_MAX;
11
         ushort_test_buff[0][i]  = rand() % ((1 << 16) - 1);
12
         uchar_test_buff[0][i]   = rand() % ((1 << 8) - 1);
13
+        residual_test_buff[0][i] = (rand() % (2 * RMAX + 1)) - RMAX - 1; // For sse_ss only
14
 
15
         pixel_test_buff[1][i]   = PIXEL_MIN;
16
         short_test_buff[1][i]   = SMIN;
17
@@ -49,6 +51,7 @@
18
         int_test_buff[1][i]     = SHORT_MIN;
19
         ushort_test_buff[1][i]  = PIXEL_MIN;
20
         uchar_test_buff[1][i]   = PIXEL_MIN;
21
+        residual_test_buff[1][i] = RMIN;
22
 
23
         pixel_test_buff[2][i]   = PIXEL_MAX;
24
         short_test_buff[2][i]   = SMAX;
25
@@ -57,6 +60,7 @@
26
         int_test_buff[2][i]     = SHORT_MAX;
27
         ushort_test_buff[2][i]  = ((1 << 16) - 1);
28
         uchar_test_buff[2][i]   = 255;
29
+        residual_test_buff[2][i] = RMAX;
30
 
31
         pbuf1[i] = rand() & PIXEL_MAX;
32
         pbuf2[i] = rand() & PIXEL_MAX;
33
@@ -103,8 +107,8 @@
34
     {
35
         int index1 = rand() % TEST_CASES;
36
         int index2 = rand() % TEST_CASES;
37
-        sse_ret_t vres = (sse_ret_t)checked(opt, pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
38
-        sse_ret_t cres = ref(pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
39
+        sse_t vres = (sse_t)checked(opt, pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
40
+        sse_t cres = ref(pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
41
         if (vres != cres)
42
             return false;
43
 
44
@@ -124,8 +128,8 @@
45
     {
46
         int index1 = rand() % TEST_CASES;
47
         int index2 = rand() % TEST_CASES;
48
-        sse_ret_t vres = (sse_ret_t)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
49
-        sse_ret_t cres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
50
+        sse_t vres = (sse_t)checked(opt, residual_test_buff[index1], stride, residual_test_buff[index2] + j, stride);
51
+        sse_t cres = ref(residual_test_buff[index1], stride, residual_test_buff[index2] + j, stride);
52
         if (vres != cres)
53
             return false;
54
 
55
@@ -227,8 +231,8 @@
56
     {
57
         // NOTE: stride must be multiple of 16, because minimum block is 4x4
58
         int stride = (STRIDE + (rand() % STRIDE)) & ~15;
59
-        int cres = ref(sbuf1 + j, stride);
60
-        int vres = (int)checked(opt, sbuf1 + j, (intptr_t)stride);
61
+        sse_t cres = ref(sbuf1 + j, stride);
62
+        sse_t vres = (sse_t)checked(opt, sbuf1 + j, (intptr_t)stride);
63
 
64
         if (cres != vres)
65
             return false;
66
@@ -854,7 +858,7 @@
67
         int width = (rand() % 4) + 1; // range[1-4]
68
         float cres = ref(sum0, sum1, width);
69
         float vres = checked_float(opt, sum0, sum1, width);
70
-        if (fabs(vres - cres) > 0.00001)
71
+        if (fabs(vres - cres) > 0.0001)
72
             return false;
73
 
74
         reportfail();
75
@@ -1061,8 +1065,8 @@
76
         int endX = MAX_CU_SIZE - (rand() % 5);
77
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
78
 
79
-        ref(pbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_ref, count_ref);
80
-        checked(opt, pbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_vec, count_vec);
81
+        ref(sbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_ref, count_ref);
82
+        checked(opt, sbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_vec, count_vec);
83
 
84
         if (memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref)))
85
             return false;
86
@@ -1097,8 +1101,8 @@
87
         int endX = MAX_CU_SIZE - (rand() % 5) - 1;
88
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
89
 
90
-        ref(pbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_ref, count_ref);
91
-        checked(opt, pbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_vec, count_vec);
92
+        ref(sbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_ref, count_ref);
93
+        checked(opt, sbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_vec, count_vec);
94
 
95
         if (memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref)))
96
             return false;
97
@@ -1141,8 +1145,8 @@
98
         int endX = MAX_CU_SIZE - (rand() % 5);
99
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
100
 
101
-        ref(pbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
102
-        checked(opt, pbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
103
+        ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
104
+        checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
105
 
106
         if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
107
             || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
108
@@ -1193,8 +1197,8 @@
109
         int endX = MAX_CU_SIZE - (rand() % 5) - 1;
110
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
111
 
112
-        ref(pbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, upBufft_ref, endX, endY, stats_ref, count_ref);
113
-        checked(opt, pbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, upBufft_vec, endX, endY, stats_vec, count_vec);
114
+        ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, upBufft_ref, endX, endY, stats_ref, count_ref);
115
+        checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, upBufft_vec, endX, endY, stats_vec, count_vec);
116
 
117
         // TODO: don't check upBuff*, the latest output pixels different, and can move into stack temporary buffer in future
118
         if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
119
@@ -1244,8 +1248,8 @@
120
         int endX = MAX_CU_SIZE - (rand() % 5) - 1;
121
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
122
 
123
-        ref(pbuf2, pbuf3, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
124
-        checked(opt, pbuf2, pbuf3, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
125
+        ref(sbuf2, pbuf3, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
126
+        checked(opt, sbuf2, pbuf3, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
127
 
128
         if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
129
             || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
130
@@ -1295,8 +1299,8 @@
131
 
132
     memset(ref_dest, 0xCD, sizeof(ref_dest));
133
     memset(opt_dest, 0xCD, sizeof(opt_dest));
134
-    int width = 32 + rand() % 32;
135
-    int height = 32 + rand() % 32;
136
+    int width = 32 + (rand() % 32);
137
+    int height = 32 + (rand() % 32);
138
     intptr_t srcStride = 64;
139
     intptr_t dstStride = width;
140
     int j = 0;
141
@@ -1304,11 +1308,23 @@
142
     for (int i = 0; i < ITERS; i++)
143
     {
144
         int index = i % TEST_CASES;
145
+
146
         checked(opt, ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
147
         ref(ushort_test_buff[index] + j, srcStride, ref_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
148
 
149
-        if (memcmp(ref_dest, opt_dest, width * height * sizeof(pixel)))
150
+        if (memcmp(ref_dest, opt_dest, dstStride * height * sizeof(pixel)))
151
+        {
152
+            memcpy(opt_dest, ref_dest, sizeof(ref_dest));
153
+            opt(ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
154
             return false;
155
+        }
156
+
157
+        // check tail memory area
158
+        for(int x = width; x < dstStride; x++)
159
+        {
160
+            if (opt_dest[(height - 1) * dstStride + x] != 0xCD)
161
+                return false;
162
+        }
163
 
164
         reportfail();
165
         j += INCR;
166
@@ -1340,6 +1356,13 @@
167
         if (memcmp(ref_dest, opt_dest, sizeof(ref_dest)))
168
             return false;
169
 
170
+        // check tail memory area
171
+        for(int x = width; x < dstStride; x++)
172
+        {
173
+            if (opt_dest[(height - 1) * dstStride + x] != 0xCD)
174
+                return false;
175
+        }
176
+
177
         reportfail();
178
         j += INCR;
179
     }
180
@@ -1356,16 +1379,16 @@
181
     memset(opt_dest, 0xCD, sizeof(opt_dest));
182
 
183
     double fps = 1.0;
184
-    int width = 16 + rand() % 64;
185
     int j = 0;
186
 
187
     for (int i = 0; i < ITERS; i++)
188
     {
189
+        int width = 16 + rand() % 64;
190
         int index = i % TEST_CASES;
191
         checked(opt, opt_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width);
192
         ref(ref_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width);
193
 
194
-        if (memcmp(ref_dest, opt_dest, width * sizeof(pixel)))
195
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
196
             return false;
197
 
198
         reportfail();
199
@@ -1397,28 +1420,6 @@
200
     return true;
201
 }
202
 
203
-bool PixelHarness::check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt)
204
-{
205
-    int j = 0, index1, index2, optres, refres;
206
-    intptr_t stride = STRIDE;
207
-
208
-    for (int i = 0; i < ITERS; i++)
209
-    {
210
-        index1 = rand() % TEST_CASES;
211
-        index2 = rand() % TEST_CASES;
212
-        optres = (int)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
213
-        refres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
214
-
215
-        if (optres != refres)
216
-            return false;
217
-
218
-        reportfail();
219
-        j += INCR;
220
-    }
221
-
222
-    return true;
223
-}
224
-
225
 bool PixelHarness::check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt)
226
 {
227
     ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
228
@@ -1570,8 +1571,8 @@
229
         // special case: all coeff groups are zero
230
         if (j >= SCAN_SET_SIZE)
231
         {
232
-            // all zero block the high 16-bits undefined
233
-            if ((uint16_t)ref_scanPos != (uint16_t)opt_scanPos)
234
+            // all zero block the high 24-bits undefined
235
+            if ((uint8_t)ref_scanPos != (uint8_t)opt_scanPos)
236
                 return false;
237
         }
238
         else if (ref_scanPos != opt_scanPos)
239
@@ -1586,8 +1587,8 @@
240
 bool PixelHarness::check_costCoeffNxN(costCoeffNxN_t ref, costCoeffNxN_t opt)
241
 {
242
     ALIGN_VAR_16(coeff_t, ref_src[32 * 32 + ITERS * 3]);
243
-    ALIGN_VAR_32(uint16_t, ref_absCoeff[1 << MLS_CG_SIZE]);
244
-    ALIGN_VAR_32(uint16_t, opt_absCoeff[1 << MLS_CG_SIZE]);
245
+    ALIGN_VAR_32(uint16_t, ref_absCoeff[(1 << MLS_CG_SIZE)]);
246
+    ALIGN_VAR_32(uint16_t, opt_absCoeff[(1 << MLS_CG_SIZE) + 4]);
247
 
248
     memset(ref_absCoeff, 0xCD, sizeof(ref_absCoeff));
249
     memset(opt_absCoeff, 0xCD, sizeof(opt_absCoeff));
250
@@ -1613,6 +1614,12 @@
251
         ref_src[32 * 32 + i] = 0x1234;
252
     }
253
 
254
+    // safety-check magic: guard words placed past the end of the buffer
255
+    opt_absCoeff[(1 << MLS_CG_SIZE) + 0] = 0x0123;
256
+    opt_absCoeff[(1 << MLS_CG_SIZE) + 1] = 0x4567;
257
+    opt_absCoeff[(1 << MLS_CG_SIZE) + 2] = 0xBA98;
258
+    opt_absCoeff[(1 << MLS_CG_SIZE) + 3] = 0xFEDC;
259
+
260
     // generate CABAC context table
261
     uint8_t m_contextState_ref[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
262
     uint8_t m_contextState_opt[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
263
@@ -1703,8 +1710,8 @@
264
             continue;
265
 
266
         const uint32_t blkPosBase = scanTbl[subPosBase];
267
-        uint32_t ref_sum = ref(scanTblCG4x4, &ref_src[blkPosBase + i], trSize, ref_absCoeff + numNonZero, rand_tabSigCtx, scanFlagMask, (uint8_t*)ref_baseCtx, offset, rand_scanPosSigOff, subPosBase);
268
-        uint32_t opt_sum = (uint32_t)checked(opt, scanTblCG4x4, &ref_src[blkPosBase + i], trSize, opt_absCoeff + numNonZero, rand_tabSigCtx, scanFlagMask, (uint8_t*)opt_baseCtx, offset, rand_scanPosSigOff, subPosBase);
269
+        uint32_t ref_sum = ref(scanTblCG4x4, &ref_src[blkPosBase + i], (intptr_t)trSize, ref_absCoeff + numNonZero, rand_tabSigCtx, scanFlagMask, (uint8_t*)ref_baseCtx, offset, rand_scanPosSigOff, subPosBase);
270
+        uint32_t opt_sum = (uint32_t)checked(opt, scanTblCG4x4, &ref_src[blkPosBase + i], (intptr_t)trSize, opt_absCoeff + numNonZero, rand_tabSigCtx, scanFlagMask, (uint8_t*)opt_baseCtx, offset, rand_scanPosSigOff, subPosBase);
271
 
272
         if (ref_sum != opt_sum)
273
             return false;
274
@@ -1712,18 +1719,25 @@
275
             return false;
276
 
277
         // NOTE: only the first rand_numCoeff entries are valid, but check the full buffer to confirm there is no overwrite bug
278
-        if (memcmp(ref_absCoeff, opt_absCoeff, sizeof(ref_absCoeff)))
279
+        if (memcmp(ref_absCoeff, opt_absCoeff, rand_numCoeff * sizeof(ref_absCoeff[0])))
280
+            return false;
281
+
282
+        // Check memory beyond-bound write
283
+        if (   opt_absCoeff[(1 << MLS_CG_SIZE) + 1] != 0x4567
284
+            || opt_absCoeff[(1 << MLS_CG_SIZE) + 2] != 0xBA98
285
+            || opt_absCoeff[(1 << MLS_CG_SIZE) + 3] != 0xFEDC)
286
             return false;
287
 
288
         reportfail();
289
     }
290
     return true;
291
 }
292
+
293
 bool PixelHarness::check_costCoeffRemain(costCoeffRemain_t ref, costCoeffRemain_t opt)
294
 {
295
-    ALIGN_VAR_32(uint16_t, absCoeff[1 << MLS_CG_SIZE]);
296
+    ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE) + ITERS]);
297
 
298
-    for (int i = 0; i < (1 << MLS_CG_SIZE); i++)
299
+    for (int i = 0; i < (1 << MLS_CG_SIZE) + ITERS; i++)
300
     {
301
         absCoeff[i] = rand() & SHORT_MAX;
302
         // more coeff with value one
303
@@ -1737,20 +1751,168 @@
304
         int numNonZero = rand() % 17; //can be random, range[1, 16]
305
         for (k = 0; k < C1FLAG_NUMBER; k++)
306
         {
307
-            if (absCoeff[k] >= 2)
308
+            if (absCoeff[i + k] >= 2)
309
             {
310
                 break;
311
             }
312
         }
313
         firstC2Idx = k; // it is index of exact first coeff that value more than 2
314
-        int ref_sum = ref(absCoeff, numNonZero, firstC2Idx);
315
-        int opt_sum = (int)checked(opt, absCoeff, numNonZero, firstC2Idx);
316
+        int ref_sum = ref(absCoeff + i, numNonZero, firstC2Idx);
317
+        int opt_sum = (int)checked(opt, absCoeff + i, numNonZero, firstC2Idx);
318
         if (ref_sum != opt_sum)
319
             return false;
320
     }
321
     return true;
322
 }
323
 
324
+bool PixelHarness::check_costC1C2Flag(costC1C2Flag_t ref, costC1C2Flag_t opt)
325
+{
326
+    ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE)]);
327
+
328
+    // generate CABAC context table
329
+    uint8_t ref_baseCtx[8];
330
+    uint8_t opt_baseCtx[8];
331
+    for (int k = 0; k < 8; k++)
332
+    {
333
+        ref_baseCtx[k] =
334
+        opt_baseCtx[k] = (rand() % (125 - 2)) + 2;
335
+    }
336
+
337
+    for (int i = 0; i < ITERS; i++)
338
+    {
339
+        int rand_offset = rand() % 4;
340
+        int numNonZero = 0;
341
+
342
+        // generate test data; all values are absolute and aligned
343
+        for (int k = 0; k < C1FLAG_NUMBER; k++)
344
+        {
345
+            int value = rand() & SHORT_MAX;
346
+            // more coeff with value [0,2]
347
+            if (value < SHORT_MAX * 1 / 3)
348
+                value = 0;
349
+            else if (value < SHORT_MAX * 2 / 3)
350
+                value = 1;
351
+            else if (value < SHORT_MAX * 3 / 4)
352
+                value = 2;
353
+
354
+            if (value)
355
+            {
356
+                absCoeff[numNonZero] = (uint16_t)value;
357
+                numNonZero++;
358
+            }
359
+        }
360
+        if (numNonZero == 0)
361
+        {
362
+            numNonZero = 1;
363
+            absCoeff[0] = 1;
364
+        }
365
+
366
+        int ref_sum = ref(absCoeff, (intptr_t)numNonZero, ref_baseCtx, (intptr_t)rand_offset);
367
+        int opt_sum = (int)checked(opt, absCoeff, (intptr_t)numNonZero, opt_baseCtx, (intptr_t)rand_offset);
368
+        if (ref_sum != opt_sum)
369
+        {
370
+            ref_sum = ref(absCoeff, (intptr_t)numNonZero, ref_baseCtx, (intptr_t)rand_offset);
371
+            opt_sum = opt(absCoeff, (intptr_t)numNonZero, opt_baseCtx, (intptr_t)rand_offset);
372
+            return false;
373
+        }
374
+    }
375
+    return true;
376
+}
377
+
378
+bool PixelHarness::check_planeClipAndMax(planeClipAndMax_t ref, planeClipAndMax_t opt)
379
+{
380
+    for (int i = 0; i < ITERS; i++)
381
+    {
382
+        intptr_t rand_stride = rand() % STRIDE;
383
+        int rand_width = (rand() % (STRIDE * 2)) + 1;
384
+        const int rand_height = (rand() % MAX_HEIGHT) + 1;
385
+        const pixel rand_min = rand() % 32;
386
+        const pixel rand_max = PIXEL_MAX - (rand() % 32);
387
+        uint64_t ref_sum, opt_sum;
388
+
389
+        // video width must be more than or equal to 32
390
+        if (rand_width < 32)
391
+            rand_width = 32;
392
+
393
+        // stride must be more than or equal to width
394
+        if (rand_stride < rand_width)
395
+            rand_stride = rand_width;
396
+
397
+        pixel ref_max = ref(pbuf1, rand_stride, rand_width, rand_height, &ref_sum, rand_min, rand_max);
398
+        pixel opt_max = (pixel)checked(opt, pbuf1, rand_stride, rand_width, rand_height, &opt_sum, rand_min, rand_max);
399
+
400
+        if (ref_max != opt_max)
401
+            return false;
402
+    }
403
+    return true;
404
+}
405
+
406
+bool PixelHarness::check_pelFilterLumaStrong_H(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt)
407
+{
408
+    intptr_t srcStep = 1, offset = 64;
409
+    int32_t tcP, tcQ, maskP, maskQ, tc;
410
+    int j = 0;
411
+
412
+    pixel pixel_test_buff1[TEST_CASES][BUFFSIZE];
413
+    for (int i = 0; i < TEST_CASES; i++)
414
+        memcpy(pixel_test_buff1[i], pixel_test_buff[i], sizeof(pixel) * BUFFSIZE);
415
+
416
+    for (int i = 0; i < ITERS; i++)
417
+    {
418
+        tc      = rand() % PIXEL_MAX;
419
+        maskP   = (rand() % PIXEL_MAX) - 1;
420
+        maskQ   = (rand() % PIXEL_MAX) - 1;
421
+        tcP     = (tc & maskP);
422
+        tcQ     = (tc & maskQ);
423
+
424
+        int index = rand() % 3;
425
+
426
+        ref(pixel_test_buff[index]  + 4 * offset + j, srcStep, offset, tcP, tcQ);
427
+        checked(opt, pixel_test_buff1[index] + 4 * offset + j, srcStep, offset, tcP, tcQ);
428
+
429
+        if (memcmp(pixel_test_buff[index], pixel_test_buff1[index], sizeof(pixel) * BUFFSIZE))
430
+            return false;
431
+
432
+        reportfail();
433
+        j += INCR;
434
+    }
435
+
436
+    return true;
437
+}
438
+
439
+bool PixelHarness::check_pelFilterLumaStrong_V(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt)
440
+{
441
+    intptr_t srcStep = 64, offset = 1;
442
+    int32_t tcP, tcQ, maskP, maskQ, tc;
443
+    int j = 0;
444
+
445
+    pixel pixel_test_buff1[TEST_CASES][BUFFSIZE];
446
+    for (int i = 0; i < TEST_CASES; i++)
447
+        memcpy(pixel_test_buff1[i], pixel_test_buff[i], sizeof(pixel) * BUFFSIZE);
448
+
449
+    for (int i = 0; i < ITERS; i++)
450
+    {
451
+        tc      = rand() % PIXEL_MAX;
452
+        maskP   = (rand() % PIXEL_MAX) - 1;
453
+        maskQ   = (rand() % PIXEL_MAX) - 1;
454
+        tcP     = (tc & maskP);
455
+        tcQ     = (tc & maskQ);
456
+
457
+        int index = rand() % 3;
458
+
459
+        ref(pixel_test_buff[index]  + 4 + j, srcStep, offset, tcP, tcQ);
460
+        checked(opt, pixel_test_buff1[index] + 4 + j, srcStep, offset, tcP, tcQ);
461
+
462
+        if (memcmp(pixel_test_buff[index], pixel_test_buff1[index], sizeof(pixel) * BUFFSIZE))
463
+            return false;
464
+
465
+        reportfail();
466
+        j += INCR;
467
+    }
468
+
469
+    return true;
470
+}
471
+
472
 bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
473
 {
474
     if (opt.pu[part].satd)
475
@@ -2039,15 +2201,6 @@
476
             }
477
         }
478
 
479
-        if (opt.cu[i].psy_cost_ss)
480
-        {
481
-            if (!check_psyCost_ss(ref.cu[i].psy_cost_ss, opt.cu[i].psy_cost_ss))
482
-            {
483
-                printf("\npsy_cost_ss[%dx%d] failed!\n", 4 << i, 4 << i);
484
-                return false;
485
-            }
486
-        }
487
-
488
         if (i < BLOCK_64x64)
489
         {
490
             /* TU only primitives */
491
@@ -2175,7 +2328,7 @@
492
     {
493
         if (!check_ssim_4x4x2_core(ref.ssim_4x4x2_core, opt.ssim_4x4x2_core))
494
         {
495
-            printf("ssim_end_4 failed!\n");
496
+            printf("ssim_4x4x2_core failed!\n");
497
             return false;
498
         }
499
     }
500
@@ -2362,6 +2515,7 @@
501
             return false;
502
         }
503
     }
504
+
505
     if (opt.costCoeffNxN)
506
     {
507
         if (!check_costCoeffNxN(ref.costCoeffNxN, opt.costCoeffNxN))
508
@@ -2370,6 +2524,7 @@
509
             return false;
510
         }
511
     }
512
+
513
     if (opt.costCoeffRemain)
514
     {
515
         if (!check_costCoeffRemain(ref.costCoeffRemain, opt.costCoeffRemain))
516
@@ -2379,6 +2534,43 @@
517
         }
518
     }
519
 
520
+    if (opt.costC1C2Flag)
521
+    {
522
+        if (!check_costC1C2Flag(ref.costC1C2Flag, opt.costC1C2Flag))
523
+        {
524
+            printf("costC1C2Flag failed!\n");
525
+            return false;
526
+        }
527
+    }
528
+    
529
+
530
+    if (opt.planeClipAndMax)
531
+    {
532
+        if (!check_planeClipAndMax(ref.planeClipAndMax, opt.planeClipAndMax))
533
+        {
534
+            printf("planeClipAndMax failed!\n");
535
+            return false;
536
+        }
537
+    }
538
+
539
+    if (opt.pelFilterLumaStrong[0])
540
+    {
541
+        if (!check_pelFilterLumaStrong_V(ref.pelFilterLumaStrong[0], opt.pelFilterLumaStrong[0]))
542
+        {
543
+            printf("pelFilterLumaStrong Vertical failed!\n");
544
+            return false;
545
+        }
546
+    }
547
+
548
+    if (opt.pelFilterLumaStrong[1])
549
+    {
550
+        if (!check_pelFilterLumaStrong_H(ref.pelFilterLumaStrong[1], opt.pelFilterLumaStrong[1]))
551
+        {
552
+            printf("pelFilterLumaStrong Horizontal failed!\n");
553
+            return false;
554
+        }
555
+    }
556
+
557
     return true;
558
 }
559
 
560
@@ -2637,12 +2829,6 @@
561
             HEADER("psy_cost_pp[%dx%d]", 4 << i, 4 << i);
562
             REPORT_SPEEDUP(opt.cu[i].psy_cost_pp, ref.cu[i].psy_cost_pp, pbuf1, STRIDE, pbuf2, STRIDE);
563
         }
564
-
565
-        if (opt.cu[i].psy_cost_ss)
566
-        {
567
-            HEADER("psy_cost_ss[%dx%d]", 4 << i, 4 << i);
568
-            REPORT_SPEEDUP(opt.cu[i].psy_cost_ss, ref.cu[i].psy_cost_ss, sbuf1, STRIDE, sbuf2, STRIDE);
569
-        }
570
     }
571
 
572
     if (opt.weight_pp)
573
@@ -2745,14 +2931,14 @@
574
     {
575
         int32_t stats[33], count[33];
576
         HEADER0("saoCuStatsBO");
577
-        REPORT_SPEEDUP(opt.saoCuStatsBO, ref.saoCuStatsBO, pbuf2, pbuf3, 64, 60, 61, stats, count);
578
+        REPORT_SPEEDUP(opt.saoCuStatsBO, ref.saoCuStatsBO, sbuf2, pbuf3, 64, 60, 61, stats, count);
579
     }
580
 
581
     if (opt.saoCuStatsE0)
582
     {
583
         int32_t stats[33], count[33];
584
         HEADER0("saoCuStatsE0");
585
-        REPORT_SPEEDUP(opt.saoCuStatsE0, ref.saoCuStatsE0, pbuf2, pbuf3, 64, 60, 61, stats, count);
586
+        REPORT_SPEEDUP(opt.saoCuStatsE0, ref.saoCuStatsE0, sbuf2, pbuf3, 64, 60, 61, stats, count);
587
     }
588
 
589
     if (opt.saoCuStatsE1)
590
@@ -2761,7 +2947,7 @@
591
         int8_t upBuff1[MAX_CU_SIZE + 2];
592
         memset(upBuff1, 1, sizeof(upBuff1));
593
         HEADER0("saoCuStatsE1");
594
-        REPORT_SPEEDUP(opt.saoCuStatsE1, ref.saoCuStatsE1, pbuf2, pbuf3, 64, upBuff1 + 1,60, 61, stats, count);
595
+        REPORT_SPEEDUP(opt.saoCuStatsE1, ref.saoCuStatsE1, sbuf2, pbuf3, 64, upBuff1 + 1, 60, 61, stats, count);
596
     }
597
 
598
     if (opt.saoCuStatsE2)
599
@@ -2772,7 +2958,7 @@
600
         memset(upBuff1, 1, sizeof(upBuff1));
601
         memset(upBufft, -1, sizeof(upBufft));
602
         HEADER0("saoCuStatsE2");
603
-        REPORT_SPEEDUP(opt.saoCuStatsE2, ref.saoCuStatsE2, pbuf2, pbuf3, 64, upBuff1 + 1, upBufft + 1, 60, 61, stats, count);
604
+        REPORT_SPEEDUP(opt.saoCuStatsE2, ref.saoCuStatsE2, sbuf2, pbuf3, 64, upBuff1 + 1, upBufft + 1, 60, 61, stats, count);
605
     }
606
 
607
     if (opt.saoCuStatsE3)
608
@@ -2781,7 +2967,7 @@
609
         int32_t stats[5], count[5];
610
         memset(upBuff1, 1, sizeof(upBuff1));
611
         HEADER0("saoCuStatsE3");
612
-        REPORT_SPEEDUP(opt.saoCuStatsE3, ref.saoCuStatsE3, pbuf2, pbuf3, 64, upBuff1 + 1, 60, 61, stats, count);
613
+        REPORT_SPEEDUP(opt.saoCuStatsE3, ref.saoCuStatsE3, sbuf2, pbuf3, 64, upBuff1 + 1, 60, 61, stats, count);
614
     }
615
 
616
     if (opt.planecopy_sp)
617
@@ -2823,6 +3009,7 @@
618
         coefBuf[3 + 3 * 32] = 0x0BAD;
619
         REPORT_SPEEDUP(opt.findPosFirstLast, ref.findPosFirstLast, coefBuf, 32, g_scan4x4[SCAN_DIAG]);
620
     }
621
+
622
     if (opt.costCoeffNxN)
623
     {
624
         HEADER0("costCoeffNxN");
625
@@ -2841,6 +3028,7 @@
626
 
627
         REPORT_SPEEDUP(opt.costCoeffNxN, ref.costCoeffNxN, g_scan4x4[SCAN_DIAG], coefBuf, 32, tmpOut, ctxSig, 0xFFFF, ctx, 1, 15, 32);
628
     }
629
+
630
     if (opt.costCoeffRemain)
631
     {
632
         HEADER0("costCoeffRemain");
633
@@ -2849,4 +3037,37 @@
634
         memset(abscoefBuf + 32 * 31, 1, 32 * sizeof(uint16_t));
635
         REPORT_SPEEDUP(opt.costCoeffRemain, ref.costCoeffRemain, abscoefBuf, 16, 3);
636
     }
637
+
638
+    if (opt.costC1C2Flag)
639
+    {
640
+        HEADER0("costC1C2Flag");
641
+        ALIGN_VAR_32(uint16_t, abscoefBuf[C1FLAG_NUMBER]);
642
+        memset(abscoefBuf, 1, sizeof(abscoefBuf));
643
+        abscoefBuf[C1FLAG_NUMBER - 2] = 2;
644
+        abscoefBuf[C1FLAG_NUMBER - 1] = 3;
645
+        REPORT_SPEEDUP(opt.costC1C2Flag, ref.costC1C2Flag, abscoefBuf, C1FLAG_NUMBER, (uint8_t*)psbuf1, 1);
646
+    }
647
+
648
+    if (opt.planeClipAndMax)
649
+    {
650
+        HEADER0("planeClipAndMax");
651
+        uint64_t dummy;
652
+        REPORT_SPEEDUP(opt.planeClipAndMax, ref.planeClipAndMax, pbuf1, 128, 63, 62, &dummy, 1, PIXEL_MAX - 1);
653
+    }
654
+
655
+    if (opt.pelFilterLumaStrong[0])
656
+    {
657
+        int32_t tcP = (rand() % PIXEL_MAX) - 1;
658
+        int32_t tcQ = (rand() % PIXEL_MAX) - 1;
659
+        HEADER0("pelFilterLumaStrong_Vertical");
660
+        REPORT_SPEEDUP(opt.pelFilterLumaStrong[0], ref.pelFilterLumaStrong[0], pbuf1, STRIDE, 1, tcP, tcQ);
661
+    }
662
+
663
+    if (opt.pelFilterLumaStrong[1])
664
+    {
665
+        int32_t tcP = (rand() % PIXEL_MAX) - 1;
666
+        int32_t tcQ = (rand() % PIXEL_MAX) - 1;
667
+        HEADER0("pelFilterLumaStrong_Horizontal");
668
+        REPORT_SPEEDUP(opt.pelFilterLumaStrong[1], ref.pelFilterLumaStrong[1], pbuf1, 1, STRIDE, tcP, tcQ);
669
+    }
670
 }
671
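
Several of the new pixelharness.cpp checks above rely on the same guard-word idea to catch out-of-bounds writes from optimized kernels: a few words past the logical end of the output buffer are seeded with known magic values (0x0123, 0x4567, 0xBA98, 0xFEDC), the kernel runs, and the magic is re-verified afterwards. A generic sketch of that technique under illustrative names, not the harness's actual helpers:

    #include <cstdint>
    #include <cstring>

    enum { PAYLOAD = 16, GUARDS = 4 };
    static const uint16_t magic[GUARDS] = { 0x0123, 0x4567, 0xBA98, 0xFEDC };

    // seed guard words just past the payload before running the kernel
    static void seedGuards(uint16_t* buf) { memcpy(buf + PAYLOAD, magic, sizeof(magic)); }

    // after the kernel: any change to the guard words means it wrote past the end
    static bool guardsIntact(const uint16_t* buf) { return !memcmp(buf + PAYLOAD, magic, sizeof(magic)); }
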
x265_1.8.tar.gz/source/test/pixelharness.h -> x265_1.9.tar.gz/source/test/pixelharness.h Changed
43
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -40,6 +41,8 @@
10
     enum { TEST_CASES = 3 };
11
     enum { SMAX = 1 << 12 };
12
     enum { SMIN = -1 << 12 };
13
+    enum { RMAX = PIXEL_MAX - PIXEL_MIN }; //The maximum value obtained by subtracting pixel values (residual max)
14
+    enum { RMIN = PIXEL_MIN - PIXEL_MAX }; //The minimum value obtained by subtracting pixel values (residual min)
15
 
16
     ALIGN_VAR_32(pixel, pbuf1[BUFFSIZE]);
17
     pixel    pbuf2[BUFFSIZE];
18
@@ -64,6 +67,7 @@
19
     uint16_t ushort_test_buff[TEST_CASES][BUFFSIZE];
20
     uint8_t  uchar_test_buff[TEST_CASES][BUFFSIZE];
21
     double   double_test_buff[TEST_CASES][BUFFSIZE];
22
+    int16_t  residual_test_buff[TEST_CASES][BUFFSIZE];
23
 
24
     bool check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt);
25
     bool check_pixel_sse(pixel_sse_t ref, pixel_sse_t opt);
26
@@ -110,12 +114,15 @@
27
     bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
28
     bool check_cutree_propagate_cost(cutree_propagate_cost ref, cutree_propagate_cost opt);
29
     bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
30
-    bool check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt);
31
     bool check_calSign(sign_t ref, sign_t opt);
32
     bool check_scanPosLast(scanPosLast_t ref, scanPosLast_t opt);
33
     bool check_findPosFirstLast(findPosFirstLast_t ref, findPosFirstLast_t opt);
34
     bool check_costCoeffNxN(costCoeffNxN_t ref, costCoeffNxN_t opt);
35
     bool check_costCoeffRemain(costCoeffRemain_t ref, costCoeffRemain_t opt);
36
+    bool check_costC1C2Flag(costC1C2Flag_t ref, costC1C2Flag_t opt);
37
+    bool check_planeClipAndMax(planeClipAndMax_t ref, planeClipAndMax_t opt);
38
+    bool check_pelFilterLumaStrong_V(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
39
+    bool check_pelFilterLumaStrong_H(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
40
 
41
 public:
42
 
43
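
The RMAX/RMIN enums added above bound the values a pixel difference (residual) can take, which is what the retargeted sse_ss test now feeds its kernels. A worked example, assuming an 8-bit build (PIXEL_MIN = 0, PIXEL_MAX = 255, both assumptions of this sketch): RMAX = 255, RMIN = -255, and the fill used in pixelharness.cpp, (rand() % (2 * RMAX + 1)) - RMAX - 1, lands in [-RMAX - 1, RMAX - 1] = [-256, 254]:

    #include <cstdlib>

    enum { PIXEL_MIN = 0, PIXEL_MAX = 255 };          // 8-bit build assumed
    enum { RMAX = PIXEL_MAX - PIXEL_MIN,              // widest positive residual
           RMIN = PIXEL_MIN - PIXEL_MAX };            // widest negative residual

    // matches the random fill above; note it spans [-RMAX - 1, RMAX - 1]
    static short randomResidual() { return (short)((rand() % (2 * RMAX + 1)) - RMAX - 1); }
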
x265_1.8.tar.gz/source/test/regression-tests.txt -> x265_1.9.tar.gz/source/test/regression-tests.txt Changed
192
 
1
@@ -11,124 +11,132 @@
2
 # consistent across many machines, you must force a certain -FN so it is
3
 # not auto-detected.
4
 
5
+BasketballDrive_1920x1080_50.y4m,--preset ultrafast --signhide --colormatrix bt709
6
+BasketballDrive_1920x1080_50.y4m,--preset superfast --psy-rd 1 --ctu 16 --no-wpp --limit-modes
7
+BasketballDrive_1920x1080_50.y4m,--preset veryfast --tune zerolatency --no-temporal-mvp
8
 BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190
9
 BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless
10
 BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao
11
+BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 7000 --limit-modes,--preset medium --no-cutree --analysis-mode=load --bitrate 7000 --limit-modes
12
 BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16 --limit-refs 1
13
 BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0
14
-BasketballDrive_1920x1080_50.y4m,--preset superfast --psy-rd 1 --ctu 16 --no-wpp
15
-BasketballDrive_1920x1080_50.y4m,--preset ultrafast --signhide --colormatrix bt709
16
-BasketballDrive_1920x1080_50.y4m,--preset veryfast --tune zerolatency --no-temporal-mvp
17
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1
18
+BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-mode=save --bitrate 7000,--preset slower --no-cutree --analysis-mode=load --bitrate 7000
19
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3
20
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-mode=save --bitrate 7000 --tskip-fast,--preset veryslow --no-cutree --analysis-mode=load --bitrate 7000 --tskip-fast
21
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
22
+Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
23
+Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
24
+Coastguard-4k.y4m,--preset veryfast --no-cutree --analysis-mode=save --bitrate 15000,--preset veryfast --no-cutree --analysis-mode=load --bitrate 15000
25
 Coastguard-4k.y4m,--preset medium --rdoq-level 1 --tune ssim --no-signhide --me umh
26
 Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1
27
-Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
28
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full
29
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
30
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
31
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain
32
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --max-tu-size 4 --min-cu-size 32
33
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full
34
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset medium --no-wpp --no-cutree --no-strong-intra-smoothing --limit-refs 1
35
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset slow --no-wpp --tune ssim --transfer smpte240m
36
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset slower --tune ssim --tune fastdecode --limit-refs 2
37
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
38
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
39
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain
40
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1
41
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
42
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset ultrafast --weightp --no-wpp --no-open-gop
43
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
44
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers --limit-refs 2
45
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1 --limit-modes
46
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut
47
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16
48
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
49
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp --qg-size 16
50
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16 --limit-modes
51
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
52
+DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
53
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq
54
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3
55
-DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
56
-FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
57
 FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2
58
+FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
59
+FourPeople_1280x720_60.y4m,--preset medium --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
60
+FourPeople_1280x720_60.y4m,--preset veryslow --numa-pools "none"
61
+Keiba_832x480_30.y4m,--preset superfast --no-fast-intra --nr-intra 1000 -F4
62
 Keiba_832x480_30.y4m,--preset medium --pmode --tune grain
63
 Keiba_832x480_30.y4m,--preset slower --fast-intra --nr-inter 500 -F4 --limit-refs 0
64
-Keiba_832x480_30.y4m,--preset superfast --no-fast-intra --nr-intra 1000 -F4
65
-Kimono1_1920x1080_24_10bit_444.yuv,--preset medium --min-cu-size 32
66
 Kimono1_1920x1080_24_10bit_444.yuv,--preset superfast --weightb
67
-KristenAndSara_1280x720_60.y4m,--preset medium --no-cutree --max-tu-size 16
68
-KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0
69
-KristenAndSara_1280x720_60.y4m,--preset superfast --min-cu-size 16 --qg-size 16 --limit-refs 1
70
+Kimono1_1920x1080_24_10bit_444.yuv,--preset medium --min-cu-size 32
71
 KristenAndSara_1280x720_60.y4m,--preset ultrafast --strong-intra-smoothing
72
-NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain --limit-refs 2
73
+KristenAndSara_1280x720_60.y4m,--preset superfast --min-cu-size 16 --qg-size 16 --limit-refs 1
74
+KristenAndSara_1280x720_60.y4m,--preset medium --no-cutree --max-tu-size 16
75
+KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0 --limit-modes
76
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset superfast --tune psnr
77
-News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
78
+NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain --limit-refs 2
79
+NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-mode=save --bitrate 9000,--preset slow --no-cutree --analysis-mode=load --bitrate 9000
80
+News-4k.y4m,--preset ultrafast --no-cutree --analysis-mode=save --bitrate 15000,--preset ultrafast --no-cutree --analysis-mode=load --bitrate 15000
81
 News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0
82
+News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
83
+OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
84
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset medium --no-weightp
85
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune fastdecode
86
-OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
87
+ParkScene_1920x1080_24_10bit_444.yuv,--preset superfast --weightp --lookahead-slices 4
88
 ParkScene_1920x1080_24.y4m,--preset medium --qp 40 --rdpenalty 2 --tu-intra-depth 3
89
 ParkScene_1920x1080_24.y4m,--preset slower --no-weightp
90
-ParkScene_1920x1080_24_10bit_444.yuv,--preset superfast --weightp --lookahead-slices 4
91
+RaceHorses_416x240_30.y4m,--preset superfast --no-cutree
92
 RaceHorses_416x240_30.y4m,--preset medium --tskip-fast --tskip
93
 RaceHorses_416x240_30.y4m,--preset slower --keyint -1 --rdoq-level 0
94
-RaceHorses_416x240_30.y4m,--preset superfast --no-cutree
95
 RaceHorses_416x240_30.y4m,--preset veryslow --tskip-fast --tskip --limit-refs 3
96
-RaceHorses_416x240_30_10bit.yuv,--preset fast --lookahead-slices 2 --b-intra --limit-refs 1
97
-RaceHorses_416x240_30_10bit.yuv,--preset faster --rdoq-level 0 --dither
98
-RaceHorses_416x240_30_10bit.yuv,--preset slow --tune grain
99
 RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --tune psnr --limit-refs 1
100
 RaceHorses_416x240_30_10bit.yuv,--preset veryfast --weightb
101
+RaceHorses_416x240_30_10bit.yuv,--preset faster --rdoq-level 0 --dither
102
+RaceHorses_416x240_30_10bit.yuv,--preset fast --lookahead-slices 2 --b-intra --limit-refs 1
103
+RaceHorses_416x240_30_10bit.yuv,--preset slow --tune grain --limit-modes
104
 RaceHorses_416x240_30_10bit.yuv,--preset placebo --limit-refs 1
105
 SteamLocomotiveTrain_2560x1600_60_10bit_crop.yuv,--preset medium --dither
106
-big_buck_bunny_360p24.y4m,--preset faster --keyint 240 --min-keyint 60 --rc-lookahead 200
107
-big_buck_bunny_360p24.y4m,--preset medium --keyint 60 --min-keyint 48 --weightb --limit-refs 3
108
-big_buck_bunny_360p24.y4m,--preset slow --psy-rdoq 2.0 --rdoq-level 1 --no-b-intra
109
-big_buck_bunny_360p24.y4m,--preset superfast --psy-rdoq 2.0
110
 big_buck_bunny_360p24.y4m,--preset ultrafast --deblock=2
111
+big_buck_bunny_360p24.y4m,--preset superfast --psy-rdoq 2.0 --aq-mode 3
112
 big_buck_bunny_360p24.y4m,--preset veryfast --no-deblock
113
-city_4cif_60fps.y4m,--preset medium --crf 4 --cu-lossless --sao-non-deblock
114
+big_buck_bunny_360p24.y4m,--preset faster --keyint 240 --min-keyint 60 --rc-lookahead 200
115
+big_buck_bunny_360p24.y4m,--preset medium --keyint 60 --min-keyint 48 --weightb --limit-refs 3
116
+big_buck_bunny_360p24.y4m,--preset slow --psy-rdoq 2.0 --rdoq-level 1 --no-b-intra --aq-mode 3
117
 city_4cif_60fps.y4m,--preset superfast --rdpenalty 1 --tu-intra-depth 2
118
+city_4cif_60fps.y4m,--preset medium --crf 4 --cu-lossless --sao-non-deblock
119
 city_4cif_60fps.y4m,--preset slower --scaling-list default
120
 city_4cif_60fps.y4m,--preset veryslow --rdpenalty 2 --sao-non-deblock --no-b-intra --limit-refs 0
121
-ducks_take_off_420_720p50.y4m,--preset fast --deblock 6 --bframes 16 --rc-lookahead 40
122
+ducks_take_off_420_720p50.y4m,--preset ultrafast --constrained-intra --rd 1
123
+ducks_take_off_444_720p50.y4m,--preset superfast --weightp --limit-refs 2
124
 ducks_take_off_420_720p50.y4m,--preset faster --qp 24 --deblock -6 --limit-refs 2
125
+ducks_take_off_420_720p50.y4m,--preset fast --deblock 6 --bframes 16 --rc-lookahead 40
126
 ducks_take_off_420_720p50.y4m,--preset medium --tskip --tskip-fast --constrained-intra
127
-ducks_take_off_420_720p50.y4m,--preset slow --scaling-list default --qp 40
128
-ducks_take_off_420_720p50.y4m,--preset ultrafast --constrained-intra --rd 1
129
-ducks_take_off_420_720p50.y4m,--preset veryslow --constrained-intra --bframes 2
130
 ducks_take_off_444_720p50.y4m,--preset medium --qp 38 --no-scenecut
131
-ducks_take_off_444_720p50.y4m,--preset superfast --weightp --rd 0 --limit-refs 2
132
+ducks_take_off_420_720p50.y4m,--preset slow --scaling-list default --qp 40
133
 ducks_take_off_444_720p50.y4m,--preset slower --psy-rd 1 --psy-rdoq 2.0 --rdoq-level 1 --limit-refs 1
134
+ducks_take_off_420_720p50.y4m,--preset slower --no-wpp
135
+ducks_take_off_420_720p50.y4m,--preset veryslow --constrained-intra --bframes 2
136
+mobile_calendar_422_ntsc.y4m,--preset superfast --weightp
137
 mobile_calendar_422_ntsc.y4m,--preset medium --bitrate 500 -F4
138
 mobile_calendar_422_ntsc.y4m,--preset slower --tskip --tskip-fast
139
-mobile_calendar_422_ntsc.y4m,--preset superfast --weightp --rd 0
140
 mobile_calendar_422_ntsc.y4m,--preset veryslow --tskip --limit-refs 2
141
+old_town_cross_444_720p50.y4m,--preset ultrafast --weightp --min-cu 32
142
+old_town_cross_444_720p50.y4m,--preset superfast --weightp --min-cu 16 --limit-modes
143
+old_town_cross_444_720p50.y4m,--preset veryfast --qp 1 --tune ssim
144
 old_town_cross_444_720p50.y4m,--preset faster --rd 1 --tune zero-latency
145
+old_town_cross_444_720p50.y4m,--preset fast --no-cutree --analysis-mode=save --bitrate 3000 --early-skip,--preset fast --no-cutree --analysis-mode=load --bitrate 3000 --early-skip
146
 old_town_cross_444_720p50.y4m,--preset medium --keyint -1 --no-weightp --ref 6
147
 old_town_cross_444_720p50.y4m,--preset slow --rdoq-level 1 --early-skip --ref 7 --no-b-pyramid
148
 old_town_cross_444_720p50.y4m,--preset slower --crf 4 --cu-lossless
149
-old_town_cross_444_720p50.y4m,--preset superfast --weightp --min-cu 16
150
-old_town_cross_444_720p50.y4m,--preset ultrafast --weightp --min-cu 32
151
-old_town_cross_444_720p50.y4m,--preset veryfast --qp 1 --tune ssim
152
 parkrun_ter_720p50.y4m,--preset medium --no-open-gop --sao-non-deblock --crf 4 --cu-lossless
153
 parkrun_ter_720p50.y4m,--preset slower --fast-intra --no-rect --tune grain
154
-silent_cif_420.y4m,--preset medium --me full --rect --amp
155
 silent_cif_420.y4m,--preset superfast --weightp --rect
156
+silent_cif_420.y4m,--preset medium --me full --rect --amp
157
 silent_cif_420.y4m,--preset placebo --ctu 32 --no-sao --qg-size 16
158
-vtc1nw_422_ntsc.y4m,--preset medium --scaling-list default --ctu 16 --ref 5
159
-vtc1nw_422_ntsc.y4m,--preset slower --nr-inter 1000 -F4 --tune fast-decode --qg-size 16
160
+washdc_422_ntsc.y4m,--preset ultrafast --weightp --tu-intra-depth 4
161
 vtc1nw_422_ntsc.y4m,--preset superfast --weightp --nr-intra 100 -F4
162
-washdc_422_ntsc.y4m,--preset faster --rdoq-level 1 --max-merge 5
163
-washdc_422_ntsc.y4m,--preset medium --no-weightp --max-tu-size 4 --limit-refs 1
164
-washdc_422_ntsc.y4m,--preset slower --psy-rdoq 2.0 --rdoq-level 2 --qg-size 32 --limit-refs 1
165
 washdc_422_ntsc.y4m,--preset superfast --psy-rd 1 --tune zerolatency
166
-washdc_422_ntsc.y4m,--preset ultrafast --weightp --tu-intra-depth 4
167
 washdc_422_ntsc.y4m,--preset veryfast --tu-inter-depth 4
168
-washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless --limit-refs 3
169
-BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 15000,--preset medium --no-cutree --analysis-mode=load --bitrate 13000,--preset medium --no-cutree --analysis-mode=load --bitrate 11000,--preset medium --no-cutree --analysis-mode=load --bitrate 9000,--preset medium --no-cutree --analysis-mode=load --bitrate 7000
170
-NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-mode=save --bitrate 15000,--preset slow --no-cutree --analysis-mode=load --bitrate 13000,--preset slow --no-cutree --analysis-mode=load --bitrate 11000,--preset slow --no-cutree --analysis-mode=load --bitrate 9000,--preset slow --no-cutree --analysis-mode=load --bitrate 7000
171
-old_town_cross_444_720p50.y4m,--preset veryslow --no-cutree --analysis-mode=save --bitrate 15000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 13000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 11000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 9000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 7000 --early-skip
172
-Johnny_1280x720_60.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 15000 --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 13000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 11000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 9000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 7000  --tskip-fast
173
-BasketballDrive_1920x1080_50.y4m,--preset medium --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
174
-FourPeople_1280x720_60.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
175
-FourPeople_1280x720_60.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
176
+washdc_422_ntsc.y4m,--preset faster --rdoq-level 1 --max-merge 5
177
+vtc1nw_422_ntsc.y4m,--preset medium --scaling-list default --ctu 16 --ref 5
178
+washdc_422_ntsc.y4m,--preset medium --no-weightp --max-tu-size 4 --limit-refs 1 --aq-mode 2
179
+vtc1nw_422_ntsc.y4m,--preset slower --nr-inter 1000 -F4 --tune fast-decode --qg-size 16
180
+washdc_422_ntsc.y4m,--preset slower --psy-rdoq 2.0 --rdoq-level 2 --qg-size 32 --limit-refs 1
181
+washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless --limit-refs 3 --limit-modes
182
+
183
+# Main12 intraCost overflow bug test
184
+720p50_parkrun_ter.y4m,--preset medium
185
 
186
 # interlace test, even though input YUV is not field separated
187
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --interlace bff
188
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --interlace tff
189
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --interlace bff
190
 
191
 # vim: tw=200
192
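
A note on the multi-command entries above: each regression line pairs a clip with one or more comma-separated x265 command lines, and when several are given the harness appears to run them in sequence against the same clip — which is how the --analysis-mode=save/--analysis-mode=load pairs share analysis data between passes. An equivalent two-pass invocation from the shell might look like this (file names illustrative; the analysis data lands in x265's default analysis file):

    # pass 1: encode and save analysis data
    x265 --preset fast --no-cutree --analysis-mode=save --bitrate 3000 --early-skip \
         --input old_town_cross_444_720p50.y4m -o save.hevc
    # pass 2: re-encode, reusing the saved analysis data
    x265 --preset fast --no-cutree --analysis-mode=load --bitrate 3000 --early-skip \
         --input old_town_cross_444_720p50.y4m -o load.hevc
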
x265_1.8.tar.gz/source/test/smoke-tests.txt -> x265_1.9.tar.gz/source/test/smoke-tests.txt Changed
8
 
1
@@ -19,3 +19,6 @@
2
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset=medium --max-tu-size 16
3
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryfast --min-cu 16
4
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=fast --weightb --interlace bff
5
+
6
+# Main12 intraCost overflow bug test
7
+720p50_parkrun_ter.y4m,--preset medium
8
x265_1.8.tar.gz/source/test/testbench.cpp -> x265_1.9.tar.gz/source/test/testbench.cpp Changed
9
 
1
@@ -4,6 +4,7 @@
2
  * Authors: Gopu Govindaswamy <gopu@govindaswamy.org>
3
  *          Mandar Gurav <mandar@multicorewareinc.com>
4
  *          Mahesh Pittala <mahesh@multicorewareinc.com>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/test/testharness.h -> x265_1.9.tar.gz/source/test/testharness.h Changed
9
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/x265-extras.cpp -> x265_1.9.tar.gz/source/x265-extras.cpp Changed
204
 
1
@@ -36,7 +36,7 @@
2
     "I count, I ave-QP, I kbps, I-PSNR Y, I-PSNR U, I-PSNR V, I-SSIM (dB), "
3
     "P count, P ave-QP, P kbps, P-PSNR Y, P-PSNR U, P-PSNR V, P-SSIM (dB), "
4
     "B count, B ave-QP, B kbps, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), "
5
-    "Version\n";
6
+    "MaxCLL, MaxFALL, Version\n";
7
 
8
 FILE* x265_csvlog_open(const x265_api& api, const x265_param& param, const char* fname, int level)
9
 {
10
@@ -61,54 +61,58 @@
11
         {
12
             if (level)
13
             {
14
-                fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, ");
15
+                fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, ");
16
                 if (param.rc.rateControlMode == X265_RC_CRF)
17
                     fprintf(csvfp, "RateFactor, ");
18
-                fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, SSIM, SSIM (dB),  List 0, List 1");
19
-                /* detailed performance statistics */
20
-                fprintf(csvfp, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms), Stall Time (ms), Avg WPP, Row Blocks");
21
-                if (level >= 2)
22
+                if (param.bEnablePsnr)
23
+                    fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, ");
24
+                if (param.bEnableSsim)
25
+                    fprintf(csvfp, "SSIM, SSIM(dB), ");
26
+                fprintf(csvfp, "Latency, ");
27
+                fprintf(csvfp, "List 0, List 1");
28
+                uint32_t size = param.maxCUSize;
29
+                for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
30
+                {
31
+                    fprintf(csvfp, ", Intra %dx%d DC, Intra %dx%d Planar, Intra %dx%d Ang", size, size, size, size, size, size);
32
+                    size /= 2;
33
+                }
34
+                fprintf(csvfp, ", 4x4");
35
+                size = param.maxCUSize;
36
+                if (param.bEnableRectInter)
37
                 {
38
-                    uint32_t size = param.maxCUSize;
39
-                    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
40
-                    {
41
-                        fprintf(csvfp, ", Intra %dx%d DC, Intra %dx%d Planar, Intra %dx%d Ang", size, size, size, size, size, size);
42
-                        size /= 2;
43
-                    }
44
-                    fprintf(csvfp, ", 4x4");
45
-                    size = param.maxCUSize;
46
-                    if (param.bEnableRectInter)
47
-                    {
48
-                        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
49
-                        {
50
-                            fprintf(csvfp, ", Inter %dx%d, Inter %dx%d (Rect)", size, size, size, size);
51
-                            if (param.bEnableAMP)
52
-                                fprintf(csvfp, ", Inter %dx%d (Amp)", size, size);
53
-                            size /= 2;
54
-                        }
55
-                    }
56
-                    else
57
-                    {
58
-                        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
59
-                        {
60
-                            fprintf(csvfp, ", Inter %dx%d", size, size);
61
-                            size /= 2;
62
-                        }
63
-                    }
64
-                    size = param.maxCUSize;
65
                     for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
66
                     {
67
-                        fprintf(csvfp, ", Skip %dx%d", size, size);
68
+                        fprintf(csvfp, ", Inter %dx%d, Inter %dx%d (Rect)", size, size, size, size);
69
+                        if (param.bEnableAMP)
70
+                            fprintf(csvfp, ", Inter %dx%d (Amp)", size, size);
71
                         size /= 2;
72
                     }
73
-                    size = param.maxCUSize;
74
+                }
75
+                else
76
+                {
77
                     for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
78
                     {
79
-                        fprintf(csvfp, ", Merge %dx%d", size, size);
80
+                        fprintf(csvfp, ", Inter %dx%d", size, size);
81
                         size /= 2;
82
                     }
83
-                    fprintf(csvfp, ", Avg Luma Distortion, Avg Chroma Distortion, Avg psyEnergy, Avg Luma Level, Max Luma Level");
84
                 }
85
+                size = param.maxCUSize;
86
+                for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
87
+                {
88
+                    fprintf(csvfp, ", Skip %dx%d", size, size);
89
+                    size /= 2;
90
+                }
91
+                size = param.maxCUSize;
92
+                for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
93
+                {
94
+                    fprintf(csvfp, ", Merge %dx%d", size, size);
95
+                    size /= 2;
96
+                }
97
+                fprintf(csvfp, ", Avg Luma Distortion, Avg Chroma Distortion, Avg psyEnergy, Avg Luma Level, Max Luma Level, Avg Residual Energy");
98
+
99
+                /* detailed performance statistics */
100
+                if (level >= 2)
101
+                    fprintf(csvfp, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms), Stall Time (ms), Avg WPP, Row Blocks");
102
                 fprintf(csvfp, "\n");
103
             }
104
             else
105
@@ -125,17 +129,14 @@
106
         return;
107
 
108
     const x265_frame_stats* frameStats = &pic.frameData;
109
-    fprintf(csvfp, "%d, %c-SLICE, %4d, %2.2lf, %10d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc, frameStats->qp, (int)frameStats->bits);
110
+    fprintf(csvfp, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc, frameStats->qp, (int)frameStats->bits, frameStats->bScenecut);
111
     if (param.rc.rateControlMode == X265_RC_CRF)
112
         fprintf(csvfp, "%.3lf,", frameStats->rateFactor);
113
     if (param.bEnablePsnr)
114
         fprintf(csvfp, "%.3lf, %.3lf, %.3lf, %.3lf,", frameStats->psnrY, frameStats->psnrU, frameStats->psnrV, frameStats->psnr);
115
-    else
116
-        fputs(" -, -, -, -,", csvfp);
117
     if (param.bEnableSsim)
118
         fprintf(csvfp, " %.6f, %6.3f,", frameStats->ssim, x265_ssim2dB(frameStats->ssim));
119
-    else
120
-        fputs(" -, -,", csvfp);
121
+    fprintf(csvfp, "%d, ", frameStats->frameLatency);
122
     if (frameStats->sliceType == 'I')
123
         fputs(" -, -,", csvfp);
124
     else
125
@@ -154,32 +155,33 @@
126
         else
127
             fputs(" -,", csvfp);
128
     }
129
-    fprintf(csvfp, " %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf,", frameStats->decideWaitTime, frameStats->row0WaitTime, frameStats->wallTime, frameStats->refWaitWallTime, frameStats->totalCTUTime, frameStats->stallTime);
130
-    fprintf(csvfp, " %.3lf, %d", frameStats->avgWPP, frameStats->countRowBlocks);
131
-    if (level >= 2)
132
+    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
133
+        fprintf(csvfp, "%5.2lf%%, %5.2lf%%, %5.2lf%%,", frameStats->cuStats.percentIntraDistribution[depth][0], frameStats->cuStats.percentIntraDistribution[depth][1], frameStats->cuStats.percentIntraDistribution[depth][2]);
134
+    fprintf(csvfp, "%5.2lf%%", frameStats->cuStats.percentIntraNxN);
135
+    if (param.bEnableRectInter)
136
     {
137
         for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
138
-            fprintf(csvfp, ", %5.2lf%%, %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentIntraDistribution[depth][0], frameStats->cuStats.percentIntraDistribution[depth][1], frameStats->cuStats.percentIntraDistribution[depth][2]);
139
-        fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentIntraNxN);
140
-        if (param.bEnableRectInter)
141
         {
142
-            for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
143
-            {
144
-                fprintf(csvfp, ", %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0], frameStats->cuStats.percentInterDistribution[depth][1]);
145
-                if (param.bEnableAMP)
146
-                    fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][2]);
147
-            }
148
+            fprintf(csvfp, ", %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0], frameStats->cuStats.percentInterDistribution[depth][1]);
149
+            if (param.bEnableAMP)
150
+                fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][2]);
151
         }
152
-        else
153
-        {
154
-            for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
155
-                fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0]);
156
-        }
157
-        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
158
-            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentSkipCu[depth]);
159
+    }
160
+    else
161
+    {
162
         for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
163
-            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentMergeCu[depth]);
164
-        fprintf(csvfp, ", %.2lf, %.2lf, %.2lf, %.2lf, %d", frameStats->avgLumaDistortion, frameStats->avgChromaDistortion, frameStats->avgPsyEnergy, frameStats->avgLumaLevel, frameStats->maxLumaLevel);
165
+            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0]);
166
+    }
167
+    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
168
+        fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentSkipCu[depth]);
169
+    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
170
+        fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentMergeCu[depth]);
171
+    fprintf(csvfp, ", %.2lf, %.2lf, %.2lf, %.2lf, %d, %.2lf", frameStats->avgLumaDistortion, frameStats->avgChromaDistortion, frameStats->avgPsyEnergy, frameStats->avgLumaLevel, frameStats->maxLumaLevel, frameStats->avgResEnergy);
172
+
173
+    if (level >= 2)
174
+    {
175
+        fprintf(csvfp, ", %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf,", frameStats->decideWaitTime, frameStats->row0WaitTime, frameStats->wallTime, frameStats->refWaitWallTime, frameStats->totalCTUTime, frameStats->stallTime);
176
+        fprintf(csvfp, " %.3lf, %d", frameStats->avgWPP, frameStats->countRowBlocks);
177
     }
178
     fprintf(csvfp, "\n");
179
     fflush(stderr);
180
@@ -198,11 +200,13 @@
181
     }
182
 
183
     // CLI arguments or other
184
+    fputc('"', csvfp);
185
     for (int i = 1; i < argc; i++)
186
     {
187
-        if (i) fputc(' ', csvfp);
188
+        fputc(' ', csvfp);
189
         fputs(argv[i], csvfp);
190
     }
191
+    fputc('"', csvfp);
192
 
193
     // current date and time
194
     time_t now;
195
@@ -273,7 +277,7 @@
196
     else
197
         fprintf(csvfp, " -, -, -, -, -, -, -,");
198
 
199
-    fprintf(csvfp, " %s\n", api.version_str);
200
+    fprintf(csvfp, " %-6u, %-6u, %s\n", stats.maxCLL, stats.maxFALL, api.version_str);
201
 }
202
 
203
 /* The dithering algorithm is based on Sierra-2-4A error diffusion. */
204
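
The CSV logging rework above adds Scenecut and Latency columns, prints the PSNR/SSIM headers only when the corresponding stats are enabled, moves the per-depth CU distribution columns out of the level >= 2 block (they are now written whenever frame logging is on), gates the detailed performance timings behind level >= 2 instead, and appends MaxCLL/MaxFALL to the summary row. A minimal sketch of driving these helpers — the x265_csvlog_frame parameter list is inferred from its call sites above, so treat it as an assumption:

    /* sketch: frame-level CSV logging via the x265-extras helpers */
    FILE* csvfp = x265_csvlog_open(*api, *param, "frames.csv", 1 /* level > 0 */);
    /* ... for each reconstructed picture returned by x265_encoder_encode(): */
    if (csvfp)
        x265_csvlog_frame(csvfp, *param, pic_out, 1);
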
x265_1.8.tar.gz/source/x265.cpp -> x265_1.9.tar.gz/source/x265.cpp Changed
9
 
1
@@ -486,6 +486,7 @@
2
             pic_org.forceqp = qp + 1;
3
         if (type == 'I') pic_org.sliceType = X265_TYPE_IDR;
4
         else if (type == 'i') pic_org.sliceType = X265_TYPE_I;
5
+        else if (type == 'K') pic_org.sliceType = param->bOpenGOP ? X265_TYPE_I : X265_TYPE_IDR;
6
         else if (type == 'P') pic_org.sliceType = X265_TYPE_P;
7
         else if (type == 'B') pic_org.sliceType = X265_TYPE_BREF;
8
         else if (type == 'b') pic_org.sliceType = X265_TYPE_B;
9
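
With the hunk above, a --qpfile frametype of 'K' requests a keyframe of whichever kind the GOP structure allows: X265_TYPE_I under --open-gop, X265_TYPE_IDR otherwise. Using the qpfile format documented in the CLI help further below ("framenumber frametype QP", QP optional), an illustrative qpfile:

    0  I 22
    48 K
    96 K 24
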
x265_1.8.tar.gz/source/x265.def.in -> x265_1.9.tar.gz/source/x265.def.in Changed
6
 
1
@@ -22,3 +22,4 @@
2
 x265_cleanup
3
 x265_api_get_${X265_BUILD}
4
 x265_api_query
5
+x265_encoder_intra_refresh
6
x265_1.8.tar.gz/source/x265.h -> x265_1.9.tar.gz/source/x265.h Changed
223
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -91,13 +92,15 @@
10
 /* Stores all analysis data for a single frame */
11
 typedef struct x265_analysis_data
12
 {
13
-    void*            interData;
14
-    void*            intraData;
15
+    int64_t          satdCost;
16
     uint32_t         frameRecordSize;
17
     uint32_t         poc;
18
     uint32_t         sliceType;
19
     uint32_t         numCUsInFrame;
20
     uint32_t         numPartitions;
21
+    void*            interData;
22
+    void*            intraData;
23
+    int              bScenecut;
24
 } x265_analysis_data;
25
 
26
 /* cu statistics */
27
@@ -132,6 +135,7 @@
28
     double           avgLumaDistortion;
29
     double           avgChromaDistortion;
30
     double           avgPsyEnergy;
31
+    double           avgResEnergy;
32
     double           avgLumaLevel;
33
     uint64_t         bits;
34
     int              encoderOrder;
35
@@ -141,6 +145,8 @@
36
     int              list1POC[16];
37
     uint16_t         maxLumaLevel;
38
     char             sliceType;
39
+    int              bScenecut;
40
+    int              frameLatency;
41
     x265_cu_stats    cuStats;
42
 } x265_frame_stats;
43
 
44
@@ -205,6 +211,13 @@
45
      * this data structure */
46
     x265_analysis_data analysisData;
47
 
48
+    /* An array of quantizer offsets to be applied to this image during encoding.
49
+     * These are added on top of the decisions made by rateControl.
50
+     * Adaptive quantization must be enabled to use this feature. These quantizer
51
+     * offsets should be given for each 16x16 block. Behavior if quant
52
+     * offsets differ between encoding passes is undefined. */
53
+    float            *quantOffsets;
54
+
55
     /* Frame level statistics */
56
     x265_frame_stats frameData;
57
 
58
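
The quantOffsets field is the API-only "quant offsets" feature from the 1.9 changelog: one float per 16x16 block, applied on top of rate control's own decisions, and honored only when adaptive quantization is enabled. A sketch of wiring it up, assuming raster order for the block array (the header comment above does not spell the ordering out):

    #include <stdlib.h>
    #include <x265.h>

    /* sketch: one QP offset per 16x16 block; requires param->rc.aqMode != 0 */
    static float* attach_quant_offsets(const x265_param* param, x265_picture* pic)
    {
        int bx = (param->sourceWidth  + 15) / 16;
        int by = (param->sourceHeight + 15) / 16;
        float* off = (float*)calloc((size_t)bx * by, sizeof(float));
        if (off)
            off[0] = -2.0f;       /* e.g. lower the QP of the top-left block */
        pic->quantOffsets = off;  /* caller frees after the encode call */
        return off;
    }
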
@@ -378,6 +391,8 @@
59
     x265_sliceType_stats  statsI;               /* statistics of I slice */
60
     x265_sliceType_stats  statsP;               /* statistics of P slice */
61
     x265_sliceType_stats  statsB;               /* statistics of B slice */
62
+    uint16_t              maxCLL;               /* maximum content light level */
63
+    uint16_t              maxFALL;              /* maximum frame average light level */
64
 } x265_stats;
65
 
66
 /* String values accepted by x265_param_parse() (and CLI) for various parameters */
67
@@ -604,7 +619,7 @@
68
 
69
     /* Enables the emission of a user data SEI with the stream headers which
70
      * describes the encoder version, build info, and parameters. This is
71
-     * very helpful for debugging, but may interfere with regression tests. 
72
+     * very helpful for debugging, but may interfere with regression tests.
73
      * Default enabled */
74
     int       bEmitInfoSEI;
75
 
76
@@ -664,9 +679,9 @@
77
     int       bBPyramid;
78
 
79
     /* A value which is added to the cost estimate of B frames in the lookahead.
80
-     * It may be a positive value (making B frames appear more expensive, which
81
-     * causes the lookahead to chose more P frames) or negative, which makes the
82
-     * lookahead chose more B frames. Default is 0, there are no limits */
83
+     * It may be a positive value (making B frames appear less expensive, which
84
+     * biases the lookahead to choose more B frames) or negative, which makes the
85
+     * lookahead choose more P frames. Default is 0, there are no limits */
86
     int       bFrameBias;
87
 
88
     /* The number of frames that must be queued in the lookahead before it may
89
@@ -691,6 +706,11 @@
90
      * should detect scene cuts. The default (40) is recommended. */
91
     int       scenecutThreshold;
92
 
93
+    /* Replace keyframes by using a column of intra blocks that move across the video
94
+     * from one side to the other, thereby "refreshing" the image. In effect, instead of a
95
+     * big keyframe, the keyframe is "spread" over many frames. */
96
+    int       bIntraRefresh;
97
+
98
     /*== Coding Unit (CU) definitions ==*/
99
 
100
     /* Maximum CU width and height in pixels.  The size must be 64, 32, or 16.
101
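
Because intra refresh spreads the keyframe over many frames, there is no single IDR point to seek to, which suits low-latency streaming rather than file playback. An illustrative CLI pairing with the new switch (documented further below):

    x265 --preset medium --intra-refresh --keyint 250 --input in.y4m -o out.hevc
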
@@ -810,6 +830,9 @@
102
      * 4 split CUs at the next lower CU depth.  The two flags may be combined */
103
     uint32_t  limitReferences;
104
 
105
+    /* Limit modes analyzed for each CU using cost metrics from the 4 sub-CUs */
106
+    uint32_t limitModes;
107
+
108
     /* ME search method (DIA, HEX, UMH, STAR, FULL). The search patterns
109
      * (methods) are sorted in increasing complexity, with diamond being the
110
      * simplest and fastest and full being the slowest.  DIA, HEX, and UMH were
111
@@ -920,7 +943,7 @@
112
     /* Psycho-visual rate-distortion strength. Only has an effect in presets
113
      * which use RDO. It makes mode decision favor options which preserve the
114
      * energy of the source, at the cost of lost compression. The value must
115
-     * be between 0 and 2.0, 1.0 is typical. Default 0.3 */
116
+     * be between 0 and 5.0, 1.0 is typical. Default 2.0 */
117
     double    psyRd;
118
 
119
     /* Strength of psycho-visual optimizations in quantization. Only has an
120
@@ -1038,7 +1061,7 @@
121
 
122
         /* Enable slow and a more detailed first pass encode in multi pass rate control */
123
         int       bEnableSlowFirstPass;
124
-        
125
+
126
         /* rate-control overrides */
127
         int        zoneCount;
128
         x265_zone* zones;
129
@@ -1051,14 +1074,14 @@
130
          * values will affect all encoders in the same process */
131
         const char* lambdaFileName;
132
 
133
-        /* Enable stricter conditions to check bitrate deviations in CBR mode. May compromise 
134
+        /* Enable stricter conditions to check bitrate deviations in CBR mode. May compromise
135
          * quality to maintain bitrate adherence */
136
         int bStrictCbr;
137
 
138
-        /* Enable adaptive quantization at CU granularity. This parameter specifies 
139
-         * the minimum CU size at which QP can be adjusted, i.e. Quantization Group 
140
-         * (QG) size. Allowed values are 64, 32, 16 provided it falls within the 
141
-         * inclusuve range [maxCUSize, minCUSize]. Experimental, default: maxCUSize*/
142
+        /* Enable adaptive quantization at CU granularity. This parameter specifies
143
+         * the minimum CU size at which QP can be adjusted, i.e. Quantization Group
144
+         * (QG) size. Allowed values are 64, 32, 16 provided it falls within the
145
+         * inclusive range [maxCUSize, minCUSize]. Experimental, default: maxCUSize */
146
         uint32_t qgSize;
147
     } rc;
148
 
149
@@ -1165,12 +1188,27 @@
150
      * max,min luminance values. */
151
     const char* masteringDisplayColorVolume;
152
 
153
-    /* Content light level info SEI, specified as a string which is parsed when
154
-     * the stream header SEI are emitted. The string format is "%hu,%hu" where
155
-     * %hu are unsigned 16bit integers. The first value is the max content light
156
-     * level (or 0 if no maximum is indicated), the second value is the maximum
157
-     * picture average light level (or 0). */
158
-    const char* contentLightLevelInfo;
159
+    /* Maximum Content light level (MaxCLL), specified as an integer that indicates the
160
+     * maximum pixel intensity level in units of 1 candela per square metre of the
161
+     * bitstream. x265 will also calculate MaxCLL programmatically from the input
162
+     * pixel values and set it in the Content light level info SEI */
163
+    uint16_t maxCLL;
164
+
165
+    /* Maximum Frame Average Light Level (MaxFALL), specified as an integer that indicates
166
+     * the maximum frame average intensity level in units of 1 candela per square
167
+     * metre of the bitstream. x265 will also calculate MaxFALL programmatically
168
+     * from the input pixel values and set it in the Content light level info SEI */
169
+    uint16_t maxFALL;
170
+
171
+    /* Minimum luma level of input source picture, specified as an integer which
172
+     * would automatically increase any luma values below the specified --min-luma
173
+     * value to that value. */
174
+    uint16_t minLuma;
175
+
176
+    /* Maximum luma level of input source picture, specified as an integer which
177
+     * would automatically decrease any luma values above the specified --max-luma
178
+     * value to that value. */
179
+    uint16_t maxLuma;
180
 
181
 } x265_param;
182
 
183
@@ -1211,7 +1249,7 @@
184
     "main422-10", "main422-10-intra",
185
     "main444-10", "main444-10-intra",
186
 
187
-    "main12",     "main12-intra",                  /* Highly Experimental */
188
+    "main12",     "main12-intra",
189
     "main422-12", "main422-12-intra",
190
     "main444-12", "main444-12-intra",
191
 
192
@@ -1347,6 +1385,22 @@
193
  *      close an encoder handler */
194
 void x265_encoder_close(x265_encoder *);
195
 
196
+/* x265_encoder_intra_refresh:
197
+ *      If an intra refresh is not in progress, begin one with the next P-frame.
198
+ *      If an intra refresh is in progress, begin one as soon as the current one finishes.
199
+ *      Requires bIntraRefresh to be set.
200
+ *
201
+ *      Useful for interactive streaming where the client can tell the server that packet loss has
202
+ *      occurred.  In this case, keyint can be set to an extremely high value so that intra refreshes
203
+ *      occur only when calling x265_encoder_intra_refresh.
204
+ *
205
+ *      In multi-pass encoding, if x265_encoder_intra_refresh is called differently in each pass,
206
+ *      behavior is undefined.
207
+ *
208
+ *      Should not be called during an x265_encoder_encode. */
209
+
210
+int x265_encoder_intra_refresh(x265_encoder *);
211
+
212
 /* x265_cleanup:
213
  *       release library static allocations, reset configured CTU size */
214
 void x265_cleanup(void);
215
@@ -1394,6 +1448,7 @@
216
     void          (*cleanup)(void);
217
 
218
     int           sizeof_frame_stats;   /* sizeof(x265_frame_stats) */
219
+    int           (*encoder_intra_refresh)(x265_encoder*);
220
     /* add new pointers to the end, or increment X265_MAJOR_VERSION */
221
 } x265_api;
222
 
223
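
The new encoder_intra_refresh pointer is appended at the end of x265_api, following the struct's own ABI rule ("add new pointers to the end, or increment X265_MAJOR_VERSION"). A sketch of calling it through the dispatch table, assuming the loaded library is build 79 or newer so the entry exists:

    /* sketch: dispatch through the x265_api table (library build >= 79 assumed) */
    const x265_api* api = x265_api_get(0);   /* 0 selects the default bit depth */
    if (api && api->encoder_intra_refresh)
        api->encoder_intra_refresh(encoder);
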
x265_1.8.tar.gz/source/x265cli.h -> x265_1.9.tar.gz/source/x265cli.h Changed
98
 
1
@@ -116,6 +116,7 @@
2
     { "min-keyint",     required_argument, NULL, 'i' },
3
     { "scenecut",       required_argument, NULL, 0 },
4
     { "no-scenecut",          no_argument, NULL, 0 },
5
+    { "intra-refresh",        no_argument, NULL, 0 },
6
     { "rc-lookahead",   required_argument, NULL, 0 },
7
     { "lookahead-slices", required_argument, NULL, 0 },
8
     { "bframes",        required_argument, NULL, 'b' },
9
@@ -126,6 +127,8 @@
10
     { "b-pyramid",            no_argument, NULL, 0 },
11
     { "ref",            required_argument, NULL, 0 },
12
     { "limit-refs",     required_argument, NULL, 0 },
13
+    { "no-limit-modes",       no_argument, NULL, 0 },
14
+    { "limit-modes",          no_argument, NULL, 0 },
15
     { "no-weightp",           no_argument, NULL, 0 },
16
     { "weightp",              no_argument, NULL, 'w' },
17
     { "no-weightb",           no_argument, NULL, 0 },
18
@@ -192,6 +195,8 @@
19
     { "crop-rect",      required_argument, NULL, 0 }, /* DEPRECATED */
20
     { "master-display", required_argument, NULL, 0 },
21
     { "max-cll",        required_argument, NULL, 0 },
22
+    { "min-luma",       required_argument, NULL, 0 },
23
+    { "max-luma",       required_argument, NULL, 0 },
24
     { "no-dither",            no_argument, NULL, 0 },
25
     { "dither",               no_argument, NULL, 0 },
26
     { "no-repeat-headers",    no_argument, NULL, 0 },
27
@@ -251,14 +256,18 @@
28
     H0("   --log-level <string>          Logging level: none error warning info debug full. Default %s\n", X265_NS::logLevelNames[param->logLevel + 1]);
29
     H0("   --no-progress                 Disable CLI progress reports\n");
30
     H0("   --csv <filename>              Comma separated log file, if csv-log-level > 0 frame level statistics, else one line per run\n");
31
-    H0("   --csv-log-level               Level of csv logging, if csv-log-level > 0 frame level statistics, else one line per run: 0-2\n");
32
+    H0("   --csv-log-level <integer>     Level of csv logging, if csv-log-level > 0 frame level statistics, else one line per run: 0-2\n");
33
     H0("\nInput Options:\n");
34
     H0("   --input <filename>            Raw YUV or Y4M input file name. `-` for stdin\n");
35
     H1("   --y4m                         Force parsing of input stream as YUV4MPEG2 regardless of file extension\n");
36
     H0("   --fps <float|rational>        Source frame rate (float or num/denom), auto-detected if Y4M\n");
37
     H0("   --input-res WxH               Source picture size [w x h], auto-detected if Y4M\n");
38
     H1("   --input-depth <integer>       Bit-depth of input file. Default 8\n");
39
-    H1("   --input-csp <string>          Source color space: i420, i444 or i422, auto-detected if Y4M. Default: i420\n");
40
+    H1("   --input-csp <string>          Chroma subsampling, auto-detected if Y4M\n");
41
+    H1("                                 0 - i400 (4:0:0 monochrome)\n");
42
+    H1("                                 1 - i420 (4:2:0 default)\n");
43
+    H1("                                 2 - i422 (4:2:2)\n");
44
+    H1("                                 3 - i444 (4:4:4)\n");
45
     H0("-f/--frames <integer>            Maximum number of frames to encode. Default all\n");
46
     H0("   --seek <integer>              First frame to encode\n");
47
     H1("   --[no-]interlace <bff|tff>    Indicate input pictures are interlace fields in temporal order. Default progressive\n");
48
@@ -292,7 +301,7 @@
49
     H0("   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
50
     H0("\nAnalysis:\n");
51
     H0("   --rd <0..6>                   Level of RDO in mode decision 0:least....6:full RDO. Default %d\n", param->rdLevel);
52
-    H0("   --[no-]psy-rd <0..2.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
53
+    H0("   --[no-]psy-rd <0..5.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
54
     H0("   --[no-]rdoq-level <0|1|2>     Level of RDO in quantization 0:none, 1:levels, 2:levels & coding groups. Default %d\n", param->rdoqLevel);
55
     H0("   --[no-]psy-rdoq <0..50.0>     Strength of psycho-visual optimization in RDO quantization, 0 to disable. Default %.1f\n", param->psyRdoq);
56
     H0("   --[no-]early-skip             Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
57
@@ -308,12 +317,13 @@
58
     H0("\nTemporal / motion search options:\n");
59
     H0("   --max-merge <1..5>            Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand);
60
     H0("   --ref <integer>               max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences);
61
-    H0("   --limit-refs <0|1|2|3>        limit references per depth (1) or CU (2) or both (3). Default %d\n", param->limitReferences);
62
+    H0("   --limit-refs <0|1|2|3>        Limit references per depth (1) or CU (2) or both (3). Default %d\n", param->limitReferences);
63
     H0("   --me <string>                 Motion search method dia hex umh star full. Default %d\n", param->searchMethod);
64
     H0("-m/--subme <integer>             Amount of subpel refinement to perform (0:least .. 7:most). Default %d \n", param->subpelRefine);
65
     H0("   --merange <integer>           Motion search range. Default %d\n", param->searchRange);
66
     H0("   --[no-]rect                   Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter));
67
     H0("   --[no-]amp                    Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP));
68
+    H0("   --[no-]limit-modes            Limit rectangular and asymmetric motion predictions. Default %d\n", param->limitModes);
69
     H1("   --[no-]temporal-mvp           Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp));
70
     H0("\nSpatial / intra options:\n");
71
     H0("   --[no-]strong-intra-smoothing Enable strong intra smoothing for 32x32 blocks. Default %s\n", OPT(param->bEnableStrongIntraSmoothing));
72
@@ -327,6 +337,7 @@
73
     H0("-i/--min-keyint <integer>        Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
74
     H0("   --no-scenecut                 Disable adaptive I-frame decision\n");
75
     H0("   --scenecut <integer>          How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold);
76
+    H0("   --intra-refresh               Use Periodic Intra Refresh instead of IDR frames\n");
77
     H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
78
     H1("   --lookahead-slices <0..16>    Number of slices to use per lookahead cost estimate. Default %d\n", param->lookaheadSlices);
79
     H0("   --bframes <integer>           Maximum number of consecutive b-frames (now it only enables B GOP structure) Default %d\n", param->bframes);
80
@@ -335,7 +346,7 @@
81
     H0("   --[no-]b-pyramid              Use B-frames as references. Default %s\n", OPT(param->bBPyramid));
82
     H1("   --qpfile <string>             Force frametypes and QPs for some or all frames\n");
83
     H1("                                 Format of each line: framenumber frametype QP\n");
84
-    H1("                                 QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b.\n");
85
+    H1("                                 QP is optional (none lets x265 choose). Frametypes: I,i,K,P,B,b.\n");
86
     H1("                                 QPs are restricted by qpmin/qpmax.\n");
87
     H0("\nRate control, Adaptive Quantization:\n");
88
     H0("   --bitrate <integer>           Target bitrate (kbps) for ABR (implied). Default %d\n", param->rc.bitrate);
89
@@ -403,6 +414,8 @@
90
     H0("   --master-display <string>     SMPTE ST 2086 master display color volume info SEI (HDR)\n");
91
     H0("                                    format: G(x,y)B(x,y)R(x,y)WP(x,y)L(max,min)\n");
92
     H0("   --max-cll <string>            Emit content light level info SEI as \"cll,fall\" (HDR)\n");
93
+    H0("   --min-luma <integer>          Minimum luma plane value of input source picture\n");
94
+    H0("   --max-luma <integer>          Maximum luma plane value of input source picture\n");
95
     H0("\nBitstream options:\n");
96
     H0("   --[no-]repeat-headers         Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
97
     H0("   --[no-]info                   Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));
98