Changes of Revision 12

x265.changes Changed
x
 
1
@@ -1,4 +1,30 @@
2
 -------------------------------------------------------------------
3
+Wed Feb  3 13:22:42 UTC 2016 - idonmez@suse.com
4
+
5
+- Update to version 1.9
6
+  API Changes:
7
+  * x265_frame_stats returns many additional fields: maxCLL, maxFALL,
8
+    residual energy, scenecut and latency logging
9
+  * --qpfile now supports frametype 'K'
10
+  * x265 now allows CRF ratecontrol in pass N (N greater than or equal to 2)
11
+  * Chroma subsampling format YUV 4:0:0 is now fully supported and tested
12
+  New Features:
13
+  * Quant offsets: This feature allows block level quantization offsets
14
+    to be specified for every frame. An API-only feature.
15
+  * --intra-refresh: Keyframes can be replaced by a moving column
16
+    of intra blocks in non-keyframes.
17
+  * --limit-modes: Intelligently restricts mode analysis.
18
+  * --max-luma and --min-luma for luma clipping, optional for HDR use-cases
19
+  * Emergency denoising is now enabled by default in very low bitrate, 
20
+    VBV encodes
21
+  Presets and Performance:
22
+  * Recently added features lookahead-slices, limit-modes, limit-refs
23
+    have been enabled by default for applicable presets.
24
+  * The default psy-rd strength has been increased to 2.0
25
+  * Multi-socket machines now use a single pool of threads that can
26
+    work cross-socket.
27
+
28
+-------------------------------------------------------------------
29
 Fri Nov 27 18:21:04 UTC 2015 - aloisio@gmx.com
30
 
31
 - Update to version 1.8:
32
x265.spec Changed
64
 
1
@@ -1,10 +1,10 @@
2
 # based on the spec file from https://build.opensuse.org/package/view_file/home:Simmphonie/libx265/
3
 
4
 Name:           x265
5
-%define soname  68
6
+%define soname  79
7
 %define libname lib%{name}
8
 %define libsoname %{libname}-%{soname}
9
-Version:        1.8
10
+Version:        1.9
11
 Release:        0
12
 License:        GPL-2.0+
13
 Summary:        A free h265/HEVC encoder - encoder binary
14
@@ -43,35 +43,34 @@
15
 streams. 
16
 
17
 %prep
18
-%setup -q -n "%{name}_11047/build/linux"
19
-cd ../..
20
+%setup -q -n x265_%{version}
21
 %patch0 -p1
22
-cd -
23
+
24
 %define FAKE_BUILDDATE %(LC_ALL=C date -u -r %{_sourcedir}/%{name}.changes '+%%b %%e %%Y')
25
-sed -i -e "s/0.0/%{soname}.0/g" ../../source/cmake/version.cmake
26
+sed -i -e "s/0.0/%{soname}.0/g" source/cmake/version.cmake
27
 
28
 
29
 %build
30
-export CXXFLAGS="%optflags"
31
-export CFLAGS="%optflags"
32
-cmake  -DCMAKE_INSTALL_PREFIX=/usr -DENABLE_TESTS=ON -G "Unix Makefiles" ../../source
33
-cmake -DCMAKE_INSTALL_PREFIX=/usr ../../source
34
-#./make-Makefiles.bash
35
+export CXXFLAGS="%{optflags}"
36
+export CFLAGS="%{optflags}"
37
+
38
+cd build/linux
39
+cmake  -DCMAKE_INSTALL_PREFIX=%{_prefix} \
40
+       -DLIB_INSTALL_DIR=%{_lib} \
41
+       -DENABLE_TESTS=ON \
42
+       -G "Unix Makefiles" \
43
+       ../../source
44
+
45
 make %{?_smp_mflags} VERBOSE=1
46
 
47
 %install
48
+cd build/linux
49
 %makeinstall
50
-%ifarch x86_64
51
-  mv "%{buildroot}/usr/lib" "%{buildroot}%{_libdir}"
52
-%endif
53
 
54
 rm -f %{buildroot}%{_libdir}/%{libname}.a
55
 
56
 echo "%{libname}-%{soname}" > %{_sourcedir}/baselibs.conf
57
 
58
-%clean
59
-%{?buildroot:%__rm -rf "%{buildroot}"}
60
-
61
 %post -n %{libsoname} -p /sbin/ldconfig
62
 %postun -n %{libsoname} -p /sbin/ldconfig
63
 
64
x265_1.8.tar.gz/.hg_archival.txt -> x265_1.9.tar.gz/.hg_archival.txt Changed
9
 
1
@@ -1,5 +1,4 @@
2
 repo: 09fe40627f03a0f9c3e6ac78b22ac93da23f9fdf
3
-node: 5dcc9d3a928c400b41a3547d7bfee10340519e56
4
+node: 1d3b6e448e01ec40b392ef78b7e55a86249fbe68
5
 branch: stable
6
-latesttag: 1.8
7
-latesttagdistance: 1
8
+tag: 1.9
9
x265_1.8.tar.gz/doc/reST/cli.rst -> x265_1.9.tar.gz/doc/reST/cli.rst Changed
201
 
1
@@ -84,8 +84,8 @@
2
    it adds one line per run. If :option:`--csv-log-level` is greater than
3
    0, it writes one line per frame. Default none
4
 
5
-   When frame level logging is enabled, several frame performance
6
-   statistics are listed:
7
+   Several frame performance statistics are available when 
8
+   :option:`--csv-log-level` is greater than or equal to 2:
9
 
10
    **DecideWait ms** number of milliseconds the frame encoder had to
11
    wait, since the previous frame was retrieved by the API thread,
12
@@ -202,15 +202,29 @@
13
    "-"       - same as "none"
14
    "10"      - allocate one pool, using up to 10 cores on node 0
15
    "-,+"     - allocate one pool, using all cores on node 1
16
-   "+,-,+"   - allocate two pools, using all cores on nodes 0 and 2
17
-   "+,-,+,-" - allocate two pools, using all cores on nodes 0 and 2
18
-   "-,*"     - allocate three pools, using all cores on nodes 1, 2 and 3
19
+   "+,-,+"   - allocate one pool, using only cores on nodes 0 and 2
20
+   "+,-,+,-" - allocate one pool, using only cores on nodes 0 and 2
21
+   "-,*"     - allocate one pool, using all cores on nodes 1, 2 and 3
22
    "8,8,8,8" - allocate four pools with up to 8 threads in each pool
23
-
24
-   The total number of threads will be determined by the number of threads
25
-   assigned to all nodes. The worker threads will each be given affinity for
26
-   their node, they will not be allowed to migrate between nodes, but they
27
-   will be allowed to move between CPU cores within their node.
28
+   "8,+,+,+" - allocate two pools, the first with 8 threads on node 0, and the second with all cores on node 1,2,3
29
+
30
+   A thread pool dedicated to a given NUMA node is enabled only when the
31
+   number of threads to be created on that NUMA node is explicitly mentioned
32
+   in that corresponding position with the --pools option. Else, all threads
33
+   are spawned from a single pool. The total number of threads will be
34
+   determined by the number of threads assigned to the enabled NUMA nodes for
35
+   that pool. The worker threads are given affinity to all the enabled
36
+   NUMA nodes for that pool and may migrate between them, unless explicitly
37
+   specified as described above.
38
+
39
+   In the case that any threadpool has more than 64 threads, the threadpool
40
+   may be broken down into multiple pools of 64 threads each; on 32-bit
41
+   machines, this number is 32. All pools are given affinity to the NUMA
42
+   nodes on which the original pool had affinity. For performance reasons,
43
+   the last thread pool is spawned only if it has more than 32 threads for
44
+   64-bit machines, or 16 for 32-bit machines. If the total number of threads
45
+   in the system doesn't obey this constraint, we may spawn fewer threads
46
+   than cores, which has been empirically shown to be better for performance.
47
 
48
    If the four pool features: :option:`--wpp`, :option:`--pmode`,
49
    :option:`--pme` and :option:`--lookahead-slices` are all disabled,
50
@@ -219,10 +233,6 @@
51
    If "none" is specified, then all four of the thread pool features are
52
    implicitly disabled.
53
 
54
-   Multiple thread pools will be allocated for any NUMA node with more than
55
-   64 logical CPU cores. But any given thread pool will always use at most
56
-   one NUMA node.
57
-
58
    Frame encoders are distributed between the available thread pools,
59
    and the encoder will never generate more thread pools than
60
    :option:`--frame-threads`.  The pools are used for WPP and for
61
@@ -238,8 +248,12 @@
62
    system, a POSIX build of libx265 without libnuma will be less work
63
    efficient. See :ref:`thread pools <pools>` for more detail.
64
 
65
-   Default "", one thread is allocated per detected hardware thread
66
-   (logical CPU cores) and one thread pool per NUMA node.
67
+   Default "", one pool is created across all available NUMA nodes, with
68
+   one thread allocated per detected hardware thread
69
+   (logical CPU cores). In the case that the total number of threads is more
70
+   than the maximum size that ATOMIC operations can handle (32 for 32-bit
71
+   compiles, and 64 for 64-bit compiles), multiple thread pools may be
72
+   spawned subject to the performance constraint described above.
73
 
74
    Note that the string value will need to be escaped or quoted to
75
    protect against shell expansion on many platforms
76
@@ -353,7 +367,7 @@
77
 
78
    **CLI ONLY**
79
 
80
-.. option:: --total-frames <integer>
81
+.. option:: --frames <integer>
82
 
83
    The number of frames intended to be encoded.  It may be left
84
    unspecified, but when it is specified rate control can make use of
85
@@ -377,15 +391,15 @@
86
 
87
 .. option:: --input-csp <integer|string>
88
 
89
-   YUV only: Source color space. Only i420, i422, and i444 are
90
-   supported at this time. The internal color space is always the
91
-   same as the source color space (libx265 does not support any color
92
-   space conversions).
93
+   Chroma Subsampling (YUV only):  Only 4:0:0(monochrome), 4:2:0, 4:2:2, and 4:4:4 are supported at this time. 
94
+   The chroma subsampling format of your input must match your desired output chroma subsampling format 
95
+   (libx265 will not perform any chroma subsampling conversion), and it must be supported by the 
96
+   HEVC profile you have specified.
97
 
98
-   0. i400
99
-   1. i420 **(default)**
100
-   2. i422
101
-   3. i444
102
+   0. i400 (4:0:0 monochrome) - Not supported by Main or Main10 profiles
103
+   1. i420 (4:2:0 default)    - Supported by all HEVC profiles
104
+   2. i422 (4:2:2)            - Not supported by Main, Main10 and Main12 profiles
105
+   3. i444 (4:4:4)            - Supported by Main 4:4:4, Main 4:4:4 10, Main 4:4:4 12, Main 4:4:4 16 Intra profiles
106
    4. nv12
107
    5. nv16
108
 
109
@@ -436,8 +450,8 @@
110
    depth of the encoder. If the requested bit depth is not the bit
111
    depth of the linked libx265, it will attempt to bind libx265_main
112
    for an 8bit encoder, libx265_main10 for a 10bit encoder, or
113
-   libx265_main12 for a 12bit encoder (EXPERIMENTAL), with the
114
-   same API version as the linked libx265.
115
+   libx265_main12 for a 12bit encoder, with the same API version as the
116
+   linked libx265.
117
 
118
    If the output depth is not specified but :option:`--profile` is
119
    specified, the output depth will be derived from the profile name.
120
@@ -486,13 +500,6 @@
121
    The CLI application will derive the output bit depth from the
122
    profile name if :option:`--output-depth` is not specified.
123
 
124
-.. note::
125
-
126
-   All 12bit presets are extremely unstable, do not use them yet.
127
-   16bit is not supported at all, but those profiles are included
128
-   because it is possible for libx265 to make bitstreams compatible
129
-   with them.
130
-
131
 .. option:: --level-idc <integer|float>
132
 
133
    Minimum decoder requirement level. Defaults to 0, which implies
134
@@ -606,7 +613,8 @@
135
    +-------+---------------------------------------------------------------+
136
    | Level | Description                                                   |
137
    +=======+===============================================================+
138
-   | 0     | sa8d mode and split decisions, intra w/ source pixels         |
139
+   | 0     | sa8d mode and split decisions, intra w/ source pixels,        |
140
+   |       | currently not supported                                       |
141
    +-------+---------------------------------------------------------------+
142
    | 1     | recon generated (better intra), RDO merge/skip selection      |
143
    +-------+---------------------------------------------------------------+
144
@@ -677,7 +685,16 @@
145
    (within your decoder level limits) if you enable one or
146
    both of these flags.
147
 
148
-   This feature is EXPERIMENTAL and functional at all RD levels.
149
+   Default 3.
150
+
151
+.. option:: --limit-modes, --no-limit-modes
152
+    
153
+   When enabled, limit-modes will limit modes analyzed for each CU using cost 
154
+   metrics from the 4 sub-CUs. When multiple inter modes like :option:`--rect`
155
+   and/or :option:`--amp` are enabled, this feature will use motion cost 
156
+   heuristics from the 4 sub-CUs to bypass modes that are unlikely to be the 
157
+   best choice. This can significantly improve performance when :option:`--rect`
158
+   and/or :option:`--amp` are enabled at minimal compression efficiency loss.
159
 
160
 .. option:: --rect, --no-rect
161
 
162
@@ -1049,9 +1066,9 @@
163
    energy of the source image in the encoded image at the expense of
164
    compression efficiency. It only has effect on presets which use
165
    RDO-based mode decisions (:option:`--rd` 3 and above). 1.0 is a
166
-   typical value. Default 0.3
167
+   typical value. Default 2.0
168
 
169
-   **Range of values:** 0 .. 2.0
170
+   **Range of values:** 0 .. 5.0
171
 
172
 .. option:: --psy-rdoq <float>
173
 
174
@@ -1076,7 +1093,8 @@
175
 
176
    Max intra period in frames. A special case of infinite-gop (single
177
    keyframe at the beginning of the stream) can be triggered with
178
-   argument -1. Use 1 to force all-intra. Default 250
179
+   argument -1. Use 1 to force all-intra. When intra-refresh is enabled
180
+   it specifies the interval between which refresh sweeps happen. Default 250
181
 
182
 .. option:: --min-keyint, -i <integer>
183
 
184
@@ -1095,6 +1113,14 @@
185
    :option:`--scenecut` 0 or :option:`--no-scenecut` disables adaptive
186
    I frame placement. Default 40
187
 
188
+.. option:: --intra-refresh
189
+
190
+   Enables Periodic Intra Refresh (PIR) instead of keyframe insertion.
191
+   PIR can replace keyframes by inserting a column of intra blocks in 
192
+   non-keyframes, that move across the video from one side to the other
193
+   and thereby refresh the image but over a period of multiple 
194
+   frames instead of a single keyframe.
195
+
196
 .. option:: --rc-lookahead <integer>
197
 
198
    Number of frames for slice-type decision lookahead (a key
199
@@ -1108,21 +1134,31 @@
200
 
201
x265_1.8.tar.gz/doc/reST/presets.rst -> x265_1.9.tar.gz/doc/reST/presets.rst Changed
154
 
1
@@ -6,76 +6,83 @@
2
 Presets
3
 =======
4
 
5
-x265 has a number of predefined :option:`--preset` options that make
6
-trade-offs between encode speed (encoded frames per second) and
7
+x265 has ten predefined :option:`--preset` options that optimize the
8
+trade-off between encoding speed (encoded frames per second) and
9
 compression efficiency (quality per bit in the bitstream).  The default
10
-preset is medium, it does a reasonably good job of finding the best
11
-possible quality without spending enormous CPU cycles looking for the
12
-absolute most efficient way to achieve that quality.  As you go higher
13
-than medium, the encoder takes shortcuts to improve performance at the
14
-expense of quality and compression efficiency.  As you go lower than
15
-medium, the encoder tries harder and harder to achieve the best quailty
16
-per bit compression ratio.
17
-
18
-The presets adjust encoder parameters to affect these trade-offs.
19
-
20
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
21
-|              | ultrafast | superfast | veryfast | faster | fast | medium | slow | slower | veryslow | placebo |
22
-+==============+===========+===========+==========+========+======+========+======+========+==========+=========+
23
-| ctu          |   32      |    32     |   32     |  64    |  64  |   64   |  64  |  64    |   64     |   64    |
24
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
25
-| min-cu-size  |   16      |     8     |    8     |   8    |   8  |    8   |   8  |   8    |    8     |    8    |
26
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
27
-| bframes      |    3      |     3     |    4     |   4    |  4   |    4   |  4   |   8    |    8     |    8    |
28
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
29
-| b-adapt      |    0      |     0     |    0     |   0    |  0   |    2   |  2   |   2    |    2     |    2    |
30
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
31
-| rc-lookahead |    5      |    10     |   15     |  15    |  15  |   20   |  25  |   30   |   40     |   60    |
32
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
33
-| scenecut     |    0      |    40     |   40     |  40    |  40  |   40   |  40  |   40   |   40     |   40    |
34
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
35
-| refs         |    1      |     1     |    1     |   1    |  2   |    3   |  3   |   3    |    5     |    5    |
36
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
37
-| me           |   dia     |   hex     |   hex    |  hex   | hex  |   hex  | star |  star  |   star   |   star  |
38
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
39
-| merange      |   57      |    57     |   57     |  57    |  57  |   57   | 57   |  57    |   57     |   92    |
40
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
41
-| subme        |    0      |     1     |    1     |   2    |  2   |    2   |  3   |   3    |    4     |    5    |
42
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
43
-| rect         |    0      |     0     |    0     |   0    |  0   |    0   |  1   |   1    |    1     |    1    |
44
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
45
-| amp          |    0      |     0     |    0     |   0    |  0   |    0   |  0   |   1    |    1     |    1    |
46
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
47
-| max-merge    |    2      |     2     |    2     |   2    |  2   |    2   |  3   |   3    |    4     |    5    |
48
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
49
-| early-skip   |    1      |     1     |    1     |   1    |  0   |    0   |  0   |   0    |    0     |    0    |
50
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
51
-| fast-intra   |    1      |     1     |    1     |   1    |  1   |    0   |  0   |   0    |    0     |    0    |
52
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
53
-| b-intra      |    0      |     0     |    0     |   0    |  0   |    0   |  0   |   1    |    1     |    1    |
54
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
55
-| sao          |    0      |     0     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
56
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
57
-| signhide     |    0      |     1     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
58
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
59
-| weightp      |    0      |     0     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
60
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
61
-| weightb      |    0      |     0     |    0     |   0    |  0   |    0   |  0   |   1    |    1     |    1    |
62
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
63
-| aq-mode      |    0      |     0     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
64
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
65
-| cuTree       |    0      |     0     |    0     |   0    |  1   |    1   |  1   |   1    |    1     |    1    |
66
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
67
-| rdLevel      |    2      |     2     |    2     |   2    |  2   |    3   |  4   |   6    |    6     |    6    |
68
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
69
-| rdoq-level   |    0      |     0     |    0     |   0    |  0   |    0   |  2   |   2    |    2     |    2    |
70
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
71
-| tu-intra     |    1      |     1     |    1     |   1    |  1   |    1   |  1   |   2    |    3     |    4    |
72
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
73
-| tu-inter     |    1      |     1     |    1     |   1    |  1   |    1   |  1   |   2    |    3     |    4    |
74
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
75
-
76
-Placebo mode enables transform-skip prediction evaluation.
77
+preset is medium.  It does a reasonably good job of finding the best
78
+possible quality without spending excessive CPU cycles looking for the
79
+absolute most efficient way to achieve that quality.  When you use 
80
+faster presets, the encoder takes shortcuts to improve performance at 
81
+the expense of quality and compression efficiency.  When you use slower
82
+presets, x265 tests more encoding options, using more computations to  
83
+achieve the best quality at your selected bit rate (or in the case of
84
+--crf rate control, the lowest bit rate at the selected quality).
85
+
86
+The presets adjust encoder parameters as shown in the following table.
87
+Any of the parameters below that you specify on the command line will
88
+override the value set by the preset.
89
+
90
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
91
+|                 |ultrafast |superfast |veryfast |faster |fast |medium |slow |slower |veryslow |placebo |
92
++=================+==========+==========+=========+=======+=====+=======+=====+=======+=========+========+
93
+| ctu             |    32    |    32    |   64    |  64   | 64  |  64   | 64  |  64   |   64    |  64    |
94
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
95
+| min-cu-size     |    16    |     8    |    8    |   8   |  8  |   8   |  8  |   8   |    8    |   8    |
96
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
97
+| bframes         |     3    |     3    |    4    |   4   |  4  |   4   |  4  |   8   |    8    |   8    |
98
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
99
+| b-adapt         |     0    |     0    |    0    |   0   |  0  |   2   |  2  |   2   |    2    |   2    |
100
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
101
+| rc-lookahead    |     5    |    10    |   15    |  15   | 15  |  20   | 25  |  30   |   40    |  60    |
102
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
103
+| lookahead-slices|     8    |     8    |    8    |   8   |  8  |   8   |  4  |   4   |    1    |   1    |
104
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
105
+| scenecut        |     0    |    40    |   40    |  40   | 40  |  40   | 40  |  40   |   40    |  40    |
106
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
107
+| ref             |     1    |     1    |    2    |   2   |  3  |   3   |  4  |   4   |    5    |   5    |
108
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
109
+| limit-refs      |     0    |     0    |    3    |   3   |  3  |   3   |  3  |   2   |    1    |   0    |
110
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
111
+| me              |    dia   |   hex    |   hex   |  hex  |hex  |  hex  |star | star  |   star  |  star  |
112
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
113
+| merange         |    57    |    57    |   57    |  57   | 57  |  57   | 57  |  57   |   57    |  92    |
114
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
115
+| subme           |     0    |     1    |    1    |   2   |  2  |   2   |  3  |   3   |    4    |   5    |
116
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
117
+| rect            |     0    |     0    |    0    |   0   |  0  |   0   |  1  |   1   |    1    |   1    |
118
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
119
+| amp             |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
120
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
121
+| limit-modes     |     0    |     0    |    0    |   0   |  0  |   0   |  1  |   1   |    1    |   0    |
122
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
123
+| max-merge       |     2    |     2    |    2    |   2   |  2  |   2   |  3  |   3   |    4    |   5    |
124
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
125
+| early-skip      |     1    |     1    |    1    |   1   |  0  |   0   |  0  |   0   |    0    |   0    |
126
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
127
+| fast-intra      |     1    |     1    |    1    |   1   |  1  |   0   |  0  |   0   |    0    |   0    |
128
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
129
+| b-intra         |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
130
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
131
+| sao             |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
132
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
133
+| signhide        |     0    |     1    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
134
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
135
+| weightp         |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
136
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
137
+| weightb         |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
138
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
139
+| aq-mode         |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
140
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
141
+| cuTree          |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
142
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
143
+| rdLevel         |     2    |     2    |    2    |   2   |  2  |   3   |  4  |   6   |    6    |   6    |
144
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
145
+| rdoq-level      |     0    |     0    |    0    |   0   |  0  |   0   |  2  |   2   |    2    |   2    |
146
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
147
+| tu-intra        |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   2   |    3    |   4    |
148
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
149
+| tu-inter        |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   2   |    3    |   4    |
150
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
151
 
152
 .. _tunings:
153
 
154
x265_1.8.tar.gz/source/CMakeLists.txt -> x265_1.9.tar.gz/source/CMakeLists.txt Changed
133
 
1
@@ -30,7 +30,7 @@
2
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
3
 
4
 # X265_BUILD must be incremented each time the public API is changed
5
-set(X265_BUILD 68)
6
+set(X265_BUILD 79)
7
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
8
                "${PROJECT_BINARY_DIR}/x265.def")
9
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
10
@@ -45,12 +45,14 @@
11
 set(POWER_ALIASES ppc64 ppc64le)
12
 list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
13
 if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
14
-    message(STATUS "Detected x86 target processor")
15
     set(X86 1)
16
     add_definitions(-DX265_ARCH_X86=1)
17
     if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
18
         set(X64 1)
19
         add_definitions(-DX86_64=1)
20
+        message(STATUS "Detected x86_64 target processor")
21
+    else()
22
+        message(STATUS "Detected x86 target processor")
23
     endif()
24
 elseif(POWERMATCH GREATER "-1")
25
     message(STATUS "Detected POWER target processor")
26
@@ -71,23 +73,27 @@
27
     if(LIBRT)
28
         list(APPEND PLATFORM_LIBS rt)
29
     endif()
30
+    mark_as_advanced(LIBRT)
31
     find_library(LIBDL dl)
32
     if(LIBDL)
33
         list(APPEND PLATFORM_LIBS dl)
34
     endif()
35
-    find_package(Numa)
36
-    if(NUMA_FOUND)
37
-        link_directories(${NUMA_LIBRARY_DIR})
38
-        list(APPEND CMAKE_REQUIRED_LIBRARIES numa)
39
-        check_symbol_exists(numa_node_of_cpu numa.h NUMA_V2)
40
-        if(NUMA_V2)
41
-            add_definitions(-DHAVE_LIBNUMA)
42
-            message(STATUS "libnuma found, building with support for NUMA nodes")
43
-            list(APPEND PLATFORM_LIBS numa)
44
-            include_directories(${NUMA_INCLUDE_DIR})
45
+    option(ENABLE_LIBNUMA "Enable libnuma usage (Linux only)" ON)
46
+    if(ENABLE_LIBNUMA)
47
+        find_package(Numa)
48
+        if(NUMA_FOUND)
49
+            link_directories(${NUMA_LIBRARY_DIR})
50
+            list(APPEND CMAKE_REQUIRED_LIBRARIES numa)
51
+            check_symbol_exists(numa_node_of_cpu numa.h NUMA_V2)
52
+            if(NUMA_V2)
53
+                add_definitions(-DHAVE_LIBNUMA)
54
+                message(STATUS "libnuma found, building with support for NUMA nodes")
55
+                list(APPEND PLATFORM_LIBS numa)
56
+                include_directories(${NUMA_INCLUDE_DIR})
57
+            endif()
58
         endif()
59
-    endif()
60
-    mark_as_advanced(LIBRT NUMA_FOUND)
61
+        mark_as_advanced(NUMA_FOUND)
62
+    endif(ENABLE_LIBNUMA)
63
     option(NO_ATOMICS "Use a slow mutex to replace atomics" OFF)
64
     if(NO_ATOMICS)
65
         add_definitions(-DNO_ATOMICS=1)
66
@@ -157,6 +163,7 @@
67
 if(GCC)
68
     add_definitions(-Wall -Wextra -Wshadow)
69
     add_definitions(-D__STDC_LIMIT_MACROS=1)
70
+    add_definitions(-std=gnu++98)
71
     if(ENABLE_PIC)
72
          add_definitions(-fPIC)
73
     endif(ENABLE_PIC)
74
@@ -379,16 +386,19 @@
75
 
76
 option(ENABLE_VTUNE "Enable Vtune profiling instrumentation" OFF)
77
 if(ENABLE_VTUNE)
78
-    add_definitions(-DENABLE_VTUNE)
79
-    include_directories($ENV{VTUNE_AMPLIFIER_XE_2015_DIR}/include)
80
-    list(APPEND PLATFORM_LIBS vtune)
81
-    link_directories($ENV{VTUNE_AMPLIFIER_XE_2015_DIR}/lib64)
82
-    if(WIN32)
83
-        list(APPEND PLATFORM_LIBS libittnotify.lib)
84
-    else()
85
-        list(APPEND PLATFORM_LIBS libittnotify.a dl)
86
-    endif()
87
-    add_subdirectory(profile/vtune)
88
+    find_package(Vtune)
89
+    if(VTUNE_FOUND)
90
+        add_definitions(-DENABLE_VTUNE)
91
+        include_directories(${VTUNE_INCLUDE_DIR})
92
+        list(APPEND PLATFORM_LIBS vtune)
93
+        link_directories(${VTUNE_LIBRARY_DIR})
94
+        if(WIN32)
95
+            list(APPEND PLATFORM_LIBS libittnotify.lib)
96
+        else()
97
+            list(APPEND PLATFORM_LIBS libittnotify.a dl)
98
+        endif()
99
+        add_subdirectory(profile/vtune)
100
+    endif(VTUNE_FOUND)
101
 endif(ENABLE_VTUNE)
102
 
103
 option(DETAILED_CU_STATS "Enable internal profiling of encoder work" OFF)
104
@@ -455,6 +465,9 @@
105
 if(ENABLE_SHARED)
106
     add_library(x265-shared SHARED "${PROJECT_BINARY_DIR}/x265.def" ${YASM_OBJS}
107
                 ${X265_RC_FILE} $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common>)
108
+    if(EXTRA_LIB)
109
+        target_link_libraries(x265-shared ${EXTRA_LIB})
110
+    endif()
111
     target_link_libraries(x265-shared ${PLATFORM_LIBS})
112
     if(MSVC)
113
         set_target_properties(x265-shared PROPERTIES OUTPUT_NAME libx265)
114
@@ -465,6 +478,8 @@
115
         set_target_properties(x265-shared PROPERTIES VERSION ${X265_BUILD})
116
         if(APPLE)
117
             set_target_properties(x265-shared PROPERTIES MACOSX_RPATH 1)
118
+        elseif(CYGWIN)
119
+            # Cygwin is not officially supported or tested. MinGW with msys is recommended.
120
         else()
121
             list(APPEND LINKER_OPTIONS "-Wl,-Bsymbolic,-znoexecstack")
122
         endif()
123
@@ -480,9 +495,6 @@
124
                 ARCHIVE DESTINATION ${LIB_INSTALL_DIR}
125
                 RUNTIME DESTINATION ${BIN_INSTALL_DIR})
126
     endif()
127
-    if(EXTRA_LIB)
128
-        target_link_libraries(x265-shared ${EXTRA_LIB})
129
-    endif()
130
     if(LINKER_OPTIONS)
131
         # set_target_properties can't do list expansion
132
         string(REPLACE ";" " " LINKER_OPTION_STR "${LINKER_OPTIONS}")
133
x265_1.9.tar.gz/source/cmake/FindVtune.cmake Added
27
 
1
@@ -0,0 +1,25 @@
2
+# Module for locating Vtune
3
+#
4
+# Read-only variables
5
+#   VTUNE_FOUND: Indicates that the library has been found
6
+#   VTUNE_INCLUDE_DIR: Points to the vtunes include dir
7
+#   VTUNE_LIBRARY_DIR: Points to the directory with libraries
8
+#
9
+# Copyright (c) 2015 Pradeep Ramachandran
10
+
11
+include(FindPackageHandleStandardArgs)
12
+
13
+find_path(VTUNE_DIR
14
+    if(UNIX)
15
+        NAMES amplxe-vars.sh
16
+    else()
17
+        NAMES amplxe-vars.bat
18
+    endif(UNIX)
19
+    HINTS $ENV{VTUNE_AMPLIFIER_XE_2016_DIR} $ENV{VTUNE_AMPLIFIER_XE_2015_DIR}
20
+    DOC "Vtune root directory")
21
+
22
+set (VTUNE_INCLUDE_DIR ${VTUNE_DIR}/include)
23
+set (VTUNE_LIBRARY_DIR ${VTUNE_DIR}/lib64)
24
+
25
+mark_as_advanced(VTUNE_DIR)
26
+find_package_handle_standard_args(VTUNE REQUIRED_VARS VTUNE_DIR VTUNE_INCLUDE_DIR VTUNE_LIBRARY_DIR)
27
x265_1.8.tar.gz/source/common/bitstream.cpp -> x265_1.9.tar.gz/source/common/bitstream.cpp Changed
30
 
1
@@ -1,5 +1,6 @@
2
 #include "common.h"
3
 #include "bitstream.h"
4
+#include "threading.h"
5
 
6
 using namespace X265_NS;
7
 
8
@@ -112,16 +113,13 @@
9
 
10
 void SyntaxElementWriter::writeUvlc(uint32_t code)
11
 {
12
-    uint32_t length = 1;
13
-    uint32_t temp = ++code;
14
+    ++code;
15
 
16
-    X265_CHECK(temp, "writing -1 code, will cause infinite loop\n");
17
+    X265_CHECK(code, "writing -1 code, will cause infinite loop\n");
18
 
19
-    while (1 != temp)
20
-    {
21
-        temp >>= 1;
22
-        length += 2;
23
-    }
24
+    unsigned long idx;
25
+    CLZ(idx, code);
26
+    uint32_t length = (uint32_t)idx * 2 + 1;
27
 
28
     // Take care of cases where length > 32
29
     m_bitIf->write(0, length >> 1);
30
x265_1.8.tar.gz/source/common/bitstream.h -> x265_1.9.tar.gz/source/common/bitstream.h Changed
9
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Author: Steve Borho <steve@borho.org>
5
+ *         Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/common.h -> x265_1.9.tar.gz/source/common/common.h Changed
67
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -134,10 +135,10 @@
10
 typedef int32_t  ssum2_t; // Signed sum
11
 #endif // if HIGH_BIT_DEPTH
12
 
13
-#if X265_DEPTH <= 10
14
-typedef uint32_t sse_ret_t;
15
+#if X265_DEPTH < 10
16
+typedef uint32_t sse_t;
17
 #else
18
-typedef uint64_t sse_ret_t;
19
+typedef uint64_t sse_t;
20
 #endif
21
 
22
 #ifndef NULL
23
@@ -214,6 +215,7 @@
24
 
25
 #define X265_MALLOC(type, count)    (type*)x265_malloc(sizeof(type) * (count))
26
 #define X265_FREE(ptr)              x265_free(ptr)
27
+#define X265_FREE_ZERO(ptr)         x265_free(ptr); (ptr) = NULL
28
 #define CHECKED_MALLOC(var, type, count) \
29
     { \
30
         var = (type*)x265_malloc(sizeof(type) * (count)); \
31
@@ -317,6 +319,9 @@
32
 #define CHROMA_V_SHIFT(x) (x == X265_CSP_I420)
33
 #define X265_MAX_PRED_MODE_PER_CTU 85 * 2 * 8
34
 
35
+#define MAX_NUM_TR_COEFFS           MAX_TR_SIZE * MAX_TR_SIZE // Maximum number of transform coefficients, for a 32x32 transform
36
+#define MAX_NUM_TR_CATEGORIES       16                        // 32, 16, 8, 4 transform categories each for luma and chroma
37
+
38
 namespace X265_NS {
39
 
40
 enum { SAO_NUM_OFFSET = 4 };
41
@@ -366,25 +371,6 @@
42
         delete[] ctuParam[2];
43
     }
44
 };
45
-
46
-/* Stores inter analysis data for a single frame */
47
-struct analysis_inter_data
48
-{
49
-    int32_t*    ref;
50
-    uint8_t*    depth;
51
-    uint8_t*    modes;
52
-    uint32_t*   bestMergeCand;
53
-};
54
-
55
-/* Stores intra analysis data for a single frame. This struct needs better packing */
56
-struct analysis_intra_data
57
-{
58
-    uint8_t*  depth;
59
-    uint8_t*  modes;
60
-    char*     partSizes;
61
-    uint8_t*  chromaModes;
62
-};
63
-
64
 enum TextType
65
 {
66
     TEXT_LUMA     = 0,  // luma
67
x265_1.8.tar.gz/source/common/constants.cpp -> x265_1.9.tar.gz/source/common/constants.cpp Changed
9
 
1
@@ -2,6 +2,7 @@
2
 * Copyright (C) 2015 x265 project
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/constants.h -> x265_1.9.tar.gz/source/common/constants.h Changed
9
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2015 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/contexts.h -> x265_1.9.tar.gz/source/common/contexts.h Changed
9
 
1
@@ -2,6 +2,7 @@
2
 * Copyright (C) 2015 x265 project
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/cudata.cpp -> x265_1.9.tar.gz/source/common/cudata.cpp Changed
201
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2015 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -192,44 +193,82 @@
10
         break;
11
     }
12
 
13
-    /* Each CU's data is layed out sequentially within the charMemBlock */
14
-    uint8_t *charBuf = dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * instance;
15
-
16
-    m_qp        = (int8_t*)charBuf; charBuf += m_numPartitions;
17
-    m_log2CUSize         = charBuf; charBuf += m_numPartitions;
18
-    m_lumaIntraDir       = charBuf; charBuf += m_numPartitions;
19
-    m_tqBypass           = charBuf; charBuf += m_numPartitions;
20
-    m_refIdx[0] = (int8_t*)charBuf; charBuf += m_numPartitions;
21
-    m_refIdx[1] = (int8_t*)charBuf; charBuf += m_numPartitions;
22
-    m_cuDepth            = charBuf; charBuf += m_numPartitions;
23
-    m_predMode           = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
24
-    m_partSize           = charBuf; charBuf += m_numPartitions;
25
-    m_mergeFlag          = charBuf; charBuf += m_numPartitions;
26
-    m_interDir           = charBuf; charBuf += m_numPartitions;
27
-    m_mvpIdx[0]          = charBuf; charBuf += m_numPartitions;
28
-    m_mvpIdx[1]          = charBuf; charBuf += m_numPartitions;
29
-    m_tuDepth            = charBuf; charBuf += m_numPartitions;
30
-    m_transformSkip[0]   = charBuf; charBuf += m_numPartitions;
31
-    m_transformSkip[1]   = charBuf; charBuf += m_numPartitions;
32
-    m_transformSkip[2]   = charBuf; charBuf += m_numPartitions;
33
-    m_cbf[0]             = charBuf; charBuf += m_numPartitions;
34
-    m_cbf[1]             = charBuf; charBuf += m_numPartitions;
35
-    m_cbf[2]             = charBuf; charBuf += m_numPartitions;
36
-    m_chromaIntraDir     = charBuf; charBuf += m_numPartitions;
37
-
38
-    X265_CHECK(charBuf == dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * (instance + 1), "CU data layout is broken\n");
39
-
40
-    m_mv[0]  = dataPool.mvMemBlock + (instance * 4) * m_numPartitions;
41
-    m_mv[1]  = m_mv[0] +  m_numPartitions;
42
-    m_mvd[0] = m_mv[1] +  m_numPartitions;
43
-    m_mvd[1] = m_mvd[0] + m_numPartitions;
44
-
45
-    uint32_t cuSize = g_maxCUSize >> depth;
46
-    uint32_t sizeL = cuSize * cuSize;
47
-    uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
48
-    m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (sizeL + sizeC * 2);
49
-    m_trCoeff[1] = m_trCoeff[0] + sizeL;
50
-    m_trCoeff[2] = m_trCoeff[0] + sizeL + sizeC;
51
+    if (csp == X265_CSP_I400)
52
+    {
53
+        /* Each CU's data is layed out sequentially within the charMemBlock */
54
+        uint8_t *charBuf = dataPool.charMemBlock + (m_numPartitions * (BytesPerPartition - 4)) * instance;
55
+
56
+        m_qp        = (int8_t*)charBuf; charBuf += m_numPartitions;
57
+        m_log2CUSize         = charBuf; charBuf += m_numPartitions;
58
+        m_lumaIntraDir       = charBuf; charBuf += m_numPartitions;
59
+        m_tqBypass           = charBuf; charBuf += m_numPartitions;
60
+        m_refIdx[0] = (int8_t*)charBuf; charBuf += m_numPartitions;
61
+        m_refIdx[1] = (int8_t*)charBuf; charBuf += m_numPartitions;
62
+        m_cuDepth            = charBuf; charBuf += m_numPartitions;
63
+        m_predMode           = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
64
+        m_partSize           = charBuf; charBuf += m_numPartitions;
65
+        m_mergeFlag          = charBuf; charBuf += m_numPartitions;
66
+        m_interDir           = charBuf; charBuf += m_numPartitions;
67
+        m_mvpIdx[0]          = charBuf; charBuf += m_numPartitions;
68
+        m_mvpIdx[1]          = charBuf; charBuf += m_numPartitions;
69
+        m_tuDepth            = charBuf; charBuf += m_numPartitions;
70
+        m_transformSkip[0]   = charBuf; charBuf += m_numPartitions;
71
+        m_cbf[0]             = charBuf; charBuf += m_numPartitions;
72
+        m_chromaIntraDir     = charBuf; charBuf += m_numPartitions;
73
+
74
+        X265_CHECK(charBuf == dataPool.charMemBlock + (m_numPartitions * (BytesPerPartition - 4)) * (instance + 1), "CU data layout is broken\n"); //BytesPerPartition
75
+
76
+        m_mv[0]  = dataPool.mvMemBlock + (instance * 4) * m_numPartitions;
77
+        m_mv[1]  = m_mv[0] +  m_numPartitions;
78
+        m_mvd[0] = m_mv[1] +  m_numPartitions;
79
+        m_mvd[1] = m_mvd[0] + m_numPartitions;
80
+
81
+        uint32_t cuSize = g_maxCUSize >> depth;
82
+        m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (cuSize * cuSize);
83
+        m_trCoeff[1] = m_trCoeff[2] = 0;
84
+        m_transformSkip[1] = m_transformSkip[2] = m_cbf[1] = m_cbf[2] = 0;
85
+    }
86
+    else
87
+    {
88
+        /* Each CU's data is layed out sequentially within the charMemBlock */
89
+        uint8_t *charBuf = dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * instance;
90
+
91
+        m_qp        = (int8_t*)charBuf; charBuf += m_numPartitions;
92
+        m_log2CUSize         = charBuf; charBuf += m_numPartitions;
93
+        m_lumaIntraDir       = charBuf; charBuf += m_numPartitions;
94
+        m_tqBypass           = charBuf; charBuf += m_numPartitions;
95
+        m_refIdx[0] = (int8_t*)charBuf; charBuf += m_numPartitions;
96
+        m_refIdx[1] = (int8_t*)charBuf; charBuf += m_numPartitions;
97
+        m_cuDepth            = charBuf; charBuf += m_numPartitions;
98
+        m_predMode           = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
99
+        m_partSize           = charBuf; charBuf += m_numPartitions;
100
+        m_mergeFlag          = charBuf; charBuf += m_numPartitions;
101
+        m_interDir           = charBuf; charBuf += m_numPartitions;
102
+        m_mvpIdx[0]          = charBuf; charBuf += m_numPartitions;
103
+        m_mvpIdx[1]          = charBuf; charBuf += m_numPartitions;
104
+        m_tuDepth            = charBuf; charBuf += m_numPartitions;
105
+        m_transformSkip[0]   = charBuf; charBuf += m_numPartitions;
106
+        m_transformSkip[1]   = charBuf; charBuf += m_numPartitions;
107
+        m_transformSkip[2]   = charBuf; charBuf += m_numPartitions;
108
+        m_cbf[0]             = charBuf; charBuf += m_numPartitions;
109
+        m_cbf[1]             = charBuf; charBuf += m_numPartitions;
110
+        m_cbf[2]             = charBuf; charBuf += m_numPartitions;
111
+        m_chromaIntraDir     = charBuf; charBuf += m_numPartitions;
112
+
113
+        X265_CHECK(charBuf == dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * (instance + 1), "CU data layout is broken\n");
114
+
115
+        m_mv[0]  = dataPool.mvMemBlock + (instance * 4) * m_numPartitions;
116
+        m_mv[1]  = m_mv[0] +  m_numPartitions;
117
+        m_mvd[0] = m_mv[1] +  m_numPartitions;
118
+        m_mvd[1] = m_mvd[0] + m_numPartitions;
119
+
120
+        uint32_t cuSize = g_maxCUSize >> depth;
121
+        uint32_t sizeL = cuSize * cuSize;
122
+        uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift); // block chroma part
123
+        m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (sizeL + sizeC * 2);
124
+        m_trCoeff[1] = m_trCoeff[0] + sizeL;
125
+        m_trCoeff[2] = m_trCoeff[0] + sizeL + sizeC;
126
+    }
127
 }
128
 
129
 void CUData::initCTU(const Frame& frame, uint32_t cuAddr, int qp)
130
@@ -245,7 +284,8 @@
131
     /* sequential memsets */
132
     m_partSet((uint8_t*)m_qp, (uint8_t)qp);
133
     m_partSet(m_log2CUSize,   (uint8_t)g_maxLog2CUSize);
134
-    m_partSet(m_lumaIntraDir, (uint8_t)DC_IDX);
135
+    m_partSet(m_lumaIntraDir, (uint8_t)ALL_IDX);
136
+    m_partSet(m_chromaIntraDir, (uint8_t)ALL_IDX);
137
     m_partSet(m_tqBypass,     (uint8_t)frame.m_encData->m_param->bLossless);
138
     if (m_slice->m_sliceType != I_SLICE)
139
     {
140
@@ -256,7 +296,7 @@
141
     X265_CHECK(!(frame.m_encData->m_param->bLossless && !m_slice->m_pps->bTransquantBypassEnabled), "lossless enabled without TQbypass in PPS\n");
142
 
143
     /* initialize the remaining CU data in one memset */
144
-    memset(m_cuDepth, 0, (BytesPerPartition - 6) * m_numPartitions);
145
+    memset(m_cuDepth, 0, (frame.m_param->internalCsp == X265_CSP_I400 ? BytesPerPartition - 11 : BytesPerPartition - 7) * m_numPartitions);
146
 
147
     uint32_t widthInCU = m_slice->m_sps->numCuInWidth;
148
     m_cuLeft = (m_cuAddr % widthInCU) ? m_encData->getPicCTU(m_cuAddr - 1) : NULL;
149
@@ -283,14 +323,15 @@
150
     m_partSet((uint8_t*)m_qp, (uint8_t)qp);
151
 
152
     m_partSet(m_log2CUSize,   (uint8_t)cuGeom.log2CUSize);
153
-    m_partSet(m_lumaIntraDir, (uint8_t)DC_IDX);
154
+    m_partSet(m_lumaIntraDir, (uint8_t)ALL_IDX);
155
+    m_partSet(m_chromaIntraDir, (uint8_t)ALL_IDX);
156
     m_partSet(m_tqBypass,     (uint8_t)m_encData->m_param->bLossless);
157
     m_partSet((uint8_t*)m_refIdx[0], (uint8_t)REF_NOT_VALID);
158
     m_partSet((uint8_t*)m_refIdx[1], (uint8_t)REF_NOT_VALID);
159
     m_partSet(m_cuDepth,      (uint8_t)cuGeom.depth);
160
 
161
     /* initialize the remaining CU data in one memset */
162
-    memset(m_predMode, 0, (BytesPerPartition - 7) * m_numPartitions);
163
+    memset(m_predMode, 0, (ctu.m_chromaFormat == X265_CSP_I400 ? BytesPerPartition - 12 : BytesPerPartition - 8) * m_numPartitions);
164
 }
165
 
166
 /* Copy the results of a sub-part (split) CU to the parent CU */
167
@@ -314,13 +355,9 @@
168
     m_subPartCopy(m_mvpIdx[0] + offset, subCU.m_mvpIdx[0]);
169
     m_subPartCopy(m_mvpIdx[1] + offset, subCU.m_mvpIdx[1]);
170
     m_subPartCopy(m_tuDepth + offset, subCU.m_tuDepth);
171
+
172
     m_subPartCopy(m_transformSkip[0] + offset, subCU.m_transformSkip[0]);
173
-    m_subPartCopy(m_transformSkip[1] + offset, subCU.m_transformSkip[1]);
174
-    m_subPartCopy(m_transformSkip[2] + offset, subCU.m_transformSkip[2]);
175
     m_subPartCopy(m_cbf[0] + offset, subCU.m_cbf[0]);
176
-    m_subPartCopy(m_cbf[1] + offset, subCU.m_cbf[1]);
177
-    m_subPartCopy(m_cbf[2] + offset, subCU.m_cbf[2]);
178
-    m_subPartCopy(m_chromaIntraDir + offset, subCU.m_chromaIntraDir);
179
 
180
     memcpy(m_mv[0] + offset, subCU.m_mv[0], childGeom.numPartitions * sizeof(MV));
181
     memcpy(m_mv[1] + offset, subCU.m_mv[1], childGeom.numPartitions * sizeof(MV));
182
@@ -329,12 +366,21 @@
183
 
184
     uint32_t tmp = 1 << ((g_maxLog2CUSize - childGeom.depth) * 2);
185
     uint32_t tmp2 = subPartIdx * tmp;
186
-    memcpy(m_trCoeff[0] + tmp2, subCU.m_trCoeff[0], sizeof(coeff_t) * tmp);
187
+    memcpy(m_trCoeff[0] + tmp2, subCU.m_trCoeff[0], sizeof(coeff_t)* tmp);
188
 
189
-    uint32_t tmpC = tmp >> (m_hChromaShift + m_vChromaShift);
190
-    uint32_t tmpC2 = tmp2 >> (m_hChromaShift + m_vChromaShift);
191
-    memcpy(m_trCoeff[1] + tmpC2, subCU.m_trCoeff[1], sizeof(coeff_t) * tmpC);
192
-    memcpy(m_trCoeff[2] + tmpC2, subCU.m_trCoeff[2], sizeof(coeff_t) * tmpC);
193
+    if (subCU.m_chromaFormat != X265_CSP_I400)
194
+    {
195
+        m_subPartCopy(m_transformSkip[1] + offset, subCU.m_transformSkip[1]);
196
+        m_subPartCopy(m_transformSkip[2] + offset, subCU.m_transformSkip[2]);
197
+        m_subPartCopy(m_cbf[1] + offset, subCU.m_cbf[1]);
198
+        m_subPartCopy(m_cbf[2] + offset, subCU.m_cbf[2]);
199
+        m_subPartCopy(m_chromaIntraDir + offset, subCU.m_chromaIntraDir);
200
+
201
x265_1.8.tar.gz/source/common/cudata.h -> x265_1.9.tar.gz/source/common/cudata.h Changed
51
 
1
@@ -222,12 +222,12 @@
2
     void     copyToPic(uint32_t depth) const;
3
 
4
     /* RD-0 methods called only from encodeResidue */
5
-    void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom);
6
+    void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp);
7
     void     updatePic(uint32_t depth) const;
8
 
9
     void     setPartSizeSubParts(PartSize size)    { m_partSet(m_partSize, (uint8_t)size); }
10
     void     setPredModeSubParts(PredMode mode)    { m_partSet(m_predMode, (uint8_t)mode); }
11
-    void     clearCbf()                            { m_partSet(m_cbf[0], 0); m_partSet(m_cbf[1], 0); m_partSet(m_cbf[2], 0); }
12
+    void     clearCbf()                            { m_partSet(m_cbf[0], 0); if (m_chromaFormat != X265_CSP_I400) { m_partSet(m_cbf[1], 0); m_partSet(m_cbf[2], 0);} }
13
 
14
     /* these functions all take depth as an absolute depth from CTU, it is used to calculate the number of parts to copy */
15
     void     setQPSubParts(int8_t qp, uint32_t absPartIdx, uint32_t depth)                    { s_partSet[depth]((uint8_t*)m_qp + absPartIdx, (uint8_t)qp); }
16
@@ -246,7 +246,7 @@
17
     void     setPURefIdx(int list, int8_t refIdx, int absPartIdx, int puIdx);
18
 
19
     uint8_t  getCbf(uint32_t absPartIdx, TextType ttype, uint32_t tuDepth) const { return (m_cbf[ttype][absPartIdx] >> tuDepth) & 0x1; }
20
-    uint8_t  getQtRootCbf(uint32_t absPartIdx) const                             { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx]; }
21
+    uint8_t  getQtRootCbf(uint32_t absPartIdx) const                             { if (m_chromaFormat == X265_CSP_I400) return m_cbf[0][absPartIdx] || false; else { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx];} }
22
     int8_t   getRefQP(uint32_t currAbsIdxInCTU) const;
23
     uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*candMvField)[2], uint8_t* candDir) const;
24
     void     clipMv(MV& outMV) const;
25
@@ -323,7 +323,6 @@
26
     const uint16_t *scan;
27
     const uint16_t *scanCG;
28
     ScanType        scanType;
29
-    uint32_t        log2TrSizeCG;
30
     uint32_t        firstSignificanceMapContext;
31
 };
32
 
33
@@ -340,8 +339,15 @@
34
         uint32_t numPartition = NUM_4x4_PARTITIONS >> (depth * 2);
35
         uint32_t cuSize = g_maxCUSize >> depth;
36
         uint32_t sizeL = cuSize * cuSize;
37
-        uint32_t sizeC = sizeL >> (CHROMA_H_SHIFT(csp) + CHROMA_V_SHIFT(csp));
38
-        CHECKED_MALLOC(trCoeffMemBlock, coeff_t, (sizeL + sizeC * 2) * numInstances);
39
+        if (csp == X265_CSP_I400)
40
+        {
41
+            CHECKED_MALLOC(trCoeffMemBlock, coeff_t, (sizeL) * numInstances);
42
+        }
43
+        else
44
+        {            
45
+            uint32_t sizeC = sizeL >> (CHROMA_H_SHIFT(csp) + CHROMA_V_SHIFT(csp));
46
+            CHECKED_MALLOC(trCoeffMemBlock, coeff_t, (sizeL + sizeC * 2) * numInstances);
47
+        }
48
         CHECKED_MALLOC(charMemBlock, uint8_t, numPartition * numInstances * CUData::BytesPerPartition);
49
         CHECKED_MALLOC(mvMemBlock, MV, numPartition * 4 * numInstances);
50
         return true;
51
x265_1.8.tar.gz/source/common/dct.cpp -> x265_1.9.tar.gz/source/common/dct.cpp Changed
46
 
1
@@ -703,7 +703,10 @@
2
         if (level)
3
             ++numSig;
4
         level *= sign;
5
-        qCoef[blockpos] = (int16_t)x265_clip3(-32768, 32767, level);
6
+
7
+        // TODO: when we limit range to [-32767, 32767], we can get more performance with output change
8
+        //       But nquant is a little percent in rdoQuant, so I keep old dynamic range for compatible
9
+        qCoef[blockpos] = (int16_t)abs(x265_clip3(-32768, 32767, level));
10
     }
11
 
12
     return numSig;
13
@@ -784,11 +787,12 @@
14
     return scanPosLast - 1;
15
 }
16
 
17
+// NOTE: no defined value on lastNZPosInCG & absSumSign when ALL ZEROS block as input
18
 static uint32_t findPosFirstLast_c(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
19
 {
20
     int n;
21
 
22
-    for (n = SCAN_SET_SIZE - 1; n >= 0; --n)
23
+    for (n = SCAN_SET_SIZE - 1; n >= 0; n--)
24
     {
25
         const uint32_t idx = scanTbl[n];
26
         const uint32_t idxY = idx / MLS_CG_SIZE;
27
@@ -812,8 +816,17 @@
28
 
29
     uint32_t firstNZPosInCG = (uint32_t)n;
30
 
31
+    uint32_t absSumSign = 0;
32
+    for (n = firstNZPosInCG; n <= (int)lastNZPosInCG; n++)
33
+    {
34
+        const uint32_t idx = scanTbl[n];
35
+        const uint32_t idxY = idx / MLS_CG_SIZE;
36
+        const uint32_t idxX = idx % MLS_CG_SIZE;
37
+        absSumSign += dstCoeff[idxY * trSize + idxX];
38
+    }
39
+
40
     // NOTE: when coeff block all ZERO, the lastNZPosInCG is undefined and firstNZPosInCG is 16
41
-    return ((lastNZPosInCG << 16) | firstNZPosInCG);
42
+    return ((absSumSign << 31) | (lastNZPosInCG << 8) | firstNZPosInCG);
43
 }
44
 
45
 
46
x265_1.8.tar.gz/source/common/deblock.cpp -> x265_1.9.tar.gz/source/common/deblock.cpp Changed
86
 
1
@@ -2,6 +2,7 @@
2
 * Copyright (C) 2013 x265 project
3
 *
4
 * Author: Gopu Govindaswamy <gopu@multicorewareinc.com>
5
+*         Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -108,7 +109,7 @@
10
     for (uint32_t e = 0; e < numUnits; e += partIdxIncr)
11
     {
12
         edgeFilterLuma(cu, absPartIdx, depth, dir, e, blockStrength);
13
-        if (!((e0 + e) & chromaMask))
14
+        if (!((e0 + e) & chromaMask) && cu->m_chromaFormat != X265_CSP_I400)
15
             edgeFilterChroma(cu, absPartIdx, depth, dir, e, blockStrength);
16
     }
17
 }
18
@@ -209,8 +210,8 @@
19
     const Slice* const sliceQ = cuQ->m_slice;
20
     const Slice* const sliceP = cuP->m_slice;
21
 
22
-    const Frame* refP0 = sliceP->getRefPic(0, cuP->m_refIdx[0][partP]);
23
-    const Frame* refQ0 = sliceQ->getRefPic(0, cuQ->m_refIdx[0][partQ]);
24
+    const Frame* refP0 = sliceP->m_refFrameList[0][cuP->m_refIdx[0][partP]];
25
+    const Frame* refQ0 = sliceQ->m_refFrameList[0][cuQ->m_refIdx[0][partQ]];
26
     const MV& mvP0 = refP0 ? cuP->m_mv[0][partP] : zeroMv;
27
     const MV& mvQ0 = refQ0 ? cuQ->m_mv[0][partQ] : zeroMv;
28
 
29
@@ -221,8 +222,8 @@
30
     }
31
 
32
     // (sliceQ->isInterB() || sliceP->isInterB())
33
-    const Frame* refP1 = sliceP->getRefPic(1, cuP->m_refIdx[1][partP]);
34
-    const Frame* refQ1 = sliceQ->getRefPic(1, cuQ->m_refIdx[1][partQ]);
35
+    const Frame* refP1 = sliceP->m_refFrameList[1][cuP->m_refIdx[1][partP]];
36
+    const Frame* refQ1 = sliceQ->m_refFrameList[1][cuQ->m_refIdx[1][partQ]];
37
     const MV& mvP1 = refP1 ? cuP->m_mv[1][partP] : zeroMv;
38
     const MV& mvQ1 = refQ1 ? cuQ->m_mv[1][partQ] : zeroMv;
39
 
40
@@ -279,31 +280,6 @@
41
  * \param maskQ   indicator to enable filtering on partQ
42
  * \param maskP1  decision weak filter/no filter for partP
43
  * \param maskQ1  decision weak filter/no filter for partQ */
44
-static inline void pelFilterLumaStrong(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
45
-{
46
-    int32_t tc2 = 2 * tc;
47
-    int32_t tcP = (tc2 & maskP);
48
-    int32_t tcQ = (tc2 & maskQ);
49
-    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
50
-    {
51
-        int16_t m4  = (int16_t)src[0];
52
-        int16_t m3  = (int16_t)src[-offset];
53
-        int16_t m5  = (int16_t)src[offset];
54
-        int16_t m2  = (int16_t)src[-offset * 2];
55
-        int16_t m6  = (int16_t)src[offset * 2];
56
-        int16_t m1  = (int16_t)src[-offset * 3];
57
-        int16_t m7  = (int16_t)src[offset * 3];
58
-        int16_t m0  = (int16_t)src[-offset * 4];
59
-        src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1);
60
-        src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2);
61
-        src[-offset]     = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
62
-        src[0]           = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
63
-        src[offset]      = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
64
-        src[offset * 2]  = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
65
-    }
66
-}
67
-
68
-/* Weak filter */
69
 static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ,
70
                                  int32_t maskP1, int32_t maskQ1)
71
 {
72
@@ -445,7 +421,12 @@
73
                    useStrongFiltering(offset, beta, tc, src + unitOffset + srcStep * 3));
74
 
75
         if (sw)
76
-            pelFilterLumaStrong(src + unitOffset, srcStep, offset, tc, maskP, maskQ);
77
+        {
78
+            int32_t tc2 = 2 * tc;
79
+            int32_t tcP = (tc2 & maskP);
80
+            int32_t tcQ = (tc2 & maskQ);
81
+            primitives.pelFilterLumaStrong[dir](src + unitOffset, srcStep, offset, tcP, tcQ);
82
+        }
83
         else
84
         {
85
             int32_t sideThreshold = (beta + (beta >> 1)) >> 3;
86
x265_1.8.tar.gz/source/common/deblock.h -> x265_1.9.tar.gz/source/common/deblock.h Changed
42
 
1
@@ -2,6 +2,7 @@
2
 * Copyright (C) 2013 x265 project
3
 *
4
 * Author: Gopu Govindaswamy <gopu@multicorewareinc.com>
5
+*         Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -37,24 +38,24 @@
10
 public:
11
     enum { EDGE_VER, EDGE_HOR };
12
 
13
-    void deblockCTU(const CUData* ctu, const CUGeom& cuGeom, int32_t dir);
14
+    static void deblockCTU(const CUData* ctu, const CUGeom& cuGeom, int32_t dir);
15
 
16
 protected:
17
 
18
     // CU-level deblocking function
19
-    void deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[]);
20
+    static void deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[]);
21
 
22
     // set filtering functions
23
-    void setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t tuDepth, int32_t dir, uint8_t blockStrength[]);
24
-    void setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits);
25
-    void setEdgefilterMultiple(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits);
26
+    static void setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t tuDepth, int32_t dir, uint8_t blockStrength[]);
27
+    static void setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits);
28
+    static void setEdgefilterMultiple(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits);
29
 
30
     // get filtering functions
31
-    uint8_t getBoundaryStrength(const CUData* cuQ, int32_t dir, uint32_t partQ, const uint8_t blockStrength[]);
32
+    static uint8_t getBoundaryStrength(const CUData* cuQ, int32_t dir, uint32_t partQ, const uint8_t blockStrength[]);
33
 
34
     // filter luma/chroma functions
35
-    void edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
36
-    void edgeFilterChroma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
37
+    static void edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
38
+    static void edgeFilterChroma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
39
 
40
     static const uint8_t s_tcTable[54];
41
     static const uint8_t s_betaTable[52];
42
x265_1.8.tar.gz/source/common/frame.cpp -> x265_1.9.tar.gz/source/common/frame.cpp Changed
91
 
1
@@ -33,22 +33,37 @@
2
     m_bChromaExtended = false;
3
     m_lowresInit = false;
4
     m_reconRowCount.set(0);
5
+    m_reconColCount = NULL;
6
     m_countRefEncoders = 0;
7
     m_encData = NULL;
8
     m_reconPic = NULL;
9
+    m_quantOffsets = NULL;
10
     m_next = NULL;
11
     m_prev = NULL;
12
     m_param = NULL;
13
     memset(&m_lowres, 0, sizeof(m_lowres));
14
 }
15
 
16
-bool Frame::create(x265_param *param)
17
+bool Frame::create(x265_param *param, float* quantOffsets)
18
 {
19
     m_fencPic = new PicYuv;
20
     m_param = param;
21
 
22
-    return m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
23
-           m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode);
24
+    if (m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
25
+        m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode))
26
+    {
27
+        X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized");
28
+        m_numRows = (m_fencPic->m_picHeight + g_maxCUSize - 1)  / g_maxCUSize;
29
+        m_reconColCount = new ThreadSafeInteger[m_numRows];
30
+
31
+        if (quantOffsets)
32
+        {
33
+            int32_t cuCount = m_lowres.maxBlocksInRow * m_lowres.maxBlocksInCol;
34
+            m_quantOffsets = new float[cuCount];
35
+        }
36
+        return true;
37
+    }
38
+    return false;
39
 }
40
 
41
 bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
42
@@ -56,15 +71,27 @@
43
     m_encData = new FrameData;
44
     m_reconPic = new PicYuv;
45
     m_encData->m_reconPic = m_reconPic;
46
-    bool ok = m_encData->create(param, sps) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
47
+    bool ok = m_encData->create(*param, sps) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
48
     if (ok)
49
     {
50
         /* initialize right border of m_reconpicYuv as SAO may read beyond the
51
          * end of the picture accessing uninitialized pixels */
52
         int maxHeight = sps.numCuInHeight * g_maxCUSize;
53
-        memset(m_reconPic->m_picOrg[0], 0, sizeof(pixel) * m_reconPic->m_stride * maxHeight);
54
-        memset(m_reconPic->m_picOrg[1], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
55
-        memset(m_reconPic->m_picOrg[2], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
56
+        memset(m_reconPic->m_picOrg[0], 0, sizeof(pixel)* m_reconPic->m_stride * maxHeight);
57
+
58
+        /* use pre-calculated cu/pu offsets cached in the SPS structure */
59
+        m_reconPic->m_cuOffsetY = sps.cuOffsetY;
60
+        m_reconPic->m_buOffsetY = sps.buOffsetY;
61
+
62
+        if (param->internalCsp != X265_CSP_I400)
63
+        {
64
+            memset(m_reconPic->m_picOrg[1], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
65
+            memset(m_reconPic->m_picOrg[2], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
66
+
67
+            /* use pre-calculated cu/pu offsets cached in the SPS structure */
68
+            m_reconPic->m_cuOffsetC = sps.cuOffsetC;
69
+            m_reconPic->m_buOffsetC = sps.buOffsetC;
70
+        }
71
     }
72
     return ok;
73
 }
74
@@ -100,5 +127,16 @@
75
         m_reconPic = NULL;
76
     }
77
 
78
+    if (m_reconColCount)
79
+    {
80
+        delete[] m_reconColCount;
81
+        m_reconColCount = NULL;
82
+    }
83
+
84
+    if (m_quantOffsets)
85
+    {
86
+        delete[] m_quantOffsets;
87
+    }
88
+
89
     m_lowres.destroy();
90
 }
91
x265_1.8.tar.gz/source/common/frame.h -> x265_1.9.tar.gz/source/common/frame.h Changed
32
 
1
@@ -35,7 +35,7 @@
2
 class PicYuv;
3
 struct SPS;
4
 
5
-#define IS_REFERENCED(frame) (frame->m_lowres.sliceType != X265_TYPE_B) 
6
+#define IS_REFERENCED(frame) (frame->m_lowres.sliceType != X265_TYPE_B)
7
 
8
 class Frame
9
 {
10
@@ -59,8 +59,12 @@
11
     bool                   m_lowresInit;         // lowres init complete (pre-analysis)
12
     bool                   m_bChromaExtended;    // orig chroma planes motion extended for weight analysis
13
 
14
+    float*                 m_quantOffsets;       // points to quantOffsets in x265_picture
15
+
16
     /* Frame Parallelism - notification between FrameEncoders of available motion reference rows */
17
     ThreadSafeInteger      m_reconRowCount;      // count of CTU rows completely reconstructed and extended for motion reference
18
+    ThreadSafeInteger*     m_reconColCount;      // count of CTU cols completely reconstructed and extended for motion reference
19
+    int32_t                m_numRows;
20
     volatile uint32_t      m_countRefEncoders;   // count of FrameEncoder threads monitoring m_reconRowCount
21
 
22
     Frame*                 m_next;               // PicList doubly linked list pointers
23
@@ -69,7 +73,7 @@
24
     x265_analysis_data     m_analysisData;
25
     Frame();
26
 
27
-    bool create(x265_param *param);
28
+    bool create(x265_param *param, float* quantOffsets);
29
     bool allocEncodeData(x265_param *param, const SPS& sps);
30
     void reinit(const SPS& sps);
31
     void destroy();
32
x265_1.8.tar.gz/source/common/framedata.cpp -> x265_1.9.tar.gz/source/common/framedata.cpp Changed
21
 
1
@@ -31,15 +31,15 @@
2
     memset(this, 0, sizeof(*this));
3
 }
4
 
5
-bool FrameData::create(x265_param *param, const SPS& sps)
6
+bool FrameData::create(const x265_param& param, const SPS& sps)
7
 {
8
-    m_param = param;
9
+    m_param = &param;
10
     m_slice  = new Slice;
11
     m_picCTU = new CUData[sps.numCUsInFrame];
12
 
13
-    m_cuMemPool.create(0, param->internalCsp, sps.numCUsInFrame);
14
+    m_cuMemPool.create(0, param.internalCsp, sps.numCUsInFrame);
15
     for (uint32_t ctuAddr = 0; ctuAddr < sps.numCUsInFrame; ctuAddr++)
16
-        m_picCTU[ctuAddr].initialize(m_cuMemPool, 0, param->internalCsp, ctuAddr);
17
+        m_picCTU[ctuAddr].initialize(m_cuMemPool, 0, param.internalCsp, ctuAddr);
18
 
19
     CHECKED_MALLOC(m_cuStat, RCStatCU, sps.numCUsInFrame);
20
     CHECKED_MALLOC(m_rowStat, RCStatRow, sps.numCuInHeight);
21
x265_1.8.tar.gz/source/common/framedata.h -> x265_1.9.tar.gz/source/common/framedata.h Changed
83
 
1
@@ -55,8 +55,7 @@
2
     double      avgLumaDistortion;
3
     double      avgChromaDistortion;
4
     double      avgPsyEnergy;
5
-    double      avgLumaLevel;
6
-    double      lumaLevel;
7
+    double      avgResEnergy;
8
     double      percentIntraNxN;
9
     double      percentSkipCu[NUM_CU_DEPTH];
10
     double      percentMergeCu[NUM_CU_DEPTH];
11
@@ -69,13 +68,13 @@
12
     uint64_t    lumaDistortion;
13
     uint64_t    chromaDistortion;
14
     uint64_t    psyEnergy;
15
+    uint64_t    resEnergy;
16
     uint64_t    cntSkipCu[NUM_CU_DEPTH];
17
     uint64_t    cntMergeCu[NUM_CU_DEPTH];
18
     uint64_t    cntInter[NUM_CU_DEPTH];
19
     uint64_t    cntIntra[NUM_CU_DEPTH];
20
     uint64_t    cuInterDistribution[NUM_CU_DEPTH][INTER_MODES];
21
     uint64_t    cuIntraDistribution[NUM_CU_DEPTH][INTRA_MODES];
22
-    uint16_t    maxLumaLevel;
23
 
24
     FrameStats()
25
     {
26
@@ -96,7 +95,7 @@
27
 
28
     Slice*         m_slice;
29
     SAOParam*      m_saoParam;
30
-    x265_param*    m_param;
31
+    const x265_param* m_param;
32
 
33
     FrameData*     m_freeListNext;
34
     PicYuv*        m_reconPic;
35
@@ -135,19 +134,44 @@
36
     RCStatCU*      m_cuStat;
37
     RCStatRow*     m_rowStat;
38
     FrameStats     m_frameStats; // stats of current frame for multi-pass encodes
39
+    /* data needed for periodic intra refresh */
40
+    struct PeriodicIR
41
+    {
42
+        uint32_t   pirStartCol;
43
+        uint32_t   pirEndCol;
44
+        int        framesSinceLastPir;
45
+    };
46
 
47
+    PeriodicIR     m_pir;
48
     double         m_avgQpRc;    /* avg QP as decided by rate-control */
49
     double         m_avgQpAq;    /* avg QP as decided by AQ in addition to rate-control */
50
     double         m_rateFactor; /* calculated based on the Frame QP */
51
 
52
     FrameData();
53
 
54
-    bool create(x265_param *param, const SPS& sps);
55
+    bool create(const x265_param& param, const SPS& sps);
56
     void reinit(const SPS& sps);
57
     void destroy();
58
+    inline CUData* getPicCTU(uint32_t ctuAddr) { return &m_picCTU[ctuAddr]; }
59
+};
60
+
61
+/* Stores intra analysis data for a single frame. This struct needs better packing */
62
+struct analysis_intra_data
63
+{
64
+    uint8_t*  depth;
65
+    uint8_t*  modes;
66
+    char*     partSizes;
67
+    uint8_t*  chromaModes;
68
+};
69
 
70
-    CUData* getPicCTU(uint32_t ctuAddr) { return &m_picCTU[ctuAddr]; }
71
+/* Stores inter analysis data for a single frame */
72
+struct analysis_inter_data
73
+{
74
+    MV*         mv;
75
+    int32_t*    ref;
76
+    uint8_t*    depth;
77
+    uint8_t*    modes;
78
+    uint32_t*   bestMergeCand;
79
 };
80
 }
81
-
82
 #endif // ifndef X265_FRAMEDATA_H
83
x265_1.8.tar.gz/source/common/ipfilter.cpp -> x265_1.9.tar.gz/source/common/ipfilter.cpp Changed
9
 
1
@@ -4,6 +4,7 @@
2
  * Authors: Deepthi Devaki <deepthidevaki@multicorewareinc.com>,
3
  *          Rajesh Paulraj <rajesh@multicorewareinc.com>
4
  *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/loopfilter.cpp -> x265_1.9.tar.gz/source/common/loopfilter.cpp Changed
47
 
1
@@ -3,6 +3,7 @@
2
 *
3
 * Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
4
 *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -136,6 +137,27 @@
10
         rec += stride;
11
     }
12
 }
13
+
14
+static void pelFilterLumaStrong_c(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ)
15
+{
16
+    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
17
+    {
18
+        int16_t m4  = (int16_t)src[0];
19
+        int16_t m3  = (int16_t)src[-offset];
20
+        int16_t m5  = (int16_t)src[offset];
21
+        int16_t m2  = (int16_t)src[-offset * 2];
22
+        int16_t m6  = (int16_t)src[offset * 2];
23
+        int16_t m1  = (int16_t)src[-offset * 3];
24
+        int16_t m7  = (int16_t)src[offset * 3];
25
+        int16_t m0  = (int16_t)src[-offset * 4];
26
+        src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1);
27
+        src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2);
28
+        src[-offset]     = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
29
+        src[0]           = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
30
+        src[offset]      = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
31
+        src[offset * 2]  = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
32
+    }
33
+}
34
 }
35
 
36
 namespace X265_NS {
37
@@ -150,5 +172,9 @@
38
     p.saoCuOrgE3[1] = processSaoCUE3;
39
     p.saoCuOrgB0 = processSaoCUB0;
40
     p.sign = calSign;
41
+
42
+    // C code is same for EDGE_VER and EDGE_HOR only asm code is different
43
+    p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c;
44
+    p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c;
45
 }
46
 }
47
x265_1.8.tar.gz/source/common/lowres.cpp -> x265_1.9.tar.gz/source/common/lowres.cpp Changed
29
 
1
@@ -52,6 +52,7 @@
2
         CHECKED_MALLOC(qpAqOffset, double, cuCount);
3
         CHECKED_MALLOC(invQscaleFactor, int, cuCount);
4
         CHECKED_MALLOC(qpCuTreeOffset, double, cuCount);
5
+        CHECKED_MALLOC(blockVariance, uint32_t, cuCount);
6
     }
7
     CHECKED_MALLOC(propagateCost, uint16_t, cuCount);
8
 
9
@@ -120,18 +121,17 @@
10
     X265_FREE(invQscaleFactor);
11
     X265_FREE(qpCuTreeOffset);
12
     X265_FREE(propagateCost);
13
+    X265_FREE(blockVariance);
14
 }
15
 
16
 // (re) initialize lowres state
17
 void Lowres::init(PicYuv *origPic, int poc)
18
 {
19
     bLastMiniGopBFrame = false;
20
-    bScenecut = false;  // could be a scene-cut, until ruled out by flash detection
21
     bKeyframe = false; // Not a keyframe unless identified by lookahead
22
     frameNum = poc;
23
     leadingBframes = 0;
24
     indB = 0;
25
-    satdCost = (int64_t)-1;
26
     memset(costEst, -1, sizeof(costEst));
27
     memset(weightedCostDelta, 0, sizeof(weightedCostDelta));
28
 
29
x265_1.8.tar.gz/source/common/lowres.h -> x265_1.9.tar.gz/source/common/lowres.h Changed
17
 
1
@@ -143,12 +143,15 @@
2
     double*   qpAqOffset;      // AQ QP offset values for each 16x16 CU
3
     double*   qpCuTreeOffset;  // cuTree QP offset values for each 16x16 CU
4
     int*      invQscaleFactor; // qScale values for qp Aq Offsets
5
+    uint32_t* blockVariance;
6
     uint64_t  wp_ssd[3];       // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
7
     uint64_t  wp_sum[3];
8
+    uint64_t  frameVariance;
9
 
10
     /* cutree intermediate data */
11
     uint16_t* propagateCost;
12
     double    weightedCostDelta[X265_BFRAME_MAX + 2];
13
+    ReferencePlanes weightedRef[X265_BFRAME_MAX + 2];
14
 
15
     bool create(PicYuv *origPic, int _bframes, bool bAqEnabled);
16
     void destroy();
17
x265_1.8.tar.gz/source/common/param.cpp -> x265_1.9.tar.gz/source/common/param.cpp Changed
201
 
1
@@ -147,7 +147,7 @@
2
     param->bFrameAdaptive = X265_B_ADAPT_TRELLIS;
3
     param->bBPyramid = 1;
4
     param->scenecutThreshold = 40; /* Magic number pulled in from x264 */
5
-    param->lookaheadSlices = 0;
6
+    param->lookaheadSlices = 8;
7
 
8
     /* Intra Coding Tools */
9
     param->bEnableConstrainedIntra = 0;
10
@@ -159,7 +159,8 @@
11
     param->subpelRefine = 2;
12
     param->searchRange = 57;
13
     param->maxNumMergeCand = 2;
14
-    param->limitReferences = 0;
15
+    param->limitReferences = 3;
16
+    param->limitModes = 0;
17
     param->bEnableWeightedPred = 1;
18
     param->bEnableWeightedBiPred = 0;
19
     param->bEnableEarlySkip = 0;
20
@@ -184,7 +185,7 @@
21
     param->cbQpOffset = 0;
22
     param->crQpOffset = 0;
23
     param->rdPenalty = 0;
24
-    param->psyRd = 0.3;
25
+    param->psyRd = 2.0;
26
     param->psyRdoq = 0.0;
27
     param->analysisMode = 0;
28
     param->analysisFileName = NULL;
29
@@ -241,6 +242,10 @@
30
     param->vui.defDispWinRightOffset = 0;
31
     param->vui.defDispWinTopOffset = 0;
32
     param->vui.defDispWinBottomOffset = 0;
33
+    param->maxCLL = 0;
34
+    param->maxFALL = 0;
35
+    param->minLuma = 0;
36
+    param->maxLuma = (1 << X265_DEPTH) - 1;
37
 }
38
 
39
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
40
@@ -274,9 +279,9 @@
41
             param->bEnableWeightedPred = 0;
42
             param->rdLevel = 2;
43
             param->maxNumReferences = 1;
44
+            param->limitReferences = 0;
45
             param->rc.aqStrength = 0.0;
46
             param->rc.aqMode = X265_AQ_NONE;
47
-            param->rc.cuTree = 0;
48
             param->rc.qgSize = 32;
49
             param->bEnableFastIntra = 1;
50
         }
51
@@ -291,9 +296,9 @@
52
             param->bEnableWeightedPred = 0;
53
             param->rdLevel = 2;
54
             param->maxNumReferences = 1;
55
+            param->limitReferences = 0;
56
             param->rc.aqStrength = 0.0;
57
             param->rc.aqMode = X265_AQ_NONE;
58
-            param->rc.cuTree = 0;
59
             param->rc.qgSize = 32;
60
             param->bEnableSAO = 0;
61
             param->bEnableFastIntra = 1;
62
@@ -301,13 +306,11 @@
63
         else if (!strcmp(preset, "veryfast"))
64
         {
65
             param->lookaheadDepth = 15;
66
-            param->maxCUSize = 32;
67
             param->bFrameAdaptive = 0;
68
             param->subpelRefine = 1;
69
             param->bEnableEarlySkip = 1;
70
             param->rdLevel = 2;
71
-            param->maxNumReferences = 1;
72
-            param->rc.cuTree = 0;
73
+            param->maxNumReferences = 2;
74
             param->rc.qgSize = 32;
75
             param->bEnableFastIntra = 1;
76
         }
77
@@ -317,8 +320,7 @@
78
             param->bFrameAdaptive = 0;
79
             param->bEnableEarlySkip = 1;
80
             param->rdLevel = 2;
81
-            param->maxNumReferences = 1;
82
-            param->rc.cuTree = 0;
83
+            param->maxNumReferences = 2;
84
             param->bEnableFastIntra = 1;
85
         }
86
         else if (!strcmp(preset, "fast"))
87
@@ -326,7 +328,7 @@
88
             param->lookaheadDepth = 15;
89
             param->bFrameAdaptive = 0;
90
             param->rdLevel = 2;
91
-            param->maxNumReferences = 2;
92
+            param->maxNumReferences = 3;
93
             param->bEnableFastIntra = 1;
94
         }
95
         else if (!strcmp(preset, "medium"))
96
@@ -343,6 +345,9 @@
97
             param->subpelRefine = 3;
98
             param->maxNumMergeCand = 3;
99
             param->searchMethod = X265_STAR_SEARCH;
100
+            param->maxNumReferences = 4;
101
+            param->limitModes = 1;
102
+            param->lookaheadSlices = 4; // limit parallelism as already enough work exists
103
         }
104
         else if (!strcmp(preset, "slower"))
105
         {
106
@@ -359,7 +364,11 @@
107
             param->subpelRefine = 3;
108
             param->maxNumMergeCand = 3;
109
             param->searchMethod = X265_STAR_SEARCH;
110
+            param->maxNumReferences = 4;
111
+            param->limitReferences = 2;
112
+            param->limitModes = 1;
113
             param->bIntraInBFrames = 1;
114
+            param->lookaheadSlices = 4; // limit parallelism as already enough work exists
115
         }
116
         else if (!strcmp(preset, "veryslow"))
117
         {
118
@@ -377,7 +386,10 @@
119
             param->maxNumMergeCand = 4;
120
             param->searchMethod = X265_STAR_SEARCH;
121
             param->maxNumReferences = 5;
122
+            param->limitReferences = 1;
123
+            param->limitModes = 1;
124
             param->bIntraInBFrames = 1;
125
+            param->lookaheadSlices = 0; // disabled for best quality
126
         }
127
         else if (!strcmp(preset, "placebo"))
128
         {
129
@@ -397,8 +409,10 @@
130
             param->searchMethod = X265_STAR_SEARCH;
131
             param->bEnableTransformSkip = 1;
132
             param->maxNumReferences = 5;
133
+            param->limitReferences = 0;
134
             param->rc.bEnableSlowFirstPass = 1;
135
             param->bIntraInBFrames = 1;
136
+            param->lookaheadSlices = 0; // disabled for best quality
137
             // TODO: optimized esa
138
         }
139
         else
140
@@ -565,10 +579,14 @@
141
     OPT2("level-idc", "level")
142
     {
143
         /* allow "5.1" or "51", both converted to integer 51 */
144
-        if (atof(value) < 7)
145
+        /* if level-idc specifies an obviously wrong value in either float or int, 
146
+        throw error consistently. Stronger level checking will be done in encoder_open() */
147
+        if (atof(value) < 10)
148
             p->levelIdc = (int)(10 * atof(value) + .5);
149
-        else
150
+        else if (atoi(value) < 100)
151
             p->levelIdc = atoi(value);
152
+        else 
153
+            bError = true;
154
     }
155
     OPT("high-tier") p->bHighTier = atobool(value);
156
     OPT("allow-non-conformance") p->bAllowNonConformance = atobool(value);
157
@@ -608,6 +626,7 @@
158
     OPT2("constrained-intra", "cip") p->bEnableConstrainedIntra = atobool(value);
159
     OPT("fast-intra") p->bEnableFastIntra = atobool(value);
160
     OPT("open-gop") p->bOpenGOP = atobool(value);
161
+    OPT("intra-refresh") p->bIntraRefresh = atobool(value);
162
     OPT("lookahead-slices") p->lookaheadSlices = atoi(value);
163
     OPT("scenecut")
164
     {
165
@@ -644,6 +663,7 @@
166
     }
167
     OPT("ref") p->maxNumReferences = atoi(value);
168
     OPT("limit-refs") p->limitReferences = atoi(value);
169
+    OPT("limit-modes") p->limitModes = atobool(value);
170
     OPT("weightp") p->bEnableWeightedPred = atobool(value);
171
     OPT("weightb") p->bEnableWeightedBiPred = atobool(value);
172
     OPT("cbqpoffs") p->cbQpOffset = atoi(value);
173
@@ -854,7 +874,9 @@
174
     OPT("analysis-file") p->analysisFileName = strdup(value);
175
     OPT("qg-size") p->rc.qgSize = atoi(value);
176
     OPT("master-display") p->masteringDisplayColorVolume = strdup(value);
177
-    OPT("max-cll") p->contentLightLevelInfo = strdup(value);
178
+    OPT("max-cll") bError |= sscanf(value, "%hu,%hu", &p->maxCLL, &p->maxFALL) != 2;
179
+    OPT("min-luma") p->minLuma = (uint16_t)atoi(value);
180
+    OPT("max-luma") p->maxLuma = (uint16_t)atoi(value);
181
     else
182
         return X265_PARAM_BAD_NAME;
183
 #undef OPT
184
@@ -1035,6 +1057,8 @@
185
           "subme must be greater than or equal to 0");
186
     CHECK(param->limitReferences > 3,
187
           "limitReferences must be 0, 1, 2 or 3");
188
+    CHECK(param->limitModes > 1,
189
+          "limitRectAmp must be 0, 1");
190
     CHECK(param->frameNumThreads < 0 || param->frameNumThreads > X265_MAX_FRAME_THREADS,
191
           "frameNumThreads (--frame-threads) must be [0 .. X265_MAX_FRAME_THREADS)");
192
     CHECK(param->cbQpOffset < -12, "Min. Chroma Cb QP Offset is -12");
193
@@ -1063,8 +1087,8 @@
194
 
195
     CHECK(param->sourceWidth < (int)param->maxCUSize || param->sourceHeight < (int)param->maxCUSize,
196
           "Picture size must be at least one CTU");
197
-    CHECK(param->internalCsp < X265_CSP_I420 || X265_CSP_I444 < param->internalCsp,
198
-          "Color space must be i420, i422, or i444");
199
+    CHECK(param->internalCsp < X265_CSP_I400 || X265_CSP_I444 < param->internalCsp,
200
+          "chroma subsampling must be i400 (4:0:0 monochrome), i420 (4:2:0 default), i422 (4:2:0), i444 (4:4:4)");
201
x265_1.8.tar.gz/source/common/picyuv.cpp -> x265_1.9.tar.gz/source/common/picyuv.cpp Changed
201
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2015 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -42,6 +43,9 @@
10
     m_cuOffsetC = NULL;
11
     m_buOffsetY = NULL;
12
     m_buOffsetC = NULL;
13
+
14
+    m_maxLumaLevel = 0;
15
+    m_avgLumaLevel = 0;
16
 }
17
 
18
 bool PicYuv::create(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp)
19
@@ -59,20 +63,27 @@
20
     m_lumaMarginY = g_maxCUSize + 16; // margin for 8-tap filter and infinite padding
21
     m_stride = (numCuInWidth * g_maxCUSize) + (m_lumaMarginX << 1);
22
 
23
-    m_chromaMarginX = m_lumaMarginX;  // keep 16-byte alignment for chroma CTUs
24
-    m_chromaMarginY = m_lumaMarginY >> m_vChromaShift;
25
-
26
-    m_strideC = ((numCuInWidth * g_maxCUSize) >> m_hChromaShift) + (m_chromaMarginX * 2);
27
     int maxHeight = numCuInHeight * g_maxCUSize;
28
-
29
     CHECKED_MALLOC(m_picBuf[0], pixel, m_stride * (maxHeight + (m_lumaMarginY * 2)));
30
-    CHECKED_MALLOC(m_picBuf[1], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
31
-    CHECKED_MALLOC(m_picBuf[2], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
32
+    m_picOrg[0] = m_picBuf[0] + m_lumaMarginY * m_stride + m_lumaMarginX;
33
+
34
+    if (picCsp != X265_CSP_I400)
35
+    {
36
+        m_chromaMarginX = m_lumaMarginX;  // keep 16-byte alignment for chroma CTUs
37
+        m_chromaMarginY = m_lumaMarginY >> m_vChromaShift;
38
+        m_strideC = ((numCuInWidth * g_maxCUSize) >> m_hChromaShift) + (m_chromaMarginX * 2);
39
 
40
-    m_picOrg[0] = m_picBuf[0] + m_lumaMarginY   * m_stride  + m_lumaMarginX;
41
-    m_picOrg[1] = m_picBuf[1] + m_chromaMarginY * m_strideC + m_chromaMarginX;
42
-    m_picOrg[2] = m_picBuf[2] + m_chromaMarginY * m_strideC + m_chromaMarginX;
43
+        CHECKED_MALLOC(m_picBuf[1], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
44
+        CHECKED_MALLOC(m_picBuf[2], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
45
 
46
+        m_picOrg[1] = m_picBuf[1] + m_chromaMarginY * m_strideC + m_chromaMarginX;
47
+        m_picOrg[2] = m_picBuf[2] + m_chromaMarginY * m_strideC + m_chromaMarginX;
48
+    }
49
+    else
50
+    {
51
+        m_picBuf[1] = m_picBuf[2] = NULL;
52
+        m_picOrg[1] = m_picOrg[2] = NULL;
53
+    }
54
     return true;
55
 
56
 fail:
57
@@ -85,27 +96,45 @@
58
 bool PicYuv::createOffsets(const SPS& sps)
59
 {
60
     uint32_t numPartitions = 1 << (g_unitSizeDepth * 2);
61
-    CHECKED_MALLOC(m_cuOffsetY, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
62
-    CHECKED_MALLOC(m_cuOffsetC, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
63
-    for (uint32_t cuRow = 0; cuRow < sps.numCuInHeight; cuRow++)
64
+
65
+    if (m_picCsp != X265_CSP_I400)
66
     {
67
-        for (uint32_t cuCol = 0; cuCol < sps.numCuInWidth; cuCol++)
68
+        CHECKED_MALLOC(m_cuOffsetY, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
69
+        CHECKED_MALLOC(m_cuOffsetC, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
70
+        for (uint32_t cuRow = 0; cuRow < sps.numCuInHeight; cuRow++)
71
         {
72
-            m_cuOffsetY[cuRow * sps.numCuInWidth + cuCol] = m_stride * cuRow * g_maxCUSize + cuCol * g_maxCUSize;
73
-            m_cuOffsetC[cuRow * sps.numCuInWidth + cuCol] = m_strideC * cuRow * (g_maxCUSize >> m_vChromaShift) + cuCol * (g_maxCUSize >> m_hChromaShift);
74
+            for (uint32_t cuCol = 0; cuCol < sps.numCuInWidth; cuCol++)
75
+            {
76
+                m_cuOffsetY[cuRow * sps.numCuInWidth + cuCol] = m_stride * cuRow * g_maxCUSize + cuCol * g_maxCUSize;
77
+                m_cuOffsetC[cuRow * sps.numCuInWidth + cuCol] = m_strideC * cuRow * (g_maxCUSize >> m_vChromaShift) + cuCol * (g_maxCUSize >> m_hChromaShift);
78
+            }
79
         }
80
-    }
81
 
82
-    CHECKED_MALLOC(m_buOffsetY, intptr_t, (size_t)numPartitions);
83
-    CHECKED_MALLOC(m_buOffsetC, intptr_t, (size_t)numPartitions);
84
-    for (uint32_t idx = 0; idx < numPartitions; ++idx)
85
-    {
86
-        intptr_t x = g_zscanToPelX[idx];
87
-        intptr_t y = g_zscanToPelY[idx];
88
-        m_buOffsetY[idx] = m_stride * y + x;
89
-        m_buOffsetC[idx] = m_strideC * (y >> m_vChromaShift) + (x >> m_hChromaShift);
90
+        CHECKED_MALLOC(m_buOffsetY, intptr_t, (size_t)numPartitions);
91
+        CHECKED_MALLOC(m_buOffsetC, intptr_t, (size_t)numPartitions);
92
+        for (uint32_t idx = 0; idx < numPartitions; ++idx)
93
+        {
94
+            intptr_t x = g_zscanToPelX[idx];
95
+            intptr_t y = g_zscanToPelY[idx];
96
+            m_buOffsetY[idx] = m_stride * y + x;
97
+            m_buOffsetC[idx] = m_strideC * (y >> m_vChromaShift) + (x >> m_hChromaShift);
98
+        }
99
     }
100
+    else
101
+    {
102
+        CHECKED_MALLOC(m_cuOffsetY, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
103
+        for (uint32_t cuRow = 0; cuRow < sps.numCuInHeight; cuRow++)
104
+        for (uint32_t cuCol = 0; cuCol < sps.numCuInWidth; cuCol++)
105
+            m_cuOffsetY[cuRow * sps.numCuInWidth + cuCol] = m_stride * cuRow * g_maxCUSize + cuCol * g_maxCUSize;
106
 
107
+        CHECKED_MALLOC(m_buOffsetY, intptr_t, (size_t)numPartitions);
108
+        for (uint32_t idx = 0; idx < numPartitions; ++idx)
109
+        {
110
+            intptr_t x = g_zscanToPelX[idx];
111
+            intptr_t y = g_zscanToPelY[idx];
112
+            m_buOffsetY[idx] = m_stride * y + x;
113
+        }
114
+    }
115
     return true;
116
 
117
 fail:
118
@@ -121,7 +150,7 @@
119
 
120
 /* Copy pixels from an x265_picture into internal PicYuv instance.
121
  * Shift pixels as necessary, mask off bits above X265_DEPTH for safety. */
122
-void PicYuv::copyFromPicture(const x265_picture& pic, int padx, int pady)
123
+void PicYuv::copyFromPicture(const x265_picture& pic, const x265_param& param, int padx, int pady)
124
 {
125
     /* m_picWidth is the width that is being encoded, padx indicates how many
126
      * of those pixels are padding to reach multiple of MinCU(4) size.
127
@@ -155,28 +184,29 @@
128
 #if (X265_DEPTH > 8)
129
         {
130
             pixel *yPixel = m_picOrg[0];
131
-            pixel *uPixel = m_picOrg[1];
132
-            pixel *vPixel = m_picOrg[2];
133
 
134
             uint8_t *yChar = (uint8_t*)pic.planes[0];
135
-            uint8_t *uChar = (uint8_t*)pic.planes[1];
136
-            uint8_t *vChar = (uint8_t*)pic.planes[2];
137
             int shift = (X265_DEPTH - 8);
138
 
139
             primitives.planecopy_cp(yChar, pic.stride[0] / sizeof(*yChar), yPixel, m_stride, width, height, shift);
140
-            primitives.planecopy_cp(uChar, pic.stride[1] / sizeof(*uChar), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
141
-            primitives.planecopy_cp(vChar, pic.stride[2] / sizeof(*vChar), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
142
+
143
+            if (pic.colorSpace != X265_CSP_I400)
144
+            {
145
+                pixel *uPixel = m_picOrg[1];
146
+                pixel *vPixel = m_picOrg[2];
147
+
148
+                uint8_t *uChar = (uint8_t*)pic.planes[1];
149
+                uint8_t *vChar = (uint8_t*)pic.planes[2];
150
+
151
+                primitives.planecopy_cp(uChar, pic.stride[1] / sizeof(*uChar), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
152
+                primitives.planecopy_cp(vChar, pic.stride[2] / sizeof(*vChar), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
153
+            }
154
         }
155
 #else /* Case for (X265_DEPTH == 8) */
156
         // TODO: Does we need this path? may merge into above in future
157
         {
158
             pixel *yPixel = m_picOrg[0];
159
-            pixel *uPixel = m_picOrg[1];
160
-            pixel *vPixel = m_picOrg[2];
161
-
162
             uint8_t *yChar = (uint8_t*)pic.planes[0];
163
-            uint8_t *uChar = (uint8_t*)pic.planes[1];
164
-            uint8_t *vChar = (uint8_t*)pic.planes[2];
165
 
166
             for (int r = 0; r < height; r++)
167
             {
168
@@ -186,15 +216,24 @@
169
                 yChar += pic.stride[0] / sizeof(*yChar);
170
             }
171
 
172
-            for (int r = 0; r < height >> m_vChromaShift; r++)
173
+            if (pic.colorSpace != X265_CSP_I400)
174
             {
175
-                memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
176
-                memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
177
+                pixel *uPixel = m_picOrg[1];
178
+                pixel *vPixel = m_picOrg[2];
179
+
180
+                uint8_t *uChar = (uint8_t*)pic.planes[1];
181
+                uint8_t *vChar = (uint8_t*)pic.planes[2];
182
+
183
+                for (int r = 0; r < height >> m_vChromaShift; r++)
184
+                {
185
+                    memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
186
+                    memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
187
 
188
-                uPixel += m_strideC;
189
-                vPixel += m_strideC;
190
-                uChar += pic.stride[1] / sizeof(*uChar);
191
-                vChar += pic.stride[2] / sizeof(*vChar);
192
+                    uPixel += m_strideC;
193
+                    vPixel += m_strideC;
194
+                    uChar += pic.stride[1] / sizeof(*uChar);
195
+                    vChar += pic.stride[2] / sizeof(*vChar);
196
+                }
197
             }
198
         }
199
 #endif /* (X265_DEPTH > 8) */
200
@@ -205,43 +244,63 @@
201
x265_1.8.tar.gz/source/common/picyuv.h -> x265_1.9.tar.gz/source/common/picyuv.h Changed
19
 
1
@@ -60,13 +60,16 @@
2
     uint32_t m_chromaMarginX;
3
     uint32_t m_chromaMarginY;
4
 
5
+    uint16_t m_maxLumaLevel;
6
+    double   m_avgLumaLevel;
7
+
8
     PicYuv();
9
 
10
     bool  create(uint32_t picWidth, uint32_t picHeight, uint32_t csp);
11
     bool  createOffsets(const SPS& sps);
12
     void  destroy();
13
 
14
-    void  copyFromPicture(const x265_picture&, int padx, int pady);
15
+    void  copyFromPicture(const x265_picture&, const x265_param& param, int padx, int pady);
16
 
17
     intptr_t getChromaAddrOffset(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
18
 
19
x265_1.8.tar.gz/source/common/pixel.cpp -> x265_1.9.tar.gz/source/common/pixel.cpp Changed
201
 
1
@@ -25,6 +25,7 @@
2
  *****************************************************************************/
3
 
4
 #include "common.h"
5
+#include "slicetype.h"      // LOWRES_COST_MASK
6
 #include "primitives.h"
7
 #include "x265.h"
8
 
9
@@ -117,9 +118,9 @@
10
 }
11
 
12
 template<int lx, int ly, class T1, class T2>
13
-sse_ret_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
14
+sse_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
15
 {
16
-    sse_ret_t sum = 0;
17
+    sse_t sum = 0;
18
     int tmp;
19
 
20
     for (int y = 0; y < ly; y++)
21
@@ -187,37 +188,6 @@
22
     return (int)(sum >> 1);
23
 }
24
 
25
-static int satd_4x4(const int16_t* pix1, intptr_t stride_pix1)
26
-{
27
-    int32_t tmp[4][4];
28
-    int32_t s01, s23, d01, d23;
29
-    int32_t satd = 0;
30
-    int d;
31
-
32
-    for (d = 0; d < 4; d++, pix1 += stride_pix1)
33
-    {
34
-        s01 = pix1[0] + pix1[1];
35
-        s23 = pix1[2] + pix1[3];
36
-        d01 = pix1[0] - pix1[1];
37
-        d23 = pix1[2] - pix1[3];
38
-
39
-        tmp[d][0] = s01 + s23;
40
-        tmp[d][1] = s01 - s23;
41
-        tmp[d][2] = d01 - d23;
42
-        tmp[d][3] = d01 + d23;
43
-    }
44
-
45
-    for (d = 0; d < 4; d++)
46
-    {
47
-        s01 = tmp[0][d] + tmp[1][d];
48
-        s23 = tmp[2][d] + tmp[3][d];
49
-        d01 = tmp[0][d] - tmp[1][d];
50
-        d23 = tmp[2][d] - tmp[3][d];
51
-        satd += abs(s01 + s23) + abs(s01 - s23) + abs(d01 - d23) + abs(d01 + d23);
52
-    }
53
-    return (int)(satd / 2);
54
-}
55
-
56
 // x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once
57
 static int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
58
 {
59
@@ -313,57 +283,6 @@
60
     return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
61
 }
62
 
63
-inline int _sa8d_8x8(const int16_t* pix1, intptr_t i_pix1)
64
-{
65
-    int32_t tmp[8][8];
66
-    int32_t a0, a1, a2, a3, a4, a5, a6, a7;
67
-    int32_t sum = 0;
68
-
69
-    for (int i = 0; i < 8; i++, pix1 += i_pix1)
70
-    {
71
-        a0 = pix1[0] + pix1[1];
72
-        a1 = pix1[2] + pix1[3];
73
-        a2 = pix1[4] + pix1[5];
74
-        a3 = pix1[6] + pix1[7];
75
-        a4 = pix1[0] - pix1[1];
76
-        a5 = pix1[2] - pix1[3];
77
-        a6 = pix1[4] - pix1[5];
78
-        a7 = pix1[6] - pix1[7];
79
-        tmp[i][0] = (a0 + a1) + (a2 + a3);
80
-        tmp[i][1] = (a0 + a1) - (a2 + a3);
81
-        tmp[i][2] = (a0 - a1) + (a2 - a3);
82
-        tmp[i][3] = (a0 - a1) - (a2 - a3);
83
-        tmp[i][4] = (a4 + a5) + (a6 + a7);
84
-        tmp[i][5] = (a4 + a5) - (a6 + a7);
85
-        tmp[i][6] = (a4 - a5) + (a6 - a7);
86
-        tmp[i][7] = (a4 - a5) - (a6 - a7);
87
-    }
88
-
89
-    for (int i = 0; i < 8; i++)
90
-    {
91
-        a0 = (tmp[0][i] + tmp[1][i]) + (tmp[2][i] + tmp[3][i]);
92
-        a2 = (tmp[0][i] + tmp[1][i]) - (tmp[2][i] + tmp[3][i]);
93
-        a1 = (tmp[0][i] - tmp[1][i]) + (tmp[2][i] - tmp[3][i]);
94
-        a3 = (tmp[0][i] - tmp[1][i]) - (tmp[2][i] - tmp[3][i]);
95
-        a4 = (tmp[4][i] + tmp[5][i]) + (tmp[6][i] + tmp[7][i]);
96
-        a6 = (tmp[4][i] + tmp[5][i]) - (tmp[6][i] + tmp[7][i]);
97
-        a5 = (tmp[4][i] - tmp[5][i]) + (tmp[6][i] - tmp[7][i]);
98
-        a7 = (tmp[4][i] - tmp[5][i]) - (tmp[6][i] - tmp[7][i]);
99
-        a0 = abs(a0 + a4) + abs(a0 - a4);
100
-        a0 += abs(a1 + a5) + abs(a1 - a5);
101
-        a0 += abs(a2 + a6) + abs(a2 - a6);
102
-        a0 += abs(a3 + a7) + abs(a3 - a7);
103
-        sum += a0;
104
-    }
105
-
106
-    return (int)sum;
107
-}
108
-
109
-static int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1)
110
-{
111
-    return (int)((_sa8d_8x8(pix1, i_pix1) + 2) >> 2);
112
-}
113
-
114
 static int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
115
 {
116
     int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
117
@@ -403,9 +322,9 @@
118
 }
119
 
120
 template<int size>
121
-int pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
122
+sse_t pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
123
 {
124
-    int sum = 0;
125
+    sse_t sum = 0;
126
     for (int y = 0; y < size; y++)
127
     {
128
         for (int x = 0; x < size; x++)
129
@@ -783,39 +702,6 @@
130
     }
131
 }
132
 
133
-template<int size>
134
-int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
135
-{
136
-    static int16_t zeroBuf[8] /* = { 0 } */;
137
-
138
-    if (size)
139
-    {
140
-        int dim = 1 << (size + 2);
141
-        uint32_t totEnergy = 0;
142
-        for (int i = 0; i < dim; i += 8)
143
-        {
144
-            for (int j = 0; j < dim; j+= 8)
145
-            {
146
-                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
147
-                int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride) - 
148
-                                   (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
149
-                int reconEnergy =  sa8d_8x8(recon + i * rstride + j, rstride) - 
150
-                                   (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);
151
-
152
-                totEnergy += abs(sourceEnergy - reconEnergy);
153
-            }
154
-        }
155
-        return totEnergy;
156
-    }
157
-    else
158
-    {
159
-        /* 4x4 is too small for sa8d */
160
-        int sourceEnergy = satd_4x4(source, sstride) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
161
-        int reconEnergy = satd_4x4(recon, rstride) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
162
-        return abs(sourceEnergy - reconEnergy);
163
-    }
164
-}
165
-
166
 template<int bx, int by>
167
 void blockcopy_pp_c(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb)
168
 {
169
@@ -960,19 +846,57 @@
170
 /* Estimate the total amount of influence on future quality that could be had if we
171
  * were to improve the reference samples used to inter predict any given CU. */
172
 static void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
173
-                             const int32_t* invQscales, const double* fpsFactor, int len)
174
+                                    const int32_t* invQscales, const double* fpsFactor, int len)
175
 {
176
-    double fps = *fpsFactor / 256;
177
+    double fps = *fpsFactor / 256;  // range[0.01, 1.00]
178
 
179
     for (int i = 0; i < len; i++)
180
     {
181
-        double intraCost       = intraCosts[i] * invQscales[i];
182
-        double propagateAmount = (double)propagateIn[i] + intraCost * fps;
183
-        double propagateNum    = (double)intraCosts[i] - (interCosts[i] & ((1 << 14) - 1));
184
-        double propagateDenom  = (double)intraCosts[i];
185
+        int intraCost = intraCosts[i];
186
+        int interCost = X265_MIN(intraCosts[i], interCosts[i] & LOWRES_COST_MASK);
187
+        double propagateIntra  = intraCost * invQscales[i]; // Q16 x Q8.8 = Q24.8
188
+        double propagateAmount = (double)propagateIn[i] + propagateIntra * fps; // Q16.0 + Q24.8 x Q0.x = Q25.0
189
+        double propagateNum    = (double)(intraCost - interCost); // Q32 - Q32 = Q33.0
190
+
191
+#if 0
192
+        // algorithm that output match to asm
193
+        float intraRcp = (float)1.0f / intraCost;   // VC can't mapping this into RCPPS
194
+        float intraRcpError1 = (float)intraCost * (float)intraRcp;
195
+        intraRcpError1 *= (float)intraRcp;
196
+        float intraRcpError2 = intraRcp + intraRcp;
197
+        float propagateDenom = intraRcpError2 - intraRcpError1;
198
+        dst[i] = (int)(propagateAmount * propagateNum * (double)propagateDenom + 0.5);
199
+#else
200
+        double propagateDenom  = (double)intraCost;             // Q32
201
x265_1.8.tar.gz/source/common/predict.cpp -> x265_1.9.tar.gz/source/common/predict.cpp Changed
147
 
1
@@ -2,6 +2,7 @@
2
 * Copyright (C) 2013 x265 project
3
 *
4
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -98,7 +99,7 @@
10
 
11
         if (cu.m_slice->m_pps->bUseWeightPred && wp0->bPresentFlag)
12
         {
13
-            for (int plane = 0; plane < 3; plane++)
14
+            for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
15
             {
16
                 wv0[plane].w      = wp0[plane].inputWeight;
17
                 wv0[plane].offset = wp0[plane].inputOffset * (1 << (X265_DEPTH - 8));
18
@@ -109,18 +110,18 @@
19
             ShortYuv& shortYuv = m_predShortYuv[0];
20
 
21
             if (bLuma)
22
-                predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
23
+                predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
24
             if (bChroma)
25
-                predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
26
+                predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
27
 
28
             addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
29
         }
30
         else
31
         {
32
             if (bLuma)
33
-                predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
34
+                predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
35
             if (bChroma)
36
-                predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
37
+                predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
38
         }
39
     }
40
     else
41
@@ -141,7 +142,7 @@
42
             if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
43
             {
44
                 /* biprediction weighting */
45
-                for (int plane = 0; plane < 3; plane++)
46
+                for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
47
                 {
48
                     wv0[plane].w = pwp0[plane].inputWeight;
49
                     wv0[plane].o = pwp0[plane].inputOffset * (1 << (X265_DEPTH - 8));
50
@@ -158,7 +159,7 @@
51
             {
52
                 /* uniprediction weighting, always outputs to wv0 */
53
                 const WeightParam* pwp = (refIdx0 >= 0) ? pwp0 : pwp1;
54
-                for (int plane = 0; plane < 3; plane++)
55
+                for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
56
                 {
57
                     wv0[plane].w = pwp[plane].inputWeight;
58
                     wv0[plane].offset = pwp[plane].inputOffset * (1 << (X265_DEPTH - 8));
59
@@ -179,13 +180,13 @@
60
 
61
             if (bLuma)
62
             {
63
-                predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
64
-                predInterLumaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
65
+                predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
66
+                predInterLumaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
67
             }
68
             if (bChroma)
69
             {
70
-                predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
71
-                predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
72
+                predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
73
+                predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
74
             }
75
 
76
             if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
77
@@ -203,18 +204,18 @@
78
                 ShortYuv& shortYuv = m_predShortYuv[0];
79
 
80
                 if (bLuma)
81
-                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
82
+                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
83
                 if (bChroma)
84
-                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
85
+                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
86
 
87
                 addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
88
             }
89
             else
90
             {
91
                 if (bLuma)
92
-                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
93
+                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
94
                 if (bChroma)
95
-                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
96
+                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
97
             }
98
         }
99
         else
100
@@ -230,18 +231,18 @@
101
                 ShortYuv& shortYuv = m_predShortYuv[0];
102
 
103
                 if (bLuma)
104
-                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
105
+                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
106
                 if (bChroma)
107
-                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
108
+                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
109
 
110
                 addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
111
             }
112
             else
113
             {
114
                 if (bLuma)
115
-                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
116
+                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
117
                 if (bChroma)
118
-                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
119
+                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
120
             }
121
         }
122
     }
123
@@ -600,8 +601,9 @@
124
     int tuSize = 1 << intraNeighbors.log2TrSize;
125
     int tuSize2 = tuSize << 1;
126
 
127
-    pixel* adiOrigin = cu.m_encData->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
128
-    intptr_t picStride = cu.m_encData->m_reconPic->m_stride;
129
+    PicYuv* reconPic = cu.m_encData->m_reconPic;
130
+    pixel* adiOrigin = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
131
+    intptr_t picStride = reconPic->m_stride;
132
 
133
     fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
134
 
135
@@ -648,8 +650,9 @@
136
 
137
 void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId)
138
 {
139
-    const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
140
-    intptr_t picStride = cu.m_encData->m_reconPic->m_strideC;
141
+    PicYuv* reconPic = cu.m_encData->m_reconPic;
142
+    const pixel* adiOrigin = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
143
+    intptr_t picStride = reconPic->m_strideC;
144
 
145
     fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
146
 
147
x265_1.8.tar.gz/source/common/predict.h -> x265_1.9.tar.gz/source/common/predict.h Changed
9
 
1
@@ -2,6 +2,7 @@
2
 * Copyright (C) 2013 x265 project
3
 *
4
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/primitives.h -> x265_1.9.tar.gz/source/common/primitives.h Changed
69
 
1
@@ -112,9 +112,9 @@
2
 
3
 typedef int  (*pixelcmp_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
4
 typedef int  (*pixelcmp_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
5
-typedef sse_ret_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
6
-typedef sse_ret_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
7
-typedef int  (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
8
+typedef sse_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
9
+typedef sse_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
10
+typedef sse_t (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
11
 typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
12
 typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
13
 typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val);
14
@@ -176,15 +176,16 @@
15
 typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
16
 typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
17
 
18
-typedef void (*saoCuStatsBO_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
19
-typedef void (*saoCuStatsE0_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
20
-typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
21
-typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
22
-typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
23
+typedef void (*saoCuStatsBO_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
24
+typedef void (*saoCuStatsE0_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
25
+typedef void (*saoCuStatsE1_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
26
+typedef void (*saoCuStatsE2_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
27
+typedef void (*saoCuStatsE3_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
28
 
29
 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
30
 typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
31
 typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
32
+typedef pixel (*planeClipAndMax_t)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
33
 
34
 typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
35
 
36
@@ -195,6 +196,8 @@
37
 typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero, int idx);
38
 typedef uint32_t (*costC1C2Flag_t)(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset);
39
 
40
+typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
41
+
42
 /* Function pointers to optimized encoder primitives. Each pointer can reference
43
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
44
 struct EncoderPrimitives
45
@@ -259,7 +262,6 @@
46
         pixel_sse_t     sse_pp;        // Sum of Square Error (pixel, pixel) fenc alignment not assumed
47
         pixel_sse_ss_t  sse_ss;        // Sum of Square Error (short, short) fenc alignment not assumed
48
         pixelcmp_t      psy_cost_pp;   // difference in AC energy between two pixel blocks
49
-        pixelcmp_ss_t   psy_cost_ss;   // difference in AC energy between two signed residual blocks
50
         pixel_ssd_s_t   ssd_s;         // Sum of Square Error (residual coeff to self)
51
         pixelcmp_t      sa8d;          // Sum of Transformed Differences (8x8 Hadamard), uses satd for 4x4 intra TU
52
 
53
@@ -316,6 +318,7 @@
54
     planecopy_cp_t        planecopy_cp;
55
     planecopy_sp_t        planecopy_sp;
56
     planecopy_sp_t        planecopy_sp_shl;
57
+    planeClipAndMax_t     planeClipAndMax;
58
 
59
     weightp_sp_t          weight_sp;
60
     weightp_pp_t          weight_pp;
61
@@ -328,6 +331,7 @@
62
     costCoeffRemain_t     costCoeffRemain;
63
     costC1C2Flag_t        costC1C2Flag;
64
 
65
+    pelFilterLumaStrong_t pelFilterLumaStrong[2]; // EDGE_VER = 0, EDGE_HOR = 1
66
 
67
     /* There is one set of chroma primitives per color space. An encoder will
68
      * have just a single color space and thus it will only ever use one entry
69
x265_1.8.tar.gz/source/common/quant.cpp -> x265_1.9.tar.gz/source/common/quant.cpp Changed
201
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2015 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -50,9 +51,8 @@
10
     return y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // min(x, y)
11
 }
12
 
13
-inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, const uint32_t absGoRice, const uint32_t maxVlc, uint32_t c1c2Idx)
14
+inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, const uint32_t absGoRice, const uint32_t maxVlc, const uint32_t c1c2Rate)
15
 {
16
-    X265_CHECK(c1c2Idx <= 3, "c1c2Idx check failure\n");
17
     X265_CHECK(absGoRice <= 4, "absGoRice check failure\n");
18
     if (!absLevel)
19
     {
20
@@ -94,12 +94,7 @@
21
         uint32_t numBins = fastMin(prefLen + absGoRice, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */);
22
 
23
         rate += numBins << 15;
24
-
25
-        if (c1c2Idx & 1)
26
-            rate += greaterOneBits[1];
27
-
28
-        if (c1c2Idx == 3)
29
-            rate += levelAbsBits[1];
30
+        rate += c1c2Rate;
31
     }
32
     return rate;
33
 }
34
@@ -140,7 +135,7 @@
35
 }
36
 
37
 /* Calculates the cost for specific absolute transform level */
38
-inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
39
+inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, const uint32_t c1c2Rate)
40
 {
41
     X265_CHECK(absLevel, "absLevel should not be zero\n");
42
 
43
@@ -175,16 +170,15 @@
44
 
45
             rate = (COEF_REMAIN_BIN_REDUCTION + length + absGoRice + 1 + length) << 15;
46
         }
47
-        if (c1c2Idx & 1)
48
-            rate += greaterOneBits[1];
49
-        if (c1c2Idx == 3)
50
-            rate += levelAbsBits[1];
51
+        rate += c1c2Rate;
52
         return rate;
53
     }
54
 }
55
 
56
 }
57
 
58
+Quant::rdoQuant_t Quant::rdoQuant_func[NUM_CU_DEPTH] = {&Quant::rdoQuant<2>, &Quant::rdoQuant<3>, &Quant::rdoQuant<4>, &Quant::rdoQuant<5>};
59
+
60
 Quant::Quant()
61
 {
62
     m_resiDctCoeff = NULL;
63
@@ -229,8 +223,11 @@
64
 {
65
     m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL;
66
     m_qpParam[TEXT_LUMA].setQpParam(qp + QP_BD_OFFSET);
67
-    setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
68
-    setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat);
69
+    if (ctu.m_chromaFormat != X265_CSP_I400)
70
+    {
71
+        setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
72
+        setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat);
73
+    }
74
 }
75
 
76
 void Quant::setChromaQP(int qpin, TextType ttype, int chFmt)
77
@@ -444,18 +441,18 @@
78
             primitives.cu[sizeIdx].dct(m_fencShortBuf, m_fencDctCoeff, trSize);
79
         }
80
 
81
-        if (m_nr)
82
+        if (m_nr && m_nr->offset)
83
         {
84
             /* denoise is not applied to intra residual, so DST can be ignored */
85
             int cat = sizeIdx + 4 * !isLuma + 8 * !isIntra;
86
             int numCoeff = 1 << (log2TrSize * 2);
87
-            primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff);
88
+            primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offset[cat], numCoeff);
89
             m_nr->count[cat]++;
90
         }
91
     }
92
 
93
     if (m_rdoqLevel)
94
-        return rdoQuant(cu, coeff, log2TrSize, ttype, absPartIdx, usePsy);
95
+        return (this->*rdoQuant_func[log2TrSize - 2])(cu, coeff, ttype, absPartIdx, usePsy);
96
     else
97
     {
98
         int deltaU[32 * 32];
99
@@ -550,9 +547,10 @@
100
 
101
 /* Rate distortion optimized quantization for entropy coding engines using
102
  * probability models like CABAC */
103
-uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy)
104
+template<uint32_t log2TrSize>
105
+uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy)
106
 {
107
-    int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
108
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
109
     int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
110
     const uint32_t usePsyMask = usePsy ? -1 : 0;
111
 
112
@@ -564,13 +562,13 @@
113
     int add = (1 << (qbits - 1));
114
     const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
115
 
116
-    int numCoeff = 1 << (log2TrSize * 2);
117
+    const int numCoeff = 1 << (log2TrSize * 2);
118
     uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff);
119
     X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n");
120
     if (!numSig)
121
         return 0;
122
 
123
-    uint32_t trSize = 1 << log2TrSize;
124
+    const uint32_t trSize = 1 << log2TrSize;
125
     int64_t lambda2 = m_qpParam[ttype].lambda2;
126
     const int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);
127
 
128
@@ -580,20 +578,20 @@
129
     const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
130
     int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0);
131
     int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0;
132
-    int scaleBits = SCALE_BITS - 2 * transformShift;
133
+    const int scaleBits = SCALE_BITS - 2 * transformShift;
134
 
135
 #define UNQUANT(lvl)    (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift)
136
 #define SIGCOST(bits)   ((lambda2 * (bits)) >> 8)
137
 #define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits))
138
 #define PSYVALUE(rec)   ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1)))
139
 
140
-    int64_t costCoeff[32 * 32];   /* d*d + lambda * bits */
141
-    int64_t costUncoded[32 * 32]; /* d*d + lambda * 0    */
142
-    int64_t costSig[32 * 32];     /* lambda * bits       */
143
+    int64_t costCoeff[trSize * trSize];   /* d*d + lambda * bits */
144
+    int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0    */
145
+    int64_t costSig[trSize * trSize];     /* lambda * bits       */
146
 
147
-    int rateIncUp[32 * 32];      /* signal overhead of increasing level */
148
-    int rateIncDown[32 * 32];    /* signal overhead of decreasing level */
149
-    int sigRateDelta[32 * 32];   /* signal difference between zero and non-zero */
150
+    int rateIncUp[trSize * trSize];      /* signal overhead of increasing level */
151
+    int rateIncDown[trSize * trSize];    /* signal overhead of decreasing level */
152
+    int sigRateDelta[trSize * trSize];   /* signal difference between zero and non-zero */
153
 
154
     int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */
155
     uint64_t sigCoeffGroupFlag64 = 0;
156
@@ -611,7 +609,8 @@
157
 
158
     TUEntropyCodingParameters codeParams;
159
     cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma);
160
-    const uint32_t cgNum = 1 << (codeParams.log2TrSizeCG * 2);
161
+    const uint32_t log2TrSizeCG = log2TrSize - 2;
162
+    const uint32_t cgNum = 1 << (log2TrSizeCG * 2);
163
     const uint32_t cgStride = (trSize >> MLS_CG_LOG2_SIZE);
164
 
165
     uint8_t coeffNum[MLS_GRP_NUM];      // value range[0, 16]
166
@@ -742,8 +741,8 @@
167
     {
168
         uint32_t ctxSet = (cgScanPos && bIsLuma) ? 2 : 0;
169
         const uint32_t cgBlkPos = codeParams.scanCG[cgScanPos];
170
-        const uint32_t cgPosY   = cgBlkPos >> codeParams.log2TrSizeCG;
171
-        const uint32_t cgPosX   = cgBlkPos - (cgPosY << codeParams.log2TrSizeCG);
172
+        const uint32_t cgPosY   = cgBlkPos >> log2TrSizeCG;
173
+        const uint32_t cgPosX   = cgBlkPos & ((1 << log2TrSizeCG) - 1);
174
         const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos);
175
         const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
176
         const int ctxSigOffset = codeParams.firstSignificanceMapContext + (cgScanPos && bIsLuma ? 3 : 0);
177
@@ -829,6 +828,7 @@
178
         uint32_t subFlagMask = coeffFlag[cgScanPos];
179
         int    c2            = 0;
180
         uint32_t goRiceParam = 0;
181
+        uint32_t levelThreshold = 3;
182
         uint32_t c1Idx       = 0;
183
         uint32_t c2Idx       = 0;
184
         /* iterate over coefficients in each group in reverse scan order */
185
@@ -836,7 +836,7 @@
186
         {
187
             scanPos              = (cgScanPos << MLS_CG_SIZE) + scanPosinCG;
188
             uint32_t blkPos      = codeParams.scan[scanPos];
189
-            uint32_t maxAbsLevel = abs(dstCoeff[blkPos]);             /* abs(quantized coeff) */
190
+            uint32_t maxAbsLevel = dstCoeff[blkPos];                  /* abs(quantized coeff) */
191
             int signCoef         = m_resiDctCoeff[blkPos];            /* pre-quantization DCT coeff */
192
             int predictedCoef    = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/
193
 
194
@@ -855,7 +855,11 @@
195
 
196
             // coefficient level estimation
197
             const int* greaterOneBits = estBitsSbac.greaterOneBits[4 * ctxSet + c1];
198
-            const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset;
199
+            //const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset;
200
+            static const uint64_t table_cnt64[4] = {0x0000000100110112ULL, 0x0000000011112222ULL, 0x0012001200120012ULL, 0x2222222222222222ULL};
201
x265_1.8.tar.gz/source/common/quant.h -> x265_1.9.tar.gz/source/common/quant.h Changed
72
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2015 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -59,18 +60,18 @@
10
     }
11
 };
12
 
13
-#define MAX_NUM_TR_COEFFS        MAX_TR_SIZE * MAX_TR_SIZE /* Maximum number of transform coefficients, for a 32x32 transform */
14
-#define MAX_NUM_TR_CATEGORIES    16                        /* 32, 16, 8, 4 transform categories each for luma and chroma */
15
-
16
 // NOTE: MUST be 16-byte aligned for asm code
17
 struct NoiseReduction
18
 {
19
     /* 0 = luma 4x4,   1 = luma 8x8,   2 = luma 16x16,   3 = luma 32x32
20
      * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32
21
      * Intra 0..7 - Inter 8..15 */
22
-    ALIGN_VAR_16(uint32_t, residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]);
23
-    uint32_t count[MAX_NUM_TR_CATEGORIES];
24
-    uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
25
+    ALIGN_VAR_16(uint32_t, nrResidualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]);
26
+    uint32_t nrCount[MAX_NUM_TR_CATEGORIES];
27
+    uint16_t nrOffsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
28
+    uint16_t (*offset)[MAX_NUM_TR_COEFFS];
29
+    uint32_t (*residualSum)[MAX_NUM_TR_COEFFS];
30
+    uint32_t *count;
31
 };
32
 
33
 class Quant
34
@@ -125,8 +126,8 @@
35
         const uint32_t sigPos = (uint32_t)(sigCoeffGroupFlag64 >> (cgBlkPos + 1)); // just need lowest 7-bits valid
36
 
37
         // TODO: instruction BT is faster, but _bittest64 still generate instruction 'BT m, r' in VS2012
38
-        const uint32_t sigRight = ((uint32_t)(cgPosX - (trSizeCG - 1)) >> 31) & sigPos;
39
-        const uint32_t sigLower = ((uint32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 1));
40
+        const uint32_t sigRight = (cgPosX != (trSizeCG - 1)) & sigPos;
41
+        const uint32_t sigLower = (cgPosY != (trSizeCG - 1)) & (sigPos >> (trSizeCG - 1));
42
         return sigRight + sigLower * 2;
43
     }
44
 
45
@@ -136,8 +137,8 @@
46
         X265_CHECK(cgBlkPos < 64, "cgBlkPos is too large\n");
47
         // NOTE: unsafe shift operator, see NOTE in calcPatternSigCtx
48
         const uint32_t sigPos = (uint32_t)(cgGroupMask >> (cgBlkPos + 1)); // just need lowest 8-bits valid
49
-        const uint32_t sigRight = ((uint32_t)(cgPosX - (trSizeCG - 1)) >> 31) & sigPos;
50
-        const uint32_t sigLower = ((uint32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 1));
51
+        const uint32_t sigRight = (cgPosX != (trSizeCG - 1)) & sigPos;
52
+        const uint32_t sigLower = (cgPosY != (trSizeCG - 1)) & (sigPos >> (trSizeCG - 1));
53
 
54
         return (sigRight | sigLower);
55
     }
56
@@ -151,7 +152,14 @@
57
 
58
     uint32_t signBitHidingHDQ(int16_t* qcoeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters, uint32_t log2TrSize);
59
 
60
-    uint32_t rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy);
61
+    template<uint32_t log2TrSize>
62
+    uint32_t rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy);
63
+
64
+public:
65
+    typedef uint32_t (Quant::*rdoQuant_t)(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy);
66
+
67
+private:
68
+    static rdoQuant_t rdoQuant_func[NUM_CU_DEPTH];
69
 };
70
 }
71
 
72
x265_1.8.tar.gz/source/common/shortyuv.cpp -> x265_1.9.tar.gz/source/common/shortyuv.cpp Changed
51
 
1
@@ -40,19 +40,26 @@
2
 bool ShortYuv::create(uint32_t size, int csp)
3
 {
4
     m_csp = csp;
5
+    m_size = size;
6
     m_hChromaShift = CHROMA_H_SHIFT(csp);
7
     m_vChromaShift = CHROMA_V_SHIFT(csp);
8
-
9
-    m_size = size;
10
-    m_csize = size >> m_hChromaShift;
11
-
12
     size_t sizeL = size * size;
13
-    size_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
14
-    X265_CHECK((sizeC & 15) == 0, "invalid size");
15
 
16
-    CHECKED_MALLOC(m_buf[0], int16_t, sizeL + sizeC * 2);
17
-    m_buf[1] = m_buf[0] + sizeL;
18
-    m_buf[2] = m_buf[0] + sizeL + sizeC;
19
+    if (csp != X265_CSP_I400)
20
+    {
21
+        m_csize = size >> m_hChromaShift;
22
+        size_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
23
+        X265_CHECK((sizeC & 15) == 0, "invalid size");
24
+
25
+        CHECKED_MALLOC(m_buf[0], int16_t, sizeL + sizeC * 2);
26
+        m_buf[1] = m_buf[0] + sizeL;
27
+        m_buf[2] = m_buf[0] + sizeL + sizeC;
28
+    }
29
+    else
30
+    {
31
+        CHECKED_MALLOC(m_buf[0], int16_t, sizeL);
32
+        m_buf[1] = m_buf[2] = NULL;
33
+    }
34
     return true;
35
 
36
 fail:
37
@@ -75,8 +82,11 @@
38
 {
39
     const int sizeIdx = log2Size - 2;
40
     primitives.cu[sizeIdx].sub_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
41
-    primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
42
-    primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
43
+    if (m_csp != X265_CSP_I400)
44
+    {
45
+        primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
46
+        primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
47
+    }
48
 }
49
 
50
 void ShortYuv::copyPartToPartLuma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const
51
x265_1.8.tar.gz/source/common/slice.cpp -> x265_1.9.tar.gz/source/common/slice.cpp Changed
44
 
1
@@ -33,7 +33,9 @@
2
 {
3
     if (m_sliceType == I_SLICE)
4
     {
5
-        memset(m_refPicList, 0, sizeof(m_refPicList));
6
+        memset(m_refFrameList, 0, sizeof(m_refFrameList));
7
+        memset(m_refReconPicList, 0, sizeof(m_refReconPicList));
8
+        memset(m_refPOCList, 0, sizeof(m_refPOCList));
9
         m_numRefIdx[1] = m_numRefIdx[0] = 0;
10
         return;
11
     }
12
@@ -106,13 +108,13 @@
13
     {
14
         cIdx = rIdx % numPocTotalCurr;
15
         X265_CHECK(cIdx >= 0 && cIdx < numPocTotalCurr, "RPS index check fail\n");
16
-        m_refPicList[0][rIdx] = rpsCurrList0[cIdx];
17
+        m_refFrameList[0][rIdx] = rpsCurrList0[cIdx];
18
     }
19
 
20
     if (m_sliceType != B_SLICE)
21
     {
22
         m_numRefIdx[1] = 0;
23
-        memset(m_refPicList[1], 0, sizeof(m_refPicList[1]));
24
+        memset(m_refFrameList[1], 0, sizeof(m_refFrameList[1]));
25
     }
26
     else
27
     {
28
@@ -120,13 +122,13 @@
29
         {
30
             cIdx = rIdx % numPocTotalCurr;
31
             X265_CHECK(cIdx >= 0 && cIdx < numPocTotalCurr, "RPS index check fail\n");
32
-            m_refPicList[1][rIdx] = rpsCurrList1[cIdx];
33
+            m_refFrameList[1][rIdx] = rpsCurrList1[cIdx];
34
         }
35
     }
36
 
37
     for (int dir = 0; dir < 2; dir++)
38
         for (int numRefIdx = 0; numRefIdx < m_numRefIdx[dir]; numRefIdx++)
39
-            m_refPOCList[dir][numRefIdx] = m_refPicList[dir][numRefIdx]->m_poc;
40
+            m_refPOCList[dir][numRefIdx] = m_refFrameList[dir][numRefIdx]->m_poc;
41
 }
42
 
43
 void Slice::disableWeights()
44
x265_1.8.tar.gz/source/common/slice.h -> x265_1.9.tar.gz/source/common/slice.h Changed
201
 
1
@@ -31,6 +31,7 @@
2
 
3
 class Frame;
4
 class PicList;
5
+class PicYuv;
6
 class MotionReference;
7
 
8
 enum SliceType
9
@@ -104,6 +105,12 @@
10
 
11
 struct ProfileTierLevel
12
 {
13
+    int      profileIdc;
14
+    int      levelIdc;
15
+    uint32_t minCrForLevel;
16
+    uint32_t maxLumaSrForLevel;
17
+    uint32_t bitDepthConstraint;
18
+    int      chromaFormatConstraint;
19
     bool     tierFlag;
20
     bool     progressiveSourceFlag;
21
     bool     interlacedSourceFlag;
22
@@ -113,12 +120,6 @@
23
     bool     intraConstraintFlag;
24
     bool     onePictureOnlyConstraintFlag;
25
     bool     lowerBitRateConstraintFlag;
26
-    int      profileIdc;
27
-    int      levelIdc;
28
-    uint32_t minCrForLevel;
29
-    uint32_t maxLumaSrForLevel;
30
-    uint32_t bitDepthConstraint;
31
-    int      chromaFormatConstraint;
32
 };
33
 
34
 struct HRDInfo
35
@@ -151,21 +152,21 @@
36
 
37
 struct VPS
38
 {
39
+    HRDInfo          hrdParameters;
40
+    ProfileTierLevel ptl;
41
     uint32_t         maxTempSubLayers;
42
     uint32_t         numReorderPics;
43
     uint32_t         maxDecPicBuffering;
44
     uint32_t         maxLatencyIncrease;
45
-    HRDInfo          hrdParameters;
46
-    ProfileTierLevel ptl;
47
 };
48
 
49
 struct Window
50
 {
51
-    bool bEnabled;
52
     int  leftOffset;
53
     int  rightOffset;
54
     int  topOffset;
55
     int  bottomOffset;
56
+    bool bEnabled;
57
 
58
     Window()
59
     {
60
@@ -175,40 +176,41 @@
61
 
62
 struct VUI
63
 {
64
-    bool       aspectRatioInfoPresentFlag;
65
     int        aspectRatioIdc;
66
     int        sarWidth;
67
     int        sarHeight;
68
-
69
-    bool       overscanInfoPresentFlag;
70
-    bool       overscanAppropriateFlag;
71
-
72
-    bool       videoSignalTypePresentFlag;
73
     int        videoFormat;
74
-    bool       videoFullRangeFlag;
75
-
76
-    bool       colourDescriptionPresentFlag;
77
     int        colourPrimaries;
78
     int        transferCharacteristics;
79
     int        matrixCoefficients;
80
-
81
-    bool       chromaLocInfoPresentFlag;
82
     int        chromaSampleLocTypeTopField;
83
     int        chromaSampleLocTypeBottomField;
84
 
85
-    Window     defaultDisplayWindow;
86
-
87
+    bool       aspectRatioInfoPresentFlag;
88
+    bool       overscanInfoPresentFlag;
89
+    bool       overscanAppropriateFlag;
90
+    bool       videoSignalTypePresentFlag;
91
+    bool       videoFullRangeFlag;
92
+    bool       colourDescriptionPresentFlag;
93
+    bool       chromaLocInfoPresentFlag;
94
     bool       frameFieldInfoPresentFlag;
95
     bool       fieldSeqFlag;
96
-
97
     bool       hrdParametersPresentFlag;
98
-    HRDInfo    hrdParameters;
99
 
100
+    HRDInfo    hrdParameters;
101
+    Window     defaultDisplayWindow;
102
     TimingInfo timingInfo;
103
 };
104
 
105
 struct SPS
106
 {
107
+    /* cached PicYuv offset arrays, shared by all instances of
108
+     * PicYuv created by this encoder */
109
+    intptr_t* cuOffsetY;
110
+    intptr_t* cuOffsetC;
111
+    intptr_t* buOffsetY;
112
+    intptr_t* buOffsetC;
113
+
114
     int      chromaFormatIdc;        // use param
115
     uint32_t picWidthInLumaSamples;  // use param
116
     uint32_t picHeightInLumaSamples; // use param
117
@@ -228,8 +230,6 @@
118
     uint32_t quadtreeTUMaxDepthInter; // use param
119
     uint32_t quadtreeTUMaxDepthIntra; // use param
120
 
121
-    bool     bUseSAO; // use param
122
-    bool     bUseAMP; // use param
123
     uint32_t maxAMPDepth;
124
 
125
     uint32_t maxTempSubLayers;   // max number of Temporal Sub layers
126
@@ -237,11 +237,26 @@
127
     uint32_t maxLatencyIncrease;
128
     int      numReorderPics;
129
 
130
+    bool     bUseSAO; // use param
131
+    bool     bUseAMP; // use param
132
     bool     bUseStrongIntraSmoothing; // use param
133
     bool     bTemporalMVPEnabled;
134
 
135
     Window   conformanceWindow;
136
     VUI      vuiParameters;
137
+
138
+    SPS()
139
+    {
140
+        memset(this, 0, sizeof(*this));
141
+    }
142
+
143
+    ~SPS()
144
+    {
145
+        X265_FREE(cuOffsetY);
146
+        X265_FREE(cuOffsetC);
147
+        X265_FREE(buOffsetY);
148
+        X265_FREE(buOffsetC);
149
+    }
150
 };
151
 
152
 struct PPS
153
@@ -249,6 +264,8 @@
154
     uint32_t maxCuDQPDepth;
155
 
156
     int      chromaQpOffset[2];      // use param
157
+    int      deblockingFilterBetaOffsetDiv2;
158
+    int      deblockingFilterTcOffsetDiv2;
159
 
160
     bool     bUseWeightPred;         // use param
161
     bool     bUseWeightedBiPred;     // use param
162
@@ -262,17 +279,15 @@
163
 
164
     bool     bDeblockingFilterControlPresent;
165
     bool     bPicDisableDeblockingFilter;
166
-    int      deblockingFilterBetaOffsetDiv2;
167
-    int      deblockingFilterTcOffsetDiv2;
168
 };
169
 
170
 struct WeightParam
171
 {
172
     // Explicit weighted prediction parameters parsed in slice header,
173
-    bool     bPresentFlag;
174
     uint32_t log2WeightDenom;
175
     int      inputWeight;
176
     int      inputOffset;
177
+    bool     bPresentFlag;
178
 
179
     /* makes a non-h265 weight (i.e. fix7), into an h265 weight */
180
     void setFromWeightAndOffset(int w, int o, int denom, bool bNormalize)
181
@@ -304,6 +319,9 @@
182
 
183
     const SPS*  m_sps;
184
     const PPS*  m_pps;
185
+    Frame*      m_refFrameList[2][MAX_NUM_REF + 1];
186
+    PicYuv*     m_refReconPicList[2][MAX_NUM_REF + 1];
187
+
188
     WeightParam m_weightPredTable[2][MAX_NUM_REF][3]; // [list][refIdx][0:Y, 1:U, 2:V]
189
     MotionReference (*m_mref)[MAX_NUM_REF + 1];
190
     RPS         m_rps;
191
@@ -312,34 +330,28 @@
192
     SliceType   m_sliceType;
193
     int         m_sliceQp;
194
     int         m_poc;
195
-    
196
     int         m_lastIDR;
197
 
198
-    bool        m_bCheckLDC;       // TODO: is this necessary?
199
-    bool        m_sLFaseFlag;      // loop filter boundary flag
200
-    bool        m_colFromL0Flag;   // collocated picture from List0 or List1 flag
201
x265_1.8.tar.gz/source/common/threading.h -> x265_1.9.tar.gz/source/common/threading.h Changed
41
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -204,6 +205,15 @@
10
         return ret;
11
     }
12
 
13
+    int getIncr(int n = 1)
14
+    {
15
+        EnterCriticalSection(&m_cs);
16
+        int ret = m_val;
17
+        m_val += n;
18
+        LeaveCriticalSection(&m_cs);
19
+        return ret;
20
+    }
21
+
22
     void set(int newval)
23
     {
24
         EnterCriticalSection(&m_cs);
25
@@ -393,6 +403,15 @@
26
         return ret;
27
     }
28
 
29
+    int getIncr(int n = 1)
30
+    {
31
+        pthread_mutex_lock(&m_mutex);
32
+        int ret = m_val;
33
+        m_val += n;
34
+        pthread_mutex_unlock(&m_mutex);
35
+        return ret;
36
+    }
37
+
38
     void set(int newval)
39
     {
40
         pthread_mutex_lock(&m_mutex);
41
x265_1.8.tar.gz/source/common/threadpool.cpp -> x265_1.9.tar.gz/source/common/threadpool.cpp Changed
201
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -59,6 +60,9 @@
10
 #if HAVE_LIBNUMA
11
 #include <numa.h>
12
 #endif
13
+#if defined(_MSC_VER)
14
+# define strcasecmp _stricmp
15
+#endif
16
 
17
 namespace X265_NS {
18
 // x265 private namespace
19
@@ -226,8 +230,13 @@
20
 {
21
     enum { MAX_NODE_NUM = 127 };
22
     int cpusPerNode[MAX_NODE_NUM + 1];
23
+    int threadsPerPool[MAX_NODE_NUM + 2];
24
+    uint64_t nodeMaskPerPool[MAX_NODE_NUM + 2];
25
 
26
     memset(cpusPerNode, 0, sizeof(cpusPerNode));
27
+    memset(threadsPerPool, 0, sizeof(threadsPerPool));
28
+    memset(nodeMaskPerPool, 0, sizeof(nodeMaskPerPool));
29
+
30
     int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
31
     int cpuCount = getCpuCount();
32
     bool bNumaSupport = false;
33
@@ -258,7 +267,7 @@
34
         for (int i = 0; i < numNumaNodes; i++)
35
             x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]);
36
 
37
-    /* limit nodes based on param->numaPools */
38
+    /* limit threads based on param->numaPools */
39
     if (p->numaPools && *p->numaPools)
40
     {
41
         const char *nodeStr = p->numaPools;
42
@@ -266,19 +275,30 @@
43
         {
44
             if (!*nodeStr)
45
             {
46
-                cpusPerNode[i] = 0;
47
+                threadsPerPool[i] = 0;
48
                 continue;
49
             }
50
             else if (*nodeStr == '-')
51
-                cpusPerNode[i] = 0;
52
-            else if (*nodeStr == '*')
53
+                threadsPerPool[i] = 0;
54
+           else if (*nodeStr == '*' || !strcasecmp(nodeStr, "NULL"))
55
+            {
56
+                for (int j = i; j < numNumaNodes; j++)
57
+                {
58
+                    threadsPerPool[numNumaNodes] += cpusPerNode[j];
59
+                    nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << j);
60
+                }
61
                 break;
62
+            }
63
             else if (*nodeStr == '+')
64
-                ;
65
+            {
66
+                threadsPerPool[numNumaNodes] += cpusPerNode[i];
67
+                nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << i);
68
+            }
69
             else
70
             {
71
                 int count = atoi(nodeStr);
72
-                cpusPerNode[i] = X265_MIN(count, cpusPerNode[i]);
73
+                threadsPerPool[i] = X265_MIN(count, cpusPerNode[i]);
74
+                nodeMaskPerPool[i] = ((uint64_t)1 << i);
75
             }
76
 
77
             /* consume current node string, comma, and white-space */
78
@@ -288,14 +308,31 @@
79
                ++nodeStr;
80
         }
81
     }
82
+    else
83
+    {
84
+        for (int i = 0; i < numNumaNodes; i++)
85
+        {
86
+            threadsPerPool[numNumaNodes]  += cpusPerNode[i];
87
+            nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << i);
88
+        }
89
+    }
90
+ 
91
+    // If the last pool size is > MAX_POOL_THREADS, clip it to spawn thread pools only of size >= 1/2 max (heuristic)
92
+    if ((threadsPerPool[numNumaNodes] > MAX_POOL_THREADS) &&
93
+        ((threadsPerPool[numNumaNodes] % MAX_POOL_THREADS) < (MAX_POOL_THREADS / 2)))
94
+    {
95
+        threadsPerPool[numNumaNodes] -= (threadsPerPool[numNumaNodes] % MAX_POOL_THREADS);
96
+        x265_log(p, X265_LOG_DEBUG,
97
+                 "Creating only %d worker threads beyond specified numbers with --pools (if specified) to prevent asymmetry in pools; may not use all HW contexts\n", threadsPerPool[numNumaNodes]);
98
+    }
99
 
100
     numPools = 0;
101
-    for (int i = 0; i < numNumaNodes; i++)
102
+    for (int i = 0; i < numNumaNodes + 1; i++)
103
     {
104
         if (bNumaSupport)
105
             x265_log(p, X265_LOG_DEBUG, "NUMA node %d may use %d logical cores\n", i, cpusPerNode[i]);
106
-        if (cpusPerNode[i])
107
-            numPools += (cpusPerNode[i] + MAX_POOL_THREADS - 1) / MAX_POOL_THREADS;
108
+        if (threadsPerPool[i])
109
+            numPools += (threadsPerPool[i] + MAX_POOL_THREADS - 1) / MAX_POOL_THREADS;
110
     }
111
 
112
     if (!numPools)
113
@@ -314,20 +351,27 @@
114
         int node = 0;
115
         for (int i = 0; i < numPools; i++)
116
         {
117
-            while (!cpusPerNode[node])
118
+            while (!threadsPerPool[node])
119
                 node++;
120
-            int cores = X265_MIN(MAX_POOL_THREADS, cpusPerNode[node]);
121
-            if (!pools[i].create(cores, maxProviders, node))
122
+            int numThreads = X265_MIN(MAX_POOL_THREADS, threadsPerPool[node]);
123
+            if (!pools[i].create(numThreads, maxProviders, nodeMaskPerPool[node]))
124
             {
125
                 X265_FREE(pools);
126
                 numPools = 0;
127
                 return NULL;
128
             }
129
             if (numNumaNodes > 1)
130
-                x265_log(p, X265_LOG_INFO, "Thread pool %d using %d threads on NUMA node %d\n", i, cores, node);
131
+            {
132
+                char *nodesstr = new char[64 * strlen(",63") + 1];
133
+                int len = 0;
134
+                for (int j = 0; j < 64; j++)
135
+                    if ((nodeMaskPerPool[node] >> j) & 1)
136
+                        len += sprintf(nodesstr + len, ",%d", j);
137
+                x265_log(p, X265_LOG_INFO, "Thread pool %d using %d threads on numa nodes %s\n", i, numThreads, nodesstr + 1);
138
+            }
139
             else
140
-                x265_log(p, X265_LOG_INFO, "Thread pool created using %d threads\n", cores);
141
-            cpusPerNode[node] -= cores;
142
+                x265_log(p, X265_LOG_INFO, "Thread pool created using %d threads\n", numThreads);
143
+            threadsPerPool[node] -= numThreads;
144
         }
145
     }
146
     else
147
@@ -340,11 +384,37 @@
148
     memset(this, 0, sizeof(*this));
149
 }
150
 
151
-bool ThreadPool::create(int numThreads, int maxProviders, int node)
152
+bool ThreadPool::create(int numThreads, int maxProviders, uint64_t nodeMask)
153
 {
154
     X265_CHECK(numThreads <= MAX_POOL_THREADS, "a single thread pool cannot have more than MAX_POOL_THREADS threads\n");
155
 
156
-    m_numaNode = node;
157
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
158
+    m_winCpuMask = 0x0;
159
+    GROUP_AFFINITY groupAffinity;
160
+    for (int i = 0; i < getNumaNodeCount(); i++)
161
+    {
162
+        int numaNode = ((nodeMask >> i) & 0x1U) ? i : -1;
163
+        if (numaNode != -1)
164
+            if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
165
+                m_winCpuMask |= groupAffinity.Mask;
166
+    }
167
+    m_numaMask = &m_winCpuMask;
168
+#elif HAVE_LIBNUMA
169
+    if (numa_available() >= 0)
170
+    {
171
+        struct bitmask* nodemask = numa_allocate_nodemask();
172
+        if (nodemask)
173
+        {
174
+            *(nodemask->maskp) = nodeMask;
175
+            m_numaMask = nodemask;
176
+        }
177
+        else
178
+            x265_log(NULL, X265_LOG_ERROR, "unable to get NUMA node mask for %lx\n", nodeMask);
179
+    }
180
+#else
181
+    (void)nodeMask;
182
+#endif
183
+
184
     m_numWorkers = numThreads;
185
 
186
     m_workers = X265_MALLOC(WorkerThread, numThreads);
187
@@ -398,36 +468,39 @@
188
 
189
     X265_FREE(m_workers);
190
     X265_FREE(m_jpTable);
191
+
192
+#if HAVE_LIBNUMA
193
+    if(m_numaMask)
194
+        numa_free_nodemask((struct bitmask*)m_numaMask);
195
+#endif
196
 }
197
 
198
 void ThreadPool::setCurrentThreadAffinity()
199
 {
200
-    setThreadNodeAffinity(m_numaNode);
201
x265_1.8.tar.gz/source/common/threadpool.h -> x265_1.9.tar.gz/source/common/threadpool.h Changed
31
 
1
@@ -83,7 +83,10 @@
2
     sleepbitmap_t m_sleepBitmap;
3
     int           m_numProviders;
4
     int           m_numWorkers;
5
-    int           m_numaNode;
6
+    void*         m_numaMask; // node mask in linux, cpu mask in windows
7
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
8
+    DWORD_PTR     m_winCpuMask;
9
+#endif
10
     bool          m_isActive;
11
 
12
     JobProvider** m_jpTable;
13
@@ -92,7 +95,7 @@
14
     ThreadPool();
15
     ~ThreadPool();
16
 
17
-    bool create(int numThreads, int maxProviders, int node);
18
+    bool create(int numThreads, int maxProviders, uint64_t nodeMask);
19
     bool start();
20
     void stopWorkers();
21
     void setCurrentThreadAffinity();
22
@@ -103,7 +106,7 @@
23
 
24
     static int  getCpuCount();
25
     static int  getNumaNodeCount();
26
-    static void setThreadNodeAffinity(int node);
27
+    static void setThreadNodeAffinity(void *numaMask);
28
 };
29
 
30
 /* Any worker thread may enlist the help of idle worker threads from the same
31
x265_1.8.tar.gz/source/common/version.cpp -> x265_1.9.tar.gz/source/common/version.cpp Changed
9
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/wavefront.cpp -> x265_1.9.tar.gz/source/common/wavefront.cpp Changed
9
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/wavefront.h -> x265_1.9.tar.gz/source/common/wavefront.h Changed
9
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/x86/asm-primitives.cpp -> x265_1.9.tar.gz/source/common/x86/asm-primitives.cpp Changed
201
 
1
@@ -962,11 +962,8 @@
2
 
3
         p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse2);
4
         p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse2);
5
-
6
-#if X265_DEPTH <= 10
7
         p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse2);
8
         p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_sse2);
9
-#endif /* X265_DEPTH <= 10 */
10
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2);
11
 
12
         p.cu[BLOCK_4x4].intra_pred[2] = PFX(intra_pred_ang4_2_sse2);
13
@@ -1003,13 +1000,12 @@
14
         p.cu[BLOCK_4x4].intra_pred[33] = PFX(intra_pred_ang4_33_sse2);
15
 
16
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_sse2);
17
-#if X265_DEPTH <= 10
18
-        p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
19
-        ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
20
-
21
         p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_4x8_mmx2);
22
         p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_8x16_sse2);
23
         p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_sse2);
24
+#if X265_DEPTH <= 10
25
+        p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
26
+        ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
27
 #endif
28
         p.cu[BLOCK_4x4].dct = PFX(dct4_sse2);
29
         p.cu[BLOCK_8x8].dct = PFX(dct8_sse2);
30
@@ -1031,6 +1027,7 @@
31
         ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
32
         ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
33
         ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
34
+        p.propagateCost = PFX(mbtree_propagate_cost_sse2);
35
     }
36
     if (cpuMask & X265_CPU_SSE3)
37
     {
38
@@ -1144,11 +1141,8 @@
39
 
40
         p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse4);
41
         p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse4);
42
-
43
-#if X265_DEPTH <= 10
44
         p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse4);
45
         p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_sse4);
46
-#endif
47
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
48
         INTRA_ANG_SSE4_COMMON(sse4);
49
         INTRA_ANG_SSE4_HIGH(sse4);
50
@@ -1158,14 +1152,12 @@
51
         p.weight_sp = PFX(weight_sp_sse4);
52
 
53
         p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_sse4);
54
-        p.cu[BLOCK_4x4].psy_cost_ss = PFX(psyCost_ss_4x4_sse4);
55
 
56
         // TODO: check POPCNT flag!
57
         ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
58
 #if X265_DEPTH <= 10
59
         ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
60
 #endif
61
-        ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
62
 
63
         p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s = PFX(filterPixelToShort_2x4_sse4);
64
         p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
65
@@ -1173,6 +1165,7 @@
66
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
67
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s = PFX(filterPixelToShort_2x16_sse4);
68
         p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
69
+        p.costCoeffRemain = PFX(costCoeffRemain_sse4);
70
     }
71
     if (cpuMask & X265_CPU_AVX)
72
     {
73
@@ -1306,6 +1299,7 @@
74
         p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx);
75
         p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
76
         p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
77
+        p.propagateCost = PFX(mbtree_propagate_cost_avx);
78
     }
79
     if (cpuMask & X265_CPU_XOP)
80
     {
81
@@ -1319,6 +1313,9 @@
82
     }
83
     if (cpuMask & X265_CPU_AVX2)
84
     {
85
+#if X265_DEPTH == 12
86
+        ASSIGN_SA8D(avx2);
87
+#endif
88
         p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
89
 
90
         // TODO: the planecopy_sp is really planecopy_SC now, must be fix it
91
@@ -1479,20 +1476,14 @@
92
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_avx2);
93
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_avx2);
94
 
95
-        p.cu[BLOCK_4x4].psy_cost_ss = PFX(psyCost_ss_4x4_avx2);
96
-        p.cu[BLOCK_8x8].psy_cost_ss = PFX(psyCost_ss_8x8_avx2);
97
-        p.cu[BLOCK_16x16].psy_cost_ss = PFX(psyCost_ss_16x16_avx2);
98
-        p.cu[BLOCK_32x32].psy_cost_ss = PFX(psyCost_ss_32x32_avx2);
99
-        p.cu[BLOCK_64x64].psy_cost_ss = PFX(psyCost_ss_64x64_avx2);
100
         p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_avx2);
101
-#if X265_DEPTH <= 10
102
+        p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
103
+        p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2);
104
+
105
         p.cu[BLOCK_8x8].psy_cost_pp = PFX(psyCost_pp_8x8_avx2);
106
         p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx2);
107
         p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx2);
108
         p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx2);
109
-        p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
110
-        p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2);
111
-#endif
112
 
113
         p.cu[BLOCK_16x16].intra_pred[DC_IDX] = PFX(intra_pred_dc16_avx2);
114
         p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2);
115
@@ -1536,20 +1527,13 @@
116
         p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
117
         p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
118
 
119
-#if X265_DEPTH <= 10
120
-        p.cu[BLOCK_16x16].sse_ss = PFX(pixel_ssd_ss_16x16_avx2);
121
-        p.cu[BLOCK_32x32].sse_ss = PFX(pixel_ssd_ss_32x32_avx2);
122
-        p.cu[BLOCK_64x64].sse_ss = PFX(pixel_ssd_ss_64x64_avx2);
123
-
124
-        p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
125
-        p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
126
-        p.cu[BLOCK_64x64].sse_pp = PFX(pixel_ssd_64x64_avx2);
127
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
128
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
129
+        p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_16x16_avx2);
130
+        p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_32x32_avx2);
131
+        p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_64x64_avx2);
132
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_16x16_avx2);
133
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x32_avx2);
134
         p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_avx2);
135
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_avx2);
136
-#endif
137
-
138
         p.quant = PFX(quant_avx2);
139
         p.nquant = PFX(nquant_avx2);
140
         p.dequant_normal  = PFX(dequant_normal_avx2);
141
@@ -1588,21 +1572,16 @@
142
         p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16_avx2);
143
         p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx2);
144
 
145
-#if X265_DEPTH <= 10
146
-        ALL_LUMA_TU_S(dct, dct, avx2);
147
         ALL_LUMA_TU_S(idct, idct, avx2);
148
-#endif
149
+        ALL_LUMA_TU_S(dct, dct, avx2);
150
+
151
         ALL_LUMA_CU_S(transpose, transpose, avx2);
152
 
153
         ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, avx2);
154
         ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, avx2);
155
-#if X265_DEPTH <= 10
156
         ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, avx2);
157
-#endif
158
         ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, avx2);
159
-#if X265_DEPTH <= 10
160
         p.pu[LUMA_4x4].luma_vsp = PFX(interp_8tap_vert_sp_4x4_avx2);               // since ALL_LUMA_PU didn't declare 4x4 size, calling separately luma_vsp function to use 
161
-#endif
162
 
163
         p.cu[BLOCK_16x16].add_ps = PFX(pixel_add_ps_16x16_avx2);
164
         p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
165
@@ -1625,7 +1604,6 @@
166
         p.pu[LUMA_16x12].sad = PFX(pixel_sad_16x12_avx2);
167
         p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx2);
168
         p.pu[LUMA_16x32].sad = PFX(pixel_sad_16x32_avx2);
169
-#if X265_DEPTH <= 10
170
         p.pu[LUMA_16x64].sad = PFX(pixel_sad_16x64_avx2);
171
         p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx2);
172
         p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx2);
173
@@ -1637,7 +1615,6 @@
174
         p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx2);
175
         p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx2);
176
         p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx2);
177
-#endif
178
 
179
         p.pu[LUMA_16x4].sad_x3 = PFX(pixel_sad_x3_16x4_avx2);
180
         p.pu[LUMA_16x8].sad_x3 = PFX(pixel_sad_x3_16x8_avx2);
181
@@ -1712,7 +1689,6 @@
182
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx2);
183
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx2);
184
 
185
-#if X265_DEPTH <= 10
186
         p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_avx2);
187
         p.pu[LUMA_4x8].luma_hps = PFX(interp_8tap_horiz_ps_4x8_avx2);
188
         p.pu[LUMA_4x16].luma_hps = PFX(interp_8tap_horiz_ps_4x16_avx2);
189
@@ -1738,7 +1714,6 @@
190
         p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx2);
191
         p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_avx2);
192
         p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_avx2);
193
-#endif
194
 
195
         p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_avx2);
196
         p.pu[LUMA_4x8].luma_hpp = PFX(interp_8tap_horiz_pp_4x8_avx2);
197
@@ -1766,7 +1741,6 @@
198
         p.pu[LUMA_24x32].luma_hpp = PFX(interp_8tap_horiz_pp_24x32_avx2);
199
         p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx2);
200
 
201
x265_1.8.tar.gz/source/common/x86/blockcopy8.asm -> x265_1.9.tar.gz/source/common/x86/blockcopy8.asm Changed
9
 
1
@@ -3,6 +3,7 @@
2
 ;*
3
 ;* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
4
 ;*          Murugan Vairavel <murugan@multicorewareinc.com>
5
+;*          Min Chen <chenm003@163.com>
6
 ;*
7
 ;* This program is free software; you can redistribute it and/or modify
8
 ;* it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/x86/blockcopy8.h -> x265_1.9.tar.gz/source/common/x86/blockcopy8.h Changed
9
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+;*          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/x86/const-a.asm -> x265_1.9.tar.gz/source/common/x86/const-a.asm Changed
90
 
1
@@ -2,6 +2,7 @@
2
 ;* const-a.asm: x86 global constants
3
 ;*****************************************************************************
4
 ;* Copyright (C) 2010-2013 x264 project
5
+;* Copyright (C) 2013-2015 x265 project
6
 ;*
7
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8
 ;*          Fiona Glaser <fiona@x264.com>
9
@@ -31,10 +32,10 @@
10
 
11
 ;; 8-bit constants
12
 
13
-const pb_0,                 times 16 db 0
14
+const pb_0,                 times 32 db 0
15
 const pb_1,                 times 32 db 1
16
 const pb_2,                 times 32 db 2
17
-const pb_3,                 times 16 db 3
18
+const pb_3,                 times 32 db 3
19
 const pb_4,                 times 32 db 4
20
 const pb_8,                 times 32 db 8
21
 const pb_15,                times 32 db 15
22
@@ -54,6 +55,11 @@
23
 const pb_shuf8x8c,          times  1 db   0,   0,   0,   0,   2,   2,   2,   2,   4,   4,   4,   4,   6,   6,   6,   6
24
 const pb_movemask,          times 16 db 0x00
25
                             times 16 db 0xFF
26
+
27
+const pb_movemask_32,       times 32 db 0x00
28
+                            times 32 db 0xFF
29
+                            times 32 db 0x00
30
+
31
 const pb_0000000000000F0F,  times  2 db 0xff, 0x00
32
                             times 12 db 0x00
33
 const pb_000000000000000F,           db 0xff
34
@@ -61,6 +67,7 @@
35
 
36
 ;; 16-bit constants
37
 
38
+const pw_n1,                times 16 dw -1
39
 const pw_1,                 times 16 dw 1
40
 const pw_2,                 times 16 dw 2
41
 const pw_3,                 times 16 dw 3
42
@@ -86,12 +93,12 @@
43
 const pw_ff00,              times  8 dw 0xff00
44
 const pw_2000,              times 16 dw 0x2000
45
 const pw_8000,              times  8 dw 0x8000
46
-const pw_3fff,              times  8 dw 0x3fff
47
+const pw_3fff,              times 16 dw 0x3fff
48
 const pw_32_0,              times  4 dw 32,
49
                             times  4 dw 0
50
 const pw_pixel_max,         times 16 dw ((1 << BIT_DEPTH)-1)
51
 
52
-const pw_0_15,              times  2 dw   0,   1,   2,   3,   4,   5,   6,   7
53
+const pw_0_7,               times  2 dw   0,   1,   2,   3,   4,   5,   6,   7
54
 const pw_ppppmmmm,          times  1 dw   1,   1,   1,   1,  -1,  -1,  -1,  -1
55
 const pw_ppmmppmm,          times  1 dw   1,   1,  -1,  -1,   1,   1,  -1,  -1
56
 const pw_pmpmpmpm,          times 16 dw   1,  -1,   1,  -1,   1,  -1,   1,  -1
57
@@ -107,6 +114,7 @@
58
                             times  7 dw 0xff
59
 const hmul_16p,             times 16 db   1
60
                             times  8 db   1,  -1
61
+const pw_exp2_0_15,                  dw 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768
62
 
63
 
64
 ;; 32-bit constants
65
@@ -115,8 +123,9 @@
66
 const pd_2,                 times  8 dd 2
67
 const pd_4,                 times  4 dd 4
68
 const pd_8,                 times  4 dd 8
69
+const pd_15,                times  8 dd 15
70
 const pd_16,                times  8 dd 16
71
-const pd_31,                times  4 dd 31
72
+const pd_31,                times  8 dd 31
73
 const pd_32,                times  8 dd 32
74
 const pd_64,                times  4 dd 64
75
 const pd_128,               times  4 dd 128
76
@@ -129,7 +138,12 @@
77
 const pd_524416,            times  4 dd 524416
78
 const pd_n32768,            times  8 dd 0xffff8000
79
 const pd_n131072,           times  4 dd 0xfffe0000
80
-
81
+const pd_0000ffff,          times  8 dd 0x0000FFFF
82
+const pd_planar16_mul0,     times  1 dd  15,  14,  13,  12,  11,  10,   9,   8,    7,   6,   5,   4,   3,   2,   1,   0
83
+const pd_planar16_mul1,     times  1 dd   1,   2,   3,   4,   5,   6,   7,   8,    9,  10,  11,  12,  13,  14,  15,  16
84
+const pd_planar32_mul1,     times  1 dd  31,  30,  29,  28,  27,  26,  25,  24,   23,  22,  21,  20,  19,  18,  17,  16
85
+const pd_planar32_mul2,     times  1 dd  17,  18,  19,  20,  21,  22,  23,  24,   25,  26,  27,  28,  29,  30,  31,  32
86
+const pd_planar16_mul2,     times  1 dd  15,  14,  13,  12,  11,  10,   9,   8,    7,   6,   5,   4,   3,   2,   1,   0
87
 const trans8_shuf,          times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
88
 
89
 const popcnt_table
90
x265_1.8.tar.gz/source/common/x86/cpu-a.asm -> x265_1.9.tar.gz/source/common/x86/cpu-a.asm Changed
9
 
1
@@ -2,6 +2,7 @@
2
 ;* cpu-a.asm: x86 cpu utilities
3
 ;*****************************************************************************
4
 ;* Copyright (C) 2003-2013 x264 project
5
+;* Copyright (C) 2013-2015 x265 project
6
 ;*
7
 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
8
 ;*          Loren Merritt <lorenm@u.washington.edu>
9
x265_1.8.tar.gz/source/common/x86/dct8.asm -> x265_1.9.tar.gz/source/common/x86/dct8.asm Changed
113
 
1
@@ -2115,15 +2115,15 @@
2
     mova     m0, [r0]
3
     pabsw    m1, m0
4
 
5
-    mova     m2, [r1]
6
+    movu     m2, [r1]
7
     pmovsxwd m3, m1
8
     paddd    m2, m3
9
-    mova     [r1], m2
10
-    mova     m2, [r1 + 16]
11
+    movu     [r1], m2
12
+    movu     m2, [r1 + 16]
13
     psrldq   m3, m1, 8
14
     pmovsxwd m4, m3
15
     paddd    m2, m4
16
-    mova     [r1 + 16], m2
17
+    movu     [r1 + 16], m2
18
 
19
     movu     m3, [r2]
20
     psubusw  m1, m3
21
@@ -2174,7 +2174,7 @@
22
     pmaddwd         m0,                 m%4
23
     phaddd          m2,                 m0
24
     paddd           m2,                 m5
25
-    psrad           m2,                 DCT_SHIFT
26
+    psrad           m2,                 DCT8_SHIFT1
27
     packssdw        m2,                 m2
28
     vpermq          m2,                 m2, 0x08
29
     mova            [r5 + %2],          xm2
30
@@ -2190,7 +2190,7 @@
31
     phaddd          m8,                 m9
32
     phaddd          m6,                 m8
33
     paddd           m6,                 m5
34
-    psrad           m6,                 DCT_SHIFT2
35
+    psrad           m6,                 DCT8_SHIFT2
36
 
37
     vbroadcasti128  m4,                 [r6 + %2]
38
     pmaddwd         m10,                m0, m4
39
@@ -2201,7 +2201,7 @@
40
     phaddd          m8,                 m9
41
     phaddd          m10,                m8
42
     paddd           m10,                m5
43
-    psrad           m10,                DCT_SHIFT2
44
+    psrad           m10,                DCT8_SHIFT2
45
 
46
     packssdw        m6,                 m10
47
     vpermq          m10,                m6, 0xD8
48
@@ -2210,18 +2210,7 @@
49
 
50
 INIT_YMM avx2
51
 cglobal dct8, 3, 7, 11, 0-8*16
52
-%if BIT_DEPTH == 12
53
-    %define         DCT_SHIFT          6
54
-    vbroadcasti128  m5,                [pd_16]
55
-%elif BIT_DEPTH == 10
56
-    %define         DCT_SHIFT          4
57
-    vbroadcasti128  m5,                [pd_8]
58
-%elif BIT_DEPTH == 8
59
-    %define         DCT_SHIFT          2
60
-    vbroadcasti128  m5,                [pd_2]
61
-%else
62
-    %error Unsupported BIT_DEPTH!
63
-%endif
64
+vbroadcasti128      m5,                [pd_ %+ DCT8_ROUND1]
65
 %define             DCT_SHIFT2         9
66
 
67
     add             r2d,               r2d
68
@@ -2265,7 +2254,7 @@
69
     DCT8_PASS_1     7 * 16,             7 * 16, 4, 1
70
 
71
     ;pass2
72
-    vbroadcasti128  m5,                [pd_256]
73
+    vbroadcasti128  m5,                [pd_ %+ DCT8_ROUND2]
74
 
75
     mova            m0,                [r5]
76
     mova            m1,                [r5 + 32]
77
@@ -2904,7 +2893,7 @@
78
 cglobal idct8, 3, 7, 13, 0-8*16
79
 %if BIT_DEPTH == 12
80
     %define         IDCT_SHIFT2        8
81
-    vpbroadcastd    m12,                [pd_256]
82
+    vpbroadcastd    m12,                [pd_128]
83
 %elif BIT_DEPTH == 10
84
     %define         IDCT_SHIFT2        10
85
     vpbroadcastd    m12,                [pd_512]
86
@@ -3065,7 +3054,7 @@
87
 cglobal idct16, 3, 7, 16, 0-16*mmsize
88
 %if BIT_DEPTH == 12
89
     %define         IDCT_SHIFT2        8
90
-    vpbroadcastd    m15,                [pd_256]
91
+    vpbroadcastd    m15,                [pd_128]
92
 %elif BIT_DEPTH == 10
93
     %define         IDCT_SHIFT2        10
94
     vpbroadcastd    m15,                [pd_512]
95
@@ -3487,7 +3476,7 @@
96
 
97
 %if BIT_DEPTH == 12
98
     %define         IDCT_SHIFT2        8
99
-    vpbroadcastd    m15,                [pd_256]
100
+    vpbroadcastd    m15,                [pd_128]
101
 %elif BIT_DEPTH == 10
102
     %define         IDCT_SHIFT2        10
103
     vpbroadcastd    m15,                [pd_512]
104
@@ -3651,7 +3640,7 @@
105
 %define             IDCT_SHIFT1         7
106
 %if BIT_DEPTH == 12
107
     %define         IDCT_SHIFT2        8
108
-    vpbroadcastd    m5,                [pd_256]
109
+    vpbroadcastd    m5,                [pd_128]
110
 %elif BIT_DEPTH == 10
111
     %define         IDCT_SHIFT2        10
112
     vpbroadcastd    m5,                [pd_512]
113
x265_1.8.tar.gz/source/common/x86/dct8.h -> x265_1.9.tar.gz/source/common/x86/dct8.h Changed
9
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Nabajit Deka <nabajit@multicorewareinc.com>
5
+;*          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/x86/intrapred16.asm -> x265_1.9.tar.gz/source/common/x86/intrapred16.asm Changed
201
 
1
@@ -109,9 +109,11 @@
2
 cextern pw_16
3
 cextern pw_31
4
 cextern pw_32
5
+cextern pd_15
6
 cextern pd_16
7
 cextern pd_31
8
 cextern pd_32
9
+cextern pd_0000ffff
10
 cextern pw_4096
11
 cextern pw_pixel_max
12
 cextern multiL
13
@@ -123,7 +125,12 @@
14
 cextern pb_unpackwq1
15
 cextern pb_unpackwq2
16
 cextern pw_planar16_mul
17
+cextern pd_planar16_mul0
18
+cextern pd_planar16_mul1
19
 cextern pw_planar32_mul
20
+cextern pd_planar32_mul1
21
+cextern pd_planar32_mul2
22
+cextern pd_planar16_mul2
23
 
24
 ;-----------------------------------------------------------------------------------
25
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
26
@@ -731,6 +738,117 @@
27
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
28
 ;---------------------------------------------------------------------------------------
29
 INIT_XMM sse2
30
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
31
+cglobal intra_pred_planar16, 3,5,13
32
+    add             r1d, r1d
33
+    pxor            m12, m12
34
+
35
+    movu            m2, [r2 + 2]
36
+    movu            m10, [r2 + 18]
37
+
38
+    punpckhwd       m7, m2, m12
39
+    punpcklwd       m2, m12
40
+    punpckhwd       m0, m10, m12
41
+    punpcklwd       m10, m12
42
+
43
+    movzx           r3d, word [r2 + 34]                     ; topRight   = above[16]
44
+    lea             r4, [pd_planar16_mul1]
45
+
46
+    movd            m3, r3d
47
+    pshufd          m3, m3, 0                               ; topRight
48
+
49
+    pmaddwd         m8, m3, [r4 + 3*mmsize]                 ; (x + 1) * topRight
50
+    pmaddwd         m4, m3, [r4 + 2*mmsize]                 ; (x + 1) * topRight
51
+    pmaddwd         m9, m3, [r4 + 1*mmsize]                 ; (x + 1) * topRight
52
+    pmaddwd         m3, m3, [r4 + 0*mmsize]                 ; (x + 1) * topRight
53
+
54
+    mova            m11, [pd_15]
55
+    pmaddwd         m1, m2,  m11                            ; (blkSize - 1 - y) * above[x]
56
+    pmaddwd         m6, m7,  m11                            ; (blkSize - 1 - y) * above[x]
57
+    pmaddwd         m5, m10, m11                            ; (blkSize - 1 - y) * above[x]
58
+    pmaddwd         m11, m0                                 ; (blkSize - 1 - y) * above[x]
59
+
60
+    paddd           m4, m5
61
+    paddd           m3, m1
62
+    paddd           m8, m11
63
+    paddd           m9, m6
64
+
65
+    mova            m5, [pd_16]
66
+    paddd           m3, m5
67
+    paddd           m9, m5
68
+    paddd           m4, m5
69
+    paddd           m8, m5
70
+
71
+    movzx           r4d, word [r2 + 98]                     ; bottomLeft = left[16]
72
+    movd            m6, r4d
73
+    pshufd          m6, m6, 0                               ; bottomLeft
74
+
75
+    paddd           m4, m6
76
+    paddd           m3, m6
77
+    paddd           m8, m6
78
+    paddd           m9, m6
79
+
80
+    psubd           m1, m6, m0                              ; column 12-15
81
+    psubd           m11, m6, m10                            ; column 8-11
82
+    psubd           m10, m6, m7                             ; column 4-7
83
+    psubd           m6, m2                                  ; column 0-3
84
+
85
+    add             r2, 66
86
+    lea             r4, [pd_planar16_mul0]
87
+
88
+%macro INTRA_PRED_PLANAR16_sse2 1
89
+    movzx           r3d, word [r2 + %1*2]
90
+    movd            m5, r3d
91
+    pshufd          m5, m5, 0
92
+
93
+    pmaddwd         m0, m5, [r4 + 3*mmsize]                 ; column 12-15
94
+    pmaddwd         m2, m5, [r4 + 2*mmsize]                 ; column 8-11
95
+    pmaddwd         m7, m5, [r4 + 1*mmsize]                 ; column 4-7
96
+    pmaddwd         m5, m5, [r4 + 0*mmsize]                 ; column 0-3
97
+
98
+    paddd           m0, m8
99
+    paddd           m2, m4
100
+    paddd           m7, m9
101
+    paddd           m5, m3
102
+
103
+    paddd           m8, m1
104
+    paddd           m4, m11
105
+    paddd           m9, m10
106
+    paddd           m3, m6
107
+
108
+    psrad           m0, 5
109
+    psrad           m2, 5
110
+    psrad           m7, 5
111
+    psrad           m5, 5
112
+
113
+    packssdw        m2, m0
114
+    packssdw        m5, m7
115
+    movu            [r0], m5
116
+    movu            [r0 + mmsize], m2
117
+
118
+    add             r0, r1
119
+%endmacro
120
+
121
+    INTRA_PRED_PLANAR16_sse2 0
122
+    INTRA_PRED_PLANAR16_sse2 1
123
+    INTRA_PRED_PLANAR16_sse2 2
124
+    INTRA_PRED_PLANAR16_sse2 3
125
+    INTRA_PRED_PLANAR16_sse2 4
126
+    INTRA_PRED_PLANAR16_sse2 5
127
+    INTRA_PRED_PLANAR16_sse2 6
128
+    INTRA_PRED_PLANAR16_sse2 7
129
+    INTRA_PRED_PLANAR16_sse2 8
130
+    INTRA_PRED_PLANAR16_sse2 9
131
+    INTRA_PRED_PLANAR16_sse2 10
132
+    INTRA_PRED_PLANAR16_sse2 11
133
+    INTRA_PRED_PLANAR16_sse2 12
134
+    INTRA_PRED_PLANAR16_sse2 13
135
+    INTRA_PRED_PLANAR16_sse2 14
136
+    INTRA_PRED_PLANAR16_sse2 15
137
+    RET
138
+
139
+%else
140
+; code for BIT_DEPTH == 10
141
 cglobal intra_pred_planar16, 3,3,8
142
     movu            m2, [r2 + 2]
143
     movu            m7, [r2 + 18]
144
@@ -809,7 +927,180 @@
145
     INTRA_PRED_PLANAR_16 14
146
     INTRA_PRED_PLANAR_16 15
147
     RET
148
+%endif
149
+
150
+;---------------------------------------------------------------------------------------
151
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
152
+;---------------------------------------------------------------------------------------
153
+INIT_XMM sse2
154
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
155
+cglobal intra_pred_planar32, 3,7,16
156
+    ; NOTE: align stack to 64 bytes, so all of local data in same cache line
157
+    mov             r6, rsp
158
+    sub             rsp, 4*mmsize
159
+    and             rsp, ~63
160
+    %define         m16 [rsp + 0 * mmsize]
161
+    %define         m17 [rsp + 1 * mmsize]
162
+    %define         m18 [rsp + 2 * mmsize]
163
+    %define         m19 [rsp + 3 * mmsize]
164
+
165
+    add             r1, r1
166
+    pxor            m12, m12
167
+
168
+    movzx           r3d, word [r2 + 66]
169
+    lea             r4, [planar32_table1]
170
+
171
+    movd            m0, r3d
172
+    pshufd          m0, m0, 0
173
+
174
+    pmaddwd         m8, m0, [r4 + 0]
175
+    pmaddwd         m9, m0, [r4 + 16]
176
+    pmaddwd         m10, m0, [r4 + 32]
177
+    pmaddwd         m11, m0, [r4 + 48]
178
+    pmaddwd         m7, m0, [r4 + 64]
179
+    pmaddwd         m13, m0, [r4 + 80]
180
+    pmaddwd         m14, m0, [r4 + 96]
181
+    pmaddwd         m15, m0, [r4 + 112]
182
+
183
+    movzx           r3d, word [r2 + 194]
184
+    movd            m0, r3d
185
+    pshufd          m0, m0, 0
186
+
187
+    paddd           m8, m0
188
+    paddd           m9, m0
189
+    paddd           m10, m0
190
+    paddd           m11, m0
191
+    paddd           m7, m0
192
+    paddd           m13, m0
193
+    paddd           m14, m0
194
+    paddd           m15, m0
195
+
196
+    paddd           m8, [pd_32]
197
+    paddd           m9, [pd_32]
198
+    paddd           m10, [pd_32]
199
+    paddd           m11, [pd_32]
200
+    paddd           m7, [pd_32]
201
x265_1.8.tar.gz/source/common/x86/intrapred8.asm -> x265_1.9.tar.gz/source/common/x86/intrapred8.asm Changed
201
 
1
@@ -27,7 +27,9 @@
2
 
3
 SECTION_RODATA 32
4
 
5
-intra_pred_shuff_0_8:    times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
6
+const intra_pred_shuff_0_8,     times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
7
+                                        db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
8
+
9
 intra_pred_shuff_15_0:   times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
10
 
11
 intra_filter4_shuf0:  times 2 db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
12
@@ -54,13 +56,13 @@
13
 c_shuf8_0:            db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
14
 c_deinterval8:        db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
15
 pb_unpackbq:          db  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1
16
-c_mode16_12:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
17
-c_mode16_13:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
18
-c_mode16_14:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
19
+c_mode16_12:          db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
20
+c_mode16_13:          db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
21
+c_mode16_14:          db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
22
 c_mode16_15:          db  0,  0,  0,  0,  0,  0,  0,  0, 15, 13, 11,  9,  8,  6,  4,  2
23
 c_mode16_16:          db  8,  6,  5,  3,  2,  0, 15, 14, 12, 11,  9,  8,  6,  5,  3,  2
24
 c_mode16_17:          db  4,  2,  1,  0, 15, 14, 12, 11, 10,  9,  7,  6,  5,  4,  2,  1
25
-c_mode16_18:    db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
26
+c_mode16_18:          db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
27
 
28
 ALIGN 32
29
 c_ang8_src1_9_2_10:   db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
30
@@ -259,235 +261,6 @@
31
                      db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
32
                      db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
33
 
34
-
35
-ALIGN 32
36
-c_ang32_mode_27:    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
37
-                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
38
-                    db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
39
-                    db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
40
-                    db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
41
-                    db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
42
-                    db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
43
-                    db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
44
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
45
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
46
-                    db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
47
-                    db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
48
-                    db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
49
-                    db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
50
-                    db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
51
-                    db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
52
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
53
-
54
-
55
-ALIGN 32
56
-c_ang32_mode_28:    db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
57
-                    db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
58
-                    db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
59
-                    db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
60
-                    db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
61
-                    db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
62
-                    db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
63
-                    db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
64
-                    db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
65
-                    db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
66
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9
67
-                    db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
68
-                    db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
69
-                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7
70
-                    db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
71
-                    db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
72
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
73
-
74
-ALIGN 32
75
-c_ang32_mode_29:    db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
76
-                    db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
77
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
78
-                    db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
79
-                    db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
80
-                    db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
81
-                    db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
82
-                    db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
83
-                    db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
84
-                    db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
85
-                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
86
-                    db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
87
-                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
88
-                    db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
89
-                    db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
90
-                    db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
91
-                    db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
92
-                    db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
93
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
94
-
95
-
96
-ALIGN 32
97
-c_ang32_mode_30:    db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
98
-                    db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
99
-                    db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
100
-                    db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
101
-                    db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
102
-                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
103
-                    db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
104
-                    db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
105
-                    db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
106
-                    db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29,  3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
107
-                    db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
108
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
109
-                    db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
110
-                    db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
111
-                    db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
112
-                    db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
113
-                    db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
114
-                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
115
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
116
-
117
-
118
-ALIGN 32
119
-c_ang32_mode_31:    db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
120
-                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
121
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
122
-                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
123
-                    db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
124
-                    db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
125
-                    db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
126
-                    db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
127
-                    db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
128
-                    db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
129
-                    db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
130
-                    db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
131
-                    db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
132
-                    db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
133
-                    db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
134
-                    db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
135
-                    db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
136
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
137
-
138
-
139
-ALIGN 32
140
-c_ang32_mode_32:   db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
141
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
142
-                   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
143
-                   db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
144
-                   db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
145
-                   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
146
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
147
-                   db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
148
-                   db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
149
-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
150
-                   db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
151
-                   db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
152
-                   db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
153
-                   db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
154
-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
155
-                   db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
156
-                   db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
157
-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
158
-                   db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
159
-                   db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
160
-                   db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
161
-                   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
162
-
163
-ALIGN 32
164
-c_ang32_mode_25:   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
165
-                   db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
166
-                   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
167
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
168
-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
169
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
170
-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
171
-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
172
-                   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
173
-                   db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
174
-                   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
175
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
176
-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
177
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
178
-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
179
-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
180
-
181
-ALIGN 32
182
-c_ang32_mode_24:   db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
183
-                   db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
184
-                   db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
185
-                   db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
186
-                   db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
187
-                   db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
188
-                   db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
189
-                   db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
190
-                   db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
191
-                   db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
192
-                   db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
193
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
194
-                   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3
195
-                   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
196
-                   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
197
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
198
-                   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
199
-
200
-
201
x265_1.8.tar.gz/source/common/x86/intrapred8_allangs.asm -> x265_1.9.tar.gz/source/common/x86/intrapred8_allangs.asm Changed
201
 
1
@@ -27,62 +27,63 @@
2
 
3
 SECTION_RODATA 32
4
 
5
-all_ang4_shuff: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
6
-                db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7
7
-                db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6
8
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
9
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5
10
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4
11
-                db 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
12
-                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12
13
-                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 4, 0, 0, 9, 9, 10, 10, 11
14
-                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11
15
-                db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 4, 2, 2, 0, 0, 9, 9, 10
16
-                db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 3, 2, 2, 0, 0, 9, 9, 10
17
-                db 0, 9, 9, 10, 10, 11, 11, 12, 1, 0, 0, 9, 9, 10, 10, 11, 2, 1, 1, 0, 0, 9, 9, 10, 4, 2, 2, 1, 1, 0, 0, 9
18
-                db 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0, 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0
19
-                db 0, 1, 1, 2, 2, 3, 3, 4, 9, 0, 0, 1, 1, 2, 2, 3, 10, 9, 9, 0, 0, 1, 1, 2, 12, 10, 10, 9, 9, 0, 0, 1
20
-                db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 11, 10, 10, 0, 0, 1, 1, 2
21
-                db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 12, 10, 10, 0, 0, 1, 1, 2
22
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3
23
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 12, 0, 0, 1, 1, 2, 2, 3
24
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4
25
-                db 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4
26
-                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
27
-                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6
28
-                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6
29
-                db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7
30
-                db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7, 4, 5, 5, 6, 6, 7, 7, 8
31
-                db 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8
32
-
33
-all_ang4: db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8
34
-          db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20
35
-          db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
36
-          db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
37
-          db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
38
-          db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
39
-          db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
40
-          db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
41
-          db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
42
-          db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
43
-          db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
44
-          db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
45
-          db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
46
-          db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
47
-          db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
48
-          db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
49
-          db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
50
-          db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
51
-          db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
52
-          db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
53
-          db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
54
-          db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
55
-          db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
56
-          db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
57
-          db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
58
-          db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
59
-          db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20
60
-          db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8
61
+const allAng4_shuf_mode2,       db  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7
62
+const allAng4_shuf_mode3_4,     db  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5,  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5
63
+                                db  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7,  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6
64
+const allAng4_shuf_mode5_6,     db  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
65
+                                db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5
66
+const allAng4_shuf_mode7_8,     db  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
67
+                                db  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
68
+const allAng4_shuf_mode10,      db  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3
69
+const allAng4_shuf_mode11_12,   db  0,  9,  9, 10, 10, 11, 11, 12,  0,  9,  9, 10, 10, 11, 11, 12,  0,  9,  9, 10, 10, 11, 11, 12,  0,  9,  9, 10, 10, 11, 11, 12
70
+const allAng4_shuf_mode13_14,   db  0,  9,  9, 10, 10, 11, 11, 12,  4,  0,  0,  9,  9, 10, 10, 11,  2,  0,  0,  9,  9, 10, 10, 11,  2,  0,  0,  9,  9, 10, 10, 11
71
+const allAng4_shuf_mode15_16,   db  0,  9,  9, 10, 10, 11, 11, 12,  2,  0,  0,  9,  9, 10, 10, 11,  0,  9,  9, 10, 10, 11, 11, 12,  2,  0,  0,  9,  9, 10, 10, 11
72
+                                db  2,  0,  0,  9,  9, 10, 10, 11,  4,  2,  2,  0,  0,  9,  9, 10,  2,  0,  0,  9,  9, 10, 10, 11,  3,  2,  2,  0,  0,  9,  9, 10
73
+const allAng4_shuf_mode17,      db  0,  9,  9, 10, 10, 11, 11, 12,  1,  0,  0,  9,  9, 10, 10, 11,  2,  1,  1,  0,  0,  9,  9, 10,  4,  2,  2,  1,  1,  0,  0,  9
74
+                                db  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0,  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0
75
+const allAng4_shuf_mode18,      db  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0,  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0
76
+const allAng4_shuf_mode19_20,   db  0,  1,  1,  2,  2,  3,  3,  4,  9,  0,  0,  1,  1,  2,  2,  3,  0,  1,  1,  2,  2,  3,  3,  4, 10,  0,  0,  1,  1,  2,  2,  3
77
+                                db 10,  9,  9,  0,  0,  1,  1,  2, 12, 10, 10,  9,  9,  0,  0,  1, 10,  0,  0,  1,  1,  2,  2,  3, 11, 10, 10,  0,  0,  1,  1,  2
78
+const allAng4_shuf_mode21_22,   db  0,  1,  1,  2,  2,  3,  3,  4, 10,  0,  0,  1,  1,  2,  2,  3,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
79
+                                db 10,  0,  0,  1,  1,  2,  2,  3, 12, 10, 10,  0,  0,  1,  1,  2, 10,  0,  0,  1,  1,  2,  2,  3, 10,  0,  0,  1,  1,  2,  2,  3
80
+const allAng4_shuf_mode23_24,   db  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
81
+                                db  0,  1,  1,  2,  2,  3,  3,  4, 12,  0,  0,  1,  1,  2,  2,  3,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
82
+const allAng4_shuf_mode26,      db  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4
83
+const allAng4_shuf_mode27_28,   db  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5
84
+const allAng4_shuf_mode29_30,   db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  2,  3,  3,  4,  4,  5,  5,  6,  2,  3,  3,  4,  4,  5,  5,  6
85
+const allAng4_shuf_mode31_32,   db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6
86
+                                db  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7,  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7
87
+const allAng4_shuf_mode33,      db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7,  4,  5,  5,  6,  6,  7,  7,  8
88
+const allAng4_shuf_mode34,      db  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7,  5,  6,  7,  8,  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7,  5,  6,  7,  8
89
+
90
+const allAng4_fact_mode3_4,     db  6, 26,  6, 26,  6, 26,  6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10
91
+                                db 18, 14, 18, 14, 18, 14, 18, 14, 24,  8, 24,  8, 24,  8, 24,  8,  1, 31,  1, 31,  1, 31,  1, 31, 12, 20, 12, 20, 12, 20, 12, 20
92
+const allAng4_fact_mode5_6,     db 15, 17, 15, 17, 15, 17, 15, 17, 30,  2, 30,  2, 30,  2, 30,  2, 19, 13, 19, 13, 19, 13, 19, 13,  6, 26,  6, 26,  6, 26,  6, 26
93
+                                db 13, 19, 13, 19, 13, 19, 13, 19, 28,  4, 28,  4, 28,  4, 28,  4, 25,  7, 25,  7, 25,  7, 25,  7, 12, 20, 12, 20, 12, 20, 12, 20
94
+const allAng4_fact_mode7_8,     db 23,  9, 23,  9, 23,  9, 23,  9, 14, 18, 14, 18, 14, 18, 14, 18, 27,  5, 27,  5, 27,  5, 27,  5, 22, 10, 22, 10, 22, 10, 22, 10
95
+                                db  5, 27,  5, 27,  5, 27,  5, 27, 28,  4, 28,  4, 28,  4, 28,  4, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
96
+const allAng4_fact_mode9,       db 30,  2, 30,  2, 30,  2, 30,  2, 28,  4, 28,  4, 28,  4, 28,  4, 26,  6, 26,  6, 26,  6, 26,  6, 24,  8, 24,  8, 24,  8, 24,  8
97
+const allAng4_fact_mode11_12,   db  2, 30,  2, 30,  2, 30,  2, 30,  4, 28,  4, 28,  4, 28,  4, 28,  5, 27,  5, 27,  5, 27,  5, 27, 10, 22, 10, 22, 10, 22, 10, 22
98
+                                db  6, 26,  6, 26,  6, 26,  6, 26,  8, 24,  8, 24,  8, 24,  8, 24, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
99
+const allAng4_fact_mode13_14,   db  9, 23,  9, 23,  9, 23,  9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 13, 19, 13, 19, 13, 19, 13, 19, 26,  6, 26,  6, 26,  6, 26,  6
100
+                                db 27,  5, 27,  5, 27,  5, 27,  5,  4, 28,  4, 28,  4, 28,  4, 28,  7, 25,  7, 25,  7, 25,  7, 25, 20, 12, 20, 12, 20, 12, 20, 12
101
+const allAng4_fact_mode15_16,   db 17, 15, 17, 15, 17, 15, 17, 15,  2, 30,  2, 30,  2, 30,  2, 30, 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22
102
+                                db 19, 13, 19, 13, 19, 13, 19, 13,  4, 28,  4, 28,  4, 28,  4, 28, 31,  1, 31,  1, 31,  1, 31,  1, 20, 12, 20, 12, 20, 12, 20, 12
103
+const allAng4_fact_mode17,      db 26,  6, 26,  6, 26,  6, 26,  6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18,  8, 24,  8, 24,  8, 24,  8, 24
104
+const allAng4_fact_mode19_20,   db 26,  6, 26,  6, 26,  6, 26,  6, 20, 12, 20, 12, 20, 12, 20, 12, 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22
105
+                                db 14, 18, 14, 18, 14, 18, 14, 18,  8, 24,  8, 24,  8, 24,  8, 24, 31,  1, 31,  1, 31,  1, 31,  1, 20, 12, 20, 12, 20, 12, 20, 12
106
+const allAng4_fact_mode21_22,   db 17, 15, 17, 15, 17, 15, 17, 15,  2, 30,  2, 30,  2, 30,  2, 30, 13, 19, 13, 19, 13, 19, 13, 19, 26,  6, 26,  6, 26,  6, 26,  6
107
+                                db 19, 13, 19, 13, 19, 13, 19, 13,  4, 28,  4, 28,  4, 28,  4, 28,  7, 25,  7, 25,  7, 25,  7, 25, 20, 12, 20, 12, 20, 12, 20, 12
108
+const allAng4_fact_mode23_24,   db  9, 23,  9, 23,  9, 23,  9, 23, 18, 14, 18, 14, 18, 14, 18, 14,  5, 27,  5, 27,  5, 27,  5, 27, 10, 22, 10, 22, 10, 22, 10, 22
109
+                                db 27,  5, 27,  5, 27,  5, 27,  5,  4, 28,  4, 28,  4, 28,  4, 28, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
110
+const allAng4_fact_mode25,      db  2, 30,  2, 30,  2, 30,  2, 30,  4, 28,  4, 28,  4, 28,  4, 28,  6, 26,  6, 26,  6, 26,  6, 26,  8, 24,  8, 24,  8, 24,  8, 24
111
+const allAng4_fact_mode27_28,   db 30,  2, 30,  2, 30,  2, 30,  2, 28,  4, 28,  4, 28,  4, 28,  4, 27,  5, 27,  5, 27,  5, 27,  5, 22, 10, 22, 10, 22, 10, 22, 10
112
+                                db 26,  6, 26,  6, 26,  6, 26,  6, 24,  8, 24,  8, 24,  8, 24,  8, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
113
+const allAng4_fact_mode29_30,   db 23,  9, 23,  9, 23,  9, 23,  9, 14, 18, 14, 18, 14, 18, 14, 18, 19, 13, 19, 13, 19, 13, 19, 13,  6, 26,  6, 26,  6, 26,  6, 26
114
+                                db  5, 27,  5, 27,  5, 27,  5, 27, 28,  4, 28,  4, 28,  4, 28,  4, 25,  7, 25,  7, 25,  7, 25,  7, 12, 20, 12, 20, 12, 20, 12, 20
115
+const allAng4_fact_mode31_32,   db 15, 17, 15, 17, 15, 17, 15, 17, 30,  2, 30,  2, 30,  2, 30,  2, 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10
116
+                                db 13, 19, 13, 19, 13, 19, 13, 19, 28,  4, 28,  4, 28,  4, 28,  4,  1, 31,  1, 31,  1, 31,  1, 31, 12, 20, 12, 20, 12, 20, 12, 20
117
+const allAng4_fact_mode33,      db  6, 26,  6, 26,  6, 26,  6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24,  8, 24,  8, 24,  8, 24,  8
118
 
119
 
120
 SECTION .text
121
@@ -23075,80 +23076,69 @@
122
 ; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
123
 ;-----------------------------------------------------------------------------
124
 INIT_YMM avx2
125
-cglobal all_angs_pred_4x4, 4, 4, 6
126
+cglobal all_angs_pred_4x4, 2, 2, 6
127
 
128
     mova           m5, [pw_1024]
129
-    lea            r2, [all_ang4]
130
-    lea            r3, [all_ang4_shuff]
131
 
132
 ; mode 2
133
 
134
     vbroadcasti128 m0, [r1 + 9]
135
-    mova           xm1, xm0
136
-    psrldq         xm1, 1
137
-    pshufb         xm1, [r3]
138
+    pshufb         m1, m0, [allAng4_shuf_mode2]
139
     movu           [r0], xm1
140
 
141
 ; mode 3
142
 
143
-    pshufb         m1, m0, [r3 + 1 * mmsize]
144
-    pmaddubsw      m1, [r2]
145
+    pshufb         m1, m0, [allAng4_shuf_mode3_4]
146
+    pmaddubsw      m1, [allAng4_fact_mode3_4]
147
     pmulhrsw       m1, m5
148
 
149
 ; mode 4
150
 
151
-    pshufb         m2, m0, [r3 + 2 * mmsize]
152
-    pmaddubsw      m2, [r2 + 1 * mmsize]
153
+    pshufb         m2, m0, [allAng4_shuf_mode3_4 + mmsize]
154
+    pmaddubsw      m2, [allAng4_fact_mode3_4 + mmsize]
155
     pmulhrsw       m2, m5
156
     packuswb       m1, m2
157
-    vpermq         m1, m1, 11011000b
158
     movu           [r0 + (3 - 2) * 16], m1
159
 
160
 ; mode 5
161
 
162
-    pshufb         m1, m0, [r3 + 2 * mmsize]
163
-    pmaddubsw      m1, [r2 + 2 * mmsize]
164
+    pshufb         m1, m0, [allAng4_shuf_mode5_6]
165
+    pmaddubsw      m1, [allAng4_fact_mode5_6]
166
     pmulhrsw       m1, m5
167
 
168
 ; mode 6
169
 
170
-    pshufb         m2, m0, [r3 + 3 * mmsize]
171
-    pmaddubsw      m2, [r2 + 3 * mmsize]
172
+    pshufb         m2, m0, [allAng4_shuf_mode5_6 + mmsize]
173
+    pmaddubsw      m2, [allAng4_fact_mode5_6 + mmsize]
174
     pmulhrsw       m2, m5
175
     packuswb       m1, m2
176
-    vpermq         m1, m1, 11011000b
177
     movu           [r0 + (5 - 2) * 16], m1
178
 
179
-    add            r3, 4 * mmsize
180
-    add            r2, 4 * mmsize
181
-
182
 ; mode 7
183
 
184
-    pshufb         m1, m0, [r3 + 0 * mmsize]
185
-    pmaddubsw      m1, [r2 + 0 * mmsize]
186
+    pshufb         m3, m0, [allAng4_shuf_mode7_8]
187
+    pmaddubsw      m1, m3, [allAng4_fact_mode7_8]
188
     pmulhrsw       m1, m5
189
 
190
 ; mode 8
191
 
192
-    pshufb         m2, m0, [r3 + 1 * mmsize]
193
-    pmaddubsw      m2, [r2 + 1 * mmsize]
194
+    pshufb         m2, m0, [allAng4_shuf_mode7_8 + mmsize]
195
+    pmaddubsw      m2, [allAng4_fact_mode7_8 + mmsize]
196
     pmulhrsw       m2, m5
197
     packuswb       m1, m2
198
-    vpermq         m1, m1, 11011000b
199
     movu           [r0 + (7 - 2) * 16], m1
200
 
201
x265_1.8.tar.gz/source/common/x86/ipfilter16.asm -> x265_1.9.tar.gz/source/common/x86/ipfilter16.asm Changed
201
 
1
@@ -4869,7 +4869,7 @@
2
 %ifidn %2,pp
3
     vbroadcasti128  m8, [INTERP_OFFSET_PP]
4
 %elifidn %2, sp
5
-    mova            m8, [INTERP_OFFSET_SP]
6
+    vbroadcasti128  m8, [INTERP_OFFSET_SP]
7
 %else
8
     vbroadcasti128  m8, [INTERP_OFFSET_PS]
9
 %endif
10
@@ -5011,11 +5011,11 @@
11
     mov       r4d, %1/2
12
 
13
 %ifidn %2, pp
14
-    mova      m7, [INTERP_OFFSET_PP]
15
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
16
 %elifidn %2, sp
17
-    mova      m7, [INTERP_OFFSET_SP]
18
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
19
 %elifidn %2, ps
20
-    mova      m7, [INTERP_OFFSET_PS]
21
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
22
 %endif
23
 
24
 .loopH:
25
@@ -5183,11 +5183,11 @@
26
     mov       r4d, %1/2
27
 
28
 %ifidn %2, pp
29
-    mova      m7, [INTERP_OFFSET_PP]
30
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
31
 %elifidn %2, sp
32
-    mova      m7, [INTERP_OFFSET_SP]
33
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
34
 %elifidn %2, ps
35
-    mova      m7, [INTERP_OFFSET_PS]
36
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
37
 %endif
38
 
39
 .loopH:
40
@@ -5325,11 +5325,11 @@
41
     mov       r4d, %1/2
42
 
43
 %ifidn %2, pp
44
-    mova      m7, [INTERP_OFFSET_PP]
45
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
46
 %elifidn %2, sp
47
-    mova      m7, [INTERP_OFFSET_SP]
48
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
49
 %elifidn %2, ps
50
-    mova      m7, [INTERP_OFFSET_PS]
51
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
52
 %endif
53
 
54
 .loopH:
55
@@ -5456,11 +5456,11 @@
56
     mov       r4d, %1/2
57
 
58
 %ifidn %2, pp
59
-    mova      m7, [INTERP_OFFSET_PP]
60
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
61
 %elifidn %2, sp
62
-    mova      m7, [INTERP_OFFSET_SP]
63
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
64
 %elifidn %2, ps
65
-    mova      m7, [INTERP_OFFSET_PS]
66
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
67
 %endif
68
 
69
 .loopH:
70
@@ -5609,11 +5609,11 @@
71
     mov       r4d, %1/2
72
 
73
 %ifidn %2, pp
74
-    mova      m7, [INTERP_OFFSET_PP]
75
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
76
 %elifidn %2, sp
77
-    mova      m7, [INTERP_OFFSET_SP]
78
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
79
 %elifidn %2, ps
80
-    mova      m7, [INTERP_OFFSET_PS]
81
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
82
 %endif
83
 
84
 .loopH:
85
@@ -5732,11 +5732,11 @@
86
     mov       r4d, 32
87
 
88
 %ifidn %1, pp
89
-    mova      m7, [INTERP_OFFSET_PP]
90
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
91
 %elifidn %1, sp
92
-    mova      m7, [INTERP_OFFSET_SP]
93
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
94
 %elifidn %1, ps
95
-    mova      m7, [INTERP_OFFSET_PS]
96
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
97
 %endif
98
 
99
 .loopH:
100
@@ -6068,7 +6068,7 @@
101
 %ifidn %1,pp
102
     vbroadcasti128  m6, [pd_32]
103
 %elifidn %1, sp
104
-    mova            m6, [pd_524800]
105
+    vbroadcasti128  m6, [INTERP_OFFSET_SP]
106
 %else
107
     vbroadcasti128  m6, [INTERP_OFFSET_PS]
108
 %endif
109
@@ -6178,7 +6178,7 @@
110
 %ifidn %1,pp
111
     vbroadcasti128  m11, [pd_32]
112
 %elifidn %1, sp
113
-    mova            m11, [pd_524800]
114
+    vbroadcasti128  m11, [INTERP_OFFSET_SP]
115
 %else
116
     vbroadcasti128  m11, [INTERP_OFFSET_PS]
117
 %endif
118
@@ -6816,7 +6816,7 @@
119
 %ifidn %1,pp
120
     vbroadcasti128  m14, [pd_32]
121
 %elifidn %1, sp
122
-    mova            m14, [INTERP_OFFSET_SP]
123
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
124
 %else
125
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
126
 %endif
127
@@ -6867,7 +6867,7 @@
128
 %ifidn %3,pp
129
     vbroadcasti128  m14, [pd_32]
130
 %elifidn %3, sp
131
-    mova            m14, [INTERP_OFFSET_SP]
132
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
133
 %else
134
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
135
 %endif
136
@@ -6950,7 +6950,7 @@
137
 %ifidn %1,pp
138
     vbroadcasti128  m14, [pd_32]
139
 %elifidn %1, sp
140
-    mova            m14, [INTERP_OFFSET_SP]
141
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
142
 %else
143
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
144
 %endif
145
@@ -7597,7 +7597,7 @@
146
 %ifidn %1,pp
147
     vbroadcasti128  m11, [pd_32]
148
 %elifidn %1, sp
149
-    mova            m11, [INTERP_OFFSET_SP]
150
+    vbroadcasti128  m11, [INTERP_OFFSET_SP]
151
 %else
152
     vbroadcasti128  m11, [INTERP_OFFSET_PS]
153
 %endif
154
@@ -7644,7 +7644,7 @@
155
 %ifidn %1,pp
156
     vbroadcasti128  m14, [pd_32]
157
 %elifidn %1, sp
158
-    mova            m14, [INTERP_OFFSET_SP]
159
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
160
 %else
161
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
162
 %endif
163
@@ -7816,7 +7816,7 @@
164
 %ifidn %1,pp
165
     vbroadcasti128  m7, [pd_32]
166
 %elifidn %1, sp
167
-    mova            m7, [INTERP_OFFSET_SP]
168
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
169
 %else
170
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
171
 %endif
172
@@ -7861,7 +7861,7 @@
173
 %ifidn %1,pp
174
     vbroadcasti128  m7, [pd_32]
175
 %elifidn %1, sp
176
-    mova            m7, [INTERP_OFFSET_SP]
177
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
178
 %else
179
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
180
 %endif
181
@@ -7901,7 +7901,7 @@
182
 %ifidn %1,pp
183
     vbroadcasti128  m14, [pd_32]
184
 %elifidn %1, sp
185
-    mova            m14, [INTERP_OFFSET_SP]
186
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
187
 %else
188
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
189
 %endif
190
@@ -8248,7 +8248,7 @@
191
 %ifidn %1,pp
192
     vbroadcasti128  m7, [pd_32]
193
 %elifidn %1, sp
194
-    mova            m7, [INTERP_OFFSET_SP]
195
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
196
 %else
197
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
198
 %endif
199
@@ -8668,7 +8668,7 @@
200
 %ifidn %1,pp
201
x265_1.8.tar.gz/source/common/x86/ipfilter8.asm -> x265_1.9.tar.gz/source/common/x86/ipfilter8.asm Changed
201
 
1
@@ -12541,6 +12541,459 @@
2
 ;-----------------------------------------------------------------------------
3
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
4
 ;-----------------------------------------------------------------------------
5
+INIT_YMM avx2
6
+cglobal filterPixelToShort_16x4, 3, 4, 2
7
+    mov             r3d, r3m
8
+    add             r3d, r3d
9
+
10
+    ; load constant
11
+    vbroadcasti128  m1, [pw_2000]
12
+
13
+    pmovzxbw        m0, [r0]
14
+    psllw           m0, 6
15
+    psubw           m0, m1
16
+    movu            [r2], m0
17
+
18
+    pmovzxbw        m0, [r0 + r1]
19
+    psllw           m0, 6
20
+    psubw           m0, m1
21
+    movu            [r2 + r3], m0
22
+
23
+    pmovzxbw        m0, [r0 + r1 * 2]
24
+    psllw           m0, 6
25
+    psubw           m0, m1
26
+    movu            [r2 + r3 * 2], m0
27
+
28
+    lea             r1, [r1 * 3]
29
+    lea             r3, [r3 * 3]
30
+
31
+    pmovzxbw        m0, [r0 + r1]
32
+    psllw           m0, 6
33
+    psubw           m0, m1
34
+    movu            [r2 + r3], m0
35
+    RET
36
+
37
+;-----------------------------------------------------------------------------
38
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
39
+;-----------------------------------------------------------------------------
40
+INIT_YMM avx2
41
+cglobal filterPixelToShort_16x8, 3, 6, 2
42
+    mov             r3d, r3m
43
+    add             r3d, r3d
44
+    lea             r4, [r1 * 3]
45
+    lea             r5, [r3 * 3]
46
+
47
+    ; load constant
48
+    vbroadcasti128  m1, [pw_2000]
49
+
50
+    pmovzxbw        m0, [r0]
51
+    psllw           m0, 6
52
+    psubw           m0, m1
53
+    movu            [r2], m0
54
+
55
+    pmovzxbw        m0, [r0 + r1]
56
+    psllw           m0, 6
57
+    psubw           m0, m1
58
+    movu            [r2 + r3], m0
59
+
60
+    pmovzxbw        m0, [r0 + r1 * 2]
61
+    psllw           m0, 6
62
+    psubw           m0, m1
63
+    movu            [r2 + r3 * 2], m0
64
+
65
+    pmovzxbw        m0, [r0 + r4]
66
+    psllw           m0, 6
67
+    psubw           m0, m1
68
+    movu            [r2 + r5], m0
69
+
70
+    lea             r0, [r0 + r1 * 4]
71
+    lea             r2, [r2 + r3 * 4]
72
+
73
+    pmovzxbw        m0, [r0]
74
+    psllw           m0, 6
75
+    psubw           m0, m1
76
+    movu            [r2], m0
77
+
78
+    pmovzxbw        m0, [r0 + r1]
79
+    psllw           m0, 6
80
+    psubw           m0, m1
81
+    movu            [r2 + r3], m0
82
+
83
+    pmovzxbw        m0, [r0 + r1 * 2]
84
+    psllw           m0, 6
85
+    psubw           m0, m1
86
+    movu            [r2 + r3 * 2], m0
87
+
88
+    pmovzxbw        m0, [r0 + r4]
89
+    psllw           m0, 6
90
+    psubw           m0, m1
91
+    movu            [r2 + r5], m0
92
+    RET
93
+
94
+;-----------------------------------------------------------------------------
95
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
96
+;-----------------------------------------------------------------------------
97
+INIT_YMM avx2
98
+cglobal filterPixelToShort_16x12, 3, 6, 2
99
+    mov             r3d, r3m
100
+    add             r3d, r3d
101
+    lea             r4, [r1 * 3]
102
+    lea             r5, [r3 * 3]
103
+
104
+    ; load constant
105
+    vbroadcasti128  m1, [pw_2000]
106
+
107
+    pmovzxbw        m0, [r0]
108
+    psllw           m0, 6
109
+    psubw           m0, m1
110
+    movu            [r2], m0
111
+
112
+    pmovzxbw        m0, [r0 + r1]
113
+    psllw           m0, 6
114
+    psubw           m0, m1
115
+    movu            [r2 + r3], m0
116
+
117
+    pmovzxbw        m0, [r0 + r1 * 2]
118
+    psllw           m0, 6
119
+    psubw           m0, m1
120
+    movu            [r2 + r3 * 2], m0
121
+
122
+    pmovzxbw        m0, [r0 + r4]
123
+    psllw           m0, 6
124
+    psubw           m0, m1
125
+    movu            [r2 + r5], m0
126
+
127
+    lea             r0, [r0 + r1 * 4]
128
+    lea             r2, [r2 + r3 * 4]
129
+
130
+    pmovzxbw        m0, [r0]
131
+    psllw           m0, 6
132
+    psubw           m0, m1
133
+    movu            [r2], m0
134
+
135
+    pmovzxbw        m0, [r0 + r1]
136
+    psllw           m0, 6
137
+    psubw           m0, m1
138
+    movu            [r2 + r3], m0
139
+
140
+    pmovzxbw        m0, [r0 + r1 * 2]
141
+    psllw           m0, 6
142
+    psubw           m0, m1
143
+    movu            [r2 + r3 * 2], m0
144
+
145
+    pmovzxbw        m0, [r0 + r4]
146
+    psllw           m0, 6
147
+    psubw           m0, m1
148
+    movu            [r2 + r5], m0
149
+
150
+    lea             r0, [r0 + r1 * 4]
151
+    lea             r2, [r2 + r3 * 4]
152
+
153
+    pmovzxbw        m0, [r0]
154
+    psllw           m0, 6
155
+    psubw           m0, m1
156
+    movu            [r2], m0
157
+
158
+    pmovzxbw        m0, [r0 + r1]
159
+    psllw           m0, 6
160
+    psubw           m0, m1
161
+    movu            [r2 + r3], m0
162
+
163
+    pmovzxbw        m0, [r0 + r1 * 2]
164
+    psllw           m0, 6
165
+    psubw           m0, m1
166
+    movu            [r2 + r3 * 2], m0
167
+
168
+    pmovzxbw        m0, [r0 + r4]
169
+    psllw           m0, 6
170
+    psubw           m0, m1
171
+    movu            [r2 + r5], m0
172
+    RET
173
+
174
+;-----------------------------------------------------------------------------
175
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
176
+;-----------------------------------------------------------------------------
177
+INIT_YMM avx2
178
+cglobal filterPixelToShort_16x16, 3, 6, 2
179
+    mov             r3d, r3m
180
+    add             r3d, r3d
181
+    lea             r4, [r1 * 3]
182
+    lea             r5, [r3 * 3]
183
+
184
+    ; load constant
185
+    vbroadcasti128  m1, [pw_2000]
186
+
187
+    pmovzxbw        m0, [r0]
188
+    psllw           m0, 6
189
+    psubw           m0, m1
190
+    movu            [r2], m0
191
+
192
+    pmovzxbw        m0, [r0 + r1]
193
+    psllw           m0, 6
194
+    psubw           m0, m1
195
+    movu            [r2 + r3], m0
196
+
197
+    pmovzxbw        m0, [r0 + r1 * 2]
198
+    psllw           m0, 6
199
+    psubw           m0, m1
200
+    movu            [r2 + r3 * 2], m0
201
x265_1.8.tar.gz/source/common/x86/loopfilter.asm -> x265_1.9.tar.gz/source/common/x86/loopfilter.asm Changed
201
 
1
@@ -26,24 +26,28 @@
2
 ;*****************************************************************************/
3
 
4
 %include "x86inc.asm"
5
+%include "x86util.asm"
6
 
7
 SECTION_RODATA 32
8
 pb_31:      times 32 db 31
9
 pb_124:     times 32 db 124
10
 pb_15:      times 32 db 15
11
-pb_movemask_32:  times 32 db 0x00
12
-                 times 32 db 0xFF
13
 
14
 SECTION .text
15
 cextern pb_1
16
-cextern pb_128
17
 cextern pb_2
18
+cextern pb_3
19
+cextern pb_4
20
+cextern pb_01
21
+cextern pb_128
22
+cextern pw_1
23
+cextern pw_n1
24
 cextern pw_2
25
+cextern pw_4
26
 cextern pw_pixel_max
27
 cextern pb_movemask
28
-cextern pw_1
29
+cextern pb_movemask_32
30
 cextern hmul_16p
31
-cextern pb_4
32
 
33
 
34
 ;============================================================================================================
35
@@ -1989,79 +1993,94 @@
36
 %endif
37
 
38
 ;--------------------------------------------------------------------------------------------------------------------------
39
-; saoCuStatsBO_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
40
+; saoCuStatsBO_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
41
 ;--------------------------------------------------------------------------------------------------------------------------
42
 %if ARCH_X86_64
43
 INIT_XMM sse4
44
-cglobal saoCuStatsBO, 7,12,6
45
-    mova        m3, [hmul_16p + 16]
46
-    mova        m4, [pb_124]
47
-    mova        m5, [pb_4]
48
-    xor         r7d, r7d
49
+cglobal saoCuStatsBO, 7,13,2
50
+    mova        m0, [pb_124]
51
+    add         r5, 4
52
+    add         r6, 4
53
 
54
 .loopH:
55
-    mov         r10, r0
56
+    mov         r12, r0
57
     mov         r11, r1
58
     mov         r9d, r3d
59
+
60
 .loopL:
61
     movu        m1, [r11]
62
-    movu        m0, [r10]
63
+    psrlw       m1, 1                   ; rec[x] >> boShift
64
+    pand        m1, m0
65
 
66
-    punpckhbw   m2, m0, m1
67
-    punpcklbw   m0, m1
68
-    psrlw       m1, 1               ; rec[x] >> boShift
69
-    pmaddubsw   m2, m3
70
-    pmaddubsw   m0, m3
71
-    pand        m1, m4
72
-    paddb       m1, m5
73
+    cmp         r9d, 8
74
+    jle        .proc8
75
 
76
+    movq        r10, m1
77
 %assign x 0
78
-%rep 16
79
-    pextrb      r7d, m1, x
80
+%rep 8
81
+    movzx       r7d, r10b
82
+    shr         r10, 8
83
 
84
-%if (x < 8)
85
-    pextrw      r8d, m0, (x % 8)
86
-%else
87
-    pextrw      r8d, m2, (x % 8)
88
-%endif
89
-    movsx       r8d, r8w
90
-    inc         dword  [r6 + r7]    ; count[classIdx]++
91
-    add         [r5 + r7], r8d      ; stats[classIdx] += (fenc[x] - rec[x]);
92
+    movsx       r8d, word [r12 + x*2]   ; diff[x]
93
+    inc         dword  [r6 + r7]        ; count[classIdx]++
94
+    add         [r5 + r7], r8d          ; stats[classIdx] += (fenc[x] - rec[x]);
95
+%assign x x+1
96
+%endrep
97
+    movhlps     m1, m1
98
+    sub         r9d, 8
99
+    add         r12, 8*2
100
+
101
+.proc8:
102
+    movq        r10, m1
103
+%assign x 0
104
+%rep 8
105
+    movzx       r7d, r10b
106
+    shr         r10, 8
107
+
108
+    movsx       r8d, word [r12 + x*2]   ; diff[x]
109
+    inc         dword  [r6 + r7]        ; count[classIdx]++
110
+    add         [r5 + r7], r8d          ; stats[classIdx] += (fenc[x] - rec[x]);
111
     dec         r9d
112
-    jz          .next
113
+    jz         .next
114
 %assign x x+1
115
 %endrep
116
 
117
-    add         r10, 16
118
+    add         r12, 8*2
119
     add         r11, 16
120
-    jmp         .loopL
121
+    jmp        .loopL
122
 
123
 .next:
124
-    add         r0, r2
125
+    add         r0, 64*2                ; MAX_CU_SIZE
126
     add         r1, r2
127
     dec         r4d
128
-    jnz         .loopH
129
+    jnz        .loopH
130
     RET
131
 %endif
132
 
133
 ;-----------------------------------------------------------------------------------------------------------------------
134
-; saoCuStatsE0(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
135
+; saoCuStatsE0(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
136
 ;-----------------------------------------------------------------------------------------------------------------------
137
 %if ARCH_X86_64
138
 INIT_XMM sse4
139
-cglobal saoCuStatsE0, 5,9,8, 0-32
140
+cglobal saoCuStatsE0, 3,10,6, 0-32
141
     mov         r3d, r3m
142
-    mov         r8, r5mp
143
+    mov         r4d, r4m
144
+    mov         r9, r5mp
145
 
146
     ; clear internal temporary buffer
147
     pxor        m0, m0
148
     mova        [rsp], m0
149
     mova        [rsp + mmsize], m0
150
     mova        m4, [pb_128]
151
-    mova        m5, [hmul_16p + 16]
152
-    mova        m6, [pb_2]
153
+    mova        m5, [pb_2]
154
     xor         r7d, r7d
155
 
156
+    ; correct stride for diff[] and rec
157
+    mov         r6d, r3d
158
+    and         r6d, ~15
159
+    sub         r2, r6
160
+    lea         r8, [(r6 - 64) * 2]             ; 64 = MAX_CU_SIZE
161
+
162
 .loopH:
163
     mov         r5d, r3d
164
 
165
@@ -2075,100 +2094,257 @@
166
     pinsrb      m0, r7d, 15
167
 
168
 .loopL:
169
-    movu        m7, [r1]
170
+    movu        m3, [r1]
171
     movu        m2, [r1 + 1]
172
 
173
-    pxor        m1, m7, m4
174
-    pxor        m3, m2, m4
175
-    pcmpgtb     m2, m1, m3
176
-    pcmpgtb     m3, m1
177
-    pand        m2, [pb_1]
178
-    por         m2, m3              ; signRight
179
+    pxor        m1, m3, m4
180
+    pxor        m2, m4
181
+    pcmpgtb     m3, m1, m2
182
+    pcmpgtb     m2, m1
183
+    pand        m3, [pb_1]
184
+    por         m2, m3                          ; signRight
185
 
186
     palignr     m3, m2, m0, 15
187
-    psignb      m3, m4              ; signLeft
188
+    psignb      m3, m4                          ; signLeft
189
 
190
     mova        m0, m2
191
     paddb       m2, m3
192
-    paddb       m2, m6              ; edgeType
193
+    paddb       m2, m5                          ; edgeType
194
 
195
     ; stats[edgeType]
196
-    movu        m3, [r0]            ; fenc[0-15]
197
-    punpckhbw   m1, m3, m7
198
-    punpcklbw   m3, m7
199
-    pmaddubsw   m1, m5
200
-    pmaddubsw   m3, m5
201
x265_1.8.tar.gz/source/common/x86/loopfilter.h -> x265_1.9.tar.gz/source/common/x86/loopfilter.h Changed
32
 
1
@@ -3,6 +3,7 @@
2
  *
3
  * Authors: Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
4
  *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
5
+;*          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -35,14 +36,17 @@
10
     void PFX(saoCuOrgE3_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
11
     void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
12
     void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
13
-    void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
14
-    void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
15
-    void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
16
-    void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
17
-    void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
18
+    void PFX(saoCuStatsBO_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
19
+    void PFX(saoCuStatsE0_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
20
+    void PFX(saoCuStatsE1_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
21
+    void PFX(saoCuStatsE2_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
22
+    void PFX(saoCuStatsE3_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
23
     void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
24
 
25
 DECL_SAO(sse4);
26
 DECL_SAO(avx2);
27
 
28
+void PFX(pelFilterLumaStrong_V_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
29
+void PFX(pelFilterLumaStrong_H_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
30
+
31
 #endif // ifndef X265_LOOPFILTER_H
32
x265_1.8.tar.gz/source/common/x86/mc-a.asm -> x265_1.9.tar.gz/source/common/x86/mc-a.asm Changed
119
 
1
@@ -2,6 +2,7 @@
2
 ;* mc-a.asm: x86 motion compensation
3
 ;*****************************************************************************
4
 ;* Copyright (C) 2003-2013 x264 project
5
+;* Copyright (C) 2013-2015 x265 project
6
 ;*
7
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8
 ;*          Fiona Glaser <fiona@x264.com>
9
@@ -3989,8 +3990,12 @@
10
     test dword r4m, 15
11
     jz pixel_avg_w%1_sse2
12
 %endif
13
+%if (%1 == 8)
14
+    jmp pixel_avg_w8_unaligned_sse2
15
+%else
16
     jmp pixel_avg_w%1_mmx2
17
 %endif
18
+%endif
19
 %endmacro
20
 
21
 ;-----------------------------------------------------------------------------
22
@@ -4049,6 +4054,32 @@
23
     lea     r4, [r4 + 4 * r5]
24
 %endmacro
25
 
26
+INIT_XMM sse2
27
+cglobal pixel_avg_w8_unaligned
28
+    AVG_START
29
+.height_loop:
30
+%if HIGH_BIT_DEPTH
31
+    ; NO TEST BRANCH!
32
+    movu    m0, [t2]
33
+    movu    m1, [t2+SIZEOF_PIXEL*t3]
34
+    movu    m2, [t4]
35
+    movu    m3, [t4+SIZEOF_PIXEL*t5]
36
+    pavgw   m0, m2
37
+    pavgw   m1, m3
38
+    movu    [t0], m0
39
+    movu    [t0+SIZEOF_PIXEL*t1], m1
40
+%else ;!HIGH_BIT_DEPTH
41
+    movq    m0, [t2]
42
+    movhps  m0, [t2+SIZEOF_PIXEL*t3]
43
+    movq    m1, [t4]
44
+    movhps  m1, [t4+SIZEOF_PIXEL*t5]
45
+    pavgb   m0, m1
46
+    movq    [t0], m0
47
+    movhps  [t0+SIZEOF_PIXEL*t1], m0
48
+%endif
49
+    AVG_END
50
+
51
+
52
 ;-------------------------------------------------------------------------------------------------------------------------------
53
 ;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
54
 ;-------------------------------------------------------------------------------------------------------------------------------
55
@@ -4115,11 +4146,11 @@
56
 AVGH 4, 4
57
 AVGH 4, 2
58
 
59
-AVG_FUNC 8, movq, movq
60
-AVGH 8, 32
61
-AVGH 8, 16
62
-AVGH 8,  8
63
-AVGH 8,  4
64
+;AVG_FUNC 8, movq, movq
65
+;AVGH 8, 32
66
+;AVGH 8, 16
67
+;AVGH 8,  8
68
+;AVGH 8,  4
69
 
70
 AVG_FUNC 16, movq, movq
71
 AVGH 16, 64
72
@@ -4197,7 +4228,7 @@
73
 AVGH 4, 4
74
 AVGH 4, 2
75
 
76
-AVG_FUNC 8, movq, movq
77
+;AVG_FUNC 8, movq, movq
78
 AVGH 8, 32
79
 AVGH 8, 16
80
 AVGH 8,  8
81
@@ -4418,6 +4449,37 @@
82
     call pixel_avg_16x64_8bit
83
     call pixel_avg_16x64_8bit
84
     RET
85
+
86
+cglobal pixel_avg_48x64, 6,7,4
87
+   mov          r6d, 4
88
+.loop:
89
+%rep 8
90
+    movu        m0, [r2]
91
+    movu        xm2, [r2 + mmsize]
92
+    movu        m1, [r4]
93
+    movu        xm3, [r4 + mmsize]
94
+    pavgb       m0, m1
95
+    pavgb       xm2, xm3
96
+    movu        [r0], m0
97
+    movu        [r0 + mmsize], xm2
98
+
99
+    movu        m0, [r2 + r3]
100
+    movu        xm2, [r2 + r3 + mmsize]
101
+    movu        m1, [r4 + r5]
102
+    movu        xm3, [r4 + r5 + mmsize]
103
+    pavgb       m0, m1
104
+    pavgb       xm2, xm3
105
+    movu        [r0 + r1], m0
106
+    movu        [r0 + r1 + mmsize], xm2
107
+
108
+    lea         r2, [r2 + r3 * 2]
109
+    lea         r4, [r4 + r5 * 2]
110
+    lea         r0, [r0 + r1 * 2]
111
+%endrep
112
+
113
+    dec         r6d
114
+    jnz         .loop
115
+    RET
116
 %endif
117
 
118
 ;=============================================================================
119
x265_1.8.tar.gz/source/common/x86/mc-a2.asm -> x265_1.9.tar.gz/source/common/x86/mc-a2.asm Changed
201
 
1
@@ -2,12 +2,14 @@
2
 ;* mc-a2.asm: x86 motion compensation
3
 ;*****************************************************************************
4
 ;* Copyright (C) 2005-2013 x264 project
5
+;* Copyright (C) 2013-2015 x265 project
6
 ;*
7
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8
 ;*          Fiona Glaser <fiona@x264.com>
9
 ;*          Holger Lubitz <holger@lubitz.org>
10
 ;*          Mathieu Monnier <manao@melix.net>
11
 ;*          Oskar Arvidsson <oskar@irock.se>
12
+;*          Min Chen <chenm003@163.com>
13
 ;*
14
 ;* This program is free software; you can redistribute it and/or modify
15
 ;* it under the terms of the GNU General Public License as published by
16
@@ -46,6 +48,8 @@
17
 pd_16: times 4 dd 16
18
 pd_0f: times 4 dd 0xffff
19
 pf_inv256: times 8 dd 0.00390625
20
+const pd_inv256,    times 4 dq 0.00390625
21
+const pd_0_5,       times 4 dq 0.5
22
 
23
 SECTION .text
24
 
25
@@ -987,151 +991,227 @@
26
 %endif
27
 
28
 ;-----------------------------------------------------------------------------
29
-; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
30
-;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
31
+; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs,
32
+;                             uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len )
33
 ;-----------------------------------------------------------------------------
34
-%macro MBTREE 0
35
+INIT_XMM sse2
36
 cglobal mbtree_propagate_cost, 7,7,7
37
-    add        r6d, r6d
38
-    lea         r0, [r0+r6*2]
39
-    add         r1, r6
40
-    add         r2, r6
41
-    add         r3, r6
42
-    add         r4, r6
43
-    neg         r6
44
-    pxor      xmm4, xmm4
45
-    movss     xmm6, [r5]
46
-    shufps    xmm6, xmm6, 0
47
-    mulps     xmm6, [pf_inv256]
48
-    movdqa    xmm5, [pw_3fff]
49
+    dec         r6d
50
+    movsd       m6, [r5]
51
+    mulpd       m6, [pd_inv256]
52
+    xor         r5d, r5d
53
+    lea         r0, [r0+r5*2]
54
+    pxor        m4, m4
55
+    movlhps     m6, m6
56
+    mova        m5, [pw_3fff]
57
+
58
 .loop:
59
-    movq      xmm2, [r2+r6] ; intra
60
-    movq      xmm0, [r4+r6] ; invq
61
-    movq      xmm3, [r3+r6] ; inter
62
-    movq      xmm1, [r1+r6] ; prop
63
-    punpcklwd xmm2, xmm4
64
-    punpcklwd xmm0, xmm4
65
-    pmaddwd   xmm0, xmm2
66
-    pand      xmm3, xmm5
67
-    punpcklwd xmm1, xmm4
68
-    punpcklwd xmm3, xmm4
69
-%if cpuflag(fma4)
70
-    cvtdq2ps  xmm0, xmm0
71
-    cvtdq2ps  xmm1, xmm1
72
-    fmaddps   xmm0, xmm0, xmm6, xmm1
73
-    cvtdq2ps  xmm1, xmm2
74
-    psubd     xmm2, xmm3
75
-    cvtdq2ps  xmm2, xmm2
76
-    rcpps     xmm3, xmm1
77
-    mulps     xmm1, xmm3
78
-    mulps     xmm0, xmm2
79
-    addps     xmm2, xmm3, xmm3
80
-    fnmaddps  xmm3, xmm1, xmm3, xmm2
81
-    mulps     xmm0, xmm3
82
-%else
83
-    cvtdq2ps  xmm0, xmm0
84
-    mulps     xmm0, xmm6    ; intra*invq*fps_factor>>8
85
-    cvtdq2ps  xmm1, xmm1    ; prop
86
-    addps     xmm0, xmm1    ; prop + (intra*invq*fps_factor>>8)
87
-    cvtdq2ps  xmm1, xmm2    ; intra
88
-    psubd     xmm2, xmm3    ; intra - inter
89
-    cvtdq2ps  xmm2, xmm2    ; intra - inter
90
-    rcpps     xmm3, xmm1    ; 1 / intra 1st approximation
91
-    mulps     xmm1, xmm3    ; intra * (1/intra 1st approx)
92
-    mulps     xmm1, xmm3    ; intra * (1/intra 1st approx)^2
93
-    mulps     xmm0, xmm2    ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
94
-    addps     xmm3, xmm3    ; 2 * (1/intra 1st approx)
95
-    subps     xmm3, xmm1    ; 2nd approximation for 1/intra
96
-    mulps     xmm0, xmm3    ; / intra
97
-%endif
98
-    cvtps2dq  xmm0, xmm0
99
-    movdqa [r0+r6*2], xmm0
100
-    add         r6, 8
101
-    jl .loop
102
+    movh        m2, [r2+r5*4]       ; intra
103
+    movh        m0, [r4+r5*4]       ; invq
104
+    movd        m3, [r3+r5*2]       ; inter
105
+    pand        m3, m5
106
+    punpcklwd   m3, m4
107
+
108
+    ; PMINSD
109
+    pcmpgtd     m1, m2, m3
110
+    pand        m3, m1
111
+    pandn       m1, m2
112
+    por         m3, m1
113
+
114
+    movd        m1, [r1+r5*2]       ; prop
115
+    punpckldq   m2, m2
116
+    punpckldq   m0, m0
117
+    pmuludq     m0, m2
118
+    pshufd      m2, m2, q3120
119
+    pshufd      m0, m0, q3120
120
+
121
+    punpcklwd   m1, m4
122
+    cvtdq2pd    m0, m0
123
+    mulpd       m0, m6              ; intra*invq*fps_factor>>8
124
+    cvtdq2pd    m1, m1              ; prop
125
+    addpd       m0, m1              ; prop + (intra*invq*fps_factor>>8)
126
+    ;cvtdq2ps    m1, m2              ; intra
127
+    cvtdq2pd    m1, m2              ; intra
128
+    psubd       m2, m3              ; intra - inter
129
+    cvtdq2pd    m2, m2              ; intra - inter
130
+    ;rcpps       m3, m1
131
+    ;mulps       m1, m3              ; intra * (1/intra 1st approx)
132
+    ;mulps       m1, m3              ; intra * (1/intra 1st approx)^2
133
+    ;addps       m3, m3              ; 2 * (1/intra 1st approx)
134
+    ;subps       m3, m1              ; 2nd approximation for 1/intra
135
+    ;cvtps2pd    m3, m3              ; 1 / intra 1st approximation
136
+    mulpd       m0, m2              ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
137
+    ;mulpd       m0, m3              ; / intra
138
+
139
+    ; TODO: DIVPD very slow, but match to C model output, since it is not bottleneck function, I comment above faster code
140
+    divpd       m0, m1
141
+    addpd       m0, [pd_0_5]
142
+    cvttpd2dq    m0, m0
143
+
144
+    movh        [r0+r5*4], m0
145
+    add         r5d, 2
146
+    cmp         r5d, r6d
147
+    jl         .loop
148
+
149
+    xor         r6d, r5d
150
+    jnz         .even
151
+    movd        m2, [r2+r5*4]       ; intra
152
+    movd        m0, [r4+r5*4]       ; invq
153
+    movd        m3, [r3+r5*2]       ; inter
154
+    pand        m3, m5
155
+    punpcklwd   m3, m4
156
+
157
+    ; PMINSD
158
+    pcmpgtd     m1, m2, m3
159
+    pand        m3, m1
160
+    pandn       m1, m2
161
+    por         m3, m1
162
+
163
+    movd        m1, [r1+r5*2]       ; prop
164
+    punpckldq   m2, m2              ; DWORD [_ 1 _ 0]
165
+    punpckldq   m0, m0
166
+    pmuludq     m0, m2              ; QWORD [m1 m0]
167
+    pshufd      m2, m2, q3120
168
+    pshufd      m0, m0, q3120
169
+    punpcklwd   m1, m4
170
+    cvtdq2pd    m0, m0
171
+    mulpd       m0, m6              ; intra*invq*fps_factor>>8
172
+    cvtdq2pd    m1, m1              ; prop
173
+    addpd       m0, m1              ; prop + (intra*invq*fps_factor>>8)
174
+    cvtdq2pd    m1, m2              ; intra
175
+    psubd       m2, m3              ; intra - inter
176
+    cvtdq2pd    m2, m2              ; intra - inter
177
+    mulpd       m0, m2              ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
178
+
179
+    divpd       m0, m1
180
+    addpd       m0, [pd_0_5]
181
+    cvttpd2dq    m0, m0
182
+    movd        [r0+r5*4], m0
183
+.even:
184
     RET
185
-%endmacro
186
 
187
-INIT_XMM sse2
188
-MBTREE
189
-; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
190
-INIT_XMM fma4
191
-MBTREE
192
-
193
-%macro INT16_UNPACK 1
194
-    vpunpckhwd   xm4, xm%1, xm7
195
-    vpunpcklwd  xm%1, xm7
196
-    vinsertf128  m%1, m%1, xm4, 1
197
-%endmacro
198
 
199
+;-----------------------------------------------------------------------------
200
+; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs,
201
x265_1.8.tar.gz/source/common/x86/mc.h -> x265_1.9.tar.gz/source/common/x86/mc.h Changed
16
 
1
@@ -36,4 +36,14 @@
2
 
3
 #undef LOWRES
4
 
5
+#define PROPAGATE_COST(cpu) \
6
+    void PFX(mbtree_propagate_cost_ ## cpu)(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, \
7
+                                              const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
8
+
9
+PROPAGATE_COST(sse2)
10
+PROPAGATE_COST(avx)
11
+PROPAGATE_COST(avx2)
12
+
13
+#undef PROPAGATE_COST
14
+
15
 #endif // ifndef X265_MC_H
16
x265_1.8.tar.gz/source/common/x86/pixel-a.asm -> x265_1.9.tar.gz/source/common/x86/pixel-a.asm Changed
201
 
1
@@ -2,6 +2,7 @@
2
 ;* pixel.asm: x86 pixel metrics
3
 ;*****************************************************************************
4
 ;* Copyright (C) 2003-2013 x264 project
5
+;* Copyright (C) 2013-2015 x265 project
6
 ;*
7
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8
 ;*          Holger Lubitz <holger@lubitz.org>
9
@@ -70,6 +71,7 @@
10
 cextern pd_2
11
 cextern hmul_16p
12
 cextern pb_movemask
13
+cextern pb_movemask_32
14
 cextern pw_pixel_max
15
 
16
 ;=============================================================================
17
@@ -6497,6 +6499,1357 @@
18
 %endif ; !ARCH_X86_64
19
 %endmacro ; SA8D
20
 
21
+
22
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
23
+INIT_YMM avx2
24
+cglobal sa8d_8x8_12bit
25
+    pmovzxwd        m0, [r0]
26
+    pmovzxwd        m9, [r2]
27
+    psubd           m0, m9
28
+
29
+    pmovzxwd        m1, [r0 + r1]
30
+    pmovzxwd        m9, [r2 + r3]
31
+    psubd           m1, m9
32
+
33
+    pmovzxwd        m2, [r0 + r1 * 2]
34
+    pmovzxwd        m9, [r2 + r3 * 2]
35
+    psubd           m2, m9
36
+
37
+    pmovzxwd        m8, [r0 + r4]
38
+    pmovzxwd        m9, [r2 + r5]
39
+    psubd           m8, m9
40
+
41
+    lea             r0, [r0 + r1 * 4]
42
+    lea             r2, [r2 + r3 * 4]
43
+
44
+    pmovzxwd        m4, [r0]
45
+    pmovzxwd        m9, [r2]
46
+    psubd           m4, m9
47
+
48
+    pmovzxwd        m5, [r0 + r1]
49
+    pmovzxwd        m9, [r2 + r3]
50
+    psubd           m5, m9
51
+
52
+    pmovzxwd        m3, [r0 + r1 * 2]
53
+    pmovzxwd        m9, [r2 + r3 * 2]
54
+    psubd           m3, m9
55
+
56
+    pmovzxwd        m7, [r0 + r4]
57
+    pmovzxwd        m9, [r2 + r5]
58
+    psubd           m7, m9
59
+
60
+    mova            m6, m0
61
+    paddd           m0, m1
62
+    psubd           m1, m6
63
+    mova            m6, m2
64
+    paddd           m2, m8
65
+    psubd           m8, m6
66
+    mova            m6, m0
67
+
68
+    punpckldq       m0, m1
69
+    punpckhdq       m6, m1
70
+
71
+    mova            m1, m0
72
+    paddd           m0, m6
73
+    psubd           m6, m1
74
+    mova            m1, m2
75
+
76
+    punpckldq       m2, m8
77
+    punpckhdq       m1, m8
78
+
79
+    mova            m8, m2
80
+    paddd           m2, m1
81
+    psubd           m1, m8
82
+    mova            m8, m4
83
+    paddd           m4, m5
84
+    psubd           m5, m8
85
+    mova            m8, m3
86
+    paddd           m3, m7
87
+    psubd           m7, m8
88
+    mova            m8, m4
89
+
90
+    punpckldq       m4, m5
91
+    punpckhdq       m8, m5
92
+
93
+    mova            m5, m4
94
+    paddd           m4, m8
95
+    psubd           m8, m5
96
+    mova            m5, m3
97
+    punpckldq       m3, m7
98
+    punpckhdq       m5, m7
99
+
100
+    mova            m7, m3
101
+    paddd           m3, m5
102
+    psubd           m5, m7
103
+    mova            m7, m0
104
+    paddd           m0, m2
105
+    psubd           m2, m7
106
+    mova            m7, m6
107
+    paddd           m6, m1
108
+    psubd           m1, m7
109
+    mova            m7, m0
110
+
111
+    punpcklqdq      m0, m2
112
+    punpckhqdq      m7, m2
113
+
114
+    mova            m2, m0
115
+    paddd           m0, m7
116
+    psubd           m7, m2
117
+    mova            m2, m6
118
+
119
+    punpcklqdq      m6, m1
120
+    punpckhqdq      m2, m1
121
+
122
+    mova            m1, m6
123
+    paddd           m6, m2
124
+    psubd           m2, m1
125
+    mova            m1, m4
126
+    paddd           m4, m3
127
+    psubd           m3, m1
128
+    mova            m1, m8
129
+    paddd           m8, m5
130
+    psubd           m5, m1
131
+    mova            m1, m4
132
+
133
+    punpcklqdq      m4, m3
134
+    punpckhqdq      m1, m3
135
+
136
+    mova            m3, m4
137
+    paddd           m4, m1
138
+    psubd           m1, m3
139
+    mova            m3, m8
140
+
141
+    punpcklqdq      m8, m5
142
+    punpckhqdq      m3, m5
143
+
144
+    mova            m5, m8
145
+    paddd           m8, m3
146
+    psubd           m3, m5
147
+    mova            m5, m0
148
+    paddd           m0, m4
149
+    psubd           m4, m5
150
+    mova            m5, m7
151
+    paddd           m7, m1
152
+    psubd           m1, m5
153
+    mova            m5, m0
154
+
155
+    vinserti128     m0, m0, xm4, 1
156
+    vperm2i128      m5, m5, m4, 00110001b
157
+
158
+    pxor            m4, m4
159
+    psubd           m4, m0
160
+    pmaxsd          m0, m4
161
+    pxor            m4, m4
162
+    psubd           m4, m5
163
+    pmaxsd          m5, m4
164
+    pmaxsd          m0, m5
165
+    mova            m4, m7
166
+
167
+    vinserti128     m7, m7, xm1, 1
168
+    vperm2i128      m4, m4, m1, 00110001b
169
+
170
+    pxor            m1, m1
171
+    psubd           m1, m7
172
+    pmaxsd          m7, m1
173
+    pxor            m1, m1
174
+    psubd           m1, m4
175
+    pmaxsd          m4, m1
176
+    pmaxsd          m7, m4
177
+    mova            m1, m6
178
+    paddd           m6, m8
179
+    psubd           m8, m1
180
+    mova            m1, m2
181
+    paddd           m2, m3
182
+    psubd           m3, m1
183
+    mova            m1, m6
184
+
185
+    vinserti128     m6, m6, xm8, 1
186
+    vperm2i128      m1, m1, m8, 00110001b
187
+
188
+    pxor            m8, m8
189
+    psubd           m8, m6
190
+    pmaxsd          m6, m8
191
+    pxor            m8, m8
192
+    psubd           m8, m1
193
+    pmaxsd          m1, m8
194
+    pmaxsd          m6, m1
195
+    mova            m8, m2
196
+
197
+    vinserti128     m2, m2, xm3, 1
198
+    vperm2i128      m8, m8, m3, 00110001b
199
+
200
+    pxor            m3, m3
201
x265_1.8.tar.gz/source/common/x86/pixel-util.h -> x265_1.9.tar.gz/source/common/x86/pixel-util.h Changed
16
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+;*          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -55,5 +56,6 @@
10
 int PFX(scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize));
11
 uint32_t PFX(findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]));
12
 uint32_t PFX(costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
13
+uint32_t PFX(costCoeffNxN_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
14
 
15
 #endif // ifndef X265_PIXEL_UTIL_H
16
x265_1.8.tar.gz/source/common/x86/pixel-util8.asm -> x265_1.9.tar.gz/source/common/x86/pixel-util8.asm Changed
201
 
1
@@ -49,6 +49,7 @@
2
 mask_ff:                times 16 db 0xff
3
                         times 16 db 0
4
 deinterleave_shuf:      times  2 db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
5
+interleave_shuf:        times  2 db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
6
 deinterleave_word_shuf: times  2 db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
7
 hmulw_16p:              times  8 dw 1
8
                         times  4 dw 1, -1
9
@@ -56,7 +57,7 @@
10
 SECTION .text
11
 
12
 cextern pw_1
13
-cextern pw_0_15
14
+cextern pw_0_7
15
 cextern pb_1
16
 cextern pb_128
17
 cextern pw_00ff
18
@@ -78,6 +79,7 @@
19
 cextern trans8_shuf
20
 cextern_naked private_prefix %+ _entropyStateBits
21
 cextern pb_movemask
22
+cextern pw_exp2_0_15
23
 
24
 ;-----------------------------------------------------------------------------
25
 ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
26
@@ -792,6 +794,7 @@
27
     pshufd      m6, m6, 0       ; m6 = add
28
     mov         r3d, r4d        ; r3 = numCoeff
29
     shr         r4d, 3
30
+    pxor        m4, m4
31
 
32
 .loop:
33
     pmovsxwd    m0, [r0]        ; m0 = level
34
@@ -810,13 +813,13 @@
35
     psignd      m3, m1
36
 
37
     packssdw    m2, m3
38
+    pabsw       m2, m2
39
 
40
     movu        [r2], m2
41
     add         r0, 16
42
     add         r1, 32
43
     add         r2, 16
44
 
45
-    pxor        m4, m4
46
     pcmpeqw     m2, m4
47
     psubw       m7, m2
48
 
49
@@ -862,9 +865,11 @@
50
     psignd      m2, m0
51
 
52
     packssdw    m1, m2
53
-    vpermq      m2, m1, q3120
54
+    pabsw       m1, m1
55
 
56
+    vpermq      m2, m1, q3120
57
     movu        [r2], m2
58
+
59
     add         r0, mmsize
60
     add         r1, mmsize * 2
61
     add         r2, mmsize
62
@@ -1560,7 +1565,7 @@
63
     movd        m0, r6d
64
     pshuflw     m0, m0, 0
65
     punpcklqdq  m0, m0
66
-    pcmpgtw     m0, [pw_0_15]
67
+    pcmpgtw     m0, [pw_0_7]
68
 
69
 .loopH:
70
     mov         r6d, r4d
71
@@ -1718,7 +1723,7 @@
72
     pshuflw                   m0, m0, 0
73
     punpcklqdq                m0, m0
74
     vinserti128               m0, m0, xm0, 1
75
-    pcmpgtw                   m0, [pw_0_15]
76
+    pcmpgtw                   m0, [pw_0_7]
77
 
78
 .loopH:
79
     mov                       r6d, r4d
80
@@ -6397,6 +6402,78 @@
81
     movd   edx, xm6
82
 %endif
83
     RET
84
+
85
+INIT_YMM avx2
86
+cglobal pixel_var_32x32, 2,4,7
87
+    VAR_START 0
88
+    mov             r2d, 16
89
+
90
+.loop:
91
+    pmovzxbw        m0, [r0]
92
+    pmovzxbw        m3, [r0 + 16]
93
+    pmovzxbw        m1, [r0 + r1]
94
+    pmovzxbw        m4, [r0 + r1 + 16]
95
+
96
+    lea             r0, [r0 + r1 * 2]
97
+
98
+    VAR_CORE
99
+
100
+    dec             r2d
101
+    jg              .loop
102
+
103
+    vextracti128   xm0, m5, 1
104
+    vextracti128   xm1, m6, 1
105
+    paddw          xm5, xm0
106
+    paddd          xm6, xm1
107
+    HADDW          xm5, xm2
108
+    HADDD          xm6, xm1
109
+
110
+%if ARCH_X86_64
111
+    punpckldq      xm5, xm6
112
+    movq           rax, xm5
113
+%else
114
+    movd           eax, xm5
115
+    movd           edx, xm6
116
+%endif
117
+    RET
118
+
119
+INIT_YMM avx2
120
+cglobal pixel_var_64x64, 2,4,7
121
+    VAR_START 0
122
+    mov             r2d, 64
123
+
124
+.loop:
125
+    pmovzxbw        m0, [r0]
126
+    pmovzxbw        m3, [r0 + 16]
127
+    pmovzxbw        m1, [r0 + mmsize]
128
+    pmovzxbw        m4, [r0 + mmsize + 16]
129
+
130
+    lea             r0, [r0 + r1]
131
+
132
+    VAR_CORE
133
+
134
+    dec             r2d
135
+    jg              .loop
136
+
137
+    pxor            m1, m1
138
+    punpcklwd       m0, m5, m1
139
+    punpckhwd       m5, m1
140
+    paddd           m5, m0
141
+    vextracti128   xm2, m5, 1
142
+    vextracti128   xm1, m6, 1
143
+    paddd          xm5, xm2
144
+    paddd          xm6, xm1
145
+    HADDD          xm5, xm2
146
+    HADDD          xm6, xm1
147
+
148
+%if ARCH_X86_64
149
+    punpckldq      xm5, xm6
150
+    movq           rax, xm5
151
+%else
152
+    movd           eax, xm5
153
+    movd           edx, xm6
154
+%endif
155
+    RET
156
 %endif ; !HIGH_BIT_DEPTH
157
 
158
 %macro VAR2_END 3
159
@@ -6578,10 +6655,10 @@
160
 
161
 
162
 ;-----------------------------------------------------------------------------
163
-; uint32_t[last first] findPosFirstAndLast(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
164
+; uint32_t[sumSign last first] findPosFirstLast(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16], uint32_t *absSum)
165
 ;-----------------------------------------------------------------------------
166
 INIT_XMM ssse3
167
-cglobal findPosFirstLast, 3,3,3
168
+cglobal findPosFirstLast, 3,3,4
169
     ; convert stride to int16_t
170
     add         r1d, r1d
171
 
172
@@ -6593,10 +6670,22 @@
173
     movh        m1, [r0]
174
     movhps      m1, [r0 + r1]
175
     movh        m2, [r0 + r1 * 2]
176
-    lea         r1, [r1 * 3]
177
+    lea         r1d, [r1 * 3]
178
     movhps      m2, [r0 + r1]
179
+    pxor        m3, m1, m2
180
     packsswb    m1, m2
181
 
182
+    ; get absSum
183
+    movhlps     m2, m3
184
+    pxor        m3, m2
185
+    pshufd      m2, m3, q2301
186
+    pxor        m3, m2
187
+    movd        r0d, m3
188
+    mov         r2d, r0d
189
+    shr         r2d, 16
190
+    xor         r2d, r0d
191
+    shl         r2d, 31
192
+
193
     ; get non-zero mask
194
     pxor        m2, m2
195
     pcmpeqb     m1, m2
196
@@ -6609,319 +6698,10 @@
197
     not         r0d
198
     bsr         r1w, r0w
199
     bsf         eax, r0d    ; side effect: clear AH to Zero
200
-    shl         r1d, 16
201
x265_1.8.tar.gz/source/common/x86/pixel.h -> x265_1.9.tar.gz/source/common/x86/pixel.h Changed
40
 
1
@@ -2,10 +2,12 @@
2
  * pixel.h: x86 pixel metrics
3
  *****************************************************************************
4
  * Copyright (C) 2003-2013 x264 project
5
+ * Copyright (C) 2013-2015 x265 project
6
  *
7
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
8
  *          Loren Merritt <lorenm@u.washington.edu>
9
  *          Fiona Glaser <fiona@x264.com>
10
+;*          Min Chen <chenm003@163.com>
11
  *
12
  * This program is free software; you can redistribute it and/or modify
13
  * it under the terms of the GNU General Public License as published by
14
@@ -34,9 +36,10 @@
15
 void PFX(upShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
16
 void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
17
 void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
18
+pixel PFX(planeClipAndMax_avx2)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
19
 
20
 #define DECL_PIXELS(cpu) \
21
-    FUNCDEF_PU(uint32_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
22
+    FUNCDEF_PU(sse_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
23
     FUNCDEF_PU(int, pixel_sa8d, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
24
     FUNCDEF_PU(void, pixel_sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
25
     FUNCDEF_PU(void, pixel_sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
26
@@ -45,10 +48,10 @@
27
     FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
28
     FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
29
     FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
30
-    FUNCDEF_CHROMA_PU(uint32_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
31
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
32
     FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
33
-    FUNCDEF_CHROMA_PU(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
34
-    FUNCDEF_TU_S(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
35
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
36
+    FUNCDEF_TU_S(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
37
     FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \
38
     FUNCDEF_TU(int, psyCost_pp, cpu, const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); \
39
     FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
40
x265_1.8.tar.gz/source/common/x86/pixeladd8.asm -> x265_1.9.tar.gz/source/common/x86/pixeladd8.asm Changed
9
 
1
@@ -2,6 +2,7 @@
2
 ;* Copyright (C) 2013 x265 project
3
 ;*
4
 ;* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
5
+;*          Min Chen <chenm003@163.com>
6
 ;*
7
 ;* This program is free software; you can redistribute it and/or modify
8
 ;* it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/x86/sad-a.asm -> x265_1.9.tar.gz/source/common/x86/sad-a.asm Changed
201
 
1
@@ -2,6 +2,7 @@
2
 ;* sad-a.asm: x86 sad functions
3
 ;*****************************************************************************
4
 ;* Copyright (C) 2003-2013 x264 project
5
+;* Copyright (C) 2013-2015 x265 project
6
 ;*
7
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8
 ;*          Fiona Glaser <fiona@x264.com>
9
@@ -3328,6 +3329,730 @@
10
     SAD_X4_END_SSE2 1
11
 %endmacro
12
 
13
+%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
14
+INIT_YMM avx2
15
+%macro SAD_X4_64x8_AVX2 0
16
+    movu            m4, [r0]
17
+    movu            m5, [r1]
18
+    movu            m6, [r2]
19
+    movu            m7, [r3]
20
+    movu            m8, [r4]
21
+
22
+    psadbw          m9, m4, m5
23
+    paddd           m0, m9
24
+    psadbw          m5, m4, m6
25
+    paddd           m1, m5
26
+    psadbw          m6, m4, m7
27
+    paddd           m2, m6
28
+    psadbw          m4, m8
29
+    paddd           m3, m4
30
+
31
+    movu            m4, [r0 + mmsize]
32
+    movu            m5, [r1 + mmsize]
33
+    movu            m6, [r2 + mmsize]
34
+    movu            m7, [r3 + mmsize]
35
+    movu            m8, [r4 + mmsize]
36
+
37
+    psadbw          m9, m4, m5
38
+    paddd           m0, m9
39
+    psadbw          m5, m4, m6
40
+    paddd           m1, m5
41
+    psadbw          m6, m4, m7
42
+    paddd           m2, m6
43
+    psadbw          m4, m8
44
+    paddd           m3, m4
45
+
46
+    movu            m4, [r0 + FENC_STRIDE]
47
+    movu            m5, [r1 + r5]
48
+    movu            m6, [r2 + r5]
49
+    movu            m7, [r3 + r5]
50
+    movu            m8, [r4 + r5]
51
+
52
+    psadbw          m9, m4, m5
53
+    paddd           m0, m9
54
+    psadbw          m5, m4, m6
55
+    paddd           m1, m5
56
+    psadbw          m6, m4, m7
57
+    paddd           m2, m6
58
+    psadbw          m4, m8
59
+    paddd           m3, m4
60
+
61
+    movu            m4, [r0 + FENC_STRIDE + mmsize]
62
+    movu            m5, [r1 + r5 + mmsize]
63
+    movu            m6, [r2 + r5 + mmsize]
64
+    movu            m7, [r3 + r5 + mmsize]
65
+    movu            m8, [r4 + r5 + mmsize]
66
+
67
+    psadbw          m9, m4, m5
68
+    paddd           m0, m9
69
+    psadbw          m5, m4, m6
70
+    paddd           m1, m5
71
+    psadbw          m6, m4, m7
72
+    paddd           m2, m6
73
+    psadbw          m4, m8
74
+    paddd           m3, m4
75
+
76
+    movu            m4, [r0 + FENC_STRIDE * 2]
77
+    movu            m5, [r1 + r5 * 2]
78
+    movu            m6, [r2 + r5 * 2]
79
+    movu            m7, [r3 + r5 * 2]
80
+    movu            m8, [r4 + r5 * 2]
81
+
82
+    psadbw          m9, m4, m5
83
+    paddd           m0, m9
84
+    psadbw          m5, m4, m6
85
+    paddd           m1, m5
86
+    psadbw          m6, m4, m7
87
+    paddd           m2, m6
88
+    psadbw          m4, m8
89
+    paddd           m3, m4
90
+
91
+    movu            m4, [r0 + FENC_STRIDE * 2 + mmsize]
92
+    movu            m5, [r1 + r5 * 2 + mmsize]
93
+    movu            m6, [r2 + r5 * 2 + mmsize]
94
+    movu            m7, [r3 + r5 * 2 + mmsize]
95
+    movu            m8, [r4 + r5 * 2 + mmsize]
96
+
97
+    psadbw          m9, m4, m5
98
+    paddd           m0, m9
99
+    psadbw          m5, m4, m6
100
+    paddd           m1, m5
101
+    psadbw          m6, m4, m7
102
+    paddd           m2, m6
103
+    psadbw          m4, m8
104
+    paddd           m3, m4
105
+
106
+    movu            m4, [r0 + FENC_STRIDE * 3]
107
+    movu            m5, [r1 + r7]
108
+    movu            m6, [r2 + r7]
109
+    movu            m7, [r3 + r7]
110
+    movu            m8, [r4 + r7]
111
+
112
+    psadbw          m9, m4, m5
113
+    paddd           m0, m9
114
+    psadbw          m5, m4, m6
115
+    paddd           m1, m5
116
+    psadbw          m6, m4, m7
117
+    paddd           m2, m6
118
+    psadbw          m4, m8
119
+    paddd           m3, m4
120
+
121
+    movu            m4, [r0 + FENC_STRIDE * 3 + mmsize]
122
+    movu            m5, [r1 + r7 + mmsize]
123
+    movu            m6, [r2 + r7 + mmsize]
124
+    movu            m7, [r3 + r7 + mmsize]
125
+    movu            m8, [r4 + r7 + mmsize]
126
+
127
+    psadbw          m9, m4, m5
128
+    paddd           m0, m9
129
+    psadbw          m5, m4, m6
130
+    paddd           m1, m5
131
+    psadbw          m6, m4, m7
132
+    paddd           m2, m6
133
+    psadbw          m4, m8
134
+    paddd           m3, m4
135
+
136
+    add             r0, FENC_STRIDE * 4
137
+    lea             r1, [r1 + r5 * 4]
138
+    lea             r2, [r2 + r5 * 4]
139
+    lea             r3, [r3 + r5 * 4]
140
+    lea             r4, [r4 + r5 * 4]
141
+
142
+    movu            m4, [r0]
143
+    movu            m5, [r1]
144
+    movu            m6, [r2]
145
+    movu            m7, [r3]
146
+    movu            m8, [r4]
147
+
148
+    psadbw          m9, m4, m5
149
+    paddd           m0, m9
150
+    psadbw          m5, m4, m6
151
+    paddd           m1, m5
152
+    psadbw          m6, m4, m7
153
+    paddd           m2, m6
154
+    psadbw          m4, m8
155
+    paddd           m3, m4
156
+
157
+    movu            m4, [r0 + mmsize]
158
+    movu            m5, [r1 + mmsize]
159
+    movu            m6, [r2 + mmsize]
160
+    movu            m7, [r3 + mmsize]
161
+    movu            m8, [r4 + mmsize]
162
+
163
+    psadbw          m9, m4, m5
164
+    paddd           m0, m9
165
+    psadbw          m5, m4, m6
166
+    paddd           m1, m5
167
+    psadbw          m6, m4, m7
168
+    paddd           m2, m6
169
+    psadbw          m4, m8
170
+    paddd           m3, m4
171
+
172
+    movu            m4, [r0 + FENC_STRIDE]
173
+    movu            m5, [r1 + r5]
174
+    movu            m6, [r2 + r5]
175
+    movu            m7, [r3 + r5]
176
+    movu            m8, [r4 + r5]
177
+
178
+    psadbw          m9, m4, m5
179
+    paddd           m0, m9
180
+    psadbw          m5, m4, m6
181
+    paddd           m1, m5
182
+    psadbw          m6, m4, m7
183
+    paddd           m2, m6
184
+    psadbw          m4, m8
185
+    paddd           m3, m4
186
+
187
+    movu            m4, [r0 + FENC_STRIDE + mmsize]
188
+    movu            m5, [r1 + r5 + mmsize]
189
+    movu            m6, [r2 + r5 + mmsize]
190
+    movu            m7, [r3 + r5 + mmsize]
191
+    movu            m8, [r4 + r5 + mmsize]
192
+
193
+    psadbw          m9, m4, m5
194
+    paddd           m0, m9
195
+    psadbw          m5, m4, m6
196
+    paddd           m1, m5
197
+    psadbw          m6, m4, m7
198
+    paddd           m2, m6
199
+    psadbw          m4, m8
200
+    paddd           m3, m4
201
x265_1.8.tar.gz/source/common/x86/sad16-a.asm -> x265_1.9.tar.gz/source/common/x86/sad16-a.asm Changed
201
 
1
@@ -413,77 +413,50 @@
2
 SAD  16, 32
3
 
4
 INIT_YMM avx2
5
-cglobal pixel_sad_16x64, 4,7,4
6
+cglobal pixel_sad_16x64, 4,5,5
7
     pxor    m0, m0
8
-    pxor    m3, m3
9
-    mov     r4d, 64 / 8
10
-    add     r3d, r3d
11
-    add     r1d, r1d
12
-    lea     r5,     [r1 * 3]
13
-    lea     r6,     [r3 * 3]
14
+    mov     r4d, 16
15
+    mova    m4, [pw_1]
16
 .loop:
17
     movu    m1, [r2]
18
-    movu    m2, [r2 + r3]
19
+    movu    m2, [r2 + r3 * 2]
20
     psubw   m1, [r0]
21
-    psubw   m2, [r0 + r1]
22
-    pabsw   m1, m1
23
-    pabsw   m2, m2
24
-    paddw   m0, m1
25
-    paddw   m3, m2
26
-
27
-    movu    m1, [r2 + 2 * r3]
28
-    movu    m2, [r2 + r6]
29
-    psubw   m1, [r0 + 2 * r1]
30
-    psubw   m2, [r0 + r5]
31
+    psubw   m2, [r0 + r1 * 2]
32
     pabsw   m1, m1
33
     pabsw   m2, m2
34
-    paddw   m0, m1
35
-    paddw   m3, m2
36
-
37
+    paddw   m3, m1, m2
38
     lea     r0, [r0 + 4 * r1]
39
     lea     r2, [r2 + 4 * r3]
40
 
41
     movu    m1, [r2]
42
-    movu    m2, [r2 + r3]
43
+    movu    m2, [r2 + r3 * 2]
44
     psubw   m1, [r0]
45
-    psubw   m2, [r0 + r1]
46
+    psubw   m2, [r0 + r1 * 2]
47
     pabsw   m1, m1
48
     pabsw   m2, m2
49
-    paddw   m0, m1
50
-    paddw   m3, m2
51
-
52
-    movu    m1, [r2 + 2 * r3]
53
-    movu    m2, [r2 + r6]
54
-    psubw   m1, [r0 + 2 * r1]
55
-    psubw   m2, [r0 + r5]
56
-    pabsw   m1, m1
57
-    pabsw   m2, m2
58
-    paddw   m0, m1
59
-    paddw   m3, m2
60
-
61
-    lea     r0, [r0 + 4 * r1]
62
-    lea     r2, [r2 + 4 * r3]
63
-
64
-    dec    r4d
65
-    jg .loop
66
-
67
-    HADDUWD m0, m1
68
-    HADDUWD m3, m1
69
-    HADDD   m0, m1
70
-    HADDD   m3, m1
71
+    paddw   m1, m2
72
+    pmaddwd m3, m4
73
     paddd   m0, m3
74
+    pmaddwd m1, m4
75
+    paddd   m0, m1
76
+    lea     r0, [r0+4*r1]
77
+    lea     r2, [r2+4*r3]
78
+    dec     r4d
79
+    jg      .loop
80
 
81
+    HADDD   m0, m1
82
     movd    eax, xm0
83
     RET
84
 
85
 INIT_YMM avx2
86
-cglobal pixel_sad_32x8, 4,7,5
87
+cglobal pixel_sad_32x8, 4,7,7
88
     pxor    m0, m0
89
     mov     r4d, 8/4
90
+    mova    m6, [pw_1]
91
     add     r3d, r3d
92
     add     r1d, r1d
93
-    lea     r5,     [r1 * 3]
94
-    lea     r6,     [r3 * 3]
95
+    lea     r5d,     [r1 * 3]
96
+    lea     r6d,     [r3 * 3]
97
 .loop:
98
     movu    m1, [r2]
99
     movu    m2, [r2 + 32]
100
@@ -499,8 +472,7 @@
101
     pabsw   m4, m4
102
     paddw   m1, m2
103
     paddw   m3, m4
104
-    paddw   m0, m1
105
-    paddw   m0, m3
106
+    paddw   m5, m1, m3
107
 
108
     movu    m1, [r2 + 2 * r3]
109
     movu    m2, [r2 + 2 * r3 + 32]
110
@@ -518,24 +490,28 @@
111
     pabsw   m4, m4
112
     paddw   m1, m2
113
     paddw   m3, m4
114
-    paddw   m0, m1
115
-    paddw   m0, m3
116
+    paddw   m1, m3
117
 
118
+    pmaddwd m5, m6
119
+    paddd   m0, m5
120
+    pmaddwd m1, m6
121
+    paddd   m0, m1
122
     dec    r4d
123
     jg .loop
124
 
125
-    HADDW   m0, m1
126
+    HADDD   m0, m1
127
     movd    eax, xm0
128
     RET
129
 
130
 INIT_YMM avx2
131
-cglobal pixel_sad_32x16, 4,7,5
132
+cglobal pixel_sad_32x16, 4,7,7
133
     pxor    m0, m0
134
     mov     r4d, 16/8
135
+    mova    m6, [pw_1]
136
     add     r3d, r3d
137
     add     r1d, r1d
138
-    lea     r5,     [r1 * 3]
139
-    lea     r6,     [r3 * 3]
140
+    lea     r5d,     [r1 * 3]
141
+    lea     r6d,     [r3 * 3]
142
 .loop:
143
     movu    m1, [r2]
144
     movu    m2, [r2 + 32]
145
@@ -551,8 +527,7 @@
146
     pabsw   m4, m4
147
     paddw   m1, m2
148
     paddw   m3, m4
149
-    paddw   m0, m1
150
-    paddw   m0, m3
151
+    paddw   m5, m1, m3
152
 
153
     movu    m1, [r2 + 2 * r3]
154
     movu    m2, [r2 + 2 * r3 + 32]
155
@@ -570,8 +545,12 @@
156
     pabsw   m4, m4
157
     paddw   m1, m2
158
     paddw   m3, m4
159
-    paddw   m0, m1
160
-    paddw   m0, m3
161
+    paddw   m1, m3
162
+
163
+    pmaddwd m5, m6
164
+    paddd   m0, m5
165
+    pmaddwd m1, m6
166
+    paddd   m0, m1
167
 
168
     movu    m1, [r2]
169
     movu    m2, [r2 + 32]
170
@@ -587,8 +566,7 @@
171
     pabsw   m4, m4
172
     paddw   m1, m2
173
     paddw   m3, m4
174
-    paddw   m0, m1
175
-    paddw   m0, m3
176
+    paddw   m5, m1, m3
177
 
178
     movu    m1, [r2 + 2 * r3]
179
     movu    m2, [r2 + 2 * r3 + 32]
180
@@ -606,24 +584,28 @@
181
     pabsw   m4, m4
182
     paddw   m1, m2
183
     paddw   m3, m4
184
-    paddw   m0, m1
185
-    paddw   m0, m3
186
+    paddw   m1, m3
187
 
188
+    pmaddwd m5, m6
189
+    paddd   m0, m5
190
+    pmaddwd m1, m6
191
+    paddd   m0, m1
192
     dec    r4d
193
     jg .loop
194
 
195
-    HADDW   m0, m1
196
+    HADDD   m0, m1
197
     movd    eax, xm0
198
     RET
199
 
200
 INIT_YMM avx2
201
x265_1.8.tar.gz/source/common/x86/ssd-a.asm -> x265_1.9.tar.gz/source/common/x86/ssd-a.asm Changed
201
 
1
@@ -2,11 +2,13 @@
2
 ;* ssd-a.asm: x86 ssd functions
3
 ;*****************************************************************************
4
 ;* Copyright (C) 2003-2013 x264 project
5
+;* Copyright (C) 2013-2015 x265 project
6
 ;*
7
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8
 ;*          Fiona Glaser <fiona@x264.com>
9
 ;*          Laurent Aimar <fenrir@via.ecp.fr>
10
 ;*          Alex Izvorski <aizvorksi@gmail.com>
11
+;*          Min Chen <chenm003@163.com>
12
 ;*
13
 ;* This program is free software; you can redistribute it and/or modify
14
 ;* it under the terms of the GNU General Public License as published by
15
@@ -105,8 +107,32 @@
16
     dec    r4d
17
     jg .loop
18
 %endif
19
+%if BIT_DEPTH == 12 && %1 >= 16 && %2 >=16
20
+%if  mmsize == 16
21
+    movu            m5, m0
22
+    pxor            m6, m6
23
+    punpckldq       m0, m6
24
+    punpckhdq       m5, m6
25
+    paddq           m0, m5
26
+    movhlps         m5, m0
27
+    paddq           m0, m5
28
+    movq            r6, xm0
29
+%elif mmsize == 32
30
+    movu            m1, m0
31
+    pxor            m2, m2
32
+    punpckldq       m0, m2
33
+    punpckhdq       m1, m2
34
+    paddq           m0, m1
35
+    vextracti128    xm2, m0, 1
36
+    paddq           xm2, xm0
37
+    movhlps         xm1, xm2
38
+    paddq           xm2, xm1
39
+    movq            rax, xm2
40
+%endif
41
+%else 
42
     HADDD   m0, m5
43
-    movd   eax, xm0
44
+    movd    eax,xm0
45
+%endif
46
 %ifidn movu,movq ; detect MMX
47
     EMMS
48
 %endif
49
@@ -168,6 +194,154 @@
50
     movq        rax, m9
51
     RET
52
 %endmacro
53
+%macro SSD_ONE_SS_32 0
54
+cglobal pixel_ssd_ss_32x32, 4,5,8
55
+    add         r1d, r1d
56
+    add         r3d, r3d
57
+    pxor        m5, m5
58
+    pxor        m6, m6
59
+    mov         r4d, 2
60
+
61
+.iterate:
62
+    mov         r5d, 16
63
+    pxor        m4, m4
64
+    pxor        m7, m7
65
+.loop:
66
+    movu        m0, [r0]
67
+    movu        m1, [r0 + mmsize]
68
+    movu        m2, [r2]
69
+    movu        m3, [r2 + mmsize]
70
+    psubw       m0, m2
71
+    psubw       m1, m3
72
+    pmaddwd     m0, m0
73
+    pmaddwd     m1, m1
74
+    paddd       m4, m0
75
+    paddd       m7, m1
76
+    movu        m0, [r0 + 2 * mmsize]
77
+    movu        m1, [r0 + 3 * mmsize]
78
+    movu        m2, [r2 + 2 * mmsize]
79
+    movu        m3, [r2 + 3 * mmsize]
80
+    psubw       m0, m2
81
+    psubw       m1, m3
82
+    pmaddwd     m0, m0
83
+    pmaddwd     m1, m1
84
+    paddd       m4, m0
85
+    paddd       m7, m1
86
+
87
+    add         r0, r1
88
+    add         r2, r3
89
+
90
+    dec         r5d
91
+    jnz         .loop
92
+
93
+    mova        m0, m4
94
+    pxor        m1, m1
95
+    punpckldq   m0, m1
96
+    punpckhdq   m4, m1
97
+    paddq       m5, m0
98
+    paddq       m6, m4
99
+
100
+    mova        m0, m7
101
+    punpckldq   m0, m1
102
+    punpckhdq   m7, m1
103
+    paddq       m5, m0
104
+    paddq       m6, m7
105
+
106
+    dec         r4d
107
+    jnz         .iterate
108
+
109
+    paddq       m5, m6
110
+    movhlps     m2, m5
111
+    paddq       m5, m2
112
+    movq        rax, m5
113
+    RET
114
+%endmacro
115
+
116
+%macro SSD_ONE_SS_64 0
117
+cglobal pixel_ssd_ss_64x64, 4,6,8
118
+    add         r1d, r1d
119
+    add         r3d, r3d
120
+    pxor        m5, m5
121
+    pxor        m6, m6
122
+    mov         r5d, 8
123
+
124
+.iterate:
125
+    pxor        m4, m4
126
+    pxor        m7, m7
127
+    mov         r4d, 8
128
+
129
+.loop:
130
+    ;----process 1st half a row----
131
+    movu        m0, [r0]
132
+    movu        m1, [r0 + mmsize]
133
+    movu        m2, [r2]
134
+    movu        m3, [r2 + mmsize]
135
+    psubw       m0, m2
136
+    psubw       m1, m3
137
+    pmaddwd     m0, m0
138
+    pmaddwd     m1, m1
139
+    paddd       m4, m0
140
+    paddd       m7, m1
141
+    movu        m0, [r0 + 2 * mmsize]
142
+    movu        m1, [r0 + 3 * mmsize]
143
+    movu        m2, [r2 + 2 * mmsize]
144
+    movu        m3, [r2 + 3 * mmsize]
145
+    psubw       m0, m2
146
+    psubw       m1, m3
147
+    pmaddwd     m0, m0
148
+    pmaddwd     m1, m1
149
+    paddd       m4, m0
150
+    paddd       m7, m1
151
+    ;----process 2nd half a row----
152
+    movu        m0, [r0 + 4 * mmsize]
153
+    movu        m1, [r0 + 5 * mmsize]
154
+    movu        m2, [r2 + 4 * mmsize]
155
+    movu        m3, [r2 + 5 * mmsize]
156
+    psubw       m0, m2
157
+    psubw       m1, m3
158
+    pmaddwd     m0, m0
159
+    pmaddwd     m1, m1
160
+    paddd       m4, m0
161
+    paddd       m7, m1
162
+    movu        m0, [r0 + 6 * mmsize]
163
+    movu        m1, [r0 + 7 * mmsize]
164
+    movu        m2, [r2 + 6 * mmsize]
165
+    movu        m3, [r2 + 7 * mmsize]
166
+    psubw       m0, m2
167
+    psubw       m1, m3
168
+    pmaddwd     m0, m0
169
+    pmaddwd     m1, m1
170
+    paddd       m4, m0
171
+    paddd       m7, m1
172
+
173
+    add         r0, r1
174
+    add         r2, r3
175
+
176
+    dec         r4d
177
+    jnz         .loop
178
+
179
+    mova        m0, m4
180
+    pxor        m1, m1
181
+    punpckldq   m0, m1
182
+    punpckhdq   m4, m1
183
+    paddq       m5, m0
184
+    paddq       m6, m4
185
+
186
+    mova        m0, m7
187
+    punpckldq   m0, m1
188
+    punpckhdq   m7, m1
189
+    paddq       m5, m0
190
+    paddq       m6, m7
191
+
192
+    dec         r5
193
+    jne         .iterate
194
+
195
+    paddq       m5, m6
196
+    movhlps     m2, m5
197
+    paddq       m5, m2
198
+    movq        rax, m5
199
+    RET
200
+%endmacro
201
x265_1.8.tar.gz/source/common/x86/x86util.asm -> x265_1.9.tar.gz/source/common/x86/x86util.asm Changed
9
 
1
@@ -5,6 +5,7 @@
2
 ;*
3
 ;* Authors: Holger Lubitz <holger@lubitz.org>
4
 ;*          Loren Merritt <lorenm@u.washington.edu>
5
+;*          Min Chen <chenm003@163.com>
6
 ;*
7
 ;* This program is free software; you can redistribute it and/or modify
8
 ;* it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/common/yuv.cpp -> x265_1.9.tar.gz/source/common/yuv.cpp Changed
123
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2015 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -50,7 +51,7 @@
10
     {
11
         CHECKED_MALLOC(m_buf[0], pixel, size * size + 8);
12
         m_buf[1] = m_buf[2] = 0;
13
-        m_csize = MAX_INT;
14
+        m_csize = 0;
15
         return true;
16
     }
17
     else
18
@@ -82,22 +83,26 @@
19
 {
20
     pixel* dstY = dstPic.getLumaAddr(cuAddr, absPartIdx);
21
     primitives.cu[m_part].copy_pp(dstY, dstPic.m_stride, m_buf[0], m_size);
22
-
23
-    pixel* dstU = dstPic.getCbAddr(cuAddr, absPartIdx);
24
-    pixel* dstV = dstPic.getCrAddr(cuAddr, absPartIdx);
25
-    primitives.chroma[m_csp].cu[m_part].copy_pp(dstU, dstPic.m_strideC, m_buf[1], m_csize);
26
-    primitives.chroma[m_csp].cu[m_part].copy_pp(dstV, dstPic.m_strideC, m_buf[2], m_csize);
27
+    if (m_csp != X265_CSP_I400)
28
+    {
29
+        pixel* dstU = dstPic.getCbAddr(cuAddr, absPartIdx);
30
+        pixel* dstV = dstPic.getCrAddr(cuAddr, absPartIdx);
31
+        primitives.chroma[m_csp].cu[m_part].copy_pp(dstU, dstPic.m_strideC, m_buf[1], m_csize);
32
+        primitives.chroma[m_csp].cu[m_part].copy_pp(dstV, dstPic.m_strideC, m_buf[2], m_csize);
33
+    }
34
 }
35
 
36
 void Yuv::copyFromPicYuv(const PicYuv& srcPic, uint32_t cuAddr, uint32_t absPartIdx)
37
 {
38
     const pixel* srcY = srcPic.getLumaAddr(cuAddr, absPartIdx);
39
     primitives.cu[m_part].copy_pp(m_buf[0], m_size, srcY, srcPic.m_stride);
40
-
41
-    const pixel* srcU = srcPic.getCbAddr(cuAddr, absPartIdx);
42
-    const pixel* srcV = srcPic.getCrAddr(cuAddr, absPartIdx);
43
-    primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcU, srcPic.m_strideC);
44
-    primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcV, srcPic.m_strideC);
45
+    if (m_csp != X265_CSP_I400)
46
+    {
47
+        const pixel* srcU = srcPic.getCbAddr(cuAddr, absPartIdx);
48
+        const pixel* srcV = srcPic.getCrAddr(cuAddr, absPartIdx);
49
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcU, srcPic.m_strideC);
50
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcV, srcPic.m_strideC);
51
+    }
52
 }
53
 
54
 void Yuv::copyFromYuv(const Yuv& srcYuv)
55
@@ -105,8 +110,11 @@
56
     X265_CHECK(m_size >= srcYuv.m_size, "invalid size\n");
57
 
58
     primitives.cu[m_part].copy_pp(m_buf[0], m_size, srcYuv.m_buf[0], srcYuv.m_size);
59
-    primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcYuv.m_buf[1], srcYuv.m_csize);
60
-    primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcYuv.m_buf[2], srcYuv.m_csize);
61
+    if (m_csp != X265_CSP_I400)
62
+    {
63
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcYuv.m_buf[1], srcYuv.m_csize);
64
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcYuv.m_buf[2], srcYuv.m_csize);
65
+    }
66
 }
67
 
68
 /* This version is intended for use by ME, which required FENC_STRIDE for luma fenc pixels */
69
@@ -130,11 +138,13 @@
70
 {
71
     pixel* dstY = dstYuv.getLumaAddr(absPartIdx);
72
     primitives.cu[m_part].copy_pp(dstY, dstYuv.m_size, m_buf[0], m_size);
73
-
74
-    pixel* dstU = dstYuv.getCbAddr(absPartIdx);
75
-    pixel* dstV = dstYuv.getCrAddr(absPartIdx);
76
-    primitives.chroma[m_csp].cu[m_part].copy_pp(dstU, dstYuv.m_csize, m_buf[1], m_csize);
77
-    primitives.chroma[m_csp].cu[m_part].copy_pp(dstV, dstYuv.m_csize, m_buf[2], m_csize);
78
+    if (m_csp != X265_CSP_I400)
79
+    {
80
+        pixel* dstU = dstYuv.getCbAddr(absPartIdx);
81
+        pixel* dstV = dstYuv.getCrAddr(absPartIdx);
82
+        primitives.chroma[m_csp].cu[m_part].copy_pp(dstU, dstYuv.m_csize, m_buf[1], m_csize);
83
+        primitives.chroma[m_csp].cu[m_part].copy_pp(dstV, dstYuv.m_csize, m_buf[2], m_csize);
84
+    }
85
 }
86
 
87
 void Yuv::copyPartToYuv(Yuv& dstYuv, uint32_t absPartIdx) const
88
@@ -142,20 +152,25 @@
89
     pixel* srcY = m_buf[0] + getAddrOffset(absPartIdx, m_size);
90
     pixel* dstY = dstYuv.m_buf[0];
91
     primitives.cu[dstYuv.m_part].copy_pp(dstY, dstYuv.m_size, srcY, m_size);
92
-
93
-    pixel* srcU = m_buf[1] + getChromaAddrOffset(absPartIdx);
94
-    pixel* srcV = m_buf[2] + getChromaAddrOffset(absPartIdx);
95
-    pixel* dstU = dstYuv.m_buf[1];
96
-    pixel* dstV = dstYuv.m_buf[2];
97
-    primitives.chroma[m_csp].cu[dstYuv.m_part].copy_pp(dstU, dstYuv.m_csize, srcU, m_csize);
98
-    primitives.chroma[m_csp].cu[dstYuv.m_part].copy_pp(dstV, dstYuv.m_csize, srcV, m_csize);
99
+    if (m_csp != X265_CSP_I400)
100
+    {
101
+        pixel* srcU = m_buf[1] + getChromaAddrOffset(absPartIdx);
102
+        pixel* srcV = m_buf[2] + getChromaAddrOffset(absPartIdx);
103
+        pixel* dstU = dstYuv.m_buf[1];
104
+        pixel* dstV = dstYuv.m_buf[2];
105
+        primitives.chroma[m_csp].cu[dstYuv.m_part].copy_pp(dstU, dstYuv.m_csize, srcU, m_csize);
106
+        primitives.chroma[m_csp].cu[dstYuv.m_part].copy_pp(dstV, dstYuv.m_csize, srcV, m_csize);
107
+    }
108
 }
109
 
110
 void Yuv::addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL)
111
 {
112
     primitives.cu[log2SizeL - 2].add_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
113
-    primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
114
-    primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
115
+    if (m_csp != X265_CSP_I400)
116
+    {
117
+        primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
118
+        primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
119
+    }
120
 }
121
 
122
 void Yuv::addAvg(const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t absPartIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma)
123
x265_1.8.tar.gz/source/encoder/analysis.cpp -> x265_1.9.tar.gz/source/encoder/analysis.cpp Changed
201
 
1
@@ -3,6 +3,7 @@
2
 *
3
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
4
 *          Steve Borho <steve@borho.org>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -71,12 +72,11 @@
10
 
11
 Analysis::Analysis()
12
 {
13
-    m_reuseIntraDataCTU = NULL;
14
     m_reuseInterDataCTU = NULL;
15
     m_reuseRef = NULL;
16
     m_reuseBestMergeCand = NULL;
17
+    m_reuseMv = NULL;
18
 }
19
-
20
 bool Analysis::create(ThreadLocalData *tld)
21
 {
22
     m_tld = tld;
23
@@ -127,9 +127,6 @@
24
     m_frame = &frame;
25
 
26
 #if _DEBUG || CHECKED_BUILD
27
-    for (uint32_t i = 0; i <= g_maxCUDepth; i++)
28
-        for (uint32_t j = 0; j < MAX_PRED_TYPES; j++)
29
-            m_modeDepth[i].pred[j].invalidate();
30
     invalidateContexts(0);
31
 #endif
32
 
33
@@ -140,40 +137,46 @@
34
     m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);
35
 
36
     uint32_t numPartition = ctu.m_numPartitions;
37
-    if (m_param->analysisMode)
38
+    if (m_param->analysisMode && m_slice->m_sliceType != I_SLICE)
39
     {
40
-        if (m_slice->m_sliceType == I_SLICE)
41
-            m_reuseIntraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
42
-        else
43
-        {
44
-            int numPredDir = m_slice->isInterP() ? 1 : 2;
45
-            m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
46
-            m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
47
-            m_reuseBestMergeCand = &m_reuseInterDataCTU->bestMergeCand[ctu.m_cuAddr * CUGeom::MAX_GEOMS];
48
-        }
49
+        int numPredDir = m_slice->isInterP() ? 1 : 2;
50
+        m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
51
+        m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
52
+        m_reuseBestMergeCand = &m_reuseInterDataCTU->bestMergeCand[ctu.m_cuAddr * CUGeom::MAX_GEOMS];
53
+        m_reuseMv = &m_reuseInterDataCTU->mv[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
54
     }
55
-
56
     ProfileCUScope(ctu, totalCTUTime, totalCTUs);
57
 
58
-    uint32_t zOrder = 0;
59
     if (m_slice->m_sliceType == I_SLICE)
60
     {
61
-        compressIntraCU(ctu, cuGeom, zOrder, qp);
62
-        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.intraData)
63
+        analysis_intra_data* intraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
64
+        if (m_param->analysisMode == X265_ANALYSIS_LOAD)
65
+        {
66
+            memcpy(ctu.m_cuDepth, &intraDataCTU->depth[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
67
+            memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
68
+            memcpy(ctu.m_partSize, &intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], sizeof(char) * numPartition);
69
+            memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
70
+        }
71
+        compressIntraCU(ctu, cuGeom, qp);
72
+        if (m_param->analysisMode == X265_ANALYSIS_SAVE && intraDataCTU)
73
         {
74
             CUData* bestCU = &m_modeDepth[0].bestMode->cu;
75
-            memcpy(&m_reuseIntraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
76
-            memcpy(&m_reuseIntraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
77
-            memcpy(&m_reuseIntraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
78
-            memcpy(&m_reuseIntraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], bestCU->m_chromaIntraDir, sizeof(uint8_t) * numPartition);
79
+            memcpy(&intraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
80
+            memcpy(&intraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
81
+            memcpy(&intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
82
+            memcpy(&intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], bestCU->m_chromaIntraDir, sizeof(uint8_t) * numPartition);
83
         }
84
     }
85
     else
86
     {
87
-        if (!m_param->rdLevel)
88
+        if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
89
+            ctu.m_cuPelX / g_maxCUSize >= frame.m_encData->m_pir.pirStartCol
90
+            && ctu.m_cuPelX / g_maxCUSize < frame.m_encData->m_pir.pirEndCol)
91
+            compressIntraCU(ctu, cuGeom, qp);
92
+        else if (!m_param->rdLevel)
93
         {
94
             /* In RD Level 0/1, copy source pixels into the reconstructed block so
95
-            * they are available for intra predictions */
96
+             * they are available for intra predictions */
97
             m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);
98
 
99
             compressInterCU_rd0_4(ctu, cuGeom, qp);
100
@@ -187,6 +190,7 @@
101
             compressInterCU_rd0_4(ctu, cuGeom, qp);
102
         else
103
         {
104
+            uint32_t zOrder = 0;
105
             compressInterCU_rd5_6(ctu, cuGeom, zOrder, qp);
106
             if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.interData)
107
             {
108
@@ -212,8 +216,7 @@
109
         md.pred[PRED_LOSSLESS].initCosts();
110
         md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
111
         PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
112
-        uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir;
113
-        checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes, NULL);
114
+        checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size);
115
         checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
116
     }
117
     else
118
@@ -226,7 +229,7 @@
119
     }
120
 }
121
 
122
-void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t& zOrder, int32_t qp)
123
+void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
124
 {
125
     uint32_t depth = cuGeom.depth;
126
     ModeDepth& md = m_modeDepth[depth];
127
@@ -235,42 +238,37 @@
128
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
129
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
130
 
131
-    if (m_param->analysisMode == X265_ANALYSIS_LOAD)
132
-    {
133
-        uint8_t* reuseDepth  = &m_reuseIntraDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
134
-        uint8_t* reuseModes  = &m_reuseIntraDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
135
-        char* reusePartSizes = &m_reuseIntraDataCTU->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
136
-        uint8_t* reuseChromaModes = &m_reuseIntraDataCTU->chromaModes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
137
+    bool bAlreadyDecided = parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX;
138
+    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
139
 
140
-        if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.absPartIdx)
141
+    if (bAlreadyDecided)
142
+    {
143
+        if (bDecidedDepth)
144
         {
145
-            PartSize size = (PartSize)reusePartSizes[zOrder];
146
-            Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
147
+            Mode& mode = md.pred[0];
148
+            md.bestMode = &mode;
149
             mode.cu.initSubCU(parentCTU, cuGeom, qp);
150
-            checkIntra(mode, cuGeom, size, &reuseModes[zOrder], &reuseChromaModes[zOrder]);
151
-            checkBestMode(mode, depth);
152
+            memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
153
+            memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
154
+            checkIntra(mode, cuGeom, (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx]);
155
 
156
             if (m_bTryLossless)
157
                 tryLossless(cuGeom);
158
 
159
             if (mightSplit)
160
                 addSplitFlagCost(*md.bestMode, cuGeom.depth);
161
-
162
-            // increment zOrder offset to point to next best depth in sharedDepth buffer
163
-            zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]];
164
-            mightSplit = false;
165
         }
166
     }
167
-    else if (mightNotSplit)
168
+    else if (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && mightNotSplit)
169
     {
170
         md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
171
-        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL, NULL);
172
+        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
173
         checkBestMode(md.pred[PRED_INTRA], depth);
174
 
175
         if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
176
         {
177
             md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
178
-            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL, NULL);
179
+            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
180
             checkBestMode(md.pred[PRED_INTRA_NxN], depth);
181
         }
182
 
183
@@ -281,6 +279,9 @@
184
             addSplitFlagCost(*md.bestMode, cuGeom.depth);
185
     }
186
 
187
+    // stop recursion if we reach the depth of previous analysis decision
188
+    mightSplit &= !(bAlreadyDecided && bDecidedDepth);
189
+
190
     if (mightSplit)
191
     {
192
         Mode* splitPred = &md.pred[PRED_SPLIT];
193
@@ -305,7 +306,7 @@
194
                 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
195
                     nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
196
 
197
-                compressIntraCU(parentCTU, childGeom, zOrder, nextQP);
198
+                compressIntraCU(parentCTU, childGeom, nextQP);
199
 
200
                 // Save best CU and pred data for this sub CU
201
x265_1.8.tar.gz/source/encoder/analysis.h -> x265_1.9.tar.gz/source/encoder/analysis.h Changed
73
 
1
@@ -3,6 +3,7 @@
2
 *
3
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
4
 *          Steve Borho <steve@borho.org>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -40,6 +41,21 @@
10
 
11
 class Entropy;
12
 
13
+struct SplitData
14
+{
15
+    uint32_t splitRefs;
16
+    uint32_t mvCost[2];
17
+    uint64_t sa8dCost;
18
+
19
+    void initSplitCUData()
20
+    {
21
+        splitRefs = 0;
22
+        mvCost[0] = 0; // L0
23
+        mvCost[1] = 0; // L1
24
+        sa8dCost    = 0;
25
+    }
26
+};
27
+
28
 class Analysis : public Search
29
 {
30
 public:
31
@@ -101,20 +117,20 @@
32
     Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
33
 
34
 protected:
35
-
36
     /* Analysis data for load/save modes, keeps getting incremented as CTU analysis proceeds and data is consumed or read */
37
-    analysis_intra_data* m_reuseIntraDataCTU;
38
     analysis_inter_data* m_reuseInterDataCTU;
39
+    MV*                  m_reuseMv;
40
     int32_t*             m_reuseRef;
41
     uint32_t*            m_reuseBestMergeCand;
42
+    uint32_t m_splitRefIdx[4];
43
 
44
     /* full analysis for an I-slice CU */
45
-    void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
46
+    void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
47
 
48
     /* full analysis for a P or B slice CU */
49
-    void compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
50
-    uint32_t compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
51
-    uint32_t compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
52
+    uint32_t compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
53
+    SplitData compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
54
+    SplitData compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
55
 
56
     /* measure merge and skip */
57
     void checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom);
58
@@ -139,13 +155,11 @@
59
     /* generate residual and recon pixels for an entire CTU recursively (RD0) */
60
     void encodeResidue(const CUData& parentCTU, const CUGeom& cuGeom);
61
 
62
-    int calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom);
63
+    int calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, double baseQP = -1);
64
 
65
     /* check whether current mode is the new best */
66
     inline void checkBestMode(Mode& mode, uint32_t depth)
67
     {
68
-        X265_CHECK(mode.ok(), "mode costs are uninitialized\n");
69
-
70
         ModeDepth& md = m_modeDepth[depth];
71
         if (md.bestMode)
72
         {
73
x265_1.8.tar.gz/source/encoder/api.cpp -> x265_1.9.tar.gz/source/encoder/api.cpp Changed
45
 
1
@@ -72,9 +72,7 @@
2
 #endif
3
 
4
 #if HIGH_BIT_DEPTH
5
-    if (X265_DEPTH == 12)
6
-        x265_log(p, X265_LOG_WARNING, "Main12 is HIGHLY experimental, do not use!\n");
7
-    else if (X265_DEPTH != 10 && X265_DEPTH != 12)
8
+    if (X265_DEPTH != 10 && X265_DEPTH != 12)
9
 #else
10
     if (X265_DEPTH != 8)
11
 #endif
12
@@ -247,6 +245,16 @@
13
     }
14
 }
15
 
16
+int x265_encoder_intra_refresh(x265_encoder *enc)
17
+{
18
+    if (!enc)
19
+        return -1;
20
+
21
+    Encoder *encoder = static_cast<Encoder*>(enc);
22
+    encoder->m_bQueuedIntraRefresh = 1;
23
+    return 0;
24
+}
25
+
26
 void x265_cleanup(void)
27
 {
28
     if (!g_ctuSizeConfigured)
29
@@ -268,6 +276,7 @@
30
     pic->bitDepth = param->internalBitDepth;
31
     pic->colorSpace = param->internalCsp;
32
     pic->forceqp = X265_QP_AUTO;
33
+    pic->quantOffsets = NULL;
34
     if (param->analysisMode)
35
     {
36
         uint32_t widthInCU       = (param->sourceWidth  + g_maxCUSize - 1) >> g_maxLog2CUSize;
37
@@ -318,6 +327,7 @@
38
     &x265_cleanup,
39
 
40
     sizeof(x265_frame_stats),
41
+    &x265_encoder_intra_refresh,
42
 };
43
 
44
 typedef const x265_api* (*api_get_func)(int bitDepth);
45
x265_1.8.tar.gz/source/encoder/bitcost.cpp -> x265_1.9.tar.gz/source/encoder/bitcost.cpp Changed
64
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -40,7 +41,13 @@
10
             x265_emms(); // just to be safe
11
 
12
             CalculateLogs();
13
-            s_costs[qp] = new uint16_t[4 * BC_MAX_MV + 1] + 2 * BC_MAX_MV;
14
+            uint16_t *costs = X265_MALLOC(uint16_t, 4 * BC_MAX_MV + 1); // test the raw pointer: NULL + offset is never NULL
15
+            if (!costs)
16
+            {
17
+                x265_log(NULL, X265_LOG_ERROR, "BitCost s_costs buffer allocation failure\n");
18
+                return;
19
+            }
20
+            s_costs[qp] = costs + 2 * BC_MAX_MV; // point at the table centre so negative MVDs index backwards
21
             double lambda = x265_lambda_tab[qp];
22
 
23
             // estimate same cost for negative and positive MVD
24
@@ -66,11 +73,17 @@
25
 {
26
     if (!s_bitsizes)
27
     {
28
-        s_bitsizes = new float[2 * BC_MAX_MV + 1];
29
+        float *bitsizes = X265_MALLOC(float, 4 * BC_MAX_MV + 1); // test the raw pointer before applying the centre offset
30
+        if (!bitsizes)
31
+        {
32
+            x265_log(NULL, X265_LOG_ERROR, "BitCost s_bitsizes buffer allocation failure\n");
33
+            return;
34
+        }
35
+        s_bitsizes = bitsizes + 2 * BC_MAX_MV; // centred table: valid indices are -2*BC_MAX_MV .. +2*BC_MAX_MV
36
         s_bitsizes[0] = 0.718f;
37
         float log2_2 = 2.0f / log(2.0f);  // 2 x 1/log(2)
38
         for (int i = 1; i <= 2 * BC_MAX_MV; i++)
39
-            s_bitsizes[i] = log((float)(i + 1)) * log2_2 + 1.718f;
40
+            s_bitsizes[i] = s_bitsizes[-i] = log((float)(i + 1)) * log2_2 + 1.718f;
41
     }
42
 }
43
 
44
@@ -80,12 +93,15 @@
45
     {
46
         if (s_costs[i])
47
         {
48
-            delete [] (s_costs[i] - 2 * BC_MAX_MV);
49
+            X265_FREE(s_costs[i] - 2 * BC_MAX_MV);
50
 
51
-            s_costs[i] = 0;
52
+            s_costs[i] = NULL;
53
         }
54
     }
55
 
56
-    delete [] s_bitsizes;
57
-    s_bitsizes = 0;
58
+    if (s_bitsizes)
59
+    {
60
+        X265_FREE(s_bitsizes - 2 * BC_MAX_MV);
61
+        s_bitsizes = NULL;
62
+    }
63
 }
64
x265_1.8.tar.gz/source/encoder/bitcost.h -> x265_1.9.tar.gz/source/encoder/bitcost.h Changed
20
 
1
@@ -47,14 +47,14 @@
2
     // return bit cost of motion vector difference, without lambda
3
     inline uint32_t bitcost(const MV& mv) const
4
     {
5
-        return (uint32_t)(s_bitsizes[abs(mv.x - m_mvp.x)] +
6
-                          s_bitsizes[abs(mv.y - m_mvp.y)] + 0.5f);
7
+        return (uint32_t)(s_bitsizes[mv.x - m_mvp.x] +
8
+                          s_bitsizes[mv.y - m_mvp.y] + 0.5f);
9
     }
10
 
11
     static inline uint32_t bitcost(const MV& mv, const MV& mvp)
12
     {
13
-        return (uint32_t)(s_bitsizes[abs(mv.x - mvp.x)] +
14
-                          s_bitsizes[abs(mv.y - mvp.y)] + 0.5f);
15
+        return (uint32_t)(s_bitsizes[mv.x - mvp.x] +
16
+                          s_bitsizes[mv.y - mvp.y] + 0.5f);
17
     }
18
 
19
     static void destroy();
20
x265_1.8.tar.gz/source/encoder/dpb.cpp -> x265_1.9.tar.gz/source/encoder/dpb.cpp Changed
56
 
1
@@ -47,16 +47,16 @@
2
         delete curFrame;
3
     }
4
 
5
-    while (m_picSymFreeList)
6
+    while (m_frameDataFreeList)
7
     {
8
-        FrameData* next = m_picSymFreeList->m_freeListNext;
9
-        m_picSymFreeList->destroy();
10
+        FrameData* next = m_frameDataFreeList->m_freeListNext;
11
+        m_frameDataFreeList->destroy();
12
 
13
-        m_picSymFreeList->m_reconPic->destroy();
14
-        delete m_picSymFreeList->m_reconPic;
15
+        m_frameDataFreeList->m_reconPic->destroy();
16
+        delete m_frameDataFreeList->m_reconPic;
17
 
18
-        delete m_picSymFreeList;
19
-        m_picSymFreeList = next;
20
+        delete m_frameDataFreeList;
21
+        m_frameDataFreeList = next;
22
     }
23
 }
24
 
25
@@ -74,13 +74,19 @@
26
             curFrame->m_reconRowCount.set(0);
27
             curFrame->m_bChromaExtended = false;
28
 
29
+            // Reset the column counters (the loop below clears m_numRows entries of m_reconColCount)
30
+            X265_CHECK(curFrame->m_reconColCount != NULL, "curFrame->m_reconColCount check failure");
31
+            X265_CHECK(curFrame->m_numRows > 0, "curFrame->m_numRows check failure");
32
+            for(int32_t col = 0; col < curFrame->m_numRows; col++)
33
+                curFrame->m_reconColCount[col].set(0);
34
+
35
             // iterator is invalidated by remove, restart scan
36
             m_picList.remove(*curFrame);
37
             iterFrame = m_picList.first();
38
 
39
             m_freeList.pushBack(*curFrame);
40
-            curFrame->m_encData->m_freeListNext = m_picSymFreeList;
41
-            m_picSymFreeList = curFrame->m_encData;
42
+            curFrame->m_encData->m_freeListNext = m_frameDataFreeList;
43
+            m_frameDataFreeList = curFrame->m_encData;
44
             curFrame->m_encData = NULL;
45
             curFrame->m_reconPic = NULL;
46
         }
47
@@ -171,7 +177,7 @@
48
     {
49
         for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
50
         {
51
-            Frame *refpic = slice->m_refPicList[l][ref];
52
+            Frame *refpic = slice->m_refFrameList[l][ref];
53
             ATOMIC_INC(&refpic->m_countRefEncoders);
54
         }
55
     }
56
x265_1.8.tar.gz/source/encoder/dpb.h -> x265_1.9.tar.gz/source/encoder/dpb.h Changed
18
 
1
@@ -46,14 +46,14 @@
2
     bool               m_bTemporalSublayer;
3
     PicList            m_picList;
4
     PicList            m_freeList;
5
-    FrameData*         m_picSymFreeList;
6
+    FrameData*         m_frameDataFreeList;
7
 
8
     DPB(x265_param *param)
9
     {
10
         m_lastIDR = 0;
11
         m_pocCRA = 0;
12
         m_bRefreshPending = false;
13
-        m_picSymFreeList = NULL;
14
+        m_frameDataFreeList = NULL;
15
         m_maxRefL0 = param->maxNumReferences;
16
         m_maxRefL1 = param->bBPyramid ? 2 : 1;
17
         m_bOpenGOP = param->bOpenGOP;
18
x265_1.8.tar.gz/source/encoder/encoder.cpp -> x265_1.9.tar.gz/source/encoder/encoder.cpp Changed
201
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -39,6 +40,10 @@
10
 
11
 #include "x265.h"
12
 
13
+#if _MSC_VER
14
+#pragma warning(disable: 4996) // POSIX functions are just fine, thanks
15
+#endif
16
+
17
 namespace X265_NS {
18
 const char g_sliceTypeToChar[] = {'B', 'P', 'I'};
19
 }
20
@@ -66,12 +71,9 @@
21
     m_outputCount = 0;
22
     m_param = NULL;
23
     m_latestParam = NULL;
24
-    m_cuOffsetY = NULL;
25
-    m_cuOffsetC = NULL;
26
-    m_buOffsetY = NULL;
27
-    m_buOffsetC = NULL;
28
     m_threadPool = NULL;
29
     m_analysisFile = NULL;
30
+    m_offsetEmergency = NULL;
31
     for (int i = 0; i < X265_MAX_FRAME_THREADS; i++)
32
         m_frameEncoder[i] = NULL;
33
 
34
@@ -191,6 +193,7 @@
35
     {
36
         x265_log(m_param, X265_LOG_ERROR, "Unable to allocate scaling list arrays\n");
37
         m_aborted = true;
38
+        return;
39
     }
40
     else if (!m_param->scalingLists || !strcmp(m_param->scalingLists, "off"))
41
         m_scalingList.m_bEnabled = false;
42
@@ -198,7 +201,6 @@
43
         m_scalingList.setDefaultScalingList();
44
     else if (m_scalingList.parseScalingList(m_param->scalingLists))
45
         m_aborted = true;
46
-    m_scalingList.setupQuantMatrices();
47
 
48
     m_lookahead = new Lookahead(m_param, m_threadPool);
49
     if (m_numPools)
50
@@ -213,6 +215,82 @@
51
     initVPS(&m_vps);
52
     initSPS(&m_sps);
53
     initPPS(&m_pps);
54
+   
55
+    if (m_param->rc.vbvBufferSize)
56
+    {
57
+        m_offsetEmergency = (uint16_t(*)[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS])X265_MALLOC(uint16_t, MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS * (QP_MAX_MAX - QP_MAX_SPEC));
58
+        if (!m_offsetEmergency)
59
+        {
60
+            x265_log(m_param, X265_LOG_ERROR, "Unable to allocate memory\n");
61
+            m_aborted = true;
62
+            return;
63
+        }
64
+
65
+        bool scalingEnabled = m_scalingList.m_bEnabled;
66
+        if (!scalingEnabled)
67
+        {
68
+            m_scalingList.setDefaultScalingList();
69
+            m_scalingList.setupQuantMatrices();
70
+        }
71
+        else
72
+            m_scalingList.setupQuantMatrices();
73
+
74
+        for (int q = 0; q < QP_MAX_MAX - QP_MAX_SPEC; q++)
75
+        {
76
+            for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
77
+            {
78
+                uint16_t *nrOffset = m_offsetEmergency[q][cat];
79
+
80
+                int trSize = cat & 3;
81
+
82
+                int coefCount = 1 << ((trSize + 2) * 2);
83
+
84
+                /* Denoise chroma first then luma, then DC. */
85
+                int dcThreshold = (QP_MAX_MAX - QP_MAX_SPEC) * 2 / 3;
86
+                int lumaThreshold = (QP_MAX_MAX - QP_MAX_SPEC) * 2 / 3;
87
+                int chromaThreshold = 0;
88
+
89
+                int thresh = (cat < 4 || (cat >= 8 && cat < 12)) ? lumaThreshold : chromaThreshold;
90
+
91
+                double quantF = (double)(1ULL << (q / 6 + 16 + 8));
92
+
93
+                for (int i = 0; i < coefCount; i++)
94
+                {
95
+                    /* True "emergency mode": remove all DCT coefficients */
96
+                    if (q == QP_MAX_MAX - QP_MAX_SPEC - 1)
97
+                    {
98
+                        nrOffset[i] = INT16_MAX;
99
+                        continue;
100
+                    }
101
+
102
+                    int iThresh = i == 0 ? dcThreshold : thresh;
103
+                    if (q < iThresh)
104
+                    {
105
+                        nrOffset[i] = 0;
106
+                        continue;
107
+                    }
108
+
109
+                    int numList = (cat >= 8) * 3 + ((int)!iThresh);
110
+
111
+                    double pos = (double)(q - iThresh + 1) / (QP_MAX_MAX - QP_MAX_SPEC - iThresh);
112
+                    double start = quantF / (m_scalingList.m_quantCoef[trSize][numList][QP_MAX_SPEC % 6][i]);
113
+
114
+                    // Formula chosen as an exponential scale to vaguely mimic the effects of a higher quantizer.
115
+                    double bias = (pow(2, pos * (QP_MAX_MAX - QP_MAX_SPEC)) * 0.003 - 0.003) * start;
116
+                    nrOffset[i] = (uint16_t)X265_MIN(bias + 0.5, INT16_MAX);
117
+                }
118
+            }
119
+        }
120
+
121
+        if (!scalingEnabled)
122
+        {
123
+            m_scalingList.m_bEnabled = false;
124
+            m_scalingList.m_bDataPresent = false;
125
+            m_scalingList.setupQuantMatrices();
126
+        }
127
+    }
128
+    else
129
+        m_scalingList.setupQuantMatrices();
130
 
131
     int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
132
     int numCols = (m_param->sourceWidth  + g_maxCUSize - 1) / g_maxCUSize;
133
@@ -259,6 +337,8 @@
134
     m_encodeStartTime = x265_mdate();
135
 
136
     m_nalList.m_annexB = !!m_param->bAnnexB;
137
+
138
+    m_emitCLLSEI = p->maxCLL || p->maxFALL;
139
 }
140
 
141
 void Encoder::stopJobs()
142
@@ -318,10 +398,7 @@
143
         delete m_rateControl;
144
     }
145
 
146
-    X265_FREE(m_cuOffsetY);
147
-    X265_FREE(m_cuOffsetC);
148
-    X265_FREE(m_buOffsetY);
149
-    X265_FREE(m_buOffsetC);
150
+    X265_FREE(m_offsetEmergency);
151
 
152
     if (m_analysisFile)
153
         fclose(m_analysisFile);
154
@@ -335,7 +412,6 @@
155
         free((char*)m_param->scalingLists);
156
         free((char*)m_param->numaPools);
157
         free((char*)m_param->masteringDisplayColorVolume);
158
-        free((char*)m_param->contentLightLevelInfo);
159
 
160
         PARAM_NS::x265_param_free(m_param);
161
     }
162
@@ -361,6 +437,45 @@
163
     }
164
 }
165
 
166
+void Encoder::calcRefreshInterval(Frame* frameEnc)
167
+{
168
+    Slice* slice = frameEnc->m_encData->m_slice;
169
+    uint32_t numBlocksInRow = slice->m_sps->numCuInWidth;
170
+    FrameData::PeriodicIR* pir = &frameEnc->m_encData->m_pir;
171
+    if (slice->m_sliceType == I_SLICE)
172
+    {
173
+        pir->framesSinceLastPir = 0;
174
+        m_bQueuedIntraRefresh = 0;
175
+        /* PIR is currently only supported with ref == 1, so any intra frame effectively refreshes
176
+         * the whole frame and counts as an intra refresh. */
177
+        pir->pirEndCol = numBlocksInRow;
178
+    }
179
+    else if (slice->m_sliceType == P_SLICE)
180
+    {
181
+        Frame* ref = frameEnc->m_encData->m_slice->m_refFrameList[0][0];
182
+        int pocdiff = frameEnc->m_poc - ref->m_poc;
183
+        int numPFramesInGOP = m_param->keyframeMax / pocdiff;
184
+        int increment = (numBlocksInRow + numPFramesInGOP - 1) / numPFramesInGOP;
185
+        pir->pirEndCol = ref->m_encData->m_pir.pirEndCol;
186
+        pir->framesSinceLastPir = ref->m_encData->m_pir.framesSinceLastPir + pocdiff;
187
+        if (pir->framesSinceLastPir >= m_param->keyframeMax ||
188
+            (m_bQueuedIntraRefresh && pir->pirEndCol >= numBlocksInRow))
189
+        {
190
+            pir->pirEndCol = 0;
191
+            pir->framesSinceLastPir = 0;
192
+            m_bQueuedIntraRefresh = 0;
193
+            frameEnc->m_lowres.bKeyframe = 1;
194
+        }
195
+        pir->pirStartCol = pir->pirEndCol;
196
+        pir->pirEndCol += increment;
197
+        /* If our intra refresh has reached the right side of the frame, we're done. */
198
+        if (pir->pirEndCol >= numBlocksInRow)
199
+        {
200
+            pir->pirEndCol = numBlocksInRow;
201
x265_1.8.tar.gz/source/encoder/encoder.h -> x265_1.9.tar.gz/source/encoder/encoder.h Changed
126
 
1
@@ -45,8 +45,10 @@
2
     double        m_psnrSumV;
3
     double        m_globalSsim;
4
     double        m_totalQp;
5
+    double        m_maxFALL;
6
     uint64_t      m_accBits;
7
     uint32_t      m_numPics;
8
+    uint16_t      m_maxCLL;
9
 
10
     EncStats()
11
     {
12
@@ -54,6 +56,8 @@
13
         m_accBits = 0;
14
         m_numPics = 0;
15
         m_totalQp = 0;
16
+        m_maxCLL = 0;
17
+        m_maxFALL = 0;
18
     }
19
 
20
     void addQP(double aveQp);
21
@@ -75,64 +79,62 @@
22
 {
23
 public:
24
 
25
-    int                m_pocLast;         // time index (POC)
26
-    int                m_encodedFrameNum;
27
-    int                m_outputCount;
28
+    uint32_t           m_residualSumEmergency[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
29
+    uint32_t           m_countEmergency[MAX_NUM_TR_CATEGORIES];
30
+    uint16_t           (*m_offsetEmergency)[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
31
 
32
-    int                m_bframeDelay;
33
     int64_t            m_firstPts;
34
     int64_t            m_bframeDelayTime;
35
     int64_t            m_prevReorderedPts[2];
36
+    int64_t            m_encodeStartTime;
37
 
38
-    ThreadPool*        m_threadPool;
39
-    FrameEncoder*      m_frameEncoder[X265_MAX_FRAME_THREADS];
40
-    DPB*               m_dpb;
41
-
42
-    Frame*             m_exportedPic;
43
-
44
+    int                m_pocLast;         // time index (POC)
45
+    int                m_encodedFrameNum;
46
+    int                m_outputCount;
47
+    int                m_bframeDelay;
48
     int                m_numPools;
49
     int                m_curEncoder;
50
 
51
-    /* cached PicYuv offset arrays, shared by all instances of
52
-     * PicYuv created by this encoder */
53
-    intptr_t*          m_cuOffsetY;
54
-    intptr_t*          m_cuOffsetC;
55
-    intptr_t*          m_buOffsetY;
56
-    intptr_t*          m_buOffsetC;
57
-
58
-    /* Collect statistics globally */
59
-    EncStats           m_analyzeAll;
60
-    EncStats           m_analyzeI;
61
-    EncStats           m_analyzeP;
62
-    EncStats           m_analyzeB;
63
-    int64_t            m_encodeStartTime;
64
-
65
     // weighted prediction
66
     int                m_numLumaWPFrames;    // number of P frames with weighted luma reference
67
     int                m_numChromaWPFrames;  // number of P frames with weighted chroma reference
68
     int                m_numLumaWPBiFrames;  // number of B frames with weighted luma reference
69
     int                m_numChromaWPBiFrames; // number of B frames with weighted chroma reference
70
-    FILE*              m_analysisFile;
71
     int                m_conformanceMode;
72
-    VPS                m_vps;
73
-    SPS                m_sps;
74
-    PPS                m_pps;
75
-    NALList            m_nalList;
76
-    ScalingList        m_scalingList;      // quantization matrix information
77
-
78
     int                m_lastBPSEI;
79
     uint32_t           m_numDelayedPic;
80
 
81
+    ThreadPool*        m_threadPool;
82
+    FrameEncoder*      m_frameEncoder[X265_MAX_FRAME_THREADS];
83
+    DPB*               m_dpb;
84
+    Frame*             m_exportedPic;
85
+    FILE*              m_analysisFile;
86
     x265_param*        m_param;
87
     x265_param*        m_latestParam;
88
     RateControl*       m_rateControl;
89
     Lookahead*         m_lookahead;
90
+
91
+    /* Collect statistics globally */
92
+    EncStats           m_analyzeAll;
93
+    EncStats           m_analyzeI;
94
+    EncStats           m_analyzeP;
95
+    EncStats           m_analyzeB;
96
+    VPS                m_vps;
97
+    SPS                m_sps;
98
+    PPS                m_pps;
99
+    NALList            m_nalList;
100
+    ScalingList        m_scalingList;      // quantization matrix information
101
     Window             m_conformanceWindow;
102
 
103
+    bool               m_emitCLLSEI;
104
     bool               m_bZeroLatency;     // x265_encoder_encode() returns NALs for the input picture, zero lag
105
     bool               m_aborted;          // fatal error detected
106
     bool               m_reconfigured;      // reconfigure of encoder detected
107
 
108
+    /* Begin an intra refresh when none is in progress, or else begin one as soon as the current
109
+     * one is done. Requires bIntraRefresh to be set. */
110
+    int                m_bQueuedIntraRefresh;
111
+
112
     Encoder();
113
     ~Encoder() {}
114
 
115
@@ -164,7 +166,9 @@
116
 
117
     void writeAnalysisFile(x265_analysis_data* pic);
118
 
119
-    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, uint64_t bits, x265_frame_stats* frameStats);
120
+    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, x265_frame_stats* frameStats, int inPoc);
121
+
122
+    void calcRefreshInterval(Frame* frameEnc);
123
 
124
 protected:
125
 
126
x265_1.8.tar.gz/source/encoder/entropy.cpp -> x265_1.9.tar.gz/source/encoder/entropy.cpp Changed
201
 
1
@@ -2,6 +2,7 @@
2
 * Copyright (C) 2013 x265 project
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -429,7 +430,8 @@
10
     if (slice.m_sps->bUseSAO)
11
     {
12
         WRITE_FLAG(saoParam->bSaoFlag[0], "slice_sao_luma_flag");
13
-        WRITE_FLAG(saoParam->bSaoFlag[1], "slice_sao_chroma_flag");
14
+        if (encData.m_param->internalCsp != X265_CSP_I400)
15
+            WRITE_FLAG(saoParam->bSaoFlag[1], "slice_sao_chroma_flag");
16
     }
17
 
18
     // check if numRefIdx match the defaults (1, hard-coded in PPS). If not, override
19
@@ -828,6 +830,79 @@
20
     }
21
 }
22
 
23
+void Entropy::encodeTransformLuma(const CUData& cu, uint32_t absPartIdx, uint32_t curDepth, uint32_t log2CurSize,
24
+                              bool& bCodeDQP, const uint32_t depthRange[2]) // luma-only transform tree coder: recurses over the TU quadtree and codes only luma CBF/coefficients
25
+{
26
+    const bool subdiv = cu.m_tuDepth[absPartIdx] > curDepth; // the stored TU depth is deeper than this recursion level
27
+
28
+    /* in each of these conditions, the subdiv flag is implied and not signaled,
29
+     * so we have checks to make sure the implied value matches our intentions */
30
+    if (cu.isIntra(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N && log2CurSize == MIN_LOG2_CU_SIZE)
31
+    {
32
+        X265_CHECK(subdiv, "intra NxN requires TU depth below CU depth\n");
33
+    }
34
+    else if (cu.isInter(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N &&
35
+             !curDepth && cu.m_slice->m_sps->quadtreeTUMaxDepthInter == 1)
36
+    {
37
+        X265_CHECK(subdiv, "inter TU must be smaller than CU when not 2Nx2N part size: log2CurSize %d, depthRange[0] %d\n", log2CurSize, depthRange[0]);
38
+    }
39
+    else if (log2CurSize > depthRange[1])
40
+    {
41
+        X265_CHECK(subdiv, "TU is larger than the max allowed, it should have been split\n");
42
+    }
43
+    else if (log2CurSize == cu.m_slice->m_sps->quadtreeTULog2MinSize || log2CurSize == depthRange[0])
44
+    {
45
+        X265_CHECK(!subdiv, "min sized TU cannot be subdivided\n");
46
+    }
47
+    else
48
+    {
49
+        X265_CHECK(log2CurSize > depthRange[0], "transform size failure\n");
50
+        codeTransformSubdivFlag(subdiv, 5 - log2CurSize); // the only branch in which the subdiv flag is actually written
51
+    }
52
+
53
+    if (subdiv)
54
+    {
55
+        --log2CurSize; // each child TU is half the size ...
56
+        ++curDepth;    // ... and one level deeper
57
+
58
+        uint32_t qNumParts = 1 << (log2CurSize - LOG2_UNIT_SIZE) * 2; // partition-index stride between the four child TUs
59
+
60
+        encodeTransformLuma(cu, absPartIdx + 0 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange);
61
+        encodeTransformLuma(cu, absPartIdx + 1 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange);
62
+        encodeTransformLuma(cu, absPartIdx + 2 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange);
63
+        encodeTransformLuma(cu, absPartIdx + 3 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange);
64
+        return;
65
+    }
66
+
67
+    if (!cu.isIntra(absPartIdx) && !curDepth) // non-intra CU at depth 0: no luma CBF is coded, it is only checked
68
+    {
69
+        X265_CHECK(cu.getCbf(absPartIdx, TEXT_LUMA, 0), "CBF should have been set\n");
70
+    }
71
+    else
72
+        codeQtCbfLuma(cu, absPartIdx, curDepth);
73
+
74
+    uint32_t cbfY = cu.getCbf(absPartIdx, TEXT_LUMA, curDepth);
75
+
76
+    if (!cbfY)
77
+        return; // no luma coefficients: neither dQP nor residual is coded for this TU
78
+
79
+    // dQP: coded at most once while bCodeDQP is set; the flag is cleared right after coding
80
+    if (cu.m_slice->m_pps->bUseDQP && bCodeDQP)
81
+    {
82
+        uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx];
83
+        uint32_t absPartIdxLT = absPartIdx & (0xFF << (log2CUSize - LOG2_UNIT_SIZE) * 2); // clear the low index bits; presumably the CU's first (left-top) partition - confirm
84
+        codeDeltaQP(cu, absPartIdxLT);
85
+        bCodeDQP = false;
86
+    }
87
+
88
+    if (cbfY) // always true here: the !cbfY case returned above
89
+    {
90
+        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2); // offset of this TU inside the luma coefficient buffer m_trCoeff[0]
91
+        codeCoeffNxN(cu, cu.m_trCoeff[0] + coeffOffset, absPartIdx, log2CurSize, TEXT_LUMA);
92
+    }
93
+}
94
+
95
+
96
 void Entropy::codePredInfo(const CUData& cu, uint32_t absPartIdx)
97
 {
98
     if (cu.isIntra(absPartIdx)) // If it is intra mode, encode intra prediction mode.
99
@@ -908,7 +983,10 @@
100
     }
101
 
102
     uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx];
103
-    encodeTransform(cu, absPartIdx, 0, log2CUSize, bCodeDQP, depthRange);
104
+    if (cu.m_chromaFormat == X265_CSP_I400)
105
+        encodeTransformLuma(cu, absPartIdx, 0, log2CUSize, bCodeDQP, depthRange);
106
+    else
107
+        encodeTransform(cu, absPartIdx, 0, log2CUSize, bCodeDQP, depthRange);
108
 }
109
 
110
 void Entropy::codeSaoOffset(const SaoCtuParam& ctuParam, int plane)
111
@@ -1010,7 +1088,7 @@
112
 void Entropy::codePredWeightTable(const Slice& slice)
113
 {
114
     const WeightParam *wp;
115
-    bool            bChroma      = true; // 4:0:0 not yet supported
116
+    bool            bChroma = slice.m_sps->chromaFormatIdc != X265_CSP_I400;
117
     bool            bDenomCoded  = false;
118
     int             numRefDirs   = slice.m_sliceType == B_SLICE ? 2 : 1;
119
     uint32_t        totalSignalledWeightFlags = 0;
120
@@ -1565,11 +1643,16 @@
121
     uint8_t * const baseCtx = bIsLuma ? &m_contextState[OFF_SIG_FLAG_CTX] : &m_contextState[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
122
     uint32_t c1 = 1;
123
     int scanPosSigOff = scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1;
124
-    ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE)]);
125
+    ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE) + 1]);   // extra 2 bytes(+1) space for AVX2 assembly, +1 because (numNonZero<=1) in costCoeffNxN path
126
     uint32_t numNonZero = 1;
127
     unsigned long lastNZPosInCG;
128
     unsigned long firstNZPosInCG;
129
 
130
+#if _DEBUG
131
+    // Unnecessary, for Valgrind-3.10.0 only
132
+    memset(absCoeff, 0, sizeof(absCoeff));
133
+#endif
134
+
135
     absCoeff[0] = (uint16_t)abs(coeff[posLast]);
136
 
137
     for (int subSet = lastScanSet; subSet >= 0; subSet--)
138
@@ -1715,6 +1798,7 @@
139
             {
140
                 // maximum g_entropyBits are 18-bits and maximum of count are 16, so intermedia of sum are 22-bits
141
                 const uint8_t *tabSigCtx = table_cnt[(log2TrSize == 2) ? 4 : (uint32_t)patternSigCtx];
142
+                X265_CHECK(numNonZero <= 1, "numNonZero check failure");
143
                 uint32_t sum = primitives.costCoeffNxN(g_scan4x4[codingParameters.scanType], &coeff[blkPosBase], (intptr_t)trSize, absCoeff + numNonZero, tabSigCtx, scanFlagMask, baseCtx, offset + posOffset, scanPosSigOff, subPosBase);
144
 
145
 #if CHECKED_BUILD || _DEBUG
146
@@ -1919,43 +2003,78 @@
147
         numCtx = bIsLuma ? 12 : 3;
148
     }
149
 
150
-    if (bIsLuma)
151
-    {
152
-        for (uint32_t bin = 0; bin < 2; bin++)
153
-            estBitsSbac.significantBits[bin][0] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX], bin);
154
+    const int ctxSigOffset = OFF_SIG_FLAG_CTX + (bIsLuma ? 0 : NUM_SIG_FLAG_CTX_LUMA);
155
+
156
+    estBitsSbac.significantBits[0][0] = sbacGetEntropyBits(m_contextState[ctxSigOffset], 0);
157
+    estBitsSbac.significantBits[1][0] = sbacGetEntropyBits(m_contextState[ctxSigOffset], 1);
158
 
159
-        for (int ctxIdx = firstCtx; ctxIdx < firstCtx + numCtx; ctxIdx++)
160
-            for (uint32_t bin = 0; bin < 2; bin++)
161
-                estBitsSbac.significantBits[bin][ctxIdx] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX + ctxIdx], bin);
162
+    for (int ctxIdx = firstCtx; ctxIdx < firstCtx + numCtx; ctxIdx++)
163
+    {
164
+        estBitsSbac.significantBits[0][ctxIdx] = sbacGetEntropyBits(m_contextState[ctxSigOffset + ctxIdx], 0);
165
+        estBitsSbac.significantBits[1][ctxIdx] = sbacGetEntropyBits(m_contextState[ctxSigOffset + ctxIdx], 1);
166
     }
167
-    else
168
+
169
+    const uint32_t maxGroupIdx = log2TrSize * 2 - 1;
170
+    if (bIsLuma)
171
     {
172
-        for (uint32_t bin = 0; bin < 2; bin++)
173
-            estBitsSbac.significantBits[bin][0] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX + (NUM_SIG_FLAG_CTX_LUMA + 0)], bin);
174
+        if (log2TrSize == 2)
175
+        {
176
+            for (int i = 0, ctxIdx = 0; i < 2; i++, ctxIdx += NUM_CTX_LAST_FLAG_XY)
177
+            {
178
+                int bits = 0;
179
+                const uint8_t *ctxState = &m_contextState[OFF_CTX_LAST_FLAG_X + ctxIdx];
180
 
181
-        for (int ctxIdx = firstCtx; ctxIdx < firstCtx + numCtx; ctxIdx++)
182
-            for (uint32_t bin = 0; bin < 2; bin++)
183
-                estBitsSbac.significantBits[bin][ctxIdx] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX + (NUM_SIG_FLAG_CTX_LUMA + ctxIdx)], bin);
184
-    }
185
+                for (uint32_t ctx = 0; ctx < 3; ctx++)
186
+                {
187
+                    estBitsSbac.lastBits[i][ctx] = bits + sbacGetEntropyBits(ctxState[ctx], 0);
188
+                    bits += sbacGetEntropyBits(ctxState[ctx], 1);
189
+                }
190
 
191
-    int blkSizeOffset = bIsLuma ? ((log2TrSize - 2) * 3 + ((log2TrSize - 1) >> 2)) : NUM_CTX_LAST_FLAG_XY_LUMA;
192
-    int ctxShift = bIsLuma ? ((log2TrSize + 1) >> 2) : log2TrSize - 2;
193
-    uint32_t maxGroupIdx = log2TrSize * 2 - 1;
194
+                estBitsSbac.lastBits[i][maxGroupIdx] = bits;
195
+            }
196
+        }
197
+        else
198
+        {
199
+            const int blkSizeOffset = ((log2TrSize - 2) * 3 + (log2TrSize == 5));
200
 
201
x265_1.8.tar.gz/source/encoder/entropy.h -> x265_1.9.tar.gz/source/encoder/entropy.h Changed
18
 
1
@@ -2,6 +2,7 @@
2
 * Copyright (C) 2013 x265 project
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -246,6 +247,8 @@
10
 
11
     void encodeTransform(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, uint32_t log2TrSize,
12
                          bool& bCodeDQP, const uint32_t depthRange[2]);
13
+    void encodeTransformLuma(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, uint32_t log2TrSize,
14
+                         bool& bCodeDQP, const uint32_t depthRange[2]);
15
 
16
     void copyFrom(const Entropy& src);
17
     void copyContextsFrom(const Entropy& src);
18
x265_1.8.tar.gz/source/encoder/frameencoder.cpp -> x265_1.9.tar.gz/source/encoder/frameencoder.cpp Changed
201
 
1
@@ -104,7 +104,8 @@
2
     m_param = top->m_param;
3
     m_numRows = numRows;
4
     m_numCols = numCols;
5
-    m_filterRowDelay = (m_param->bEnableSAO && m_param->bSaoNonDeblocked) ?
6
+    m_filterRowDelay = ((m_param->bEnableSAO && m_param->bSaoNonDeblocked)
7
+                        || (!m_param->bEnableLoopFilter && m_param->bEnableSAO)) ?
8
                         2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 1 : 0);
9
     m_filterRowDelayCus = m_filterRowDelay * numCols;
10
     m_rows = new CTURow[m_numRows];
11
@@ -124,7 +125,7 @@
12
         m_pool = NULL;
13
     }
14
 
15
-    m_frameFilter.init(top, this, numRows);
16
+    m_frameFilter.init(top, this, numRows, numCols);
17
 
18
     // initialize HRD parameters of SPS
19
     if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
20
@@ -135,7 +136,7 @@
21
         ok &= m_rce.picTimingSEI && m_rce.hrdTiming;
22
     }
23
 
24
-    if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
25
+    if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
26
         m_nr = X265_MALLOC(NoiseReduction, 1);
27
     if (m_nr)
28
         memset(m_nr, 0, sizeof(NoiseReduction));
29
@@ -275,7 +276,7 @@
30
         m_localTldIdx = 0;
31
     }
32
 
33
-    m_done.trigger();     /* signal that thread is initialized */ 
34
+    m_done.trigger();     /* signal that thread is initialized */
35
     m_enable.wait();      /* Encoder::encode() triggers this event */
36
 
37
     while (m_threadActive)
38
@@ -357,15 +358,52 @@
39
             WeightParam *w = NULL;
40
             if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].bPresentFlag)
41
                 w = slice->m_weightPredTable[l][ref];
42
-            m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPic, w, *m_param);
43
+            slice->m_refReconPicList[l][ref] = slice->m_refFrameList[l][ref]->m_reconPic;
44
+            m_mref[l][ref].init(slice->m_refReconPicList[l][ref], w, *m_param);
45
         }
46
     }
47
 
48
+    int numTLD;
49
+    if (m_pool)
50
+        numTLD = m_param->bEnableWavefront ? m_pool->m_numWorkers : m_pool->m_numWorkers + m_pool->m_numProviders;
51
+    else
52
+        numTLD = 1;
53
+
54
     /* Get the QP for this frame from rate control. This call may block until
55
      * frames ahead of it in encode order have called rateControlEnd() */
56
     int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
57
     m_rce.newQp = qp;
58
 
59
+    if (m_nr)
60
+    {
61
+        if (qp > QP_MAX_SPEC && m_frame->m_param->rc.vbvBufferSize)
62
+        {
63
+            for (int i = 0; i < numTLD; i++)
64
+            {
65
+                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = m_top->m_offsetEmergency[qp - QP_MAX_SPEC - 1];
66
+                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].residualSum = m_top->m_residualSumEmergency;
67
+                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].count = m_top->m_countEmergency;
68
+            }
69
+        }
70
+        else
71
+        {
72
+            if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
73
+            {
74
+                for (int i = 0; i < numTLD; i++)
75
+                {
76
+                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrOffsetDenoise;
77
+                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].residualSum = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrResidualSum;
78
+                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].count = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrCount;
79
+                }
80
+            }
81
+            else
82
+            {
83
+                for (int i = 0; i < numTLD; i++)
84
+                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = NULL;
85
+            }
86
+        }
87
+    }
88
+
89
     /* Clip slice QP to 0-51 spec range before encoding */
90
     slice->m_sliceQp = x265_clip3(-QP_BD_OFFSET, QP_MAX_SPEC, qp);
91
 
92
@@ -458,7 +496,7 @@
93
     /* CQP and CRF (without capped VBV) don't use mid-frame statistics to
94
      * tune RateControl parameters for other frames.
95
      * Hence, for these modes, update m_startEndOrder and unlock RC for previous threads waiting in
96
-     * RateControlEnd here, after the slicecontexts are initialized. For the rest - ABR
97
+     * RateControlEnd here, after the slice contexts are initialized. For the rest - ABR
98
      * and VBV, unlock only after rateControlUpdateStats of this frame is called */
99
     if (m_param->rc.rateControlMode != X265_RC_ABR && !m_top->m_rateControl->m_isVbv)
100
     {
101
@@ -482,7 +520,7 @@
102
             {
103
                 for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
104
                 {
105
-                    Frame *refpic = slice->m_refPicList[l][ref];
106
+                    Frame *refpic = slice->m_refFrameList[l][ref];
107
 
108
                     uint32_t reconRowCount = refpic->m_reconRowCount.get();
109
                     while ((reconRowCount != m_numRows) && (reconRowCount < row + m_refLagRows))
110
@@ -521,7 +559,7 @@
111
                     int list = l;
112
                     for (int ref = 0; ref < slice->m_numRefIdx[list]; ref++)
113
                     {
114
-                        Frame *refpic = slice->m_refPicList[list][ref];
115
+                        Frame *refpic = slice->m_refFrameList[list][ref];
116
 
117
                         uint32_t reconRowCount = refpic->m_reconRowCount.get();
118
                         while ((reconRowCount != m_numRows) && (reconRowCount < i + m_refLagRows))
119
@@ -572,10 +610,7 @@
120
         m_frame->m_encData->m_frameStats.lumaDistortion   += m_rows[i].rowStats.lumaDistortion;
121
         m_frame->m_encData->m_frameStats.chromaDistortion += m_rows[i].rowStats.chromaDistortion;
122
         m_frame->m_encData->m_frameStats.psyEnergy        += m_rows[i].rowStats.psyEnergy;
123
-        m_frame->m_encData->m_frameStats.lumaLevel        += m_rows[i].rowStats.lumaLevel;
124
-
125
-        if (m_rows[i].rowStats.maxLumaLevel > m_frame->m_encData->m_frameStats.maxLumaLevel)
126
-            m_frame->m_encData->m_frameStats.maxLumaLevel = m_rows[i].rowStats.maxLumaLevel;
127
+        m_frame->m_encData->m_frameStats.resEnergy        += m_rows[i].rowStats.resEnergy;
128
         for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
129
         {
130
             m_frame->m_encData->m_frameStats.cntSkipCu[depth] += m_rows[i].rowStats.cntSkipCu[depth];
131
@@ -589,7 +624,7 @@
132
     m_frame->m_encData->m_frameStats.avgLumaDistortion   = (double)(m_frame->m_encData->m_frameStats.lumaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
133
     m_frame->m_encData->m_frameStats.avgChromaDistortion = (double)(m_frame->m_encData->m_frameStats.chromaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
134
     m_frame->m_encData->m_frameStats.avgPsyEnergy        = (double)(m_frame->m_encData->m_frameStats.psyEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
135
-    m_frame->m_encData->m_frameStats.avgLumaLevel        = m_frame->m_encData->m_frameStats.lumaLevel / m_frame->m_encData->m_frameStats.totalCtu;
136
+    m_frame->m_encData->m_frameStats.avgResEnergy        = (double)(m_frame->m_encData->m_frameStats.resEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
137
     m_frame->m_encData->m_frameStats.percentIntraNxN     = (double)(m_frame->m_encData->m_frameStats.cntIntraNxN * 100) / m_frame->m_encData->m_frameStats.totalCu;
138
     for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
139
     {
140
@@ -626,22 +661,23 @@
141
 
142
     if (m_param->decodedPictureHashSEI)
143
     {
144
+        int planes = (m_frame->m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
145
         if (m_param->decodedPictureHashSEI == 1)
146
         {
147
             m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::MD5;
148
-            for (int i = 0; i < 3; i++)
149
+            for (int i = 0; i < planes; i++)
150
                 MD5Final(&m_state[i], m_seiReconPictureDigest.m_digest[i]);
151
         }
152
         else if (m_param->decodedPictureHashSEI == 2)
153
         {
154
             m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CRC;
155
-            for (int i = 0; i < 3; i++)
156
+            for (int i = 0; i < planes; i++)
157
                 crcFinish(m_crc[i], m_seiReconPictureDigest.m_digest[i]);
158
         }
159
         else if (m_param->decodedPictureHashSEI == 3)
160
         {
161
             m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CHECKSUM;
162
-            for (int i = 0; i < 3; i++)
163
+            for (int i = 0; i < planes; i++)
164
                 checksumFinish(m_checksum[i], m_seiReconPictureDigest.m_digest[i]);
165
         }
166
 
167
@@ -678,41 +714,40 @@
168
     {
169
         for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
170
         {
171
-            Frame *refpic = slice->m_refPicList[l][ref];
172
+            Frame *refpic = slice->m_refFrameList[l][ref];
173
             ATOMIC_DEC(&refpic->m_countRefEncoders);
174
         }
175
     }
176
 
177
-    int numTLD;
178
-    if (m_pool)
179
-        numTLD = m_param->bEnableWavefront ? m_pool->m_numWorkers : m_pool->m_numWorkers + m_pool->m_numProviders;
180
-    else
181
-        numTLD = 1;
182
-
183
     if (m_nr)
184
     {
185
-        /* Accumulate NR statistics from all worker threads */
186
-        for (int i = 0; i < numTLD; i++)
187
+        bool nrEnabled = (m_rce.newQp < QP_MAX_SPEC || !m_param->rc.vbvBufferSize) && (m_param->noiseReductionIntra || m_param->noiseReductionInter);
188
+
189
+        if (nrEnabled)
190
         {
191
-            NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
192
-            for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
193
+            /* Accumulate NR statistics from all worker threads */
194
+            for (int i = 0; i < numTLD; i++)
195
             {
196
-                for (int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++)
197
-                    m_nr->residualSum[cat][coeff] += nr->residualSum[cat][coeff];
198
-            
199
-                m_nr->count[cat] += nr->count[cat];
200
+                NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
201
x265_1.8.tar.gz/source/encoder/framefilter.cpp -> x265_1.9.tar.gz/source/encoder/framefilter.cpp Changed
201
 
1
@@ -35,177 +35,486 @@
2
 static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
3
 static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
4
 
5
-FrameFilter::FrameFilter()
6
-    : m_param(NULL)
7
-    , m_frame(NULL)
8
-    , m_frameEncoder(NULL)
9
-    , m_ssimBuf(NULL)
10
-{
11
-}
12
-
13
 void FrameFilter::destroy()
14
 {
15
-    if (m_param->bEnableSAO)
16
-        m_sao.destroy();
17
-
18
     X265_FREE(m_ssimBuf);
19
+
20
+    if (m_parallelFilter)
21
+    {
22
+        if (m_param->bEnableSAO)
23
+        {
24
+            for(int row = 0; row < m_numRows; row++)
25
+                m_parallelFilter[row].m_sao.destroy((row == 0 ? 1 : 0));
26
+        }
27
+
28
+        delete[] m_parallelFilter;
29
+        m_parallelFilter = NULL;
30
+    }
31
 }
32
 
33
-void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows)
34
+void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols)
35
 {
36
     m_param = top->m_param;
37
     m_frameEncoder = frame;
38
     m_numRows = numRows;
39
+    m_numCols = numCols;
40
     m_hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp);
41
     m_vChromaShift = CHROMA_V_SHIFT(m_param->internalCsp);
42
     m_pad[0] = top->m_sps.conformanceWindow.rightOffset;
43
     m_pad[1] = top->m_sps.conformanceWindow.bottomOffset;
44
     m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0;
45
-    m_lastHeight = m_param->sourceHeight % g_maxCUSize ? m_param->sourceHeight % g_maxCUSize : g_maxCUSize;
46
-
47
-    if (m_param->bEnableSAO)
48
-        if (!m_sao.create(m_param))
49
-            m_param->bEnableSAO = 0;
50
+    m_lastHeight = (m_param->sourceHeight % g_maxCUSize) ? (m_param->sourceHeight % g_maxCUSize) : g_maxCUSize;
51
+    m_lastWidth = (m_param->sourceWidth % g_maxCUSize) ? (m_param->sourceWidth % g_maxCUSize) : g_maxCUSize;
52
 
53
     if (m_param->bEnableSsim)
54
         m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
55
+
56
+    m_parallelFilter = new ParallelFilter[numRows];
57
+
58
+    if (m_parallelFilter)
59
+    {
60
+        if (m_param->bEnableSAO)
61
+        {
62
+            for(int row = 0; row < numRows; row++)
63
+            {
64
+                if (!m_parallelFilter[row].m_sao.create(m_param, (row == 0 ? 1 : 0)))
65
+                    m_param->bEnableSAO = 0;
66
+                else
67
+                {
68
+                    if (row != 0)
69
+                        m_parallelFilter[row].m_sao.createFromRootNode(&m_parallelFilter[0].m_sao);
70
+                }
71
+
72
+            }
73
+        }
74
+
75
+        for(int row = 0; row < numRows; row++)
76
+        {
77
+            // Setting maximum bound information
78
+            m_parallelFilter[row].m_rowHeight = (row == numRows - 1) ? m_lastHeight : g_maxCUSize;
79
+            m_parallelFilter[row].m_row = row;
80
+            m_parallelFilter[row].m_rowAddr = row * numCols;
81
+            m_parallelFilter[row].m_frameFilter = this;
82
+
83
+            if (row > 0)
84
+                m_parallelFilter[row].m_prevRow = &m_parallelFilter[row - 1];
85
+        }
86
+    }
87
+
88
 }
89
 
90
 void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
91
 {
92
     m_frame = frame;
93
 
94
-    if (m_param->bEnableSAO)
95
-        m_sao.startSlice(frame, initState, qp);
96
+    // Reset Filter Data Struct
97
+    if (m_parallelFilter)
98
+    {
99
+        for(int row = 0; row < m_numRows; row++)
100
+        {
101
+            if (m_param->bEnableSAO)
102
+                m_parallelFilter[row].m_sao.startSlice(frame, initState, qp);
103
+
104
+            m_parallelFilter[row].m_lastCol.set(0);
105
+            m_parallelFilter[row].m_allowedCol.set(0);
106
+            m_parallelFilter[row].m_lastDeblocked.set(-1);
107
+            m_parallelFilter[row].m_encData = frame->m_encData;
108
+        }
109
+
110
+        // Reset SAO common statistics
111
+        if (m_param->bEnableSAO)
112
+            m_parallelFilter[0].m_sao.resetStats();
113
+    }
114
 }
115
 
116
-void FrameFilter::processRow(int row)
117
+/* restore original YUV samples to recon after SAO (if lossless) */
118
+static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx)
119
 {
120
-    ProfileScopeEvent(filterCTURow);
121
+    const int size = cu->m_log2CUSize[absPartIdx] - 2;
122
+    const uint32_t cuAddr = cu->m_cuAddr;
123
 
124
-#if DETAILED_CU_STATS
125
-    ScopedElapsedTime filterPerfScope(m_frameEncoder->m_cuStats.loopFilterElapsedTime);
126
-    m_frameEncoder->m_cuStats.countLoopFilter++;
127
-#endif
128
+    PicYuv* reconPic = frame.m_reconPic;
129
+    PicYuv* fencPic  = frame.m_fencPic;
130
 
131
-    if (!m_param->bEnableLoopFilter && !m_param->bEnableSAO)
132
+    pixel* dst = reconPic->getLumaAddr(cuAddr, absPartIdx);
133
+    pixel* src = fencPic->getLumaAddr(cuAddr, absPartIdx);
134
+
135
+    primitives.cu[size].copy_pp(dst, reconPic->m_stride, src, fencPic->m_stride);
136
+
137
+    if (cu->m_chromaFormat != X265_CSP_I400)
138
     {
139
-        processRowPost(row);
140
+        pixel* dstCb = reconPic->getCbAddr(cuAddr, absPartIdx);
141
+        pixel* srcCb = fencPic->getCbAddr(cuAddr, absPartIdx);
142
+        pixel* dstCr = reconPic->getCrAddr(cuAddr, absPartIdx);
143
+        pixel* srcCr = fencPic->getCrAddr(cuAddr, absPartIdx);
144
+
145
+        const int csp = fencPic->m_picCsp;
146
+        primitives.chroma[csp].cu[size].copy_pp(dstCb, reconPic->m_strideC, srcCb, fencPic->m_strideC);
147
+        primitives.chroma[csp].cu[size].copy_pp(dstCr, reconPic->m_strideC, srcCr, fencPic->m_strideC);
148
+    }
149
+}
150
+
151
+/* Original YUV restoration for CU in lossless coding */
152
+static void origCUSampleRestoration(const CUData* cu, const CUGeom& cuGeom, Frame& frame)
153
+{
154
+    uint32_t absPartIdx = cuGeom.absPartIdx;
155
+    if (cu->m_cuDepth[absPartIdx] > cuGeom.depth)
156
+    {
157
+        for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++)
158
+        {
159
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
160
+            if (childGeom.flags & CUGeom::PRESENT)
161
+                origCUSampleRestoration(cu, childGeom, frame);
162
+        }
163
         return;
164
     }
165
-    FrameData& encData = *m_frame->m_encData;
166
-    const uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
167
-    const uint32_t lineStartCUAddr = row * numCols;
168
 
169
-    if (m_param->bEnableLoopFilter)
170
+    // restore original YUV samples
171
+    if (cu->m_tqBypass[absPartIdx])
172
+        restoreOrigLosslessYuv(cu, frame, absPartIdx);
173
+}
174
+
175
+void FrameFilter::ParallelFilter::copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col)
176
+{
177
+    // Copy SAO Top Reference Pixels
178
+    int ctuWidth  = g_maxCUSize;
179
+    const pixel* recY = reconPic->getPlaneAddr(0, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_stride);
180
+
181
+    // Luma
182
+    memcpy(&m_sao.m_tmpU[0][col * ctuWidth], recY, ctuWidth * sizeof(pixel));
183
+    X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected");
184
+
185
+    // Chroma
186
+    if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
187
+    {
188
+        ctuWidth  >>= m_sao.m_hChromaShift;
189
+
190
+        const pixel* recU = reconPic->getPlaneAddr(1, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC);
191
+        const pixel* recV = reconPic->getPlaneAddr(2, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC);
192
+        memcpy(&m_sao.m_tmpU[1][col * ctuWidth], recU, ctuWidth * sizeof(pixel));
193
+        memcpy(&m_sao.m_tmpU[2][col * ctuWidth], recV, ctuWidth * sizeof(pixel));
194
+
195
+        X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected");
196
+    }
197
+}
198
+
199
+void FrameFilter::ParallelFilter::processSaoUnitCu(SAOParam *saoParam, int col)
200
+{
201
x265_1.8.tar.gz/source/encoder/framefilter.h -> x265_1.9.tar.gz/source/encoder/framefilter.h Changed
112
 
1
@@ -29,6 +29,7 @@
2
 #include "frame.h"
3
 #include "deblock.h"
4
 #include "sao.h"
5
+#include "threadpool.h" // class BondedTaskGroup
6
 
7
 namespace X265_NS {
8
 // private x265 namespace
9
@@ -39,7 +40,7 @@
10
 struct ThreadLocalData;
11
 
12
 // Manages the processing of a single frame loopfilter
13
-class FrameFilter : public Deblock
14
+class FrameFilter
15
 {
16
 public:
17
 
18
@@ -50,24 +51,86 @@
19
     int           m_vChromaShift;
20
     int           m_pad[2];
21
 
22
-    SAO           m_sao;
23
     int           m_numRows;
24
+    int           m_numCols;
25
     int           m_saoRowDelay;
26
     int           m_lastHeight;
27
+    int           m_lastWidth;
28
     
29
-    void*         m_ssimBuf; /* Temp storage for ssim computation */
30
+    void*         m_ssimBuf;        /* Temp storage for ssim computation */
31
 
32
-    FrameFilter();
33
+#define MAX_PFILTER_CUS     (4) /* maximum CUs for every thread */
34
+    class ParallelFilter : public BondedTaskGroup, public Deblock
35
+    {
36
+    public:
37
+        uint32_t            m_rowHeight;
38
+        int                 m_row;
39
+        uint32_t            m_rowAddr;
40
+        FrameFilter*        m_frameFilter;
41
+        FrameData*          m_encData;
42
+        ParallelFilter*     m_prevRow;
43
+        SAO                 m_sao;
44
+        ThreadSafeInteger   m_lastCol;          /* The next column to process */
45
+        ThreadSafeInteger   m_allowedCol;       /* The column that has been processed by the Encode pipeline */
46
+        ThreadSafeInteger   m_lastDeblocked;   /* The column that finished all of Deblock stages  */
47
 
48
-    void init(Encoder *top, FrameEncoder *frame, int numRows);
49
+        ParallelFilter()
50
+            : m_rowHeight(0)
51
+            , m_row(0)
52
+            , m_rowAddr(0)
53
+            , m_frameFilter(NULL)
54
+            , m_encData(NULL)
55
+            , m_prevRow(NULL)
56
+        {
57
+        }
58
+
59
+        ~ParallelFilter()
60
+        { }
61
+
62
+        void processTasks(int workerThreadId);
63
+
64
+        // Apply SAO on a CU in current row
65
+        void processSaoUnitCu(SAOParam *saoParam, int col);
66
+
67
+        // Copy and save SAO reference pixels for the SAO RDO decision
68
+        void copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col);
69
+
70
+        // Post-Process (Border extension)
71
+        void processPostCu(int col) const;
72
+
73
+        uint32_t getCUHeight() const
74
+        {
75
+            return m_rowHeight;
76
+        }
77
+
78
+    protected:
79
+
80
+        ParallelFilter operator=(const ParallelFilter&);
81
+    };
82
+
83
+    ParallelFilter*     m_parallelFilter;
84
+
85
+    FrameFilter()
86
+        : m_param(NULL)
87
+        , m_frame(NULL)
88
+        , m_frameEncoder(NULL)
89
+        , m_ssimBuf(NULL)
90
+        , m_parallelFilter(NULL)
91
+    {
92
+    }
93
+
94
+    uint32_t getCUWidth(int colNum) const
95
+    {
96
+        return (colNum == (int)m_numCols - 1) ? m_lastWidth : g_maxCUSize;
97
+    }
98
+
99
+    void init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols);
100
     void destroy();
101
 
102
     void start(Frame *pic, Entropy& initState, int qp);
103
 
104
     void processRow(int row);
105
-    void processRowPost(int row);
106
-    void processSao(int row);
107
-    uint32_t getCUHeight(int rowNum) const;
108
+    void processPostRow(int row);
109
 };
110
 }
111
 
112
x265_1.8.tar.gz/source/encoder/level.cpp -> x265_1.9.tar.gz/source/encoder/level.cpp Changed
27
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -462,7 +463,7 @@
10
     {
11
         if (param->internalCsp != X265_CSP_I420)
12
         {
13
-            x265_log(param, X265_LOG_ERROR, "%s profile not compatible with %s input color space.\n",
14
+            x265_log(param, X265_LOG_ERROR, "%s profile not compatible with %s input chroma subsampling.\n",
15
                      profile, x265_source_csp_names[param->internalCsp]);
16
             return -1;
17
         }
18
@@ -472,7 +473,7 @@
19
     {
20
         if (param->internalCsp != X265_CSP_I420 && param->internalCsp != X265_CSP_I422)
21
         {
22
-            x265_log(param, X265_LOG_ERROR, "%s profile not compatible with %s input color space.\n",
23
+            x265_log(param, X265_LOG_ERROR, "%s profile not compatible with %s input chroma subsampling.\n",
24
                      profile, x265_source_csp_names[param->internalCsp]);
25
             return -1;
26
         }
27
x265_1.8.tar.gz/source/encoder/motion.cpp -> x265_1.9.tar.gz/source/encoder/motion.cpp Changed
37
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -188,11 +189,12 @@
10
     satd = primitives.pu[partEnum].satd;
11
     sad_x3 = primitives.pu[partEnum].sad_x3;
12
     sad_x4 = primitives.pu[partEnum].sad_x4;
13
+
14
     chromaSatd = primitives.chroma[fencPUYuv.m_csp].pu[partEnum].satd;
15
 
16
     /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size
17
      * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */
18
-    bChromaSATD = subpelRefine > 2 && chromaSatd;
19
+    bChromaSATD = subpelRefine > 2 && chromaSatd && (srcFencYuv.m_csp != X265_CSP_I400);
20
     X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n");
21
 
22
     ctuAddr = _ctuAddr;
23
@@ -1214,8 +1216,11 @@
24
         const pixel* refCb = ref->getCbAddr(ctuAddr, absPartIdx) + refOffset;
25
         const pixel* refCr = ref->getCrAddr(ctuAddr, absPartIdx) + refOffset;
26
 
27
-        xFrac = qmv.x & ((1 << shiftHor) - 1);
28
-        yFrac = qmv.y & ((1 << shiftVer) - 1);
29
+        X265_CHECK((hshift == 0) || (hshift == 1), "hshift must be 0 or 1\n");
30
+        X265_CHECK((vshift == 0) || (vshift == 1), "vshift must be 0 or 1\n");
31
+
32
+        xFrac = qmv.x & (hshift ? 7 : 3);
33
+        yFrac = qmv.y & (vshift ? 7 : 3);
34
 
35
         if (!(yFrac | xFrac))
36
         {
37
x265_1.8.tar.gz/source/encoder/motion.h -> x265_1.9.tar.gz/source/encoder/motion.h Changed
9
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/encoder/nal.cpp -> x265_1.9.tar.gz/source/encoder/nal.cpp Changed
9
 
1
@@ -2,6 +2,7 @@
2
 * Copyright (C) 2013 x265 project
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/encoder/ratecontrol.cpp -> x265_1.9.tar.gz/source/encoder/ratecontrol.cpp Changed
201
 
1
@@ -23,6 +23,10 @@
2
  * For more information, contact us at license @ x265.com.
3
  *****************************************************************************/
4
 
5
+#if _MSC_VER
6
+#pragma warning(disable: 4127) // conditional expression is constant, yes I know
7
+#endif
8
+
9
 #include "common.h"
10
 #include "param.h"
11
 #include "frame.h"
12
@@ -142,6 +146,9 @@
13
     rce->expectedVbv = rce2Pass->expectedVbv;
14
     rce->blurredComplexity = rce2Pass->blurredComplexity;
15
     rce->sliceType = rce2Pass->sliceType;
16
+    rce->qpNoVbv = rce2Pass->qpNoVbv;
17
+    rce->newQp = rce2Pass->newQp;
18
+    rce->qRceq = rce2Pass->qRceq;
19
 }
20
 
21
 }  // end anonymous namespace
22
@@ -205,7 +212,7 @@
23
             m_rateFactorMaxDecrement = m_param->rc.rfConstant - m_param->rc.rfConstantMin;
24
     }
25
     m_isAbr = m_param->rc.rateControlMode != X265_RC_CQP && !m_param->rc.bStatRead;
26
-    m_2pass = m_param->rc.rateControlMode == X265_RC_ABR && m_param->rc.bStatRead;
27
+    m_2pass = (m_param->rc.rateControlMode == X265_RC_ABR || m_param->rc.vbvMaxBitrate > 0) && m_param->rc.bStatRead;
28
     m_bitrate = m_param->rc.bitrate * 1000;
29
     m_frameDuration = (double)m_param->fpsDenom / m_param->fpsNum;
30
     m_qp = m_param->rc.qp;
31
@@ -219,6 +226,7 @@
32
     m_cutreeStatFileOut = m_cutreeStatFileIn = NULL;
33
     m_rce2Pass = NULL;
34
     m_lastBsliceSatdCost = 0;
35
+    m_movingAvgSum = 0.0;
36
 
37
     // vbv initialization
38
     m_param->rc.vbvBufferSize = x265_clip3(0, 2000000, m_param->rc.vbvBufferSize);
39
@@ -444,6 +452,7 @@
40
                 CMP_OPT_FIRST_PASS("open-gop", m_param->bOpenGOP);
41
                 CMP_OPT_FIRST_PASS("keyint", m_param->keyframeMax);
42
                 CMP_OPT_FIRST_PASS("scenecut", m_param->scenecutThreshold);
43
+                CMP_OPT_FIRST_PASS("intra-refresh", m_param->bIntraRefresh);
44
 
45
                 if ((p = strstr(opts, "b-adapt=")) != 0 && sscanf(p, "b-adapt=%d", &i) && i >= X265_B_ADAPT_NONE && i <= X265_B_ADAPT_TRELLIS)
46
                 {
47
@@ -488,6 +497,12 @@
48
                  x265_log(m_param, X265_LOG_ERROR, "Rce Entries for 2 pass cannot be allocated\n");
49
                  return false;
50
             }
51
+            m_encOrder = X265_MALLOC(int, m_numEntries);
52
+            if (!m_encOrder)
53
+            {
54
+                x265_log(m_param, X265_LOG_ERROR, "Encode order for 2 pass cannot be allocated\n");
55
+                return false;
56
+            }
57
             /* init all to skipped p frames */
58
             for (int i = 0; i < m_numEntries; i++)
59
             {
60
@@ -504,22 +519,24 @@
61
             {
62
                 RateControlEntry *rce;
63
                 int frameNumber;
64
+                int encodeOrder;
65
                 char picType;
66
                 int e;
67
                 char *next;
68
-                double qpRc, qpAq;
69
+                double qpRc, qpAq, qNoVbv, qRceq;
70
                 next = strstr(p, ";");
71
                 if (next)
72
                     *next++ = 0;
73
-                e = sscanf(p, " in:%d ", &frameNumber);
74
+                e = sscanf(p, " in:%d out:%d", &frameNumber, &encodeOrder);
75
                 if (frameNumber < 0 || frameNumber >= m_numEntries)
76
                 {
77
                     x265_log(m_param, X265_LOG_ERROR, "bad frame number (%d) at stats line %d\n", frameNumber, i);
78
                     return false;
79
                 }
80
-                rce = &m_rce2Pass[frameNumber];
81
-                e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf",
82
-                       &picType, &qpRc, &qpAq, &rce->coeffBits,
83
+                rce = &m_rce2Pass[encodeOrder];
84
+                m_encOrder[frameNumber] = encodeOrder;
85
+                e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf",
86
+                       &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
87
                        &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
88
                        &rce->skipCuCount);
89
                 rce->keptAsRef = true;
90
@@ -538,13 +555,16 @@
91
                     x265_log(m_param, X265_LOG_ERROR, "statistics are damaged at line %d, parser out=%d\n", i, e);
92
                     return false;
93
                 }
94
-                rce->qScale = x265_qp2qScale(qpRc);
95
+                rce->qScale = rce->newQScale = x265_qp2qScale(qpRc);
96
                 totalQpAq += qpAq;
97
+                rce->qpNoVbv = qNoVbv;
98
+                rce->qpaRc = qpRc;
99
+                rce->qpAq = qpAq;
100
+                rce->qRceq = qRceq;
101
                 p = next;
102
             }
103
             X265_FREE(statsBuf);
104
-
105
-            if (m_param->rc.rateControlMode == X265_RC_ABR)
106
+            if (m_param->rc.rateControlMode == X265_RC_ABR || m_param->rc.vbvMaxBitrate > 0)
107
             {
108
                 if (!initPass2())
109
                     return false;
110
@@ -627,11 +647,8 @@
111
 
112
     #undef MAX_DURATION
113
 }
114
-
115
-bool RateControl::initPass2()
116
+bool RateControl::analyseABR2Pass(int startIndex, int endIndex, uint64_t allAvailableBits)
117
 {
118
-    uint64_t allConstBits = 0;
119
-    uint64_t allAvailableBits = uint64_t(m_param->rc.bitrate * 1000. * m_numEntries * m_frameDuration);
120
     double rateFactor, stepMult;
121
     double qBlur = m_param->rc.qblur;
122
     double cplxBlur = m_param->rc.complexityBlur;
123
@@ -640,30 +657,19 @@
124
     double *qScale, *blurredQscale;
125
     double baseCplx = m_ncu * (m_param->bframes ? 120 : 80);
126
     double clippedDuration = CLIP_DURATION(m_frameDuration) / BASE_FRAME_DURATION;
127
-
128
-    /* find total/average complexity & const_bits */
129
-    for (int i = 0; i < m_numEntries; i++)
130
-        allConstBits += m_rce2Pass[i].miscBits;
131
-
132
-    if (allAvailableBits < allConstBits)
133
-    {
134
-        x265_log(m_param, X265_LOG_ERROR, "requested bitrate is too low. estimated minimum is %d kbps\n",
135
-                 (int)(allConstBits * m_fps / m_numEntries * 1000.));
136
-        return false;
137
-    }
138
-
139
+    int framesCount = endIndex - startIndex + 1;
140
     /* Blur complexities, to reduce local fluctuation of QP.
141
      * We don't blur the QPs directly, because then one very simple frame
142
      * could drag down the QP of a nearby complex frame and give it more
143
      * bits than intended. */
144
-    for (int i = 0; i < m_numEntries; i++)
145
+    for (int i = startIndex; i <= endIndex; i++)
146
     {
147
         double weightSum = 0;
148
         double cplxSum = 0;
149
         double weight = 1.0;
150
         double gaussianWeight;
151
         /* weighted average of cplx of future frames */
152
-        for (int j = 1; j < cplxBlur * 2 && j < m_numEntries - i; j++)
153
+        for (int j = 1; j < cplxBlur * 2 && j <= endIndex - i; j++)
154
         {
155
             RateControlEntry *rcj = &m_rce2Pass[i + j];
156
             weight *= 1 - pow(rcj->iCuCount / m_ncu, 2);
157
@@ -687,11 +693,10 @@
158
         }
159
         m_rce2Pass[i].blurredComplexity = cplxSum / weightSum;
160
     }
161
-
162
-    CHECKED_MALLOC(qScale, double, m_numEntries);
163
+    CHECKED_MALLOC(qScale, double, framesCount);
164
     if (filterSize > 1)
165
     {
166
-        CHECKED_MALLOC(blurredQscale, double, m_numEntries);
167
+        CHECKED_MALLOC(blurredQscale, double, framesCount);
168
     }
169
     else
170
         blurredQscale = qScale;
171
@@ -702,9 +707,8 @@
172
      * because qscale2bits is not invertible, but we can start with the simple
173
      * approximation of scaling the 1st pass by the ratio of bitrates.
174
      * The search range is probably overkill, but speed doesn't matter here. */
175
-
176
     expectedBits = 1;
177
-    for (int i = 0; i < m_numEntries; i++)
178
+    for (int i = startIndex; i <= endIndex; i++)
179
     {
180
         RateControlEntry* rce = &m_rce2Pass[i];
181
         double q = getQScale(rce, 1.0);
182
@@ -781,12 +785,10 @@
183
     X265_FREE(qScale);
184
     if (filterSize > 1)
185
         X265_FREE(blurredQscale);
186
-
187
     if (m_isVbv)
188
-        if (!vbv2Pass(allAvailableBits))
189
+    if (!vbv2Pass(allAvailableBits, endIndex, startIndex))
190
             return false;
191
-    expectedBits = countExpectedBits();
192
-
193
+    expectedBits = countExpectedBits(startIndex, endIndex);
194
     if (fabs(expectedBits / allAvailableBits - 1.0) > 0.01)
195
     {
196
         double avgq = 0;
197
@@ -819,7 +821,123 @@
198
     return false;
199
 }
200
 
201
x265_1.8.tar.gz/source/encoder/ratecontrol.h -> x265_1.9.tar.gz/source/encoder/ratecontrol.h Changed
52
 
1
@@ -48,6 +48,7 @@
2
 
3
 struct Predictor
4
 {
5
+    double coeffMin;
6
     double coeff;
7
     double count;
8
     double decay;
9
@@ -74,6 +75,7 @@
10
     double  qpaRc;
11
     double  qpAq;
12
     double  qRceq;
13
+    double  qpPrev;
14
     double  frameSizePlanned;  /* frame Size decided by RateCotrol before encoding the frame */
15
     double  bufferRate;
16
     double  movingAvgSum;
17
@@ -167,6 +169,8 @@
18
     int64_t m_satdCostWindow[50];
19
     int64_t m_encodedBitsWindow[50];
20
     int     m_sliderPos;
21
+    int64_t m_lastRemovedSatdCost;
22
+    double  m_movingAvgSum;
23
 
24
     /* To detect a pattern of low detailed static frames in single pass ABR using satdcosts */
25
     int64_t m_lastBsliceSatdCost;
26
@@ -205,8 +209,8 @@
27
     double  m_lastAccumPNorm;
28
     double  m_expectedBitsSum;   /* sum of qscale2bits after rceq, ratefactor, and overflow, only includes finished frames */
29
     int64_t m_predictedBits;
30
+    int     *m_encOrder;
31
     RateControlEntry* m_rce2Pass;
32
-
33
     struct
34
     {
35
         uint16_t *qpBuffer[2]; /* Global buffers for converting MB-tree quantizer data. */
36
@@ -258,11 +262,12 @@
37
     void   checkAndResetABR(RateControlEntry* rce, bool isFrameDone);
38
     double predictRowsSizeSum(Frame* pic, RateControlEntry* rce, double qpm, int32_t& encodedBits);
39
     bool   initPass2();
40
+    bool   analyseABR2Pass(int startPoc, int endPoc, uint64_t allAvailableBits);
41
     void   initFramePredictors();
42
     double getDiffLimitedQScale(RateControlEntry *rce, double q);
43
-    double countExpectedBits();
44
-    bool   vbv2Pass(uint64_t allAvailableBits);
45
-    bool   findUnderflow(double *fills, int *t0, int *t1, int over);
46
+    double countExpectedBits(int startPos, int framesCount);
47
+    bool   vbv2Pass(uint64_t allAvailableBits, int frameCount, int startPos);
48
+    bool   findUnderflow(double *fills, int *t0, int *t1, int over, int framesCount);
49
     bool   fixUnderflow(int t0, int t1, double adjustment, double qscaleMin, double qscaleMax);
50
 };
51
 }
52
x265_1.8.tar.gz/source/encoder/rdcost.h -> x265_1.9.tar.gz/source/encoder/rdcost.h Changed
99
 
1
@@ -2,6 +2,7 @@
2
 * Copyright (C) 2013 x265 project
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -73,13 +74,18 @@
10
             qpCr = x265_clip3(QP_MIN, QP_MAX_SPEC, qp + slice.m_pps->chromaQpOffset[1]);
11
         }
12
 
13
-        int chroma_offset_idx = X265_MIN(qp - qpCb + 12, MAX_CHROMA_LAMBDA_OFFSET);
14
-        uint16_t lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
15
-        m_chromaDistWeight[0] = lambdaOffset;
16
+        if (slice.m_sps->chromaFormatIdc == X265_CSP_I444)
17
+        {
18
+            int chroma_offset_idx = X265_MIN(qp - qpCb + 12, MAX_CHROMA_LAMBDA_OFFSET);
19
+            uint16_t lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
20
+            m_chromaDistWeight[0] = lambdaOffset;
21
 
22
-        chroma_offset_idx = X265_MIN(qp - qpCr + 12, MAX_CHROMA_LAMBDA_OFFSET);
23
-        lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
24
-        m_chromaDistWeight[1] = lambdaOffset;
25
+            chroma_offset_idx = X265_MIN(qp - qpCr + 12, MAX_CHROMA_LAMBDA_OFFSET);
26
+            lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
27
+            m_chromaDistWeight[1] = lambdaOffset;
28
+        }
29
+        else
30
+            m_chromaDistWeight[0] = m_chromaDistWeight[1] = 256;
31
     }
32
 
33
     void setLambda(double lambda2, double lambda)
34
@@ -88,9 +94,9 @@
35
         m_lambda = (uint64_t)floor(256.0 * lambda);
36
     }
37
 
38
-    inline uint64_t calcRdCost(sse_ret_t distortion, uint32_t bits) const
39
+    inline uint64_t calcRdCost(sse_t distortion, uint32_t bits) const
40
     {
41
-#if X265_DEPTH <= 10
42
+#if X265_DEPTH < 10
43
         X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
44
                    "calcRdCost wrap detected dist: %u, bits %u, lambda: " X265_LL "\n",
45
                    distortion, bits, m_lambda2);
46
@@ -108,15 +114,18 @@
47
         return primitives.cu[size].psy_cost_pp(source, sstride, recon, rstride);
48
     }
49
 
50
-    /* return the difference in energy between the source block and the recon block */
51
-    inline int psyCost(int size, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride) const
52
-    {
53
-        return primitives.cu[size].psy_cost_ss(source, sstride, recon, rstride);
54
-    }
55
-
56
     /* return the RD cost of this prediction, including the effect of psy-rd */
57
-    inline uint64_t calcPsyRdCost(sse_ret_t distortion, uint32_t bits, uint32_t psycost) const
58
+    inline uint64_t calcPsyRdCost(sse_t distortion, uint32_t bits, uint32_t psycost) const
59
     {
60
+#if X265_DEPTH < 10
61
+        X265_CHECK((bits <= (UINT64_MAX / m_lambda2)) && (psycost <= UINT64_MAX / (m_lambda * m_psyRd)),
62
+                   "calcPsyRdCost wrap detected dist: %u, bits: %u, lambda: " X265_LL ", lambda2: " X265_LL "\n",
63
+                   distortion, bits, m_lambda, m_lambda2);
64
+#else
65
+        X265_CHECK((bits <= (UINT64_MAX / m_lambda2)) && (psycost <= UINT64_MAX / (m_lambda * m_psyRd)),
66
+                   "calcPsyRdCost wrap detected dist: " X265_LL ", bits: %u, lambda: " X265_LL ", lambda2: " X265_LL "\n",
67
+                   distortion, bits, m_lambda, m_lambda2);
68
+#endif
69
         return distortion + ((m_lambda * m_psyRd * psycost) >> 24) + ((bits * m_lambda2) >> 8);
70
     }
71
 
72
@@ -127,9 +136,9 @@
73
         return sadCost + ((bits * m_lambda + 128) >> 8);
74
     }
75
 
76
-    inline sse_ret_t scaleChromaDist(uint32_t plane, sse_ret_t dist) const
77
+    inline sse_t scaleChromaDist(uint32_t plane, sse_t dist) const
78
     {
79
-#if X265_DEPTH <= 10
80
+#if X265_DEPTH < 10
81
         X265_CHECK(dist <= (UINT64_MAX - 128) / m_chromaDistWeight[plane - 1],
82
                    "scaleChromaDist wrap detected dist: %u, lambda: %u\n",
83
                    dist, m_chromaDistWeight[plane - 1]);
84
@@ -138,11 +147,13 @@
85
                    "scaleChromaDist wrap detected dist: " X265_LL " lambda: %u\n",
86
                    dist, m_chromaDistWeight[plane - 1]);
87
 #endif
88
-        return (sse_ret_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
89
+        return (sse_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
90
     }
91
 
92
     inline uint32_t getCost(uint32_t bits) const
93
     {
94
+        X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda,
95
+                   "getCost wrap detected bits: %u, lambda: " X265_LL "\n", bits, m_lambda);
96
         return (uint32_t)((bits * m_lambda + 128) >> 8);
97
     }
98
 };
99
x265_1.8.tar.gz/source/encoder/reference.cpp -> x265_1.9.tar.gz/source/encoder/reference.cpp Changed
10
 
1
@@ -68,7 +68,7 @@
2
         intptr_t stride = reconPic->m_stride;
3
         int cuHeight = g_maxCUSize;
4
 
5
-        for (int c = 0; c < numInterpPlanes; c++)
6
+        for (int c = 0; c < (p.internalCsp != X265_CSP_I400 ? numInterpPlanes : 1); c++)
7
         {
8
             if (c == 1)
9
             {
10
x265_1.8.tar.gz/source/encoder/sao.cpp -> x265_1.9.tar.gz/source/encoder/sao.cpp Changed
201
 
1
@@ -73,9 +73,6 @@
2
 
3
 SAO::SAO()
4
 {
5
-    m_count = NULL;
6
-    m_offset = NULL;
7
-    m_offsetOrg = NULL;
8
     m_countPreDblk = NULL;
9
     m_offsetOrgPreDblk = NULL;
10
     m_refDepth = 0;
11
@@ -84,28 +81,22 @@
12
     m_param = NULL;
13
     m_clipTable = NULL;
14
     m_clipTableBase = NULL;
15
-    m_tmpU1[0] = NULL;
16
-    m_tmpU1[1] = NULL;
17
-    m_tmpU1[2] = NULL;
18
-    m_tmpU2[0] = NULL;
19
-    m_tmpU2[1] = NULL;
20
-    m_tmpU2[2] = NULL;
21
-    m_tmpL1 = NULL;
22
-    m_tmpL2 = NULL;
23
-
24
-    m_depthSaoRate[0][0] = 0;
25
-    m_depthSaoRate[0][1] = 0;
26
-    m_depthSaoRate[0][2] = 0;
27
-    m_depthSaoRate[0][3] = 0;
28
-    m_depthSaoRate[1][0] = 0;
29
-    m_depthSaoRate[1][1] = 0;
30
-    m_depthSaoRate[1][2] = 0;
31
-    m_depthSaoRate[1][3] = 0;
32
+    m_tmpU[0] = NULL;
33
+    m_tmpU[1] = NULL;
34
+    m_tmpU[2] = NULL;
35
+    m_tmpL1[0] = NULL;
36
+    m_tmpL1[1] = NULL;
37
+    m_tmpL1[2] = NULL;
38
+    m_tmpL2[0] = NULL;
39
+    m_tmpL2[1] = NULL;
40
+    m_tmpL2[2] = NULL;
41
+    m_depthSaoRate = NULL;
42
 }
43
 
44
-bool SAO::create(x265_param* param)
45
+bool SAO::create(x265_param* param, int initCommon)
46
 {
47
     m_param = param;
48
+    m_chromaFormat = param->internalCsp;
49
     m_hChromaShift = CHROMA_H_SHIFT(param->internalCsp);
50
     m_vChromaShift = CHROMA_V_SHIFT(param->internalCsp);
51
 
52
@@ -116,37 +107,56 @@
53
     const pixel rangeExt = maxY >> 1;
54
     int numCtu = m_numCuInWidth * m_numCuInHeight;
55
 
56
-    CHECKED_MALLOC(m_clipTableBase,  pixel, maxY + 2 * rangeExt);
57
-
58
-    CHECKED_MALLOC(m_tmpL1, pixel, g_maxCUSize + 1);
59
-    CHECKED_MALLOC(m_tmpL2, pixel, g_maxCUSize + 1);
60
-
61
-    for (int i = 0; i < 3; i++)
62
+    for (int i = 0; i < (param->internalCsp != X265_CSP_I400 ? 3 : 1); i++)
63
     {
64
+        CHECKED_MALLOC(m_tmpL1[i], pixel, g_maxCUSize + 1);
65
+        CHECKED_MALLOC(m_tmpL2[i], pixel, g_maxCUSize + 1);
66
+
67
         // SAO asm code will read 1 pixel before and after, so pad by 2
68
-        CHECKED_MALLOC(m_tmpU1[i], pixel, m_param->sourceWidth + 2);
69
-        m_tmpU1[i] += 1;
70
-        CHECKED_MALLOC(m_tmpU2[i], pixel, m_param->sourceWidth + 2);
71
-        m_tmpU2[i] += 1;
72
+        // NOTE: m_param->sourceWidth+2 enough, to avoid condition check in copySaoAboveRef(), I alloc more up to 63 bytes in here
73
+        CHECKED_MALLOC(m_tmpU[i], pixel, m_numCuInWidth * g_maxCUSize + 2 + 32);
74
+        m_tmpU[i] += 1;
75
     }
76
 
77
-    CHECKED_MALLOC(m_count, PerClass, NUM_PLANE);
78
-    CHECKED_MALLOC(m_offset, PerClass, NUM_PLANE);
79
-    CHECKED_MALLOC(m_offsetOrg, PerClass, NUM_PLANE);
80
-
81
-    CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
82
-    CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
83
-
84
-    m_clipTable = &(m_clipTableBase[rangeExt]);
85
-
86
-    for (int i = 0; i < rangeExt; i++)
87
-        m_clipTableBase[i] = 0;
88
+    if (initCommon)
89
+    {
90
+        CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
91
+        CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
92
+        CHECKED_MALLOC(m_depthSaoRate, double, 2 * SAO_DEPTHRATE_SIZE);
93
+
94
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 0] = 0;
95
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 1] = 0;
96
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 2] = 0;
97
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 3] = 0;
98
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 0] = 0;
99
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 1] = 0;
100
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 2] = 0;
101
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 3] = 0;
102
+
103
+        CHECKED_MALLOC(m_clipTableBase,  pixel, maxY + 2 * rangeExt);
104
+        m_clipTable = &(m_clipTableBase[rangeExt]);
105
+
106
+        // Share with fast clip lookup table
107
+        if (initCommon)
108
+        {
109
+            for (int i = 0; i < rangeExt; i++)
110
+                m_clipTableBase[i] = 0;
111
 
112
-    for (int i = 0; i < maxY; i++)
113
-        m_clipTable[i] = (pixel)i;
114
+            for (int i = 0; i < maxY; i++)
115
+                m_clipTable[i] = (pixel)i;
116
 
117
-    for (int i = maxY; i < maxY + rangeExt; i++)
118
-        m_clipTable[i] = maxY;
119
+            for (int i = maxY; i < maxY + rangeExt; i++)
120
+                m_clipTable[i] = maxY;
121
+        }
122
+    }
123
+    else
124
+    {
125
+        // must initialize these common pointer outside of function
126
+        m_countPreDblk = NULL;
127
+        m_offsetOrgPreDblk = NULL;
128
+        m_clipTableBase = NULL;
129
+        m_clipTable = NULL;
130
+    }
131
 
132
     return true;
133
 
134
@@ -154,34 +164,61 @@
135
     return false;
136
 }
137
 
138
-void SAO::destroy()
139
+void SAO::createFromRootNode(SAO* root)
140
 {
141
-    X265_FREE(m_clipTableBase);
142
-
143
-    X265_FREE(m_tmpL1);
144
-    X265_FREE(m_tmpL2);
145
+    X265_CHECK(m_countPreDblk == NULL, "duplicate initialize on m_countPreDblk");
146
+    X265_CHECK(m_offsetOrgPreDblk == NULL, "duplicate initialize on m_offsetOrgPreDblk");
147
+    X265_CHECK(m_depthSaoRate == NULL, "duplicate initialize on m_depthSaoRate");
148
+    X265_CHECK(m_clipTableBase == NULL, "duplicate initialize on m_clipTableBase");
149
+    X265_CHECK(m_clipTable == NULL, "duplicate initialize on m_clipTable");
150
+
151
+    m_countPreDblk = root->m_countPreDblk;
152
+    m_offsetOrgPreDblk = root->m_offsetOrgPreDblk;
153
+    m_depthSaoRate = root->m_depthSaoRate;
154
+    m_clipTableBase = root->m_clipTableBase; // Unnecessary
155
+    m_clipTable = root->m_clipTable;
156
+}
157
 
158
+void SAO::destroy(int destoryCommon)
159
+{
160
     for (int i = 0; i < 3; i++)
161
     {
162
-        if (m_tmpU1[i]) X265_FREE(m_tmpU1[i] - 1);
163
-        if (m_tmpU2[i]) X265_FREE(m_tmpU2[i] - 1);
164
+        if (m_tmpL1[i])
165
+        {
166
+            X265_FREE(m_tmpL1[i]);
167
+            m_tmpL1[i] = NULL;
168
+        }
169
+
170
+        if (m_tmpL2[i])
171
+        {
172
+            X265_FREE(m_tmpL2[i]);
173
+            m_tmpL2[i] = NULL;
174
+        }
175
+
176
+        if (m_tmpU[i])
177
+        {
178
+            X265_FREE(m_tmpU[i] - 1);
179
+            m_tmpU[i] = NULL;
180
+        }
181
     }
182
 
183
-    X265_FREE(m_count);
184
-    X265_FREE(m_offset);
185
-    X265_FREE(m_offsetOrg);
186
-    X265_FREE(m_countPreDblk);
187
-    X265_FREE(m_offsetOrgPreDblk);
188
+    if (destoryCommon)
189
+    {
190
+        X265_FREE_ZERO(m_countPreDblk);
191
+        X265_FREE_ZERO(m_offsetOrgPreDblk);
192
+        X265_FREE_ZERO(m_depthSaoRate);
193
+        X265_FREE_ZERO(m_clipTableBase);
194
+    }
195
 }
196
 
197
 /* allocate memory for SAO parameters */
198
 void SAO::allocSaoParam(SAOParam* saoParam) const
199
 {
200
+    int planes = (m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
201
x265_1.8.tar.gz/source/encoder/sao.h -> x265_1.9.tar.gz/source/encoder/sao.h Changed
80
 
1
@@ -62,6 +62,7 @@
2
     enum { NUM_EDGETYPE = 5 };
3
     enum { NUM_PLANE = 3 };
4
     enum { NUM_MERGE_MODE = 3 };
5
+    enum { SAO_DEPTHRATE_SIZE = 4 };
6
 
7
     static const uint32_t s_eoTable[NUM_EDGETYPE];
8
 
9
@@ -71,18 +72,19 @@
10
 protected:
11
 
12
     /* allocated per part */
13
-    PerClass*   m_count;
14
-    PerClass*   m_offset;
15
-    PerClass*   m_offsetOrg;
16
+    PerPlane    m_count;
17
+    PerPlane    m_offset;
18
+    PerPlane    m_offsetOrg;
19
 
20
     /* allocated per CTU */
21
     PerPlane*   m_countPreDblk;
22
     PerPlane*   m_offsetOrgPreDblk;
23
 
24
-    double      m_depthSaoRate[2][4];
25
-    int8_t      m_offsetBo[SAO_NUM_BO_CLASSES];
26
-    int8_t      m_offsetEo[NUM_EDGETYPE];
27
+    double*     m_depthSaoRate;
28
+    int8_t      m_offsetBo[NUM_PLANE][SAO_NUM_BO_CLASSES];
29
+    int8_t      m_offsetEo[NUM_PLANE][NUM_EDGETYPE];
30
 
31
+    int         m_chromaFormat;
32
     int         m_numCuInWidth;
33
     int         m_numCuInHeight;
34
     int         m_hChromaShift;
35
@@ -91,10 +93,9 @@
36
     pixel*      m_clipTable;
37
     pixel*      m_clipTableBase;
38
 
39
-    pixel*      m_tmpU1[3];
40
-    pixel*      m_tmpU2[3];
41
-    pixel*      m_tmpL1;
42
-    pixel*      m_tmpL2;
43
+    pixel*      m_tmpU[3];
44
+    pixel*      m_tmpL1[3];
45
+    pixel*      m_tmpL2[3];
46
 
47
 public:
48
 
49
@@ -119,8 +120,9 @@
50
 
51
     SAO();
52
 
53
-    bool create(x265_param* param);
54
-    void destroy();
55
+    bool create(x265_param* param, int initCommon);
56
+    void createFromRootNode(SAO *root);
57
+    void destroy(int destoryCommon);
58
 
59
     void allocSaoParam(SAOParam* saoParam) const;
60
 
61
@@ -131,6 +133,8 @@
62
     // CTU-based SAO process without slice granularity
63
     void processSaoCu(int addr, int typeIdx, int plane);
64
     void processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane);
65
+    void processSaoUnitCuLuma(SaoCtuParam* ctuParam, int idxY, int idxX);
66
+    void processSaoUnitCuChroma(SaoCtuParam* ctuParam[3], int idxY, int idxX);
67
 
68
     void copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc);
69
 
70
@@ -146,6 +150,9 @@
71
 
72
     void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);
73
     void rdoSaoUnitRow(SAOParam* saoParam, int idxY);
74
+    void rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr);
75
+
76
+    friend class FrameFilter;
77
 };
78
 
79
 }
80
x265_1.8.tar.gz/source/encoder/search.cpp -> x265_1.9.tar.gz/source/encoder/search.cpp Changed
201
 
1
@@ -2,6 +2,7 @@
2
 * Copyright (C) 2013 x265 project
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -80,7 +81,7 @@
10
     m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp);
11
 
12
     bool ok = m_quant.init(param.rdoqLevel, param.psyRdoq, scalingList, m_entropyCoder);
13
-    if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
14
+    if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
15
         ok &= m_quant.allocNoiseReduction(param);
16
 
17
     ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
18
@@ -97,13 +98,27 @@
19
      * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
20
      * which are reconstructed at each depth are valid. At the end, the transform depth table
21
      * is walked and the coeff and recon at the correct depths are collected */
22
-    for (uint32_t i = 0; i <= m_numLayers; i++)
23
+
24
+    if (param.internalCsp != X265_CSP_I400)
25
+    {
26
+        for (uint32_t i = 0; i <= m_numLayers; i++)
27
+        {
28
+            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
29
+            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
30
+            m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
31
+            ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
32
+            ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
33
+        }
34
+    }
35
+    else
36
     {
37
-        CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
38
-        m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
39
-        m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
40
-        ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
41
-        ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
42
+        for (uint32_t i = 0; i <= m_numLayers; i++)
43
+        {
44
+            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL);
45
+            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[2] = NULL;
46
+            ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
47
+            ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
48
+        }
49
     }
50
 
51
     /* the rest of these buffers are indexed per-depth */
52
@@ -116,12 +131,22 @@
53
         ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp);
54
     }
55
 
56
-    CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
57
-    m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
58
-    m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
59
-    CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
60
-    m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
61
-    m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
62
+    if (param.internalCsp != X265_CSP_I400)
63
+    {
64
+        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
65
+        m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
66
+        m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
67
+        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
68
+        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
69
+        m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
70
+    }
71
+    else
72
+    {
73
+        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions);
74
+        m_qtTempCbf[1] = m_qtTempCbf[2] = NULL;
75
+        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions);
76
+        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[2] = NULL;
77
+    }
78
 
79
     CHECKED_MALLOC(m_intraPred, pixel, (32 * 32) * (33 + 3));
80
     m_fencScaled = m_intraPred + 32 * 32;
81
@@ -163,12 +188,12 @@
82
     X265_FREE(m_tsRecon);
83
 }
84
 
85
-int Search::setLambdaFromQP(const CUData& ctu, int qp)
86
+int Search::setLambdaFromQP(const CUData& ctu, int qp, int lambdaQp)
87
 {
88
     X265_CHECK(qp >= QP_MIN && qp <= QP_MAX_MAX, "QP used for lambda is out of range\n");
89
 
90
     m_me.setQP(qp);
91
-    m_rdCost.setQP(*m_slice, qp);
92
+    m_rdCost.setQP(*m_slice, lambdaQp < 0 ? qp : lambdaQp);
93
 
94
     int quantQP = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
95
     m_quant.setQPforQuant(ctu, quantQP);
96
@@ -446,8 +471,9 @@
97
     }
98
 
99
     // set reconstruction for next intra prediction blocks if full TU prediction won
100
-    pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
101
-    intptr_t picStride = m_frame->m_reconPic->m_stride;
102
+    PicYuv*  reconPic = m_frame->m_reconPic;
103
+    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
104
+    intptr_t picStride = reconPic->m_stride;
105
     primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
106
 
107
     outCost.rdcost     += fullCost.rdcost;
108
@@ -530,7 +556,7 @@
109
             // no residual coded, recon = pred
110
             primitives.cu[sizeIdx].copy_pp(tmpRecon, tmpReconStride, pred, stride);
111
 
112
-        sse_ret_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
113
+        sse_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
114
 
115
         cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
116
         cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
117
@@ -611,8 +637,9 @@
118
     }
119
 
120
     // set reconstruction for next intra prediction blocks
121
-    pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
122
-    intptr_t picStride = m_frame->m_reconPic->m_stride;
123
+    PicYuv*  reconPic = m_frame->m_reconPic;
124
+    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
125
+    intptr_t picStride = reconPic->m_stride;
126
     primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
127
 
128
     outCost.rdcost += fullCost.rdcost;
129
@@ -661,8 +688,9 @@
130
         uint32_t sizeIdx   = log2TrSize - 2;
131
         primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
132
 
133
-        pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
134
-        intptr_t picStride = m_frame->m_reconPic->m_stride;
135
+        PicYuv*  reconPic = m_frame->m_reconPic;
136
+        pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
137
+        intptr_t picStride = reconPic->m_stride;
138
 
139
         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
140
         if (numSig)
141
@@ -750,7 +778,7 @@
142
 }
143
 
144
 /* returns distortion */
145
-uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
146
+void Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
147
 {
148
     CUData& cu = mode.cu;
149
     uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
150
@@ -758,10 +786,10 @@
151
     if (tuDepth < cu.m_tuDepth[absPartIdx])
152
     {
153
         uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
154
-        uint32_t outDist = 0, splitCbfU = 0, splitCbfV = 0;
155
+        uint32_t splitCbfU = 0, splitCbfV = 0;
156
         for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
157
         {
158
-            outDist += codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, psyEnergy);
159
+            codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, outCost);
160
             splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
161
             splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
162
         }
163
@@ -770,8 +798,7 @@
164
             cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
165
             cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
166
         }
167
-
168
-        return outDist;
169
+        return;
170
     }
171
 
172
     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
173
@@ -780,7 +807,7 @@
174
     {
175
         X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
176
         if (absPartIdx & 3)
177
-            return 0;
178
+            return;
179
         log2TrSizeC = 2;
180
         tuDepthC--;
181
     }
182
@@ -791,13 +818,15 @@
183
     bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
184
     checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
185
     if (checkTransformSkip)
186
-        return codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, psyEnergy);
187
+    {
188
+        codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, outCost);
189
+        return;
190
+    }
191
 
192
     ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
193
     uint32_t qtLayer = log2TrSize - 2;
194
     uint32_t stride = mode.fencYuv->m_csize;
195
     const uint32_t sizeIdxC = log2TrSizeC - 2;
196
-    sse_ret_t outDist = 0;
197
 
198
     uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
199
     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
200
@@ -821,8 +850,9 @@
201
x265_1.8.tar.gz/source/encoder/search.h -> x265_1.9.tar.gz/source/encoder/search.h Changed
181
 
1
@@ -2,6 +2,7 @@
2
 * Copyright (C) 2013 x265 project
3
 *
4
 * Authors: Steve Borho <steve@borho.org>
5
+*          Min Chen <chenm003@163.com>
6
 *
7
 * This program is free software; you can redistribute it and/or modify
8
 * it under the terms of the GNU General Public License as published by
9
@@ -84,8 +85,14 @@
10
     MV       mvp;
11
     int      mvpIdx;
12
     int      ref;
13
-    uint32_t cost;
14
     int      bits;
15
+    uint32_t mvCost;
16
+    uint32_t cost;
17
+
18
+    MotionData()
19
+    {
20
+        memset(this, 0, sizeof(MotionData));
21
+    }
22
 };
23
 
24
 struct Mode
25
@@ -105,16 +112,17 @@
26
     // temporal candidate.
27
     InterNeighbourMV interNeighbours[6];
28
 
29
-    uint64_t   rdCost;     // sum of partition (psy) RD costs          (sse(fenc, recon) + lambda2 * bits)
30
-    uint64_t   sa8dCost;   // sum of partition sa8d distortion costs   (sa8d(fenc, pred) + lambda * bits)
31
-    uint32_t   sa8dBits;   // signal bits used in sa8dCost calculation
32
-    uint32_t   psyEnergy;  // sum of partition psycho-visual energy difference
33
-    sse_ret_t  lumaDistortion;
34
-    sse_ret_t  chromaDistortion;
35
-    sse_ret_t  distortion; // sum of partition SSE distortion
36
-    uint32_t   totalBits;  // sum of partition bits (mv + coeff)
37
-    uint32_t   mvBits;     // Mv bits + Ref + block type (or intra mode)
38
-    uint32_t   coeffBits;  // Texture bits (DCT Coeffs)
39
+    uint64_t    rdCost;     // sum of partition (psy) RD costs          (sse(fenc, recon) + lambda2 * bits)
40
+    uint64_t    sa8dCost;   // sum of partition sa8d distortion costs   (sa8d(fenc, pred) + lambda * bits)
41
+    uint32_t    sa8dBits;   // signal bits used in sa8dCost calculation
42
+    uint32_t    psyEnergy;  // sum of partition psycho-visual energy difference
43
+    sse_t   resEnergy;  // sum of partition residual energy after motion prediction
44
+    sse_t   lumaDistortion;
45
+    sse_t   chromaDistortion;
46
+    sse_t  distortion; // sum of partition SSE distortion
47
+    uint32_t    totalBits;  // sum of partition bits (mv + coeff)
48
+    uint32_t    mvBits;     // Mv bits + Ref + block type (or intra mode)
49
+    uint32_t    coeffBits;  // Texture bits (DCT Coeffs)
50
 
51
     void initCosts()
52
     {
53
@@ -122,6 +130,7 @@
54
         sa8dCost = 0;
55
         sa8dBits = 0;
56
         psyEnergy = 0;
57
+        resEnergy = 0;
58
         lumaDistortion = 0;
59
         chromaDistortion = 0;
60
         distortion = 0;
61
@@ -130,62 +139,13 @@
62
         coeffBits = 0;
63
     }
64
 
65
-    void invalidate()
66
-    {
67
-        /* set costs to invalid data, catch uninitialized re-use */
68
-        rdCost = UINT64_MAX / 2;
69
-        sa8dCost = UINT64_MAX / 2;
70
-        sa8dBits = MAX_UINT / 2;
71
-        psyEnergy = MAX_UINT / 2;
72
-#if X265_DEPTH <= 10
73
-        lumaDistortion = MAX_UINT / 2;
74
-        chromaDistortion = MAX_UINT / 2;
75
-        distortion = MAX_UINT / 2;
76
-#else
77
-        lumaDistortion = UINT64_MAX / 2;
78
-        chromaDistortion = UINT64_MAX / 2;
79
-        distortion = UINT64_MAX / 2;
80
-#endif
81
-        totalBits = MAX_UINT / 2;
82
-        mvBits = MAX_UINT / 2;
83
-        coeffBits = MAX_UINT / 2;
84
-    }
85
-
86
-    bool ok() const
87
-    {
88
-#if X265_DEPTH <= 10
89
-        return !(rdCost >= UINT64_MAX / 2 ||
90
-            sa8dCost >= UINT64_MAX / 2 ||
91
-            sa8dBits >= MAX_UINT / 2 ||
92
-            psyEnergy >= MAX_UINT / 2 ||
93
-            lumaDistortion >= MAX_UINT / 2 ||
94
-            chromaDistortion >= MAX_UINT / 2 ||
95
-            distortion >= MAX_UINT / 2 ||
96
-            totalBits >= MAX_UINT / 2 ||
97
-            mvBits >= MAX_UINT / 2 ||
98
-            coeffBits >= MAX_UINT / 2);
99
-#else
100
-        return !(rdCost >= UINT64_MAX / 2 ||
101
-                 sa8dCost >= UINT64_MAX / 2 ||
102
-                 sa8dBits >= MAX_UINT / 2 ||
103
-                 psyEnergy >= MAX_UINT / 2 ||
104
-                 lumaDistortion >= UINT64_MAX / 2 ||
105
-                 chromaDistortion >= UINT64_MAX / 2 ||
106
-                 distortion >= UINT64_MAX / 2 ||
107
-                 totalBits >= MAX_UINT / 2 ||
108
-                 mvBits >= MAX_UINT / 2 ||
109
-                 coeffBits >= MAX_UINT / 2);
110
-#endif
111
-    }
112
-
113
     void addSubCosts(const Mode& subMode)
114
     {
115
-        X265_CHECK(subMode.ok(), "sub-mode not initialized");
116
-
117
         rdCost += subMode.rdCost;
118
         sa8dCost += subMode.sa8dCost;
119
         sa8dBits += subMode.sa8dBits;
120
         psyEnergy += subMode.psyEnergy;
121
+        resEnergy += subMode.resEnergy;
122
         lumaDistortion += subMode.lumaDistortion;
123
         chromaDistortion += subMode.chromaDistortion;
124
         distortion += subMode.distortion;
125
@@ -325,13 +285,13 @@
126
     ~Search();
127
 
128
     bool     initSearch(const x265_param& param, ScalingList& scalingList);
129
-    int      setLambdaFromQP(const CUData& ctu, int qp); /* returns real quant QP in valid spec range */
130
+    int      setLambdaFromQP(const CUData& ctu, int qp, int lambdaQP = -1); /* returns real quant QP in valid spec range */
131
 
132
     // mark temp RD entropy contexts as uninitialized; useful for finding loads without stores
133
     void     invalidateContexts(int fromDepth);
134
 
135
-    // full RD search of intra modes. if sharedModes is not NULL, it directly uses them
136
-    void     checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes, uint8_t* sharedChromaModes);
137
+    // full RD search of intra modes
138
+    void     checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSizes);
139
 
140
     // select best intra mode using only sa8d costs, cannot measure NxN intra
141
     void     checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
142
@@ -397,10 +357,10 @@
143
     void     saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth);
144
 
145
     // RDO search of luma intra modes; result is fully encoded luma. luma distortion is returned
146
-    uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes);
147
+    sse_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2]);
148
 
149
     // RDO select best chroma mode from luma; result is fully encoded chroma. chroma distortion is returned
150
-    uint32_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom, uint8_t* sharedChromaModes);
151
+    sse_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom);
152
 
153
     void     codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx);
154
     void     codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2]);
155
@@ -410,12 +370,12 @@
156
     {
157
         uint64_t rdcost;
158
         uint32_t bits;
159
-        sse_ret_t distortion;
160
+        sse_t distortion;
161
         uint32_t energy;
162
         Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
163
     };
164
 
165
-    uint64_t estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId);
166
+    uint64_t estimateNullCbfCost(sse_t dist, uint32_t psyEnergy, uint32_t tuDepth, TextType compId);
167
     void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2]);
168
 
169
     // generate prediction, generate residual and recon. if bAllowSplit, find optimal RQT splits
170
@@ -424,8 +384,8 @@
171
     void     extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx);
172
 
173
     // generate chroma prediction, generate residual and recon
174
-    uint32_t codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy);
175
-    uint32_t codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy);
176
+    void     codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost);
177
+    void     codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, Cost& outCost);
178
     void     extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth);
179
 
180
     // reshuffle CBF flags after coding a pair of 4:2:2 chroma blocks
181
x265_1.8.tar.gz/source/encoder/sei.h -> x265_1.9.tar.gz/source/encoder/sei.h Changed
51
 
1
@@ -163,12 +163,6 @@
2
 
3
     PayloadType payloadType() const { return CONTENT_LIGHT_LEVEL_INFO; }
4
 
5
-    bool parse(const char* value)
6
-    {
7
-        return sscanf(value, "%hu,%hu",
8
-                      &max_content_light_level, &max_pic_average_light_level) == 2;
9
-    }
10
-
11
     void write(Bitstream& bs, const SPS&)
12
     {
13
         m_bitIf = &bs;
14
@@ -195,29 +189,31 @@
15
 
16
     uint8_t m_digest[3][16];
17
 
18
-    void write(Bitstream& bs, const SPS&)
19
+    void write(Bitstream& bs, const SPS& sps)
20
     {
21
         m_bitIf = &bs;
22
 
23
+        int planes = (sps.chromaFormatIdc != X265_CSP_I400) ? 3 : 1;
24
+
25
         WRITE_CODE(DECODED_PICTURE_HASH, 8, "payload_type");
26
 
27
         switch (m_method)
28
         {
29
         case MD5:
30
-            WRITE_CODE(1 + 16 * 3, 8, "payload_size");
31
+            WRITE_CODE(1 + 16 * planes, 8, "payload_size");
32
             WRITE_CODE(MD5, 8, "hash_type");
33
             break;
34
         case CRC:
35
-            WRITE_CODE(1 + 2 * 3, 8, "payload_size");
36
+            WRITE_CODE(1 + 2 * planes, 8, "payload_size");
37
             WRITE_CODE(CRC, 8, "hash_type");
38
             break;
39
         case CHECKSUM:
40
-            WRITE_CODE(1 + 4 * 3, 8, "payload_size");
41
+            WRITE_CODE(1 + 4 * planes, 8, "payload_size");
42
             WRITE_CODE(CHECKSUM, 8, "hash_type");
43
             break;
44
         }
45
 
46
-        for (int yuvIdx = 0; yuvIdx < 3; yuvIdx++)
47
+        for (int yuvIdx = 0; yuvIdx < planes; yuvIdx++)
48
         {
49
             if (m_method == MD5)
50
             {
51
x265_1.8.tar.gz/source/encoder/slicetype.cpp -> x265_1.9.tar.gz/source/encoder/slicetype.cpp Changed
201
 
1
@@ -83,8 +83,11 @@
2
     uint32_t var;
3
 
4
     var  = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, csp);
5
-    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp);
6
-    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp);
7
+    if (csp != X265_CSP_I400)
8
+    {
9
+        var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp);
10
+        var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp);
11
+    }
12
     x265_emms();
13
     return var;
14
 }
15
@@ -96,6 +99,7 @@
16
     int maxRow = curFrame->m_fencPic->m_picHeight;
17
     int blockCount = curFrame->m_lowres.maxBlocksInRow * curFrame->m_lowres.maxBlocksInCol;
18
 
19
+    float* quantOffsets = curFrame->m_quantOffsets;
20
     for (int y = 0; y < 3; y++)
21
     {
22
         curFrame->m_lowres.wp_ssd[y] = 0;
23
@@ -113,10 +117,21 @@
24
 
25
         if (param->rc.aqMode && param->rc.aqStrength == 0)
26
         {
27
-            memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
28
-            memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
29
-            for (int cuxy = 0; cuxy < cuCount; cuxy++)
30
-                curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
31
+            if (quantOffsets)
32
+            {
33
+                for (int cuxy = 0; cuxy < cuCount; cuxy++)
34
+                {
35
+                    curFrame->m_lowres.qpCuTreeOffset[cuxy] = curFrame->m_lowres.qpAqOffset[cuxy] = quantOffsets[cuxy];
36
+                    curFrame->m_lowres.invQscaleFactor[cuxy] = x265_exp2fix8(curFrame->m_lowres.qpCuTreeOffset[cuxy]);
37
+                }
38
+            }
39
+            else
40
+            {
41
+                memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
42
+                memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
43
+                for (int cuxy = 0; cuxy < cuCount; cuxy++)
44
+                    curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
45
+            }
46
         }
47
 
48
         /* Need variance data for weighted prediction */
49
@@ -135,19 +150,25 @@
50
         if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
51
         {
52
             double bit_depth_correction = 1.f / (1 << (2*(X265_DEPTH-8)));
53
+            curFrame->m_lowres.frameVariance = 0;
54
+            uint64_t rowVariance = 0;
55
             for (blockY = 0; blockY < maxRow; blockY += 16)
56
             {
57
+                rowVariance = 0;
58
                 for (blockX = 0; blockX < maxCol; blockX += 16)
59
                 {
60
                     uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
61
+                    curFrame->m_lowres.blockVariance[blockXY] = energy;
62
+                    rowVariance += energy;
63
                     qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
64
                     curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
65
                     avg_adj += qp_adj;
66
                     avg_adj_pow2 += qp_adj * qp_adj;
67
                     blockXY++;
68
                 }
69
+                curFrame->m_lowres.frameVariance += (rowVariance / maxCol);
70
             }
71
-
72
+            curFrame->m_lowres.frameVariance /= maxRow;
73
             avg_adj /= blockCount;
74
             avg_adj_pow2 /= blockCount;
75
             strength = param->rc.aqStrength * avg_adj;
76
@@ -177,6 +198,8 @@
77
                     uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
78
                     qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (14.427f + 2 * (X265_DEPTH - 8)));
79
                 }
80
+                if (quantOffsets != NULL)
81
+                    qp_adj += quantOffsets[blockXY];
82
                 curFrame->m_lowres.qpAqOffset[blockXY] = qp_adj;
83
                 curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
84
                 curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(qp_adj);
85
@@ -328,7 +351,7 @@
86
 
87
         primitives.weight_pp(ref.buffer[0], wbuffer[0], stride, widthHeight, paddedLines,
88
             scale, round << correction, denom + correction, offset);
89
-        src = weightedRef.fpelPlane[0];
90
+        src = fenc.weightedRef[fenc.frameNum - ref.frameNum].fpelPlane[0];
91
     }
92
 
93
     uint32_t cost = 0;
94
@@ -350,7 +373,6 @@
95
 bool LookaheadTLD::allocWeightedRef(Lowres& fenc)
96
 {
97
     intptr_t planesize = fenc.buffer[1] - fenc.buffer[0];
98
-    intptr_t padoffset = fenc.lowresPlane[0] - fenc.buffer[0];
99
     paddedLines = (int)(planesize / fenc.lumaStride);
100
 
101
     wbuffer[0] = X265_MALLOC(pixel, 4 * planesize);
102
@@ -363,14 +385,6 @@
103
     else
104
         return false;
105
 
106
-    for (int i = 0; i < 4; i++)
107
-        weightedRef.lowresPlane[i] = wbuffer[i] + padoffset;
108
-
109
-    weightedRef.fpelPlane[0] = weightedRef.lowresPlane[0];
110
-    weightedRef.lumaStride = fenc.lumaStride;
111
-    weightedRef.isLowres = true;
112
-    weightedRef.isWeighted = false;
113
-
114
     return true;
115
 }
116
 
117
@@ -388,6 +402,16 @@
118
             return;
119
     }
120
 
121
+    ReferencePlanes& weightedRef = fenc.weightedRef[deltaIndex];
122
+    intptr_t padoffset = fenc.lowresPlane[0] - fenc.buffer[0];
123
+    for (int i = 0; i < 4; i++)
124
+        weightedRef.lowresPlane[i] = wbuffer[i] + padoffset;
125
+
126
+    weightedRef.fpelPlane[0] = weightedRef.lowresPlane[0];
127
+    weightedRef.lumaStride = fenc.lumaStride;
128
+    weightedRef.isLowres = true;
129
+    weightedRef.isWeighted = false;
130
+
131
     /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
132
     float guessScale, fencMean, refMean;
133
     x265_emms();
134
@@ -478,7 +502,13 @@
135
 
136
     m_8x8Height = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
137
     m_8x8Width = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
138
-    m_8x8Blocks = m_8x8Width > 2 && m_8x8Height > 2 ? (m_8x8Width - 2) * (m_8x8Height - 2) : m_8x8Width * m_8x8Height;
139
+    m_cuCount = m_8x8Width * m_8x8Height;
140
+    m_8x8Blocks = m_8x8Width > 2 && m_8x8Height > 2 ? (m_cuCount + 4 - 2 * (m_8x8Width + m_8x8Height)) : m_cuCount;
141
+
142
+    /* Allow the strength to be adjusted via qcompress, since the two concepts
143
+     * are very similar. */
144
+
145
+    m_cuTreeStrength = 5.0 * (1.0 - m_param->rc.qCompress);
146
 
147
     m_lastKeyframe = -m_param->keyframeMax;
148
     m_sliceTypeBusy = false;
149
@@ -502,7 +532,16 @@
150
     m_bBatchFrameCosts = m_bBatchMotionSearch;
151
 
152
     if (m_param->lookaheadSlices && !m_pool)
153
+    {
154
+        x265_log(param, X265_LOG_WARNING, "No pools found; disabling lookahead-slices\n");
155
+        m_param->lookaheadSlices = 0;
156
+    }
157
+
158
+    if (m_param->lookaheadSlices && (m_param->sourceHeight < 720))
159
+    {
160
+        x265_log(param, X265_LOG_WARNING, "Source height < 720p; disabling lookahead-slices\n");
161
         m_param->lookaheadSlices = 0;
162
+    }
163
 
164
     if (m_param->lookaheadSlices > 1)
165
     {
166
@@ -715,16 +754,16 @@
167
 
168
     case P_SLICE:
169
         b = p1 = poc - l0poc;
170
-        frames[p0] = &slice->m_refPicList[0][0]->m_lowres;
171
+        frames[p0] = &slice->m_refFrameList[0][0]->m_lowres;
172
         frames[b] = &curFrame->m_lowres;
173
         break;
174
 
175
     case B_SLICE:
176
         b = poc - l0poc;
177
         p1 = b + l1poc - poc;
178
-        frames[p0] = &slice->m_refPicList[0][0]->m_lowres;
179
+        frames[p0] = &slice->m_refFrameList[0][0]->m_lowres;
180
         frames[b] = &curFrame->m_lowres;
181
-        frames[p1] = &slice->m_refPicList[1][0]->m_lowres;
182
+        frames[p1] = &slice->m_refFrameList[1][0]->m_lowres;
183
         break;
184
 
185
     default:
186
@@ -736,10 +775,13 @@
187
     if (m_param->rc.cuTree && !m_param->rc.bStatRead)
188
         /* update row satds based on cutree offsets */
189
         curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b);
190
-    else if (m_param->rc.aqMode)
191
-        curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b];
192
-    else
193
-        curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b];
194
+    else if (m_param->analysisMode != X265_ANALYSIS_LOAD)
195
+    {
196
+        if (m_param->rc.aqMode)
197
+            curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b];
198
+        else
199
+            curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b];
200
+    }
201
x265_1.8.tar.gz/source/encoder/slicetype.h -> x265_1.9.tar.gz/source/encoder/slicetype.h Changed
59
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -44,7 +45,6 @@
10
 struct LookaheadTLD
11
 {
12
     MotionEstimate  me;
13
-    ReferencePlanes weightedRef;
14
     pixel*          wbuffer[4];
15
     int             widthInCU;
16
     int             heightInCU;
17
@@ -103,29 +103,30 @@
18
     PicList       m_outputQueue;     // pictures to be encoded, in encode order
19
     Lock          m_inputLock;
20
     Lock          m_outputLock;
21
-
22
-    /* pre-lookahead */
23
-    int           m_fullQueueSize;
24
-    bool          m_isActive;
25
-    bool          m_sliceTypeBusy;
26
-    bool          m_bAdaptiveQuant;
27
-    bool          m_outputSignalRequired;
28
-    bool          m_bBatchMotionSearch;
29
-    bool          m_bBatchFrameCosts;
30
     Event         m_outputSignal;
31
-
32
     LookaheadTLD* m_tld;
33
     x265_param*   m_param;
34
     Lowres*       m_lastNonB;
35
     int*          m_scratch;         // temp buffer for cutree propagate
36
-    
37
+
38
+    /* pre-lookahead */
39
+    int           m_fullQueueSize;
40
     int           m_histogram[X265_BFRAME_MAX + 1];
41
     int           m_lastKeyframe;
42
     int           m_8x8Width;
43
     int           m_8x8Height;
44
     int           m_8x8Blocks;
45
+    int           m_cuCount;
46
     int           m_numCoopSlices;
47
     int           m_numRowsPerSlice;
48
+    double        m_cuTreeStrength;
49
+
50
+    bool          m_isActive;
51
+    bool          m_sliceTypeBusy;
52
+    bool          m_bAdaptiveQuant;
53
+    bool          m_outputSignalRequired;
54
+    bool          m_bBatchMotionSearch;
55
+    bool          m_bBatchFrameCosts;
56
     bool          m_filled;
57
     bool          m_isSceneTransition;
58
     Lookahead(x265_param *param, ThreadPool *pool);
59
x265_1.8.tar.gz/source/encoder/weightPrediction.cpp -> x265_1.9.tar.gz/source/encoder/weightPrediction.cpp Changed
43
 
1
@@ -4,6 +4,7 @@
2
  * Author: Shazeb Nawaz Khan <shazeb@multicorewareinc.com>
3
  *         Steve Borho <steve@borho.org>
4
  *         Kavitha Sampas <kavitha@multicorewareinc.com>
5
+ *         Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -259,13 +260,13 @@
10
     for (int list = 0; list < cache.numPredDir; list++)
11
     {
12
         WeightParam *weights = wp[list][0];
13
-        Frame *refFrame = slice.m_refPicList[list][0];
14
+        Frame *refFrame = slice.m_refFrameList[list][0];
15
         Lowres& refLowres = refFrame->m_lowres;
16
         int diffPoc = abs(curPoc - refFrame->m_poc);
17
 
18
         /* prepare estimates */
19
         float guessScale[3], fencMean[3], refMean[3];
20
-        for (int plane = 0; plane < 3; plane++)
21
+        for (int plane = 0; plane < (param.internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
22
         {
23
             SET_WEIGHT(weights[plane], false, 1, 0, 0);
24
             uint64_t fencVar = fenc.wp_ssd[plane] + !refLowres.wp_ssd[plane];
25
@@ -289,7 +290,7 @@
26
 
27
         MV *mvs = NULL;
28
 
29
-        for (int plane = 0; plane < 3; plane++)
30
+        for (int plane = 0; plane < (param.internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
31
         {
32
             denom = plane ? chromaDenom : lumaDenom;
33
             if (plane && !weights[0].bPresentFlag)
34
@@ -328,7 +329,7 @@
35
                 {
36
                     /* reference chroma planes must be extended prior to being
37
                      * used as motion compensation sources */
38
-                    if (!refFrame->m_bChromaExtended)
39
+                    if (!refFrame->m_bChromaExtended && param.internalCsp != X265_CSP_I400)
40
                     {
41
                         refFrame->m_bChromaExtended = true;
42
                         PicYuv *refPic = refFrame->m_fencPic;
43
x265_1.8.tar.gz/source/output/y4m.cpp -> x265_1.9.tar.gz/source/output/y4m.cpp Changed
10
 
1
@@ -70,7 +70,7 @@
2
         x265_log(NULL, X265_LOG_WARNING, "y4m: forcing reconstructed pixels to 8 bits\n");
3
 #endif
4
 
5
-    X265_CHECK(pic.colorSpace == colorSpace, "invalid color space\n");
6
+    X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
7
 
8
 #if HIGH_BIT_DEPTH
9
 
10
x265_1.8.tar.gz/source/output/yuv.cpp -> x265_1.9.tar.gz/source/output/yuv.cpp Changed
10
 
1
@@ -53,7 +53,7 @@
2
     uint64_t fileOffset = pic.poc;
3
     fileOffset *= frameSize;
4
 
5
-    X265_CHECK(pic.colorSpace == colorSpace, "invalid color space\n");
6
+    X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
7
     X265_CHECK(pic.bitDepth == (int)depth, "invalid bit depth\n");
8
 
9
 #if HIGH_BIT_DEPTH
10
x265_1.8.tar.gz/source/profile/vtune/CMakeLists.txt -> x265_1.9.tar.gz/source/profile/vtune/CMakeLists.txt Changed
5
 
1
@@ -1,2 +1,2 @@
2
-include_directories($ENV{VTUNE_AMPLIFIER_XE_2015_DIR}/include)
3
+include_directories(${VTUNE_INCLUDE_DIR})
4
 add_library(vtune vtune.h vtune.cpp ../cpuEvents.h)
5
x265_1.8.tar.gz/source/profile/vtune/vtune.cpp -> x265_1.9.tar.gz/source/profile/vtune/vtune.cpp Changed
19
 
1
@@ -30,7 +30,6 @@
2
 const char *stringNames[] =
3
 {
4
 #include "../cpuEvents.h"
5
-    ""
6
 };
7
 #undef CPU_EVENT
8
 
9
@@ -44,7 +43,8 @@
10
 void vtuneInit()
11
 {
12
     domain = __itt_domain_create("x265");
13
-    for (size_t i = 0; i < sizeof(stringNames) / sizeof(const char *); i++)
14
+    size_t length = sizeof(stringNames) / sizeof(const char *);
15
+    for (size_t i = 0; i < length; i++)
16
         taskHandle[i] = __itt_string_handle_create(stringNames[i]);
17
 }
18
 
19
x265_1.8.tar.gz/source/test/checkasm-a.asm -> x265_1.9.tar.gz/source/test/checkasm-a.asm Changed
13
 
1
@@ -2,9 +2,11 @@
2
 ;* checkasm-a.asm: assembly check tool
3
 ;*****************************************************************************
4
 ;* Copyright (C) 2008-2014 x264 project
5
+;* Copyright (C) 2013-2015 x265 project
6
 ;*
7
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8
 ;*          Henrik Gramner <henrik@gramner.com>
9
+;*          Min Chen <chenm003@163.com>
10
 ;*
11
 ;* This program is free software; you can redistribute it and/or modify
12
 ;* it under the terms of the GNU General Public License as published by
13
x265_1.8.tar.gz/source/test/intrapredharness.cpp -> x265_1.9.tar.gz/source/test/intrapredharness.cpp Changed
10
 
1
@@ -130,6 +130,8 @@
2
                 if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width * sizeof(pixel)))
3
                 {
4
                     printf("ang_%dx%d, Mode = %d, Row = %d failed !!\n", width, width, pmode, k);
5
+                    ref[pmode](pixel_out_c, stride, pixel_buff + j, pmode, bFilter);
6
+                    opt[pmode](pixel_out_vec, stride, pixel_buff + j, pmode, bFilter);
7
                     return false;
8
                 }
9
             }
10
x265_1.8.tar.gz/source/test/ipfilterharness.h -> x265_1.9.tar.gz/source/test/ipfilterharness.h Changed
9
 
1
@@ -4,6 +4,7 @@
2
  * Authors: Deepthi Devaki <deepthidevaki@multicorewareinc.com>,
3
  *          Rajesh Paulraj <rajesh@multicorewareinc.com>
4
  *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/test/pixelharness.cpp -> x265_1.9.tar.gz/source/test/pixelharness.cpp Changed
201
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -41,6 +42,7 @@
10
         int_test_buff[0][i]     = rand() % SHORT_MAX;
11
         ushort_test_buff[0][i]  = rand() % ((1 << 16) - 1);
12
         uchar_test_buff[0][i]   = rand() % ((1 << 8) - 1);
13
+        residual_test_buff[0][i] = (rand() % (2 * RMAX + 1)) - RMAX - 1;// For sse_ss only
14
 
15
         pixel_test_buff[1][i]   = PIXEL_MIN;
16
         short_test_buff[1][i]   = SMIN;
17
@@ -49,6 +51,7 @@
18
         int_test_buff[1][i]     = SHORT_MIN;
19
         ushort_test_buff[1][i]  = PIXEL_MIN;
20
         uchar_test_buff[1][i]   = PIXEL_MIN;
21
+        residual_test_buff[1][i] = RMIN;
22
 
23
         pixel_test_buff[2][i]   = PIXEL_MAX;
24
         short_test_buff[2][i]   = SMAX;
25
@@ -57,6 +60,7 @@
26
         int_test_buff[2][i]     = SHORT_MAX;
27
         ushort_test_buff[2][i]  = ((1 << 16) - 1);
28
         uchar_test_buff[2][i]   = 255;
29
+        residual_test_buff[2][i] = RMAX;
30
 
31
         pbuf1[i] = rand() & PIXEL_MAX;
32
         pbuf2[i] = rand() & PIXEL_MAX;
33
@@ -103,8 +107,8 @@
34
     {
35
         int index1 = rand() % TEST_CASES;
36
         int index2 = rand() % TEST_CASES;
37
-        sse_ret_t vres = (sse_ret_t)checked(opt, pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
38
-        sse_ret_t cres = ref(pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
39
+        sse_t vres = (sse_t)checked(opt, pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
40
+        sse_t cres = ref(pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
41
         if (vres != cres)
42
             return false;
43
 
44
@@ -124,8 +128,8 @@
45
     {
46
         int index1 = rand() % TEST_CASES;
47
         int index2 = rand() % TEST_CASES;
48
-        sse_ret_t vres = (sse_ret_t)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
49
-        sse_ret_t cres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
50
+        sse_t vres = (sse_t)checked(opt, residual_test_buff[index1], stride, residual_test_buff[index2] + j, stride);
51
+        sse_t cres = ref(residual_test_buff[index1], stride, residual_test_buff[index2] + j, stride);
52
         if (vres != cres)
53
             return false;
54
 
55
@@ -227,8 +231,8 @@
56
     {
57
         // NOTE: stride must be multiple of 16, because minimum block is 4x4
58
         int stride = (STRIDE + (rand() % STRIDE)) & ~15;
59
-        int cres = ref(sbuf1 + j, stride);
60
-        int vres = (int)checked(opt, sbuf1 + j, (intptr_t)stride);
61
+        sse_t cres = ref(sbuf1 + j, stride);
62
+        sse_t vres = (sse_t)checked(opt, sbuf1 + j, (intptr_t)stride);
63
 
64
         if (cres != vres)
65
             return false;
66
@@ -854,7 +858,7 @@
67
         int width = (rand() % 4) + 1; // range[1-4]
68
         float cres = ref(sum0, sum1, width);
69
         float vres = checked_float(opt, sum0, sum1, width);
70
-        if (fabs(vres - cres) > 0.00001)
71
+        if (fabs(vres - cres) > 0.0001)
72
             return false;
73
 
74
         reportfail();
75
@@ -1061,8 +1065,8 @@
76
         int endX = MAX_CU_SIZE - (rand() % 5);
77
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
78
 
79
-        ref(pbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_ref, count_ref);
80
-        checked(opt, pbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_vec, count_vec);
81
+        ref(sbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_ref, count_ref);
82
+        checked(opt, sbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_vec, count_vec);
83
 
84
         if (memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref)))
85
             return false;
86
@@ -1097,8 +1101,8 @@
87
         int endX = MAX_CU_SIZE - (rand() % 5) - 1;
88
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
89
 
90
-        ref(pbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_ref, count_ref);
91
-        checked(opt, pbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_vec, count_vec);
92
+        ref(sbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_ref, count_ref);
93
+        checked(opt, sbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_vec, count_vec);
94
 
95
         if (memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref)))
96
             return false;
97
@@ -1141,8 +1145,8 @@
98
         int endX = MAX_CU_SIZE - (rand() % 5);
99
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
100
 
101
-        ref(pbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
102
-        checked(opt, pbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
103
+        ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
104
+        checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
105
 
106
         if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
107
             || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
108
@@ -1193,8 +1197,8 @@
109
         int endX = MAX_CU_SIZE - (rand() % 5) - 1;
110
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
111
 
112
-        ref(pbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, upBufft_ref, endX, endY, stats_ref, count_ref);
113
-        checked(opt, pbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, upBufft_vec, endX, endY, stats_vec, count_vec);
114
+        ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, upBufft_ref, endX, endY, stats_ref, count_ref);
115
+        checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, upBufft_vec, endX, endY, stats_vec, count_vec);
116
 
117
         // TODO: don't check upBuff*; the latest output pixels are different, and it can be moved into a stack temporary buffer in the future
118
         if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
119
@@ -1244,8 +1248,8 @@
120
         int endX = MAX_CU_SIZE - (rand() % 5) - 1;
121
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
122
 
123
-        ref(pbuf2, pbuf3, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
124
-        checked(opt, pbuf2, pbuf3, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
125
+        ref(sbuf2, pbuf3, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
126
+        checked(opt, sbuf2, pbuf3, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
127
 
128
         if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
129
             || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
130
@@ -1295,8 +1299,8 @@
131
 
132
     memset(ref_dest, 0xCD, sizeof(ref_dest));
133
     memset(opt_dest, 0xCD, sizeof(opt_dest));
134
-    int width = 32 + rand() % 32;
135
-    int height = 32 + rand() % 32;
136
+    int width = 32 + (rand() % 32);
137
+    int height = 32 + (rand() % 32);
138
     intptr_t srcStride = 64;
139
     intptr_t dstStride = width;
140
     int j = 0;
141
@@ -1304,11 +1308,23 @@
142
     for (int i = 0; i < ITERS; i++)
143
     {
144
         int index = i % TEST_CASES;
145
+
146
         checked(opt, ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
147
         ref(ushort_test_buff[index] + j, srcStride, ref_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
148
 
149
-        if (memcmp(ref_dest, opt_dest, width * height * sizeof(pixel)))
150
+        if (memcmp(ref_dest, opt_dest, dstStride * height * sizeof(pixel)))
151
+        {
152
+            memcpy(opt_dest, ref_dest, sizeof(ref_dest));
153
+            opt(ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
154
             return false;
155
+        }
156
+
157
+        // check tail memory area
158
+        for(int x = width; x < dstStride; x++)
159
+        {
160
+            if (opt_dest[(height - 1 * dstStride) + x] != 0xCD)
161
+                return false;
162
+        }
163
 
164
         reportfail();
165
         j += INCR;
166
@@ -1340,6 +1356,13 @@
167
         if (memcmp(ref_dest, opt_dest, sizeof(ref_dest)))
168
             return false;
169
 
170
+        // check tail memory area
171
+        for(int x = width; x < dstStride; x++)
172
+        {
173
+            if (opt_dest[(height - 1 * dstStride) + x] != 0xCD)
174
+                return false;
175
+        }
176
+
177
         reportfail();
178
         j += INCR;
179
     }
180
@@ -1356,16 +1379,16 @@
181
     memset(opt_dest, 0xCD, sizeof(opt_dest));
182
 
183
     double fps = 1.0;
184
-    int width = 16 + rand() % 64;
185
     int j = 0;
186
 
187
     for (int i = 0; i < ITERS; i++)
188
     {
189
+        int width = 16 + rand() % 64;
190
         int index = i % TEST_CASES;
191
         checked(opt, opt_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width);
192
         ref(ref_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width);
193
 
194
-        if (memcmp(ref_dest, opt_dest, width * sizeof(pixel)))
195
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
196
             return false;
197
 
198
         reportfail();
199
@@ -1397,28 +1420,6 @@
200
     return true;
201
x265_1.8.tar.gz/source/test/pixelharness.h -> x265_1.9.tar.gz/source/test/pixelharness.h Changed
43
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -40,6 +41,8 @@
10
     enum { TEST_CASES = 3 };
11
     enum { SMAX = 1 << 12 };
12
     enum { SMIN = -1 << 12 };
13
+    enum { RMAX = PIXEL_MAX - PIXEL_MIN }; //The maximum value obtained by subtracting pixel values (residual max)
14
+    enum { RMIN = PIXEL_MIN - PIXEL_MAX }; //The minimum value obtained by subtracting pixel values (residual min)
15
 
16
     ALIGN_VAR_32(pixel, pbuf1[BUFFSIZE]);
17
     pixel    pbuf2[BUFFSIZE];
18
@@ -64,6 +67,7 @@
19
     uint16_t ushort_test_buff[TEST_CASES][BUFFSIZE];
20
     uint8_t  uchar_test_buff[TEST_CASES][BUFFSIZE];
21
     double   double_test_buff[TEST_CASES][BUFFSIZE];
22
+    int16_t  residual_test_buff[TEST_CASES][BUFFSIZE];
23
 
24
     bool check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt);
25
     bool check_pixel_sse(pixel_sse_t ref, pixel_sse_t opt);
26
@@ -110,12 +114,15 @@
27
     bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
28
     bool check_cutree_propagate_cost(cutree_propagate_cost ref, cutree_propagate_cost opt);
29
     bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
30
-    bool check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt);
31
     bool check_calSign(sign_t ref, sign_t opt);
32
     bool check_scanPosLast(scanPosLast_t ref, scanPosLast_t opt);
33
     bool check_findPosFirstLast(findPosFirstLast_t ref, findPosFirstLast_t opt);
34
     bool check_costCoeffNxN(costCoeffNxN_t ref, costCoeffNxN_t opt);
35
     bool check_costCoeffRemain(costCoeffRemain_t ref, costCoeffRemain_t opt);
36
+    bool check_costC1C2Flag(costC1C2Flag_t ref, costC1C2Flag_t opt);
37
+    bool check_planeClipAndMax(planeClipAndMax_t ref, planeClipAndMax_t opt);
38
+    bool check_pelFilterLumaStrong_V(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
39
+    bool check_pelFilterLumaStrong_H(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
40
 
41
 public:
42
 
43
x265_1.8.tar.gz/source/test/regression-tests.txt -> x265_1.9.tar.gz/source/test/regression-tests.txt Changed
192
 
1
@@ -11,124 +11,132 @@
2
 # consistent across many machines, you must force a certain -FN so it is
3
 # not auto-detected.
4
 
5
+BasketballDrive_1920x1080_50.y4m,--preset ultrafast --signhide --colormatrix bt709
6
+BasketballDrive_1920x1080_50.y4m,--preset superfast --psy-rd 1 --ctu 16 --no-wpp --limit-modes
7
+BasketballDrive_1920x1080_50.y4m,--preset veryfast --tune zerolatency --no-temporal-mvp
8
 BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190
9
 BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless
10
 BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao
11
+BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 7000 --limit-modes,--preset medium --no-cutree --analysis-mode=load --bitrate 7000 --limit-modes
12
 BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16 --limit-refs 1
13
 BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0
14
-BasketballDrive_1920x1080_50.y4m,--preset superfast --psy-rd 1 --ctu 16 --no-wpp
15
-BasketballDrive_1920x1080_50.y4m,--preset ultrafast --signhide --colormatrix bt709
16
-BasketballDrive_1920x1080_50.y4m,--preset veryfast --tune zerolatency --no-temporal-mvp
17
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1
18
+BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-mode=save --bitrate 7000,--preset slower --no-cutree --analysis-mode=load --bitrate 7000
19
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3
20
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-mode=save --bitrate 7000 --tskip-fast,--preset veryslow --no-cutree --analysis-mode=load --bitrate 7000  --tskip-fast
21
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
22
+Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
23
+Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
24
+Coastguard-4k.y4m,--preset veryfast --no-cutree --analysis-mode=save --bitrate 15000,--preset veryfast --no-cutree --analysis-mode=load --bitrate 15000
25
 Coastguard-4k.y4m,--preset medium --rdoq-level 1 --tune ssim --no-signhide --me umh
26
 Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1
27
-Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
28
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full
29
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
30
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
31
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain
32
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --max-tu-size 4 --min-cu-size 32
33
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full
34
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset medium --no-wpp --no-cutree --no-strong-intra-smoothing --limit-refs 1
35
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset slow --no-wpp --tune ssim --transfer smpte240m
36
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset slower --tune ssim --tune fastdecode --limit-refs 2
37
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
38
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
39
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain
40
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1
41
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
42
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset ultrafast --weightp --no-wpp --no-open-gop
43
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
44
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers --limit-refs 2
45
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1 --limit-modes
46
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut
47
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16
48
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
49
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp --qg-size 16
50
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16 --limit-modes
51
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
52
+DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
53
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq
54
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3
55
-DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
56
-FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
57
 FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2
58
+FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
59
+FourPeople_1280x720_60.y4m,--preset medium --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
60
+FourPeople_1280x720_60.y4m,--preset veryslow --numa-pools "none"
61
+Keiba_832x480_30.y4m,--preset superfast --no-fast-intra --nr-intra 1000 -F4
62
 Keiba_832x480_30.y4m,--preset medium --pmode --tune grain
63
 Keiba_832x480_30.y4m,--preset slower --fast-intra --nr-inter 500 -F4 --limit-refs 0
64
-Keiba_832x480_30.y4m,--preset superfast --no-fast-intra --nr-intra 1000 -F4
65
-Kimono1_1920x1080_24_10bit_444.yuv,--preset medium --min-cu-size 32
66
 Kimono1_1920x1080_24_10bit_444.yuv,--preset superfast --weightb
67
-KristenAndSara_1280x720_60.y4m,--preset medium --no-cutree --max-tu-size 16
68
-KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0
69
-KristenAndSara_1280x720_60.y4m,--preset superfast --min-cu-size 16 --qg-size 16 --limit-refs 1
70
+Kimono1_1920x1080_24_10bit_444.yuv,--preset medium --min-cu-size 32
71
 KristenAndSara_1280x720_60.y4m,--preset ultrafast --strong-intra-smoothing
72
-NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain --limit-refs 2
73
+KristenAndSara_1280x720_60.y4m,--preset superfast --min-cu-size 16 --qg-size 16 --limit-refs 1
74
+KristenAndSara_1280x720_60.y4m,--preset medium --no-cutree --max-tu-size 16
75
+KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0 --limit-modes
76
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset superfast --tune psnr
77
-News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
78
+NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain --limit-refs 2
79
+NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-mode=save --bitrate 9000,--preset slow --no-cutree --analysis-mode=load --bitrate 9000
80
+News-4k.y4m,--preset ultrafast --no-cutree --analysis-mode=save --bitrate 15000,--preset ultrafast --no-cutree --analysis-mode=load --bitrate 15000
81
 News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0
82
+News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
83
+OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
84
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset medium --no-weightp
85
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune fastdecode
86
-OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
87
+ParkScene_1920x1080_24_10bit_444.yuv,--preset superfast --weightp --lookahead-slices 4
88
 ParkScene_1920x1080_24.y4m,--preset medium --qp 40 --rdpenalty 2 --tu-intra-depth 3
89
 ParkScene_1920x1080_24.y4m,--preset slower --no-weightp
90
-ParkScene_1920x1080_24_10bit_444.yuv,--preset superfast --weightp --lookahead-slices 4
91
+RaceHorses_416x240_30.y4m,--preset superfast --no-cutree
92
 RaceHorses_416x240_30.y4m,--preset medium --tskip-fast --tskip
93
 RaceHorses_416x240_30.y4m,--preset slower --keyint -1 --rdoq-level 0
94
-RaceHorses_416x240_30.y4m,--preset superfast --no-cutree
95
 RaceHorses_416x240_30.y4m,--preset veryslow --tskip-fast --tskip --limit-refs 3
96
-RaceHorses_416x240_30_10bit.yuv,--preset fast --lookahead-slices 2 --b-intra --limit-refs 1
97
-RaceHorses_416x240_30_10bit.yuv,--preset faster --rdoq-level 0 --dither
98
-RaceHorses_416x240_30_10bit.yuv,--preset slow --tune grain
99
 RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --tune psnr --limit-refs 1
100
 RaceHorses_416x240_30_10bit.yuv,--preset veryfast --weightb
101
+RaceHorses_416x240_30_10bit.yuv,--preset faster --rdoq-level 0 --dither
102
+RaceHorses_416x240_30_10bit.yuv,--preset fast --lookahead-slices 2 --b-intra --limit-refs 1
103
+RaceHorses_416x240_30_10bit.yuv,--preset slow --tune grain  --limit-modes
104
 RaceHorses_416x240_30_10bit.yuv,--preset placebo --limit-refs 1
105
 SteamLocomotiveTrain_2560x1600_60_10bit_crop.yuv,--preset medium --dither
106
-big_buck_bunny_360p24.y4m,--preset faster --keyint 240 --min-keyint 60 --rc-lookahead 200
107
-big_buck_bunny_360p24.y4m,--preset medium --keyint 60 --min-keyint 48 --weightb --limit-refs 3
108
-big_buck_bunny_360p24.y4m,--preset slow --psy-rdoq 2.0 --rdoq-level 1 --no-b-intra
109
-big_buck_bunny_360p24.y4m,--preset superfast --psy-rdoq 2.0
110
 big_buck_bunny_360p24.y4m,--preset ultrafast --deblock=2
111
+big_buck_bunny_360p24.y4m,--preset superfast --psy-rdoq 2.0 --aq-mode 3
112
 big_buck_bunny_360p24.y4m,--preset veryfast --no-deblock
113
-city_4cif_60fps.y4m,--preset medium --crf 4 --cu-lossless --sao-non-deblock
114
+big_buck_bunny_360p24.y4m,--preset faster --keyint 240 --min-keyint 60 --rc-lookahead 200
115
+big_buck_bunny_360p24.y4m,--preset medium --keyint 60 --min-keyint 48 --weightb --limit-refs 3
116
+big_buck_bunny_360p24.y4m,--preset slow --psy-rdoq 2.0 --rdoq-level 1 --no-b-intra --aq-mode 3
117
 city_4cif_60fps.y4m,--preset superfast --rdpenalty 1 --tu-intra-depth 2
118
+city_4cif_60fps.y4m,--preset medium --crf 4 --cu-lossless --sao-non-deblock
119
 city_4cif_60fps.y4m,--preset slower --scaling-list default
120
 city_4cif_60fps.y4m,--preset veryslow --rdpenalty 2 --sao-non-deblock --no-b-intra --limit-refs 0
121
-ducks_take_off_420_720p50.y4m,--preset fast --deblock 6 --bframes 16 --rc-lookahead 40
122
+ducks_take_off_420_720p50.y4m,--preset ultrafast --constrained-intra --rd 1
123
+ducks_take_off_444_720p50.y4m,--preset superfast --weightp --limit-refs 2
124
 ducks_take_off_420_720p50.y4m,--preset faster --qp 24 --deblock -6 --limit-refs 2
125
+ducks_take_off_420_720p50.y4m,--preset fast --deblock 6 --bframes 16 --rc-lookahead 40
126
 ducks_take_off_420_720p50.y4m,--preset medium --tskip --tskip-fast --constrained-intra
127
-ducks_take_off_420_720p50.y4m,--preset slow --scaling-list default --qp 40
128
-ducks_take_off_420_720p50.y4m,--preset ultrafast --constrained-intra --rd 1
129
-ducks_take_off_420_720p50.y4m,--preset veryslow --constrained-intra --bframes 2
130
 ducks_take_off_444_720p50.y4m,--preset medium --qp 38 --no-scenecut
131
-ducks_take_off_444_720p50.y4m,--preset superfast --weightp --rd 0 --limit-refs 2
132
+ducks_take_off_420_720p50.y4m,--preset slow --scaling-list default --qp 40
133
 ducks_take_off_444_720p50.y4m,--preset slower --psy-rd 1 --psy-rdoq 2.0 --rdoq-level 1 --limit-refs 1
134
+ducks_take_off_420_720p50.y4m,--preset slower --no-wpp
135
+ducks_take_off_420_720p50.y4m,--preset veryslow --constrained-intra --bframes 2
136
+mobile_calendar_422_ntsc.y4m,--preset superfast --weightp
137
 mobile_calendar_422_ntsc.y4m,--preset medium --bitrate 500 -F4
138
 mobile_calendar_422_ntsc.y4m,--preset slower --tskip --tskip-fast
139
-mobile_calendar_422_ntsc.y4m,--preset superfast --weightp --rd 0
140
 mobile_calendar_422_ntsc.y4m,--preset veryslow --tskip --limit-refs 2
141
+old_town_cross_444_720p50.y4m,--preset ultrafast --weightp --min-cu 32
142
+old_town_cross_444_720p50.y4m,--preset superfast --weightp --min-cu 16 --limit-modes
143
+old_town_cross_444_720p50.y4m,--preset veryfast --qp 1 --tune ssim
144
 old_town_cross_444_720p50.y4m,--preset faster --rd 1 --tune zero-latency
145
+old_town_cross_444_720p50.y4m,--preset fast --no-cutree --analysis-mode=save --bitrate 3000 --early-skip,--preset fast --no-cutree --analysis-mode=load --bitrate 3000 --early-skip
146
 old_town_cross_444_720p50.y4m,--preset medium --keyint -1 --no-weightp --ref 6
147
 old_town_cross_444_720p50.y4m,--preset slow --rdoq-level 1 --early-skip --ref 7 --no-b-pyramid
148
 old_town_cross_444_720p50.y4m,--preset slower --crf 4 --cu-lossless
149
-old_town_cross_444_720p50.y4m,--preset superfast --weightp --min-cu 16
150
-old_town_cross_444_720p50.y4m,--preset ultrafast --weightp --min-cu 32
151
-old_town_cross_444_720p50.y4m,--preset veryfast --qp 1 --tune ssim
152
 parkrun_ter_720p50.y4m,--preset medium --no-open-gop --sao-non-deblock --crf 4 --cu-lossless
153
 parkrun_ter_720p50.y4m,--preset slower --fast-intra --no-rect --tune grain
154
-silent_cif_420.y4m,--preset medium --me full --rect --amp
155
 silent_cif_420.y4m,--preset superfast --weightp --rect
156
+silent_cif_420.y4m,--preset medium --me full --rect --amp
157
 silent_cif_420.y4m,--preset placebo --ctu 32 --no-sao --qg-size 16
158
-vtc1nw_422_ntsc.y4m,--preset medium --scaling-list default --ctu 16 --ref 5
159
-vtc1nw_422_ntsc.y4m,--preset slower --nr-inter 1000 -F4 --tune fast-decode --qg-size 16
160
+washdc_422_ntsc.y4m,--preset ultrafast --weightp --tu-intra-depth 4
161
 vtc1nw_422_ntsc.y4m,--preset superfast --weightp --nr-intra 100 -F4
162
-washdc_422_ntsc.y4m,--preset faster --rdoq-level 1 --max-merge 5
163
-washdc_422_ntsc.y4m,--preset medium --no-weightp --max-tu-size 4 --limit-refs 1
164
-washdc_422_ntsc.y4m,--preset slower --psy-rdoq 2.0 --rdoq-level 2 --qg-size 32 --limit-refs 1
165
 washdc_422_ntsc.y4m,--preset superfast --psy-rd 1 --tune zerolatency
166
-washdc_422_ntsc.y4m,--preset ultrafast --weightp --tu-intra-depth 4
167
 washdc_422_ntsc.y4m,--preset veryfast --tu-inter-depth 4
168
-washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless --limit-refs 3
169
-BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 15000,--preset medium --no-cutree --analysis-mode=load --bitrate 13000,--preset medium --no-cutree --analysis-mode=load --bitrate 11000,--preset medium --no-cutree --analysis-mode=load --bitrate 9000,--preset medium --no-cutree --analysis-mode=load --bitrate 7000
170
-NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-mode=save --bitrate 15000,--preset slow --no-cutree --analysis-mode=load --bitrate 13000,--preset slow --no-cutree --analysis-mode=load --bitrate 11000,--preset slow --no-cutree --analysis-mode=load --bitrate 9000,--preset slow --no-cutree --analysis-mode=load --bitrate 7000
171
-old_town_cross_444_720p50.y4m,--preset veryslow --no-cutree --analysis-mode=save --bitrate 15000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 13000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 11000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 9000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 7000 --early-skip
172
-Johnny_1280x720_60.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 15000 --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 13000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 11000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 9000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 7000  --tskip-fast
173
-BasketballDrive_1920x1080_50.y4m,--preset medium --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
174
-FourPeople_1280x720_60.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
175
-FourPeople_1280x720_60.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
176
+washdc_422_ntsc.y4m,--preset faster --rdoq-level 1 --max-merge 5
177
+vtc1nw_422_ntsc.y4m,--preset medium --scaling-list default --ctu 16 --ref 5
178
+washdc_422_ntsc.y4m,--preset medium --no-weightp --max-tu-size 4 --limit-refs 1 --aq-mode 2
179
+vtc1nw_422_ntsc.y4m,--preset slower --nr-inter 1000 -F4 --tune fast-decode --qg-size 16
180
+washdc_422_ntsc.y4m,--preset slower --psy-rdoq 2.0 --rdoq-level 2 --qg-size 32 --limit-refs 1
181
+washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless --limit-refs 3 --limit-modes
182
+
183
+# Main12 intraCost overflow bug test
184
+720p50_parkrun_ter.y4m,--preset medium
185
 
186
 # interlace test, even though input YUV is not field separated
187
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --interlace bff
188
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --interlace tff
189
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --interlace bff
190
 
191
 # vim: tw=200
192
x265_1.8.tar.gz/source/test/smoke-tests.txt -> x265_1.9.tar.gz/source/test/smoke-tests.txt Changed
8
 
1
@@ -19,3 +19,6 @@
2
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset=medium --max-tu-size 16
3
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryfast --min-cu 16
4
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=fast --weightb --interlace bff
5
+
6
+# Main12 intraCost overflow bug test
7
+720p50_parkrun_ter.y4m,--preset medium
8
x265_1.8.tar.gz/source/test/testbench.cpp -> x265_1.9.tar.gz/source/test/testbench.cpp Changed
9
 
1
@@ -4,6 +4,7 @@
2
  * Authors: Gopu Govindaswamy <gopu@govindaswamy.org>
3
  *          Mandar Gurav <mandar@multicorewareinc.com>
4
  *          Mahesh Pittala <mahesh@multicorewareinc.com>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/test/testharness.h -> x265_1.9.tar.gz/source/test/testharness.h Changed
9
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
x265_1.8.tar.gz/source/x265-extras.cpp -> x265_1.9.tar.gz/source/x265-extras.cpp Changed
201
 
1
@@ -36,7 +36,7 @@
2
     "I count, I ave-QP, I kbps, I-PSNR Y, I-PSNR U, I-PSNR V, I-SSIM (dB), "
3
     "P count, P ave-QP, P kbps, P-PSNR Y, P-PSNR U, P-PSNR V, P-SSIM (dB), "
4
     "B count, B ave-QP, B kbps, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), "
5
-    "Version\n";
6
+    "MaxCLL, MaxFALL, Version\n";
7
 
8
 FILE* x265_csvlog_open(const x265_api& api, const x265_param& param, const char* fname, int level)
9
 {
10
@@ -61,54 +61,58 @@
11
         {
12
             if (level)
13
             {
14
-                fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, ");
15
+                fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, ");
16
                 if (param.rc.rateControlMode == X265_RC_CRF)
17
                     fprintf(csvfp, "RateFactor, ");
18
-                fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, SSIM, SSIM (dB),  List 0, List 1");
19
-                /* detailed performance statistics */
20
-                fprintf(csvfp, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms), Stall Time (ms), Avg WPP, Row Blocks");
21
-                if (level >= 2)
22
+                if (param.bEnablePsnr)
23
+                    fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, ");
24
+                if (param.bEnableSsim)
25
+                    fprintf(csvfp, "SSIM, SSIM(dB), ");
26
+                fprintf(csvfp, "Latency, ");
27
+                fprintf(csvfp, "List 0, List 1");
28
+                uint32_t size = param.maxCUSize;
29
+                for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
30
+                {
31
+                    fprintf(csvfp, ", Intra %dx%d DC, Intra %dx%d Planar, Intra %dx%d Ang", size, size, size, size, size, size);
32
+                    size /= 2;
33
+                }
34
+                fprintf(csvfp, ", 4x4");
35
+                size = param.maxCUSize;
36
+                if (param.bEnableRectInter)
37
                 {
38
-                    uint32_t size = param.maxCUSize;
39
-                    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
40
-                    {
41
-                        fprintf(csvfp, ", Intra %dx%d DC, Intra %dx%d Planar, Intra %dx%d Ang", size, size, size, size, size, size);
42
-                        size /= 2;
43
-                    }
44
-                    fprintf(csvfp, ", 4x4");
45
-                    size = param.maxCUSize;
46
-                    if (param.bEnableRectInter)
47
-                    {
48
-                        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
49
-                        {
50
-                            fprintf(csvfp, ", Inter %dx%d, Inter %dx%d (Rect)", size, size, size, size);
51
-                            if (param.bEnableAMP)
52
-                                fprintf(csvfp, ", Inter %dx%d (Amp)", size, size);
53
-                            size /= 2;
54
-                        }
55
-                    }
56
-                    else
57
-                    {
58
-                        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
59
-                        {
60
-                            fprintf(csvfp, ", Inter %dx%d", size, size);
61
-                            size /= 2;
62
-                        }
63
-                    }
64
-                    size = param.maxCUSize;
65
                     for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
66
                     {
67
-                        fprintf(csvfp, ", Skip %dx%d", size, size);
68
+                        fprintf(csvfp, ", Inter %dx%d, Inter %dx%d (Rect)", size, size, size, size);
69
+                        if (param.bEnableAMP)
70
+                            fprintf(csvfp, ", Inter %dx%d (Amp)", size, size);
71
                         size /= 2;
72
                     }
73
-                    size = param.maxCUSize;
74
+                }
75
+                else
76
+                {
77
                     for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
78
                     {
79
-                        fprintf(csvfp, ", Merge %dx%d", size, size);
80
+                        fprintf(csvfp, ", Inter %dx%d", size, size);
81
                         size /= 2;
82
                     }
83
-                    fprintf(csvfp, ", Avg Luma Distortion, Avg Chroma Distortion, Avg psyEnergy, Avg Luma Level, Max Luma Level");
84
                 }
85
+                size = param.maxCUSize;
86
+                for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
87
+                {
88
+                    fprintf(csvfp, ", Skip %dx%d", size, size);
89
+                    size /= 2;
90
+                }
91
+                size = param.maxCUSize;
92
+                for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
93
+                {
94
+                    fprintf(csvfp, ", Merge %dx%d", size, size);
95
+                    size /= 2;
96
+                }
97
+                fprintf(csvfp, ", Avg Luma Distortion, Avg Chroma Distortion, Avg psyEnergy, Avg Luma Level, Max Luma Level, Avg Residual Energy");
98
+
99
+                /* detailed performance statistics */
100
+                if (level >= 2)
101
+                    fprintf(csvfp, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms), Stall Time (ms), Avg WPP, Row Blocks");
102
                 fprintf(csvfp, "\n");
103
             }
104
             else
105
@@ -125,17 +129,14 @@
106
         return;
107
 
108
     const x265_frame_stats* frameStats = &pic.frameData;
109
-    fprintf(csvfp, "%d, %c-SLICE, %4d, %2.2lf, %10d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc, frameStats->qp, (int)frameStats->bits);
110
+    fprintf(csvfp, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc, frameStats->qp, (int)frameStats->bits, frameStats->bScenecut);
111
     if (param.rc.rateControlMode == X265_RC_CRF)
112
         fprintf(csvfp, "%.3lf,", frameStats->rateFactor);
113
     if (param.bEnablePsnr)
114
         fprintf(csvfp, "%.3lf, %.3lf, %.3lf, %.3lf,", frameStats->psnrY, frameStats->psnrU, frameStats->psnrV, frameStats->psnr);
115
-    else
116
-        fputs(" -, -, -, -,", csvfp);
117
     if (param.bEnableSsim)
118
         fprintf(csvfp, " %.6f, %6.3f,", frameStats->ssim, x265_ssim2dB(frameStats->ssim));
119
-    else
120
-        fputs(" -, -,", csvfp);
121
+    fprintf(csvfp, "%d, ", frameStats->frameLatency);
122
     if (frameStats->sliceType == 'I')
123
         fputs(" -, -,", csvfp);
124
     else
125
@@ -154,32 +155,33 @@
126
         else
127
             fputs(" -,", csvfp);
128
     }
129
-    fprintf(csvfp, " %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf,", frameStats->decideWaitTime, frameStats->row0WaitTime, frameStats->wallTime, frameStats->refWaitWallTime, frameStats->totalCTUTime, frameStats->stallTime);
130
-    fprintf(csvfp, " %.3lf, %d", frameStats->avgWPP, frameStats->countRowBlocks);
131
-    if (level >= 2)
132
+    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
133
+        fprintf(csvfp, "%5.2lf%%, %5.2lf%%, %5.2lf%%,", frameStats->cuStats.percentIntraDistribution[depth][0], frameStats->cuStats.percentIntraDistribution[depth][1], frameStats->cuStats.percentIntraDistribution[depth][2]);
134
+    fprintf(csvfp, "%5.2lf%%", frameStats->cuStats.percentIntraNxN);
135
+    if (param.bEnableRectInter)
136
     {
137
         for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
138
-            fprintf(csvfp, ", %5.2lf%%, %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentIntraDistribution[depth][0], frameStats->cuStats.percentIntraDistribution[depth][1], frameStats->cuStats.percentIntraDistribution[depth][2]);
139
-        fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentIntraNxN);
140
-        if (param.bEnableRectInter)
141
         {
142
-            for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
143
-            {
144
-                fprintf(csvfp, ", %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0], frameStats->cuStats.percentInterDistribution[depth][1]);
145
-                if (param.bEnableAMP)
146
-                    fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][2]);
147
-            }
148
+            fprintf(csvfp, ", %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0], frameStats->cuStats.percentInterDistribution[depth][1]);
149
+            if (param.bEnableAMP)
150
+                fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][2]);
151
         }
152
-        else
153
-        {
154
-            for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
155
-                fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0]);
156
-        }
157
-        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
158
-            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentSkipCu[depth]);
159
+    }
160
+    else
161
+    {
162
         for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
163
-            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentMergeCu[depth]);
164
-        fprintf(csvfp, ", %.2lf, %.2lf, %.2lf, %.2lf, %d", frameStats->avgLumaDistortion, frameStats->avgChromaDistortion, frameStats->avgPsyEnergy, frameStats->avgLumaLevel, frameStats->maxLumaLevel);
165
+            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0]);
166
+    }
167
+    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
168
+        fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentSkipCu[depth]);
169
+    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
170
+        fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentMergeCu[depth]);
171
+    fprintf(csvfp, ", %.2lf, %.2lf, %.2lf, %.2lf, %d, %.2lf", frameStats->avgLumaDistortion, frameStats->avgChromaDistortion, frameStats->avgPsyEnergy, frameStats->avgLumaLevel, frameStats->maxLumaLevel, frameStats->avgResEnergy);
172
+
173
+    if (level >= 2)
174
+    {
175
+        fprintf(csvfp, ", %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf,", frameStats->decideWaitTime, frameStats->row0WaitTime, frameStats->wallTime, frameStats->refWaitWallTime, frameStats->totalCTUTime, frameStats->stallTime);
176
+        fprintf(csvfp, " %.3lf, %d", frameStats->avgWPP, frameStats->countRowBlocks);
177
     }
178
     fprintf(csvfp, "\n");
179
     fflush(stderr);
180
@@ -198,11 +200,13 @@
181
     }
182
 
183
     // CLI arguments or other
184
+    fputc('"', csvfp);
185
     for (int i = 1; i < argc; i++)
186
     {
187
-        if (i) fputc(' ', csvfp);
188
+        fputc(' ', csvfp);
189
         fputs(argv[i], csvfp);
190
     }
191
+    fputc('"', csvfp);
192
 
193
     // current date and time
194
     time_t now;
195
@@ -273,7 +277,7 @@
196
     else
197
         fprintf(csvfp, " -, -, -, -, -, -, -,");
198
 
199
-    fprintf(csvfp, " %s\n", api.version_str);
200
+    fprintf(csvfp, " %-6u, %-6u, %s\n", stats.maxCLL, stats.maxFALL, api.version_str);
201
x265_1.8.tar.gz/source/x265.cpp -> x265_1.9.tar.gz/source/x265.cpp Changed
9
 
1
@@ -486,6 +486,7 @@
2
             pic_org.forceqp = qp + 1;
3
         if (type == 'I') pic_org.sliceType = X265_TYPE_IDR;
4
         else if (type == 'i') pic_org.sliceType = X265_TYPE_I;
5
+        else if (type == 'K') pic_org.sliceType = param->bOpenGOP ? X265_TYPE_I : X265_TYPE_IDR;
6
         else if (type == 'P') pic_org.sliceType = X265_TYPE_P;
7
         else if (type == 'B') pic_org.sliceType = X265_TYPE_BREF;
8
         else if (type == 'b') pic_org.sliceType = X265_TYPE_B;
9
x265_1.8.tar.gz/source/x265.def.in -> x265_1.9.tar.gz/source/x265.def.in Changed
6
 
1
@@ -22,3 +22,4 @@
2
 x265_cleanup
3
 x265_api_get_${X265_BUILD}
4
 x265_api_query
5
+x265_encoder_intra_refresh
6
x265_1.8.tar.gz/source/x265.h -> x265_1.9.tar.gz/source/x265.h Changed
201
 
1
@@ -2,6 +2,7 @@
2
  * Copyright (C) 2013 x265 project
3
  *
4
  * Authors: Steve Borho <steve@borho.org>
5
+ *          Min Chen <chenm003@163.com>
6
  *
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
@@ -91,13 +92,15 @@
10
 /* Stores all analysis data for a single frame */
11
 typedef struct x265_analysis_data
12
 {
13
-    void*            interData;
14
-    void*            intraData;
15
+    int64_t          satdCost;
16
     uint32_t         frameRecordSize;
17
     uint32_t         poc;
18
     uint32_t         sliceType;
19
     uint32_t         numCUsInFrame;
20
     uint32_t         numPartitions;
21
+    void*            interData;
22
+    void*            intraData;
23
+    int              bScenecut;
24
 } x265_analysis_data;
25
 
26
 /* cu statistics */
27
@@ -132,6 +135,7 @@
28
     double           avgLumaDistortion;
29
     double           avgChromaDistortion;
30
     double           avgPsyEnergy;
31
+    double           avgResEnergy;
32
     double           avgLumaLevel;
33
     uint64_t         bits;
34
     int              encoderOrder;
35
@@ -141,6 +145,8 @@
36
     int              list1POC[16];
37
     uint16_t         maxLumaLevel;
38
     char             sliceType;
39
+    int              bScenecut;
40
+    int              frameLatency;
41
     x265_cu_stats    cuStats;
42
 } x265_frame_stats;
43
 
44
@@ -205,6 +211,13 @@
45
      * this data structure */
46
     x265_analysis_data analysisData;
47
 
48
+    /* An array of quantizer offsets to be applied to this image during encoding.
49
+     * These are added on top of the decisions made by rateControl.
50
+     * Adaptive quantization must be enabled to use this feature. These quantizer
51
+     * offsets should be given for each 16x16 block. Behavior if quant
52
+     * offsets differ between encoding passes is undefined. */
53
+    float            *quantOffsets;
54
+
55
     /* Frame level statistics */
56
     x265_frame_stats frameData;
57
 
58
@@ -378,6 +391,8 @@
59
     x265_sliceType_stats  statsI;               /* statistics of I slice */
60
     x265_sliceType_stats  statsP;               /* statistics of P slice */
61
     x265_sliceType_stats  statsB;               /* statistics of B slice */
62
+    uint16_t              maxCLL;               /* maximum content light level */
63
+    uint16_t              maxFALL;              /* maximum frame average light level */
64
 } x265_stats;
65
 
66
 /* String values accepted by x265_param_parse() (and CLI) for various parameters */
67
@@ -604,7 +619,7 @@
68
 
69
     /* Enables the emission of a user data SEI with the stream headers which
70
      * describes the encoder version, build info, and parameters. This is
71
-     * very helpful for debugging, but may interfere with regression tests. 
72
+     * very helpful for debugging, but may interfere with regression tests.
73
      * Default enabled */
74
     int       bEmitInfoSEI;
75
 
76
@@ -664,9 +679,9 @@
77
     int       bBPyramid;
78
 
79
     /* A value which is added to the cost estimate of B frames in the lookahead.
80
-     * It may be a positive value (making B frames appear more expensive, which
81
-     * causes the lookahead to chose more P frames) or negative, which makes the
82
-     * lookahead chose more B frames. Default is 0, there are no limits */
83
+     * It may be a positive value (making B frames appear less expensive, which
84
+     * biases the lookahead to choose more B frames) or negative, which makes the
85
+     * lookahead choose more P frames. Default is 0, there are no limits */
86
     int       bFrameBias;
87
 
88
     /* The number of frames that must be queued in the lookahead before it may
89
@@ -691,6 +706,11 @@
90
      * should detect scene cuts. The default (40) is recommended. */
91
     int       scenecutThreshold;
92
 
93
+    /* Replace keyframes by using a column of intra blocks that move across the video
94
+     * from one side to the other, thereby "refreshing" the image. In effect, instead of a
95
+     * big keyframe, the keyframe is "spread" over many frames. */
96
+    int       bIntraRefresh;
97
+
98
     /*== Coding Unit (CU) definitions ==*/
99
 
100
     /* Maximum CU width and height in pixels.  The size must be 64, 32, or 16.
101
@@ -810,6 +830,9 @@
102
      * 4 split CUs at the next lower CU depth.  The two flags may be combined */
103
     uint32_t  limitReferences;
104
 
105
+    /* Limit modes analyzed for each CU using cost metrics from the 4 sub-CUs */
106
+    uint32_t limitModes;
107
+
108
     /* ME search method (DIA, HEX, UMH, STAR, FULL). The search patterns
109
      * (methods) are sorted in increasing complexity, with diamond being the
110
      * simplest and fastest and full being the slowest.  DIA, HEX, and UMH were
111
@@ -920,7 +943,7 @@
112
     /* Psycho-visual rate-distortion strength. Only has an effect in presets
113
      * which use RDO. It makes mode decision favor options which preserve the
114
      * energy of the source, at the cost of lost compression. The value must
115
-     * be between 0 and 2.0, 1.0 is typical. Default 0.3 */
116
+     * be between 0 and 5.0, 1.0 is typical. Default 2.0 */
117
     double    psyRd;
118
 
119
     /* Strength of psycho-visual optimizations in quantization. Only has an
120
@@ -1038,7 +1061,7 @@
121
 
122
         /* Enable slow and a more detailed first pass encode in multi pass rate control */
123
         int       bEnableSlowFirstPass;
124
-        
125
+
126
         /* rate-control overrides */
127
         int        zoneCount;
128
         x265_zone* zones;
129
@@ -1051,14 +1074,14 @@
130
          * values will affect all encoders in the same process */
131
         const char* lambdaFileName;
132
 
133
-        /* Enable stricter conditions to check bitrate deviations in CBR mode. May compromise 
134
+        /* Enable stricter conditions to check bitrate deviations in CBR mode. May compromise
135
          * quality to maintain bitrate adherence */
136
         int bStrictCbr;
137
 
138
-        /* Enable adaptive quantization at CU granularity. This parameter specifies 
139
-         * the minimum CU size at which QP can be adjusted, i.e. Quantization Group 
140
-         * (QG) size. Allowed values are 64, 32, 16 provided it falls within the 
141
-         * inclusuve range [maxCUSize, minCUSize]. Experimental, default: maxCUSize*/
142
+        /* Enable adaptive quantization at CU granularity. This parameter specifies
143
+         * the minimum CU size at which QP can be adjusted, i.e. Quantization Group
144
+         * (QG) size. Allowed values are 64, 32, 16 provided it falls within the
145
+         * inclusive range [maxCUSize, minCUSize]. Experimental, default: maxCUSize */
146
         uint32_t qgSize;
147
     } rc;
148
 
149
@@ -1165,12 +1188,27 @@
150
      * max,min luminance values. */
151
     const char* masteringDisplayColorVolume;
152
 
153
-    /* Content light level info SEI, specified as a string which is parsed when
154
-     * the stream header SEI are emitted. The string format is "%hu,%hu" where
155
-     * %hu are unsigned 16bit integers. The first value is the max content light
156
-     * level (or 0 if no maximum is indicated), the second value is the maximum
157
-     * picture average light level (or 0). */
158
-    const char* contentLightLevelInfo;
159
+    /* Maximum Content Light Level (MaxCLL), specified as an integer that indicates the
160
+     * maximum pixel intensity level in units of 1 candela per square metre of the
161
+     * bitstream. x265 will also calculate MaxCLL programmatically from the input
162
+     * pixel values and set in the Content light level info SEI */
163
+    uint16_t maxCLL;
164
+
165
+    /* Maximum Frame Average Light Level (MaxFALL), specified as an integer that indicates
166
+     * the maximum frame average intensity level in units of 1 candela per square
167
+     * metre of the bitstream. x265 will also calculate MaxFALL programmatically
168
+     * from the input pixel values and set in the Content light level info SEI */
169
+    uint16_t maxFALL;
170
+
171
+    /* Minimum luma level of input source picture, specified as an integer which
172
+     * would automatically increase any luma values below the specified --min-luma
173
+     * value to that value. */
174
+    uint16_t minLuma;
175
+
176
+    /* Maximum luma level of input source picture, specified as an integer which
177
+     * would automatically decrease any luma values above the specified --max-luma
178
+     * value to that value. */
179
+    uint16_t maxLuma;
180
 
181
 } x265_param;
182
 
183
@@ -1211,7 +1249,7 @@
184
     "main422-10", "main422-10-intra",
185
     "main444-10", "main444-10-intra",
186
 
187
-    "main12",     "main12-intra",                  /* Highly Experimental */
188
+    "main12",     "main12-intra",
189
     "main422-12", "main422-12-intra",
190
     "main444-12", "main444-12-intra",
191
 
192
@@ -1347,6 +1385,22 @@
193
  *      close an encoder handler */
194
 void x265_encoder_close(x265_encoder *);
195
 
196
+/* x265_encoder_intra_refresh:
197
+ *      If an intra refresh is not in progress, begin one with the next P-frame.
198
+ *      If an intra refresh is in progress, begin one as soon as the current one finishes.
199
+ *      Requires bIntraRefresh to be set.
200
+ *
201
x265_1.8.tar.gz/source/x265cli.h -> x265_1.9.tar.gz/source/x265cli.h Changed
98
 
1
@@ -116,6 +116,7 @@
2
     { "min-keyint",     required_argument, NULL, 'i' },
3
     { "scenecut",       required_argument, NULL, 0 },
4
     { "no-scenecut",          no_argument, NULL, 0 },
5
+    { "intra-refresh",        no_argument, NULL, 0 },
6
     { "rc-lookahead",   required_argument, NULL, 0 },
7
     { "lookahead-slices", required_argument, NULL, 0 },
8
     { "bframes",        required_argument, NULL, 'b' },
9
@@ -126,6 +127,8 @@
10
     { "b-pyramid",            no_argument, NULL, 0 },
11
     { "ref",            required_argument, NULL, 0 },
12
     { "limit-refs",     required_argument, NULL, 0 },
13
+    { "no-limit-modes",       no_argument, NULL, 0 },
14
+    { "limit-modes",          no_argument, NULL, 0 },
15
     { "no-weightp",           no_argument, NULL, 0 },
16
     { "weightp",              no_argument, NULL, 'w' },
17
     { "no-weightb",           no_argument, NULL, 0 },
18
@@ -192,6 +195,8 @@
19
     { "crop-rect",      required_argument, NULL, 0 }, /* DEPRECATED */
20
     { "master-display", required_argument, NULL, 0 },
21
     { "max-cll",        required_argument, NULL, 0 },
22
+    { "min-luma",       required_argument, NULL, 0 },
23
+    { "max-luma",       required_argument, NULL, 0 },
24
     { "no-dither",            no_argument, NULL, 0 },
25
     { "dither",               no_argument, NULL, 0 },
26
     { "no-repeat-headers",    no_argument, NULL, 0 },
27
@@ -251,14 +256,18 @@
28
     H0("   --log-level <string>          Logging level: none error warning info debug full. Default %s\n", X265_NS::logLevelNames[param->logLevel + 1]);
29
     H0("   --no-progress                 Disable CLI progress reports\n");
30
     H0("   --csv <filename>              Comma separated log file, if csv-log-level > 0 frame level statistics, else one line per run\n");
31
-    H0("   --csv-log-level               Level of csv logging, if csv-log-level > 0 frame level statistics, else one line per run: 0-2\n");
32
+    H0("   --csv-log-level <integer>     Level of csv logging, if csv-log-level > 0 frame level statistics, else one line per run: 0-2\n");
33
     H0("\nInput Options:\n");
34
     H0("   --input <filename>            Raw YUV or Y4M input file name. `-` for stdin\n");
35
     H1("   --y4m                         Force parsing of input stream as YUV4MPEG2 regardless of file extension\n");
36
     H0("   --fps <float|rational>        Source frame rate (float or num/denom), auto-detected if Y4M\n");
37
     H0("   --input-res WxH               Source picture size [w x h], auto-detected if Y4M\n");
38
     H1("   --input-depth <integer>       Bit-depth of input file. Default 8\n");
39
-    H1("   --input-csp <string>          Source color space: i420, i444 or i422, auto-detected if Y4M. Default: i420\n");
40
+    H1("   --input-csp <string>          Chroma subsampling, auto-detected if Y4M\n");
41
+    H1("                                 0 - i400 (4:0:0 monochrome)\n");
42
+    H1("                                 1 - i420 (4:2:0 default)\n");
43
+    H1("                                 2 - i422 (4:2:2)\n");
44
+    H1("                                 3 - i444 (4:4:4)\n");
45
     H0("-f/--frames <integer>            Maximum number of frames to encode. Default all\n");
46
     H0("   --seek <integer>              First frame to encode\n");
47
     H1("   --[no-]interlace <bff|tff>    Indicate input pictures are interlace fields in temporal order. Default progressive\n");
48
@@ -292,7 +301,7 @@
49
     H0("   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
50
     H0("\nAnalysis:\n");
51
     H0("   --rd <0..6>                   Level of RDO in mode decision 0:least....6:full RDO. Default %d\n", param->rdLevel);
52
-    H0("   --[no-]psy-rd <0..2.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
53
+    H0("   --[no-]psy-rd <0..5.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
54
     H0("   --[no-]rdoq-level <0|1|2>     Level of RDO in quantization 0:none, 1:levels, 2:levels & coding groups. Default %d\n", param->rdoqLevel);
55
     H0("   --[no-]psy-rdoq <0..50.0>     Strength of psycho-visual optimization in RDO quantization, 0 to disable. Default %.1f\n", param->psyRdoq);
56
     H0("   --[no-]early-skip             Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
57
@@ -308,12 +317,13 @@
58
     H0("\nTemporal / motion search options:\n");
59
     H0("   --max-merge <1..5>            Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand);
60
     H0("   --ref <integer>               max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences);
61
-    H0("   --limit-refs <0|1|2|3>        limit references per depth (1) or CU (2) or both (3). Default %d\n", param->limitReferences);
62
+    H0("   --limit-refs <0|1|2|3>        Limit references per depth (1) or CU (2) or both (3). Default %d\n", param->limitReferences);
63
     H0("   --me <string>                 Motion search method dia hex umh star full. Default %d\n", param->searchMethod);
64
     H0("-m/--subme <integer>             Amount of subpel refinement to perform (0:least .. 7:most). Default %d \n", param->subpelRefine);
65
     H0("   --merange <integer>           Motion search range. Default %d\n", param->searchRange);
66
     H0("   --[no-]rect                   Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter));
67
     H0("   --[no-]amp                    Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP));
68
+    H0("   --[no-]limit-modes            Limit rectangular and asymmetric motion predictions. Default %d\n", param->limitModes);
69
     H1("   --[no-]temporal-mvp           Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp));
70
     H0("\nSpatial / intra options:\n");
71
     H0("   --[no-]strong-intra-smoothing Enable strong intra smoothing for 32x32 blocks. Default %s\n", OPT(param->bEnableStrongIntraSmoothing));
72
@@ -327,6 +337,7 @@
73
     H0("-i/--min-keyint <integer>        Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
74
     H0("   --no-scenecut                 Disable adaptive I-frame decision\n");
75
     H0("   --scenecut <integer>          How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold);
76
+    H0("   --intra-refresh               Use Periodic Intra Refresh instead of IDR frames\n");
77
     H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
78
     H1("   --lookahead-slices <0..16>    Number of slices to use per lookahead cost estimate. Default %d\n", param->lookaheadSlices);
79
     H0("   --bframes <integer>           Maximum number of consecutive b-frames (now it only enables B GOP structure) Default %d\n", param->bframes);
80
@@ -335,7 +346,7 @@
81
     H0("   --[no-]b-pyramid              Use B-frames as references. Default %s\n", OPT(param->bBPyramid));
82
     H1("   --qpfile <string>             Force frametypes and QPs for some or all frames\n");
83
     H1("                                 Format of each line: framenumber frametype QP\n");
84
-    H1("                                 QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b.\n");
85
+    H1("                                 QP is optional (none lets x265 choose). Frametypes: I,i,K,P,B,b.\n");
86
     H1("                                 QPs are restricted by qpmin/qpmax.\n");
87
     H0("\nRate control, Adaptive Quantization:\n");
88
     H0("   --bitrate <integer>           Target bitrate (kbps) for ABR (implied). Default %d\n", param->rc.bitrate);
89
@@ -403,6 +414,8 @@
90
     H0("   --master-display <string>     SMPTE ST 2086 master display color volume info SEI (HDR)\n");
91
     H0("                                    format: G(x,y)B(x,y)R(x,y)WP(x,y)L(max,min)\n");
92
     H0("   --max-cll <string>            Emit content light level info SEI as \"cll,fall\" (HDR)\n");
93
+    H0("   --min-luma <integer>          Minimum luma plane value of input source picture\n");
94
+    H0("   --max-luma <integer>          Maximum luma plane value of input source picture\n");
95
     H0("\nBitstream options:\n");
96
     H0("   --[no-]repeat-headers         Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
97
     H0("   --[no-]info                   Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));
98