Changes of Revision 9

x265.changes Changed
x
 
1
@@ -1,4 +1,62 @@
2
 -------------------------------------------------------------------
3
+Tue Apr 28 20:08:06 UTC 2015 - aloisio@gmx.com
4
+
5
+- soname bumped to 51
6
+- Update to stable version 1.6
7
+  Performance changes:
8
+  * heavy improvements for AVX2 capable platforms
9
+    (Haswell and later Intel CPUs) and work efficiency
10
+    improvements for multiple-socket machines.
11
+  
12
+  API changes:
13
+  * --threads N replaced by --pools N,N and --lookahead-slices N
14
+  * --[no-]rdoq-level N - finer control over RDOQ effort
15
+  * --min-cu-size N - trade-off compression for performance
16
+  * --max-tu-size N - trade-off compression for performance
17
+  * --[no-]temporal-layers - code unreferenced B frames in temporal
18
+    layer 1
19
+  * --[no-]cip aliases added for --[no-]constrained-intra
20
+  * Added support for new color transfer functions "smpte-st-2084"
21
+    and "smpte-st-428"
22
+  * --limit-refs N was added, but not yet implemented
23
+  * Deprecated x265_setup_primitives() was removed from the public
24
+    API and is no longer exported from DLLs
25
+  
26
+  Threading changes:
27
+  * The x265 thread pool has been made NUMA aware.
28
+  * The --threads  parameter, which used to specify a global
29
+    pool size, has been replaced with a --pools parameter which
30
+    allows you to specify a pool size per NUMA node (aka CPU socket
31
+    or package). The default is still to allocate one pool worker
32
+    thread per logical core on the machine, but with --pools one
33
+    can isolate those threads to a given socket.
34
+  * Other than socket isolation, the biggest visible change in the
35
+    NUMA aware thread pools is the increase in work efficiency.
36
+    The total utilization will generally decrease but the performance
37
+    will increase since worker threads spend less time context
38
+    switching.  Also, the threading of the lookahead was made more
39
+    work-efficient. Each lookahead job is a much larger piece of work.
40
+    Before (1.5):
41
+    disable thread pool: --threads 1
42
+    default thread pool: --threads 0
43
+    restrict to 4 threads: --threads 4
44
+    After (1.6):
45
+    disable thread pools: --pools 0
46
+    default thread pools: --pools *
47
+    restrict to 4 threads: --pools 4
48
+    restrict to 4 threads on socket 1: --pools -,4
49
+    restrict to all threads on socket 0: --pools +,-
50
+  
51
+  Multi-lib interface:
52
+  * In order to support runtime selection of a libx265
53
+    shared library, we have introduced an x265_api structure
54
+    and an x265_api_get() function. Applications which use
55
+    this interface to acquire the libx265 functional interface
56
+    will be able to use shim libraries to bind a particular build
57
+    of libx265 at run time. See the API documentation for full
58
+    details.
59
+
60
+-------------------------------------------------------------------
61
 Sun Feb 22 09:07:11 UTC 2015 - aloisio@gmx.com
62
 
63
 - soname bump
64
x265.spec Changed
23
 
1
@@ -1,10 +1,10 @@
2
 # based on the spec file from https://build.opensuse.org/package/view_file/home:Simmphonie/libx265/
3
 
4
 Name:           x265
5
-%define soname  43
6
+%define soname  51
7
 %define libname lib%{name}
8
 %define libsoname %{libname}-%{soname}
9
-Version:        1.5
10
+Version:        1.6
11
 Release:        0
12
 License:        GPL-2.0+
13
 Summary:        A free h265/HEVC encoder - encoder binary
14
@@ -45,7 +45,7 @@
15
 %prep
16
 %setup -q -n "%{name}_%{version}/build/linux"
17
 cd ../..
18
-%patch0 -p1
19
+%patch0
20
 cd -
21
 %define FAKE_BUILDDATE %(LC_ALL=C date -u -r %{_sourcedir}/%{name}.changes '+%%b %%e %%Y')
22
 sed -i -e "s/0.0/%{soname}.0/g" ../../source/cmake/version.cmake
23
arm.patch Changed
39
 
1
@@ -1,7 +1,6 @@
2
-diff -urN a/source/CMakeLists.txt b/source/CMakeLists.txt
3
---- a/source/CMakeLists.txt    2015-02-10 14:15:13.000000000 -0700
4
-+++ b/source/CMakeLists.txt    2015-02-12 06:25:01.334927114 -0700
5
-@@ -46,10 +46,18 @@
6
+--- source/CMakeLists.txt.orig 2015-04-28 21:43:18.585528552 +0200
7
++++ source/CMakeLists.txt  2015-04-28 21:47:14.995334232 +0200
8
+@@ -50,10 +50,18 @@
9
          set(X64 1)
10
          add_definitions(-DX86_64=1)
11
      endif()
12
@@ -23,8 +22,8 @@
13
  else()
14
      message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
15
      message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
16
-@@ -133,8 +141,8 @@
17
-     if(X86 AND NOT X64)
18
+@@ -155,8 +163,8 @@
19
+     elseif(X86 AND NOT X64)
20
          add_definitions(-march=i686)
21
      endif()
22
 -    if(ARM)
23
@@ -32,11 +31,10 @@
24
 +    if(ARMV7)
25
 +        add_definitions(-fPIC)
26
      endif()
27
-     check_cxx_compiler_flag(-Wno-narrowing CC_HAS_NO_NARROWING) 
28
-     check_cxx_compiler_flag(-Wno-array-bounds CC_HAS_NO_ARRAY_BOUNDS) 
29
-diff -urN a/source/common/cpu.cpp b/source/common/cpu.cpp
30
---- a/source/common/cpu.cpp    2015-02-10 14:15:13.000000000 -0700
31
-+++ b/source/common/cpu.cpp    2015-02-12 06:25:01.334927114 -0700
32
+     if(FPROFILE_GENERATE)
33
+         if(INTEL_CXX)
34
+--- source/common/cpu.cpp.orig 2015-04-28 21:47:44.634923269 +0200
35
++++ source/common/cpu.cpp  2015-04-28 21:49:50.305468867 +0200
36
 @@ -37,7 +37,7 @@
37
  #include <machine/cpu.h>
38
  #endif
39
baselibs.conf Changed
4
 
1
@@ -1,1 +1,1 @@
2
-libx265-43
3
+libx265-51
4
x265_1.5.tar.gz/.hg_archival.txt -> x265_1.6.tar.gz/.hg_archival.txt Changed
8
 
1
@@ -1,4 +1,4 @@
2
 repo: 09fe40627f03a0f9c3e6ac78b22ac93da23f9fdf
3
-node: 9f0324125f53a12f766f6ed6f98f16e2f42337f4
4
+node: cbeb7d8a4880e4020c4545dd8e498432c3c6cad3
5
 branch: stable
6
-tag: 1.5
7
+tag: 1.6
8
x265_1.5.tar.gz/.hgtags -> x265_1.6.tar.gz/.hgtags Changed
6
 
1
@@ -13,3 +13,4 @@
2
 d6257335c5370ee54317a0426a12c1f0724b18b9 1.2
3
 c1e4fc0162c14fdb84f5c3bd404fb28cfe10a17f 1.3
4
 5e604833c5aa605d0b6efbe5234492b5e7d8ac61 1.4
5
+9f0324125f53a12f766f6ed6f98f16e2f42337f4 1.5
6
x265_1.5.tar.gz/doc/reST/api.rst -> x265_1.6.tar.gz/doc/reST/api.rst Changed
66
 
1
@@ -72,11 +72,13 @@
2
    process. All of the encoders must use the same maximum CTU size
3
    because many global variables are configured based on this size.
4
    Encoder allocation will fail if a mis-matched CTU size is attempted.
5
+   If no encoders are open, **x265_cleanup()** can be called to reset
6
+   the configured CTU size so a new size can be used.
7
 
8
 An encoder is allocated by calling **x265_encoder_open()**::
9
 
10
    /* x265_encoder_open:
11
-   *      create a new encoder handler, all parameters from x265_param are copied */
12
+    *      create a new encoder handler, all parameters from x265_param are copied */
13
    x265_encoder* x265_encoder_open(x265_param *);
14
 
15
 The returned pointer is then passed to all of the functions pertaining
16
@@ -337,10 +339,44 @@
17
    void x265_encoder_close(x265_encoder *);
18
 
19
 When the application has completed all encodes, it should call
20
-**x265_cleanup()** to free process global resources like the thread pool;
21
-particularly if a memory-leak detection tool is being used::
22
+**x265_cleanup()** to free process global resources, particularly if a memory-leak
23
+detection tool is being used. **x265_cleanup()** also resets the saved
24
+CTU size so it will be possible to create a new encoder with a different
25
+CTU size::
26
 
27
-   /***
28
-    * Release library static allocations
29
-    */
30
+   /* x265_cleanup:
31
+    *     release library static allocations, reset configured CTU size */
32
    void x265_cleanup(void);
33
+
34
+
35
+Multi-library Interface
36
+=======================
37
+
38
+If your application might want to make a runtime selection among
39
+a number of libx265 libraries (perhaps 8bpp and 16bpp), then you will
40
+want to use the multi-library interface.
41
+
42
+Instead of directly using all of the **x265_** methods documented
43
+above, you query an x265_api structure from your libx265 and then use
44
+the function pointers within that structure of the same name, but
45
+without the **x265_** prefix. So **x265_param_default()** becomes
46
+**api->param_default()**. The key method is x265_api_get()::
47
+
48
+    /* x265_api_get:
49
+     *   Retrieve the programming interface for a linked x265 library.
50
+     *   May return NULL if no library is available that supports the
51
+     *   requested bit depth. If bitDepth is 0, the function is guaranteed
52
+     *   to return a non-NULL x265_api pointer from the system default
53
+     *   libx265 */
54
+    const x265_api* x265_api_get(int bitDepth);
55
+
56
+The general idea is to request the API for the bitDepth you would prefer
57
+the encoder to use (8 or 10), and if that returns NULL you request the
58
+API for bitDepth=0, which returns the system default libx265.
59
+
60
+Note that using this multi-library API in your application is only the
61
+first step. Next your application must dynamically link to libx265 and
62
+then you must build and install a multi-lib configuration of libx265,
63
+which includes 8bpp and 16bpp builds of libx265 and a shim library which
64
+forwards x265_api_get() calls to the appropriate library using dynamic
65
+loading and binding.
66
x265_1.5.tar.gz/doc/reST/cli.rst -> x265_1.6.tar.gz/doc/reST/cli.rst Changed
201
 
1
@@ -171,19 +171,54 @@
2
    Over-allocation of frame threads will not improve performance, it
3
    will generally just increase memory use.
4
 
5
-.. option:: --threads <integer>
6
+   **Values:** any value between 8 and 16. Default is 0, auto-detect
7
 
8
-   Number of threads to allocate for the worker thread pool  This pool
9
-   is used for WPP and for distributed analysis and motion search:
10
-   :option:`--wpp` :option:`--pmode` and :option:`--pme` respectively.
11
+.. option:: --pools <string>, --numa-pools <string>
12
 
13
-   If :option:`--threads` 1 is specified, then no thread pool is
14
-   created. When no thread pool is created, all the thread pool
15
-   features are implicitly disabled. If all the pool features are
16
-   disabled by the user, then the pool is implicitly disabled.
17
+   Comma separated list of threads per NUMA node. If "none", then no worker
18
+   pools are created and only frame parallelism is possible. If NULL or ""
19
+   (default) x265 will use all available threads on each NUMA node::
20
 
21
-   Default 0, one thread is allocated per detected hardware thread
22
-   (logical CPU cores)
23
+   '+'  is a special value indicating all cores detected on the node
24
+   '*'  is a special value indicating all cores detected on the node and all remaining nodes
25
+   '-'  is a special value indicating no cores on the node, same as '0'
26
+
27
+   example strings for a 4-node system::
28
+
29
+   ""        - default, unspecified, all numa nodes are used for thread pools
30
+   "*"       - same as default
31
+   "none"    - no thread pools are created, only frame parallelism possible
32
+   "-"       - same as "none"
33
+   "10"      - allocate one pool, using up to 10 cores on node 0
34
+   "-,+"     - allocate one pool, using all cores on node 1
35
+   "+,-,+"   - allocate two pools, using all cores on nodes 0 and 2
36
+   "+,-,+,-" - allocate two pools, using all cores on nodes 0 and 2
37
+   "-,*"     - allocate three pools, using all cores on nodes 1, 2 and 3
38
+   "8,8,8,8" - allocate four pools with up to 8 threads in each pool
39
+
40
+   The total number of threads will be determined by the number of threads
41
+   assigned to all nodes. The worker threads will each be given affinity for
42
+   their node, they will not be allowed to migrate between nodes, but they
43
+   will be allowed to move between CPU cores within their node.
44
+
45
+   If the three pool features: :option:`--wpp` :option:`--pmode` and
46
+   :option:`--pme` are all disabled, then :option:`--pools` is ignored
47
+   and no thread pools are created.
48
+
49
+   If "none" is specified, then all three of the thread pool features are
50
+   implicitly disabled.
51
+
52
+   Multiple thread pools will be allocated for any NUMA node with more than
53
+   64 logical CPU cores. But any given thread pool will always use at most
54
+   one NUMA node.
55
+
56
+   Frame encoders are distributed between the available thread pools,
57
+   and the encoder will never generate more thread pools than
58
+   :option:`--frame-threads`.  The pools are used for WPP and for
59
+   distributed analysis and motion search.
60
+
61
+   Default "", one thread is allocated per detected hardware thread
62
+   (logical CPU cores) and one thread pool per NUMA node.
63
 
64
 .. option:: --wpp, --no-wpp
65
 
66
@@ -409,7 +444,30 @@
67
    If :option:`--level-idc` has been specified, the option adds the
68
    intention to support the High tier of that level. If your specified
69
    level does not support a High tier, a warning is issued and this
70
-   modifier flag is ignored.
71
+   modifier flag is ignored. If :option:`--level-idc` has been specified,
72
+   but not --high-tier, then the encoder will attempt to encode at the 
73
+   specified level, main tier first, turning on high tier only if 
74
+   necessary and available at that level.
75
+
76
+.. option:: --ref <1..16>
77
+
78
+   Max number of L0 references to be allowed. This number has a linear
79
+   multiplier effect on the amount of work performed in motion search,
80
+   but will generally have a beneficial effect on compression and
81
+   distortion.
82
+   
83
+   Note that x265 allows up to 16 L0 references but the HEVC
84
+   specification only allows a maximum of 8 total reference frames. So
85
+   if you have B frames enabled only 7 L0 refs are valid and if you
86
+   have :option:`--b-pyramid` enabled (which is enabled by default in
87
+   all presets), then only 6 L0 refs are the maximum allowed by the
88
+   HEVC specification.  If x265 detects that the total reference count
89
+   is greater than 8, it will issue a warning that the resulting stream
90
+   is non-compliant and it signals the stream as profile NONE and level
91
+   NONE but still allows the encode to continue.  Compliant HEVC
92
+   decoders may refuse to decode such streams.
93
+   
94
+   Default 3
95
 
96
 .. note::
97
    :option:`--profile`, :option:`--level-idc`, and
98
@@ -444,7 +502,7 @@
99
    +-------+---------------------------------------------------------------+
100
    | 3     | RDO mode and split decisions, chroma residual used for sa8d   |
101
    +-------+---------------------------------------------------------------+
102
-   | 4     | Adds RDO Quant                                                |
103
+   | 4     | Currently same as 3                                           |
104
    +-------+---------------------------------------------------------------+
105
    | 5     | Adds RDO prediction decisions                                 |
106
    +-------+---------------------------------------------------------------+
107
@@ -465,6 +523,23 @@
108
    and less frame parallelism as well. Because of this the faster
109
    presets use a CU size of 32. Default: 64
110
 
111
+.. option:: --min-cu-size <64|32|16|8>
112
+
113
+   Minimum CU size (width and height). By using 16 or 32 the encoder
114
+   will not analyze the cost of CUs below that minimum threshold,
115
+   saving considerable amounts of compute with a predictable increase
116
+   in bitrate. This setting has a large effect on performance on the
117
+   faster presets.
118
+
119
+   Default: 8 (minimum 8x8 CU for HEVC, best compression efficiency)
120
+
121
+.. note::
122
+
123
+   All encoders within a single process must use the same settings for
124
+   the CU size range. :option:`--ctu` and :option:`--min-cu-size` must
125
+   be consistent for all of them since the encoder configures several
126
+   key global data structures based on this range.
127
+
128
 .. option:: --rect, --no-rect
129
 
130
    Enable analysis of rectangular motion partitions Nx2N and 2NxN
131
@@ -494,14 +569,6 @@
132
    Measure full CU size (2Nx2N) merge candidates first; if no residual
133
    is found the analysis is short circuited. Default disabled
134
 
135
-.. option:: --fast-cbf, --no-fast-cbf
136
-
137
-   Short circuit analysis if a prediction is found that does not set
138
-   the coded block flag (aka: no residual was encoded).  It prevents
139
-   the encoder from perhaps finding other predictions that also have no
140
-   residual but require less signaling bits or have less distortion.
141
-   Only applicable for RD levels 5 and 6. Default disabled
142
-
143
 .. option:: --fast-intra, --no-fast-intra
144
 
145
    Perform an initial scan of every fifth intra angular mode, then
146
@@ -526,14 +593,6 @@
147
    Only effective at RD levels 3 and above, which perform RDO mode
148
    decisions.
149
 
150
-.. option:: --tskip, --no-tskip
151
-
152
-   Enable evaluation of transform skip (bypass DCT but still use
153
-   quantization) coding for 4x4 TU coded blocks.
154
-
155
-   Only effective at RD levels 3 and above, which perform RDO mode
156
-   decisions. Default disabled
157
-
158
 .. option:: --tskip-fast, --no-tskip-fast
159
 
160
    Only evaluate transform skip for NxN intra predictions (4x4 blocks).
161
@@ -567,6 +626,30 @@
162
 Options which affect the transform unit quad-tree, sometimes referred to
163
 as the residual quad-tree (RQT).
164
 
165
+.. option:: --rdoq-level <0|1|2>, --no-rdoq-level
166
+
167
+   Specify the amount of rate-distortion analysis to use within
168
+   quantization::
169
+
170
+   At level 0 rate-distortion cost is not considered in quant
171
+   
172
+   At level 1 rate-distortion cost is used to find optimal rounding
173
+   values for each level (and allows psy-rdoq to be effective). It
174
+   trades-off the signaling cost of the coefficient vs its post-inverse
175
+   quant distortion from the pre-quant coefficient. When
176
+   :option:`--psy-rdoq` is enabled, this formula is biased in favor of
177
+   more energy in the residual (larger coefficient absolute levels)
178
+   
179
+   At level 2 rate-distortion cost is used to make decimate decisions
180
+   on each 4x4 coding group, including the cost of signaling the group
181
+   within the group bitmap. If the total distortion of not signaling
182
+   the entire coding group is less than the rate cost, the block is
183
+   decimated. Next, it applies rate-distortion cost analysis to the
184
+   last non-zero coefficient, which can result in many (or all) of the
185
+   coding groups being decimated. Psy-rdoq is less effective at
186
+   preserving energy when RDOQ is at level 2, since it only has
187
+   influence over the level distortion costs.
188
+
189
 .. option:: --tu-intra-depth <1..4>
190
 
191
    The transform unit (residual) quad-tree begins with the same depth
192
@@ -593,9 +676,76 @@
193
    partitions, in which case a TU split is implied and thus the
194
    residual quad-tree begins one layer below the CU quad-tree.
195
 
196
+.. option:: --nr-intra <integer>, --nr-inter <integer>
197
+
198
+   Noise reduction - an adaptive deadzone applied after DCT
199
+   (subtracting from DCT coefficients), before quantization.  It does
200
+   no pixel-level filtering, doesn't cross DCT block boundaries, has no
201
x265_1.5.tar.gz/doc/reST/presets.rst -> x265_1.6.tar.gz/doc/reST/presets.rst Changed
63
 
1
@@ -24,19 +24,21 @@
2
 +==============+===========+===========+==========+========+======+========+======+========+==========+=========+
3
 | ctu          |   32      |    32     |   32     |  64    |  64  |   64   |  64  |  64    |   64     |   64    |
4
 +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
5
-| bframes      |    4      |     4     |    4     |   4    |  4   |    4   |  4   |   8    |    8     |    8    |
6
+| min-cu-size  |   16      |     8     |    8     |   8    |   8  |    8   |   8  |   8    |    8     |    8    |
7
 +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
8
-| b-adapt      |    0      |     0     |    0     |   0    |  2   |    2   |  2   |   2    |    2     |    2    |
9
+| bframes      |    3      |     3     |    4     |   4    |  4   |    4   |  4   |   8    |    8     |    8    |
10
 +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
11
-| rc-lookahead |   10      |    10     |   15     |  15    |  15  |   20   |  25  |   30   |   40     |   60    |
12
+| b-adapt      |    0      |     0     |    0     |   0    |  0   |    2   |  2   |   2    |    2     |    2    |
13
++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
14
+| rc-lookahead |    5      |    10     |   15     |  15    |  15  |   20   |  25  |   30   |   40     |   60    |
15
 +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
16
 | scenecut     |    0      |    40     |   40     |  40    |  40  |   40   |  40  |   40   |   40     |   40    |
17
 +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
18
-| refs         |    1      |     1     |    1     |   1    |  3   |    3   |  3   |   3    |    5     |    5    |
19
+| refs         |    1      |     1     |    1     |   1    |  2   |    3   |  3   |   3    |    5     |    5    |
20
 +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
21
 | me           |   dia     |   hex     |   hex    |  hex   | hex  |   hex  | star |  star  |   star   |   star  |
22
 +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
23
-| merange      |   25      |    44     |   57     |  57    |  57  |   57   | 57   |  57    |   57     |   92    |
24
+| merange      |   57      |    57     |   57     |  57    |  57  |   57   | 57   |  57    |   57     |   92    |
25
 +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
26
 | subme        |    0      |     1     |    1     |   2    |  2   |    2   |  3   |   3    |    4     |    5    |
27
 +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
28
@@ -60,12 +62,14 @@
29
 +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
30
 | weightb      |    0      |     0     |    0     |   0    |  0   |    0   |  0   |   1    |    1     |    1    |
31
 +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
32
-| aq-mode      |    0      |     0     |    2     |   2    |  2   |    2   |  2   |   2    |    2     |    2    |
33
+| aq-mode      |    0      |     0     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
34
 +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
35
 | cuTree       |    0      |     0     |    0     |   0    |  1   |    1   |  1   |   1    |    1     |    1    |
36
 +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
37
 | rdLevel      |    2      |     2     |    2     |   2    |  2   |    3   |  4   |   6    |    6     |    6    |
38
 +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
39
+| rdoq-level   |    0      |     0     |    0     |   0    |  0   |    0   |  2   |   2    |    2     |    2    |
40
++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
41
 | tu-intra     |    1      |     1     |    1     |   1    |  1   |    1   |  1   |   2    |    3     |    4    |
42
 +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
43
 | tu-inter     |    1      |     1     |    1     |   1    |  1   |    1   |  1   |   2    |    3     |    4    |
44
@@ -114,17 +118,12 @@
45
 modes which preserve high frequency noise:
46
 
47
     * :option:`--psy-rd` 0.5
48
+    * :option:`--rdoq-level` 1
49
     * :option:`--psy-rdoq` 30
50
 
51
-.. Note::
52
-
53
-    --psy-rdoq is only effective when RDOQuant is enabled, which is at
54
-    RD levels 4, 5, and 6 (presets slow and below).
55
-
56
 It lowers the strength of adaptive quantization, so residual energy can
57
 be more evenly distributed across the (noisy) picture:
58
 
59
-    * :option:`--aq-mode` 1
60
     * :option:`--aq-strength` 0.3
61
 
62
 And it similarly tunes rate control to prevent the slice QP from
63
x265_1.5.tar.gz/doc/reST/threading.rst -> x265_1.6.tar.gz/doc/reST/threading.rst Changed
154
 
1
@@ -2,41 +2,34 @@
2
 Threading
3
 *********
4
 
5
-Thread Pool
6
-===========
7
+Thread Pools
8
+============
9
 
10
-x265 creates a pool of worker threads and shares this thread pool
11
-with all encoders within the same process (it is process global, aka a
12
-singleton).  The number of threads within the thread pool is determined
13
-by the encoder which first allocates the pool, which by definition is
14
-the first encoder created within each process.
15
+x265 creates one or more thread pools per encoder, one pool per NUMA
16
+node (typically a CPU socket). :option:`--pools` specifies the number of
17
+pools and the number of threads per pool the encoder will allocate. By
18
+default x265 allocates one thread per (hyperthreaded) CPU core on each
19
+NUMA node.
20
 
21
-:option:`--threads` specifies the number of threads the encoder will
22
-try to allocate for its thread pool.  If the thread pool was already
23
-allocated this parameter is ignored.  By default x265 allocates one
24
-thread per (hyperthreaded) CPU core in your system.
25
+If you are running multiple encoders on a system with multiple NUMA
26
+nodes, it is recommended to isolate each of them to a single node in
27
+order to avoid the NUMA overhead of remote memory access.
28
 
29
-Work distribution is job based.  Idle worker threads ask their parent
30
-pool object for jobs to perform.  When no jobs are available, idle
31
-worker threads block and consume no CPU cycles.
32
+Work distribution is job based. Idle worker threads scan the job
33
+providers assigned to their thread pool for jobs to perform. When no
34
+jobs are available, the idle worker threads block and consume no CPU
35
+cycles.
36
 
37
 Objects which desire to distribute work to worker threads are known as
38
-job providers (and they derive from the JobProvider class).  When job
39
-providers have work they enqueue themselves into the pool's provider
40
-list (and dequeue themselves when they no longer have work).  The thread
41
+job providers (and they derive from the JobProvider class).  The thread
42
 pool has a method to **poke** awake a blocked idle thread, and job
43
 providers are recommended to call this method when they make new jobs
44
 available.
45
 
46
 Worker jobs are not allowed to block except when absolutely necessary
47
-for data locking. If a job becomes blocked, the worker thread is
48
-expected to drop that job and go back to the pool and find more work.
49
-
50
-.. note::
51
-
52
-   x265_cleanup() frees the process-global thread pool, allowing
53
-   it to be reallocated if necessary, but only if no encoders are
54
-   allocated at the time it is called.
55
+for data locking. If a job becomes blocked, the work function is
56
+expected to drop that job so the worker thread may go back to the pool
57
+and find more work.
58
 
59
 Wavefront Parallel Processing
60
 =============================
61
@@ -82,24 +75,35 @@
62
 thread count to be higher than if WPP was enabled.  The exact formulas
63
 are described in the next section.
64
 
65
+Bonded Task Groups
66
+==================
67
+
68
+If a worker thread job has work which can be performed in parallel by
69
+many threads, it may allocate a bonded task group and enlist the help of
70
+other idle worker threads in the same pool. Those threads will cooperate
71
+to complete the work of the bonded task group and then return to their
72
+idle states. The larger and more uniform those tasks are, the better the
73
+bonded task group will perform.
74
+
75
 Parallel Mode Analysis
76
-======================
77
+~~~~~~~~~~~~~~~~~~~~~~
78
 
79
 When :option:`--pmode` is enabled, each CU (at all depths from 64x64 to
80
-8x8) will distribute its analysis work to the thread pool. Each analysis
81
-job will measure the cost of one prediction for the CU: merge, skip,
82
-intra, inter (2Nx2N, Nx2N, 2NxN, and AMP). At slower presets, the amount
83
-of increased parallelism is often enough to be able to reduce frame
84
-parallelism while achieving the same overall CPU utilization. Reducing
85
-frame threads is often beneficial to ABR and VBV rate control.
86
+8x8) will distribute its analysis work to the thread pool via a bonded
87
+task group. Each analysis job will measure the cost of one prediction
88
+for the CU: merge, skip, intra, inter (2Nx2N, Nx2N, 2NxN, and AMP). At
89
+slower presets, the amount of increased parallelism is often enough to
90
+be able to reduce frame parallelism while achieving the same overall CPU
91
+utilization. Reducing frame threads is often beneficial to ABR and VBV
92
+rate control.
93
 
94
 Parallel Motion Estimation
95
-==========================
96
+~~~~~~~~~~~~~~~~~~~~~~~~~~
97
 
98
 When :option:`--pme` is enabled all of the analysis functions which
99
 perform motion searches to reference frames will distribute those motion
100
-searches as jobs for worker threads (if more than two motion searches
101
-are required).
102
+searches as jobs for worker threads via a bonded task group (if more
103
+than two motion searches are required).
104
 
105
 Frame Threading
106
 ===============
107
@@ -125,16 +129,21 @@
108
 for motion reference must be processed by the loop filters and the loop
109
 filters cannot run until a full row has been encoded, and it must run a
110
 full row behind the encode process so that the pixels below the row
111
-being filtered are available. When you add up all the row lags each
112
-frame ends up being 3 CTU rows behind its reference frames (the
113
-equivalent of 12 macroblock rows for x264)
114
+being filtered are available. On top of this, HEVC has two loop filters:
115
+deblocking and SAO, which must be run in series with a row lag between
116
+them. When you add up all the row lags each frame ends up being 3 CTU
117
+rows behind its reference frames (the equivalent of 12 macroblock rows
118
+for x264). And keep in mind the wave-front progression pattern; by the
119
+time the reference frame finishes the third row of CTUs, nearly half of
120
+the CTUs in the frame may be compressed (depending on the display aspect
121
+ratio).
122
 
123
 The third extenuating circumstance is that when a frame being encoded
124
 becomes blocked by a reference frame row being available, that frame's
125
 wave-front becomes completely stalled and when the row becomes available
126
 again it can take quite some time for the wave to be restarted, if it
127
-ever does. This makes WPP many times less effective when frame
128
-parallelism is in use.
129
+ever does. This makes WPP less effective when frame parallelism is in
130
+use.
131
 
132
 :option:`--merange` can have a negative impact on frame parallelism. If
133
 the range is too large, more rows of CTU lag must be added to ensure
134
@@ -213,13 +222,13 @@
135
 
136
 The lookahead module of x265 (the lowres pre-encode which determines
137
 scene cuts and slice types) uses the thread pool to distribute the
138
-lowres cost analysis to worker threads. It follows the same wave-front
139
-pattern as the main encoder except it works in reverse-scan order.
140
+lowres cost analysis to worker threads. It will use bonded task groups
141
+to perform batches of frame cost estimates, and it may optionally use
142
+bonded task groups to measure single frame cost estimates using slices.
143
 
144
-The function slicetypeDecide() itself may also be performed by a worker
145
-thread if your system has enough CPU cores to make this a beneficial
146
-trade-off, else it runs within the context of the thread which calls the
147
-x265_encoder_encode().
148
+The function slicetypeDecide() itself is also performed by a worker
149
+thread if your encoder has a thread pool, else it runs within the
150
+context of the thread which calls the x265_encoder_encode().
151
 
152
 SAO
153
 ===
154
x265_1.6.tar.gz/readme.rst Added
16
 
1
@@ -0,0 +1,14 @@
2
+=================
3
+x265 HEVC Encoder
4
+=================
5
+
6
+| **Read:** | Online `documentation <http://x265.readthedocs.org/en/default/>`_ | Developer `wiki <http://bitbucket.org/multicoreware/x265/wiki/>`_
7
+| **Download:** | `releases <http://bitbucket.org/multicoreware/x265/downloads/>`_ 
8
+| **Interact:** | #x265 on freenode.irc.net | `x265-devel@videolan.org <http://mailman.videolan.org/listinfo/x265-devel>`_ | `Report an issue <https://bitbucket.org/multicoreware/x265/issues?status=new&status=open>`_
9
+
10
+`x265 <https://www.videolan.org/developers/x265.html>`_ is an open
11
+source HEVC encoder. See the developer wiki for instructions for
12
+downloading and building the source.
13
+
14
+x265 is free to use under the `GNU GPL <http://www.gnu.org/licenses/gpl-2.0.html>`_ 
15
+and is also available under a commercial `license <http://x265.org>`_ 
16
x265_1.5.tar.gz/source/CMakeLists.txt -> x265_1.6.tar.gz/source/CMakeLists.txt Changed
201
 
1
@@ -12,6 +12,9 @@
2
 if(POLICY CMP0042)
3
     cmake_policy(SET CMP0042 NEW) # MACOSX_RPATH
4
 endif()
5
+if(POLICY CMP0054)
6
+    cmake_policy(SET CMP0054 OLD) # Only interpret if() arguments as variables or keywords when unquoted
7
+endif()
8
 
9
 project (x265)
10
 cmake_minimum_required (VERSION 2.8.8) # OBJECT libraries require 2.8.8
11
@@ -20,8 +23,14 @@
12
 include(CheckSymbolExists)
13
 include(CheckCXXCompilerFlag)
14
 
15
+option(FPROFILE_GENERATE "Compile executable to generate usage data" OFF)
16
+option(FPROFILE_USE "Compile executable using generated usage data" OFF)
17
+option(NATIVE_BUILD "Target the build CPU" OFF)
18
+option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
19
+mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
20
+
21
 # X265_BUILD must be incremented each time the public API is changed
22
-set(X265_BUILD 43)
23
+set(X265_BUILD 51)
24
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
25
                "${PROJECT_BINARY_DIR}/x265.def")
26
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
27
@@ -29,11 +38,6 @@
28
 
29
 SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}")
30
 
31
-option(CHECKED_BUILD "Enable run-time sanity checks (debugging)" OFF)
32
-if(CHECKED_BUILD)
33
-    add_definitions(-DCHECKED_BUILD=1)
34
-endif()
35
-
36
 # System architecture detection
37
 string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
38
 set(X86_ALIASES x86 i386 i686 x86_64 amd64)
39
@@ -61,6 +65,19 @@
40
     if(LIBRT)
41
         list(APPEND PLATFORM_LIBS rt)
42
     endif()
43
+    find_package(Numa)
44
+    if(NUMA_FOUND)
45
+        list(APPEND CMAKE_REQUIRED_LIBRARIES ${NUMA_LIBRARY})
46
+        check_symbol_exists(numa_node_of_cpu numa.h NUMA_V2)
47
+        if(NUMA_V2)
48
+            add_definitions(-DHAVE_LIBNUMA)
49
+            message(STATUS "libnuma found, building with support for NUMA nodes")
50
+            list(APPEND PLATFORM_LIBS ${NUMA_LIBRARY})
51
+            link_directories(${NUMA_LIBRARY_DIR})
52
+            include_directories(${NUMA_INCLUDE_DIR})
53
+        endif()
54
+    endif()
55
+    mark_as_advanced(LIBRT NUMA_FOUND)
56
 endif(UNIX)
57
 
58
 if(X64 AND NOT WIN32)
59
@@ -77,13 +94,13 @@
60
   add_definitions(-DMACOS)
61
 endif()
62
 
63
-if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
64
+if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
65
     set(CLANG 1)
66
 endif()
67
-if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
68
+if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel")
69
     set(INTEL_CXX 1)
70
 endif()
71
-if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
72
+if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
73
     set(GCC 1)
74
 endif()
75
 
76
@@ -92,13 +109,12 @@
77
     set(MSVC 1)
78
 endif()
79
 if(MSVC)
80
-    option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
81
-    if (STATIC_LINK_CRT)
82
+    if(STATIC_LINK_CRT)
83
         set(CompilerFlags CMAKE_CXX_FLAGS_RELEASE CMAKE_C_FLAGS_RELEASE)
84
         foreach(CompilerFlag ${CompilerFlags})
85
             string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
86
         endforeach()
87
-    endif (STATIC_LINK_CRT)
88
+    endif(STATIC_LINK_CRT)
89
     add_definitions(/W4)  # Full warnings
90
     add_definitions(/Ob2) # always inline
91
     add_definitions(/MP)  # multithreaded build
92
@@ -130,12 +146,56 @@
93
     if(ENABLE_PIC)
94
          add_definitions(-fPIC)
95
     endif(ENABLE_PIC)
96
-    if(X86 AND NOT X64)
97
+    if(NATIVE_BUILD)
98
+        if(INTEL_CXX)
99
+            add_definitions(-xhost)
100
+        else()
101
+            add_definitions(-march=native)
102
+        endif()
103
+    elseif(X86 AND NOT X64)
104
         add_definitions(-march=i686)
105
     endif()
106
     if(ARM)
107
         add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp)
108
     endif()
109
+    if(FPROFILE_GENERATE)
110
+        if(INTEL_CXX)
111
+            add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
112
+            list(APPEND LINKER_OPTIONS "-prof-gen")
113
+        else()
114
+            check_cxx_compiler_flag(-fprofile-generate CC_HAS_PROFILE_GENERATE)
115
+            if(CC_HAS_PROFILE_GENERATE)
116
+                add_definitions(-fprofile-generate)
117
+                list(APPEND LINKER_OPTIONS "-fprofile-generate")
118
+            endif(CC_HAS_PROFILE_GENERATE)
119
+        endif(INTEL_CXX)
120
+    endif(FPROFILE_GENERATE)
121
+    if(FPROFILE_USE)
122
+        if(INTEL_CXX)
123
+            add_definitions(-prof-use -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
124
+            list(APPEND LINKER_OPTIONS "-prof-use")
125
+        else()
126
+            check_cxx_compiler_flag(-fprofile-use CC_HAS_PROFILE_USE)
127
+            check_cxx_compiler_flag(-fprofile-correction CC_HAS_PROFILE_CORRECTION)
128
+            check_cxx_compiler_flag(-Wno-error=coverage-mismatch CC_HAS_COVMISMATCH)
129
+            if(CC_HAS_PROFILE_USE)
130
+                add_definitions(-fprofile-use)
131
+                list(APPEND LINKER_OPTIONS "-fprofile-use")
132
+            endif(CC_HAS_PROFILE_USE)
133
+            if(CC_HAS_PROFILE_CORRECTION)
134
+                # auto-correct corrupted counters (happens a lot with x265)
135
+                add_definitions(-fprofile-correction)
136
+            endif(CC_HAS_PROFILE_CORRECTION)
137
+            if(CC_HAS_COVMISMATCH)
138
+                # ignore coverage mismatches (also happens a lot)
139
+                add_definitions(-Wno-error=coverage-mismatch)
140
+            endif(CC_HAS_COVMISMATCH)
141
+        endif(INTEL_CXX)
142
+    endif(FPROFILE_USE)
143
+    if(STATIC_LINK_CRT)
144
+        add_definitions(-static)
145
+        list(APPEND LINKER_OPTIONS "-static")
146
+    endif(STATIC_LINK_CRT)
147
     check_cxx_compiler_flag(-Wno-narrowing CC_HAS_NO_NARROWING) 
148
     check_cxx_compiler_flag(-Wno-array-bounds CC_HAS_NO_ARRAY_BOUNDS) 
149
     if (CC_HAS_NO_ARRAY_BOUNDS)
150
@@ -154,6 +214,35 @@
151
     if(CC_HAS_FNO_EXCEPTIONS_FLAG)
152
         add_definitions(-fno-exceptions)
153
     endif()
154
+    set(FSANITIZE "" CACHE STRING "-fsanitize options for GCC/clang")
155
+    if(FSANITIZE)
156
+        add_definitions(-fsanitize=${FSANITIZE})
157
+        # clang and gcc need the sanitize options to be passed at link
158
+        # time so the appropriate ASAN/TSAN runtime libraries can be
159
+        # linked.
160
+        list(APPEND LINKER_OPTIONS "-fsanitize=${FSANITIZE}")
161
+    endif()
162
+    option(ENABLE_AGGRESSIVE_CHECKS "Enable stack protection and -ftrapv" OFF)
163
+    if(ENABLE_AGGRESSIVE_CHECKS)
164
+        # use with care, -ftrapv can cause testbench SIGILL exceptions
165
+        # since it is testing corner cases of signed integer math
166
+        add_definitions(-DUSING_FTRAPV=1)
167
+        check_cxx_compiler_flag(-fsanitize=undefined-trap CC_HAS_CATCH_UNDEFINED) # clang
168
+        check_cxx_compiler_flag(-ftrapv CC_HAS_FTRAPV)                            # gcc
169
+        check_cxx_compiler_flag(-fstack-protector-all CC_HAS_STACK_PROTECT)       # gcc
170
+        if(CC_HAS_FTRAPV)
171
+            add_definitions(-ftrapv)
172
+        endif()
173
+        if(CC_HAS_CATCH_UNDEFINED)
174
+            add_definitions(-fsanitize=undefined-trap -fsanitize-undefined-trap-on-error)
175
+        endif()
176
+        if(CC_HAS_STACK_PROTECT)
177
+            add_definitions(-fstack-protector-all)
178
+            if(MINGW)
179
+                list(APPEND PLATFORM_LIBS ssp)
180
+            endif()
181
+        endif()
182
+    endif(ENABLE_AGGRESSIVE_CHECKS)
183
     execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE CC_VERSION)
184
 endif(GCC)
185
 
186
@@ -168,6 +257,11 @@
187
     endif()
188
 endif()
189
 
190
+option(CHECKED_BUILD "Enable run-time sanity checks (debugging)" OFF)
191
+if(CHECKED_BUILD)
192
+    add_definitions(-DCHECKED_BUILD=1)
193
+endif()
194
+
195
 # Build options
196
 set(LIB_INSTALL_DIR lib CACHE STRING "Install location of libraries")
197
 set(BIN_INSTALL_DIR bin CACHE STRING "Install location of executables")
198
@@ -179,6 +273,7 @@
199
     # can disable this if(X64) check if you desperately need a 32bit
200
     # build with 10bit/12bit support, but this violates the "shrink wrap
201
x265_1.6.tar.gz/source/cmake/FindNuma.cmake Added
45
 
1
@@ -0,0 +1,43 @@
2
+# Module for locating libnuma
3
+#
4
+# Read-only variables:
5
+#   NUMA_FOUND
6
+#     Indicates that the library has been found.
7
+#
8
+#   NUMA_INCLUDE_DIR
9
+#     Points to the libnuma include directory.
10
+#
11
+#   NUMA_LIBRARY_DIR
12
+#     Points to the directory that contains the libraries.
13
+#     The content of this variable can be passed to link_directories.
14
+#
15
+#   NUMA_LIBRARY
16
+#     Points to the libnuma that can be passed to target_link_libararies.
17
+#
18
+# Copyright (c) 2015 Steve Borho
19
+
20
+include(FindPackageHandleStandardArgs)
21
+
22
+find_path(NUMA_ROOT_DIR
23
+  NAMES include/numa.h
24
+  PATHS ENV NUMA_ROOT
25
+  DOC "NUMA root directory")
26
+
27
+find_path(NUMA_INCLUDE_DIR
28
+  NAMES numa.h
29
+  HINTS ${NUMA_ROOT_DIR}
30
+  PATH_SUFFIXES include
31
+  DOC "NUMA include directory")
32
+
33
+find_library(NUMA_LIBRARY
34
+  NAMES numa
35
+  HINTS ${NUMA_ROOT_DIR}
36
+  DOC "NUMA library")
37
+
38
+if (NUMA_LIBRARY)
39
+    get_filename_component(NUMA_LIBRARY_DIR ${NUMA_LIBRARY} PATH)
40
+endif()
41
+
42
+mark_as_advanced(NUMA_INCLUDE_DIR NUMA_LIBRARY_DIR NUMA_LIBRARY)
43
+
44
+find_package_handle_standard_args(NUMA REQUIRED_VARS NUMA_ROOT_DIR NUMA_INCLUDE_DIR NUMA_LIBRARY)
45
x265_1.5.tar.gz/source/cmake/version.cmake -> x265_1.6.tar.gz/source/cmake/version.cmake Changed
71
 
1
@@ -10,9 +10,9 @@
2
 set(X265_LATEST_TAG "0.0")
3
 set(X265_TAG_DISTANCE "0")
4
 
5
-if(EXISTS ${CMAKE_SOURCE_DIR}/../.hg_archival.txt)
6
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/../.hg_archival.txt)
7
     # read the lines of the archive summary file to extract the version
8
-    file(READ ${CMAKE_SOURCE_DIR}/../.hg_archival.txt archive)
9
+    file(READ ${CMAKE_CURRENT_SOURCE_DIR}/../.hg_archival.txt archive)
10
     STRING(REGEX REPLACE "\n" ";" archive "${archive}")
11
     foreach(f ${archive})
12
         string(FIND "${f}" ": " pos)
13
@@ -29,7 +29,7 @@
14
         string(SUBSTRING "${hg_node}" 0 16 hg_id)
15
         set(X265_VERSION "${hg_latesttag}+${hg_latesttagdistance}-${hg_id}")
16
     endif()
17
-elseif(HG_EXECUTABLE AND EXISTS ${CMAKE_SOURCE_DIR}/../.hg)
18
+elseif(HG_EXECUTABLE AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/../.hg)
19
     if(EXISTS "${HG_EXECUTABLE}.bat")
20
         # mercurial source installs on Windows require .bat extension
21
         set(HG_EXECUTABLE "${HG_EXECUTABLE}.bat")
22
@@ -38,14 +38,14 @@
23
 
24
     execute_process(COMMAND
25
         ${HG_EXECUTABLE} log -r. --template "{latesttag}"
26
-        WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
27
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
28
         OUTPUT_VARIABLE X265_LATEST_TAG
29
         ERROR_QUIET
30
         OUTPUT_STRIP_TRAILING_WHITESPACE
31
         )
32
     execute_process(COMMAND
33
         ${HG_EXECUTABLE} log -r. --template "{latesttagdistance}"
34
-        WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
35
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
36
         OUTPUT_VARIABLE X265_TAG_DISTANCE
37
         ERROR_QUIET
38
         OUTPUT_STRIP_TRAILING_WHITESPACE
39
@@ -53,7 +53,7 @@
40
     execute_process(
41
         COMMAND
42
         ${HG_EXECUTABLE} log -r. --template "{node|short}"
43
-        WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
44
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
45
         OUTPUT_VARIABLE HG_REVISION_ID
46
         ERROR_QUIET
47
         OUTPUT_STRIP_TRAILING_WHITESPACE
48
@@ -67,11 +67,11 @@
49
     else()
50
         set(X265_VERSION "${X265_LATEST_TAG}+${X265_TAG_DISTANCE}-${HG_REVISION_ID}")
51
     endif()
52
-elseif(GIT_EXECUTABLE AND EXISTS ${CMAKE_SOURCE_DIR}/../.git)
53
+elseif(GIT_EXECUTABLE AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/../.git)
54
     execute_process(
55
         COMMAND
56
         ${GIT_EXECUTABLE} describe --tags --abbrev=0
57
-        WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
58
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
59
         OUTPUT_VARIABLE X265_LATEST_TAG
60
         ERROR_QUIET
61
         OUTPUT_STRIP_TRAILING_WHITESPACE
62
@@ -80,7 +80,7 @@
63
     execute_process(
64
         COMMAND
65
         ${GIT_EXECUTABLE} describe --tags
66
-        WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
67
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
68
         OUTPUT_VARIABLE X265_VERSION
69
         ERROR_QUIET
70
         OUTPUT_STRIP_TRAILING_WHITESPACE
71
x265_1.5.tar.gz/source/common/CMakeLists.txt -> x265_1.6.tar.gz/source/common/CMakeLists.txt Changed
19
 
1
@@ -1,7 +1,7 @@
2
 # vim: syntax=cmake
3
 
4
 if(ENABLE_ASSEMBLY)
5
-    set_source_files_properties(primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
6
+    set_source_files_properties(threading.cpp primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
7
 
8
     set(SSE3  vec/dct-sse3.cpp)
9
     set(SSSE3 vec/dct-ssse3.cpp)
10
@@ -48,7 +48,7 @@
11
     if(HIGH_BIT_DEPTH)
12
         set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm)
13
     else()
14
-        set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm ipfilter8.asm loopfilter.asm)
15
+        set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm ipfilter8.asm loopfilter.asm)
16
     endif()
17
 
18
     if(NOT X64)
19
x265_1.5.tar.gz/source/common/bitstream.cpp -> x265_1.6.tar.gz/source/common/bitstream.cpp Changed
31
 
1
@@ -27,7 +27,7 @@
2
         uint8_t *temp = X265_MALLOC(uint8_t, m_byteAlloc * 2);
3
         if (temp)
4
         {
5
-            ::memcpy(temp, m_fifo, m_byteOccupancy);
6
+            memcpy(temp, m_fifo, m_byteOccupancy);
7
             X265_FREE(m_fifo);
8
             m_fifo = temp;
9
             m_byteAlloc *= 2;
10
@@ -44,7 +44,7 @@
11
 void Bitstream::write(uint32_t val, uint32_t numBits)
12
 {
13
     X265_CHECK(numBits <= 32, "numBits out of range\n");
14
-    X265_CHECK(numBits == 32 || ((val & (~0 << numBits)) == 0), "numBits & val out of range\n");
15
+    X265_CHECK(numBits == 32 || ((val & (~0u << numBits)) == 0), "numBits & val out of range\n");
16
 
17
     uint32_t totalPartialBits = m_partialByteBits + numBits;
18
     uint32_t nextPartialBits = totalPartialBits & 7;
19
@@ -55,7 +55,11 @@
20
     {
21
         /* topword aligns m_partialByte with the msb of val */
22
         uint32_t topword = (numBits - nextPartialBits) & ~7;
23
+#if USING_FTRAPV
24
+        uint32_t write_bits = (topword < 32 ? m_partialByte << topword : 0) | (val >> nextPartialBits);
25
+#else
26
         uint32_t write_bits = (m_partialByte << topword) | (val >> nextPartialBits);
27
+#endif
28
 
29
         switch (writeBytes)
30
         {
31
x265_1.5.tar.gz/source/common/common.cpp -> x265_1.6.tar.gz/source/common/common.cpp Changed
12
 
1
@@ -33,6 +33,10 @@
2
 #include <sys/time.h>
3
 #endif
4
 
5
+#if CHECKED_BUILD || _DEBUG
6
+int g_checkFailures;
7
+#endif
8
+
9
 int64_t x265_mdate(void)
10
 {
11
 #if _WIN32
12
x265_1.5.tar.gz/source/common/common.h -> x265_1.6.tar.gz/source/common/common.h Changed
81
 
1
@@ -74,13 +74,6 @@
2
 #define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16)))
3
 #define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32)))
4
 
5
-#if X265_ARCH_X86 && !defined(X86_64)
6
-extern "C" intptr_t x265_stack_align(void (*func)(), ...);
7
-#define x265_stack_align(func, ...) x265_stack_align((void (*)())func, __VA_ARGS__)
8
-#else
9
-#define x265_stack_align(func, ...) func(__VA_ARGS__)
10
-#endif
11
-
12
 #if defined(__MINGW32__)
13
 #define fseeko fseeko64
14
 #endif
15
@@ -90,7 +83,6 @@
16
 #define ALIGN_VAR_8(T, var)  __declspec(align(8)) T var
17
 #define ALIGN_VAR_16(T, var) __declspec(align(16)) T var
18
 #define ALIGN_VAR_32(T, var) __declspec(align(32)) T var
19
-#define x265_stack_align(func, ...) func(__VA_ARGS__)
20
 #define fseeko _fseeki64
21
 
22
 #endif // if defined(__GNUC__)
23
@@ -106,19 +98,20 @@
24
 #if _DEBUG && defined(_MSC_VER)
25
 #define DEBUG_BREAK() __debugbreak()
26
 #elif __APPLE_CC__
27
-#define DEBUG_BREAK() __builtin_trap();
28
+#define DEBUG_BREAK() __builtin_trap()
29
 #else
30
-#define DEBUG_BREAK()
31
+#define DEBUG_BREAK() abort()
32
 #endif
33
 
34
 /* If compiled with CHECKED_BUILD perform run-time checks and log any that
35
  * fail, both to stderr and to a file */
36
 #if CHECKED_BUILD || _DEBUG
37
+extern int g_checkFailures;
38
 #define X265_CHECK(expr, ...) if (!(expr)) { \
39
     x265_log(NULL, X265_LOG_ERROR, __VA_ARGS__); \
40
-    DEBUG_BREAK(); \
41
     FILE *fp = fopen("x265_check_failures.txt", "a"); \
42
     if (fp) { fprintf(fp, "%s:%d\n", __FILE__, __LINE__); fprintf(fp, __VA_ARGS__); fclose(fp); } \
43
+    g_checkFailures++; DEBUG_BREAK(); \
44
 }
45
 #if _MSC_VER
46
 #pragma warning(disable: 4127) // some checks have constant conditions
47
@@ -257,7 +250,7 @@
48
 #define UNIT_SIZE               (1 << LOG2_UNIT_SIZE)       // unit size of CU partition
49
 
50
 #define MAX_NUM_PARTITIONS      256
51
-#define NUM_CU_PARTITIONS       (1U << (g_maxFullDepth << 1))
52
+#define NUM_4x4_PARTITIONS      (1U << (g_unitSizeDepth << 1)) // number of 4x4 units in max CU size
53
 
54
 #define MIN_PU_SIZE             4
55
 #define MIN_TU_SIZE             4
56
@@ -376,6 +369,7 @@
57
     int32_t*    ref;
58
     uint8_t*    depth;
59
     uint8_t*    modes;
60
+    uint32_t*   bestMergeCand;
61
 };
62
 
63
 /* Stores intra analysis data for a single frame. This struct needs better packing */
64
@@ -384,6 +378,7 @@
65
     uint8_t*  depth;
66
     uint8_t*  modes;
67
     char*     partSizes;
68
+    uint8_t*  chromaModes;
69
 };
70
 
71
 enum TextType
72
@@ -430,6 +425,8 @@
73
 void     x265_free(void *ptr);
74
 char*    x265_slurp_file(const char *filename);
75
 
76
+void     x265_setup_primitives(x265_param* param, int cpu); /* primitives.cpp */
77
+
78
 #include "constants.h"
79
 
80
 #endif // ifndef X265_COMMON_H
81
x265_1.5.tar.gz/source/common/constants.cpp -> x265_1.6.tar.gz/source/common/constants.cpp Changed
13
 
1
@@ -119,9 +119,10 @@
2
     65535
3
 };
4
 
5
+int      g_ctuSizeConfigured = 0;
6
 uint32_t g_maxLog2CUSize = MAX_LOG2_CU_SIZE;
7
 uint32_t g_maxCUSize     = MAX_CU_SIZE;
8
-uint32_t g_maxFullDepth  = NUM_FULL_DEPTH - 1;
9
+uint32_t g_unitSizeDepth = NUM_CU_DEPTH;
10
 uint32_t g_maxCUDepth    = NUM_CU_DEPTH - 1;
11
 uint32_t g_zscanToRaster[MAX_NUM_PARTITIONS] = { 0, };
12
 uint32_t g_rasterToZscan[MAX_NUM_PARTITIONS] = { 0, };
13
x265_1.5.tar.gz/source/common/constants.h -> x265_1.6.tar.gz/source/common/constants.h Changed
19
 
1
@@ -29,6 +29,8 @@
2
 namespace x265 {
3
 // private namespace
4
 
5
+extern int g_ctuSizeConfigured;
6
+
7
 void initZscanToRaster(uint32_t maxFullDepth, uint32_t depth, uint32_t startVal, uint32_t*& curIdx);
8
 void initRasterToZscan(uint32_t maxFullDepth);
9
 
10
@@ -55,7 +57,7 @@
11
 extern uint32_t g_maxLog2CUSize;
12
 extern uint32_t g_maxCUSize;
13
 extern uint32_t g_maxCUDepth;
14
-extern uint32_t g_maxFullDepth;
15
+extern uint32_t g_unitSizeDepth; // Depth at which 4x4 unit occurs from max CU size
16
 
17
 extern const int16_t g_t4[4][4];
18
 extern const int16_t g_t8[8][8];
19
x265_1.5.tar.gz/source/common/cudata.cpp -> x265_1.6.tar.gz/source/common/cudata.cpp Changed
201
 
1
@@ -38,7 +38,7 @@
2
 void bcast1(uint8_t* dst, uint8_t val)  { dst[0] = val; }
3
 
4
 void copy4(uint8_t* dst, uint8_t* src)  { ((uint32_t*)dst)[0] = ((uint32_t*)src)[0]; }
5
-void bcast4(uint8_t* dst, uint8_t val)  { ((uint32_t*)dst)[0] = 0x01010101 * val; }
6
+void bcast4(uint8_t* dst, uint8_t val)  { ((uint32_t*)dst)[0] = 0x01010101u * val; }
7
 
8
 void copy16(uint8_t* dst, uint8_t* src) { ((uint64_t*)dst)[0] = ((uint64_t*)src)[0]; ((uint64_t*)dst)[1] = ((uint64_t*)src)[1]; }
9
 void bcast16(uint8_t* dst, uint8_t val) { uint64_t bval = 0x0101010101010101ULL * val; ((uint64_t*)dst)[0] = bval; ((uint64_t*)dst)[1] = bval; }
10
@@ -159,11 +159,11 @@
11
     m_chromaFormat  = csp;
12
     m_hChromaShift  = CHROMA_H_SHIFT(csp);
13
     m_vChromaShift  = CHROMA_V_SHIFT(csp);
14
-    m_numPartitions = NUM_CU_PARTITIONS >> (depth * 2);
15
+    m_numPartitions = NUM_4x4_PARTITIONS >> (depth * 2);
16
 
17
     if (!s_partSet[0])
18
     {
19
-        s_numPartInCUSize = 1 << g_maxFullDepth;
20
+        s_numPartInCUSize = 1 << g_unitSizeDepth;
21
         switch (g_maxLog2CUSize)
22
         {
23
         case 6:
24
@@ -272,7 +272,7 @@
25
     m_cuPelX        = (cuAddr % m_slice->m_sps->numCuInWidth) << g_maxLog2CUSize;
26
     m_cuPelY        = (cuAddr / m_slice->m_sps->numCuInWidth) << g_maxLog2CUSize;
27
     m_absIdxInCTU   = 0;
28
-    m_numPartitions = NUM_CU_PARTITIONS;
29
+    m_numPartitions = NUM_4x4_PARTITIONS;
30
 
31
     /* sequential memsets */
32
     m_partSet((uint8_t*)m_qp, (uint8_t)qp);
33
@@ -300,12 +300,12 @@
34
 // initialize Sub partition
35
 void CUData::initSubCU(const CUData& ctu, const CUGeom& cuGeom)
36
 {
37
-    m_absIdxInCTU   = cuGeom.encodeIdx;
38
+    m_absIdxInCTU   = cuGeom.absPartIdx;
39
     m_encData       = ctu.m_encData;
40
     m_slice         = ctu.m_slice;
41
     m_cuAddr        = ctu.m_cuAddr;
42
-    m_cuPelX        = ctu.m_cuPelX + g_zscanToPelX[cuGeom.encodeIdx];
43
-    m_cuPelY        = ctu.m_cuPelY + g_zscanToPelY[cuGeom.encodeIdx];
44
+    m_cuPelX        = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
45
+    m_cuPelY        = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
46
     m_cuLeft        = ctu.m_cuLeft;
47
     m_cuAbove       = ctu.m_cuAbove;
48
     m_cuAboveLeft   = ctu.m_cuAboveLeft;
49
@@ -392,7 +392,7 @@
50
     m_cuAbove      = cu.m_cuAbove;
51
     m_cuAboveLeft  = cu.m_cuAboveLeft;
52
     m_cuAboveRight = cu.m_cuAboveRight;
53
-    m_absIdxInCTU  = cuGeom.encodeIdx;
54
+    m_absIdxInCTU  = cuGeom.absPartIdx;
55
     m_numPartitions = cuGeom.numPartitions;
56
     memcpy(m_qp, cu.m_qp, BytesPerPartition * m_numPartitions);
57
     memcpy(m_mv[0],  cu.m_mv[0],  m_numPartitions * sizeof(MV));
58
@@ -462,9 +462,9 @@
59
     m_encData       = ctu.m_encData;
60
     m_slice         = ctu.m_slice;
61
     m_cuAddr        = ctu.m_cuAddr;
62
-    m_cuPelX        = ctu.m_cuPelX + g_zscanToPelX[cuGeom.encodeIdx];
63
-    m_cuPelY        = ctu.m_cuPelY + g_zscanToPelY[cuGeom.encodeIdx];
64
-    m_absIdxInCTU   = cuGeom.encodeIdx;
65
+    m_cuPelX        = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
66
+    m_cuPelY        = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
67
+    m_absIdxInCTU   = cuGeom.absPartIdx;
68
     m_numPartitions = cuGeom.numPartitions;
69
 
70
     /* copy out all prediction info for this part */
71
@@ -559,7 +559,7 @@
72
         return this;
73
     }
74
 
75
-    aPartUnitIdx = g_rasterToZscan[absPartIdx + NUM_CU_PARTITIONS - s_numPartInCUSize];
76
+    aPartUnitIdx = g_rasterToZscan[absPartIdx + NUM_4x4_PARTITIONS - s_numPartInCUSize];
77
     return m_cuAbove;
78
 }
79
 
80
@@ -581,7 +581,7 @@
81
                 return this;
82
             }
83
         }
84
-        alPartUnitIdx = g_rasterToZscan[absPartIdx + NUM_CU_PARTITIONS - s_numPartInCUSize - 1];
85
+        alPartUnitIdx = g_rasterToZscan[absPartIdx + NUM_4x4_PARTITIONS - s_numPartInCUSize - 1];
86
         return m_cuAbove;
87
     }
88
 
89
@@ -591,7 +591,7 @@
90
         return m_cuLeft;
91
     }
92
 
93
-    alPartUnitIdx = g_rasterToZscan[NUM_CU_PARTITIONS - 1];
94
+    alPartUnitIdx = g_rasterToZscan[NUM_4x4_PARTITIONS - 1];
95
     return m_cuAboveLeft;
96
 }
97
 
98
@@ -620,14 +620,14 @@
99
             }
100
             return NULL;
101
         }
102
-        arPartUnitIdx = g_rasterToZscan[absPartIdxRT + NUM_CU_PARTITIONS - s_numPartInCUSize + 1];
103
+        arPartUnitIdx = g_rasterToZscan[absPartIdxRT + NUM_4x4_PARTITIONS - s_numPartInCUSize + 1];
104
         return m_cuAbove;
105
     }
106
 
107
     if (!isZeroRow(absPartIdxRT, s_numPartInCUSize))
108
         return NULL;
109
 
110
-    arPartUnitIdx = g_rasterToZscan[NUM_CU_PARTITIONS - s_numPartInCUSize];
111
+    arPartUnitIdx = g_rasterToZscan[NUM_4x4_PARTITIONS - s_numPartInCUSize];
112
     return m_cuAboveRight;
113
 }
114
 
115
@@ -720,21 +720,21 @@
116
             }
117
             return NULL;
118
         }
119
-        arPartUnitIdx = g_rasterToZscan[absPartIdxRT + NUM_CU_PARTITIONS - s_numPartInCUSize + partUnitOffset];
120
+        arPartUnitIdx = g_rasterToZscan[absPartIdxRT + NUM_4x4_PARTITIONS - s_numPartInCUSize + partUnitOffset];
121
         return m_cuAbove;
122
     }
123
 
124
     if (!isZeroRow(absPartIdxRT, s_numPartInCUSize))
125
         return NULL;
126
 
127
-    arPartUnitIdx = g_rasterToZscan[NUM_CU_PARTITIONS - s_numPartInCUSize + partUnitOffset - 1];
128
+    arPartUnitIdx = g_rasterToZscan[NUM_4x4_PARTITIONS - s_numPartInCUSize + partUnitOffset - 1];
129
     return m_cuAboveRight;
130
 }
131
 
132
 /* Get left QpMinCu */
133
 const CUData* CUData::getQpMinCuLeft(uint32_t& lPartUnitIdx, uint32_t curAbsIdxInCTU) const
134
 {
135
-    uint32_t absZorderQpMinCUIdx = curAbsIdxInCTU & (0xFF << (g_maxFullDepth - m_slice->m_pps->maxCuDQPDepth) * 2);
136
+    uint32_t absZorderQpMinCUIdx = curAbsIdxInCTU & (0xFF << (g_unitSizeDepth - m_slice->m_pps->maxCuDQPDepth) * 2);
137
     uint32_t absRorderQpMinCUIdx = g_zscanToRaster[absZorderQpMinCUIdx];
138
 
139
     // check for left CTU boundary
140
@@ -751,7 +751,7 @@
141
 /* Get above QpMinCu */
142
 const CUData* CUData::getQpMinCuAbove(uint32_t& aPartUnitIdx, uint32_t curAbsIdxInCTU) const
143
 {
144
-    uint32_t absZorderQpMinCUIdx = curAbsIdxInCTU & (0xFF << (g_maxFullDepth - m_slice->m_pps->maxCuDQPDepth) * 2);
145
+    uint32_t absZorderQpMinCUIdx = curAbsIdxInCTU & (0xFF << (g_unitSizeDepth - m_slice->m_pps->maxCuDQPDepth) * 2);
146
     uint32_t absRorderQpMinCUIdx = g_zscanToRaster[absZorderQpMinCUIdx];
147
 
148
     // check for top CTU boundary
149
@@ -790,7 +790,7 @@
150
 
151
 int8_t CUData::getLastCodedQP(uint32_t absPartIdx) const
152
 {
153
-    uint32_t quPartIdxMask = 0xFF << (g_maxFullDepth - m_slice->m_pps->maxCuDQPDepth) * 2;
154
+    uint32_t quPartIdxMask = 0xFF << (g_unitSizeDepth - m_slice->m_pps->maxCuDQPDepth) * 2;
155
     int lastValidPartIdx = getLastValidPartIdx(absPartIdx & quPartIdxMask);
156
 
157
     if (lastValidPartIdx >= 0)
158
@@ -800,7 +800,7 @@
159
         if (m_absIdxInCTU)
160
             return m_encData->getPicCTU(m_cuAddr)->getLastCodedQP(m_absIdxInCTU);
161
         else if (m_cuAddr > 0 && !(m_slice->m_pps->bEntropyCodingSyncEnabled && !(m_cuAddr % m_slice->m_sps->numCuInWidth)))
162
-            return m_encData->getPicCTU(m_cuAddr - 1)->getLastCodedQP(NUM_CU_PARTITIONS);
163
+            return m_encData->getPicCTU(m_cuAddr - 1)->getLastCodedQP(NUM_4x4_PARTITIONS);
164
         else
165
             return (int8_t)m_slice->m_sliceQp;
166
     }
167
@@ -932,7 +932,7 @@
168
 
169
 bool CUData::setQPSubCUs(int8_t qp, uint32_t absPartIdx, uint32_t depth)
170
 {
171
-    uint32_t curPartNumb = NUM_CU_PARTITIONS >> (depth << 1);
172
+    uint32_t curPartNumb = NUM_4x4_PARTITIONS >> (depth << 1);
173
     uint32_t curPartNumQ = curPartNumb >> 2;
174
 
175
     if (m_cuDepth[absPartIdx] > depth)
176
@@ -1375,8 +1375,8 @@
177
     return true;
178
 }
179
 
180
-/* Construct list of merging candidates */
181
-uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField(*mvFieldNeighbours)[2], uint8_t* interDirNeighbours) const
182
+/* Construct list of merging candidates, returns count */
183
+uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField(*candMvField)[2], uint8_t* candDir) const
184
 {
185
     uint32_t absPartAddr = m_absIdxInCTU + absPartIdx;
186
     const bool isInterB = m_slice->isInterB();
187
@@ -1385,10 +1385,10 @@
188
 
189
     for (uint32_t i = 0; i < maxNumMergeCand; ++i)
190
     {
191
-        mvFieldNeighbours[i][0].mv = 0;
192
-        mvFieldNeighbours[i][1].mv = 0;
193
-        mvFieldNeighbours[i][0].refIdx = REF_NOT_VALID;
194
-        mvFieldNeighbours[i][1].refIdx = REF_NOT_VALID;
195
+        candMvField[i][0].mv = 0;
196
+        candMvField[i][1].mv = 0;
197
+        candMvField[i][0].refIdx = REF_NOT_VALID;
198
+        candMvField[i][1].refIdx = REF_NOT_VALID;
199
     }
200
 
201
x265_1.5.tar.gz/source/common/cudata.h -> x265_1.6.tar.gz/source/common/cudata.h Changed
126
 
1
@@ -64,7 +64,8 @@
2
     MD_ABOVE,       // MVP of above block
3
     MD_ABOVE_RIGHT, // MVP of above right block
4
     MD_BELOW_LEFT,  // MVP of below left block
5
-    MD_ABOVE_LEFT   // MVP of above left block
6
+    MD_ABOVE_LEFT,  // MVP of above left block
7
+    MD_COLLOCATED   // MVP of temporal neighbour
8
 };
9
 
10
 struct CUGeom
11
@@ -82,7 +83,7 @@
12
 
13
     uint32_t log2CUSize;    // Log of the CU size.
14
     uint32_t childOffset;   // offset of the first child CU from current CU
15
-    uint32_t encodeIdx;     // Encoding index of this CU in terms of 4x4 blocks.
16
+    uint32_t absPartIdx;    // Part index of this CU in terms of 4x4 blocks.
17
     uint32_t numPartitions; // Number of 4x4 blocks in the CU
18
     uint32_t depth;         // depth of this CU relative from CTU
19
     uint32_t flags;         // CU flags.
20
@@ -94,6 +95,26 @@
21
     int refIdx;
22
 };
23
 
24
+// Structure that keeps the neighbour's MV information.
25
+struct InterNeighbourMV
26
+{
27
+    // Neighbour MV. The index represents the list.
28
+    MV mv[2];
29
+
30
+    // Collocated right bottom CU addr.
31
+    uint32_t cuAddr[2];
32
+
33
+    // For spatial prediction, this field contains the reference index
34
+    // in each list (-1 if not available).
35
+    //
36
+    // For temporal prediction, the first value is used for the 
37
+    // prediction with list 0. The second value is used for the prediction 
38
+    // with list 1. For each value, the first four bits are the reference index 
39
+    // associated to the PMV, and the fifth bit is the list associated to the PMV.
40
+    // if both reference indices are -1, then unifiedRef is also -1
41
+    union { int16_t refIdx[2]; int32_t unifiedRef; };
42
+};
43
+
44
 typedef void(*cucopy_t)(uint8_t* dst, uint8_t* src); // dst and src are aligned to MIN(size, 32)
45
 typedef void(*cubcast_t)(uint8_t* dst, uint8_t val); // dst is aligned to MIN(size, 32)
46
 
47
@@ -122,9 +143,9 @@
48
     uint32_t      m_cuPelY;           // CU position within the picture, in pixels (Y)
49
     uint32_t      m_numPartitions;    // maximum number of 4x4 partitions within this CU
50
 
51
-    int           m_chromaFormat;
52
-    int           m_hChromaShift;
53
-    int           m_vChromaShift;
54
+    uint32_t      m_chromaFormat;
55
+    uint32_t      m_hChromaShift;
56
+    uint32_t      m_vChromaShift;
57
 
58
     /* Per-part data, stored contiguously */
59
     int8_t*       m_qp;               // array of QP values
60
@@ -158,7 +179,7 @@
61
     CUData();
62
 
63
     void     initialize(const CUDataMemPool& dataPool, uint32_t depth, int csp, int instance);
64
-    static void calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]);
65
+    static void calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, uint32_t minCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]);
66
 
67
     void     initCTU(const Frame& frame, uint32_t cuAddr, int qp);
68
     void     initSubCU(const CUData& ctu, const CUGeom& cuGeom);
69
@@ -195,9 +216,10 @@
70
     uint8_t  getCbf(uint32_t absPartIdx, TextType ttype, uint32_t tuDepth) const { return (m_cbf[ttype][absPartIdx] >> tuDepth) & 0x1; }
71
     uint8_t  getQtRootCbf(uint32_t absPartIdx) const                             { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx]; }
72
     int8_t   getRefQP(uint32_t currAbsIdxInCTU) const;
73
-    uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*mvFieldNeighbours)[2], uint8_t* interDirNeighbours) const;
74
+    uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*candMvField)[2], uint8_t* candDir) const;
75
     void     clipMv(MV& outMV) const;
76
-    int      fillMvpCand(uint32_t puIdx, uint32_t absPartIdx, int picList, int refIdx, MV* amvpCand, MV* mvc) const;
77
+    int      getPMV(InterNeighbourMV *neighbours, uint32_t reference_list, uint32_t refIdx, MV* amvpCand, MV* pmv) const;
78
+    void     getNeighbourMV(uint32_t puIdx, uint32_t absPartIdx, InterNeighbourMV* neighbours) const;
79
     void     getIntraTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const;
80
     void     getInterTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const;
81
 
82
@@ -213,10 +235,9 @@
83
     void     getAllowedChromaDir(uint32_t absPartIdx, uint32_t* modeList) const;
84
     int      getIntraDirLumaPredictor(uint32_t absPartIdx, uint32_t* intraDirPred) const;
85
 
86
-    uint32_t getSCUAddr() const                  { return (m_cuAddr << g_maxFullDepth * 2) + m_absIdxInCTU; }
87
+    uint32_t getSCUAddr() const                  { return (m_cuAddr << g_unitSizeDepth * 2) + m_absIdxInCTU; }
88
     uint32_t getCtxSplitFlag(uint32_t absPartIdx, uint32_t depth) const;
89
     uint32_t getCtxSkipFlag(uint32_t absPartIdx) const;
90
-    ScanType getCoefScanIdx(uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma, bool bIsIntra) const;
91
     void     getTUEntropyCodingParameters(TUEntropyCodingParameters &result, uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma) const;
92
 
93
     const CUData* getPULeft(uint32_t& lPartUnitIdx, uint32_t curPartUnitIdx) const;
94
@@ -241,15 +262,18 @@
95
 
96
     bool hasEqualMotion(uint32_t absPartIdx, const CUData& candCU, uint32_t candAbsPartIdx) const;
97
 
98
-    bool isDiffMER(int xN, int yN, int xP, int yP) const;
99
+    /* Check whether the current PU and a spatial neighboring PU are in same merge region */
100
+    bool isDiffMER(int xN, int yN, int xP, int yP) const { return ((xN >> 2) != (xP >> 2)) || ((yN >> 2) != (yP >> 2)); }
101
 
102
     // add possible motion vector predictor candidates
103
-    bool addMVPCand(MV& mvp, int picList, int refIdx, uint32_t absPartIdx, MVP_DIR dir) const;
104
-    bool addMVPCandOrder(MV& mvp, int picList, int refIdx, uint32_t absPartIdx, MVP_DIR dir) const;
105
+    bool getDirectPMV(MV& pmv, InterNeighbourMV *neighbours, uint32_t picList, uint32_t refIdx) const;
106
+    bool getIndirectPMV(MV& outMV, InterNeighbourMV *neighbours, uint32_t reference_list, uint32_t refIdx) const;
107
+    void getInterNeighbourMV(InterNeighbourMV *neighbour, uint32_t partUnitIdx, MVP_DIR dir) const;
108
 
109
     bool getColMVP(MV& outMV, int& outRefIdx, int picList, int cuAddr, int absPartIdx) const;
110
+    bool getCollocatedMV(int cuAddr, int partUnitIdx, InterNeighbourMV *neighbour) const;
111
 
112
-    void scaleMvByPOCDist(MV& outMV, const MV& inMV, int curPOC, int curRefPOC, int colPOC, int colRefPOC) const;
113
+    MV scaleMvByPOCDist(const MV& inMV, int curPOC, int curRefPOC, int colPOC, int colRefPOC) const;
114
 
115
     void     deriveLeftRightTopIdx(uint32_t puIdx, uint32_t& partIdxLT, uint32_t& partIdxRT) const;
116
 
117
@@ -278,7 +302,7 @@
118
 
119
     bool create(uint32_t depth, uint32_t csp, uint32_t numInstances)
120
     {
121
-        uint32_t numPartition = NUM_CU_PARTITIONS >> (depth * 2);
122
+        uint32_t numPartition = NUM_4x4_PARTITIONS >> (depth * 2);
123
         uint32_t cuSize = g_maxCUSize >> depth;
124
         uint32_t sizeL = cuSize * cuSize;
125
         uint32_t sizeC = sizeL >> (CHROMA_H_SHIFT(csp) + CHROMA_V_SHIFT(csp));
126
x265_1.5.tar.gz/source/common/dct.cpp -> x265_1.6.tar.gz/source/common/dct.cpp Changed
78
 
1
@@ -709,14 +709,12 @@
2
 
3
     return numSig;
4
 }
5
-
6
-int  count_nonzero_c(const int16_t* quantCoeff, int numCoeff)
7
+template<int trSize>
8
+int  count_nonzero_c(const int16_t* quantCoeff)
9
 {
10
     X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
11
-    X265_CHECK(numCoeff > 0 && (numCoeff & 15) == 0, "numCoeff invalid %d\n", numCoeff);
12
-
13
     int count = 0;
14
-
15
+    int numCoeff = trSize * trSize;
16
     for (int i = 0; i < numCoeff; i++)
17
     {
18
         count += quantCoeff[i] != 0;
19
@@ -754,6 +752,39 @@
20
     }
21
 }
22
 
23
+int findPosLast_c(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig)
24
+{
25
+    memset(coeffNum, 0, MLS_GRP_NUM * sizeof(*coeffNum));
26
+    memset(coeffFlag, 0, MLS_GRP_NUM * sizeof(*coeffFlag));
27
+    memset(coeffSign, 0, MLS_GRP_NUM * sizeof(*coeffSign));
28
+
29
+    int scanPosLast = 0;
30
+    do
31
+    {
32
+        const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
33
+
34
+        const uint32_t posLast = scan[scanPosLast++];
35
+
36
+        const int curCoeff = coeff[posLast];
37
+        const uint32_t isNZCoeff = (curCoeff != 0);
38
+        // get L1 sig map
39
+        // NOTE: the new algorithm is complicated, so I keep reference code here
40
+        //uint32_t posy   = posLast >> log2TrSize;
41
+        //uint32_t posx   = posLast - (posy << log2TrSize);
42
+        //uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
43
+        //const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
44
+        //sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
45
+        numSig -= isNZCoeff;
46
+
47
+        // TODO: optimize by instruction BTS
48
+        coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << coeffNum[cgIdx]);
49
+        coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff;
50
+        coeffNum[cgIdx] += (uint8_t)isNZCoeff;
51
+    }
52
+    while (numSig > 0);
53
+    return scanPosLast - 1;
54
+}
55
+
56
 }  // closing - anonymous file-static namespace
57
 
58
 namespace x265 {
59
@@ -775,12 +806,17 @@
60
     p.cu[BLOCK_8x8].idct   = idct8_c;
61
     p.cu[BLOCK_16x16].idct = idct16_c;
62
     p.cu[BLOCK_32x32].idct = idct32_c;
63
-    p.count_nonzero = count_nonzero_c;
64
     p.denoiseDct = denoiseDct_c;
65
+    p.cu[BLOCK_4x4].count_nonzero = count_nonzero_c<4>;
66
+    p.cu[BLOCK_8x8].count_nonzero = count_nonzero_c<8>;
67
+    p.cu[BLOCK_16x16].count_nonzero = count_nonzero_c<16>;
68
+    p.cu[BLOCK_32x32].count_nonzero = count_nonzero_c<32>;
69
 
70
     p.cu[BLOCK_4x4].copy_cnt   = copy_count<4>;
71
     p.cu[BLOCK_8x8].copy_cnt   = copy_count<8>;
72
     p.cu[BLOCK_16x16].copy_cnt = copy_count<16>;
73
     p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
74
+
75
+    p.findPosLast = findPosLast_c;
76
 }
77
 }
78
x265_1.5.tar.gz/source/common/deblock.cpp -> x265_1.6.tar.gz/source/common/deblock.cpp Changed
19
 
1
@@ -70,7 +70,7 @@
2
  * param Edge the direction of the edge in block boundary (horizonta/vertical), which is added newly */
3
 void Deblock::deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[])
4
 {
5
-    uint32_t absPartIdx = cuGeom.encodeIdx;
6
+    uint32_t absPartIdx = cuGeom.absPartIdx;
7
     uint32_t depth = cuGeom.depth;
8
     if (cu->m_predMode[absPartIdx] == MODE_NONE)
9
         return;
10
@@ -358,7 +358,7 @@
11
         int16_t m5  = (int16_t)src[offset];
12
         int16_t m2  = (int16_t)src[-offset * 2];
13
 
14
-        int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) << 2) + m2 - m5 + 4) >> 3));
15
+        int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) * 4) + m2 - m5 + 4) >> 3));
16
         src[-offset] = x265_clip(m3 + (delta & maskP));
17
         src[0] = x265_clip(m4 - (delta & maskQ));
18
     }
19
x265_1.5.tar.gz/source/common/framedata.h -> x265_1.6.tar.gz/source/common/framedata.h Changed
17
 
1
@@ -32,6 +32,7 @@
2
 // private namespace
3
 
4
 class PicYuv;
5
+class JobProvider;
6
 
7
 /* Per-frame data that is used during encodes and referenced while the picture
8
  * is available for reference. A FrameData instance is attached to a Frame as it
9
@@ -52,6 +53,7 @@
10
     PicYuv*        m_reconPic;
11
     bool           m_bHasReferences;   /* used during DPB/RPS updates */
12
     int            m_frameEncoderID;   /* the ID of the FrameEncoder encoding this frame */
13
+    JobProvider*   m_jobProvider;
14
 
15
     CUDataMemPool  m_cuMemPool;
16
     CUData*        m_picCTU;
17
x265_1.5.tar.gz/source/common/intrapred.cpp -> x265_1.6.tar.gz/source/common/intrapred.cpp Changed
43
 
1
@@ -27,6 +27,29 @@
2
 using namespace x265;
3
 
4
 namespace {
5
+
6
+template<int tuSize>
7
+void intraFilter(const pixel* samples, pixel* filtered) /* 1:2:1 filtering of left and top reference samples */
8
+{
9
+    const int tuSize2 = tuSize << 1;
10
+
11
+    pixel topLeft = samples[0], topLast = samples[tuSize2], leftLast = samples[tuSize2 + tuSize2];
12
+
13
+    // filtering top
14
+    for (int i = 1; i < tuSize2; i++)
15
+        filtered[i] = ((samples[i] << 1) + samples[i - 1] + samples[i + 1] + 2) >> 2;
16
+    filtered[tuSize2] = topLast;
17
+    
18
+    // filtering top-left
19
+    filtered[0] = ((topLeft << 1) + samples[1] + samples[tuSize2 + 1] + 2) >> 2;
20
+
21
+    // filtering left
22
+    filtered[tuSize2 + 1] = ((samples[tuSize2 + 1] << 1) + topLeft + samples[tuSize2 + 2] + 2) >> 2;
23
+    for (int i = tuSize2 + 2; i < tuSize2 + tuSize2; i++)
24
+        filtered[i] = ((samples[i] << 1) + samples[i - 1] + samples[i + 1] + 2) >> 2;
25
+    filtered[tuSize2 + tuSize2] = leftLast;
26
+}
27
+
28
 void dcPredFilter(const pixel* above, const pixel* left, pixel* dst, intptr_t dststride, int size)
29
 {
30
     // boundary pixels processing
31
@@ -216,6 +239,11 @@
32
 
33
 void setupIntraPrimitives_c(EncoderPrimitives& p)
34
 {
35
+    p.cu[BLOCK_4x4].intra_filter = intraFilter<4>;
36
+    p.cu[BLOCK_8x8].intra_filter = intraFilter<8>;
37
+    p.cu[BLOCK_16x16].intra_filter = intraFilter<16>;
38
+    p.cu[BLOCK_32x32].intra_filter = intraFilter<32>;
39
+
40
     p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = planar_pred_c<2>;
41
     p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = planar_pred_c<3>;
42
     p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = planar_pred_c<4>;
43
x265_1.5.tar.gz/source/common/ipfilter.cpp -> x265_1.6.tar.gz/source/common/ipfilter.cpp Changed
97
 
1
@@ -34,8 +34,27 @@
2
 #endif
3
 
4
 namespace {
5
+template<int dstStride, int width, int height>
6
+void pixelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst)
7
+{
8
+    int shift = IF_INTERNAL_PREC - X265_DEPTH;
9
+    int row, col;
10
+
11
+    for (row = 0; row < height; row++)
12
+    {
13
+        for (col = 0; col < width; col++)
14
+        {
15
+            int16_t val = src[col] << shift;
16
+            dst[col] = val - (int16_t)IF_INTERNAL_OFFS;
17
+        }
18
+
19
+        src += srcStride;
20
+        dst += dstStride;
21
+    }
22
+}
23
+
24
 template<int dstStride>
25
-void filterConvertPelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height)
26
+void filterPixelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height)
27
 {
28
     int shift = IF_INTERNAL_PREC - X265_DEPTH;
29
     int row, col;
30
@@ -65,8 +84,8 @@
31
         }
32
 
33
 #else
34
-        ::memset(txt - marginX, txt[0], marginX);
35
-        ::memset(txt + width, txt[width - 1], marginX);
36
+        memset(txt - marginX, txt[0], marginX);
37
+        memset(txt + width, txt[width - 1], marginX);
38
 #endif
39
 
40
         txt += stride;
41
@@ -378,7 +397,8 @@
42
     p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>;  \
43
     p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>;  \
44
     p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>;  \
45
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>;
46
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
47
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].chroma_p2s = pixelToShort_c<MAX_CU_SIZE / 2, W, H>; 
48
 
49
 #define CHROMA_422(W, H) \
50
     p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
51
@@ -386,7 +406,8 @@
52
     p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>;  \
53
     p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>;  \
54
     p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>;  \
55
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>;
56
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
57
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].chroma_p2s = pixelToShort_c<MAX_CU_SIZE / 2, W, H>; 
58
 
59
 #define CHROMA_444(W, H) \
60
     p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \
61
@@ -394,7 +415,8 @@
62
     p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>;  \
63
     p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>;  \
64
     p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>;  \
65
-    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>;
66
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \
67
+    p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].chroma_p2s = pixelToShort_c<MAX_CU_SIZE, W, H>; 
68
 
69
 #define LUMA(W, H) \
70
     p.pu[LUMA_ ## W ## x ## H].luma_hpp     = interp_horiz_pp_c<8, W, H>; \
71
@@ -403,7 +425,8 @@
72
     p.pu[LUMA_ ## W ## x ## H].luma_vps     = interp_vert_ps_c<8, W, H>;  \
73
     p.pu[LUMA_ ## W ## x ## H].luma_vsp     = interp_vert_sp_c<8, W, H>;  \
74
     p.pu[LUMA_ ## W ## x ## H].luma_vss     = interp_vert_ss_c<8, W, H>;  \
75
-    p.pu[LUMA_ ## W ## x ## H].luma_hvpp    = interp_hv_pp_c<8, W, H>;
76
+    p.pu[LUMA_ ## W ## x ## H].luma_hvpp    = interp_hv_pp_c<8, W, H>; \
77
+    p.pu[LUMA_ ## W ## x ## H].filter_p2s = pixelToShort_c<MAX_CU_SIZE, W, H>
78
 
79
 void setupFilterPrimitives_c(EncoderPrimitives& p)
80
 {
81
@@ -507,11 +530,11 @@
82
     CHROMA_444(48, 64);
83
     CHROMA_444(64, 16);
84
     CHROMA_444(16, 64);
85
-    p.luma_p2s = filterConvertPelToShort_c<MAX_CU_SIZE>;
86
+    p.luma_p2s = filterPixelToShort_c<MAX_CU_SIZE>;
87
 
88
-    p.chroma[X265_CSP_I444].p2s = filterConvertPelToShort_c<MAX_CU_SIZE>;
89
-    p.chroma[X265_CSP_I420].p2s = filterConvertPelToShort_c<MAX_CU_SIZE / 2>;
90
-    p.chroma[X265_CSP_I422].p2s = filterConvertPelToShort_c<MAX_CU_SIZE / 2>;
91
+    p.chroma[X265_CSP_I444].p2s = filterPixelToShort_c<MAX_CU_SIZE>;
92
+    p.chroma[X265_CSP_I420].p2s = filterPixelToShort_c<MAX_CU_SIZE / 2>;
93
+    p.chroma[X265_CSP_I422].p2s = filterPixelToShort_c<MAX_CU_SIZE / 2>;
94
 
95
     p.extendRowBorder = extendCURowColBorder;
96
 }
97
x265_1.5.tar.gz/source/common/lowres.cpp -> x265_1.6.tar.gz/source/common/lowres.cpp Changed
56
 
1
@@ -56,12 +56,11 @@
2
     CHECKED_MALLOC(propagateCost, uint16_t, cuCount);
3
 
4
     /* allocate lowres buffers */
5
-    for (int i = 0; i < 4; i++)
6
-    {
7
-        CHECKED_MALLOC(buffer[i], pixel, planesize);
8
-        /* initialize the whole buffer to prevent valgrind warnings on right edge */
9
-        memset(buffer[i], 0, sizeof(pixel) * planesize);
10
-    }
11
+    CHECKED_MALLOC_ZERO(buffer[0], pixel, 4 * planesize);
12
+
13
+    buffer[1] = buffer[0] + planesize;
14
+    buffer[2] = buffer[1] + planesize;
15
+    buffer[3] = buffer[2] + planesize;
16
 
17
     lowresPlane[0] = buffer[0] + padoffset;
18
     lowresPlane[1] = buffer[1] + padoffset;
19
@@ -96,9 +95,7 @@
20
 
21
 void Lowres::destroy()
22
 {
23
-    for (int i = 0; i < 4; i++)
24
-        X265_FREE(buffer[i]);
25
-
26
+    X265_FREE(buffer[0]);
27
     X265_FREE(intraCost);
28
     X265_FREE(intraMode);
29
 
30
@@ -126,13 +123,11 @@
31
 }
32
 
33
 // (re) initialize lowres state
34
-void Lowres::init(PicYuv *origPic, int poc, int type)
35
+void Lowres::init(PicYuv *origPic, int poc)
36
 {
37
-    bIntraCalculated = false;
38
     bLastMiniGopBFrame = false;
39
     bScenecut = true;  // could be a scene-cut, until ruled out by flash detection
40
     bKeyframe = false; // Not a keyframe unless identified by lookahead
41
-    sliceType = type;
42
     frameNum = poc;
43
     leadingBframes = 0;
44
     indB = 0;
45
@@ -158,8 +153,8 @@
46
 
47
     /* downscale and generate 4 hpel planes for lookahead */
48
     primitives.frameInitLowres(origPic->m_picOrg[0],
49
-                                      lowresPlane[0], lowresPlane[1], lowresPlane[2], lowresPlane[3],
50
-                                      origPic->m_stride, lumaStride, width, lines);
51
+                               lowresPlane[0], lowresPlane[1], lowresPlane[2], lowresPlane[3],
52
+                               origPic->m_stride, lumaStride, width, lines);
53
 
54
     /* extend hpel planes for motion search */
55
     extendPicBorder(lowresPlane[0], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY);
56
x265_1.5.tar.gz/source/common/lowres.h -> x265_1.6.tar.gz/source/common/lowres.h Changed
18
 
1
@@ -114,7 +114,6 @@
2
     int    lines;            // height of lowres frame in pixel lines
3
     int    leadingBframes;   // number of leading B frames for P or I
4
 
5
-    bool   bIntraCalculated;
6
     bool   bScenecut;        // Set to false if the frame cannot possibly be part of a real scenecut.
7
     bool   bKeyframe;
8
     bool   bLastMiniGopBFrame;
9
@@ -151,7 +150,7 @@
10
 
11
     bool create(PicYuv *origPic, int _bframes, bool bAqEnabled);
12
     void destroy();
13
-    void init(PicYuv *origPic, int poc, int sliceType);
14
+    void init(PicYuv *origPic, int poc);
15
 };
16
 }
17
 
18
x265_1.5.tar.gz/source/common/mv.h -> x265_1.6.tar.gz/source/common/mv.h Changed
21
 
1
@@ -56,12 +56,17 @@
2
 
3
     MV& operator >>=(int i)                    { x >>= i; y >>= i; return *this; }
4
 
5
+#if USING_FTRAPV
6
+    /* avoid signed left-shifts when -ftrapv is enabled */
7
+    MV& operator <<=(int i)                    { x *= (1 << i); y *= (1 << i); return *this; }
8
+    MV operator <<(int i) const                { return MV(x * (1 << i), y * (1 << i)); }
9
+#else
10
     MV& operator <<=(int i)                    { x <<= i; y <<= i; return *this; }
11
+    MV operator <<(int i) const                { return MV(x << i, y << i); }
12
+#endif
13
 
14
     MV operator >>(int i) const                { return MV(x >> i, y >> i); }
15
 
16
-    MV operator <<(int i) const                { return MV(x << i, y << i); }
17
-
18
     MV operator *(int16_t i) const             { return MV(x * i, y * i); }
19
 
20
     MV operator -(const MV& other) const       { return MV(x - other.x, y - other.y); }
21
x265_1.5.tar.gz/source/common/param.cpp -> x265_1.6.tar.gz/source/common/param.cpp Changed
201
 
1
@@ -52,9 +52,7 @@
2
  */
3
 
4
 #undef strtok_r
5
-char* strtok_r(char *      str,
6
-               const char *delim,
7
-               char **     nextp)
8
+char* strtok_r(char* str, const char* delim, char** nextp)
9
 {
10
     if (!str)
11
         str = *nextp;
12
@@ -87,20 +85,19 @@
13
 }
14
 
15
 extern "C"
16
-void x265_param_free(x265_param *p)
17
+void x265_param_free(x265_param* p)
18
 {
19
     return x265_free(p);
20
 }
21
 
22
 extern "C"
23
-void x265_param_default(x265_param *param)
24
+void x265_param_default(x265_param* param)
25
 {
26
     memset(param, 0, sizeof(x265_param));
27
 
28
     /* Applying default values to all elements in the param structure */
29
     param->cpuid = x265::cpu_detect();
30
     param->bEnableWavefront = 1;
31
-    param->poolNumThreads = 0;
32
     param->frameNumThreads = 0;
33
 
34
     param->logLevel = X265_LOG_INFO;
35
@@ -127,8 +124,10 @@
36
 
37
     /* CU definitions */
38
     param->maxCUSize = 64;
39
+    param->minCUSize = 8;
40
     param->tuQTMaxInterDepth = 1;
41
     param->tuQTMaxIntraDepth = 1;
42
+    param->maxTUSize = 32;
43
 
44
     /* Coding Structure */
45
     param->keyframeMin = 0;
46
@@ -139,6 +138,7 @@
47
     param->bFrameAdaptive = X265_B_ADAPT_TRELLIS;
48
     param->bBPyramid = 1;
49
     param->scenecutThreshold = 40; /* Magic number pulled in from x264 */
50
+    param->lookaheadSlices = 0;
51
 
52
     /* Intra Coding Tools */
53
     param->bEnableConstrainedIntra = 0;
54
@@ -153,10 +153,10 @@
55
     param->bEnableWeightedPred = 1;
56
     param->bEnableWeightedBiPred = 0;
57
     param->bEnableEarlySkip = 0;
58
-    param->bEnableCbfFastMode = 0;
59
     param->bEnableAMP = 0;
60
     param->bEnableRectInter = 0;
61
     param->rdLevel = 3;
62
+    param->rdoqLevel = 0;
63
     param->bEnableSignHiding = 1;
64
     param->bEnableTransformSkip = 0;
65
     param->bEnableTSkipFast = 0;
66
@@ -175,12 +175,13 @@
67
     param->crQpOffset = 0;
68
     param->rdPenalty = 0;
69
     param->psyRd = 0.3;
70
-    param->psyRdoq = 1.0;
71
+    param->psyRdoq = 0.0;
72
     param->analysisMode = 0;
73
     param->analysisFileName = NULL;
74
     param->bIntraInBFrames = 0;
75
     param->bLossless = 0;
76
     param->bCULossless = 0;
77
+    param->bEnableTemporalSubLayers = 0;
78
 
79
     /* Rate control options */
80
     param->rc.vbvMaxBitrate = 0;
81
@@ -232,7 +233,7 @@
82
 }
83
 
84
 extern "C"
85
-int x265_param_default_preset(x265_param *param, const char *preset, const char *tune)
86
+int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
87
 {
88
     x265_param_default(param);
89
 
90
@@ -245,10 +246,11 @@
91
 
92
         if (!strcmp(preset, "ultrafast"))
93
         {
94
-            param->lookaheadDepth = 10;
95
+            param->lookaheadDepth = 5;
96
             param->scenecutThreshold = 0; // disable lookahead
97
             param->maxCUSize = 32;
98
-            param->searchRange = 25;
99
+            param->minCUSize = 16;
100
+            param->bframes = 3;
101
             param->bFrameAdaptive = 0;
102
             param->subpelRefine = 0;
103
             param->searchMethod = X265_DIA_SEARCH;
104
@@ -267,7 +269,7 @@
105
         {
106
             param->lookaheadDepth = 10;
107
             param->maxCUSize = 32;
108
-            param->searchRange = 44;
109
+            param->bframes = 3;
110
             param->bFrameAdaptive = 0;
111
             param->subpelRefine = 1;
112
             param->bEnableEarlySkip = 1;
113
@@ -319,6 +321,8 @@
114
             param->bEnableRectInter = 1;
115
             param->lookaheadDepth = 25;
116
             param->rdLevel = 4;
117
+            param->rdoqLevel = 2;
118
+            param->psyRdoq = 1.0;
119
             param->subpelRefine = 3;
120
             param->maxNumMergeCand = 3;
121
             param->searchMethod = X265_STAR_SEARCH;
122
@@ -333,6 +337,8 @@
123
             param->tuQTMaxInterDepth = 2;
124
             param->tuQTMaxIntraDepth = 2;
125
             param->rdLevel = 6;
126
+            param->rdoqLevel = 2;
127
+            param->psyRdoq = 1.0;
128
             param->subpelRefine = 3;
129
             param->maxNumMergeCand = 3;
130
             param->searchMethod = X265_STAR_SEARCH;
131
@@ -348,6 +354,8 @@
132
             param->tuQTMaxInterDepth = 3;
133
             param->tuQTMaxIntraDepth = 3;
134
             param->rdLevel = 6;
135
+            param->rdoqLevel = 2;
136
+            param->psyRdoq = 1.0;
137
             param->subpelRefine = 4;
138
             param->maxNumMergeCand = 4;
139
             param->searchMethod = X265_STAR_SEARCH;
140
@@ -365,6 +373,8 @@
141
             param->tuQTMaxInterDepth = 4;
142
             param->tuQTMaxIntraDepth = 4;
143
             param->rdLevel = 6;
144
+            param->rdoqLevel = 2;
145
+            param->psyRdoq = 1.0;
146
             param->subpelRefine = 5;
147
             param->maxNumMergeCand = 5;
148
             param->searchMethod = X265_STAR_SEARCH;
149
@@ -415,11 +425,11 @@
150
             param->deblockingFilterBetaOffset = -2;
151
             param->deblockingFilterTCOffset = -2;
152
             param->bIntraInBFrames = 0;
153
+            param->rdoqLevel = 1;
154
             param->psyRdoq = 30;
155
             param->psyRd = 0.5;
156
             param->rc.ipFactor = 1.1;
157
             param->rc.pbFactor = 1.1;
158
-            param->rc.aqMode = X265_AQ_VARIANCE;
159
             param->rc.aqStrength = 0.3;
160
             param->rc.qCompress = 0.8;
161
         }
162
@@ -430,7 +440,7 @@
163
     return 0;
164
 }
165
 
166
-static int x265_atobool(const char *str, bool& bError)
167
+static int x265_atobool(const char* str, bool& bError)
168
 {
169
     if (!strcmp(str, "1") ||
170
         !strcmp(str, "true") ||
171
@@ -444,7 +454,7 @@
172
     return 0;
173
 }
174
 
175
-static double x265_atof(const char *str, bool& bError)
176
+static double x265_atof(const char* str, bool& bError)
177
 {
178
     char *end;
179
     double v = strtod(str, &end);
180
@@ -454,7 +464,7 @@
181
     return v;
182
 }
183
 
184
-static int parseName(const char *arg, const char * const * names, bool& bError)
185
+static int parseName(const char* arg, const char* const* names, bool& bError)
186
 {
187
     for (int i = 0; names[i]; i++)
188
         if (!strcmp(arg, names[i]))
189
@@ -471,7 +481,7 @@
190
 #define atobool(str) (bNameWasBool = true, x265_atobool(str, bError))
191
 
192
 extern "C"
193
-int x265_param_parse(x265_param *p, const char *name, const char *value)
194
+int x265_param_parse(x265_param* p, const char* name, const char* value)
195
 {
196
     bool bError = false;
197
     bool bNameWasBool = false;
198
@@ -543,7 +553,6 @@
199
             }
200
         }
201
x265_1.5.tar.gz/source/common/picyuv.cpp -> x265_1.6.tar.gz/source/common/picyuv.cpp Changed
40
 
1
@@ -84,7 +84,7 @@
2
  * allocated by the same encoder. */
3
 bool PicYuv::createOffsets(const SPS& sps)
4
 {
5
-    uint32_t numPartitions = 1 << (g_maxFullDepth * 2);
6
+    uint32_t numPartitions = 1 << (g_unitSizeDepth * 2);
7
     CHECKED_MALLOC(m_cuOffsetY, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
8
     CHECKED_MALLOC(m_cuOffsetC, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
9
     for (uint32_t cuRow = 0; cuRow < sps.numCuInHeight; cuRow++)
10
@@ -176,9 +176,7 @@
11
         for (int r = 0; r < height; r++)
12
         {
13
             for (int c = 0; c < width; c++)
14
-            {
15
                 yPixel[c] = (pixel)yChar[c];
16
-            }
17
 
18
             yPixel += m_stride;
19
             yChar += pic.stride[0] / sizeof(*yChar);
20
@@ -229,9 +227,7 @@
21
         for (int r = 0; r < height; r++)
22
         {
23
             for (int x = 0; x < padx; x++)
24
-            {
25
                 Y[width + x] = Y[width - 1];
26
-            }
27
 
28
             Y += m_stride;
29
         }
30
@@ -257,9 +253,7 @@
31
         pixel *V = m_picOrg[2] + ((height >> m_vChromaShift) - 1) * m_strideC;
32
 
33
         for (int i = 1; i <= pady; i++)
34
-        {
35
             memcpy(Y + i * m_stride, Y, (width + padx) * sizeof(pixel));
36
-        }
37
 
38
         for (int j = 1; j <= pady >> m_vChromaShift; j++)
39
         {
40
x265_1.5.tar.gz/source/common/pixel.cpp -> x265_1.6.tar.gz/source/common/pixel.cpp Changed
51
 
1
@@ -428,7 +428,7 @@
2
 void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
3
 {
4
     X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
5
-    X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
6
+    X265_CHECK((((intptr_t)src | (srcStride * sizeof(*src))) & 15) == 0 || size == 4, "src alignment error\n");
7
     X265_CHECK(shift >= 0, "invalid shift\n");
8
 
9
     for (int i = 0; i < size; i++)
10
@@ -445,7 +445,7 @@
11
 void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
12
 {
13
     X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
14
-    X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n");
15
+    X265_CHECK((((intptr_t)src | (srcStride * sizeof(*src))) & 15) == 0 || size == 4, "src alignment error\n");
16
     X265_CHECK(shift > 0, "invalid shift\n");
17
 
18
     int16_t round = 1 << (shift - 1);
19
@@ -462,7 +462,7 @@
20
 template<int size>
21
 void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
22
 {
23
-    X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
24
+    X265_CHECK((((intptr_t)dst | (dstStride * sizeof(*dst))) & 15) == 0 || size == 4, "dst alignment error\n");
25
     X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
26
     X265_CHECK(shift >= 0, "invalid shift\n");
27
 
28
@@ -479,7 +479,7 @@
29
 template<int size>
30
 void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
31
 {
32
-    X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n");
33
+    X265_CHECK((((intptr_t)dst | (dstStride * sizeof(*dst))) & 15) == 0 || size == 4, "dst alignment error\n");
34
     X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
35
     X265_CHECK(shift > 0, "invalid shift\n");
36
 
37
@@ -522,12 +522,10 @@
38
 
39
 #if CHECKED_BUILD || _DEBUG
40
     const int correction = (IF_INTERNAL_PREC - X265_DEPTH);
41
-#endif
42
-
43
     X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n");
44
     X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n");
45
     X265_CHECK((shift >= correction), "shift must be include factor correction, please update ASM ABI\n");
46
-    X265_CHECK(!(round & ((1 << correction) - 1)), "round must be include factor correction, please update ASM ABI\n");
47
+#endif
48
 
49
     for (y = 0; y <= height - 1; y++)
50
     {
51
x265_1.5.tar.gz/source/common/predict.cpp -> x265_1.6.tar.gz/source/common/predict.cpp Changed
201
 
1
@@ -34,11 +34,23 @@
2
 #pragma warning(disable: 4127) // conditional expression is constant
3
 #endif
4
 
5
+PredictionUnit::PredictionUnit(const CUData& cu, const CUGeom& cuGeom, int puIdx)
6
+{
7
+    /* address of CTU */
8
+    ctuAddr = cu.m_cuAddr;
9
+
10
+    /* offset of CU */
11
+    cuAbsPartIdx = cuGeom.absPartIdx;
12
+
13
+    /* offset and dimensions of PU */
14
+    cu.getPartIndexAndSize(puIdx, puAbsPartIdx, width, height);
15
+}
16
+
17
 namespace
18
 {
19
 inline pixel weightBidir(int w0, int16_t P0, int w1, int16_t P1, int round, int shift, int offset)
20
 {
21
-    return x265_clip((w0 * (P0 + IF_INTERNAL_OFFS) + w1 * (P1 + IF_INTERNAL_OFFS) + round + (offset << (shift - 1))) >> shift);
22
+    return x265_clip((w0 * (P0 + IF_INTERNAL_OFFS) + w1 * (P1 + IF_INTERNAL_OFFS) + round + (offset * (1 << (shift - 1)))) >> shift);
23
 }
24
 }
25
 
26
@@ -67,82 +79,24 @@
27
     return false;
28
 }
29
 
30
-void Predict::predIntraLumaAng(uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSize)
31
-{
32
-    int sizeIdx = log2TrSize - 2;
33
-    int tuSize = 1 << log2TrSize;
34
-    int filter = !!(g_intraFilterFlags[dirMode] & tuSize);
35
-    X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n");
36
-
37
-    bool bFilter = log2TrSize <= 4;
38
-    primitives.cu[sizeIdx].intra_pred[dirMode](dst, stride, intraNeighbourBuf[filter], dirMode, bFilter);
39
-}
40
-
41
-void Predict::predIntraChromaAng(uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSizeC, int chFmt)
42
-{
43
-    int tuSize = 1 << log2TrSizeC;
44
-    int tuSize2 = tuSize << 1;
45
-
46
-    pixel* srcBuf = intraNeighbourBuf[0];
47
-
48
-    if (chFmt == X265_CSP_I444 && (g_intraFilterFlags[dirMode] & tuSize))
49
-    {
50
-        pixel* fltBuf = intraNeighbourBuf[1];
51
-        pixel topLeft = srcBuf[0], topLast = srcBuf[tuSize2], leftLast = srcBuf[tuSize2 + tuSize2];
52
-
53
-        // filtering top
54
-        for (int i = 1; i < tuSize2; i++)
55
-            fltBuf[i] = ((srcBuf[i] << 1) + srcBuf[i - 1] + srcBuf[i + 1] + 2) >> 2;
56
-        fltBuf[tuSize2] = topLast;
57
-
58
-        // filtering top-left
59
-        fltBuf[0] = ((srcBuf[0] << 1) + srcBuf[1] + srcBuf[tuSize2 + 1] + 2) >> 2;
60
-
61
-        //filtering left
62
-        fltBuf[tuSize2 + 1] = ((srcBuf[tuSize2 + 1] << 1) + topLeft + srcBuf[tuSize2 + 2] + 2) >> 2;
63
-        for (int i = tuSize2 + 2; i < tuSize2 + tuSize2; i++)
64
-            fltBuf[i] = ((srcBuf[i] << 1) + srcBuf[i - 1] + srcBuf[i + 1] + 2) >> 2;
65
-        fltBuf[tuSize2 + tuSize2] = leftLast;
66
-
67
-        srcBuf = intraNeighbourBuf[1];
68
-    }
69
-
70
-    int sizeIdx = log2TrSizeC - 2;
71
-    X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n");
72
-    primitives.cu[sizeIdx].intra_pred[dirMode](dst, stride, srcBuf, dirMode, 0);
73
-}
74
-
75
-void Predict::initMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx)
76
+void Predict::motionCompensation(const CUData& cu, const PredictionUnit& pu, Yuv& predYuv, bool bLuma, bool bChroma)
77
 {
78
-    m_predSlice = cu.m_slice;
79
-    cu.getPartIndexAndSize(partIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
80
-    m_ctuAddr = cu.m_cuAddr;
81
-    m_cuAbsPartIdx = cuGeom.encodeIdx;
82
-}
83
-
84
-void Predict::prepMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx)
85
-{
86
-    initMotionCompensation(cu, cuGeom, partIdx);
87
-
88
-    m_refIdx0      = cu.m_refIdx[0][m_puAbsPartIdx];
89
-    m_clippedMv[0] = cu.m_mv[0][m_puAbsPartIdx];
90
-    m_refIdx1      = cu.m_refIdx[1][m_puAbsPartIdx];
91
-    m_clippedMv[1] = cu.m_mv[1][m_puAbsPartIdx];
92
-    cu.clipMv(m_clippedMv[0]);
93
-    cu.clipMv(m_clippedMv[1]);
94
-}
95
+    int refIdx0 = cu.m_refIdx[0][pu.puAbsPartIdx];
96
+    int refIdx1 = cu.m_refIdx[1][pu.puAbsPartIdx];
97
 
98
-void Predict::motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma)
99
-{
100
-    if (m_predSlice->isInterP())
101
+    if (cu.m_slice->isInterP())
102
     {
103
         /* P Slice */
104
         WeightValues wv0[3];
105
-        X265_CHECK(m_refIdx0 >= 0, "invalid P refidx\n");
106
-        X265_CHECK(m_refIdx0 < m_predSlice->m_numRefIdx[0], "P refidx out of range\n");
107
-        const WeightParam *wp0 = m_predSlice->m_weightPredTable[0][m_refIdx0];
108
 
109
-        if (m_predSlice->m_pps->bUseWeightPred && wp0->bPresentFlag)
110
+        X265_CHECK(refIdx0 >= 0, "invalid P refidx\n");
111
+        X265_CHECK(refIdx0 < cu.m_slice->m_numRefIdx[0], "P refidx out of range\n");
112
+        const WeightParam *wp0 = cu.m_slice->m_weightPredTable[0][refIdx0];
113
+
114
+        MV mv0 = cu.m_mv[0][pu.puAbsPartIdx];
115
+        cu.clipMv(mv0);
116
+
117
+        if (cu.m_slice->m_pps->bUseWeightPred && wp0->bPresentFlag)
118
         {
119
             for (int plane = 0; plane < 3; plane++)
120
             {
121
@@ -155,18 +109,18 @@
122
             ShortYuv& shortYuv = m_predShortYuv[0];
123
 
124
             if (bLuma)
125
-                predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
126
+                predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
127
             if (bChroma)
128
-                predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
129
+                predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
130
 
131
-            addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma);
132
+            addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
133
         }
134
         else
135
         {
136
             if (bLuma)
137
-                predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
138
+                predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
139
             if (bChroma)
140
-                predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
141
+                predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
142
         }
143
     }
144
     else
145
@@ -176,10 +130,13 @@
146
         WeightValues wv0[3], wv1[3];
147
         const WeightParam *pwp0, *pwp1;
148
 
149
-        if (m_predSlice->m_pps->bUseWeightedBiPred)
150
+        X265_CHECK(refIdx0 < cu.m_slice->m_numRefIdx[0], "bidir refidx0 out of range\n");
151
+        X265_CHECK(refIdx1 < cu.m_slice->m_numRefIdx[1], "bidir refidx1 out of range\n");
152
+
153
+        if (cu.m_slice->m_pps->bUseWeightedBiPred)
154
         {
155
-            pwp0 = m_refIdx0 >= 0 ? m_predSlice->m_weightPredTable[0][m_refIdx0] : NULL;
156
-            pwp1 = m_refIdx1 >= 0 ? m_predSlice->m_weightPredTable[1][m_refIdx1] : NULL;
157
+            pwp0 = refIdx0 >= 0 ? cu.m_slice->m_weightPredTable[0][refIdx0] : NULL;
158
+            pwp1 = refIdx1 >= 0 ? cu.m_slice->m_weightPredTable[1][refIdx1] : NULL;
159
 
160
             if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
161
             {
162
@@ -200,7 +157,7 @@
163
             else
164
             {
165
                 /* uniprediction weighting, always outputs to wv0 */
166
-                const WeightParam* pwp = (m_refIdx0 >= 0) ? pwp0 : pwp1;
167
+                const WeightParam* pwp = (refIdx0 >= 0) ? pwp0 : pwp1;
168
                 for (int plane = 0; plane < 3; plane++)
169
                 {
170
                     wv0[plane].w = pwp[plane].inputWeight;
171
@@ -213,89 +170,92 @@
172
         else
173
             pwp0 = pwp1 = NULL;
174
 
175
-        if (m_refIdx0 >= 0 && m_refIdx1 >= 0)
176
+        if (refIdx0 >= 0 && refIdx1 >= 0)
177
         {
178
-            /* Biprediction */
179
-            X265_CHECK(m_refIdx0 < m_predSlice->m_numRefIdx[0], "bidir refidx0 out of range\n");
180
-            X265_CHECK(m_refIdx1 < m_predSlice->m_numRefIdx[1], "bidir refidx1 out of range\n");
181
+            MV mv0 = cu.m_mv[0][pu.puAbsPartIdx];
182
+            MV mv1 = cu.m_mv[1][pu.puAbsPartIdx];
183
+            cu.clipMv(mv0);
184
+            cu.clipMv(mv1);
185
 
186
             if (bLuma)
187
             {
188
-                predInterLumaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
189
-                predInterLumaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
190
+                predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
191
+                predInterLumaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
192
             }
193
             if (bChroma)
194
             {
195
-                predInterChromaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
196
-                predInterChromaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
197
+                predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
198
+                predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
199
             }
200
 
201
x265_1.5.tar.gz/source/common/predict.h -> x265_1.6.tar.gz/source/common/predict.h Changed
102
 
1
@@ -36,6 +36,17 @@
2
 class Slice;
3
 struct CUGeom;
4
 
5
+struct PredictionUnit
6
+{
7
+    uint32_t     ctuAddr;      // raster index of current CTU within its picture
8
+    uint32_t     cuAbsPartIdx; // z-order offset of current CU within its CTU
9
+    uint32_t     puAbsPartIdx; // z-order offset of current PU with its CU
10
+    int          width;
11
+    int          height;
12
+
13
+    PredictionUnit(const CUData& cu, const CUGeom& cuGeom, int puIdx);
14
+};
15
+
16
 class Predict
17
 {
18
 public:
19
@@ -56,7 +67,7 @@
20
         int      leftUnits;
21
         int      unitWidth;
22
         int      unitHeight;
23
-        int      tuSize;
24
+        int      log2TrSize;
25
         bool     bNeighborFlags[4 * MAX_NUM_SPU_W + 1];
26
     };
27
 
28
@@ -65,38 +76,34 @@
29
 
30
     // Unfiltered/filtered neighbours of the current partition.
31
     pixel     intraNeighbourBuf[2][258];
32
+
33
     /* Slice information */
34
-    const Slice* m_predSlice;
35
     int       m_csp;
36
     int       m_hChromaShift;
37
     int       m_vChromaShift;
38
 
39
-    /* cached CU information for prediction */
40
-    uint32_t  m_ctuAddr;      // raster index of current CTU within its picture
41
-    uint32_t  m_cuAbsPartIdx; // z-order index of current CU within its CTU
42
-    uint32_t  m_puAbsPartIdx; // z-order index of current PU with its CU
43
-    int       m_puWidth;
44
-    int       m_puHeight;
45
-    int       m_refIdx0;
46
-    int       m_refIdx1;
47
-
48
-    /* TODO: Need to investigate clipping while writing into the TComDataCU fields itself */
49
-    MV        m_clippedMv[2];
50
-
51
     Predict();
52
     ~Predict();
53
 
54
     bool allocBuffers(int csp);
55
 
56
     // motion compensation functions
57
-    void predInterLumaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const;
58
-    void predInterChromaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const;
59
+    void predInterLumaPixel(const PredictionUnit& pu, Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const;
60
+    void predInterChromaPixel(const PredictionUnit& pu, Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const;
61
 
62
-    void predInterLumaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const;
63
-    void predInterChromaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const;
64
+    void predInterLumaShort(const PredictionUnit& pu, ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const;
65
+    void predInterChromaShort(const PredictionUnit& pu, ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const;
66
 
67
-    void addWeightBi(Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, const WeightValues wp0[3], const WeightValues wp1[3], bool bLuma, bool bChroma) const;
68
-    void addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const;
69
+    void addWeightBi(const PredictionUnit& pu, Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, const WeightValues wp0[3], const WeightValues wp1[3], bool bLuma, bool bChroma) const;
70
+    void addWeightUni(const PredictionUnit& pu, Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const;
71
+
72
+    void motionCompensation(const CUData& cu, const PredictionUnit& pu, Yuv& predYuv, bool bLuma, bool bChroma);
73
+
74
+    /* Angular Intra */
75
+    void predIntraLumaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSize);
76
+    void predIntraChromaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSizeC);
77
+    void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, int dirMode);
78
+    void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId);
79
 
80
     /* Intra prediction helper functions */
81
     static void initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *IntraNeighbors);
82
@@ -111,19 +118,6 @@
83
     static int  isAboveRightAvailable(const CUData& cu, uint32_t partIdxRT, bool* bValidFlags, uint32_t numUnits);
84
     template<bool cip>
85
     static int  isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLB, bool* bValidFlags, uint32_t numUnits);
86
-
87
-public:
88
-
89
-    /* prepMotionCompensation needs to be called to prepare MC with CU-relevant data */
90
-    void initMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx);
91
-    void prepMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx);
92
-    void motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma);
93
-
94
-    /* Angular Intra */
95
-    void predIntraLumaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSize);
96
-    void predIntraChromaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSizeC, int chFmt);
97
-    void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode);
98
-    void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId);
99
 };
100
 }
101
 
102
x265_1.5.tar.gz/source/common/primitives.cpp -> x265_1.6.tar.gz/source/common/primitives.cpp Changed
26
 
1
@@ -98,6 +98,7 @@
2
         p.chroma[X265_CSP_I444].pu[i].copy_pp = p.pu[i].copy_pp;
3
         p.chroma[X265_CSP_I444].pu[i].addAvg  = p.pu[i].addAvg;
4
         p.chroma[X265_CSP_I444].pu[i].satd    = p.pu[i].satd;
5
+        p.chroma[X265_CSP_I444].pu[i].chroma_p2s = p.pu[i].filter_p2s;
6
     }
7
 
8
     for (int i = 0; i < NUM_CU_SIZES; i++)
9
@@ -190,7 +191,6 @@
10
 
11
 /* cpuid >= 0 - force CPU type
12
  * cpuid < 0  - auto-detect if uninitialized */
13
-extern "C"
14
 void x265_setup_primitives(x265_param *param, int cpuid)
15
 {
16
     if (cpuid < 0)
17
@@ -257,7 +257,7 @@
18
 extern "C" {
19
 int x265_cpu_cpuid_test(void) { return 0; }
20
 void x265_cpu_emms(void) {}
21
-void x265_cpu_cpuid(uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *) {}
22
+void x265_cpu_cpuid(uint32_t, uint32_t *eax, uint32_t *, uint32_t *, uint32_t *) { *eax = 0; }
23
 void x265_cpu_xgetbv(uint32_t, uint32_t *, uint32_t *) {}
24
 }
25
 #endif
26
x265_1.5.tar.gz/source/common/primitives.h -> x265_1.6.tar.gz/source/common/primitives.h Changed
102
 
1
@@ -119,6 +119,7 @@
2
 
3
 typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, const pixel *srcPix, int dirMode, int bFilter);
4
 typedef void (*intra_allangs_t)(pixel *dst, pixel *refPix, pixel *filtPix, int bLuma);
5
+typedef void (*intra_filter_t)(const pixel* references, pixel* filtered);
6
 
7
 typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
8
 typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
9
@@ -136,8 +137,7 @@
10
 typedef uint32_t (*nquant_t)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
11
 typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift);
12
 typedef void (*dequant_normal_t)(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
13
-typedef int  (*count_nonzero_t)(const int16_t* quantCoeff, int numCoeff);
14
-
15
+typedef int(*count_nonzero_t)(const int16_t* quantCoeff);
16
 typedef void (*weightp_pp_t)(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
17
 typedef void (*weightp_sp_t)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
18
 typedef void (*scale_t)(pixel* dst, const pixel* src, intptr_t stride);
19
@@ -155,7 +155,8 @@
20
 typedef void (*filter_sp_t) (const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
21
 typedef void (*filter_ss_t) (const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
22
 typedef void (*filter_hv_pp_t) (const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY);
23
-typedef void (*filter_p2s_t)(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
24
+typedef void (*filter_p2s_wxh_t)(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
25
+typedef void (*filter_p2s_t)(const pixel* src, intptr_t srcStride, int16_t* dst);
26
 
27
 typedef void (*copy_pp_t)(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); // dst is aligned
28
 typedef void (*copy_sp_t)(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
29
@@ -178,6 +179,8 @@
30
 
31
 typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
32
 
33
+typedef int (*findPosLast_t)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig);
34
+
35
 /* Function pointers to optimized encoder primitives. Each pointer can reference
36
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
37
 struct EncoderPrimitives
38
@@ -207,6 +210,7 @@
39
         addAvg_t       addAvg;      // bidir motion compensation, uses 16bit values
40
 
41
         copy_pp_t      copy_pp;
42
+        filter_p2s_t   filter_p2s;
43
     }
44
     pu[NUM_PU_SIZES];
45
 
46
@@ -225,7 +229,7 @@
47
         pixel_add_ps_t  add_ps;
48
         blockfill_s_t   blockfill_s;   // block fill, for DC transforms
49
         copy_cnt_t      copy_cnt;      // copy coeff while counting non-zero
50
-
51
+        count_nonzero_t count_nonzero;
52
         cpy2Dto1D_shl_t cpy2Dto1D_shl;
53
         cpy2Dto1D_shr_t cpy2Dto1D_shr;
54
         cpy1Dto2D_shl_t cpy1Dto2D_shl;
55
@@ -246,6 +250,7 @@
56
 
57
         transpose_t     transpose;     // transpose pixel block; for use with intra all-angs
58
         intra_allangs_t intra_pred_allangs;
59
+        intra_filter_t  intra_filter;
60
         intra_pred_t    intra_pred[NUM_INTRA_MODE];
61
     }
62
     cu[NUM_CU_SIZES];
63
@@ -260,9 +265,7 @@
64
     nquant_t              nquant;
65
     dequant_scaling_t     dequant_scaling;
66
     dequant_normal_t      dequant_normal;
67
-    count_nonzero_t       count_nonzero;
68
     denoiseDct_t          denoiseDct;
69
-
70
     scale_t               scale1D_128to64;
71
     scale_t               scale2D_64to32;
72
 
73
@@ -286,7 +289,9 @@
74
     weightp_sp_t          weight_sp;
75
     weightp_pp_t          weight_pp;
76
 
77
-    filter_p2s_t          luma_p2s;
78
+    filter_p2s_wxh_t      luma_p2s;
79
+
80
+    findPosLast_t         findPosLast;
81
 
82
     /* There is one set of chroma primitives per color space. An encoder will
83
      * have just a single color space and thus it will only ever use one entry
84
@@ -311,6 +316,8 @@
85
             filter_hps_t filter_hps;
86
             addAvg_t     addAvg;
87
             copy_pp_t    copy_pp;
88
+            filter_p2s_t chroma_p2s;
89
+
90
         }
91
         pu[NUM_PU_SIZES];
92
 
93
@@ -329,7 +336,7 @@
94
         }
95
         cu[NUM_CU_SIZES];
96
 
97
-        filter_p2s_t p2s; // takes width/height as arguments
98
+        filter_p2s_wxh_t p2s; // takes width/height as arguments
99
     }
100
     chroma[X265_CSP_COUNT];
101
 };
102
x265_1.5.tar.gz/source/common/quant.cpp -> x265_1.6.tar.gz/source/common/quant.cpp Changed
193
 
1
@@ -50,7 +50,7 @@
2
     return y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // min(x, y)
3
 }
4
 
5
-inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
6
+inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, const uint32_t absGoRice, const uint32_t maxVlc, uint32_t c1c2Idx)
7
 {
8
     X265_CHECK(c1c2Idx <= 3, "c1c2Idx check failure\n");
9
     X265_CHECK(absGoRice <= 4, "absGoRice check failure\n");
10
@@ -72,7 +72,6 @@
11
     else
12
     {
13
         uint32_t symbol = diffLevel;
14
-        const uint32_t maxVlc = g_goRiceRange[absGoRice];
15
         bool expGolomb = (symbol > maxVlc);
16
 
17
         if (expGolomb)
18
@@ -105,6 +104,41 @@
19
     return rate;
20
 }
21
 
22
+#if CHECKED_BUILD || _DEBUG
23
+inline int getICRateNegDiff(uint32_t absLevel, const int* greaterOneBits, const int* levelAbsBits)
24
+{
25
+    X265_CHECK(absLevel <= 2, "absLevel check failure\n");
26
+
27
+    int rate;
28
+    if (absLevel == 0)
29
+        rate = 0;
30
+    else if (absLevel == 2)
31
+        rate = greaterOneBits[1] + levelAbsBits[0];
32
+    else
33
+        rate = greaterOneBits[0];
34
+    return rate;
35
+}
36
+#endif
37
+
38
+inline int getICRateLessVlc(uint32_t absLevel, int32_t diffLevel, const uint32_t absGoRice)
39
+{
40
+    X265_CHECK(absGoRice <= 4, "absGoRice check failure\n");
41
+    if (!absLevel)
42
+    {
43
+        X265_CHECK(diffLevel < 0, "diffLevel check failure\n");
44
+        return 0;
45
+    }
46
+    int rate;
47
+
48
+    uint32_t symbol = diffLevel;
49
+    uint32_t prefLen = (symbol >> absGoRice) + 1;
50
+    uint32_t numBins = fastMin(prefLen + absGoRice, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */);
51
+
52
+    rate = numBins << 15;
53
+
54
+    return rate;
55
+}
56
+
57
 /* Calculates the cost for specific absolute transform level */
58
 inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
59
 {
60
@@ -160,12 +194,12 @@
61
     m_nr           = NULL;
62
 }
63
 
64
-bool Quant::init(bool useRDOQ, double psyScale, const ScalingList& scalingList, Entropy& entropy)
65
+bool Quant::init(int rdoqLevel, double psyScale, const ScalingList& scalingList, Entropy& entropy)
66
 {
67
     m_entropyCoder = &entropy;
68
-    m_useRDOQ = useRDOQ;
69
+    m_rdoqLevel    = rdoqLevel;
70
     m_psyRdoqScale = (int64_t)(psyScale * 256.0);
71
-    m_scalingList = &scalingList;
72
+    m_scalingList  = &scalingList;
73
     m_resiDctCoeff = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE * 2);
74
     m_fencDctCoeff = m_resiDctCoeff + (MAX_TR_SIZE * MAX_TR_SIZE);
75
     m_fencShortBuf = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE);
76
@@ -382,13 +416,13 @@
77
         }
78
     }
79
 
80
-    if (m_useRDOQ)
81
+    if (m_rdoqLevel)
82
         return rdoQuant(cu, coeff, log2TrSize, ttype, absPartIdx, usePsy);
83
     else
84
     {
85
         int deltaU[32 * 32];
86
 
87
-        int scalingListType = ttype + (isLuma ? 3 : 0);
88
+        int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
89
         int rem = m_qpParam[ttype].rem;
90
         int per = m_qpParam[ttype].per;
91
         const int32_t* quantCoeff = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
92
@@ -454,9 +488,7 @@
93
     else
94
     {
95
         int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra;
96
-
97
-        X265_CHECK((int)numSig == primitives.count_nonzero(coeff, 1 << (log2TrSize * 2)), "numSig differ\n");
98
-
99
+        X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(coeff), "numSig differ\n");
100
         // DC only
101
         if (numSig == 1 && coeff[0] != 0 && !useDST)
102
         {
103
@@ -493,13 +525,10 @@
104
     const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
105
 
106
     int numCoeff = 1 << (log2TrSize * 2);
107
-
108
     uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff);
109
-
110
-    X265_CHECK((int)numSig == primitives.count_nonzero(dstCoeff, 1 << (log2TrSize * 2)), "numSig differ\n");
111
+    X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n");
112
     if (!numSig)
113
         return 0;
114
-
115
     uint32_t trSize = 1 << log2TrSize;
116
     int64_t lambda2 = m_qpParam[ttype].lambda2;
117
     int64_t psyScale = (m_psyRdoqScale * m_qpParam[ttype].lambda);
118
@@ -674,9 +703,43 @@
119
                 /* record costs for sign-hiding performed at the end */
120
                 if (level)
121
                 {
122
-                    int rateNow = getICRate(level, level - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx);
123
-                    rateIncUp[blkPos] = getICRate(level + 1, level + 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx) - rateNow;
124
-                    rateIncDown[blkPos] = getICRate(level - 1, level - 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx) - rateNow;
125
+                    const int32_t diff0 = level - 1 - baseLevel;
126
+                    const int32_t diff2 = level + 1 - baseLevel;
127
+                    const int32_t maxVlc = g_goRiceRange[goRiceParam];
128
+                    int rate0, rate1, rate2;
129
+
130
+                    if (diff0 < -2)  // prob (92.9, 86.5, 74.5)%
131
+                    {
132
+                        // NOTE: Min: L - 1 - {1,2,1,3} < -2 ==> L < {0,1,0,2}
133
+                        //            additional L > 0, so I got (L > 0 && L < 2) ==> L = 1
134
+                        X265_CHECK(level == 1, "absLevel check failure\n");
135
+
136
+                        const int rateEqual2 = greaterOneBits[1] + levelAbsBits[0];;
137
+                        const int rateNotEqual2 = greaterOneBits[0];
138
+
139
+                        rate0 = 0;
140
+                        rate2 = rateEqual2;
141
+                        rate1 = rateNotEqual2;
142
+
143
+                        X265_CHECK(rate1 == getICRateNegDiff(level + 0, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
144
+                        X265_CHECK(rate2 == getICRateNegDiff(level + 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
145
+                        X265_CHECK(rate0 == getICRateNegDiff(level - 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n");
146
+                    }
147
+                    else if (diff0 >= 0 && diff2 <= maxVlc)     // prob except from above path (98.6, 97.9, 96.9)%
148
+                    {
149
+                        // NOTE: no c1c2 correct rate since all of rate include this factor
150
+                        rate1 = getICRateLessVlc(level + 0, diff0 + 1, goRiceParam);
151
+                        rate2 = getICRateLessVlc(level + 1, diff0 + 2, goRiceParam);
152
+                        rate0 = getICRateLessVlc(level - 1, diff0 + 0, goRiceParam);
153
+                    }
154
+                    else
155
+                    {
156
+                        rate1 = getICRate(level + 0, diff0 + 1, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Idx);
157
+                        rate2 = getICRate(level + 1, diff0 + 2, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Idx);
158
+                        rate0 = getICRate(level - 1, diff0 + 0, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Idx);
159
+                    }
160
+                    rateIncUp[blkPos] = rate2 - rate1;
161
+                    rateIncDown[blkPos] = rate0 - rate1;
162
                 }
163
                 else
164
                 {
165
@@ -762,7 +825,7 @@
166
             costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][1]);
167
             totalRdCost += costCoeffGroupSig[cgScanPos];  /* add the cost of 1 bit in significant CG bitmap */
168
 
169
-            if (costZeroCG < totalRdCost)
170
+            if (costZeroCG < totalRdCost && m_rdoqLevel > 1)
171
             {
172
                 sigCoeffGroupFlag64 &= ~cgBlkPosMask;
173
                 totalRdCost = costZeroCG;
174
@@ -870,7 +933,7 @@
175
                     bestLastIdx = scanPos + 1;
176
                     bestCost = costAsLast;
177
                 }
178
-                if (dstCoeff[blkPos] > 1)
179
+                if (dstCoeff[blkPos] > 1 || m_rdoqLevel == 1)
180
                 {
181
                     foundLast = true;
182
                     break;
183
@@ -1037,7 +1100,8 @@
184
 
185
     const uint32_t trSizeCG = 1 << log2TrSizeCG;
186
     X265_CHECK(trSizeCG <= 8, "transform CG is too large\n");
187
-    const uint32_t sigPos = (uint32_t)(sigCoeffGroupFlag64 >> (1 + (cgPosY << log2TrSizeCG) + cgPosX));
188
+    const uint32_t shift = (cgPosY << log2TrSizeCG) + cgPosX + 1;
189
+    const uint32_t sigPos = (uint32_t)(shift >= 64 ? 0 : sigCoeffGroupFlag64 >> shift);
190
     const uint32_t sigRight = ((int32_t)(cgPosX - (trSizeCG - 1)) >> 31) & (sigPos & 1);
191
     const uint32_t sigLower = ((int32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 2)) & 2;
192
 
193
x265_1.5.tar.gz/source/common/quant.h -> x265_1.6.tar.gz/source/common/quant.h Changed
19
 
1
@@ -81,7 +81,7 @@
2
 
3
     QpParam            m_qpParam[3];
4
 
5
-    bool               m_useRDOQ;
6
+    int                m_rdoqLevel;
7
     int64_t            m_psyRdoqScale;
8
     int16_t*           m_resiDctCoeff;
9
     int16_t*           m_fencDctCoeff;
10
@@ -99,7 +99,7 @@
11
     ~Quant();
12
 
13
     /* one-time setup */
14
-    bool init(bool useRDOQ, double psyScale, const ScalingList& scalingList, Entropy& entropy);
15
+    bool init(int rdoqLevel, double psyScale, const ScalingList& scalingList, Entropy& entropy);
16
     bool allocNoiseReduction(const x265_param& param);
17
 
18
     /* CU setup */
19
x265_1.5.tar.gz/source/common/scalinglist.cpp -> x265_1.6.tar.gz/source/common/scalinglist.cpp Changed
10
 
1
@@ -222,7 +222,7 @@
2
 
3
 void ScalingList::processDefaultMarix(int sizeId, int listId)
4
 {
5
-    ::memcpy(m_scalingListCoef[sizeId][listId], getScalingListDefaultAddress(sizeId, listId), sizeof(int) * X265_MIN(MAX_MATRIX_COEF_NUM, s_numCoefPerSize[sizeId]));
6
+    memcpy(m_scalingListCoef[sizeId][listId], getScalingListDefaultAddress(sizeId, listId), sizeof(int) * X265_MIN(MAX_MATRIX_COEF_NUM, s_numCoefPerSize[sizeId]));
7
     m_scalingListDC[sizeId][listId] = SCALING_LIST_DC;
8
 }
9
 
10
x265_1.5.tar.gz/source/common/shortyuv.cpp -> x265_1.6.tar.gz/source/common/shortyuv.cpp Changed
14
 
1
@@ -66,9 +66,9 @@
2
 
3
 void ShortYuv::clear()
4
 {
5
-    ::memset(m_buf[0], 0, (m_size  * m_size) *  sizeof(int16_t));
6
-    ::memset(m_buf[1], 0, (m_csize * m_csize) * sizeof(int16_t));
7
-    ::memset(m_buf[2], 0, (m_csize * m_csize) * sizeof(int16_t));
8
+    memset(m_buf[0], 0, (m_size  * m_size) *  sizeof(int16_t));
9
+    memset(m_buf[1], 0, (m_csize * m_csize) * sizeof(int16_t));
10
+    memset(m_buf[2], 0, (m_csize * m_csize) * sizeof(int16_t));
11
 }
12
 
13
 void ShortYuv::subtract(const Yuv& srcYuv0, const Yuv& srcYuv1, uint32_t log2Size)
14
x265_1.5.tar.gz/source/common/slice.cpp -> x265_1.6.tar.gz/source/common/slice.cpp Changed
46
 
1
@@ -33,7 +33,7 @@
2
 {
3
     if (m_sliceType == I_SLICE)
4
     {
5
-        ::memset(m_refPicList, 0, sizeof(m_refPicList));
6
+        memset(m_refPicList, 0, sizeof(m_refPicList));
7
         m_numRefIdx[1] = m_numRefIdx[0] = 0;
8
         return;
9
     }
10
@@ -112,7 +112,7 @@
11
     if (m_sliceType != B_SLICE)
12
     {
13
         m_numRefIdx[1] = 0;
14
-        ::memset(m_refPicList[1], 0, sizeof(m_refPicList[1]));
15
+        memset(m_refPicList[1], 0, sizeof(m_refPicList[1]));
16
     }
17
     else
18
     {
19
@@ -183,8 +183,8 @@
20
 uint32_t Slice::realEndAddress(uint32_t endCUAddr) const
21
 {
22
     // Calculate end address
23
-    uint32_t internalAddress = (endCUAddr - 1) % NUM_CU_PARTITIONS;
24
-    uint32_t externalAddress = (endCUAddr - 1) / NUM_CU_PARTITIONS;
25
+    uint32_t internalAddress = (endCUAddr - 1) % NUM_4x4_PARTITIONS;
26
+    uint32_t externalAddress = (endCUAddr - 1) / NUM_4x4_PARTITIONS;
27
     uint32_t xmax = m_sps->picWidthInLumaSamples - (externalAddress % m_sps->numCuInWidth) * g_maxCUSize;
28
     uint32_t ymax = m_sps->picHeightInLumaSamples - (externalAddress / m_sps->numCuInWidth) * g_maxCUSize;
29
 
30
@@ -192,13 +192,13 @@
31
         internalAddress--;
32
 
33
     internalAddress++;
34
-    if (internalAddress == NUM_CU_PARTITIONS)
35
+    if (internalAddress == NUM_4x4_PARTITIONS)
36
     {
37
         internalAddress = 0;
38
         externalAddress++;
39
     }
40
 
41
-    return externalAddress * NUM_CU_PARTITIONS + internalAddress;
42
+    return externalAddress * NUM_4x4_PARTITIONS + internalAddress;
43
 }
44
 
45
 
46
x265_1.5.tar.gz/source/common/slice.h -> x265_1.6.tar.gz/source/common/slice.h Changed
52
 
1
@@ -55,9 +55,9 @@
2
         , numberOfNegativePictures(0)
3
         , numberOfPositivePictures(0)
4
     {
5
-        ::memset(deltaPOC, 0, sizeof(deltaPOC));
6
-        ::memset(poc, 0, sizeof(poc));
7
-        ::memset(bUsed, 0, sizeof(bUsed));
8
+        memset(deltaPOC, 0, sizeof(deltaPOC));
9
+        memset(poc, 0, sizeof(poc));
10
+        memset(bUsed, 0, sizeof(bUsed));
11
     }
12
 
13
     void sortDeltaPOC();
14
@@ -149,8 +149,10 @@
15
 
16
 struct VPS
17
 {
18
+    uint32_t         maxTempSubLayers;
19
     uint32_t         numReorderPics;
20
     uint32_t         maxDecPicBuffering;
21
+    uint32_t         maxLatencyIncrease;
22
     HRDInfo          hrdParameters;
23
     ProfileTierLevel ptl;
24
 };
25
@@ -228,9 +230,10 @@
26
     bool     bUseAMP; // use param
27
     uint32_t maxAMPDepth;
28
 
29
+    uint32_t maxTempSubLayers;   // max number of Temporal Sub layers
30
     uint32_t maxDecPicBuffering; // these are dups of VPS values
31
+    uint32_t maxLatencyIncrease;
32
     int      numReorderPics;
33
-    int      maxLatencyIncrease;
34
 
35
     bool     bUseStrongIntraSmoothing; // use param
36
     bool     bTemporalMVPEnabled;
37
@@ -285,6 +288,14 @@
38
     }
39
 };
40
 
41
+#define SET_WEIGHT(w, b, s, d, o) \
42
+    { \
43
+        (w).inputWeight = (s); \
44
+        (w).log2WeightDenom = (d); \
45
+        (w).inputOffset = (o); \
46
+        (w).bPresentFlag = (b); \
47
+    }
48
+
49
 class Slice
50
 {
51
 public:
52
x265_1.5.tar.gz/source/common/threading.cpp -> x265_1.6.tar.gz/source/common/threading.cpp Changed
15
 
1
@@ -26,6 +26,13 @@
2
 namespace x265 {
3
 // x265 private namespace
4
 
5
+#if X265_ARCH_X86 && !defined(X86_64) && ENABLE_ASSEMBLY && defined(__GNUC__)
6
+extern "C" intptr_t x265_stack_align(void (*func)(), ...);
7
+#define x265_stack_align(func, ...) x265_stack_align((void (*)())func, __VA_ARGS__)
8
+#else
9
+#define x265_stack_align(func, ...) func(__VA_ARGS__)
10
+#endif
11
+
12
 /* C shim for forced stack alignment */
13
 static void stackAlignMain(Thread *instance)
14
 {
15
x265_1.5.tar.gz/source/common/threading.h -> x265_1.6.tar.gz/source/common/threading.h Changed
119
 
1
@@ -42,32 +42,32 @@
2
 #include <sys/sysctl.h>
3
 #endif
4
 
5
-#ifdef __GNUC__                         /* GCCs builtin atomics */
6
+#ifdef __GNUC__               /* GCCs builtin atomics */
7
 
8
 #include <sys/time.h>
9
 #include <unistd.h>
10
 
11
-#define CLZ(id, x)                          id = (unsigned long)__builtin_clz(x) ^ 31
12
-#define CTZ(id, x)                          id = (unsigned long)__builtin_ctz(x)
13
-#define ATOMIC_OR(ptr, mask)                __sync_fetch_and_or(ptr, mask)
14
-#define ATOMIC_AND(ptr, mask)               __sync_fetch_and_and(ptr, mask)
15
-#define ATOMIC_INC(ptr)                     __sync_add_and_fetch((volatile int32_t*)ptr, 1)
16
-#define ATOMIC_DEC(ptr)                     __sync_add_and_fetch((volatile int32_t*)ptr, -1)
17
-#define ATOMIC_ADD(ptr, value)              __sync_add_and_fetch((volatile int32_t*)ptr, value)
18
-#define GIVE_UP_TIME()                      usleep(0)
19
+#define CLZ(id, x)            id = (unsigned long)__builtin_clz(x) ^ 31
20
+#define CTZ(id, x)            id = (unsigned long)__builtin_ctz(x)
21
+#define ATOMIC_OR(ptr, mask)  __sync_fetch_and_or(ptr, mask)
22
+#define ATOMIC_AND(ptr, mask) __sync_fetch_and_and(ptr, mask)
23
+#define ATOMIC_INC(ptr)       __sync_add_and_fetch((volatile int32_t*)ptr, 1)
24
+#define ATOMIC_DEC(ptr)       __sync_add_and_fetch((volatile int32_t*)ptr, -1)
25
+#define ATOMIC_ADD(ptr, val)  __sync_fetch_and_add((volatile int32_t*)ptr, val)
26
+#define GIVE_UP_TIME()        usleep(0)
27
 
28
-#elif defined(_MSC_VER)                 /* Windows atomic intrinsics */
29
+#elif defined(_MSC_VER)       /* Windows atomic intrinsics */
30
 
31
 #include <intrin.h>
32
 
33
-#define CLZ(id, x)                          _BitScanReverse(&id, x)
34
-#define CTZ(id, x)                          _BitScanForward(&id, x)
35
-#define ATOMIC_INC(ptr)                     InterlockedIncrement((volatile LONG*)ptr)
36
-#define ATOMIC_DEC(ptr)                     InterlockedDecrement((volatile LONG*)ptr)
37
-#define ATOMIC_ADD(ptr, value)              InterlockedExchangeAdd((volatile LONG*)ptr, value)
38
-#define ATOMIC_OR(ptr, mask)                _InterlockedOr((volatile LONG*)ptr, (LONG)mask)
39
-#define ATOMIC_AND(ptr, mask)               _InterlockedAnd((volatile LONG*)ptr, (LONG)mask)
40
-#define GIVE_UP_TIME()                      Sleep(0)
41
+#define CLZ(id, x)            _BitScanReverse(&id, x)
42
+#define CTZ(id, x)            _BitScanForward(&id, x)
43
+#define ATOMIC_INC(ptr)       InterlockedIncrement((volatile LONG*)ptr)
44
+#define ATOMIC_DEC(ptr)       InterlockedDecrement((volatile LONG*)ptr)
45
+#define ATOMIC_ADD(ptr, val)  InterlockedExchangeAdd((volatile LONG*)ptr, val)
46
+#define ATOMIC_OR(ptr, mask)  _InterlockedOr((volatile LONG*)ptr, (LONG)mask)
47
+#define ATOMIC_AND(ptr, mask) _InterlockedAnd((volatile LONG*)ptr, (LONG)mask)
48
+#define GIVE_UP_TIME()        Sleep(0)
49
 
50
 #endif // ifdef __GNUC__
51
 
52
@@ -128,8 +128,8 @@
53
 
54
     bool timedWait(uint32_t milliseconds)
55
     {
56
-        /* returns true if event was signaled */
57
-        return WaitForSingleObject(this->handle, milliseconds) == WAIT_OBJECT_0;
58
+        /* returns true if the wait timed out */
59
+        return WaitForSingleObject(this->handle, milliseconds) == WAIT_TIMEOUT;
60
     }
61
 
62
     void trigger()
63
@@ -263,10 +263,8 @@
64
 
65
         /* blocking wait on conditional variable, mutex is atomically released
66
          * while blocked. When condition is signaled, mutex is re-acquired */
67
-        while (m_counter == 0)
68
-        {
69
+        while (!m_counter)
70
             pthread_cond_wait(&m_cond, &m_mutex);
71
-        }
72
 
73
         m_counter--;
74
         pthread_mutex_unlock(&m_mutex);
75
@@ -277,7 +275,7 @@
76
         bool bTimedOut = false;
77
 
78
         pthread_mutex_lock(&m_mutex);
79
-        if (m_counter == 0)
80
+        if (!m_counter)
81
         {
82
             struct timeval tv;
83
             struct timespec ts;
84
@@ -297,7 +295,10 @@
85
             bTimedOut = pthread_cond_timedwait(&m_cond, &m_mutex, &ts) == ETIMEDOUT;
86
         }
87
         if (m_counter > 0)
88
+        {
89
             m_counter--;
90
+            bTimedOut = false;
91
+        }
92
         pthread_mutex_unlock(&m_mutex);
93
         return bTimedOut;
94
     }
95
@@ -408,6 +409,23 @@
96
     Lock &inst;
97
 };
98
 
99
+// Utility class which adds elapsed time of the scope of the object into the
100
+// accumulator provided to the constructor
101
+struct ScopedElapsedTime
102
+{
103
+    ScopedElapsedTime(int64_t& accum) : accumlatedTime(accum) { startTime = x265_mdate(); }
104
+
105
+    ~ScopedElapsedTime() { accumlatedTime += x265_mdate() - startTime; }
106
+
107
+protected:
108
+
109
+    int64_t  startTime;
110
+    int64_t& accumlatedTime;
111
+
112
+    // do not allow assignments
113
+    ScopedElapsedTime &operator =(const ScopedElapsedTime &);
114
+};
115
+
116
 //< Simplistic portable thread class.  Shutdown signalling left to derived class
117
 class Thread
118
 {
119
x265_1.5.tar.gz/source/common/threadpool.cpp -> x265_1.6.tar.gz/source/common/threadpool.cpp Changed
201
 
1
@@ -27,115 +27,65 @@
2
 
3
 #include <new>
4
 
5
-#if MACOS
6
-#include <sys/param.h>
7
-#include <sys/sysctl.h>
8
-#endif
9
-
10
-namespace x265 {
11
-// x265 private namespace
12
-
13
-class ThreadPoolImpl;
14
+#if X86_64
15
 
16
-class PoolThread : public Thread
17
-{
18
-private:
19
+#ifdef __GNUC__
20
 
21
-    ThreadPoolImpl &m_pool;
22
+#define SLEEPBITMAP_CTZ(id, x)     id = (unsigned long)__builtin_ctzll(x)
23
+#define SLEEPBITMAP_OR(ptr, mask)  __sync_fetch_and_or(ptr, mask)
24
+#define SLEEPBITMAP_AND(ptr, mask) __sync_fetch_and_and(ptr, mask)
25
 
26
-    PoolThread& operator =(const PoolThread&);
27
+#elif defined(_MSC_VER)
28
 
29
-    int            m_id;
30
+#define SLEEPBITMAP_CTZ(id, x)     _BitScanForward64(&id, x)
31
+#define SLEEPBITMAP_OR(ptr, mask)  InterlockedOr64((volatile LONG64*)ptr, (LONG)mask)
32
+#define SLEEPBITMAP_AND(ptr, mask) InterlockedAnd64((volatile LONG64*)ptr, (LONG)mask)
33
 
34
-    bool           m_dirty;
35
+#endif // ifdef __GNUC__
36
 
37
-    bool           m_exited;
38
-
39
-    Event          m_wakeEvent;
40
-
41
-public:
42
-
43
-    PoolThread(ThreadPoolImpl& pool, int id)
44
-        : m_pool(pool)
45
-        , m_id(id)
46
-        , m_dirty(false)
47
-        , m_exited(false)
48
-    {
49
-    }
50
-
51
-    bool isDirty() const  { return m_dirty; }
52
-
53
-    void markDirty()      { m_dirty = true; }
54
+#else
55
 
56
-    bool isExited() const { return m_exited; }
57
+/* use 32-bit primitives defined in threading.h */
58
+#define SLEEPBITMAP_CTZ CTZ
59
+#define SLEEPBITMAP_OR  ATOMIC_OR
60
+#define SLEEPBITMAP_AND ATOMIC_AND
61
 
62
-    void poke()           { m_wakeEvent.trigger(); }
63
+#endif
64
 
65
-    virtual ~PoolThread() {}
66
+#if MACOS
67
+#include <sys/param.h>
68
+#include <sys/sysctl.h>
69
+#endif
70
+#if HAVE_LIBNUMA
71
+#include <numa.h>
72
+#endif
73
 
74
-    void threadMain();
75
-};
76
+namespace x265 {
77
+// x265 private namespace
78
 
79
-class ThreadPoolImpl : public ThreadPool
80
+class WorkerThread : public Thread
81
 {
82
 private:
83
 
84
-    bool         m_ok;
85
-    int          m_referenceCount;
86
-    int          m_numThreads;
87
-    int          m_numSleepMapWords;
88
-    PoolThread  *m_threads;
89
-    volatile uint32_t *m_sleepMap;
90
+    ThreadPool&  m_pool;
91
+    int          m_id;
92
+    Event        m_wakeEvent;
93
 
94
-    /* Lock for write access to the provider lists.  Threads are
95
-     * always allowed to read m_firstProvider and follow the
96
-     * linked list.  Providers must zero their m_nextProvider
97
-     * pointers before removing themselves from this list */
98
-    Lock         m_writeLock;
99
+    WorkerThread& operator =(const WorkerThread&);
100
 
101
 public:
102
 
103
-    static ThreadPoolImpl *s_instance;
104
-    static Lock s_createLock;
105
-
106
-    JobProvider *m_firstProvider;
107
-    JobProvider *m_lastProvider;
108
-
109
-public:
110
-
111
-    ThreadPoolImpl(int numthreads);
112
-
113
-    virtual ~ThreadPoolImpl();
114
-
115
-    ThreadPoolImpl *AddReference()
116
-    {
117
-        m_referenceCount++;
118
-
119
-        return this;
120
-    }
121
-
122
-    void markThreadAsleep(int id);
123
-
124
-    void waitForAllIdle();
125
-
126
-    int getThreadCount() const { return m_numThreads; }
127
-
128
-    bool IsValid() const       { return m_ok; }
129
-
130
-    void release();
131
+    JobProvider*     m_curJobProvider;
132
+    BondedTaskGroup* m_bondMaster;
133
 
134
-    void Stop();
135
+    WorkerThread(ThreadPool& pool, int id) : m_pool(pool), m_id(id) {}
136
+    virtual ~WorkerThread() {}
137
 
138
-    void enqueueJobProvider(JobProvider &);
139
-
140
-    void dequeueJobProvider(JobProvider &);
141
-
142
-    void FlushProviderList();
143
-
144
-    void pokeIdleThread();
145
+    void threadMain();
146
+    void awaken()           { m_wakeEvent.trigger(); }
147
 };
148
 
149
-void PoolThread::threadMain()
150
+void WorkerThread::threadMain()
151
 {
152
     THREAD_NAME("Worker", m_id);
153
 
154
@@ -145,286 +95,361 @@
155
     __attribute__((unused)) int val = nice(10);
156
 #endif
157
 
158
-    while (m_pool.IsValid())
159
+    m_pool.setCurrentThreadAffinity();
160
+
161
+    sleepbitmap_t idBit = (sleepbitmap_t)1 << m_id;
162
+    m_curJobProvider = m_pool.m_jpTable[0];
163
+    m_bondMaster = NULL;
164
+
165
+    SLEEPBITMAP_OR(&m_curJobProvider->m_ownerBitmap, idBit);
166
+    SLEEPBITMAP_OR(&m_pool.m_sleepBitmap, idBit);
167
+    m_wakeEvent.wait();
168
+
169
+    while (m_pool.m_isActive)
170
     {
171
-        /* Walk list of job providers, looking for work */
172
-        JobProvider *cur = m_pool.m_firstProvider;
173
-        while (cur)
174
+        if (m_bondMaster)
175
         {
176
-            // FindJob() may perform actual work and return true.  If
177
-            // it does we restart the job search
178
-            if (cur->findJob(m_id) == true)
179
-                break;
180
-
181
-            cur = cur->m_nextProvider;
182
+            m_bondMaster->processTasks(m_id);
183
+            m_bondMaster->m_exitedPeerCount.incr();
184
+            m_bondMaster = NULL;
185
         }
186
 
187
-        // this thread has reached the end of the provider list
188
-        m_dirty = false;
189
-
190
-        if (cur == NULL)
191
+        do
192
         {
193
-            m_pool.markThreadAsleep(m_id);
194
-            m_wakeEvent.wait();
195
+            /* do pending work for current job provider */
196
+            m_curJobProvider->findJob(m_id);
197
+
198
+            /* if the current job provider still wants help, only switch to a
199
+             * higher priority provider (lower slice type). Else take the first
200
+             * available job provider with the highest priority */
201
x265_1.5.tar.gz/source/common/threadpool.h -> x265_1.6.tar.gz/source/common/threadpool.h Changed
201
 
1
@@ -25,85 +25,148 @@
2
 #define X265_THREADPOOL_H
3
 
4
 #include "common.h"
5
+#include "threading.h"
6
 
7
 namespace x265 {
8
 // x265 private namespace
9
 
10
 class ThreadPool;
11
+class WorkerThread;
12
+class BondedTaskGroup;
13
 
14
-int getCpuCount();
15
+#if X86_64
16
+typedef uint64_t sleepbitmap_t;
17
+#else
18
+typedef uint32_t sleepbitmap_t;
19
+#endif
20
 
21
-// Any class that wants to distribute work to the thread pool must
22
-// derive from JobProvider and implement FindJob().
23
+static const sleepbitmap_t ALL_POOL_THREADS = (sleepbitmap_t)-1;
24
+enum { MAX_POOL_THREADS = sizeof(sleepbitmap_t) * 8 };
25
+enum { INVALID_SLICE_PRIORITY = 10 }; // a value larger than any X265_TYPE_* macro
26
+
27
+// Frame level job providers. FrameEncoder and Lookahead derive from
28
+// this class and implement findJob()
29
 class JobProvider
30
 {
31
-protected:
32
-
33
-    ThreadPool   *m_pool;
34
-
35
-    JobProvider  *m_nextProvider;
36
-    JobProvider  *m_prevProvider;
37
-
38
 public:
39
 
40
-    JobProvider(ThreadPool *p) : m_pool(p), m_nextProvider(0), m_prevProvider(0) {}
41
+    ThreadPool*   m_pool;
42
+    sleepbitmap_t m_ownerBitmap;
43
+    int           m_jpId;
44
+    int           m_sliceType;
45
+    bool          m_helpWanted;
46
+    bool          m_isFrameEncoder; /* rather ugly hack, but nothing better presents itself */
47
+
48
+    JobProvider()
49
+        : m_pool(NULL)
50
+        , m_ownerBitmap(0)
51
+        , m_jpId(-1)
52
+        , m_sliceType(INVALID_SLICE_PRIORITY)
53
+        , m_helpWanted(false)
54
+        , m_isFrameEncoder(false)
55
+    {}
56
 
57
     virtual ~JobProvider() {}
58
 
59
-    void setThreadPool(ThreadPool *p) { m_pool = p; }
60
-
61
-    // Register this job provider with the thread pool, jobs are available
62
-    void enqueue();
63
-
64
-    // Remove this job provider from the thread pool, all jobs complete
65
-    void dequeue();
66
-
67
-    // Worker threads will call this method to find a job.  Must return true if
68
-    // work was completed.  False if no work was available.
69
-    virtual bool findJob(int threadId) = 0;
70
-
71
-    // All derived objects that call Enqueue *MUST* call flush before allowing
72
-    // their object to be destroyed, otherwise you will see random crashes involving
73
-    // partially freed vtables and you will be unhappy
74
-    void flush();
75
+    // Worker threads will call this method to perform work
76
+    virtual void findJob(int workerThreadId) = 0;
77
 
78
-    friend class ThreadPoolImpl;
79
-    friend class PoolThread;
80
+    // Will awaken one idle thread, preferring a thread which most recently
81
+    // performed work for this provider.
82
+    void tryWakeOne();
83
 };
84
 
85
-// Abstract interface to ThreadPool.  Each encoder instance should call
86
-// AllocThreadPool() to get a handle to the singleton object and then make
87
-// it available to their job provider structures (wave-front frame encoders,
88
-// etc).
89
 class ThreadPool
90
 {
91
-protected:
92
-
93
-    // Destructor is inaccessable, force the use of reference counted Release()
94
-    ~ThreadPool() {}
95
-
96
-    virtual void enqueueJobProvider(JobProvider &) = 0;
97
+public:
98
 
99
-    virtual void dequeueJobProvider(JobProvider &) = 0;
100
+    sleepbitmap_t m_sleepBitmap;
101
+    int           m_numProviders;
102
+    int           m_numWorkers;
103
+    int           m_numaNode;
104
+    bool          m_isActive;
105
 
106
-public:
107
+    JobProvider** m_jpTable;
108
+    WorkerThread* m_workers;
109
 
110
-    // When numthreads == 0, a default thread count is used. A request may grow
111
-    // an existing pool but it will never shrink.
112
-    static ThreadPool *allocThreadPool(int numthreads = 0);
113
+    ThreadPool();
114
+    ~ThreadPool();
115
 
116
-    static ThreadPool *getThreadPool();
117
+    bool create(int numThreads, int maxProviders, int node);
118
+    bool start();
119
+    void stop();
120
+    void setCurrentThreadAffinity();
121
+    int  tryAcquireSleepingThread(sleepbitmap_t firstTryBitmap, sleepbitmap_t secondTryBitmap);
122
+    int  tryBondPeers(int maxPeers, sleepbitmap_t peerBitmap, BondedTaskGroup& master);
123
 
124
-    virtual void pokeIdleThread() = 0;
125
+    static ThreadPool* allocThreadPools(x265_param* p, int& numPools);
126
 
127
-    // The pool is reference counted so all calls to AllocThreadPool() should be
128
-    // followed by a call to Release()
129
-    virtual void release() = 0;
130
+    static int  getCpuCount();
131
+    static int  getNumaNodeCount();
132
+    static void setThreadNodeAffinity(int node);
133
+};
134
 
135
-    virtual int  getThreadCount() const = 0;
136
+/* Any worker thread may enlist the help of idle worker threads from the same
137
+ * job provider. They must derive from this class and implement the
138
+ * processTasks() method.  To use, an instance must be instantiated by a worker
139
+ * thread (referred to as the master thread) and then tryBondPeers() must be
140
+ * called. If it returns non-zero then some number of slave worker threads are
141
+ * already in the process of calling your processTasks() function. The master
142
+ * thread should participate and call processTasks() itself. When
143
+ * waitForExit() returns, all bonded peer threads are guaranteed to have
144
+ * exited processTasks(). Since the thread count is small, it uses explicit
145
+ * locking instead of atomic counters and bitmasks */
146
+class BondedTaskGroup
147
+{
148
+public:
149
 
150
-    friend class JobProvider;
151
+    Lock              m_lock;
152
+    ThreadSafeInteger m_exitedPeerCount;
153
+    int               m_bondedPeerCount;
154
+    int               m_jobTotal;
155
+    int               m_jobAcquired;
156
+
157
+    BondedTaskGroup()  { m_bondedPeerCount = m_jobTotal = m_jobAcquired = 0; }
158
+
159
+    /* Do not allow the instance to be destroyed before all bonded peers have
160
+     * exited processTasks() */
161
+    ~BondedTaskGroup() { waitForExit(); }
162
+
163
+    /* Try to enlist the help of idle worker threads most recently associated
164
+     * with the given job provider and "bond" them to work on your tasks. Up to
165
+     * maxPeers worker threads will call your processTasks() method. */
166
+    int tryBondPeers(JobProvider& jp, int maxPeers)
167
+    {
168
+        int count = jp.m_pool->tryBondPeers(maxPeers, jp.m_ownerBitmap, *this);
169
+        m_bondedPeerCount += count;
170
+        return count;
171
+    }
172
+
173
+    /* Try to enlist the help of any idle worker threads and "bond" them to work
174
+     * on your tasks. Up to maxPeers worker threads will call your
175
+     * processTasks() method. */
176
+    int tryBondPeers(ThreadPool& pool, int maxPeers)
177
+    {
178
+        int count = pool.tryBondPeers(maxPeers, ALL_POOL_THREADS, *this);
179
+        m_bondedPeerCount += count;
180
+        return count;
181
+    }
182
+
183
+    /* Returns when all bonded peers have exited processTasks(). It does *NOT*
184
+     * ensure all tasks are completed (but this is generally implied). */
185
+    void waitForExit()
186
+    {
187
+        int exited = m_exitedPeerCount.get();
188
+        while (m_bondedPeerCount != exited)
189
+            exited = m_exitedPeerCount.waitForChange(exited);
190
+    }
191
+
192
+    /* Derived classes must define this method. The worker thread ID may be
193
+     * used to index into thread local data, or ignored.  The ID will be between
194
+     * 0 and jp.m_pool->m_numWorkers - 1 */
195
+    virtual void processTasks(int workerThreadId) = 0;
196
 };
197
+
198
 } // end namespace x265
199
 
200
 #endif // ifndef X265_THREADPOOL_H
201
x265_1.5.tar.gz/source/common/wavefront.cpp -> x265_1.6.tar.gz/source/common/wavefront.cpp Changed
49
 
1
@@ -54,13 +54,13 @@
2
 void WaveFront::clearEnabledRowMask()
3
 {
4
     memset((void*)m_externalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords);
5
+    memset((void*)m_internalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords);
6
 }
7
 
8
 void WaveFront::enqueueRow(int row)
9
 {
10
     uint32_t bit = 1 << (row & 31);
11
     ATOMIC_OR(&m_internalDependencyBitmap[row >> 5], bit);
12
-    if (m_pool) m_pool->pokeIdleThread();
13
 }
14
 
15
 void WaveFront::enableRow(int row)
16
@@ -80,11 +80,11 @@
17
     return !!(ATOMIC_AND(&m_internalDependencyBitmap[row >> 5], ~bit) & bit);
18
 }
19
 
20
-bool WaveFront::findJob(int threadId)
21
+void WaveFront::findJob(int threadId)
22
 {
23
     unsigned long id;
24
 
25
-    // thread safe
26
+    /* Loop over each word until all available rows are finished */
27
     for (int w = 0; w < m_numWords; w++)
28
     {
29
         uint32_t oldval = m_internalDependencyBitmap[w] & m_externalDependencyBitmap[w];
30
@@ -97,15 +97,14 @@
31
             {
32
                 /* we cleared the bit, we get to process the row */
33
                 processRow(w * 32 + id, threadId);
34
-                return true;
35
+                m_helpWanted = true;
36
+                return; /* check for a higher priority task */
37
             }
38
 
39
-            // some other thread cleared the bit, try another bit
40
             oldval = m_internalDependencyBitmap[w] & m_externalDependencyBitmap[w];
41
         }
42
     }
43
 
44
-    // made it through the bitmap without finding any enqueued rows
45
-    return false;
46
+    m_helpWanted = false;
47
 }
48
 }
49
x265_1.5.tar.gz/source/common/wavefront.h -> x265_1.6.tar.gz/source/common/wavefront.h Changed
26
 
1
@@ -53,10 +53,9 @@
2
 
3
 public:
4
 
5
-    WaveFront(ThreadPool *pool)
6
-        : JobProvider(pool)
7
-        , m_internalDependencyBitmap(0)
8
-        , m_externalDependencyBitmap(0)
9
+    WaveFront()
10
+        : m_internalDependencyBitmap(NULL)
11
+        , m_externalDependencyBitmap(NULL)
12
     {}
13
 
14
     virtual ~WaveFront();
15
@@ -86,8 +85,8 @@
16
 
17
     // WaveFront's implementation of JobProvider::findJob. Consults
18
     // m_queuedBitmap and calls ProcessRow(row) for lowest numbered queued row
19
-    // or returns false
20
-    bool findJob(int threadId);
21
+    // processes available rows and returns when no work remains
22
+    void findJob(int threadId);
23
 
24
     // Start or resume encode processing of this row, must be implemented by
25
     // derived classes.
26
x265_1.5.tar.gz/source/common/x86/asm-primitives.cpp -> x265_1.6.tar.gz/source/common/x86/asm-primitives.cpp Changed
201
 
1
@@ -44,6 +44,11 @@
2
     p.cu[BLOCK_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \
3
     p.cu[BLOCK_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu; \
4
     p.cu[BLOCK_64x64].prim = fncdef x265_ ## fname ## _64x64_ ## cpu
5
+#define ALL_LUMA_CU_TYPED_S(prim, fncdef, fname, cpu) \
6
+    p.cu[BLOCK_8x8].prim   = fncdef x265_ ## fname ## 8_ ## cpu; \
7
+    p.cu[BLOCK_16x16].prim = fncdef x265_ ## fname ## 16_ ## cpu; \
8
+    p.cu[BLOCK_32x32].prim = fncdef x265_ ## fname ## 32_ ## cpu; \
9
+    p.cu[BLOCK_64x64].prim = fncdef x265_ ## fname ## 64_ ## cpu
10
 #define ALL_LUMA_TU_TYPED(prim, fncdef, fname, cpu) \
11
     p.cu[BLOCK_4x4].prim   = fncdef x265_ ## fname ## _4x4_ ## cpu; \
12
     p.cu[BLOCK_8x8].prim   = fncdef x265_ ## fname ## _8x8_ ## cpu; \
13
@@ -61,6 +66,7 @@
14
     p.cu[BLOCK_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu; \
15
     p.cu[BLOCK_64x64].prim = fncdef x265_ ## fname ## _64x64_ ## cpu;
16
 #define ALL_LUMA_CU(prim, fname, cpu)      ALL_LUMA_CU_TYPED(prim, , fname, cpu)
17
+#define ALL_LUMA_CU_S(prim, fname, cpu)    ALL_LUMA_CU_TYPED_S(prim, , fname, cpu)
18
 #define ALL_LUMA_TU(prim, fname, cpu)      ALL_LUMA_TU_TYPED(prim, , fname, cpu)
19
 #define ALL_LUMA_BLOCKS(prim, fname, cpu)  ALL_LUMA_BLOCKS_TYPED(prim, , fname, cpu)
20
 #define ALL_LUMA_TU_S(prim, fname, cpu)    ALL_LUMA_TU_TYPED_S(prim, , fname, cpu)
21
@@ -179,7 +185,6 @@
22
     p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].prim  = fncdef x265_ ## fname ## _8x32_ ## cpu
23
 #define ALL_CHROMA_420_4x4_PU(prim, fname, cpu) ALL_CHROMA_420_4x4_PU_TYPED(prim, , fname, cpu)
24
 
25
-
26
 #define ALL_CHROMA_422_CU_TYPED(prim, fncdef, fname, cpu) \
27
     p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].prim   = fncdef x265_ ## fname ## _4x8_ ## cpu; \
28
     p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].prim  = fncdef x265_ ## fname ## _8x16_ ## cpu; \
29
@@ -791,6 +796,10 @@
30
 
31
 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // 16bpp
32
 {
33
+#if !defined(X86_64)
34
+#error "Unsupported build configuration (32bit x86 and HIGH_BIT_DEPTH), you must configure ENABLE_ASSEMBLY=OFF"
35
+#endif
36
+
37
     if (cpuMask & X265_CPU_SSE2)
38
     {
39
         /* We do not differentiate CPUs which support MMX and not SSE2. We only check
40
@@ -863,6 +872,16 @@
41
         ALL_LUMA_TU_S(calcresidual, getResidual, sse2);
42
         ALL_LUMA_TU_S(transpose, transpose, sse2);
43
 
44
+        p.cu[BLOCK_4x4].intra_pred[DC_IDX] = x265_intra_pred_dc4_sse2;
45
+        p.cu[BLOCK_8x8].intra_pred[DC_IDX] = x265_intra_pred_dc8_sse2;
46
+        p.cu[BLOCK_16x16].intra_pred[DC_IDX] = x265_intra_pred_dc16_sse2;
47
+        p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_sse2;
48
+
49
+        p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = x265_intra_pred_planar4_sse2;
50
+        p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = x265_intra_pred_planar8_sse2;
51
+        p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = x265_intra_pred_planar16_sse2;
52
+        p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = x265_intra_pred_planar32_sse2;
53
+
54
         p.cu[BLOCK_4x4].sse_ss = x265_pixel_ssd_ss_4x4_mmx2;
55
         ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
56
 
57
@@ -872,10 +891,10 @@
58
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixelcmp_t)x265_pixel_ssd_ss_32x64_sse2;
59
 
60
         p.cu[BLOCK_4x4].dct = x265_dct4_sse2;
61
+        p.cu[BLOCK_8x8].dct = x265_dct8_sse2;
62
         p.cu[BLOCK_4x4].idct = x265_idct4_sse2;
63
-#if X86_64
64
         p.cu[BLOCK_8x8].idct = x265_idct8_sse2;
65
-#endif
66
+
67
         p.idst4x4 = x265_idst4_sse2;
68
 
69
         LUMA_VSS_FILTERS(sse2);
70
@@ -894,7 +913,10 @@
71
 
72
         p.dst4x4 = x265_dst4_ssse3;
73
         p.cu[BLOCK_8x8].idct = x265_idct8_ssse3;
74
-        p.count_nonzero = x265_count_nonzero_ssse3;
75
+        p.cu[BLOCK_4x4].count_nonzero = x265_count_nonzero_4x4_ssse3;
76
+        p.cu[BLOCK_8x8].count_nonzero = x265_count_nonzero_8x8_ssse3;
77
+        p.cu[BLOCK_16x16].count_nonzero = x265_count_nonzero_16x16_ssse3;
78
+        p.cu[BLOCK_32x32].count_nonzero = x265_count_nonzero_32x32_ssse3;
79
         p.frameInitLowres = x265_frame_init_lowres_core_ssse3;
80
     }
81
     if (cpuMask & X265_CPU_SSE4)
82
@@ -931,19 +953,30 @@
83
         p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_sse4;
84
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_sse4;
85
 
86
-#if X86_64
87
+        // TODO: check POPCNT flag!
88
+        ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
89
         ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
90
         ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
91
-#endif
92
     }
93
     if (cpuMask & X265_CPU_AVX)
94
     {
95
         // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_avx; fails tests
96
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = x265_pixel_satd_16x24_avx;
97
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = x265_pixel_satd_32x48_avx;
98
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = x265_pixel_satd_24x64_avx;
99
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd = x265_pixel_satd_8x64_avx;
100
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = x265_pixel_satd_8x12_avx;
101
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = x265_pixel_satd_12x32_avx;
102
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = x265_pixel_satd_4x32_avx;
103
+
104
         ALL_LUMA_PU(satd, pixel_satd, avx);
105
         ASSIGN_SA8D(avx);
106
         LUMA_VAR(avx);
107
         p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_avx;
108
         p.ssim_end_4 = x265_pixel_ssim_end4_avx;
109
+
110
+        // copy_pp primitives
111
+        // 16 x N
112
         p.pu[LUMA_64x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x64_avx;
113
         p.pu[LUMA_16x4].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x4_avx;
114
         p.pu[LUMA_16x8].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x8_avx;
115
@@ -963,11 +996,82 @@
116
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x16_avx;
117
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x24_avx;
118
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x32_avx;
119
+
120
+        // 24 X N
121
+        p.pu[LUMA_24x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_24x32_avx;
122
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_24x32_avx;
123
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_24x64_avx;
124
+
125
+        // 32 x N
126
+        p.pu[LUMA_32x8].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x8_avx;
127
+        p.pu[LUMA_32x16].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x16_avx;
128
+        p.pu[LUMA_32x24].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x24_avx;
129
+        p.pu[LUMA_32x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x32_avx;
130
+        p.pu[LUMA_32x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x64_avx;
131
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x8_avx;
132
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x16_avx;
133
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x24_avx;
134
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x32_avx;
135
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x16_avx;
136
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x32_avx;
137
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x48_avx;
138
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x64_avx;
139
+
140
+        // 48 X 64
141
+        p.pu[LUMA_48x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_48x64_avx;
142
+
143
+        // copy_ss primitives
144
+        // 16 X N
145
+        p.cu[BLOCK_16x16].copy_ss = x265_blockcopy_ss_16x16_avx;
146
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ss = x265_blockcopy_ss_16x16_avx;
147
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ss = x265_blockcopy_ss_16x32_avx;
148
+
149
+        // 32 X N
150
+        p.cu[BLOCK_32x32].copy_ss = x265_blockcopy_ss_32x32_avx;
151
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ss = x265_blockcopy_ss_32x32_avx;
152
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ss = x265_blockcopy_ss_32x64_avx;
153
+
154
+        // 64 X N
155
+        p.cu[BLOCK_64x64].copy_ss = x265_blockcopy_ss_64x64_avx;
156
+
157
+        // copy_ps primitives
158
+        // 16 X N
159
+        p.cu[BLOCK_16x16].copy_ps = (copy_ps_t)x265_blockcopy_ss_16x16_avx;
160
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ps = (copy_ps_t)x265_blockcopy_ss_16x16_avx;
161
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ps = (copy_ps_t)x265_blockcopy_ss_16x32_avx;
162
+
163
+        // 32 X N
164
+        p.cu[BLOCK_32x32].copy_ps = (copy_ps_t)x265_blockcopy_ss_32x32_avx;
165
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ps = (copy_ps_t)x265_blockcopy_ss_32x32_avx;
166
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ps = (copy_ps_t)x265_blockcopy_ss_32x64_avx;
167
+
168
+        // 64 X N
169
+        p.cu[BLOCK_64x64].copy_ps = (copy_ps_t)x265_blockcopy_ss_64x64_avx;
170
+
171
+        // copy_sp primitives
172
+        // 16 X N
173
+        p.cu[BLOCK_16x16].copy_sp = (copy_sp_t)x265_blockcopy_ss_16x16_avx;
174
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_sp = (copy_sp_t)x265_blockcopy_ss_16x16_avx;
175
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_sp = (copy_sp_t)x265_blockcopy_ss_16x32_avx;
176
+
177
+        // 32 X N
178
+        p.cu[BLOCK_32x32].copy_sp = (copy_sp_t)x265_blockcopy_ss_32x32_avx;
179
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = (copy_sp_t)x265_blockcopy_ss_32x32_avx;
180
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = (copy_sp_t)x265_blockcopy_ss_32x64_avx;
181
+
182
+        // 64 X N
183
+        p.cu[BLOCK_64x64].copy_sp = (copy_sp_t)x265_blockcopy_ss_64x64_avx;
184
+
185
         p.frameInitLowres = x265_frame_init_lowres_core_avx;
186
+
187
+        p.pu[LUMA_64x16].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x16_avx;
188
+        p.pu[LUMA_64x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x32_avx;
189
+        p.pu[LUMA_64x48].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x48_avx;
190
+        p.pu[LUMA_64x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x64_avx;
191
     }
192
     if (cpuMask & X265_CPU_XOP)
193
     {
194
-        p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_xop;
195
+        //p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_xop; this one is broken
196
         ALL_LUMA_PU(satd, pixel_satd, xop);
197
         ASSIGN_SA8D(xop);
198
         LUMA_VAR(xop);
199
@@ -975,36 +1079,48 @@
200
     }
201
x265_1.5.tar.gz/source/common/x86/blockcopy8.asm -> x265_1.6.tar.gz/source/common/x86/blockcopy8.asm Changed
201
 
1
@@ -47,15 +47,15 @@
2
 cglobal blockcopy_pp_2x4, 4, 7, 0
3
     mov    r4w,    [r2]
4
     mov    r5w,    [r2 + r3]
5
-    lea    r2,     [r2 + r3 * 2]
6
-    mov    r6w,    [r2]
7
+    mov    r6w,    [r2 + 2 * r3]
8
+    lea    r3,     [r3 + 2 * r3]
9
     mov    r3w,    [r2 + r3]
10
 
11
-    mov    [r0],         r4w
12
-    mov    [r0 + r1],    r5w
13
-    lea    r0,           [r0 + 2 * r1]
14
-    mov    [r0],         r6w
15
-    mov    [r0 + r1],    r3w
16
+    mov    [r0],          r4w
17
+    mov    [r0 + r1],     r5w
18
+    mov    [r0 + 2 * r1], r6w
19
+    lea    r1,            [r1 + 2 * r1]
20
+    mov    [r0 + r1],     r3w
21
 RET
22
 
23
 ;-----------------------------------------------------------------------------
24
@@ -63,37 +63,29 @@
25
 ;-----------------------------------------------------------------------------
26
 INIT_XMM sse2
27
 cglobal blockcopy_pp_2x8, 4, 7, 0
28
-    mov     r4w,     [r2]
29
-    mov     r5w,     [r2 + r3]
30
-    mov     r6w,     [r2 + 2 * r3]
31
+    lea     r5,      [3 * r1]
32
+    lea     r6,      [3 * r3]
33
 
34
-    mov     [r0],            r4w
35
-    mov     [r0 + r1],       r5w
36
-    mov     [r0 + 2 * r1],   r6w
37
-
38
-    lea     r0,             [r0 + 2 * r1]
39
-    lea     r2,             [r2 + 2 * r3]
40
-
41
-    mov     r4w,             [r2 + r3]
42
-    mov     r5w,             [r2 + 2 * r3]
43
-
44
-    mov     [r0 + r1],       r4w
45
-    mov     [r0 + 2 * r1],   r5w
46
-
47
-    lea     r0,              [r0 + 2 * r1]
48
-    lea     r2,              [r2 + 2 * r3]
49
-
50
-    mov     r4w,             [r2 + r3]
51
-    mov     r5w,             [r2 + 2 * r3]
52
-
53
-    mov     [r0 + r1],       r4w
54
-    mov     [r0 + 2 * r1],   r5w
55
-
56
-    lea     r0,              [r0 + 2 * r1]
57
-    lea     r2,              [r2 + 2 * r3]
58
-
59
-    mov     r4w,             [r2 + r3]
60
-    mov     [r0 + r1],       r4w
61
+    mov     r4w,           [r2]
62
+    mov     [r0],          r4w
63
+    mov     r4w,           [r2 + r3]
64
+    mov     [r0 + r1],     r4w
65
+    mov     r4w,           [r2 + 2 * r3]
66
+    mov     [r0 + 2 * r1], r4w
67
+    mov     r4w,           [r2 + r6]
68
+    mov     [r0 + r5],     r4w
69
+
70
+    lea     r2,            [r2 + 4 * r3]
71
+    mov     r4w,           [r2]
72
+    lea     r0,            [r0 + 4 * r1]
73
+    mov     [r0],          r4w
74
+
75
+    mov     r4w,           [r2 + r3]
76
+    mov     [r0 + r1],     r4w
77
+    mov     r4w,           [r2 + 2 * r3]
78
+    mov     [r0 + 2 * r1], r4w
79
+    mov     r4w,           [r2 + r6]
80
+    mov     [r0 + r5],     r4w
81
     RET
82
 
83
 ;-----------------------------------------------------------------------------
84
@@ -101,16 +93,30 @@
85
 ;-----------------------------------------------------------------------------
86
 INIT_XMM sse2
87
 cglobal blockcopy_pp_2x16, 4, 7, 0
88
-    mov     r6d,    16/2
89
-.loop:
90
-    mov     r4w,    [r2]
91
-    mov     r5w,    [r2 + r3]
92
-    dec     r6d
93
-    lea     r2,     [r2 + r3 * 2]
94
-    mov     [r0],       r4w
95
-    mov     [r0 + r1],  r5w
96
-    lea     r0,     [r0 + r1 * 2]
97
-    jnz     .loop
98
+    lea     r5,      [3 * r1]
99
+    lea     r6,      [3 * r3]
100
+
101
+    mov     r4w,           [r2]
102
+    mov     [r0],          r4w
103
+    mov     r4w,           [r2 + r3]
104
+    mov     [r0 + r1],     r4w
105
+    mov     r4w,           [r2 + 2 * r3]
106
+    mov     [r0 + 2 * r1], r4w
107
+    mov     r4w,           [r2 + r6]
108
+    mov     [r0 + r5],     r4w
109
+
110
+%rep 3
111
+    lea     r2,            [r2 + 4 * r3]
112
+    mov     r4w,           [r2]
113
+    lea     r0,            [r0 + 4 * r1]
114
+    mov     [r0],          r4w
115
+    mov     r4w,           [r2 + r3]
116
+    mov     [r0 + r1],     r4w
117
+    mov     r4w,           [r2 + 2 * r3]
118
+    mov     [r0 + 2 * r1], r4w
119
+    mov     r4w,           [r2 + r6]
120
+    mov     [r0 + r5],     r4w
121
+%endrep
122
     RET
123
 
124
 
125
@@ -145,115 +151,130 @@
126
     RET
127
 
128
 ;-----------------------------------------------------------------------------
129
+; void blockcopy_pp_4x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
130
+;-----------------------------------------------------------------------------
131
+INIT_XMM sse2
132
+cglobal blockcopy_pp_4x8, 4, 6, 4
133
+
134
+    lea     r4,    [3 * r1]
135
+    lea     r5,    [3 * r3]
136
+
137
+    movd     m0,     [r2]
138
+    movd     m1,     [r2 + r3]
139
+    movd     m2,     [r2 + 2 * r3]
140
+    movd     m3,     [r2 + r5]
141
+
142
+    movd     [r0],          m0
143
+    movd     [r0 + r1],     m1
144
+    movd     [r0 + 2 * r1], m2
145
+    movd     [r0 + r4],     m3
146
+
147
+    lea      r2,     [r2 + 4 * r3]
148
+    movd     m0,     [r2]
149
+    movd     m1,     [r2 + r3]
150
+    movd     m2,     [r2 + 2 * r3]
151
+    movd     m3,     [r2 + r5]
152
+
153
+    lea      r0,            [r0 + 4 * r1]
154
+    movd     [r0],          m0
155
+    movd     [r0 + r1],     m1
156
+    movd     [r0 + 2 * r1], m2
157
+    movd     [r0 + r4],     m3
158
+    RET
159
+
160
+;-----------------------------------------------------------------------------
161
 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
162
 ;-----------------------------------------------------------------------------
163
 %macro BLOCKCOPY_PP_W4_H8 2
164
 INIT_XMM sse2
165
-cglobal blockcopy_pp_%1x%2, 4, 5, 4
166
+cglobal blockcopy_pp_%1x%2, 4, 7, 4
167
     mov    r4d,    %2/8
168
+    lea    r5,     [3 * r1]
169
+    lea    r6,     [3 * r3]
170
+
171
 .loop:
172
     movd     m0,     [r2]
173
     movd     m1,     [r2 + r3]
174
-    lea      r2,     [r2 + 2 * r3]
175
-    movd     m2,     [r2]
176
-    movd     m3,     [r2 + r3]
177
+    movd     m2,     [r2 + 2 * r3]
178
+    movd     m3,     [r2 + r6]
179
 
180
-    movd     [r0],                m0
181
-    movd     [r0 + r1],           m1
182
-    lea      r0,                  [r0 + 2 * r1]
183
-    movd     [r0],                m2
184
-    movd     [r0 + r1],           m3
185
+    movd     [r0],          m0
186
+    movd     [r0 + r1],     m1
187
+    movd     [r0 + 2 * r1], m2
188
+    movd     [r0 + r5],     m3
189
 
190
-    lea       r0,     [r0 + 2 * r1]
191
-    lea       r2,     [r2 + 2 * r3]
192
+    lea      r2,     [r2 + 4 * r3]
193
     movd     m0,     [r2]
194
     movd     m1,     [r2 + r3]
195
-    lea      r2,     [r2 + 2 * r3]
196
-    movd     m2,     [r2]
197
-    movd     m3,     [r2 + r3]
198
+    movd     m2,     [r2 + 2 * r3]
199
+    movd     m3,     [r2 + r6]
200
 
201
x265_1.5.tar.gz/source/common/x86/blockcopy8.h -> x265_1.6.tar.gz/source/common/x86/blockcopy8.h Changed
67
 
1
@@ -48,6 +48,12 @@
2
 void x265_cpy1Dto2D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
3
 void x265_cpy1Dto2D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
4
 void x265_cpy1Dto2D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
5
+void x265_cpy2Dto1D_shl_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
6
+void x265_cpy2Dto1D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
7
+void x265_cpy2Dto1D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
8
+void x265_cpy2Dto1D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
9
+void x265_cpy2Dto1D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
10
+void x265_cpy2Dto1D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
11
 uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
12
 uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
13
 uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
14
@@ -198,6 +204,15 @@
15
 void x265_blockcopy_ss_64x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
16
 void x265_blockcopy_ss_64x48_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
17
 void x265_blockcopy_ss_64x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
18
+void x265_blockcopy_ss_32x8_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
19
+void x265_blockcopy_ss_32x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
20
+void x265_blockcopy_ss_32x24_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
21
+void x265_blockcopy_ss_32x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
22
+void x265_blockcopy_ss_32x48_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
23
+void x265_blockcopy_ss_32x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
24
+void x265_blockcopy_ss_48x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
25
+void x265_blockcopy_ss_24x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
26
+void x265_blockcopy_ss_24x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
27
 
28
 void x265_blockcopy_pp_32x8_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
29
 void x265_blockcopy_pp_32x16_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
30
@@ -205,9 +220,36 @@
31
 void x265_blockcopy_pp_32x32_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
32
 void x265_blockcopy_pp_32x48_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
33
 void x265_blockcopy_pp_32x64_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
34
+void x265_blockcopy_pp_64x16_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
35
+void x265_blockcopy_pp_64x32_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
36
+void x265_blockcopy_pp_64x48_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
37
+void x265_blockcopy_pp_64x64_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
38
+void x265_blockcopy_pp_48x64_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
39
 
40
 void x265_blockfill_s_16x16_avx2(int16_t* dst, intptr_t dstride, int16_t val);
41
 void x265_blockfill_s_32x32_avx2(int16_t* dst, intptr_t dstride, int16_t val);
42
+// copy_sp primitives
43
+// 16 x N
44
+void x265_blockcopy_sp_16x16_avx2(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
45
+void x265_blockcopy_sp_16x32_avx2(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
46
+
47
+// 32 x N
48
+void x265_blockcopy_sp_32x32_avx2(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
49
+void x265_blockcopy_sp_32x64_avx2(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
50
+
51
+// 64 x N
52
+void x265_blockcopy_sp_64x64_avx2(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
53
+// copy_ps primitives
54
+// 16 x N
55
+void x265_blockcopy_ps_16x16_avx2(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
56
+void x265_blockcopy_ps_16x32_avx2(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
57
+
58
+// 32 x N
59
+void x265_blockcopy_ps_32x32_avx2(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
60
+void x265_blockcopy_ps_32x64_avx2(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
61
+
62
+// 64 x N
63
+void x265_blockcopy_ps_64x64_avx2(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
64
 
65
 #undef BLOCKCOPY_COMMON
66
 #undef BLOCKCOPY_SS_PP
67
x265_1.5.tar.gz/source/common/x86/const-a.asm -> x265_1.6.tar.gz/source/common/x86/const-a.asm Changed
66
 
1
@@ -6,7 +6,7 @@
2
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
3
 ;*          Fiona Glaser <fiona@x264.com>
4
 ;*          Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
5
-;*
6
+;*          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
7
 ;* This program is free software; you can redistribute it and/or modify
8
 ;* it under the terms of the GNU General Public License as published by
9
 ;* the Free Software Foundation; either version 2 of the License, or
10
@@ -37,11 +37,14 @@
11
 const pw_32,       times 16 dw 32
12
 const pw_128,      times 16 dw 128
13
 const pw_256,      times 16 dw 256
14
+const pw_257,      times 16 dw 257
15
 const pw_512,      times 16 dw 512
16
 const pw_1023,     times 8  dw 1023
17
+ALIGN 32
18
 const pw_1024,     times 16 dw 1024
19
 const pw_4096,     times 16 dw 4096
20
 const pw_00ff,     times 16 dw 0x00ff
21
+ALIGN 32
22
 const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
23
 const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
24
 const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
25
@@ -50,16 +53,16 @@
26
 const pb_unpackwq2, db 4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7
27
 const pw_swap,      times 2 db 6,7,4,5,2,3,0,1
28
 
29
-const pb_2,        times 16 db 2
30
-const pb_4,        times 16 db 4
31
-const pb_16,       times 16 db 16
32
-const pb_64,       times 16 db 64
33
+const pb_2,        times 32 db 2
34
+const pb_4,        times 32 db 4
35
+const pb_16,       times 32 db 16
36
+const pb_64,       times 32 db 64
37
 const pb_01,       times  8 db 0,1
38
 const pb_0,        times 16 db 0
39
 const pb_a1,       times 16 db 0xa1
40
 const pb_3,        times 16 db 3
41
-const pb_8,        times 16 db 8
42
-const pb_32,       times 16 db 32
43
+const pb_8,        times 32 db 8
44
+const pb_32,       times 32 db 32
45
 const pb_128,      times 16 db 128
46
 const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
47
 
48
@@ -72,7 +75,7 @@
49
 const pw_256,      times 8 dw 256
50
 const pw_32_0,     times 4 dw 32,
51
                    times 4 dw 0
52
-const pw_2000,     times 8 dw 0x2000
53
+const pw_2000,     times 16 dw 0x2000
54
 const pw_8000,     times 8 dw 0x8000
55
 const pw_3fff,     times 8 dw 0x3fff
56
 const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1
57
@@ -80,7 +83,7 @@
58
 const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1
59
 const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0
60
 const pd_1,        times 8 dd 1
61
-const pd_2,        times 4 dd 2
62
+const pd_2,        times 8 dd 2
63
 const pd_4,        times 4 dd 4
64
 const pd_8,        times 4 dd 8
65
 const pd_16,       times 4 dd 16
66
x265_1.5.tar.gz/source/common/x86/dct8.asm -> x265_1.6.tar.gz/source/common/x86/dct8.asm Changed
201
 
1
@@ -748,6 +748,368 @@
2
     movhps      [r1 + r2], m1
3
     RET
4
 
5
+;-------------------------------------------------------
6
+; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
7
+;-------------------------------------------------------
8
+INIT_XMM sse2
9
+cglobal dct8, 3,6,8,0-16*mmsize
10
+    ;------------------------
11
+    ; Stack Mapping(dword)
12
+    ;------------------------
13
+    ; Row0[0-3] Row1[0-3]
14
+    ; ...
15
+    ; Row6[0-3] Row7[0-3]
16
+    ; Row0[4-7] Row1[4-7]
17
+    ; ...
18
+    ; Row6[4-7] Row7[4-7]
19
+    ;------------------------
20
+%if BIT_DEPTH == 10
21
+  %define       DCT_SHIFT1 4
22
+  %define       DCT_ADD1 [pd_8]
23
+%elif BIT_DEPTH == 8
24
+  %define       DCT_SHIFT1 2
25
+  %define       DCT_ADD1 [pd_2]
26
+%else
27
+  %error Unsupported BIT_DEPTH!
28
+%endif
29
+%define         DCT_ADD2 [pd_256]
30
+%define         DCT_SHIFT2 9
31
+
32
+    add         r2, r2
33
+    lea         r3, [r2 * 3]
34
+    mov         r5, rsp
35
+%assign x 0
36
+%rep 2
37
+    movu        m0, [r0]
38
+    movu        m1, [r0 + r2]
39
+    movu        m2, [r0 + r2 * 2]
40
+    movu        m3, [r0 + r3]
41
+
42
+    punpcklwd   m4, m0, m1
43
+    punpckhwd   m0, m1
44
+    punpcklwd   m5, m2, m3
45
+    punpckhwd   m2, m3
46
+    punpckldq   m1, m4, m5          ; m1 = [1 0]
47
+    punpckhdq   m4, m5              ; m4 = [3 2]
48
+    punpckldq   m3, m0, m2
49
+    punpckhdq   m0, m2
50
+    pshufd      m2, m3, 0x4E        ; m2 = [4 5]
51
+    pshufd      m0, m0, 0x4E        ; m0 = [6 7]
52
+
53
+    paddw       m3, m1, m0
54
+    psubw       m1, m0              ; m1 = [d1 d0]
55
+    paddw       m0, m4, m2
56
+    psubw       m4, m2              ; m4 = [d3 d2]
57
+    punpcklqdq  m2, m3, m0          ; m2 = [s2 s0]
58
+    punpckhqdq  m3, m0
59
+    pshufd      m3, m3, 0x4E        ; m3 = [s1 s3]
60
+
61
+    punpcklwd   m0, m1, m4          ; m0 = [d2/d0]
62
+    punpckhwd   m1, m4              ; m1 = [d3/d1]
63
+    punpckldq   m4, m0, m1          ; m4 = [d3 d1 d2 d0]
64
+    punpckhdq   m0, m1              ; m0 = [d3 d1 d2 d0]
65
+
66
+    ; odd
67
+    lea         r4, [tab_dct8_1]
68
+    pmaddwd     m1, m4, [r4 + 0*16]
69
+    pmaddwd     m5, m0, [r4 + 0*16]
70
+    pshufd      m1, m1, 0xD8
71
+    pshufd      m5, m5, 0xD8
72
+    mova        m7, m1
73
+    punpckhqdq  m7, m5
74
+    punpcklqdq  m1, m5
75
+    paddd       m1, m7
76
+    paddd       m1, DCT_ADD1
77
+    psrad       m1, DCT_SHIFT1
78
+  %if x == 1
79
+    pshufd      m1, m1, 0x1B
80
+  %endif
81
+    mova        [r5 + 1*2*mmsize], m1 ; Row 1
82
+
83
+    pmaddwd     m1, m4, [r4 + 1*16]
84
+    pmaddwd     m5, m0, [r4 + 1*16]
85
+    pshufd      m1, m1, 0xD8
86
+    pshufd      m5, m5, 0xD8
87
+    mova        m7, m1
88
+    punpckhqdq  m7, m5
89
+    punpcklqdq  m1, m5
90
+    paddd       m1, m7
91
+    paddd       m1, DCT_ADD1
92
+    psrad       m1, DCT_SHIFT1
93
+  %if x == 1
94
+    pshufd      m1, m1, 0x1B
95
+  %endif
96
+    mova        [r5 + 3*2*mmsize], m1 ; Row 3
97
+
98
+    pmaddwd     m1, m4, [r4 + 2*16]
99
+    pmaddwd     m5, m0, [r4 + 2*16]
100
+    pshufd      m1, m1, 0xD8
101
+    pshufd      m5, m5, 0xD8
102
+    mova        m7, m1
103
+    punpckhqdq  m7, m5
104
+    punpcklqdq  m1, m5
105
+    paddd       m1, m7
106
+    paddd       m1, DCT_ADD1
107
+    psrad       m1, DCT_SHIFT1
108
+  %if x == 1
109
+    pshufd      m1, m1, 0x1B
110
+  %endif
111
+    mova        [r5 + 5*2*mmsize], m1 ; Row 5
112
+
113
+    pmaddwd     m4, [r4 + 3*16]
114
+    pmaddwd     m0, [r4 + 3*16]
115
+    pshufd      m4, m4, 0xD8
116
+    pshufd      m0, m0, 0xD8
117
+    mova        m7, m4
118
+    punpckhqdq  m7, m0
119
+    punpcklqdq  m4, m0
120
+    paddd       m4, m7
121
+    paddd       m4, DCT_ADD1
122
+    psrad       m4, DCT_SHIFT1
123
+  %if x == 1
124
+    pshufd      m4, m4, 0x1B
125
+  %endif
126
+    mova        [r5 + 7*2*mmsize], m4; Row 7
127
+
128
+    ; even
129
+    lea         r4, [tab_dct4]
130
+    paddw       m0, m2, m3          ; m0 = [EE1 EE0]
131
+    pshufd      m0, m0, 0xD8
132
+    pshuflw     m0, m0, 0xD8
133
+    pshufhw     m0, m0, 0xD8
134
+    psubw       m2, m3              ; m2 = [EO1 EO0]
135
+    pmullw      m2, [pw_ppppmmmm]
136
+    pshufd      m2, m2, 0xD8
137
+    pshuflw     m2, m2, 0xD8
138
+    pshufhw     m2, m2, 0xD8
139
+    pmaddwd     m3, m0, [r4 + 0*16]
140
+    paddd       m3, DCT_ADD1
141
+    psrad       m3, DCT_SHIFT1
142
+  %if x == 1
143
+    pshufd      m3, m3, 0x1B
144
+  %endif
145
+    mova        [r5 + 0*2*mmsize], m3 ; Row 0
146
+    pmaddwd     m0, [r4 + 2*16]
147
+    paddd       m0, DCT_ADD1
148
+    psrad       m0, DCT_SHIFT1
149
+  %if x == 1
150
+    pshufd      m0, m0, 0x1B
151
+  %endif
152
+    mova        [r5 + 4*2*mmsize], m0 ; Row 4
153
+    pmaddwd     m3, m2, [r4 + 1*16]
154
+    paddd       m3, DCT_ADD1
155
+    psrad       m3, DCT_SHIFT1
156
+  %if x == 1
157
+    pshufd      m3, m3, 0x1B
158
+  %endif
159
+    mova        [r5 + 2*2*mmsize], m3 ; Row 2
160
+    pmaddwd     m2, [r4 + 3*16]
161
+    paddd       m2, DCT_ADD1
162
+    psrad       m2, DCT_SHIFT1
163
+  %if x == 1
164
+    pshufd      m2, m2, 0x1B
165
+  %endif
166
+    mova        [r5 + 6*2*mmsize], m2 ; Row 6
167
+
168
+  %if x != 1
169
+    lea         r0, [r0 + r2 * 4]
170
+    add         r5, mmsize
171
+  %endif
172
+%assign x x+1
173
+%endrep
174
+
175
+    mov         r0, rsp                 ; r0 = pointer to Low Part
176
+    lea         r4, [tab_dct8_2]
177
+
178
+%assign x 0
179
+%rep 4
180
+    mova        m0, [r0 + 0*2*mmsize]     ; [3 2 1 0]
181
+    mova        m1, [r0 + 1*2*mmsize]
182
+    paddd       m2, m0, [r0 + (0*2+1)*mmsize]
183
+    pshufd      m2, m2, 0x9C            ; m2 = [s2 s1 s3 s0]
184
+    paddd       m3, m1, [r0 + (1*2+1)*mmsize]
185
+    pshufd      m3, m3, 0x9C            ; m3 = ^^
186
+    psubd       m0, [r0 + (0*2+1)*mmsize]     ; m0 = [d3 d2 d1 d0]
187
+    psubd       m1, [r0 + (1*2+1)*mmsize]     ; m1 = ^^
188
+
189
+    ; even
190
+    pshufd      m4, m2, 0xD8
191
+    pshufd      m3, m3, 0xD8
192
+    mova        m7, m4
193
+    punpckhqdq  m7, m3
194
+    punpcklqdq  m4, m3
195
+    mova        m2, m4
196
+    paddd       m4, m7                  ; m4 = [EE1 EE0 EE1 EE0]
197
+    psubd       m2, m7                  ; m2 = [EO1 EO0 EO1 EO0]
198
+
199
+    pslld       m4, 6                   ; m4 = [64*EE1 64*EE0]
200
+    mova        m5, m2
201
x265_1.5.tar.gz/source/common/x86/dct8.h -> x265_1.6.tar.gz/source/common/x86/dct8.h Changed
9
 
1
@@ -24,6 +24,7 @@
2
 #ifndef X265_DCT8_H
3
 #define X265_DCT8_H
4
 void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
5
+void x265_dct8_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
6
 void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t srcStride);
7
 void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t srcStride);
8
 void x265_dct4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
9
x265_1.5.tar.gz/source/common/x86/intrapred.h -> x265_1.6.tar.gz/source/common/x86/intrapred.h Changed
92
 
1
@@ -4,7 +4,7 @@
2
  * Copyright (C) 2003-2013 x264 project
3
  *
4
  * Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
5
- *
6
+ *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
7
  * This program is free software; you can redistribute it and/or modify
8
  * it under the terms of the GNU General Public License as published by
9
  * the Free Software Foundation; either version 2 of the License, or
10
@@ -26,11 +26,19 @@
11
 #ifndef X265_INTRAPRED_H
12
 #define X265_INTRAPRED_H
13
 
14
-void x265_intra_pred_dc4_sse4 (pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
15
+void x265_intra_pred_dc4_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
16
+void x265_intra_pred_dc8_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
17
+void x265_intra_pred_dc16_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
18
+void x265_intra_pred_dc32_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
19
+void x265_intra_pred_dc4_sse4(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
20
 void x265_intra_pred_dc8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
21
 void x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
22
 void x265_intra_pred_dc32_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
23
 
24
+void x265_intra_pred_planar4_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
25
+void x265_intra_pred_planar8_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
26
+void x265_intra_pred_planar16_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
27
+void x265_intra_pred_planar32_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
28
 void x265_intra_pred_planar4_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
29
 void x265_intra_pred_planar8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
30
 void x265_intra_pred_planar16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
31
@@ -39,6 +47,15 @@
32
 #define DECL_ANG(bsize, mode, cpu) \
33
     void x265_intra_pred_ang ## bsize ## _ ## mode ## _ ## cpu(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
34
 
35
+DECL_ANG(4, 2, sse2);
36
+DECL_ANG(4, 3, sse2);
37
+DECL_ANG(4, 4, sse2);
38
+DECL_ANG(4, 5, sse2);
39
+DECL_ANG(4, 6, sse2);
40
+DECL_ANG(4, 7, sse2);
41
+DECL_ANG(4, 8, sse2);
42
+DECL_ANG(4, 9, sse2);
43
+
44
 DECL_ANG(4, 2, ssse3);
45
 DECL_ANG(4, 3, sse4);
46
 DECL_ANG(4, 4, sse4);
47
@@ -157,6 +174,44 @@
48
 DECL_ANG(32, 33, sse4);
49
 
50
 #undef DECL_ANG
51
+void x265_intra_pred_ang8_3_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
52
+void x265_intra_pred_ang8_33_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
53
+void x265_intra_pred_ang8_4_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
54
+void x265_intra_pred_ang8_32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
55
+void x265_intra_pred_ang8_5_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
56
+void x265_intra_pred_ang8_31_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
57
+void x265_intra_pred_ang8_6_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
58
+void x265_intra_pred_ang8_30_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
59
+void x265_intra_pred_ang8_7_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
60
+void x265_intra_pred_ang8_29_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
61
+void x265_intra_pred_ang8_8_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
62
+void x265_intra_pred_ang8_28_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
63
+void x265_intra_pred_ang8_9_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
64
+void x265_intra_pred_ang8_27_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
65
+void x265_intra_pred_ang8_25_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
66
+void x265_intra_pred_ang8_12_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
67
+void x265_intra_pred_ang8_24_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
68
+void x265_intra_pred_ang8_11_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
69
+void x265_intra_pred_ang16_25_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
70
+void x265_intra_pred_ang16_28_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
71
+void x265_intra_pred_ang16_27_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
72
+void x265_intra_pred_ang16_29_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
73
+void x265_intra_pred_ang16_30_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
74
+void x265_intra_pred_ang16_31_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
75
+void x265_intra_pred_ang16_32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
76
+void x265_intra_pred_ang16_33_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
77
+void x265_intra_pred_ang16_24_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
78
+void x265_intra_pred_ang16_23_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
79
+void x265_intra_pred_ang16_22_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
80
+void x265_intra_pred_ang32_34_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
81
+void x265_intra_pred_ang32_2_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
82
+void x265_intra_pred_ang32_26_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
83
+void x265_intra_pred_ang32_27_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
84
+void x265_intra_pred_ang32_28_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
85
+void x265_intra_pred_ang32_29_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
86
+void x265_intra_pred_ang32_30_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
87
+void x265_intra_pred_ang32_31_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
88
+void x265_intra_pred_ang32_32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
89
 void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
90
 void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
91
 void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
92
x265_1.5.tar.gz/source/common/x86/intrapred16.asm -> x265_1.6.tar.gz/source/common/x86/intrapred16.asm Changed
201
 
1
@@ -65,6 +65,10 @@
2
 pw_planar16_1:        dw 15, 15, 15, 15, 15, 15, 15, 15
3
 pd_planar32_1:        dd 31, 31, 31, 31
4
 
5
+pw_planar32_1:        dw 31, 31, 31, 31, 31, 31, 31, 31
6
+pw_planar32_L:        dw 31, 30, 29, 28, 27, 26, 25, 24
7
+pw_planar32_H:        dw 23, 22, 21, 20, 19, 18, 17, 16
8
+
9
 const planar32_table
10
 %assign x 31
11
 %rep 8
12
@@ -82,15 +86,19 @@
13
 SECTION .text
14
 
15
 cextern pw_1
16
+cextern pw_2
17
 cextern pw_4
18
 cextern pw_8
19
 cextern pw_16
20
+cextern pw_32
21
 cextern pw_1023
22
 cextern pd_16
23
 cextern pd_32
24
 cextern pw_4096
25
 cextern multiL
26
 cextern multiH
27
+cextern multiH2
28
+cextern multiH3
29
 cextern multi_2Row
30
 cextern pw_swap
31
 cextern pb_unpackwq1
32
@@ -99,6 +107,592 @@
33
 ;-----------------------------------------------------------------------------------
34
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
35
 ;-----------------------------------------------------------------------------------
36
+INIT_XMM sse2
36
+cglobal intra_pred_dc4, 5,6,2                      ; 4x4 DC intra prediction; per the C prototype: r0 = dst, r1 = dstStride, r2 = srcPix, r4 = filter (r3, r5 are scratch)
37
+    movh        m0,             [r2 + 18]          ; 4 left neighbours (the "filter left" step below continues from r2 + 20)
38
+    movh        m1,             [r2 + 2]           ; 4 above neighbours (the "filter top" step below re-reads this row)
39
+
40
+    paddw       m0,             m1                 ; lane i = left[i] + above[i]
41
+    pshuflw     m1,             m0, 0x4E           ; swap the two dwords of the low qword
42
+    paddw       m0,             m1
43
+    pshuflw     m1,             m0, 0xB1           ; swap adjacent words
44
+    paddw       m0,             m1                 ; each of the 4 low words = sum of all 8 neighbours
45
+
46
+    test        r4d,            r4d                ; ZF = (filter == 0); consumed by the jz below - nothing in between writes EFLAGS
47
+
48
+    paddw       m0,             [pw_4]             ; rounding term (assumes pw_4 is a vector of word 4s - defined elsewhere)
49
+    psraw       m0,             3                  ; DC = (sum + 4) >> 3
50
+
51
+    ; store DC 4x4
52
+    movh        [r0],           m0                 ; row 0; rows are r1 * 2 bytes apart (presumably dstStride is in 16-bit pixels)
53
+    movh        [r0 + r1 * 2],  m0                 ; row 1
54
+    movh        [r0 + r1 * 4],  m0                 ; row 2
55
+    lea         r5,             [r0 + r1 * 4]      ; r5 = address of row 2, so [r5 + r1 * 2] is row 3
56
+    movh        [r5 + r1 * 2],  m0                 ; row 3
57
+
58
+    ; do DC filter
59
+    jz          .end                               ; filter == 0: the flat DC block stored above is the final result
60
+    movh        m1,             m0
61
+    psllw       m1,             1                  ; 2 * DC
62
+    paddw       m1,             [pw_2]             ; 2 * DC + 2 (assumes pw_2 is a vector of word 2s)
63
+    movd        r3d,            m1                 ; keep 2 * DC + 2 in a GPR for the corner pixel
64
+    paddw       m0,             m1                 ; m0 = 3 * DC + 2
65
+    ; filter top
66
+    movh        m1,             [r2 + 2]
67
+    paddw       m1,             m0
68
+    psraw       m1,             2                  ; (above[x] + 3 * DC + 2) >> 2
69
+    movh        [r0],           m1             ; overwrite top-left pixel, we will update it later
70
+
71
+    ; filter top-left
72
+    movzx       r3d,            r3w                ; isolate the low word: 2 * DC + 2
73
+    movzx       r4d, word       [r2 + 18]          ; left[0]
74
+    add         r3d,            r4d
75
+    movzx       r4d, word       [r2 + 2]           ; above[0]
76
+    add         r4d,            r3d
77
+    shr         r4d,            2                  ; (above[0] + left[0] + 2 * DC + 2) >> 2
78
+    mov         [r0],           r4w
79
+
80
+    ; filter left
81
+    movu        m1,             [r2 + 20]          ; left[1..]; only the first three words are stored below
82
+    paddw       m1,             m0
83
+    psraw       m1,             2                  ; (left[y] + 3 * DC + 2) >> 2
84
+    movd        r3d,            m1                 ; r3d = results for rows 1 (low word) and 2 (high word)
85
+    mov         [r0 + r1 * 2],  r3w                ; first pixel of row 1
86
+    shr         r3d,            16
87
+    mov         [r0 + r1 * 4],  r3w                ; first pixel of row 2
88
+    pextrw      r3d,            m1, 2
89
+    mov         [r5 + r1 * 2],  r3w                ; first pixel of row 3
90
+.end:
91
+    RET
93
+
94
+;-----------------------------------------------------------------------------------
95
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
96
+;-----------------------------------------------------------------------------------
97
+INIT_XMM sse2
97
+cglobal intra_pred_dc8, 5, 8, 2                    ; 8x8 DC intra prediction; per the C prototype: r0 = dst, r1 = dstStride, r2 = srcPix, r4 = filter (r3, r5-r7 are scratch)
98
+    movu            m0,            [r2 + 34]       ; 8 left neighbours (the "filter left" step below continues from r2 + 36)
99
+    movu            m1,            [r2 + 2]        ; 8 above neighbours (the "filter top" step below re-reads this row)
100
+
101
+    paddw           m0,            m1              ; lane i = left[i] + above[i]
102
+    movhlps         m1,            m0              ; high qword of m0 -> low qword of m1
103
+    paddw           m0,            m1              ; 4 partial sums in the low qword
104
+    pshufd          m1,            m0, 1           ; dword 1 -> dword 0
105
+    paddw           m0,            m1              ; 2 partial sums in the low dword
106
+    pmaddwd         m0,            [pw_1]          ; low dword = sum of all 16 neighbours (assumes pw_1 is a vector of word 1s)
107
+
108
+    paddw           m0,            [pw_8]          ; rounding term (assumes pw_8 is a vector of word 8s)
109
+    psraw           m0,            4              ; DC = (sum + 8) >> 4
110
+    pshuflw         m0,            m0, 0
111
+    pshufd          m0,            m0, 0          ; m0 = word [dc_val ...]
112
+
113
+    test            r4d,           r4d             ; ZF = (filter == 0); consumed by the jz below - nothing in between writes EFLAGS
114
+
115
+    ; store DC 8x8
116
+    lea             r6,            [r1 + r1 * 4]   ; r6 = 5 * r1
117
+    lea             r6,            [r6 + r1]       ; r6 = 6 * r1  (byte offset of row 3; rows are r1 * 2 bytes apart)
118
+    lea             r5,            [r6 + r1 * 4]   ; r5 = 10 * r1 (row 5)
119
+    lea             r7,            [r6 + r1 * 8]   ; r7 = 14 * r1 (row 7)
120
+    movu            [r0],          m0              ; row 0
121
+    movu            [r0 + r1 * 2], m0              ; row 1
122
+    movu            [r0 + r1 * 4], m0              ; row 2
123
+    movu            [r0 + r6],     m0              ; row 3
124
+    movu            [r0 + r1 * 8], m0              ; row 4
125
+    movu            [r0 + r5],     m0              ; row 5
126
+    movu            [r0 + r6 * 2], m0              ; row 6
127
+    movu            [r0 + r7],     m0              ; row 7
128
+
129
+    ; Do DC Filter
130
+    jz              .end                           ; filter == 0: the flat DC block stored above is the final result
131
+    mova            m1,            [pw_2]
132
+    pmullw          m1,            m0              ; DC * 2 in every lane
133
+    paddw           m1,            [pw_2]
134
+    movd            r4d,           m1             ; r4d = DC * 2 + 2
135
+    paddw           m1,            m0             ; m1 = DC * 3 + 2
136
+    pshuflw         m1,            m1, 0
137
+    pshufd          m1,            m1, 0          ; m1 = pixDCx3
138
+
139
+    ; filter top
140
+    movu            m0,            [r2 + 2]
141
+    paddw           m0,            m1
142
+    psraw           m0,            2               ; (above[x] + 3 * DC + 2) >> 2
143
+    movu            [r0],          m0              ; corner pixel is rewritten by the top-left step below
144
+
145
+    ; filter top-left
146
+    movzx           r4d,           r4w             ; isolate the low word: 2 * DC + 2
147
+    movzx           r3d, word      [r2 + 34]       ; left[0]
148
+    add             r4d,           r3d
149
+    movzx           r3d, word      [r2 + 2]        ; above[0]
150
+    add             r3d,           r4d
151
+    shr             r3d,           2               ; (above[0] + left[0] + 2 * DC + 2) >> 2
152
+    mov             [r0],          r3w
153
+
154
+    ; filter left
155
+    movu            m0,            [r2 + 36]       ; left[1..]; only the first seven words are stored below
156
+    paddw           m0,            m1
157
+    psraw           m0,            2               ; (left[y] + 3 * DC + 2) >> 2
158
+    movh            r3,            m0              ; r3 = results for rows 1-4; NOTE(review): 64-bit GPR move and shifts look x86-64 only - confirm build guard
159
+    mov             [r0 + r1 * 2], r3w             ; first pixel of row 1
160
+    shr             r3,            16
161
+    mov             [r0 + r1 * 4], r3w             ; first pixel of row 2
162
+    shr             r3,            16
163
+    mov             [r0 + r6],     r3w             ; first pixel of row 3
164
+    shr             r3,            16
165
+    mov             [r0 + r1 * 8], r3w             ; first pixel of row 4
166
+    pshufd          m0,            m0, 0x6E        ; bring dwords 2 and 3 (results for rows 5+) down to the low qword
167
+    movh            r3,            m0
168
+    mov             [r0 + r5],     r3w             ; first pixel of row 5
169
+    shr             r3,            16
170
+    mov             [r0 + r6 * 2], r3w             ; first pixel of row 6
171
+    shr             r3,            16
172
+    mov             [r0 + r7],     r3w             ; first pixel of row 7
173
+.end:
174
+    RET
176
+
177
+;-------------------------------------------------------------------------------------------------------
178
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter)
179
+;-------------------------------------------------------------------------------------------------------
180
+INIT_XMM sse2
181
+cglobal intra_pred_dc16, 5, 10, 4
182
+    lea             r3,                  [r2 + 66]
183
+    add             r1,                  r1
184
+    movu            m0,                  [r3]
185
+    movu            m1,                  [r3 + 16]
186
+    movu            m2,                  [r2 + 2]
187
+    movu            m3,                  [r2 + 18]
188
+
189
+    paddw           m0,                  m1
190
+    paddw           m2,                  m3
191
+    paddw           m0,                  m2
192
+    movhlps         m1,                  m0
193
+    paddw           m0,                  m1
194
+    pshuflw         m1,                  m0, 0x6E
195
+    paddw           m0,                  m1
196
+    pmaddwd         m0,                  [pw_1]
197
+
198
+    paddw           m0,                  [pw_16]
199
+    psraw           m0,                  5
200
+    movd            r5d,                 m0
201
x265_1.5.tar.gz/source/common/x86/intrapred8.asm -> x265_1.6.tar.gz/source/common/x86/intrapred8.asm Changed
201
 
1
@@ -2,6 +2,7 @@
2
 ;* Copyright (C) 2013 x265 project
3
 ;*
4
 ;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
5
+;*          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
6
 ;*
7
 ;* This program is free software; you can redistribute it and/or modify
8
 ;* it under the terms of the GNU General Public License as published by
9
@@ -26,11 +27,15 @@
10
 
11
 SECTION_RODATA 32
12
 
13
+intra_pred_shuff_0_8:    times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
14
+
15
 pb_0_8        times 8 db  0,  8
16
 pb_unpackbw1  times 2 db  1,  8,  2,  8,  3,  8,  4,  8
17
 pb_swap8:     times 2 db  7,  6,  5,  4,  3,  2,  1,  0
18
 c_trans_4x4           db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
19
-tab_Si:               db  0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7
20
+const tab_S1,         db 15, 14, 12, 11, 10,  9,  7,  6,  5,  4,  2,  1,  0,  0,  0,  0
21
+const tab_S2,         db 0, 1, 3, 5, 7, 9, 11, 13, 0, 0, 0, 0, 0, 0, 0, 0
22
+const tab_Si,         db  0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7
23
 pb_fact0:             db  0,  2,  4,  6,  8, 10, 12, 14,  0,  0,  0,  0,  0,  0,  0,  0
24
 c_mode32_12_0:        db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 13,  7,  0
25
 c_mode32_13_0:        db  3,  6, 10, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
26
@@ -43,7 +48,6 @@
27
 c_mode32_18_0:        db 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0
28
 c_shuf8_0:            db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
29
 c_deinterval8:        db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
30
-tab_S1:               db 15, 14, 12, 11, 10,  9,  7,  6,  5,  4,  2,  1,  0,  0,  0,  0
31
 pb_unpackbq:          db  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1
32
 c_mode16_12:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
33
 c_mode16_13:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
34
@@ -52,8 +56,327 @@
35
 c_mode16_16:          db  8,  6,  5,  3,  2,  0, 15, 14, 12, 11,  9,  8,  6,  5,  3,  2
36
 c_mode16_17:          db  4,  2,  1,  0, 15, 14, 12, 11, 10,  9,  7,  6,  5,  4,  2,  1
37
 c_mode16_18:    db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
38
-tab_S2:         db 0, 1, 3, 5, 7, 9, 11, 13, 0, 0, 0, 0, 0, 0, 0, 0
39
 
40
+ALIGN 32
41
+trans8_shuf:          dd 0, 4, 1, 5, 2, 6, 3, 7
42
+c_ang8_src1_9_2_10:   db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
43
+c_ang8_26_20:         db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
44
+c_ang8_src3_11_4_12:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
45
+c_ang8_14_8:          db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
46
+c_ang8_src5_13_5_13:  db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
47
+c_ang8_2_28:          db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
48
+c_ang8_src6_14_7_15:  db 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
49
+c_ang8_22_16:         db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
50
+
51
+c_ang8_21_10       :  db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
52
+c_ang8_src2_10_3_11:  db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
53
+c_ang8_31_20:         db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
54
+c_ang8_src4_12_4_12:  times 2 db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
55
+c_ang8_9_30:          db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
56
+c_ang8_src5_13_6_14:  db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13
57
+c_ang8_19_8:          db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
58
+
59
+c_ang8_17_2:          db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
60
+c_ang8_19_4:          db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
61
+c_ang8_21_6:          db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
62
+c_ang8_23_8:          db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8,
63
+c_ang8_src4_12_5_13:  db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
64
+
65
+c_ang8_13_26:         db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
66
+c_ang8_7_20:          db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
67
+c_ang8_1_14:          db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
68
+c_ang8_27_8:          db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
69
+c_ang8_src2_10_2_10:  db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
70
+c_ang8_src3_11_3_11:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
71
+
72
+c_ang8_31_8:          db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
73
+c_ang8_13_22:         db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
74
+c_ang8_27_4:          db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
75
+c_ang8_9_18:          db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
76
+
77
+c_ang8_5_10:          db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
78
+c_ang8_15_20:         db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
79
+c_ang8_25_30:         db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
80
+c_ang8_3_8:           db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
81
+
82
+c_ang8_mode_27:       db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
83
+                      db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
84
+                      db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
85
+                      db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
86
+
87
+c_ang8_mode_25:       db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
88
+                      db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
89
+                      db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
90
+                      db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
91
+
92
+c_ang8_mode_24:       db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
93
+                      db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
94
+                      db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
95
+                      db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
96
+
97
+ALIGN 32
98
+c_ang16_mode_25:      db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
99
+                      db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
100
+                      db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
101
+                      db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
102
+                      db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
103
+                      db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
104
+                      db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
105
+                      db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
106
+
107
+
108
+ALIGN 32
109
+c_ang16_mode_28:      db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
110
+                      db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
111
+                      db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
112
+                      db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
113
+                      db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
114
+                      db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
115
+                      db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
116
+                      db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
117
+
118
+
119
+ALIGN 32
120
+c_ang16_mode_27:      db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
121
+                      db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
122
+                      db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
123
+                      db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
124
+                      db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
125
+                      db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
126
+                      db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
127
+                      db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
128
+                      db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
129
+
130
+ALIGN 32
131
+intra_pred_shuff_0_15: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 15
132
+
133
+
134
+ALIGN 32
135
+c_ang16_mode_29:     db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9,  14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
136
+                     db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
137
+                     db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
138
+                     db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
139
+                     db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
140
+                     db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
141
+                     db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
142
+                     db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
143
+                     db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
144
+
145
+
146
+ALIGN 32
147
+c_ang16_mode_30:      db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
148
+                      db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
149
+                      db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
150
+                      db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
151
+                      db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
152
+                      db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
153
+                      db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
154
+                      db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
155
+                      db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
156
+
157
+
158
+ALIGN 32
159
+c_ang16_mode_31:      db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
160
+                      db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
161
+                      db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
162
+                      db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6,  9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
163
+                      db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8,  7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
164
+                      db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
165
+                      db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
166
+                      db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
167
+                      db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
168
+
169
+ALIGN 32
170
+c_ang16_mode_32:      db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
171
+                      db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
172
+                      db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
173
+                      db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
174
+                      db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
175
+                      db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
176
+                      db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
177
+                      db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
178
+                      db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
179
+                      db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
180
+                      db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
181
+
182
+ALIGN 32
183
+c_ang16_mode_33:     db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
184
+                     db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
185
+                     db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
186
+                     db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
187
+                     db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
188
+                     db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
189
+                     db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
190
+                     db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
191
+                     db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
192
+                     db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
193
+                     db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
194
+                     db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
195
+                     db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
196
+                     db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
197
+
198
+ALIGN 32
199
+c_ang16_mode_24:     db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
200
+                     db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
201
x265_1.6.tar.gz/source/common/x86/intrapred8_allangs.asm Added
201
 
1
@@ -0,0 +1,23008 @@
2
+;*****************************************************************************
3
+;* Copyright (C) 2013 x265 project
4
+;*
5
+;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
6
+;*          Praveen Tiwari <praveen@multicorewareinc.com>
7
+;*
8
+;* This program is free software; you can redistribute it and/or modify
9
+;* it under the terms of the GNU General Public License as published by
10
+;* the Free Software Foundation; either version 2 of the License, or
11
+;* (at your option) any later version.
12
+;*
13
+;* This program is distributed in the hope that it will be useful,
14
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
+;* GNU General Public License for more details.
17
+;*
18
+;* You should have received a copy of the GNU General Public License
19
+;* along with this program; if not, write to the Free Software
20
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21
+;*
22
+;* This program is also available under a commercial proprietary license.
23
+;* For more information, contact us at license @ x265.com.
24
+;*****************************************************************************/
25
+
26
+%include "x86inc.asm"
27
+%include "x86util.asm"
28
+
29
+SECTION_RODATA 32
30
+
31
+SECTION .text
32
+
33
+; global constant
34
+cextern pw_1024
35
+
36
+; common constant with intrapred8.asm
37
+cextern ang_table
38
+cextern tab_S1
39
+cextern tab_S2
40
+cextern tab_Si
41
+
42
+
43
+;-----------------------------------------------------------------------------
44
+; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
45
+;-----------------------------------------------------------------------------
46
+INIT_XMM sse4
47
+cglobal all_angs_pred_4x4, 4, 4, 8
48
+
49
+; mode 2
50
+
51
+movh      m0,         [r1 + 10]
52
+movd      [r0],       m0
53
+
54
+palignr   m1,         m0,      1
55
+movd      [r0 + 4],   m1
56
+
57
+palignr   m1,         m0,      2
58
+movd      [r0 + 8],   m1
59
+
60
+palignr   m1,         m0,      3
61
+movd      [r0 + 12],  m1
62
+
63
+; mode 3
64
+
65
+mova          m2,        [pw_1024]
66
+
67
+pslldq        m1,        m0,         1
68
+pinsrb        m1,        [r1 + 9],   0
69
+punpcklbw     m1,        m0
70
+
71
+lea           r3,        [ang_table]
72
+
73
+pmaddubsw     m6,        m1,        [r3 + 26 * 16]
74
+pmulhrsw      m6,        m2
75
+packuswb      m6,        m6
76
+movd          [r0 + 16], m6
77
+
78
+palignr       m0,        m1,        2
79
+
80
+mova          m7,        [r3 + 20 * 16]
81
+
82
+pmaddubsw     m3,        m0,        m7
83
+pmulhrsw      m3,        m2
84
+packuswb      m3,        m3
85
+movd          [r0 + 20], m3
86
+
87
+; mode 6 [row 3]
88
+movd          [r0 + 76], m3
89
+
90
+palignr       m3,        m1,       4
91
+
92
+pmaddubsw     m4,        m3,        [r3 + 14 * 16]
93
+pmulhrsw      m4,        m2
94
+packuswb      m4,        m4
95
+movd          [r0 + 24], m4
96
+
97
+palignr       m4,        m1,        6
98
+
99
+pmaddubsw     m4,        [r3 + 8 * 16]
100
+pmulhrsw      m4,        m2
101
+packuswb      m4,        m4
102
+movd          [r0 + 28], m4
103
+
104
+; mode 4
105
+
106
+pmaddubsw     m5,        m1,        [r3 + 21 * 16]
107
+pmulhrsw      m5,        m2
108
+packuswb      m5,        m5
109
+movd          [r0 + 32], m5
110
+
111
+pmaddubsw     m5,        m0,        [r3 + 10 * 16]
112
+pmulhrsw      m5,        m2
113
+packuswb      m5,        m5
114
+movd          [r0 + 36], m5
115
+
116
+pmaddubsw     m5,        m0,        [r3 + 31 * 16]
117
+pmulhrsw      m5,        m2
118
+packuswb      m5,        m5
119
+movd          [r0 + 40], m5
120
+
121
+pmaddubsw     m4,        m3,        m7
122
+pmulhrsw      m4,        m2
123
+packuswb      m4,        m4
124
+movd          [r0 + 44], m4
125
+
126
+; mode 5
127
+
128
+pmaddubsw     m5,        m1,        [r3 + 17 * 16]
129
+pmulhrsw      m5,        m2
130
+packuswb      m5,        m5
131
+movd          [r0 + 48], m5
132
+
133
+pmaddubsw     m5,        m0,        [r3 + 2 * 16]
134
+pmulhrsw      m5,        m2
135
+packuswb      m5,        m5
136
+movd          [r0 + 52], m5
137
+
138
+pmaddubsw     m5,        m0,        [r3 + 19 * 16]
139
+pmulhrsw      m5,        m2
140
+packuswb      m5,        m5
141
+movd          [r0 + 56], m5
142
+
143
+pmaddubsw     m4,        m3,        [r3 + 4 * 16]
144
+pmulhrsw      m4,        m2
145
+packuswb      m4,        m4
146
+movd          [r0 + 60], m4
147
+
148
+; mode 6
149
+
150
+pmaddubsw     m5,        m1,        [r3 + 13 * 16]
151
+pmulhrsw      m5,        m2
152
+packuswb      m5,        m5
153
+movd          [r0 + 64], m5
154
+
155
+movd          [r0 + 68], m6
156
+
157
+pmaddubsw     m5,        m0,        [r3 + 7 * 16]
158
+pmulhrsw      m5,        m2
159
+packuswb      m5,        m5
160
+movd          [r0 + 72], m5
161
+
162
+; mode 7
163
+
164
+pmaddubsw     m5,        m1,        [r3 + 9 * 16]
165
+pmulhrsw      m5,        m2
166
+packuswb      m5,        m5
167
+movd          [r0 + 80], m5
168
+
169
+pmaddubsw     m5,        m1,        [r3 + 18 * 16]
170
+pmulhrsw      m5,        m2
171
+packuswb      m5,        m5
172
+movd          [r0 + 84], m5
173
+
174
+pmaddubsw     m5,        m1,        [r3 + 27 * 16]
175
+pmulhrsw      m5,        m2
176
+packuswb      m5,        m5
177
+movd          [r0 + 88], m5
178
+
179
+pmaddubsw     m5,        m0,        [r3 + 4 * 16]
180
+pmulhrsw      m5,        m2
181
+packuswb      m5,        m5
182
+movd          [r0 + 92], m5
183
+
184
+; mode 8
185
+
186
+pmaddubsw     m5,        m1,        [r3 + 5 * 16]
187
+pmulhrsw      m5,        m2
188
+packuswb      m5,        m5
189
+movd          [r0 + 96], m5
190
+
191
+pmaddubsw     m5,         m1,       [r3 + 10 * 16]
192
+pmulhrsw      m5,         m2
193
+packuswb      m5,         m5
194
+movd          [r0 + 100], m5
195
+
196
+pmaddubsw     m5,         m1,        [r3 + 15 * 16]
197
+pmulhrsw      m5,         m2
198
+packuswb      m5,         m5
199
+movd          [r0 + 104], m5
200
+
201
x265_1.5.tar.gz/source/common/x86/ipfilter16.asm -> x265_1.6.tar.gz/source/common/x86/ipfilter16.asm Changed
201
 
1
@@ -31,6 +31,7 @@
2
 tab_c_n32768:     times 4 dd -32768
3
 tab_c_524800:     times 4 dd 524800
4
 tab_c_n8192:      times 8 dw -8192
5
+pd_524800:        times 8 dd 524800
6
 
7
 tab_Tm16:         db 0, 1, 2, 3, 4,  5,  6, 7, 2, 3, 4,  5, 6, 7, 8, 9
8
 
9
@@ -91,9 +92,28 @@
10
                   times 4 dw -5, 17
11
                   times 4 dw 58, -10
12
                   times 4 dw 4, -1
13
+ALIGN 32                                           ; table rows below are 32 bytes each (8 x two words)
14
+tab_LumaCoeffVer: times 8 dw 0, 0                  ; filter index 0: tap pair (0,1) replicated 8 times
15
+                  times 8 dw 0, 64                 ; filter index 0: tap pair (2,3)
16
+                  times 8 dw 0, 0                  ; filter index 0: tap pair (4,5)
17
+                  times 8 dw 0, 0                  ; filter index 0: tap pair (6,7)
18
+
19
+                  times 8 dw -1, 4                 ; filter index 1 (4 rows = 128 bytes per index, matching the "shl r4d, 7" in interp_8tap_vert_*)
20
+                  times 8 dw -10, 58
21
+                  times 8 dw 17, -5
22
+                  times 8 dw 1, 0
23
+
24
+                  times 8 dw -1, 4                 ; filter index 2
25
+                  times 8 dw -11, 40
26
+                  times 8 dw 40, -11
27
+                  times 8 dw 4, -1
28
+
29
+                  times 8 dw 0, 1                  ; filter index 3
30
+                  times 8 dw -5, 17
31
+                  times 8 dw 58, -10
32
+                  times 8 dw 4, -1
33
 
34
 SECTION .text
35
-
36
 cextern pd_32
37
 cextern pw_pixel_max
38
 cextern pd_n32768
39
@@ -2562,6 +2582,2681 @@
40
     FILTER_VER_LUMA_PP 64, 16
41
     FILTER_VER_LUMA_PP 16, 64
42
 
43
+%macro FILTER_VER_LUMA_AVX2_4x4 1                  ; %1 = variant suffix (instantiated with pp, ps, sp, ss); emits interp_8tap_vert_%1_4x4
43
+INIT_YMM avx2
44
+cglobal interp_8tap_vert_%1_4x4, 4, 6, 7
45
+    mov             r4d, r4m                        ; r4 = 5th argument, used as the coefficient-set index
46
+    add             r1d, r1d                        ; r1 *= 2; r1 is the source row step used below (presumably element stride -> bytes; confirm)
47
+    add             r3d, r3d                        ; r3 *= 2; r3 is the destination row step used below
48
+    shl             r4d, 7                          ; index * 128 = 4 coefficient rows of 32 bytes
49
+
50
+%ifdef PIC
51
+    lea             r5, [tab_LumaCoeffVer]
52
+    add             r5, r4
53
+%else
54
+    lea             r5, [tab_LumaCoeffVer + r4]     ; r5 -> first coefficient row of the selected set
55
+%endif
56
+
57
+    lea             r4, [r1 * 3]                    ; r4 = 3 source rows
58
+    sub             r0, r4                          ; start reading 3 rows above the pointer passed in r0
59
+
60
+%ifidn %1,pp
61
+    vbroadcasti128  m6, [pd_32]                     ; m6 = constant added before the final shift
62
+%elifidn %1, sp
63
+    mova            m6, [pd_524800]
64
+%else
65
+    vbroadcasti128  m6, [pd_n32768]                 ; taken for both ps and ss; the ss path below never reads m6
66
+%endif
67
+
68
+    movq            xm0, [r0]
69
+    movq            xm1, [r0 + r1]
70
+    punpcklwd       xm0, xm1
71
+    movq            xm2, [r0 + r1 * 2]
72
+    punpcklwd       xm1, xm2
73
+    vinserti128     m0, m0, xm1, 1                  ; m0 = [2 1 1 0]
74
+    pmaddwd         m0, [r5]                        ; m0 accumulates output rows 0 (low lane) and 1 (high lane)
75
+    movq            xm3, [r0 + r4]
76
+    punpcklwd       xm2, xm3
77
+    lea             r0, [r0 + 4 * r1]
78
+    movq            xm4, [r0]
79
+    punpcklwd       xm3, xm4
80
+    vinserti128     m2, m2, xm3, 1                  ; m2 = [4 3 3 2]
81
+    pmaddwd         m5, m2, [r5 + 1 * mmsize]
82
+    pmaddwd         m2, [r5]                        ; m2 accumulates output rows 2 (low lane) and 3 (high lane)
83
+    paddd           m0, m5
84
+    movq            xm3, [r0 + r1]
85
+    punpcklwd       xm4, xm3
86
+    movq            xm1, [r0 + r1 * 2]
87
+    punpcklwd       xm3, xm1
88
+    vinserti128     m4, m4, xm3, 1                  ; m4 = [6 5 5 4]
89
+    pmaddwd         m5, m4, [r5 + 2 * mmsize]
90
+    pmaddwd         m4, [r5 + 1 * mmsize]
91
+    paddd           m0, m5
92
+    paddd           m2, m4
93
+    movq            xm3, [r0 + r4]
94
+    punpcklwd       xm1, xm3
95
+    lea             r0, [r0 + 4 * r1]
96
+    movq            xm4, [r0]
97
+    punpcklwd       xm3, xm4
98
+    vinserti128     m1, m1, xm3, 1                  ; m1 = [8 7 7 6]
99
+    pmaddwd         m5, m1, [r5 + 3 * mmsize]
100
+    pmaddwd         m1, [r5 + 2 * mmsize]
101
+    paddd           m0, m5                          ; rows 0/1 now have all four coefficient rows applied
102
+    paddd           m2, m1
103
+    movq            xm3, [r0 + r1]
104
+    punpcklwd       xm4, xm3
105
+    movq            xm1, [r0 + 2 * r1]
106
+    punpcklwd       xm3, xm1
107
+    vinserti128     m4, m4, xm3, 1                  ; m4 = [A 9 9 8]
108
+    pmaddwd         m4, [r5 + 3 * mmsize]
109
+    paddd           m2, m4                          ; rows 2/3 now have all four coefficient rows applied
110
+
111
+%ifidn %1,ss
112
+    psrad           m0, 6                           ; ss: shift only, no constant added
113
+    psrad           m2, 6
114
+%else
115
+    paddd           m0, m6                          ; pp/sp/ps: add the per-variant constant, then shift
116
+    paddd           m2, m6
117
+%ifidn %1,pp
118
+    psrad           m0, 6
119
+    psrad           m2, 6
120
+%elifidn %1, sp
121
+    psrad           m0, 10
122
+    psrad           m2, 10
123
+%else
124
+    psrad           m0, 2
125
+    psrad           m2, 2
126
+%endif
127
+%endif
128
+
129
+    packssdw        m0, m2                          ; per lane: low qword from m0, high qword from m2 -> low lane = rows 0,2; high lane = rows 1,3
130
+    pxor            m1, m1                          ; m1 = 0, passed to CLIPW below
131
+%ifidn %1,pp
132
+    CLIPW           m0, m1, [pw_pixel_max]          ; only pp and sp are clipped (CLIPW is defined outside this view; presumably clamps words to [m1, pw_pixel_max])
133
+%elifidn %1, sp
134
+    CLIPW           m0, m1, [pw_pixel_max]
135
+%endif
136
+
137
+    vextracti128    xm2, m0, 1                      ; xm2 = high lane (rows 1 and 3)
138
+    lea             r4, [r3 * 3]
139
+    movq            [r2], xm0                       ; row 0
140
+    movq            [r2 + r3], xm2                  ; row 1
141
+    movhps          [r2 + r3 * 2], xm0              ; row 2
142
+    movhps          [r2 + r4], xm2                  ; row 3
143
+    RET
144
+%endmacro
146
+
147
+FILTER_VER_LUMA_AVX2_4x4 pp
148
+FILTER_VER_LUMA_AVX2_4x4 ps
149
+FILTER_VER_LUMA_AVX2_4x4 sp
150
+FILTER_VER_LUMA_AVX2_4x4 ss
151
+
152
+%macro FILTER_VER_LUMA_AVX2_8x8 1
153
+INIT_YMM avx2
154
+%if ARCH_X86_64 == 1
155
+cglobal interp_8tap_vert_%1_8x8, 4, 6, 12
156
+    mov             r4d, r4m
157
+    add             r1d, r1d
158
+    add             r3d, r3d
159
+    shl             r4d, 7
160
+
161
+%ifdef PIC
162
+    lea             r5, [tab_LumaCoeffVer]
163
+    add             r5, r4
164
+%else
165
+    lea             r5, [tab_LumaCoeffVer + r4]
166
+%endif
167
+
168
+    lea             r4, [r1 * 3]
169
+    sub             r0, r4
170
+
171
+%ifidn %1,pp
172
+    vbroadcasti128  m11, [pd_32]
173
+%elifidn %1, sp
174
+    mova            m11, [pd_524800]
175
+%else
176
+    vbroadcasti128  m11, [pd_n32768]
177
+%endif
178
+
179
+    movu            xm0, [r0]                       ; m0 = row 0
180
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
181
+    punpckhwd       xm2, xm0, xm1
182
+    punpcklwd       xm0, xm1
183
+    vinserti128     m0, m0, xm2, 1
184
+    pmaddwd         m0, [r5]
185
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
186
+    punpckhwd       xm3, xm1, xm2
187
+    punpcklwd       xm1, xm2
188
+    vinserti128     m1, m1, xm3, 1
189
+    pmaddwd         m1, [r5]
190
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
191
+    punpckhwd       xm4, xm2, xm3
192
+    punpcklwd       xm2, xm3
193
+    vinserti128     m2, m2, xm4, 1
194
+    pmaddwd         m4, m2, [r5 + 1 * mmsize]
195
+    pmaddwd         m2, [r5]
196
+    paddd           m0, m4
197
+    lea             r0, [r0 + r1 * 4]
198
+    movu            xm4, [r0]                       ; m4 = row 4
199
+    punpckhwd       xm5, xm3, xm4
200
+    punpcklwd       xm3, xm4
201
x265_1.5.tar.gz/source/common/x86/ipfilter8.asm -> x265_1.6.tar.gz/source/common/x86/ipfilter8.asm Changed
201
 
1
@@ -35,10 +35,20 @@
2
 const interp4_vpp_shuf, times 2 db 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15
3
 
4
 ALIGN 32
5
+const interp_vert_shuf, times 2 db 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9
6
+                        times 2 db 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13
7
+
8
+ALIGN 32
9
 const interp4_vpp_shuf1, dd 0, 1, 1, 2, 2, 3, 3, 4
10
                          dd 2, 3, 3, 4, 4, 5, 5, 6
11
 
12
 ALIGN 32
13
+const pb_8tap_hps_0, times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
14
+                     times 2 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9,10
15
+                     times 2 db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9,10,10,11,11,12
16
+                     times 2 db 6, 7, 7, 8, 8, 9, 9,10,10,11,11,12,12,13,13,14
17
+
18
+ALIGN 32
19
 tab_Lm:    db 0, 1, 2, 3, 4,  5,  6,  7,  1, 2, 3, 4,  5,  6,  7,  8
20
            db 2, 3, 4, 5, 6,  7,  8,  9,  3, 4, 5, 6,  7,  8,  9,  10
21
            db 4, 5, 6, 7, 8,  9,  10, 11, 5, 6, 7, 8,  9,  10, 11, 12
22
@@ -51,6 +61,8 @@
23
 
24
 tab_c_526336:   times 4 dd 8192*64+2048
25
 
26
+pd_526336:      times 8 dd 8192*64+2048
27
+
28
 tab_ChromaCoeff: db  0, 64,  0,  0
29
                  db -2, 58, 10, -2
30
                  db -4, 54, 16, -2
31
@@ -59,6 +71,30 @@
32
                  db -4, 28, 46, -6
33
                  db -2, 16, 54, -4
34
                  db -2, 10, 58, -2
35
+ALIGN 32
36
+tab_ChromaCoeff_V: times 8 db 0, 64                ; filter index 0, first byte pair replicated 8 times (16 bytes per row)
37
+                   times 8 db 0,  0                ; filter index 0, second byte pair; two rows = 32 bytes per index
38
+
39
+                   times 8 db -2, 58               ; filter index 1
40
+                   times 8 db 10, -2
41
+
42
+                   times 8 db -4, 54               ; filter index 2
43
+                   times 8 db 16, -2
44
+
45
+                   times 8 db -6, 46               ; filter index 3
46
+                   times 8 db 28, -4
47
+
48
+                   times 8 db -4, 36               ; filter index 4
49
+                   times 8 db 36, -4
50
+
51
+                   times 8 db -4, 28               ; filter index 5
52
+                   times 8 db 46, -6
53
+
54
+                   times 8 db -2, 16               ; filter index 6
55
+                   times 8 db 54, -4
56
+
57
+                   times 8 db -2, 10               ; filter index 7
58
+                   times 8 db 58, -2
59
 
60
 tab_ChromaCoeffV: times 4 dw 0, 64
61
                   times 4 dw 0, 0
62
@@ -84,6 +120,31 @@
63
                   times 4 dw -2, 10
64
                   times 4 dw 58, -2
65
 
66
+ALIGN 32
67
+pw_ChromaCoeffV:  times 8 dw 0, 64
68
+                  times 8 dw 0, 0
69
+
70
+                  times 8 dw -2, 58
71
+                  times 8 dw 10, -2
72
+
73
+                  times 8 dw -4, 54
74
+                  times 8 dw 16, -2
75
+
76
+                  times 8 dw -6, 46 
77
+                  times 8 dw 28, -4
78
+
79
+                  times 8 dw -4, 36
80
+                  times 8 dw 36, -4
81
+
82
+                  times 8 dw -4, 28
83
+                  times 8 dw 46, -6
84
+
85
+                  times 8 dw -2, 16
86
+                  times 8 dw 54, -4
87
+
88
+                  times 8 dw -2, 10
89
+                  times 8 dw 58, -2
90
+
91
 tab_LumaCoeff:   db   0, 0,  0,  64,  0,   0,  0,  0
92
                  db  -1, 4, -10, 58,  17, -5,  1,  0
93
                  db  -1, 4, -11, 40,  40, -11, 4, -1
94
@@ -109,6 +170,47 @@
95
                 times 4 dw 58, -10
96
                 times 4 dw 4, -1
97
 
98
+ALIGN 32
99
+pw_LumaCoeffVer: times 8 dw 0, 0
100
+                 times 8 dw 0, 64
101
+                 times 8 dw 0, 0
102
+                 times 8 dw 0, 0
103
+
104
+                 times 8 dw -1, 4
105
+                 times 8 dw -10, 58
106
+                 times 8 dw 17, -5
107
+                 times 8 dw 1, 0
108
+
109
+                 times 8 dw -1, 4
110
+                 times 8 dw -11, 40
111
+                 times 8 dw 40, -11
112
+                 times 8 dw 4, -1
113
+
114
+                 times 8 dw 0, 1
115
+                 times 8 dw -5, 17
116
+                 times 8 dw 58, -10
117
+                 times 8 dw 4, -1
118
+
119
+pb_LumaCoeffVer: times 16 db 0, 0
120
+                 times 16 db 0, 64
121
+                 times 16 db 0, 0
122
+                 times 16 db 0, 0
123
+
124
+                 times 16 db -1, 4
125
+                 times 16 db -10, 58
126
+                 times 16 db 17, -5
127
+                 times 16 db 1, 0
128
+
129
+                 times 16 db -1, 4
130
+                 times 16 db -11, 40
131
+                 times 16 db 40, -11
132
+                 times 16 db 4, -1
133
+
134
+                 times 16 db 0, 1
135
+                 times 16 db -5, 17
136
+                 times 16 db 58, -10
137
+                 times 16 db 4, -1
138
+
139
 tab_LumaCoeffVer: times 8 db 0, 0
140
                   times 8 db 0, 64
141
                   times 8 db 0, 0
142
@@ -183,6 +285,15 @@
143
 interp4_horiz_shuf1:    db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
144
                         db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
145
 
146
+ALIGN 32
147
+interp4_hpp_shuf: times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
148
+
149
+ALIGN 32
150
+interp8_hps_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
151
+
152
+ALIGN 32
153
+interp4_hps_shuf: times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
154
+
155
 SECTION .text
156
 
157
 cextern pb_128
158
@@ -913,6 +1024,105 @@
159
     pextrd          [r2+r0], xm3, 3
160
     RET
161
 
162
+%macro FILTER_HORIZ_LUMA_AVX2_4xN 1
163
+INIT_YMM avx2
164
+%if ARCH_X86_64 == 1
165
+cglobal interp_8tap_horiz_pp_4x%1, 4, 6, 9
166
+    mov             r4d, r4m
167
+
168
+%ifdef PIC
169
+    lea             r5, [tab_LumaCoeff]
170
+    vpbroadcastq    m0, [r5 + r4 * 8]
171
+%else
172
+    vpbroadcastq    m0, [tab_LumaCoeff + r4 * 8]
173
+%endif
174
+
175
+    mova            m1, [tab_Lm]
176
+    mova            m2, [pw_1]
177
+    mova            m7, [interp8_hps_shuf]
178
+    mova            m8, [pw_512]
179
+
180
+    ; register map
181
+    ; m0 - interpolate coeff
182
+    ; m1 - shuffle order table
183
+    ; m2 - constant word 1
184
+    lea             r4, [r1 * 3]
185
+    lea             r5, [r3 * 3]
186
+    sub             r0, 3
187
+%rep %1 / 8
188
+    ; Row 0-1
189
+    vbroadcasti128  m3, [r0]                        ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
190
+    pshufb          m3, m1
191
+    pmaddubsw       m3, m0
192
+    pmaddwd         m3, m2
193
+    vbroadcasti128  m4, [r0 + r1]                   ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
194
+    pshufb          m4, m1
195
+    pmaddubsw       m4, m0
196
+    pmaddwd         m4, m2
197
+    phaddd          m3, m4                          ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]
198
+
199
+    ; Row 2-3
200
+    vbroadcasti128  m4, [r0 + r1 * 2]               ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
201
x265_1.5.tar.gz/source/common/x86/ipfilter8.h -> x265_1.6.tar.gz/source/common/x86/ipfilter8.h Changed
49
 
1
@@ -576,8 +576,12 @@
2
 CHROMA_420_FILTERS(_avx2);
3
 CHROMA_420_SP_FILTERS(_sse2);
4
 CHROMA_420_SP_FILTERS_SSE4(_sse4);
5
+CHROMA_420_SP_FILTERS(_avx2);
6
+CHROMA_420_SP_FILTERS_SSE4(_avx2);
7
 CHROMA_420_SS_FILTERS(_sse2);
8
 CHROMA_420_SS_FILTERS_SSE4(_sse4);
9
+CHROMA_420_SS_FILTERS(_avx2);
10
+CHROMA_420_SS_FILTERS_SSE4(_avx2);
11
 
12
 CHROMA_422_FILTERS(_sse4);
13
 CHROMA_422_FILTERS(_avx2);
14
@@ -617,10 +621,31 @@
15
 LUMA_SP_FILTERS(_sse4);
16
 LUMA_SS_FILTERS(_sse2);
17
 LUMA_FILTERS(_avx2);
18
-
19
+LUMA_SP_FILTERS(_avx2);
20
+LUMA_SS_FILTERS(_avx2);
21
 void x265_interp_8tap_hv_pp_8x8_sse4(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY);
22
-void x265_luma_p2s_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
23
-
24
+void x265_pixelToShort_4x4_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
25
+void x265_pixelToShort_4x8_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
26
+void x265_pixelToShort_4x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
27
+void x265_pixelToShort_8x4_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
28
+void x265_pixelToShort_8x8_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
29
+void x265_pixelToShort_8x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
30
+void x265_pixelToShort_8x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
31
+void x265_pixelToShort_16x4_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
32
+void x265_pixelToShort_16x8_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
33
+void x265_pixelToShort_16x12_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
34
+void x265_pixelToShort_16x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
35
+void x265_pixelToShort_16x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
36
+void x265_pixelToShort_16x64_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
37
+void x265_pixelToShort_32x8_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
38
+void x265_pixelToShort_32x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
39
+void x265_pixelToShort_32x24_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
40
+void x265_pixelToShort_32x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
41
+void x265_pixelToShort_32x64_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
42
+void x265_pixelToShort_64x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
43
+void x265_pixelToShort_64x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
44
+void x265_pixelToShort_64x48_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
45
+void x265_pixelToShort_64x64_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
46
 #undef LUMA_FILTERS
47
 #undef LUMA_SP_FILTERS
48
 #undef LUMA_SS_FILTERS
49
x265_1.5.tar.gz/source/common/x86/mc-a.asm -> x265_1.6.tar.gz/source/common/x86/mc-a.asm Changed
201
 
1
@@ -1759,7 +1759,570 @@
2
 ADDAVG_W16_H4 24
3
 
4
 ;-----------------------------------------------------------------------------
5
+; addAvg avx2 code start
6
+;-----------------------------------------------------------------------------
7
+
8
+INIT_YMM avx2
9
+cglobal addAvg_8x2, 6,6,4, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride ; NOTE(review): seven names for six args; the body uses only r0..r5
10
+    movu            xm0, [r0]                       ; low lane  = 16 bytes at r0
11
+    vinserti128     m0, m0, [r0 + 2 * r3], 1        ; high lane = 16 bytes at r0 + 2*r3 (r3 is not pre-doubled in this function)
12
+
13
+    movu            xm2, [r1]                       ; same pairing for the second source, stepped by 2*r4
14
+    vinserti128     m2, m2, [r1 + 2 * r4], 1
15
+
16
+    paddw           m0, m2                          ; word-wise sum of the two sources
17
+    pmulhrsw        m0, [pw_256]                    ; rounded high multiply by the pw_256 constant (defined outside this view)
18
+    paddw           m0, [pw_128]                    ; add the pw_128 constant (defined outside this view)
19
+
20
+    packuswb        m0, m0                          ; words -> bytes with unsigned saturation, per lane
21
+    vextracti128    xm1, m0, 1                      ; xm1 = high lane (second row)
22
+    movq            [r2], xm0                       ; store 8 bytes of the first row
23
+    movq            [r2 + r5], xm1                  ; store 8 bytes of the second row at r2 + r5
24
+    RET
25
+
26
+cglobal addAvg_8x6, 6,6,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride ; NOTE(review): seven names for six args; the body uses only r0..r5
27
+    mova            m4, [pw_256]                    ; m4 = multiplier for pmulhrsw (constant defined outside this view)
28
+    mova            m5, [pw_128]                    ; m5 = constant added after the multiply
29
+    add             r3, r3                          ; r3 *= 2: step between consecutive rows of the first source, as used below
30
+    add             r4, r4                          ; r4 *= 2: step between consecutive rows of the second source
31
+
32
+    movu            xm0, [r0]                       ; rows 0-1: low lane = row at r0, high lane = row at r0 + r3
33
+    vinserti128     m0, m0, [r0 + r3], 1
34
+
35
+    movu            xm2, [r1]
36
+    vinserti128     m2, m2, [r1 + r4], 1
37
+
38
+    paddw           m0, m2                          ; sum, rounded high multiply by m4, add m5
39
+    pmulhrsw        m0, m4
40
+    paddw           m0, m5
41
+
42
+    packuswb        m0, m0                          ; words -> bytes with unsigned saturation, per lane
43
+    vextracti128    xm1, m0, 1
44
+    movq            [r2], xm0
45
+    movq            [r2 + r5], xm1
46
+
47
+    lea             r2, [r2 + 2 * r5]               ; advance destination and both sources by two rows
48
+    lea             r0, [r0 + 2 * r3]
49
+    lea             r1, [r1 + 2 * r4]
50
+
51
+    movu            xm0, [r0]                       ; rows 2-3: same sequence as above
52
+    vinserti128     m0, m0, [r0+  r3], 1
53
+
54
+    movu            xm2, [r1]
55
+    vinserti128     m2, m2, [r1 + r4], 1
56
+
57
+    paddw           m0, m2
58
+    pmulhrsw        m0, m4
59
+    paddw           m0, m5
60
+
61
+    packuswb        m0, m0
62
+    vextracti128    xm1, m0, 1
63
+    movq            [r2], xm0
64
+    movq            [r2 + r5], xm1
65
+
66
+    lea             r2, [r2 + 2 * r5]
67
+    lea             r0, [r0 + 2 * r3]
68
+    lea             r1, [r1 + 2 * r4]
69
+
70
+    movu            xm0, [r0]                       ; rows 4-5: same sequence as above
71
+    vinserti128     m0, m0, [r0 + r3], 1
72
+
73
+    movu            xm2, [r1]
74
+    vinserti128     m2, m2, [r1 + r4], 1
75
+
76
+    paddw           m0, m2
77
+    pmulhrsw        m0, m4
78
+    paddw           m0, m5
79
+
80
+    packuswb        m0, m0
81
+    vextracti128    xm1, m0, 1
82
+    movq            [r2], xm0
83
+    movq            [r2 + r5], xm1
84
+    RET
85
+
86
+%macro ADDAVG_W8_H4_AVX2 1                         ; %1 = block height; emits addAvg_8x%1, four rows per loop iteration
87
+INIT_YMM avx2
88
+cglobal addAvg_8x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride ; NOTE(review): seven names for six args; the body uses only r0..r6
89
+    mova            m4, [pw_256]                    ; m4 = multiplier for pmulhrsw (constant defined outside this view)
90
+    mova            m5, [pw_128]                    ; m5 = constant added after the multiply
91
+    add             r3, r3                          ; r3 *= 2: step between consecutive rows of the first source, as used below
92
+    add             r4, r4                          ; r4 *= 2: step between consecutive rows of the second source
93
+    mov             r6d, %1/4                       ; loop counter: each iteration handles 4 rows
94
+
95
+.loop:
96
+    movu            xm0, [r0]                       ; first row pair: low lane = row at r0, high lane = row at r0 + r3
97
+    vinserti128     m0, m0, [r0 + r3], 1
98
+
99
+    movu            xm2, [r1]
100
+    vinserti128     m2, m2, [r1 + r4], 1
101
+
102
+    paddw           m0, m2                          ; sum, rounded high multiply by m4, add m5
103
+    pmulhrsw        m0, m4
104
+    paddw           m0, m5
105
+
106
+    packuswb        m0, m0                          ; words -> bytes with unsigned saturation, per lane
107
+    vextracti128    xm1, m0, 1
108
+    movq            [r2], xm0
109
+    movq            [r2 + r5], xm1
110
+
111
+    lea             r2, [r2 + 2 * r5]               ; advance destination and both sources by two rows
112
+    lea             r0, [r0 + 2 * r3]
113
+    lea             r1, [r1 + 2 * r4]
114
+
115
+    movu            xm0, [r0]                       ; second row pair
116
+    vinserti128     m0, m0, [r0 + r3], 1
117
+
118
+    movu            xm2, [r1]                       ; 16-byte load like the other row loads; the high lane is filled by the next instruction
119
+    vinserti128     m2, m2, [r1 + r4], 1
120
+
121
+    paddw           m0, m2
122
+    pmulhrsw        m0, m4
123
+    paddw           m0, m5
124
+
125
+    packuswb        m0, m0
126
+    vextracti128    xm1, m0, 1
127
+    movq            [r2], xm0
128
+    movq            [r2 + r5], xm1
129
+
130
+    lea             r2, [r2 + 2 * r5]
131
+    lea             r0, [r0 + 2 * r3]
132
+    lea             r1, [r1 + 2 * r4]
133
+
134
+    dec             r6d
135
+    jnz             .loop
136
+    RET
137
+%endmacro
138
 
139
+ADDAVG_W8_H4_AVX2 4
140
+ADDAVG_W8_H4_AVX2 8
141
+ADDAVG_W8_H4_AVX2 16
142
+ADDAVG_W8_H4_AVX2 32
143
+
144
+%macro ADDAVG_W12_H4_AVX2 1
145
+INIT_YMM avx2
146
+cglobal addAvg_12x%1, 6,7,7, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride
147
+    mova            m4, [pw_256]
148
+    mova            m5, [pw_128]
149
+    add             r3, r3
150
+    add             r4, r4
151
+    mov             r6d, %1/4
152
+
153
+.loop:
154
+    movu            xm0, [r0]
155
+    movu            xm1, [r1]
156
+    movq            xm2, [r0 + 16]
157
+    movq            xm3, [r1 + 16]
158
+    vinserti128     m0, m0, xm2, 1
159
+    vinserti128     m1, m1, xm3, 1
160
+
161
+    paddw           m0, m1
162
+    pmulhrsw        m0, m4
163
+    paddw           m0, m5
164
+
165
+    movu            xm1, [r0 + r3]
166
+    movu            xm2, [r1 + r4]
167
+    movq            xm3, [r0 + r3 + 16]
168
+    movq            xm6, [r1 + r3 + 16]
169
+    vinserti128     m1, m1, xm3, 1
170
+    vinserti128     m2, m2, xm6, 1
171
+
172
+    paddw           m1, m2
173
+    pmulhrsw        m1, m4
174
+    paddw           m1, m5
175
+
176
+    packuswb        m0, m1
177
+    vextracti128    xm1, m0, 1
178
+    movq            [r2], xm0
179
+    movd            [r2 + 8], xm1
180
+    vpshufd         m1, m1, 2
181
+    movhps          [r2 + r5], xm0
182
+    movd            [r2 + r5 + 8], xm1
183
+
184
+    lea             r2, [r2 + 2 * r5]
185
+    lea             r0, [r0 + 2 * r3]
186
+    lea             r1, [r1 + 2 * r4]
187
+
188
+    movu            xm0, [r0]
189
+    movu            xm1, [r1]
190
+    movq            xm2, [r0 + 16]
191
+    movq            xm3, [r1 + 16]
192
+    vinserti128     m0, m0, xm2, 1
193
+    vinserti128     m1, m1, xm3, 1
194
+
195
+    paddw           m0, m1
196
+    pmulhrsw        m0, m4
197
+    paddw           m0, m5
198
+
199
+    movu            xm1, [r0 + r3]
200
+    movu            xm2, [r1 + r4]
201
x265_1.5.tar.gz/source/common/x86/pixel-a.asm -> x265_1.6.tar.gz/source/common/x86/pixel-a.asm Changed
201
 
1
@@ -38,13 +38,15 @@
2
            times 4 db 1, -1
3
            times 8 db 1
4
            times 4 db 1, -1
5
-hmul_4p:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
6
+hmul_4p:   times 4 db 1, 1, 1, 1, 1, -1, 1, -1
7
 mask_10:   times 4 dw 0, -1
8
 mask_1100: times 2 dd 0, -1
9
 hmul_8w:   times 4 dw 1
10
            times 2 dw 1, -1
11
+           times 4 dw 1
12
+           times 2 dw 1, -1
13
 ALIGN 32
14
-hmul_w:    dw 1, -1, 1, -1, 1, -1, 1, -1
15
+hmul_w:    times 2 dw 1, -1, 1, -1, 1, -1, 1, -1
16
 ALIGN 32
17
 transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
18
 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
19
@@ -1235,6 +1237,580 @@
20
     RET
21
 
22
 %else
23
+%if WIN64
24
+cglobal pixel_satd_16x24, 4,8,14    ;if WIN64 && cpuflag(avx) -- 16x24 SATD built from 8x8 helper calls; 8 GPRs requested so r7 can hold a saved pointer
25
+    SATD_START_SSE2 m6, m7
26
+    mov r6, r0    ; remember the first block's start so the second 8-pixel column can be addressed later
27
+    mov r7, r2    ; remember the second block's start likewise
28
+    call pixel_satd_8x8_internal2    ; column 0, three calls; presumably one 8x8 tile per call with r0/r2 stepped down 8 rows -- confirm in the helper
29
+    call pixel_satd_8x8_internal2
30
+    call pixel_satd_8x8_internal2
31
+    lea r0, [r6 + 8*SIZEOF_PIXEL]    ; restart from the saved pointer, 8 pixels to the right
32
+    lea r2, [r7 + 8*SIZEOF_PIXEL]
33
+    call pixel_satd_8x8_internal2    ; column 1, three calls
34
+    call pixel_satd_8x8_internal2
35
+    call pixel_satd_8x8_internal2
36
+    pxor    m7, m7    ; horizontal reduction of the four dwords in m6 starts here
37
+    movhlps m7, m6    ; m7 low qword = m6 high qword
38
+    paddd   m6, m7    ; dwords 0/1 of m6 now hold d0+d2 and d1+d3
39
+    pshufd  m7, m6, 1    ; bring dword 1 down to dword 0
40
+    paddd   m6, m7    ; dword 0 = sum of all four dwords
41
+    movd   eax, m6    ; the total is the return value
42
+    RET
43
+%else
44
+cglobal pixel_satd_16x24, 4,7,8,0-gprsize    ;if !WIN64 -- only 7 GPRs here, so the saved r2 lives in one stack slot instead of r7
45
+    SATD_START_SSE2 m6, m7
46
+    mov r6, r0    ; remember the first block's start
47
+    mov [rsp], r2    ; spill the second block's start to the reserved stack slot
48
+    call pixel_satd_8x8_internal2    ; column 0, three calls (helper behaviour not visible here -- confirm)
49
+    call pixel_satd_8x8_internal2
50
+    call pixel_satd_8x8_internal2
51
+    lea r0, [r6 + 8*SIZEOF_PIXEL]    ; second column of the first block
52
+    mov r2, [rsp]    ; reload the saved second-block start ...
53
+    add r2, 8*SIZEOF_PIXEL    ; ... and move 8 pixels to the right
54
+    call pixel_satd_8x8_internal2    ; column 1, three calls
55
+    call pixel_satd_8x8_internal2
56
+    call pixel_satd_8x8_internal2
57
+    pxor    m7, m7    ; same four-dword horizontal reduction of m6 as in the WIN64 branch
58
+    movhlps m7, m6
59
+    paddd   m6, m7
60
+    pshufd  m7, m6, 1
61
+    paddd   m6, m7
62
+    movd   eax, m6    ; the total is the return value
63
+    RET
64
+%endif
65
+%if WIN64
66
+cglobal pixel_satd_32x48, 4,8,14    ;if WIN64 && cpuflag(avx) -- 32x48 SATD: four 8-pixel columns, six helper calls per column
67
+    SATD_START_SSE2 m6, m7
68
+    mov r6, r0    ; remember the first block's start; every later column is addressed relative to it
69
+    mov r7, r2    ; remember the second block's start likewise
70
+    call pixel_satd_8x8_internal2    ; column 0, six calls; presumably one 8x8 tile per call with r0/r2 stepped down 8 rows -- confirm in the helper
71
+    call pixel_satd_8x8_internal2
72
+    call pixel_satd_8x8_internal2
73
+    call pixel_satd_8x8_internal2
74
+    call pixel_satd_8x8_internal2
75
+    call pixel_satd_8x8_internal2
76
+    lea r0, [r6 + 8*SIZEOF_PIXEL]    ; column 1: saved start + 8 pixels
77
+    lea r2, [r7 + 8*SIZEOF_PIXEL]
78
+    call pixel_satd_8x8_internal2
79
+    call pixel_satd_8x8_internal2
80
+    call pixel_satd_8x8_internal2
81
+    call pixel_satd_8x8_internal2
82
+    call pixel_satd_8x8_internal2
83
+    call pixel_satd_8x8_internal2
84
+    lea r0, [r6 + 16*SIZEOF_PIXEL]    ; column 2: saved start + 16 pixels
85
+    lea r2, [r7 + 16*SIZEOF_PIXEL]
86
+    call pixel_satd_8x8_internal2
87
+    call pixel_satd_8x8_internal2
88
+    call pixel_satd_8x8_internal2
89
+    call pixel_satd_8x8_internal2
90
+    call pixel_satd_8x8_internal2
91
+    call pixel_satd_8x8_internal2
92
+    lea r0, [r6 + 24*SIZEOF_PIXEL]    ; column 3: saved start + 24 pixels
93
+    lea r2, [r7 + 24*SIZEOF_PIXEL]
94
+    call pixel_satd_8x8_internal2
95
+    call pixel_satd_8x8_internal2
96
+    call pixel_satd_8x8_internal2
97
+    call pixel_satd_8x8_internal2
98
+    call pixel_satd_8x8_internal2
99
+    call pixel_satd_8x8_internal2
100
+    pxor    m7, m7    ; horizontal reduction of the four dwords in m6 starts here
101
+    movhlps m7, m6    ; m7 low qword = m6 high qword
102
+    paddd   m6, m7    ; dwords 0/1 of m6 now hold d0+d2 and d1+d3
103
+    pshufd  m7, m6, 1    ; bring dword 1 down to dword 0
104
+    paddd   m6, m7    ; dword 0 = sum of all four dwords
105
+    movd   eax, m6    ; the total is the return value
106
+    RET
107
+%else
108
+cglobal pixel_satd_32x48, 4,7,8,0-gprsize    ;if !WIN64 -- only 7 GPRs here, so the saved r2 lives in one stack slot instead of r7
109
+    SATD_START_SSE2 m6, m7
110
+    mov r6, r0    ; remember the first block's start
111
+    mov [rsp], r2    ; spill the second block's start to the reserved stack slot
112
+    call pixel_satd_8x8_internal2    ; column 0, six calls (helper behaviour not visible here -- confirm)
113
+    call pixel_satd_8x8_internal2
114
+    call pixel_satd_8x8_internal2
115
+    call pixel_satd_8x8_internal2
116
+    call pixel_satd_8x8_internal2
117
+    call pixel_satd_8x8_internal2
118
+    lea r0, [r6 + 8*SIZEOF_PIXEL]    ; column 1: saved start + 8 pixels
119
+    mov r2, [rsp]    ; reload the saved second-block start before offsetting it
120
+    add r2, 8*SIZEOF_PIXEL
121
+    call pixel_satd_8x8_internal2
122
+    call pixel_satd_8x8_internal2
123
+    call pixel_satd_8x8_internal2
124
+    call pixel_satd_8x8_internal2
125
+    call pixel_satd_8x8_internal2
126
+    call pixel_satd_8x8_internal2
127
+    lea r0, [r6 + 16*SIZEOF_PIXEL]    ; column 2: saved start + 16 pixels
128
+    mov r2, [rsp]
129
+    add r2, 16*SIZEOF_PIXEL
130
+    call pixel_satd_8x8_internal2
131
+    call pixel_satd_8x8_internal2
132
+    call pixel_satd_8x8_internal2
133
+    call pixel_satd_8x8_internal2
134
+    call pixel_satd_8x8_internal2
135
+    call pixel_satd_8x8_internal2
136
+    lea r0, [r6 + 24*SIZEOF_PIXEL]    ; column 3: saved start + 24 pixels
137
+    mov r2, [rsp]
138
+    add r2, 24*SIZEOF_PIXEL
139
+    call pixel_satd_8x8_internal2
140
+    call pixel_satd_8x8_internal2
141
+    call pixel_satd_8x8_internal2
142
+    call pixel_satd_8x8_internal2
143
+    call pixel_satd_8x8_internal2
144
+    call pixel_satd_8x8_internal2
145
+    pxor    m7, m7    ; same four-dword horizontal reduction of m6 as in the WIN64 branch
146
+    movhlps m7, m6
147
+    paddd   m6, m7
148
+    pshufd  m7, m6, 1
149
+    paddd   m6, m7
150
+    movd   eax, m6    ; the total is the return value
151
+    RET
152
+%endif
153
+
154
+%if WIN64
155
+cglobal pixel_satd_24x64, 4,8,14    ;if WIN64 && cpuflag(avx)
156
+    SATD_START_SSE2 m6, m7
157
+    mov r6, r0
158
+    mov r7, r2
159
+    call pixel_satd_8x8_internal2
160
+    call pixel_satd_8x8_internal2
161
+    call pixel_satd_8x8_internal2
162
+    call pixel_satd_8x8_internal2
163
+    call pixel_satd_8x8_internal2
164
+    call pixel_satd_8x8_internal2
165
+    call pixel_satd_8x8_internal2
166
+    call pixel_satd_8x8_internal2
167
+    lea r0, [r6 + 8*SIZEOF_PIXEL]
168
+    lea r2, [r7 + 8*SIZEOF_PIXEL]
169
+    call pixel_satd_8x8_internal2
170
+    call pixel_satd_8x8_internal2
171
+    call pixel_satd_8x8_internal2
172
+    call pixel_satd_8x8_internal2
173
+    call pixel_satd_8x8_internal2
174
+    call pixel_satd_8x8_internal2
175
+    call pixel_satd_8x8_internal2
176
+    call pixel_satd_8x8_internal2
177
+    lea r0, [r6 + 16*SIZEOF_PIXEL]
178
+    lea r2, [r7 + 16*SIZEOF_PIXEL]
179
+    call pixel_satd_8x8_internal2
180
+    call pixel_satd_8x8_internal2
181
+    call pixel_satd_8x8_internal2
182
+    call pixel_satd_8x8_internal2
183
+    call pixel_satd_8x8_internal2
184
+    call pixel_satd_8x8_internal2
185
+    call pixel_satd_8x8_internal2
186
+    call pixel_satd_8x8_internal2
187
+    pxor    m7, m7
188
+    movhlps m7, m6
189
+    paddd   m6, m7
190
+    pshufd  m7, m6, 1
191
+    paddd   m6, m7
192
+    movd   eax, m6
193
+    RET
194
+%else
195
+cglobal pixel_satd_24x64, 4,7,8,0-gprsize    ;if !WIN64
196
+    SATD_START_SSE2 m6, m7
197
+    mov r6, r0
198
+    mov [rsp], r2
199
+    call pixel_satd_8x8_internal2
200
+    call pixel_satd_8x8_internal2
201
x265_1.5.tar.gz/source/common/x86/pixel-util.h -> x265_1.6.tar.gz/source/common/x86/pixel-util.h Changed
36
 
1
@@ -30,6 +30,8 @@
2
 void x265_getResidual16_sse4(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
3
 void x265_getResidual32_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
4
 void x265_getResidual32_sse4(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
5
+void x265_getResidual16_avx2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
6
+void x265_getResidual32_avx2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
7
 
8
 void x265_transpose4_sse2(pixel* dest, const pixel* src, intptr_t stride);
9
 void x265_transpose8_sse2(pixel* dest, const pixel* src, intptr_t stride);
10
@@ -48,7 +50,15 @@
11
 uint32_t x265_nquant_avx2(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
12
 void x265_dequant_normal_sse4(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
13
 void x265_dequant_normal_avx2(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
14
-int x265_count_nonzero_ssse3(const int16_t* quantCoeff, int numCoeff);
15
+
16
+int x265_count_nonzero_4x4_ssse3(const int16_t* quantCoeff);
17
+int x265_count_nonzero_8x8_ssse3(const int16_t* quantCoeff);
18
+int x265_count_nonzero_16x16_ssse3(const int16_t* quantCoeff);
19
+int x265_count_nonzero_32x32_ssse3(const int16_t* quantCoeff);
20
+int x265_count_nonzero_4x4_avx2(const int16_t* quantCoeff);
21
+int x265_count_nonzero_8x8_avx2(const int16_t* quantCoeff);
22
+int x265_count_nonzero_16x16_avx2(const int16_t* quantCoeff);
23
+int x265_count_nonzero_32x32_avx2(const int16_t* quantCoeff);
24
 
25
 void x265_weight_pp_sse4(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
26
 void x265_weight_pp_avx2(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
27
@@ -67,6 +77,8 @@
28
 void x265_scale1D_128to64_avx2(pixel*, const pixel*, intptr_t);
29
 void x265_scale2D_64to32_ssse3(pixel*, const pixel*, intptr_t);
30
 
31
+int x265_findPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig);
32
+
33
 #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
34
     void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t*  dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \
35
     void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t destride, const pixel* src0, const int16_t*  scr1, intptr_t srcStride0, intptr_t srcStride1);
36
x265_1.5.tar.gz/source/common/x86/pixel-util8.asm -> x265_1.6.tar.gz/source/common/x86/pixel-util8.asm Changed
201
 
1
@@ -3,6 +3,7 @@
2
 ;*
3
 ;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
4
 ;*          Nabajit Deka <nabajit@multicorewareinc.com>
5
+;*          Rajesh Paulraj <rajesh@multicorewareinc.com>
6
 ;*
7
 ;* This program is free software; you can redistribute it and/or modify
8
 ;* it under the terms of the GNU General Public License as published by
9
@@ -63,6 +64,12 @@
10
 cextern pd_1
11
 cextern pd_32767
12
 cextern pd_n32768
13
+cextern pb_2
14
+cextern pb_4
15
+cextern pb_8
16
+cextern pb_16
17
+cextern pb_32
18
+cextern pb_64
19
 
20
 ;-----------------------------------------------------------------------------
21
 ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
22
@@ -95,9 +102,9 @@
23
     punpcklqdq   m0, m1
24
     punpcklqdq   m2, m3
25
     psubw        m0, m2
26
-
27
     movh        [r2], m0
28
     movhps      [r2 + r3], m0
29
+    RET
30
 %else
31
 cglobal getResidual4, 4,4,5
32
     pxor        m0, m0
33
@@ -130,8 +137,8 @@
34
     psubw       m1, m3
35
     movh        [r2], m1
36
     movhps      [r2 + r3 * 2], m1
37
-%endif
38
     RET
39
+%endif
40
 
41
 
42
 INIT_XMM sse2
43
@@ -157,6 +164,7 @@
44
     lea         r2, [r2 + r3 * 2]
45
 %endif
46
 %endrep
47
+    RET
48
 %else
49
 cglobal getResidual8, 4,4,5
50
     pxor        m0, m0
51
@@ -183,8 +191,9 @@
52
     lea         r2, [r2 + r3 * 4]
53
 %endif
54
 %endrep
55
-%endif
56
     RET
57
+%endif
58
+
59
 
60
 %if HIGH_BIT_DEPTH
61
 INIT_XMM sse2
62
@@ -238,10 +247,9 @@
63
     lea         r0, [r0 + r3 * 2]
64
     lea         r1, [r1 + r3 * 2]
65
     lea         r2, [r2 + r3 * 2]
66
-
67
     jnz        .loop
68
+    RET
69
 %else
70
-
71
 INIT_XMM sse4
72
 cglobal getResidual16, 4,5,8
73
     mov         r4d, 16/4
74
@@ -302,11 +310,67 @@
75
     lea         r0, [r0 + r3 * 2]
76
     lea         r1, [r1 + r3 * 2]
77
     lea         r2, [r2 + r3 * 4]
78
-
79
     jnz        .loop
80
+    RET
81
 %endif
82
 
83
+%if HIGH_BIT_DEPTH
84
+INIT_YMM avx2
85
+cglobal getResidual16, 4,4,5    ; AVX2 16x16 residual: r0 = fenc, r1 = pred, r2 = residual (int16), r3 = stride; writes fenc - pred
86
+    add         r3, r3    ; double the stride: this branch reads a full 32-byte ymm per 16-sample row, i.e. 2 bytes per pixel
87
+    pxor        m0, m0    ; NOTE(review): m0 is zeroed but never read in this routine
88
+
89
+%assign x 0
90
+%rep 16/2    ; fully unrolled, two rows per repetition
91
+    movu        m1, [r0]    ; fenc row 0 of the pair
92
+    movu        m2, [r0 + r3]    ; fenc row 1 of the pair
93
+    movu        m3, [r1]    ; pred row 0 of the pair
94
+    movu        m4, [r1 + r3]    ; pred row 1 of the pair
95
+
96
+    psubw       m1, m3    ; word-wise fenc - pred
97
+    psubw       m2, m4
98
+    movu        [r2], m1    ; the residual uses the same (doubled) stride as the inputs
99
+    movu        [r2 + r3], m2
100
+%assign x x+1
101
+%if (x != 8)    ; skip the pointer bumps after the last pair of rows
102
+    lea         r0, [r0 + r3 * 2]
103
+    lea         r1, [r1 + r3 * 2]
104
+    lea         r2, [r2 + r3 * 2]
105
+%endif
106
+%endrep
107
     RET
108
+%else
109
+INIT_YMM avx2
110
+cglobal getResidual16, 4,5,8    ; 8-bit variant: same arguments, pixels are bytes and get widened to words
111
+    lea         r4, [r3 * 2]
112
+    add         r4d, r3d    ; r4 = 3 * stride (32-bit add, so the upper half of r4 is cleared)
113
+%assign x 0
114
+%rep 4    ; fully unrolled, four rows per repetition
115
+    pmovzxbw    m0, [r0]    ; 16 fenc bytes zero-extended to 16 words, rows 0..3 in m0..m3
116
+    pmovzxbw    m1, [r0 + r3]
117
+    pmovzxbw    m2, [r0 + r3 * 2]
118
+    pmovzxbw    m3, [r0 + r4]
119
+    pmovzxbw    m4, [r1]    ; matching pred rows 0..3 in m4..m7
120
+    pmovzxbw    m5, [r1 + r3]
121
+    pmovzxbw    m6, [r1 + r3 * 2]
122
+    pmovzxbw    m7, [r1 + r4]
123
+    psubw       m0, m4    ; word-wise fenc - pred
124
+    psubw       m1, m5
125
+    psubw       m2, m6
126
+    psubw       m3, m7
127
+    movu        [r2], m0    ; residual rows are stride * 2 bytes apart (16-bit elements)
128
+    movu        [r2 + r3 * 2], m1
129
+    movu        [r2 + r3 * 2 * 2], m2
130
+    movu        [r2 + r4 * 2], m3
131
+%assign x x+1
132
+%if (x != 4)    ; skip the pointer bumps after the last group of rows
133
+    lea         r0, [r0 + r3 * 2 * 2]    ; inputs advance 4 rows of bytes
134
+    lea         r1, [r1 + r3 * 2 * 2]
135
+    lea         r2, [r2 + r3 * 4 * 2]    ; output advances 4 rows of words
136
+%endif
137
+%endrep
138
+    RET
139
+%endif
140
 
141
 %if HIGH_BIT_DEPTH
142
 INIT_XMM sse2
143
@@ -357,9 +421,8 @@
144
     lea         r0, [r0 + r3 * 2]
145
     lea         r1, [r1 + r3 * 2]
146
     lea         r2, [r2 + r3 * 2]
147
-
148
     jnz        .loop
149
-
150
+    RET
151
 %else
152
 INIT_XMM sse4
153
 cglobal getResidual32, 4,5,7
154
@@ -415,12 +478,70 @@
155
     lea         r0, [r0 + r3 * 2]
156
     lea         r1, [r1 + r3 * 2]
157
     lea         r2, [r2 + r3 * 4]
158
-
159
     jnz        .loop
160
+    RET
161
+%endif
162
+
163
+
164
+%if HIGH_BIT_DEPTH
165
+INIT_YMM avx2
166
+cglobal getResidual32, 4,4,5    ; AVX2 32x32 residual for the HIGH_BIT_DEPTH build: r0 = fenc, r1 = pred, r2 = residual, r3 = stride
167
+    add         r3, r3    ; double the stride: each 32-sample row is read as two 32-byte ymm loads, i.e. 2 bytes per pixel
168
+    pxor        m0, m0    ; NOTE(review): m0 is zeroed but never read in this routine
169
+
170
+%assign x 0
171
+%rep 32    ; fully unrolled, one row per repetition
172
+    movu        m1, [r0]    ; fenc, first half of the row
173
+    movu        m2, [r0 + 32]    ; fenc, second half of the row
174
+    movu        m3, [r1]    ; pred, first half of the row
175
+    movu        m4, [r1 + 32]    ; pred, second half of the row
176
+
177
+    psubw       m1, m3    ; word-wise fenc - pred
178
+    psubw       m2, m4
179
+    movu        [r2], m1    ; the residual uses the same (doubled) stride as the inputs
180
+    movu        [r2 + 32], m2
181
+%assign x x+1
182
+%if (x != 32)    ; skip the pointer bumps after the last row
183
+    lea         r0, [r0 + r3]
184
+    lea         r1, [r1 + r3]
185
+    lea         r2, [r2 + r3]
186
 %endif
187
+%endrep
188
     RET
189
+%else
190
+INIT_YMM avx2
191
+cglobal getResidual32, 4,5,8
192
+    lea         r4, [r3 * 2]
193
+%assign x 0
194
+%rep 16
195
+    pmovzxbw    m0, [r0]
196
+    pmovzxbw    m1, [r0 + 16]
197
+    pmovzxbw    m2, [r0 + r3]
198
+    pmovzxbw    m3, [r0 + r3 + 16]
199
+
200
+    pmovzxbw    m4, [r1]
201
x265_1.5.tar.gz/source/common/x86/pixel.h -> x265_1.6.tar.gz/source/common/x86/pixel.h Changed
71
 
1
@@ -103,6 +103,13 @@
2
 DECL_X1(satd, avx)
3
 DECL_X1(satd, xop)
4
 DECL_X1(satd, avx2)
5
+int x265_pixel_satd_16x24_avx(const pixel*, intptr_t, const pixel*, intptr_t);
6
+int x265_pixel_satd_32x48_avx(const pixel*, intptr_t, const pixel*, intptr_t);
7
+int x265_pixel_satd_24x64_avx(const pixel*, intptr_t, const pixel*, intptr_t);
8
+int x265_pixel_satd_8x64_avx(const pixel*, intptr_t, const pixel*, intptr_t);
9
+int x265_pixel_satd_8x12_avx(const pixel*, intptr_t, const pixel*, intptr_t);
10
+int x265_pixel_satd_12x32_avx(const pixel*, intptr_t, const pixel*, intptr_t);
11
+int x265_pixel_satd_4x32_avx(const pixel*, intptr_t, const pixel*, intptr_t);
12
 int x265_pixel_satd_8x32_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
13
 int x265_pixel_satd_16x4_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
14
 int x265_pixel_satd_16x12_sse2(const pixel*, intptr_t, const pixel*, intptr_t);
15
@@ -170,10 +177,12 @@
16
 int x265_pixel_ssd_s_8_sse2(const int16_t*, intptr_t);
17
 int x265_pixel_ssd_s_16_sse2(const int16_t*, intptr_t);
18
 int x265_pixel_ssd_s_32_sse2(const int16_t*, intptr_t);
19
+int x265_pixel_ssd_s_16_avx2(const int16_t*, intptr_t);
20
 int x265_pixel_ssd_s_32_avx2(const int16_t*, intptr_t);
21
 
22
 #define ADDAVG(func)  \
23
-    void x265_ ## func ## _sse4(const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
24
+    void x265_ ## func ## _sse4(const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
25
+    void x265_ ## func ## _avx2(const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t);
26
 ADDAVG(addAvg_2x4)
27
 ADDAVG(addAvg_2x8)
28
 ADDAVG(addAvg_4x2);
29
@@ -228,6 +237,41 @@
30
 int x265_psyCost_ss_16x16_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
31
 int x265_psyCost_ss_32x32_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
32
 int x265_psyCost_ss_64x64_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
33
+void x265_pixel_avg_16x4_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
34
+void x265_pixel_avg_16x8_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
35
+void x265_pixel_avg_16x12_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
36
+void x265_pixel_avg_16x16_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
37
+void x265_pixel_avg_16x32_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
38
+void x265_pixel_avg_16x64_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
39
+void x265_pixel_avg_32x64_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
40
+void x265_pixel_avg_32x32_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
41
+void x265_pixel_avg_32x24_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
42
+void x265_pixel_avg_32x16_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
43
+void x265_pixel_avg_32x8_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
44
+void x265_pixel_avg_64x64_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
45
+void x265_pixel_avg_64x48_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
46
+void x265_pixel_avg_64x32_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
47
+void x265_pixel_avg_64x16_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
48
+
49
+void x265_pixel_add_ps_16x16_avx2(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
50
+void x265_pixel_add_ps_32x32_avx2(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
51
+void x265_pixel_add_ps_64x64_avx2(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
52
+
53
+void x265_pixel_sub_ps_16x16_avx2(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
54
+void x265_pixel_sub_ps_32x32_avx2(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
55
+void x265_pixel_sub_ps_64x64_avx2(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
56
+
57
+int x265_psyCost_pp_4x4_avx2(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
58
+int x265_psyCost_pp_8x8_avx2(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
59
+int x265_psyCost_pp_16x16_avx2(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
60
+int x265_psyCost_pp_32x32_avx2(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
61
+int x265_psyCost_pp_64x64_avx2(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
62
+
63
+int x265_psyCost_ss_4x4_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
64
+int x265_psyCost_ss_8x8_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
65
+int x265_psyCost_ss_16x16_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
66
+int x265_psyCost_ss_32x32_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
67
+int x265_psyCost_ss_64x64_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
68
 
69
 #undef DECL_PIXELS
70
 #undef DECL_HEVC_SSD
71
x265_1.5.tar.gz/source/common/x86/pixeladd8.asm -> x265_1.6.tar.gz/source/common/x86/pixeladd8.asm Changed
183
 
1
@@ -398,6 +398,52 @@
2
 
3
     jnz         .loop
4
     RET
5
+
6
+INIT_YMM avx2
7
+cglobal pixel_add_ps_16x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1    ; dest = saturate_u8(src0 + src1); src0 is bytes, src1 is 16-bit
8
+    mov         r6d,        %2/4    ; loop counter: four rows are handled per iteration
9
+    add         r5,         r5    ; src1 stride doubled to bytes (src1 rows are read as 16 words each)
10
+.loop:
11
+
12
+    pmovzxbw    m0,         [r2]        ; row 0 of src0
13
+    pmovzxbw    m1,         [r2 + r4]   ; row 1 of src0
14
+    movu        m2,        [r3]        ; row 0 of src1
15
+    movu        m3,        [r3 + r5]   ; row 1 of src1
16
+    paddw       m0,         m2
17
+    paddw       m1,         m3
18
+    packuswb    m0,         m1    ; saturating pack works per 128-bit lane, so rows 0 and 1 end up interleaved by qword
19
+
20
+    lea         r2,         [r2 + r4 * 2]
21
+    lea         r3,         [r3 + r5 * 2]
22
+
23
+    pmovzxbw    m2,         [r2]        ; row 2 of src0
24
+    pmovzxbw    m3,         [r2 + r4]   ; row 3 of src0
25
+    movu        m4,        [r3]        ; row 2 of src1
26
+    movu        m5,        [r3 + r5]   ; row 3 of src1
27
+    paddw       m2,         m4
28
+    paddw       m3,         m5
29
+    packuswb    m2,         m3    ; rows 2 and 3, interleaved by qword in the same way
30
+
31
+    lea         r2,         [r2 + r4 * 2]
32
+    lea         r3,         [r3 + r5 * 2]
33
+
34
+    vpermq      m0, m0, 11011000b    ; qword order 0,2,1,3: low 128 bits = row 0, high 128 bits = row 1
35
+    movu        [r0],      xm0           ; row 0 of dst
36
+    vextracti128 xm3, m0, 1
37
+    movu        [r0 + r1], xm3           ; row 1 of dst
38
+
39
+    lea         r0,         [r0 + r1 * 2]
40
+    vpermq      m2, m2, 11011000b    ; same qword reordering for rows 2 and 3
41
+    movu        [r0],      xm2           ; row 2 of dst
42
+    vextracti128 xm3, m2, 1
43
+    movu         [r0 + r1], xm3          ; row 3 of dst
44
+
45
+    lea         r0,         [r0 + r1 * 2]
46
+
47
+    dec         r6d
48
+    jnz         .loop
49
+
50
+    RET
51
 %endif
52
 %endmacro
53
 
54
@@ -523,6 +569,67 @@
55
 
56
     jnz         .loop
57
     RET
58
+
59
+INIT_YMM avx2
60
+cglobal pixel_add_ps_32x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1    ; dest = saturate_u8(src0 + src1); src0 is bytes, src1 is 16-bit
61
+    mov         r6d,        %2/4    ; loop counter: four rows are handled per iteration
62
+    add         r5,         r5    ; src1 stride doubled to bytes (each src1 row is 32 words = two ymm loads)
63
+.loop:
64
+    pmovzxbw    m0,         [r2]                ; first half of row 0 of src0
65
+    pmovzxbw    m1,         [r2 + 16]           ; second half of row 0 of src0
66
+    movu        m2,         [r3]                ; first half of row 0 of src1
67
+    movu        m3,         [r3 + 32]           ; second half of row 0 of src1
68
+
69
+    paddw       m0,         m2
70
+    paddw       m1,         m3
71
+    packuswb    m0,         m1    ; saturating pack works per 128-bit lane, leaving the two halves interleaved by qword
72
+    vpermq      m0, m0, 11011000b    ; qword order 0,2,1,3 restores left-to-right pixel order
73
+    movu        [r0],      m0                   ; row 0 of dst
74
+
75
+    pmovzxbw    m0,         [r2 + r4]           ; first half of row 1 of src0
76
+    pmovzxbw    m1,         [r2 + r4 + 16]      ; second half of row 1 of src0
77
+    movu        m2,         [r3 + r5]           ; first half of row 1 of src1
78
+    movu        m3,         [r3 + r5 + 32]      ; second half of row 1 of src1
79
+
80
+    paddw       m0,         m2
81
+    paddw       m1,         m3
82
+    packuswb    m0,         m1
83
+    vpermq      m0, m0, 11011000b
84
+    movu        [r0 + r1],      m0              ; row 1 of dst
85
+
86
+    lea         r2,         [r2 + r4 * 2]    ; advance all three pointers by two rows
87
+    lea         r3,         [r3 + r5 * 2]
88
+    lea         r0,         [r0 + r1 * 2]
89
+
90
+    pmovzxbw    m0,         [r2]                ; first half of row 2 of src0
91
+    pmovzxbw    m1,         [r2 + 16]           ; second half of row 2 of src0
92
+    movu        m2,         [r3]                ; first half of row 2 of src1
93
+    movu        m3,         [r3 + 32]           ; second half of row 2 of src1
94
+
95
+    paddw       m0,         m2
96
+    paddw       m1,         m3
97
+    packuswb    m0,         m1
98
+    vpermq      m0, m0, 11011000b
99
+    movu        [r0],      m0                   ; row 2 of dst
100
+
101
+    pmovzxbw    m0,         [r2 + r4]           ; first half of row 3 of src0
102
+    pmovzxbw    m1,         [r2 + r4 + 16]      ; second half of row 3 of src0
103
+    movu        m2,         [r3 + r5]           ; first half of row 3 of src1
104
+    movu        m3,         [r3 + r5 + 32]      ; second half of row 3 of src1
105
+
106
+    paddw       m0,         m2
107
+    paddw       m1,         m3
108
+    packuswb    m0,         m1
109
+    vpermq      m0, m0, 11011000b
110
+    movu        [r0 + r1],      m0              ; row 3 of dst
111
+
112
+    lea         r2,         [r2 + r4 * 2]    ; advance all three pointers by two more rows
113
+    lea         r3,         [r3 + r5 * 2]
114
+    lea         r0,         [r0 + r1 * 2]
115
+
116
+    dec         r6d
117
+    jnz         .loop
118
+    RET
119
 %endif
120
 %endmacro
121
 
122
@@ -734,6 +841,60 @@
123
 
124
     jnz         .loop
125
     RET
126
+
127
+INIT_YMM avx2
128
+cglobal pixel_add_ps_64x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1    ; dest = saturate_u8(src0 + src1); src0 is bytes, src1 is 16-bit
129
+    mov         r6d,        %2/2    ; loop counter: two rows are handled per iteration
130
+    add         r5,         r5    ; src1 stride doubled to bytes (each src1 row is 64 words = four ymm loads)
131
+.loop:
132
+    pmovzxbw    m0,         [r2]                ; first 16 of row 0 of src0
133
+    pmovzxbw    m1,         [r2 + 16]           ; second 16 of row 0 of src0
134
+    pmovzxbw    m2,         [r2 + 32]           ; third 16 of row 0 of src0
135
+    pmovzxbw    m3,         [r2 + 48]           ; fourth 16 of row 0 of src0
136
+    movu        m4,         [r3]                ; first 16 of row 0 of src1
137
+    movu        m5,         [r3 + 32]           ; second 16 of row 0 of src1
138
+    movu        m6,         [r3 + 64]           ; third 16 of row 0 of src1
139
+    movu        m7,         [r3 + 96]           ; fourth 16 of row 0 of src1
140
+
141
+    paddw       m0,         m4
142
+    paddw       m1,         m5
143
+    paddw       m2,         m6
144
+    paddw       m3,         m7
145
+    packuswb    m0,         m1    ; saturating pack works per 128-bit lane, leaving each pair interleaved by qword
146
+    packuswb    m2,         m3
147
+    vpermq      m0, m0, 11011000b    ; qword order 0,2,1,3 restores left-to-right pixel order
148
+    movu        [r0],      m0                   ; first 32 of row 0 of dst
149
+    vpermq      m2, m2, 11011000b
150
+    movu        [r0 + 32],      m2              ; second 32 of row 0 of dst
151
+
152
+    pmovzxbw    m0,         [r2 + r4]           ; first 16 of row 1 of src0
153
+    pmovzxbw    m1,         [r2 + r4 + 16]      ; second 16 of row 1 of src0
154
+    pmovzxbw    m2,         [r2 + r4 + 32]      ; third 16 of row 1 of src0
155
+    pmovzxbw    m3,         [r2 + r4 + 48]      ; fourth 16 of row 1 of src0
156
+    movu        m4,         [r3 + r5]           ; first 16 of row 1 of src1
157
+    movu        m5,         [r3 + r5 + 32]      ; second 16 of row 1 of src1
158
+    movu        m6,         [r3 + r5 + 64]      ; third 16 of row 1 of src1
159
+    movu        m7,         [r3 + r5 + 96]      ; fourth 16 of row 1 of src1
160
+
161
+    paddw       m0,         m4
162
+    paddw       m1,         m5
163
+    paddw       m2,         m6
164
+    paddw       m3,         m7
165
+    packuswb    m0,         m1
166
+    packuswb    m2,         m3
167
+    vpermq      m0, m0, 11011000b
168
+    movu        [r0 + r1],      m0              ; first 32 of row 1 of dst
169
+    vpermq      m2, m2, 11011000b
170
+    movu        [r0 + r1 + 32],      m2         ; second 32 of row 1 of dst
171
+
172
+    lea         r2,         [r2 + r4 * 2]    ; advance all three pointers by two rows
173
+    lea         r3,         [r3 + r5 * 2]
174
+    lea         r0,         [r0 + r1 * 2]
175
+
176
+    dec         r6d
177
+    jnz         .loop
178
+    RET
179
+
180
 %endif
181
 %endmacro
182
 
183
x265_1.5.tar.gz/source/common/x86/sad-a.asm -> x265_1.6.tar.gz/source/common/x86/sad-a.asm Changed
201
 
1
@@ -3710,3 +3710,749 @@
2
 SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3
3
 SADX34_CACHELINE_FUNC 16,  8, 64, sse2, ssse3, ssse3
4
 
5
+%if HIGH_BIT_DEPTH==0
6
+INIT_YMM avx2
7
+cglobal pixel_sad_x3_8x4, 6,6,5    ; SAD of one 8x4 block (r0) against three candidates (r1, r2, r3) sharing stride r4; three dword totals go to [r5]
8
+    xorps           m0, m0    ; xm0 accumulates candidate 0 (low qword) and candidate 1 (high qword)
9
+    xorps           m1, m1    ; xm1 accumulates candidate 2 (low qword)
10
+
11
+    sub             r2, r1          ; rebase on pointer r1
12
+    sub             r3, r1    ; r2/r3 are now offsets, so advancing r1 alone moves all three candidates
13
+
14
+    ; row 0
15
+    vpbroadcastq   xm2, [r0 + 0 * FENC_STRIDE]    ; the 8 pixels of r0's row, copied into both qwords
16
+    movq           xm3, [r1]    ; candidate 0 row in the low qword
17
+    movhps         xm3, [r1 + r2]    ; candidate 1 row in the high qword
18
+    movq           xm4, [r1 + r3]    ; candidate 2 row; the high qword of xm4 is zero
19
+    psadbw         xm3, xm2    ; one 8-byte SAD per qword
20
+    psadbw         xm4, xm2    ; only the low-qword result is meaningful here
21
+    paddd          xm0, xm3
22
+    paddd          xm1, xm4
23
+    add             r1, r4    ; next candidate row
24
+
25
+    ; row 1
26
+    vpbroadcastq   xm2, [r0 + 1 * FENC_STRIDE]
27
+    movq           xm3, [r1]
28
+    movhps         xm3, [r1 + r2]
29
+    movq           xm4, [r1 + r3]
30
+    psadbw         xm3, xm2
31
+    psadbw         xm4, xm2
32
+    paddd          xm0, xm3
33
+    paddd          xm1, xm4
34
+    add             r1, r4
35
+
36
+    ; row 2
37
+    vpbroadcastq   xm2, [r0 + 2 * FENC_STRIDE]
38
+    movq           xm3, [r1]
39
+    movhps         xm3, [r1 + r2]
40
+    movq           xm4, [r1 + r3]
41
+    psadbw         xm3, xm2
42
+    psadbw         xm4, xm2
43
+    paddd          xm0, xm3
44
+    paddd          xm1, xm4
45
+    add             r1, r4
46
+
47
+    ; row 3
48
+    vpbroadcastq   xm2, [r0 + 3 * FENC_STRIDE]
49
+    movq           xm3, [r1]
50
+    movhps         xm3, [r1 + r2]
51
+    movq           xm4, [r1 + r3]
52
+    psadbw         xm3, xm2
53
+    psadbw         xm4, xm2
54
+    paddd          xm0, xm3
55
+    paddd          xm1, xm4
56
+
57
+    pshufd          xm0, xm0, q0020    ; gather dwords 0 and 2 (the two totals) into the low qword
58
+    movq            [r5 + 0], xm0    ; totals for candidates 0 and 1
59
+    movd            [r5 + 8], xm1    ; total for candidate 2
60
+    RET
61
+
62
+INIT_YMM avx2
63
+cglobal pixel_sad_x3_8x8, 6,6,5    ; SAD of one 8x8 block (r0) against three candidates (r1, r2, r3) sharing stride r4; three dword totals go to [r5]
64
+    xorps           m0, m0    ; xm0 accumulates candidate 0 (low qword) and candidate 1 (high qword)
65
+    xorps           m1, m1    ; xm1 accumulates candidate 2 (low qword)
66
+
67
+    sub             r2, r1          ; rebase on pointer r1
68
+    sub             r3, r1    ; r2/r3 are now offsets, so advancing r1 alone moves all three candidates
69
+%assign x 0
70
+%rep 4    ; fully unrolled, two rows per repetition
71
+    ; row 0
72
+    vpbroadcastq   xm2, [r0 + 0 * FENC_STRIDE]    ; the 8 pixels of r0's row, copied into both qwords
73
+    movq           xm3, [r1]    ; candidate 0 row in the low qword
74
+    movhps         xm3, [r1 + r2]    ; candidate 1 row in the high qword
75
+    movq           xm4, [r1 + r3]    ; candidate 2 row; the high qword of xm4 is zero
76
+    psadbw         xm3, xm2    ; one 8-byte SAD per qword
77
+    psadbw         xm4, xm2    ; only the low-qword result is meaningful here
78
+    paddd          xm0, xm3
79
+    paddd          xm1, xm4
80
+    add             r1, r4    ; next candidate row
81
+
82
+    ; row 1
83
+    vpbroadcastq   xm2, [r0 + 1 * FENC_STRIDE]
84
+    movq           xm3, [r1]
85
+    movhps         xm3, [r1 + r2]
86
+    movq           xm4, [r1 + r3]
87
+    psadbw         xm3, xm2
88
+    psadbw         xm4, xm2
89
+    paddd          xm0, xm3
90
+    paddd          xm1, xm4
91
+
92
+%assign x x+1
93
+  %if x < 4    ; skip the pointer bumps after the last pair of rows
94
+    add             r1, r4
95
+    add             r0, 2 * FENC_STRIDE
96
+  %endif
97
+%endrep
98
+
99
+    pshufd          xm0, xm0, q0020    ; gather dwords 0 and 2 (the two totals) into the low qword
100
+    movq            [r5 + 0], xm0    ; totals for candidates 0 and 1
101
+    movd            [r5 + 8], xm1    ; total for candidate 2
102
+    RET
103
+
104
+INIT_YMM avx2
105
+cglobal pixel_sad_x3_8x16, 6,6,5    ; SAD of one 8x16 block (r0) against three candidates (r1, r2, r3) sharing stride r4; three dword totals go to [r5]
106
+    xorps           m0, m0    ; xm0 accumulates candidate 0 (low qword) and candidate 1 (high qword)
107
+    xorps           m1, m1    ; xm1 accumulates candidate 2 (low qword)
108
+
109
+    sub             r2, r1          ; rebase on pointer r1
110
+    sub             r3, r1    ; r2/r3 are now offsets, so advancing r1 alone moves all three candidates
111
+%assign x 0
112
+%rep 8    ; fully unrolled, two rows per repetition
113
+    ; row 0
114
+    vpbroadcastq   xm2, [r0 + 0 * FENC_STRIDE]    ; the 8 pixels of r0's row, copied into both qwords
115
+    movq           xm3, [r1]    ; candidate 0 row in the low qword
116
+    movhps         xm3, [r1 + r2]    ; candidate 1 row in the high qword
117
+    movq           xm4, [r1 + r3]    ; candidate 2 row; the high qword of xm4 is zero
118
+    psadbw         xm3, xm2    ; one 8-byte SAD per qword
119
+    psadbw         xm4, xm2    ; only the low-qword result is meaningful here
120
+    paddd          xm0, xm3
121
+    paddd          xm1, xm4
122
+    add             r1, r4    ; next candidate row
123
+
124
+    ; row 1
125
+    vpbroadcastq   xm2, [r0 + 1 * FENC_STRIDE]
126
+    movq           xm3, [r1]
127
+    movhps         xm3, [r1 + r2]
128
+    movq           xm4, [r1 + r3]
129
+    psadbw         xm3, xm2
130
+    psadbw         xm4, xm2
131
+    paddd          xm0, xm3
132
+    paddd          xm1, xm4
133
+
134
+%assign x x+1
135
+  %if x < 8    ; skip the pointer bumps after the last pair of rows
136
+    add             r1, r4
137
+    add             r0, 2 * FENC_STRIDE
138
+  %endif
139
+%endrep
140
+
141
+    pshufd          xm0, xm0, q0020    ; gather dwords 0 and 2 (the two totals) into the low qword
142
+    movq            [r5 + 0], xm0    ; totals for candidates 0 and 1
143
+    movd            [r5 + 8], xm1    ; total for candidate 2
144
+    RET
145
+
146
+INIT_YMM avx2
147
+cglobal pixel_sad_x4_8x8, 7,7,5    ; SAD of one 8x8 block (r0) against four candidates (r1..r4) sharing stride r5; four dword totals go to [r6]
148
+    xorps           m0, m0    ; xm0 accumulates candidate 0 (low qword) and candidate 1 (high qword)
149
+    xorps           m1, m1    ; xm1 accumulates candidate 2 (low qword) and candidate 3 (high qword)
150
+
151
+    sub             r2, r1          ; rebase on pointer r1
152
+    sub             r3, r1
153
+    sub             r4, r1    ; r2..r4 are now offsets, so advancing r1 alone moves all four candidates
154
+%assign x 0
155
+%rep 4    ; fully unrolled, two rows per repetition
156
+    ; row 0
157
+    vpbroadcastq   xm2, [r0 + 0 * FENC_STRIDE]    ; the 8 pixels of r0's row, copied into both qwords
158
+    movq           xm3, [r1]    ; candidate 0 row in the low qword
159
+    movhps         xm3, [r1 + r2]    ; candidate 1 row in the high qword
160
+    movq           xm4, [r1 + r3]    ; candidate 2 row in the low qword
161
+    movhps         xm4, [r1 + r4]    ; candidate 3 row in the high qword
162
+    psadbw         xm3, xm2    ; one 8-byte SAD per qword
163
+    psadbw         xm4, xm2
164
+    paddd          xm0, xm3
165
+    paddd          xm1, xm4
166
+    add             r1, r5    ; next candidate row
167
+
168
+    ; row 1
169
+    vpbroadcastq   xm2, [r0 + 1 * FENC_STRIDE]
170
+    movq           xm3, [r1]
171
+    movhps         xm3, [r1 + r2]
172
+    movq           xm4, [r1 + r3]
173
+    movhps         xm4, [r1 + r4]
174
+    psadbw         xm3, xm2
175
+    psadbw         xm4, xm2
176
+    paddd          xm0, xm3
177
+    paddd          xm1, xm4
178
+
179
+%assign x x+1
180
+  %if x < 4    ; skip the pointer bumps after the last pair of rows
181
+    add             r1, r5
182
+    add             r0, 2 * FENC_STRIDE
183
+  %endif
184
+%endrep
185
+
186
+    pshufd          xm0, xm0, q0020    ; gather dwords 0 and 2 (the two totals) into the low qword
187
+    pshufd          xm1, xm1, q0020
188
+    movq            [r6 + 0], xm0    ; totals for candidates 0 and 1
189
+    movq            [r6 + 8], xm1    ; totals for candidates 2 and 3
190
+    RET
191
+
192
+INIT_YMM avx2
193
+cglobal pixel_sad_32x8, 4,4,6
194
+    xorps           m0, m0
195
+    xorps           m5, m5
196
+
197
+    movu           m1, [r0]               ; row 0 of pix0
198
+    movu           m2, [r2]               ; row 0 of pix1
199
+    movu           m3, [r0 + r1]          ; row 1 of pix0
200
+    movu           m4, [r2 + r3]          ; row 1 of pix1
201
x265_1.5.tar.gz/source/common/x86/ssd-a.asm -> x265_1.6.tar.gz/source/common/x86/ssd-a.asm Changed
61
 
1
@@ -822,10 +822,10 @@
2
 
3
 %if HIGH_BIT_DEPTH == 0
4
 %macro SSD_LOAD_FULL 5
5
-    mova      m1, [t0+%1]
6
-    mova      m2, [t2+%2]
7
-    mova      m3, [t0+%3]
8
-    mova      m4, [t2+%4]
9
+    movu      m1, [t0+%1]
10
+    movu      m2, [t2+%2]
11
+    movu      m3, [t0+%3]
12
+    movu      m4, [t2+%4]
13
 %if %5==1
14
     add       t0, t1
15
     add       t2, t3
16
@@ -1094,6 +1094,8 @@
17
 INIT_YMM avx2
18
 SSD 16, 16
19
 SSD 16,  8
20
+SSD 32, 32
21
+SSD 64, 64
22
 %assign function_align 16
23
 %endif ; !HIGH_BIT_DEPTH
24
 
25
@@ -2548,6 +2550,35 @@
26
     movd    eax, m0
27
     RET
28
 
29
+INIT_YMM avx2
29
+cglobal pixel_ssd_s_16, 2,4,5 ; sum of squared 16-bit values over 16 rows starting at r0; r1 = row step; result returned in eax
30
+    add     r1, r1              ; double r1 - presumably converts a stride counted in 16-bit elements to bytes; TODO confirm against callers
31
+    lea     r3, [r1 * 3]        ; r3 = offset of the fourth row in each group
32
+    mov     r2d, 16/4           ; 4 loop iterations x 4 rows each = 16 rows
33
+    pxor    m0, m0              ; m0 = running dword accumulators
34
+.loop:
35
+    movu    m1, [r0]            ; four consecutive rows, one full vector register per row
36
+    movu    m2, [r0 + r1]
37
+    movu    m3, [r0 + 2 * r1]
38
+    movu    m4, [r0 + r3]
39
+
40
+    lea     r0, [r0 + r1 * 4]   ; advance to the next group of four rows
41
+    pmaddwd m1, m1              ; square each signed word and add adjacent pairs into dwords
42
+    pmaddwd m2, m2
43
+    pmaddwd m3, m3
44
+    pmaddwd m4, m4
45
+    paddd   m1, m2
46
+    paddd   m3, m4
47
+    paddd   m1, m3
48
+    paddd   m0, m1
49
+
50
+    dec     r2d
51
+    jnz    .loop
52
+
53
+    ; calculate sum and return
54
+    HADDD   m0, m1              ; presumably a horizontal add of the dword lanes using m1 as scratch - macro defined outside this chunk, TODO confirm
55
+    movd    eax, xm0            ; low dword of m0 is the return value
56
+    RET
58
 
59
 INIT_YMM avx2
60
 cglobal pixel_ssd_s_32, 2,4,5
61
x265_1.5.tar.gz/source/encoder/analysis.cpp -> x265_1.6.tar.gz/source/encoder/analysis.cpp Changed
201
 
1
@@ -71,9 +71,10 @@
2
 
3
 Analysis::Analysis()
4
 {
5
-    m_totalNumJobs = m_numAcquiredJobs = m_numCompletedJobs = 0;
6
     m_reuseIntraDataCTU = NULL;
7
     m_reuseInterDataCTU = NULL;
8
+    m_reuseRef = NULL;
9
+    m_reuseBestMergeCand = NULL;
10
 }
11
 
12
 bool Analysis::create(ThreadLocalData *tld)
13
@@ -125,6 +126,11 @@
14
     m_slice = ctu.m_slice;
15
     m_frame = &frame;
16
 
17
+#if _DEBUG || CHECKED_BUILD
18
+    for (uint32_t i = 0; i <= g_maxCUDepth; i++)
19
+        for (uint32_t j = 0; j < MAX_PRED_TYPES; j++)
20
+            m_modeDepth[i].pred[j].invalidate();
21
+#endif
22
     invalidateContexts(0);
23
     m_quant.setQPforQuant(ctu);
24
     m_rqt[0].cur.load(initialContext);
25
@@ -139,10 +145,13 @@
26
         {
27
             int numPredDir = m_slice->isInterP() ? 1 : 2;
28
             m_reuseInterDataCTU = (analysis_inter_data *)m_frame->m_analysisData.interData;
29
-            reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
30
+            m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
31
+            m_reuseBestMergeCand = &m_reuseInterDataCTU->bestMergeCand[ctu.m_cuAddr * CUGeom::MAX_GEOMS];
32
         }
33
     }
34
 
35
+    ProfileCUScope(ctu, totalCTUTime, totalCTUs);
36
+
37
     uint32_t zOrder = 0;
38
     if (m_slice->m_sliceType == I_SLICE)
39
     {
40
@@ -153,6 +162,7 @@
41
             memcpy(&m_reuseIntraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
42
             memcpy(&m_reuseIntraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
43
             memcpy(&m_reuseIntraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
44
+            memcpy(&m_reuseIntraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], bestCU->m_chromaIntraDir, sizeof(uint8_t) * numPartition);
45
         }
46
     }
47
     else
48
@@ -196,14 +206,16 @@
49
         return;
50
     else if (md.bestMode->cu.isIntra(0))
51
     {
52
+        md.pred[PRED_LOSSLESS].initCosts();
53
         md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
54
         PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
55
         uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir;
56
-        checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes);
57
+        checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes, NULL);
58
         checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
59
     }
60
     else
61
     {
62
+        md.pred[PRED_LOSSLESS].initCosts();
63
         md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
64
         md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv);
65
         encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom);
66
@@ -225,15 +237,16 @@
67
         uint8_t* reuseDepth  = &m_reuseIntraDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
68
         uint8_t* reuseModes  = &m_reuseIntraDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
69
         char* reusePartSizes = &m_reuseIntraDataCTU->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
70
+        uint8_t* reuseChromaModes = &m_reuseIntraDataCTU->chromaModes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
71
 
72
-        if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.encodeIdx)
73
+        if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.absPartIdx)
74
         {
75
             m_quant.setQPforQuant(parentCTU);
76
 
77
             PartSize size = (PartSize)reusePartSizes[zOrder];
78
             Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
79
             mode.cu.initSubCU(parentCTU, cuGeom);
80
-            checkIntra(mode, cuGeom, size, &reuseModes[zOrder]);
81
+            checkIntra(mode, cuGeom, size, &reuseModes[zOrder], &reuseChromaModes[zOrder]);
82
             checkBestMode(mode, depth);
83
 
84
             if (m_bTryLossless)
85
@@ -252,13 +265,13 @@
86
         m_quant.setQPforQuant(parentCTU);
87
 
88
         md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
89
-        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL);
90
+        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL, NULL);
91
         checkBestMode(md.pred[PRED_INTRA], depth);
92
 
93
-        if (depth == g_maxCUDepth)
94
+        if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
95
         {
96
             md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
97
-            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL);
98
+            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL, NULL);
99
             checkBestMode(md.pred[PRED_INTRA_NxN], depth);
100
         }
101
 
102
@@ -286,7 +299,7 @@
103
             const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
104
             if (childGeom.flags & CUGeom::PRESENT)
105
             {
106
-                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
107
+                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
108
                 m_rqt[nextDepth].cur.load(*nextContext);
109
                 compressIntraCU(parentCTU, childGeom, zOrder);
110
 
111
@@ -308,203 +321,173 @@
112
             addSplitFlagCost(*splitPred, cuGeom.depth);
113
         else
114
             updateModeCost(*splitPred);
115
+
116
+        checkDQPForSplitPred(splitPred->cu, cuGeom);
117
         checkBestMode(*splitPred, depth);
118
     }
119
 
120
-    checkDQP(md.bestMode->cu, cuGeom);
121
-
122
     /* Copy best data to encData CTU and recon */
123
     md.bestMode->cu.copyToPic(depth);
124
     if (md.bestMode != &md.pred[PRED_SPLIT])
125
-        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx);
126
+        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
127
 }
128
 
129
-bool Analysis::findJob(int threadId)
130
+void Analysis::PMODE::processTasks(int workerThreadId)
131
 {
132
-    /* try to acquire a CU mode to analyze */
133
-    m_pmodeLock.acquire();
134
-    if (m_totalNumJobs > m_numAcquiredJobs)
135
-    {
136
-        int id = m_numAcquiredJobs++;
137
-        m_pmodeLock.release();
138
-
139
-        ProfileScopeEvent(pmode);
140
-        parallelModeAnalysis(threadId, id);
141
-
142
-        m_pmodeLock.acquire();
143
-        if (++m_numCompletedJobs == m_totalNumJobs)
144
-            m_modeCompletionEvent.trigger();
145
-        m_pmodeLock.release();
146
-        return true;
147
-    }
148
-    else
149
-        m_pmodeLock.release();
150
-
151
-    m_meLock.acquire();
152
-    if (m_totalNumME > m_numAcquiredME)
153
-    {
154
-        int id = m_numAcquiredME++;
155
-        m_meLock.release();
156
-
157
-        ProfileScopeEvent(pme);
158
-        parallelME(threadId, id);
159
-
160
-        m_meLock.acquire();
161
-        if (++m_numCompletedME == m_totalNumME)
162
-            m_meCompletionEvent.trigger();
163
-        m_meLock.release();
164
-        return true;
165
-    }
166
-    else
167
-        m_meLock.release();
168
-
169
-    return false;
170
+#if DETAILED_CU_STATS
171
+    int fe = master.m_modeDepth[cuGeom.depth].pred[PRED_2Nx2N].cu.m_encData->m_frameEncoderID;
172
+    master.m_stats[fe].countPModeTasks++;
173
+    ScopedElapsedTime pmodeTime(master.m_stats[fe].pmodeTime);
174
+#endif
175
+    ProfileScopeEvent(pmode);
176
+    master.processPmode(*this, master.m_tld[workerThreadId].analysis);
177
 }
178
 
179
-void Analysis::parallelME(int threadId, int meId)
180
+/* process pmode jobs until none remain; may be called by the master thread or by
181
+ * a bonded peer (slave) thread via pmodeTasks() */
182
+void Analysis::processPmode(PMODE& pmode, Analysis& slave)
183
 {
184
-    Analysis* slave;
185
-
186
-    if (threadId == -1)
187
-        slave = this;
188
-    else
189
+    /* acquire a mode task, else exit early */
190
+    int task;
191
+    pmode.m_lock.acquire();
192
+    if (pmode.m_jobTotal > pmode.m_jobAcquired)
193
     {
194
-        slave = &m_tld[threadId].analysis;
195
-        slave->setQP(*m_slice, m_rdCost.m_qp);
196
-        slave->m_slice = m_slice;
197
-        slave->m_frame = m_frame;
198
-
199
-        slave->m_me.setSourcePU(*m_curInterMode->fencYuv, m_curInterMode->cu.m_cuAddr, m_curGeom->encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
200
-        slave->prepMotionCompensation(m_curInterMode->cu, *m_curGeom, m_curPart);
201
x265_1.5.tar.gz/source/encoder/analysis.h -> x265_1.6.tar.gz/source/encoder/analysis.h Changed
91
 
1
@@ -70,30 +70,43 @@
2
         CUDataMemPool  cuMemPool;
3
     };
4
 
5
+    class PMODE : public BondedTaskGroup // task group whose processTasks() forwards work to master.processPmode()
6
+    {
7
+    public:
8
+
9
+        Analysis&     master; // the Analysis instance passed to the constructor; processTasks() calls back into it
10
+        const CUGeom& cuGeom; // geometry passed to the constructor; held by reference, so it must outlive this object
11
+        int           modes[MAX_PRED_TYPES]; // NOTE(review): presumably the prediction-mode ids of the queued tasks - not filled in anywhere in this chunk, confirm in analysis.cpp
12
+
13
+        PMODE(Analysis& m, const CUGeom& g) : master(m), cuGeom(g) {} // only binds the two references; modes[] is left uninitialized here
14
+
15
+        void processTasks(int workerThreadId); // defined in analysis.cpp; workerThreadId indexes master.m_tld there
16
+
17
+    protected:
18
+
19
+        PMODE operator=(const PMODE&); // NOTE(review): declared only, no definition visible here - presumably to forbid assignment (reference members); returns by value rather than PMODE&, confirm intent
20
+    };
21
+
22
+    void processPmode(PMODE& pmode, Analysis& slave);
23
+
24
     ModeDepth m_modeDepth[NUM_CU_DEPTH];
25
     bool      m_bTryLossless;
26
     bool      m_bChromaSa8d;
27
 
28
-    /* Analysis data for load/save modes, keeps getting incremented as CTU analysis proceeds and data is consumed or read */
29
-    analysis_intra_data* m_reuseIntraDataCTU;
30
-    analysis_inter_data* m_reuseInterDataCTU;
31
-    int32_t* reuseRef;
32
     Analysis();
33
+
34
     bool create(ThreadLocalData* tld);
35
     void destroy();
36
+
37
     Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
38
 
39
 protected:
40
 
41
-    /* mode analysis distribution */
42
-    int           m_totalNumJobs;
43
-    volatile int  m_numAcquiredJobs;
44
-    volatile int  m_numCompletedJobs;
45
-    Lock          m_pmodeLock;
46
-    Event         m_modeCompletionEvent;
47
-    bool findJob(int threadId);
48
-    void parallelModeAnalysis(int threadId, int jobId);
49
-    void parallelME(int threadId, int meId);
50
+    /* Analysis data for load/save modes, keeps getting incremented as CTU analysis proceeds and data is consumed or read */
51
+    analysis_intra_data* m_reuseIntraDataCTU;
52
+    analysis_inter_data* m_reuseInterDataCTU;
53
+    int32_t*             m_reuseRef;
54
+    uint32_t*            m_reuseBestMergeCand;
55
 
56
     /* full analysis for an I-slice CU */
57
     void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder);
58
@@ -105,7 +118,7 @@
59
 
60
     /* measure merge and skip */
61
     void checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom);
62
-    void checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom);
63
+    void checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom, bool isSkipMode);
64
 
65
     /* measure inter options */
66
     void checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize);
67
@@ -119,9 +132,6 @@
68
     /* add the RD cost of coding a split flag (0 or 1) to the given mode */
69
     void addSplitFlagCost(Mode& mode, uint32_t depth);
70
 
71
-    /* update CBF flags and QP values to be internally consistent */
72
-    void checkDQP(CUData& cu, const CUGeom& cuGeom);
73
-
74
     /* work-avoidance heuristics for RD levels < 5 */
75
     uint32_t topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom);
76
     bool recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode);
77
@@ -129,9 +139,13 @@
78
     /* generate residual and recon pixels for an entire CTU recursively (RD0) */
79
     void encodeResidue(const CUData& parentCTU, const CUGeom& cuGeom);
80
 
81
+    int calculateQpforCuSize(CUData& ctu, const CUGeom& cuGeom);
82
+
83
     /* check whether current mode is the new best */
84
     inline void checkBestMode(Mode& mode, uint32_t depth)
85
     {
86
+        X265_CHECK(mode.ok(), "mode costs are uninitialized\n");
87
+
88
         ModeDepth& md = m_modeDepth[depth];
89
         if (md.bestMode)
90
         {
91
x265_1.5.tar.gz/source/encoder/api.cpp -> x265_1.6.tar.gz/source/encoder/api.cpp Changed
64
 
1
@@ -173,6 +173,7 @@
2
     {
3
         Encoder *encoder = static_cast<Encoder*>(enc);
4
 
5
+        encoder->stop();
6
         encoder->printSummary();
7
         encoder->destroy();
8
         delete encoder;
9
@@ -183,6 +184,8 @@
10
 void x265_cleanup(void)
11
 {
12
     BitCost::destroy();
13
+    CUData::s_partSet[0] = NULL; /* allow CUData to adjust to new CTU size */
14
+    g_ctuSizeConfigured = 0;
15
 }
16
 
17
 extern "C"
18
@@ -206,7 +209,7 @@
19
 
20
         uint32_t numCUsInFrame   = widthInCU * heightInCU;
21
         pic->analysisData.numCUsInFrame = numCUsInFrame;
22
-        pic->analysisData.numPartitions = NUM_CU_PARTITIONS;
23
+        pic->analysisData.numPartitions = NUM_4x4_PARTITIONS;
24
     }
25
 }
26
 
27
@@ -215,3 +218,36 @@
28
 {
29
     return x265_free(p);
30
 }
31
+
32
+static const x265_api libapi = // table of this build's public entry points, handed out by x265_api_get()
32
+{ // positional initializer: NOTE(review) entry order must match the member order of x265_api (declared outside this chunk) - confirm when adding entries
33
+    &x265_param_alloc,
34
+    &x265_param_free,
35
+    &x265_param_default,
36
+    &x265_param_parse,
37
+    &x265_param_apply_profile,
38
+    &x265_param_default_preset,
39
+    &x265_picture_alloc,
40
+    &x265_picture_free,
41
+    &x265_picture_init,
42
+    &x265_encoder_open,
43
+    &x265_encoder_parameters,
44
+    &x265_encoder_headers,
45
+    &x265_encoder_encode,
46
+    &x265_encoder_get_stats,
47
+    &x265_encoder_log,
48
+    &x265_encoder_close,
49
+    &x265_cleanup,
50
+    x265_version_str, // the last three entries are listed without '&', unlike the function entries above
51
+    x265_build_info_str,
52
+    x265_max_bit_depth,
53
+};
55
+
56
+extern "C"
56
+const x265_api* x265_api_get(int bitDepth) // returns the entry-point table for this build, or NULL when a specific bit depth is requested that differs from X265_DEPTH
57
+{
58
+    if (bitDepth && bitDepth != X265_DEPTH) // bitDepth == 0 skips the check and accepts whatever depth this build has
59
+        return NULL;
60
+
61
+    return &libapi; // pointer to a file-static const table, so the caller must not free it
62
+}
64
x265_1.5.tar.gz/source/encoder/dpb.cpp -> x265_1.6.tar.gz/source/encoder/dpb.cpp Changed
35
 
1
@@ -104,11 +104,14 @@
2
 
3
     if (type == X265_TYPE_B)
4
     {
5
-        // change from _R "referenced" to _N "non-referenced" NAL unit type
6
+        newFrame->m_encData->m_bHasReferences = false;
7
+
8
+        // Adjust NAL type for unreferenced B frames (change from _R "referenced"
9
+        // to _N "non-referenced" NAL unit type)
10
         switch (slice->m_nalUnitType)
11
         {
12
         case NAL_UNIT_CODED_SLICE_TRAIL_R:
13
-            slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_TRAIL_N;
14
+            slice->m_nalUnitType = m_bTemporalSublayer ? NAL_UNIT_CODED_SLICE_TSA_N : NAL_UNIT_CODED_SLICE_TRAIL_N;
15
             break;
16
         case NAL_UNIT_CODED_SLICE_RADL_R:
17
             slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RADL_N;
18
@@ -120,10 +123,12 @@
19
             break;
20
         }
21
     }
22
-
23
-    /* m_bHasReferences starts out as true for non-B pictures, and is set to false
24
-     * once no more pictures reference it */
25
-    newFrame->m_encData->m_bHasReferences = IS_REFERENCED(newFrame);
26
+    else
27
+    {
28
+        /* m_bHasReferences starts out as true for non-B pictures, and is set to false
29
+         * once no more pictures reference it */
30
+        newFrame->m_encData->m_bHasReferences = true;
31
+    }
32
 
33
     m_picList.pushFront(*newFrame);
34
 
35
x265_1.5.tar.gz/source/encoder/dpb.h -> x265_1.6.tar.gz/source/encoder/dpb.h Changed
22
 
1
@@ -39,10 +39,11 @@
2
 
3
     int                m_lastIDR;
4
     int                m_pocCRA;
5
-    bool               m_bRefreshPending;
6
     int                m_maxRefL0;
7
     int                m_maxRefL1;
8
     int                m_bOpenGOP;
9
+    bool               m_bRefreshPending;
10
+    bool               m_bTemporalSublayer;
11
     PicList            m_picList;
12
     PicList            m_freeList;
13
     FrameData*         m_picSymFreeList;
14
@@ -56,6 +57,7 @@
15
         m_maxRefL0 = param->maxNumReferences;
16
         m_maxRefL1 = param->bBPyramid ? 2 : 1;
17
         m_bOpenGOP = param->bOpenGOP;
18
+        m_bTemporalSublayer = !!param->bEnableTemporalSubLayers;
19
     }
20
 
21
     ~DPB();
22
x265_1.5.tar.gz/source/encoder/encoder.cpp -> x265_1.6.tar.gz/source/encoder/encoder.cpp Changed
201
 
1
@@ -43,7 +43,7 @@
2
 const char g_sliceTypeToChar[] = {'B', 'P', 'I'};
3
 }
4
 
5
-static const char *summaryCSVHeader =
6
+static const char* summaryCSVHeader =
7
     "Command, Date/Time, Elapsed Time, FPS, Bitrate, "
8
     "Y PSNR, U PSNR, V PSNR, Global PSNR, SSIM, SSIM (dB), "
9
     "I count, I ave-QP, I kpbs, I-PSNR Y, I-PSNR U, I-PSNR V, I-SSIM (dB), "
10
@@ -51,7 +51,7 @@
11
     "B count, B ave-QP, B kpbs, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), "
12
     "Version\n";
13
 
14
-const char* defaultAnalysisFileName = "x265_analysis.dat";
15
+static const char* defaultAnalysisFileName = "x265_analysis.dat";
16
 
17
 using namespace x265;
18
 
19
@@ -66,7 +66,6 @@
20
     m_numLumaWPBiFrames = 0;
21
     m_numChromaWPBiFrames = 0;
22
     m_lookahead = NULL;
23
-    m_frameEncoder = NULL;
24
     m_rateControl = NULL;
25
     m_dpb = NULL;
26
     m_exportedPic = NULL;
27
@@ -78,9 +77,12 @@
28
     m_cuOffsetC = NULL;
29
     m_buOffsetY = NULL;
30
     m_buOffsetC = NULL;
31
-    m_threadPool = 0;
32
-    m_numThreadLocalData = 0;
33
+    m_threadPool = NULL;
34
     m_analysisFile = NULL;
35
+    for (int i = 0; i < X265_MAX_FRAME_THREADS; i++)
36
+        m_frameEncoder[i] = NULL;
37
+
38
+    MotionEstimate::initScales();
39
 }
40
 
41
 void Encoder::create()
42
@@ -101,21 +103,35 @@
43
     if (rows == 1 || cols < 3)
44
         p->bEnableWavefront = 0;
45
 
46
-    int poolThreadCount = p->poolNumThreads ? p->poolNumThreads : getCpuCount();
47
+    bool allowPools = !p->numaPools || strcmp(p->numaPools, "none");
48
 
49
     // Trim the thread pool if --wpp, --pme, and --pmode are disabled
50
     if (!p->bEnableWavefront && !p->bDistributeModeAnalysis && !p->bDistributeMotionEstimation)
51
-        poolThreadCount = 0;
52
+        allowPools = false;
53
 
54
-    if (poolThreadCount > 1)
55
+    if (!p->frameNumThreads)
56
     {
57
-        m_threadPool = ThreadPool::allocThreadPool(poolThreadCount);
58
-        poolThreadCount = m_threadPool->getThreadCount();
59
+        // auto-detect frame threads
60
+        int cpuCount = ThreadPool::getCpuCount();
61
+        if (!p->bEnableWavefront)
62
+            p->frameNumThreads = X265_MIN3(cpuCount, (rows + 1) / 2, X265_MAX_FRAME_THREADS);
63
+        else if (cpuCount >= 32)
64
+            p->frameNumThreads = (p->sourceHeight > 2000) ? 8 : 6; // dual-socket 10-core IvyBridge or higher
65
+        else if (cpuCount >= 16)
66
+            p->frameNumThreads = 5; // 8 HT cores, or dual socket
67
+        else if (cpuCount >= 8)
68
+            p->frameNumThreads = 3; // 4 HT cores
69
+        else if (cpuCount >= 4)
70
+            p->frameNumThreads = 2; // Dual or Quad core
71
+        else
72
+            p->frameNumThreads = 1;
73
     }
74
-    else
75
-        poolThreadCount = 0;
76
 
77
-    if (!poolThreadCount)
78
+    m_numPools = 0;
79
+    if (allowPools)
80
+        m_threadPool = ThreadPool::allocThreadPools(p, m_numPools);
81
+
82
+    if (!m_numPools)
83
     {
84
         // issue warnings if any of these features were requested
85
         if (p->bEnableWavefront)
86
@@ -129,31 +145,40 @@
87
         p->bEnableWavefront = p->bDistributeModeAnalysis = p->bDistributeMotionEstimation = 0;
88
     }
89
 
90
-    if (!p->frameNumThreads)
91
-    {
92
-        // auto-detect frame threads
93
-        int cpuCount = getCpuCount();
94
-        if (!p->bEnableWavefront)
95
-            p->frameNumThreads = X265_MIN(cpuCount, (rows + 1) / 2);
96
-        else if (cpuCount >= 32)
97
-            p->frameNumThreads = (p->sourceHeight > 2000) ? 8 : 6; // dual-socket 10-core IvyBridge or higher
98
-        else if (cpuCount >= 16)
99
-            p->frameNumThreads = 5; // 8 HT cores, or dual socket
100
-        else if (cpuCount >= 8)
101
-            p->frameNumThreads = 3; // 4 HT cores
102
-        else if (cpuCount >= 4)
103
-            p->frameNumThreads = 2; // Dual or Quad core
104
-        else
105
-            p->frameNumThreads = 1;
106
-    }
107
+    char buf[128];
108
+    int len = 0;
109
+    if (p->bEnableWavefront)
110
+        len += sprintf(buf + len, "wpp(%d rows)", rows);
111
+    if (p->bDistributeModeAnalysis)
112
+        len += sprintf(buf + len, "%spmode", len ? "+" : "");
113
+    if (p->bDistributeMotionEstimation)
114
+        len += sprintf(buf + len, "%spme ", len ? "+" : "");
115
+    if (!len)
116
+        strcpy(buf, "none");
117
 
118
-    x265_log(p, X265_LOG_INFO, "WPP streams / frame threads / pool  : %d / %d / %d%s%s\n", 
119
-             p->bEnableWavefront ? rows : 0, p->frameNumThreads, poolThreadCount,
120
-             p->bDistributeMotionEstimation ? " / pme" : "", p->bDistributeModeAnalysis ? " / pmode" : "");
121
+    x265_log(p, X265_LOG_INFO, "frame threads / pool features       : %d / %s\n", p->frameNumThreads, buf);
122
 
123
-    m_frameEncoder = new FrameEncoder[m_param->frameNumThreads];
124
     for (int i = 0; i < m_param->frameNumThreads; i++)
125
-        m_frameEncoder[i].setThreadPool(m_threadPool);
126
+        m_frameEncoder[i] = new FrameEncoder;
127
+
128
+    if (m_numPools)
129
+    {
130
+        for (int i = 0; i < m_param->frameNumThreads; i++)
131
+        {
132
+            int pool = i % m_numPools;
133
+            m_frameEncoder[i]->m_pool = &m_threadPool[pool];
134
+            m_frameEncoder[i]->m_jpId = m_threadPool[pool].m_numProviders++;
135
+            m_threadPool[pool].m_jpTable[m_frameEncoder[i]->m_jpId] = m_frameEncoder[i];
136
+        }
137
+        for (int i = 0; i < m_numPools; i++)
138
+            m_threadPool[i].start();
139
+    }
140
+    else
141
+    {
142
+        /* CU stats and noise-reduction buffers are indexed by jpId, so it cannot be left as -1 */
143
+        for (int i = 0; i < m_param->frameNumThreads; i++)
144
+            m_frameEncoder[i]->m_jpId = 0;
145
+    }
146
 
147
     if (!m_scalingList.init())
148
     {
149
@@ -168,27 +193,17 @@
150
         m_aborted = true;
151
     m_scalingList.setupQuantMatrices();
152
 
153
-    /* Allocate thread local data, one for each thread pool worker and
154
-     * if --no-wpp, one for each frame encoder */
155
-    m_numThreadLocalData = poolThreadCount;
156
-    if (!m_param->bEnableWavefront)
157
-        m_numThreadLocalData += m_param->frameNumThreads;
158
-    m_threadLocalData = new ThreadLocalData[m_numThreadLocalData];
159
-    for (int i = 0; i < m_numThreadLocalData; i++)
160
+    m_lookahead = new Lookahead(m_param, m_threadPool);
161
+    if (m_numPools)
162
     {
163
-        m_threadLocalData[i].analysis.setThreadPool(m_threadPool);
164
-        m_threadLocalData[i].analysis.initSearch(*m_param, m_scalingList);
165
-        m_threadLocalData[i].analysis.create(m_threadLocalData);
166
+        m_lookahead->m_jpId = m_threadPool[0].m_numProviders++;
167
+        m_threadPool[0].m_jpTable[m_lookahead->m_jpId] = m_lookahead;
168
     }
169
 
170
-    if (!m_param->bEnableWavefront)
171
-        for (int i = 0; i < m_param->frameNumThreads; i++)
172
-            m_frameEncoder[i].m_tld = &m_threadLocalData[poolThreadCount + i];
173
-
174
-    m_lookahead = new Lookahead(m_param, m_threadPool);
175
     m_dpb = new DPB(m_param);
176
-    m_rateControl = new RateControl(m_param);
177
+    m_rateControl = new RateControl(*m_param);
178
 
179
+    initVPS(&m_vps);
180
     initSPS(&m_sps);
181
     initPPS(&m_pps);
182
 
183
@@ -229,26 +244,29 @@
184
         }
185
     }
186
 
187
-    if (m_frameEncoder)
188
+    int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
189
+    int numCols = (m_param->sourceWidth  + g_maxCUSize - 1) / g_maxCUSize;
190
+    for (int i = 0; i < m_param->frameNumThreads; i++)
191
     {
192
-        int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
193
-        int numCols = (m_param->sourceWidth  + g_maxCUSize - 1) / g_maxCUSize;
194
-        for (int i = 0; i < m_param->frameNumThreads; i++)
195
+        if (!m_frameEncoder[i]->init(this, numRows, numCols))
196
         {
197
-            if (!m_frameEncoder[i].init(this, numRows, numCols, i))
198
-            {
199
-                x265_log(m_param, X265_LOG_ERROR, "Unable to initialize frame encoder, aborting\n");
200
-                m_aborted = true;
201
x265_1.5.tar.gz/source/encoder/encoder.h -> x265_1.6.tar.gz/source/encoder/encoder.h Changed
63
 
1
@@ -70,7 +70,6 @@
2
 class Lookahead;
3
 class RateControl;
4
 class ThreadPool;
5
-struct ThreadLocalData;
6
 
7
 class Encoder : public x265_encoder
8
 {
9
@@ -86,11 +85,12 @@
10
     int64_t            m_prevReorderedPts[2];
11
 
12
     ThreadPool*        m_threadPool;
13
-    FrameEncoder*      m_frameEncoder;
14
+    FrameEncoder*      m_frameEncoder[X265_MAX_FRAME_THREADS];
15
     DPB*               m_dpb;
16
 
17
     Frame*             m_exportedPic;
18
 
19
+    int                m_numPools;
20
     int                m_curEncoder;
21
 
22
     /* cached PicYuv offset arrays, shared by all instances of
23
@@ -120,14 +120,12 @@
24
     PPS                m_pps;
25
     NALList            m_nalList;
26
     ScalingList        m_scalingList;      // quantization matrix information
27
-    int                m_numThreadLocalData;
28
 
29
     int                m_lastBPSEI;
30
     uint32_t           m_numDelayedPic;
31
 
32
     x265_param*        m_param;
33
     RateControl*       m_rateControl;
34
-    ThreadLocalData*   m_threadLocalData;
35
     Lookahead*         m_lookahead;
36
     Window             m_conformanceWindow;
37
 
38
@@ -138,6 +136,7 @@
39
     ~Encoder() {}
40
 
41
     void create();
42
+    void stop();
43
     void destroy();
44
 
45
     int encode(const x265_picture* pic, x265_picture *pic_out);
46
@@ -154,8 +153,6 @@
47
 
48
     char* statsCSVString(EncStats& stat, char* buffer);
49
 
50
-    void setThreadPool(ThreadPool* p) { m_threadPool = p; }
51
-
52
     void configure(x265_param *param);
53
 
54
     void updateVbvPlan(RateControl* rc);
55
@@ -172,6 +169,7 @@
56
 
57
 protected:
58
 
59
+    void initVPS(VPS *vps);
60
     void initSPS(SPS *sps);
61
     void initPPS(PPS *pps);
62
 };
63
x265_1.5.tar.gz/source/encoder/entropy.cpp -> x265_1.6.tar.gz/source/encoder/entropy.cpp Changed
201
 
1
@@ -43,6 +43,7 @@
2
 {
3
     markValid();
4
     m_fracBits = 0;
5
+    m_pad = 0;
6
     X265_CHECK(sizeof(m_contextState) >= sizeof(m_contextState[0]) * MAX_OFF_CTX_MOD, "context state table is too small\n");
7
 }
8
 
9
@@ -51,17 +52,21 @@
10
     WRITE_CODE(0,       4, "vps_video_parameter_set_id");
11
     WRITE_CODE(3,       2, "vps_reserved_three_2bits");
12
     WRITE_CODE(0,       6, "vps_reserved_zero_6bits");
13
-    WRITE_CODE(0,       3, "vps_max_sub_layers_minus1");
14
-    WRITE_FLAG(1,          "vps_temporal_id_nesting_flag");
15
+    WRITE_CODE(vps.maxTempSubLayers - 1, 3, "vps_max_sub_layers_minus1");
16
+    WRITE_FLAG(vps.maxTempSubLayers == 1,   "vps_temporal_id_nesting_flag");
17
     WRITE_CODE(0xffff, 16, "vps_reserved_ffff_16bits");
18
 
19
-    codeProfileTier(vps.ptl);
20
+    codeProfileTier(vps.ptl, vps.maxTempSubLayers);
21
 
22
     WRITE_FLAG(true, "vps_sub_layer_ordering_info_present_flag");
23
-    WRITE_UVLC(vps.maxDecPicBuffering - 1, "vps_max_dec_pic_buffering_minus1[i]");
24
-    WRITE_UVLC(vps.numReorderPics,         "vps_num_reorder_pics[i]");
25
 
26
-    WRITE_UVLC(0,    "vps_max_latency_increase_plus1[i]");
27
+    for (uint32_t i = 0; i < vps.maxTempSubLayers; i++)
28
+    {
29
+        WRITE_UVLC(vps.maxDecPicBuffering - 1, "vps_max_dec_pic_buffering_minus1[i]");
30
+        WRITE_UVLC(vps.numReorderPics,         "vps_num_reorder_pics[i]");
31
+        WRITE_UVLC(vps.maxLatencyIncrease + 1, "vps_max_latency_increase_plus1[i]");
32
+    }
33
+
34
     WRITE_CODE(0, 6, "vps_max_nuh_reserved_zero_layer_id");
35
     WRITE_UVLC(0,    "vps_max_op_sets_minus1");
36
     WRITE_FLAG(0,    "vps_timing_info_present_flag"); /* we signal timing info in SPS-VUI */
37
@@ -71,16 +76,16 @@
38
 void Entropy::codeSPS(const SPS& sps, const ScalingList& scalingList, const ProfileTierLevel& ptl)
39
 {
40
     WRITE_CODE(0, 4, "sps_video_parameter_set_id");
41
-    WRITE_CODE(0, 3, "sps_max_sub_layers_minus1");
42
-    WRITE_FLAG(1,    "sps_temporal_id_nesting_flag");
43
+    WRITE_CODE(sps.maxTempSubLayers - 1, 3, "sps_max_sub_layers_minus1");
44
+    WRITE_FLAG(sps.maxTempSubLayers == 1,   "sps_temporal_id_nesting_flag");
45
 
46
-    codeProfileTier(ptl);
47
+    codeProfileTier(ptl, sps.maxTempSubLayers);
48
 
49
     WRITE_UVLC(0, "sps_seq_parameter_set_id");
50
     WRITE_UVLC(sps.chromaFormatIdc, "chroma_format_idc");
51
 
52
     if (sps.chromaFormatIdc == X265_CSP_I444)
53
-        WRITE_FLAG(0,                        "separate_colour_plane_flag");
54
+        WRITE_FLAG(0,                       "separate_colour_plane_flag");
55
 
56
     WRITE_UVLC(sps.picWidthInLumaSamples,   "pic_width_in_luma_samples");
57
     WRITE_UVLC(sps.picHeightInLumaSamples,  "pic_height_in_luma_samples");
58
@@ -101,9 +106,12 @@
59
     WRITE_UVLC(BITS_FOR_POC - 4, "log2_max_pic_order_cnt_lsb_minus4");
60
     WRITE_FLAG(true,             "sps_sub_layer_ordering_info_present_flag");
61
 
62
-    WRITE_UVLC(sps.maxDecPicBuffering - 1, "sps_max_dec_pic_buffering_minus1[i]");
63
-    WRITE_UVLC(sps.numReorderPics,         "sps_num_reorder_pics[i]");
64
-    WRITE_UVLC(sps.maxLatencyIncrease + 1, "sps_max_latency_increase_plus1[i]");
65
+    for (uint32_t i = 0; i < sps.maxTempSubLayers; i++)
66
+    {
67
+        WRITE_UVLC(sps.maxDecPicBuffering - 1, "sps_max_dec_pic_buffering_minus1[i]");
68
+        WRITE_UVLC(sps.numReorderPics,         "sps_num_reorder_pics[i]");
69
+        WRITE_UVLC(sps.maxLatencyIncrease + 1, "sps_max_latency_increase_plus1[i]");
70
+    }
71
 
72
     WRITE_UVLC(sps.log2MinCodingBlockSize - 3,    "log2_min_coding_block_size_minus3");
73
     WRITE_UVLC(sps.log2DiffMaxMinCodingBlockSize, "log2_diff_max_min_coding_block_size");
74
@@ -129,7 +137,7 @@
75
     WRITE_FLAG(sps.bUseStrongIntraSmoothing, "sps_strong_intra_smoothing_enable_flag");
76
 
77
     WRITE_FLAG(1, "vui_parameters_present_flag");
78
-    codeVUI(sps.vuiParameters);
79
+    codeVUI(sps.vuiParameters, sps.maxTempSubLayers);
80
 
81
     WRITE_FLAG(0, "sps_extension_flag");
82
 }
83
@@ -184,7 +192,7 @@
84
     WRITE_FLAG(0, "pps_extension_flag");
85
 }
86
 
87
-void Entropy::codeProfileTier(const ProfileTierLevel& ptl)
88
+void Entropy::codeProfileTier(const ProfileTierLevel& ptl, int maxTempSubLayers)
89
 {
90
     WRITE_CODE(0, 2,                "XXX_profile_space[]");
91
     WRITE_FLAG(ptl.tierFlag,        "XXX_tier_flag[]");
92
@@ -222,9 +230,17 @@
93
     }
94
 
95
     WRITE_CODE(ptl.levelIdc, 8, "general_level_idc");
96
+
97
+    if (maxTempSubLayers > 1)
98
+    {
99
+         WRITE_FLAG(0, "sub_layer_profile_present_flag[i]");
100
+         WRITE_FLAG(0, "sub_layer_level_present_flag[i]");
101
+         for (int i = maxTempSubLayers - 1; i < 8 ; i++)
102
+             WRITE_CODE(0, 2, "reserved_zero_2bits");
103
+    }
104
 }
105
 
106
-void Entropy::codeVUI(const VUI& vui)
107
+void Entropy::codeVUI(const VUI& vui, int maxSubTLayers)
108
 {
109
     WRITE_FLAG(vui.aspectRatioInfoPresentFlag,  "aspect_ratio_info_present_flag");
110
     if (vui.aspectRatioInfoPresentFlag)
111
@@ -282,7 +298,7 @@
112
 
113
     WRITE_FLAG(vui.hrdParametersPresentFlag,  "vui_hrd_parameters_present_flag");
114
     if (vui.hrdParametersPresentFlag)
115
-        codeHrdParameters(vui.hrdParameters);
116
+        codeHrdParameters(vui.hrdParameters, maxSubTLayers);
117
 
118
     WRITE_FLAG(0, "bitstream_restriction_flag");
119
 }
120
@@ -329,7 +345,7 @@
121
     }
122
 }
123
 
124
-void Entropy::codeHrdParameters(const HRDInfo& hrd)
125
+void Entropy::codeHrdParameters(const HRDInfo& hrd, int maxSubTLayers)
126
 {
127
     WRITE_FLAG(1, "nal_hrd_parameters_present_flag");
128
     WRITE_FLAG(0, "vcl_hrd_parameters_present_flag");
129
@@ -342,13 +358,16 @@
130
     WRITE_CODE(hrd.cpbRemovalDelayLength - 1,        5, "au_cpb_removal_delay_length_minus1");
131
     WRITE_CODE(hrd.dpbOutputDelayLength - 1,         5, "dpb_output_delay_length_minus1");
132
 
133
-    WRITE_FLAG(1, "fixed_pic_rate_general_flag");
134
-    WRITE_UVLC(0, "elemental_duration_in_tc_minus1");
135
-    WRITE_UVLC(0, "cpb_cnt_minus1");
136
+    for (int i = 0; i < maxSubTLayers; i++)
137
+    {
138
+        WRITE_FLAG(1, "fixed_pic_rate_general_flag");
139
+        WRITE_UVLC(0, "elemental_duration_in_tc_minus1");
140
+        WRITE_UVLC(0, "cpb_cnt_minus1");
141
 
142
-    WRITE_UVLC(hrd.bitRateValue - 1, "bit_rate_value_minus1");
143
-    WRITE_UVLC(hrd.cpbSizeValue - 1, "cpb_size_value_minus1");
144
-    WRITE_FLAG(hrd.cbrFlag, "cbr_flag");
145
+        WRITE_UVLC(hrd.bitRateValue - 1, "bit_rate_value_minus1");
146
+        WRITE_UVLC(hrd.cpbSizeValue - 1, "cpb_size_value_minus1");
147
+        WRITE_FLAG(hrd.cbrFlag, "cbr_flag");
148
+    }
149
 }
150
 
151
 void Entropy::codeAUD(const Slice& slice)
152
@@ -521,15 +540,14 @@
153
 {
154
     const Slice* slice = ctu.m_slice;
155
 
156
-    if (depth <= slice->m_pps->maxCuDQPDepth && slice->m_pps->bUseDQP)
157
-        bEncodeDQP = true;
158
-
159
     int cuSplitFlag = !(cuGeom.flags & CUGeom::LEAF);
160
     int cuUnsplitFlag = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
161
 
162
     if (!cuUnsplitFlag)
163
     {
164
         uint32_t qNumParts = cuGeom.numPartitions >> 2;
165
+        if (depth == slice->m_pps->maxCuDQPDepth && slice->m_pps->bUseDQP)
166
+            bEncodeDQP = true;
167
         for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
168
         {
169
             const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + qIdx);
170
@@ -539,13 +557,14 @@
171
         return;
172
     }
173
 
174
-    // We need to split, so don't try these modes.
175
     if (cuSplitFlag) 
176
         codeSplitFlag(ctu, absPartIdx, depth);
177
 
178
     if (depth < ctu.m_cuDepth[absPartIdx] && depth < g_maxCUDepth)
179
     {
180
         uint32_t qNumParts = cuGeom.numPartitions >> 2;
181
+        if (depth == slice->m_pps->maxCuDQPDepth && slice->m_pps->bUseDQP)
182
+            bEncodeDQP = true;
183
         for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
184
         {
185
             const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + qIdx);
186
@@ -554,6 +573,9 @@
187
         return;
188
     }
189
 
190
+    if (depth <= slice->m_pps->maxCuDQPDepth && slice->m_pps->bUseDQP)
191
+        bEncodeDQP = true;
192
+
193
     if (slice->m_pps->bTransquantBypassEnabled)
194
         codeCUTransquantBypassFlag(ctu.m_tqBypass[absPartIdx]);
195
 
196
@@ -654,7 +676,7 @@
197
     {
198
         // Encode slice finish
199
         bool bTerminateSlice = false;
200
-        if (cuAddr + (NUM_CU_PARTITIONS >> (depth << 1)) == realEndAddress)
201
x265_1.5.tar.gz/source/encoder/entropy.h -> x265_1.6.tar.gz/source/encoder/entropy.h Changed
22
 
1
@@ -142,9 +142,9 @@
2
     void codeVPS(const VPS& vps);
3
     void codeSPS(const SPS& sps, const ScalingList& scalingList, const ProfileTierLevel& ptl);
4
     void codePPS(const PPS& pps);
5
-    void codeVUI(const VUI& vui);
6
+    void codeVUI(const VUI& vui, int maxSubTLayers);
7
     void codeAUD(const Slice& slice);
8
-    void codeHrdParameters(const HRDInfo& hrd);
9
+    void codeHrdParameters(const HRDInfo& hrd, int maxSubTLayers);
10
 
11
     void codeSliceHeader(const Slice& slice, FrameData& encData);
12
     void codeSliceHeaderWPPEntryPoints(const Slice& slice, const uint32_t *substreamSizes, uint32_t maxOffset);
13
@@ -230,7 +230,7 @@
14
     void writeEpExGolomb(uint32_t symbol, uint32_t count);
15
     void writeCoefRemainExGolomb(uint32_t symbol, const uint32_t absGoRice);
16
 
17
-    void codeProfileTier(const ProfileTierLevel& ptl);
18
+    void codeProfileTier(const ProfileTierLevel& ptl, int maxTempSubLayers);
19
     void codeScalingList(const ScalingList&);
20
     void codeScalingList(const ScalingList& scalingList, uint32_t sizeId, uint32_t listId);
21
 
22
x265_1.5.tar.gz/source/encoder/frameencoder.cpp -> x265_1.6.tar.gz/source/encoder/frameencoder.cpp Changed
201
 
1
@@ -39,14 +39,13 @@
2
 void weightAnalyse(Slice& slice, Frame& frame, x265_param& param);
3
 
4
 FrameEncoder::FrameEncoder()
5
-    : WaveFront(NULL)
6
-    , m_threadActive(true)
7
 {
8
     m_prevOutputTime = x265_mdate();
9
-    m_totalWorkerElapsedTime = 0;
10
+    m_isFrameEncoder = true;
11
+    m_threadActive = true;
12
     m_slicetypeWaitTime = 0;
13
-    m_frameEncoderID = 0;
14
     m_activeWorkerCount = 0;
15
+    m_completionCount = 0;
16
     m_bAllRowsStop = false;
17
     m_vbvResetTriggerRow = -1;
18
     m_outStreams = NULL;
19
@@ -59,6 +58,7 @@
20
     m_frame = NULL;
21
     m_cuGeoms = NULL;
22
     m_ctuGeomMap = NULL;
23
+    m_localTldIdx = 0;
24
     memset(&m_frameStats, 0, sizeof(m_frameStats));
25
     memset(&m_rce, 0, sizeof(RateControlEntry));
26
 }
27
@@ -66,10 +66,22 @@
28
 void FrameEncoder::destroy()
29
 {
30
     if (m_pool)
31
-        JobProvider::flush();  // ensure no worker threads are using this frame
32
-
33
-    m_threadActive = false;
34
-    m_enable.trigger();
35
+    {
36
+        if (!m_jpId)
37
+        {
38
+            int numTLD = m_pool->m_numWorkers;
39
+            if (!m_param->bEnableWavefront)
40
+                numTLD += m_pool->m_numProviders;
41
+            for (int i = 0; i < numTLD; i++)
42
+                m_tld[i].destroy();
43
+            delete [] m_tld;
44
+        }
45
+    }
46
+    else
47
+    {
48
+        m_tld->destroy();
49
+        delete m_tld;
50
+    }
51
 
52
     delete[] m_rows;
53
     delete[] m_outStreams;
54
@@ -85,12 +97,9 @@
55
         delete m_rce.picTimingSEI;
56
         delete m_rce.hrdTiming;
57
     }
58
-
59
-    // wait for worker thread to exit
60
-    stop();
61
 }
62
 
63
-bool FrameEncoder::init(Encoder *top, int numRows, int numCols, int id)
64
+bool FrameEncoder::init(Encoder *top, int numRows, int numCols)
65
 {
66
     m_top = top;
67
     m_param = top->m_param;
68
@@ -99,14 +108,14 @@
69
     m_filterRowDelay = (m_param->bEnableSAO && m_param->bSaoNonDeblocked) ?
70
                         2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 1 : 0);
71
     m_filterRowDelayCus = m_filterRowDelay * numCols;
72
-    m_frameEncoderID = id;
73
     m_rows = new CTURow[m_numRows];
74
     bool ok = !!m_numRows;
75
 
76
-    int range  = m_param->searchRange; /* fpel search */
77
-        range += 1;                    /* diamond search range check lag */
78
-        range += 2;                    /* subpel refine */
79
-        range += NTAPS_LUMA / 2;       /* subpel filter half-length */
80
+    /* determine full motion search range */
81
+    int range  = m_param->searchRange;       /* fpel search */
82
+    range += !!(m_param->searchMethod < 2);  /* diamond/hex range check lag */
83
+    range += NTAPS_LUMA / 2;                 /* subpel filter half-length */
84
+    range += 2 + MotionEstimate::hpelIterationCount(m_param->subpelRefine) / 2; /* subpel refine steps */
85
     m_refLagRows = 1 + ((range + g_maxCUSize - 1) / g_maxCUSize);
86
 
87
     // NOTE: 2 times of numRows because both Encoder and Filter in same queue
88
@@ -134,7 +143,6 @@
89
     else
90
         m_param->noiseReductionIntra = m_param->noiseReductionInter = 0;
91
 
92
-    start();
93
     return ok;
94
 }
95
 
96
@@ -143,6 +151,7 @@
97
 {
98
     /* Geoms only vary between CTUs in the presence of picture edges */
99
     int maxCUSize = m_param->maxCUSize;
100
+    int minCUSize = m_param->minCUSize;
101
     int heightRem = m_param->sourceHeight & (maxCUSize - 1);
102
     int widthRem = m_param->sourceWidth & (maxCUSize - 1);
103
     int allocGeoms = 1; // body
104
@@ -157,7 +166,7 @@
105
         return false;
106
 
107
     // body
108
-    CUData::calcCTUGeoms(maxCUSize, maxCUSize, maxCUSize, m_cuGeoms);
109
+    CUData::calcCTUGeoms(maxCUSize, maxCUSize, maxCUSize, minCUSize, m_cuGeoms);
110
     memset(m_ctuGeomMap, 0, sizeof(uint32_t) * m_numRows * m_numCols);
111
     if (allocGeoms == 1)
112
         return true;
113
@@ -166,7 +175,7 @@
114
     if (widthRem)
115
     {
116
         // right
117
-        CUData::calcCTUGeoms(widthRem, maxCUSize, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
118
+        CUData::calcCTUGeoms(widthRem, maxCUSize, maxCUSize, minCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
119
         for (uint32_t i = 0; i < m_numRows; i++)
120
         {
121
             uint32_t ctuAddr = m_numCols * (i + 1) - 1;
122
@@ -177,7 +186,7 @@
123
     if (heightRem)
124
     {
125
         // bottom
126
-        CUData::calcCTUGeoms(maxCUSize, heightRem, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
127
+        CUData::calcCTUGeoms(maxCUSize, heightRem, maxCUSize, minCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
128
         for (uint32_t i = 0; i < m_numCols; i++)
129
         {
130
             uint32_t ctuAddr = m_numCols * (m_numRows - 1) + i;
131
@@ -188,7 +197,7 @@
132
         if (widthRem)
133
         {
134
             // corner
135
-            CUData::calcCTUGeoms(widthRem, heightRem, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
136
+            CUData::calcCTUGeoms(widthRem, heightRem, maxCUSize, minCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
137
 
138
             uint32_t ctuAddr = m_numCols * m_numRows - 1;
139
             m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
140
@@ -204,7 +213,9 @@
141
 {
142
     m_slicetypeWaitTime = x265_mdate() - m_prevOutputTime;
143
     m_frame = curFrame;
144
-    curFrame->m_encData->m_frameEncoderID = m_frameEncoderID; // Each Frame knows the ID of the FrameEncoder encoding it
145
+    m_sliceType = curFrame->m_lowres.sliceType;
146
+    curFrame->m_encData->m_frameEncoderID = m_jpId;
147
+    curFrame->m_encData->m_jobProvider = this;
148
     curFrame->m_encData->m_slice->m_mref = m_mref;
149
 
150
     if (!m_cuGeoms)
151
@@ -219,19 +230,66 @@
152
 
153
 void FrameEncoder::threadMain()
154
 {
155
-    THREAD_NAME("Frame", m_frameEncoderID);
156
+    THREAD_NAME("Frame", m_jpId);
157
 
158
-    // worker thread routine for FrameEncoder
159
-    do
160
+    if (m_pool)
161
     {
162
-        m_enable.wait(); // Encoder::encode() triggers this event
163
-        if (m_threadActive)
164
+        m_pool->setCurrentThreadAffinity();
165
+
166
+        /* the first FE on each NUMA node is responsible for allocating thread
167
+         * local data for all worker threads in that pool. If WPP is disabled, then
168
+         * each FE also needs a TLD instance */
169
+        if (!m_jpId)
170
         {
171
-            compressFrame();
172
-            m_done.trigger(); // FrameEncoder::getEncodedPicture() blocks for this event
173
+            int numTLD = m_pool->m_numWorkers;
174
+            if (!m_param->bEnableWavefront)
175
+                numTLD += m_pool->m_numProviders;
176
+
177
+            m_tld = new ThreadLocalData[numTLD];
178
+            for (int i = 0; i < numTLD; i++)
179
+            {
180
+                m_tld[i].analysis.initSearch(*m_param, m_top->m_scalingList);
181
+                m_tld[i].analysis.create(m_tld);
182
+            }
183
+
184
+            for (int i = 0; i < m_pool->m_numProviders; i++)
185
+            {
186
+                if (m_pool->m_jpTable[i]->m_isFrameEncoder) /* ugh; over-allocation and other issues here */
187
+                {
188
+                    FrameEncoder *peer = dynamic_cast<FrameEncoder*>(m_pool->m_jpTable[i]);
189
+                    peer->m_tld = m_tld;
190
+                }
191
+            }
192
         }
193
+
194
+        if (m_param->bEnableWavefront)
195
+            m_localTldIdx = -1; // cause exception if used
196
+        else
197
+            m_localTldIdx = m_pool->m_numWorkers + m_jpId;
198
+    }
199
+    else
200
+    {
201
x265_1.5.tar.gz/source/encoder/frameencoder.h -> x265_1.6.tar.gz/source/encoder/frameencoder.h Changed
88
 
1
@@ -122,7 +122,7 @@
2
 
3
     virtual ~FrameEncoder() {}
4
 
5
-    bool init(Encoder *top, int numRows, int numCols, int id);
6
+    virtual bool init(Encoder *top, int numRows, int numCols);
7
 
8
     void destroy();
9
 
10
@@ -135,8 +135,12 @@
11
     Event                    m_enable;
12
     Event                    m_done;
13
     Event                    m_completionEvent;
14
-    bool                     m_threadActive;
15
-    int                      m_frameEncoderID;
16
+    int                      m_localTldIdx;
17
+
18
+    volatile bool            m_threadActive;
19
+    volatile bool            m_bAllRowsStop;
20
+    volatile int             m_completionCount;
21
+    volatile int             m_vbvResetTriggerRow;
22
 
23
     uint32_t                 m_numRows;
24
     uint32_t                 m_numCols;
25
@@ -144,9 +148,6 @@
26
     uint32_t                 m_filterRowDelayCus;
27
     uint32_t                 m_refLagRows;
28
 
29
-    volatile bool            m_bAllRowsStop;
30
-    volatile int             m_vbvResetTriggerRow;
31
-
32
     CTURow*                  m_rows;
33
     RateControlEntry         m_rce;
34
     SEIDecodedPictureHash    m_seiReconPictureDigest;
35
@@ -177,6 +178,9 @@
36
     int64_t                  m_slicetypeWaitTime;        // total elapsed time waiting for decided frame
37
     int64_t                  m_totalWorkerElapsedTime;   // total elapsed time spent by worker threads processing CTUs
38
     int64_t                  m_totalNoWorkerTime;        // total elapsed time without any active worker threads
39
+#if DETAILED_CU_STATS
40
+    CUStats                  m_cuStats;
41
+#endif
42
 
43
     Encoder*                 m_top;
44
     x265_param*              m_param;
45
@@ -196,6 +200,21 @@
46
     FrameFilter              m_frameFilter;
47
     NALList                  m_nalList;
48
 
49
+    class WeightAnalysis : public BondedTaskGroup
50
+    {
51
+    public:
52
+
53
+        FrameEncoder& master;
54
+
55
+        WeightAnalysis(FrameEncoder& fe) : master(fe) {}
56
+
57
+        void processTasks(int workerThreadId);
58
+
59
+    protected:
60
+
61
+        WeightAnalysis operator=(const WeightAnalysis&);
62
+    };
63
+
64
 protected:
65
 
66
     bool initializeGeoms();
67
@@ -203,9 +222,6 @@
68
     /* analyze / compress frame, can be run in parallel within reference constraints */
69
     void compressFrame();
70
 
71
-    /* called by compressFrame to perform wave-front compression analysis */
72
-    void compressCTURows();
73
-
74
     /* called by compressFrame to generate final per-row bitstreams */
75
     void encodeSlice();
76
 
77
@@ -215,8 +231,8 @@
78
     void noiseReductionUpdate();
79
 
80
     /* Called by WaveFront::findJob() */
81
-    void processRow(int row, int threadId);
82
-    void processRowEncoder(int row, ThreadLocalData& tld);
83
+    virtual void processRow(int row, int threadId);
84
+    virtual void processRowEncoder(int row, ThreadLocalData& tld);
85
 
86
     void enqueueRowEncoder(int row) { WaveFront::enqueueRow(row * 2 + 0); }
87
     void enqueueRowFilter(int row)  { WaveFront::enqueueRow(row * 2 + 1); }
88
x265_1.5.tar.gz/source/encoder/framefilter.cpp -> x265_1.6.tar.gz/source/encoder/framefilter.cpp Changed
32
 
1
@@ -83,6 +83,11 @@
2
 {
3
     ProfileScopeEvent(filterCTURow);
4
 
5
+#if DETAILED_CU_STATS
6
+    ScopedElapsedTime filterPerfScope(m_frameEncoder->m_cuStats.loopFilterElapsedTime);
7
+    m_frameEncoder->m_cuStats.countLoopFilter++;
8
+#endif
9
+
10
     if (!m_param->bEnableLoopFilter && !m_param->bEnableSAO)
11
     {
12
         processRowPost(row);
13
@@ -298,6 +303,9 @@
14
         updateChecksum(reconPic->m_picOrg[1], m_frameEncoder->m_checksum[1], height, width, stride, row, cuHeight);
15
         updateChecksum(reconPic->m_picOrg[2], m_frameEncoder->m_checksum[2], height, width, stride, row, cuHeight);
16
     }
17
+
18
+    if (ATOMIC_INC(&m_frameEncoder->m_completionCount) == 2 * (int)m_frameEncoder->m_numRows)
19
+        m_frameEncoder->m_completionEvent.trigger();
20
 }
21
 
22
 static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height)
23
@@ -421,7 +429,7 @@
24
 /* Original YUV restoration for CU in lossless coding */
25
 static void origCUSampleRestoration(const CUData* cu, const CUGeom& cuGeom, Frame& frame)
26
 {
27
-    uint32_t absPartIdx = cuGeom.encodeIdx;
28
+    uint32_t absPartIdx = cuGeom.absPartIdx;
29
     if (cu->m_cuDepth[absPartIdx] > cuGeom.depth)
30
     {
31
         for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++)
32
x265_1.5.tar.gz/source/encoder/level.cpp -> x265_1.6.tar.gz/source/encoder/level.cpp Changed
49
 
1
@@ -60,6 +60,7 @@
2
 /* determine minimum decoder level required to decode the described video */
3
 void determineLevel(const x265_param &param, VPS& vps)
4
 {
5
+    vps.maxTempSubLayers = param.bEnableTemporalSubLayers ? 2 : 1;
6
     if (param.bLossless)
7
         vps.ptl.profileIdc = Profile::NONE;
8
     else if (param.internalCsp == X265_CSP_I420)
9
@@ -154,15 +155,25 @@
10
             return;
11
         }
12
 
13
-        vps.ptl.levelIdc = levels[i].levelEnum;
14
-        vps.ptl.minCrForLevel = levels[i].minCompressionRatio;
15
-        vps.ptl.maxLumaSrForLevel = levels[i].maxLumaSamplesPerSecond;
16
+#define CHECK_RANGE(value, main, high) (value > main && value <= high)
17
 
18
-        if (bitrate > levels[i].maxBitrateMain && bitrate <= levels[i].maxBitrateHigh &&
19
+        if (CHECK_RANGE(bitrate, levels[i].maxBitrateMain, levels[i].maxBitrateHigh) &&
20
+            CHECK_RANGE((uint32_t)param.rc.vbvBufferSize, levels[i].maxCpbSizeMain, levels[i].maxCpbSizeHigh) &&
21
             levels[i].maxBitrateHigh != MAX_UINT)
22
-            vps.ptl.tierFlag = Level::HIGH;
23
+        {
24
+            /* If the user has not enabled high tier, continue looking to see if we can encode at a higher level, main tier */
25
+            if (!param.bHighTier && (levels[i].levelIdc < param.levelIdc))
26
+                continue;
27
+            else
28
+                vps.ptl.tierFlag = Level::HIGH;
29
+        }
30
         else
31
             vps.ptl.tierFlag = Level::MAIN;
32
+#undef CHECK_RANGE
33
+
34
+        vps.ptl.levelIdc = levels[i].levelEnum;
35
+        vps.ptl.minCrForLevel = levels[i].minCompressionRatio;
36
+        vps.ptl.maxLumaSrForLevel = levels[i].maxLumaSamplesPerSecond;
37
         break;
38
     }
39
 
40
@@ -250,7 +261,7 @@
41
     }
42
     if ((uint32_t)param.rc.vbvBufferSize > (highTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain))
43
     {
44
-        param.rc.vbvMaxBitrate = highTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain;
45
+        param.rc.vbvBufferSize = highTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain;
46
         x265_log(&param, X265_LOG_INFO, "lowering VBV buffer size to %dKb\n", param.rc.vbvBufferSize);
47
     }
48
 
49
x265_1.5.tar.gz/source/encoder/motion.cpp -> x265_1.6.tar.gz/source/encoder/motion.cpp Changed
101
 
1
@@ -59,38 +59,6 @@
2
 int sizeScale[NUM_PU_SIZES];
3
 #define SAD_THRESH(v) (bcost < (((v >> 4) * sizeScale[partEnum])))
4
 
5
-void initScales(void)
6
-{
7
-#define SETUP_SCALE(W, H) \
8
-    sizeScale[LUMA_ ## W ## x ## H] = (H * H) >> 4;
9
-    SETUP_SCALE(4, 4);
10
-    SETUP_SCALE(8, 8);
11
-    SETUP_SCALE(8, 4);
12
-    SETUP_SCALE(4, 8);
13
-    SETUP_SCALE(16, 16);
14
-    SETUP_SCALE(16, 8);
15
-    SETUP_SCALE(8, 16);
16
-    SETUP_SCALE(16, 12);
17
-    SETUP_SCALE(12, 16);
18
-    SETUP_SCALE(4, 16);
19
-    SETUP_SCALE(16, 4);
20
-    SETUP_SCALE(32, 32);
21
-    SETUP_SCALE(32, 16);
22
-    SETUP_SCALE(16, 32);
23
-    SETUP_SCALE(32, 24);
24
-    SETUP_SCALE(24, 32);
25
-    SETUP_SCALE(32, 8);
26
-    SETUP_SCALE(8, 32);
27
-    SETUP_SCALE(64, 64);
28
-    SETUP_SCALE(64, 32);
29
-    SETUP_SCALE(32, 64);
30
-    SETUP_SCALE(64, 48);
31
-    SETUP_SCALE(48, 64);
32
-    SETUP_SCALE(64, 16);
33
-    SETUP_SCALE(16, 64);
34
-#undef SETUP_SCALE
35
-}
36
-
37
 /* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
38
 const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
39
 const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 };  /* (x-1)%6 */
40
@@ -136,20 +104,57 @@
41
     absPartIdx = -1;
42
     searchMethod = X265_HEX_SEARCH;
43
     subpelRefine = 2;
44
+    blockwidth = blockheight = 0;
45
+    blockOffset = 0;
46
     bChromaSATD = false;
47
     chromaSatd = NULL;
48
 }
49
 
50
 void MotionEstimate::init(int method, int refine, int csp)
51
 {
52
-    if (!sizeScale[0])
53
-        initScales();
54
-
55
     searchMethod = method;
56
     subpelRefine = refine;
57
     fencPUYuv.create(FENC_STRIDE, csp);
58
 }
59
 
60
+void MotionEstimate::initScales(void)
61
+{
62
+#define SETUP_SCALE(W, H) \
63
+    sizeScale[LUMA_ ## W ## x ## H] = (H * H) >> 4;
64
+    SETUP_SCALE(4, 4);
65
+    SETUP_SCALE(8, 8);
66
+    SETUP_SCALE(8, 4);
67
+    SETUP_SCALE(4, 8);
68
+    SETUP_SCALE(16, 16);
69
+    SETUP_SCALE(16, 8);
70
+    SETUP_SCALE(8, 16);
71
+    SETUP_SCALE(16, 12);
72
+    SETUP_SCALE(12, 16);
73
+    SETUP_SCALE(4, 16);
74
+    SETUP_SCALE(16, 4);
75
+    SETUP_SCALE(32, 32);
76
+    SETUP_SCALE(32, 16);
77
+    SETUP_SCALE(16, 32);
78
+    SETUP_SCALE(32, 24);
79
+    SETUP_SCALE(24, 32);
80
+    SETUP_SCALE(32, 8);
81
+    SETUP_SCALE(8, 32);
82
+    SETUP_SCALE(64, 64);
83
+    SETUP_SCALE(64, 32);
84
+    SETUP_SCALE(32, 64);
85
+    SETUP_SCALE(64, 48);
86
+    SETUP_SCALE(48, 64);
87
+    SETUP_SCALE(64, 16);
88
+    SETUP_SCALE(16, 64);
89
+#undef SETUP_SCALE
90
+}
91
+
92
+int MotionEstimate::hpelIterationCount(int subme)
93
+{
94
+    return workload[subme].hpel_iters +
95
+           workload[subme].qpel_iters / 2;
96
+}
97
+
98
 MotionEstimate::~MotionEstimate()
99
 {
100
     fencPUYuv.destroy();
101
x265_1.5.tar.gz/source/encoder/motion.h -> x265_1.6.tar.gz/source/encoder/motion.h Changed
10
 
1
@@ -67,6 +67,8 @@
2
     MotionEstimate();
3
     ~MotionEstimate();
4
 
5
+    static void initScales();
6
+    static int hpelIterationCount(int subme);
7
     void init(int method, int refine, int csp);
8
 
9
     /* Methods called at slice setup */
10
x265_1.5.tar.gz/source/encoder/nal.cpp -> x265_1.6.tar.gz/source/encoder/nal.cpp Changed
10
 
1
@@ -107,7 +107,7 @@
2
      * nuh_reserved_zero_6bits  6-bits
3
      * nuh_temporal_id_plus1    3-bits */
4
     out[bytes++] = (uint8_t)nalUnitType << 1;
5
-    out[bytes++] = 1;
6
+    out[bytes++] = 1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N);
7
 
8
     /* 7.4.1 ...
9
      * Within the NAL unit, the following three-byte sequences shall not occur at
10
x265_1.5.tar.gz/source/encoder/ratecontrol.cpp -> x265_1.6.tar.gz/source/encoder/ratecontrol.cpp Changed
201
 
1
@@ -145,30 +145,6 @@
2
 }
3
 
4
 }  // end anonymous namespace
5
-/* Compute variance to derive AC energy of each block */
6
-static inline uint32_t acEnergyVar(Frame *curFrame, uint64_t sum_ssd, int shift, int i)
7
-{
8
-    uint32_t sum = (uint32_t)sum_ssd;
9
-    uint32_t ssd = (uint32_t)(sum_ssd >> 32);
10
-
11
-    curFrame->m_lowres.wp_sum[i] += sum;
12
-    curFrame->m_lowres.wp_ssd[i] += ssd;
13
-    return ssd - ((uint64_t)sum * sum >> shift);
14
-}
15
-
16
-/* Find the energy of each block in Y/Cb/Cr plane */
17
-static inline uint32_t acEnergyPlane(Frame *curFrame, pixel* src, intptr_t srcStride, int bChroma, int colorFormat)
18
-{
19
-    if ((colorFormat != X265_CSP_I444) && bChroma)
20
-    {
21
-        ALIGN_VAR_8(pixel, pix[8 * 8]);
22
-        primitives.cu[BLOCK_8x8].copy_pp(pix, 8, src, srcStride);
23
-        return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(pix, 8), 6, bChroma);
24
-    }
25
-    else
26
-        return acEnergyVar(curFrame, primitives.cu[BLOCK_16x16].var(src, srcStride), 8, bChroma);
27
-}
28
-
29
 /* Returns the zone for the current frame */
30
 x265_zone* RateControl::getZone()
31
 {
32
@@ -181,138 +157,9 @@
33
     return NULL;
34
 }
35
 
36
-/* Find the total AC energy of each block in all planes */
37
-uint32_t RateControl::acEnergyCu(Frame* curFrame, uint32_t block_x, uint32_t block_y)
38
-{
39
-    intptr_t stride = curFrame->m_fencPic->m_stride;
40
-    intptr_t cStride = curFrame->m_fencPic->m_strideC;
41
-    intptr_t blockOffsetLuma = block_x + (block_y * stride);
42
-    int colorFormat = m_param->internalCsp;
43
-    int hShift = CHROMA_H_SHIFT(colorFormat);
44
-    int vShift = CHROMA_V_SHIFT(colorFormat);
45
-    intptr_t blockOffsetChroma = (block_x >> hShift) + ((block_y >> vShift) * cStride);
46
-
47
-    uint32_t var;
48
-
49
-    var  = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, colorFormat);
50
-    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, colorFormat);
51
-    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, colorFormat);
52
-    x265_emms();
53
-    return var;
54
-}
55
-
56
-void RateControl::calcAdaptiveQuantFrame(Frame *curFrame)
57
-{
58
-    /* Actual adaptive quantization */
59
-    int maxCol = curFrame->m_fencPic->m_picWidth;
60
-    int maxRow = curFrame->m_fencPic->m_picHeight;
61
-
62
-    for (int y = 0; y < 3; y++)
63
-    {
64
-        curFrame->m_lowres.wp_ssd[y] = 0;
65
-        curFrame->m_lowres.wp_sum[y] = 0;
66
-    }
67
-
68
-    /* Calculate Qp offset for each 16x16 block in the frame */
69
-    int block_xy = 0;
70
-    int block_x = 0, block_y = 0;
71
-    double strength = 0.f;
72
-    if (m_param->rc.aqMode == X265_AQ_NONE || m_param->rc.aqStrength == 0)
73
-    {
74
-        /* Need to init it anyways for CU tree */
75
-        int cuWidth = ((maxCol / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
76
-        int cuHeight = ((maxRow / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
77
-        int cuCount = cuWidth * cuHeight;
78
-
79
-        if (m_param->rc.aqMode && m_param->rc.aqStrength == 0)
80
-        {
81
-            memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
82
-            memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
83
-            for (int cuxy = 0; cuxy < cuCount; cuxy++)
84
-                curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
85
-        }
86
-
87
-        /* Need variance data for weighted prediction */
88
-        if (m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred)
89
-        {
90
-            for (block_y = 0; block_y < maxRow; block_y += 16)
91
-                for (block_x = 0; block_x < maxCol; block_x += 16)
92
-                    acEnergyCu(curFrame, block_x, block_y);
93
-        }
94
-    }
95
-    else
96
-    {
97
-        block_xy = 0;
98
-        double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
99
-        if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
100
-        {
101
-            double bit_depth_correction = pow(1 << (X265_DEPTH - 8), 0.5);
102
-            for (block_y = 0; block_y < maxRow; block_y += 16)
103
-            {
104
-                for (block_x = 0; block_x < maxCol; block_x += 16)
105
-                {
106
-                    uint32_t energy = acEnergyCu(curFrame, block_x, block_y);
107
-                    qp_adj = pow(energy + 1, 0.1);
108
-                    curFrame->m_lowres.qpCuTreeOffset[block_xy] = qp_adj;
109
-                    avg_adj += qp_adj;
110
-                    avg_adj_pow2 += qp_adj * qp_adj;
111
-                    block_xy++;
112
-                }
113
-            }
114
-
115
-            avg_adj /= m_ncu;
116
-            avg_adj_pow2 /= m_ncu;
117
-            strength = m_param->rc.aqStrength * avg_adj / bit_depth_correction;
118
-            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (11.f * bit_depth_correction)) / avg_adj;
119
-        }
120
-        else
121
-            strength = m_param->rc.aqStrength * 1.0397f;
122
-
123
-        block_xy = 0;
124
-        for (block_y = 0; block_y < maxRow; block_y += 16)
125
-        {
126
-            for (block_x = 0; block_x < maxCol; block_x += 16)
127
-            {
128
-                if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
129
-                {
130
-                    qp_adj = curFrame->m_lowres.qpCuTreeOffset[block_xy];
131
-                    qp_adj = strength * (qp_adj - avg_adj);
132
-                }
133
-                else
134
-                {
135
-                    uint32_t energy = acEnergyCu(curFrame, block_x, block_y);
136
-                    qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (14.427f + 2 * (X265_DEPTH - 8)));
137
-                }
138
-                curFrame->m_lowres.qpAqOffset[block_xy] = qp_adj;
139
-                curFrame->m_lowres.qpCuTreeOffset[block_xy] = qp_adj;
140
-                curFrame->m_lowres.invQscaleFactor[block_xy] = x265_exp2fix8(qp_adj);
141
-                block_xy++;
142
-            }
143
-        }
144
-    }
145
-
146
-    if (m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred)
147
-    {
148
-        int hShift = CHROMA_H_SHIFT(m_param->internalCsp);
149
-        int vShift = CHROMA_V_SHIFT(m_param->internalCsp);
150
-        maxCol = ((maxCol + 8) >> 4) << 4;
151
-        maxRow = ((maxRow + 8) >> 4) << 4;
152
-        int width[3]  = { maxCol, maxCol >> hShift, maxCol >> hShift };
153
-        int height[3] = { maxRow, maxRow >> vShift, maxRow >> vShift };
154
-
155
-        for (int i = 0; i < 3; i++)
156
-        {
157
-            uint64_t sum, ssd;
158
-            sum = curFrame->m_lowres.wp_sum[i];
159
-            ssd = curFrame->m_lowres.wp_ssd[i];
160
-            curFrame->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]);
161
-        }
162
-    }
163
-}
164
-
165
-RateControl::RateControl(x265_param *p)
166
+RateControl::RateControl(x265_param& p)
167
 {
168
-    m_param = p;
169
+    m_param = &p;
170
     int lowresCuWidth = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
171
     int lowresCuHeight = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
172
     m_ncu = lowresCuWidth * lowresCuHeight;
173
@@ -329,13 +176,11 @@
174
     m_partialResidualCost = 0;
175
     m_rateFactorMaxIncrement = 0;
176
     m_rateFactorMaxDecrement = 0;
177
-    m_fps = m_param->fpsNum / m_param->fpsDenom;
178
+    m_fps = (double)m_param->fpsNum / m_param->fpsDenom;
179
     m_startEndOrder.set(0);
180
     m_bTerminated = false;
181
     m_finalFrameCount = 0;
182
     m_numEntries = 0;
183
-    m_amortizeFraction = 0.85;
184
-    m_amortizeFrames = 75;
185
     if (m_param->rc.rateControlMode == X265_RC_CRF)
186
     {
187
         m_param->rc.qp = (int)m_param->rc.rfConstant;
188
@@ -371,6 +216,7 @@
189
     m_statFileOut = NULL;
190
     m_cutreeStatFileOut = m_cutreeStatFileIn = NULL;
191
     m_rce2Pass = NULL;
192
+    m_lastBsliceSatdCost = 0;
193
 
194
     // vbv initialization
195
     m_param->rc.vbvBufferSize = x265_clip3(0, 2000000, m_param->rc.vbvBufferSize);
196
@@ -424,11 +270,6 @@
197
         x265_log(m_param, X265_LOG_WARNING, "strict CBR set without CBR mode, ignored\n");
198
         m_param->rc.bStrictCbr = 0;
199
     }
200
-    if (m_param->totalFrames <= 2 * m_fps && m_param->rc.bStrictCbr) /* Strict CBR segment encode */
201
x265_1.5.tar.gz/source/encoder/ratecontrol.h -> x265_1.6.tar.gz/source/encoder/ratecontrol.h Changed
201
 
1
@@ -34,14 +34,16 @@
2
 
3
 class Encoder;
4
 class Frame;
5
-struct SPS;
6
 class SEIBufferingPeriod;
7
+struct SPS;
8
 #define BASE_FRAME_DURATION 0.04
9
 
10
 /* Arbitrary limitations as a sanity check. */
11
 #define MAX_FRAME_DURATION 1.00
12
 #define MIN_FRAME_DURATION 0.01
13
 
14
+#define MIN_AMORTIZE_FRAME 10
15
+#define MIN_AMORTIZE_FRACTION 0.2
16
 #define CLIP_DURATION(f) x265_clip3(MIN_FRAME_DURATION, MAX_FRAME_DURATION, f)
17
 
18
 /* Current frame stats for 2 pass */
19
@@ -79,46 +81,50 @@
20
 
21
 struct RateControlEntry
22
 {
23
-    int64_t lastSatd; /* Contains the picture cost of the previous frame, required for resetAbr and VBV */
24
-    int sliceType;
25
-    int bframes;
26
-    int poc;
27
-    int encodeOrder;
28
-    int64_t leadingNoBSatd;
29
-    bool bLastMiniGopBFrame;
30
-    double blurredComplexity;
31
-    double qpaRc;
32
-    double qpAq;
33
-    double qRceq;
34
-    double frameSizePlanned;  /* frame Size decided by RateCotrol before encoding the frame */
35
-    double bufferRate;
36
-    double movingAvgSum;
37
-    double   rowCplxrSum;
38
-    int64_t  rowTotalBits;  /* update cplxrsum and totalbits at the end of 2 rows */
39
-    double qpNoVbv;
40
-    double bufferFill;
41
-    double frameDuration;
42
-    double clippedDuration;
43
-    Predictor rowPreds[3][2];
44
+    Predictor  rowPreds[3][2];
45
     Predictor* rowPred[2];
46
-    double frameSizeEstimated;  /* hold frameSize, updated from cu level vbv rc */
47
-    double frameSizeMaximum;  /* max frame Size according to minCR restrictions and level of the video */
48
-    bool isActive;
49
-    SEIPictureTiming *picTimingSEI;
50
-    HRDTiming        *hrdTiming;
51
+
52
+    int64_t lastSatd;      /* Contains the picture cost of the previous frame, required for resetAbr and VBV */
53
+    int64_t leadingNoBSatd;
54
+    int64_t rowTotalBits;  /* update cplxrsum and totalbits at the end of 2 rows */
55
+    double  blurredComplexity;
56
+    double  qpaRc;
57
+    double  qpAq;
58
+    double  qRceq;
59
+    double  frameSizePlanned;  /* frame Size decided by RateCotrol before encoding the frame */
60
+    double  bufferRate;
61
+    double  movingAvgSum;
62
+    double  rowCplxrSum;
63
+    double  qpNoVbv;
64
+    double  bufferFill;
65
+    double  frameDuration;
66
+    double  clippedDuration;
67
+    double  frameSizeEstimated; /* hold frameSize, updated from cu level vbv rc */
68
+    double  frameSizeMaximum;   /* max frame Size according to minCR restrictions and level of the video */
69
+    int     sliceType;
70
+    int     bframes;
71
+    int     poc;
72
+    int     encodeOrder;
73
+    bool    bLastMiniGopBFrame;
74
+    bool    isActive;
75
+    double  amortizeFrames;
76
+    double  amortizeFraction;
77
     /* Required in 2-pass rate control */
78
-    double iCuCount;
79
-    double pCuCount;
80
-    double skipCuCount;
81
-    bool keptAsRef;
82
-    double expectedVbv;
83
-    double qScale;
84
-    double newQScale;
85
-    double newQp;
86
-    int mvBits;
87
-    int miscBits;
88
-    int coeffBits;
89
     uint64_t expectedBits; /* total expected bits up to the current frame (current one excluded) */
90
+    double   iCuCount;
91
+    double   pCuCount;
92
+    double   skipCuCount;
93
+    double   expectedVbv;
94
+    double   qScale;
95
+    double   newQScale;
96
+    double   newQp;
97
+    int      mvBits;
98
+    int      miscBits;
99
+    int      coeffBits;
100
+    bool     keptAsRef;
101
+
102
+    SEIPictureTiming *picTimingSEI;
103
+    HRDTiming        *hrdTiming;
104
 };
105
 
106
 class RateControl
107
@@ -139,7 +145,7 @@
108
     bool   m_isAbrReset;
109
     int    m_lastAbrResetPoc;
110
 
111
-    double  m_rateTolerance;
112
+    double m_rateTolerance;
113
     double m_frameDuration;     /* current frame duration in seconds */
114
     double m_bitrate;
115
     double m_rateFactorConstant;
116
@@ -154,33 +160,38 @@
117
     Predictor m_pred[5];
118
     Predictor m_predBfromP;
119
 
120
-    int       m_leadingBframes;
121
-    int64_t   m_bframeBits;
122
-    int64_t   m_currentSatd;
123
-    int       m_qpConstant[3];
124
-    double    m_ipOffset;
125
-    double    m_pbOffset;
126
-
127
-    int      m_lastNonBPictType;
128
-    int64_t  m_leadingNoBSatd;
129
-
130
-    double   m_cplxrSum;          /* sum of bits*qscale/rceq */
131
-    double   m_wantedBitsWindow;  /* target bitrate * window */
132
-    double   m_accumPQp;          /* for determining I-frame quant */
133
-    double   m_accumPNorm;
134
-    double   m_lastQScaleFor[3];  /* last qscale for a specific pict type, used for max_diff & ipb factor stuff */
135
-    double   m_lstep;
136
-    double   m_shortTermCplxSum;
137
-    double   m_shortTermCplxCount;
138
-    double   m_lastRceq;
139
-    double   m_qCompress;
140
-    int64_t  m_totalBits;        /* total bits used for already encoded frames (after ammortization) */
141
-    int      m_framesDone;       /* # of frames passed through RateCotrol already */
142
-    int64_t  m_encodedBits;      /* bits used for encoded frames (without ammortization) */
143
-    double   m_fps;
144
-    int64_t  m_satdCostWindow[50];
145
-    int      m_sliderPos;
146
-    int64_t  m_encodedBitsWindow[50];
147
+    int64_t m_leadingNoBSatd;
148
+    double  m_ipOffset;
149
+    double  m_pbOffset;
150
+    int64_t m_bframeBits;
151
+    int64_t m_currentSatd;
152
+    int     m_leadingBframes;
153
+    int     m_qpConstant[3];
154
+    int     m_lastNonBPictType;
155
+    int     m_framesDone;        /* # of frames passed through RateCotrol already */
156
+
157
+    double  m_cplxrSum;          /* sum of bits*qscale/rceq */
158
+    double  m_wantedBitsWindow;  /* target bitrate * window */
159
+    double  m_accumPQp;          /* for determining I-frame quant */
160
+    double  m_accumPNorm;
161
+    double  m_lastQScaleFor[3];  /* last qscale for a specific pict type, used for max_diff & ipb factor stuff */
162
+    double  m_lstep;
163
+    double  m_shortTermCplxSum;
164
+    double  m_shortTermCplxCount;
165
+    double  m_lastRceq;
166
+    double  m_qCompress;
167
+    int64_t m_totalBits;        /* total bits used for already encoded frames (after ammortization) */
168
+    int64_t m_encodedBits;      /* bits used for encoded frames (without ammortization) */
169
+    double  m_fps;
170
+    int64_t m_satdCostWindow[50];
171
+    int64_t m_encodedBitsWindow[50];
172
+    int     m_sliderPos;
173
+
174
+    /* To detect a pattern of low detailed static frames in single pass ABR using satdcosts */
175
+    int64_t m_lastBsliceSatdCost;
176
+    int     m_numBframesInPattern;
177
+    bool    m_isPatternPresent;
178
+
179
     /* a common variable on which rateControlStart, rateControlEnd and rateControUpdateStats waits to
180
      * sync the calls to these functions. For example
181
      * -F2:
182
@@ -194,24 +205,25 @@
183
      * rceUpdate 12
184
      * rceEnd    11 */
185
     ThreadSafeInteger m_startEndOrder;
186
-    int      m_finalFrameCount;   /* set when encoder begins flushing */
187
-    bool     m_bTerminated;       /* set true when encoder is closing */
188
+    int     m_finalFrameCount;   /* set when encoder begins flushing */
189
+    bool    m_bTerminated;       /* set true when encoder is closing */
190
 
191
     /* hrd stuff */
192
     SEIBufferingPeriod m_bufPeriodSEI;
193
-    double   m_nominalRemovalTime;
194
-    double   m_prevCpbFinalAT;
195
+    double  m_nominalRemovalTime;
196
+    double  m_prevCpbFinalAT;
197
 
198
     /* 2 pass */
199
-    bool     m_2pass;
200
-    FILE*    m_statFileOut;
201
x265_1.5.tar.gz/source/encoder/sao.cpp -> x265_1.6.tar.gz/source/encoder/sao.cpp Changed
10
 
1
@@ -261,6 +261,8 @@
2
     int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
3
     int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
4
 
5
+    memset(_upBuff1 + MAX_CU_SIZE, 0, 2 * sizeof(int8_t)); /* avoid valgrind uninit warnings */
6
+
7
     {
8
         const pixel* recR = &rec[ctuWidth - 1];
9
         for (int i = 0; i < ctuHeight + 1; i++)
10
x265_1.5.tar.gz/source/encoder/search.cpp -> x265_1.6.tar.gz/source/encoder/search.cpp Changed
201
 
1
@@ -30,6 +30,9 @@
2
 #include "entropy.h"
3
 #include "rdcost.h"
4
 
5
+#include "analysis.h"  // TLD
6
+#include "framedata.h"
7
+
8
 using namespace x265;
9
 
10
 #if _MSC_VER
11
@@ -40,10 +43,9 @@
12
 
13
 #define MVP_IDX_BITS 1
14
 
15
-ALIGN_VAR_32(const pixel, Search::zeroPixel[MAX_CU_SIZE]) = { 0 };
16
 ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 };
17
 
18
-Search::Search() : JobProvider(NULL)
19
+Search::Search()
20
 {
21
     memset(m_rqt, 0, sizeof(m_rqt));
22
 
23
@@ -54,25 +56,30 @@
24
     }
25
 
26
     m_numLayers = 0;
27
+    m_intraPred = NULL;
28
+    m_intraPredAngs = NULL;
29
+    m_fencScaled = NULL;
30
+    m_fencTransposed = NULL;
31
+    m_tsCoeff = NULL;
32
+    m_tsResidual = NULL;
33
+    m_tsRecon = NULL;
34
     m_param = NULL;
35
     m_slice = NULL;
36
     m_frame = NULL;
37
-    m_bJobsQueued = false;
38
-    m_totalNumME = m_numAcquiredME = m_numCompletedME = 0;
39
 }
40
 
41
 bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
42
 {
43
     uint32_t maxLog2CUSize = g_log2Size[param.maxCUSize];
44
     m_param = &param;
45
-    m_bEnableRDOQ = param.rdLevel >= 4;
46
+    m_bEnableRDOQ = !!param.rdoqLevel;
47
     m_bFrameParallel = param.frameNumThreads > 1;
48
     m_numLayers = g_log2Size[param.maxCUSize] - 2;
49
 
50
     m_rdCost.setPsyRdScale(param.psyRd);
51
     m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp);
52
 
53
-    bool ok = m_quant.init(m_bEnableRDOQ, param.psyRdoq, scalingList, m_entropyCoder);
54
+    bool ok = m_quant.init(param.rdoqLevel, param.psyRdoq, scalingList, m_entropyCoder);
55
     if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
56
         ok &= m_quant.allocNoiseReduction(param);
57
 
58
@@ -116,6 +123,15 @@
59
     m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
60
     m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
61
 
62
+    CHECKED_MALLOC(m_intraPred, pixel, (32 * 32) * (33 + 3));
63
+    m_fencScaled = m_intraPred + 32 * 32;
64
+    m_fencTransposed = m_fencScaled + 32 * 32;
65
+    m_intraPredAngs = m_fencTransposed + 32 * 32;
66
+
67
+    CHECKED_MALLOC(m_tsCoeff,    coeff_t, MAX_TS_SIZE * MAX_TS_SIZE);
68
+    CHECKED_MALLOC(m_tsResidual, int16_t, MAX_TS_SIZE * MAX_TS_SIZE);
69
+    CHECKED_MALLOC(m_tsRecon,    pixel,   MAX_TS_SIZE * MAX_TS_SIZE);
70
+
71
     return ok;
72
 
73
 fail:
74
@@ -141,6 +157,10 @@
75
 
76
     X265_FREE(m_qtTempCbf[0]);
77
     X265_FREE(m_qtTempTransformSkipFlag[0]);
78
+    X265_FREE(m_intraPred);
79
+    X265_FREE(m_tsCoeff);
80
+    X265_FREE(m_tsResidual);
81
+    X265_FREE(m_tsRecon);
82
 }
83
 
84
 void Search::setQP(const Slice& slice, int qp)
85
@@ -421,7 +441,7 @@
86
     }
87
 
88
     // set reconstruction for next intra prediction blocks if full TU prediction won
89
-    pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
90
+    pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
91
     intptr_t picStride = m_frame->m_reconPic->m_stride;
92
     primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
93
 
94
@@ -477,17 +497,14 @@
95
     if (m_bEnableRDOQ)
96
         m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
97
 
98
-    ALIGN_VAR_32(coeff_t, tsCoeffY[MAX_TS_SIZE * MAX_TS_SIZE]);
99
-    ALIGN_VAR_32(pixel,   tsReconY[MAX_TS_SIZE * MAX_TS_SIZE]);
100
-
101
     int checkTransformSkip = 1;
102
     for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
103
     {
104
         uint64_t tmpCost;
105
         uint32_t tmpEnergy = 0;
106
 
107
-        coeff_t* coeff = (useTSkip ? tsCoeffY : coeffY);
108
-        pixel*   tmpRecon = (useTSkip ? tsReconY : reconQt);
109
+        coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffY);
110
+        pixel*   tmpRecon = (useTSkip ? m_tsRecon : reconQt);
111
         uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
112
 
113
         primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
114
@@ -578,8 +595,8 @@
115
 
116
     if (bTSkip)
117
     {
118
-        memcpy(coeffY, tsCoeffY, sizeof(coeff_t) << (log2TrSize * 2));
119
-        primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, tsReconY, tuSize);
120
+        memcpy(coeffY, m_tsCoeff, sizeof(coeff_t) << (log2TrSize * 2));
121
+        primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, m_tsRecon, tuSize);
122
     }
123
     else if (checkTransformSkip)
124
     {
125
@@ -589,7 +606,7 @@
126
     }
127
 
128
     // set reconstruction for next intra prediction blocks
129
-    pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
130
+    pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
131
     intptr_t picStride = m_frame->m_reconPic->m_stride;
132
     primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
133
 
134
@@ -639,7 +656,7 @@
135
         uint32_t sizeIdx   = log2TrSize - 2;
136
         primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
137
 
138
-        pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
139
+        pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
140
         intptr_t picStride = m_frame->m_reconPic->m_stride;
141
 
142
         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
143
@@ -799,7 +816,7 @@
144
             coeff_t* coeffC        = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
145
             pixel*   reconQt       = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
146
             uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
147
-            pixel*   picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
148
+            pixel*   picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
149
             intptr_t picStride = m_frame->m_reconPic->m_strideC;
150
 
151
             uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
152
@@ -812,7 +829,7 @@
153
             initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
154
 
155
             // get prediction signal
156
-            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC, m_csp);
157
+            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
158
             cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
159
 
160
             primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
161
@@ -864,9 +881,6 @@
162
      * condition as it arrived, and to do all bit estimates from the same state. */
163
     m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
164
 
165
-    ALIGN_VAR_32(coeff_t, tskipCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
166
-    ALIGN_VAR_32(pixel,   tskipReconC[MAX_TS_SIZE * MAX_TS_SIZE]);
167
-
168
     uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
169
     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
170
 
171
@@ -903,7 +917,7 @@
172
                 chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
173
 
174
             // get prediction signal
175
-            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC, m_csp);
176
+            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC);
177
 
178
             uint64_t bCost = MAX_INT64;
179
             uint32_t bDist = 0;
180
@@ -914,8 +928,8 @@
181
             int checkTransformSkip = 1;
182
             for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
183
             {
184
-                coeff_t* coeff = (useTSkip ? tskipCoeffC : coeffC);
185
-                pixel*   recon = (useTSkip ? tskipReconC : reconQt);
186
+                coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffC);
187
+                pixel*   recon = (useTSkip ? m_tsRecon : reconQt);
188
                 uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
189
 
190
                 primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
191
@@ -972,14 +986,14 @@
192
 
193
             if (bTSkip)
194
             {
195
-                memcpy(coeffC, tskipCoeffC, sizeof(coeff_t) << (log2TrSizeC * 2));
196
-                primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE);
197
+                memcpy(coeffC, m_tsCoeff, sizeof(coeff_t) << (log2TrSizeC * 2));
198
+                primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, m_tsRecon, MAX_TS_SIZE);
199
             }
200
 
201
x265_1.5.tar.gz/source/encoder/search.h -> x265_1.6.tar.gz/source/encoder/search.h Changed
201
 
1
@@ -28,6 +28,7 @@
2
 #include "predict.h"
3
 #include "quant.h"
4
 #include "bitcost.h"
5
+#include "framedata.h"
6
 #include "yuv.h"
7
 #include "threadpool.h"
8
 
9
@@ -35,6 +36,18 @@
10
 #include "entropy.h"
11
 #include "motion.h"
12
 
13
+#if DETAILED_CU_STATS
14
+#define ProfileCUScopeNamed(name, cu, acc, count) \
15
+    m_stats[cu.m_encData->m_frameEncoderID].count++; \
16
+    ScopedElapsedTime name(m_stats[cu.m_encData->m_frameEncoderID].acc)
17
+#define ProfileCUScope(cu, acc, count) ProfileCUScopeNamed(timedScope, cu, acc, count)
18
+#define ProfileCounter(cu, count) m_stats[cu.m_encData->m_frameEncoderID].count++;
19
+#else
20
+#define ProfileCUScopeNamed(name, cu, acc, count)
21
+#define ProfileCUScope(cu, acc, count)
22
+#define ProfileCounter(cu, count)
23
+#endif
24
+
25
 namespace x265 {
26
 // private namespace
27
 
28
@@ -88,6 +101,10 @@
29
     MotionData bestME[MAX_INTER_PARTS][2];
30
     MV         amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS];
31
 
32
+    // Neighbour MVs of the current partition. 5 spatial candidates and the
33
+    // temporal candidate.
34
+    InterNeighbourMV interNeighbours[6];
35
+
36
     uint64_t   rdCost;     // sum of partition (psy) RD costs          (sse(fenc, recon) + lambda2 * bits)
37
     uint64_t   sa8dCost;   // sum of partition sa8d distortion costs   (sa8d(fenc, pred) + lambda * bits)
38
     uint32_t   sa8dBits;   // signal bits used in sa8dCost calculation
39
@@ -109,8 +126,35 @@
40
         coeffBits = 0;
41
     }
42
 
43
+    void invalidate()
44
+    {
45
+        /* set costs to invalid data, catch uninitialized re-use */
46
+        rdCost = UINT64_MAX / 2;
47
+        sa8dCost = UINT64_MAX / 2;
48
+        sa8dBits = MAX_UINT / 2;
49
+        psyEnergy = MAX_UINT / 2;
50
+        distortion = MAX_UINT / 2;
51
+        totalBits = MAX_UINT / 2;
52
+        mvBits = MAX_UINT / 2;
53
+        coeffBits = MAX_UINT / 2;
54
+    }
55
+
56
+    bool ok() const
57
+    {
58
+        return !(rdCost >= UINT64_MAX / 2 ||
59
+                 sa8dCost >= UINT64_MAX / 2 ||
60
+                 sa8dBits >= MAX_UINT / 2 ||
61
+                 psyEnergy >= MAX_UINT / 2 ||
62
+                 distortion >= MAX_UINT / 2 ||
63
+                 totalBits >= MAX_UINT / 2 ||
64
+                 mvBits >= MAX_UINT / 2 ||
65
+                 coeffBits >= MAX_UINT / 2);
66
+    }
67
+
68
     void addSubCosts(const Mode& subMode)
69
     {
70
+        X265_CHECK(subMode.ok(), "sub-mode not initialized");
71
+
72
         rdCost += subMode.rdCost;
73
         sa8dCost += subMode.sa8dCost;
74
         sa8dBits += subMode.sa8dBits;
75
@@ -122,16 +166,89 @@
76
     }
77
 };
78
 
79
+#if DETAILED_CU_STATS
80
+/* This structure is intended for performance debugging and we make no attempt
81
+ * to handle dynamic range overflows. Care should be taken to avoid long encodes
82
+ * if you care about the accuracy of these elapsed times and counters. This
83
+ * profiling is orthogonal to PPA/VTune and can be enabled independently from
84
+ * either of them */
85
+struct CUStats
86
+{
87
+    int64_t  intraRDOElapsedTime[NUM_CU_DEPTH]; // elapsed worker time in intra RDO per CU depth
88
+    int64_t  interRDOElapsedTime[NUM_CU_DEPTH]; // elapsed worker time in inter RDO per CU depth
89
+    int64_t  intraAnalysisElapsedTime;          // elapsed worker time in intra sa8d analysis
90
+    int64_t  motionEstimationElapsedTime;       // elapsed worker time in predInterSearch()
91
+    int64_t  loopFilterElapsedTime;             // elapsed worker time in deblock and SAO and PSNR/SSIM
92
+    int64_t  pmeTime;                           // elapsed worker time processing ME slave jobs
93
+    int64_t  pmeBlockTime;                      // elapsed worker time blocked for pme batch completion
94
+    int64_t  pmodeTime;                         // elapsed worker time processing pmode slave jobs
95
+    int64_t  pmodeBlockTime;                    // elapsed worker time blocked for pmode batch completion
96
+    int64_t  weightAnalyzeTime;                 // elapsed worker time analyzing reference weights
97
+    int64_t  totalCTUTime;                      // elapsed worker time in compressCTU (includes pmode master)
98
+
99
+    uint64_t countIntraRDO[NUM_CU_DEPTH];
100
+    uint64_t countInterRDO[NUM_CU_DEPTH];
101
+    uint64_t countIntraAnalysis;
102
+    uint64_t countMotionEstimate;
103
+    uint64_t countLoopFilter;
104
+    uint64_t countPMETasks;
105
+    uint64_t countPMEMasters;
106
+    uint64_t countPModeTasks;
107
+    uint64_t countPModeMasters;
108
+    uint64_t countWeightAnalyze;
109
+    uint64_t totalCTUs;
110
+
111
+    CUStats() { clear(); }
112
+
113
+    void clear()
114
+    {
115
+        memset(this, 0, sizeof(*this));
116
+    }
117
+
118
+    void accumulate(CUStats& other)
119
+    {
120
+        for (uint32_t i = 0; i <= g_maxCUDepth; i++)
121
+        {
122
+            intraRDOElapsedTime[i] += other.intraRDOElapsedTime[i];
123
+            interRDOElapsedTime[i] += other.interRDOElapsedTime[i];
124
+            countIntraRDO[i] += other.countIntraRDO[i];
125
+            countInterRDO[i] += other.countInterRDO[i];
126
+        }
127
+
128
+        intraAnalysisElapsedTime += other.intraAnalysisElapsedTime;
129
+        motionEstimationElapsedTime += other.motionEstimationElapsedTime;
130
+        loopFilterElapsedTime += other.loopFilterElapsedTime;
131
+        pmeTime += other.pmeTime;
132
+        pmeBlockTime += other.pmeBlockTime;
133
+        pmodeTime += other.pmodeTime;
134
+        pmodeBlockTime += other.pmodeBlockTime;
135
+        weightAnalyzeTime += other.weightAnalyzeTime;
136
+        totalCTUTime += other.totalCTUTime;
137
+
138
+        countIntraAnalysis += other.countIntraAnalysis;
139
+        countMotionEstimate += other.countMotionEstimate;
140
+        countLoopFilter += other.countLoopFilter;
141
+        countPMETasks += other.countPMETasks;
142
+        countPMEMasters += other.countPMEMasters;
143
+        countPModeTasks += other.countPModeTasks;
144
+        countPModeMasters += other.countPModeMasters;
145
+        countWeightAnalyze += other.countWeightAnalyze;
146
+        totalCTUs += other.totalCTUs;
147
+
148
+        other.clear();
149
+    }
150
+}; 
151
+#endif
152
+
153
 inline int getTUBits(int idx, int numIdx)
154
 {
155
     return idx + (idx < numIdx - 1);
156
 }
157
 
158
-class Search : public JobProvider, public Predict
159
+class Search : public Predict
160
 {
161
 public:
162
 
163
-    static const pixel   zeroPixel[MAX_CU_SIZE];
164
     static const int16_t zeroShort[MAX_CU_SIZE];
165
 
166
     MotionEstimate  m_me;
167
@@ -147,11 +264,25 @@
168
     uint8_t*        m_qtTempCbf[3];
169
     uint8_t*        m_qtTempTransformSkipFlag[3];
170
 
171
+    pixel*          m_fencScaled;     /* 32x32 buffer for down-scaled version of 64x64 CU fenc */
172
+    pixel*          m_fencTransposed; /* 32x32 buffer for transposed copy of fenc */
173
+    pixel*          m_intraPred;      /* 32x32 buffer for individual intra predictions */
174
+    pixel*          m_intraPredAngs;  /* allocation for 33 consecutive (all angular) 32x32 intra predictions */
175
+
176
+    coeff_t*        m_tsCoeff;        /* transform skip coeff 32x32 */
177
+    int16_t*        m_tsResidual;     /* transform skip residual 32x32 */
178
+    pixel*          m_tsRecon;        /* transform skip reconstructed pixels 32x32 */
179
+
180
     bool            m_bFrameParallel;
181
     bool            m_bEnableRDOQ;
182
     uint32_t        m_numLayers;
183
     uint32_t        m_refLagPixels;
184
 
185
+#if DETAILED_CU_STATS
186
+    /* Accumulate CU statistics separately for each frame encoder */
187
+    CUStats         m_stats[X265_MAX_FRAME_THREADS];
188
+#endif
189
+
190
     Search();
191
     ~Search();
192
 
193
@@ -162,7 +293,7 @@
194
     void     invalidateContexts(int fromDepth);
195
 
196
     // full RD search of intra modes. if sharedModes is not NULL, it directly uses them
197
-    void     checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes);
198
+    void     checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes, uint8_t* sharedChromaModes);
199
 
200
     // select best intra mode using only sa8d costs, cannot measure NxN intra
201
x265_1.5.tar.gz/source/encoder/slicetype.cpp -> x265_1.6.tar.gz/source/encoder/slicetype.cpp Changed
201
 
1
@@ -34,11 +34,17 @@
2
 #include "motion.h"
3
 #include "ratecontrol.h"
4
 
5
-#define NUM_CUS (m_widthInCU > 2 && m_heightInCU > 2 ? (m_widthInCU - 2) * (m_heightInCU - 2) : m_widthInCU * m_heightInCU)
6
+#if DETAILED_CU_STATS
7
+#define ProfileLookaheadTime(elapsed, count) ScopedElapsedTime _scope(elapsed); count++
8
+#else
9
+#define ProfileLookaheadTime(elapsed, count)
10
+#endif
11
 
12
 using namespace x265;
13
 
14
-static inline int16_t median(int16_t a, int16_t b, int16_t c)
15
+namespace {
16
+
17
+inline int16_t median(int16_t a, int16_t b, int16_t c)
18
 {
19
     int16_t t = (a - b) & ((a - b) >> 31);
20
 
21
@@ -49,55 +55,531 @@
22
     return b;
23
 }
24
 
25
-static inline void median_mv(MV &dst, MV a, MV b, MV c)
26
+inline void median_mv(MV &dst, MV a, MV b, MV c)
27
 {
28
     dst.x = median(a.x, b.x, c.x);
29
     dst.y = median(a.y, b.y, c.y);
30
 }
31
 
32
+/* Compute variance to derive AC energy of each block */
33
+inline uint32_t acEnergyVar(Frame *curFrame, uint64_t sum_ssd, int shift, int plane)
34
+{
35
+    uint32_t sum = (uint32_t)sum_ssd;
36
+    uint32_t ssd = (uint32_t)(sum_ssd >> 32);
37
+
38
+    curFrame->m_lowres.wp_sum[plane] += sum;
39
+    curFrame->m_lowres.wp_ssd[plane] += ssd;
40
+    return ssd - ((uint64_t)sum * sum >> shift);
41
+}
42
+
43
+/* Find the energy of each block in Y/Cb/Cr plane */
44
+inline uint32_t acEnergyPlane(Frame *curFrame, pixel* src, intptr_t srcStride, int plane, int colorFormat)
45
+{
46
+    if ((colorFormat != X265_CSP_I444) && plane)
47
+    {
48
+        ALIGN_VAR_8(pixel, pix[8 * 8]);
49
+        primitives.cu[BLOCK_8x8].copy_pp(pix, 8, src, srcStride);
50
+        return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(pix, 8), 6, plane);
51
+    }
52
+    else
53
+        return acEnergyVar(curFrame, primitives.cu[BLOCK_16x16].var(src, srcStride), 8, plane);
54
+}
55
+
56
+} // end anonymous namespace
57
+
58
+/* Find the total AC energy of each block in all planes */
59
+uint32_t LookaheadTLD::acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp)
60
+{
61
+    intptr_t stride = curFrame->m_fencPic->m_stride;
62
+    intptr_t cStride = curFrame->m_fencPic->m_strideC;
63
+    intptr_t blockOffsetLuma = blockX + (blockY * stride);
64
+    int hShift = CHROMA_H_SHIFT(csp);
65
+    int vShift = CHROMA_V_SHIFT(csp);
66
+    intptr_t blockOffsetChroma = (blockX >> hShift) + ((blockY >> vShift) * cStride);
67
+
68
+    uint32_t var;
69
+
70
+    var  = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, csp);
71
+    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp);
72
+    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp);
73
+    x265_emms();
74
+    return var;
75
+}
76
+
77
+void LookaheadTLD::calcAdaptiveQuantFrame(Frame *curFrame, x265_param* param)
78
+{
79
+    /* Actual adaptive quantization */
80
+    int maxCol = curFrame->m_fencPic->m_picWidth;
81
+    int maxRow = curFrame->m_fencPic->m_picHeight;
82
+    int blockWidth = ((param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
83
+    int blockHeight = ((param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
84
+    int blockCount = blockWidth * blockHeight;
85
+
86
+    for (int y = 0; y < 3; y++)
87
+    {
88
+        curFrame->m_lowres.wp_ssd[y] = 0;
89
+        curFrame->m_lowres.wp_sum[y] = 0;
90
+    }
91
+
92
+    /* Calculate Qp offset for each 16x16 block in the frame */
93
+    int blockXY = 0;
94
+    int blockX = 0, blockY = 0;
95
+    double strength = 0.f;
96
+    if (param->rc.aqMode == X265_AQ_NONE || param->rc.aqStrength == 0)
97
+    {
98
+        /* Need to init it anyways for CU tree */
99
+        int cuCount = widthInCU * heightInCU;
100
+
101
+        if (param->rc.aqMode && param->rc.aqStrength == 0)
102
+        {
103
+            memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
104
+            memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
105
+            for (int cuxy = 0; cuxy < cuCount; cuxy++)
106
+                curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
107
+        }
108
+
109
+        /* Need variance data for weighted prediction */
110
+        if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
111
+        {
112
+            for (blockY = 0; blockY < maxRow; blockY += 16)
113
+                for (blockX = 0; blockX < maxCol; blockX += 16)
114
+                    acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
115
+        }
116
+    }
117
+    else
118
+    {
119
+        blockXY = 0;
120
+        double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
121
+        if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
122
+        {
123
+            double bit_depth_correction = pow(1 << (X265_DEPTH - 8), 0.5);
124
+            for (blockY = 0; blockY < maxRow; blockY += 16)
125
+            {
126
+                for (blockX = 0; blockX < maxCol; blockX += 16)
127
+                {
128
+                    uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
129
+                    qp_adj = pow(energy + 1, 0.1);
130
+                    curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
131
+                    avg_adj += qp_adj;
132
+                    avg_adj_pow2 += qp_adj * qp_adj;
133
+                    blockXY++;
134
+                }
135
+            }
136
+
137
+            avg_adj /= blockCount;
138
+            avg_adj_pow2 /= blockCount;
139
+            strength = param->rc.aqStrength * avg_adj / bit_depth_correction;
140
+            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (11.f * bit_depth_correction)) / avg_adj;
141
+        }
142
+        else
143
+            strength = param->rc.aqStrength * 1.0397f;
144
+
145
+        blockXY = 0;
146
+        for (blockY = 0; blockY < maxRow; blockY += 16)
147
+        {
148
+            for (blockX = 0; blockX < maxCol; blockX += 16)
149
+            {
150
+                if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
151
+                {
152
+                    qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
153
+                    qp_adj = strength * (qp_adj - avg_adj);
154
+                }
155
+                else
156
+                {
157
+                    uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
158
+                    qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (14.427f + 2 * (X265_DEPTH - 8)));
159
+                }
160
+                curFrame->m_lowres.qpAqOffset[blockXY] = qp_adj;
161
+                curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
162
+                curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(qp_adj);
163
+                blockXY++;
164
+            }
165
+        }
166
+    }
167
+
168
+    if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
169
+    {
170
+        int hShift = CHROMA_H_SHIFT(param->internalCsp);
171
+        int vShift = CHROMA_V_SHIFT(param->internalCsp);
172
+        maxCol = ((maxCol + 8) >> 4) << 4;
173
+        maxRow = ((maxRow + 8) >> 4) << 4;
174
+        int width[3]  = { maxCol, maxCol >> hShift, maxCol >> hShift };
175
+        int height[3] = { maxRow, maxRow >> vShift, maxRow >> vShift };
176
+
177
+        for (int i = 0; i < 3; i++)
178
+        {
179
+            uint64_t sum, ssd;
180
+            sum = curFrame->m_lowres.wp_sum[i];
181
+            ssd = curFrame->m_lowres.wp_ssd[i];
182
+            curFrame->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]);
183
+        }
184
+    }
185
+}
186
+
187
+void LookaheadTLD::lowresIntraEstimate(Lowres& fenc)
188
+{
189
+    ALIGN_VAR_32(pixel, prediction[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
190
+    pixel fencIntra[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE];
191
+    pixel neighbours[2][X265_LOWRES_CU_SIZE * 4 + 1];
192
+    pixel* samples = neighbours[0], *filtered = neighbours[1];
193
+
194
+    const int lookAheadLambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
195
+    const int intraPenalty = 5 * lookAheadLambda;
196
+    const int lowresPenalty = 4; /* fixed CU cost overhead */
197
+
198
+    const int cuSize  = X265_LOWRES_CU_SIZE;
199
+    const int cuSize2 = cuSize << 1;
200
+    const int sizeIdx = X265_LOWRES_CU_BITS - 2;
201
x265_1.5.tar.gz/source/encoder/slicetype.h -> x265_1.6.tar.gz/source/encoder/slicetype.h Changed
201
 
1
@@ -28,141 +28,135 @@
2
 #include "slice.h"
3
 #include "motion.h"
4
 #include "piclist.h"
5
-#include "wavefront.h"
6
+#include "threadpool.h"
7
 
8
 namespace x265 {
9
 // private namespace
10
 
11
 struct Lowres;
12
 class Frame;
13
+class Lookahead;
14
 
15
 #define LOWRES_COST_MASK  ((1 << 14) - 1)
16
 #define LOWRES_COST_SHIFT 14
17
 
18
-#define SET_WEIGHT(w, b, s, d, o) \
19
-    { \
20
-        (w).inputWeight = (s); \
21
-        (w).log2WeightDenom = (d); \
22
-        (w).inputOffset = (o); \
23
-        (w).bPresentFlag = b; \
24
-    }
25
-
26
-class EstimateRow
27
+/* Thread local data for lookahead tasks */
28
+struct LookaheadTLD
29
 {
30
-public:
31
-    x265_param*         m_param;
32
-    MotionEstimate      m_me;
33
-    Lock                m_lock;
34
-
35
-    volatile uint32_t   m_completed;      // Number of CUs in this row for which cost estimation is completed
36
-    volatile bool       m_active;
37
-
38
-    uint64_t            m_costEst;        // Estimated cost for all CUs in a row
39
-    uint64_t            m_costEstAq;      // Estimated weight Aq cost for all CUs in a row
40
-    uint64_t            m_costIntraAq;    // Estimated weighted Aq Intra cost for all CUs in a row
41
-    int                 m_intraMbs;       // Number of Intra CUs
42
-    int                 m_costIntra;      // Estimated Intra cost for all CUs in a row
43
-
44
-    int                 m_merange;
45
-    int                 m_lookAheadLambda;
46
-
47
-    int                 m_widthInCU;
48
-    int                 m_heightInCU;
49
-
50
-    EstimateRow()
51
+    MotionEstimate  me;
52
+    ReferencePlanes weightedRef;
53
+    pixel*          wbuffer[4];
54
+    int             widthInCU;
55
+    int             heightInCU;
56
+    int             ncu;
57
+    int             paddedLines;
58
+
59
+#if DETAILED_CU_STATS
60
+    int64_t         batchElapsedTime;
61
+    int64_t         coopSliceElapsedTime;
62
+    uint64_t        countBatches;
63
+    uint64_t        countCoopSlices;
64
+#endif
65
+
66
+    LookaheadTLD()
67
     {
68
-        m_me.setQP(X265_LOOKAHEAD_QP);
69
-        m_me.init(X265_HEX_SEARCH, 1, X265_CSP_I400);
70
-        m_merange = 16;
71
-        m_lookAheadLambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
72
+        me.setQP(X265_LOOKAHEAD_QP);
73
+        me.init(X265_HEX_SEARCH, 1, X265_CSP_I400);
74
+        for (int i = 0; i < 4; i++)
75
+            wbuffer[i] = NULL;
76
+        widthInCU = heightInCU = ncu = paddedLines = 0;
77
+
78
+#if DETAILED_CU_STATS
79
+        batchElapsedTime = 0;
80
+        coopSliceElapsedTime = 0;
81
+        countBatches = 0;
82
+        countCoopSlices = 0;
83
+#endif
84
     }
85
 
86
-    void init();
87
-
88
-    void estimateCUCost(Lowres * *frames, ReferencePlanes * wfref0, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2]);
89
-};
90
-
91
-/* CostEstimate manages the cost estimation of a single frame, ie:
92
- * estimateFrameCost() and everything below it in the call graph */
93
-class CostEstimate : public WaveFront
94
-{
95
-public:
96
-    CostEstimate(ThreadPool *p);
97
-    ~CostEstimate();
98
-    void init(x265_param *, Frame *);
99
-
100
-    x265_param      *m_param;
101
-    EstimateRow     *m_rows;
102
-    pixel           *m_wbuffer[4];
103
-    Lowres         **m_curframes;
104
-
105
-    ReferencePlanes  m_weightedRef;
106
-    WeightParam      m_w;
107
+    void init(int w, int h, int n)
108
+    {
109
+        widthInCU = w;
110
+        heightInCU = h;
111
+        ncu = n;
112
+    }
113
 
114
-    int              m_paddedLines;     // number of lines in padded frame
115
-    int              m_widthInCU;       // width of lowres frame in downscale CUs
116
-    int              m_heightInCU;      // height of lowres frame in downscale CUs
117
+    ~LookaheadTLD() { X265_FREE(wbuffer[0]); }
118
 
119
-    bool             m_bDoSearch[2];
120
-    volatile bool    m_bFrameCompleted;
121
-    int              m_curb, m_curp0, m_curp1;
122
+    void calcAdaptiveQuantFrame(Frame *curFrame, x265_param* param);
123
+    void lowresIntraEstimate(Lowres& fenc);
124
 
125
-    void     processRow(int row, int threadId);
126
-    int64_t  estimateFrameCost(Lowres **frames, int p0, int p1, int b, bool bIntraPenalty);
127
+    void weightsAnalyse(Lowres& fenc, Lowres& ref);
128
 
129
 protected:
130
 
131
-    void     weightsAnalyse(Lowres **frames, int b, int p0);
132
-    uint32_t weightCostLuma(Lowres **frames, int b, int p0, WeightParam *w);
133
+    uint32_t acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp);
134
+    uint32_t weightCostLuma(Lowres& fenc, Lowres& ref, WeightParam& wp);
135
+    bool     allocWeightedRef(Lowres& fenc);
136
 };
137
 
138
 class Lookahead : public JobProvider
139
 {
140
 public:
141
 
142
+    PicList       m_inputQueue;      // input pictures in order received
143
+    PicList       m_outputQueue;     // pictures to be encoded, in encode order
144
+    Lock          m_inputLock;
145
+    Lock          m_outputLock;
146
+
147
+    /* pre-lookahead */
148
+    Frame*        m_preframes[X265_LOOKAHEAD_MAX];
149
+    int           m_preTotal, m_preAcquired, m_preCompleted;
150
+    int           m_fullQueueSize;
151
+    bool          m_isActive;
152
+    bool          m_sliceTypeBusy;
153
+    bool          m_bAdaptiveQuant;
154
+    bool          m_outputSignalRequired;
155
+    bool          m_bBatchMotionSearch;
156
+    bool          m_bBatchFrameCosts;
157
+    Lock          m_preLookaheadLock;
158
+    Event         m_outputSignal;
159
+
160
+    LookaheadTLD* m_tld;
161
+    x265_param*   m_param;
162
+    Lowres*       m_lastNonB;
163
+    int*          m_scratch;         // temp buffer for cutree propagate
164
+    
165
+    int           m_histogram[X265_BFRAME_MAX + 1];
166
+    int           m_lastKeyframe;
167
+    int           m_8x8Width;
168
+    int           m_8x8Height;
169
+    int           m_8x8Blocks;
170
+    int           m_numCoopSlices;
171
+    int           m_numRowsPerSlice;
172
+    bool          m_filled;
173
+
174
     Lookahead(x265_param *param, ThreadPool *pool);
175
-    ~Lookahead();
176
-    void init();
177
-    void destroy();
178
 
179
-    CostEstimate     m_est;             // Frame cost estimator
180
-    PicList          m_inputQueue;      // input pictures in order received
181
-    PicList          m_outputQueue;     // pictures to be encoded, in encode order
182
+#if DETAILED_CU_STATS
183
+    int64_t       m_slicetypeDecideElapsedTime;
184
+    int64_t       m_preLookaheadElapsedTime;
185
+    uint64_t      m_countSlicetypeDecide;
186
+    uint64_t      m_countPreLookahead;
187
+    void          getWorkerStats(int64_t& batchElapsedTime, uint64_t& batchCount, int64_t& coopSliceElapsedTime, uint64_t& coopSliceCount);
188
+#endif
189
 
190
-    x265_param      *m_param;
191
-    Lowres          *m_lastNonB;
192
-    int             *m_scratch;         // temp buffer
193
+    bool    create();
194
+    void    destroy();
195
+    void    stop();
196
 
197
-    int              m_widthInCU;       // width of lowres frame in downscale CUs
198
-    int              m_heightInCU;      // height of lowres frame in downscale CUs
199
-    int              m_lastKeyframe;
200
-    int              m_histogram[X265_BFRAME_MAX + 1];
201
x265_1.5.tar.gz/source/encoder/weightPrediction.cpp -> x265_1.6.tar.gz/source/encoder/weightPrediction.cpp Changed
69
 
1
@@ -27,8 +27,8 @@
2
 #include "frame.h"
3
 #include "picyuv.h"
4
 #include "lowres.h"
5
+#include "slice.h"
6
 #include "mv.h"
7
-#include "slicetype.h"
8
 #include "bitstream.h"
9
 
10
 using namespace x265;
11
@@ -58,6 +58,7 @@
12
 void mcLuma(pixel* mcout, Lowres& ref, const MV * mvs)
13
 {
14
     intptr_t stride = ref.lumaStride;
15
+    const int mvshift = 1 << 2;
16
     const int cuSize = 8;
17
     MV mvmin, mvmax;
18
 
19
@@ -66,15 +67,15 @@
20
     for (int y = 0; y < ref.lines; y += cuSize)
21
     {
22
         intptr_t pixoff = y * stride;
23
-        mvmin.y = (int16_t)((-y - 8) << 2);
24
-        mvmax.y = (int16_t)((ref.lines - y - 1 + 8) << 2);
25
+        mvmin.y = (int16_t)((-y - 8) * mvshift);
26
+        mvmax.y = (int16_t)((ref.lines - y - 1 + 8) * mvshift);
27
 
28
         for (int x = 0; x < ref.width; x += cuSize, pixoff += cuSize, cu++)
29
         {
30
             ALIGN_VAR_16(pixel, buf8x8[8 * 8]);
31
             intptr_t bstride = 8;
32
-            mvmin.x = (int16_t)((-x - 8) << 2);
33
-            mvmax.x = (int16_t)((ref.width - x - 1 + 8) << 2);
34
+            mvmin.x = (int16_t)((-x - 8) * mvshift);
35
+            mvmax.x = (int16_t)((ref.width - x - 1 + 8) * mvshift);
36
 
37
             /* clip MV to available pixels */
38
             MV mv = mvs[cu];
39
@@ -100,6 +101,7 @@
40
     int csp = cache.csp;
41
     int bw = 16 >> cache.hshift;
42
     int bh = 16 >> cache.vshift;
43
+    const int mvshift = 1 << 2;
44
     MV mvmin, mvmax;
45
 
46
     for (int y = 0; y < height; y += bh)
47
@@ -109,8 +111,8 @@
48
          * into the lowres structures */
49
         int cu = y * cache.lowresWidthInCU;
50
         intptr_t pixoff = y * stride;
51
-        mvmin.y = (int16_t)((-y - 8) << 2);
52
-        mvmax.y = (int16_t)((height - y - 1 + 8) << 2);
53
+        mvmin.y = (int16_t)((-y - 8) * mvshift);
54
+        mvmax.y = (int16_t)((height - y - 1 + 8) * mvshift);
55
 
56
         for (int x = 0; x < width; x += bw, cu++, pixoff += bw)
57
         {
58
@@ -122,8 +124,8 @@
59
                 mv.y >>= cache.vshift;
60
 
61
                 /* clip MV to available pixels */
62
-                mvmin.x = (int16_t)((-x - 8) << 2);
63
-                mvmax.x = (int16_t)((width - x - 1 + 8) << 2);
64
+                mvmin.x = (int16_t)((-x - 8) * mvshift);
65
+                mvmax.x = (int16_t)((width - x - 1 + 8) * mvshift);
66
                 mv = mv.clipped(mvmin, mvmax);
67
 
68
                 intptr_t fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2);
69
x265_1.5.tar.gz/source/input/y4m.cpp -> x265_1.6.tar.gz/source/input/y4m.cpp Changed
185
 
1
@@ -177,147 +177,118 @@
2
     int csp = 0;
3
     int d = 0;
4
 
5
-    while (!ifs->eof())
6
+    while (ifs->good())
7
     {
8
         // Skip Y4MPEG string
9
         int c = ifs->get();
10
-        while (!ifs->eof() && (c != ' ') && (c != '\n'))
11
-        {
12
+        while (ifs->good() && (c != ' ') && (c != '\n'))
13
             c = ifs->get();
14
-        }
15
 
16
-        while (c == ' ' && !ifs->eof())
17
+        while (c == ' ' && ifs->good())
18
         {
19
             // read parameter identifier
20
             switch (ifs->get())
21
             {
22
             case 'W':
23
                 width = 0;
24
-                while (!ifs->eof())
25
+                while (ifs->good())
26
                 {
27
                     c = ifs->get();
28
 
29
                     if (c == ' ' || c == '\n')
30
-                    {
31
                         break;
32
-                    }
33
                     else
34
-                    {
35
                         width = width * 10 + (c - '0');
36
-                    }
37
                 }
38
-
39
                 break;
40
 
41
             case 'H':
42
                 height = 0;
43
-                while (!ifs->eof())
44
+                while (ifs->good())
45
                 {
46
                     c = ifs->get();
47
                     if (c == ' ' || c == '\n')
48
-                    {
49
                         break;
50
-                    }
51
                     else
52
-                    {
53
                         height = height * 10 + (c - '0');
54
-                    }
55
                 }
56
-
57
                 break;
58
 
59
             case 'F':
60
                 rateNum = 0;
61
                 rateDenom = 0;
62
-                while (!ifs->eof())
63
+                while (ifs->good())
64
                 {
65
                     c = ifs->get();
66
                     if (c == '.')
67
                     {
68
                         rateDenom = 1;
69
-                        while (!ifs->eof())
70
+                        while (ifs->good())
71
                         {
72
                             c = ifs->get();
73
                             if (c == ' ' || c == '\n')
74
-                            {
75
                                 break;
76
-                            }
77
                             else
78
                             {
79
                                 rateNum = rateNum * 10 + (c - '0');
80
                                 rateDenom = rateDenom * 10;
81
                             }
82
                         }
83
-
84
                         break;
85
                     }
86
                     else if (c == ':')
87
                     {
88
-                        while (!ifs->eof())
89
+                        while (ifs->good())
90
                         {
91
                             c = ifs->get();
92
                             if (c == ' ' || c == '\n')
93
-                            {
94
                                 break;
95
-                            }
96
                             else
97
                                 rateDenom = rateDenom * 10 + (c - '0');
98
                         }
99
-
100
                         break;
101
                     }
102
                     else
103
-                    {
104
                         rateNum = rateNum * 10 + (c - '0');
105
-                    }
106
                 }
107
-
108
                 break;
109
 
110
             case 'A':
111
                 sarWidth = 0;
112
                 sarHeight = 0;
113
-                while (!ifs->eof())
114
+                while (ifs->good())
115
                 {
116
                     c = ifs->get();
117
                     if (c == ':')
118
                     {
119
-                        while (!ifs->eof())
120
+                        while (ifs->good())
121
                         {
122
                             c = ifs->get();
123
                             if (c == ' ' || c == '\n')
124
-                            {
125
                                 break;
126
-                            }
127
                             else
128
                                 sarHeight = sarHeight * 10 + (c - '0');
129
                         }
130
-
131
                         break;
132
                     }
133
                     else
134
-                    {
135
                         sarWidth = sarWidth * 10 + (c - '0');
136
-                    }
137
                 }
138
-
139
                 break;
140
 
141
             case 'C':
142
                 csp = 0;
143
                 d = 0;
144
-                while (!ifs->eof())
145
+                while (ifs->good())
146
                 {
147
                     c = ifs->get();
148
 
149
                     if (c <= '9' && c >= '0')
150
-                    {
151
                         csp = csp * 10 + (c - '0');
152
-                    }
153
                     else if (c == 'p')
154
                     {
155
                         // example: C420p16
156
-                        while (!ifs->eof())
157
+                        while (ifs->good())
158
                         {
159
                             c = ifs->get();
160
 
161
@@ -338,22 +309,19 @@
162
                 break;
163
 
164
             default:
165
-                while (!ifs->eof())
166
+                while (ifs->good())
167
                 {
168
                     // consume this unsupported configuration word
169
                     c = ifs->get();
170
                     if (c == ' ' || c == '\n')
171
                         break;
172
                 }
173
-
174
                 break;
175
             }
176
         }
177
 
178
         if (c == '\n')
179
-        {
180
             break;
181
-        }
182
     }
183
 
184
     if (width < MIN_FRAME_WIDTH || width > MAX_FRAME_WIDTH ||
185
x265_1.5.tar.gz/source/output/y4m.cpp -> x265_1.6.tar.gz/source/output/y4m.cpp Changed
36
 
1
@@ -46,9 +46,7 @@
2
     }
3
 
4
     for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
5
-    {
6
         frameSize += (uint32_t)((width >> x265_cli_csps[colorSpace].width[i]) * (height >> x265_cli_csps[colorSpace].height[i]));
7
-    }
8
 }
9
 
10
 Y4MOutput::~Y4MOutput()
11
@@ -66,14 +64,10 @@
12
 
13
 #if HIGH_BIT_DEPTH
14
     if (pic.bitDepth > 8 && pic.poc == 0)
15
-    {
16
         x265_log(NULL, X265_LOG_WARNING, "y4m: down-shifting reconstructed pixels to 8 bits\n");
17
-    }
18
 #else
19
     if (pic.bitDepth > 8 && pic.poc == 0)
20
-    {
21
         x265_log(NULL, X265_LOG_WARNING, "y4m: forcing reconstructed pixels to 8 bits\n");
22
-    }
23
 #endif
24
 
25
     X265_CHECK(pic.colorSpace == colorSpace, "invalid color space\n");
26
@@ -89,9 +83,7 @@
27
         for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
28
         {
29
             for (int w = 0; w < width >> x265_cli_csps[colorSpace].width[i]; w++)
30
-            {
31
                 buf[w] = (char)(src[w] >> shift);
32
-            }
33
 
34
             ofs.write(buf, width >> x265_cli_csps[colorSpace].width[i]);
35
             src += pic.stride[i] / sizeof(*src);
36
x265_1.5.tar.gz/source/output/yuv.cpp -> x265_1.6.tar.gz/source/output/yuv.cpp Changed
21
 
1
@@ -39,9 +39,7 @@
2
     buf = new char[width];
3
 
4
     for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++)
5
-    {
6
         frameSize += (uint32_t)((width >> x265_cli_csps[colorSpace].width[i]) * (height >> x265_cli_csps[colorSpace].height[i]));
7
-    }
8
 }
9
 
10
 YUVOutput::~YUVOutput()
11
@@ -69,9 +67,7 @@
12
             for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++)
13
             {
14
                 for (int w = 0; w < width >> x265_cli_csps[colorSpace].width[i]; w++)
15
-                {
16
                     buf[w] = (char)(src[w] >> shift);
17
-                }
18
 
19
                 ofs.write(buf, width >> x265_cli_csps[colorSpace].width[i]);
20
                 src += pic.stride[i] / sizeof(*src);
21
x265_1.5.tar.gz/source/profile/cpuEvents.h -> x265_1.6.tar.gz/source/profile/cpuEvents.h Changed
10
 
1
@@ -5,6 +5,7 @@
2
 CPU_EVENT(filterCTURow)
3
 CPU_EVENT(slicetypeDecideEV)
4
 CPU_EVENT(prelookahead)
5
-CPU_EVENT(costEstimateRow)
6
+CPU_EVENT(estCostSingle)
7
+CPU_EVENT(estCostCoop)
8
 CPU_EVENT(pmode)
9
 CPU_EVENT(pme)
10
x265_1.5.tar.gz/source/test/CMakeLists.txt -> x265_1.6.tar.gz/source/test/CMakeLists.txt Changed
8
 
1
@@ -23,3 +23,6 @@
2
     ipfilterharness.cpp ipfilterharness.h
3
     intrapredharness.cpp intrapredharness.h)
4
 target_link_libraries(TestBench x265-static ${PLATFORM_LIBS})
5
+if(LINKER_OPTIONS)
6
+    set_target_properties(TestBench PROPERTIES LINK_FLAGS ${LINKER_OPTIONS})
7
+endif()
8
x265_1.5.tar.gz/source/test/ipfilterharness.cpp -> x265_1.6.tar.gz/source/test/ipfilterharness.cpp Changed
116
 
1
@@ -61,7 +61,7 @@
2
     }
3
 }
4
 
5
-bool IPFilterHarness::check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt, int isChroma, int csp)
6
+bool IPFilterHarness::check_IPFilter_primitive(filter_p2s_wxh_t ref, filter_p2s_wxh_t opt, int isChroma, int csp)
7
 {
8
     intptr_t rand_srcStride;
9
     int min_size = isChroma ? 2 : 4;
10
@@ -512,6 +512,46 @@
11
     return true;
12
 }
13
 
14
+bool IPFilterHarness::check_IPFilterLumaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt)
15
+{
16
+    for (int i = 0; i < ITERS; i++)
17
+    {
18
+        intptr_t rand_srcStride = rand() % 100;
19
+        int index = i % TEST_CASES;
20
+
21
+        ref(pixel_test_buff[index] + i, rand_srcStride, IPF_C_output_s);
22
+
23
+        checked(opt, pixel_test_buff[index] + i, rand_srcStride, IPF_vec_output_s);
24
+
25
+        if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(pixel)))
26
+            return false;
27
+
28
+        reportfail();
29
+    }
30
+
31
+    return true;
32
+}
33
+
34
+bool IPFilterHarness::check_IPFilterChromaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt)
35
+{
36
+    for (int i = 0; i < ITERS; i++)
37
+    {
38
+        intptr_t rand_srcStride = rand() % 100;
39
+        int index = i % TEST_CASES;
40
+
41
+        ref(pixel_test_buff[index] + i, rand_srcStride, IPF_C_output_s);
42
+
43
+        checked(opt, pixel_test_buff[index] + i, rand_srcStride, IPF_vec_output_s);
44
+
45
+        if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(pixel)))
46
+            return false;
47
+
48
+        reportfail();
49
+    }
50
+
51
+    return true;
52
+}
53
+
54
 bool IPFilterHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
55
 {
56
     if (opt.luma_p2s)
57
@@ -582,6 +622,14 @@
58
                 return false;
59
             }
60
         }
61
+        if (opt.pu[value].filter_p2s)
62
+        {
63
+            if (!check_IPFilterLumaP2S_primitive(ref.pu[value].filter_p2s, opt.pu[value].filter_p2s))
64
+            {
65
+                printf("filter_p2s[%s]", lumaPartStr[value]);
66
+                return false;
67
+            }
68
+        }
69
     }
70
 
71
     for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++)
72
@@ -644,6 +692,14 @@
73
                     return false;
74
                 }
75
             }
76
+            if (opt.chroma[csp].pu[value].chroma_p2s)
77
+            {
78
+                if (!check_IPFilterChromaP2S_primitive(ref.chroma[csp].pu[value].chroma_p2s, opt.chroma[csp].pu[value].chroma_p2s))
79
+                {
80
+                    printf("chroma_p2s[%s]", chromaPartStr[csp][value]);
81
+                    return false;
82
+                }
83
+            }
84
         }
85
     }
86
 
87
@@ -720,6 +776,13 @@
88
             REPORT_SPEEDUP(opt.pu[value].luma_hvpp, ref.pu[value].luma_hvpp,
89
                            pixel_buff + 3 * srcStride, srcStride, IPF_vec_output_p, srcStride, 1, 3);
90
         }
91
+
92
+        if (opt.pu[value].filter_p2s)
93
+        {
94
+            printf("filter_p2s [%s]\t", lumaPartStr[value]);
95
+            REPORT_SPEEDUP(opt.pu[value].filter_p2s, ref.pu[value].filter_p2s,
96
+                           pixel_buff, srcStride, IPF_vec_output_s);
97
+        }
98
     }
99
 
100
     for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++)
101
@@ -773,6 +836,14 @@
102
                                short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
103
                                IPF_vec_output_s, dstStride, 1);
104
             }
105
+
106
+            if (opt.chroma[csp].pu[value].chroma_p2s)
107
+            {
108
+                printf("chroma_p2s[%s]\t", chromaPartStr[csp][value]);
109
+                REPORT_SPEEDUP(opt.chroma[csp].pu[value].chroma_p2s, ref.chroma[csp].pu[value].chroma_p2s,
110
+                               pixel_buff, srcStride,
111
+                               IPF_vec_output_s);
112
+            }
113
         }
114
     }
115
 }
116
x265_1.5.tar.gz/source/test/ipfilterharness.h -> x265_1.6.tar.gz/source/test/ipfilterharness.h Changed
19
 
1
@@ -50,7 +50,7 @@
2
     pixel   pixel_test_buff[TEST_CASES][TEST_BUF_SIZE];
3
     int16_t short_test_buff[TEST_CASES][TEST_BUF_SIZE];
4
 
5
-    bool check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt, int isChroma, int csp);
6
+    bool check_IPFilter_primitive(filter_p2s_wxh_t ref, filter_p2s_wxh_t opt, int isChroma, int csp);
7
     bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt);
8
     bool check_IPFilterChroma_ps_primitive(filter_ps_t ref, filter_ps_t opt);
9
     bool check_IPFilterChroma_hps_primitive(filter_hps_t ref, filter_hps_t opt);
10
@@ -62,6 +62,8 @@
11
     bool check_IPFilterLuma_sp_primitive(filter_sp_t ref, filter_sp_t opt);
12
     bool check_IPFilterLuma_ss_primitive(filter_ss_t ref, filter_ss_t opt);
13
     bool check_IPFilterLumaHV_primitive(filter_hv_pp_t ref, filter_hv_pp_t opt);
14
+    bool check_IPFilterLumaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt);
15
+    bool check_IPFilterChromaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt);
16
 
17
 public:
18
 
19
x265_1.5.tar.gz/source/test/mbdstharness.cpp -> x265_1.6.tar.gz/source/test/mbdstharness.cpp Changed
107
 
1
@@ -209,7 +209,7 @@
2
 
3
     for (int i = 0; i < ITERS; i++)
4
     {
5
-        int width = (rand() % 4 + 1) * 4;
6
+        int width = 1 << (rand() % 4 + 2);
7
         int height = width;
8
 
9
         uint32_t optReturnValue = 0;
10
@@ -278,42 +278,19 @@
11
 
12
     return true;
13
 }
14
-
15
 bool MBDstHarness::check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt)
16
 {
17
-    ALIGN_VAR_32(int16_t, qcoeff[32 * 32]);
18
-
19
-    for (int i = 0; i < 4; i++)
20
+    int j = 0;
21
+    for (int i = 0; i < ITERS; i++)
22
     {
23
-        int log2TrSize = i + 2;
24
-        int num = 1 << (log2TrSize * 2);
25
-        int mask = num - 1;
26
-
27
-        for (int n = 0; n <= num; n++)
28
-        {
29
-            memset(qcoeff, 0, num * sizeof(int16_t));
30
-
31
-            for (int j = 0; j < n; j++)
32
-            {
33
-                int k = rand() & mask;
34
-                while (qcoeff[k])
35
-                {
36
-                    k = (k + 11) & mask;
37
-                }
38
-
39
-                qcoeff[k] = (int16_t)rand() - RAND_MAX / 2;
40
-            }
41
-
42
-            int refval = ref(qcoeff, num);
43
-            int optval = (int)checked(opt, qcoeff, num);
44
-
45
-            if (refval != optval)
46
-                return false;
47
-
48
-            reportfail();
49
-        }
50
+        int index = i % TEST_CASES;
51
+        int opt_cnt = (int)checked(opt, short_test_buff[index] + j);
52
+        int ref_cnt = ref(short_test_buff[index] + j);
53
+        if (ref_cnt != opt_cnt)
54
+            return false;
55
+        reportfail();
56
+        j += INCR;
57
     }
58
-
59
     return true;
60
 }
61
 
62
@@ -437,16 +414,17 @@
63
             return false;
64
         }
65
     }
66
-
67
-    if (opt.count_nonzero)
68
+    for (int i = 0; i < NUM_TR_SIZE; i++)
69
     {
70
-        if (!check_count_nonzero_primitive(ref.count_nonzero, opt.count_nonzero))
71
+        if (opt.cu[i].count_nonzero)
72
         {
73
-            printf("count_nonzero: Failed!\n");
74
-            return false;
75
+            if (!check_count_nonzero_primitive(ref.cu[i].count_nonzero, opt.cu[i].count_nonzero))
76
+            {
77
+                printf("count_nonzero[%dx%d] Failed!\n", 4 << i, 4 << i);
78
+                return false;
79
+            }
80
         }
81
     }
82
-
83
     if (opt.dequant_scaling)
84
     {
85
         if (!check_dequant_primitive(ref.dequant_scaling, opt.dequant_scaling))
86
@@ -523,16 +501,14 @@
87
         printf("nquant\t\t");
88
         REPORT_SPEEDUP(opt.nquant, ref.nquant, short_test_buff[0], int_test_buff[1], mshortbuf2, 23, 23785, 32 * 32);
89
     }
90
-
91
-    if (opt.count_nonzero)
92
+    for (int value = 0; value < NUM_TR_SIZE; value++)
93
     {
94
-        for (int i = 4; i <= 32; i <<= 1)
95
+        if (opt.cu[value].count_nonzero)
96
         {
97
-            printf("count_nonzero[%dx%d]", i, i);
98
-            REPORT_SPEEDUP(opt.count_nonzero, ref.count_nonzero, mbuf1, i * i)
99
+            printf("count_nonzero[%dx%d]", 4 << value, 4 << value);
100
+            REPORT_SPEEDUP(opt.cu[value].count_nonzero, ref.cu[value].count_nonzero, mbuf1);
101
         }
102
     }
103
-
104
     if (opt.denoiseDct)
105
     {
106
         printf("denoiseDct\t");
107
x265_1.5.tar.gz/source/test/pixelharness.cpp -> x265_1.6.tar.gz/source/test/pixelharness.cpp Changed
166
 
1
@@ -1149,6 +1149,71 @@
2
     return true;
3
 }
4
 
5
+bool PixelHarness::check_findPosLast(findPosLast_t ref, findPosLast_t opt)
6
+{
7
+    ALIGN_VAR_16(coeff_t, ref_src[32 * 32 + ITERS * 2]);
8
+    uint8_t ref_coeffNum[MLS_GRP_NUM], opt_coeffNum[MLS_GRP_NUM];      // value range[0, 16]
9
+    uint16_t ref_coeffSign[MLS_GRP_NUM], opt_coeffSign[MLS_GRP_NUM];    // bit mask map for non-zero coeff sign
10
+    uint16_t ref_coeffFlag[MLS_GRP_NUM], opt_coeffFlag[MLS_GRP_NUM];    // bit mask map for non-zero coeff
11
+
12
+    int totalCoeffs = 0;
13
+    for (int i = 0; i < 32 * 32; i++)
14
+    {
15
+        ref_src[i] = rand() & SHORT_MAX;
16
+        totalCoeffs += (ref_src[i] != 0);
17
+    }
18
+
19
+    // extra test area all of 0x1234
20
+    for (int i = 0; i < ITERS * 2; i++)
21
+    {
22
+        ref_src[32 * 32 + i] = 0x1234;
23
+    }
24
+    
25
+
26
+    memset(ref_coeffNum, 0xCD, sizeof(ref_coeffNum));
27
+    memset(ref_coeffSign, 0xCD, sizeof(ref_coeffSign));
28
+    memset(ref_coeffFlag, 0xCD, sizeof(ref_coeffFlag));
29
+
30
+    memset(opt_coeffNum, 0xCD, sizeof(opt_coeffNum));
31
+    memset(opt_coeffSign, 0xCD, sizeof(opt_coeffSign));
32
+    memset(opt_coeffFlag, 0xCD, sizeof(opt_coeffFlag));
33
+
34
+    for (int i = 0; i < ITERS; i++)
35
+    {
36
+        int rand_scan_type = rand() % NUM_SCAN_TYPE;
37
+        int rand_scan_size = rand() % NUM_SCAN_SIZE;
38
+        int rand_numCoeff = 0;
39
+
40
+        for (int j = 0; j < 1 << (2 * (rand_scan_size + 2)); j++)
41
+            rand_numCoeff += (ref_src[i + j] != 0);
42
+
43
+        const uint16_t* const scanTbl = g_scanOrder[rand_scan_type][rand_scan_size];
44
+
45
+        int ref_scanPos = ref(scanTbl, ref_src + i, ref_coeffSign, ref_coeffFlag, ref_coeffNum, rand_numCoeff);
46
+        int opt_scanPos = (int)checked(opt, scanTbl, ref_src + i, opt_coeffSign, opt_coeffFlag, opt_coeffNum, rand_numCoeff);
47
+
48
+        if (ref_scanPos != opt_scanPos)
49
+            return false;
50
+
51
+        for (int j = 0; rand_numCoeff; j++)
52
+        {
53
+            if (ref_coeffSign[j] != opt_coeffSign[j])
54
+                return false;
55
+
56
+            if (ref_coeffFlag[j] != opt_coeffFlag[j])
57
+                return false;
58
+
59
+            if (ref_coeffNum[j] != opt_coeffNum[j])
60
+                return false;
61
+
62
+            rand_numCoeff -= ref_coeffNum[j];
63
+        }
64
+
65
+        reportfail();
66
+    }
67
+
68
+    return true;
69
+}
70
 
71
 bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
72
 {
73
@@ -1299,6 +1364,14 @@
74
                 return false;
75
             }
76
         }
77
+        if (opt.chroma[i].pu[part].satd)
78
+        {
79
+            if (!check_pixelcmp(ref.chroma[i].pu[part].satd, opt.chroma[i].pu[part].satd))
80
+            {
81
+                printf("chroma_satd[%s][%s] failed!\n", x265_source_csp_names[i], chromaPartStr[i][part]);
82
+                return false;
83
+            }
84
+        }
85
         if (part < NUM_CU_SIZES)
86
         {
87
             if (opt.chroma[i].cu[part].sub_ps)
88
@@ -1467,7 +1540,7 @@
89
             {
90
                 if (!check_cpy2Dto1D_shl_t(ref.cu[i].cpy2Dto1D_shl, opt.cu[i].cpy2Dto1D_shl))
91
                 {
92
-                    printf("cpy2Dto1D_shl failed!\n");
93
+                    printf("cpy2Dto1D_shl[%dx%d] failed!\n", 4 << i, 4 << i);
94
                     return false;
95
                 }
96
             }
97
@@ -1645,6 +1718,15 @@
98
         }
99
     }
100
 
101
+    if (opt.findPosLast)
102
+    {
103
+        if (!check_findPosLast(ref.findPosLast, opt.findPosLast))
104
+        {
105
+            printf("findPosLast failed!\n");
106
+            return false;
107
+        }
108
+    }
109
+
110
     return true;
111
 }
112
 
113
@@ -1688,7 +1770,7 @@
114
     if (opt.pu[part].copy_pp)
115
     {
116
         HEADER("copy_pp[%s]", lumaPartStr[part]);
117
-        REPORT_SPEEDUP(opt.pu[part].copy_pp, ref.pu[part].copy_pp, pbuf1, 64, pbuf2, 128);
118
+        REPORT_SPEEDUP(opt.pu[part].copy_pp, ref.pu[part].copy_pp, pbuf1, 64, pbuf2, 64);
119
     }
120
 
121
     if (opt.pu[part].addAvg)
122
@@ -1723,7 +1805,7 @@
123
         if (opt.cu[part].copy_ss)
124
         {
125
             HEADER("copy_ss[%s]", lumaPartStr[part]);
126
-            REPORT_SPEEDUP(opt.cu[part].copy_ss, ref.cu[part].copy_ss, sbuf1, 64, sbuf2, 128);
127
+            REPORT_SPEEDUP(opt.cu[part].copy_ss, ref.cu[part].copy_ss, sbuf1, 128, sbuf2, 128);
128
         }
129
         if (opt.cu[part].copy_sp)
130
         {
131
@@ -1733,7 +1815,7 @@
132
         if (opt.cu[part].copy_ps)
133
         {
134
             HEADER("copy_ps[%s]", lumaPartStr[part]);
135
-            REPORT_SPEEDUP(opt.cu[part].copy_ps, ref.cu[part].copy_ps, sbuf1, 64, pbuf1, 128);
136
+            REPORT_SPEEDUP(opt.cu[part].copy_ps, ref.cu[part].copy_ps, sbuf1, 128, pbuf1, 64);
137
         }
138
     }
139
 
140
@@ -1749,6 +1831,11 @@
141
             HEADER("[%s]  addAvg[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
142
             REPORT_SPEEDUP(opt.chroma[i].pu[part].addAvg, ref.chroma[i].pu[part].addAvg, sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE);
143
         }
144
+        if (opt.chroma[i].pu[part].satd)
145
+        {
146
+            HEADER("[%s] satd[%s]", x265_source_csp_names[i], chromaPartStr[i][part]);
147
+            REPORT_SPEEDUP(opt.chroma[i].pu[part].satd, ref.chroma[i].pu[part].satd, pbuf1, STRIDE, fref, STRIDE);
148
+        }
149
         if (part < NUM_CU_SIZES)
150
         {
151
             if (opt.chroma[i].cu[part].copy_ss)
152
@@ -1990,4 +2077,13 @@
153
         HEADER0("propagateCost");
154
         REPORT_SPEEDUP(opt.propagateCost, ref.propagateCost, ibuf1, ushort_test_buff[0], int_test_buff[0], ushort_test_buff[0], int_test_buff[0], double_test_buff[0], 80);
155
     }
156
+
157
+    if (opt.findPosLast)
158
+    {
159
+        HEADER0("findPosLast");
160
+        coeff_t coefBuf[32 * 32];
161
+        memset(coefBuf, 0, sizeof(coefBuf));
162
+        memset(coefBuf + 32 * 31, 1, 32 * sizeof(coeff_t));
163
+        REPORT_SPEEDUP(opt.findPosLast, ref.findPosLast, g_scanOrder[SCAN_DIAG][NUM_SCAN_SIZE - 1], coefBuf, (uint16_t*)sbuf1, (uint16_t*)sbuf2, (uint8_t*)psbuf1, 32);
164
+    }
165
 }
166
x265_1.5.tar.gz/source/test/pixelharness.h -> x265_1.6.tar.gz/source/test/pixelharness.h Changed
9
 
1
@@ -104,6 +104,7 @@
2
     bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
3
     bool check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt);
4
     bool check_calSign(sign_t ref, sign_t opt);
5
+    bool check_findPosLast(findPosLast_t ref, findPosLast_t opt);
6
 
7
 public:
8
 
9
x265_1.6.tar.gz/source/test/rate-control-tests.txt Added
36
 
1
@@ -0,0 +1,34 @@
2
+# List of command lines to be run by rate control regression tests, see https://bitbucket.org/sborho/test-harness
3
+
4
+# This test is listed first since it currently reproduces bugs
5
+big_buck_bunny_360p24.y4m,--preset medium --bitrate 1000 --pass 1 -F4,--preset medium --bitrate 1000 --pass 2 -F4
6
+
7
+# VBV tests, non-deterministic so testing for correctness and bitrate
8
+# fluctuations - up to 1% bitrate fluctuation is allowed between runs
9
+RaceHorses_416x240_30_10bit.yuv,--preset medium --bitrate 700 --vbv-bufsize 900 --vbv-maxrate 700
10
+RaceHorses_416x240_30_10bit.yuv,--preset superfast --bitrate 600 --vbv-bufsize 600 --vbv-maxrate 600
11
+RaceHorses_416x240_30_10bit.yuv,--preset veryslow --bitrate 1100 --vbv-bufsize 1100 --vbv-maxrate 1200
12
+112_1920x1080_25.yuv,--preset medium --bitrate 1000 --vbv-maxrate 1500 --vbv-bufsize 1500 --aud
13
+112_1920x1080_25.yuv,--preset medium --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 15000 --hrd
14
+112_1920x1080_25.yuv,--preset medium --bitrate 4000 --vbv-maxrate 12000 --vbv-bufsize 12000 --repeat-headers
15
+112_1920x1080_25.yuv,--preset superfast --bitrate 1000 --vbv-maxrate 1000 --vbv-bufsize 1500 --hrd --strict-cbr
16
+112_1920x1080_25.yuv,--preset superfast --bitrate 30000 --vbv-maxrate 30000 --vbv-bufsize 30000 --repeat-headers
17
+112_1920x1080_25.yuv,--preset superfast --bitrate 4000 --vbv-maxrate 6000 --vbv-bufsize 6000 --aud
18
+112_1920x1080_25.yuv,--preset veryslow --bitrate 1000 --vbv-maxrate 3000 --vbv-bufsize 3000 --repeat-headers
19
+big_buck_bunny_360p24.y4m,--preset medium --bitrate 1000 --vbv-bufsize 3000 --vbv-maxrate 3000 --repeat-headers
20
+big_buck_bunny_360p24.y4m,--preset medium --bitrate 3000 --vbv-bufsize 3000 --vbv-maxrate 3000 --hrd
21
+big_buck_bunny_360p24.y4m,--preset medium --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 600 --aud
22
+big_buck_bunny_360p24.y4m,--preset medium --crf 1 --vbv-bufsize 3000 --vbv-maxrate 3000 --hrd
23
+big_buck_bunny_360p24.y4m,--preset superfast --bitrate 1000 --vbv-bufsize 1000 --vbv-maxrate 1000 --aud --strict-cbr
24
+big_buck_bunny_360p24.y4m,--preset superfast --bitrate 3000 --vbv-bufsize 9000 --vbv-maxrate 9000 --repeat-headers
25
+big_buck_bunny_360p24.y4m,--preset superfast --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 400 --hrd
26
+big_buck_bunny_360p24.y4m,--preset superfast --crf 6 --vbv-bufsize 1000 --vbv-maxrate 1000 --aud
27
+
28
+# multi-pass rate control tests
29
+big_buck_bunny_360p24.y4m,--preset slow --crf 40 --pass 1,--preset slow --bitrate 200 --pass 2
30
+big_buck_bunny_360p24.y4m,--preset medium --bitrate 700 --pass 1 -F4 --slow-firstpass,--preset medium --bitrate 700 --vbv-bufsize 900 --vbv-maxrate 700 --pass 2 -F4
31
+112_1920x1080_25.yuv,--preset slow --bitrate 1000 --pass 1 -F4,--preset slow --bitrate 1000 --pass 2 -F4
32
+112_1920x1080_25.yuv,--preset superfast --crf 12 --pass 1,--preset superfast --bitrate 4000 --pass 2 -F4
33
+RaceHorses_416x240_30_10bit.yuv,--preset veryslow --crf 40 --pass 1, --preset veryslow --bitrate 200 --pass 2 -F4
34
+RaceHorses_416x240_30_10bit.yuv,--preset superfast --bitrate 600 --pass 1 -F4 --slow-firstpass,--preset superfast --bitrate 600 --pass 2 -F4
35
+RaceHorses_416x240_30_10bit.yuv,--preset medium --crf 26 --pass 1,--preset medium --bitrate 500 --pass 3 -F4,--preset medium --bitrate 500 --pass 2 -F4
36
x265_1.6.tar.gz/source/test/regression-tests.txt Added
129
 
1
@@ -0,0 +1,127 @@
2
+# List of command lines to be run by regression tests, see https://bitbucket.org/sborho/test-harness
3
+
4
+# the vast majority of the commands are tested for results matching the
5
+# most recent commit which was known to change outputs. The output
6
+# bitstream must be bit-exact or the test fails. If no golden outputs
7
+# are available the bitstream is validated (decoded) and then saved as a
8
+# new golden output
9
+
10
+# Note: --nr-intra, --nr-inter, and --bitrate (ABR) give different
11
+# outputs for different frame encoder counts. In order for outputs to be
12
+# consistent across many machines, you must force a certain -FN so it is
13
+# not auto-detected.
14
+
15
+BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190
16
+BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7
17
+BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao
18
+BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3
19
+BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0
20
+BasketballDrive_1920x1080_50.y4m,--preset superfast --psy-rd 1 --ctu 16 --no-wpp
21
+BasketballDrive_1920x1080_50.y4m,--preset ultrafast --signhide --colormatrix bt709
22
+BasketballDrive_1920x1080_50.y4m,--preset veryfast --tune zerolatency --no-temporal-mvp
23
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode
24
+Coastguard-4k.y4m,--preset medium --rdoq-level 1 --tune ssim --no-signhide --me umh
25
+Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1
26
+Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
27
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full
28
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --max-tu-size 4 --min-cu-size 32
29
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset medium --no-wpp --no-cutree --no-strong-intra-smoothing
30
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset slow --no-wpp --tune ssim --transfer smpte240m
31
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset slower --tune ssim --tune fastdecode
32
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
33
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency
34
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain
35
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1
36
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
37
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset ultrafast --weightp --no-wpp --no-open-gop
38
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers
39
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut
40
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16
41
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd
42
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp
43
+DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq
44
+DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0
45
+DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
46
+FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
47
+FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2
48
+Keiba_832x480_30.y4m,--preset medium --pmode --tune grain
49
+Keiba_832x480_30.y4m,--preset slower --fast-intra --nr-inter 500 -F4
50
+Keiba_832x480_30.y4m,--preset superfast --no-fast-intra --nr-intra 1000 -F4
51
+Kimono1_1920x1080_24_10bit_444.yuv,--preset medium --min-cu-size 32
52
+Kimono1_1920x1080_24_10bit_444.yuv,--preset superfast --weightb
53
+KristenAndSara_1280x720_60.y4m,--preset medium --no-cutree --max-tu-size 16
54
+KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8
55
+KristenAndSara_1280x720_60.y4m,--preset superfast --min-cu-size 16
56
+KristenAndSara_1280x720_60.y4m,--preset ultrafast --strong-intra-smoothing
57
+NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain
58
+NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset superfast --tune psnr
59
+News-4k.y4m,--preset medium --tune ssim --no-sao
60
+News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0
61
+OldTownCross_1920x1080_50_10bit_422.yuv,--preset medium --no-weightp
62
+OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune fastdecode
63
+OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
64
+ParkScene_1920x1080_24.y4m,--preset medium --qp 40 --rdpenalty 2 --tu-intra-depth 3
65
+ParkScene_1920x1080_24.y4m,--preset slower --no-weightp
66
+ParkScene_1920x1080_24_10bit_444.yuv,--preset superfast --weightp --lookahead-slices 4
67
+RaceHorses_416x240_30.y4m,--preset medium --tskip-fast --tskip
68
+RaceHorses_416x240_30.y4m,--preset slower --keyint -1 --rdoq-level 0
69
+RaceHorses_416x240_30.y4m,--preset superfast --no-cutree
70
+RaceHorses_416x240_30.y4m,--preset veryslow --tskip-fast --tskip
71
+RaceHorses_416x240_30_10bit.yuv,--preset fast --lookahead-slices 2 --b-intra
72
+RaceHorses_416x240_30_10bit.yuv,--preset faster --rdoq-level 0 --dither
73
+RaceHorses_416x240_30_10bit.yuv,--preset slow --tune grain
74
+RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --tune psnr
75
+RaceHorses_416x240_30_10bit.yuv,--preset veryfast --weightb
76
+RaceHorses_416x240_30_10bit.yuv,--preset placebo
77
+SteamLocomotiveTrain_2560x1600_60_10bit_crop.yuv,--preset medium --dither
78
+big_buck_bunny_360p24.y4m,--preset faster --keyint 240 --min-keyint 60 --rc-lookahead 200
79
+big_buck_bunny_360p24.y4m,--preset medium --keyint 60 --min-keyint 48 --weightb
80
+big_buck_bunny_360p24.y4m,--preset slow --psy-rdoq 2.0 --rdoq-level 1 --no-b-intra
81
+big_buck_bunny_360p24.y4m,--preset superfast --psy-rdoq 2.0
82
+big_buck_bunny_360p24.y4m,--preset ultrafast --deblock=2
83
+big_buck_bunny_360p24.y4m,--preset veryfast --no-deblock
84
+city_4cif_60fps.y4m,--preset medium --crf 4 --cu-lossless --sao-non-deblock
85
+city_4cif_60fps.y4m,--preset superfast --rdpenalty 1 --tu-intra-depth 2
86
+city_4cif_60fps.y4m,--preset slower --scaling-list default
87
+city_4cif_60fps.y4m,--preset veryslow --rdpenalty 2 --sao-non-deblock --no-b-intra
88
+ducks_take_off_420_720p50.y4m,--preset fast --deblock 6 --bframes 16 --rc-lookahead 40
89
+ducks_take_off_420_720p50.y4m,--preset faster --qp 24 --deblock -6
90
+ducks_take_off_420_720p50.y4m,--preset medium --tskip --tskip-fast --constrained-intra
91
+ducks_take_off_420_720p50.y4m,--preset slow --scaling-list default --qp 40
92
+ducks_take_off_420_720p50.y4m,--preset ultrafast --constrained-intra --rd 1
93
+ducks_take_off_420_720p50.y4m,--preset veryslow --constrained-intra --bframes 2
94
+ducks_take_off_444_720p50.y4m,--preset medium --qp 38 --no-scenecut
95
+ducks_take_off_444_720p50.y4m,--preset superfast --weightp --rd 0
96
+ducks_take_off_444_720p50.y4m,--preset slower --psy-rd 1 --psy-rdoq 2.0 --rdoq-level 1
97
+mobile_calendar_422_ntsc.y4m,--preset medium --bitrate 500 -F4
98
+mobile_calendar_422_ntsc.y4m,--preset slower --tskip --tskip-fast
99
+mobile_calendar_422_ntsc.y4m,--preset superfast --weightp --rd 0
100
+mobile_calendar_422_ntsc.y4m,--preset veryslow --tskip
101
+old_town_cross_444_720p50.y4m,--preset faster --rd 1 --tune zero-latency
102
+old_town_cross_444_720p50.y4m,--preset medium --keyint -1 --no-weightp --ref 6
103
+old_town_cross_444_720p50.y4m,--preset slow --rdoq-level 1 --early-skip --ref 7 --no-b-pyramid
104
+old_town_cross_444_720p50.y4m,--preset slower --crf 4 --cu-lossless
105
+old_town_cross_444_720p50.y4m,--preset superfast --weightp --min-cu 16
106
+old_town_cross_444_720p50.y4m,--preset ultrafast --weightp --min-cu 32
107
+old_town_cross_444_720p50.y4m,--preset veryfast --qp 1 --tune ssim
108
+parkrun_ter_720p50.y4m,--preset medium --no-open-gop --sao-non-deblock --crf 4 --cu-lossless
109
+parkrun_ter_720p50.y4m,--preset slower --fast-intra --no-rect --tune grain
110
+silent_cif_420.y4m,--preset medium --me full --rect --amp
111
+silent_cif_420.y4m,--preset superfast --weightp --rect
112
+silent_cif_420.y4m,--preset placebo --ctu 32 --no-sao
113
+vtc1nw_422_ntsc.y4m,--preset medium --scaling-list default --ctu 16 --ref 5
114
+vtc1nw_422_ntsc.y4m,--preset slower --nr-inter 1000 -F4 --tune fast-decode
115
+vtc1nw_422_ntsc.y4m,--preset superfast --weightp --nr-intra 100 -F4
116
+washdc_422_ntsc.y4m,--preset faster --rdoq-level 1 --max-merge 5
117
+washdc_422_ntsc.y4m,--preset medium --no-weightp --max-tu-size 4
118
+washdc_422_ntsc.y4m,--preset slower --psy-rdoq 2.0 --rdoq-level 2
119
+washdc_422_ntsc.y4m,--preset superfast --psy-rd 1 --tune zerolatency
120
+washdc_422_ntsc.y4m,--preset ultrafast --weightp --tu-intra-depth 4
121
+washdc_422_ntsc.y4m,--preset veryfast --tu-inter-depth 4
122
+washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless
123
+
124
+# interlace test, even though input YUV is not field separated
125
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --interlace bff
126
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --interlace tff
127
+
128
+# vim: tw=200
129
x265_1.6.tar.gz/source/test/smoke-tests.txt Added
19
 
1
@@ -0,0 +1,17 @@
2
+# List of command lines to be run by smoke tests, see https://bitbucket.org/sborho/test-harness
3
+
4
+big_buck_bunny_360p24.y4m,--preset=superfast --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 400 --hrd --aud --repeat-headers
5
+big_buck_bunny_360p24.y4m,--preset=medium --bitrate 1000 -F4 --cu-lossless --scaling-list default
6
+big_buck_bunny_360p24.y4m,--preset=slower --no-weightp --cu-stats --pme
7
+washdc_422_ntsc.y4m,--preset=faster --no-strong-intra-smoothing --keyint 1
8
+washdc_422_ntsc.y4m,--preset=medium --qp 40 --nr-inter 400 -F4
9
+washdc_422_ntsc.y4m,--preset=veryslow --pmode --tskip --rdoq-level 0
10
+old_town_cross_444_720p50.y4m,--preset=ultrafast --weightp --keyint -1
11
+old_town_cross_444_720p50.y4m,--preset=fast --keyint 20 --min-cu-size 16
12
+old_town_cross_444_720p50.y4m,--preset=slow --sao-non-deblock --pmode
13
+RaceHorses_416x240_30_10bit.yuv,--preset=veryfast --cu-stats --max-tu-size 8
14
+RaceHorses_416x240_30_10bit.yuv,--preset=slower --bitrate 500 -F4 --rdoq-level 1
15
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset=ultrafast --constrained-intra --min-keyint 5 --keyint 10
16
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset=medium --max-tu-size 16
17
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryfast --min-cu 16
18
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=fast --weightb --interlace bff
19
x265_1.5.tar.gz/source/test/testbench.cpp -> x265_1.6.tar.gz/source/test/testbench.cpp Changed
36
 
1
@@ -174,7 +174,10 @@
2
     for (int i = 0; test_arch[i].flag; i++)
3
     {
4
         if (test_arch[i].flag & cpuid)
5
+        {
6
             printf("Testing primitives: %s\n", test_arch[i].name);
7
+            fflush(stdout);
8
+        }
9
         else
10
             continue;
11
 
12
@@ -188,6 +191,7 @@
13
                 continue;
14
             if (!harness[h]->testCorrectness(cprim, vecprim))
15
             {
16
+                fflush(stdout);
17
                 fprintf(stderr, "\nx265: intrinsic primitive has failed. Go and fix that Right Now!\n");
18
                 return -1;
19
             }
20
@@ -204,6 +208,7 @@
21
                 continue;
22
             if (!harness[h]->testCorrectness(cprim, asmprim))
23
             {
24
+                fflush(stdout);
25
                 fprintf(stderr, "\nx265: asm primitive has failed. Go and fix that Right Now!\n");
26
                 return -1;
27
             }
28
@@ -226,6 +231,7 @@
29
     memcpy(&primitives, &optprim, sizeof(EncoderPrimitives));
30
 
31
     printf("\nTest performance improvement with full optimizations\n");
32
+    fflush(stdout);
33
 
34
     for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
35
     {
36
x265_1.5.tar.gz/source/test/testharness.h -> x265_1.6.tar.gz/source/test/testharness.h Changed
10
 
1
@@ -158,7 +158,7 @@
2
                                     m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \
3
                                     m_rand, m_rand, m_rand, m_rand, m_rand), /* max_args+6 */ \
4
         x265_checkasm_call_float((float(*)())func, &m_ok, 0, 0, 0, 0, __VA_ARGS__))
5
-#define reportfail() if (!m_ok) { fprintf(stderr, "stack clobber check failed at %s:%d", __FILE__, __LINE__); abort(); }
6
+#define reportfail() if (!m_ok) { fflush(stdout); fprintf(stderr, "stack clobber check failed at %s:%d", __FILE__, __LINE__); abort(); }
7
 #elif ARCH_X86
8
 #define checked(func, ...) x265_checkasm_call((intptr_t(*)())func, &m_ok, __VA_ARGS__);
9
 #define checked_float(func, ...) x265_checkasm_call_float((float(*)())func, &m_ok, __VA_ARGS__);
10
x265_1.5.tar.gz/source/x265.cpp -> x265_1.6.tar.gz/source/x265.cpp Changed
29
 
1
@@ -147,6 +147,7 @@
2
 
3
     if (!bProgress || !frameNum || (prevUpdateTime && time - prevUpdateTime < UPDATE_INTERVAL))
4
         return;
5
+
6
     int64_t elapsed = time - startTime;
7
     double fps = elapsed > 0 ? frameNum * 1000000. / elapsed : 0;
8
     float bitrate = 0.008f * totalbytes * (param->fpsNum / param->fpsDenom) / ((float)frameNum);
9
@@ -158,9 +159,8 @@
10
                 eta / 3600, (eta / 60) % 60, eta % 60);
11
     }
12
     else
13
-    {
14
         sprintf(buf, "x265 %d frames: %.2f fps, %.2f kb/s", frameNum, fps, bitrate);
15
-    }
16
+
17
     fprintf(stderr, "%s  \r", buf + 5);
18
     SetConsoleTitle(buf);
19
     fflush(stderr); // needed in windows
20
@@ -530,7 +530,7 @@
21
     while (pic_in && !b_ctrl_c)
22
     {
23
         pic_orig.poc = inFrameCount;
24
-        if (cliopt.qpfile && !param->rc.bStatRead)
25
+        if (cliopt.qpfile)
26
         {
27
             if (!cliopt.parseQPFile(pic_orig))
28
             {
29
x265_1.5.tar.gz/source/x265.def.in -> x265_1.6.tar.gz/source/x265.def.in Changed
13
 
1
@@ -1,6 +1,5 @@
2
 EXPORTS
3
 x265_encoder_open_${X265_BUILD}
4
-x265_setup_primitives
5
 x265_param_default
6
 x265_param_default_preset
7
 x265_param_parse
8
@@ -20,3 +19,4 @@
9
 x265_encoder_log
10
 x265_encoder_close
11
 x265_cleanup
12
+x265_api_get_${X265_BUILD}
13
x265_1.5.tar.gz/source/x265.h -> x265_1.6.tar.gz/source/x265.h Changed
201
 
1
@@ -91,19 +91,31 @@
2
 /* Stores all analysis data for a single frame */
3
 typedef struct x265_analysis_data
4
 {
5
+    void*            interData;
6
+    void*            intraData;
7
     uint32_t         frameRecordSize;
8
-    int32_t          poc;
9
-    int32_t          sliceType;
10
+    uint32_t         poc;
11
+    uint32_t         sliceType;
12
     uint32_t         numCUsInFrame;
13
     uint32_t         numPartitions;
14
-    void*            interData;
15
-    void*            intraData;
16
 } x265_analysis_data;
17
 
18
 /* Used to pass pictures into the encoder, and to get picture data back out of
19
  * the encoder.  The input and output semantics are different */
20
 typedef struct x265_picture
21
 {
22
+    /* presentation time stamp: user-specified, returned on output */
23
+    int64_t pts;
24
+
25
+    /* decode time stamp: ignored on input, copied from reordered pts. Returned
26
+     * on output */
27
+    int64_t dts;
28
+
29
+    /* force quantizer for != X265_QP_AUTO */
30
+    /* The value provided on input is returned with the same picture (POC) on
31
+     * output */
32
+    void*   userData;
33
+
34
     /* Must be specified on input pictures, the number of planes is determined
35
      * by the colorSpace value */
36
     void*   planes[3];
37
@@ -132,18 +144,8 @@
38
      * initialize this value to the internal color space */
39
     int     colorSpace;
40
 
41
-    /* presentation time stamp: user-specified, returned on output */
42
-    int64_t pts;
43
-
44
-    /* display time stamp: ignored on input, copied from reordered pts. Returned
45
-     * on output */
46
-    int64_t dts;
47
-
48
-    /* The value provided on input is returned with the same picture (POC) on
49
-     * output */
50
-    void*   userData;
51
-
52
-    /* force quantizer for != X265_QP_AUTO */
53
+    /* Force the slice base QP for this picture within the encoder. Set to 0
54
+     * to allow the encoder to determine base QP */
55
     int     forceqp;
56
 
57
     /* If param.analysisMode is X265_ANALYSIS_OFF this field is ignored on input
58
@@ -159,8 +161,6 @@
59
      * this data structure */
60
     x265_analysis_data analysisData;
61
 
62
-    /* new data members to this structure must be added to the end so that
63
-     * users of x265_picture_alloc/free() can be assured of future safety */
64
 } x265_picture;
65
 
66
 typedef enum
67
@@ -229,7 +229,11 @@
68
 #define X265_B_ADAPT_FAST       1
69
 #define X265_B_ADAPT_TRELLIS    2
70
 
71
+#define X265_REF_LIMIT_DEPTH    1
72
+#define X265_REF_LIMIT_CU       2
73
+
74
 #define X265_BFRAME_MAX         16
75
+#define X265_MAX_FRAME_THREADS  16
76
 
77
 #define X265_TYPE_AUTO          0x0000  /* Let x265 choose the right type */
78
 #define X265_TYPE_IDR           0x0001
79
@@ -237,13 +241,14 @@
80
 #define X265_TYPE_P             0x0003
81
 #define X265_TYPE_BREF          0x0004  /* Non-disposable B-frame */
82
 #define X265_TYPE_B             0x0005
83
+#define IS_X265_TYPE_I(x) ((x) == X265_TYPE_I || (x) == X265_TYPE_IDR)
84
+#define IS_X265_TYPE_B(x) ((x) == X265_TYPE_B || (x) == X265_TYPE_BREF)
85
+
86
 #define X265_QP_AUTO                 0
87
 
88
 #define X265_AQ_NONE                 0
89
 #define X265_AQ_VARIANCE             1
90
 #define X265_AQ_AUTO_VARIANCE        2
91
-#define IS_X265_TYPE_I(x) ((x) == X265_TYPE_I || (x) == X265_TYPE_IDR)
92
-#define IS_X265_TYPE_B(x) ((x) == X265_TYPE_B || (x) == X265_TYPE_BREF)
93
 
94
 /* NOTE! For this release only X265_CSP_I420 and X265_CSP_I444 are supported */
95
 
96
@@ -308,11 +313,9 @@
97
     double    elapsedEncodeTime;    /* wall time since encoder was opened */
98
     double    elapsedVideoTime;     /* encoded picture count / frame rate */
99
     double    bitrate;              /* accBits / elapsed video time */
100
+    uint64_t  accBits;              /* total bits output thus far */
101
     uint32_t  encodedPictureCount;  /* number of output pictures thus far */
102
     uint32_t  totalWPFrames;        /* number of uni-directional weighted frames used */
103
-    uint64_t  accBits;              /* total bits output thus far */
104
-
105
-    /* new statistic member variables must be added below this line */
106
 } x265_stats;
107
 
108
 /* String values accepted by x265_param_parse() (and CLI) for various parameters */
109
@@ -322,7 +325,8 @@
110
 static const char * const x265_fullrange_names[] = { "limited", "full", 0 };
111
 static const char * const x265_colorprim_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "film", "bt2020", 0 };
112
 static const char * const x265_transfer_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "linear", "log100",
113
-                                                    "log316", "iec61966-2-4", "bt1361e", "iec61966-2-1", "bt2020-10", "bt2020-12", 0 };
114
+                                                    "log316", "iec61966-2-4", "bt1361e", "iec61966-2-1", "bt2020-10", "bt2020-12",
115
+                                                    "smpte-st-2084", "smpte-st-428", 0 };
116
 static const char * const x265_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m",
117
                                                      "YCgCo", "bt2020nc", "bt2020c", 0 };
118
 static const char * const x265_sar_names[] = { "undef", "1:1", "12:11", "10:11", "16:11", "40:33", "24:11", "20:11",
119
@@ -334,9 +338,9 @@
120
  * If zones overlap, whichever comes later in the list takes precedence. */
121
 typedef struct x265_zone
122
 {
123
-    int startFrame, endFrame;   /* range of frame numbers */
124
-    int bForceQp;               /* whether to use qp vs bitrate factor */
125
-    int qp;
126
+    int   startFrame, endFrame; /* range of frame numbers */
127
+    int   bForceQp;             /* whether to use qp vs bitrate factor */
128
+    int   qp;
129
     float bitrateFactor;
130
 } x265_zone;
131
     
132
@@ -348,36 +352,77 @@
133
  * x265_param as an opaque data structure */
134
 typedef struct x265_param
135
 {
136
-    /*== Encoder Environment ==*/
137
-
138
     /* x265_param_default() will auto-detect this cpu capability bitmap.  it is
139
      * recommended to not change this value unless you know the cpu detection is
140
      * somehow flawed on your target hardware. The asm function tables are
141
      * process global, the first encoder configures them for all encoders */
142
     int       cpuid;
143
 
144
+    /*== Parallelism Features ==*/
145
+
146
+    /* Number of concurrently encoded frames between 1 and X265_MAX_FRAME_THREADS
147
+     * or 0 for auto-detection. By default x265 will use a number of frame
148
+     * threads empirically determined to be optimal for your CPU core count,
149
+     * between 2 and 6.  Using more than one frame thread causes motion search
150
+     * in the down direction to be clamped but otherwise encode behavior is
151
+     * unaffected. With CQP rate control the output bitstream is deterministic
152
+     * for all values of frameNumThreads greater than 1. All other forms of
153
+     * rate-control can be negatively impacted by increases to the number of
154
+     * frame threads because the extra concurrency adds uncertainty to the
155
+     * bitrate estimations. Frame parallelism is generally limited by the
155
+     * number of CU rows.
157
+     *
158
+     * When thread pools are used, each frame thread is assigned to a single
159
+     * pool and the frame thread itself is given the node affinity of its pool.
160
+     * But when no thread pools are used no node affinity is assigned. */
161
+    int       frameNumThreads;
162
+
163
+    /* Comma separated list of threads per NUMA node. If "none", then no worker
164
+     * pools are created and only frame parallelism is possible. If NULL or ""
165
+     * (default) x265 will use all available threads on each NUMA node.
166
+     *
167
+     * '+'  is a special value indicating all cores detected on the node
168
+     * '*'  is a special value indicating all cores detected on the node and all
169
+     *      remaining nodes.
170
+     * '-'  is a special value indicating no cores on the node, same as '0'
171
+     *
172
+     * example strings for a 4-node system:
173
+     *   ""        - default, unspecified, all numa nodes are used for thread pools
174
+     *   "*"       - same as default
175
+     *   "none"    - no thread pools are created, only frame parallelism possible
176
+     *   "-"       - same as "none"
177
+     *   "10"      - allocate one pool, using up to 10 cores on node 0
178
+     *   "-,+"     - allocate one pool, using all cores on node 1
179
+     *   "+,-,+"   - allocate two pools, using all cores on nodes 0 and 2
180
+     *   "+,-,+,-" - allocate two pools, using all cores on nodes 0 and 2
181
+     *   "-,*"     - allocate three pools, using all cores on nodes 1, 2 and 3
182
+     *   "8,8,8,8" - allocate four pools with up to 8 threads in each pool
183
+     *
184
+     * The total number of threads will be determined by the number of threads
185
+     * assigned to all nodes. The worker threads will each be given affinity for
186
+     * their node, they will not be allowed to migrate between nodes, but they
187
+     * will be allowed to move between CPU cores within their node.
188
+     *
189
+     * If the three pool features: bEnableWavefront, bDistributeModeAnalysis and
190
+     * bDistributeMotionEstimation are all disabled, then numaPools is ignored
191
+     * and no thread pools are created.
192
+     *
193
+     * If "none" is specified, then all three of the thread pool features are
194
+     * implicitly disabled.
195
+     *
196
+     * Multiple thread pools will be allocated for any NUMA node with more than
197
+     * 64 logical CPU cores. But any given thread pool will always use at most
198
+     * one NUMA node.
199
+     *
200
+     * Frame encoders are distributed between the available thread pools, and
201
x265_1.5.tar.gz/source/x265cli.h -> x265_1.6.tar.gz/source/x265cli.h Changed
111
 
1
@@ -37,7 +37,8 @@
2
     { "version",              no_argument, NULL, 'V' },
3
     { "asm",            required_argument, NULL, 0 },
4
     { "no-asm",               no_argument, NULL, 0 },
5
-    { "threads",        required_argument, NULL, 0 },
6
+    { "pools",          required_argument, NULL, 0 },
7
+    { "numa-pools",     required_argument, NULL, 0 },
8
     { "preset",         required_argument, NULL, 'p' },
9
     { "tune",           required_argument, NULL, 't' },
10
     { "frame-threads",  required_argument, NULL, 'F' },
11
@@ -71,6 +72,8 @@
12
     { "no-wpp",               no_argument, NULL, 0 },
13
     { "wpp",                  no_argument, NULL, 0 },
14
     { "ctu",            required_argument, NULL, 's' },
15
+    { "min-cu-size",    required_argument, NULL, 0 },
16
+    { "max-tu-size",    required_argument, NULL, 0 },
17
     { "tu-intra-depth", required_argument, NULL, 0 },
18
     { "tu-inter-depth", required_argument, NULL, 0 },
19
     { "me",             required_argument, NULL, 0 },
20
@@ -96,6 +99,8 @@
21
     { "no-cu-lossless",       no_argument, NULL, 0 },
22
     { "no-constrained-intra", no_argument, NULL, 0 },
23
     { "constrained-intra",    no_argument, NULL, 0 },
24
+    { "cip",                  no_argument, NULL, 0 },
25
+    { "no-cip",               no_argument, NULL, 0 },
26
     { "fast-intra",           no_argument, NULL, 0 },
27
     { "no-fast-intra",        no_argument, NULL, 0 },
28
     { "no-open-gop",          no_argument, NULL, 0 },
29
@@ -105,6 +110,7 @@
30
     { "scenecut",       required_argument, NULL, 0 },
31
     { "no-scenecut",          no_argument, NULL, 0 },
32
     { "rc-lookahead",   required_argument, NULL, 0 },
33
+    { "lookahead-slices", required_argument, NULL, 0 },
34
     { "bframes",        required_argument, NULL, 'b' },
35
     { "bframe-bias",    required_argument, NULL, 0 },
36
     { "b-adapt",        required_argument, NULL, 0 },
37
@@ -136,6 +142,8 @@
38
     { "cbqpoffs",       required_argument, NULL, 0 },
39
     { "crqpoffs",       required_argument, NULL, 0 },
40
     { "rd",             required_argument, NULL, 0 },
41
+    { "rdoq-level",     required_argument, NULL, 0 },
42
+    { "no-rdoq-level",        no_argument, NULL, 0 },
43
     { "psy-rd",         required_argument, NULL, 0 },
44
     { "psy-rdoq",       required_argument, NULL, 0 },
45
     { "no-psy-rd",            no_argument, NULL, 0 },
46
@@ -195,6 +203,8 @@
47
     { "analysis-mode",  required_argument, NULL, 0 },
48
     { "analysis-file",  required_argument, NULL, 0 },
49
     { "strict-cbr",           no_argument, NULL, 0 },
50
+    { "temporal-layers",      no_argument, NULL, 0 },
51
+    { "no-temporal-layers",   no_argument, NULL, 0 },
52
     { 0, 0, 0, 0 },
53
     { 0, 0, 0, 0 },
54
     { 0, 0, 0, 0 },
55
@@ -246,10 +256,11 @@
56
     H0("   --[no-]psnr                   Enable reporting PSNR metric scores. Default %s\n", OPT(param->bEnablePsnr));
57
     H0("\nProfile, Level, Tier:\n");
58
     H0("   --profile <string>            Enforce an encode profile: main, main10, mainstillpicture\n");
59
-    H0("   --level-idc <integer|float>   Force a minumum required decoder level (as '5.0' or '50')\n");
60
+    H0("   --level-idc <integer|float>   Force a minimum required decoder level (as '5.0' or '50')\n");
61
     H0("   --[no-]high-tier              If a decoder level is specified, this modifier selects High tier of that level\n");
62
     H0("\nThreading, performance:\n");
63
-    H0("   --threads <integer>           Number of threads for thread pool (0: detect CPU core count, default)\n");
64
+    H0("   --pools <integer,...>         Comma separated thread count per thread pool (pool per NUMA node)\n");
65
+    H0("                                 '-' implies no threads on node, '+' implies one thread per core on node\n");
66
     H0("-F/--frame-threads <integer>     Number of concurrently encoded frames. 0: auto-determined by core count\n");
67
     H0("   --[no-]wpp                    Enable Wavefront Parallel Processing. Default %s\n", OPT(param->bEnableWavefront));
68
     H0("   --[no-]pmode                  Parallel mode analysis. Default %s\n", OPT(param->bDistributeModeAnalysis));
69
@@ -262,14 +273,16 @@
70
     H0("                                 psnr, ssim, grain, zerolatency, fastdecode\n");
71
     H0("\nQuad-Tree size and depth:\n");
72
     H0("-s/--ctu <64|32|16>              Maximum CU size (WxH). Default %d\n", param->maxCUSize);
73
+    H0("   --min-cu-size <64|32|16|8>    Minimum CU size (WxH). Default %d\n", param->minCUSize);
74
+    H0("   --max-tu-size <32|16|8|4>     Maximum TU size (WxH). Default %d\n", param->maxTUSize);
75
     H0("   --tu-intra-depth <integer>    Max TU recursive depth for intra CUs. Default %d\n", param->tuQTMaxIntraDepth);
76
     H0("   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
77
     H0("\nAnalysis:\n");
78
-    H0("   --rd <0..6>                   Level of RD in mode decision 0:least....6:full RDO. Default %d\n", param->rdLevel);
79
+    H0("   --rd <0..6>                   Level of RDO in mode decision 0:least....6:full RDO. Default %d\n", param->rdLevel);
80
     H0("   --[no-]psy-rd <0..2.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
81
-    H0("   --[no-]psy-rdoq <0..50.0>     Strength of psycho-visual optimization in quantization, 0 to disable. Default %.1f\n", param->psyRdoq);
82
+    H0("   --[no-]rdoq-level <0|1|2>     Level of RDO in quantization 0:none, 1:levels, 2:levels & coding groups. Default %d\n", param->rdoqLevel);
83
+    H0("   --[no-]psy-rdoq <0..50.0>     Strength of psycho-visual optimization in RDO quantization, 0 to disable. Default %.1f\n", param->psyRdoq);
84
     H0("   --[no-]early-skip             Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
85
-    H1("   --[no-]fast-cbf               Enable early outs based on whether residual is coded. Default %s\n", OPT(param->bEnableCbfFastMode));
86
     H1("   --[no-]tskip-fast             Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
87
     H1("   --nr-intra <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n");
88
     H1("   --nr-inter <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in inter CUs. Default 0\n");
89
@@ -300,6 +313,7 @@
90
     H0("   --no-scenecut                 Disable adaptive I-frame decision\n");
91
     H0("   --scenecut <integer>          How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold);
92
     H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
93
+    H1("   --lookahead-slices <0..16>    Number of slices to use per lookahead cost estimate. Default %d\n", param->lookaheadSlices);
94
     H0("   --bframes <integer>           Maximum number of consecutive b-frames (now it only enables B GOP structure) Default %d\n", param->bframes);
95
     H1("   --bframe-bias <integer>       Bias towards B frame decisions. Default %d\n", param->bFrameBias);
96
     H0("   --b-adapt <0..2>              0 - none, 1 - fast, 2 - full (trellis) adaptive B frame scheduling. Default %d\n", param->bFrameAdaptive);
97
@@ -371,10 +385,11 @@
98
     H1("                                 smpte240m, GBR, YCgCo, bt2020nc, bt2020c. Default undef\n");
99
     H1("   --chromaloc <integer>         Specify chroma sample location (0 to 5). Default of %d\n", param->vui.chromaSampleLocTypeTopField);
100
     H0("\nBitstream options:\n");
101
+    H0("   --[no-]repeat-headers         Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
102
     H0("   --[no-]info                   Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));
103
-    H0("   --[no-]aud                    Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
104
     H0("   --[no-]hrd                    Enable HRD parameters signaling. Default %s\n", OPT(param->bEmitHRDSEI));
105
-    H0("   --[no-]repeat-headers         Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
106
+    H0("   --[no-]temporal-layers        Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
107
+    H0("   --[no-]aud                    Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
108
     H1("   --hash <integer>              Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
109
     H1("\nReconstructed video options (debugging):\n");
110
     H1("-r/--recon <filename>            Reconstructed raw image YUV or Y4M output file name\n");
111