Packman Build Service PMBS

We truncated the diff of some files because they were too big. If you want to see the full diff for every file, click here.

Changes of Revision 12

x265.changes Changed

@@ -1,4 +1,30 @@
 -------------------------------------------------------------------
+Wed Feb  3 13:22:42 UTC 2016 - idonmez@suse.com
+
+- Update to version 1.9
+  API Changes:
+  * x265_frame_stats returns many additional fields: maxCLL, maxFALL,
+    residual energy, scenecut and latency logging
+  * --qpfile now supports frametype 'K'
+  * x265 now allows CRF ratecontrol in pass N (N greater than or equal to 2)
+  * Chroma subsampling format YUV 4:0:0 is now fully supported and tested
+  New Features:
+  * Quant offsets: This feature allows block level quantization offsets
+    to be specified for every frame. An API-only feature.
+  * --intra-refresh: Keyframes can be replaced by a moving column
+    of intra blocks in non-keyframes.
+  * --limit-modes: Intelligently restricts mode analysis.
+  * --max-luma and --min-luma for luma clipping, optional for HDR use-cases
+  * Emergency denoising is now enabled by default in very low bitrate, 
+    VBV encodes
+  Presets and Performance:
+  * Recently added features lookahead-slices, limit-modes, limit-refs
+    have been enabled by default for applicable presets.
+  * The default psy-rd strength has been increased to 2.0
+  * Multi-socket machines now use a single pool of threads that can
+    work cross-socket.
+
+-------------------------------------------------------------------
 Fri Nov 27 18:21:04 UTC 2015 - aloisio@gmx.com
 
 - Update to version 1.8:

​x
 
@@ -1,4 +1,30 @@
 -------------------------------------------------------------------
+Wed Feb  3 13:22:42 UTC 2016 - idonmez@suse.com
+
+- Update to version 1.9
+  API Changes:
+  * x265_frame_stats returns many additional fields: maxCLL, maxFALL,
+    residual energy, scenecut and latency logging
+  * --qpfile now supports frametype 'K'
+  * x265 now allows CRF ratecontrol in pass N (N greater than or equal to 2)
+  * Chroma subsampling format YUV 4:0:0 is now fully supported and tested
+  New Features:
+  * Quant offsets: This feature allows block level quantization offsets
+    to be specified for every frame. An API-only feature.
+  * --intra-refresh: Keyframes can be replaced by a moving column
+    of intra blocks in non-keyframes.
+  * --limit-modes: Intelligently restricts mode analysis.
+  * --max-luma and --min-luma for luma clipping, optional for HDR use-cases
+  * Emergency denoising is now enabled by default in very low bitrate, 
+    VBV encodes
+  Presets and Performance:
+  * Recently added features lookahead-slices, limit-modes, limit-refs
+    have been enabled by default for applicable presets.
+  * The default psy-rd strength has been increased to 2.0
+  * Multi-socket machines now use a single pool of threads that can
+    work cross-socket.
+
+-------------------------------------------------------------------
 Fri Nov 27 18:21:04 UTC 2015 - aloisio@gmx.com
 
 - Update to version 1.8:
​

x265.spec Changed

@@ -1,10 +1,10 @@
 # based on the spec file from https://build.opensuse.org/package/view_file/home:Simmphonie/libx265/
 
 Name:           x265
-%define soname  68
+%define soname  79
 %define libname lib%{name}
 %define libsoname %{libname}-%{soname}
-Version:        1.8
+Version:        1.9
 Release:        0
 License:        GPL-2.0+
 Summary:        A free h265/HEVC encoder - encoder binary
@@ -43,35 +43,34 @@
 streams. 
 
 %prep
-%setup -q -n "%{name}_11047/build/linux"
-cd ../..
+%setup -q -n x265_%{version}
 %patch0 -p1
-cd -
+
 %define FAKE_BUILDDATE %(LC_ALL=C date -u -r %{_sourcedir}/%{name}.changes '+%%b %%e %%Y')
-sed -i -e "s/0.0/%{soname}.0/g" ../../source/cmake/version.cmake
+sed -i -e "s/0.0/%{soname}.0/g" source/cmake/version.cmake
 
 
 %build
-export CXXFLAGS="%optflags"
-export CFLAGS="%optflags"
-cmake  -DCMAKE_INSTALL_PREFIX=/usr -DENABLE_TESTS=ON -G "Unix Makefiles" ../../source
-cmake -DCMAKE_INSTALL_PREFIX=/usr ../../source
-#./make-Makefiles.bash
+export CXXFLAGS="%{optflags}"
+export CFLAGS="%{optflags}"
+
+cd build/linux
+cmake  -DCMAKE_INSTALL_PREFIX=%{_prefix} \
+       -DLIB_INSTALL_DIR=%{_lib} \
+       -DENABLE_TESTS=ON \
+       -G "Unix Makefiles" \
+       ../../source
+
 make %{?_smp_mflags} VERBOSE=1
 
 %install
+cd build/linux
 %makeinstall
-%ifarch x86_64
-  mv "%{buildroot}/usr/lib" "%{buildroot}%{_libdir}"
-%endif
 
 rm -f %{buildroot}%{_libdir}/%{libname}.a
 
 echo "%{libname}-%{soname}" > %{_sourcedir}/baselibs.conf
 
-%clean
-%{?buildroot:%__rm -rf "%{buildroot}"}
-
 %post -n %{libsoname} -p /sbin/ldconfig
 %postun -n %{libsoname} -p /sbin/ldconfig

 
@@ -1,10 +1,10 @@
 # based on the spec file from https://build.opensuse.org/package/view_file/home:Simmphonie/libx265/
 
 Name:           x265
-%define soname  68
+%define soname  79
 %define libname lib%{name}
 %define libsoname %{libname}-%{soname}
-Version:        1.8
+Version:        1.9
 Release:        0
 License:        GPL-2.0+
 Summary:        A free h265/HEVC encoder - encoder binary
@@ -43,35 +43,34 @@
 streams. 
 
 %prep
-%setup -q -n "%{name}_11047/build/linux"
-cd ../..
+%setup -q -n x265_%{version}
 %patch0 -p1
-cd -
+
 %define FAKE_BUILDDATE %(LC_ALL=C date -u -r %{_sourcedir}/%{name}.changes '+%%b %%e %%Y')
-sed -i -e "s/0.0/%{soname}.0/g" ../../source/cmake/version.cmake
+sed -i -e "s/0.0/%{soname}.0/g" source/cmake/version.cmake
 
 
 %build
-export CXXFLAGS="%optflags"
-export CFLAGS="%optflags"
-cmake  -DCMAKE_INSTALL_PREFIX=/usr -DENABLE_TESTS=ON -G "Unix Makefiles" ../../source
-cmake -DCMAKE_INSTALL_PREFIX=/usr ../../source
-#./make-Makefiles.bash
+export CXXFLAGS="%{optflags}"
+export CFLAGS="%{optflags}"
+
+cd build/linux
+cmake  -DCMAKE_INSTALL_PREFIX=%{_prefix} \
+       -DLIB_INSTALL_DIR=%{_lib} \
+       -DENABLE_TESTS=ON \
+       -G "Unix Makefiles" \
+       ../../source
+
 make %{?_smp_mflags} VERBOSE=1
 
 %install
+cd build/linux
 %makeinstall
-%ifarch x86_64
-  mv "%{buildroot}/usr/lib" "%{buildroot}%{_libdir}"
-%endif
 
 rm -f %{buildroot}%{_libdir}/%{libname}.a
 
 echo "%{libname}-%{soname}" > %{_sourcedir}/baselibs.conf
 
-%clean
-%{?buildroot:%__rm -rf "%{buildroot}"}
-
 %post -n %{libsoname} -p /sbin/ldconfig
 %postun -n %{libsoname} -p /sbin/ldconfig
 
​

x265_1.8.tar.gz/.hg_archival.txt -> x265_1.9.tar.gz/.hg_archival.txt Changed

 
@@ -1,5 +1,4 @@
 repo: 09fe40627f03a0f9c3e6ac78b22ac93da23f9fdf
-node: 5dcc9d3a928c400b41a3547d7bfee10340519e56
+node: 1d3b6e448e01ec40b392ef78b7e55a86249fbe68
 branch: stable
-latesttag: 1.8
-latesttagdistance: 1
+tag: 1.9
​

x265_1.8.tar.gz/doc/reST/cli.rst -> x265_1.9.tar.gz/doc/reST/cli.rst Changed

@@ -84,8 +84,8 @@
 	it adds one line per run. If :option:`--csv-log-level` is greater than
 	0, it writes one line per frame. Default none
 
-	When frame level logging is enabled, several frame performance
-	statistics are listed:
+	Several frame performance statistics are available when 
+	:option:`--csv-log-level` is greater than or equal to 2:
 
 	**DecideWait ms** number of milliseconds the frame encoder had to
 	wait, since the previous frame was retrieved by the API thread,
@@ -202,15 +202,29 @@
 	"-"       - same as "none"
 	"10"      - allocate one pool, using up to 10 cores on node 0
 	"-,+"     - allocate one pool, using all cores on node 1
-	"+,-,+"   - allocate two pools, using all cores on nodes 0 and 2
-	"+,-,+,-" - allocate two pools, using all cores on nodes 0 and 2
-	"-,*"     - allocate three pools, using all cores on nodes 1, 2 and 3
+	"+,-,+"   - allocate one pool, using only cores on nodes 0 and 2
+	"+,-,+,-" - allocate one pool, using only cores on nodes 0 and 2
+	"-,*"     - allocate one pool, using all cores on nodes 1, 2 and 3
 	"8,8,8,8" - allocate four pools with up to 8 threads in each pool
-
-	The total number of threads will be determined by the number of threads
-	assigned to all nodes. The worker threads will each be given affinity for
-	their node, they will not be allowed to migrate between nodes, but they
-	will be allowed to move between CPU cores within their node.
+	"8,+,+,+" - allocate two pools, the first with 8 threads on node 0, and the second with all cores on node 1,2,3
+
+	A thread pool dedicated to a given NUMA node is enabled only when the
+	number of threads to be created on that NUMA node is explicitly mentioned
+	in that corresponding position with the --pools option. Else, all threads
+	are spawned from a single pool. The total number of threads will be
+	determined by the number of threads assigned to the enabled NUMA nodes for
+	that pool. The worker threads are given affinity to all the enabled
+	NUMA nodes for that pool and may migrate between them, unless explicitly
+	specified as described above.
+
+	In the case that any threadpool has more than 64 threads, the threadpool
+	may be broken down into multiple pools of 64 threads each; on 32-bit
+	machines, this number is 32. All pools are given affinity to the NUMA
+	nodes on which the original pool had affinity. For performance reasons,
+	the last thread pool is spawned only if it has more than 32 threads for
+	64-bit machines, or 16 for 32-bit machines. If the total number of threads
+	in the system doesn't obey this constraint, we may spawn fewer threads
+	than cores which has been empirically shown to be better for performance. 
 
 	If the four pool features: :option:`--wpp`, :option:`--pmode`,
 	:option:`--pme` and :option:`--lookahead-slices` are all disabled,
@@ -219,10 +233,6 @@
 	If "none" is specified, then all four of the thread pool features are
 	implicitly disabled.
 
-	Multiple thread pools will be allocated for any NUMA node with more than
-	64 logical CPU cores. But any given thread pool will always use at most
-	one NUMA node.
-
 	Frame encoders are distributed between the available thread pools,
 	and the encoder will never generate more thread pools than
 	:option:`--frame-threads`.  The pools are used for WPP and for
@@ -238,8 +248,12 @@
 	system, a POSIX build of libx265 without libnuma will be less work
 	efficient. See :ref:`thread pools <pools>` for more detail.
 
-	Default "", one thread is allocated per detected hardware thread
-	(logical CPU cores) and one thread pool per NUMA node.
+	Default "", one pool is created across all available NUMA nodes, with
+	one thread allocated per detected hardware thread
+	(logical CPU cores). In the case that the total number of threads is more
+	than the maximum size that ATOMIC operations can handle (32 for 32-bit
+	compiles, and 64 for 64-bit compiles), multiple thread pools may be
+	spawned subject to the performance constraint described above.
 
 	Note that the string value will need to be escaped or quoted to
 	protect against shell expansion on many platforms
@@ -353,7 +367,7 @@
 
 	**CLI ONLY**
 
-.. option:: --total-frames <integer>
+.. option:: --frames <integer>
 
 	The number of frames intended to be encoded.  It may be left
 	unspecified, but when it is specified rate control can make use of
@@ -377,15 +391,15 @@
 
 .. option:: --input-csp <integer|string>
 
-	YUV only: Source color space. Only i420, i422, and i444 are
-	supported at this time. The internal color space is always the
-	same as the source color space (libx265 does not support any color
-	space conversions).
+	Chroma Subsampling (YUV only):  Only 4:0:0(monochrome), 4:2:0, 4:2:2, and 4:4:4 are supported at this time. 
+	The chroma subsampling format of your input must match your desired output chroma subsampling format 
+	(libx265 will not perform any chroma subsampling conversion), and it must be supported by the 
+	HEVC profile you have specified.
 
-	0. i400
-	1. i420 **(default)**
-	2. i422
-	3. i444
+	0. i400 (4:0:0 monochrome) - Not supported by Main or Main10 profiles
+	1. i420 (4:2:0 default)    - Supported by all HEVC profiles
+	2. i422 (4:2:2)            - Not supported by Main, Main10 and Main12 profiles
+	3. i444 (4:4:4)            - Supported by Main 4:4:4, Main 4:4:4 10, Main 4:4:4 12, Main 4:4:4 16 Intra profiles
 	4. nv12
 	5. nv16
 
@@ -436,8 +450,8 @@
 	depth of the encoder. If the requested bit depth is not the bit
 	depth of the linked libx265, it will attempt to bind libx265_main
 	for an 8bit encoder, libx265_main10 for a 10bit encoder, or
-	libx265_main12 for a 12bit encoder (EXPERIMENTAL), with the
-	same API version as the linked libx265.
+	libx265_main12 for a 12bit encoder, with the same API version as the
+	linked libx265.
 
 	If the output depth is not specified but :option:`--profile` is
 	specified, the output depth will be derived from the profile name.
@@ -486,13 +500,6 @@
 	The CLI application will derive the output bit depth from the
 	profile name if :option:`--output-depth` is not specified.
 
-.. note::
-
-	All 12bit presets are extremely unstable, do not use them yet.
-	16bit is not supported at all, but those profiles are included
-	because it is possible for libx265 to make bitstreams compatible
-	with them.
-
 .. option:: --level-idc <integer|float>
 
 	Minimum decoder requirement level. Defaults to 0, which implies
@@ -606,7 +613,8 @@
 	+-------+---------------------------------------------------------------+
 	| Level | Description                                                   |
 	+=======+===============================================================+
-	| 0     | sa8d mode and split decisions, intra w/ source pixels         |
+	| 0     | sa8d mode and split decisions, intra w/ source pixels,        |
+	|       | currently not supported                                       |
 	+-------+---------------------------------------------------------------+
 	| 1     | recon generated (better intra), RDO merge/skip selection      |
 	+-------+---------------------------------------------------------------+
@@ -677,7 +685,16 @@
 	(within your decoder level limits) if you enable one or
 	both of these flags.
 
-	This feature is EXPERIMENTAL and functional at all RD levels.
+	Default 3.
+
+.. option:: --limit-modes, --no-limit-modes
+    
+	When enabled, limit-modes will limit modes analyzed for each CU	using cost 
+	metrics from the 4 sub-CUs. When multiple inter modes like :option:`--rect`
+	and/or :option:`--amp` are enabled, this feature will use motion cost 
+	heuristics from the 4 sub-CUs to bypass modes that are unlikely to be the 
+	best choice. This can significantly improve performance when :option:`--rect`
+	and/or :option:`--amp` are enabled at minimal compression efficiency loss.
 
 .. option:: --rect, --no-rect
 
@@ -1049,9 +1066,9 @@
 	energy of the source image in the encoded image at the expense of
 	compression efficiency. It only has effect on presets which use
 	RDO-based mode decisions (:option:`--rd` 3 and above). 1.0 is a
-	typical value. Default 0.3
+	typical value. Default 2.0
 
-	**Range of values:** 0 .. 2.0
+	**Range of values:** 0 .. 5.0
 
 .. option:: --psy-rdoq <float>
 
@@ -1076,7 +1093,8 @@
 
 	Max intra period in frames. A special case of infinite-gop (single
 	keyframe at the beginning of the stream) can be triggered with
-	argument -1. Use 1 to force all-intra. Default 250
+	argument -1. Use 1 to force all-intra. When intra-refresh is enabled
+	it specifies the interval between which refresh sweeps happen. Default 250
 
 .. option:: --min-keyint, -i <integer>
 
@@ -1095,6 +1113,14 @@
 	:option:`--scenecut` 0 or :option:`--no-scenecut` disables adaptive
 	I frame placement. Default 40
 
+.. option:: --intra-refresh
+
+	Enables Periodic Intra Refresh(PIR) instead of keyframe insertion.
+	PIR can replace keyframes by inserting a column of intra blocks in 
+	non-keyframes, that move across the video from one side to the other
+	and thereby refresh the image but over a period of multiple 
+	frames instead of a single keyframe.
+
 .. option:: --rc-lookahead <integer>
 
 	Number of frames for slice-type decision lookahead (a key
@@ -1108,21 +1134,31 @@

 
@@ -84,8 +84,8 @@
    it adds one line per run. If :option:`--csv-log-level` is greater than
    0, it writes one line per frame. Default none
 
-   When frame level logging is enabled, several frame performance
-   statistics are listed:
+   Several frame performance statistics are available when 
+   :option:`--csv-log-level` is greater than or equal to 2:
 
    **DecideWait ms** number of milliseconds the frame encoder had to
    wait, since the previous frame was retrieved by the API thread,
@@ -202,15 +202,29 @@
    "-"       - same as "none"
    "10"      - allocate one pool, using up to 10 cores on node 0
    "-,+"     - allocate one pool, using all cores on node 1
-   "+,-,+"   - allocate two pools, using all cores on nodes 0 and 2
-   "+,-,+,-" - allocate two pools, using all cores on nodes 0 and 2
-   "-,*"     - allocate three pools, using all cores on nodes 1, 2 and 3
+   "+,-,+"   - allocate one pool, using only cores on nodes 0 and 2
+   "+,-,+,-" - allocate one pool, using only cores on nodes 0 and 2
+   "-,*"     - allocate one pool, using all cores on nodes 1, 2 and 3
    "8,8,8,8" - allocate four pools with up to 8 threads in each pool
-
-   The total number of threads will be determined by the number of threads
-   assigned to all nodes. The worker threads will each be given affinity for
-   their node, they will not be allowed to migrate between nodes, but they
-   will be allowed to move between CPU cores within their node.
+   "8,+,+,+" - allocate two pools, the first with 8 threads on node 0, and the second with all cores on node 1,2,3
+
+   A thread pool dedicated to a given NUMA node is enabled only when the
+   number of threads to be created on that NUMA node is explicitly mentioned
+   in that corresponding position with the --pools option. Else, all threads
+   are spawned from a single pool. The total number of threads will be
+   determined by the number of threads assigned to the enabled NUMA nodes for
+   that pool. The worker threads are given affinity to all the enabled
+   NUMA nodes for that pool and may migrate between them, unless explicitly
+   specified as described above.
+
+   In the case that any threadpool has more than 64 threads, the threadpool
+   may be broken down into multiple pools of 64 threads each; on 32-bit
+   machines, this number is 32. All pools are given affinity to the NUMA
+   nodes on which the original pool had affinity. For performance reasons,
+   the last thread pool is spawned only if it has more than 32 threads for
+   64-bit machines, or 16 for 32-bit machines. If the total number of threads
+   in the system doesn't obey this constraint, we may spawn fewer threads
+   than cores which has been empirically shown to be better for performance. 
 
    If the four pool features: :option:`--wpp`, :option:`--pmode`,
    :option:`--pme` and :option:`--lookahead-slices` are all disabled,
@@ -219,10 +233,6 @@
    If "none" is specified, then all four of the thread pool features are
    implicitly disabled.
 
-   Multiple thread pools will be allocated for any NUMA node with more than
-   64 logical CPU cores. But any given thread pool will always use at most
-   one NUMA node.
-
    Frame encoders are distributed between the available thread pools,
    and the encoder will never generate more thread pools than
    :option:`--frame-threads`.  The pools are used for WPP and for
@@ -238,8 +248,12 @@
    system, a POSIX build of libx265 without libnuma will be less work
    efficient. See :ref:`thread pools <pools>` for more detail.
 
-   Default "", one thread is allocated per detected hardware thread
-   (logical CPU cores) and one thread pool per NUMA node.
+   Default "", one pool is created across all available NUMA nodes, with
+   one thread allocated per detected hardware thread
+   (logical CPU cores). In the case that the total number of threads is more
+   than the maximum size that ATOMIC operations can handle (32 for 32-bit
+   compiles, and 64 for 64-bit compiles), multiple thread pools may be
+   spawned subject to the performance constraint described above.
 
    Note that the string value will need to be escaped or quoted to
    protect against shell expansion on many platforms
@@ -353,7 +367,7 @@
 
    **CLI ONLY**
 
-.. option:: --total-frames <integer>
+.. option:: --frames <integer>
 
    The number of frames intended to be encoded.  It may be left
    unspecified, but when it is specified rate control can make use of
@@ -377,15 +391,15 @@
 
 .. option:: --input-csp <integer|string>
 
-   YUV only: Source color space. Only i420, i422, and i444 are
-   supported at this time. The internal color space is always the
-   same as the source color space (libx265 does not support any color
-   space conversions).
+   Chroma Subsampling (YUV only):  Only 4:0:0(monochrome), 4:2:0, 4:2:2, and 4:4:4 are supported at this time. 
+   The chroma subsampling format of your input must match your desired output chroma subsampling format 
+   (libx265 will not perform any chroma subsampling conversion), and it must be supported by the 
+   HEVC profile you have specified.
 
-   0. i400
-   1. i420 **(default)**
-   2. i422
-   3. i444
+   0. i400 (4:0:0 monochrome) - Not supported by Main or Main10 profiles
+   1. i420 (4:2:0 default)    - Supported by all HEVC profiles
+   2. i422 (4:2:2)            - Not supported by Main, Main10 and Main12 profiles
+   3. i444 (4:4:4)            - Supported by Main 4:4:4, Main 4:4:4 10, Main 4:4:4 12, Main 4:4:4 16 Intra profiles
    4. nv12
    5. nv16
 
@@ -436,8 +450,8 @@
    depth of the encoder. If the requested bit depth is not the bit
    depth of the linked libx265, it will attempt to bind libx265_main
    for an 8bit encoder, libx265_main10 for a 10bit encoder, or
-   libx265_main12 for a 12bit encoder (EXPERIMENTAL), with the
-   same API version as the linked libx265.
+   libx265_main12 for a 12bit encoder, with the same API version as the
+   linked libx265.
 
    If the output depth is not specified but :option:`--profile` is
    specified, the output depth will be derived from the profile name.
@@ -486,13 +500,6 @@
    The CLI application will derive the output bit depth from the
    profile name if :option:`--output-depth` is not specified.
 
-.. note::
-
-   All 12bit presets are extremely unstable, do not use them yet.
-   16bit is not supported at all, but those profiles are included
-   because it is possible for libx265 to make bitstreams compatible
-   with them.
-
 .. option:: --level-idc <integer|float>
 
    Minimum decoder requirement level. Defaults to 0, which implies
@@ -606,7 +613,8 @@
    +-------+---------------------------------------------------------------+
    | Level | Description                                                   |
    +=======+===============================================================+
-   | 0     | sa8d mode and split decisions, intra w/ source pixels         |
+   | 0     | sa8d mode and split decisions, intra w/ source pixels,        |
+   |       | currently not supported                                       |
    +-------+---------------------------------------------------------------+
    | 1     | recon generated (better intra), RDO merge/skip selection      |
    +-------+---------------------------------------------------------------+
@@ -677,7 +685,16 @@
    (within your decoder level limits) if you enable one or
    both of these flags.
 
-   This feature is EXPERIMENTAL and functional at all RD levels.
+   Default 3.
+
+.. option:: --limit-modes, --no-limit-modes
+    
+   When enabled, limit-modes will limit modes analyzed for each CU using cost 
+   metrics from the 4 sub-CUs. When multiple inter modes like :option:`--rect`
+   and/or :option:`--amp` are enabled, this feature will use motion cost 
+   heuristics from the 4 sub-CUs to bypass modes that are unlikely to be the 
+   best choice. This can significantly improve performance when :option:`--rect`
+   and/or :option:`--amp` are enabled at minimal compression efficiency loss.
 
 .. option:: --rect, --no-rect
 
@@ -1049,9 +1066,9 @@
    energy of the source image in the encoded image at the expense of
    compression efficiency. It only has effect on presets which use
    RDO-based mode decisions (:option:`--rd` 3 and above). 1.0 is a
-   typical value. Default 0.3
+   typical value. Default 2.0
 
-   **Range of values:** 0 .. 2.0
+   **Range of values:** 0 .. 5.0
 
 .. option:: --psy-rdoq <float>
 
@@ -1076,7 +1093,8 @@
 
    Max intra period in frames. A special case of infinite-gop (single
    keyframe at the beginning of the stream) can be triggered with
-   argument -1. Use 1 to force all-intra. Default 250
+   argument -1. Use 1 to force all-intra. When intra-refresh is enabled
+   it specifies the interval between which refresh sweeps happen. Default 250
 
 .. option:: --min-keyint, -i <integer>
 
@@ -1095,6 +1113,14 @@
    :option:`--scenecut` 0 or :option:`--no-scenecut` disables adaptive
    I frame placement. Default 40
 
+.. option:: --intra-refresh
+
+   Enables Periodic Intra Refresh(PIR) instead of keyframe insertion.
+   PIR can replace keyframes by inserting a column of intra blocks in 
+   non-keyframes, that move across the video from one side to the other
+   and thereby refresh the image but over a period of multiple 
+   frames instead of a single keyframe.
+
 .. option:: --rc-lookahead <integer>
 
    Number of frames for slice-type decision lookahead (a key
@@ -1108,21 +1134,31 @@
 
​

x265_1.8.tar.gz/doc/reST/presets.rst -> x265_1.9.tar.gz/doc/reST/presets.rst Changed

@@ -6,76 +6,83 @@
 Presets
 =======
 
-x265 has a number of predefined :option:`--preset` options that make
-trade-offs between encode speed (encoded frames per second) and
+x265 has ten predefined :option:`--preset` options that optimize the
+trade-off between encoding speed (encoded frames per second) and
 compression efficiency (quality per bit in the bitstream).  The default
-preset is medium, it does a reasonably good job of finding the best
-possible quality without spending enormous CPU cycles looking for the
-absolute most efficient way to achieve that quality.  As you go higher
-than medium, the encoder takes shortcuts to improve performance at the
-expense of quality and compression efficiency.  As you go lower than
-medium, the encoder tries harder and harder to achieve the best quailty
-per bit compression ratio.
-
-The presets adjust encoder parameters to affect these trade-offs.
-
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-|              | ultrafast | superfast | veryfast | faster | fast | medium | slow | slower | veryslow | placebo |
-+==============+===========+===========+==========+========+======+========+======+========+==========+=========+
-| ctu          |   32      |    32     |   32     |  64    |  64  |   64   |  64  |  64    |   64     |   64    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| min-cu-size  |   16      |     8     |    8     |   8    |   8  |    8   |   8  |   8    |    8     |    8    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| bframes      |    3      |     3     |    4     |   4    |  4   |    4   |  4   |   8    |    8     |    8    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| b-adapt      |    0      |     0     |    0     |   0    |  0   |    2   |  2   |   2    |    2     |    2    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| rc-lookahead |    5      |    10     |   15     |  15    |  15  |   20   |  25  |   30   |   40     |   60    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| scenecut     |    0      |    40     |   40     |  40    |  40  |   40   |  40  |   40   |   40     |   40    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| refs         |    1      |     1     |    1     |   1    |  2   |    3   |  3   |   3    |    5     |    5    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| me           |   dia     |   hex     |   hex    |  hex   | hex  |   hex  | star |  star  |   star   |   star  |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| merange      |   57      |    57     |   57     |  57    |  57  |   57   | 57   |  57    |   57     |   92    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| subme        |    0      |     1     |    1     |   2    |  2   |    2   |  3   |   3    |    4     |    5    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| rect         |    0      |     0     |    0     |   0    |  0   |    0   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| amp          |    0      |     0     |    0     |   0    |  0   |    0   |  0   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| max-merge    |    2      |     2     |    2     |   2    |  2   |    2   |  3   |   3    |    4     |    5    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| early-skip   |    1      |     1     |    1     |   1    |  0   |    0   |  0   |   0    |    0     |    0    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| fast-intra   |    1      |     1     |    1     |   1    |  1   |    0   |  0   |   0    |    0     |    0    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| b-intra      |    0      |     0     |    0     |   0    |  0   |    0   |  0   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| sao          |    0      |     0     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| signhide     |    0      |     1     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| weightp      |    0      |     0     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| weightb      |    0      |     0     |    0     |   0    |  0   |    0   |  0   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| aq-mode      |    0      |     0     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| cuTree       |    0      |     0     |    0     |   0    |  1   |    1   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| rdLevel      |    2      |     2     |    2     |   2    |  2   |    3   |  4   |   6    |    6     |    6    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| rdoq-level   |    0      |     0     |    0     |   0    |  0   |    0   |  2   |   2    |    2     |    2    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| tu-intra     |    1      |     1     |    1     |   1    |  1   |    1   |  1   |   2    |    3     |    4    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| tu-inter     |    1      |     1     |    1     |   1    |  1   |    1   |  1   |   2    |    3     |    4    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-
-Placebo mode enables transform-skip prediction evaluation.
+preset is medium.  It does a reasonably good job of finding the best
+possible quality without spending excessive CPU cycles looking for the
+absolute most efficient way to achieve that quality.  When you use 
+faster presets, the encoder takes shortcuts to improve performance at 
+the expense of quality and compression efficiency.  When you use slower
+presets, x265 tests more encoding options, using more computations to  
+achieve the best quality at your selected bit rate (or in the case of
+--crf rate control, the lowest bit rate at the selected quality).
+
+The presets adjust encoder parameters as shown in the following table.
+Any parameter listed below that you specify on the command line
+overrides the value set by the preset.
+
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+|                 |ultrafast |superfast |veryfast |faster |fast |medium |slow |slower |veryslow |placebo |
++=================+==========+==========+=========+=======+=====+=======+=====+=======+=========+========+
+| ctu             |    32    |    32    |   64    |  64   | 64  |  64   | 64  |  64   |   64    |  64    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| min-cu-size     |    16    |     8    |    8    |   8   |  8  |   8   |  8  |   8   |    8    |   8    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| bframes         |     3    |     3    |    4    |   4   |  4  |   4   |  4  |   8   |    8    |   8    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| b-adapt         |     0    |     0    |    0    |   0   |  0  |   2   |  2  |   2   |    2    |   2    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| rc-lookahead    |     5    |    10    |   15    |  15   | 15  |  20   | 25  |  30   |   40    |  60    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| lookahead-slices|     8    |     8    |    8    |   8   |  8  |   8   |  4  |   4   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| scenecut        |     0    |    40    |   40    |  40   | 40  |  40   | 40  |  40   |   40    |  40    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| ref             |     1    |     1    |    2    |   2   |  3  |   3   |  4  |   4   |    5    |   5    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| limit-refs      |     0    |     0    |    3    |   3   |  3  |   3   |  3  |   2   |    1    |   0    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| me              |    dia   |   hex    |   hex   |  hex  |hex  |  hex  |star | star  |   star  |  star  |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| merange         |    57    |    57    |   57    |  57   | 57  |  57   | 57  |  57   |   57    |  92    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| subme           |     0    |     1    |    1    |   2   |  2  |   2   |  3  |   3   |    4    |   5    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| rect            |     0    |     0    |    0    |   0   |  0  |   0   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| amp             |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| limit-modes     |     0    |     0    |    0    |   0   |  0  |   0   |  1  |   1   |    1    |   0    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| max-merge       |     2    |     2    |    2    |   2   |  2  |   2   |  3  |   3   |    4    |   5    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| early-skip      |     1    |     1    |    1    |   1   |  0  |   0   |  0  |   0   |    0    |   0    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| fast-intra      |     1    |     1    |    1    |   1   |  1  |   0   |  0  |   0   |    0    |   0    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| b-intra         |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| sao             |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| signhide        |     0    |     1    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| weightp         |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| weightb         |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| aq-mode         |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| cuTree          |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| rdLevel         |     2    |     2    |    2    |   2   |  2  |   3   |  4  |   6   |    6    |   6    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| rdoq-level      |     0    |     0    |    0    |   0   |  0  |   0   |  2  |   2   |    2    |   2    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| tu-intra        |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   2   |    3    |   4    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| tu-inter        |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   2   |    3    |   4    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
 
 .. _tunings:

 
@@ -6,76 +6,83 @@
 Presets
 =======
 
-x265 has a number of predefined :option:`--preset` options that make
-trade-offs between encode speed (encoded frames per second) and
+x265 has ten predefined :option:`--preset` options that optimize the
+trade-off between encoding speed (encoded frames per second) and
 compression efficiency (quality per bit in the bitstream).  The default
-preset is medium, it does a reasonably good job of finding the best
-possible quality without spending enormous CPU cycles looking for the
-absolute most efficient way to achieve that quality.  As you go higher
-than medium, the encoder takes shortcuts to improve performance at the
-expense of quality and compression efficiency.  As you go lower than
-medium, the encoder tries harder and harder to achieve the best quailty
-per bit compression ratio.
-
-The presets adjust encoder parameters to affect these trade-offs.
-
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-|              | ultrafast | superfast | veryfast | faster | fast | medium | slow | slower | veryslow | placebo |
-+==============+===========+===========+==========+========+======+========+======+========+==========+=========+
-| ctu          |   32      |    32     |   32     |  64    |  64  |   64   |  64  |  64    |   64     |   64    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| min-cu-size  |   16      |     8     |    8     |   8    |   8  |    8   |   8  |   8    |    8     |    8    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| bframes      |    3      |     3     |    4     |   4    |  4   |    4   |  4   |   8    |    8     |    8    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| b-adapt      |    0      |     0     |    0     |   0    |  0   |    2   |  2   |   2    |    2     |    2    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| rc-lookahead |    5      |    10     |   15     |  15    |  15  |   20   |  25  |   30   |   40     |   60    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| scenecut     |    0      |    40     |   40     |  40    |  40  |   40   |  40  |   40   |   40     |   40    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| refs         |    1      |     1     |    1     |   1    |  2   |    3   |  3   |   3    |    5     |    5    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| me           |   dia     |   hex     |   hex    |  hex   | hex  |   hex  | star |  star  |   star   |   star  |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| merange      |   57      |    57     |   57     |  57    |  57  |   57   | 57   |  57    |   57     |   92    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| subme        |    0      |     1     |    1     |   2    |  2   |    2   |  3   |   3    |    4     |    5    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| rect         |    0      |     0     |    0     |   0    |  0   |    0   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| amp          |    0      |     0     |    0     |   0    |  0   |    0   |  0   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| max-merge    |    2      |     2     |    2     |   2    |  2   |    2   |  3   |   3    |    4     |    5    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| early-skip   |    1      |     1     |    1     |   1    |  0   |    0   |  0   |   0    |    0     |    0    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| fast-intra   |    1      |     1     |    1     |   1    |  1   |    0   |  0   |   0    |    0     |    0    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| b-intra      |    0      |     0     |    0     |   0    |  0   |    0   |  0   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| sao          |    0      |     0     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| signhide     |    0      |     1     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| weightp      |    0      |     0     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| weightb      |    0      |     0     |    0     |   0    |  0   |    0   |  0   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| aq-mode      |    0      |     0     |    1     |   1    |  1   |    1   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| cuTree       |    0      |     0     |    0     |   0    |  1   |    1   |  1   |   1    |    1     |    1    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| rdLevel      |    2      |     2     |    2     |   2    |  2   |    3   |  4   |   6    |    6     |    6    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| rdoq-level   |    0      |     0     |    0     |   0    |  0   |    0   |  2   |   2    |    2     |    2    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| tu-intra     |    1      |     1     |    1     |   1    |  1   |    1   |  1   |   2    |    3     |    4    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-| tu-inter     |    1      |     1     |    1     |   1    |  1   |    1   |  1   |   2    |    3     |    4    |
-+--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+
-
-Placebo mode enables transform-skip prediction evaluation.
+preset is medium.  It does a reasonably good job of finding the best
+possible quality without spending excessive CPU cycles looking for the
+absolute most efficient way to achieve that quality.  When you use 
+faster presets, the encoder takes shortcuts to improve performance at 
+the expense of quality and compression efficiency.  When you use slower
+presets, x265 tests more encoding options, using more computations to  
+achieve the best quality at your selected bit rate (or in the case of
+--crf rate control, the lowest bit rate at the selected quality).
+
+The presets adjust encoder parameters as shown in the following table.
+Any parameter listed below that you specify on the command line
+overrides the value set by the preset.
+
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+|                 |ultrafast |superfast |veryfast |faster |fast |medium |slow |slower |veryslow |placebo |
++=================+==========+==========+=========+=======+=====+=======+=====+=======+=========+========+
+| ctu             |    32    |    32    |   64    |  64   | 64  |  64   | 64  |  64   |   64    |  64    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| min-cu-size     |    16    |     8    |    8    |   8   |  8  |   8   |  8  |   8   |    8    |   8    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| bframes         |     3    |     3    |    4    |   4   |  4  |   4   |  4  |   8   |    8    |   8    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| b-adapt         |     0    |     0    |    0    |   0   |  0  |   2   |  2  |   2   |    2    |   2    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| rc-lookahead    |     5    |    10    |   15    |  15   | 15  |  20   | 25  |  30   |   40    |  60    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| lookahead-slices|     8    |     8    |    8    |   8   |  8  |   8   |  4  |   4   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| scenecut        |     0    |    40    |   40    |  40   | 40  |  40   | 40  |  40   |   40    |  40    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| ref             |     1    |     1    |    2    |   2   |  3  |   3   |  4  |   4   |    5    |   5    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| limit-refs      |     0    |     0    |    3    |   3   |  3  |   3   |  3  |   2   |    1    |   0    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| me              |    dia   |   hex    |   hex   |  hex  |hex  |  hex  |star | star  |   star  |  star  |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| merange         |    57    |    57    |   57    |  57   | 57  |  57   | 57  |  57   |   57    |  92    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| subme           |     0    |     1    |    1    |   2   |  2  |   2   |  3  |   3   |    4    |   5    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| rect            |     0    |     0    |    0    |   0   |  0  |   0   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| amp             |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| limit-modes     |     0    |     0    |    0    |   0   |  0  |   0   |  1  |   1   |    1    |   0    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| max-merge       |     2    |     2    |    2    |   2   |  2  |   2   |  3  |   3   |    4    |   5    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| early-skip      |     1    |     1    |    1    |   1   |  0  |   0   |  0  |   0   |    0    |   0    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| fast-intra      |     1    |     1    |    1    |   1   |  1  |   0   |  0  |   0   |    0    |   0    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| b-intra         |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| sao             |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| signhide        |     0    |     1    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| weightp         |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| weightb         |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| aq-mode         |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| cuTree          |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| rdLevel         |     2    |     2    |    2    |   2   |  2  |   3   |  4  |   6   |    6    |   6    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| rdoq-level      |     0    |     0    |    0    |   0   |  0  |   0   |  2  |   2   |    2    |   2    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| tu-intra        |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   2   |    3    |   4    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+| tu-inter        |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   2   |    3    |   4    |
++-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
 
 .. _tunings:
 
​

x265_1.8.tar.gz/source/CMakeLists.txt -> x265_1.9.tar.gz/source/CMakeLists.txt Changed

@@ -30,7 +30,7 @@
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 68)
+set(X265_BUILD 79)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -45,12 +45,14 @@
 set(POWER_ALIASES ppc64 ppc64le)
 list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
 if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
-    message(STATUS "Detected x86 target processor")
     set(X86 1)
     add_definitions(-DX265_ARCH_X86=1)
     if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
         set(X64 1)
         add_definitions(-DX86_64=1)
+        message(STATUS "Detected x86_64 target processor")
+    else()
+        message(STATUS "Detected x86 target processor")
     endif()
 elseif(POWERMATCH GREATER "-1")
     message(STATUS "Detected POWER target processor")
@@ -71,23 +73,27 @@
     if(LIBRT)
         list(APPEND PLATFORM_LIBS rt)
     endif()
+    mark_as_advanced(LIBRT)
     find_library(LIBDL dl)
     if(LIBDL)
         list(APPEND PLATFORM_LIBS dl)
     endif()
-    find_package(Numa)
-    if(NUMA_FOUND)
-        link_directories(${NUMA_LIBRARY_DIR})
-        list(APPEND CMAKE_REQUIRED_LIBRARIES numa)
-        check_symbol_exists(numa_node_of_cpu numa.h NUMA_V2)
-        if(NUMA_V2)
-            add_definitions(-DHAVE_LIBNUMA)
-            message(STATUS "libnuma found, building with support for NUMA nodes")
-            list(APPEND PLATFORM_LIBS numa)
-            include_directories(${NUMA_INCLUDE_DIR})
+    option(ENABLE_LIBNUMA "Enable libnuma usage (Linux only)" ON)
+    if(ENABLE_LIBNUMA)
+        find_package(Numa)
+        if(NUMA_FOUND)
+            link_directories(${NUMA_LIBRARY_DIR})
+            list(APPEND CMAKE_REQUIRED_LIBRARIES numa)
+            check_symbol_exists(numa_node_of_cpu numa.h NUMA_V2)
+            if(NUMA_V2)
+                add_definitions(-DHAVE_LIBNUMA)
+                message(STATUS "libnuma found, building with support for NUMA nodes")
+                list(APPEND PLATFORM_LIBS numa)
+                include_directories(${NUMA_INCLUDE_DIR})
+            endif()
         endif()
-    endif()
-    mark_as_advanced(LIBRT NUMA_FOUND)
+        mark_as_advanced(NUMA_FOUND)
+    endif(ENABLE_LIBNUMA)
     option(NO_ATOMICS "Use a slow mutex to replace atomics" OFF)
     if(NO_ATOMICS)
         add_definitions(-DNO_ATOMICS=1)
@@ -157,6 +163,7 @@
 if(GCC)
     add_definitions(-Wall -Wextra -Wshadow)
     add_definitions(-D__STDC_LIMIT_MACROS=1)
+    add_definitions(-std=gnu++98)
     if(ENABLE_PIC)
          add_definitions(-fPIC)
     endif(ENABLE_PIC)
@@ -379,16 +386,19 @@
 
 option(ENABLE_VTUNE "Enable Vtune profiling instrumentation" OFF)
 if(ENABLE_VTUNE)
-    add_definitions(-DENABLE_VTUNE)
-    include_directories($ENV{VTUNE_AMPLIFIER_XE_2015_DIR}/include)
-    list(APPEND PLATFORM_LIBS vtune)
-    link_directories($ENV{VTUNE_AMPLIFIER_XE_2015_DIR}/lib64)
-    if(WIN32)
-        list(APPEND PLATFORM_LIBS libittnotify.lib)
-    else()
-        list(APPEND PLATFORM_LIBS libittnotify.a dl)
-    endif()
-    add_subdirectory(profile/vtune)
+    find_package(Vtune)
+    if(VTUNE_FOUND)
+        add_definitions(-DENABLE_VTUNE)
+        include_directories(${VTUNE_INCLUDE_DIR})
+        list(APPEND PLATFORM_LIBS vtune)
+        link_directories(${VTUNE_LIBRARY_DIR})
+        if(WIN32)
+            list(APPEND PLATFORM_LIBS libittnotify.lib)
+        else()
+            list(APPEND PLATFORM_LIBS libittnotify.a dl)
+        endif()
+        add_subdirectory(profile/vtune)
+    endif(VTUNE_FOUND)
 endif(ENABLE_VTUNE)
 
 option(DETAILED_CU_STATS "Enable internal profiling of encoder work" OFF)
@@ -455,6 +465,9 @@
 if(ENABLE_SHARED)
     add_library(x265-shared SHARED "${PROJECT_BINARY_DIR}/x265.def" ${YASM_OBJS}
                 ${X265_RC_FILE} $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common>)
+    if(EXTRA_LIB)
+        target_link_libraries(x265-shared ${EXTRA_LIB})
+    endif()
     target_link_libraries(x265-shared ${PLATFORM_LIBS})
     if(MSVC)
         set_target_properties(x265-shared PROPERTIES OUTPUT_NAME libx265)
@@ -465,6 +478,8 @@
         set_target_properties(x265-shared PROPERTIES VERSION ${X265_BUILD})
         if(APPLE)
             set_target_properties(x265-shared PROPERTIES MACOSX_RPATH 1)
+        elseif(CYGWIN)
+            # Cygwin is not officially supported or tested. MinGW with msys is recommended.
         else()
             list(APPEND LINKER_OPTIONS "-Wl,-Bsymbolic,-znoexecstack")
         endif()
@@ -480,9 +495,6 @@
                 ARCHIVE DESTINATION ${LIB_INSTALL_DIR}
                 RUNTIME DESTINATION ${BIN_INSTALL_DIR})
     endif()
-    if(EXTRA_LIB)
-        target_link_libraries(x265-shared ${EXTRA_LIB})
-    endif()
     if(LINKER_OPTIONS)
         # set_target_properties can't do list expansion
         string(REPLACE ";" " " LINKER_OPTION_STR "${LINKER_OPTIONS}")

 
@@ -30,7 +30,7 @@
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 68)
+set(X265_BUILD 79)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -45,12 +45,14 @@
 set(POWER_ALIASES ppc64 ppc64le)
 list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
 if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
-    message(STATUS "Detected x86 target processor")
     set(X86 1)
     add_definitions(-DX265_ARCH_X86=1)
     if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
         set(X64 1)
         add_definitions(-DX86_64=1)
+        message(STATUS "Detected x86_64 target processor")
+    else()
+        message(STATUS "Detected x86 target processor")
     endif()
 elseif(POWERMATCH GREATER "-1")
     message(STATUS "Detected POWER target processor")
@@ -71,23 +73,27 @@
     if(LIBRT)
         list(APPEND PLATFORM_LIBS rt)
     endif()
+    mark_as_advanced(LIBRT)
     find_library(LIBDL dl)
     if(LIBDL)
         list(APPEND PLATFORM_LIBS dl)
     endif()
-    find_package(Numa)
-    if(NUMA_FOUND)
-        link_directories(${NUMA_LIBRARY_DIR})
-        list(APPEND CMAKE_REQUIRED_LIBRARIES numa)
-        check_symbol_exists(numa_node_of_cpu numa.h NUMA_V2)
-        if(NUMA_V2)
-            add_definitions(-DHAVE_LIBNUMA)
-            message(STATUS "libnuma found, building with support for NUMA nodes")
-            list(APPEND PLATFORM_LIBS numa)
-            include_directories(${NUMA_INCLUDE_DIR})
+    option(ENABLE_LIBNUMA "Enable libnuma usage (Linux only)" ON)
+    if(ENABLE_LIBNUMA)
+        find_package(Numa)
+        if(NUMA_FOUND)
+            link_directories(${NUMA_LIBRARY_DIR})
+            list(APPEND CMAKE_REQUIRED_LIBRARIES numa)
+            check_symbol_exists(numa_node_of_cpu numa.h NUMA_V2)
+            if(NUMA_V2)
+                add_definitions(-DHAVE_LIBNUMA)
+                message(STATUS "libnuma found, building with support for NUMA nodes")
+                list(APPEND PLATFORM_LIBS numa)
+                include_directories(${NUMA_INCLUDE_DIR})
+            endif()
         endif()
-    endif()
-    mark_as_advanced(LIBRT NUMA_FOUND)
+        mark_as_advanced(NUMA_FOUND)
+    endif(ENABLE_LIBNUMA)
     option(NO_ATOMICS "Use a slow mutex to replace atomics" OFF)
     if(NO_ATOMICS)
         add_definitions(-DNO_ATOMICS=1)
@@ -157,6 +163,7 @@
 if(GCC)
     add_definitions(-Wall -Wextra -Wshadow)
     add_definitions(-D__STDC_LIMIT_MACROS=1)
+    add_definitions(-std=gnu++98)
     if(ENABLE_PIC)
          add_definitions(-fPIC)
     endif(ENABLE_PIC)
@@ -379,16 +386,19 @@
 
 option(ENABLE_VTUNE "Enable Vtune profiling instrumentation" OFF)
 if(ENABLE_VTUNE)
-    add_definitions(-DENABLE_VTUNE)
-    include_directories($ENV{VTUNE_AMPLIFIER_XE_2015_DIR}/include)
-    list(APPEND PLATFORM_LIBS vtune)
-    link_directories($ENV{VTUNE_AMPLIFIER_XE_2015_DIR}/lib64)
-    if(WIN32)
-        list(APPEND PLATFORM_LIBS libittnotify.lib)
-    else()
-        list(APPEND PLATFORM_LIBS libittnotify.a dl)
-    endif()
-    add_subdirectory(profile/vtune)
+    find_package(Vtune)
+    if(VTUNE_FOUND)
+        add_definitions(-DENABLE_VTUNE)
+        include_directories(${VTUNE_INCLUDE_DIR})
+        list(APPEND PLATFORM_LIBS vtune)
+        link_directories(${VTUNE_LIBRARY_DIR})
+        if(WIN32)
+            list(APPEND PLATFORM_LIBS libittnotify.lib)
+        else()
+            list(APPEND PLATFORM_LIBS libittnotify.a dl)
+        endif()
+        add_subdirectory(profile/vtune)
+    endif(VTUNE_FOUND)
 endif(ENABLE_VTUNE)
 
 option(DETAILED_CU_STATS "Enable internal profiling of encoder work" OFF)
@@ -455,6 +465,9 @@
 if(ENABLE_SHARED)
     add_library(x265-shared SHARED "${PROJECT_BINARY_DIR}/x265.def" ${YASM_OBJS}
                 ${X265_RC_FILE} $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common>)
+    if(EXTRA_LIB)
+        target_link_libraries(x265-shared ${EXTRA_LIB})
+    endif()
     target_link_libraries(x265-shared ${PLATFORM_LIBS})
     if(MSVC)
         set_target_properties(x265-shared PROPERTIES OUTPUT_NAME libx265)
@@ -465,6 +478,8 @@
         set_target_properties(x265-shared PROPERTIES VERSION ${X265_BUILD})
         if(APPLE)
             set_target_properties(x265-shared PROPERTIES MACOSX_RPATH 1)
+        elseif(CYGWIN)
+            # Cygwin is not officially supported or tested. MinGW with msys is recommended.
         else()
             list(APPEND LINKER_OPTIONS "-Wl,-Bsymbolic,-znoexecstack")
         endif()
@@ -480,9 +495,6 @@
                 ARCHIVE DESTINATION ${LIB_INSTALL_DIR}
                 RUNTIME DESTINATION ${BIN_INSTALL_DIR})
     endif()
-    if(EXTRA_LIB)
-        target_link_libraries(x265-shared ${EXTRA_LIB})
-    endif()
     if(LINKER_OPTIONS)
         # set_target_properties can't do list expansion
         string(REPLACE ";" " " LINKER_OPTION_STR "${LINKER_OPTIONS}")
​

x265_1.9.tar.gz/source/cmake/FindVtune.cmake Added

 
@@ -0,0 +1,25 @@
+# Module for locating Vtune
+#
+# Read-only variables
+#   VTUNE_FOUND: Indicates that the library has been found
+#   VTUNE_INCLUDE_DIR: Points to the vtunes include dir
+#   VTUNE_LIBRARY_DIR: Points to the directory with libraries
+#
+# Copyright (c) 2015 Pradeep Ramachandran
+
+include(FindPackageHandleStandardArgs)
+
+find_path(VTUNE_DIR
+    if(UNIX)
+        NAMES amplxe-vars.sh
+    else()
+        NAMES amplxe-vars.bat
+    endif(UNIX)
+    HINTS $ENV{VTUNE_AMPLIFIER_XE_2016_DIR} $ENV{VTUNE_AMPLIFIER_XE_2015_DIR}
+    DOC "Vtune root directory")
+
+set (VTUNE_INCLUDE_DIR ${VTUNE_DIR}/include)
+set (VTUNE_LIBRARY_DIR ${VTUNE_DIR}/lib64)
+
+mark_as_advanced(VTUNE_DIR)
+find_package_handle_standard_args(VTUNE REQUIRED_VARS VTUNE_DIR VTUNE_INCLUDE_DIR VTUNE_LIBRARY_DIR)
​

x265_1.8.tar.gz/source/common/bitstream.cpp -> x265_1.9.tar.gz/source/common/bitstream.cpp Changed

 
@@ -1,5 +1,6 @@
 #include "common.h"
 #include "bitstream.h"
+#include "threading.h"
 
 using namespace X265_NS;
 
@@ -112,16 +113,13 @@
 
 void SyntaxElementWriter::writeUvlc(uint32_t code)
 {
-    uint32_t length = 1;
-    uint32_t temp = ++code;
+    ++code;
 
-    X265_CHECK(temp, "writing -1 code, will cause infinite loop\n");
+    X265_CHECK(code, "writing -1 code, will cause infinite loop\n");
 
-    while (1 != temp)
-    {
-        temp >>= 1;
-        length += 2;
-    }
+    unsigned long idx;
+    CLZ(idx, code);
+    uint32_t length = (uint32_t)idx * 2 + 1;
 
     // Take care of cases where length > 32
     m_bitIf->write(0, length >> 1);
​

x265_1.8.tar.gz/source/common/bitstream.h -> x265_1.9.tar.gz/source/common/bitstream.h Changed

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Author: Steve Borho <steve@borho.org>
+ *         Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
​

x265_1.8.tar.gz/source/common/common.h -> x265_1.9.tar.gz/source/common/common.h Changed

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -134,10 +135,10 @@
 typedef int32_t  ssum2_t; // Signed sum
 #endif // if HIGH_BIT_DEPTH
 
-#if X265_DEPTH <= 10
-typedef uint32_t sse_ret_t;
+#if X265_DEPTH < 10
+typedef uint32_t sse_t;
 #else
-typedef uint64_t sse_ret_t;
+typedef uint64_t sse_t;
 #endif
 
 #ifndef NULL
@@ -214,6 +215,7 @@
 
 #define X265_MALLOC(type, count)    (type*)x265_malloc(sizeof(type) * (count))
 #define X265_FREE(ptr)              x265_free(ptr)
+#define X265_FREE_ZERO(ptr)         x265_free(ptr); (ptr) = NULL
 #define CHECKED_MALLOC(var, type, count) \
     { \
         var = (type*)x265_malloc(sizeof(type) * (count)); \
@@ -317,6 +319,9 @@
 #define CHROMA_V_SHIFT(x) (x == X265_CSP_I420)
 #define X265_MAX_PRED_MODE_PER_CTU 85 * 2 * 8
 
+#define MAX_NUM_TR_COEFFS           MAX_TR_SIZE * MAX_TR_SIZE // Maximum number of transform coefficients, for a 32x32 transform
+#define MAX_NUM_TR_CATEGORIES       16                        // 32, 16, 8, 4 transform categories each for luma and chroma
+
 namespace X265_NS {
 
 enum { SAO_NUM_OFFSET = 4 };
@@ -366,25 +371,6 @@
         delete[] ctuParam[2];
     }
 };
-
-/* Stores inter analysis data for a single frame */
-struct analysis_inter_data
-{
-    int32_t*    ref;
-    uint8_t*    depth;
-    uint8_t*    modes;
-    uint32_t*   bestMergeCand;
-};
-
-/* Stores intra analysis data for a single frame. This struct needs better packing */
-struct analysis_intra_data
-{
-    uint8_t*  depth;
-    uint8_t*  modes;
-    char*     partSizes;
-    uint8_t*  chromaModes;
-};
-
 enum TextType
 {
     TEXT_LUMA     = 0,  // luma
​

x265_1.8.tar.gz/source/common/constants.cpp -> x265_1.9.tar.gz/source/common/constants.cpp Changed

 
@@ -2,6 +2,7 @@
 * Copyright (C) 2015 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
​

x265_1.8.tar.gz/source/common/constants.h -> x265_1.9.tar.gz/source/common/constants.h Changed

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2015 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
​

x265_1.8.tar.gz/source/common/contexts.h -> x265_1.9.tar.gz/source/common/contexts.h Changed

 
@@ -2,6 +2,7 @@
 * Copyright (C) 2015 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
​

x265_1.8.tar.gz/source/common/cudata.cpp -> x265_1.9.tar.gz/source/common/cudata.cpp Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2015 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -192,44 +193,82 @@
         break;
     }
 
-    /* Each CU's data is layed out sequentially within the charMemBlock */
-    uint8_t *charBuf = dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * instance;
-
-    m_qp        = (int8_t*)charBuf; charBuf += m_numPartitions;
-    m_log2CUSize         = charBuf; charBuf += m_numPartitions;
-    m_lumaIntraDir       = charBuf; charBuf += m_numPartitions;
-    m_tqBypass           = charBuf; charBuf += m_numPartitions;
-    m_refIdx[0] = (int8_t*)charBuf; charBuf += m_numPartitions;
-    m_refIdx[1] = (int8_t*)charBuf; charBuf += m_numPartitions;
-    m_cuDepth            = charBuf; charBuf += m_numPartitions;
-    m_predMode           = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
-    m_partSize           = charBuf; charBuf += m_numPartitions;
-    m_mergeFlag          = charBuf; charBuf += m_numPartitions;
-    m_interDir           = charBuf; charBuf += m_numPartitions;
-    m_mvpIdx[0]          = charBuf; charBuf += m_numPartitions;
-    m_mvpIdx[1]          = charBuf; charBuf += m_numPartitions;
-    m_tuDepth            = charBuf; charBuf += m_numPartitions;
-    m_transformSkip[0]   = charBuf; charBuf += m_numPartitions;
-    m_transformSkip[1]   = charBuf; charBuf += m_numPartitions;
-    m_transformSkip[2]   = charBuf; charBuf += m_numPartitions;
-    m_cbf[0]             = charBuf; charBuf += m_numPartitions;
-    m_cbf[1]             = charBuf; charBuf += m_numPartitions;
-    m_cbf[2]             = charBuf; charBuf += m_numPartitions;
-    m_chromaIntraDir     = charBuf; charBuf += m_numPartitions;
-
-    X265_CHECK(charBuf == dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * (instance + 1), "CU data layout is broken\n");
-
-    m_mv[0]  = dataPool.mvMemBlock + (instance * 4) * m_numPartitions;
-    m_mv[1]  = m_mv[0] +  m_numPartitions;
-    m_mvd[0] = m_mv[1] +  m_numPartitions;
-    m_mvd[1] = m_mvd[0] + m_numPartitions;
-
-    uint32_t cuSize = g_maxCUSize >> depth;
-    uint32_t sizeL = cuSize * cuSize;
-    uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
-    m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (sizeL + sizeC * 2);
-    m_trCoeff[1] = m_trCoeff[0] + sizeL;
-    m_trCoeff[2] = m_trCoeff[0] + sizeL + sizeC;
+    if (csp == X265_CSP_I400)
+    {
+        /* Each CU's data is layed out sequentially within the charMemBlock */
+        uint8_t *charBuf = dataPool.charMemBlock + (m_numPartitions * (BytesPerPartition - 4)) * instance;
+
+        m_qp        = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_log2CUSize         = charBuf; charBuf += m_numPartitions;
+        m_lumaIntraDir       = charBuf; charBuf += m_numPartitions;
+        m_tqBypass           = charBuf; charBuf += m_numPartitions;
+        m_refIdx[0] = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_refIdx[1] = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_cuDepth            = charBuf; charBuf += m_numPartitions;
+        m_predMode           = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
+        m_partSize           = charBuf; charBuf += m_numPartitions;
+        m_mergeFlag          = charBuf; charBuf += m_numPartitions;
+        m_interDir           = charBuf; charBuf += m_numPartitions;
+        m_mvpIdx[0]          = charBuf; charBuf += m_numPartitions;
+        m_mvpIdx[1]          = charBuf; charBuf += m_numPartitions;
+        m_tuDepth            = charBuf; charBuf += m_numPartitions;
+        m_transformSkip[0]   = charBuf; charBuf += m_numPartitions;
+        m_cbf[0]             = charBuf; charBuf += m_numPartitions;
+        m_chromaIntraDir     = charBuf; charBuf += m_numPartitions;
+
+        X265_CHECK(charBuf == dataPool.charMemBlock + (m_numPartitions * (BytesPerPartition - 4)) * (instance + 1), "CU data layout is broken\n"); //BytesPerPartition
+
+        m_mv[0]  = dataPool.mvMemBlock + (instance * 4) * m_numPartitions;
+        m_mv[1]  = m_mv[0] +  m_numPartitions;
+        m_mvd[0] = m_mv[1] +  m_numPartitions;
+        m_mvd[1] = m_mvd[0] + m_numPartitions;
+
+        uint32_t cuSize = g_maxCUSize >> depth;
+        m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (cuSize * cuSize);
+        m_trCoeff[1] = m_trCoeff[2] = 0;
+        m_transformSkip[1] = m_transformSkip[2] = m_cbf[1] = m_cbf[2] = 0;
+    }
+    else
+    {
+        /* Each CU's data is layed out sequentially within the charMemBlock */
+        uint8_t *charBuf = dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * instance;
+
+        m_qp        = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_log2CUSize         = charBuf; charBuf += m_numPartitions;
+        m_lumaIntraDir       = charBuf; charBuf += m_numPartitions;
+        m_tqBypass           = charBuf; charBuf += m_numPartitions;
+        m_refIdx[0] = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_refIdx[1] = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_cuDepth            = charBuf; charBuf += m_numPartitions;
+        m_predMode           = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
+        m_partSize           = charBuf; charBuf += m_numPartitions;
+        m_mergeFlag          = charBuf; charBuf += m_numPartitions;
+        m_interDir           = charBuf; charBuf += m_numPartitions;
+        m_mvpIdx[0]          = charBuf; charBuf += m_numPartitions;
+        m_mvpIdx[1]          = charBuf; charBuf += m_numPartitions;
+        m_tuDepth            = charBuf; charBuf += m_numPartitions;
+        m_transformSkip[0]   = charBuf; charBuf += m_numPartitions;
+        m_transformSkip[1]   = charBuf; charBuf += m_numPartitions;
+        m_transformSkip[2]   = charBuf; charBuf += m_numPartitions;
+        m_cbf[0]             = charBuf; charBuf += m_numPartitions;
+        m_cbf[1]             = charBuf; charBuf += m_numPartitions;
+        m_cbf[2]             = charBuf; charBuf += m_numPartitions;
+        m_chromaIntraDir     = charBuf; charBuf += m_numPartitions;
+
+        X265_CHECK(charBuf == dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * (instance + 1), "CU data layout is broken\n");
+
+        m_mv[0]  = dataPool.mvMemBlock + (instance * 4) * m_numPartitions;
+        m_mv[1]  = m_mv[0] +  m_numPartitions;
+        m_mvd[0] = m_mv[1] +  m_numPartitions;
+        m_mvd[1] = m_mvd[0] + m_numPartitions;
+
+        uint32_t cuSize = g_maxCUSize >> depth;
+        uint32_t sizeL = cuSize * cuSize;
+        uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift); // block chroma part
+        m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (sizeL + sizeC * 2);
+        m_trCoeff[1] = m_trCoeff[0] + sizeL;
+        m_trCoeff[2] = m_trCoeff[0] + sizeL + sizeC;
+    }
 }
 
 void CUData::initCTU(const Frame& frame, uint32_t cuAddr, int qp)
@@ -245,7 +284,8 @@
     /* sequential memsets */
     m_partSet((uint8_t*)m_qp, (uint8_t)qp);
     m_partSet(m_log2CUSize,   (uint8_t)g_maxLog2CUSize);
-    m_partSet(m_lumaIntraDir, (uint8_t)DC_IDX);
+    m_partSet(m_lumaIntraDir, (uint8_t)ALL_IDX);
+    m_partSet(m_chromaIntraDir, (uint8_t)ALL_IDX);
     m_partSet(m_tqBypass,     (uint8_t)frame.m_encData->m_param->bLossless);
     if (m_slice->m_sliceType != I_SLICE)
     {
@@ -256,7 +296,7 @@
     X265_CHECK(!(frame.m_encData->m_param->bLossless && !m_slice->m_pps->bTransquantBypassEnabled), "lossless enabled without TQbypass in PPS\n");
 
     /* initialize the remaining CU data in one memset */
-    memset(m_cuDepth, 0, (BytesPerPartition - 6) * m_numPartitions);
+    memset(m_cuDepth, 0, (frame.m_param->internalCsp == X265_CSP_I400 ? BytesPerPartition - 11 : BytesPerPartition - 7) * m_numPartitions);
 
     uint32_t widthInCU = m_slice->m_sps->numCuInWidth;
     m_cuLeft = (m_cuAddr % widthInCU) ? m_encData->getPicCTU(m_cuAddr - 1) : NULL;
@@ -283,14 +323,15 @@
     m_partSet((uint8_t*)m_qp, (uint8_t)qp);
 
     m_partSet(m_log2CUSize,   (uint8_t)cuGeom.log2CUSize);
-    m_partSet(m_lumaIntraDir, (uint8_t)DC_IDX);
+    m_partSet(m_lumaIntraDir, (uint8_t)ALL_IDX);
+    m_partSet(m_chromaIntraDir, (uint8_t)ALL_IDX);
     m_partSet(m_tqBypass,     (uint8_t)m_encData->m_param->bLossless);
     m_partSet((uint8_t*)m_refIdx[0], (uint8_t)REF_NOT_VALID);
     m_partSet((uint8_t*)m_refIdx[1], (uint8_t)REF_NOT_VALID);
     m_partSet(m_cuDepth,      (uint8_t)cuGeom.depth);
 
     /* initialize the remaining CU data in one memset */
-    memset(m_predMode, 0, (BytesPerPartition - 7) * m_numPartitions);
+    memset(m_predMode, 0, (ctu.m_chromaFormat == X265_CSP_I400 ? BytesPerPartition - 12 : BytesPerPartition - 8) * m_numPartitions);
 }
 
 /* Copy the results of a sub-part (split) CU to the parent CU */
@@ -314,13 +355,9 @@
     m_subPartCopy(m_mvpIdx[0] + offset, subCU.m_mvpIdx[0]);
     m_subPartCopy(m_mvpIdx[1] + offset, subCU.m_mvpIdx[1]);
     m_subPartCopy(m_tuDepth + offset, subCU.m_tuDepth);
+
     m_subPartCopy(m_transformSkip[0] + offset, subCU.m_transformSkip[0]);
-    m_subPartCopy(m_transformSkip[1] + offset, subCU.m_transformSkip[1]);
-    m_subPartCopy(m_transformSkip[2] + offset, subCU.m_transformSkip[2]);
     m_subPartCopy(m_cbf[0] + offset, subCU.m_cbf[0]);
-    m_subPartCopy(m_cbf[1] + offset, subCU.m_cbf[1]);
-    m_subPartCopy(m_cbf[2] + offset, subCU.m_cbf[2]);
-    m_subPartCopy(m_chromaIntraDir + offset, subCU.m_chromaIntraDir);
 
     memcpy(m_mv[0] + offset, subCU.m_mv[0], childGeom.numPartitions * sizeof(MV));
     memcpy(m_mv[1] + offset, subCU.m_mv[1], childGeom.numPartitions * sizeof(MV));
@@ -329,12 +366,21 @@
 
     uint32_t tmp = 1 << ((g_maxLog2CUSize - childGeom.depth) * 2);
     uint32_t tmp2 = subPartIdx * tmp;
-    memcpy(m_trCoeff[0] + tmp2, subCU.m_trCoeff[0], sizeof(coeff_t) * tmp);
+    memcpy(m_trCoeff[0] + tmp2, subCU.m_trCoeff[0], sizeof(coeff_t)* tmp);
 
-    uint32_t tmpC = tmp >> (m_hChromaShift + m_vChromaShift);
-    uint32_t tmpC2 = tmp2 >> (m_hChromaShift + m_vChromaShift);
-    memcpy(m_trCoeff[1] + tmpC2, subCU.m_trCoeff[1], sizeof(coeff_t) * tmpC);
-    memcpy(m_trCoeff[2] + tmpC2, subCU.m_trCoeff[2], sizeof(coeff_t) * tmpC);
+    if (subCU.m_chromaFormat != X265_CSP_I400)
+    {
+        m_subPartCopy(m_transformSkip[1] + offset, subCU.m_transformSkip[1]);
+        m_subPartCopy(m_transformSkip[2] + offset, subCU.m_transformSkip[2]);
+        m_subPartCopy(m_cbf[1] + offset, subCU.m_cbf[1]);
+        m_subPartCopy(m_cbf[2] + offset, subCU.m_cbf[2]);
+        m_subPartCopy(m_chromaIntraDir + offset, subCU.m_chromaIntraDir);
+

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2015 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -192,44 +193,82 @@
         break;
     }
 
-    /* Each CU's data is layed out sequentially within the charMemBlock */
-    uint8_t *charBuf = dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * instance;
-
-    m_qp        = (int8_t*)charBuf; charBuf += m_numPartitions;
-    m_log2CUSize         = charBuf; charBuf += m_numPartitions;
-    m_lumaIntraDir       = charBuf; charBuf += m_numPartitions;
-    m_tqBypass           = charBuf; charBuf += m_numPartitions;
-    m_refIdx[0] = (int8_t*)charBuf; charBuf += m_numPartitions;
-    m_refIdx[1] = (int8_t*)charBuf; charBuf += m_numPartitions;
-    m_cuDepth            = charBuf; charBuf += m_numPartitions;
-    m_predMode           = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
-    m_partSize           = charBuf; charBuf += m_numPartitions;
-    m_mergeFlag          = charBuf; charBuf += m_numPartitions;
-    m_interDir           = charBuf; charBuf += m_numPartitions;
-    m_mvpIdx[0]          = charBuf; charBuf += m_numPartitions;
-    m_mvpIdx[1]          = charBuf; charBuf += m_numPartitions;
-    m_tuDepth            = charBuf; charBuf += m_numPartitions;
-    m_transformSkip[0]   = charBuf; charBuf += m_numPartitions;
-    m_transformSkip[1]   = charBuf; charBuf += m_numPartitions;
-    m_transformSkip[2]   = charBuf; charBuf += m_numPartitions;
-    m_cbf[0]             = charBuf; charBuf += m_numPartitions;
-    m_cbf[1]             = charBuf; charBuf += m_numPartitions;
-    m_cbf[2]             = charBuf; charBuf += m_numPartitions;
-    m_chromaIntraDir     = charBuf; charBuf += m_numPartitions;
-
-    X265_CHECK(charBuf == dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * (instance + 1), "CU data layout is broken\n");
-
-    m_mv[0]  = dataPool.mvMemBlock + (instance * 4) * m_numPartitions;
-    m_mv[1]  = m_mv[0] +  m_numPartitions;
-    m_mvd[0] = m_mv[1] +  m_numPartitions;
-    m_mvd[1] = m_mvd[0] + m_numPartitions;
-
-    uint32_t cuSize = g_maxCUSize >> depth;
-    uint32_t sizeL = cuSize * cuSize;
-    uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
-    m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (sizeL + sizeC * 2);
-    m_trCoeff[1] = m_trCoeff[0] + sizeL;
-    m_trCoeff[2] = m_trCoeff[0] + sizeL + sizeC;
+    if (csp == X265_CSP_I400)
+    {
+        /* Each CU's data is layed out sequentially within the charMemBlock */
+        uint8_t *charBuf = dataPool.charMemBlock + (m_numPartitions * (BytesPerPartition - 4)) * instance;
+
+        m_qp        = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_log2CUSize         = charBuf; charBuf += m_numPartitions;
+        m_lumaIntraDir       = charBuf; charBuf += m_numPartitions;
+        m_tqBypass           = charBuf; charBuf += m_numPartitions;
+        m_refIdx[0] = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_refIdx[1] = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_cuDepth            = charBuf; charBuf += m_numPartitions;
+        m_predMode           = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
+        m_partSize           = charBuf; charBuf += m_numPartitions;
+        m_mergeFlag          = charBuf; charBuf += m_numPartitions;
+        m_interDir           = charBuf; charBuf += m_numPartitions;
+        m_mvpIdx[0]          = charBuf; charBuf += m_numPartitions;
+        m_mvpIdx[1]          = charBuf; charBuf += m_numPartitions;
+        m_tuDepth            = charBuf; charBuf += m_numPartitions;
+        m_transformSkip[0]   = charBuf; charBuf += m_numPartitions;
+        m_cbf[0]             = charBuf; charBuf += m_numPartitions;
+        m_chromaIntraDir     = charBuf; charBuf += m_numPartitions;
+
+        X265_CHECK(charBuf == dataPool.charMemBlock + (m_numPartitions * (BytesPerPartition - 4)) * (instance + 1), "CU data layout is broken\n"); //BytesPerPartition
+
+        m_mv[0]  = dataPool.mvMemBlock + (instance * 4) * m_numPartitions;
+        m_mv[1]  = m_mv[0] +  m_numPartitions;
+        m_mvd[0] = m_mv[1] +  m_numPartitions;
+        m_mvd[1] = m_mvd[0] + m_numPartitions;
+
+        uint32_t cuSize = g_maxCUSize >> depth;
+        m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (cuSize * cuSize);
+        m_trCoeff[1] = m_trCoeff[2] = 0;
+        m_transformSkip[1] = m_transformSkip[2] = m_cbf[1] = m_cbf[2] = 0;
+    }
+    else
+    {
+        /* Each CU's data is layed out sequentially within the charMemBlock */
+        uint8_t *charBuf = dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * instance;
+
+        m_qp        = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_log2CUSize         = charBuf; charBuf += m_numPartitions;
+        m_lumaIntraDir       = charBuf; charBuf += m_numPartitions;
+        m_tqBypass           = charBuf; charBuf += m_numPartitions;
+        m_refIdx[0] = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_refIdx[1] = (int8_t*)charBuf; charBuf += m_numPartitions;
+        m_cuDepth            = charBuf; charBuf += m_numPartitions;
+        m_predMode           = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
+        m_partSize           = charBuf; charBuf += m_numPartitions;
+        m_mergeFlag          = charBuf; charBuf += m_numPartitions;
+        m_interDir           = charBuf; charBuf += m_numPartitions;
+        m_mvpIdx[0]          = charBuf; charBuf += m_numPartitions;
+        m_mvpIdx[1]          = charBuf; charBuf += m_numPartitions;
+        m_tuDepth            = charBuf; charBuf += m_numPartitions;
+        m_transformSkip[0]   = charBuf; charBuf += m_numPartitions;
+        m_transformSkip[1]   = charBuf; charBuf += m_numPartitions;
+        m_transformSkip[2]   = charBuf; charBuf += m_numPartitions;
+        m_cbf[0]             = charBuf; charBuf += m_numPartitions;
+        m_cbf[1]             = charBuf; charBuf += m_numPartitions;
+        m_cbf[2]             = charBuf; charBuf += m_numPartitions;
+        m_chromaIntraDir     = charBuf; charBuf += m_numPartitions;
+
+        X265_CHECK(charBuf == dataPool.charMemBlock + (m_numPartitions * BytesPerPartition) * (instance + 1), "CU data layout is broken\n");
+
+        m_mv[0]  = dataPool.mvMemBlock + (instance * 4) * m_numPartitions;
+        m_mv[1]  = m_mv[0] +  m_numPartitions;
+        m_mvd[0] = m_mv[1] +  m_numPartitions;
+        m_mvd[1] = m_mvd[0] + m_numPartitions;
+
+        uint32_t cuSize = g_maxCUSize >> depth;
+        uint32_t sizeL = cuSize * cuSize;
+        uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift); // block chroma part
+        m_trCoeff[0] = dataPool.trCoeffMemBlock + instance * (sizeL + sizeC * 2);
+        m_trCoeff[1] = m_trCoeff[0] + sizeL;
+        m_trCoeff[2] = m_trCoeff[0] + sizeL + sizeC;
+    }
 }
 
 void CUData::initCTU(const Frame& frame, uint32_t cuAddr, int qp)
@@ -245,7 +284,8 @@
     /* sequential memsets */
     m_partSet((uint8_t*)m_qp, (uint8_t)qp);
     m_partSet(m_log2CUSize,   (uint8_t)g_maxLog2CUSize);
-    m_partSet(m_lumaIntraDir, (uint8_t)DC_IDX);
+    m_partSet(m_lumaIntraDir, (uint8_t)ALL_IDX);
+    m_partSet(m_chromaIntraDir, (uint8_t)ALL_IDX);
     m_partSet(m_tqBypass,     (uint8_t)frame.m_encData->m_param->bLossless);
     if (m_slice->m_sliceType != I_SLICE)
     {
@@ -256,7 +296,7 @@
     X265_CHECK(!(frame.m_encData->m_param->bLossless && !m_slice->m_pps->bTransquantBypassEnabled), "lossless enabled without TQbypass in PPS\n");
 
     /* initialize the remaining CU data in one memset */
-    memset(m_cuDepth, 0, (BytesPerPartition - 6) * m_numPartitions);
+    memset(m_cuDepth, 0, (frame.m_param->internalCsp == X265_CSP_I400 ? BytesPerPartition - 11 : BytesPerPartition - 7) * m_numPartitions);
 
     uint32_t widthInCU = m_slice->m_sps->numCuInWidth;
     m_cuLeft = (m_cuAddr % widthInCU) ? m_encData->getPicCTU(m_cuAddr - 1) : NULL;
@@ -283,14 +323,15 @@
     m_partSet((uint8_t*)m_qp, (uint8_t)qp);
 
     m_partSet(m_log2CUSize,   (uint8_t)cuGeom.log2CUSize);
-    m_partSet(m_lumaIntraDir, (uint8_t)DC_IDX);
+    m_partSet(m_lumaIntraDir, (uint8_t)ALL_IDX);
+    m_partSet(m_chromaIntraDir, (uint8_t)ALL_IDX);
     m_partSet(m_tqBypass,     (uint8_t)m_encData->m_param->bLossless);
     m_partSet((uint8_t*)m_refIdx[0], (uint8_t)REF_NOT_VALID);
     m_partSet((uint8_t*)m_refIdx[1], (uint8_t)REF_NOT_VALID);
     m_partSet(m_cuDepth,      (uint8_t)cuGeom.depth);
 
     /* initialize the remaining CU data in one memset */
-    memset(m_predMode, 0, (BytesPerPartition - 7) * m_numPartitions);
+    memset(m_predMode, 0, (ctu.m_chromaFormat == X265_CSP_I400 ? BytesPerPartition - 12 : BytesPerPartition - 8) * m_numPartitions);
 }
 
 /* Copy the results of a sub-part (split) CU to the parent CU */
@@ -314,13 +355,9 @@
     m_subPartCopy(m_mvpIdx[0] + offset, subCU.m_mvpIdx[0]);
     m_subPartCopy(m_mvpIdx[1] + offset, subCU.m_mvpIdx[1]);
     m_subPartCopy(m_tuDepth + offset, subCU.m_tuDepth);
+
     m_subPartCopy(m_transformSkip[0] + offset, subCU.m_transformSkip[0]);
-    m_subPartCopy(m_transformSkip[1] + offset, subCU.m_transformSkip[1]);
-    m_subPartCopy(m_transformSkip[2] + offset, subCU.m_transformSkip[2]);
     m_subPartCopy(m_cbf[0] + offset, subCU.m_cbf[0]);
-    m_subPartCopy(m_cbf[1] + offset, subCU.m_cbf[1]);
-    m_subPartCopy(m_cbf[2] + offset, subCU.m_cbf[2]);
-    m_subPartCopy(m_chromaIntraDir + offset, subCU.m_chromaIntraDir);
 
     memcpy(m_mv[0] + offset, subCU.m_mv[0], childGeom.numPartitions * sizeof(MV));
     memcpy(m_mv[1] + offset, subCU.m_mv[1], childGeom.numPartitions * sizeof(MV));
@@ -329,12 +366,21 @@
 
     uint32_t tmp = 1 << ((g_maxLog2CUSize - childGeom.depth) * 2);
     uint32_t tmp2 = subPartIdx * tmp;
-    memcpy(m_trCoeff[0] + tmp2, subCU.m_trCoeff[0], sizeof(coeff_t) * tmp);
+    memcpy(m_trCoeff[0] + tmp2, subCU.m_trCoeff[0], sizeof(coeff_t)* tmp);
 
-    uint32_t tmpC = tmp >> (m_hChromaShift + m_vChromaShift);
-    uint32_t tmpC2 = tmp2 >> (m_hChromaShift + m_vChromaShift);
-    memcpy(m_trCoeff[1] + tmpC2, subCU.m_trCoeff[1], sizeof(coeff_t) * tmpC);
-    memcpy(m_trCoeff[2] + tmpC2, subCU.m_trCoeff[2], sizeof(coeff_t) * tmpC);
+    if (subCU.m_chromaFormat != X265_CSP_I400)
+    {
+        m_subPartCopy(m_transformSkip[1] + offset, subCU.m_transformSkip[1]);
+        m_subPartCopy(m_transformSkip[2] + offset, subCU.m_transformSkip[2]);
+        m_subPartCopy(m_cbf[1] + offset, subCU.m_cbf[1]);
+        m_subPartCopy(m_cbf[2] + offset, subCU.m_cbf[2]);
+        m_subPartCopy(m_chromaIntraDir + offset, subCU.m_chromaIntraDir);
+
​

x265_1.8.tar.gz/source/common/cudata.h -> x265_1.9.tar.gz/source/common/cudata.h Changed

@@ -222,12 +222,12 @@
     void     copyToPic(uint32_t depth) const;
 
     /* RD-0 methods called only from encodeResidue */
-    void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom);
+    void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp);
     void     updatePic(uint32_t depth) const;
 
     void     setPartSizeSubParts(PartSize size)    { m_partSet(m_partSize, (uint8_t)size); }
     void     setPredModeSubParts(PredMode mode)    { m_partSet(m_predMode, (uint8_t)mode); }
-    void     clearCbf()                            { m_partSet(m_cbf[0], 0); m_partSet(m_cbf[1], 0); m_partSet(m_cbf[2], 0); }
+    void     clearCbf()                            { m_partSet(m_cbf[0], 0); if (m_chromaFormat != X265_CSP_I400) { m_partSet(m_cbf[1], 0); m_partSet(m_cbf[2], 0);} }
 
     /* these functions all take depth as an absolute depth from CTU, it is used to calculate the number of parts to copy */
     void     setQPSubParts(int8_t qp, uint32_t absPartIdx, uint32_t depth)                    { s_partSet[depth]((uint8_t*)m_qp + absPartIdx, (uint8_t)qp); }
@@ -246,7 +246,7 @@
     void     setPURefIdx(int list, int8_t refIdx, int absPartIdx, int puIdx);
 
     uint8_t  getCbf(uint32_t absPartIdx, TextType ttype, uint32_t tuDepth) const { return (m_cbf[ttype][absPartIdx] >> tuDepth) & 0x1; }
-    uint8_t  getQtRootCbf(uint32_t absPartIdx) const                             { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx]; }
+    uint8_t  getQtRootCbf(uint32_t absPartIdx) const                             { if (m_chromaFormat == X265_CSP_I400) return m_cbf[0][absPartIdx] || false; else { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx];} }
     int8_t   getRefQP(uint32_t currAbsIdxInCTU) const;
     uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*candMvField)[2], uint8_t* candDir) const;
     void     clipMv(MV& outMV) const;
@@ -323,7 +323,6 @@
     const uint16_t *scan;
     const uint16_t *scanCG;
     ScanType        scanType;
-    uint32_t        log2TrSizeCG;
     uint32_t        firstSignificanceMapContext;
 };
 
@@ -340,8 +339,15 @@
         uint32_t numPartition = NUM_4x4_PARTITIONS >> (depth * 2);
         uint32_t cuSize = g_maxCUSize >> depth;
         uint32_t sizeL = cuSize * cuSize;
-        uint32_t sizeC = sizeL >> (CHROMA_H_SHIFT(csp) + CHROMA_V_SHIFT(csp));
-        CHECKED_MALLOC(trCoeffMemBlock, coeff_t, (sizeL + sizeC * 2) * numInstances);
+        if (csp == X265_CSP_I400)
+        {
+            CHECKED_MALLOC(trCoeffMemBlock, coeff_t, (sizeL) * numInstances);
+        }
+        else
+        {            
+            uint32_t sizeC = sizeL >> (CHROMA_H_SHIFT(csp) + CHROMA_V_SHIFT(csp));
+            CHECKED_MALLOC(trCoeffMemBlock, coeff_t, (sizeL + sizeC * 2) * numInstances);
+        }
         CHECKED_MALLOC(charMemBlock, uint8_t, numPartition * numInstances * CUData::BytesPerPartition);
         CHECKED_MALLOC(mvMemBlock, MV, numPartition * 4 * numInstances);
         return true;

 
@@ -222,12 +222,12 @@
     void     copyToPic(uint32_t depth) const;
 
     /* RD-0 methods called only from encodeResidue */
-    void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom);
+    void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp);
     void     updatePic(uint32_t depth) const;
 
     void     setPartSizeSubParts(PartSize size)    { m_partSet(m_partSize, (uint8_t)size); }
     void     setPredModeSubParts(PredMode mode)    { m_partSet(m_predMode, (uint8_t)mode); }
-    void     clearCbf()                            { m_partSet(m_cbf[0], 0); m_partSet(m_cbf[1], 0); m_partSet(m_cbf[2], 0); }
+    void     clearCbf()                            { m_partSet(m_cbf[0], 0); if (m_chromaFormat != X265_CSP_I400) { m_partSet(m_cbf[1], 0); m_partSet(m_cbf[2], 0);} }
 
     /* these functions all take depth as an absolute depth from CTU, it is used to calculate the number of parts to copy */
     void     setQPSubParts(int8_t qp, uint32_t absPartIdx, uint32_t depth)                    { s_partSet[depth]((uint8_t*)m_qp + absPartIdx, (uint8_t)qp); }
@@ -246,7 +246,7 @@
     void     setPURefIdx(int list, int8_t refIdx, int absPartIdx, int puIdx);
 
     uint8_t  getCbf(uint32_t absPartIdx, TextType ttype, uint32_t tuDepth) const { return (m_cbf[ttype][absPartIdx] >> tuDepth) & 0x1; }
-    uint8_t  getQtRootCbf(uint32_t absPartIdx) const                             { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx]; }
+    uint8_t  getQtRootCbf(uint32_t absPartIdx) const                             { if (m_chromaFormat == X265_CSP_I400) return m_cbf[0][absPartIdx] || false; else { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx];} }
     int8_t   getRefQP(uint32_t currAbsIdxInCTU) const;
     uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*candMvField)[2], uint8_t* candDir) const;
     void     clipMv(MV& outMV) const;
@@ -323,7 +323,6 @@
     const uint16_t *scan;
     const uint16_t *scanCG;
     ScanType        scanType;
-    uint32_t        log2TrSizeCG;
     uint32_t        firstSignificanceMapContext;
 };
 
@@ -340,8 +339,15 @@
         uint32_t numPartition = NUM_4x4_PARTITIONS >> (depth * 2);
         uint32_t cuSize = g_maxCUSize >> depth;
         uint32_t sizeL = cuSize * cuSize;
-        uint32_t sizeC = sizeL >> (CHROMA_H_SHIFT(csp) + CHROMA_V_SHIFT(csp));
-        CHECKED_MALLOC(trCoeffMemBlock, coeff_t, (sizeL + sizeC * 2) * numInstances);
+        if (csp == X265_CSP_I400)
+        {
+            CHECKED_MALLOC(trCoeffMemBlock, coeff_t, (sizeL) * numInstances);
+        }
+        else
+        {            
+            uint32_t sizeC = sizeL >> (CHROMA_H_SHIFT(csp) + CHROMA_V_SHIFT(csp));
+            CHECKED_MALLOC(trCoeffMemBlock, coeff_t, (sizeL + sizeC * 2) * numInstances);
+        }
         CHECKED_MALLOC(charMemBlock, uint8_t, numPartition * numInstances * CUData::BytesPerPartition);
         CHECKED_MALLOC(mvMemBlock, MV, numPartition * 4 * numInstances);
         return true;
​

x265_1.8.tar.gz/source/common/dct.cpp -> x265_1.9.tar.gz/source/common/dct.cpp Changed

@@ -703,7 +703,10 @@
         if (level)
             ++numSig;
         level *= sign;
-        qCoef[blockpos] = (int16_t)x265_clip3(-32768, 32767, level);
+
+        // TODO: limiting the range to [-32767, 32767] would give more performance, but it changes the output.
+        //       nquant is only a small percentage of rdoQuant, so the old dynamic range is kept for compatibility.
+        qCoef[blockpos] = (int16_t)abs(x265_clip3(-32768, 32767, level));
     }
 
     return numSig;
@@ -784,11 +787,12 @@
     return scanPosLast - 1;
 }
 
+// NOTE: lastNZPosInCG and absSumSign are undefined when the input block is all zeros
 static uint32_t findPosFirstLast_c(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
 {
     int n;
 
-    for (n = SCAN_SET_SIZE - 1; n >= 0; --n)
+    for (n = SCAN_SET_SIZE - 1; n >= 0; n--)
     {
         const uint32_t idx = scanTbl[n];
         const uint32_t idxY = idx / MLS_CG_SIZE;
@@ -812,8 +816,17 @@
 
     uint32_t firstNZPosInCG = (uint32_t)n;
 
+    uint32_t absSumSign = 0;
+    for (n = firstNZPosInCG; n <= (int)lastNZPosInCG; n++)
+    {
+        const uint32_t idx = scanTbl[n];
+        const uint32_t idxY = idx / MLS_CG_SIZE;
+        const uint32_t idxX = idx % MLS_CG_SIZE;
+        absSumSign += dstCoeff[idxY * trSize + idxX];
+    }
+
     // NOTE: when coeff block all ZERO, the lastNZPosInCG is undefined and firstNZPosInCG is 16
-    return ((lastNZPosInCG << 16) | firstNZPosInCG);
+    return ((absSumSign << 31) | (lastNZPosInCG << 8) | firstNZPosInCG);
 }

 
@@ -703,7 +703,10 @@
         if (level)
             ++numSig;
         level *= sign;
-        qCoef[blockpos] = (int16_t)x265_clip3(-32768, 32767, level);
+
+        // TODO: limiting the range to [-32767, 32767] would give more performance, but it changes the output.
+        //       nquant is only a small percentage of rdoQuant, so the old dynamic range is kept for compatibility.
+        qCoef[blockpos] = (int16_t)abs(x265_clip3(-32768, 32767, level));
     }
 
     return numSig;
@@ -784,11 +787,12 @@
     return scanPosLast - 1;
 }
 
+// NOTE: lastNZPosInCG and absSumSign are undefined when the input block is all zeros
 static uint32_t findPosFirstLast_c(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
 {
     int n;
 
-    for (n = SCAN_SET_SIZE - 1; n >= 0; --n)
+    for (n = SCAN_SET_SIZE - 1; n >= 0; n--)
     {
         const uint32_t idx = scanTbl[n];
         const uint32_t idxY = idx / MLS_CG_SIZE;
@@ -812,8 +816,17 @@
 
     uint32_t firstNZPosInCG = (uint32_t)n;
 
+    uint32_t absSumSign = 0;
+    for (n = firstNZPosInCG; n <= (int)lastNZPosInCG; n++)
+    {
+        const uint32_t idx = scanTbl[n];
+        const uint32_t idxY = idx / MLS_CG_SIZE;
+        const uint32_t idxX = idx % MLS_CG_SIZE;
+        absSumSign += dstCoeff[idxY * trSize + idxX];
+    }
+
     // NOTE: when coeff block all ZERO, the lastNZPosInCG is undefined and firstNZPosInCG is 16
-    return ((lastNZPosInCG << 16) | firstNZPosInCG);
+    return ((absSumSign << 31) | (lastNZPosInCG << 8) | firstNZPosInCG);
 }
 
 
​

x265_1.8.tar.gz/source/common/deblock.cpp -> x265_1.9.tar.gz/source/common/deblock.cpp Changed

@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Author: Gopu Govindaswamy <gopu@multicorewareinc.com>
+*         Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -108,7 +109,7 @@
     for (uint32_t e = 0; e < numUnits; e += partIdxIncr)
     {
         edgeFilterLuma(cu, absPartIdx, depth, dir, e, blockStrength);
-        if (!((e0 + e) & chromaMask))
+        if (!((e0 + e) & chromaMask) && cu->m_chromaFormat != X265_CSP_I400)
             edgeFilterChroma(cu, absPartIdx, depth, dir, e, blockStrength);
     }
 }
@@ -209,8 +210,8 @@
     const Slice* const sliceQ = cuQ->m_slice;
     const Slice* const sliceP = cuP->m_slice;
 
-    const Frame* refP0 = sliceP->getRefPic(0, cuP->m_refIdx[0][partP]);
-    const Frame* refQ0 = sliceQ->getRefPic(0, cuQ->m_refIdx[0][partQ]);
+    const Frame* refP0 = sliceP->m_refFrameList[0][cuP->m_refIdx[0][partP]];
+    const Frame* refQ0 = sliceQ->m_refFrameList[0][cuQ->m_refIdx[0][partQ]];
     const MV& mvP0 = refP0 ? cuP->m_mv[0][partP] : zeroMv;
     const MV& mvQ0 = refQ0 ? cuQ->m_mv[0][partQ] : zeroMv;
 
@@ -221,8 +222,8 @@
     }
 
     // (sliceQ->isInterB() || sliceP->isInterB())
-    const Frame* refP1 = sliceP->getRefPic(1, cuP->m_refIdx[1][partP]);
-    const Frame* refQ1 = sliceQ->getRefPic(1, cuQ->m_refIdx[1][partQ]);
+    const Frame* refP1 = sliceP->m_refFrameList[1][cuP->m_refIdx[1][partP]];
+    const Frame* refQ1 = sliceQ->m_refFrameList[1][cuQ->m_refIdx[1][partQ]];
     const MV& mvP1 = refP1 ? cuP->m_mv[1][partP] : zeroMv;
     const MV& mvQ1 = refQ1 ? cuQ->m_mv[1][partQ] : zeroMv;
 
@@ -279,31 +280,6 @@
  * \param maskQ   indicator to enable filtering on partQ
  * \param maskP1  decision weak filter/no filter for partP
  * \param maskQ1  decision weak filter/no filter for partQ */
-static inline void pelFilterLumaStrong(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
-{
-    int32_t tc2 = 2 * tc;
-    int32_t tcP = (tc2 & maskP);
-    int32_t tcQ = (tc2 & maskQ);
-    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
-    {
-        int16_t m4  = (int16_t)src[0];
-        int16_t m3  = (int16_t)src[-offset];
-        int16_t m5  = (int16_t)src[offset];
-        int16_t m2  = (int16_t)src[-offset * 2];
-        int16_t m6  = (int16_t)src[offset * 2];
-        int16_t m1  = (int16_t)src[-offset * 3];
-        int16_t m7  = (int16_t)src[offset * 3];
-        int16_t m0  = (int16_t)src[-offset * 4];
-        src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1);
-        src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2);
-        src[-offset]     = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
-        src[0]           = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
-        src[offset]      = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
-        src[offset * 2]  = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
-    }
-}
-
-/* Weak filter */
 static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ,
                                  int32_t maskP1, int32_t maskQ1)
 {
@@ -445,7 +421,12 @@
                    useStrongFiltering(offset, beta, tc, src + unitOffset + srcStep * 3));
 
         if (sw)
-            pelFilterLumaStrong(src + unitOffset, srcStep, offset, tc, maskP, maskQ);
+        {
+            int32_t tc2 = 2 * tc;
+            int32_t tcP = (tc2 & maskP);
+            int32_t tcQ = (tc2 & maskQ);
+            primitives.pelFilterLumaStrong[dir](src + unitOffset, srcStep, offset, tcP, tcQ);
+        }
         else
         {
             int32_t sideThreshold = (beta + (beta >> 1)) >> 3;

 
@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Author: Gopu Govindaswamy <gopu@multicorewareinc.com>
+*         Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -108,7 +109,7 @@
     for (uint32_t e = 0; e < numUnits; e += partIdxIncr)
     {
         edgeFilterLuma(cu, absPartIdx, depth, dir, e, blockStrength);
-        if (!((e0 + e) & chromaMask))
+        if (!((e0 + e) & chromaMask) && cu->m_chromaFormat != X265_CSP_I400)
             edgeFilterChroma(cu, absPartIdx, depth, dir, e, blockStrength);
     }
 }
@@ -209,8 +210,8 @@
     const Slice* const sliceQ = cuQ->m_slice;
     const Slice* const sliceP = cuP->m_slice;
 
-    const Frame* refP0 = sliceP->getRefPic(0, cuP->m_refIdx[0][partP]);
-    const Frame* refQ0 = sliceQ->getRefPic(0, cuQ->m_refIdx[0][partQ]);
+    const Frame* refP0 = sliceP->m_refFrameList[0][cuP->m_refIdx[0][partP]];
+    const Frame* refQ0 = sliceQ->m_refFrameList[0][cuQ->m_refIdx[0][partQ]];
     const MV& mvP0 = refP0 ? cuP->m_mv[0][partP] : zeroMv;
     const MV& mvQ0 = refQ0 ? cuQ->m_mv[0][partQ] : zeroMv;
 
@@ -221,8 +222,8 @@
     }
 
     // (sliceQ->isInterB() || sliceP->isInterB())
-    const Frame* refP1 = sliceP->getRefPic(1, cuP->m_refIdx[1][partP]);
-    const Frame* refQ1 = sliceQ->getRefPic(1, cuQ->m_refIdx[1][partQ]);
+    const Frame* refP1 = sliceP->m_refFrameList[1][cuP->m_refIdx[1][partP]];
+    const Frame* refQ1 = sliceQ->m_refFrameList[1][cuQ->m_refIdx[1][partQ]];
     const MV& mvP1 = refP1 ? cuP->m_mv[1][partP] : zeroMv;
     const MV& mvQ1 = refQ1 ? cuQ->m_mv[1][partQ] : zeroMv;
 
@@ -279,31 +280,6 @@
  * \param maskQ   indicator to enable filtering on partQ
  * \param maskP1  decision weak filter/no filter for partP
  * \param maskQ1  decision weak filter/no filter for partQ */
-static inline void pelFilterLumaStrong(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
-{
-    int32_t tc2 = 2 * tc;
-    int32_t tcP = (tc2 & maskP);
-    int32_t tcQ = (tc2 & maskQ);
-    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
-    {
-        int16_t m4  = (int16_t)src[0];
-        int16_t m3  = (int16_t)src[-offset];
-        int16_t m5  = (int16_t)src[offset];
-        int16_t m2  = (int16_t)src[-offset * 2];
-        int16_t m6  = (int16_t)src[offset * 2];
-        int16_t m1  = (int16_t)src[-offset * 3];
-        int16_t m7  = (int16_t)src[offset * 3];
-        int16_t m0  = (int16_t)src[-offset * 4];
-        src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1);
-        src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2);
-        src[-offset]     = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
-        src[0]           = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
-        src[offset]      = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
-        src[offset * 2]  = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
-    }
-}
-
-/* Weak filter */
 static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ,
                                  int32_t maskP1, int32_t maskQ1)
 {
@@ -445,7 +421,12 @@
                    useStrongFiltering(offset, beta, tc, src + unitOffset + srcStep * 3));
 
         if (sw)
-            pelFilterLumaStrong(src + unitOffset, srcStep, offset, tc, maskP, maskQ);
+        {
+            int32_t tc2 = 2 * tc;
+            int32_t tcP = (tc2 & maskP);
+            int32_t tcQ = (tc2 & maskQ);
+            primitives.pelFilterLumaStrong[dir](src + unitOffset, srcStep, offset, tcP, tcQ);
+        }
         else
         {
             int32_t sideThreshold = (beta + (beta >> 1)) >> 3;
​

x265_1.8.tar.gz/source/common/deblock.h -> x265_1.9.tar.gz/source/common/deblock.h Changed

 
@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Author: Gopu Govindaswamy <gopu@multicorewareinc.com>
+*         Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -37,24 +38,24 @@
 public:
     enum { EDGE_VER, EDGE_HOR };
 
-    void deblockCTU(const CUData* ctu, const CUGeom& cuGeom, int32_t dir);
+    static void deblockCTU(const CUData* ctu, const CUGeom& cuGeom, int32_t dir);
 
 protected:
 
     // CU-level deblocking function
-    void deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[]);
+    static void deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[]);
 
     // set filtering functions
-    void setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t tuDepth, int32_t dir, uint8_t blockStrength[]);
-    void setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits);
-    void setEdgefilterMultiple(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits);
+    static void setEdgefilterTU(const CUData* cu, uint32_t absPartIdx, uint32_t tuDepth, int32_t dir, uint8_t blockStrength[]);
+    static void setEdgefilterPU(const CUData* cu, uint32_t absPartIdx, int32_t dir, uint8_t blockStrength[], uint32_t numUnits);
+    static void setEdgefilterMultiple(const CUData* cu, uint32_t absPartIdx, int32_t dir, int32_t edgeIdx, uint8_t value, uint8_t blockStrength[], uint32_t numUnits);
 
     // get filtering functions
-    uint8_t getBoundaryStrength(const CUData* cuQ, int32_t dir, uint32_t partQ, const uint8_t blockStrength[]);
+    static uint8_t getBoundaryStrength(const CUData* cuQ, int32_t dir, uint32_t partQ, const uint8_t blockStrength[]);
 
     // filter luma/chroma functions
-    void edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
-    void edgeFilterChroma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
+    static void edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
+    static void edgeFilterChroma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[]);
 
     static const uint8_t s_tcTable[54];
     static const uint8_t s_betaTable[52];
​

x265_1.8.tar.gz/source/common/frame.cpp -> x265_1.9.tar.gz/source/common/frame.cpp Changed

@@ -33,22 +33,37 @@
     m_bChromaExtended = false;
     m_lowresInit = false;
     m_reconRowCount.set(0);
+    m_reconColCount = NULL;
     m_countRefEncoders = 0;
     m_encData = NULL;
     m_reconPic = NULL;
+    m_quantOffsets = NULL;
     m_next = NULL;
     m_prev = NULL;
     m_param = NULL;
     memset(&m_lowres, 0, sizeof(m_lowres));
 }
 
-bool Frame::create(x265_param *param)
+bool Frame::create(x265_param *param, float* quantOffsets)
 {
     m_fencPic = new PicYuv;
     m_param = param;
 
-    return m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
-           m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode);
+    if (m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
+        m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode))
+    {
+        X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized");
+        m_numRows = (m_fencPic->m_picHeight + g_maxCUSize - 1)  / g_maxCUSize;
+        m_reconColCount = new ThreadSafeInteger[m_numRows];
+
+        if (quantOffsets)
+        {
+            int32_t cuCount = m_lowres.maxBlocksInRow * m_lowres.maxBlocksInCol;
+            m_quantOffsets = new float[cuCount];
+        }
+        return true;
+    }
+    return false;
 }
 
 bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
@@ -56,15 +71,27 @@
     m_encData = new FrameData;
     m_reconPic = new PicYuv;
     m_encData->m_reconPic = m_reconPic;
-    bool ok = m_encData->create(param, sps) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
+    bool ok = m_encData->create(*param, sps) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
     if (ok)
     {
         /* initialize right border of m_reconpicYuv as SAO may read beyond the
          * end of the picture accessing uninitialized pixels */
         int maxHeight = sps.numCuInHeight * g_maxCUSize;
-        memset(m_reconPic->m_picOrg[0], 0, sizeof(pixel) * m_reconPic->m_stride * maxHeight);
-        memset(m_reconPic->m_picOrg[1], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
-        memset(m_reconPic->m_picOrg[2], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
+        memset(m_reconPic->m_picOrg[0], 0, sizeof(pixel)* m_reconPic->m_stride * maxHeight);
+
+        /* use pre-calculated cu/pu offsets cached in the SPS structure */
+        m_reconPic->m_cuOffsetY = sps.cuOffsetY;
+        m_reconPic->m_buOffsetY = sps.buOffsetY;
+
+        if (param->internalCsp != X265_CSP_I400)
+        {
+            memset(m_reconPic->m_picOrg[1], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
+            memset(m_reconPic->m_picOrg[2], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
+
+            /* use pre-calculated cu/pu offsets cached in the SPS structure */
+            m_reconPic->m_cuOffsetC = sps.cuOffsetC;
+            m_reconPic->m_buOffsetC = sps.buOffsetC;
+        }
     }
     return ok;
 }
@@ -100,5 +127,16 @@
         m_reconPic = NULL;
     }
 
+    if (m_reconColCount)
+    {
+        delete[] m_reconColCount;
+        m_reconColCount = NULL;
+    }
+
+    if (m_quantOffsets)
+    {
+        delete[] m_quantOffsets;
+    }
+
     m_lowres.destroy();
 }

 
@@ -33,22 +33,37 @@
     m_bChromaExtended = false;
     m_lowresInit = false;
     m_reconRowCount.set(0);
+    m_reconColCount = NULL;
     m_countRefEncoders = 0;
     m_encData = NULL;
     m_reconPic = NULL;
+    m_quantOffsets = NULL;
     m_next = NULL;
     m_prev = NULL;
     m_param = NULL;
     memset(&m_lowres, 0, sizeof(m_lowres));
 }
 
-bool Frame::create(x265_param *param)
+bool Frame::create(x265_param *param, float* quantOffsets)
 {
     m_fencPic = new PicYuv;
     m_param = param;
 
-    return m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
-           m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode);
+    if (m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
+        m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode))
+    {
+        X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized");
+        m_numRows = (m_fencPic->m_picHeight + g_maxCUSize - 1)  / g_maxCUSize;
+        m_reconColCount = new ThreadSafeInteger[m_numRows];
+
+        if (quantOffsets)
+        {
+            int32_t cuCount = m_lowres.maxBlocksInRow * m_lowres.maxBlocksInCol;
+            m_quantOffsets = new float[cuCount];
+        }
+        return true;
+    }
+    return false;
 }
 
 bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
@@ -56,15 +71,27 @@
     m_encData = new FrameData;
     m_reconPic = new PicYuv;
     m_encData->m_reconPic = m_reconPic;
-    bool ok = m_encData->create(param, sps) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
+    bool ok = m_encData->create(*param, sps) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
     if (ok)
     {
         /* initialize right border of m_reconpicYuv as SAO may read beyond the
          * end of the picture accessing uninitialized pixels */
         int maxHeight = sps.numCuInHeight * g_maxCUSize;
-        memset(m_reconPic->m_picOrg[0], 0, sizeof(pixel) * m_reconPic->m_stride * maxHeight);
-        memset(m_reconPic->m_picOrg[1], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
-        memset(m_reconPic->m_picOrg[2], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
+        memset(m_reconPic->m_picOrg[0], 0, sizeof(pixel)* m_reconPic->m_stride * maxHeight);
+
+        /* use pre-calculated cu/pu offsets cached in the SPS structure */
+        m_reconPic->m_cuOffsetY = sps.cuOffsetY;
+        m_reconPic->m_buOffsetY = sps.buOffsetY;
+
+        if (param->internalCsp != X265_CSP_I400)
+        {
+            memset(m_reconPic->m_picOrg[1], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
+            memset(m_reconPic->m_picOrg[2], 0, sizeof(pixel) * m_reconPic->m_strideC * (maxHeight >> m_reconPic->m_vChromaShift));
+
+            /* use pre-calculated cu/pu offsets cached in the SPS structure */
+            m_reconPic->m_cuOffsetC = sps.cuOffsetC;
+            m_reconPic->m_buOffsetC = sps.buOffsetC;
+        }
     }
     return ok;
 }
@@ -100,5 +127,16 @@
         m_reconPic = NULL;
     }
 
+    if (m_reconColCount)
+    {
+        delete[] m_reconColCount;
+        m_reconColCount = NULL;
+    }
+
+    if (m_quantOffsets)
+    {
+        delete[] m_quantOffsets;
+    }
+
     m_lowres.destroy();
 }
​

x265_1.8.tar.gz/source/common/frame.h -> x265_1.9.tar.gz/source/common/frame.h Changed

@@ -35,7 +35,7 @@
 class PicYuv;
 struct SPS;
 
-#define IS_REFERENCED(frame) (frame->m_lowres.sliceType != X265_TYPE_B) 
+#define IS_REFERENCED(frame) (frame->m_lowres.sliceType != X265_TYPE_B)
 
 class Frame
 {
@@ -59,8 +59,12 @@
     bool                   m_lowresInit;         // lowres init complete (pre-analysis)
     bool                   m_bChromaExtended;    // orig chroma planes motion extended for weight analysis
 
+    float*                 m_quantOffsets;       // quant offsets buffer; allocated in Frame::create() when quantOffsets is non-NULL, released in destroy()
+
     /* Frame Parallelism - notification between FrameEncoders of available motion reference rows */
     ThreadSafeInteger      m_reconRowCount;      // count of CTU rows completely reconstructed and extended for motion reference
+    ThreadSafeInteger*     m_reconColCount;      // count of CTU cols completely reconstructed and extended for motion reference
+    int32_t                m_numRows;
     volatile uint32_t      m_countRefEncoders;   // count of FrameEncoder threads monitoring m_reconRowCount
 
     Frame*                 m_next;               // PicList doubly linked list pointers
@@ -69,7 +73,7 @@
     x265_analysis_data     m_analysisData;
     Frame();
 
-    bool create(x265_param *param);
+    bool create(x265_param *param, float* quantOffsets);
     bool allocEncodeData(x265_param *param, const SPS& sps);
     void reinit(const SPS& sps);
     void destroy();

 
@@ -35,7 +35,7 @@
 class PicYuv;
 struct SPS;
 
-#define IS_REFERENCED(frame) (frame->m_lowres.sliceType != X265_TYPE_B) 
+#define IS_REFERENCED(frame) (frame->m_lowres.sliceType != X265_TYPE_B)
 
 class Frame
 {
@@ -59,8 +59,12 @@
     bool                   m_lowresInit;         // lowres init complete (pre-analysis)
     bool                   m_bChromaExtended;    // orig chroma planes motion extended for weight analysis
 
+    float*                 m_quantOffsets;       // quant offsets buffer; allocated in Frame::create() when quantOffsets is non-NULL, released in destroy()
+
     /* Frame Parallelism - notification between FrameEncoders of available motion reference rows */
     ThreadSafeInteger      m_reconRowCount;      // count of CTU rows completely reconstructed and extended for motion reference
+    ThreadSafeInteger*     m_reconColCount;      // count of CTU cols completely reconstructed and extended for motion reference
+    int32_t                m_numRows;
     volatile uint32_t      m_countRefEncoders;   // count of FrameEncoder threads monitoring m_reconRowCount
 
     Frame*                 m_next;               // PicList doubly linked list pointers
@@ -69,7 +73,7 @@
     x265_analysis_data     m_analysisData;
     Frame();
 
-    bool create(x265_param *param);
+    bool create(x265_param *param, float* quantOffsets);
     bool allocEncodeData(x265_param *param, const SPS& sps);
     void reinit(const SPS& sps);
     void destroy();
​

x265_1.8.tar.gz/source/common/framedata.cpp -> x265_1.9.tar.gz/source/common/framedata.cpp Changed

 
@@ -31,15 +31,15 @@
     memset(this, 0, sizeof(*this));
 }
 
-bool FrameData::create(x265_param *param, const SPS& sps)
+bool FrameData::create(const x265_param& param, const SPS& sps)
 {
-    m_param = param;
+    m_param = &param;
     m_slice  = new Slice;
     m_picCTU = new CUData[sps.numCUsInFrame];
 
-    m_cuMemPool.create(0, param->internalCsp, sps.numCUsInFrame);
+    m_cuMemPool.create(0, param.internalCsp, sps.numCUsInFrame);
     for (uint32_t ctuAddr = 0; ctuAddr < sps.numCUsInFrame; ctuAddr++)
-        m_picCTU[ctuAddr].initialize(m_cuMemPool, 0, param->internalCsp, ctuAddr);
+        m_picCTU[ctuAddr].initialize(m_cuMemPool, 0, param.internalCsp, ctuAddr);
 
     CHECKED_MALLOC(m_cuStat, RCStatCU, sps.numCUsInFrame);
     CHECKED_MALLOC(m_rowStat, RCStatRow, sps.numCuInHeight);
​

x265_1.8.tar.gz/source/common/framedata.h -> x265_1.9.tar.gz/source/common/framedata.h Changed

@@ -55,8 +55,7 @@
     double      avgLumaDistortion;
     double      avgChromaDistortion;
     double      avgPsyEnergy;
-    double      avgLumaLevel;
-    double      lumaLevel;
+    double      avgResEnergy;
     double      percentIntraNxN;
     double      percentSkipCu[NUM_CU_DEPTH];
     double      percentMergeCu[NUM_CU_DEPTH];
@@ -69,13 +68,13 @@
     uint64_t    lumaDistortion;
     uint64_t    chromaDistortion;
     uint64_t    psyEnergy;
+    uint64_t    resEnergy;
     uint64_t    cntSkipCu[NUM_CU_DEPTH];
     uint64_t    cntMergeCu[NUM_CU_DEPTH];
     uint64_t    cntInter[NUM_CU_DEPTH];
     uint64_t    cntIntra[NUM_CU_DEPTH];
     uint64_t    cuInterDistribution[NUM_CU_DEPTH][INTER_MODES];
     uint64_t    cuIntraDistribution[NUM_CU_DEPTH][INTRA_MODES];
-    uint16_t    maxLumaLevel;
 
     FrameStats()
     {
@@ -96,7 +95,7 @@
 
     Slice*         m_slice;
     SAOParam*      m_saoParam;
-    x265_param*    m_param;
+    const x265_param* m_param;
 
     FrameData*     m_freeListNext;
     PicYuv*        m_reconPic;
@@ -135,19 +134,44 @@
     RCStatCU*      m_cuStat;
     RCStatRow*     m_rowStat;
     FrameStats     m_frameStats; // stats of current frame for multi-pass encodes
+    /* data needed for periodic intra refresh */
+    struct PeriodicIR
+    {
+        uint32_t   pirStartCol;
+        uint32_t   pirEndCol;
+        int        framesSinceLastPir;
+    };
 
+    PeriodicIR     m_pir;
     double         m_avgQpRc;    /* avg QP as decided by rate-control */
     double         m_avgQpAq;    /* avg QP as decided by AQ in addition to rate-control */
     double         m_rateFactor; /* calculated based on the Frame QP */
 
     FrameData();
 
-    bool create(x265_param *param, const SPS& sps);
+    bool create(const x265_param& param, const SPS& sps);
     void reinit(const SPS& sps);
     void destroy();
+    inline CUData* getPicCTU(uint32_t ctuAddr) { return &m_picCTU[ctuAddr]; }
+};
+
+/* Stores intra analysis data for a single frame. This struct needs better packing */
+struct analysis_intra_data
+{
+    uint8_t*  depth;
+    uint8_t*  modes;
+    char*     partSizes;
+    uint8_t*  chromaModes;
+};
 
-    CUData* getPicCTU(uint32_t ctuAddr) { return &m_picCTU[ctuAddr]; }
+/* Stores inter analysis data for a single frame */
+struct analysis_inter_data
+{
+    MV*         mv;
+    int32_t*    ref;
+    uint8_t*    depth;
+    uint8_t*    modes;
+    uint32_t*   bestMergeCand;
 };
 }
-
 #endif // ifndef X265_FRAMEDATA_H

 
@@ -55,8 +55,7 @@
     double      avgLumaDistortion;
     double      avgChromaDistortion;
     double      avgPsyEnergy;
-    double      avgLumaLevel;
-    double      lumaLevel;
+    double      avgResEnergy;
     double      percentIntraNxN;
     double      percentSkipCu[NUM_CU_DEPTH];
     double      percentMergeCu[NUM_CU_DEPTH];
@@ -69,13 +68,13 @@
     uint64_t    lumaDistortion;
     uint64_t    chromaDistortion;
     uint64_t    psyEnergy;
+    uint64_t    resEnergy;
     uint64_t    cntSkipCu[NUM_CU_DEPTH];
     uint64_t    cntMergeCu[NUM_CU_DEPTH];
     uint64_t    cntInter[NUM_CU_DEPTH];
     uint64_t    cntIntra[NUM_CU_DEPTH];
     uint64_t    cuInterDistribution[NUM_CU_DEPTH][INTER_MODES];
     uint64_t    cuIntraDistribution[NUM_CU_DEPTH][INTRA_MODES];
-    uint16_t    maxLumaLevel;
 
     FrameStats()
     {
@@ -96,7 +95,7 @@
 
     Slice*         m_slice;
     SAOParam*      m_saoParam;
-    x265_param*    m_param;
+    const x265_param* m_param;
 
     FrameData*     m_freeListNext;
     PicYuv*        m_reconPic;
@@ -135,19 +134,44 @@
     RCStatCU*      m_cuStat;
     RCStatRow*     m_rowStat;
     FrameStats     m_frameStats; // stats of current frame for multi-pass encodes
+    /* data needed for periodic intra refresh */
+    struct PeriodicIR
+    {
+        uint32_t   pirStartCol;
+        uint32_t   pirEndCol;
+        int        framesSinceLastPir;
+    };
 
+    PeriodicIR     m_pir;
     double         m_avgQpRc;    /* avg QP as decided by rate-control */
     double         m_avgQpAq;    /* avg QP as decided by AQ in addition to rate-control */
     double         m_rateFactor; /* calculated based on the Frame QP */
 
     FrameData();
 
-    bool create(x265_param *param, const SPS& sps);
+    bool create(const x265_param& param, const SPS& sps);
     void reinit(const SPS& sps);
     void destroy();
+    inline CUData* getPicCTU(uint32_t ctuAddr) { return &m_picCTU[ctuAddr]; }
+};
+
+/* Stores intra analysis data for a single frame. This struct needs better packing */
+struct analysis_intra_data
+{
+    uint8_t*  depth;
+    uint8_t*  modes;
+    char*     partSizes;
+    uint8_t*  chromaModes;
+};
 
-    CUData* getPicCTU(uint32_t ctuAddr) { return &m_picCTU[ctuAddr]; }
+/* Stores inter analysis data for a single frame */
+struct analysis_inter_data
+{
+    MV*         mv;
+    int32_t*    ref;
+    uint8_t*    depth;
+    uint8_t*    modes;
+    uint32_t*   bestMergeCand;
 };
 }
-
 #endif // ifndef X265_FRAMEDATA_H
​

x265_1.8.tar.gz/source/common/ipfilter.cpp -> x265_1.9.tar.gz/source/common/ipfilter.cpp Changed

 
@@ -4,6 +4,7 @@
  * Authors: Deepthi Devaki <deepthidevaki@multicorewareinc.com>,
  *          Rajesh Paulraj <rajesh@multicorewareinc.com>
  *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
​

x265_1.8.tar.gz/source/common/loopfilter.cpp -> x265_1.9.tar.gz/source/common/loopfilter.cpp Changed

@@ -3,6 +3,7 @@
 *
 * Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
 *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -136,6 +137,27 @@
         rec += stride;
     }
 }
+
+static void pelFilterLumaStrong_c(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ) // C implementation of the "strong" luma edge filter; modifies src in place
+{ // src[0] is the first sample at/after the edge; 'offset' steps across the edge, 'srcStep' steps along it
+    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep) // one pass per position along the edge, UNIT_SIZE positions in total
+    {
+        int16_t m4  = (int16_t)src[0]; // all eight taps are loaded before any store, so every output below is computed from unfiltered samples
+        int16_t m3  = (int16_t)src[-offset];
+        int16_t m5  = (int16_t)src[offset];
+        int16_t m2  = (int16_t)src[-offset * 2];
+        int16_t m6  = (int16_t)src[offset * 2];
+        int16_t m1  = (int16_t)src[-offset * 3];
+        int16_t m7  = (int16_t)src[offset * 3]; // outermost sample on the positive side: read only, never written
+        int16_t m0  = (int16_t)src[-offset * 4]; // outermost sample on the negative side: read only, never written
+        src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1); // each store = old sample + (rounded weighted average - old sample), with the change clipped to +/-tc
+        src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2); // negative-offset side: change limited to +/-tcP
+        src[-offset]     = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
+        src[0]           = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4); // non-negative-offset side: change limited to +/-tcQ
+        src[offset]      = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
+        src[offset * 2]  = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
+    }
+}
 }
 
 namespace X265_NS {
@@ -150,5 +172,9 @@
     p.saoCuOrgE3[1] = processSaoCUE3;
     p.saoCuOrgB0 = processSaoCUB0;
     p.sign = calSign;
+
+    // The C code is the same for EDGE_VER and EDGE_HOR; only the asm implementations differ
+    p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c;
+    p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c;
 }
 }

 
@@ -3,6 +3,7 @@
 *
 * Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
 *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -136,6 +137,27 @@
         rec += stride;
     }
 }
+
+static void pelFilterLumaStrong_c(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ) // C implementation of the "strong" luma edge filter; modifies src in place
+{ // src[0] is the first sample at/after the edge; 'offset' steps across the edge, 'srcStep' steps along it
+    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep) // one pass per position along the edge, UNIT_SIZE positions in total
+    {
+        int16_t m4  = (int16_t)src[0]; // all eight taps are loaded before any store, so every output below is computed from unfiltered samples
+        int16_t m3  = (int16_t)src[-offset];
+        int16_t m5  = (int16_t)src[offset];
+        int16_t m2  = (int16_t)src[-offset * 2];
+        int16_t m6  = (int16_t)src[offset * 2];
+        int16_t m1  = (int16_t)src[-offset * 3];
+        int16_t m7  = (int16_t)src[offset * 3]; // outermost sample on the positive side: read only, never written
+        int16_t m0  = (int16_t)src[-offset * 4]; // outermost sample on the negative side: read only, never written
+        src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1); // each store = old sample + (rounded weighted average - old sample), with the change clipped to +/-tc
+        src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2); // negative-offset side: change limited to +/-tcP
+        src[-offset]     = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
+        src[0]           = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4); // non-negative-offset side: change limited to +/-tcQ
+        src[offset]      = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
+        src[offset * 2]  = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
+    }
+}
 }
 
 namespace X265_NS {
@@ -150,5 +172,9 @@
     p.saoCuOrgE3[1] = processSaoCUE3;
     p.saoCuOrgB0 = processSaoCUB0;
     p.sign = calSign;
+
+    // The C code is the same for EDGE_VER and EDGE_HOR; only the asm implementations differ
+    p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c;
+    p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c;
 }
 }
​

x265_1.8.tar.gz/source/common/lowres.cpp -> x265_1.9.tar.gz/source/common/lowres.cpp Changed

 
@@ -52,6 +52,7 @@
         CHECKED_MALLOC(qpAqOffset, double, cuCount);
         CHECKED_MALLOC(invQscaleFactor, int, cuCount);
         CHECKED_MALLOC(qpCuTreeOffset, double, cuCount);
+        CHECKED_MALLOC(blockVariance, uint32_t, cuCount);
     }
     CHECKED_MALLOC(propagateCost, uint16_t, cuCount);
 
@@ -120,18 +121,17 @@
     X265_FREE(invQscaleFactor);
     X265_FREE(qpCuTreeOffset);
     X265_FREE(propagateCost);
+    X265_FREE(blockVariance);
 }
 
 // (re) initialize lowres state
 void Lowres::init(PicYuv *origPic, int poc)
 {
     bLastMiniGopBFrame = false;
-    bScenecut = false;  // could be a scene-cut, until ruled out by flash detection
     bKeyframe = false; // Not a keyframe unless identified by lookahead
     frameNum = poc;
     leadingBframes = 0;
     indB = 0;
-    satdCost = (int64_t)-1;
     memset(costEst, -1, sizeof(costEst));
     memset(weightedCostDelta, 0, sizeof(weightedCostDelta));
 
​

x265_1.8.tar.gz/source/common/lowres.h -> x265_1.9.tar.gz/source/common/lowres.h Changed

 
@@ -143,12 +143,15 @@
     double*   qpAqOffset;      // AQ QP offset values for each 16x16 CU
     double*   qpCuTreeOffset;  // cuTree QP offset values for each 16x16 CU
     int*      invQscaleFactor; // qScale values for qp Aq Offsets
+    uint32_t* blockVariance;
     uint64_t  wp_ssd[3];       // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
     uint64_t  wp_sum[3];
+    uint64_t  frameVariance;
 
     /* cutree intermediate data */
     uint16_t* propagateCost;
     double    weightedCostDelta[X265_BFRAME_MAX + 2];
+    ReferencePlanes weightedRef[X265_BFRAME_MAX + 2];
 
     bool create(PicYuv *origPic, int _bframes, bool bAqEnabled);
     void destroy();
​

x265_1.8.tar.gz/source/common/param.cpp -> x265_1.9.tar.gz/source/common/param.cpp Changed

@@ -147,7 +147,7 @@
     param->bFrameAdaptive = X265_B_ADAPT_TRELLIS;
     param->bBPyramid = 1;
     param->scenecutThreshold = 40; /* Magic number pulled in from x264 */
-    param->lookaheadSlices = 0;
+    param->lookaheadSlices = 8;
 
     /* Intra Coding Tools */
     param->bEnableConstrainedIntra = 0;
@@ -159,7 +159,8 @@
     param->subpelRefine = 2;
     param->searchRange = 57;
     param->maxNumMergeCand = 2;
-    param->limitReferences = 0;
+    param->limitReferences = 3;
+    param->limitModes = 0;
     param->bEnableWeightedPred = 1;
     param->bEnableWeightedBiPred = 0;
     param->bEnableEarlySkip = 0;
@@ -184,7 +185,7 @@
     param->cbQpOffset = 0;
     param->crQpOffset = 0;
     param->rdPenalty = 0;
-    param->psyRd = 0.3;
+    param->psyRd = 2.0;
     param->psyRdoq = 0.0;
     param->analysisMode = 0;
     param->analysisFileName = NULL;
@@ -241,6 +242,10 @@
     param->vui.defDispWinRightOffset = 0;
     param->vui.defDispWinTopOffset = 0;
     param->vui.defDispWinBottomOffset = 0;
+    param->maxCLL = 0;
+    param->maxFALL = 0;
+    param->minLuma = 0;
+    param->maxLuma = (1 << X265_DEPTH) - 1;
 }
 
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
@@ -274,9 +279,9 @@
             param->bEnableWeightedPred = 0;
             param->rdLevel = 2;
             param->maxNumReferences = 1;
+            param->limitReferences = 0;
             param->rc.aqStrength = 0.0;
             param->rc.aqMode = X265_AQ_NONE;
-            param->rc.cuTree = 0;
             param->rc.qgSize = 32;
             param->bEnableFastIntra = 1;
         }
@@ -291,9 +296,9 @@
             param->bEnableWeightedPred = 0;
             param->rdLevel = 2;
             param->maxNumReferences = 1;
+            param->limitReferences = 0;
             param->rc.aqStrength = 0.0;
             param->rc.aqMode = X265_AQ_NONE;
-            param->rc.cuTree = 0;
             param->rc.qgSize = 32;
             param->bEnableSAO = 0;
             param->bEnableFastIntra = 1;
@@ -301,13 +306,11 @@
         else if (!strcmp(preset, "veryfast"))
         {
             param->lookaheadDepth = 15;
-            param->maxCUSize = 32;
             param->bFrameAdaptive = 0;
             param->subpelRefine = 1;
             param->bEnableEarlySkip = 1;
             param->rdLevel = 2;
-            param->maxNumReferences = 1;
-            param->rc.cuTree = 0;
+            param->maxNumReferences = 2;
             param->rc.qgSize = 32;
             param->bEnableFastIntra = 1;
         }
@@ -317,8 +320,7 @@
             param->bFrameAdaptive = 0;
             param->bEnableEarlySkip = 1;
             param->rdLevel = 2;
-            param->maxNumReferences = 1;
-            param->rc.cuTree = 0;
+            param->maxNumReferences = 2;
             param->bEnableFastIntra = 1;
         }
         else if (!strcmp(preset, "fast"))
@@ -326,7 +328,7 @@
             param->lookaheadDepth = 15;
             param->bFrameAdaptive = 0;
             param->rdLevel = 2;
-            param->maxNumReferences = 2;
+            param->maxNumReferences = 3;
             param->bEnableFastIntra = 1;
         }
         else if (!strcmp(preset, "medium"))
@@ -343,6 +345,9 @@
             param->subpelRefine = 3;
             param->maxNumMergeCand = 3;
             param->searchMethod = X265_STAR_SEARCH;
+            param->maxNumReferences = 4;
+            param->limitModes = 1;
+            param->lookaheadSlices = 4; // limit parallelism as already enough work exists
         }
         else if (!strcmp(preset, "slower"))
         {
@@ -359,7 +364,11 @@
             param->subpelRefine = 3;
             param->maxNumMergeCand = 3;
             param->searchMethod = X265_STAR_SEARCH;
+            param->maxNumReferences = 4;
+            param->limitReferences = 2;
+            param->limitModes = 1;
             param->bIntraInBFrames = 1;
+            param->lookaheadSlices = 4; // limit parallelism as already enough work exists
         }
         else if (!strcmp(preset, "veryslow"))
         {
@@ -377,7 +386,10 @@
             param->maxNumMergeCand = 4;
             param->searchMethod = X265_STAR_SEARCH;
             param->maxNumReferences = 5;
+            param->limitReferences = 1;
+            param->limitModes = 1;
             param->bIntraInBFrames = 1;
+            param->lookaheadSlices = 0; // disabled for best quality
         }
         else if (!strcmp(preset, "placebo"))
         {
@@ -397,8 +409,10 @@
             param->searchMethod = X265_STAR_SEARCH;
             param->bEnableTransformSkip = 1;
             param->maxNumReferences = 5;
+            param->limitReferences = 0;
             param->rc.bEnableSlowFirstPass = 1;
             param->bIntraInBFrames = 1;
+            param->lookaheadSlices = 0; // disabled for best quality
             // TODO: optimized esa
         }
         else
@@ -565,10 +579,14 @@
     OPT2("level-idc", "level")
     {
         /* allow "5.1" or "51", both converted to integer 51 */
-        if (atof(value) < 7)
+        /* If level-idc is given an obviously wrong value, in either float or int form,
+         * report an error consistently. Stronger level checking is done in encoder_open(). */
+        if (atof(value) < 10)
             p->levelIdc = (int)(10 * atof(value) + .5);
-        else
+        else if (atoi(value) < 100)
             p->levelIdc = atoi(value);
+        else 
+            bError = true;
     }
     OPT("high-tier") p->bHighTier = atobool(value);
     OPT("allow-non-conformance") p->bAllowNonConformance = atobool(value);
@@ -608,6 +626,7 @@
     OPT2("constrained-intra", "cip") p->bEnableConstrainedIntra = atobool(value);
     OPT("fast-intra") p->bEnableFastIntra = atobool(value);
     OPT("open-gop") p->bOpenGOP = atobool(value);
+    OPT("intra-refresh") p->bIntraRefresh = atobool(value);
     OPT("lookahead-slices") p->lookaheadSlices = atoi(value);
     OPT("scenecut")
     {
@@ -644,6 +663,7 @@
     }
     OPT("ref") p->maxNumReferences = atoi(value);
     OPT("limit-refs") p->limitReferences = atoi(value);
+    OPT("limit-modes") p->limitModes = atobool(value);
     OPT("weightp") p->bEnableWeightedPred = atobool(value);
     OPT("weightb") p->bEnableWeightedBiPred = atobool(value);
     OPT("cbqpoffs") p->cbQpOffset = atoi(value);
@@ -854,7 +874,9 @@
     OPT("analysis-file") p->analysisFileName = strdup(value);
     OPT("qg-size") p->rc.qgSize = atoi(value);
     OPT("master-display") p->masteringDisplayColorVolume = strdup(value);
-    OPT("max-cll") p->contentLightLevelInfo = strdup(value);
+    OPT("max-cll") bError |= sscanf(value, "%hu,%hu", &p->maxCLL, &p->maxFALL) != 2;
+    OPT("min-luma") p->minLuma = (uint16_t)atoi(value);
+    OPT("max-luma") p->maxLuma = (uint16_t)atoi(value);
     else
         return X265_PARAM_BAD_NAME;
 #undef OPT
@@ -1035,6 +1057,8 @@
           "subme must be greater than or equal to 0");
     CHECK(param->limitReferences > 3,
           "limitReferences must be 0, 1, 2 or 3");
+    CHECK(param->limitModes > 1,
+          "limitModes must be 0 or 1");
     CHECK(param->frameNumThreads < 0 || param->frameNumThreads > X265_MAX_FRAME_THREADS,
           "frameNumThreads (--frame-threads) must be [0 .. X265_MAX_FRAME_THREADS)");
     CHECK(param->cbQpOffset < -12, "Min. Chroma Cb QP Offset is -12");
@@ -1063,8 +1087,8 @@
 
     CHECK(param->sourceWidth < (int)param->maxCUSize || param->sourceHeight < (int)param->maxCUSize,
           "Picture size must be at least one CTU");
-    CHECK(param->internalCsp < X265_CSP_I420 || X265_CSP_I444 < param->internalCsp,
-          "Color space must be i420, i422, or i444");
+    CHECK(param->internalCsp < X265_CSP_I400 || X265_CSP_I444 < param->internalCsp,
+          "chroma subsampling must be i400 (4:0:0 monochrome), i420 (4:2:0 default), i422 (4:2:2), i444 (4:4:4)");

 
@@ -147,7 +147,7 @@
     param->bFrameAdaptive = X265_B_ADAPT_TRELLIS;
     param->bBPyramid = 1;
     param->scenecutThreshold = 40; /* Magic number pulled in from x264 */
-    param->lookaheadSlices = 0;
+    param->lookaheadSlices = 8;
 
     /* Intra Coding Tools */
     param->bEnableConstrainedIntra = 0;
@@ -159,7 +159,8 @@
     param->subpelRefine = 2;
     param->searchRange = 57;
     param->maxNumMergeCand = 2;
-    param->limitReferences = 0;
+    param->limitReferences = 3;
+    param->limitModes = 0;
     param->bEnableWeightedPred = 1;
     param->bEnableWeightedBiPred = 0;
     param->bEnableEarlySkip = 0;
@@ -184,7 +185,7 @@
     param->cbQpOffset = 0;
     param->crQpOffset = 0;
     param->rdPenalty = 0;
-    param->psyRd = 0.3;
+    param->psyRd = 2.0;
     param->psyRdoq = 0.0;
     param->analysisMode = 0;
     param->analysisFileName = NULL;
@@ -241,6 +242,10 @@
     param->vui.defDispWinRightOffset = 0;
     param->vui.defDispWinTopOffset = 0;
     param->vui.defDispWinBottomOffset = 0;
+    param->maxCLL = 0;
+    param->maxFALL = 0;
+    param->minLuma = 0;
+    param->maxLuma = (1 << X265_DEPTH) - 1;
 }
 
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
@@ -274,9 +279,9 @@
             param->bEnableWeightedPred = 0;
             param->rdLevel = 2;
             param->maxNumReferences = 1;
+            param->limitReferences = 0;
             param->rc.aqStrength = 0.0;
             param->rc.aqMode = X265_AQ_NONE;
-            param->rc.cuTree = 0;
             param->rc.qgSize = 32;
             param->bEnableFastIntra = 1;
         }
@@ -291,9 +296,9 @@
             param->bEnableWeightedPred = 0;
             param->rdLevel = 2;
             param->maxNumReferences = 1;
+            param->limitReferences = 0;
             param->rc.aqStrength = 0.0;
             param->rc.aqMode = X265_AQ_NONE;
-            param->rc.cuTree = 0;
             param->rc.qgSize = 32;
             param->bEnableSAO = 0;
             param->bEnableFastIntra = 1;
@@ -301,13 +306,11 @@
         else if (!strcmp(preset, "veryfast"))
         {
             param->lookaheadDepth = 15;
-            param->maxCUSize = 32;
             param->bFrameAdaptive = 0;
             param->subpelRefine = 1;
             param->bEnableEarlySkip = 1;
             param->rdLevel = 2;
-            param->maxNumReferences = 1;
-            param->rc.cuTree = 0;
+            param->maxNumReferences = 2;
             param->rc.qgSize = 32;
             param->bEnableFastIntra = 1;
         }
@@ -317,8 +320,7 @@
             param->bFrameAdaptive = 0;
             param->bEnableEarlySkip = 1;
             param->rdLevel = 2;
-            param->maxNumReferences = 1;
-            param->rc.cuTree = 0;
+            param->maxNumReferences = 2;
             param->bEnableFastIntra = 1;
         }
         else if (!strcmp(preset, "fast"))
@@ -326,7 +328,7 @@
             param->lookaheadDepth = 15;
             param->bFrameAdaptive = 0;
             param->rdLevel = 2;
-            param->maxNumReferences = 2;
+            param->maxNumReferences = 3;
             param->bEnableFastIntra = 1;
         }
         else if (!strcmp(preset, "medium"))
@@ -343,6 +345,9 @@
             param->subpelRefine = 3;
             param->maxNumMergeCand = 3;
             param->searchMethod = X265_STAR_SEARCH;
+            param->maxNumReferences = 4;
+            param->limitModes = 1;
+            param->lookaheadSlices = 4; // limit parallelism as already enough work exists
         }
         else if (!strcmp(preset, "slower"))
         {
@@ -359,7 +364,11 @@
             param->subpelRefine = 3;
             param->maxNumMergeCand = 3;
             param->searchMethod = X265_STAR_SEARCH;
+            param->maxNumReferences = 4;
+            param->limitReferences = 2;
+            param->limitModes = 1;
             param->bIntraInBFrames = 1;
+            param->lookaheadSlices = 4; // limit parallelism as already enough work exists
         }
         else if (!strcmp(preset, "veryslow"))
         {
@@ -377,7 +386,10 @@
             param->maxNumMergeCand = 4;
             param->searchMethod = X265_STAR_SEARCH;
             param->maxNumReferences = 5;
+            param->limitReferences = 1;
+            param->limitModes = 1;
             param->bIntraInBFrames = 1;
+            param->lookaheadSlices = 0; // disabled for best quality
         }
         else if (!strcmp(preset, "placebo"))
         {
@@ -397,8 +409,10 @@
             param->searchMethod = X265_STAR_SEARCH;
             param->bEnableTransformSkip = 1;
             param->maxNumReferences = 5;
+            param->limitReferences = 0;
             param->rc.bEnableSlowFirstPass = 1;
             param->bIntraInBFrames = 1;
+            param->lookaheadSlices = 0; // disabled for best quality
             // TODO: optimized esa
         }
         else
@@ -565,10 +579,14 @@
     OPT2("level-idc", "level")
     {
         /* allow "5.1" or "51", both converted to integer 51 */
-        if (atof(value) < 7)
+        /* If level-idc is given an obviously wrong value, in either float or int form,
+         * report an error consistently. Stronger level checking is done in encoder_open(). */
+        if (atof(value) < 10)
             p->levelIdc = (int)(10 * atof(value) + .5);
-        else
+        else if (atoi(value) < 100)
             p->levelIdc = atoi(value);
+        else 
+            bError = true;
     }
     OPT("high-tier") p->bHighTier = atobool(value);
     OPT("allow-non-conformance") p->bAllowNonConformance = atobool(value);
@@ -608,6 +626,7 @@
     OPT2("constrained-intra", "cip") p->bEnableConstrainedIntra = atobool(value);
     OPT("fast-intra") p->bEnableFastIntra = atobool(value);
     OPT("open-gop") p->bOpenGOP = atobool(value);
+    OPT("intra-refresh") p->bIntraRefresh = atobool(value);
     OPT("lookahead-slices") p->lookaheadSlices = atoi(value);
     OPT("scenecut")
     {
@@ -644,6 +663,7 @@
     }
     OPT("ref") p->maxNumReferences = atoi(value);
     OPT("limit-refs") p->limitReferences = atoi(value);
+    OPT("limit-modes") p->limitModes = atobool(value);
     OPT("weightp") p->bEnableWeightedPred = atobool(value);
     OPT("weightb") p->bEnableWeightedBiPred = atobool(value);
     OPT("cbqpoffs") p->cbQpOffset = atoi(value);
@@ -854,7 +874,9 @@
     OPT("analysis-file") p->analysisFileName = strdup(value);
     OPT("qg-size") p->rc.qgSize = atoi(value);
     OPT("master-display") p->masteringDisplayColorVolume = strdup(value);
-    OPT("max-cll") p->contentLightLevelInfo = strdup(value);
+    OPT("max-cll") bError |= sscanf(value, "%hu,%hu", &p->maxCLL, &p->maxFALL) != 2;
+    OPT("min-luma") p->minLuma = (uint16_t)atoi(value);
+    OPT("max-luma") p->maxLuma = (uint16_t)atoi(value);
     else
         return X265_PARAM_BAD_NAME;
 #undef OPT
@@ -1035,6 +1057,8 @@
           "subme must be greater than or equal to 0");
     CHECK(param->limitReferences > 3,
           "limitReferences must be 0, 1, 2 or 3");
+    CHECK(param->limitModes > 1,
+          "limitModes must be 0 or 1");
     CHECK(param->frameNumThreads < 0 || param->frameNumThreads > X265_MAX_FRAME_THREADS,
           "frameNumThreads (--frame-threads) must be [0 .. X265_MAX_FRAME_THREADS)");
     CHECK(param->cbQpOffset < -12, "Min. Chroma Cb QP Offset is -12");
@@ -1063,8 +1087,8 @@
 
     CHECK(param->sourceWidth < (int)param->maxCUSize || param->sourceHeight < (int)param->maxCUSize,
           "Picture size must be at least one CTU");
-    CHECK(param->internalCsp < X265_CSP_I420 || X265_CSP_I444 < param->internalCsp,
-          "Color space must be i420, i422, or i444");
+    CHECK(param->internalCsp < X265_CSP_I400 || X265_CSP_I444 < param->internalCsp,
+          "chroma subsampling must be i400 (4:0:0 monochrome), i420 (4:2:0 default), i422 (4:2:2), i444 (4:4:4)");
​

x265_1.8.tar.gz/source/common/picyuv.cpp -> x265_1.9.tar.gz/source/common/picyuv.cpp Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2015 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -42,6 +43,9 @@
     m_cuOffsetC = NULL;
     m_buOffsetY = NULL;
     m_buOffsetC = NULL;
+
+    m_maxLumaLevel = 0;
+    m_avgLumaLevel = 0;
 }
 
 bool PicYuv::create(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp)
@@ -59,20 +63,27 @@
     m_lumaMarginY = g_maxCUSize + 16; // margin for 8-tap filter and infinite padding
     m_stride = (numCuInWidth * g_maxCUSize) + (m_lumaMarginX << 1);
 
-    m_chromaMarginX = m_lumaMarginX;  // keep 16-byte alignment for chroma CTUs
-    m_chromaMarginY = m_lumaMarginY >> m_vChromaShift;
-
-    m_strideC = ((numCuInWidth * g_maxCUSize) >> m_hChromaShift) + (m_chromaMarginX * 2);
     int maxHeight = numCuInHeight * g_maxCUSize;
-
     CHECKED_MALLOC(m_picBuf[0], pixel, m_stride * (maxHeight + (m_lumaMarginY * 2)));
-    CHECKED_MALLOC(m_picBuf[1], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
-    CHECKED_MALLOC(m_picBuf[2], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
+    m_picOrg[0] = m_picBuf[0] + m_lumaMarginY * m_stride + m_lumaMarginX;
+
+    if (picCsp != X265_CSP_I400)
+    {
+        m_chromaMarginX = m_lumaMarginX;  // keep 16-byte alignment for chroma CTUs
+        m_chromaMarginY = m_lumaMarginY >> m_vChromaShift;
+        m_strideC = ((numCuInWidth * g_maxCUSize) >> m_hChromaShift) + (m_chromaMarginX * 2);
 
-    m_picOrg[0] = m_picBuf[0] + m_lumaMarginY   * m_stride  + m_lumaMarginX;
-    m_picOrg[1] = m_picBuf[1] + m_chromaMarginY * m_strideC + m_chromaMarginX;
-    m_picOrg[2] = m_picBuf[2] + m_chromaMarginY * m_strideC + m_chromaMarginX;
+        CHECKED_MALLOC(m_picBuf[1], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
+        CHECKED_MALLOC(m_picBuf[2], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
 
+        m_picOrg[1] = m_picBuf[1] + m_chromaMarginY * m_strideC + m_chromaMarginX;
+        m_picOrg[2] = m_picBuf[2] + m_chromaMarginY * m_strideC + m_chromaMarginX;
+    }
+    else
+    {
+        m_picBuf[1] = m_picBuf[2] = NULL;
+        m_picOrg[1] = m_picOrg[2] = NULL;
+    }
     return true;
 
 fail:
@@ -85,27 +96,45 @@
 bool PicYuv::createOffsets(const SPS& sps)
 {
     uint32_t numPartitions = 1 << (g_unitSizeDepth * 2);
-    CHECKED_MALLOC(m_cuOffsetY, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
-    CHECKED_MALLOC(m_cuOffsetC, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
-    for (uint32_t cuRow = 0; cuRow < sps.numCuInHeight; cuRow++)
+
+    if (m_picCsp != X265_CSP_I400)
     {
-        for (uint32_t cuCol = 0; cuCol < sps.numCuInWidth; cuCol++)
+        CHECKED_MALLOC(m_cuOffsetY, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
+        CHECKED_MALLOC(m_cuOffsetC, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
+        for (uint32_t cuRow = 0; cuRow < sps.numCuInHeight; cuRow++)
         {
-            m_cuOffsetY[cuRow * sps.numCuInWidth + cuCol] = m_stride * cuRow * g_maxCUSize + cuCol * g_maxCUSize;
-            m_cuOffsetC[cuRow * sps.numCuInWidth + cuCol] = m_strideC * cuRow * (g_maxCUSize >> m_vChromaShift) + cuCol * (g_maxCUSize >> m_hChromaShift);
+            for (uint32_t cuCol = 0; cuCol < sps.numCuInWidth; cuCol++)
+            {
+                m_cuOffsetY[cuRow * sps.numCuInWidth + cuCol] = m_stride * cuRow * g_maxCUSize + cuCol * g_maxCUSize;
+                m_cuOffsetC[cuRow * sps.numCuInWidth + cuCol] = m_strideC * cuRow * (g_maxCUSize >> m_vChromaShift) + cuCol * (g_maxCUSize >> m_hChromaShift);
+            }
         }
-    }
 
-    CHECKED_MALLOC(m_buOffsetY, intptr_t, (size_t)numPartitions);
-    CHECKED_MALLOC(m_buOffsetC, intptr_t, (size_t)numPartitions);
-    for (uint32_t idx = 0; idx < numPartitions; ++idx)
-    {
-        intptr_t x = g_zscanToPelX[idx];
-        intptr_t y = g_zscanToPelY[idx];
-        m_buOffsetY[idx] = m_stride * y + x;
-        m_buOffsetC[idx] = m_strideC * (y >> m_vChromaShift) + (x >> m_hChromaShift);
+        CHECKED_MALLOC(m_buOffsetY, intptr_t, (size_t)numPartitions);
+        CHECKED_MALLOC(m_buOffsetC, intptr_t, (size_t)numPartitions);
+        for (uint32_t idx = 0; idx < numPartitions; ++idx)
+        {
+            intptr_t x = g_zscanToPelX[idx];
+            intptr_t y = g_zscanToPelY[idx];
+            m_buOffsetY[idx] = m_stride * y + x;
+            m_buOffsetC[idx] = m_strideC * (y >> m_vChromaShift) + (x >> m_hChromaShift);
+        }
     }
+    else
+    {
+        CHECKED_MALLOC(m_cuOffsetY, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
+        for (uint32_t cuRow = 0; cuRow < sps.numCuInHeight; cuRow++)
+        for (uint32_t cuCol = 0; cuCol < sps.numCuInWidth; cuCol++)
+            m_cuOffsetY[cuRow * sps.numCuInWidth + cuCol] = m_stride * cuRow * g_maxCUSize + cuCol * g_maxCUSize;
 
+        CHECKED_MALLOC(m_buOffsetY, intptr_t, (size_t)numPartitions);
+        for (uint32_t idx = 0; idx < numPartitions; ++idx)
+        {
+            intptr_t x = g_zscanToPelX[idx];
+            intptr_t y = g_zscanToPelY[idx];
+            m_buOffsetY[idx] = m_stride * y + x;
+        }
+    }
     return true;
 
 fail:
@@ -121,7 +150,7 @@
 
 /* Copy pixels from an x265_picture into internal PicYuv instance.
  * Shift pixels as necessary, mask off bits above X265_DEPTH for safety. */
-void PicYuv::copyFromPicture(const x265_picture& pic, int padx, int pady)
+void PicYuv::copyFromPicture(const x265_picture& pic, const x265_param& param, int padx, int pady)
 {
     /* m_picWidth is the width that is being encoded, padx indicates how many
      * of those pixels are padding to reach multiple of MinCU(4) size.
@@ -155,28 +184,29 @@
 #if (X265_DEPTH > 8)
         {
             pixel *yPixel = m_picOrg[0];
-            pixel *uPixel = m_picOrg[1];
-            pixel *vPixel = m_picOrg[2];
 
             uint8_t *yChar = (uint8_t*)pic.planes[0];
-            uint8_t *uChar = (uint8_t*)pic.planes[1];
-            uint8_t *vChar = (uint8_t*)pic.planes[2];
             int shift = (X265_DEPTH - 8);
 
             primitives.planecopy_cp(yChar, pic.stride[0] / sizeof(*yChar), yPixel, m_stride, width, height, shift);
-            primitives.planecopy_cp(uChar, pic.stride[1] / sizeof(*uChar), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
-            primitives.planecopy_cp(vChar, pic.stride[2] / sizeof(*vChar), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
+
+            if (pic.colorSpace != X265_CSP_I400)
+            {
+                pixel *uPixel = m_picOrg[1];
+                pixel *vPixel = m_picOrg[2];
+
+                uint8_t *uChar = (uint8_t*)pic.planes[1];
+                uint8_t *vChar = (uint8_t*)pic.planes[2];
+
+                primitives.planecopy_cp(uChar, pic.stride[1] / sizeof(*uChar), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
+                primitives.planecopy_cp(vChar, pic.stride[2] / sizeof(*vChar), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
+            }
         }
 #else /* Case for (X265_DEPTH == 8) */
         // TODO: Does we need this path? may merge into above in future
         {
             pixel *yPixel = m_picOrg[0];
-            pixel *uPixel = m_picOrg[1];
-            pixel *vPixel = m_picOrg[2];
-
             uint8_t *yChar = (uint8_t*)pic.planes[0];
-            uint8_t *uChar = (uint8_t*)pic.planes[1];
-            uint8_t *vChar = (uint8_t*)pic.planes[2];
 
             for (int r = 0; r < height; r++)
             {
@@ -186,15 +216,24 @@
                 yChar += pic.stride[0] / sizeof(*yChar);
             }
 
-            for (int r = 0; r < height >> m_vChromaShift; r++)
+            if (pic.colorSpace != X265_CSP_I400)
             {
-                memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
-                memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
+                pixel *uPixel = m_picOrg[1];
+                pixel *vPixel = m_picOrg[2];
+
+                uint8_t *uChar = (uint8_t*)pic.planes[1];
+                uint8_t *vChar = (uint8_t*)pic.planes[2];
+
+                for (int r = 0; r < height >> m_vChromaShift; r++)
+                {
+                    memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
+                    memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
 
-                uPixel += m_strideC;
-                vPixel += m_strideC;
-                uChar += pic.stride[1] / sizeof(*uChar);
-                vChar += pic.stride[2] / sizeof(*vChar);
+                    uPixel += m_strideC;
+                    vPixel += m_strideC;
+                    uChar += pic.stride[1] / sizeof(*uChar);
+                    vChar += pic.stride[2] / sizeof(*vChar);
+                }
             }
         }
 #endif /* (X265_DEPTH > 8) */
@@ -205,43 +244,63 @@

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2015 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -42,6 +43,9 @@
     m_cuOffsetC = NULL;
     m_buOffsetY = NULL;
     m_buOffsetC = NULL;
+
+    m_maxLumaLevel = 0;
+    m_avgLumaLevel = 0;
 }
 
 bool PicYuv::create(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp)
@@ -59,20 +63,27 @@
     m_lumaMarginY = g_maxCUSize + 16; // margin for 8-tap filter and infinite padding
     m_stride = (numCuInWidth * g_maxCUSize) + (m_lumaMarginX << 1);
 
-    m_chromaMarginX = m_lumaMarginX;  // keep 16-byte alignment for chroma CTUs
-    m_chromaMarginY = m_lumaMarginY >> m_vChromaShift;
-
-    m_strideC = ((numCuInWidth * g_maxCUSize) >> m_hChromaShift) + (m_chromaMarginX * 2);
     int maxHeight = numCuInHeight * g_maxCUSize;
-
     CHECKED_MALLOC(m_picBuf[0], pixel, m_stride * (maxHeight + (m_lumaMarginY * 2)));
-    CHECKED_MALLOC(m_picBuf[1], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
-    CHECKED_MALLOC(m_picBuf[2], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
+    m_picOrg[0] = m_picBuf[0] + m_lumaMarginY * m_stride + m_lumaMarginX;
+
+    if (picCsp != X265_CSP_I400)
+    {
+        m_chromaMarginX = m_lumaMarginX;  // keep 16-byte alignment for chroma CTUs
+        m_chromaMarginY = m_lumaMarginY >> m_vChromaShift;
+        m_strideC = ((numCuInWidth * g_maxCUSize) >> m_hChromaShift) + (m_chromaMarginX * 2);
 
-    m_picOrg[0] = m_picBuf[0] + m_lumaMarginY   * m_stride  + m_lumaMarginX;
-    m_picOrg[1] = m_picBuf[1] + m_chromaMarginY * m_strideC + m_chromaMarginX;
-    m_picOrg[2] = m_picBuf[2] + m_chromaMarginY * m_strideC + m_chromaMarginX;
+        CHECKED_MALLOC(m_picBuf[1], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
+        CHECKED_MALLOC(m_picBuf[2], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
 
+        m_picOrg[1] = m_picBuf[1] + m_chromaMarginY * m_strideC + m_chromaMarginX;
+        m_picOrg[2] = m_picBuf[2] + m_chromaMarginY * m_strideC + m_chromaMarginX;
+    }
+    else
+    {
+        m_picBuf[1] = m_picBuf[2] = NULL;
+        m_picOrg[1] = m_picOrg[2] = NULL;
+    }
     return true;
 
 fail:
@@ -85,27 +96,45 @@
 bool PicYuv::createOffsets(const SPS& sps)
 {
     uint32_t numPartitions = 1 << (g_unitSizeDepth * 2);
-    CHECKED_MALLOC(m_cuOffsetY, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
-    CHECKED_MALLOC(m_cuOffsetC, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
-    for (uint32_t cuRow = 0; cuRow < sps.numCuInHeight; cuRow++)
+
+    if (m_picCsp != X265_CSP_I400)
     {
-        for (uint32_t cuCol = 0; cuCol < sps.numCuInWidth; cuCol++)
+        CHECKED_MALLOC(m_cuOffsetY, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
+        CHECKED_MALLOC(m_cuOffsetC, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
+        for (uint32_t cuRow = 0; cuRow < sps.numCuInHeight; cuRow++)
         {
-            m_cuOffsetY[cuRow * sps.numCuInWidth + cuCol] = m_stride * cuRow * g_maxCUSize + cuCol * g_maxCUSize;
-            m_cuOffsetC[cuRow * sps.numCuInWidth + cuCol] = m_strideC * cuRow * (g_maxCUSize >> m_vChromaShift) + cuCol * (g_maxCUSize >> m_hChromaShift);
+            for (uint32_t cuCol = 0; cuCol < sps.numCuInWidth; cuCol++)
+            {
+                m_cuOffsetY[cuRow * sps.numCuInWidth + cuCol] = m_stride * cuRow * g_maxCUSize + cuCol * g_maxCUSize;
+                m_cuOffsetC[cuRow * sps.numCuInWidth + cuCol] = m_strideC * cuRow * (g_maxCUSize >> m_vChromaShift) + cuCol * (g_maxCUSize >> m_hChromaShift);
+            }
         }
-    }
 
-    CHECKED_MALLOC(m_buOffsetY, intptr_t, (size_t)numPartitions);
-    CHECKED_MALLOC(m_buOffsetC, intptr_t, (size_t)numPartitions);
-    for (uint32_t idx = 0; idx < numPartitions; ++idx)
-    {
-        intptr_t x = g_zscanToPelX[idx];
-        intptr_t y = g_zscanToPelY[idx];
-        m_buOffsetY[idx] = m_stride * y + x;
-        m_buOffsetC[idx] = m_strideC * (y >> m_vChromaShift) + (x >> m_hChromaShift);
+        CHECKED_MALLOC(m_buOffsetY, intptr_t, (size_t)numPartitions);
+        CHECKED_MALLOC(m_buOffsetC, intptr_t, (size_t)numPartitions);
+        for (uint32_t idx = 0; idx < numPartitions; ++idx)
+        {
+            intptr_t x = g_zscanToPelX[idx];
+            intptr_t y = g_zscanToPelY[idx];
+            m_buOffsetY[idx] = m_stride * y + x;
+            m_buOffsetC[idx] = m_strideC * (y >> m_vChromaShift) + (x >> m_hChromaShift);
+        }
     }
+    else
+    {
+        CHECKED_MALLOC(m_cuOffsetY, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
+        for (uint32_t cuRow = 0; cuRow < sps.numCuInHeight; cuRow++)
+        for (uint32_t cuCol = 0; cuCol < sps.numCuInWidth; cuCol++)
+            m_cuOffsetY[cuRow * sps.numCuInWidth + cuCol] = m_stride * cuRow * g_maxCUSize + cuCol * g_maxCUSize;
 
+        CHECKED_MALLOC(m_buOffsetY, intptr_t, (size_t)numPartitions);
+        for (uint32_t idx = 0; idx < numPartitions; ++idx)
+        {
+            intptr_t x = g_zscanToPelX[idx];
+            intptr_t y = g_zscanToPelY[idx];
+            m_buOffsetY[idx] = m_stride * y + x;
+        }
+    }
     return true;
 
 fail:
@@ -121,7 +150,7 @@
 
 /* Copy pixels from an x265_picture into internal PicYuv instance.
  * Shift pixels as necessary, mask off bits above X265_DEPTH for safety. */
-void PicYuv::copyFromPicture(const x265_picture& pic, int padx, int pady)
+void PicYuv::copyFromPicture(const x265_picture& pic, const x265_param& param, int padx, int pady)
 {
     /* m_picWidth is the width that is being encoded, padx indicates how many
      * of those pixels are padding to reach multiple of MinCU(4) size.
@@ -155,28 +184,29 @@
 #if (X265_DEPTH > 8)
         {
             pixel *yPixel = m_picOrg[0];
-            pixel *uPixel = m_picOrg[1];
-            pixel *vPixel = m_picOrg[2];
 
             uint8_t *yChar = (uint8_t*)pic.planes[0];
-            uint8_t *uChar = (uint8_t*)pic.planes[1];
-            uint8_t *vChar = (uint8_t*)pic.planes[2];
             int shift = (X265_DEPTH - 8);
 
             primitives.planecopy_cp(yChar, pic.stride[0] / sizeof(*yChar), yPixel, m_stride, width, height, shift);
-            primitives.planecopy_cp(uChar, pic.stride[1] / sizeof(*uChar), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
-            primitives.planecopy_cp(vChar, pic.stride[2] / sizeof(*vChar), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
+
+            if (pic.colorSpace != X265_CSP_I400)
+            {
+                pixel *uPixel = m_picOrg[1];
+                pixel *vPixel = m_picOrg[2];
+
+                uint8_t *uChar = (uint8_t*)pic.planes[1];
+                uint8_t *vChar = (uint8_t*)pic.planes[2];
+
+                primitives.planecopy_cp(uChar, pic.stride[1] / sizeof(*uChar), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
+                primitives.planecopy_cp(vChar, pic.stride[2] / sizeof(*vChar), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
+            }
         }
 #else /* Case for (X265_DEPTH == 8) */
         // TODO: Does we need this path? may merge into above in future
         {
             pixel *yPixel = m_picOrg[0];
-            pixel *uPixel = m_picOrg[1];
-            pixel *vPixel = m_picOrg[2];
-
             uint8_t *yChar = (uint8_t*)pic.planes[0];
-            uint8_t *uChar = (uint8_t*)pic.planes[1];
-            uint8_t *vChar = (uint8_t*)pic.planes[2];
 
             for (int r = 0; r < height; r++)
             {
@@ -186,15 +216,24 @@
                 yChar += pic.stride[0] / sizeof(*yChar);
             }
 
-            for (int r = 0; r < height >> m_vChromaShift; r++)
+            if (pic.colorSpace != X265_CSP_I400)
             {
-                memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
-                memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
+                pixel *uPixel = m_picOrg[1];
+                pixel *vPixel = m_picOrg[2];
+
+                uint8_t *uChar = (uint8_t*)pic.planes[1];
+                uint8_t *vChar = (uint8_t*)pic.planes[2];
+
+                for (int r = 0; r < height >> m_vChromaShift; r++)
+                {
+                    memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
+                    memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
 
-                uPixel += m_strideC;
-                vPixel += m_strideC;
-                uChar += pic.stride[1] / sizeof(*uChar);
-                vChar += pic.stride[2] / sizeof(*vChar);
+                    uPixel += m_strideC;
+                    vPixel += m_strideC;
+                    uChar += pic.stride[1] / sizeof(*uChar);
+                    vChar += pic.stride[2] / sizeof(*vChar);
+                }
             }
         }
 #endif /* (X265_DEPTH > 8) */
@@ -205,43 +244,63 @@
​

x265_1.8.tar.gz/source/common/picyuv.h -> x265_1.9.tar.gz/source/common/picyuv.h Changed

 
@@ -60,13 +60,16 @@
     uint32_t m_chromaMarginX;
     uint32_t m_chromaMarginY;
 
+    uint16_t m_maxLumaLevel;
+    double   m_avgLumaLevel;
+
     PicYuv();
 
     bool  create(uint32_t picWidth, uint32_t picHeight, uint32_t csp);
     bool  createOffsets(const SPS& sps);
     void  destroy();
 
-    void  copyFromPicture(const x265_picture&, int padx, int pady);
+    void  copyFromPicture(const x265_picture&, const x265_param& param, int padx, int pady);
 
     intptr_t getChromaAddrOffset(uint32_t ctuAddr, uint32_t absPartIdx) const { return m_cuOffsetC[ctuAddr] + m_buOffsetC[absPartIdx]; }
 
​

x265_1.8.tar.gz/source/common/pixel.cpp -> x265_1.9.tar.gz/source/common/pixel.cpp Changed

@@ -25,6 +25,7 @@
  *****************************************************************************/
 
 #include "common.h"
+#include "slicetype.h"      // LOWRES_COST_MASK
 #include "primitives.h"
 #include "x265.h"
 
@@ -117,9 +118,9 @@
 }
 
 template<int lx, int ly, class T1, class T2>
-sse_ret_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
+sse_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
 {
-    sse_ret_t sum = 0;
+    sse_t sum = 0;
     int tmp;
 
     for (int y = 0; y < ly; y++)
@@ -187,37 +188,6 @@
     return (int)(sum >> 1);
 }
 
-static int satd_4x4(const int16_t* pix1, intptr_t stride_pix1)
-{
-    int32_t tmp[4][4];
-    int32_t s01, s23, d01, d23;
-    int32_t satd = 0;
-    int d;
-
-    for (d = 0; d < 4; d++, pix1 += stride_pix1)
-    {
-        s01 = pix1[0] + pix1[1];
-        s23 = pix1[2] + pix1[3];
-        d01 = pix1[0] - pix1[1];
-        d23 = pix1[2] - pix1[3];
-
-        tmp[d][0] = s01 + s23;
-        tmp[d][1] = s01 - s23;
-        tmp[d][2] = d01 - d23;
-        tmp[d][3] = d01 + d23;
-    }
-
-    for (d = 0; d < 4; d++)
-    {
-        s01 = tmp[0][d] + tmp[1][d];
-        s23 = tmp[2][d] + tmp[3][d];
-        d01 = tmp[0][d] - tmp[1][d];
-        d23 = tmp[2][d] - tmp[3][d];
-        satd += abs(s01 + s23) + abs(s01 - s23) + abs(d01 - d23) + abs(d01 + d23);
-    }
-    return (int)(satd / 2);
-}
-
 // x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once
 static int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
 {
@@ -313,57 +283,6 @@
     return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
 }
 
-inline int _sa8d_8x8(const int16_t* pix1, intptr_t i_pix1)
-{
-    int32_t tmp[8][8];
-    int32_t a0, a1, a2, a3, a4, a5, a6, a7;
-    int32_t sum = 0;
-
-    for (int i = 0; i < 8; i++, pix1 += i_pix1)
-    {
-        a0 = pix1[0] + pix1[1];
-        a1 = pix1[2] + pix1[3];
-        a2 = pix1[4] + pix1[5];
-        a3 = pix1[6] + pix1[7];
-        a4 = pix1[0] - pix1[1];
-        a5 = pix1[2] - pix1[3];
-        a6 = pix1[4] - pix1[5];
-        a7 = pix1[6] - pix1[7];
-        tmp[i][0] = (a0 + a1) + (a2 + a3);
-        tmp[i][1] = (a0 + a1) - (a2 + a3);
-        tmp[i][2] = (a0 - a1) + (a2 - a3);
-        tmp[i][3] = (a0 - a1) - (a2 - a3);
-        tmp[i][4] = (a4 + a5) + (a6 + a7);
-        tmp[i][5] = (a4 + a5) - (a6 + a7);
-        tmp[i][6] = (a4 - a5) + (a6 - a7);
-        tmp[i][7] = (a4 - a5) - (a6 - a7);
-    }
-
-    for (int i = 0; i < 8; i++)
-    {
-        a0 = (tmp[0][i] + tmp[1][i]) + (tmp[2][i] + tmp[3][i]);
-        a2 = (tmp[0][i] + tmp[1][i]) - (tmp[2][i] + tmp[3][i]);
-        a1 = (tmp[0][i] - tmp[1][i]) + (tmp[2][i] - tmp[3][i]);
-        a3 = (tmp[0][i] - tmp[1][i]) - (tmp[2][i] - tmp[3][i]);
-        a4 = (tmp[4][i] + tmp[5][i]) + (tmp[6][i] + tmp[7][i]);
-        a6 = (tmp[4][i] + tmp[5][i]) - (tmp[6][i] + tmp[7][i]);
-        a5 = (tmp[4][i] - tmp[5][i]) + (tmp[6][i] - tmp[7][i]);
-        a7 = (tmp[4][i] - tmp[5][i]) - (tmp[6][i] - tmp[7][i]);
-        a0 = abs(a0 + a4) + abs(a0 - a4);
-        a0 += abs(a1 + a5) + abs(a1 - a5);
-        a0 += abs(a2 + a6) + abs(a2 - a6);
-        a0 += abs(a3 + a7) + abs(a3 - a7);
-        sum += a0;
-    }
-
-    return (int)sum;
-}
-
-static int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1)
-{
-    return (int)((_sa8d_8x8(pix1, i_pix1) + 2) >> 2);
-}
-
 static int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
 {
     int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
@@ -403,9 +322,9 @@
 }
 
 template<int size>
-int pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
+sse_t pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
 {
-    int sum = 0;
+    sse_t sum = 0;
     for (int y = 0; y < size; y++)
     {
         for (int x = 0; x < size; x++)
@@ -783,39 +702,6 @@
     }
 }
 
-template<int size>
-int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
-{
-    static int16_t zeroBuf[8] /* = { 0 } */;
-
-    if (size)
-    {
-        int dim = 1 << (size + 2);
-        uint32_t totEnergy = 0;
-        for (int i = 0; i < dim; i += 8)
-        {
-            for (int j = 0; j < dim; j+= 8)
-            {
-                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
-                int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride) - 
-                                   (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
-                int reconEnergy =  sa8d_8x8(recon + i * rstride + j, rstride) - 
-                                   (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);
-
-                totEnergy += abs(sourceEnergy - reconEnergy);
-            }
-        }
-        return totEnergy;
-    }
-    else
-    {
-        /* 4x4 is too small for sa8d */
-        int sourceEnergy = satd_4x4(source, sstride) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
-        int reconEnergy = satd_4x4(recon, rstride) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
-        return abs(sourceEnergy - reconEnergy);
-    }
-}
-
 template<int bx, int by>
 void blockcopy_pp_c(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb)
 {
@@ -960,19 +846,57 @@
 /* Estimate the total amount of influence on future quality that could be had if we
  * were to improve the reference samples used to inter predict any given CU. */
 static void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
-                             const int32_t* invQscales, const double* fpsFactor, int len)
+                                    const int32_t* invQscales, const double* fpsFactor, int len)
 {
-    double fps = *fpsFactor / 256;
+    double fps = *fpsFactor / 256;  // range[0.01, 1.00]
 
     for (int i = 0; i < len; i++)
     {
-        double intraCost       = intraCosts[i] * invQscales[i];
-        double propagateAmount = (double)propagateIn[i] + intraCost * fps;
-        double propagateNum    = (double)intraCosts[i] - (interCosts[i] & ((1 << 14) - 1));
-        double propagateDenom  = (double)intraCosts[i];
+        int intraCost = intraCosts[i];
+        int interCost = X265_MIN(intraCosts[i], interCosts[i] & LOWRES_COST_MASK);
+        double propagateIntra  = intraCost * invQscales[i]; // Q16 x Q8.8 = Q24.8
+        double propagateAmount = (double)propagateIn[i] + propagateIntra * fps; // Q16.0 + Q24.8 x Q0.x = Q25.0
+        double propagateNum    = (double)(intraCost - interCost); // Q32 - Q32 = Q33.0
+
+#if 0
+        // algorithm that output match to asm
+        float intraRcp = (float)1.0f / intraCost;   // VC can't mapping this into RCPPS
+        float intraRcpError1 = (float)intraCost * (float)intraRcp;
+        intraRcpError1 *= (float)intraRcp;
+        float intraRcpError2 = intraRcp + intraRcp;
+        float propagateDenom = intraRcpError2 - intraRcpError1;
+        dst[i] = (int)(propagateAmount * propagateNum * (double)propagateDenom + 0.5);
+#else
+        double propagateDenom  = (double)intraCost;             // Q32

 
@@ -25,6 +25,7 @@
  *****************************************************************************/
 
 #include "common.h"
+#include "slicetype.h"      // LOWRES_COST_MASK
 #include "primitives.h"
 #include "x265.h"
 
@@ -117,9 +118,9 @@
 }
 
 template<int lx, int ly, class T1, class T2>
-sse_ret_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
+sse_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
 {
-    sse_ret_t sum = 0;
+    sse_t sum = 0;
     int tmp;
 
     for (int y = 0; y < ly; y++)
@@ -187,37 +188,6 @@
     return (int)(sum >> 1);
 }
 
-static int satd_4x4(const int16_t* pix1, intptr_t stride_pix1)
-{
-    int32_t tmp[4][4];
-    int32_t s01, s23, d01, d23;
-    int32_t satd = 0;
-    int d;
-
-    for (d = 0; d < 4; d++, pix1 += stride_pix1)
-    {
-        s01 = pix1[0] + pix1[1];
-        s23 = pix1[2] + pix1[3];
-        d01 = pix1[0] - pix1[1];
-        d23 = pix1[2] - pix1[3];
-
-        tmp[d][0] = s01 + s23;
-        tmp[d][1] = s01 - s23;
-        tmp[d][2] = d01 - d23;
-        tmp[d][3] = d01 + d23;
-    }
-
-    for (d = 0; d < 4; d++)
-    {
-        s01 = tmp[0][d] + tmp[1][d];
-        s23 = tmp[2][d] + tmp[3][d];
-        d01 = tmp[0][d] - tmp[1][d];
-        d23 = tmp[2][d] - tmp[3][d];
-        satd += abs(s01 + s23) + abs(s01 - s23) + abs(d01 - d23) + abs(d01 + d23);
-    }
-    return (int)(satd / 2);
-}
-
 // x264's SWAR version of satd 8x4, performs two 4x4 SATDs at once
 static int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
 {
@@ -313,57 +283,6 @@
     return (int)((_sa8d_8x8(pix1, i_pix1, pix2, i_pix2) + 2) >> 2);
 }
 
-inline int _sa8d_8x8(const int16_t* pix1, intptr_t i_pix1)
-{
-    int32_t tmp[8][8];
-    int32_t a0, a1, a2, a3, a4, a5, a6, a7;
-    int32_t sum = 0;
-
-    for (int i = 0; i < 8; i++, pix1 += i_pix1)
-    {
-        a0 = pix1[0] + pix1[1];
-        a1 = pix1[2] + pix1[3];
-        a2 = pix1[4] + pix1[5];
-        a3 = pix1[6] + pix1[7];
-        a4 = pix1[0] - pix1[1];
-        a5 = pix1[2] - pix1[3];
-        a6 = pix1[4] - pix1[5];
-        a7 = pix1[6] - pix1[7];
-        tmp[i][0] = (a0 + a1) + (a2 + a3);
-        tmp[i][1] = (a0 + a1) - (a2 + a3);
-        tmp[i][2] = (a0 - a1) + (a2 - a3);
-        tmp[i][3] = (a0 - a1) - (a2 - a3);
-        tmp[i][4] = (a4 + a5) + (a6 + a7);
-        tmp[i][5] = (a4 + a5) - (a6 + a7);
-        tmp[i][6] = (a4 - a5) + (a6 - a7);
-        tmp[i][7] = (a4 - a5) - (a6 - a7);
-    }
-
-    for (int i = 0; i < 8; i++)
-    {
-        a0 = (tmp[0][i] + tmp[1][i]) + (tmp[2][i] + tmp[3][i]);
-        a2 = (tmp[0][i] + tmp[1][i]) - (tmp[2][i] + tmp[3][i]);
-        a1 = (tmp[0][i] - tmp[1][i]) + (tmp[2][i] - tmp[3][i]);
-        a3 = (tmp[0][i] - tmp[1][i]) - (tmp[2][i] - tmp[3][i]);
-        a4 = (tmp[4][i] + tmp[5][i]) + (tmp[6][i] + tmp[7][i]);
-        a6 = (tmp[4][i] + tmp[5][i]) - (tmp[6][i] + tmp[7][i]);
-        a5 = (tmp[4][i] - tmp[5][i]) + (tmp[6][i] - tmp[7][i]);
-        a7 = (tmp[4][i] - tmp[5][i]) - (tmp[6][i] - tmp[7][i]);
-        a0 = abs(a0 + a4) + abs(a0 - a4);
-        a0 += abs(a1 + a5) + abs(a1 - a5);
-        a0 += abs(a2 + a6) + abs(a2 - a6);
-        a0 += abs(a3 + a7) + abs(a3 - a7);
-        sum += a0;
-    }
-
-    return (int)sum;
-}
-
-static int sa8d_8x8(const int16_t* pix1, intptr_t i_pix1)
-{
-    return (int)((_sa8d_8x8(pix1, i_pix1) + 2) >> 2);
-}
-
 static int sa8d_16x16(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
 {
     int sum = _sa8d_8x8(pix1, i_pix1, pix2, i_pix2)
@@ -403,9 +322,9 @@
 }
 
 template<int size>
-int pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
+sse_t pixel_ssd_s_c(const int16_t* a, intptr_t dstride)
 {
-    int sum = 0;
+    sse_t sum = 0;
     for (int y = 0; y < size; y++)
     {
         for (int x = 0; x < size; x++)
@@ -783,39 +702,6 @@
     }
 }
 
-template<int size>
-int psyCost_ss(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
-{
-    static int16_t zeroBuf[8] /* = { 0 } */;
-
-    if (size)
-    {
-        int dim = 1 << (size + 2);
-        uint32_t totEnergy = 0;
-        for (int i = 0; i < dim; i += 8)
-        {
-            for (int j = 0; j < dim; j+= 8)
-            {
-                /* AC energy, measured by sa8d (AC + DC) minus SAD (DC) */
-                int sourceEnergy = sa8d_8x8(source + i * sstride + j, sstride) - 
-                                   (sad<8, 8>(source + i * sstride + j, sstride, zeroBuf, 0) >> 2);
-                int reconEnergy =  sa8d_8x8(recon + i * rstride + j, rstride) - 
-                                   (sad<8, 8>(recon + i * rstride + j, rstride, zeroBuf, 0) >> 2);
-
-                totEnergy += abs(sourceEnergy - reconEnergy);
-            }
-        }
-        return totEnergy;
-    }
-    else
-    {
-        /* 4x4 is too small for sa8d */
-        int sourceEnergy = satd_4x4(source, sstride) - (sad<4, 4>(source, sstride, zeroBuf, 0) >> 2);
-        int reconEnergy = satd_4x4(recon, rstride) - (sad<4, 4>(recon, rstride, zeroBuf, 0) >> 2);
-        return abs(sourceEnergy - reconEnergy);
-    }
-}
-
 template<int bx, int by>
 void blockcopy_pp_c(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb)
 {
@@ -960,19 +846,57 @@
 /* Estimate the total amount of influence on future quality that could be had if we
  * were to improve the reference samples used to inter predict any given CU. */
 static void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
-                             const int32_t* invQscales, const double* fpsFactor, int len)
+                                    const int32_t* invQscales, const double* fpsFactor, int len)
 {
-    double fps = *fpsFactor / 256;
+    double fps = *fpsFactor / 256;  // range[0.01, 1.00]
 
     for (int i = 0; i < len; i++)
     {
-        double intraCost       = intraCosts[i] * invQscales[i];
-        double propagateAmount = (double)propagateIn[i] + intraCost * fps;
-        double propagateNum    = (double)intraCosts[i] - (interCosts[i] & ((1 << 14) - 1));
-        double propagateDenom  = (double)intraCosts[i];
+        int intraCost = intraCosts[i];
+        int interCost = X265_MIN(intraCosts[i], interCosts[i] & LOWRES_COST_MASK);
+        double propagateIntra  = intraCost * invQscales[i]; // Q16 x Q8.8 = Q24.8
+        double propagateAmount = (double)propagateIn[i] + propagateIntra * fps; // Q16.0 + Q24.8 x Q0.x = Q25.0
+        double propagateNum    = (double)(intraCost - interCost); // Q32 - Q32 = Q33.0
+
+#if 0
+        // algorithm that output match to asm
+        float intraRcp = (float)1.0f / intraCost;   // VC can't mapping this into RCPPS
+        float intraRcpError1 = (float)intraCost * (float)intraRcp;
+        intraRcpError1 *= (float)intraRcp;
+        float intraRcpError2 = intraRcp + intraRcp;
+        float propagateDenom = intraRcpError2 - intraRcpError1;
+        dst[i] = (int)(propagateAmount * propagateNum * (double)propagateDenom + 0.5);
+#else
+        double propagateDenom  = (double)intraCost;             // Q32
​

x265_1.8.tar.gz/source/common/predict.cpp -> x265_1.9.tar.gz/source/common/predict.cpp Changed

@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -98,7 +99,7 @@
 
         if (cu.m_slice->m_pps->bUseWeightPred && wp0->bPresentFlag)
         {
-            for (int plane = 0; plane < 3; plane++)
+            for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
             {
                 wv0[plane].w      = wp0[plane].inputWeight;
                 wv0[plane].offset = wp0[plane].inputOffset * (1 << (X265_DEPTH - 8));
@@ -109,18 +110,18 @@
             ShortYuv& shortYuv = m_predShortYuv[0];
 
             if (bLuma)
-                predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
             if (bChroma)
-                predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
 
             addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
         }
         else
         {
             if (bLuma)
-                predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
             if (bChroma)
-                predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
         }
     }
     else
@@ -141,7 +142,7 @@
             if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
             {
                 /* biprediction weighting */
-                for (int plane = 0; plane < 3; plane++)
+                for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
                 {
                     wv0[plane].w = pwp0[plane].inputWeight;
                     wv0[plane].o = pwp0[plane].inputOffset * (1 << (X265_DEPTH - 8));
@@ -158,7 +159,7 @@
             {
                 /* uniprediction weighting, always outputs to wv0 */
                 const WeightParam* pwp = (refIdx0 >= 0) ? pwp0 : pwp1;
-                for (int plane = 0; plane < 3; plane++)
+                for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
                 {
                     wv0[plane].w = pwp[plane].inputWeight;
                     wv0[plane].offset = pwp[plane].inputOffset * (1 << (X265_DEPTH - 8));
@@ -179,13 +180,13 @@
 
             if (bLuma)
             {
-                predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
-                predInterLumaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
+                predInterLumaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
             }
             if (bChroma)
             {
-                predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
-                predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
+                predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
             }
 
             if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
@@ -203,18 +204,18 @@
                 ShortYuv& shortYuv = m_predShortYuv[0];
 
                 if (bLuma)
-                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
                 if (bChroma)
-                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
 
                 addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
             }
             else
             {
                 if (bLuma)
-                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
                 if (bChroma)
-                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
             }
         }
         else
@@ -230,18 +231,18 @@
                 ShortYuv& shortYuv = m_predShortYuv[0];
 
                 if (bLuma)
-                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
                 if (bChroma)
-                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
 
                 addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
             }
             else
             {
                 if (bLuma)
-                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
                 if (bChroma)
-                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
             }
         }
     }
@@ -600,8 +601,9 @@
     int tuSize = 1 << intraNeighbors.log2TrSize;
     int tuSize2 = tuSize << 1;
 
-    pixel* adiOrigin = cu.m_encData->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
-    intptr_t picStride = cu.m_encData->m_reconPic->m_stride;
+    PicYuv* reconPic = cu.m_encData->m_reconPic;
+    pixel* adiOrigin = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
+    intptr_t picStride = reconPic->m_stride;
 
     fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
 
@@ -648,8 +650,9 @@
 
 void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId)
 {
-    const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
-    intptr_t picStride = cu.m_encData->m_reconPic->m_strideC;
+    PicYuv* reconPic = cu.m_encData->m_reconPic;
+    const pixel* adiOrigin = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
+    intptr_t picStride = reconPic->m_strideC;
 
     fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);

 
@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -98,7 +99,7 @@
 
         if (cu.m_slice->m_pps->bUseWeightPred && wp0->bPresentFlag)
         {
-            for (int plane = 0; plane < 3; plane++)
+            for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
             {
                 wv0[plane].w      = wp0[plane].inputWeight;
                 wv0[plane].offset = wp0[plane].inputOffset * (1 << (X265_DEPTH - 8));
@@ -109,18 +110,18 @@
             ShortYuv& shortYuv = m_predShortYuv[0];
 
             if (bLuma)
-                predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
             if (bChroma)
-                predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
 
             addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
         }
         else
         {
             if (bLuma)
-                predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
             if (bChroma)
-                predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
         }
     }
     else
@@ -141,7 +142,7 @@
             if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
             {
                 /* biprediction weighting */
-                for (int plane = 0; plane < 3; plane++)
+                for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
                 {
                     wv0[plane].w = pwp0[plane].inputWeight;
                     wv0[plane].o = pwp0[plane].inputOffset * (1 << (X265_DEPTH - 8));
@@ -158,7 +159,7 @@
             {
                 /* uniprediction weighting, always outputs to wv0 */
                 const WeightParam* pwp = (refIdx0 >= 0) ? pwp0 : pwp1;
-                for (int plane = 0; plane < 3; plane++)
+                for (int plane = 0; plane < (bChroma ? 3 : 1); plane++)
                 {
                     wv0[plane].w = pwp[plane].inputWeight;
                     wv0[plane].offset = pwp[plane].inputOffset * (1 << (X265_DEPTH - 8));
@@ -179,13 +180,13 @@
 
             if (bLuma)
             {
-                predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
-                predInterLumaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
+                predInterLumaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
             }
             if (bChroma)
             {
-                predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
-                predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
+                predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
             }
 
             if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
@@ -203,18 +204,18 @@
                 ShortYuv& shortYuv = m_predShortYuv[0];
 
                 if (bLuma)
-                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
                 if (bChroma)
-                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
 
                 addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
             }
             else
             {
                 if (bLuma)
-                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
                 if (bChroma)
-                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);
             }
         }
         else
@@ -230,18 +231,18 @@
                 ShortYuv& shortYuv = m_predShortYuv[0];
 
                 if (bLuma)
-                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
                 if (bChroma)
-                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
 
                 addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
             }
             else
             {
                 if (bLuma)
-                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
                 if (bChroma)
-                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
+                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);
             }
         }
     }
@@ -600,8 +601,9 @@
     int tuSize = 1 << intraNeighbors.log2TrSize;
     int tuSize2 = tuSize << 1;
 
-    pixel* adiOrigin = cu.m_encData->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
-    intptr_t picStride = cu.m_encData->m_reconPic->m_stride;
+    PicYuv* reconPic = cu.m_encData->m_reconPic;
+    pixel* adiOrigin = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
+    intptr_t picStride = reconPic->m_stride;
 
     fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
 
@@ -648,8 +650,9 @@
 
 void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId)
 {
-    const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
-    intptr_t picStride = cu.m_encData->m_reconPic->m_strideC;
+    PicYuv* reconPic = cu.m_encData->m_reconPic;
+    const pixel* adiOrigin = reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
+    intptr_t picStride = reconPic->m_strideC;
 
     fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
 
​

x265_1.8.tar.gz/source/common/predict.h -> x265_1.9.tar.gz/source/common/predict.h Changed

 
@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
​

x265_1.8.tar.gz/source/common/primitives.h -> x265_1.9.tar.gz/source/common/primitives.h Changed

@@ -112,9 +112,9 @@
 
 typedef int  (*pixelcmp_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
 typedef int  (*pixelcmp_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
-typedef sse_ret_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
-typedef sse_ret_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
-typedef int  (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
+typedef sse_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
+typedef sse_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
+typedef sse_t (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
 typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
 typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
 typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val);
@@ -176,15 +176,16 @@
 typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
 typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 
-typedef void (*saoCuStatsBO_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE0_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsBO_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE0_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE1_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE2_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE3_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 
 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+typedef pixel (*planeClipAndMax_t)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
 
 typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
 
@@ -195,6 +196,8 @@
 typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero, int idx);
 typedef uint32_t (*costC1C2Flag_t)(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset);
 
+typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
+
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
 struct EncoderPrimitives
@@ -259,7 +262,6 @@
         pixel_sse_t     sse_pp;        // Sum of Square Error (pixel, pixel) fenc alignment not assumed
         pixel_sse_ss_t  sse_ss;        // Sum of Square Error (short, short) fenc alignment not assumed
         pixelcmp_t      psy_cost_pp;   // difference in AC energy between two pixel blocks
-        pixelcmp_ss_t   psy_cost_ss;   // difference in AC energy between two signed residual blocks
         pixel_ssd_s_t   ssd_s;         // Sum of Square Error (residual coeff to self)
         pixelcmp_t      sa8d;          // Sum of Transformed Differences (8x8 Hadamard), uses satd for 4x4 intra TU
 
@@ -316,6 +318,7 @@
     planecopy_cp_t        planecopy_cp;
     planecopy_sp_t        planecopy_sp;
     planecopy_sp_t        planecopy_sp_shl;
+    planeClipAndMax_t     planeClipAndMax;
 
     weightp_sp_t          weight_sp;
     weightp_pp_t          weight_pp;
@@ -328,6 +331,7 @@
     costCoeffRemain_t     costCoeffRemain;
     costC1C2Flag_t        costC1C2Flag;
 
+    pelFilterLumaStrong_t pelFilterLumaStrong[2]; // EDGE_VER = 0, EDGE_HOR = 1
 
     /* There is one set of chroma primitives per color space. An encoder will
      * have just a single color space and thus it will only ever use one entry

 
@@ -112,9 +112,9 @@
 
 typedef int  (*pixelcmp_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
 typedef int  (*pixelcmp_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
-typedef sse_ret_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
-typedef sse_ret_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
-typedef int  (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
+typedef sse_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
+typedef sse_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
+typedef sse_t (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
 typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
 typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
 typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val);
@@ -176,15 +176,16 @@
 typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
 typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 
-typedef void (*saoCuStatsBO_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE0_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsBO_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE0_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE1_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE2_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE3_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 
 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+typedef pixel (*planeClipAndMax_t)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
 
 typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
 
@@ -195,6 +196,8 @@
 typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero, int idx);
 typedef uint32_t (*costC1C2Flag_t)(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset);
 
+typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
+
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
 struct EncoderPrimitives
@@ -259,7 +262,6 @@
         pixel_sse_t     sse_pp;        // Sum of Square Error (pixel, pixel) fenc alignment not assumed
         pixel_sse_ss_t  sse_ss;        // Sum of Square Error (short, short) fenc alignment not assumed
         pixelcmp_t      psy_cost_pp;   // difference in AC energy between two pixel blocks
-        pixelcmp_ss_t   psy_cost_ss;   // difference in AC energy between two signed residual blocks
         pixel_ssd_s_t   ssd_s;         // Sum of Square Error (residual coeff to self)
         pixelcmp_t      sa8d;          // Sum of Transformed Differences (8x8 Hadamard), uses satd for 4x4 intra TU
 
@@ -316,6 +318,7 @@
     planecopy_cp_t        planecopy_cp;
     planecopy_sp_t        planecopy_sp;
     planecopy_sp_t        planecopy_sp_shl;
+    planeClipAndMax_t     planeClipAndMax;
 
     weightp_sp_t          weight_sp;
     weightp_pp_t          weight_pp;
@@ -328,6 +331,7 @@
     costCoeffRemain_t     costCoeffRemain;
     costC1C2Flag_t        costC1C2Flag;
 
+    pelFilterLumaStrong_t pelFilterLumaStrong[2]; // EDGE_VER = 0, EDGE_HOR = 1
 
     /* There is one set of chroma primitives per color space. An encoder will
      * have just a single color space and thus it will only ever use one entry
​

x265_1.8.tar.gz/source/common/quant.cpp -> x265_1.9.tar.gz/source/common/quant.cpp Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2015 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -50,9 +51,8 @@
     return y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // min(x, y)
 }
 
-inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, const uint32_t absGoRice, const uint32_t maxVlc, uint32_t c1c2Idx)
+inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, const uint32_t absGoRice, const uint32_t maxVlc, const uint32_t c1c2Rate)
 {
-    X265_CHECK(c1c2Idx <= 3, "c1c2Idx check failure\n");
     X265_CHECK(absGoRice <= 4, "absGoRice check failure\n");
     if (!absLevel)
     {
@@ -94,12 +94,7 @@
         uint32_t numBins = fastMin(prefLen + absGoRice, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */);
 
         rate += numBins << 15;
-
-        if (c1c2Idx & 1)
-            rate += greaterOneBits[1];
-
-        if (c1c2Idx == 3)
-            rate += levelAbsBits[1];
+        rate += c1c2Rate;
     }
     return rate;
 }
@@ -140,7 +135,7 @@
 }
 
 /* Calculates the cost for specific absolute transform level */
-inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
+inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, const uint32_t c1c2Rate)
 {
     X265_CHECK(absLevel, "absLevel should not be zero\n");
 
@@ -175,16 +170,15 @@
 
             rate = (COEF_REMAIN_BIN_REDUCTION + length + absGoRice + 1 + length) << 15;
         }
-        if (c1c2Idx & 1)
-            rate += greaterOneBits[1];
-        if (c1c2Idx == 3)
-            rate += levelAbsBits[1];
+        rate += c1c2Rate;
         return rate;
     }
 }
 
 }
 
+Quant::rdoQuant_t Quant::rdoQuant_func[NUM_CU_DEPTH] = {&Quant::rdoQuant<2>, &Quant::rdoQuant<3>, &Quant::rdoQuant<4>, &Quant::rdoQuant<5>};
+
 Quant::Quant()
 {
     m_resiDctCoeff = NULL;
@@ -229,8 +223,11 @@
 {
     m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL;
     m_qpParam[TEXT_LUMA].setQpParam(qp + QP_BD_OFFSET);
-    setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
-    setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat);
+    if (ctu.m_chromaFormat != X265_CSP_I400)
+    {
+        setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
+        setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat);
+    }
 }
 
 void Quant::setChromaQP(int qpin, TextType ttype, int chFmt)
@@ -444,18 +441,18 @@
             primitives.cu[sizeIdx].dct(m_fencShortBuf, m_fencDctCoeff, trSize);
         }
 
-        if (m_nr)
+        if (m_nr && m_nr->offset)
         {
             /* denoise is not applied to intra residual, so DST can be ignored */
             int cat = sizeIdx + 4 * !isLuma + 8 * !isIntra;
             int numCoeff = 1 << (log2TrSize * 2);
-            primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff);
+            primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offset[cat], numCoeff);
             m_nr->count[cat]++;
         }
     }
 
     if (m_rdoqLevel)
-        return rdoQuant(cu, coeff, log2TrSize, ttype, absPartIdx, usePsy);
+        return (this->*rdoQuant_func[log2TrSize - 2])(cu, coeff, ttype, absPartIdx, usePsy);
     else
     {
         int deltaU[32 * 32];
@@ -550,9 +547,10 @@
 
 /* Rate distortion optimized quantization for entropy coding engines using
  * probability models like CABAC */
-uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy)
+template<uint32_t log2TrSize>
+uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy)
 {
-    int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
     int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
     const uint32_t usePsyMask = usePsy ? -1 : 0;
 
@@ -564,13 +562,13 @@
     int add = (1 << (qbits - 1));
     const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
 
-    int numCoeff = 1 << (log2TrSize * 2);
+    const int numCoeff = 1 << (log2TrSize * 2);
     uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff);
     X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n");
     if (!numSig)
         return 0;
 
-    uint32_t trSize = 1 << log2TrSize;
+    const uint32_t trSize = 1 << log2TrSize;
     int64_t lambda2 = m_qpParam[ttype].lambda2;
     const int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);
 
@@ -580,20 +578,20 @@
     const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
     int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0);
     int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0;
-    int scaleBits = SCALE_BITS - 2 * transformShift;
+    const int scaleBits = SCALE_BITS - 2 * transformShift;
 
 #define UNQUANT(lvl)    (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift)
 #define SIGCOST(bits)   ((lambda2 * (bits)) >> 8)
 #define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits))
 #define PSYVALUE(rec)   ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1)))
 
-    int64_t costCoeff[32 * 32];   /* d*d + lambda * bits */
-    int64_t costUncoded[32 * 32]; /* d*d + lambda * 0    */
-    int64_t costSig[32 * 32];     /* lambda * bits       */
+    int64_t costCoeff[trSize * trSize];   /* d*d + lambda * bits */
+    int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0    */
+    int64_t costSig[trSize * trSize];     /* lambda * bits       */
 
-    int rateIncUp[32 * 32];      /* signal overhead of increasing level */
-    int rateIncDown[32 * 32];    /* signal overhead of decreasing level */
-    int sigRateDelta[32 * 32];   /* signal difference between zero and non-zero */
+    int rateIncUp[trSize * trSize];      /* signal overhead of increasing level */
+    int rateIncDown[trSize * trSize];    /* signal overhead of decreasing level */
+    int sigRateDelta[trSize * trSize];   /* signal difference between zero and non-zero */
 
     int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */
     uint64_t sigCoeffGroupFlag64 = 0;
@@ -611,7 +609,8 @@
 
     TUEntropyCodingParameters codeParams;
     cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma);
-    const uint32_t cgNum = 1 << (codeParams.log2TrSizeCG * 2);
+    const uint32_t log2TrSizeCG = log2TrSize - 2;
+    const uint32_t cgNum = 1 << (log2TrSizeCG * 2);
     const uint32_t cgStride = (trSize >> MLS_CG_LOG2_SIZE);
 
     uint8_t coeffNum[MLS_GRP_NUM];      // value range[0, 16]
@@ -742,8 +741,8 @@
     {
         uint32_t ctxSet = (cgScanPos && bIsLuma) ? 2 : 0;
         const uint32_t cgBlkPos = codeParams.scanCG[cgScanPos];
-        const uint32_t cgPosY   = cgBlkPos >> codeParams.log2TrSizeCG;
-        const uint32_t cgPosX   = cgBlkPos - (cgPosY << codeParams.log2TrSizeCG);
+        const uint32_t cgPosY   = cgBlkPos >> log2TrSizeCG;
+        const uint32_t cgPosX   = cgBlkPos & ((1 << log2TrSizeCG) - 1);
         const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos);
         const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
         const int ctxSigOffset = codeParams.firstSignificanceMapContext + (cgScanPos && bIsLuma ? 3 : 0);
@@ -829,6 +828,7 @@
         uint32_t subFlagMask = coeffFlag[cgScanPos];
         int    c2            = 0;
         uint32_t goRiceParam = 0;
+        uint32_t levelThreshold = 3;
         uint32_t c1Idx       = 0;
         uint32_t c2Idx       = 0;
         /* iterate over coefficients in each group in reverse scan order */
@@ -836,7 +836,7 @@
         {
             scanPos              = (cgScanPos << MLS_CG_SIZE) + scanPosinCG;
             uint32_t blkPos      = codeParams.scan[scanPos];
-            uint32_t maxAbsLevel = abs(dstCoeff[blkPos]);             /* abs(quantized coeff) */
+            uint32_t maxAbsLevel = dstCoeff[blkPos];                  /* abs(quantized coeff) */
             int signCoef         = m_resiDctCoeff[blkPos];            /* pre-quantization DCT coeff */
             int predictedCoef    = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/
 
@@ -855,7 +855,11 @@
 
             // coefficient level estimation
             const int* greaterOneBits = estBitsSbac.greaterOneBits[4 * ctxSet + c1];
-            const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset;
+            //const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset;
+            static const uint64_t table_cnt64[4] = {0x0000000100110112ULL, 0x0000000011112222ULL, 0x0012001200120012ULL, 0x2222222222222222ULL};

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2015 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -50,9 +51,8 @@
     return y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // min(x, y)
 }
 
-inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, const uint32_t absGoRice, const uint32_t maxVlc, uint32_t c1c2Idx)
+inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, const uint32_t absGoRice, const uint32_t maxVlc, const uint32_t c1c2Rate)
 {
-    X265_CHECK(c1c2Idx <= 3, "c1c2Idx check failure\n");
     X265_CHECK(absGoRice <= 4, "absGoRice check failure\n");
     if (!absLevel)
     {
@@ -94,12 +94,7 @@
         uint32_t numBins = fastMin(prefLen + absGoRice, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */);
 
         rate += numBins << 15;
-
-        if (c1c2Idx & 1)
-            rate += greaterOneBits[1];
-
-        if (c1c2Idx == 3)
-            rate += levelAbsBits[1];
+        rate += c1c2Rate;
     }
     return rate;
 }
@@ -140,7 +135,7 @@
 }
 
 /* Calculates the cost for specific absolute transform level */
-inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx)
+inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, const uint32_t c1c2Rate)
 {
     X265_CHECK(absLevel, "absLevel should not be zero\n");
 
@@ -175,16 +170,15 @@
 
             rate = (COEF_REMAIN_BIN_REDUCTION + length + absGoRice + 1 + length) << 15;
         }
-        if (c1c2Idx & 1)
-            rate += greaterOneBits[1];
-        if (c1c2Idx == 3)
-            rate += levelAbsBits[1];
+        rate += c1c2Rate;
         return rate;
     }
 }
 
 }
 
+Quant::rdoQuant_t Quant::rdoQuant_func[NUM_CU_DEPTH] = {&Quant::rdoQuant<2>, &Quant::rdoQuant<3>, &Quant::rdoQuant<4>, &Quant::rdoQuant<5>};
+
 Quant::Quant()
 {
     m_resiDctCoeff = NULL;
@@ -229,8 +223,11 @@
 {
     m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL;
     m_qpParam[TEXT_LUMA].setQpParam(qp + QP_BD_OFFSET);
-    setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
-    setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat);
+    if (ctu.m_chromaFormat != X265_CSP_I400)
+    {
+        setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
+        setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[1], TEXT_CHROMA_V, ctu.m_chromaFormat);
+    }
 }
 
 void Quant::setChromaQP(int qpin, TextType ttype, int chFmt)
@@ -444,18 +441,18 @@
             primitives.cu[sizeIdx].dct(m_fencShortBuf, m_fencDctCoeff, trSize);
         }
 
-        if (m_nr)
+        if (m_nr && m_nr->offset)
         {
             /* denoise is not applied to intra residual, so DST can be ignored */
             int cat = sizeIdx + 4 * !isLuma + 8 * !isIntra;
             int numCoeff = 1 << (log2TrSize * 2);
-            primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offsetDenoise[cat], numCoeff);
+            primitives.denoiseDct(m_resiDctCoeff, m_nr->residualSum[cat], m_nr->offset[cat], numCoeff);
             m_nr->count[cat]++;
         }
     }
 
     if (m_rdoqLevel)
-        return rdoQuant(cu, coeff, log2TrSize, ttype, absPartIdx, usePsy);
+        return (this->*rdoQuant_func[log2TrSize - 2])(cu, coeff, ttype, absPartIdx, usePsy);
     else
     {
         int deltaU[32 * 32];
@@ -550,9 +547,10 @@
 
 /* Rate distortion optimized quantization for entropy coding engines using
  * probability models like CABAC */
-uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy)
+template<uint32_t log2TrSize>
+uint32_t Quant::rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy)
 {
-    int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
+    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
     int scalingListType = (cu.isIntra(absPartIdx) ? 0 : 3) + ttype;
     const uint32_t usePsyMask = usePsy ? -1 : 0;
 
@@ -564,13 +562,13 @@
     int add = (1 << (qbits - 1));
     const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem];
 
-    int numCoeff = 1 << (log2TrSize * 2);
+    const int numCoeff = 1 << (log2TrSize * 2);
     uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff);
     X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n");
     if (!numSig)
         return 0;
 
-    uint32_t trSize = 1 << log2TrSize;
+    const uint32_t trSize = 1 << log2TrSize;
     int64_t lambda2 = m_qpParam[ttype].lambda2;
     const int64_t psyScale = ((int64_t)m_psyRdoqScale * m_qpParam[ttype].lambda);
 
@@ -580,20 +578,20 @@
     const int32_t* unquantScale = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
     int unquantShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - transformShift + (m_scalingList->m_bEnabled ? 4 : 0);
     int unquantRound = (unquantShift > per) ? 1 << (unquantShift - per - 1) : 0;
-    int scaleBits = SCALE_BITS - 2 * transformShift;
+    const int scaleBits = SCALE_BITS - 2 * transformShift;
 
 #define UNQUANT(lvl)    (((lvl) * (unquantScale[blkPos] << per) + unquantRound) >> unquantShift)
 #define SIGCOST(bits)   ((lambda2 * (bits)) >> 8)
 #define RDCOST(d, bits) ((((int64_t)d * d) << scaleBits) + SIGCOST(bits))
 #define PSYVALUE(rec)   ((psyScale * (rec)) >> X265_MAX(0, (2 * transformShift + 1)))
 
-    int64_t costCoeff[32 * 32];   /* d*d + lambda * bits */
-    int64_t costUncoded[32 * 32]; /* d*d + lambda * 0    */
-    int64_t costSig[32 * 32];     /* lambda * bits       */
+    int64_t costCoeff[trSize * trSize];   /* d*d + lambda * bits */
+    int64_t costUncoded[trSize * trSize]; /* d*d + lambda * 0    */
+    int64_t costSig[trSize * trSize];     /* lambda * bits       */
 
-    int rateIncUp[32 * 32];      /* signal overhead of increasing level */
-    int rateIncDown[32 * 32];    /* signal overhead of decreasing level */
-    int sigRateDelta[32 * 32];   /* signal difference between zero and non-zero */
+    int rateIncUp[trSize * trSize];      /* signal overhead of increasing level */
+    int rateIncDown[trSize * trSize];    /* signal overhead of decreasing level */
+    int sigRateDelta[trSize * trSize];   /* signal difference between zero and non-zero */
 
     int64_t costCoeffGroupSig[MLS_GRP_NUM]; /* lambda * bits of group coding cost */
     uint64_t sigCoeffGroupFlag64 = 0;
@@ -611,7 +609,8 @@
 
     TUEntropyCodingParameters codeParams;
     cu.getTUEntropyCodingParameters(codeParams, absPartIdx, log2TrSize, bIsLuma);
-    const uint32_t cgNum = 1 << (codeParams.log2TrSizeCG * 2);
+    const uint32_t log2TrSizeCG = log2TrSize - 2;
+    const uint32_t cgNum = 1 << (log2TrSizeCG * 2);
     const uint32_t cgStride = (trSize >> MLS_CG_LOG2_SIZE);
 
     uint8_t coeffNum[MLS_GRP_NUM];      // value range[0, 16]
@@ -742,8 +741,8 @@
     {
         uint32_t ctxSet = (cgScanPos && bIsLuma) ? 2 : 0;
         const uint32_t cgBlkPos = codeParams.scanCG[cgScanPos];
-        const uint32_t cgPosY   = cgBlkPos >> codeParams.log2TrSizeCG;
-        const uint32_t cgPosX   = cgBlkPos - (cgPosY << codeParams.log2TrSizeCG);
+        const uint32_t cgPosY   = cgBlkPos >> log2TrSizeCG;
+        const uint32_t cgPosX   = cgBlkPos & ((1 << log2TrSizeCG) - 1);
         const uint64_t cgBlkPosMask = ((uint64_t)1 << cgBlkPos);
         const int patternSigCtx = calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, cgBlkPos, cgStride);
         const int ctxSigOffset = codeParams.firstSignificanceMapContext + (cgScanPos && bIsLuma ? 3 : 0);
@@ -829,6 +828,7 @@
         uint32_t subFlagMask = coeffFlag[cgScanPos];
         int    c2            = 0;
         uint32_t goRiceParam = 0;
+        uint32_t levelThreshold = 3;
         uint32_t c1Idx       = 0;
         uint32_t c2Idx       = 0;
         /* iterate over coefficients in each group in reverse scan order */
@@ -836,7 +836,7 @@
         {
             scanPos              = (cgScanPos << MLS_CG_SIZE) + scanPosinCG;
             uint32_t blkPos      = codeParams.scan[scanPos];
-            uint32_t maxAbsLevel = abs(dstCoeff[blkPos]);             /* abs(quantized coeff) */
+            uint32_t maxAbsLevel = dstCoeff[blkPos];                  /* abs(quantized coeff) */
             int signCoef         = m_resiDctCoeff[blkPos];            /* pre-quantization DCT coeff */
             int predictedCoef    = m_fencDctCoeff[blkPos] - signCoef; /* predicted DCT = source DCT - residual DCT*/
 
@@ -855,7 +855,11 @@
 
             // coefficient level estimation
             const int* greaterOneBits = estBitsSbac.greaterOneBits[4 * ctxSet + c1];
-            const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset;
+            //const uint32_t ctxSig = (blkPos == 0) ? 0 : table_cnt[(trSize == 4) ? 4 : patternSigCtx][g_scan4x4[codeParams.scanType][scanPosinCG]] + ctxSigOffset;
+            static const uint64_t table_cnt64[4] = {0x0000000100110112ULL, 0x0000000011112222ULL, 0x0012001200120012ULL, 0x2222222222222222ULL};
​

x265_1.8.tar.gz/source/common/quant.h -> x265_1.9.tar.gz/source/common/quant.h Changed

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2015 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -59,18 +60,18 @@
     }
 };
 
-#define MAX_NUM_TR_COEFFS        MAX_TR_SIZE * MAX_TR_SIZE /* Maximum number of transform coefficients, for a 32x32 transform */
-#define MAX_NUM_TR_CATEGORIES    16                        /* 32, 16, 8, 4 transform categories each for luma and chroma */
-
 // NOTE: MUST be 16-byte aligned for asm code
 struct NoiseReduction
 {
     /* 0 = luma 4x4,   1 = luma 8x8,   2 = luma 16x16,   3 = luma 32x32
      * 4 = chroma 4x4, 5 = chroma 8x8, 6 = chroma 16x16, 7 = chroma 32x32
      * Intra 0..7 - Inter 8..15 */
-    ALIGN_VAR_16(uint32_t, residualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]);
-    uint32_t count[MAX_NUM_TR_CATEGORIES];
-    uint16_t offsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
+    ALIGN_VAR_16(uint32_t, nrResidualSum[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS]);
+    uint32_t nrCount[MAX_NUM_TR_CATEGORIES];
+    uint16_t nrOffsetDenoise[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
+    uint16_t (*offset)[MAX_NUM_TR_COEFFS];
+    uint32_t (*residualSum)[MAX_NUM_TR_COEFFS];
+    uint32_t *count;
 };
 
 class Quant
@@ -125,8 +126,8 @@
         const uint32_t sigPos = (uint32_t)(sigCoeffGroupFlag64 >> (cgBlkPos + 1)); // just need lowest 7-bits valid
 
         // TODO: instruction BT is faster, but _bittest64 still generate instruction 'BT m, r' in VS2012
-        const uint32_t sigRight = ((uint32_t)(cgPosX - (trSizeCG - 1)) >> 31) & sigPos;
-        const uint32_t sigLower = ((uint32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 1));
+        const uint32_t sigRight = (cgPosX != (trSizeCG - 1)) & sigPos;
+        const uint32_t sigLower = (cgPosY != (trSizeCG - 1)) & (sigPos >> (trSizeCG - 1));
         return sigRight + sigLower * 2;
     }
 
@@ -136,8 +137,8 @@
         X265_CHECK(cgBlkPos < 64, "cgBlkPos is too large\n");
         // NOTE: unsafe shift operator, see NOTE in calcPatternSigCtx
         const uint32_t sigPos = (uint32_t)(cgGroupMask >> (cgBlkPos + 1)); // just need lowest 8-bits valid
-        const uint32_t sigRight = ((uint32_t)(cgPosX - (trSizeCG - 1)) >> 31) & sigPos;
-        const uint32_t sigLower = ((uint32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 1));
+        const uint32_t sigRight = (cgPosX != (trSizeCG - 1)) & sigPos;
+        const uint32_t sigLower = (cgPosY != (trSizeCG - 1)) & (sigPos >> (trSizeCG - 1));
 
         return (sigRight | sigLower);
     }
@@ -151,7 +152,14 @@
 
     uint32_t signBitHidingHDQ(int16_t* qcoeff, int32_t* deltaU, uint32_t numSig, const TUEntropyCodingParameters &codingParameters, uint32_t log2TrSize);
 
-    uint32_t rdoQuant(const CUData& cu, int16_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx, bool usePsy);
+    template<uint32_t log2TrSize>
+    uint32_t rdoQuant(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy);
+
+public:
+    typedef uint32_t (Quant::*rdoQuant_t)(const CUData& cu, int16_t* dstCoeff, TextType ttype, uint32_t absPartIdx, bool usePsy);
+
+private:
+    static rdoQuant_t rdoQuant_func[NUM_CU_DEPTH];
 };
 }
 
​

x265_1.8.tar.gz/source/common/shortyuv.cpp -> x265_1.9.tar.gz/source/common/shortyuv.cpp Changed

@@ -40,19 +40,26 @@
 bool ShortYuv::create(uint32_t size, int csp)
 {
     m_csp = csp;
+    m_size = size;
     m_hChromaShift = CHROMA_H_SHIFT(csp);
     m_vChromaShift = CHROMA_V_SHIFT(csp);
-
-    m_size = size;
-    m_csize = size >> m_hChromaShift;
-
     size_t sizeL = size * size;
-    size_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
-    X265_CHECK((sizeC & 15) == 0, "invalid size");
 
-    CHECKED_MALLOC(m_buf[0], int16_t, sizeL + sizeC * 2);
-    m_buf[1] = m_buf[0] + sizeL;
-    m_buf[2] = m_buf[0] + sizeL + sizeC;
+    if (csp != X265_CSP_I400)
+    {
+        m_csize = size >> m_hChromaShift;
+        size_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
+        X265_CHECK((sizeC & 15) == 0, "invalid size");
+
+        CHECKED_MALLOC(m_buf[0], int16_t, sizeL + sizeC * 2);
+        m_buf[1] = m_buf[0] + sizeL;
+        m_buf[2] = m_buf[0] + sizeL + sizeC;
+    }
+    else
+    {
+        CHECKED_MALLOC(m_buf[0], int16_t, sizeL);
+        m_buf[1] = m_buf[2] = NULL;
+    }
     return true;
 
 fail:
@@ -75,8 +82,11 @@
 {
     const int sizeIdx = log2Size - 2;
     primitives.cu[sizeIdx].sub_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
-    primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
-    primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
+    if (m_csp != X265_CSP_I400)
+    {
+        primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
+        primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
+    }
 }
 
 void ShortYuv::copyPartToPartLuma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const

 
@@ -40,19 +40,26 @@
 bool ShortYuv::create(uint32_t size, int csp)
 {
     m_csp = csp;
+    m_size = size;
     m_hChromaShift = CHROMA_H_SHIFT(csp);
     m_vChromaShift = CHROMA_V_SHIFT(csp);
-
-    m_size = size;
-    m_csize = size >> m_hChromaShift;
-
     size_t sizeL = size * size;
-    size_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
-    X265_CHECK((sizeC & 15) == 0, "invalid size");
 
-    CHECKED_MALLOC(m_buf[0], int16_t, sizeL + sizeC * 2);
-    m_buf[1] = m_buf[0] + sizeL;
-    m_buf[2] = m_buf[0] + sizeL + sizeC;
+    if (csp != X265_CSP_I400)
+    {
+        m_csize = size >> m_hChromaShift;
+        size_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
+        X265_CHECK((sizeC & 15) == 0, "invalid size");
+
+        CHECKED_MALLOC(m_buf[0], int16_t, sizeL + sizeC * 2);
+        m_buf[1] = m_buf[0] + sizeL;
+        m_buf[2] = m_buf[0] + sizeL + sizeC;
+    }
+    else
+    {
+        CHECKED_MALLOC(m_buf[0], int16_t, sizeL);
+        m_buf[1] = m_buf[2] = NULL;
+    }
     return true;
 
 fail:
@@ -75,8 +82,11 @@
 {
     const int sizeIdx = log2Size - 2;
     primitives.cu[sizeIdx].sub_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
-    primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
-    primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
+    if (m_csp != X265_CSP_I400)
+    {
+        primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
+        primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
+    }
 }
 
 void ShortYuv::copyPartToPartLuma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const
​

x265_1.8.tar.gz/source/common/slice.cpp -> x265_1.9.tar.gz/source/common/slice.cpp Changed

@@ -33,7 +33,9 @@
 {
     if (m_sliceType == I_SLICE)
     {
-        memset(m_refPicList, 0, sizeof(m_refPicList));
+        memset(m_refFrameList, 0, sizeof(m_refFrameList));
+        memset(m_refReconPicList, 0, sizeof(m_refReconPicList));
+        memset(m_refPOCList, 0, sizeof(m_refPOCList));
         m_numRefIdx[1] = m_numRefIdx[0] = 0;
         return;
     }
@@ -106,13 +108,13 @@
     {
         cIdx = rIdx % numPocTotalCurr;
         X265_CHECK(cIdx >= 0 && cIdx < numPocTotalCurr, "RPS index check fail\n");
-        m_refPicList[0][rIdx] = rpsCurrList0[cIdx];
+        m_refFrameList[0][rIdx] = rpsCurrList0[cIdx];
     }
 
     if (m_sliceType != B_SLICE)
     {
         m_numRefIdx[1] = 0;
-        memset(m_refPicList[1], 0, sizeof(m_refPicList[1]));
+        memset(m_refFrameList[1], 0, sizeof(m_refFrameList[1]));
     }
     else
     {
@@ -120,13 +122,13 @@
         {
             cIdx = rIdx % numPocTotalCurr;
             X265_CHECK(cIdx >= 0 && cIdx < numPocTotalCurr, "RPS index check fail\n");
-            m_refPicList[1][rIdx] = rpsCurrList1[cIdx];
+            m_refFrameList[1][rIdx] = rpsCurrList1[cIdx];
         }
     }
 
     for (int dir = 0; dir < 2; dir++)
         for (int numRefIdx = 0; numRefIdx < m_numRefIdx[dir]; numRefIdx++)
-            m_refPOCList[dir][numRefIdx] = m_refPicList[dir][numRefIdx]->m_poc;
+            m_refPOCList[dir][numRefIdx] = m_refFrameList[dir][numRefIdx]->m_poc;
 }
 
 void Slice::disableWeights()

 
@@ -33,7 +33,9 @@
 {
     if (m_sliceType == I_SLICE)
     {
-        memset(m_refPicList, 0, sizeof(m_refPicList));
+        memset(m_refFrameList, 0, sizeof(m_refFrameList));
+        memset(m_refReconPicList, 0, sizeof(m_refReconPicList));
+        memset(m_refPOCList, 0, sizeof(m_refPOCList));
         m_numRefIdx[1] = m_numRefIdx[0] = 0;
         return;
     }
@@ -106,13 +108,13 @@
     {
         cIdx = rIdx % numPocTotalCurr;
         X265_CHECK(cIdx >= 0 && cIdx < numPocTotalCurr, "RPS index check fail\n");
-        m_refPicList[0][rIdx] = rpsCurrList0[cIdx];
+        m_refFrameList[0][rIdx] = rpsCurrList0[cIdx];
     }
 
     if (m_sliceType != B_SLICE)
     {
         m_numRefIdx[1] = 0;
-        memset(m_refPicList[1], 0, sizeof(m_refPicList[1]));
+        memset(m_refFrameList[1], 0, sizeof(m_refFrameList[1]));
     }
     else
     {
@@ -120,13 +122,13 @@
         {
             cIdx = rIdx % numPocTotalCurr;
             X265_CHECK(cIdx >= 0 && cIdx < numPocTotalCurr, "RPS index check fail\n");
-            m_refPicList[1][rIdx] = rpsCurrList1[cIdx];
+            m_refFrameList[1][rIdx] = rpsCurrList1[cIdx];
         }
     }
 
     for (int dir = 0; dir < 2; dir++)
         for (int numRefIdx = 0; numRefIdx < m_numRefIdx[dir]; numRefIdx++)
-            m_refPOCList[dir][numRefIdx] = m_refPicList[dir][numRefIdx]->m_poc;
+            m_refPOCList[dir][numRefIdx] = m_refFrameList[dir][numRefIdx]->m_poc;
 }
 
 void Slice::disableWeights()
​

x265_1.8.tar.gz/source/common/slice.h -> x265_1.9.tar.gz/source/common/slice.h Changed

@@ -31,6 +31,7 @@
 
 class Frame;
 class PicList;
+class PicYuv;
 class MotionReference;
 
 enum SliceType
@@ -104,6 +105,12 @@
 
 struct ProfileTierLevel
 {
+    int      profileIdc;
+    int      levelIdc;
+    uint32_t minCrForLevel;
+    uint32_t maxLumaSrForLevel;
+    uint32_t bitDepthConstraint;
+    int      chromaFormatConstraint;
     bool     tierFlag;
     bool     progressiveSourceFlag;
     bool     interlacedSourceFlag;
@@ -113,12 +120,6 @@
     bool     intraConstraintFlag;
     bool     onePictureOnlyConstraintFlag;
     bool     lowerBitRateConstraintFlag;
-    int      profileIdc;
-    int      levelIdc;
-    uint32_t minCrForLevel;
-    uint32_t maxLumaSrForLevel;
-    uint32_t bitDepthConstraint;
-    int      chromaFormatConstraint;
 };
 
 struct HRDInfo
@@ -151,21 +152,21 @@
 
 struct VPS
 {
+    HRDInfo          hrdParameters;
+    ProfileTierLevel ptl;
     uint32_t         maxTempSubLayers;
     uint32_t         numReorderPics;
     uint32_t         maxDecPicBuffering;
     uint32_t         maxLatencyIncrease;
-    HRDInfo          hrdParameters;
-    ProfileTierLevel ptl;
 };
 
 struct Window
 {
-    bool bEnabled;
     int  leftOffset;
     int  rightOffset;
     int  topOffset;
     int  bottomOffset;
+    bool bEnabled;
 
     Window()
     {
@@ -175,40 +176,41 @@
 
 struct VUI
 {
-    bool       aspectRatioInfoPresentFlag;
     int        aspectRatioIdc;
     int        sarWidth;
     int        sarHeight;
-
-    bool       overscanInfoPresentFlag;
-    bool       overscanAppropriateFlag;
-
-    bool       videoSignalTypePresentFlag;
     int        videoFormat;
-    bool       videoFullRangeFlag;
-
-    bool       colourDescriptionPresentFlag;
     int        colourPrimaries;
     int        transferCharacteristics;
     int        matrixCoefficients;
-
-    bool       chromaLocInfoPresentFlag;
     int        chromaSampleLocTypeTopField;
     int        chromaSampleLocTypeBottomField;
 
-    Window     defaultDisplayWindow;
-
+    bool       aspectRatioInfoPresentFlag;
+    bool       overscanInfoPresentFlag;
+    bool       overscanAppropriateFlag;
+    bool       videoSignalTypePresentFlag;
+    bool       videoFullRangeFlag;
+    bool       colourDescriptionPresentFlag;
+    bool       chromaLocInfoPresentFlag;
     bool       frameFieldInfoPresentFlag;
     bool       fieldSeqFlag;
-
     bool       hrdParametersPresentFlag;
-    HRDInfo    hrdParameters;
 
+    HRDInfo    hrdParameters;
+    Window     defaultDisplayWindow;
     TimingInfo timingInfo;
 };
 
 struct SPS
 {
+    /* cached PicYuv offset arrays, shared by all instances of
+     * PicYuv created by this encoder */
+    intptr_t* cuOffsetY;
+    intptr_t* cuOffsetC;
+    intptr_t* buOffsetY;
+    intptr_t* buOffsetC;
+
     int      chromaFormatIdc;        // use param
     uint32_t picWidthInLumaSamples;  // use param
     uint32_t picHeightInLumaSamples; // use param
@@ -228,8 +230,6 @@
     uint32_t quadtreeTUMaxDepthInter; // use param
     uint32_t quadtreeTUMaxDepthIntra; // use param
 
-    bool     bUseSAO; // use param
-    bool     bUseAMP; // use param
     uint32_t maxAMPDepth;
 
     uint32_t maxTempSubLayers;   // max number of Temporal Sub layers
@@ -237,11 +237,26 @@
     uint32_t maxLatencyIncrease;
     int      numReorderPics;
 
+    bool     bUseSAO; // use param
+    bool     bUseAMP; // use param
     bool     bUseStrongIntraSmoothing; // use param
     bool     bTemporalMVPEnabled;
 
     Window   conformanceWindow;
     VUI      vuiParameters;
+
+    SPS()
+    {
+        memset(this, 0, sizeof(*this));
+    }
+
+    ~SPS()
+    {
+        X265_FREE(cuOffsetY);
+        X265_FREE(cuOffsetC);
+        X265_FREE(buOffsetY);
+        X265_FREE(buOffsetC);
+    }
 };
 
 struct PPS
@@ -249,6 +264,8 @@
     uint32_t maxCuDQPDepth;
 
     int      chromaQpOffset[2];      // use param
+    int      deblockingFilterBetaOffsetDiv2;
+    int      deblockingFilterTcOffsetDiv2;
 
     bool     bUseWeightPred;         // use param
     bool     bUseWeightedBiPred;     // use param
@@ -262,17 +279,15 @@
 
     bool     bDeblockingFilterControlPresent;
     bool     bPicDisableDeblockingFilter;
-    int      deblockingFilterBetaOffsetDiv2;
-    int      deblockingFilterTcOffsetDiv2;
 };
 
 struct WeightParam
 {
     // Explicit weighted prediction parameters parsed in slice header,
-    bool     bPresentFlag;
     uint32_t log2WeightDenom;
     int      inputWeight;
     int      inputOffset;
+    bool     bPresentFlag;
 
     /* makes a non-h265 weight (i.e. fix7), into an h265 weight */
     void setFromWeightAndOffset(int w, int o, int denom, bool bNormalize)
@@ -304,6 +319,9 @@
 
     const SPS*  m_sps;
     const PPS*  m_pps;
+    Frame*      m_refFrameList[2][MAX_NUM_REF + 1];
+    PicYuv*     m_refReconPicList[2][MAX_NUM_REF + 1];
+
     WeightParam m_weightPredTable[2][MAX_NUM_REF][3]; // [list][refIdx][0:Y, 1:U, 2:V]
     MotionReference (*m_mref)[MAX_NUM_REF + 1];
     RPS         m_rps;
@@ -312,34 +330,28 @@
     SliceType   m_sliceType;
     int         m_sliceQp;
     int         m_poc;
-    
     int         m_lastIDR;
 
-    bool        m_bCheckLDC;       // TODO: is this necessary?
-    bool        m_sLFaseFlag;      // loop filter boundary flag
-    bool        m_colFromL0Flag;   // collocated picture from List0 or List1 flag

 
@@ -31,6 +31,7 @@
 
 class Frame;
 class PicList;
+class PicYuv;
 class MotionReference;
 
 enum SliceType
@@ -104,6 +105,12 @@
 
 struct ProfileTierLevel
 {
+    int      profileIdc;
+    int      levelIdc;
+    uint32_t minCrForLevel;
+    uint32_t maxLumaSrForLevel;
+    uint32_t bitDepthConstraint;
+    int      chromaFormatConstraint;
     bool     tierFlag;
     bool     progressiveSourceFlag;
     bool     interlacedSourceFlag;
@@ -113,12 +120,6 @@
     bool     intraConstraintFlag;
     bool     onePictureOnlyConstraintFlag;
     bool     lowerBitRateConstraintFlag;
-    int      profileIdc;
-    int      levelIdc;
-    uint32_t minCrForLevel;
-    uint32_t maxLumaSrForLevel;
-    uint32_t bitDepthConstraint;
-    int      chromaFormatConstraint;
 };
 
 struct HRDInfo
@@ -151,21 +152,21 @@
 
 struct VPS
 {
+    HRDInfo          hrdParameters;
+    ProfileTierLevel ptl;
     uint32_t         maxTempSubLayers;
     uint32_t         numReorderPics;
     uint32_t         maxDecPicBuffering;
     uint32_t         maxLatencyIncrease;
-    HRDInfo          hrdParameters;
-    ProfileTierLevel ptl;
 };
 
 struct Window
 {
-    bool bEnabled;
     int  leftOffset;
     int  rightOffset;
     int  topOffset;
     int  bottomOffset;
+    bool bEnabled;
 
     Window()
     {
@@ -175,40 +176,41 @@
 
 struct VUI
 {
-    bool       aspectRatioInfoPresentFlag;
     int        aspectRatioIdc;
     int        sarWidth;
     int        sarHeight;
-
-    bool       overscanInfoPresentFlag;
-    bool       overscanAppropriateFlag;
-
-    bool       videoSignalTypePresentFlag;
     int        videoFormat;
-    bool       videoFullRangeFlag;
-
-    bool       colourDescriptionPresentFlag;
     int        colourPrimaries;
     int        transferCharacteristics;
     int        matrixCoefficients;
-
-    bool       chromaLocInfoPresentFlag;
     int        chromaSampleLocTypeTopField;
     int        chromaSampleLocTypeBottomField;
 
-    Window     defaultDisplayWindow;
-
+    bool       aspectRatioInfoPresentFlag;
+    bool       overscanInfoPresentFlag;
+    bool       overscanAppropriateFlag;
+    bool       videoSignalTypePresentFlag;
+    bool       videoFullRangeFlag;
+    bool       colourDescriptionPresentFlag;
+    bool       chromaLocInfoPresentFlag;
     bool       frameFieldInfoPresentFlag;
     bool       fieldSeqFlag;
-
     bool       hrdParametersPresentFlag;
-    HRDInfo    hrdParameters;
 
+    HRDInfo    hrdParameters;
+    Window     defaultDisplayWindow;
     TimingInfo timingInfo;
 };
 
 struct SPS
 {
+    /* cached PicYuv offset arrays, shared by all instances of
+     * PicYuv created by this encoder */
+    intptr_t* cuOffsetY;
+    intptr_t* cuOffsetC;
+    intptr_t* buOffsetY;
+    intptr_t* buOffsetC;
+
     int      chromaFormatIdc;        // use param
     uint32_t picWidthInLumaSamples;  // use param
     uint32_t picHeightInLumaSamples; // use param
@@ -228,8 +230,6 @@
     uint32_t quadtreeTUMaxDepthInter; // use param
     uint32_t quadtreeTUMaxDepthIntra; // use param
 
-    bool     bUseSAO; // use param
-    bool     bUseAMP; // use param
     uint32_t maxAMPDepth;
 
     uint32_t maxTempSubLayers;   // max number of Temporal Sub layers
@@ -237,11 +237,26 @@
     uint32_t maxLatencyIncrease;
     int      numReorderPics;
 
+    bool     bUseSAO; // use param
+    bool     bUseAMP; // use param
     bool     bUseStrongIntraSmoothing; // use param
     bool     bTemporalMVPEnabled;
 
     Window   conformanceWindow;
     VUI      vuiParameters;
+
+    SPS()
+    {
+        memset(this, 0, sizeof(*this));
+    }
+
+    ~SPS()
+    {
+        X265_FREE(cuOffsetY);
+        X265_FREE(cuOffsetC);
+        X265_FREE(buOffsetY);
+        X265_FREE(buOffsetC);
+    }
 };
 
 struct PPS
@@ -249,6 +264,8 @@
     uint32_t maxCuDQPDepth;
 
     int      chromaQpOffset[2];      // use param
+    int      deblockingFilterBetaOffsetDiv2;
+    int      deblockingFilterTcOffsetDiv2;
 
     bool     bUseWeightPred;         // use param
     bool     bUseWeightedBiPred;     // use param
@@ -262,17 +279,15 @@
 
     bool     bDeblockingFilterControlPresent;
     bool     bPicDisableDeblockingFilter;
-    int      deblockingFilterBetaOffsetDiv2;
-    int      deblockingFilterTcOffsetDiv2;
 };
 
 struct WeightParam
 {
     // Explicit weighted prediction parameters parsed in slice header,
-    bool     bPresentFlag;
     uint32_t log2WeightDenom;
     int      inputWeight;
     int      inputOffset;
+    bool     bPresentFlag;
 
     /* makes a non-h265 weight (i.e. fix7), into an h265 weight */
     void setFromWeightAndOffset(int w, int o, int denom, bool bNormalize)
@@ -304,6 +319,9 @@
 
     const SPS*  m_sps;
     const PPS*  m_pps;
+    Frame*      m_refFrameList[2][MAX_NUM_REF + 1];
+    PicYuv*     m_refReconPicList[2][MAX_NUM_REF + 1];
+
     WeightParam m_weightPredTable[2][MAX_NUM_REF][3]; // [list][refIdx][0:Y, 1:U, 2:V]
     MotionReference (*m_mref)[MAX_NUM_REF + 1];
     RPS         m_rps;
@@ -312,34 +330,28 @@
     SliceType   m_sliceType;
     int         m_sliceQp;
     int         m_poc;
-    
     int         m_lastIDR;
 
-    bool        m_bCheckLDC;       // TODO: is this necessary?
-    bool        m_sLFaseFlag;      // loop filter boundary flag
-    bool        m_colFromL0Flag;   // collocated picture from List0 or List1 flag
​

x265_1.8.tar.gz/source/common/threading.h -> x265_1.9.tar.gz/source/common/threading.h Changed

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -204,6 +205,15 @@
         return ret;
     }
 
+    int getIncr(int n = 1)
+    {
+        EnterCriticalSection(&m_cs);
+        int ret = m_val;
+        m_val += n;
+        LeaveCriticalSection(&m_cs);
+        return ret;
+    }
+
     void set(int newval)
     {
         EnterCriticalSection(&m_cs);
@@ -393,6 +403,15 @@
         return ret;
     }
 
+    int getIncr(int n = 1)
+    {
+        pthread_mutex_lock(&m_mutex);
+        int ret = m_val;
+        m_val += n;
+        pthread_mutex_unlock(&m_mutex);
+        return ret;
+    }
+
     void set(int newval)
     {
         pthread_mutex_lock(&m_mutex);
​

x265_1.8.tar.gz/source/common/threadpool.cpp -> x265_1.9.tar.gz/source/common/threadpool.cpp Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -59,6 +60,9 @@
 #if HAVE_LIBNUMA
 #include <numa.h>
 #endif
+#if defined(_MSC_VER)
+# define strcasecmp _stricmp
+#endif
 
 namespace X265_NS {
 // x265 private namespace
@@ -226,8 +230,13 @@
 {
     enum { MAX_NODE_NUM = 127 };
     int cpusPerNode[MAX_NODE_NUM + 1];
+    int threadsPerPool[MAX_NODE_NUM + 2];
+    uint64_t nodeMaskPerPool[MAX_NODE_NUM + 2];
 
     memset(cpusPerNode, 0, sizeof(cpusPerNode));
+    memset(threadsPerPool, 0, sizeof(threadsPerPool));
+    memset(nodeMaskPerPool, 0, sizeof(nodeMaskPerPool));
+
     int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
     int cpuCount = getCpuCount();
     bool bNumaSupport = false;
@@ -258,7 +267,7 @@
         for (int i = 0; i < numNumaNodes; i++)
             x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]);
 
-    /* limit nodes based on param->numaPools */
+    /* limit threads based on param->numaPools */
     if (p->numaPools && *p->numaPools)
     {
         const char *nodeStr = p->numaPools;
@@ -266,19 +275,30 @@
         {
             if (!*nodeStr)
             {
-                cpusPerNode[i] = 0;
+                threadsPerPool[i] = 0;
                 continue;
             }
             else if (*nodeStr == '-')
-                cpusPerNode[i] = 0;
-            else if (*nodeStr == '*')
+                threadsPerPool[i] = 0;
+			else if (*nodeStr == '*' || !strcasecmp(nodeStr, "NULL"))
+            {
+                for (int j = i; j < numNumaNodes; j++)
+                {
+                    threadsPerPool[numNumaNodes] += cpusPerNode[j];
+                    nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << j);
+                }
                 break;
+            }
             else if (*nodeStr == '+')
-                ;
+            {
+                threadsPerPool[numNumaNodes] += cpusPerNode[i];
+                nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << i);
+            }
             else
             {
                 int count = atoi(nodeStr);
-                cpusPerNode[i] = X265_MIN(count, cpusPerNode[i]);
+                threadsPerPool[i] = X265_MIN(count, cpusPerNode[i]);
+                nodeMaskPerPool[i] = ((uint64_t)1 << i);
             }
 
             /* consume current node string, comma, and white-space */
@@ -288,14 +308,31 @@
                ++nodeStr;
         }
     }
+    else
+    {
+        for (int i = 0; i < numNumaNodes; i++)
+        {
+            threadsPerPool[numNumaNodes]  += cpusPerNode[i];
+            nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << i);
+        }
+    }
+ 
+    // If the last pool size is > MAX_POOL_THREADS, clip it to spawn thread pools only of size >= 1/2 max (heuristic)
+    if ((threadsPerPool[numNumaNodes] > MAX_POOL_THREADS) &&
+        ((threadsPerPool[numNumaNodes] % MAX_POOL_THREADS) < (MAX_POOL_THREADS / 2)))
+    {
+        threadsPerPool[numNumaNodes] -= (threadsPerPool[numNumaNodes] % MAX_POOL_THREADS);
+        x265_log(p, X265_LOG_DEBUG,
+                 "Creating only %d worker threads beyond specified numbers with --pools (if specified) to prevent asymmetry in pools; may not use all HW contexts\n", threadsPerPool[numNumaNodes]);
+    }
 
     numPools = 0;
-    for (int i = 0; i < numNumaNodes; i++)
+    for (int i = 0; i < numNumaNodes + 1; i++)
     {
         if (bNumaSupport)
             x265_log(p, X265_LOG_DEBUG, "NUMA node %d may use %d logical cores\n", i, cpusPerNode[i]);
-        if (cpusPerNode[i])
-            numPools += (cpusPerNode[i] + MAX_POOL_THREADS - 1) / MAX_POOL_THREADS;
+        if (threadsPerPool[i])
+            numPools += (threadsPerPool[i] + MAX_POOL_THREADS - 1) / MAX_POOL_THREADS;
     }
 
     if (!numPools)
@@ -314,20 +351,27 @@
         int node = 0;
         for (int i = 0; i < numPools; i++)
         {
-            while (!cpusPerNode[node])
+            while (!threadsPerPool[node])
                 node++;
-            int cores = X265_MIN(MAX_POOL_THREADS, cpusPerNode[node]);
-            if (!pools[i].create(cores, maxProviders, node))
+            int numThreads = X265_MIN(MAX_POOL_THREADS, threadsPerPool[node]);
+            if (!pools[i].create(numThreads, maxProviders, nodeMaskPerPool[node]))
             {
                 X265_FREE(pools);
                 numPools = 0;
                 return NULL;
             }
             if (numNumaNodes > 1)
-                x265_log(p, X265_LOG_INFO, "Thread pool %d using %d threads on NUMA node %d\n", i, cores, node);
+            {
+                char *nodesstr = new char[64 * strlen(",63") + 1];
+                int len = 0;
+                for (int j = 0; j < 64; j++)
+                    if ((nodeMaskPerPool[node] >> j) & 1)
+                        len += sprintf(nodesstr + len, ",%d", j);
+                x265_log(p, X265_LOG_INFO, "Thread pool %d using %d threads on numa nodes %s\n", i, numThreads, nodesstr + 1);
+            }
             else
-                x265_log(p, X265_LOG_INFO, "Thread pool created using %d threads\n", cores);
-            cpusPerNode[node] -= cores;
+                x265_log(p, X265_LOG_INFO, "Thread pool created using %d threads\n", numThreads);
+            threadsPerPool[node] -= numThreads;
         }
     }
     else
@@ -340,11 +384,37 @@
     memset(this, 0, sizeof(*this));
 }
 
-bool ThreadPool::create(int numThreads, int maxProviders, int node)
+bool ThreadPool::create(int numThreads, int maxProviders, uint64_t nodeMask)
 {
     X265_CHECK(numThreads <= MAX_POOL_THREADS, "a single thread pool cannot have more than MAX_POOL_THREADS threads\n");
 
-    m_numaNode = node;
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
+    m_winCpuMask = 0x0;
+    GROUP_AFFINITY groupAffinity;
+    for (int i = 0; i < getNumaNodeCount(); i++)
+    {
+        int numaNode = ((nodeMask >> i) & 0x1U) ? i : -1;
+        if (numaNode != -1)
+            if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
+                m_winCpuMask |= groupAffinity.Mask;
+    }
+    m_numaMask = &m_winCpuMask;
+#elif HAVE_LIBNUMA
+    if (numa_available() >= 0)
+    {
+        struct bitmask* nodemask = numa_allocate_nodemask();
+        if (nodemask)
+        {
+            *(nodemask->maskp) = nodeMask;
+            m_numaMask = nodemask;
+        }
+        else
+            x265_log(NULL, X265_LOG_ERROR, "unable to get NUMA node mask for %lx\n", nodeMask);
+    }
+#else
+    (void)nodeMask;
+#endif
+
     m_numWorkers = numThreads;
 
     m_workers = X265_MALLOC(WorkerThread, numThreads);
@@ -398,36 +468,39 @@
 
     X265_FREE(m_workers);
     X265_FREE(m_jpTable);
+
+#if HAVE_LIBNUMA
+    if(m_numaMask)
+        numa_free_nodemask((struct bitmask*)m_numaMask);
+#endif
 }
 
 void ThreadPool::setCurrentThreadAffinity()
 {
-    setThreadNodeAffinity(m_numaNode);

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -59,6 +60,9 @@
 #if HAVE_LIBNUMA
 #include <numa.h>
 #endif
+#if defined(_MSC_VER)
+# define strcasecmp _stricmp
+#endif
 
 namespace X265_NS {
 // x265 private namespace
@@ -226,8 +230,13 @@
 {
     enum { MAX_NODE_NUM = 127 };
     int cpusPerNode[MAX_NODE_NUM + 1];
+    int threadsPerPool[MAX_NODE_NUM + 2];
+    uint64_t nodeMaskPerPool[MAX_NODE_NUM + 2];
 
     memset(cpusPerNode, 0, sizeof(cpusPerNode));
+    memset(threadsPerPool, 0, sizeof(threadsPerPool));
+    memset(nodeMaskPerPool, 0, sizeof(nodeMaskPerPool));
+
     int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
     int cpuCount = getCpuCount();
     bool bNumaSupport = false;
@@ -258,7 +267,7 @@
         for (int i = 0; i < numNumaNodes; i++)
             x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]);
 
-    /* limit nodes based on param->numaPools */
+    /* limit threads based on param->numaPools */
     if (p->numaPools && *p->numaPools)
     {
         const char *nodeStr = p->numaPools;
@@ -266,19 +275,30 @@
         {
             if (!*nodeStr)
             {
-                cpusPerNode[i] = 0;
+                threadsPerPool[i] = 0;
                 continue;
             }
             else if (*nodeStr == '-')
-                cpusPerNode[i] = 0;
-            else if (*nodeStr == '*')
+                threadsPerPool[i] = 0;
+           else if (*nodeStr == '*' || !strcasecmp(nodeStr, "NULL"))
+            {
+                for (int j = i; j < numNumaNodes; j++)
+                {
+                    threadsPerPool[numNumaNodes] += cpusPerNode[j];
+                    nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << j);
+                }
                 break;
+            }
             else if (*nodeStr == '+')
-                ;
+            {
+                threadsPerPool[numNumaNodes] += cpusPerNode[i];
+                nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << i);
+            }
             else
             {
                 int count = atoi(nodeStr);
-                cpusPerNode[i] = X265_MIN(count, cpusPerNode[i]);
+                threadsPerPool[i] = X265_MIN(count, cpusPerNode[i]);
+                nodeMaskPerPool[i] = ((uint64_t)1 << i);
             }
 
             /* consume current node string, comma, and white-space */
@@ -288,14 +308,31 @@
                ++nodeStr;
         }
     }
+    else
+    {
+        for (int i = 0; i < numNumaNodes; i++)
+        {
+            threadsPerPool[numNumaNodes]  += cpusPerNode[i];
+            nodeMaskPerPool[numNumaNodes] |= ((uint64_t)1 << i);
+        }
+    }
+ 
+    // If the last pool size is > MAX_POOL_THREADS, clip it to spawn thread pools only of size >= 1/2 max (heuristic)
+    if ((threadsPerPool[numNumaNodes] > MAX_POOL_THREADS) &&
+        ((threadsPerPool[numNumaNodes] % MAX_POOL_THREADS) < (MAX_POOL_THREADS / 2)))
+    {
+        threadsPerPool[numNumaNodes] -= (threadsPerPool[numNumaNodes] % MAX_POOL_THREADS);
+        x265_log(p, X265_LOG_DEBUG,
+                 "Creating only %d worker threads beyond specified numbers with --pools (if specified) to prevent asymmetry in pools; may not use all HW contexts\n", threadsPerPool[numNumaNodes]);
+    }
 
     numPools = 0;
-    for (int i = 0; i < numNumaNodes; i++)
+    for (int i = 0; i < numNumaNodes + 1; i++)
     {
         if (bNumaSupport)
             x265_log(p, X265_LOG_DEBUG, "NUMA node %d may use %d logical cores\n", i, cpusPerNode[i]);
-        if (cpusPerNode[i])
-            numPools += (cpusPerNode[i] + MAX_POOL_THREADS - 1) / MAX_POOL_THREADS;
+        if (threadsPerPool[i])
+            numPools += (threadsPerPool[i] + MAX_POOL_THREADS - 1) / MAX_POOL_THREADS;
     }
 
     if (!numPools)
@@ -314,20 +351,27 @@
         int node = 0;
         for (int i = 0; i < numPools; i++)
         {
-            while (!cpusPerNode[node])
+            while (!threadsPerPool[node])
                 node++;
-            int cores = X265_MIN(MAX_POOL_THREADS, cpusPerNode[node]);
-            if (!pools[i].create(cores, maxProviders, node))
+            int numThreads = X265_MIN(MAX_POOL_THREADS, threadsPerPool[node]);
+            if (!pools[i].create(numThreads, maxProviders, nodeMaskPerPool[node]))
             {
                 X265_FREE(pools);
                 numPools = 0;
                 return NULL;
             }
             if (numNumaNodes > 1)
-                x265_log(p, X265_LOG_INFO, "Thread pool %d using %d threads on NUMA node %d\n", i, cores, node);
+            {
+                char *nodesstr = new char[64 * strlen(",63") + 1];
+                int len = 0;
+                for (int j = 0; j < 64; j++)
+                    if ((nodeMaskPerPool[node] >> j) & 1)
+                        len += sprintf(nodesstr + len, ",%d", j);
+                x265_log(p, X265_LOG_INFO, "Thread pool %d using %d threads on numa nodes %s\n", i, numThreads, nodesstr + 1);
+            }
             else
-                x265_log(p, X265_LOG_INFO, "Thread pool created using %d threads\n", cores);
-            cpusPerNode[node] -= cores;
+                x265_log(p, X265_LOG_INFO, "Thread pool created using %d threads\n", numThreads);
+            threadsPerPool[node] -= numThreads;
         }
     }
     else
@@ -340,11 +384,37 @@
     memset(this, 0, sizeof(*this));
 }
 
-bool ThreadPool::create(int numThreads, int maxProviders, int node)
+bool ThreadPool::create(int numThreads, int maxProviders, uint64_t nodeMask)
 {
     X265_CHECK(numThreads <= MAX_POOL_THREADS, "a single thread pool cannot have more than MAX_POOL_THREADS threads\n");
 
-    m_numaNode = node;
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
+    m_winCpuMask = 0x0;
+    GROUP_AFFINITY groupAffinity;
+    for (int i = 0; i < getNumaNodeCount(); i++)
+    {
+        int numaNode = ((nodeMask >> i) & 0x1U) ? i : -1;
+        if (numaNode != -1)
+            if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
+                m_winCpuMask |= groupAffinity.Mask;
+    }
+    m_numaMask = &m_winCpuMask;
+#elif HAVE_LIBNUMA
+    if (numa_available() >= 0)
+    {
+        struct bitmask* nodemask = numa_allocate_nodemask();
+        if (nodemask)
+        {
+            *(nodemask->maskp) = nodeMask;
+            m_numaMask = nodemask;
+        }
+        else
+            x265_log(NULL, X265_LOG_ERROR, "unable to get NUMA node mask for %lx\n", nodeMask);
+    }
+#else
+    (void)nodeMask;
+#endif
+
     m_numWorkers = numThreads;
 
     m_workers = X265_MALLOC(WorkerThread, numThreads);
@@ -398,36 +468,39 @@
 
     X265_FREE(m_workers);
     X265_FREE(m_jpTable);
+
+#if HAVE_LIBNUMA
+    if(m_numaMask)
+        numa_free_nodemask((struct bitmask*)m_numaMask);
+#endif
 }
 
 void ThreadPool::setCurrentThreadAffinity()
 {
-    setThreadNodeAffinity(m_numaNode);
​

x265_1.8.tar.gz/source/common/threadpool.h -> x265_1.9.tar.gz/source/common/threadpool.h Changed

 
@@ -83,7 +83,10 @@
     sleepbitmap_t m_sleepBitmap;
     int           m_numProviders;
     int           m_numWorkers;
-    int           m_numaNode;
+    void*         m_numaMask; // node mask in linux, cpu mask in windows
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
+    DWORD_PTR     m_winCpuMask;
+#endif
     bool          m_isActive;
 
     JobProvider** m_jpTable;
@@ -92,7 +95,7 @@
     ThreadPool();
     ~ThreadPool();
 
-    bool create(int numThreads, int maxProviders, int node);
+    bool create(int numThreads, int maxProviders, uint64_t nodeMask);
     bool start();
     void stopWorkers();
     void setCurrentThreadAffinity();
@@ -103,7 +106,7 @@
 
     static int  getCpuCount();
     static int  getNumaNodeCount();
-    static void setThreadNodeAffinity(int node);
+    static void setThreadNodeAffinity(void *numaMask);
 };
 
 /* Any worker thread may enlist the help of idle worker threads from the same
​

x265_1.8.tar.gz/source/common/version.cpp -> x265_1.9.tar.gz/source/common/version.cpp Changed

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
​

x265_1.8.tar.gz/source/common/wavefront.cpp -> x265_1.9.tar.gz/source/common/wavefront.cpp Changed

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
​

x265_1.8.tar.gz/source/common/wavefront.h -> x265_1.9.tar.gz/source/common/wavefront.h Changed

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
​

x265_1.8.tar.gz/source/common/x86/asm-primitives.cpp -> x265_1.9.tar.gz/source/common/x86/asm-primitives.cpp Changed

@@ -962,11 +962,8 @@
 
         p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse2);
         p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse2);
-
-#if X265_DEPTH <= 10
         p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse2);
         p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_sse2);
-#endif /* X265_DEPTH <= 10 */
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2);
 
         p.cu[BLOCK_4x4].intra_pred[2] = PFX(intra_pred_ang4_2_sse2);
@@ -1003,13 +1000,12 @@
         p.cu[BLOCK_4x4].intra_pred[33] = PFX(intra_pred_ang4_33_sse2);
 
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_sse2);
-#if X265_DEPTH <= 10
-        p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
-        ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
-
         p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_4x8_mmx2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_8x16_sse2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_sse2);
+#if X265_DEPTH <= 10
+        p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
+        ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
 #endif
         p.cu[BLOCK_4x4].dct = PFX(dct4_sse2);
         p.cu[BLOCK_8x8].dct = PFX(dct8_sse2);
@@ -1031,6 +1027,7 @@
         ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
         ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
         ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
+        p.propagateCost = PFX(mbtree_propagate_cost_sse2);
     }
     if (cpuMask & X265_CPU_SSE3)
     {
@@ -1144,11 +1141,8 @@
 
         p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse4);
         p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse4);
-
-#if X265_DEPTH <= 10
         p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse4);
         p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_sse4);
-#endif
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
         INTRA_ANG_SSE4_COMMON(sse4);
         INTRA_ANG_SSE4_HIGH(sse4);
@@ -1158,14 +1152,12 @@
         p.weight_sp = PFX(weight_sp_sse4);
 
         p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_sse4);
-        p.cu[BLOCK_4x4].psy_cost_ss = PFX(psyCost_ss_4x4_sse4);
 
         // TODO: check POPCNT flag!
         ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
 #if X265_DEPTH <= 10
         ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
 #endif
-        ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
 
         p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s = PFX(filterPixelToShort_2x4_sse4);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
@@ -1173,6 +1165,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s = PFX(filterPixelToShort_2x16_sse4);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
+        p.costCoeffRemain = PFX(costCoeffRemain_sse4);
     }
     if (cpuMask & X265_CPU_AVX)
     {
@@ -1306,6 +1299,7 @@
         p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx);
         p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
         p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
+        p.propagateCost = PFX(mbtree_propagate_cost_avx);
     }
     if (cpuMask & X265_CPU_XOP)
     {
@@ -1319,6 +1313,9 @@
     }
     if (cpuMask & X265_CPU_AVX2)
     {
+#if X265_DEPTH == 12
+        ASSIGN_SA8D(avx2);
+#endif
         p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
 
         // TODO: the planecopy_sp is really planecopy_SC now, must be fix it
@@ -1479,20 +1476,14 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_avx2);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_avx2);
 
-        p.cu[BLOCK_4x4].psy_cost_ss = PFX(psyCost_ss_4x4_avx2);
-        p.cu[BLOCK_8x8].psy_cost_ss = PFX(psyCost_ss_8x8_avx2);
-        p.cu[BLOCK_16x16].psy_cost_ss = PFX(psyCost_ss_16x16_avx2);
-        p.cu[BLOCK_32x32].psy_cost_ss = PFX(psyCost_ss_32x32_avx2);
-        p.cu[BLOCK_64x64].psy_cost_ss = PFX(psyCost_ss_64x64_avx2);
         p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_avx2);
-#if X265_DEPTH <= 10
+        p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
+        p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2);
+
         p.cu[BLOCK_8x8].psy_cost_pp = PFX(psyCost_pp_8x8_avx2);
         p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx2);
         p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx2);
         p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx2);
-        p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
-        p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2);
-#endif
 
         p.cu[BLOCK_16x16].intra_pred[DC_IDX] = PFX(intra_pred_dc16_avx2);
         p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2);
@@ -1536,20 +1527,13 @@
         p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
         p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
 
-#if X265_DEPTH <= 10
-        p.cu[BLOCK_16x16].sse_ss = PFX(pixel_ssd_ss_16x16_avx2);
-        p.cu[BLOCK_32x32].sse_ss = PFX(pixel_ssd_ss_32x32_avx2);
-        p.cu[BLOCK_64x64].sse_ss = PFX(pixel_ssd_ss_64x64_avx2);
-
-        p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
-        p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
-        p.cu[BLOCK_64x64].sse_pp = PFX(pixel_ssd_64x64_avx2);
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
+        p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_16x16_avx2);
+        p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_32x32_avx2);
+        p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_64x64_avx2);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_16x16_avx2);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x32_avx2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_avx2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_avx2);
-#endif
-
         p.quant = PFX(quant_avx2);
         p.nquant = PFX(nquant_avx2);
         p.dequant_normal  = PFX(dequant_normal_avx2);
@@ -1588,21 +1572,16 @@
         p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16_avx2);
         p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx2);
 
-#if X265_DEPTH <= 10
-        ALL_LUMA_TU_S(dct, dct, avx2);
         ALL_LUMA_TU_S(idct, idct, avx2);
-#endif
+        ALL_LUMA_TU_S(dct, dct, avx2);
+
         ALL_LUMA_CU_S(transpose, transpose, avx2);
 
         ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, avx2);
         ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, avx2);
-#if X265_DEPTH <= 10
         ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, avx2);
-#endif
         ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, avx2);
-#if X265_DEPTH <= 10
         p.pu[LUMA_4x4].luma_vsp = PFX(interp_8tap_vert_sp_4x4_avx2);               // since ALL_LUMA_PU didn't declare 4x4 size, calling separately luma_vsp function to use 
-#endif
 
         p.cu[BLOCK_16x16].add_ps = PFX(pixel_add_ps_16x16_avx2);
         p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
@@ -1625,7 +1604,6 @@
         p.pu[LUMA_16x12].sad = PFX(pixel_sad_16x12_avx2);
         p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx2);
         p.pu[LUMA_16x32].sad = PFX(pixel_sad_16x32_avx2);
-#if X265_DEPTH <= 10
         p.pu[LUMA_16x64].sad = PFX(pixel_sad_16x64_avx2);
         p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx2);
         p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx2);
@@ -1637,7 +1615,6 @@
         p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx2);
         p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx2);
         p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx2);
-#endif
 
         p.pu[LUMA_16x4].sad_x3 = PFX(pixel_sad_x3_16x4_avx2);
         p.pu[LUMA_16x8].sad_x3 = PFX(pixel_sad_x3_16x8_avx2);
@@ -1712,7 +1689,6 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx2);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx2);
 
-#if X265_DEPTH <= 10
         p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_avx2);
         p.pu[LUMA_4x8].luma_hps = PFX(interp_8tap_horiz_ps_4x8_avx2);
         p.pu[LUMA_4x16].luma_hps = PFX(interp_8tap_horiz_ps_4x16_avx2);
@@ -1738,7 +1714,6 @@
         p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx2);
         p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_avx2);
         p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_avx2);
-#endif
 
         p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_avx2);
         p.pu[LUMA_4x8].luma_hpp = PFX(interp_8tap_horiz_pp_4x8_avx2);
@@ -1766,7 +1741,6 @@
         p.pu[LUMA_24x32].luma_hpp = PFX(interp_8tap_horiz_pp_24x32_avx2);
         p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx2);

 
@@ -962,11 +962,8 @@
 
         p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse2);
         p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse2);
-
-#if X265_DEPTH <= 10
         p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse2);
         p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_sse2);
-#endif /* X265_DEPTH <= 10 */
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2);
 
         p.cu[BLOCK_4x4].intra_pred[2] = PFX(intra_pred_ang4_2_sse2);
@@ -1003,13 +1000,12 @@
         p.cu[BLOCK_4x4].intra_pred[33] = PFX(intra_pred_ang4_33_sse2);
 
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_sse2);
-#if X265_DEPTH <= 10
-        p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
-        ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
-
         p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_4x8_mmx2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_8x16_sse2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_sse2);
+#if X265_DEPTH <= 10
+        p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
+        ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
 #endif
         p.cu[BLOCK_4x4].dct = PFX(dct4_sse2);
         p.cu[BLOCK_8x8].dct = PFX(dct8_sse2);
@@ -1031,6 +1027,7 @@
         ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
         ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
         ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
+        p.propagateCost = PFX(mbtree_propagate_cost_sse2);
     }
     if (cpuMask & X265_CPU_SSE3)
     {
@@ -1144,11 +1141,8 @@
 
         p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar4_sse4);
         p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_sse4);
-
-#if X265_DEPTH <= 10
         p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_sse4);
         p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_sse4);
-#endif
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
         INTRA_ANG_SSE4_COMMON(sse4);
         INTRA_ANG_SSE4_HIGH(sse4);
@@ -1158,14 +1152,12 @@
         p.weight_sp = PFX(weight_sp_sse4);
 
         p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_sse4);
-        p.cu[BLOCK_4x4].psy_cost_ss = PFX(psyCost_ss_4x4_sse4);
 
         // TODO: check POPCNT flag!
         ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
 #if X265_DEPTH <= 10
         ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
 #endif
-        ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
 
         p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s = PFX(filterPixelToShort_2x4_sse4);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
@@ -1173,6 +1165,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].p2s = PFX(filterPixelToShort_2x8_sse4);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s = PFX(filterPixelToShort_2x16_sse4);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
+        p.costCoeffRemain = PFX(costCoeffRemain_sse4);
     }
     if (cpuMask & X265_CPU_AVX)
     {
@@ -1306,6 +1299,7 @@
         p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx);
         p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
         p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
+        p.propagateCost = PFX(mbtree_propagate_cost_avx);
     }
     if (cpuMask & X265_CPU_XOP)
     {
@@ -1319,6 +1313,9 @@
     }
     if (cpuMask & X265_CPU_AVX2)
     {
+#if X265_DEPTH == 12
+        ASSIGN_SA8D(avx2);
+#endif
         p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
 
         // TODO: the planecopy_sp is really planecopy_SC now, must be fix it
@@ -1479,20 +1476,14 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_avx2);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_avx2);
 
-        p.cu[BLOCK_4x4].psy_cost_ss = PFX(psyCost_ss_4x4_avx2);
-        p.cu[BLOCK_8x8].psy_cost_ss = PFX(psyCost_ss_8x8_avx2);
-        p.cu[BLOCK_16x16].psy_cost_ss = PFX(psyCost_ss_16x16_avx2);
-        p.cu[BLOCK_32x32].psy_cost_ss = PFX(psyCost_ss_32x32_avx2);
-        p.cu[BLOCK_64x64].psy_cost_ss = PFX(psyCost_ss_64x64_avx2);
         p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_avx2);
-#if X265_DEPTH <= 10
+        p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
+        p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2);
+
         p.cu[BLOCK_8x8].psy_cost_pp = PFX(psyCost_pp_8x8_avx2);
         p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx2);
         p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx2);
         p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx2);
-        p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
-        p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2);
-#endif
 
         p.cu[BLOCK_16x16].intra_pred[DC_IDX] = PFX(intra_pred_dc16_avx2);
         p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2);
@@ -1536,20 +1527,13 @@
         p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
         p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
 
-#if X265_DEPTH <= 10
-        p.cu[BLOCK_16x16].sse_ss = PFX(pixel_ssd_ss_16x16_avx2);
-        p.cu[BLOCK_32x32].sse_ss = PFX(pixel_ssd_ss_32x32_avx2);
-        p.cu[BLOCK_64x64].sse_ss = PFX(pixel_ssd_ss_64x64_avx2);
-
-        p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
-        p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
-        p.cu[BLOCK_64x64].sse_pp = PFX(pixel_ssd_64x64_avx2);
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
+        p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_16x16_avx2);
+        p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_32x32_avx2);
+        p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_64x64_avx2);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_16x16_avx2);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x32_avx2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_avx2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_avx2);
-#endif
-
         p.quant = PFX(quant_avx2);
         p.nquant = PFX(nquant_avx2);
         p.dequant_normal  = PFX(dequant_normal_avx2);
@@ -1588,21 +1572,16 @@
         p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16_avx2);
         p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx2);
 
-#if X265_DEPTH <= 10
-        ALL_LUMA_TU_S(dct, dct, avx2);
         ALL_LUMA_TU_S(idct, idct, avx2);
-#endif
+        ALL_LUMA_TU_S(dct, dct, avx2);
+
         ALL_LUMA_CU_S(transpose, transpose, avx2);
 
         ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, avx2);
         ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, avx2);
-#if X265_DEPTH <= 10
         ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, avx2);
-#endif
         ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, avx2);
-#if X265_DEPTH <= 10
         p.pu[LUMA_4x4].luma_vsp = PFX(interp_8tap_vert_sp_4x4_avx2);               // since ALL_LUMA_PU didn't declare 4x4 size, calling separately luma_vsp function to use 
-#endif
 
         p.cu[BLOCK_16x16].add_ps = PFX(pixel_add_ps_16x16_avx2);
         p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
@@ -1625,7 +1604,6 @@
         p.pu[LUMA_16x12].sad = PFX(pixel_sad_16x12_avx2);
         p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx2);
         p.pu[LUMA_16x32].sad = PFX(pixel_sad_16x32_avx2);
-#if X265_DEPTH <= 10
         p.pu[LUMA_16x64].sad = PFX(pixel_sad_16x64_avx2);
         p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx2);
         p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx2);
@@ -1637,7 +1615,6 @@
         p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx2);
         p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx2);
         p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx2);
-#endif
 
         p.pu[LUMA_16x4].sad_x3 = PFX(pixel_sad_x3_16x4_avx2);
         p.pu[LUMA_16x8].sad_x3 = PFX(pixel_sad_x3_16x8_avx2);
@@ -1712,7 +1689,6 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx2);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx2);
 
-#if X265_DEPTH <= 10
         p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_avx2);
         p.pu[LUMA_4x8].luma_hps = PFX(interp_8tap_horiz_ps_4x8_avx2);
         p.pu[LUMA_4x16].luma_hps = PFX(interp_8tap_horiz_ps_4x16_avx2);
@@ -1738,7 +1714,6 @@
         p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx2);
         p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_avx2);
         p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_avx2);
-#endif
 
         p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_avx2);
         p.pu[LUMA_4x8].luma_hpp = PFX(interp_8tap_horiz_pp_4x8_avx2);
@@ -1766,7 +1741,6 @@
         p.pu[LUMA_24x32].luma_hpp = PFX(interp_8tap_horiz_pp_24x32_avx2);
         p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx2);
 
​

x265_1.8.tar.gz/source/common/x86/blockcopy8.asm -> x265_1.9.tar.gz/source/common/x86/blockcopy8.asm Changed

 
@@ -3,6 +3,7 @@
 ;*
 ;* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
 ;*          Murugan Vairavel <murugan@multicorewareinc.com>
+;*          Min Chen <chenm003@163.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
​

x265_1.8.tar.gz/source/common/x86/blockcopy8.h -> x265_1.9.tar.gz/source/common/x86/blockcopy8.h Changed

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+;*          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
​

x265_1.8.tar.gz/source/common/x86/const-a.asm -> x265_1.9.tar.gz/source/common/x86/const-a.asm Changed

@@ -2,6 +2,7 @@
 ;* const-a.asm: x86 global constants
 ;*****************************************************************************
 ;* Copyright (C) 2010-2013 x264 project
+;* Copyright (C) 2013-2015 x265 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
@@ -31,10 +32,10 @@
 
 ;; 8-bit constants
 
-const pb_0,                 times 16 db 0
+const pb_0,                 times 32 db 0
 const pb_1,                 times 32 db 1
 const pb_2,                 times 32 db 2
-const pb_3,                 times 16 db 3
+const pb_3,                 times 32 db 3
 const pb_4,                 times 32 db 4
 const pb_8,                 times 32 db 8
 const pb_15,                times 32 db 15
@@ -54,6 +55,11 @@
 const pb_shuf8x8c,          times  1 db   0,   0,   0,   0,   2,   2,   2,   2,   4,   4,   4,   4,   6,   6,   6,   6
 const pb_movemask,          times 16 db 0x00
                             times 16 db 0xFF
+
+const pb_movemask_32,       times 32 db 0x00
+                            times 32 db 0xFF
+                            times 32 db 0x00
+
 const pb_0000000000000F0F,  times  2 db 0xff, 0x00
                             times 12 db 0x00
 const pb_000000000000000F,           db 0xff
@@ -61,6 +67,7 @@
 
 ;; 16-bit constants
 
+const pw_n1,                times 16 dw -1
 const pw_1,                 times 16 dw 1
 const pw_2,                 times 16 dw 2
 const pw_3,                 times 16 dw 3
@@ -86,12 +93,12 @@
 const pw_ff00,              times  8 dw 0xff00
 const pw_2000,              times 16 dw 0x2000
 const pw_8000,              times  8 dw 0x8000
-const pw_3fff,              times  8 dw 0x3fff
+const pw_3fff,              times 16 dw 0x3fff
 const pw_32_0,              times  4 dw 32,
                             times  4 dw 0
 const pw_pixel_max,         times 16 dw ((1 << BIT_DEPTH)-1)
 
-const pw_0_15,              times  2 dw   0,   1,   2,   3,   4,   5,   6,   7
+const pw_0_7,               times  2 dw   0,   1,   2,   3,   4,   5,   6,   7
 const pw_ppppmmmm,          times  1 dw   1,   1,   1,   1,  -1,  -1,  -1,  -1
 const pw_ppmmppmm,          times  1 dw   1,   1,  -1,  -1,   1,   1,  -1,  -1
 const pw_pmpmpmpm,          times 16 dw   1,  -1,   1,  -1,   1,  -1,   1,  -1
@@ -107,6 +114,7 @@
                             times  7 dw 0xff
 const hmul_16p,             times 16 db   1
                             times  8 db   1,  -1
+const pw_exp2_0_15,                  dw 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768
 
 
 ;; 32-bit constants
@@ -115,8 +123,9 @@
 const pd_2,                 times  8 dd 2
 const pd_4,                 times  4 dd 4
 const pd_8,                 times  4 dd 8
+const pd_15,                times  8 dd 15
 const pd_16,                times  8 dd 16
-const pd_31,                times  4 dd 31
+const pd_31,                times  8 dd 31
 const pd_32,                times  8 dd 32
 const pd_64,                times  4 dd 64
 const pd_128,               times  4 dd 128
@@ -129,7 +138,12 @@
 const pd_524416,            times  4 dd 524416
 const pd_n32768,            times  8 dd 0xffff8000
 const pd_n131072,           times  4 dd 0xfffe0000
-
+const pd_0000ffff,          times  8 dd 0x0000FFFF
+const pd_planar16_mul0,     times  1 dd  15,  14,  13,  12,  11,  10,   9,   8,    7,   6,   5,   4,   3,   2,   1,   0
+const pd_planar16_mul1,     times  1 dd   1,   2,   3,   4,   5,   6,   7,   8,    9,  10,  11,  12,  13,  14,  15,  16
+const pd_planar32_mul1,     times  1 dd  31,  30,  29,  28,  27,  26,  25,  24,   23,  22,  21,  20,  19,  18,  17,  16
+const pd_planar32_mul2,     times  1 dd  17,  18,  19,  20,  21,  22,  23,  24,   25,  26,  27,  28,  29,  30,  31,  32
+const pd_planar16_mul2,     times  1 dd  15,  14,  13,  12,  11,  10,   9,   8,    7,   6,   5,   4,   3,   2,   1,   0
 const trans8_shuf,          times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
 
 const popcnt_table

 
@@ -2,6 +2,7 @@
 ;* const-a.asm: x86 global constants
 ;*****************************************************************************
 ;* Copyright (C) 2010-2013 x264 project
+;* Copyright (C) 2013-2015 x265 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
@@ -31,10 +32,10 @@
 
 ;; 8-bit constants
 
-const pb_0,                 times 16 db 0
+const pb_0,                 times 32 db 0
 const pb_1,                 times 32 db 1
 const pb_2,                 times 32 db 2
-const pb_3,                 times 16 db 3
+const pb_3,                 times 32 db 3
 const pb_4,                 times 32 db 4
 const pb_8,                 times 32 db 8
 const pb_15,                times 32 db 15
@@ -54,6 +55,11 @@
 const pb_shuf8x8c,          times  1 db   0,   0,   0,   0,   2,   2,   2,   2,   4,   4,   4,   4,   6,   6,   6,   6
 const pb_movemask,          times 16 db 0x00
                             times 16 db 0xFF
+
+const pb_movemask_32,       times 32 db 0x00
+                            times 32 db 0xFF
+                            times 32 db 0x00
+
 const pb_0000000000000F0F,  times  2 db 0xff, 0x00
                             times 12 db 0x00
 const pb_000000000000000F,           db 0xff
@@ -61,6 +67,7 @@
 
 ;; 16-bit constants
 
+const pw_n1,                times 16 dw -1
 const pw_1,                 times 16 dw 1
 const pw_2,                 times 16 dw 2
 const pw_3,                 times 16 dw 3
@@ -86,12 +93,12 @@
 const pw_ff00,              times  8 dw 0xff00
 const pw_2000,              times 16 dw 0x2000
 const pw_8000,              times  8 dw 0x8000
-const pw_3fff,              times  8 dw 0x3fff
+const pw_3fff,              times 16 dw 0x3fff
 const pw_32_0,              times  4 dw 32,
                             times  4 dw 0
 const pw_pixel_max,         times 16 dw ((1 << BIT_DEPTH)-1)
 
-const pw_0_15,              times  2 dw   0,   1,   2,   3,   4,   5,   6,   7
+const pw_0_7,               times  2 dw   0,   1,   2,   3,   4,   5,   6,   7
 const pw_ppppmmmm,          times  1 dw   1,   1,   1,   1,  -1,  -1,  -1,  -1
 const pw_ppmmppmm,          times  1 dw   1,   1,  -1,  -1,   1,   1,  -1,  -1
 const pw_pmpmpmpm,          times 16 dw   1,  -1,   1,  -1,   1,  -1,   1,  -1
@@ -107,6 +114,7 @@
                             times  7 dw 0xff
 const hmul_16p,             times 16 db   1
                             times  8 db   1,  -1
+const pw_exp2_0_15,                  dw 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768
 
 
 ;; 32-bit constants
@@ -115,8 +123,9 @@
 const pd_2,                 times  8 dd 2
 const pd_4,                 times  4 dd 4
 const pd_8,                 times  4 dd 8
+const pd_15,                times  8 dd 15
 const pd_16,                times  8 dd 16
-const pd_31,                times  4 dd 31
+const pd_31,                times  8 dd 31
 const pd_32,                times  8 dd 32
 const pd_64,                times  4 dd 64
 const pd_128,               times  4 dd 128
@@ -129,7 +138,12 @@
 const pd_524416,            times  4 dd 524416
 const pd_n32768,            times  8 dd 0xffff8000
 const pd_n131072,           times  4 dd 0xfffe0000
-
+const pd_0000ffff,          times  8 dd 0x0000FFFF
+const pd_planar16_mul0,     times  1 dd  15,  14,  13,  12,  11,  10,   9,   8,    7,   6,   5,   4,   3,   2,   1,   0
+const pd_planar16_mul1,     times  1 dd   1,   2,   3,   4,   5,   6,   7,   8,    9,  10,  11,  12,  13,  14,  15,  16
+const pd_planar32_mul1,     times  1 dd  31,  30,  29,  28,  27,  26,  25,  24,   23,  22,  21,  20,  19,  18,  17,  16
+const pd_planar32_mul2,     times  1 dd  17,  18,  19,  20,  21,  22,  23,  24,   25,  26,  27,  28,  29,  30,  31,  32
+const pd_planar16_mul2,     times  1 dd  15,  14,  13,  12,  11,  10,   9,   8,    7,   6,   5,   4,   3,   2,   1,   0
 const trans8_shuf,          times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
 
 const popcnt_table
​

x265_1.8.tar.gz/source/common/x86/cpu-a.asm -> x265_1.9.tar.gz/source/common/x86/cpu-a.asm Changed

 
@@ -2,6 +2,7 @@
 ;* cpu-a.asm: x86 cpu utilities
 ;*****************************************************************************
 ;* Copyright (C) 2003-2013 x264 project
+;* Copyright (C) 2013-2015 x265 project
 ;*
 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Loren Merritt <lorenm@u.washington.edu>
​

x265_1.8.tar.gz/source/common/x86/dct8.asm -> x265_1.9.tar.gz/source/common/x86/dct8.asm Changed

@@ -2115,15 +2115,15 @@
     mova     m0, [r0]
     pabsw    m1, m0
 
-    mova     m2, [r1]
+    movu     m2, [r1]
     pmovsxwd m3, m1
     paddd    m2, m3
-    mova     [r1], m2
-    mova     m2, [r1 + 16]
+    movu     [r1], m2
+    movu     m2, [r1 + 16]
     psrldq   m3, m1, 8
     pmovsxwd m4, m3
     paddd    m2, m4
-    mova     [r1 + 16], m2
+    movu     [r1 + 16], m2
 
     movu     m3, [r2]
     psubusw  m1, m3
@@ -2174,7 +2174,7 @@
     pmaddwd         m0,                 m%4
     phaddd          m2,                 m0
     paddd           m2,                 m5
-    psrad           m2,                 DCT_SHIFT
+    psrad           m2,                 DCT8_SHIFT1
     packssdw        m2,                 m2
     vpermq          m2,                 m2, 0x08
     mova            [r5 + %2],          xm2
@@ -2190,7 +2190,7 @@
     phaddd          m8,                 m9
     phaddd          m6,                 m8
     paddd           m6,                 m5
-    psrad           m6,                 DCT_SHIFT2
+    psrad           m6,                 DCT8_SHIFT2
 
     vbroadcasti128  m4,                 [r6 + %2]
     pmaddwd         m10,                m0, m4
@@ -2201,7 +2201,7 @@
     phaddd          m8,                 m9
     phaddd          m10,                m8
     paddd           m10,                m5
-    psrad           m10,                DCT_SHIFT2
+    psrad           m10,                DCT8_SHIFT2
 
     packssdw        m6,                 m10
     vpermq          m10,                m6, 0xD8
@@ -2210,18 +2210,7 @@
 
 INIT_YMM avx2
 cglobal dct8, 3, 7, 11, 0-8*16
-%if BIT_DEPTH == 12
-    %define         DCT_SHIFT          6
-    vbroadcasti128  m5,                [pd_16]
-%elif BIT_DEPTH == 10
-    %define         DCT_SHIFT          4
-    vbroadcasti128  m5,                [pd_8]
-%elif BIT_DEPTH == 8
-    %define         DCT_SHIFT          2
-    vbroadcasti128  m5,                [pd_2]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
+vbroadcasti128      m5,                [pd_ %+ DCT8_ROUND1]
 %define             DCT_SHIFT2         9
 
     add             r2d,               r2d
@@ -2265,7 +2254,7 @@
     DCT8_PASS_1     7 * 16,             7 * 16, 4, 1
 
     ;pass2
-    vbroadcasti128  m5,                [pd_256]
+    vbroadcasti128  m5,                [pd_ %+ DCT8_ROUND2]
 
     mova            m0,                [r5]
     mova            m1,                [r5 + 32]
@@ -2904,7 +2893,7 @@
 cglobal idct8, 3, 7, 13, 0-8*16
 %if BIT_DEPTH == 12
     %define         IDCT_SHIFT2        8
-    vpbroadcastd    m12,                [pd_256]
+    vpbroadcastd    m12,                [pd_128]
 %elif BIT_DEPTH == 10
     %define         IDCT_SHIFT2        10
     vpbroadcastd    m12,                [pd_512]
@@ -3065,7 +3054,7 @@
 cglobal idct16, 3, 7, 16, 0-16*mmsize
 %if BIT_DEPTH == 12
     %define         IDCT_SHIFT2        8
-    vpbroadcastd    m15,                [pd_256]
+    vpbroadcastd    m15,                [pd_128]
 %elif BIT_DEPTH == 10
     %define         IDCT_SHIFT2        10
     vpbroadcastd    m15,                [pd_512]
@@ -3487,7 +3476,7 @@
 
 %if BIT_DEPTH == 12
     %define         IDCT_SHIFT2        8
-    vpbroadcastd    m15,                [pd_256]
+    vpbroadcastd    m15,                [pd_128]
 %elif BIT_DEPTH == 10
     %define         IDCT_SHIFT2        10
     vpbroadcastd    m15,                [pd_512]
@@ -3651,7 +3640,7 @@
 %define             IDCT_SHIFT1         7
 %if BIT_DEPTH == 12
     %define         IDCT_SHIFT2        8
-    vpbroadcastd    m5,                [pd_256]
+    vpbroadcastd    m5,                [pd_128]
 %elif BIT_DEPTH == 10
     %define         IDCT_SHIFT2        10
     vpbroadcastd    m5,                [pd_512]

 
@@ -2115,15 +2115,15 @@
     mova     m0, [r0]
     pabsw    m1, m0
 
-    mova     m2, [r1]
+    movu     m2, [r1]
     pmovsxwd m3, m1
     paddd    m2, m3
-    mova     [r1], m2
-    mova     m2, [r1 + 16]
+    movu     [r1], m2
+    movu     m2, [r1 + 16]
     psrldq   m3, m1, 8
     pmovsxwd m4, m3
     paddd    m2, m4
-    mova     [r1 + 16], m2
+    movu     [r1 + 16], m2
 
     movu     m3, [r2]
     psubusw  m1, m3
@@ -2174,7 +2174,7 @@
     pmaddwd         m0,                 m%4
     phaddd          m2,                 m0
     paddd           m2,                 m5
-    psrad           m2,                 DCT_SHIFT
+    psrad           m2,                 DCT8_SHIFT1
     packssdw        m2,                 m2
     vpermq          m2,                 m2, 0x08
     mova            [r5 + %2],          xm2
@@ -2190,7 +2190,7 @@
     phaddd          m8,                 m9
     phaddd          m6,                 m8
     paddd           m6,                 m5
-    psrad           m6,                 DCT_SHIFT2
+    psrad           m6,                 DCT8_SHIFT2
 
     vbroadcasti128  m4,                 [r6 + %2]
     pmaddwd         m10,                m0, m4
@@ -2201,7 +2201,7 @@
     phaddd          m8,                 m9
     phaddd          m10,                m8
     paddd           m10,                m5
-    psrad           m10,                DCT_SHIFT2
+    psrad           m10,                DCT8_SHIFT2
 
     packssdw        m6,                 m10
     vpermq          m10,                m6, 0xD8
@@ -2210,18 +2210,7 @@
 
 INIT_YMM avx2
 cglobal dct8, 3, 7, 11, 0-8*16
-%if BIT_DEPTH == 12
-    %define         DCT_SHIFT          6
-    vbroadcasti128  m5,                [pd_16]
-%elif BIT_DEPTH == 10
-    %define         DCT_SHIFT          4
-    vbroadcasti128  m5,                [pd_8]
-%elif BIT_DEPTH == 8
-    %define         DCT_SHIFT          2
-    vbroadcasti128  m5,                [pd_2]
-%else
-    %error Unsupported BIT_DEPTH!
-%endif
+vbroadcasti128      m5,                [pd_ %+ DCT8_ROUND1]
 %define             DCT_SHIFT2         9
 
     add             r2d,               r2d
@@ -2265,7 +2254,7 @@
     DCT8_PASS_1     7 * 16,             7 * 16, 4, 1
 
     ;pass2
-    vbroadcasti128  m5,                [pd_256]
+    vbroadcasti128  m5,                [pd_ %+ DCT8_ROUND2]
 
     mova            m0,                [r5]
     mova            m1,                [r5 + 32]
@@ -2904,7 +2893,7 @@
 cglobal idct8, 3, 7, 13, 0-8*16
 %if BIT_DEPTH == 12
     %define         IDCT_SHIFT2        8
-    vpbroadcastd    m12,                [pd_256]
+    vpbroadcastd    m12,                [pd_128]
 %elif BIT_DEPTH == 10
     %define         IDCT_SHIFT2        10
     vpbroadcastd    m12,                [pd_512]
@@ -3065,7 +3054,7 @@
 cglobal idct16, 3, 7, 16, 0-16*mmsize
 %if BIT_DEPTH == 12
     %define         IDCT_SHIFT2        8
-    vpbroadcastd    m15,                [pd_256]
+    vpbroadcastd    m15,                [pd_128]
 %elif BIT_DEPTH == 10
     %define         IDCT_SHIFT2        10
     vpbroadcastd    m15,                [pd_512]
@@ -3487,7 +3476,7 @@
 
 %if BIT_DEPTH == 12
     %define         IDCT_SHIFT2        8
-    vpbroadcastd    m15,                [pd_256]
+    vpbroadcastd    m15,                [pd_128]
 %elif BIT_DEPTH == 10
     %define         IDCT_SHIFT2        10
     vpbroadcastd    m15,                [pd_512]
@@ -3651,7 +3640,7 @@
 %define             IDCT_SHIFT1         7
 %if BIT_DEPTH == 12
     %define         IDCT_SHIFT2        8
-    vpbroadcastd    m5,                [pd_256]
+    vpbroadcastd    m5,                [pd_128]
 %elif BIT_DEPTH == 10
     %define         IDCT_SHIFT2        10
     vpbroadcastd    m5,                [pd_512]
​

x265_1.8.tar.gz/source/common/x86/dct8.h -> x265_1.9.tar.gz/source/common/x86/dct8.h Changed

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Nabajit Deka <nabajit@multicorewareinc.com>
+;*          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
​

x265_1.8.tar.gz/source/common/x86/intrapred16.asm -> x265_1.9.tar.gz/source/common/x86/intrapred16.asm Changed

@@ -109,9 +109,11 @@
 cextern pw_16
 cextern pw_31
 cextern pw_32
+cextern pd_15
 cextern pd_16
 cextern pd_31
 cextern pd_32
+cextern pd_0000ffff
 cextern pw_4096
 cextern pw_pixel_max
 cextern multiL
@@ -123,7 +125,12 @@
 cextern pb_unpackwq1
 cextern pb_unpackwq2
 cextern pw_planar16_mul
+cextern pd_planar16_mul0
+cextern pd_planar16_mul1
 cextern pw_planar32_mul
+cextern pd_planar32_mul1
+cextern pd_planar32_mul2
+cextern pd_planar16_mul2
 
 ;-----------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
@@ -731,6 +738,117 @@
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
 ;---------------------------------------------------------------------------------------
 INIT_XMM sse2
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
+cglobal intra_pred_planar16, 3,5,13
+    add             r1d, r1d
+    pxor            m12, m12
+
+    movu            m2, [r2 + 2]
+    movu            m10, [r2 + 18]
+
+    punpckhwd       m7, m2, m12
+    punpcklwd       m2, m12
+    punpckhwd       m0, m10, m12
+    punpcklwd       m10, m12
+
+    movzx           r3d, word [r2 + 34]                     ; topRight   = above[16]
+    lea             r4, [pd_planar16_mul1]
+
+    movd            m3, r3d
+    pshufd          m3, m3, 0                               ; topRight
+
+    pmaddwd         m8, m3, [r4 + 3*mmsize]                 ; (x + 1) * topRight
+    pmaddwd         m4, m3, [r4 + 2*mmsize]                 ; (x + 1) * topRight
+    pmaddwd         m9, m3, [r4 + 1*mmsize]                 ; (x + 1) * topRight
+    pmaddwd         m3, m3, [r4 + 0*mmsize]                 ; (x + 1) * topRight
+
+    mova            m11, [pd_15]
+    pmaddwd         m1, m2,  m11                            ; (blkSize - 1 - y) * above[x]
+    pmaddwd         m6, m7,  m11                            ; (blkSize - 1 - y) * above[x]
+    pmaddwd         m5, m10, m11                            ; (blkSize - 1 - y) * above[x]
+    pmaddwd         m11, m0                                 ; (blkSize - 1 - y) * above[x]
+
+    paddd           m4, m5
+    paddd           m3, m1
+    paddd           m8, m11
+    paddd           m9, m6
+
+    mova            m5, [pd_16]
+    paddd           m3, m5
+    paddd           m9, m5
+    paddd           m4, m5
+    paddd           m8, m5
+
+    movzx           r4d, word [r2 + 98]                     ; bottomLeft = left[16]
+    movd            m6, r4d
+    pshufd          m6, m6, 0                               ; bottomLeft
+
+    paddd           m4, m6
+    paddd           m3, m6
+    paddd           m8, m6
+    paddd           m9, m6
+
+    psubd           m1, m6, m0                              ; column 12-15
+    psubd           m11, m6, m10                            ; column 8-11
+    psubd           m10, m6, m7                             ; column 4-7
+    psubd           m6, m2                                  ; column 0-3
+
+    add             r2, 66
+    lea             r4, [pd_planar16_mul0]
+
+%macro INTRA_PRED_PLANAR16_sse2 1
+    movzx           r3d, word [r2 + %1*2]
+    movd            m5, r3d
+    pshufd          m5, m5, 0
+
+    pmaddwd         m0, m5, [r4 + 3*mmsize]                 ; column 12-15
+    pmaddwd         m2, m5, [r4 + 2*mmsize]                 ; column 8-11
+    pmaddwd         m7, m5, [r4 + 1*mmsize]                 ; column 4-7
+    pmaddwd         m5, m5, [r4 + 0*mmsize]                 ; column 0-3
+
+    paddd           m0, m8
+    paddd           m2, m4
+    paddd           m7, m9
+    paddd           m5, m3
+
+    paddd           m8, m1
+    paddd           m4, m11
+    paddd           m9, m10
+    paddd           m3, m6
+
+    psrad           m0, 5
+    psrad           m2, 5
+    psrad           m7, 5
+    psrad           m5, 5
+
+    packssdw        m2, m0
+    packssdw        m5, m7
+    movu            [r0], m5
+    movu            [r0 + mmsize], m2
+
+    add             r0, r1
+%endmacro
+
+    INTRA_PRED_PLANAR16_sse2 0
+    INTRA_PRED_PLANAR16_sse2 1
+    INTRA_PRED_PLANAR16_sse2 2
+    INTRA_PRED_PLANAR16_sse2 3
+    INTRA_PRED_PLANAR16_sse2 4
+    INTRA_PRED_PLANAR16_sse2 5
+    INTRA_PRED_PLANAR16_sse2 6
+    INTRA_PRED_PLANAR16_sse2 7
+    INTRA_PRED_PLANAR16_sse2 8
+    INTRA_PRED_PLANAR16_sse2 9
+    INTRA_PRED_PLANAR16_sse2 10
+    INTRA_PRED_PLANAR16_sse2 11
+    INTRA_PRED_PLANAR16_sse2 12
+    INTRA_PRED_PLANAR16_sse2 13
+    INTRA_PRED_PLANAR16_sse2 14
+    INTRA_PRED_PLANAR16_sse2 15
+    RET
+
+%else
+; code for BIT_DEPTH == 10
 cglobal intra_pred_planar16, 3,3,8
     movu            m2, [r2 + 2]
     movu            m7, [r2 + 18]
@@ -809,7 +927,180 @@
     INTRA_PRED_PLANAR_16 14
     INTRA_PRED_PLANAR_16 15
     RET
+%endif
+
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
+;---------------------------------------------------------------------------------------
+INIT_XMM sse2
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
+cglobal intra_pred_planar32, 3,7,16
+    ; NOTE: align stack to 64 bytes, so all of local data in same cache line
+    mov             r6, rsp
+    sub             rsp, 4*mmsize
+    and             rsp, ~63
+    %define         m16 [rsp + 0 * mmsize]
+    %define         m17 [rsp + 1 * mmsize]
+    %define         m18 [rsp + 2 * mmsize]
+    %define         m19 [rsp + 3 * mmsize]
+
+    add             r1, r1
+    pxor            m12, m12
+
+    movzx           r3d, word [r2 + 66]
+    lea             r4, [planar32_table1]
+
+    movd            m0, r3d
+    pshufd          m0, m0, 0
+
+    pmaddwd         m8, m0, [r4 + 0]
+    pmaddwd         m9, m0, [r4 + 16]
+    pmaddwd         m10, m0, [r4 + 32]
+    pmaddwd         m11, m0, [r4 + 48]
+    pmaddwd         m7, m0, [r4 + 64]
+    pmaddwd         m13, m0, [r4 + 80]
+    pmaddwd         m14, m0, [r4 + 96]
+    pmaddwd         m15, m0, [r4 + 112]
+
+    movzx           r3d, word [r2 + 194]
+    movd            m0, r3d
+    pshufd          m0, m0, 0
+
+    paddd           m8, m0
+    paddd           m9, m0
+    paddd           m10, m0
+    paddd           m11, m0
+    paddd           m7, m0
+    paddd           m13, m0
+    paddd           m14, m0
+    paddd           m15, m0
+
+    paddd           m8, [pd_32]
+    paddd           m9, [pd_32]
+    paddd           m10, [pd_32]
+    paddd           m11, [pd_32]
+    paddd           m7, [pd_32]

 
@@ -109,9 +109,11 @@
 cextern pw_16
 cextern pw_31
 cextern pw_32
+cextern pd_15
 cextern pd_16
 cextern pd_31
 cextern pd_32
+cextern pd_0000ffff
 cextern pw_4096
 cextern pw_pixel_max
 cextern multiL
@@ -123,7 +125,12 @@
 cextern pb_unpackwq1
 cextern pb_unpackwq2
 cextern pw_planar16_mul
+cextern pd_planar16_mul0
+cextern pd_planar16_mul1
 cextern pw_planar32_mul
+cextern pd_planar32_mul1
+cextern pd_planar32_mul2
+cextern pd_planar16_mul2
 
 ;-----------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
@@ -731,6 +738,117 @@
 ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
 ;---------------------------------------------------------------------------------------
 INIT_XMM sse2
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
+cglobal intra_pred_planar16, 3,5,13                         ; 16x16 planar intra prediction; r0 = dst, r1 = dstStride, r2 = srcPix; uses r0-r4 and m0-m12
+    add             r1d, r1d                                ; double dstStride (samples are 16-bit words here). NOTE(review): a 32-bit add zero-extends r1, so this presumably relies on a non-negative stride - confirm
+    pxor            m12, m12                                ; m12 = 0, used to zero-extend words to dwords
+
+    movu            m2, [r2 + 2]                            ; above[0..7]  (above[0] sits at byte offset 2, cf. above[16] at offset 34)
+    movu            m10, [r2 + 18]                          ; above[8..15]
+
+    punpckhwd       m7, m2, m12                             ; m7  = above[4..7]   as dwords
+    punpcklwd       m2, m12                                 ; m2  = above[0..3]   as dwords
+    punpckhwd       m0, m10, m12                            ; m0  = above[12..15] as dwords
+    punpcklwd       m10, m12                                ; m10 = above[8..11]  as dwords
+
+    movzx           r3d, word [r2 + 34]                     ; topRight   = above[16]
+    lea             r4, [pd_planar16_mul1]                  ; r4 -> weight table (defined elsewhere), one mmsize row per group of 4 columns
+
+    movd            m3, r3d
+    pshufd          m3, m3, 0                               ; topRight broadcast to all four dwords
+
+    pmaddwd         m8, m3, [r4 + 3*mmsize]                 ; (x + 1) * topRight, columns 12-15 -> m8
+    pmaddwd         m4, m3, [r4 + 2*mmsize]                 ; (x + 1) * topRight, columns 8-11  -> m4
+    pmaddwd         m9, m3, [r4 + 1*mmsize]                 ; (x + 1) * topRight, columns 4-7   -> m9
+    pmaddwd         m3, m3, [r4 + 0*mmsize]                 ; (x + 1) * topRight, columns 0-3   -> m3
+
+    mova            m11, [pd_15]                            ; multiplier for row 0 (pd_15 is defined elsewhere; by its name, packed dwords of 15)
+    pmaddwd         m1, m2,  m11                            ; (blkSize - 1 - y) * above[x], y = 0, columns 0-3
+    pmaddwd         m6, m7,  m11                            ; (blkSize - 1 - y) * above[x], y = 0, columns 4-7
+    pmaddwd         m5, m10, m11                            ; (blkSize - 1 - y) * above[x], y = 0, columns 8-11
+    pmaddwd         m11, m0                                 ; (blkSize - 1 - y) * above[x], y = 0, columns 12-15
+
+    paddd           m4, m5                                  ; fold the above[] term into the accumulators: columns 8-11
+    paddd           m3, m1                                  ; columns 0-3
+    paddd           m8, m11                                 ; columns 12-15
+    paddd           m9, m6                                  ; columns 4-7
+
+    mova            m5, [pd_16]                             ; constant added to every accumulator before the final >> 5 (presumably the rounding offset)
+    paddd           m3, m5
+    paddd           m9, m5
+    paddd           m4, m5
+    paddd           m8, m5
+
+    movzx           r4d, word [r2 + 98]                     ; bottomLeft = left[16]
+    movd            m6, r4d
+    pshufd          m6, m6, 0                               ; bottomLeft
+
+    paddd           m4, m6                                  ; add bottomLeft once: the row-0 accumulators are now complete
+    paddd           m3, m6                                  ; except for the left[y] term, which the row macro adds
+    paddd           m8, m6
+    paddd           m9, m6
+
+    psubd           m1, m6, m0                              ; column 12-15: per-row delta = bottomLeft - above[x]
+    psubd           m11, m6, m10                            ; column 8-11
+    psubd           m10, m6, m7                             ; column 4-7
+    psubd           m6, m2                                  ; column 0-3
+
+    add             r2, 66                                  ; r2 -> left[0] (left[16] was at offset 98 = 66 + 16*2)
+    lea             r4, [pd_planar16_mul0]                  ; r4 -> weight table applied to left[y] (defined elsewhere)
+
+%macro INTRA_PRED_PLANAR16_sse2 1                           ; emit one 16-sample output row; %1 = row index y (invoked with 0..15)
+    movzx           r3d, word [r2 + %1*2]                   ; left[y]
+    movd            m5, r3d
+    pshufd          m5, m5, 0                               ; broadcast left[y] to all four dwords
+
+    pmaddwd         m0, m5, [r4 + 3*mmsize]                 ; column 12-15: left[y] * weights from pd_planar16_mul0
+    pmaddwd         m2, m5, [r4 + 2*mmsize]                 ; column 8-11
+    pmaddwd         m7, m5, [r4 + 1*mmsize]                 ; column 4-7
+    pmaddwd         m5, m5, [r4 + 0*mmsize]                 ; column 0-3
+
+    paddd           m0, m8                                  ; add running per-column accumulator, columns 12-15
+    paddd           m2, m4                                  ; columns 8-11
+    paddd           m7, m9                                  ; columns 4-7
+    paddd           m5, m3                                  ; columns 0-3
+
+    paddd           m8, m1                                  ; step the accumulators to row y + 1 by adding
+    paddd           m4, m11                                 ; the per-column delta (bottomLeft - above[x])
+    paddd           m9, m10
+    paddd           m3, m6
+
+    psrad           m0, 5                                   ; arithmetic shift right by 5 (divide the weighted sum by 32)
+    psrad           m2, 5
+    psrad           m7, 5
+    psrad           m5, 5
+
+    packssdw        m2, m0                                  ; narrow dwords to words: m2 = columns 8-15
+    packssdw        m5, m7                                  ; m5 = columns 0-7
+    movu            [r0], m5                                ; store columns 0-7 of this row
+    movu            [r0 + mmsize], m2                       ; store columns 8-15 of this row
+
+    add             r0, r1                                  ; advance dst by the doubled stride to the next row
+%endmacro
+
+    INTRA_PRED_PLANAR16_sse2 0                              ; fully unrolled: one macro expansion per output row
+    INTRA_PRED_PLANAR16_sse2 1
+    INTRA_PRED_PLANAR16_sse2 2
+    INTRA_PRED_PLANAR16_sse2 3
+    INTRA_PRED_PLANAR16_sse2 4
+    INTRA_PRED_PLANAR16_sse2 5
+    INTRA_PRED_PLANAR16_sse2 6
+    INTRA_PRED_PLANAR16_sse2 7
+    INTRA_PRED_PLANAR16_sse2 8
+    INTRA_PRED_PLANAR16_sse2 9
+    INTRA_PRED_PLANAR16_sse2 10
+    INTRA_PRED_PLANAR16_sse2 11
+    INTRA_PRED_PLANAR16_sse2 12
+    INTRA_PRED_PLANAR16_sse2 13
+    INTRA_PRED_PLANAR16_sse2 14
+    INTRA_PRED_PLANAR16_sse2 15
+    RET
+
+%else
+; generic path: used whenever the ARCH_X86_64 == 1 && BIT_DEPTH == 12 condition above is false (e.g. BIT_DEPTH == 10)
 cglobal intra_pred_planar16, 3,3,8
     movu            m2, [r2 + 2]
     movu            m7, [r2 + 18]
@@ -809,7 +927,180 @@
     INTRA_PRED_PLANAR_16 14
     INTRA_PRED_PLANAR_16 15
     RET
+%endif
+
+;---------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
+;---------------------------------------------------------------------------------------
+INIT_XMM sse2
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
+cglobal intra_pred_planar32, 3,7,16
+    ; NOTE: align stack to 64 bytes, so all of local data in same cache line
+    mov             r6, rsp
+    sub             rsp, 4*mmsize
+    and             rsp, ~63
+    %define         m16 [rsp + 0 * mmsize]
+    %define         m17 [rsp + 1 * mmsize]
+    %define         m18 [rsp + 2 * mmsize]
+    %define         m19 [rsp + 3 * mmsize]
+
+    add             r1, r1
+    pxor            m12, m12
+
+    movzx           r3d, word [r2 + 66]
+    lea             r4, [planar32_table1]
+
+    movd            m0, r3d
+    pshufd          m0, m0, 0
+
+    pmaddwd         m8, m0, [r4 + 0]
+    pmaddwd         m9, m0, [r4 + 16]
+    pmaddwd         m10, m0, [r4 + 32]
+    pmaddwd         m11, m0, [r4 + 48]
+    pmaddwd         m7, m0, [r4 + 64]
+    pmaddwd         m13, m0, [r4 + 80]
+    pmaddwd         m14, m0, [r4 + 96]
+    pmaddwd         m15, m0, [r4 + 112]
+
+    movzx           r3d, word [r2 + 194]
+    movd            m0, r3d
+    pshufd          m0, m0, 0
+
+    paddd           m8, m0
+    paddd           m9, m0
+    paddd           m10, m0
+    paddd           m11, m0
+    paddd           m7, m0
+    paddd           m13, m0
+    paddd           m14, m0
+    paddd           m15, m0
+
+    paddd           m8, [pd_32]
+    paddd           m9, [pd_32]
+    paddd           m10, [pd_32]
+    paddd           m11, [pd_32]
+    paddd           m7, [pd_32]
​

x265_1.8.tar.gz/source/common/x86/intrapred8.asm -> x265_1.9.tar.gz/source/common/x86/intrapred8.asm Changed

@@ -27,7 +27,9 @@
 
 SECTION_RODATA 32
 
-intra_pred_shuff_0_8:    times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+const intra_pred_shuff_0_8,     times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+                                        db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
+
 intra_pred_shuff_15_0:   times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
 intra_filter4_shuf0:  times 2 db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
@@ -54,13 +56,13 @@
 c_shuf8_0:            db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
 c_deinterval8:        db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
 pb_unpackbq:          db  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1
-c_mode16_12:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
-c_mode16_13:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
-c_mode16_14:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
+c_mode16_12:          db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
+c_mode16_13:          db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
+c_mode16_14:          db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
 c_mode16_15:          db  0,  0,  0,  0,  0,  0,  0,  0, 15, 13, 11,  9,  8,  6,  4,  2
 c_mode16_16:          db  8,  6,  5,  3,  2,  0, 15, 14, 12, 11,  9,  8,  6,  5,  3,  2
 c_mode16_17:          db  4,  2,  1,  0, 15, 14, 12, 11, 10,  9,  7,  6,  5,  4,  2,  1
-c_mode16_18:    db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+c_mode16_18:          db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
 
 ALIGN 32
 c_ang8_src1_9_2_10:   db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
@@ -259,235 +261,6 @@
                      db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
                      db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
 
-
-ALIGN 32
-c_ang32_mode_27:    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
-                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-                    db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                    db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                    db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                    db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                    db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                    db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
-                    db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
-                    db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-                    db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                    db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-                    db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                    db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-
-ALIGN 32
-c_ang32_mode_28:    db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
-                    db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                    db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                    db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-                    db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                    db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                    db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
-                    db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                    db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                    db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9
-                    db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
-                    db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7
-                    db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-                    db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-ALIGN 32
-c_ang32_mode_29:    db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                    db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
-                    db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-                    db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-                    db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                    db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                    db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                    db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                    db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
-                    db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-                    db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                    db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
-                    db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                    db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-                    db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-
-ALIGN 32
-c_ang32_mode_30:    db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                    db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                    db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-                    db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-                    db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
-                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-                    db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                    db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-                    db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                    db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29,  3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-                    db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-                    db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                    db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                    db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                    db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-                    db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-
-ALIGN 32
-c_ang32_mode_31:    db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
-                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-                    db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-                    db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-                    db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-                    db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-                    db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                    db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                    db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                    db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-                    db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                    db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                    db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                    db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                    db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-
-ALIGN 32
-c_ang32_mode_32:   db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-                   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                   db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                   db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
-                   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                   db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                   db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-                   db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                   db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                   db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-                   db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-                   db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                   db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-                   db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                   db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-                   db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
-                   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-ALIGN 32
-c_ang32_mode_25:   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                   db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-                   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                   db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-ALIGN 32
-c_ang32_mode_24:   db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-                   db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                   db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
-                   db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                   db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-                   db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
-                   db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                   db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                   db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
-                   db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
-                   db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
-                   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3
-                   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-                   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
-                   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-

 
@@ -27,7 +27,9 @@
 
 SECTION_RODATA 32
 
-intra_pred_shuff_0_8:    times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+const intra_pred_shuff_0_8,     times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+                                        db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
+
 intra_pred_shuff_15_0:   times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
 intra_filter4_shuf0:  times 2 db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13
@@ -54,13 +56,13 @@
 c_shuf8_0:            db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
 c_deinterval8:        db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
 pb_unpackbq:          db  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1
-c_mode16_12:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
-c_mode16_13:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
-c_mode16_14:    db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
+c_mode16_12:          db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6
+c_mode16_13:          db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4
+c_mode16_14:          db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2
 c_mode16_15:          db  0,  0,  0,  0,  0,  0,  0,  0, 15, 13, 11,  9,  8,  6,  4,  2
 c_mode16_16:          db  8,  6,  5,  3,  2,  0, 15, 14, 12, 11,  9,  8,  6,  5,  3,  2
 c_mode16_17:          db  4,  2,  1,  0, 15, 14, 12, 11, 10,  9,  7,  6,  5,  4,  2,  1
-c_mode16_18:    db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+c_mode16_18:          db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
 
 ALIGN 32
 c_ang8_src1_9_2_10:   db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
@@ -259,235 +261,6 @@
                      db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
                      db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
 
-
-ALIGN 32
-c_ang32_mode_27:    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
-                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-                    db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                    db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                    db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                    db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                    db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                    db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
-                    db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
-                    db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-                    db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                    db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-                    db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                    db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-
-ALIGN 32
-c_ang32_mode_28:    db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
-                    db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                    db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                    db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-                    db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                    db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                    db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
-                    db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                    db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                    db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9
-                    db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
-                    db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7
-                    db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-                    db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-ALIGN 32
-c_ang32_mode_29:    db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                    db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
-                    db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-                    db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-                    db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                    db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                    db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                    db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                    db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
-                    db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-                    db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                    db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
-                    db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                    db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-                    db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-
-ALIGN 32
-c_ang32_mode_30:    db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                    db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                    db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-                    db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-                    db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
-                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-                    db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                    db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-                    db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                    db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29,  3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-                    db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-                    db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                    db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                    db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                    db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-                    db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-
-ALIGN 32
-c_ang32_mode_31:    db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-                    db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
-                    db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
-                    db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-                    db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-                    db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-                    db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-                    db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-                    db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                    db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                    db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                    db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-                    db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                    db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                    db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                    db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                    db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-
-ALIGN 32
-c_ang32_mode_32:   db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-                   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                   db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-                   db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
-                   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-                   db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                   db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-                   db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                   db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                   db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-                   db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-                   db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                   db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-                   db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                   db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-                   db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
-                   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-ALIGN 32
-c_ang32_mode_25:   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                   db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-                   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-                   db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
-                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-ALIGN 32
-c_ang32_mode_24:   db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-                   db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-                   db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
-                   db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-                   db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-                   db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
-                   db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
-                   db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-                   db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
-                   db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
-                   db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
-                   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3
-                   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-                   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
-                   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
-
-
​

x265_1.8.tar.gz/source/common/x86/intrapred8_allangs.asm -> x265_1.9.tar.gz/source/common/x86/intrapred8_allangs.asm Changed

@@ -27,62 +27,63 @@
 
 SECTION_RODATA 32
 
-all_ang4_shuff: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-                db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7
-                db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4
-                db 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12
-                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 4, 0, 0, 9, 9, 10, 10, 11
-                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11
-                db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 4, 2, 2, 0, 0, 9, 9, 10
-                db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 3, 2, 2, 0, 0, 9, 9, 10
-                db 0, 9, 9, 10, 10, 11, 11, 12, 1, 0, 0, 9, 9, 10, 10, 11, 2, 1, 1, 0, 0, 9, 9, 10, 4, 2, 2, 1, 1, 0, 0, 9
-                db 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0, 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0
-                db 0, 1, 1, 2, 2, 3, 3, 4, 9, 0, 0, 1, 1, 2, 2, 3, 10, 9, 9, 0, 0, 1, 1, 2, 12, 10, 10, 9, 9, 0, 0, 1
-                db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 11, 10, 10, 0, 0, 1, 1, 2
-                db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 12, 10, 10, 0, 0, 1, 1, 2
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 12, 0, 0, 1, 1, 2, 2, 3
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4
-                db 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4
-                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
-                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6
-                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6
-                db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7
-                db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7, 4, 5, 5, 6, 6, 7, 7, 8
-                db 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8
-
-all_ang4: db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8
-          db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20
-          db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
-          db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
-          db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
-          db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
-          db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
-          db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
-          db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
-          db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
-          db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
-          db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
-          db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
-          db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
-          db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
-          db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
-          db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
-          db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
-          db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
-          db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
-          db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
-          db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
-          db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
-          db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
-          db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
-          db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
-          db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20
-          db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8
+const allAng4_shuf_mode2,       db  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7
+const allAng4_shuf_mode3_4,     db  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5,  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5
+                                db  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7,  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6
+const allAng4_shuf_mode5_6,     db  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+                                db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5
+const allAng4_shuf_mode7_8,     db  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+                                db  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+const allAng4_shuf_mode10,      db  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3
+const allAng4_shuf_mode11_12,   db  0,  9,  9, 10, 10, 11, 11, 12,  0,  9,  9, 10, 10, 11, 11, 12,  0,  9,  9, 10, 10, 11, 11, 12,  0,  9,  9, 10, 10, 11, 11, 12
+const allAng4_shuf_mode13_14,   db  0,  9,  9, 10, 10, 11, 11, 12,  4,  0,  0,  9,  9, 10, 10, 11,  2,  0,  0,  9,  9, 10, 10, 11,  2,  0,  0,  9,  9, 10, 10, 11
+const allAng4_shuf_mode15_16,   db  0,  9,  9, 10, 10, 11, 11, 12,  2,  0,  0,  9,  9, 10, 10, 11,  0,  9,  9, 10, 10, 11, 11, 12,  2,  0,  0,  9,  9, 10, 10, 11
+                                db  2,  0,  0,  9,  9, 10, 10, 11,  4,  2,  2,  0,  0,  9,  9, 10,  2,  0,  0,  9,  9, 10, 10, 11,  3,  2,  2,  0,  0,  9,  9, 10
+const allAng4_shuf_mode17,      db  0,  9,  9, 10, 10, 11, 11, 12,  1,  0,  0,  9,  9, 10, 10, 11,  2,  1,  1,  0,  0,  9,  9, 10,  4,  2,  2,  1,  1,  0,  0,  9
+                                db  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0,  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0
+const allAng4_shuf_mode18,      db  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0,  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0
+const allAng4_shuf_mode19_20,   db  0,  1,  1,  2,  2,  3,  3,  4,  9,  0,  0,  1,  1,  2,  2,  3,  0,  1,  1,  2,  2,  3,  3,  4, 10,  0,  0,  1,  1,  2,  2,  3
+                                db 10,  9,  9,  0,  0,  1,  1,  2, 12, 10, 10,  9,  9,  0,  0,  1, 10,  0,  0,  1,  1,  2,  2,  3, 11, 10, 10,  0,  0,  1,  1,  2
+const allAng4_shuf_mode21_22,   db  0,  1,  1,  2,  2,  3,  3,  4, 10,  0,  0,  1,  1,  2,  2,  3,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+                                db 10,  0,  0,  1,  1,  2,  2,  3, 12, 10, 10,  0,  0,  1,  1,  2, 10,  0,  0,  1,  1,  2,  2,  3, 10,  0,  0,  1,  1,  2,  2,  3
+const allAng4_shuf_mode23_24,   db  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+                                db  0,  1,  1,  2,  2,  3,  3,  4, 12,  0,  0,  1,  1,  2,  2,  3,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+const allAng4_shuf_mode26,      db  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4
+const allAng4_shuf_mode27_28,   db  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5
+const allAng4_shuf_mode29_30,   db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  2,  3,  3,  4,  4,  5,  5,  6,  2,  3,  3,  4,  4,  5,  5,  6
+const allAng4_shuf_mode31_32,   db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6
+                                db  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7,  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7
+const allAng4_shuf_mode33,      db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7,  4,  5,  5,  6,  6,  7,  7,  8
+const allAng4_shuf_mode34,      db  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7,  5,  6,  7,  8,  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7,  5,  6,  7,  8
+
+const allAng4_fact_mode3_4,     db  6, 26,  6, 26,  6, 26,  6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10
+                                db 18, 14, 18, 14, 18, 14, 18, 14, 24,  8, 24,  8, 24,  8, 24,  8,  1, 31,  1, 31,  1, 31,  1, 31, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode5_6,     db 15, 17, 15, 17, 15, 17, 15, 17, 30,  2, 30,  2, 30,  2, 30,  2, 19, 13, 19, 13, 19, 13, 19, 13,  6, 26,  6, 26,  6, 26,  6, 26
+                                db 13, 19, 13, 19, 13, 19, 13, 19, 28,  4, 28,  4, 28,  4, 28,  4, 25,  7, 25,  7, 25,  7, 25,  7, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode7_8,     db 23,  9, 23,  9, 23,  9, 23,  9, 14, 18, 14, 18, 14, 18, 14, 18, 27,  5, 27,  5, 27,  5, 27,  5, 22, 10, 22, 10, 22, 10, 22, 10
+                                db  5, 27,  5, 27,  5, 27,  5, 27, 28,  4, 28,  4, 28,  4, 28,  4, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode9,       db 30,  2, 30,  2, 30,  2, 30,  2, 28,  4, 28,  4, 28,  4, 28,  4, 26,  6, 26,  6, 26,  6, 26,  6, 24,  8, 24,  8, 24,  8, 24,  8
+const allAng4_fact_mode11_12,   db  2, 30,  2, 30,  2, 30,  2, 30,  4, 28,  4, 28,  4, 28,  4, 28,  5, 27,  5, 27,  5, 27,  5, 27, 10, 22, 10, 22, 10, 22, 10, 22
+                                db  6, 26,  6, 26,  6, 26,  6, 26,  8, 24,  8, 24,  8, 24,  8, 24, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode13_14,   db  9, 23,  9, 23,  9, 23,  9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 13, 19, 13, 19, 13, 19, 13, 19, 26,  6, 26,  6, 26,  6, 26,  6
+                                db 27,  5, 27,  5, 27,  5, 27,  5,  4, 28,  4, 28,  4, 28,  4, 28,  7, 25,  7, 25,  7, 25,  7, 25, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode15_16,   db 17, 15, 17, 15, 17, 15, 17, 15,  2, 30,  2, 30,  2, 30,  2, 30, 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22
+                                db 19, 13, 19, 13, 19, 13, 19, 13,  4, 28,  4, 28,  4, 28,  4, 28, 31,  1, 31,  1, 31,  1, 31,  1, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode17,      db 26,  6, 26,  6, 26,  6, 26,  6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18,  8, 24,  8, 24,  8, 24,  8, 24
+const allAng4_fact_mode19_20,   db 26,  6, 26,  6, 26,  6, 26,  6, 20, 12, 20, 12, 20, 12, 20, 12, 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22
+                                db 14, 18, 14, 18, 14, 18, 14, 18,  8, 24,  8, 24,  8, 24,  8, 24, 31,  1, 31,  1, 31,  1, 31,  1, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode21_22,   db 17, 15, 17, 15, 17, 15, 17, 15,  2, 30,  2, 30,  2, 30,  2, 30, 13, 19, 13, 19, 13, 19, 13, 19, 26,  6, 26,  6, 26,  6, 26,  6
+                                db 19, 13, 19, 13, 19, 13, 19, 13,  4, 28,  4, 28,  4, 28,  4, 28,  7, 25,  7, 25,  7, 25,  7, 25, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode23_24,   db  9, 23,  9, 23,  9, 23,  9, 23, 18, 14, 18, 14, 18, 14, 18, 14,  5, 27,  5, 27,  5, 27,  5, 27, 10, 22, 10, 22, 10, 22, 10, 22
+                                db 27,  5, 27,  5, 27,  5, 27,  5,  4, 28,  4, 28,  4, 28,  4, 28, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode25,      db  2, 30,  2, 30,  2, 30,  2, 30,  4, 28,  4, 28,  4, 28,  4, 28,  6, 26,  6, 26,  6, 26,  6, 26,  8, 24,  8, 24,  8, 24,  8, 24
+const allAng4_fact_mode27_28,   db 30,  2, 30,  2, 30,  2, 30,  2, 28,  4, 28,  4, 28,  4, 28,  4, 27,  5, 27,  5, 27,  5, 27,  5, 22, 10, 22, 10, 22, 10, 22, 10
+                                db 26,  6, 26,  6, 26,  6, 26,  6, 24,  8, 24,  8, 24,  8, 24,  8, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode29_30,   db 23,  9, 23,  9, 23,  9, 23,  9, 14, 18, 14, 18, 14, 18, 14, 18, 19, 13, 19, 13, 19, 13, 19, 13,  6, 26,  6, 26,  6, 26,  6, 26
+                                db  5, 27,  5, 27,  5, 27,  5, 27, 28,  4, 28,  4, 28,  4, 28,  4, 25,  7, 25,  7, 25,  7, 25,  7, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode31_32,   db 15, 17, 15, 17, 15, 17, 15, 17, 30,  2, 30,  2, 30,  2, 30,  2, 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10
+                                db 13, 19, 13, 19, 13, 19, 13, 19, 28,  4, 28,  4, 28,  4, 28,  4,  1, 31,  1, 31,  1, 31,  1, 31, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode33,      db  6, 26,  6, 26,  6, 26,  6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24,  8, 24,  8, 24,  8, 24,  8
 
 
 SECTION .text
@@ -23075,80 +23076,69 @@
 ; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
 ;-----------------------------------------------------------------------------
 INIT_YMM avx2
-cglobal all_angs_pred_4x4, 4, 4, 6
+cglobal all_angs_pred_4x4, 2, 2, 6
 
     mova           m5, [pw_1024]
-    lea            r2, [all_ang4]
-    lea            r3, [all_ang4_shuff]
 
 ; mode 2
 
     vbroadcasti128 m0, [r1 + 9]
-    mova           xm1, xm0
-    psrldq         xm1, 1
-    pshufb         xm1, [r3]
+    pshufb         m1, m0, [allAng4_shuf_mode2]
     movu           [r0], xm1
 
 ; mode 3
 
-    pshufb         m1, m0, [r3 + 1 * mmsize]
-    pmaddubsw      m1, [r2]
+    pshufb         m1, m0, [allAng4_shuf_mode3_4]
+    pmaddubsw      m1, [allAng4_fact_mode3_4]
     pmulhrsw       m1, m5
 
 ; mode 4
 
-    pshufb         m2, m0, [r3 + 2 * mmsize]
-    pmaddubsw      m2, [r2 + 1 * mmsize]
+    pshufb         m2, m0, [allAng4_shuf_mode3_4 + mmsize]
+    pmaddubsw      m2, [allAng4_fact_mode3_4 + mmsize]
     pmulhrsw       m2, m5
     packuswb       m1, m2
-    vpermq         m1, m1, 11011000b
     movu           [r0 + (3 - 2) * 16], m1
 
 ; mode 5
 
-    pshufb         m1, m0, [r3 + 2 * mmsize]
-    pmaddubsw      m1, [r2 + 2 * mmsize]
+    pshufb         m1, m0, [allAng4_shuf_mode5_6]
+    pmaddubsw      m1, [allAng4_fact_mode5_6]
     pmulhrsw       m1, m5
 
 ; mode 6
 
-    pshufb         m2, m0, [r3 + 3 * mmsize]
-    pmaddubsw      m2, [r2 + 3 * mmsize]
+    pshufb         m2, m0, [allAng4_shuf_mode5_6 + mmsize]
+    pmaddubsw      m2, [allAng4_fact_mode5_6 + mmsize]
     pmulhrsw       m2, m5
     packuswb       m1, m2
-    vpermq         m1, m1, 11011000b
     movu           [r0 + (5 - 2) * 16], m1
 
-    add            r3, 4 * mmsize
-    add            r2, 4 * mmsize
-
 ; mode 7
 
-    pshufb         m1, m0, [r3 + 0 * mmsize]
-    pmaddubsw      m1, [r2 + 0 * mmsize]
+    pshufb         m3, m0, [allAng4_shuf_mode7_8]
+    pmaddubsw      m1, m3, [allAng4_fact_mode7_8]
     pmulhrsw       m1, m5
 
 ; mode 8
 
-    pshufb         m2, m0, [r3 + 1 * mmsize]
-    pmaddubsw      m2, [r2 + 1 * mmsize]
+    pshufb         m2, m0, [allAng4_shuf_mode7_8 + mmsize]
+    pmaddubsw      m2, [allAng4_fact_mode7_8 + mmsize]
     pmulhrsw       m2, m5
     packuswb       m1, m2
-    vpermq         m1, m1, 11011000b
     movu           [r0 + (7 - 2) * 16], m1

 
@@ -27,62 +27,63 @@
 
 SECTION_RODATA 32
 
-all_ang4_shuff: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-                db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7
-                db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4
-                db 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12
-                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 4, 0, 0, 9, 9, 10, 10, 11
-                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11
-                db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 4, 2, 2, 0, 0, 9, 9, 10
-                db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 3, 2, 2, 0, 0, 9, 9, 10
-                db 0, 9, 9, 10, 10, 11, 11, 12, 1, 0, 0, 9, 9, 10, 10, 11, 2, 1, 1, 0, 0, 9, 9, 10, 4, 2, 2, 1, 1, 0, 0, 9
-                db 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0, 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0
-                db 0, 1, 1, 2, 2, 3, 3, 4, 9, 0, 0, 1, 1, 2, 2, 3, 10, 9, 9, 0, 0, 1, 1, 2, 12, 10, 10, 9, 9, 0, 0, 1
-                db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 11, 10, 10, 0, 0, 1, 1, 2
-                db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 12, 10, 10, 0, 0, 1, 1, 2
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 12, 0, 0, 1, 1, 2, 2, 3
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4
-                db 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4
-                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
-                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6
-                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6
-                db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7
-                db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7, 4, 5, 5, 6, 6, 7, 7, 8
-                db 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8
-
-all_ang4: db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8
-          db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20
-          db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
-          db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
-          db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
-          db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
-          db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
-          db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
-          db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
-          db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
-          db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
-          db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
-          db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
-          db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
-          db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
-          db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
-          db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
-          db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
-          db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
-          db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
-          db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
-          db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
-          db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
-          db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
-          db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
-          db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
-          db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20
-          db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8
+const allAng4_shuf_mode2,       db  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7
+const allAng4_shuf_mode3_4,     db  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5,  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5
+                                db  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7,  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6
+const allAng4_shuf_mode5_6,     db  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+                                db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5
+const allAng4_shuf_mode7_8,     db  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+                                db  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+const allAng4_shuf_mode10,      db  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3
+const allAng4_shuf_mode11_12,   db  0,  9,  9, 10, 10, 11, 11, 12,  0,  9,  9, 10, 10, 11, 11, 12,  0,  9,  9, 10, 10, 11, 11, 12,  0,  9,  9, 10, 10, 11, 11, 12
+const allAng4_shuf_mode13_14,   db  0,  9,  9, 10, 10, 11, 11, 12,  4,  0,  0,  9,  9, 10, 10, 11,  2,  0,  0,  9,  9, 10, 10, 11,  2,  0,  0,  9,  9, 10, 10, 11
+const allAng4_shuf_mode15_16,   db  0,  9,  9, 10, 10, 11, 11, 12,  2,  0,  0,  9,  9, 10, 10, 11,  0,  9,  9, 10, 10, 11, 11, 12,  2,  0,  0,  9,  9, 10, 10, 11
+                                db  2,  0,  0,  9,  9, 10, 10, 11,  4,  2,  2,  0,  0,  9,  9, 10,  2,  0,  0,  9,  9, 10, 10, 11,  3,  2,  2,  0,  0,  9,  9, 10
+const allAng4_shuf_mode17,      db  0,  9,  9, 10, 10, 11, 11, 12,  1,  0,  0,  9,  9, 10, 10, 11,  2,  1,  1,  0,  0,  9,  9, 10,  4,  2,  2,  1,  1,  0,  0,  9
+                                db  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0,  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0
+const allAng4_shuf_mode18,      db  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0,  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0
+const allAng4_shuf_mode19_20,   db  0,  1,  1,  2,  2,  3,  3,  4,  9,  0,  0,  1,  1,  2,  2,  3,  0,  1,  1,  2,  2,  3,  3,  4, 10,  0,  0,  1,  1,  2,  2,  3
+                                db 10,  9,  9,  0,  0,  1,  1,  2, 12, 10, 10,  9,  9,  0,  0,  1, 10,  0,  0,  1,  1,  2,  2,  3, 11, 10, 10,  0,  0,  1,  1,  2
+const allAng4_shuf_mode21_22,   db  0,  1,  1,  2,  2,  3,  3,  4, 10,  0,  0,  1,  1,  2,  2,  3,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+                                db 10,  0,  0,  1,  1,  2,  2,  3, 12, 10, 10,  0,  0,  1,  1,  2, 10,  0,  0,  1,  1,  2,  2,  3, 10,  0,  0,  1,  1,  2,  2,  3
+const allAng4_shuf_mode23_24,   db  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+                                db  0,  1,  1,  2,  2,  3,  3,  4, 12,  0,  0,  1,  1,  2,  2,  3,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+const allAng4_shuf_mode26,      db  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4
+const allAng4_shuf_mode27_28,   db  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5
+const allAng4_shuf_mode29_30,   db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  2,  3,  3,  4,  4,  5,  5,  6,  2,  3,  3,  4,  4,  5,  5,  6
+const allAng4_shuf_mode31_32,   db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6
+                                db  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7,  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7
+const allAng4_shuf_mode33,      db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7,  4,  5,  5,  6,  6,  7,  7,  8
+const allAng4_shuf_mode34,      db  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7,  5,  6,  7,  8,  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7,  5,  6,  7,  8
+
+const allAng4_fact_mode3_4,     db  6, 26,  6, 26,  6, 26,  6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10
+                                db 18, 14, 18, 14, 18, 14, 18, 14, 24,  8, 24,  8, 24,  8, 24,  8,  1, 31,  1, 31,  1, 31,  1, 31, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode5_6,     db 15, 17, 15, 17, 15, 17, 15, 17, 30,  2, 30,  2, 30,  2, 30,  2, 19, 13, 19, 13, 19, 13, 19, 13,  6, 26,  6, 26,  6, 26,  6, 26
+                                db 13, 19, 13, 19, 13, 19, 13, 19, 28,  4, 28,  4, 28,  4, 28,  4, 25,  7, 25,  7, 25,  7, 25,  7, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode7_8,     db 23,  9, 23,  9, 23,  9, 23,  9, 14, 18, 14, 18, 14, 18, 14, 18, 27,  5, 27,  5, 27,  5, 27,  5, 22, 10, 22, 10, 22, 10, 22, 10
+                                db  5, 27,  5, 27,  5, 27,  5, 27, 28,  4, 28,  4, 28,  4, 28,  4, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode9,       db 30,  2, 30,  2, 30,  2, 30,  2, 28,  4, 28,  4, 28,  4, 28,  4, 26,  6, 26,  6, 26,  6, 26,  6, 24,  8, 24,  8, 24,  8, 24,  8
+const allAng4_fact_mode11_12,   db  2, 30,  2, 30,  2, 30,  2, 30,  4, 28,  4, 28,  4, 28,  4, 28,  5, 27,  5, 27,  5, 27,  5, 27, 10, 22, 10, 22, 10, 22, 10, 22
+                                db  6, 26,  6, 26,  6, 26,  6, 26,  8, 24,  8, 24,  8, 24,  8, 24, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode13_14,   db  9, 23,  9, 23,  9, 23,  9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 13, 19, 13, 19, 13, 19, 13, 19, 26,  6, 26,  6, 26,  6, 26,  6
+                                db 27,  5, 27,  5, 27,  5, 27,  5,  4, 28,  4, 28,  4, 28,  4, 28,  7, 25,  7, 25,  7, 25,  7, 25, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode15_16,   db 17, 15, 17, 15, 17, 15, 17, 15,  2, 30,  2, 30,  2, 30,  2, 30, 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22
+                                db 19, 13, 19, 13, 19, 13, 19, 13,  4, 28,  4, 28,  4, 28,  4, 28, 31,  1, 31,  1, 31,  1, 31,  1, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode17,      db 26,  6, 26,  6, 26,  6, 26,  6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18,  8, 24,  8, 24,  8, 24,  8, 24
+const allAng4_fact_mode19_20,   db 26,  6, 26,  6, 26,  6, 26,  6, 20, 12, 20, 12, 20, 12, 20, 12, 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22
+                                db 14, 18, 14, 18, 14, 18, 14, 18,  8, 24,  8, 24,  8, 24,  8, 24, 31,  1, 31,  1, 31,  1, 31,  1, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode21_22,   db 17, 15, 17, 15, 17, 15, 17, 15,  2, 30,  2, 30,  2, 30,  2, 30, 13, 19, 13, 19, 13, 19, 13, 19, 26,  6, 26,  6, 26,  6, 26,  6
+                                db 19, 13, 19, 13, 19, 13, 19, 13,  4, 28,  4, 28,  4, 28,  4, 28,  7, 25,  7, 25,  7, 25,  7, 25, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode23_24,   db  9, 23,  9, 23,  9, 23,  9, 23, 18, 14, 18, 14, 18, 14, 18, 14,  5, 27,  5, 27,  5, 27,  5, 27, 10, 22, 10, 22, 10, 22, 10, 22
+                                db 27,  5, 27,  5, 27,  5, 27,  5,  4, 28,  4, 28,  4, 28,  4, 28, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode25,      db  2, 30,  2, 30,  2, 30,  2, 30,  4, 28,  4, 28,  4, 28,  4, 28,  6, 26,  6, 26,  6, 26,  6, 26,  8, 24,  8, 24,  8, 24,  8, 24
+const allAng4_fact_mode27_28,   db 30,  2, 30,  2, 30,  2, 30,  2, 28,  4, 28,  4, 28,  4, 28,  4, 27,  5, 27,  5, 27,  5, 27,  5, 22, 10, 22, 10, 22, 10, 22, 10
+                                db 26,  6, 26,  6, 26,  6, 26,  6, 24,  8, 24,  8, 24,  8, 24,  8, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode29_30,   db 23,  9, 23,  9, 23,  9, 23,  9, 14, 18, 14, 18, 14, 18, 14, 18, 19, 13, 19, 13, 19, 13, 19, 13,  6, 26,  6, 26,  6, 26,  6, 26
+                                db  5, 27,  5, 27,  5, 27,  5, 27, 28,  4, 28,  4, 28,  4, 28,  4, 25,  7, 25,  7, 25,  7, 25,  7, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode31_32,   db 15, 17, 15, 17, 15, 17, 15, 17, 30,  2, 30,  2, 30,  2, 30,  2, 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10
+                                db 13, 19, 13, 19, 13, 19, 13, 19, 28,  4, 28,  4, 28,  4, 28,  4,  1, 31,  1, 31,  1, 31,  1, 31, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode33,      db  6, 26,  6, 26,  6, 26,  6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24,  8, 24,  8, 24,  8, 24,  8
 
 
 SECTION .text
@@ -23075,80 +23076,69 @@
 ; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
 ;-----------------------------------------------------------------------------
 INIT_YMM avx2
-cglobal all_angs_pred_4x4, 4, 4, 6
+cglobal all_angs_pred_4x4, 2, 2, 6
 
     mova           m5, [pw_1024]
-    lea            r2, [all_ang4]
-    lea            r3, [all_ang4_shuff]
 
 ; mode 2
 
     vbroadcasti128 m0, [r1 + 9]
-    mova           xm1, xm0
-    psrldq         xm1, 1
-    pshufb         xm1, [r3]
+    pshufb         m1, m0, [allAng4_shuf_mode2]
     movu           [r0], xm1
 
 ; mode 3
 
-    pshufb         m1, m0, [r3 + 1 * mmsize]
-    pmaddubsw      m1, [r2]
+    pshufb         m1, m0, [allAng4_shuf_mode3_4]
+    pmaddubsw      m1, [allAng4_fact_mode3_4]
     pmulhrsw       m1, m5
 
 ; mode 4
 
-    pshufb         m2, m0, [r3 + 2 * mmsize]
-    pmaddubsw      m2, [r2 + 1 * mmsize]
+    pshufb         m2, m0, [allAng4_shuf_mode3_4 + mmsize]
+    pmaddubsw      m2, [allAng4_fact_mode3_4 + mmsize]
     pmulhrsw       m2, m5
     packuswb       m1, m2
-    vpermq         m1, m1, 11011000b
     movu           [r0 + (3 - 2) * 16], m1
 
 ; mode 5
 
-    pshufb         m1, m0, [r3 + 2 * mmsize]
-    pmaddubsw      m1, [r2 + 2 * mmsize]
+    pshufb         m1, m0, [allAng4_shuf_mode5_6]
+    pmaddubsw      m1, [allAng4_fact_mode5_6]
     pmulhrsw       m1, m5
 
 ; mode 6
 
-    pshufb         m2, m0, [r3 + 3 * mmsize]
-    pmaddubsw      m2, [r2 + 3 * mmsize]
+    pshufb         m2, m0, [allAng4_shuf_mode5_6 + mmsize]
+    pmaddubsw      m2, [allAng4_fact_mode5_6 + mmsize]
     pmulhrsw       m2, m5
     packuswb       m1, m2
-    vpermq         m1, m1, 11011000b
     movu           [r0 + (5 - 2) * 16], m1
 
-    add            r3, 4 * mmsize
-    add            r2, 4 * mmsize
-
 ; mode 7
 
-    pshufb         m1, m0, [r3 + 0 * mmsize]
-    pmaddubsw      m1, [r2 + 0 * mmsize]
+    pshufb         m3, m0, [allAng4_shuf_mode7_8]
+    pmaddubsw      m1, m3, [allAng4_fact_mode7_8]
     pmulhrsw       m1, m5
 
 ; mode 8
 
-    pshufb         m2, m0, [r3 + 1 * mmsize]
-    pmaddubsw      m2, [r2 + 1 * mmsize]
+    pshufb         m2, m0, [allAng4_shuf_mode7_8 + mmsize]
+    pmaddubsw      m2, [allAng4_fact_mode7_8 + mmsize]
     pmulhrsw       m2, m5
     packuswb       m1, m2
-    vpermq         m1, m1, 11011000b
     movu           [r0 + (7 - 2) * 16], m1
 
​

x265_1.8.tar.gz/source/common/x86/ipfilter16.asm -> x265_1.9.tar.gz/source/common/x86/ipfilter16.asm Changed

@@ -4869,7 +4869,7 @@
 %ifidn %2,pp
     vbroadcasti128  m8, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova            m8, [INTERP_OFFSET_SP]
+    vbroadcasti128  m8, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m8, [INTERP_OFFSET_PS]
 %endif
@@ -5011,11 +5011,11 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -5183,11 +5183,11 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -5325,11 +5325,11 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -5456,11 +5456,11 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -5609,11 +5609,11 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -5732,11 +5732,11 @@
     mov       r4d, 32
 
 %ifidn %1, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %1, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %1, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -6068,7 +6068,7 @@
 %ifidn %1,pp
     vbroadcasti128  m6, [pd_32]
 %elifidn %1, sp
-    mova            m6, [pd_524800]
+    vbroadcasti128  m6, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m6, [INTERP_OFFSET_PS]
 %endif
@@ -6178,7 +6178,7 @@
 %ifidn %1,pp
     vbroadcasti128  m11, [pd_32]
 %elifidn %1, sp
-    mova            m11, [pd_524800]
+    vbroadcasti128  m11, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m11, [INTERP_OFFSET_PS]
 %endif
@@ -6816,7 +6816,7 @@
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %1, sp
-    mova            m14, [INTERP_OFFSET_SP]
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -6867,7 +6867,7 @@
 %ifidn %3,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %3, sp
-    mova            m14, [INTERP_OFFSET_SP]
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -6950,7 +6950,7 @@
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %1, sp
-    mova            m14, [INTERP_OFFSET_SP]
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -7597,7 +7597,7 @@
 %ifidn %1,pp
     vbroadcasti128  m11, [pd_32]
 %elifidn %1, sp
-    mova            m11, [INTERP_OFFSET_SP]
+    vbroadcasti128  m11, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m11, [INTERP_OFFSET_PS]
 %endif
@@ -7644,7 +7644,7 @@
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %1, sp
-    mova            m14, [INTERP_OFFSET_SP]
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -7816,7 +7816,7 @@
 %ifidn %1,pp
     vbroadcasti128  m7, [pd_32]
 %elifidn %1, sp
-    mova            m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
@@ -7861,7 +7861,7 @@
 %ifidn %1,pp
     vbroadcasti128  m7, [pd_32]
 %elifidn %1, sp
-    mova            m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
@@ -7901,7 +7901,7 @@
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %1, sp
-    mova            m14, [INTERP_OFFSET_SP]
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -8248,7 +8248,7 @@
 %ifidn %1,pp
     vbroadcasti128  m7, [pd_32]
 %elifidn %1, sp
-    mova            m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
@@ -8668,7 +8668,7 @@
 %ifidn %1,pp

 
@@ -4869,7 +4869,7 @@
 %ifidn %2,pp
     vbroadcasti128  m8, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova            m8, [INTERP_OFFSET_SP]
+    vbroadcasti128  m8, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m8, [INTERP_OFFSET_PS]
 %endif
@@ -5011,11 +5011,11 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -5183,11 +5183,11 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -5325,11 +5325,11 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -5456,11 +5456,11 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -5609,11 +5609,11 @@
     mov       r4d, %1/2
 
 %ifidn %2, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %2, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %2, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -5732,11 +5732,11 @@
     mov       r4d, 32
 
 %ifidn %1, pp
-    mova      m7, [INTERP_OFFSET_PP]
+    vbroadcasti128  m7, [INTERP_OFFSET_PP]
 %elifidn %1, sp
-    mova      m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %elifidn %1, ps
-    mova      m7, [INTERP_OFFSET_PS]
+    vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
 
 .loopH:
@@ -6068,7 +6068,7 @@
 %ifidn %1,pp
     vbroadcasti128  m6, [pd_32]
 %elifidn %1, sp
-    mova            m6, [pd_524800]
+    vbroadcasti128  m6, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m6, [INTERP_OFFSET_PS]
 %endif
@@ -6178,7 +6178,7 @@
 %ifidn %1,pp
     vbroadcasti128  m11, [pd_32]
 %elifidn %1, sp
-    mova            m11, [pd_524800]
+    vbroadcasti128  m11, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m11, [INTERP_OFFSET_PS]
 %endif
@@ -6816,7 +6816,7 @@
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %1, sp
-    mova            m14, [INTERP_OFFSET_SP]
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -6867,7 +6867,7 @@
 %ifidn %3,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %3, sp
-    mova            m14, [INTERP_OFFSET_SP]
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -6950,7 +6950,7 @@
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %1, sp
-    mova            m14, [INTERP_OFFSET_SP]
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -7597,7 +7597,7 @@
 %ifidn %1,pp
     vbroadcasti128  m11, [pd_32]
 %elifidn %1, sp
-    mova            m11, [INTERP_OFFSET_SP]
+    vbroadcasti128  m11, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m11, [INTERP_OFFSET_PS]
 %endif
@@ -7644,7 +7644,7 @@
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %1, sp
-    mova            m14, [INTERP_OFFSET_SP]
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -7816,7 +7816,7 @@
 %ifidn %1,pp
     vbroadcasti128  m7, [pd_32]
 %elifidn %1, sp
-    mova            m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
@@ -7861,7 +7861,7 @@
 %ifidn %1,pp
     vbroadcasti128  m7, [pd_32]
 %elifidn %1, sp
-    mova            m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
@@ -7901,7 +7901,7 @@
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
 %elifidn %1, sp
-    mova            m14, [INTERP_OFFSET_SP]
+    vbroadcasti128  m14, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m14, [INTERP_OFFSET_PS]
 %endif
@@ -8248,7 +8248,7 @@
 %ifidn %1,pp
     vbroadcasti128  m7, [pd_32]
 %elifidn %1, sp
-    mova            m7, [INTERP_OFFSET_SP]
+    vbroadcasti128  m7, [INTERP_OFFSET_SP]
 %else
     vbroadcasti128  m7, [INTERP_OFFSET_PS]
 %endif
@@ -8668,7 +8668,7 @@
 %ifidn %1,pp
​

x265_1.8.tar.gz/source/common/x86/ipfilter8.asm -> x265_1.9.tar.gz/source/common/x86/ipfilter8.asm Changed

@@ -12541,6 +12541,459 @@
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
 ;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal filterPixelToShort_16x4, 3, 4, 2
+    mov             r3d, r3m
+    add             r3d, r3d
+
+    ; load constant
+    vbroadcasti128  m1, [pw_2000]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0
+
+    lea             r1, [r1 * 3]
+    lea             r3, [r3 * 3]
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+    RET
+
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal filterPixelToShort_16x8, 3, 6, 2
+    mov             r3d, r3m
+    add             r3d, r3d
+    lea             r4, [r1 * 3]
+    lea             r5, [r3 * 3]
+
+    ; load constant
+    vbroadcasti128  m1, [pw_2000]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0
+
+    pmovzxbw        m0, [r0 + r4]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r5], m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0
+
+    pmovzxbw        m0, [r0 + r4]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r5], m0
+    RET
+
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal filterPixelToShort_16x12, 3, 6, 2
+    mov             r3d, r3m
+    add             r3d, r3d
+    lea             r4, [r1 * 3]
+    lea             r5, [r3 * 3]
+
+    ; load constant
+    vbroadcasti128  m1, [pw_2000]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0
+
+    pmovzxbw        m0, [r0 + r4]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r5], m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0
+
+    pmovzxbw        m0, [r0 + r4]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r5], m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0
+
+    pmovzxbw        m0, [r0 + r4]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r5], m0
+    RET
+
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal filterPixelToShort_16x16, 3, 6, 2
+    mov             r3d, r3m
+    add             r3d, r3d
+    lea             r4, [r1 * 3]
+    lea             r5, [r3 * 3]
+
+    ; load constant
+    vbroadcasti128  m1, [pw_2000]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0

 
@@ -12541,6 +12541,459 @@
 ;-----------------------------------------------------------------------------
 ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
 ;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal filterPixelToShort_16x4, 3, 4, 2
+    mov             r3d, r3m
+    add             r3d, r3d
+
+    ; load constant
+    vbroadcasti128  m1, [pw_2000]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0
+
+    lea             r1, [r1 * 3]
+    lea             r3, [r3 * 3]
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+    RET
+
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal filterPixelToShort_16x8, 3, 6, 2
+    mov             r3d, r3m
+    add             r3d, r3d
+    lea             r4, [r1 * 3]
+    lea             r5, [r3 * 3]
+
+    ; load constant
+    vbroadcasti128  m1, [pw_2000]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0
+
+    pmovzxbw        m0, [r0 + r4]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r5], m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0
+
+    pmovzxbw        m0, [r0 + r4]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r5], m0
+    RET
+
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal filterPixelToShort_16x12, 3, 6, 2
+    mov             r3d, r3m
+    add             r3d, r3d
+    lea             r4, [r1 * 3]
+    lea             r5, [r3 * 3]
+
+    ; load constant
+    vbroadcasti128  m1, [pw_2000]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0
+
+    pmovzxbw        m0, [r0 + r4]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r5], m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0
+
+    pmovzxbw        m0, [r0 + r4]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r5], m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0
+
+    pmovzxbw        m0, [r0 + r4]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r5], m0
+    RET
+
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride)
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal filterPixelToShort_16x16, 3, 6, 2
+    mov             r3d, r3m
+    add             r3d, r3d
+    lea             r4, [r1 * 3]
+    lea             r5, [r3 * 3]
+
+    ; load constant
+    vbroadcasti128  m1, [pw_2000]
+
+    pmovzxbw        m0, [r0]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2], m0
+
+    pmovzxbw        m0, [r0 + r1]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3], m0
+
+    pmovzxbw        m0, [r0 + r1 * 2]
+    psllw           m0, 6
+    psubw           m0, m1
+    movu            [r2 + r3 * 2], m0
​

x265_1.8.tar.gz/source/common/x86/loopfilter.asm -> x265_1.9.tar.gz/source/common/x86/loopfilter.asm Changed

@@ -26,24 +26,28 @@
 ;*****************************************************************************/
 
 %include "x86inc.asm"
+%include "x86util.asm"
 
 SECTION_RODATA 32
 pb_31:      times 32 db 31
 pb_124:     times 32 db 124
 pb_15:      times 32 db 15
-pb_movemask_32:  times 32 db 0x00
-                 times 32 db 0xFF
 
 SECTION .text
 cextern pb_1
-cextern pb_128
 cextern pb_2
+cextern pb_3
+cextern pb_4
+cextern pb_01
+cextern pb_128
+cextern pw_1
+cextern pw_n1
 cextern pw_2
+cextern pw_4
 cextern pw_pixel_max
 cextern pb_movemask
-cextern pw_1
+cextern pb_movemask_32
 cextern hmul_16p
-cextern pb_4
 
 
 ;============================================================================================================
@@ -1989,79 +1993,94 @@
 %endif
 
 ;--------------------------------------------------------------------------------------------------------------------------
-; saoCuStatsBO_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+; saoCuStatsBO_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
 ;--------------------------------------------------------------------------------------------------------------------------
 %if ARCH_X86_64
 INIT_XMM sse4
-cglobal saoCuStatsBO, 7,12,6
-    mova        m3, [hmul_16p + 16]
-    mova        m4, [pb_124]
-    mova        m5, [pb_4]
-    xor         r7d, r7d
+cglobal saoCuStatsBO, 7,13,2
+    mova        m0, [pb_124]
+    add         r5, 4
+    add         r6, 4
 
 .loopH:
-    mov         r10, r0
+    mov         r12, r0
     mov         r11, r1
     mov         r9d, r3d
+
 .loopL:
     movu        m1, [r11]
-    movu        m0, [r10]
+    psrlw       m1, 1                   ; rec[x] >> boShift
+    pand        m1, m0
 
-    punpckhbw   m2, m0, m1
-    punpcklbw   m0, m1
-    psrlw       m1, 1               ; rec[x] >> boShift
-    pmaddubsw   m2, m3
-    pmaddubsw   m0, m3
-    pand        m1, m4
-    paddb       m1, m5
+    cmp         r9d, 8
+    jle        .proc8
 
+    movq        r10, m1
 %assign x 0
-%rep 16
-    pextrb      r7d, m1, x
+%rep 8
+    movzx       r7d, r10b
+    shr         r10, 8
 
-%if (x < 8)
-    pextrw      r8d, m0, (x % 8)
-%else
-    pextrw      r8d, m2, (x % 8)
-%endif
-    movsx       r8d, r8w
-    inc         dword  [r6 + r7]    ; count[classIdx]++
-    add         [r5 + r7], r8d      ; stats[classIdx] += (fenc[x] - rec[x]);
+    movsx       r8d, word [r12 + x*2]   ; diff[x]
+    inc         dword  [r6 + r7]        ; count[classIdx]++
+    add         [r5 + r7], r8d          ; stats[classIdx] += (fenc[x] - rec[x]);
+%assign x x+1
+%endrep
+    movhlps     m1, m1
+    sub         r9d, 8
+    add         r12, 8*2
+
+.proc8:
+    movq        r10, m1
+%assign x 0
+%rep 8
+    movzx       r7d, r10b
+    shr         r10, 8
+
+    movsx       r8d, word [r12 + x*2]   ; diff[x]
+    inc         dword  [r6 + r7]        ; count[classIdx]++
+    add         [r5 + r7], r8d          ; stats[classIdx] += (fenc[x] - rec[x]);
     dec         r9d
-    jz          .next
+    jz         .next
 %assign x x+1
 %endrep
 
-    add         r10, 16
+    add         r12, 8*2
     add         r11, 16
-    jmp         .loopL
+    jmp        .loopL
 
 .next:
-    add         r0, r2
+    add         r0, 64*2                ; MAX_CU_SIZE
     add         r1, r2
     dec         r4d
-    jnz         .loopH
+    jnz        .loopH
     RET
 %endif
 
 ;-----------------------------------------------------------------------------------------------------------------------
-; saoCuStatsE0(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+; saoCuStatsE0(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
 ;-----------------------------------------------------------------------------------------------------------------------
 %if ARCH_X86_64
 INIT_XMM sse4
-cglobal saoCuStatsE0, 5,9,8, 0-32
+cglobal saoCuStatsE0, 3,10,6, 0-32
     mov         r3d, r3m
-    mov         r8, r5mp
+    mov         r4d, r4m
+    mov         r9, r5mp
 
     ; clear internal temporary buffer
     pxor        m0, m0
     mova        [rsp], m0
     mova        [rsp + mmsize], m0
     mova        m4, [pb_128]
-    mova        m5, [hmul_16p + 16]
-    mova        m6, [pb_2]
+    mova        m5, [pb_2]
     xor         r7d, r7d
 
+    ; correct stride for diff[] and rec
+    mov         r6d, r3d
+    and         r6d, ~15
+    sub         r2, r6
+    lea         r8, [(r6 - 64) * 2]             ; 64 = MAX_CU_SIZE
+
 .loopH:
     mov         r5d, r3d
 
@@ -2075,100 +2094,257 @@
     pinsrb      m0, r7d, 15
 
 .loopL:
-    movu        m7, [r1]
+    movu        m3, [r1]
     movu        m2, [r1 + 1]
 
-    pxor        m1, m7, m4
-    pxor        m3, m2, m4
-    pcmpgtb     m2, m1, m3
-    pcmpgtb     m3, m1
-    pand        m2, [pb_1]
-    por         m2, m3              ; signRight
+    pxor        m1, m3, m4
+    pxor        m2, m4
+    pcmpgtb     m3, m1, m2
+    pcmpgtb     m2, m1
+    pand        m3, [pb_1]
+    por         m2, m3                          ; signRight
 
     palignr     m3, m2, m0, 15
-    psignb      m3, m4              ; signLeft
+    psignb      m3, m4                          ; signLeft
 
     mova        m0, m2
     paddb       m2, m3
-    paddb       m2, m6              ; edgeType
+    paddb       m2, m5                          ; edgeType
 
     ; stats[edgeType]
-    movu        m3, [r0]            ; fenc[0-15]
-    punpckhbw   m1, m3, m7
-    punpcklbw   m3, m7
-    pmaddubsw   m1, m5
-    pmaddubsw   m3, m5

 
@@ -26,24 +26,28 @@
 ;*****************************************************************************/
 
 %include "x86inc.asm"
+%include "x86util.asm"
 
 SECTION_RODATA 32
 pb_31:      times 32 db 31
 pb_124:     times 32 db 124
 pb_15:      times 32 db 15
-pb_movemask_32:  times 32 db 0x00
-                 times 32 db 0xFF
 
 SECTION .text
 cextern pb_1
-cextern pb_128
 cextern pb_2
+cextern pb_3
+cextern pb_4
+cextern pb_01
+cextern pb_128
+cextern pw_1
+cextern pw_n1
 cextern pw_2
+cextern pw_4
 cextern pw_pixel_max
 cextern pb_movemask
-cextern pw_1
+cextern pb_movemask_32
 cextern hmul_16p
-cextern pb_4
 
 
 ;============================================================================================================
@@ -1989,79 +1993,94 @@
 %endif
 
 ;--------------------------------------------------------------------------------------------------------------------------
-; saoCuStatsBO_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+; saoCuStatsBO_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
 ;--------------------------------------------------------------------------------------------------------------------------
 %if ARCH_X86_64
 INIT_XMM sse4
-cglobal saoCuStatsBO, 7,12,6
-    mova        m3, [hmul_16p + 16]
-    mova        m4, [pb_124]
-    mova        m5, [pb_4]
-    xor         r7d, r7d
+cglobal saoCuStatsBO, 7,13,2
+    mova        m0, [pb_124]
+    add         r5, 4
+    add         r6, 4
 
 .loopH:
-    mov         r10, r0
+    mov         r12, r0
     mov         r11, r1
     mov         r9d, r3d
+
 .loopL:
     movu        m1, [r11]
-    movu        m0, [r10]
+    psrlw       m1, 1                   ; rec[x] >> boShift
+    pand        m1, m0
 
-    punpckhbw   m2, m0, m1
-    punpcklbw   m0, m1
-    psrlw       m1, 1               ; rec[x] >> boShift
-    pmaddubsw   m2, m3
-    pmaddubsw   m0, m3
-    pand        m1, m4
-    paddb       m1, m5
+    cmp         r9d, 8
+    jle        .proc8
 
+    movq        r10, m1
 %assign x 0
-%rep 16
-    pextrb      r7d, m1, x
+%rep 8
+    movzx       r7d, r10b
+    shr         r10, 8
 
-%if (x < 8)
-    pextrw      r8d, m0, (x % 8)
-%else
-    pextrw      r8d, m2, (x % 8)
-%endif
-    movsx       r8d, r8w
-    inc         dword  [r6 + r7]    ; count[classIdx]++
-    add         [r5 + r7], r8d      ; stats[classIdx] += (fenc[x] - rec[x]);
+    movsx       r8d, word [r12 + x*2]   ; diff[x]
+    inc         dword  [r6 + r7]        ; count[classIdx]++
+    add         [r5 + r7], r8d          ; stats[classIdx] += (fenc[x] - rec[x]);
+%assign x x+1
+%endrep
+    movhlps     m1, m1
+    sub         r9d, 8
+    add         r12, 8*2
+
+.proc8:
+    movq        r10, m1
+%assign x 0
+%rep 8
+    movzx       r7d, r10b
+    shr         r10, 8
+
+    movsx       r8d, word [r12 + x*2]   ; diff[x]
+    inc         dword  [r6 + r7]        ; count[classIdx]++
+    add         [r5 + r7], r8d          ; stats[classIdx] += (fenc[x] - rec[x]);
     dec         r9d
-    jz          .next
+    jz         .next
 %assign x x+1
 %endrep
 
-    add         r10, 16
+    add         r12, 8*2
     add         r11, 16
-    jmp         .loopL
+    jmp        .loopL
 
 .next:
-    add         r0, r2
+    add         r0, 64*2                ; MAX_CU_SIZE
     add         r1, r2
     dec         r4d
-    jnz         .loopH
+    jnz        .loopH
     RET
 %endif
 
 ;-----------------------------------------------------------------------------------------------------------------------
-; saoCuStatsE0(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+; saoCuStatsE0(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
 ;-----------------------------------------------------------------------------------------------------------------------
 %if ARCH_X86_64
 INIT_XMM sse4
-cglobal saoCuStatsE0, 5,9,8, 0-32
+cglobal saoCuStatsE0, 3,10,6, 0-32
     mov         r3d, r3m
-    mov         r8, r5mp
+    mov         r4d, r4m
+    mov         r9, r5mp
 
     ; clear internal temporary buffer
     pxor        m0, m0
     mova        [rsp], m0
     mova        [rsp + mmsize], m0
     mova        m4, [pb_128]
-    mova        m5, [hmul_16p + 16]
-    mova        m6, [pb_2]
+    mova        m5, [pb_2]
     xor         r7d, r7d
 
+    ; correct stride for diff[] and rec
+    mov         r6d, r3d
+    and         r6d, ~15
+    sub         r2, r6
+    lea         r8, [(r6 - 64) * 2]             ; 64 = MAX_CU_SIZE
+
 .loopH:
     mov         r5d, r3d
 
@@ -2075,100 +2094,257 @@
     pinsrb      m0, r7d, 15
 
 .loopL:
-    movu        m7, [r1]
+    movu        m3, [r1]
     movu        m2, [r1 + 1]
 
-    pxor        m1, m7, m4
-    pxor        m3, m2, m4
-    pcmpgtb     m2, m1, m3
-    pcmpgtb     m3, m1
-    pand        m2, [pb_1]
-    por         m2, m3              ; signRight
+    pxor        m1, m3, m4
+    pxor        m2, m4
+    pcmpgtb     m3, m1, m2
+    pcmpgtb     m2, m1
+    pand        m3, [pb_1]
+    por         m2, m3                          ; signRight
 
     palignr     m3, m2, m0, 15
-    psignb      m3, m4              ; signLeft
+    psignb      m3, m4                          ; signLeft
 
     mova        m0, m2
     paddb       m2, m3
-    paddb       m2, m6              ; edgeType
+    paddb       m2, m5                          ; edgeType
 
     ; stats[edgeType]
-    movu        m3, [r0]            ; fenc[0-15]
-    punpckhbw   m1, m3, m7
-    punpcklbw   m3, m7
-    pmaddubsw   m1, m5
-    pmaddubsw   m3, m5
​

x265_1.8.tar.gz/source/common/x86/loopfilter.h -> x265_1.9.tar.gz/source/common/x86/loopfilter.h Changed

@@ -3,6 +3,7 @@
  *
  * Authors: Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
  *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
+;*          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -35,14 +36,17 @@
     void PFX(saoCuOrgE3_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
     void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
     void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
-    void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
-    void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
-    void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
-    void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
-    void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsBO_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE0_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE1_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE2_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE3_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 
 DECL_SAO(sse4);
 DECL_SAO(avx2);
 
+void PFX(pelFilterLumaStrong_V_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
+void PFX(pelFilterLumaStrong_H_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
+
 #endif // ifndef X265_LOOPFILTER_H

 
@@ -3,6 +3,7 @@
  *
  * Authors: Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
  *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
+;*          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -35,14 +36,17 @@
     void PFX(saoCuOrgE3_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
     void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
     void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
-    void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
-    void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
-    void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
-    void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
-    void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsBO_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE0_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE1_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE2_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE3_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 
 DECL_SAO(sse4);
 DECL_SAO(avx2);
 
+void PFX(pelFilterLumaStrong_V_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
+void PFX(pelFilterLumaStrong_H_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
+
 #endif // ifndef X265_LOOPFILTER_H
​

x265_1.8.tar.gz/source/common/x86/mc-a.asm -> x265_1.9.tar.gz/source/common/x86/mc-a.asm Changed

@@ -2,6 +2,7 @@
 ;* mc-a.asm: x86 motion compensation
 ;*****************************************************************************
 ;* Copyright (C) 2003-2013 x264 project
+;* Copyright (C) 2013-2015 x265 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
@@ -3989,8 +3990,12 @@
     test dword r4m, 15
     jz pixel_avg_w%1_sse2
 %endif
+%if (%1 == 8)
+    jmp pixel_avg_w8_unaligned_sse2
+%else
     jmp pixel_avg_w%1_mmx2
 %endif
+%endif
 %endmacro
 
 ;-----------------------------------------------------------------------------
@@ -4049,6 +4054,32 @@
     lea     r4, [r4 + 4 * r5]
 %endmacro
 
+INIT_XMM sse2                           ; assemble the routine below with XMM registers and the sse2 suffix
+cglobal pixel_avg_w8_unaligned          ; 8-pixel-wide average of two sources, two rows per pass, no aligned accesses; a dispatch macro earlier in this file jumps here when its width argument is 8
+    AVG_START                           ; shared setup macro defined elsewhere in this file
+.height_loop:                           ; one pass handles the row at each pointer plus the row one stride below it
+%if HIGH_BIT_DEPTH
+    ; NO TEST BRANCH! NOTE(review): presumably means this HIGH_BIT_DEPTH path is untested -- confirm
+    movu    m0, [t2]                    ; source A, row 0 (unaligned full-register load)
+    movu    m1, [t2+SIZEOF_PIXEL*t3]    ; source A, row 1 (t3 = source A stride, scaled by SIZEOF_PIXEL)
+    movu    m2, [t4]                    ; source B, row 0
+    movu    m3, [t4+SIZEOF_PIXEL*t5]    ; source B, row 1 (t5 = source B stride, scaled by SIZEOF_PIXEL)
+    pavgw   m0, m2                      ; rounded average of unsigned 16-bit lanes, row 0
+    pavgw   m1, m3                      ; rounded average of unsigned 16-bit lanes, row 1
+    movu    [t0], m0                    ; store row 0 to the destination
+    movu    [t0+SIZEOF_PIXEL*t1], m1    ; store row 1 (t1 = destination stride, scaled by SIZEOF_PIXEL)
+%else ;!HIGH_BIT_DEPTH
+    movq    m0, [t2]                    ; source A, row 0 -> low 8 bytes of m0
+    movhps  m0, [t2+SIZEOF_PIXEL*t3]    ; source A, row 1 -> high 8 bytes of m0
+    movq    m1, [t4]                    ; source B, row 0 -> low 8 bytes of m1
+    movhps  m1, [t4+SIZEOF_PIXEL*t5]    ; source B, row 1 -> high 8 bytes of m1
+    pavgb   m0, m1                      ; rounded average of unsigned bytes, both rows in one instruction
+    movq    [t0], m0                    ; low half  -> destination row 0
+    movhps  [t0+SIZEOF_PIXEL*t1], m0    ; high half -> destination row 1
+%endif
+    AVG_END                             ; shared loop-tail macro defined elsewhere; presumably advances the pointers and branches back to .height_loop -- confirm
+
+
 ;-------------------------------------------------------------------------------------------------------------------------------
 ;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
 ;-------------------------------------------------------------------------------------------------------------------------------
@@ -4115,11 +4146,11 @@
 AVGH 4, 4
 AVGH 4, 2
 
-AVG_FUNC 8, movq, movq
-AVGH 8, 32
-AVGH 8, 16
-AVGH 8,  8
-AVGH 8,  4
+;AVG_FUNC 8, movq, movq
+;AVGH 8, 32
+;AVGH 8, 16
+;AVGH 8,  8
+;AVGH 8,  4
 
 AVG_FUNC 16, movq, movq
 AVGH 16, 64
@@ -4197,7 +4228,7 @@
 AVGH 4, 4
 AVGH 4, 2
 
-AVG_FUNC 8, movq, movq
+;AVG_FUNC 8, movq, movq
 AVGH 8, 32
 AVGH 8, 16
 AVGH 8,  8
@@ -4418,6 +4449,37 @@
     call pixel_avg_16x64_8bit
     call pixel_avg_16x64_8bit
     RET
+
+cglobal pixel_avg_48x64, 6,7,4          ; x86inc prologue: 6 arguments, 7 general-purpose registers, 4 vector registers
+   mov          r6d, 4                  ; outer counter: 4 passes x 8 unrolled steps x 2 rows = 64 rows
+.loop:
+%rep 8
+    movu        m0, [r2]                ; row 0, source A: first mmsize bytes (assumes mmsize is 32 here; the INIT_* line is outside this hunk -- confirm)
+    movu        xm2, [r2 + mmsize]      ; row 0, source A: following 16 bytes
+    movu        m1, [r4]                ; row 0, source B: first mmsize bytes
+    movu        xm3, [r4 + mmsize]      ; row 0, source B: following 16 bytes
+    pavgb       m0, m1                  ; rounded average of unsigned bytes
+    pavgb       xm2, xm3                ; same for the 16-byte tail
+    movu        [r0], m0                ; store row 0 to the destination
+    movu        [r0 + mmsize], xm2      ; store the row 0 tail
+
+    movu        m0, [r2 + r3]           ; row 1, source A (r3 = source A stride)
+    movu        xm2, [r2 + r3 + mmsize]
+    movu        m1, [r4 + r5]           ; row 1, source B (r5 = source B stride)
+    movu        xm3, [r4 + r5 + mmsize]
+    pavgb       m0, m1
+    pavgb       xm2, xm3
+    movu        [r0 + r1], m0           ; store row 1 (r1 = destination stride)
+    movu        [r0 + r1 + mmsize], xm2
+
+    lea         r2, [r2 + r3 * 2]       ; advance source A by two rows
+    lea         r4, [r4 + r5 * 2]       ; advance source B by two rows
+    lea         r0, [r0 + r1 * 2]       ; advance the destination by two rows
+%endrep
+
+    dec         r6d                     ; one 16-row pass finished
+    jnz         .loop                   ; repeat until all 4 passes are done
+    RET                                 ; x86inc epilogue and return
 %endif
 
 ;=============================================================================

 
@@ -2,6 +2,7 @@
 ;* mc-a.asm: x86 motion compensation
 ;*****************************************************************************
 ;* Copyright (C) 2003-2013 x264 project
+;* Copyright (C) 2013-2015 x265 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
@@ -3989,8 +3990,12 @@
     test dword r4m, 15
     jz pixel_avg_w%1_sse2
 %endif
+%if (%1 == 8)
+    jmp pixel_avg_w8_unaligned_sse2
+%else
     jmp pixel_avg_w%1_mmx2
 %endif
+%endif
 %endmacro
 
 ;-----------------------------------------------------------------------------
@@ -4049,6 +4054,32 @@
     lea     r4, [r4 + 4 * r5]
 %endmacro
 
+INIT_XMM sse2                           ; assemble the routine below with XMM registers and the sse2 suffix
+cglobal pixel_avg_w8_unaligned          ; 8-pixel-wide average of two sources, two rows per pass, no aligned accesses; a dispatch macro earlier in this file jumps here when its width argument is 8
+    AVG_START                           ; shared setup macro defined elsewhere in this file
+.height_loop:                           ; one pass handles the row at each pointer plus the row one stride below it
+%if HIGH_BIT_DEPTH
+    ; NO TEST BRANCH! NOTE(review): presumably means this HIGH_BIT_DEPTH path is untested -- confirm
+    movu    m0, [t2]                    ; source A, row 0 (unaligned full-register load)
+    movu    m1, [t2+SIZEOF_PIXEL*t3]    ; source A, row 1 (t3 = source A stride, scaled by SIZEOF_PIXEL)
+    movu    m2, [t4]                    ; source B, row 0
+    movu    m3, [t4+SIZEOF_PIXEL*t5]    ; source B, row 1 (t5 = source B stride, scaled by SIZEOF_PIXEL)
+    pavgw   m0, m2                      ; rounded average of unsigned 16-bit lanes, row 0
+    pavgw   m1, m3                      ; rounded average of unsigned 16-bit lanes, row 1
+    movu    [t0], m0                    ; store row 0 to the destination
+    movu    [t0+SIZEOF_PIXEL*t1], m1    ; store row 1 (t1 = destination stride, scaled by SIZEOF_PIXEL)
+%else ;!HIGH_BIT_DEPTH
+    movq    m0, [t2]                    ; source A, row 0 -> low 8 bytes of m0
+    movhps  m0, [t2+SIZEOF_PIXEL*t3]    ; source A, row 1 -> high 8 bytes of m0
+    movq    m1, [t4]                    ; source B, row 0 -> low 8 bytes of m1
+    movhps  m1, [t4+SIZEOF_PIXEL*t5]    ; source B, row 1 -> high 8 bytes of m1
+    pavgb   m0, m1                      ; rounded average of unsigned bytes, both rows in one instruction
+    movq    [t0], m0                    ; low half  -> destination row 0
+    movhps  [t0+SIZEOF_PIXEL*t1], m0    ; high half -> destination row 1
+%endif
+    AVG_END                             ; shared loop-tail macro defined elsewhere; presumably advances the pointers and branches back to .height_loop -- confirm
+
+
 ;-------------------------------------------------------------------------------------------------------------------------------
 ;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
 ;-------------------------------------------------------------------------------------------------------------------------------
@@ -4115,11 +4146,11 @@
 AVGH 4, 4
 AVGH 4, 2
 
-AVG_FUNC 8, movq, movq
-AVGH 8, 32
-AVGH 8, 16
-AVGH 8,  8
-AVGH 8,  4
+;AVG_FUNC 8, movq, movq
+;AVGH 8, 32
+;AVGH 8, 16
+;AVGH 8,  8
+;AVGH 8,  4
 
 AVG_FUNC 16, movq, movq
 AVGH 16, 64
@@ -4197,7 +4228,7 @@
 AVGH 4, 4
 AVGH 4, 2
 
-AVG_FUNC 8, movq, movq
+;AVG_FUNC 8, movq, movq
 AVGH 8, 32
 AVGH 8, 16
 AVGH 8,  8
@@ -4418,6 +4449,37 @@
     call pixel_avg_16x64_8bit
     call pixel_avg_16x64_8bit
     RET
+
+cglobal pixel_avg_48x64, 6,7,4          ; x86inc prologue: 6 arguments, 7 general-purpose registers, 4 vector registers
+   mov          r6d, 4                  ; outer counter: 4 passes x 8 unrolled steps x 2 rows = 64 rows
+.loop:
+%rep 8
+    movu        m0, [r2]                ; row 0, source A: first mmsize bytes (assumes mmsize is 32 here; the INIT_* line is outside this hunk -- confirm)
+    movu        xm2, [r2 + mmsize]      ; row 0, source A: following 16 bytes
+    movu        m1, [r4]                ; row 0, source B: first mmsize bytes
+    movu        xm3, [r4 + mmsize]      ; row 0, source B: following 16 bytes
+    pavgb       m0, m1                  ; rounded average of unsigned bytes
+    pavgb       xm2, xm3                ; same for the 16-byte tail
+    movu        [r0], m0                ; store row 0 to the destination
+    movu        [r0 + mmsize], xm2      ; store the row 0 tail
+
+    movu        m0, [r2 + r3]           ; row 1, source A (r3 = source A stride)
+    movu        xm2, [r2 + r3 + mmsize]
+    movu        m1, [r4 + r5]           ; row 1, source B (r5 = source B stride)
+    movu        xm3, [r4 + r5 + mmsize]
+    pavgb       m0, m1
+    pavgb       xm2, xm3
+    movu        [r0 + r1], m0           ; store row 1 (r1 = destination stride)
+    movu        [r0 + r1 + mmsize], xm2
+
+    lea         r2, [r2 + r3 * 2]       ; advance source A by two rows
+    lea         r4, [r4 + r5 * 2]       ; advance source B by two rows
+    lea         r0, [r0 + r1 * 2]       ; advance the destination by two rows
+%endrep
+
+    dec         r6d                     ; one 16-row pass finished
+    jnz         .loop                   ; repeat until all 4 passes are done
+    RET                                 ; x86inc epilogue and return
 %endif
 
 ;=============================================================================
​

x265_1.8.tar.gz/source/common/x86/mc-a2.asm -> x265_1.9.tar.gz/source/common/x86/mc-a2.asm Changed

@@ -2,12 +2,14 @@
 ;* mc-a2.asm: x86 motion compensation
 ;*****************************************************************************
 ;* Copyright (C) 2005-2013 x264 project
+;* Copyright (C) 2013-2015 x265 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
 ;*          Holger Lubitz <holger@lubitz.org>
 ;*          Mathieu Monnier <manao@melix.net>
 ;*          Oskar Arvidsson <oskar@irock.se>
+;*          Min Chen <chenm003@163.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -46,6 +48,8 @@
 pd_16: times 4 dd 16
 pd_0f: times 4 dd 0xffff
 pf_inv256: times 8 dd 0.00390625
+const pd_inv256,    times 4 dq 0.00390625
+const pd_0_5,       times 4 dq 0.5
 
 SECTION .text
 
@@ -987,151 +991,227 @@
 %endif
 
 ;-----------------------------------------------------------------------------
-; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
+; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs,
+;                             uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len )
 ;-----------------------------------------------------------------------------
-%macro MBTREE 0
+INIT_XMM sse2
 cglobal mbtree_propagate_cost, 7,7,7
-    add        r6d, r6d
-    lea         r0, [r0+r6*2]
-    add         r1, r6
-    add         r2, r6
-    add         r3, r6
-    add         r4, r6
-    neg         r6
-    pxor      xmm4, xmm4
-    movss     xmm6, [r5]
-    shufps    xmm6, xmm6, 0
-    mulps     xmm6, [pf_inv256]
-    movdqa    xmm5, [pw_3fff]
+    dec         r6d
+    movsd       m6, [r5]
+    mulpd       m6, [pd_inv256]
+    xor         r5d, r5d
+    lea         r0, [r0+r5*2]
+    pxor        m4, m4
+    movlhps     m6, m6
+    mova        m5, [pw_3fff]
+
 .loop:
-    movq      xmm2, [r2+r6] ; intra
-    movq      xmm0, [r4+r6] ; invq
-    movq      xmm3, [r3+r6] ; inter
-    movq      xmm1, [r1+r6] ; prop
-    punpcklwd xmm2, xmm4
-    punpcklwd xmm0, xmm4
-    pmaddwd   xmm0, xmm2
-    pand      xmm3, xmm5
-    punpcklwd xmm1, xmm4
-    punpcklwd xmm3, xmm4
-%if cpuflag(fma4)
-    cvtdq2ps  xmm0, xmm0
-    cvtdq2ps  xmm1, xmm1
-    fmaddps   xmm0, xmm0, xmm6, xmm1
-    cvtdq2ps  xmm1, xmm2
-    psubd     xmm2, xmm3
-    cvtdq2ps  xmm2, xmm2
-    rcpps     xmm3, xmm1
-    mulps     xmm1, xmm3
-    mulps     xmm0, xmm2
-    addps     xmm2, xmm3, xmm3
-    fnmaddps  xmm3, xmm1, xmm3, xmm2
-    mulps     xmm0, xmm3
-%else
-    cvtdq2ps  xmm0, xmm0
-    mulps     xmm0, xmm6    ; intra*invq*fps_factor>>8
-    cvtdq2ps  xmm1, xmm1    ; prop
-    addps     xmm0, xmm1    ; prop + (intra*invq*fps_factor>>8)
-    cvtdq2ps  xmm1, xmm2    ; intra
-    psubd     xmm2, xmm3    ; intra - inter
-    cvtdq2ps  xmm2, xmm2    ; intra - inter
-    rcpps     xmm3, xmm1    ; 1 / intra 1st approximation
-    mulps     xmm1, xmm3    ; intra * (1/intra 1st approx)
-    mulps     xmm1, xmm3    ; intra * (1/intra 1st approx)^2
-    mulps     xmm0, xmm2    ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
-    addps     xmm3, xmm3    ; 2 * (1/intra 1st approx)
-    subps     xmm3, xmm1    ; 2nd approximation for 1/intra
-    mulps     xmm0, xmm3    ; / intra
-%endif
-    cvtps2dq  xmm0, xmm0
-    movdqa [r0+r6*2], xmm0
-    add         r6, 8
-    jl .loop
+    movh        m2, [r2+r5*4]       ; intra
+    movh        m0, [r4+r5*4]       ; invq
+    movd        m3, [r3+r5*2]       ; inter
+    pand        m3, m5
+    punpcklwd   m3, m4
+
+    ; PMINSD
+    pcmpgtd     m1, m2, m3
+    pand        m3, m1
+    pandn       m1, m2
+    por         m3, m1
+
+    movd        m1, [r1+r5*2]       ; prop
+    punpckldq   m2, m2
+    punpckldq   m0, m0
+    pmuludq     m0, m2
+    pshufd      m2, m2, q3120
+    pshufd      m0, m0, q3120
+
+    punpcklwd   m1, m4
+    cvtdq2pd    m0, m0
+    mulpd       m0, m6              ; intra*invq*fps_factor>>8
+    cvtdq2pd    m1, m1              ; prop
+    addpd       m0, m1              ; prop + (intra*invq*fps_factor>>8)
+    ;cvtdq2ps    m1, m2              ; intra
+    cvtdq2pd    m1, m2              ; intra
+    psubd       m2, m3              ; intra - inter
+    cvtdq2pd    m2, m2              ; intra - inter
+    ;rcpps       m3, m1
+    ;mulps       m1, m3              ; intra * (1/intra 1st approx)
+    ;mulps       m1, m3              ; intra * (1/intra 1st approx)^2
+    ;addps       m3, m3              ; 2 * (1/intra 1st approx)
+    ;subps       m3, m1              ; 2nd approximation for 1/intra
+    ;cvtps2pd    m3, m3              ; 1 / intra 1st approximation
+    mulpd       m0, m2              ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
+    ;mulpd       m0, m3              ; / intra
+
+    ; TODO: DIVPD is very slow, but its result matches the C model output; since this function is not a bottleneck, the faster reciprocal-approximation code above is left commented out
+    divpd       m0, m1
+    addpd       m0, [pd_0_5]
+    cvttpd2dq    m0, m0
+
+    movh        [r0+r5*4], m0
+    add         r5d, 2
+    cmp         r5d, r6d
+    jl         .loop
+
+    xor         r6d, r5d
+    jnz         .even
+    movd        m2, [r2+r5*4]       ; intra
+    movd        m0, [r4+r5*4]       ; invq
+    movd        m3, [r3+r5*2]       ; inter
+    pand        m3, m5
+    punpcklwd   m3, m4
+
+    ; PMINSD
+    pcmpgtd     m1, m2, m3
+    pand        m3, m1
+    pandn       m1, m2
+    por         m3, m1
+
+    movd        m1, [r1+r5*2]       ; prop
+    punpckldq   m2, m2              ; DWORD [_ 1 _ 0]
+    punpckldq   m0, m0
+    pmuludq     m0, m2              ; QWORD [m1 m0]
+    pshufd      m2, m2, q3120
+    pshufd      m0, m0, q3120
+    punpcklwd   m1, m4
+    cvtdq2pd    m0, m0
+    mulpd       m0, m6              ; intra*invq*fps_factor>>8
+    cvtdq2pd    m1, m1              ; prop
+    addpd       m0, m1              ; prop + (intra*invq*fps_factor>>8)
+    cvtdq2pd    m1, m2              ; intra
+    psubd       m2, m3              ; intra - inter
+    cvtdq2pd    m2, m2              ; intra - inter
+    mulpd       m0, m2              ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
+
+    divpd       m0, m1
+    addpd       m0, [pd_0_5]
+    cvttpd2dq    m0, m0
+    movd        [r0+r5*4], m0
+.even:
     RET
-%endmacro
 
-INIT_XMM sse2
-MBTREE
-; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
-INIT_XMM fma4
-MBTREE
-
-%macro INT16_UNPACK 1
-    vpunpckhwd   xm4, xm%1, xm7
-    vpunpcklwd  xm%1, xm7
-    vinsertf128  m%1, m%1, xm4, 1
-%endmacro
 
+;-----------------------------------------------------------------------------
+; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs,

 
@@ -2,12 +2,14 @@
 ;* mc-a2.asm: x86 motion compensation
 ;*****************************************************************************
 ;* Copyright (C) 2005-2013 x264 project
+;* Copyright (C) 2013-2015 x265 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
 ;*          Holger Lubitz <holger@lubitz.org>
 ;*          Mathieu Monnier <manao@melix.net>
 ;*          Oskar Arvidsson <oskar@irock.se>
+;*          Min Chen <chenm003@163.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -46,6 +48,8 @@
 pd_16: times 4 dd 16
 pd_0f: times 4 dd 0xffff
 pf_inv256: times 8 dd 0.00390625
+const pd_inv256,    times 4 dq 0.00390625
+const pd_0_5,       times 4 dq 0.5
 
 SECTION .text
 
@@ -987,151 +991,227 @@
 %endif
 
 ;-----------------------------------------------------------------------------
-; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
+; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs,
+;                             uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len )
 ;-----------------------------------------------------------------------------
-%macro MBTREE 0
+INIT_XMM sse2
 cglobal mbtree_propagate_cost, 7,7,7
-    add        r6d, r6d
-    lea         r0, [r0+r6*2]
-    add         r1, r6
-    add         r2, r6
-    add         r3, r6
-    add         r4, r6
-    neg         r6
-    pxor      xmm4, xmm4
-    movss     xmm6, [r5]
-    shufps    xmm6, xmm6, 0
-    mulps     xmm6, [pf_inv256]
-    movdqa    xmm5, [pw_3fff]
+    dec         r6d
+    movsd       m6, [r5]
+    mulpd       m6, [pd_inv256]
+    xor         r5d, r5d
+    lea         r0, [r0+r5*2]
+    pxor        m4, m4
+    movlhps     m6, m6
+    mova        m5, [pw_3fff]
+
 .loop:
-    movq      xmm2, [r2+r6] ; intra
-    movq      xmm0, [r4+r6] ; invq
-    movq      xmm3, [r3+r6] ; inter
-    movq      xmm1, [r1+r6] ; prop
-    punpcklwd xmm2, xmm4
-    punpcklwd xmm0, xmm4
-    pmaddwd   xmm0, xmm2
-    pand      xmm3, xmm5
-    punpcklwd xmm1, xmm4
-    punpcklwd xmm3, xmm4
-%if cpuflag(fma4)
-    cvtdq2ps  xmm0, xmm0
-    cvtdq2ps  xmm1, xmm1
-    fmaddps   xmm0, xmm0, xmm6, xmm1
-    cvtdq2ps  xmm1, xmm2
-    psubd     xmm2, xmm3
-    cvtdq2ps  xmm2, xmm2
-    rcpps     xmm3, xmm1
-    mulps     xmm1, xmm3
-    mulps     xmm0, xmm2
-    addps     xmm2, xmm3, xmm3
-    fnmaddps  xmm3, xmm1, xmm3, xmm2
-    mulps     xmm0, xmm3
-%else
-    cvtdq2ps  xmm0, xmm0
-    mulps     xmm0, xmm6    ; intra*invq*fps_factor>>8
-    cvtdq2ps  xmm1, xmm1    ; prop
-    addps     xmm0, xmm1    ; prop + (intra*invq*fps_factor>>8)
-    cvtdq2ps  xmm1, xmm2    ; intra
-    psubd     xmm2, xmm3    ; intra - inter
-    cvtdq2ps  xmm2, xmm2    ; intra - inter
-    rcpps     xmm3, xmm1    ; 1 / intra 1st approximation
-    mulps     xmm1, xmm3    ; intra * (1/intra 1st approx)
-    mulps     xmm1, xmm3    ; intra * (1/intra 1st approx)^2
-    mulps     xmm0, xmm2    ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
-    addps     xmm3, xmm3    ; 2 * (1/intra 1st approx)
-    subps     xmm3, xmm1    ; 2nd approximation for 1/intra
-    mulps     xmm0, xmm3    ; / intra
-%endif
-    cvtps2dq  xmm0, xmm0
-    movdqa [r0+r6*2], xmm0
-    add         r6, 8
-    jl .loop
+    movh        m2, [r2+r5*4]       ; intra
+    movh        m0, [r4+r5*4]       ; invq
+    movd        m3, [r3+r5*2]       ; inter
+    pand        m3, m5
+    punpcklwd   m3, m4
+
+    ; PMINSD
+    pcmpgtd     m1, m2, m3
+    pand        m3, m1
+    pandn       m1, m2
+    por         m3, m1
+
+    movd        m1, [r1+r5*2]       ; prop
+    punpckldq   m2, m2
+    punpckldq   m0, m0
+    pmuludq     m0, m2
+    pshufd      m2, m2, q3120
+    pshufd      m0, m0, q3120
+
+    punpcklwd   m1, m4
+    cvtdq2pd    m0, m0
+    mulpd       m0, m6              ; intra*invq*fps_factor>>8
+    cvtdq2pd    m1, m1              ; prop
+    addpd       m0, m1              ; prop + (intra*invq*fps_factor>>8)
+    ;cvtdq2ps    m1, m2              ; intra
+    cvtdq2pd    m1, m2              ; intra
+    psubd       m2, m3              ; intra - inter
+    cvtdq2pd    m2, m2              ; intra - inter
+    ;rcpps       m3, m1
+    ;mulps       m1, m3              ; intra * (1/intra 1st approx)
+    ;mulps       m1, m3              ; intra * (1/intra 1st approx)^2
+    ;addps       m3, m3              ; 2 * (1/intra 1st approx)
+    ;subps       m3, m1              ; 2nd approximation for 1/intra
+    ;cvtps2pd    m3, m3              ; 1 / intra 1st approximation
+    mulpd       m0, m2              ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
+    ;mulpd       m0, m3              ; / intra
+
+    ; TODO: DIVPD is very slow, but its result matches the C model output; since this function is not a bottleneck, the faster reciprocal-approximation code above is left commented out
+    divpd       m0, m1
+    addpd       m0, [pd_0_5]
+    cvttpd2dq    m0, m0
+
+    movh        [r0+r5*4], m0
+    add         r5d, 2
+    cmp         r5d, r6d
+    jl         .loop
+
+    xor         r6d, r5d
+    jnz         .even
+    movd        m2, [r2+r5*4]       ; intra
+    movd        m0, [r4+r5*4]       ; invq
+    movd        m3, [r3+r5*2]       ; inter
+    pand        m3, m5
+    punpcklwd   m3, m4
+
+    ; PMINSD
+    pcmpgtd     m1, m2, m3
+    pand        m3, m1
+    pandn       m1, m2
+    por         m3, m1
+
+    movd        m1, [r1+r5*2]       ; prop
+    punpckldq   m2, m2              ; DWORD [_ 1 _ 0]
+    punpckldq   m0, m0
+    pmuludq     m0, m2              ; QWORD [m1 m0]
+    pshufd      m2, m2, q3120
+    pshufd      m0, m0, q3120
+    punpcklwd   m1, m4
+    cvtdq2pd    m0, m0
+    mulpd       m0, m6              ; intra*invq*fps_factor>>8
+    cvtdq2pd    m1, m1              ; prop
+    addpd       m0, m1              ; prop + (intra*invq*fps_factor>>8)
+    cvtdq2pd    m1, m2              ; intra
+    psubd       m2, m3              ; intra - inter
+    cvtdq2pd    m2, m2              ; intra - inter
+    mulpd       m0, m2              ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
+
+    divpd       m0, m1
+    addpd       m0, [pd_0_5]
+    cvttpd2dq    m0, m0
+    movd        [r0+r5*4], m0
+.even:
     RET
-%endmacro
 
-INIT_XMM sse2
-MBTREE
-; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
-INIT_XMM fma4
-MBTREE
-
-%macro INT16_UNPACK 1
-    vpunpckhwd   xm4, xm%1, xm7
-    vpunpcklwd  xm%1, xm7
-    vinsertf128  m%1, m%1, xm4, 1
-%endmacro
 
+;-----------------------------------------------------------------------------
+; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs,
​

x265_1.8.tar.gz/source/common/x86/mc.h -> x265_1.9.tar.gz/source/common/x86/mc.h Changed

 
@@ -36,4 +36,14 @@
 
 #undef LOWRES
 
+#define PROPAGATE_COST(cpu) /* declares the prototype of the mbtree_propagate_cost routine carrying the given CPU suffix */ \
+    void PFX(mbtree_propagate_cost_ ## cpu)(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, \
+                                              const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
+
+PROPAGATE_COST(sse2) // prototype for the _sse2 variant
+PROPAGATE_COST(avx) // prototype for the _avx variant
+PROPAGATE_COST(avx2) // prototype for the _avx2 variant
+
+#undef PROPAGATE_COST // the helper macro is only needed for the three declarations above
+
 #endif // ifndef X265_MC_H
​

x265_1.8.tar.gz/source/common/x86/pixel-a.asm -> x265_1.9.tar.gz/source/common/x86/pixel-a.asm Changed

@@ -2,6 +2,7 @@
 ;* pixel.asm: x86 pixel metrics
 ;*****************************************************************************
 ;* Copyright (C) 2003-2013 x264 project
+;* Copyright (C) 2013-2015 x265 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Holger Lubitz <holger@lubitz.org>
@@ -70,6 +71,7 @@
 cextern pd_2
 cextern hmul_16p
 cextern pb_movemask
+cextern pb_movemask_32
 cextern pw_pixel_max
 
 ;=============================================================================
@@ -6497,6 +6499,1357 @@
 %endif ; !ARCH_X86_64
 %endmacro ; SA8D
 
+
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
+INIT_YMM avx2
+cglobal sa8d_8x8_12bit
+    pmovzxwd        m0, [r0]
+    pmovzxwd        m9, [r2]
+    psubd           m0, m9
+
+    pmovzxwd        m1, [r0 + r1]
+    pmovzxwd        m9, [r2 + r3]
+    psubd           m1, m9
+
+    pmovzxwd        m2, [r0 + r1 * 2]
+    pmovzxwd        m9, [r2 + r3 * 2]
+    psubd           m2, m9
+
+    pmovzxwd        m8, [r0 + r4]
+    pmovzxwd        m9, [r2 + r5]
+    psubd           m8, m9
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+
+    pmovzxwd        m4, [r0]
+    pmovzxwd        m9, [r2]
+    psubd           m4, m9
+
+    pmovzxwd        m5, [r0 + r1]
+    pmovzxwd        m9, [r2 + r3]
+    psubd           m5, m9
+
+    pmovzxwd        m3, [r0 + r1 * 2]
+    pmovzxwd        m9, [r2 + r3 * 2]
+    psubd           m3, m9
+
+    pmovzxwd        m7, [r0 + r4]
+    pmovzxwd        m9, [r2 + r5]
+    psubd           m7, m9
+
+    mova            m6, m0
+    paddd           m0, m1
+    psubd           m1, m6
+    mova            m6, m2
+    paddd           m2, m8
+    psubd           m8, m6
+    mova            m6, m0
+
+    punpckldq       m0, m1
+    punpckhdq       m6, m1
+
+    mova            m1, m0
+    paddd           m0, m6
+    psubd           m6, m1
+    mova            m1, m2
+
+    punpckldq       m2, m8
+    punpckhdq       m1, m8
+
+    mova            m8, m2
+    paddd           m2, m1
+    psubd           m1, m8
+    mova            m8, m4
+    paddd           m4, m5
+    psubd           m5, m8
+    mova            m8, m3
+    paddd           m3, m7
+    psubd           m7, m8
+    mova            m8, m4
+
+    punpckldq       m4, m5
+    punpckhdq       m8, m5
+
+    mova            m5, m4
+    paddd           m4, m8
+    psubd           m8, m5
+    mova            m5, m3
+    punpckldq       m3, m7
+    punpckhdq       m5, m7
+
+    mova            m7, m3
+    paddd           m3, m5
+    psubd           m5, m7
+    mova            m7, m0
+    paddd           m0, m2
+    psubd           m2, m7
+    mova            m7, m6
+    paddd           m6, m1
+    psubd           m1, m7
+    mova            m7, m0
+
+    punpcklqdq      m0, m2
+    punpckhqdq      m7, m2
+
+    mova            m2, m0
+    paddd           m0, m7
+    psubd           m7, m2
+    mova            m2, m6
+
+    punpcklqdq      m6, m1
+    punpckhqdq      m2, m1
+
+    mova            m1, m6
+    paddd           m6, m2
+    psubd           m2, m1
+    mova            m1, m4
+    paddd           m4, m3
+    psubd           m3, m1
+    mova            m1, m8
+    paddd           m8, m5
+    psubd           m5, m1
+    mova            m1, m4
+
+    punpcklqdq      m4, m3
+    punpckhqdq      m1, m3
+
+    mova            m3, m4
+    paddd           m4, m1
+    psubd           m1, m3
+    mova            m3, m8
+
+    punpcklqdq      m8, m5
+    punpckhqdq      m3, m5
+
+    mova            m5, m8
+    paddd           m8, m3
+    psubd           m3, m5
+    mova            m5, m0
+    paddd           m0, m4
+    psubd           m4, m5
+    mova            m5, m7
+    paddd           m7, m1
+    psubd           m1, m5
+    mova            m5, m0
+
+    vinserti128     m0, m0, xm4, 1
+    vperm2i128      m5, m5, m4, 00110001b
+
+    pxor            m4, m4
+    psubd           m4, m0
+    pmaxsd          m0, m4
+    pxor            m4, m4
+    psubd           m4, m5
+    pmaxsd          m5, m4
+    pmaxsd          m0, m5
+    mova            m4, m7
+
+    vinserti128     m7, m7, xm1, 1
+    vperm2i128      m4, m4, m1, 00110001b
+
+    pxor            m1, m1
+    psubd           m1, m7
+    pmaxsd          m7, m1
+    pxor            m1, m1
+    psubd           m1, m4
+    pmaxsd          m4, m1
+    pmaxsd          m7, m4
+    mova            m1, m6
+    paddd           m6, m8
+    psubd           m8, m1
+    mova            m1, m2
+    paddd           m2, m3
+    psubd           m3, m1
+    mova            m1, m6
+
+    vinserti128     m6, m6, xm8, 1
+    vperm2i128      m1, m1, m8, 00110001b
+
+    pxor            m8, m8
+    psubd           m8, m6
+    pmaxsd          m6, m8
+    pxor            m8, m8
+    psubd           m8, m1
+    pmaxsd          m1, m8
+    pmaxsd          m6, m1
+    mova            m8, m2
+
+    vinserti128     m2, m2, xm3, 1
+    vperm2i128      m8, m8, m3, 00110001b
+
+    pxor            m3, m3

 
@@ -2,6 +2,7 @@
 ;* pixel.asm: x86 pixel metrics
 ;*****************************************************************************
 ;* Copyright (C) 2003-2013 x264 project
+;* Copyright (C) 2013-2015 x265 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Holger Lubitz <holger@lubitz.org>
@@ -70,6 +71,7 @@
 cextern pd_2
 cextern hmul_16p
 cextern pb_movemask
+cextern pb_movemask_32
 cextern pw_pixel_max
 
 ;=============================================================================
@@ -6497,6 +6499,1357 @@
 %endif ; !ARCH_X86_64
 %endmacro ; SA8D
 
+
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
+INIT_YMM avx2
+cglobal sa8d_8x8_12bit
+    pmovzxwd        m0, [r0]
+    pmovzxwd        m9, [r2]
+    psubd           m0, m9
+
+    pmovzxwd        m1, [r0 + r1]
+    pmovzxwd        m9, [r2 + r3]
+    psubd           m1, m9
+
+    pmovzxwd        m2, [r0 + r1 * 2]
+    pmovzxwd        m9, [r2 + r3 * 2]
+    psubd           m2, m9
+
+    pmovzxwd        m8, [r0 + r4]
+    pmovzxwd        m9, [r2 + r5]
+    psubd           m8, m9
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+
+    pmovzxwd        m4, [r0]
+    pmovzxwd        m9, [r2]
+    psubd           m4, m9
+
+    pmovzxwd        m5, [r0 + r1]
+    pmovzxwd        m9, [r2 + r3]
+    psubd           m5, m9
+
+    pmovzxwd        m3, [r0 + r1 * 2]
+    pmovzxwd        m9, [r2 + r3 * 2]
+    psubd           m3, m9
+
+    pmovzxwd        m7, [r0 + r4]
+    pmovzxwd        m9, [r2 + r5]
+    psubd           m7, m9
+
+    mova            m6, m0
+    paddd           m0, m1
+    psubd           m1, m6
+    mova            m6, m2
+    paddd           m2, m8
+    psubd           m8, m6
+    mova            m6, m0
+
+    punpckldq       m0, m1
+    punpckhdq       m6, m1
+
+    mova            m1, m0
+    paddd           m0, m6
+    psubd           m6, m1
+    mova            m1, m2
+
+    punpckldq       m2, m8
+    punpckhdq       m1, m8
+
+    mova            m8, m2
+    paddd           m2, m1
+    psubd           m1, m8
+    mova            m8, m4
+    paddd           m4, m5
+    psubd           m5, m8
+    mova            m8, m3
+    paddd           m3, m7
+    psubd           m7, m8
+    mova            m8, m4
+
+    punpckldq       m4, m5
+    punpckhdq       m8, m5
+
+    mova            m5, m4
+    paddd           m4, m8
+    psubd           m8, m5
+    mova            m5, m3
+    punpckldq       m3, m7
+    punpckhdq       m5, m7
+
+    mova            m7, m3
+    paddd           m3, m5
+    psubd           m5, m7
+    mova            m7, m0
+    paddd           m0, m2
+    psubd           m2, m7
+    mova            m7, m6
+    paddd           m6, m1
+    psubd           m1, m7
+    mova            m7, m0
+
+    punpcklqdq      m0, m2
+    punpckhqdq      m7, m2
+
+    mova            m2, m0
+    paddd           m0, m7
+    psubd           m7, m2
+    mova            m2, m6
+
+    punpcklqdq      m6, m1
+    punpckhqdq      m2, m1
+
+    mova            m1, m6
+    paddd           m6, m2
+    psubd           m2, m1
+    mova            m1, m4
+    paddd           m4, m3
+    psubd           m3, m1
+    mova            m1, m8
+    paddd           m8, m5
+    psubd           m5, m1
+    mova            m1, m4
+
+    punpcklqdq      m4, m3
+    punpckhqdq      m1, m3
+
+    mova            m3, m4
+    paddd           m4, m1
+    psubd           m1, m3
+    mova            m3, m8
+
+    punpcklqdq      m8, m5
+    punpckhqdq      m3, m5
+
+    mova            m5, m8
+    paddd           m8, m3
+    psubd           m3, m5
+    mova            m5, m0
+    paddd           m0, m4
+    psubd           m4, m5
+    mova            m5, m7
+    paddd           m7, m1
+    psubd           m1, m5
+    mova            m5, m0
+
+    vinserti128     m0, m0, xm4, 1
+    vperm2i128      m5, m5, m4, 00110001b
+
+    pxor            m4, m4
+    psubd           m4, m0
+    pmaxsd          m0, m4
+    pxor            m4, m4
+    psubd           m4, m5
+    pmaxsd          m5, m4
+    pmaxsd          m0, m5
+    mova            m4, m7
+
+    vinserti128     m7, m7, xm1, 1
+    vperm2i128      m4, m4, m1, 00110001b
+
+    pxor            m1, m1
+    psubd           m1, m7
+    pmaxsd          m7, m1
+    pxor            m1, m1
+    psubd           m1, m4
+    pmaxsd          m4, m1
+    pmaxsd          m7, m4
+    mova            m1, m6
+    paddd           m6, m8
+    psubd           m8, m1
+    mova            m1, m2
+    paddd           m2, m3
+    psubd           m3, m1
+    mova            m1, m6
+
+    vinserti128     m6, m6, xm8, 1
+    vperm2i128      m1, m1, m8, 00110001b
+
+    pxor            m8, m8
+    psubd           m8, m6
+    pmaxsd          m6, m8
+    pxor            m8, m8
+    psubd           m8, m1
+    pmaxsd          m1, m8
+    pmaxsd          m6, m1
+    mova            m8, m2
+
+    vinserti128     m2, m2, xm3, 1
+    vperm2i128      m8, m8, m3, 00110001b
+
+    pxor            m3, m3
​

x265_1.8.tar.gz/source/common/x86/pixel-util.h -> x265_1.9.tar.gz/source/common/x86/pixel-util.h Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+;*          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -55,5 +56,6 @@
 int PFX(scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize));
 uint32_t PFX(findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]));
 uint32_t PFX(costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
+uint32_t PFX(costCoeffNxN_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
 
 #endif // ifndef X265_PIXEL_UTIL_H

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+;*          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -55,5 +56,6 @@
 int PFX(scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize));
 uint32_t PFX(findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]));
 uint32_t PFX(costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
+uint32_t PFX(costCoeffNxN_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
 
 #endif // ifndef X265_PIXEL_UTIL_H
​

x265_1.8.tar.gz/source/common/x86/pixel-util8.asm -> x265_1.9.tar.gz/source/common/x86/pixel-util8.asm Changed

@@ -49,6 +49,7 @@
 mask_ff:                times 16 db 0xff
                         times 16 db 0
 deinterleave_shuf:      times  2 db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+interleave_shuf:        times  2 db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
 deinterleave_word_shuf: times  2 db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
 hmulw_16p:              times  8 dw 1
                         times  4 dw 1, -1
@@ -56,7 +57,7 @@
 SECTION .text
 
 cextern pw_1
-cextern pw_0_15
+cextern pw_0_7
 cextern pb_1
 cextern pb_128
 cextern pw_00ff
@@ -78,6 +79,7 @@
 cextern trans8_shuf
 cextern_naked private_prefix %+ _entropyStateBits
 cextern pb_movemask
+cextern pw_exp2_0_15
 
 ;-----------------------------------------------------------------------------
 ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
@@ -792,6 +794,7 @@
     pshufd      m6, m6, 0       ; m6 = add
     mov         r3d, r4d        ; r3 = numCoeff
     shr         r4d, 3
+    pxor        m4, m4
 
 .loop:
     pmovsxwd    m0, [r0]        ; m0 = level
@@ -810,13 +813,13 @@
     psignd      m3, m1
 
     packssdw    m2, m3
+    pabsw       m2, m2
 
     movu        [r2], m2
     add         r0, 16
     add         r1, 32
     add         r2, 16
 
-    pxor        m4, m4
     pcmpeqw     m2, m4
     psubw       m7, m2
 
@@ -862,9 +865,11 @@
     psignd      m2, m0
 
     packssdw    m1, m2
-    vpermq      m2, m1, q3120
+    pabsw       m1, m1
 
+    vpermq      m2, m1, q3120
     movu        [r2], m2
+
     add         r0, mmsize
     add         r1, mmsize * 2
     add         r2, mmsize
@@ -1560,7 +1565,7 @@
     movd        m0, r6d
     pshuflw     m0, m0, 0
     punpcklqdq  m0, m0
-    pcmpgtw     m0, [pw_0_15]
+    pcmpgtw     m0, [pw_0_7]
 
 .loopH:
     mov         r6d, r4d
@@ -1718,7 +1723,7 @@
     pshuflw                   m0, m0, 0
     punpcklqdq                m0, m0
     vinserti128               m0, m0, xm0, 1
-    pcmpgtw                   m0, [pw_0_15]
+    pcmpgtw                   m0, [pw_0_7]
 
 .loopH:
     mov                       r6d, r4d
@@ -6397,6 +6402,78 @@
     movd   edx, xm6
 %endif
     RET
+
+INIT_YMM avx2
+cglobal pixel_var_32x32, 2,4,7
+    VAR_START 0
+    mov             r2d, 16
+
+.loop:
+    pmovzxbw        m0, [r0]
+    pmovzxbw        m3, [r0 + 16]
+    pmovzxbw        m1, [r0 + r1]
+    pmovzxbw        m4, [r0 + r1 + 16]
+
+    lea             r0, [r0 + r1 * 2]
+
+    VAR_CORE
+
+    dec             r2d
+    jg              .loop
+
+    vextracti128   xm0, m5, 1
+    vextracti128   xm1, m6, 1
+    paddw          xm5, xm0
+    paddd          xm6, xm1
+    HADDW          xm5, xm2
+    HADDD          xm6, xm1
+
+%if ARCH_X86_64
+    punpckldq      xm5, xm6
+    movq           rax, xm5
+%else
+    movd           eax, xm5
+    movd           edx, xm6
+%endif
+    RET
+
+INIT_YMM avx2
+cglobal pixel_var_64x64, 2,4,7
+    VAR_START 0
+    mov             r2d, 64
+
+.loop:
+    pmovzxbw        m0, [r0]
+    pmovzxbw        m3, [r0 + 16]
+    pmovzxbw        m1, [r0 + mmsize]
+    pmovzxbw        m4, [r0 + mmsize + 16]
+
+    lea             r0, [r0 + r1]
+
+    VAR_CORE
+
+    dec             r2d
+    jg              .loop
+
+    pxor            m1, m1
+    punpcklwd       m0, m5, m1
+    punpckhwd       m5, m1
+    paddd           m5, m0
+    vextracti128   xm2, m5, 1
+    vextracti128   xm1, m6, 1
+    paddd          xm5, xm2
+    paddd          xm6, xm1
+    HADDD          xm5, xm2
+    HADDD          xm6, xm1
+
+%if ARCH_X86_64
+    punpckldq      xm5, xm6
+    movq           rax, xm5
+%else
+    movd           eax, xm5
+    movd           edx, xm6
+%endif
+    RET
 %endif ; !HIGH_BIT_DEPTH
 
 %macro VAR2_END 3
@@ -6578,10 +6655,10 @@
 
 
 ;-----------------------------------------------------------------------------
-; uint32_t[last first] findPosFirstAndLast(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
+; uint32_t[sumSign last first] findPosFirstLast(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16], uint32_t *absSum)
 ;-----------------------------------------------------------------------------
 INIT_XMM ssse3
-cglobal findPosFirstLast, 3,3,3
+cglobal findPosFirstLast, 3,3,4
     ; convert stride to int16_t
     add         r1d, r1d
 
@@ -6593,10 +6670,22 @@
     movh        m1, [r0]
     movhps      m1, [r0 + r1]
     movh        m2, [r0 + r1 * 2]
-    lea         r1, [r1 * 3]
+    lea         r1d, [r1 * 3]
     movhps      m2, [r0 + r1]
+    pxor        m3, m1, m2
     packsswb    m1, m2
 
+    ; get absSum
+    movhlps     m2, m3
+    pxor        m3, m2
+    pshufd      m2, m3, q2301
+    pxor        m3, m2
+    movd        r0d, m3
+    mov         r2d, r0d
+    shr         r2d, 16
+    xor         r2d, r0d
+    shl         r2d, 31
+
     ; get non-zero mask
     pxor        m2, m2
     pcmpeqb     m1, m2
@@ -6609,319 +6698,10 @@
     not         r0d
     bsr         r1w, r0w
     bsf         eax, r0d    ; side effect: clear AH to Zero
-    shl         r1d, 16

 
@@ -49,6 +49,7 @@
 mask_ff:                times 16 db 0xff
                         times 16 db 0
 deinterleave_shuf:      times  2 db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+interleave_shuf:        times  2 db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
 deinterleave_word_shuf: times  2 db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
 hmulw_16p:              times  8 dw 1
                         times  4 dw 1, -1
@@ -56,7 +57,7 @@
 SECTION .text
 
 cextern pw_1
-cextern pw_0_15
+cextern pw_0_7
 cextern pb_1
 cextern pb_128
 cextern pw_00ff
@@ -78,6 +79,7 @@
 cextern trans8_shuf
 cextern_naked private_prefix %+ _entropyStateBits
 cextern pb_movemask
+cextern pw_exp2_0_15
 
 ;-----------------------------------------------------------------------------
 ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
@@ -792,6 +794,7 @@
     pshufd      m6, m6, 0       ; m6 = add
     mov         r3d, r4d        ; r3 = numCoeff
     shr         r4d, 3
+    pxor        m4, m4
 
 .loop:
     pmovsxwd    m0, [r0]        ; m0 = level
@@ -810,13 +813,13 @@
     psignd      m3, m1
 
     packssdw    m2, m3
+    pabsw       m2, m2
 
     movu        [r2], m2
     add         r0, 16
     add         r1, 32
     add         r2, 16
 
-    pxor        m4, m4
     pcmpeqw     m2, m4
     psubw       m7, m2
 
@@ -862,9 +865,11 @@
     psignd      m2, m0
 
     packssdw    m1, m2
-    vpermq      m2, m1, q3120
+    pabsw       m1, m1
 
+    vpermq      m2, m1, q3120
     movu        [r2], m2
+
     add         r0, mmsize
     add         r1, mmsize * 2
     add         r2, mmsize
@@ -1560,7 +1565,7 @@
     movd        m0, r6d
     pshuflw     m0, m0, 0
     punpcklqdq  m0, m0
-    pcmpgtw     m0, [pw_0_15]
+    pcmpgtw     m0, [pw_0_7]
 
 .loopH:
     mov         r6d, r4d
@@ -1718,7 +1723,7 @@
     pshuflw                   m0, m0, 0
     punpcklqdq                m0, m0
     vinserti128               m0, m0, xm0, 1
-    pcmpgtw                   m0, [pw_0_15]
+    pcmpgtw                   m0, [pw_0_7]
 
 .loopH:
     mov                       r6d, r4d
@@ -6397,6 +6402,78 @@
     movd   edx, xm6
 %endif
     RET
+
+INIT_YMM avx2
+cglobal pixel_var_32x32, 2,4,7
+    VAR_START 0
+    mov             r2d, 16
+
+.loop:
+    pmovzxbw        m0, [r0]
+    pmovzxbw        m3, [r0 + 16]
+    pmovzxbw        m1, [r0 + r1]
+    pmovzxbw        m4, [r0 + r1 + 16]
+
+    lea             r0, [r0 + r1 * 2]
+
+    VAR_CORE
+
+    dec             r2d
+    jg              .loop
+
+    vextracti128   xm0, m5, 1
+    vextracti128   xm1, m6, 1
+    paddw          xm5, xm0
+    paddd          xm6, xm1
+    HADDW          xm5, xm2
+    HADDD          xm6, xm1
+
+%if ARCH_X86_64
+    punpckldq      xm5, xm6
+    movq           rax, xm5
+%else
+    movd           eax, xm5
+    movd           edx, xm6
+%endif
+    RET
+
+INIT_YMM avx2
+cglobal pixel_var_64x64, 2,4,7
+    VAR_START 0
+    mov             r2d, 64
+
+.loop:
+    pmovzxbw        m0, [r0]
+    pmovzxbw        m3, [r0 + 16]
+    pmovzxbw        m1, [r0 + mmsize]
+    pmovzxbw        m4, [r0 + mmsize + 16]
+
+    lea             r0, [r0 + r1]
+
+    VAR_CORE
+
+    dec             r2d
+    jg              .loop
+
+    pxor            m1, m1
+    punpcklwd       m0, m5, m1
+    punpckhwd       m5, m1
+    paddd           m5, m0
+    vextracti128   xm2, m5, 1
+    vextracti128   xm1, m6, 1
+    paddd          xm5, xm2
+    paddd          xm6, xm1
+    HADDD          xm5, xm2
+    HADDD          xm6, xm1
+
+%if ARCH_X86_64
+    punpckldq      xm5, xm6
+    movq           rax, xm5
+%else
+    movd           eax, xm5
+    movd           edx, xm6
+%endif
+    RET
 %endif ; !HIGH_BIT_DEPTH
 
 %macro VAR2_END 3
@@ -6578,10 +6655,10 @@
 
 
 ;-----------------------------------------------------------------------------
-; uint32_t[last first] findPosFirstAndLast(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
+; uint32_t[sumSign last first] findPosFirstLast(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16], uint32_t *absSum)
 ;-----------------------------------------------------------------------------
 INIT_XMM ssse3
-cglobal findPosFirstLast, 3,3,3
+cglobal findPosFirstLast, 3,3,4
     ; convert stride to int16_t
     add         r1d, r1d
 
@@ -6593,10 +6670,22 @@
     movh        m1, [r0]
     movhps      m1, [r0 + r1]
     movh        m2, [r0 + r1 * 2]
-    lea         r1, [r1 * 3]
+    lea         r1d, [r1 * 3]
     movhps      m2, [r0 + r1]
+    pxor        m3, m1, m2
     packsswb    m1, m2
 
+    ; get absSum
+    movhlps     m2, m3
+    pxor        m3, m2
+    pshufd      m2, m3, q2301
+    pxor        m3, m2
+    movd        r0d, m3
+    mov         r2d, r0d
+    shr         r2d, 16
+    xor         r2d, r0d
+    shl         r2d, 31
+
     ; get non-zero mask
     pxor        m2, m2
     pcmpeqb     m1, m2
@@ -6609,319 +6698,10 @@
     not         r0d
     bsr         r1w, r0w
     bsf         eax, r0d    ; side effect: clear AH to Zero
-    shl         r1d, 16
​

x265_1.8.tar.gz/source/common/x86/pixel.h -> x265_1.9.tar.gz/source/common/x86/pixel.h Changed

@@ -2,10 +2,12 @@
  * pixel.h: x86 pixel metrics
  *****************************************************************************
  * Copyright (C) 2003-2013 x264 project
+ * Copyright (C) 2013-2015 x265 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
  *          Fiona Glaser <fiona@x264.com>
+;*          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -34,9 +36,10 @@
 void PFX(upShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
 void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
+pixel PFX(planeClipAndMax_avx2)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
 
 #define DECL_PIXELS(cpu) \
-    FUNCDEF_PU(uint32_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
+    FUNCDEF_PU(sse_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
     FUNCDEF_PU(int, pixel_sa8d, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
     FUNCDEF_PU(void, pixel_sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
     FUNCDEF_PU(void, pixel_sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
@@ -45,10 +48,10 @@
     FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
     FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
     FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
-    FUNCDEF_CHROMA_PU(uint32_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
     FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
-    FUNCDEF_CHROMA_PU(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
-    FUNCDEF_TU_S(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
+    FUNCDEF_TU_S(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
     FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \
     FUNCDEF_TU(int, psyCost_pp, cpu, const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); \
     FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)

 
@@ -2,10 +2,12 @@
  * pixel.h: x86 pixel metrics
  *****************************************************************************
  * Copyright (C) 2003-2013 x264 project
+ * Copyright (C) 2013-2015 x265 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
  *          Fiona Glaser <fiona@x264.com>
+;*          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -34,9 +36,10 @@
 void PFX(upShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
 void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
+pixel PFX(planeClipAndMax_avx2)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
 
 #define DECL_PIXELS(cpu) \
-    FUNCDEF_PU(uint32_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
+    FUNCDEF_PU(sse_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
     FUNCDEF_PU(int, pixel_sa8d, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
     FUNCDEF_PU(void, pixel_sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
     FUNCDEF_PU(void, pixel_sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
@@ -45,10 +48,10 @@
     FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
     FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
     FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
-    FUNCDEF_CHROMA_PU(uint32_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
     FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
-    FUNCDEF_CHROMA_PU(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
-    FUNCDEF_TU_S(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
+    FUNCDEF_CHROMA_PU(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
+    FUNCDEF_TU_S(sse_t, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
     FUNCDEF_TU(uint64_t, pixel_var, cpu, const pixel*, intptr_t); \
     FUNCDEF_TU(int, psyCost_pp, cpu, const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); \
     FUNCDEF_TU(int, psyCost_ss, cpu, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride)
​

x265_1.8.tar.gz/source/common/x86/pixeladd8.asm -> x265_1.9.tar.gz/source/common/x86/pixeladd8.asm Changed

 
@@ -2,6 +2,7 @@
 ;* Copyright (C) 2013 x265 project
 ;*
 ;* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
+;*          Min Chen <chenm003@163.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
​

x265_1.8.tar.gz/source/common/x86/sad-a.asm -> x265_1.9.tar.gz/source/common/x86/sad-a.asm Changed

@@ -2,6 +2,7 @@
 ;* sad-a.asm: x86 sad functions
 ;*****************************************************************************
 ;* Copyright (C) 2003-2013 x264 project
+;* Copyright (C) 2013-2015 x265 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
@@ -3328,6 +3329,730 @@
     SAD_X4_END_SSE2 1
 %endmacro
 
+%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
+INIT_YMM avx2
+%macro SAD_X4_64x8_AVX2 0
+    movu            m4, [r0]
+    movu            m5, [r1]
+    movu            m6, [r2]
+    movu            m7, [r3]
+    movu            m8, [r4]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + mmsize]
+    movu            m5, [r1 + mmsize]
+    movu            m6, [r2 + mmsize]
+    movu            m7, [r3 + mmsize]
+    movu            m8, [r4 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE]
+    movu            m5, [r1 + r5]
+    movu            m6, [r2 + r5]
+    movu            m7, [r3 + r5]
+    movu            m8, [r4 + r5]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE + mmsize]
+    movu            m5, [r1 + r5 + mmsize]
+    movu            m6, [r2 + r5 + mmsize]
+    movu            m7, [r3 + r5 + mmsize]
+    movu            m8, [r4 + r5 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 2]
+    movu            m5, [r1 + r5 * 2]
+    movu            m6, [r2 + r5 * 2]
+    movu            m7, [r3 + r5 * 2]
+    movu            m8, [r4 + r5 * 2]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 2 + mmsize]
+    movu            m5, [r1 + r5 * 2 + mmsize]
+    movu            m6, [r2 + r5 * 2 + mmsize]
+    movu            m7, [r3 + r5 * 2 + mmsize]
+    movu            m8, [r4 + r5 * 2 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 3]
+    movu            m5, [r1 + r7]
+    movu            m6, [r2 + r7]
+    movu            m7, [r3 + r7]
+    movu            m8, [r4 + r7]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 3 + mmsize]
+    movu            m5, [r1 + r7 + mmsize]
+    movu            m6, [r2 + r7 + mmsize]
+    movu            m7, [r3 + r7 + mmsize]
+    movu            m8, [r4 + r7 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    movu            m4, [r0]
+    movu            m5, [r1]
+    movu            m6, [r2]
+    movu            m7, [r3]
+    movu            m8, [r4]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + mmsize]
+    movu            m5, [r1 + mmsize]
+    movu            m6, [r2 + mmsize]
+    movu            m7, [r3 + mmsize]
+    movu            m8, [r4 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE]
+    movu            m5, [r1 + r5]
+    movu            m6, [r2 + r5]
+    movu            m7, [r3 + r5]
+    movu            m8, [r4 + r5]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE + mmsize]
+    movu            m5, [r1 + r5 + mmsize]
+    movu            m6, [r2 + r5 + mmsize]
+    movu            m7, [r3 + r5 + mmsize]
+    movu            m8, [r4 + r5 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4

 
@@ -2,6 +2,7 @@
 ;* sad-a.asm: x86 sad functions
 ;*****************************************************************************
 ;* Copyright (C) 2003-2013 x264 project
+;* Copyright (C) 2013-2015 x265 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
@@ -3328,6 +3329,730 @@
     SAD_X4_END_SSE2 1
 %endmacro
 
+%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
+INIT_YMM avx2
+%macro SAD_X4_64x8_AVX2 0
+    movu            m4, [r0]
+    movu            m5, [r1]
+    movu            m6, [r2]
+    movu            m7, [r3]
+    movu            m8, [r4]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + mmsize]
+    movu            m5, [r1 + mmsize]
+    movu            m6, [r2 + mmsize]
+    movu            m7, [r3 + mmsize]
+    movu            m8, [r4 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE]
+    movu            m5, [r1 + r5]
+    movu            m6, [r2 + r5]
+    movu            m7, [r3 + r5]
+    movu            m8, [r4 + r5]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE + mmsize]
+    movu            m5, [r1 + r5 + mmsize]
+    movu            m6, [r2 + r5 + mmsize]
+    movu            m7, [r3 + r5 + mmsize]
+    movu            m8, [r4 + r5 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 2]
+    movu            m5, [r1 + r5 * 2]
+    movu            m6, [r2 + r5 * 2]
+    movu            m7, [r3 + r5 * 2]
+    movu            m8, [r4 + r5 * 2]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 2 + mmsize]
+    movu            m5, [r1 + r5 * 2 + mmsize]
+    movu            m6, [r2 + r5 * 2 + mmsize]
+    movu            m7, [r3 + r5 * 2 + mmsize]
+    movu            m8, [r4 + r5 * 2 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 3]
+    movu            m5, [r1 + r7]
+    movu            m6, [r2 + r7]
+    movu            m7, [r3 + r7]
+    movu            m8, [r4 + r7]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 3 + mmsize]
+    movu            m5, [r1 + r7 + mmsize]
+    movu            m6, [r2 + r7 + mmsize]
+    movu            m7, [r3 + r7 + mmsize]
+    movu            m8, [r4 + r7 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    movu            m4, [r0]
+    movu            m5, [r1]
+    movu            m6, [r2]
+    movu            m7, [r3]
+    movu            m8, [r4]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + mmsize]
+    movu            m5, [r1 + mmsize]
+    movu            m6, [r2 + mmsize]
+    movu            m7, [r3 + mmsize]
+    movu            m8, [r4 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE]
+    movu            m5, [r1 + r5]
+    movu            m6, [r2 + r5]
+    movu            m7, [r3 + r5]
+    movu            m8, [r4 + r5]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE + mmsize]
+    movu            m5, [r1 + r5 + mmsize]
+    movu            m6, [r2 + r5 + mmsize]
+    movu            m7, [r3 + r5 + mmsize]
+    movu            m8, [r4 + r5 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
​

x265_1.8.tar.gz/source/common/x86/sad16-a.asm -> x265_1.9.tar.gz/source/common/x86/sad16-a.asm Changed

@@ -413,77 +413,50 @@
 SAD  16, 32
 
 INIT_YMM avx2
-cglobal pixel_sad_16x64, 4,7,4
+cglobal pixel_sad_16x64, 4,5,5
     pxor    m0, m0
-    pxor    m3, m3
-    mov     r4d, 64 / 8
-    add     r3d, r3d
-    add     r1d, r1d
-    lea     r5,     [r1 * 3]
-    lea     r6,     [r3 * 3]
+    mov     r4d, 16
+    mova    m4, [pw_1]
 .loop:
     movu    m1, [r2]
-    movu    m2, [r2 + r3]
+    movu    m2, [r2 + r3 * 2]
     psubw   m1, [r0]
-    psubw   m2, [r0 + r1]
-    pabsw   m1, m1
-    pabsw   m2, m2
-    paddw   m0, m1
-    paddw   m3, m2
-
-    movu    m1, [r2 + 2 * r3]
-    movu    m2, [r2 + r6]
-    psubw   m1, [r0 + 2 * r1]
-    psubw   m2, [r0 + r5]
+    psubw   m2, [r0 + r1 * 2]
     pabsw   m1, m1
     pabsw   m2, m2
-    paddw   m0, m1
-    paddw   m3, m2
-
+    paddw   m3, m1, m2
     lea     r0, [r0 + 4 * r1]
     lea     r2, [r2 + 4 * r3]
 
     movu    m1, [r2]
-    movu    m2, [r2 + r3]
+    movu    m2, [r2 + r3 * 2]
     psubw   m1, [r0]
-    psubw   m2, [r0 + r1]
+    psubw   m2, [r0 + r1 * 2]
     pabsw   m1, m1
     pabsw   m2, m2
-    paddw   m0, m1
-    paddw   m3, m2
-
-    movu    m1, [r2 + 2 * r3]
-    movu    m2, [r2 + r6]
-    psubw   m1, [r0 + 2 * r1]
-    psubw   m2, [r0 + r5]
-    pabsw   m1, m1
-    pabsw   m2, m2
-    paddw   m0, m1
-    paddw   m3, m2
-
-    lea     r0, [r0 + 4 * r1]
-    lea     r2, [r2 + 4 * r3]
-
-    dec    r4d
-    jg .loop
-
-    HADDUWD m0, m1
-    HADDUWD m3, m1
-    HADDD   m0, m1
-    HADDD   m3, m1
+    paddw   m1, m2
+    pmaddwd m3, m4
     paddd   m0, m3
+    pmaddwd m1, m4
+    paddd   m0, m1
+    lea     r0, [r0+4*r1]
+    lea     r2, [r2+4*r3]
+    dec     r4d
+    jg      .loop
 
+    HADDD   m0, m1
     movd    eax, xm0
     RET
 
 INIT_YMM avx2
-cglobal pixel_sad_32x8, 4,7,5
+cglobal pixel_sad_32x8, 4,7,7
     pxor    m0, m0
     mov     r4d, 8/4
+    mova    m6, [pw_1]
     add     r3d, r3d
     add     r1d, r1d
-    lea     r5,     [r1 * 3]
-    lea     r6,     [r3 * 3]
+    lea     r5d,     [r1 * 3]
+    lea     r6d,     [r3 * 3]
 .loop:
     movu    m1, [r2]
     movu    m2, [r2 + 32]
@@ -499,8 +472,7 @@
     pabsw   m4, m4
     paddw   m1, m2
     paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
+    paddw   m5, m1, m3
 
     movu    m1, [r2 + 2 * r3]
     movu    m2, [r2 + 2 * r3 + 32]
@@ -518,24 +490,28 @@
     pabsw   m4, m4
     paddw   m1, m2
     paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
+    paddw   m1, m3
 
+    pmaddwd m5, m6
+    paddd   m0, m5
+    pmaddwd m1, m6
+    paddd   m0, m1
     dec    r4d
     jg .loop
 
-    HADDW   m0, m1
+    HADDD   m0, m1
     movd    eax, xm0
     RET
 
 INIT_YMM avx2
-cglobal pixel_sad_32x16, 4,7,5
+cglobal pixel_sad_32x16, 4,7,7
     pxor    m0, m0
     mov     r4d, 16/8
+    mova    m6, [pw_1]
     add     r3d, r3d
     add     r1d, r1d
-    lea     r5,     [r1 * 3]
-    lea     r6,     [r3 * 3]
+    lea     r5d,     [r1 * 3]
+    lea     r6d,     [r3 * 3]
 .loop:
     movu    m1, [r2]
     movu    m2, [r2 + 32]
@@ -551,8 +527,7 @@
     pabsw   m4, m4
     paddw   m1, m2
     paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
+    paddw   m5, m1, m3
 
     movu    m1, [r2 + 2 * r3]
     movu    m2, [r2 + 2 * r3 + 32]
@@ -570,8 +545,12 @@
     pabsw   m4, m4
     paddw   m1, m2
     paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
+    paddw   m1, m3
+
+    pmaddwd m5, m6
+    paddd   m0, m5
+    pmaddwd m1, m6
+    paddd   m0, m1
 
     movu    m1, [r2]
     movu    m2, [r2 + 32]
@@ -587,8 +566,7 @@
     pabsw   m4, m4
     paddw   m1, m2
     paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
+    paddw   m5, m1, m3
 
     movu    m1, [r2 + 2 * r3]
     movu    m2, [r2 + 2 * r3 + 32]
@@ -606,24 +584,28 @@
     pabsw   m4, m4
     paddw   m1, m2
     paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
+    paddw   m1, m3
 
+    pmaddwd m5, m6
+    paddd   m0, m5
+    pmaddwd m1, m6
+    paddd   m0, m1
     dec    r4d
     jg .loop
 
-    HADDW   m0, m1
+    HADDD   m0, m1
     movd    eax, xm0
     RET
 
 INIT_YMM avx2

 
@@ -413,77 +413,50 @@
 SAD  16, 32
 
 INIT_YMM avx2
-cglobal pixel_sad_16x64, 4,7,4
+cglobal pixel_sad_16x64, 4,5,5
     pxor    m0, m0
-    pxor    m3, m3
-    mov     r4d, 64 / 8
-    add     r3d, r3d
-    add     r1d, r1d
-    lea     r5,     [r1 * 3]
-    lea     r6,     [r3 * 3]
+    mov     r4d, 16
+    mova    m4, [pw_1]
 .loop:
     movu    m1, [r2]
-    movu    m2, [r2 + r3]
+    movu    m2, [r2 + r3 * 2]
     psubw   m1, [r0]
-    psubw   m2, [r0 + r1]
-    pabsw   m1, m1
-    pabsw   m2, m2
-    paddw   m0, m1
-    paddw   m3, m2
-
-    movu    m1, [r2 + 2 * r3]
-    movu    m2, [r2 + r6]
-    psubw   m1, [r0 + 2 * r1]
-    psubw   m2, [r0 + r5]
+    psubw   m2, [r0 + r1 * 2]
     pabsw   m1, m1
     pabsw   m2, m2
-    paddw   m0, m1
-    paddw   m3, m2
-
+    paddw   m3, m1, m2
     lea     r0, [r0 + 4 * r1]
     lea     r2, [r2 + 4 * r3]
 
     movu    m1, [r2]
-    movu    m2, [r2 + r3]
+    movu    m2, [r2 + r3 * 2]
     psubw   m1, [r0]
-    psubw   m2, [r0 + r1]
+    psubw   m2, [r0 + r1 * 2]
     pabsw   m1, m1
     pabsw   m2, m2
-    paddw   m0, m1
-    paddw   m3, m2
-
-    movu    m1, [r2 + 2 * r3]
-    movu    m2, [r2 + r6]
-    psubw   m1, [r0 + 2 * r1]
-    psubw   m2, [r0 + r5]
-    pabsw   m1, m1
-    pabsw   m2, m2
-    paddw   m0, m1
-    paddw   m3, m2
-
-    lea     r0, [r0 + 4 * r1]
-    lea     r2, [r2 + 4 * r3]
-
-    dec    r4d
-    jg .loop
-
-    HADDUWD m0, m1
-    HADDUWD m3, m1
-    HADDD   m0, m1
-    HADDD   m3, m1
+    paddw   m1, m2
+    pmaddwd m3, m4
     paddd   m0, m3
+    pmaddwd m1, m4
+    paddd   m0, m1
+    lea     r0, [r0+4*r1]
+    lea     r2, [r2+4*r3]
+    dec     r4d
+    jg      .loop
 
+    HADDD   m0, m1
     movd    eax, xm0
     RET
 
 INIT_YMM avx2
-cglobal pixel_sad_32x8, 4,7,5
+cglobal pixel_sad_32x8, 4,7,7
     pxor    m0, m0
     mov     r4d, 8/4
+    mova    m6, [pw_1]
     add     r3d, r3d
     add     r1d, r1d
-    lea     r5,     [r1 * 3]
-    lea     r6,     [r3 * 3]
+    lea     r5d,     [r1 * 3]
+    lea     r6d,     [r3 * 3]
 .loop:
     movu    m1, [r2]
     movu    m2, [r2 + 32]
@@ -499,8 +472,7 @@
     pabsw   m4, m4
     paddw   m1, m2
     paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
+    paddw   m5, m1, m3
 
     movu    m1, [r2 + 2 * r3]
     movu    m2, [r2 + 2 * r3 + 32]
@@ -518,24 +490,28 @@
     pabsw   m4, m4
     paddw   m1, m2
     paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
+    paddw   m1, m3
 
+    pmaddwd m5, m6
+    paddd   m0, m5
+    pmaddwd m1, m6
+    paddd   m0, m1
     dec    r4d
     jg .loop
 
-    HADDW   m0, m1
+    HADDD   m0, m1
     movd    eax, xm0
     RET
 
 INIT_YMM avx2
-cglobal pixel_sad_32x16, 4,7,5
+cglobal pixel_sad_32x16, 4,7,7
     pxor    m0, m0
     mov     r4d, 16/8
+    mova    m6, [pw_1]
     add     r3d, r3d
     add     r1d, r1d
-    lea     r5,     [r1 * 3]
-    lea     r6,     [r3 * 3]
+    lea     r5d,     [r1 * 3]
+    lea     r6d,     [r3 * 3]
 .loop:
     movu    m1, [r2]
     movu    m2, [r2 + 32]
@@ -551,8 +527,7 @@
     pabsw   m4, m4
     paddw   m1, m2
     paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
+    paddw   m5, m1, m3
 
     movu    m1, [r2 + 2 * r3]
     movu    m2, [r2 + 2 * r3 + 32]
@@ -570,8 +545,12 @@
     pabsw   m4, m4
     paddw   m1, m2
     paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
+    paddw   m1, m3
+
+    pmaddwd m5, m6
+    paddd   m0, m5
+    pmaddwd m1, m6
+    paddd   m0, m1
 
     movu    m1, [r2]
     movu    m2, [r2 + 32]
@@ -587,8 +566,7 @@
     pabsw   m4, m4
     paddw   m1, m2
     paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
+    paddw   m5, m1, m3
 
     movu    m1, [r2 + 2 * r3]
     movu    m2, [r2 + 2 * r3 + 32]
@@ -606,24 +584,28 @@
     pabsw   m4, m4
     paddw   m1, m2
     paddw   m3, m4
-    paddw   m0, m1
-    paddw   m0, m3
+    paddw   m1, m3
 
+    pmaddwd m5, m6
+    paddd   m0, m5
+    pmaddwd m1, m6
+    paddd   m0, m1
     dec    r4d
     jg .loop
 
-    HADDW   m0, m1
+    HADDD   m0, m1
     movd    eax, xm0
     RET
 
 INIT_YMM avx2
​

x265_1.8.tar.gz/source/common/x86/ssd-a.asm -> x265_1.9.tar.gz/source/common/x86/ssd-a.asm Changed

@@ -2,11 +2,13 @@
 ;* ssd-a.asm: x86 ssd functions
 ;*****************************************************************************
 ;* Copyright (C) 2003-2013 x264 project
+;* Copyright (C) 2013-2015 x265 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
 ;*          Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Alex Izvorski <aizvorksi@gmail.com>
+;*          Min Chen <chenm003@163.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -105,8 +107,32 @@
     dec    r4d
     jg .loop
 %endif
+%if BIT_DEPTH == 12 && %1 >= 16 && %2 >=16
+%if  mmsize == 16
+    movu            m5, m0
+    pxor            m6, m6
+    punpckldq       m0, m6
+    punpckhdq       m5, m6
+    paddq           m0, m5
+    movhlps         m5, m0
+    paddq           m0, m5
+    movq            r6, xm0
+%elif mmsize == 32
+    movu            m1, m0
+    pxor            m2, m2
+    punpckldq       m0, m2
+    punpckhdq       m1, m2
+    paddq           m0, m1
+    vextracti128    xm2, m0, 1
+    paddq           xm2, xm0
+    movhlps         xm1, xm2
+    paddq           xm2, xm1
+    movq            rax, xm2
+%endif
+%else 
     HADDD   m0, m5
-    movd   eax, xm0
+    movd    eax,xm0
+%endif
 %ifidn movu,movq ; detect MMX
     EMMS
 %endif
@@ -168,6 +194,154 @@
     movq        rax, m9
     RET
 %endmacro
+;-----------------------------------------------------------------------------
+; SSD_ONE_SS_32: emits pixel_ssd_ss_32x32
+;
+; Sum of squared differences between two blocks of 16-bit samples.
+; In:   r0 = first block, r1 = its stride, r2 = second block, r3 = its stride
+;       (strides are doubled on entry - presumably element counts converted
+;        to byte offsets; confirm against the C prototype)
+; Out:  rax = 64-bit sum
+; GPRs: r4 = outer counter (2 passes), r5 = inner counter (16 rows per pass)
+; Vec:  m4/m7 = 32-bit partial sums for the current pass
+;       m5/m6 = 64-bit running totals
+;-----------------------------------------------------------------------------
+%macro SSD_ONE_SS_32 0
+; r5 is used as the inner row counter below, so six GPRs must be declared
+; (the previous "4,5,8" only covered r0-r4).
+cglobal pixel_ssd_ss_32x32, 4,6,8
+    add         r1d, r1d
+    add         r3d, r3d
+    pxor        m5, m5
+    pxor        m6, m6
+    mov         r4d, 2
+
+.iterate:
+    mov         r5d, 16
+    pxor        m4, m4
+    pxor        m7, m7
+.loop:
+    ; one row = 4 * mmsize bytes; pmaddwd x,x squares each word difference
+    ; and adds adjacent pairs into 32-bit lanes
+    movu        m0, [r0]
+    movu        m1, [r0 + mmsize]
+    movu        m2, [r2]
+    movu        m3, [r2 + mmsize]
+    psubw       m0, m2
+    psubw       m1, m3
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    paddd       m4, m0
+    paddd       m7, m1
+    movu        m0, [r0 + 2 * mmsize]
+    movu        m1, [r0 + 3 * mmsize]
+    movu        m2, [r2 + 2 * mmsize]
+    movu        m3, [r2 + 3 * mmsize]
+    psubw       m0, m2
+    psubw       m1, m3
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    paddd       m4, m0
+    paddd       m7, m1
+
+    add         r0, r1
+    add         r2, r3
+
+    dec         r5d
+    jnz         .loop
+
+    ; widen the 32-bit partial sums to 64 bits (interleave with zero) and
+    ; fold them into the running totals - presumably done every 16 rows so
+    ; the 32-bit lanes cannot overflow
+    mova        m0, m4
+    pxor        m1, m1
+    punpckldq   m0, m1
+    punpckhdq   m4, m1
+    paddq       m5, m0
+    paddq       m6, m4
+
+    mova        m0, m7
+    punpckldq   m0, m1
+    punpckhdq   m7, m1
+    paddq       m5, m0
+    paddq       m6, m7
+
+    dec         r4d
+    jnz         .iterate
+
+    ; horizontal reduction: add both totals, then high qword onto low qword
+    paddq       m5, m6
+    movhlps     m2, m5
+    paddq       m5, m2
+    movq        rax, m5
+    RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; SSD_ONE_SS_64: emits pixel_ssd_ss_64x64
+;
+; Sum of squared differences between two blocks of 16-bit samples.
+; In:   r0 = first block, r1 = its stride, r2 = second block, r3 = its stride
+;       (strides are doubled on entry - presumably element counts converted
+;        to byte offsets; confirm against the C prototype)
+; Out:  rax = 64-bit sum
+; GPRs: r5 = outer counter (8 passes), r4 = inner counter (8 rows per pass)
+; Vec:  m4/m7 = 32-bit partial sums for the current pass
+;       m5/m6 = 64-bit running totals
+;-----------------------------------------------------------------------------
+%macro SSD_ONE_SS_64 0
+cglobal pixel_ssd_ss_64x64, 4,6,8
+    add         r1d, r1d
+    add         r3d, r3d
+    pxor        m5, m5
+    pxor        m6, m6
+    mov         r5d, 8
+
+.iterate:
+    pxor        m4, m4
+    pxor        m7, m7
+    mov         r4d, 8
+
+.loop:
+    ; one row = 8 * mmsize bytes; pmaddwd x,x squares each word difference
+    ; and adds adjacent pairs into 32-bit lanes
+    ;---- first half of the row ----
+    movu        m0, [r0]
+    movu        m1, [r0 + mmsize]
+    movu        m2, [r2]
+    movu        m3, [r2 + mmsize]
+    psubw       m0, m2
+    psubw       m1, m3
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    paddd       m4, m0
+    paddd       m7, m1
+    movu        m0, [r0 + 2 * mmsize]
+    movu        m1, [r0 + 3 * mmsize]
+    movu        m2, [r2 + 2 * mmsize]
+    movu        m3, [r2 + 3 * mmsize]
+    psubw       m0, m2
+    psubw       m1, m3
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    paddd       m4, m0
+    paddd       m7, m1
+    ;---- second half of the row ----
+    movu        m0, [r0 + 4 * mmsize]
+    movu        m1, [r0 + 5 * mmsize]
+    movu        m2, [r2 + 4 * mmsize]
+    movu        m3, [r2 + 5 * mmsize]
+    psubw       m0, m2
+    psubw       m1, m3
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    paddd       m4, m0
+    paddd       m7, m1
+    movu        m0, [r0 + 6 * mmsize]
+    movu        m1, [r0 + 7 * mmsize]
+    movu        m2, [r2 + 6 * mmsize]
+    movu        m3, [r2 + 7 * mmsize]
+    psubw       m0, m2
+    psubw       m1, m3
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    paddd       m4, m0
+    paddd       m7, m1
+
+    add         r0, r1
+    add         r2, r3
+
+    dec         r4d
+    jnz         .loop
+
+    ; widen the 32-bit partial sums to 64 bits (interleave with zero) and
+    ; fold them into the running totals - presumably done every 8 rows so
+    ; the 32-bit lanes cannot overflow
+    mova        m0, m4
+    pxor        m1, m1
+    punpckldq   m0, m1
+    punpckhdq   m4, m1
+    paddq       m5, m0
+    paddq       m6, m4
+
+    mova        m0, m7
+    punpckldq   m0, m1
+    punpckhdq   m7, m1
+    paddq       m5, m0
+    paddq       m6, m7
+
+    ; the counter was initialised through r5d, so step it at the same width
+    dec         r5d
+    jnz         .iterate
+
+    ; horizontal reduction: add both totals, then high qword onto low qword
+    paddq       m5, m6
+    movhlps     m2, m5
+    paddq       m5, m2
+    movq        rax, m5
+    RET
+%endmacro

 
@@ -2,11 +2,13 @@
 ;* ssd-a.asm: x86 ssd functions
 ;*****************************************************************************
 ;* Copyright (C) 2003-2013 x264 project
+;* Copyright (C) 2013-2015 x265 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Fiona Glaser <fiona@x264.com>
 ;*          Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Alex Izvorski <aizvorksi@gmail.com>
+;*          Min Chen <chenm003@163.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -105,8 +107,32 @@
     dec    r4d
     jg .loop
 %endif
+%if BIT_DEPTH == 12 && %1 >= 16 && %2 >=16
+%if  mmsize == 16
+    movu            m5, m0
+    pxor            m6, m6
+    punpckldq       m0, m6
+    punpckhdq       m5, m6
+    paddq           m0, m5
+    movhlps         m5, m0
+    paddq           m0, m5
+    movq            r6, xm0
+%elif mmsize == 32
+    movu            m1, m0
+    pxor            m2, m2
+    punpckldq       m0, m2
+    punpckhdq       m1, m2
+    paddq           m0, m1
+    vextracti128    xm2, m0, 1
+    paddq           xm2, xm0
+    movhlps         xm1, xm2
+    paddq           xm2, xm1
+    movq            rax, xm2
+%endif
+%else 
     HADDD   m0, m5
-    movd   eax, xm0
+    movd    eax,xm0
+%endif
 %ifidn movu,movq ; detect MMX
     EMMS
 %endif
@@ -168,6 +194,154 @@
     movq        rax, m9
     RET
 %endmacro
+;-----------------------------------------------------------------------------
+; SSD_ONE_SS_32: emits pixel_ssd_ss_32x32
+;
+; Sum of squared differences between two blocks of 16-bit samples.
+; In:   r0 = first block, r1 = its stride, r2 = second block, r3 = its stride
+;       (strides are doubled on entry - presumably element counts converted
+;        to byte offsets; confirm against the C prototype)
+; Out:  rax = 64-bit sum
+; GPRs: r4 = outer counter (2 passes), r5 = inner counter (16 rows per pass)
+; Vec:  m4/m7 = 32-bit partial sums for the current pass
+;       m5/m6 = 64-bit running totals
+;-----------------------------------------------------------------------------
+%macro SSD_ONE_SS_32 0
+; r5 is used as the inner row counter below, so six GPRs must be declared
+; (the previous "4,5,8" only covered r0-r4).
+cglobal pixel_ssd_ss_32x32, 4,6,8
+    add         r1d, r1d
+    add         r3d, r3d
+    pxor        m5, m5
+    pxor        m6, m6
+    mov         r4d, 2
+
+.iterate:
+    mov         r5d, 16
+    pxor        m4, m4
+    pxor        m7, m7
+.loop:
+    ; one row = 4 * mmsize bytes; pmaddwd x,x squares each word difference
+    ; and adds adjacent pairs into 32-bit lanes
+    movu        m0, [r0]
+    movu        m1, [r0 + mmsize]
+    movu        m2, [r2]
+    movu        m3, [r2 + mmsize]
+    psubw       m0, m2
+    psubw       m1, m3
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    paddd       m4, m0
+    paddd       m7, m1
+    movu        m0, [r0 + 2 * mmsize]
+    movu        m1, [r0 + 3 * mmsize]
+    movu        m2, [r2 + 2 * mmsize]
+    movu        m3, [r2 + 3 * mmsize]
+    psubw       m0, m2
+    psubw       m1, m3
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    paddd       m4, m0
+    paddd       m7, m1
+
+    add         r0, r1
+    add         r2, r3
+
+    dec         r5d
+    jnz         .loop
+
+    ; widen the 32-bit partial sums to 64 bits (interleave with zero) and
+    ; fold them into the running totals - presumably done every 16 rows so
+    ; the 32-bit lanes cannot overflow
+    mova        m0, m4
+    pxor        m1, m1
+    punpckldq   m0, m1
+    punpckhdq   m4, m1
+    paddq       m5, m0
+    paddq       m6, m4
+
+    mova        m0, m7
+    punpckldq   m0, m1
+    punpckhdq   m7, m1
+    paddq       m5, m0
+    paddq       m6, m7
+
+    dec         r4d
+    jnz         .iterate
+
+    ; horizontal reduction: add both totals, then high qword onto low qword
+    paddq       m5, m6
+    movhlps     m2, m5
+    paddq       m5, m2
+    movq        rax, m5
+    RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; SSD_ONE_SS_64: emits pixel_ssd_ss_64x64
+;
+; Sum of squared differences between two blocks of 16-bit samples.
+; In:   r0 = first block, r1 = its stride, r2 = second block, r3 = its stride
+;       (strides are doubled on entry - presumably element counts converted
+;        to byte offsets; confirm against the C prototype)
+; Out:  rax = 64-bit sum
+; GPRs: r5 = outer counter (8 passes), r4 = inner counter (8 rows per pass)
+; Vec:  m4/m7 = 32-bit partial sums for the current pass
+;       m5/m6 = 64-bit running totals
+;-----------------------------------------------------------------------------
+%macro SSD_ONE_SS_64 0
+cglobal pixel_ssd_ss_64x64, 4,6,8
+    add         r1d, r1d
+    add         r3d, r3d
+    pxor        m5, m5
+    pxor        m6, m6
+    mov         r5d, 8
+
+.iterate:
+    pxor        m4, m4
+    pxor        m7, m7
+    mov         r4d, 8
+
+.loop:
+    ; one row = 8 * mmsize bytes; pmaddwd x,x squares each word difference
+    ; and adds adjacent pairs into 32-bit lanes
+    ;---- first half of the row ----
+    movu        m0, [r0]
+    movu        m1, [r0 + mmsize]
+    movu        m2, [r2]
+    movu        m3, [r2 + mmsize]
+    psubw       m0, m2
+    psubw       m1, m3
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    paddd       m4, m0
+    paddd       m7, m1
+    movu        m0, [r0 + 2 * mmsize]
+    movu        m1, [r0 + 3 * mmsize]
+    movu        m2, [r2 + 2 * mmsize]
+    movu        m3, [r2 + 3 * mmsize]
+    psubw       m0, m2
+    psubw       m1, m3
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    paddd       m4, m0
+    paddd       m7, m1
+    ;---- second half of the row ----
+    movu        m0, [r0 + 4 * mmsize]
+    movu        m1, [r0 + 5 * mmsize]
+    movu        m2, [r2 + 4 * mmsize]
+    movu        m3, [r2 + 5 * mmsize]
+    psubw       m0, m2
+    psubw       m1, m3
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    paddd       m4, m0
+    paddd       m7, m1
+    movu        m0, [r0 + 6 * mmsize]
+    movu        m1, [r0 + 7 * mmsize]
+    movu        m2, [r2 + 6 * mmsize]
+    movu        m3, [r2 + 7 * mmsize]
+    psubw       m0, m2
+    psubw       m1, m3
+    pmaddwd     m0, m0
+    pmaddwd     m1, m1
+    paddd       m4, m0
+    paddd       m7, m1
+
+    add         r0, r1
+    add         r2, r3
+
+    dec         r4d
+    jnz         .loop
+
+    ; widen the 32-bit partial sums to 64 bits (interleave with zero) and
+    ; fold them into the running totals - presumably done every 8 rows so
+    ; the 32-bit lanes cannot overflow
+    mova        m0, m4
+    pxor        m1, m1
+    punpckldq   m0, m1
+    punpckhdq   m4, m1
+    paddq       m5, m0
+    paddq       m6, m4
+
+    mova        m0, m7
+    punpckldq   m0, m1
+    punpckhdq   m7, m1
+    paddq       m5, m0
+    paddq       m6, m7
+
+    ; the counter was initialised through r5d, so step it at the same width
+    dec         r5d
+    jnz         .iterate
+
+    ; horizontal reduction: add both totals, then high qword onto low qword
+    paddq       m5, m6
+    movhlps     m2, m5
+    paddq       m5, m2
+    movq        rax, m5
+    RET
+%endmacro
​

x265_1.8.tar.gz/source/common/x86/x86util.asm -> x265_1.9.tar.gz/source/common/x86/x86util.asm Changed

 
@@ -5,6 +5,7 @@
 ;*
 ;* Authors: Holger Lubitz <holger@lubitz.org>
 ;*          Loren Merritt <lorenm@u.washington.edu>
+;*          Min Chen <chenm003@163.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
​

x265_1.8.tar.gz/source/common/yuv.cpp -> x265_1.9.tar.gz/source/common/yuv.cpp Changed

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2015 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -50,7 +51,7 @@
     {
         CHECKED_MALLOC(m_buf[0], pixel, size * size + 8);
         m_buf[1] = m_buf[2] = 0;
-        m_csize = MAX_INT;
+        m_csize = 0;
         return true;
     }
     else
@@ -82,22 +83,26 @@
 {
     pixel* dstY = dstPic.getLumaAddr(cuAddr, absPartIdx);
     primitives.cu[m_part].copy_pp(dstY, dstPic.m_stride, m_buf[0], m_size);
-
-    pixel* dstU = dstPic.getCbAddr(cuAddr, absPartIdx);
-    pixel* dstV = dstPic.getCrAddr(cuAddr, absPartIdx);
-    primitives.chroma[m_csp].cu[m_part].copy_pp(dstU, dstPic.m_strideC, m_buf[1], m_csize);
-    primitives.chroma[m_csp].cu[m_part].copy_pp(dstV, dstPic.m_strideC, m_buf[2], m_csize);
+    if (m_csp != X265_CSP_I400)
+    {
+        pixel* dstU = dstPic.getCbAddr(cuAddr, absPartIdx);
+        pixel* dstV = dstPic.getCrAddr(cuAddr, absPartIdx);
+        primitives.chroma[m_csp].cu[m_part].copy_pp(dstU, dstPic.m_strideC, m_buf[1], m_csize);
+        primitives.chroma[m_csp].cu[m_part].copy_pp(dstV, dstPic.m_strideC, m_buf[2], m_csize);
+    }
 }
 
 void Yuv::copyFromPicYuv(const PicYuv& srcPic, uint32_t cuAddr, uint32_t absPartIdx)
 {
     const pixel* srcY = srcPic.getLumaAddr(cuAddr, absPartIdx);
     primitives.cu[m_part].copy_pp(m_buf[0], m_size, srcY, srcPic.m_stride);
-
-    const pixel* srcU = srcPic.getCbAddr(cuAddr, absPartIdx);
-    const pixel* srcV = srcPic.getCrAddr(cuAddr, absPartIdx);
-    primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcU, srcPic.m_strideC);
-    primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcV, srcPic.m_strideC);
+    if (m_csp != X265_CSP_I400)
+    {
+        const pixel* srcU = srcPic.getCbAddr(cuAddr, absPartIdx);
+        const pixel* srcV = srcPic.getCrAddr(cuAddr, absPartIdx);
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcU, srcPic.m_strideC);
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcV, srcPic.m_strideC);
+    }
 }
 
 void Yuv::copyFromYuv(const Yuv& srcYuv)
@@ -105,8 +110,11 @@
     X265_CHECK(m_size >= srcYuv.m_size, "invalid size\n");
 
     primitives.cu[m_part].copy_pp(m_buf[0], m_size, srcYuv.m_buf[0], srcYuv.m_size);
-    primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcYuv.m_buf[1], srcYuv.m_csize);
-    primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcYuv.m_buf[2], srcYuv.m_csize);
+    if (m_csp != X265_CSP_I400)
+    {
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcYuv.m_buf[1], srcYuv.m_csize);
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcYuv.m_buf[2], srcYuv.m_csize);
+    }
 }
 
 /* This version is intended for use by ME, which required FENC_STRIDE for luma fenc pixels */
@@ -130,11 +138,13 @@
 {
     pixel* dstY = dstYuv.getLumaAddr(absPartIdx);
     primitives.cu[m_part].copy_pp(dstY, dstYuv.m_size, m_buf[0], m_size);
-
-    pixel* dstU = dstYuv.getCbAddr(absPartIdx);
-    pixel* dstV = dstYuv.getCrAddr(absPartIdx);
-    primitives.chroma[m_csp].cu[m_part].copy_pp(dstU, dstYuv.m_csize, m_buf[1], m_csize);
-    primitives.chroma[m_csp].cu[m_part].copy_pp(dstV, dstYuv.m_csize, m_buf[2], m_csize);
+    if (m_csp != X265_CSP_I400)
+    {
+        pixel* dstU = dstYuv.getCbAddr(absPartIdx);
+        pixel* dstV = dstYuv.getCrAddr(absPartIdx);
+        primitives.chroma[m_csp].cu[m_part].copy_pp(dstU, dstYuv.m_csize, m_buf[1], m_csize);
+        primitives.chroma[m_csp].cu[m_part].copy_pp(dstV, dstYuv.m_csize, m_buf[2], m_csize);
+    }
 }
 
 void Yuv::copyPartToYuv(Yuv& dstYuv, uint32_t absPartIdx) const
@@ -142,20 +152,25 @@
     pixel* srcY = m_buf[0] + getAddrOffset(absPartIdx, m_size);
     pixel* dstY = dstYuv.m_buf[0];
     primitives.cu[dstYuv.m_part].copy_pp(dstY, dstYuv.m_size, srcY, m_size);
-
-    pixel* srcU = m_buf[1] + getChromaAddrOffset(absPartIdx);
-    pixel* srcV = m_buf[2] + getChromaAddrOffset(absPartIdx);
-    pixel* dstU = dstYuv.m_buf[1];
-    pixel* dstV = dstYuv.m_buf[2];
-    primitives.chroma[m_csp].cu[dstYuv.m_part].copy_pp(dstU, dstYuv.m_csize, srcU, m_csize);
-    primitives.chroma[m_csp].cu[dstYuv.m_part].copy_pp(dstV, dstYuv.m_csize, srcV, m_csize);
+    if (m_csp != X265_CSP_I400)
+    {
+        pixel* srcU = m_buf[1] + getChromaAddrOffset(absPartIdx);
+        pixel* srcV = m_buf[2] + getChromaAddrOffset(absPartIdx);
+        pixel* dstU = dstYuv.m_buf[1];
+        pixel* dstV = dstYuv.m_buf[2];
+        primitives.chroma[m_csp].cu[dstYuv.m_part].copy_pp(dstU, dstYuv.m_csize, srcU, m_csize);
+        primitives.chroma[m_csp].cu[dstYuv.m_part].copy_pp(dstV, dstYuv.m_csize, srcV, m_csize);
+    }
 }
 
 void Yuv::addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL)
 {
     primitives.cu[log2SizeL - 2].add_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
-    primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
-    primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
+    if (m_csp != X265_CSP_I400)
+    {
+        primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
+        primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
+    }
 }
 
 void Yuv::addAvg(const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t absPartIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma)
​

x265_1.8.tar.gz/source/encoder/analysis.cpp -> x265_1.9.tar.gz/source/encoder/analysis.cpp Changed

@@ -3,6 +3,7 @@
 *
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
 *          Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -71,12 +72,11 @@
 
 Analysis::Analysis()
 {
-    m_reuseIntraDataCTU = NULL;
     m_reuseInterDataCTU = NULL;
     m_reuseRef = NULL;
     m_reuseBestMergeCand = NULL;
+    m_reuseMv = NULL;
 }
-
 bool Analysis::create(ThreadLocalData *tld)
 {
     m_tld = tld;
@@ -127,9 +127,6 @@
     m_frame = &frame;
 
 #if _DEBUG || CHECKED_BUILD
-    for (uint32_t i = 0; i <= g_maxCUDepth; i++)
-        for (uint32_t j = 0; j < MAX_PRED_TYPES; j++)
-            m_modeDepth[i].pred[j].invalidate();
     invalidateContexts(0);
 #endif
 
@@ -140,40 +137,46 @@
     m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);
 
     uint32_t numPartition = ctu.m_numPartitions;
-    if (m_param->analysisMode)
+    if (m_param->analysisMode && m_slice->m_sliceType != I_SLICE)
     {
-        if (m_slice->m_sliceType == I_SLICE)
-            m_reuseIntraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
-        else
-        {
-            int numPredDir = m_slice->isInterP() ? 1 : 2;
-            m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
-            m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
-            m_reuseBestMergeCand = &m_reuseInterDataCTU->bestMergeCand[ctu.m_cuAddr * CUGeom::MAX_GEOMS];
-        }
+        int numPredDir = m_slice->isInterP() ? 1 : 2;
+        m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
+        m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
+        m_reuseBestMergeCand = &m_reuseInterDataCTU->bestMergeCand[ctu.m_cuAddr * CUGeom::MAX_GEOMS];
+        m_reuseMv = &m_reuseInterDataCTU->mv[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
     }
-
     ProfileCUScope(ctu, totalCTUTime, totalCTUs);
 
-    uint32_t zOrder = 0;
     if (m_slice->m_sliceType == I_SLICE)
     {
-        compressIntraCU(ctu, cuGeom, zOrder, qp);
-        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.intraData)
+        analysis_intra_data* intraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
+        if (m_param->analysisMode == X265_ANALYSIS_LOAD)
+        {
+            memcpy(ctu.m_cuDepth, &intraDataCTU->depth[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
+            memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
+            memcpy(ctu.m_partSize, &intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], sizeof(char) * numPartition);
+            memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
+        }
+        compressIntraCU(ctu, cuGeom, qp);
+        if (m_param->analysisMode == X265_ANALYSIS_SAVE && intraDataCTU)
         {
             CUData* bestCU = &m_modeDepth[0].bestMode->cu;
-            memcpy(&m_reuseIntraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
-            memcpy(&m_reuseIntraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
-            memcpy(&m_reuseIntraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
-            memcpy(&m_reuseIntraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], bestCU->m_chromaIntraDir, sizeof(uint8_t) * numPartition);
+            memcpy(&intraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
+            memcpy(&intraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
+            memcpy(&intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
+            memcpy(&intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], bestCU->m_chromaIntraDir, sizeof(uint8_t) * numPartition);
         }
     }
     else
     {
-        if (!m_param->rdLevel)
+        if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
+            ctu.m_cuPelX / g_maxCUSize >= frame.m_encData->m_pir.pirStartCol
+            && ctu.m_cuPelX / g_maxCUSize < frame.m_encData->m_pir.pirEndCol)
+            compressIntraCU(ctu, cuGeom, qp);
+        else if (!m_param->rdLevel)
         {
             /* In RD Level 0/1, copy source pixels into the reconstructed block so
-            * they are available for intra predictions */
+             * they are available for intra predictions */
             m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);
 
             compressInterCU_rd0_4(ctu, cuGeom, qp);
@@ -187,6 +190,7 @@
             compressInterCU_rd0_4(ctu, cuGeom, qp);
         else
         {
+            uint32_t zOrder = 0;
             compressInterCU_rd5_6(ctu, cuGeom, zOrder, qp);
             if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.interData)
             {
@@ -212,8 +216,7 @@
         md.pred[PRED_LOSSLESS].initCosts();
         md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
         PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
-        uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir;
-        checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes, NULL);
+        checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size);
         checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
     }
     else
@@ -226,7 +229,7 @@
     }
 }
 
-void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t& zOrder, int32_t qp)
+void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
 {
     uint32_t depth = cuGeom.depth;
     ModeDepth& md = m_modeDepth[depth];
@@ -235,42 +238,37 @@
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
 
-    if (m_param->analysisMode == X265_ANALYSIS_LOAD)
-    {
-        uint8_t* reuseDepth  = &m_reuseIntraDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
-        uint8_t* reuseModes  = &m_reuseIntraDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
-        char* reusePartSizes = &m_reuseIntraDataCTU->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
-        uint8_t* reuseChromaModes = &m_reuseIntraDataCTU->chromaModes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
+    bool bAlreadyDecided = parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX;
+    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
 
-        if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.absPartIdx)
+    if (bAlreadyDecided)
+    {
+        if (bDecidedDepth)
         {
-            PartSize size = (PartSize)reusePartSizes[zOrder];
-            Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
+            Mode& mode = md.pred[0];
+            md.bestMode = &mode;
             mode.cu.initSubCU(parentCTU, cuGeom, qp);
-            checkIntra(mode, cuGeom, size, &reuseModes[zOrder], &reuseChromaModes[zOrder]);
-            checkBestMode(mode, depth);
+            memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
+            memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
+            checkIntra(mode, cuGeom, (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx]);
 
             if (m_bTryLossless)
                 tryLossless(cuGeom);
 
             if (mightSplit)
                 addSplitFlagCost(*md.bestMode, cuGeom.depth);
-
-            // increment zOrder offset to point to next best depth in sharedDepth buffer
-            zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]];
-            mightSplit = false;
         }
     }
-    else if (mightNotSplit)
+    else if (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && mightNotSplit)
     {
         md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
-        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL, NULL);
+        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
         checkBestMode(md.pred[PRED_INTRA], depth);
 
         if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
         {
             md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
-            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL, NULL);
+            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
             checkBestMode(md.pred[PRED_INTRA_NxN], depth);
         }
 
@@ -281,6 +279,9 @@
             addSplitFlagCost(*md.bestMode, cuGeom.depth);
     }
 
+    // stop recursion if we reach the depth of previous analysis decision
+    mightSplit &= !(bAlreadyDecided && bDecidedDepth);
+
     if (mightSplit)
     {
         Mode* splitPred = &md.pred[PRED_SPLIT];
@@ -305,7 +306,7 @@
                 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                     nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
 
-                compressIntraCU(parentCTU, childGeom, zOrder, nextQP);
+                compressIntraCU(parentCTU, childGeom, nextQP);
 
                 // Save best CU and pred data for this sub CU

 
@@ -3,6 +3,7 @@
 *
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
 *          Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -71,12 +72,11 @@
 
 Analysis::Analysis()
 {
-    m_reuseIntraDataCTU = NULL;
     m_reuseInterDataCTU = NULL;
     m_reuseRef = NULL;
     m_reuseBestMergeCand = NULL;
+    m_reuseMv = NULL;
 }
-
 bool Analysis::create(ThreadLocalData *tld)
 {
     m_tld = tld;
@@ -127,9 +127,6 @@
     m_frame = &frame;
 
 #if _DEBUG || CHECKED_BUILD
-    for (uint32_t i = 0; i <= g_maxCUDepth; i++)
-        for (uint32_t j = 0; j < MAX_PRED_TYPES; j++)
-            m_modeDepth[i].pred[j].invalidate();
     invalidateContexts(0);
 #endif
 
@@ -140,40 +137,46 @@
     m_modeDepth[0].fencYuv.copyFromPicYuv(*m_frame->m_fencPic, ctu.m_cuAddr, 0);
 
     uint32_t numPartition = ctu.m_numPartitions;
-    if (m_param->analysisMode)
+    if (m_param->analysisMode && m_slice->m_sliceType != I_SLICE)
     {
-        if (m_slice->m_sliceType == I_SLICE)
-            m_reuseIntraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
-        else
-        {
-            int numPredDir = m_slice->isInterP() ? 1 : 2;
-            m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
-            m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
-            m_reuseBestMergeCand = &m_reuseInterDataCTU->bestMergeCand[ctu.m_cuAddr * CUGeom::MAX_GEOMS];
-        }
+        int numPredDir = m_slice->isInterP() ? 1 : 2;
+        m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
+        m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
+        m_reuseBestMergeCand = &m_reuseInterDataCTU->bestMergeCand[ctu.m_cuAddr * CUGeom::MAX_GEOMS];
+        m_reuseMv = &m_reuseInterDataCTU->mv[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
     }
-
     ProfileCUScope(ctu, totalCTUTime, totalCTUs);
 
-    uint32_t zOrder = 0;
     if (m_slice->m_sliceType == I_SLICE)
     {
-        compressIntraCU(ctu, cuGeom, zOrder, qp);
-        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.intraData)
+        analysis_intra_data* intraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
+        if (m_param->analysisMode == X265_ANALYSIS_LOAD)
+        {
+            memcpy(ctu.m_cuDepth, &intraDataCTU->depth[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
+            memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
+            memcpy(ctu.m_partSize, &intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], sizeof(char) * numPartition);
+            memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
+        }
+        compressIntraCU(ctu, cuGeom, qp);
+        if (m_param->analysisMode == X265_ANALYSIS_SAVE && intraDataCTU)
         {
             CUData* bestCU = &m_modeDepth[0].bestMode->cu;
-            memcpy(&m_reuseIntraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
-            memcpy(&m_reuseIntraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
-            memcpy(&m_reuseIntraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
-            memcpy(&m_reuseIntraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], bestCU->m_chromaIntraDir, sizeof(uint8_t) * numPartition);
+            memcpy(&intraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
+            memcpy(&intraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
+            memcpy(&intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
+            memcpy(&intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], bestCU->m_chromaIntraDir, sizeof(uint8_t) * numPartition);
         }
     }
     else
     {
-        if (!m_param->rdLevel)
+        if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
+            ctu.m_cuPelX / g_maxCUSize >= frame.m_encData->m_pir.pirStartCol
+            && ctu.m_cuPelX / g_maxCUSize < frame.m_encData->m_pir.pirEndCol)
+            compressIntraCU(ctu, cuGeom, qp);
+        else if (!m_param->rdLevel)
         {
             /* In RD Level 0/1, copy source pixels into the reconstructed block so
-            * they are available for intra predictions */
+             * they are available for intra predictions */
             m_modeDepth[0].fencYuv.copyToPicYuv(*m_frame->m_reconPic, ctu.m_cuAddr, 0);
 
             compressInterCU_rd0_4(ctu, cuGeom, qp);
@@ -187,6 +190,7 @@
             compressInterCU_rd0_4(ctu, cuGeom, qp);
         else
         {
+            uint32_t zOrder = 0;
             compressInterCU_rd5_6(ctu, cuGeom, zOrder, qp);
             if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.interData)
             {
@@ -212,8 +216,7 @@
         md.pred[PRED_LOSSLESS].initCosts();
         md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
         PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
-        uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir;
-        checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes, NULL);
+        checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size);
         checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
     }
     else
@@ -226,7 +229,7 @@
     }
 }
 
-void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t& zOrder, int32_t qp)
+void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
 {
     uint32_t depth = cuGeom.depth;
     ModeDepth& md = m_modeDepth[depth];
@@ -235,42 +238,37 @@
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
 
-    if (m_param->analysisMode == X265_ANALYSIS_LOAD)
-    {
-        uint8_t* reuseDepth  = &m_reuseIntraDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
-        uint8_t* reuseModes  = &m_reuseIntraDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
-        char* reusePartSizes = &m_reuseIntraDataCTU->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
-        uint8_t* reuseChromaModes = &m_reuseIntraDataCTU->chromaModes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
+    bool bAlreadyDecided = parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX;
+    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
 
-        if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.absPartIdx)
+    if (bAlreadyDecided)
+    {
+        if (bDecidedDepth)
         {
-            PartSize size = (PartSize)reusePartSizes[zOrder];
-            Mode& mode = size == SIZE_2Nx2N ? md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN];
+            Mode& mode = md.pred[0];
+            md.bestMode = &mode;
             mode.cu.initSubCU(parentCTU, cuGeom, qp);
-            checkIntra(mode, cuGeom, size, &reuseModes[zOrder], &reuseChromaModes[zOrder]);
-            checkBestMode(mode, depth);
+            memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
+            memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
+            checkIntra(mode, cuGeom, (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx]);
 
             if (m_bTryLossless)
                 tryLossless(cuGeom);
 
             if (mightSplit)
                 addSplitFlagCost(*md.bestMode, cuGeom.depth);
-
-            // increment zOrder offset to point to next best depth in sharedDepth buffer
-            zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]];
-            mightSplit = false;
         }
     }
-    else if (mightNotSplit)
+    else if (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && mightNotSplit)
     {
         md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
-        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL, NULL);
+        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
         checkBestMode(md.pred[PRED_INTRA], depth);
 
         if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
         {
             md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
-            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL, NULL);
+            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
             checkBestMode(md.pred[PRED_INTRA_NxN], depth);
         }
 
@@ -281,6 +279,9 @@
             addSplitFlagCost(*md.bestMode, cuGeom.depth);
     }
 
+    // stop recursion if we reach the depth of previous analysis decision
+    mightSplit &= !(bAlreadyDecided && bDecidedDepth);
+
     if (mightSplit)
     {
         Mode* splitPred = &md.pred[PRED_SPLIT];
@@ -305,7 +306,7 @@
                 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                     nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
 
-                compressIntraCU(parentCTU, childGeom, zOrder, nextQP);
+                compressIntraCU(parentCTU, childGeom, nextQP);
 
                 // Save best CU and pred data for this sub CU
​

x265_1.8.tar.gz/source/encoder/analysis.h -> x265_1.9.tar.gz/source/encoder/analysis.h Changed

 
@@ -3,6 +3,7 @@
 *
 * Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
 *          Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -40,6 +41,21 @@
 
 class Entropy;
 
+struct SplitData // result record returned by compressInterCU_rd0_4() and compressInterCU_rd5_6()
+{
+    uint32_t splitRefs;  // NOTE(review): presumably a mask of the references used by the sub-CUs - confirm against analysis.cpp
+    uint32_t mvCost[2];  // one entry per reference list: [0] = L0, [1] = L1
+    uint64_t sa8dCost;   // NOTE(review): presumably an accumulated SA8D cost - confirm against analysis.cpp
+
+    void initSplitCUData() // resets every field of the record to zero
+    {
+        splitRefs = 0;
+        mvCost[0] = 0; // L0
+        mvCost[1] = 0; // L1
+        sa8dCost    = 0;
+    }
+};
+
 class Analysis : public Search
 {
 public:
@@ -101,20 +117,20 @@
     Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
 
 protected:
-
     /* Analysis data for load/save modes, keeps getting incremented as CTU analysis proceeds and data is consumed or read */
-    analysis_intra_data* m_reuseIntraDataCTU;
     analysis_inter_data* m_reuseInterDataCTU;
+    MV*                  m_reuseMv;
     int32_t*             m_reuseRef;
     uint32_t*            m_reuseBestMergeCand;
+    uint32_t m_splitRefIdx[4];
 
     /* full analysis for an I-slice CU */
-    void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
+    void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
 
     /* full analysis for a P or B slice CU */
-    void compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
-    uint32_t compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
-    uint32_t compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
+    uint32_t compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
+    SplitData compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
+    SplitData compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
 
     /* measure merge and skip */
     void checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom);
@@ -139,13 +155,11 @@
     /* generate residual and recon pixels for an entire CTU recursively (RD0) */
     void encodeResidue(const CUData& parentCTU, const CUGeom& cuGeom);
 
-    int calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom);
+    int calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, double baseQP = -1);
 
     /* check whether current mode is the new best */
     inline void checkBestMode(Mode& mode, uint32_t depth)
     {
-        X265_CHECK(mode.ok(), "mode costs are uninitialized\n");
-
         ModeDepth& md = m_modeDepth[depth];
         if (md.bestMode)
         {
​

x265_1.8.tar.gz/source/encoder/api.cpp -> x265_1.9.tar.gz/source/encoder/api.cpp Changed

@@ -72,9 +72,7 @@
 #endif
 
 #if HIGH_BIT_DEPTH
-    if (X265_DEPTH == 12)
-        x265_log(p, X265_LOG_WARNING, "Main12 is HIGHLY experimental, do not use!\n");
-    else if (X265_DEPTH != 10 && X265_DEPTH != 12)
+    if (X265_DEPTH != 10 && X265_DEPTH != 12)
 #else
     if (X265_DEPTH != 8)
 #endif
@@ -247,6 +245,16 @@
     }
 }
 
+int x265_encoder_intra_refresh(x265_encoder *enc) // queues an intra refresh request; returns 0 on success, -1 if enc is NULL
+{
+    if (!enc)
+        return -1; // no encoder handle supplied
+
+    Encoder *encoder = static_cast<Encoder*>(enc);
+    encoder->m_bQueuedIntraRefresh = 1; // only raises the flag; Encoder::calcRefreshInterval() tests and clears it
+    return 0;
+}
+
 void x265_cleanup(void)
 {
     if (!g_ctuSizeConfigured)
@@ -268,6 +276,7 @@
     pic->bitDepth = param->internalBitDepth;
     pic->colorSpace = param->internalCsp;
     pic->forceqp = X265_QP_AUTO;
+    pic->quantOffsets = NULL;
     if (param->analysisMode)
     {
         uint32_t widthInCU       = (param->sourceWidth  + g_maxCUSize - 1) >> g_maxLog2CUSize;
@@ -318,6 +327,7 @@
     &x265_cleanup,
 
     sizeof(x265_frame_stats),
+    &x265_encoder_intra_refresh,
 };
 
 typedef const x265_api* (*api_get_func)(int bitDepth);

 
@@ -72,9 +72,7 @@
 #endif
 
 #if HIGH_BIT_DEPTH
-    if (X265_DEPTH == 12)
-        x265_log(p, X265_LOG_WARNING, "Main12 is HIGHLY experimental, do not use!\n");
-    else if (X265_DEPTH != 10 && X265_DEPTH != 12)
+    if (X265_DEPTH != 10 && X265_DEPTH != 12)
 #else
     if (X265_DEPTH != 8)
 #endif
@@ -247,6 +245,16 @@
     }
 }
 
+int x265_encoder_intra_refresh(x265_encoder *enc) // queues an intra refresh request; returns 0 on success, -1 if enc is NULL
+{
+    if (!enc)
+        return -1; // no encoder handle supplied
+
+    Encoder *encoder = static_cast<Encoder*>(enc);
+    encoder->m_bQueuedIntraRefresh = 1; // only raises the flag; Encoder::calcRefreshInterval() tests and clears it
+    return 0;
+}
+
 void x265_cleanup(void)
 {
     if (!g_ctuSizeConfigured)
@@ -268,6 +276,7 @@
     pic->bitDepth = param->internalBitDepth;
     pic->colorSpace = param->internalCsp;
     pic->forceqp = X265_QP_AUTO;
+    pic->quantOffsets = NULL;
     if (param->analysisMode)
     {
         uint32_t widthInCU       = (param->sourceWidth  + g_maxCUSize - 1) >> g_maxLog2CUSize;
@@ -318,6 +327,7 @@
     &x265_cleanup,
 
     sizeof(x265_frame_stats),
+    &x265_encoder_intra_refresh,
 };
 
 typedef const x265_api* (*api_get_func)(int bitDepth);
​

x265_1.8.tar.gz/source/encoder/bitcost.cpp -> x265_1.9.tar.gz/source/encoder/bitcost.cpp Changed

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -40,7 +41,12 @@
             x265_emms(); // just to be safe
 
             CalculateLogs();
-            s_costs[qp] = new uint16_t[4 * BC_MAX_MV + 1] + 2 * BC_MAX_MV;
+            s_costs[qp] = X265_MALLOC(uint16_t, 4 * BC_MAX_MV + 1) + 2 * BC_MAX_MV;
+            if (!s_costs[qp])
+            {
+                x265_log(NULL, X265_LOG_ERROR, "BitCost s_costs buffer allocation failure\n");
+                return;
+            }
             double lambda = x265_lambda_tab[qp];
 
             // estimate same cost for negative and positive MVD
@@ -66,11 +72,16 @@
 {
     if (!s_bitsizes)
     {
-        s_bitsizes = new float[2 * BC_MAX_MV + 1];
+        s_bitsizes = X265_MALLOC(float, 4 * BC_MAX_MV + 1) + 2 * BC_MAX_MV;
+        if (!s_bitsizes)
+        {
+            x265_log(NULL, X265_LOG_ERROR, "BitCost s_bitsizes buffer allocation failure\n");
+            return;
+        }
         s_bitsizes[0] = 0.718f;
         float log2_2 = 2.0f / log(2.0f);  // 2 x 1/log(2)
         for (int i = 1; i <= 2 * BC_MAX_MV; i++)
-            s_bitsizes[i] = log((float)(i + 1)) * log2_2 + 1.718f;
+            s_bitsizes[i] = s_bitsizes[-i] = log((float)(i + 1)) * log2_2 + 1.718f;
     }
 }
 
@@ -80,12 +91,15 @@
     {
         if (s_costs[i])
         {
-            delete [] (s_costs[i] - 2 * BC_MAX_MV);
+            X265_FREE(s_costs[i] - 2 * BC_MAX_MV);
 
-            s_costs[i] = 0;
+            s_costs[i] = NULL;
         }
     }
 
-    delete [] s_bitsizes;
-    s_bitsizes = 0;
+    if (s_bitsizes)
+    {
+        X265_FREE(s_bitsizes - 2 * BC_MAX_MV);
+        s_bitsizes = NULL;
+    }
 }
​

x265_1.8.tar.gz/source/encoder/bitcost.h -> x265_1.9.tar.gz/source/encoder/bitcost.h Changed

 
@@ -47,14 +47,14 @@
     // return bit cost of motion vector difference, without lambda
     inline uint32_t bitcost(const MV& mv) const
     {
-        return (uint32_t)(s_bitsizes[abs(mv.x - m_mvp.x)] +
-                          s_bitsizes[abs(mv.y - m_mvp.y)] + 0.5f);
+        return (uint32_t)(s_bitsizes[mv.x - m_mvp.x] +
+                          s_bitsizes[mv.y - m_mvp.y] + 0.5f);
     }
 
     static inline uint32_t bitcost(const MV& mv, const MV& mvp)
     {
-        return (uint32_t)(s_bitsizes[abs(mv.x - mvp.x)] +
-                          s_bitsizes[abs(mv.y - mvp.y)] + 0.5f);
+        return (uint32_t)(s_bitsizes[mv.x - mvp.x] +
+                          s_bitsizes[mv.y - mvp.y] + 0.5f);
     }
 
     static void destroy();
​

x265_1.8.tar.gz/source/encoder/dpb.cpp -> x265_1.9.tar.gz/source/encoder/dpb.cpp Changed

@@ -47,16 +47,16 @@
         delete curFrame;
     }
 
-    while (m_picSymFreeList)
+    while (m_frameDataFreeList)
     {
-        FrameData* next = m_picSymFreeList->m_freeListNext;
-        m_picSymFreeList->destroy();
+        FrameData* next = m_frameDataFreeList->m_freeListNext;
+        m_frameDataFreeList->destroy();
 
-        m_picSymFreeList->m_reconPic->destroy();
-        delete m_picSymFreeList->m_reconPic;
+        m_frameDataFreeList->m_reconPic->destroy();
+        delete m_frameDataFreeList->m_reconPic;
 
-        delete m_picSymFreeList;
-        m_picSymFreeList = next;
+        delete m_frameDataFreeList;
+        m_frameDataFreeList = next;
     }
 }
 
@@ -74,13 +74,19 @@
             curFrame->m_reconRowCount.set(0);
             curFrame->m_bChromaExtended = false;
 
+            // Reset column counter
+            X265_CHECK(curFrame->m_reconColCount != NULL, "curFrame->m_reconColCount check failure");
+            X265_CHECK(curFrame->m_numRows > 0, "curFrame->m_numRows check failure");
+            for(int32_t col = 0; col < curFrame->m_numRows; col++)
+                curFrame->m_reconColCount[col].set(0);
+
             // iterator is invalidated by remove, restart scan
             m_picList.remove(*curFrame);
             iterFrame = m_picList.first();
 
             m_freeList.pushBack(*curFrame);
-            curFrame->m_encData->m_freeListNext = m_picSymFreeList;
-            m_picSymFreeList = curFrame->m_encData;
+            curFrame->m_encData->m_freeListNext = m_frameDataFreeList;
+            m_frameDataFreeList = curFrame->m_encData;
             curFrame->m_encData = NULL;
             curFrame->m_reconPic = NULL;
         }
@@ -171,7 +177,7 @@
     {
         for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
         {
-            Frame *refpic = slice->m_refPicList[l][ref];
+            Frame *refpic = slice->m_refFrameList[l][ref];
             ATOMIC_INC(&refpic->m_countRefEncoders);
         }
     }

 
@@ -47,16 +47,16 @@
         delete curFrame;
     }
 
-    while (m_picSymFreeList)
+    while (m_frameDataFreeList)
     {
-        FrameData* next = m_picSymFreeList->m_freeListNext;
-        m_picSymFreeList->destroy();
+        FrameData* next = m_frameDataFreeList->m_freeListNext;
+        m_frameDataFreeList->destroy();
 
-        m_picSymFreeList->m_reconPic->destroy();
-        delete m_picSymFreeList->m_reconPic;
+        m_frameDataFreeList->m_reconPic->destroy();
+        delete m_frameDataFreeList->m_reconPic;
 
-        delete m_picSymFreeList;
-        m_picSymFreeList = next;
+        delete m_frameDataFreeList;
+        m_frameDataFreeList = next;
     }
 }
 
@@ -74,13 +74,19 @@
             curFrame->m_reconRowCount.set(0);
             curFrame->m_bChromaExtended = false;
 
+            // Reset column counter
+            X265_CHECK(curFrame->m_reconColCount != NULL, "curFrame->m_reconColCount check failure");
+            X265_CHECK(curFrame->m_numRows > 0, "curFrame->m_numRows check failure");
+            for(int32_t col = 0; col < curFrame->m_numRows; col++)
+                curFrame->m_reconColCount[col].set(0);
+
             // iterator is invalidated by remove, restart scan
             m_picList.remove(*curFrame);
             iterFrame = m_picList.first();
 
             m_freeList.pushBack(*curFrame);
-            curFrame->m_encData->m_freeListNext = m_picSymFreeList;
-            m_picSymFreeList = curFrame->m_encData;
+            curFrame->m_encData->m_freeListNext = m_frameDataFreeList;
+            m_frameDataFreeList = curFrame->m_encData;
             curFrame->m_encData = NULL;
             curFrame->m_reconPic = NULL;
         }
@@ -171,7 +177,7 @@
     {
         for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
         {
-            Frame *refpic = slice->m_refPicList[l][ref];
+            Frame *refpic = slice->m_refFrameList[l][ref];
             ATOMIC_INC(&refpic->m_countRefEncoders);
         }
     }
​

x265_1.8.tar.gz/source/encoder/dpb.h -> x265_1.9.tar.gz/source/encoder/dpb.h Changed

 
@@ -46,14 +46,14 @@
     bool               m_bTemporalSublayer;
     PicList            m_picList;
     PicList            m_freeList;
-    FrameData*         m_picSymFreeList;
+    FrameData*         m_frameDataFreeList;
 
     DPB(x265_param *param)
     {
         m_lastIDR = 0;
         m_pocCRA = 0;
         m_bRefreshPending = false;
-        m_picSymFreeList = NULL;
+        m_frameDataFreeList = NULL;
         m_maxRefL0 = param->maxNumReferences;
         m_maxRefL1 = param->bBPyramid ? 2 : 1;
         m_bOpenGOP = param->bOpenGOP;
​

x265_1.8.tar.gz/source/encoder/encoder.cpp -> x265_1.9.tar.gz/source/encoder/encoder.cpp Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -39,6 +40,10 @@
 
 #include "x265.h"
 
+#if _MSC_VER
+#pragma warning(disable: 4996) // POSIX functions are just fine, thanks
+#endif
+
 namespace X265_NS {
 const char g_sliceTypeToChar[] = {'B', 'P', 'I'};
 }
@@ -66,12 +71,9 @@
     m_outputCount = 0;
     m_param = NULL;
     m_latestParam = NULL;
-    m_cuOffsetY = NULL;
-    m_cuOffsetC = NULL;
-    m_buOffsetY = NULL;
-    m_buOffsetC = NULL;
     m_threadPool = NULL;
     m_analysisFile = NULL;
+    m_offsetEmergency = NULL;
     for (int i = 0; i < X265_MAX_FRAME_THREADS; i++)
         m_frameEncoder[i] = NULL;
 
@@ -191,6 +193,7 @@
     {
         x265_log(m_param, X265_LOG_ERROR, "Unable to allocate scaling list arrays\n");
         m_aborted = true;
+        return;
     }
     else if (!m_param->scalingLists || !strcmp(m_param->scalingLists, "off"))
         m_scalingList.m_bEnabled = false;
@@ -198,7 +201,6 @@
         m_scalingList.setDefaultScalingList();
     else if (m_scalingList.parseScalingList(m_param->scalingLists))
         m_aborted = true;
-    m_scalingList.setupQuantMatrices();
 
     m_lookahead = new Lookahead(m_param, m_threadPool);
     if (m_numPools)
@@ -213,6 +215,82 @@
     initVPS(&m_vps);
     initSPS(&m_sps);
     initPPS(&m_pps);
+   
+    if (m_param->rc.vbvBufferSize)
+    {
+        m_offsetEmergency = (uint16_t(*)[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS])X265_MALLOC(uint16_t, MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS * (QP_MAX_MAX - QP_MAX_SPEC));
+        if (!m_offsetEmergency)
+        {
+            x265_log(m_param, X265_LOG_ERROR, "Unable to allocate memory\n");
+            m_aborted = true;
+            return;
+        }
+
+        bool scalingEnabled = m_scalingList.m_bEnabled;
+        if (!scalingEnabled)
+        {
+            m_scalingList.setDefaultScalingList();
+            m_scalingList.setupQuantMatrices();
+        }
+        else
+            m_scalingList.setupQuantMatrices();
+
+        for (int q = 0; q < QP_MAX_MAX - QP_MAX_SPEC; q++)
+        {
+            for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
+            {
+                uint16_t *nrOffset = m_offsetEmergency[q][cat];
+
+                int trSize = cat & 3;
+
+                int coefCount = 1 << ((trSize + 2) * 2);
+
+                /* Denoise chroma first then luma, then DC. */
+                int dcThreshold = (QP_MAX_MAX - QP_MAX_SPEC) * 2 / 3;
+                int lumaThreshold = (QP_MAX_MAX - QP_MAX_SPEC) * 2 / 3;
+                int chromaThreshold = 0;
+
+                int thresh = (cat < 4 || (cat >= 8 && cat < 12)) ? lumaThreshold : chromaThreshold;
+
+                double quantF = (double)(1ULL << (q / 6 + 16 + 8));
+
+                for (int i = 0; i < coefCount; i++)
+                {
+                    /* True "emergency mode": remove all DCT coefficients */
+                    if (q == QP_MAX_MAX - QP_MAX_SPEC - 1)
+                    {
+                        nrOffset[i] = INT16_MAX;
+                        continue;
+                    }
+
+                    int iThresh = i == 0 ? dcThreshold : thresh;
+                    if (q < iThresh)
+                    {
+                        nrOffset[i] = 0;
+                        continue;
+                    }
+
+                    int numList = (cat >= 8) * 3 + ((int)!iThresh);
+
+                    double pos = (double)(q - iThresh + 1) / (QP_MAX_MAX - QP_MAX_SPEC - iThresh);
+                    double start = quantF / (m_scalingList.m_quantCoef[trSize][numList][QP_MAX_SPEC % 6][i]);
+
+                    // Formula chosen as an exponential scale to vaguely mimic the effects of a higher quantizer.
+                    double bias = (pow(2, pos * (QP_MAX_MAX - QP_MAX_SPEC)) * 0.003 - 0.003) * start;
+                    nrOffset[i] = (uint16_t)X265_MIN(bias + 0.5, INT16_MAX);
+                }
+            }
+        }
+
+        if (!scalingEnabled)
+        {
+            m_scalingList.m_bEnabled = false;
+            m_scalingList.m_bDataPresent = false;
+            m_scalingList.setupQuantMatrices();
+        }
+    }
+    else
+        m_scalingList.setupQuantMatrices();
 
     int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
     int numCols = (m_param->sourceWidth  + g_maxCUSize - 1) / g_maxCUSize;
@@ -259,6 +337,8 @@
     m_encodeStartTime = x265_mdate();
 
     m_nalList.m_annexB = !!m_param->bAnnexB;
+
+    m_emitCLLSEI = p->maxCLL || p->maxFALL;
 }
 
 void Encoder::stopJobs()
@@ -318,10 +398,7 @@
         delete m_rateControl;
     }
 
-    X265_FREE(m_cuOffsetY);
-    X265_FREE(m_cuOffsetC);
-    X265_FREE(m_buOffsetY);
-    X265_FREE(m_buOffsetC);
+    X265_FREE(m_offsetEmergency);
 
     if (m_analysisFile)
         fclose(m_analysisFile);
@@ -335,7 +412,6 @@
         free((char*)m_param->scalingLists);
         free((char*)m_param->numaPools);
         free((char*)m_param->masteringDisplayColorVolume);
-        free((char*)m_param->contentLightLevelInfo);
 
         PARAM_NS::x265_param_free(m_param);
     }
@@ -361,6 +437,45 @@
     }
 }
 
+void Encoder::calcRefreshInterval(Frame* frameEnc)
+{
+    Slice* slice = frameEnc->m_encData->m_slice;
+    uint32_t numBlocksInRow = slice->m_sps->numCuInWidth;
+    FrameData::PeriodicIR* pir = &frameEnc->m_encData->m_pir;
+    if (slice->m_sliceType == I_SLICE)
+    {
+        pir->framesSinceLastPir = 0;
+        m_bQueuedIntraRefresh = 0;
+        /* PIR is currently only supported with ref == 1, so any intra frame effectively refreshes
+         * the whole frame and counts as an intra refresh. */
+        pir->pirEndCol = numBlocksInRow;
+    }
+    else if (slice->m_sliceType == P_SLICE)
+    {
+        Frame* ref = frameEnc->m_encData->m_slice->m_refFrameList[0][0];
+        int pocdiff = frameEnc->m_poc - ref->m_poc;
+        int numPFramesInGOP = m_param->keyframeMax / pocdiff;
+        int increment = (numBlocksInRow + numPFramesInGOP - 1) / numPFramesInGOP;
+        pir->pirEndCol = ref->m_encData->m_pir.pirEndCol;
+        pir->framesSinceLastPir = ref->m_encData->m_pir.framesSinceLastPir + pocdiff;
+        if (pir->framesSinceLastPir >= m_param->keyframeMax ||
+            (m_bQueuedIntraRefresh && pir->pirEndCol >= numBlocksInRow))
+        {
+            pir->pirEndCol = 0;
+            pir->framesSinceLastPir = 0;
+            m_bQueuedIntraRefresh = 0;
+            frameEnc->m_lowres.bKeyframe = 1;
+        }
+        pir->pirStartCol = pir->pirEndCol;
+        pir->pirEndCol += increment;
+        /* If our intra refresh has reached the right side of the frame, we're done. */
+        if (pir->pirEndCol >= numBlocksInRow)
+        {
+            pir->pirEndCol = numBlocksInRow;

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -39,6 +40,10 @@
 
 #include "x265.h"
 
+#if _MSC_VER
+#pragma warning(disable: 4996) // POSIX functions are just fine, thanks
+#endif
+
 namespace X265_NS {
 const char g_sliceTypeToChar[] = {'B', 'P', 'I'};
 }
@@ -66,12 +71,9 @@
     m_outputCount = 0;
     m_param = NULL;
     m_latestParam = NULL;
-    m_cuOffsetY = NULL;
-    m_cuOffsetC = NULL;
-    m_buOffsetY = NULL;
-    m_buOffsetC = NULL;
     m_threadPool = NULL;
     m_analysisFile = NULL;
+    m_offsetEmergency = NULL;
     for (int i = 0; i < X265_MAX_FRAME_THREADS; i++)
         m_frameEncoder[i] = NULL;
 
@@ -191,6 +193,7 @@
     {
         x265_log(m_param, X265_LOG_ERROR, "Unable to allocate scaling list arrays\n");
         m_aborted = true;
+        return;
     }
     else if (!m_param->scalingLists || !strcmp(m_param->scalingLists, "off"))
         m_scalingList.m_bEnabled = false;
@@ -198,7 +201,6 @@
         m_scalingList.setDefaultScalingList();
     else if (m_scalingList.parseScalingList(m_param->scalingLists))
         m_aborted = true;
-    m_scalingList.setupQuantMatrices();
 
     m_lookahead = new Lookahead(m_param, m_threadPool);
     if (m_numPools)
@@ -213,6 +215,82 @@
     initVPS(&m_vps);
     initSPS(&m_sps);
     initPPS(&m_pps);
+   
+    if (m_param->rc.vbvBufferSize)
+    {
+        m_offsetEmergency = (uint16_t(*)[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS])X265_MALLOC(uint16_t, MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS * (QP_MAX_MAX - QP_MAX_SPEC));
+        if (!m_offsetEmergency)
+        {
+            x265_log(m_param, X265_LOG_ERROR, "Unable to allocate memory\n");
+            m_aborted = true;
+            return;
+        }
+
+        bool scalingEnabled = m_scalingList.m_bEnabled;
+        if (!scalingEnabled)
+        {
+            m_scalingList.setDefaultScalingList();
+            m_scalingList.setupQuantMatrices();
+        }
+        else
+            m_scalingList.setupQuantMatrices();
+
+        for (int q = 0; q < QP_MAX_MAX - QP_MAX_SPEC; q++)
+        {
+            for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
+            {
+                uint16_t *nrOffset = m_offsetEmergency[q][cat];
+
+                int trSize = cat & 3;
+
+                int coefCount = 1 << ((trSize + 2) * 2);
+
+                /* Denoise chroma first then luma, then DC. */
+                int dcThreshold = (QP_MAX_MAX - QP_MAX_SPEC) * 2 / 3;
+                int lumaThreshold = (QP_MAX_MAX - QP_MAX_SPEC) * 2 / 3;
+                int chromaThreshold = 0;
+
+                int thresh = (cat < 4 || (cat >= 8 && cat < 12)) ? lumaThreshold : chromaThreshold;
+
+                double quantF = (double)(1ULL << (q / 6 + 16 + 8));
+
+                for (int i = 0; i < coefCount; i++)
+                {
+                    /* True "emergency mode": remove all DCT coefficients */
+                    if (q == QP_MAX_MAX - QP_MAX_SPEC - 1)
+                    {
+                        nrOffset[i] = INT16_MAX;
+                        continue;
+                    }
+
+                    int iThresh = i == 0 ? dcThreshold : thresh;
+                    if (q < iThresh)
+                    {
+                        nrOffset[i] = 0;
+                        continue;
+                    }
+
+                    int numList = (cat >= 8) * 3 + ((int)!iThresh);
+
+                    double pos = (double)(q - iThresh + 1) / (QP_MAX_MAX - QP_MAX_SPEC - iThresh);
+                    double start = quantF / (m_scalingList.m_quantCoef[trSize][numList][QP_MAX_SPEC % 6][i]);
+
+                    // Formula chosen as an exponential scale to vaguely mimic the effects of a higher quantizer.
+                    double bias = (pow(2, pos * (QP_MAX_MAX - QP_MAX_SPEC)) * 0.003 - 0.003) * start;
+                    nrOffset[i] = (uint16_t)X265_MIN(bias + 0.5, INT16_MAX);
+                }
+            }
+        }
+
+        if (!scalingEnabled)
+        {
+            m_scalingList.m_bEnabled = false;
+            m_scalingList.m_bDataPresent = false;
+            m_scalingList.setupQuantMatrices();
+        }
+    }
+    else
+        m_scalingList.setupQuantMatrices();
 
     int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
     int numCols = (m_param->sourceWidth  + g_maxCUSize - 1) / g_maxCUSize;
@@ -259,6 +337,8 @@
     m_encodeStartTime = x265_mdate();
 
     m_nalList.m_annexB = !!m_param->bAnnexB;
+
+    m_emitCLLSEI = p->maxCLL || p->maxFALL;
 }
 
 void Encoder::stopJobs()
@@ -318,10 +398,7 @@
         delete m_rateControl;
     }
 
-    X265_FREE(m_cuOffsetY);
-    X265_FREE(m_cuOffsetC);
-    X265_FREE(m_buOffsetY);
-    X265_FREE(m_buOffsetC);
+    X265_FREE(m_offsetEmergency);
 
     if (m_analysisFile)
         fclose(m_analysisFile);
@@ -335,7 +412,6 @@
         free((char*)m_param->scalingLists);
         free((char*)m_param->numaPools);
         free((char*)m_param->masteringDisplayColorVolume);
-        free((char*)m_param->contentLightLevelInfo);
 
         PARAM_NS::x265_param_free(m_param);
     }
@@ -361,6 +437,45 @@
     }
 }
 
+void Encoder::calcRefreshInterval(Frame* frameEnc)
+{
+    Slice* slice = frameEnc->m_encData->m_slice;
+    uint32_t numBlocksInRow = slice->m_sps->numCuInWidth;
+    FrameData::PeriodicIR* pir = &frameEnc->m_encData->m_pir;
+    if (slice->m_sliceType == I_SLICE)
+    {
+        pir->framesSinceLastPir = 0;
+        m_bQueuedIntraRefresh = 0;
+        /* PIR is currently only supported with ref == 1, so any intra frame effectively refreshes
+         * the whole frame and counts as an intra refresh. */
+        pir->pirEndCol = numBlocksInRow;
+    }
+    else if (slice->m_sliceType == P_SLICE)
+    {
+        Frame* ref = frameEnc->m_encData->m_slice->m_refFrameList[0][0];
+        int pocdiff = frameEnc->m_poc - ref->m_poc;
+        int numPFramesInGOP = m_param->keyframeMax / pocdiff;
+        int increment = (numBlocksInRow + numPFramesInGOP - 1) / numPFramesInGOP;
+        pir->pirEndCol = ref->m_encData->m_pir.pirEndCol;
+        pir->framesSinceLastPir = ref->m_encData->m_pir.framesSinceLastPir + pocdiff;
+        if (pir->framesSinceLastPir >= m_param->keyframeMax ||
+            (m_bQueuedIntraRefresh && pir->pirEndCol >= numBlocksInRow))
+        {
+            pir->pirEndCol = 0;
+            pir->framesSinceLastPir = 0;
+            m_bQueuedIntraRefresh = 0;
+            frameEnc->m_lowres.bKeyframe = 1;
+        }
+        pir->pirStartCol = pir->pirEndCol;
+        pir->pirEndCol += increment;
+        /* If our intra refresh has reached the right side of the frame, we're done. */
+        if (pir->pirEndCol >= numBlocksInRow)
+        {
+            pir->pirEndCol = numBlocksInRow;
​

x265_1.8.tar.gz/source/encoder/encoder.h -> x265_1.9.tar.gz/source/encoder/encoder.h Changed

@@ -45,8 +45,10 @@
     double        m_psnrSumV;
     double        m_globalSsim;
     double        m_totalQp;
+    double        m_maxFALL;
     uint64_t      m_accBits;
     uint32_t      m_numPics;
+    uint16_t      m_maxCLL;
 
     EncStats()
     {
@@ -54,6 +56,8 @@
         m_accBits = 0;
         m_numPics = 0;
         m_totalQp = 0;
+        m_maxCLL = 0;
+        m_maxFALL = 0;
     }
 
     void addQP(double aveQp);
@@ -75,64 +79,62 @@
 {
 public:
 
-    int                m_pocLast;         // time index (POC)
-    int                m_encodedFrameNum;
-    int                m_outputCount;
+    uint32_t           m_residualSumEmergency[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
+    uint32_t           m_countEmergency[MAX_NUM_TR_CATEGORIES];
+    uint16_t           (*m_offsetEmergency)[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
 
-    int                m_bframeDelay;
     int64_t            m_firstPts;
     int64_t            m_bframeDelayTime;
     int64_t            m_prevReorderedPts[2];
+    int64_t            m_encodeStartTime;
 
-    ThreadPool*        m_threadPool;
-    FrameEncoder*      m_frameEncoder[X265_MAX_FRAME_THREADS];
-    DPB*               m_dpb;
-
-    Frame*             m_exportedPic;
-
+    int                m_pocLast;         // time index (POC)
+    int                m_encodedFrameNum;
+    int                m_outputCount;
+    int                m_bframeDelay;
     int                m_numPools;
     int                m_curEncoder;
 
-    /* cached PicYuv offset arrays, shared by all instances of
-     * PicYuv created by this encoder */
-    intptr_t*          m_cuOffsetY;
-    intptr_t*          m_cuOffsetC;
-    intptr_t*          m_buOffsetY;
-    intptr_t*          m_buOffsetC;
-
-    /* Collect statistics globally */
-    EncStats           m_analyzeAll;
-    EncStats           m_analyzeI;
-    EncStats           m_analyzeP;
-    EncStats           m_analyzeB;
-    int64_t            m_encodeStartTime;
-
     // weighted prediction
     int                m_numLumaWPFrames;    // number of P frames with weighted luma reference
     int                m_numChromaWPFrames;  // number of P frames with weighted chroma reference
     int                m_numLumaWPBiFrames;  // number of B frames with weighted luma reference
     int                m_numChromaWPBiFrames; // number of B frames with weighted chroma reference
-    FILE*              m_analysisFile;
     int                m_conformanceMode;
-    VPS                m_vps;
-    SPS                m_sps;
-    PPS                m_pps;
-    NALList            m_nalList;
-    ScalingList        m_scalingList;      // quantization matrix information
-
     int                m_lastBPSEI;
     uint32_t           m_numDelayedPic;
 
+    ThreadPool*        m_threadPool;
+    FrameEncoder*      m_frameEncoder[X265_MAX_FRAME_THREADS];
+    DPB*               m_dpb;
+    Frame*             m_exportedPic;
+    FILE*              m_analysisFile;
     x265_param*        m_param;
     x265_param*        m_latestParam;
     RateControl*       m_rateControl;
     Lookahead*         m_lookahead;
+
+    /* Collect statistics globally */
+    EncStats           m_analyzeAll;
+    EncStats           m_analyzeI;
+    EncStats           m_analyzeP;
+    EncStats           m_analyzeB;
+    VPS                m_vps;
+    SPS                m_sps;
+    PPS                m_pps;
+    NALList            m_nalList;
+    ScalingList        m_scalingList;      // quantization matrix information
     Window             m_conformanceWindow;
 
+    bool               m_emitCLLSEI;
     bool               m_bZeroLatency;     // x265_encoder_encode() returns NALs for the input picture, zero lag
     bool               m_aborted;          // fatal error detected
     bool               m_reconfigured;      // reconfigure of encoder detected
 
+    /* Begin intra refresh when one not in progress or else begin one as soon as the current 
+     * one is done. Requires bIntraRefresh to be set.*/
+    int                m_bQueuedIntraRefresh;
+
     Encoder();
     ~Encoder() {}
 
@@ -164,7 +166,9 @@
 
     void writeAnalysisFile(x265_analysis_data* pic);
 
-    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, uint64_t bits, x265_frame_stats* frameStats);
+    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, x265_frame_stats* frameStats, int inPoc);
+
+    void calcRefreshInterval(Frame* frameEnc);
 
 protected:

 
@@ -45,8 +45,10 @@
     double        m_psnrSumV;
     double        m_globalSsim;
     double        m_totalQp;
+    double        m_maxFALL;
     uint64_t      m_accBits;
     uint32_t      m_numPics;
+    uint16_t      m_maxCLL;
 
     EncStats()
     {
@@ -54,6 +56,8 @@
         m_accBits = 0;
         m_numPics = 0;
         m_totalQp = 0;
+        m_maxCLL = 0;
+        m_maxFALL = 0;
     }
 
     void addQP(double aveQp);
@@ -75,64 +79,62 @@
 {
 public:
 
-    int                m_pocLast;         // time index (POC)
-    int                m_encodedFrameNum;
-    int                m_outputCount;
+    uint32_t           m_residualSumEmergency[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
+    uint32_t           m_countEmergency[MAX_NUM_TR_CATEGORIES];
+    uint16_t           (*m_offsetEmergency)[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
 
-    int                m_bframeDelay;
     int64_t            m_firstPts;
     int64_t            m_bframeDelayTime;
     int64_t            m_prevReorderedPts[2];
+    int64_t            m_encodeStartTime;
 
-    ThreadPool*        m_threadPool;
-    FrameEncoder*      m_frameEncoder[X265_MAX_FRAME_THREADS];
-    DPB*               m_dpb;
-
-    Frame*             m_exportedPic;
-
+    int                m_pocLast;         // time index (POC)
+    int                m_encodedFrameNum;
+    int                m_outputCount;
+    int                m_bframeDelay;
     int                m_numPools;
     int                m_curEncoder;
 
-    /* cached PicYuv offset arrays, shared by all instances of
-     * PicYuv created by this encoder */
-    intptr_t*          m_cuOffsetY;
-    intptr_t*          m_cuOffsetC;
-    intptr_t*          m_buOffsetY;
-    intptr_t*          m_buOffsetC;
-
-    /* Collect statistics globally */
-    EncStats           m_analyzeAll;
-    EncStats           m_analyzeI;
-    EncStats           m_analyzeP;
-    EncStats           m_analyzeB;
-    int64_t            m_encodeStartTime;
-
     // weighted prediction
     int                m_numLumaWPFrames;    // number of P frames with weighted luma reference
     int                m_numChromaWPFrames;  // number of P frames with weighted chroma reference
     int                m_numLumaWPBiFrames;  // number of B frames with weighted luma reference
     int                m_numChromaWPBiFrames; // number of B frames with weighted chroma reference
-    FILE*              m_analysisFile;
     int                m_conformanceMode;
-    VPS                m_vps;
-    SPS                m_sps;
-    PPS                m_pps;
-    NALList            m_nalList;
-    ScalingList        m_scalingList;      // quantization matrix information
-
     int                m_lastBPSEI;
     uint32_t           m_numDelayedPic;
 
+    ThreadPool*        m_threadPool;
+    FrameEncoder*      m_frameEncoder[X265_MAX_FRAME_THREADS];
+    DPB*               m_dpb;
+    Frame*             m_exportedPic;
+    FILE*              m_analysisFile;
     x265_param*        m_param;
     x265_param*        m_latestParam;
     RateControl*       m_rateControl;
     Lookahead*         m_lookahead;
+
+    /* Collect statistics globally */
+    EncStats           m_analyzeAll;
+    EncStats           m_analyzeI;
+    EncStats           m_analyzeP;
+    EncStats           m_analyzeB;
+    VPS                m_vps;
+    SPS                m_sps;
+    PPS                m_pps;
+    NALList            m_nalList;
+    ScalingList        m_scalingList;      // quantization matrix information
     Window             m_conformanceWindow;
 
+    bool               m_emitCLLSEI;
     bool               m_bZeroLatency;     // x265_encoder_encode() returns NALs for the input picture, zero lag
     bool               m_aborted;          // fatal error detected
     bool               m_reconfigured;      // reconfigure of encoder detected
 
+    /* Begin intra refresh when one not in progress or else begin one as soon as the current 
+     * one is done. Requires bIntraRefresh to be set.*/
+    int                m_bQueuedIntraRefresh;
+
     Encoder();
     ~Encoder() {}
 
@@ -164,7 +166,9 @@
 
     void writeAnalysisFile(x265_analysis_data* pic);
 
-    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, uint64_t bits, x265_frame_stats* frameStats);
+    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, x265_frame_stats* frameStats, int inPoc);
+
+    void calcRefreshInterval(Frame* frameEnc);
 
 protected:
 
​

x265_1.8.tar.gz/source/encoder/entropy.cpp -> x265_1.9.tar.gz/source/encoder/entropy.cpp Changed

@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -429,7 +430,8 @@
     if (slice.m_sps->bUseSAO)
     {
         WRITE_FLAG(saoParam->bSaoFlag[0], "slice_sao_luma_flag");
-        WRITE_FLAG(saoParam->bSaoFlag[1], "slice_sao_chroma_flag");
+        if (encData.m_param->internalCsp != X265_CSP_I400)
+            WRITE_FLAG(saoParam->bSaoFlag[1], "slice_sao_chroma_flag");
     }
 
     // check if numRefIdx match the defaults (1, hard-coded in PPS). If not, override
@@ -828,6 +830,79 @@
     }
 }
 
+void Entropy::encodeTransformLuma(const CUData& cu, uint32_t absPartIdx, uint32_t curDepth, uint32_t log2CurSize,
+                              bool& bCodeDQP, const uint32_t depthRange[2])
+{
+    const bool subdiv = cu.m_tuDepth[absPartIdx] > curDepth;
+
+    /* in each of these conditions, the subdiv flag is implied and not signaled,
+     * so we have checks to make sure the implied value matches our intentions */
+    if (cu.isIntra(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N && log2CurSize == MIN_LOG2_CU_SIZE)
+    {
+        X265_CHECK(subdiv, "intra NxN requires TU depth below CU depth\n");
+    }
+    else if (cu.isInter(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N &&
+             !curDepth && cu.m_slice->m_sps->quadtreeTUMaxDepthInter == 1)
+    {
+        X265_CHECK(subdiv, "inter TU must be smaller than CU when not 2Nx2N part size: log2CurSize %d, depthRange[0] %d\n", log2CurSize, depthRange[0]);
+    }
+    else if (log2CurSize > depthRange[1])
+    {
+        X265_CHECK(subdiv, "TU is larger than the max allowed, it should have been split\n");
+    }
+    else if (log2CurSize == cu.m_slice->m_sps->quadtreeTULog2MinSize || log2CurSize == depthRange[0])
+    {
+        X265_CHECK(!subdiv, "min sized TU cannot be subdivided\n");
+    }
+    else
+    {
+        X265_CHECK(log2CurSize > depthRange[0], "transform size failure\n");
+        codeTransformSubdivFlag(subdiv, 5 - log2CurSize);
+    }
+
+    if (subdiv)
+    {
+        --log2CurSize;
+        ++curDepth;
+
+        uint32_t qNumParts = 1 << (log2CurSize - LOG2_UNIT_SIZE) * 2;
+
+        encodeTransformLuma(cu, absPartIdx + 0 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange);
+        encodeTransformLuma(cu, absPartIdx + 1 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange);
+        encodeTransformLuma(cu, absPartIdx + 2 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange);
+        encodeTransformLuma(cu, absPartIdx + 3 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange);
+        return;
+    }
+
+    if (!cu.isIntra(absPartIdx) && !curDepth)
+    {
+        X265_CHECK(cu.getCbf(absPartIdx, TEXT_LUMA, 0), "CBF should have been set\n");
+    }
+    else
+        codeQtCbfLuma(cu, absPartIdx, curDepth);
+
+    uint32_t cbfY = cu.getCbf(absPartIdx, TEXT_LUMA, curDepth);
+
+    if (!cbfY)
+        return;
+
+    // dQP: only for CTU once
+    if (cu.m_slice->m_pps->bUseDQP && bCodeDQP)
+    {
+        uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx];
+        uint32_t absPartIdxLT = absPartIdx & (0xFF << (log2CUSize - LOG2_UNIT_SIZE) * 2);
+        codeDeltaQP(cu, absPartIdxLT);
+        bCodeDQP = false;
+    }
+
+    if (cbfY)
+    {
+        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2);
+        codeCoeffNxN(cu, cu.m_trCoeff[0] + coeffOffset, absPartIdx, log2CurSize, TEXT_LUMA);
+    }
+}
+
+
 void Entropy::codePredInfo(const CUData& cu, uint32_t absPartIdx)
 {
     if (cu.isIntra(absPartIdx)) // If it is intra mode, encode intra prediction mode.
@@ -908,7 +983,10 @@
     }
 
     uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx];
-    encodeTransform(cu, absPartIdx, 0, log2CUSize, bCodeDQP, depthRange);
+    if (cu.m_chromaFormat == X265_CSP_I400)
+        encodeTransformLuma(cu, absPartIdx, 0, log2CUSize, bCodeDQP, depthRange);
+    else
+        encodeTransform(cu, absPartIdx, 0, log2CUSize, bCodeDQP, depthRange);
 }
 
 void Entropy::codeSaoOffset(const SaoCtuParam& ctuParam, int plane)
@@ -1010,7 +1088,7 @@
 void Entropy::codePredWeightTable(const Slice& slice)
 {
     const WeightParam *wp;
-    bool            bChroma      = true; // 4:0:0 not yet supported
+    bool            bChroma = slice.m_sps->chromaFormatIdc != X265_CSP_I400;
     bool            bDenomCoded  = false;
     int             numRefDirs   = slice.m_sliceType == B_SLICE ? 2 : 1;
     uint32_t        totalSignalledWeightFlags = 0;
@@ -1565,11 +1643,16 @@
     uint8_t * const baseCtx = bIsLuma ? &m_contextState[OFF_SIG_FLAG_CTX] : &m_contextState[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
     uint32_t c1 = 1;
     int scanPosSigOff = scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1;
-    ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE)]);
+    ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE) + 1]);   // extra 2 bytes(+1) space for AVX2 assembly, +1 because (numNonZero<=1) in costCoeffNxN path
     uint32_t numNonZero = 1;
     unsigned long lastNZPosInCG;
     unsigned long firstNZPosInCG;
 
+#if _DEBUG
+    // Unnecessary, for Valgrind-3.10.0 only
+    memset(absCoeff, 0, sizeof(absCoeff));
+#endif
+
     absCoeff[0] = (uint16_t)abs(coeff[posLast]);
 
     for (int subSet = lastScanSet; subSet >= 0; subSet--)
@@ -1715,6 +1798,7 @@
             {
                 // maximum g_entropyBits are 18-bits and maximum of count are 16, so intermedia of sum are 22-bits
                 const uint8_t *tabSigCtx = table_cnt[(log2TrSize == 2) ? 4 : (uint32_t)patternSigCtx];
+                X265_CHECK(numNonZero <= 1, "numNonZero check failure");
                 uint32_t sum = primitives.costCoeffNxN(g_scan4x4[codingParameters.scanType], &coeff[blkPosBase], (intptr_t)trSize, absCoeff + numNonZero, tabSigCtx, scanFlagMask, baseCtx, offset + posOffset, scanPosSigOff, subPosBase);
 
 #if CHECKED_BUILD || _DEBUG
@@ -1919,43 +2003,78 @@
         numCtx = bIsLuma ? 12 : 3;
     }
 
-    if (bIsLuma)
-    {
-        for (uint32_t bin = 0; bin < 2; bin++)
-            estBitsSbac.significantBits[bin][0] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX], bin);
+    const int ctxSigOffset = OFF_SIG_FLAG_CTX + (bIsLuma ? 0 : NUM_SIG_FLAG_CTX_LUMA);
+
+    estBitsSbac.significantBits[0][0] = sbacGetEntropyBits(m_contextState[ctxSigOffset], 0);
+    estBitsSbac.significantBits[1][0] = sbacGetEntropyBits(m_contextState[ctxSigOffset], 1);
 
-        for (int ctxIdx = firstCtx; ctxIdx < firstCtx + numCtx; ctxIdx++)
-            for (uint32_t bin = 0; bin < 2; bin++)
-                estBitsSbac.significantBits[bin][ctxIdx] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX + ctxIdx], bin);
+    for (int ctxIdx = firstCtx; ctxIdx < firstCtx + numCtx; ctxIdx++)
+    {
+        estBitsSbac.significantBits[0][ctxIdx] = sbacGetEntropyBits(m_contextState[ctxSigOffset + ctxIdx], 0);
+        estBitsSbac.significantBits[1][ctxIdx] = sbacGetEntropyBits(m_contextState[ctxSigOffset + ctxIdx], 1);
     }
-    else
+
+    const uint32_t maxGroupIdx = log2TrSize * 2 - 1;
+    if (bIsLuma)
     {
-        for (uint32_t bin = 0; bin < 2; bin++)
-            estBitsSbac.significantBits[bin][0] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX + (NUM_SIG_FLAG_CTX_LUMA + 0)], bin);
+        if (log2TrSize == 2)
+        {
+            for (int i = 0, ctxIdx = 0; i < 2; i++, ctxIdx += NUM_CTX_LAST_FLAG_XY)
+            {
+                int bits = 0;
+                const uint8_t *ctxState = &m_contextState[OFF_CTX_LAST_FLAG_X + ctxIdx];
 
-        for (int ctxIdx = firstCtx; ctxIdx < firstCtx + numCtx; ctxIdx++)
-            for (uint32_t bin = 0; bin < 2; bin++)
-                estBitsSbac.significantBits[bin][ctxIdx] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX + (NUM_SIG_FLAG_CTX_LUMA + ctxIdx)], bin);
-    }
+                for (uint32_t ctx = 0; ctx < 3; ctx++)
+                {
+                    estBitsSbac.lastBits[i][ctx] = bits + sbacGetEntropyBits(ctxState[ctx], 0);
+                    bits += sbacGetEntropyBits(ctxState[ctx], 1);
+                }
 
-    int blkSizeOffset = bIsLuma ? ((log2TrSize - 2) * 3 + ((log2TrSize - 1) >> 2)) : NUM_CTX_LAST_FLAG_XY_LUMA;
-    int ctxShift = bIsLuma ? ((log2TrSize + 1) >> 2) : log2TrSize - 2;
-    uint32_t maxGroupIdx = log2TrSize * 2 - 1;
+                estBitsSbac.lastBits[i][maxGroupIdx] = bits;
+            }
+        }
+        else
+        {
+            const int blkSizeOffset = ((log2TrSize - 2) * 3 + (log2TrSize == 5));

 
@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -429,7 +430,8 @@
     if (slice.m_sps->bUseSAO)
     {
         WRITE_FLAG(saoParam->bSaoFlag[0], "slice_sao_luma_flag");
-        WRITE_FLAG(saoParam->bSaoFlag[1], "slice_sao_chroma_flag");
+        if (encData.m_param->internalCsp != X265_CSP_I400)
+            WRITE_FLAG(saoParam->bSaoFlag[1], "slice_sao_chroma_flag");
     }
 
     // check if numRefIdx match the defaults (1, hard-coded in PPS). If not, override
@@ -828,6 +830,79 @@
     }
 }
 
+void Entropy::encodeTransformLuma(const CUData& cu, uint32_t absPartIdx, uint32_t curDepth, uint32_t log2CurSize,
+                              bool& bCodeDQP, const uint32_t depthRange[2])
+{
+    const bool subdiv = cu.m_tuDepth[absPartIdx] > curDepth;
+
+    /* in each of these conditions, the subdiv flag is implied and not signaled,
+     * so we have checks to make sure the implied value matches our intentions */
+    if (cu.isIntra(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N && log2CurSize == MIN_LOG2_CU_SIZE)
+    {
+        X265_CHECK(subdiv, "intra NxN requires TU depth below CU depth\n");
+    }
+    else if (cu.isInter(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N &&
+             !curDepth && cu.m_slice->m_sps->quadtreeTUMaxDepthInter == 1)
+    {
+        X265_CHECK(subdiv, "inter TU must be smaller than CU when not 2Nx2N part size: log2CurSize %d, depthRange[0] %d\n", log2CurSize, depthRange[0]);
+    }
+    else if (log2CurSize > depthRange[1])
+    {
+        X265_CHECK(subdiv, "TU is larger than the max allowed, it should have been split\n");
+    }
+    else if (log2CurSize == cu.m_slice->m_sps->quadtreeTULog2MinSize || log2CurSize == depthRange[0])
+    {
+        X265_CHECK(!subdiv, "min sized TU cannot be subdivided\n");
+    }
+    else
+    {
+        X265_CHECK(log2CurSize > depthRange[0], "transform size failure\n");
+        codeTransformSubdivFlag(subdiv, 5 - log2CurSize);
+    }
+
+    if (subdiv)
+    {
+        --log2CurSize;
+        ++curDepth;
+
+        uint32_t qNumParts = 1 << (log2CurSize - LOG2_UNIT_SIZE) * 2;
+
+        encodeTransformLuma(cu, absPartIdx + 0 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange);
+        encodeTransformLuma(cu, absPartIdx + 1 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange);
+        encodeTransformLuma(cu, absPartIdx + 2 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange);
+        encodeTransformLuma(cu, absPartIdx + 3 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange);
+        return;
+    }
+
+    if (!cu.isIntra(absPartIdx) && !curDepth)
+    {
+        X265_CHECK(cu.getCbf(absPartIdx, TEXT_LUMA, 0), "CBF should have been set\n");
+    }
+    else
+        codeQtCbfLuma(cu, absPartIdx, curDepth);
+
+    uint32_t cbfY = cu.getCbf(absPartIdx, TEXT_LUMA, curDepth);
+
+    if (!cbfY)
+        return;
+
+    // dQP: only for CTU once
+    if (cu.m_slice->m_pps->bUseDQP && bCodeDQP)
+    {
+        uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx];
+        uint32_t absPartIdxLT = absPartIdx & (0xFF << (log2CUSize - LOG2_UNIT_SIZE) * 2);
+        codeDeltaQP(cu, absPartIdxLT);
+        bCodeDQP = false;
+    }
+
+    if (cbfY)
+    {
+        uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2);
+        codeCoeffNxN(cu, cu.m_trCoeff[0] + coeffOffset, absPartIdx, log2CurSize, TEXT_LUMA);
+    }
+}
+
+
 void Entropy::codePredInfo(const CUData& cu, uint32_t absPartIdx)
 {
     if (cu.isIntra(absPartIdx)) // If it is intra mode, encode intra prediction mode.
@@ -908,7 +983,10 @@
     }
 
     uint32_t log2CUSize = cu.m_log2CUSize[absPartIdx];
-    encodeTransform(cu, absPartIdx, 0, log2CUSize, bCodeDQP, depthRange);
+    if (cu.m_chromaFormat == X265_CSP_I400)
+        encodeTransformLuma(cu, absPartIdx, 0, log2CUSize, bCodeDQP, depthRange);
+    else
+        encodeTransform(cu, absPartIdx, 0, log2CUSize, bCodeDQP, depthRange);
 }
 
 void Entropy::codeSaoOffset(const SaoCtuParam& ctuParam, int plane)
@@ -1010,7 +1088,7 @@
 void Entropy::codePredWeightTable(const Slice& slice)
 {
     const WeightParam *wp;
-    bool            bChroma      = true; // 4:0:0 not yet supported
+    bool            bChroma = slice.m_sps->chromaFormatIdc != X265_CSP_I400;
     bool            bDenomCoded  = false;
     int             numRefDirs   = slice.m_sliceType == B_SLICE ? 2 : 1;
     uint32_t        totalSignalledWeightFlags = 0;
@@ -1565,11 +1643,16 @@
     uint8_t * const baseCtx = bIsLuma ? &m_contextState[OFF_SIG_FLAG_CTX] : &m_contextState[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
     uint32_t c1 = 1;
     int scanPosSigOff = scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1;
-    ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE)]);
+    ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE) + 1]);   // extra 2 bytes(+1) space for AVX2 assembly, +1 because (numNonZero<=1) in costCoeffNxN path
     uint32_t numNonZero = 1;
     unsigned long lastNZPosInCG;
     unsigned long firstNZPosInCG;
 
+#if _DEBUG
+    // Unnecessary, for Valgrind-3.10.0 only
+    memset(absCoeff, 0, sizeof(absCoeff));
+#endif
+
     absCoeff[0] = (uint16_t)abs(coeff[posLast]);
 
     for (int subSet = lastScanSet; subSet >= 0; subSet--)
@@ -1715,6 +1798,7 @@
             {
                 // maximum g_entropyBits are 18-bits and maximum of count are 16, so intermedia of sum are 22-bits
                 const uint8_t *tabSigCtx = table_cnt[(log2TrSize == 2) ? 4 : (uint32_t)patternSigCtx];
+                X265_CHECK(numNonZero <= 1, "numNonZero check failure");
                 uint32_t sum = primitives.costCoeffNxN(g_scan4x4[codingParameters.scanType], &coeff[blkPosBase], (intptr_t)trSize, absCoeff + numNonZero, tabSigCtx, scanFlagMask, baseCtx, offset + posOffset, scanPosSigOff, subPosBase);
 
 #if CHECKED_BUILD || _DEBUG
@@ -1919,43 +2003,78 @@
         numCtx = bIsLuma ? 12 : 3;
     }
 
-    if (bIsLuma)
-    {
-        for (uint32_t bin = 0; bin < 2; bin++)
-            estBitsSbac.significantBits[bin][0] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX], bin);
+    const int ctxSigOffset = OFF_SIG_FLAG_CTX + (bIsLuma ? 0 : NUM_SIG_FLAG_CTX_LUMA);
+
+    estBitsSbac.significantBits[0][0] = sbacGetEntropyBits(m_contextState[ctxSigOffset], 0);
+    estBitsSbac.significantBits[1][0] = sbacGetEntropyBits(m_contextState[ctxSigOffset], 1);
 
-        for (int ctxIdx = firstCtx; ctxIdx < firstCtx + numCtx; ctxIdx++)
-            for (uint32_t bin = 0; bin < 2; bin++)
-                estBitsSbac.significantBits[bin][ctxIdx] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX + ctxIdx], bin);
+    for (int ctxIdx = firstCtx; ctxIdx < firstCtx + numCtx; ctxIdx++)
+    {
+        estBitsSbac.significantBits[0][ctxIdx] = sbacGetEntropyBits(m_contextState[ctxSigOffset + ctxIdx], 0);
+        estBitsSbac.significantBits[1][ctxIdx] = sbacGetEntropyBits(m_contextState[ctxSigOffset + ctxIdx], 1);
     }
-    else
+
+    const uint32_t maxGroupIdx = log2TrSize * 2 - 1;
+    if (bIsLuma)
     {
-        for (uint32_t bin = 0; bin < 2; bin++)
-            estBitsSbac.significantBits[bin][0] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX + (NUM_SIG_FLAG_CTX_LUMA + 0)], bin);
+        if (log2TrSize == 2)
+        {
+            for (int i = 0, ctxIdx = 0; i < 2; i++, ctxIdx += NUM_CTX_LAST_FLAG_XY)
+            {
+                int bits = 0;
+                const uint8_t *ctxState = &m_contextState[OFF_CTX_LAST_FLAG_X + ctxIdx];
 
-        for (int ctxIdx = firstCtx; ctxIdx < firstCtx + numCtx; ctxIdx++)
-            for (uint32_t bin = 0; bin < 2; bin++)
-                estBitsSbac.significantBits[bin][ctxIdx] = sbacGetEntropyBits(m_contextState[OFF_SIG_FLAG_CTX + (NUM_SIG_FLAG_CTX_LUMA + ctxIdx)], bin);
-    }
+                for (uint32_t ctx = 0; ctx < 3; ctx++)
+                {
+                    estBitsSbac.lastBits[i][ctx] = bits + sbacGetEntropyBits(ctxState[ctx], 0);
+                    bits += sbacGetEntropyBits(ctxState[ctx], 1);
+                }
 
-    int blkSizeOffset = bIsLuma ? ((log2TrSize - 2) * 3 + ((log2TrSize - 1) >> 2)) : NUM_CTX_LAST_FLAG_XY_LUMA;
-    int ctxShift = bIsLuma ? ((log2TrSize + 1) >> 2) : log2TrSize - 2;
-    uint32_t maxGroupIdx = log2TrSize * 2 - 1;
+                estBitsSbac.lastBits[i][maxGroupIdx] = bits;
+            }
+        }
+        else
+        {
+            const int blkSizeOffset = ((log2TrSize - 2) * 3 + (log2TrSize == 5));
 
​

x265_1.8.tar.gz/source/encoder/entropy.h -> x265_1.9.tar.gz/source/encoder/entropy.h Changed

 
@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -246,6 +247,8 @@
 
     void encodeTransform(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, uint32_t log2TrSize,
                          bool& bCodeDQP, const uint32_t depthRange[2]);
+    void encodeTransformLuma(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, uint32_t log2TrSize,
+                         bool& bCodeDQP, const uint32_t depthRange[2]);
 
     void copyFrom(const Entropy& src);
     void copyContextsFrom(const Entropy& src);
​

x265_1.8.tar.gz/source/encoder/frameencoder.cpp -> x265_1.9.tar.gz/source/encoder/frameencoder.cpp Changed

@@ -104,7 +104,8 @@
     m_param = top->m_param;
     m_numRows = numRows;
     m_numCols = numCols;
-    m_filterRowDelay = (m_param->bEnableSAO && m_param->bSaoNonDeblocked) ?
+    m_filterRowDelay = ((m_param->bEnableSAO && m_param->bSaoNonDeblocked)
+                        || (!m_param->bEnableLoopFilter && m_param->bEnableSAO)) ?
                         2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 1 : 0);
     m_filterRowDelayCus = m_filterRowDelay * numCols;
     m_rows = new CTURow[m_numRows];
@@ -124,7 +125,7 @@
         m_pool = NULL;
     }
 
-    m_frameFilter.init(top, this, numRows);
+    m_frameFilter.init(top, this, numRows, numCols);
 
     // initialize HRD parameters of SPS
     if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
@@ -135,7 +136,7 @@
         ok &= m_rce.picTimingSEI && m_rce.hrdTiming;
     }
 
-    if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
+    if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
         m_nr = X265_MALLOC(NoiseReduction, 1);
     if (m_nr)
         memset(m_nr, 0, sizeof(NoiseReduction));
@@ -275,7 +276,7 @@
         m_localTldIdx = 0;
     }
 
-    m_done.trigger();     /* signal that thread is initialized */ 
+    m_done.trigger();     /* signal that thread is initialized */
     m_enable.wait();      /* Encoder::encode() triggers this event */
 
     while (m_threadActive)
@@ -357,15 +358,52 @@
             WeightParam *w = NULL;
             if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].bPresentFlag)
                 w = slice->m_weightPredTable[l][ref];
-            m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPic, w, *m_param);
+            slice->m_refReconPicList[l][ref] = slice->m_refFrameList[l][ref]->m_reconPic;
+            m_mref[l][ref].init(slice->m_refReconPicList[l][ref], w, *m_param);
         }
     }
 
+    int numTLD;
+    if (m_pool)
+        numTLD = m_param->bEnableWavefront ? m_pool->m_numWorkers : m_pool->m_numWorkers + m_pool->m_numProviders;
+    else
+        numTLD = 1;
+
     /* Get the QP for this frame from rate control. This call may block until
      * frames ahead of it in encode order have called rateControlEnd() */
     int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
     m_rce.newQp = qp;
 
+    if (m_nr)
+    {
+        if (qp > QP_MAX_SPEC && m_frame->m_param->rc.vbvBufferSize)
+        {
+            for (int i = 0; i < numTLD; i++)
+            {
+                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = m_top->m_offsetEmergency[qp - QP_MAX_SPEC - 1];
+                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].residualSum = m_top->m_residualSumEmergency;
+                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].count = m_top->m_countEmergency;
+            }
+        }
+        else
+        {
+            if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
+            {
+                for (int i = 0; i < numTLD; i++)
+                {
+                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrOffsetDenoise;
+                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].residualSum = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrResidualSum;
+                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].count = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrCount;
+                }
+            }
+            else
+            {
+                for (int i = 0; i < numTLD; i++)
+                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = NULL;
+            }
+        }
+    }
+
     /* Clip slice QP to 0-51 spec range before encoding */
     slice->m_sliceQp = x265_clip3(-QP_BD_OFFSET, QP_MAX_SPEC, qp);
 
@@ -458,7 +496,7 @@
     /* CQP and CRF (without capped VBV) doesn't use mid-frame statistics to 
      * tune RateControl parameters for other frames.
      * Hence, for these modes, update m_startEndOrder and unlock RC for previous threads waiting in
-     * RateControlEnd here, after the slicecontexts are initialized. For the rest - ABR
+     * RateControlEnd here, after the slice contexts are initialized. For the rest - ABR
      * and VBV, unlock only after rateControlUpdateStats of this frame is called */
     if (m_param->rc.rateControlMode != X265_RC_ABR && !m_top->m_rateControl->m_isVbv)
     {
@@ -482,7 +520,7 @@
             {
                 for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
                 {
-                    Frame *refpic = slice->m_refPicList[l][ref];
+                    Frame *refpic = slice->m_refFrameList[l][ref];
 
                     uint32_t reconRowCount = refpic->m_reconRowCount.get();
                     while ((reconRowCount != m_numRows) && (reconRowCount < row + m_refLagRows))
@@ -521,7 +559,7 @@
                     int list = l;
                     for (int ref = 0; ref < slice->m_numRefIdx[list]; ref++)
                     {
-                        Frame *refpic = slice->m_refPicList[list][ref];
+                        Frame *refpic = slice->m_refFrameList[list][ref];
 
                         uint32_t reconRowCount = refpic->m_reconRowCount.get();
                         while ((reconRowCount != m_numRows) && (reconRowCount < i + m_refLagRows))
@@ -572,10 +610,7 @@
         m_frame->m_encData->m_frameStats.lumaDistortion   += m_rows[i].rowStats.lumaDistortion;
         m_frame->m_encData->m_frameStats.chromaDistortion += m_rows[i].rowStats.chromaDistortion;
         m_frame->m_encData->m_frameStats.psyEnergy        += m_rows[i].rowStats.psyEnergy;
-        m_frame->m_encData->m_frameStats.lumaLevel        += m_rows[i].rowStats.lumaLevel;
-
-        if (m_rows[i].rowStats.maxLumaLevel > m_frame->m_encData->m_frameStats.maxLumaLevel)
-            m_frame->m_encData->m_frameStats.maxLumaLevel = m_rows[i].rowStats.maxLumaLevel;
+        m_frame->m_encData->m_frameStats.resEnergy        += m_rows[i].rowStats.resEnergy;
         for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
         {
             m_frame->m_encData->m_frameStats.cntSkipCu[depth] += m_rows[i].rowStats.cntSkipCu[depth];
@@ -589,7 +624,7 @@
     m_frame->m_encData->m_frameStats.avgLumaDistortion   = (double)(m_frame->m_encData->m_frameStats.lumaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
     m_frame->m_encData->m_frameStats.avgChromaDistortion = (double)(m_frame->m_encData->m_frameStats.chromaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
     m_frame->m_encData->m_frameStats.avgPsyEnergy        = (double)(m_frame->m_encData->m_frameStats.psyEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
-    m_frame->m_encData->m_frameStats.avgLumaLevel        = m_frame->m_encData->m_frameStats.lumaLevel / m_frame->m_encData->m_frameStats.totalCtu;
+    m_frame->m_encData->m_frameStats.avgResEnergy        = (double)(m_frame->m_encData->m_frameStats.resEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
     m_frame->m_encData->m_frameStats.percentIntraNxN     = (double)(m_frame->m_encData->m_frameStats.cntIntraNxN * 100) / m_frame->m_encData->m_frameStats.totalCu;
     for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
     {
@@ -626,22 +661,23 @@
 
     if (m_param->decodedPictureHashSEI)
     {
+        int planes = (m_frame->m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
         if (m_param->decodedPictureHashSEI == 1)
         {
             m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::MD5;
-            for (int i = 0; i < 3; i++)
+            for (int i = 0; i < planes; i++)
                 MD5Final(&m_state[i], m_seiReconPictureDigest.m_digest[i]);
         }
         else if (m_param->decodedPictureHashSEI == 2)
         {
             m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CRC;
-            for (int i = 0; i < 3; i++)
+            for (int i = 0; i < planes; i++)
                 crcFinish(m_crc[i], m_seiReconPictureDigest.m_digest[i]);
         }
         else if (m_param->decodedPictureHashSEI == 3)
         {
             m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CHECKSUM;
-            for (int i = 0; i < 3; i++)
+            for (int i = 0; i < planes; i++)
                 checksumFinish(m_checksum[i], m_seiReconPictureDigest.m_digest[i]);
         }
 
@@ -678,41 +714,40 @@
     {
         for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
         {
-            Frame *refpic = slice->m_refPicList[l][ref];
+            Frame *refpic = slice->m_refFrameList[l][ref];
             ATOMIC_DEC(&refpic->m_countRefEncoders);
         }
     }
 
-    int numTLD;
-    if (m_pool)
-        numTLD = m_param->bEnableWavefront ? m_pool->m_numWorkers : m_pool->m_numWorkers + m_pool->m_numProviders;
-    else
-        numTLD = 1;
-
     if (m_nr)
     {
-        /* Accumulate NR statistics from all worker threads */
-        for (int i = 0; i < numTLD; i++)
+        bool nrEnabled = (m_rce.newQp < QP_MAX_SPEC || !m_param->rc.vbvBufferSize) && (m_param->noiseReductionIntra || m_param->noiseReductionInter);
+
+        if (nrEnabled)
         {
-            NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
-            for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
+            /* Accumulate NR statistics from all worker threads */
+            for (int i = 0; i < numTLD; i++)
             {
-                for (int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++)
-                    m_nr->residualSum[cat][coeff] += nr->residualSum[cat][coeff];
-            
-                m_nr->count[cat] += nr->count[cat];
+                NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];

 
@@ -104,7 +104,8 @@
     m_param = top->m_param;
     m_numRows = numRows;
     m_numCols = numCols;
-    m_filterRowDelay = (m_param->bEnableSAO && m_param->bSaoNonDeblocked) ?
+    m_filterRowDelay = ((m_param->bEnableSAO && m_param->bSaoNonDeblocked)
+                        || (!m_param->bEnableLoopFilter && m_param->bEnableSAO)) ?
                         2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 1 : 0);
     m_filterRowDelayCus = m_filterRowDelay * numCols;
     m_rows = new CTURow[m_numRows];
@@ -124,7 +125,7 @@
         m_pool = NULL;
     }
 
-    m_frameFilter.init(top, this, numRows);
+    m_frameFilter.init(top, this, numRows, numCols);
 
     // initialize HRD parameters of SPS
     if (m_param->bEmitHRDSEI || !!m_param->interlaceMode)
@@ -135,7 +136,7 @@
         ok &= m_rce.picTimingSEI && m_rce.hrdTiming;
     }
 
-    if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
+    if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
         m_nr = X265_MALLOC(NoiseReduction, 1);
     if (m_nr)
         memset(m_nr, 0, sizeof(NoiseReduction));
@@ -275,7 +276,7 @@
         m_localTldIdx = 0;
     }
 
-    m_done.trigger();     /* signal that thread is initialized */ 
+    m_done.trigger();     /* signal that thread is initialized */
     m_enable.wait();      /* Encoder::encode() triggers this event */
 
     while (m_threadActive)
@@ -357,15 +358,52 @@
             WeightParam *w = NULL;
             if ((bUseWeightP || bUseWeightB) && slice->m_weightPredTable[l][ref][0].bPresentFlag)
                 w = slice->m_weightPredTable[l][ref];
-            m_mref[l][ref].init(slice->m_refPicList[l][ref]->m_reconPic, w, *m_param);
+            slice->m_refReconPicList[l][ref] = slice->m_refFrameList[l][ref]->m_reconPic;
+            m_mref[l][ref].init(slice->m_refReconPicList[l][ref], w, *m_param);
         }
     }
 
+    int numTLD;
+    if (m_pool)
+        numTLD = m_param->bEnableWavefront ? m_pool->m_numWorkers : m_pool->m_numWorkers + m_pool->m_numProviders;
+    else
+        numTLD = 1;
+
     /* Get the QP for this frame from rate control. This call may block until
      * frames ahead of it in encode order have called rateControlEnd() */
     int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
     m_rce.newQp = qp;
 
+    if (m_nr)
+    {
+        if (qp > QP_MAX_SPEC && m_frame->m_param->rc.vbvBufferSize)
+        {
+            for (int i = 0; i < numTLD; i++)
+            {
+                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = m_top->m_offsetEmergency[qp - QP_MAX_SPEC - 1];
+                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].residualSum = m_top->m_residualSumEmergency;
+                m_tld[i].analysis.m_quant.m_frameNr[m_jpId].count = m_top->m_countEmergency;
+            }
+        }
+        else
+        {
+            if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
+            {
+                for (int i = 0; i < numTLD; i++)
+                {
+                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrOffsetDenoise;
+                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].residualSum = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrResidualSum;
+                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].count = m_tld[i].analysis.m_quant.m_frameNr[m_jpId].nrCount;
+                }
+            }
+            else
+            {
+                for (int i = 0; i < numTLD; i++)
+                    m_tld[i].analysis.m_quant.m_frameNr[m_jpId].offset = NULL;
+            }
+        }
+    }
+
     /* Clip slice QP to 0-51 spec range before encoding */
     slice->m_sliceQp = x265_clip3(-QP_BD_OFFSET, QP_MAX_SPEC, qp);
 
@@ -458,7 +496,7 @@
     /* CQP and CRF (without capped VBV) doesn't use mid-frame statistics to 
      * tune RateControl parameters for other frames.
      * Hence, for these modes, update m_startEndOrder and unlock RC for previous threads waiting in
-     * RateControlEnd here, after the slicecontexts are initialized. For the rest - ABR
+     * RateControlEnd here, after the slice contexts are initialized. For the rest - ABR
      * and VBV, unlock only after rateControlUpdateStats of this frame is called */
     if (m_param->rc.rateControlMode != X265_RC_ABR && !m_top->m_rateControl->m_isVbv)
     {
@@ -482,7 +520,7 @@
             {
                 for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
                 {
-                    Frame *refpic = slice->m_refPicList[l][ref];
+                    Frame *refpic = slice->m_refFrameList[l][ref];
 
                     uint32_t reconRowCount = refpic->m_reconRowCount.get();
                     while ((reconRowCount != m_numRows) && (reconRowCount < row + m_refLagRows))
@@ -521,7 +559,7 @@
                     int list = l;
                     for (int ref = 0; ref < slice->m_numRefIdx[list]; ref++)
                     {
-                        Frame *refpic = slice->m_refPicList[list][ref];
+                        Frame *refpic = slice->m_refFrameList[list][ref];
 
                         uint32_t reconRowCount = refpic->m_reconRowCount.get();
                         while ((reconRowCount != m_numRows) && (reconRowCount < i + m_refLagRows))
@@ -572,10 +610,7 @@
         m_frame->m_encData->m_frameStats.lumaDistortion   += m_rows[i].rowStats.lumaDistortion;
         m_frame->m_encData->m_frameStats.chromaDistortion += m_rows[i].rowStats.chromaDistortion;
         m_frame->m_encData->m_frameStats.psyEnergy        += m_rows[i].rowStats.psyEnergy;
-        m_frame->m_encData->m_frameStats.lumaLevel        += m_rows[i].rowStats.lumaLevel;
-
-        if (m_rows[i].rowStats.maxLumaLevel > m_frame->m_encData->m_frameStats.maxLumaLevel)
-            m_frame->m_encData->m_frameStats.maxLumaLevel = m_rows[i].rowStats.maxLumaLevel;
+        m_frame->m_encData->m_frameStats.resEnergy        += m_rows[i].rowStats.resEnergy;
         for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
         {
             m_frame->m_encData->m_frameStats.cntSkipCu[depth] += m_rows[i].rowStats.cntSkipCu[depth];
@@ -589,7 +624,7 @@
     m_frame->m_encData->m_frameStats.avgLumaDistortion   = (double)(m_frame->m_encData->m_frameStats.lumaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
     m_frame->m_encData->m_frameStats.avgChromaDistortion = (double)(m_frame->m_encData->m_frameStats.chromaDistortion) / m_frame->m_encData->m_frameStats.totalCtu;
     m_frame->m_encData->m_frameStats.avgPsyEnergy        = (double)(m_frame->m_encData->m_frameStats.psyEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
-    m_frame->m_encData->m_frameStats.avgLumaLevel        = m_frame->m_encData->m_frameStats.lumaLevel / m_frame->m_encData->m_frameStats.totalCtu;
+    m_frame->m_encData->m_frameStats.avgResEnergy        = (double)(m_frame->m_encData->m_frameStats.resEnergy) / m_frame->m_encData->m_frameStats.totalCtu;
     m_frame->m_encData->m_frameStats.percentIntraNxN     = (double)(m_frame->m_encData->m_frameStats.cntIntraNxN * 100) / m_frame->m_encData->m_frameStats.totalCu;
     for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
     {
@@ -626,22 +661,23 @@
 
     if (m_param->decodedPictureHashSEI)
     {
+        int planes = (m_frame->m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
         if (m_param->decodedPictureHashSEI == 1)
         {
             m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::MD5;
-            for (int i = 0; i < 3; i++)
+            for (int i = 0; i < planes; i++)
                 MD5Final(&m_state[i], m_seiReconPictureDigest.m_digest[i]);
         }
         else if (m_param->decodedPictureHashSEI == 2)
         {
             m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CRC;
-            for (int i = 0; i < 3; i++)
+            for (int i = 0; i < planes; i++)
                 crcFinish(m_crc[i], m_seiReconPictureDigest.m_digest[i]);
         }
         else if (m_param->decodedPictureHashSEI == 3)
         {
             m_seiReconPictureDigest.m_method = SEIDecodedPictureHash::CHECKSUM;
-            for (int i = 0; i < 3; i++)
+            for (int i = 0; i < planes; i++)
                 checksumFinish(m_checksum[i], m_seiReconPictureDigest.m_digest[i]);
         }
 
@@ -678,41 +714,40 @@
     {
         for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
         {
-            Frame *refpic = slice->m_refPicList[l][ref];
+            Frame *refpic = slice->m_refFrameList[l][ref];
             ATOMIC_DEC(&refpic->m_countRefEncoders);
         }
     }
 
-    int numTLD;
-    if (m_pool)
-        numTLD = m_param->bEnableWavefront ? m_pool->m_numWorkers : m_pool->m_numWorkers + m_pool->m_numProviders;
-    else
-        numTLD = 1;
-
     if (m_nr)
     {
-        /* Accumulate NR statistics from all worker threads */
-        for (int i = 0; i < numTLD; i++)
+        bool nrEnabled = (m_rce.newQp < QP_MAX_SPEC || !m_param->rc.vbvBufferSize) && (m_param->noiseReductionIntra || m_param->noiseReductionInter);
+
+        if (nrEnabled)
         {
-            NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
-            for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
+            /* Accumulate NR statistics from all worker threads */
+            for (int i = 0; i < numTLD; i++)
             {
-                for (int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++)
-                    m_nr->residualSum[cat][coeff] += nr->residualSum[cat][coeff];
-            
-                m_nr->count[cat] += nr->count[cat];
+                NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
​

x265_1.8.tar.gz/source/encoder/framefilter.cpp -> x265_1.9.tar.gz/source/encoder/framefilter.cpp Changed

@@ -35,177 +35,486 @@
 static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
 static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
 
-FrameFilter::FrameFilter()
-    : m_param(NULL)
-    , m_frame(NULL)
-    , m_frameEncoder(NULL)
-    , m_ssimBuf(NULL)
-{
-}
-
 void FrameFilter::destroy()
 {
-    if (m_param->bEnableSAO)
-        m_sao.destroy();
-
     X265_FREE(m_ssimBuf);
+
+    if (m_parallelFilter)
+    {
+        if (m_param->bEnableSAO)
+        {
+            for(int row = 0; row < m_numRows; row++)
+                m_parallelFilter[row].m_sao.destroy((row == 0 ? 1 : 0));
+        }
+
+        delete[] m_parallelFilter;
+        m_parallelFilter = NULL;
+    }
 }
 
-void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows)
+void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols)
 {
     m_param = top->m_param;
     m_frameEncoder = frame;
     m_numRows = numRows;
+    m_numCols = numCols;
     m_hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp);
     m_vChromaShift = CHROMA_V_SHIFT(m_param->internalCsp);
     m_pad[0] = top->m_sps.conformanceWindow.rightOffset;
     m_pad[1] = top->m_sps.conformanceWindow.bottomOffset;
     m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0;
-    m_lastHeight = m_param->sourceHeight % g_maxCUSize ? m_param->sourceHeight % g_maxCUSize : g_maxCUSize;
-
-    if (m_param->bEnableSAO)
-        if (!m_sao.create(m_param))
-            m_param->bEnableSAO = 0;
+    m_lastHeight = (m_param->sourceHeight % g_maxCUSize) ? (m_param->sourceHeight % g_maxCUSize) : g_maxCUSize;
+    m_lastWidth = (m_param->sourceWidth % g_maxCUSize) ? (m_param->sourceWidth % g_maxCUSize) : g_maxCUSize;
 
     if (m_param->bEnableSsim)
         m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
+
+    m_parallelFilter = new ParallelFilter[numRows];
+
+    if (m_parallelFilter)
+    {
+        if (m_param->bEnableSAO)
+        {
+            for(int row = 0; row < numRows; row++)
+            {
+                if (!m_parallelFilter[row].m_sao.create(m_param, (row == 0 ? 1 : 0)))
+                    m_param->bEnableSAO = 0;
+                else
+                {
+                    if (row != 0)
+                        m_parallelFilter[row].m_sao.createFromRootNode(&m_parallelFilter[0].m_sao);
+                }
+
+            }
+        }
+
+        for(int row = 0; row < numRows; row++)
+        {
+            // Setting maximum bound information
+            m_parallelFilter[row].m_rowHeight = (row == numRows - 1) ? m_lastHeight : g_maxCUSize;
+            m_parallelFilter[row].m_row = row;
+            m_parallelFilter[row].m_rowAddr = row * numCols;
+            m_parallelFilter[row].m_frameFilter = this;
+
+            if (row > 0)
+                m_parallelFilter[row].m_prevRow = &m_parallelFilter[row - 1];
+        }
+    }
+
 }
 
 void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
 {
     m_frame = frame;
 
-    if (m_param->bEnableSAO)
-        m_sao.startSlice(frame, initState, qp);
+    // Reset Filter Data Struct
+    if (m_parallelFilter)
+    {
+        for(int row = 0; row < m_numRows; row++)
+        {
+            if (m_param->bEnableSAO)
+                m_parallelFilter[row].m_sao.startSlice(frame, initState, qp);
+
+            m_parallelFilter[row].m_lastCol.set(0);
+            m_parallelFilter[row].m_allowedCol.set(0);
+            m_parallelFilter[row].m_lastDeblocked.set(-1);
+            m_parallelFilter[row].m_encData = frame->m_encData;
+        }
+
+        // Reset SAO common statistics
+        if (m_param->bEnableSAO)
+            m_parallelFilter[0].m_sao.resetStats();
+    }
 }
 
-void FrameFilter::processRow(int row)
+/* restore original YUV samples to recon after SAO (if lossless) */
+static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx)
 {
-    ProfileScopeEvent(filterCTURow);
+    const int size = cu->m_log2CUSize[absPartIdx] - 2;
+    const uint32_t cuAddr = cu->m_cuAddr;
 
-#if DETAILED_CU_STATS
-    ScopedElapsedTime filterPerfScope(m_frameEncoder->m_cuStats.loopFilterElapsedTime);
-    m_frameEncoder->m_cuStats.countLoopFilter++;
-#endif
+    PicYuv* reconPic = frame.m_reconPic;
+    PicYuv* fencPic  = frame.m_fencPic;
 
-    if (!m_param->bEnableLoopFilter && !m_param->bEnableSAO)
+    pixel* dst = reconPic->getLumaAddr(cuAddr, absPartIdx);
+    pixel* src = fencPic->getLumaAddr(cuAddr, absPartIdx);
+
+    primitives.cu[size].copy_pp(dst, reconPic->m_stride, src, fencPic->m_stride);
+
+    if (cu->m_chromaFormat != X265_CSP_I400)
     {
-        processRowPost(row);
+        pixel* dstCb = reconPic->getCbAddr(cuAddr, absPartIdx);
+        pixel* srcCb = fencPic->getCbAddr(cuAddr, absPartIdx);
+        pixel* dstCr = reconPic->getCrAddr(cuAddr, absPartIdx);
+        pixel* srcCr = fencPic->getCrAddr(cuAddr, absPartIdx);
+
+        const int csp = fencPic->m_picCsp;
+        primitives.chroma[csp].cu[size].copy_pp(dstCb, reconPic->m_strideC, srcCb, fencPic->m_strideC);
+        primitives.chroma[csp].cu[size].copy_pp(dstCr, reconPic->m_strideC, srcCr, fencPic->m_strideC);
+    }
+}
+
+/* Original YUV restoration for CU in lossless coding */
+static void origCUSampleRestoration(const CUData* cu, const CUGeom& cuGeom, Frame& frame)
+{
+    uint32_t absPartIdx = cuGeom.absPartIdx;
+    if (cu->m_cuDepth[absPartIdx] > cuGeom.depth)
+    {
+        for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++)
+        {
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+            if (childGeom.flags & CUGeom::PRESENT)
+                origCUSampleRestoration(cu, childGeom, frame);
+        }
         return;
     }
-    FrameData& encData = *m_frame->m_encData;
-    const uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
-    const uint32_t lineStartCUAddr = row * numCols;
 
-    if (m_param->bEnableLoopFilter)
+    // restore original YUV samples
+    if (cu->m_tqBypass[absPartIdx])
+        restoreOrigLosslessYuv(cu, frame, absPartIdx);
+}
+
+void FrameFilter::ParallelFilter::copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col)
+{
+    // Copy SAO Top Reference Pixels
+    int ctuWidth  = g_maxCUSize;
+    const pixel* recY = reconPic->getPlaneAddr(0, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_stride);
+
+    // Luma
+    memcpy(&m_sao.m_tmpU[0][col * ctuWidth], recY, ctuWidth * sizeof(pixel));
+    X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected");
+
+    // Chroma
+    if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
+    {
+        ctuWidth  >>= m_sao.m_hChromaShift;
+
+        const pixel* recU = reconPic->getPlaneAddr(1, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC);
+        const pixel* recV = reconPic->getPlaneAddr(2, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC);
+        memcpy(&m_sao.m_tmpU[1][col * ctuWidth], recU, ctuWidth * sizeof(pixel));
+        memcpy(&m_sao.m_tmpU[2][col * ctuWidth], recV, ctuWidth * sizeof(pixel));
+
+        X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected");
+    }
+}
+
+void FrameFilter::ParallelFilter::processSaoUnitCu(SAOParam *saoParam, int col)
+{

 
@@ -35,177 +35,486 @@
 static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
 static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
 
-FrameFilter::FrameFilter()
-    : m_param(NULL)
-    , m_frame(NULL)
-    , m_frameEncoder(NULL)
-    , m_ssimBuf(NULL)
-{
-}
-
 void FrameFilter::destroy()
 {
-    if (m_param->bEnableSAO)
-        m_sao.destroy();
-
     X265_FREE(m_ssimBuf);
+
+    if (m_parallelFilter)
+    {
+        if (m_param->bEnableSAO)
+        {
+            for(int row = 0; row < m_numRows; row++)
+                m_parallelFilter[row].m_sao.destroy((row == 0 ? 1 : 0));
+        }
+
+        delete[] m_parallelFilter;
+        m_parallelFilter = NULL;
+    }
 }
 
-void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows)
+void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols)
 {
     m_param = top->m_param;
     m_frameEncoder = frame;
     m_numRows = numRows;
+    m_numCols = numCols;
     m_hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp);
     m_vChromaShift = CHROMA_V_SHIFT(m_param->internalCsp);
     m_pad[0] = top->m_sps.conformanceWindow.rightOffset;
     m_pad[1] = top->m_sps.conformanceWindow.bottomOffset;
     m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0;
-    m_lastHeight = m_param->sourceHeight % g_maxCUSize ? m_param->sourceHeight % g_maxCUSize : g_maxCUSize;
-
-    if (m_param->bEnableSAO)
-        if (!m_sao.create(m_param))
-            m_param->bEnableSAO = 0;
+    m_lastHeight = (m_param->sourceHeight % g_maxCUSize) ? (m_param->sourceHeight % g_maxCUSize) : g_maxCUSize;
+    m_lastWidth = (m_param->sourceWidth % g_maxCUSize) ? (m_param->sourceWidth % g_maxCUSize) : g_maxCUSize;
 
     if (m_param->bEnableSsim)
         m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
+
+    m_parallelFilter = new ParallelFilter[numRows];
+
+    if (m_parallelFilter)
+    {
+        if (m_param->bEnableSAO)
+        {
+            for(int row = 0; row < numRows; row++)
+            {
+                if (!m_parallelFilter[row].m_sao.create(m_param, (row == 0 ? 1 : 0)))
+                    m_param->bEnableSAO = 0;
+                else
+                {
+                    if (row != 0)
+                        m_parallelFilter[row].m_sao.createFromRootNode(&m_parallelFilter[0].m_sao);
+                }
+
+            }
+        }
+
+        for(int row = 0; row < numRows; row++)
+        {
+            // Setting maximum bound information
+            m_parallelFilter[row].m_rowHeight = (row == numRows - 1) ? m_lastHeight : g_maxCUSize;
+            m_parallelFilter[row].m_row = row;
+            m_parallelFilter[row].m_rowAddr = row * numCols;
+            m_parallelFilter[row].m_frameFilter = this;
+
+            if (row > 0)
+                m_parallelFilter[row].m_prevRow = &m_parallelFilter[row - 1];
+        }
+    }
+
 }
 
 void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
 {
     m_frame = frame;
 
-    if (m_param->bEnableSAO)
-        m_sao.startSlice(frame, initState, qp);
+    // Reset Filter Data Struct
+    if (m_parallelFilter)
+    {
+        for(int row = 0; row < m_numRows; row++)
+        {
+            if (m_param->bEnableSAO)
+                m_parallelFilter[row].m_sao.startSlice(frame, initState, qp);
+
+            m_parallelFilter[row].m_lastCol.set(0);
+            m_parallelFilter[row].m_allowedCol.set(0);
+            m_parallelFilter[row].m_lastDeblocked.set(-1);
+            m_parallelFilter[row].m_encData = frame->m_encData;
+        }
+
+        // Reset SAO common statistics
+        if (m_param->bEnableSAO)
+            m_parallelFilter[0].m_sao.resetStats();
+    }
 }
 
-void FrameFilter::processRow(int row)
+/* restore original YUV samples to recon after SAO (if lossless) */
+static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx)
 {
-    ProfileScopeEvent(filterCTURow);
+    const int size = cu->m_log2CUSize[absPartIdx] - 2;
+    const uint32_t cuAddr = cu->m_cuAddr;
 
-#if DETAILED_CU_STATS
-    ScopedElapsedTime filterPerfScope(m_frameEncoder->m_cuStats.loopFilterElapsedTime);
-    m_frameEncoder->m_cuStats.countLoopFilter++;
-#endif
+    PicYuv* reconPic = frame.m_reconPic;
+    PicYuv* fencPic  = frame.m_fencPic;
 
-    if (!m_param->bEnableLoopFilter && !m_param->bEnableSAO)
+    pixel* dst = reconPic->getLumaAddr(cuAddr, absPartIdx);
+    pixel* src = fencPic->getLumaAddr(cuAddr, absPartIdx);
+
+    primitives.cu[size].copy_pp(dst, reconPic->m_stride, src, fencPic->m_stride);
+
+    if (cu->m_chromaFormat != X265_CSP_I400)
     {
-        processRowPost(row);
+        pixel* dstCb = reconPic->getCbAddr(cuAddr, absPartIdx);
+        pixel* srcCb = fencPic->getCbAddr(cuAddr, absPartIdx);
+        pixel* dstCr = reconPic->getCrAddr(cuAddr, absPartIdx);
+        pixel* srcCr = fencPic->getCrAddr(cuAddr, absPartIdx);
+
+        const int csp = fencPic->m_picCsp;
+        primitives.chroma[csp].cu[size].copy_pp(dstCb, reconPic->m_strideC, srcCb, fencPic->m_strideC);
+        primitives.chroma[csp].cu[size].copy_pp(dstCr, reconPic->m_strideC, srcCr, fencPic->m_strideC);
+    }
+}
+
+/* Original YUV restoration for CU in lossless coding */
+static void origCUSampleRestoration(const CUData* cu, const CUGeom& cuGeom, Frame& frame)
+{
+    uint32_t absPartIdx = cuGeom.absPartIdx;
+    if (cu->m_cuDepth[absPartIdx] > cuGeom.depth)
+    {
+        for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++)
+        {
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+            if (childGeom.flags & CUGeom::PRESENT)
+                origCUSampleRestoration(cu, childGeom, frame);
+        }
         return;
     }
-    FrameData& encData = *m_frame->m_encData;
-    const uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
-    const uint32_t lineStartCUAddr = row * numCols;
 
-    if (m_param->bEnableLoopFilter)
+    // restore original YUV samples
+    if (cu->m_tqBypass[absPartIdx])
+        restoreOrigLosslessYuv(cu, frame, absPartIdx);
+}
+
+void FrameFilter::ParallelFilter::copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col)
+{
+    // Copy SAO Top Reference Pixels
+    int ctuWidth  = g_maxCUSize;
+    const pixel* recY = reconPic->getPlaneAddr(0, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_stride);
+
+    // Luma
+    memcpy(&m_sao.m_tmpU[0][col * ctuWidth], recY, ctuWidth * sizeof(pixel));
+    X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected");
+
+    // Chroma
+    if (m_frameFilter->m_param->internalCsp != X265_CSP_I400)
+    {
+        ctuWidth  >>= m_sao.m_hChromaShift;
+
+        const pixel* recU = reconPic->getPlaneAddr(1, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC);
+        const pixel* recV = reconPic->getPlaneAddr(2, cuAddr) - (m_rowAddr == 0 ? 0 : reconPic->m_strideC);
+        memcpy(&m_sao.m_tmpU[1][col * ctuWidth], recU, ctuWidth * sizeof(pixel));
+        memcpy(&m_sao.m_tmpU[2][col * ctuWidth], recV, ctuWidth * sizeof(pixel));
+
+        X265_CHECK(col * ctuWidth + ctuWidth <= m_sao.m_numCuInWidth * ctuWidth, "m_tmpU buffer beyond bound write detected");
+    }
+}
+
+void FrameFilter::ParallelFilter::processSaoUnitCu(SAOParam *saoParam, int col)
+{
​

x265_1.8.tar.gz/source/encoder/framefilter.h -> x265_1.9.tar.gz/source/encoder/framefilter.h Changed

@@ -29,6 +29,7 @@
 #include "frame.h"
 #include "deblock.h"
 #include "sao.h"
+#include "threadpool.h" // class BondedTaskGroup
 
 namespace X265_NS {
 // private x265 namespace
@@ -39,7 +40,7 @@
 struct ThreadLocalData;
 
 // Manages the processing of a single frame loopfilter
-class FrameFilter : public Deblock
+class FrameFilter
 {
 public:
 
@@ -50,24 +51,86 @@
     int           m_vChromaShift;
     int           m_pad[2];
 
-    SAO           m_sao;
     int           m_numRows;
+    int           m_numCols;
     int           m_saoRowDelay;
     int           m_lastHeight;
+    int           m_lastWidth;
     
-    void*         m_ssimBuf; /* Temp storage for ssim computation */
+    void*         m_ssimBuf;        /* Temp storage for ssim computation */
 
-    FrameFilter();
+#define MAX_PFILTER_CUS     (4) /* maximum CUs for every thread */
+    class ParallelFilter : public BondedTaskGroup, public Deblock
+    {
+    public:
+        uint32_t            m_rowHeight;
+        int                 m_row;
+        uint32_t            m_rowAddr;
+        FrameFilter*        m_frameFilter;
+        FrameData*          m_encData;
+        ParallelFilter*     m_prevRow;
+        SAO                 m_sao;
+        ThreadSafeInteger   m_lastCol;          /* The column that next to process */
+        ThreadSafeInteger   m_allowedCol;       /* The column that processed from Encode pipeline */
+        ThreadSafeInteger   m_lastDeblocked;   /* The column that finished all of Deblock stages  */
 
-    void init(Encoder *top, FrameEncoder *frame, int numRows);
+        ParallelFilter()
+            : m_rowHeight(0)
+            , m_row(0)
+            , m_rowAddr(0)
+            , m_frameFilter(NULL)
+            , m_encData(NULL)
+            , m_prevRow(NULL)
+        {
+        }
+
+        ~ParallelFilter()
+        { }
+
+        void processTasks(int workerThreadId);
+
+        // Apply SAO on a CU in current row
+        void processSaoUnitCu(SAOParam *saoParam, int col);
+
+        // Copy and Save SAO reference pixels for SAO Rdo decide
+        void copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col);
+
+        // Post-Process (Border extension)
+        void processPostCu(int col) const;
+
+        uint32_t getCUHeight() const
+        {
+            return m_rowHeight;
+        }
+
+    protected:
+
+        ParallelFilter operator=(const ParallelFilter&);
+    };
+
+    ParallelFilter*     m_parallelFilter;
+
+    FrameFilter()
+        : m_param(NULL)
+        , m_frame(NULL)
+        , m_frameEncoder(NULL)
+        , m_ssimBuf(NULL)
+        , m_parallelFilter(NULL)
+    {
+    }
+
+    uint32_t getCUWidth(int colNum) const
+    {
+        return (colNum == (int)m_numCols - 1) ? m_lastWidth : g_maxCUSize;
+    }
+
+    void init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols);
     void destroy();
 
     void start(Frame *pic, Entropy& initState, int qp);
 
     void processRow(int row);
-    void processRowPost(int row);
-    void processSao(int row);
-    uint32_t getCUHeight(int rowNum) const;
+    void processPostRow(int row);
 };
 }

 
@@ -29,6 +29,7 @@
 #include "frame.h"
 #include "deblock.h"
 #include "sao.h"
+#include "threadpool.h" // class BondedTaskGroup
 
 namespace X265_NS {
 // private x265 namespace
@@ -39,7 +40,7 @@
 struct ThreadLocalData;
 
 // Manages the processing of a single frame loopfilter
-class FrameFilter : public Deblock
+class FrameFilter
 {
 public:
 
@@ -50,24 +51,86 @@
     int           m_vChromaShift;
     int           m_pad[2];
 
-    SAO           m_sao;
     int           m_numRows;
+    int           m_numCols;
     int           m_saoRowDelay;
     int           m_lastHeight;
+    int           m_lastWidth;
     
-    void*         m_ssimBuf; /* Temp storage for ssim computation */
+    void*         m_ssimBuf;        /* Temp storage for ssim computation */
 
-    FrameFilter();
+#define MAX_PFILTER_CUS     (4) /* maximum CUs for every thread */
+    class ParallelFilter : public BondedTaskGroup, public Deblock
+    {
+    public:
+        uint32_t            m_rowHeight;
+        int                 m_row;
+        uint32_t            m_rowAddr;
+        FrameFilter*        m_frameFilter;
+        FrameData*          m_encData;
+        ParallelFilter*     m_prevRow;
+        SAO                 m_sao;
+        ThreadSafeInteger   m_lastCol;          /* The column that next to process */
+        ThreadSafeInteger   m_allowedCol;       /* The column that processed from Encode pipeline */
+        ThreadSafeInteger   m_lastDeblocked;   /* The column that finished all of Deblock stages  */
 
-    void init(Encoder *top, FrameEncoder *frame, int numRows);
+        ParallelFilter()
+            : m_rowHeight(0)
+            , m_row(0)
+            , m_rowAddr(0)
+            , m_frameFilter(NULL)
+            , m_encData(NULL)
+            , m_prevRow(NULL)
+        {
+        }
+
+        ~ParallelFilter()
+        { }
+
+        void processTasks(int workerThreadId);
+
+        // Apply SAO on a CU in current row
+        void processSaoUnitCu(SAOParam *saoParam, int col);
+
+        // Copy and Save SAO reference pixels for SAO Rdo decide
+        void copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col);
+
+        // Post-Process (Border extension)
+        void processPostCu(int col) const;
+
+        uint32_t getCUHeight() const
+        {
+            return m_rowHeight;
+        }
+
+    protected:
+
+        ParallelFilter operator=(const ParallelFilter&);
+    };
+
+    ParallelFilter*     m_parallelFilter;
+
+    FrameFilter()
+        : m_param(NULL)
+        , m_frame(NULL)
+        , m_frameEncoder(NULL)
+        , m_ssimBuf(NULL)
+        , m_parallelFilter(NULL)
+    {
+    }
+
+    uint32_t getCUWidth(int colNum) const
+    {
+        return (colNum == (int)m_numCols - 1) ? m_lastWidth : g_maxCUSize;
+    }
+
+    void init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols);
     void destroy();
 
     void start(Frame *pic, Entropy& initState, int qp);
 
     void processRow(int row);
-    void processRowPost(int row);
-    void processSao(int row);
-    uint32_t getCUHeight(int rowNum) const;
+    void processPostRow(int row);
 };
 }
 
​

x265_1.8.tar.gz/source/encoder/level.cpp -> x265_1.9.tar.gz/source/encoder/level.cpp Changed

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -462,7 +463,7 @@
     {
         if (param->internalCsp != X265_CSP_I420)
         {
-            x265_log(param, X265_LOG_ERROR, "%s profile not compatible with %s input color space.\n",
+            x265_log(param, X265_LOG_ERROR, "%s profile not compatible with %s input chroma subsampling.\n",
                      profile, x265_source_csp_names[param->internalCsp]);
             return -1;
         }
@@ -472,7 +473,7 @@
     {
         if (param->internalCsp != X265_CSP_I420 && param->internalCsp != X265_CSP_I422)
         {
-            x265_log(param, X265_LOG_ERROR, "%s profile not compatible with %s input color space.\n",
+            x265_log(param, X265_LOG_ERROR, "%s profile not compatible with %s input chroma subsampling.\n",
                      profile, x265_source_csp_names[param->internalCsp]);
             return -1;
         }
​

x265_1.8.tar.gz/source/encoder/motion.cpp -> x265_1.9.tar.gz/source/encoder/motion.cpp Changed

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -188,11 +189,12 @@
     satd = primitives.pu[partEnum].satd;
     sad_x3 = primitives.pu[partEnum].sad_x3;
     sad_x4 = primitives.pu[partEnum].sad_x4;
+
     chromaSatd = primitives.chroma[fencPUYuv.m_csp].pu[partEnum].satd;
 
     /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size
      * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */
-    bChromaSATD = subpelRefine > 2 && chromaSatd;
+    bChromaSATD = subpelRefine > 2 && chromaSatd && (srcFencYuv.m_csp != X265_CSP_I400);
     X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n");
 
     ctuAddr = _ctuAddr;
@@ -1214,8 +1216,11 @@
         const pixel* refCb = ref->getCbAddr(ctuAddr, absPartIdx) + refOffset;
         const pixel* refCr = ref->getCrAddr(ctuAddr, absPartIdx) + refOffset;
 
-        xFrac = qmv.x & ((1 << shiftHor) - 1);
-        yFrac = qmv.y & ((1 << shiftVer) - 1);
+        X265_CHECK((hshift == 0) || (hshift == 1), "hshift must be 0 or 1\n");
+        X265_CHECK((vshift == 0) || (vshift == 1), "vshift must be 0 or 1\n");
+
+        xFrac = qmv.x & (hshift ? 7 : 3);
+        yFrac = qmv.y & (vshift ? 7 : 3);
 
         if (!(yFrac | xFrac))
         {
​

x265_1.8.tar.gz/source/encoder/motion.h -> x265_1.9.tar.gz/source/encoder/motion.h Changed

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
​

x265_1.8.tar.gz/source/encoder/nal.cpp -> x265_1.9.tar.gz/source/encoder/nal.cpp Changed

 
@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
​

x265_1.8.tar.gz/source/encoder/ratecontrol.cpp -> x265_1.9.tar.gz/source/encoder/ratecontrol.cpp Changed

@@ -23,6 +23,10 @@
  * For more information, contact us at license @ x265.com.
  *****************************************************************************/
 
+#if _MSC_VER
+#pragma warning(disable: 4127) // conditional expression is constant, yes I know
+#endif
+
 #include "common.h"
 #include "param.h"
 #include "frame.h"
@@ -142,6 +146,9 @@
     rce->expectedVbv = rce2Pass->expectedVbv;
     rce->blurredComplexity = rce2Pass->blurredComplexity;
     rce->sliceType = rce2Pass->sliceType;
+    rce->qpNoVbv = rce2Pass->qpNoVbv;
+    rce->newQp = rce2Pass->newQp;
+    rce->qRceq = rce2Pass->qRceq;
 }
 
 }  // end anonymous namespace
@@ -205,7 +212,7 @@
             m_rateFactorMaxDecrement = m_param->rc.rfConstant - m_param->rc.rfConstantMin;
     }
     m_isAbr = m_param->rc.rateControlMode != X265_RC_CQP && !m_param->rc.bStatRead;
-    m_2pass = m_param->rc.rateControlMode == X265_RC_ABR && m_param->rc.bStatRead;
+    m_2pass = (m_param->rc.rateControlMode == X265_RC_ABR || m_param->rc.vbvMaxBitrate > 0) && m_param->rc.bStatRead;
     m_bitrate = m_param->rc.bitrate * 1000;
     m_frameDuration = (double)m_param->fpsDenom / m_param->fpsNum;
     m_qp = m_param->rc.qp;
@@ -219,6 +226,7 @@
     m_cutreeStatFileOut = m_cutreeStatFileIn = NULL;
     m_rce2Pass = NULL;
     m_lastBsliceSatdCost = 0;
+    m_movingAvgSum = 0.0;
 
     // vbv initialization
     m_param->rc.vbvBufferSize = x265_clip3(0, 2000000, m_param->rc.vbvBufferSize);
@@ -444,6 +452,7 @@
                 CMP_OPT_FIRST_PASS("open-gop", m_param->bOpenGOP);
                 CMP_OPT_FIRST_PASS("keyint", m_param->keyframeMax);
                 CMP_OPT_FIRST_PASS("scenecut", m_param->scenecutThreshold);
+                CMP_OPT_FIRST_PASS("intra-refresh", m_param->bIntraRefresh);
 
                 if ((p = strstr(opts, "b-adapt=")) != 0 && sscanf(p, "b-adapt=%d", &i) && i >= X265_B_ADAPT_NONE && i <= X265_B_ADAPT_TRELLIS)
                 {
@@ -488,6 +497,12 @@
                  x265_log(m_param, X265_LOG_ERROR, "Rce Entries for 2 pass cannot be allocated\n");
                  return false;
             }
+            m_encOrder = X265_MALLOC(int, m_numEntries);
+            if (!m_encOrder)
+            {
+                x265_log(m_param, X265_LOG_ERROR, "Encode order for 2 pass cannot be allocated\n");
+                return false;
+            }
             /* init all to skipped p frames */
             for (int i = 0; i < m_numEntries; i++)
             {
@@ -504,22 +519,24 @@
             {
                 RateControlEntry *rce;
                 int frameNumber;
+                int encodeOrder;
                 char picType;
                 int e;
                 char *next;
-                double qpRc, qpAq;
+                double qpRc, qpAq, qNoVbv, qRceq;
                 next = strstr(p, ";");
                 if (next)
                     *next++ = 0;
-                e = sscanf(p, " in:%d ", &frameNumber);
+                e = sscanf(p, " in:%d out:%d", &frameNumber, &encodeOrder);
                 if (frameNumber < 0 || frameNumber >= m_numEntries)
                 {
                     x265_log(m_param, X265_LOG_ERROR, "bad frame number (%d) at stats line %d\n", frameNumber, i);
                     return false;
                 }
-                rce = &m_rce2Pass[frameNumber];
-                e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf",
-                       &picType, &qpRc, &qpAq, &rce->coeffBits,
+                rce = &m_rce2Pass[encodeOrder];
+                m_encOrder[frameNumber] = encodeOrder;
+                e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf",
+                       &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
                        &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
                        &rce->skipCuCount);
                 rce->keptAsRef = true;
@@ -538,13 +555,16 @@
                     x265_log(m_param, X265_LOG_ERROR, "statistics are damaged at line %d, parser out=%d\n", i, e);
                     return false;
                 }
-                rce->qScale = x265_qp2qScale(qpRc);
+                rce->qScale = rce->newQScale = x265_qp2qScale(qpRc);
                 totalQpAq += qpAq;
+                rce->qpNoVbv = qNoVbv;
+                rce->qpaRc = qpRc;
+                rce->qpAq = qpAq;
+                rce->qRceq = qRceq;
                 p = next;
             }
             X265_FREE(statsBuf);
-
-            if (m_param->rc.rateControlMode == X265_RC_ABR)
+            if (m_param->rc.rateControlMode == X265_RC_ABR || m_param->rc.vbvMaxBitrate > 0)
             {
                 if (!initPass2())
                     return false;
@@ -627,11 +647,8 @@
 
     #undef MAX_DURATION
 }
-
-bool RateControl::initPass2()
+bool RateControl::analyseABR2Pass(int startIndex, int endIndex, uint64_t allAvailableBits)
 {
-    uint64_t allConstBits = 0;
-    uint64_t allAvailableBits = uint64_t(m_param->rc.bitrate * 1000. * m_numEntries * m_frameDuration);
     double rateFactor, stepMult;
     double qBlur = m_param->rc.qblur;
     double cplxBlur = m_param->rc.complexityBlur;
@@ -640,30 +657,19 @@
     double *qScale, *blurredQscale;
     double baseCplx = m_ncu * (m_param->bframes ? 120 : 80);
     double clippedDuration = CLIP_DURATION(m_frameDuration) / BASE_FRAME_DURATION;
-
-    /* find total/average complexity & const_bits */
-    for (int i = 0; i < m_numEntries; i++)
-        allConstBits += m_rce2Pass[i].miscBits;
-
-    if (allAvailableBits < allConstBits)
-    {
-        x265_log(m_param, X265_LOG_ERROR, "requested bitrate is too low. estimated minimum is %d kbps\n",
-                 (int)(allConstBits * m_fps / m_numEntries * 1000.));
-        return false;
-    }
-
+    int framesCount = endIndex - startIndex + 1;
     /* Blur complexities, to reduce local fluctuation of QP.
      * We don't blur the QPs directly, because then one very simple frame
      * could drag down the QP of a nearby complex frame and give it more
      * bits than intended. */
-    for (int i = 0; i < m_numEntries; i++)
+    for (int i = startIndex; i <= endIndex; i++)
     {
         double weightSum = 0;
         double cplxSum = 0;
         double weight = 1.0;
         double gaussianWeight;
         /* weighted average of cplx of future frames */
-        for (int j = 1; j < cplxBlur * 2 && j < m_numEntries - i; j++)
+        for (int j = 1; j < cplxBlur * 2 && j <= endIndex - i; j++)
         {
             RateControlEntry *rcj = &m_rce2Pass[i + j];
             weight *= 1 - pow(rcj->iCuCount / m_ncu, 2);
@@ -687,11 +693,10 @@
         }
         m_rce2Pass[i].blurredComplexity = cplxSum / weightSum;
     }
-
-    CHECKED_MALLOC(qScale, double, m_numEntries);
+    CHECKED_MALLOC(qScale, double, framesCount);
     if (filterSize > 1)
     {
-        CHECKED_MALLOC(blurredQscale, double, m_numEntries);
+        CHECKED_MALLOC(blurredQscale, double, framesCount);
     }
     else
         blurredQscale = qScale;
@@ -702,9 +707,8 @@
      * because qscale2bits is not invertible, but we can start with the simple
      * approximation of scaling the 1st pass by the ratio of bitrates.
      * The search range is probably overkill, but speed doesn't matter here. */
-
     expectedBits = 1;
-    for (int i = 0; i < m_numEntries; i++)
+    for (int i = startIndex; i <= endIndex; i++)
     {
         RateControlEntry* rce = &m_rce2Pass[i];
         double q = getQScale(rce, 1.0);
@@ -781,12 +785,10 @@
     X265_FREE(qScale);
     if (filterSize > 1)
         X265_FREE(blurredQscale);
-
     if (m_isVbv)
-        if (!vbv2Pass(allAvailableBits))
+    if (!vbv2Pass(allAvailableBits, endIndex, startIndex))
             return false;
-    expectedBits = countExpectedBits();
-
+    expectedBits = countExpectedBits(startIndex, endIndex);
     if (fabs(expectedBits / allAvailableBits - 1.0) > 0.01)
     {
         double avgq = 0;
@@ -819,7 +821,123 @@
     return false;
 }

 
@@ -23,6 +23,10 @@
  * For more information, contact us at license @ x265.com.
  *****************************************************************************/
 
+#if _MSC_VER
+#pragma warning(disable: 4127) // conditional expression is constant, yes I know
+#endif
+
 #include "common.h"
 #include "param.h"
 #include "frame.h"
@@ -142,6 +146,9 @@
     rce->expectedVbv = rce2Pass->expectedVbv;
     rce->blurredComplexity = rce2Pass->blurredComplexity;
     rce->sliceType = rce2Pass->sliceType;
+    rce->qpNoVbv = rce2Pass->qpNoVbv;
+    rce->newQp = rce2Pass->newQp;
+    rce->qRceq = rce2Pass->qRceq;
 }
 
 }  // end anonymous namespace
@@ -205,7 +212,7 @@
             m_rateFactorMaxDecrement = m_param->rc.rfConstant - m_param->rc.rfConstantMin;
     }
     m_isAbr = m_param->rc.rateControlMode != X265_RC_CQP && !m_param->rc.bStatRead;
-    m_2pass = m_param->rc.rateControlMode == X265_RC_ABR && m_param->rc.bStatRead;
+    m_2pass = (m_param->rc.rateControlMode == X265_RC_ABR || m_param->rc.vbvMaxBitrate > 0) && m_param->rc.bStatRead;
     m_bitrate = m_param->rc.bitrate * 1000;
     m_frameDuration = (double)m_param->fpsDenom / m_param->fpsNum;
     m_qp = m_param->rc.qp;
@@ -219,6 +226,7 @@
     m_cutreeStatFileOut = m_cutreeStatFileIn = NULL;
     m_rce2Pass = NULL;
     m_lastBsliceSatdCost = 0;
+    m_movingAvgSum = 0.0;
 
     // vbv initialization
     m_param->rc.vbvBufferSize = x265_clip3(0, 2000000, m_param->rc.vbvBufferSize);
@@ -444,6 +452,7 @@
                 CMP_OPT_FIRST_PASS("open-gop", m_param->bOpenGOP);
                 CMP_OPT_FIRST_PASS("keyint", m_param->keyframeMax);
                 CMP_OPT_FIRST_PASS("scenecut", m_param->scenecutThreshold);
+                CMP_OPT_FIRST_PASS("intra-refresh", m_param->bIntraRefresh);
 
                 if ((p = strstr(opts, "b-adapt=")) != 0 && sscanf(p, "b-adapt=%d", &i) && i >= X265_B_ADAPT_NONE && i <= X265_B_ADAPT_TRELLIS)
                 {
@@ -488,6 +497,12 @@
                  x265_log(m_param, X265_LOG_ERROR, "Rce Entries for 2 pass cannot be allocated\n");
                  return false;
             }
+            m_encOrder = X265_MALLOC(int, m_numEntries);
+            if (!m_encOrder)
+            {
+                x265_log(m_param, X265_LOG_ERROR, "Encode order for 2 pass cannot be allocated\n");
+                return false;
+            }
             /* init all to skipped p frames */
             for (int i = 0; i < m_numEntries; i++)
             {
@@ -504,22 +519,24 @@
             {
                 RateControlEntry *rce;
                 int frameNumber;
+                int encodeOrder;
                 char picType;
                 int e;
                 char *next;
-                double qpRc, qpAq;
+                double qpRc, qpAq, qNoVbv, qRceq;
                 next = strstr(p, ";");
                 if (next)
                     *next++ = 0;
-                e = sscanf(p, " in:%d ", &frameNumber);
+                e = sscanf(p, " in:%d out:%d", &frameNumber, &encodeOrder);
                 if (frameNumber < 0 || frameNumber >= m_numEntries)
                 {
                     x265_log(m_param, X265_LOG_ERROR, "bad frame number (%d) at stats line %d\n", frameNumber, i);
                     return false;
                 }
-                rce = &m_rce2Pass[frameNumber];
-                e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf",
-                       &picType, &qpRc, &qpAq, &rce->coeffBits,
+                rce = &m_rce2Pass[encodeOrder];
+                m_encOrder[frameNumber] = encodeOrder;
+                e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf",
+                       &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
                        &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
                        &rce->skipCuCount);
                 rce->keptAsRef = true;
@@ -538,13 +555,16 @@
                     x265_log(m_param, X265_LOG_ERROR, "statistics are damaged at line %d, parser out=%d\n", i, e);
                     return false;
                 }
-                rce->qScale = x265_qp2qScale(qpRc);
+                rce->qScale = rce->newQScale = x265_qp2qScale(qpRc);
                 totalQpAq += qpAq;
+                rce->qpNoVbv = qNoVbv;
+                rce->qpaRc = qpRc;
+                rce->qpAq = qpAq;
+                rce->qRceq = qRceq;
                 p = next;
             }
             X265_FREE(statsBuf);
-
-            if (m_param->rc.rateControlMode == X265_RC_ABR)
+            if (m_param->rc.rateControlMode == X265_RC_ABR || m_param->rc.vbvMaxBitrate > 0)
             {
                 if (!initPass2())
                     return false;
@@ -627,11 +647,8 @@
 
     #undef MAX_DURATION
 }
-
-bool RateControl::initPass2()
+bool RateControl::analyseABR2Pass(int startIndex, int endIndex, uint64_t allAvailableBits)
 {
-    uint64_t allConstBits = 0;
-    uint64_t allAvailableBits = uint64_t(m_param->rc.bitrate * 1000. * m_numEntries * m_frameDuration);
     double rateFactor, stepMult;
     double qBlur = m_param->rc.qblur;
     double cplxBlur = m_param->rc.complexityBlur;
@@ -640,30 +657,19 @@
     double *qScale, *blurredQscale;
     double baseCplx = m_ncu * (m_param->bframes ? 120 : 80);
     double clippedDuration = CLIP_DURATION(m_frameDuration) / BASE_FRAME_DURATION;
-
-    /* find total/average complexity & const_bits */
-    for (int i = 0; i < m_numEntries; i++)
-        allConstBits += m_rce2Pass[i].miscBits;
-
-    if (allAvailableBits < allConstBits)
-    {
-        x265_log(m_param, X265_LOG_ERROR, "requested bitrate is too low. estimated minimum is %d kbps\n",
-                 (int)(allConstBits * m_fps / m_numEntries * 1000.));
-        return false;
-    }
-
+    int framesCount = endIndex - startIndex + 1;
     /* Blur complexities, to reduce local fluctuation of QP.
      * We don't blur the QPs directly, because then one very simple frame
      * could drag down the QP of a nearby complex frame and give it more
      * bits than intended. */
-    for (int i = 0; i < m_numEntries; i++)
+    for (int i = startIndex; i <= endIndex; i++)
     {
         double weightSum = 0;
         double cplxSum = 0;
         double weight = 1.0;
         double gaussianWeight;
         /* weighted average of cplx of future frames */
-        for (int j = 1; j < cplxBlur * 2 && j < m_numEntries - i; j++)
+        for (int j = 1; j < cplxBlur * 2 && j <= endIndex - i; j++)
         {
             RateControlEntry *rcj = &m_rce2Pass[i + j];
             weight *= 1 - pow(rcj->iCuCount / m_ncu, 2);
@@ -687,11 +693,10 @@
         }
         m_rce2Pass[i].blurredComplexity = cplxSum / weightSum;
     }
-
-    CHECKED_MALLOC(qScale, double, m_numEntries);
+    CHECKED_MALLOC(qScale, double, framesCount);
     if (filterSize > 1)
     {
-        CHECKED_MALLOC(blurredQscale, double, m_numEntries);
+        CHECKED_MALLOC(blurredQscale, double, framesCount);
     }
     else
         blurredQscale = qScale;
@@ -702,9 +707,8 @@
      * because qscale2bits is not invertible, but we can start with the simple
      * approximation of scaling the 1st pass by the ratio of bitrates.
      * The search range is probably overkill, but speed doesn't matter here. */
-
     expectedBits = 1;
-    for (int i = 0; i < m_numEntries; i++)
+    for (int i = startIndex; i <= endIndex; i++)
     {
         RateControlEntry* rce = &m_rce2Pass[i];
         double q = getQScale(rce, 1.0);
@@ -781,12 +785,10 @@
     X265_FREE(qScale);
     if (filterSize > 1)
         X265_FREE(blurredQscale);
-
     if (m_isVbv)
-        if (!vbv2Pass(allAvailableBits))
+    if (!vbv2Pass(allAvailableBits, endIndex, startIndex))
             return false;
-    expectedBits = countExpectedBits();
-
+    expectedBits = countExpectedBits(startIndex, endIndex);
     if (fabs(expectedBits / allAvailableBits - 1.0) > 0.01)
     {
         double avgq = 0;
@@ -819,7 +821,123 @@
     return false;
 }
 
​

x265_1.8.tar.gz/source/encoder/ratecontrol.h -> x265_1.9.tar.gz/source/encoder/ratecontrol.h Changed

@@ -48,6 +48,7 @@
 
 struct Predictor
 {
+    double coeffMin;
     double coeff;
     double count;
     double decay;
@@ -74,6 +75,7 @@
     double  qpaRc;
     double  qpAq;
     double  qRceq;
+    double  qpPrev;
     double  frameSizePlanned;  /* frame Size decided by RateCotrol before encoding the frame */
     double  bufferRate;
     double  movingAvgSum;
@@ -167,6 +169,8 @@
     int64_t m_satdCostWindow[50];
     int64_t m_encodedBitsWindow[50];
     int     m_sliderPos;
+    int64_t m_lastRemovedSatdCost;
+    double  m_movingAvgSum;
 
     /* To detect a pattern of low detailed static frames in single pass ABR using satdcosts */
     int64_t m_lastBsliceSatdCost;
@@ -205,8 +209,8 @@
     double  m_lastAccumPNorm;
     double  m_expectedBitsSum;   /* sum of qscale2bits after rceq, ratefactor, and overflow, only includes finished frames */
     int64_t m_predictedBits;
+    int     *m_encOrder;
     RateControlEntry* m_rce2Pass;
-
     struct
     {
         uint16_t *qpBuffer[2]; /* Global buffers for converting MB-tree quantizer data. */
@@ -258,11 +262,12 @@
     void   checkAndResetABR(RateControlEntry* rce, bool isFrameDone);
     double predictRowsSizeSum(Frame* pic, RateControlEntry* rce, double qpm, int32_t& encodedBits);
     bool   initPass2();
+    bool   analyseABR2Pass(int startPoc, int endPoc, uint64_t allAvailableBits);
     void   initFramePredictors();
     double getDiffLimitedQScale(RateControlEntry *rce, double q);
-    double countExpectedBits();
-    bool   vbv2Pass(uint64_t allAvailableBits);
-    bool   findUnderflow(double *fills, int *t0, int *t1, int over);
+    double countExpectedBits(int startPos, int framesCount);
+    bool   vbv2Pass(uint64_t allAvailableBits, int frameCount, int startPos);
+    bool   findUnderflow(double *fills, int *t0, int *t1, int over, int framesCount);
     bool   fixUnderflow(int t0, int t1, double adjustment, double qscaleMin, double qscaleMax);
 };
 }

 
@@ -48,6 +48,7 @@
 
 struct Predictor
 {
+    double coeffMin;
     double coeff;
     double count;
     double decay;
@@ -74,6 +75,7 @@
     double  qpaRc;
     double  qpAq;
     double  qRceq;
+    double  qpPrev;
     double  frameSizePlanned;  /* frame Size decided by RateCotrol before encoding the frame */
     double  bufferRate;
     double  movingAvgSum;
@@ -167,6 +169,8 @@
     int64_t m_satdCostWindow[50];
     int64_t m_encodedBitsWindow[50];
     int     m_sliderPos;
+    int64_t m_lastRemovedSatdCost;
+    double  m_movingAvgSum;
 
     /* To detect a pattern of low detailed static frames in single pass ABR using satdcosts */
     int64_t m_lastBsliceSatdCost;
@@ -205,8 +209,8 @@
     double  m_lastAccumPNorm;
     double  m_expectedBitsSum;   /* sum of qscale2bits after rceq, ratefactor, and overflow, only includes finished frames */
     int64_t m_predictedBits;
+    int     *m_encOrder;
     RateControlEntry* m_rce2Pass;
-
     struct
     {
         uint16_t *qpBuffer[2]; /* Global buffers for converting MB-tree quantizer data. */
@@ -258,11 +262,12 @@
     void   checkAndResetABR(RateControlEntry* rce, bool isFrameDone);
     double predictRowsSizeSum(Frame* pic, RateControlEntry* rce, double qpm, int32_t& encodedBits);
     bool   initPass2();
+    bool   analyseABR2Pass(int startPoc, int endPoc, uint64_t allAvailableBits);
     void   initFramePredictors();
     double getDiffLimitedQScale(RateControlEntry *rce, double q);
-    double countExpectedBits();
-    bool   vbv2Pass(uint64_t allAvailableBits);
-    bool   findUnderflow(double *fills, int *t0, int *t1, int over);
+    double countExpectedBits(int startPos, int framesCount);
+    bool   vbv2Pass(uint64_t allAvailableBits, int frameCount, int startPos);
+    bool   findUnderflow(double *fills, int *t0, int *t1, int over, int framesCount);
     bool   fixUnderflow(int t0, int t1, double adjustment, double qscaleMin, double qscaleMax);
 };
 }
​

x265_1.8.tar.gz/source/encoder/rdcost.h -> x265_1.9.tar.gz/source/encoder/rdcost.h Changed

 
@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -73,13 +74,18 @@
             qpCr = x265_clip3(QP_MIN, QP_MAX_SPEC, qp + slice.m_pps->chromaQpOffset[1]);
         }
 
-        int chroma_offset_idx = X265_MIN(qp - qpCb + 12, MAX_CHROMA_LAMBDA_OFFSET);
-        uint16_t lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
-        m_chromaDistWeight[0] = lambdaOffset;
+        if (slice.m_sps->chromaFormatIdc == X265_CSP_I444)
+        {
+            int chroma_offset_idx = X265_MIN(qp - qpCb + 12, MAX_CHROMA_LAMBDA_OFFSET);
+            uint16_t lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
+            m_chromaDistWeight[0] = lambdaOffset;
 
-        chroma_offset_idx = X265_MIN(qp - qpCr + 12, MAX_CHROMA_LAMBDA_OFFSET);
-        lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
-        m_chromaDistWeight[1] = lambdaOffset;
+            chroma_offset_idx = X265_MIN(qp - qpCr + 12, MAX_CHROMA_LAMBDA_OFFSET);
+            lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
+            m_chromaDistWeight[1] = lambdaOffset;
+        }
+        else
+            m_chromaDistWeight[0] = m_chromaDistWeight[1] = 256;
     }
 
     void setLambda(double lambda2, double lambda)
@@ -88,9 +94,9 @@
         m_lambda = (uint64_t)floor(256.0 * lambda);
     }
 
-    inline uint64_t calcRdCost(sse_ret_t distortion, uint32_t bits) const
+    inline uint64_t calcRdCost(sse_t distortion, uint32_t bits) const
     {
-#if X265_DEPTH <= 10
+#if X265_DEPTH < 10
         X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
                    "calcRdCost wrap detected dist: %u, bits %u, lambda: " X265_LL "\n",
                    distortion, bits, m_lambda2);
@@ -108,15 +114,18 @@
         return primitives.cu[size].psy_cost_pp(source, sstride, recon, rstride);
     }
 
-    /* return the difference in energy between the source block and the recon block */
-    inline int psyCost(int size, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride) const
-    {
-        return primitives.cu[size].psy_cost_ss(source, sstride, recon, rstride);
-    }
-
     /* return the RD cost of this prediction, including the effect of psy-rd */
-    inline uint64_t calcPsyRdCost(sse_ret_t distortion, uint32_t bits, uint32_t psycost) const
+    inline uint64_t calcPsyRdCost(sse_t distortion, uint32_t bits, uint32_t psycost) const
     {
+#if X265_DEPTH < 10
+        X265_CHECK((bits <= (UINT64_MAX / m_lambda2)) && (psycost <= UINT64_MAX / (m_lambda * m_psyRd)),
+                   "calcPsyRdCost wrap detected dist: %u, bits: %u, lambda: " X265_LL ", lambda2: " X265_LL "\n",
+                   distortion, bits, m_lambda, m_lambda2);
+#else
+        X265_CHECK((bits <= (UINT64_MAX / m_lambda2)) && (psycost <= UINT64_MAX / (m_lambda * m_psyRd)),
+                   "calcPsyRdCost wrap detected dist: " X265_LL ", bits: %u, lambda: " X265_LL ", lambda2: " X265_LL "\n",
+                   distortion, bits, m_lambda, m_lambda2);
+#endif
         return distortion + ((m_lambda * m_psyRd * psycost) >> 24) + ((bits * m_lambda2) >> 8);
     }
 
@@ -127,9 +136,9 @@
         return sadCost + ((bits * m_lambda + 128) >> 8);
     }
 
-    inline sse_ret_t scaleChromaDist(uint32_t plane, sse_ret_t dist) const
+    inline sse_t scaleChromaDist(uint32_t plane, sse_t dist) const
     {
-#if X265_DEPTH <= 10
+#if X265_DEPTH < 10
         X265_CHECK(dist <= (UINT64_MAX - 128) / m_chromaDistWeight[plane - 1],
                    "scaleChromaDist wrap detected dist: %u, lambda: %u\n",
                    dist, m_chromaDistWeight[plane - 1]);
@@ -138,11 +147,13 @@
                    "scaleChromaDist wrap detected dist: " X265_LL " lambda: %u\n",
                    dist, m_chromaDistWeight[plane - 1]);
 #endif
-        return (sse_ret_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
+        return (sse_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
     }
 
     inline uint32_t getCost(uint32_t bits) const
     {
+        X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda,
+                   "getCost wrap detected bits: %u, lambda: " X265_LL "\n", bits, m_lambda);
         return (uint32_t)((bits * m_lambda + 128) >> 8);
     }
 };
​

x265_1.8.tar.gz/source/encoder/reference.cpp -> x265_1.9.tar.gz/source/encoder/reference.cpp Changed

 
@@ -68,7 +68,7 @@
         intptr_t stride = reconPic->m_stride;
         int cuHeight = g_maxCUSize;
 
-        for (int c = 0; c < numInterpPlanes; c++)
+        for (int c = 0; c < (p.internalCsp != X265_CSP_I400 ? numInterpPlanes : 1); c++)
         {
             if (c == 1)
             {
​

x265_1.8.tar.gz/source/encoder/sao.cpp -> x265_1.9.tar.gz/source/encoder/sao.cpp Changed

@@ -73,9 +73,6 @@
 
 SAO::SAO()
 {
-    m_count = NULL;
-    m_offset = NULL;
-    m_offsetOrg = NULL;
     m_countPreDblk = NULL;
     m_offsetOrgPreDblk = NULL;
     m_refDepth = 0;
@@ -84,28 +81,22 @@
     m_param = NULL;
     m_clipTable = NULL;
     m_clipTableBase = NULL;
-    m_tmpU1[0] = NULL;
-    m_tmpU1[1] = NULL;
-    m_tmpU1[2] = NULL;
-    m_tmpU2[0] = NULL;
-    m_tmpU2[1] = NULL;
-    m_tmpU2[2] = NULL;
-    m_tmpL1 = NULL;
-    m_tmpL2 = NULL;
-
-    m_depthSaoRate[0][0] = 0;
-    m_depthSaoRate[0][1] = 0;
-    m_depthSaoRate[0][2] = 0;
-    m_depthSaoRate[0][3] = 0;
-    m_depthSaoRate[1][0] = 0;
-    m_depthSaoRate[1][1] = 0;
-    m_depthSaoRate[1][2] = 0;
-    m_depthSaoRate[1][3] = 0;
+    m_tmpU[0] = NULL;
+    m_tmpU[1] = NULL;
+    m_tmpU[2] = NULL;
+    m_tmpL1[0] = NULL;
+    m_tmpL1[1] = NULL;
+    m_tmpL1[2] = NULL;
+    m_tmpL2[0] = NULL;
+    m_tmpL2[1] = NULL;
+    m_tmpL2[2] = NULL;
+    m_depthSaoRate = NULL;
 }
 
-bool SAO::create(x265_param* param)
+bool SAO::create(x265_param* param, int initCommon)
 {
     m_param = param;
+    m_chromaFormat = param->internalCsp;
     m_hChromaShift = CHROMA_H_SHIFT(param->internalCsp);
     m_vChromaShift = CHROMA_V_SHIFT(param->internalCsp);
 
@@ -116,37 +107,56 @@
     const pixel rangeExt = maxY >> 1;
     int numCtu = m_numCuInWidth * m_numCuInHeight;
 
-    CHECKED_MALLOC(m_clipTableBase,  pixel, maxY + 2 * rangeExt);
-
-    CHECKED_MALLOC(m_tmpL1, pixel, g_maxCUSize + 1);
-    CHECKED_MALLOC(m_tmpL2, pixel, g_maxCUSize + 1);
-
-    for (int i = 0; i < 3; i++)
+    for (int i = 0; i < (param->internalCsp != X265_CSP_I400 ? 3 : 1); i++)
     {
+        CHECKED_MALLOC(m_tmpL1[i], pixel, g_maxCUSize + 1);
+        CHECKED_MALLOC(m_tmpL2[i], pixel, g_maxCUSize + 1);
+
         // SAO asm code will read 1 pixel before and after, so pad by 2
-        CHECKED_MALLOC(m_tmpU1[i], pixel, m_param->sourceWidth + 2);
-        m_tmpU1[i] += 1;
-        CHECKED_MALLOC(m_tmpU2[i], pixel, m_param->sourceWidth + 2);
-        m_tmpU2[i] += 1;
+        // NOTE: m_param->sourceWidth+2 enough, to avoid condition check in copySaoAboveRef(), I alloc more up to 63 bytes in here
+        CHECKED_MALLOC(m_tmpU[i], pixel, m_numCuInWidth * g_maxCUSize + 2 + 32);
+        m_tmpU[i] += 1;
     }
 
-    CHECKED_MALLOC(m_count, PerClass, NUM_PLANE);
-    CHECKED_MALLOC(m_offset, PerClass, NUM_PLANE);
-    CHECKED_MALLOC(m_offsetOrg, PerClass, NUM_PLANE);
-
-    CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
-    CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
-
-    m_clipTable = &(m_clipTableBase[rangeExt]);
-
-    for (int i = 0; i < rangeExt; i++)
-        m_clipTableBase[i] = 0;
+    if (initCommon)
+    {
+        CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
+        CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
+        CHECKED_MALLOC(m_depthSaoRate, double, 2 * SAO_DEPTHRATE_SIZE);
+
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 0] = 0;
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 1] = 0;
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 2] = 0;
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 3] = 0;
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 0] = 0;
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 1] = 0;
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 2] = 0;
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 3] = 0;
+
+        CHECKED_MALLOC(m_clipTableBase,  pixel, maxY + 2 * rangeExt);
+        m_clipTable = &(m_clipTableBase[rangeExt]);
+
+        // Share with fast clip lookup table
+        if (initCommon)
+        {
+            for (int i = 0; i < rangeExt; i++)
+                m_clipTableBase[i] = 0;
 
-    for (int i = 0; i < maxY; i++)
-        m_clipTable[i] = (pixel)i;
+            for (int i = 0; i < maxY; i++)
+                m_clipTable[i] = (pixel)i;
 
-    for (int i = maxY; i < maxY + rangeExt; i++)
-        m_clipTable[i] = maxY;
+            for (int i = maxY; i < maxY + rangeExt; i++)
+                m_clipTable[i] = maxY;
+        }
+    }
+    else
+    {
+        // must initialize these common pointer outside of function
+        m_countPreDblk = NULL;
+        m_offsetOrgPreDblk = NULL;
+        m_clipTableBase = NULL;
+        m_clipTable = NULL;
+    }
 
     return true;
 
@@ -154,34 +164,61 @@
     return false;
 }
 
-void SAO::destroy()
+void SAO::createFromRootNode(SAO* root)
 {
-    X265_FREE(m_clipTableBase);
-
-    X265_FREE(m_tmpL1);
-    X265_FREE(m_tmpL2);
+    X265_CHECK(m_countPreDblk == NULL, "duplicate initialize on m_countPreDblk");
+    X265_CHECK(m_offsetOrgPreDblk == NULL, "duplicate initialize on m_offsetOrgPreDblk");
+    X265_CHECK(m_depthSaoRate == NULL, "duplicate initialize on m_depthSaoRate");
+    X265_CHECK(m_clipTableBase == NULL, "duplicate initialize on m_clipTableBase");
+    X265_CHECK(m_clipTable == NULL, "duplicate initialize on m_clipTable");
+
+    m_countPreDblk = root->m_countPreDblk;
+    m_offsetOrgPreDblk = root->m_offsetOrgPreDblk;
+    m_depthSaoRate = root->m_depthSaoRate;
+    m_clipTableBase = root->m_clipTableBase; // Unnecessary
+    m_clipTable = root->m_clipTable;
+}
 
+void SAO::destroy(int destoryCommon)
+{
     for (int i = 0; i < 3; i++)
     {
-        if (m_tmpU1[i]) X265_FREE(m_tmpU1[i] - 1);
-        if (m_tmpU2[i]) X265_FREE(m_tmpU2[i] - 1);
+        if (m_tmpL1[i])
+        {
+            X265_FREE(m_tmpL1[i]);
+            m_tmpL1[i] = NULL;
+        }
+
+        if (m_tmpL2[i])
+        {
+            X265_FREE(m_tmpL2[i]);
+            m_tmpL2[i] = NULL;
+        }
+
+        if (m_tmpU[i])
+        {
+            X265_FREE(m_tmpU[i] - 1);
+            m_tmpU[i] = NULL;
+        }
     }
 
-    X265_FREE(m_count);
-    X265_FREE(m_offset);
-    X265_FREE(m_offsetOrg);
-    X265_FREE(m_countPreDblk);
-    X265_FREE(m_offsetOrgPreDblk);
+    if (destoryCommon)
+    {
+        X265_FREE_ZERO(m_countPreDblk);
+        X265_FREE_ZERO(m_offsetOrgPreDblk);
+        X265_FREE_ZERO(m_depthSaoRate);
+        X265_FREE_ZERO(m_clipTableBase);
+    }
 }
 
 /* allocate memory for SAO parameters */
 void SAO::allocSaoParam(SAOParam* saoParam) const
 {
+    int planes = (m_param->internalCsp != X265_CSP_I400) ? 3 : 1;

 
@@ -73,9 +73,6 @@
 
 SAO::SAO()
 {
-    m_count = NULL;
-    m_offset = NULL;
-    m_offsetOrg = NULL;
     m_countPreDblk = NULL;
     m_offsetOrgPreDblk = NULL;
     m_refDepth = 0;
@@ -84,28 +81,22 @@
     m_param = NULL;
     m_clipTable = NULL;
     m_clipTableBase = NULL;
-    m_tmpU1[0] = NULL;
-    m_tmpU1[1] = NULL;
-    m_tmpU1[2] = NULL;
-    m_tmpU2[0] = NULL;
-    m_tmpU2[1] = NULL;
-    m_tmpU2[2] = NULL;
-    m_tmpL1 = NULL;
-    m_tmpL2 = NULL;
-
-    m_depthSaoRate[0][0] = 0;
-    m_depthSaoRate[0][1] = 0;
-    m_depthSaoRate[0][2] = 0;
-    m_depthSaoRate[0][3] = 0;
-    m_depthSaoRate[1][0] = 0;
-    m_depthSaoRate[1][1] = 0;
-    m_depthSaoRate[1][2] = 0;
-    m_depthSaoRate[1][3] = 0;
+    m_tmpU[0] = NULL;
+    m_tmpU[1] = NULL;
+    m_tmpU[2] = NULL;
+    m_tmpL1[0] = NULL;
+    m_tmpL1[1] = NULL;
+    m_tmpL1[2] = NULL;
+    m_tmpL2[0] = NULL;
+    m_tmpL2[1] = NULL;
+    m_tmpL2[2] = NULL;
+    m_depthSaoRate = NULL;
 }
 
-bool SAO::create(x265_param* param)
+bool SAO::create(x265_param* param, int initCommon)
 {
     m_param = param;
+    m_chromaFormat = param->internalCsp;
     m_hChromaShift = CHROMA_H_SHIFT(param->internalCsp);
     m_vChromaShift = CHROMA_V_SHIFT(param->internalCsp);
 
@@ -116,37 +107,56 @@
     const pixel rangeExt = maxY >> 1;
     int numCtu = m_numCuInWidth * m_numCuInHeight;
 
-    CHECKED_MALLOC(m_clipTableBase,  pixel, maxY + 2 * rangeExt);
-
-    CHECKED_MALLOC(m_tmpL1, pixel, g_maxCUSize + 1);
-    CHECKED_MALLOC(m_tmpL2, pixel, g_maxCUSize + 1);
-
-    for (int i = 0; i < 3; i++)
+    for (int i = 0; i < (param->internalCsp != X265_CSP_I400 ? 3 : 1); i++)
     {
+        CHECKED_MALLOC(m_tmpL1[i], pixel, g_maxCUSize + 1);
+        CHECKED_MALLOC(m_tmpL2[i], pixel, g_maxCUSize + 1);
+
         // SAO asm code will read 1 pixel before and after, so pad by 2
-        CHECKED_MALLOC(m_tmpU1[i], pixel, m_param->sourceWidth + 2);
-        m_tmpU1[i] += 1;
-        CHECKED_MALLOC(m_tmpU2[i], pixel, m_param->sourceWidth + 2);
-        m_tmpU2[i] += 1;
+        // NOTE: m_param->sourceWidth+2 enough, to avoid condition check in copySaoAboveRef(), I alloc more up to 63 bytes in here
+        CHECKED_MALLOC(m_tmpU[i], pixel, m_numCuInWidth * g_maxCUSize + 2 + 32);
+        m_tmpU[i] += 1;
     }
 
-    CHECKED_MALLOC(m_count, PerClass, NUM_PLANE);
-    CHECKED_MALLOC(m_offset, PerClass, NUM_PLANE);
-    CHECKED_MALLOC(m_offsetOrg, PerClass, NUM_PLANE);
-
-    CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
-    CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
-
-    m_clipTable = &(m_clipTableBase[rangeExt]);
-
-    for (int i = 0; i < rangeExt; i++)
-        m_clipTableBase[i] = 0;
+    if (initCommon)
+    {
+        CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
+        CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
+        CHECKED_MALLOC(m_depthSaoRate, double, 2 * SAO_DEPTHRATE_SIZE);
+
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 0] = 0;
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 1] = 0;
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 2] = 0;
+        m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 3] = 0;
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 0] = 0;
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 1] = 0;
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 2] = 0;
+        m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + 3] = 0;
+
+        CHECKED_MALLOC(m_clipTableBase,  pixel, maxY + 2 * rangeExt);
+        m_clipTable = &(m_clipTableBase[rangeExt]);
+
+        // Share with fast clip lookup table
+        if (initCommon)
+        {
+            for (int i = 0; i < rangeExt; i++)
+                m_clipTableBase[i] = 0;
 
-    for (int i = 0; i < maxY; i++)
-        m_clipTable[i] = (pixel)i;
+            for (int i = 0; i < maxY; i++)
+                m_clipTable[i] = (pixel)i;
 
-    for (int i = maxY; i < maxY + rangeExt; i++)
-        m_clipTable[i] = maxY;
+            for (int i = maxY; i < maxY + rangeExt; i++)
+                m_clipTable[i] = maxY;
+        }
+    }
+    else
+    {
+        // must initialize these common pointer outside of function
+        m_countPreDblk = NULL;
+        m_offsetOrgPreDblk = NULL;
+        m_clipTableBase = NULL;
+        m_clipTable = NULL;
+    }
 
     return true;
 
@@ -154,34 +164,61 @@
     return false;
 }
 
-void SAO::destroy()
+void SAO::createFromRootNode(SAO* root)
 {
-    X265_FREE(m_clipTableBase);
-
-    X265_FREE(m_tmpL1);
-    X265_FREE(m_tmpL2);
+    X265_CHECK(m_countPreDblk == NULL, "duplicate initialize on m_countPreDblk");
+    X265_CHECK(m_offsetOrgPreDblk == NULL, "duplicate initialize on m_offsetOrgPreDblk");
+    X265_CHECK(m_depthSaoRate == NULL, "duplicate initialize on m_depthSaoRate");
+    X265_CHECK(m_clipTableBase == NULL, "duplicate initialize on m_clipTableBase");
+    X265_CHECK(m_clipTable == NULL, "duplicate initialize on m_clipTable");
+
+    m_countPreDblk = root->m_countPreDblk;
+    m_offsetOrgPreDblk = root->m_offsetOrgPreDblk;
+    m_depthSaoRate = root->m_depthSaoRate;
+    m_clipTableBase = root->m_clipTableBase; // Unnecessary
+    m_clipTable = root->m_clipTable;
+}
 
+void SAO::destroy(int destoryCommon)
+{
     for (int i = 0; i < 3; i++)
     {
-        if (m_tmpU1[i]) X265_FREE(m_tmpU1[i] - 1);
-        if (m_tmpU2[i]) X265_FREE(m_tmpU2[i] - 1);
+        if (m_tmpL1[i])
+        {
+            X265_FREE(m_tmpL1[i]);
+            m_tmpL1[i] = NULL;
+        }
+
+        if (m_tmpL2[i])
+        {
+            X265_FREE(m_tmpL2[i]);
+            m_tmpL2[i] = NULL;
+        }
+
+        if (m_tmpU[i])
+        {
+            X265_FREE(m_tmpU[i] - 1);
+            m_tmpU[i] = NULL;
+        }
     }
 
-    X265_FREE(m_count);
-    X265_FREE(m_offset);
-    X265_FREE(m_offsetOrg);
-    X265_FREE(m_countPreDblk);
-    X265_FREE(m_offsetOrgPreDblk);
+    if (destoryCommon)
+    {
+        X265_FREE_ZERO(m_countPreDblk);
+        X265_FREE_ZERO(m_offsetOrgPreDblk);
+        X265_FREE_ZERO(m_depthSaoRate);
+        X265_FREE_ZERO(m_clipTableBase);
+    }
 }
 
 /* allocate memory for SAO parameters */
 void SAO::allocSaoParam(SAOParam* saoParam) const
 {
+    int planes = (m_param->internalCsp != X265_CSP_I400) ? 3 : 1;
​

x265_1.8.tar.gz/source/encoder/sao.h -> x265_1.9.tar.gz/source/encoder/sao.h Changed

@@ -62,6 +62,7 @@
     enum { NUM_EDGETYPE = 5 };
     enum { NUM_PLANE = 3 };
     enum { NUM_MERGE_MODE = 3 };
+    enum { SAO_DEPTHRATE_SIZE = 4 };
 
     static const uint32_t s_eoTable[NUM_EDGETYPE];
 
@@ -71,18 +72,19 @@
 protected:
 
     /* allocated per part */
-    PerClass*   m_count;
-    PerClass*   m_offset;
-    PerClass*   m_offsetOrg;
+    PerPlane    m_count;
+    PerPlane    m_offset;
+    PerPlane    m_offsetOrg;
 
     /* allocated per CTU */
     PerPlane*   m_countPreDblk;
     PerPlane*   m_offsetOrgPreDblk;
 
-    double      m_depthSaoRate[2][4];
-    int8_t      m_offsetBo[SAO_NUM_BO_CLASSES];
-    int8_t      m_offsetEo[NUM_EDGETYPE];
+    double*     m_depthSaoRate;
+    int8_t      m_offsetBo[NUM_PLANE][SAO_NUM_BO_CLASSES];
+    int8_t      m_offsetEo[NUM_PLANE][NUM_EDGETYPE];
 
+    int         m_chromaFormat;
     int         m_numCuInWidth;
     int         m_numCuInHeight;
     int         m_hChromaShift;
@@ -91,10 +93,9 @@
     pixel*      m_clipTable;
     pixel*      m_clipTableBase;
 
-    pixel*      m_tmpU1[3];
-    pixel*      m_tmpU2[3];
-    pixel*      m_tmpL1;
-    pixel*      m_tmpL2;
+    pixel*      m_tmpU[3];
+    pixel*      m_tmpL1[3];
+    pixel*      m_tmpL2[3];
 
 public:
 
@@ -119,8 +120,9 @@
 
     SAO();
 
-    bool create(x265_param* param);
-    void destroy();
+    bool create(x265_param* param, int initCommon);
+    void createFromRootNode(SAO *root);
+    void destroy(int destoryCommon);
 
     void allocSaoParam(SAOParam* saoParam) const;
 
@@ -131,6 +133,8 @@
     // CTU-based SAO process without slice granularity
     void processSaoCu(int addr, int typeIdx, int plane);
     void processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane);
+    void processSaoUnitCuLuma(SaoCtuParam* ctuParam, int idxY, int idxX);
+    void processSaoUnitCuChroma(SaoCtuParam* ctuParam[3], int idxY, int idxX);
 
     void copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc);
 
@@ -146,6 +150,9 @@
 
     void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);
     void rdoSaoUnitRow(SAOParam* saoParam, int idxY);
+    void rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr);
+
+    friend class FrameFilter;
 };
 
 }

 
@@ -62,6 +62,7 @@
     enum { NUM_EDGETYPE = 5 };
     enum { NUM_PLANE = 3 };
     enum { NUM_MERGE_MODE = 3 };
+    enum { SAO_DEPTHRATE_SIZE = 4 };
 
     static const uint32_t s_eoTable[NUM_EDGETYPE];
 
@@ -71,18 +72,19 @@
 protected:
 
     /* allocated per part */
-    PerClass*   m_count;
-    PerClass*   m_offset;
-    PerClass*   m_offsetOrg;
+    PerPlane    m_count;
+    PerPlane    m_offset;
+    PerPlane    m_offsetOrg;
 
     /* allocated per CTU */
     PerPlane*   m_countPreDblk;
     PerPlane*   m_offsetOrgPreDblk;
 
-    double      m_depthSaoRate[2][4];
-    int8_t      m_offsetBo[SAO_NUM_BO_CLASSES];
-    int8_t      m_offsetEo[NUM_EDGETYPE];
+    double*     m_depthSaoRate;
+    int8_t      m_offsetBo[NUM_PLANE][SAO_NUM_BO_CLASSES];
+    int8_t      m_offsetEo[NUM_PLANE][NUM_EDGETYPE];
 
+    int         m_chromaFormat;
     int         m_numCuInWidth;
     int         m_numCuInHeight;
     int         m_hChromaShift;
@@ -91,10 +93,9 @@
     pixel*      m_clipTable;
     pixel*      m_clipTableBase;
 
-    pixel*      m_tmpU1[3];
-    pixel*      m_tmpU2[3];
-    pixel*      m_tmpL1;
-    pixel*      m_tmpL2;
+    pixel*      m_tmpU[3];
+    pixel*      m_tmpL1[3];
+    pixel*      m_tmpL2[3];
 
 public:
 
@@ -119,8 +120,9 @@
 
     SAO();
 
-    bool create(x265_param* param);
-    void destroy();
+    bool create(x265_param* param, int initCommon);
+    void createFromRootNode(SAO *root);
+    void destroy(int destoryCommon);
 
     void allocSaoParam(SAOParam* saoParam) const;
 
@@ -131,6 +133,8 @@
     // CTU-based SAO process without slice granularity
     void processSaoCu(int addr, int typeIdx, int plane);
     void processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane);
+    void processSaoUnitCuLuma(SaoCtuParam* ctuParam, int idxY, int idxX);
+    void processSaoUnitCuChroma(SaoCtuParam* ctuParam[3], int idxY, int idxX);
 
     void copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc);
 
@@ -146,6 +150,9 @@
 
     void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);
     void rdoSaoUnitRow(SAOParam* saoParam, int idxY);
+    void rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr);
+
+    friend class FrameFilter;
 };
 
 }
​

x265_1.8.tar.gz/source/encoder/search.cpp -> x265_1.9.tar.gz/source/encoder/search.cpp Changed

@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -80,7 +81,7 @@
     m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp);
 
     bool ok = m_quant.init(param.rdoqLevel, param.psyRdoq, scalingList, m_entropyCoder);
-    if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
+    if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
         ok &= m_quant.allocNoiseReduction(param);
 
     ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
@@ -97,13 +98,27 @@
      * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
      * which are reconstructed at each depth are valid. At the end, the transform depth table
      * is walked and the coeff and recon at the correct depths are collected */
-    for (uint32_t i = 0; i <= m_numLayers; i++)
+
+    if (param.internalCsp != X265_CSP_I400)
+    {
+        for (uint32_t i = 0; i <= m_numLayers; i++)
+        {
+            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
+            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
+            m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
+            ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
+            ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
+        }
+    }
+    else
     {
-        CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
-        m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
-        m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
-        ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
-        ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
+        for (uint32_t i = 0; i <= m_numLayers; i++)
+        {
+            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL);
+            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[2] = NULL;
+            ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
+            ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
+        }
     }
 
     /* the rest of these buffers are indexed per-depth */
@@ -116,12 +131,22 @@
         ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp);
     }
 
-    CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
-    m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
-    m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
-    CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
-    m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
-    m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
+    if (param.internalCsp != X265_CSP_I400)
+    {
+        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
+        m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
+        m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
+        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
+        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
+        m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
+    }
+    else
+    {
+        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions);
+        m_qtTempCbf[1] = m_qtTempCbf[2] = NULL;
+        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions);
+        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[2] = NULL;
+    }
 
     CHECKED_MALLOC(m_intraPred, pixel, (32 * 32) * (33 + 3));
     m_fencScaled = m_intraPred + 32 * 32;
@@ -163,12 +188,12 @@
     X265_FREE(m_tsRecon);
 }
 
-int Search::setLambdaFromQP(const CUData& ctu, int qp)
+int Search::setLambdaFromQP(const CUData& ctu, int qp, int lambdaQp)
 {
     X265_CHECK(qp >= QP_MIN && qp <= QP_MAX_MAX, "QP used for lambda is out of range\n");
 
     m_me.setQP(qp);
-    m_rdCost.setQP(*m_slice, qp);
+    m_rdCost.setQP(*m_slice, lambdaQp < 0 ? qp : lambdaQp);
 
     int quantQP = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
     m_quant.setQPforQuant(ctu, quantQP);
@@ -446,8 +471,9 @@
     }
 
     // set reconstruction for next intra prediction blocks if full TU prediction won
-    pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
-    intptr_t picStride = m_frame->m_reconPic->m_stride;
+    PicYuv*  reconPic = m_frame->m_reconPic;
+    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
+    intptr_t picStride = reconPic->m_stride;
     primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
 
     outCost.rdcost     += fullCost.rdcost;
@@ -530,7 +556,7 @@
             // no residual coded, recon = pred
             primitives.cu[sizeIdx].copy_pp(tmpRecon, tmpReconStride, pred, stride);
 
-        sse_ret_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
+        sse_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
 
         cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
         cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
@@ -611,8 +637,9 @@
     }
 
     // set reconstruction for next intra prediction blocks
-    pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
-    intptr_t picStride = m_frame->m_reconPic->m_stride;
+    PicYuv*  reconPic = m_frame->m_reconPic;
+    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
+    intptr_t picStride = reconPic->m_stride;
     primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
 
     outCost.rdcost += fullCost.rdcost;
@@ -661,8 +688,9 @@
         uint32_t sizeIdx   = log2TrSize - 2;
         primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
 
-        pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
-        intptr_t picStride = m_frame->m_reconPic->m_stride;
+        PicYuv*  reconPic = m_frame->m_reconPic;
+        pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
+        intptr_t picStride = reconPic->m_stride;
 
         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
         if (numSig)
@@ -750,7 +778,7 @@
 }
 
 /* returns distortion */
-uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
+void Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
 {
     CUData& cu = mode.cu;
     uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
@@ -758,10 +786,10 @@
     if (tuDepth < cu.m_tuDepth[absPartIdx])
     {
         uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
-        uint32_t outDist = 0, splitCbfU = 0, splitCbfV = 0;
+        uint32_t splitCbfU = 0, splitCbfV = 0;
         for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
         {
-            outDist += codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, psyEnergy);
+            codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, outCost);
             splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
             splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
         }
@@ -770,8 +798,7 @@
             cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
             cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
         }
-
-        return outDist;
+        return;
     }
 
     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
@@ -780,7 +807,7 @@
     {
         X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
         if (absPartIdx & 3)
-            return 0;
+            return;
         log2TrSizeC = 2;
         tuDepthC--;
     }
@@ -791,13 +818,15 @@
     bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
     checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
     if (checkTransformSkip)
-        return codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, psyEnergy);
+    {
+        codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, outCost);
+        return;
+    }
 
     ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
     uint32_t qtLayer = log2TrSize - 2;
     uint32_t stride = mode.fencYuv->m_csize;
     const uint32_t sizeIdxC = log2TrSizeC - 2;
-    sse_ret_t outDist = 0;
 
     uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
@@ -821,8 +850,9 @@

 
@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -80,7 +81,7 @@
     m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp);
 
     bool ok = m_quant.init(param.rdoqLevel, param.psyRdoq, scalingList, m_entropyCoder);
-    if (m_param->noiseReductionIntra || m_param->noiseReductionInter)
+    if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
         ok &= m_quant.allocNoiseReduction(param);
 
     ok &= Predict::allocBuffers(param.internalCsp); /* sets m_hChromaShift & m_vChromaShift */
@@ -97,13 +98,27 @@
      * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
      * which are reconstructed at each depth are valid. At the end, the transform depth table
      * is walked and the coeff and recon at the correct depths are collected */
-    for (uint32_t i = 0; i <= m_numLayers; i++)
+
+    if (param.internalCsp != X265_CSP_I400)
+    {
+        for (uint32_t i = 0; i <= m_numLayers; i++)
+        {
+            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
+            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
+            m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
+            ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
+            ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
+        }
+    }
+    else
     {
-        CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
-        m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
-        m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
-        ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
-        ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
+        for (uint32_t i = 0; i <= m_numLayers; i++)
+        {
+            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL);
+            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[2] = NULL;
+            ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
+            ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
+        }
     }
 
     /* the rest of these buffers are indexed per-depth */
@@ -116,12 +131,22 @@
         ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp);
     }
 
-    CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
-    m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
-    m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
-    CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
-    m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
-    m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
+    if (param.internalCsp != X265_CSP_I400)
+    {
+        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
+        m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
+        m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
+        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
+        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
+        m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
+    }
+    else
+    {
+        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions);
+        m_qtTempCbf[1] = m_qtTempCbf[2] = NULL;
+        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions);
+        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[2] = NULL;
+    }
 
     CHECKED_MALLOC(m_intraPred, pixel, (32 * 32) * (33 + 3));
     m_fencScaled = m_intraPred + 32 * 32;
@@ -163,12 +188,12 @@
     X265_FREE(m_tsRecon);
 }
 
-int Search::setLambdaFromQP(const CUData& ctu, int qp)
+int Search::setLambdaFromQP(const CUData& ctu, int qp, int lambdaQp)
 {
     X265_CHECK(qp >= QP_MIN && qp <= QP_MAX_MAX, "QP used for lambda is out of range\n");
 
     m_me.setQP(qp);
-    m_rdCost.setQP(*m_slice, qp);
+    m_rdCost.setQP(*m_slice, lambdaQp < 0 ? qp : lambdaQp);
 
     int quantQP = x265_clip3(QP_MIN, QP_MAX_SPEC, qp);
     m_quant.setQPforQuant(ctu, quantQP);
@@ -446,8 +471,9 @@
     }
 
     // set reconstruction for next intra prediction blocks if full TU prediction won
-    pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
-    intptr_t picStride = m_frame->m_reconPic->m_stride;
+    PicYuv*  reconPic = m_frame->m_reconPic;
+    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
+    intptr_t picStride = reconPic->m_stride;
     primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
 
     outCost.rdcost     += fullCost.rdcost;
@@ -530,7 +556,7 @@
             // no residual coded, recon = pred
             primitives.cu[sizeIdx].copy_pp(tmpRecon, tmpReconStride, pred, stride);
 
-        sse_ret_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
+        sse_t tmpDist = primitives.cu[sizeIdx].sse_pp(tmpRecon, tmpReconStride, fenc, stride);
 
         cu.setTransformSkipSubParts(useTSkip, TEXT_LUMA, absPartIdx, fullDepth);
         cu.setCbfSubParts((!!numSig) << tuDepth, TEXT_LUMA, absPartIdx, fullDepth);
@@ -611,8 +637,9 @@
     }
 
     // set reconstruction for next intra prediction blocks
-    pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
-    intptr_t picStride = m_frame->m_reconPic->m_stride;
+    PicYuv*  reconPic = m_frame->m_reconPic;
+    pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
+    intptr_t picStride = reconPic->m_stride;
     primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
 
     outCost.rdcost += fullCost.rdcost;
@@ -661,8 +688,9 @@
         uint32_t sizeIdx   = log2TrSize - 2;
         primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
 
-        pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
-        intptr_t picStride = m_frame->m_reconPic->m_stride;
+        PicYuv*  reconPic = m_frame->m_reconPic;
+        pixel*   picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
+        intptr_t picStride = reconPic->m_stride;
 
         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
         if (numSig)
@@ -750,7 +778,7 @@
 }
 
 /* returns distortion */
-uint32_t Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy)
+void Search::codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost)
 {
     CUData& cu = mode.cu;
     uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
@@ -758,10 +786,10 @@
     if (tuDepth < cu.m_tuDepth[absPartIdx])
     {
         uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
-        uint32_t outDist = 0, splitCbfU = 0, splitCbfV = 0;
+        uint32_t splitCbfU = 0, splitCbfV = 0;
         for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
         {
-            outDist += codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, psyEnergy);
+            codeIntraChromaQt(mode, cuGeom, tuDepth + 1, qPartIdx, outCost);
             splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
             splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
         }
@@ -770,8 +798,7 @@
             cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
             cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
         }
-
-        return outDist;
+        return;
     }
 
     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
@@ -780,7 +807,7 @@
     {
         X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
         if (absPartIdx & 3)
-            return 0;
+            return;
         log2TrSizeC = 2;
         tuDepthC--;
     }
@@ -791,13 +818,15 @@
     bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
     checkTransformSkip &= !m_param->bEnableTSkipFast || (log2TrSize <= MAX_LOG2_TS_SIZE && cu.m_transformSkip[TEXT_LUMA][absPartIdx]);
     if (checkTransformSkip)
-        return codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, psyEnergy);
+    {
+        codeIntraChromaTSkip(mode, cuGeom, tuDepth, tuDepthC, absPartIdx, outCost);
+        return;
+    }
 
     ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
     uint32_t qtLayer = log2TrSize - 2;
     uint32_t stride = mode.fencYuv->m_csize;
     const uint32_t sizeIdxC = log2TrSizeC - 2;
-    sse_ret_t outDist = 0;
 
     uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
@@ -821,8 +850,9 @@
​

x265_1.8.tar.gz/source/encoder/search.h -> x265_1.9.tar.gz/source/encoder/search.h Changed

@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -84,8 +85,14 @@
     MV       mvp;
     int      mvpIdx;
     int      ref;
-    uint32_t cost;
     int      bits;
+    uint32_t mvCost;
+    uint32_t cost;
+
+    MotionData()
+    {
+        memset(this, 0, sizeof(MotionData));
+    }
 };
 
 struct Mode
@@ -105,16 +112,17 @@
     // temporal candidate.
     InterNeighbourMV interNeighbours[6];
 
-    uint64_t   rdCost;     // sum of partition (psy) RD costs          (sse(fenc, recon) + lambda2 * bits)
-    uint64_t   sa8dCost;   // sum of partition sa8d distortion costs   (sa8d(fenc, pred) + lambda * bits)
-    uint32_t   sa8dBits;   // signal bits used in sa8dCost calculation
-    uint32_t   psyEnergy;  // sum of partition psycho-visual energy difference
-    sse_ret_t  lumaDistortion;
-    sse_ret_t  chromaDistortion;
-    sse_ret_t  distortion; // sum of partition SSE distortion
-    uint32_t   totalBits;  // sum of partition bits (mv + coeff)
-    uint32_t   mvBits;     // Mv bits + Ref + block type (or intra mode)
-    uint32_t   coeffBits;  // Texture bits (DCT Coeffs)
+    uint64_t    rdCost;     // sum of partition (psy) RD costs          (sse(fenc, recon) + lambda2 * bits)
+    uint64_t    sa8dCost;   // sum of partition sa8d distortion costs   (sa8d(fenc, pred) + lambda * bits)
+    uint32_t    sa8dBits;   // signal bits used in sa8dCost calculation
+    uint32_t    psyEnergy;  // sum of partition psycho-visual energy difference
+    sse_t   resEnergy;  // sum of partition residual energy after motion prediction
+    sse_t   lumaDistortion;
+    sse_t   chromaDistortion;
+    sse_t  distortion; // sum of partition SSE distortion
+    uint32_t    totalBits;  // sum of partition bits (mv + coeff)
+    uint32_t    mvBits;     // Mv bits + Ref + block type (or intra mode)
+    uint32_t    coeffBits;  // Texture bits (DCT Coeffs)
 
     void initCosts()
     {
@@ -122,6 +130,7 @@
         sa8dCost = 0;
         sa8dBits = 0;
         psyEnergy = 0;
+        resEnergy = 0;
         lumaDistortion = 0;
         chromaDistortion = 0;
         distortion = 0;
@@ -130,62 +139,13 @@
         coeffBits = 0;
     }
 
-    void invalidate()
-    {
-        /* set costs to invalid data, catch uninitialized re-use */
-        rdCost = UINT64_MAX / 2;
-        sa8dCost = UINT64_MAX / 2;
-        sa8dBits = MAX_UINT / 2;
-        psyEnergy = MAX_UINT / 2;
-#if X265_DEPTH <= 10
-        lumaDistortion = MAX_UINT / 2;
-        chromaDistortion = MAX_UINT / 2;
-        distortion = MAX_UINT / 2;
-#else
-        lumaDistortion = UINT64_MAX / 2;
-        chromaDistortion = UINT64_MAX / 2;
-        distortion = UINT64_MAX / 2;
-#endif
-        totalBits = MAX_UINT / 2;
-        mvBits = MAX_UINT / 2;
-        coeffBits = MAX_UINT / 2;
-    }
-
-    bool ok() const
-    {
-#if X265_DEPTH <= 10
-        return !(rdCost >= UINT64_MAX / 2 ||
-            sa8dCost >= UINT64_MAX / 2 ||
-            sa8dBits >= MAX_UINT / 2 ||
-            psyEnergy >= MAX_UINT / 2 ||
-            lumaDistortion >= MAX_UINT / 2 ||
-            chromaDistortion >= MAX_UINT / 2 ||
-            distortion >= MAX_UINT / 2 ||
-            totalBits >= MAX_UINT / 2 ||
-            mvBits >= MAX_UINT / 2 ||
-            coeffBits >= MAX_UINT / 2);
-#else
-        return !(rdCost >= UINT64_MAX / 2 ||
-                 sa8dCost >= UINT64_MAX / 2 ||
-                 sa8dBits >= MAX_UINT / 2 ||
-                 psyEnergy >= MAX_UINT / 2 ||
-                 lumaDistortion >= UINT64_MAX / 2 ||
-                 chromaDistortion >= UINT64_MAX / 2 ||
-                 distortion >= UINT64_MAX / 2 ||
-                 totalBits >= MAX_UINT / 2 ||
-                 mvBits >= MAX_UINT / 2 ||
-                 coeffBits >= MAX_UINT / 2);
-#endif
-    }
-
     void addSubCosts(const Mode& subMode)
     {
-        X265_CHECK(subMode.ok(), "sub-mode not initialized");
-
         rdCost += subMode.rdCost;
         sa8dCost += subMode.sa8dCost;
         sa8dBits += subMode.sa8dBits;
         psyEnergy += subMode.psyEnergy;
+        resEnergy += subMode.resEnergy;
         lumaDistortion += subMode.lumaDistortion;
         chromaDistortion += subMode.chromaDistortion;
         distortion += subMode.distortion;
@@ -325,13 +285,13 @@
     ~Search();
 
     bool     initSearch(const x265_param& param, ScalingList& scalingList);
-    int      setLambdaFromQP(const CUData& ctu, int qp); /* returns real quant QP in valid spec range */
+    int      setLambdaFromQP(const CUData& ctu, int qp, int lambdaQP = -1); /* returns real quant QP in valid spec range */
 
     // mark temp RD entropy contexts as uninitialized; useful for finding loads without stores
     void     invalidateContexts(int fromDepth);
 
-    // full RD search of intra modes. if sharedModes is not NULL, it directly uses them
-    void     checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes, uint8_t* sharedChromaModes);
+    // full RD search of intra modes
+    void     checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSizes);
 
     // select best intra mode using only sa8d costs, cannot measure NxN intra
     void     checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
@@ -397,10 +357,10 @@
     void     saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth);
 
     // RDO search of luma intra modes; result is fully encoded luma. luma distortion is returned
-    uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes);
+    sse_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2]);
 
     // RDO select best chroma mode from luma; result is fully encode chroma. chroma distortion is returned
-    uint32_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom, uint8_t* sharedChromaModes);
+    sse_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom);
 
     void     codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx);
     void     codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2]);
@@ -410,12 +370,12 @@
     {
         uint64_t rdcost;
         uint32_t bits;
-        sse_ret_t distortion;
+        sse_t distortion;
         uint32_t energy;
         Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
     };
 
-    uint64_t estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId);
+    uint64_t estimateNullCbfCost(sse_t dist, uint32_t psyEnergy, uint32_t tuDepth, TextType compId);
     void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2]);
 
     // generate prediction, generate residual and recon. if bAllowSplit, find optimal RQT splits
@@ -424,8 +384,8 @@
     void     extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx);
 
     // generate chroma prediction, generate residual and recon
-    uint32_t codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy);
-    uint32_t codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy);
+    void     codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost);
+    void     codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, Cost& outCost);
     void     extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth);
 
     // reshuffle CBF flags after coding a pair of 4:2:2 chroma blocks

 
@@ -2,6 +2,7 @@
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
+*          Min Chen <chenm003@163.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -84,8 +85,14 @@
     MV       mvp;
     int      mvpIdx;
     int      ref;
-    uint32_t cost;
     int      bits;
+    uint32_t mvCost;
+    uint32_t cost;
+
+    MotionData()
+    {
+        memset(this, 0, sizeof(MotionData));
+    }
 };
 
 struct Mode
@@ -105,16 +112,17 @@
     // temporal candidate.
     InterNeighbourMV interNeighbours[6];
 
-    uint64_t   rdCost;     // sum of partition (psy) RD costs          (sse(fenc, recon) + lambda2 * bits)
-    uint64_t   sa8dCost;   // sum of partition sa8d distortion costs   (sa8d(fenc, pred) + lambda * bits)
-    uint32_t   sa8dBits;   // signal bits used in sa8dCost calculation
-    uint32_t   psyEnergy;  // sum of partition psycho-visual energy difference
-    sse_ret_t  lumaDistortion;
-    sse_ret_t  chromaDistortion;
-    sse_ret_t  distortion; // sum of partition SSE distortion
-    uint32_t   totalBits;  // sum of partition bits (mv + coeff)
-    uint32_t   mvBits;     // Mv bits + Ref + block type (or intra mode)
-    uint32_t   coeffBits;  // Texture bits (DCT Coeffs)
+    uint64_t    rdCost;     // sum of partition (psy) RD costs          (sse(fenc, recon) + lambda2 * bits)
+    uint64_t    sa8dCost;   // sum of partition sa8d distortion costs   (sa8d(fenc, pred) + lambda * bits)
+    uint32_t    sa8dBits;   // signal bits used in sa8dCost calculation
+    uint32_t    psyEnergy;  // sum of partition psycho-visual energy difference
+    sse_t   resEnergy;  // sum of partition residual energy after motion prediction
+    sse_t   lumaDistortion;
+    sse_t   chromaDistortion;
+    sse_t  distortion; // sum of partition SSE distortion
+    uint32_t    totalBits;  // sum of partition bits (mv + coeff)
+    uint32_t    mvBits;     // Mv bits + Ref + block type (or intra mode)
+    uint32_t    coeffBits;  // Texture bits (DCT Coeffs)
 
     void initCosts()
     {
@@ -122,6 +130,7 @@
         sa8dCost = 0;
         sa8dBits = 0;
         psyEnergy = 0;
+        resEnergy = 0;
         lumaDistortion = 0;
         chromaDistortion = 0;
         distortion = 0;
@@ -130,62 +139,13 @@
         coeffBits = 0;
     }
 
-    void invalidate()
-    {
-        /* set costs to invalid data, catch uninitialized re-use */
-        rdCost = UINT64_MAX / 2;
-        sa8dCost = UINT64_MAX / 2;
-        sa8dBits = MAX_UINT / 2;
-        psyEnergy = MAX_UINT / 2;
-#if X265_DEPTH <= 10
-        lumaDistortion = MAX_UINT / 2;
-        chromaDistortion = MAX_UINT / 2;
-        distortion = MAX_UINT / 2;
-#else
-        lumaDistortion = UINT64_MAX / 2;
-        chromaDistortion = UINT64_MAX / 2;
-        distortion = UINT64_MAX / 2;
-#endif
-        totalBits = MAX_UINT / 2;
-        mvBits = MAX_UINT / 2;
-        coeffBits = MAX_UINT / 2;
-    }
-
-    bool ok() const
-    {
-#if X265_DEPTH <= 10
-        return !(rdCost >= UINT64_MAX / 2 ||
-            sa8dCost >= UINT64_MAX / 2 ||
-            sa8dBits >= MAX_UINT / 2 ||
-            psyEnergy >= MAX_UINT / 2 ||
-            lumaDistortion >= MAX_UINT / 2 ||
-            chromaDistortion >= MAX_UINT / 2 ||
-            distortion >= MAX_UINT / 2 ||
-            totalBits >= MAX_UINT / 2 ||
-            mvBits >= MAX_UINT / 2 ||
-            coeffBits >= MAX_UINT / 2);
-#else
-        return !(rdCost >= UINT64_MAX / 2 ||
-                 sa8dCost >= UINT64_MAX / 2 ||
-                 sa8dBits >= MAX_UINT / 2 ||
-                 psyEnergy >= MAX_UINT / 2 ||
-                 lumaDistortion >= UINT64_MAX / 2 ||
-                 chromaDistortion >= UINT64_MAX / 2 ||
-                 distortion >= UINT64_MAX / 2 ||
-                 totalBits >= MAX_UINT / 2 ||
-                 mvBits >= MAX_UINT / 2 ||
-                 coeffBits >= MAX_UINT / 2);
-#endif
-    }
-
     void addSubCosts(const Mode& subMode)
     {
-        X265_CHECK(subMode.ok(), "sub-mode not initialized");
-
         rdCost += subMode.rdCost;
         sa8dCost += subMode.sa8dCost;
         sa8dBits += subMode.sa8dBits;
         psyEnergy += subMode.psyEnergy;
+        resEnergy += subMode.resEnergy;
         lumaDistortion += subMode.lumaDistortion;
         chromaDistortion += subMode.chromaDistortion;
         distortion += subMode.distortion;
@@ -325,13 +285,13 @@
     ~Search();
 
     bool     initSearch(const x265_param& param, ScalingList& scalingList);
-    int      setLambdaFromQP(const CUData& ctu, int qp); /* returns real quant QP in valid spec range */
+    int      setLambdaFromQP(const CUData& ctu, int qp, int lambdaQP = -1); /* returns real quant QP in valid spec range */
 
     // mark temp RD entropy contexts as uninitialized; useful for finding loads without stores
     void     invalidateContexts(int fromDepth);
 
-    // full RD search of intra modes. if sharedModes is not NULL, it directly uses them
-    void     checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes, uint8_t* sharedChromaModes);
+    // full RD search of intra modes
+    void     checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSizes);
 
     // select best intra mode using only sa8d costs, cannot measure NxN intra
     void     checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
@@ -397,10 +357,10 @@
     void     saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth);
 
     // RDO search of luma intra modes; result is fully encoded luma. luma distortion is returned
-    uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes);
+    sse_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2]);
 
     // RDO select best chroma mode from luma; result is fully encode chroma. chroma distortion is returned
-    uint32_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom, uint8_t* sharedChromaModes);
+    sse_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom);
 
     void     codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx);
     void     codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2]);
@@ -410,12 +370,12 @@
     {
         uint64_t rdcost;
         uint32_t bits;
-        sse_ret_t distortion;
+        sse_t distortion;
         uint32_t energy;
         Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
     };
 
-    uint64_t estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId);
+    uint64_t estimateNullCbfCost(sse_t dist, uint32_t psyEnergy, uint32_t tuDepth, TextType compId);
     void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2]);
 
     // generate prediction, generate residual and recon. if bAllowSplit, find optimal RQT splits
@@ -424,8 +384,8 @@
     void     extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx);
 
     // generate chroma prediction, generate residual and recon
-    uint32_t codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy);
-    uint32_t codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy);
+    void     codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& outCost);
+    void     codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, Cost& outCost);
     void     extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth);
 
     // reshuffle CBF flags after coding a pair of 4:2:2 chroma blocks
​

x265_1.8.tar.gz/source/encoder/sei.h -> x265_1.9.tar.gz/source/encoder/sei.h Changed

@@ -163,12 +163,6 @@
 
     PayloadType payloadType() const { return CONTENT_LIGHT_LEVEL_INFO; }
 
-    bool parse(const char* value)
-    {
-        return sscanf(value, "%hu,%hu",
-                      &max_content_light_level, &max_pic_average_light_level) == 2;
-    }
-
     void write(Bitstream& bs, const SPS&)
     {
         m_bitIf = &bs;
@@ -195,29 +189,31 @@
 
     uint8_t m_digest[3][16];
 
-    void write(Bitstream& bs, const SPS&)
+    void write(Bitstream& bs, const SPS& sps)
     {
         m_bitIf = &bs;
 
+        int planes = (sps.chromaFormatIdc != X265_CSP_I400) ? 3 : 1;
+
         WRITE_CODE(DECODED_PICTURE_HASH, 8, "payload_type");
 
         switch (m_method)
         {
         case MD5:
-            WRITE_CODE(1 + 16 * 3, 8, "payload_size");
+            WRITE_CODE(1 + 16 * planes, 8, "payload_size");
             WRITE_CODE(MD5, 8, "hash_type");
             break;
         case CRC:
-            WRITE_CODE(1 + 2 * 3, 8, "payload_size");
+            WRITE_CODE(1 + 2 * planes, 8, "payload_size");
             WRITE_CODE(CRC, 8, "hash_type");
             break;
         case CHECKSUM:
-            WRITE_CODE(1 + 4 * 3, 8, "payload_size");
+            WRITE_CODE(1 + 4 * planes, 8, "payload_size");
             WRITE_CODE(CHECKSUM, 8, "hash_type");
             break;
         }
 
-        for (int yuvIdx = 0; yuvIdx < 3; yuvIdx++)
+        for (int yuvIdx = 0; yuvIdx < planes; yuvIdx++)
         {
             if (m_method == MD5)
             {

 
@@ -163,12 +163,6 @@
 
     PayloadType payloadType() const { return CONTENT_LIGHT_LEVEL_INFO; }
 
-    bool parse(const char* value)
-    {
-        return sscanf(value, "%hu,%hu",
-                      &max_content_light_level, &max_pic_average_light_level) == 2;
-    }
-
     void write(Bitstream& bs, const SPS&)
     {
         m_bitIf = &bs;
@@ -195,29 +189,31 @@
 
     uint8_t m_digest[3][16];
 
-    void write(Bitstream& bs, const SPS&)
+    void write(Bitstream& bs, const SPS& sps)
     {
         m_bitIf = &bs;
 
+        int planes = (sps.chromaFormatIdc != X265_CSP_I400) ? 3 : 1;
+
         WRITE_CODE(DECODED_PICTURE_HASH, 8, "payload_type");
 
         switch (m_method)
         {
         case MD5:
-            WRITE_CODE(1 + 16 * 3, 8, "payload_size");
+            WRITE_CODE(1 + 16 * planes, 8, "payload_size");
             WRITE_CODE(MD5, 8, "hash_type");
             break;
         case CRC:
-            WRITE_CODE(1 + 2 * 3, 8, "payload_size");
+            WRITE_CODE(1 + 2 * planes, 8, "payload_size");
             WRITE_CODE(CRC, 8, "hash_type");
             break;
         case CHECKSUM:
-            WRITE_CODE(1 + 4 * 3, 8, "payload_size");
+            WRITE_CODE(1 + 4 * planes, 8, "payload_size");
             WRITE_CODE(CHECKSUM, 8, "hash_type");
             break;
         }
 
-        for (int yuvIdx = 0; yuvIdx < 3; yuvIdx++)
+        for (int yuvIdx = 0; yuvIdx < planes; yuvIdx++)
         {
             if (m_method == MD5)
             {
​

x265_1.8.tar.gz/source/encoder/slicetype.cpp -> x265_1.9.tar.gz/source/encoder/slicetype.cpp Changed

@@ -83,8 +83,11 @@
     uint32_t var;
 
     var  = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, csp);
-    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp);
-    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp);
+    if (csp != X265_CSP_I400)
+    {
+        var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp);
+        var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp);
+    }
     x265_emms();
     return var;
 }
@@ -96,6 +99,7 @@
     int maxRow = curFrame->m_fencPic->m_picHeight;
     int blockCount = curFrame->m_lowres.maxBlocksInRow * curFrame->m_lowres.maxBlocksInCol;
 
+    float* quantOffsets = curFrame->m_quantOffsets;
     for (int y = 0; y < 3; y++)
     {
         curFrame->m_lowres.wp_ssd[y] = 0;
@@ -113,10 +117,21 @@
 
         if (param->rc.aqMode && param->rc.aqStrength == 0)
         {
-            memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
-            memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
-            for (int cuxy = 0; cuxy < cuCount; cuxy++)
-                curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
+            if (quantOffsets)
+            {
+                for (int cuxy = 0; cuxy < cuCount; cuxy++)
+                {
+                    curFrame->m_lowres.qpCuTreeOffset[cuxy] = curFrame->m_lowres.qpAqOffset[cuxy] = quantOffsets[cuxy];
+                    curFrame->m_lowres.invQscaleFactor[cuxy] = x265_exp2fix8(curFrame->m_lowres.qpCuTreeOffset[cuxy]);
+                }
+            }
+            else
+            {
+                memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
+                memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
+                for (int cuxy = 0; cuxy < cuCount; cuxy++)
+                    curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
+            }
         }
 
         /* Need variance data for weighted prediction */
@@ -135,19 +150,25 @@
         if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
         {
             double bit_depth_correction = 1.f / (1 << (2*(X265_DEPTH-8)));
+            curFrame->m_lowres.frameVariance = 0;
+            uint64_t rowVariance = 0;
             for (blockY = 0; blockY < maxRow; blockY += 16)
             {
+                rowVariance = 0;
                 for (blockX = 0; blockX < maxCol; blockX += 16)
                 {
                     uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
+                    curFrame->m_lowres.blockVariance[blockXY] = energy;
+                    rowVariance += energy;
                     qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
                     curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
                     avg_adj += qp_adj;
                     avg_adj_pow2 += qp_adj * qp_adj;
                     blockXY++;
                 }
+                curFrame->m_lowres.frameVariance += (rowVariance / maxCol);
             }
-
+            curFrame->m_lowres.frameVariance /= maxRow;
             avg_adj /= blockCount;
             avg_adj_pow2 /= blockCount;
             strength = param->rc.aqStrength * avg_adj;
@@ -177,6 +198,8 @@
                     uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
                     qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (14.427f + 2 * (X265_DEPTH - 8)));
                 }
+                if (quantOffsets != NULL)
+                    qp_adj += quantOffsets[blockXY];
                 curFrame->m_lowres.qpAqOffset[blockXY] = qp_adj;
                 curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
                 curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(qp_adj);
@@ -328,7 +351,7 @@
 
         primitives.weight_pp(ref.buffer[0], wbuffer[0], stride, widthHeight, paddedLines,
             scale, round << correction, denom + correction, offset);
-        src = weightedRef.fpelPlane[0];
+        src = fenc.weightedRef[fenc.frameNum - ref.frameNum].fpelPlane[0];
     }
 
     uint32_t cost = 0;
@@ -350,7 +373,6 @@
 bool LookaheadTLD::allocWeightedRef(Lowres& fenc)
 {
     intptr_t planesize = fenc.buffer[1] - fenc.buffer[0];
-    intptr_t padoffset = fenc.lowresPlane[0] - fenc.buffer[0];
     paddedLines = (int)(planesize / fenc.lumaStride);
 
     wbuffer[0] = X265_MALLOC(pixel, 4 * planesize);
@@ -363,14 +385,6 @@
     else
         return false;
 
-    for (int i = 0; i < 4; i++)
-        weightedRef.lowresPlane[i] = wbuffer[i] + padoffset;
-
-    weightedRef.fpelPlane[0] = weightedRef.lowresPlane[0];
-    weightedRef.lumaStride = fenc.lumaStride;
-    weightedRef.isLowres = true;
-    weightedRef.isWeighted = false;
-
     return true;
 }
 
@@ -388,6 +402,16 @@
             return;
     }
 
+    ReferencePlanes& weightedRef = fenc.weightedRef[deltaIndex];
+    intptr_t padoffset = fenc.lowresPlane[0] - fenc.buffer[0];
+    for (int i = 0; i < 4; i++)
+        weightedRef.lowresPlane[i] = wbuffer[i] + padoffset;
+
+    weightedRef.fpelPlane[0] = weightedRef.lowresPlane[0];
+    weightedRef.lumaStride = fenc.lumaStride;
+    weightedRef.isLowres = true;
+    weightedRef.isWeighted = false;
+
     /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
     float guessScale, fencMean, refMean;
     x265_emms();
@@ -478,7 +502,13 @@
 
     m_8x8Height = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
     m_8x8Width = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-    m_8x8Blocks = m_8x8Width > 2 && m_8x8Height > 2 ? (m_8x8Width - 2) * (m_8x8Height - 2) : m_8x8Width * m_8x8Height;
+    m_cuCount = m_8x8Width * m_8x8Height;
+    m_8x8Blocks = m_8x8Width > 2 && m_8x8Height > 2 ? (m_cuCount + 4 - 2 * (m_8x8Width + m_8x8Height)) : m_cuCount;
+
+    /* Allow the strength to be adjusted via qcompress, since the two concepts
+     * are very similar. */
+
+    m_cuTreeStrength = 5.0 * (1.0 - m_param->rc.qCompress);
 
     m_lastKeyframe = -m_param->keyframeMax;
     m_sliceTypeBusy = false;
@@ -502,7 +532,16 @@
     m_bBatchFrameCosts = m_bBatchMotionSearch;
 
     if (m_param->lookaheadSlices && !m_pool)
+    {
+        x265_log(param, X265_LOG_WARNING, "No pools found; disabling lookahead-slices\n");
+        m_param->lookaheadSlices = 0;
+    }
+
+    if (m_param->lookaheadSlices && (m_param->sourceHeight < 720))
+    {
+        x265_log(param, X265_LOG_WARNING, "Source height < 720p; disabling lookahead-slices\n");
         m_param->lookaheadSlices = 0;
+    }
 
     if (m_param->lookaheadSlices > 1)
     {
@@ -715,16 +754,16 @@
 
     case P_SLICE:
         b = p1 = poc - l0poc;
-        frames[p0] = &slice->m_refPicList[0][0]->m_lowres;
+        frames[p0] = &slice->m_refFrameList[0][0]->m_lowres;
         frames[b] = &curFrame->m_lowres;
         break;
 
     case B_SLICE:
         b = poc - l0poc;
         p1 = b + l1poc - poc;
-        frames[p0] = &slice->m_refPicList[0][0]->m_lowres;
+        frames[p0] = &slice->m_refFrameList[0][0]->m_lowres;
         frames[b] = &curFrame->m_lowres;
-        frames[p1] = &slice->m_refPicList[1][0]->m_lowres;
+        frames[p1] = &slice->m_refFrameList[1][0]->m_lowres;
         break;
 
     default:
@@ -736,10 +775,13 @@
     if (m_param->rc.cuTree && !m_param->rc.bStatRead)
         /* update row satds based on cutree offsets */
         curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b);
-    else if (m_param->rc.aqMode)
-        curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b];
-    else
-        curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b];
+    else if (m_param->analysisMode != X265_ANALYSIS_LOAD)
+    {
+        if (m_param->rc.aqMode)
+            curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b];
+        else
+            curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b];
+    }

 
@@ -83,8 +83,11 @@
     uint32_t var;
 
     var  = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, csp);
-    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp);
-    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp);
+    if (csp != X265_CSP_I400)
+    {
+        var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp);
+        var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp);
+    }
     x265_emms();
     return var;
 }
@@ -96,6 +99,7 @@
     int maxRow = curFrame->m_fencPic->m_picHeight;
     int blockCount = curFrame->m_lowres.maxBlocksInRow * curFrame->m_lowres.maxBlocksInCol;
 
+    float* quantOffsets = curFrame->m_quantOffsets;
     for (int y = 0; y < 3; y++)
     {
         curFrame->m_lowres.wp_ssd[y] = 0;
@@ -113,10 +117,21 @@
 
         if (param->rc.aqMode && param->rc.aqStrength == 0)
         {
-            memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
-            memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
-            for (int cuxy = 0; cuxy < cuCount; cuxy++)
-                curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
+            if (quantOffsets)
+            {
+                for (int cuxy = 0; cuxy < cuCount; cuxy++)
+                {
+                    curFrame->m_lowres.qpCuTreeOffset[cuxy] = curFrame->m_lowres.qpAqOffset[cuxy] = quantOffsets[cuxy];
+                    curFrame->m_lowres.invQscaleFactor[cuxy] = x265_exp2fix8(curFrame->m_lowres.qpCuTreeOffset[cuxy]);
+                }
+            }
+            else
+            {
+                memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
+                memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
+                for (int cuxy = 0; cuxy < cuCount; cuxy++)
+                    curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
+            }
         }
 
         /* Need variance data for weighted prediction */
@@ -135,19 +150,25 @@
         if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
         {
             double bit_depth_correction = 1.f / (1 << (2*(X265_DEPTH-8)));
+            curFrame->m_lowres.frameVariance = 0;
+            uint64_t rowVariance = 0;
             for (blockY = 0; blockY < maxRow; blockY += 16)
             {
+                rowVariance = 0;
                 for (blockX = 0; blockX < maxCol; blockX += 16)
                 {
                     uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
+                    curFrame->m_lowres.blockVariance[blockXY] = energy;
+                    rowVariance += energy;
                     qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
                     curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
                     avg_adj += qp_adj;
                     avg_adj_pow2 += qp_adj * qp_adj;
                     blockXY++;
                 }
+                curFrame->m_lowres.frameVariance += (rowVariance / maxCol);
             }
-
+            curFrame->m_lowres.frameVariance /= maxRow;
             avg_adj /= blockCount;
             avg_adj_pow2 /= blockCount;
             strength = param->rc.aqStrength * avg_adj;
@@ -177,6 +198,8 @@
                     uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
                     qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (14.427f + 2 * (X265_DEPTH - 8)));
                 }
+                if (quantOffsets != NULL)
+                    qp_adj += quantOffsets[blockXY];
                 curFrame->m_lowres.qpAqOffset[blockXY] = qp_adj;
                 curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
                 curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(qp_adj);
@@ -328,7 +351,7 @@
 
         primitives.weight_pp(ref.buffer[0], wbuffer[0], stride, widthHeight, paddedLines,
             scale, round << correction, denom + correction, offset);
-        src = weightedRef.fpelPlane[0];
+        src = fenc.weightedRef[fenc.frameNum - ref.frameNum].fpelPlane[0];
     }
 
     uint32_t cost = 0;
@@ -350,7 +373,6 @@
 bool LookaheadTLD::allocWeightedRef(Lowres& fenc)
 {
     intptr_t planesize = fenc.buffer[1] - fenc.buffer[0];
-    intptr_t padoffset = fenc.lowresPlane[0] - fenc.buffer[0];
     paddedLines = (int)(planesize / fenc.lumaStride);
 
     wbuffer[0] = X265_MALLOC(pixel, 4 * planesize);
@@ -363,14 +385,6 @@
     else
         return false;
 
-    for (int i = 0; i < 4; i++)
-        weightedRef.lowresPlane[i] = wbuffer[i] + padoffset;
-
-    weightedRef.fpelPlane[0] = weightedRef.lowresPlane[0];
-    weightedRef.lumaStride = fenc.lumaStride;
-    weightedRef.isLowres = true;
-    weightedRef.isWeighted = false;
-
     return true;
 }
 
@@ -388,6 +402,16 @@
             return;
     }
 
+    ReferencePlanes& weightedRef = fenc.weightedRef[deltaIndex];
+    intptr_t padoffset = fenc.lowresPlane[0] - fenc.buffer[0];
+    for (int i = 0; i < 4; i++)
+        weightedRef.lowresPlane[i] = wbuffer[i] + padoffset;
+
+    weightedRef.fpelPlane[0] = weightedRef.lowresPlane[0];
+    weightedRef.lumaStride = fenc.lumaStride;
+    weightedRef.isLowres = true;
+    weightedRef.isWeighted = false;
+
     /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
     float guessScale, fencMean, refMean;
     x265_emms();
@@ -478,7 +502,13 @@
 
     m_8x8Height = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
     m_8x8Width = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-    m_8x8Blocks = m_8x8Width > 2 && m_8x8Height > 2 ? (m_8x8Width - 2) * (m_8x8Height - 2) : m_8x8Width * m_8x8Height;
+    m_cuCount = m_8x8Width * m_8x8Height;
+    m_8x8Blocks = m_8x8Width > 2 && m_8x8Height > 2 ? (m_cuCount + 4 - 2 * (m_8x8Width + m_8x8Height)) : m_cuCount;
+
+    /* Allow the strength to be adjusted via qcompress, since the two concepts
+     * are very similar. */
+
+    m_cuTreeStrength = 5.0 * (1.0 - m_param->rc.qCompress);
 
     m_lastKeyframe = -m_param->keyframeMax;
     m_sliceTypeBusy = false;
@@ -502,7 +532,16 @@
     m_bBatchFrameCosts = m_bBatchMotionSearch;
 
     if (m_param->lookaheadSlices && !m_pool)
+    {
+        x265_log(param, X265_LOG_WARNING, "No pools found; disabling lookahead-slices\n");
+        m_param->lookaheadSlices = 0;
+    }
+
+    if (m_param->lookaheadSlices && (m_param->sourceHeight < 720))
+    {
+        x265_log(param, X265_LOG_WARNING, "Source height < 720p; disabling lookahead-slices\n");
         m_param->lookaheadSlices = 0;
+    }
 
     if (m_param->lookaheadSlices > 1)
     {
@@ -715,16 +754,16 @@
 
     case P_SLICE:
         b = p1 = poc - l0poc;
-        frames[p0] = &slice->m_refPicList[0][0]->m_lowres;
+        frames[p0] = &slice->m_refFrameList[0][0]->m_lowres;
         frames[b] = &curFrame->m_lowres;
         break;
 
     case B_SLICE:
         b = poc - l0poc;
         p1 = b + l1poc - poc;
-        frames[p0] = &slice->m_refPicList[0][0]->m_lowres;
+        frames[p0] = &slice->m_refFrameList[0][0]->m_lowres;
         frames[b] = &curFrame->m_lowres;
-        frames[p1] = &slice->m_refPicList[1][0]->m_lowres;
+        frames[p1] = &slice->m_refFrameList[1][0]->m_lowres;
         break;
 
     default:
@@ -736,10 +775,13 @@
     if (m_param->rc.cuTree && !m_param->rc.bStatRead)
         /* update row satds based on cutree offsets */
         curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b);
-    else if (m_param->rc.aqMode)
-        curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b];
-    else
-        curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b];
+    else if (m_param->analysisMode != X265_ANALYSIS_LOAD)
+    {
+        if (m_param->rc.aqMode)
+            curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b];
+        else
+            curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b];
+    }
​

x265_1.8.tar.gz/source/encoder/slicetype.h -> x265_1.9.tar.gz/source/encoder/slicetype.h Changed

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -44,7 +45,6 @@
 struct LookaheadTLD
 {
     MotionEstimate  me;
-    ReferencePlanes weightedRef;
     pixel*          wbuffer[4];
     int             widthInCU;
     int             heightInCU;
@@ -103,29 +103,30 @@
     PicList       m_outputQueue;     // pictures to be encoded, in encode order
     Lock          m_inputLock;
     Lock          m_outputLock;
-
-    /* pre-lookahead */
-    int           m_fullQueueSize;
-    bool          m_isActive;
-    bool          m_sliceTypeBusy;
-    bool          m_bAdaptiveQuant;
-    bool          m_outputSignalRequired;
-    bool          m_bBatchMotionSearch;
-    bool          m_bBatchFrameCosts;
     Event         m_outputSignal;
-
     LookaheadTLD* m_tld;
     x265_param*   m_param;
     Lowres*       m_lastNonB;
     int*          m_scratch;         // temp buffer for cutree propagate
-    
+
+    /* pre-lookahead */
+    int           m_fullQueueSize;
     int           m_histogram[X265_BFRAME_MAX + 1];
     int           m_lastKeyframe;
     int           m_8x8Width;
     int           m_8x8Height;
     int           m_8x8Blocks;
+    int           m_cuCount;
     int           m_numCoopSlices;
     int           m_numRowsPerSlice;
+    double        m_cuTreeStrength;
+
+    bool          m_isActive;
+    bool          m_sliceTypeBusy;
+    bool          m_bAdaptiveQuant;
+    bool          m_outputSignalRequired;
+    bool          m_bBatchMotionSearch;
+    bool          m_bBatchFrameCosts;
     bool          m_filled;
     bool          m_isSceneTransition;
     Lookahead(x265_param *param, ThreadPool *pool);
​

x265_1.8.tar.gz/source/encoder/weightPrediction.cpp -> x265_1.9.tar.gz/source/encoder/weightPrediction.cpp Changed

@@ -4,6 +4,7 @@
  * Author: Shazeb Nawaz Khan <shazeb@multicorewareinc.com>
  *         Steve Borho <steve@borho.org>
  *         Kavitha Sampas <kavitha@multicorewareinc.com>
+ *         Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -259,13 +260,13 @@
     for (int list = 0; list < cache.numPredDir; list++)
     {
         WeightParam *weights = wp[list][0];
-        Frame *refFrame = slice.m_refPicList[list][0];
+        Frame *refFrame = slice.m_refFrameList[list][0];
         Lowres& refLowres = refFrame->m_lowres;
         int diffPoc = abs(curPoc - refFrame->m_poc);
 
         /* prepare estimates */
         float guessScale[3], fencMean[3], refMean[3];
-        for (int plane = 0; plane < 3; plane++)
+        for (int plane = 0; plane < (param.internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
         {
             SET_WEIGHT(weights[plane], false, 1, 0, 0);
             uint64_t fencVar = fenc.wp_ssd[plane] + !refLowres.wp_ssd[plane];
@@ -289,7 +290,7 @@
 
         MV *mvs = NULL;
 
-        for (int plane = 0; plane < 3; plane++)
+        for (int plane = 0; plane < (param.internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
         {
             denom = plane ? chromaDenom : lumaDenom;
             if (plane && !weights[0].bPresentFlag)
@@ -328,7 +329,7 @@
                 {
                     /* reference chroma planes must be extended prior to being
                      * used as motion compensation sources */
-                    if (!refFrame->m_bChromaExtended)
+                    if (!refFrame->m_bChromaExtended && param.internalCsp != X265_CSP_I400)
                     {
                         refFrame->m_bChromaExtended = true;
                         PicYuv *refPic = refFrame->m_fencPic;

 
@@ -4,6 +4,7 @@
  * Author: Shazeb Nawaz Khan <shazeb@multicorewareinc.com>
  *         Steve Borho <steve@borho.org>
  *         Kavitha Sampas <kavitha@multicorewareinc.com>
+ *         Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -259,13 +260,13 @@
     for (int list = 0; list < cache.numPredDir; list++)
     {
         WeightParam *weights = wp[list][0];
-        Frame *refFrame = slice.m_refPicList[list][0];
+        Frame *refFrame = slice.m_refFrameList[list][0];
         Lowres& refLowres = refFrame->m_lowres;
         int diffPoc = abs(curPoc - refFrame->m_poc);
 
         /* prepare estimates */
         float guessScale[3], fencMean[3], refMean[3];
-        for (int plane = 0; plane < 3; plane++)
+        for (int plane = 0; plane < (param.internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
         {
             SET_WEIGHT(weights[plane], false, 1, 0, 0);
             uint64_t fencVar = fenc.wp_ssd[plane] + !refLowres.wp_ssd[plane];
@@ -289,7 +290,7 @@
 
         MV *mvs = NULL;
 
-        for (int plane = 0; plane < 3; plane++)
+        for (int plane = 0; plane < (param.internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
         {
             denom = plane ? chromaDenom : lumaDenom;
             if (plane && !weights[0].bPresentFlag)
@@ -328,7 +329,7 @@
                 {
                     /* reference chroma planes must be extended prior to being
                      * used as motion compensation sources */
-                    if (!refFrame->m_bChromaExtended)
+                    if (!refFrame->m_bChromaExtended && param.internalCsp != X265_CSP_I400)
                     {
                         refFrame->m_bChromaExtended = true;
                         PicYuv *refPic = refFrame->m_fencPic;
​

x265_1.8.tar.gz/source/output/y4m.cpp -> x265_1.9.tar.gz/source/output/y4m.cpp Changed

 
@@ -70,7 +70,7 @@
         x265_log(NULL, X265_LOG_WARNING, "y4m: forcing reconstructed pixels to 8 bits\n");
 #endif
 
-    X265_CHECK(pic.colorSpace == colorSpace, "invalid color space\n");
+    X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
 
 #if HIGH_BIT_DEPTH
 
​

x265_1.8.tar.gz/source/output/yuv.cpp -> x265_1.9.tar.gz/source/output/yuv.cpp Changed

 
@@ -53,7 +53,7 @@
     uint64_t fileOffset = pic.poc;
     fileOffset *= frameSize;
 
-    X265_CHECK(pic.colorSpace == colorSpace, "invalid color space\n");
+    X265_CHECK(pic.colorSpace == colorSpace, "invalid chroma subsampling\n");
     X265_CHECK(pic.bitDepth == (int)depth, "invalid bit depth\n");
 
 #if HIGH_BIT_DEPTH
​

x265_1.8.tar.gz/source/profile/vtune/CMakeLists.txt -> x265_1.9.tar.gz/source/profile/vtune/CMakeLists.txt Changed

 
@@ -1,2 +1,2 @@
-include_directories($ENV{VTUNE_AMPLIFIER_XE_2015_DIR}/include)
+include_directories(${VTUNE_INCLUDE_DIR})
 add_library(vtune vtune.h vtune.cpp ../cpuEvents.h)
​

x265_1.8.tar.gz/source/profile/vtune/vtune.cpp -> x265_1.9.tar.gz/source/profile/vtune/vtune.cpp Changed

 
@@ -30,7 +30,6 @@
 const char *stringNames[] =
 {
 #include "../cpuEvents.h"
-    ""
 };
 #undef CPU_EVENT
 
@@ -44,7 +43,8 @@
 void vtuneInit()
 {
     domain = __itt_domain_create("x265");
-    for (size_t i = 0; i < sizeof(stringNames) / sizeof(const char *); i++)
+    size_t length = sizeof(stringNames) / sizeof(const char *);
+    for (size_t i = 0; i < length; i++)
         taskHandle[i] = __itt_string_handle_create(stringNames[i]);
 }
 
​

x265_1.8.tar.gz/source/test/checkasm-a.asm -> x265_1.9.tar.gz/source/test/checkasm-a.asm Changed

 
@@ -2,9 +2,11 @@
 ;* checkasm-a.asm: assembly check tool
 ;*****************************************************************************
 ;* Copyright (C) 2008-2014 x264 project
+;* Copyright (C) 2013-2015 x265 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Henrik Gramner <henrik@gramner.com>
+;*          Min Chen <chenm003@163.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
​

x265_1.8.tar.gz/source/test/intrapredharness.cpp -> x265_1.9.tar.gz/source/test/intrapredharness.cpp Changed

 
@@ -130,6 +130,8 @@
                 if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width * sizeof(pixel)))
                 {
                     printf("ang_%dx%d, Mode = %d, Row = %d failed !!\n", width, width, pmode, k);
+                    ref[pmode](pixel_out_c, stride, pixel_buff + j, pmode, bFilter);
+                    opt[pmode](pixel_out_vec, stride, pixel_buff + j, pmode, bFilter);
                     return false;
                 }
             }
​

x265_1.8.tar.gz/source/test/ipfilterharness.h -> x265_1.9.tar.gz/source/test/ipfilterharness.h Changed

 
@@ -4,6 +4,7 @@
  * Authors: Deepthi Devaki <deepthidevaki@multicorewareinc.com>,
  *          Rajesh Paulraj <rajesh@multicorewareinc.com>
  *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
​

x265_1.8.tar.gz/source/test/pixelharness.cpp -> x265_1.9.tar.gz/source/test/pixelharness.cpp Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -41,6 +42,7 @@
         int_test_buff[0][i]     = rand() % SHORT_MAX;
         ushort_test_buff[0][i]  = rand() % ((1 << 16) - 1);
         uchar_test_buff[0][i]   = rand() % ((1 << 8) - 1);
+        residual_test_buff[0][i] = (rand() % (2 * RMAX + 1)) - RMAX - 1;// For sse_ss only
 
         pixel_test_buff[1][i]   = PIXEL_MIN;
         short_test_buff[1][i]   = SMIN;
@@ -49,6 +51,7 @@
         int_test_buff[1][i]     = SHORT_MIN;
         ushort_test_buff[1][i]  = PIXEL_MIN;
         uchar_test_buff[1][i]   = PIXEL_MIN;
+        residual_test_buff[1][i] = RMIN;
 
         pixel_test_buff[2][i]   = PIXEL_MAX;
         short_test_buff[2][i]   = SMAX;
@@ -57,6 +60,7 @@
         int_test_buff[2][i]     = SHORT_MAX;
         ushort_test_buff[2][i]  = ((1 << 16) - 1);
         uchar_test_buff[2][i]   = 255;
+        residual_test_buff[2][i] = RMAX;
 
         pbuf1[i] = rand() & PIXEL_MAX;
         pbuf2[i] = rand() & PIXEL_MAX;
@@ -103,8 +107,8 @@
     {
         int index1 = rand() % TEST_CASES;
         int index2 = rand() % TEST_CASES;
-        sse_ret_t vres = (sse_ret_t)checked(opt, pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
-        sse_ret_t cres = ref(pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
+        sse_t vres = (sse_t)checked(opt, pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
+        sse_t cres = ref(pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
         if (vres != cres)
             return false;
 
@@ -124,8 +128,8 @@
     {
         int index1 = rand() % TEST_CASES;
         int index2 = rand() % TEST_CASES;
-        sse_ret_t vres = (sse_ret_t)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
-        sse_ret_t cres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
+        sse_t vres = (sse_t)checked(opt, residual_test_buff[index1], stride, residual_test_buff[index2] + j, stride);
+        sse_t cres = ref(residual_test_buff[index1], stride, residual_test_buff[index2] + j, stride);
         if (vres != cres)
             return false;
 
@@ -227,8 +231,8 @@
     {
         // NOTE: stride must be multiple of 16, because minimum block is 4x4
         int stride = (STRIDE + (rand() % STRIDE)) & ~15;
-        int cres = ref(sbuf1 + j, stride);
-        int vres = (int)checked(opt, sbuf1 + j, (intptr_t)stride);
+        sse_t cres = ref(sbuf1 + j, stride);
+        sse_t vres = (sse_t)checked(opt, sbuf1 + j, (intptr_t)stride);
 
         if (cres != vres)
             return false;
@@ -854,7 +858,7 @@
         int width = (rand() % 4) + 1; // range[1-4]
         float cres = ref(sum0, sum1, width);
         float vres = checked_float(opt, sum0, sum1, width);
-        if (fabs(vres - cres) > 0.00001)
+        if (fabs(vres - cres) > 0.0001)
             return false;
 
         reportfail();
@@ -1061,8 +1065,8 @@
         int endX = MAX_CU_SIZE - (rand() % 5);
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
 
-        ref(pbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_ref, count_ref);
-        checked(opt, pbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_vec, count_vec);
+        ref(sbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_ref, count_ref);
+        checked(opt, sbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_vec, count_vec);
 
         if (memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref)))
             return false;
@@ -1097,8 +1101,8 @@
         int endX = MAX_CU_SIZE - (rand() % 5) - 1;
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
 
-        ref(pbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_ref, count_ref);
-        checked(opt, pbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_vec, count_vec);
+        ref(sbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_ref, count_ref);
+        checked(opt, sbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_vec, count_vec);
 
         if (memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref)))
             return false;
@@ -1141,8 +1145,8 @@
         int endX = MAX_CU_SIZE - (rand() % 5);
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
 
-        ref(pbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
-        checked(opt, pbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
+        ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
+        checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
 
         if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
             || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
@@ -1193,8 +1197,8 @@
         int endX = MAX_CU_SIZE - (rand() % 5) - 1;
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
 
-        ref(pbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, upBufft_ref, endX, endY, stats_ref, count_ref);
-        checked(opt, pbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, upBufft_vec, endX, endY, stats_vec, count_vec);
+        ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, upBufft_ref, endX, endY, stats_ref, count_ref);
+        checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, upBufft_vec, endX, endY, stats_vec, count_vec);
 
         // TODO: don't check upBuff*, the latest output pixels different, and can move into stack temporary buffer in future
         if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
@@ -1244,8 +1248,8 @@
         int endX = MAX_CU_SIZE - (rand() % 5) - 1;
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
 
-        ref(pbuf2, pbuf3, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
-        checked(opt, pbuf2, pbuf3, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
+        ref(sbuf2, pbuf3, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
+        checked(opt, sbuf2, pbuf3, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
 
         if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
             || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
@@ -1295,8 +1299,8 @@
 
     memset(ref_dest, 0xCD, sizeof(ref_dest));
     memset(opt_dest, 0xCD, sizeof(opt_dest));
-    int width = 32 + rand() % 32;
-    int height = 32 + rand() % 32;
+    int width = 32 + (rand() % 32);
+    int height = 32 + (rand() % 32);
     intptr_t srcStride = 64;
     intptr_t dstStride = width;
     int j = 0;
@@ -1304,11 +1308,23 @@
     for (int i = 0; i < ITERS; i++)
     {
         int index = i % TEST_CASES;
+
         checked(opt, ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
         ref(ushort_test_buff[index] + j, srcStride, ref_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
 
-        if (memcmp(ref_dest, opt_dest, width * height * sizeof(pixel)))
+        if (memcmp(ref_dest, opt_dest, dstStride * height * sizeof(pixel)))
+        {
+            memcpy(opt_dest, ref_dest, sizeof(ref_dest));
+            opt(ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
             return false;
+        }
+
+        // check tail memory area
+        for(int x = width; x < dstStride; x++)
+        {
+            if (opt_dest[(height - 1 * dstStride) + x] != 0xCD)
+                return false;
+        }
 
         reportfail();
         j += INCR;
@@ -1340,6 +1356,13 @@
         if (memcmp(ref_dest, opt_dest, sizeof(ref_dest)))
             return false;
 
+        // check tail memory area
+        for(int x = width; x < dstStride; x++)
+        {
+            if (opt_dest[(height - 1 * dstStride) + x] != 0xCD)
+                return false;
+        }
+
         reportfail();
         j += INCR;
     }
@@ -1356,16 +1379,16 @@
     memset(opt_dest, 0xCD, sizeof(opt_dest));
 
     double fps = 1.0;
-    int width = 16 + rand() % 64;
     int j = 0;
 
     for (int i = 0; i < ITERS; i++)
     {
+        int width = 16 + rand() % 64;
         int index = i % TEST_CASES;
         checked(opt, opt_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width);
         ref(ref_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width);
 
-        if (memcmp(ref_dest, opt_dest, width * sizeof(pixel)))
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
             return false;
 
         reportfail();
@@ -1397,28 +1420,6 @@
     return true;

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -41,6 +42,7 @@
         int_test_buff[0][i]     = rand() % SHORT_MAX;
         ushort_test_buff[0][i]  = rand() % ((1 << 16) - 1);
         uchar_test_buff[0][i]   = rand() % ((1 << 8) - 1);
+        residual_test_buff[0][i] = (rand() % (2 * RMAX + 1)) - RMAX - 1;// For sse_ss only
 
         pixel_test_buff[1][i]   = PIXEL_MIN;
         short_test_buff[1][i]   = SMIN;
@@ -49,6 +51,7 @@
         int_test_buff[1][i]     = SHORT_MIN;
         ushort_test_buff[1][i]  = PIXEL_MIN;
         uchar_test_buff[1][i]   = PIXEL_MIN;
+        residual_test_buff[1][i] = RMIN;
 
         pixel_test_buff[2][i]   = PIXEL_MAX;
         short_test_buff[2][i]   = SMAX;
@@ -57,6 +60,7 @@
         int_test_buff[2][i]     = SHORT_MAX;
         ushort_test_buff[2][i]  = ((1 << 16) - 1);
         uchar_test_buff[2][i]   = 255;
+        residual_test_buff[2][i] = RMAX;
 
         pbuf1[i] = rand() & PIXEL_MAX;
         pbuf2[i] = rand() & PIXEL_MAX;
@@ -103,8 +107,8 @@
     {
         int index1 = rand() % TEST_CASES;
         int index2 = rand() % TEST_CASES;
-        sse_ret_t vres = (sse_ret_t)checked(opt, pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
-        sse_ret_t cres = ref(pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
+        sse_t vres = (sse_t)checked(opt, pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
+        sse_t cres = ref(pixel_test_buff[index1], stride, pixel_test_buff[index2] + j, stride);
         if (vres != cres)
             return false;
 
@@ -124,8 +128,8 @@
     {
         int index1 = rand() % TEST_CASES;
         int index2 = rand() % TEST_CASES;
-        sse_ret_t vres = (sse_ret_t)checked(opt, short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
-        sse_ret_t cres = ref(short_test_buff[index1], stride, short_test_buff[index2] + j, stride);
+        sse_t vres = (sse_t)checked(opt, residual_test_buff[index1], stride, residual_test_buff[index2] + j, stride);
+        sse_t cres = ref(residual_test_buff[index1], stride, residual_test_buff[index2] + j, stride);
         if (vres != cres)
             return false;
 
@@ -227,8 +231,8 @@
     {
         // NOTE: stride must be multiple of 16, because minimum block is 4x4
         int stride = (STRIDE + (rand() % STRIDE)) & ~15;
-        int cres = ref(sbuf1 + j, stride);
-        int vres = (int)checked(opt, sbuf1 + j, (intptr_t)stride);
+        sse_t cres = ref(sbuf1 + j, stride);
+        sse_t vres = (sse_t)checked(opt, sbuf1 + j, (intptr_t)stride);
 
         if (cres != vres)
             return false;
@@ -854,7 +858,7 @@
         int width = (rand() % 4) + 1; // range[1-4]
         float cres = ref(sum0, sum1, width);
         float vres = checked_float(opt, sum0, sum1, width);
-        if (fabs(vres - cres) > 0.00001)
+        if (fabs(vres - cres) > 0.0001)
             return false;
 
         reportfail();
@@ -1061,8 +1065,8 @@
         int endX = MAX_CU_SIZE - (rand() % 5);
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
 
-        ref(pbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_ref, count_ref);
-        checked(opt, pbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_vec, count_vec);
+        ref(sbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_ref, count_ref);
+        checked(opt, sbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_vec, count_vec);
 
         if (memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref)))
             return false;
@@ -1097,8 +1101,8 @@
         int endX = MAX_CU_SIZE - (rand() % 5) - 1;
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
 
-        ref(pbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_ref, count_ref);
-        checked(opt, pbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_vec, count_vec);
+        ref(sbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_ref, count_ref);
+        checked(opt, sbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_vec, count_vec);
 
         if (memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref)))
             return false;
@@ -1141,8 +1145,8 @@
         int endX = MAX_CU_SIZE - (rand() % 5);
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
 
-        ref(pbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
-        checked(opt, pbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
+        ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
+        checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
 
         if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
             || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
@@ -1193,8 +1197,8 @@
         int endX = MAX_CU_SIZE - (rand() % 5) - 1;
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
 
-        ref(pbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, upBufft_ref, endX, endY, stats_ref, count_ref);
-        checked(opt, pbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, upBufft_vec, endX, endY, stats_vec, count_vec);
+        ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, upBufft_ref, endX, endY, stats_ref, count_ref);
+        checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, upBufft_vec, endX, endY, stats_vec, count_vec);
 
         // TODO: don't check upBuff*, the latest output pixels different, and can move into stack temporary buffer in future
         if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
@@ -1244,8 +1248,8 @@
         int endX = MAX_CU_SIZE - (rand() % 5) - 1;
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
 
-        ref(pbuf2, pbuf3, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
-        checked(opt, pbuf2, pbuf3, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
+        ref(sbuf2, pbuf3, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
+        checked(opt, sbuf2, pbuf3, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
 
         if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
             || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
@@ -1295,8 +1299,8 @@
 
     memset(ref_dest, 0xCD, sizeof(ref_dest));
     memset(opt_dest, 0xCD, sizeof(opt_dest));
-    int width = 32 + rand() % 32;
-    int height = 32 + rand() % 32;
+    int width = 32 + (rand() % 32);
+    int height = 32 + (rand() % 32);
     intptr_t srcStride = 64;
     intptr_t dstStride = width;
     int j = 0;
@@ -1304,11 +1308,23 @@
     for (int i = 0; i < ITERS; i++)
     {
         int index = i % TEST_CASES;
+
         checked(opt, ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
         ref(ushort_test_buff[index] + j, srcStride, ref_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
 
-        if (memcmp(ref_dest, opt_dest, width * height * sizeof(pixel)))
+        if (memcmp(ref_dest, opt_dest, dstStride * height * sizeof(pixel)))
+        {
+            memcpy(opt_dest, ref_dest, sizeof(ref_dest));
+            opt(ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
             return false;
+        }
+
+        // check tail memory area
+        for(int x = width; x < dstStride; x++)
+        {
+            if (opt_dest[(height - 1 * dstStride) + x] != 0xCD)
+                return false;
+        }
 
         reportfail();
         j += INCR;
@@ -1340,6 +1356,13 @@
         if (memcmp(ref_dest, opt_dest, sizeof(ref_dest)))
             return false;
 
+        // check tail memory area
+        for(int x = width; x < dstStride; x++)
+        {
+            if (opt_dest[(height - 1 * dstStride) + x] != 0xCD)
+                return false;
+        }
+
         reportfail();
         j += INCR;
     }
@@ -1356,16 +1379,16 @@
     memset(opt_dest, 0xCD, sizeof(opt_dest));
 
     double fps = 1.0;
-    int width = 16 + rand() % 64;
     int j = 0;
 
     for (int i = 0; i < ITERS; i++)
     {
+        int width = 16 + rand() % 64;
         int index = i % TEST_CASES;
         checked(opt, opt_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width);
         ref(ref_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width);
 
-        if (memcmp(ref_dest, opt_dest, width * sizeof(pixel)))
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
             return false;
 
         reportfail();
@@ -1397,28 +1420,6 @@
     return true;
​

x265_1.8.tar.gz/source/test/pixelharness.h -> x265_1.9.tar.gz/source/test/pixelharness.h Changed

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -40,6 +41,8 @@
     enum { TEST_CASES = 3 };
     enum { SMAX = 1 << 12 };
     enum { SMIN = -1 << 12 };
+    enum { RMAX = PIXEL_MAX - PIXEL_MIN }; //The maximum value obtained by subtracting pixel values (residual max)
+    enum { RMIN = PIXEL_MIN - PIXEL_MAX }; //The minimum value obtained by subtracting pixel values (residual min)
 
     ALIGN_VAR_32(pixel, pbuf1[BUFFSIZE]);
     pixel    pbuf2[BUFFSIZE];
@@ -64,6 +67,7 @@
     uint16_t ushort_test_buff[TEST_CASES][BUFFSIZE];
     uint8_t  uchar_test_buff[TEST_CASES][BUFFSIZE];
     double   double_test_buff[TEST_CASES][BUFFSIZE];
+    int16_t  residual_test_buff[TEST_CASES][BUFFSIZE];
 
     bool check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt);
     bool check_pixel_sse(pixel_sse_t ref, pixel_sse_t opt);
@@ -110,12 +114,15 @@
     bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
     bool check_cutree_propagate_cost(cutree_propagate_cost ref, cutree_propagate_cost opt);
     bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
-    bool check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt);
     bool check_calSign(sign_t ref, sign_t opt);
     bool check_scanPosLast(scanPosLast_t ref, scanPosLast_t opt);
     bool check_findPosFirstLast(findPosFirstLast_t ref, findPosFirstLast_t opt);
     bool check_costCoeffNxN(costCoeffNxN_t ref, costCoeffNxN_t opt);
     bool check_costCoeffRemain(costCoeffRemain_t ref, costCoeffRemain_t opt);
+    bool check_costC1C2Flag(costC1C2Flag_t ref, costC1C2Flag_t opt);
+    bool check_planeClipAndMax(planeClipAndMax_t ref, planeClipAndMax_t opt);
+    bool check_pelFilterLumaStrong_V(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
+    bool check_pelFilterLumaStrong_H(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
 
 public:
 
​

x265_1.8.tar.gz/source/test/regression-tests.txt -> x265_1.9.tar.gz/source/test/regression-tests.txt Changed

@@ -11,124 +11,132 @@
 # consistent across many machines, you must force a certain -FN so it is
 # not auto-detected.
 
+BasketballDrive_1920x1080_50.y4m,--preset ultrafast --signhide --colormatrix bt709
+BasketballDrive_1920x1080_50.y4m,--preset superfast --psy-rd 1 --ctu 16 --no-wpp --limit-modes
+BasketballDrive_1920x1080_50.y4m,--preset veryfast --tune zerolatency --no-temporal-mvp
 BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190
 BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless
 BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao
+BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 7000 --limit-modes,--preset medium --no-cutree --analysis-mode=load --bitrate 7000 --limit-modes
 BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16 --limit-refs 1
 BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0
-BasketballDrive_1920x1080_50.y4m,--preset superfast --psy-rd 1 --ctu 16 --no-wpp
-BasketballDrive_1920x1080_50.y4m,--preset ultrafast --signhide --colormatrix bt709
-BasketballDrive_1920x1080_50.y4m,--preset veryfast --tune zerolatency --no-temporal-mvp
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1
+BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-mode=save --bitrate 7000,--preset slower --no-cutree --analysis-mode=load --bitrate 7000
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-mode=save --bitrate 7000 --tskip-fast,--preset veryslow --no-cutree --analysis-mode=load --bitrate 7000  --tskip-fast
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
+Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
+Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
+Coastguard-4k.y4m,--preset veryfast --no-cutree --analysis-mode=save --bitrate 15000,--preset veryfast --no-cutree --analysis-mode=load --bitrate 15000
 Coastguard-4k.y4m,--preset medium --rdoq-level 1 --tune ssim --no-signhide --me umh
 Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1
-Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --max-tu-size 4 --min-cu-size 32
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset medium --no-wpp --no-cutree --no-strong-intra-smoothing --limit-refs 1
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset slow --no-wpp --tune ssim --transfer smpte240m
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset slower --tune ssim --tune fastdecode --limit-refs 2
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset ultrafast --weightp --no-wpp --no-open-gop
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers --limit-refs 2
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1 --limit-modes
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp --qg-size 16
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16 --limit-modes
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
+DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3
-DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
-FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
 FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2
+FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
+FourPeople_1280x720_60.y4m,--preset medium --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
+FourPeople_1280x720_60.y4m,--preset veryslow --numa-pools "none"
+Keiba_832x480_30.y4m,--preset superfast --no-fast-intra --nr-intra 1000 -F4
 Keiba_832x480_30.y4m,--preset medium --pmode --tune grain
 Keiba_832x480_30.y4m,--preset slower --fast-intra --nr-inter 500 -F4 --limit-refs 0
-Keiba_832x480_30.y4m,--preset superfast --no-fast-intra --nr-intra 1000 -F4
-Kimono1_1920x1080_24_10bit_444.yuv,--preset medium --min-cu-size 32
 Kimono1_1920x1080_24_10bit_444.yuv,--preset superfast --weightb
-KristenAndSara_1280x720_60.y4m,--preset medium --no-cutree --max-tu-size 16
-KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0
-KristenAndSara_1280x720_60.y4m,--preset superfast --min-cu-size 16 --qg-size 16 --limit-refs 1
+Kimono1_1920x1080_24_10bit_444.yuv,--preset medium --min-cu-size 32
 KristenAndSara_1280x720_60.y4m,--preset ultrafast --strong-intra-smoothing
-NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain --limit-refs 2
+KristenAndSara_1280x720_60.y4m,--preset superfast --min-cu-size 16 --qg-size 16 --limit-refs 1
+KristenAndSara_1280x720_60.y4m,--preset medium --no-cutree --max-tu-size 16
+KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0 --limit-modes
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset superfast --tune psnr
-News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
+NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain --limit-refs 2
+NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-mode=save --bitrate 9000,--preset slow --no-cutree --analysis-mode=load --bitrate 9000
+News-4k.y4m,--preset ultrafast --no-cutree --analysis-mode=save --bitrate 15000,--preset ultrafast --no-cutree --analysis-mode=load --bitrate 15000
 News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0
+News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
+OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset medium --no-weightp
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune fastdecode
-OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
+ParkScene_1920x1080_24_10bit_444.yuv,--preset superfast --weightp --lookahead-slices 4
 ParkScene_1920x1080_24.y4m,--preset medium --qp 40 --rdpenalty 2 --tu-intra-depth 3
 ParkScene_1920x1080_24.y4m,--preset slower --no-weightp
-ParkScene_1920x1080_24_10bit_444.yuv,--preset superfast --weightp --lookahead-slices 4
+RaceHorses_416x240_30.y4m,--preset superfast --no-cutree
 RaceHorses_416x240_30.y4m,--preset medium --tskip-fast --tskip
 RaceHorses_416x240_30.y4m,--preset slower --keyint -1 --rdoq-level 0
-RaceHorses_416x240_30.y4m,--preset superfast --no-cutree
 RaceHorses_416x240_30.y4m,--preset veryslow --tskip-fast --tskip --limit-refs 3
-RaceHorses_416x240_30_10bit.yuv,--preset fast --lookahead-slices 2 --b-intra --limit-refs 1
-RaceHorses_416x240_30_10bit.yuv,--preset faster --rdoq-level 0 --dither
-RaceHorses_416x240_30_10bit.yuv,--preset slow --tune grain
 RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --tune psnr --limit-refs 1
 RaceHorses_416x240_30_10bit.yuv,--preset veryfast --weightb
+RaceHorses_416x240_30_10bit.yuv,--preset faster --rdoq-level 0 --dither
+RaceHorses_416x240_30_10bit.yuv,--preset fast --lookahead-slices 2 --b-intra --limit-refs 1
+RaceHorses_416x240_30_10bit.yuv,--preset slow --tune grain  --limit-modes
 RaceHorses_416x240_30_10bit.yuv,--preset placebo --limit-refs 1
 SteamLocomotiveTrain_2560x1600_60_10bit_crop.yuv,--preset medium --dither
-big_buck_bunny_360p24.y4m,--preset faster --keyint 240 --min-keyint 60 --rc-lookahead 200
-big_buck_bunny_360p24.y4m,--preset medium --keyint 60 --min-keyint 48 --weightb --limit-refs 3
-big_buck_bunny_360p24.y4m,--preset slow --psy-rdoq 2.0 --rdoq-level 1 --no-b-intra
-big_buck_bunny_360p24.y4m,--preset superfast --psy-rdoq 2.0
 big_buck_bunny_360p24.y4m,--preset ultrafast --deblock=2
+big_buck_bunny_360p24.y4m,--preset superfast --psy-rdoq 2.0 --aq-mode 3
 big_buck_bunny_360p24.y4m,--preset veryfast --no-deblock
-city_4cif_60fps.y4m,--preset medium --crf 4 --cu-lossless --sao-non-deblock
+big_buck_bunny_360p24.y4m,--preset faster --keyint 240 --min-keyint 60 --rc-lookahead 200
+big_buck_bunny_360p24.y4m,--preset medium --keyint 60 --min-keyint 48 --weightb --limit-refs 3
+big_buck_bunny_360p24.y4m,--preset slow --psy-rdoq 2.0 --rdoq-level 1 --no-b-intra --aq-mode 3
 city_4cif_60fps.y4m,--preset superfast --rdpenalty 1 --tu-intra-depth 2
+city_4cif_60fps.y4m,--preset medium --crf 4 --cu-lossless --sao-non-deblock
 city_4cif_60fps.y4m,--preset slower --scaling-list default
 city_4cif_60fps.y4m,--preset veryslow --rdpenalty 2 --sao-non-deblock --no-b-intra --limit-refs 0
-ducks_take_off_420_720p50.y4m,--preset fast --deblock 6 --bframes 16 --rc-lookahead 40
+ducks_take_off_420_720p50.y4m,--preset ultrafast --constrained-intra --rd 1
+ducks_take_off_444_720p50.y4m,--preset superfast --weightp --limit-refs 2
 ducks_take_off_420_720p50.y4m,--preset faster --qp 24 --deblock -6 --limit-refs 2
+ducks_take_off_420_720p50.y4m,--preset fast --deblock 6 --bframes 16 --rc-lookahead 40
 ducks_take_off_420_720p50.y4m,--preset medium --tskip --tskip-fast --constrained-intra
-ducks_take_off_420_720p50.y4m,--preset slow --scaling-list default --qp 40
-ducks_take_off_420_720p50.y4m,--preset ultrafast --constrained-intra --rd 1
-ducks_take_off_420_720p50.y4m,--preset veryslow --constrained-intra --bframes 2
 ducks_take_off_444_720p50.y4m,--preset medium --qp 38 --no-scenecut
-ducks_take_off_444_720p50.y4m,--preset superfast --weightp --rd 0 --limit-refs 2
+ducks_take_off_420_720p50.y4m,--preset slow --scaling-list default --qp 40
 ducks_take_off_444_720p50.y4m,--preset slower --psy-rd 1 --psy-rdoq 2.0 --rdoq-level 1 --limit-refs 1
+ducks_take_off_420_720p50.y4m,--preset slower --no-wpp
+ducks_take_off_420_720p50.y4m,--preset veryslow --constrained-intra --bframes 2
+mobile_calendar_422_ntsc.y4m,--preset superfast --weightp
 mobile_calendar_422_ntsc.y4m,--preset medium --bitrate 500 -F4
 mobile_calendar_422_ntsc.y4m,--preset slower --tskip --tskip-fast
-mobile_calendar_422_ntsc.y4m,--preset superfast --weightp --rd 0
 mobile_calendar_422_ntsc.y4m,--preset veryslow --tskip --limit-refs 2
+old_town_cross_444_720p50.y4m,--preset ultrafast --weightp --min-cu 32
+old_town_cross_444_720p50.y4m,--preset superfast --weightp --min-cu 16 --limit-modes
+old_town_cross_444_720p50.y4m,--preset veryfast --qp 1 --tune ssim
 old_town_cross_444_720p50.y4m,--preset faster --rd 1 --tune zero-latency
+old_town_cross_444_720p50.y4m,--preset fast --no-cutree --analysis-mode=save --bitrate 3000 --early-skip,--preset fast --no-cutree --analysis-mode=load --bitrate 3000 --early-skip
 old_town_cross_444_720p50.y4m,--preset medium --keyint -1 --no-weightp --ref 6
 old_town_cross_444_720p50.y4m,--preset slow --rdoq-level 1 --early-skip --ref 7 --no-b-pyramid
 old_town_cross_444_720p50.y4m,--preset slower --crf 4 --cu-lossless
-old_town_cross_444_720p50.y4m,--preset superfast --weightp --min-cu 16
-old_town_cross_444_720p50.y4m,--preset ultrafast --weightp --min-cu 32
-old_town_cross_444_720p50.y4m,--preset veryfast --qp 1 --tune ssim
 parkrun_ter_720p50.y4m,--preset medium --no-open-gop --sao-non-deblock --crf 4 --cu-lossless
 parkrun_ter_720p50.y4m,--preset slower --fast-intra --no-rect --tune grain
-silent_cif_420.y4m,--preset medium --me full --rect --amp
 silent_cif_420.y4m,--preset superfast --weightp --rect
+silent_cif_420.y4m,--preset medium --me full --rect --amp
 silent_cif_420.y4m,--preset placebo --ctu 32 --no-sao --qg-size 16
-vtc1nw_422_ntsc.y4m,--preset medium --scaling-list default --ctu 16 --ref 5
-vtc1nw_422_ntsc.y4m,--preset slower --nr-inter 1000 -F4 --tune fast-decode --qg-size 16
+washdc_422_ntsc.y4m,--preset ultrafast --weightp --tu-intra-depth 4
 vtc1nw_422_ntsc.y4m,--preset superfast --weightp --nr-intra 100 -F4
-washdc_422_ntsc.y4m,--preset faster --rdoq-level 1 --max-merge 5
-washdc_422_ntsc.y4m,--preset medium --no-weightp --max-tu-size 4 --limit-refs 1
-washdc_422_ntsc.y4m,--preset slower --psy-rdoq 2.0 --rdoq-level 2 --qg-size 32 --limit-refs 1
 washdc_422_ntsc.y4m,--preset superfast --psy-rd 1 --tune zerolatency
-washdc_422_ntsc.y4m,--preset ultrafast --weightp --tu-intra-depth 4
 washdc_422_ntsc.y4m,--preset veryfast --tu-inter-depth 4
-washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless --limit-refs 3
-BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 15000,--preset medium --no-cutree --analysis-mode=load --bitrate 13000,--preset medium --no-cutree --analysis-mode=load --bitrate 11000,--preset medium --no-cutree --analysis-mode=load --bitrate 9000,--preset medium --no-cutree --analysis-mode=load --bitrate 7000
-NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-mode=save --bitrate 15000,--preset slow --no-cutree --analysis-mode=load --bitrate 13000,--preset slow --no-cutree --analysis-mode=load --bitrate 11000,--preset slow --no-cutree --analysis-mode=load --bitrate 9000,--preset slow --no-cutree --analysis-mode=load --bitrate 7000
-old_town_cross_444_720p50.y4m,--preset veryslow --no-cutree --analysis-mode=save --bitrate 15000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 13000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 11000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 9000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 7000 --early-skip
-Johnny_1280x720_60.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 15000 --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 13000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 11000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 9000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 7000  --tskip-fast
-BasketballDrive_1920x1080_50.y4m,--preset medium --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
-FourPeople_1280x720_60.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
-FourPeople_1280x720_60.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
+washdc_422_ntsc.y4m,--preset faster --rdoq-level 1 --max-merge 5
+vtc1nw_422_ntsc.y4m,--preset medium --scaling-list default --ctu 16 --ref 5
+washdc_422_ntsc.y4m,--preset medium --no-weightp --max-tu-size 4 --limit-refs 1 --aq-mode 2
+vtc1nw_422_ntsc.y4m,--preset slower --nr-inter 1000 -F4 --tune fast-decode --qg-size 16
+washdc_422_ntsc.y4m,--preset slower --psy-rdoq 2.0 --rdoq-level 2 --qg-size 32 --limit-refs 1
+washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless --limit-refs 3 --limit-modes
+
+# Main12 intraCost overflow bug test
+720p50_parkrun_ter.y4m,--preset medium
 
 # interlace test, even though input YUV is not field separated
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --interlace bff
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --interlace tff
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --interlace bff
 
 # vim: tw=200

 
@@ -11,124 +11,132 @@
 # consistent across many machines, you must force a certain -FN so it is
 # not auto-detected.
 
+BasketballDrive_1920x1080_50.y4m,--preset ultrafast --signhide --colormatrix bt709
+BasketballDrive_1920x1080_50.y4m,--preset superfast --psy-rd 1 --ctu 16 --no-wpp --limit-modes
+BasketballDrive_1920x1080_50.y4m,--preset veryfast --tune zerolatency --no-temporal-mvp
 BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190
 BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless
 BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao
+BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 7000 --limit-modes,--preset medium --no-cutree --analysis-mode=load --bitrate 7000 --limit-modes
 BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16 --limit-refs 1
 BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0
-BasketballDrive_1920x1080_50.y4m,--preset superfast --psy-rd 1 --ctu 16 --no-wpp
-BasketballDrive_1920x1080_50.y4m,--preset ultrafast --signhide --colormatrix bt709
-BasketballDrive_1920x1080_50.y4m,--preset veryfast --tune zerolatency --no-temporal-mvp
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1
+BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-mode=save --bitrate 7000,--preset slower --no-cutree --analysis-mode=load --bitrate 7000
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-mode=save --bitrate 7000 --tskip-fast,--preset veryslow --no-cutree --analysis-mode=load --bitrate 7000  --tskip-fast
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
+Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
+Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
+Coastguard-4k.y4m,--preset veryfast --no-cutree --analysis-mode=save --bitrate 15000,--preset veryfast --no-cutree --analysis-mode=load --bitrate 15000
 Coastguard-4k.y4m,--preset medium --rdoq-level 1 --tune ssim --no-signhide --me umh
 Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1
-Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --max-tu-size 4 --min-cu-size 32
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset medium --no-wpp --no-cutree --no-strong-intra-smoothing --limit-refs 1
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset slow --no-wpp --tune ssim --transfer smpte240m
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset slower --tune ssim --tune fastdecode --limit-refs 2
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset ultrafast --weightp --no-wpp --no-open-gop
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers --limit-refs 2
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1 --limit-modes
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16
-DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp --qg-size 16
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16 --limit-modes
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
+DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3
-DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
-FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
 FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2
+FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
+FourPeople_1280x720_60.y4m,--preset medium --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
+FourPeople_1280x720_60.y4m,--preset veryslow --numa-pools "none"
+Keiba_832x480_30.y4m,--preset superfast --no-fast-intra --nr-intra 1000 -F4
 Keiba_832x480_30.y4m,--preset medium --pmode --tune grain
 Keiba_832x480_30.y4m,--preset slower --fast-intra --nr-inter 500 -F4 --limit-refs 0
-Keiba_832x480_30.y4m,--preset superfast --no-fast-intra --nr-intra 1000 -F4
-Kimono1_1920x1080_24_10bit_444.yuv,--preset medium --min-cu-size 32
 Kimono1_1920x1080_24_10bit_444.yuv,--preset superfast --weightb
-KristenAndSara_1280x720_60.y4m,--preset medium --no-cutree --max-tu-size 16
-KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0
-KristenAndSara_1280x720_60.y4m,--preset superfast --min-cu-size 16 --qg-size 16 --limit-refs 1
+Kimono1_1920x1080_24_10bit_444.yuv,--preset medium --min-cu-size 32
 KristenAndSara_1280x720_60.y4m,--preset ultrafast --strong-intra-smoothing
-NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain --limit-refs 2
+KristenAndSara_1280x720_60.y4m,--preset superfast --min-cu-size 16 --qg-size 16 --limit-refs 1
+KristenAndSara_1280x720_60.y4m,--preset medium --no-cutree --max-tu-size 16
+KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0 --limit-modes
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset superfast --tune psnr
-News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
+NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain --limit-refs 2
+NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-mode=save --bitrate 9000,--preset slow --no-cutree --analysis-mode=load --bitrate 9000
+News-4k.y4m,--preset ultrafast --no-cutree --analysis-mode=save --bitrate 15000,--preset ultrafast --no-cutree --analysis-mode=load --bitrate 15000
 News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0
+News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
+OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset medium --no-weightp
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune fastdecode
-OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
+ParkScene_1920x1080_24_10bit_444.yuv,--preset superfast --weightp --lookahead-slices 4
 ParkScene_1920x1080_24.y4m,--preset medium --qp 40 --rdpenalty 2 --tu-intra-depth 3
 ParkScene_1920x1080_24.y4m,--preset slower --no-weightp
-ParkScene_1920x1080_24_10bit_444.yuv,--preset superfast --weightp --lookahead-slices 4
+RaceHorses_416x240_30.y4m,--preset superfast --no-cutree
 RaceHorses_416x240_30.y4m,--preset medium --tskip-fast --tskip
 RaceHorses_416x240_30.y4m,--preset slower --keyint -1 --rdoq-level 0
-RaceHorses_416x240_30.y4m,--preset superfast --no-cutree
 RaceHorses_416x240_30.y4m,--preset veryslow --tskip-fast --tskip --limit-refs 3
-RaceHorses_416x240_30_10bit.yuv,--preset fast --lookahead-slices 2 --b-intra --limit-refs 1
-RaceHorses_416x240_30_10bit.yuv,--preset faster --rdoq-level 0 --dither
-RaceHorses_416x240_30_10bit.yuv,--preset slow --tune grain
 RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --tune psnr --limit-refs 1
 RaceHorses_416x240_30_10bit.yuv,--preset veryfast --weightb
+RaceHorses_416x240_30_10bit.yuv,--preset faster --rdoq-level 0 --dither
+RaceHorses_416x240_30_10bit.yuv,--preset fast --lookahead-slices 2 --b-intra --limit-refs 1
+RaceHorses_416x240_30_10bit.yuv,--preset slow --tune grain  --limit-modes
 RaceHorses_416x240_30_10bit.yuv,--preset placebo --limit-refs 1
 SteamLocomotiveTrain_2560x1600_60_10bit_crop.yuv,--preset medium --dither
-big_buck_bunny_360p24.y4m,--preset faster --keyint 240 --min-keyint 60 --rc-lookahead 200
-big_buck_bunny_360p24.y4m,--preset medium --keyint 60 --min-keyint 48 --weightb --limit-refs 3
-big_buck_bunny_360p24.y4m,--preset slow --psy-rdoq 2.0 --rdoq-level 1 --no-b-intra
-big_buck_bunny_360p24.y4m,--preset superfast --psy-rdoq 2.0
 big_buck_bunny_360p24.y4m,--preset ultrafast --deblock=2
+big_buck_bunny_360p24.y4m,--preset superfast --psy-rdoq 2.0 --aq-mode 3
 big_buck_bunny_360p24.y4m,--preset veryfast --no-deblock
-city_4cif_60fps.y4m,--preset medium --crf 4 --cu-lossless --sao-non-deblock
+big_buck_bunny_360p24.y4m,--preset faster --keyint 240 --min-keyint 60 --rc-lookahead 200
+big_buck_bunny_360p24.y4m,--preset medium --keyint 60 --min-keyint 48 --weightb --limit-refs 3
+big_buck_bunny_360p24.y4m,--preset slow --psy-rdoq 2.0 --rdoq-level 1 --no-b-intra --aq-mode 3
 city_4cif_60fps.y4m,--preset superfast --rdpenalty 1 --tu-intra-depth 2
+city_4cif_60fps.y4m,--preset medium --crf 4 --cu-lossless --sao-non-deblock
 city_4cif_60fps.y4m,--preset slower --scaling-list default
 city_4cif_60fps.y4m,--preset veryslow --rdpenalty 2 --sao-non-deblock --no-b-intra --limit-refs 0
-ducks_take_off_420_720p50.y4m,--preset fast --deblock 6 --bframes 16 --rc-lookahead 40
+ducks_take_off_420_720p50.y4m,--preset ultrafast --constrained-intra --rd 1
+ducks_take_off_444_720p50.y4m,--preset superfast --weightp --limit-refs 2
 ducks_take_off_420_720p50.y4m,--preset faster --qp 24 --deblock -6 --limit-refs 2
+ducks_take_off_420_720p50.y4m,--preset fast --deblock 6 --bframes 16 --rc-lookahead 40
 ducks_take_off_420_720p50.y4m,--preset medium --tskip --tskip-fast --constrained-intra
-ducks_take_off_420_720p50.y4m,--preset slow --scaling-list default --qp 40
-ducks_take_off_420_720p50.y4m,--preset ultrafast --constrained-intra --rd 1
-ducks_take_off_420_720p50.y4m,--preset veryslow --constrained-intra --bframes 2
 ducks_take_off_444_720p50.y4m,--preset medium --qp 38 --no-scenecut
-ducks_take_off_444_720p50.y4m,--preset superfast --weightp --rd 0 --limit-refs 2
+ducks_take_off_420_720p50.y4m,--preset slow --scaling-list default --qp 40
 ducks_take_off_444_720p50.y4m,--preset slower --psy-rd 1 --psy-rdoq 2.0 --rdoq-level 1 --limit-refs 1
+ducks_take_off_420_720p50.y4m,--preset slower --no-wpp
+ducks_take_off_420_720p50.y4m,--preset veryslow --constrained-intra --bframes 2
+mobile_calendar_422_ntsc.y4m,--preset superfast --weightp
 mobile_calendar_422_ntsc.y4m,--preset medium --bitrate 500 -F4
 mobile_calendar_422_ntsc.y4m,--preset slower --tskip --tskip-fast
-mobile_calendar_422_ntsc.y4m,--preset superfast --weightp --rd 0
 mobile_calendar_422_ntsc.y4m,--preset veryslow --tskip --limit-refs 2
+old_town_cross_444_720p50.y4m,--preset ultrafast --weightp --min-cu 32
+old_town_cross_444_720p50.y4m,--preset superfast --weightp --min-cu 16 --limit-modes
+old_town_cross_444_720p50.y4m,--preset veryfast --qp 1 --tune ssim
 old_town_cross_444_720p50.y4m,--preset faster --rd 1 --tune zero-latency
+old_town_cross_444_720p50.y4m,--preset fast --no-cutree --analysis-mode=save --bitrate 3000 --early-skip,--preset fast --no-cutree --analysis-mode=load --bitrate 3000 --early-skip
 old_town_cross_444_720p50.y4m,--preset medium --keyint -1 --no-weightp --ref 6
 old_town_cross_444_720p50.y4m,--preset slow --rdoq-level 1 --early-skip --ref 7 --no-b-pyramid
 old_town_cross_444_720p50.y4m,--preset slower --crf 4 --cu-lossless
-old_town_cross_444_720p50.y4m,--preset superfast --weightp --min-cu 16
-old_town_cross_444_720p50.y4m,--preset ultrafast --weightp --min-cu 32
-old_town_cross_444_720p50.y4m,--preset veryfast --qp 1 --tune ssim
 parkrun_ter_720p50.y4m,--preset medium --no-open-gop --sao-non-deblock --crf 4 --cu-lossless
 parkrun_ter_720p50.y4m,--preset slower --fast-intra --no-rect --tune grain
-silent_cif_420.y4m,--preset medium --me full --rect --amp
 silent_cif_420.y4m,--preset superfast --weightp --rect
+silent_cif_420.y4m,--preset medium --me full --rect --amp
 silent_cif_420.y4m,--preset placebo --ctu 32 --no-sao --qg-size 16
-vtc1nw_422_ntsc.y4m,--preset medium --scaling-list default --ctu 16 --ref 5
-vtc1nw_422_ntsc.y4m,--preset slower --nr-inter 1000 -F4 --tune fast-decode --qg-size 16
+washdc_422_ntsc.y4m,--preset ultrafast --weightp --tu-intra-depth 4
 vtc1nw_422_ntsc.y4m,--preset superfast --weightp --nr-intra 100 -F4
-washdc_422_ntsc.y4m,--preset faster --rdoq-level 1 --max-merge 5
-washdc_422_ntsc.y4m,--preset medium --no-weightp --max-tu-size 4 --limit-refs 1
-washdc_422_ntsc.y4m,--preset slower --psy-rdoq 2.0 --rdoq-level 2 --qg-size 32 --limit-refs 1
 washdc_422_ntsc.y4m,--preset superfast --psy-rd 1 --tune zerolatency
-washdc_422_ntsc.y4m,--preset ultrafast --weightp --tu-intra-depth 4
 washdc_422_ntsc.y4m,--preset veryfast --tu-inter-depth 4
-washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless --limit-refs 3
-BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 15000,--preset medium --no-cutree --analysis-mode=load --bitrate 13000,--preset medium --no-cutree --analysis-mode=load --bitrate 11000,--preset medium --no-cutree --analysis-mode=load --bitrate 9000,--preset medium --no-cutree --analysis-mode=load --bitrate 7000
-NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-mode=save --bitrate 15000,--preset slow --no-cutree --analysis-mode=load --bitrate 13000,--preset slow --no-cutree --analysis-mode=load --bitrate 11000,--preset slow --no-cutree --analysis-mode=load --bitrate 9000,--preset slow --no-cutree --analysis-mode=load --bitrate 7000
-old_town_cross_444_720p50.y4m,--preset veryslow --no-cutree --analysis-mode=save --bitrate 15000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 13000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 11000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 9000 --early-skip,--preset veryslow --no-cutree --analysis-mode=load --bitrate 7000 --early-skip
-Johnny_1280x720_60.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 15000 --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 13000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 11000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 9000  --tskip-fast,--preset medium --no-cutree --analysis-mode=load --bitrate 7000  --tskip-fast
-BasketballDrive_1920x1080_50.y4m,--preset medium --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
-FourPeople_1280x720_60.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
-FourPeople_1280x720_60.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
+washdc_422_ntsc.y4m,--preset faster --rdoq-level 1 --max-merge 5
+vtc1nw_422_ntsc.y4m,--preset medium --scaling-list default --ctu 16 --ref 5
+washdc_422_ntsc.y4m,--preset medium --no-weightp --max-tu-size 4 --limit-refs 1 --aq-mode 2
+vtc1nw_422_ntsc.y4m,--preset slower --nr-inter 1000 -F4 --tune fast-decode --qg-size 16
+washdc_422_ntsc.y4m,--preset slower --psy-rdoq 2.0 --rdoq-level 2 --qg-size 32 --limit-refs 1
+washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless --limit-refs 3 --limit-modes
+
+# Main12 intraCost overflow bug test
+720p50_parkrun_ter.y4m,--preset medium
 
 # interlace test, even though input YUV is not field separated
-CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --interlace bff
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --interlace tff
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --interlace bff
 
 # vim: tw=200
​

x265_1.8.tar.gz/source/test/smoke-tests.txt -> x265_1.9.tar.gz/source/test/smoke-tests.txt Changed

 
@@ -19,3 +19,6 @@
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset=medium --max-tu-size 16
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryfast --min-cu 16
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=fast --weightb --interlace bff
+
+# Main12 intraCost overflow bug test
+720p50_parkrun_ter.y4m,--preset medium
​

x265_1.8.tar.gz/source/test/testbench.cpp -> x265_1.9.tar.gz/source/test/testbench.cpp Changed

 
@@ -4,6 +4,7 @@
  * Authors: Gopu Govindaswamy <gopu@govindaswamy.org>
  *          Mandar Gurav <mandar@multicorewareinc.com>
  *          Mahesh Pittala <mahesh@multicorewareinc.com>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
​

x265_1.8.tar.gz/source/test/testharness.h -> x265_1.9.tar.gz/source/test/testharness.h Changed

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
​

x265_1.8.tar.gz/source/x265-extras.cpp -> x265_1.9.tar.gz/source/x265-extras.cpp Changed

@@ -36,7 +36,7 @@
     "I count, I ave-QP, I kbps, I-PSNR Y, I-PSNR U, I-PSNR V, I-SSIM (dB), "
     "P count, P ave-QP, P kbps, P-PSNR Y, P-PSNR U, P-PSNR V, P-SSIM (dB), "
     "B count, B ave-QP, B kbps, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), "
-    "Version\n";
+    "MaxCLL, MaxFALL, Version\n";
 
 FILE* x265_csvlog_open(const x265_api& api, const x265_param& param, const char* fname, int level)
 {
@@ -61,54 +61,58 @@
         {
             if (level)
             {
-                fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, ");
+                fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, ");
                 if (param.rc.rateControlMode == X265_RC_CRF)
                     fprintf(csvfp, "RateFactor, ");
-                fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, SSIM, SSIM (dB),  List 0, List 1");
-                /* detailed performance statistics */
-                fprintf(csvfp, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms), Stall Time (ms), Avg WPP, Row Blocks");
-                if (level >= 2)
+                if (param.bEnablePsnr)
+                    fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, ");
+                if (param.bEnableSsim)
+                    fprintf(csvfp, "SSIM, SSIM(dB), ");
+                fprintf(csvfp, "Latency, ");
+                fprintf(csvfp, "List 0, List 1");
+                uint32_t size = param.maxCUSize;
+                for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+                {
+                    fprintf(csvfp, ", Intra %dx%d DC, Intra %dx%d Planar, Intra %dx%d Ang", size, size, size, size, size, size);
+                    size /= 2;
+                }
+                fprintf(csvfp, ", 4x4");
+                size = param.maxCUSize;
+                if (param.bEnableRectInter)
                 {
-                    uint32_t size = param.maxCUSize;
-                    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-                    {
-                        fprintf(csvfp, ", Intra %dx%d DC, Intra %dx%d Planar, Intra %dx%d Ang", size, size, size, size, size, size);
-                        size /= 2;
-                    }
-                    fprintf(csvfp, ", 4x4");
-                    size = param.maxCUSize;
-                    if (param.bEnableRectInter)
-                    {
-                        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-                        {
-                            fprintf(csvfp, ", Inter %dx%d, Inter %dx%d (Rect)", size, size, size, size);
-                            if (param.bEnableAMP)
-                                fprintf(csvfp, ", Inter %dx%d (Amp)", size, size);
-                            size /= 2;
-                        }
-                    }
-                    else
-                    {
-                        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-                        {
-                            fprintf(csvfp, ", Inter %dx%d", size, size);
-                            size /= 2;
-                        }
-                    }
-                    size = param.maxCUSize;
                     for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
                     {
-                        fprintf(csvfp, ", Skip %dx%d", size, size);
+                        fprintf(csvfp, ", Inter %dx%d, Inter %dx%d (Rect)", size, size, size, size);
+                        if (param.bEnableAMP)
+                            fprintf(csvfp, ", Inter %dx%d (Amp)", size, size);
                         size /= 2;
                     }
-                    size = param.maxCUSize;
+                }
+                else
+                {
                     for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
                     {
-                        fprintf(csvfp, ", Merge %dx%d", size, size);
+                        fprintf(csvfp, ", Inter %dx%d", size, size);
                         size /= 2;
                     }
-                    fprintf(csvfp, ", Avg Luma Distortion, Avg Chroma Distortion, Avg psyEnergy, Avg Luma Level, Max Luma Level");
                 }
+                size = param.maxCUSize;
+                for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+                {
+                    fprintf(csvfp, ", Skip %dx%d", size, size);
+                    size /= 2;
+                }
+                size = param.maxCUSize;
+                for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+                {
+                    fprintf(csvfp, ", Merge %dx%d", size, size);
+                    size /= 2;
+                }
+                fprintf(csvfp, ", Avg Luma Distortion, Avg Chroma Distortion, Avg psyEnergy, Avg Luma Level, Max Luma Level, Avg Residual Energy");
+
+                /* detailed performance statistics */
+                if (level >= 2)
+                    fprintf(csvfp, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms), Stall Time (ms), Avg WPP, Row Blocks");
                 fprintf(csvfp, "\n");
             }
             else
@@ -125,17 +129,14 @@
         return;
 
     const x265_frame_stats* frameStats = &pic.frameData;
-    fprintf(csvfp, "%d, %c-SLICE, %4d, %2.2lf, %10d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc, frameStats->qp, (int)frameStats->bits);
+    fprintf(csvfp, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc, frameStats->qp, (int)frameStats->bits, frameStats->bScenecut);
     if (param.rc.rateControlMode == X265_RC_CRF)
         fprintf(csvfp, "%.3lf,", frameStats->rateFactor);
     if (param.bEnablePsnr)
         fprintf(csvfp, "%.3lf, %.3lf, %.3lf, %.3lf,", frameStats->psnrY, frameStats->psnrU, frameStats->psnrV, frameStats->psnr);
-    else
-        fputs(" -, -, -, -,", csvfp);
     if (param.bEnableSsim)
         fprintf(csvfp, " %.6f, %6.3f,", frameStats->ssim, x265_ssim2dB(frameStats->ssim));
-    else
-        fputs(" -, -,", csvfp);
+    fprintf(csvfp, "%d, ", frameStats->frameLatency);
     if (frameStats->sliceType == 'I')
         fputs(" -, -,", csvfp);
     else
@@ -154,32 +155,33 @@
         else
             fputs(" -,", csvfp);
     }
-    fprintf(csvfp, " %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf,", frameStats->decideWaitTime, frameStats->row0WaitTime, frameStats->wallTime, frameStats->refWaitWallTime, frameStats->totalCTUTime, frameStats->stallTime);
-    fprintf(csvfp, " %.3lf, %d", frameStats->avgWPP, frameStats->countRowBlocks);
-    if (level >= 2)
+    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+        fprintf(csvfp, "%5.2lf%%, %5.2lf%%, %5.2lf%%,", frameStats->cuStats.percentIntraDistribution[depth][0], frameStats->cuStats.percentIntraDistribution[depth][1], frameStats->cuStats.percentIntraDistribution[depth][2]);
+    fprintf(csvfp, "%5.2lf%%", frameStats->cuStats.percentIntraNxN);
+    if (param.bEnableRectInter)
     {
         for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-            fprintf(csvfp, ", %5.2lf%%, %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentIntraDistribution[depth][0], frameStats->cuStats.percentIntraDistribution[depth][1], frameStats->cuStats.percentIntraDistribution[depth][2]);
-        fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentIntraNxN);
-        if (param.bEnableRectInter)
         {
-            for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-            {
-                fprintf(csvfp, ", %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0], frameStats->cuStats.percentInterDistribution[depth][1]);
-                if (param.bEnableAMP)
-                    fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][2]);
-            }
+            fprintf(csvfp, ", %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0], frameStats->cuStats.percentInterDistribution[depth][1]);
+            if (param.bEnableAMP)
+                fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][2]);
         }
-        else
-        {
-            for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-                fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0]);
-        }
-        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentSkipCu[depth]);
+    }
+    else
+    {
         for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentMergeCu[depth]);
-        fprintf(csvfp, ", %.2lf, %.2lf, %.2lf, %.2lf, %d", frameStats->avgLumaDistortion, frameStats->avgChromaDistortion, frameStats->avgPsyEnergy, frameStats->avgLumaLevel, frameStats->maxLumaLevel);
+            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0]);
+    }
+    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+        fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentSkipCu[depth]);
+    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+        fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentMergeCu[depth]);
+    fprintf(csvfp, ", %.2lf, %.2lf, %.2lf, %.2lf, %d, %.2lf", frameStats->avgLumaDistortion, frameStats->avgChromaDistortion, frameStats->avgPsyEnergy, frameStats->avgLumaLevel, frameStats->maxLumaLevel, frameStats->avgResEnergy);
+
+    if (level >= 2)
+    {
+        fprintf(csvfp, ", %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf,", frameStats->decideWaitTime, frameStats->row0WaitTime, frameStats->wallTime, frameStats->refWaitWallTime, frameStats->totalCTUTime, frameStats->stallTime);
+        fprintf(csvfp, " %.3lf, %d", frameStats->avgWPP, frameStats->countRowBlocks);
     }
     fprintf(csvfp, "\n");
     fflush(stderr);
@@ -198,11 +200,13 @@
     }
 
     // CLI arguments or other
+    fputc('"', csvfp);
     for (int i = 1; i < argc; i++)
     {
-        if (i) fputc(' ', csvfp);
+        fputc(' ', csvfp);
         fputs(argv[i], csvfp);
     }
+    fputc('"', csvfp);
 
     // current date and time
     time_t now;
@@ -273,7 +277,7 @@
     else
         fprintf(csvfp, " -, -, -, -, -, -, -,");
 
-    fprintf(csvfp, " %s\n", api.version_str);
+    fprintf(csvfp, " %-6u, %-6u, %s\n", stats.maxCLL, stats.maxFALL, api.version_str);

 
@@ -36,7 +36,7 @@
     "I count, I ave-QP, I kbps, I-PSNR Y, I-PSNR U, I-PSNR V, I-SSIM (dB), "
     "P count, P ave-QP, P kbps, P-PSNR Y, P-PSNR U, P-PSNR V, P-SSIM (dB), "
     "B count, B ave-QP, B kbps, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), "
-    "Version\n";
+    "MaxCLL, MaxFALL, Version\n";
 
 FILE* x265_csvlog_open(const x265_api& api, const x265_param& param, const char* fname, int level)
 {
@@ -61,54 +61,58 @@
         {
             if (level)
             {
-                fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, ");
+                fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, ");
                 if (param.rc.rateControlMode == X265_RC_CRF)
                     fprintf(csvfp, "RateFactor, ");
-                fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, SSIM, SSIM (dB),  List 0, List 1");
-                /* detailed performance statistics */
-                fprintf(csvfp, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms), Stall Time (ms), Avg WPP, Row Blocks");
-                if (level >= 2)
+                if (param.bEnablePsnr)
+                    fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, ");
+                if (param.bEnableSsim)
+                    fprintf(csvfp, "SSIM, SSIM(dB), ");
+                fprintf(csvfp, "Latency, ");
+                fprintf(csvfp, "List 0, List 1");
+                uint32_t size = param.maxCUSize;
+                for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+                {
+                    fprintf(csvfp, ", Intra %dx%d DC, Intra %dx%d Planar, Intra %dx%d Ang", size, size, size, size, size, size);
+                    size /= 2;
+                }
+                fprintf(csvfp, ", 4x4");
+                size = param.maxCUSize;
+                if (param.bEnableRectInter)
                 {
-                    uint32_t size = param.maxCUSize;
-                    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-                    {
-                        fprintf(csvfp, ", Intra %dx%d DC, Intra %dx%d Planar, Intra %dx%d Ang", size, size, size, size, size, size);
-                        size /= 2;
-                    }
-                    fprintf(csvfp, ", 4x4");
-                    size = param.maxCUSize;
-                    if (param.bEnableRectInter)
-                    {
-                        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-                        {
-                            fprintf(csvfp, ", Inter %dx%d, Inter %dx%d (Rect)", size, size, size, size);
-                            if (param.bEnableAMP)
-                                fprintf(csvfp, ", Inter %dx%d (Amp)", size, size);
-                            size /= 2;
-                        }
-                    }
-                    else
-                    {
-                        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-                        {
-                            fprintf(csvfp, ", Inter %dx%d", size, size);
-                            size /= 2;
-                        }
-                    }
-                    size = param.maxCUSize;
                     for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
                     {
-                        fprintf(csvfp, ", Skip %dx%d", size, size);
+                        fprintf(csvfp, ", Inter %dx%d, Inter %dx%d (Rect)", size, size, size, size);
+                        if (param.bEnableAMP)
+                            fprintf(csvfp, ", Inter %dx%d (Amp)", size, size);
                         size /= 2;
                     }
-                    size = param.maxCUSize;
+                }
+                else
+                {
                     for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
                     {
-                        fprintf(csvfp, ", Merge %dx%d", size, size);
+                        fprintf(csvfp, ", Inter %dx%d", size, size);
                         size /= 2;
                     }
-                    fprintf(csvfp, ", Avg Luma Distortion, Avg Chroma Distortion, Avg psyEnergy, Avg Luma Level, Max Luma Level");
                 }
+                size = param.maxCUSize;
+                for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+                {
+                    fprintf(csvfp, ", Skip %dx%d", size, size);
+                    size /= 2;
+                }
+                size = param.maxCUSize;
+                for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+                {
+                    fprintf(csvfp, ", Merge %dx%d", size, size);
+                    size /= 2;
+                }
+                fprintf(csvfp, ", Avg Luma Distortion, Avg Chroma Distortion, Avg psyEnergy, Avg Luma Level, Max Luma Level, Avg Residual Energy");
+
+                /* detailed performance statistics */
+                if (level >= 2)
+                    fprintf(csvfp, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms), Stall Time (ms), Avg WPP, Row Blocks");
                 fprintf(csvfp, "\n");
             }
             else
@@ -125,17 +129,14 @@
         return;
 
     const x265_frame_stats* frameStats = &pic.frameData;
-    fprintf(csvfp, "%d, %c-SLICE, %4d, %2.2lf, %10d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc, frameStats->qp, (int)frameStats->bits);
+    fprintf(csvfp, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc, frameStats->qp, (int)frameStats->bits, frameStats->bScenecut);
     if (param.rc.rateControlMode == X265_RC_CRF)
         fprintf(csvfp, "%.3lf,", frameStats->rateFactor);
     if (param.bEnablePsnr)
         fprintf(csvfp, "%.3lf, %.3lf, %.3lf, %.3lf,", frameStats->psnrY, frameStats->psnrU, frameStats->psnrV, frameStats->psnr);
-    else
-        fputs(" -, -, -, -,", csvfp);
     if (param.bEnableSsim)
         fprintf(csvfp, " %.6f, %6.3f,", frameStats->ssim, x265_ssim2dB(frameStats->ssim));
-    else
-        fputs(" -, -,", csvfp);
+    fprintf(csvfp, "%d, ", frameStats->frameLatency);
     if (frameStats->sliceType == 'I')
         fputs(" -, -,", csvfp);
     else
@@ -154,32 +155,33 @@
         else
             fputs(" -,", csvfp);
     }
-    fprintf(csvfp, " %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf,", frameStats->decideWaitTime, frameStats->row0WaitTime, frameStats->wallTime, frameStats->refWaitWallTime, frameStats->totalCTUTime, frameStats->stallTime);
-    fprintf(csvfp, " %.3lf, %d", frameStats->avgWPP, frameStats->countRowBlocks);
-    if (level >= 2)
+    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+        fprintf(csvfp, "%5.2lf%%, %5.2lf%%, %5.2lf%%,", frameStats->cuStats.percentIntraDistribution[depth][0], frameStats->cuStats.percentIntraDistribution[depth][1], frameStats->cuStats.percentIntraDistribution[depth][2]);
+    fprintf(csvfp, "%5.2lf%%", frameStats->cuStats.percentIntraNxN);
+    if (param.bEnableRectInter)
     {
         for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-            fprintf(csvfp, ", %5.2lf%%, %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentIntraDistribution[depth][0], frameStats->cuStats.percentIntraDistribution[depth][1], frameStats->cuStats.percentIntraDistribution[depth][2]);
-        fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentIntraNxN);
-        if (param.bEnableRectInter)
         {
-            for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-            {
-                fprintf(csvfp, ", %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0], frameStats->cuStats.percentInterDistribution[depth][1]);
-                if (param.bEnableAMP)
-                    fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][2]);
-            }
+            fprintf(csvfp, ", %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0], frameStats->cuStats.percentInterDistribution[depth][1]);
+            if (param.bEnableAMP)
+                fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][2]);
         }
-        else
-        {
-            for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-                fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0]);
-        }
-        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentSkipCu[depth]);
+    }
+    else
+    {
         for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
-            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentMergeCu[depth]);
-        fprintf(csvfp, ", %.2lf, %.2lf, %.2lf, %.2lf, %d", frameStats->avgLumaDistortion, frameStats->avgChromaDistortion, frameStats->avgPsyEnergy, frameStats->avgLumaLevel, frameStats->maxLumaLevel);
+            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0]);
+    }
+    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+        fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentSkipCu[depth]);
+    for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+        fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentMergeCu[depth]);
+    fprintf(csvfp, ", %.2lf, %.2lf, %.2lf, %.2lf, %d, %.2lf", frameStats->avgLumaDistortion, frameStats->avgChromaDistortion, frameStats->avgPsyEnergy, frameStats->avgLumaLevel, frameStats->maxLumaLevel, frameStats->avgResEnergy);
+
+    if (level >= 2)
+    {
+        fprintf(csvfp, ", %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf,", frameStats->decideWaitTime, frameStats->row0WaitTime, frameStats->wallTime, frameStats->refWaitWallTime, frameStats->totalCTUTime, frameStats->stallTime);
+        fprintf(csvfp, " %.3lf, %d", frameStats->avgWPP, frameStats->countRowBlocks);
     }
     fprintf(csvfp, "\n");
     fflush(stderr);
@@ -198,11 +200,13 @@
     }
 
     // CLI arguments or other
+    fputc('"', csvfp);
     for (int i = 1; i < argc; i++)
     {
-        if (i) fputc(' ', csvfp);
+        fputc(' ', csvfp);
         fputs(argv[i], csvfp);
     }
+    fputc('"', csvfp);
 
     // current date and time
     time_t now;
@@ -273,7 +277,7 @@
     else
         fprintf(csvfp, " -, -, -, -, -, -, -,");
 
-    fprintf(csvfp, " %s\n", api.version_str);
+    fprintf(csvfp, " %-6u, %-6u, %s\n", stats.maxCLL, stats.maxFALL, api.version_str);
​

x265_1.8.tar.gz/source/x265.cpp -> x265_1.9.tar.gz/source/x265.cpp Changed

 
@@ -486,6 +486,7 @@
             pic_org.forceqp = qp + 1;
         if (type == 'I') pic_org.sliceType = X265_TYPE_IDR;
         else if (type == 'i') pic_org.sliceType = X265_TYPE_I;
+        else if (type == 'K') pic_org.sliceType = param->bOpenGOP ? X265_TYPE_I : X265_TYPE_IDR;
         else if (type == 'P') pic_org.sliceType = X265_TYPE_P;
         else if (type == 'B') pic_org.sliceType = X265_TYPE_BREF;
         else if (type == 'b') pic_org.sliceType = X265_TYPE_B;
​

x265_1.8.tar.gz/source/x265.def.in -> x265_1.9.tar.gz/source/x265.def.in Changed

 
@@ -22,3 +22,4 @@
 x265_cleanup
 x265_api_get_${X265_BUILD}
 x265_api_query
+x265_encoder_intra_refresh
​

x265_1.8.tar.gz/source/x265.h -> x265_1.9.tar.gz/source/x265.h Changed

@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -91,13 +92,15 @@
 /* Stores all analysis data for a single frame */
 typedef struct x265_analysis_data
 {
-    void*            interData;
-    void*            intraData;
+    int64_t          satdCost;
     uint32_t         frameRecordSize;
     uint32_t         poc;
     uint32_t         sliceType;
     uint32_t         numCUsInFrame;
     uint32_t         numPartitions;
+    void*            interData;
+    void*            intraData;
+    int              bScenecut;
 } x265_analysis_data;
 
 /* cu statistics */
@@ -132,6 +135,7 @@
     double           avgLumaDistortion;
     double           avgChromaDistortion;
     double           avgPsyEnergy;
+    double           avgResEnergy;
     double           avgLumaLevel;
     uint64_t         bits;
     int              encoderOrder;
@@ -141,6 +145,8 @@
     int              list1POC[16];
     uint16_t         maxLumaLevel;
     char             sliceType;
+    int              bScenecut;
+    int              frameLatency;
     x265_cu_stats    cuStats;
 } x265_frame_stats;
 
@@ -205,6 +211,13 @@
      * this data structure */
     x265_analysis_data analysisData;
 
+    /* An array of quantizer offsets to be applied to this image during encoding.
+     * These are added on top of the decisions made by rateControl.
+     * Adaptive quantization must be enabled to use this feature. These quantizer
+     * offsets should be given for each 16x16 block. Behavior if quant
+     * offsets differ between encoding passes is undefined. */
+    float            *quantOffsets;
+
     /* Frame level statistics */
     x265_frame_stats frameData;
 
@@ -378,6 +391,8 @@
     x265_sliceType_stats  statsI;               /* statistics of I slice */
     x265_sliceType_stats  statsP;               /* statistics of P slice */
     x265_sliceType_stats  statsB;               /* statistics of B slice */
+    uint16_t              maxCLL;               /* maximum content light level */
+    uint16_t              maxFALL;              /* maximum frame average light level */
 } x265_stats;
 
 /* String values accepted by x265_param_parse() (and CLI) for various parameters */
@@ -604,7 +619,7 @@
 
     /* Enables the emission of a user data SEI with the stream headers which
      * describes the encoder version, build info, and parameters. This is
-     * very helpful for debugging, but may interfere with regression tests. 
+     * very helpful for debugging, but may interfere with regression tests.
      * Default enabled */
     int       bEmitInfoSEI;
 
@@ -664,9 +679,9 @@
     int       bBPyramid;
 
     /* A value which is added to the cost estimate of B frames in the lookahead.
-     * It may be a positive value (making B frames appear more expensive, which
-     * causes the lookahead to chose more P frames) or negative, which makes the
-     * lookahead chose more B frames. Default is 0, there are no limits */
+     * It may be a positive value (making B frames appear less expensive, which
+     * biases the lookahead to choose more B frames) or negative, which makes the
+     * lookahead choose more P frames. Default is 0, there are no limits */
     int       bFrameBias;
 
     /* The number of frames that must be queued in the lookahead before it may
@@ -691,6 +706,11 @@
      * should detect scene cuts. The default (40) is recommended. */
     int       scenecutThreshold;
 
+    /* Replace keyframes by using a column of intra blocks that move across the video
+     * from one side to the other, thereby "refreshing" the image. In effect, instead of a
+     * big keyframe, the keyframe is "spread" over many frames. */
+    int       bIntraRefresh;
+
     /*== Coding Unit (CU) definitions ==*/
 
     /* Maximum CU width and height in pixels.  The size must be 64, 32, or 16.
@@ -810,6 +830,9 @@
      * 4 split CUs at the next lower CU depth.  The two flags may be combined */
     uint32_t  limitReferences;
 
+    /* Limit modes analyzed for each CU using cost metrics from the 4 sub-CUs */
+    uint32_t limitModes;
+
     /* ME search method (DIA, HEX, UMH, STAR, FULL). The search patterns
      * (methods) are sorted in increasing complexity, with diamond being the
      * simplest and fastest and full being the slowest.  DIA, HEX, and UMH were
@@ -920,7 +943,7 @@
     /* Psycho-visual rate-distortion strength. Only has an effect in presets
      * which use RDO. It makes mode decision favor options which preserve the
      * energy of the source, at the cost of lost compression. The value must
-     * be between 0 and 2.0, 1.0 is typical. Default 0.3 */
+     * be between 0 and 5.0, 1.0 is typical. Default 2.0 */
     double    psyRd;
 
     /* Strength of psycho-visual optimizations in quantization. Only has an
@@ -1038,7 +1061,7 @@
 
         /* Enable slow and a more detailed first pass encode in multi pass rate control */
         int       bEnableSlowFirstPass;
-        
+
         /* rate-control overrides */
         int        zoneCount;
         x265_zone* zones;
@@ -1051,14 +1074,14 @@
          * values will affect all encoders in the same process */
         const char* lambdaFileName;
 
-        /* Enable stricter conditions to check bitrate deviations in CBR mode. May compromise 
+        /* Enable stricter conditions to check bitrate deviations in CBR mode. May compromise
          * quality to maintain bitrate adherence */
         int bStrictCbr;
 
-        /* Enable adaptive quantization at CU granularity. This parameter specifies 
-         * the minimum CU size at which QP can be adjusted, i.e. Quantization Group 
-         * (QG) size. Allowed values are 64, 32, 16 provided it falls within the 
-         * inclusuve range [maxCUSize, minCUSize]. Experimental, default: maxCUSize*/
+        /* Enable adaptive quantization at CU granularity. This parameter specifies
+         * the minimum CU size at which QP can be adjusted, i.e. Quantization Group
+         * (QG) size. Allowed values are 64, 32, 16 provided it falls within the
+         * inclusuve range [maxCUSize, minCUSize]. Experimental, default: maxCUSize */
         uint32_t qgSize;
     } rc;
 
@@ -1165,12 +1188,27 @@
      * max,min luminance values. */
     const char* masteringDisplayColorVolume;
 
-    /* Content light level info SEI, specified as a string which is parsed when
-     * the stream header SEI are emitted. The string format is "%hu,%hu" where
-     * %hu are unsigned 16bit integers. The first value is the max content light
-     * level (or 0 if no maximum is indicated), the second value is the maximum
-     * picture average light level (or 0). */
-    const char* contentLightLevelInfo;
+    /* Maximum Content light level(MaxCLL), specified as integer that indicates the
+     * maximum pixel intensity level in units of 1 candela per square metre of the
+     * bitstream. x265 will also calculate MaxCLL programmatically from the input
+     * pixel values and set in the Content light level info SEI */
+    uint16_t maxCLL;
+
+    /* Maximum Frame Average Light Level(MaxFALL), specified as integer that indicates
+     * the maximum frame average intensity level in units of 1 candela per square
+     * metre of the bitstream. x265 will also calculate MaxFALL programmatically
+     * from the input pixel values and set in the Content light level info SEI */
+    uint16_t maxFALL;
+
+    /* Minimum luma level of input source picture, specified as a integer which
+     * would automatically increase any luma values below the specified --min-luma
+     * value to that value. */
+    uint16_t minLuma;
+
+    /* Maximum luma level of input source picture, specified as a integer which
+     * would automatically decrease any luma values above the specified --max-luma
+     * value to that value. */
+    uint16_t maxLuma;
 
 } x265_param;
 
@@ -1211,7 +1249,7 @@
     "main422-10", "main422-10-intra",
     "main444-10", "main444-10-intra",
 
-    "main12",     "main12-intra",                  /* Highly Experimental */
+    "main12",     "main12-intra",
     "main422-12", "main422-12-intra",
     "main444-12", "main444-12-intra",
 
@@ -1347,6 +1385,22 @@
  *      close an encoder handler */
 void x265_encoder_close(x265_encoder *);
 
+/* x265_encoder_intra_refresh:
+ *      If an intra refresh is not in progress, begin one with the next P-frame.
+ *      If an intra refresh is in progress, begin one as soon as the current one finishes.
+ *      Requires bIntraRefresh to be set.
+ *

 
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -91,13 +92,15 @@
 /* Stores all analysis data for a single frame */
 typedef struct x265_analysis_data
 {
-    void*            interData;
-    void*            intraData;
+    int64_t          satdCost;
     uint32_t         frameRecordSize;
     uint32_t         poc;
     uint32_t         sliceType;
     uint32_t         numCUsInFrame;
     uint32_t         numPartitions;
+    void*            interData;
+    void*            intraData;
+    int              bScenecut;
 } x265_analysis_data;
 
 /* cu statistics */
@@ -132,6 +135,7 @@
     double           avgLumaDistortion;
     double           avgChromaDistortion;
     double           avgPsyEnergy;
+    double           avgResEnergy;
     double           avgLumaLevel;
     uint64_t         bits;
     int              encoderOrder;
@@ -141,6 +145,8 @@
     int              list1POC[16];
     uint16_t         maxLumaLevel;
     char             sliceType;
+    int              bScenecut;
+    int              frameLatency;
     x265_cu_stats    cuStats;
 } x265_frame_stats;
 
@@ -205,6 +211,13 @@
      * this data structure */
     x265_analysis_data analysisData;
 
+    /* An array of quantizer offsets to be applied to this image during encoding.
+     * These are added on top of the decisions made by rateControl.
+     * Adaptive quantization must be enabled to use this feature. These quantizer
+     * offsets should be given for each 16x16 block. Behavior if quant
+     * offsets differ between encoding passes is undefined. */
+    float            *quantOffsets;
+
     /* Frame level statistics */
     x265_frame_stats frameData;
 
@@ -378,6 +391,8 @@
     x265_sliceType_stats  statsI;               /* statistics of I slice */
     x265_sliceType_stats  statsP;               /* statistics of P slice */
     x265_sliceType_stats  statsB;               /* statistics of B slice */
+    uint16_t              maxCLL;               /* maximum content light level */
+    uint16_t              maxFALL;              /* maximum frame average light level */
 } x265_stats;
 
 /* String values accepted by x265_param_parse() (and CLI) for various parameters */
@@ -604,7 +619,7 @@
 
     /* Enables the emission of a user data SEI with the stream headers which
      * describes the encoder version, build info, and parameters. This is
-     * very helpful for debugging, but may interfere with regression tests. 
+     * very helpful for debugging, but may interfere with regression tests.
      * Default enabled */
     int       bEmitInfoSEI;
 
@@ -664,9 +679,9 @@
     int       bBPyramid;
 
     /* A value which is added to the cost estimate of B frames in the lookahead.
-     * It may be a positive value (making B frames appear more expensive, which
-     * causes the lookahead to chose more P frames) or negative, which makes the
-     * lookahead chose more B frames. Default is 0, there are no limits */
+     * It may be a positive value (making B frames appear less expensive, which
+     * biases the lookahead to choose more B frames) or negative, which makes the
+     * lookahead choose more P frames. Default is 0, there are no limits */
     int       bFrameBias;
 
     /* The number of frames that must be queued in the lookahead before it may
@@ -691,6 +706,11 @@
      * should detect scene cuts. The default (40) is recommended. */
     int       scenecutThreshold;
 
+    /* Replace keyframes by using a column of intra blocks that move across the video
+     * from one side to the other, thereby "refreshing" the image. In effect, instead of a
+     * big keyframe, the keyframe is "spread" over many frames. */
+    int       bIntraRefresh;
+
     /*== Coding Unit (CU) definitions ==*/
 
     /* Maximum CU width and height in pixels.  The size must be 64, 32, or 16.
@@ -810,6 +830,9 @@
      * 4 split CUs at the next lower CU depth.  The two flags may be combined */
     uint32_t  limitReferences;
 
+    /* Limit modes analyzed for each CU using cost metrics from the 4 sub-CUs */
+    uint32_t limitModes;
+
     /* ME search method (DIA, HEX, UMH, STAR, FULL). The search patterns
      * (methods) are sorted in increasing complexity, with diamond being the
      * simplest and fastest and full being the slowest.  DIA, HEX, and UMH were
@@ -920,7 +943,7 @@
     /* Psycho-visual rate-distortion strength. Only has an effect in presets
      * which use RDO. It makes mode decision favor options which preserve the
      * energy of the source, at the cost of lost compression. The value must
-     * be between 0 and 2.0, 1.0 is typical. Default 0.3 */
+     * be between 0 and 5.0, 1.0 is typical. Default 2.0 */
     double    psyRd;
 
     /* Strength of psycho-visual optimizations in quantization. Only has an
@@ -1038,7 +1061,7 @@
 
         /* Enable slow and a more detailed first pass encode in multi pass rate control */
         int       bEnableSlowFirstPass;
-        
+
         /* rate-control overrides */
         int        zoneCount;
         x265_zone* zones;
@@ -1051,14 +1074,14 @@
          * values will affect all encoders in the same process */
         const char* lambdaFileName;
 
-        /* Enable stricter conditions to check bitrate deviations in CBR mode. May compromise 
+        /* Enable stricter conditions to check bitrate deviations in CBR mode. May compromise
          * quality to maintain bitrate adherence */
         int bStrictCbr;
 
-        /* Enable adaptive quantization at CU granularity. This parameter specifies 
-         * the minimum CU size at which QP can be adjusted, i.e. Quantization Group 
-         * (QG) size. Allowed values are 64, 32, 16 provided it falls within the 
-         * inclusuve range [maxCUSize, minCUSize]. Experimental, default: maxCUSize*/
+        /* Enable adaptive quantization at CU granularity. This parameter specifies
+         * the minimum CU size at which QP can be adjusted, i.e. Quantization Group
+         * (QG) size. Allowed values are 64, 32, 16 provided it falls within the
+         * inclusuve range [maxCUSize, minCUSize]. Experimental, default: maxCUSize */
         uint32_t qgSize;
     } rc;
 
@@ -1165,12 +1188,27 @@
      * max,min luminance values. */
     const char* masteringDisplayColorVolume;
 
-    /* Content light level info SEI, specified as a string which is parsed when
-     * the stream header SEI are emitted. The string format is "%hu,%hu" where
-     * %hu are unsigned 16bit integers. The first value is the max content light
-     * level (or 0 if no maximum is indicated), the second value is the maximum
-     * picture average light level (or 0). */
-    const char* contentLightLevelInfo;
+    /* Maximum Content light level(MaxCLL), specified as integer that indicates the
+     * maximum pixel intensity level in units of 1 candela per square metre of the
+     * bitstream. x265 will also calculate MaxCLL programmatically from the input
+     * pixel values and set in the Content light level info SEI */
+    uint16_t maxCLL;
+
+    /* Maximum Frame Average Light Level(MaxFALL), specified as integer that indicates
+     * the maximum frame average intensity level in units of 1 candela per square
+     * metre of the bitstream. x265 will also calculate MaxFALL programmatically
+     * from the input pixel values and set in the Content light level info SEI */
+    uint16_t maxFALL;
+
+    /* Minimum luma level of input source picture, specified as a integer which
+     * would automatically increase any luma values below the specified --min-luma
+     * value to that value. */
+    uint16_t minLuma;
+
+    /* Maximum luma level of input source picture, specified as a integer which
+     * would automatically decrease any luma values above the specified --max-luma
+     * value to that value. */
+    uint16_t maxLuma;
 
 } x265_param;
 
@@ -1211,7 +1249,7 @@
     "main422-10", "main422-10-intra",
     "main444-10", "main444-10-intra",
 
-    "main12",     "main12-intra",                  /* Highly Experimental */
+    "main12",     "main12-intra",
     "main422-12", "main422-12-intra",
     "main444-12", "main444-12-intra",
 
@@ -1347,6 +1385,22 @@
  *      close an encoder handler */
 void x265_encoder_close(x265_encoder *);
 
+/* x265_encoder_intra_refresh:
+ *      If an intra refresh is not in progress, begin one with the next P-frame.
+ *      If an intra refresh is in progress, begin one as soon as the current one finishes.
+ *      Requires bIntraRefresh to be set.
+ *
​

x265_1.8.tar.gz/source/x265cli.h -> x265_1.9.tar.gz/source/x265cli.h Changed

@@ -116,6 +116,7 @@
     { "min-keyint",     required_argument, NULL, 'i' },
     { "scenecut",       required_argument, NULL, 0 },
     { "no-scenecut",          no_argument, NULL, 0 },
+    { "intra-refresh",        no_argument, NULL, 0 },
     { "rc-lookahead",   required_argument, NULL, 0 },
     { "lookahead-slices", required_argument, NULL, 0 },
     { "bframes",        required_argument, NULL, 'b' },
@@ -126,6 +127,8 @@
     { "b-pyramid",            no_argument, NULL, 0 },
     { "ref",            required_argument, NULL, 0 },
     { "limit-refs",     required_argument, NULL, 0 },
+    { "no-limit-modes",       no_argument, NULL, 0 },
+    { "limit-modes",          no_argument, NULL, 0 },
     { "no-weightp",           no_argument, NULL, 0 },
     { "weightp",              no_argument, NULL, 'w' },
     { "no-weightb",           no_argument, NULL, 0 },
@@ -192,6 +195,8 @@
     { "crop-rect",      required_argument, NULL, 0 }, /* DEPRECATED */
     { "master-display", required_argument, NULL, 0 },
     { "max-cll",        required_argument, NULL, 0 },
+    { "min-luma",       required_argument, NULL, 0 },
+    { "max-luma",       required_argument, NULL, 0 },
     { "no-dither",            no_argument, NULL, 0 },
     { "dither",               no_argument, NULL, 0 },
     { "no-repeat-headers",    no_argument, NULL, 0 },
@@ -251,14 +256,18 @@
     H0("   --log-level <string>          Logging level: none error warning info debug full. Default %s\n", X265_NS::logLevelNames[param->logLevel + 1]);
     H0("   --no-progress                 Disable CLI progress reports\n");
     H0("   --csv <filename>              Comma separated log file, if csv-log-level > 0 frame level statistics, else one line per run\n");
-    H0("   --csv-log-level               Level of csv logging, if csv-log-level > 0 frame level statistics, else one line per run: 0-2\n");
+    H0("   --csv-log-level <integer>     Level of csv logging, if csv-log-level > 0 frame level statistics, else one line per run: 0-2\n");
     H0("\nInput Options:\n");
     H0("   --input <filename>            Raw YUV or Y4M input file name. `-` for stdin\n");
     H1("   --y4m                         Force parsing of input stream as YUV4MPEG2 regardless of file extension\n");
     H0("   --fps <float|rational>        Source frame rate (float or num/denom), auto-detected if Y4M\n");
     H0("   --input-res WxH               Source picture size [w x h], auto-detected if Y4M\n");
     H1("   --input-depth <integer>       Bit-depth of input file. Default 8\n");
-    H1("   --input-csp <string>          Source color space: i420, i444 or i422, auto-detected if Y4M. Default: i420\n");
+    H1("   --input-csp <string>          Chroma subsampling, auto-detected if Y4M\n");
+    H1("                                 0 - i400 (4:0:0 monochrome)\n");
+    H1("                                 1 - i420 (4:2:0 default)\n");
+    H1("                                 2 - i422 (4:2:2)\n");
+    H1("                                 3 - i444 (4:4:4)\n");
     H0("-f/--frames <integer>            Maximum number of frames to encode. Default all\n");
     H0("   --seek <integer>              First frame to encode\n");
     H1("   --[no-]interlace <bff|tff>    Indicate input pictures are interlace fields in temporal order. Default progressive\n");
@@ -292,7 +301,7 @@
     H0("   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
     H0("\nAnalysis:\n");
     H0("   --rd <0..6>                   Level of RDO in mode decision 0:least....6:full RDO. Default %d\n", param->rdLevel);
-    H0("   --[no-]psy-rd <0..2.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
+    H0("   --[no-]psy-rd <0..5.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
     H0("   --[no-]rdoq-level <0|1|2>     Level of RDO in quantization 0:none, 1:levels, 2:levels & coding groups. Default %d\n", param->rdoqLevel);
     H0("   --[no-]psy-rdoq <0..50.0>     Strength of psycho-visual optimization in RDO quantization, 0 to disable. Default %.1f\n", param->psyRdoq);
     H0("   --[no-]early-skip             Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
@@ -308,12 +317,13 @@
     H0("\nTemporal / motion search options:\n");
     H0("   --max-merge <1..5>            Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand);
     H0("   --ref <integer>               max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences);
-    H0("   --limit-refs <0|1|2|3>        limit references per depth (1) or CU (2) or both (3). Default %d\n", param->limitReferences);
+    H0("   --limit-refs <0|1|2|3>        Limit references per depth (1) or CU (2) or both (3). Default %d\n", param->limitReferences);
     H0("   --me <string>                 Motion search method dia hex umh star full. Default %d\n", param->searchMethod);
     H0("-m/--subme <integer>             Amount of subpel refinement to perform (0:least .. 7:most). Default %d \n", param->subpelRefine);
     H0("   --merange <integer>           Motion search range. Default %d\n", param->searchRange);
     H0("   --[no-]rect                   Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter));
     H0("   --[no-]amp                    Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP));
+    H0("   --[no-]limit-modes            Limit rectangular and asymmetric motion predictions. Default %d\n", param->limitModes);
     H1("   --[no-]temporal-mvp           Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp));
     H0("\nSpatial / intra options:\n");
     H0("   --[no-]strong-intra-smoothing Enable strong intra smoothing for 32x32 blocks. Default %s\n", OPT(param->bEnableStrongIntraSmoothing));
@@ -327,6 +337,7 @@
     H0("-i/--min-keyint <integer>        Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
     H0("   --no-scenecut                 Disable adaptive I-frame decision\n");
     H0("   --scenecut <integer>          How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold);
+    H0("   --intra-refresh               Use Periodic Intra Refresh instead of IDR frames\n");
     H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
     H1("   --lookahead-slices <0..16>    Number of slices to use per lookahead cost estimate. Default %d\n", param->lookaheadSlices);
     H0("   --bframes <integer>           Maximum number of consecutive b-frames (now it only enables B GOP structure) Default %d\n", param->bframes);
@@ -335,7 +346,7 @@
     H0("   --[no-]b-pyramid              Use B-frames as references. Default %s\n", OPT(param->bBPyramid));
     H1("   --qpfile <string>             Force frametypes and QPs for some or all frames\n");
     H1("                                 Format of each line: framenumber frametype QP\n");
-    H1("                                 QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b.\n");
+    H1("                                 QP is optional (none lets x265 choose). Frametypes: I,i,K,P,B,b.\n");
     H1("                                 QPs are restricted by qpmin/qpmax.\n");
     H0("\nRate control, Adaptive Quantization:\n");
     H0("   --bitrate <integer>           Target bitrate (kbps) for ABR (implied). Default %d\n", param->rc.bitrate);
@@ -403,6 +414,8 @@
     H0("   --master-display <string>     SMPTE ST 2086 master display color volume info SEI (HDR)\n");
     H0("                                    format: G(x,y)B(x,y)R(x,y)WP(x,y)L(max,min)\n");
     H0("   --max-cll <string>            Emit content light level info SEI as \"cll,fall\" (HDR)\n");
+    H0("   --min-luma <integer>          Minimum luma plane value of input source picture\n");
+    H0("   --max-luma <integer>          Maximum luma plane value of input source picture\n");
     H0("\nBitstream options:\n");
     H0("   --[no-]repeat-headers         Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
     H0("   --[no-]info                   Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));

 
@@ -116,6 +116,7 @@
     { "min-keyint",     required_argument, NULL, 'i' },
     { "scenecut",       required_argument, NULL, 0 },
     { "no-scenecut",          no_argument, NULL, 0 },
+    { "intra-refresh",        no_argument, NULL, 0 },
     { "rc-lookahead",   required_argument, NULL, 0 },
     { "lookahead-slices", required_argument, NULL, 0 },
     { "bframes",        required_argument, NULL, 'b' },
@@ -126,6 +127,8 @@
     { "b-pyramid",            no_argument, NULL, 0 },
     { "ref",            required_argument, NULL, 0 },
     { "limit-refs",     required_argument, NULL, 0 },
+    { "no-limit-modes",       no_argument, NULL, 0 },
+    { "limit-modes",          no_argument, NULL, 0 },
     { "no-weightp",           no_argument, NULL, 0 },
     { "weightp",              no_argument, NULL, 'w' },
     { "no-weightb",           no_argument, NULL, 0 },
@@ -192,6 +195,8 @@
     { "crop-rect",      required_argument, NULL, 0 }, /* DEPRECATED */
     { "master-display", required_argument, NULL, 0 },
     { "max-cll",        required_argument, NULL, 0 },
+    { "min-luma",       required_argument, NULL, 0 },
+    { "max-luma",       required_argument, NULL, 0 },
     { "no-dither",            no_argument, NULL, 0 },
     { "dither",               no_argument, NULL, 0 },
     { "no-repeat-headers",    no_argument, NULL, 0 },
@@ -251,14 +256,18 @@
     H0("   --log-level <string>          Logging level: none error warning info debug full. Default %s\n", X265_NS::logLevelNames[param->logLevel + 1]);
     H0("   --no-progress                 Disable CLI progress reports\n");
     H0("   --csv <filename>              Comma separated log file, if csv-log-level > 0 frame level statistics, else one line per run\n");
-    H0("   --csv-log-level               Level of csv logging, if csv-log-level > 0 frame level statistics, else one line per run: 0-2\n");
+    H0("   --csv-log-level <integer>     Level of csv logging, if csv-log-level > 0 frame level statistics, else one line per run: 0-2\n");
     H0("\nInput Options:\n");
     H0("   --input <filename>            Raw YUV or Y4M input file name. `-` for stdin\n");
     H1("   --y4m                         Force parsing of input stream as YUV4MPEG2 regardless of file extension\n");
     H0("   --fps <float|rational>        Source frame rate (float or num/denom), auto-detected if Y4M\n");
     H0("   --input-res WxH               Source picture size [w x h], auto-detected if Y4M\n");
     H1("   --input-depth <integer>       Bit-depth of input file. Default 8\n");
-    H1("   --input-csp <string>          Source color space: i420, i444 or i422, auto-detected if Y4M. Default: i420\n");
+    H1("   --input-csp <string>          Chroma subsampling, auto-detected if Y4M\n");
+    H1("                                 0 - i400 (4:0:0 monochrome)\n");
+    H1("                                 1 - i420 (4:2:0 default)\n");
+    H1("                                 2 - i422 (4:2:2)\n");
+    H1("                                 3 - i444 (4:4:4)\n");
     H0("-f/--frames <integer>            Maximum number of frames to encode. Default all\n");
     H0("   --seek <integer>              First frame to encode\n");
     H1("   --[no-]interlace <bff|tff>    Indicate input pictures are interlace fields in temporal order. Default progressive\n");
@@ -292,7 +301,7 @@
     H0("   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
     H0("\nAnalysis:\n");
     H0("   --rd <0..6>                   Level of RDO in mode decision 0:least....6:full RDO. Default %d\n", param->rdLevel);
-    H0("   --[no-]psy-rd <0..2.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
+    H0("   --[no-]psy-rd <0..5.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
     H0("   --[no-]rdoq-level <0|1|2>     Level of RDO in quantization 0:none, 1:levels, 2:levels & coding groups. Default %d\n", param->rdoqLevel);
     H0("   --[no-]psy-rdoq <0..50.0>     Strength of psycho-visual optimization in RDO quantization, 0 to disable. Default %.1f\n", param->psyRdoq);
     H0("   --[no-]early-skip             Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
@@ -308,12 +317,13 @@
     H0("\nTemporal / motion search options:\n");
     H0("   --max-merge <1..5>            Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand);
     H0("   --ref <integer>               max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences);
-    H0("   --limit-refs <0|1|2|3>        limit references per depth (1) or CU (2) or both (3). Default %d\n", param->limitReferences);
+    H0("   --limit-refs <0|1|2|3>        Limit references per depth (1) or CU (2) or both (3). Default %d\n", param->limitReferences);
     H0("   --me <string>                 Motion search method dia hex umh star full. Default %d\n", param->searchMethod);
     H0("-m/--subme <integer>             Amount of subpel refinement to perform (0:least .. 7:most). Default %d \n", param->subpelRefine);
     H0("   --merange <integer>           Motion search range. Default %d\n", param->searchRange);
     H0("   --[no-]rect                   Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter));
     H0("   --[no-]amp                    Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP));
+    H0("   --[no-]limit-modes            Limit rectangular and asymmetric motion predictions. Default %d\n", param->limitModes);
     H1("   --[no-]temporal-mvp           Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp));
     H0("\nSpatial / intra options:\n");
     H0("   --[no-]strong-intra-smoothing Enable strong intra smoothing for 32x32 blocks. Default %s\n", OPT(param->bEnableStrongIntraSmoothing));
@@ -327,6 +337,7 @@
     H0("-i/--min-keyint <integer>        Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
     H0("   --no-scenecut                 Disable adaptive I-frame decision\n");
     H0("   --scenecut <integer>          How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold);
+    H0("   --intra-refresh               Use Periodic Intra Refresh instead of IDR frames\n");
     H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
     H1("   --lookahead-slices <0..16>    Number of slices to use per lookahead cost estimate. Default %d\n", param->lookaheadSlices);
     H0("   --bframes <integer>           Maximum number of consecutive b-frames (now it only enables B GOP structure) Default %d\n", param->bframes);
@@ -335,7 +346,7 @@
     H0("   --[no-]b-pyramid              Use B-frames as references. Default %s\n", OPT(param->bBPyramid));
     H1("   --qpfile <string>             Force frametypes and QPs for some or all frames\n");
     H1("                                 Format of each line: framenumber frametype QP\n");
-    H1("                                 QP is optional (none lets x265 choose). Frametypes: I,i,P,B,b.\n");
+    H1("                                 QP is optional (none lets x265 choose). Frametypes: I,i,K,P,B,b.\n");
     H1("                                 QPs are restricted by qpmin/qpmax.\n");
     H0("\nRate control, Adaptive Quantization:\n");
     H0("   --bitrate <integer>           Target bitrate (kbps) for ABR (implied). Default %d\n", param->rc.bitrate);
@@ -403,6 +414,8 @@
     H0("   --master-display <string>     SMPTE ST 2086 master display color volume info SEI (HDR)\n");
     H0("                                    format: G(x,y)B(x,y)R(x,y)WP(x,y)L(max,min)\n");
     H0("   --max-cll <string>            Emit content light level info SEI as \"cll,fall\" (HDR)\n");
+    H0("   --min-luma <integer>          Minimum luma plane value of input source picture\n");
+    H0("   --max-luma <integer>          Maximum luma plane value of input source picture\n");
     H0("\nBitstream options:\n");
     H0("   --[no-]repeat-headers         Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
     H0("   --[no-]info                   Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));
​