Changes of Revision 13

x265.changes Changed
x
 
1
@@ -1,4 +1,40 @@
2
 -------------------------------------------------------------------
3
+Sun Aug 28 11:51:23 UTC 2016 - joerg.lorenzen@ki.tng.de
4
+
5
+- Update to version 2.0
6
+  API and Key Behavior Changes
7
+  * x265_rc_stats added to x265_picture, containing all RC decision
8
+    points for that frame.
9
+  * PTL: high tier is now allowed by default, chosen only if
10
+    necessary.
11
+  * multi-pass: First pass now uses slow-firstpass by default,
12
+    enabling better RC decisions in future passes.
13
+  * pools: fix behaviour on multi-socketed Windows systems, provide
14
+    more flexibility in determining thread and pool counts.
15
+  * ABR: improve bits allocation in the first few frames, abr reset,
16
+    vbv and cutree improved.
17
+  New Features
18
+  * uhd-bd: Enforce Ultra-HD Blu-ray Disc parameters
19
+    (overrides any other settings).
20
+  * rskip: Enables skipping recursion to analyze lower CU sizes
21
+    using heuristics at different rd-levels. Provides good visual
22
+    quality gains at the highest quality presets.
23
+  * rc-grain: Enables a new rate control mode specifically for
24
+    grainy content. Strictly prevents QP oscillations within and
25
+    between frames to avoid grain fluctuations.
26
+  * tune grain: A fully refactored and improved option to encode
27
+    film grain content including QP control as well as analysis
28
+    options.
29
+  * asm: ARM assembly is now enabled by default, native or cross
30
+    compiled builds supported on armv6 and later systems.
31
+  Misc
32
+  * An SSIM calculation bug was corrected
33
+- soname bump to 87.
34
+- Fixed arm.patch.
35
+- Added libnuma-devel as buildrequires for arch x86_64 (except
36
+  for openSUSE 13.1 because libnuma-devel >= 2.0.9 is required).
37
+
38
+-------------------------------------------------------------------
39
 Wed Feb  3 13:22:42 UTC 2016 - idonmez@suse.com
40
 
41
 - Update to version 1.9
42
x265.spec Changed
28
 
1
@@ -1,10 +1,10 @@
2
 # based on the spec file from https://build.opensuse.org/package/view_file/home:Simmphonie/libx265/
3
 
4
 Name:           x265
5
-%define soname  79
6
+%define soname  87
7
 %define libname lib%{name}
8
 %define libsoname %{libname}-%{soname}
9
-Version:        1.9
10
+Version:        2.0
11
 Release:        0
12
 License:        GPL-2.0+
13
 Summary:        A free h265/HEVC encoder - encoder binary
14
@@ -14,6 +14,13 @@
15
 Patch0:         arm.patch
16
 BuildRequires:  gcc gcc-c++
17
 BuildRequires:  cmake >= 2.8.8
18
+# for openSUSE 13.1 only libnuma-devel = 2.0.8 is available, but version 2.0.9 or higher is required
19
+# build against version 2.0.8 fails with "error: 'numa_bitmask_weight' was not declared in this scope"
20
+%if ! ( 0%{?suse_version} == 1310 )
21
+%ifarch x86_64
22
+BuildRequires:  libnuma-devel >= 2.0.9
23
+%endif
24
+%endif
25
 BuildRequires:  pkg-config
26
 BuildRequires:  yasm >= 1.2.0
27
 BuildRoot:      %{_tmppath}/%{name}-%{version}-build
28
arm.patch Changed
98
 
1
@@ -1,19 +1,25 @@
2
-Index: x265_11047/source/CMakeLists.txt
3
+Index: x265_2.0/source/CMakeLists.txt
4
 ===================================================================
5
---- x265_11047.orig/source/CMakeLists.txt
6
-+++ x265_11047/source/CMakeLists.txt
7
-@@ -56,10 +56,22 @@ elseif(POWERMATCH GREATER "-1")
8
+--- x265_2.0.orig/source/CMakeLists.txt
9
++++ x265_2.0/source/CMakeLists.txt
10
+@@ -60,15 +60,22 @@
11
      message(STATUS "Detected POWER target processor")
12
      set(POWER 1)
13
      add_definitions(-DX265_ARCH_POWER=1)
14
+-elseif(ARMMATCH GREATER "-1")
15
+-    if(CROSS_COMPILE_ARM)
16
+-        message(STATUS "Cross compiling for ARM arch")
17
+-    else()
18
+-        set(CROSS_COMPILE_ARM 0)
19
+-    endif()
20
+-    message(STATUS "Detected ARM target processor")
21
+-    set(ARM 1)
22
+-    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
23
 +elseif(${SYSPROC} MATCHES "armv5.*")
24
 +    message(STATUS "Detected ARMV5 system processor")
25
 +    set(ARMV5 1)
26
 +    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
27
- elseif(${SYSPROC} STREQUAL "armv6l")
28
--    message(STATUS "Detected ARM target processor")
29
--    set(ARM 1)
30
--    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
31
++elseif(${SYSPROC} STREQUAL "armv6l")
32
 +    message(STATUS "Detected ARMV6 system processor")
33
 +    set(ARMV6 1)
34
 +    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
35
@@ -28,21 +34,32 @@
36
  else()
37
      message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
38
      message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
39
-@@ -169,8 +181,8 @@ if(GCC)
40
-     elseif(X86 AND NOT X64)
41
-         add_definitions(-march=i686)
42
+@@ -186,18 +193,9 @@
43
+             add_definitions(-march=i686)
44
+         endif()
45
      endif()
46
--    if(ARM)
47
--        add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp)
48
+-    if(ARM AND CROSS_COMPILE_ARM)
49
+-        set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
50
+-    elseif(ARM)
51
+-        find_package(Neon)
52
+-        if(CPU_HAS_NEON)
53
+-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
54
+-            add_definitions(-DHAVE_NEON)
55
+-        else()
56
+-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
57
+-        endif()
58
+-    endif()
59
+-    add_definitions(${ARM_ARGS})
60
 +    if(ARMV7)
61
 +        add_definitions(-fPIC)
62
-     endif()
63
++    endif()
64
      if(FPROFILE_GENERATE)
65
          if(INTEL_CXX)
66
-Index: x265_11047/source/common/cpu.cpp
67
+             add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
68
+Index: x265_2.0/source/common/cpu.cpp
69
 ===================================================================
70
---- x265_11047.orig/source/common/cpu.cpp
71
-+++ x265_11047/source/common/cpu.cpp
72
+--- x265_2.0.orig/source/common/cpu.cpp
73
++++ x265_2.0/source/common/cpu.cpp
74
 @@ -37,7 +37,7 @@
75
  #include <machine/cpu.h>
76
  #endif
77
@@ -52,3 +69,20 @@
78
  #include <signal.h>
79
  #include <setjmp.h>
80
  static sigjmp_buf jmpbuf;
81
+@@ -340,7 +340,6 @@
82
+     }
83
+ 
84
+     canjump = 1;
85
+-    PFX(cpu_neon_test)();
86
+     canjump = 0;
87
+     signal(SIGILL, oldsig);
88
+ #endif // if !HAVE_NEON
89
+@@ -356,7 +355,7 @@
90
+     // which may result in incorrect detection and the counters stuck enabled.
91
+     // right now Apple does not seem to support performance counters for this test
92
+ #ifndef __MACH__
93
+-    flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
94
++    //flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
95
+ #endif
96
+     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
97
+ #endif // if HAVE_ARMV6
98
x265_1.9.tar.gz/.hg_archival.txt -> x265_2.0.tar.gz/.hg_archival.txt Changed
8
 
1
@@ -1,4 +1,4 @@
2
 repo: 09fe40627f03a0f9c3e6ac78b22ac93da23f9fdf
3
-node: 1d3b6e448e01ec40b392ef78b7e55a86249fbe68
4
+node: 960c9991d0dcf46559c32e070418d3cbb7e8aa2f
5
 branch: stable
6
-tag: 1.9
7
+tag: 2.0
8
x265_1.9.tar.gz/.hgtags -> x265_2.0.tar.gz/.hgtags Changed
6
 
1
@@ -17,3 +17,4 @@
2
 cbeb7d8a4880e4020c4545dd8e498432c3c6cad3 1.6
3
 8425278def1edf0931dc33fc518e1950063e76b0 1.7
4
 e27327f5da35c5feb660360336fdc94bd0afe719 1.8
5
+1d3b6e448e01ec40b392ef78b7e55a86249fbe68 1.9
6
x265_2.0.tar.gz/build/arm-linux/crosscompile.cmake Added
17
 
1
@@ -0,0 +1,15 @@
2
+# CMake toolchain file for cross compiling x265 for ARM arch
3
+# This feature is only supported as experimental. Use with caution.
4
+# Please report bugs on bitbucket
5
+# Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source
6
+
7
+set(CROSS_COMPILE_ARM 1)
8
+set(CMAKE_SYSTEM_NAME Linux)
9
+set(CMAKE_SYSTEM_PROCESSOR armv6l)
10
+
11
+# specify the cross compiler
12
+set(CMAKE_C_COMPILER arm-linux-gnueabi-gcc)
13
+set(CMAKE_CXX_COMPILER arm-linux-gnueabi-g++)
14
+
15
+# specify the target environment
16
+SET(CMAKE_FIND_ROOT_PATH  /usr/arm-linux-gnueabi)
17
x265_2.0.tar.gz/build/arm-linux/make-Makefiles.bash Added
6
 
1
@@ -0,0 +1,4 @@
2
+#!/bin/bash
3
+# Run this from within a bash shell
4
+
5
+cmake -G "Unix Makefiles" ../../source && ccmake ../../source
6
x265_1.9.tar.gz/doc/reST/api.rst -> x265_2.0.tar.gz/doc/reST/api.rst Changed
11
 
1
@@ -180,7 +180,8 @@
2
     *       used to modify encoder parameters.
3
     *      various parameters from x265_param are copied.
4
     *      this takes effect immediately, on whichever frame is encoded next;
5
-    *      returns 0 on success, negative on parameter validation error.
6
+    *      returns negative on parameter validation error, 0 on successful reconfigure
7
+    *      and 1 when a reconfigure is already in progress.
8
     *
9
     *      not all parameters can be changed; see the actual function for a
10
     *      detailed breakdown.  since not all parameters can be changed, moving
11
x265_1.9.tar.gz/doc/reST/cli.rst -> x265_2.0.tar.gz/doc/reST/cli.rst Changed
201
 
1
@@ -376,10 +376,10 @@
2
 
3
 .. option:: --dither
4
 
5
-   Enable high quality downscaling. Dithering is based on the diffusion
6
-   of errors from one row of pixels to the next row of pixels in a
7
-   picture. Only applicable when the input bit depth is larger than
8
-   8bits and internal bit depth is 8bits. Default disabled
9
+   Enable high quality downscaling to the encoder's internal bitdepth. 
10
+   Dithering is based on the diffusion of errors from one row of pixels 
11
+   to the next row of pixels in a picture. Only applicable when the 
12
+   input bit depth is larger than 8bits. Default disabled
13
 
14
    **CLI ONLY**
15
 
16
@@ -522,16 +522,14 @@
17
 
18
 .. option:: --high-tier, --no-high-tier
19
 
20
-   If :option:`--level-idc` has been specified, the option adds the
21
-   intention to support the High tier of that level. If your specified
22
-   level does not support a High tier, a warning is issued and this
23
-   modifier flag is ignored. If :option:`--level-idc` has been specified,
24
-   but not --high-tier, then the encoder will attempt to encode at the 
25
-   specified level, main tier first, turning on high tier only if 
26
-   necessary and available at that level.
27
+   If :option:`--level-idc` has been specified, --high-tier allows the
28
+   support of high tier at that level. The encoder will first attempt to encode 
29
+   at the specified level, main tier first, turning on high tier only if 
30
+   necessary and available at that level. If your requested level does not 
31
+   support a High tier, high tier will not be supported. If --no-high-tier 
32
+   has been specified, then the encoder will attempt to encode only at the main tier.
33
 
34
-   If :option:`--level-idc` has not been specified, this argument is
35
-   ignored.
36
+   Default: enabled
37
 
38
 .. option:: --ref <1..16>
39
 
40
@@ -564,6 +562,15 @@
41
 
42
    Default: disabled
43
 
44
+.. option:: --uhd-bd
45
+
46
+    Enable Ultra HD Blu-ray format support. If specified with incompatible
47
+    encoding options, the encoder will attempt to modify/set the right 
48
+    encode specifications. If the encoder is unable to do so, this option
49
+    will be turned OFF. Highly experimental.
50
+   
51
+    Default: disabled
52
+   
53
 .. note::
54
 
55
    :option:`--profile`, :option:`--level-idc`, and
56
@@ -600,7 +607,7 @@
57
 Mode decision / Analysis
58
 ========================
59
 
60
-.. option:: --rd <0..6>
61
+.. option:: --rd <1..6>
62
 
63
    Level of RDO in mode decision. The higher the value, the more
64
    exhaustive the analysis and the more rate distortion optimization is
65
@@ -629,7 +636,7 @@
66
    | 6     | Currently same as 5                                           |
67
    +-------+---------------------------------------------------------------+
68
 
69
-   **Range of values:** 0: least .. 6: full RDO analysis
70
+   **Range of values:** 1: least .. 6: full RDO analysis
71
 
72
 Options which affect the coding unit quad-tree, sometimes referred to as
73
 the prediction quad-tree.
74
@@ -722,8 +729,18 @@
75
 
76
 .. option:: --early-skip, --no-early-skip
77
 
78
-   Measure full CU size (2Nx2N) merge candidates first; if no residual
79
-   is found the analysis is short circuited. Default disabled
80
+   Measure 2Nx2N merge candidates first; if no residual is found, 
81
+   additional modes at that depth are not analysed. Default disabled
82
+
83
+.. option:: --rskip, --no-rskip
84
+
85
+   This option determines early exit from CU depth recursion. When a skip CU is
86
+   found, additional heuristics (depending on rd-level) are used to decide whether
87
+   to terminate recursion. In rdlevels 5 and 6, comparison with inter2Nx2N is used, 
88
+   while at rdlevels 4 and below, neighbour costs are used to skip recursion.
89
+   Provides minimal quality degradation at good performance gains when enabled. 
90
+
91
+   Default: enabled, disabled for :option:`--tune grain`
92
 
93
 .. option:: --fast-intra, --no-fast-intra
94
 
95
@@ -756,6 +773,14 @@
96
    evaluate if luma used tskip. Inter block tskip analysis is
97
    unmodified. Default disabled
98
 
99
+.. option:: --rd-refine, --no-rd-refine
100
+
101
+   For each analysed CU, calculate R-D cost on the best partition mode
102
+   for a range of QP values, to find the optimal rounding effect.
103
+   Default disabled.
104
+
105
+   Only effective at RD levels 5 and 6
106
+
107
 Analysis re-use options, to improve performance when encoding the same
108
 sequence multiple times (presumably at varying bitrates). The encoder
109
 will not reuse analysis if the resolution and slice type parameters do
110
@@ -1039,7 +1064,7 @@
111
 cause ringing artifacts. psy-rdoq is less accurate than psy-rd, it is
112
 biasing towards energy in general while psy-rd biases towards the energy
113
 of the source image. But very large psy-rdoq values can sometimes be
114
-beneficial, preserving film grain for instance.
115
+beneficial.
116
 
117
 As a general rule, when both psycho-visual features are disabled, the
118
 encoder will tend to blur blocks in areas of difficult motion. Turning
119
@@ -1076,8 +1101,8 @@
120
    energy in the reconstructed image. This generally improves perceived
121
    visual quality at the cost of lower quality metric scores.  It only
122
    has effect when :option:`--rdoq-level` is 1 or 2. High values can
123
-   be beneficial in preserving high-frequency detail like film grain.
124
-   Default: 1.0
125
+   be beneficial in preserving high-frequency detail.
126
+   Default: 0.0 (1.0 for presets slow, slower, veryslow)
127
 
128
    **Range of values:** 0 .. 50.0
129
 
130
@@ -1336,13 +1361,13 @@
131
 
132
 .. option:: --slow-firstpass, --no-slow-firstpass
133
 
134
-   Enable a slow and more detailed first pass encode in multi-pass rate
135
-   control mode.  Speed of the first pass encode is slightly lesser and
136
-   quality midly improved when compared to the default settings in a
137
-   multi-pass encode. Default disabled (turbo mode enabled)
138
+   Enable first pass encode with the exact settings specified. 
139
+   The quality in subsequent multi-pass encodes is better
140
+   (compared to first pass) when the settings match across each pass. 
141
+   Default enabled.
142
 
143
-   When **turbo** first pass is not disabled, these options are
144
-   set on the first pass to improve performance:
145
+   When slow first pass is disabled, a **turbo** encode with the following
146
+   go-fast options is used to improve performance:
147
    
148
    * :option:`--fast-intra`
149
    * :option:`--no-rect`
150
@@ -1408,7 +1433,16 @@
151
 
152
    The maximum single adjustment in QP allowed to rate control. Default
153
    4
154
-
155
+   
156
+.. option:: --rc-grain, --no-rc-grain
157
+
158
+   Enables a specialised ratecontrol algorithm for film grain content. This 
159
+   parameter strictly minimises QP fluctuations within and across frames 
160
+   and removes pulsing of grain. Default disabled. 
161
+   Enabled when :option:`--tune` *grain* is applied. It is highly recommended 
162
+   that this option is used through the tune grain feature where a combination 
163
+   of param options are used to improve visual quality.
164
+   
165
 .. option:: --qblur <float>
166
 
167
    Temporally blur quants. Default 0.5
168
@@ -1660,10 +1694,13 @@
169
    a string which is parsed when the stream header SEI are emitted. The
170
    string format is "G(%hu,%hu)B(%hu,%hu)R(%hu,%hu)WP(%hu,%hu)L(%u,%u)"
171
    where %hu are unsigned 16bit integers and %u are unsigned 32bit
172
-   integers. The SEI includes X,Y display primaries for RGB channels,
173
-   white point X,Y and max,min luminance values. (HDR)
174
+   integers. The SEI includes X,Y display primaries for RGB channels
175
+   and white point (WP) in units of 0.00002 and max,min luminance (L)
176
+   values in units of 0.0001 candela per square meter. (HDR)
177
 
178
-   Example for D65P3 1000-nits:
179
+   Example for a P3D65 1000-nits monitor, where G(x=0.265, y=0.690),
180
+   B(x=0.150, y=0.060), R(x=0.680, y=0.320), WP(x=0.3127, y=0.3290),
181
+   L(max=1000, min=0.0001):
182
 
183
        G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(10000000,1)
184
 
185
@@ -1672,8 +1709,9 @@
186
 
187
 .. option:: --max-cll <string>
188
 
189
-   Maximum content light level and maximum frame average light level as
190
-   required by the Consumer Electronics Association 861.3 specification.
191
+   Maximum content light level (MaxCLL) and maximum frame average light
192
+   level (MaxFALL) as required by the Consumer Electronics Association
193
+   861.3 specification.
194
 
195
    Specified as a string which is parsed when the stream header SEI are
196
    emitted. The string format is "%hu,%hu" where %hu are unsigned 16bit
197
@@ -1681,6 +1719,11 @@
198
    maximum is indicated), the second value is the maximum picture
199
    average light level (or 0). (HDR)
200
 
201
x265_1.9.tar.gz/doc/reST/presets.rst -> x265_2.0.tar.gz/doc/reST/presets.rst Changed
201
 
1
@@ -21,68 +21,80 @@
2
 The presets adjust encoder parameters as shown in the following table.
3
 Any parameters below that are specified in your command-line will be 
4
 changed from the value specified by the preset.
5
-
6
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
7
-|                 |ultrafast |superfast |veryfast |faster |fast |medium |slow |slower |veryslow |placebo |
8
-+=================+==========+==========+=========+=======+=====+=======+=====+=======+=========+========+
9
-| ctu             |    32    |    32    |   64    |  64   | 64  |  64   | 64  |  64   |   64    |  64    |
10
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
11
-| min-cu-size     |    16    |     8    |    8    |   8   |  8  |   8   |  8  |   8   |    8    |   8    |
12
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
13
-| bframes         |     3    |     3    |    4    |   4   |  4  |   4   |  4  |   8   |    8    |   8    |
14
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
15
-| b-adapt         |     0    |     0    |    0    |   0   |  0  |   2   |  2  |   2   |    2    |   2    |
16
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
17
-| rc-lookahead    |     5    |    10    |   15    |  15   | 15  |  20   | 25  |  30   |   40    |  60    |
18
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
19
-| lookahead-slices|     8    |     8    |    8    |   8   |  8  |   8   |  4  |   4   |    1    |   1    |
20
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
21
-| scenecut        |     0    |    40    |   40    |  40   | 40  |  40   | 40  |  40   |   40    |  40    |
22
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
23
-| ref             |     1    |     1    |    2    |   2   |  3  |   3   |  4  |   4   |    5    |   5    |
24
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
25
-| limit-refs      |     0    |     0    |    3    |   3   |  3  |   3   |  3  |   2   |    1    |   0    |
26
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
27
-| me              |    dia   |   hex    |   hex   |  hex  |hex  |  hex  |star | star  |   star  |  star  |
28
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
29
-| merange         |    57    |    57    |   57    |  57   | 57  |  57   | 57  |  57   |   57    |  92    |
30
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
31
-| subme           |     0    |     1    |    1    |   2   |  2  |   2   |  3  |   3   |    4    |   5    |
32
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
33
-| rect            |     0    |     0    |    0    |   0   |  0  |   0   |  1  |   1   |    1    |   1    |
34
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
35
-| amp             |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
36
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
37
-| limit-modes     |     0    |     0    |    0    |   0   |  0  |   0   |  1  |   1   |    1    |   0    |
38
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
39
-| max-merge       |     2    |     2    |    2    |   2   |  2  |   2   |  3  |   3   |    4    |   5    |
40
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
41
-| early-skip      |     1    |     1    |    1    |   1   |  0  |   0   |  0  |   0   |    0    |   0    |
42
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
43
-| fast-intra      |     1    |     1    |    1    |   1   |  1  |   0   |  0  |   0   |    0    |   0    |
44
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
45
-| b-intra         |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
46
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
47
-| sao             |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
48
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
49
-| signhide        |     0    |     1    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
50
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
51
-| weightp         |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
52
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
53
-| weightb         |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
54
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
55
-| aq-mode         |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
56
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
57
-| cuTree          |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
58
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
59
-| rdLevel         |     2    |     2    |    2    |   2   |  2  |   3   |  4  |   6   |    6    |   6    |
60
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
61
-| rdoq-level      |     0    |     0    |    0    |   0   |  0  |   0   |  2  |   2   |    2    |   2    |
62
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
63
-| tu-intra        |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   2   |    3    |   4    |
64
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
65
-| tu-inter        |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   2   |    3    |   4    |
66
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
67
+   0. ultrafast
68
+   1. superfast
69
+   2. veryfast
70
+   3. faster
71
+   4. fast
72
+   5. medium **(default)**
73
+   6. slow
74
+   7. slower
75
+   8. veryslow
76
+   9. placebo
77
+
78
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
79
+| preset          |  0  |  1  |  2  |   3 |   4 |   5 |   6  |   7  |   8  |  9   |
80
++=================+=====+=====+=====+=====+=====+=====+======+======+======+======+
81
+| ctu             | 32  | 32  | 64  |  64 |  64 |  64 |  64  |  64  |  64  | 64   |
82
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
83
+| min-cu-size     | 16  |  8  |  8  |   8 |   8 |   8 |   8  |   8  |   8  |  8   |
84
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
85
+| bframes         |  3  |  3  |  4  |   4 |   4 |   4 |   4  |   8  |   8  |  8   |
86
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
87
+| b-adapt         |  0  |  0  |  0  |   0 |   0 |   2 |   2  |   2  |   2  |  2   |
88
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
89
+| rc-lookahead    |  5  | 10  | 15  |  15 |  15 |  20 |  25  |  30  |  40  | 60   |
90
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
91
+| lookahead-slices|  8  |  8  |  8  |   8 |   8 |   8 |   4  |   4  |   1  |  1   |
92
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
93
+| scenecut        |  0  | 40  | 40  |  40 |  40 |  40 |  40  |  40  |  40  | 40   |
94
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
95
+| ref             |  1  |  1  |  2  |   2 |   3 |   3 |   4  |   4  |   5  |  5   |
96
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
97
+| limit-refs      |  0  |  0  |  3  |   3 |   3 |   3 |   3  |   2  |   1  |  0   |
98
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
99
+| me              | dia | hex | hex | hex | hex | hex | star | star | star | star |
100
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
101
+| merange         | 57  | 57  | 57  |  57 |  57 |  57 |  57  |  57  |  57  | 92   |
102
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
103
+| subme           |  0  |  1  |  1  |   2 |   2 |   2 |   3  |   3  |   4  |  5   |
104
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
105
+| rect            |  0  |  0  |  0  |   0 |   0 |   0 |   1  |   1  |   1  |  1   |
106
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
107
+| amp             |  0  |  0  |  0  |   0 |   0 |   0 |   0  |   1  |   1  |  1   |
108
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
109
+| limit-modes     |  0  |  0  |  0  |   0 |   0 |   0 |   1  |   1  |   1  |  0   |
110
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
111
+| max-merge       |  2  |  2  |  2  |   2 |   2 |   2 |   3  |   3  |   4  |  5   |
112
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
113
+| early-skip      |  1  |  1  |  1  |   1 |   0 |   0 |   0  |   0  |   0  |  0   |
114
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
115
+| recursion-skip  |  1  |  1  |  1  |   1 |   1 |   1 |   1  |   1  |   0  |  0   |
116
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
117
+| fast-intra      |  1  |  1  |  1  |   1 |   1 |   0 |   0  |   0  |   0  |  0   |
118
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
119
+| b-intra         |  0  |  0  |  0  |   0 |   0 |   0 |   0  |   1  |   1  |  1   |
120
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
121
+| sao             |  0  |  0  |  1  |   1 |   1 |   1 |   1  |   1  |   1  |  1   |
122
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
123
+| signhide        |  0  |  1  |  1  |   1 |   1 |   1 |   1  |   1  |   1  |  1   |
124
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
125
+| weightp         |  0  |  0  |  1  |   1 |   1 |   1 |   1  |   1  |   1  |  1   |
126
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
127
+| weightb         |  0  |  0  |  0  |   0 |   0 |   0 |   0  |   1  |   1  |  1   |
128
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
129
+| aq-mode         |  0  |  0  |  1  |   1 |   1 |   1 |   1  |   1  |   1  |  1   |
130
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
131
+| cuTree          |  1  |  1  |  1  |   1 |   1 |   1 |   1  |   1  |   1  |  1   |
132
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
133
+| rdLevel         |  2  |  2  |  2  |   2 |   2 |   3 |   4  |   6  |   6  |  6   |
134
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
135
+| rdoq-level      |  0  |  0  |  0  |   0 |   0 |   0 |   2  |   2  |   2  |  2   |
136
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
137
+| tu-intra        |  1  |  1  |  1  |   1 |   1 |   1 |   1  |   2  |   3  |  4   |
138
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
139
+| tu-inter        |  1  |  1  |  1  |   1 |   1 |   1 |   1  |   2  |   3  |  4   |
140
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
141
 
142
 .. _tunings:
143
 
144
@@ -117,33 +129,32 @@
145
 
146
 
147
 
148
-Film Grain Retention
149
-~~~~~~~~~~~~~~~~~~~~
150
-
151
-:option:`--tune` *grain* tries to improve the retention of film grain in
152
-the reconstructed output. It disables rate distortion optimizations in
153
-quantization, and increases the default psy-rd.
154
-
155
-    * :option:`--psy-rd` 0.5
156
-    * :option:`--rdoq-level` 0
157
-    * :option:`--psy-rdoq` 0
158
-
159
-It lowers the strength of adaptive quantization, so residual energy can
160
-be more evenly distributed across the (noisy) picture:
161
+Film Grain
162
+~~~~~~~~~~
163
 
164
-    * :option:`--aq-strength` 0.3
165
-
166
-And it similarly tunes rate control to prevent the slice QP from
167
-swinging too wildly from frame to frame:
168
+:option:`--tune` *grain* aims to encode grainy content with the best 
169
+visual quality. The purpose of this option is neither to retain nor 
170
+eliminate grain, but prevent noticeable artifacts caused by uneven 
171
+distribution of grain. :option:`--tune` *grain* strongly restricts 
172
+algorithms that vary the quantization parameter within and across frames.
173
+Tune grain also biases towards decisions that retain more high frequency
174
+components.
175
 
176
+    * :option:`--aq-mode` 0
177
+    * :option:`--cutree` 0
178
     * :option:`--ipratio` 1.1
179
-    * :option:`--pbratio` 1.1
180
-    * :option:`--qcomp` 0.8
181
-
182
-And lastly it reduces the strength of deblocking to prevent grain being
183
-blurred on block boundaries:
184
-
185
-    * :option:`--deblock` -2
186
+    * :option:`--pbratio` 1.0
187
+    * :option:`--qpstep` 1
188
+    * :option:`--sao` 0
189
+    * :option:`--psy-rd` 4.0
190
+    * :option:`--psy-rdoq` 10.0
191
+    * :option:`--recursion-skip` 0
192
+    
193
+It also enables a specialised ratecontrol algorithm :option:`--rc-grain` 
194
+that strictly minimises QP fluctuations across frames, while still allowing 
195
+the encoder to hit bitrate targets and VBV buffer limits (with a slightly 
196
+higher margin of error than normal). It is highly recommended that this 
197
+algorithm is used only through the :option:`--tune` *grain* feature.
198
 
199
 Fast Decode
200
 ~~~~~~~~~~~
201
x265_1.9.tar.gz/source/CMakeLists.txt -> x265_2.0.tar.gz/source/CMakeLists.txt Changed
159
 
1
@@ -30,7 +30,7 @@
2
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
3
 
4
 # X265_BUILD must be incremented each time the public API is changed
5
-set(X265_BUILD 79)
6
+set(X265_BUILD 87)
7
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
8
                "${PROJECT_BINARY_DIR}/x265.def")
9
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
10
@@ -41,7 +41,9 @@
11
 # System architecture detection
12
 string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
13
 set(X86_ALIASES x86 i386 i686 x86_64 amd64)
14
+set(ARM_ALIASES armv6l armv7l)
15
 list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
16
+list(FIND ARM_ALIASES "${SYSPROC}" ARMMATCH)
17
 set(POWER_ALIASES ppc64 ppc64le)
18
 list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
19
 if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
20
@@ -58,7 +60,12 @@
21
     message(STATUS "Detected POWER target processor")
22
     set(POWER 1)
23
     add_definitions(-DX265_ARCH_POWER=1)
24
-elseif(${SYSPROC} STREQUAL "armv6l")
25
+elseif(ARMMATCH GREATER "-1")
26
+    if(CROSS_COMPILE_ARM)
27
+        message(STATUS "Cross compiling for ARM arch")
28
+    else()
29
+        set(CROSS_COMPILE_ARM 0)
30
+    endif()
31
     message(STATUS "Detected ARM target processor")
32
     set(ARM 1)
33
     add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
34
@@ -174,11 +181,23 @@
35
             add_definitions(-march=native)
36
         endif()
37
     elseif(X86 AND NOT X64)
38
-        add_definitions(-march=i686)
39
+        string(FIND "${CMAKE_CXX_FLAGS}" "-march" marchPos)
40
+        if(marchPos LESS "0")
41
+            add_definitions(-march=i686)
42
+        endif()
43
     endif()
44
-    if(ARM)
45
-        add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp)
46
+    if(ARM AND CROSS_COMPILE_ARM)
47
+        set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
48
+    elseif(ARM)
49
+        find_package(Neon)
50
+        if(CPU_HAS_NEON)
51
+            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
52
+            add_definitions(-DHAVE_NEON)
53
+        else()
54
+            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
55
+        endif()
56
     endif()
57
+    add_definitions(${ARM_ARGS})
58
     if(FPROFILE_GENERATE)
59
         if(INTEL_CXX)
60
             add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
61
@@ -269,7 +288,9 @@
62
 endif(GCC)
63
 
64
 find_package(Yasm)
65
-if(YASM_FOUND AND X86)
66
+if(ARM OR CROSS_COMPILE_ARM)
67
+    option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" ON)
68
+elseif(YASM_FOUND AND X86)
69
     if (YASM_VERSION_STRING VERSION_LESS "1.2.0")
70
         message(STATUS "Yasm version ${YASM_VERSION_STRING} is too old. 1.2.0 or later required")
71
         option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" OFF)
72
@@ -409,7 +430,7 @@
73
 add_subdirectory(encoder)
74
 add_subdirectory(common)
75
 
76
-if((MSVC_IDE OR XCODE) AND ENABLE_ASSEMBLY)
77
+if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
78
     # this is required because of this cmake bug
79
     # http://www.cmake.org/Bug/print_bug_page.php?bug_id=8170
80
     if(WIN32)
81
@@ -417,19 +438,36 @@
82
     else()
83
         set(SUFFIX o)
84
     endif()
85
-    foreach(ASM ${MSVC_ASMS})
86
-        set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM})
87
-        list(APPEND YASM_SRCS ${YASM_SRC})
88
-        list(APPEND YASM_OBJS ${ASM}.${SUFFIX})
89
-        add_custom_command(
90
-            OUTPUT ${ASM}.${SUFFIX}
91
-            COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${YASM_SRC} -o ${ASM}.${SUFFIX}
92
-            DEPENDS ${YASM_SRC})
93
-    endforeach()
94
+
95
+    if(ARM OR CROSS_COMPILE_ARM)
96
+    # compile ARM arch asm files here
97
+        enable_language(ASM)
98
+        foreach(ASM ${ARM_ASMS})
99
+            set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
100
+            list(APPEND ASM_SRCS ${ASM_SRC})
101
+            list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
102
+            add_custom_command(
103
+                OUTPUT ${ASM}.${SUFFIX}
104
+                COMMAND ${CMAKE_CXX_COMPILER}
105
+                ARGS ${ARM_ARGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
106
+                DEPENDS ${ASM_SRC})
107
+        endforeach()
108
+    elseif(X86)
109
+    # compile X86 arch asm files here
110
+        foreach(ASM ${MSVC_ASMS})
111
+            set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM})
112
+            list(APPEND ASM_SRCS ${ASM_SRC})
113
+            list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
114
+            add_custom_command(
115
+                OUTPUT ${ASM}.${SUFFIX}
116
+                COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${ASM_SRC} -o ${ASM}.${SUFFIX}
117
+                DEPENDS ${ASM_SRC})
118
+        endforeach()
119
+    endif()
120
 endif()
121
 
122
-source_group(ASM FILES ${YASM_SRCS})
123
-add_library(x265-static STATIC $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${YASM_OBJS} ${YASM_SRCS})
124
+source_group(ASM FILES ${ASM_SRCS})
125
+add_library(x265-static STATIC $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${ASM_OBJS} ${ASM_SRCS})
126
 if(NOT MSVC)
127
     set_target_properties(x265-static PROPERTIES OUTPUT_NAME x265)
128
 endif()
129
@@ -463,7 +501,7 @@
130
 
131
 option(ENABLE_SHARED "Build shared library" ON)
132
 if(ENABLE_SHARED)
133
-    add_library(x265-shared SHARED "${PROJECT_BINARY_DIR}/x265.def" ${YASM_OBJS}
134
+    add_library(x265-shared SHARED "${PROJECT_BINARY_DIR}/x265.def" ${ASM_OBJS}
135
                 ${X265_RC_FILE} $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common>)
136
     if(EXTRA_LIB)
137
         target_link_libraries(x265-shared ${EXTRA_LIB})
138
@@ -559,7 +597,7 @@
139
         # Xcode seems unable to link the CLI with libs, so link as one targget
140
         add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT}
141
                        x265.cpp x265.h x265cli.h x265-extras.h x265-extras.cpp
142
-                       $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${YASM_OBJS} ${YASM_SRCS})
143
+                       $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${ASM_OBJS} ${ASM_SRCS})
144
     else()
145
         add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT} ${X265_RC_FILE}
146
                        ${ExportDefs} x265.cpp x265.h x265cli.h x265-extras.h x265-extras.cpp)
147
@@ -587,3 +625,11 @@
148
         add_subdirectory(test)
149
     endif()
150
 endif()
151
+
152
+get_directory_property(hasParent PARENT_DIRECTORY)
153
+if(hasParent)
154
+    if(PLATFORM_LIBS)
155
+        LIST(REMOVE_DUPLICATES PLATFORM_LIBS)
156
+        set(PLATFORM_LIBS ${PLATFORM_LIBS} PARENT_SCOPE)
157
+    endif(PLATFORM_LIBS)
158
+endif(hasParent)
159
x265_2.0.tar.gz/source/cmake/FindNeon.cmake Added
12
 
1
@@ -0,0 +1,10 @@
2
+include(FindPackageHandleStandardArgs)
3
+
4
+# Detect NEON on the build host; chained COMMANDs form a real pipeline (no shell is used, so "|" would be a literal argument to cat)
5
+execute_process(COMMAND cat /proc/cpuinfo COMMAND grep Features COMMAND grep neon
6
+                OUTPUT_VARIABLE neon_version
7
+                ERROR_QUIET
8
+                OUTPUT_STRIP_TRAILING_WHITESPACE)
9
+if(neon_version)
10
+    set(CPU_HAS_NEON 1)
11
+endif()
12
x265_1.9.tar.gz/source/cmake/version.cmake -> x265_2.0.tar.gz/source/cmake/version.cmake Changed
68
 
1
@@ -52,39 +52,55 @@
2
         )
3
     execute_process(
4
         COMMAND
5
-        ${HG_EXECUTABLE} log -r. --template "{node|short}"
6
+        ${HG_EXECUTABLE} log -r. --template "{node}"
7
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
8
-        OUTPUT_VARIABLE HG_REVISION_ID
9
+        OUTPUT_VARIABLE X265_REVISION_ID
10
         ERROR_QUIET
11
         OUTPUT_STRIP_TRAILING_WHITESPACE
12
         )
13
+    string(SUBSTRING "${X265_REVISION_ID}" 0 12 X265_REVISION_ID)
14
 
15
     if(X265_LATEST_TAG MATCHES "^r")
16
         string(SUBSTRING ${X265_LATEST_TAG} 1 -1 X265_LATEST_TAG)
17
     endif()
18
-    if(X265_TAG_DISTANCE STREQUAL "0")
19
-        set(X265_VERSION "${X265_LATEST_TAG}")
20
-    else()
21
-        set(X265_VERSION "${X265_LATEST_TAG}+${X265_TAG_DISTANCE}-${HG_REVISION_ID}")
22
-    endif()
23
 elseif(GIT_EXECUTABLE AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/../.git)
24
     execute_process(
25
         COMMAND
26
-        ${GIT_EXECUTABLE} describe --tags --abbrev=0
27
+        ${GIT_EXECUTABLE} rev-list --tags --max-count=1
28
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
29
+        OUTPUT_VARIABLE X265_LATEST_TAG_COMMIT
30
+        ERROR_QUIET
31
+        OUTPUT_STRIP_TRAILING_WHITESPACE
32
+        )
33
+    execute_process(
34
+        COMMAND
35
+        ${GIT_EXECUTABLE} describe --tags ${X265_LATEST_TAG_COMMIT}
36
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
37
         OUTPUT_VARIABLE X265_LATEST_TAG
38
         ERROR_QUIET
39
         OUTPUT_STRIP_TRAILING_WHITESPACE
40
         )
41
-
42
     execute_process(
43
         COMMAND
44
-        ${GIT_EXECUTABLE} describe --tags
45
+        ${GIT_EXECUTABLE} rev-list ${X265_LATEST_TAG}.. --count --first-parent
46
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
47
-        OUTPUT_VARIABLE X265_VERSION
48
+        OUTPUT_VARIABLE X265_TAG_DISTANCE
49
         ERROR_QUIET
50
         OUTPUT_STRIP_TRAILING_WHITESPACE
51
         )
52
+    execute_process(
53
+        COMMAND
54
+        ${GIT_EXECUTABLE} log -1 --format=g%h
55
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
56
+        OUTPUT_VARIABLE X265_REVISION_ID
57
+        ERROR_QUIET
58
+        OUTPUT_STRIP_TRAILING_WHITESPACE
59
+        )
60
+endif()
61
+if(X265_TAG_DISTANCE STREQUAL "0")
62
+    set(X265_VERSION "${X265_LATEST_TAG}")
63
+else()
64
+    set(X265_VERSION "${X265_LATEST_TAG}+${X265_TAG_DISTANCE}-${X265_REVISION_ID}")
65
 endif()
66
 
67
 message(STATUS "x265 version ${X265_VERSION}")
68
x265_1.9.tar.gz/source/common/CMakeLists.txt -> x265_2.0.tar.gz/source/common/CMakeLists.txt Changed
49
 
1
@@ -16,12 +16,14 @@
2
 if(ENABLE_ASSEMBLY)
3
     set_source_files_properties(threading.cpp primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
4
     list(APPEND VFLAGS "-DENABLE_ASSEMBLY=1")
5
+endif(ENABLE_ASSEMBLY)
6
 
7
+if(ENABLE_ASSEMBLY AND X86)
8
     set(SSE3  vec/dct-sse3.cpp)
9
     set(SSSE3 vec/dct-ssse3.cpp)
10
     set(SSE41 vec/dct-sse41.cpp)
11
 
12
-    if(MSVC AND X86)
13
+    if(MSVC)
14
         set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41})
15
         set(WARNDISABLE "/wd4100") # unreferenced formal parameter
16
         if(INTEL_CXX)
17
@@ -38,7 +40,7 @@
18
             set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} /arch:SSE2")
19
         endif()
20
     endif()
21
-    if(GCC AND X86)
22
+    if(GCC)
23
         if(CLANG)
24
             # llvm intrinsic headers cause shadow warnings
25
             set(WARNDISABLE "-Wno-shadow -Wno-unused-parameter")
26
@@ -81,7 +83,21 @@
27
         set(ASM_PRIMITIVES ${ASM_PRIMITIVES} x86/${SRC})
28
     endforeach()
29
     source_group(Assembly FILES ${ASM_PRIMITIVES})
30
-endif(ENABLE_ASSEMBLY)
31
+endif(ENABLE_ASSEMBLY AND X86)
32
+
33
+if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
34
+    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
35
+
36
+    # add ARM assembly/intrinsic files here
37
+    set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
38
+    set(VEC_PRIMITIVES)
39
+
40
+    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
41
+    foreach(SRC ${C_SRCS})
42
+        set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
43
+    endforeach()
44
+    source_group(Assembly FILES ${ASM_PRIMITIVES})
45
+endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
46
 
47
 # set_target_properties can't do list expansion
48
 string(REPLACE ";" " " VERSION_FLAGS "${VFLAGS}")
49
x265_2.0.tar.gz/source/common/arm/asm-primitives.cpp Added
201
 
1
@@ -0,0 +1,1022 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Steve Borho <steve@borho.org>
6
+ *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
7
+ *          Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
8
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
9
+ *
10
+ * This program is free software; you can redistribute it and/or modify
11
+ * it under the terms of the GNU General Public License as published by
12
+ * the Free Software Foundation; either version 2 of the License, or
13
+ * (at your option) any later version.
14
+ *
15
+ * This program is distributed in the hope that it will be useful,
16
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
+ * GNU General Public License for more details.
19
+ *
20
+ * You should have received a copy of the GNU General Public License
21
+ * along with this program; if not, write to the Free Software
22
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
23
+ *
24
+ * This program is also available under a commercial proprietary license.
25
+ * For more information, contact us at license @ x265.com.
26
+ *****************************************************************************/
27
+
28
+#include "common.h"
29
+#include "primitives.h"
30
+#include "x265.h"
31
+#include "cpu.h"
32
+
33
+extern "C" {
34
+#include "blockcopy8.h"
35
+#include "pixel.h"
36
+#include "pixel-util.h"
37
+#include "ipfilter8.h"
38
+#include "dct8.h"
39
+}
40
+
41
+namespace X265_NS {
42
+// private x265 namespace
43
+
44
+void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
45
+{
46
+    if (cpuMask & X265_CPU_NEON)
47
+    {
48
+        // ssim_4x4x2_core
49
+        p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
50
+
51
+        // addAvg
52
+         p.pu[LUMA_4x4].addAvg   = PFX(addAvg_4x4_neon);
53
+         p.pu[LUMA_4x8].addAvg   = PFX(addAvg_4x8_neon);
54
+         p.pu[LUMA_4x16].addAvg  = PFX(addAvg_4x16_neon);
55
+         p.pu[LUMA_8x4].addAvg   = PFX(addAvg_8x4_neon);
56
+         p.pu[LUMA_8x8].addAvg   = PFX(addAvg_8x8_neon);
57
+         p.pu[LUMA_8x16].addAvg  = PFX(addAvg_8x16_neon);
58
+         p.pu[LUMA_8x32].addAvg  = PFX(addAvg_8x32_neon);
59
+         p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_neon);
60
+         p.pu[LUMA_16x4].addAvg  = PFX(addAvg_16x4_neon);
61
+         p.pu[LUMA_16x8].addAvg  = PFX(addAvg_16x8_neon);
62
+         p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_neon);
63
+         p.pu[LUMA_16x16].addAvg = PFX(addAvg_16x16_neon);
64
+         p.pu[LUMA_16x32].addAvg = PFX(addAvg_16x32_neon);
65
+         p.pu[LUMA_16x64].addAvg = PFX(addAvg_16x64_neon);
66
+         p.pu[LUMA_24x32].addAvg = PFX(addAvg_24x32_neon);
67
+         p.pu[LUMA_32x8].addAvg  = PFX(addAvg_32x8_neon);
68
+         p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_neon);
69
+         p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_neon);
70
+         p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_neon);
71
+         p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_neon);
72
+         p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_neon);
73
+         p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_neon);
74
+         p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_neon);
75
+         p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_neon);
76
+         p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_neon);
77
+
78
+        // chroma addAvg
79
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].addAvg   = PFX(addAvg_4x2_neon);
80
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].addAvg   = PFX(addAvg_4x4_neon);
81
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].addAvg   = PFX(addAvg_4x8_neon);
82
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].addAvg  = PFX(addAvg_4x16_neon);
83
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].addAvg   = PFX(addAvg_6x8_neon);
84
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg   = PFX(addAvg_8x2_neon);
85
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg   = PFX(addAvg_8x4_neon);
86
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg   = PFX(addAvg_8x6_neon);
87
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg   = PFX(addAvg_8x8_neon);
88
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg  = PFX(addAvg_8x16_neon);
89
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg  = PFX(addAvg_8x32_neon);
90
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = PFX(addAvg_12x16_neon);
91
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg  = PFX(addAvg_16x4_neon);
92
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg  = PFX(addAvg_16x8_neon);
93
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = PFX(addAvg_16x12_neon);
94
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg = PFX(addAvg_16x16_neon);
95
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg = PFX(addAvg_16x32_neon);
96
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].addAvg = PFX(addAvg_24x32_neon);
97
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg  = PFX(addAvg_32x8_neon);
98
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_neon);
99
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_neon);
100
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_neon);
101
+
102
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].addAvg   = PFX(addAvg_4x8_neon);
103
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].addAvg  = PFX(addAvg_4x16_neon);
104
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].addAvg  = PFX(addAvg_4x32_neon);
105
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].addAvg  = PFX(addAvg_6x16_neon);
106
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg   = PFX(addAvg_8x4_neon);
107
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg   = PFX(addAvg_8x8_neon);
108
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg  = PFX(addAvg_8x12_neon);
109
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg  = PFX(addAvg_8x16_neon);
110
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg  = PFX(addAvg_8x32_neon);
111
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg  = PFX(addAvg_8x64_neon);
112
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg = PFX(addAvg_12x32_neon);
113
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg  = PFX(addAvg_16x8_neon);
114
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg = PFX(addAvg_16x16_neon);
115
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg = PFX(addAvg_16x24_neon);
116
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg = PFX(addAvg_16x32_neon);
117
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg = PFX(addAvg_16x64_neon);
118
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg = PFX(addAvg_24x64_neon);
119
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_neon);
120
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_neon);
121
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_neon);
122
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_neon);
123
+
124
+        // quant
125
+         p.quant = PFX(quant_neon);
126
+         p.nquant = PFX(nquant_neon);
127
+
128
+        // dequant_scaling
129
+         p.dequant_scaling = PFX(dequant_scaling_neon);
130
+         p.dequant_normal  = PFX(dequant_normal_neon);
131
+
132
+        // luma satd
133
+         p.pu[LUMA_4x4].satd   = PFX(pixel_satd_4x4_neon);
134
+         p.pu[LUMA_4x8].satd   = PFX(pixel_satd_4x8_neon);
135
+         p.pu[LUMA_4x16].satd  = PFX(pixel_satd_4x16_neon);
136
+         p.pu[LUMA_8x4].satd   = PFX(pixel_satd_8x4_neon);
137
+         p.pu[LUMA_8x8].satd   = PFX(pixel_satd_8x8_neon);
138
+         p.pu[LUMA_8x16].satd  = PFX(pixel_satd_8x16_neon);
139
+         p.pu[LUMA_8x32].satd  = PFX(pixel_satd_8x32_neon);
140
+         p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_neon);
141
+         p.pu[LUMA_16x4].satd  = PFX(pixel_satd_16x4_neon);
142
+         p.pu[LUMA_16x8].satd  = PFX(pixel_satd_16x8_neon);
143
+         p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_neon);
144
+         p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_neon);
145
+         p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_neon);
146
+         p.pu[LUMA_24x32].satd = PFX(pixel_satd_24x32_neon);
147
+         p.pu[LUMA_32x8].satd  = PFX(pixel_satd_32x8_neon);
148
+         p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_neon);
149
+         p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_neon);
150
+         p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_neon);
151
+         p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_neon);
152
+         p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_neon);
153
+         p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_neon);
154
+         p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_neon);
155
+         p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_neon);
156
+         p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_neon);
157
+
158
+        // chroma satd
159
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd    = PFX(pixel_satd_4x4_neon);
160
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd    = PFX(pixel_satd_4x8_neon);
161
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd   = PFX(pixel_satd_4x16_neon);
162
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd    = PFX(pixel_satd_8x4_neon);
163
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd    = PFX(pixel_satd_8x8_neon);
164
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd   = PFX(pixel_satd_8x16_neon);
165
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd   = PFX(pixel_satd_8x32_neon);
166
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd  = PFX(pixel_satd_12x16_neon);
167
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd   = PFX(pixel_satd_16x4_neon);
168
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd   = PFX(pixel_satd_16x8_neon);
169
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd  = PFX(pixel_satd_16x12_neon);
170
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd  = PFX(pixel_satd_16x16_neon);
171
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd  = PFX(pixel_satd_16x32_neon);
172
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd  = PFX(pixel_satd_24x32_neon);
173
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd   = PFX(pixel_satd_32x8_neon);
174
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd  = PFX(pixel_satd_32x16_neon);
175
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd  = PFX(pixel_satd_32x24_neon);
176
+         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd  = PFX(pixel_satd_32x32_neon);
177
+
178
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd    = PFX(pixel_satd_4x4_neon);
179
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd    = PFX(pixel_satd_4x8_neon);
180
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd   = PFX(pixel_satd_4x16_neon);
181
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd   = PFX(pixel_satd_4x32_neon);
182
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd    = PFX(pixel_satd_8x4_neon);
183
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd    = PFX(pixel_satd_8x8_neon);
184
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd   = PFX(pixel_satd_8x12_neon);
185
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd   = PFX(pixel_satd_8x16_neon);
186
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd   = PFX(pixel_satd_8x32_neon);
187
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd   = PFX(pixel_satd_8x64_neon);
188
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd  = PFX(pixel_satd_12x32_neon);
189
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd   = PFX(pixel_satd_16x8_neon);
190
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd  = PFX(pixel_satd_16x16_neon);
191
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd  = PFX(pixel_satd_16x24_neon);
192
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd  = PFX(pixel_satd_16x32_neon);
193
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd  = PFX(pixel_satd_16x64_neon);
194
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd  = PFX(pixel_satd_24x64_neon);
195
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd  = PFX(pixel_satd_32x16_neon);
196
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd  = PFX(pixel_satd_32x32_neon);
197
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd  = PFX(pixel_satd_32x48_neon);
198
+         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd  = PFX(pixel_satd_32x64_neon);
199
+
200
+        // chroma_hpp
201
x265_2.0.tar.gz/source/common/arm/asm.S Added
196
 
1
@@ -0,0 +1,194 @@
2
+/*****************************************************************************
3
+ * asm.S: arm utility macros
4
+ *****************************************************************************
5
+ * Copyright (C) 2016 x265 project
6
+ *
7
+ * Authors: Mans Rullgard <mans@mansr.com>
8
+ *          David Conrad <lessen42@gmail.com>
9
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
10
+ *
11
+ * This program is free software; you can redistribute it and/or modify
12
+ * it under the terms of the GNU General Public License as published by
13
+ * the Free Software Foundation; either version 2 of the License, or
14
+ * (at your option) any later version.
15
+ *
16
+ * This program is distributed in the hope that it will be useful,
17
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19
+ * GNU General Public License for more details.
20
+ *
21
+ * You should have received a copy of the GNU General Public License
22
+ * along with this program; if not, write to the Free Software
23
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
24
+ *
25
+ * This program is also available under a commercial proprietary license.
26
+ * For more information, contact us at license @ x265.com.
27
+ *****************************************************************************/
28
+
29
+.syntax unified
30
+
31
+#if   HAVE_NEON
32
+        .arch           armv7-a
33
+#elif HAVE_ARMV6T2
34
+        .arch           armv6t2
35
+#elif HAVE_ARMV6
36
+        .arch           armv6
37
+#endif
38
+
39
+.fpu neon
40
+
41
+#ifdef PREFIX
42
+#   define EXTERN_ASM _
43
+#else
44
+#   define EXTERN_ASM
45
+#endif
46
+
47
+#ifdef __ELF__
48
+#   define ELF
49
+#else
50
+#   define ELF @
51
+#endif
52
+
53
+#if HAVE_AS_FUNC
54
+#   define FUNC
55
+#else
56
+#   define FUNC @
57
+#endif
58
+
59
+.macro require8, val=1
60
+ELF     .eabi_attribute 24, \val
61
+.endm
62
+
63
+.macro preserve8, val=1
64
+ELF     .eabi_attribute 25, \val
65
+.endm
66
+
67
+.macro function name, export=1
68
+    .macro endfunc
69
+ELF     .size   \name, . - \name
70
+FUNC    .endfunc
71
+        .purgem endfunc
72
+    .endm
73
+        .align  2
74
+.if \export == 1
75
+        .global EXTERN_ASM\name
76
+ELF     .hidden EXTERN_ASM\name
77
+ELF     .type   EXTERN_ASM\name, %function
78
+FUNC    .func   EXTERN_ASM\name
79
+EXTERN_ASM\name:
80
+.else
81
+ELF     .hidden \name
82
+ELF     .type   \name, %function
83
+FUNC    .func   \name
84
+\name:
85
+.endif
86
+.endm
87
+
88
+.macro movrel rd, val
89
+#if HAVE_ARMV6T2 && !defined(PIC)
90
+        movw            \rd, #:lower16:\val
91
+        movt            \rd, #:upper16:\val
92
+#else
93
+        ldr             \rd, =\val
94
+#endif
95
+.endm
96
+
97
+.macro movconst rd, val
98
+#if HAVE_ARMV6T2
99
+    movw        \rd, #:lower16:\val
100
+.if \val >> 16
101
+    movt        \rd, #:upper16:\val
102
+.endif
103
+#else
104
+    ldr         \rd, =\val
105
+#endif
106
+.endm
107
+
108
+#define GLUE(a, b) a ## b
109
+#define JOIN(a, b) GLUE(a, b)
110
+#define X(s) JOIN(EXTERN_ASM, s)
111
+
112
+#define FENC_STRIDE 64
113
+#define FDEC_STRIDE 32
114
+
115
+.macro HORIZ_ADD dest, a, b
116
+.ifnb \b
117
+    vadd.u16    \a, \a, \b
118
+.endif
119
+    vpaddl.u16  \a, \a
120
+    vpaddl.u32  \dest, \a
121
+.endm
122
+
123
+.macro SUMSUB_AB sum, diff, a, b
124
+    vadd.s16    \sum,  \a, \b
125
+    vsub.s16    \diff, \a, \b
126
+.endm
127
+
128
+.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
129
+    SUMSUB_AB   \s1, \d1, \a, \b
130
+    SUMSUB_AB   \s2, \d2, \c, \d
131
+.endm
132
+
133
+.macro ABS2 a b
134
+    vabs.s16 \a, \a
135
+    vabs.s16 \b, \b
136
+.endm
137
+
138
+// dist = distance in elements (0 for vertical pass, 1/2 for horizontal passes)
139
+// op = sumsub/amax (sum and diff / maximum of absolutes)
140
+// d1/2 = destination registers
141
+// s1/2 = source registers
142
+.macro HADAMARD dist, op, d1, d2, s1, s2
143
+.if \dist == 1
144
+    vtrn.16     \s1, \s2
145
+.else
146
+    vtrn.32     \s1, \s2
147
+.endif
148
+.ifc \op, sumsub
149
+    SUMSUB_AB   \d1, \d2, \s1, \s2
150
+.else
151
+    vabs.s16    \s1, \s1
152
+    vabs.s16    \s2, \s2
153
+    vmax.s16    \d1, \s1, \s2
154
+.endif
155
+.endm
156
+
157
+.macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7
158
+    vtrn.32         \r0, \r4
159
+    vtrn.32         \r1, \r5
160
+    vtrn.32         \r2, \r6
161
+    vtrn.32         \r3, \r7
162
+    vtrn.16         \r0, \r2
163
+    vtrn.16         \r1, \r3
164
+    vtrn.16         \r4, \r6
165
+    vtrn.16         \r5, \r7
166
+    vtrn.8          \r0, \r1
167
+    vtrn.8          \r2, \r3
168
+    vtrn.8          \r4, \r5
169
+    vtrn.8          \r6, \r7
170
+.endm
171
+
172
+.macro TRANSPOSE4x4 r0 r1 r2 r3
173
+    vtrn.16         \r0, \r2
174
+    vtrn.16         \r1, \r3
175
+    vtrn.8          \r0, \r1
176
+    vtrn.8          \r2, \r3
177
+.endm
178
+
179
+.macro TRANSPOSE4x4_16  r0, r1, r2, r3
180
+    vtrn.32     \r0, \r2            // r0 = [21 20 01 00], r2 = [23 22 03 02]
181
+    vtrn.32     \r1, \r3            // r1 = [31 30 11 10], r3 = [33 32 13 12]
182
+    vtrn.16     \r0, \r1            // r0 = [30 20 10 00], r1 = [31 21 11 01]
183
+    vtrn.16     \r2, \r3            // r2 = [32 22 12 02], r3 = [33 23 13 03]
184
+.endm
185
+
186
+.macro TRANSPOSE4x4x2_16  rA0, rA1, rA2, rA3, rB0, rB1, rB2, rB3
187
+    vtrn.32     \rA0, \rA2          // r0 = [21 20 01 00], r2 = [23 22 03 02]
188
+    vtrn.32     \rA1, \rA3          // r1 = [31 30 11 10], r3 = [33 32 13 12]
189
+    vtrn.32     \rB0, \rB2
190
+    vtrn.32     \rB1, \rB3
191
+    vtrn.16     \rA0, \rA1          // r0 = [30 20 10 00], r1 = [31 21 11 01]
192
+    vtrn.16     \rA2, \rA3          // r2 = [32 22 12 02], r3 = [33 23 13 03]
193
+    vtrn.16     \rB0, \rB1
194
+    vtrn.16     \rB2, \rB3
195
+.endm
196
x265_2.0.tar.gz/source/common/arm/blockcopy8.S Added
201
 
1
@@ -0,0 +1,838 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Radhakrishnan VR <radhakrishnan@multicorewareinc.com>
6
+ * 
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+
27
+.section .rodata
28
+
29
+.align 4
30
+
31
+.text
32
+
33
+/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
34
+ *
35
+ * r0   - a
36
+ * r1   - stridea
37
+ * r2   - b
38
+ * r3   - strideb */
39
+// 4x4 copy from int16_t source (r2, stride r3) to 8-bit destination (r0, stride r1).
+// Each 16-bit value is narrowed with vmovn, which keeps the low 8 bits (no saturation).
+// NOTE(review): every row load fetches a whole q register (8 int16_t) although only
+// 4 narrowed bytes are stored, so 8 bytes beyond each 4-element row are read -
+// confirm the source buffers always make that safe.
+function x265_blockcopy_sp_4x4_neon
40
+    lsl             r3, #1                  // strideb *= 2: int16_t elements -> bytes
41
+.rept 2
42
+    vld1.u16        {q0}, [r2], r3          // source row n
43
+    vld1.u16        {q1}, [r2], r3          // source row n + 1
44
+    vmovn.u16       d0, q0                  // d0 = row n narrowed to bytes
45
+    vmovn.u16       d1, q1                  // d1 = row n + 1 narrowed to bytes
46
+    vst1.u32        {d0[0]}, [r0], r1       // store the first 4 bytes of row n
47
+    vst1.u32        {d1[0]}, [r0], r1       // store the first 4 bytes of row n + 1
48
+.endr
49
+    bx              lr
50
+endfunc
51
+
52
+// 8x8 copy from int16_t source (r2, stride r3) to 8-bit destination (r0, stride r1).
+// Two rows are handled per repetition; vmovn keeps the low 8 bits of each value.
+function x265_blockcopy_sp_8x8_neon
53
+    lsl             r3, #1                  // strideb *= 2: int16_t elements -> bytes
54
+.rept 4
55
+    vld1.u16        {q0}, [r2], r3          // source row n (8 x int16_t)
56
+    vld1.u16        {q1}, [r2], r3          // source row n + 1
57
+    vmovn.u16       d0, q0                  // narrow row n to 8 bytes
58
+    vmovn.u16       d1, q1                  // narrow row n + 1 to 8 bytes
59
+    vst1.u8         {d0}, [r0], r1
60
+    vst1.u8         {d1}, [r0], r1
61
+.endr
62
+    bx              lr
63
+endfunc
64
+
65
+// 16x16 copy from int16_t source (r2, stride r3) to 8-bit destination (r0, stride r1).
+// Two rows are handled per repetition; vmovn keeps the low 8 bits of each value.
+function x265_blockcopy_sp_16x16_neon
66
+    lsl             r3, #1                  // strideb *= 2: int16_t elements -> bytes
67
+.rept 8
68
+    vld1.u16        {q0, q1}, [r2], r3      // source row n (16 x int16_t)
69
+    vld1.u16        {q2, q3}, [r2], r3      // source row n + 1
70
+    vmovn.u16       d0, q0                  // d0:d1 (= q0) collects narrowed row n
71
+    vmovn.u16       d1, q1
72
+    vmovn.u16       d2, q2                  // d2:d3 (= q1) collects narrowed row n + 1
73
+    vmovn.u16       d3, q3
74
+    vst1.u8         {q0}, [r0], r1
75
+    vst1.u8         {q1}, [r0], r1
76
+.endr
77
+    bx              lr
78
+endfunc
79
+
80
+// 32x32 copy from int16_t source (r2, stride r3) to 8-bit destination (r0, stride r1).
+// 4 loop passes x 4 repetitions x 2 rows = 32 rows; vmovn keeps the low 8 bits.
+function x265_blockcopy_sp_32x32_neon
81
+    mov             r12, #4                 // outer loop counter
82
+    lsl             r3, #1                  // strideb *= 2: int16_t elements -> bytes
83
+    sub             r3, #32                 // the first load of each row already advances r2 by 32 bytes
84
+loop_csp32:
85
+    subs            r12, #1                 // flags set here are consumed by the bne after .endr
86
+.rept 4
87
+    vld1.u16        {q0, q1}, [r2]!         // row n, first 16 values
88
+    vld1.u16        {q2, q3}, [r2], r3      // row n, last 16 values; step to the next row
89
+    vld1.u16        {q8, q9}, [r2]!         // row n + 1, first 16 values
90
+    vld1.u16        {q10, q11}, [r2], r3    // row n + 1, last 16 values; step to the next row
91
+
92
+    vmovn.u16       d0, q0                  // d0..d3 (= q0, q1) collect narrowed row n
93
+    vmovn.u16       d1, q1
94
+    vmovn.u16       d2, q2
95
+    vmovn.u16       d3, q3
96
+
97
+    vmovn.u16       d4, q8                  // d4..d7 (= q2, q3) collect narrowed row n + 1
98
+    vmovn.u16       d5, q9
99
+    vmovn.u16       d6, q10
100
+    vmovn.u16       d7, q11
101
+
102
+    vst1.u8         {q0, q1}, [r0], r1
103
+    vst1.u8         {q2, q3}, [r0], r1
104
+.endr
105
+    bne             loop_csp32
106
+    bx              lr
107
+endfunc
108
+
109
+// 64x64 copy from int16_t source (r2, stride r3) to 8-bit destination (r0, stride r1).
+// 16 loop passes x 4 repetitions x 1 row = 64 rows; vmovn keeps the low 8 bits.
+function x265_blockcopy_sp_64x64_neon
110
+    mov             r12, #16                // outer loop counter
111
+    lsl             r3, #1                  // strideb *= 2: int16_t elements -> bytes
112
+    sub             r3, #96                 // three post-incremented loads already advance r2 by 96 bytes
113
+    sub             r1, #32                 // the first store of each row already advances r0 by 32 bytes
114
+loop_csp64:
115
+    subs            r12, #1                 // flags set here are consumed by the bne after .endr
116
+.rept 4
117
+    vld1.u16        {q0, q1}, [r2]!         // values  0..15 of the row
118
+    vld1.u16        {q2, q3}, [r2]!         // values 16..31
119
+    vld1.u16        {q8, q9}, [r2]!         // values 32..47
120
+    vld1.u16        {q10, q11}, [r2], r3    // values 48..63; step to the next row
121
+
122
+    vmovn.u16       d0, q0                  // d0..d3 (= q0, q1) collect narrowed values 0..31
123
+    vmovn.u16       d1, q1
124
+    vmovn.u16       d2, q2
125
+    vmovn.u16       d3, q3
126
+
127
+    vmovn.u16       d4, q8                  // d4..d7 (= q2, q3) collect narrowed values 32..63
128
+    vmovn.u16       d5, q9
129
+    vmovn.u16       d6, q10
130
+    vmovn.u16       d7, q11
131
+
132
+    vst1.u8         {q0, q1}, [r0]!
133
+    vst1.u8         {q2, q3}, [r0], r1
134
+.endr
135
+    bne             loop_csp64
136
+    bx              lr
137
+endfunc
138
+
139
+// void blockcopy_ps(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
+//
+// r0 = a, r1 = stridea, r2 = b, r3 = strideb.
+// 4x4 copy from 8-bit source to int16_t destination; vmovl.u8 zero-extends each byte.
+// NOTE(review): every row load fetches 8 bytes although only 4 widened values are
+// stored, so 4 bytes beyond each 4-pixel row are read - confirm the source buffers
+// always make that safe.
+function x265_blockcopy_ps_4x4_neon
141
+    lsl             r1, #1                  // stridea *= 2: int16_t elements -> bytes
142
+.rept 2
143
+    vld1.u8         {d0}, [r2], r3          // source row n
144
+    vld1.u8         {d1}, [r2], r3          // source row n + 1
145
+    vmovl.u8        q1, d0                  // widen row n to 16 bits
146
+    vmovl.u8        q2, d1                  // widen row n + 1 to 16 bits
147
+    vst1.u16        {d2}, [r0], r1          // store the low 4 values of q1
148
+    vst1.u16        {d4}, [r0], r1          // store the low 4 values of q2
149
+.endr
150
+    bx              lr
151
+endfunc
152
+
153
+// 8x8 copy from 8-bit source (r2, stride r3) to int16_t destination (r0, stride r1).
+// Two rows are handled per repetition; vmovl.u8 zero-extends each byte to 16 bits.
+function x265_blockcopy_ps_8x8_neon
154
+    lsl             r1, #1                  // stridea *= 2: int16_t elements -> bytes
155
+.rept 4
156
+    vld1.u8         {d0}, [r2], r3          // source row n (8 bytes)
157
+    vld1.u8         {d1}, [r2], r3          // source row n + 1
158
+    vmovl.u8        q1, d0                  // widen row n
159
+    vmovl.u8        q2, d1                  // widen row n + 1
160
+    vst1.u16        {q1}, [r0], r1
161
+    vst1.u16        {q2}, [r0], r1
162
+.endr
163
+    bx              lr
164
+endfunc
165
+
166
+// 16x16 copy from 8-bit source (r2, stride r3) to int16_t destination (r0, stride r1).
+// Two rows are handled per repetition; vmovl.u8 zero-extends each byte to 16 bits.
+function x265_blockcopy_ps_16x16_neon
167
+    lsl             r1, #1                  // stridea *= 2: int16_t elements -> bytes
168
+.rept 8
169
+    vld1.u8         {q0}, [r2], r3          // source row n (16 bytes, d0:d1)
170
+    vld1.u8         {q1}, [r2], r3          // source row n + 1 (d2:d3)
171
+    vmovl.u8        q8, d0                  // row n, values 0..7
172
+    vmovl.u8        q9, d1                  // row n, values 8..15
173
+    vmovl.u8        q10, d2                 // row n + 1, values 0..7
174
+    vmovl.u8        q11, d3                 // row n + 1, values 8..15
175
+    vst1.u16        {q8, q9}, [r0], r1
176
+    vst1.u16        {q10, q11}, [r0], r1
177
+.endr
178
+    bx              lr
179
+endfunc
180
+
181
+function x265_blockcopy_ps_32x32_neon
182
+    lsl             r1, #1
183
+    sub             r1, #32
184
+    mov             r12, #4
185
+loop_cps32:
186
+    subs            r12, #1
187
+.rept 4
188
+    vld1.u8         {q0, q1}, [r2], r3
189
+    vld1.u8         {q2, q3}, [r2], r3
190
+    vmovl.u8        q8, d0
191
+    vmovl.u8        q9, d1
192
+    vmovl.u8        q10, d2
193
+    vmovl.u8        q11, d3
194
+
195
+    vmovl.u8        q12, d4
196
+    vmovl.u8        q13, d5
197
+    vmovl.u8        q14, d6
198
+    vmovl.u8        q15, d7
199
+
200
+    vst1.u16        {q8, q9}, [r0]!
201
x265_2.0.tar.gz/source/common/arm/blockcopy8.h Added
125
 
1
@@ -0,0 +1,123 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Steve Borho <steve@borho.org>
6
+ *          Min Chen <chenm003@163.com>
7
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
8
+ *
9
+ * This program is free software; you can redistribute it and/or modify
10
+ * it under the terms of the GNU General Public License as published by
11
+ * the Free Software Foundation; either version 2 of the License, or
12
+ * (at your option) any later version.
13
+ *
14
+ * This program is distributed in the hope that it will be useful,
15
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
+ * GNU General Public License for more details.
18
+ *
19
+ * You should have received a copy of the GNU General Public License
20
+ * along with this program; if not, write to the Free Software
21
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22
+ *
23
+ * This program is also available under a commercial proprietary license.
24
+ * For more information, contact us at license @ x265.com.
25
+ *****************************************************************************/
26
+
27
+#ifndef X265_BLOCKCOPY8_ARM_H
28
+#define X265_BLOCKCOPY8_ARM_H
29
+
30
+void x265_blockcopy_pp_16x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
31
+void x265_blockcopy_pp_8x4_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
32
+void x265_blockcopy_pp_8x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
33
+void x265_blockcopy_pp_8x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
34
+void x265_blockcopy_pp_8x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
35
+void x265_blockcopy_pp_12x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
36
+void x265_blockcopy_pp_4x4_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
37
+void x265_blockcopy_pp_4x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
38
+void x265_blockcopy_pp_4x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
39
+void x265_blockcopy_pp_16x4_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
40
+void x265_blockcopy_pp_16x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
41
+void x265_blockcopy_pp_16x12_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
42
+void x265_blockcopy_pp_16x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
43
+void x265_blockcopy_pp_16x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
44
+void x265_blockcopy_pp_24x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
45
+void x265_blockcopy_pp_32x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
46
+void x265_blockcopy_pp_32x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
47
+void x265_blockcopy_pp_32x24_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
48
+void x265_blockcopy_pp_32x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
49
+void x265_blockcopy_pp_32x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
50
+void x265_blockcopy_pp_48x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
51
+void x265_blockcopy_pp_64x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
52
+void x265_blockcopy_pp_64x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
53
+void x265_blockcopy_pp_64x48_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
54
+void x265_blockcopy_pp_64x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
55
+void x265_blockcopy_pp_2x4_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
56
+void x265_blockcopy_pp_2x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
57
+void x265_blockcopy_pp_2x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
58
+void x265_blockcopy_pp_6x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
59
+void x265_blockcopy_pp_6x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
60
+void x265_blockcopy_pp_8x2_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
61
+void x265_blockcopy_pp_8x6_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
62
+void x265_blockcopy_pp_8x12_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
63
+void x265_blockcopy_pp_8x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
64
+void x265_blockcopy_pp_12x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
65
+void x265_blockcopy_pp_4x2_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
66
+void x265_blockcopy_pp_4x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
67
+void x265_blockcopy_pp_16x24_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
68
+void x265_blockcopy_pp_24x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
69
+void x265_blockcopy_pp_32x48_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
70
+
71
+void x265_cpy2Dto1D_shr_4x4_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
72
+void x265_cpy2Dto1D_shr_8x8_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
73
+void x265_cpy2Dto1D_shr_16x16_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
74
+void x265_cpy2Dto1D_shr_32x32_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
75
+
76
+void x265_blockcopy_sp_4x4_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
77
+void x265_blockcopy_sp_8x8_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
78
+void x265_blockcopy_sp_16x16_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
79
+void x265_blockcopy_sp_32x32_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
80
+void x265_blockcopy_sp_64x64_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
81
+
82
+void x265_blockcopy_ps_4x4_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
83
+void x265_blockcopy_ps_8x8_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
84
+void x265_blockcopy_ps_16x16_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
85
+void x265_blockcopy_ps_32x32_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
86
+void x265_blockcopy_ps_64x64_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
87
+
88
+void x265_blockcopy_ss_4x4_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
89
+void x265_blockcopy_ss_8x8_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
90
+void x265_blockcopy_ss_16x16_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
91
+void x265_blockcopy_ss_32x32_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
92
+void x265_blockcopy_ss_64x64_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
93
+
94
+// chroma blockcopy
95
+void x265_blockcopy_ss_4x8_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
96
+void x265_blockcopy_ss_8x16_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
97
+void x265_blockcopy_ss_16x32_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
98
+void x265_blockcopy_ss_32x64_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
99
+
100
+void x265_blockcopy_sp_4x8_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
101
+void x265_blockcopy_sp_8x16_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
102
+void x265_blockcopy_sp_16x32_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
103
+void x265_blockcopy_sp_32x64_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
104
+
105
+void x265_blockcopy_ps_4x8_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
106
+void x265_blockcopy_ps_8x16_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
107
+void x265_blockcopy_ps_16x32_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
108
+void x265_blockcopy_ps_32x64_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
109
+
110
+void x265_blockfill_s_4x4_neon(int16_t* dst, intptr_t dstride, int16_t val);
111
+void x265_blockfill_s_8x8_neon(int16_t* dst, intptr_t dstride, int16_t val);
112
+void x265_blockfill_s_16x16_neon(int16_t* dst, intptr_t dstride, int16_t val);
113
+void x265_blockfill_s_32x32_neon(int16_t* dst, intptr_t dstride, int16_t val);
114
+
115
+uint32_t x265_copy_cnt_4_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
116
+uint32_t x265_copy_cnt_8_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
117
+uint32_t x265_copy_cnt_16_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
118
+uint32_t x265_copy_cnt_32_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
119
+
120
+int x265_count_nonzero_4_neon(const int16_t* quantCoeff);
121
+int x265_count_nonzero_8_neon(const int16_t* quantCoeff);
122
+int x265_count_nonzero_16_neon(const int16_t* quantCoeff);
123
+int x265_count_nonzero_32_neon(const int16_t* quantCoeff);
124
+#endif // ifndef X265_BLOCKCOPY8_ARM_H
125
x265_2.0.tar.gz/source/common/arm/cpu-a.S Added
111
 
1
@@ -0,0 +1,109 @@
2
+/*****************************************************************************
3
+ * cpu-a.S: arm cpu detection
4
+ *****************************************************************************
5
+ * Copyright (C) 2016 x265 project
6
+ *
7
+ * Authors: David Conrad <lessen42@gmail.com>
8
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
9
+ *
10
+ * This program is free software; you can redistribute it and/or modify
11
+ * it under the terms of the GNU General Public License as published by
12
+ * the Free Software Foundation; either version 2 of the License, or
13
+ * (at your option) any later version.
14
+ *
15
+ * This program is distributed in the hope that it will be useful,
16
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
+ * GNU General Public License for more details.
19
+ *
20
+ * You should have received a copy of the GNU General Public License
21
+ * along with this program; if not, write to the Free Software
22
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
23
+ *
24
+ * This program is also available under a commercial proprietary license.
25
+ * For more information, contact us at license @ x265.com.
26
+ *****************************************************************************/
27
+
28
+#include "asm.S"
29
+
30
+.align 2
31
+
32
+// Executes one NEON instruction and returns.
+// NOTE(review): presumably the caller installs a fault handler and treats a trap
+// here as "no NEON" - that logic is not visible in this file.
+// done in gas because .fpu neon overrides the refusal to assemble
33
+// instructions the selected -march/-mcpu doesn't support
34
+function x265_cpu_neon_test
35
+    vadd.i16    q0, q0, q0
36
+    bx          lr
37
+endfunc
38
+
39
+// Turns on the performance counters and the cycle counter.
+// PMNC is only rewritten when the counters were off on entry; the cycle-counter
+// enable bit is written unconditionally.
+// return: 0 on success
40
+//         1 if counters were already enabled
41
+//         9 if lo-res counters were already enabled
42
+function x265_cpu_enable_armv7_counter, export=0
43
+    mrc         p15, 0, r2, c9, c12, 0      // read PMNC
44
+    ands        r0, r2, #1                  // r0 = enable bit; Z is set when counters were off
45
+    andne       r0, r2, #9                  // already on: return enable bit plus bit 3 (lo-res)
46
+
47
+    orr         r2, r2, #1                  // enable counters
48
+    bic         r2, r2, #8                  // full resolution
49
+    mcreq       p15, 0, r2, c9, c12, 0      // write PMNC (only when counters were off; orr/bic leave the flags alone)
50
+    mov         r2, #1 << 31                // enable cycle counter
51
+    mcr         p15, 0, r2, c9, c12, 1      // write CNTENS
52
+    bx          lr
53
+endfunc
54
+
55
+// Clears the enable bit (bit 0) in PMNC; every other PMNC bit is written back unchanged.
+// Clobbers r0.
+function x265_cpu_disable_armv7_counter, export=0
56
+    mrc         p15, 0, r0, c9, c12, 0      // read PMNC
57
+    bic         r0, r0, #1                  // disable counters
58
+    mcr         p15, 0, r0, c9, c12, 0      // write PMNC
59
+    bx          lr
60
+endfunc
61
+
62
+
63
+// READ_TIME: reads coprocessor register p15 c9/c13/0 into the given ARM register.
+// NOTE(review): on ARMv7 this encoding is the PMU cycle count register (PMCCNTR) -
+// confirm against the architecture manual.
+.macro READ_TIME r
64
+    mrc         p15, 0, \r, c9, c13, 0
65
+.endm
66
+
67
+// Times NEON -> ARM register transfers with the cycle counter.
+// return: 0 if neon -> arm transfers take more than 10 cycles
68
+//         nonzero otherwise
69
+// Also returns 0 straight away when the c9/c14/0 read below yields 0.
+function x265_cpu_fast_neon_mrc_test
70
+    // check for user access to performance counters
71
+    mrc         p15, 0, r0, c9, c14, 0
72
+    cmp         r0, #0
73
+    bxeq        lr                          // r0 is 0 here, so the function returns 0
74
+
75
+    push        {r4-r6,lr}
76
+    bl          x265_cpu_enable_armv7_counter   // r0 = 0, 1 or 9 (kept until the end)
77
+    ands        r1, r0, #8                  // was the lo-res bit (bit 3) reported?
78
+    mov         r3, #0                      // r3 accumulates the accepted cycle deltas
79
+    mov         ip, #4                      // ip counts down once per delta added to r3
80
+    mov         r6, #4                      // r6 counts down once per pass of average_loop
81
+    moveq       r5, #1                      // inner repeat count: 1 ...
82
+    movne       r5, #64                     // ... or 64 when the lo-res bit was set
83
+
84
+average_loop:
85
+    mov         r4, r5
86
+    READ_TIME   r1                          // start timestamp
87
+1:  subs        r4, r4, #1                  // flags set here are consumed by the bgt after .endr
88
+.rept 8
89
+    vmov.u32    lr, d0[0]                   // the NEON -> ARM transfer being timed
90
+    add         lr, lr, lr                  // uses the transferred value
91
+.endr
92
+    bgt         1b
93
+    READ_TIME   r2                          // end timestamp
94
+
95
+    subs        r6, r6, #1
96
+    sub         r2, r2, r1                  // r2 = elapsed count
97
+    cmpgt       r2, #30 << 3    // assume context switch if it took over 30 cycles
98
+    addle       r3, r3, r2                  // accept the sample
99
+    subsle      ip, ip, #1
100
+    bgt         average_loop
101
+
102
+    // disable counters if we enabled them
103
+    ands        r0, r0, #1
104
+    bleq        x265_cpu_disable_armv7_counter
105
+
106
+    lsr         r0, r3, #5                  // r0 = r3 / 32 (presumably 4 samples x 8 transfers - confirm)
107
+    cmp         r0, #10
108
+    movgt       r0, #0                      // more than 10: report 0
109
+    pop         {r4-r6,pc}
110
+endfunc
111
x265_2.0.tar.gz/source/common/arm/dct-a.S Added
201
 
1
@@ -0,0 +1,900 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Min Chen <chenm003@163.com>
6
+ * 
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+
27
+.section .rodata
28
+
29
+.align 4
30
+
31
+.text
32
+
33
+.align 4
34
+
35
+//        dst[0 * line] = ((64 * E[0] + 64 * E[1] + add) >> shift);
36
+//        dst[2 * line] = ((64 * E[0] - 64 * E[1] + add) >> shift);
37
+//        dst[1 * line] = ((83 * O[0] + 36 * O[1] + add) >> shift);
38
+//        dst[3 * line] = ((36 * O[0] - 83 * O[1] + add) >> shift);
39
+
40
+// 4x4 forward DCT in two passes (DCT-1D, then DCT-2D).
+// r0 = src, r1 = dst, r2 = srcStride, following the prototype comment below.
+// Register comments list lanes from highest to lowest; "rc" is row r, column c.
+// The first pass narrows with a rounding shift of 1, the second with a rounding shift of 8.
+/* void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride) */
41
+function x265_dct_4x4_neon
42
+    mov             r2, r2, lsl #1                          // srcStride *= 2: int16_t elements -> bytes
43
+    vld1.16         {d0}, [r0, :64], r2                     // d0  = [03 02 01 00]
44
+    vld1.16         {d1}, [r0, :64], r2                     // d1  = [13 12 11 10]
45
+    vld1.16         {d2}, [r0, :64], r2                     // d2  = [23 22 21 20]
46
+    vld1.16         {d3}, [r0, :64]                         // d3  = [33 32 31 30]
47
+
48
+    vtrn.32         q0, q1                                  // q0  = [31 30 11 10 21 20 01 00], q1 = [33 32 13 12 23 22 03 02]
49
+    vrev32.16       q1, q1                                  // q1  = [32 33 12 13 22 23 02 03]
50
+
51
+    movconst        r0, 0x00240053                          // halfwords [ 36  83]
52
+    movconst        r2, 0xFFAD0024                          // halfwords [-83  36]
53
+
54
+    // DCT-1D
55
+    vadd.s16        q2, q0, q1                              // q2  = [E31 E30 E11 E10 E21 E20 E01 E00]
56
+    vsub.s16        q3, q0, q1                              // q3  = [O31 O30 O11 O10 O21 O20 O01 O00]
57
+    vdup.32         d16, r0                                 // d16 = [ 36  83]
58
+    vdup.32         d17, r2                                 // d17 = [-83  36]
59
+    vtrn.16         d4, d5                                  // d4  = [E30 E20 E10 E00], d5 = [E31 E21 E11 E01]
60
+    vtrn.32         d6, d7                                  // q3  = [O31 O30 O21 O20 O11 O10 O01 O00]
61
+
62
+    vmull.s16       q9, d6, d16
63
+    vmull.s16       q10, d7, d16                            // [q9, q10] = [ 36*O1 83*O0] -> [1]
64
+    vmull.s16       q11, d6, d17
65
+    vmull.s16       q12, d7, d17                            // [q11,q12] = [-83*O1 36*O0] -> [3]
66
+
67
+    vadd.s16        d0, d4, d5                              // d0 = [E0 + E1]
68
+    vsub.s16        d1, d4, d5                              // d1 = [E0 - E1]
69
+
70
+    vpadd.s32       d18, d18, d19                           // q9  = [1]
71
+    vpadd.s32       d19, d20, d21
72
+    vpadd.s32       d20, d22, d23                           // q10 = [3]
73
+    vpadd.s32       d21, d24, d25
74
+
75
+    vshll.s16       q1, d0, #6                              // q1  = 64 * [0]
76
+    vshll.s16       q2, d1, #6                              // q2  = 64 * [2]
77
+
78
+    // TODO: Dynamic Range is 11+6-1 bits
79
+    vqrshrn.s32     d25, q9, 1                              // d25 = R[13 12 11 10]
80
+    vqrshrn.s32     d24, q1, 1                              // d24 = R[03 02 01 00]
81
+    vqrshrn.s32     d26, q2, 1                              // d26 = R[23 22 21 20]
82
+    vqrshrn.s32     d27, q10, 1                             // d27 = R[33 32 31 30]
83
+
84
+
85
+    // DCT-2D
86
+    vmovl.s16       q0, d16                                // q0  = [ 36  83] widened to 32 bits: d0[0] = 83, d0[1] = 36
87
+
88
+    vtrn.32         q12, q13                                // q12 = [31 30 11 10 21 20 01 00], q13 = [33 32 13 12 23 22 03 02]
89
+    vrev32.16       q13, q13                                // q13 = [32 33 12 13 22 23 02 03]
90
+
91
+    vaddl.s16       q1, d24, d26                            // q1  = [E21 E20 E01 E00]
92
+    vaddl.s16       q2, d25, d27                            // q2  = [E31 E30 E11 E10]
93
+    vsubl.s16       q3, d24, d26                            // q3  = [O21 O20 O01 O00]
94
+    vsubl.s16       q8, d25, d27                            // q8  = [O31 O30 O11 O10]
95
+
96
+    vtrn.32         q1, q2                                  // q1  = [E30 E20 E10 E00], q2  = [E31 E21 E11 E01]
97
+    vtrn.32         q3, q8                                  // q3  = [O30 O20 O10 O00], q8  = [O31 O21 O11 O01]
98
+
99
+    vmul.s32        q9, q3, d0[0]                           // q9  = [83*O30 83*O20 83*O10 83*O00]
100
+    vmul.s32        q10, q8, d0[1]                          // q10 = [36*O31 36*O21 36*O11 36*O01]
101
+    vmul.s32        q11, q3, d0[1]                          // q11 = [36*O30 36*O20 36*O10 36*O00]
102
+    vmul.s32        q12, q8, d0[0]                          // q12 = [83*O31 83*O21 83*O11 83*O01]
103
+
104
+    vadd.s32        q0, q1, q2                              // q0 = [E0 + E1] (the constants in d0 are no longer needed)
105
+    vsub.s32        q1, q1, q2                              // q1 = [E0 - E1]
106
+
107
+    vadd.s32        q9, q9, q10                             // q9  = 83*O0 + 36*O1 -> [1]
108
+    vsub.s32        q10, q11, q12                           // q10 = 36*O0 - 83*O1 -> [3]
109
+
110
+    vshl.s32        q0, q0, #6                              // q0  = 64 * [0]
111
+    vshl.s32        q1, q1, #6                              // q1  = 64 * [2]
112
+
113
+    vqrshrn.s32     d25, q9, 8                              // d25 = R[13 12 11 10]
114
+    vqrshrn.s32     d27, q10, 8                             // d27 = R[33 32 31 30]
115
+
116
+    vqrshrn.s32     d24, q0, 8                              // d24 = R[03 02 01 00]
117
+    vqrshrn.s32     d26, q1, 8                              // d26 = R[23 22 21 20]
118
+
119
+    vst1.16         {d24-d27}, [r1]                         // write all 16 results contiguously to dst
120
+
121
+    bx              lr
122
+endfunc
123
+
124
+/* uses registers q8 - q11 for temp values */
+// tr4: 4-point butterfly on four registers of 32-bit lanes; results replace the inputs.
+// Going by the per-line comments, d0[0] must hold 83 and d0[1] must hold 36.
+//   r0 <- 64 * (EE0 + EE1)        r1 <- 83 * EO0 + 36 * EO1
+//   r2 <- 64 * (EE0 - EE1)        r3 <- 36 * EO0 - 83 * EO1
125
+.macro tr4 r0, r1, r2, r3
126
+    vsub.s32    q8, \r0, \r3    // EO0
127
+    vadd.s32    q9, \r0, \r3    // EE0
128
+    vadd.s32    q10, \r1, \r2   // EE1
129
+    vsub.s32    q11, \r1, \r2   // EO1
130
+
131
+    vmul.s32    \r1, q8, d0[0]  // 83 * EO0
132
+    vmul.s32    \r3, q8, d0[1]  // 36 * EO0
133
+    vshl.s32    q9, q9, #6      // 64 * EE0
134
+    vshl.s32    q10, q10, #6    // 64 * EE1
135
+    vmla.s32    \r1, q11, d0[1] // 83 * EO0 + 36 * EO1
136
+    vmls.s32    \r3, q11, d0[0] // 36 * EO0 - 83 * EO1
137
+    vadd.s32    \r0, q9, q10    // 64 * (EE0 + EE1)
138
+    vsub.s32    \r2, q9, q10    // 64 * (EE0 - EE1)
139
+.endm
140
+
141
+
142
+// tr8: forms four weighted sums of the inputs r0..r3 (32-bit lanes) in q12..q15.
+// The inputs are only read. Going by the per-line comments, the scalars must be
+// d1[1] = 89, d1[0] = 75, d2[1] = 50 and d2[0] = 18.
+//   q12 = 89*r0 + 75*r1 + 50*r2 + 18*r3
+//   q13 = 75*r0 - 18*r1 - 89*r2 - 50*r3
+//   q14 = 50*r0 - 89*r1 + 18*r2 + 75*r3
+//   q15 = 18*r0 - 50*r1 + 75*r2 - 89*r3
+.macro tr8 r0, r1, r2, r3
143
+    vmul.s32  q12, \r0, d1[1]   //  89 * src1
144
+    vmul.s32  q13, \r0, d1[0]   //  75 * src1
145
+    vmul.s32  q14, \r0, d2[1]   //  50 * src1
146
+    vmul.s32  q15, \r0, d2[0]   //  18 * src1
147
+
148
+    vmla.s32  q12, \r1, d1[0]   //  75 * src3
149
+    vmls.s32  q13, \r1, d2[0]   // -18 * src3
150
+    vmls.s32  q14, \r1, d1[1]   // -89 * src3
151
+    vmls.s32  q15, \r1, d2[1]   // -50 * src3
152
+
153
+    vmla.s32  q12, \r2, d2[1]   //  50 * src5
154
+    vmls.s32  q13, \r2, d1[1]   // -89 * src5
155
+    vmla.s32  q14, \r2, d2[0]   //  18 * src5
156
+    vmla.s32  q15, \r2, d1[0]   //  75 * src5
157
+
158
+    vmla.s32  q12, \r3, d2[0]   //  18 * src7
159
+    vmls.s32  q13, \r3, d2[1]   // -50 * src7
160
+    vmla.s32  q14, \r3, d1[0]   //  75 * src7
161
+    vmls.s32  q15, \r3, d1[1]   // -89 * src7
162
+.endm
163
+
164
+
165
+// TODO: the DCT-2D stage spends 4x8=32 load/store operations because there is no temporary buffer
166
+/* void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride) */
167
+function x265_dct_8x8_neon
168
+    vpush {q4-q7}
169
+
170
+    mov r2, r2, lsl #1
171
+
172
+    adr r3, ctr4
173
+    vld1.16 {d0-d2}, [r3]
174
+    mov r3, r1
175
+
176
+    // DCT-1D
177
+    // top half
178
+    vld1.16 {q12}, [r0], r2
179
+    vld1.16 {q13}, [r0], r2
180
+    vld1.16 {q14}, [r0], r2
181
+    vld1.16 {q15}, [r0], r2
182
+
183
+    TRANSPOSE4x4x2_16 d24, d26, d28, d30,  d25, d27, d29, d31
184
+
185
+    // |--|
186
+    // |24|
187
+    // |26|
188
+    // |28|
189
+    // |30|
190
+    // |25|
191
+    // |27|
192
+    // |29|
193
+    // |31|
194
+    // |--|
195
+
196
+    vaddl.s16 q4, d28, d27
197
+    vaddl.s16 q5, d30, d25
198
+    vaddl.s16 q2, d24, d31
199
+    vaddl.s16 q3, d26, d29
200
+
201
x265_2.0.tar.gz/source/common/arm/dct8.h Added
34
 
1
@@ -0,0 +1,32 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Min Chen <chenm003@163.com>
6
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
7
+ *
8
+ * This program is free software; you can redistribute it and/or modify
9
+ * it under the terms of the GNU General Public License as published by
10
+ * the Free Software Foundation; either version 2 of the License, or
11
+ * (at your option) any later version.
12
+ *
13
+ * This program is distributed in the hope that it will be useful,
14
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
+ * GNU General Public License for more details.
17
+ *
18
+ * You should have received a copy of the GNU General Public License
19
+ * along with this program; if not, write to the Free Software
20
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21
+ *
22
+ * This program is also available under a commercial proprietary license.
23
+ * For more information, contact us at license @ x265.com.
24
+ *****************************************************************************/
25
+
26
+#ifndef X265_DCT8_ARM_H
27
+#define X265_DCT8_ARM_H
28
+
+// Forward DCT entry points implemented in NEON assembly.
+// src/dst/srcStride mirror the dct4_c/dct8_c prototypes quoted in dct-a.S.
+// NOTE(review): PFX() is defined outside this header; presumably it adds the
+// x265_ symbol prefix used by the assembly labels - confirm.
29
+void PFX(dct_4x4_neon)(const int16_t* src, int16_t* dst, intptr_t srcStride);
30
+void PFX(dct_8x8_neon)(const int16_t* src, int16_t* dst, intptr_t srcStride);
31
+void PFX(dct_16x16_neon)(const int16_t* src, int16_t* dst, intptr_t srcStride);
32
+
33
+#endif // ifndef X265_DCT8_ARM_H
34
x265_2.0.tar.gz/source/common/arm/intrapred.h Added
33
 
1
@@ -0,0 +1,31 @@
2
+/*****************************************************************************
3
+ * intrapred.h: Intra Prediction metrics
4
+ *****************************************************************************
5
+ * Copyright (C) 2003-2013 x264 project
6
+ *
7
+ * Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
8
+ *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
9
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
10
+ *
11
+ * This program is free software; you can redistribute it and/or modify
12
+ * it under the terms of the GNU General Public License as published by
13
+ * the Free Software Foundation; either version 2 of the License, or
14
+ * (at your option) any later version.
15
+ *
16
+ * This program is distributed in the hope that it will be useful,
17
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19
+ * GNU General Public License for more details.
20
+ *
21
+ * You should have received a copy of the GNU General Public License
22
+ * along with this program; if not, write to the Free Software
23
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
24
+ *
25
+ * This program is also available under a commercial proprietary license.
26
+ * For more information, contact us at license @ x265.com.
27
+ *****************************************************************************/
28
+
29
+#ifndef X265_INTRAPRED_ARM_H
30
+#define X265_INTRAPRED_ARM_H
31
+
32
+#endif // ifndef X265_INTRAPRED_ARM_H
33
x265_2.0.tar.gz/source/common/arm/ipfilter8.S Added
201
 
1
@@ -0,0 +1,3341 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Dnyaneshwar G <dnyaneshwar@multicorewareinc.com>
6
+ *          Radhakrishnan VR <radhakrishnan@multicorewareinc.com>
7
+ *          Min Chen <min.chen@multicorewareinc.com>
8
+ * 
9
+ * This program is free software; you can redistribute it and/or modify
10
+ * it under the terms of the GNU General Public License as published by
11
+ * the Free Software Foundation; either version 2 of the License, or
12
+ * (at your option) any later version.
13
+ *
14
+ * This program is distributed in the hope that it will be useful,
15
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
+ * GNU General Public License for more details.
18
+ *
19
+ * You should have received a copy of the GNU General Public License
20
+ * along with this program; if not, write to the Free Software
21
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22
+ *
23
+ * This program is also available under a commercial proprietary license.
24
+ * For more information, contact us at license @ x265.com.
25
+ *****************************************************************************/
26
+
27
+#include "asm.S"
28
+
29
+.section .rodata
30
+.align 4
31
+
32
+// Luma interpolation coefficient table: 4 rows of 16 words each.
+// Every coefficient is stored twice in a row, so each row holds 8 distinct
+// tap values; the taps of every row sum to 64.
+// NOTE(review): rows are presumably selected by the coeffIdx argument of the
+// interp_8tap routines - confirm against the code that reads this table.
+g_lumaFilter:
33
+.word 0,0,0,0,0,0,64,64,0,0,0,0,0,0,0,0
34
+.word -1,-1,4,4,-10,-10,58,58,17,17,-5,-5,1,1,0,0
35
+.word -1,-1,4,4,-11,-11,40,40,40,40,-11,-11,4,4,-1,-1
36
+.word 0,0,1,1,-5,-5,17,17,58,58,-10,-10,4,4,-1,-1 
37
+// Chroma interpolation coefficient table: 8 rows of 8 words each.
+// Every coefficient is stored twice in a row, so each row holds 4 distinct
+// tap values; the taps of every row sum to 64.
+// NOTE(review): rows are presumably selected by the coeffIdx argument of the
+// interp_4tap routines - confirm against the code that reads this table.
+g_chromaFilter:
38
+.word 0, 0, 64, 64, 0, 0, 0, 0
39
+.word -2, -2, 58, 58, 10, 10, -2, -2
40
+.word -4, -4, 54, 54, 16, 16, -2, -2
41
+.word -6, -6, 46, 46, 28, 28, -4, -4
42
+.word -4, -4, 36, 36, 36, 36, -4 ,-4
43
+.word -4, -4, 28, 28, 46, 46, -6, -6
44
+.word -2, -2, 16, 16, 54, 54, -4 ,-4
45
+.word -2, -2, 10, 10, 58, 58, -2, -2
46
+
47
+
48
+.text
49
+
50
+// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
+// 4x4 variant: writes dst = (src << 6) - 8192 as 16-bit values for four rows
+// of four pixels. r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride.
+// Assumes pixel is one byte wide (rows are loaded as 32 bits and widened
+// from u8) - the pixel typedef is not visible here, TODO confirm.
51
+function x265_filterPixelToShort_4x4_neon
52
+    vld1.u32    {d0[]}, [r0], r1        // row 0 (4 bytes) into all lanes of d0
53
+    vld1.u32    {d0[1]}, [r0], r1       // row 1 replaces the upper lane of d0
54
+    vld1.u32    {d1[]}, [r0], r1        // row 2 into all lanes of d1
55
+    vld1.u32    {d1[1]}, [r0], r1       // row 3 replaces the upper lane of d1
56
+
57
+    // avoid load pipeline stall
58
+    vmov.i16    q1, #0xE000             // 0xE000 is -8192 as a signed 16-bit value
59
+
60
+    vshll.u8    q2, d0, #6              // widen rows 0-1 to 16 bits, multiply by 64
61
+    vshll.u8    q3, d1, #6              // widen rows 2-3 to 16 bits, multiply by 64
62
+    vadd.i16    q2, q1                  // add the -8192 offset
63
+    vadd.i16    q3, q1
64
+
65
+    add         r3, r3                  // double dstStride; presumably elements -> bytes for int16_t, confirm
66
+    vst1.16     {d4}, [r2], r3          // one output row (4 x int16) per store
67
+    vst1.16     {d5}, [r2], r3
68
+    vst1.16     {d6}, [r2], r3
69
+    vst1.16     {d7}, [r2], r3
70
+
71
+    bx          lr
72
+endfunc
73
+
74
+// 4x8 variant of filterPixelToShort: writes dst = src * 64 - 8192 as 16-bit
+// values, 4 outputs per row for 8 rows.
+// r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride.
+// Assumes pixel is one byte wide (loads are .u8) - TODO confirm.
+function x265_filterPixelToShort_4x8_neon
75
+    add         r3, r3                  // double dstStride; presumably elements -> bytes for int16_t, confirm
76
+    vmov.u16    q8, #64                 // multiplier: 64 in every 16-bit lane
77
+    vmov.u16    q9, #8192
78
+    vneg.s16    q9, q9                  // q9 = -8192 in every lane (the offset)
79
+    // Body handles two rows per repetition: 4 repetitions = 8 rows.
+.rept 4
80
+    vld1.u8     {d0}, [r0], r1          // loads 8 bytes per row although only 4 results are stored
81
+    vld1.u8     {d2}, [r0], r1
82
+    vmovl.u8    q0, d0                  // widen u8 -> u16
83
+    vmovl.u8    q1, d2
84
+    vmov        q2, q9                  // seed accumulators with the offset
85
+    vmov        q3, q9
86
+    vmla.s16    q2, q0, q8              // q2 = -8192 + pixel * 64
87
+    vmla.s16    q3, q1, q8
88
+    vst1.16     {d4}, [r2], r3          // store only the low 4 lanes (4 x int16)
89
+    vst1.16     {d6}, [r2], r3
90
+.endr
91
+    bx          lr
92
+endfunc
93
+
94
+// 4x16 variant of filterPixelToShort: writes dst = src * 64 - 8192 as 16-bit
+// values, 4 outputs per row for 16 rows.
+// r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride.
+// Assumes pixel is one byte wide (loads are .u8) - TODO confirm.
+function x265_filterPixelToShort_4x16_neon
95
+    add         r3, r3                  // double dstStride; presumably elements -> bytes for int16_t, confirm
96
+    vmov.u16    q8, #64                 // multiplier: 64 in every 16-bit lane
97
+    vmov.u16    q9, #8192
98
+    vneg.s16    q9, q9                  // q9 = -8192 in every lane (the offset)
99
+    // Body handles two rows per repetition: 8 repetitions = 16 rows.
+.rept 8
100
+    vld1.u8     {d0}, [r0], r1          // loads 8 bytes per row although only 4 results are stored
101
+    vld1.u8     {d2}, [r0], r1
102
+    vmovl.u8    q0, d0                  // widen u8 -> u16
103
+    vmovl.u8    q1, d2
104
+    vmov        q2, q9                  // seed accumulators with the offset
105
+    vmov        q3, q9
106
+    vmla.s16    q2, q0, q8              // q2 = -8192 + pixel * 64
107
+    vmla.s16    q3, q1, q8
108
+    vst1.16     {d4}, [r2], r3          // store only the low 4 lanes (4 x int16)
109
+    vst1.16     {d6}, [r2], r3
110
+.endr
111
+    bx          lr
112
+endfunc
113
+
114
+// 8x4 variant of filterPixelToShort: writes dst = src * 64 - 8192 as 16-bit
+// values, 8 outputs per row for 4 rows.
+// r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride.
+// Assumes pixel is one byte wide (loads are .u8) - TODO confirm.
+function x265_filterPixelToShort_8x4_neon
115
+    add         r3, r3                  // double dstStride; presumably elements -> bytes for int16_t, confirm
116
+    vmov.u16    q8, #64                 // multiplier: 64 in every 16-bit lane
117
+    vmov.u16    q9, #8192
118
+    vneg.s16    q9, q9                  // q9 = -8192 in every lane (the offset)
119
+    // Body handles two rows per repetition: 2 repetitions = 4 rows.
+.rept 2
120
+    vld1.u8     {d0}, [r0], r1          // 8 source bytes per row
121
+    vld1.u8     {d2}, [r0], r1
122
+    vmovl.u8    q0, d0                  // widen u8 -> u16
123
+    vmovl.u8    q1, d2
124
+    vmov        q2, q9                  // seed accumulators with the offset
125
+    vmov        q3, q9
126
+    vmla.s16    q2, q0, q8              // q2 = -8192 + pixel * 64
127
+    vmla.s16    q3, q1, q8
128
+    vst1.16     {q2}, [r2], r3          // store all 8 lanes (8 x int16)
129
+    vst1.16     {q3}, [r2], r3
130
+.endr
131
+    bx          lr
132
+endfunc
133
+
134
+// 8x8 variant of filterPixelToShort: writes dst = src * 64 - 8192 as 16-bit
+// values, 8 outputs per row for 8 rows.
+// r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride.
+// Assumes pixel is one byte wide (loads are .u8) - TODO confirm.
+function x265_filterPixelToShort_8x8_neon
135
+    add         r3, r3                  // double dstStride; presumably elements -> bytes for int16_t, confirm
136
+    vmov.u16    q8, #64                 // multiplier: 64 in every 16-bit lane
137
+    vmov.u16    q9, #8192
138
+    vneg.s16    q9, q9                  // q9 = -8192 in every lane (the offset)
139
+    // Body handles two rows per repetition: 4 repetitions = 8 rows.
+.rept 4
140
+    vld1.u8     {d0}, [r0], r1          // 8 source bytes per row
141
+    vld1.u8     {d2}, [r0], r1
142
+    vmovl.u8    q0, d0                  // widen u8 -> u16
143
+    vmovl.u8    q1, d2
144
+    vmov        q2, q9                  // seed accumulators with the offset
145
+    vmov        q3, q9
146
+    vmla.s16    q2, q0, q8              // q2 = -8192 + pixel * 64
147
+    vmla.s16    q3, q1, q8
148
+    vst1.16     {q2}, [r2], r3          // store all 8 lanes (8 x int16)
149
+    vst1.16     {q3}, [r2], r3
150
+.endr
151
+    bx          lr
152
+endfunc
153
+
154
+// 8x16 variant of filterPixelToShort: writes dst = src * 64 - 8192 as 16-bit
+// values, 8 outputs per row for 16 rows.
+// r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride.
+// Assumes pixel is one byte wide (loads are .u8) - TODO confirm.
+function x265_filterPixelToShort_8x16_neon
155
+    add         r3, r3                  // double dstStride; presumably elements -> bytes for int16_t, confirm
156
+    vmov.u16    q8, #64                 // multiplier: 64 in every 16-bit lane
157
+    vmov.u16    q9, #8192
158
+    vneg.s16    q9, q9                  // q9 = -8192 in every lane (the offset)
159
+    // Body handles two rows per repetition: 8 repetitions = 16 rows.
+.rept 8
160
+    vld1.u8     {d0}, [r0], r1          // 8 source bytes per row
161
+    vld1.u8     {d2}, [r0], r1
162
+    vmovl.u8    q0, d0                  // widen u8 -> u16
163
+    vmovl.u8    q1, d2
164
+    vmov        q2, q9                  // seed accumulators with the offset
165
+    vmov        q3, q9
166
+    vmla.s16    q2, q0, q8              // q2 = -8192 + pixel * 64
167
+    vmla.s16    q3, q1, q8
168
+    vst1.16     {q2}, [r2], r3          // store all 8 lanes (8 x int16)
169
+    vst1.16     {q3}, [r2], r3
170
+.endr
171
+    bx          lr
172
+endfunc
173
+
174
+// 8x32 variant of filterPixelToShort: writes dst = src * 64 - 8192 as 16-bit
+// values, 8 outputs per row for 32 rows.
+// r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride.
+// Assumes pixel is one byte wide (loads are .u8) - TODO confirm.
+function x265_filterPixelToShort_8x32_neon
175
+    add         r3, r3                  // double dstStride; presumably elements -> bytes for int16_t, confirm
176
+    vmov.u16    q8, #64                 // multiplier: 64 in every 16-bit lane
177
+    vmov.u16    q9, #8192
178
+    vneg.s16    q9, q9                  // q9 = -8192 in every lane (the offset)
179
+    // Body handles two rows per repetition: 16 repetitions = 32 rows.
+.rept 16
180
+    vld1.u8     {d0}, [r0], r1          // 8 source bytes per row
181
+    vld1.u8     {d2}, [r0], r1
182
+    vmovl.u8    q0, d0                  // widen u8 -> u16
183
+    vmovl.u8    q1, d2
184
+    vmov        q2, q9                  // seed accumulators with the offset
185
+    vmov        q3, q9
186
+    vmla.s16    q2, q0, q8              // q2 = -8192 + pixel * 64
187
+    vmla.s16    q3, q1, q8
188
+    vst1.16     {q2}, [r2], r3          // store all 8 lanes (8 x int16)
189
+    vst1.16     {q3}, [r2], r3
190
+.endr
191
+    bx          lr
192
+endfunc
193
+
194
+function x265_filterPixelToShort_12x16_neon
195
+    add         r3, r3
196
+    vmov.u16    q8, #64
197
+    vmov.u16    q9, #8192
198
+    vneg.s16    q9, q9
199
+.rept 16
200
+    vld1.u8     {d2-d3}, [r0], r1
201
x265_2.0.tar.gz/source/common/arm/ipfilter8.h Added
201
 
1
@@ -0,0 +1,342 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Steve Borho <steve@borho.org>
6
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
7
+ *
8
+ * This program is free software; you can redistribute it and/or modify
9
+ * it under the terms of the GNU General Public License as published by
10
+ * the Free Software Foundation; either version 2 of the License, or
11
+ * (at your option) any later version.
12
+ *
13
+ * This program is distributed in the hope that it will be useful,
14
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
+ * GNU General Public License for more details.
17
+ *
18
+ * You should have received a copy of the GNU General Public License
19
+ * along with this program; if not, write to the Free Software
20
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21
+ *
22
+ * This program is also available under a commercial proprietary license.
23
+ * For more information, contact us at license @ x265.com.
24
+ *****************************************************************************/
25
+
26
+#ifndef X265_IPFILTER8_ARM_H
27
+#define X265_IPFILTER8_ARM_H
28
+
29
+void x265_filterPixelToShort_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
30
+void x265_filterPixelToShort_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
31
+void x265_filterPixelToShort_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
32
+void x265_filterPixelToShort_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
33
+void x265_filterPixelToShort_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
34
+void x265_filterPixelToShort_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
35
+void x265_filterPixelToShort_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
36
+void x265_filterPixelToShort_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
37
+void x265_filterPixelToShort_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
38
+void x265_filterPixelToShort_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
39
+void x265_filterPixelToShort_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
40
+void x265_filterPixelToShort_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
41
+void x265_filterPixelToShort_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
42
+void x265_filterPixelToShort_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
43
+void x265_filterPixelToShort_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
44
+void x265_filterPixelToShort_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
45
+void x265_filterPixelToShort_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
46
+void x265_filterPixelToShort_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
47
+void x265_filterPixelToShort_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
48
+void x265_filterPixelToShort_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
49
+void x265_filterPixelToShort_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
50
+void x265_filterPixelToShort_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
51
+void x265_filterPixelToShort_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
52
+void x265_filterPixelToShort_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
53
+void x265_filterPixelToShort_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
54
+
55
+void x265_interp_8tap_vert_pp_4x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
56
+void x265_interp_8tap_vert_pp_4x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
57
+void x265_interp_8tap_vert_pp_4x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
58
+void x265_interp_8tap_vert_pp_8x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
59
+void x265_interp_8tap_vert_pp_8x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
60
+void x265_interp_8tap_vert_pp_8x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
61
+void x265_interp_8tap_vert_pp_8x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
62
+void x265_interp_8tap_vert_pp_16x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
63
+void x265_interp_8tap_vert_pp_16x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
64
+void x265_interp_8tap_vert_pp_16x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
65
+void x265_interp_8tap_vert_pp_16x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
66
+void x265_interp_8tap_vert_pp_16x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
67
+void x265_interp_8tap_vert_pp_16x12_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
68
+void x265_interp_8tap_vert_pp_32x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
69
+void x265_interp_8tap_vert_pp_32x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
70
+void x265_interp_8tap_vert_pp_32x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
71
+void x265_interp_8tap_vert_pp_32x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
72
+void x265_interp_8tap_vert_pp_32x24_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
73
+void x265_interp_8tap_vert_pp_64x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
74
+void x265_interp_8tap_vert_pp_64x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
75
+void x265_interp_8tap_vert_pp_64x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
76
+void x265_interp_8tap_vert_pp_64x48_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
77
+void x265_interp_8tap_vert_pp_24x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
78
+void x265_interp_8tap_vert_pp_48x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
79
+void x265_interp_8tap_vert_pp_12x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
80
+
81
+void x265_interp_8tap_vert_sp_4x4_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
82
+void x265_interp_8tap_vert_sp_4x8_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
83
+void x265_interp_8tap_vert_sp_4x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
84
+void x265_interp_8tap_vert_sp_8x4_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
85
+void x265_interp_8tap_vert_sp_8x8_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
86
+void x265_interp_8tap_vert_sp_8x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
87
+void x265_interp_8tap_vert_sp_8x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
88
+void x265_interp_8tap_vert_sp_16x4_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
89
+void x265_interp_8tap_vert_sp_16x8_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
90
+void x265_interp_8tap_vert_sp_16x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
91
+void x265_interp_8tap_vert_sp_16x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
92
+void x265_interp_8tap_vert_sp_16x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
93
+void x265_interp_8tap_vert_sp_16x12_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
94
+void x265_interp_8tap_vert_sp_32x8_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
95
+void x265_interp_8tap_vert_sp_32x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
96
+void x265_interp_8tap_vert_sp_32x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
97
+void x265_interp_8tap_vert_sp_32x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
98
+void x265_interp_8tap_vert_sp_32x24_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
99
+void x265_interp_8tap_vert_sp_64x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
100
+void x265_interp_8tap_vert_sp_64x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
101
+void x265_interp_8tap_vert_sp_64x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
102
+void x265_interp_8tap_vert_sp_64x48_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
103
+void x265_interp_8tap_vert_sp_24x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
104
+void x265_interp_8tap_vert_sp_48x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
105
+void x265_interp_8tap_vert_sp_12x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
106
+
107
+void x265_interp_8tap_vert_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
108
+void x265_interp_8tap_vert_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
109
+void x265_interp_8tap_vert_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
110
+void x265_interp_8tap_vert_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
111
+void x265_interp_8tap_vert_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
112
+void x265_interp_8tap_vert_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
113
+void x265_interp_8tap_vert_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
114
+void x265_interp_8tap_vert_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
115
+void x265_interp_8tap_vert_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
116
+void x265_interp_8tap_vert_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
117
+void x265_interp_8tap_vert_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
118
+void x265_interp_8tap_vert_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
119
+void x265_interp_8tap_vert_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
120
+void x265_interp_8tap_vert_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
121
+void x265_interp_8tap_vert_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
122
+void x265_interp_8tap_vert_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
123
+void x265_interp_8tap_vert_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
124
+void x265_interp_8tap_vert_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
125
+void x265_interp_8tap_vert_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
126
+void x265_interp_8tap_vert_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
127
+void x265_interp_8tap_vert_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
128
+void x265_interp_8tap_vert_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
129
+void x265_interp_8tap_vert_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
130
+void x265_interp_8tap_vert_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
131
+void x265_interp_8tap_vert_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
132
+
133
+void x265_interp_4tap_vert_pp_8x2_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
134
+void x265_interp_4tap_vert_pp_8x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
135
+void x265_interp_4tap_vert_pp_8x6_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
136
+void x265_interp_4tap_vert_pp_8x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
137
+void x265_interp_4tap_vert_pp_8x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
138
+void x265_interp_4tap_vert_pp_8x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
139
+void x265_interp_4tap_vert_pp_8x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
140
+void x265_interp_4tap_vert_pp_8x12_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
141
+void x265_interp_4tap_vert_pp_16x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
142
+void x265_interp_4tap_vert_pp_16x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
143
+void x265_interp_4tap_vert_pp_16x12_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
144
+void x265_interp_4tap_vert_pp_16x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
145
+void x265_interp_4tap_vert_pp_16x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
146
+void x265_interp_4tap_vert_pp_16x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
147
+void x265_interp_4tap_vert_pp_16x24_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
148
+void x265_interp_4tap_vert_pp_32x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
149
+void x265_interp_4tap_vert_pp_32x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
150
+void x265_interp_4tap_vert_pp_32x24_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
151
+void x265_interp_4tap_vert_pp_32x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
152
+void x265_interp_4tap_vert_pp_32x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
153
+void x265_interp_4tap_vert_pp_32x48_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
154
+void x265_interp_4tap_vert_pp_24x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
155
+void x265_interp_4tap_vert_pp_24x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
156
+void x265_interp_4tap_vert_pp_48x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
157
+void x265_interp_4tap_vert_pp_64x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
158
+void x265_interp_4tap_vert_pp_64x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
159
+void x265_interp_4tap_vert_pp_64x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
160
+void x265_interp_4tap_vert_pp_64x48_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
161
+
162
+void x265_interp_4tap_vert_ps_8x2_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
163
+void x265_interp_4tap_vert_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
164
+void x265_interp_4tap_vert_ps_8x6_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
165
+void x265_interp_4tap_vert_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
166
+void x265_interp_4tap_vert_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
167
+void x265_interp_4tap_vert_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
168
+void x265_interp_4tap_vert_ps_8x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
169
+void x265_interp_4tap_vert_ps_8x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
170
+void x265_interp_4tap_vert_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
171
+void x265_interp_4tap_vert_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
172
+void x265_interp_4tap_vert_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
173
+void x265_interp_4tap_vert_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
174
+void x265_interp_4tap_vert_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
175
+void x265_interp_4tap_vert_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
176
+void x265_interp_4tap_vert_ps_16x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
177
+void x265_interp_4tap_vert_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
178
+void x265_interp_4tap_vert_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
179
+void x265_interp_4tap_vert_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
180
+void x265_interp_4tap_vert_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
181
+void x265_interp_4tap_vert_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
182
+void x265_interp_4tap_vert_ps_32x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
183
+void x265_interp_4tap_vert_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
184
+void x265_interp_4tap_vert_ps_24x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
185
+void x265_interp_4tap_vert_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
186
+void x265_interp_4tap_vert_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
187
+void x265_interp_4tap_vert_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
188
+void x265_interp_4tap_vert_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
189
+void x265_interp_4tap_vert_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
190
+
191
+void x265_interp_4tap_vert_sp_8x2_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
192
+void x265_interp_4tap_vert_sp_8x4_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
193
+void x265_interp_4tap_vert_sp_8x6_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
194
+void x265_interp_4tap_vert_sp_8x8_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
195
+void x265_interp_4tap_vert_sp_8x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
196
+void x265_interp_4tap_vert_sp_8x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
197
+void x265_interp_4tap_vert_sp_8x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
198
+void x265_interp_4tap_vert_sp_8x12_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
199
+void x265_interp_4tap_vert_sp_16x4_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
200
+void x265_interp_4tap_vert_sp_16x8_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
201
x265_2.0.tar.gz/source/common/arm/loopfilter.h Added
31
 
1
@@ -0,0 +1,29 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
6
+ *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
7
+ *          Min Chen <chenm003@163.com>
8
+ *
9
+ * This program is free software; you can redistribute it and/or modify
10
+ * it under the terms of the GNU General Public License as published by
11
+ * the Free Software Foundation; either version 2 of the License, or
12
+ * (at your option) any later version.
13
+ *
14
+ * This program is distributed in the hope that it will be useful,
15
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
+ * GNU General Public License for more details.
18
+ *
19
+ * You should have received a copy of the GNU General Public License
20
+ * along with this program; if not, write to the Free Software
21
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22
+ *
23
+ * This program is also available under a commercial proprietary license.
24
+ * For more information, contact us at license @ x265.com.
25
+ *****************************************************************************/
26
+
27
+#ifndef X265_LOOPFILTER_ARM_H
28
+#define X265_LOOPFILTER_ARM_H
29
+
30
+#endif // ifndef X265_LOOPFILTER_ARM_H
31
x265_2.0.tar.gz/source/common/arm/mc-a.S Added
201
 
1
@@ -0,0 +1,1172 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
6
+ *          Radhakrishnan <radhakrishnan@multicorewareinc.com>
7
+ *
8
+ * This program is free software; you can redistribute it and/or modify
9
+ * it under the terms of the GNU General Public License as published by
10
+ * the Free Software Foundation; either version 2 of the License, or
11
+ * (at your option) any later version.
12
+ *
13
+ * This program is distributed in the hope that it will be useful,
14
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
+ * GNU General Public License for more details.
17
+ *
18
+ * You should have received a copy of the GNU General Public License
19
+ * along with this program; if not, write to the Free Software
20
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21
+ *
22
+ * This program is also available under a commercial proprietary license.
23
+ * For more information, contact us at license @ x265.com.
24
+ *****************************************************************************/
25
+
26
+#include "asm.S"
27
+
28
+.section .rodata
29
+
30
+.align 4
31
+
32
+.text
33
+
34
+/* blockcopy_pp_16x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
35
+ *
36
+ * r0   - dst
37
+ * r1   - dstStride
38
+ * r2   - src
39
+ * r3   - srcStride */
40
+function x265_blockcopy_pp_16x16_neon
41
+.rept 16
42
+    vld1.8          {q0}, [r2]
43
+    vst1.8          {q0}, [r0]
44
+    add             r2, r2, r3
45
+    add             r0, r0, r1
46
+.endr
47
+    bx              lr
48
+endfunc
49
+
50
+.macro blockcopy_pp_4xN_neon h
51
+function x265_blockcopy_pp_4x\h\()_neon
52
+.rept \h
53
+    ldr             r12, [r2], r3
54
+    str             r12, [r0], r1
55
+.endr
56
+    bx              lr
57
+endfunc
58
+.endm
59
+
60
+blockcopy_pp_4xN_neon 4
61
+blockcopy_pp_4xN_neon 8
62
+blockcopy_pp_4xN_neon 16
63
+blockcopy_pp_4xN_neon 2
64
+blockcopy_pp_4xN_neon 32
65
+
66
+.macro blockcopy_pp_16xN_neon h
67
+function x265_blockcopy_pp_16x\h\()_neon
68
+.rept \h    
69
+    vld1.8          {q0}, [r2], r3
70
+    vst1.8          {q0}, [r0], r1
71
+.endr
72
+    bx              lr
73
+endfunc
74
+.endm
75
+
76
+blockcopy_pp_16xN_neon 4
77
+blockcopy_pp_16xN_neon 8
78
+blockcopy_pp_16xN_neon 12
79
+blockcopy_pp_16xN_neon 24
80
+
81
+.macro blockcopy_pp_16xN1_neon h i
82
+function x265_blockcopy_pp_16x\h\()_neon
83
+    mov             r12, #\i
84
+loop_16x\h\():
85
+.rept 8
86
+    vld1.8          {q0}, [r2], r3
87
+    vst1.8          {q0}, [r0], r1
88
+.endr
89
+    subs            r12, r12, #1
90
+    bne             loop_16x\h
91
+    bx              lr
92
+endfunc
93
+.endm
94
+
95
+blockcopy_pp_16xN1_neon 32 4
96
+blockcopy_pp_16xN1_neon 64 8
97
+
98
+.macro blockcopy_pp_8xN_neon h
99
+function x265_blockcopy_pp_8x\h\()_neon
100
+.rept \h    
101
+    vld1.8          {d0}, [r2], r3
102
+    vst1.8          {d0}, [r0], r1
103
+.endr
104
+    bx              lr
105
+endfunc
106
+.endm
107
+
108
+blockcopy_pp_8xN_neon 4
109
+blockcopy_pp_8xN_neon 8
110
+blockcopy_pp_8xN_neon 16
111
+blockcopy_pp_8xN_neon 32
112
+blockcopy_pp_8xN_neon 2
113
+blockcopy_pp_8xN_neon 6
114
+blockcopy_pp_8xN_neon 12
115
+
116
+function x265_blockcopy_pp_12x16_neon
117
+    sub             r3, #8
118
+    sub             r1, #8
119
+.rept 16
120
+    vld1.8          {d0}, [r2]!
121
+    ldr             r12, [r2], r3
122
+    vst1.8          {d0}, [r0]!
123
+    str             r12, [r0], r1
124
+.endr
125
+    bx              lr
126
+endfunc
127
+
128
+function x265_blockcopy_pp_24x32_neon
129
+    mov             r12, #4
130
+loop_24x32:
131
+.rept 8
132
+    vld1.8          {d0, d1, d2}, [r2], r3
133
+    vst1.8          {d0, d1, d2}, [r0], r1
134
+.endr
135
+    subs            r12, r12, #1
136
+    bne             loop_24x32
137
+    bx              lr
138
+endfunc
139
+
140
+function x265_blockcopy_pp_32x8_neon
141
+.rept 8
142
+    vld1.8          {q0, q1}, [r2], r3
143
+    vst1.8          {q0, q1}, [r0], r1
144
+.endr 
145
+    bx              lr
146
+endfunc
147
+
148
+.macro blockcopy_pp_32xN_neon h i
149
+function x265_blockcopy_pp_32x\h\()_neon
150
+    mov             r12, #\i
151
+loop_32x\h\():
152
+.rept 8
153
+    vld1.8          {q0, q1}, [r2], r3
154
+    vst1.8          {q0, q1}, [r0], r1
155
+.endr
156
+    subs            r12, r12, #1
157
+    bne             loop_32x\h
158
+    bx              lr
159
+endfunc
160
+.endm
161
+
162
+blockcopy_pp_32xN_neon 16 2
163
+blockcopy_pp_32xN_neon 24 3
164
+blockcopy_pp_32xN_neon 32 4
165
+blockcopy_pp_32xN_neon 64 8
166
+blockcopy_pp_32xN_neon 48 6
167
+
168
+function x265_blockcopy_pp_48x64_neon
169
+    mov             r12, #8
170
+    sub             r3, #32
171
+    sub             r1, #32
172
+loop_48x64:
173
+.rept 8
174
+    vld1.8          {q0, q1}, [r2]!
175
+    vld1.8          {q2}, [r2], r3
176
+    vst1.8          {q0, q1}, [r0]!
177
+    vst1.8          {q2}, [r0], r1
178
+.endr
179
+    subs            r12, r12, #1
180
+    bne             loop_48x64
181
+    bx              lr
182
+endfunc
183
+
184
+.macro blockcopy_pp_64xN_neon h i
185
+function x265_blockcopy_pp_64x\h\()_neon
186
+    mov             r12, #\i
187
+    sub             r3, #32
188
+    sub             r1, #32
189
+loop_64x\h\():
190
+.rept 4
191
+    vld1.8          {q0, q1}, [r2]!
192
+    vld1.8          {q2, q3}, [r2], r3
193
+    vst1.8          {q0, q1}, [r0]!
194
+    vst1.8          {q2, q3}, [r0], r1
195
+.endr
196
+    subs            r12, r12, #1
197
+    bne             loop_64x\h
198
+    bx              lr
199
+endfunc
200
+.endm
201
x265_2.0.tar.gz/source/common/arm/mc.h Added
29
 
1
@@ -0,0 +1,27 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Steve Borho <steve@borho.org>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_MC_ARM_H
26
+#define X265_MC_ARM_H
27
+
28
+#endif // ifndef X265_MC_ARM_H
29
x265_2.0.tar.gz/source/common/arm/pixel-util.S Added
201
 
1
@@ -0,0 +1,2451 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Dnyaneshwar G <dnyaneshwar@multicorewareinc.com>
6
+ *          Radhakrishnan VR <radhakrishnan@multicorewareinc.com>
7
+ *          Min Chen <min.chen@multicorewareinc.com>
8
+ * 
9
+ * This program is free software; you can redistribute it and/or modify
10
+ * it under the terms of the GNU General Public License as published by
11
+ * the Free Software Foundation; either version 2 of the License, or
12
+ * (at your option) any later version.
13
+ *
14
+ * This program is distributed in the hope that it will be useful,
15
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
+ * GNU General Public License for more details.
18
+ *
19
+ * You should have received a copy of the GNU General Public License
20
+ * along with this program; if not, write to the Free Software
21
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22
+ *
23
+ * This program is also available under a commercial proprietary license.
24
+ * For more information, contact us at license @ x265.com.
25
+ *****************************************************************************/
26
+
27
+#include "asm.S"
28
+
29
+.section .rodata
30
+
31
+.align 4
32
+
33
+
34
+.text
35
+
36
+.macro VAR_SQR_SUM qsqr_sum, qsqr_last, qsqr_temp, dsrc, num=0, vpadal=vpadal.u16
37
+    vmull.u8        \qsqr_temp, \dsrc, \dsrc
38
+    vaddw.u8        q\num, q\num, \dsrc
39
+    \vpadal         \qsqr_sum, \qsqr_last
40
+.endm
41
+
42
+function x265_pixel_var_8x8_neon
43
+    vld1.u8         {d16}, [r0], r1
44
+    vmull.u8        q1, d16, d16
45
+    vmovl.u8        q0, d16
46
+    vld1.u8         {d18}, [r0], r1
47
+    vmull.u8        q2, d18, d18
48
+    vaddw.u8        q0, q0, d18
49
+
50
+    vld1.u8         {d20}, [r0], r1
51
+    VAR_SQR_SUM     q1, q1, q3, d20, 0, vpaddl.u16
52
+    vld1.u8         {d22}, [r0], r1
53
+    VAR_SQR_SUM     q2, q2, q8, d22, 0, vpaddl.u16
54
+
55
+    vld1.u8         {d24}, [r0], r1
56
+    VAR_SQR_SUM     q1, q3, q9, d24
57
+    vld1.u8         {d26}, [r0], r1
58
+    VAR_SQR_SUM     q2, q8, q10, d26
59
+    vld1.u8         {d24}, [r0], r1
60
+    VAR_SQR_SUM     q1, q9, q14, d24
61
+    vld1.u8         {d26}, [r0], r1
62
+    VAR_SQR_SUM     q2, q10, q15, d26
63
+
64
+    vpaddl.u16      q8, q14
65
+    vpaddl.u16      q9, q15
66
+    vadd.u32        q1, q1, q8
67
+    vadd.u16        d0, d0, d1
68
+    vadd.u32        q1, q1, q9
69
+    vadd.u32        q1, q1, q2
70
+    vpaddl.u16      d0, d0
71
+    vadd.u32        d2, d2, d3
72
+    vpadd.u32       d0, d0, d2
73
+
74
+    vmov            r0, r1, d0
75
+    bx              lr
76
+endfunc
77
+
78
+function x265_pixel_var_16x16_neon
79
+    veor.u8         q0, q0
80
+    veor.u8         q1, q1
81
+    veor.u8         q2, q2
82
+    veor.u8         q14, q14
83
+    veor.u8         q15, q15
84
+    mov             ip, #4
85
+
86
+.var16_loop:
87
+    subs            ip, ip, #1
88
+    vld1.u8         {q8}, [r0], r1
89
+    VAR_SQR_SUM     q1, q14, q12, d16
90
+    VAR_SQR_SUM     q2, q15, q13, d17
91
+
92
+    vld1.u8         {q9}, [r0], r1
93
+    VAR_SQR_SUM     q1, q12, q14, d18
94
+    VAR_SQR_SUM     q2, q13, q15, d19
95
+
96
+    vld1.u8         {q8}, [r0], r1
97
+    VAR_SQR_SUM     q1, q14, q12, d16
98
+    VAR_SQR_SUM     q2, q15, q13, d17
99
+
100
+    vld1.u8         {q9}, [r0], r1
101
+    VAR_SQR_SUM     q1, q12, q14, d18
102
+    VAR_SQR_SUM     q2, q13, q15, d19
103
+    bgt             .var16_loop
104
+
105
+    vpaddl.u16      q8, q14
106
+    vpaddl.u16      q9, q15
107
+    vadd.u32        q1, q1, q8
108
+    vadd.u16        d0, d0, d1
109
+    vadd.u32        q1, q1, q9
110
+    vadd.u32        q1, q1, q2
111
+    vpaddl.u16      d0, d0
112
+    vadd.u32        d2, d2, d3
113
+    vpadd.u32       d0, d0, d2
114
+
115
+    vmov            r0, r1, d0
116
+    bx              lr
117
+endfunc
118
+
119
+function x265_pixel_var_32x32_neon
120
+    veor.u8         q0, q0
121
+    veor.u8         q1, q1
122
+    veor.u8         q2, q2
123
+    veor.u8         q14, q14
124
+    veor.u8         q15, q15
125
+    mov             ip, #8
126
+
127
+.var32_loop:
128
+    subs            ip, ip, #1
129
+    vld1.u8         {q8-q9}, [r0], r1
130
+    VAR_SQR_SUM     q1, q14, q12, d16
131
+    VAR_SQR_SUM     q2, q15, q13, d17
132
+    VAR_SQR_SUM     q1, q12, q14, d18
133
+    VAR_SQR_SUM     q2, q13, q15, d19
134
+
135
+    vld1.u8         {q8-q9}, [r0], r1
136
+    VAR_SQR_SUM     q1, q14, q12, d16
137
+    VAR_SQR_SUM     q2, q15, q13, d17
138
+    VAR_SQR_SUM     q1, q12, q14, d18
139
+    VAR_SQR_SUM     q2, q13, q15, d19
140
+
141
+    vld1.u8         {q8-q9}, [r0], r1
142
+    VAR_SQR_SUM     q1, q14, q12, d16
143
+    VAR_SQR_SUM     q2, q15, q13, d17
144
+    VAR_SQR_SUM     q1, q12, q14, d18
145
+    VAR_SQR_SUM     q2, q13, q15, d19
146
+
147
+    vld1.u8         {q8-q9}, [r0], r1
148
+    VAR_SQR_SUM     q1, q14, q12, d16
149
+    VAR_SQR_SUM     q2, q15, q13, d17
150
+    VAR_SQR_SUM     q1, q12, q14, d18
151
+    VAR_SQR_SUM     q2, q13, q15, d19
152
+    bgt             .var32_loop
153
+
154
+    vpaddl.u16      q8, q14
155
+    vpaddl.u16      q9, q15
156
+    vadd.u32        q1, q1, q8
157
+    vadd.u16        d0, d0, d1
158
+    vadd.u32        q1, q1, q9
159
+    vadd.u32        q1, q1, q2
160
+    vpaddl.u16      d0, d0
161
+    vadd.u32        d2, d2, d3
162
+    vpadd.u32       d0, d0, d2
163
+
164
+    vmov            r0, r1, d0
165
+    bx              lr
166
+endfunc
167
+
168
+function x265_pixel_var_64x64_neon
169
+    sub             r1, #32
170
+    veor.u8         q0, q0
171
+    veor.u8         q1, q1
172
+    veor.u8         q2, q2
173
+    veor.u8         q3, q3
174
+    veor.u8         q14, q14
175
+    veor.u8         q15, q15
176
+    mov             ip, #16
177
+
178
+.var64_loop:
179
+    subs            ip, ip, #1
180
+    vld1.u8         {q8-q9}, [r0]!
181
+    VAR_SQR_SUM     q1, q14, q12, d16
182
+    VAR_SQR_SUM     q2, q15, q13, d17
183
+    VAR_SQR_SUM     q1, q12, q14, d18
184
+    VAR_SQR_SUM     q2, q13, q15, d19
185
+
186
+    vld1.u8         {q8-q9}, [r0], r1
187
+    VAR_SQR_SUM     q1, q14, q12, d16, 3
188
+    VAR_SQR_SUM     q2, q15, q13, d17, 3
189
+    VAR_SQR_SUM     q1, q12, q14, d18, 3
190
+    VAR_SQR_SUM     q2, q13, q15, d19, 3
191
+
192
+    vld1.u8         {q8-q9}, [r0]!
193
+    VAR_SQR_SUM     q1, q14, q12, d16
194
+    VAR_SQR_SUM     q2, q15, q13, d17
195
+    VAR_SQR_SUM     q1, q12, q14, d18
196
+    VAR_SQR_SUM     q2, q13, q15, d19
197
+
198
+    vld1.u8         {q8-q9}, [r0], r1
199
+    VAR_SQR_SUM     q1, q14, q12, d16, 3
200
+    VAR_SQR_SUM     q2, q15, q13, d17, 3
201
x265_2.0.tar.gz/source/common/arm/pixel-util.h Added
94
 
1
@@ -0,0 +1,92 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Steve Borho <steve@borho.org>
6
+ *          Min Chen <chenm003@163.com>
7
+ *
8
+ * This program is free software; you can redistribute it and/or modify
9
+ * it under the terms of the GNU General Public License as published by
10
+ * the Free Software Foundation; either version 2 of the License, or
11
+ * (at your option) any later version.
12
+ *
13
+ * This program is distributed in the hope that it will be useful,
14
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
+ * GNU General Public License for more details.
17
+ *
18
+ * You should have received a copy of the GNU General Public License
19
+ * along with this program; if not, write to the Free Software
20
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21
+ *
22
+ * This program is also available under a commercial proprietary license.
23
+ * For more information, contact us at license @ x265.com.
24
+ *****************************************************************************/
25
+
26
+#ifndef X265_PIXEL_UTIL_ARM_H
27
+#define X265_PIXEL_UTIL_ARM_H
28
+
29
+uint64_t x265_pixel_var_8x8_neon(const pixel* pix, intptr_t stride);
30
+uint64_t x265_pixel_var_16x16_neon(const pixel* pix, intptr_t stride);
31
+uint64_t x265_pixel_var_32x32_neon(const pixel* pix, intptr_t stride);
32
+uint64_t x265_pixel_var_64x64_neon(const pixel* pix, intptr_t stride);
33
+
34
+void x265_getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
35
+void x265_getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
36
+void x265_getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
37
+void x265_getResidual32_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
38
+
39
+void x265_scale1D_128to64_neon(pixel *dst, const pixel *src);
40
+void x265_scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride);
41
+
42
+int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
43
+int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
44
+int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
45
+int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
46
+int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
47
+int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
48
+int x265_pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
49
+int x265_pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
50
+int x265_pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
51
+int x265_pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
52
+int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
53
+int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
54
+int x265_pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
55
+int x265_pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
56
+int x265_pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
57
+int x265_pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
58
+int x265_pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
59
+int x265_pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
60
+int x265_pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
61
+int x265_pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
62
+int x265_pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
63
+int x265_pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
64
+int x265_pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
65
+int x265_pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
66
+int x265_pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
67
+int x265_pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
68
+int x265_pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
69
+int x265_pixel_satd_48x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
70
+int x265_pixel_satd_64x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
71
+int x265_pixel_satd_64x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
72
+int x265_pixel_satd_64x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
73
+int x265_pixel_satd_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
74
+
75
+int x265_pixel_sa8d_8x8_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
76
+int x265_pixel_sa8d_8x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
77
+int x265_pixel_sa8d_16x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
78
+int x265_pixel_sa8d_16x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
79
+int x265_pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
80
+int x265_pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
81
+int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
82
+
83
+uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
84
+uint32_t x265_nquant_neon(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
85
+
86
+void x265_dequant_scaling_neon(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift);
87
+void x265_dequant_normal_neon(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
88
+
89
+void x265_ssim_4x4x2_core_neon(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]);
90
+
91
+int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
92
+
93
+#endif // ifndef X265_PIXEL_UTIL_ARM_H
94
x265_2.0.tar.gz/source/common/arm/pixel.h Added
201
 
1
@@ -0,0 +1,215 @@
2
+/*****************************************************************************
3
+ * pixel.h: ARM pixel metrics
4
+ *****************************************************************************
5
+ * Copyright (C) 2003-2013 x264 project
6
+ * Copyright (C) 2013-2016 x265 project
7
+ *
8
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
9
+ *          Loren Merritt <lorenm@u.washington.edu>
10
+ *          Fiona Glaser <fiona@x264.com>
11
+ *          Min Chen <chenm003@163.com>
12
+ *
13
+ * This program is free software; you can redistribute it and/or modify
14
+ * it under the terms of the GNU General Public License as published by
15
+ * the Free Software Foundation; either version 2 of the License, or
16
+ * (at your option) any later version.
17
+ *
18
+ * This program is distributed in the hope that it will be useful,
19
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21
+ * GNU General Public License for more details.
22
+ *
23
+ * You should have received a copy of the GNU General Public License
24
+ * along with this program; if not, write to the Free Software
25
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
26
+ *
27
+ * This program is also available under a commercial proprietary license.
28
+ * For more information, contact us at license @ x265.com.
29
+ *****************************************************************************/
30
+
31
+#ifndef X265_I386_PIXEL_ARM_H
32
+#define X265_I386_PIXEL_ARM_H
33
+
34
+int x265_pixel_sad_4x4_armv6(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
35
+int x265_pixel_sad_4x8_armv6(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
36
+int x265_pixel_sad_4x16_armv6(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
37
+int x265_pixel_sad_8x4_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
38
+int x265_pixel_sad_8x8_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
39
+int x265_pixel_sad_8x16_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
40
+int x265_pixel_sad_8x32_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
41
+int x265_pixel_sad_16x4_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
42
+int x265_pixel_sad_16x8_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
43
+int x265_pixel_sad_16x16_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
44
+int x265_pixel_sad_16x12_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
45
+int x265_pixel_sad_16x32_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
46
+int x265_pixel_sad_16x64_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
47
+int x265_pixel_sad_32x8_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
48
+int x265_pixel_sad_32x16_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
49
+int x265_pixel_sad_32x32_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
50
+int x265_pixel_sad_32x64_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
51
+int x265_pixel_sad_32x24_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
52
+int x265_pixel_sad_64x16_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
53
+int x265_pixel_sad_64x32_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
54
+int x265_pixel_sad_64x64_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
55
+int x265_pixel_sad_64x48_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
56
+int x265_pixel_sad_12x16_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
57
+int x265_pixel_sad_24x32_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
58
+int x265_pixel_sad_48x64_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
59
+
60
+void x265_pixel_avg_pp_4x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
61
+void x265_pixel_avg_pp_4x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
62
+void x265_pixel_avg_pp_4x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
63
+void x265_pixel_avg_pp_8x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
64
+void x265_pixel_avg_pp_8x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
65
+void x265_pixel_avg_pp_8x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
66
+void x265_pixel_avg_pp_8x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
67
+void x265_pixel_avg_pp_12x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
68
+void x265_pixel_avg_pp_16x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
69
+void x265_pixel_avg_pp_16x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
70
+void x265_pixel_avg_pp_16x12_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
71
+void x265_pixel_avg_pp_16x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
72
+void x265_pixel_avg_pp_16x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
73
+void x265_pixel_avg_pp_16x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
74
+void x265_pixel_avg_pp_24x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
75
+void x265_pixel_avg_pp_32x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
76
+void x265_pixel_avg_pp_32x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
77
+void x265_pixel_avg_pp_32x24_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
78
+void x265_pixel_avg_pp_32x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
79
+void x265_pixel_avg_pp_32x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
80
+void x265_pixel_avg_pp_48x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
81
+void x265_pixel_avg_pp_64x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
82
+void x265_pixel_avg_pp_64x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
83
+void x265_pixel_avg_pp_64x48_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
84
+void x265_pixel_avg_pp_64x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
85
+
86
+void x265_sad_x3_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
87
+void x265_sad_x3_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
88
+void x265_sad_x3_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
89
+void x265_sad_x3_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
90
+void x265_sad_x3_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
91
+void x265_sad_x3_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
92
+void x265_sad_x3_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
93
+void x265_sad_x3_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
94
+void x265_sad_x3_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
95
+void x265_sad_x3_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
96
+void x265_sad_x3_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
97
+void x265_sad_x3_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
98
+void x265_sad_x3_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
99
+void x265_sad_x3_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
100
+void x265_sad_x3_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
101
+void x265_sad_x3_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
102
+void x265_sad_x3_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
103
+void x265_sad_x3_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
104
+void x265_sad_x3_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
105
+void x265_sad_x3_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
106
+void x265_sad_x3_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
107
+void x265_sad_x3_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
108
+void x265_sad_x3_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
109
+void x265_sad_x3_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
110
+void x265_sad_x3_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
111
+
112
+void x265_sad_x4_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
113
+void x265_sad_x4_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
114
+void x265_sad_x4_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
115
+void x265_sad_x4_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
116
+void x265_sad_x4_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
117
+void x265_sad_x4_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
118
+void x265_sad_x4_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
119
+void x265_sad_x4_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
120
+void x265_sad_x4_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
121
+void x265_sad_x4_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
122
+void x265_sad_x4_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
123
+void x265_sad_x4_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
124
+void x265_sad_x4_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
125
+void x265_sad_x4_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
126
+void x265_sad_x4_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
127
+void x265_sad_x4_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
128
+void x265_sad_x4_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
129
+void x265_sad_x4_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
130
+void x265_sad_x4_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
131
+void x265_sad_x4_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
132
+void x265_sad_x4_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
133
+void x265_sad_x4_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
134
+void x265_sad_x4_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
135
+void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
136
+void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
137
+
138
+sse_t x265_pixel_sse_pp_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
139
+sse_t x265_pixel_sse_pp_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
140
+sse_t x265_pixel_sse_pp_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
141
+sse_t x265_pixel_sse_pp_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
142
+sse_t x265_pixel_sse_pp_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
143
+
144
+sse_t x265_pixel_sse_ss_4x4_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
145
+sse_t x265_pixel_sse_ss_8x8_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
146
+sse_t x265_pixel_sse_ss_16x16_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
147
+sse_t x265_pixel_sse_ss_32x32_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
148
+sse_t x265_pixel_sse_ss_64x64_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
149
+
150
+sse_t x265_pixel_ssd_s_4x4_neon(const int16_t* a, intptr_t dstride);
151
+sse_t x265_pixel_ssd_s_8x8_neon(const int16_t* a, intptr_t dstride);
152
+sse_t x265_pixel_ssd_s_16x16_neon(const int16_t* a, intptr_t dstride);
153
+sse_t x265_pixel_ssd_s_32x32_neon(const int16_t* a, intptr_t dstride);
154
+sse_t x265_pixel_ssd_s_64x64_neon(const int16_t* a, intptr_t dstride);
155
+
156
+void x265_pixel_sub_ps_4x4_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
157
+void x265_pixel_sub_ps_8x8_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
158
+void x265_pixel_sub_ps_16x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
159
+void x265_pixel_sub_ps_32x32_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
160
+void x265_pixel_sub_ps_64x64_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
161
+void x265_pixel_sub_ps_4x8_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
162
+void x265_pixel_sub_ps_8x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
163
+void x265_pixel_sub_ps_16x32_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
164
+void x265_pixel_sub_ps_32x64_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
165
+
166
+void x265_pixel_add_ps_4x4_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
167
+void x265_pixel_add_ps_8x8_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
168
+void x265_pixel_add_ps_16x16_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
169
+void x265_pixel_add_ps_32x32_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
170
+void x265_pixel_add_ps_64x64_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
171
+void x265_pixel_add_ps_4x8_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
172
+void x265_pixel_add_ps_8x16_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
173
+void x265_pixel_add_ps_16x32_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
174
+void x265_pixel_add_ps_32x64_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
175
+
176
+void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
177
+
178
+void x265_addAvg_4x4_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
179
+void x265_addAvg_4x8_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
180
+void x265_addAvg_4x16_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
181
+void x265_addAvg_8x4_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
182
+void x265_addAvg_8x8_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
183
+void x265_addAvg_8x16_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
184
+void x265_addAvg_8x32_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
185
+void x265_addAvg_12x16_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
186
+void x265_addAvg_16x4_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
187
+void x265_addAvg_16x8_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
188
+void x265_addAvg_16x12_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
189
+void x265_addAvg_16x16_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
190
+void x265_addAvg_16x32_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
191
+void x265_addAvg_16x64_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
192
+void x265_addAvg_24x32_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
193
+void x265_addAvg_32x8_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
194
+void x265_addAvg_32x16_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
195
+void x265_addAvg_32x24_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
196
+void x265_addAvg_32x32_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
197
+void x265_addAvg_32x64_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
198
+void x265_addAvg_48x64_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
199
+void x265_addAvg_64x16_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
200
+void x265_addAvg_64x32_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
201
x265_2.0.tar.gz/source/common/arm/sad-a.S Added
201
 
1
@@ -0,0 +1,1356 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: David Conrad <lessen42@gmail.com>
6
+ *          Janne Grunau <janne-x264@jannau.net>
7
+ *          Dnyaneshwar G <dnyaneshwar@multicorewareinc.com>
8
+ * 
9
+ * This program is free software; you can redistribute it and/or modify
10
+ * it under the terms of the GNU General Public License as published by
11
+ * the Free Software Foundation; either version 2 of the License, or
12
+ * (at your option) any later version.
13
+ *
14
+ * This program is distributed in the hope that it will be useful,
15
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
+ * GNU General Public License for more details.
18
+ *
19
+ * You should have received a copy of the GNU General Public License
20
+ * along with this program; if not, write to the Free Software
21
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22
+ *
23
+ * This program is also available under a commercial proprietary license.
24
+ * For more information, contact us at license @ x265.com.
25
+ *****************************************************************************/
26
+
27
+#include "asm.S"
28
+
29
+.section .rodata
30
+
31
+.align 4
32
+sad12_mask:
33
+.byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0
34
+
35
+.text
36
+
37
+/* sad4x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
38
+ *
39
+ * r0   - dst
40
+ * r1   - dstStride
41
+ * r2   - src
42
+ * r3   - srcStride */
43
+
44
+.macro SAD4_ARMV6 h                     // emits x265_pixel_sad_4x<h>_armv6: SAD over h rows of 4 bytes each
44
+function x265_pixel_sad_4x\h\()_armv6
45
+    push        {r4-r6,lr}              // save scratch registers; lr is reused as a data register below
46
+    ldr         r4, [r2], r3            // row 0: 4 bytes from r2, then r2 += r3
47
+    ldr         r5, [r0], r1            // row 0: 4 bytes from r0, then r0 += r1
48
+    ldr         r6, [r2], r3            // row 1 from r2
49
+    ldr         lr, [r0], r1            // row 1 from r0
50
+    usad8       ip, r4, r5              // ip = sum of absolute byte differences of row 0
51
+.rept (\h - 2)/2
52
+    ldr         r4, [r2], r3            // load the next even row ...
53
+    ldr         r5, [r0], r1
54
+    usada8      ip, r6, lr, ip          // ... while accumulating the previously loaded odd row
55
+    ldr         r6, [r2], r3            // load the next odd row ...
56
+    ldr         lr, [r0], r1
57
+    usada8      ip, r4, r5, ip          // ... while accumulating the even row just loaded
58
+.endr
59
+    usada8      r0, r6, lr, ip          // add the last pending row; total SAD is left in r0
60
+    pop         {r4-r6,pc}              // restore registers and return by popping the saved lr into pc
61
+endfunc
62
+.endm
64
+
65
+SAD4_ARMV6 4
66
+SAD4_ARMV6 8
67
+SAD4_ARMV6 16
68
+
69
+.macro SAD8_NEON h                      // emits x265_pixel_sad_8x<h>_neon: SAD over h rows of 8 bytes each
70
+function x265_pixel_sad_8x\h\()_neon
71
+    vld1.8          d0, [r0], r1        // row 0: 8 bytes from r0, then r0 += r1
72
+    vld1.8          d1, [r2], r3        // row 0: 8 bytes from r2, then r2 += r3
73
+    vabdl.u8        q1, d0, d1          // q1 = |d0 - d1| widened to 8 x u16 (initialises the accumulator)
74
+
75
+.rept \h-1
76
+    vld1.8          d0, [r0], r1        // next row from r0
77
+    vld1.8          d1, [r2], r3        // same row from r2
78
+    vabal.u8        q1, d0, d1          // q1 += |d0 - d1| per 16-bit lane
79
+.endr
80
+
81
+    vadd.u16        d2, d2, d3          // fold the two halves of q1: 8 lanes -> 4 lanes
82
+    vpadd.u16       d0, d2, d2          // pairwise add: 4 lanes -> 2 distinct partial sums
83
+    vpaddl.u16      d0, d0              // pairwise add widening to u32: d0[0] holds the total
84
+    vmov.u32        r0, d0[0]           // move the 32-bit total into r0
85
+    bx              lr
86
+endfunc
87
+.endm
88
+
89
+SAD8_NEON 4
90
+SAD8_NEON 8
91
+SAD8_NEON 16
92
+SAD8_NEON 32
93
+
94
+.macro SAD16_NEON h                     // emits x265_pixel_sad_16x<h>_neon: SAD over h rows of 16 bytes each
94
+function x265_pixel_sad_16x\h\()_neon
95
+    vld1.8          {q0}, [r0], r1      // row 0 from r0 (16 bytes), then r0 += r1
96
+    vld1.8          {q1}, [r2], r3      // row 0 from r2 (16 bytes), then r2 += r3
97
+    vld1.8          {q2}, [r0], r1      // row 1 from r0
98
+    vld1.8          {q3}, [r2], r3      // row 1 from r2
99
+
100
+    vabdl.u8        q8, d0, d2          // q8 = |row 0 low 8 bytes| differences, widened to u16
101
+    vabdl.u8        q9, d1, d3          // q9 = |row 0 high 8 bytes| differences, widened to u16
102
+    vabal.u8        q8, d4, d6          // q8 += row 1 low-half differences
103
+    vabal.u8        q9, d5, d7          // q9 += row 1 high-half differences
104
+    mov             r12, #(\h-2)/2      // loop count: two rows per iteration, first two rows already done
105
+
106
+.loop_16x\h:
107
+
108
+    subs            r12, #1             // decrement counter; flags set here are consumed by the bne below
109
+    vld1.8          {q0}, [r0], r1      // next even row from r0
110
+    vld1.8          {q1}, [r2], r3      // next even row from r2
111
+    vld1.8          {q2}, [r0], r1      // next odd row from r0
112
+    vld1.8          {q3}, [r2], r3      // next odd row from r2
113
+
114
+    vabal.u8        q8, d0, d2          // accumulate low-half differences of the even row
115
+    vabal.u8        q9, d1, d3          // accumulate high-half differences of the even row
116
+    vabal.u8        q8, d4, d6          // accumulate low-half differences of the odd row
117
+    vabal.u8        q9, d5, d7          // accumulate high-half differences of the odd row
118
+    bne             .loop_16x\h
119
+
120
+    vadd.u16        q8, q8, q9          // merge low-half and high-half accumulators (8 x u16)
121
+.if \h == 64
122
+    vaddl.u16       q0, d16, d17        // widen to 4 x u32 first: a 16x64 total (up to 16*64*255) exceeds 16 bits
123
+    vpadd.u32       d0, d0, d1          // 4 lanes -> 2 lanes
124
+    vpadd.u32       d0, d0              // 2 lanes -> total in d0[0]
125
+.else
126
+    vadd.u16        d16, d16, d17       // 8 lanes -> 4 lanes, still 16-bit
127
+    vpadd.u16       d0, d16, d16        // 4 lanes -> 2 distinct partial sums
128
+    vpaddl.u16      d0, d0              // pairwise add widening to u32: total in d0[0]
129
+.endif
130
+    vmov.u32        r0, d0[0]           // move the 32-bit total into r0
131
+    bx              lr
132
+endfunc
133
+.endm
135
+
136
+SAD16_NEON 4
137
+SAD16_NEON 8
138
+SAD16_NEON 16
139
+SAD16_NEON 12
140
+SAD16_NEON 32
141
+SAD16_NEON 64
142
+
143
+.macro SAD32_NEON h                     // emits x265_pixel_sad_32x<h>_neon: SAD over h rows of 32 bytes each
144
+function x265_pixel_sad_32x\h\()_neon
145
+    veor.u8         q8, q8              // zero the four 8 x u16 accumulators q8-q11,
146
+    veor.u8         q9, q9              // one accumulator per 8-byte column group of a row
147
+    veor.u8         q10, q10
148
+    veor.u8         q11, q11
149
+    mov             r12, #\h/8          // loop count: each iteration handles 8 rows (4 repeats x 2 rows)
150
+
151
+.loop_32x\h:
152
+
153
+    subs            r12, #1             // decrement counter; flags set here are consumed by the bne below
154
+.rept 4
155
+    vld1.8          {q0, q1}, [r0], r1           // even row from r0 (32 bytes), then r0 += r1
156
+    vld1.8          {q2, q3}, [r2], r3           // even row from r2 (32 bytes), then r2 += r3
157
+    vld1.8          {q12, q13}, [r0], r1         // odd row from r0
158
+    vld1.8          {q14, q15}, [r2], r3         // odd row from r2
159
+
160
+    vabal.u8        q8, d0, d4          // even row: bytes 0-7
161
+    vabal.u8        q9, d1, d5          // even row: bytes 8-15
162
+    vabal.u8        q10, d2, d6         // even row: bytes 16-23
163
+    vabal.u8        q11, d3, d7         // even row: bytes 24-31
164
+
165
+    vabal.u8        q8, d24, d28        // odd row: bytes 0-7
166
+    vabal.u8        q9, d25, d29        // odd row: bytes 8-15
167
+    vabal.u8        q10, d26, d30       // odd row: bytes 16-23
168
+    vabal.u8        q11, d27, d31       // odd row: bytes 24-31
169
+.endr
170
+    bne             .loop_32x\h
171
+
172
+    vadd.u16        q8, q8, q9          // merge accumulators for bytes 0-15
173
+    vadd.u16        q10, q10, q11       // merge accumulators for bytes 16-31
174
+.if \h == 64
175
+    vaddl.u16       q0, d16, d17        // 64-row case: widen to 4 x u32 before the horizontal adds
176
+    vpadd.u32       d0, d0, d1          // 4 lanes -> 2 lanes
177
+    vpaddl.u32      d0, d0              // 2 lanes -> one 64-bit sum (low word is read below)
178
+
179
+    vaddl.u16       q1, d20, d21        // same reduction for the bytes 16-31 accumulator
180
+    vpadd.u32       d2, d2, d3
181
+    vpaddl.u32      d2, d2
182
+
183
+    vadd.u32        d0,d0,d2            // combine both halves of the block
184
+.else
185
+    vadd.u16        d16, d16, d17       // 8 lanes -> 4 lanes, still 16-bit
186
+    vpadd.u16       d0, d16, d16        // 4 lanes -> 2 distinct partial sums
187
+    vpaddl.u16      d0, d0              // widen to u32: sum for bytes 0-15 in d0[0]
188
+
189
+    vadd.u16        d20, d20, d21       // same reduction for the bytes 16-31 accumulator
190
+    vpadd.u16       d1, d20, d20
191
+    vpaddl.u16      d1, d1
192
+
193
+    vadd.u32        d0,d0,d1            // combine both halves of the block
194
+.endif
195
+    vmov.u32        r0,  d0[0]          // move the 32-bit total into r0
196
+    bx              lr
197
+endfunc
198
+.endm
199
+
200
+SAD32_NEON 8
201
x265_2.0.tar.gz/source/common/arm/ssd-a.S Added
201
 
1
@@ -0,0 +1,469 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Dnyaneshwar G <dnyaneshwar@multicorewareinc.com>
6
+ * 
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+
27
+.section .rodata
28
+
29
+.align 4
30
+
31
+
32
+.text
33
+
34
+
35
+function x265_pixel_sse_pp_4x4_neon              // sum of squared byte differences over 4 rows of 4 bytes; rows come from [r0] (step r1) and [r2] (step r3), result in r0
36
+    vld1.32     {d16[]}, [r0], r1                // load one 4-byte row, replicated into both halves of d16
37
+    vld1.32     {d17[]}, [r2], r3
38
+    vsubl.u8    q2, d16, d17                     // byte differences widened to 16 bits
39
+    vld1.32     {d16[]}, [r0], r1                // next row is loaded while the previous one is still being squared
40
+    vmull.s16   q0, d4, d4                       // first row initialises q0 with diff^2; only d4 (four distinct lanes) is used
41
+    vld1.32     {d17[]}, [r2], r3
42
+
43
+    vsubl.u8    q2, d16, d17
44
+    vld1.32     {d16[]}, [r0], r1
45
+    vmlal.s16   q0, d4, d4                       // q0 += diff^2 for the following rows
46
+    vld1.32     {d17[]}, [r2], r3
47
+
48
+    vsubl.u8    q2, d16, d17
49
+    vld1.32     {d16[]}, [r0], r1
50
+    vmlal.s16   q0, d4, d4
51
+    vld1.32     {d17[]}, [r2], r3
52
+
53
+    vsubl.u8    q2, d16, d17
54
+    vmlal.s16   q0, d4, d4
55
+    vadd.s32    d0, d0, d1                       // fold the four 32-bit partial sums into one
56
+    vpadd.s32   d0, d0, d0
57
+    vmov.32     r0, d0[0]
58
+    bx          lr
59
+endfunc
60
+
61
+function x265_pixel_sse_pp_8x8_neon              // sum of squared byte differences over 8 rows of 8 bytes; rows come from [r0] (step r1) and [r2] (step r3), result in r0
62
+    vld1.64     {d16}, [r0], r1                  // first row of each input
63
+    vld1.64     {d17}, [r2], r3
64
+    vsubl.u8    q2, d16, d17                     // eight byte differences widened to 16 bits
65
+    vld1.64     {d16}, [r0], r1                  // second row is fetched while the first is squared
66
+    vmull.s16   q0, d4, d4                       // q0 = diff^2 (low four lanes) ...
67
+    vmlal.s16   q0, d5, d5                       // ... plus diff^2 (high four lanes)
68
+    vld1.64     {d17}, [r2], r3
69
+
70
+.rept 6                                          // each repeat squares the row already loaded and loads the next one
71
+    vsubl.u8    q2, d16, d17
72
+    vld1.64     {d16}, [r0], r1
73
+    vmlal.s16   q0, d4, d4
74
+    vmlal.s16   q0, d5, d5
75
+    vld1.64     {d17}, [r2], r3
76
+.endr
77
+    vsubl.u8    q2, d16, d17                     // last row: nothing further to load
78
+    vmlal.s16   q0, d4, d4
79
+    vmlal.s16   q0, d5, d5
80
+    vadd.s32    d0, d0, d1                       // fold the four 32-bit partial sums into one
81
+    vpadd.s32   d0, d0, d0
82
+    vmov.32     r0, d0[0]
83
+    bx          lr
84
+endfunc
85
+
86
+function x265_pixel_sse_pp_16x16_neon            // sum of squared byte differences over 16 rows of 16 bytes; rows come from [r0] (step r1) and [r2] (step r3), result in r0
87
+    vld1.64     {d16-d17}, [r0], r1              // first row of each input
88
+    vld1.64     {d18-d19}, [r2], r3
89
+    vsubl.u8    q2, d16, d18                     // differences of bytes 0-7, widened to 16 bits
90
+    vsubl.u8    q3, d17, d19                     // differences of bytes 8-15
91
+    vld1.64     {d16-d17}, [r0], r1              // second row is fetched while the first is squared
92
+    vmull.s16   q0, d4, d4                       // first product initialises the 32-bit accumulator q0
93
+    vmlal.s16   q0, d5, d5
94
+    vld1.64     {d18-d19}, [r2], r3
95
+    vmlal.s16   q0, d6, d6
96
+    vmlal.s16   q0, d7, d7
97
+
98
+.rept 14                                         // each repeat squares the row already loaded and loads the next one
99
+    vsubl.u8    q2, d16, d18
100
+    vsubl.u8    q3, d17, d19
101
+    vld1.64     {d16-d17}, [r0], r1
102
+    vmlal.s16   q0, d4, d4
103
+    vmlal.s16   q0, d5, d5
104
+    vld1.64     {d18-d19}, [r2], r3
105
+    vmlal.s16   q0, d6, d6
106
+    vmlal.s16   q0, d7, d7
107
+.endr
108
+    vsubl.u8    q2, d16, d18                     // last row: nothing further to load
109
+    vsubl.u8    q3, d17, d19
110
+    vmlal.s16   q0, d4, d4
111
+    vmlal.s16   q0, d5, d5
112
+    vmlal.s16   q0, d6, d6
113
+    vmlal.s16   q0, d7, d7
114
+    vadd.s32    d0, d0, d1                       // fold the four 32-bit partial sums into one
115
+    vpadd.s32   d0, d0, d0
116
+    vmov.32     r0, d0[0]
117
+    bx          lr
118
+endfunc
119
+
120
+function x265_pixel_sse_pp_32x32_neon            // sum of squared byte differences over 32 rows of 32 bytes; rows come from [r0] (step r1) and [r2] (step r3), result in r0
121
+    mov         r12, #8                          // 8 loop passes x 4 unrolled rows = 32 rows
122
+    veor.u8     q0, q0                           // clear the two 32-bit accumulators
123
+    veor.u8     q1, q1
124
+
125
+.loop_sse_pp_32:
126
+    subs        r12, #1                          // sets the flags that the bne after the unrolled body tests
127
+.rept 4
128
+    vld1.64     {q8-q9}, [r0], r1                // one 32-byte row from each input
129
+    vld1.64     {q10-q11}, [r2], r3
130
+    vsubl.u8    q2, d16, d20                     // four groups of eight byte differences, widened to 16 bits
131
+    vsubl.u8    q3, d17, d21
132
+    vsubl.u8    q12, d18, d22
133
+    vsubl.u8    q13, d19, d23
134
+    vmlal.s16   q0, d4, d4                       // squares alternate between the accumulators q0 and q1
135
+    vmlal.s16   q1, d5, d5
136
+    vmlal.s16   q0, d6, d6
137
+    vmlal.s16   q1, d7, d7
138
+    vmlal.s16   q0, d24, d24
139
+    vmlal.s16   q1, d25, d25
140
+    vmlal.s16   q0, d26, d26
141
+    vmlal.s16   q1, d27, d27
142
+.endr
143
+    bne         .loop_sse_pp_32
144
+    vadd.s32    q0, q1                           // combine both accumulators, then reduce to a single 32-bit sum
145
+    vadd.s32    d0, d0, d1
146
+    vpadd.s32   d0, d0, d0
147
+    vmov.32     r0, d0[0]
148
+    bx          lr
149
+endfunc
150
+
151
+function x265_pixel_sse_pp_64x64_neon            // sum of squared byte differences over 64 rows of 64 bytes; rows come from [r0] (step r1) and [r2] (step r3), result in r0
152
+    sub         r1, #32                          // the first half-row load advances the pointer by 32 bytes, so the
153
+    sub         r3, #32                          // per-row step applied by the second load is reduced by the same amount
154
+    mov         r12, #16                         // 16 loop passes x 4 unrolled rows = 64 rows
155
+    veor.u8     q0, q0                           // clear the two 32-bit accumulators
156
+    veor.u8     q1, q1
157
+
158
+.loop_sse_pp_64:
159
+    subs        r12, #1                          // sets the flags that the bne after the unrolled body tests
160
+.rept 4
161
+    vld1.64     {q8-q9}, [r0]!                   // bytes 0-31 of the row; pointer moves on by the 32 bytes read
162
+    vld1.64     {q10-q11}, [r2]!
163
+    vsubl.u8    q2, d16, d20                     // byte differences widened to 16 bits
164
+    vsubl.u8    q3, d17, d21
165
+    vsubl.u8    q12, d18, d22
166
+    vsubl.u8    q13, d19, d23
167
+    vmlal.s16   q0, d4, d4                       // squares alternate between the accumulators q0 and q1
168
+    vmlal.s16   q1, d5, d5
169
+    vmlal.s16   q0, d6, d6
170
+    vmlal.s16   q1, d7, d7
171
+    vmlal.s16   q0, d24, d24
172
+    vmlal.s16   q1, d25, d25
173
+    vmlal.s16   q0, d26, d26
174
+    vmlal.s16   q1, d27, d27
175
+
176
+    vld1.64     {q8-q9}, [r0], r1                // bytes 32-63 of the row; adding the adjusted r1 lands on the next row
177
+    vld1.64     {q10-q11}, [r2], r3
178
+    vsubl.u8    q2, d16, d20
179
+    vsubl.u8    q3, d17, d21
180
+    vsubl.u8    q12, d18, d22
181
+    vsubl.u8    q13, d19, d23
182
+    vmlal.s16   q0, d4, d4
183
+    vmlal.s16   q1, d5, d5
184
+    vmlal.s16   q0, d6, d6
185
+    vmlal.s16   q1, d7, d7
186
+    vmlal.s16   q0, d24, d24
187
+    vmlal.s16   q1, d25, d25
188
+    vmlal.s16   q0, d26, d26
189
+    vmlal.s16   q1, d27, d27
190
+.endr
191
+    bne         .loop_sse_pp_64
192
+    vadd.s32    q0, q1                           // combine both accumulators, then reduce to a single 32-bit sum
193
+    vadd.s32    d0, d0, d1
194
+    vpadd.s32   d0, d0, d0
195
+    vmov.32     r0, d0[0]
196
+    bx          lr
197
+endfunc
198
+
199
+function x265_pixel_sse_ss_4x4_neon
200
+    add         r1, r1
201
x265_1.9.tar.gz/source/common/common.cpp -> x265_2.0.tar.gz/source/common/common.cpp Changed
118
 
1
@@ -29,6 +29,8 @@
2
 #if _WIN32
3
 #include <sys/types.h>
4
 #include <sys/timeb.h>
5
+#include <io.h>
6
+#include <fcntl.h>
7
 #else
8
 #include <sys/time.h>
9
 #endif
10
@@ -139,6 +141,94 @@
11
     fputs(buffer, stderr);
12
 }
13
 
14
+#if _WIN32
15
+/* To support Unicode filenames on Windows, UTF-8 strings are converted to UTF-16 and passed to the wide-character (_w) functions.
16
+ * On other operating systems the strings are used unchanged. */
17
+void general_log_file(const x265_param* param, const char* caller, int level, const char* fmt, ...) // formats "<caller> [<level>]: <message>" and writes it to stderr; a console receives it as UTF-16 via WriteConsoleW
18
+{
19
+    if (param && level > param->logLevel) // message is above the configured verbosity: drop it
20
+        return;
21
+    const int bufferSize = 4096;
22
+    char buffer[bufferSize];
23
+    int p = 0; // length of the "<caller> [<level>]: " prefix already in buffer
24
+    const char* log_level;
25
+    switch (level)
26
+    {
27
+    case X265_LOG_ERROR:
28
+        log_level = "error";
29
+        break;
30
+    case X265_LOG_WARNING:
31
+        log_level = "warning";
32
+        break;
33
+    case X265_LOG_INFO:
34
+        log_level = "info";
35
+        break;
36
+    case X265_LOG_DEBUG:
37
+        log_level = "debug";
38
+        break;
39
+    case X265_LOG_FULL:
40
+        log_level = "full";
41
+        break;
42
+    default:
43
+        log_level = "unknown";
44
+        break;
45
+    }
46
+
47
+    if (caller)
48
+        p += sprintf(buffer, "%-4.64s [%s]: ", caller, log_level); // precision caps the caller name at 64 chars so the prefix can never overrun buffer
49
+    va_list arg;
50
+    va_start(arg, fmt);
51
+    vsnprintf(buffer + p, bufferSize - p, fmt, arg);
52
+    va_end(arg);
53
+    buffer[bufferSize - 1] = '\0'; // older MSVC runtimes leave a truncated vsnprintf result unterminated; the code below relies on the terminator
54
+    HANDLE console = GetStdHandle(STD_ERROR_HANDLE);
55
+    DWORD mode;
56
+    if (GetConsoleMode(console, &mode)) // succeeds only when stderr is a real console rather than a file or pipe
57
+    {
58
+        wchar_t buf_utf16[bufferSize];
59
+        int length_utf16 = MultiByteToWideChar(CP_UTF8, 0, buffer, -1, buf_utf16, sizeof(buf_utf16)/sizeof(wchar_t)) - 1; // -1: the returned count includes the terminator
60
+        if (length_utf16 > 0)
61
+            WriteConsoleW(console, buf_utf16, length_utf16, &mode, NULL); // mode is reused as the characters-written output
62
+    }
63
+    else
64
+        fputs(buffer, stderr); // redirected stderr: emit the UTF-8 bytes unchanged
65
+}
66
+
67
+FILE* x265_fopen(const char* fileName, const char* mode) // fopen for UTF-8 arguments: converts both to UTF-16 and calls _wfopen; NULL if either conversion fails
68
+{
69
+    wchar_t widePath[MAX_PATH * 2], wideMode[16];
70
+    const int pathCap = sizeof(widePath) / sizeof(wchar_t), modeCap = sizeof(wideMode) / sizeof(wchar_t); // capacities in wchar_t units
71
+
72
+    if (!MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, fileName, -1, widePath, pathCap))
73
+        return NULL; // file name could not be converted
74
+    if (!MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, mode, -1, wideMode, modeCap))
75
+        return NULL; // mode string could not be converted
76
+    return _wfopen(widePath, wideMode);
77
+}
78
+
79
+int x265_unlink(const char* fileName) // unlink for a UTF-8 path: converts it to UTF-16 and calls _wunlink; -1 if the conversion fails
80
+{
81
+    wchar_t widePath[MAX_PATH * 2];
82
+    const int pathCap = sizeof(widePath) / sizeof(wchar_t); // capacity in wchar_t units
83
+
84
+    if (!MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, fileName, -1, widePath, pathCap))
85
+        return -1; // file name could not be converted
86
+    return _wunlink(widePath);
87
+}
88
+
89
+int x265_rename(const char* oldName, const char* newName) // rename for UTF-8 paths: converts both to UTF-16 and calls _wrename; -1 if either conversion fails
90
+{
91
+    wchar_t wideOld[MAX_PATH * 2], wideNew[MAX_PATH * 2];
92
+    const int pathCap = sizeof(wideOld) / sizeof(wchar_t); // both buffers have this capacity, in wchar_t units
93
+
94
+    if (!MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, oldName, -1, wideOld, pathCap))
95
+        return -1; // old name could not be converted
96
+    if (!MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, newName, -1, wideNew, pathCap))
97
+        return -1; // new name could not be converted
98
+    return _wrename(wideOld, wideNew);
99
+}
100
+#endif
101
+
102
 double x265_ssim2dB(double ssim)
103
 {
104
     double inv_ssim = 1 - ssim;
105
@@ -177,10 +267,10 @@
106
     size_t fSize;
107
     char *buf = NULL;
108
 
109
-    FILE *fh = fopen(filename, "rb");
110
+    FILE *fh = x265_fopen(filename, "rb");
111
     if (!fh)
112
     {
113
-        x265_log(NULL, X265_LOG_ERROR, "unable to open file %s\n", filename);
114
+        x265_log_file(NULL, X265_LOG_ERROR, "unable to open file %s\n", filename);
115
         return NULL;
116
     }
117
 
118
x265_1.9.tar.gz/source/common/common.h -> x265_2.0.tar.gz/source/common/common.h Changed
30
 
1
@@ -322,6 +322,8 @@
2
 #define MAX_NUM_TR_COEFFS           MAX_TR_SIZE * MAX_TR_SIZE // Maximum number of transform coefficients, for a 32x32 transform
3
 #define MAX_NUM_TR_CATEGORIES       16                        // 32, 16, 8, 4 transform categories each for luma and chroma
4
 
5
+#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
6
+
7
 namespace X265_NS {
8
 
9
 enum { SAO_NUM_OFFSET = 4 };
10
@@ -402,7 +404,19 @@
11
 /* located in common.cpp */
12
 int64_t  x265_mdate(void);
13
 #define  x265_log(param, ...) general_log(param, "x265", __VA_ARGS__)
14
+#define  x265_log_file(param, ...) general_log_file(param, "x265", __VA_ARGS__)
15
 void     general_log(const x265_param* param, const char* caller, int level, const char* fmt, ...);
16
+#if _WIN32 /* Windows: real functions taking UTF-8 strings, implemented in common.cpp */
17
+void     general_log_file(const x265_param* param, const char* caller, int level, const char* fmt, ...);
18
+FILE*    x265_fopen(const char* fileName, const char* mode);
19
+int      x265_unlink(const char* fileName);
20
+int      x265_rename(const char* oldName, const char* newName);
21
+#else /* elsewhere: the same names map straight onto general_log and the C library calls */
22
+#define  general_log_file(param, caller, level, ...) general_log(param, caller, level, __VA_ARGS__) /* format travels inside __VA_ARGS__, as in x265_log, so a call without extra arguments still expands cleanly */
23
+#define  x265_fopen(fileName, mode) fopen(fileName, mode)
24
+#define  x265_unlink(fileName) unlink(fileName)
25
+#define  x265_rename(oldName, newName) rename(oldName, newName)
26
+#endif
27
 int      x265_exp2fix8(double x);
28
 
29
 double   x265_ssim2dB(double ssim);
30
x265_1.9.tar.gz/source/common/constants.cpp -> x265_2.0.tar.gz/source/common/constants.cpp Changed
201
 
1
@@ -555,18 +555,6 @@
2
     0x38, 
3
 };
4
 
5
-/* Contains how much to increment shared depth buffer for different ctu sizes to get next best depth
6
- * here, depth 0 = 64x64, depth 1 = 32x32, depth 2 = 16x16 and depth 3 = 8x8
7
- * if ctu = 64, depth buffer size is 256 combination of depth values 0, 1, 2, 3
8
- * if ctu = 32, depth buffer size is 64 combination of depth values 1, 2, 3
9
- * if ctu = 16, depth buffer size is 16 combination of depth values 2, 3 */
10
-const uint32_t g_depthInc[3][4] =
11
-{
12
-    { 16,  4,  0, 0},
13
-    { 64, 16,  4, 1},
14
-    {256, 64, 16, 4}
15
-};
16
-
17
 /* g_depthScanIdx [y][x] */
18
 const uint32_t g_depthScanIdx[8][8] =
19
 {
20
@@ -580,4 +568,236 @@
21
     {  42,  43,  46,  47,  58,  59,  62,  63,  }
22
 };
23
 
24
+/* Rec.2020 YUV to RGB Non-constant luminance */
25
+const double g_YUVtoRGB_BT2020[3][3] = // NOTE(review): rows look like the R, G, B outputs and columns the Y, U, V inputs -- confirm against the code that indexes this table
26
+{
27
+    {   1.00,   0.00,      1.47460,   },
28
+    {   1.00,  -0.16455,  -0.57135,   },
29
+    {   1.00,   1.88140,   0.00,      }
30
+};
31
+
32
+const double g_ST2084_PQTable[MAX_HDR_LEGAL_RANGE - MIN_HDR_LEGAL_RANGE + 1] = 
33
+{
34
+    0,
35
+    5.25912035416561E-05, 0.000170826479250824, 0.000342874260206259, 0.000565730978088069,
36
+    0.000838361593599196, 0.0011605708550711, 0.00153261170332205, 0.00195500928122658,
37
+    0.00242846920816411, 0.00295382484798614, 0.00353200479131171, 0.00416401171798929,
38
+    0.00485090808272845, 0.00559380610060962, 0.00639386055422149, 0.00725226351560689,
39
+    0.0081702404049783, 0.00914904700558975, 0.010189967177051, 0.0112943110883226,
40
+    0.0124634138437419, 0.0136986344106386, 0.0150013547814312, 0.0163729793201926,
41
+    0.0178149342559234, 0.0193286672936668, 0.0209156473211494, 0.022577364193536,
42
+    0.0243153285825585, 0.0261310718791221, 0.0280261461406398, 0.0300021240760516,
43
+    0.0320605990628007, 0.0342031851910785, 0.036431517331512, 0.0387472512230819,
44
+    0.0411520635786705, 0.0436476522060052, 0.046235736142162, 0.0489180558000865,
45
+    0.0516963731258075, 0.0545724717652363, 0.0575481572396137, 0.0606252571287911,
46
+    0.0638056212616694, 0.0670911219131892, 0.0704836540073949, 0.0739851353261047,
47
+    0.0775975067228409, 0.0813227323416811, 0.0851627998407477, 0.0891197206201265,
48
+    0.0931955300539647, 0.0973922877266004, 0.101712077672541, 0.106157008620188,
49
+    0.110729214239187, 0.115430853391267, 0.120264110384523, 0.125231195231086,
50
+    0.130334343908053, 0.135575818621706, 0.140957908074883, 0.146482927737596,
51
+    0.152153220120717, 0.157971155052834, 0.163939129960184, 0.170059570149691,
52
+    0.176334929095073, 0.182767688726043, 0.189360359720598, 0.196115481800328,
53
+    0.203035624028883, 0.210123385113499, 0.21738139370961, 0.224812308728624,
54
+    0.232418819648774, 0.240203646829142, 0.248169541826838, 0.256319287717358,
55
+    0.264655699418179, 0.273181624015456, 0.281899941094164, 0.29081356307129,
56
+    0.299925435532481, 0.309238537571936, 0.318755882135647, 0.32848051636804,
57
+    0.338415521962, 0.34856401551231, 0.358929148872555, 0.369514109515577,
58
+    0.380322120897342, 0.391356442824469, 0.402620371825233, 0.414117241524302,
59
+    0.425850423021013, 0.437823325271459, 0.450039395474131, 0.4625021194595,
60
+    0.475215022083238, 0.488181667623337, 0.501405660181076, 0.514890644085913,
61
+    0.528640304304275, 0.542658366852319, 0.556948599212766, 0.571514810755682,
62
+    0.58636085316357, 0.601490620860234, 0.616908051444177, 0.632617126126042,
63
+    0.648621870170268, 0.664926353341107, 0.681534690353104, 0.6984510413256,
64
+    0.715679612242097, 0.733224655413817, 0.751090469947712, 0.769281402219399,
65
+    0.78780184635024, 0.806656244689427, 0.82584908830055, 0.84538491745295,
66
+    0.865268322117971, 0.885503942469945, 0.906096469391926, 0.927050644986733,
67
+    0.948371263092526, 0.970063169803824, 0.99213126399724, 1.01458049786256,
68
+    1.03741587743901, 1.06064246315667, 1.08426537038311, 1.10828976997558,
69
+    1.13272088883845, 1.1575640104859, 1.18282447561067, 1.20850768265765,
70
+    1.23461908840365, 1.26116420854251, 1.28814861827608, 1.31557795291099,
71
+    1.34345790846097, 1.37179424225547, 1.40059277355414, 1.42985938416685,
72
+    1.45960001908056, 1.48982068709166, 1.52052746144494, 1.55172648047831,
73
+    1.58342394827458, 1.61562613531883, 1.6483393791628, 1.68157008509547,
74
+    1.71532472682031, 1.74960984713914, 1.78443205864284, 1.81979804440872,
75
+    1.85571455870433, 1.8921884276992, 1.92922655018235, 1.9668358982877,
76
+    2.0050235182263, 2.04379653102551, 2.0831621332761, 2.12312759788576,
77
+    2.16370027484092, 2.20488759197549, 2.2466970557472, 2.28913625202187,
78
+    2.33221284686502, 2.37593458734142, 2.42030930232274, 2.46534490330251,
79
+    2.51104938521982, 2.55743082729067, 2.60449739384781, 2.65225733518805,
80
+    2.70071898842928, 2.74989077837451, 2.79978121838576, 2.85039891126499,
81
+    2.90175255014517, 2.95385091938954, 3.00670289549934, 3.06031744803115,
82
+    3.11470364052283, 3.16987063142876, 3.22582767506471, 3.2825841225609,
83
+    3.3401494228253, 3.39853312351689, 3.45774487202715, 3.51779441647257,
84
+    3.57869160669604, 3.64044639527875, 3.7030688385618, 3.76656909767725,
85
+    3.83095743959148, 3.89624423815599, 3.96243997517042, 4.02955524145598,
86
+    4.09760073793895, 4.16658727674518, 4.2365257823051, 4.30742729247016,
87
+    4.37930295964014, 4.45216405190141, 4.52602195417663, 4.60088816938553,
88
+    4.67677431961831, 4.75369214731843, 4.83165351647993, 4.91067041385396,
89
+    4.99075495016979, 5.07191936136577, 5.15417600983301, 5.23753738567282,
90
+    5.32201610796449, 5.40762492604782, 5.49437672081637, 5.58228450602463,
91
+    5.67136142960816, 5.76162077501684, 5.85307596256082, 5.94574055077076,
92
+    6.03962823777015, 6.13475286266291, 6.2311284069342, 6.32876899586396,
93
+    6.42768889995753, 6.5279025363866, 6.62942447044656, 6.73226941703026,
94
+    6.83645224211186, 6.94198796425035, 7.04889175610325, 7.15717894596024,
95
+    7.2668650192892, 7.37796562029657, 7.49049655350635, 7.60447378535363,
96
+    7.71991344579293, 7.83683182992318, 7.95524539963073, 8.07517078524564,
97
+    8.19662478721649, 8.31962437780235, 8.44418670277909, 8.57032908316786,
98
+    8.69806901697162, 8.82742418094208, 8.95841243235119, 9.09105181078918,
99
+    9.22536053997842, 9.36135702960081, 9.4990598771529, 9.63848786980913,
100
+    9.77965998631185, 9.92259539887546, 10.0673134751131, 10.2138337799773,
101
+    10.3621760777285, 10.5123603339148, 10.6644067173761, 10.8183356022682,
102
+    10.9741675701064, 11.1319234118292, 11.2916241298841, 11.4532909403319,
103
+    11.6169452749761, 11.782608783511, 11.9503033356888, 12.120051023515,
104
+    12.2918741634627, 12.4657952987048, 12.6418372013776, 12.8200228748588,
105
+    13.0003755560757, 13.1829187178276, 13.367676071144, 13.5546715676512,
106
+    13.7439294019804, 13.9354740141834, 14.1293300921851, 14.3255225742508,
107
+    14.5240766514895, 14.7250177703705, 14.9283716352778, 15.1341642110757,
108
+    15.3424217257167, 15.5531706728631, 15.7664378145379, 15.9822501838117,
109
+    16.2006350874992, 16.4216201089027, 16.6452331105667, 16.8715022370722,
110
+    17.1004559178516, 17.3321228700381, 17.5665321013393, 17.8037129129401,
111
+    18.0436949024415, 18.2865079668192, 18.5321823054235, 18.7807484229967,
112
+    19.0322371327346, 19.2866795593684, 19.5441071422852, 19.8045516386728,
113
+    20.068045126707, 20.3346200087623, 20.6043090146575, 20.8771452049349,
114
+    21.1531619741772, 21.4323930543496, 21.7148725181833, 22.0006347825899,
115
+    22.2897146121093, 22.5821471224015, 22.8779677837589, 23.1772124246723,
116
+    23.4799172354157, 23.7861187716811, 24.0958539582449, 24.4091600926726,
117
+    24.7260748490581, 25.0466362818137, 25.3708828294739, 25.6988533185695,
118
+    26.0305869675189, 26.3661233905639, 26.7055026017538, 27.0487650189598,
119
+    27.3959514679386, 27.7471031864343, 28.1022618283194, 28.4614694677879,
120
+    28.8247686035749, 29.1922021632471, 29.5638135074984, 29.9396464345297,
121
+    30.3197451844465, 30.7041544437129, 31.0929193496474, 31.4860854949729,
122
+    31.8836989324014, 32.2858061792735, 32.6924542222466, 33.1036905220286,
123
+    33.5195630181606, 33.9401201338504, 34.3654107808513, 34.7954843644001,
124
+    35.2303907882032, 35.6701804594619, 36.1149042939698, 36.5646137212482,
125
+    37.0193606897411, 37.4791976720634, 37.944177670299, 38.4143542213633,
126
+    38.8897814024065, 39.3705138362898, 39.8566066971106, 40.3481157157767,
127
+    40.8450971856484, 41.3476079682522, 41.8557054990105, 42.369447793091,
128
+    42.8888934512647, 43.4141016658423, 43.9451322266965, 44.4820455273072,
129
+    45.0249025708978, 45.57376497661, 46.128694985791, 46.6897554682848,
130
+    47.257009928828, 47.8305225135037, 48.4103580162663, 48.9965818855272,
131
+    49.589260230802, 50.1884598294566, 50.794248133489, 51.4066932764077,
132
+    52.0258640801652, 52.6518300621766, 53.2846614424041, 53.9244291505136,
133
+    54.5712048331156, 55.2250608610794, 55.8860703369173, 56.5543071022513,
134
+    57.2298457453516, 57.9127616087739, 58.6031307970611, 59.3010301845114,
135
+    60.0065374230609, 60.7197309502355, 61.4406899971675, 62.1694945967356,
136
+    62.9062255917496, 63.6509646432403, 64.4037942388625, 65.1647977013236,
137
+    65.9340591969731, 66.7116637444152, 67.4976972232724, 68.2922463830112,
138
+    69.0953988518382, 69.9072431457598, 70.7278686776501, 71.5573657664994,
139
+    72.3958256466906, 73.2433404774142, 74.1000033521872, 74.9659083084248,
140
+    75.8411503371909, 76.7258253929696, 77.6200304036002, 78.5238632802992,
141
+    79.4374229277768, 80.3608092544678, 81.2941231828966, 82.2374666600933,
142
+    83.1909426682048, 84.154655235138, 85.1287094453491, 86.1132114507694,
143
+    87.108268481825, 88.1139888585565, 89.1304820019001, 90.1578584450571,
144
+    91.1962298449948, 92.2457089940652, 93.3064098317639, 94.3784474565997,
145
+    95.4619381380949, 96.5569993289116, 97.6637496771184, 98.7823090385655,
146
+    99.9127984894415, 101.055340338899, 102.210058141845, 103.377076711919,
147
+    104.556522134513, 105.748521780005, 106.953204317117, 108.170699726403,
148
+    109.401139313892, 110.644655724874, 111.901382957862, 113.171456378648,
149
+    114.455012734562, 115.752190168864, 117.063128235285, 118.387967912751,
150
+    119.726851620228, 121.079923231788, 122.447328091724, 123.829213029981,
151
+    125.225726377642, 126.637017982633, 128.063239225529, 129.504543035659,
152
+    130.961083907258, 132.43301791588, 133.920502734926, 135.423697652396,
153
+    136.942763587828, 138.477863109372, 140.029160451099, 141.596821530472,
154
+    143.181013966024, 144.781907095212, 146.399671992475, 148.034481487503,
155
+    149.686510183665, 151.355934476676, 153.042932573466, 154.747684511235,
156
+    156.470372176717, 158.211179325695, 159.970291602654, 161.747896560765,
157
+    163.544183681914, 165.359344397174, 167.193572107279, 169.047062203492,
158
+    170.920012088617, 172.812621198221, 174.725091022243, 176.657625126586,
159
+    178.610429175187, 180.583710952171, 182.577680384379, 184.59254956399,
160
+    186.628532771569, 188.685846499193, 190.764709473972, 192.865342681753,
161
+    194.987969391112, 197.13281517763, 199.300107948348, 201.490077966701,
162
+    203.702957877374, 205.938982731875, 208.198390014006, 210.481419665809,
163
+    212.788314113849, 215.119318295558, 217.474679686168, 219.854648325694,
164
+    222.259476846381, 224.689420500319, 227.144737187562, 229.625687484264,
165
+    232.132534671514, 234.665544764103, 237.224986539876, 239.811131569336,
166
+    242.424254245529, 245.064631814346, 247.73254440507, 250.428275061399,
167
+    253.152109772633, 255.904337505438, 258.685250235678, 261.49514298094,
168
+    264.334313833161, 267.203063991664, 270.101697796781, 273.03052276345,
169
+    275.989849615675, 278.979992320954, 282.001268125309, 285.053997588697,
170
+    288.138504620796, 291.255116517118, 294.404163995707, 297.585981234071,
171
+    300.800905906628, 304.049279222569, 307.331445964095, 310.647754525259,
172
+    313.998556950887, 317.384208976364, 320.805070067649, 324.26150346164,
173
+    327.753876207298, 331.28255920701, 334.84792725845, 338.450359096983,
174
+    342.090237438443, 345.767949022632, 349.483884657022, 353.238439261111,
175
+    357.032011911288, 360.865005886229, 364.73782871259, 368.650892211681,
176
+    372.604612546163, 376.59941026756, 380.635710364328, 384.713942310386,
177
+    388.83454011424, 392.997942368521, 397.20459230049, 401.454937822634,
178
+    405.749431584178, 410.088531023082, 414.47269841859, 418.902400944533,
179
+    423.378110722949, 427.900304878816, 432.469465594816, 437.086080167171,
180
+    441.750641062068, 446.463645972511, 451.225597876033, 456.037005092914,
181
+    460.89838134554, 465.81024581748, 470.773123214509, 475.787543825096,
182
+    480.854043582649, 485.973164127686, 491.14545287122, 496.371463058725,
183
+    501.651753834779, 506.986890308486, 512.377443619739, 517.823991006384,
184
+    523.32711587159, 528.887407852831, 534.505462890955, 540.181883300517,
185
+    545.917277840779, 551.712261787277, 557.567457004939, 563.48349202123,
186
+    569.461002100643, 575.500629320033, 581.603022644652, 587.76883800521,
187
+    593.998738375827, 600.29339385279, 606.653481734616, 613.07968660232,
188
+    619.572700401503, 626.133222524762, 632.761959895347, 639.459627051767,
189
+    646.226946233466, 653.064647467273, 659.973468655012, 666.954155662449,
190
+    674.007462408703, 681.134150957274, 688.334991607664, 695.610762988527,
191
+    702.962252151562, 710.390254666907, 717.895574719168, 725.479025205175,
192
+    733.141427832198, 740.883613218127, 748.706420992262, 756.610699897378,
193
+    764.597307893424, 772.667112261926, 780.820989711908, 789.059826487117,
194
+    797.384518474445, 805.79597131351, 814.295100508111, 822.882831538009,
195
+    831.560099973222, 840.327851588798, 849.187042481472, 858.138639187298,
196
+    867.183618801265, 876.322969097945, 885.557688653527, 894.88878696958,
197
+    904.317284598324, 913.844213269149, 923.470616016881, 933.197547311661,
198
+    943.02607318998, 952.957271387842, 962.99223147528, 973.13205499233,
199
+    983.377855587028, 993.730759155025, 1004.19190398011, 1014.7624408779,
200
+    1025.44353334027, 1036.23635768138, 1047.14210318612, 1058.16197226031,
201
x265_1.9.tar.gz/source/common/constants.h -> x265_2.0.tar.gz/source/common/constants.h Changed
18
 
1
@@ -96,9 +96,15 @@
2
 // Intra tables
3
 extern const uint8_t g_intraFilterFlags[NUM_INTRA_MODE];
4
 
5
-extern const uint32_t g_depthInc[3][4];
6
 extern const uint32_t g_depthScanIdx[8][8];
7
 
8
+extern const double g_YUVtoRGB_BT2020[3][3];
9
+
10
+#define MIN_HDR_LEGAL_RANGE 64
11
+#define MAX_HDR_LEGAL_RANGE 940
12
+#define CBCR_OFFSET 512
13
+extern const double g_ST2084_PQTable[MAX_HDR_LEGAL_RANGE - MIN_HDR_LEGAL_RANGE + 1];
14
+
15
 }
16
 
17
 #endif
18
x265_1.9.tar.gz/source/common/contexts.h -> x265_2.0.tar.gz/source/common/contexts.h Changed
198
 
1
@@ -117,196 +117,8 @@
2
 #define sbacGetEntropyBits(S, V) (g_entropyBits[(S) ^ (V)])
3
 #define sbacGetEntropyBitsTrm(V) (g_entropyBits[126 ^ (V)])
4
 
5
-#define MAX_NUM_CHANNEL_TYPE     2
6
-
7
 static const uint32_t ctxCbf[3][5] = { { 1, 0, 0, 0, 0 }, { 2, 3, 4, 5, 6 }, { 2, 3, 4, 5, 6 } };
8
-static const uint32_t significanceMapContextSetStart[MAX_NUM_CHANNEL_TYPE][3] = { { 0,  9, 21 }, { 0,  9, 12 } };
9
-static const uint32_t significanceMapContextSetSize[MAX_NUM_CHANNEL_TYPE][3]  = { { 9, 12,  6 }, { 9,  3,  3 } };
10
-static const uint32_t nonDiagonalScan8x8ContextOffset[MAX_NUM_CHANNEL_TYPE]   = {  6, 0  };
11
-static const uint32_t notFirstGroupNeighbourhoodContextOffset[MAX_NUM_CHANNEL_TYPE] = { 3, 0 };
12
-
13
-// initial probability for cu_transquant_bypass flag
14
-static const uint8_t INIT_CU_TRANSQUANT_BYPASS_FLAG[3][NUM_TQUANT_BYPASS_FLAG_CTX] =
15
-{
16
-    { 154 },
17
-    { 154 },
18
-    { 154 },
19
-};
20
-
21
-// initial probability for split flag
22
-static const uint8_t INIT_SPLIT_FLAG[3][NUM_SPLIT_FLAG_CTX] =
23
-{
24
-    { 107,  139,  126, },
25
-    { 107,  139,  126, },
26
-    { 139,  141,  157, },
27
-};
28
-
29
-static const uint8_t INIT_SKIP_FLAG[3][NUM_SKIP_FLAG_CTX] =
30
-{
31
-    { 197,  185,  201, },
32
-    { 197,  185,  201, },
33
-    { CNU,  CNU,  CNU, },
34
-};
35
-
36
-static const uint8_t INIT_MERGE_FLAG_EXT[3][NUM_MERGE_FLAG_EXT_CTX] =
37
-{
38
-    { 154, },
39
-    { 110, },
40
-    { CNU, },
41
-};
42
-
43
-static const uint8_t INIT_MERGE_IDX_EXT[3][NUM_MERGE_IDX_EXT_CTX] =
44
-{
45
-    { 137, },
46
-    { 122, },
47
-    { CNU, },
48
-};
49
-
50
-static const uint8_t INIT_PART_SIZE[3][NUM_PART_SIZE_CTX] =
51
-{
52
-    { 154,  139,  154, 154 },
53
-    { 154,  139,  154, 154 },
54
-    { 184,  CNU,  CNU, CNU },
55
-};
56
-
57
-static const uint8_t INIT_PRED_MODE[3][NUM_PRED_MODE_CTX] =
58
-{
59
-    { 134, },
60
-    { 149, },
61
-    { CNU, },
62
-};
63
-
64
-static const uint8_t INIT_INTRA_PRED_MODE[3][NUM_ADI_CTX] =
65
-{
66
-    { 183, },
67
-    { 154, },
68
-    { 184, },
69
-};
70
-
71
-static const uint8_t INIT_CHROMA_PRED_MODE[3][NUM_CHROMA_PRED_CTX] =
72
-{
73
-    { 152,  139, },
74
-    { 152,  139, },
75
-    {  63,  139, },
76
-};
77
-
78
-static const uint8_t INIT_INTER_DIR[3][NUM_INTER_DIR_CTX] =
79
-{
80
-    {  95,   79,   63,   31,  31, },
81
-    {  95,   79,   63,   31,  31, },
82
-    { CNU,  CNU,  CNU,  CNU, CNU, },
83
-};
84
-
85
-static const uint8_t INIT_MVD[3][NUM_MV_RES_CTX] =
86
-{
87
-    { 169,  198, },
88
-    { 140,  198, },
89
-    { CNU,  CNU, },
90
-};
91
-
92
-static const uint8_t INIT_REF_PIC[3][NUM_REF_NO_CTX] =
93
-{
94
-    { 153,  153 },
95
-    { 153,  153 },
96
-    { CNU,  CNU },
97
-};
98
-
99
-static const uint8_t INIT_DQP[3][NUM_DELTA_QP_CTX] =
100
-{
101
-    { 154,  154,  154, },
102
-    { 154,  154,  154, },
103
-    { 154,  154,  154, },
104
-};
105
-
106
-static const uint8_t INIT_QT_CBF[3][NUM_QT_CBF_CTX] =
107
-{
108
-    { 153,  111,  149,   92,  167,  154,  154 },
109
-    { 153,  111,  149,  107,  167,  154,  154 },
110
-    { 111,  141,   94,  138,  182,  154,  154 },
111
-};
112
-
113
-static const uint8_t INIT_QT_ROOT_CBF[3][NUM_QT_ROOT_CBF_CTX] =
114
-{
115
-    {  79, },
116
-    {  79, },
117
-    { CNU, },
118
-};
119
-
120
-static const uint8_t INIT_LAST[3][NUM_CTX_LAST_FLAG_XY] =
121
-{
122
-    { 125,  110,  124,  110,   95,   94,  125,  111,  111,   79,  125,  126,  111,  111,   79,
123
-      108,  123,   93 },
124
-    { 125,  110,   94,  110,   95,   79,  125,  111,  110,   78,  110,  111,  111,   95,   94,
125
-      108,  123,  108 },
126
-    { 110,  110,  124,  125,  140,  153,  125,  127,  140,  109,  111,  143,  127,  111,   79,
127
-      108,  123,   63 },
128
-};
129
-
130
-static const uint8_t INIT_SIG_CG_FLAG[3][2 * NUM_SIG_CG_FLAG_CTX] =
131
-{
132
-    { 121,  140,
133
-      61,  154, },
134
-    { 121,  140,
135
-      61,  154, },
136
-    {  91,  171,
137
-       134,  141, },
138
-};
139
-
140
-static const uint8_t INIT_SIG_FLAG[3][NUM_SIG_FLAG_CTX] =
141
-{
142
-    { 170,  154,  139,  153,  139,  123,  123,   63,  124,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  170,  153,  138,  138,  122,  121,  122,  121,  167,  151,  183,  140,  151,  183,  140,  },
143
-    { 155,  154,  139,  153,  139,  123,  123,   63,  153,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  170,  153,  123,  123,  107,  121,  107,  121,  167,  151,  183,  140,  151,  183,  140,  },
144
-    { 111,  111,  125,  110,  110,   94,  124,  108,  124,  107,  125,  141,  179,  153,  125,  107,  125,  141,  179,  153,  125,  107,  125,  141,  179,  153,  125,  140,  139,  182,  182,  152,  136,  152,  136,  153,  136,  139,  111,  136,  139,  111,  },
145
-};
146
-
147
-static const uint8_t INIT_ONE_FLAG[3][NUM_ONE_FLAG_CTX] =
148
-{
149
-    { 154,  196,  167,  167,  154,  152,  167,  182,  182,  134,  149,  136,  153,  121,  136,  122,  169,  208,  166,  167,  154,  152,  167,  182, },
150
-    { 154,  196,  196,  167,  154,  152,  167,  182,  182,  134,  149,  136,  153,  121,  136,  137,  169,  194,  166,  167,  154,  167,  137,  182, },
151
-    { 140,   92,  137,  138,  140,  152,  138,  139,  153,   74,  149,   92,  139,  107,  122,  152,  140,  179,  166,  182,  140,  227,  122,  197, },
152
-};
153
-
154
-static const uint8_t INIT_ABS_FLAG[3][NUM_ABS_FLAG_CTX] =
155
-{
156
-    { 107,  167,   91,  107,  107,  167, },
157
-    { 107,  167,   91,  122,  107,  167, },
158
-    { 138,  153,  136,  167,  152,  152, },
159
-};
160
-
161
-static const uint8_t INIT_MVP_IDX[3][NUM_MVP_IDX_CTX] =
162
-{
163
-    { 168 },
164
-    { 168 },
165
-    { CNU },
166
-};
167
-
168
-static const uint8_t INIT_SAO_MERGE_FLAG[3][NUM_SAO_MERGE_FLAG_CTX] =
169
-{
170
-    { 153,  },
171
-    { 153,  },
172
-    { 153,  },
173
-};
174
-
175
-static const uint8_t INIT_SAO_TYPE_IDX[3][NUM_SAO_TYPE_IDX_CTX] =
176
-{
177
-    { 160, },
178
-    { 185, },
179
-    { 200, },
180
-};
181
-
182
-static const uint8_t INIT_TRANS_SUBDIV_FLAG[3][NUM_TRANS_SUBDIV_FLAG_CTX] =
183
-{
184
-    { 224,  167,  122, },
185
-    { 124,  138,   94, },
186
-    { 153,  138,  138, },
187
-};
188
 
189
-static const uint8_t INIT_TRANSFORMSKIP_FLAG[3][2 * NUM_TRANSFORMSKIP_FLAG_CTX] =
190
-{
191
-    { 139,  139 },
192
-    { 139,  139 },
193
-    { 139,  139 },
194
-};
195
 }
196
 
197
 #endif // ifndef X265_CONTEXTS_H
198
x265_1.9.tar.gz/source/common/cpu.cpp -> x265_2.0.tar.gz/source/common/cpu.cpp Changed
14
 
1
@@ -274,9 +274,9 @@
2
         if (!cache && max_basic_cap >= 2)
3
         {
4
             // Cache and TLB Information
5
-            static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
6
-            static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67,
7
-                                                0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
8
+            static const char cache32_ids[] = { '\x0a','\x0c','\x41','\x42','\x43','\x44','\x45','\x82','\x83','\x84','\x85','\0' };
9
+            static const char cache64_ids[] = { '\x22','\x23','\x25','\x29','\x2c','\x46','\x47','\x49','\x60','\x66','\x67',
10
+                                                '\x68','\x78','\x79','\x7a','\x7b','\x7c','\x7c','\x7f','\x86','\x87','\0' };
11
             uint32_t buf[4];
12
             int max, i = 0;
13
             do
14
x265_1.9.tar.gz/source/common/cudata.cpp -> x265_2.0.tar.gz/source/common/cudata.cpp Changed
46
 
1
@@ -480,7 +480,7 @@
2
 }
3
 
4
 /* The reverse of copyToPic, called only by encodeResidue */
5
-void CUData::copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp)
6
+void CUData::copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp, bool copyQp)
7
 {
8
     m_encData       = ctu.m_encData;
9
     m_slice         = ctu.m_slice;
10
@@ -491,7 +491,8 @@
11
     m_numPartitions = cuGeom.numPartitions;
12
 
13
     /* copy out all prediction info for this part */
14
-    m_partCopy((uint8_t*)m_qp, (uint8_t*)ctu.m_qp + m_absIdxInCTU);
15
+    if (copyQp) m_partCopy((uint8_t*)m_qp, (uint8_t*)ctu.m_qp + m_absIdxInCTU);
16
+
17
     m_partCopy(m_log2CUSize,   ctu.m_log2CUSize + m_absIdxInCTU);
18
     m_partCopy(m_lumaIntraDir, ctu.m_lumaIntraDir + m_absIdxInCTU);
19
     m_partCopy(m_tqBypass,     ctu.m_tqBypass + m_absIdxInCTU);
20
@@ -526,7 +527,7 @@
21
 }
22
 
23
 /* Only called by encodeResidue, these fields can be modified during inter/intra coding */
24
-void CUData::updatePic(uint32_t depth) const
25
+void CUData::updatePic(uint32_t depth, int picCsp) const
26
 {
27
     CUData& ctu = *m_encData->getPicCTU(m_cuAddr);
28
 
29
@@ -540,7 +541,7 @@
30
     uint32_t tmpY2 = m_absIdxInCTU << (LOG2_UNIT_SIZE * 2);
31
     memcpy(ctu.m_trCoeff[0] + tmpY2, m_trCoeff[0], sizeof(coeff_t)* tmpY);
32
 
33
-    if (ctu.m_chromaFormat != X265_CSP_I400)
34
+    if (ctu.m_chromaFormat != X265_CSP_I400 && picCsp != X265_CSP_I400)
35
     {
36
         m_partCopy(ctu.m_transformSkip[1] + m_absIdxInCTU, m_transformSkip[1]);
37
         m_partCopy(ctu.m_transformSkip[2] + m_absIdxInCTU, m_transformSkip[2]);
38
@@ -2088,6 +2089,7 @@
39
                 cu->absPartIdx = g_depthScanIdx[yOffset][xOffset] * 4;
40
                 cu->numPartitions = (NUM_4x4_PARTITIONS >> ((g_maxLog2CUSize - cu->log2CUSize) * 2));
41
                 cu->depth = g_log2Size[maxCUSize] - log2CUSize;
42
+                cu->geomRecurId = cuIdx;
43
 
44
                 cu->flags = 0;
45
                 CU_SET_FLAG(cu->flags, CUGeom::PRESENT, presentFlag);
46
x265_1.9.tar.gz/source/common/cudata.h -> x265_2.0.tar.gz/source/common/cudata.h Changed
29
 
1
@@ -87,6 +87,7 @@
2
     uint32_t numPartitions; // Number of 4x4 blocks in the CU
3
     uint32_t flags;         // CU flags.
4
     uint32_t depth;         // depth of this CU relative from CTU
5
+    uint32_t geomRecurId;   // Unique geom id from 0 to MAX_GEOMS - 1 for every depth
6
 };
7
 
8
 struct MVField
9
@@ -222,8 +223,8 @@
10
     void     copyToPic(uint32_t depth) const;
11
 
12
     /* RD-0 methods called only from encodeResidue */
13
-    void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp);
14
-    void     updatePic(uint32_t depth) const;
15
+    void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp, bool copyQp = true);
16
+    void     updatePic(uint32_t depth, int picCsp) const;
17
 
18
     void     setPartSizeSubParts(PartSize size)    { m_partSet(m_partSize, (uint8_t)size); }
19
     void     setPredModeSubParts(PredMode mode)    { m_partSet(m_predMode, (uint8_t)mode); }
20
@@ -246,7 +247,7 @@
21
     void     setPURefIdx(int list, int8_t refIdx, int absPartIdx, int puIdx);
22
 
23
     uint8_t  getCbf(uint32_t absPartIdx, TextType ttype, uint32_t tuDepth) const { return (m_cbf[ttype][absPartIdx] >> tuDepth) & 0x1; }
24
-    uint8_t  getQtRootCbf(uint32_t absPartIdx) const                             { if (m_chromaFormat == X265_CSP_I400) return m_cbf[0][absPartIdx] || false; else { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx];} }
25
+    bool     getQtRootCbf(uint32_t absPartIdx) const                             { return (m_cbf[0][absPartIdx] || ((m_chromaFormat != X265_CSP_I400) && (m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx]))); }
26
     int8_t   getRefQP(uint32_t currAbsIdxInCTU) const;
27
     uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*candMvField)[2], uint8_t* candDir) const;
28
     void     clipMv(MV& outMV) const;
29
x265_1.9.tar.gz/source/common/deblock.cpp -> x265_2.0.tar.gz/source/common/deblock.cpp Changed
38
 
1
@@ -319,27 +319,6 @@
2
     }
3
 }
4
 
5
-/* Deblocking of one line/column for the chrominance component
6
- * \param src     pointer to picture data
7
- * \param offset  offset value for picture data
8
- * \param tc      tc value
9
- * \param maskP   indicator to disable filtering on partP
10
- * \param maskQ   indicator to disable filtering on partQ */
11
-static inline void pelFilterChroma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
12
-{
13
-    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
14
-    {
15
-        int16_t m4  = (int16_t)src[0];
16
-        int16_t m3  = (int16_t)src[-offset];
17
-        int16_t m5  = (int16_t)src[offset];
18
-        int16_t m2  = (int16_t)src[-offset * 2];
19
-
20
-        int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) * 4) + m2 - m5 + 4) >> 3));
21
-        src[-offset] = x265_clip(m3 + (delta & maskP));
22
-        src[0] = x265_clip(m4 - (delta & maskQ));
23
-    }
24
-}
25
-
26
 void Deblock::edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[])
27
 {
28
     PicYuv* reconPic = cuQ->m_encData->m_reconPic;
29
@@ -517,7 +496,7 @@
30
             int32_t tc = s_tcTable[indexTC] << bitdepthShift;
31
             pixel* srcC = srcChroma[chromaIdx];
32
 
33
-            pelFilterChroma(srcC + unitOffset, srcStep, offset, tc, maskP, maskQ);
34
+            primitives.pelFilterChroma[dir](srcC + unitOffset, srcStep, offset, tc, maskP, maskQ);
35
         }
36
     }
37
 }
38
x265_1.9.tar.gz/source/common/frame.cpp -> x265_2.0.tar.gz/source/common/frame.cpp Changed
41
 
1
@@ -42,12 +42,14 @@
2
     m_prev = NULL;
3
     m_param = NULL;
4
     memset(&m_lowres, 0, sizeof(m_lowres));
5
+    m_rcData = NULL;
6
 }
7
 
8
 bool Frame::create(x265_param *param, float* quantOffsets)
9
 {
10
     m_fencPic = new PicYuv;
11
     m_param = param;
12
+    CHECKED_MALLOC_ZERO(m_rcData, RcStats, 1);
13
 
14
     if (m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
15
         m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode))
16
@@ -64,14 +66,17 @@
17
         return true;
18
     }
19
     return false;
20
+fail:
21
+    return false;
22
 }
23
 
24
 bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
25
 {
26
     m_encData = new FrameData;
27
     m_reconPic = new PicYuv;
28
+    m_param = param;
29
     m_encData->m_reconPic = m_reconPic;
30
-    bool ok = m_encData->create(*param, sps) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
31
+    bool ok = m_encData->create(*param, sps, m_fencPic->m_picCsp) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
32
     if (ok)
33
     {
34
         /* initialize right border of m_reconpicYuv as SAO may read beyond the
35
@@ -139,4 +144,5 @@
36
     }
37
 
38
     m_lowres.destroy();
39
+    X265_FREE(m_rcData);
40
 }
41
x265_1.9.tar.gz/source/common/frame.h -> x265_2.0.tar.gz/source/common/frame.h Changed
45
 
1
@@ -37,6 +37,27 @@
2
 
3
 #define IS_REFERENCED(frame) (frame->m_lowres.sliceType != X265_TYPE_B)
4
 
5
+/* Ratecontrol statistics */
6
+struct RcStats
7
+{
8
+    double   qpaRc;
9
+    double   qpAq;
10
+    double   qRceq;
11
+    double   qpNoVbv;
12
+    double   newQScale;
13
+    double   iCuCount;
14
+    double   pCuCount;
15
+    double   skipCuCount;
16
+    double   qScale;
17
+    int      mvBits;
18
+    int      miscBits;
19
+    int      coeffBits;
20
+    int      poc;
21
+    int      encodeOrder;
22
+    int      sliceType;
23
+    int      keptAsRef;
24
+};
25
+
26
 class Frame
27
 {
28
 public:
29
@@ -49,6 +70,7 @@
30
     /* Data associated with x265_picture */
31
     PicYuv*                m_fencPic;
32
     int                    m_poc;
33
+    int                    m_encodeOrder;
34
     int64_t                m_pts;                // user provided presentation time stamp
35
     int64_t                m_reorderedPts;
36
     int64_t                m_dts;
37
@@ -71,6 +93,7 @@
38
     Frame*                 m_prev;
39
     x265_param*            m_param;              // Points to the latest param set for the frame.
40
     x265_analysis_data     m_analysisData;
41
+    RcStats*               m_rcData;
42
     Frame();
43
 
44
     bool create(x265_param *param, float* quantOffsets);
45
x265_1.9.tar.gz/source/common/framedata.cpp -> x265_2.0.tar.gz/source/common/framedata.cpp Changed
22
 
1
@@ -31,17 +31,18 @@
2
     memset(this, 0, sizeof(*this));
3
 }
4
 
5
-bool FrameData::create(const x265_param& param, const SPS& sps)
6
+bool FrameData::create(const x265_param& param, const SPS& sps, int csp)
7
 {
8
     m_param = &param;
9
     m_slice  = new Slice;
10
     m_picCTU = new CUData[sps.numCUsInFrame];
11
+    m_picCsp = csp;
12
 
13
     m_cuMemPool.create(0, param.internalCsp, sps.numCUsInFrame);
14
     for (uint32_t ctuAddr = 0; ctuAddr < sps.numCUsInFrame; ctuAddr++)
15
         m_picCTU[ctuAddr].initialize(m_cuMemPool, 0, param.internalCsp, ctuAddr);
16
 
17
-    CHECKED_MALLOC(m_cuStat, RCStatCU, sps.numCUsInFrame);
18
+    CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame);
19
     CHECKED_MALLOC(m_rowStat, RCStatRow, sps.numCuInHeight);
20
     reinit(sps);
21
     return true;
22
x265_1.9.tar.gz/source/common/framedata.h -> x265_2.0.tar.gz/source/common/framedata.h Changed
28
 
1
@@ -146,10 +146,11 @@
2
     double         m_avgQpRc;    /* avg QP as decided by rate-control */
3
     double         m_avgQpAq;    /* avg QP as decided by AQ in addition to rate-control */
4
     double         m_rateFactor; /* calculated based on the Frame QP */
5
+    int            m_picCsp;
6
 
7
     FrameData();
8
 
9
-    bool create(const x265_param& param, const SPS& sps);
10
+    bool create(const x265_param& param, const SPS& sps, int csp);
11
     void reinit(const SPS& sps);
12
     void destroy();
13
     inline CUData* getPicCTU(uint32_t ctuAddr) { return &m_picCTU[ctuAddr]; }
14
@@ -168,10 +169,12 @@
15
 struct analysis_inter_data
16
 {
17
     MV*         mv;
18
+    WeightParam* wt;
19
     int32_t*    ref;
20
     uint8_t*    depth;
21
     uint8_t*    modes;
22
-    uint32_t*   bestMergeCand;
23
+    uint8_t*    partSize;
24
+    uint8_t*    mergeFlag;
25
 };
26
 }
27
 #endif // ifndef X265_FRAMEDATA_H
28
x265_1.9.tar.gz/source/common/ipfilter.cpp -> x265_2.0.tar.gz/source/common/ipfilter.cpp Changed
15
 
1
@@ -365,10 +365,10 @@
2
 template<int N, int width, int height>
3
 void interp_hv_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
4
 {
5
-    short immedVals[(64 + 8) * (64 + 8)];
6
+    ALIGN_VAR_32(int16_t, immed[width * (height + N - 1)]);
7
 
8
-    interp_horiz_ps_c<N, width, height>(src, srcStride, immedVals, width, idxX, 1);
9
-    filterVertical_sp_c<N>(immedVals + 3 * width, width, dst, dstStride, width, height, idxY);
10
+    interp_horiz_ps_c<N, width, height>(src, srcStride, immed, width, idxX, 1);
11
+    filterVertical_sp_c<N>(immed + (N / 2 - 1) * width, width, dst, dstStride, width, height, idxY);
12
 }
13
 }
14
 
15
x265_1.9.tar.gz/source/common/loopfilter.cpp -> x265_2.0.tar.gz/source/common/loopfilter.cpp Changed
45
 
1
@@ -27,7 +27,6 @@
2
 #include "primitives.h"
3
 
4
 #define PIXEL_MIN 0
5
-#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
6
 
7
 namespace {
8
 
9
@@ -158,6 +157,27 @@
10
         src[offset * 2]  = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
11
     }
12
 }
13
+
14
+/* Deblocking of one line/column for the chrominance component
15
+* \param src     pointer to picture data
16
+* \param offset  offset value for picture data
17
+* \param tc      tc value
18
+* \param maskP   indicator to disable filtering on partP
19
+* \param maskQ   indicator to disable filtering on partQ */
20
+static void pelFilterChroma_c(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
21
+{
22
+    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
23
+    {
24
+        int16_t m4 = (int16_t)src[0];
25
+        int16_t m3 = (int16_t)src[-offset];
26
+        int16_t m5 = (int16_t)src[offset];
27
+        int16_t m2 = (int16_t)src[-offset * 2];
28
+
29
+        int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) * 4) + m2 - m5 + 4) >> 3));
30
+        src[-offset]  = x265_clip(m3 + (delta & maskP));
31
+        src[0]        = x265_clip(m4 - (delta & maskQ));
32
+    }
33
+}
34
 }
35
 
36
 namespace X265_NS {
37
@@ -176,5 +196,7 @@
38
     // C code is same for EDGE_VER and EDGE_HOR only asm code is different
39
     p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c;
40
     p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c;
41
+    p.pelFilterChroma[0]     = pelFilterChroma_c;
42
+    p.pelFilterChroma[1]     = pelFilterChroma_c;
43
 }
44
 }
45
x265_1.9.tar.gz/source/common/param.cpp -> x265_2.0.tar.gz/source/common/param.cpp Changed
201
 
1
@@ -121,9 +121,9 @@
2
     /* Source specifications */
3
     param->internalBitDepth = X265_DEPTH;
4
     param->internalCsp = X265_CSP_I420;
5
-
6
-    param->levelIdc = 0;
7
-    param->bHighTier = 0;
8
+    param->levelIdc = 0; //Auto-detect level
9
+    param->uhdBluray = 0;
10
+    param->bHighTier = 1; //Allow high tier by default
11
     param->interlaceMode = 0;
12
     param->bAnnexB = 1;
13
     param->bRepeatHeaders = 0;
14
@@ -164,6 +164,7 @@
15
     param->bEnableWeightedPred = 1;
16
     param->bEnableWeightedBiPred = 0;
17
     param->bEnableEarlySkip = 0;
18
+    param->bEnableRecursionSkip = 1;
19
     param->bEnableAMP = 0;
20
     param->bEnableRectInter = 0;
21
     param->rdLevel = 3;
22
@@ -193,6 +194,7 @@
23
     param->bLossless = 0;
24
     param->bCULossless = 0;
25
     param->bEnableTemporalSubLayers = 0;
26
+    param->bEnableRdRefine = 0;
27
 
28
     /* Rate control options */
29
     param->rc.vbvMaxBitrate = 0;
30
@@ -219,8 +221,9 @@
31
     param->rc.qblur = 0.5;
32
     param->rc.zoneCount = 0;
33
     param->rc.zones = NULL;
34
-    param->rc.bEnableSlowFirstPass = 0;
35
+    param->rc.bEnableSlowFirstPass = 1;
36
     param->rc.bStrictCbr = 0;
37
+    param->rc.bEnableGrain = 0;
38
 
39
     /* Video Usability Information (VUI) */
40
     param->vui.aspectRatioIdc = 0;
41
@@ -245,7 +248,7 @@
42
     param->maxCLL = 0;
43
     param->maxFALL = 0;
44
     param->minLuma = 0;
45
-    param->maxLuma = (1 << X265_DEPTH) - 1;
46
+    param->maxLuma = PIXEL_MAX;
47
 }
48
 
49
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
50
@@ -408,9 +411,9 @@
51
             param->maxNumMergeCand = 5;
52
             param->searchMethod = X265_STAR_SEARCH;
53
             param->bEnableTransformSkip = 1;
54
+            param->bEnableRecursionSkip = 0;
55
             param->maxNumReferences = 5;
56
             param->limitReferences = 0;
57
-            param->rc.bEnableSlowFirstPass = 1;
58
             param->bIntraInBFrames = 1;
59
             param->lookaheadSlices = 0; // disabled for best quality
60
             // TODO: optimized esa
61
@@ -453,16 +456,16 @@
62
         }
63
         else if (!strcmp(tune, "grain"))
64
         {
65
-            param->deblockingFilterBetaOffset = -2;
66
-            param->deblockingFilterTCOffset = -2;
67
-            param->bIntraInBFrames = 0;
68
-            param->rdoqLevel = 2;
69
-            param->psyRdoq = 10.0;
70
-            param->psyRd = 0.5;
71
             param->rc.ipFactor = 1.1;
72
-            param->rc.pbFactor = 1.1;
73
-            param->rc.aqStrength = 0.3;
74
-            param->rc.qCompress = 0.8;
75
+            param->rc.pbFactor = 1.0;
76
+            param->rc.cuTree = 0;
77
+            param->rc.aqMode = 0;
78
+            param->rc.qpStep = 1;
79
+            param->rc.bEnableGrain = 1;
80
+            param->bEnableRecursionSkip = 0;
81
+            param->psyRd = 4.0;
82
+            param->psyRdoq = 10.0;
83
+            param->bEnableSAO = 0;
84
         }
85
         else
86
             return -1;
87
@@ -616,6 +619,7 @@
88
     OPT("max-merge") p->maxNumMergeCand = (uint32_t)atoi(value);
89
     OPT("temporal-mvp") p->bEnableTemporalMvp = atobool(value);
90
     OPT("early-skip") p->bEnableEarlySkip = atobool(value);
91
+    OPT("rskip") p->bEnableRecursionSkip = atobool(value);
92
     OPT("rdpenalty") p->rdPenalty = atoi(value);
93
     OPT("tskip") p->bEnableTransformSkip = atobool(value);
94
     OPT("no-tskip-fast") p->bEnableTSkipFast = atobool(value);
95
@@ -702,6 +706,7 @@
96
         else
97
             p->psyRdoq = 0.0;
98
     }
99
+    OPT("rd-refine") p->bEnableRdRefine = atobool(value);
100
     OPT("signhide") p->bEnableSignHiding = atobool(value);
101
     OPT("b-intra") p->bIntraInBFrames = atobool(value);
102
     OPT("lft") p->bEnableLoopFilter = atobool(value); /* DEPRECATED */
103
@@ -757,6 +762,7 @@
104
         p->rc.qp = atoi(value);
105
         p->rc.rateControlMode = X265_RC_CQP;
106
     }
107
+    OPT("rc-grain") p->rc.bEnableGrain = atobool(value);
108
     OPT("zones")
109
     {
110
         p->rc.zoneCount = 1;
111
@@ -877,6 +883,7 @@
112
     OPT("max-cll") bError |= sscanf(value, "%hu,%hu", &p->maxCLL, &p->maxFALL) != 2;
113
     OPT("min-luma") p->minLuma = (uint16_t)atoi(value);
114
     OPT("max-luma") p->maxLuma = (uint16_t)atoi(value);
115
+    OPT("uhd-bd") p->uhdBluray = atobool(value);
116
     else
117
         return X265_PARAM_BAD_NAME;
118
 #undef OPT
119
@@ -1023,7 +1030,8 @@
120
 {
121
 #define CHECK(expr, msg) check_failed |= _confirm(param, expr, msg)
122
     int check_failed = 0; /* abort if there is a fatal configuration problem */
123
-
124
+    CHECK(param->uhdBluray == 1 && (X265_DEPTH != 10 || param->internalCsp != 1 || param->interlaceMode != 0),
125
+        "uhd-bd: bit depth, chroma subsample, source picture type must be 10, 4:2:0, progressive");
126
     CHECK(param->maxCUSize != 64 && param->maxCUSize != 32 && param->maxCUSize != 16,
127
           "max cu size must be 16, 32, or 64");
128
     if (check_failed == 1)
129
@@ -1096,7 +1104,7 @@
130
 
131
     CHECK(param->rc.rateControlMode > X265_RC_CRF || param->rc.rateControlMode < X265_RC_ABR,
132
           "Rate control mode is out of range");
133
-    CHECK(param->rdLevel < 0 || param->rdLevel > 6,
134
+    CHECK(param->rdLevel < 1 || param->rdLevel > 6,
135
           "RD Level is out of range");
136
     CHECK(param->rdoqLevel < 0 || param->rdoqLevel > 2,
137
         "RDOQ Level is out of range");
138
@@ -1194,12 +1202,12 @@
139
         CHECK(0 > param->noiseReductionIntra || param->noiseReductionIntra > 2000, "Valid noise reduction range 0 - 2000");
140
     if (param->noiseReductionInter)
141
         CHECK(0 > param->noiseReductionInter || param->noiseReductionInter > 2000, "Valid noise reduction range 0 - 2000");
142
-    CHECK(param->rc.rateControlMode == X265_RC_CRF && param->rc.bStatRead && param->rc.vbvMaxBitrate == 0,
143
-          "Constant rate-factor is incompatible with 2pass");
144
     CHECK(param->rc.rateControlMode == X265_RC_CQP && param->rc.bStatRead,
145
           "Constant QP is incompatible with 2pass");
146
     CHECK(param->rc.bStrictCbr && (param->rc.bitrate <= 0 || param->rc.vbvBufferSize <=0),
147
           "Strict-cbr cannot be applied without specifying target bitrate or vbv bufsize");
148
+    CHECK(param->analysisMode && (param->analysisMode < X265_ANALYSIS_OFF || param->analysisMode > X265_ANALYSIS_LOAD),
149
+        "Invalid analysis mode. Analysis mode 0: OFF 1: SAVE : 2 LOAD");
150
     return check_failed;
151
 }
152
 
153
@@ -1225,18 +1233,21 @@
154
     uint32_t maxLog2CUSize = (uint32_t)g_log2Size[param->maxCUSize];
155
     uint32_t minLog2CUSize = (uint32_t)g_log2Size[param->minCUSize];
156
 
157
-    if (ATOMIC_INC(&g_ctuSizeConfigured) > 1)
158
+    Lock gLock;
159
+    ScopedLock sLock(gLock);
160
+
161
+    if (++g_ctuSizeConfigured > 1)
162
     {
163
         if (g_maxCUSize != param->maxCUSize)
164
         {
165
-            x265_log(param, X265_LOG_ERROR, "maxCUSize must be the same for all encoders in a single process");
166
-            return -1;
167
+            x265_log(param, X265_LOG_WARNING, "maxCUSize must be the same for all encoders in a single process");
168
         }
169
         if (g_maxCUDepth != maxLog2CUSize - minLog2CUSize)
170
         {
171
-            x265_log(param, X265_LOG_ERROR, "maxCUDepth must be the same for all encoders in a single process");
172
-            return -1;
173
+            x265_log(param, X265_LOG_WARNING, "maxCUDepth must be the same for all encoders in a single process");
174
         }
175
+        param->maxCUSize = g_maxCUSize;
176
+        return x265_check_params(param); /* Check again, since param may have changed */
177
     }
178
     else
179
     {
180
@@ -1302,8 +1313,9 @@
181
     x265_log(param, X265_LOG_INFO, "Lookahead / bframes / badapt        : %d / %d / %d\n", param->lookaheadDepth, param->bframes, param->bFrameAdaptive);
182
     x265_log(param, X265_LOG_INFO, "b-pyramid / weightp / weightb       : %d / %d / %d\n",
183
              param->bBPyramid, param->bEnableWeightedPred, param->bEnableWeightedBiPred);
184
-    x265_log(param, X265_LOG_INFO, "References / ref-limit  cu / depth  : %d / %d / %d\n",
185
-             param->maxNumReferences, !!(param->limitReferences & X265_REF_LIMIT_CU), !!(param->limitReferences & X265_REF_LIMIT_DEPTH));
186
+    x265_log(param, X265_LOG_INFO, "References / ref-limit  cu / depth  : %d / %s / %s\n",
187
+             param->maxNumReferences, (param->limitReferences & X265_REF_LIMIT_CU) ? "on" : "off",
188
+             (param->limitReferences & X265_REF_LIMIT_DEPTH) ? "on" : "off");
189
 
190
     if (param->rc.aqMode)
191
         x265_log(param, X265_LOG_INFO, "AQ: mode / str / qg-size / cu-tree  : %d / %0.1f / %d / %d\n", param->rc.aqMode,
192
@@ -1336,7 +1348,9 @@
193
     TOOLVAL(param->psyRd, "psy-rd=%.2lf");
194
     TOOLVAL(param->rdoqLevel, "rdoq=%d");
195
     TOOLVAL(param->psyRdoq, "psy-rdoq=%.2lf");
196
+    TOOLOPT(param->bEnableRdRefine, "rd-refine");
197
     TOOLOPT(param->bEnableEarlySkip, "early-skip");
198
+    TOOLOPT(param->bEnableRecursionSkip, "rskip");
199
     TOOLVAL(param->noiseReductionIntra, "nr-intra=%d");
200
     TOOLVAL(param->noiseReductionInter, "nr-inter=%d");
201
x265_1.9.tar.gz/source/common/param.h -> x265_2.0.tar.gz/source/common/param.h Changed
9
 
1
@@ -30,7 +30,6 @@
2
 int   x265_check_params(x265_param *param);
3
 int   x265_set_globals(x265_param *param);
4
 void  x265_print_params(x265_param *param);
5
-void  x265_print_reconfigured_params(x265_param* param, x265_param* reconfiguredParam);
6
 void  x265_param_apply_fastfirstpass(x265_param *p);
7
 char* x265_param2string(x265_param *param);
8
 int   x265_atoi(const char *str, bool& bError);
9
x265_1.9.tar.gz/source/common/picyuv.cpp -> x265_2.0.tar.gz/source/common/picyuv.cpp Changed
90
 
1
@@ -46,6 +46,10 @@
2
 
3
     m_maxLumaLevel = 0;
4
     m_avgLumaLevel = 0;
5
+    m_stride = 0;
6
+    m_strideC = 0;
7
+    m_hChromaShift = 0;
8
+    m_vChromaShift = 0;
9
 }
10
 
11
 bool PicYuv::create(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp)
12
@@ -176,6 +180,7 @@
13
      * warnings from valgrind about using uninitialized pixels */
14
     padx++;
15
     pady++;
16
+    m_picCsp = pic.colorSpace;
17
 
18
     X265_CHECK(pic.bitDepth >= 8, "pic.bitDepth check failure");
19
 
20
@@ -190,7 +195,7 @@
21
 
22
             primitives.planecopy_cp(yChar, pic.stride[0] / sizeof(*yChar), yPixel, m_stride, width, height, shift);
23
 
24
-            if (pic.colorSpace != X265_CSP_I400)
25
+            if (param.internalCsp != X265_CSP_I400)
26
             {
27
                 pixel *uPixel = m_picOrg[1];
28
                 pixel *vPixel = m_picOrg[2];
29
@@ -216,7 +221,7 @@
30
                 yChar += pic.stride[0] / sizeof(*yChar);
31
             }
32
 
33
-            if (pic.colorSpace != X265_CSP_I400)
34
+            if (param.internalCsp != X265_CSP_I400)
35
             {
36
                 pixel *uPixel = m_picOrg[1];
37
                 pixel *vPixel = m_picOrg[2];
38
@@ -258,7 +263,7 @@
39
             primitives.planecopy_sp_shl(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
40
         }
41
 
42
-        if (pic.colorSpace != X265_CSP_I400)
43
+        if (param.internalCsp != X265_CSP_I400)
44
         {
45
             pixel *uPixel = m_picOrg[1];
46
             pixel *vPixel = m_picOrg[2];
47
@@ -279,12 +284,25 @@
48
         }
49
     }
50
 
51
-    /* extend the right edge if width was not multiple of the minimum CU size */
52
-    uint64_t sumLuma;
53
     pixel *Y = m_picOrg[0];
54
-    m_maxLumaLevel = primitives.planeClipAndMax(Y, m_stride, width, height, &sumLuma, (pixel)param.minLuma, (pixel)param.maxLuma);
55
-    m_avgLumaLevel = (double)(sumLuma) / (m_picHeight * m_picWidth);
56
+    pixel *U = m_picOrg[1];
57
+    pixel *V = m_picOrg[2];
58
 
59
+#if HIGH_BIT_DEPTH
60
+    bool calcHDRParams = !!param.minLuma || (param.maxLuma != PIXEL_MAX);
61
+    /* Apply min/max luma bounds for HDR pixel manipulations */
62
+    if (calcHDRParams)
63
+    {
64
+        X265_CHECK(pic.bitDepth == 10, "HDR stats can be applied/calculated only for 10bpp content");
65
+        uint64_t sumLuma;
66
+        m_maxLumaLevel = primitives.planeClipAndMax(Y, m_stride, width, height, &sumLuma, (pixel)param.minLuma, (pixel)param.maxLuma);
67
+        m_avgLumaLevel = (double) sumLuma / (m_picHeight * m_picWidth);
68
+    }
69
+#else
70
+    (void) param;
71
+#endif
72
+
73
+    /* extend the right edge if width was not multiple of the minimum CU size */
74
     for (int r = 0; r < height; r++)
75
     {
76
         for (int x = 0; x < padx; x++)
77
@@ -297,11 +315,8 @@
78
     for (int i = 1; i <= pady; i++)
79
         memcpy(Y + i * m_stride, Y, (width + padx) * sizeof(pixel));
80
 
81
-    if (pic.colorSpace != X265_CSP_I400)
82
+    if (param.internalCsp != X265_CSP_I400)
83
     {
84
-        pixel *U = m_picOrg[1];
85
-        pixel *V = m_picOrg[2];
86
-
87
         for (int r = 0; r < height >> m_vChromaShift; r++)
88
         {
89
             for (int x = 0; x < padx >> m_hChromaShift; x++)
90
x265_1.9.tar.gz/source/common/picyuv.h -> x265_2.0.tar.gz/source/common/picyuv.h Changed
10
 
1
@@ -60,7 +60,7 @@
2
     uint32_t m_chromaMarginX;
3
     uint32_t m_chromaMarginY;
4
 
5
-    uint16_t m_maxLumaLevel;
6
+    pixel m_maxLumaLevel;
7
     double   m_avgLumaLevel;
8
 
9
     PicYuv();
10
x265_1.9.tar.gz/source/common/pixel.cpp -> x265_2.0.tar.gz/source/common/pixel.cpp Changed
72
 
1
@@ -607,7 +607,6 @@
2
  * s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784.
3
  * Maximum value for 9-bit is: ss*64 = (2^9-1)^2*16*4*64 = 1069551616, which will not overflow. */
4
 
5
-#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
6
 #if HIGH_BIT_DEPTH
7
     X265_CHECK((X265_DEPTH == 10) || (X265_DEPTH == 12), "ssim invalid depth\n");
8
 #define type float
9
@@ -873,7 +872,25 @@
10
     }
11
 }
12
 
13
-static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix)
14
+/* Conversion between double and Q8.8 fixed point (big-endian) for storage */
15
+static void cuTreeFix8Pack(uint16_t *dst, double *src, int count)
16
+{
17
+    for (int i = 0; i < count; i++)
18
+        dst[i] = (uint16_t)(src[i] * 256.0);
19
+}
20
+
21
+static void cuTreeFix8Unpack(double *dst, uint16_t *src, int count)
22
+{
23
+    for (int i = 0; i < count; i++)
24
+    {
25
+        int16_t qpFix8 = src[i];
26
+        dst[i] = (double)(qpFix8) / 256.0;
27
+    }
28
+}
29
+
30
+#if HIGH_BIT_DEPTH
31
+static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, 
32
+                               const pixel minPix, const pixel maxPix)
33
 {
34
     pixel maxLumaLevel = 0;
35
     uint64_t sumLuma = 0;
36
@@ -882,21 +899,18 @@
37
     {
38
         for (int c = 0; c < width; c++)
39
         {
40
-            /* Clip luma of source picture to max and min values before extending edges of picYuv */
41
+            /* Clip luma of source picture to max and min*/
42
             src[c] = x265_clip3((pixel)minPix, (pixel)maxPix, src[c]);
43
-
44
-            /* Determine maximum and average luma level in a picture */
45
             maxLumaLevel = X265_MAX(src[c], maxLumaLevel);
46
             sumLuma += src[c];
47
         }
48
-
49
         src += stride;
50
     }
51
-
52
     *outsum = sumLuma;
53
     return maxLumaLevel;
54
 }
55
 
56
+#endif
57
 }  // end anonymous namespace
58
 
59
 namespace X265_NS {
60
@@ -1181,7 +1195,11 @@
61
     p.planecopy_cp = planecopy_cp_c;
62
     p.planecopy_sp = planecopy_sp_c;
63
     p.planecopy_sp_shl = planecopy_sp_shl_c;
64
+#if HIGH_BIT_DEPTH
65
     p.planeClipAndMax = planeClipAndMax_c;
66
+#endif
67
     p.propagateCost = estimateCUPropagateCost;
68
+    p.fix8Unpack = cuTreeFix8Unpack;
69
+    p.fix8Pack = cuTreeFix8Pack;
70
 }
71
 }
72
x265_1.9.tar.gz/source/common/predict.cpp -> x265_2.0.tar.gz/source/common/predict.cpp Changed
201
 
1
@@ -57,12 +57,10 @@
2
 
3
 Predict::Predict()
4
 {
5
-    m_immedVals = NULL;
6
 }
7
 
8
 Predict::~Predict()
9
 {
10
-    X265_FREE(m_immedVals);
11
     m_predShortYuv[0].destroy();
12
     m_predShortYuv[1].destroy();
13
 }
14
@@ -72,12 +70,8 @@
15
     m_csp = csp;
16
     m_hChromaShift = CHROMA_H_SHIFT(csp);
17
     m_vChromaShift = CHROMA_V_SHIFT(csp);
18
-    CHECKED_MALLOC(m_immedVals, int16_t, 64 * (64 + NTAPS_LUMA - 1));
19
 
20
     return m_predShortYuv[0].create(MAX_CU_SIZE, csp) && m_predShortYuv[1].create(MAX_CU_SIZE, csp);
21
-
22
-fail:
23
-    return false;
24
 }
25
 
26
 void Predict::motionCompensation(const CUData& cu, const PredictionUnit& pu, Yuv& predYuv, bool bLuma, bool bChroma)
27
@@ -258,8 +252,8 @@
28
     int partEnum = partitionFromSizes(pu.width, pu.height);
29
     const pixel* src = refPic.getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + srcOffset;
30
 
31
-    int xFrac = mv.x & 0x3;
32
-    int yFrac = mv.y & 0x3;
33
+    int xFrac = mv.x & 3;
34
+    int yFrac = mv.y & 3;
35
 
36
     if (!(yFrac | xFrac))
37
         primitives.pu[partEnum].copy_pp(dst, dstStride, src, srcStride);
38
@@ -280,14 +274,14 @@
39
     intptr_t srcOffset = (mv.x >> 2) + (mv.y >> 2) * srcStride;
40
     const pixel* src = refPic.getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + srcOffset;
41
 
42
-    int xFrac = mv.x & 0x3;
43
-    int yFrac = mv.y & 0x3;
44
-
45
     int partEnum = partitionFromSizes(pu.width, pu.height);
46
 
47
     X265_CHECK((pu.width % 4) + (pu.height % 4) == 0, "width or height not divisible by 4\n");
48
     X265_CHECK(dstStride == MAX_CU_SIZE, "stride expected to be max cu size\n");
49
 
50
+    int xFrac = mv.x & 3;
51
+    int yFrac = mv.y & 3;
52
+
53
     if (!(yFrac | xFrac))
54
         primitives.pu[partEnum].convert_p2s(src, srcStride, dst, dstStride);
55
     else if (!yFrac)
56
@@ -296,11 +290,12 @@
57
         primitives.pu[partEnum].luma_vps(src, srcStride, dst, dstStride, yFrac);
58
     else
59
     {
60
-        int tmpStride = pu.width;
61
-        int filterSize = NTAPS_LUMA;
62
-        int halfFilterSize = (filterSize >> 1);
63
-        primitives.pu[partEnum].luma_hps(src, srcStride, m_immedVals, tmpStride, xFrac, 1);
64
-        primitives.pu[partEnum].luma_vss(m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, yFrac);
65
+        ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);
66
+        int immedStride = pu.width;
67
+        int halfFilterSize = NTAPS_LUMA >> 1;
68
+
69
+        primitives.pu[partEnum].luma_hps(src, srcStride, immed, immedStride, xFrac, 1);
70
+        primitives.pu[partEnum].luma_vss(immed + (halfFilterSize - 1) * immedStride, immedStride, dst, dstStride, yFrac);
71
     }
72
 }
73
 
74
@@ -309,10 +304,10 @@
75
     intptr_t dstStride = dstYuv.m_csize;
76
     intptr_t refStride = refPic.m_strideC;
77
 
78
-    int shiftHor = (2 + m_hChromaShift);
79
-    int shiftVer = (2 + m_vChromaShift);
80
+    int mvx = mv.x << (1 - m_hChromaShift);
81
+    int mvy = mv.y << (1 - m_vChromaShift);
82
 
83
-    intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride;
84
+    intptr_t refOffset = (mvx >> 3) + (mvy >> 3) * refStride;
85
 
86
     const pixel* refCb = refPic.getCbAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset;
87
     const pixel* refCr = refPic.getCrAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset;
88
@@ -320,11 +315,11 @@
89
     pixel* dstCb = dstYuv.getCbAddr(pu.puAbsPartIdx);
90
     pixel* dstCr = dstYuv.getCrAddr(pu.puAbsPartIdx);
91
 
92
-    int xFrac = mv.x & ((1 << shiftHor) - 1);
93
-    int yFrac = mv.y & ((1 << shiftVer) - 1);
94
-
95
     int partEnum = partitionFromSizes(pu.width, pu.height);
96
-    
97
+
98
+    int xFrac = mvx & 7;
99
+    int yFrac = mvy & 7;
100
+
101
     if (!(yFrac | xFrac))
102
     {
103
         primitives.chroma[m_csp].pu[partEnum].copy_pp(dstCb, dstStride, refCb, refStride);
104
@@ -332,37 +327,36 @@
105
     }
106
     else if (!yFrac)
107
     {
108
-        primitives.chroma[m_csp].pu[partEnum].filter_hpp(refCb, refStride, dstCb, dstStride, xFrac << (1 - m_hChromaShift));
109
-        primitives.chroma[m_csp].pu[partEnum].filter_hpp(refCr, refStride, dstCr, dstStride, xFrac << (1 - m_hChromaShift));
110
+        primitives.chroma[m_csp].pu[partEnum].filter_hpp(refCb, refStride, dstCb, dstStride, xFrac);
111
+        primitives.chroma[m_csp].pu[partEnum].filter_hpp(refCr, refStride, dstCr, dstStride, xFrac);
112
     }
113
     else if (!xFrac)
114
     {
115
-        primitives.chroma[m_csp].pu[partEnum].filter_vpp(refCb, refStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
116
-        primitives.chroma[m_csp].pu[partEnum].filter_vpp(refCr, refStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
117
+        primitives.chroma[m_csp].pu[partEnum].filter_vpp(refCb, refStride, dstCb, dstStride, yFrac);
118
+        primitives.chroma[m_csp].pu[partEnum].filter_vpp(refCr, refStride, dstCr, dstStride, yFrac);
119
     }
120
     else
121
     {
122
-        int extStride = pu.width >> m_hChromaShift;
123
-        int filterSize = NTAPS_CHROMA;
124
-        int halfFilterSize = (filterSize >> 1);
125
-
126
-        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
127
-        primitives.chroma[m_csp].pu[partEnum].filter_vsp(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
128
-
129
-        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
130
-        primitives.chroma[m_csp].pu[partEnum].filter_vsp(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
131
+        ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_CHROMA - 1)]);
132
+        int immedStride = pu.width >> m_hChromaShift;
133
+        int halfFilterSize = NTAPS_CHROMA >> 1;
134
+
135
+        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, immed, immedStride, xFrac, 1);
136
+        primitives.chroma[m_csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * immedStride, immedStride, dstCb, dstStride, yFrac);
137
+        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, immed, immedStride, xFrac, 1);
138
+        primitives.chroma[m_csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * immedStride, immedStride, dstCr, dstStride, yFrac);
139
     }
140
 }
141
 
142
 void Predict::predInterChromaShort(const PredictionUnit& pu, ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const
143
 {
144
-    intptr_t refStride = refPic.m_strideC;
145
     intptr_t dstStride = dstSYuv.m_csize;
146
+    intptr_t refStride = refPic.m_strideC;
147
 
148
-    int shiftHor = (2 + m_hChromaShift);
149
-    int shiftVer = (2 + m_vChromaShift);
150
+    int mvx = mv.x << (1 - m_hChromaShift);
151
+    int mvy = mv.y << (1 - m_vChromaShift);
152
 
153
-    intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride;
154
+    intptr_t refOffset = (mvx >> 3) + (mvy >> 3) * refStride;
155
 
156
     const pixel* refCb = refPic.getCbAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset;
157
     const pixel* refCr = refPic.getCrAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset;
158
@@ -370,15 +364,15 @@
159
     int16_t* dstCb = dstSYuv.getCbAddr(pu.puAbsPartIdx);
160
     int16_t* dstCr = dstSYuv.getCrAddr(pu.puAbsPartIdx);
161
 
162
-    int xFrac = mv.x & ((1 << shiftHor) - 1);
163
-    int yFrac = mv.y & ((1 << shiftVer) - 1);
164
-
165
     int partEnum = partitionFromSizes(pu.width, pu.height);
166
     
167
     uint32_t cxWidth  = pu.width >> m_hChromaShift;
168
 
169
     X265_CHECK(((cxWidth | (pu.height >> m_vChromaShift)) % 2) == 0, "chroma block size expected to be multiple of 2\n");
170
 
171
+    int xFrac = mvx & 7;
172
+    int yFrac = mvy & 7;
173
+
174
     if (!(yFrac | xFrac))
175
     {
176
         primitives.chroma[m_csp].pu[partEnum].p2s(refCb, refStride, dstCb, dstStride);
177
@@ -386,23 +380,24 @@
178
     }
179
     else if (!yFrac)
180
     {
181
-        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, dstCb, dstStride, xFrac << (1 - m_hChromaShift), 0);
182
-        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, dstCr, dstStride, xFrac << (1 - m_hChromaShift), 0);
183
+        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, dstCb, dstStride, xFrac, 0);
184
+        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, dstCr, dstStride, xFrac, 0);
185
     }
186
     else if (!xFrac)
187
     {
188
-        primitives.chroma[m_csp].pu[partEnum].filter_vps(refCb, refStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
189
-        primitives.chroma[m_csp].pu[partEnum].filter_vps(refCr, refStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
190
+        primitives.chroma[m_csp].pu[partEnum].filter_vps(refCb, refStride, dstCb, dstStride, yFrac);
191
+        primitives.chroma[m_csp].pu[partEnum].filter_vps(refCr, refStride, dstCr, dstStride, yFrac);
192
     }
193
     else
194
     {
195
-        int extStride = cxWidth;
196
-        int filterSize = NTAPS_CHROMA;
197
-        int halfFilterSize = (filterSize >> 1);
198
-        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
199
-        primitives.chroma[m_csp].pu[partEnum].filter_vss(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
200
-        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
201
x265_1.9.tar.gz/source/common/predict.h -> x265_2.0.tar.gz/source/common/predict.h Changed
9
 
1
@@ -73,7 +73,6 @@
2
     };
3
 
4
     ShortYuv  m_predShortYuv[2]; /* temporary storage for weighted prediction */
5
-    int16_t*  m_immedVals;
6
 
7
     // Unfiltered/filtered neighbours of the current partition.
8
     pixel     intraNeighbourBuf[2][258];
9
x265_1.9.tar.gz/source/common/primitives.cpp -> x265_2.0.tar.gz/source/common/primitives.cpp Changed
31
 
1
@@ -238,7 +238,9 @@
2
             primitives.cu[i].intra_pred_allangs = NULL;
3
 
4
 #if ENABLE_ASSEMBLY
5
+#if X265_ARCH_X86
6
         setupInstrinsicPrimitives(primitives, param->cpuid);
7
+#endif
8
         setupAssemblyPrimitives(primitives, param->cpuid);
9
 #endif
10
 
11
@@ -249,7 +251,7 @@
12
 }
13
 }
14
 
15
-#if ENABLE_ASSEMBLY
16
+#if ENABLE_ASSEMBLY && X265_ARCH_X86
17
 /* these functions are implemented in assembly. When assembly is not being
18
  * compiled, they are unnecessary and can be NOPs */
19
 #else
20
@@ -258,7 +260,10 @@
21
 void PFX(cpu_emms)(void) {}
22
 void PFX(cpu_cpuid)(uint32_t, uint32_t *eax, uint32_t *, uint32_t *, uint32_t *) { *eax = 0; }
23
 void PFX(cpu_xgetbv)(uint32_t, uint32_t *, uint32_t *) {}
24
+
25
+#if X265_ARCH_ARM == 0
26
 void PFX(cpu_neon_test)(void) {}
27
 int PFX(cpu_fast_neon_mrc_test)(void) { return 0; }
28
+#endif // X265_ARCH_ARM
29
 }
30
 #endif
31
x265_1.9.tar.gz/source/common/primitives.h -> x265_2.0.tar.gz/source/common/primitives.h Changed
36
 
1
@@ -189,6 +189,9 @@
2
 
3
 typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
4
 
5
+typedef void (*cutree_fix8_unpack)(double *dst, uint16_t *src, int count);
6
+typedef void (*cutree_fix8_pack)(uint16_t *dst, double *src, int count);
7
+
8
 typedef int (*scanPosLast_t)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
9
 typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
10
 
11
@@ -197,6 +200,7 @@
12
 typedef uint32_t (*costC1C2Flag_t)(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset);
13
 
14
 typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
15
+typedef void (*pelFilterChroma_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ);
16
 
17
 /* Function pointers to optimized encoder primitives. Each pointer can reference
18
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
19
@@ -313,6 +317,8 @@
20
 
21
     downscale_t           frameInitLowres;
22
     cutree_propagate_cost propagateCost;
23
+    cutree_fix8_unpack    fix8Unpack;
24
+    cutree_fix8_pack      fix8Pack;
25
 
26
     extendCURowBorder_t   extendRowBorder;
27
     planecopy_cp_t        planecopy_cp;
28
@@ -332,6 +338,7 @@
29
     costC1C2Flag_t        costC1C2Flag;
30
 
31
     pelFilterLumaStrong_t pelFilterLumaStrong[2]; // EDGE_VER = 0, EDGE_HOR = 1
32
+    pelFilterChroma_t     pelFilterChroma[2];     // EDGE_VER = 0, EDGE_HOR = 1
33
 
34
     /* There is one set of chroma primitives per color space. An encoder will
35
      * have just a single color space and thus it will only ever use one entry
36
x265_1.9.tar.gz/source/common/quant.cpp -> x265_2.0.tar.gz/source/common/quant.cpp Changed
21
 
1
@@ -188,10 +188,9 @@
2
     m_nr           = NULL;
3
 }
4
 
5
-bool Quant::init(int rdoqLevel, double psyScale, const ScalingList& scalingList, Entropy& entropy)
6
+bool Quant::init(double psyScale, const ScalingList& scalingList, Entropy& entropy)
7
 {
8
     m_entropyCoder = &entropy;
9
-    m_rdoqLevel    = rdoqLevel;
10
     m_psyRdoqScale = (int32_t)(psyScale * 256.0);
11
     X265_CHECK((psyScale * 256.0) < (double)MAX_INT, "psyScale value too large\n");
12
     m_scalingList  = &scalingList;
13
@@ -223,6 +222,7 @@
14
 {
15
     m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL;
16
     m_qpParam[TEXT_LUMA].setQpParam(qp + QP_BD_OFFSET);
17
+    m_rdoqLevel = ctu.m_encData->m_param->rdoqLevel;
18
     if (ctu.m_chromaFormat != X265_CSP_I400)
19
     {
20
         setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
21
x265_1.9.tar.gz/source/common/quant.h -> x265_2.0.tar.gz/source/common/quant.h Changed
10
 
1
@@ -100,7 +100,7 @@
2
     ~Quant();
3
 
4
     /* one-time setup */
5
-    bool init(int rdoqLevel, double psyScale, const ScalingList& scalingList, Entropy& entropy);
6
+    bool init(double psyScale, const ScalingList& scalingList, Entropy& entropy);
7
     bool allocNoiseReduction(const x265_param& param);
8
 
9
     /* CU setup */
10
x265_1.9.tar.gz/source/common/scalinglist.cpp -> x265_2.0.tar.gz/source/common/scalinglist.cpp Changed
60
 
1
@@ -57,7 +57,11 @@
2
     },
3
     {
4
         "INTRA32X32_LUMA",
5
+        "",
6
+        "",
7
         "INTER32X32_LUMA",
8
+        "",
9
+        "",
10
     },
11
 };
12
 const char MatrixType_DC[4][12][22] =
13
@@ -76,7 +80,11 @@
14
     },
15
     {
16
         "INTRA32X32_LUMA_DC",
17
+        "",
18
+        "",
19
         "INTER32X32_LUMA_DC",
20
+        "",
21
+        "",
22
     },
23
 };
24
 
25
@@ -246,15 +254,15 @@
26
 
27
     char line[1024];
28
     int32_t *src = NULL;
29
+    fseek(fp, 0, 0);
30
 
31
     for (int sizeIdc = 0; sizeIdc < NUM_SIZES; sizeIdc++)
32
     {
33
         int size = X265_MIN(MAX_MATRIX_COEF_NUM, s_numCoefPerSize[sizeIdc]);
34
-        for (int listIdc = 0; listIdc < NUM_LISTS; listIdc++)
35
+        for (int listIdc = 0; listIdc < NUM_LISTS;  listIdc += (sizeIdc == 3) ? 3 : 1)
36
         {
37
             src = m_scalingListCoef[sizeIdc][listIdc];
38
 
39
-            fseek(fp, 0, 0);
40
             do
41
             {
42
                 char *ret = fgets(line, 1024, fp);
43
@@ -282,7 +290,6 @@
44
 
45
             if (sizeIdc > BLOCK_8x8)
46
             {
47
-                fseek(fp, 0, 0);
48
                 do
49
                 {
50
                     char *ret = fgets(line, 1024, fp);
51
@@ -310,7 +317,7 @@
52
     fclose(fp);
53
 
54
     m_bEnabled = true;
55
-    m_bDataPresent = !checkDefaultScalingList();
56
+    m_bDataPresent = true;
57
 
58
     return false;
59
 }
60
x265_1.9.tar.gz/source/common/shortyuv.cpp -> x265_2.0.tar.gz/source/common/shortyuv.cpp Changed
15
 
1
@@ -78,11 +78,11 @@
2
     memset(m_buf[2], 0, (m_csize * m_csize) * sizeof(int16_t));
3
 }
4
 
5
-void ShortYuv::subtract(const Yuv& srcYuv0, const Yuv& srcYuv1, uint32_t log2Size)
6
+void ShortYuv::subtract(const Yuv& srcYuv0, const Yuv& srcYuv1, uint32_t log2Size, int picCsp)
7
 {
8
     const int sizeIdx = log2Size - 2;
9
     primitives.cu[sizeIdx].sub_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
10
-    if (m_csp != X265_CSP_I400)
11
+    if (m_csp != X265_CSP_I400 && picCsp != X265_CSP_I400)
12
     {
13
         primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
14
         primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
15
x265_1.9.tar.gz/source/common/shortyuv.h -> x265_2.0.tar.gz/source/common/shortyuv.h Changed
10
 
1
@@ -64,7 +64,7 @@
2
     const int16_t* getCrAddr(uint32_t absPartIdx) const                         { return m_buf[2] + getChromaAddrOffset(absPartIdx); }
3
     const int16_t* getChromaAddr(uint32_t chromaId, uint32_t partUnitIdx) const { return m_buf[chromaId] + getChromaAddrOffset(partUnitIdx); }
4
 
5
-    void subtract(const Yuv& srcYuv0, const Yuv& srcYuv1, uint32_t log2Size);
6
+    void subtract(const Yuv& srcYuv0, const Yuv& srcYuv1, uint32_t log2Size, int picCsp);
7
 
8
     void copyPartToPartLuma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const;
9
     void copyPartToPartChroma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const;
10
x265_1.9.tar.gz/source/common/threadpool.cpp -> x265_2.0.tar.gz/source/common/threadpool.cpp Changed
201
 
1
@@ -28,6 +28,10 @@
2
 
3
 #include <new>
4
 
5
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
6
+#include <winnt.h>
7
+#endif
8
+
9
 #if X86_64
10
 
11
 #ifdef __GNUC__
12
@@ -64,6 +68,21 @@
13
 # define strcasecmp _stricmp
14
 #endif
15
 
16
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
17
+const uint64_t m1 = 0x5555555555555555; //binary: 0101...
18
+const uint64_t m2 = 0x3333333333333333; //binary: 00110011..
19
+const uint64_t m3 = 0x0f0f0f0f0f0f0f0f; //binary:  4 zeros,  4 ones ...
20
+const uint64_t h01 = 0x0101010101010101; //the sum of 256 to the power of 0,1,2,3...
21
+
22
+static int popCount(uint64_t x)
23
+{
24
+    x -= (x >> 1) & m1;
25
+    x = (x & m2) + ((x >> 2) & m2);
26
+    x = (x + (x >> 4)) & m3;
27
+    return (x * h01) >> 56;
28
+}
29
+#endif
30
+
31
 namespace X265_NS {
32
 // x265 private namespace
33
 
34
@@ -238,7 +257,6 @@
35
     memset(nodeMaskPerPool, 0, sizeof(nodeMaskPerPool));
36
 
37
     int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
38
-    int cpuCount = getCpuCount();
39
     bool bNumaSupport = false;
40
 
41
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
42
@@ -248,26 +266,54 @@
43
 #endif
44
 
45
 
46
-    for (int i = 0; i < cpuCount; i++)
47
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
48
+    PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY;
49
+    for (int i = 0; i < numNumaNodes; i++)
50
     {
51
-#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
52
-        UCHAR node;
53
-        if (GetNumaProcessorNode((UCHAR)i, &node))
54
-            cpusPerNode[X265_MIN(node, (UCHAR)MAX_NODE_NUM)]++;
55
-        else
56
+        GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer);
57
+        cpusPerNode[i] = popCount(groupAffinityPointer->Mask);
58
+    }
59
+    delete groupAffinityPointer;
60
 #elif HAVE_LIBNUMA
61
-        if (bNumaSupport >= 0)
62
-            cpusPerNode[X265_MIN(numa_node_of_cpu(i), MAX_NODE_NUM)]++;
63
-        else
64
-#endif
65
-            cpusPerNode[0]++;
66
+    if (bNumaSupport)
67
+    {
68
+        struct bitmask* bitMask = numa_allocate_cpumask();
69
+        for (int i = 0; i < numNumaNodes; i++)
70
+        {
71
+            int ret = numa_node_to_cpus(i, bitMask);
72
+            if (!ret)
73
+                cpusPerNode[i] = numa_bitmask_weight(bitMask);
74
+            else
75
+                x265_log(p, X265_LOG_ERROR, "Failed to genrate CPU mask\n");
76
+        }
77
+        numa_free_cpumask(bitMask);
78
     }
79
+#else // NUMA not supported
80
+    cpusPerNode[0] = getCpuCount();
81
+#endif
82
 
83
     if (bNumaSupport && p->logLevel >= X265_LOG_DEBUG)
84
-        for (int i = 0; i < numNumaNodes; i++)
85
-            x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]);
86
-
87
-    /* limit threads based on param->numaPools */
88
+    for (int i = 0; i < numNumaNodes; i++)
89
+        x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]);
90
+    /* limit threads based on param->numaPools
91
+     * For windows because threads can't be allocated to live across sockets
92
+     * changing the default behavior to be per-socket pools -- FIXME */
93
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
94
+     if (!p->numaPools)
95
+     {
96
+         char poolString[50] = "";
97
+         for (int i = 0; i < numNumaNodes; i++)
98
+         {
99
+             char nextCount[10] = "";
100
+             if (i)
101
+                 sprintf(nextCount, ",%d", cpusPerNode[i]);
102
+             else
103
+                   sprintf(nextCount, "%d", cpusPerNode[i]);
104
+             strcat(poolString, nextCount);
105
+         }
106
+         x265_param_parse(p, "pools", poolString);
107
+     }
108
+#endif
109
     if (p->numaPools && *p->numaPools)
110
     {
111
         const char *nodeStr = p->numaPools;
112
@@ -280,7 +326,7 @@
113
             }
114
             else if (*nodeStr == '-')
115
                 threadsPerPool[i] = 0;
116
-           else if (*nodeStr == '*' || !strcasecmp(nodeStr, "NULL"))
117
+            else if (*nodeStr == '*' || !strcasecmp(nodeStr, "NULL"))
118
             {
119
                 for (int j = i; j < numNumaNodes; j++)
120
                 {
121
@@ -297,8 +343,16 @@
122
             else
123
             {
124
                 int count = atoi(nodeStr);
125
-                threadsPerPool[i] = X265_MIN(count, cpusPerNode[i]);
126
-                nodeMaskPerPool[i] = ((uint64_t)1 << i);
127
+                if (i > 0 || strchr(nodeStr, ','))   // it is comma -> old logic
128
+                {
129
+                    threadsPerPool[i] = X265_MIN(count, cpusPerNode[i]);
130
+                    nodeMaskPerPool[i] = ((uint64_t)1 << i);
131
+                }
132
+                else                                 // new logic: exactly 'count' threads on all NUMAs
133
+                {
134
+                    threadsPerPool[numNumaNodes] = X265_MIN(count, numNumaNodes * MAX_POOL_THREADS);
135
+                    nodeMaskPerPool[numNumaNodes] = ((uint64_t)-1 >> (64 - numNumaNodes));
136
+                }
137
             }
138
 
139
             /* consume current node string, comma, and white-space */
140
@@ -389,16 +443,15 @@
141
     X265_CHECK(numThreads <= MAX_POOL_THREADS, "a single thread pool cannot have more than MAX_POOL_THREADS threads\n");
142
 
143
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
144
-    m_winCpuMask = 0x0;
145
-    GROUP_AFFINITY groupAffinity;
146
+    memset(&m_groupAffinity, 0, sizeof(GROUP_AFFINITY));
147
     for (int i = 0; i < getNumaNodeCount(); i++)
148
     {
149
         int numaNode = ((nodeMask >> i) & 0x1U) ? i : -1;
150
         if (numaNode != -1)
151
-            if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
152
-                m_winCpuMask |= groupAffinity.Mask;
153
+        if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &m_groupAffinity))
154
+            break;
155
     }
156
-    m_numaMask = &m_winCpuMask;
157
+    m_numaMask = &m_groupAffinity.Mask;
158
 #elif HAVE_LIBNUMA
159
     if (numa_available() >= 0)
160
     {
161
@@ -480,11 +533,16 @@
162
     setThreadNodeAffinity(m_numaMask);
163
 }
164
 
165
-/* static */
166
 void ThreadPool::setThreadNodeAffinity(void *numaMask)
167
 {
168
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
169
-    if (SetThreadAffinityMask(GetCurrentThread(), *((DWORD_PTR*)numaMask)))
170
+    UNREFERENCED_PARAMETER(numaMask);
171
+    GROUP_AFFINITY groupAffinity;
172
+    memset(&groupAffinity, 0, sizeof(GROUP_AFFINITY));
173
+    groupAffinity.Group = m_groupAffinity.Group;
174
+    groupAffinity.Mask = m_groupAffinity.Mask;
175
+    const PGROUP_AFFINITY affinityPointer = &groupAffinity;
176
+    if (SetThreadGroupAffinity(GetCurrentThread(), affinityPointer, NULL))
177
         return;
178
     else
179
         x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity for NUMA node mask\n");
180
@@ -524,10 +582,25 @@
181
 /* static */
182
 int ThreadPool::getCpuCount()
183
 {
184
-#if _WIN32
185
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
186
+    enum { MAX_NODE_NUM = 127 };
187
+    int cpus = 0;
188
+    int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
189
+    GROUP_AFFINITY groupAffinity;
190
+    for (int i = 0; i < numNumaNodes; i++)
191
+    {
192
+        GetNumaNodeProcessorMaskEx((UCHAR)i, &groupAffinity);
193
+        cpus += popCount(groupAffinity.Mask);
194
+    }
195
+    return cpus;
196
+#elif _WIN32
197
     SYSTEM_INFO sysinfo;
198
     GetSystemInfo(&sysinfo);
199
     return sysinfo.dwNumberOfProcessors;
200
+#elif __unix__ && X265_ARCH_ARM
201
x265_1.9.tar.gz/source/common/threadpool.h -> x265_2.0.tar.gz/source/common/threadpool.h Changed
26
 
1
@@ -85,7 +85,7 @@
2
     int           m_numWorkers;
3
     void*         m_numaMask; // node mask in linux, cpu mask in windows
4
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
5
-    DWORD_PTR     m_winCpuMask;
6
+    GROUP_AFFINITY m_groupAffinity;
7
 #endif
8
     bool          m_isActive;
9
 
10
@@ -99,6 +99,7 @@
11
     bool start();
12
     void stopWorkers();
13
     void setCurrentThreadAffinity();
14
+    void setThreadNodeAffinity(void *numaMask);
15
     int  tryAcquireSleepingThread(sleepbitmap_t firstTryBitmap, sleepbitmap_t secondTryBitmap);
16
     int  tryBondPeers(int maxPeers, sleepbitmap_t peerBitmap, BondedTaskGroup& master);
17
 
18
@@ -106,7 +107,6 @@
19
 
20
     static int  getCpuCount();
21
     static int  getNumaNodeCount();
22
-    static void setThreadNodeAffinity(void *numaMask);
23
 };
24
 
25
 /* Any worker thread may enlist the help of idle worker threads from the same
26
x265_1.9.tar.gz/source/common/x86/asm-primitives.cpp -> x265_2.0.tar.gz/source/common/x86/asm-primitives.cpp Changed
119
 
1
@@ -861,12 +861,12 @@
2
 template<int size>
3
 void interp_8tap_hv_pp_cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
4
 {
5
-    ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA)]);
6
-    const int filterSize = NTAPS_LUMA;
7
-    const int halfFilterSize = filterSize >> 1;
8
+    ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);
9
+    const int halfFilterSize = NTAPS_LUMA >> 1;
10
+    const int immedStride = MAX_CU_SIZE;
11
 
12
-    primitives.pu[size].luma_hps(src, srcStride, immed, MAX_CU_SIZE, idxX, 1);
13
-    primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) * MAX_CU_SIZE, MAX_CU_SIZE, dst, dstStride, idxY);
14
+    primitives.pu[size].luma_hps(src, srcStride, immed, immedStride, idxX, 1);
15
+    primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) * immedStride, immedStride, dst, dstStride, idxY);
16
 }
17
 
18
 #if HIGH_BIT_DEPTH
19
@@ -1098,9 +1098,16 @@
20
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s = PFX(filterPixelToShort_8x2_ssse3);
21
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s = PFX(filterPixelToShort_8x6_ssse3);
22
         p.findPosFirstLast = PFX(findPosFirstLast_ssse3);
23
+        p.fix8Unpack = PFX(cutree_fix8_unpack_ssse3);
24
+        p.fix8Pack = PFX(cutree_fix8_pack_ssse3);
25
     }
26
     if (cpuMask & X265_CPU_SSE4)
27
     {
28
+        p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
29
+        p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4);
30
+        p.pelFilterChroma[0] = PFX(pelFilterChroma_V_sse4);
31
+        p.pelFilterChroma[1] = PFX(pelFilterChroma_H_sse4);
32
+
33
         p.saoCuOrgE0 = PFX(saoCuOrgE0_sse4);
34
         p.saoCuOrgE1 = PFX(saoCuOrgE1_sse4);
35
         p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_sse4);
36
@@ -1166,6 +1173,12 @@
37
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s = PFX(filterPixelToShort_2x16_sse4);
38
         p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
39
         p.costCoeffRemain = PFX(costCoeffRemain_sse4);
40
+#if X86_64
41
+        p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
42
+        p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4);
43
+        p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4);
44
+        p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
45
+#endif
46
     }
47
     if (cpuMask & X265_CPU_AVX)
48
     {
49
@@ -2141,11 +2154,23 @@
50
 
51
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
52
         p.propagateCost = PFX(mbtree_propagate_cost_avx2);
53
+        p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
54
+        p.fix8Pack = PFX(cutree_fix8_pack_avx2);
55
+
56
+        /* TODO: This kernel needs to be modified to work with HIGH_BIT_DEPTH only 
57
+        p.planeClipAndMax = PFX(planeClipAndMax_avx2); */
58
 
59
         // TODO: depends on hps and vsp
60
         ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);                        // calling luma_hvpp for all sizes
61
         p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;             // ALL_LUMA_PU_T has declared all sizes except 4x4, hence calling luma_hvpp[4x4] 
62
 
63
+#if X265_DEPTH == 10
64
+        p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx2);
65
+        p.cu[LUMA_8x8].sa8d = PFX(pixel_sa8d_8x8_avx2);
66
+        p.cu[LUMA_16x16].sa8d = PFX(pixel_sa8d_16x16_avx2);
67
+        p.cu[LUMA_32x32].sa8d = PFX(pixel_sa8d_32x32_avx2);
68
+#endif
69
+
70
         if (cpuMask & X265_CPU_BMI2)
71
         {
72
             p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
73
@@ -2434,6 +2459,8 @@
74
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_ssse3);
75
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_ssse3);
76
         p.findPosFirstLast = PFX(findPosFirstLast_ssse3);
77
+        p.fix8Unpack = PFX(cutree_fix8_unpack_ssse3);
78
+        p.fix8Pack = PFX(cutree_fix8_pack_ssse3);
79
     }
80
     if (cpuMask & X265_CPU_SSE4)
81
     {
82
@@ -2529,8 +2556,10 @@
83
 #if X86_64
84
         p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
85
         p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4);
86
+        p.pelFilterChroma[0] = PFX(pelFilterChroma_V_sse4);
87
+        p.pelFilterChroma[1] = PFX(pelFilterChroma_H_sse4);
88
 
89
-        p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
90
+//        p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
91
         p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
92
         p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4);
93
         p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4);
94
@@ -2932,6 +2961,7 @@
95
         p.cu[BLOCK_8x8].intra_pred[14] = PFX(intra_pred_ang8_14_avx2);
96
         p.cu[BLOCK_8x8].intra_pred[15] = PFX(intra_pred_ang8_15_avx2);
97
         p.cu[BLOCK_8x8].intra_pred[16] = PFX(intra_pred_ang8_16_avx2);
98
+        p.cu[BLOCK_8x8].intra_pred[17] = PFX(intra_pred_ang8_17_avx2);
99
         p.cu[BLOCK_8x8].intra_pred[20] = PFX(intra_pred_ang8_20_avx2);
100
         p.cu[BLOCK_8x8].intra_pred[21] = PFX(intra_pred_ang8_21_avx2);
101
         p.cu[BLOCK_8x8].intra_pred[22] = PFX(intra_pred_ang8_22_avx2);
102
@@ -3651,7 +3681,6 @@
103
         p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx2);
104
         p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx2);
105
         p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx2);
106
-        p.planeClipAndMax = PFX(planeClipAndMax_avx2);
107
 
108
         p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx2);
109
         p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx2);
110
@@ -3663,6 +3692,8 @@
111
         p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx2);
112
         p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx2);
113
         p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx2);
114
+        p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
115
+        p.fix8Pack = PFX(cutree_fix8_pack_avx2);
116
 
117
     }
118
 #endif
119
x265_1.9.tar.gz/source/common/x86/blockcopy8.asm -> x265_2.0.tar.gz/source/common/x86/blockcopy8.asm Changed
10
 
1
@@ -28,8 +28,6 @@
2
 
3
 SECTION_RODATA 32
4
 
5
-tab_Vm:    db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
6
-
7
 cextern pb_4
8
 cextern pb_1
9
 cextern pb_16
10
x265_1.9.tar.gz/source/common/x86/const-a.asm -> x265_2.0.tar.gz/source/common/x86/const-a.asm Changed
50
 
1
@@ -40,12 +40,16 @@
2
 const pb_8,                 times 32 db 8
3
 const pb_15,                times 32 db 15
4
 const pb_16,                times 32 db 16
5
+const pb_31,                times 32 db 31
6
 const pb_32,                times 32 db 32
7
 const pb_64,                times 32 db 64
8
+const pb_124,               times 32 db 124
9
 const pb_128,               times 32 db 128
10
 const pb_a1,                times 16 db 0xa1
11
 
12
 const pb_01,                times  8 db   0,   1
13
+const pb_0123,              times  4 db   0,   1
14
+                            times  4 db   2,   3
15
 const hsub_mul,             times 16 db   1,  -1
16
 const pw_swap,              times  2 db   6,   7,   4,   5,   2,   3,   0,   1
17
 const pb_unpackbd1,         times  2 db   0,   0,   0,   0,   1,   1,   1,   1,   2,   2,   2,   2,   3,   3,   3,   3
18
@@ -64,6 +68,8 @@
19
                             times 12 db 0x00
20
 const pb_000000000000000F,           db 0xff
21
                             times 15 db 0x00
22
+const pb_shuf_off4,         times  2 db   0,   4,   1,   5,   2,   6,   3,   7
23
+const pw_shuf_off4,         times  1 db   0,   1,   8,   9,   2,   3,  10,  11,   4,   5,  12,  13,   6,   7,  14,  15
24
 
25
 ;; 16-bit constants
26
 
27
@@ -115,6 +121,8 @@
28
 const hmul_16p,             times 16 db   1
29
                             times  8 db   1,  -1
30
 const pw_exp2_0_15,                  dw 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768
31
+const pw_1_ffff,            times  4 dw 1
32
+                            times  4 dw 0xFFFF
33
 
34
 
35
 ;; 32-bit constants
36
@@ -146,10 +154,6 @@
37
 const pd_planar16_mul2,     times  1 dd  15,  14,  13,  12,  11,  10,   9,   8,    7,   6,   5,   4,   3,   2,   1,   0
38
 const trans8_shuf,          times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
39
 
40
-const popcnt_table
41
-%assign x 0
42
-%rep 256
43
-; population count
44
-db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1)
45
-%assign x x+1
46
-%endrep
47
+;; 64-bit constants
48
+
49
+const pq_1,                 times 1 dq 1
50
x265_1.9.tar.gz/source/common/x86/intrapred8.asm -> x265_2.0.tar.gz/source/common/x86/intrapred8.asm Changed
201
 
1
@@ -355,55 +355,55 @@
2
                             times 8 db (32-22), 22
3
                             times 8 db (32-11), 11
4
 
5
-const ang16_shuf_mode9,    times 8 db 0, 1
6
-                           times 8 db 1, 2
7
+const ang16_shuf_mode9,     times 8 db 0, 1
8
+                            times 8 db 1, 2
9
 
10
-const angHor_tab_9,  db (32-2), 2, (32-4), 4, (32-6), 6, (32-8), 8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16
11
-                     db (32-18), 18, (32-20), 20, (32-22), 22, (32-24),  24, (32-26),  26, (32-28), 28, (32-30), 30, (32-32), 32
12
+const angHor_tab_9,         db (32-2), 2, (32-4), 4, (32-6), 6, (32-8), 8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16
13
+                            db (32-18), 18, (32-20), 20, (32-22), 22, (32-24),  24, (32-26),  26, (32-28), 28, (32-30), 30, (32-32), 32
14
 
15
-const angHor_tab_11, db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
16
-                     db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8),  8, (32- 6),  6, (32- 4),  4, (32- 2),  2, (32- 0),  0
17
+const angHor_tab_11,        db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
18
+                            db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8),  8, (32- 6),  6, (32- 4),  4, (32- 2),  2, (32- 0),  0
19
 
20
-const ang16_shuf_mode12,   db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3
21
-                           db 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2
22
+const ang16_shuf_mode12,    db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3
23
+                            db 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2
24
 
25
-const angHor_tab_12, db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32-7), 7, (32-2), 2, (32-29), 29, (32-24), 24
26
-                     db (32-19), 19, (32-14), 14, (32-9), 9, (32-4), 4, (32-31), 31, (32-26),  26, (32-21), 21, (32-16), 16
27
+const angHor_tab_12,        db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32-7), 7, (32-2), 2, (32-29), 29, (32-24), 24
28
+                            db (32-19), 19, (32-14), 14, (32-9), 9, (32-4), 4, (32-31), 31, (32-26),  26, (32-21), 21, (32-16), 16
29
 
30
-const ang16_shuf_mode13,   db 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 5, 6, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4
31
-                           db 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2
32
-                           db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0
33
+const ang16_shuf_mode13,    db 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 5, 6, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4
34
+                            db 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2
35
+                            db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0
36
 
37
-const angHor_tab_13, db (32-23), 23, (32-14), 14, (32-5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32-1), 1, (32-24), 24
38
-                     db (32-15), 15, (32-6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32-2), 2, (32-25), 25, (32-16), 16
39
+const angHor_tab_13,        db (32-23), 23, (32-14), 14, (32-5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32-1), 1, (32-24), 24
40
+                            db (32-15), 15, (32-6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32-2), 2, (32-25), 25, (32-16), 16
41
 
42
-const ang16_shuf_mode14,   db 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 3, 4, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 5, 6, 4, 5
43
-                           db 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1, 4, 5, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2
44
-                           db 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0
45
+const ang16_shuf_mode14,    db 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 3, 4, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 5, 6, 4, 5
46
+                            db 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1, 4, 5, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2
47
+                            db 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0
48
 
49
-const angHor_tab_14, db (32-19), 19, (32-6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32-5), 5, (32-24), 24
50
-                     db (32-11), 11, (32-30), 30, (32-17), 17, (32-4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16
51
+const angHor_tab_14,        db (32-19), 19, (32-6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32-5), 5, (32-24), 24
52
+                            db (32-11), 11, (32-30), 30, (32-17), 17, (32-4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16
53
 
54
-const ang16_shuf_mode15,   db 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 9, 10, 8, 9, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6
55
-                           db 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2
56
-                           db 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0
57
+const ang16_shuf_mode15,    db 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 9, 10, 8, 9, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6
58
+                            db 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2
59
+                            db 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0
60
 
61
-const angHor_tab_15, db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32-9), 9, (32-24), 24
62
-                     db (32-7), 7, (32-22), 22, (32-5), 5, (32-20), 20, (32-3), 3, (32-18), 18, (32-1), 1, (32- 16), 16
63
+const angHor_tab_15,        db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32-9), 9, (32-24), 24
64
+                            db (32-7), 7, (32-22), 22, (32-5), 5, (32-20), 20, (32-3), 3, (32-18), 18, (32-1), 1, (32- 16), 16
65
 
66
-const ang16_shuf_mode16,   db 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7
67
-                           db 5, 6, 4, 5, 3, 4, 3, 4, 2, 3, 1, 2, 1, 2, 0, 1, 6, 7, 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 2, 3, 1, 2
68
-                           db 0 ,0, 0, 0, 0, 15, 14, 12 , 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0
69
+const ang16_shuf_mode16,    db 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7
70
+                            db 5, 6, 4, 5, 3, 4, 3, 4, 2, 3, 1, 2, 1, 2, 0, 1, 6, 7, 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 2, 3, 1, 2
71
+                            db 0 ,0, 0, 0, 0, 15, 14, 12 , 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0
72
 
73
-const angHor_tab_16, db (32-11), 11, (32-22), 22, (32-1), 1, (32-12), 12, (32-23), 23, (32-2), 2, (32-13), 13, (32-24), 24
74
-                     db (32-3), 3, (32-14), 14, (32-25), 25, (32-4), 4, (32-15), 15, (32-26), 26, (32-5), 5, (32-16), 16
75
+const angHor_tab_16,        db (32-11), 11, (32-22), 22, (32-1), 1, (32-12), 12, (32-23), 23, (32-2), 2, (32-13), 13, (32-24), 24
76
+                            db (32-3), 3, (32-14), 14, (32-25), 25, (32-4), 4, (32-15), 15, (32-26), 26, (32-5), 5, (32-16), 16
77
 
78
-const ang16_shuf_mode17,   db 12, 13, 11, 12, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 13, 14, 12, 13, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8
79
-                           db 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 1, 2, 0, 1, 0, 1, 6, 7, 5, 6, 5, 6, 4, 5, 3, 4, 2, 3, 1, 2, 1, 2
80
-                           db 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0
81
+const ang16_shuf_mode17,    db 12, 13, 11, 12, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 13, 14, 12, 13, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8
82
+                            db 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 1, 2, 0, 1, 0, 1, 6, 7, 5, 6, 5, 6, 4, 5, 3, 4, 2, 3, 1, 2, 1, 2
83
+                            db 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0
84
 
85
-const angHor_tab_17, db (32- 6),  6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4),  4, (32-10), 10, (32-16), 16
86
-                     db (32-22), 22, (32-28), 28, (32- 2),  2, (32- 8),  8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0),  0
87
+const angHor_tab_17,        db (32- 6),  6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4),  4, (32-10), 10, (32-16), 16
88
+                            db (32-22), 22, (32-28), 28, (32- 2),  2, (32- 8),  8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0),  0
89
 
90
 ; Intrapred_angle32x32, modes 1 to 33 constants
91
 const ang32_shuf_mode9,         times 8 db 0, 1
92
@@ -467,6 +467,39 @@
93
                                 dd  0,  0,  2,  3,  0,  0,  7,  1
94
                                 dd  0,  0,  5,  6,  0,  0,  0,  0
95
 
96
+; Intrapred_angle8x8, modes 1 to 33 constants
97
+const ang8_shuf_mode3,          db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  4,  5,  5,  6,  6,  7,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  5,  6,  6,  7,  7,  8
98
+const ang8_shuf_mode4,          db  0,  1,  1,  2,  1,  2,  2,  3,  3,  4,  3,  4,  4,  5,  5,  6,  1,  2,  2,  3,  2,  3,  3,  4,  4,  5,  4,  5,  5,  6,  6,  7
99
+const ang8_shuf_mode5,          db  0,  1,  1,  2,  1,  2,  2,  3,  2,  3,  3,  4,  3,  4,  4,  5,  1,  2,  2,  3,  2,  3,  3,  4,  3,  4,  4,  5,  4,  5,  5,  6
100
+const ang8_shuf_mode6,          db  0,  1,  0,  1,  1,  2,  1,  2,  2,  3,  2,  3,  2,  3,  3,  4,  1,  2,  1,  2,  2,  3,  2,  3,  3,  4,  3,  4,  3,  4,  4,  5
101
+const ang8_shuf_mode7,          db  0,  1,  0,  1,  0,  1,  1,  2,  1,  2,  1,  2,  1,  2,  2,  3,  1,  2,  1,  2,  1,  2,  2,  3,  2,  3,  2,  3,  2,  3,  3,  4
102
+const ang8_shuf_mode8,          db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  2,  3,  2,  3
103
+const ang8_shuf_mode9,          db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2
104
+const ang8_shuf_mode12,         db  7,  8,  7,  8,  7,  8,  7,  8,  7,  8,  7,  8,  6,  7,  6,  7,  8,  9,  8,  9,  8,  9,  8,  9,  8,  9,  8,  9,  7,  8,  7,  8
105
+const ang8_shuf_mode13,         db  8,  9,  8,  9,  8,  9,  7,  8,  7,  8,  7,  8,  7,  8,  6,  7,  9, 10,  9, 10,  9, 10,  8,  9,  8,  9,  8,  9,  8,  9,  7,  8
106
+const ang8_shuf_mode14,         db  9, 10,  9, 10,  8,  9,  8,  9,  7,  8,  7,  8,  7,  8,  6,  7, 10, 11, 10, 11,  9, 10,  9, 10,  8,  9,  8,  9,  8,  9,  7,  8
107
+const ang8_shuf_mode15,         db 10, 11,  9, 10,  9, 10,  8,  9,  8,  9,  7,  8,  7,  8,  6,  7, 11, 12, 10, 11, 10, 11,  9, 10,  9, 10,  8,  9,  8,  9,  7,  8
108
+                                db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  6,  4,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  6,  4,  2,  0
109
+const ang8_shuf_mode16,         db 11, 12, 10, 11, 10, 11,  9, 10,  8,  9,  8,  9,  7,  8,  6,  7, 12, 13, 11, 12, 11, 12, 10, 11,  9, 10,  9, 10,  8,  9,  7,  8
110
+                                db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  6,  5,  3,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  6,  5,  3,  2,  0
111
+const ang8_shuf_mode17,         db 12, 13, 11, 12, 10, 11,  9, 10,  8,  9,  8,  9,  7,  8,  6,  7, 13, 14, 12, 13, 11, 12, 10, 11,  9, 10,  9, 10,  8,  9,  7,  8
112
+                                db  0,  0,  0,  0,  0,  0,  0,  0,  0,  7,  6,  5,  4,  2,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  7,  6,  5,  4,  2,  1,  0
113
+
114
+const ang8_fact_mode3,          db (32-26), 26, (32-20), 20, (32-14), 14, (32- 8),  8, (32- 2),  2, (32-28), 28, (32-22), 22, (32-16), 16
115
+const ang8_fact_mode4,          db (32-21), 21, (32-10), 10, (32-31), 31, (32-20), 20, (32- 9),  9, (32-30), 30, (32-19), 19, (32- 8),  8
116
+const ang8_fact_mode5,          db (32-17), 17, (32- 2),  2, (32-19), 19, (32- 4),  4, (32-21), 21, (32- 6),  6, (32-23), 23, (32- 8),  8
117
+const ang8_fact_mode6,          db (32-13), 13, (32-26), 26, (32- 7),  7, (32-20), 20, (32- 1),  1, (32-14), 14, (32-27), 27, (32- 8),  8
118
+const ang8_fact_mode7,          db (32- 9),  9, (32-18), 18, (32-27), 27, (32- 4),  4, (32-13), 13, (32-22), 22, (32-31), 31, (32- 8),  8
119
+const ang8_fact_mode8,          db (32- 5),  5, (32-10), 10, (32-15), 15, (32-20), 20, (32-25), 25, (32-30), 30, (32- 3),  3, (32- 8),  8
120
+const ang8_fact_mode9,          db (32- 2),  2, (32- 4),  4, (32- 6),  6, (32- 8),  8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16
121
+const ang8_fact_mode11,         db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
122
+const ang8_fact_mode12,         db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32- 7),  7, (32- 2),  2, (32-29), 29, (32-24), 24
123
+const ang8_fact_mode13,         db (32-23), 23, (32-14), 14, (32- 5),  5, (32-28), 28, (32-19), 19, (32-10), 10, (32- 1),  1, (32-24), 24
124
+const ang8_fact_mode14,         db (32-19), 19, (32- 6),  6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32- 5),  5, (32-24), 24
125
+const ang8_fact_mode15,         db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32- 9),  9, (32-24), 24
126
+const ang8_fact_mode16,         db (32-11), 11, (32-22), 22, (32- 1),  1, (32-12), 12, (32-23), 23, (32- 2),  2, (32-13), 13, (32-24), 24
127
+const ang8_fact_mode17,         db (32- 6),  6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4),  4, (32-10), 10, (32-16), 16
128
+
129
 const ang_table
130
 %assign x 0
131
 %rep 32
132
@@ -490,6 +523,7 @@
133
 
134
 SECTION .text
135
 cextern pb_1
136
+cextern pb_2
137
 cextern pw_2
138
 cextern pw_3
139
 cextern pw_4
140
@@ -18582,48 +18616,48 @@
141
 ; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
142
 ;-----------------------------------------------------------------------------------------
143
 INIT_YMM avx2
144
-cglobal intra_pred_ang8_3, 3,4,5
145
-    mova              m3, [pw_1024]
146
+%macro ang8_store8x8 0
147
+    lea               r3, [3 * r1]
148
+    vextracti128      xm2, m1, 1
149
+    vextracti128      xm5, m4, 1
150
+    movq              [r0], xm1
151
+    movq              [r0 + r1], xm2
152
+    movhps            [r0 + 2 * r1], xm1
153
+    movhps            [r0 + r3], xm2
154
+    lea               r0, [r0 + 4 * r1]
155
+    movq              [r0], xm4
156
+    movq              [r0 + r1], xm5
157
+    movhps            [r0 + 2 * r1], xm4
158
+    movhps            [r0 + r3], xm5
159
+%endmacro
160
+
161
+cglobal intra_pred_ang8_3, 3,4,6
162
     vbroadcasti128    m0, [r2 + 17]
163
+    mova              m5, [ang8_shuf_mode3]
164
+    mova              m3, [pb_2]
165
 
166
-    pshufb            m1, m0, [c_ang8_src1_9_2_10]
167
-    pshufb            m2, m0, [c_ang8_src3_11_4_12]
168
-    pshufb            m4, m0, [c_ang8_src5_13_5_13]
169
-    pshufb            m0,     [c_ang8_src6_14_7_15]
170
+    pshufb            m1, m0, m5
171
+    paddb             m5, m3
172
+    pshufb            m2, m0, m5
173
+    paddb             m5, m3
174
+    pshufb            m4, m0, m5
175
+    paddb             m5, m3
176
+    pshufb            m0, m5
177
 
178
-    pmaddubsw         m1, [c_ang8_26_20]
179
+    vbroadcasti128    m5, [ang8_fact_mode3]
180
+    mova              m3, [pw_1024]
181
+    pmaddubsw         m1, m5
182
+    pmaddubsw         m2, m5
183
+    pmaddubsw         m4, m5
184
+    pmaddubsw         m0, m5
185
     pmulhrsw          m1, m3
186
-    pmaddubsw         m2, [c_ang8_14_8]
187
     pmulhrsw          m2, m3
188
-    pmaddubsw         m4, [c_ang8_2_28]
189
     pmulhrsw          m4, m3
190
-    pmaddubsw         m0, [c_ang8_22_16]
191
     pmulhrsw          m0, m3
192
     packuswb          m1, m2
193
     packuswb          m4, m0
194
 
195
-    vperm2i128        m2, m1, m4, 00100000b
196
-    vperm2i128        m1, m1, m4, 00110001b
197
-    punpcklbw         m4, m2, m1
198
-    punpckhbw         m2, m1
199
-    punpcklwd         m1, m4, m2
200
-    punpckhwd         m4, m2
201
x265_1.9.tar.gz/source/common/x86/ipfilter16.asm -> x265_2.0.tar.gz/source/common/x86/ipfilter16.asm Changed
201
 
1
@@ -116,6 +116,7 @@
2
                   dw  -1, 4, -11, 40,  40, -11, 4, -1
3
                   dw   0, 1, -5,  17,  58, -10, 4, -1
4
 
5
+ALIGN 32
6
 tab_LumaCoeffV:   times 4 dw 0, 0
7
                   times 4 dw 0, 64
8
                   times 4 dw 0, 0
9
@@ -161,9 +162,8 @@
10
 const interp8_hpp_shuf,     db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
11
                             db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
12
 
13
-const pb_shuf,  db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
14
-                db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
15
-
16
+const interp8_hpp_shuf_new, db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
17
+                            db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
18
 
19
 SECTION .text
20
 cextern pd_8
21
@@ -10407,7 +10407,7 @@
22
     vpbroadcastq        m0, [tab_LumaCoeff + r4]
23
     vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
24
 %endif
25
-    mova                m3, [pb_shuf]
26
+    mova                m3, [interp8_hpp_shuf]
27
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
28
 
29
     ; register map
30
@@ -10475,7 +10475,7 @@
31
     vpbroadcastq        m0, [tab_LumaCoeff + r4]
32
     vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
33
 %endif
34
-    mova                m3, [pb_shuf]
35
+    mova                m3, [interp8_hpp_shuf]
36
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
37
 
38
     ; register map
39
@@ -10536,16 +10536,16 @@
40
     add                 r3d, r3d
41
     mov                 r4d, r4m
42
     mov                 r5d, r5m
43
-    shl                 r4d, 4
44
+    shl                 r4d, 6
45
 %ifdef PIC
46
-    lea                 r6, [tab_LumaCoeff]
47
-    vpbroadcastq        m0, [r6 + r4]
48
-    vpbroadcastq        m1, [r6 + r4 + 8]
49
+    lea                 r6, [tab_LumaCoeffV]
50
+    movu                m0, [r6 + r4]
51
+    movu                m1, [r6 + r4 + mmsize]
52
 %else
53
-    vpbroadcastq        m0, [tab_LumaCoeff + r4]
54
-    vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
55
+    movu                m0, [tab_LumaCoeffV + r4]
56
+    movu                m1, [tab_LumaCoeffV + r4 + mmsize]
57
 %endif
58
-    mova                m3, [pb_shuf]
59
+    mova                m3, [interp8_hpp_shuf_new]
60
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
61
 
62
     ; register map
63
@@ -10554,7 +10554,7 @@
64
     sub                 r0, 6
65
     test                r5d, r5d
66
     mov                 r4d, %2
67
-    jz                  .loop0
68
+    jz                 .loop0
69
     lea                 r6, [r1*3]
70
     sub                 r0, r6
71
     add                 r4d, 7
72
@@ -10563,64 +10563,64 @@
73
 %assign x 0
74
 %rep %1/16
75
     vbroadcasti128      m4, [r0 + x]
76
-    vbroadcasti128      m5, [r0 + 8 + x]
77
+    vbroadcasti128      m5, [r0 + 4 * SIZEOF_PIXEL + x]
78
     pshufb              m4, m3
79
-    pshufb              m7, m5, m3
80
+    pshufb              m5, m3
81
 
82
     pmaddwd             m4, m0
83
-    pmaddwd             m7, m1
84
+    pmaddwd             m7, m5, m1
85
     paddd               m4, m7
86
+    vextracti128        xm7, m4, 1
87
+    paddd               xm4, xm7
88
+    paddd               xm4, xm2
89
+    psrad               xm4, INTERP_SHIFT_PS
90
 
91
     vbroadcasti128      m6, [r0 + 16 + x]
92
-    pshufb              m5, m3
93
-    pshufb              m7, m6, m3
94
+    pshufb              m6, m3
95
 
96
     pmaddwd             m5, m0
97
-    pmaddwd             m7, m1
98
+    pmaddwd             m7, m6, m1
99
     paddd               m5, m7
100
-
101
-    phaddd              m4, m5
102
-    vpermq              m4, m4, q3120
103
-    paddd               m4, m2
104
-    vextracti128        xm5,m4, 1
105
-    psrad               xm4, INTERP_SHIFT_PS
106
+    vextracti128        xm7, m5, 1
107
+    paddd               xm5, xm7
108
+    paddd               xm5, xm2
109
     psrad               xm5, INTERP_SHIFT_PS
110
-    packssdw            xm4, xm5
111
 
112
+    packssdw            xm4, xm5
113
     movu                [r2 + x], xm4
114
 
115
     vbroadcasti128      m5, [r0 + 24 + x]
116
-    pshufb              m6, m3
117
-    pshufb              m7, m5, m3
118
+    pshufb              m5, m3
119
 
120
     pmaddwd             m6, m0
121
-    pmaddwd             m7, m1
122
+    pmaddwd             m7, m5, m1
123
     paddd               m6, m7
124
+    vextracti128        xm7, m6, 1
125
+    paddd               xm6, xm7
126
+    paddd               xm6, xm2
127
+    psrad               xm6, INTERP_SHIFT_PS
128
 
129
     vbroadcasti128      m7, [r0 + 32 + x]
130
-    pshufb              m5, m3
131
     pshufb              m7, m3
132
 
133
     pmaddwd             m5, m0
134
     pmaddwd             m7, m1
135
     paddd               m5, m7
136
-
137
-    phaddd              m6, m5
138
-    vpermq              m6, m6, q3120
139
-    paddd               m6, m2
140
-    vextracti128        xm5,m6, 1
141
-    psrad               xm6, INTERP_SHIFT_PS
142
+    vextracti128        xm7, m5, 1
143
+    paddd               xm5, xm7
144
+    paddd               xm5, xm2
145
     psrad               xm5, INTERP_SHIFT_PS
146
-    packssdw            xm6, xm5
147
 
148
+    packssdw            xm6, xm5
149
     movu                [r2 + 16 + x], xm6
150
-    %assign x x+32
151
-    %endrep
152
+
153
+%assign x x+32
154
+%endrep
155
 
156
     add                 r2, r3
157
     add                 r0, r1
158
     dec                 r4d
159
-    jnz                 .loop0
160
+    jnz                .loop0
161
     RET
162
 %endif
163
 %endmacro
164
@@ -10656,7 +10656,7 @@
165
     vpbroadcastq        m0, [tab_LumaCoeff + r4]
166
     vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
167
 %endif
168
-    mova                m3, [pb_shuf]
169
+    mova                m3, [interp8_hpp_shuf]
170
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
171
 
172
     ; register map
173
@@ -10749,7 +10749,7 @@
174
     vpbroadcastq        m0, [tab_LumaCoeff + r4]
175
     vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
176
 %endif
177
-    mova                m3, [pb_shuf]
178
+    mova                m3, [interp8_hpp_shuf]
179
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
180
 
181
     ; register map
182
@@ -10824,7 +10824,7 @@
183
 %else
184
     vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
185
 %endif
186
-    mova                m3, [pb_shuf]
187
+    mova                m3, [interp8_hpp_shuf]
188
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
189
 
190
     ; register map
191
@@ -10883,7 +10883,7 @@
192
 %else
193
     vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
194
 %endif
195
-    mova                m3, [pb_shuf]
196
+    mova                m3, [interp8_hpp_shuf]
197
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
198
 
199
     ; register map
200
@@ -10956,7 +10956,7 @@
201
x265_1.9.tar.gz/source/common/x86/loopfilter.asm -> x265_2.0.tar.gz/source/common/x86/loopfilter.asm Changed
201
 
1
@@ -29,9 +29,6 @@
2
 %include "x86util.asm"
3
 
4
 SECTION_RODATA 32
5
-pb_31:      times 32 db 31
6
-pb_124:     times 32 db 124
7
-pb_15:      times 32 db 15
8
 
9
 SECTION .text
10
 cextern pb_1
11
@@ -39,6 +36,10 @@
12
 cextern pb_3
13
 cextern pb_4
14
 cextern pb_01
15
+cextern pb_0123
16
+cextern pb_15
17
+cextern pb_31
18
+cextern pb_124
19
 cextern pb_128
20
 cextern pw_1
21
 cextern pw_n1
22
@@ -48,7 +49,9 @@
23
 cextern pb_movemask
24
 cextern pb_movemask_32
25
 cextern hmul_16p
26
-
27
+cextern pw_1_ffff
28
+cextern pb_shuf_off4
29
+cextern pw_shuf_off4
30
 
31
 ;============================================================================================================
32
 ; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t* signLeft, intptr_t stride)
33
@@ -154,7 +157,9 @@
34
     sub         r4d, 16
35
     jnz        .loopH
36
     RET
37
-%else ; HIGH_BIT_DEPTH
38
+
39
+%else ; HIGH_BIT_DEPTH == 1
40
+
41
 cglobal saoCuOrgE0, 5, 5, 8, rec, offsetEo, lcuWidth, signLeft, stride
42
 
43
     mov         r4d, r4m
44
@@ -240,7 +245,7 @@
45
     sub         r4d, 16
46
     jnz        .loopH
47
     RET
48
-%endif
49
+%endif ; HIGH_BIT_DEPTH == 0
50
 
51
 INIT_YMM avx2
52
 %if HIGH_BIT_DEPTH
53
@@ -2061,6 +2066,117 @@
54
 ; saoCuStatsE0(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
55
 ;-----------------------------------------------------------------------------------------------------------------------
56
 %if ARCH_X86_64
57
+
58
+%if HIGH_BIT_DEPTH == 1
59
+INIT_XMM sse4
60
+cglobal saoCuStatsE0, 3,10,8, 0-32
61
+    mov         r3d, r3m
62
+    mov         r4d, r4m
63
+    mov         r9, r5mp
64
+
65
+    ; clear internal temporary buffer
66
+    pxor        m0, m0
67
+    mova        [rsp], m0
68
+    mova        [rsp + mmsize], m0
69
+    mova        m4, [pw_1]
70
+    mova        m5, [pb_2]
71
+    xor         r7d, r7d
72
+
73
+    ; correct stride for diff[] and rec
74
+    mov         r6d, r3d
75
+    and         r6d, ~15
76
+    sub         r2, r6
77
+    lea         r8, [(r6 - 64) * 2]             ; 64 = MAX_CU_SIZE
78
+
79
+    FIX_STRIDES r2
80
+
81
+.loopH:
82
+    mov         r5d, r3d
83
+
84
+    ; calculate signLeft
85
+    mov         r7w, [r1]
86
+    sub         r7w, [r1 - SIZEOF_PIXEL]
87
+    seta        r7b
88
+    setb        r6b
89
+    sub         r7b, r6b
90
+    neg         r7b
91
+    pinsrb      m0, r7d, 15
92
+
93
+.loopL:
94
+
95
+    movu        m3, [r1]
96
+    movu        m2, [r1 + SIZEOF_PIXEL]
97
+    pcmpgtw     m6, m3, m2
98
+    pcmpgtw     m2, m3
99
+    pand        m6, m4
100
+    por         m2, m6
101
+
102
+    movu        m3, [r1 + mmsize]
103
+    movu        m6, [r1 + mmsize + SIZEOF_PIXEL]
104
+    pcmpgtw     m7, m3, m6
105
+    pcmpgtw     m6, m3
106
+    pand        m7, m4
107
+    por         m7, m6
108
+
109
+    packsswb    m2, m7                          ; signRight
110
+
111
+    palignr     m3, m2, m0, 15
112
+
113
+    pxor        m6, m6
114
+    psubb       m6, m3                          ; signLeft
115
+
116
+    mova        m0, m2
117
+    paddb       m2, m6
118
+    paddb       m2, m5                          ; edgeType
119
+
120
+    ; stats[edgeType]
121
+%assign x 0
122
+%rep 16
123
+    pextrb      r7d, m2, x
124
+
125
+    movsx       r6d, word [r0 + x * 2]
126
+    inc         word [rsp + r7 * 2]             ; tmp_count[edgeType]++
127
+    add         [rsp + 5 * 2 + r7 * 4], r6d     ; tmp_stats[edgeType] += (fenc[x] - rec[x])
128
+    dec         r5d
129
+    jz         .next
130
+%assign x x+1
131
+%endrep
132
+
133
+    add         r0, 16*2
134
+    add         r1, 16 * SIZEOF_PIXEL
135
+    jmp        .loopL
136
+
137
+.next:
138
+    sub         r0, r8
139
+    add         r1, r2
140
+
141
+    dec         r4d
142
+    jnz        .loopH
143
+
144
+    ; sum to global buffer
145
+    mov         r0, r6mp
146
+
147
+    ; s_eoTable = {1, 2, 0, 3, 4}
148
+    pmovzxwd    m0, [rsp + 0 * 2]
149
+    pshufd      m0, m0, q3102
150
+    movu        m1, [r0]
151
+    paddd       m0, m1
152
+    movu        [r0], m0
153
+    movzx       r5d, word [rsp + 4 * 2]
154
+    add         [r0 + 4 * 4], r5d
155
+
156
+    movu        m0, [rsp + 5 * 2 + 0 * 4]
157
+    pshufd      m0, m0, q3102
158
+    movu        m1, [r9]
159
+    paddd       m0, m1
160
+    movu        [r9], m0
161
+    mov         r6d, [rsp + 5 * 2 + 4 * 4]
162
+    add         [r9 + 4 * 4], r6d
163
+    RET
164
+%endif ; HIGH_BIT_DEPTH=1
165
+
166
+
167
+%if HIGH_BIT_DEPTH == 0
168
 INIT_XMM sse4
169
 cglobal saoCuStatsE0, 3,10,6, 0-32
170
     mov         r3d, r3m
171
@@ -2086,7 +2202,7 @@
172
 
173
     ; calculate signLeft
174
     mov         r7b, [r1]
175
-    sub         r7b, [r1 - 1]
176
+    sub         r7b, [r1 - SIZEOF_PIXEL]
177
     seta        r7b
178
     setb        r6b
179
     sub         r7b, r6b
180
@@ -2095,13 +2211,14 @@
181
 
182
 .loopL:
183
     movu        m3, [r1]
184
-    movu        m2, [r1 + 1]
185
+    movu        m2, [r1 + SIZEOF_PIXEL]
186
 
187
     pxor        m1, m3, m4
188
     pxor        m2, m4
189
     pcmpgtb     m3, m1, m2
190
     pcmpgtb     m2, m1
191
     pand        m3, [pb_1]
192
+
193
     por         m2, m3                          ; signRight
194
 
195
     palignr     m3, m2, m0, 15
196
@@ -2125,7 +2242,7 @@
197
 %endrep
198
 
199
     add         r0, 16*2
200
-    add         r1, 16
201
x265_1.9.tar.gz/source/common/x86/loopfilter.h -> x265_2.0.tar.gz/source/common/x86/loopfilter.h Changed
9
 
1
@@ -48,5 +48,7 @@
2
 
3
 void PFX(pelFilterLumaStrong_V_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
4
 void PFX(pelFilterLumaStrong_H_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
5
+void PFX(pelFilterChroma_V_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ);
6
+void PFX(pelFilterChroma_H_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ);
7
 
8
 #endif // ifndef X265_LOOPFILTER_H
9
x265_1.9.tar.gz/source/common/x86/mc-a.asm -> x265_2.0.tar.gz/source/common/x86/mc-a.asm Changed
26
 
1
@@ -53,7 +53,6 @@
2
              times 8 db 2
3
              times 8 db 4
4
              times 8 db 6
5
-sq_1: times 1 dq 1
6
 
7
 SECTION .text
8
 
9
@@ -74,6 +73,7 @@
10
 cextern pw_pixel_max
11
 cextern pd_32
12
 cextern pd_64
13
+cextern pq_1
14
 
15
 ;====================================================================================================================
16
 ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
17
@@ -3638,7 +3638,7 @@
18
     mova        m3, [r4+16]
19
     movd        m2, [r4+32]         ; denom
20
     mova        m4, [pw_pixel_max]
21
-    paddw       m2, [sq_1]          ; denom+1
22
+    paddw       m2, [pq_1]          ; denom+1
23
 %endmacro
24
 
25
 ; src1, src2
26
x265_1.9.tar.gz/source/common/x86/mc-a2.asm -> x265_2.0.tar.gz/source/common/x86/mc-a2.asm Changed
151
 
1
@@ -43,11 +43,11 @@
2
 deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
3
 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
4
 %endif
5
-pw_1024: times 16 dw 1024
6
 
7
-pd_16: times 4 dd 16
8
-pd_0f: times 4 dd 0xffff
9
-pf_inv256: times 8 dd 0.00390625
10
+cutree_fix8_unpack_shuf: db -1,-1, 0, 1,-1,-1, 2, 3,-1,-1, 4, 5,-1,-1, 6, 7
11
+                         db -1,-1, 8, 9,-1,-1,10,11,-1,-1,12,13,-1,-1,14,15
12
+
13
+const pq_256,       times 4 dq 256.0
14
 const pd_inv256,    times 4 dq 0.00390625
15
 const pd_0_5,       times 4 dq 0.5
16
 
17
@@ -59,9 +59,11 @@
18
 cextern pw_32
19
 cextern pw_512
20
 cextern pw_00ff
21
+cextern pw_1024
22
 cextern pw_3fff
23
 cextern pw_pixel_max
24
 cextern pd_ffff
25
+cextern pd_16
26
 
27
 ;The hpel_filter routines use non-temporal writes for output.
28
 ;The following defines may be uncommented for testing.
29
@@ -1215,3 +1217,121 @@
30
 
31
 INIT_YMM avx2
32
 MBTREE_AVX
33
+
34
+
35
+%macro CUTREE_FIX8 0
36
+;-----------------------------------------------------------------------------
37
+; void cutree_fix8_pack( uint16_t *dst, double *src, int count )
38
+;-----------------------------------------------------------------------------
39
+cglobal cutree_fix8_pack, 3, 4, 5
40
+    movapd       m2, [pq_256]
41
+    sub          r2d, mmsize / 2
42
+    movsxdifnidn r2, r2d
43
+    lea          r1, [r1 + 8 * r2]
44
+    lea          r0, [r0 + 2 * r2]
45
+    neg          r2
46
+    jg .skip_loop
47
+.loop:
48
+    mulpd        m0, m2, [r1 + 8 * r2]
49
+    mulpd        m1, m2, [r1 + 8 * r2 + mmsize]
50
+    mulpd        m3, m2, [r1 + 8 * r2 + 2 * mmsize]
51
+    mulpd        m4, m2, [r1 + 8 * r2 + 3 * mmsize]
52
+    cvttpd2dq    xm0, m0
53
+    cvttpd2dq    xm1, m1
54
+    cvttpd2dq    xm3, m3
55
+    cvttpd2dq    xm4, m4
56
+%if mmsize == 32
57
+    vinserti128  m0, m0, xm3, 1
58
+    vinserti128  m1, m1, xm4, 1
59
+    packssdw     m0, m1
60
+%else
61
+    punpcklqdq   m0, m1
62
+    punpcklqdq   m3, m4
63
+    packssdw     m0, m3
64
+%endif
65
+    mova         [r0 + 2 * r2], m0
66
+    add          r2, mmsize / 2
67
+    jle .loop
68
+.skip_loop:
69
+    sub          r2, mmsize / 2
70
+    jz .end
71
+    ; Do the remaining values in scalar in order to avoid overreading src.
72
+.scalar:
73
+    movq         xm0, [r1 + 8 * r2 + 4 * mmsize] 
74
+    mulsd        xm0, xm2
75
+    cvttsd2si    r3d, xm0
76
+    mov          [r0 + 2 * r2 + mmsize], r3w
77
+    inc          r2
78
+    jl .scalar
79
+.end:
80
+    RET
81
+
82
+;-----------------------------------------------------------------------------
83
+; void cutree_fix8_unpack( double *dst, uint16_t *src, int count )
84
+;-----------------------------------------------------------------------------
85
+cglobal cutree_fix8_unpack, 3, 4, 7
86
+%if mmsize != 32
87
+    mova           m4, [cutree_fix8_unpack_shuf+16]
88
+%endif
89
+    movapd         m2, [pd_inv256]
90
+    mova           m3, [cutree_fix8_unpack_shuf]
91
+    sub            r2d, mmsize / 2
92
+    movsxdifnidn   r2, r2d
93
+    lea            r1, [r1 + 2 * r2]
94
+    lea            r0, [r0 + 8 * r2]
95
+    neg            r2
96
+    jg .skip_loop
97
+.loop:
98
+%if mmsize == 32
99
+    vbroadcasti128 m0, [r1 + 2 * r2]
100
+    vbroadcasti128 m1, [r1 + 2 * r2 + 16]
101
+    pshufb         m0, m3
102
+    pshufb         m1, m3
103
+%else
104
+    mova           m1, [r1 + 2 * r2]
105
+    pshufb         m0, m1, m3
106
+    pshufb         m1, m4
107
+%endif
108
+    psrad          m0, 16 ; sign-extend
109
+    psrad          m1, 16
110
+    cvtdq2pd       m5, xm0
111
+    cvtdq2pd       m6, xm1
112
+%if mmsize == 32
113
+    vpermq         m0, m0, q1032
114
+    vpermq         m1, m1, q1032
115
+%else
116
+    psrldq         m0, 8
117
+    psrldq         m1, 8
118
+%endif
119
+    cvtdq2pd       m0, xm0
120
+    cvtdq2pd       m1, xm1
121
+    mulpd          m0, m2
122
+    mulpd          m1, m2
123
+    mulpd          m5, m2
124
+    mulpd          m6, m2
125
+    movapd         [r0 + 8 * r2], m5
126
+    movapd         [r0 + 8 * r2 + mmsize], m0
127
+    movapd         [r0 + 8 * r2 + mmsize * 2], m6
128
+    movapd         [r0 + 8 * r2 + mmsize * 3], m1
129
+    add            r2, mmsize / 2
130
+    jle .loop
131
+.skip_loop:
132
+    sub            r2, mmsize / 2
133
+    jz .end
134
+.scalar:
135
+    movzx          r3d, word [r1 + 2 * r2 + mmsize]
136
+    movsx          r3d, r3w
137
+    cvtsi2sd       xm0, r3d
138
+    mulsd          xm0, xm2
139
+    movsd          [r0 + 8 * r2 + 4 * mmsize], xm0
140
+    inc            r2
141
+    jl .scalar
142
+.end:
143
+    RET
144
+%endmacro
145
+
146
+INIT_XMM ssse3
147
+CUTREE_FIX8
148
+
149
+INIT_YMM avx2
150
+CUTREE_FIX8
151
x265_1.9.tar.gz/source/common/x86/mc.h -> x265_2.0.tar.gz/source/common/x86/mc.h Changed
22
 
1
@@ -46,4 +46,20 @@
2
 
3
 #undef PROPAGATE_COST
4
 
5
+#define FIX8UNPACK(cpu) \
6
+    void PFX(cutree_fix8_unpack_ ## cpu)(double *dst, uint16_t *src, int count);
7
+
8
+FIX8UNPACK(ssse3)
9
+FIX8UNPACK(avx2)
10
+
11
+#undef FIX8UNPACK
12
+
13
+#define FIX8PACK(cpu) \
14
+    void PFX(cutree_fix8_pack_## cpu)(uint16_t *dst, double *src, int count);
15
+
16
+FIX8PACK(ssse3)
17
+FIX8PACK(avx2)
18
+
19
+#undef FIX8PACK
20
+
21
 #endif // ifndef X265_MC_H
22
x265_1.9.tar.gz/source/common/x86/pixel-a.asm -> x265_2.0.tar.gz/source/common/x86/pixel-a.asm Changed
201
 
1
@@ -50,9 +50,6 @@
2
 transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
3
 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
4
 
5
-sw_f0:     dq 0xfff0, 0
6
-pd_f0:     times 4 dd 0xffff0000
7
-
8
 SECTION .text
9
 
10
 cextern pb_0
11
@@ -67,7 +64,6 @@
12
 cextern pw_pmpmpmpm
13
 cextern pw_pmmpzzzz
14
 cextern pd_1
15
-cextern popcnt_table
16
 cextern pd_2
17
 cextern hmul_16p
18
 cextern pb_movemask
19
@@ -13803,3 +13799,589 @@
20
     movzx           eax, al
21
     RET
22
 %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
23
+
24
+
25
+%if HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10
26
+%macro LOAD_DIFF_AVX2 4
27
+    movu       %1, %3
28
+    movu       %2, %4
29
+    psubw      %1, %2
30
+%endmacro
31
+
32
+%macro LOAD_DIFF_8x4P_AVX2 6-8 r0,r2 ; 4x dest, 2x temp, 2x pointer
33
+    LOAD_DIFF_AVX2 xm%1, xm%5, [%7],      [%8]
34
+    LOAD_DIFF_AVX2 xm%2, xm%6, [%7+r1],   [%8+r3]
35
+    LOAD_DIFF_AVX2 xm%3, xm%5, [%7+2*r1], [%8+2*r3]
36
+    LOAD_DIFF_AVX2 xm%4, xm%6, [%7+r4],   [%8+r5]
37
+
38
+    ;lea %7, [%7+4*r1]
39
+    ;lea %8, [%8+4*r3]
40
+%endmacro
41
+
42
+INIT_YMM avx2
43
+cglobal pixel_satd_8x8, 4,4,7
44
+
45
+    FIX_STRIDES r1, r3
46
+    pxor    xm6, xm6
47
+
48
+    ; load_diff 0 & 4
49
+    movu    xm0, [r0]
50
+    movu    xm1, [r2]
51
+    vinserti128 m0, m0, [r0 + r1 * 4], 1
52
+    vinserti128 m1, m1, [r2 + r3 * 4], 1
53
+    psubw   m0, m1
54
+    add     r0, r1
55
+    add     r2, r3
56
+
57
+    ; load_diff 1 & 5
58
+    movu    xm1, [r0]
59
+    movu    xm2, [r2]
60
+    vinserti128 m1, m1, [r0 + r1 * 4], 1
61
+    vinserti128 m2, m2, [r2 + r3 * 4], 1
62
+    psubw   m1, m2
63
+    add     r0, r1
64
+    add     r2, r3
65
+
66
+    ; load_diff 2 & 6
67
+    movu    xm2, [r0]
68
+    movu    xm3, [r2]
69
+    vinserti128 m2, m2, [r0 + r1 * 4], 1
70
+    vinserti128 m3, m3, [r2 + r3 * 4], 1
71
+    psubw   m2, m3
72
+    add     r0, r1
73
+    add     r2, r3
74
+
75
+    ; load_diff 3 & 7
76
+    movu    xm3, [r0]
77
+    movu    xm4, [r2]
78
+    vinserti128 m3, m3, [r0 + r1 * 4], 1
79
+    vinserti128 m4, m4, [r2 + r3 * 4], 1
80
+    psubw   m3, m4
81
+
82
+    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
83
+
84
+    vextracti128 xm0, m6, 1
85
+    paddw xm6, xm0
86
+    HADDUW xm6, xm0
87
+    movd   eax, xm6
88
+    RET
89
+
90
+INIT_XMM avx2
91
+cglobal pixel_sa8d_8x8_internal
92
+    lea  r6, [r0+4*r1]
93
+    lea  r7, [r2+4*r3]
94
+    LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
95
+    LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
96
+
97
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
98
+    ;HADAMARD2_2D 0, 1, 2, 8, 6, wd
99
+    ;HADAMARD2_2D 4, 5, 3, 9, 6, wd
100
+    ;HADAMARD2_2D 0, 2, 1, 8, 6, dq
101
+    ;HADAMARD2_2D 4, 3, 5, 9, 6, dq
102
+    ;HADAMARD2_2D 0, 4, 2, 3, 6, qdq, amax
103
+    ;HADAMARD2_2D 1, 5, 8, 9, 6, qdq, amax
104
+
105
+    paddw m0, m1
106
+    paddw m0, m2
107
+    paddw m0, m8
108
+    SAVE_MM_PERMUTATION
109
+    ret
110
+
111
+
112
+INIT_XMM avx2
113
+cglobal pixel_sa8d_8x8, 4,8,12
114
+    FIX_STRIDES r1, r3
115
+    lea  r4, [3*r1]
116
+    lea  r5, [3*r3]
117
+    call pixel_sa8d_8x8_internal
118
+    HADDUW m0, m1
119
+    movd eax, m0
120
+    add eax, 1
121
+    shr eax, 1
122
+    RET
123
+
124
+
125
+INIT_YMM avx2
126
+cglobal pixel_sa8d_16x16, 4,8,12
127
+    FIX_STRIDES r1, r3
128
+    lea  r4, [3*r1]
129
+    lea  r5, [3*r3]
130
+    lea  r6, [r0+4*r1]
131
+    lea  r7, [r2+4*r3]
132
+    vbroadcasti128 m7, [pw_1]
133
+
134
+    ; Top 16x8
135
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
136
+    movu m0, [r0]                                   ; 10 bits
137
+    movu m5, [r2]
138
+    psubw m0, m5                                    ; 11 bits
139
+    movu m1, [r0 + r1]
140
+    movu m6, [r2 + r3]
141
+    psubw m1, m6
142
+    movu m2, [r0 + r1 * 2]
143
+    movu m5, [r2 + r3 * 2]
144
+    psubw m2, m5
145
+    movu m8, [r0 + r4]
146
+    movu m6, [r2 + r5]
147
+    psubw m8, m6
148
+
149
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
150
+    movu m4, [r6]
151
+    movu m11, [r7]
152
+    psubw m4, m11
153
+    movu m5, [r6 + r1]
154
+    movu m6, [r7 + r3]
155
+    psubw m5, m6
156
+    movu m3, [r6 + r1 * 2]
157
+    movu m11, [r7 + r3 * 2]
158
+    psubw m3, m11
159
+    movu m9, [r6 + r4]
160
+    movu m6, [r7 + r5]
161
+    psubw m9, m6
162
+
163
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax    ; 16 bits
164
+    pmaddwd m0, m7
165
+    pmaddwd m1, m7
166
+    pmaddwd m2, m7
167
+    pmaddwd m8, m7
168
+    paddd m0, m1
169
+    paddd m2, m8
170
+    paddd m10, m0, m2
171
+
172
+    lea  r0, [r0+8*r1]
173
+    lea  r2, [r2+8*r3]
174
+    lea  r6, [r6+8*r1]
175
+    lea  r7, [r7+8*r3]
176
+
177
+    ; Bottom 16x8
178
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
179
+    movu m0, [r0]
180
+    movu m5, [r2]
181
+    psubw m0, m5
182
+    movu m1, [r0 + r1]
183
+    movu m6, [r2 + r3]
184
+    psubw m1, m6
185
+    movu m2, [r0 + r1 * 2]
186
+    movu m5, [r2 + r3 * 2]
187
+    psubw m2, m5
188
+    movu m8, [r0 + r4]
189
+    movu m6, [r2 + r5]
190
+    psubw m8, m6
191
+
192
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
193
+    movu m4, [r6]
194
+    movu m11, [r7]
195
+    psubw m4, m11
196
+    movu m5, [r6 + r1]
197
+    movu m6, [r7 + r3]
198
+    psubw m5, m6
199
+    movu m3, [r6 + r1 * 2]
200
+    movu m11, [r7 + r3 * 2]
201
x265_1.9.tar.gz/source/common/yuv.cpp -> x265_2.0.tar.gz/source/common/yuv.cpp Changed
23
 
1
@@ -163,14 +163,19 @@
2
     }
3
 }
4
 
5
-void Yuv::addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL)
6
+void Yuv::addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL, int picCsp)
7
 {
8
     primitives.cu[log2SizeL - 2].add_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
9
-    if (m_csp != X265_CSP_I400)
10
+    if (m_csp != X265_CSP_I400 && picCsp != X265_CSP_I400)
11
     {
12
         primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
13
         primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
14
     }
15
+    if (picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
16
+    {
17
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv0.m_csize);
18
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv0.m_csize);
19
+    }
20
 }
21
 
22
 void Yuv::addAvg(const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t absPartIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma)
23
x265_1.9.tar.gz/source/common/yuv.h -> x265_2.0.tar.gz/source/common/yuv.h Changed
10
 
1
@@ -73,7 +73,7 @@
2
     void   copyPartToYuv(Yuv& dstYuv, uint32_t absPartIdx) const;
3
 
4
     // Clip(srcYuv0 + srcYuv1) -> m_buf .. aka recon = clip(pred + residual)
5
-    void   addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL);
6
+    void   addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL, int picCsp);
7
 
8
     // (srcYuv0 + srcYuv1)/2 for YUV partition (bidir averaging)
9
     void   addAvg(const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t absPartIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma);
10
x265_1.9.tar.gz/source/compat/msvc/stdint.h -> x265_2.0.tar.gz/source/compat/msvc/stdint.h Changed
9
 
1
@@ -8,6 +8,7 @@
2
 #if !defined(UINT64_MAX)
3
 #include <limits.h>
4
 #define UINT64_MAX _UI64_MAX
5
+#define INT64_MAX _I64_MAX
6
 #define INT16_MAX  _I16_MAX
7
 #endif
8
 
9
x265_1.9.tar.gz/source/encoder/analysis.cpp -> x265_2.0.tar.gz/source/encoder/analysis.cpp Changed
201
 
1
@@ -74,14 +74,18 @@
2
 {
3
     m_reuseInterDataCTU = NULL;
4
     m_reuseRef = NULL;
5
-    m_reuseBestMergeCand = NULL;
6
-    m_reuseMv = NULL;
7
+    m_bHD = false;
8
 }
9
 bool Analysis::create(ThreadLocalData *tld)
10
 {
11
     m_tld = tld;
12
     m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;
13
-    m_bChromaSa8d = m_param->rdLevel >= 3;
14
+
15
+    int costArrSize = 1;
16
+    uint32_t maxDQPDepth = g_log2Size[m_param->maxCUSize] - g_log2Size[m_param->rc.qgSize];
17
+    for (uint32_t i = 1; i <= maxDQPDepth; i++)
18
+        costArrSize += (1 << (i * 2));
19
+    cacheCost = X265_MALLOC(uint64_t, costArrSize);
20
 
21
     int csp = m_param->internalCsp;
22
     uint32_t cuSize = g_maxCUSize;
23
@@ -102,6 +106,8 @@
24
             md.pred[j].fencYuv = &md.fencYuv;
25
         }
26
     }
27
+    if (m_param->sourceHeight >= 1080)
28
+        m_bHD = true;
29
 
30
     return ok;
31
 }
32
@@ -119,12 +125,14 @@
33
             m_modeDepth[i].pred[j].reconYuv.destroy();
34
         }
35
     }
36
+    X265_FREE(cacheCost);
37
 }
38
 
39
 Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
40
 {
41
     m_slice = ctu.m_slice;
42
     m_frame = &frame;
43
+    m_bChromaSa8d = m_param->rdLevel >= 3;
44
 
45
 #if _DEBUG || CHECKED_BUILD
46
     invalidateContexts(0);
47
@@ -142,8 +150,13 @@
48
         int numPredDir = m_slice->isInterP() ? 1 : 2;
49
         m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
50
         m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
51
-        m_reuseBestMergeCand = &m_reuseInterDataCTU->bestMergeCand[ctu.m_cuAddr * CUGeom::MAX_GEOMS];
52
-        m_reuseMv = &m_reuseInterDataCTU->mv[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
53
+        m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
54
+        m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
55
+        m_reusePartSize = &m_reuseInterDataCTU->partSize[ctu.m_cuAddr * ctu.m_numPartitions];
56
+        m_reuseMergeFlag = &m_reuseInterDataCTU->mergeFlag[ctu.m_cuAddr * ctu.m_numPartitions];
57
+        if (m_param->analysisMode == X265_ANALYSIS_SAVE)
58
+            for (int i = 0; i < X265_MAX_PRED_MODE_PER_CTU * numPredDir; i++)
59
+                m_reuseRef[i] = -1;
60
     }
61
     ProfileCUScope(ctu, totalCTUTime, totalCTUs);
62
 
63
@@ -158,14 +171,6 @@
64
             memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
65
         }
66
         compressIntraCU(ctu, cuGeom, qp);
67
-        if (m_param->analysisMode == X265_ANALYSIS_SAVE && intraDataCTU)
68
-        {
69
-            CUData* bestCU = &m_modeDepth[0].bestMode->cu;
70
-            memcpy(&intraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
71
-            memcpy(&intraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
72
-            memcpy(&intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
73
-            memcpy(&intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], bestCU->m_chromaIntraDir, sizeof(uint8_t) * numPartition);
74
-        }
75
     }
76
     else
77
     {
78
@@ -189,18 +194,12 @@
79
         else if (m_param->rdLevel <= 4)
80
             compressInterCU_rd0_4(ctu, cuGeom, qp);
81
         else
82
-        {
83
-            uint32_t zOrder = 0;
84
-            compressInterCU_rd5_6(ctu, cuGeom, zOrder, qp);
85
-            if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.interData)
86
-            {
87
-                CUData* bestCU = &m_modeDepth[0].bestMode->cu;
88
-                memcpy(&m_reuseInterDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
89
-                memcpy(&m_reuseInterDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_predMode, sizeof(uint8_t) * numPartition);
90
-            }
91
-        }
92
+            compressInterCU_rd5_6(ctu, cuGeom, qp);
93
     }
94
 
95
+    if (m_param->bEnableRdRefine)
96
+        qprdRefine(ctu, cuGeom, qp, qp);
97
+
98
     return *m_modeDepth[0].bestMode;
99
 }
100
 
101
@@ -229,6 +228,61 @@
102
     }
103
 }
104
 
105
+void Analysis::qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp)
106
+{
107
+    uint32_t depth = cuGeom.depth;
108
+    ModeDepth& md = m_modeDepth[depth];
109
+    md.bestMode = NULL;
110
+
111
+    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
112
+
113
+    int bestCUQP = qp;
114
+    int lambdaQP = lqp;
115
+
116
+    bool doQPRefine = (bDecidedDepth && depth <= m_slice->m_pps->maxCuDQPDepth) || (!bDecidedDepth && depth == m_slice->m_pps->maxCuDQPDepth);
117
+
118
+    if (doQPRefine)
119
+    {
120
+        uint64_t bestCUCost, origCUCost, cuCost, cuPrevCost;
121
+
122
+        int cuIdx = (cuGeom.childOffset - 1) / 3;
123
+        bestCUCost = origCUCost = cacheCost[cuIdx];
124
+
125
+        for (int dir = 2; dir >= -2; dir -= 4)
126
+        {
127
+            int threshold = 1;
128
+            int failure = 0;
129
+            cuPrevCost = origCUCost;
130
+
131
+            int modCUQP = qp + dir;
132
+            while (modCUQP >= QP_MIN && modCUQP <= QP_MAX_SPEC)
133
+            {
134
+                recodeCU(parentCTU, cuGeom, modCUQP, qp);
135
+                cuCost = md.bestMode->rdCost;
136
+
137
+                COPY2_IF_LT(bestCUCost, cuCost, bestCUQP, modCUQP);
138
+                if (cuCost < cuPrevCost)
139
+                    failure = 0;
140
+                else
141
+                    failure++;
142
+
143
+                if (failure > threshold)
144
+                    break;
145
+
146
+                cuPrevCost = cuCost;
147
+                modCUQP += dir;
148
+            }
149
+        }
150
+        lambdaQP = bestCUQP;
151
+    }
152
+
153
+    recodeCU(parentCTU, cuGeom, bestCUQP, lambdaQP);
154
+
155
+    /* Copy best data to encData CTU and recon */
156
+    md.bestMode->cu.copyToPic(depth);
157
+    md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
158
+}
159
+
160
 void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
161
 {
162
     uint32_t depth = cuGeom.depth;
163
@@ -334,6 +388,12 @@
164
         checkBestMode(*splitPred, depth);
165
     }
166
 
167
+    if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
168
+    {
169
+        int cuIdx = (cuGeom.childOffset - 1) / 3;
170
+        cacheCost[cuIdx] = md.bestMode->rdCost;
171
+    }
172
+
173
     /* Copy best data to encData CTU and recon */
174
     md.bestMode->cu.copyToPic(depth);
175
     if (md.bestMode != &md.pred[PRED_SPLIT])
176
@@ -377,6 +437,7 @@
177
         slave.m_slice = m_slice;
178
         slave.m_frame = m_frame;
179
         slave.m_param = m_param;
180
+        slave.m_bChromaSa8d = m_param->rdLevel >= 3;
181
         slave.setLambdaFromQP(md.pred[PRED_2Nx2N].cu, m_rdCost.m_qp);
182
         slave.invalidateContexts(0);
183
         slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur);
184
@@ -555,7 +616,7 @@
185
         if (m_param->rdLevel <= 4)
186
             checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
187
         else
188
-            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom, false);
189
+            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
190
     }
191
 
192
     bool bNoSplit = false;
193
@@ -827,8 +888,11 @@
194
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
195
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
196
     uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);
197
-    bool earlyskip = false;
198
+    bool skipModes = false; /* Skip any remaining mode analyses at current depth */
199
+    bool skipRecursion = false; /* Skip recursion */
200
     bool splitIntra = true;
201
x265_1.9.tar.gz/source/encoder/analysis.h -> x265_2.0.tar.gz/source/encoder/analysis.h Changed
56
 
1
@@ -108,6 +108,7 @@
2
     ModeDepth m_modeDepth[NUM_CU_DEPTH];
3
     bool      m_bTryLossless;
4
     bool      m_bChromaSa8d;
5
+    bool      m_bHD;
6
 
7
     Analysis();
8
 
9
@@ -117,12 +118,19 @@
10
     Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
11
 
12
 protected:
13
-    /* Analysis data for load/save modes, keeps getting incremented as CTU analysis proceeds and data is consumed or read */
14
+    /* Analysis data for save/load mode, writes/reads data based on absPartIdx */
15
     analysis_inter_data* m_reuseInterDataCTU;
16
-    MV*                  m_reuseMv;
17
     int32_t*             m_reuseRef;
18
-    uint32_t*            m_reuseBestMergeCand;
19
+    uint8_t*             m_reuseDepth;
20
+    uint8_t*             m_reuseModes;
21
+    uint8_t*             m_reusePartSize;
22
+    uint8_t*             m_reuseMergeFlag;
23
+
24
     uint32_t m_splitRefIdx[4];
25
+    uint64_t* cacheCost;
26
+
27
+    /* refine RD based on QP for rd-levels 5 and 6 */
28
+    void qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp);
29
 
30
     /* full analysis for an I-slice CU */
31
     void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
32
@@ -130,11 +138,13 @@
33
     /* full analysis for a P or B slice CU */
34
     uint32_t compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
35
     SplitData compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
36
-    SplitData compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
37
+    SplitData compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
38
+
39
+    void recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t origqp = -1);
40
 
41
     /* measure merge and skip */
42
     void checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom);
43
-    void checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom, bool isShareMergeCand);
44
+    void checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom);
45
 
46
     /* measure inter options */
47
     void checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2]);
48
@@ -151,6 +161,7 @@
49
     /* work-avoidance heuristics for RD levels < 5 */
50
     uint32_t topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom);
51
     bool recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode);
52
+    bool complexityCheckCU(const Mode& bestMode);
53
 
54
     /* generate residual and recon pixels for an entire CTU recursively (RD0) */
55
     void encodeResidue(const CUData& parentCTU, const CUGeom& cuGeom);
56
x265_1.9.tar.gz/source/encoder/api.cpp -> x265_2.0.tar.gz/source/encoder/api.cpp Changed
24
 
1
@@ -166,15 +166,20 @@
2
 
3
     x265_param save;
4
     Encoder* encoder = static_cast<Encoder*>(enc);
5
+    if (encoder->m_reconfigure) /* Reconfigure in progress */
6
+        return 1;
7
     memcpy(&save, encoder->m_latestParam, sizeof(x265_param));
8
     int ret = encoder->reconfigureParam(encoder->m_latestParam, param_in);
9
     if (ret)
10
+    {
11
         /* reconfigure failed, recover saved param set */
12
         memcpy(encoder->m_latestParam, &save, sizeof(x265_param));
13
+        ret = -1;
14
+    }
15
     else
16
     {
17
-        encoder->m_reconfigured = true;
18
-        x265_print_reconfigured_params(&save, encoder->m_latestParam);
19
+        encoder->m_reconfigure = true;
20
+        encoder->printReconfigureParams();
21
     }
22
     return ret;
23
 }
24
x265_1.9.tar.gz/source/encoder/dpb.cpp -> x265_2.0.tar.gz/source/encoder/dpb.cpp Changed
12
 
1
@@ -146,8 +146,8 @@
2
     // Mark pictures in m_piclist as unreferenced if they are not included in RPS
3
     applyReferencePictureSet(&slice->m_rps, pocCurr);
4
 
5
-    slice->m_numRefIdx[0] = X265_MIN(m_maxRefL0, slice->m_rps.numberOfNegativePictures); // Ensuring L0 contains just the -ve POC
6
-    slice->m_numRefIdx[1] = X265_MIN(m_maxRefL1, slice->m_rps.numberOfPositivePictures);
7
+    slice->m_numRefIdx[0] = X265_MIN(newFrame->m_param->maxNumReferences, slice->m_rps.numberOfNegativePictures); // Ensuring L0 contains just the -ve POC
8
+    slice->m_numRefIdx[1] = X265_MIN(newFrame->m_param->bBPyramid ? 2 : 1, slice->m_rps.numberOfPositivePictures);
9
     slice->setRefPicList(m_picList);
10
 
11
     X265_CHECK(slice->m_sliceType != B_SLICE || slice->m_numRefIdx[1], "B slice without L1 references (non-fatal)\n");
12
x265_1.9.tar.gz/source/encoder/dpb.h -> x265_2.0.tar.gz/source/encoder/dpb.h Changed
19
 
1
@@ -39,8 +39,6 @@
2
 
3
     int                m_lastIDR;
4
     int                m_pocCRA;
5
-    int                m_maxRefL0;
6
-    int                m_maxRefL1;
7
     int                m_bOpenGOP;
8
     bool               m_bRefreshPending;
9
     bool               m_bTemporalSublayer;
10
@@ -54,8 +52,6 @@
11
         m_pocCRA = 0;
12
         m_bRefreshPending = false;
13
         m_frameDataFreeList = NULL;
14
-        m_maxRefL0 = param->maxNumReferences;
15
-        m_maxRefL1 = param->bBPyramid ? 2 : 1;
16
         m_bOpenGOP = param->bOpenGOP;
17
         m_bTemporalSublayer = !!param->bEnableTemporalSubLayers;
18
     }
19
x265_1.9.tar.gz/source/encoder/encoder.cpp -> x265_2.0.tar.gz/source/encoder/encoder.cpp Changed
201
 
1
@@ -55,7 +55,7 @@
2
 Encoder::Encoder()
3
 {
4
     m_aborted = false;
5
-    m_reconfigured = false;
6
+    m_reconfigure = false;
7
     m_encodedFrameNum = 0;
8
     m_pocLast = -1;
9
     m_curEncoder = 0;
10
@@ -361,7 +361,10 @@
11
     }
12
 
13
     if (m_threadPool)
14
-        m_threadPool->stopWorkers();
15
+    {
16
+        for (int i = 0; i < m_numPools; i++)
17
+            m_threadPool[i].stopWorkers();
18
+    }
19
 }
20
 
21
 void Encoder::destroy()
22
@@ -508,12 +511,6 @@
23
 
24
     if (pic_in)
25
     {
26
-        if (pic_in->colorSpace != m_param->internalCsp)
27
-        {
28
-            x265_log(m_param, X265_LOG_ERROR, "Unsupported chroma subsampling (%d) on input\n",
29
-                     pic_in->colorSpace);
30
-            return -1;
31
-        }
32
         if (pic_in->bitDepth < 8 || pic_in->bitDepth > 16)
33
         {
34
             x265_log(m_param, X265_LOG_ERROR, "Input bit depth (%d) must be between 8 and 16\n",
35
@@ -525,7 +522,7 @@
36
         if (m_dpb->m_freeList.empty())
37
         {
38
             inFrame = new Frame;
39
-            x265_param* p = m_reconfigured? m_latestParam : m_param;
40
+            x265_param* p = m_reconfigure ? m_latestParam : m_param;
41
             if (inFrame->create(p, pic_in->quantOffsets))
42
             {
43
                 /* the first PicYuv created is asked to generate the CU and block unit offset
44
@@ -535,7 +532,7 @@
45
                 {
46
                     inFrame->m_fencPic->m_cuOffsetY = m_sps.cuOffsetY;
47
                     inFrame->m_fencPic->m_buOffsetY = m_sps.buOffsetY;
48
-                    if (pic_in->colorSpace != X265_CSP_I400)
49
+                    if (m_param->internalCsp != X265_CSP_I400)
50
                     {
51
                         inFrame->m_fencPic->m_cuOffsetC = m_sps.cuOffsetC;
52
                         inFrame->m_fencPic->m_buOffsetC = m_sps.buOffsetC;
53
@@ -555,7 +552,7 @@
54
                     {
55
                         m_sps.cuOffsetY = inFrame->m_fencPic->m_cuOffsetY;
56
                         m_sps.buOffsetY = inFrame->m_fencPic->m_buOffsetY;
57
-                        if (pic_in->colorSpace != X265_CSP_I400)
58
+                        if (m_param->internalCsp != X265_CSP_I400)
59
                         {
60
                             m_sps.cuOffsetC = inFrame->m_fencPic->m_cuOffsetC;
61
                             m_sps.cuOffsetY = inFrame->m_fencPic->m_cuOffsetY;
62
@@ -591,7 +588,7 @@
63
         inFrame->m_userData  = pic_in->userData;
64
         inFrame->m_pts       = pic_in->pts;
65
         inFrame->m_forceqp   = pic_in->forceqp;
66
-        inFrame->m_param     = m_reconfigured ? m_latestParam : m_param;
67
+        inFrame->m_param     = m_reconfigure ? m_latestParam : m_param;
68
         
69
         if (pic_in->quantOffsets != NULL)
70
         {
71
@@ -719,7 +716,7 @@
72
                     pic_out->analysisData.numPartitions = outFrame->m_analysisData.numPartitions;
73
                     pic_out->analysisData.interData = outFrame->m_analysisData.interData;
74
                     pic_out->analysisData.intraData = outFrame->m_analysisData.intraData;
75
-                    writeAnalysisFile(&pic_out->analysisData);
76
+                    writeAnalysisFile(&pic_out->analysisData, *outFrame->m_encData);
77
                     freeAnalysis(&pic_out->analysisData);
78
                 }
79
             }
80
@@ -780,6 +777,27 @@
81
                 if (m_rateControl->writeRateControlFrameStats(outFrame, &curEncoder->m_rce))
82
                     m_aborted = true;
83
 
84
+            if (pic_out && m_param->rc.bStatWrite)
85
+            {
86
+                /* m_rcData is allocated for every frame */
87
+                pic_out->rcData = outFrame->m_rcData;
88
+                outFrame->m_rcData->qpaRc = outFrame->m_encData->m_avgQpRc;
89
+                outFrame->m_rcData->qRceq = curEncoder->m_rce.qRceq;
90
+                outFrame->m_rcData->qpNoVbv = curEncoder->m_rce.qpNoVbv;
91
+                outFrame->m_rcData->coeffBits = outFrame->m_encData->m_frameStats.coeffBits;
92
+                outFrame->m_rcData->miscBits = outFrame->m_encData->m_frameStats.miscBits;
93
+                outFrame->m_rcData->mvBits = outFrame->m_encData->m_frameStats.mvBits;
94
+                outFrame->m_rcData->qScale = outFrame->m_rcData->newQScale = x265_qp2qScale(outFrame->m_encData->m_avgQpRc);
95
+                outFrame->m_rcData->poc = curEncoder->m_rce.poc;
96
+                outFrame->m_rcData->encodeOrder = curEncoder->m_rce.encodeOrder;
97
+                outFrame->m_rcData->sliceType = curEncoder->m_rce.sliceType;
98
+                outFrame->m_rcData->keptAsRef = curEncoder->m_rce.sliceType == B_SLICE && !IS_REFERENCED(outFrame) ? 0 : 1;
99
+                outFrame->m_rcData->qpAq = outFrame->m_encData->m_avgQpAq;
100
+                outFrame->m_rcData->iCuCount = outFrame->m_encData->m_frameStats.percent8x8Intra * m_rateControl->m_ncu;
101
+                outFrame->m_rcData->pCuCount = outFrame->m_encData->m_frameStats.percent8x8Inter * m_rateControl->m_ncu;
102
+                outFrame->m_rcData->skipCuCount = outFrame->m_encData->m_frameStats.percent8x8Skip  * m_rateControl->m_ncu;
103
+            }
104
+
105
             /* Allow this frame to be recycled if no frame encoders are using it for reference */
106
             if (!pic_out)
107
             {
108
@@ -800,16 +818,32 @@
109
             frameEnc = m_lookahead->getDecidedPicture();
110
         if (frameEnc && !pass)
111
         {
112
+            if (curEncoder->m_reconfigure)
113
+            {
114
+                /* One round robin cycle of FE reconfigure is complete */
115
+                /* Safe to copy m_latestParam to Encoder::m_param, encoder reconfigure complete */
116
+                for (int frameEncId = 0; frameEncId < m_param->frameNumThreads; frameEncId++)
117
+                    m_frameEncoder[frameEncId]->m_reconfigure = false;
118
+                memcpy (m_param, m_latestParam, sizeof(x265_param));
119
+                m_reconfigure = false;
120
+            }
121
+
122
+            /* Initiate reconfigure for this FE if necessary */
123
+            curEncoder->m_param = m_reconfigure ? m_latestParam : m_param;
124
+            curEncoder->m_reconfigure = m_reconfigure;
125
+
126
             /* give this frame a FrameData instance before encoding */
127
             if (m_dpb->m_frameDataFreeList)
128
             {
129
                 frameEnc->m_encData = m_dpb->m_frameDataFreeList;
130
                 m_dpb->m_frameDataFreeList = m_dpb->m_frameDataFreeList->m_freeListNext;
131
                 frameEnc->reinit(m_sps);
132
+                frameEnc->m_param = m_reconfigure ? m_latestParam : m_param;
133
+                frameEnc->m_encData->m_param = m_reconfigure ? m_latestParam : m_param;
134
             }
135
             else
136
             {
137
-                frameEnc->allocEncodeData(m_param, m_sps);
138
+                frameEnc->allocEncodeData(m_reconfigure ? m_latestParam : m_param, m_sps);
139
                 Slice* slice = frameEnc->m_encData->m_slice;
140
                 slice->m_sps = &m_sps;
141
                 slice->m_pps = &m_pps;
142
@@ -817,7 +851,7 @@
143
                 slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * NUM_4x4_PARTITIONS);
144
             }
145
 
146
-            curEncoder->m_rce.encodeOrder = m_encodedFrameNum++;
147
+            curEncoder->m_rce.encodeOrder = frameEnc->m_encodeOrder = m_encodedFrameNum++;
148
             if (m_bframeDelay)
149
             {
150
                 int64_t *prevReorderedPts = m_prevReorderedPts;
151
@@ -867,28 +901,23 @@
152
 int Encoder::reconfigureParam(x265_param* encParam, x265_param* param)
153
 {
154
     encParam->maxNumReferences = param->maxNumReferences; // never uses more refs than specified in stream headers
155
-    encParam->bEnableLoopFilter = param->bEnableLoopFilter;
156
-    encParam->deblockingFilterTCOffset = param->deblockingFilterTCOffset;
157
-    encParam->deblockingFilterBetaOffset = param->deblockingFilterBetaOffset;
158
     encParam->bEnableFastIntra = param->bEnableFastIntra;
159
     encParam->bEnableEarlySkip = param->bEnableEarlySkip;
160
-    encParam->bEnableTemporalMvp = param->bEnableTemporalMvp;
161
-    /* Scratch buffer prevents me_range from being increased for esa/tesa
162
-    if (param->searchMethod < X265_FULL_SEARCH || param->searchMethod < encParam->searchRange)
163
-        encParam->searchRange = param->searchRange; */
164
-    encParam->noiseReductionInter = param->noiseReductionInter;
165
-    encParam->noiseReductionIntra = param->noiseReductionIntra;
166
+    encParam->bEnableRecursionSkip = param->bEnableRecursionSkip;
167
+    encParam->searchMethod = param->searchMethod;
168
+    /* Scratch buffer prevents me_range from being increased for esa/tesa */
169
+    if (param->searchRange < encParam->searchRange)
170
+        encParam->searchRange = param->searchRange;
171
     /* We can't switch out of subme=0 during encoding. */
172
     if (encParam->subpelRefine)
173
         encParam->subpelRefine = param->subpelRefine;
174
     encParam->rdoqLevel = param->rdoqLevel;
175
     encParam->rdLevel = param->rdLevel;
176
-    encParam->bEnableTSkipFast = param->bEnableTSkipFast;
177
-    encParam->psyRd = param->psyRd;
178
-    encParam->psyRdoq = param->psyRdoq;
179
-    encParam->bEnableSignHiding = param->bEnableSignHiding;
180
-    encParam->bEnableFastIntra = param->bEnableFastIntra;
181
-    encParam->maxTUSize = param->maxTUSize;
182
+    encParam->bEnableRectInter = param->bEnableRectInter;
183
+    encParam->maxNumMergeCand = param->maxNumMergeCand;
184
+    encParam->bIntraInBFrames = param->bIntraInBFrames;
185
+    /* To add: Loop Filter/deblocking controls, transform skip, signhide require PPS to be resent */
186
+    /* To add: SAO, temporal MVP, AMP, TU depths require SPS to be resent, at every CVS boundary */
187
     return x265_check_params(encParam);
188
 }
189
 
190
@@ -1214,12 +1243,6 @@
191
 
192
         stats->maxCLL         = m_analyzeAll.m_maxCLL;
193
         stats->maxFALL        = (uint16_t)(m_analyzeAll.m_maxFALL / m_analyzeAll.m_numPics);
194
-
195
-        if (m_emitCLLSEI)
196
-        {
197
-            m_param->maxCLL = stats->maxCLL;
198
-            m_param->maxFALL = stats->maxFALL;
199
-        }
200
     }
201
x265_1.9.tar.gz/source/encoder/encoder.h -> x265_2.0.tar.gz/source/encoder/encoder.h Changed
45
 
1
@@ -74,6 +74,7 @@
2
 class Lookahead;
3
 class RateControl;
4
 class ThreadPool;
5
+class FrameData;
6
 
7
 class Encoder : public x265_encoder
8
 {
9
@@ -110,7 +111,7 @@
10
     Frame*             m_exportedPic;
11
     FILE*              m_analysisFile;
12
     x265_param*        m_param;
13
-    x265_param*        m_latestParam;
14
+    x265_param*        m_latestParam;     // Holds latest param during a reconfigure
15
     RateControl*       m_rateControl;
16
     Lookahead*         m_lookahead;
17
 
18
@@ -129,7 +130,7 @@
19
     bool               m_emitCLLSEI;
20
     bool               m_bZeroLatency;     // x265_encoder_encode() returns NALs for the input picture, zero lag
21
     bool               m_aborted;          // fatal error detected
22
-    bool               m_reconfigured;      // reconfigure of encoder detected
23
+    bool               m_reconfigure;      // Encoder reconfigure in progress
24
 
25
     /* Begin intra refresh when one not in progress or else begin one as soon as the current 
26
      * one is done. Requires bIntraRefresh to be set.*/
27
@@ -152,6 +153,8 @@
28
 
29
     void printSummary();
30
 
31
+    void printReconfigureParams();
32
+
33
     char* statsString(EncStats&, char*);
34
 
35
     void configure(x265_param *param);
36
@@ -164,7 +167,7 @@
37
 
38
     void readAnalysisFile(x265_analysis_data* analysis, int poc);
39
 
40
-    void writeAnalysisFile(x265_analysis_data* pic);
41
+    void writeAnalysisFile(x265_analysis_data* pic, FrameData &curEncData);
42
 
43
     void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, x265_frame_stats* frameStats, int inPoc);
44
 
45
x265_1.9.tar.gz/source/encoder/entropy.cpp -> x265_2.0.tar.gz/source/encoder/entropy.cpp Changed
201
 
1
@@ -38,6 +38,189 @@
2
 
3
 namespace X265_NS {
4
 
5
+// initial probability for cu_transquant_bypass flag
6
+static const uint8_t INIT_CU_TRANSQUANT_BYPASS_FLAG[3][NUM_TQUANT_BYPASS_FLAG_CTX] =
7
+{
8
+    { 154 },
9
+    { 154 },
10
+    { 154 },
11
+};
12
+
13
+// initial probability for split flag
14
+static const uint8_t INIT_SPLIT_FLAG[3][NUM_SPLIT_FLAG_CTX] =
15
+{
16
+    { 107,  139,  126, },
17
+    { 107,  139,  126, },
18
+    { 139,  141,  157, },
19
+};
20
+
21
+static const uint8_t INIT_SKIP_FLAG[3][NUM_SKIP_FLAG_CTX] =
22
+{
23
+    { 197,  185,  201, },
24
+    { 197,  185,  201, },
25
+    { CNU,  CNU,  CNU, },
26
+};
27
+
28
+static const uint8_t INIT_MERGE_FLAG_EXT[3][NUM_MERGE_FLAG_EXT_CTX] =
29
+{
30
+    { 154, },
31
+    { 110, },
32
+    { CNU, },
33
+};
34
+
35
+static const uint8_t INIT_MERGE_IDX_EXT[3][NUM_MERGE_IDX_EXT_CTX] =
36
+{
37
+    { 137, },
38
+    { 122, },
39
+    { CNU, },
40
+};
41
+
42
+static const uint8_t INIT_PART_SIZE[3][NUM_PART_SIZE_CTX] =
43
+{
44
+    { 154,  139,  154, 154 },
45
+    { 154,  139,  154, 154 },
46
+    { 184,  CNU,  CNU, CNU },
47
+};
48
+
49
+static const uint8_t INIT_PRED_MODE[3][NUM_PRED_MODE_CTX] =
50
+{
51
+    { 134, },
52
+    { 149, },
53
+    { CNU, },
54
+};
55
+
56
+static const uint8_t INIT_INTRA_PRED_MODE[3][NUM_ADI_CTX] =
57
+{
58
+    { 183, },
59
+    { 154, },
60
+    { 184, },
61
+};
62
+
63
+static const uint8_t INIT_CHROMA_PRED_MODE[3][NUM_CHROMA_PRED_CTX] =
64
+{
65
+    { 152,  139, },
66
+    { 152,  139, },
67
+    {  63,  139, },
68
+};
69
+
70
+static const uint8_t INIT_INTER_DIR[3][NUM_INTER_DIR_CTX] =
71
+{
72
+    {  95,   79,   63,   31,  31, },
73
+    {  95,   79,   63,   31,  31, },
74
+    { CNU,  CNU,  CNU,  CNU, CNU, },
75
+};
76
+
77
+static const uint8_t INIT_MVD[3][NUM_MV_RES_CTX] =
78
+{
79
+    { 169,  198, },
80
+    { 140,  198, },
81
+    { CNU,  CNU, },
82
+};
83
+
84
+static const uint8_t INIT_REF_PIC[3][NUM_REF_NO_CTX] =
85
+{
86
+    { 153,  153 },
87
+    { 153,  153 },
88
+    { CNU,  CNU },
89
+};
90
+
91
+static const uint8_t INIT_DQP[3][NUM_DELTA_QP_CTX] =
92
+{
93
+    { 154,  154,  154, },
94
+    { 154,  154,  154, },
95
+    { 154,  154,  154, },
96
+};
97
+
98
+static const uint8_t INIT_QT_CBF[3][NUM_QT_CBF_CTX] =
99
+{
100
+    { 153,  111,  149,   92,  167,  154,  154 },
101
+    { 153,  111,  149,  107,  167,  154,  154 },
102
+    { 111,  141,   94,  138,  182,  154,  154 },
103
+};
104
+
105
+static const uint8_t INIT_QT_ROOT_CBF[3][NUM_QT_ROOT_CBF_CTX] =
106
+{
107
+    {  79, },
108
+    {  79, },
109
+    { CNU, },
110
+};
111
+
112
+static const uint8_t INIT_LAST[3][NUM_CTX_LAST_FLAG_XY] =
113
+{
114
+    { 125,  110,  124,  110,   95,   94,  125,  111,  111,   79,  125,  126,  111,  111,   79,
115
+      108,  123,   93 },
116
+    { 125,  110,   94,  110,   95,   79,  125,  111,  110,   78,  110,  111,  111,   95,   94,
117
+      108,  123,  108 },
118
+    { 110,  110,  124,  125,  140,  153,  125,  127,  140,  109,  111,  143,  127,  111,   79,
119
+      108,  123,   63 },
120
+};
121
+
122
+static const uint8_t INIT_SIG_CG_FLAG[3][2 * NUM_SIG_CG_FLAG_CTX] =
123
+{
124
+    { 121,  140,
125
+      61,  154, },
126
+    { 121,  140,
127
+      61,  154, },
128
+    {  91,  171,
129
+       134,  141, },
130
+};
131
+
132
+static const uint8_t INIT_SIG_FLAG[3][NUM_SIG_FLAG_CTX] =
133
+{
134
+    { 170,  154,  139,  153,  139,  123,  123,   63,  124,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  170,  153,  138,  138,  122,  121,  122,  121,  167,  151,  183,  140,  151,  183,  140,  },
135
+    { 155,  154,  139,  153,  139,  123,  123,   63,  153,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  170,  153,  123,  123,  107,  121,  107,  121,  167,  151,  183,  140,  151,  183,  140,  },
136
+    { 111,  111,  125,  110,  110,   94,  124,  108,  124,  107,  125,  141,  179,  153,  125,  107,  125,  141,  179,  153,  125,  107,  125,  141,  179,  153,  125,  140,  139,  182,  182,  152,  136,  152,  136,  153,  136,  139,  111,  136,  139,  111,  },
137
+};
138
+
139
+static const uint8_t INIT_ONE_FLAG[3][NUM_ONE_FLAG_CTX] =
140
+{
141
+    { 154,  196,  167,  167,  154,  152,  167,  182,  182,  134,  149,  136,  153,  121,  136,  122,  169,  208,  166,  167,  154,  152,  167,  182, },
142
+    { 154,  196,  196,  167,  154,  152,  167,  182,  182,  134,  149,  136,  153,  121,  136,  137,  169,  194,  166,  167,  154,  167,  137,  182, },
143
+    { 140,   92,  137,  138,  140,  152,  138,  139,  153,   74,  149,   92,  139,  107,  122,  152,  140,  179,  166,  182,  140,  227,  122,  197, },
144
+};
145
+
146
+static const uint8_t INIT_ABS_FLAG[3][NUM_ABS_FLAG_CTX] =
147
+{
148
+    { 107,  167,   91,  107,  107,  167, },
149
+    { 107,  167,   91,  122,  107,  167, },
150
+    { 138,  153,  136,  167,  152,  152, },
151
+};
152
+
153
+static const uint8_t INIT_MVP_IDX[3][NUM_MVP_IDX_CTX] =
154
+{
155
+    { 168 },
156
+    { 168 },
157
+    { CNU },
158
+};
159
+
160
+static const uint8_t INIT_SAO_MERGE_FLAG[3][NUM_SAO_MERGE_FLAG_CTX] =
161
+{
162
+    { 153,  },
163
+    { 153,  },
164
+    { 153,  },
165
+};
166
+
167
+static const uint8_t INIT_SAO_TYPE_IDX[3][NUM_SAO_TYPE_IDX_CTX] =
168
+{
169
+    { 160, },
170
+    { 185, },
171
+    { 200, },
172
+};
173
+
174
+static const uint8_t INIT_TRANS_SUBDIV_FLAG[3][NUM_TRANS_SUBDIV_FLAG_CTX] =
175
+{
176
+    { 224,  167,  122, },
177
+    { 124,  138,   94, },
178
+    { 153,  138,  138, },
179
+};
180
+
181
+static const uint8_t INIT_TRANSFORMSKIP_FLAG[3][2 * NUM_TRANSFORMSKIP_FLAG_CTX] =
182
+{
183
+    { 139,  139 },
184
+    { 139,  139 },
185
+    { 139,  139 },
186
+};
187
+
188
 Entropy::Entropy()
189
 {
190
     markValid();
191
@@ -306,7 +489,7 @@
192
 {
193
     for (int sizeId = 0; sizeId < ScalingList::NUM_SIZES; sizeId++)
194
     {
195
-        for (int listId = 0; listId < ScalingList::NUM_LISTS; listId++)
196
+        for (int listId = 0; listId < ScalingList::NUM_LISTS; listId += (sizeId == 3) ? 3 : 1)
197
         {
198
             int predList = scalingList.checkPredMode(sizeId, listId);
199
             WRITE_FLAG(predList < 0, "scaling_list_pred_mode_flag");
200
@@ -334,12 +517,7 @@
201
x265_1.9.tar.gz/source/encoder/entropy.h -> x265_2.0.tar.gz/source/encoder/entropy.h Changed
25
 
1
@@ -162,13 +162,13 @@
2
 
3
     void codePartSize(const CUData& cu, uint32_t absPartIdx, uint32_t depth);
4
     void codePredInfo(const CUData& cu, uint32_t absPartIdx);
5
-    inline void codeQtCbfLuma(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth) { codeQtCbfLuma(cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth), tuDepth); }
6
 
7
     void codeQtCbfChroma(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t tuDepth, bool lowestLevel);
8
     void codeCoeff(const CUData& cu, uint32_t absPartIdx, bool& bCodeDQP, const uint32_t depthRange[2]);
9
     void codeCoeffNxN(const CUData& cu, const coeff_t* coef, uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype);
10
 
11
     inline void codeSaoMerge(uint32_t code)                          { encodeBin(code, m_contextState[OFF_SAO_MERGE_FLAG_CTX]); }
12
+    inline void codeSaoType(uint32_t code)                           { encodeBin(code, m_contextState[OFF_SAO_TYPE_IDX_CTX]); }
13
     inline void codeMVPIdx(uint32_t symbol)                          { encodeBin(symbol, m_contextState[OFF_MVP_IDX_CTX]); }
14
     inline void codeMergeFlag(const CUData& cu, uint32_t absPartIdx) { encodeBin(cu.m_mergeFlag[absPartIdx], m_contextState[OFF_MERGE_FLAG_EXT_CTX]); }
15
     inline void codeSkipFlag(const CUData& cu, uint32_t absPartIdx)  { encodeBin(cu.isSkipped(absPartIdx), m_contextState[OFF_SKIP_FLAG_CTX + cu.getCtxSkipFlag(absPartIdx)]); }
16
@@ -182,6 +182,8 @@
17
     inline void codeTransformSkipFlags(uint32_t transformSkip, TextType ttype) { encodeBin(transformSkip, m_contextState[OFF_TRANSFORMSKIP_FLAG_CTX + (ttype ? NUM_TRANSFORMSKIP_FLAG_CTX : 0)]); }
18
     void codeDeltaQP(const CUData& cu, uint32_t absPartIdx);
19
     void codeSaoOffset(const SaoCtuParam& ctuParam, int plane);
20
+    void codeSaoOffsetEO(int *offset, int typeIdx, int plane);
21
+    void codeSaoOffsetBO(int *offset, int bandPos, int plane);
22
 
23
     /* RDO functions */
24
     void estBit(EstBitsSbac& estBitsSbac, uint32_t log2TrSize, bool bIsLuma) const;
25
x265_1.9.tar.gz/source/encoder/frameencoder.cpp -> x265_2.0.tar.gz/source/encoder/frameencoder.cpp Changed
191
 
1
@@ -41,6 +41,7 @@
2
 FrameEncoder::FrameEncoder()
3
 {
4
     m_prevOutputTime = x265_mdate();
5
+    m_reconfigure = false;
6
     m_isFrameEncoder = true;
7
     m_threadActive = true;
8
     m_slicetypeWaitTime = 0;
9
@@ -104,6 +105,7 @@
10
     m_param = top->m_param;
11
     m_numRows = numRows;
12
     m_numCols = numCols;
13
+    m_reconfigure = false;
14
     m_filterRowDelay = ((m_param->bEnableSAO && m_param->bSaoNonDeblocked)
15
                         || (!m_param->bEnableLoopFilter && m_param->bEnableSAO)) ?
16
                         2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 1 : 0);
17
@@ -213,7 +215,6 @@
18
 {
19
     m_slicetypeWaitTime = x265_mdate() - m_prevOutputTime;
20
     m_frame = curFrame;
21
-    m_param = curFrame->m_param;
22
     m_sliceType = curFrame->m_lowres.sliceType;
23
     curFrame->m_encData->m_frameEncoderID = m_jpId;
24
     curFrame->m_encData->m_jobProvider = this;
25
@@ -333,18 +334,40 @@
26
     // Weighted Prediction parameters estimation.
27
     bool bUseWeightP = slice->m_sliceType == P_SLICE && slice->m_pps->bUseWeightPred;
28
     bool bUseWeightB = slice->m_sliceType == B_SLICE && slice->m_pps->bUseWeightedBiPred;
29
+
30
+    WeightParam* reuseWP = NULL;
31
+    if (m_param->analysisMode && (bUseWeightP || bUseWeightB))
32
+        reuseWP = ((analysis_inter_data*)m_frame->m_analysisData.interData)->wt;
33
+
34
     if (bUseWeightP || bUseWeightB)
35
     {
36
 #if DETAILED_CU_STATS
37
         m_cuStats.countWeightAnalyze++;
38
         ScopedElapsedTime time(m_cuStats.weightAnalyzeTime);
39
 #endif
40
-        WeightAnalysis wa(*this);
41
-        if (m_pool && wa.tryBondPeers(*this, 1))
42
-            /* use an idle worker for weight analysis */
43
-            wa.waitForExit();
44
+        if (m_param->analysisMode == X265_ANALYSIS_LOAD)
45
+        {
46
+            for (int list = 0; list < slice->isInterB() + 1; list++) 
47
+            {
48
+                for (int plane = 0; plane < (m_param->internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
49
+                {
50
+                    for (int ref = 1; ref < slice->m_numRefIdx[list]; ref++)
51
+                        SET_WEIGHT(slice->m_weightPredTable[list][ref][plane], false, 1 << reuseWP->log2WeightDenom, reuseWP->log2WeightDenom, 0);
52
+                    slice->m_weightPredTable[list][0][plane] = *(reuseWP++);
53
+                }
54
+            }
55
+        }
56
         else
57
-            weightAnalyse(*slice, *m_frame, *m_param);
58
+        {
59
+            WeightAnalysis wa(*this);
60
+            if (m_pool && wa.tryBondPeers(*this, 1))
61
+                /* use an idle worker for weight analysis */
62
+                wa.waitForExit();
63
+            else
64
+                weightAnalyse(*slice, *m_frame, *m_param);
65
+
66
+        }
67
+
68
     }
69
     else
70
         slice->disableWeights();
71
@@ -361,6 +384,12 @@
72
             slice->m_refReconPicList[l][ref] = slice->m_refFrameList[l][ref]->m_reconPic;
73
             m_mref[l][ref].init(slice->m_refReconPicList[l][ref], w, *m_param);
74
         }
75
+        if (m_param->analysisMode == X265_ANALYSIS_SAVE && (bUseWeightP || bUseWeightB))
76
+        {
77
+            for (int i = 0; i < (m_param->internalCsp != X265_CSP_I400 ? 3 : 1); i++)
78
+                *(reuseWP++) = slice->m_weightPredTable[l][0][i];
79
+        }
80
+
81
     }
82
 
83
     int numTLD;
84
@@ -371,6 +400,7 @@
85
 
86
     /* Get the QP for this frame from rate control. This call may block until
87
      * frames ahead of it in encode order have called rateControlEnd() */
88
+    m_rce.encodeOrder = m_frame->m_encodeOrder;
89
     int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
90
     m_rce.newQp = qp;
91
 
92
@@ -409,7 +439,7 @@
93
 
94
     m_initSliceContext.resetEntropy(*slice);
95
 
96
-    m_frameFilter.start(m_frame, m_initSliceContext, qp);
97
+    m_frameFilter.start(m_frame, m_initSliceContext);
98
 
99
     /* ensure all rows are blocked prior to initializing row CTU counters */
100
     WaveFront::clearEnabledRowMask();
101
@@ -969,44 +999,48 @@
102
         /* Deblock with idle threading */
103
         if (m_param->bEnableLoopFilter | m_param->bEnableSAO)
104
         {
105
-            // TODO: Multiple Threading
106
-            // Delay ONE row to avoid Intra Prediction Conflict
107
-            if (m_pool && (row >= 1))
108
+            // NOTE: in VBV mode, we may reencode anytime, so we can't do Deblock stage-Horizon and SAO
109
+            if (!bIsVbv)
110
             {
111
-                // Waitting last threading finish
112
-                m_frameFilter.m_parallelFilter[row - 1].waitForExit();
113
+                // TODO: Multiple Threading
114
+                // Delay ONE row to avoid Intra Prediction Conflict
115
+                if (m_pool && (row >= 1))
116
+                {
117
+                    // Waitting last threading finish
118
+                    m_frameFilter.m_parallelFilter[row - 1].waitForExit();
119
 
120
-                // Processing new group
121
-                int allowCol = col;
122
+                    // Processing new group
123
+                    int allowCol = col;
124
 
125
-                // avoid race condition on last column
126
-                if (row >= 2)
127
-                {
128
-                    allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get()
129
-                                                              : m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get()), (int)col);
130
+                    // avoid race condition on last column
131
+                    if (row >= 2)
132
+                    {
133
+                        allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get()
134
+                                                                  : m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get()), (int)col);
135
+                    }
136
+                    m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol);
137
+                    m_frameFilter.m_parallelFilter[row - 1].tryBondPeers(*this, 1);
138
                 }
139
-                m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol);
140
-                m_frameFilter.m_parallelFilter[row - 1].tryBondPeers(*this, 1);
141
-            }
142
 
143
-            // Last Row may start early
144
-            if (m_pool && (row == m_numRows - 1))
145
-            {
146
-                // Waiting for the last thread to finish
147
-                m_frameFilter.m_parallelFilter[row].waitForExit();
148
+                // Last Row may start early
149
+                if (m_pool && (row == m_numRows - 1))
150
+                {
151
+                    // Waiting for the last thread to finish
152
+                    m_frameFilter.m_parallelFilter[row].waitForExit();
153
 
154
-                // Deblocking last row
155
-                int allowCol = col;
156
+                    // Deblocking last row
157
+                    int allowCol = col;
158
 
159
-                // avoid race condition on last column
160
-                if (row >= 2)
161
-                {
162
-                    allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get()
163
-                                                              : m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get()), (int)col);
164
+                    // avoid race condition on last column
165
+                    if (row >= 2)
166
+                    {
167
+                        allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get()
168
+                                                                  : m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get()), (int)col);
169
+                    }
170
+                    m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol);
171
+                    m_frameFilter.m_parallelFilter[row].tryBondPeers(*this, 1);
172
                 }
173
-                m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol);
174
-                m_frameFilter.m_parallelFilter[row].tryBondPeers(*this, 1);
175
-            }
176
+            } // end of !bIsVbv
177
         }
178
         // Both Loopfilter and SAO Disabled
179
         else
180
@@ -1179,7 +1213,9 @@
181
     uint32_t rowCount = 0;
182
     if (m_param->rc.rateControlMode == X265_RC_ABR || bIsVbv)
183
     {
184
-        if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom))
185
+        if (!m_rce.encodeOrder)
186
+            rowCount = m_numRows - 1;
187
+        else if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom))
188
             rowCount = X265_MIN((m_numRows + 1) / 2, m_numRows - 1);
189
         else
190
             rowCount = X265_MIN(m_refLagRows, m_numRows - 1);
191
x265_1.9.tar.gz/source/encoder/frameencoder.h -> x265_2.0.tar.gz/source/encoder/frameencoder.h Changed
10
 
1
@@ -129,7 +129,7 @@
2
     Event                    m_done;
3
     Event                    m_completionEvent;
4
     int                      m_localTldIdx;
5
-
6
+    bool                     m_reconfigure; /* reconfigure in progress */
7
     volatile bool            m_threadActive;
8
     volatile bool            m_bAllRowsStop;
9
     volatile int             m_completionCount;
10
x265_1.9.tar.gz/source/encoder/framefilter.cpp -> x265_2.0.tar.gz/source/encoder/framefilter.cpp Changed
126
 
1
@@ -54,7 +54,7 @@
2
 
3
 void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols)
4
 {
5
-    m_param = top->m_param;
6
+    m_param = frame->m_param;
7
     m_frameEncoder = frame;
8
     m_numRows = numRows;
9
     m_numCols = numCols;
10
@@ -103,7 +103,7 @@
11
 
12
 }
13
 
14
-void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
15
+void FrameFilter::start(Frame *frame, Entropy& initState)
16
 {
17
     m_frame = frame;
18
 
19
@@ -113,7 +113,7 @@
20
         for(int row = 0; row < m_numRows; row++)
21
         {
22
             if (m_param->bEnableSAO)
23
-                m_parallelFilter[row].m_sao.startSlice(frame, initState, qp);
24
+                m_parallelFilter[row].m_sao.startSlice(frame, initState);
25
 
26
             m_parallelFilter[row].m_lastCol.set(0);
27
             m_parallelFilter[row].m_allowedCol.set(0);
28
@@ -198,14 +198,14 @@
29
     }
30
 }
31
 
32
-void FrameFilter::ParallelFilter::processSaoUnitCu(SAOParam *saoParam, int col)
33
+void FrameFilter::ParallelFilter::processSaoCTU(SAOParam *saoParam, int col)
34
 {
35
     // TODO: apply SAO on CU and copy back soon, is it necessary?
36
     if (saoParam->bSaoFlag[0])
37
-        m_sao.processSaoUnitCuLuma(saoParam->ctuParam[0], m_row, col);
38
+        m_sao.generateLumaOffsets(saoParam->ctuParam[0], m_row, col);
39
 
40
     if (saoParam->bSaoFlag[1])
41
-        m_sao.processSaoUnitCuChroma(saoParam->ctuParam, m_row, col);
42
+        m_sao.generateChromaOffsets(saoParam->ctuParam, m_row, col);
43
 
44
     if (m_encData->m_slice->m_pps->bTransquantBypassEnabled)
45
     {
46
@@ -320,11 +320,14 @@
47
     const uint32_t* ctuGeomMap = m_frameFilter->m_frameEncoder->m_ctuGeomMap;
48
     PicYuv* reconPic = m_encData->m_reconPic;
49
     const int colStart = m_lastCol.get();
50
-    // TODO: Waiting previous row finish or simple clip on it?
51
-    const int colEnd = m_allowedCol.get();
52
     const int numCols = m_frameFilter->m_numCols;
53
+    // TODO: Waiting previous row finish or simple clip on it?
54
+    int colEnd = m_allowedCol.get();
55
 
56
     // Avoid threading conflict
57
+    if (m_prevRow && colEnd > m_prevRow->m_lastDeblocked.get())
58
+        colEnd = m_prevRow->m_lastDeblocked.get();
59
+
60
     if (colStart >= colEnd)
61
         return;
62
 
63
@@ -368,7 +371,7 @@
64
                 if (m_row >= 1 && col >= 3)
65
                 {
66
                     // Must delay 1 row to avoid thread data race conflict
67
-                    m_prevRow->processSaoUnitCu(saoParam, col - 3);
68
+                    m_prevRow->processSaoCTU(saoParam, col - 3);
69
                     m_prevRow->processPostCu(col - 3);
70
                 }
71
             }
72
@@ -409,19 +412,19 @@
73
             // Process Previous Rows SAO CU
74
             if (m_row >= 1 && numCols >= 3)
75
             {
76
-                m_prevRow->processSaoUnitCu(saoParam, numCols - 3);
77
+                m_prevRow->processSaoCTU(saoParam, numCols - 3);
78
                 m_prevRow->processPostCu(numCols - 3);
79
             }
80
 
81
             if (m_row >= 1 && numCols >= 2)
82
             {
83
-                m_prevRow->processSaoUnitCu(saoParam, numCols - 2);
84
+                m_prevRow->processSaoCTU(saoParam, numCols - 2);
85
                 m_prevRow->processPostCu(numCols - 2);
86
             }
87
 
88
             if (m_row >= 1 && numCols >= 1)
89
             {
90
-                m_prevRow->processSaoUnitCu(saoParam, numCols - 1);
91
+                m_prevRow->processSaoCTU(saoParam, numCols - 1);
92
                 m_prevRow->processPostCu(numCols - 1);
93
             }
94
 
95
@@ -475,7 +478,7 @@
96
                 for(int col = 0; col < m_numCols; col++)
97
                 {
98
                     // NOTE: must use processSaoUnitCu(), it include TQBypass logic
99
-                    m_parallelFilter[row].processSaoUnitCu(saoParam, col);
100
+                    m_parallelFilter[row].processSaoCTU(saoParam, col);
101
                 }
102
             }
103
 
104
@@ -550,10 +553,10 @@
105
         pixel *fenc = m_frame->m_fencPic->m_picOrg[0];
106
         intptr_t stride1 = reconPic->m_stride;
107
         intptr_t stride2 = m_frame->m_fencPic->m_stride;
108
-        uint32_t bEnd = ((row + 1) == (this->m_numRows - 1));
109
+        uint32_t bEnd = ((row) == (this->m_numRows - 1));
110
         uint32_t bStart = (row == 0);
111
         uint32_t minPixY = row * g_maxCUSize - 4 * !bStart;
112
-        uint32_t maxPixY = (row + 1) * g_maxCUSize - 4 * !bEnd;
113
+        uint32_t maxPixY = X265_MIN((row + 1) * g_maxCUSize - 4 * !bEnd, (uint32_t)m_param->sourceHeight);
114
         uint32_t ssim_cnt;
115
         x265_emms();
116
 
117
@@ -723,7 +726,7 @@
118
         {
119
             std::swap(sum0, sum1);
120
             for (uint32_t x = 0; x < width; x += 2)
121
-                primitives.ssim_4x4x2_core(&pix1[(4 * x + (z * stride1))], stride1, &pix2[(4 * x + (z * stride2))], stride2, &sum0[x]);
122
+                primitives.ssim_4x4x2_core(&pix1[4 * (x + (z * stride1))], stride1, &pix2[4 * (x + (z * stride2))], stride2, &sum0[x]);
123
         }
124
 
125
         for (uint32_t x = 0; x < width - 1; x += 4)
126
x265_1.9.tar.gz/source/encoder/framefilter.h -> x265_2.0.tar.gz/source/encoder/framefilter.h Changed
19
 
1
@@ -90,7 +90,7 @@
2
         void processTasks(int workerThreadId);
3
 
4
         // Apply SAO on a CU in current row
5
-        void processSaoUnitCu(SAOParam *saoParam, int col);
6
+        void processSaoCTU(SAOParam *saoParam, int col);
7
 
8
         // Copy and Save SAO reference pixels for SAO Rdo decide
9
         void copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col);
10
@@ -127,7 +127,7 @@
11
     void init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols);
12
     void destroy();
13
 
14
-    void start(Frame *pic, Entropy& initState, int qp);
15
+    void start(Frame *pic, Entropy& initState);
16
 
17
     void processRow(int row);
18
     void processPostRow(int row);
19
x265_1.9.tar.gz/source/encoder/level.cpp -> x265_2.0.tar.gz/source/encoder/level.cpp Changed
183
 
1
@@ -131,6 +131,14 @@
2
         vps.ptl.levelIdc = Level::LEVEL8_5;
3
         vps.ptl.tierFlag = Level::MAIN;
4
     }
5
+    else if (param.uhdBluray)
6
+    {
7
+        i = 8;
8
+        vps.ptl.levelIdc = levels[i].levelEnum;
9
+        vps.ptl.tierFlag = Level::HIGH;
10
+        vps.ptl.minCrForLevel = levels[i].minCompressionRatio;
11
+        vps.ptl.maxLumaSrForLevel = levels[i].maxLumaSamplesPerSecond;
12
+    }
13
     else for (i = 0; i < NumLevels; i++)
14
     {
15
         if (lumaSamples > levels[i].maxLumaSamples)
16
@@ -145,8 +153,10 @@
17
             continue;
18
         else if (param.sourceHeight > sqrt(levels[i].maxLumaSamples * 8.0f))
19
             continue;
20
-
21
+        else if (param.levelIdc && param.levelIdc != levels[i].levelIdc)
22
+            continue;
23
         uint32_t maxDpbSize = MaxDpbPicBuf;
24
+
25
         if (lumaSamples <= (levels[i].maxLumaSamples >> 2))
26
             maxDpbSize = X265_MIN(4 * MaxDpbPicBuf, 16);
27
         else if (lumaSamples <= (levels[i].maxLumaSamples >> 1))
28
@@ -188,7 +198,7 @@
29
             CHECK_RANGE((uint32_t)param.rc.vbvBufferSize, levels[i].maxCpbSizeMain, levels[i].maxCpbSizeHigh))
30
         {
31
             /* The bitrate or buffer size are out of range for Main tier, but in
32
-             * range for High tier. If the user requested High tier then give
33
+             * range for High tier. If the user allowed High tier then give
34
              * them High tier at this level.  Otherwise allow the loop to
35
              * progress to the Main tier of the next level */
36
             if (param.bHighTier)
37
@@ -279,7 +289,7 @@
38
 bool enforceLevel(x265_param& param, VPS& vps)
39
 {
40
     vps.numReorderPics = (param.bBPyramid && param.bframes > 1) ? 2 : !!param.bframes;
41
-    vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 2, (uint32_t)param.maxNumReferences) + vps.numReorderPics);
42
+    vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 2, (uint32_t)param.maxNumReferences) + 1);
43
 
44
     /* no level specified by user, just auto-detect from the configuration */
45
     if (param.levelIdc <= 0)
46
@@ -290,17 +300,14 @@
47
         level++;
48
     if (levels[level].levelIdc != param.levelIdc)
49
     {
50
-        x265_log(&param, X265_LOG_WARNING, "specified level %d does not exist\n", param.levelIdc);
51
+        x265_log(&param, X265_LOG_ERROR, "specified level %d does not exist\n", param.levelIdc);
52
         return false;
53
     }
54
 
55
     LevelSpec& l = levels[level];
56
-    bool highTier = !!param.bHighTier;
57
-    if (highTier && l.maxBitrateHigh == MAX_UINT)
58
-    {
59
-        highTier = false;
60
-        x265_log(&param, X265_LOG_WARNING, "Level %s has no High tier, using Main tier\n", l.name);
61
-    }
62
+
63
+    //highTier is allowed for this level and has not been explicitly disabled. This does not mean it is the final chosen tier
64
+    bool allowHighTier = l.maxBitrateHigh < MAX_UINT && param.bHighTier;
65
 
66
     uint32_t lumaSamples = param.sourceWidth * param.sourceHeight;
67
     uint32_t samplesPerSec = (uint32_t)(lumaSamples * ((double)param.fpsNum / param.fpsDenom));
68
@@ -313,47 +320,51 @@
69
         ok = false;
70
     if (!ok)
71
     {
72
-        x265_log(&param, X265_LOG_WARNING, "picture dimensions are out of range for specified level\n");
73
+        x265_log(&param, X265_LOG_ERROR, "picture dimensions are out of range for specified level\n");
74
         return false;
75
     }
76
     else if (samplesPerSec > l.maxLumaSamplesPerSecond)
77
     {
78
-        x265_log(&param, X265_LOG_WARNING, "frame rate is out of range for specified level\n");
79
+        x265_log(&param, X265_LOG_ERROR, "frame rate is out of range for specified level\n");
80
         return false;
81
     }
82
 
83
-    if ((uint32_t)param.rc.vbvMaxBitrate > (highTier ? l.maxBitrateHigh : l.maxBitrateMain))
84
+    /* Adjustments of Bitrate, VBV buffer size, refs will be triggered only if specified params do not fit 
85
+     * within the max limits of that level (high tier if allowed, main otherwise)
86
+     */
87
+
88
+    if ((uint32_t)param.rc.vbvMaxBitrate > (allowHighTier ? l.maxBitrateHigh : l.maxBitrateMain))
89
     {
90
-        param.rc.vbvMaxBitrate = highTier ? l.maxBitrateHigh : l.maxBitrateMain;
91
-        x265_log(&param, X265_LOG_INFO, "lowering VBV max bitrate to %dKbps\n", param.rc.vbvMaxBitrate);
92
+        param.rc.vbvMaxBitrate = allowHighTier ? l.maxBitrateHigh : l.maxBitrateMain;
93
+        x265_log(&param, X265_LOG_WARNING, "lowering VBV max bitrate to %dKbps\n", param.rc.vbvMaxBitrate);
94
     }
95
-    if ((uint32_t)param.rc.vbvBufferSize > (highTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain))
96
+    if ((uint32_t)param.rc.vbvBufferSize > (allowHighTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain))
97
     {
98
-        param.rc.vbvBufferSize = highTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain;
99
-        x265_log(&param, X265_LOG_INFO, "lowering VBV buffer size to %dKb\n", param.rc.vbvBufferSize);
100
+        param.rc.vbvBufferSize = allowHighTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain;
101
+        x265_log(&param, X265_LOG_WARNING, "lowering VBV buffer size to %dKb\n", param.rc.vbvBufferSize);
102
     }
103
 
104
     switch (param.rc.rateControlMode)
105
     {
106
     case X265_RC_ABR:
107
-        if ((uint32_t)param.rc.bitrate > (highTier ? l.maxBitrateHigh : l.maxBitrateMain))
108
+        if ((uint32_t)param.rc.bitrate > (allowHighTier ? l.maxBitrateHigh : l.maxBitrateMain))
109
         {
110
-            param.rc.bitrate = l.maxBitrateHigh;
111
-            x265_log(&param, X265_LOG_INFO, "lowering target bitrate to High tier limit of %dKbps\n", param.rc.bitrate);
112
+            param.rc.bitrate =  allowHighTier ? l.maxBitrateHigh : l.maxBitrateMain;
113
+            x265_log(&param, X265_LOG_WARNING, "lowering target bitrate to High tier limit of %dKbps\n", param.rc.bitrate);
114
         }
115
         break;
116
 
117
     case X265_RC_CQP:
118
-        x265_log(&param, X265_LOG_WARNING, "Constant QP is inconsistent with specifying a decoder level, no bitrate guarantee is possible.\n");
119
+        x265_log(&param, X265_LOG_ERROR, "Constant QP is inconsistent with specifying a decoder level, no bitrate guarantee is possible.\n");
120
         return false;
121
 
122
     case X265_RC_CRF:
123
         if (!param.rc.vbvBufferSize || !param.rc.vbvMaxBitrate)
124
         {
125
             if (!param.rc.vbvMaxBitrate)
126
-                param.rc.vbvMaxBitrate = highTier ? l.maxBitrateHigh : l.maxBitrateMain;
127
+                param.rc.vbvMaxBitrate = allowHighTier ? l.maxBitrateHigh : l.maxBitrateMain;
128
             if (!param.rc.vbvBufferSize)
129
-                param.rc.vbvBufferSize = highTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain;
130
+                param.rc.vbvBufferSize = allowHighTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain;
131
             x265_log(&param, X265_LOG_WARNING, "Specifying a decoder level with constant rate factor rate-control requires\n");
132
             x265_log(&param, X265_LOG_WARNING, "enabling VBV with vbv-bufsize=%dkb vbv-maxrate=%dkbps. VBV outputs are non-deterministic!\n",
133
                      param.rc.vbvBufferSize, param.rc.vbvMaxBitrate);
134
@@ -368,27 +379,30 @@
135
     /* The value of sps_max_dec_pic_buffering_minus1[ HighestTid ] + 1 shall be less than or equal to MaxDpbSize */
136
     const uint32_t MaxDpbPicBuf = 6;
137
     uint32_t maxDpbSize = MaxDpbPicBuf;
138
-    if (lumaSamples <= (l.maxLumaSamples >> 2))
139
-        maxDpbSize = X265_MIN(4 * MaxDpbPicBuf, 16);
140
-    else if (lumaSamples <= (l.maxLumaSamples >> 1))
141
-        maxDpbSize = X265_MIN(2 * MaxDpbPicBuf, 16);
142
-    else if (lumaSamples <= ((3 * l.maxLumaSamples) >> 2))
143
-        maxDpbSize = X265_MIN((4 * MaxDpbPicBuf) / 3, 16);
144
+    if (!param.uhdBluray) /* Do not change MaxDpbPicBuf for UHD-Bluray */
145
+    {
146
+        if (lumaSamples <= (l.maxLumaSamples >> 2))
147
+            maxDpbSize = X265_MIN(4 * MaxDpbPicBuf, 16);
148
+        else if (lumaSamples <= (l.maxLumaSamples >> 1))
149
+            maxDpbSize = X265_MIN(2 * MaxDpbPicBuf, 16);
150
+        else if (lumaSamples <= ((3 * l.maxLumaSamples) >> 2))
151
+            maxDpbSize = X265_MIN((4 * MaxDpbPicBuf) / 3, 16);
152
+    }
153
 
154
     int savedRefCount = param.maxNumReferences;
155
     while (vps.maxDecPicBuffering > maxDpbSize && param.maxNumReferences > 1)
156
     {
157
         param.maxNumReferences--;
158
-        vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 1, (uint32_t)param.maxNumReferences) + vps.numReorderPics);
159
+        vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 1, (uint32_t)param.maxNumReferences) + 1);
160
     }
161
     if (param.maxNumReferences != savedRefCount)
162
-        x265_log(&param, X265_LOG_INFO, "Lowering max references to %d to meet level requirement\n", param.maxNumReferences);
163
+        x265_log(&param, X265_LOG_WARNING, "Lowering max references to %d to meet level requirement\n", param.maxNumReferences);
164
 
165
     /* For level 5 and higher levels, the value of CtbSizeY shall be equal to 32 or 64 */
166
     if (param.levelIdc >= 50 && param.maxCUSize < 32)
167
     {
168
         param.maxCUSize = 32;
169
-        x265_log(&param, X265_LOG_INFO, "Levels 5.0 and above require a maximum CTU size of at least 32, using --ctu 32\n");
170
+        x265_log(&param, X265_LOG_WARNING, "Levels 5.0 and above require a maximum CTU size of at least 32, using --ctu 32\n");
171
     }
172
 
173
     /* The value of NumPocTotalCurr shall be less than or equal to 8 */
174
@@ -396,7 +410,7 @@
175
     if (numPocTotalCurr > 8)
176
     {
177
         param.maxNumReferences = 8 - !!param.bframes;
178
-        x265_log(&param, X265_LOG_INFO, "Lowering max references to %d to meet numPocTotalCurr requirement\n", param.maxNumReferences);
179
+        x265_log(&param, X265_LOG_WARNING, "Lowering max references to %d to meet numPocTotalCurr requirement\n", param.maxNumReferences);
180
     }
181
 
182
     return true;
183
x265_1.9.tar.gz/source/encoder/motion.cpp -> x265_2.0.tar.gz/source/encoder/motion.cpp Changed
186
 
1
@@ -111,10 +111,8 @@
2
     chromaSatd = NULL;
3
 }
4
 
5
-void MotionEstimate::init(int method, int refine, int csp)
6
+void MotionEstimate::init(int csp)
7
 {
8
-    searchMethod = method;
9
-    subpelRefine = refine;
10
     fencPUYuv.create(FENC_STRIDE, csp);
11
 }
12
 
13
@@ -162,7 +160,7 @@
14
 }
15
 
16
 /* Called by lookahead, luma only, no use of PicYuv */
17
-void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight)
18
+void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int refine)
19
 {
20
     partEnum = partitionFromSizes(pwidth, pheight);
21
     X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
22
@@ -175,13 +173,17 @@
23
     blockOffset = offset;
24
     absPartIdx = ctuAddr = -1;
25
 
26
+    /* Search params */
27
+    searchMethod = method;
28
+    subpelRefine = refine;
29
+
30
     /* copy PU block into cache */
31
     primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
32
     X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
33
 }
34
 
35
 /* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
36
-void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight)
37
+void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int method, const int refine, bool bChroma)
38
 {
39
     partEnum = partitionFromSizes(pwidth, pheight);
40
     X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
41
@@ -192,9 +194,13 @@
42
 
43
     chromaSatd = primitives.chroma[fencPUYuv.m_csp].pu[partEnum].satd;
44
 
45
+    /* Set search characteristics */
46
+    searchMethod = method;
47
+    subpelRefine = refine;
48
+
49
     /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size
50
      * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */
51
-    bChromaSATD = subpelRefine > 2 && chromaSatd && (srcFencYuv.m_csp != X265_CSP_I400);
52
+    bChromaSATD = subpelRefine > 2 && chromaSatd && (srcFencYuv.m_csp != X265_CSP_I400 && bChroma);
53
     X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n");
54
 
55
     ctuAddr = _ctuAddr;
56
@@ -1174,15 +1180,17 @@
57
 int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp)
58
 {
59
     intptr_t refStride = ref->lumaStride;
60
-    pixel *fref = ref->fpelPlane[0] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * refStride;
61
+    const pixel* fref = ref->fpelPlane[0] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * refStride;
62
     int xFrac = qmv.x & 0x3;
63
     int yFrac = qmv.y & 0x3;
64
     int cost;
65
-    intptr_t lclStride = fencPUYuv.m_size;
66
-    X265_CHECK(lclStride == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by sad_x3 and sad_x4\n");
67
+    const intptr_t fencStride = FENC_STRIDE;
68
+    X265_CHECK(fencPUYuv.m_size == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by sad_x3 and sad_x4\n");
69
 
70
+    ALIGN_VAR_32(pixel, subpelbuf[MAX_CU_SIZE * MAX_CU_SIZE]);
71
+    
72
     if (!(yFrac | xFrac))
73
-        cost = cmp(fencPUYuv.m_buf[0], lclStride, fref, refStride);
74
+        cost = cmp(fencPUYuv.m_buf[0], fencStride, fref, refStride);
75
     else
76
     {
77
         /* we are taking a short-cut here if the reference is weighted. To be
78
@@ -1190,15 +1198,13 @@
79
          * the final 16bit values prior to rounding and down shifting. Instead we
80
          * are simply interpolating the weighted full-pel pixels. Not 100%
81
          * accurate but good enough for fast qpel ME */
82
-        ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
83
         if (!yFrac)
84
-            primitives.pu[partEnum].luma_hpp(fref, refStride, subpelbuf, lclStride, xFrac);
85
+            primitives.pu[partEnum].luma_hpp(fref, refStride, subpelbuf, blockwidth, xFrac);
86
         else if (!xFrac)
87
-            primitives.pu[partEnum].luma_vpp(fref, refStride, subpelbuf, lclStride, yFrac);
88
+            primitives.pu[partEnum].luma_vpp(fref, refStride, subpelbuf, blockwidth, yFrac);
89
         else
90
-            primitives.pu[partEnum].luma_hvpp(fref, refStride, subpelbuf, lclStride, xFrac, yFrac);
91
-
92
-        cost = cmp(fencPUYuv.m_buf[0], lclStride, subpelbuf, lclStride);
93
+            primitives.pu[partEnum].luma_hvpp(fref, refStride, subpelbuf, blockwidth, xFrac, yFrac);
94
+        cost = cmp(fencPUYuv.m_buf[0], fencStride, subpelbuf, blockwidth);
95
     }
96
 
97
     if (bChromaSATD)
98
@@ -1206,12 +1212,12 @@
99
         int csp    = fencPUYuv.m_csp;
100
         int hshift = fencPUYuv.m_hChromaShift;
101
         int vshift = fencPUYuv.m_vChromaShift;
102
-        int shiftHor = (2 + hshift);
103
-        int shiftVer = (2 + vshift);
104
-        lclStride = fencPUYuv.m_csize;
105
+        int mvx = qmv.x << (1 - hshift);
106
+        int mvy = qmv.y << (1 - vshift);
107
+        intptr_t fencStrideC = fencPUYuv.m_csize;
108
 
109
         intptr_t refStrideC = ref->reconPic->m_strideC;
110
-        intptr_t refOffset = (qmv.x >> shiftHor) + (qmv.y >> shiftVer) * refStrideC;
111
+        intptr_t refOffset = (mvx >> 3) + (mvy >> 3) * refStrideC;
112
 
113
         const pixel* refCb = ref->getCbAddr(ctuAddr, absPartIdx) + refOffset;
114
         const pixel* refCr = ref->getCrAddr(ctuAddr, absPartIdx) + refOffset;
115
@@ -1219,48 +1225,46 @@
116
         X265_CHECK((hshift == 0) || (hshift == 1), "hshift must be 0 or 1\n");
117
         X265_CHECK((vshift == 0) || (vshift == 1), "vshift must be 0 or 1\n");
118
 
119
-        xFrac = qmv.x & (hshift ? 7 : 3);
120
-        yFrac = qmv.y & (vshift ? 7 : 3);
121
+        xFrac = mvx & 7;
122
+        yFrac = mvy & 7;
123
 
124
         if (!(yFrac | xFrac))
125
         {
126
-            cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, refCb, refStrideC);
127
-            cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, refCr, refStrideC);
128
+            cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, refCb, refStrideC);
129
+            cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, refCr, refStrideC);
130
         }
131
         else
132
         {
133
-            ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
134
+            int blockwidthC = blockwidth >> hshift;
135
+
136
             if (!yFrac)
137
             {
138
-                primitives.chroma[csp].pu[partEnum].filter_hpp(refCb, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
139
-                cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
140
+                primitives.chroma[csp].pu[partEnum].filter_hpp(refCb, refStrideC, subpelbuf, blockwidthC, xFrac);
141
+                cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, subpelbuf, blockwidthC);
142
 
143
-                primitives.chroma[csp].pu[partEnum].filter_hpp(refCr, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
144
-                cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
145
+                primitives.chroma[csp].pu[partEnum].filter_hpp(refCr, refStrideC, subpelbuf, blockwidthC, xFrac);
146
+                cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, subpelbuf, blockwidthC);
147
             }
148
             else if (!xFrac)
149
             {
150
-                primitives.chroma[csp].pu[partEnum].filter_vpp(refCb, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
151
-                cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
152
+                primitives.chroma[csp].pu[partEnum].filter_vpp(refCb, refStrideC, subpelbuf, blockwidthC, yFrac);
153
+                cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, subpelbuf, blockwidthC);
154
 
155
-                primitives.chroma[csp].pu[partEnum].filter_vpp(refCr, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
156
-                cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
157
+                primitives.chroma[csp].pu[partEnum].filter_vpp(refCr, refStrideC, subpelbuf, blockwidthC, yFrac);
158
+                cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, subpelbuf, blockwidthC);
159
             }
160
             else
161
             {
162
-                ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_CHROMA)]);
163
-
164
-                int extStride = blockwidth >> hshift;
165
-                int filterSize = NTAPS_CHROMA;
166
-                int halfFilterSize = (filterSize >> 1);
167
+                ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);
168
+                const int halfFilterSize = (NTAPS_CHROMA >> 1);
169
 
170
-                primitives.chroma[csp].pu[partEnum].filter_hps(refCb, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
171
-                primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
172
-                cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
173
+                primitives.chroma[csp].pu[partEnum].filter_hps(refCb, refStrideC, immed, blockwidthC, xFrac, 1);
174
+                primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * blockwidthC, blockwidthC, subpelbuf, blockwidthC, yFrac);
175
+                cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, subpelbuf, blockwidthC);
176
 
177
-                primitives.chroma[csp].pu[partEnum].filter_hps(refCr, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
178
-                primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
179
-                cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
180
+                primitives.chroma[csp].pu[partEnum].filter_hps(refCr, refStrideC, immed, blockwidthC, xFrac, 1);
181
+                primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * blockwidthC, blockwidthC, subpelbuf, blockwidthC, yFrac);
182
+                cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, subpelbuf, blockwidthC);
183
             }
184
         }
185
     }
186
x265_1.9.tar.gz/source/encoder/motion.h -> x265_2.0.tar.gz/source/encoder/motion.h Changed
17
 
1
@@ -70,12 +70,12 @@
2
 
3
     static void initScales();
4
     static int hpelIterationCount(int subme);
5
-    void init(int method, int refine, int csp);
6
+    void init(int csp);
7
 
8
     /* Methods called at slice setup */
9
 
10
-    void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight);
11
-    void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight);
12
+    void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int subpelRefine);
13
+    void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int searchMethod, const int subpelRefine, bool bChroma);
14
 
15
     /* buf*() and motionEstimate() methods all use cached fenc pixels and thus
16
      * require setSourcePU() to be called prior. */
17
x265_1.9.tar.gz/source/encoder/ratecontrol.cpp -> x265_2.0.tar.gz/source/encoder/ratecontrol.cpp Changed
201
 
1
@@ -53,7 +53,7 @@
2
 {\
3
     bErr = 0;\
4
     p = strstr(opts, opt "=");\
5
-    char* q = strstr(opts, "no-"opt);\
6
+    char* q = strstr(opts, "no-" opt);\
7
     if (p && sscanf(p, opt "=%d" , &i) && param_val != i)\
8
         bErr = 1;\
9
     else if (!param_val && !q && !p)\
10
@@ -91,24 +91,6 @@
11
     return z + lut[x];
12
 }
13
 
14
-inline void reduceFraction(int* n, int* d)
15
-{
16
-    int a = *n;
17
-    int b = *d;
18
-    int c;
19
-    if (!a || !b)
20
-        return;
21
-    c = a % b;
22
-    while (c)
23
-    {
24
-        a = b;
25
-        b = c;
26
-        c = a % b;
27
-    }
28
-    *n /= b;
29
-    *d /= b;
30
-}
31
-
32
 inline char *strcatFilename(const char *input, const char *suffix)
33
 {
34
     char *output = X265_MALLOC(char, strlen(input) + strlen(suffix) + 1);
35
@@ -190,6 +172,8 @@
36
     m_numEntries = 0;
37
     m_isSceneTransition = false;
38
     m_lastPredictorReset = 0;
39
+    m_avgPFrameQp = 0;
40
+    m_isFirstMiniGop = false;
41
     if (m_param->rc.rateControlMode == X265_RC_CRF)
42
     {
43
         m_param->rc.qp = (int)m_param->rc.rfConstant;
44
@@ -212,7 +196,7 @@
45
             m_rateFactorMaxDecrement = m_param->rc.rfConstant - m_param->rc.rfConstantMin;
46
     }
47
     m_isAbr = m_param->rc.rateControlMode != X265_RC_CQP && !m_param->rc.bStatRead;
48
-    m_2pass = (m_param->rc.rateControlMode == X265_RC_ABR || m_param->rc.vbvMaxBitrate > 0) && m_param->rc.bStatRead;
49
+    m_2pass = m_param->rc.rateControlMode != X265_RC_CQP && m_param->rc.bStatRead;
50
     m_bitrate = m_param->rc.bitrate * 1000;
51
     m_frameDuration = (double)m_param->fpsDenom / m_param->fpsNum;
52
     m_qp = m_param->rc.qp;
53
@@ -225,8 +209,10 @@
54
     m_statFileOut = NULL;
55
     m_cutreeStatFileOut = m_cutreeStatFileIn = NULL;
56
     m_rce2Pass = NULL;
57
+    m_encOrder = NULL;
58
     m_lastBsliceSatdCost = 0;
59
     m_movingAvgSum = 0.0;
60
+    m_isNextGop = false;
61
 
62
     // vbv initialization
63
     m_param->rc.vbvBufferSize = x265_clip3(0, 2000000, m_param->rc.vbvBufferSize);
64
@@ -288,9 +274,13 @@
65
     m_ipOffset = 6.0 * X265_LOG2(m_param->rc.ipFactor);
66
     m_pbOffset = 6.0 * X265_LOG2(m_param->rc.pbFactor);
67
 
68
+    for (int i = 0; i < QP_MAX_MAX; i++)
69
+        m_qpToEncodedBits[i] = 0;
70
+
71
     /* Adjust the first frame in order to stabilize the quality level compared to the rest */
72
 #define ABR_INIT_QP_MIN (24)
73
-#define ABR_INIT_QP_MAX (40)
74
+#define ABR_INIT_QP_MAX (37)
75
+#define ABR_INIT_QP_GRAIN_MAX (33)
76
 #define ABR_SCENECUT_INIT_QP_MIN (12)
77
 #define CRF_INIT_QP (int)m_param->rc.rfConstant
78
     for (int i = 0; i < 3; i++)
79
@@ -361,6 +351,7 @@
80
         m_amortizeFraction = 0.85;
81
         m_amortizeFrames = m_param->totalFrames / 2;
82
     }
83
+
84
     for (int i = 0; i < s_slidingWindowFrames; i++)
85
     {
86
         m_satdCostWindow[i] = 0;
87
@@ -370,15 +361,22 @@
88
     m_isPatternPresent = false;
89
     m_numBframesInPattern = 0;
90
 
91
-    /* 720p videos seem to be a good cutoff for cplxrSum */
92
-    double tuneCplxFactor = (m_param->rc.cuTree && m_ncu > 3600) ? 2.5 : 1;
93
+    m_isGrainEnabled = false;
94
+    if(m_param->rc.bEnableGrain) // tune for grainy content OR equal p-b frame sizes
95
+    m_isGrainEnabled = true;
96
+    for (int i = 0; i < 3; i++)
97
+    m_lastQScaleFor[i] = x265_qp2qScale(m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN);
98
+    m_avgPFrameQp = 0 ;
99
 
100
+    /* 720p videos seem to be a good cutoff for cplxrSum */
101
+    double tuneCplxFactor = (m_ncu > 3600 && m_param->rc.cuTree) ? 2.5 : m_isGrainEnabled ? 1.9 : 1;
102
     /* estimated ratio that produces a reasonable QP for the first I-frame */
103
     m_cplxrSum = .01 * pow(7.0e5, m_qCompress) * pow(m_ncu, 0.5) * tuneCplxFactor;
104
     m_wantedBitsWindow = m_bitrate * m_frameDuration;
105
     m_accumPNorm = .01;
106
     m_accumPQp = (m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN) * m_accumPNorm;
107
 
108
+
109
     /* Frame Predictors used in vbv */
110
     initFramePredictors();
111
     if (!m_statFileOut && (m_param->rc.bStatWrite || m_param->rc.bStatRead))
112
@@ -401,11 +399,11 @@
113
                 char *tmpFile = strcatFilename(fileName, ".cutree");
114
                 if (!tmpFile)
115
                     return false;
116
-                m_cutreeStatFileIn = fopen(tmpFile, "rb");
117
+                m_cutreeStatFileIn = x265_fopen(tmpFile, "rb");
118
                 X265_FREE(tmpFile);
119
                 if (!m_cutreeStatFileIn)
120
                 {
121
-                    x265_log(m_param, X265_LOG_ERROR, "can't open stats file %s\n", tmpFile);
122
+                    x265_log_file(m_param, X265_LOG_ERROR, "can't open stats file %s.cutree\n", fileName);
123
                     return false;
124
                 }
125
             }
126
@@ -417,7 +415,7 @@
127
                 return false;
128
             }
129
             {
130
-                int i, j;
131
+                int i, j, m;
132
                 uint32_t k , l;
133
                 bool bErr = false;
134
                 char *opts = statsBuf;
135
@@ -439,6 +437,11 @@
136
                     x265_log(m_param, X265_LOG_ERROR, "fps specified in stats file not valid\n");
137
                     return false;
138
                 }
139
+                if (((p = strstr(opts, " vbv-maxrate=")) == 0 || sscanf(p, " vbv-maxrate=%d", &m) != 1) && m_param->rc.rateControlMode == X265_RC_CRF)
140
+                {
141
+                    x265_log(m_param, X265_LOG_ERROR, "Constant rate-factor is incompatible with 2pass without vbv-maxrate in the previous pass\n");
142
+                    return false;
143
+                }
144
                 if (k != m_param->fpsNum || l != m_param->fpsDenom)
145
                 {
146
                     x265_log(m_param, X265_LOG_ERROR, "fps mismatch with 1st pass (%u/%u vs %u/%u)\n",
147
@@ -564,8 +567,10 @@
148
                 p = next;
149
             }
150
             X265_FREE(statsBuf);
151
-            if (m_param->rc.rateControlMode == X265_RC_ABR || m_param->rc.vbvMaxBitrate > 0)
152
+            if (m_param->rc.rateControlMode != X265_RC_CQP)
153
             {
154
+                m_start = 0;
155
+                m_isQpModified = true;
156
                 if (!initPass2())
157
                     return false;
158
             } /* else we're using constant quant, so no need to run the bitrate allocation */
159
@@ -579,11 +584,11 @@
160
             statFileTmpname = strcatFilename(fileName, ".temp");
161
             if (!statFileTmpname)
162
                 return false;
163
-            m_statFileOut = fopen(statFileTmpname, "wb");
164
+            m_statFileOut = x265_fopen(statFileTmpname, "wb");
165
             X265_FREE(statFileTmpname);
166
             if (!m_statFileOut)
167
             {
168
-                x265_log(m_param, X265_LOG_ERROR, "can't open stats file %s\n", statFileTmpname);
169
+                x265_log_file(m_param, X265_LOG_ERROR, "can't open stats file %s.temp\n", fileName);
170
                 return false;
171
             }
172
             p = x265_param2string(m_param);
173
@@ -595,11 +600,11 @@
174
                 statFileTmpname = strcatFilename(fileName, ".cutree.temp");
175
                 if (!statFileTmpname)
176
                     return false;
177
-                m_cutreeStatFileOut = fopen(statFileTmpname, "wb");
178
+                m_cutreeStatFileOut = x265_fopen(statFileTmpname, "wb");
179
                 X265_FREE(statFileTmpname);
180
                 if (!m_cutreeStatFileOut)
181
                 {
182
-                    x265_log(m_param, X265_LOG_ERROR, "can't open mbtree stats file %s\n", statFileTmpname);
183
+                    x265_log_file(m_param, X265_LOG_ERROR, "can't open mbtree stats file %s.cutree.temp\n", fileName);
184
                     return false;
185
                 }
186
             }
187
@@ -647,7 +652,7 @@
188
 
189
     #undef MAX_DURATION
190
 }
191
-bool RateControl::analyseABR2Pass(int startIndex, int endIndex, uint64_t allAvailableBits)
192
+bool RateControl::analyseABR2Pass(uint64_t allAvailableBits)
193
 {
194
     double rateFactor, stepMult;
195
     double qBlur = m_param->rc.qblur;
196
@@ -657,21 +662,21 @@
197
     double *qScale, *blurredQscale;
198
     double baseCplx = m_ncu * (m_param->bframes ? 120 : 80);
199
     double clippedDuration = CLIP_DURATION(m_frameDuration) / BASE_FRAME_DURATION;
200
-    int framesCount = endIndex - startIndex + 1;
201
x265_1.9.tar.gz/source/encoder/ratecontrol.h -> x265_2.0.tar.gz/source/encoder/ratecontrol.h Changed
77
 
1
@@ -107,6 +107,7 @@
2
     int      miscBits;
3
     int      coeffBits;
4
     bool     keptAsRef;
5
+    bool     scenecut;
6
 
7
     SEIPictureTiming *picTimingSEI;
8
     HRDTiming        *hrdTiming;
9
@@ -126,8 +127,9 @@
10
     bool   m_isVbv;
11
     bool   m_isCbr;
12
     bool   m_singleFrameVbv;
13
-
14
+    bool   m_isGrainEnabled;
15
     bool   m_isAbrReset;
16
+    bool   m_isNextGop;
17
     int    m_lastAbrResetPoc;
18
 
19
     double m_rateTolerance;
20
@@ -141,7 +143,8 @@
21
     double m_vbvMaxRate;       /* in kbps */
22
     double m_rateFactorMaxIncrement; /* Don't allow RF above (CRF + this value). */
23
     double m_rateFactorMaxDecrement; /* don't allow RF below (this value). */
24
-
25
+    double m_avgPFrameQp;
26
+    bool   m_isFirstMiniGop;
27
     Predictor m_pred[4];       /* Slice predictors to preidct bits for each Slice type - I,P,Bref and B */
28
     int64_t m_leadingNoBSatd;
29
     int     m_predType;       /* Type of slice predictors to be used - depends on the slice type */
30
@@ -178,7 +181,7 @@
31
     bool    m_isPatternPresent;
32
     bool    m_isSceneTransition;
33
     int     m_lastPredictorReset;
34
-
35
+    double  m_qpToEncodedBits[QP_MAX_MAX + 1];
36
     /* a common variable on which rateControlStart, rateControlEnd and rateControUpdateStats waits to
37
      * sync the calls to these functions. For example
38
      * -F2:
39
@@ -202,7 +205,11 @@
40
 
41
     /* 2 pass */
42
     bool    m_2pass;
43
+    bool    m_isGopReEncoded;
44
+    bool    m_isQpModified;
45
     int     m_numEntries;
46
+    int     m_start;
47
+    int     m_reencode;
48
     FILE*   m_statFileOut;
49
     FILE*   m_cutreeStatFileOut;
50
     FILE*   m_cutreeStatFileIn;
51
@@ -235,6 +242,8 @@
52
     bool cuTreeReadFor2Pass(Frame* curFrame);
53
     void hrdFullness(SEIBufferingPeriod* sei);
54
     int writeRateControlFrameStats(Frame* curFrame, RateControlEntry* rce);
55
+    bool   initPass2();
56
+
57
 protected:
58
 
59
     static const int   s_slidingWindowFrames;
60
@@ -261,14 +270,14 @@
61
     double predictSize(Predictor *p, double q, double var);
62
     void   checkAndResetABR(RateControlEntry* rce, bool isFrameDone);
63
     double predictRowsSizeSum(Frame* pic, RateControlEntry* rce, double qpm, int32_t& encodedBits);
64
-    bool   initPass2();
65
-    bool   analyseABR2Pass(int startPoc, int endPoc, uint64_t allAvailableBits);
66
+    bool   analyseABR2Pass(uint64_t allAvailableBits);
67
     void   initFramePredictors();
68
     double getDiffLimitedQScale(RateControlEntry *rce, double q);
69
     double countExpectedBits(int startPos, int framesCount);
70
     bool   vbv2Pass(uint64_t allAvailableBits, int frameCount, int startPos);
71
     bool   findUnderflow(double *fills, int *t0, int *t1, int over, int framesCount);
72
     bool   fixUnderflow(int t0, int t1, double adjustment, double qscaleMin, double qscaleMax);
73
+    double tuneQScaleForGrain(double rcOverflow);
74
 };
75
 }
76
 #endif // ifndef X265_RATECONTROL_H
77
x265_1.9.tar.gz/source/encoder/reference.cpp -> x265_2.0.tar.gz/source/encoder/reference.cpp Changed
10
 
1
@@ -68,7 +68,7 @@
2
         intptr_t stride = reconPic->m_stride;
3
         int cuHeight = g_maxCUSize;
4
 
5
-        for (int c = 0; c < (p.internalCsp != X265_CSP_I400 ? numInterpPlanes : 1); c++)
6
+        for (int c = 0; c < (p.internalCsp != X265_CSP_I400 && recPic->m_picCsp != X265_CSP_I400 ? numInterpPlanes : 1); c++)
7
         {
8
             if (c == 1)
9
             {
10
x265_1.9.tar.gz/source/encoder/sao.cpp -> x265_2.0.tar.gz/source/encoder/sao.cpp Changed
201
 
1
@@ -53,7 +53,7 @@
2
     return r;
3
 }
4
 
5
-inline int64_t estSaoDist(int32_t count, int offset, int32_t offsetOrg)
6
+inline int64_t estSaoDist(int32_t count, int32_t offset, int32_t offsetOrg)
7
 {
8
     return (count * offset - offsetOrg * 2) * offset;
9
 }
10
@@ -76,8 +76,6 @@
11
     m_countPreDblk = NULL;
12
     m_offsetOrgPreDblk = NULL;
13
     m_refDepth = 0;
14
-    m_lumaLambda = 0;
15
-    m_chromaLambda = 0;
16
     m_param = NULL;
17
     m_clipTable = NULL;
18
     m_clipTableBase = NULL;
19
@@ -120,8 +118,11 @@
20
 
21
     if (initCommon)
22
     {
23
-        CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
24
-        CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
25
+        if (m_param->bSaoNonDeblocked)
26
+        {
27
+            CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
28
+            CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
29
+        }
30
         CHECKED_MALLOC(m_depthSaoRate, double, 2 * SAO_DEPTHRATE_SIZE);
31
 
32
         m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 0] = 0;
33
@@ -137,17 +138,16 @@
34
         m_clipTable = &(m_clipTableBase[rangeExt]);
35
 
36
         // Share with fast clip lookup table
37
-        if (initCommon)
38
-        {
39
-            for (int i = 0; i < rangeExt; i++)
40
-                m_clipTableBase[i] = 0;
41
 
42
-            for (int i = 0; i < maxY; i++)
43
-                m_clipTable[i] = (pixel)i;
44
+        for (int i = 0; i < rangeExt; i++)
45
+            m_clipTableBase[i] = 0;
46
+
47
+        for (int i = 0; i < maxY; i++)
48
+            m_clipTable[i] = (pixel)i;
49
+
50
+        for (int i = maxY; i < maxY + rangeExt; i++)
51
+            m_clipTable[i] = maxY;
52
 
53
-            for (int i = maxY; i < maxY + rangeExt; i++)
54
-                m_clipTable[i] = maxY;
55
-        }
56
     }
57
     else
58
     {
59
@@ -204,8 +204,11 @@
60
 
61
     if (destoryCommon)
62
     {
63
-        X265_FREE_ZERO(m_countPreDblk);
64
-        X265_FREE_ZERO(m_offsetOrgPreDblk);
65
+        if (m_param->bSaoNonDeblocked)
66
+        {
67
+            X265_FREE_ZERO(m_countPreDblk);
68
+            X265_FREE_ZERO(m_offsetOrgPreDblk);
69
+        }
70
         X265_FREE_ZERO(m_depthSaoRate);
71
         X265_FREE_ZERO(m_clipTableBase);
72
     }
73
@@ -221,17 +224,10 @@
74
         saoParam->ctuParam[i] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth];
75
 }
76
 
77
-void SAO::startSlice(Frame* frame, Entropy& initState, int qp)
78
+void SAO::startSlice(Frame* frame, Entropy& initState)
79
 {
80
-    Slice* slice = frame->m_encData->m_slice;
81
-    int qpCb = qp;
82
-    if (m_param->internalCsp == X265_CSP_I420)
83
-        qpCb = x265_clip3(QP_MIN, QP_MAX_MAX, (int)g_chromaScale[qp + slice->m_pps->chromaQpOffset[0]]);
84
-    else
85
-        qpCb = X265_MIN(qp + slice->m_pps->chromaQpOffset[0], QP_MAX_SPEC);
86
-    m_lumaLambda = x265_lambda2_tab[qp];
87
-    m_chromaLambda = x265_lambda2_tab[qpCb]; // Use Cb QP for SAO chroma
88
     m_frame = frame;
89
+    Slice* slice = m_frame->m_encData->m_slice;
90
 
91
     switch (slice->m_sliceType)
92
     {
93
@@ -259,7 +255,7 @@
94
     }
95
 
96
     saoParam->bSaoFlag[0] = true;
97
-    saoParam->bSaoFlag[1] = m_param->internalCsp != X265_CSP_I400;
98
+    saoParam->bSaoFlag[1] = m_param->internalCsp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
99
 
100
     m_numNoSao[0] = 0; // Luma
101
     m_numNoSao[1] = 0; // Chroma
102
@@ -275,9 +271,8 @@
103
 }
104
 
105
 // CTU-based SAO process without slice granularity
106
-void SAO::processSaoCu(int addr, int typeIdx, int plane)
107
+void SAO::applyPixelOffsets(int addr, int typeIdx, int plane)
108
 {
109
-    int x, y;
110
     PicYuv* reconPic = m_frame->m_reconPic;
111
     pixel* rec = reconPic->getPlaneAddr(plane, addr);
112
     intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
113
@@ -302,20 +297,13 @@
114
     ctuWidth  = rpelx - lpelx;
115
     ctuHeight = bpely - tpely;
116
 
117
-    int startX;
118
-    int startY;
119
-    int endX;
120
-    int endY;
121
-    pixel* tmpL;
122
-    pixel* tmpU;
123
-
124
     int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1, signLeft1[2];
125
     int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
126
 
127
     memset(_upBuff1 + MAX_CU_SIZE, 0, 2 * sizeof(int8_t)); /* avoid valgrind uninit warnings */
128
 
129
-    tmpL = m_tmpL1[plane];
130
-    tmpU = &(m_tmpU[plane][lpelx]);
131
+    pixel* tmpL = m_tmpL1[plane];
132
+    pixel* tmpU = &(m_tmpU[plane][lpelx]);
133
 
134
     int8_t* offsetEo = m_offsetEo[plane];
135
 
136
@@ -324,14 +312,14 @@
137
     case SAO_EO_0: // dir: -
138
     {
139
         pixel firstPxl = 0, lastPxl = 0, row1FirstPxl = 0, row1LastPxl = 0;
140
-        startX = !lpelx;
141
-        endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
142
+        int startX = !lpelx;
143
+        int endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
144
         if (ctuWidth & 15)
145
         {
146
-            for (y = 0; y < ctuHeight; y++)
147
+            for (int y = 0; y < ctuHeight; y++, rec += stride)
148
             {
149
                 int signLeft = signOf(rec[startX] - tmpL[y]);
150
-                for (x = startX; x < endX; x++)
151
+                for (int x = startX; x < endX; x++)
152
                 {
153
                     int signRight = signOf(rec[x] - rec[x + 1]);
154
                     int edgeType = signRight + signLeft + 2;
155
@@ -339,13 +327,11 @@
156
 
157
                     rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
158
                 }
159
-
160
-                rec += stride;
161
             }
162
         }
163
         else
164
         {
165
-            for (y = 0; y < ctuHeight; y += 2)
166
+            for (int y = 0; y < ctuHeight; y += 2, rec += 2 * stride)
167
             {
168
                 signLeft1[0] = signOf(rec[startX] - tmpL[y]);
169
                 signLeft1[1] = signOf(rec[stride + startX] - tmpL[y + 1]);
170
@@ -375,27 +361,25 @@
171
                     rec[ctuWidth - 1] = lastPxl;
172
                     rec[stride + ctuWidth - 1] = row1LastPxl;
173
                 }
174
-
175
-                rec += 2 * stride;
176
             }
177
         }
178
         break;
179
     }
180
     case SAO_EO_1: // dir: |
181
     {
182
-        startY = !tpely;
183
-        endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
184
+        int startY = !tpely;
185
+        int endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
186
         if (!tpely)
187
             rec += stride;
188
 
189
         if (ctuWidth & 15)
190
         {
191
-            for (x = 0; x < ctuWidth; x++)
192
+            for (int x = 0; x < ctuWidth; x++)
193
                 upBuff1[x] = signOf(rec[x] - tmpU[x]);
194
 
195
-            for (y = startY; y < endY; y++)
196
+            for (int y = startY; y < endY; y++, rec += stride)
197
             {
198
-                for (x = 0; x < ctuWidth; x++)
199
+                for (int x = 0; x < ctuWidth; x++)
200
                 {
201
x265_1.9.tar.gz/source/encoder/sao.h -> x265_2.0.tar.gz/source/encoder/sao.h Changed
92
 
1
@@ -33,13 +33,6 @@
2
 namespace X265_NS {
3
 // private namespace
4
 
5
-enum SAOTypeLen
6
-{
7
-    SAO_EO_LEN = 4,
8
-    SAO_BO_LEN = 4,
9
-    SAO_NUM_BO_CLASSES = 32
10
-};
11
-
12
 enum SAOType
13
 {
14
     SAO_EO_0 = 0,
15
@@ -56,12 +49,11 @@
16
 
17
     enum { SAO_MAX_DEPTH = 4 };
18
     enum { SAO_BO_BITS  = 5 };
19
-    enum { MAX_NUM_SAO_CLASS = 33 };
20
+    enum { MAX_NUM_SAO_CLASS = 32 };
21
     enum { SAO_BIT_INC = 0 }; /* in HM12.0, it wrote as X265_MAX(X265_DEPTH - 10, 0) */
22
     enum { OFFSET_THRESH = 1 << X265_MIN(X265_DEPTH - 5, 5) };
23
     enum { NUM_EDGETYPE = 5 };
24
     enum { NUM_PLANE = 3 };
25
-    enum { NUM_MERGE_MODE = 3 };
26
     enum { SAO_DEPTHRATE_SIZE = 4 };
27
 
28
     static const uint32_t s_eoTable[NUM_EDGETYPE];
29
@@ -81,7 +73,7 @@
30
     PerPlane*   m_offsetOrgPreDblk;
31
 
32
     double*     m_depthSaoRate;
33
-    int8_t      m_offsetBo[NUM_PLANE][SAO_NUM_BO_CLASSES];
34
+    int8_t      m_offsetBo[NUM_PLANE][MAX_NUM_SAO_CLASS];
35
     int8_t      m_offsetEo[NUM_PLANE][NUM_EDGETYPE];
36
 
37
     int         m_chromaFormat;
38
@@ -114,10 +106,6 @@
39
     int         m_refDepth;
40
     int         m_numNoSao[2];
41
 
42
-    double      m_lumaLambda;
43
-    double      m_chromaLambda;
44
-    /* TODO: No doubles for distortion */
45
-
46
     SAO();
47
 
48
     bool create(x265_param* param, int initCommon);
49
@@ -126,31 +114,27 @@
50
 
51
     void allocSaoParam(SAOParam* saoParam) const;
52
 
53
-    void startSlice(Frame* pic, Entropy& initState, int qp);
54
+    void startSlice(Frame* pic, Entropy& initState);
55
     void resetStats();
56
-    void resetSaoUnit(SaoCtuParam* saoUnit);
57
 
58
     // CTU-based SAO process without slice granularity
59
-    void processSaoCu(int addr, int typeIdx, int plane);
60
+    void applyPixelOffsets(int addr, int typeIdx, int plane);
61
     void processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane);
62
-    void processSaoUnitCuLuma(SaoCtuParam* ctuParam, int idxY, int idxX);
63
-    void processSaoUnitCuChroma(SaoCtuParam* ctuParam[3], int idxY, int idxX);
64
+    void generateLumaOffsets(SaoCtuParam* ctuParam, int idxY, int idxX);
65
+    void generateChromaOffsets(SaoCtuParam* ctuParam[3], int idxY, int idxX);
66
 
67
-    void copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc);
68
-
69
-    void calcSaoStatsCu(int addr, int plane);
70
+    void calcSaoStatsCTU(int addr, int plane);
71
     void calcSaoStatsCu_BeforeDblk(Frame* pic, int idxX, int idxY);
72
 
73
-    void saoComponentParamDist(SAOParam* saoParam, int addr, int addrUp, int addrLeft, SaoCtuParam mergeSaoParam[2], double* mergeDist);
74
-    void sao2ChromaParamDist(SAOParam* saoParam, int addr, int addrUp, int addrLeft, SaoCtuParam mergeSaoParam[][2], double* mergeDist);
75
-
76
-    inline int estIterOffset(int typeIdx, int classIdx, double lambda, int offset, int32_t count, int32_t offsetOrg,
77
-                             int32_t* currentDistortionTableBo, double* currentRdCostTableBo);
78
-    inline int64_t estSaoTypeDist(int plane, int typeIdx, double lambda, int32_t* currentDistortionTableBo, double* currentRdCostTableBo);
79
+    void saoLumaComponentParamDist(SAOParam* saoParam, int addr, int64_t& rateDist, int64_t* lambda, int64_t& bestCost);
80
+    void saoChromaComponentParamDist(SAOParam* saoParam, int addr, int64_t& rateDist, int64_t* lambda, int64_t& bestCost);
81
 
82
+    void estIterOffset(int typeIdx, int64_t lambda, int32_t count, int32_t offsetOrg, int32_t& offset, int32_t& distClasses, int64_t& costClasses);
83
     void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);
84
-    void rdoSaoUnitRow(SAOParam* saoParam, int idxY);
85
     void rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr);
86
+    int64_t calcSaoRdoCost(int64_t distortion, uint32_t bits, int64_t lambda);
87
+
88
+    void saoStatsInitialOffset(int planes);
89
 
90
     friend class FrameFilter;
91
 };
92
x265_1.9.tar.gz/source/encoder/search.cpp -> x265_2.0.tar.gz/source/encoder/search.cpp Changed
201
 
1
@@ -73,14 +73,13 @@
2
 {
3
     uint32_t maxLog2CUSize = g_log2Size[param.maxCUSize];
4
     m_param = &param;
5
-    m_bEnableRDOQ = !!param.rdoqLevel;
6
     m_bFrameParallel = param.frameNumThreads > 1;
7
     m_numLayers = g_log2Size[param.maxCUSize] - 2;
8
 
9
     m_rdCost.setPsyRdScale(param.psyRd);
10
-    m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp);
11
+    m_me.init(param.internalCsp);
12
 
13
-    bool ok = m_quant.init(param.rdoqLevel, param.psyRdoq, scalingList, m_entropyCoder);
14
+    bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder);
15
     if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
16
         ok &= m_quant.allocNoiseReduction(param);
17
 
18
@@ -223,9 +222,10 @@
19
 
20
     if (!(log2TrSize - m_hChromaShift < 2))
21
     {
22
-        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
23
+        uint32_t parentIdx = absPartIdx & (0xFF << (log2TrSize + 1 - LOG2_UNIT_SIZE) * 2);
24
+        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_U, tuDepth - 1))
25
             m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv);
26
-        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
27
+        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_V, tuDepth - 1))
28
             m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv);
29
     }
30
 
31
@@ -296,6 +296,7 @@
32
     uint32_t sizeIdx    = log2TrSize - 2;
33
     bool mightNotSplit  = log2TrSize <= depthRange[1];
34
     bool mightSplit     = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit);
35
+    bool bEnableRDOQ  = !!m_param->rdoqLevel;
36
 
37
     /* If maximum RD penalty, force spits at TU size 32x32 if SPS allows TUs of 16x16 */
38
     if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4)
39
@@ -336,7 +337,7 @@
40
         coeff_t* coeffY       = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
41
 
42
         // store original entropy coding status
43
-        if (m_bEnableRDOQ)
44
+        if (bEnableRDOQ)
45
             m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
46
 
47
         primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
48
@@ -434,8 +435,7 @@
49
 
50
             cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
51
         }
52
-        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
53
-            cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth);
54
+        cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth);
55
 
56
         if (mightNotSplit && log2TrSize != depthRange[0])
57
         {
58
@@ -487,6 +487,7 @@
59
     uint32_t fullDepth = cuGeom.depth + tuDepth;
60
     uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
61
     uint32_t tuSize = 1 << log2TrSize;
62
+    bool bEnableRDOQ = !!m_param->rdoqLevel;
63
 
64
     X265_CHECK(tuSize <= MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n");
65
 
66
@@ -525,7 +526,7 @@
67
     // store original entropy coding status
68
     m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
69
 
70
-    if (m_bEnableRDOQ)
71
+    if (bEnableRDOQ)
72
         m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
73
 
74
     int checkTransformSkip = 1;
75
@@ -717,8 +718,7 @@
76
             residualTransformQuantIntra(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
77
             cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
78
         }
79
-        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
80
-            cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth);
81
+        cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth);
82
     }
83
 }
84
 
85
@@ -782,6 +782,7 @@
86
 {
87
     CUData& cu = mode.cu;
88
     uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
89
+    bool bEnableRDOQ = !!m_param->rdoqLevel;
90
 
91
     if (tuDepth < cu.m_tuDepth[absPartIdx])
92
     {
93
@@ -793,11 +794,9 @@
94
             splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
95
             splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
96
         }
97
-        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
98
-        {
99
-            cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
100
-            cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
101
-        }
102
+        cu.m_cbf[1][absPartIdx] |= (splitCbfU << tuDepth);
103
+        cu.m_cbf[2][absPartIdx] |= (splitCbfV << tuDepth);
104
+
105
         return;
106
     }
107
 
108
@@ -812,7 +811,7 @@
109
         tuDepthC--;
110
     }
111
 
112
-    if (m_bEnableRDOQ)
113
+    if (bEnableRDOQ)
114
         m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
115
 
116
     bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
117
@@ -1091,11 +1090,8 @@
118
             splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
119
             splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
120
         }
121
-        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
122
-        {
123
-            cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
124
-            cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
125
-        }
126
+        cu.m_cbf[1][absPartIdx] |= (splitCbfU << tuDepth);
127
+        cu.m_cbf[2][absPartIdx] |= (splitCbfV << tuDepth);
128
 
129
         return;
130
     }
131
@@ -1629,8 +1625,7 @@
132
         for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
133
             combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);
134
 
135
-        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
136
-            cu.m_cbf[0][offs] |= combCbfY;
137
+        cu.m_cbf[0][0] |= combCbfY;
138
     }
139
 
140
     // TODO: remove this
141
@@ -1732,6 +1727,12 @@
142
         else
143
             cu.getAllowedChromaDir(absPartIdxC, modeList);
144
 
145
+        if (m_frame->m_fencPic->m_picCsp  == X265_CSP_I400 && m_csp != X265_CSP_I400)
146
+        {
147
+            for (uint32_t l = 1; l < NUM_CHROMA_MODE; l++)
148
+                modeList[l] = modeList[0];
149
+            maxMode = 1;
150
+        }
151
         // check chroma modes
152
         for (uint32_t mode = minMode; mode < maxMode; mode++)
153
         {
154
@@ -1816,11 +1817,8 @@
155
             combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1);
156
         }
157
 
158
-        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
159
-        {
160
-            cu.m_cbf[1][offs] |= combCbfU;
161
-            cu.m_cbf[2][offs] |= combCbfV;
162
-        }
163
+        cu.m_cbf[1][0] |= combCbfU;
164
+        cu.m_cbf[2][0] |= combCbfV;
165
     }
166
 
167
     /* TODO: remove this */
168
@@ -1974,7 +1972,8 @@
169
         slave.m_frame = m_frame;
170
         slave.m_param = m_param;
171
         slave.setLambdaFromQP(pme.mode.cu, m_rdCost.m_qp);
172
-        slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.pu.ctuAddr, pme.pu.cuAbsPartIdx, pme.pu.puAbsPartIdx, pme.pu.width, pme.pu.height);
173
+        bool bChroma = slave.m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
174
+        slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.pu.ctuAddr, pme.pu.cuAbsPartIdx, pme.pu.puAbsPartIdx, pme.pu.width, pme.pu.height, m_param->searchMethod, m_param->subpelRefine, bChroma);
175
     }
176
 
177
     /* Perform ME, repeat until no more work is available */
178
@@ -2015,9 +2014,12 @@
179
     int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
180
     MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
181
 
182
-    MV lmv = getLowresMV(interMode.cu, pu, list, ref);
183
-    if (lmv.notZero())
184
-        mvc[numMvc++] = lmv;
185
+    if (!m_param->analysisMode) /* Prevents load/save outputs from diverging if lowresMV is not available */
186
+    {
187
+        MV lmv = getLowresMV(interMode.cu, pu, list, ref);
188
+        if (lmv.notZero())
189
+            mvc[numMvc++] = lmv;
190
+    }
191
 
192
     setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);
193
 
194
@@ -2074,7 +2076,7 @@
195
         MotionData* bestME = interMode.bestME[puIdx];
196
         PredictionUnit pu(cu, cuGeom, puIdx);
197
 
198
-        m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height);
199
+        m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
200
 
201
x265_1.9.tar.gz/source/encoder/search.h -> x265_2.0.tar.gz/source/encoder/search.h Changed
9
 
1
@@ -272,7 +272,6 @@
2
     pixel*          m_tsRecon;        /* transform skip reconstructed pixels 32x32 */
3
 
4
     bool            m_bFrameParallel;
5
-    bool            m_bEnableRDOQ;
6
     uint32_t        m_numLayers;
7
     uint32_t        m_refLagPixels;
8
 
9
x265_1.9.tar.gz/source/encoder/slicetype.cpp -> x265_2.0.tar.gz/source/encoder/slicetype.cpp Changed
36
 
1
@@ -83,7 +83,7 @@
2
     uint32_t var;
3
 
4
     var  = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, csp);
5
-    if (csp != X265_CSP_I400)
6
+    if (csp != X265_CSP_I400 && curFrame->m_fencPic->m_picCsp != X265_CSP_I400)
7
     {
8
         var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp);
9
         var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp);
10
@@ -456,10 +456,13 @@
11
     COPY4_IF_LT(minscore, s, minscale, curScale, minoff, curOffset, found, 1);
12
 
13
     /* Use a smaller denominator if possible */
14
-    while (mindenom > 0 && !(minscale & 1))
15
+    if (mindenom > 0 && !(minscale & 1))
16
     {
17
-        mindenom--;
18
-        minscale >>= 1;
19
+        unsigned long idx;
20
+        CTZ(idx, minscale);
21
+        int shift = X265_MIN((int)idx, mindenom);
22
+        mindenom -= shift;
23
+        minscale >>= shift;
24
     }
25
 
26
     if (!found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f)
27
@@ -2081,7 +2084,7 @@
28
     const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * fenc->lumaStride;
29
 
30
     if (bBidir || bDoSearch[0] || bDoSearch[1])
31
-        tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize);
32
+        tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, 1);
33
 
34
     /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
35
     int lowresPenalty = 4;
36
x265_1.9.tar.gz/source/encoder/slicetype.h -> x265_2.0.tar.gz/source/encoder/slicetype.h Changed
11
 
1
@@ -60,8 +60,8 @@
2
 
3
     LookaheadTLD()
4
     {
5
+        me.init(X265_CSP_I400);
6
         me.setQP(X265_LOOKAHEAD_QP);
7
-        me.init(X265_HEX_SEARCH, 1, X265_CSP_I400);
8
         for (int i = 0; i < 4; i++)
9
             wbuffer[i] = NULL;
10
         widthInCU = heightInCU = ncu = paddedLines = 0;
11
x265_1.9.tar.gz/source/encoder/weightPrediction.cpp -> x265_2.0.tar.gz/source/encoder/weightPrediction.cpp Changed
78
 
1
@@ -31,6 +31,7 @@
2
 #include "slice.h"
3
 #include "mv.h"
4
 #include "bitstream.h"
5
+#include "threading.h"
6
 
7
 using namespace X265_NS;
8
 namespace {
9
@@ -132,25 +133,25 @@
10
                 intptr_t fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2);
11
                 pixel *temp = src + pixoff + fpeloffset;
12
 
13
-                int xFrac = mv.x & 0x7;
14
-                int yFrac = mv.y & 0x7;
15
-                if ((yFrac | xFrac) == 0)
16
+                int xFrac = mv.x & 7;
17
+                int yFrac = mv.y & 7;
18
+                if (!(yFrac | xFrac))
19
                 {
20
                     primitives.chroma[csp].pu[LUMA_16x16].copy_pp(mcout + pixoff, stride, temp, stride);
21
                 }
22
-                else if (yFrac == 0)
23
+                else if (!yFrac)
24
                 {
25
                     primitives.chroma[csp].pu[LUMA_16x16].filter_hpp(temp, stride, mcout + pixoff, stride, xFrac);
26
                 }
27
-                else if (xFrac == 0)
28
+                else if (!xFrac)
29
                 {
30
                     primitives.chroma[csp].pu[LUMA_16x16].filter_vpp(temp, stride, mcout + pixoff, stride, yFrac);
31
                 }
32
                 else
33
                 {
34
-                    ALIGN_VAR_16(int16_t, imm[16 * (16 + NTAPS_CHROMA)]);
35
-                    primitives.chroma[csp].pu[LUMA_16x16].filter_hps(temp, stride, imm, bw, xFrac, 1);
36
-                    primitives.chroma[csp].pu[LUMA_16x16].filter_vsp(imm + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
37
+                    ALIGN_VAR_16(int16_t, immed[16 * (16 + NTAPS_CHROMA - 1)]);
38
+                    primitives.chroma[csp].pu[LUMA_16x16].filter_hps(temp, stride, immed, bw, xFrac, 1);
39
+                    primitives.chroma[csp].pu[LUMA_16x16].filter_vsp(immed + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
40
                 }
41
             }
42
             else
43
@@ -232,7 +233,7 @@
44
     cache.numPredDir = slice.isInterP() ? 1 : 2;
45
     cache.lowresWidthInCU = fenc.width >> 3;
46
     cache.lowresHeightInCU = fenc.lines >> 3;
47
-    cache.csp = fencPic->m_picCsp;
48
+    cache.csp = param.internalCsp;
49
     cache.hshift = CHROMA_H_SHIFT(cache.csp);
50
     cache.vshift = CHROMA_V_SHIFT(cache.csp);
51
 
52
@@ -329,7 +330,7 @@
53
                 {
54
                     /* reference chroma planes must be extended prior to being
55
                      * used as motion compensation sources */
56
-                    if (!refFrame->m_bChromaExtended && param.internalCsp != X265_CSP_I400)
57
+                    if (!refFrame->m_bChromaExtended && param.internalCsp != X265_CSP_I400 && frame.m_fencPic->m_picCsp != X265_CSP_I400)
58
                     {
59
                         refFrame->m_bChromaExtended = true;
60
                         PicYuv *refPic = refFrame->m_fencPic;
61
@@ -456,10 +457,13 @@
62
             /* Use a smaller luma denominator if possible */
63
             if (!(plane || list))
64
             {
65
-                while (mindenom > 0 && !(minscale & 1))
66
+                if (mindenom > 0 && !(minscale & 1))
67
                 {
68
-                    mindenom--;
69
-                    minscale >>= 1;
70
+                    unsigned long idx;
71
+                    CTZ(idx, minscale);
72
+                    int shift = X265_MIN((int)idx, mindenom);
73
+                    mindenom -= shift;
74
+                    minscale >>= shift;
75
                 }
76
             }
77
 
78
x265_1.9.tar.gz/source/input/y4m.cpp -> x265_2.0.tar.gz/source/input/y4m.cpp Changed
10
 
1
@@ -417,6 +417,8 @@
2
     {
3
         int pixelbytes = depth > 8 ? 2 : 1;
4
         pic.bitDepth = depth;
5
+        pic.framesize = framesize;
6
+        pic.height = height;
7
         pic.colorSpace = colorSpace;
8
         pic.stride[0] = width * pixelbytes;
9
         pic.stride[1] = pic.stride[0] >> x265_cli_csps[colorSpace].width[1];
10
x265_1.9.tar.gz/source/input/yuv.cpp -> x265_2.0.tar.gz/source/input/yuv.cpp Changed
10
 
1
@@ -225,6 +225,8 @@
2
         uint32_t pixelbytes = depth > 8 ? 2 : 1;
3
         pic.colorSpace = colorSpace;
4
         pic.bitDepth = depth;
5
+        pic.framesize = framesize;
6
+        pic.height = height;
7
         pic.stride[0] = width * pixelbytes;
8
         pic.stride[1] = pic.stride[0] >> x265_cli_csps[colorSpace].width[1];
9
         pic.stride[2] = pic.stride[0] >> x265_cli_csps[colorSpace].width[2];
10
x265_1.9.tar.gz/source/output/raw.cpp -> x265_2.0.tar.gz/source/output/raw.cpp Changed
43
 
1
@@ -32,11 +32,11 @@
2
     b_fail = false;
3
     if (!strcmp(fname, "-"))
4
     {
5
-        ofs = &cout;
6
+        ofs = stdout;
7
         return;
8
     }
9
-    ofs = new ofstream(fname, ios::binary | ios::out);
10
-    if (ofs->fail())
11
+    ofs = x265_fopen(fname, "wb");
12
+    if (!ofs || ferror(ofs))
13
         b_fail = true;
14
 }
15
 
16
@@ -51,7 +51,7 @@
17
 
18
     for (uint32_t i = 0; i < nalcount; i++)
19
     {
20
-        ofs->write((const char*)nal->payload, nal->sizeBytes);
21
+        fwrite((const void*)nal->payload, 1, nal->sizeBytes, ofs);
22
         bytes += nal->sizeBytes;
23
         nal++;
24
     }
25
@@ -65,7 +65,7 @@
26
 
27
     for (uint32_t i = 0; i < nalcount; i++)
28
     {
29
-        ofs->write((const char*)nal->payload, nal->sizeBytes);
30
+        fwrite((const void*)nal->payload, 1, nal->sizeBytes, ofs);
31
         bytes += nal->sizeBytes;
32
         nal++;
33
     }
34
@@ -75,6 +75,6 @@
35
 
36
 void RAWOutput::closeFile(int64_t, int64_t)
37
 {
38
-    if (ofs != &cout)
39
-        delete ofs;
40
+    if (ofs != stdout)
41
+        fclose(ofs);
42
 }
43
x265_1.9.tar.gz/source/output/raw.h -> x265_2.0.tar.gz/source/output/raw.h Changed
10
 
1
@@ -35,7 +35,7 @@
2
 {
3
 protected:
4
 
5
-    std::ostream* ofs;
6
+    FILE* ofs;
7
 
8
     bool b_fail;
9
 
10
x265_1.9.tar.gz/source/test/CMakeLists.txt -> x265_2.0.tar.gz/source/test/CMakeLists.txt Changed
50
 
1
@@ -1,4 +1,12 @@
2
 # vim: syntax=cmake
3
+
4
+check_symbol_exists(__rdtsc "intrin.h" HAVE_RDTSC)
5
+if(HAVE_RDTSC)
6
+    add_definitions(-DHAVE_RDTSC=1)
7
+endif()
8
+
9
+# add X86 assembly files
10
+if(X86)
11
 enable_language(ASM_YASM)
12
 
13
 if(MSVC_IDE)
14
@@ -11,11 +19,23 @@
15
 else()
16
     set(YASM_SRC checkasm-a.asm)
17
 endif()
18
+endif(X86)
19
 
20
-check_symbol_exists(__rdtsc "intrin.h" HAVE_RDTSC)
21
-if(HAVE_RDTSC)
22
-    add_definitions(-DHAVE_RDTSC=1)
23
-endif()
24
+# add ARM assembly files
25
+if(ARM OR CROSS_COMPILE_ARM)
26
+    enable_language(ASM)
27
+    set(YASM_SRC checkasm-arm.S)
28
+    add_custom_command(
29
+        OUTPUT checkasm-arm.obj
30
+        COMMAND ${CMAKE_CXX_COMPILER}
31
+        ARGS ${YASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
32
+        DEPENDS checkasm-arm.S)
33
+endif(ARM OR CROSS_COMPILE_ARM)
34
+
35
+# add PowerPC assembly files
36
+if(POWER)
37
+    set(YASM_SRC)
38
+endif(POWER)
39
 
40
 add_executable(TestBench ${YASM_SRC}
41
     testbench.cpp testharness.h
42
@@ -23,6 +43,7 @@
43
     mbdstharness.cpp mbdstharness.h
44
     ipfilterharness.cpp ipfilterharness.h
45
     intrapredharness.cpp intrapredharness.h)
46
+
47
 target_link_libraries(TestBench x265-static ${PLATFORM_LIBS})
48
 if(LINKER_OPTIONS)
49
     if(EXTRA_LIB)
50
x265_2.0.tar.gz/source/test/checkasm-arm.S Added
135
 
1
@@ -0,0 +1,133 @@
2
+/****************************************************************************
3
+ * checkasm-arm.S: assembly check tool
4
+ *****************************************************************************
5
+ * Copyright (C) 2016 x265 project
6
+ *
7
+ * Authors: Martin Storsjo <martin@martin.st>
8
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
9
+ *
10
+ * This program is free software; you can redistribute it and/or modify
11
+ * it under the terms of the GNU General Public License as published by
12
+ * the Free Software Foundation; either version 2 of the License, or
13
+ * (at your option) any later version.
14
+ *
15
+ * This program is distributed in the hope that it will be useful,
16
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
+ * GNU General Public License for more details.
19
+ *
20
+ * You should have received a copy of the GNU General Public License
21
+ * along with this program; if not, write to the Free Software
22
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
23
+ *
24
+ * This program is also available under a commercial proprietary license.
25
+ * For more information, contact us at license @ x265.com.
26
+ *****************************************************************************/
27
+
28
+#include "../common/arm/asm.S"
29
+
30
+.section .rodata
31
+.align 4
32
+register_init:
33
+.quad 0x21f86d66c8ca00ce
34
+.quad 0x75b6ba21077c48ad
35
+.quad 0xed56bb2dcb3c7736
36
+.quad 0x8bda43d3fd1a7e06
37
+.quad 0xb64a9c9e5d318408
38
+.quad 0xdf9a54b303f1d3a3
39
+.quad 0x4a75479abd64e097
40
+.quad 0x249214109d5d1c88
41
+
42
+error_message:
43
+.asciz "failed to preserve register"
44
+
45
+.text
46
+
47
+@ max number of args used by any x265 asm function.
48
+#define MAX_ARGS 15
49
+
50
+#define ARG_STACK 4*(MAX_ARGS - 2)
51
+
52
+.macro clobbercheck variant
53
+.equ pushed, 4*10
54
+function x265_checkasm_call_\variant
55
+    push        {r4-r11, lr}
56
+.ifc \variant, neon
57
+    vpush       {q4-q7}
58
+.equ pushed, pushed + 16*4
59
+.endif
60
+
61
+    movrel      r12, register_init
62
+.ifc \variant, neon
63
+    vldm        r12, {q4-q7}
64
+.endif
65
+    ldm         r12, {r4-r11}
66
+
67
+    push        {r1}
68
+
69
+    sub         sp,  sp,  #ARG_STACK
70
+.equ pos, 0
71
+.rept MAX_ARGS-2
72
+    ldr         r12, [sp, #ARG_STACK + pushed + 8 + pos]
73
+    str         r12, [sp, #pos]
74
+.equ pos, pos + 4
75
+.endr
76
+
77
+    mov         r12, r0
78
+    mov         r0,  r2
79
+    mov         r1,  r3
80
+    ldrd        r2,  r3,  [sp, #ARG_STACK + pushed]
81
+    blx         r12
82
+    add         sp,  sp,  #ARG_STACK
83
+    pop         {r2}
84
+
85
+    push        {r0, r1}
86
+    movrel      r12, register_init
87
+.ifc \variant, neon
88
+    vldm        r12, {q0-q3}
89
+    veor        q0,  q0,  q4
90
+    veor        q1,  q1,  q5
91
+    veor        q2,  q2,  q6
92
+    veor        q3,  q3,  q7
93
+    vorr        q0,  q0,  q1
94
+    vorr        q0,  q0,  q2
95
+    vorr        q0,  q0,  q3
96
+    vorr        d0,  d0,  d1
97
+    vrev64.32   d1,  d0
98
+    vorr        d0,  d0,  d1
99
+    vmov.32     r3,  d0[0]
100
+.else
101
+    mov         r3,  #0
102
+.endif
103
+
104
+.macro check_reg reg1, reg2
105
+    ldrd        r0,  r1,  [r12], #8
106
+    eor         r0,  r0, \reg1
107
+    eor         r1,  r1, \reg2
108
+    orr         r3,  r3, r0
109
+    orr         r3,  r3, r1
110
+.endm
111
+    check_reg   r4,  r5
112
+    check_reg   r6,  r7
113
+    check_reg   r8,  r9
114
+    check_reg   r10, r11
115
+.purgem check_reg
116
+
117
+    cmp         r3,  #0
118
+    beq         0f
119
+
120
+    mov         r12, #0
121
+    str         r12, [r2]
122
+    movrel      r0, error_message
123
+    bl          puts
124
+0:
125
+    pop         {r0, r1}
126
+.ifc \variant, neon
127
+    vpop        {q4-q7}
128
+.endif
129
+    pop         {r4-r11, pc}
130
+endfunc
131
+.endm
132
+
133
+clobbercheck neon
134
+clobbercheck noneon
135
x265_1.9.tar.gz/source/test/pixelharness.cpp -> x265_2.0.tar.gz/source/test/pixelharness.cpp Changed
201
 
1
@@ -43,6 +43,7 @@
2
         ushort_test_buff[0][i]  = rand() % ((1 << 16) - 1);
3
         uchar_test_buff[0][i]   = rand() % ((1 << 8) - 1);
4
         residual_test_buff[0][i] = (rand() % (2 * RMAX + 1)) - RMAX - 1;// For sse_ss only
5
+        double_test_buff[0][i]  = (double)(short_test_buff[0][i]) / 256.0;
6
 
7
         pixel_test_buff[1][i]   = PIXEL_MIN;
8
         short_test_buff[1][i]   = SMIN;
9
@@ -52,6 +53,7 @@
10
         ushort_test_buff[1][i]  = PIXEL_MIN;
11
         uchar_test_buff[1][i]   = PIXEL_MIN;
12
         residual_test_buff[1][i] = RMIN;
13
+        double_test_buff[1][i]  = (double)(short_test_buff[1][i]) / 256.0;
14
 
15
         pixel_test_buff[2][i]   = PIXEL_MAX;
16
         short_test_buff[2][i]   = SMAX;
17
@@ -61,6 +63,7 @@
18
         ushort_test_buff[2][i]  = ((1 << 16) - 1);
19
         uchar_test_buff[2][i]   = 255;
20
         residual_test_buff[2][i] = RMAX;
21
+        double_test_buff[2][i] = (double)(short_test_buff[2][i]) / 256.0;
22
 
23
         pbuf1[i] = rand() & PIXEL_MAX;
24
         pbuf2[i] = rand() & PIXEL_MAX;
25
@@ -858,9 +861,8 @@
26
         int width = (rand() % 4) + 1; // range[1-4]
27
         float cres = ref(sum0, sum1, width);
28
         float vres = checked_float(opt, sum0, sum1, width);
29
-        if (fabs(vres - cres) > 0.0001)
30
+        if (fabs(vres - cres) > 0.001)
31
             return false;
32
-
33
         reportfail();
34
     }
35
 
36
@@ -1398,6 +1400,60 @@
37
     return true;
38
 }
39
 
40
+bool PixelHarness::check_cutree_fix8_pack(cutree_fix8_pack ref, cutree_fix8_pack opt)
41
+{
42
+    ALIGN_VAR_32(uint16_t, ref_dest[64 * 64]);
43
+    ALIGN_VAR_32(uint16_t, opt_dest[64 * 64]);
44
+
45
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
46
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
47
+
48
+    int j = 0;
49
+
50
+    for (int i = 0; i < ITERS; i++)
51
+    {
52
+        int count = 256 + i;
53
+        int index = i % TEST_CASES;
54
+        checked(opt, opt_dest, double_test_buff[index] + j, count);
55
+        ref(ref_dest, double_test_buff[index] + j, count);
56
+
57
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(uint16_t)))
58
+            return false;
59
+
60
+        reportfail();
61
+        j += INCR;
62
+    }
63
+
64
+    return true;
65
+}
66
+
67
+bool PixelHarness::check_cutree_fix8_unpack(cutree_fix8_unpack ref, cutree_fix8_unpack opt)
68
+{
69
+    ALIGN_VAR_32(double, ref_dest[64 * 64]);
70
+    ALIGN_VAR_32(double, opt_dest[64 * 64]);
71
+
72
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
73
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
74
+
75
+    int j = 0;
76
+
77
+    for (int i = 0; i < ITERS; i++)
78
+    {
79
+        int count = 256 + i;
80
+        int index = i % TEST_CASES;
81
+        checked(opt, opt_dest, ushort_test_buff[index] + j, count);
82
+        ref(ref_dest, ushort_test_buff[index] + j, count);
83
+
84
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(double)))
85
+            return false;
86
+
87
+        reportfail();
88
+        j += INCR;
89
+    }
90
+
91
+    return true;
92
+}
93
+
94
 bool PixelHarness::check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt)
95
 {
96
     int j = 0, index1, index2, optres, refres;
97
@@ -1819,34 +1875,6 @@
98
     return true;
99
 }
100
 
101
-bool PixelHarness::check_planeClipAndMax(planeClipAndMax_t ref, planeClipAndMax_t opt)
102
-{
103
-    for (int i = 0; i < ITERS; i++)
104
-    {
105
-        intptr_t rand_stride = rand() % STRIDE;
106
-        int rand_width = (rand() % (STRIDE * 2)) + 1;
107
-        const int rand_height = (rand() % MAX_HEIGHT) + 1;
108
-        const pixel rand_min = rand() % 32;
109
-        const pixel rand_max = PIXEL_MAX - (rand() % 32);
110
-        uint64_t ref_sum, opt_sum;
111
-
112
-        // video width must be more than or equal to 32
113
-        if (rand_width < 32)
114
-            rand_width = 32;
115
-
116
-        // stride must be more than or equal to width
117
-        if (rand_stride < rand_width)
118
-            rand_stride = rand_width;
119
-
120
-        pixel ref_max = ref(pbuf1, rand_stride, rand_width, rand_height, &ref_sum, rand_min, rand_max);
121
-        pixel opt_max = (pixel)checked(opt, pbuf1, rand_stride, rand_width, rand_height, &opt_sum, rand_min, rand_max);
122
-
123
-        if (ref_max != opt_max)
124
-            return false;
125
-    }
126
-    return true;
127
-}
128
-
129
 bool PixelHarness::check_pelFilterLumaStrong_H(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt)
130
 {
131
     intptr_t srcStep = 1, offset = 64;
132
@@ -1913,6 +1941,68 @@
133
     return true;
134
 }
135
 
136
+bool PixelHarness::check_pelFilterChroma_H(pelFilterChroma_t ref, pelFilterChroma_t opt)
137
+{
138
+    intptr_t srcStep = 1, offset = 64;
139
+    int32_t maskP, maskQ, tc;
140
+    int j = 0;
141
+
142
+    pixel pixel_test_buff1[TEST_CASES][BUFFSIZE];
143
+    for (int i = 0; i < TEST_CASES; i++)
144
+        memcpy(pixel_test_buff1[i], pixel_test_buff[i], sizeof(pixel)* BUFFSIZE);
145
+
146
+    for (int i = 0; i < ITERS; i++)
147
+    {
148
+        tc = rand() % PIXEL_MAX;
149
+        maskP = (rand() % PIXEL_MAX) - 1;
150
+        maskQ = (rand() % PIXEL_MAX) - 1;
151
+
152
+        int index = rand() % 3;
153
+
154
+        ref(pixel_test_buff[index] + 4 * offset + j, srcStep, offset, tc, maskP, maskQ);
155
+        checked(opt, pixel_test_buff1[index] + 4 * offset + j, srcStep, offset, tc, maskP, maskQ);
156
+
157
+        if (memcmp(pixel_test_buff[index], pixel_test_buff1[index], sizeof(pixel)* BUFFSIZE))
158
+            return false;
159
+
160
+        reportfail()
161
+        j += INCR;
162
+    }
163
+
164
+    return true;
165
+}
166
+
167
+bool PixelHarness::check_pelFilterChroma_V(pelFilterChroma_t ref, pelFilterChroma_t opt)
168
+{
169
+    intptr_t srcStep = 64, offset = 1;
170
+    int32_t maskP, maskQ, tc;
171
+    int j = 0;
172
+
173
+    pixel pixel_test_buff1[TEST_CASES][BUFFSIZE];
174
+    for (int i = 0; i < TEST_CASES; i++)
175
+        memcpy(pixel_test_buff1[i], pixel_test_buff[i], sizeof(pixel)* BUFFSIZE);
176
+
177
+    for (int i = 0; i < ITERS; i++)
178
+    {
179
+        tc = rand() % PIXEL_MAX;
180
+        maskP = (rand() % PIXEL_MAX) - 1;
181
+        maskQ = (rand() % PIXEL_MAX) - 1;
182
+
183
+        int index = rand() % 3;
184
+
185
+        ref(pixel_test_buff[index] + 4 + j, srcStep, offset, tc, maskP, maskQ);
186
+        checked(opt, pixel_test_buff1[index] + 4 + j, srcStep, offset, tc, maskP, maskQ);
187
+
188
+        if (memcmp(pixel_test_buff[index], pixel_test_buff1[index], sizeof(pixel)* BUFFSIZE))
189
+            return false;
190
+
191
+        reportfail()
192
+        j += INCR;
193
+    }
194
+
195
+    return true;
196
+}
197
+
198
 bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
199
 {
200
     if (opt.pu[part].satd)
201
x265_1.9.tar.gz/source/test/pixelharness.h -> x265_2.0.tar.gz/source/test/pixelharness.h Changed
22
 
1
@@ -113,6 +113,8 @@
2
     bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
3
     bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
4
     bool check_cutree_propagate_cost(cutree_propagate_cost ref, cutree_propagate_cost opt);
5
+    bool check_cutree_fix8_pack(cutree_fix8_pack ref, cutree_fix8_pack opt);
6
+    bool check_cutree_fix8_unpack(cutree_fix8_unpack ref, cutree_fix8_unpack opt);
7
     bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
8
     bool check_calSign(sign_t ref, sign_t opt);
9
     bool check_scanPosLast(scanPosLast_t ref, scanPosLast_t opt);
10
@@ -120,9 +122,10 @@
11
     bool check_costCoeffNxN(costCoeffNxN_t ref, costCoeffNxN_t opt);
12
     bool check_costCoeffRemain(costCoeffRemain_t ref, costCoeffRemain_t opt);
13
     bool check_costC1C2Flag(costC1C2Flag_t ref, costC1C2Flag_t opt);
14
-    bool check_planeClipAndMax(planeClipAndMax_t ref, planeClipAndMax_t opt);
15
     bool check_pelFilterLumaStrong_V(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
16
     bool check_pelFilterLumaStrong_H(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
17
+    bool check_pelFilterChroma_V(pelFilterChroma_t ref, pelFilterChroma_t opt);
18
+    bool check_pelFilterChroma_H(pelFilterChroma_t ref, pelFilterChroma_t opt);
19
 
20
 public:
21
 
22
x265_1.9.tar.gz/source/test/rate-control-tests.txt -> x265_2.0.tar.gz/source/test/rate-control-tests.txt Changed
13
 
1
@@ -25,6 +25,11 @@
2
 
3
 
4
 # multi-pass rate control tests
5
+sita_1920x1080_30.yuv, --preset ultrafast --crf 20 --no-cutree --no-scenecut --keyint 50 --no-open-gop --pass 1 --vbv-bufsize 7000 --vbv-maxrate 5000, --preset ultrafast --crf 20 --no-cutree --no-scenecut --keyint 50 --no-open-gop --pass 2 --vbv-bufsize 7000 --vbv-maxrate 5000
6
+sita_1920x1080_30.yuv, --preset medium --crf 20 --no-cutree --no-scenecut --keyint 50 --no-open-gop --pass 1 --vbv-bufsize 7000 --vbv-maxrate 5000, --preset medium --crf 20 --no-cutree --no-scenecut --keyint 50 --no-open-gop --pass 2 --vbv-bufsize 7000 --vbv-maxrate 5000
7
+sintel_trailer_2k_480p24.y4m, --preset medium --crf 18 --no-cutree --no-scenecut --no-open-gop --keyint 50 --vbv-bufsize 1200 --vbv-maxrate 1000 --pass 1, --preset medium --crf 18 --no-cutree --no-scenecut --no-open-gop --keyint 50 --vbv-bufsize 1200 --vbv-maxrate 1000 --pass 2
8
+sintel_trailer_2k_480p24.y4m, --preset veryslow --crf 18 --no-cutree --no-scenecut --no-open-gop --keyint 50 --vbv-bufsize 1200 --vbv-maxrate 1000 --pass 1, --preset veryslow --crf 18 --no-cutree --no-scenecut --no-open-gop --keyint 50 --vbv-bufsize 1200 --vbv-maxrate 1000 --pass 2
9
+ten_teaser_3840x2160_50_10bit.yuv, --preset medium --crf 25 --no-cutree --no-open-gop --no-scenecut --keyint 50 --vbv-maxrate 10000 --vbv-bufsize 12000 --pass 1, --preset medium --crf 25 --no-cutree --no-open-gop --no-scenecut --keyint 50 --vbv-maxrate 10000 --vbv-bufsize 12000 --pass 2
10
 big_buck_bunny_360p24.y4m,--preset slow --crf 40 --pass 1 -f 5000,--preset slow --bitrate 200 --pass 2 -f 5000
11
 big_buck_bunny_360p24.y4m,--preset medium --bitrate 700 --pass 1 -F4 --slow-firstpass -f 5000 ,--preset medium --bitrate 700 --vbv-bufsize 900 --vbv-maxrate 700 --pass 2 -F4 -f 5000
12
 112_1920x1080_25.yuv,--preset fast --bitrate 1000 --vbv-maxrate 1000 --vbv-bufsize 1000 --strict-cbr --pass 1 -F4,--preset fast --bitrate 1000 --vbv-maxrate 3000 --vbv-bufsize 3000 --pass 2 -F4
13
x265_1.9.tar.gz/source/test/regression-tests.txt -> x265_2.0.tar.gz/source/test/regression-tests.txt Changed
9
 
1
@@ -67,6 +67,7 @@
2
 News-4k.y4m,--preset ultrafast --no-cutree --analysis-mode=save --bitrate 15000,--preset ultrafast --no-cutree --analysis-mode=load --bitrate 15000
3
 News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0
4
 News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
5
+News-4k.y4m,--preset veryslow --no-rskip
6
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
7
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset medium --no-weightp
8
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune fastdecode
9
x265_1.9.tar.gz/source/test/testbench.cpp -> x265_2.0.tar.gz/source/test/testbench.cpp Changed
37
 
1
@@ -169,6 +169,9 @@
2
         { "XOP", X265_CPU_XOP },
3
         { "AVX2", X265_CPU_AVX2 },
4
         { "BMI2", X265_CPU_AVX2 | X265_CPU_BMI1 | X265_CPU_BMI2 },
5
+        { "ARMv6", X265_CPU_ARMV6 },
6
+        { "NEON", X265_CPU_NEON },
7
+        { "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
8
         { "", 0 },
9
     };
10
 
11
@@ -182,6 +185,7 @@
12
         else
13
             continue;
14
 
15
+#if X265_ARCH_X86
16
         EncoderPrimitives vecprim;
17
         memset(&vecprim, 0, sizeof(vecprim));
18
         setupInstrinsicPrimitives(vecprim, test_arch[i].flag);
19
@@ -197,6 +201,7 @@
20
                 return -1;
21
             }
22
         }
23
+#endif
24
 
25
         EncoderPrimitives asmprim;
26
         memset(&asmprim, 0, sizeof(asmprim));
27
@@ -220,7 +225,9 @@
28
 
29
     EncoderPrimitives optprim;
30
     memset(&optprim, 0, sizeof(optprim));
31
+#if X265_ARCH_X86
32
     setupInstrinsicPrimitives(optprim, cpuid);
33
+#endif
34
     setupAssemblyPrimitives(optprim, cpuid);
35
 
36
     /* Note that we do not setup aliases for performance tests, that would be
37
x265_1.9.tar.gz/source/test/testharness.h -> x265_2.0.tar.gz/source/test/testharness.h Changed
37
 
1
@@ -32,7 +32,6 @@
2
 #pragma warning(disable: 4324) // structure was padded due to __declspec(align())
3
 #endif
4
 
5
-#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
6
 #define PIXEL_MIN 0
7
 #define SHORT_MAX  32767
8
 #define SHORT_MIN -32767
9
@@ -75,10 +74,17 @@
10
 {
11
     uint32_t a = 0;
12
 
13
+#if X265_ARCH_X86
14
     asm volatile("rdtsc" : "=a" (a) ::"edx");
15
+#elif X265_ARCH_ARM
16
+    // TO-DO: verify following inline asm to get cpu Timestamp Counter for ARM arch
17
+    // asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(a));
18
+
19
+    // TO-DO: replace clock() function with appropriate ARM cpu instructions
20
+    a = clock();
21
+#endif
22
     return a;
23
 }
24
-
25
 #endif // ifdef _MSC_VER
26
 
27
 #define BENCH_RUNS 1000
28
@@ -125,7 +131,7 @@
29
  * needs an explicit asm check because it only sometimes crashes in normal use. */
30
 intptr_t PFX(checkasm_call)(intptr_t (*func)(), int *ok, ...);
31
 float PFX(checkasm_call_float)(float (*func)(), int *ok, ...);
32
-#else
33
+#elif X265_ARCH_ARM == 0
34
 #define PFX(stack_pagealign)(func, align) func()
35
 #endif
36
 
37
x265_1.9.tar.gz/source/x265-extras.cpp -> x265_2.0.tar.gz/source/x265-extras.cpp Changed
111
 
1
@@ -46,17 +46,17 @@
2
         return NULL;
3
     }
4
 
5
-    FILE *csvfp = fopen(fname, "r");
6
+    FILE *csvfp = x265_fopen(fname, "r");
7
     if (csvfp)
8
     {
9
         /* file already exists, re-open for append */
10
         fclose(csvfp);
11
-        return fopen(fname, "ab");
12
+        return x265_fopen(fname, "ab");
13
     }
14
     else
15
     {
16
         /* new CSV file, write header */
17
-        csvfp = fopen(fname, "wb");
18
+        csvfp = x265_fopen(fname, "wb");
19
         if (csvfp)
20
         {
21
             if (level)
22
@@ -280,9 +280,9 @@
23
     fprintf(csvfp, " %-6u, %-6u, %s\n", stats.maxCLL, stats.maxFALL, api.version_str);
24
 }
25
 
26
-/* The dithering algorithm is based on Sierra-2-4A error diffusion. */
27
-static void ditherPlane(pixel *dst, int dstStride, uint16_t *src, int srcStride,
28
-                        int width, int height, int16_t *errors, int bitDepth)
29
+/* The dithering algorithm is based on Sierra-2-4A error diffusion.
30
+ * We convert planes in place (without allocating a new buffer). */
31
+static void ditherPlane(uint16_t *src, int srcStride, int width, int height, int16_t *errors, int bitDepth)
32
 {
33
     const int lShift = 16 - bitDepth;
34
     const int rShift = 16 - bitDepth + 2;
35
@@ -290,15 +290,34 @@
36
     const int pixelMax = (1 << bitDepth) - 1;
37
 
38
     memset(errors, 0, (width + 1) * sizeof(int16_t));
39
-    int pitch = 1;
40
-    for (int y = 0; y < height; y++, src += srcStride, dst += dstStride)
41
+
42
+    if (bitDepth == 8)
43
     {
44
-        int16_t err = 0;
45
-        for (int x = 0; x < width; x++)
46
+        for (int y = 0; y < height; y++, src += srcStride)
47
         {
48
-            err = err * 2 + errors[x] + errors[x + 1];
49
-            dst[x * pitch] = (pixel)x265_clip3(0, pixelMax, ((src[x * 1] << 2) + err + half) >> rShift);
50
-            errors[x] = err = src[x * pitch] - (dst[x * pitch] << lShift);
51
+            uint8_t* dst = (uint8_t *)src;
52
+            int16_t err = 0;
53
+            for (int x = 0; x < width; x++)
54
+            {
55
+                err = err * 2 + errors[x] + errors[x + 1];
56
+                int tmpDst = x265_clip3(0, pixelMax, ((src[x] << 2) + err + half) >> rShift);
57
+                errors[x] = err = (int16_t)(src[x] - (tmpDst << lShift));
58
+                dst[x] = (uint8_t)tmpDst;
59
+            }
60
+        }
61
+    }
62
+    else
63
+    {
64
+        for (int y = 0; y < height; y++, src += srcStride)
65
+        {
66
+            int16_t err = 0;
67
+            for (int x = 0; x < width; x++)
68
+            {
69
+                err = err * 2 + errors[x] + errors[x + 1];
70
+                int tmpDst = x265_clip3(0, pixelMax, ((src[x] << 2) + err + half) >> rShift);
71
+                errors[x] = err = (int16_t)(src[x] - (tmpDst << lShift));
72
+                src[x] = (uint16_t)tmpDst;
73
+            }
74
         }
75
     }
76
 }
77
@@ -317,10 +336,16 @@
78
         return;
79
     }
80
 
81
+    if (picIn.bitDepth == bitDepth)
82
+    {
83
+        fprintf(stderr, "extras[error]: dither support enabled only if encoder depth is different from picture depth\n");
84
+        return;
85
+    }
86
+
87
     /* This portion of code is from readFrame in x264. */
88
     for (int i = 0; i < x265_cli_csps[picIn.colorSpace].planes; i++)
89
     {
90
-        if ((picIn.bitDepth & 7) && (picIn.bitDepth != 16))
91
+        if (picIn.bitDepth < 16)
92
         {
93
             /* upconvert non 16bit high depth planes to 16bit */
94
             uint16_t *plane = (uint16_t*)picIn.planes[i];
95
@@ -332,14 +357,10 @@
96
             for (uint32_t j = 0; j < pixelCount; j++)
97
                 plane[j] = plane[j] << lShift;
98
         }
99
-    }
100
 
101
-    for (int i = 0; i < x265_cli_csps[picIn.colorSpace].planes; i++)
102
-    {
103
         int height = (int)(picHeight >> x265_cli_csps[picIn.colorSpace].height[i]);
104
         int width = (int)(picWidth >> x265_cli_csps[picIn.colorSpace].width[i]);
105
 
106
-        ditherPlane(((pixel*)picIn.planes[i]), picIn.stride[i] / sizeof(pixel), ((uint16_t*)picIn.planes[i]),
107
-                    picIn.stride[i] / 2, width, height, errorBuf, bitDepth);
108
+        ditherPlane(((uint16_t*)picIn.planes[i]), picIn.stride[i] / 2, width, height, errorBuf, bitDepth);
109
     }
110
 }
111
x265_1.9.tar.gz/source/x265.cpp -> x265_2.0.tar.gz/source/x265.cpp Changed
128
 
1
@@ -29,14 +29,10 @@
2
 #include "x265-extras.h"
3
 #include "x265cli.h"
4
 
5
-#include "common.h"
6
 #include "input/input.h"
7
 #include "output/output.h"
8
 #include "output/reconplay.h"
9
 
10
-#include "param.h"
11
-#include "cpu.h"
12
-
13
 #if HAVE_VLD
14
 /* Visual Leak Detector */
15
 #include <vld.h>
16
@@ -312,12 +308,9 @@
17
             OPT("recon-y4m-exec") reconPlayCmd = optarg;
18
             OPT("qpfile")
19
             {
20
-                this->qpfile = fopen(optarg, "rb");
21
+                this->qpfile = x265_fopen(optarg, "rb");
22
                 if (!this->qpfile)
23
-                {
24
-                    x265_log(param, X265_LOG_ERROR, "%s qpfile not found or error in opening qp file\n", optarg);
25
-                    return false;
26
-                }
27
+                    x265_log_file(param, X265_LOG_ERROR, "%s qpfile not found or error in opening qp file\n", optarg);
28
             }
29
             else
30
                 bError |= !!api->param_parse(param, long_options[long_options_index].name, optarg);
31
@@ -378,7 +371,7 @@
32
     this->input = InputFile::open(info, this->bForceY4m);
33
     if (!this->input || this->input->isFail())
34
     {
35
-        x265_log(param, X265_LOG_ERROR, "unable to open input file <%s>\n", inputfn);
36
+        x265_log_file(param, X265_LOG_ERROR, "unable to open input file <%s>\n", inputfn);
37
         return true;
38
     }
39
 
40
@@ -455,10 +448,10 @@
41
     this->output = OutputFile::open(outputfn, info);
42
     if (this->output->isFail())
43
     {
44
-        x265_log(param, X265_LOG_ERROR, "failed to open output file <%s> for writing\n", outputfn);
45
+        x265_log_file(param, X265_LOG_ERROR, "failed to open output file <%s> for writing\n", outputfn);
46
         return true;
47
     }
48
-    general_log(param, this->output->getName(), X265_LOG_INFO, "output file: %s\n", outputfn);
49
+    general_log_file(param, this->output->getName(), X265_LOG_INFO, "output file: %s\n", outputfn);
50
     return false;
51
 }
52
 
53
@@ -497,6 +490,39 @@
54
     return 1;
55
 }
56
 
57
+#ifdef _WIN32
58
+/* Copy of x264 code, which allows for Unicode characters in the command line.
59
+ * Retrieve command line arguments as UTF-8. */
60
+static int get_argv_utf8(int *argc_ptr, char ***argv_ptr)
61
+{
62
+    int ret = 0;
63
+    wchar_t **argv_utf16 = CommandLineToArgvW(GetCommandLineW(), argc_ptr);
64
+    if (argv_utf16)
65
+    {
66
+        int argc = *argc_ptr;
67
+        int offset = (argc + 1) * sizeof(char*);
68
+        int size = offset;
69
+
70
+        for (int i = 0; i < argc; i++)
71
+            size += WideCharToMultiByte(CP_UTF8, 0, argv_utf16[i], -1, NULL, 0, NULL, NULL);
72
+
73
+        char **argv = *argv_ptr = (char**)malloc(size);
74
+        if (argv)
75
+        {
76
+            for (int i = 0; i < argc; i++)
77
+            {
78
+                argv[i] = (char*)argv + offset;
79
+                offset += WideCharToMultiByte(CP_UTF8, 0, argv_utf16[i], -1, argv[i], size - offset, NULL, NULL);
80
+            }
81
+            argv[argc] = NULL;
82
+            ret = 1;
83
+        }
84
+        LocalFree(argv_utf16);
85
+    }
86
+    return ret;
87
+}
88
+#endif
89
+
90
 /* CLI return codes:
91
  *
92
  * 0 - encode successful
93
@@ -517,6 +543,10 @@
94
 
95
     GetConsoleTitle(orgConsoleTitle, CONSOLE_TITLE_SIZE);
96
     SetThreadExecutionState(ES_CONTINUOUS | ES_SYSTEM_REQUIRED | ES_AWAYMODE_REQUIRED);
97
+#if _WIN32
98
+    char** orgArgv = argv;
99
+    get_argv_utf8(&argc, &argv);
100
+#endif
101
 
102
     ReconPlay* reconPlay = NULL;
103
     CLIOptions cliopt;
104
@@ -560,7 +590,7 @@
105
         cliopt.csvfpt = x265_csvlog_open(*api, *param, cliopt.csvfn, cliopt.csvLogLevel);
106
         if (!cliopt.csvfpt)
107
         {
108
-            x265_log(param, X265_LOG_ERROR, "Unable to open CSV log file <%s>, aborting\n", cliopt.csvfn);
109
+            x265_log_file(param, X265_LOG_ERROR, "Unable to open CSV log file <%s>, aborting\n", cliopt.csvfn);
110
             cliopt.destroy();
111
             if (cliopt.api)
112
                 cliopt.api->param_free(cliopt.param);
113
@@ -747,6 +777,14 @@
114
     SetConsoleTitle(orgConsoleTitle);
115
     SetThreadExecutionState(ES_CONTINUOUS);
116
 
117
+#if _WIN32
118
+    if (argv != orgArgv)
119
+    {
120
+        free(argv);
121
+        argv = orgArgv;
122
+    }
123
+#endif
124
+
125
 #if HAVE_VLD
126
     assert(VLDReportLeaks() == 0);
127
 #endif
128
x265_1.9.tar.gz/source/x265.h -> x265_2.0.tar.gz/source/x265.h Changed
111
 
1
@@ -98,9 +98,9 @@
2
     uint32_t         sliceType;
3
     uint32_t         numCUsInFrame;
4
     uint32_t         numPartitions;
5
+    int              bScenecut;
6
     void*            interData;
7
     void*            intraData;
8
-    int              bScenecut;
9
 } x265_analysis_data;
10
 
11
 /* cu statistics */
12
@@ -221,6 +221,14 @@
13
     /* Frame level statistics */
14
     x265_frame_stats frameData;
15
 
16
+    /* Ratecontrol statistics for collecting the ratecontrol information.
17
+     * It is not used for collecting the last pass ratecontrol data in 
18
+     * multi pass ratecontrol mode. */
19
+    void*  rcData;
20
+
21
+    uint64_t framesize;
22
+
23
+    int    height;
24
 } x265_picture;
25
 
26
 typedef enum
27
@@ -587,6 +595,11 @@
28
      * Main (0) and High (1) tier. Default is Main tier (0) */
29
     int       bHighTier;
30
 
31
+    /* Enable UHD Blu-ray compatibility support. If specified, the encoder will
32
+     * attempt to modify/set the encode specifications. If the encoder is unable 
33
+     * to do so, this option will be turned OFF. */
34
+    int       uhdBluray;
35
+
36
     /* The maximum number of L0 references a P or B slice may use. This
37
      * influences the size of the decoded picture buffer. The higher this
38
      * number, the more reference frames there will be available for motion
39
@@ -764,7 +777,7 @@
40
      * enabled). At level 2 rate-distortion cost is used to make decimate decisions
41
      * on each 4x4 coding group (including the cost of signaling the group within
42
      * the group bitmap).  Psy-rdoq is less effective at preserving energy when
43
-     * RDOQ is at level 2 */
44
+     * RDOQ is at level 2. Default: 0 */
45
     int       rdoqLevel;
46
 
47
     /* Enable the implicit signaling of the sign bit of the last coefficient of
48
@@ -896,23 +909,27 @@
49
     /* Note: when deblocking and SAO are both enabled, the loop filter CU lag is
50
      * only one row, as they operate in series on the same row. */
51
 
52
-    /* Select the method in which SAO deals with deblocking boundary pixels.  If
53
+    /* Select the method in which SAO deals with deblocking boundary pixels. If
54
      * disabled the right and bottom boundary areas are skipped. If enabled,
55
      * non-deblocked pixels are used entirely. Default is disabled */
56
     int       bSaoNonDeblocked;
57
 
58
     /*== Analysis tools ==*/
59
 
60
-    /* A value between X265_NO_RDO_NO_RDOQ and X265_RDO_LEVEL which determines
61
-     * the level of rate distortion optimizations to perform during mode
62
-     * decisions and quantization. The more RDO the better the compression
63
-     * efficiency at a major cost of performance. Default is no RDO (0) */
64
+    /* A value between 1 and 6 (both inclusive) which determines the level of 
65
+     * rate distortion optimizations to perform during mode and depth decisions.
66
+     * The more RDO the better the compression efficiency at a major cost of 
67
+     * performance. Default is 3 */
68
     int       rdLevel;
69
 
70
-    /* Enable early skip decisions to avoid intra and inter analysis in likely
71
+    /* Enable early skip decisions to avoid analysing additional modes in likely
72
      * skip blocks. Default is disabled */
73
     int       bEnableEarlySkip;
74
 
75
+    /* Enable early CU size decisions to avoid recursing to higher depths. 
76
+     * Default is enabled */
77
+    int bEnableRecursionSkip;
78
+
79
     /* Use a faster search method to find the best intra mode. Default is 0 */
80
     int       bEnableFastIntra;
81
 
82
@@ -947,10 +964,16 @@
83
     double    psyRd;
84
 
85
     /* Strength of psycho-visual optimizations in quantization. Only has an
86
-     * effect in presets which use RDOQ (rd-levels 4 and 5).  The value must be
87
-     * between 0 and 50, 1.0 is typical. Default 1.0 */
88
+     * effect when RDOQ is enabled (presets slow, slower and veryslow). The 
89
+     * value must be between 0 and 50, 1.0 is typical. Default 0 */
90
     double    psyRdoq;
91
 
92
+    /* Perform quantisation parameter based RD refinement. RD cost is calculated
93
+     * on the best CU partitions, chosen after the CU analysis, for a range of QPs
94
+     * to find the optimal rounding effect. Only effective at rd-levels 5 and 6.
95
+     * Default disabled */
96
+    int       bEnableRdRefine;
97
+
98
     /* If X265_ANALYSIS_SAVE, write per-frame analysis information into analysis
99
      * buffers.  If X265_ANALYSIS_LOAD, read analysis information into analysis
100
      * buffer and use this analysis information to reduce the amount of work
101
@@ -1083,6 +1106,9 @@
102
          * (QG) size. Allowed values are 64, 32, 16 provided it falls within the
103
          * inclusive range [maxCUSize, minCUSize]. Experimental, default: maxCUSize */
104
         uint32_t qgSize;
105
+
106
+        /* internally enable if tune grain is set */
107
+        int      bEnableGrain;
108
     } rc;
109
 
110
     /*== Video Usability Information ==*/
111
x265_1.9.tar.gz/source/x265cli.h -> x265_2.0.tar.gz/source/x265cli.h Changed
67
 
1
@@ -53,6 +53,7 @@
2
     { "profile",        required_argument, NULL, 'P' },
3
     { "level-idc",      required_argument, NULL, 0 },
4
     { "high-tier",            no_argument, NULL, 0 },
5
+    { "uhd-bd",               no_argument, NULL, 0 },
6
     { "no-high-tier",         no_argument, NULL, 0 },
7
     { "allow-non-conformance",no_argument, NULL, 0 },
8
     { "no-allow-non-conformance",no_argument, NULL, 0 },
9
@@ -96,6 +97,8 @@
10
     { "amp",                  no_argument, NULL, 0 },
11
     { "no-early-skip",        no_argument, NULL, 0 },
12
     { "early-skip",           no_argument, NULL, 0 },
13
+    { "no-rskip",             no_argument, NULL, 0 },
14
+    { "rskip",                no_argument, NULL, 0 },
15
     { "no-fast-cbf",          no_argument, NULL, 0 },
16
     { "fast-cbf",             no_argument, NULL, 0 },
17
     { "no-tskip",             no_argument, NULL, 0 },
18
@@ -143,6 +146,8 @@
19
     { "qp",             required_argument, NULL, 'q' },
20
     { "aq-mode",        required_argument, NULL, 0 },
21
     { "aq-strength",    required_argument, NULL, 0 },
22
+    { "rc-grain",             no_argument, NULL, 0 },
23
+    { "no-rc-grain",          no_argument, NULL, 0 },
24
     { "ipratio",        required_argument, NULL, 0 },
25
     { "pbratio",        required_argument, NULL, 0 },
26
     { "qcomp",          required_argument, NULL, 0 },
27
@@ -159,6 +164,8 @@
28
     { "psy-rdoq",       required_argument, NULL, 0 },
29
     { "no-psy-rd",            no_argument, NULL, 0 },
30
     { "no-psy-rdoq",          no_argument, NULL, 0 },
31
+    { "rd-refine",            no_argument, NULL, 0 },
32
+    { "no-rd-refine",         no_argument, NULL, 0 },
33
     { "scaling-list",   required_argument, NULL, 0 },
34
     { "lossless",             no_argument, NULL, 0 },
35
     { "no-lossless",          no_argument, NULL, 0 },
36
@@ -279,6 +286,7 @@
37
     H0("-P/--profile <string>            Enforce an encode profile: main, main10, mainstillpicture\n");
38
     H0("   --level-idc <integer|float>   Force a minimum required decoder level (as '5.0' or '50')\n");
39
     H0("   --[no-]high-tier              If a decoder level is specified, this modifier selects High tier of that level\n");
40
+    H0("   --uhd-bd                      Enable UHD Bluray compatibility support\n");
41
     H0("   --[no-]allow-non-conformance  Allow the encoder to generate profile NONE bitstreams. Default %s\n", OPT(param->bAllowNonConformance));
42
     H0("\nThreading, performance:\n");
43
     H0("   --pools <integer,...>         Comma separated thread count per thread pool (pool per NUMA node)\n");
44
@@ -300,11 +308,13 @@
45
     H0("   --tu-intra-depth <integer>    Max TU recursive depth for intra CUs. Default %d\n", param->tuQTMaxIntraDepth);
46
     H0("   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
47
     H0("\nAnalysis:\n");
48
-    H0("   --rd <0..6>                   Level of RDO in mode decision 0:least....6:full RDO. Default %d\n", param->rdLevel);
49
+    H0("   --rd <1..6>                   Level of RDO in mode decision 1:least....6:full RDO. Default %d\n", param->rdLevel);
50
     H0("   --[no-]psy-rd <0..5.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
51
     H0("   --[no-]rdoq-level <0|1|2>     Level of RDO in quantization 0:none, 1:levels, 2:levels & coding groups. Default %d\n", param->rdoqLevel);
52
     H0("   --[no-]psy-rdoq <0..50.0>     Strength of psycho-visual optimization in RDO quantization, 0 to disable. Default %.1f\n", param->psyRdoq);
53
+    H0("   --[no-]rd-refine              Enable QP based RD refinement for rd levels 5 and 6. Default %s\n", OPT(param->bEnableRdRefine));
54
     H0("   --[no-]early-skip             Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
55
+    H0("   --[no-]rskip                  Enable early exit from recursion. Default %s\n", OPT(param->bEnableRecursionSkip));
56
     H1("   --[no-]tskip-fast             Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
57
     H1("   --nr-intra <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n");
58
     H1("   --nr-inter <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in inter CUs. Default 0\n");
59
@@ -373,6 +383,7 @@
60
     H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
61
     H0("   --qg-size <int>               Specifies the size of the quantization group (64, 32, 16). Default %d\n", param->rc.qgSize);
62
     H0("   --[no-]cutree                 Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
63
+    H0("   --[no-]rc-grain               Enable ratecontrol mode to handle grains specifically. turned on with tune grain. Default %s\n", OPT(param->rc.bEnableGrain));
64
     H1("   --ipratio <float>             QP factor between I and P. Default %.2f\n", param->rc.ipFactor);
65
     H1("   --pbratio <float>             QP factor between P and B. Default %.2f\n", param->rc.pbFactor);
66
     H1("   --qcomp <float>               Weight given to predicted complexity. Default %.2f\n", param->rc.qCompress);
67