Overview

Request 2870 (accepted)

Update to version 2.0

Submit package home:enzokiel:branches:Essentials / x265 to package Essentials / x265

x265.changes Changed

@@ -1,4 +1,40 @@
 -------------------------------------------------------------------
+Sun Aug 28 11:51:23 UTC 2016 - joerg.lorenzen@ki.tng.de
+
+- Update to version 2.0
+  API and Key Behavior Changes
+  * x265_rc_stats added to x265_picture, containing all RC decision
+    points for that frame.
+  * PTL: high tier is now allowed by default, chosen only if
+    necessary.
+  * multi-pass: First pass now uses slow-firstpass by default,
+    enabling better RC decisions in future passes.
+  * pools: fix behaviour on multi-socketed Windows systems, provide
+    more flexibility in determining thread and pool counts.
+  * ABR: improve bits allocation in the first few frames, abr reset,
+    vbv and cutree improved.
+  New Features
+  * uhd-bd: Enforce Ultra-HD Blu-ray Disc parameters
+    (overrides any other settings).
+  * rskip: Enables skipping recursion to analyze lower CU sizes
+    using heuristics at different rd-levels. Provides good visual
+    quality gains at the highest quality presets.
+  * rc-grain: Enables a new rate control mode specifically for
+    grainy content. Strictly prevents QP oscillations within and
+    between frames to avoid grain fluctuations.
+  * tune grain: A fully refactored and improved option to encode
+    film grain content including QP control as well as analysis
+    options.
+  * asm: ARM assembly is now enabled by default, native or cross
+    compiled builds supported on armv6 and later systems.
+  Misc
+  * An SSIM calculation bug was corrected
+- soname bump to 87.
+- Fixed arm.patch.
+- Added libnuma-devel as buildrequires for arch x86_64 (except
+  for openSUSE 13.1 because libnuma-devel >= 2.0.9 is required).
+
+-------------------------------------------------------------------
 Wed Feb  3 13:22:42 UTC 2016 - idonmez@suse.com
 
 - Update to version 1.9
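
The multi-pass change above means a default two-pass run now spends more effort in the first pass. A minimal sketch of the new behavior (input name and bitrate are placeholders):

    # Two-pass ABR: with 2.0 the first pass uses slow-firstpass semantics
    # by default; add --no-slow-firstpass to get the old turbo first pass.
    x265 --input in.y4m --bitrate 5000 --pass 1 --output /dev/null
    x265 --input in.y4m --bitrate 5000 --pass 2 --output out.hevc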
x265.spec Changed

@@ -1,10 +1,10 @@
 # based on the spec file from https://build.opensuse.org/package/view_file/home:Simmphonie/libx265/
 
 Name:           x265
-%define soname  79
+%define soname  87
 %define libname lib%{name}
 %define libsoname %{libname}-%{soname}
-Version:        1.9
+Version:        2.0
 Release:        0
 License:        GPL-2.0+
 Summary:        A free h265/HEVC encoder - encoder binary
@@ -14,6 +14,13 @@
 Patch0:         arm.patch
 BuildRequires:  gcc gcc-c++
 BuildRequires:  cmake >= 2.8.8
+# for openSUSE 13.1 only libnuma-devel = 2.0.8 is available, but version 2.0.9 or higher is required
+# building against version 2.0.8 fails with "error: 'numa_bitmask_weight' was not declared in this scope"
+%if ! ( 0%{?suse_version} == 1310 )
+%ifarch x86_64
+BuildRequires:  libnuma-devel >= 2.0.9
+%endif
+%endif
 BuildRequires:  pkg-config
 BuildRequires:  yasm >= 1.2.0
 BuildRoot:      %{_tmppath}/%{name}-%{version}-build
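
Before building on an x86_64 host, the NUMA constraint above can be verified by hand; a minimal sketch (standard rpm query, package name as in the spec):

    # numa_bitmask_weight first shipped in 2.0.9, hence the versioned requirement.
    rpm -q --queryformat '%{VERSION}\n' libnuma-devel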
arm.patch Changed

@@ -1,19 +1,25 @@
-Index: x265_11047/source/CMakeLists.txt
+Index: x265_2.0/source/CMakeLists.txt
 ===================================================================
---- x265_11047.orig/source/CMakeLists.txt
-+++ x265_11047/source/CMakeLists.txt
-@@ -56,10 +56,22 @@ elseif(POWERMATCH GREATER "-1")
+--- x265_2.0.orig/source/CMakeLists.txt
++++ x265_2.0/source/CMakeLists.txt
+@@ -60,15 +60,22 @@
      message(STATUS "Detected POWER target processor")
      set(POWER 1)
      add_definitions(-DX265_ARCH_POWER=1)
+-elseif(ARMMATCH GREATER "-1")
+-    if(CROSS_COMPILE_ARM)
+-        message(STATUS "Cross compiling for ARM arch")
+-    else()
+-        set(CROSS_COMPILE_ARM 0)
+-    endif()
+-    message(STATUS "Detected ARM target processor")
+-    set(ARM 1)
+-    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
 +elseif(${SYSPROC} MATCHES "armv5.*")
 +    message(STATUS "Detected ARMV5 system processor")
 +    set(ARMV5 1)
 +    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=0 -DHAVE_NEON=0)
- elseif(${SYSPROC} STREQUAL "armv6l")
--    message(STATUS "Detected ARM target processor")
--    set(ARM 1)
--    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
++elseif(${SYSPROC} STREQUAL "armv6l")
 +    message(STATUS "Detected ARMV6 system processor")
 +    set(ARMV6 1)
 +    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1 -DHAVE_NEON=0)
@@ -28,21 +34,32 @@
  else()
      message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
      message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
-@@ -169,8 +181,8 @@ if(GCC)
-     elseif(X86 AND NOT X64)
-         add_definitions(-march=i686)
+@@ -186,18 +193,9 @@
+             add_definitions(-march=i686)
+         endif()
      endif()
--    if(ARM)
--        add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp)
+-    if(ARM AND CROSS_COMPILE_ARM)
+-        set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
+-    elseif(ARM)
+-        find_package(Neon)
+-        if(CPU_HAS_NEON)
+-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
+-            add_definitions(-DHAVE_NEON)
+-        else()
+-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
+-        endif()
+-    endif()
+-    add_definitions(${ARM_ARGS})
 +    if(ARMV7)
 +        add_definitions(-fPIC)
-     endif()
++    endif()
      if(FPROFILE_GENERATE)
          if(INTEL_CXX)
-Index: x265_11047/source/common/cpu.cpp
+             add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
+Index: x265_2.0/source/common/cpu.cpp
 ===================================================================
---- x265_11047.orig/source/common/cpu.cpp
-+++ x265_11047/source/common/cpu.cpp
+--- x265_2.0.orig/source/common/cpu.cpp
++++ x265_2.0/source/common/cpu.cpp
 @@ -37,7 +37,7 @@
  #include <machine/cpu.h>
  #endif
@@ -52,3 +69,20 @@
  #include <signal.h>
  #include <setjmp.h>
  static sigjmp_buf jmpbuf;
+@@ -340,7 +340,6 @@
+     }
+ 
+     canjump = 1;
+-    PFX(cpu_neon_test)();
+     canjump = 0;
+     signal(SIGILL, oldsig);
+ #endif // if !HAVE_NEON
+@@ -356,7 +355,7 @@
+     // which may result in incorrect detection and the counters stuck enabled.
+     // right now Apple does not seem to support performance counters for this test
+ #ifndef __MACH__
+-    flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
++    //flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
+ #endif
+     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
+ #endif // if HAVE_ARMV6
x265_1.9.tar.gz/.hg_archival.txt -> x265_2.0.tar.gz/.hg_archival.txt Changed

@@ -1,4 +1,4 @@
 repo: 09fe40627f03a0f9c3e6ac78b22ac93da23f9fdf
-node: 1d3b6e448e01ec40b392ef78b7e55a86249fbe68
+node: 960c9991d0dcf46559c32e070418d3cbb7e8aa2f
 branch: stable
-tag: 1.9
+tag: 2.0
x265_1.9.tar.gz/.hgtags -> x265_2.0.tar.gz/.hgtags Changed

@@ -17,3 +17,4 @@
 cbeb7d8a4880e4020c4545dd8e498432c3c6cad3 1.6
 8425278def1edf0931dc33fc518e1950063e76b0 1.7
 e27327f5da35c5feb660360336fdc94bd0afe719 1.8
+1d3b6e448e01ec40b392ef78b7e55a86249fbe68 1.9
x265_2.0.tar.gz/build/arm-linux/crosscompile.cmake Added

@@ -0,0 +1,15 @@
+# CMake toolchain file for cross compiling x265 for ARM arch
+# This feature is only supported as experimental. Use with caution.
+# Please report bugs on bitbucket
+# Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source
+
+set(CROSS_COMPILE_ARM 1)
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR armv6l)
+
+# specify the cross compiler
+set(CMAKE_C_COMPILER arm-linux-gnueabi-gcc)
+set(CMAKE_CXX_COMPILER arm-linux-gnueabi-g++)
+
+# specify the target environment
+SET(CMAKE_FIND_ROOT_PATH  /usr/arm-linux-gnueabi)
x265_2.0.tar.gz/build/arm-linux/make-Makefiles.bash Added

@@ -0,0 +1,4 @@
+#!/bin/bash
+# Run this from within a bash shell
+
+cmake -G "Unix Makefiles" ../../source && ccmake ../../source
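
The two helper files above are meant to be used together; a minimal sketch, assuming the arm-linux-gnueabi cross toolchain named in crosscompile.cmake is installed:

    # Out-of-tree cross build, as the toolchain file's own comment suggests
    # (the trailing ccmake step is optional interactive tuning).
    cd build/arm-linux
    cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source
    make -j"$(nproc)"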
x265_1.9.tar.gz/doc/reST/api.rst -> x265_2.0.tar.gz/doc/reST/api.rst Changed

@@ -180,7 +180,8 @@
     *       used to modify encoder parameters.
     *      various parameters from x265_param are copied.
     *      this takes effect immediately, on whichever frame is encoded next;
-    *      returns 0 on success, negative on parameter validation error.
+    *      returns negative on parameter validation error, 0 on successful reconfigure
+    *      and 1 when a reconfigure is already in progress.
     *
     *      not all parameters can be changed; see the actual function for a
     *      detailed breakdown.  since not all parameters can be changed, moving
x265_1.9.tar.gz/doc/reST/cli.rst -> x265_2.0.tar.gz/doc/reST/cli.rst Changed

@@ -376,10 +376,10 @@
 
 .. option:: --dither
 
-   Enable high quality downscaling. Dithering is based on the diffusion
-   of errors from one row of pixels to the next row of pixels in a
-   picture. Only applicable when the input bit depth is larger than
-   8bits and internal bit depth is 8bits. Default disabled
+   Enable high quality downscaling to the encoder's internal bitdepth.
+   Dithering is based on the diffusion of errors from one row of pixels
+   to the next row of pixels in a picture. Only applicable when the
+   input bit depth is larger than 8bits. Default disabled
 
    **CLI ONLY**
 
@@ -522,16 +522,14 @@
 
 .. option:: --high-tier, --no-high-tier
 
-   If :option:`--level-idc` has been specified, the option adds the
-   intention to support the High tier of that level. If your specified
-   level does not support a High tier, a warning is issued and this
-   modifier flag is ignored. If :option:`--level-idc` has been specified,
-   but not --high-tier, then the encoder will attempt to encode at the
-   specified level, main tier first, turning on high tier only if
-   necessary and available at that level.
+   If :option:`--level-idc` has been specified, --high-tier allows the
+   support of high tier at that level. The encoder will first attempt to encode
+   at the specified level, main tier first, turning on high tier only if
+   necessary and available at that level. If your requested level does not
+   support a High tier, high tier will not be supported. If --no-high-tier
+   has been specified, then the encoder will attempt to encode only at the main tier.
 
-   If :option:`--level-idc` has not been specified, this argument is
-   ignored.
+   Default: enabled
 
 .. option:: --ref <1..16>
 
@@ -564,6 +562,15 @@
 
    Default: disabled
 
+.. option:: --uhd-bd
+
+    Enable Ultra HD Blu-ray format support. If specified with incompatible
+    encoding options, the encoder will attempt to modify/set the right
+    encode specifications. If the encoder is unable to do so, this option
+    will be turned OFF. Highly experimental.
+
+    Default: disabled
+
 .. note::
 
    :option:`--profile`, :option:`--level-idc`, and
@@ -600,7 +607,7 @@
 Mode decision / Analysis
 ========================
 
-.. option:: --rd <0..6>
+.. option:: --rd <1..6>
 
    Level of RDO in mode decision. The higher the value, the more
   exhaustive the analysis and the more rate distortion optimization is
@@ -629,7 +636,7 @@
    | 6     | Currently same as 5                                           |
    +-------+---------------------------------------------------------------+
 
-   **Range of values:** 0: least .. 6: full RDO analysis
+   **Range of values:** 1: least .. 6: full RDO analysis
 
 Options which affect the coding unit quad-tree, sometimes referred to as
 the prediction quad-tree.
@@ -722,8 +729,18 @@
 
 .. option:: --early-skip, --no-early-skip
 
-   Measure full CU size (2Nx2N) merge candidates first; if no residual
-   is found the analysis is short circuited. Default disabled
+   Measure 2Nx2N merge candidates first; if no residual is found,
+   additional modes at that depth are not analysed. Default disabled
+
+.. option:: --rskip, --no-rskip
+
+   This option determines early exit from CU depth recursion. When a skip CU is
+   found, additional heuristics (depending on rd-level) are used to decide whether
+   to terminate recursion. In rd-levels 5 and 6, comparison with inter2Nx2N is used,
+   while at rd-levels 4 and below, neighbour costs are used to skip recursion.
+   Provides minimal quality degradation at good performance gains when enabled.
+
+   Default: enabled, disabled for :option:`--tune grain`
 
 .. option:: --fast-intra, --no-fast-intra
 
@@ -756,6 +773,14 @@
    evaluate if luma used tskip. Inter block tskip analysis is
    unmodified. Default disabled
 
+.. option:: --rd-refine, --no-rd-refine
+
+   For each analysed CU, calculate R-D cost on the best partition mode
+   for a range of QP values, to find the optimal rounding effect.
+   Default disabled.
+
+   Only effective at RD levels 5 and 6
+
 Analysis re-use options, to improve performance when encoding the same
 sequence multiple times (presumably at varying bitrates). The encoder
 will not reuse analysis if the resolution and slice type parameters do
@@ -1039,7 +1064,7 @@
 cause ringing artifacts. psy-rdoq is less accurate than psy-rd, it is
 biasing towards energy in general while psy-rd biases towards the energy
 of the source image. But very large psy-rdoq values can sometimes be
-beneficial, preserving film grain for instance.
+beneficial.
 
 As a general rule, when both psycho-visual features are disabled, the
 encoder will tend to blur blocks in areas of difficult motion. Turning
@@ -1076,8 +1101,8 @@
    energy in the reconstructed image. This generally improves perceived
    visual quality at the cost of lower quality metric scores.  It only
    has effect when :option:`--rdoq-level` is 1 or 2. High values can
-   be beneficial in preserving high-frequency detail like film grain.
-   Default: 1.0
+   be beneficial in preserving high-frequency detail.
+   Default: 0.0 (1.0 for presets slow, slower, veryslow)
 
    **Range of values:** 0 .. 50.0
 
@@ -1336,13 +1361,13 @@
 
 .. option:: --slow-firstpass, --no-slow-firstpass
 
-   Enable a slow and more detailed first pass encode in multi-pass rate
-   control mode.  Speed of the first pass encode is slightly lesser and
-   quality mildly improved when compared to the default settings in a
-   multi-pass encode. Default disabled (turbo mode enabled)
+   Enable first pass encode with the exact settings specified.
+   The quality in subsequent multi-pass encodes is better
+   (compared to first pass) when the settings match across each pass.
+   Default enabled.
 
-   When **turbo** first pass is not disabled, these options are
-   set on the first pass to improve performance:
+   When slow first pass is disabled, a **turbo** encode with the following
+   go-fast options is used to improve performance:
   
    * :option:`--fast-intra`
    * :option:`--no-rect`
@@ -1408,7 +1433,16 @@
 
    The maximum single adjustment in QP allowed to rate control. Default
    4
-
+
+.. option:: --rc-grain, --no-rc-grain
+
+   Enables a specialised ratecontrol algorithm for film grain content. This
+   parameter strictly minimises QP fluctuations within and across frames
+   and removes pulsing of grain. Default disabled.
+   Enabled when :option:`--tune` grain is applied. It is highly recommended
+   that this option is used through the tune grain feature where a combination
+   of param options are used to improve visual quality.
+
 .. option:: --qblur <float>
 
    Temporally blur quants. Default 0.5
@@ -1660,10 +1694,13 @@
    a string which is parsed when the stream header SEI are emitted. The
    string format is "G(%hu,%hu)B(%hu,%hu)R(%hu,%hu)WP(%hu,%hu)L(%u,%u)"
    where %hu are unsigned 16bit integers and %u are unsigned 32bit
-   integers. The SEI includes X,Y display primaries for RGB channels,
-   white point X,Y and max,min luminance values. (HDR)
+   integers. The SEI includes X,Y display primaries for RGB channels
+   and white point (WP) in units of 0.00002 and max,min luminance (L)
+   values in units of 0.0001 candela per meter square. (HDR)
 
-   Example for D65P3 1000-nits:
+   Example for a P3D65 1000-nits monitor, where G(x=0.265, y=0.690),
+   B(x=0.150, y=0.060), R(x=0.680, y=0.320), WP(x=0.3127, y=0.3290),
+   L(max=1000, min=0.0001):
 
       G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(10000000,1)
 
@@ -1672,8 +1709,9 @@
 
 .. option:: --max-cll <string>
 
-   Maximum content light level and maximum frame average light level as
-   required by the Consumer Electronics Association 861.3 specification.
+   Maximum content light level (MaxCLL) and maximum frame average light
+   level (MaxFALL) as required by the Consumer Electronics Association
+   861.3 specification.
 
    Specified as a string which is parsed when the stream header SEI are
    emitted. The string format is "%hu,%hu" where %hu are unsigned 16bit
@@ -1681,6 +1719,11 @@
    maximum is indicated), the second value is the maximum picture
    average light level (or 0). (HDR)
 
+   Example for MaxCLL=1000 candela per square meter, MaxFALL=400
+   candela per square meter:
+
+       --max-cll "1000,400"
+
    Note that this string value will need to be escaped or quoted to
    protect against shell expansion on many platforms. No default.
 
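
Combining the two HDR examples above, a complete invocation signalling a P3D65 1000-nit mastering display with MaxCLL/MaxFALL of 1000/400 might look like this (a minimal sketch; input name, resolution and frame rate are placeholders):

    # The SEI strings are quoted, as the documentation above requires.
    x265 --input source.yuv --input-res 3840x2160 --fps 24 \
         --master-display "G(13250,34500)B(7500,3000)R(34000,16000)WP(15635,16450)L(10000000,1)" \
         --max-cll "1000,400" --output out.hevc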
x265_1.9.tar.gz/doc/reST/presets.rst -> x265_2.0.tar.gz/doc/reST/presets.rst Changed

@@ -21,68 +21,80 @@
 The presets adjust encoder parameters as shown in the following table.
 Any parameters below that are specified in your command-line will be
 changed from the value specified by the preset.
-
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-|                 |ultrafast |superfast |veryfast |faster |fast |medium |slow |slower |veryslow |placebo |
-+=================+==========+==========+=========+=======+=====+=======+=====+=======+=========+========+
-| ctu             |    32    |    32    |   64    |  64   | 64  |  64   | 64  |  64   |   64    |  64    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| min-cu-size     |    16    |     8    |    8    |   8   |  8  |   8   |  8  |   8   |    8    |   8    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| bframes         |     3    |     3    |    4    |   4   |  4  |   4   |  4  |   8   |    8    |   8    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| b-adapt         |     0    |     0    |    0    |   0   |  0  |   2   |  2  |   2   |    2    |   2    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| rc-lookahead    |     5    |    10    |   15    |  15   | 15  |  20   | 25  |  30   |   40    |  60    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| lookahead-slices|     8    |     8    |    8    |   8   |  8  |   8   |  4  |   4   |    1    |   1    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| scenecut        |     0    |    40    |   40    |  40   | 40  |  40   | 40  |  40   |   40    |  40    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| ref             |     1    |     1    |    2    |   2   |  3  |   3   |  4  |   4   |    5    |   5    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| limit-refs      |     0    |     0    |    3    |   3   |  3  |   3   |  3  |   2   |    1    |   0    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| me              |    dia   |   hex    |   hex   |  hex  |hex  |  hex  |star | star  |   star  |  star  |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| merange         |    57    |    57    |   57    |  57   | 57  |  57   | 57  |  57   |   57    |  92    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| subme           |     0    |     1    |    1    |   2   |  2  |   2   |  3  |   3   |    4    |   5    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| rect            |     0    |     0    |    0    |   0   |  0  |   0   |  1  |   1   |    1    |   1    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| amp             |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| limit-modes     |     0    |     0    |    0    |   0   |  0  |   0   |  1  |   1   |    1    |   0    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| max-merge       |     2    |     2    |    2    |   2   |  2  |   2   |  3  |   3   |    4    |   5    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| early-skip      |     1    |     1    |    1    |   1   |  0  |   0   |  0  |   0   |    0    |   0    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| fast-intra      |     1    |     1    |    1    |   1   |  1  |   0   |  0  |   0   |    0    |   0    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| b-intra         |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| sao             |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| signhide        |     0    |     1    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| weightp         |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| weightb         |     0    |     0    |    0    |   0   |  0  |   0   |  0  |   1   |    1    |   1    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| aq-mode         |     0    |     0    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| cuTree          |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   1   |    1    |   1    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| rdLevel         |     2    |     2    |    2    |   2   |  2  |   3   |  4  |   6   |    6    |   6    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| rdoq-level      |     0    |     0    |    0    |   0   |  0  |   0   |  2  |   2   |    2    |   2    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| tu-intra        |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   2   |    3    |   4    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
-| tu-inter        |     1    |     1    |    1    |   1   |  1  |   1   |  1  |   2   |    3    |   4    |
-+-----------------+----------+----------+---------+-------+-----+-------+-----+-------+---------+--------+
+   0. ultrafast
+   1. superfast
+   2. veryfast
+   3. faster
+   4. fast
+   5. medium **(default)**
+   6. slow
+   7. slower
+   8. veryslow
+   9. placebo
+
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| preset          |  0  |  1  |  2  |   3 |   4 |   5 |   6  |   7  |   8  |  9   |
++=================+=====+=====+=====+=====+=====+=====+======+======+======+======+
+| ctu             | 32  | 32  | 64  |  64 |  64 |  64 |  64  |  64  |  64  | 64   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| min-cu-size     | 16  |  8  |  8  |   8 |   8 |   8 |   8  |   8  |   8  |  8   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| bframes         |  3  |  3  |  4  |   4 |   4 |   4 |   4  |   8  |   8  |  8   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| b-adapt         |  0  |  0  |  0  |   0 |   0 |   2 |   2  |   2  |   2  |  2   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| rc-lookahead    |  5  | 10  | 15  |  15 |  15 |  20 |  25  |  30  |  40  | 60   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| lookahead-slices|  8  |  8  |  8  |   8 |   8 |   8 |   4  |   4  |   1  |  1   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| scenecut        |  0  | 40  | 40  |  40 |  40 |  40 |  40  |  40  |  40  | 40   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| ref             |  1  |  1  |  2  |   2 |   3 |   3 |   4  |   4  |   5  |  5   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| limit-refs      |  0  |  0  |  3  |   3 |   3 |   3 |   3  |   2  |   1  |  0   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| me              | dia | hex | hex | hex | hex | hex | star | star | star | star |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| merange         | 57  | 57  | 57  |  57 |  57 |  57 |  57  |  57  |  57  | 92   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| subme           |  0  |  1  |  1  |   2 |   2 |   2 |   3  |   3  |   4  |  5   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| rect            |  0  |  0  |  0  |   0 |   0 |   0 |   1  |   1  |   1  |  1   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| amp             |  0  |  0  |  0  |   0 |   0 |   0 |   0  |   1  |   1  |  1   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| limit-modes     |  0  |  0  |  0  |   0 |   0 |   0 |   1  |   1  |   1  |  0   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| max-merge       |  2  |  2  |  2  |   2 |   2 |   2 |   3  |   3  |   4  |  5   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| early-skip      |  1  |  1  |  1  |   1 |   0 |   0 |   0  |   0  |   0  |  0   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| recursion-skip  |  1  |  1  |  1  |   1 |   1 |   1 |   1  |   1  |   0  |  0   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| fast-intra      |  1  |  1  |  1  |   1 |   1 |   0 |   0  |   0  |   0  |  0   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| b-intra         |  0  |  0  |  0  |   0 |   0 |   0 |   0  |   1  |   1  |  1   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| sao             |  0  |  0  |  1  |   1 |   1 |   1 |   1  |   1  |   1  |  1   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| signhide        |  0  |  1  |  1  |   1 |   1 |   1 |   1  |   1  |   1  |  1   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| weightp         |  0  |  0  |  1  |   1 |   1 |   1 |   1  |   1  |   1  |  1   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| weightb         |  0  |  0  |  0  |   0 |   0 |   0 |   0  |   1  |   1  |  1   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| aq-mode         |  0  |  0  |  1  |   1 |   1 |   1 |   1  |   1  |   1  |  1   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| cuTree          |  1  |  1  |  1  |   1 |   1 |   1 |   1  |   1  |   1  |  1   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| rdLevel         |  2  |  2  |  2  |   2 |   2 |   3 |   4  |   6  |   6  |  6   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| rdoq-level      |  0  |  0  |  0  |   0 |   0 |   0 |   2  |   2  |   2  |  2   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| tu-intra        |  1  |  1  |  1  |   1 |   1 |   1 |   1  |   2  |   3  |  4   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
+| tu-inter        |  1  |  1  |  1  |   1 |   1 |   1 |   1  |   2  |   3  |  4   |
++-----------------+-----+-----+-----+-----+-----+-----+------+------+------+------+
 
 .. _tunings:
 
@@ -117,33 +129,32 @@
 
 
 
-Film Grain Retention
-~~~~~~~~~~~~~~~~~~~~
-
-:option:`--tune` *grain* tries to improve the retention of film grain in
-the reconstructed output. It disables rate distortion optimizations in
-quantization, and increases the default psy-rd.
-
-    * :option:`--psy-rd` 0.5
-    * :option:`--rdoq-level` 0
-    * :option:`--psy-rdoq` 0
-
-It lowers the strength of adaptive quantization, so residual energy can
-be more evenly distributed across the (noisy) picture:
+Film Grain
+~~~~~~~~~~
 
-    * :option:`--aq-strength` 0.3
-
-And it similarly tunes rate control to prevent the slice QP from
-swinging too wildly from frame to frame:
+:option:`--tune` *grain* aims to encode grainy content with the best
+visual quality. The purpose of this option is neither to retain nor
+eliminate grain, but prevent noticeable artifacts caused by uneven
+distribution of grain. :option:`--tune` *grain* strongly restricts
+algorithms that vary the quantization parameter within and across frames.
+Tune grain also biases towards decisions that retain more high frequency
+components.
 
+    * :option:`--aq-mode` 0
+    * :option:`--cutree` 0
     * :option:`--ipratio` 1.1
-    * :option:`--pbratio` 1.1
-    * :option:`--qcomp` 0.8
-
-And lastly it reduces the strength of deblocking to prevent grain being
-blurred on block boundaries:
-
-    * :option:`--deblock` -2
+    * :option:`--pbratio` 1.0
+    * :option:`--qpstep` 1
+    * :option:`--sao` 0
+    * :option:`--psy-rd` 4.0
+    * :option:`--psy-rdoq` 10.0
+    * :option:`--recursion-skip` 0
+
+It also enables a specialised ratecontrol algorithm :option:`--rc-grain`
+that strictly minimises QP fluctuations across frames, while still allowing
+the encoder to hit bitrate targets and VBV buffer limits (with a slightly
+higher margin of error than normal). It is highly recommended that this
+algorithm is used only through the :option:`--tune` *grain* feature.
 
 Fast Decode
 ~~~~~~~~~~~
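
In practice the refactored tune is applied like any other tune; a minimal sketch (file names and bitrate are placeholders):

    # Grainy film content: --tune grain applies the whole option set listed
    # above, including the specialised --rc-grain rate control.
    x265 --input film.y4m --preset slow --tune grain --bitrate 8000 --output film.hevc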
x265_1.9.tar.gz/source/CMakeLists.txt -> x265_2.0.tar.gz/source/CMakeLists.txt Changed

@@ -30,7 +30,7 @@
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 79)
+set(X265_BUILD 87)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -41,7 +41,9 @@
 # System architecture detection
 string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
 set(X86_ALIASES x86 i386 i686 x86_64 amd64)
+set(ARM_ALIASES armv6l armv7l)
 list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
+list(FIND ARM_ALIASES "${SYSPROC}" ARMMATCH)
 set(POWER_ALIASES ppc64 ppc64le)
 list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
 if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
@@ -58,7 +60,12 @@
     message(STATUS "Detected POWER target processor")
     set(POWER 1)
     add_definitions(-DX265_ARCH_POWER=1)
-elseif(${SYSPROC} STREQUAL "armv6l")
+elseif(ARMMATCH GREATER "-1")
+    if(CROSS_COMPILE_ARM)
+        message(STATUS "Cross compiling for ARM arch")
+    else()
+        set(CROSS_COMPILE_ARM 0)
+    endif()
     message(STATUS "Detected ARM target processor")
     set(ARM 1)
     add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
@@ -174,11 +181,23 @@
             add_definitions(-march=native)
         endif()
     elseif(X86 AND NOT X64)
-        add_definitions(-march=i686)
+        string(FIND "${CMAKE_CXX_FLAGS}" "-march" marchPos)
+        if(marchPos LESS "0")
+            add_definitions(-march=i686)
+        endif()
     endif()
-    if(ARM)
-        add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp)
+    if(ARM AND CROSS_COMPILE_ARM)
+        set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
+    elseif(ARM)
+        find_package(Neon)
+        if(CPU_HAS_NEON)
+            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
+            add_definitions(-DHAVE_NEON)
+        else()
+            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
+        endif()
     endif()
+    add_definitions(${ARM_ARGS})
     if(FPROFILE_GENERATE)
         if(INTEL_CXX)
             add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
@@ -269,7 +288,9 @@
 endif(GCC)
 
 find_package(Yasm)
-if(YASM_FOUND AND X86)
+if(ARM OR CROSS_COMPILE_ARM)
+    option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" ON)
+elseif(YASM_FOUND AND X86)
     if (YASM_VERSION_STRING VERSION_LESS "1.2.0")
         message(STATUS "Yasm version ${YASM_VERSION_STRING} is too old. 1.2.0 or later required")
         option(ENABLE_ASSEMBLY "Enable use of assembly coded primitives" OFF)
@@ -409,7 +430,7 @@
 add_subdirectory(encoder)
 add_subdirectory(common)
 
-if((MSVC_IDE OR XCODE) AND ENABLE_ASSEMBLY)
+if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
     # this is required because of this cmake bug
     # http://www.cmake.org/Bug/print_bug_page.php?bug_id=8170
     if(WIN32)
@@ -417,19 +438,36 @@
     else()
         set(SUFFIX o)
     endif()
-    foreach(ASM ${MSVC_ASMS})
-        set(YASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM})
-        list(APPEND YASM_SRCS ${YASM_SRC})
-        list(APPEND YASM_OBJS ${ASM}.${SUFFIX})
-        add_custom_command(
-            OUTPUT ${ASM}.${SUFFIX}
-            COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${YASM_SRC} -o ${ASM}.${SUFFIX}
-            DEPENDS ${YASM_SRC})
-    endforeach()
+
+    if(ARM OR CROSS_COMPILE_ARM)
+    # compile ARM arch asm files here
+        enable_language(ASM)
+        foreach(ASM ${ARM_ASMS})
+            set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
+            list(APPEND ASM_SRCS ${ASM_SRC})
+            list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
+            add_custom_command(
+                OUTPUT ${ASM}.${SUFFIX}
+                COMMAND ${CMAKE_CXX_COMPILER}
+                ARGS ${ARM_ARGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
+                DEPENDS ${ASM_SRC})
+        endforeach()
+    elseif(X86)
+    # compile X86 arch asm files here
+        foreach(ASM ${MSVC_ASMS})
+            set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/x86/${ASM})
+            list(APPEND ASM_SRCS ${ASM_SRC})
+            list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
+            add_custom_command(
+                OUTPUT ${ASM}.${SUFFIX}
+                COMMAND ${YASM_EXECUTABLE} ARGS ${YASM_FLAGS} ${ASM_SRC} -o ${ASM}.${SUFFIX}
+                DEPENDS ${ASM_SRC})
+        endforeach()
+    endif()
 endif()
 
-source_group(ASM FILES ${YASM_SRCS})
-add_library(x265-static STATIC $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${YASM_OBJS} ${YASM_SRCS})
+source_group(ASM FILES ${ASM_SRCS})
+add_library(x265-static STATIC $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${ASM_OBJS} ${ASM_SRCS})
 if(NOT MSVC)
     set_target_properties(x265-static PROPERTIES OUTPUT_NAME x265)
 endif()
@@ -463,7 +501,7 @@
 
 option(ENABLE_SHARED "Build shared library" ON)
 if(ENABLE_SHARED)
-    add_library(x265-shared SHARED "${PROJECT_BINARY_DIR}/x265.def" ${YASM_OBJS}
+    add_library(x265-shared SHARED "${PROJECT_BINARY_DIR}/x265.def" ${ASM_OBJS}
                 ${X265_RC_FILE} $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common>)
     if(EXTRA_LIB)
         target_link_libraries(x265-shared ${EXTRA_LIB})
@@ -559,7 +597,7 @@
         # Xcode seems unable to link the CLI with libs, so link as one targget
         add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT}
                        x265.cpp x265.h x265cli.h x265-extras.h x265-extras.cpp
-                       $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${YASM_OBJS} ${YASM_SRCS})
+                       $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${ASM_OBJS} ${ASM_SRCS})
     else()
         add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT} ${X265_RC_FILE}
                        ${ExportDefs} x265.cpp x265.h x265cli.h x265-extras.h x265-extras.cpp)
@@ -587,3 +625,11 @@
         add_subdirectory(test)
     endif()
 endif()
+
+get_directory_property(hasParent PARENT_DIRECTORY)
+if(hasParent)
+    if(PLATFORM_LIBS)
+        LIST(REMOVE_DUPLICATES PLATFORM_LIBS)
+        set(PLATFORM_LIBS ${PLATFORM_LIBS} PARENT_SCOPE)
+    endif(PLATFORM_LIBS)
+endif(hasParent)
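
With these changes assembly is enabled by default on ARM targets, so a native build needs no extra flags; a minimal sketch, run from build/arm-linux:

    # Native ARM build; pass -DENABLE_ASSEMBLY=OFF to cmake to fall back
    # to the C reference primitives.
    ./make-Makefiles.bash
    make -j"$(nproc)"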
x265_2.0.tar.gz/source/cmake/FindNeon.cmake Added

@@ -0,0 +1,10 @@
+include(FindPackageHandleStandardArgs)
+
+# Check the version of neon supported by the ARM CPU
+execute_process(COMMAND cat /proc/cpuinfo | grep Features | grep neon
+                OUTPUT_VARIABLE neon_version
+                ERROR_QUIET
+                OUTPUT_STRIP_TRAILING_WHITESPACE)
+if(neon_version)
+    set(CPU_HAS_NEON 1)
+endif()
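
The same probe can be run by hand to predict what FindNeon.cmake will report on a given board; a minimal sketch:

    # True when the kernel's CPU feature list includes the NEON extension.
    if grep -q '^Features.*neon' /proc/cpuinfo; then
        echo "NEON detected: CPU_HAS_NEON will be set"
    fi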
x265_1.9.tar.gz/source/cmake/version.cmake -> x265_2.0.tar.gz/source/cmake/version.cmake Changed

@@ -52,39 +52,55 @@
         )
     execute_process(
         COMMAND
-        ${HG_EXECUTABLE} log -r. --template "{node|short}"
+        ${HG_EXECUTABLE} log -r. --template "{node}"
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        OUTPUT_VARIABLE HG_REVISION_ID
+        OUTPUT_VARIABLE X265_REVISION_ID
         ERROR_QUIET
         OUTPUT_STRIP_TRAILING_WHITESPACE
         )
+    string(SUBSTRING "${X265_REVISION_ID}" 0 12 X265_REVISION_ID)
 
     if(X265_LATEST_TAG MATCHES "^r")
         string(SUBSTRING ${X265_LATEST_TAG} 1 -1 X265_LATEST_TAG)
     endif()
-    if(X265_TAG_DISTANCE STREQUAL "0")
-        set(X265_VERSION "${X265_LATEST_TAG}")
-    else()
-        set(X265_VERSION "${X265_LATEST_TAG}+${X265_TAG_DISTANCE}-${HG_REVISION_ID}")
-    endif()
 elseif(GIT_EXECUTABLE AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/../.git)
     execute_process(
         COMMAND
-        ${GIT_EXECUTABLE} describe --tags --abbrev=0
+        ${GIT_EXECUTABLE} rev-list --tags --max-count=1
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE X265_LATEST_TAG_COMMIT
+        ERROR_QUIET
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        )
+    execute_process(
+        COMMAND
+        ${GIT_EXECUTABLE} describe --tags ${X265_LATEST_TAG_COMMIT}
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
         OUTPUT_VARIABLE X265_LATEST_TAG
         ERROR_QUIET
         OUTPUT_STRIP_TRAILING_WHITESPACE
         )
-
     execute_process(
         COMMAND
-        ${GIT_EXECUTABLE} describe --tags
+        ${GIT_EXECUTABLE} rev-list ${X265_LATEST_TAG}.. --count --first-parent
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        OUTPUT_VARIABLE X265_VERSION
+        OUTPUT_VARIABLE X265_TAG_DISTANCE
         ERROR_QUIET
         OUTPUT_STRIP_TRAILING_WHITESPACE
         )
+    execute_process(
+        COMMAND
+        ${GIT_EXECUTABLE} log -1 --format=g%h
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE X265_REVISION_ID
+        ERROR_QUIET
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        )
+endif()
+if(X265_TAG_DISTANCE STREQUAL "0")
+    set(X265_VERSION "${X265_LATEST_TAG}")
+else()
+    set(X265_VERSION "${X265_LATEST_TAG}+${X265_TAG_DISTANCE}-${X265_REVISION_ID}")
 endif()
 
 message(STATUS "x265 version ${X265_VERSION}")
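
For a git checkout, the version string the updated logic derives can be reproduced with the same commands the file now runs; a minimal sketch:

    # Mirror of the new version.cmake flow: nearest tag, commit distance,
    # and abbreviated revision id.
    tag_commit=$(git rev-list --tags --max-count=1)
    latest_tag=$(git describe --tags "$tag_commit")
    distance=$(git rev-list "${latest_tag}.." --count --first-parent)
    revision=$(git log -1 --format=g%h)
    if [ "$distance" = "0" ]; then
        echo "x265 version ${latest_tag}"
    else
        echo "x265 version ${latest_tag}+${distance}-${revision}"
    fi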
x265_1.9.tar.gz/source/common/CMakeLists.txt -> x265_2.0.tar.gz/source/common/CMakeLists.txt Changed

@@ -16,12 +16,14 @@
 if(ENABLE_ASSEMBLY)
     set_source_files_properties(threading.cpp primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
     list(APPEND VFLAGS "-DENABLE_ASSEMBLY=1")
+endif(ENABLE_ASSEMBLY)
 
+if(ENABLE_ASSEMBLY AND X86)
     set(SSE3  vec/dct-sse3.cpp)
     set(SSSE3 vec/dct-ssse3.cpp)
     set(SSE41 vec/dct-sse41.cpp)
 
-    if(MSVC AND X86)
+    if(MSVC)
         set(PRIMITIVES ${SSE3} ${SSSE3} ${SSE41})
         set(WARNDISABLE "/wd4100") # unreferenced formal parameter
         if(INTEL_CXX)
@@ -38,7 +40,7 @@
             set_source_files_properties(${SSE3} ${SSSE3} ${SSE41} PROPERTIES COMPILE_FLAGS "${WARNDISABLE} /arch:SSE2")
         endif()
     endif()
-    if(GCC AND X86)
+    if(GCC)
         if(CLANG)
             # llvm intrinsic headers cause shadow warnings
             set(WARNDISABLE "-Wno-shadow -Wno-unused-parameter")
@@ -81,7 +83,21 @@
         set(ASM_PRIMITIVES ${ASM_PRIMITIVES} x86/${SRC})
     endforeach()
     source_group(Assembly FILES ${ASM_PRIMITIVES})
-endif(ENABLE_ASSEMBLY)
+endif(ENABLE_ASSEMBLY AND X86)
+
+if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
+    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
+
+    # add ARM assembly/intrinsic files here
+    set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
+    set(VEC_PRIMITIVES)
+
+    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
+    foreach(SRC ${C_SRCS})
+        set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
+    endforeach()
+    source_group(Assembly FILES ${ASM_PRIMITIVES})
+endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
 
 # set_target_properties can't do list expansion
 string(REPLACE ";" " " VERSION_FLAGS "${VFLAGS}")
x265_2.0.tar.gz/source/common/arm/asm-primitives.cpp Added
1024
 
1
@@ -0,0 +1,1022 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Steve Borho <steve@borho.org>
6
+ *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
7
+ *          Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
8
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
9
+ *
10
+ * This program is free software; you can redistribute it and/or modify
11
+ * it under the terms of the GNU General Public License as published by
12
+ * the Free Software Foundation; either version 2 of the License, or
13
+ * (at your option) any later version.
14
+ *
15
+ * This program is distributed in the hope that it will be useful,
16
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
+ * GNU General Public License for more details.
19
+ *
20
+ * You should have received a copy of the GNU General Public License
21
+ * along with this program; if not, write to the Free Software
22
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
23
+ *
24
+ * This program is also available under a commercial proprietary license.
25
+ * For more information, contact us at license @ x265.com.
26
+ *****************************************************************************/
27
+
28
+#include "common.h"
29
+#include "primitives.h"
30
+#include "x265.h"
31
+#include "cpu.h"
32
+
33
+extern "C" {
34
+#include "blockcopy8.h"
35
+#include "pixel.h"
36
+#include "pixel-util.h"
37
+#include "ipfilter8.h"
38
+#include "dct8.h"
39
+}
40
+
41
+namespace X265_NS {
42
+// private x265 namespace
43
+
44
+void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
45
+{
46
+    if (cpuMask & X265_CPU_NEON)
47
+    {
48
+        // ssim_4x4x2_core
49
+        p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
50
+
51
+        // addAvg
52
+         p.pu[LUMA_4x4].addAvg   = PFX(addAvg_4x4_neon);
53
+         p.pu[LUMA_4x8].addAvg   = PFX(addAvg_4x8_neon);
54
+         p.pu[LUMA_4x16].addAvg  = PFX(addAvg_4x16_neon);
55
+         p.pu[LUMA_8x4].addAvg   = PFX(addAvg_8x4_neon);
56
+         p.pu[LUMA_8x8].addAvg   = PFX(addAvg_8x8_neon);
57
+         p.pu[LUMA_8x16].addAvg  = PFX(addAvg_8x16_neon);
58
+         p.pu[LUMA_8x32].addAvg  = PFX(addAvg_8x32_neon);
59
+         p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_neon);
60
+         p.pu[LUMA_16x4].addAvg  = PFX(addAvg_16x4_neon);
61
+         p.pu[LUMA_16x8].addAvg  = PFX(addAvg_16x8_neon);
62
+         p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_neon);
63
+         p.pu[LUMA_16x16].addAvg = PFX(addAvg_16x16_neon);
64
+         p.pu[LUMA_16x32].addAvg = PFX(addAvg_16x32_neon);
65
+         p.pu[LUMA_16x64].addAvg = PFX(addAvg_16x64_neon);
66
+         p.pu[LUMA_24x32].addAvg = PFX(addAvg_24x32_neon);
67
+         p.pu[LUMA_32x8].addAvg  = PFX(addAvg_32x8_neon);
68
+         p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_neon);
69
+         p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_neon);
70
+         p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_neon);
71
+         p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_neon);
72
+         p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_neon);
73
+         p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_neon);
74
+         p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_neon);
75
+         p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_neon);
76
+         p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_neon);
77
+
78
+        // chroma addAvg
79
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].addAvg   = PFX(addAvg_4x2_neon);
80
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].addAvg   = PFX(addAvg_4x4_neon);
81
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].addAvg   = PFX(addAvg_4x8_neon);
82
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].addAvg  = PFX(addAvg_4x16_neon);
83
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].addAvg   = PFX(addAvg_6x8_neon);
84
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg   = PFX(addAvg_8x2_neon);
85
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg   = PFX(addAvg_8x4_neon);
86
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg   = PFX(addAvg_8x6_neon);
87
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg   = PFX(addAvg_8x8_neon);
88
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg  = PFX(addAvg_8x16_neon);
89
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg  = PFX(addAvg_8x32_neon);
90
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = PFX(addAvg_12x16_neon);
91
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg  = PFX(addAvg_16x4_neon);
92
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg  = PFX(addAvg_16x8_neon);
93
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = PFX(addAvg_16x12_neon);
94
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg = PFX(addAvg_16x16_neon);
95
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg = PFX(addAvg_16x32_neon);
96
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].addAvg = PFX(addAvg_24x32_neon);
97
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg  = PFX(addAvg_32x8_neon);
98
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_neon);
99
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_neon);
100
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_neon);
101
+
102
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].addAvg   = PFX(addAvg_4x8_neon);
103
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].addAvg  = PFX(addAvg_4x16_neon);
104
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].addAvg  = PFX(addAvg_4x32_neon);
105
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].addAvg  = PFX(addAvg_6x16_neon);
106
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg   = PFX(addAvg_8x4_neon);
107
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg   = PFX(addAvg_8x8_neon);
108
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg  = PFX(addAvg_8x12_neon);
109
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg  = PFX(addAvg_8x16_neon);
110
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg  = PFX(addAvg_8x32_neon);
111
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg  = PFX(addAvg_8x64_neon);
112
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg = PFX(addAvg_12x32_neon);
113
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg  = PFX(addAvg_16x8_neon);
114
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg = PFX(addAvg_16x16_neon);
115
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg = PFX(addAvg_16x24_neon);
116
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg = PFX(addAvg_16x32_neon);
117
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg = PFX(addAvg_16x64_neon);
118
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg = PFX(addAvg_24x64_neon);
119
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_neon);
120
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_neon);
121
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_neon);
122
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_neon);
123
+
124
+        // quant
125
+         p.quant = PFX(quant_neon);
126
+         p.nquant = PFX(nquant_neon);
127
+
128
+        // dequant_scaling
129
+         p.dequant_scaling = PFX(dequant_scaling_neon);
130
+         p.dequant_normal  = PFX(dequant_normal_neon);
131
+
132
+        // luma satd
133
+         p.pu[LUMA_4x4].satd   = PFX(pixel_satd_4x4_neon);
134
+         p.pu[LUMA_4x8].satd   = PFX(pixel_satd_4x8_neon);
135
+         p.pu[LUMA_4x16].satd  = PFX(pixel_satd_4x16_neon);
136
+         p.pu[LUMA_8x4].satd   = PFX(pixel_satd_8x4_neon);
137
+         p.pu[LUMA_8x8].satd   = PFX(pixel_satd_8x8_neon);
138
+         p.pu[LUMA_8x16].satd  = PFX(pixel_satd_8x16_neon);
139
+         p.pu[LUMA_8x32].satd  = PFX(pixel_satd_8x32_neon);
140
+         p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_neon);
141
+         p.pu[LUMA_16x4].satd  = PFX(pixel_satd_16x4_neon);
142
+         p.pu[LUMA_16x8].satd  = PFX(pixel_satd_16x8_neon);
143
+         p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_neon);
144
+         p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_neon);
145
+         p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_neon);
146
+         p.pu[LUMA_24x32].satd = PFX(pixel_satd_24x32_neon);
147
+         p.pu[LUMA_32x8].satd  = PFX(pixel_satd_32x8_neon);
148
+         p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_neon);
149
+         p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_neon);
150
+         p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_neon);
151
+         p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_neon);
152
+         p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_neon);
153
+         p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_neon);
154
+         p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_neon);
155
+         p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_neon);
156
+         p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_neon);
157
+
158
+        // chroma satd
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd    = PFX(pixel_satd_4x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd    = PFX(pixel_satd_4x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd   = PFX(pixel_satd_4x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd    = PFX(pixel_satd_8x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd    = PFX(pixel_satd_8x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd   = PFX(pixel_satd_8x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd   = PFX(pixel_satd_8x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd  = PFX(pixel_satd_12x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd   = PFX(pixel_satd_16x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd   = PFX(pixel_satd_16x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd  = PFX(pixel_satd_16x12_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd  = PFX(pixel_satd_16x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd  = PFX(pixel_satd_16x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd  = PFX(pixel_satd_24x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd   = PFX(pixel_satd_32x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd  = PFX(pixel_satd_32x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd  = PFX(pixel_satd_32x24_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd  = PFX(pixel_satd_32x32_neon);
+
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd    = PFX(pixel_satd_4x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd    = PFX(pixel_satd_4x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd   = PFX(pixel_satd_4x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd   = PFX(pixel_satd_4x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd    = PFX(pixel_satd_8x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd    = PFX(pixel_satd_8x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd   = PFX(pixel_satd_8x12_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd   = PFX(pixel_satd_8x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd   = PFX(pixel_satd_8x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd   = PFX(pixel_satd_8x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd  = PFX(pixel_satd_12x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd   = PFX(pixel_satd_16x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd  = PFX(pixel_satd_16x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd  = PFX(pixel_satd_16x24_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd  = PFX(pixel_satd_16x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd  = PFX(pixel_satd_16x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd  = PFX(pixel_satd_24x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd  = PFX(pixel_satd_32x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd  = PFX(pixel_satd_32x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd  = PFX(pixel_satd_32x48_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd  = PFX(pixel_satd_32x64_neon);
+
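+        // Suffix convention for the filter kernels that follow: pp takes
+        // pixels in and writes pixels out, ps writes 16-bit intermediates,
+        // and sp consumes 16-bit intermediates and writes pixels.
+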
200
+        // chroma_hpp
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_hpp   = PFX(interp_4tap_horiz_pp_4x2_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_hpp   = PFX(interp_4tap_horiz_pp_4x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hpp   = PFX(interp_4tap_horiz_pp_4x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_hpp  = PFX(interp_4tap_horiz_pp_4x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_hpp   = PFX(interp_4tap_horiz_pp_8x2_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_hpp   = PFX(interp_4tap_horiz_pp_8x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_hpp   = PFX(interp_4tap_horiz_pp_8x6_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hpp   = PFX(interp_4tap_horiz_pp_8x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_hpp  = PFX(interp_4tap_horiz_pp_8x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_hpp  = PFX(interp_4tap_horiz_pp_8x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_hpp = PFX(interp_4tap_horiz_pp_12x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hpp  = PFX(interp_4tap_horiz_pp_16x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hpp  = PFX(interp_4tap_horiz_pp_16x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_hpp = PFX(interp_4tap_horiz_pp_24x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hpp  = PFX(interp_4tap_horiz_pp_32x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_neon);
+
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_hpp   = PFX(interp_4tap_horiz_pp_4x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_hpp   = PFX(interp_4tap_horiz_pp_4x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_hpp  = PFX(interp_4tap_horiz_pp_4x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_hpp  = PFX(interp_4tap_horiz_pp_4x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_hpp   = PFX(interp_4tap_horiz_pp_8x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_hpp   = PFX(interp_4tap_horiz_pp_8x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_hpp  = PFX(interp_4tap_horiz_pp_8x12_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_hpp  = PFX(interp_4tap_horiz_pp_8x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_hpp  = PFX(interp_4tap_horiz_pp_8x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_hpp  = PFX(interp_4tap_horiz_pp_8x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_hpp = PFX(interp_4tap_horiz_pp_12x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hpp  = PFX(interp_4tap_horiz_pp_16x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hpp = PFX(interp_4tap_horiz_pp_16x24_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_hpp = PFX(interp_4tap_horiz_pp_24x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_neon);
+
+        p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_hpp   = PFX(interp_4tap_horiz_pp_4x4_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_hpp   = PFX(interp_4tap_horiz_pp_4x8_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_hpp  = PFX(interp_4tap_horiz_pp_4x16_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_hpp   = PFX(interp_4tap_horiz_pp_8x4_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_hpp   = PFX(interp_4tap_horiz_pp_8x8_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_hpp  = PFX(interp_4tap_horiz_pp_8x16_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_hpp  = PFX(interp_4tap_horiz_pp_8x32_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_hpp = PFX(interp_4tap_horiz_pp_12x16_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hpp  = PFX(interp_4tap_horiz_pp_16x4_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hpp  = PFX(interp_4tap_horiz_pp_16x8_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_hpp = PFX(interp_4tap_horiz_pp_24x32_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp  = PFX(interp_4tap_horiz_pp_32x8_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = PFX(interp_4tap_horiz_pp_48x64_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = PFX(interp_4tap_horiz_pp_64x16_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = PFX(interp_4tap_horiz_pp_64x48_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_neon);
+
271
+        // chroma_hps
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_hps   = PFX(interp_4tap_horiz_ps_4x2_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_hps   = PFX(interp_4tap_horiz_ps_4x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hps   = PFX(interp_4tap_horiz_ps_4x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_hps  = PFX(interp_4tap_horiz_ps_4x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_hps   = PFX(interp_4tap_horiz_ps_8x2_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_hps   = PFX(interp_4tap_horiz_ps_8x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_hps   = PFX(interp_4tap_horiz_ps_8x6_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hps   = PFX(interp_4tap_horiz_ps_8x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_hps  = PFX(interp_4tap_horiz_ps_8x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_hps  = PFX(interp_4tap_horiz_ps_8x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_hps = PFX(interp_4tap_horiz_ps_12x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hps  = PFX(interp_4tap_horiz_ps_16x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hps  = PFX(interp_4tap_horiz_ps_16x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hps = PFX(interp_4tap_horiz_ps_16x12_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_hps = PFX(interp_4tap_horiz_ps_24x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps  = PFX(interp_4tap_horiz_ps_32x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_neon);
+
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_hps   = PFX(interp_4tap_horiz_ps_4x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_hps   = PFX(interp_4tap_horiz_ps_4x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_hps  = PFX(interp_4tap_horiz_ps_4x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_hps  = PFX(interp_4tap_horiz_ps_4x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_hps   = PFX(interp_4tap_horiz_ps_8x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_hps   = PFX(interp_4tap_horiz_ps_8x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_hps  = PFX(interp_4tap_horiz_ps_8x12_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_hps  = PFX(interp_4tap_horiz_ps_8x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_hps  = PFX(interp_4tap_horiz_ps_8x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_hps  = PFX(interp_4tap_horiz_ps_8x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_hps = PFX(interp_4tap_horiz_ps_12x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hps  = PFX(interp_4tap_horiz_ps_16x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hps = PFX(interp_4tap_horiz_ps_16x24_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_hps = PFX(interp_4tap_horiz_ps_16x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_hps = PFX(interp_4tap_horiz_ps_24x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hps = PFX(interp_4tap_horiz_ps_32x48_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hps = PFX(interp_4tap_horiz_ps_32x64_neon);
+
+        p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_hps   = PFX(interp_4tap_horiz_ps_4x4_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_hps   = PFX(interp_4tap_horiz_ps_4x8_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_hps  = PFX(interp_4tap_horiz_ps_4x16_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_hps   = PFX(interp_4tap_horiz_ps_8x4_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_hps   = PFX(interp_4tap_horiz_ps_8x8_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_hps  = PFX(interp_4tap_horiz_ps_8x16_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_hps  = PFX(interp_4tap_horiz_ps_8x32_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_hps = PFX(interp_4tap_horiz_ps_12x16_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hps  = PFX(interp_4tap_horiz_ps_16x4_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hps  = PFX(interp_4tap_horiz_ps_16x8_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hps = PFX(interp_4tap_horiz_ps_16x12_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_hps = PFX(interp_4tap_horiz_ps_16x16_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_hps = PFX(interp_4tap_horiz_ps_16x32_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hps = PFX(interp_4tap_horiz_ps_16x64_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_hps = PFX(interp_4tap_horiz_ps_24x32_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hps  = PFX(interp_4tap_horiz_ps_32x8_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hps = PFX(interp_4tap_horiz_ps_32x64_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hps = PFX(interp_4tap_horiz_ps_48x64_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hps = PFX(interp_4tap_horiz_ps_64x16_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hps = PFX(interp_4tap_horiz_ps_64x32_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hps = PFX(interp_4tap_horiz_ps_64x48_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hps = PFX(interp_4tap_horiz_ps_64x64_neon);
+
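+        // Chroma interpolation uses the 4-tap HEVC filters above; the luma
+        // kernels below implement the longer 8-tap luma filter.
+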
342
+        // luma_hpp
+        p.pu[LUMA_4x4].luma_hpp   = PFX(interp_horiz_pp_4x4_neon);
+        p.pu[LUMA_4x8].luma_hpp   = PFX(interp_horiz_pp_4x8_neon);
+        p.pu[LUMA_4x16].luma_hpp  = PFX(interp_horiz_pp_4x16_neon);
+        p.pu[LUMA_8x4].luma_hpp   = PFX(interp_horiz_pp_8x4_neon);
+        p.pu[LUMA_8x8].luma_hpp   = PFX(interp_horiz_pp_8x8_neon);
+        p.pu[LUMA_8x16].luma_hpp  = PFX(interp_horiz_pp_8x16_neon);
+        p.pu[LUMA_8x32].luma_hpp  = PFX(interp_horiz_pp_8x32_neon);
+        p.pu[LUMA_12x16].luma_hpp = PFX(interp_horiz_pp_12x16_neon);
+        p.pu[LUMA_16x4].luma_hpp  = PFX(interp_horiz_pp_16x4_neon);
+        p.pu[LUMA_16x8].luma_hpp  = PFX(interp_horiz_pp_16x8_neon);
+        p.pu[LUMA_16x12].luma_hpp = PFX(interp_horiz_pp_16x12_neon);
+        p.pu[LUMA_16x16].luma_hpp = PFX(interp_horiz_pp_16x16_neon);
+        p.pu[LUMA_16x32].luma_hpp = PFX(interp_horiz_pp_16x32_neon);
+        p.pu[LUMA_16x64].luma_hpp = PFX(interp_horiz_pp_16x64_neon);
+        p.pu[LUMA_24x32].luma_hpp = PFX(interp_horiz_pp_24x32_neon);
+        p.pu[LUMA_32x8].luma_hpp  = PFX(interp_horiz_pp_32x8_neon);
+        p.pu[LUMA_32x16].luma_hpp = PFX(interp_horiz_pp_32x16_neon);
+        p.pu[LUMA_32x24].luma_hpp = PFX(interp_horiz_pp_32x24_neon);
+        p.pu[LUMA_32x32].luma_hpp = PFX(interp_horiz_pp_32x32_neon);
+        p.pu[LUMA_32x64].luma_hpp = PFX(interp_horiz_pp_32x64_neon);
+        p.pu[LUMA_48x64].luma_hpp = PFX(interp_horiz_pp_48x64_neon);
+        p.pu[LUMA_64x16].luma_hpp = PFX(interp_horiz_pp_64x16_neon);
+        p.pu[LUMA_64x32].luma_hpp = PFX(interp_horiz_pp_64x32_neon);
+        p.pu[LUMA_64x48].luma_hpp = PFX(interp_horiz_pp_64x48_neon);
+        p.pu[LUMA_64x64].luma_hpp = PFX(interp_horiz_pp_64x64_neon);
+
369
+        // luma_hps
+        p.pu[LUMA_4x4].luma_hps   = PFX(interp_horiz_ps_4x4_neon);
+        p.pu[LUMA_4x8].luma_hps   = PFX(interp_horiz_ps_4x8_neon);
+        p.pu[LUMA_4x16].luma_hps  = PFX(interp_horiz_ps_4x16_neon);
+        p.pu[LUMA_8x4].luma_hps   = PFX(interp_horiz_ps_8x4_neon);
+        p.pu[LUMA_8x8].luma_hps   = PFX(interp_horiz_ps_8x8_neon);
+        p.pu[LUMA_8x16].luma_hps  = PFX(interp_horiz_ps_8x16_neon);
+        p.pu[LUMA_8x32].luma_hps  = PFX(interp_horiz_ps_8x32_neon);
+        p.pu[LUMA_12x16].luma_hps = PFX(interp_horiz_ps_12x16_neon);
+        p.pu[LUMA_16x4].luma_hps  = PFX(interp_horiz_ps_16x4_neon);
+        p.pu[LUMA_16x8].luma_hps  = PFX(interp_horiz_ps_16x8_neon);
+        p.pu[LUMA_16x12].luma_hps = PFX(interp_horiz_ps_16x12_neon);
+        p.pu[LUMA_16x16].luma_hps = PFX(interp_horiz_ps_16x16_neon);
+        p.pu[LUMA_16x32].luma_hps = PFX(interp_horiz_ps_16x32_neon);
+        p.pu[LUMA_16x64].luma_hps = PFX(interp_horiz_ps_16x64_neon);
+        p.pu[LUMA_24x32].luma_hps = PFX(interp_horiz_ps_24x32_neon);
+        p.pu[LUMA_32x8].luma_hps  = PFX(interp_horiz_ps_32x8_neon);
+        p.pu[LUMA_32x16].luma_hps = PFX(interp_horiz_ps_32x16_neon);
+        p.pu[LUMA_32x24].luma_hps = PFX(interp_horiz_ps_32x24_neon);
+        p.pu[LUMA_32x32].luma_hps = PFX(interp_horiz_ps_32x32_neon);
+        p.pu[LUMA_32x64].luma_hps = PFX(interp_horiz_ps_32x64_neon);
+        p.pu[LUMA_48x64].luma_hps = PFX(interp_horiz_ps_48x64_neon);
+        p.pu[LUMA_64x16].luma_hps = PFX(interp_horiz_ps_64x16_neon);
+        p.pu[LUMA_64x32].luma_hps = PFX(interp_horiz_ps_64x32_neon);
+        p.pu[LUMA_64x48].luma_hps = PFX(interp_horiz_ps_64x48_neon);
+        p.pu[LUMA_64x64].luma_hps = PFX(interp_horiz_ps_64x64_neon);
+
396
+        // count nonzero
+        p.cu[BLOCK_4x4].count_nonzero     = PFX(count_nonzero_4_neon);
+        p.cu[BLOCK_8x8].count_nonzero     = PFX(count_nonzero_8_neon);
+        p.cu[BLOCK_16x16].count_nonzero   = PFX(count_nonzero_16_neon);
+        p.cu[BLOCK_32x32].count_nonzero   = PFX(count_nonzero_32_neon);
+
+        // scale2D_64to32
+        p.scale2D_64to32  = PFX(scale2D_64to32_neon);
+
+        // scale1D_128to64
+        p.scale1D_128to64 = PFX(scale1D_128to64_neon);
+
+        // copy_count
+        p.cu[BLOCK_4x4].copy_cnt     = PFX(copy_cnt_4_neon);
+        p.cu[BLOCK_8x8].copy_cnt     = PFX(copy_cnt_8_neon);
+        p.cu[BLOCK_16x16].copy_cnt   = PFX(copy_cnt_16_neon);
+        p.cu[BLOCK_32x32].copy_cnt   = PFX(copy_cnt_32_neon);
+
+        // filterPixelToShort
+        p.pu[LUMA_4x4].convert_p2s   = PFX(filterPixelToShort_4x4_neon);
+        p.pu[LUMA_4x8].convert_p2s   = PFX(filterPixelToShort_4x8_neon);
+        p.pu[LUMA_4x16].convert_p2s  = PFX(filterPixelToShort_4x16_neon);
+        p.pu[LUMA_8x4].convert_p2s   = PFX(filterPixelToShort_8x4_neon);
+        p.pu[LUMA_8x8].convert_p2s   = PFX(filterPixelToShort_8x8_neon);
+        p.pu[LUMA_8x16].convert_p2s  = PFX(filterPixelToShort_8x16_neon);
+        p.pu[LUMA_8x32].convert_p2s  = PFX(filterPixelToShort_8x32_neon);
+        p.pu[LUMA_12x16].convert_p2s = PFX(filterPixelToShort_12x16_neon);
+        p.pu[LUMA_16x4].convert_p2s  = PFX(filterPixelToShort_16x4_neon);
+        p.pu[LUMA_16x8].convert_p2s  = PFX(filterPixelToShort_16x8_neon);
+        p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_neon);
+        p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_neon);
+        p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_neon);
+        p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_neon);
+        p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_neon);
+        p.pu[LUMA_32x8].convert_p2s  = PFX(filterPixelToShort_32x8_neon);
+        p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_neon);
+        p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_neon);
+        p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_neon);
+        p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_neon);
+        p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_neon);
+        p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_neon);
+        p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_neon);
+        p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_neon);
+        p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_neon);
+
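+        // convert_p2s (filterPixelToShort) widens pixels to the 16-bit
+        // intermediate form used by the ps/sp filter paths and weighted
+        // prediction.
+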
441
+        // Block_fill
+        p.cu[BLOCK_4x4].blockfill_s   = PFX(blockfill_s_4x4_neon);
+        p.cu[BLOCK_8x8].blockfill_s   = PFX(blockfill_s_8x8_neon);
+        p.cu[BLOCK_16x16].blockfill_s = PFX(blockfill_s_16x16_neon);
+        p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_neon);
+
+        // Blockcopy_ss
+        p.cu[BLOCK_4x4].copy_ss   = PFX(blockcopy_ss_4x4_neon);
+        p.cu[BLOCK_8x8].copy_ss   = PFX(blockcopy_ss_8x8_neon);
+        p.cu[BLOCK_16x16].copy_ss = PFX(blockcopy_ss_16x16_neon);
+        p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_neon);
+        p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_neon);
+
+        // Blockcopy_ps
+        p.cu[BLOCK_4x4].copy_ps   = PFX(blockcopy_ps_4x4_neon);
+        p.cu[BLOCK_8x8].copy_ps   = PFX(blockcopy_ps_8x8_neon);
+        p.cu[BLOCK_16x16].copy_ps = PFX(blockcopy_ps_16x16_neon);
+        p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_neon);
+        p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_neon);
+
+        // Blockcopy_sp
+        p.cu[BLOCK_4x4].copy_sp   = PFX(blockcopy_sp_4x4_neon);
+        p.cu[BLOCK_8x8].copy_sp   = PFX(blockcopy_sp_8x8_neon);
+        p.cu[BLOCK_16x16].copy_sp = PFX(blockcopy_sp_16x16_neon);
+        p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_neon);
+        p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_neon);
+
+        // chroma blockcopy_ss
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_ss   = PFX(blockcopy_ss_4x4_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_ss   = PFX(blockcopy_ss_8x8_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ss = PFX(blockcopy_ss_16x16_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_ss   = PFX(blockcopy_ss_4x8_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_ss  = PFX(blockcopy_ss_8x16_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ss = PFX(blockcopy_ss_16x32_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_neon);
+
+        // chroma blockcopy_ps
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_ps   = PFX(blockcopy_ps_4x4_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_ps   = PFX(blockcopy_ps_8x8_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ps = PFX(blockcopy_ps_16x16_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_ps   = PFX(blockcopy_ps_4x8_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_ps  = PFX(blockcopy_ps_8x16_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ps = PFX(blockcopy_ps_16x32_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_neon);
+
+        // chroma blockcopy_sp
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_sp   = PFX(blockcopy_sp_4x4_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_sp   = PFX(blockcopy_sp_8x8_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_sp = PFX(blockcopy_sp_16x16_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = PFX(blockcopy_sp_32x32_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_sp   = PFX(blockcopy_sp_4x8_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_sp  = PFX(blockcopy_sp_8x16_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_sp = PFX(blockcopy_sp_16x32_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = PFX(blockcopy_sp_32x64_neon);
+
+        // pixel_add_ps
+        p.cu[BLOCK_4x4].add_ps   = PFX(pixel_add_ps_4x4_neon);
+        p.cu[BLOCK_8x8].add_ps   = PFX(pixel_add_ps_8x8_neon);
+        p.cu[BLOCK_16x16].add_ps = PFX(pixel_add_ps_16x16_neon);
+        p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_neon);
+        p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_neon);
+
+        // chroma add_ps
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps   = PFX(pixel_add_ps_4x4_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps   = PFX(pixel_add_ps_8x8_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps = PFX(pixel_add_ps_16x16_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps   = PFX(pixel_add_ps_4x8_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps  = PFX(pixel_add_ps_8x16_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps = PFX(pixel_add_ps_16x32_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_neon);
+
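+        // add_ps reconstructs pixels: it adds a 16-bit residual block to a
+        // pixel prediction and clips the result to the valid pixel range.
+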
515
+        // cpy2Dto1D_shr
+        p.cu[BLOCK_4x4].cpy2Dto1D_shr   = PFX(cpy2Dto1D_shr_4x4_neon);
+        p.cu[BLOCK_8x8].cpy2Dto1D_shr   = PFX(cpy2Dto1D_shr_8x8_neon);
+        p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16x16_neon);
+        p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_neon);
+
+        // ssd_s
+        p.cu[BLOCK_4x4].ssd_s   = PFX(pixel_ssd_s_4x4_neon);
+        p.cu[BLOCK_8x8].ssd_s   = PFX(pixel_ssd_s_8x8_neon);
+        p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16x16_neon);
+        p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32x32_neon);
+
+        // sse_ss
+        p.cu[BLOCK_4x4].sse_ss   = PFX(pixel_sse_ss_4x4_neon);
+        p.cu[BLOCK_8x8].sse_ss   = PFX(pixel_sse_ss_8x8_neon);
+        p.cu[BLOCK_16x16].sse_ss = PFX(pixel_sse_ss_16x16_neon);
+        p.cu[BLOCK_32x32].sse_ss = PFX(pixel_sse_ss_32x32_neon);
+        p.cu[BLOCK_64x64].sse_ss = PFX(pixel_sse_ss_64x64_neon);
+
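+        // The distortion kernels follow the same suffix rule: sse_ss takes
+        // two 16-bit blocks, while ssd_s sums the squares of a single
+        // 16-bit (residual) block.
+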
534
+        // pixel_sub_ps
+        p.cu[BLOCK_4x4].sub_ps   = PFX(pixel_sub_ps_4x4_neon);
+        p.cu[BLOCK_8x8].sub_ps   = PFX(pixel_sub_ps_8x8_neon);
+        p.cu[BLOCK_16x16].sub_ps = PFX(pixel_sub_ps_16x16_neon);
+        p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_neon);
+        p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_neon);
+
+        // chroma sub_ps
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sub_ps   = PFX(pixel_sub_ps_4x4_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sub_ps   = PFX(pixel_sub_ps_8x8_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sub_ps = PFX(pixel_sub_ps_16x16_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sub_ps   = PFX(pixel_sub_ps_4x8_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sub_ps  = PFX(pixel_sub_ps_8x16_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sub_ps = PFX(pixel_sub_ps_16x32_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_neon);
+
+        // calc_Residual
+        p.cu[BLOCK_4x4].calcresidual   = PFX(getResidual4_neon);
+        p.cu[BLOCK_8x8].calcresidual   = PFX(getResidual8_neon);
+        p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_neon);
+        p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_neon);
+
+        // sse_pp
+        p.cu[BLOCK_4x4].sse_pp   = PFX(pixel_sse_pp_4x4_neon);
+        p.cu[BLOCK_8x8].sse_pp   = PFX(pixel_sse_pp_8x8_neon);
+        p.cu[BLOCK_16x16].sse_pp = PFX(pixel_sse_pp_16x16_neon);
+        p.cu[BLOCK_32x32].sse_pp = PFX(pixel_sse_pp_32x32_neon);
+        p.cu[BLOCK_64x64].sse_pp = PFX(pixel_sse_pp_64x64_neon);
+
+        // pixel_var
+        p.cu[BLOCK_8x8].var   = PFX(pixel_var_8x8_neon);
+        p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_neon);
+        p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_neon);
+        p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_neon);
+
570
+        // blockcopy
+        p.pu[LUMA_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
+        p.pu[LUMA_8x4].copy_pp   = PFX(blockcopy_pp_8x4_neon);
+        p.pu[LUMA_8x8].copy_pp   = PFX(blockcopy_pp_8x8_neon);
+        p.pu[LUMA_8x16].copy_pp  = PFX(blockcopy_pp_8x16_neon);
+        p.pu[LUMA_8x32].copy_pp  = PFX(blockcopy_pp_8x32_neon);
+        p.pu[LUMA_12x16].copy_pp = PFX(blockcopy_pp_12x16_neon);
+        p.pu[LUMA_4x4].copy_pp   = PFX(blockcopy_pp_4x4_neon);
+        p.pu[LUMA_4x8].copy_pp   = PFX(blockcopy_pp_4x8_neon);
+        p.pu[LUMA_4x16].copy_pp  = PFX(blockcopy_pp_4x16_neon);
+        p.pu[LUMA_16x4].copy_pp  = PFX(blockcopy_pp_16x4_neon);
+        p.pu[LUMA_16x8].copy_pp  = PFX(blockcopy_pp_16x8_neon);
+        p.pu[LUMA_16x12].copy_pp = PFX(blockcopy_pp_16x12_neon);
+        p.pu[LUMA_16x32].copy_pp = PFX(blockcopy_pp_16x32_neon);
+        p.pu[LUMA_16x64].copy_pp = PFX(blockcopy_pp_16x64_neon);
+        p.pu[LUMA_24x32].copy_pp = PFX(blockcopy_pp_24x32_neon);
+        p.pu[LUMA_32x8].copy_pp  = PFX(blockcopy_pp_32x8_neon);
+        p.pu[LUMA_32x16].copy_pp = PFX(blockcopy_pp_32x16_neon);
+        p.pu[LUMA_32x24].copy_pp = PFX(blockcopy_pp_32x24_neon);
+        p.pu[LUMA_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon);
+        p.pu[LUMA_32x64].copy_pp = PFX(blockcopy_pp_32x64_neon);
+        p.pu[LUMA_48x64].copy_pp = PFX(blockcopy_pp_48x64_neon);
+        p.pu[LUMA_64x16].copy_pp = PFX(blockcopy_pp_64x16_neon);
+        p.pu[LUMA_64x32].copy_pp = PFX(blockcopy_pp_64x32_neon);
+        p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_neon);
+        p.pu[LUMA_64x64].copy_pp = PFX(blockcopy_pp_64x64_neon);
+
597
+        // chroma blockcopy
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].copy_pp   = PFX(blockcopy_pp_2x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].copy_pp   = PFX(blockcopy_pp_2x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].copy_pp   = PFX(blockcopy_pp_4x2_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].copy_pp   = PFX(blockcopy_pp_4x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].copy_pp   = PFX(blockcopy_pp_4x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].copy_pp  = PFX(blockcopy_pp_4x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].copy_pp   = PFX(blockcopy_pp_6x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].copy_pp   = PFX(blockcopy_pp_8x2_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].copy_pp   = PFX(blockcopy_pp_8x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].copy_pp   = PFX(blockcopy_pp_8x6_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].copy_pp   = PFX(blockcopy_pp_8x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].copy_pp  = PFX(blockcopy_pp_8x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].copy_pp  = PFX(blockcopy_pp_8x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].copy_pp = PFX(blockcopy_pp_12x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].copy_pp  = PFX(blockcopy_pp_16x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].copy_pp  = PFX(blockcopy_pp_16x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].copy_pp = PFX(blockcopy_pp_16x12_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].copy_pp = PFX(blockcopy_pp_16x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].copy_pp = PFX(blockcopy_pp_24x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].copy_pp  = PFX(blockcopy_pp_32x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = PFX(blockcopy_pp_32x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = PFX(blockcopy_pp_32x24_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon);
+
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].copy_pp  = PFX(blockcopy_pp_2x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].copy_pp   = PFX(blockcopy_pp_4x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].copy_pp   = PFX(blockcopy_pp_4x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].copy_pp  = PFX(blockcopy_pp_4x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].copy_pp  = PFX(blockcopy_pp_4x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].copy_pp  = PFX(blockcopy_pp_6x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].copy_pp   = PFX(blockcopy_pp_8x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].copy_pp   = PFX(blockcopy_pp_8x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].copy_pp  = PFX(blockcopy_pp_8x12_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].copy_pp  = PFX(blockcopy_pp_8x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].copy_pp  = PFX(blockcopy_pp_8x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].copy_pp  = PFX(blockcopy_pp_8x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].copy_pp = PFX(blockcopy_pp_12x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].copy_pp  = PFX(blockcopy_pp_16x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].copy_pp = PFX(blockcopy_pp_16x24_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].copy_pp = PFX(blockcopy_pp_16x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].copy_pp = PFX(blockcopy_pp_16x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].copy_pp = PFX(blockcopy_pp_24x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = PFX(blockcopy_pp_32x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = PFX(blockcopy_pp_32x48_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = PFX(blockcopy_pp_32x64_neon);
+
647
+        // sad
+        p.pu[LUMA_8x4].sad    = PFX(pixel_sad_8x4_neon);
+        p.pu[LUMA_8x8].sad    = PFX(pixel_sad_8x8_neon);
+        p.pu[LUMA_8x16].sad   = PFX(pixel_sad_8x16_neon);
+        p.pu[LUMA_8x32].sad   = PFX(pixel_sad_8x32_neon);
+        p.pu[LUMA_16x4].sad   = PFX(pixel_sad_16x4_neon);
+        p.pu[LUMA_16x8].sad   = PFX(pixel_sad_16x8_neon);
+        p.pu[LUMA_16x16].sad  = PFX(pixel_sad_16x16_neon);
+        p.pu[LUMA_16x12].sad  = PFX(pixel_sad_16x12_neon);
+        p.pu[LUMA_16x32].sad  = PFX(pixel_sad_16x32_neon);
+        p.pu[LUMA_16x64].sad  = PFX(pixel_sad_16x64_neon);
+        p.pu[LUMA_32x8].sad   = PFX(pixel_sad_32x8_neon);
+        p.pu[LUMA_32x16].sad  = PFX(pixel_sad_32x16_neon);
+        p.pu[LUMA_32x32].sad  = PFX(pixel_sad_32x32_neon);
+        p.pu[LUMA_32x64].sad  = PFX(pixel_sad_32x64_neon);
+        p.pu[LUMA_32x24].sad  = PFX(pixel_sad_32x24_neon);
+        p.pu[LUMA_64x16].sad  = PFX(pixel_sad_64x16_neon);
+        p.pu[LUMA_64x32].sad  = PFX(pixel_sad_64x32_neon);
+        p.pu[LUMA_64x64].sad  = PFX(pixel_sad_64x64_neon);
+        p.pu[LUMA_64x48].sad  = PFX(pixel_sad_64x48_neon);
+        p.pu[LUMA_12x16].sad  = PFX(pixel_sad_12x16_neon);
+        p.pu[LUMA_24x32].sad  = PFX(pixel_sad_24x32_neon);
+        p.pu[LUMA_48x64].sad  = PFX(pixel_sad_48x64_neon);
+
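+        // sad_x3/sad_x4 measure one source block against three or four
+        // reference positions per call, amortizing loads of the source rows
+        // during motion search.
+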
671
+        // sad_x3
+        p.pu[LUMA_4x4].sad_x3   = PFX(sad_x3_4x4_neon);
+        p.pu[LUMA_4x8].sad_x3   = PFX(sad_x3_4x8_neon);
+        p.pu[LUMA_4x16].sad_x3  = PFX(sad_x3_4x16_neon);
+        p.pu[LUMA_8x4].sad_x3   = PFX(sad_x3_8x4_neon);
+        p.pu[LUMA_8x8].sad_x3   = PFX(sad_x3_8x8_neon);
+        p.pu[LUMA_8x16].sad_x3  = PFX(sad_x3_8x16_neon);
+        p.pu[LUMA_8x32].sad_x3  = PFX(sad_x3_8x32_neon);
+        p.pu[LUMA_12x16].sad_x3 = PFX(sad_x3_12x16_neon);
+        p.pu[LUMA_16x4].sad_x3  = PFX(sad_x3_16x4_neon);
+        p.pu[LUMA_16x8].sad_x3  = PFX(sad_x3_16x8_neon);
+        p.pu[LUMA_16x12].sad_x3 = PFX(sad_x3_16x12_neon);
+        p.pu[LUMA_16x16].sad_x3 = PFX(sad_x3_16x16_neon);
+        p.pu[LUMA_16x32].sad_x3 = PFX(sad_x3_16x32_neon);
+        p.pu[LUMA_16x64].sad_x3 = PFX(sad_x3_16x64_neon);
+        p.pu[LUMA_24x32].sad_x3 = PFX(sad_x3_24x32_neon);
+        p.pu[LUMA_32x8].sad_x3  = PFX(sad_x3_32x8_neon);
+        p.pu[LUMA_32x16].sad_x3 = PFX(sad_x3_32x16_neon);
+        p.pu[LUMA_32x24].sad_x3 = PFX(sad_x3_32x24_neon);
+        p.pu[LUMA_32x32].sad_x3 = PFX(sad_x3_32x32_neon);
+        p.pu[LUMA_32x64].sad_x3 = PFX(sad_x3_32x64_neon);
+        p.pu[LUMA_48x64].sad_x3 = PFX(sad_x3_48x64_neon);
+        p.pu[LUMA_64x16].sad_x3 = PFX(sad_x3_64x16_neon);
+        p.pu[LUMA_64x32].sad_x3 = PFX(sad_x3_64x32_neon);
+        p.pu[LUMA_64x48].sad_x3 = PFX(sad_x3_64x48_neon);
+        p.pu[LUMA_64x64].sad_x3 = PFX(sad_x3_64x64_neon);
+
698
+        // sad_x4
+        p.pu[LUMA_4x4].sad_x4   = PFX(sad_x4_4x4_neon);
+        p.pu[LUMA_4x8].sad_x4   = PFX(sad_x4_4x8_neon);
+        p.pu[LUMA_4x16].sad_x4  = PFX(sad_x4_4x16_neon);
+        p.pu[LUMA_8x4].sad_x4   = PFX(sad_x4_8x4_neon);
+        p.pu[LUMA_8x8].sad_x4   = PFX(sad_x4_8x8_neon);
+        p.pu[LUMA_8x16].sad_x4  = PFX(sad_x4_8x16_neon);
+        p.pu[LUMA_8x32].sad_x4  = PFX(sad_x4_8x32_neon);
+        p.pu[LUMA_12x16].sad_x4 = PFX(sad_x4_12x16_neon);
+        p.pu[LUMA_16x4].sad_x4  = PFX(sad_x4_16x4_neon);
+        p.pu[LUMA_16x8].sad_x4  = PFX(sad_x4_16x8_neon);
+        p.pu[LUMA_16x12].sad_x4 = PFX(sad_x4_16x12_neon);
+        p.pu[LUMA_16x16].sad_x4 = PFX(sad_x4_16x16_neon);
+        p.pu[LUMA_16x32].sad_x4 = PFX(sad_x4_16x32_neon);
+        p.pu[LUMA_16x64].sad_x4 = PFX(sad_x4_16x64_neon);
+        p.pu[LUMA_24x32].sad_x4 = PFX(sad_x4_24x32_neon);
+        p.pu[LUMA_32x8].sad_x4  = PFX(sad_x4_32x8_neon);
+        p.pu[LUMA_32x16].sad_x4 = PFX(sad_x4_32x16_neon);
+        p.pu[LUMA_32x24].sad_x4 = PFX(sad_x4_32x24_neon);
+        p.pu[LUMA_32x32].sad_x4 = PFX(sad_x4_32x32_neon);
+        p.pu[LUMA_32x64].sad_x4 = PFX(sad_x4_32x64_neon);
+        p.pu[LUMA_48x64].sad_x4 = PFX(sad_x4_48x64_neon);
+        p.pu[LUMA_64x16].sad_x4 = PFX(sad_x4_64x16_neon);
+        p.pu[LUMA_64x32].sad_x4 = PFX(sad_x4_64x32_neon);
+        p.pu[LUMA_64x48].sad_x4 = PFX(sad_x4_64x48_neon);
+        p.pu[LUMA_64x64].sad_x4 = PFX(sad_x4_64x64_neon);
+
725
+        // pixel_avg_pp
+        p.pu[LUMA_4x4].pixelavg_pp   = PFX(pixel_avg_pp_4x4_neon);
+        p.pu[LUMA_4x8].pixelavg_pp   = PFX(pixel_avg_pp_4x8_neon);
+        p.pu[LUMA_4x16].pixelavg_pp  = PFX(pixel_avg_pp_4x16_neon);
+        p.pu[LUMA_8x4].pixelavg_pp   = PFX(pixel_avg_pp_8x4_neon);
+        p.pu[LUMA_8x8].pixelavg_pp   = PFX(pixel_avg_pp_8x8_neon);
+        p.pu[LUMA_8x16].pixelavg_pp  = PFX(pixel_avg_pp_8x16_neon);
+        p.pu[LUMA_8x32].pixelavg_pp  = PFX(pixel_avg_pp_8x32_neon);
+        p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_pp_12x16_neon);
+        p.pu[LUMA_16x4].pixelavg_pp  = PFX(pixel_avg_pp_16x4_neon);
+        p.pu[LUMA_16x8].pixelavg_pp  = PFX(pixel_avg_pp_16x8_neon);
+        p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_pp_16x12_neon);
+        p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_pp_16x16_neon);
+        p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_pp_16x32_neon);
+        p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_pp_16x64_neon);
+        p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_pp_24x32_neon);
+        p.pu[LUMA_32x8].pixelavg_pp  = PFX(pixel_avg_pp_32x8_neon);
+        p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_pp_32x16_neon);
+        p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_pp_32x24_neon);
+        p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_pp_32x32_neon);
+        p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_pp_32x64_neon);
+        p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_pp_48x64_neon);
+        p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_pp_64x16_neon);
+        p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_pp_64x32_neon);
+        p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_pp_64x48_neon);
+        p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_pp_64x64_neon);
+
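+        // pixelavg_pp averages two pixel predictions, as needed for
+        // bi-prediction and half-pel candidates.
+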
752
+        // planecopy
+        p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
+
+        // sa8d
+        p.cu[BLOCK_4x4].sa8d   = PFX(pixel_satd_4x4_neon);
+        p.cu[BLOCK_8x8].sa8d   = PFX(pixel_sa8d_8x8_neon);
+        p.cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_neon);
+        p.cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_neon);
+        p.cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d  = PFX(pixel_sa8d_8x16_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = PFX(pixel_sa8d_16x32_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = PFX(pixel_sa8d_32x64_neon);
+
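+        // sa8d is SATD computed on 8x8 Hadamard blocks; at 4x4 it reduces
+        // to plain satd, hence the pixel_satd_4x4_neon binding above.
+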
764
+        // vertical interpolation filters
+        p.pu[LUMA_4x4].luma_vpp     = PFX(interp_8tap_vert_pp_4x4_neon);
+        p.pu[LUMA_4x8].luma_vpp     = PFX(interp_8tap_vert_pp_4x8_neon);
+        p.pu[LUMA_4x16].luma_vpp    = PFX(interp_8tap_vert_pp_4x16_neon);
+        p.pu[LUMA_8x4].luma_vpp     = PFX(interp_8tap_vert_pp_8x4_neon);
+        p.pu[LUMA_8x8].luma_vpp     = PFX(interp_8tap_vert_pp_8x8_neon);
+        p.pu[LUMA_8x16].luma_vpp    = PFX(interp_8tap_vert_pp_8x16_neon);
+        p.pu[LUMA_8x32].luma_vpp    = PFX(interp_8tap_vert_pp_8x32_neon);
+        p.pu[LUMA_16x4].luma_vpp    = PFX(interp_8tap_vert_pp_16x4_neon);
+        p.pu[LUMA_16x8].luma_vpp    = PFX(interp_8tap_vert_pp_16x8_neon);
+        p.pu[LUMA_16x16].luma_vpp   = PFX(interp_8tap_vert_pp_16x16_neon);
+        p.pu[LUMA_16x32].luma_vpp   = PFX(interp_8tap_vert_pp_16x32_neon);
+        p.pu[LUMA_16x64].luma_vpp   = PFX(interp_8tap_vert_pp_16x64_neon);
+        p.pu[LUMA_16x12].luma_vpp   = PFX(interp_8tap_vert_pp_16x12_neon);
+        p.pu[LUMA_32x8].luma_vpp    = PFX(interp_8tap_vert_pp_32x8_neon);
+        p.pu[LUMA_32x16].luma_vpp   = PFX(interp_8tap_vert_pp_32x16_neon);
+        p.pu[LUMA_32x32].luma_vpp   = PFX(interp_8tap_vert_pp_32x32_neon);
+        p.pu[LUMA_32x64].luma_vpp   = PFX(interp_8tap_vert_pp_32x64_neon);
+        p.pu[LUMA_32x24].luma_vpp   = PFX(interp_8tap_vert_pp_32x24_neon);
+        p.pu[LUMA_64x16].luma_vpp   = PFX(interp_8tap_vert_pp_64x16_neon);
+        p.pu[LUMA_64x32].luma_vpp   = PFX(interp_8tap_vert_pp_64x32_neon);
+        p.pu[LUMA_64x64].luma_vpp   = PFX(interp_8tap_vert_pp_64x64_neon);
+        p.pu[LUMA_64x48].luma_vpp   = PFX(interp_8tap_vert_pp_64x48_neon);
+        p.pu[LUMA_24x32].luma_vpp   = PFX(interp_8tap_vert_pp_24x32_neon);
+        p.pu[LUMA_48x64].luma_vpp   = PFX(interp_8tap_vert_pp_48x64_neon);
+        p.pu[LUMA_12x16].luma_vpp   = PFX(interp_8tap_vert_pp_12x16_neon);
+
791
+        p.pu[LUMA_4x4].luma_vsp     = PFX(interp_8tap_vert_sp_4x4_neon);
+        p.pu[LUMA_4x8].luma_vsp     = PFX(interp_8tap_vert_sp_4x8_neon);
+        p.pu[LUMA_4x16].luma_vsp    = PFX(interp_8tap_vert_sp_4x16_neon);
+        p.pu[LUMA_8x4].luma_vsp     = PFX(interp_8tap_vert_sp_8x4_neon);
+        p.pu[LUMA_8x8].luma_vsp     = PFX(interp_8tap_vert_sp_8x8_neon);
+        p.pu[LUMA_8x16].luma_vsp    = PFX(interp_8tap_vert_sp_8x16_neon);
+        p.pu[LUMA_8x32].luma_vsp    = PFX(interp_8tap_vert_sp_8x32_neon);
+        p.pu[LUMA_16x4].luma_vsp    = PFX(interp_8tap_vert_sp_16x4_neon);
+        p.pu[LUMA_16x8].luma_vsp    = PFX(interp_8tap_vert_sp_16x8_neon);
+        p.pu[LUMA_16x16].luma_vsp   = PFX(interp_8tap_vert_sp_16x16_neon);
+        p.pu[LUMA_16x32].luma_vsp   = PFX(interp_8tap_vert_sp_16x32_neon);
+        p.pu[LUMA_16x64].luma_vsp   = PFX(interp_8tap_vert_sp_16x64_neon);
+        p.pu[LUMA_16x12].luma_vsp   = PFX(interp_8tap_vert_sp_16x12_neon);
+        p.pu[LUMA_32x8].luma_vsp    = PFX(interp_8tap_vert_sp_32x8_neon);
+        p.pu[LUMA_32x16].luma_vsp   = PFX(interp_8tap_vert_sp_32x16_neon);
+        p.pu[LUMA_32x32].luma_vsp   = PFX(interp_8tap_vert_sp_32x32_neon);
+        p.pu[LUMA_32x64].luma_vsp   = PFX(interp_8tap_vert_sp_32x64_neon);
+        p.pu[LUMA_32x24].luma_vsp   = PFX(interp_8tap_vert_sp_32x24_neon);
+        p.pu[LUMA_64x16].luma_vsp   = PFX(interp_8tap_vert_sp_64x16_neon);
+        p.pu[LUMA_64x32].luma_vsp   = PFX(interp_8tap_vert_sp_64x32_neon);
+        p.pu[LUMA_64x64].luma_vsp   = PFX(interp_8tap_vert_sp_64x64_neon);
+        p.pu[LUMA_64x48].luma_vsp   = PFX(interp_8tap_vert_sp_64x48_neon);
+        p.pu[LUMA_24x32].luma_vsp   = PFX(interp_8tap_vert_sp_24x32_neon);
+        p.pu[LUMA_48x64].luma_vsp   = PFX(interp_8tap_vert_sp_48x64_neon);
+        p.pu[LUMA_12x16].luma_vsp   = PFX(interp_8tap_vert_sp_12x16_neon);
+
817
+        p.pu[LUMA_4x4].luma_vps     = PFX(interp_8tap_vert_ps_4x4_neon);
+        p.pu[LUMA_4x8].luma_vps     = PFX(interp_8tap_vert_ps_4x8_neon);
+        p.pu[LUMA_4x16].luma_vps    = PFX(interp_8tap_vert_ps_4x16_neon);
+        p.pu[LUMA_8x4].luma_vps     = PFX(interp_8tap_vert_ps_8x4_neon);
+        p.pu[LUMA_8x8].luma_vps     = PFX(interp_8tap_vert_ps_8x8_neon);
+        p.pu[LUMA_8x16].luma_vps    = PFX(interp_8tap_vert_ps_8x16_neon);
+        p.pu[LUMA_8x32].luma_vps    = PFX(interp_8tap_vert_ps_8x32_neon);
+        p.pu[LUMA_16x4].luma_vps    = PFX(interp_8tap_vert_ps_16x4_neon);
+        p.pu[LUMA_16x8].luma_vps    = PFX(interp_8tap_vert_ps_16x8_neon);
+        p.pu[LUMA_16x16].luma_vps   = PFX(interp_8tap_vert_ps_16x16_neon);
+        p.pu[LUMA_16x32].luma_vps   = PFX(interp_8tap_vert_ps_16x32_neon);
+        p.pu[LUMA_16x64].luma_vps   = PFX(interp_8tap_vert_ps_16x64_neon);
+        p.pu[LUMA_16x12].luma_vps   = PFX(interp_8tap_vert_ps_16x12_neon);
+        p.pu[LUMA_32x8].luma_vps    = PFX(interp_8tap_vert_ps_32x8_neon);
+        p.pu[LUMA_32x16].luma_vps   = PFX(interp_8tap_vert_ps_32x16_neon);
+        p.pu[LUMA_32x32].luma_vps   = PFX(interp_8tap_vert_ps_32x32_neon);
+        p.pu[LUMA_32x64].luma_vps   = PFX(interp_8tap_vert_ps_32x64_neon);
+        p.pu[LUMA_32x24].luma_vps   = PFX(interp_8tap_vert_ps_32x24_neon);
+        p.pu[LUMA_64x16].luma_vps   = PFX(interp_8tap_vert_ps_64x16_neon);
+        p.pu[LUMA_64x32].luma_vps   = PFX(interp_8tap_vert_ps_64x32_neon);
+        p.pu[LUMA_64x64].luma_vps   = PFX(interp_8tap_vert_ps_64x64_neon);
+        p.pu[LUMA_64x48].luma_vps   = PFX(interp_8tap_vert_ps_64x48_neon);
+        p.pu[LUMA_24x32].luma_vps   = PFX(interp_8tap_vert_ps_24x32_neon);
+        p.pu[LUMA_48x64].luma_vps   = PFX(interp_8tap_vert_ps_48x64_neon);
+        p.pu[LUMA_12x16].luma_vps   = PFX(interp_8tap_vert_ps_12x16_neon);
+
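+        // The vertical filters mirror the horizontal naming; vsp variants
+        // consume the 16-bit intermediate left by a horizontal ps pass when
+        // both dimensions need interpolation.
+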
843
+        // vertical chroma filters
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_vpp = PFX(interp_4tap_vert_pp_8x2_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vpp = PFX(interp_4tap_vert_pp_8x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_vpp = PFX(interp_4tap_vert_pp_8x6_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vpp = PFX(interp_4tap_vert_pp_8x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vpp = PFX(interp_4tap_vert_pp_24x32_neon);
+
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vpp = PFX(interp_4tap_vert_pp_8x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vpp = PFX(interp_4tap_vert_pp_8x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vpp = PFX(interp_4tap_vert_pp_8x12_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vpp = PFX(interp_4tap_vert_pp_8x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = PFX(interp_4tap_vert_pp_16x24_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = PFX(interp_4tap_vert_pp_32x48_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vpp = PFX(interp_4tap_vert_pp_24x64_neon);
+
878
+        p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vpp = PFX(interp_4tap_vert_pp_8x4_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vpp = PFX(interp_4tap_vert_pp_8x8_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = PFX(interp_4tap_vert_pp_64x32_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vpp = PFX(interp_4tap_vert_pp_64x48_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vpp = PFX(interp_4tap_vert_pp_64x64_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vpp = PFX(interp_4tap_vert_pp_24x32_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vpp = PFX(interp_4tap_vert_pp_48x64_neon);
+
899
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_vps = PFX(interp_4tap_vert_ps_8x2_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vps = PFX(interp_4tap_vert_ps_8x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_vps = PFX(interp_4tap_vert_ps_8x6_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vps = PFX(interp_4tap_vert_ps_8x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vps = PFX(interp_4tap_vert_ps_8x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vps = PFX(interp_4tap_vert_ps_8x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vps = PFX(interp_4tap_vert_ps_16x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vps = PFX(interp_4tap_vert_ps_16x12_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vps = PFX(interp_4tap_vert_ps_24x32_neon);
+
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vps = PFX(interp_4tap_vert_ps_8x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vps = PFX(interp_4tap_vert_ps_8x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vps = PFX(interp_4tap_vert_ps_8x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vps = PFX(interp_4tap_vert_ps_8x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vps = PFX(interp_4tap_vert_ps_8x12_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vps = PFX(interp_4tap_vert_ps_8x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vps = PFX(interp_4tap_vert_ps_16x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vps = PFX(interp_4tap_vert_ps_16x24_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vps = PFX(interp_4tap_vert_ps_32x48_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vps = PFX(interp_4tap_vert_ps_24x64_neon);
+
+        p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vps = PFX(interp_4tap_vert_ps_8x4_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vps = PFX(interp_4tap_vert_ps_8x8_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vps = PFX(interp_4tap_vert_ps_8x16_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vps = PFX(interp_4tap_vert_ps_8x32_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vps = PFX(interp_4tap_vert_ps_16x4_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vps = PFX(interp_4tap_vert_ps_16x12_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_neon);
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vps = PFX(interp_4tap_vert_ps_16x64_neon);
943
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_neon);
944
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_neon);
945
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_neon);
946
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_neon);
947
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vps = PFX(interp_4tap_vert_ps_64x16_neon);
948
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = PFX(interp_4tap_vert_ps_64x32_neon);
949
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = PFX(interp_4tap_vert_ps_64x48_neon);
950
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vps = PFX(interp_4tap_vert_ps_64x64_neon);
951
+        p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vps = PFX(interp_4tap_vert_ps_24x32_neon);
952
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vps = PFX(interp_4tap_vert_ps_48x64_neon);
953
+
954
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_vsp = PFX(interp_4tap_vert_sp_8x2_neon);
955
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vsp = PFX(interp_4tap_vert_sp_8x4_neon);
956
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_vsp = PFX(interp_4tap_vert_sp_8x6_neon);
957
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vsp = PFX(interp_4tap_vert_sp_8x8_neon);
958
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vsp = PFX(interp_4tap_vert_sp_8x16_neon);
959
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vsp = PFX(interp_4tap_vert_sp_8x32_neon);
960
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vsp = PFX(interp_4tap_vert_sp_16x4_neon);
961
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_neon);
962
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vsp = PFX(interp_4tap_vert_sp_16x12_neon);
963
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_neon);
964
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_neon);
965
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vsp = PFX(interp_4tap_vert_sp_32x8_neon);
966
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_neon);
967
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vsp = PFX(interp_4tap_vert_sp_32x24_neon);
968
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_neon);
969
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vsp = PFX(interp_4tap_vert_sp_24x32_neon);
970
+
971
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vsp = PFX(interp_4tap_vert_sp_8x4_neon);
972
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vsp = PFX(interp_4tap_vert_sp_8x8_neon);
973
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vsp = PFX(interp_4tap_vert_sp_8x16_neon);
974
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vsp = PFX(interp_4tap_vert_sp_8x32_neon);
975
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vsp = PFX(interp_4tap_vert_sp_8x12_neon);
976
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vsp = PFX(interp_4tap_vert_sp_8x64_neon);
977
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_neon);
978
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_neon);
979
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_neon);
980
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vsp = PFX(interp_4tap_vert_sp_16x64_neon);
981
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vsp = PFX(interp_4tap_vert_sp_16x24_neon);
982
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_neon);
983
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_neon);
984
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_neon);
985
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vsp = PFX(interp_4tap_vert_sp_32x48_neon);
986
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vsp = PFX(interp_4tap_vert_sp_24x64_neon);
987
+
988
+        p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vsp = PFX(interp_4tap_vert_sp_8x4_neon);
989
+        p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vsp = PFX(interp_4tap_vert_sp_8x8_neon);
990
+        p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vsp = PFX(interp_4tap_vert_sp_8x16_neon);
991
+        p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vsp = PFX(interp_4tap_vert_sp_8x32_neon);
992
+        p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vsp = PFX(interp_4tap_vert_sp_16x4_neon);
993
+        p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vsp = PFX(interp_4tap_vert_sp_16x8_neon);
994
+        p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vsp = PFX(interp_4tap_vert_sp_16x12_neon);
995
+        p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vsp = PFX(interp_4tap_vert_sp_16x16_neon);
996
+        p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vsp = PFX(interp_4tap_vert_sp_16x32_neon);
997
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vsp = PFX(interp_4tap_vert_sp_16x64_neon);
998
+        p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vsp = PFX(interp_4tap_vert_sp_32x8_neon);
999
+        p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_neon);
1000
+        p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_neon);
1001
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_neon);
1002
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vsp = PFX(interp_4tap_vert_sp_64x16_neon);
1003
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vsp = PFX(interp_4tap_vert_sp_64x32_neon);
1004
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = PFX(interp_4tap_vert_sp_64x48_neon);
1005
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_neon);
1006
+        p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vsp = PFX(interp_4tap_vert_sp_24x32_neon);
1007
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp = PFX(interp_4tap_vert_sp_48x64_neon);
1008
+
1009
+        p.cu[BLOCK_4x4].dct = PFX(dct_4x4_neon);
1010
+        p.cu[BLOCK_8x8].dct = PFX(dct_8x8_neon);
1011
+        p.cu[BLOCK_16x16].dct = PFX(dct_16x16_neon);
1012
+#if !HIGH_BIT_DEPTH
1013
+        p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_4x4_neon);
1014
+#endif // !HIGH_BIT_DEPTH
1015
+    }
1016
+    if (cpuMask & X265_CPU_ARMV6)
1017
+    {
1018
+        p.pu[LUMA_4x4].sad = PFX(pixel_sad_4x4_armv6);
1019
+        p.pu[LUMA_4x8].sad = PFX(pixel_sad_4x8_armv6);
1020
+        p.pu[LUMA_4x16].sad = PFX(pixel_sad_4x16_armv6);
1021
+    }
1022
+}
1023
+} // namespace X265_NS
1024
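For context, the tables populated above are consumed through x265's function-pointer dispatch: the encoder looks a kernel up by chroma format and partition size and calls it without knowing whether C, NEON, or x86 code sits behind the pointer. A minimal hedged sketch of such a call site (assuming x265's internal primitives.h is in scope and that filter_vpp follows the filter_pp_t signature used elsewhere in the tree; the helper name is illustrative, not part of the patch):

    #include "primitives.h"   // x265-internal: EncoderPrimitives, pixel, X265_CSP_*
    using namespace X265_NS;

    // hypothetical helper: interpolate one 8x8 chroma block of I420 content
    static void filterChroma8x8(const EncoderPrimitives& p,
                                const pixel* src, intptr_t srcStride,
                                pixel* dst, intptr_t dstStride, int coeffIdx)
    {
        // after the setup above runs on ARM, this lands on
        // interp_4tap_vert_pp_8x8_neon
        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vpp(
            src, srcStride, dst, dstStride, coeffIdx);
    }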
x265_2.0.tar.gz/source/common/arm/asm.S Added
196
 
1
@@ -0,0 +1,194 @@
2
+/*****************************************************************************
3
+ * asm.S: arm utility macros
4
+ *****************************************************************************
5
+ * Copyright (C) 2016 x265 project
6
+ *
7
+ * Authors: Mans Rullgard <mans@mansr.com>
8
+ *          David Conrad <lessen42@gmail.com>
9
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
10
+ *
11
+ * This program is free software; you can redistribute it and/or modify
12
+ * it under the terms of the GNU General Public License as published by
13
+ * the Free Software Foundation; either version 2 of the License, or
14
+ * (at your option) any later version.
15
+ *
16
+ * This program is distributed in the hope that it will be useful,
17
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19
+ * GNU General Public License for more details.
20
+ *
21
+ * You should have received a copy of the GNU General Public License
22
+ * along with this program; if not, write to the Free Software
23
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
24
+ *
25
+ * This program is also available under a commercial proprietary license.
26
+ * For more information, contact us at license @ x265.com.
27
+ *****************************************************************************/
28
+
29
+.syntax unified
30
+
31
+#if   HAVE_NEON
32
+        .arch           armv7-a
33
+#elif HAVE_ARMV6T2
34
+        .arch           armv6t2
35
+#elif HAVE_ARMV6
36
+        .arch           armv6
37
+#endif
38
+
39
+.fpu neon
40
+
41
+#ifdef PREFIX
42
+#   define EXTERN_ASM _
43
+#else
44
+#   define EXTERN_ASM
45
+#endif
46
+
47
+#ifdef __ELF__
48
+#   define ELF
49
+#else
50
+#   define ELF @
51
+#endif
52
+
53
+#if HAVE_AS_FUNC
54
+#   define FUNC
55
+#else
56
+#   define FUNC @
57
+#endif
58
+
59
+.macro require8, val=1
60
+ELF     .eabi_attribute 24, \val
61
+.endm
62
+
63
+.macro preserve8, val=1
64
+ELF     .eabi_attribute 25, \val
65
+.endm
66
+
67
+.macro function name, export=1
68
+    .macro endfunc
69
+ELF     .size   \name, . - \name
70
+FUNC    .endfunc
71
+        .purgem endfunc
72
+    .endm
73
+        .align  2
74
+.if \export == 1
75
+        .global EXTERN_ASM\name
76
+ELF     .hidden EXTERN_ASM\name
77
+ELF     .type   EXTERN_ASM\name, %function
78
+FUNC    .func   EXTERN_ASM\name
79
+EXTERN_ASM\name:
80
+.else
81
+ELF     .hidden \name
82
+ELF     .type   \name, %function
83
+FUNC    .func   \name
84
+\name:
85
+.endif
86
+.endm
87
+
88
+.macro movrel rd, val
89
+#if HAVE_ARMV6T2 && !defined(PIC)
90
+        movw            \rd, #:lower16:\val
91
+        movt            \rd, #:upper16:\val
92
+#else
93
+        ldr             \rd, =\val
94
+#endif
95
+.endm
96
+
97
+.macro movconst rd, val
98
+#if HAVE_ARMV6T2
99
+    movw        \rd, #:lower16:\val
100
+.if \val >> 16
101
+    movt        \rd, #:upper16:\val
102
+.endif
103
+#else
104
+    ldr         \rd, =\val
105
+#endif
106
+.endm
107
+
108
+#define GLUE(a, b) a ## b
109
+#define JOIN(a, b) GLUE(a, b)
110
+#define X(s) JOIN(EXTERN_ASM, s)
111
+
112
+#define FENC_STRIDE 64
113
+#define FDEC_STRIDE 32
114
+
115
+.macro HORIZ_ADD dest, a, b
116
+.ifnb \b
117
+    vadd.u16    \a, \a, \b
118
+.endif
119
+    vpaddl.u16  \a, \a
120
+    vpaddl.u32  \dest, \a
121
+.endm
122
+
123
+.macro SUMSUB_AB sum, diff, a, b
124
+    vadd.s16    \sum,  \a, \b
125
+    vsub.s16    \diff, \a, \b
126
+.endm
127
+
128
+.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
129
+    SUMSUB_AB   \s1, \d1, \a, \b
130
+    SUMSUB_AB   \s2, \d2, \c, \d
131
+.endm
132
+
133
+.macro ABS2 a b
134
+    vabs.s16 \a, \a
135
+    vabs.s16 \b, \b
136
+.endm
137
+
138
+// dist = distance in elements (0 for vertical pass, 1/2 for horizontal passes)
139
+// op = sumsub/amax (sum and diff / maximum of absolutes)
140
+// d1/2 = destination registers
141
+// s1/2 = source registers
142
+.macro HADAMARD dist, op, d1, d2, s1, s2
143
+.if \dist == 1
144
+    vtrn.16     \s1, \s2
145
+.else
146
+    vtrn.32     \s1, \s2
147
+.endif
148
+.ifc \op, sumsub
149
+    SUMSUB_AB   \d1, \d2, \s1, \s2
150
+.else
151
+    vabs.s16    \s1, \s1
152
+    vabs.s16    \s2, \s2
153
+    vmax.s16    \d1, \s1, \s2
154
+.endif
155
+.endm
156
+
157
+.macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7
158
+    vtrn.32         \r0, \r4
159
+    vtrn.32         \r1, \r5
160
+    vtrn.32         \r2, \r6
161
+    vtrn.32         \r3, \r7
162
+    vtrn.16         \r0, \r2
163
+    vtrn.16         \r1, \r3
164
+    vtrn.16         \r4, \r6
165
+    vtrn.16         \r5, \r7
166
+    vtrn.8          \r0, \r1
167
+    vtrn.8          \r2, \r3
168
+    vtrn.8          \r4, \r5
169
+    vtrn.8          \r6, \r7
170
+.endm
171
+
172
+.macro TRANSPOSE4x4 r0 r1 r2 r3
173
+    vtrn.16         \r0, \r2
174
+    vtrn.16         \r1, \r3
175
+    vtrn.8          \r0, \r1
176
+    vtrn.8          \r2, \r3
177
+.endm
178
+
179
+.macro TRANSPOSE4x4_16  r0, r1, r2, r3
180
+    vtrn.32     \r0, \r2            // r0 = [21 20 01 00], r2 = [23 22 03 02]
181
+    vtrn.32     \r1, \r3            // r1 = [31 30 11 10], r3 = [33 32 13 12]
182
+    vtrn.16     \r0, \r1            // r0 = [30 20 10 00], r1 = [31 21 11 01]
183
+    vtrn.16     \r2, \r3            // r2 = [32 22 12 02], r3 = [33 23 13 03]
184
+.endm
185
+
186
+.macro TRANSPOSE4x4x2_16  rA0, rA1, rA2, rA3, rB0, rB1, rB2, rB3
187
+    vtrn.32     \rA0, \rA2          // r0 = [21 20 01 00], r2 = [23 22 03 02]
188
+    vtrn.32     \rA1, \rA3          // r1 = [31 30 11 10], r3 = [33 32 13 12]
189
+    vtrn.32     \rB0, \rB2
190
+    vtrn.32     \rB1, \rB3
191
+    vtrn.16     \rA0, \rA1          // r0 = [30 20 10 00], r1 = [31 21 11 01]
192
+    vtrn.16     \rA2, \rA3          // r2 = [32 22 12 02], r3 = [33 23 13 03]
193
+    vtrn.16     \rB0, \rB1
194
+    vtrn.16     \rB2, \rB3
195
+.endm
196
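Every kernel in the files that follow is built on these macros: function/endfunc emit the symbol, visibility, and .func/.size plumbing for the target, and movconst loads a 32-bit immediate as a movw/movt pair where ARMv6T2 is available, falling back to a literal-pool ldr otherwise. A minimal hedged sketch of a kernel written against them (illustrative only, not part of the patch):

    #include "asm.S"

    // hypothetical: uint32_t x265_example_const(void) -- returns a packed constant
    function x265_example_const
        movconst    r0, 0x00240053      // movw/movt on ARMv6T2+, ldr = otherwise
        bx          lr                  // AAPCS: 32-bit result in r0
    endfunc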
x265_2.0.tar.gz/source/common/arm/blockcopy8.S Added
840
 
1
@@ -0,0 +1,838 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Radhakrishnan VR <radhakrishnan@multicorewareinc.com>
6
+ * 
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+
27
+.section .rodata
28
+
29
+.align 4
30
+
31
+.text
32
+
33
+/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
34
+ *
35
+ * r0   - a
36
+ * r1   - stridea
37
+ * r2   - b
38
+ * r3   - strideb */
39
+function x265_blockcopy_sp_4x4_neon
40
+    lsl             r3, #1
41
+.rept 2
42
+    vld1.u16        {q0}, [r2], r3
43
+    vld1.u16        {q1}, [r2], r3
44
+    vmovn.u16       d0, q0
45
+    vmovn.u16       d1, q1
46
+    vst1.u32        {d0[0]}, [r0], r1
47
+    vst1.u32        {d1[0]}, [r0], r1
48
+.endr
49
+    bx              lr
50
+endfunc
51
+
52
+function x265_blockcopy_sp_8x8_neon
53
+    lsl             r3, #1
54
+.rept 4
55
+    vld1.u16        {q0}, [r2], r3
56
+    vld1.u16        {q1}, [r2], r3
57
+    vmovn.u16       d0, q0
58
+    vmovn.u16       d1, q1
59
+    vst1.u8         {d0}, [r0], r1
60
+    vst1.u8         {d1}, [r0], r1
61
+.endr
62
+    bx              lr
63
+endfunc
64
+
65
+function x265_blockcopy_sp_16x16_neon
66
+    lsl             r3, #1
67
+.rept 8
68
+    vld1.u16        {q0, q1}, [r2], r3
69
+    vld1.u16        {q2, q3}, [r2], r3
70
+    vmovn.u16       d0, q0
71
+    vmovn.u16       d1, q1
72
+    vmovn.u16       d2, q2
73
+    vmovn.u16       d3, q3
74
+    vst1.u8         {q0}, [r0], r1
75
+    vst1.u8         {q1}, [r0], r1
76
+.endr
77
+    bx              lr
78
+endfunc
79
+
80
+function x265_blockcopy_sp_32x32_neon
81
+    mov             r12, #4
82
+    lsl             r3, #1
83
+    sub             r3, #32
84
+loop_csp32:
85
+    subs            r12, #1
86
+.rept 4
87
+    vld1.u16        {q0, q1}, [r2]!
88
+    vld1.u16        {q2, q3}, [r2], r3
89
+    vld1.u16        {q8, q9}, [r2]!
90
+    vld1.u16        {q10, q11}, [r2], r3
91
+
92
+    vmovn.u16       d0, q0
93
+    vmovn.u16       d1, q1
94
+    vmovn.u16       d2, q2
95
+    vmovn.u16       d3, q3
96
+
97
+    vmovn.u16       d4, q8
98
+    vmovn.u16       d5, q9
99
+    vmovn.u16       d6, q10
100
+    vmovn.u16       d7, q11
101
+
102
+    vst1.u8         {q0, q1}, [r0], r1
103
+    vst1.u8         {q2, q3}, [r0], r1
104
+.endr
105
+    bne             loop_csp32
106
+    bx              lr
107
+endfunc
108
+
109
+function x265_blockcopy_sp_64x64_neon
110
+    mov             r12, #16
111
+    lsl             r3, #1
112
+    sub             r3, #96
113
+    sub             r1, #32
114
+loop_csp64:
115
+    subs            r12, #1
116
+.rept 4
117
+    vld1.u16        {q0, q1}, [r2]!
118
+    vld1.u16        {q2, q3}, [r2]!
119
+    vld1.u16        {q8, q9}, [r2]!
120
+    vld1.u16        {q10, q11}, [r2], r3
121
+
122
+    vmovn.u16       d0, q0
123
+    vmovn.u16       d1, q1
124
+    vmovn.u16       d2, q2
125
+    vmovn.u16       d3, q3
126
+
127
+    vmovn.u16       d4, q8
128
+    vmovn.u16       d5, q9
129
+    vmovn.u16       d6, q10
130
+    vmovn.u16       d7, q11
131
+
132
+    vst1.u8         {q0, q1}, [r0]!
133
+    vst1.u8         {q2, q3}, [r0], r1
134
+.endr
135
+    bne             loop_csp64
136
+    bx              lr
137
+endfunc
138
+
139
+// void blockcopy_ps(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
140
+function x265_blockcopy_ps_4x4_neon
141
+    lsl             r1, #1
142
+.rept 2
143
+    vld1.u8         {d0}, [r2], r3
144
+    vld1.u8         {d1}, [r2], r3
145
+    vmovl.u8        q1, d0
146
+    vmovl.u8        q2, d1
147
+    vst1.u16        {d2}, [r0], r1
148
+    vst1.u16        {d4}, [r0], r1
149
+.endr
150
+    bx              lr
151
+endfunc
152
+
153
+function x265_blockcopy_ps_8x8_neon
154
+    lsl             r1, #1
155
+.rept 4
156
+    vld1.u8         {d0}, [r2], r3
157
+    vld1.u8         {d1}, [r2], r3
158
+    vmovl.u8        q1, d0
159
+    vmovl.u8        q2, d1
160
+    vst1.u16        {q1}, [r0], r1
161
+    vst1.u16        {q2}, [r0], r1
162
+.endr
163
+    bx              lr
164
+endfunc
165
+
166
+function x265_blockcopy_ps_16x16_neon
167
+    lsl             r1, #1
168
+.rept 8
169
+    vld1.u8         {q0}, [r2], r3
170
+    vld1.u8         {q1}, [r2], r3
171
+    vmovl.u8        q8, d0
172
+    vmovl.u8        q9, d1
173
+    vmovl.u8        q10, d2
174
+    vmovl.u8        q11, d3
175
+    vst1.u16        {q8, q9}, [r0], r1
176
+    vst1.u16        {q10, q11}, [r0], r1
177
+.endr
178
+    bx              lr
179
+endfunc
180
+
181
+function x265_blockcopy_ps_32x32_neon
182
+    lsl             r1, #1
183
+    sub             r1, #32
184
+    mov             r12, #4
185
+loop_cps32:
186
+    subs            r12, #1
187
+.rept 4
188
+    vld1.u8         {q0, q1}, [r2], r3
189
+    vld1.u8         {q2, q3}, [r2], r3
190
+    vmovl.u8        q8, d0
191
+    vmovl.u8        q9, d1
192
+    vmovl.u8        q10, d2
193
+    vmovl.u8        q11, d3
194
+
195
+    vmovl.u8        q12, d4
196
+    vmovl.u8        q13, d5
197
+    vmovl.u8        q14, d6
198
+    vmovl.u8        q15, d7
199
+
200
+    vst1.u16        {q8, q9}, [r0]!
201
+    vst1.u16        {q10, q11}, [r0], r1
202
+    vst1.u16        {q12, q13}, [r0]!
203
+    vst1.u16        {q14, q15}, [r0], r1
204
+.endr
205
+    bne             loop_cps32
206
+    bx              lr
207
+endfunc
208
+
209
+function x265_blockcopy_ps_64x64_neon
210
+    lsl             r1, #1
211
+    sub             r1, #96
212
+    sub             r3, #32
213
+    mov             r12, #16
214
+loop_cps64:
215
+    subs            r12, #1
216
+.rept 4
217
+    vld1.u8         {q0, q1}, [r2]!
218
+    vld1.u8         {q2, q3}, [r2], r3
219
+    vmovl.u8        q8, d0
220
+    vmovl.u8        q9, d1
221
+    vmovl.u8        q10, d2
222
+    vmovl.u8        q11, d3
223
+
224
+    vmovl.u8        q12, d4
225
+    vmovl.u8        q13, d5
226
+    vmovl.u8        q14, d6
227
+    vmovl.u8        q15, d7
228
+
229
+    vst1.u16        {q8, q9}, [r0]!
230
+    vst1.u16        {q10, q11}, [r0]!
231
+    vst1.u16        {q12, q13}, [r0]!
232
+    vst1.u16        {q14, q15}, [r0], r1
233
+.endr
234
+    bne             loop_cps64
235
+    bx              lr
236
+endfunc
237
+
238
+// void x265_blockcopy_ss(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
239
+function x265_blockcopy_ss_4x4_neon
240
+    lsl             r1, #1
241
+    lsl             r3, #1
242
+.rept 2
243
+    vld1.u16        {d0}, [r2], r3
244
+    vld1.u16        {d1}, [r2], r3
245
+    vst1.u16        {d0}, [r0], r1
246
+    vst1.u16        {d1}, [r0], r1
247
+.endr
248
+    bx              lr
249
+endfunc
250
+
251
+function x265_blockcopy_ss_8x8_neon
252
+    lsl             r1, #1
253
+    lsl             r3, #1
254
+.rept 4
255
+    vld1.u16        {q0}, [r2], r3
256
+    vld1.u16        {q1}, [r2], r3
257
+    vst1.u16        {q0}, [r0], r1
258
+    vst1.u16        {q1}, [r0], r1
259
+.endr
260
+    bx              lr
261
+endfunc
262
+
263
+function x265_blockcopy_ss_16x16_neon
264
+    lsl             r1, #1
265
+    lsl             r3, #1
266
+.rept 8
267
+    vld1.u16        {q0, q1}, [r2], r3
268
+    vld1.u16        {q2, q3}, [r2], r3
269
+    vst1.u16        {q0, q1}, [r0], r1
270
+    vst1.u16        {q2, q3}, [r0], r1
271
+.endr
272
+    bx              lr
273
+endfunc
274
+
275
+function x265_blockcopy_ss_32x32_neon
276
+    lsl             r1, #1
277
+    lsl             r3, #1
278
+    mov             r12, #4
279
+    sub             r1, #32
280
+    sub             r3, #32
281
+loop_css32:
282
+    subs            r12, #1
283
+.rept 8
284
+    vld1.u16        {q0, q1}, [r2]!
285
+    vld1.u16        {q2, q3}, [r2], r3
286
+    vst1.u16        {q0, q1}, [r0]!
287
+    vst1.u16        {q2, q3}, [r0], r1
288
+.endr
289
+    bne             loop_css32
290
+    bx              lr
291
+endfunc
292
+
293
+function x265_blockcopy_ss_64x64_neon
294
+    lsl             r1, #1
295
+    lsl             r3, #1
296
+    mov             r12, #8
297
+    sub             r1, #96
298
+    sub             r3, #96
299
+loop_css64:
300
+    subs            r12, #1
301
+.rept 8
302
+    vld1.u16        {q0, q1}, [r2]!
303
+    vld1.u16        {q2, q3}, [r2]!
304
+    vld1.u16        {q8, q9}, [r2]!
305
+    vld1.u16        {q10, q11}, [r2], r3
306
+
307
+    vst1.u16        {q0, q1}, [r0]!
308
+    vst1.u16        {q2, q3}, [r0]!
309
+    vst1.u16        {q8, q9}, [r0]!
310
+    vst1.u16        {q10, q11}, [r0], r1
311
+.endr
312
+    bne             loop_css64
313
+    bx              lr
314
+endfunc
315
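// For reference, each blockcopy_ss kernel above implements the scalar loop
// below (a hedged C sketch, not part of the patch); strides are counted in
// int16_t elements, which is why every kernel starts with lsl #1 to turn
// them into byte strides:
//
//     for (int y = 0; y < bh; y++)
//         memcpy(a + y * stridea, b + y * strideb, bw * sizeof(int16_t));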
+
316
+/******** Chroma blockcopy ********/
317
+function x265_blockcopy_ss_4x8_neon
318
+    lsl             r1, #1
319
+    lsl             r3, #1
320
+.rept 4
321
+    vld1.u16        {d0}, [r2], r3
322
+    vld1.u16        {d1}, [r2], r3
323
+    vst1.u16        {d0}, [r0], r1
324
+    vst1.u16        {d1}, [r0], r1
325
+.endr
326
+    bx              lr
327
+endfunc
328
+
329
+function x265_blockcopy_ss_8x16_neon
330
+    lsl             r1, #1
331
+    lsl             r3, #1
332
+.rept 8
333
+    vld1.u16        {q0}, [r2], r3
334
+    vld1.u16        {q1}, [r2], r3
335
+    vst1.u16        {q0}, [r0], r1
336
+    vst1.u16        {q1}, [r0], r1
337
+.endr
338
+    bx              lr
339
+endfunc
340
+
341
+function x265_blockcopy_ss_16x32_neon
342
+    lsl             r1, #1
343
+    lsl             r3, #1
344
+.rept 16
345
+    vld1.u16        {q0, q1}, [r2], r3
346
+    vld1.u16        {q2, q3}, [r2], r3
347
+    vst1.u16        {q0, q1}, [r0], r1
348
+    vst1.u16        {q2, q3}, [r0], r1
349
+.endr
350
+    bx              lr
351
+endfunc
352
+
353
+function x265_blockcopy_ss_32x64_neon
354
+    lsl             r1, #1
355
+    lsl             r3, #1
356
+    mov             r12, #8
357
+    sub             r1, #32
358
+    sub             r3, #32
359
+loop_css_32x64:
360
+    subs            r12, #1
361
+.rept 8
362
+    vld1.u16        {q0, q1}, [r2]!
363
+    vld1.u16        {q2, q3}, [r2], r3
364
+    vst1.u16        {q0, q1}, [r0]!
365
+    vst1.u16        {q2, q3}, [r0], r1
366
+.endr
367
+    bne             loop_css_32x64
368
+    bx              lr
369
+endfunc
370
+
371
+// chroma blockcopy_ps
372
+function x265_blockcopy_ps_4x8_neon
373
+    lsl             r1, #1
374
+.rept 4
375
+    vld1.u8         {d0}, [r2], r3
376
+    vld1.u8         {d1}, [r2], r3
377
+    vmovl.u8        q1, d0
378
+    vmovl.u8        q2, d1
379
+    vst1.u16        {d2}, [r0], r1
380
+    vst1.u16        {d4}, [r0], r1
381
+.endr
382
+    bx              lr
383
+endfunc
384
+
385
+function x265_blockcopy_ps_8x16_neon
386
+    lsl             r1, #1
387
+.rept 8
388
+    vld1.u8         {d0}, [r2], r3
389
+    vld1.u8         {d1}, [r2], r3
390
+    vmovl.u8        q1, d0
391
+    vmovl.u8        q2, d1
392
+    vst1.u16        {q1}, [r0], r1
393
+    vst1.u16        {q2}, [r0], r1
394
+.endr
395
+    bx              lr
396
+endfunc
397
+
398
+function x265_blockcopy_ps_16x32_neon
399
+    lsl             r1, #1
400
+    mov             r12, #4
401
+loop_cps_16x32:
402
+    subs            r12, #1
403
+.rept 4
404
+    vld1.u8         {q0}, [r2], r3
405
+    vld1.u8         {q1}, [r2], r3
406
+    vmovl.u8        q8, d0
407
+    vmovl.u8        q9, d1
408
+    vmovl.u8        q10, d2
409
+    vmovl.u8        q11, d3
410
+    vst1.u16        {q8, q9}, [r0], r1
411
+    vst1.u16        {q10, q11}, [r0], r1
412
+.endr
413
+    bne             loop_cps_16x32
414
+    bx              lr
415
+endfunc
416
+
417
+function x265_blockcopy_ps_32x64_neon
418
+    lsl             r1, #1
419
+    sub             r1, #32
420
+    mov             r12, #8
421
+loop_cps_32x64:
422
+    subs            r12, #1
423
+.rept 4
424
+    vld1.u8         {q0, q1}, [r2], r3
425
+    vld1.u8         {q2, q3}, [r2], r3
426
+    vmovl.u8        q8, d0
427
+    vmovl.u8        q9, d1
428
+    vmovl.u8        q10, d2
429
+    vmovl.u8        q11, d3
430
+
431
+    vmovl.u8        q12, d4
432
+    vmovl.u8        q13, d5
433
+    vmovl.u8        q14, d6
434
+    vmovl.u8        q15, d7
435
+
436
+    vst1.u16        {q8, q9}, [r0]!
437
+    vst1.u16        {q10, q11}, [r0], r1
438
+    vst1.u16        {q12, q13}, [r0]!
439
+    vst1.u16        {q14, q15}, [r0], r1
440
+.endr
441
+    bne             loop_cps_32x64
442
+    bx              lr
443
+endfunc
444
+
445
+// chroma blockcopy_sp
446
+function x265_blockcopy_sp_4x8_neon
447
+    lsl             r3, #1
448
+.rept 4
449
+    vld1.u16        {q0}, [r2], r3
450
+    vld1.u16        {q1}, [r2], r3
451
+    vmovn.u16       d0, q0
452
+    vmovn.u16       d1, q1
453
+    vst1.u32        {d0[0]}, [r0], r1
454
+    vst1.u32        {d1[0]}, [r0], r1
455
+.endr
456
+    bx              lr
457
+endfunc
458
+
459
+function x265_blockcopy_sp_8x16_neon
460
+    lsl             r3, #1
461
+.rept 8
462
+    vld1.u16        {q0}, [r2], r3
463
+    vld1.u16        {q1}, [r2], r3
464
+    vmovn.u16       d0, q0
465
+    vmovn.u16       d1, q1
466
+    vst1.u8         {d0}, [r0], r1
467
+    vst1.u8         {d1}, [r0], r1
468
+.endr
469
+    bx              lr
470
+endfunc
471
+
472
+function x265_blockcopy_sp_16x32_neon
473
+    lsl             r3, #1
474
+    mov             r12, #4
475
+loop_csp_16x32:
476
+    subs            r12, #1
477
+.rept 4
478
+    vld1.u16        {q0, q1}, [r2], r3
479
+    vld1.u16        {q2, q3}, [r2], r3
480
+    vmovn.u16       d0, q0
481
+    vmovn.u16       d1, q1
482
+    vmovn.u16       d2, q2
483
+    vmovn.u16       d3, q3
484
+    vst1.u8         {q0}, [r0], r1
485
+    vst1.u8         {q1}, [r0], r1
486
+.endr
487
+    bne             loop_csp_16x32
488
+    bx              lr
489
+endfunc
490
+
491
+function x265_blockcopy_sp_32x64_neon
492
+    mov             r12, #8
493
+    lsl             r3, #1
494
+    sub             r3, #32
495
+loop_csp_32x64:
496
+    subs            r12, #1
497
+.rept 4
498
+    vld1.u16        {q0, q1}, [r2]!
499
+    vld1.u16        {q2, q3}, [r2], r3
500
+    vld1.u16        {q8, q9}, [r2]!
501
+    vld1.u16        {q10, q11}, [r2], r3
502
+
503
+    vmovn.u16       d0, q0
504
+    vmovn.u16       d1, q1
505
+    vmovn.u16       d2, q2
506
+    vmovn.u16       d3, q3
507
+
508
+    vmovn.u16       d4, q8
509
+    vmovn.u16       d5, q9
510
+    vmovn.u16       d6, q10
511
+    vmovn.u16       d7, q11
512
+
513
+    vst1.u8         {q0, q1}, [r0], r1
514
+    vst1.u8         {q2, q3}, [r0], r1
515
+.endr
516
+    bne             loop_csp_32x64
517
+    bx              lr
518
+endfunc
519
+
520
+// void x265_blockfill_s_neon(int16_t* dst, intptr_t dstride, int16_t val)
521
+function x265_blockfill_s_4x4_neon
522
+    vdup.u16        d0, r2
523
+    lsl             r1, #1
524
+.rept 4
525
+    vst1.16         {d0}, [r0], r1
526
+.endr
527
+    bx              lr
528
+endfunc
529
+
530
+function x265_blockfill_s_8x8_neon
531
+    vdup.u16        q0, r2
532
+    lsl             r1, #1
533
+.rept 8
534
+    vst1.16         {q0}, [r0], r1
535
+.endr
536
+    bx              lr
537
+endfunc
538
+
539
+function x265_blockfill_s_16x16_neon
540
+    vdup.u16        q0, r2
541
+    vmov            q1, q0
542
+    lsl             r1, #1
543
+.rept 16
544
+    vst1.16         {q0, q1}, [r0], r1
545
+.endr
546
+    bx              lr
547
+endfunc
548
+
549
+function x265_blockfill_s_32x32_neon
550
+    vdup.u16        q0, r2
551
+    vmov            q1, q0
552
+    lsl             r1, #1
553
+    sub             r1, #32
554
+.rept 32
555
+    vst1.16         {q0, q1}, [r0]!
556
+    vst1.16         {q0, q1}, [r0], r1
557
+.endr
558
+    bx              lr
559
+endfunc
560
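// Reference behaviour of the blockfill_s kernels (hedged C sketch, not part
// of the patch): vdup broadcasts val across a q-register, so each row costs
// only one or two vector stores.
//
//     for (int y = 0; y < size; y++)
//         for (int x = 0; x < size; x++)
//             dst[y * dstride + x] = val;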
+
561
+// uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
562
+function x265_copy_cnt_4_neon
563
+    lsl             r2, #1
564
+    mov             r12, #8
565
+    veor            d4, d4
566
+.rept 2
567
+    vld1.s16        {d0}, [r1], r2
568
+    vld1.s16        {d1}, [r1], r2
569
+    vclz.i16        d2, d0
570
+    vclz.i16        d3, d1
571
+    vshr.u16        q1, #4
572
+    vadd.u16        d2, d3
573
+    vadd.u16        d4, d2
574
+    vst1.s16        {d0}, [r0], r12
575
+    vst1.s16        {d1}, [r0], r12
576
+.endr
577
+    vpadd.u16       d4, d4
578
+    vpadd.u16       d4, d4
579
+    vmov.u16        r12, d4[0]
580
+    rsb             r0, r12, #16
581
+    bx              lr
582
+endfunc
583
+
584
+function x265_copy_cnt_8_neon
585
+    lsl             r2, #1
586
+    mov             r12, #16
587
+    veor            q8, q8
588
+.rept 4
589
+    vld1.s16        {q0}, [r1], r2
590
+    vld1.s16        {q1}, [r1], r2
591
+    vclz.i16        q2, q0
592
+    vclz.i16        q3, q1
593
+    vshr.u16        q2, #4
594
+    vshr.u16        q3, #4
595
+    vadd.u16        q2, q3
596
+    vadd.u16        q8, q2
597
+    vst1.s16        {q0}, [r0], r12
598
+    vst1.s16        {q1}, [r0], r12
599
+.endr
600
+    vadd.u16        d16, d17
601
+    vpadd.u16       d16, d16
602
+    vpadd.u16       d16, d16
603
+    vmov.u16        r12, d16[0]
604
+    rsb             r0, r12, #64
605
+    bx              lr
606
+endfunc
607
+
608
+function x265_copy_cnt_16_neon
609
+    lsl             r2, #1
610
+    mov             r12, #32
611
+    veor            q2, q2
612
+.rept 16
613
+    vld1.s16        {q0, q1}, [r1], r2
614
+    vst1.s16        {q0, q1}, [r0], r12
615
+    vclz.i16        q8, q0
616
+    vclz.i16        q9, q1
617
+    vshr.u16        q8, #4
618
+    vshr.u16        q9, #4
619
+    vadd.u16        q8, q9
620
+    vadd.u16        q2, q8
621
+.endr
622
+    vadd.u16        d4, d5
623
+    vpadd.u16       d4, d4
624
+    vpadd.u16       d4, d4
625
+
626
+    vmov.u16        r12, d4[0]
627
+    rsb             r0, r12, #256
628
+    bx              lr
629
+endfunc
630
+
631
+function x265_copy_cnt_32_neon
632
+    lsl             r2, #1
633
+    sub             r2, #32
634
+    mov             r12, #32
635
+    veor            q12, q12
636
+.rept 32
637
+    vld1.s16        {q0, q1}, [r1]!
638
+    vld1.s16        {q2, q3}, [r1], r2
639
+    vst1.s16        {q0, q1}, [r0]!
640
+    vst1.s16        {q2, q3}, [r0], r12
641
+
642
+    vclz.i16        q8, q0
643
+    vclz.i16        q9, q1
644
+    vclz.i16        q10, q2
645
+    vclz.i16        q11, q3
646
+
647
+    vshr.u16        q8, #4
648
+    vshr.u16        q9, #4
649
+    vshr.u16        q10, #4
650
+    vshr.u16        q11, #4
651
+
652
+    vadd.u16        q8, q9
653
+    vadd.u16        q10, q11
654
+    vadd.u16        q8, q10
655
+    vadd.u16        q12, q8
656
+.endr
657
+    vadd.u16        d24, d25
658
+    vpadd.u16       d24, d24
659
+    vpadd.u16       d24, d24
660
+
661
+    vmov.u16        r12, d24[0]
662
+    rsb             r0, r12, #1024
663
+    bx              lr
664
+endfunc
665
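// How the copy_cnt kernels count: vclz.i16 yields 16 exactly when a lane is
// zero, so (clz >> 4) is 1 per zero lane; the accumulated sums are therefore
// zero counts, and the final rsb against the block area (16/64/256/1024)
// turns them into the nonzero count the API returns. Hedged C equivalent of
// the 4x4 kernel's body (not part of the patch):
//
//     uint32_t cnt = 0;
//     for (int y = 0; y < 4; y++)
//         for (int x = 0; x < 4; x++)
//         {
//             coeff[y * 4 + x] = residual[y * resiStride + x];
//             cnt += (residual[y * resiStride + x] != 0);
//         }
//     return cnt;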
+
666
+// int  count_nonzero_c(const int16_t* quantCoeff)
667
+function x265_count_nonzero_4_neon
668
+    vld1.s16        {d0-d3}, [r0]
669
+    vceq.u16        q0, #0
670
+    vceq.u16        q1, #0
671
+    eor             r1, r1
672
+    vtrn.8          q0, q1
673
+
674
+    vshr.u8         q0, #7
675
+
676
+    vadd.u8         d0, d1
677
+    vshr.u64        d1, d0, #32
678
+    vadd.u8         d0, d1
679
+    vmov.u32        r0, d0[0]
680
+    usad8           r0, r0, r1
681
+    rsb             r0, #16
682
+    bx              lr
683
+endfunc
684
+
685
+function x265_count_nonzero_8_neon
686
+    vldm            r0, {q8-q15}
687
+    eor             r1, r1
688
+    vceq.u16        q8, #0
689
+    vceq.u16        q9, #0
690
+    vceq.u16        q10, #0
691
+    vceq.u16        q11, #0
692
+    vceq.u16        q12, #0
693
+    vceq.u16        q13, #0
694
+    vceq.u16        q14, #0
695
+    vceq.u16        q15, #0
696
+
697
+    vtrn.8          q8, q9
698
+    vtrn.8          q10, q11
699
+    vtrn.8          q12, q13
700
+    vtrn.8          q14, q15
701
+
702
+    vadd.s8         q8, q10
703
+    vadd.s8         q12, q14
704
+    vadd.s8         q8, q12
705
+
706
+    vadd.s8         d16, d17
707
+    vshr.u64        d17, d16, #32
708
+    vadd.s8         d16, d17
709
+    vabs.s8         d16, d16
710
+
711
+    vmov.u32        r0, d16[0]
712
+    usad8           r0, r0, r1
713
+    rsb             r0, #64
714
+    bx              lr
715
+endfunc
716
+
717
+function x265_count_nonzero_16_neon
718
+    vldm            r0!, {q8-q15}
719
+    eor             r1, r1
720
+    vceq.u16        q8, #0
721
+    vceq.u16        q9, #0
722
+    vceq.u16        q10, #0
723
+    vceq.u16        q11, #0
724
+    vceq.u16        q12, #0
725
+    vceq.u16        q13, #0
726
+    vceq.u16        q14, #0
727
+    vceq.u16        q15, #0
728
+
729
+    vtrn.8          q8, q9
730
+    vtrn.8          q10, q11
731
+    vtrn.8          q12, q13
732
+    vtrn.8          q14, q15
733
+
734
+    vmov            q0, q8
735
+    vmov            q1, q10
736
+    vmov            q2, q12
737
+    vmov            q3, q14
738
+
739
+.rept 3
740
+    vldm            r0!, {q8-q15}
741
+    vceq.u16        q8, #0
742
+    vceq.u16        q9, #0
743
+    vceq.u16        q10, #0
744
+    vceq.u16        q11, #0
745
+    vceq.u16        q12, #0
746
+    vceq.u16        q13, #0
747
+    vceq.u16        q14, #0
748
+    vceq.u16        q15, #0
749
+
750
+    vtrn.8          q8, q9
751
+    vtrn.8          q10, q11
752
+    vtrn.8          q12, q13
753
+    vtrn.8          q14, q15
754
+
755
+    vadd.s8         q0, q8
756
+    vadd.s8         q1, q10
757
+    vadd.s8         q2, q12
758
+    vadd.s8         q3, q14
759
+.endr
760
+
761
+    vadd.s8         q0, q1
762
+    vadd.s8         q2, q3
763
+    vadd.s8         q0, q2                      // dynamic range is 4+1 bits
764
+
765
+    vadd.s8         d0, d1
766
+    vshr.u64        d1, d0, #32
767
+    vadd.s8         d0, d1
768
+    vabs.s8         d0, d0                      // maximum value of each element are 64
769
+
770
+    vmov.u32        r0, d0[0]
771
+    usad8           r0, r0, r1
772
+    rsb             r0, #256
773
+    bx              lr
774
+endfunc
775
+
776
+function x265_count_nonzero_32_neon
777
+    vldm            r0!, {q8-q15}
778
+    vceq.u16        q8, #0
779
+    vceq.u16        q9, #0
780
+    vceq.u16        q10, #0
781
+    vceq.u16        q11, #0
782
+    vceq.u16        q12, #0
783
+    vceq.u16        q13, #0
784
+    vceq.u16        q14, #0
785
+    vceq.u16        q15, #0
786
+
787
+    vtrn.8          q8, q9
788
+    vtrn.8          q10, q11
789
+    vtrn.8          q12, q13
790
+    vtrn.8          q14, q15
791
+
792
+    mov             r1, #15
793
+
794
+    vmov            q0, q8
795
+    vmov            q1, q10
796
+    vmov            q2, q12
797
+    vmov            q3, q14
798
+
799
+.loop:
800
+    vldm            r0!, {q8-q15}
801
+    subs            r1, #1
802
+
803
+    vceq.u16        q8, #0
804
+    vceq.u16        q9, #0
805
+    vceq.u16        q10, #0
806
+    vceq.u16        q11, #0
807
+    vceq.u16        q12, #0
808
+    vceq.u16        q13, #0
809
+    vceq.u16        q14, #0
810
+    vceq.u16        q15, #0
811
+
812
+    vtrn.8          q8, q9
813
+    vtrn.8          q10, q11
814
+    vtrn.8          q12, q13
815
+    vtrn.8          q14, q15
816
+
817
+    vadd.s8         q0, q8
818
+    vadd.s8         q1, q10
819
+    vadd.s8         q2, q12
820
+    vadd.s8         q3, q14
821
+    bgt            .loop
822
+
823
+    // sum
824
+    vadd.s8         q0, q1
825
+    vadd.s8         q2, q3
826
+    vadd.s8         q0, q2                      // dynamic range is 6+1 bits
827
+
828
+    vaddl.s8        q0, d0, d1
829
+    vadd.s16        d0, d1
830
+    vshr.u64        d1, d0, #32
831
+    vadd.s16        d0, d1
832
+    vabs.s16        d0, d0                      // maximum value of each element is 512
833
+
834
+    vmov.u32        r0, d0[0]
835
+    uasx            r0, r0, r0
836
+    mov             r0, r0, lsr 16
837
+    rsb             r0, #1024
838
+    bx              lr
839
+endfunc
840
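Two conversions recur throughout the file above: blockcopy_sp narrows int16_t coefficients back to pixels with vmovn, and blockcopy_ps widens pixels to int16_t with vmovl. A hedged C reference for the sp direction (assuming an 8-bit build where pixel is uint8_t and strides are counted in elements; not part of the patch):

    #include <stdint.h>

    typedef uint8_t pixel;   // 8-bit build; x265 defines this centrally

    static void blockcopy_sp_ref(pixel* a, intptr_t stridea,
                                 const int16_t* b, intptr_t strideb,
                                 int bw, int bh)
    {
        for (int y = 0; y < bh; y++)
            for (int x = 0; x < bw; x++)
                a[y * stridea + x] = (pixel)b[y * strideb + x];  // vmovn: truncating narrow
    }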
x265_2.0.tar.gz/source/common/arm/blockcopy8.h Added
125
 
1
@@ -0,0 +1,123 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Steve Borho <steve@borho.org>
6
+ *          Min Chen <chenm003@163.com>
7
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
8
+ *
9
+ * This program is free software; you can redistribute it and/or modify
10
+ * it under the terms of the GNU General Public License as published by
11
+ * the Free Software Foundation; either version 2 of the License, or
12
+ * (at your option) any later version.
13
+ *
14
+ * This program is distributed in the hope that it will be useful,
15
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
+ * GNU General Public License for more details.
18
+ *
19
+ * You should have received a copy of the GNU General Public License
20
+ * along with this program; if not, write to the Free Software
21
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22
+ *
23
+ * This program is also available under a commercial proprietary license.
24
+ * For more information, contact us at license @ x265.com.
25
+ *****************************************************************************/
26
+
27
+#ifndef X265_BLOCKCOPY8_ARM_H
28
+#define X265_BLOCKCOPY8_ARM_H
29
+
30
+void x265_blockcopy_pp_16x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
31
+void x265_blockcopy_pp_8x4_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
32
+void x265_blockcopy_pp_8x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
33
+void x265_blockcopy_pp_8x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
34
+void x265_blockcopy_pp_8x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
35
+void x265_blockcopy_pp_12x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
36
+void x265_blockcopy_pp_4x4_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
37
+void x265_blockcopy_pp_4x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
38
+void x265_blockcopy_pp_4x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
39
+void x265_blockcopy_pp_16x4_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
40
+void x265_blockcopy_pp_16x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
41
+void x265_blockcopy_pp_16x12_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
42
+void x265_blockcopy_pp_16x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
43
+void x265_blockcopy_pp_16x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
44
+void x265_blockcopy_pp_24x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
45
+void x265_blockcopy_pp_32x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
46
+void x265_blockcopy_pp_32x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
47
+void x265_blockcopy_pp_32x24_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
48
+void x265_blockcopy_pp_32x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
49
+void x265_blockcopy_pp_32x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
50
+void x265_blockcopy_pp_48x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
51
+void x265_blockcopy_pp_64x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
52
+void x265_blockcopy_pp_64x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
53
+void x265_blockcopy_pp_64x48_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
54
+void x265_blockcopy_pp_64x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
55
+void x265_blockcopy_pp_2x4_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
56
+void x265_blockcopy_pp_2x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
57
+void x265_blockcopy_pp_2x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
58
+void x265_blockcopy_pp_6x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
59
+void x265_blockcopy_pp_6x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
60
+void x265_blockcopy_pp_8x2_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
61
+void x265_blockcopy_pp_8x6_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
62
+void x265_blockcopy_pp_8x12_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
63
+void x265_blockcopy_pp_8x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
64
+void x265_blockcopy_pp_12x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
65
+void x265_blockcopy_pp_4x2_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
66
+void x265_blockcopy_pp_4x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
67
+void x265_blockcopy_pp_16x24_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
68
+void x265_blockcopy_pp_24x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
69
+void x265_blockcopy_pp_32x48_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
70
+
71
+void x265_cpy2Dto1D_shr_4x4_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
72
+void x265_cpy2Dto1D_shr_8x8_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
73
+void x265_cpy2Dto1D_shr_16x16_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
74
+void x265_cpy2Dto1D_shr_32x32_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
75
+
76
+void x265_blockcopy_sp_4x4_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
77
+void x265_blockcopy_sp_8x8_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
78
+void x265_blockcopy_sp_16x16_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
79
+void x265_blockcopy_sp_32x32_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
80
+void x265_blockcopy_sp_64x64_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
81
+
82
+void x265_blockcopy_ps_4x4_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
83
+void x265_blockcopy_ps_8x8_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
84
+void x265_blockcopy_ps_16x16_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
85
+void x265_blockcopy_ps_32x32_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
86
+void x265_blockcopy_ps_64x64_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
87
+
88
+void x265_blockcopy_ss_4x4_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
89
+void x265_blockcopy_ss_8x8_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
90
+void x265_blockcopy_ss_16x16_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
91
+void x265_blockcopy_ss_32x32_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
92
+void x265_blockcopy_ss_64x64_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
93
+
94
+// chroma blockcopy
95
+void x265_blockcopy_ss_4x8_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
96
+void x265_blockcopy_ss_8x16_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
97
+void x265_blockcopy_ss_16x32_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
98
+void x265_blockcopy_ss_32x64_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
99
+
100
+void x265_blockcopy_sp_4x8_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
101
+void x265_blockcopy_sp_8x16_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
102
+void x265_blockcopy_sp_16x32_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
103
+void x265_blockcopy_sp_32x64_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
104
+
105
+void x265_blockcopy_ps_4x8_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
106
+void x265_blockcopy_ps_8x16_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
107
+void x265_blockcopy_ps_16x32_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
108
+void x265_blockcopy_ps_32x64_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
109
+
110
+void x265_blockfill_s_4x4_neon(int16_t* dst, intptr_t dstride, int16_t val);
111
+void x265_blockfill_s_8x8_neon(int16_t* dst, intptr_t dstride, int16_t val);
112
+void x265_blockfill_s_16x16_neon(int16_t* dst, intptr_t dstride, int16_t val);
113
+void x265_blockfill_s_32x32_neon(int16_t* dst, intptr_t dstride, int16_t val);
114
+
115
+uint32_t x265_copy_cnt_4_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
116
+uint32_t x265_copy_cnt_8_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
117
+uint32_t x265_copy_cnt_16_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
118
+uint32_t x265_copy_cnt_32_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
119
+
120
+int x265_count_nonzero_4_neon(const int16_t* quantCoeff);
121
+int x265_count_nonzero_8_neon(const int16_t* quantCoeff);
122
+int x265_count_nonzero_16_neon(const int16_t* quantCoeff);
123
+int x265_count_nonzero_32_neon(const int16_t* quantCoeff);
124
+#endif // ifndef X265_BLOCKCOPY8_ARM_H
125
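These declarations mirror the C primitives one-to-one, so installing them is the plain pointer assignment seen in arm.patch above. A hedged sketch of how a caller might use the count_nonzero kernels (the helper name and call site are illustrative, not part of the patch):

    #include <stdint.h>

    int x265_count_nonzero_4_neon(const int16_t* quantCoeff);  // declared above

    // hypothetical helper: a 4x4 TU needs further coding only if any
    // coefficient survived quantization
    static int tuHasCoeffs4x4(const int16_t quantCoeff[16])
    {
        return x265_count_nonzero_4_neon(quantCoeff) != 0;
    }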
x265_2.0.tar.gz/source/common/arm/cpu-a.S Added
111
 
1
@@ -0,0 +1,109 @@
2
+/*****************************************************************************
3
+ * cpu-a.S: arm cpu detection
4
+ *****************************************************************************
5
+ * Copyright (C) 2016 x265 project
6
+ *
7
+ * Authors: David Conrad <lessen42@gmail.com>
8
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
9
+ *
10
+ * This program is free software; you can redistribute it and/or modify
11
+ * it under the terms of the GNU General Public License as published by
12
+ * the Free Software Foundation; either version 2 of the License, or
13
+ * (at your option) any later version.
14
+ *
15
+ * This program is distributed in the hope that it will be useful,
16
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
+ * GNU General Public License for more details.
19
+ *
20
+ * You should have received a copy of the GNU General Public License
21
+ * along with this program; if not, write to the Free Software
22
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
23
+ *
24
+ * This program is also available under a commercial proprietary license.
25
+ * For more information, contact us at license @ x265.com.
26
+ *****************************************************************************/
27
+
28
+#include "asm.S"
29
+
30
+.align 2
31
+
32
+// done in gas because .fpu neon overrides the refusal to assemble
33
+// instructions the selected -march/-mcpu doesn't support
34
+function x265_cpu_neon_test
35
+    vadd.i16    q0, q0, q0
36
+    bx          lr
37
+endfunc
38
+
39
+// return: 0 on success
40
+//         1 if counters were already enabled
41
+//         9 if lo-res counters were already enabled
42
+function x265_cpu_enable_armv7_counter, export=0
43
+    mrc         p15, 0, r2, c9, c12, 0      // read PMNC
44
+    ands        r0, r2, #1
45
+    andne       r0, r2, #9
46
+
47
+    orr         r2, r2, #1                  // enable counters
48
+    bic         r2, r2, #8                  // full resolution
49
+    mcreq       p15, 0, r2, c9, c12, 0      // write PMNC
50
+    mov         r2, #1 << 31                // enable cycle counter
51
+    mcr         p15, 0, r2, c9, c12, 1      // write CNTENS
52
+    bx          lr
53
+endfunc
54
+
55
+function x265_cpu_disable_armv7_counter, export=0
56
+    mrc         p15, 0, r0, c9, c12, 0      // read PMNC
57
+    bic         r0, r0, #1                  // disable counters
58
+    mcr         p15, 0, r0, c9, c12, 0      // write PMNC
59
+    bx          lr
60
+endfunc
61
+
62
+
63
+.macro READ_TIME r
64
+    mrc         p15, 0, \r, c9, c13, 0
65
+.endm
66
+
67
+// return: 0 if neon -> arm transfers take more than 10 cycles
68
+//         nonzero otherwise
69
+function x265_cpu_fast_neon_mrc_test
70
+    // check for user access to performance counters
71
+    mrc         p15, 0, r0, c9, c14, 0
72
+    cmp         r0, #0
73
+    bxeq        lr
74
+
75
+    push        {r4-r6,lr}
76
+    bl          x265_cpu_enable_armv7_counter
77
+    ands        r1, r0, #8
78
+    mov         r3, #0
79
+    mov         ip, #4
80
+    mov         r6, #4
81
+    moveq       r5, #1
82
+    movne       r5, #64
83
+
84
+average_loop:
85
+    mov         r4, r5
86
+    READ_TIME   r1
87
+1:  subs        r4, r4, #1
88
+.rept 8
89
+    vmov.u32    lr, d0[0]
90
+    add         lr, lr, lr
91
+.endr
92
+    bgt         1b
93
+    READ_TIME   r2
94
+
95
+    subs        r6, r6, #1
96
+    sub         r2, r2, r1
97
+    cmpgt       r2, #30 << 3    // assume context switch if it took over 30 cycles
98
+    addle       r3, r3, r2
99
+    subsle      ip, ip, #1
100
+    bgt         average_loop
101
+
102
+    // disable counters if we enabled them
103
+    ands        r0, r0, #1
104
+    bleq        x265_cpu_disable_armv7_counter
105
+
106
+    lsr         r0, r3, #5
107
+    cmp         r0, #10
108
+    movgt       r0, #0
109
+    pop         {r4-r6,pc}
110
+endfunc
111
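x265_cpu_neon_test exists so the runtime CPU probe has a single NEON instruction to trap on: the caller installs a SIGILL handler, runs the probe, and treats a signal as "no NEON". A hedged C sketch of that pattern (the real logic lives in x265's cpu detection code; this is illustrative, not part of the patch):

    #include <setjmp.h>
    #include <signal.h>
    #include <string.h>

    extern void x265_cpu_neon_test(void);

    static sigjmp_buf jmpbuf;
    static void sigill_handler(int sig) { (void)sig; siglongjmp(jmpbuf, 1); }

    static int have_neon(void)
    {
        struct sigaction sa, old;
        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = sigill_handler;
        sigaction(SIGILL, &sa, &old);
        int ok = 0;
        if (!sigsetjmp(jmpbuf, 1))
        {
            x265_cpu_neon_test();   // raises SIGILL when NEON is absent
            ok = 1;
        }
        sigaction(SIGILL, &old, NULL);
        return ok;
    }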
x265_2.0.tar.gz/source/common/arm/dct-a.S Added
902
 
1
@@ -0,0 +1,900 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Min Chen <chenm003@163.com>
6
+ * 
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+
27
+.section .rodata
28
+
29
+.align 4
30
+
31
+.text
32
+
33
+.align 4
34
+
35
+//        dst[0 * line] = ((64 * E[0] + 64 * E[1] + add) >> shift);
36
+//        dst[2 * line] = ((64 * E[0] - 64 * E[1] + add) >> shift);
37
+//        dst[1 * line] = ((83 * O[0] + 36 * O[1] + add) >> shift);
38
+//        dst[3 * line] = ((36 * O[0] - 83 * O[1] + add) >> shift);
39
+
40
+/* void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride) */
41
+function x265_dct_4x4_neon
42
+    mov             r2, r2, lsl #1
43
+    vld1.16         {d0}, [r0, :64], r2                     // d0  = [03 02 01 00]
44
+    vld1.16         {d1}, [r0, :64], r2                     // d1  = [13 12 11 10]
45
+    vld1.16         {d2}, [r0, :64], r2                     // d2  = [23 22 21 20]
46
+    vld1.16         {d3}, [r0, :64]                         // d3  = [33 32 31 30]
47
+
48
+    vtrn.32         q0, q1                                  // q0  = [31 30 11 10 21 20 01 00], q1 = [33 32 13 12 23 22 03 02]
49
+    vrev32.16       q1, q1                                  // q1  = [32 33 12 13 22 23 02 03]
50
+
51
+    movconst        r0, 0x00240053
52
+    movconst        r2, 0xFFAD0024
53
+
54
+    // DCT-1D
55
+    vadd.s16        q2, q0, q1                              // q2  = [E31 E30 E11 E10 E21 E20 E01 E00]
56
+    vsub.s16        q3, q0, q1                              // q3  = [O31 O30 O11 O10 O21 O20 O01 O00]
57
+    vdup.32         d16, r0                                 // d16 = [ 36  83]
58
+    vdup.32         d17, r2                                 // d17 = [-83  36]
59
+    vtrn.16         d4, d5                                  // d4  = [E30 E20 E10 E00], d5 = [E31 E21 E11 E01]
60
+    vtrn.32         d6, d7                                  // q3  = [O31 O30 O21 O20 O11 O10 O01 O00]
61
+
62
+    vmull.s16       q9, d6, d16
63
+    vmull.s16       q10, d7, d16                            // [q9, q10] = [ 36*O1 83*O0] -> [1]
64
+    vmull.s16       q11, d6, d17
65
+    vmull.s16       q12, d7, d17                            // [q11,q12] = [-83*O1 36*O0] -> [3]
66
+
67
+    vadd.s16        d0, d4, d5                              // d0 = [E0 + E1]
68
+    vsub.s16        d1, d4, d5                              // d1 = [E0 - E1]
69
+
70
+    vpadd.s32       d18, d18, d19                           // q9  = [1]
71
+    vpadd.s32       d19, d20, d21
72
+    vpadd.s32       d20, d22, d23                           // q10 = [3]
73
+    vpadd.s32       d21, d24, d25
74
+
75
+    vshll.s16       q1, d0, #6                              // q1  = 64 * [0]
76
+    vshll.s16       q2, d1, #6                              // q2  = 64 * [2]
77
+
78
+    // TODO: Dynamic Range is 11+6-1 bits
79
+    vqrshrn.s32     d25, q9, 1                              // d25 = R[13 12 11 10]
80
+    vqrshrn.s32     d24, q1, 1                              // d24 = R[03 02 01 00]
81
+    vqrshrn.s32     d26, q2, 1                              // d26 = R[23 22 21 20]
82
+    vqrshrn.s32     d27, q10, 1                             // d27 = R[33 32 31 30]
83
+
84
+
85
+    // DCT-2D
86
+    vmovl.s16       q0, d16                                // q0  = [ 36  83]
87
+
88
+    vtrn.32         q12, q13                                // q12 = [31 30 11 10 21 20 01 00], q13 = [33 32 13 12 23 22 03 02]
89
+    vrev32.16       q13, q13                                // q13 = [32 33 12 13 22 23 02 03]
90
+
91
+    vaddl.s16       q1, d24, d26                            // q1  = [E21 E20 E01 E00]
92
+    vaddl.s16       q2, d25, d27                            // q2  = [E31 E30 E11 E10]
93
+    vsubl.s16       q3, d24, d26                            // q3  = [O21 O20 O01 O00]
94
+    vsubl.s16       q8, d25, d27                            // q8  = [O31 O30 O11 O10]
95
+
96
+    vtrn.32         q1, q2                                  // q1  = [E30 E20 E10 E00], q2  = [E31 E21 E11 E01]
97
+    vtrn.32         q3, q8                                  // q3  = [O30 O20 O10 O00], q8  = [O31 O21 O11 O01]
98
+
99
+    vmul.s32        q9, q3, d0[0]                           // q9  = [83*O30 83*O20 83*O10 83*O00]
100
+    vmul.s32        q10, q8, d0[1]                          // q10 = [36*O31 36*O21 36*O11 36*O01]
101
+    vmul.s32        q11, q3, d0[1]                          // q11 = [36*O30 36*O20 36*O10 36*O00]
102
+    vmul.s32        q12, q8, d0[0]                          // q12 = [83*O31 83*O21 83*O11 83*O01]
103
+
104
+    vadd.s32        q0, q1, q2                              // d0 = [E0 + E1]
105
+    vsub.s32        q1, q1, q2                              // d1 = [E0 - E1]
106
+
107
+    vadd.s32        q9, q9, q10
108
+    vsub.s32        q10, q11, q12
109
+
110
+    vshl.s32        q0, q0, #6                              // q0  = 64 * [0]
111
+    vshl.s32        q1, q1, #6                              // q1  = 64 * [2]
112
+
113
+    vqrshrn.s32     d25, q9, 8                              // d25 = R[13 12 11 10]
114
+    vqrshrn.s32     d27, q10, 8                             // d27 = R[33 32 31 30]
115
+
116
+    vqrshrn.s32     d24, q0, 8                              // d24 = R[03 02 01 00]
117
+    vqrshrn.s32     d26, q1, 8                              // q26 = R[23 22 21 20]
118
+
119
+    vst1.16         {d24-d27}, [r1]
120
+
121
+    bx              lr
122
+endfunc
123
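
The butterfly comments at the top of x265_dct_4x4_neon translate directly to scalar C. A sketch of one 4-point pass for reference; the "+ add ... >> shift" rounding is realized in the NEON code by vqrshrn, with shift = 1 in the first pass and shift = 8 in the second:

    // Scalar sketch of one 4-point DCT pass, per the butterfly comments above.
    static void dct4_pass_ref(const int16_t src[4], int16_t* dst,
                              int line, int shift)
    {
        int add = 1 << (shift - 1);
        int E[2], O[2];
        E[0] = src[0] + src[3];  O[0] = src[0] - src[3];
        E[1] = src[1] + src[2];  O[1] = src[1] - src[2];
        dst[0 * line] = (int16_t)((64 * E[0] + 64 * E[1] + add) >> shift);
        dst[2 * line] = (int16_t)((64 * E[0] - 64 * E[1] + add) >> shift);
        dst[1 * line] = (int16_t)((83 * O[0] + 36 * O[1] + add) >> shift);
        dst[3 * line] = (int16_t)((36 * O[0] - 83 * O[1] + add) >> shift);
    }
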
+
124
+/* uses registers q4 - q7 for temp values */
125
+.macro tr4 r0, r1, r2, r3
126
+    vsub.s32    q8, \r0, \r3    // EO0
127
+    vadd.s32    q9, \r0, \r3    // EE0
128
+    vadd.s32    q10, \r1, \r2   // EE1
129
+    vsub.s32    q11, \r1, \r2   // EO1
130
+
131
+    vmul.s32    \r1, q8, d0[0]  // 83 * EO0
132
+    vmul.s32    \r3, q8, d0[1]  // 36 * EO0
133
+    vshl.s32    q9, q9, #6      // 64 * EE0
134
+    vshl.s32    q10, q10, #6    // 64 * EE1
135
+    vmla.s32    \r1, q11, d0[1] // 83 * EO0 + 36 * EO1
136
+    vmls.s32    \r3, q11, d0[0] // 36 * EO0 - 83 * EO1
137
+    vadd.s32    \r0, q9, q10    // 64 * (EE0 + EE1)
138
+    vsub.s32    \r2, q9, q10    // 64 * (EE0 - EE1)
139
+.endm
140
+
141
+
142
+.macro tr8 r0, r1, r2, r3
143
+    vmul.s32  q12, \r0, d1[1]   //  89 * src1
144
+    vmul.s32  q13, \r0, d1[0]   //  75 * src1
145
+    vmul.s32  q14, \r0, d2[1]   //  50 * src1
146
+    vmul.s32  q15, \r0, d2[0]   //  18 * src1
147
+
148
+    vmla.s32  q12, \r1, d1[0]   //  75 * src3
149
+    vmls.s32  q13, \r1, d2[0]   // -18 * src3
150
+    vmls.s32  q14, \r1, d1[1]   // -89 * src3
151
+    vmls.s32  q15, \r1, d2[1]   // -50 * src3
152
+
153
+    vmla.s32  q12, \r2, d2[1]   //  50 * src5
154
+    vmls.s32  q13, \r2, d1[1]   // -89 * src5
155
+    vmla.s32  q14, \r2, d2[0]   //  18 * src5
156
+    vmla.s32  q15, \r2, d1[0]   //  75 * src5
157
+
158
+    vmla.s32  q12, \r3, d2[0]   //  18 * src7
159
+    vmls.s32  q13, \r3, d2[1]   // -50 * src7
160
+    vmla.s32  q14, \r3, d1[0]   //  75 * src7
161
+    vmls.s32  q15, \r3, d1[1]   // -89 * src7
162
+.endm
163
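
The tr4 and tr8 macros compute the even and odd halves of the 8-point transform. The coefficient pattern is easiest to verify against a scalar sketch built from the macro comments above:

    // Scalar sketch of tr8: odd output rows 1/3/5/7 of the 8-point DCT
    // as functions of the odd inputs src1/src3/src5/src7.
    static void tr8_ref(int32_t s1, int32_t s3, int32_t s5, int32_t s7,
                        int32_t out[4])
    {
        out[0] = 89 * s1 + 75 * s3 + 50 * s5 + 18 * s7;   // row 1 (q12)
        out[1] = 75 * s1 - 18 * s3 - 89 * s5 - 50 * s7;   // row 3 (q13)
        out[2] = 50 * s1 - 89 * s3 + 18 * s5 + 75 * s7;   // row 5 (q14)
        out[3] = 18 * s1 - 50 * s3 + 75 * s5 - 89 * s7;   // row 7 (q15)
    }
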
+
164
+
165
+// TODO: the DCT-2D stage spends 4x8 = 32 load/store operations because there is no temporary buffer
166
+/* void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride) */
167
+function x265_dct_8x8_neon
168
+    vpush {q4-q7}
169
+
170
+    mov r2, r2, lsl #1
171
+
172
+    adr r3, ctr4
173
+    vld1.16 {d0-d2}, [r3]
174
+    mov r3, r1
175
+
176
+    // DCT-1D
177
+    // top half
178
+    vld1.16 {q12}, [r0], r2
179
+    vld1.16 {q13}, [r0], r2
180
+    vld1.16 {q14}, [r0], r2
181
+    vld1.16 {q15}, [r0], r2
182
+
183
+    TRANSPOSE4x4x2_16 d24, d26, d28, d30,  d25, d27, d29, d31
184
+
185
+    // |--|
186
+    // |24|
187
+    // |26|
188
+    // |28|
189
+    // |30|
190
+    // |25|
191
+    // |27|
192
+    // |29|
193
+    // |31|
194
+    // |--|
195
+
196
+    vaddl.s16 q4, d28, d27
197
+    vaddl.s16 q5, d30, d25
198
+    vaddl.s16 q2, d24, d31
199
+    vaddl.s16 q3, d26, d29
200
+
201
+    tr4 q2, q3, q4, q5
202
+
203
+    vqrshrn.s32 d20, q3, 2
204
+    vqrshrn.s32 d16, q2, 2
205
+    vqrshrn.s32 d17, q4, 2
206
+    vqrshrn.s32 d21, q5, 2
207
+
208
+    vsubl.s16 q2, d24, d31
209
+    vsubl.s16 q3, d26, d29
210
+    vsubl.s16 q4, d28, d27
211
+    vsubl.s16 q5, d30, d25
212
+
213
+    tr8 q2, q3, q4, q5
214
+
215
+    vqrshrn.s32 d18, q12, 2
216
+    vqrshrn.s32 d22, q13, 2
217
+    vqrshrn.s32 d19, q14, 2
218
+    vqrshrn.s32 d23, q15, 2
219
+
220
+    vstm r1!, {d16-d23}
221
+
222
+    // bottom half
223
+    vld1.16 {q12}, [r0], r2
224
+    vld1.16 {q13}, [r0], r2
225
+    vld1.16 {q14}, [r0], r2
226
+    vld1.16 {q15}, [r0], r2
227
+    mov r2, #8*2
228
+
229
+    TRANSPOSE4x4x2_16 d24, d26, d28, d30,  d25, d27, d29, d31
230
+
231
+    // |--|
232
+    // |24|
233
+    // |26|
234
+    // |28|
235
+    // |30|
236
+    // |25|
237
+    // |27|
238
+    // |29|
239
+    // |31|
240
+    // |--|
241
+
242
+    vaddl.s16 q4, d28, d27
243
+    vaddl.s16 q5, d30, d25
244
+    vaddl.s16 q2, d24, d31
245
+    vaddl.s16 q3, d26, d29
246
+
247
+    tr4 q2, q3, q4, q5
248
+
249
+    vqrshrn.s32 d20, q3, 2
250
+    vqrshrn.s32 d16, q2, 2
251
+    vqrshrn.s32 d17, q4, 2
252
+    vqrshrn.s32 d21, q5, 2
253
+
254
+    vsubl.s16 q2, d24, d31
255
+    vsubl.s16 q3, d26, d29
256
+    vsubl.s16 q4, d28, d27
257
+    vsubl.s16 q5, d30, d25
258
+
259
+    tr8 q2, q3, q4, q5
260
+
261
+    vqrshrn.s32 d18, q12, 2
262
+    vqrshrn.s32 d22, q13, 2
263
+    vqrshrn.s32 d19, q14, 2
264
+    vqrshrn.s32 d23, q15, 2
265
+
266
+    vstm r1, {d16-d23}
267
+    mov r1, r3
268
+
269
+    // DCT-2D
270
+    // left half
271
+    vld1.16 {d24}, [r1], r2
272
+    vld1.16 {d26}, [r1], r2
273
+    vld1.16 {d28}, [r1], r2
274
+    vld1.16 {d30}, [r1], r2
275
+    vld1.16 {d25}, [r1], r2
276
+    vld1.16 {d27}, [r1], r2
277
+    vld1.16 {d29}, [r1], r2
278
+    vld1.16 {d31}, [r1], r2
279
+    mov r1, r3
280
+
281
+    TRANSPOSE4x4x2_16 d24, d26, d28, d30,  d25, d27, d29, d31
282
+
283
+    // |--|
284
+    // |24|
285
+    // |26|
286
+    // |28|
287
+    // |30|
288
+    // |25|
289
+    // |27|
290
+    // |29|
291
+    // |31|
292
+    // |--|
293
+
294
+    vaddl.s16 q4, d28, d27
295
+    vaddl.s16 q5, d30, d25
296
+    vaddl.s16 q2, d24, d31
297
+    vaddl.s16 q3, d26, d29
298
+
299
+    tr4 q2, q3, q4, q5
300
+
301
+    vqrshrn.s32 d18, q3, 9
302
+    vqrshrn.s32 d16, q2, 9
303
+    vqrshrn.s32 d20, q4, 9
304
+    vqrshrn.s32 d22, q5, 9
305
+
306
+    vsubl.s16 q2, d24, d31
307
+    vsubl.s16 q3, d26, d29
308
+    vsubl.s16 q4, d28, d27
309
+    vsubl.s16 q5, d30, d25
310
+
311
+    tr8 q2, q3, q4, q5
312
+
313
+    vqrshrn.s32 d17, q12, 9
314
+    vqrshrn.s32 d19, q13, 9
315
+    vqrshrn.s32 d21, q14, 9
316
+    vqrshrn.s32 d23, q15, 9
317
+
318
+    add r3, #8
319
+    vst1.16 {d16}, [r1], r2
320
+    vst1.16 {d17}, [r1], r2
321
+    vst1.16 {d18}, [r1], r2
322
+    vst1.16 {d19}, [r1], r2
323
+    vst1.16 {d20}, [r1], r2
324
+    vst1.16 {d21}, [r1], r2
325
+    vst1.16 {d22}, [r1], r2
326
+    vst1.16 {d23}, [r1], r2
327
+    mov r1, r3
328
+
329
+
330
+    // right half
331
+    vld1.16 {d24}, [r1], r2
332
+    vld1.16 {d26}, [r1], r2
333
+    vld1.16 {d28}, [r1], r2
334
+    vld1.16 {d30}, [r1], r2
335
+    vld1.16 {d25}, [r1], r2
336
+    vld1.16 {d27}, [r1], r2
337
+    vld1.16 {d29}, [r1], r2
338
+    vld1.16 {d31}, [r1], r2
339
+    mov r1, r3
340
+
341
+    TRANSPOSE4x4x2_16 d24, d26, d28, d30,  d25, d27, d29, d31
342
+
343
+    // |--|
344
+    // |24|
345
+    // |26|
346
+    // |28|
347
+    // |30|
348
+    // |25|
349
+    // |27|
350
+    // |29|
351
+    // |31|
352
+    // |--|
353
+
354
+    vaddl.s16 q4, d28, d27
355
+    vaddl.s16 q5, d30, d25
356
+    vaddl.s16 q2, d24, d31
357
+    vaddl.s16 q3, d26, d29
358
+
359
+    tr4 q2, q3, q4, q5
360
+
361
+    vqrshrn.s32 d18, q3, 9
362
+    vqrshrn.s32 d16, q2, 9
363
+    vqrshrn.s32 d20, q4, 9
364
+    vqrshrn.s32 d22, q5, 9
365
+
366
+    vsubl.s16 q2, d24, d31
367
+    vsubl.s16 q3, d26, d29
368
+    vsubl.s16 q4, d28, d27
369
+    vsubl.s16 q5, d30, d25
370
+
371
+    tr8 q2, q3, q4, q5
372
+
373
+    vqrshrn.s32 d17, q12, 9
374
+    vqrshrn.s32 d19, q13, 9
375
+    vqrshrn.s32 d21, q14, 9
376
+    vqrshrn.s32 d23, q15, 9
377
+
378
+    vst1.16 {d16}, [r1], r2
379
+    vst1.16 {d17}, [r1], r2
380
+    vst1.16 {d18}, [r1], r2
381
+    vst1.16 {d19}, [r1], r2
382
+    vst1.16 {d20}, [r1], r2
383
+    vst1.16 {d21}, [r1], r2
384
+    vst1.16 {d22}, [r1], r2
385
+    vst1.16 {d23}, [r1], r2
386
+
387
+    vpop {q4-q7}
388
+    bx lr
389
+endfunc
390
+
391
+
392
+.align 8
393
+pw_tr16: .hword 90, 87, 80, 70,  57, 43, 25,  9     // q0 = [ 9 25 43 57 70 80 87 90]
394
+         .hword 83, 36, 75, 89,  18, 50, 00, 00     // q1 = [ x  x 50 18 89 75 36 83]
395
+
396
+.align 8
397
+ctr4:
398
+    .word 83            // d0[0] = 83
399
+    .word 36            // d0[1] = 36
400
+ctr8:
401
+    .word 75            // d1[0] = 75
402
+    .word 89            // d1[1] = 89
403
+    .word 18            // d2[0] = 18
404
+    .word 50            // d2[1] = 50
405
+ctr16:
406
+    .word 90, 87        // d0
407
+    .word 80, 70        // d1
408
+    .word 57, 43        // d2
409
+    .word 25,  9        // d3
410
+
411
+/* void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride) */
412
+function x265_dct_16x16_neon
413
+    push {lr}
414
+
415
+    // fill 3 pipeline stall cycles (dependency on SP)
416
+    add r2, r2
417
+    adr r3, pw_tr16
418
+    mov r12, #16/4
419
+
420
+    vpush {q4-q7}
421
+
422
+    // TODO: 16x16 transpose buffer (may share with input buffer in future)
423
+    sub sp, #16*16*2
424
+
425
+    vld1.16 {d0-d3}, [r3]
426
+    mov r3, sp
427
+    mov lr, #4*16*2
428
+
429
+    // DCT-1D
430
+.loop1:
431
+    // Row[0-3]
432
+    vld1.16 {q8-q9}, [r0, :64], r2      // q8  = [07 06 05 04 03 02 01 00], q9  = [0F 0E 0D 0C 0B 0A 09 08]
433
+    vld1.16 {q10-q11}, [r0, :64], r2    // q10 = [17 16 15 14 13 12 11 10], q11 = [1F 1E 1D 1C 1B 1A 19 18]
434
+    vld1.16 {q12-q13}, [r0, :64], r2    // q12 = [27 26 25 24 23 22 21 20], q13 = [2F 2E 2D 2C 2B 2A 29 28]
435
+    vld1.16 {q14-q15}, [r0, :64], r2    // q14 = [37 36 35 34 33 32 31 30], q15 = [3F 3E 3D 3C 3B 3A 39 38]
436
+
437
+    // Register map
438
+    // | 16 17 18 19 |
439
+    // | 20 21 22 23 |
440
+    // | 24 25 26 27 |
441
+    // | 28 29 30 31 |
442
+
443
+    // Transpose 16x4
444
+    vtrn.32 q8, q12                     // q8  = [25 24 05 04 21 20 01 00], q12 = [27 26 07 06 23 22 03 02]
445
+    vtrn.32 q10, q14                    // q10 = [35 34 15 14 31 30 11 10], q14 = [37 36 17 16 33 32 13 12]
446
+    vtrn.32 q9, q13                     // q9  = [2D 2C 0D 0C 29 28 09 08], q13 = [2F 2E 0F 0E 2B 2A 0B 0A]
447
+    vtrn.32 q11, q15                    // q11 = [3D 3C 1D 1C 39 38 19 18], q15 = [3F 3E 1F 1E 3B 3A 1B 1A]
448
+
449
+    vtrn.16 q8, q10                     // q8  = [34 24 14 04 30 20 10 00], q10 = [35 25 15 05 31 21 11 01]
450
+    vtrn.16 q12, q14                    // q12 = [36 26 16 06 32 22 12 02], q14 = [37 27 17 07 33 23 13 03]
451
+    vtrn.16 q13, q15                    // q13 = [3E 2E 1E 0E 3A 2A 1A 0A], q15 = [3F 2F 1F 0F 3B 2B 1B 0B]
452
+    vtrn.16 q9, q11                     // q9  = [3C 2C 1C 0C 38 28 18 08], q11 = [3D 2D 1D 0D 39 29 19 09]
453
+
454
+    vswp d26, d27                       // q13 = [3A 2A 1A 0A 3E 2E 1E 0E]
455
+    vswp d30, d31                       // q15 = [3B 2B 1B 0B 3F 2F 1F 0F]
456
+    vswp d18, d19                       // q9  = [38 28 18 08 3C 2C 1C 0C]
457
+    vswp d22, d23                       // q11 = [39 29 19 09 3D 2D 1D 0D]
458
+
459
+    // E[0-7] - 10 bits
460
+    vadd.s16 q4, q8, q15                // q4  = [E4 E0]
461
+    vadd.s16 q5, q10, q13               // q5  = [E5 E1]
462
+    vadd.s16 q6, q12, q11               // q6  = [E6 E2]
463
+    vadd.s16 q7, q14, q9                // q7  = [E7 E3]
464
+
465
+    // O[0-7] - 10 bits
466
+    vsub.s16 q8, q8, q15                // q8  = [O4 O0]
467
+    vsub.s16 q9, q14, q9                // q9  = [O7 O3]
468
+    vsub.s16 q10, q10, q13              // q10 = [O5 O1]
469
+    vsub.s16 q11, q12, q11              // q11 = [O6 O2]
470
+
471
+    // reorder Ex for EE/EO
472
+    vswp d9, d14                        // q4  = [E3 E0], q7  = [E7 E4]
473
+    vswp d11, d12                       // q5  = [E2 E1], q6  = [E6 E5]
474
+    vswp d14, d15                       // q7  = [E4 E7]
475
+    vswp d12, d13                       // q6  = [E5 E6]
476
+
477
+    // EE[0-3] - 11 bits
478
+    vadd.s16 q2, q4, q7                 // q2  = [EE3 EE0]
479
+    vadd.s16 q3, q5, q6                 // q3  = [EE2 EE1]
480
+
481
+    // EO[0-3] - 11 bits
482
+    vsub.s16 q4, q4, q7                 // q4  = [EO3 EO0]
483
+    vsub.s16 q5, q5, q6                 // q5  = [EO2 EO1]
484
+
485
+    // EEx[0-1] - 12 bits
486
+    vadd.s16 d12, d4, d5                // q6  = [EEE1 EEE0]
487
+    vadd.s16 d13, d6, d7
488
+    vsub.s16 d14, d4, d5                // q7  = [EEO1 EEO0]
489
+    vsub.s16 d15, d6, d7
490
+
491
+    // NEON Register map
492
+    // Ex -> [q4, q5, q6, q7], Ox -> [q8, q9, q10, q11], Const -> [q0, q1], Free -> [q2, q3, q12, q13, q14, q15]
493
+
494
+    // ODD[4,12]
495
+    vmull.s16 q14, d14, d2[0]           // q14 = EEO0 * 83
496
+    vmull.s16 q15, d14, d2[1]           // q15 = EEO0 * 36
497
+    vmlal.s16 q14, d15, d2[1]           // q14+= EEO1 * 36
498
+    vmlsl.s16 q15, d15, d2[0]           // q15+= EEO1 *-83
499
+
500
+    vadd.s16 d4, d12, d13               // d4  = (EEE0 + EEE1)
501
+    vsub.s16 d12, d13                   // d12 = (EEE0 - EEE1)
502
+
503
+    // Row
504
+    vmull.s16 q12, d16, d0[0]           // q12 =  O0 * 90
505
+    vmull.s16 q13, d8, d2[3]            // q13 = EO0 * 89
506
+    vqrshrn.s32 d14, q14, 3
507
+    vqrshrn.s32 d15, q15, 3             // q7  = [12 4]     -> [12  4]
508
+    vmull.s16 q14, d16, d0[1]           // q14 =  O0 * 87
509
+    vmull.s16 q15, d16, d0[2]           // q15 =  O0 * 80
510
+    vshll.s16 q2, d4, #6                // q2  = (EEE0 + EEE1) * 64 -> [ 0]
511
+    vshll.s16 q6, d12, #6               // q6  = (EEE0 - EEE1) * 64 -> [ 8]
512
+
513
+    vmlal.s16 q12, d20, d0[1]           // q12+=  O1 * 87
514
+    vmlal.s16 q13, d10, d2[2]           // q13+= EO1 * 75
515
+    vmlal.s16 q14, d20, d1[0]           // q14+=  O1 * 57
516
+    vmlal.s16 q15, d20, d1[3]           // q15+=  O1 *  9
517
+    vqrshrn.s32 d4, q2, 3               // q2  = [- 0]
518
+    vqrshrn.s32 d12, q6, 3              // q6  = [- 8]
519
+
520
+    vmlal.s16 q12, d22, d0[2]           // q12+=  O2 * 80
521
+    vmlal.s16 q13, d11, d3[1]           // q13+= EO2 * 50
522
+    vmlal.s16 q14, d22, d1[3]           // q14+=  O2 *  9
523
+    vmlsl.s16 q15, d22, d0[3]           // q15+=  O2 *-70
524
+
525
+    vmlal.s16 q12, d18, d0[3]           // q12+=  O3 * 70
526
+    vmlal.s16 q13, d9,  d3[0]           // q13+= EO3 * 18   -> [ 2]
527
+    vmlsl.s16 q14, d18, d1[1]           // q14+=  O3 *-43
528
+    vmlsl.s16 q15, d18, d0[1]           // q15+=  O3 *-87
529
+
530
+    vmlal.s16 q12, d17, d1[0]           // q12+=  O4 * 57
531
+    vmlsl.s16 q14, d17, d0[2]           // q14+=  O4 *-80
532
+    vmlsl.s16 q15, d17, d1[2]           // q15+=  O4 *-25
533
+    vqrshrn.s32 d6, q13, 3              // q3  = [- 2]
534
+    vmull.s16 q13, d8,  d2[2]           // q13 = EO0 * 75
535
+
536
+    vmlal.s16 q12, d21, d1[1]           // q12+=  O5 * 43
537
+    vmlsl.s16 q13, d10, d3[0]           // q13+= EO1 *-18
538
+    vmlsl.s16 q14, d21, d0[0]           // q14+=  O5 *-90
539
+    vmlal.s16 q15, d21, d1[0]           // q15+=  O5 * 57
540
+
541
+    vmlal.s16 q12, d23, d1[2]           // q12+=  O6 * 25
542
+    vmlsl.s16 q13, d11, d2[3]           // q13+= EO2 *-89
543
+    vmlsl.s16 q14, d23, d0[3]           // q14+=  O6 *-70
544
+    vmlal.s16 q15, d23, d0[0]           // q15+=  O6 * 90
545
+
546
+    vmlal.s16 q12, d19, d1[3]           // q12+=  O7 *  9   -> [ 1]
547
+    vmlsl.s16 q13, d9,  d3[1]           // q13+= EO3 *-50   -> [ 6]
548
+    vmlsl.s16 q14, d19, d1[2]           // q14+=  O7 *-25   -> [ 3]
549
+    vmlal.s16 q15, d19, d1[1]           // q15+=  O7 * 43   -> [ 5]
550
+    vqrshrn.s32 d5, q12, 3              // q2  = [1 0]
551
+
552
+    vmull.s16 q12, d16, d0[3]           // q12 =  O0 * 70
553
+    vqrshrn.s32 d7, q14, 3              // q3  = [3 2]
554
+    vmull.s16 q14, d16, d1[0]           // q14 =  O0 * 57
555
+
556
+    vmlsl.s16 q12, d20, d1[1]           // q12+=  O1 *-43
557
+    vmlsl.s16 q14, d20, d0[2]           // q14+=  O1 *-80
558
+
559
+    vmlsl.s16 q12, d22, d0[1]           // q12+=  O2 *-87
560
+    vmlsl.s16 q14, d22, d1[2]           // q14+=  O2 *-25
561
+
562
+    vmlal.s16 q12, d18, d1[3]           // q12+=  O3 *  9
563
+    vmlal.s16 q14, d18, d0[0]           // q14+=  O3 * 90
564
+
565
+    // Row[0-3]
566
+    vst4.16 {d4-d7}, [r3], lr
567
+
568
+    vqrshrn.s32 d5, q15, 3              // q2  = [5 -]
569
+    vqrshrn.s32 d6, q13, 3              // q3  = [- 6]
570
+    vmull.s16 q13, d8,  d3[1]           // q13 = EO0 * 50
571
+    vmlal.s16 q12, d17, d0[0]           // q12+=  O4 * 90
572
+    vmlsl.s16 q14, d17, d1[3]           // q14+=  O4 *-9
573
+    vmull.s16 q15, d16, d1[1]           // q15 =  O0 * 43
574
+
575
+    vmlsl.s16 q13, d10, d2[3]           // q13+= EO1 *-89
576
+    vmlal.s16 q12, d21, d1[2]           // q12+=  O5 * 25
577
+    vmlsl.s16 q14, d21, d0[1]           // q14+=  O5 *-87
578
+    vmlsl.s16 q15, d20, d0[0]           // q15+=  O1 *-90
579
+
580
+    vmlal.s16 q13, d11, d3[0]           // q13+= EO2 * 18
581
+    vmlsl.s16 q12, d23, d0[2]           // q12+=  O6 *-80
582
+    vmlal.s16 q14, d23, d1[1]           // q14+=  O6 * 43
583
+    vmlal.s16 q15, d22, d1[0]           // q15+=  O2 * 57
584
+
585
+    vmlal.s16 q13, d9,  d2[2]           // q13+= EO3 * 75   -> [10]
586
+    vmlsl.s16 q12, d19, d1[0]           // q12+=  O7 *-57   -> [ 7]
587
+    vmlal.s16 q14, d19, d0[3]           // q14+=  O7 * 70   -> [ 9]
588
+    vmlal.s16 q15, d18, d1[2]           // q15+=  O3 * 25
589
+    vmlsl.s16 q15, d17, d0[1]           // q15+=  O4 *-87
590
+    vmlal.s16 q15, d21, d0[3]           // q15+=  O5 * 70
591
+    vmlal.s16 q15, d23, d1[3]           // q15+=  O6 *  9
592
+    vmlsl.s16 q15, d19, d0[2]           // q15+=  O7 *-80   -> [11]
593
+    vmov d4, d14                        // q2  = [5 4]
594
+    vqrshrn.s32 d14, q13, 3             // q7  = [12 10]
595
+    vmull.s16 q13, d8,  d3[0]           // q13 = EO0 * 18
596
+    vqrshrn.s32 d7, q12, 3              // q3  = [7 6]
597
+    vmull.s16 q12, d16, d1[2]           // q12 =  O0 * 25
598
+    vmlsl.s16 q13, d9,  d2[3]           // q13 = EO3 *-89
599
+    vmull.s16 q4, d16, d1[3]            // q4  =  O0 *  9
600
+    vmlsl.s16 q12, d20, d0[3]           // q12+=  O1 *-70
601
+    vmlsl.s16 q13, d10, d3[1]           // q13 = EO1 *-50
602
+    vmlsl.s16 q4, d20, d1[2]            // q4 +=  O1 *-25
603
+    vmlal.s16 q12, d22, d0[0]           // q12+=  O2 * 90
604
+    vmlal.s16 q13, d11, d2[2]           // q13 = EO2 * 75   -> [14]
605
+    vmlal.s16 q4, d22, d1[1]            // q4 +=  O2 * 43
606
+    vmlsl.s16 q12, d18, d0[2]           // q12+=  O3 *-80
607
+    vmlsl.s16 q4, d18, d1[0]            // q4 +=  O3 *-57
608
+    vmlal.s16 q12, d17, d1[1]           // q12+=  O4 * 43
609
+    vqrshrn.s32 d13, q14, 3             // q6  = [9 8]
610
+    vmov d28, d15                       // q14 = [- 12]
611
+    vqrshrn.s32 d15, q15, 3             // q7  = [11 10]
612
+    vqrshrn.s32 d30, q13, 3             // q15 = [- 14]
613
+    vmlal.s16 q4, d17, d0[3]            // q4 +=  O4 * 70
614
+    vmlal.s16 q12, d21, d1[3]           // q12+=  O5 *  9
615
+    vmlsl.s16 q4, d21, d0[2]            // q4 +=  O5 *-80
616
+    vmlsl.s16 q12, d23, d1[0]           // q12+=  O6 *-57
617
+    vmlal.s16 q4, d23, d0[1]            // q4 +=  O6 * 87
618
+    vmlal.s16 q12, d19, d0[1]           // q12+=  O7 * 87   -> [13]
619
+    vmlsl.s16 q4, d19, d0[0]            // q4 +=  O7 *-90   -> [15]
620
+
621
+    // Row[4-7]
622
+    vst4.16 {d4-d7}, [r3], lr
623
+    vqrshrn.s32 d29, q12, 3             // q14 = [13 12]
624
+    vqrshrn.s32 d31, q4, 3              // q15 = [15 14]
625
+
626
+    // Row[8-11]
627
+    vst4.16 {d12-d15}, [r3], lr
628
+
629
+    // Row[12-15]
630
+    vst4.16 {d28-d31}, [r3]!
631
+
632
+
633
+    // loop into next process group
634
+    sub r3, #3*4*16*2
635
+    subs r12, #1
636
+    bgt .loop1
637
+
638
+
639
+    // DCT-2D
640
+    // r[0,2,3,12,lr], q[2-15] are free here
641
+    mov r2, sp                          // r2 -> internal temporary buffer
642
+    mov r3, #16*2*2
643
+    mov r12, #16/4                      // Process 4 rows every loop
644
+
645
+.loop2:
646
+    vldm r2, {q8-q15}
647
+
648
+    // d16 = [30 20 10 00]
649
+    // d17 = [31 21 11 01]
650
+    // q18 = [32 22 12 02]
651
+    // d19 = [33 23 13 03]
652
+    // d20 = [34 24 14 04]
653
+    // d21 = [35 25 15 05]
654
+    // q22 = [36 26 16 06]
655
+    // d23 = [37 27 17 07]
656
+    // d24 = [38 28 18 08]
657
+    // d25 = [39 29 19 09]
658
+    // q26 = [3A 2A 1A 0A]
659
+    // d27 = [3B 2B 1B 0B]
660
+    // d28 = [3C 2C 1C 0C]
661
+    // d29 = [3D 2D 1D 0D]
662
+    // q30 = [3E 2E 1E 0E]
663
+    // d31 = [3F 2F 1F 0F]
664
+
665
+    // NOTE: ARM NEON does not have enough SIMD registers, so the even and odd parts are processed serially.
666
+
667
+    // Process Even
668
+
669
+    // E
670
+    vaddl.s16 q2,  d16, d31             // q2  = [E30 E20 E10 E00]
671
+    vaddl.s16 q3,  d17, d30             // q3  = [E31 E21 E11 E01]
672
+    vaddl.s16 q4,  d18, d29             // q4  = [E32 E22 E12 E02]
673
+    vaddl.s16 q5,  d19, d28             // q5  = [E33 E23 E13 E03]
674
+    vaddl.s16 q9,  d23, d24             // q9  = [E37 E27 E17 E07]
675
+    vaddl.s16 q8,  d22, d25             // q8  = [E36 E26 E16 E06]
676
+    vaddl.s16 q7,  d21, d26             // q7  = [E35 E25 E15 E05]
677
+    vaddl.s16 q6,  d20, d27             // q6  = [E34 E24 E14 E04]
678
+
679
+    // EE & EO
680
+    vadd.s32 q13, q2, q9                // q13 = [EE30 EE20 EE10 EE00]
681
+    vsub.s32 q9, q2, q9                 // q9  = [EO30 EO20 EO10 EO00]
682
+
683
+    vadd.s32 q2, q5, q6                 // q2  = [EE33 EE23 EE13 EE03]
684
+    vsub.s32 q12, q5, q6                // q12 = [EO33 EO23 EO13 EO03]
685
+
686
+    vadd.s32 q14, q3, q8                // q14 = [EE31 EE21 EE11 EE01]
687
+    vsub.s32 q10, q3, q8                // q10 = [EO31 EO21 EO11 EO01]
688
+
689
+    vadd.s32 q15, q4, q7                // q15 = [EE32 EE22 EE12 EE02]
690
+    vsub.s32 q11, q4, q7                // q11 = [EO32 EO22 EO12 EO02]
691
+
692
+    // Free=[3,4,5,6,7,8]
693
+
694
+    // EEE & EEO
695
+    vadd.s32 q5, q13, q2                // q5  = [EEE30 EEE20 EEE10 EEE00]
696
+    vadd.s32 q6, q14, q15               // q6  = [EEE31 EEE21 EEE11 EEE01]
697
+    vsub.s32 q7, q13, q2                // q7  = [EEO30 EEO20 EEO10 EEO00]
698
+    vsub.s32 q8, q14, q15               // q8  = [EEO31 EEO21 EEO11 EEO01]
699
+
700
+    // Convert Const for Dct EE to 32-bits
701
+    adr r0, ctr4
702
+    vld1.32 {d0-d3}, [r0, :64]
703
+
704
+    // Register Map (Qx)
705
+    // Free=[2,3,4,13,14,15], Const=[0,1], EEEx=[5,6,7,8], EO=[9,10,11,12]
706
+
707
+    vadd.s32 q15, q5, q6                // q15 = EEE0 + EEE1    ->  0
708
+    vmul.s32 q2, q9, d1[1]              // q2  = EO0 * 89       ->  2
709
+    vmul.s32 q3, q7, d0[0]              // q3  = EEO0 * 83      ->  4
710
+    vmul.s32 q4, q9, d1[0]              // q4  = EO0 * 75       ->  6
711
+    vmul.s32 q14, q9, d2[1]             // q14 = EO0 * 50       -> 10
712
+
713
+    vshl.s32 q15, #6                    // q15                  -> [ 0]'
714
+    vmla.s32 q2, q10, d1[0]             // q2 += EO1 * 75
715
+    vmla.s32 q3, q8, d0[1]              // q3 += EEO1 * 36      -> [ 4]'
716
+    vmls.s32 q4, q10, d2[0]             // q4 += EO1 *-18
717
+    vmls.s32 q14, q10, d1[1]            // q14+= EO1 *-89
718
+    vmul.s32 q13, q7, d0[1]             // q13 = EEO0 * 36      -> 12 
719
+
720
+    vqrshrn.s32 d30, q15, 10            // d30                  -> [ 0]
721
+    vqrshrn.s32 d31, q3, 10             // d31                  -> [ 4]
722
+    vmls.s32 q4, q11, d1[1]             // q4 += EO2 *-89
723
+    vsub.s32 q3, q5, q6                 // q3  = EEE0 - EEE1    ->  8
724
+    vmla.s32 q2, q11, d2[1]             // q2 += EO2 * 50
725
+    vmla.s32 q14, q11, d2[0]            // q14+= EO2 * 18
726
+    vmls.s32 q13, q8, d0[0]             // q13+= EEO1 *-83      -> [12]'
727
+    vst1.16 {d30}, [r1], r3             // Store [ 0]
728
+
729
+    vshl.s32 q3, #6                     // q3                   -> [ 8]'
730
+    vmls.s32 q4, q12, d2[1]             // q4 += EO3 *-50       -> [ 6]'
731
+    vmla.s32 q2, q12, d2[0]             // q2 += EO3 * 18       -> [ 2]'
732
+    vqrshrn.s32 d26, q13, 10            // d26                  -> [12]
733
+    vmla.s32 q14, q12, d1[0]            // q14+= EO3 * 75       -> [10]'
734
+
735
+    vqrshrn.s32 d30, q3, 10             // d30                  -> [ 8]
736
+    vmul.s32 q3, q9, d2[0]              // q3  = EO0 * 18       -> 14
737
+    vqrshrn.s32 d4, q2, 10              // d4                   -> [ 2]
738
+    vmls.s32 q3, q10, d2[1]             // q3 += EO1 *-50
739
+    vqrshrn.s32 d5, q4, 10              // d5                   -> [ 6]
740
+    vmla.s32 q3, q11, d1[0]             // q3 += EO2 * 75
741
+    vqrshrn.s32 d27, q14, 10            // d27                  -> [10]
742
+    vmls.s32 q3, q12, d1[1]             // q3 += EO3 *-89       -> [14]'
743
+
744
+    vst1.16 {d4 }, [r1], r3             // Store [ 2]
745
+    vst1.16 {d31}, [r1], r3             // Store [ 4]
746
+    vst1.16 {d5 }, [r1], r3             // Store [ 6]
747
+    vst1.16 {d30}, [r1], r3             // Store [ 8]
748
+    vqrshrn.s32 d30, q3, 10             // d30                  -> [14]
749
+    vst1.16 {d27}, [r1], r3             // Store [10]
750
+    vst1.16 {d26}, [r1], r3             // Store [12]
751
+    vst1.16 {d30}, [r1], r3             // Store [14]
752
+
753
+    // Process Odd
754
+    sub r1, #(15*16)*2
755
+    vldm r2!, {q8-q15}
756
+
757
+    // d8  = [30 20 10 00]
758
+    // d9  = [31 21 11 01]
759
+    // q10 = [32 22 12 02]
760
+    // d11 = [33 23 13 03]
761
+    // d12 = [34 24 14 04]
762
+    // d13 = [35 25 15 05]
763
+    // q14 = [36 26 16 06]
764
+    // d15 = [37 27 17 07]
765
+    // d16 = [38 28 18 08]
766
+    // d17 = [39 29 19 09]
767
+    // q18 = [3A 2A 1A 0A]
768
+    // d19 = [3B 2B 1B 0B]
769
+    // d20 = [3C 2C 1C 0C]
770
+    // d21 = [3D 2D 1D 0D]
771
+    // q22 = [3E 2E 1E 0E]
772
+    // d23 = [3F 2F 1F 0F]
773
+
774
+    // O
775
+    vsubl.s16 q2,  d16, d31             // q2  = [O30 O20 O10 O00]
776
+    vsubl.s16 q3,  d17, d30             // q3  = [O31 O21 O11 O01]
777
+    vsubl.s16 q4,  d18, d29             // q4  = [O32 O22 O12 O02]
778
+    vsubl.s16 q5,  d19, d28             // q5  = [O33 O23 O13 O03]
779
+    vsubl.s16 q9,  d23, d24             // q9  = [O37 O27 O17 O07]
780
+    vsubl.s16 q8,  d22, d25             // q8  = [O36 O26 O16 O06]
781
+    vsubl.s16 q7,  d21, d26             // q7  = [O35 O25 O15 O05]
782
+    vsubl.s16 q6,  d20, d27             // q6  = [O34 O24 O14 O04]
783
+
784
+    // Load DCT Ox Constant
785
+    adr r0, ctr16
786
+    vld1.32 {d0-d3}, [r0]
787
+
788
+    // Register Map (Qx)
789
+    // Free=[10,11,12,13,14,15], Const=[0,1], O=[2,3,4,5,6,7,8,9]
790
+
791
+    vmul.s32 q10, q2, d0[0]             // q10 = O0 * 90        ->  1
792
+    vmul.s32 q11, q2, d0[1]             // q11 = O0 * 87        ->  3
793
+    vmul.s32 q12, q2, d1[0]             // q12 = O0 * 80        ->  5
794
+    vmul.s32 q13, q2, d1[1]             // q13 = O0 * 70        ->  7
795
+    vmul.s32 q14, q2, d2[0]             // q14 = O0 * 57        ->  9
796
+    vmul.s32 q15, q2, d2[1]             // q15 = O0 * 43        -> 11
797
+
798
+    vmla.s32 q10, q3, d0[1]             // q10+= O1 * 87
799
+    vmla.s32 q11, q3, d2[0]             // q11+= O1 * 57
800
+    vmla.s32 q12, q3, d3[1]             // q12+= O1 *  9
801
+    vmls.s32 q13, q3, d2[1]             // q13+= O1 *-43
802
+    vmls.s32 q14, q3, d1[0]             // q14+= O1 *-80
803
+    vmls.s32 q15, q3, d0[0]             // q15+= O1 *-90
804
+
805
+    vmla.s32 q10, q4, d1[0]             // q10+= O2 * 80
806
+    vmla.s32 q11, q4, d3[1]             // q11+= O2 *  9
807
+    vmls.s32 q12, q4, d1[1]             // q12+= O2 *-70
808
+    vmls.s32 q13, q4, d0[1]             // q13+= O2 *-87
809
+    vmls.s32 q14, q4, d3[0]             // q14+= O2 *-25
810
+    vmla.s32 q15, q4, d2[0]             // q15+= O2 * 57
811
+
812
+    vmla.s32 q10, q5, d1[1]             // q10+= O3 * 70
813
+    vmls.s32 q11, q5, d2[1]             // q11+= O3 *-43
814
+    vmls.s32 q12, q5, d0[1]             // q12+= O3 *-87
815
+    vmla.s32 q13, q5, d3[1]             // q13+= O3 *  9
816
+    vmla.s32 q14, q5, d0[0]             // q14+= O3 * 90
817
+    vmla.s32 q15, q5, d3[0]             // q15+= O3 * 25
818
+
819
+    vmla.s32 q10, q6, d2[0]             // q10+= O4 * 57
820
+    vmls.s32 q11, q6, d1[0]             // q11+= O4 *-80
821
+    vmls.s32 q12, q6, d3[0]             // q12+= O4 *-25
822
+    vmla.s32 q13, q6, d0[0]             // q13+= O4 * 90
823
+    vmls.s32 q14, q6, d3[1]             // q14+= O4 *-9
824
+    vmls.s32 q15, q6, d0[1]             // q15+= O4 *-87
825
+
826
+    vmla.s32 q10, q7, d2[1]             // q10+= O5 * 43
827
+    vmls.s32 q11, q7, d0[0]             // q11+= O5 *-90
828
+    vmla.s32 q12, q7, d2[0]             // q12+= O5 * 57
829
+    vmla.s32 q13, q7, d3[0]             // q13+= O5 * 25
830
+    vmls.s32 q14, q7, d0[1]             // q14+= O5 *-87
831
+    vmla.s32 q15, q7, d1[1]             // q15+= O5 * 70
832
+
833
+    vmla.s32 q10, q8, d3[0]             // q10+= O6 * 25
834
+    vmls.s32 q11, q8, d1[1]             // q11+= O6 *-70
835
+    vmla.s32 q12, q8, d0[0]             // q12+= O6 * 90
836
+    vmls.s32 q13, q8, d1[0]             // q13+= O6 *-80
837
+    vmla.s32 q14, q8, d2[1]             // q14+= O6 * 43
838
+    vmla.s32 q15, q8, d3[1]             // q15+= O6 *  9
839
+
840
+    vmla.s32 q10, q9, d3[1]             // q10+= O7 *  9        -> [ 1]'
841
+    vmls.s32 q11, q9, d3[0]             // q11+= O7 *-25        -> [ 3]'
842
+    vmla.s32 q12, q9, d2[1]             // q12+= O7 * 43        -> [ 5]'
843
+    vqrshrn.s32 d20, q10, 10            // d20                  -> [ 1]
844
+    vmls.s32 q13, q9, d2[0]             // q13+= O7 *-57        -> [ 7]'
845
+    vqrshrn.s32 d21, q11, 10            // d21                  -> [ 3]
846
+
847
+    vmul.s32 q11, q2, d3[0]             // q11 = O0 * 25        -> 13
848
+    vmul.s32 q2,  q2, d3[1]             // q2  = O0 *  9        -> 15
849
+
850
+    vst1.16 {d20}, [r1], r3             // Store [ 1]
851
+    vst1.16 {d21}, [r1], r3             // Store [ 3]
852
+
853
+    vmls.s32 q11, q3, d1[1]             // q11+= O1 *-70
854
+    vmls.s32 q2,  q3, d3[0]             // q2 += O1 *-25
855
+
856
+    vmla.s32 q14, q9, d1[1]             // q14+= O7 * 70        -> [ 9]'
857
+    vmls.s32 q15, q9, d1[0]             // q15+= O7 *-80        -> [11]'
858
+
859
+    vqrshrn.s32 d24, q12, 10            // d24                  -> [ 5]
860
+
861
+    vqrshrn.s32 d25, q13, 10            // d25                  -> [ 7]
862
+    vqrshrn.s32 d28, q14, 10            // d28                  -> [ 9]
863
+    vqrshrn.s32 d29, q15, 10            // d29                  -> [11]
864
+
865
+    vst1.16 {d24}, [r1], r3             // Store [ 5]
866
+    vst1.16 {d25}, [r1], r3             // Store [ 7]
867
+    vst1.16 {d28}, [r1], r3             // Store [ 9]
868
+    vst1.16 {d29}, [r1], r3             // Store [11]
869
+
870
+    vmla.s32 q11, q4, d0[0]             // q11+= O2 * 90
871
+    vmla.s32 q2,  q4, d2[1]             // q2 += O2 * 43
872
+
873
+    vmls.s32 q11, q5, d1[0]             // q11+= O3 *-80
874
+    vmls.s32 q2,  q5, d2[0]             // q2 += O3 *-57
875
+
876
+    vmla.s32 q11, q6, d2[1]             // q11+= O4 * 43
877
+    vmla.s32 q2,  q6, d1[1]             // q2 += O4 * 70
878
+
879
+    vmla.s32 q11, q7, d3[1]             // q11+= O5 *  9
880
+    vmls.s32 q2,  q7, d1[0]             // q2 += O5 *-80
881
+
882
+    vmls.s32 q11, q8, d2[0]             // q11+= O6 *-57
883
+    vmla.s32 q2,  q8, d0[1]             // q2 += O6 * 87
884
+
885
+    vmla.s32 q11, q9, d0[1]             // q11+= O7 * 87        -> [13]'
886
+    vmls.s32 q2,  q9, d0[0]             // q2 += O7 *-90        -> [15]'
887
+
888
+    vqrshrn.s32 d6, q11, 10             // d6                   -> [13]
889
+    vqrshrn.s32 d7, q2, 10              // d7                   -> [15]
890
+    vst1.16 {d6}, [r1], r3              // Store [13]
891
+    vst1.16 {d7}, [r1], r3              // Store [15]
892
+
893
+    sub r1, #(17*16-4)*2
894
+    subs r12, #1
895
+    bgt .loop2
896
+
897
+    add sp, #16*16*2
898
+    vpop {q4-q7}
899
+    pop {pc}
900
+endfunc
901
+
902
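
Across the three transforms the vqrshrn immediates follow the standard HEVC forward-DCT normalization for 8-bit input: pass 1 rounds down by log2(N) - 1 and pass 2 by log2(N) + 6, giving the shift pairs (1, 8) for 4x4, (2, 9) for 8x8 and (3, 10) for 16x16 used above.
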
x265_2.0.tar.gz/source/common/arm/dct8.h Added
34
 
1
@@ -0,0 +1,32 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Min Chen <chenm003@163.com>
6
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
7
+ *
8
+ * This program is free software; you can redistribute it and/or modify
9
+ * it under the terms of the GNU General Public License as published by
10
+ * the Free Software Foundation; either version 2 of the License, or
11
+ * (at your option) any later version.
12
+ *
13
+ * This program is distributed in the hope that it will be useful,
14
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
+ * GNU General Public License for more details.
17
+ *
18
+ * You should have received a copy of the GNU General Public License
19
+ * along with this program; if not, write to the Free Software
20
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21
+ *
22
+ * This program is also available under a commercial proprietary license.
23
+ * For more information, contact us at license @ x265.com.
24
+ *****************************************************************************/
25
+
26
+#ifndef X265_DCT8_ARM_H
27
+#define X265_DCT8_ARM_H
28
+
29
+void PFX(dct_4x4_neon)(const int16_t* src, int16_t* dst, intptr_t srcStride);
30
+void PFX(dct_8x8_neon)(const int16_t* src, int16_t* dst, intptr_t srcStride);
31
+void PFX(dct_16x16_neon)(const int16_t* src, int16_t* dst, intptr_t srcStride);
32
+
33
+#endif // ifndef X265_DCT8_ARM_H
34
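
These prototypes are picked up by x265's ARM primitive setup. A hedged sketch of the wiring; the EncoderPrimitives layout and BLOCK_* names are assumptions modelled on x265's primitive tables, not taken from this patch:

    // Hedged sketch: hooking the NEON DCTs into the primitive table.
    #include "primitives.h"
    #include "dct8.h"

    static void setupNeonDct(EncoderPrimitives& p)
    {
        p.cu[BLOCK_4x4].dct   = PFX(dct_4x4_neon);
        p.cu[BLOCK_8x8].dct   = PFX(dct_8x8_neon);
        p.cu[BLOCK_16x16].dct = PFX(dct_16x16_neon);
    }
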
x265_2.0.tar.gz/source/common/arm/intrapred.h Added
33
 
1
@@ -0,0 +1,31 @@
2
+/*****************************************************************************
3
+ * intrapred.h: Intra Prediction metrics
4
+ *****************************************************************************
5
+ * Copyright (C) 2003-2013 x264 project
6
+ *
7
+ * Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
8
+ *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
9
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
10
+ *
11
+ * This program is free software; you can redistribute it and/or modify
12
+ * it under the terms of the GNU General Public License as published by
13
+ * the Free Software Foundation; either version 2 of the License, or
14
+ * (at your option) any later version.
15
+ *
16
+ * This program is distributed in the hope that it will be useful,
17
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19
+ * GNU General Public License for more details.
20
+ *
21
+ * You should have received a copy of the GNU General Public License
22
+ * along with this program; if not, write to the Free Software
23
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
24
+ *
25
+ * This program is also available under a commercial proprietary license.
26
+ * For more information, contact us at license @ x265.com.
27
+ *****************************************************************************/
28
+
29
+#ifndef X265_INTRAPRED_ARM_H
30
+#define X265_INTRAPRED_ARM_H
31
+
32
+#endif // ifndef X265_INTRAPRED_ARM_H
33
x265_2.0.tar.gz/source/common/arm/ipfilter8.S Added
3343
 
1
@@ -0,0 +1,3341 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Dnyaneshwar G <dnyaneshwar@multicorewareinc.com>
6
+ *          Radhakrishnan VR <radhakrishnan@multicorewareinc.com>
7
+ *          Min Chen <min.chen@multicorewareinc.com>
8
+ * 
9
+ * This program is free software; you can redistribute it and/or modify
10
+ * it under the terms of the GNU General Public License as published by
11
+ * the Free Software Foundation; either version 2 of the License, or
12
+ * (at your option) any later version.
13
+ *
14
+ * This program is distributed in the hope that it will be useful,
15
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
+ * GNU General Public License for more details.
18
+ *
19
+ * You should have received a copy of the GNU General Public License
20
+ * along with this program; if not, write to the Free Software
21
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22
+ *
23
+ * This program is also available under a commercial proprietary license.
24
+ * For more information, contact us at license @ x265.com.
25
+ *****************************************************************************/
26
+
27
+#include "asm.S"
28
+
29
+.section .rodata
30
+.align 4
31
+
32
+g_lumaFilter:
33
+.word 0,0,0,0,0,0,64,64,0,0,0,0,0,0,0,0
34
+.word -1,-1,4,4,-10,-10,58,58,17,17,-5,-5,1,1,0,0
35
+.word -1,-1,4,4,-11,-11,40,40,40,40,-11,-11,4,4,-1,-1
36
+.word 0,0,1,1,-5,-5,17,17,58,58,-10,-10,4,4,-1,-1 
37
+g_chromaFilter:
38
+.word 0, 0, 64, 64, 0, 0, 0, 0
39
+.word -2, -2, 58, 58, 10, 10, -2, -2
40
+.word -4, -4, 54, 54, 16, 16, -2, -2
41
+.word -6, -6, 46, 46, 28, 28, -4, -4
42
+.word -4, -4, 36, 36, 36, 36, -4, -4
43
+.word -4, -4, 28, 28, 46, 46, -6, -6
44
+.word -2, -2, 16, 16, 54, 54, -4, -4
45
+.word -2, -2, 10, 10, 58, 58, -2, -2
46
+
47
+
48
+.text
49
+
50
+// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
51
+function x265_filterPixelToShort_4x4_neon
52
+    vld1.u32    {d0[]}, [r0], r1
53
+    vld1.u32    {d0[1]}, [r0], r1
54
+    vld1.u32    {d1[]}, [r0], r1
55
+    vld1.u32    {d1[1]}, [r0], r1
56
+
57
+    // avoid load pipeline stall
58
+    vmov.i16    q1, #0xE000
59
+
60
+    vshll.u8    q2, d0, #6
61
+    vshll.u8    q3, d1, #6
62
+    vadd.i16    q2, q1
63
+    vadd.i16    q3, q1
64
+
65
+    add         r3, r3
66
+    vst1.16     {d4}, [r2], r3
67
+    vst1.16     {d5}, [r2], r3
68
+    vst1.16     {d6}, [r2], r3
69
+    vst1.16     {d7}, [r2], r3
70
+
71
+    bx          lr
72
+endfunc
73
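
Every filterPixelToShort_WxH variant in this file performs the same per-pixel mapping and differs only in block geometry: 0xE000 is -8192 as a signed 16-bit value, so each output is (pixel << 6) - 8192, the pixel scaled into x265's 16-bit intermediate domain. A scalar sketch of the common operation, with strides in elements rather than the byte strides the assembly uses:

    #include <stdint.h>

    // Scalar sketch of the filterPixelToShort_WxH family:
    // dst = (src << 6) - 8192 for every pixel of a WxH block.
    static void filterPixelToShort_ref(const uint8_t* src, intptr_t srcStride,
                                       int16_t* dst, intptr_t dstStride,
                                       int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (int16_t)((src[x] << 6) - 8192);
            src += srcStride;
            dst += dstStride;
        }
    }
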
+
74
+function x265_filterPixelToShort_4x8_neon
75
+    add         r3, r3
76
+    vmov.u16    q8, #64
77
+    vmov.u16    q9, #8192
78
+    vneg.s16    q9, q9
79
+.rept 4
80
+    vld1.u8     {d0}, [r0], r1
81
+    vld1.u8     {d2}, [r0], r1
82
+    vmovl.u8    q0, d0
83
+    vmovl.u8    q1, d2
84
+    vmov        q2, q9
85
+    vmov        q3, q9
86
+    vmla.s16    q2, q0, q8
87
+    vmla.s16    q3, q1, q8
88
+    vst1.16     {d4}, [r2], r3
89
+    vst1.16     {d6}, [r2], r3
90
+.endr
91
+    bx          lr
92
+endfunc
93
+
94
+function x265_filterPixelToShort_4x16_neon
95
+    add         r3, r3
96
+    vmov.u16    q8, #64
97
+    vmov.u16    q9, #8192
98
+    vneg.s16    q9, q9
99
+.rept 8
100
+    vld1.u8     {d0}, [r0], r1
101
+    vld1.u8     {d2}, [r0], r1
102
+    vmovl.u8    q0, d0
103
+    vmovl.u8    q1, d2
104
+    vmov        q2, q9
105
+    vmov        q3, q9
106
+    vmla.s16    q2, q0, q8
107
+    vmla.s16    q3, q1, q8
108
+    vst1.16     {d4}, [r2], r3
109
+    vst1.16     {d6}, [r2], r3
110
+.endr
111
+    bx          lr
112
+endfunc
113
+
114
+function x265_filterPixelToShort_8x4_neon
115
+    add         r3, r3
116
+    vmov.u16    q8, #64
117
+    vmov.u16    q9, #8192
118
+    vneg.s16    q9, q9
119
+.rept 2
120
+    vld1.u8     {d0}, [r0], r1
121
+    vld1.u8     {d2}, [r0], r1
122
+    vmovl.u8    q0, d0
123
+    vmovl.u8    q1, d2
124
+    vmov        q2, q9
125
+    vmov        q3, q9
126
+    vmla.s16    q2, q0, q8
127
+    vmla.s16    q3, q1, q8
128
+    vst1.16     {q2}, [r2], r3
129
+    vst1.16     {q3}, [r2], r3
130
+.endr
131
+    bx          lr
132
+endfunc
133
+
134
+function x265_filterPixelToShort_8x8_neon
135
+    add         r3, r3
136
+    vmov.u16    q8, #64
137
+    vmov.u16    q9, #8192
138
+    vneg.s16    q9, q9
139
+.rept 4
140
+    vld1.u8     {d0}, [r0], r1
141
+    vld1.u8     {d2}, [r0], r1
142
+    vmovl.u8    q0, d0
143
+    vmovl.u8    q1, d2
144
+    vmov        q2, q9
145
+    vmov        q3, q9
146
+    vmla.s16    q2, q0, q8
147
+    vmla.s16    q3, q1, q8
148
+    vst1.16     {q2}, [r2], r3
149
+    vst1.16     {q3}, [r2], r3
150
+.endr
151
+    bx          lr
152
+endfunc
153
+
154
+function x265_filterPixelToShort_8x16_neon
155
+    add         r3, r3
156
+    vmov.u16    q8, #64
157
+    vmov.u16    q9, #8192
158
+    vneg.s16    q9, q9
159
+.rept 8
160
+    vld1.u8     {d0}, [r0], r1
161
+    vld1.u8     {d2}, [r0], r1
162
+    vmovl.u8    q0, d0
163
+    vmovl.u8    q1, d2
164
+    vmov        q2, q9
165
+    vmov        q3, q9
166
+    vmla.s16    q2, q0, q8
167
+    vmla.s16    q3, q1, q8
168
+    vst1.16     {q2}, [r2], r3
169
+    vst1.16     {q3}, [r2], r3
170
+.endr
171
+    bx          lr
172
+endfunc
173
+
174
+function x265_filterPixelToShort_8x32_neon
175
+    add         r3, r3
176
+    vmov.u16    q8, #64
177
+    vmov.u16    q9, #8192
178
+    vneg.s16    q9, q9
179
+.rept 16
180
+    vld1.u8     {d0}, [r0], r1
181
+    vld1.u8     {d2}, [r0], r1
182
+    vmovl.u8    q0, d0
183
+    vmovl.u8    q1, d2
184
+    vmov        q2, q9
185
+    vmov        q3, q9
186
+    vmla.s16    q2, q0, q8
187
+    vmla.s16    q3, q1, q8
188
+    vst1.16     {q2}, [r2], r3
189
+    vst1.16     {q3}, [r2], r3
190
+.endr
191
+    bx          lr
192
+endfunc
193
+
194
+function x265_filterPixelToShort_12x16_neon
195
+    add         r3, r3
196
+    vmov.u16    q8, #64
197
+    vmov.u16    q9, #8192
198
+    vneg.s16    q9, q9
199
+.rept 16
200
+    vld1.u8     {d2-d3}, [r0], r1
201
+    vmovl.u8    q0, d2
202
+    vmovl.u8    q1, d3
203
+    vmov        q2, q9
204
+    vmov        q3, q9
205
+    vmla.s16    q2, q0, q8
206
+    vmla.s16    q3, q1, q8
207
+    vst1.16     {d4, d5, d6}, [r2], r3
208
+.endr
209
+    bx          lr
210
+endfunc
211
+
212
+function x265_filterPixelToShort_16x4_neon
213
+    add         r3, r3
214
+    vmov.u16    q8, #64
215
+    vmov.u16    q9, #8192
216
+    vneg.s16    q9, q9
217
+.rept 4
218
+    vld1.u8     {d2-d3}, [r0], r1
219
+    vmovl.u8    q0, d2
220
+    vmovl.u8    q1, d3
221
+    vmov        q2, q9
222
+    vmov        q3, q9
223
+    vmla.s16    q2, q0, q8
224
+    vmla.s16    q3, q1, q8
225
+    vst1.16     {q2-q3}, [r2], r3
226
+.endr
227
+    bx          lr
228
+endfunc
229
+
230
+function x265_filterPixelToShort_16x8_neon
231
+    add         r3, r3
232
+    vmov.u16    q8, #64
233
+    vmov.u16    q9, #8192
234
+    vneg.s16    q9, q9
235
+.rept 8
236
+    vld1.u8     {d2-d3}, [r0], r1
237
+    vmovl.u8    q0, d2
238
+    vmovl.u8    q1, d3
239
+    vmov        q2, q9
240
+    vmov        q3, q9
241
+    vmla.s16    q2, q0, q8
242
+    vmla.s16    q3, q1, q8
243
+    vst1.16     {q2-q3}, [r2], r3
244
+.endr
245
+    bx          lr
246
+endfunc
247
+
248
+function x265_filterPixelToShort_16x12_neon
249
+    add         r3, r3
250
+    vmov.u16    q8, #64
251
+    vmov.u16    q9, #8192
252
+    vneg.s16    q9, q9
253
+.rept 12
254
+    vld1.u8     {d2-d3}, [r0], r1
255
+    vmovl.u8    q0, d2
256
+    vmovl.u8    q1, d3
257
+    vmov        q2, q9
258
+    vmov        q3, q9
259
+    vmla.s16    q2, q0, q8
260
+    vmla.s16    q3, q1, q8
261
+    vst1.16     {q2-q3}, [r2], r3
262
+.endr
263
+    bx          lr
264
+endfunc
265
+
266
+function x265_filterPixelToShort_16x16_neon
267
+    add         r3, r3
268
+    vmov.u16    q8, #64
269
+    vmov.u16    q9, #8192
270
+    vneg.s16    q9, q9
271
+.rept 16
272
+    vld1.u8     {d2-d3}, [r0], r1
273
+    vmovl.u8    q0, d2
274
+    vmovl.u8    q1, d3
275
+    vmov        q2, q9
276
+    vmov        q3, q9
277
+    vmla.s16    q2, q0, q8
278
+    vmla.s16    q3, q1, q8
279
+    vst1.16     {q2-q3}, [r2], r3
280
+.endr
281
+    bx          lr
282
+endfunc
283
+
284
+function x265_filterPixelToShort_16x32_neon
285
+    add         r3, r3
286
+    vmov.u16    q8, #64
287
+    vmov.u16    q9, #8192
288
+    vneg.s16    q9, q9
289
+.rept 32
290
+    vld1.u8     {d2-d3}, [r0], r1
291
+    vmovl.u8    q0, d2
292
+    vmovl.u8    q1, d3
293
+    vmov        q2, q9
294
+    vmov        q3, q9
295
+    vmla.s16    q2, q0, q8
296
+    vmla.s16    q3, q1, q8
297
+    vst1.16     {q2-q3}, [r2], r3
298
+.endr
299
+    bx          lr
300
+endfunc
301
+
302
+function x265_filterPixelToShort_16x64_neon
303
+    add         r3, r3
304
+    vmov.u16    q8, #64
305
+    vmov.u16    q9, #8192
306
+    vneg.s16    q9, q9
307
+.rept 64
308
+    vld1.u8     {d2-d3}, [r0], r1
309
+    vmovl.u8    q0, d2
310
+    vmovl.u8    q1, d3
311
+    vmov        q2, q9
312
+    vmov        q3, q9
313
+    vmla.s16    q2, q0, q8
314
+    vmla.s16    q3, q1, q8
315
+    vst1.16     {q2-q3}, [r2], r3
316
+.endr   
317
+    bx          lr
318
+endfunc
319
+
320
+function x265_filterPixelToShort_24x32_neon
321
+    add         r3, r3
322
+    sub         r3, #32
323
+    vmov.u16    q0, #64
324
+    vmov.u16    q1, #8192
325
+    vneg.s16    q1, q1
326
+.rept 32
327
+    vld1.u8     {d18, d19, d20}, [r0], r1
328
+    vmovl.u8    q8, d18
329
+    vmovl.u8    q9, d19
330
+    vmovl.u8    q11, d20
331
+    vmov        q2, q1
332
+    vmov        q3, q1
333
+    vmla.s16    q2, q8, q0
334
+    vmla.s16    q3, q9, q0
335
+    vst1.16     {q2-q3}, [r2]!
336
+    vmov        q2, q1
337
+    vmla.s16    q2, q11, q0
338
+    vst1.16     {q2}, [r2], r3
339
+.endr
340
+    bx          lr
341
+endfunc
342
+
343
+function x265_filterPixelToShort_32x8_neon
344
+    add         r3, r3
345
+    sub         r3, #32
346
+    vmov.u16    q0, #64
347
+    vmov.u16    q1, #8192
348
+    vneg.s16    q1, q1
349
+.rept 8
350
+    vld1.u8     {q9-q10}, [r0], r1
351
+    vmovl.u8    q8, d18
352
+    vmovl.u8    q9, d19
353
+    vmovl.u8    q11, d20
354
+    vmovl.u8    q10, d21
355
+    vmov        q2, q1
356
+    vmov        q3, q1
357
+    vmla.s16    q2, q8, q0
358
+    vmla.s16    q3, q9, q0
359
+    vst1.16     {q2-q3}, [r2]!
360
+    vmov        q2, q1
361
+    vmov        q3, q1
362
+    vmla.s16    q2, q11, q0
363
+    vmla.s16    q3, q10, q0
364
+    vst1.16     {q2-q3}, [r2], r3
365
+.endr
366
+    bx          lr
367
+endfunc
368
+
369
+function x265_filterPixelToShort_32x16_neon
370
+    add         r3, r3
371
+    sub         r3, #32
372
+    vmov.u16    q0, #64
373
+    vmov.u16    q1, #8192
374
+    vneg.s16    q1, q1
375
+    mov         r12, #8
376
+.loop_filterP2S_32x16:
377
+    subs        r12, #1
378
+.rept 2
379
+    vld1.u8     {q9-q10}, [r0], r1
380
+    vmovl.u8    q8, d18
381
+    vmovl.u8    q9, d19
382
+    vmovl.u8    q11, d20
383
+    vmovl.u8    q10, d21
384
+    vmov        q2, q1
385
+    vmov        q3, q1
386
+    vmla.s16    q2, q8, q0
387
+    vmla.s16    q3, q9, q0
388
+    vst1.16     {q2-q3}, [r2]!
389
+    vmov        q2, q1
390
+    vmov        q3, q1
391
+    vmla.s16    q2, q11, q0
392
+    vmla.s16    q3, q10, q0
393
+    vst1.16     {q2-q3}, [r2], r3
394
+.endr
395
+    bgt         .loop_filterP2S_32x16
396
+    bx          lr
397
+endfunc
398
+
399
+function x265_filterPixelToShort_32x24_neon
400
+    add         r3, r3
401
+    sub         r3, #32
402
+    vmov.u16    q0, #64
403
+    vmov.u16    q1, #8192
404
+    vneg.s16    q1, q1
405
+    mov         r12, #12
406
+.loop_filterP2S_32x24:
407
+    subs        r12, #1
408
+.rept 2
409
+    vld1.u8     {q9-q10}, [r0], r1
410
+    vmovl.u8    q8, d18
411
+    vmovl.u8    q9, d19
412
+    vmovl.u8    q11, d20
413
+    vmovl.u8    q10, d21
414
+    vmov        q2, q1
415
+    vmov        q3, q1
416
+    vmla.s16    q2, q8, q0
417
+    vmla.s16    q3, q9, q0
418
+    vst1.16     {q2-q3}, [r2]!
419
+    vmov        q2, q1
420
+    vmov        q3, q1
421
+    vmla.s16    q2, q11, q0
422
+    vmla.s16    q3, q10, q0
423
+    vst1.16     {q2-q3}, [r2], r3
424
+.endr
425
+    bgt         .loop_filterP2S_32x24
426
+    bx          lr
427
+endfunc
428
+
429
+function x265_filterPixelToShort_32x32_neon
430
+    add         r3, r3
431
+    sub         r3, #32
432
+    vmov.u16    q0, #64
433
+    vmov.u16    q1, #8192
434
+    vneg.s16    q1, q1
435
+    mov         r12, #16
436
+.loop_filterP2S_32x32:
437
+    subs        r12, #1
438
+.rept 2
439
+    vld1.u8     {q9-q10}, [r0], r1
440
+    vmovl.u8    q8, d18
441
+    vmovl.u8    q9, d19
442
+    vmovl.u8    q11, d20
443
+    vmovl.u8    q10, d21
444
+    vmov        q2, q1
445
+    vmov        q3, q1
446
+    vmla.s16    q2, q8, q0
447
+    vmla.s16    q3, q9, q0
448
+    vst1.16     {q2-q3}, [r2]!
449
+    vmov        q2, q1
450
+    vmov        q3, q1
451
+    vmla.s16    q2, q11, q0
452
+    vmla.s16    q3, q10, q0
453
+    vst1.16     {q2-q3}, [r2], r3
454
+.endr
455
+    bgt         .loop_filterP2S_32x32
456
+    bx          lr
457
+endfunc
458
+
459
+function x265_filterPixelToShort_32x64_neon
460
+    add         r3, r3
461
+    sub         r3, #32
462
+    vmov.u16    q0, #64
463
+    vmov.u16    q1, #8192
464
+    vneg.s16    q1, q1
465
+    mov         r12, #32
466
+.loop_filterP2S_32x64:
467
+    subs        r12, #1
468
+.rept 2
469
+    vld1.u8     {q9-q10}, [r0], r1
470
+    vmovl.u8    q8, d18
471
+    vmovl.u8    q9, d19
472
+    vmovl.u8    q11, d20
473
+    vmovl.u8    q10, d21
474
+    vmov        q2, q1
475
+    vmov        q3, q1
476
+    vmla.s16    q2, q8, q0
477
+    vmla.s16    q3, q9, q0
478
+    vst1.16     {q2-q3}, [r2]!
479
+    vmov        q2, q1
480
+    vmov        q3, q1
481
+    vmla.s16    q2, q11, q0
482
+    vmla.s16    q3, q10, q0
483
+    vst1.16     {q2-q3}, [r2], r3
484
+.endr
485
+    bgt         .loop_filterP2S_32x64
486
+    bx          lr
487
+endfunc
488
+
489
+function x265_filterPixelToShort_64x16_neon
490
+    add         r3, r3
491
+    sub         r1, #32
492
+    sub         r3, #96
493
+    vmov.u16    q0, #64
494
+    vmov.u16    q1, #8192
495
+    vneg.s16    q1, q1
496
+    mov         r12, #8
497
+.loop_filterP2S_64x16:
498
+    subs        r12, #1
499
+.rept 2
500
+    vld1.u8     {q9-q10}, [r0]!
501
+    vmovl.u8    q8, d18
502
+    vmovl.u8    q9, d19
503
+    vmovl.u8    q11, d20
504
+    vmovl.u8    q10, d21
505
+    vmov        q2, q1
506
+    vmov        q3, q1
507
+    vmla.s16    q2, q8, q0
508
+    vmla.s16    q3, q9, q0
509
+    vst1.16     {q2-q3}, [r2]!
510
+    vmov        q2, q1
511
+    vmov        q3, q1
512
+    vmla.s16    q2, q11, q0
513
+    vmla.s16    q3, q10, q0
514
+    vst1.16     {q2-q3}, [r2]!
515
+
516
+    vld1.u8     {q9-q10}, [r0], r1
517
+    vmovl.u8    q8, d18
518
+    vmovl.u8    q9, d19
519
+    vmovl.u8    q11, d20
520
+    vmovl.u8    q10, d21
521
+    vmov        q2, q1
522
+    vmov        q3, q1
523
+    vmla.s16    q2, q8, q0
524
+    vmla.s16    q3, q9, q0
525
+    vst1.16     {q2-q3}, [r2]!
526
+    vmov        q2, q1
527
+    vmov        q3, q1
528
+    vmla.s16    q2, q11, q0
529
+    vmla.s16    q3, q10, q0
530
+    vst1.16     {q2-q3}, [r2], r3
531
+.endr
532
+    bgt         .loop_filterP2S_64x16
533
+    bx          lr
534
+endfunc
535
+
536
+function x265_filterPixelToShort_64x32_neon
537
+    add         r3, r3
538
+    sub         r1, #32
539
+    sub         r3, #96
540
+    vmov.u16    q0, #64
541
+    vmov.u16    q1, #8192
542
+    vneg.s16    q1, q1
543
+    mov         r12, #16
544
+.loop_filterP2S_64x32:
545
+    subs        r12, #1
546
+.rept 2
547
+    vld1.u8     {q9-q10}, [r0]!
548
+    vmovl.u8    q8, d18
549
+    vmovl.u8    q9, d19
550
+    vmovl.u8    q11, d20
551
+    vmovl.u8    q10, d21
552
+    vmov        q2, q1
553
+    vmov        q3, q1
554
+    vmla.s16    q2, q8, q0
555
+    vmla.s16    q3, q9, q0
556
+    vst1.16     {q2-q3}, [r2]!
557
+    vmov        q2, q1
558
+    vmov        q3, q1
559
+    vmla.s16    q2, q11, q0
560
+    vmla.s16    q3, q10, q0
561
+    vst1.16     {q2-q3}, [r2]!
562
+
563
+    vld1.u8     {q9-q10}, [r0], r1
564
+    vmovl.u8    q8, d18
565
+    vmovl.u8    q9, d19
566
+    vmovl.u8    q11, d20
567
+    vmovl.u8    q10, d21
568
+    vmov        q2, q1
569
+    vmov        q3, q1
570
+    vmla.s16    q2, q8, q0
571
+    vmla.s16    q3, q9, q0
572
+    vst1.16     {q2-q3}, [r2]!
573
+    vmov        q2, q1
574
+    vmov        q3, q1
575
+    vmla.s16    q2, q11, q0
576
+    vmla.s16    q3, q10, q0
577
+    vst1.16     {q2-q3}, [r2], r3
578
+.endr
579
+    bgt         .loop_filterP2S_64x32
580
+    bx          lr
581
+endfunc
582
+
583
+function x265_filterPixelToShort_64x48_neon
584
+    add         r3, r3
585
+    sub         r1, #32
586
+    sub         r3, #96
587
+    vmov.u16    q0, #64
588
+    vmov.u16    q1, #8192
589
+    vneg.s16    q1, q1
590
+    mov         r12, #24
591
+.loop_filterP2S_64x48:
592
+    subs        r12, #1
593
+.rept 2
594
+    vld1.u8     {q9-q10}, [r0]!
595
+    vmovl.u8    q8, d18
596
+    vmovl.u8    q9, d19
597
+    vmovl.u8    q11, d20
598
+    vmovl.u8    q10, d21
599
+    vmov        q2, q1
600
+    vmov        q3, q1
601
+    vmla.s16    q2, q8, q0
602
+    vmla.s16    q3, q9, q0
603
+    vst1.16     {q2-q3}, [r2]!
604
+    vmov        q2, q1
605
+    vmov        q3, q1
606
+    vmla.s16    q2, q11, q0
607
+    vmla.s16    q3, q10, q0
608
+    vst1.16     {q2-q3}, [r2]!
609
+
610
+    vld1.u8     {q9-q10}, [r0], r1
611
+    vmovl.u8    q8, d18
612
+    vmovl.u8    q9, d19
613
+    vmovl.u8    q11, d20
614
+    vmovl.u8    q10, d21
615
+    vmov        q2, q1
616
+    vmov        q3, q1
617
+    vmla.s16    q2, q8, q0
618
+    vmla.s16    q3, q9, q0
619
+    vst1.16     {q2-q3}, [r2]!
620
+    vmov        q2, q1
621
+    vmov        q3, q1
622
+    vmla.s16    q2, q11, q0
623
+    vmla.s16    q3, q10, q0
624
+    vst1.16     {q2-q3}, [r2], r3
625
+.endr
626
+    bgt         .loop_filterP2S_64x48
627
+    bx          lr
628
+endfunc
629
+
630
+function x265_filterPixelToShort_64x64_neon
631
+    add         r3, r3
632
+    sub         r1, #32
633
+    sub         r3, #96
634
+    vmov.u16    q0, #64
635
+    vmov.u16    q1, #8192
636
+    vneg.s16    q1, q1
637
+    mov         r12, #32
638
+.loop_filterP2S_64x64:
639
+    subs        r12, #1
640
+.rept 2
641
+    vld1.u8     {q9-q10}, [r0]!
642
+    vmovl.u8    q8, d18
643
+    vmovl.u8    q9, d19
644
+    vmovl.u8    q11, d20
645
+    vmovl.u8    q10, d21
646
+    vmov        q2, q1
647
+    vmov        q3, q1
648
+    vmla.s16    q2, q8, q0
649
+    vmla.s16    q3, q9, q0
650
+    vst1.16     {q2-q3}, [r2]!
651
+    vmov        q2, q1
652
+    vmov        q3, q1
653
+    vmla.s16    q2, q11, q0
654
+    vmla.s16    q3, q10, q0
655
+    vst1.16     {q2-q3}, [r2]!
656
+
657
+    vld1.u8     {q9-q10}, [r0], r1
658
+    vmovl.u8    q8, d18
659
+    vmovl.u8    q9, d19
660
+    vmovl.u8    q11, d20
661
+    vmovl.u8    q10, d21
662
+    vmov        q2, q1
663
+    vmov        q3, q1
664
+    vmla.s16    q2, q8, q0
665
+    vmla.s16    q3, q9, q0
666
+    vst1.16     {q2-q3}, [r2]!
667
+    vmov        q2, q1
668
+    vmov        q3, q1
669
+    vmla.s16    q2, q11, q0
670
+    vmla.s16    q3, q10, q0
671
+    vst1.16     {q2-q3}, [r2], r3
672
+.endr
673
+    bgt         .loop_filterP2S_64x64
674
+    bx          lr
675
+endfunc
676
+
677
+function x265_filterPixelToShort_48x64_neon
678
+    add         r3, r3
679
+    sub         r1, #32
680
+    sub         r3, #64
681
+    vmov.u16    q0, #64
682
+    vmov.u16    q1, #8192
683
+    vneg.s16    q1, q1
684
+    mov         r12, #32
685
+.loop_filterP2S_48x64:
686
+    subs        r12, #1
687
+.rept 2
688
+    vld1.u8     {q9-q10}, [r0]!
689
+    vmovl.u8    q8, d18
690
+    vmovl.u8    q9, d19
691
+    vmovl.u8    q11, d20
692
+    vmovl.u8    q10, d21
693
+    vmov        q2, q1
694
+    vmov        q3, q1
695
+    vmla.s16    q2, q8, q0
696
+    vmla.s16    q3, q9, q0
697
+    vst1.16     {q2-q3}, [r2]!
698
+    vmov        q2, q1
699
+    vmov        q3, q1
700
+    vmla.s16    q2, q11, q0
701
+    vmla.s16    q3, q10, q0
702
+    vst1.16     {q2-q3}, [r2]!
703
+
704
+    vld1.u8     {q9}, [r0], r1
705
+    vmovl.u8    q8, d18
706
+    vmovl.u8    q9, d19
707
+    vmov        q2, q1
708
+    vmov        q3, q1
709
+    vmla.s16    q2, q8, q0
710
+    vmla.s16    q3, q9, q0
711
+    vst1.16     {q2-q3}, [r2], r3
712
+.endr
713
+    bgt         .loop_filterP2S_48x64
714
+    bx          lr
715
+endfunc
716
+
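Every filterPixelToShort_* kernel above applies the same per-pixel transform: the 8-bit source is scaled into the encoder's 16-bit coefficient domain as p * 64 - 8192, which is why each prologue splats #64 into q0, negates #8192 into q1, and doubles the destination stride r3 (it counts int16 elements). A minimal scalar sketch of the family, under a hypothetical name rather than x265's actual C primitive:

    #include <stdint.h>

    /* dst[i] = src[i] * 64 - 8192: one pixel at a time, what the asm does
     * many lanes at a time with vmla.s16 into a -8192 accumulator.
     * Strides are in elements here; the asm works in bytes. */
    static void filterPixelToShort_c(const uint8_t *src, intptr_t srcStride,
                                     int16_t *dst, intptr_t dstStride,
                                     int width, int height)
    {
        for (int y = 0; y < height; y++)
            for (int x = 0; x < width; x++)
                dst[y * dstStride + x] =
                    (int16_t)(src[y * srcStride + x] * 64 - 8192);
    }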
717
+//**************luma_vpp************
718
+.align 8
719
+// TODO: using S16 here is wasteful, but VMUL by scalar does not support U8 x U8
720
+g_luma_s16:
721
+.hword   0, 0,   0, 64,  0,   0, 0,  0
722
+.hword  -1, 4, -10, 58, 17,  -5, 1,  0
723
+.hword  -1, 4, -11, 40, 40, -11, 4, -1
724
+.hword   0, 1,  -5, 17, 58, -10, 4, -1
725
+
726
+.macro LUMA_VPP_4xN h
727
+function x265_interp_8tap_vert_pp_4x\h\()_neon
728
+    ldr         r12, [sp]
729
+    push        {lr}
730
+    adr         lr, g_luma_s16
731
+    sub         r0, r1
732
+    sub         r0, r0, r1, lsl #1          // src -= 3 * srcStride
733
+    add         lr, lr, r12, lsl #4
734
+    vld1.16     {q0}, [lr, :64]             // q0 = luma interpolation coefficients
735
+    vdup.s16    d24, d0[0]
736
+    vdup.s16    d25, d0[1]
737
+    vdup.s16    d26, d0[2]
738
+    vdup.s16    d27, d0[3]
739
+    vdup.s16    d28, d1[0]
740
+    vdup.s16    d29, d1[1]
741
+    vdup.s16    d30, d1[2]
742
+    vdup.s16    d31, d1[3]
743
+
744
+    mov         r12, #\h
745
+
746
+    // prepare to load 8 lines
747
+    vld1.u32    {d0[0]}, [r0], r1
748
+    vld1.u32    {d0[1]}, [r0], r1
749
+    vld1.u32    {d2[0]}, [r0], r1
750
+    vld1.u32    {d2[1]}, [r0], r1
751
+    vld1.u32    {d4[0]}, [r0], r1
752
+    vld1.u32    {d4[1]}, [r0], r1
753
+    vld1.u32    {d6[0]}, [r0], r1
754
+    vld1.u32    {d6[1]}, [r0], r1
755
+    vmovl.u8    q0, d0
756
+    vmovl.u8    q1, d2
757
+    vmovl.u8    q2, d4
758
+    vmovl.u8    q3, d6
759
+
760
+.loop_4x\h:
761
+    // TODO: reading one extra row ahead could speed this up, but may crash on the OS X platform
762
+    vld1.u32    {d16[0]}, [r0], r1
763
+    vld1.u32    {d16[1]}, [r0], r1
764
+    vmovl.u8    q8, d16
765
+
766
+    // row[0-1]
767
+    vmul.s16    q9, q0, q12
768
+    vext.64     q11, q0, q1, 1
769
+    vmul.s16    q10, q11, q12
770
+    vmov        q0, q1
771
+
772
+    // row[2-3]
773
+    vmla.s16    q9, q1, q13
774
+    vext.64     q11, q1, q2, 1
775
+    vmla.s16    q10, q11, q13
776
+    vmov        q1, q2
777
+
778
+    // row[4-5]
779
+    vmla.s16    q9, q2, q14
780
+    vext.64     q11, q2, q3, 1
781
+    vmla.s16    q10, q11, q14
782
+    vmov        q2, q3
783
+
784
+    // row[6-7]
785
+    vmla.s16    q9, q3, q15
786
+    vext.64     q11, q3, q8, 1
787
+    vmla.s16    q10, q11, q15
788
+    vmov        q3, q8
789
+
790
+    // sum row[0-7]
791
+    vadd.s16    d18, d18, d19
792
+    vadd.s16    d19, d20, d21
793
+
794
+    vqrshrun.s16 d18, q9, #6
795
+    vst1.u32    {d18[0]}, [r2], r3
796
+    vst1.u32    {d18[1]}, [r2], r3
797
+
798
+    subs        r12, #2
799
+    bne         .loop_4x\h
800
+
801
+    pop         {pc}
802
+    .ltorg
803
+endfunc
804
+.endm
805
+
806
+LUMA_VPP_4xN 4
807
+LUMA_VPP_4xN 8
808
+LUMA_VPP_4xN 16
809
+
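g_luma_s16 holds the four HEVC luma 8-tap rows, one per quarter-pel phase (full, 1/4, 1/2, 3/4). LUMA_VPP_4xN splats the selected row across d24-d31, multiply-accumulates eight source rows in 16-bit arithmetic, and narrows with vqrshrun.s16 #6, i.e. a saturating (sum + 32) >> 6. A scalar sketch of what each instantiation computes (the helper name is invented; this is not the project's C reference):

    #include <stdint.h>

    static const int16_t luma_taps[4][8] = {      /* copy of g_luma_s16 */
        {  0, 0,   0, 64,  0,   0, 0,  0 },
        { -1, 4, -10, 58, 17,  -5, 1,  0 },
        { -1, 4, -11, 40, 40, -11, 4, -1 },
        {  0, 1,  -5, 17, 58, -10, 4, -1 },
    };

    static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

    /* Vertical 8-tap pixel-to-pixel filter: reads three rows above and four
     * rows below each output row ("src -= 3 * srcStride" in the prologue). */
    static void interp_8tap_vert_pp_c(const uint8_t *src, intptr_t srcStride,
                                      uint8_t *dst, intptr_t dstStride,
                                      int width, int height, int coeffIdx)
    {
        const int16_t *c = luma_taps[coeffIdx];
        src -= 3 * srcStride;
        for (int y = 0; y < height; y++)
            for (int x = 0; x < width; x++)
            {
                int sum = 0;
                for (int i = 0; i < 8; i++)
                    sum += c[i] * src[(y + i) * srcStride + x];
                dst[y * dstStride + x] = clip_u8((sum + 32) >> 6);
            }
    }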
810
+.macro qpel_filter_0_32b
811
+    vmov.i16        d17, #64
812
+    vmovl.u8        q11, d3
813
+    vmull.s16       q9, d22, d17    // 64*d0
814
+    vmull.s16       q10, d23, d17   // 64*d1
815
+.endm
816
+
817
+.macro qpel_filter_1_32b
818
+    vmov.i16        d16, #58
819
+    vmovl.u8        q11, d3
820
+    vmull.s16       q9, d22, d16        // 58 * d0
821
+    vmull.s16       q10, d23, d16       // 58 * d1
822
+
823
+    vmov.i16        d17, #10
824
+    vmovl.u8        q13, d2
825
+    vmull.s16       q11, d26, d17       // 10 * c0
826
+    vmull.s16       q12, d27, d17       // 10 * c1
827
+
828
+    vmov.i16        d16, #17
829
+    vmovl.u8        q15, d4
830
+    vmull.s16       q13, d30, d16       // 17 * e0
831
+    vmull.s16       q14, d31, d16       // 17 * e1
832
+
833
+    vmov.i16        d17, #5
834
+    vmovl.u8        q1, d5
835
+    vmull.s16       q15, d2, d17        //  5 * f0
836
+    vmull.s16       q8, d3, d17         //  5 * f1
837
+
838
+    vsub.s32        q9, q11             // 58 * d0 - 10 * c0
839
+    vsub.s32        q10, q12            // 58 * d1 - 10 * c1
840
+
841
+    vmovl.u8       q1, d1
842
+    vshll.s16      q11, d2, #2         // 4 * b0
843
+    vshll.s16      q12, d3, #2         // 4 * b1
844
+
845
+    vadd.s32       q9, q13             // 58 * d0 - 10 * c0 + 17 * e0
846
+    vadd.s32       q10, q14            // 58 * d1 - 10 * c1 + 17 * e1
847
+
848
+    vmovl.u8       q1, d0
849
+    vmovl.u8       q2, d6
850
+    vsubl.s16      q13, d4, d2         // g0 - a0
851
+    vsubl.s16      q14, d5, d3         // g1 - a1
852
+
853
+    vadd.s32       q9, q11             // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
854
+    vadd.s32       q10, q12            // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
855
+    vsub.s32       q13, q15            // g0 - a0 - 5 * f0
856
+    vsub.s32       q14, q8             // g1 - a1 - 5 * f1
857
+    vadd.s32       q9, q13             // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
858
+    vadd.s32       q10, q14            // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
859
+.endm
860
+
861
+.macro qpel_filter_2_32b
862
+    vmov.i32        q8, #11
863
+    vmovl.u8        q11, d3
864
+    vmovl.u8        q12, d4
865
+    vaddl.s16       q9, d22, d24       // d0 + e0
866
+    vaddl.s16       q10, d23, d25      // d1 + e1
867
+
868
+    vmovl.u8        q13, d2            //c
869
+    vmovl.u8        q14, d5            //f
870
+    vaddl.s16       q11, d26, d28      // c0 + f0
871
+    vaddl.s16       q12, d27, d29      // c1 + f1
872
+
873
+    vmul.s32        q11, q8            // 11 * (c0 + f0)
874
+    vmul.s32        q12, q8            // 11 * (c1 + f1)
875
+
876
+    vmov.i32        q8, #40
877
+    vmul.s32        q9, q8             // 40 * (d0 + e0)
878
+    vmul.s32        q10, q8            // 40 * (d1 + e1)
879
+
880
+    vmovl.u8        q13, d1            //b
881
+    vmovl.u8        q14, d6            //g
882
+    vaddl.s16       q15, d26, d28      // b0 + g0
883
+    vaddl.s16       q8, d27, d29       // b1 + g1
884
+
885
+    vmovl.u8        q1, d0             //a
886
+    vmovl.u8        q2, d7             //h
887
+    vaddl.s16       q13, d2, d4        // a0 + h0
888
+    vaddl.s16       q14, d3, d5        // a1 + h1
889
+
890
+    vshl.s32        q15, #2            // 4*(b0+g0)
891
+    vshl.s32        q8, #2             // 4*(b1+g1)
892
+
893
+    vadd.s32        q11, q13           // 11 * (c0 + f0) + a0 + h0
894
+    vadd.s32        q12, q14           // 11 * (c1 + f1) + a1 + h1
895
+    vadd.s32        q9, q15            // 40 * (d0 + e0) + 4*(b0+g0)
896
+    vadd.s32        q10, q8            // 40 * (d1 + e1) + 4*(b1+g1)
897
+    vsub.s32        q9, q11            // 40 * (d0 + e0) + 4*(b0+g0) - (11 * (c0 + f0) + a0 + h0)
898
+    vsub.s32        q10, q12           // 40 * (d1 + e1) + 4*(b1+g1) - (11 * (c1 + f1) + a1 + h1)
899
+.endm
900
+
901
+.macro qpel_filter_3_32b
902
+
903
+    vmov.i16        d16, #17
904
+    vmov.i16        d17, #5
905
+
906
+    vmovl.u8        q11, d3
907
+    vmull.s16       q9, d22, d16       // 17 * d0
908
+    vmull.s16       q10, d23, d16      // 17 * d1
909
+
910
+    vmovl.u8        q13, d2
911
+    vmull.s16       q11, d26, d17      // 5 * c0
912
+    vmull.s16       q12, d27, d17      // 5* c1
913
+
914
+    vmov.i16        d16, #58
915
+    vmovl.u8        q15, d4
916
+    vmull.s16       q13, d30, d16      // 58 * e0
917
+    vmull.s16       q14, d31, d16      // 58 * e1
918
+
919
+    vmov.i16        d17, #10
920
+    vmovl.u8        q1, d5
921
+    vmull.s16       q15, d2, d17       // 10 * f0
922
+    vmull.s16       q8, d3, d17        // 10 * f1
923
+
924
+    vsub.s32        q9, q11            // 17 * d0 - 5 * c0
925
+    vsub.s32        q10, q12           // 17 * d1 - 5 * c1
926
+
927
+    vmovl.u8        q1, d6
928
+    vshll.s16       q11, d2, #2        // 4 * g0
929
+    vshll.s16       q12, d3, #2        // 4 * g1
930
+
931
+    vadd.s32        q9, q13            // 17 * d0 - 5 * c0+ 58 * e0
932
+    vadd.s32        q10, q14           // 17 * d1 - 5 * c1 + 58 * e1
933
+
934
+    vmovl.u8        q1, d1
935
+    vmovl.u8        q2, d7
936
+    vsubl.s16      q13, d2, d4         // b0 - h0
937
+    vsubl.s16      q14, d3, d5         // b1 - h1
938
+
939
+    vadd.s32        q9, q11            // 17 * d0 - 5 * c0+ 58 * e0 +4 * g0
940
+    vadd.s32        q10, q12           // 17 * d1 - 5 * c1 + 58 * e1+4 * g1
941
+    vsub.s32        q13, q15           // 17 * d0 - 5 * c0+ 58 * e0 +4 * g0 -10 * f0
942
+    vsub.s32        q14, q8            // 17 * d1 - 5 * c1 + 58 * e1+4 * g1 - 10*f1
943
+    vadd.s32        q9, q13            //  17 * d0 - 5 * c0+ 58 * e0 +4 * g0 -10 * f0 +b0 - h0
944
+    vadd.s32        q10, q14           // 17 * d1 - 5 * c1 + 58 * e1+4 * g1 - 10*f1 + b1 - h1
945
+.endm
946
+
947
+.macro FILTER_VPP a b filterv
948
+
949
+.loop_\filterv\()_\a\()x\b:
950
+
951
+    mov             r7, r2
952
+    mov             r6, r0
953
+    eor             r8, r8
954
+
955
+.loop_w8_\filterv\()_\a\()x\b:
956
+
957
+    add             r6, r0, r8
958
+
959
+    pld [r6]
960
+    vld1.u8         d0, [r6], r1
961
+    pld [r6]
962
+    vld1.u8         d1, [r6], r1
963
+    pld [r6]
964
+    vld1.u8         d2, [r6], r1
965
+    pld [r6]
966
+    vld1.u8         d3, [r6], r1
967
+    pld [r6]
968
+    vld1.u8         d4, [r6], r1
969
+    pld [r6]
970
+    vld1.u8         d5, [r6], r1
971
+    pld [r6]
972
+    vld1.u8         d6, [r6], r1
973
+    pld [r6]
974
+    vld1.u8         d7, [r6], r1
975
+
976
+    veor.u8         q9, q9
977
+    veor.u8         q10, q10
978
+
979
+   \filterv
980
+
981
+    mov             r12,#32
982
+    vdup.32         q8, r12
983
+    vadd.s32        q9, q8
984
+    vqshrun.s32     d0, q9, #6
985
+    vadd.s32        q10, q8
986
+    vqshrun.s32     d1, q10, #6
987
+    vqmovn.u16      d0, q0
988
+    vst1.u8         d0, [r7]!
989
+
990
+    add             r8, #8
991
+    cmp             r8, #\a
992
+    blt             .loop_w8_\filterv\()_\a\()x\b
993
+
994
+    add             r0, r1
995
+    add             r2, r3
996
+    subs            r4, #1
997
+    bne             .loop_\filterv\()_\a\()x\b 
998
+
999
+.endm 
1000
+
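FILTER_VPP fixes only the loop shape: r4 counts rows, r8 walks each row in 8-pixel strips, and the substituted qpel_filter_*_32b macro does the arithmetic on one strip; the explicit +32 bias plus vqshrun #6 matches the rounding of the 4xN path above. The control flow with the NEON body abstracted behind a callback (a sketch; the typedef and names are invented):

    #include <stdint.h>

    typedef void (*strip_fn)(const uint8_t *src, intptr_t srcStride, uint8_t *dst);

    static void filter_vpp_shape(const uint8_t *src, intptr_t srcStride,
                                 uint8_t *dst, intptr_t dstStride,
                                 int width, int height, strip_fn f)
    {
        for (int y = 0; y < height; y++)            /* subs r4, #1             */
        {
            for (int x = 0; x < width; x += 8)      /* add r8, #8; cmp r8, #\a */
                f(src + x, srcStride, dst + x);     /* \filterv expansion      */
            src += srcStride;                       /* add r0, r1              */
            dst += dstStride;                       /* add r2, r3              */
        }
    }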
1001
+.macro LUMA_VPP  w h
1002
+function x265_interp_8tap_vert_pp_\w\()x\h\()_neon
1003
+
1004
+    push            {r4, r5, r6, r7, r8}
1005
+    ldr             r5, [sp, #4 * 5]
1006
+    mov             r4, r1, lsl #2
1007
+    sub             r4, r1
1008
+    sub             r0, r4
1009
+    mov             r4, #\h
1010
+
1011
+    cmp             r5, #0
1012
+    beq              0f
1013
+    cmp             r5, #1
1014
+    beq              1f
1015
+    cmp             r5, #2
1016
+    beq              2f
1017
+    cmp             r5, #3
1018
+    beq              3f
1019
+0:
1020
+    FILTER_VPP  \w \h qpel_filter_0_32b
1021
+    b            5f
1022
+1:
1023
+    FILTER_VPP  \w \h qpel_filter_1_32b
1024
+    b            5f
1025
+2:
1026
+    FILTER_VPP  \w \h qpel_filter_2_32b
1027
+    b            5f
1028
+3:
1029
+    FILTER_VPP  \w \h qpel_filter_3_32b
1030
+    b            5f
1031
+5:
1032
+    pop             {r4, r5, r6, r7, r8}
1033
+    bx              lr
1034
+endfunc
1035
+.endm
1036
+
1037
+LUMA_VPP 8 4
1038
+LUMA_VPP 8 8
1039
+LUMA_VPP 8 16
1040
+LUMA_VPP 8 32
1041
+LUMA_VPP 16 4
1042
+LUMA_VPP 16 8
1043
+LUMA_VPP 16 16
1044
+LUMA_VPP 16 32
1045
+LUMA_VPP 16 64
1046
+LUMA_VPP 16 12
1047
+LUMA_VPP 32 8
1048
+LUMA_VPP 32 16
1049
+LUMA_VPP 32 32
1050
+LUMA_VPP 32 64
1051
+LUMA_VPP 32 24
1052
+LUMA_VPP 64 16
1053
+LUMA_VPP 64 32
1054
+LUMA_VPP 64 64
1055
+LUMA_VPP 64 48
1056
+LUMA_VPP 24 32
1057
+LUMA_VPP 48 64
1058
+
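Each wrapper selects the filter body at run time from coeffIdx (the fifth argument, loaded into r5) via a cmp/beq ladder into the numeric labels 0: to 3:; index 0 doubles as the fall-through default, and the `b 5f` under 3: is redundant since 5: follows immediately. In C the ladder collapses to a switch (stub names standing in for the four FILTER_VPP expansions):

    static void filter_idx0(void) {}  /* qpel_filter_0_32b path */
    static void filter_idx1(void) {}  /* qpel_filter_1_32b path */
    static void filter_idx2(void) {}  /* qpel_filter_2_32b path */
    static void filter_idx3(void) {}  /* qpel_filter_3_32b path */

    static void luma_vpp_dispatch(int coeffIdx)
    {
        switch (coeffIdx)                 /* cmp r5, #N / beq Nf */
        {
        case 1:  filter_idx1(); break;
        case 2:  filter_idx2(); break;
        case 3:  filter_idx3(); break;
        default: filter_idx0(); break;    /* label 0: is also the default */
        }
    }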
1059
+function x265_interp_8tap_vert_pp_12x16_neon
1060
+    push            {r4, r5, r6, r7}
1061
+    ldr             r5, [sp, #4 * 4]
1062
+    mov             r4, r1, lsl #2
1063
+    sub             r4, r1
1064
+    sub             r0, r4
1065
+
1066
+    mov             r4, #16
1067
+.loop_vpp_12x16:
1068
+
1069
+    mov             r6, r0
1070
+    mov             r7, r2
1071
+
1072
+    pld [r6]
1073
+    vld1.u8         d0, [r6], r1
1074
+    pld [r6]
1075
+    vld1.u8         d1, [r6], r1
1076
+    pld [r6]
1077
+    vld1.u8         d2, [r6], r1
1078
+    pld [r6]
1079
+    vld1.u8         d3, [r6], r1
1080
+    pld [r6]
1081
+    vld1.u8         d4, [r6], r1
1082
+    pld [r6]
1083
+    vld1.u8         d5, [r6], r1
1084
+    pld [r6]
1085
+    vld1.u8         d6, [r6], r1
1086
+    pld [r6]
1087
+    vld1.u8         d7, [r6], r1
1088
+
1089
+    veor.u8         q9, q9
1090
+    veor.u8         q10, q10
1091
+
1092
+    cmp             r5,#0
1093
+    beq              0f
1094
+    cmp             r5,#1
1095
+    beq              1f
1096
+    cmp             r5,#2
1097
+    beq              2f
1098
+    cmp             r5,#3
1099
+    beq              3f
1100
+0:
1101
+    qpel_filter_0_32b
1102
+    b            5f
1103
+1:
1104
+    qpel_filter_1_32b
1105
+    b            5f
1106
+2:
1107
+    qpel_filter_2_32b
1108
+    b            5f
1109
+3:
1110
+    qpel_filter_3_32b
1111
+    b            5f
1112
+5:
1113
+    mov             r12,#32
1114
+    vdup.32         q8, r12
1115
+    vadd.s32        q9, q8
1116
+    vqshrun.s32     d0, q9, #6
1117
+    vadd.s32        q10, q8
1118
+    vqshrun.s32     d1, q10, #6
1119
+    vqmovn.u16      d0, q0
1120
+    vst1.u8         d0, [r7]!
1121
+
1122
+    add             r6, r0, #8
1123
+
1124
+    pld [r6]
1125
+    vld1.u8         d0, [r6], r1
1126
+    pld [r6]
1127
+    vld1.u8         d1, [r6], r1
1128
+    pld [r6]
1129
+    vld1.u8         d2, [r6], r1
1130
+    pld [r6]
1131
+    vld1.u8         d3, [r6], r1
1132
+    pld [r6]
1133
+    vld1.u8         d4, [r6], r1
1134
+    pld [r6]
1135
+    vld1.u8         d5, [r6], r1
1136
+    pld [r6]
1137
+    vld1.u8         d6, [r6], r1
1138
+    pld [r6]
1139
+    vld1.u8         d7, [r6], r1
1140
+
1141
+    veor.u8         q9, q9
1142
+    veor.u8         q10, q10
1143
+
1144
+    cmp             r5,#0
1145
+    beq              0f
1146
+    cmp             r5,#1
1147
+    beq              1f
1148
+    cmp             r5,#2
1149
+    beq              2f
1150
+    cmp             r5,#3
1151
+    beq              3f
1152
+0:
1153
+    qpel_filter_0_32b
1154
+    b            5f
1155
+1:
1156
+    qpel_filter_1_32b
1157
+    b            5f
1158
+2:
1159
+    qpel_filter_2_32b
1160
+    b            5f
1161
+3:
1162
+    qpel_filter_3_32b
1163
+    b            5f
1164
+5:
1165
+    mov             r12,#32
1166
+    vdup.32         q8, r12
1167
+    vadd.s32        q9, q8
1168
+    vqshrun.s32     d0, q9, #6
1169
+    vadd.s32        q10, q8
1170
+    vqshrun.s32     d1, q10, #6
1171
+    vqmovn.u16      d0, q0
1172
+    vst1.u32        d0[0], [r7]!
1173
+
1174
+    add             r0, r1
1175
+    add             r2, r3
1176
+    subs            r4, #1
1177
+    bne             .loop_vpp_12x16
1178
+
1179
+    pop             {r4, r5, r6, r7}
1180
+    bx              lr
1181
+endfunc
1182
+//**************luma_vsp************
1183
+.macro LUMA_VSP_4xN h
1184
+function x265_interp_8tap_vert_sp_4x\h\()_neon
1185
+    push            {r4, r5, r6}
1186
+    ldr             r4, [sp, #4 * 3]
1187
+    mov             r5, r4, lsl #6
1188
+    lsl             r1, #1
1189
+    mov             r4, r1, lsl #2
1190
+    sub             r4, r1
1191
+    sub             r0, r4
1192
+
1193
+    mov             r12, #1
1194
+    lsl             r12, #19
1195
+    add             r12, #2048
1196
+    vdup.32         q8, r12
1197
+    mov             r4, #\h
1198
+.loop_vsp_4x\h:
1199
+    movrel          r12, g_lumaFilter
1200
+    add             r12, r5
1201
+    mov             r6, r0
1202
+
1203
+    pld [r6]
1204
+    vld1.u16         d0, [r6], r1
1205
+    pld [r6]
1206
+    vld1.u16         d1, [r6], r1
1207
+    pld [r6]
1208
+    vld1.u16         d2, [r6], r1
1209
+    pld [r6]
1210
+    vld1.u16         d3, [r6], r1
1211
+    pld [r6]
1212
+    vld1.u16         d4, [r6], r1
1213
+    pld [r6]
1214
+    vld1.u16         d5, [r6], r1
1215
+    pld [r6]
1216
+    vld1.u16         d6, [r6], r1
1217
+    pld [r6]
1218
+    vld1.u16         d7, [r6], r1
1219
+
1220
+    veor.u8         q9, q9
1221
+
1222
+    vmovl.s16       q11, d0
1223
+    vld1.s32        d24, [r12]!
1224
+    vmov.s32        d25, d24
1225
+    vmla.s32        q9, q12, q11
1226
+
1227
+    vmovl.s16       q11, d1
1228
+    vld1.s32        d24, [r12]!
1229
+    vmov.s32        d25, d24
1230
+    vmla.s32        q9, q12, q11
1231
+
1232
+    vmovl.s16       q11, d2
1233
+    vld1.s32        d24, [r12]!
1234
+    vmov.s32        d25, d24
1235
+    vmla.s32        q9, q12, q11
1236
+
1237
+    vmovl.s16       q11, d3
1238
+    vld1.s32        d24, [r12]!
1239
+    vmov.s32        d25, d24
1240
+    vmla.s32        q9, q12, q11
1241
+
1242
+    vmovl.s16       q11, d4
1243
+    vld1.s32        d24, [r12]!
1244
+    vmov.s32        d25, d24
1245
+    vmla.s32        q9, q12, q11
1246
+
1247
+    vmovl.s16       q11, d5
1248
+    vld1.s32        d24, [r12]!
1249
+    vmov.s32        d25, d24
1250
+    vmla.s32        q9, q12, q11
1251
+
1252
+    vmovl.s16       q11, d6
1253
+    vld1.s32        d24, [r12]!
1254
+    vmov.s32        d25, d24
1255
+    vmla.s32        q9, q12, q11
1256
+
1257
+    vmovl.s16       q11, d7
1258
+    vld1.s32        d24, [r12]!
1259
+    vmov.s32        d25, d24
1260
+    vmla.s32        q9, q12, q11
1261
+
1262
+
1263
+    vadd.s32        q9, q8
1264
+    vqshrun.s32     d0, q9, #12
1265
+    vqmovn.u16      d0, q0
1266
+    vst1.u32        d0[0], [r2], r3
1267
+
1268
+    add             r0, r1
1269
+    subs            r4, #1
1270
+    bne             .loop_vsp_4x\h
1271
+    pop             {r4, r5, r6}
1272
+    bx              lr
1273
+    .ltorg
1274
+endfunc
1275
+.endm
1276
+
1277
+LUMA_VSP_4xN 4
1278
+LUMA_VSP_4xN 8
1279
+LUMA_VSP_4xN 16
1280
+
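The constant built in r12 for the sp functions, (1 << 19) + 2048, is (8192 << 6) + (1 << 11): it restores the coefficient-domain offset introduced by filterPixelToShort (scaled by the 6-bit filter precision) and adds the rounding term for the combined shift by 12 that vqshrun.s32 #12 performs. A scalar sketch of the output stage:

    #include <stdint.h>

    static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

    /* sum = sum over i of coeff[i] * src16[i], src16 in the p*64 - 8192 domain. */
    static uint8_t vert_sp_round(int32_t sum)
    {
        return clip_u8((sum + (1 << 19) + 2048) >> 12);
    }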
1281
+.macro qpel_filter_0_32b_1
1282
+    vmov.i16        d17, #64
1283
+    vmull.s16       q9, d6, d17    // 64*d0
1284
+    vmull.s16       q10, d7, d17   // 64*d1
1285
+.endm
1286
+
1287
+.macro qpel_filter_1_32b_1
1288
+    vmov.i16        d16, #58
1289
+    vmov.i16        d17, #10
1290
+    vmull.s16       q9, d6, d16    // 58 * d0
1291
+    vmull.s16       q10, d7, d16   // 58 * d1
1292
+    vmov.i16        d16, #17
1293
+    vmull.s16       q11, d4, d17   // 10 * c0
1294
+    vmull.s16       q12, d5, d17   // 10 * c1
1295
+    vmov.i16        d17, #5
1296
+    vmull.s16       q13, d8, d16   // 17 * e0
1297
+    vmull.s16       q14, d9, d16   // 17 * e1
1298
+    vmull.s16       q15, d10, d17  //  5 * f0
1299
+    vmull.s16       q8, d11, d17   //  5 * f1
1300
+    vsub.s32        q9, q11        // 58 * d0 - 10 * c0
1301
+    vsub.s32        q10, q12       // 58 * d1 - 10 * c1
1302
+    vshll.s16       q11, d2, #2    // 4 * b0
1303
+    vshll.s16       q12, d3, #2    // 4 * b1
1304
+    vadd.s32        q9, q13        // 58 * d0 - 10 * c0 + 17 * e0
1305
+    vadd.s32        q10, q14       // 58 * d1 - 10 * c1 + 17 * e1
1306
+    vsubl.s16       q13, d12, d0   // g0 - a0
1307
+    vsubl.s16       q14, d13, d1   // g1 - a1
1308
+    vadd.s32        q9, q11        // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
1309
+    vadd.s32        q10, q12       // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
1310
+    vsub.s32        q13, q15       // g0 - a0 - 5 * f0
1311
+    vsub.s32        q14, q8        // g1 - a1 - 5 * f1
1312
+    vadd.s32        q9, q13        // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
1313
+    vadd.s32        q10, q14       // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
1314
+.endm
1315
+
1316
+.macro qpel_filter_2_32b_1
1317
+    vmov.i32        q8, #11
1318
+    vaddl.s16       q9, d6, d8    // d0 + e0
1319
+    vaddl.s16       q10, d7, d9   // d1 + e1
1320
+    vaddl.s16       q11, d4, d10  // c0 + f0
1321
+    vaddl.s16       q12, d5, d11  // c1 + f1
1322
+    vmul.s32        q11, q8       // 11 * (c0 + f0)
1323
+    vmul.s32        q12, q8       // 11 * (c1 + f1)
1324
+    vmov.i32        q8, #40
1325
+    vaddl.s16       q15, d2, d12  // b0 + g0
1326
+    vmul.s32        q9, q8        // 40 * (d0 + e0)
1327
+    vmul.s32        q10, q8       // 40 * (d1 + e1)
1328
+    vaddl.s16       q8, d3, d13   // b1 + g1
1329
+    vaddl.s16       q13, d0, d14  // a0 + h0
1330
+    vaddl.s16       q14, d1, d15  // a1 + h1
1331
+    vshl.s32        q15, #2       // 4*(b0+g0)
1332
+    vshl.s32        q8, #2        // 4*(b1+g1)
1333
+    vadd.s32        q11, q13      // 11 * (c0 + f0) + a0 + h0
1334
+    vadd.s32        q12, q14      // 11 * (c1 + f1) + a1 + h1
1335
+    vadd.s32        q9, q15       // 40 * (d0 + e0) + 4*(b0+g0)
1336
+    vadd.s32        q10, q8       // 40 * (d1 + e1) + 4*(b1+g1)
1337
+    vsub.s32        q9, q11       // 40 * (d0 + e0) + 4*(b0+g0) - (11 * (c0 + f0) + a0 + h0)
1338
+    vsub.s32        q10, q12      // 40 * (d1 + e1) + 4*(b1+g1) - (11 * (c1 + f1) + a1 + h1) 
1339
+.endm
1340
+
1341
+.macro qpel_filter_3_32b_1
1342
+    vmov.i16        d16, #17
1343
+    vmov.i16        d17, #5
1344
+    vmull.s16       q9, d6, d16   // 17 * d0
1345
+    vmull.s16       q10, d7, d16  // 17 * d1
1346
+    vmull.s16       q11, d4, d17  // 5 * c0
1347
+    vmull.s16       q12, d5, d17  // 5* c1
1348
+    vmov.i16        d16, #58
1349
+    vmull.s16       q13, d8, d16  // 58 * e0
1350
+    vmull.s16       q14, d9, d16  // 58 * e1
1351
+    vmov.i16        d17, #10
1352
+    vmull.s16       q15, d10, d17 // 10 * f0
1353
+    vmull.s16       q8, d11, d17  // 10 * f1
1354
+    vsub.s32        q9, q11       // 17 * d0 - 5 * c0
1355
+    vsub.s32        q10, q12      // 17 * d1 - 5 * c1
1356
+    vshll.s16       q11, d12, #2  // 4 * g0
1357
+    vshll.s16       q12, d13, #2  // 4 * g1
1358
+    vadd.s32        q9, q13       // 17 * d0 - 5 * c0+ 58 * e0
1359
+    vadd.s32        q10, q14      // 17 * d1 - 5 * c1 + 58 * e1
1360
+    vsubl.s16       q13, d2, d14  // b0 - h0
1361
+    vsubl.s16       q14, d3, d15  // b1 - h1
1362
+    vadd.s32        q9, q11       // 17 * d0 - 5 * c0+ 58 * e0 +4 * g0
1363
+    vadd.s32        q10, q12      // 17 * d1 - 5 * c1 + 58 * e1+4 * g1
1364
+    vsub.s32        q13, q15      // 17 * d0 - 5 * c0+ 58 * e0 +4 * g0 -10 * f0
1365
+    vsub.s32        q14, q8       // 17 * d1 - 5 * c1 + 58 * e1+4 * g1 - 10*f1
1366
+    vadd.s32        q9, q13       //  17 * d0 - 5 * c0+ 58 * e0 +4 * g0 -10 * f0 +b0 - h0
1367
+    vadd.s32        q10, q14      // 17 * d1 - 5 * c1 + 58 * e1+4 * g1 - 10*f1 + b1 - h1
1368
+.endm
1369
+
1370
+.macro FILTER_VSP a b filterv
1371
+
1372
+    vpush           { q4 - q7}
1373
+.loop_\filterv\()_\a\()x\b:
1374
+
1375
+    mov             r7, r2
1376
+    mov             r6, r0
1377
+    eor             r8, r8
1378
+
1379
+.loop_w8_\filterv\()_\a\()x\b:
1380
+
1381
+    add             r6, r0, r8
1382
+
1383
+    pld [r6]
1384
+    vld1.u16         {q0}, [r6], r1
1385
+    pld [r6]
1386
+    vld1.u16         {q1}, [r6], r1
1387
+    pld [r6]
1388
+    vld1.u16         {q2}, [r6], r1
1389
+    pld [r6]
1390
+    vld1.u16         {q3}, [r6], r1
1391
+    pld [r6]
1392
+    vld1.u16         {q4}, [r6], r1
1393
+    pld [r6]
1394
+    vld1.u16         {q5}, [r6], r1
1395
+    pld [r6]
1396
+    vld1.u16         {q6}, [r6], r1
1397
+    pld [r6]
1398
+    vld1.u16         {q7}, [r6], r1
1399
+
1400
+    veor.u8         q9, q9
1401
+    veor.u8         q10, q10
1402
+
1403
+   \filterv
1404
+
1405
+    mov             r12,#1
1406
+    lsl             r12, #19
1407
+    add             r12, #2048
1408
+    vdup.32         q8, r12
1409
+    vadd.s32        q9, q8
1410
+    vqshrun.s32     d0, q9, #12
1411
+    vadd.s32        q10, q8
1412
+    vqshrun.s32     d1, q10, #12
1413
+    vqmovn.u16      d0, q0
1414
+    vst1.u8         d0, [r7]!
1415
+
1416
+
1417
+    add             r8, #16
1418
+    mov             r12, #\a
1419
+    lsl             r12, #1
1420
+    cmp             r8, r12
1421
+    blt             .loop_w8_\filterv\()_\a\()x\b
1422
+
1423
+    add             r0, r1
1424
+    add             r2, r3
1425
+    subs            r4, #1
1426
+    bne             .loop_\filterv\()_\a\()x\b
1427
+
1428
+    vpop            { q4 - q7}
1429
+
1430
+.endm 
1431
+
1432
+.macro LUMA_VSP  w h
1433
+function x265_interp_8tap_vert_sp_\w\()x\h\()_neon
1434
+
1435
+    push            {r4, r5, r6, r7, r8}
1436
+    ldr             r5, [sp, #4 * 5]
1437
+    lsl             r1, #1
1438
+    mov             r4, r1, lsl #2
1439
+    sub             r4, r1
1440
+    sub             r0, r4
1441
+    mov             r4, #\h
1442
+
1443
+    cmp             r5, #0
1444
+    beq              0f
1445
+    cmp             r5, #1
1446
+    beq              1f
1447
+    cmp             r5, #2
1448
+    beq              2f
1449
+    cmp             r5, #3
1450
+    beq              3f
1451
+0:
1452
+    FILTER_VSP  \w \h qpel_filter_0_32b_1
1453
+    b            5f
1454
+1:
1455
+    FILTER_VSP  \w \h qpel_filter_1_32b_1
1456
+    b            5f
1457
+2:
1458
+    FILTER_VSP  \w \h qpel_filter_2_32b_1
1459
+    b            5f
1460
+3:
1461
+    FILTER_VSP  \w \h qpel_filter_3_32b_1
1462
+    b            5f
1463
+5:
1464
+    pop             {r4, r5, r6, r7, r8}
1465
+    bx              lr
1466
+endfunc
1467
+.endm
1468
+
1469
+
1470
+LUMA_VSP 8 4
1471
+LUMA_VSP 8 8
1472
+LUMA_VSP 8 16
1473
+LUMA_VSP 8 32
1474
+LUMA_VSP 16 4
1475
+LUMA_VSP 16 8
1476
+LUMA_VSP 16 16
1477
+LUMA_VSP 16 32
1478
+LUMA_VSP 16 64
1479
+LUMA_VSP 16 12
1480
+LUMA_VSP 32 8
1481
+LUMA_VSP 32 16
1482
+LUMA_VSP 32 32
1483
+LUMA_VSP 32 64
1484
+LUMA_VSP 32 24
1485
+LUMA_VSP 64 16
1486
+LUMA_VSP 64 32
1487
+LUMA_VSP 64 64
1488
+LUMA_VSP 64 48
1489
+LUMA_VSP 24 32
1490
+LUMA_VSP 48 64
1491
+
1492
+function x265_interp_8tap_vert_sp_12x16_neon
1493
+    push            {r4, r5, r6, r7}
1494
+    ldr             r5, [sp, #4 * 4]
1495
+    lsl             r1, #1
1496
+    mov             r4, r1, lsl #2
1497
+    sub             r4, r1
1498
+    sub             r0, r4
1499
+
1500
+    mov             r4, #16
1501
+    vpush           { q4 - q7}
1502
+.loop1_12x16:
1503
+
1504
+    mov             r6, r0
1505
+    mov             r7, r2
1506
+
1507
+    pld [r6]
1508
+    vld1.u16         {q0}, [r6], r1
1509
+    pld [r6]
1510
+    vld1.u16         {q1}, [r6], r1
1511
+    pld [r6]
1512
+    vld1.u16         {q2}, [r6], r1
1513
+    pld [r6]
1514
+    vld1.u16         {q3}, [r6], r1
1515
+    pld [r6]
1516
+    vld1.u16         {q4}, [r6], r1
1517
+    pld [r6]
1518
+    vld1.u16         {q5}, [r6], r1
1519
+    pld [r6]
1520
+    vld1.u16         {q6}, [r6], r1
1521
+    pld [r6]
1522
+    vld1.u16         {q7}, [r6], r1
1523
+
1524
+    veor.u8         q9, q9
1525
+    veor.u8         q10, q10
1526
+
1527
+    cmp             r5,#0
1528
+    beq              0f
1529
+    cmp             r5,#1
1530
+    beq              1f
1531
+    cmp             r5,#2
1532
+    beq              2f
1533
+    cmp             r5,#3
1534
+    beq              3f
1535
+0:
1536
+    qpel_filter_0_32b_1
1537
+    b            5f
1538
+1:
1539
+    qpel_filter_1_32b_1
1540
+    b            5f
1541
+2:
1542
+    qpel_filter_2_32b_1
1543
+    b            5f
1544
+3:
1545
+    qpel_filter_3_32b_1
1546
+    b            5f
1547
+5:
1548
+    mov             r12,#1
1549
+    lsl             r12, #19
1550
+    add             r12, #2048
1551
+    vdup.32         q8, r12
1552
+    vadd.s32        q9, q8
1553
+    vqshrun.s32     d0, q9, #12
1554
+    vadd.s32        q10, q8
1555
+    vqshrun.s32     d1, q10, #12
1556
+    vqmovn.u16      d0, q0
1557
+    vst1.u8         d0, [r7]!
1558
+
1559
+    add             r6, r0, #16
1560
+
1561
+    pld [r6]
1562
+    vld1.u16         {q0}, [r6], r1
1563
+    pld [r6]
1564
+    vld1.u16         {q1}, [r6], r1
1565
+    pld [r6]
1566
+    vld1.u16         {q2}, [r6], r1
1567
+    pld [r6]
1568
+    vld1.u16         {q3}, [r6], r1
1569
+    pld [r6]
1570
+    vld1.u16         {q4}, [r6], r1
1571
+    pld [r6]
1572
+    vld1.u16         {q5}, [r6], r1
1573
+    pld [r6]
1574
+    vld1.u16         {q6}, [r6], r1
1575
+    pld [r6]
1576
+    vld1.u16         {q7}, [r6], r1
1577
+
1578
+    veor.u8         q9, q9
1579
+    veor.u8         q10, q10
1580
+
1581
+    cmp             r5,#0
1582
+    beq              0f
1583
+    cmp             r5,#1
1584
+    beq              1f
1585
+    cmp             r5,#2
1586
+    beq              2f
1587
+    cmp             r5,#3
1588
+    beq              3f
1589
+0:
1590
+    qpel_filter_0_32b_1
1591
+    b            5f
1592
+1:
1593
+    qpel_filter_1_32b_1
1594
+    b            5f
1595
+2:
1596
+    qpel_filter_2_32b_1
1597
+    b            5f
1598
+3:
1599
+    qpel_filter_3_32b_1
1600
+    b            5f
1601
+5:
1602
+    mov             r12,#1
1603
+    lsl             r12, #19
1604
+    add             r12, #2048
1605
+    vdup.32         q8, r12
1606
+    vadd.s32        q9, q8
1607
+    vqshrun.s32     d0, q9, #12
1608
+    vadd.s32        q10, q8
1609
+    vqshrun.s32     d1, q10, #12
1610
+    vqmovn.u16      d0, q0
1611
+    vst1.u32        d0[0], [r7]!
1612
+
1613
+    add             r0, r1
1614
+    add             r2, r3
1615
+    subs            r4, #1
1616
+    bne             .loop1_12x16
1617
+    vpop            { q4 - q7}
1618
+    pop             {r4, r5, r6, r7}
1619
+    bx              lr
1620
+endfunc
1621
+//**************luma_vps*****************
1622
+.macro LUMA_VPS_4xN h
1623
+function x265_interp_8tap_vert_ps_4x\h\()_neon
1624
+    push           {r4, r5, r6}
1625
+    ldr             r4, [sp, #4 * 3]
1626
+    lsl             r3, #1
1627
+    mov             r5, r4, lsl #6
1628
+    mov             r4, r1, lsl #2
1629
+    sub             r4, r1
1630
+    sub             r0, r4
1631
+
1632
+    mov             r4, #8192
1633
+    vdup.32         q8, r4
1634
+    mov             r4, #\h
1635
+
1636
+.loop_vps_4x\h:
1637
+    movrel          r12, g_lumaFilter
1638
+    add             r12, r5
1639
+    mov             r6, r0
1640
+
1641
+    pld [r6]
1642
+    vld1.u32        d0[0], [r6], r1
1643
+    pld [r6]
1644
+    vld1.u32        d0[1], [r6], r1
1645
+    pld [r6]
1646
+    vld1.u32        d1[0], [r6], r1
1647
+    pld [r6]
1648
+    vld1.u32        d1[1], [r6], r1
1649
+    pld [r6]
1650
+    vld1.u32        d2[0], [r6], r1
1651
+    pld [r6]
1652
+    vld1.u32        d2[1], [r6], r1
1653
+    pld [r6]
1654
+    vld1.u32        d3[0], [r6], r1
1655
+    pld [r6]
1656
+    vld1.u32        d3[1], [r6], r1
1657
+
1658
+    veor.u8         q9, q9
1659
+
1660
+    vmovl.u8        q11, d0
1661
+    vmovl.u16       q12, d22
1662
+    vmovl.u16       q13, d23
1663
+    vld1.s32        d20, [r12]!
1664
+    vmov.s32        d21, d20
1665
+    vmla.s32        q9, q12, q10
1666
+    vld1.s32        d20, [r12]!
1667
+    vmov.s32        d21, d20
1668
+    vmla.s32        q9, q13, q10
1669
+
1670
+    vmovl.u8        q11, d1
1671
+    vmovl.u16       q12, d22
1672
+    vmovl.u16       q13, d23
1673
+    vld1.s32        d20, [r12]!
1674
+    vmov.s32        d21, d20
1675
+    vmla.s32        q9, q12, q10
1676
+    vld1.s32        d20, [r12]!
1677
+    vmov.s32        d21, d20
1678
+    vmla.s32        q9, q13, q10
1679
+
1680
+    vmovl.u8        q11, d2
1681
+    vmovl.u16       q12, d22
1682
+    vmovl.u16       q13, d23
1683
+    vld1.s32        d20, [r12]!
1684
+    vmov.s32        d21, d20
1685
+    vmla.s32        q9, q12, q10
1686
+    vld1.s32        d20, [r12]!
1687
+    vmov.s32        d21, d20
1688
+    vmla.s32        q9, q13, q10
1689
+
1690
+    vmovl.u8        q11, d3
1691
+    vmovl.u16       q12, d22
1692
+    vmovl.u16       q13, d23
1693
+    vld1.s32        d20, [r12]!
1694
+    vmov.s32        d21, d20
1695
+    vmla.s32        q9, q12, q10
1696
+    vld1.s32        d20, [r12]!
1697
+    vmov.s32        d21, d20
1698
+    vmla.s32        q9, q13, q10
1699
+
1700
+    vsub.s32        q9, q8
1701
+    vqmovn.s32      d0, q9
1702
+    vst1.u16        d0, [r2], r3
1703
+
1704
+    add             r0, r1
1705
+    subs            r4, #1
1706
+    bne             .loop_vps_4x\h
1707
+
1708
+    pop             {r4, r5, r6}
1709
+    bx              lr
1710
+    .ltorg
1711
+endfunc
1712
+.endm
1713
+
1714
+LUMA_VPS_4xN 4
1715
+LUMA_VPS_4xN 8
1716
+LUMA_VPS_4xN 16
1717
+
1718
+
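The ps variants leave the filtered sum at coefficient precision: at 8-bit depth no shift is needed, so the epilogue just subtracts the 8192 offset and saturates to int16 (the vsub.s32 from q8 followed by vqmovn.s32). A scalar sketch:

    #include <stdint.h>

    /* sum = sum over i of coeff[i] * src8[i]; the result lands in the same
     * offset domain that filterPixelToShort produces. */
    static int16_t vert_ps_round(int32_t sum)
    {
        int32_t v = sum - 8192;
        if (v < -32768) v = -32768;       /* vqmovn.s32 saturation */
        if (v >  32767) v =  32767;
        return (int16_t)v;
    }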
1719
+.macro FILTER_VPS a b filterv
1720
+
1721
+.loop_ps_\filterv\()_\a\()x\b:
1722
+
1723
+    mov             r7, r2
1724
+    mov             r6, r0
1725
+    eor             r8, r8
1726
+
1727
+.loop_ps_w8_\filterv\()_\a\()x\b:
1728
+
1729
+    add             r6, r0, r8
1730
+
1731
+    pld [r6]
1732
+    vld1.u8         d0, [r6], r1
1733
+    pld [r6]
1734
+    vld1.u8         d1, [r6], r1
1735
+    pld [r6]
1736
+    vld1.u8         d2, [r6], r1
1737
+    pld [r6]
1738
+    vld1.u8         d3, [r6], r1
1739
+    pld [r6]
1740
+    vld1.u8         d4, [r6], r1
1741
+    pld [r6]
1742
+    vld1.u8         d5, [r6], r1
1743
+    pld [r6]
1744
+    vld1.u8         d6, [r6], r1
1745
+    pld [r6]
1746
+    vld1.u8         d7, [r6], r1
1747
+
1748
+    veor.u8         q9, q9
1749
+    veor.u8         q10, q10
1750
+
1751
+   \filterv
1752
+
1753
+    mov             r12,#8192
1754
+    vdup.32         q8, r12
1755
+    vsub.s32        q9, q8
1756
+    vqmovn.s32      d0, q9
1757
+    vsub.s32        q10, q8
1758
+    vqmovn.s32      d1, q10
1759
+    vst1.u16         {q0}, [r7]!
1760
+
1761
+    add             r8, #8
1762
+    cmp             r8, #\a
1763
+    blt             .loop_ps_w8_\filterv\()_\a\()x\b
1764
+
1765
+    add             r0, r1
1766
+    add             r2, r3
1767
+    subs            r4, #1
1768
+    bne             .loop_ps_\filterv\()_\a\()x\b 
1769
+
1770
+.endm 
1771
+
1772
+.macro LUMA_VPS  w h
1773
+function x265_interp_8tap_vert_ps_\w\()x\h\()_neon
1774
+
1775
+    push            {r4, r5, r6, r7, r8}
1776
+    ldr             r5, [sp, #4 * 5]
1777
+    lsl             r3, #1
1778
+    mov             r4, r1, lsl #2
1779
+    sub             r4, r1
1780
+    sub             r0, r4
1781
+    mov             r4, #\h
1782
+
1783
+    cmp             r5, #0
1784
+    beq              0f
1785
+    cmp             r5, #1
1786
+    beq              1f
1787
+    cmp             r5, #2
1788
+    beq              2f
1789
+    cmp             r5, #3
1790
+    beq              3f
1791
+0:
1792
+    FILTER_VPS  \w \h qpel_filter_0_32b
1793
+    b            5f
1794
+1:
1795
+    FILTER_VPS  \w \h qpel_filter_1_32b
1796
+    b            5f
1797
+2:
1798
+    FILTER_VPS  \w \h qpel_filter_2_32b
1799
+    b            5f
1800
+3:
1801
+    FILTER_VPS  \w \h qpel_filter_3_32b
1802
+    b            5f
1803
+5:
1804
+    pop             {r4, r5, r6, r7, r8}
1805
+    bx              lr
1806
+endfunc
1807
+.endm
1808
+
1809
+LUMA_VPS 8 4
1810
+LUMA_VPS 8 8
1811
+LUMA_VPS 8 16
1812
+LUMA_VPS 8 32
1813
+LUMA_VPS 16 4
1814
+LUMA_VPS 16 8
1815
+LUMA_VPS 16 16
1816
+LUMA_VPS 16 32
1817
+LUMA_VPS 16 64
1818
+LUMA_VPS 16 12
1819
+LUMA_VPS 32 8
1820
+LUMA_VPS 32 16
1821
+LUMA_VPS 32 32
1822
+LUMA_VPS 32 64
1823
+LUMA_VPS 32 24
1824
+LUMA_VPS 64 16
1825
+LUMA_VPS 64 32
1826
+LUMA_VPS 64 64
1827
+LUMA_VPS 64 48
1828
+LUMA_VPS 24 32
1829
+LUMA_VPS 48 64
1830
+
1831
+function x265_interp_8tap_vert_ps_12x16_neon
1832
+    push            {r4, r5, r6, r7}
1833
+    lsl             r3, #1
1834
+    ldr             r5, [sp, #4 * 4]
1835
+    mov             r4, r1, lsl #2
1836
+    sub             r4, r1
1837
+    sub             r0, r4
1838
+
1839
+    mov             r4, #16
1840
+.loop_vps_12x16:
1841
+
1842
+    mov             r6, r0
1843
+    mov             r7, r2
1844
+
1845
+    pld [r6]
1846
+    vld1.u8         d0, [r6], r1
1847
+    pld [r6]
1848
+    vld1.u8         d1, [r6], r1
1849
+    pld [r6]
1850
+    vld1.u8         d2, [r6], r1
1851
+    pld [r6]
1852
+    vld1.u8         d3, [r6], r1
1853
+    pld [r6]
1854
+    vld1.u8         d4, [r6], r1
1855
+    pld [r6]
1856
+    vld1.u8         d5, [r6], r1
1857
+    pld [r6]
1858
+    vld1.u8         d6, [r6], r1
1859
+    pld [r6]
1860
+    vld1.u8         d7, [r6], r1
1861
+
1862
+    veor.u8         q9, q9
1863
+    veor.u8         q10, q10
1864
+
1865
+    cmp             r5,#0
1866
+    beq              0f
1867
+    cmp             r5,#1
1868
+    beq              1f
1869
+    cmp             r5,#2
1870
+    beq              2f
1871
+    cmp             r5,#3
1872
+    beq              3f
1873
+0:
1874
+    qpel_filter_0_32b
1875
+    b            5f
1876
+1:
1877
+    qpel_filter_1_32b
1878
+    b            5f
1879
+2:
1880
+    qpel_filter_2_32b
1881
+    b            5f
1882
+3:
1883
+    qpel_filter_3_32b
1884
+    b            5f
1885
+5:
1886
+    mov             r12,#8192
1887
+    vdup.32         q8, r12
1888
+    vsub.s32        q9, q8
1889
+    vqmovn.s32      d0, q9
1890
+    vsub.s32        q10, q8
1891
+    vqmovn.s32      d1, q10
1892
+    vst1.u16        {q0}, [r7]!
1893
+
1894
+    add             r6, r0, #8
1895
+
1896
+    pld [r6]
1897
+    vld1.u8         d0, [r6], r1
1898
+    pld [r6]
1899
+    vld1.u8         d1, [r6], r1
1900
+    pld [r6]
1901
+    vld1.u8         d2, [r6], r1
1902
+    pld [r6]
1903
+    vld1.u8         d3, [r6], r1
1904
+    pld [r6]
1905
+    vld1.u8         d4, [r6], r1
1906
+    pld [r6]
1907
+    vld1.u8         d5, [r6], r1
1908
+    pld [r6]
1909
+    vld1.u8         d6, [r6], r1
1910
+    pld [r6]
1911
+    vld1.u8         d7, [r6], r1
1912
+
1913
+    veor.u8         q9, q9
1914
+    veor.u8         q10, q10
1915
+
1916
+    cmp             r5,#0
1917
+    beq              0f
1918
+    cmp             r5,#1
1919
+    beq              1f
1920
+    cmp             r5,#2
1921
+    beq              2f
1922
+    cmp             r5,#3
1923
+    beq              3f
1924
+0:
1925
+    qpel_filter_0_32b
1926
+    b            5f
1927
+1:
1928
+    qpel_filter_1_32b
1929
+    b            5f
1930
+2:
1931
+    qpel_filter_2_32b
1932
+    b            5f
1933
+3:
1934
+    qpel_filter_3_32b
1935
+    b            5f
1936
+5:
1937
+    mov             r12,#8192
1938
+    vdup.32         q8, r12
1939
+    vsub.s32        q9, q8
1940
+    vqmovn.s32      d0, q9
1941
+    vst1.u16        d0, [r7]!
1942
+
1943
+    add             r0, r1
1944
+    add             r2, r3
1945
+    subs            r4, #1
1946
+    bne             .loop_vps_12x16
1947
+
1948
+    pop             {r4, r5, r6, r7}
1949
+    bx              lr
1950
+endfunc
1951
+
1952
+//************chroma_vpp************
1953
+
1954
+.macro qpel_filter_chroma_0_32b
1955
+    vmov.i16        d16, #64
1956
+    vmull.s16       q6, d6, d16    // 64*b0
1957
+    vmull.s16       q7, d7, d16   // 64*b1
1958
+.endm
1959
+
1960
+.macro qpel_filter_chroma_1_32b
1961
+    vmov.i16        d16, #58
1962
+    vmov.i16        d17, #10
1963
+    vmull.s16       q9, d6, d16     // 58*b0
1964
+    vmull.s16       q10, d7, d16     // 58*b1
1965
+    vmull.s16       q11, d8, d17    // 10*c0
1966
+    vmull.s16       q12, d9, d17    // 10*c1
1967
+    vadd.s16        q2, q5          // a + d
1968
+    vshll.s16       q13, d4, #1     // 2 * (a0+d0)
1969
+    vshll.s16       q14, d5, #1     // 2 * (a1+d1)
1970
+    vsub.s32        q9, q13         // 58*b0 - 2 * (a0+d0)
1971
+    vsub.s32        q10, q14         // 58*b1 - 2 * (a1+d1)
1972
+    vadd.s32        q6, q9, q11         // 58*b0 - 2 * (a0+d0) +10*c0
1973
+    vadd.s32        q7, q10, q12         // 58*b1 - 2 * (a1+d1) +10*c1
1974
+.endm
1975
+
1976
+.macro qpel_filter_chroma_2_32b
1977
+    vmov.i16        d16, #54
1978
+    vmull.s16       q9, d6, d16     // 54*b0
1979
+    vmull.s16       q10, d7, d16     // 54*b1
1980
+    vshll.s16       q11, d4, #2     // 4 * a0
1981
+    vshll.s16       q12, d5, #2     // 4 * a1
1982
+    vshll.s16       q13, d8, #4     // 16 * c0
1983
+    vshll.s16       q14, d9, #4     // 16 * c1
1984
+    vshll.s16       q15, d10, #1     // 2 * d0
1985
+    vshll.s16       q8, d11, #1     // 2 * d1
1986
+
1987
+    vadd.s32        q9, q13         // 54*b0 + 16 * c0
1988
+    vadd.s32        q10, q14        // 54*b1 + 16 * c1
1989
+    vadd.s32        q11, q15         // 4 * a0 +2 * d0
1990
+    vadd.s32        q12, q8         // 4 * a1 +2 * d1
1991
+    vsub.s32        q6, q9, q11     // 54*b0 + 16 * c0 - (4 * a0 + 2 * d0)
1992
+    vsub.s32        q7, q10, q12    // 54*b1 + 16 * c1 - (4 * a1 + 2 * d1)
1993
+.endm
1994
+
1995
+.macro qpel_filter_chroma_3_32b
1996
+    vmov.i16        d16, #46
1997
+    vmov.i16        d17, #28
1998
+    vmull.s16       q9, d6, d16     // 46*b0
1999
+    vmull.s16       q10, d7, d16     // 46*b1
2000
+    vmull.s16       q11, d8, d17    // 28*c0
2001
+    vmull.s16       q12, d9, d17    // 28*c1
2002
+    vmov.i16        d17, #6
2003
+    vshll.s16       q13, d10, #2     // 4 * d0
2004
+    vshll.s16       q14, d11, #2     // 4 * d1
2005
+    vmull.s16       q15, d4, d17    // 6*a0
2006
+    vmull.s16       q8, d5, d17    // 6*a1
2007
+    vadd.s32        q9, q11         // 46*b0 + 28*c0
2008
+    vadd.s32        q10, q12        //  46*b1 + 28*c1
2009
+    vadd.s32        q13, q15         // 4 * d0 + 6*a0
2010
+    vadd.s32        q14, q8        // 4 * d1 + 6*a1
2011
+    vsub.s32        q6, q9, q13         // 46*b0 + 28*c0 -(4 * d0 + 6*a0)
2012
+    vsub.s32        q7, q10, q14         //  46*b1 + 28*c1 -(4 * d1 + 6*a1)
2013
+.endm
2014
+
2015
+.macro qpel_filter_chroma_4_32b
2016
+    vmov.i16        d16, #36
2017
+    vadd.s16        q2, q5          // a +d
2018
+    vadd.s16        q3, q4          // b+c
2019
+    vmull.s16       q9, d6, d16     // 36*(b0 + c0)
2020
+    vmull.s16       q10, d7, d16     // 36*(b1 + c1)
2021
+    vshll.s16       q11, d4, #2     // 4 * (a0+d0)
2022
+    vshll.s16       q12, d5, #2     // 4 * (a1+d1)
2023
+    vsub.s32        q6, q9, q11         // 36*(b0 + c0) - ( 4 * (a0+d0))
2024
+    vsub.s32        q7, q10, q12         // 36*(b1 + c1) - ( 4 * (a1+d1))
2025
+.endm
2026
+
2027
+.macro qpel_filter_chroma_5_32b
2028
+    vmov.i16        d16, #46
2029
+    vmov.i16        d17, #28
2030
+    vmull.s16       q9, d6, d17     // 28*b0
2031
+    vmull.s16       q10, d7, d17     // 28*b1
2032
+    vmull.s16       q11, d8, d16    // 46*c0
2033
+    vmull.s16       q12, d9, d16    // 46*c1
2034
+    vmov.i16        d17, #6
2035
+    vshll.s16       q13, d4, #2     // 4 * a0
2036
+    vshll.s16       q14, d5, #2     // 4 * a1
2037
+    vmull.s16       q15, d10, d17    // 6*d0
2038
+    vmull.s16       q8, d11, d17    // 6*d1
2039
+    vadd.s32        q9, q11         // 28*b0 + 46*c0
2040
+    vadd.s32        q10, q12        //  28*b1 + 46*c1
2041
+    vadd.s32        q13, q15         // 4 * a0 + 6*d0
2042
+    vadd.s32        q14, q8        //  4 * a1 + 6*d1
2043
+    vsub.s32        q6, q9, q13         // 28*b0 + 46*c0- (4 * a0 + 6*d0)
2044
+    vsub.s32        q7, q10, q14         //   28*b1 + 46*c1- (4 * a1 + 6*d1)
2045
+.endm
2046
+
2047
+.macro qpel_filter_chroma_6_32b
2048
+    vmov.i16        d16, #54
2049
+    vmull.s16       q9, d8, d16     // 54*c0
2050
+    vmull.s16       q10, d9, d16     // 54*c1
2051
+    vshll.s16       q11, d4, #1     // 2 * a0
2052
+    vshll.s16       q12, d5, #1     // 2 * a1
2053
+    vshll.s16       q13, d6, #4     // 16 * b0
2054
+    vshll.s16       q14, d7, #4     // 16 * b1
2055
+    vshll.s16       q15, d10, #2     // 4 * d0
2056
+    vshll.s16       q8, d11, #2     // 4 * d1
2057
+    vadd.s32        q9, q13         // 54*c0 + 16 * b0
2058
+    vadd.s32        q10, q14        // 54*c1 + 16 * b1
2059
+    vadd.s32        q11, q15         // 2 * a0 + 4 * d0
2060
+    vadd.s32        q12, q8         // 2 * a1 + 4 * d1
2061
+    vsub.s32        q6, q9, q11     // 54*c0 + 16 * b0 - ( 2 * a0 + 4 * d0)
2062
+    vsub.s32        q7, q10, q12     //  54*c1 + 16 * b1 - ( 2 * a1 + 4 * d1)
2063
+.endm
2064
+
2065
+.macro qpel_filter_chroma_7_32b
2066
+    vmov.i16        d16, #10
2067
+    vmov.i16        d17, #58
2068
+    vmull.s16       q9, d6, d16     // 10*b0
2069
+    vmull.s16       q10, d7, d16     // 10*b1
2070
+    vmull.s16       q11, d8, d17    // 58*c0
2071
+    vmull.s16       q12, d9, d17    // 58*c1
2072
+    vadd.s16        q2, q5          // a + d
2073
+    vshll.s16       q13, d4, #1     // 2 * (a0+d0)
2074
+    vshll.s16       q14, d5, #1     // 2 * (a1+d1)
2075
+    vsub.s32        q9, q13         // 58*c0 - 2 * (a0+d0)
2076
+    vsub.s32        q10, q14         // 58*c1 - 2 * (a1+d1)
2077
+    vadd.s32        q6, q9, q11         // 58*c0 - 2 * (a0+d0) +10*b0
2078
+    vadd.s32        q7, q10, q12         // 58*c1 - 2 * (a1+d1) +10*b1
2079
+.endm
2080
+
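The eight qpel_filter_chroma_K_32b macros above decompose the HEVC 4-tap chroma filter into shifts and small multiplies over the taps a, b, c, d. Collecting the terms back gives the coefficient table below; this is a reconstruction for reference only, since the asm never materialises it as data:

    #include <stdint.h>

    static const int16_t chroma_taps[8][4] = {
        {  0, 64,  0,  0 },  /* K = 0                        */
        { -2, 58, 10, -2 },  /* K = 1: 58b + 10c - 2(a + d)  */
        { -4, 54, 16, -2 },  /* K = 2: 54b + 16c - (4a + 2d) */
        { -6, 46, 28, -4 },  /* K = 3: 46b + 28c - (6a + 4d) */
        { -4, 36, 36, -4 },  /* K = 4: 36(b + c) - 4(a + d)  */
        { -4, 28, 46, -6 },  /* K = 5: 28b + 46c - (4a + 6d) */
        { -2, 16, 54, -4 },  /* K = 6: 16b + 54c - (2a + 4d) */
        { -2, 10, 58, -2 },  /* K = 7: 10b + 58c - 2(a + d)  */
    };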
2081
+.macro FILTER_CHROMA_VPP a b filterv
2082
+
2083
+    vpush           {q4-q7}
2084
+
2085
+.loop_\filterv\()_\a\()x\b:
2086
+
2087
+    mov             r7, r2
2088
+    mov             r6, r0
2089
+    eor             r8, r8
2090
+
2091
+.loop_w8_\filterv\()_\a\()x\b:
2092
+
2093
+    add             r6, r0, r8
2094
+
2095
+    pld [r6]
2096
+    vld1.u8         d0, [r6], r1
2097
+    pld [r6]
2098
+    vld1.u8         d1, [r6], r1
2099
+    pld [r6]
2100
+    vld1.u8         d2, [r6], r1
2101
+    pld [r6]
2102
+    vld1.u8         d3, [r6], r1
2103
+
2104
+    vmovl.u8        q2, d0
2105
+    vmovl.u8        q3, d1
2106
+    vmovl.u8        q4, d2
2107
+    vmovl.u8        q5, d3
2108
+
2109
+    veor.u8         q6, q6
2110
+    veor.u8         q7, q7
2111
+
2112
+   \filterv
2113
+
2114
+    mov             r12,#32
2115
+    vdup.32         q8, r12
2116
+    vadd.s32        q6, q8
2117
+    vqshrun.s32     d0, q6, #6
2118
+    vadd.s32        q7, q8
2119
+    vqshrun.s32     d1, q7, #6
2120
+    vqmovn.u16      d0, q0
2121
+    vst1.u8         d0, [r7]!
2122
+
2123
+    add             r8, #8
2124
+    cmp             r8, #\a
2125
+    blt             .loop_w8_\filterv\()_\a\()x\b
2126
+
2127
+    add             r0, r1
2128
+    add             r2, r3
2129
+    subs            r4, #1
2130
+    bne             .loop_\filterv\()_\a\()x\b 
2131
+    vpop            {q4-q7}
2132
+.endm 
2133
+
2134
+.macro CHROMA_VPP  w h
2135
+function x265_interp_4tap_vert_pp_\w\()x\h\()_neon
2136
+
2137
+    push            {r4, r5, r6, r7, r8}
2138
+    ldr             r5, [sp, #4 * 5]
2139
+    sub             r0, r1
2140
+    mov             r4, #\h
2141
+
2142
+    cmp             r5, #0
2143
+    beq              0f
2144
+    cmp             r5, #1
2145
+    beq              1f
2146
+    cmp             r5, #2
2147
+    beq              2f
2148
+    cmp             r5, #3
2149
+    beq              3f
2150
+    cmp             r5, #4
2151
+    beq              4f
2152
+    cmp             r5, #5
2153
+    beq              5f
2154
+    cmp             r5, #6
2155
+    beq              6f
2156
+    cmp             r5, #7
2157
+    beq              7f
2158
+0:
2159
+    FILTER_CHROMA_VPP  \w \h qpel_filter_chroma_0_32b
2160
+    b            8f
2161
+1:
2162
+    FILTER_CHROMA_VPP  \w \h qpel_filter_chroma_1_32b
2163
+    b            8f
2164
+2:
2165
+    FILTER_CHROMA_VPP  \w \h qpel_filter_chroma_2_32b
2166
+    b            8f
2167
+3:
2168
+    FILTER_CHROMA_VPP  \w \h qpel_filter_chroma_3_32b
2169
+    b            8f
2170
+4:
2171
+    FILTER_CHROMA_VPP  \w \h qpel_filter_chroma_4_32b
2172
+    b            8f
2173
+5:
2174
+    FILTER_CHROMA_VPP  \w \h qpel_filter_chroma_5_32b
2175
+    b            8f
2176
+6:
2177
+    FILTER_CHROMA_VPP  \w \h qpel_filter_chroma_6_32b
2178
+    b            8f
2179
+7:
2180
+    FILTER_CHROMA_VPP  \w \h qpel_filter_chroma_7_32b
2181
+    b            8f
2182
+8:
2183
+    pop             {r4, r5, r6, r7, r8}
2184
+    bx              lr
2185
+endfunc
2186
+.endm
2187
+
2188
+CHROMA_VPP 8 2
2189
+CHROMA_VPP 8 4
2190
+CHROMA_VPP 8 6
2191
+CHROMA_VPP 8 8
2192
+CHROMA_VPP 8 16
2193
+CHROMA_VPP 8 32
2194
+CHROMA_VPP 8 12
2195
+CHROMA_VPP 8 64
2196
+CHROMA_VPP 16 4
2197
+CHROMA_VPP 16 8
2198
+CHROMA_VPP 16 12
2199
+CHROMA_VPP 16 16
2200
+CHROMA_VPP 16 32
2201
+CHROMA_VPP 16 64
2202
+CHROMA_VPP 16 24
2203
+CHROMA_VPP 32 8
2204
+CHROMA_VPP 32 16
2205
+CHROMA_VPP 32 24
2206
+CHROMA_VPP 32 32
2207
+CHROMA_VPP 32 64
2208
+CHROMA_VPP 32 48
2209
+CHROMA_VPP 24 32
2210
+CHROMA_VPP 24 64
2211
+CHROMA_VPP 64 16
2212
+CHROMA_VPP 64 32
2213
+CHROMA_VPP 64 48
2214
+CHROMA_VPP 64 64
2215
+CHROMA_VPP 48 64
2216
+
2217
+.macro FILTER_CHROMA_VPS a b filterv
2218
+
2219
+    vpush           {q4-q7}
2220
+
2221
+.loop_vps_\filterv\()_\a\()x\b:
2222
+
2223
+    mov             r7, r2
2224
+    mov             r6, r0
2225
+    eor             r8, r8
2226
+
2227
+.loop_vps_w8_\filterv\()_\a\()x\b:
2228
+
2229
+    add             r6, r0, r8
+
+    pld [r6]
+    vld1.u8         d0, [r6], r1
+    pld [r6]
+    vld1.u8         d1, [r6], r1
+    pld [r6]
+    vld1.u8         d2, [r6], r1
+    pld [r6]
+    vld1.u8         d3, [r6], r1
+
+    vmovl.u8        q2, d0
+    vmovl.u8        q3, d1
+    vmovl.u8        q4, d2
+    vmovl.u8        q5, d3
+
+    veor.u8         q6, q6
+    veor.u8         q7, q7
+
+   \filterv
+
+    mov             r12,#8192
+    vdup.32         q8, r12
+    vsub.s32        q6, q8
+    vqmovn.s32      d0, q6
+    vsub.s32        q7, q8
+    vqmovn.s32      d1, q7
+    vst1.u16         {q0}, [r7]!
+
+    add             r8, #8
+    cmp             r8, #\a
+    blt             .loop_vps_w8_\filterv\()_\a\()x\b
+
+    add             r0, r1
+    add             r2, r3
+    subs            r4, #1
+    bne             .loop_vps_\filterv\()_\a\()x\b
+    vpop            {q4-q7}
+.endm
+
+.macro CHROMA_VPS  w h
+function x265_interp_4tap_vert_ps_\w\()x\h\()_neon
+
+    push            {r4, r5, r6, r7, r8}
+    ldr             r5, [sp, #4 * 5]
+    lsl             r3, #1
+    sub             r0, r1
+    mov             r4, #\h
+
+    cmp             r5, #0
+    beq              0f
+    cmp             r5, #1
+    beq              1f
+    cmp             r5, #2
+    beq              2f
+    cmp             r5, #3
+    beq              3f
+    cmp             r5, #4
+    beq              4f
+    cmp             r5, #5
+    beq              5f
+    cmp             r5, #6
+    beq              6f
+    cmp             r5, #7
+    beq              7f
+0:
+    FILTER_CHROMA_VPS  \w \h qpel_filter_chroma_0_32b
+    b            8f
+1:
+    FILTER_CHROMA_VPS  \w \h qpel_filter_chroma_1_32b
+    b            8f
+2:
+    FILTER_CHROMA_VPS  \w \h qpel_filter_chroma_2_32b
+    b            8f
+3:
+    FILTER_CHROMA_VPS  \w \h qpel_filter_chroma_3_32b
+    b            8f
+4:
+    FILTER_CHROMA_VPS  \w \h qpel_filter_chroma_4_32b
+    b            8f
+5:
+    FILTER_CHROMA_VPS  \w \h qpel_filter_chroma_5_32b
+    b            8f
+6:
+    FILTER_CHROMA_VPS  \w \h qpel_filter_chroma_6_32b
+    b            8f
+7:
+    FILTER_CHROMA_VPS  \w \h qpel_filter_chroma_7_32b
+    b            8f
+8:
+    pop             {r4, r5, r6, r7, r8}
+    bx              lr
+endfunc
+.endm
+
+CHROMA_VPS 8 2
+CHROMA_VPS 8 4
+CHROMA_VPS 8 6
+CHROMA_VPS 8 8
+CHROMA_VPS 8 16
+CHROMA_VPS 8 32
+CHROMA_VPS 8 12
+CHROMA_VPS 8 64
+CHROMA_VPS 16 4
+CHROMA_VPS 16 8
+CHROMA_VPS 16 12
+CHROMA_VPS 16 16
+CHROMA_VPS 16 32
+CHROMA_VPS 16 64
+CHROMA_VPS 16 24
+CHROMA_VPS 32 8
+CHROMA_VPS 32 16
+CHROMA_VPS 32 24
+CHROMA_VPS 32 32
+CHROMA_VPS 32 64
+CHROMA_VPS 32 48
+CHROMA_VPS 24 32
+CHROMA_VPS 24 64
+CHROMA_VPS 64 16
+CHROMA_VPS 64 32
+CHROMA_VPS 64 48
+CHROMA_VPS 64 64
+CHROMA_VPS 48 64
+
+.macro FILTER_CHROMA_VSP a b filterv
+
+    vpush           {q4-q7}
+
+.loop_vsp_\filterv\()_\a\()x\b:
+
+    mov             r7, r2
+    mov             r6, r0
+    eor             r8, r8
+
+.loop_vsp_w8_\filterv\()_\a\()x\b:
+
+    add             r6, r0, r8
+
+    pld [r6]
+    vld1.u16         {q2}, [r6], r1
+    pld [r6]
+    vld1.u16         {q3}, [r6], r1
+    pld [r6]
+    vld1.u16         {q4}, [r6], r1
+    pld [r6]
+    vld1.u16         {q5}, [r6], r1
+
+    veor.u8         q6, q6
+    veor.u8         q7, q7
+
+   \filterv
+
+    mov             r12,#1
+    lsl             r12, #19
+    add             r12, #2048
+    vdup.32         q8, r12
+    vadd.s32        q6, q8
+    vqshrun.s32     d0, q6, #12
+    vadd.s32        q7, q8
+    vqshrun.s32     d1, q7, #12
+    vqmovn.u16      d0, q0
+    vst1.u8         d0, [r7]!
+
+    add             r8, #16
+    mov             r12, #\a
+    lsl             r12, #1
+    cmp             r8, r12
+    blt             .loop_vsp_w8_\filterv\()_\a\()x\b
+
+    add             r0, r1
+    add             r2, r3
+    subs            r4, #1
+    bne             .loop_vsp_\filterv\()_\a\()x\b
+    vpop            {q4-q7}
+.endm
+
+.macro CHROMA_VSP  w h
+function x265_interp_4tap_vert_sp_\w\()x\h\()_neon
+
+    push            {r4, r5, r6, r7, r8}
+    ldr             r5, [sp, #4 * 5]
+    lsl             r1, #1
+    sub             r0, r1
+    mov             r4, #\h
+
+    cmp             r5, #0
+    beq              0f
+    cmp             r5, #1
+    beq              1f
+    cmp             r5, #2
+    beq              2f
+    cmp             r5, #3
+    beq              3f
+    cmp             r5, #4
+    beq              4f
+    cmp             r5, #5
+    beq              5f
+    cmp             r5, #6
+    beq              6f
+    cmp             r5, #7
+    beq              7f
+0:
+    FILTER_CHROMA_VSP  \w \h qpel_filter_chroma_0_32b
+    b            8f
+1:
+    FILTER_CHROMA_VSP  \w \h qpel_filter_chroma_1_32b
+    b            8f
+2:
+    FILTER_CHROMA_VSP  \w \h qpel_filter_chroma_2_32b
+    b            8f
+3:
+    FILTER_CHROMA_VSP  \w \h qpel_filter_chroma_3_32b
+    b            8f
+4:
+    FILTER_CHROMA_VSP  \w \h qpel_filter_chroma_4_32b
+    b            8f
+5:
+    FILTER_CHROMA_VSP  \w \h qpel_filter_chroma_5_32b
+    b            8f
+6:
+    FILTER_CHROMA_VSP  \w \h qpel_filter_chroma_6_32b
+    b            8f
+7:
+    FILTER_CHROMA_VSP  \w \h qpel_filter_chroma_7_32b
+    b            8f
+8:
+    pop             {r4, r5, r6, r7, r8}
+    bx              lr
+endfunc
+.endm
+
+CHROMA_VSP 8 2
+CHROMA_VSP 8 4
+CHROMA_VSP 8 6
+CHROMA_VSP 8 8
+CHROMA_VSP 8 16
+CHROMA_VSP 8 32
+CHROMA_VSP 8 12
+CHROMA_VSP 8 64
+CHROMA_VSP 16 4
+CHROMA_VSP 16 8
+CHROMA_VSP 16 12
+CHROMA_VSP 16 16
+CHROMA_VSP 16 32
+CHROMA_VSP 16 64
+CHROMA_VSP 16 24
+CHROMA_VSP 32 8
+CHROMA_VSP 32 16
+CHROMA_VSP 32 24
+CHROMA_VSP 32 32
+CHROMA_VSP 32 64
+CHROMA_VSP 32 48
+CHROMA_VSP 24 32
+CHROMA_VSP 24 64
+CHROMA_VSP 64 16
+CHROMA_VSP 64 32
+CHROMA_VSP 64 48
+CHROMA_VSP 64 64
+CHROMA_VSP 48 64
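The two vertical chroma paths above differ only in their store stage: vps re-centres by IF_INTERNAL_OFFS and keeps 16-bit intermediates, while vsp adds the combined offset built into r12 ((8192 << 6) + 2048 = (1 << 19) + 2048) and narrows back to pixels. A minimal scalar sketch of both stages, assuming x265's usual 8-bit constants (IF_FILTER_PREC = 6, IF_INTERNAL_PREC = 14, IF_INTERNAL_OFFS = 8192):

    #include <stdint.h>

    /* Scalar model of the vps store (vsub.s32 / vqmovn.s32): subtract the
     * internal offset and saturate down to a 16-bit intermediate. */
    static inline int16_t vps_store(int32_t sum)
    {
        int32_t v = sum - 8192;
        if (v >  32767) v =  32767;
        if (v < -32768) v = -32768;
        return (int16_t)v;
    }

    /* Scalar model of the vsp store (vadd.s32 / vqshrun.s32 #12 /
     * vqmovn.u16): round, shift by IF_FILTER_PREC + 6 = 12, and saturate
     * to the 8-bit pixel range. */
    static inline uint8_t vsp_store(int32_t sum)
    {
        int32_t v = (sum + (1 << 19) + 2048) >> 12;
        if (v < 0)   v = 0;
        if (v > 255) v = 255;
        return (uint8_t)v;
    }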
+
+ // void interp_horiz_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
+.macro vextin8
+    pld             [r5]
+    vld1.8          {q3}, [r5]!
+    vext.8          d0, d6, d7, #1
+    vext.8          d1, d6, d7, #2
+    vext.8          d2, d6, d7, #3
+    vext.8          d3, d6, d7, #4
+    vext.8          d4, d6, d7, #5
+    vext.8          d5, d6, d7, #6
+    vext.8          d6, d6, d7, #7
+.endm
+
+.macro HPP_FILTER a b filterhpp
+    mov             r12,#32
+    mov             r6, #\b
+    sub             r3, #\a
+    mov             r8, #\a
+    cmp             r8, #4
+    beq             4f
+    cmp             r8, #12
+    beq             12f
+    b               6f
+4:
+    HPP_FILTER_4 \a \b \filterhpp
+    b               5f
+12:
+    HPP_FILTER_12 \a \b \filterhpp
+    b               5f
+6:
+loop2_hpp_\filterhpp\()_\a\()x\b:
+    mov             r7, #\a
+    lsr             r7, #3
+    mov             r5, r0
+    sub             r5, #4
+loop3_hpp_\filterhpp\()_\a\()x\b:
+    vextin8
+    \filterhpp
+    vdup.32         q8, r12
+    vadd.s32        q9, q8
+    vqshrun.s32     d0, q9, #6
+    vadd.s32        q10, q8
+    vqshrun.s32     d1, q10, #6
+    vqmovn.u16      d0, q0
+    vst1.u8         d0, [r2]!
+    subs            r7, #1
+    sub             r5, #8
+    bne             loop3_hpp_\filterhpp\()_\a\()x\b
+    subs            r6, #1
+    add             r0, r1
+    add             r2, r3
+    bne             loop2_hpp_\filterhpp\()_\a\()x\b
+5:
+.endm
+
+.macro HPP_FILTER_4 w h filterhpp
+loop4_hpp_\filterhpp\()_\w\()x\h:
+    mov             r5, r0
+    sub             r5, #4
+    vextin8
+    \filterhpp
+    vdup.32         q8, r12
+    vadd.s32        q9, q8
+    vqshrun.s32     d0, q9, #6
+    vadd.s32        q10, q8
+    vqshrun.s32     d1, q10, #6
+    vqmovn.u16      d0, q0
+    vst1.u32        {d0[0]}, [r2]!
+    sub             r5, #8
+    subs            r6, #1
+    add             r0, r1
+    add             r2, r3
+    bne             loop4_hpp_\filterhpp\()_\w\()x\h
+.endm
+
+.macro HPP_FILTER_12 w h filterhpp
+loop12_hpp_\filterhpp\()_\w\()x\h:
+    mov             r5, r0
+    sub             r5, #4
+    vextin8
+    \filterhpp
+    vdup.32         q8, r12
+    vadd.s32        q9, q8
+    vqshrun.s32     d0, q9, #6
+    vadd.s32        q10, q8
+    vqshrun.s32     d1, q10, #6
+    vqmovn.u16      d0, q0
+    vst1.u8         {d0}, [r2]!
+    sub             r5, #8
+
+    vextin8
+    \filterhpp
+    vdup.32         q8, r12
+    vadd.s32        q9, q8
+    vqshrun.s32     d0, q9, #6
+    vadd.s32        q10, q8
+    vqshrun.s32     d1, q10, #6
+    vqmovn.u16      d0, q0
+    vst1.u32        {d0[0]}, [r2]!
+    add             r2, r3
+    subs            r6, #1
+    add             r0, r1
+    bne             loop12_hpp_\filterhpp\()_\w\()x\h
+.endm
+
+.macro LUMA_HPP w h
+function x265_interp_horiz_pp_\w\()x\h\()_neon
+    push            {r4, r5, r6, r7, r8}
+    ldr             r4, [sp, #20]
+    cmp             r4, #0
+    beq             0f
+    cmp             r4, #1
+    beq             1f
+    cmp             r4, #2
+    beq             2f
+    cmp             r4, #3
+    beq             3f
+0:
+    HPP_FILTER  \w \h qpel_filter_0_32b
+    b               5f
+1:
+    HPP_FILTER  \w \h qpel_filter_1_32b
+    b               5f
+2:
+    HPP_FILTER  \w \h qpel_filter_2_32b
+    b               5f
+3:
+    HPP_FILTER  \w \h qpel_filter_3_32b
+    b               5f
+5:
+    pop             {r4, r5, r6, r7, r8}
+    bx              lr
+endfunc
+.endm
+
+LUMA_HPP    4 4
+LUMA_HPP    4 8
+LUMA_HPP    4 16
+LUMA_HPP    8 4
+LUMA_HPP    8 8
+LUMA_HPP    8 16
+LUMA_HPP    8 32
+LUMA_HPP    12 16
+LUMA_HPP    16 4
+LUMA_HPP    16 8
+LUMA_HPP    16 12
+LUMA_HPP    16 16
+LUMA_HPP    16 32
+LUMA_HPP    16 64
+LUMA_HPP    24 32
+LUMA_HPP    32 8
+LUMA_HPP    32 16
+LUMA_HPP    32 24
+LUMA_HPP    32 32
+LUMA_HPP    32 64
+LUMA_HPP    48 64
+LUMA_HPP    64 16
+LUMA_HPP    64 32
+LUMA_HPP    64 48
+LUMA_HPP    64 64
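The pp store path used by HPP_FILTER above is simpler: round by half the filter precision and saturate straight back to pixels. A minimal scalar sketch under the same assumed constants (the r12 value 32 is 1 << (IF_FILTER_PREC - 1)):

    #include <stdint.h>

    /* Scalar model of the HPP_FILTER store (vadd.s32 / vqshrun.s32 #6 /
     * vqmovn.u16): round with 32, shift by 6, clamp to [0, 255]. */
    static inline uint8_t hpp_store(int32_t sum)
    {
        int32_t v = (sum + 32) >> 6;
        if (v < 0)   v = 0;
        if (v > 255) v = 255;
        return (uint8_t)v;
    }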
+
+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+.macro HPS_FILTER a b filterhps
+    mov             r12, #8192
+    mov             r6, r10
+    sub             r3, #\a
+    lsl             r3, #1
+
+    mov             r8, #\a
+    cmp             r8, #4
+    beq             14f
+    cmp             r8, #12
+    beq             15f
+    b               7f
+14:
+    HPS_FILTER_4 \a \b \filterhps
+    b               10f
+15:
+    HPS_FILTER_12 \a \b \filterhps
+    b               10f
+7:
+    cmp             r9, #0
+    beq             8f
+    cmp             r9, #1
+    beq             9f
+8:
+loop1_hps_\filterhps\()_\a\()x\b\()_rowext0:
+    mov             r7, #\a
+    lsr             r7, #3
+    mov             r5, r0
+    sub             r5, #4
+loop2_hps_\filterhps\()_\a\()x\b\()_rowext0:
+    vextin8
+    \filterhps
+    vdup.32         q8, r12
+    vsub.s32        q9, q8
+    vsub.s32        q10, q8
+    vmovn.u32       d0, q9
+    vmovn.u32       d1, q10
+    vst1.s16        {q0}, [r2]!
+    subs            r7, #1
+    sub             r5, #8
+    bne             loop2_hps_\filterhps\()_\a\()x\b\()_rowext0
+    subs            r6, #1
+    add             r0, r1
+    add             r2, r3
+    bne             loop1_hps_\filterhps\()_\a\()x\b\()_rowext0
+    b               10f
+9:
+loop3_hps_\filterhps\()_\a\()x\b\()_rowext1:
+    mov             r7, #\a
+    lsr             r7, #3
+    mov             r5, r0
+    sub             r5, #4
+loop4_hps_\filterhps\()_\a\()x\b\()_rowext1:
+    vextin8
+    \filterhps
+    vdup.32         q8, r12
+    vsub.s32        q9, q8
+    vsub.s32        q10, q8
+    vmovn.u32       d0, q9
+    vmovn.u32       d1, q10
+    vst1.s16        {q0}, [r2]!
+    subs            r7, #1
+    sub             r5, #8
+    bne             loop4_hps_\filterhps\()_\a\()x\b\()_rowext1
+    subs            r6, #1
+    add             r0, r1
+    add             r2, r3
+    bne             loop3_hps_\filterhps\()_\a\()x\b\()_rowext1
+10:
+.endm
+
+.macro HPS_FILTER_4 w h filterhps
+    cmp             r9, #0
+    beq             11f
+    cmp             r9, #1
+    beq             12f
+11:
+loop4_hps_\filterhps\()_\w\()x\h\()_rowext0:
+    mov             r5, r0
+    sub             r5, #4
+    vextin8
+    \filterhps
+    vdup.32         q8, r12
+    vsub.s32        q9, q8
+    vmovn.u32       d0, q9
+    vst1.s16        {d0}, [r2]!
+    sub             r5, #8
+    subs            r6, #1
+    add             r0, r1
+    add             r2, r3
+    bne             loop4_hps_\filterhps\()_\w\()x\h\()_rowext0
+    b               13f
+12:
+loop5_hps_\filterhps\()_\w\()x\h\()_rowext1:
+    mov             r5, r0
+    sub             r5, #4
+    vextin8
+    \filterhps
+    vdup.32         q8, r12
+    vsub.s32        q9, q8
+    vmovn.u32       d0, q9
+    vst1.s16        {d0}, [r2]!
+    sub             r5, #8
+    subs            r6, #1
+    add             r0, r1
+    add             r2, r3
+    bne             loop5_hps_\filterhps\()_\w\()x\h\()_rowext1
+13:
+.endm
+
+.macro HPS_FILTER_12 w h filterhps
+    cmp             r9, #0
+    beq             14f
+    cmp             r9, #1
+    beq             15f
+14:
+loop12_hps_\filterhps\()_\w\()x\h\()_rowext0:
+    mov             r5, r0
+    sub             r5, #4
+    vextin8
+    \filterhps
+    vdup.32         q8, r12
+    vsub.s32        q9, q8
+    vsub.s32        q10, q8
+    vmovn.u32       d0, q9
+    vmovn.u32       d1, q10
+    vst1.s16        {q0}, [r2]!
+    sub             r5, #8
+
+    vextin8
+    \filterhps
+    vdup.32         q8, r12
+    vsub.s32        q9, q8
+    vmovn.u32       d0, q9
+    vst1.s16        {d0}, [r2]!
+    add             r2, r3
+    subs            r6, #1
+    add             r0, r1
+    bne             loop12_hps_\filterhps\()_\w\()x\h\()_rowext0
+    b               16f
+15:
+loop12_hps_\filterhps\()_\w\()x\h\()_rowext1:
+    mov             r5, r0
+    sub             r5, #4
+    vextin8
+    \filterhps
+    vdup.32         q8, r12
+    vsub.s32        q9, q8
+    vsub.s32        q10, q8
+    vmovn.u32       d0, q9
+    vmovn.u32       d1, q10
+    vst1.s16        {q0}, [r2]!
+    sub             r5, #8
+
+    vextin8
+    \filterhps
+    vdup.32         q8, r12
+    vsub.s32        q9, q8
+    vmovn.u32       d0, q9
+    vst1.s16        {d0}, [r2]!
+    add             r2, r3
+    subs            r6, #1
+    add             r0, r1
+    bne             loop12_hps_\filterhps\()_\w\()x\h\()_rowext1
+16:
+.endm
+
+.macro LUMA_HPS w h
+function x265_interp_horiz_ps_\w\()x\h\()_neon
+    push            {r4, r5, r6, r7, r8, r9, r10}
+    ldr             r4, [sp, #28]
+    ldr             r9, [sp, #32]
+    mov             r10, #\h
+    cmp             r9, #0
+    beq             6f
+    sub             r0, r0, r1, lsl #2
+    add             r0, r1
+    add             r10, #7
+6:
+    cmp             r4, #0
+    beq             0f
+    cmp             r4, #1
+    beq             1f
+    cmp             r4, #2
+    beq             2f
+    cmp             r4, #3
+    beq             3f
+0:
+    HPS_FILTER  \w \h qpel_filter_0_32b
+    b               5f
+1:
+    HPS_FILTER  \w \h qpel_filter_1_32b
+    b               5f
+2:
+    HPS_FILTER  \w \h qpel_filter_2_32b
+    b               5f
+3:
+    HPS_FILTER  \w \h qpel_filter_3_32b
+    b               5f
+5:
+    pop             {r4, r5, r6, r7, r8, r9, r10}
+    bx              lr
+endfunc
+.endm
+
+LUMA_HPS    4 4
+LUMA_HPS    4 8
+LUMA_HPS    4 16
+LUMA_HPS    8 4
+LUMA_HPS    8 8
+LUMA_HPS    8 16
+LUMA_HPS    8 32
+LUMA_HPS    12 16
+LUMA_HPS    16 4
+LUMA_HPS    16 8
+LUMA_HPS    16 12
+LUMA_HPS    16 16
+LUMA_HPS    16 32
+LUMA_HPS    16 64
+LUMA_HPS    24 32
+LUMA_HPS    32 8
+LUMA_HPS    32 16
+LUMA_HPS    32 24
+LUMA_HPS    32 32
+LUMA_HPS    32 64
+LUMA_HPS    48 64
+LUMA_HPS    64 16
+LUMA_HPS    64 32
+LUMA_HPS    64 48
+LUMA_HPS    64 64
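The ps variants keep the 14-bit intermediate instead of narrowing to pixels; for 8-bit input the net shift is zero, so the store stage only subtracts IF_INTERNAL_OFFS = 8192 (the r12 constant) and narrows with a plain vmovn. The isRowExt prologue (sub r0, r0, r1, lsl #2 / add r0, r1 / add r10, #7) starts 3 rows above the block and emits h + 7 rows, the extra rows a following 8-tap vertical pass consumes. A scalar sketch of the store, under the same assumed constants:

    #include <stdint.h>

    /* Scalar model of the HPS_FILTER store (vsub.s32 / vmovn.u32): no
     * rounding shift at 8-bit depth, just re-centre and narrow. */
    static inline int16_t hps_store(int32_t sum)
    {
        return (int16_t)(sum - 8192);
    }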
+
+// ******* Chroma_hpp *******
+.macro vextin8_chroma
+    pld             [r5]
+    vld1.8          {q3}, [r5]!
+    vext.8          d0, d6, d7, #1
+    vext.8          d1, d6, d7, #2
+    vext.8          d2, d6, d7, #3
+    vext.8          d3, d6, d7, #4
+
+    vmovl.u8        q2, d0
+    vmovl.u8        q3, d1
+    vmovl.u8        q4, d2
+    vmovl.u8        q5, d3
+.endm
+
+.macro FILTER_CHROMA_HPP a b filterhpp
+    vpush           {q4-q7}
+    mov             r12,#32
+    mov             r6, #\b
+    sub             r3, #\a
+    mov             r8, #\a
+    cmp             r8, #4
+    beq             11f
+    cmp             r8, #12
+    beq             12f
+    b               13f
+11:
+    FILTER_CHROMA_HPP_4 \a \b \filterhpp
+    b               14f
+12:
+    FILTER_CHROMA_HPP_12 \a \b \filterhpp
+    b               14f
+13:
+    veor            q6, q6
+    veor            q7, q7
+
+loop2_hpp_\filterhpp\()_\a\()x\b:
+    mov             r7, #\a
+    lsr             r7, #3
+    mov             r5, r0
+    sub             r5, #2
+loop3_hpp_\filterhpp\()_\a\()x\b:
+    vextin8_chroma
+    \filterhpp
+    vdup.32         q8, r12
+    vadd.s32        q6, q8
+    vqshrun.s32     d0, q6, #6
+    vadd.s32        q7, q8
+    vqshrun.s32     d1, q7, #6
+    vqmovn.u16      d0, q0
+    vst1.u8         d0, [r2]!
+    subs            r7, #1
+    sub             r5, #8
+    bne             loop3_hpp_\filterhpp\()_\a\()x\b
+    subs            r6, #1
+    add             r0, r1
+    add             r2, r3
+    bne             loop2_hpp_\filterhpp\()_\a\()x\b
+14:
+    vpop            {q4-q7}
+.endm
+
+.macro FILTER_CHROMA_HPP_4 w h filterhpp
+loop4_hpp_\filterhpp\()_\w\()x\h:
+    mov             r5, r0
+    sub             r5, #2
+    vextin8_chroma
+    \filterhpp
+    vdup.32         q8, r12
+    vadd.s32        q6, q8
+    vqshrun.s32     d0, q6, #6
+    vadd.s32        q7, q8
+    vqshrun.s32     d1, q7, #6
+    vqmovn.u16      d0, q0
+    vst1.u32        {d0[0]}, [r2]!
+    sub             r5, #8
+    subs            r6, #1
+    add             r0, r1
+    add             r2, r3
+    bne             loop4_hpp_\filterhpp\()_\w\()x\h
+.endm
+
+.macro FILTER_CHROMA_HPP_12 w h filterhpp
+loop12_hpp_\filterhpp\()_\w\()x\h:
+    mov             r5, r0
+    sub             r5, #2
+    vextin8_chroma
+    \filterhpp
+    vdup.32         q8, r12
+    vadd.s32        q6, q8
+    vqshrun.s32     d0, q6, #6
+    vadd.s32        q7, q8
+    vqshrun.s32     d1, q7, #6
+    vqmovn.u16      d0, q0
+    vst1.u8         {d0}, [r2]!
+    sub             r5, #8
+
+    vextin8_chroma
+    \filterhpp
+    vdup.32         q8, r12
+    vadd.s32        q6, q8
+    vqshrun.s32     d0, q6, #6
+    vadd.s32        q7, q8
+    vqshrun.s32     d1, q7, #6
+    vqmovn.u16      d0, q0
+    vst1.u32        {d0[0]}, [r2]!
+    add             r2, r3
+    subs            r6, #1
+    add             r0, r1
+    bne             loop12_hpp_\filterhpp\()_\w\()x\h
+.endm
+
+.macro CHROMA_HPP  w h
+function x265_interp_4tap_horiz_pp_\w\()x\h\()_neon
+
+    push            {r4, r5, r6, r7, r8}
+    ldr             r4, [sp, #4 * 5]
+
+    cmp             r4, #0
+    beq              0f
+    cmp             r4, #1
+    beq              1f
+    cmp             r4, #2
+    beq              2f
+    cmp             r4, #3
+    beq              3f
+    cmp             r4, #4
+    beq              4f
+    cmp             r4, #5
+    beq              5f
+    cmp             r4, #6
+    beq              6f
+    cmp             r4, #7
+    beq              7f
+0:
+    FILTER_CHROMA_HPP  \w \h qpel_filter_chroma_0_32b
+    b            8f
+1:
+    FILTER_CHROMA_HPP  \w \h qpel_filter_chroma_1_32b
+    b            8f
+2:
+    FILTER_CHROMA_HPP  \w \h qpel_filter_chroma_2_32b
+    b            8f
+3:
+    FILTER_CHROMA_HPP  \w \h qpel_filter_chroma_3_32b
+    b            8f
+4:
+    FILTER_CHROMA_HPP  \w \h qpel_filter_chroma_4_32b
+    b            8f
+5:
+    FILTER_CHROMA_HPP  \w \h qpel_filter_chroma_5_32b
+    b            8f
+6:
+    FILTER_CHROMA_HPP  \w \h qpel_filter_chroma_6_32b
+    b            8f
+7:
+    FILTER_CHROMA_HPP  \w \h qpel_filter_chroma_7_32b
+
+8:
+    pop             {r4, r5, r6, r7, r8}
+    bx              lr
+endfunc
+.endm
+
+CHROMA_HPP 4 2
+CHROMA_HPP 4 4
+CHROMA_HPP 4 8
+CHROMA_HPP 4 16
+CHROMA_HPP 4 32
+CHROMA_HPP 8 2
+CHROMA_HPP 8 4
+CHROMA_HPP 8 6
+CHROMA_HPP 8 8
+CHROMA_HPP 8 12
+CHROMA_HPP 8 16
+CHROMA_HPP 8 32
+CHROMA_HPP 8 64
+CHROMA_HPP 12 16
+CHROMA_HPP 12 32
+CHROMA_HPP 16 4
+CHROMA_HPP 16 8
+CHROMA_HPP 16 12
+CHROMA_HPP 16 16
+CHROMA_HPP 16 24
+CHROMA_HPP 16 32
+CHROMA_HPP 16 64
+CHROMA_HPP 24 32
+CHROMA_HPP 24 64
+CHROMA_HPP 32 8
+CHROMA_HPP 32 16
+CHROMA_HPP 32 24
+CHROMA_HPP 32 32
+CHROMA_HPP 32 48
+CHROMA_HPP 32 64
+CHROMA_HPP 48 64
+CHROMA_HPP 64 16
+CHROMA_HPP 64 32
+CHROMA_HPP 64 48
+CHROMA_HPP 64 64
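Putting the chroma pieces together: vextin8_chroma loads from src - 2 and its vext chain hands the filter the window src[-1..+2], so one FILTER_CHROMA_HPP output pixel reduces to the scalar sketch below. The tap layout is assumed from the standard HEVC 4-tap chroma filters; the NEON loop computes eight such pixels per iteration.

    #include <stdint.h>

    /* Assumed scalar equivalent of one 4-tap chroma hpp output pixel:
     * filter over src[-1..+2], round by 32, shift 6, saturate. */
    static inline uint8_t chroma_hpp_pixel(const uint8_t* src, const int16_t coeff[4])
    {
        int32_t sum = coeff[0] * src[-1] + coeff[1] * src[0]
                    + coeff[2] * src[1]  + coeff[3] * src[2];
        int32_t v = (sum + 32) >> 6;
        if (v < 0)   v = 0;
        if (v > 255) v = 255;
        return (uint8_t)v;
    }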
+// ***** Chroma_hps *****
+.macro FILTER_CHROMA_HPS a b filterhps
+    vpush           {q4-q7}
+    mov             r12, #8192
+    mov             r6, r10
+    sub             r3, #\a
+    lsl             r3, #1
+
+    mov             r8, #\a
+    cmp             r8, #4
+    beq             14f
+    cmp             r8, #12
+    beq             15f
+    b               16f
+14:
+    FILTER_CHROMA_HPS_4 \a \b \filterhps
+    b               10f
+15:
+    FILTER_CHROMA_HPS_12 \a \b \filterhps
+    b               10f
+16:
+    cmp             r9, #0
+    beq             17f
+    cmp             r9, #1
+    beq             18f
+17:
+loop1_hps_\filterhps\()_\a\()x\b\()_rowext0:
+    mov             r7, #\a
+    lsr             r7, #3
+    mov             r5, r0
+    sub             r5, #2
+loop2_hps_\filterhps\()_\a\()x\b\()_rowext0:
+    vextin8_chroma
+    \filterhps
+    vdup.32         q8, r12
+    vsub.s32        q6, q8
+    vsub.s32        q7, q8
+    vmovn.u32       d0, q6
+    vmovn.u32       d1, q7
+    vst1.s16        {q0}, [r2]!
+    subs            r7, #1
+    sub             r5, #8
+    bne             loop2_hps_\filterhps\()_\a\()x\b\()_rowext0
+    subs            r6, #1
+    add             r0, r1
+    add             r2, r3
+    bne             loop1_hps_\filterhps\()_\a\()x\b\()_rowext0
+    b               10f
+18:
+loop3_hps_\filterhps\()_\a\()x\b\()_rowext1:
+    mov             r7, #\a
+    lsr             r7, #3
+    mov             r5, r0
+    sub             r5, #2
+loop4_hps_\filterhps\()_\a\()x\b\()_rowext1:
+    vextin8_chroma
+    \filterhps
+    vdup.32         q8, r12
+    vsub.s32        q6, q8
+    vsub.s32        q7, q8
+    vmovn.u32       d0, q6
+    vmovn.u32       d1, q7
+    vst1.s16        {q0}, [r2]!
+    subs            r7, #1
+    sub             r5, #8
+    bne             loop4_hps_\filterhps\()_\a\()x\b\()_rowext1
+    subs            r6, #1
+    add             r0, r1
+    add             r2, r3
+    bne             loop3_hps_\filterhps\()_\a\()x\b\()_rowext1
+10:
+    vpop            {q4-q7}
+.endm
+
+.macro FILTER_CHROMA_HPS_4 w h filterhps
+    cmp             r9, #0
+    beq             19f
+    cmp             r9, #1
+    beq             20f
+19:
+loop4_hps_\filterhps\()_\w\()x\h\()_rowext0:
+    mov             r5, r0
+    sub             r5, #2
+    vextin8_chroma
+    \filterhps
+    vdup.32         q8, r12
+    vsub.s32        q6, q8
+    vmovn.u32       d0, q6
+    vst1.s16        {d0}, [r2]!
+    sub             r5, #8
+    subs            r6, #1
+    add             r0, r1
+    add             r2, r3
+    bne             loop4_hps_\filterhps\()_\w\()x\h\()_rowext0
+    b               21f
+20:
+loop5_hps_\filterhps\()_\w\()x\h\()_rowext1:
+    mov             r5, r0
+    sub             r5, #2
+    vextin8_chroma
+    \filterhps
+    vdup.32         q8, r12
+    vsub.s32        q6, q8
+    vmovn.u32       d0, q6
+    vst1.s16        {d0}, [r2]!
+    sub             r5, #8
+    subs            r6, #1
+    add             r0, r1
+    add             r2, r3
+    bne             loop5_hps_\filterhps\()_\w\()x\h\()_rowext1
+21:
+.endm
+
+.macro FILTER_CHROMA_HPS_12 w h filterhpp
+    cmp             r9, #0
+    beq             22f
+    cmp             r9, #1
+    beq             23f
+22:
+loop12_hps_\filterhpp\()_\w\()x\h\()_rowext0:
+    mov             r5, r0
+    sub             r5, #2
+    vextin8_chroma
+    \filterhpp
+    vdup.32         q8, r12
+    vsub.s32        q6, q8
+    vsub.s32        q7, q8
+    vmovn.u32       d0, q6
+    vmovn.u32       d1, q7
+    vst1.s16        {q0}, [r2]!
+    sub             r5, #8
+
+    vextin8_chroma
+    \filterhpp
+    vdup.32         q8, r12
+    vsub.s32        q6, q8
+    vmovn.u32       d0, q6
+    vst1.s16        {d0}, [r2]!
+    add             r2, r3
+    subs            r6, #1
+    add             r0, r1
+    bne             loop12_hps_\filterhpp\()_\w\()x\h\()_rowext0
+    b               24f
+23:
+loop12_hps_\filterhpp\()_\w\()x\h\()_rowext1:
+    mov             r5, r0
+    sub             r5, #2
+    vextin8_chroma
+    \filterhpp
+    vdup.32         q8, r12
+    vsub.s32        q6, q8
+    vsub.s32        q7, q8
+    vmovn.u32       d0, q6
+    vmovn.u32       d1, q7
+    vst1.s16        {q0}, [r2]!
+    sub             r5, #8
+
+    vextin8_chroma
+    \filterhpp
+    vdup.32         q8, r12
+    vsub.s32        q6, q8
+    vmovn.u32       d0, q6
+    vst1.s16        {d0}, [r2]!
+    add             r2, r3
+    subs            r6, #1
+    add             r0, r1
+    bne             loop12_hps_\filterhpp\()_\w\()x\h\()_rowext1
+24:
+.endm
+
+.macro CHROMA_HPS w h
+function x265_interp_4tap_horiz_ps_\w\()x\h\()_neon
+    push            {r4, r5, r6, r7, r8, r9, r10}
+    ldr             r4, [sp, #28]
+    ldr             r9, [sp, #32]
+    mov             r10, #\h
+    cmp             r9, #0
+    beq             9f
+    sub             r0, r1
+    add             r10, #3
+9:
+    cmp             r4, #0
+    beq              0f
+    cmp             r4, #1
+    beq              1f
+    cmp             r4, #2
+    beq              2f
+    cmp             r4, #3
+    beq              3f
+    cmp             r4, #4
+    beq              4f
+    cmp             r4, #5
+    beq              5f
+    cmp             r4, #6
+    beq              6f
+    cmp             r4, #7
+    beq              7f
+0:
+    FILTER_CHROMA_HPS  \w \h qpel_filter_chroma_0_32b
+    b            8f
+1:
+    FILTER_CHROMA_HPS  \w \h qpel_filter_chroma_1_32b
+    b            8f
+2:
+    FILTER_CHROMA_HPS  \w \h qpel_filter_chroma_2_32b
+    b            8f
+3:
+    FILTER_CHROMA_HPS  \w \h qpel_filter_chroma_3_32b
+    b            8f
+4:
+    FILTER_CHROMA_HPS  \w \h qpel_filter_chroma_4_32b
+    b            8f
+5:
+    FILTER_CHROMA_HPS  \w \h qpel_filter_chroma_5_32b
+    b            8f
+6:
+    FILTER_CHROMA_HPS  \w \h qpel_filter_chroma_6_32b
+    b            8f
+7:
+    FILTER_CHROMA_HPS  \w \h qpel_filter_chroma_7_32b
+
+8:
+    pop             {r4, r5, r6, r7, r8, r9, r10}
+    bx              lr
+endfunc
+.endm
+
+CHROMA_HPS 4 2
+CHROMA_HPS 4 4
+CHROMA_HPS 4 8
+CHROMA_HPS 4 16
+CHROMA_HPS 4 32
+CHROMA_HPS 8 2
+CHROMA_HPS 8 4
+CHROMA_HPS 8 6
+CHROMA_HPS 8 8
+CHROMA_HPS 8 12
+CHROMA_HPS 8 16
+CHROMA_HPS 8 32
+CHROMA_HPS 8 64
+CHROMA_HPS 12 16
+CHROMA_HPS 12 32
+CHROMA_HPS 16 4
+CHROMA_HPS 16 8
+CHROMA_HPS 16 12
+CHROMA_HPS 16 16
+CHROMA_HPS 16 24
+CHROMA_HPS 16 32
+CHROMA_HPS 16 64
+CHROMA_HPS 24 32
+CHROMA_HPS 24 64
+CHROMA_HPS 32 8
+CHROMA_HPS 32 16
+CHROMA_HPS 32 24
+CHROMA_HPS 32 32
+CHROMA_HPS 32 48
+CHROMA_HPS 32 64
+CHROMA_HPS 48 64
+CHROMA_HPS 64 16
+CHROMA_HPS 64 32
+CHROMA_HPS 64 48
+CHROMA_HPS 64 64
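Note that the chroma hps prologue differs from the luma one only in the row extension: with isRowExt set it backs up a single row and emits h + 3 rows (sub r0, r1 / add r10, #3), versus 3 rows and h + 7 on the 8-tap luma path. A scalar sketch of that adjustment:

    #include <stdint.h>

    /* Scalar model of the CHROMA_HPS row extension when isRowExt != 0:
     * a 4-tap filter needs taps/2 - 1 = 1 row above the block and
     * taps - 1 = 3 extra rows in total. */
    static void chroma_hps_row_ext(const uint8_t** src, intptr_t srcStride,
                                   int h, int* rows)
    {
        *src -= srcStride;
        *rows = h + 3;
    }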
x265_2.0.tar.gz/source/common/arm/ipfilter8.h Added

@@ -0,0 +1,342 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_IPFILTER8_ARM_H
+#define X265_IPFILTER8_ARM_H
+
+void x265_filterPixelToShort_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+
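As a usage sketch for the p2s declarations above (a hypothetical call site, assuming an 8-bit build where pixel is uint8_t):

    #include <stdint.h>

    /* Hypothetical wrapper: convert an 8x8 pixel block into x265's 16-bit
     * intermediate domain with the NEON primitive declared above. */
    static void p2s_8x8_example(const uint8_t* src, intptr_t srcStride,
                                int16_t* dst, intptr_t dstStride)
    {
        x265_filterPixelToShort_8x8_neon(src, srcStride, dst, dstStride);
    }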
+void x265_interp_8tap_vert_pp_4x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_4x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_4x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_8x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_8x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_8x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_8x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_16x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_16x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_16x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_16x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_16x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_16x12_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_32x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_32x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_32x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_32x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_32x24_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_64x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_64x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_64x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_64x48_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_24x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_48x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_12x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+
+void x265_interp_8tap_vert_sp_4x4_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_4x8_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_4x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_8x4_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_8x8_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_8x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_8x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_16x4_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_16x8_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_16x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_16x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_16x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_16x12_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_32x8_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_32x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_32x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_32x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_32x24_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_64x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_64x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_64x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_64x48_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_24x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_48x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_sp_12x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+
+void x265_interp_8tap_vert_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+
+void x265_interp_4tap_vert_pp_8x2_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_8x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_8x6_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_8x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_8x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_8x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_8x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_8x12_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_16x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_16x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_16x12_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_16x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_16x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_16x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_16x24_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_32x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_32x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_32x24_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_32x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_32x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_32x48_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_24x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_24x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_48x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_64x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_64x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_64x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_pp_64x48_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+
+void x265_interp_4tap_vert_ps_8x2_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_8x6_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_8x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_8x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_16x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_32x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_24x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+
+void x265_interp_4tap_vert_sp_8x2_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_8x4_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_8x6_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_8x8_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_8x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_8x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_8x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_8x12_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_16x4_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_16x8_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_16x12_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_16x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_16x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_16x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_16x24_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_32x8_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_32x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_32x24_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_32x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_32x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_32x48_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_24x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_24x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_48x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_64x16_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_64x32_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_64x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_sp_64x48_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+
+void x265_interp_horiz_pp_4x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_4x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_4x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_8x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_8x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_8x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_8x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_12x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_16x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_16x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_16x12_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_16x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_16x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_16x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_24x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_32x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_32x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_32x24_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_32x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_32x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_48x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_64x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_64x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_64x48_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_64x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+
+void x265_interp_horiz_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
264
+void x265_interp_horiz_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
265
+void x265_interp_horiz_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
266
+void x265_interp_horiz_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
267
+void x265_interp_horiz_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
268
+void x265_interp_horiz_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
269
+void x265_interp_horiz_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
270
+void x265_interp_horiz_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
271
+
272
+void x265_interp_4tap_horiz_pp_4x2_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
273
+void x265_interp_4tap_horiz_pp_4x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
274
+void x265_interp_4tap_horiz_pp_4x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
275
+void x265_interp_4tap_horiz_pp_4x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
276
+void x265_interp_4tap_horiz_pp_4x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
277
+void x265_interp_4tap_horiz_pp_8x2_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
278
+void x265_interp_4tap_horiz_pp_8x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
279
+void x265_interp_4tap_horiz_pp_8x6_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
280
+void x265_interp_4tap_horiz_pp_8x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
281
+void x265_interp_4tap_horiz_pp_8x12_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
282
+void x265_interp_4tap_horiz_pp_8x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
283
+void x265_interp_4tap_horiz_pp_8x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
284
+void x265_interp_4tap_horiz_pp_8x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
285
+void x265_interp_4tap_horiz_pp_12x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
286
+void x265_interp_4tap_horiz_pp_12x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
287
+void x265_interp_4tap_horiz_pp_16x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
288
+void x265_interp_4tap_horiz_pp_16x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
289
+void x265_interp_4tap_horiz_pp_16x12_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
290
+void x265_interp_4tap_horiz_pp_16x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
291
+void x265_interp_4tap_horiz_pp_16x24_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
292
+void x265_interp_4tap_horiz_pp_16x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
293
+void x265_interp_4tap_horiz_pp_16x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
294
+void x265_interp_4tap_horiz_pp_24x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
295
+void x265_interp_4tap_horiz_pp_24x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
296
+void x265_interp_4tap_horiz_pp_32x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
297
+void x265_interp_4tap_horiz_pp_32x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
298
+void x265_interp_4tap_horiz_pp_32x24_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
299
+void x265_interp_4tap_horiz_pp_32x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
300
+void x265_interp_4tap_horiz_pp_32x48_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
301
+void x265_interp_4tap_horiz_pp_32x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
302
+void x265_interp_4tap_horiz_pp_48x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
303
+void x265_interp_4tap_horiz_pp_64x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
304
+void x265_interp_4tap_horiz_pp_64x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
305
+void x265_interp_4tap_horiz_pp_64x48_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
306
+void x265_interp_4tap_horiz_pp_64x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
307
+
308
+void x265_interp_4tap_horiz_ps_4x2_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
309
+void x265_interp_4tap_horiz_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
310
+void x265_interp_4tap_horiz_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
311
+void x265_interp_4tap_horiz_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
312
+void x265_interp_4tap_horiz_ps_4x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
313
+void x265_interp_4tap_horiz_ps_8x2_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
314
+void x265_interp_4tap_horiz_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
315
+void x265_interp_4tap_horiz_ps_8x6_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
316
+void x265_interp_4tap_horiz_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
317
+void x265_interp_4tap_horiz_ps_8x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
318
+void x265_interp_4tap_horiz_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
319
+void x265_interp_4tap_horiz_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
320
+void x265_interp_4tap_horiz_ps_8x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
321
+void x265_interp_4tap_horiz_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
322
+void x265_interp_4tap_horiz_ps_12x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
323
+void x265_interp_4tap_horiz_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
324
+void x265_interp_4tap_horiz_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
325
+void x265_interp_4tap_horiz_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
326
+void x265_interp_4tap_horiz_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
327
+void x265_interp_4tap_horiz_ps_16x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
328
+void x265_interp_4tap_horiz_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
329
+void x265_interp_4tap_horiz_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
330
+void x265_interp_4tap_horiz_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
331
+void x265_interp_4tap_horiz_ps_24x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
332
+void x265_interp_4tap_horiz_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
333
+void x265_interp_4tap_horiz_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
334
+void x265_interp_4tap_horiz_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
335
+void x265_interp_4tap_horiz_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
336
+void x265_interp_4tap_horiz_ps_32x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
337
+void x265_interp_4tap_horiz_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
338
+void x265_interp_4tap_horiz_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
339
+void x265_interp_4tap_horiz_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
340
+void x265_interp_4tap_horiz_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
341
+void x265_interp_4tap_horiz_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
342
+void x265_interp_4tap_horiz_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
343
+#endif // ifndef X265_IPFILTER8_ARM_H
344
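These prototypes declare the NEON entry points for x265's sub-pel interpolation primitives, one specialization per block size. The suffixes encode the data flow: pp takes pixels in and writes pixels out, ps keeps the 16-bit intermediate filter output (isRowExt requests the extra rows a following vertical pass needs), and sp converts 16-bit intermediates back to pixels; the 4tap variants serve chroma. As a rough scalar reference, a 4-tap horizontal pp filter reduces to the sketch below (C; the coefficient table follows the HEVC chroma filter, and the function name and the width/height parameters are illustrative, not x265's exact C primitive):

    #include <stdint.h>

    /* HEVC 4-tap chroma filter, one row per eighth-pel phase. */
    static const int16_t filter4[8][4] = {
        {  0, 64,  0,  0 }, { -2, 58, 10, -2 }, { -4, 54, 16, -2 },
        { -6, 46, 28, -4 }, { -4, 36, 36, -4 }, { -4, 28, 46, -6 },
        { -2, 16, 54, -4 }, { -2, 10, 58, -2 },
    };

    static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

    static void interp_4tap_horiz_pp_c(const uint8_t* src, intptr_t srcStride,
                                       uint8_t* dst, intptr_t dstStride,
                                       int coeffIdx, int width, int height)
    {
        const int16_t* c = filter4[coeffIdx];
        src -= 1;                                  /* taps span src[x-1..x+2] */
        for (int y = 0; y < height; y++, src += srcStride, dst += dstStride)
            for (int x = 0; x < width; x++)
            {
                int sum = c[0] * src[x]     + c[1] * src[x + 1]
                        + c[2] * src[x + 2] + c[3] * src[x + 3];
                dst[x] = clip_u8((sum + 32) >> 6); /* 6-bit coefficient precision */
            }
    }

The NEON specializations bake width and height into the symbol name so the dispatch table can jump straight to a fully unrolled kernel.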
x265_2.0.tar.gz/source/common/arm/loopfilter.h Added
31
 
1
@@ -0,0 +1,29 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
6
+ *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
7
+ *          Min Chen <chenm003@163.com>
8
+ *
9
+ * This program is free software; you can redistribute it and/or modify
10
+ * it under the terms of the GNU General Public License as published by
11
+ * the Free Software Foundation; either version 2 of the License, or
12
+ * (at your option) any later version.
13
+ *
14
+ * This program is distributed in the hope that it will be useful,
15
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
+ * GNU General Public License for more details.
18
+ *
19
+ * You should have received a copy of the GNU General Public License
20
+ * along with this program; if not, write to the Free Software
21
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22
+ *
23
+ * This program is also available under a commercial proprietary license.
24
+ * For more information, contact us at license @ x265.com.
25
+ *****************************************************************************/
26
+
27
+#ifndef X265_LOOPFILTER_ARM_H
28
+#define X265_LOOPFILTER_ARM_H
29
+
30
+#endif // ifndef X265_LOOPFILTER_ARM_H
31
x265_2.0.tar.gz/source/common/arm/mc-a.S Added
1174
 
1
@@ -0,0 +1,1172 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
6
+ *          Radhakrishnan <radhakrishnan@multicorewareinc.com>
7
+ *
8
+ * This program is free software; you can redistribute it and/or modify
9
+ * it under the terms of the GNU General Public License as published by
10
+ * the Free Software Foundation; either version 2 of the License, or
11
+ * (at your option) any later version.
12
+ *
13
+ * This program is distributed in the hope that it will be useful,
14
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
+ * GNU General Public License for more details.
17
+ *
18
+ * You should have received a copy of the GNU General Public License
19
+ * along with this program; if not, write to the Free Software
20
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21
+ *
22
+ * This program is also available under a commercial proprietary license.
23
+ * For more information, contact us at license @ x265.com.
24
+ *****************************************************************************/
25
+
26
+#include "asm.S"
27
+
28
+.section .rodata
29
+
30
+.align 4
31
+
32
+.text
33
+
34
+/* blockcopy_pp_16x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
35
+ *
36
+ * r0   - dst
37
+ * r1   - dstStride
38
+ * r2   - src
39
+ * r3   - srcStride */
40
+function x265_blockcopy_pp_16x16_neon
41
+.rept 16
42
+    vld1.8          {q0}, [r2]
43
+    vst1.8          {q0}, [r0]
44
+    add             r2, r2, r3
45
+    add             r0, r0, r1
46
+.endr
47
+    bx              lr
48
+endfunc
49
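The blockcopy_pp kernels above and below are fixed-size instances of a plain strided 2-D byte copy; the register comment on the 16x16 version applies to all of them. A scalar equivalent, with the dimensions as parameters instead of baked into the symbol (minimal sketch):

    #include <stdint.h>
    #include <string.h>

    static void blockcopy_pp_c(uint8_t* dst, intptr_t dstStride,
                               const uint8_t* src, intptr_t srcStride,
                               int width, int height)
    {
        for (int y = 0; y < height; y++)
            memcpy(dst + y * dstStride, src + y * srcStride, width);
    }

The NEON versions unroll the row loop with .rept (or a counted loop for tall blocks) and pick the widest access that fits the row: a 32-bit scalar word for width 4, one d register for width 8, one q register for width 16, and so on. Awkward widths are split, e.g. the 12x16 kernel below moves each row as an 8-byte vector plus a 4-byte word, pre-reducing both strides by 8 so the post-indexed accesses still land at the start of the next row.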
+
50
+.macro blockcopy_pp_4xN_neon h
51
+function x265_blockcopy_pp_4x\h\()_neon
52
+.rept \h
53
+    ldr             r12, [r2], r3
54
+    str             r12, [r0], r1
55
+.endr
56
+    bx              lr
57
+endfunc
58
+.endm
59
+
60
+blockcopy_pp_4xN_neon 4
61
+blockcopy_pp_4xN_neon 8
62
+blockcopy_pp_4xN_neon 16
63
+blockcopy_pp_4xN_neon 2
64
+blockcopy_pp_4xN_neon 32
65
+
66
+.macro blockcopy_pp_16xN_neon h
67
+function x265_blockcopy_pp_16x\h\()_neon
68
+.rept \h    
69
+    vld1.8          {q0}, [r2], r3
70
+    vst1.8          {q0}, [r0], r1
71
+.endr
72
+    bx              lr
73
+endfunc
74
+.endm
75
+
76
+blockcopy_pp_16xN_neon 4
77
+blockcopy_pp_16xN_neon 8
78
+blockcopy_pp_16xN_neon 12
79
+blockcopy_pp_16xN_neon 24
80
+
81
+.macro blockcopy_pp_16xN1_neon h i
82
+function x265_blockcopy_pp_16x\h\()_neon
83
+    mov             r12, #\i
84
+loop_16x\h\():
85
+.rept 8
86
+    vld1.8          {q0}, [r2], r3
87
+    vst1.8          {q0}, [r0], r1
88
+.endr
89
+    subs            r12, r12, #1
90
+    bne             loop_16x\h
91
+    bx              lr
92
+endfunc
93
+.endm
94
+
95
+blockcopy_pp_16xN1_neon 32 4
96
+blockcopy_pp_16xN1_neon 64 8
97
+
98
+.macro blockcopy_pp_8xN_neon h
99
+function x265_blockcopy_pp_8x\h\()_neon
100
+.rept \h    
101
+    vld1.8          {d0}, [r2], r3
102
+    vst1.8          {d0}, [r0], r1
103
+.endr
104
+    bx              lr
105
+endfunc
106
+.endm
107
+
108
+blockcopy_pp_8xN_neon 4
109
+blockcopy_pp_8xN_neon 8
110
+blockcopy_pp_8xN_neon 16
111
+blockcopy_pp_8xN_neon 32
112
+blockcopy_pp_8xN_neon 2
113
+blockcopy_pp_8xN_neon 6
114
+blockcopy_pp_8xN_neon 12
115
+
116
+function x265_blockcopy_pp_12x16_neon
117
+    sub             r3, #8
118
+    sub             r1, #8
119
+.rept 16
120
+    vld1.8          {d0}, [r2]!
121
+    ldr             r12, [r2], r3
122
+    vst1.8          {d0}, [r0]!
123
+    str             r12, [r0], r1
124
+.endr
125
+    bx              lr
126
+endfunc
127
+
128
+function x265_blockcopy_pp_24x32_neon
129
+    mov             r12, #4
130
+loop_24x32:
131
+.rept 8
132
+    vld1.8          {d0, d1, d2}, [r2], r3
133
+    vst1.8          {d0, d1, d2}, [r0], r1
134
+.endr
135
+    subs            r12, r12, #1
136
+    bne             loop_24x32
137
+    bx              lr
138
+endfunc
139
+
140
+function x265_blockcopy_pp_32x8_neon
141
+.rept 8
142
+    vld1.8          {q0, q1}, [r2], r3
143
+    vst1.8          {q0, q1}, [r0], r1
144
+.endr 
145
+    bx              lr
146
+endfunc
147
+
148
+.macro blockcopy_pp_32xN_neon h i
149
+function x265_blockcopy_pp_32x\h\()_neon
150
+    mov             r12, #\i
151
+loop_32x\h\():
152
+.rept 8
153
+    vld1.8          {q0, q1}, [r2], r3
154
+    vst1.8          {q0, q1}, [r0], r1
155
+.endr
156
+    subs            r12, r12, #1
157
+    bne             loop_32x\h
158
+    bx              lr
159
+endfunc
160
+.endm
161
+
162
+blockcopy_pp_32xN_neon 16 2
163
+blockcopy_pp_32xN_neon 24 3
164
+blockcopy_pp_32xN_neon 32 4
165
+blockcopy_pp_32xN_neon 64 8
166
+blockcopy_pp_32xN_neon 48 6
167
+
168
+function x265_blockcopy_pp_48x64_neon
169
+    mov             r12, #8
170
+    sub             r3, #32
171
+    sub             r1, #32
172
+loop_48x64:
173
+.rept 8
174
+    vld1.8          {q0, q1}, [r2]!
175
+    vld1.8          {q2}, [r2], r3
176
+    vst1.8          {q0, q1}, [r0]!
177
+    vst1.8          {q2}, [r0], r1
178
+.endr
179
+    subs            r12, r12, #1
180
+    bne             loop_48x64
181
+    bx              lr
182
+endfunc
183
+
184
+.macro blockcopy_pp_64xN_neon h i
185
+function x265_blockcopy_pp_64x\h\()_neon
186
+    mov             r12, #\i
187
+    sub             r3, #32
188
+    sub             r1, #32
189
+loop_64x\h\():
190
+.rept 4
191
+    vld1.8          {q0, q1}, [r2]!
192
+    vld1.8          {q2, q3}, [r2], r3
193
+    vst1.8          {q0, q1}, [r0]!
194
+    vst1.8          {q2, q3}, [r0], r1
195
+.endr
196
+    subs            r12, r12, #1
197
+    bne             loop_64x\h
198
+    bx              lr
199
+endfunc
200
+.endm
201
+
202
+blockcopy_pp_64xN_neon 16 4
203
+blockcopy_pp_64xN_neon 32 8
204
+blockcopy_pp_64xN_neon 48 12
205
+blockcopy_pp_64xN_neon 64 16
206
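A single vld1/vst1 pair here moves at most 32 bytes (two q registers), so the 48- and 64-wide kernels consume each row in post-incremented chunks and subtract the bytes already consumed from both stride registers up front (the sub r3, #32 / sub r1, #32 prologue). The addressing pattern of the 64-wide copies, as a scalar sketch:

    #include <stdint.h>
    #include <string.h>

    static void blockcopy_pp_64xN_c(uint8_t* dst, intptr_t dstStride,
                                    const uint8_t* src, intptr_t srcStride,
                                    int height)
    {
        intptr_t srcStep = srcStride - 32;    /* mirrors: sub r3, #32 */
        intptr_t dstStep = dstStride - 32;    /* mirrors: sub r1, #32 */
        for (int y = 0; y < height; y++)
        {
            memcpy(dst, src, 32); src += 32;      dst += 32;      /* {q0,q1}, [..]!       */
            memcpy(dst, src, 32); src += srcStep; dst += dstStep; /* {q2,q3}, [..], step  */
        }
    }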
+
207
+.macro blockcopy_pp_2xN_neon h
208
+function x265_blockcopy_pp_2x\h\()_neon
209
+.rept \h
210
+    ldrh            r12, [r2], r3
211
+    strh            r12, [r0], r1
212
+.endr
213
+    bx              lr
214
+endfunc
215
+.endm
216
+
217
+blockcopy_pp_2xN_neon 4
218
+blockcopy_pp_2xN_neon 8
219
+blockcopy_pp_2xN_neon 16
220
+
221
+.macro blockcopy_pp_6xN_neon h i
222
+function x265_blockcopy_pp_6x\h\()_neon
223
+    sub             r1, #4
224
+.rept \i
225
+    vld1.8          {d0}, [r2], r3
226
+    vld1.8          {d1}, [r2], r3
227
+    vst1.32         {d0[0]}, [r0]!
228
+    vst1.16         {d0[2]}, [r0], r1
229
+    vst1.32         {d1[0]}, [r0]!
230
+    vst1.16         {d1[2]}, [r0], r1
231
+.endr
232
+    bx              lr
233
+endfunc
234
+.endm
235
+blockcopy_pp_6xN_neon 8 4
236
+blockcopy_pp_6xN_neon 16 8
237
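Widths with no matching vector size are stored lane by lane: the 6-wide kernel writes each row as a 4-byte lane store (vst1.32 {d0[0]}) followed by a 2-byte lane store of bytes 4-5 (vst1.16 {d0[2]}), with the destination stride pre-reduced by 4 to absorb the first post-increment. In scalar terms (sketch of one row):

    #include <stdint.h>
    #include <string.h>

    static void copy_row_6(uint8_t* dst, const uint8_t* src)
    {
        memcpy(dst, src, 4);           /* vst1.32 {d0[0]}, [r0]!    */
        memcpy(dst + 4, src + 4, 2);   /* vst1.16 {d0[2]}, [r0], r1 */
    }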
+
238
+function x265_blockcopy_pp_8x64_neon
239
+    mov             r12, #4
240
+loop_pp_8x64:
241
+    subs            r12, #1
242
+.rept 16
243
+    vld1.8          {d0}, [r2], r3
244
+    vst1.8          {d0}, [r0], r1
245
+.endr
246
+    bne             loop_pp_8x64
247
+    bx              lr
248
+endfunc
249
+
250
+function x265_blockcopy_pp_12x32_neon
251
+    push            {r4}
252
+    sub             r3, #8
253
+    sub             r1, #8
254
+    mov             r12, #4
255
+loop_pp_12x32:
256
+    subs            r12, #1
257
+.rept 8
258
+    vld1.8          {d0}, [r2]!
259
+    ldr             r4, [r2], r3
260
+    vst1.8          {d0}, [r0]!
261
+    str             r4, [r0], r1
262
+.endr
263
+    bne             loop_pp_12x32
264
+    pop            {r4}
265
+    bx              lr
266
+endfunc
267
+
268
+function x265_blockcopy_pp_24x64_neon
269
+    mov             r12, #4
270
+loop_24x64:
271
+.rept 16
272
+    vld1.8          {d0, d1, d2}, [r2], r3
273
+    vst1.8          {d0, d1, d2}, [r0], r1
274
+.endr
275
+    subs            r12, r12, #1
276
+    bne             loop_24x64
277
+    bx              lr
278
+endfunc
279
+
280
+// void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
281
+.macro pixel_avg_pp_4xN_neon h
282
+function x265_pixel_avg_pp_4x\h\()_neon
283
+    push            {r4}
284
+    ldr             r4, [sp, #4]
285
+    ldr             r12, [sp, #8]
286
+.rept \h
287
+    vld1.32         {d0[]}, [r2], r3
288
+    vld1.32         {d1[]}, [r4], r12
289
+    vrhadd.u8       d2, d0, d1
290
+    vst1.32         {d2[0]}, [r0], r1
291
+.endr
292
+    pop             {r4}
293
+    bx              lr
294
+endfunc
295
+.endm
296
+
297
+pixel_avg_pp_4xN_neon 4
298
+pixel_avg_pp_4xN_neon 8
299
+pixel_avg_pp_4xN_neon 16
300
+
301
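Every pixel_avg_pp variant computes a per-pixel rounded mean of two prediction blocks; vrhadd.u8 is exactly (a + b + 1) >> 1 on each byte lane. A scalar sketch (C; the parameterized dimensions are illustrative):

    #include <stdint.h>

    static void pixel_avg_pp_c(uint8_t* dst, intptr_t dstStride,
                               const uint8_t* src0, intptr_t srcStride0,
                               const uint8_t* src1, intptr_t srcStride1,
                               int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (uint8_t)((src0[x] + src1[x] + 1) >> 1);  /* vrhadd.u8 */
            dst += dstStride; src0 += srcStride0; src1 += srcStride1;
        }
    }

The ldr r4/r12 loads in each function fetch src1 and its stride, which arrive on the stack under the AAPCS calling convention after the four register arguments.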
+.macro pixel_avg_pp_8xN_neon h
302
+function x265_pixel_avg_pp_8x\h\()_neon
303
+    push            {r4}
304
+    ldr             r4, [sp, #4]
305
+    ldr             r12, [sp, #8]
306
+.rept \h
307
+    vld1.8          {d0}, [r2], r3
308
+    vld1.8          {d1}, [r4], r12
309
+    vrhadd.u8       d2, d0, d1
310
+    vst1.8          {d2}, [r0], r1
311
+.endr    
312
+    pop             {r4}
313
+    bx              lr
314
+endfunc
315
+.endm
316
+
317
+pixel_avg_pp_8xN_neon 4
318
+pixel_avg_pp_8xN_neon 8
319
+pixel_avg_pp_8xN_neon 16
320
+pixel_avg_pp_8xN_neon 32
321
+
322
+function x265_pixel_avg_pp_12x16_neon
323
+    push            {r4, r6}
324
+    mov             r6, #8
325
+    ldr             r4, [sp, #8]
326
+    ldr             r12, [sp, #12]
327
+    sub             r1, r6
328
+    sub             r3, r6
329
+    sub             r12, r6
330
+.rept 16
331
+    vld1.32         {d0}, [r2]!
332
+    vld1.32         {d1[0]}, [r2], r3
333
+    vld1.32         {d2}, [r4]!
334
+    vld1.32         {d3[0]}, [r4], r12
335
+    vrhadd.u8       d0, d0, d2
336
+    vrhadd.u8       d1, d1, d3
337
+    vst1.8          {d0}, [r0]!
338
+    vst1.32         {d1[0]}, [r0], r1
339
+.endr
340
+    pop            {r4, r6}
341
+    bx              lr
342
+endfunc
343
+
344
+.macro pixel_avg_pp_16xN_neon h
345
+function x265_pixel_avg_pp_16x\h\()_neon
346
+    push            {r4}
347
+    ldr             r4, [sp, #4]
348
+    ldr             r12, [sp, #8]
349
+.rept \h
350
+    vld1.8          {q0}, [r2], r3
351
+    vld1.8          {q1}, [r4], r12
352
+    vrhadd.u8       q2, q0, q1
353
+    vst1.8          {q2}, [r0], r1
354
+.endr    
355
+    pop             {r4}
356
+    bx              lr
357
+endfunc
358
+.endm
359
+
360
+pixel_avg_pp_16xN_neon 4
361
+pixel_avg_pp_16xN_neon 8
362
+pixel_avg_pp_16xN_neon 12
363
+pixel_avg_pp_16xN_neon 16
364
+pixel_avg_pp_16xN_neon 32
365
+
366
+function x265_pixel_avg_pp_16x64_neon
367
+    push            {r4, r6}
368
+    ldr             r4, [sp, #8]
369
+    ldr             r12, [sp, #12]
370
+    mov             r6, #8
371
+lpavg_16x64:
372
+.rept 8
373
+    vld1.8          {q0}, [r2], r3
374
+    vld1.8          {q1}, [r4], r12
375
+    vrhadd.u8       q2, q0, q1
376
+    vst1.8          {q2}, [r0], r1
377
+.endr  
378
+    subs            r6, r6, #1
379
+    bne             lpavg_16x64
380
+    pop             {r4 , r6}
381
+    bx              lr
382
+endfunc
383
+
384
+function x265_pixel_avg_pp_24x32_neon
385
+    push            {r4, r6}
386
+    ldr             r4, [sp, #8]
387
+    ldr             r12, [sp, #12]
388
+    mov             r6, #4
389
+lpavg_24x32:
390
+.rept 8
391
+    vld1.8          {d0, d1, d2}, [r2], r3
392
+    vld1.8          {d3, d4, d5}, [r4], r12
393
+    vrhadd.u8       d0, d0, d3
394
+    vrhadd.u8       d1, d1, d4
395
+    vrhadd.u8       d2, d2, d5
396
+    vst1.8          {d0, d1, d2}, [r0], r1
397
+.endr
398
+    subs            r6, r6, #1
399
+    bne             lpavg_24x32
400
+    pop             {r4, r6}
401
+    bx              lr
402
+endfunc
403
+
404
+.macro pixel_avg_pp_32xN_neon h
405
+function x265_pixel_avg_pp_32x\h\()_neon
406
+    push            {r4}
407
+    ldr             r4, [sp, #4]
408
+    ldr             r12, [sp, #8]
409
+.rept \h
410
+    vld1.8          {q0, q1}, [r2], r3
411
+    vld1.8          {q2, q3}, [r4], r12
412
+    vrhadd.u8       q0, q0, q2
413
+    vrhadd.u8       q1, q1, q3
414
+    vst1.8          {q0, q1}, [r0], r1
415
+.endr    
416
+    pop             {r4}
417
+    bx              lr
418
+endfunc
419
+.endm
420
+
421
+pixel_avg_pp_32xN_neon 8
422
+pixel_avg_pp_32xN_neon 16
423
+pixel_avg_pp_32xN_neon 24
424
+
425
+.macro pixel_avg_pp_32xN1_neon h i
426
+function x265_pixel_avg_pp_32x\h\()_neon
427
+    push            {r4, r6}
428
+    ldr             r4, [sp, #8]
429
+    ldr             r12, [sp, #12]
430
+    mov             r6, #\i
431
+lpavg_32x\h\():
432
+.rept 8
433
+    vld1.8          {q0, q1}, [r2], r3
434
+    vld1.8          {q2, q3}, [r4], r12
435
+    vrhadd.u8       q0, q0, q2
436
+    vrhadd.u8       q1, q1, q3
437
+    vst1.8          {q0, q1}, [r0], r1
438
+.endr  
439
+    subs            r6, r6, #1
440
+    bne             lpavg_32x\h
441
+    pop             {r4, r6}
442
+    bx              lr
443
+endfunc
444
+.endm
445
+
446
+pixel_avg_pp_32xN1_neon 32 4 
447
+pixel_avg_pp_32xN1_neon 64 8
448
+
449
+function x265_pixel_avg_pp_48x64_neon
450
+    push            {r4, r6, r7}
451
+    ldr             r4, [sp, #12]
452
+    ldr             r12, [sp, #16]
453
+    mov             r6, #8
454
+    mov             r7, #32
455
+    sub             r1, r7
456
+    sub             r3, r7
457
+    sub             r12, r7
458
+lpavg_48x64:
459
+.rept 8
460
+    vld1.8          {q0, q1}, [r2]!
461
+    vld1.8          {q2}, [r2], r3
462
+    vld1.8          {q8, q9}, [r4]!
463
+    vld1.8          {q10}, [r4], r12
464
+    vrhadd.u8       q0, q0, q8
465
+    vrhadd.u8       q1, q1, q9
466
+    vrhadd.u8       q2, q2, q10
467
+    vst1.8          {q0, q1}, [r0]!
468
+    vst1.8          {q2}, [r0], r1
469
+.endr
470
+    subs            r6, r6, #1
471
+    bne             lpavg_48x64
472
+    pop             {r4, r6, r7}
473
+    bx              lr
474
+endfunc
475
+
476
+.macro pixel_avg_pp_64xN_neon h i
477
+function x265_pixel_avg_pp_64x\h\()_neon
478
+    push            {r4, r6, r7}
479
+    ldr             r4, [sp, #12]
480
+    ldr             r12, [sp, #16]
481
+    mov             r7, #32
482
+    mov             r6, #\i
483
+    sub             r3, r7
484
+    sub             r12, r7
485
+    sub             r1, r7
486
+lpavg_64x\h\():
487
+.rept 4
488
+    vld1.8          {q0, q1}, [r2]!
489
+    vld1.8          {q2, q3}, [r2], r3
490
+    vld1.8          {q8, q9}, [r4]!
491
+    vld1.8          {q10, q11}, [r4], r12
492
+    vrhadd.u8       q0, q0, q8
493
+    vrhadd.u8       q1, q1, q9
494
+    vrhadd.u8       q2, q2, q10
495
+    vrhadd.u8       q3, q3, q11
496
+    vst1.8          {q0, q1}, [r0]!
497
+    vst1.8          {q2, q3}, [r0], r1
498
+.endr
499
+    subs            r6, r6, #1
500
+    bne             lpavg_64x\h
501
+    pop             {r4, r6, r7}
502
+    bx              lr
503
+endfunc
504
+.endm
505
+
506
+pixel_avg_pp_64xN_neon 16 4
507
+pixel_avg_pp_64xN_neon 32 8
508
+pixel_avg_pp_64xN_neon 48 12
509
+pixel_avg_pp_64xN_neon 64 16
510
+
511
+// void x265_cpy2Dto1D_shr_4x4_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
512
+function x265_cpy2Dto1D_shr_4x4_neon
513
+    add             r2, r2
514
+    vdup.16         q0, r3
515
+    vceq.s16        q1, q1
516
+    vshl.s16        q1, q0
517
+    vsri.s16        q1, #1
518
+    vneg.s16        q0, q0
519
+    vld1.s16        {d4}, [r1], r2
520
+    vld1.s16        {d5}, [r1], r2
521
+    vld1.s16        {d6}, [r1], r2
522
+    vld1.s16        {d7}, [r1], r2
523
+    vsub.s16        q2, q1
524
+    vsub.s16        q3, q1
525
+    vshl.s16        q2, q0
526
+    vshl.s16        q3, q0
527
+    vst1.16         {q2-q3}, [r0]
528
+    bx              lr
529
+endfunc
530
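cpy2Dto1D_shr copies a strided 16-bit block into a contiguous buffer while applying a rounded arithmetic right shift. The prologue builds everything in registers: add r2, r2 doubles the stride from int16 units to bytes, the vceq/vshl/vsri sequence synthesizes -(1 << (shift - 1)) so the following vsub adds the rounding term, and vneg makes the shift count negative so vshl performs a right shift. Scalar sketch (C, assuming shift >= 1 as the callers here guarantee):

    #include <stdint.h>

    static void cpy2Dto1D_shr_c(int16_t* dst, const int16_t* src,
                                intptr_t srcStride, int shift, int size)
    {
        int round = 1 << (shift - 1);
        for (int y = 0; y < size; y++, src += srcStride, dst += size)
            for (int x = 0; x < size; x++)
                dst[x] = (int16_t)((src[x] + round) >> shift);
    }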
+
531
+function x265_cpy2Dto1D_shr_8x8_neon
532
+    add             r2, r2
533
+    vdup.16         q0, r3
534
+    vceq.s16        q1, q1
535
+    vshl.s16        q1, q0
536
+    vsri.s16        q1, #1
537
+    vneg.s16        q0, q0
538
+.rept 4
539
+    vld1.s16        {q2}, [r1], r2
540
+    vld1.s16        {q3}, [r1], r2
541
+    vsub.s16        q2, q1
542
+    vsub.s16        q3, q1
543
+    vshl.s16        q2, q0
544
+    vshl.s16        q3, q0
545
+    vst1.16         {q2-q3}, [r0]!
546
+.endr
547
+    bx              lr
548
+endfunc
549
+
550
+function x265_cpy2Dto1D_shr_16x16_neon
551
+    add             r2, r2
552
+    vdup.16         q0, r3
553
+    vceq.s16        q1, q1
554
+    vshl.s16        q1, q0
555
+    vsri.s16        q1, #1
556
+    vneg.s16        q0, q0
557
+    mov             r3, #4
558
+.loop_cpy2Dto1D_shr_16:
559
+    subs            r3, #1
560
+.rept 4
561
+    vld1.s16        {q2-q3}, [r1], r2
562
+    vsub.s16        q2, q1
563
+    vsub.s16        q3, q1
564
+    vshl.s16        q2, q0
565
+    vshl.s16        q3, q0
566
+    vst1.16         {q2-q3}, [r0]!
567
+.endr
568
+    bgt             .loop_cpy2Dto1D_shr_16
569
+    bx              lr
570
+endfunc
571
+
572
+function x265_cpy2Dto1D_shr_32x32_neon
573
+    add             r2, r2
574
+    sub             r2, #32
575
+    vdup.16         q0, r3
576
+    vceq.s16        q1, q1
577
+    vshl.s16        q1, q0
578
+    vsri.s16        q1, #1
579
+    vneg.s16        q0, q0
580
+    mov             r3, #16
581
+.loop_cpy2Dto1D_shr_32:
582
+    subs            r3, #1
583
+.rept 2
584
+    vld1.s16        {q2-q3}, [r1]!
585
+    vld1.s16        {q8-q9}, [r1], r2
586
+    vsub.s16        q2, q1
587
+    vsub.s16        q3, q1
588
+    vsub.s16        q8, q1
589
+    vsub.s16        q9, q1
590
+    vshl.s16        q2, q0
591
+    vshl.s16        q3, q0
592
+    vshl.s16        q8, q0
593
+    vshl.s16        q9, q0
594
+    vst1.16         {q2-q3}, [r0]!
595
+    vst1.16         {q8-q9}, [r0]!
596
+.endr
597
+    bgt             .loop_cpy2Dto1D_shr_32
598
+    bx              lr
599
+endfunc
600
+
601
+// void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
602
+.macro addAvg_8xN h i
603
+function x265_addAvg_8x\h\()_neon
604
+    push            {r4, r5, r6}
605
+    ldr             r4, [sp, #12]
606
+    ldr             r5, [sp, #16]
607
+    lsl             r3, #1
608
+    lsl             r4, #1
609
+    mov             r12, #\i
610
+    vmov.i16        d0, #16448
611
+
612
+loop_addavg_8x\h:
613
+    subs            r12, #1
614
+    vld1.16         {q1}, [r0], r3        // src1
615
+    vld1.16         {q2}, [r1], r4        // src2
616
+    vld1.16         {q10}, [r0], r3        // src1
617
+    vld1.16         {q11}, [r1], r4        // src2
618
+
619
+    vadd.s16        q1, q2
620
+    vaddl.s16       q8, d2, d0
621
+    vaddl.s16       q9, d3, d0
622
+    vadd.s16        q10, q11
623
+    vaddl.s16       q1, d20, d0
624
+    vaddl.s16       q2, d21, d0
625
+
626
+    vshrn.s32       d20, q8, #7
627
+    vshrn.s32       d21, q9, #7
628
+    vshrn.s32       d22, q1, #7
629
+    vshrn.s32       d23, q2, #7
630
+
631
+    vqmovun.s16     d2, q10
632
+    vqmovun.s16     d3, q11
633
+    vst1.8          {d2}, [r2], r5
634
+    vst1.8          {d3}, [r2], r5
635
+
636
+    bne             loop_addavg_8x\h
637
+    pop             {r4, r5, r6}
638
+    bx              lr
639
+endfunc
640
+.endm
641
+
642
+addAvg_8xN 4 2
643
+addAvg_8xN 8 4
644
+addAvg_8xN 16 8
645
+addAvg_8xN 32 16
646
+addAvg_8xN 2 1
647
+addAvg_8xN 6 3
648
+addAvg_8xN 12 6
649
+addAvg_8xN 64 32
650
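addAvg folds two 16-bit weighted-prediction intermediates back to pixels. The lsl r3/r4 prologue converts the int16 strides to bytes, and the bias loaded into d0, #16448, appears to be 2 * 8192 + 64: twice the intermediate-precision offset (IF_INTERNAL_OFFS in an 8-bit build) plus the rounding term for the final >> 7. Under that reading, the scalar equivalent is (hedged sketch; the constants assume 8-bit depth):

    #include <stdint.h>

    static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

    static void addAvg_c(const int16_t* src0, const int16_t* src1, uint8_t* dst,
                         intptr_t src0Stride, intptr_t src1Stride,
                         intptr_t dstStride, int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = clip_u8((src0[x] + src1[x] + 16448) >> 7);  /* vqmovun clamps */
            src0 += src0Stride; src1 += src1Stride; dst += dstStride;
        }
    }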
+
651
+function x265_addAvg_4x4_neon
652
+    push            {r4, r5, r6}
653
+    ldr             r4, [sp, #12]
654
+    ldr             r5, [sp, #16]
655
+    lsl             r3, #1
656
+    lsl             r4, #1
657
+    vmov.i16        d0, #16448
658
+
659
+.rept 2
660
+    vld1.16         {d2}, [r0], r3        // src1
661
+    vld1.16         {d4}, [r0], r3
662
+    vld1.16         {d3}, [r1], r4        // src2
663
+    vld1.16         {d5}, [r1], r4
664
+
665
+    vadd.s16        d2, d3
666
+    vadd.s16        d4, d5
667
+    vaddl.s16       q8, d2, d0
668
+    vaddl.s16       q9, d4, d0
669
+    vshrn.s32       d20, q8, #7
670
+    vshrn.s32       d21, q9, #7
671
+    vqmovun.s16     d2, q10
672
+
673
+    vst1.32         {d2[0]}, [r2], r5
674
+    vst1.32         {d2[1]}, [r2], r5
675
+.endr
676
+    pop             {r4, r5, r6}
677
+    bx              lr
678
+endfunc
679
+
680
+.macro addAvg_4xN h i
681
+function x265_addAvg_4x\h\()_neon
682
+    push            {r4, r5, r6}
683
+    ldr             r4, [sp, #12]
684
+    ldr             r5, [sp, #16]
685
+    lsl             r3, #1
686
+    lsl             r4, #1
687
+    mov             r12, #\i
688
+    vmov.i16        d0, #16448
689
+
690
+loop_addavg_4x\h\():
691
+    subs            r12, #1
692
+    vld1.16         {d2}, [r0], r3        // src1
693
+    vld1.16         {d4}, [r0], r3
694
+    vld1.16         {d3}, [r1], r4        // src2
695
+    vld1.16         {d5}, [r1], r4
696
+
697
+    vadd.s16        d2, d3
698
+    vadd.s16        d4, d5
699
+    vaddl.s16       q8, d2, d0
700
+    vaddl.s16       q9, d4, d0
701
+    vshrn.s32       d20, q8, #7
702
+    vshrn.s32       d21, q9, #7
703
+    vqmovun.s16     d2, q10
704
+
705
+    vst1.32         {d2[0]}, [r2], r5
706
+    vst1.32         {d2[1]}, [r2], r5
707
+    bne             loop_addavg_4x\h
708
+    pop             {r4, r5, r6}
709
+    bx              lr
710
+endfunc
711
+.endm
712
+
713
+addAvg_4xN 8 4
714
+addAvg_4xN 16 8
715
+addAvg_4xN 2 1
716
+addAvg_4xN 32 16
717
+
718
+.macro addAvg_6xN h i
719
+function x265_addAvg_6x\h\()_neon
720
+    push            {r4, r5, r6}
721
+    ldr             r4, [sp, #12]
722
+    ldr             r5, [sp, #16]
723
+    lsl             r3, #1
724
+    lsl             r4, #1
725
+    sub             r5, #4
726
+    mov             r12, #\i
727
+    vmov.i16        d0, #16448
728
+
729
+loop_addavg_6x\h:
730
+    subs            r12, #1
731
+    vld1.16         {q1}, [r0], r3        // src1
732
+    vld1.16         {q2}, [r1], r4        // src2
733
+    vld1.16         {q10}, [r0], r3        // src1
734
+    vld1.16         {q11}, [r1], r4        // src2
735
+
736
+    vadd.s16        q1, q2
737
+    vaddl.s16       q8, d2, d0
738
+    vaddl.s16       q9, d3, d0
739
+    vadd.s16        q10, q11
740
+    vaddl.s16       q1, d20, d0
741
+    vaddl.s16       q2, d21, d0
742
+
743
+    vshrn.s32       d20, q8, #7
744
+    vshrn.s32       d21, q9, #7
745
+    vshrn.s32       d22, q1, #7
746
+    vshrn.s32       d23, q2, #7
747
+
748
+    vqmovun.s16     d2, q10
749
+    vqmovun.s16     d3, q11
750
+    vst1.32         {d2[0]}, [r2]!
751
+    vst1.16         {d2[2]}, [r2], r5
752
+    vst1.32         {d3[0]}, [r2]!
753
+    vst1.16         {d3[2]}, [r2], r5
754
+
755
+    bne             loop_addavg_6x\h
756
+    pop             {r4, r5, r6}
757
+    bx              lr
758
+endfunc
759
+.endm
760
+
761
+addAvg_6xN 8 4
762
+addAvg_6xN 16 8
763
+
764
+function x265_addAvg_12x16_neon
765
+    push            {r4, r5, r6}
766
+    ldr             r4, [sp, #12]
767
+    ldr             r5, [sp, #16]
768
+    lsl             r3, #1
769
+    lsl             r4, #1
770
+    sub             r5, #8
771
+    mov             r12, #16
772
+    vmov.i16        d0, #16448
773
+
774
+loop_addAvg_12X16:
775
+    subs            r12, #1
776
+    vld1.16         {d2, d3, d4}, [r0], r3
777
+    vld1.16         {d16, d17, d18}, [r1], r4
778
+
779
+    vadd.s16        q1, q8
780
+    vaddl.s16       q11, d2, d0
781
+    vaddl.s16       q10, d3, d0
782
+    vadd.s16        d4, d18
783
+    vaddl.s16       q9, d0, d4
784
+
785
+    vshrn.s32       d2, q11, #7
786
+    vshrn.s32       d3, q10, #7
787
+    vshrn.s32       d4, q9, #7
788
+    veor            d5, d5
789
+
790
+    vqmovun.s16     d6, q1
791
+    vqmovun.s16     d7, q2
792
+    vst1.8          {d6}, [r2]!
793
+    vst1.32         {d7[0]}, [r2], r5
794
+
795
+    bne             loop_addAvg_12X16
796
+    pop             {r4, r5, r6}
797
+    bx              lr
798
+endfunc
799
+
800
+function x265_addAvg_12x32_neon
801
+    push            {r4, r5, r6}
802
+    ldr             r4, [sp, #12]
803
+    ldr             r5, [sp, #16]
804
+    lsl             r3, #1
805
+    lsl             r4, #1
806
+    sub             r5, #8
807
+    mov             r12, #32
808
+    vmov.i16        d0, #16448
809
+
810
+loop_addAvg_12X32:
811
+    subs            r12, #1
812
+    vld1.16         {d2, d3, d4}, [r0], r3
813
+    vld1.16         {d16, d17, d18}, [r1], r4
814
+
815
+    vadd.s16        q1, q8
816
+    vaddl.s16       q11, d2, d0
817
+    vaddl.s16       q10, d3, d0
818
+    vadd.s16        d4, d18
819
+    vaddl.s16       q9, d0, d4
820
+
821
+    vshrn.s32       d2, q11, #7
822
+    vshrn.s32       d3, q10, #7
823
+    vshrn.s32       d4, q9, #7
824
+    veor            d5, d5
825
+
826
+    vqmovun.s16     d6, q1
827
+    vqmovun.s16     d7, q2
828
+    vst1.8          {d6}, [r2]!
829
+    vst1.32         {d7[0]}, [r2], r5
830
+
831
+    bne             loop_addAvg_12X32
832
+    pop             {r4, r5, r6}
833
+    bx              lr
834
+endfunc
835
+
836
+.macro addAvg_16xN h
837
+function x265_addAvg_16x\h\()_neon
838
+    push            {r4, r5, r6}
839
+    ldr             r4, [sp, #12]
840
+    ldr             r5, [sp, #16]
841
+    lsl             r3, #1
842
+    lsl             r4, #1
843
+    mov             r12, #\h
844
+    vmov.i16        d0, #16448
845
+
846
+loop_addavg_16x\h:
847
+    subs            r12, #1
848
+    vld1.16         {q1, q2}, [r0], r3             // src1
849
+    vld1.16         {q8, q9}, [r1], r4             // src2
850
+
851
+    vadd.s16        q1, q8
852
+    vaddl.s16       q10, d2, d0
853
+    vaddl.s16       q11, d3, d0
854
+    vadd.s16        q2, q9
855
+    vaddl.s16       q8, d4, d0
856
+    vaddl.s16       q9, d5, d0
857
+
858
+    vshrn.s32       d2, q10, #7
859
+    vshrn.s32       d3, q11, #7
860
+    vshrn.s32       d4, q8, #7
861
+    vshrn.s32       d5, q9, #7
862
+
863
+    vqmovun.s16     d6, q1
864
+    vqmovun.s16     d7, q2
865
+    vst1.8          {q3}, [r2], r5
866
+
867
+    bne             loop_addavg_16x\h
868
+    pop             {r4, r5, r6}
869
+    bx              lr
870
+endfunc
871
+.endm
872
+
873
+addAvg_16xN 4
874
+addAvg_16xN 8
875
+addAvg_16xN 12
876
+addAvg_16xN 16
877
+addAvg_16xN 32
878
+addAvg_16xN 64
879
+addAvg_16xN 24
880
+
881
+function x265_addAvg_24x32_neon
882
+    push            {r4, r5, r6}
883
+    ldr             r4, [sp, #12]
884
+    ldr             r5, [sp, #16]
885
+    lsl             r3, #1
886
+    lsl             r4, #1
887
+    sub             r3, #32
888
+    sub             r4, #32
889
+    mov             r12, #32
890
+    vmov.i16        d0, #16448
891
+
892
+loop_addavg_24x32:
893
+    subs            r12, #1
894
+    vld1.16         {q1, q2}, [r0]!             // src1
895
+    vld1.16         {q3}, [r0], r3
896
+    vld1.16         {q8, q9}, [r1]!             // src2
897
+    vld1.16         {q10}, [r1], r4
898
+
899
+    vadd.s16        q1, q8
900
+    vaddl.s16       q12, d2, d0
901
+    vaddl.s16       q13, d3, d0
902
+    vadd.s16        q2, q9
903
+    vaddl.s16       q8, d4, d0
904
+    vaddl.s16       q9, d5, d0
905
+    vadd.s16        q3, q10
906
+    vaddl.s16       q10, d6, d0
907
+    vaddl.s16       q11, d7, d0
908
+
909
+    vshrn.s32       d2, q12, #7
910
+    vshrn.s32       d3, q13, #7
911
+    vshrn.s32       d4, q8, #7
912
+    vshrn.s32       d5, q9, #7
913
+    vshrn.s32       d6, q10, #7
914
+    vshrn.s32       d7, q11, #7
915
+
916
+    vqmovun.s16     d16, q1
917
+    vqmovun.s16     d17, q2
918
+    vqmovun.s16     d18, q3
919
+    vst1.8          {d16, d17, d18}, [r2], r5
920
+    bne             loop_addavg_24x32
921
+
922
+    pop             {r4, r5, r6}
923
+    bx              lr
924
+endfunc
925
+
926
+function x265_addAvg_24x64_neon
927
+    push            {r4, r5, r6}
928
+    ldr             r4, [sp, #12]
929
+    ldr             r5, [sp, #16]
930
+    lsl             r3, #1
931
+    lsl             r4, #1
932
+    sub             r3, #32
933
+    sub             r4, #32
934
+    mov             r12, #64
935
+    vmov.i16        d0, #16448
936
+
937
+loop_addavg_24x64:
938
+    subs            r12, #1
939
+    vld1.16         {q1, q2}, [r0]!             // src1
940
+    vld1.16         {q3}, [r0], r3
941
+    vld1.16         {q8, q9}, [r1]!             // src2
942
+    vld1.16         {q10}, [r1], r4
943
+
944
+    vadd.s16        q1, q8
945
+    vaddl.s16       q12, d2, d0
946
+    vaddl.s16       q13, d3, d0
947
+    vadd.s16        q2, q9
948
+    vaddl.s16       q8, d4, d0
949
+    vaddl.s16       q9, d5, d0
950
+    vadd.s16        q3, q10
951
+    vaddl.s16       q10, d6, d0
952
+    vaddl.s16       q11, d7, d0
953
+
954
+    vshrn.s32       d2, q12, #7
955
+    vshrn.s32       d3, q13, #7
956
+    vshrn.s32       d4, q8, #7
957
+    vshrn.s32       d5, q9, #7
958
+    vshrn.s32       d6, q10, #7
959
+    vshrn.s32       d7, q11, #7
960
+
961
+    vqmovun.s16     d16, q1
962
+    vqmovun.s16     d17, q2
963
+    vqmovun.s16     d18, q3
964
+    vst1.8          {d16, d17, d18}, [r2], r5
965
+    bne             loop_addavg_24x64
966
+
967
+    pop             {r4, r5, r6}
968
+    bx              lr
969
+endfunc
970
+
971
+.macro addAvg32 x y z
972
+    mov             r12, #\y
973
+loop_addavg_\x\()x\y\()_\z:
974
+    subs            r12, #1
975
+    vld1.16         {q8, q9}, [r0]!               // src1
976
+    vld1.16         {q10, q11}, [r0], r3
977
+    vld1.16         {q12, q13}, [r1]!             // src2
978
+    vld1.16         {q14, q15}, [r1], r4
979
+
980
+    vadd.s16        q8, q12
981
+    vaddl.s16       q1, d16, d0
982
+    vaddl.s16       q2, d17, d0
983
+    vadd.s16        q9, q13
984
+    vaddl.s16       q12, d18, d0
985
+    vaddl.s16       q13, d19, d0
986
+
987
+    vshrn.s32       d6, q1, #7
988
+    vshrn.s32       d7, q2, #7
989
+    vshrn.s32       d2, q12, #7
990
+    vshrn.s32       d3, q13, #7
991
+    vqmovun.s16     d16, q3
992
+    vqmovun.s16     d17, q1
993
+
994
+    vadd.s16        q10, q14
995
+    vaddl.s16       q1, d20, d0
996
+    vaddl.s16       q2, d21, d0
997
+    vadd.s16        q11, q15
998
+    vaddl.s16       q12, d22, d0
999
+    vaddl.s16       q13, d23, d0
1000
+
1001
+    vshrn.s32       d6, q1, #7
1002
+    vshrn.s32       d7, q2, #7
1003
+    vshrn.s32       d2, q12, #7
1004
+    vshrn.s32       d3, q13, #7
1005
+    vqmovun.s16     d18, q3
1006
+    vqmovun.s16     d19, q1
1007
+    vst1.8          {q8, q9}, [r2], r5
1008
+    bne             loop_addavg_\x\()x\y\()_\z
1009
+.endm
1010
+
1011
+.macro addAvg_32xN h
1012
+function x265_addAvg_32x\h\()_neon
1013
+    push            {r4, r5, r6}
1014
+    ldr             r4, [sp, #12]
1015
+    ldr             r5, [sp, #16]
1016
+    lsl             r3, #1
1017
+    lsl             r4, #1
1018
+    sub             r3, #32
1019
+    sub             r4, #32
1020
+    vmov.i16        d0, #16448
1021
+
1022
+    addAvg32 32 \h 1
1023
+    pop             {r4, r5, r6}
1024
+    bx              lr
1025
+endfunc
1026
+.endm
1027
+
1028
+addAvg_32xN 8
1029
+addAvg_32xN 16
1030
+addAvg_32xN 24
1031
+addAvg_32xN 32
1032
+addAvg_32xN 64
1033
+addAvg_32xN 48
1034
+
1035
+function x265_addAvg_48x64_neon
1036
+    push            {r4, r5, r6, r7, r8}
1037
+    ldr             r4, [sp, #20]
1038
+    ldr             r5, [sp, #24]
1039
+    lsl             r3, #1
1040
+    lsl             r4, #1
1041
+    sub             r3, #32
1042
+    sub             r4, #32
1043
+    vmov.i16        d0, #16448
1044
+    mov             r7, r0
1045
+    mov             r8, r1
1046
+
1047
+    addAvg32 48 64 1                               // 32x64
1048
+    add             r0, r7, #64
1049
+    add             r1, r8, #64
1050
+    sub             r2, r2, r5, lsl #6
1051
+    add             r2, #32
1052
+    add             r3, #32
1053
+    add             r4, #32
1054
+
1055
+    mov             r12, #64
1056
+loop_addavg_16x64_2:                               // 16x64
1057
+    subs            r12, #1
1058
+    vld1.16         {q1, q2}, [r0], r3             // src1
1059
+    vld1.16         {q8, q9}, [r1], r4             // src2
1060
+
1061
+    vadd.s16        q1, q8
1062
+    vaddl.s16       q10, d2, d0
1063
+    vaddl.s16       q11, d3, d0
1064
+    vadd.s16        q2, q9
1065
+    vaddl.s16       q8, d4, d0
1066
+    vaddl.s16       q9, d5, d0
1067
+
1068
+    vshrn.s32       d2, q10, #7
1069
+    vshrn.s32       d3, q11, #7
1070
+    vshrn.s32       d4, q8, #7
1071
+    vshrn.s32       d5, q9, #7
1072
+
1073
+    vqmovun.s16     d6, q1
1074
+    vqmovun.s16     d7, q2
1075
+    vst1.8          {q3}, [r2], r5
1076
+    bne             loop_addavg_16x64_2
1077
+
1078
+    pop             {r4, r5, r6, r7, r8}
1079
+    bx              lr
1080
+endfunc
1081
+
1082
+function x265_addAvg_64x16_neon
1083
+    push            {r4, r5, r6, r7, r8}
1084
+    ldr             r4, [sp, #20]
1085
+    ldr             r5, [sp, #24]
1086
+    lsl             r3, #1
1087
+    lsl             r4, #1
1088
+    sub             r3, #32
1089
+    sub             r4, #32
1090
+    vmov.i16        d0, #16448
1091
+    mov             r7, r0
1092
+    mov             r8, r1
1093
+
1094
+    addAvg32 64 16 1
1095
+    add             r0, r7, #64
1096
+    add             r1, r8, #64
1097
+    sub             r2, r2, r5, lsl #4
1098
+    add             r2, #32
1099
+    addAvg32 64 16 2
1100
+
1101
+    pop             {r4, r5, r6, r7, r8}
1102
+    bx              lr
1103
+endfunc
1104
+
1105
+function x265_addAvg_64x32_neon
1106
+    push            {r4, r5, r6, r7, r8}
1107
+    ldr             r4, [sp, #20]
1108
+    ldr             r5, [sp, #24]
1109
+    lsl             r3, #1
1110
+    lsl             r4, #1
1111
+    sub             r3, #32
1112
+    sub             r4, #32
1113
+    vmov.i16        d0, #16448
1114
+    mov             r7, r0
1115
+    mov             r8, r1
1116
+
1117
+    addAvg32 64 32 1
1118
+    add             r0, r7, #64
1119
+    add             r1, r8, #64
1120
+    sub             r2, r2, r5, lsl #5
1121
+    add             r2, #32
1122
+    addAvg32 64 32 2
1123
+
1124
+    pop             {r4, r5, r6, r7, r8}
1125
+    bx              lr
1126
+endfunc
1127
+
1128
+function x265_addAvg_64x48_neon
1129
+    push            {r4, r5, r6, r7, r8}
1130
+    ldr             r4, [sp, #20]
1131
+    ldr             r5, [sp, #24]
1132
+    lsl             r3, #1
1133
+    lsl             r4, #1
1134
+    sub             r3, #32
1135
+    sub             r4, #32
1136
+    vmov.i16        d0, #16448
1137
+    mov             r7, r0
1138
+    mov             r8, r1
1139
+
1140
+    addAvg32 64 48 1
1141
+    add             r0, r7, #64
1142
+    add             r1, r8, #64
1143
+    sub             r2, r2, r5, lsl #5
1144
+    sub             r2, r2, r5, lsl #4
1145
+    add             r2, #32
1146
+    addAvg32 64 48 2
1147
+
1148
+    pop             {r4, r5, r6, r7, r8}
1149
+    bx              lr
1150
+endfunc
1151
+
1152
+function x265_addAvg_64x64_neon
1153
+    push            {r4, r5, r6, r7, r8}
1154
+    ldr             r4, [sp, #20]
1155
+    ldr             r5, [sp, #24]
1156
+    lsl             r3, #1
1157
+    lsl             r4, #1
1158
+    sub             r3, #32
1159
+    sub             r4, #32
1160
+    vmov.i16        d0, #16448
1161
+    mov             r7, r0
1162
+    mov             r8, r1
1163
+
1164
+    addAvg32 64 64 1
1165
+    add             r0, r7, #64
1166
+    add             r1, r8, #64
1167
+    sub             r2, r2, r5, lsl #6
1168
+    add             r2, #32
1169
+    addAvg32 64 64 2
1170
+
1171
+    pop             {r4, r5, r6, r7, r8}
1172
+    bx              lr
1173
+endfunc
1174
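The 48- and 64-wide addAvg functions reuse the 32-wide addAvg32 loop twice: after the first pass they advance the sources by 64 bytes (32 int16 lanes), rewind dst by the full block height (sub r2, r2, r5, lsl #6 undoes 64 rows of dstStride; the 16- and 32-row variants use lsl #4/#5), and step dst right by 32 pixels for the second pass; 48x64 runs a plain 16-wide loop for its remainder. The decomposition in scalar form, reusing the addAvg_c sketch above (illustrative, same translation unit):

    /* Left half, then right half, mirroring the asm's pointer rewind. */
    static void addAvg_64xH_c(const int16_t* src0, const int16_t* src1,
                              uint8_t* dst, intptr_t s0, intptr_t s1,
                              intptr_t d, int height)
    {
        addAvg_c(src0,      src1,      dst,      s0, s1, d, 32, height);
        addAvg_c(src0 + 32, src1 + 32, dst + 32, s0, s1, d, 32, height);
    }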
x265_2.0.tar.gz/source/common/arm/mc.h Added
29
 
1
@@ -0,0 +1,27 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Steve Borho <steve@borho.org>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_MC_ARM_H
26
+#define X265_MC_ARM_H
27
+
28
+#endif // ifndef X265_MC_ARM_H
29
x265_2.0.tar.gz/source/common/arm/pixel-util.S Added
2453
 
1
@@ -0,0 +1,2451 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Dnyaneshwar G <dnyaneshwar@multicorewareinc.com>
6
+ *          Radhakrishnan VR <radhakrishnan@multicorewareinc.com>
7
+ *          Min Chen <min.chen@multicorewareinc.com>
8
+ * 
9
+ * This program is free software; you can redistribute it and/or modify
10
+ * it under the terms of the GNU General Public License as published by
11
+ * the Free Software Foundation; either version 2 of the License, or
12
+ * (at your option) any later version.
13
+ *
14
+ * This program is distributed in the hope that it will be useful,
15
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
+ * GNU General Public License for more details.
18
+ *
19
+ * You should have received a copy of the GNU General Public License
20
+ * along with this program; if not, write to the Free Software
21
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22
+ *
23
+ * This program is also available under a commercial proprietary license.
24
+ * For more information, contact us at license @ x265.com.
25
+ *****************************************************************************/
26
+
27
+#include "asm.S"
28
+
29
+.section .rodata
30
+
31
+.align 4
32
+
33
+
34
+.text
35
+
36
+.macro VAR_SQR_SUM qsqr_sum, qsqr_last, qsqr_temp, dsrc, num=0, vpadal=vpadal.u16
37
+    vmull.u8        \qsqr_temp, \dsrc, \dsrc
38
+    vaddw.u8        q\num, q\num, \dsrc
39
+    \vpadal         \qsqr_sum, \qsqr_last
40
+.endm
41
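Each VAR_SQR_SUM step processes eight pixels for the variance kernels below: vmull.u8 squares them into a fresh q register, vaddw.u8 accumulates the widened pixel sum into q\num, and the vpadal (or initial vpaddl) folds the squares produced by the previous invocation into a running 32-bit accumulator; passing last round's product register instead of this round's keeps the multiply and the pairwise accumulate off each other's critical path. The functions return both sums packed into one 64-bit value (vmov r0, r1, d0). A scalar model (C; the packing order is read off the final register use and is a hedged reconstruction):

    #include <stdint.h>

    static uint64_t pixel_var_c(const uint8_t* pix, intptr_t stride, int size)
    {
        uint32_t sum = 0, sqr = 0;
        for (int y = 0; y < size; y++, pix += stride)
            for (int x = 0; x < size; x++)
            {
                sum += pix[x];              /* vaddw.u8          */
                sqr += pix[x] * pix[x];     /* vmull.u8 + vpadal */
            }
        return sum + ((uint64_t)sqr << 32); /* vmov r0, r1, d0   */
    }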
+
42
+function x265_pixel_var_8x8_neon
43
+    vld1.u8         {d16}, [r0], r1
44
+    vmull.u8        q1, d16, d16
45
+    vmovl.u8        q0, d16
46
+    vld1.u8         {d18}, [r0], r1
47
+    vmull.u8        q2, d18, d18
48
+    vaddw.u8        q0, q0, d18
49
+
50
+    vld1.u8         {d20}, [r0], r1
51
+    VAR_SQR_SUM     q1, q1, q3, d20, 0, vpaddl.u16
52
+    vld1.u8         {d22}, [r0], r1
53
+    VAR_SQR_SUM     q2, q2, q8, d22, 0, vpaddl.u16
54
+
55
+    vld1.u8         {d24}, [r0], r1
56
+    VAR_SQR_SUM     q1, q3, q9, d24
57
+    vld1.u8         {d26}, [r0], r1
58
+    VAR_SQR_SUM     q2, q8, q10, d26
59
+    vld1.u8         {d24}, [r0], r1
60
+    VAR_SQR_SUM     q1, q9, q14, d24
61
+    vld1.u8         {d26}, [r0], r1
62
+    VAR_SQR_SUM     q2, q10, q15, d26
63
+
64
+    vpaddl.u16      q8, q14
65
+    vpaddl.u16      q9, q15
66
+    vadd.u32        q1, q1, q8
67
+    vadd.u16        d0, d0, d1
68
+    vadd.u32        q1, q1, q9
69
+    vadd.u32        q1, q1, q2
70
+    vpaddl.u16      d0, d0
71
+    vadd.u32        d2, d2, d3
72
+    vpadd.u32       d0, d0, d2
73
+
74
+    vmov            r0, r1, d0
75
+    bx              lr
76
+endfunc
77
+
78
+function x265_pixel_var_16x16_neon
79
+    veor.u8         q0, q0
80
+    veor.u8         q1, q1
81
+    veor.u8         q2, q2
82
+    veor.u8         q14, q14
83
+    veor.u8         q15, q15
84
+    mov             ip, #4
85
+
86
+.var16_loop:
87
+    subs            ip, ip, #1
88
+    vld1.u8         {q8}, [r0], r1
89
+    VAR_SQR_SUM     q1, q14, q12, d16
90
+    VAR_SQR_SUM     q2, q15, q13, d17
91
+
92
+    vld1.u8         {q9}, [r0], r1
93
+    VAR_SQR_SUM     q1, q12, q14, d18
94
+    VAR_SQR_SUM     q2, q13, q15, d19
95
+
96
+    vld1.u8         {q8}, [r0], r1
97
+    VAR_SQR_SUM     q1, q14, q12, d16
98
+    VAR_SQR_SUM     q2, q15, q13, d17
99
+
100
+    vld1.u8         {q9}, [r0], r1
101
+    VAR_SQR_SUM     q1, q12, q14, d18
102
+    VAR_SQR_SUM     q2, q13, q15, d19
103
+    bgt             .var16_loop
104
+
105
+    vpaddl.u16      q8, q14
106
+    vpaddl.u16      q9, q15
107
+    vadd.u32        q1, q1, q8
108
+    vadd.u16        d0, d0, d1
109
+    vadd.u32        q1, q1, q9
110
+    vadd.u32        q1, q1, q2
111
+    vpaddl.u16      d0, d0
112
+    vadd.u32        d2, d2, d3
113
+    vpadd.u32       d0, d0, d2
114
+
115
+    vmov            r0, r1, d0
116
+    bx              lr
117
+endfunc
118
+
119
+function x265_pixel_var_32x32_neon
120
+    veor.u8         q0, q0
121
+    veor.u8         q1, q1
122
+    veor.u8         q2, q2
123
+    veor.u8         q14, q14
124
+    veor.u8         q15, q15
125
+    mov             ip, #8
126
+
127
+.var32_loop:
128
+    subs            ip, ip, #1
129
+    vld1.u8         {q8-q9}, [r0], r1
130
+    VAR_SQR_SUM     q1, q14, q12, d16
131
+    VAR_SQR_SUM     q2, q15, q13, d17
132
+    VAR_SQR_SUM     q1, q12, q14, d18
133
+    VAR_SQR_SUM     q2, q13, q15, d19
134
+
135
+    vld1.u8         {q8-q9}, [r0], r1
136
+    VAR_SQR_SUM     q1, q14, q12, d16
137
+    VAR_SQR_SUM     q2, q15, q13, d17
138
+    VAR_SQR_SUM     q1, q12, q14, d18
139
+    VAR_SQR_SUM     q2, q13, q15, d19
140
+
141
+    vld1.u8         {q8-q9}, [r0], r1
142
+    VAR_SQR_SUM     q1, q14, q12, d16
143
+    VAR_SQR_SUM     q2, q15, q13, d17
144
+    VAR_SQR_SUM     q1, q12, q14, d18
145
+    VAR_SQR_SUM     q2, q13, q15, d19
146
+
147
+    vld1.u8         {q8-q9}, [r0], r1
148
+    VAR_SQR_SUM     q1, q14, q12, d16
149
+    VAR_SQR_SUM     q2, q15, q13, d17
150
+    VAR_SQR_SUM     q1, q12, q14, d18
151
+    VAR_SQR_SUM     q2, q13, q15, d19
152
+    bgt             .var32_loop
153
+
154
+    vpaddl.u16      q8, q14
155
+    vpaddl.u16      q9, q15
156
+    vadd.u32        q1, q1, q8
157
+    vadd.u16        d0, d0, d1
158
+    vadd.u32        q1, q1, q9
159
+    vadd.u32        q1, q1, q2
160
+    vpaddl.u16      d0, d0
161
+    vadd.u32        d2, d2, d3
162
+    vpadd.u32       d0, d0, d2
163
+
164
+    vmov            r0, r1, d0
165
+    bx              lr
166
+endfunc
167
+
168
+function x265_pixel_var_64x64_neon
169
+    sub             r1, #32
170
+    veor.u8         q0, q0
171
+    veor.u8         q1, q1
172
+    veor.u8         q2, q2
173
+    veor.u8         q3, q3
174
+    veor.u8         q14, q14
175
+    veor.u8         q15, q15
176
+    mov             ip, #16
177
+
178
+.var64_loop:
179
+    subs            ip, ip, #1
180
+    vld1.u8         {q8-q9}, [r0]!
181
+    VAR_SQR_SUM     q1, q14, q12, d16
182
+    VAR_SQR_SUM     q2, q15, q13, d17
183
+    VAR_SQR_SUM     q1, q12, q14, d18
184
+    VAR_SQR_SUM     q2, q13, q15, d19
185
+
186
+    vld1.u8         {q8-q9}, [r0], r1
187
+    VAR_SQR_SUM     q1, q14, q12, d16, 3
188
+    VAR_SQR_SUM     q2, q15, q13, d17, 3
189
+    VAR_SQR_SUM     q1, q12, q14, d18, 3
190
+    VAR_SQR_SUM     q2, q13, q15, d19, 3
191
+
192
+    vld1.u8         {q8-q9}, [r0]!
193
+    VAR_SQR_SUM     q1, q14, q12, d16
194
+    VAR_SQR_SUM     q2, q15, q13, d17
195
+    VAR_SQR_SUM     q1, q12, q14, d18
196
+    VAR_SQR_SUM     q2, q13, q15, d19
197
+
198
+    vld1.u8         {q8-q9}, [r0], r1
199
+    VAR_SQR_SUM     q1, q14, q12, d16, 3
200
+    VAR_SQR_SUM     q2, q15, q13, d17, 3
201
+    VAR_SQR_SUM     q1, q12, q14, d18, 3
202
+    VAR_SQR_SUM     q2, q13, q15, d19, 3
203
+
204
+    vld1.u8         {q8-q9}, [r0]!
205
+    VAR_SQR_SUM     q1, q14, q12, d16
206
+    VAR_SQR_SUM     q2, q15, q13, d17
207
+    VAR_SQR_SUM     q1, q12, q14, d18
208
+    VAR_SQR_SUM     q2, q13, q15, d19
209
+
210
+    vld1.u8         {q8-q9}, [r0], r1
211
+    VAR_SQR_SUM     q1, q14, q12, d16, 3
212
+    VAR_SQR_SUM     q2, q15, q13, d17, 3
213
+    VAR_SQR_SUM     q1, q12, q14, d18, 3
214
+    VAR_SQR_SUM     q2, q13, q15, d19, 3
215
+
216
+    vld1.u8         {q8-q9}, [r0]!
217
+    VAR_SQR_SUM     q1, q14, q12, d16
218
+    VAR_SQR_SUM     q2, q15, q13, d17
219
+    VAR_SQR_SUM     q1, q12, q14, d18
220
+    VAR_SQR_SUM     q2, q13, q15, d19
221
+
222
+    vld1.u8         {q8-q9}, [r0], r1
223
+    VAR_SQR_SUM     q1, q14, q12, d16, 3
224
+    VAR_SQR_SUM     q2, q15, q13, d17, 3
225
+    VAR_SQR_SUM     q1, q12, q14, d18, 3
226
+    VAR_SQR_SUM     q2, q13, q15, d19, 3
227
+    bgt             .var64_loop
228
+
229
+    vpaddl.u16      q8, q14
230
+    vpaddl.u16      q9, q15
231
+    vadd.u32        q1, q1, q8
232
+    vadd.u32        q1, q1, q9
233
+    vadd.u32        q1, q1, q2
234
+    vpaddl.u16      d0, d0
235
+    vpaddl.u16      d1, d1
236
+    vpaddl.u16      d6, d6
237
+    vpaddl.u16      d7, d7
238
+    vadd.u32        d0, d1
239
+    vadd.u32        d6, d7
240
+    vadd.u32        d0, d6
241
+    vadd.u32        d2, d2, d3
242
+    vpadd.u32       d0, d0, d2
243
+
244
+    vmov            r0, r1, d0
245
+    bx              lr
246
+endfunc
247
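
For reference, each pixel_var function above accumulates two quantities over the block: the sum of the pixels and the sum of their squares, returned packed in d0 so that r0 carries the sum and r1 the sum of squares. A minimal C sketch of the 8x8 case, assuming 8-bit pixels (the name and types here are illustrative, not the exact x265 C interface):

    #include <stdint.h>

    /* Model of x265_pixel_var_8x8_neon: pixel sum in the low 32 bits,
     * sum of squared pixels in the high 32 bits. */
    static uint64_t pixel_var_8x8(const uint8_t *pix, intptr_t stride)
    {
        uint32_t sum = 0, sqr = 0;
        for (int y = 0; y < 8; y++, pix += stride)
            for (int x = 0; x < 8; x++) {
                sum += pix[x];
                sqr += (uint32_t)pix[x] * pix[x];
            }
        return sum | ((uint64_t)sqr << 32);
    }

The 16x16, 32x32 and 64x64 variants follow the same pattern with more accumulator registers and heavier per-iteration unrolling.
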
+
248
+/* void getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
249
+ * r0   - fenc
250
+ * r1   - pred
251
+ * r2   - residual
252
+ * r3   - stride */
253
+function x265_getResidual4_neon
254
+    lsl             r12, r3, #1
255
+.rept 2
256
+    vld1.u8         {d0}, [r0], r3
257
+    vld1.u8         {d1}, [r1], r3
258
+    vld1.u8         {d2}, [r0], r3
259
+    vld1.u8         {d3}, [r1], r3
260
+    vsubl.u8        q2, d0, d1
261
+    vsubl.u8        q3, d2, d3
262
+    vst1.s16        {d4}, [r2], r12
263
+    vst1.s16        {d6}, [r2], r12
264
+.endr
265
+    bx              lr
266
+endfunc
267
+
268
+function x265_getResidual8_neon
269
+    lsl             r12, r3, #1
270
+.rept 4
271
+    vld1.u8         {d0}, [r0], r3
272
+    vld1.u8         {d1}, [r1], r3
273
+    vld1.u8         {d2}, [r0], r3
274
+    vld1.u8         {d3}, [r1], r3
275
+    vsubl.u8        q2, d0, d1
276
+    vsubl.u8        q3, d2, d3
277
+    vst1.s16        {q2}, [r2], r12
278
+    vst1.s16        {q3}, [r2], r12
279
+.endr
280
+    bx              lr
281
+endfunc
282
+
283
+function x265_getResidual16_neon
284
+    lsl             r12, r3, #1
285
+.rept 8
286
+    vld1.u8         {d0, d1}, [r0], r3
287
+    vld1.u8         {d2, d3}, [r1], r3
288
+    vld1.u8         {d4, d5}, [r0], r3
289
+    vld1.u8         {d6, d7}, [r1], r3
290
+    vsubl.u8        q8, d0, d2
291
+    vsubl.u8        q9, d1, d3
292
+    vsubl.u8        q10, d4, d6
293
+    vsubl.u8        q11, d5, d7
294
+    vst1.s16        {q8, q9}, [r2], r12
295
+    vst1.s16        {q10, q11}, [r2], r12
296
+.endr
297
+    bx              lr
298
+endfunc
299
+
300
+function x265_getResidual32_neon
301
+    push            {r4}
302
+    lsl             r12, r3, #1
303
+    sub             r12, #32
304
+    mov             r4, #4
305
+loop_res32:
306
+    subs            r4, r4, #1
307
+.rept 8
308
+    vld1.u8         {q0, q1}, [r0], r3
309
+    vld1.u8         {q2, q3}, [r1], r3
310
+    vsubl.u8        q8, d0, d4
311
+    vsubl.u8        q9, d1, d5
312
+    vsubl.u8        q10, d2, d6
313
+    vsubl.u8        q11, d3, d7
314
+    vst1.s16        {q8, q9}, [r2]!
315
+    vst1.s16        {q10, q11}, [r2], r12
316
+.endr
317
+    bne             loop_res32
318
+    pop             {r4}
319
+    bx              lr
320
+endfunc
321
+
322
+// void pixel_sub_ps_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1)
323
+function x265_pixel_sub_ps_4x4_neon
324
+    push            {r4}
325
+    lsl             r1, r1, #1
326
+    ldr             r4, [sp, #4]
327
+    ldr             r12, [sp, #8]
328
+.rept 2
329
+    vld1.u8         {d0}, [r2], r4
330
+    vld1.u8         {d1}, [r3], r12
331
+    vld1.u8         {d2}, [r2], r4
332
+    vld1.u8         {d3}, [r3], r12
333
+    vsubl.u8        q2, d0, d1
334
+    vsubl.u8        q3, d2, d3
335
+    vst1.s16        {d4}, [r0], r1
336
+    vst1.s16        {d6}, [r0], r1
337
+.endr
338
+    pop             {r4}
339
+    bx              lr
340
+endfunc
341
+
342
+function x265_pixel_sub_ps_8x8_neon
343
+    push            {r4}
344
+    lsl             r1, r1, #1
345
+    ldr             r4, [sp, #4]
346
+    ldr             r12, [sp, #8]
347
+.rept 4
348
+    vld1.u8         {d0}, [r2], r4
349
+    vld1.u8         {d1}, [r3], r12
350
+    vld1.u8         {d2}, [r2], r4
351
+    vld1.u8         {d3}, [r3], r12
352
+    vsubl.u8        q2, d0, d1
353
+    vsubl.u8        q3, d2, d3
354
+    vst1.s16        {q2}, [r0], r1
355
+    vst1.s16        {q3}, [r0], r1
356
+.endr
357
+    pop             {r4}
358
+    bx              lr
359
+endfunc
360
+
361
+function x265_pixel_sub_ps_16x16_neon
362
+    push            {r4, r5}
363
+    lsl             r1, r1, #1
364
+    ldr             r4, [sp, #8]
365
+    ldr             r12, [sp, #12]
366
+    mov             r5, #2
367
+loop_sub16:
368
+    subs            r5, r5, #1
369
+.rept 4
370
+    vld1.u8         {q0}, [r2], r4
371
+    vld1.u8         {q1}, [r3], r12
372
+    vld1.u8         {q2}, [r2], r4
373
+    vld1.u8         {q3}, [r3], r12
374
+    vsubl.u8        q8, d0, d2
375
+    vsubl.u8        q9, d1, d3
376
+    vsubl.u8        q10, d4, d6
377
+    vsubl.u8        q11, d5, d7
378
+    vst1.s16        {q8, q9}, [r0], r1
379
+    vst1.s16        {q10, q11}, [r0], r1
380
+.endr
381
+    bne             loop_sub16
382
+    pop             {r4, r5}
383
+    bx              lr
384
+endfunc
385
+
386
+function x265_pixel_sub_ps_32x32_neon
387
+    push            {r4, r5}
388
+    lsl             r1, r1, #1
389
+    ldr             r4, [sp, #8]
390
+    ldr             r12, [sp, #12]
391
+    sub             r1, #32
392
+    mov             r5, #8
393
+loop_sub32:
394
+    subs            r5, r5, #1
395
+.rept 4
396
+    vld1.u8         {q0, q1}, [r2], r4
397
+    vld1.u8         {q2, q3}, [r3], r12
398
+    vsubl.u8        q8, d0, d4
399
+    vsubl.u8        q9, d1, d5
400
+    vsubl.u8        q10, d2, d6
401
+    vsubl.u8        q11, d3, d7
402
+    vst1.s16        {q8, q9}, [r0]!
403
+    vst1.s16        {q10, q11}, [r0], r1
404
+.endr
405
+    bne             loop_sub32
406
+    pop             {r4, r5}
407
+    bx              lr
408
+endfunc
409
+
410
+function x265_pixel_sub_ps_64x64_neon
411
+    push            {r4, r5}
412
+    lsl             r1, r1, #1
413
+    ldr             r4, [sp, #8]
414
+    ldr             r12, [sp, #12]
415
+    sub             r1, #96
416
+    sub             r4, #32
417
+    sub             r12, #32
418
+    mov             r5, #32
419
+loop_sub64:
420
+    subs            r5, r5, #1
421
+.rept 2
422
+    vld1.u8         {q0, q1}, [r2]!
423
+    vld1.u8         {q2, q3}, [r2], r4
424
+    vld1.u8         {q8, q9}, [r3]!
425
+    vld1.u8         {q10, q11}, [r3], r12
426
+    vsubl.u8        q12, d0, d16
427
+    vsubl.u8        q13, d1, d17
428
+    vsubl.u8        q14, d2, d18
429
+    vsubl.u8        q15, d3, d19
430
+    vsubl.u8        q0, d4, d20
431
+    vsubl.u8        q1, d5, d21
432
+    vsubl.u8        q2, d6, d22
433
+    vsubl.u8        q3, d7, d23
434
+    vst1.s16        {q12, q13}, [r0]!
435
+    vst1.s16        {q14, q15}, [r0]!
436
+    vst1.s16        {q0, q1}, [r0]!
437
+    vst1.s16        {q2, q3}, [r0], r1
438
+.endr
439
+    bne             loop_sub64
440
+    pop             {r4, r5}
441
+    bx              lr
442
+endfunc
443
+
444
+// chroma sub_ps
445
+function x265_pixel_sub_ps_4x8_neon
446
+    push            {r4}
447
+    lsl             r1, r1, #1
448
+    ldr             r4, [sp, #4]
449
+    ldr             r12, [sp, #8]
450
+.rept 4
451
+    vld1.u8         {d0}, [r2], r4
452
+    vld1.u8         {d1}, [r3], r12
453
+    vld1.u8         {d2}, [r2], r4
454
+    vld1.u8         {d3}, [r3], r12
455
+    vsubl.u8        q2, d0, d1
456
+    vsubl.u8        q3, d2, d3
457
+    vst1.s16        {d4}, [r0], r1
458
+    vst1.s16        {d6}, [r0], r1
459
+.endr
460
+    pop             {r4}
461
+    bx              lr
462
+endfunc
463
+
464
+function x265_pixel_sub_ps_8x16_neon
465
+    push            {r4}
466
+    lsl             r1, r1, #1
467
+    ldr             r4, [sp, #4]
468
+    ldr             r12, [sp, #8]
469
+.rept 8
470
+    vld1.u8         {d0}, [r2], r4
471
+    vld1.u8         {d1}, [r3], r12
472
+    vld1.u8         {d2}, [r2], r4
473
+    vld1.u8         {d3}, [r3], r12
474
+    vsubl.u8        q2, d0, d1
475
+    vsubl.u8        q3, d2, d3
476
+    vst1.s16        {q2}, [r0], r1
477
+    vst1.s16        {q3}, [r0], r1
478
+.endr
479
+    pop             {r4}
480
+    bx              lr
481
+endfunc
482
+
483
+function x265_pixel_sub_ps_16x32_neon
484
+    push            {r4, r5}
485
+    lsl             r1, r1, #1
486
+    ldr             r4, [sp, #8]
487
+    ldr             r12, [sp, #12]
488
+    mov             r5, #4
489
+loop_sub_16x32:
490
+    subs            r5, r5, #1
491
+.rept 4
492
+    vld1.u8         {q0}, [r2], r4
493
+    vld1.u8         {q1}, [r3], r12
494
+    vld1.u8         {q2}, [r2], r4
495
+    vld1.u8         {q3}, [r3], r12
496
+    vsubl.u8        q8, d0, d2
497
+    vsubl.u8        q9, d1, d3
498
+    vsubl.u8        q10, d4, d6
499
+    vsubl.u8        q11, d5, d7
500
+    vst1.s16        {q8, q9}, [r0], r1
501
+    vst1.s16        {q10, q11}, [r0], r1
502
+.endr
503
+    bne             loop_sub_16x32
504
+    pop             {r4, r5}
505
+    bx              lr
506
+endfunc
507
+
508
+function x265_pixel_sub_ps_32x64_neon
509
+    push            {r4, r5}
510
+    lsl             r1, r1, #1
511
+    ldr             r4, [sp, #8]
512
+    ldr             r12, [sp, #12]
513
+    sub             r1, #32
514
+    mov             r5, #16
515
+loop_sub_32x64:
516
+    subs            r5, r5, #1
517
+.rept 4
518
+    vld1.u8         {q0, q1}, [r2], r4
519
+    vld1.u8         {q2, q3}, [r3], r12
520
+    vsubl.u8        q8, d0, d4
521
+    vsubl.u8        q9, d1, d5
522
+    vsubl.u8        q10, d2, d6
523
+    vsubl.u8        q11, d3, d7
524
+    vst1.s16        {q8, q9}, [r0]!
525
+    vst1.s16        {q10, q11}, [r0], r1
526
+.endr
527
+    bne             loop_sub_32x64
528
+    pop             {r4, r5}
529
+    bx              lr
530
+endfunc
531
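
pixel_sub_ps generalizes the same subtraction to two independently strided sources, matching the signature in the comment above; the luma and chroma variants differ only in block geometry. A minimal C sketch (the bx/by size parameters are added for illustration):

    #include <stdint.h>

    static void pixel_sub_ps(int16_t *a, intptr_t dstride,
                             const uint8_t *b0, const uint8_t *b1,
                             intptr_t sstride0, intptr_t sstride1,
                             int bx, int by)
    {
        for (int y = 0; y < by; y++) {
            for (int x = 0; x < bx; x++)
                a[x] = (int16_t)(b0[x] - b1[x]);
            b0 += sstride0;
            b1 += sstride1;
            a  += dstride;  /* element stride; the asm doubles it for bytes */
        }
    }
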
+
532
+// void x265_pixel_add_ps_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
533
+function x265_pixel_add_ps_4x4_neon
534
+    push            {r4}
535
+    ldr             r4, [sp, #4]
536
+    ldr             r12, [sp, #8]
537
+    lsl             r12, #1
538
+    vmov.u16        q10, #255
539
+    veor.u16        q11, q11
540
+    veor.u16        d3, d3
541
+    veor.u16        d5, d5
542
+.rept 2
543
+    vld1.u8         {d0}, [r2], r4
544
+    vld1.u8         {d1}, [r2], r4
545
+    vld1.s16        {d2}, [r3], r12
546
+    vld1.s16        {d4}, [r3], r12
547
+    vmovl.u8        q8, d0
548
+    vmovl.u8        q9, d1
549
+    vadd.s16        q1, q1, q8
550
+    vadd.s16        q2, q2, q9
551
+    vqmovun.s16     d0, q1
552
+    vqmovun.s16     d1, q2
553
+    vst1.32         {d0[0]}, [r0], r1
554
+    vst1.32         {d1[0]}, [r0], r1
555
+.endr
556
+    pop             {r4}
557
+    bx              lr
558
+endfunc
559
+
560
+function x265_pixel_add_ps_8x8_neon
561
+    push            {r4}
562
+    ldr             r4, [sp, #4]
563
+    ldr             r12, [sp, #8]
564
+    lsl             r12, #1
565
+    vmov.u16        q10, #255
566
+    veor.u16        q11, q11
567
+.rept 4
568
+    vld1.u8         {d0}, [r2], r4
569
+    vld1.u8         {d1}, [r2], r4
570
+    vld1.s16        {q8}, [r3], r12
571
+    vld1.s16        {q9}, [r3], r12
572
+    vmovl.u8        q1, d0
573
+    vmovl.u8        q2, d1
574
+    vadd.s16        q1, q1, q8
575
+    vadd.s16        q2, q2, q9
576
+    vqmovun.s16     d0, q1
577
+    vqmovun.s16     d1, q2
578
+    vst1.8          {d0}, [r0], r1
579
+    vst1.8          {d1}, [r0], r1
580
+.endr
581
+    pop             {r4}
582
+    bx              lr
583
+endfunc
584
+
585
+.macro pixel_add_ps_16xN_neon h i
586
+function x265_pixel_add_ps_16x\h\()_neon
587
+    push            {r4, r5}
588
+    ldr             r4, [sp, #8]
589
+    ldr             r12, [sp, #12]
590
+    lsl             r12, #1
591
+    vmov.u16        q10, #255
592
+    veor.u16        q11, q11
593
+    mov             r5, #\i
594
+loop_addps_16x\h\():
595
+    subs            r5, #1
596
+.rept 4
597
+    vld1.u8         {q0}, [r2], r4
598
+    vld1.u8         {q1}, [r2], r4
599
+    vld1.s16        {q8, q9}, [r3], r12
600
+    vld1.s16        {q12, q13}, [r3], r12
601
+
602
+    vmovl.u8        q2, d0
603
+    vmovl.u8        q3, d1
604
+    vmovl.u8        q0, d2
605
+    vmovl.u8        q1, d3
606
+
607
+    vadd.s16        q2, q2, q8
608
+    vadd.s16        q3, q3, q9
609
+    vadd.s16        q0, q0, q12
610
+    vadd.s16        q1, q1, q13
611
+
612
+    vqmovun.s16     d4, q2
613
+    vqmovun.s16     d5, q3
614
+    vqmovun.s16     d0, q0
615
+    vqmovun.s16     d1, q1
616
+    vst1.8          {d4, d5}, [r0], r1
617
+    vst1.8          {d0, d1}, [r0], r1
618
+.endr
619
+    bne             loop_addps_16x\h
620
+    pop             {r4, r5}
621
+    bx              lr
622
+endfunc
623
+.endm
624
+
625
+pixel_add_ps_16xN_neon 16 2
626
+pixel_add_ps_16xN_neon 32 4
627
+
628
+.macro pixel_add_ps_32xN_neon h i
629
+function x265_pixel_add_ps_32x\h\()_neon
630
+    push            {r4, r5}
631
+    ldr             r4, [sp, #8]
632
+    ldr             r12, [sp, #12]
633
+    lsl             r12, #1
634
+    vmov.u16        q10, #255
635
+    veor.u16        q11, q11
636
+    mov             r5, #\i
637
+    sub             r12, #32
638
+loop_addps_32x\h\():
639
+    subs            r5, #1
640
+.rept 4
641
+    vld1.u8         {q0, q1}, [r2], r4
642
+    vld1.s16        {q8, q9}, [r3]!
643
+    vld1.s16        {q12, q13}, [r3], r12
644
+
645
+    vmovl.u8        q2, d0
646
+    vmovl.u8        q3, d1
647
+    vmovl.u8        q14, d2
648
+    vmovl.u8        q15, d3
649
+
650
+    vadd.s16        q2, q2, q8
651
+    vadd.s16        q3, q3, q9
652
+    vadd.s16        q14, q14, q12
653
+    vadd.s16        q15, q15, q13
654
+
655
+    vqmovun.s16     d0, q2
656
+    vqmovun.s16     d1, q3
657
+    vqmovun.s16     d2, q14
658
+    vqmovun.s16     d3, q15
659
+    vst1.8          {q0, q1}, [r0], r1
660
+.endr
661
+    bne             loop_addps_32x\h
662
+    pop             {r4, r5}
663
+    bx              lr
664
+endfunc
665
+.endm
666
+
667
+pixel_add_ps_32xN_neon 32 8
668
+pixel_add_ps_32xN_neon 64 16
669
+
670
+function x265_pixel_add_ps_64x64_neon
671
+    push            {r4, r5}
672
+    vpush           {q4, q5, q6, q7}
673
+    ldr             r4, [sp, #72]
674
+    ldr             r12, [sp, #76]
675
+    lsl             r12, #1
676
+    vmov.u16        q2, #255
677
+    veor.u16        q3, q3
678
+    mov             r5, #32
679
+    sub             r1, #32
680
+    sub             r4, #32
681
+    sub             r12, #96
682
+loop_addps64:
683
+    subs            r5, #1
684
+.rept 2
685
+    vld1.u8         {q0, q1}, [r2]!
686
+    vld1.s16        {q8, q9}, [r3]!
687
+    vld1.s16        {q10, q11}, [r3]!
688
+    vld1.s16        {q12, q13}, [r3]!
689
+    vld1.s16        {q14, q15}, [r3], r12
690
+
691
+    vmovl.u8        q4, d0
692
+    vmovl.u8        q5, d1
693
+    vmovl.u8        q6, d2
694
+    vmovl.u8        q7, d3
695
+
696
+    vadd.s16        q4, q4, q8
697
+    vadd.s16        q5, q5, q9
698
+    vadd.s16        q6, q6, q10
699
+    vadd.s16        q7, q7, q11
700
+
701
+    vqmovun.s16     d0, q4
702
+    vqmovun.s16     d1, q5
703
+    vqmovun.s16     d2, q6
704
+    vqmovun.s16     d3, q7
705
+
706
+    vst1.u8         {q0, q1}, [r0]!
707
+    vld1.u8         {q0, q1}, [r2], r4
708
+    vmovl.u8        q4, d0
709
+    vmovl.u8        q5, d1
710
+    vmovl.u8        q6, d2
711
+    vmovl.u8        q7, d3
712
+
713
+    vadd.s16        q4, q4, q12
714
+    vadd.s16        q5, q5, q13
715
+    vadd.s16        q6, q6, q14
716
+    vadd.s16        q7, q7, q15
717
+
718
+    vqmovun.s16     d0, q4
719
+    vqmovun.s16     d1, q5
720
+    vqmovun.s16     d2, q6
721
+    vqmovun.s16     d3, q7
722
+    vst1.u8         {q0, q1}, [r0], r1
723
+.endr
724
+    bne             loop_addps64
725
+    vpop            {q4, q5, q6, q7}
726
+    pop             {r4, r5}
727
+    bx              lr
728
+endfunc
729
+
730
+// Chroma add_ps
731
+function x265_pixel_add_ps_4x8_neon
732
+    push            {r4}
733
+    ldr             r4, [sp, #4]
734
+    ldr             r12, [sp, #8]
735
+    lsl             r12, #1
736
+    vmov.u16        q10, #255
737
+    veor.u16        q11, q11
738
+    veor.u16        d3, d3
739
+    veor.u16        d5, d5
740
+.rept 4
741
+    vld1.u8         {d0}, [r2], r4
742
+    vld1.u8         {d1}, [r2], r4
743
+    vld1.s16        {d2}, [r3], r12
744
+    vld1.s16        {d4}, [r3], r12
745
+    vmovl.u8        q8, d0
746
+    vmovl.u8        q9, d1
747
+    vadd.s16        q1, q1, q8
748
+    vadd.s16        q2, q2, q9
749
+    vqmovun.s16     d0, q1
750
+    vqmovun.s16     d1, q2
751
+    vst1.32         {d0[0]}, [r0], r1
752
+    vst1.32         {d1[0]}, [r0], r1
753
+.endr
754
+    pop             {r4}
755
+    bx              lr
756
+endfunc
757
+
758
+function x265_pixel_add_ps_8x16_neon
759
+    push            {r4, r5}
760
+    ldr             r4, [sp, #8]
761
+    ldr             r12, [sp, #12]
762
+    lsl             r12, #1
763
+    vmov.u16        q10, #255
764
+    veor.u16        q11, q11
765
+    mov             r5, #2
766
+loop_add_8x16:
767
+    subs            r5, #1
768
+.rept 4
769
+    vld1.u8         {d0}, [r2], r4
770
+    vld1.u8         {d1}, [r2], r4
771
+    vld1.s16        {q8}, [r3], r12
772
+    vld1.s16        {q9}, [r3], r12
773
+    vmovl.u8        q1, d0
774
+    vmovl.u8        q2, d1
775
+    vadd.s16        q1, q1, q8
776
+    vadd.s16        q2, q2, q9
777
+    vqmovun.s16     d0, q1
778
+    vqmovun.s16     d1, q2
779
+    vst1.8          {d0}, [r0], r1
780
+    vst1.8          {d1}, [r0], r1
781
+.endr
782
+    bne             loop_add_8x16
783
+    pop             {r4, r5}
784
+    bx              lr
785
+endfunc
786
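
pixel_add_ps is the inverse step: add the 16-bit residual back onto the 8-bit prediction and saturate the result to the pixel range, which is what the vqmovun.s16 instructions do in one operation. A hedged C model (the clip helper and size parameters are illustrative):

    #include <stdint.h>

    static uint8_t clip255(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

    static void pixel_add_ps(uint8_t *a, intptr_t dstride,
                             const uint8_t *b0, const int16_t *b1,
                             intptr_t sstride0, intptr_t sstride1,
                             int bx, int by)
    {
        for (int y = 0; y < by; y++) {
            for (int x = 0; x < bx; x++)
                a[x] = clip255(b0[x] + b1[x]);  /* vqmovun.s16 saturates to [0, 255] */
            b0 += sstride0;
            b1 += sstride1;  /* int16_t units; the asm shifts this stride left by 1 */
            a  += dstride;
        }
    }
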
+
787
+// void scale1D_128to64(pixel *dst, const pixel *src)
788
+function x265_scale1D_128to64_neon 
789
+    mov             r12, #32
790
+.rept 2
791
+    vld2.u8         {q8, q9}, [r1]!
792
+    vld2.u8         {q10, q11}, [r1]!
793
+    vld2.u8         {q12, q13}, [r1]!
794
+    vld2.u8         {q14, q15}, [r1], r12
795
+
796
+    vrhadd.u8       q0, q8, q9
797
+    vrhadd.u8       q1, q10, q11
798
+    vrhadd.u8       q2, q12, q13
799
+    vrhadd.u8       q3, q14, q15
800
+
801
+    vst1.u8         {q0, q1}, [r0]!
802
+    vst1.u8         {q2, q3}, [r0], r12
803
+.endr
804
+    bx              lr
805
+endfunc
806
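
scale1D_128to64 halves a row horizontally: vld2 deinterleaves even and odd pixels and vrhadd.u8 takes their rounding average, (a + b + 1) >> 1. Per output pixel this amounts to the following sketch, assuming a plain 2:1 decimation of 128 source pixels:

    #include <stdint.h>

    static void scale1D_128to64(uint8_t *dst, const uint8_t *src)
    {
        for (int i = 0; i < 64; i++)
            dst[i] = (uint8_t)((src[2 * i] + src[2 * i + 1] + 1) >> 1);
    }
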
+
807
+// void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
808
+function x265_scale2D_64to32_neon
809
+    sub             r2, #32
810
+    mov             r3, #16
811
+loop_scale2D:
812
+    subs            r3, #1
813
+.rept 2
814
+    vld2.8          {q8, q9}, [r1]!
815
+    vld2.8          {q10, q11}, [r1], r2
816
+    vld2.8          {q12, q13}, [r1]!
817
+    vld2.8          {q14, q15}, [r1], r2
818
+
819
+    vaddl.u8        q0, d16, d18
820
+    vaddl.u8        q1, d17, d19
821
+    vaddl.u8        q2, d20, d22
822
+    vaddl.u8        q3, d21, d23
823
+
824
+    vaddl.u8        q8, d24, d26
825
+    vaddl.u8        q9, d25, d27
826
+    vaddl.u8        q10, d28, d30
827
+    vaddl.u8        q11, d29, d31
828
+
829
+    vadd.u16        q0, q8
830
+    vadd.u16        q1, q9
831
+    vadd.u16        q2, q10
832
+    vadd.u16        q3, q11
833
+
834
+    vrshrn.u16      d16, q0, #2
835
+    vrshrn.u16      d17, q1, #2
836
+    vrshrn.u16      d18, q2, #2
837
+    vrshrn.u16      d19, q3, #2
838
+    vst1.8          {q8, q9}, [r0]!
839
+.endr
840
+    bne             loop_scale2D
841
+    bx              lr
842
+endfunc
843
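
scale2D_64to32 applies the same idea in two dimensions: each output pixel is the rounded average of a 2x2 source block, computed above as widening adds followed by vrshrn.u16 with a shift of 2. Equivalent per-pixel arithmetic, assuming a 64x64 source with the given stride:

    #include <stdint.h>

    static void scale2D_64to32(uint8_t *dst, const uint8_t *src, intptr_t stride)
    {
        for (int y = 0; y < 32; y++)
            for (int x = 0; x < 32; x++) {
                const uint8_t *s = src + 2 * y * stride + 2 * x;
                /* (a + b + c + d + 2) >> 2: 2x2 box filter with rounding */
                dst[y * 32 + x] =
                    (uint8_t)((s[0] + s[1] + s[stride] + s[stride + 1] + 2) >> 2);
            }
    }
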
+
844
+function x265_pixel_planecopy_cp_neon
845
+    push            {r4, r5, r6, r7}
846
+    ldr             r4, [sp, #4 * 4]
847
+    ldr             r5, [sp, #4 * 4 + 4]
848
+    ldr             r12, [sp, #4 * 4 + 8]
849
+    vdup.8          q2, r12
850
+    sub             r5, #1
851
+
852
+.loop_h:
853
+    mov             r6, r0
854
+    mov             r12, r2
855
+    eor             r7, r7
856
+.loop_w:
857
+    vld1.u8         {q0}, [r6]!
858
+    vshl.u8         q0, q0, q2
859
+    vst1.u8         {q0}, [r12]!
860
+
861
+    add             r7, #16
862
+    cmp             r7, r4
863
+    blt             .loop_w
864
+
865
+    add             r0, r1
866
+    add             r2, r3
867
+
868
+    subs             r5, #1
869
+    bgt             .loop_h
870
+
871
+// handle last row
872
+    mov             r5, r4
873
+    lsr             r5, #3
874
+
875
+.loopW8:
876
+    vld1.u8         d0, [r0]!
877
+    vshl.u8         d0, d0, d4
878
+    vst1.u8         d0, [r2]!
879
+    subs            r4, r4, #8
880
+    subs            r5, #1
881
+    bgt             .loopW8
882
+
883
+    mov             r5,#8
884
+    sub             r5, r4
885
+    sub             r0, r5
886
+    sub             r2, r5
887
+    vld1.u8         d0, [r0]
888
+    vshl.u8         d0, d0, d4
889
+    vst1.u8         d0, [r2]
890
+
891
+    pop             {r4, r5, r6, r7}
892
+    bx              lr
893
+endfunc
894
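
pixel_planecopy_cp copies a plane while left-shifting every pixel by a runtime shift count (the count is broadcast with vdup.8 and applied with vshl.u8), with the last row handled separately in 8-byte steps, including one overlapped final store, to avoid reading past the buffer. A minimal C model of the whole-plane behaviour, with an illustrative signature:

    #include <stdint.h>

    static void planecopy_cp(const uint8_t *src, intptr_t srcStride,
                             uint8_t *dst, intptr_t dstStride,
                             int width, int height, int shift)
    {
        for (int y = 0; y < height; y++, src += srcStride, dst += dstStride)
            for (int x = 0; x < width; x++)
                dst[x] = (uint8_t)(src[x] << shift);
    }
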
+
895
+//******* satd *******
896
+.macro satd_4x4_neon
897
+    vld1.32         {d1[]}, [r2], r3
898
+    vld1.32         {d0[]}, [r0,:32], r1
899
+    vld1.32         {d3[]}, [r2], r3
900
+    vld1.32         {d2[]}, [r0,:32], r1
901
+    vld1.32         {d1[1]}, [r2], r3
902
+    vld1.32         {d0[1]}, [r0,:32], r1
903
+    vld1.32         {d3[1]}, [r2], r3
904
+    vld1.32         {d2[1]}, [r0,:32], r1
905
+    vsubl.u8        q0, d0, d1
906
+    vsubl.u8        q1, d2, d3
907
+    SUMSUB_AB       q2, q3, q0, q1
908
+    SUMSUB_ABCD     d0, d2, d1, d3, d4, d5, d6, d7
909
+    HADAMARD        1, sumsub, q2, q3, q0, q1
910
+    HADAMARD        2, amax, q0,, q2, q3
911
+    HORIZ_ADD       d0, d0, d1
912
+.endm
913
+
914
+function x265_pixel_satd_4x4_neon
915
+    satd_4x4_neon
916
+    vmov.32         r0, d0[0]
917
+    bx              lr
918
+endfunc
919
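
The SATD primitives compute the sum of absolute values of the Hadamard-transformed pixel differences. The final HADAMARD pass above uses amax rather than a last butterfly: by the identity |a + b| + |a - b| = 2 * max(|a|, |b|), taking max(|a|, |b|) per pair yields the conventional result already halved. A standard C reference for the 4x4 case (illustrative, not necessarily the exact x265 C code):

    #include <stdint.h>
    #include <stdlib.h>

    static int satd_4x4(const uint8_t *pix1, intptr_t stride1,
                        const uint8_t *pix2, intptr_t stride2)
    {
        int d[4][4], tmp[4][4], sum = 0;
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                d[y][x] = pix1[y * stride1 + x] - pix2[y * stride2 + x];
        for (int y = 0; y < 4; y++) {            /* horizontal butterflies */
            int a0 = d[y][0] + d[y][1], a1 = d[y][0] - d[y][1];
            int a2 = d[y][2] + d[y][3], a3 = d[y][2] - d[y][3];
            tmp[y][0] = a0 + a2; tmp[y][2] = a0 - a2;
            tmp[y][1] = a1 + a3; tmp[y][3] = a1 - a3;
        }
        for (int x = 0; x < 4; x++) {            /* vertical butterflies + abs */
            int a0 = tmp[0][x] + tmp[1][x], a1 = tmp[0][x] - tmp[1][x];
            int a2 = tmp[2][x] + tmp[3][x], a3 = tmp[2][x] - tmp[3][x];
            sum += abs(a0 + a2) + abs(a0 - a2) + abs(a1 + a3) + abs(a1 - a3);
        }
        return sum >> 1;
    }

The larger satd functions below are built from 4x8, 8x4 and 16x4 kernels whose partial sums are accumulated in q4/q5 or in scalar registers.
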
+
920
+.macro LOAD_DIFF_8x4_1 q0 q1 q2 q3
921
+    vld1.32         {d1}, [r2], r3
922
+    vld1.32         {d0}, [r0,:64], r1
923
+    vsubl.u8        \q0, d0, d1
924
+    vld1.32         {d3}, [r2], r3
925
+    vld1.32         {d2}, [r0,:64], r1
926
+    vsubl.u8        \q1, d2, d3
927
+    vld1.32         {d5}, [r2], r3
928
+    vld1.32         {d4}, [r0,:64], r1
929
+    vsubl.u8        \q2, d4, d5
930
+    vld1.32         {d7}, [r2], r3
931
+    vld1.32         {d6}, [r0,:64], r1
932
+    vsubl.u8        \q3, d6, d7
933
+.endm
934
+
935
+.macro x265_satd_4x8_8x4_end_neon
936
+    vadd.s16        q0, q8, q10
937
+    vadd.s16        q1, q9, q11
938
+    vsub.s16        q2, q8, q10
939
+    vsub.s16        q3, q9, q11
940
+
941
+    vtrn.16         q0, q1
942
+    vadd.s16        q8, q0, q1
943
+    vtrn.16         q2, q3
944
+    vsub.s16        q9, q0, q1
945
+    vadd.s16        q10, q2, q3
946
+    vsub.s16        q11, q2, q3
947
+    vtrn.32         q8, q10
948
+    vabs.s16        q8, q8
949
+    vtrn.32         q9, q11
950
+    vabs.s16        q10, q10
951
+    vabs.s16        q9, q9
952
+    vabs.s16        q11, q11
953
+    vmax.u16        q0, q8, q10
954
+    vmax.u16        q1, q9, q11
955
+    vadd.u16        q0, q0, q1
956
+    HORIZ_ADD       d0, d0, d1
957
+.endm
958
+
959
+.macro pixel_satd_4x8_neon
960
+    vld1.32         {d1[]}, [r2], r3
961
+    vld1.32         {d0[]}, [r0,:32], r1
962
+    vld1.32         {d3[]}, [r2], r3
963
+    vld1.32         {d2[]}, [r0,:32], r1
964
+    vld1.32         {d5[]}, [r2], r3
965
+    vld1.32         {d4[]}, [r0,:32], r1
966
+    vld1.32         {d7[]}, [r2], r3
967
+    vld1.32         {d6[]}, [r0,:32], r1
968
+
969
+    vld1.32         {d1[1]}, [r2], r3
970
+    vld1.32         {d0[1]}, [r0,:32], r1
971
+    vsubl.u8        q0, d0, d1
972
+    vld1.32         {d3[1]}, [r2], r3
973
+    vld1.32         {d2[1]}, [r0,:32], r1
974
+    vsubl.u8        q1, d2, d3
975
+    vld1.32         {d5[1]}, [r2], r3
976
+    vld1.32         {d4[1]}, [r0,:32], r1
977
+    vsubl.u8        q2, d4, d5
978
+    vld1.32         {d7[1]}, [r2], r3
979
+    SUMSUB_AB       q8, q9, q0, q1
980
+    vld1.32         {d6[1]}, [r0,:32], r1
981
+    vsubl.u8        q3, d6, d7
982
+    SUMSUB_AB       q10, q11, q2, q3
983
+    x265_satd_4x8_8x4_end_neon
984
+.endm
985
+
986
+function x265_pixel_satd_4x8_neon
987
+    pixel_satd_4x8_neon
988
+    vmov.32         r0, d0[0]
989
+    bx              lr
990
+endfunc
991
+
992
+function x265_pixel_satd_4x16_neon
993
+    push            {r4, r5}
994
+    eor             r4, r4
995
+    pixel_satd_4x8_neon
996
+    vmov.32         r5, d0[0]
997
+    add             r4, r5
998
+    pixel_satd_4x8_neon
999
+    vmov.32         r5, d0[0]
1000
+    add             r0, r5, r4
1001
+    pop             {r4, r5}
1002
+    bx              lr
1003
+endfunc
1004
+
1005
+function x265_pixel_satd_4x32_neon
1006
+    push            {r4, r5}
1007
+    eor             r4, r4
1008
+.rept 4
1009
+    pixel_satd_4x8_neon
1010
+    vmov.32         r5, d0[0]
1011
+    add             r4, r5
1012
+.endr
1013
+    mov             r0, r4
1014
+    pop             {r4, r5}
1015
+    bx              lr
1016
+endfunc
1017
+
1018
+function x265_pixel_satd_12x16_neon
1019
+    push            {r4, r5, r6, r7}
1020
+    vpush           {d8-d11}
1021
+    mov             ip, lr
1022
+    mov             r4, r0
1023
+    mov             r5, r2
1024
+    eor             r7, r7
1025
+    pixel_satd_4x8_neon
1026
+    vmov.32         r6, d0[0]
1027
+    add             r7, r6
1028
+    pixel_satd_4x8_neon
1029
+    vmov.32         r6, d0[0]
1030
+    add             r7, r6
1031
+
1032
+    add             r0, r4, #4
1033
+    add             r2, r5, #4
1034
+    pixel_satd_4x8_neon
1035
+    vmov.32         r6, d0[0]
1036
+    add             r7, r6
1037
+    pixel_satd_4x8_neon
1038
+    vmov.32         r6, d0[0]
1039
+    add             r7, r6
1040
+
1041
+    add             r0, r4, #8
1042
+    add             r2, r5, #8
1043
+    pixel_satd_4x8_neon
1044
+    vmov.32         r6, d0[0]
1045
+    add             r7, r6
1046
+    pixel_satd_4x8_neon
1047
+    vmov.32         r6, d0[0]
1048
+    add             r0, r7, r6
1049
+    vpop            {d8-d11}
1050
+    pop             {r4, r5, r6, r7}
1051
+    mov             lr, ip
1052
+    bx              lr
1053
+endfunc
1054
+
1055
+function x265_pixel_satd_12x32_neon
1056
+    push            {r4, r5, r6, r7}
1057
+    vpush           {d8-d11}
1058
+    mov             ip, lr
1059
+    mov             r4, r0
1060
+    mov             r5, r2
1061
+    eor             r7, r7
1062
+.rept 4
1063
+    pixel_satd_4x8_neon
1064
+    vmov.32         r6, d0[0]
1065
+    add             r7, r6
1066
+.endr
1067
+
1068
+    add             r0, r4, #4
1069
+    add             r2, r5, #4
1070
+.rept 4
1071
+    pixel_satd_4x8_neon
1072
+    vmov.32         r6, d0[0]
1073
+    add             r7, r6
1074
+.endr
1075
+
1076
+    add             r0, r4, #8
1077
+    add             r2, r5, #8
1078
+.rept 4
1079
+    pixel_satd_4x8_neon
1080
+    vmov.32         r6, d0[0]
1081
+    add             r7, r6
1082
+.endr
1083
+
1084
+    mov             r0, r7
1085
+    vpop            {d8-d11}
1086
+    pop             {r4, r5, r6, r7}
1087
+    mov             lr, ip
1088
+    bx              lr
1089
+endfunc
1090
+
1091
+function x265_pixel_satd_8x4_neon
1092
+    push            {r4, r5, r6}
1093
+    mov             r4, r0
1094
+    mov             r5, r2
1095
+    satd_4x4_neon
1096
+    add             r0, r4, #4
1097
+    add             r2, r5, #4
1098
+    vmov.32         r6, d0[0]
1099
+    satd_4x4_neon
1100
+    vmov.32         r0, d0[0]
1101
+    add             r0, r0, r6
1102
+    pop             {r4, r5, r6}
1103
+    bx              lr
1104
+endfunc
1105
+
1106
+function x265_pixel_satd_8x8_neon
1107
+    mov             ip, lr
1108
+    push            {r4, r5, r6, r7}
1109
+    eor             r4, r4
1110
+    mov             r6, r0
1111
+    mov             r7, r2
1112
+    pixel_satd_4x8_neon
1113
+    vmov.32         r5, d0[0]
1114
+    add             r4, r5
1115
+    add             r0, r6, #4
1116
+    add             r2, r7, #4
1117
+    pixel_satd_4x8_neon
1118
+    vmov.32         r5, d0[0]
1119
+    add             r0, r4, r5
1120
+    pop             {r4, r5, r6, r7}
1121
+    mov             lr, ip
1122
+    bx              lr
1123
+endfunc
1124
+
1125
+function x265_pixel_satd_8x12_neon
1126
+    push            {r4, r5, r6, r7}
1127
+    mov             r4, r0
1128
+    mov             r5, r2
1129
+    eor             r7, r7
1130
+    satd_4x4_neon
1131
+    vmov.32         r6, d0[0]
1132
+    add             r7, r6
1133
+    add             r0, r4, #4
1134
+    add             r2, r5, #4
1135
+    satd_4x4_neon
1136
+    vmov.32         r6, d0[0]
1137
+    add             r7, r6
1138
+.rept 2
1139
+    sub             r0, #4
1140
+    sub             r2, #4
1141
+    mov             r4, r0
1142
+    mov             r5, r2
1143
+    satd_4x4_neon
1144
+    vmov.32         r6, d0[0]
1145
+    add             r7, r6
1146
+    add             r0, r4, #4
1147
+    add             r2, r5, #4
1148
+    satd_4x4_neon
1149
+    vmov.32         r6, d0[0]
1150
+    add             r7, r6
1151
+.endr
1152
+    mov             r0, r7
1153
+    pop             {r4, r5, r6, r7}
1154
+    bx              lr
1155
+endfunc
1156
+
1157
+function x265_pixel_satd_8x16_neon
1158
+    vpush           {d8-d11}
1159
+    mov             ip, lr
1160
+    bl              x265_satd_8x8_neon
1161
+    vadd.u16        q4, q12, q13
1162
+    vadd.u16        q5, q14, q15
1163
+
1164
+    bl              x265_satd_8x8_neon
1165
+    vadd.u16        q4, q4, q12
1166
+    vadd.u16        q5, q5, q13
1167
+    vadd.u16        q4, q4, q14
1168
+    vadd.u16        q5, q5, q15
1169
+
1170
+    vadd.u16        q0, q4, q5
1171
+    HORIZ_ADD       d0, d0, d1
1172
+    vpop            {d8-d11}
1173
+    mov             lr, ip
1174
+    vmov.32         r0, d0[0]
1175
+    bx              lr
1176
+endfunc
1177
+
1178
+function x265_pixel_satd_8x32_neon
1179
+    vpush           {d8-d11}
1180
+    mov             ip, lr
1181
+    bl              x265_satd_8x8_neon
1182
+    vadd.u16        q4, q12, q13
1183
+    vadd.u16        q5, q14, q15
1184
+.rept 3
1185
+    bl              x265_satd_8x8_neon
1186
+    vadd.u16        q4, q4, q12
1187
+    vadd.u16        q5, q5, q13
1188
+    vadd.u16        q4, q4, q14
1189
+    vadd.u16        q5, q5, q15
1190
+.endr
1191
+    vadd.u16        q0, q4, q5
1192
+    HORIZ_ADD       d0, d0, d1
1193
+    vpop            {d8-d11}
1194
+    mov             lr, ip
1195
+    vmov.32         r0, d0[0]
1196
+    bx              lr
1197
+endfunc
1198
+
1199
+function x265_pixel_satd_8x64_neon
1200
+    vpush           {d8-d11}
1201
+    mov             ip, lr
1202
+    bl              x265_satd_8x8_neon
1203
+    vadd.u16        q4, q12, q13
1204
+    vadd.u16        q5, q14, q15
1205
+.rept 7
1206
+    bl              x265_satd_8x8_neon
1207
+    vadd.u16        q4, q4, q12
1208
+    vadd.u16        q5, q5, q13
1209
+    vadd.u16        q4, q4, q14
1210
+    vadd.u16        q5, q5, q15
1211
+.endr
1212
+    vadd.u16        q0, q4, q5
1213
+    HORIZ_ADD       d0, d0, d1
1214
+    vpop            {d8-d11}
1215
+    mov             lr, ip
1216
+    vmov.32         r0, d0[0]
1217
+    bx              lr
1218
+endfunc
1219
+
1220
+function x265_satd_8x8_neon
1221
+    LOAD_DIFF_8x4_1   q8, q9, q10, q11
1222
+    vld1.64         {d7}, [r2], r3
1223
+    vld1.64         {d6}, [r0,:64], r1
1224
+    vsubl.u8        q12, d6, d7
1225
+    SUMSUB_AB       q0, q1, q8, q9
1226
+
1227
+    vld1.64         {d17}, [r2], r3
1228
+    vld1.64         {d16}, [r0,:64], r1
1229
+    vsubl.u8        q13, d16, d17
1230
+    SUMSUB_AB       q2, q3, q10, q11
1231
+
1232
+    vld1.64         {d19}, [r2], r3
1233
+    vld1.64         {d18}, [r0,:64], r1
1234
+    vsubl.u8        q14, d18, d19
1235
+    SUMSUB_AB       q8, q10, q0, q2
1236
+
1237
+    vld1.64         {d1}, [r2], r3
1238
+    vld1.64         {d0}, [r0,:64], r1
1239
+    vsubl.u8        q15, d0, d1
1240
+    SUMSUB_AB       q9, q11, q1, q3
1241
+endfunc
1242
+
1243
+// one vertical hadamard pass and two horizontal
1244
+function x265_satd_8x4v_8x8h_neon, export=0
1245
+    SUMSUB_ABCD     q0, q1, q2, q3, q12, q13, q14, q15
1246
+    SUMSUB_AB       q12, q14, q0, q2
1247
+    SUMSUB_AB       q13, q15, q1, q3
1248
+    vtrn.16         q8, q9
1249
+    vtrn.16         q10, q11
1250
+
1251
+    SUMSUB_AB       q0, q1, q8, q9
1252
+    SUMSUB_AB       q2, q3, q10, q11
1253
+    vtrn.16         q12, q13
1254
+    vtrn.16         q14, q15
1255
+
1256
+    SUMSUB_AB       q8, q9, q12, q13
1257
+    SUMSUB_AB       q10, q11, q14, q15
1258
+    vtrn.32         q0, q2
1259
+    vtrn.32         q1, q3
1260
+    ABS2            q0, q2
1261
+    ABS2            q1, q3
1262
+
1263
+    vtrn.32         q8, q10
1264
+    vtrn.32         q9, q11
1265
+    ABS2            q8, q10
1266
+    ABS2            q9, q11
1267
+
1268
+    vmax.s16        q12, q0, q2
1269
+    vmax.s16        q13, q1, q3
1270
+    vmax.s16        q14, q8, q10
1271
+    vmax.s16        q15, q9, q11
1272
+    bx              lr
1273
+endfunc
1274
+
1275
+function x265_satd_16x4_neon, export=0
1276
+    vld1.64         {d2-d3}, [r2], r3
1277
+    vld1.64         {d0-d1}, [r0,:128], r1
1278
+    vsubl.u8        q8, d0, d2
1279
+    vsubl.u8        q12, d1, d3
1280
+
1281
+    vld1.64         {d6-d7}, [r2], r3
1282
+    vld1.64         {d4-d5}, [r0,:128], r1
1283
+    vsubl.u8        q9, d4, d6
1284
+    vsubl.u8        q13, d5, d7
1285
+
1286
+    vld1.64         {d2-d3}, [r2], r3
1287
+    vld1.64         {d0-d1}, [r0,:128], r1
1288
+    vsubl.u8        q10, d0, d2
1289
+    vsubl.u8        q14, d1, d3
1290
+
1291
+    vld1.64         {d6-d7}, [r2], r3
1292
+    vld1.64         {d4-d5}, [r0,:128], r1
1293
+    vsubl.u8        q11, d4, d6
1294
+    vsubl.u8        q15, d5, d7
1295
+
1296
+    vadd.s16        q0, q8, q9
1297
+    vsub.s16        q1, q8, q9
1298
+    SUMSUB_AB       q2, q3, q10, q11
1299
+    SUMSUB_ABCD     q8, q10, q9, q11, q0, q2, q1, q3
1300
+    b               x265_satd_8x4v_8x8h_neon
1301
+endfunc
1302
+
1303
+function x265_pixel_satd_16x4_neon
1304
+    vpush           {d8-d11}
1305
+    mov             ip, lr
1306
+    bl              x265_satd_16x4_neon
1307
+    vadd.u16        q4, q12, q13
1308
+    vadd.u16        q5, q14, q15
1309
+    vadd.u16        q0, q4, q5
1310
+    HORIZ_ADD       d0, d0, d1
1311
+    vpop            {d8-d11}
1312
+    mov             lr, ip
1313
+    vmov.32         r0, d0[0]
1314
+    bx              lr
1315
+endfunc
1316
+
1317
+function x265_pixel_satd_16x8_neon
1318
+    vpush           {d8-d11}
1319
+    mov             ip, lr
1320
+    bl              x265_satd_16x4_neon
1321
+    vadd.u16        q4, q12, q13
1322
+    vadd.u16        q5, q14, q15
1323
+
1324
+    bl              x265_satd_16x4_neon
1325
+    vadd.u16        q4, q4, q12
1326
+    vadd.u16        q5, q5, q13
1327
+    vadd.u16        q4, q4, q14
1328
+    vadd.u16        q5, q5, q15
1329
+
1330
+    vadd.u16        q0, q4, q5
1331
+    HORIZ_ADD       d0, d0, d1
1332
+    vpop            {d8-d11}
1333
+    mov             lr, ip
1334
+    vmov.32         r0, d0[0]
1335
+    bx              lr
1336
+endfunc
1337
+
1338
+function x265_pixel_satd_16x12_neon
1339
+    vpush           {d8-d11}
1340
+    mov             ip, lr
1341
+    bl              x265_satd_16x4_neon
1342
+    vadd.u16        q4, q12, q13
1343
+    vadd.u16        q5, q14, q15
1344
+.rept 2
1345
+    bl              x265_satd_16x4_neon
1346
+    vadd.u16        q4, q4, q12
1347
+    vadd.u16        q5, q5, q13
1348
+    vadd.u16        q4, q4, q14
1349
+    vadd.u16        q5, q5, q15
1350
+.endr
1351
+    vadd.u16        q0, q4, q5
1352
+    HORIZ_ADD       d0, d0, d1
1353
+    vpop            {d8-d11}
1354
+    mov             lr, ip
1355
+    vmov.32         r0, d0[0]
1356
+    bx              lr
1357
+endfunc
1358
+
1359
+function x265_pixel_satd_16x16_neon
1360
+    vpush           {d8-d11}
1361
+    mov             ip, lr
1362
+    bl              x265_satd_16x4_neon
1363
+    vadd.u16        q4, q12, q13
1364
+    vadd.u16        q5, q14, q15
1365
+.rept 3
1366
+    bl              x265_satd_16x4_neon
1367
+    vadd.u16        q4, q4, q12
1368
+    vadd.u16        q5, q5, q13
1369
+    vadd.u16        q4, q4, q14
1370
+    vadd.u16        q5, q5, q15
1371
+.endr
1372
+    vadd.u16        q0, q4, q5
1373
+    HORIZ_ADD       d0, d0, d1
1374
+    vpop            {d8-d11}
1375
+    mov             lr, ip
1376
+    vmov.32         r0, d0[0]
1377
+    bx              lr
1378
+endfunc
1379
+
1380
+function x265_pixel_satd_16x24_neon
1381
+    vpush           {d8-d11}
1382
+    mov             ip, lr
1383
+    bl              x265_satd_16x4_neon
1384
+    vadd.u16        q4, q12, q13
1385
+    vadd.u16        q5, q14, q15
1386
+.rept 5
1387
+    bl              x265_satd_16x4_neon
1388
+    vadd.u16        q4, q4, q12
1389
+    vadd.u16        q5, q5, q13
1390
+    vadd.u16        q4, q4, q14
1391
+    vadd.u16        q5, q5, q15
1392
+.endr
1393
+    vadd.u16        q0, q4, q5
1394
+    HORIZ_ADD       d0, d0, d1
1395
+    vpop            {d8-d11}
1396
+    mov             lr, ip
1397
+    vmov.32         r0, d0[0]
1398
+    bx              lr
1399
+endfunc
1400
+
1401
+.macro pixel_satd_16x32_neon
1402
+    bl              x265_satd_16x4_neon
1403
+    vadd.u16        q4, q12, q13
1404
+    vadd.u16        q5, q14, q15
1405
+.rept 7
1406
+    bl              x265_satd_16x4_neon
1407
+    vadd.u16        q4, q4, q12
1408
+    vadd.u16        q5, q5, q13
1409
+    vadd.u16        q4, q4, q14
1410
+    vadd.u16        q5, q5, q15
1411
+.endr
1412
+.endm
1413
+
1414
+function x265_pixel_satd_16x32_neon
1415
+    vpush           {d8-d11}
1416
+    mov             ip, lr
1417
+    pixel_satd_16x32_neon
1418
+    vadd.u16        q0, q4, q5
1419
+    HORIZ_ADD       d0, d0, d1
1420
+    vpop            {d8-d11}
1421
+    mov             lr, ip
1422
+    vmov.32         r0, d0[0]
1423
+    bx              lr
1424
+endfunc
1425
+
1426
+function x265_pixel_satd_16x64_neon
1427
+    push            {r6, r7}
1428
+    vpush           {d8-d11}
1429
+    mov             ip, lr
1430
+    eor             r7, r7
1431
+    pixel_satd_16x32_neon
1432
+    vadd.u16        q0, q4, q5
1433
+    HORIZ_ADD       d0, d0, d1
1434
+    vmov.32         r6, d0[0]
1435
+    add             r7, r6
1436
+
1437
+    veor            q4, q4
1438
+    veor            q5, q5
1439
+    pixel_satd_16x32_neon
1440
+    vadd.u16        q0, q4, q5
1441
+    HORIZ_ADD       d0, d0, d1
1442
+    vmov.32         r6, d0[0]
1443
+    add             r0, r7, r6
1444
+    vpop            {d8-d11}
1445
+    pop             {r6, r7}
1446
+    mov             lr, ip
1447
+    bx              lr
1448
+endfunc
1449
+
1450
+function x265_pixel_satd_24x32_neon
1451
+    push            {r4, r5, r6, r7}
1452
+    vpush           {d8-d11}
1453
+    mov             ip, lr
1454
+    eor             r7, r7
1455
+    mov             r4, r0
1456
+    mov             r5, r2
1457
+.rept 3
1458
+    veor            q4, q4
1459
+    veor            q5, q5
1460
+.rept 4
1461
+    bl              x265_satd_8x8_neon
1462
+    vadd.u16        q4, q4, q12
1463
+    vadd.u16        q5, q5, q13
1464
+    vadd.u16        q4, q4, q14
1465
+    vadd.u16        q5, q5, q15
1466
+.endr
1467
+    vadd.u16        q0, q4, q5
1468
+    HORIZ_ADD       d0, d0, d1
1469
+    vmov.32         r6, d0[0]
1470
+    add             r7, r6
1471
+    add             r4, #8
1472
+    add             r5, #8
1473
+    mov             r0, r4
1474
+    mov             r2, r5
1475
+.endr
1476
+    mov             r0, r7
1477
+    vpop            {d8-d11}
1478
+    pop             {r4, r5, r6, r7}
1479
+    mov             lr, ip
1480
+    bx              lr
1481
+endfunc
1482
+
1483
+function x265_pixel_satd_24x64_neon
1484
+    push            {r4, r5, r6, r7}
1485
+    vpush           {d8-d11}
1486
+    mov             ip, lr
1487
+    eor             r7, r7
1488
+    mov             r4, r0
1489
+    mov             r5, r2
1490
+.rept 3
1491
+    veor            q4, q4
1492
+    veor            q5, q5
1493
+.rept 4
1494
+    bl              x265_satd_8x8_neon
1495
+    vadd.u16        q4, q4, q12
1496
+    vadd.u16        q5, q5, q13
1497
+    vadd.u16        q4, q4, q14
1498
+    vadd.u16        q5, q5, q15
1499
+.endr
1500
+    vadd.u16        q0, q4, q5
1501
+    HORIZ_ADD       d0, d0, d1
1502
+    vmov.32         r6, d0[0]
1503
+    add             r7, r6
1504
+    add             r4, #8
1505
+    add             r5, #8
1506
+    mov             r0, r4
1507
+    mov             r2, r5
1508
+.endr
1509
+
1510
+    sub             r4, #24
1511
+    sub             r5, #24
1512
+    add             r0, r4, r1, lsl #5
1513
+    add             r2, r5, r3, lsl #5
1514
+    mov             r4, r0
1515
+    mov             r5, r2
1516
+.rept 3
1517
+    veor            q4, q4
1518
+    veor            q5, q5
1519
+.rept 4
1520
+    bl              x265_satd_8x8_neon
1521
+    vadd.u16        q4, q4, q12
1522
+    vadd.u16        q5, q5, q13
1523
+    vadd.u16        q4, q4, q14
1524
+    vadd.u16        q5, q5, q15
1525
+.endr
1526
+    vadd.u16        q0, q4, q5
1527
+    HORIZ_ADD       d0, d0, d1
1528
+    vmov.32         r6, d0[0]
1529
+    add             r7, r6
1530
+    add             r4, #8
1531
+    add             r5, #8
1532
+    mov             r0, r4
1533
+    mov             r2, r5
1534
+.endr
1535
+    mov             r0, r7
1536
+    vpop            {d8-d11}
1537
+    pop             {r4, r5, r6, r7}
1538
+    mov             lr, ip
1539
+    bx              lr
1540
+endfunc
1541
+
1542
+.macro pixel_satd_32x8
1543
+    mov             r4, r0
1544
+    mov             r5, r2
1545
+    bl              x265_satd_16x4_neon
1546
+    vadd.u16        q4, q4, q12
1547
+    vadd.u16        q5, q5, q13
1548
+    vadd.u16        q4, q4, q14
1549
+    vadd.u16        q5, q5, q15
1550
+
1551
+    bl              x265_satd_16x4_neon
1552
+    vadd.u16        q4, q4, q12
1553
+    vadd.u16        q5, q5, q13
1554
+    vadd.u16        q4, q4, q14
1555
+    vadd.u16        q5, q5, q15
1556
+
1557
+    add             r0, r4, #16
1558
+    add             r2, r5, #16
1559
+    bl              x265_satd_16x4_neon
1560
+    vadd.u16        q4, q4, q12
1561
+    vadd.u16        q5, q5, q13
1562
+    vadd.u16        q4, q4, q14
1563
+    vadd.u16        q5, q5, q15
1564
+
1565
+    bl              x265_satd_16x4_neon
1566
+    vadd.u16        q4, q4, q12
1567
+    vadd.u16        q5, q5, q13
1568
+    vadd.u16        q4, q4, q14
1569
+    vadd.u16        q5, q5, q15
1570
+.endm
1571
+
1572
+function x265_pixel_satd_32x8_neon
1573
+    push            {r4, r5}
1574
+    vpush           {d8-d11}
1575
+    mov             ip, lr
1576
+    veor            q4, q4
1577
+    veor            q5, q5
1578
+    pixel_satd_32x8
1579
+    vadd.u16        q0, q4, q5
1580
+    HORIZ_ADD       d0, d0, d1
1581
+    vmov.32         r0, d0[0]
1582
+    vpop            {d8-d11}
1583
+    pop             {r4, r5}
1584
+    mov             lr, ip
1585
+    bx              lr
1586
+endfunc
1587
+
1588
+.macro satd_32x16_neon
1589
+    veor            q4, q4
1590
+    veor            q5, q5
1591
+    pixel_satd_32x8
1592
+    sub             r0, #16
1593
+    sub             r2, #16
1594
+    pixel_satd_32x8
1595
+    vadd.u16        q0, q4, q5
1596
+    HORIZ_ADD       d0, d0, d1
1597
+    vmov.32         r6, d0[0]
1598
+.endm
1599
+
1600
+function x265_pixel_satd_32x16_neon
1601
+    push            {r4, r5, r6}
1602
+    vpush           {d8-d11}
1603
+    mov             ip, lr
1604
+    satd_32x16_neon
1605
+    mov             r0, r6
1606
+    vpop            {d8-d11}
1607
+    pop             {r4, r5, r6}
1608
+    mov             lr, ip
1609
+    bx              lr
1610
+endfunc
1611
+
1612
+function x265_pixel_satd_32x24_neon
1613
+    push            {r4, r5, r6}
1614
+    vpush           {d8-d11}
1615
+    mov             ip, lr
1616
+    satd_32x16_neon
1617
+    veor            q4, q4
1618
+    veor            q5, q5
1619
+    sub             r0, #16
1620
+    sub             r2, #16
1621
+    pixel_satd_32x8
1622
+    vadd.u16        q0, q4, q5
1623
+    HORIZ_ADD       d0, d0, d1
1624
+    vmov.32         r0, d0[0]
1625
+    add             r0, r6
1626
+    vpop            {d8-d11}
1627
+    pop             {r4, r5, r6}
1628
+    mov             lr, ip
1629
+    bx              lr
1630
+endfunc
1631
+
1632
+function x265_pixel_satd_32x32_neon
1633
+    push            {r4, r5, r6, r7}
1634
+    vpush           {d8-d11}
1635
+    mov             ip, lr
1636
+    eor             r7, r7
1637
+    satd_32x16_neon
1638
+    sub             r0, #16
1639
+    sub             r2, #16
1640
+    add             r7, r6
1641
+    satd_32x16_neon
1642
+    add             r0, r7, r6
1643
+    vpop            {d8-d11}
1644
+    pop             {r4, r5, r6, r7}
1645
+    mov             lr, ip
1646
+    bx              lr
1647
+endfunc
1648
+
1649
+function x265_pixel_satd_32x48_neon
1650
+    push            {r4, r5, r6, r7}
1651
+    vpush           {d8-d11}
1652
+    mov             ip, lr
1653
+    eor             r7, r7
1654
+.rept 2
1655
+    satd_32x16_neon
1656
+    sub             r0, #16
1657
+    sub             r2, #16
1658
+    add             r7, r6
1659
+.endr
1660
+    satd_32x16_neon
1661
+    add             r0, r7, r6
1662
+    vpop            {d8-d11}
1663
+    pop             {r4, r5, r6, r7}
1664
+    mov             lr, ip
1665
+    bx              lr
1666
+endfunc
1667
+
1668
+function x265_pixel_satd_32x64_neon
1669
+    push            {r4, r5, r6, r7}
1670
+    vpush           {d8-d11}
1671
+    mov             ip, lr
1672
+    eor             r7, r7
1673
+.rept 3
1674
+    satd_32x16_neon
1675
+    sub             r0, #16
1676
+    sub             r2, #16
1677
+    add             r7, r6
1678
+.endr
1679
+    satd_32x16_neon
1680
+    add             r0, r7, r6
1681
+    vpop            {d8-d11}
1682
+    pop             {r4, r5, r6, r7}
1683
+    mov             lr, ip
1684
+    bx              lr
1685
+endfunc
1686
+
1687
+.macro satd_64x16_neon
1688
+    mov             r8, r0
1689
+    mov             r9, r2
1690
+    satd_32x16_neon
1691
+    add             r7, r6
1692
+    add             r0, r8, #32
1693
+    add             r2, r9, #32
1694
+    satd_32x16_neon
1695
+    add             r7, r6
1696
+.endm
1697
+
1698
+function x265_pixel_satd_64x16_neon
1699
+    push            {r4, r5, r6, r7, r8, r9}
1700
+    vpush           {d8-d11}
1701
+    mov             ip, lr
1702
+    eor             r7, r7
1703
+    satd_64x16_neon
1704
+    mov             r0, r7
1705
+    vpop            {d8-d11}
1706
+    pop             {r4, r5, r6, r7, r8, r9}
1707
+    mov             lr, ip
1708
+    bx              lr
1709
+endfunc
1710
+
1711
+function x265_pixel_satd_64x32_neon
1712
+    push            {r4, r5, r6, r7, r8, r9}
1713
+    vpush           {d8-d11}
1714
+    mov             ip, lr
1715
+    eor             r7, r7
1716
+    satd_64x16_neon
1717
+    sub             r0, #48
1718
+    sub             r2, #48
1719
+    satd_64x16_neon
1720
+    mov             r0, r7
1721
+    vpop            {d8-d11}
1722
+    pop             {r4, r5, r6, r7, r8, r9}
1723
+    mov             lr, ip
1724
+    bx              lr
1725
+endfunc
1726
+
1727
+function x265_pixel_satd_64x48_neon
1728
+    push            {r4, r5, r6, r7, r8, r9}
1729
+    vpush           {d8-d11}
1730
+    mov             ip, lr
1731
+    eor             r7, r7
1732
+    satd_64x16_neon
1733
+    sub             r0, #48
1734
+    sub             r2, #48
1735
+    satd_64x16_neon
1736
+    sub             r0, #48
1737
+    sub             r2, #48
1738
+    satd_64x16_neon
1739
+    mov             r0, r7
1740
+    vpop            {d8-d11}
1741
+    pop             {r4, r5, r6, r7, r8, r9}
1742
+    mov             lr, ip
1743
+    bx              lr
1744
+endfunc
1745
+
1746
+function x265_pixel_satd_64x64_neon
1747
+    push            {r4, r5, r6, r7, r8, r9}
1748
+    vpush           {d8-d11}
1749
+    mov             ip, lr
1750
+    eor             r7, r7
1751
+    satd_64x16_neon
1752
+    sub             r0, #48
1753
+    sub             r2, #48
1754
+    satd_64x16_neon
1755
+    sub             r0, #48
1756
+    sub             r2, #48
1757
+    satd_64x16_neon
1758
+    sub             r0, #48
1759
+    sub             r2, #48
1760
+    satd_64x16_neon
1761
+    mov             r0, r7
1762
+    vpop            {d8-d11}
1763
+    pop             {r4, r5, r6, r7, r8, r9}
1764
+    mov             lr, ip
1765
+    bx              lr
1766
+endfunc
1767
+
1768
+function x265_pixel_satd_48x64_neon
1769
+    push            {r4, r5, r6, r7, r8, r9}
1770
+    vpush           {d8-d11}
1771
+    mov             ip, lr
1772
+    eor             r7, r7
1773
+    mov             r8, r0
1774
+    mov             r9, r2
1775
+.rept 3
1776
+    satd_32x16_neon
1777
+    sub             r0, #16
1778
+    sub             r2, #16
1779
+    add             r7, r6
1780
+.endr
1781
+    satd_32x16_neon
1782
+    add             r7, r6
1783
+
1784
+    add             r0, r8, #32
1785
+    add             r2, r9, #32
1786
+    pixel_satd_16x32_neon
1787
+    vadd.u16        q0, q4, q5
1788
+    HORIZ_ADD       d0, d0, d1
1789
+    vmov.32         r6, d0[0]
1790
+    add             r7, r6
1791
+
1792
+    veor            q4, q4
1793
+    veor            q5, q5
1794
+    pixel_satd_16x32_neon
1795
+    vadd.u16        q0, q4, q5
1796
+    HORIZ_ADD       d0, d0, d1
1797
+    vmov.32         r6, d0[0]
1798
+    add             r0, r7, r6
1799
+
1800
+    vpop            {d8-d11}
1801
+    pop             {r4, r5, r6, r7, r8, r9}
1802
+    mov             lr, ip
1803
+    bx              lr
1804
+endfunc
1805
+
1806
+.macro LOAD_DIFF_8x4 q0 q1 q2 q3
1807
+    vld1.32         {d1}, [r2], r3
1808
+    vld1.32         {d0}, [r0,:64], r1
1809
+    vsubl.u8        \q0, d0, d1
1810
+    vld1.32         {d3}, [r2], r3
1811
+    vld1.32         {d2}, [r0,:64], r1
1812
+    vsubl.u8        \q1, d2, d3
1813
+    vld1.32         {d5}, [r2], r3
1814
+    vld1.32         {d4}, [r0,:64], r1
1815
+    vsubl.u8        \q2, d4, d5
1816
+    vld1.32         {d7}, [r2], r3
1817
+    vld1.32         {d6}, [r0,:64], r1
1818
+    vsubl.u8        \q3, d6, d7
1819
+.endm
1820
+
1821
+.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
1822
+    SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
1823
+    SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
1824
+.endm
1825
+
1826
+.macro sa8d_satd_8x8 satd=
1827
+function x265_sa8d_\satd\()8x8_neon, export=0
1828
+    LOAD_DIFF_8x4   q8,  q9,  q10, q11
1829
+    vld1.64         {d7}, [r2], r3
1830
+    SUMSUB_AB       q0,  q1,  q8,  q9
1831
+    vld1.64         {d6}, [r0,:64], r1
1832
+    vsubl.u8        q12, d6,  d7
1833
+    vld1.64         {d17}, [r2], r3
1834
+    SUMSUB_AB       q2,  q3,  q10, q11
1835
+    vld1.64         {d16}, [r0,:64], r1
1836
+    vsubl.u8        q13, d16, d17
1837
+    vld1.64         {d19}, [r2], r3
1838
+    SUMSUB_AB       q8,  q10, q0,  q2
1839
+    vld1.64         {d18}, [r0,:64], r1
1840
+    vsubl.u8        q14, d18, d19
1841
+    vld1.64         {d1}, [r2], r3
1842
+    SUMSUB_AB       q9,  q11, q1,  q3
1843
+    vld1.64         {d0}, [r0,:64], r1
1844
+    vsubl.u8        q15, d0,  d1
1845
+
1846
+    HADAMARD4_V     q12, q13, q14, q15,  q0,  q1,  q2,  q3
1847
+
1848
+    SUMSUB_ABCD     q0,  q8,  q1,  q9,   q8,  q12, q9,  q13
1849
+    SUMSUB_AB       q2,  q10, q10, q14
1850
+    vtrn.16         q8,  q9
1851
+    SUMSUB_AB       q3,  q11, q11, q15
1852
+    vtrn.16         q0,  q1
1853
+    SUMSUB_AB       q12, q13, q8,  q9
1854
+    vtrn.16         q10, q11
1855
+    SUMSUB_AB       q8,  q9,  q0,  q1
1856
+    vtrn.16         q2,  q3
1857
+    SUMSUB_AB       q14, q15, q10, q11
1858
+    vadd.i16        q10, q2,  q3
1859
+    vtrn.32         q12, q14
1860
+    vsub.i16        q11, q2,  q3
1861
+    vtrn.32         q13, q15
1862
+    SUMSUB_AB       q0,  q2,  q12, q14
1863
+    vtrn.32         q8,  q10
1864
+    SUMSUB_AB       q1,  q3,  q13, q15
1865
+    vtrn.32         q9,  q11
1866
+    SUMSUB_AB       q12, q14, q8,  q10
1867
+    SUMSUB_AB       q13, q15, q9,  q11
1868
+
1869
+    vswp            d1,  d24
1870
+    ABS2            q0,  q12
1871
+    vswp            d3,  d26
1872
+    ABS2            q1,  q13
1873
+    vswp            d5,  d28
1874
+    ABS2            q2,  q14
1875
+    vswp            d7,  d30
1876
+    ABS2            q3,  q15
1877
+    vmax.s16        q8,  q0,  q12
1878
+    vmax.s16        q9,  q1,  q13
1879
+    vmax.s16        q10, q2,  q14
1880
+    vmax.s16        q11, q3,  q15
1881
+    vadd.i16        q8,  q8,  q9
1882
+    vadd.i16        q9,  q10, q11
1883
+
1884
+    bx              lr
1885
+endfunc
1886
+.endm
1887
+
1888
+sa8d_satd_8x8
1889
+
1890
+function x265_pixel_sa8d_8x8_neon
1891
+    mov             ip,  lr
1892
+    bl              x265_sa8d_8x8_neon
1893
+    vadd.u16        q0,  q8,  q9
1894
+    HORIZ_ADD       d0,  d0,  d1
1895
+    mov             lr,  ip
1896
+    vmov.32         r0,  d0[0]
1897
+    add             r0,  r0,  #1
1898
+    lsr             r0,  r0,  #1
1899
+    bx              lr
1900
+endfunc
1901
+
1902
+function x265_pixel_sa8d_8x16_neon
1903
+    push            {r4, r5}
1904
+    mov             ip,  lr
1905
+    bl              x265_sa8d_8x8_neon
1906
+    vadd.u16        q0,  q8,  q9
1907
+    HORIZ_ADD       d0,  d0,  d1
1908
+    vmov.32         r5, d0[0]
1909
+    add             r5,  r5,  #1
1910
+    lsr             r5,  r5,  #1
1911
+    bl              x265_sa8d_8x8_neon
1912
+    vadd.u16        q0,  q8,  q9
1913
+    HORIZ_ADD       d0,  d0,  d1
1914
+    vmov.32         r4, d0[0]
1915
+    add             r4,  r4,  #1
1916
+    lsr             r4,  r4,  #1
1917
+    add             r0, r4, r5
1918
+    mov             lr,  ip
1919
+    pop             {r4, r5}
1920
+    bx              lr
1921
+endfunc
1922
+
1923
+function x265_pixel_sa8d_16x16_neon
1924
+    vpush           {d8 - d11}
1925
+    mov             ip,  lr
1926
+    bl              x265_sa8d_8x8_neon
1927
+    vpaddl.u16      q4,  q8
1928
+    vpaddl.u16      q5,  q9
1929
+    bl              x265_sa8d_8x8_neon
1930
+    vpadal.u16      q4,  q8
1931
+    vpadal.u16      q5,  q9
1932
+    sub             r0,  r0,  r1,  lsl #4
1933
+    sub             r2,  r2,  r3,  lsl #4
1934
+    add             r0,  r0,  #8
1935
+    add             r2,  r2,  #8
1936
+    bl              x265_sa8d_8x8_neon
1937
+    vpadal.u16      q4,  q8
1938
+    vpadal.u16      q5,  q9
1939
+    bl              x265_sa8d_8x8_neon
1940
+    vpaddl.u16      q8,  q8
1941
+    vpaddl.u16      q9,  q9
1942
+    vadd.u32        q0,  q4,  q8
1943
+    vadd.u32        q1,  q5,  q9
1944
+    vadd.u32        q0,  q0,  q1
1945
+    vadd.u32        d0,  d0,  d1
1946
+    vpadd.u32       d0,  d0,  d0
1947
+    vpop            {d8-d11}
1948
+    mov             lr,  ip
1949
+    vmov.32         r0,  d0[0]
1950
+    add             r0,  r0,  #1
1951
+    lsr             r0,  r0,  #1
1952
+    bx              lr
1953
+endfunc
1954
+
1955
+function x265_quant_neon
1956
+    push            {r4-r6}
1957
+    ldr             r4, [sp, #3* 4]
1958
+    mov             r12, #1
1959
+    lsl             r12, r4
1960
+    vdup.s32        d0, r12                // d0 = 2^qbits
1961
+    neg             r12, r4
1962
+    vdup.s32        q1, r12                // q1= -qbits
1963
+    add             r12, #8
1964
+    vdup.s32        q2, r12                // q2= -qbits+8
1965
+    ldr             r4, [sp, #3* 4 + 4]
1966
+    vdup.s32        q3, r4                 // q3= add
1967
+    ldr             r4, [sp, #3* 4 + 8]    // r4= numcoeff
1968
+
1969
+    lsr             r4, r4, #2
1970
+    veor.s32        q4, q4                 // q4= accumulate numsig
1971
+    eor             r5, r5
1972
+    veor.s32        q12, q12
1973
+
1974
+.loop_quant:
1975
+
1976
+    vld1.s16        d16, [r0]!
1977
+    vmovl.s16       q9, d16                // q9= coef[blockpos]
1978
+
1979
+    vclt.s32        q8, q9, #0             // q8= sign
1980
+
1981
+    vabs.s32        q9, q9                 // q9= level=abs(coef[blockpos])
1982
+    vld1.s32        {q10}, [r1]!           // q10= quantCoeff[blockpos]
1983
+    vmul.i32        q9, q9, q10            // q9 = tmplevel = abs(level) * quantCoeff[blockpos];
1984
+
1985
+    vadd.s32        q10, q9, q3            // q10= tmplevel+add
1986
+    vshl.s32        q10, q10, q1           // q10= level =(tmplevel+add) >> qbits
1987
+
1988
+    vmls.s32        q9, q10, d0[0]         // q9 = tmplevel - (level << qBits)
1989
+    vshl.s32        q11, q9, q2            // q11= ((tmplevel - (level << qBits)) >> qBits8)
1990
+    vst1.s32        {q11}, [r2]!           // store deltaU
1991
+
1992
+    // numsig
1993
+    vceq.s32        q11, q10, q12
1994
+    vadd.s32        q4, q11
1995
+    add             r5, #4
1996
+
1997
+    veor.s32        q11, q10, q8
1998
+    vsub.s32        q11, q11, q8
1999
+    vqmovn.s32      d16, q11
2000
+    vst1.s16        d16, [r3]!
2001
+
2002
+    subs            r4, #1
2003
+    bne             .loop_quant
2004
+
2005
+    vadd.u32        d8, d9
2006
+    vpadd.u32       d8, d8
2007
+    vmov.32         r12, d8[0]
2008
+    add             r0, r5, r12
2009
+
2010
+    pop             {r4-r6}
2011
+    bx              lr
2012
+endfunc
2013
+
2014
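For reference, the scalar quantization that x265_quant_neon vectorizes four coefficients at a time looks roughly like this; a hedged C sketch reconstructed from the inline comments and the x265_quant_neon prototype in pixel-util.h below, not the upstream C reference verbatim (the asm works in 32-bit lanes and narrows with saturating vqmovn):

    #include <stdint.h>
    #include <stdlib.h>

    static uint32_t quant_sketch(const int16_t* coef, const int32_t* quantCoeff,
                                 int32_t* deltaU, int16_t* qCoef,
                                 int qBits, int add, int numCoeff)
    {
        uint32_t numSig = 0;                      /* nonzero levels, the return value */
        for (int i = 0; i < numCoeff; i++)
        {
            int sign      = coef[i] < 0;
            int32_t tmp   = abs(coef[i]) * quantCoeff[i];      /* tmplevel */
            int32_t level = (tmp + add) >> qBits;
            deltaU[i]     = (tmp - (level << qBits)) >> (qBits - 8);
            qCoef[i]      = (int16_t)(sign ? -level : level);  /* asm: veor/vsub + vqmovn */
            numSig       += (level != 0);
        }
        return numSig;
    }

The asm arrives at the same numSig differently: r5 counts all processed coefficients and q4 accumulates a -1 for every zero level (vceq masks), so the final add yields the nonzero count.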
+function x265_nquant_neon
2015
+    push            {r4}
2016
+    neg             r12, r3
2017
+    vdup.s32        q0, r12                 // q0= -qbits
2018
+    ldr             r3, [sp, #1* 4]
2019
+    vdup.s32        q1, r3                  // add
2020
+    ldr             r3, [sp, #1* 4 + 4]     // numcoeff
2021
+
2022
+    lsr             r3, r3, #2
2023
+    veor.s32        q4, q4                 // q4= accumulate numsig
2024
+    eor             r4, r4
2025
+    veor.s32        q12, q12
2026
+
2027
+.loop_nquant:
2028
+
2029
+    vld1.s16        d16, [r0]!
2030
+    vmovl.s16       q9, d16                // q9= coef[blockpos]
2031
+
2032
+    vclt.s32        q8, q9, #0             // q8= sign
2033
+
2034
+    vabs.s32        q9, q9                 // q9= level=abs(coef[blockpos])
2035
+    vld1.s32        {q10}, [r1]!           // q10= quantCoeff[blockpos]
2036
+    vmul.i32        q9, q9, q10            // q9 = tmplevel = abs(level) * quantCoeff[blockpos];
2037
+
2038
+    vadd.s32        q10, q9, q1            // q10= tmplevel+add
2039
+    vshl.s32        q10, q10, q0           // q10= level =(tmplevel+add) >> qbits
2040
+
2041
+    // numsig
2042
+    vceq.s32        q11, q10, q12
2043
+    vadd.s32        q4, q11
2044
+    add             r4, #4
2045
+
2046
+    veor.s32        q11, q10, q8
2047
+    vsub.s32        q11, q11, q8
2048
+    vqmovn.s32      d16, q11
2049
+    vabs.s16        d17, d16
2050
+    vst1.s16        d17, [r2]!
2051
+
2052
+    subs            r3, #1
2053
+    bne             .loop_nquant
2054
+
2055
+    vadd.u32        d8, d9
2056
+    vpadd.u32       d8, d8
2057
+    vmov.32         r12, d8[0]
2058
+    add             r0, r4, r12
2059
+
2060
+    pop             {r4}
2061
+    bx              lr
2062
+endfunc
2063
+.macro sa8d_16x16 reg
2064
+    bl              x265_sa8d_8x8_neon
2065
+    vpaddl.u16      q4,  q8
2066
+    vpaddl.u16      q5,  q9
2067
+    bl              x265_sa8d_8x8_neon
2068
+    vpadal.u16      q4,  q8
2069
+    vpadal.u16      q5,  q9
2070
+    sub             r0,  r0,  r1,  lsl #4
2071
+    sub             r2,  r2,  r3,  lsl #4
2072
+    add             r0,  r0,  #8
2073
+    add             r2,  r2,  #8
2074
+    bl              x265_sa8d_8x8_neon
2075
+    vpadal.u16      q4,  q8
2076
+    vpadal.u16      q5,  q9
2077
+    bl              x265_sa8d_8x8_neon
2078
+    vpaddl.u16      q8,  q8
2079
+    vpaddl.u16      q9,  q9
2080
+    vadd.u32        q0,  q4,  q8
2081
+    vadd.u32        q1,  q5,  q9
2082
+    vadd.u32        q0,  q0,  q1
2083
+    vadd.u32        d0,  d0,  d1
2084
+    vpadd.u32       d0,  d0,  d0
2085
+    vmov.32         \reg,  d0[0]
2086
+    add             \reg,  \reg,  #1
2087
+    lsr             \reg,  \reg,  #1
2088
+.endm
2089
+
2090
+function x265_pixel_sa8d_16x32_neon
2091
+    push            {r4, r5}
2092
+    vpush           {d8 - d11}
2093
+    mov             ip,  lr
2094
+
2095
+    sa8d_16x16 r4
2096
+
2097
+    sub             r0,  r0,  #8
2098
+    sub             r2,  r2,  #8
2099
+
2100
+    sa8d_16x16 r5
2101
+
2102
+    add             r0, r4, r5
2103
+    vpop            {d8 - d11}
2104
+    pop             {r4, r5}
2105
+    mov             lr,  ip
2106
+    bx              lr
2107
+endfunc
2108
+
2109
+function x265_pixel_sa8d_32x32_neon
2110
+    push            {r4 - r7}
2111
+    vpush           {d8 - d11}
2112
+    mov             ip,  lr
2113
+
2114
+    sa8d_16x16 r4
2115
+
2116
+    sub             r0,  r0,  r1,  lsl #4
2117
+    sub             r2,  r2,  r3,  lsl #4
2118
+    add             r0,  r0,  #8
2119
+    add             r2,  r2,  #8
2120
+
2121
+    sa8d_16x16 r5
2122
+
2123
+    sub             r0,  r0,  #24
2124
+    sub             r2,  r2,  #24
2125
+
2126
+    sa8d_16x16 r6
2127
+
2128
+    sub             r0,  r0,  r1,  lsl #4
2129
+    sub             r2,  r2,  r3,  lsl #4
2130
+    add             r0,  r0,  #8
2131
+    add             r2,  r2,  #8
2132
+
2133
+    sa8d_16x16 r7
2134
+
2135
+    add             r4, r4, r5
2136
+    add             r6, r6, r7
2137
+    add             r0, r4, r6
2138
+    vpop            {d8 - d11}
2139
+    pop             {r4 - r7}
2140
+    mov             lr,  ip
2141
+    bx              lr
2142
+endfunc
2143
+
2144
+function x265_pixel_sa8d_32x64_neon
2145
+    push            {r4 - r10}
2146
+    vpush           {d8 - d11}
2147
+    mov             ip,  lr
2148
+
2149
+    mov             r10, #4
2150
+    eor             r9, r9
2151
+
2152
+.loop_32:
2153
+
2154
+    sa8d_16x16 r4
2155
+
2156
+    sub             r0,  r0,  r1,  lsl #4
2157
+    sub             r2,  r2,  r3,  lsl #4
2158
+    add             r0,  r0,  #8
2159
+    add             r2,  r2,  #8
2160
+
2161
+    sa8d_16x16 r5
2162
+
2163
+    add             r4, r4, r5
2164
+    add             r9, r9, r4
2165
+
2166
+    sub             r0,  r0,  #24
2167
+    sub             r2,  r2,  #24
2168
+
2169
+    subs            r10, #1
2170
+    bgt             .loop_32
2171
+
2172
+    mov             r0, r9
2173
+    vpop            {d8-d11}
2174
+    pop             {r4-r10}
2175
+    mov             lr,  ip
2176
+    bx              lr
2177
+endfunc
2178
+
2179
+function x265_pixel_sa8d_64x64_neon
2180
+    push            {r4-r10}
2181
+    vpush           {d8-d11}
2182
+    mov             ip,  lr
2183
+
2184
+    mov             r10, #4
2185
+    eor             r9, r9
2186
+
2187
+.loop_1:
2188
+
2189
+    sa8d_16x16 r4
2190
+
2191
+    sub             r0,  r0,  r1,  lsl #4
2192
+    sub             r2,  r2,  r3,  lsl #4
2193
+    add             r0,  r0,  #8
2194
+    add             r2,  r2,  #8
2195
+
2196
+    sa8d_16x16 r5
2197
+
2198
+    sub             r0,  r0,  r1,  lsl #4
2199
+    sub             r2,  r2,  r3,  lsl #4
2200
+    add             r0,  r0,  #8
2201
+    add             r2,  r2,  #8
2202
+
2203
+    sa8d_16x16 r6
2204
+
2205
+    sub             r0,  r0,  r1,  lsl #4
2206
+    sub             r2,  r2,  r3,  lsl #4
2207
+    add             r0,  r0,  #8
2208
+    add             r2,  r2,  #8
2209
+
2210
+    sa8d_16x16 r7
2211
+
2212
+    add             r4, r4, r5
2213
+    add             r6, r6, r7
2214
+    add             r8, r4, r6
2215
+    add             r9, r9, r8
2216
+
2217
+    sub             r0,  r0,  #56
2218
+    sub             r2,  r2,  #56
2219
+
2220
+    subs            r10, #1
2221
+    bgt             .loop_1
2222
+
2223
+    mov             r0, r9
2224
+    vpop            {d8-d11}
2225
+    pop             {r4-r10}
2226
+    mov             lr,  ip
2227
+    bx              lr
2228
+endfunc
2229
+
2230
+/***** dequant_scaling*****/
2231
+// void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
2232
+function x265_dequant_scaling_neon
2233
+    push            {r4, r5, r6, r7}
2234
+    ldr             r4, [sp, #16]       // per
2235
+    ldr             r5, [sp, #20]       // shift
2236
+    add             r5, #4              // shift + 4
2237
+    lsr             r3, #3              // num / 8
2238
+    cmp             r5, r4
2239
+    blt             skip
2240
+
2241
+    mov             r12, #1
2242
+    sub             r6, r5, r4          // shift - per
2243
+    sub             r6, #1              // shift - per - 1
2244
+    lsl             r6, r12, r6         // add = 1 << (shift - per - 1)
2245
+    vdup.32         q0, r6
2246
+    sub             r7, r4, r5          // per - shift
2247
+    vdup.32         q3, r7
2248
+
2249
+dequant_loop1:
2250
+    vld1.16         {q9}, [r0]!          // quantCoef
2251
+    vld1.32         {q2}, [r1]!          // deQuantCoef
2252
+    vld1.32         {q10}, [r1]!
2253
+    vmovl.s16       q1, d18
2254
+    vmovl.s16       q9, d19
2255
+
2256
+    vmul.s32        q1, q2              // quantCoef * deQuantCoef
2257
+    vmul.s32        q9, q10
2258
+    vadd.s32        q1, q0              // quantCoef * deQuantCoef + add
2259
+    vadd.s32        q9, q0
2260
+
2261
+    vshl.s32        q1, q3
2262
+    vshl.s32        q9, q3
2263
+    vqmovn.s32      d16, q1             // x265_clip3
2264
+    vqmovn.s32      d17, q9
2265
+    subs            r3, #1
2266
+    vst1.16         {q8}, [r2]!
2267
+    bne             dequant_loop1
2268
+    b               1f
2269
+
2270
+skip:
2271
+    sub             r6, r4, r5          // per - shift
2272
+    vdup.16         q0, r6
2273
+
2274
+dequant_loop2:
2275
+    vld1.16         {q9}, [r0]!          // quantCoef
2276
+    vld1.32         {q2}, [r1]!          // deQuantCoef
2277
+    vld1.32         {q10}, [r1]!
2278
+    vmovl.s16       q1, d18
2279
+    vmovl.s16       q9, d19
2280
+
2281
+    vmul.s32        q1, q2              // quantCoef * deQuantCoef
2282
+    vmul.s32        q9, q10
2283
+    vqmovn.s32      d16, q1             // x265_clip3
2284
+    vqmovn.s32      d17, q9
2285
+
2286
+    vqshl.s16       q8, q0             // coefQ << (per - shift)
2287
+    subs            r3, #1
2288
+    vst1.16         {q8}, [r2]!
2289
+    bne             dequant_loop2
2290
+1:
2291
+    pop             {r4, r5, r6, r7}
2292
+    bx              lr
2293
+endfunc
2294
+
2295
+// void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
2296
+function x265_dequant_normal_neon
2297
+    ldr             r12, [sp]            // shift
2298
+#if HIGH_BIT_DEPTH  // NEVER TEST path
2299
+    cmp             r3, #32768
2300
+    lsrlt           r3, #(BIT_DEPTH - 8)
2301
+    sublt           r12, #(BIT_DEPTH - 8)
2302
+#endif
2303
+    lsr             r2, #4              // num / 16
2304
+
2305
+    neg             r12, r12
2306
+    vdup.16         q0, r3
2307
+    vdup.32         q1, r12
2308
+
2309
+.dqn_loop1:
2310
+    vld1.16         {d4-d7}, [r0]!
2311
+
2312
+    vmull.s16       q8, d4, d0
2313
+    vmull.s16       q9, d5, d0
2314
+    vmull.s16       q10, d6, d0
2315
+    vmull.s16       q11, d7, d0
2316
+
2317
+    vrshl.s32       q8, q1
2318
+    vrshl.s32       q9, q1
2319
+    vrshl.s32       q10, q1
2320
+    vrshl.s32       q11, q1
2321
+    vqmovn.s32      d16, q8
2322
+    vqmovn.s32      d17, q9
2323
+    vqmovn.s32      d18, q10
2324
+    vqmovn.s32      d19, q11
2325
+
2326
+    subs            r2, #1
2327
+    vst1.16         {d16-d19}, [r1]!
2328
+    bgt             .dqn_loop1
2329
+    bx              lr
2330
+endfunc
2331
+
2332
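In scalar form, the dequant_normal loop above is just a multiply, a rounding right shift (vrshl.s32 with a negative shift count) and a saturating narrow (vqmovn.s32). A minimal C sketch, assuming the 8-bit path where the HIGH_BIT_DEPTH block is compiled out:

    #include <stdint.h>

    static inline int16_t clip16(int32_t x)          /* vqmovn.s32 behaviour */
    {
        return (int16_t)(x > 32767 ? 32767 : (x < -32768 ? -32768 : x));
    }

    static void dequant_normal_sketch(const int16_t* quantCoef, int16_t* coef,
                                      int num, int scale, int shift)
    {
        int32_t round = 1 << (shift - 1);            /* vrshl's implicit rounding term */
        for (int i = 0; i < num; i++)
            coef[i] = clip16(((int32_t)quantCoef[i] * scale + round) >> shift);
    }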
+/********* ssim ***********/
2333
+// void x265_ssim_4x4x2_core_neon(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]);
2334
+function x265_ssim_4x4x2_core_neon
2335
+    ldr             r12, [sp]
2336
+
2337
+    vld1.64         {d0}, [r0], r1
2338
+    vld1.64         {d1}, [r0], r1
2339
+    vld1.64         {d2}, [r0], r1
2340
+    vld1.64         {d3}, [r0], r1
2341
+
2342
+    vld1.64         {d4}, [r2], r3
2343
+    vld1.64         {d5}, [r2], r3
2344
+    vld1.64         {d6}, [r2], r3
2345
+    vld1.64         {d7}, [r2], r3
2346
+
2347
+    vpaddl.u8       q8, q0
2348
+    vpadal.u8       q8, q1
2349
+    vpaddl.u8       q9, q2
2350
+    vpadal.u8       q9, q3
2351
+    vadd.u16        d16, d17
2352
+    vpaddl.u16      d16, d16
2353
+    vadd.u16        d18, d19
2354
+    vpaddl.u16      d17, d18
2355
+
2356
+    vmull.u8        q10, d0, d0
2357
+    vmull.u8        q11, d1, d1
2358
+    vmull.u8        q12, d2, d2
2359
+    vmull.u8        q13, d3, d3
2360
+    vpaddl.u16      q10, q10
2361
+    vpadal.u16      q10, q11
2362
+    vpadal.u16      q10, q12
2363
+    vpadal.u16      q10, q13
2364
+
2365
+    vmull.u8        q9, d4, d4
2366
+    vmull.u8        q11, d5, d5
2367
+    vmull.u8        q12, d6, d6
2368
+    vmull.u8        q13, d7, d7
2369
+    vpadal.u16      q10, q9
2370
+    vpadal.u16      q10, q11
2371
+    vpadal.u16      q10, q12
2372
+    vpadal.u16      q10, q13
2373
+    vpadd.u32       d18, d20, d21
2374
+
2375
+    vmull.u8        q10, d0, d4
2376
+    vmull.u8        q11, d1, d5
2377
+    vmull.u8        q12, d2, d6
2378
+    vmull.u8        q13, d3, d7
2379
+    vpaddl.u16      q10, q10
2380
+    vpadal.u16      q10, q11
2381
+    vpadal.u16      q10, q12
2382
+    vpadal.u16      q10, q13
2383
+    vpadd.u32       d19, d20, d21
2384
+
2385
+    vst4.32         {d16-d19}, [r12]
2386
+    bx              lr
2387
+endfunc
2388
+
2389
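As the vst4.32 interleave at the end suggests, the routine fills sums[2][4] with, per 4x4 block, the four accumulations SSIM needs: sum of pix1, sum of pix2, combined sum of squares, and sum of cross products. A scalar sketch, assuming the 8-bit build where pixel is uint8_t (the [2] dimension covers two horizontally adjacent blocks):

    #include <stdint.h>

    static void ssim_4x4x2_core_sketch(const uint8_t* pix1, intptr_t stride1,
                                       const uint8_t* pix2, intptr_t stride2,
                                       int sums[2][4])
    {
        for (int b = 0; b < 2; b++, pix1 += 4, pix2 += 4)
        {
            int s1 = 0, s2 = 0, ss = 0, s12 = 0;
            for (int y = 0; y < 4; y++)
                for (int x = 0; x < 4; x++)
                {
                    int a = pix1[y * stride1 + x];
                    int c = pix2[y * stride2 + x];
                    s1  += a;
                    s2  += c;
                    ss  += a * a + c * c;
                    s12 += a * c;
                }
            sums[b][0] = s1; sums[b][1] = s2; sums[b][2] = ss; sums[b][3] = s12;
        }
    }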
+// int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
2390
+function x265_psyCost_4x4_neon
2391
+    vld1.32         {d16[]}, [r0,:32], r1                   // d16 = [A03 A02 A01 A00 A03 A02 A01 A00]
2392
+    vld1.32         {d17[]}, [r0,:32], r1                   // d17 = [A13 A12 A11 A10 A13 A12 A11 A10]
2393
+    vld1.32         {d16[1]}, [r0,:32], r1                  // d16 = [A23 A22 A21 A20 A03 A02 A01 A00]
2394
+    vld1.32         {d17[1]}, [r0,:32], r1                  // d17 = [A33 A32 A31 A30 A13 A12 A11 A10]
2395
+
2396
+    vld1.32         {d18[]}, [r2,:32], r3                   // d18 = [B03 B02 B01 B00 B03 B02 B01 B00]
2397
+    vld1.32         {d19[]}, [r2,:32], r3                   // d19 = [B13 B12 B11 B10 B13 B12 B11 B10]
2398
+    vld1.32         {d18[1]}, [r2,:32], r3                  // d18 = [B23 B22 B21 B20 B03 B02 B01 B00]
2399
+    vld1.32         {d19[1]}, [r2,:32], r3                  // d19 = [B33 B32 B31 B30 B13 B12 B11 B10]
2400
+
2401
+    vaddl.u8        q2, d16, d17                            // q2 = [2+3 0+1]
2402
+    vsubl.u8        q3, d16, d17                            // q3 = [2-3 0-1]
2403
+    vaddl.u8        q12, d18, d19
2404
+    vsubl.u8        q13, d18, d19
2405
+
2406
+    SUMSUB_ABCD     d0, d2, d1, d3, d4, d5, d6, d7          // q0 = [(0-1)+(2-3) (0+1)+(2+3)], q1 = [(0-1)-(2-3) (0+1)-(2+3)]
2407
+    SUMSUB_ABCD     d20, d22, d21, d23, d24, d25, d26, d27
2408
+
2409
+    // Hadamard-1D
2410
+    vtrn.16         q0, q1
2411
+    vtrn.16         q10, q11
2412
+    SUMSUB_AB       q2, q3, q0, q1                          // q2 = [((0-1)-(2-3))+((0-1)+(2-3)) ((0+1)-(2+3))+((0+1)+(2+3))], q3 = [((0-1)-(2-3))-((0-1)+(2-3)) ((0+1)-(2+3))-((0+1)+(2+3))]
2413
+    SUMSUB_AB       q12, q13, q10, q11
2414
+
2415
+    // SAD Stage-0
2416
+    vaddl.u8        q14, d16, d17                           // q14 = [S23x4 S01x4]
2417
+    vaddl.u8        q15, d18, d19
2418
+
2419
+    // Hadamard-2D
2420
+    vtrn.32         q2, q3
2421
+    vtrn.32         q12, q13
2422
+    vabs.s16        q2, q2
2423
+    vabs.s16        q12, q12
2424
+    vabs.s16        q3, q3
2425
+    vabs.s16        q13, q13
2426
+
2427
+    // SAD Stage-1
2428
+    vadd.u16        d28, d29                                // SAD: reduce to 4 elements
2429
+    vadd.u16        d30, d31
2430
+
2431
+    vmax.s16        q0, q2, q3
2432
+    vmax.s16        q10, q12, q13
2433
+
2434
+    // SAD Stage-2
2435
+    vpadd.u16       d28, d30                                // SAD: reduce to 2 elements
2436
+
2437
+    // SAD & SATD Final Stage
2438
+    vswp            d1, d20
2439
+    vadd.u16        q0, q10
2440
+    vpaddl.u16      d28, d28                                // d28 = SAD_DWORD[B A]
2441
+    vpadd.u16       d0, d1
2442
+    vshr.u32        d28, #2                                 // d28 = SAD_DWORD[B A] >> 2
2443
+    vpaddl.u16      d0, d0                                  // d0 = SATD_DWORD[B A]
2444
+    vsub.s32        d0, d28                                 // d0 = SATD - SAD
2445
+    vmov.32         r0, d0[0]
2446
+    vmov.32         r1, d0[1]
2447
+    subs            r0, r1
2448
+    rsbmi           r0, r0, #0
2449
+
2450
+    bx              lr
2451
+endfunc
2452
+
2453
x265_2.0.tar.gz/source/common/arm/pixel-util.h Added
94
 
1
@@ -0,0 +1,92 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Steve Borho <steve@borho.org>
6
+ *          Min Chen <chenm003@163.com>
7
+ *
8
+ * This program is free software; you can redistribute it and/or modify
9
+ * it under the terms of the GNU General Public License as published by
10
+ * the Free Software Foundation; either version 2 of the License, or
11
+ * (at your option) any later version.
12
+ *
13
+ * This program is distributed in the hope that it will be useful,
14
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
+ * GNU General Public License for more details.
17
+ *
18
+ * You should have received a copy of the GNU General Public License
19
+ * along with this program; if not, write to the Free Software
20
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21
+ *
22
+ * This program is also available under a commercial proprietary license.
23
+ * For more information, contact us at license @ x265.com.
24
+ *****************************************************************************/
25
+
26
+#ifndef X265_PIXEL_UTIL_ARM_H
27
+#define X265_PIXEL_UTIL_ARM_H
28
+
29
+uint64_t x265_pixel_var_8x8_neon(const pixel* pix, intptr_t stride);
30
+uint64_t x265_pixel_var_16x16_neon(const pixel* pix, intptr_t stride);
31
+uint64_t x265_pixel_var_32x32_neon(const pixel* pix, intptr_t stride);
32
+uint64_t x265_pixel_var_64x64_neon(const pixel* pix, intptr_t stride);
33
+
34
+void x265_getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
35
+void x265_getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
36
+void x265_getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
37
+void x265_getResidual32_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
38
+
39
+void x265_scale1D_128to64_neon(pixel *dst, const pixel *src);
40
+void x265_scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride);
41
+
42
+int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
43
+int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
44
+int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
45
+int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
46
+int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
47
+int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
48
+int x265_pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
49
+int x265_pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
50
+int x265_pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
51
+int x265_pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
52
+int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
53
+int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
54
+int x265_pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
55
+int x265_pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
56
+int x265_pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
57
+int x265_pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
58
+int x265_pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
59
+int x265_pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
60
+int x265_pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
61
+int x265_pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
62
+int x265_pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
63
+int x265_pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
64
+int x265_pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
65
+int x265_pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
66
+int x265_pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
67
+int x265_pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
68
+int x265_pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
69
+int x265_pixel_satd_48x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
70
+int x265_pixel_satd_64x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
71
+int x265_pixel_satd_64x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
72
+int x265_pixel_satd_64x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
73
+int x265_pixel_satd_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
74
+
75
+int x265_pixel_sa8d_8x8_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
76
+int x265_pixel_sa8d_8x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
77
+int x265_pixel_sa8d_16x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
78
+int x265_pixel_sa8d_16x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
79
+int x265_pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
80
+int x265_pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
81
+int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
82
+
83
+uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
84
+uint32_t x265_nquant_neon(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
85
+
86
+void x265_dequant_scaling_neon(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift);
87
+void x265_dequant_normal_neon(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
88
+
89
+void x265_ssim_4x4x2_core_neon(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]);
90
+
91
+int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
92
+
93
+#endif // ifndef X265_PIXEL_UTIL_ARM_H
94
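These prototypes are plain C-callable; inside x265 they are typically installed into the encoder's primitive-function tables rather than invoked directly. A hypothetical direct call, assuming an 8-bit build where pixel is uint8_t and two planes with a 64-pixel row stride:

    #include <stdint.h>

    typedef uint8_t pixel;    /* 8-bit build assumption; HIGH_BIT_DEPTH uses uint16_t */

    extern int x265_pixel_satd_8x8_neon(const pixel*, intptr_t, const pixel*, intptr_t);

    /* SATD of one 8x8 block between a source and a reference plane. */
    int example_satd(const pixel* src, const pixel* ref)
    {
        return x265_pixel_satd_8x8_neon(src, 64, ref, 64);
    }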
x265_2.0.tar.gz/source/common/arm/pixel.h Added
217
 
1
@@ -0,0 +1,215 @@
2
+/*****************************************************************************
3
+ * pixel.h: ARM pixel metrics
4
+ *****************************************************************************
5
+ * Copyright (C) 2003-2013 x264 project
6
+ * Copyright (C) 2013-2016 x265 project
7
+ *
8
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
9
+ *          Loren Merritt <lorenm@u.washington.edu>
10
+ *          Fiona Glaser <fiona@x264.com>
11
+ *          Min Chen <chenm003@163.com>
12
+ *
13
+ * This program is free software; you can redistribute it and/or modify
14
+ * it under the terms of the GNU General Public License as published by
15
+ * the Free Software Foundation; either version 2 of the License, or
16
+ * (at your option) any later version.
17
+ *
18
+ * This program is distributed in the hope that it will be useful,
19
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21
+ * GNU General Public License for more details.
22
+ *
23
+ * You should have received a copy of the GNU General Public License
24
+ * along with this program; if not, write to the Free Software
25
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
26
+ *
27
+ * This program is also available under a commercial proprietary license.
28
+ * For more information, contact us at license @ x265.com.
29
+ *****************************************************************************/
30
+
31
+#ifndef X265_I386_PIXEL_ARM_H
32
+#define X265_I386_PIXEL_ARM_H
33
+
34
+int x265_pixel_sad_4x4_armv6(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
35
+int x265_pixel_sad_4x8_armv6(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
36
+int x265_pixel_sad_4x16_armv6(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
37
+int x265_pixel_sad_8x4_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
38
+int x265_pixel_sad_8x8_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
39
+int x265_pixel_sad_8x16_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
40
+int x265_pixel_sad_8x32_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
41
+int x265_pixel_sad_16x4_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
42
+int x265_pixel_sad_16x8_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
43
+int x265_pixel_sad_16x16_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
44
+int x265_pixel_sad_16x12_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
45
+int x265_pixel_sad_16x32_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
46
+int x265_pixel_sad_16x64_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
47
+int x265_pixel_sad_32x8_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
48
+int x265_pixel_sad_32x16_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
49
+int x265_pixel_sad_32x32_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
50
+int x265_pixel_sad_32x64_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
51
+int x265_pixel_sad_32x24_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
52
+int x265_pixel_sad_64x16_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
53
+int x265_pixel_sad_64x32_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
54
+int x265_pixel_sad_64x64_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
55
+int x265_pixel_sad_64x48_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
56
+int x265_pixel_sad_12x16_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
57
+int x265_pixel_sad_24x32_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
58
+int x265_pixel_sad_48x64_neon(const pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
59
+
60
+void x265_pixel_avg_pp_4x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
61
+void x265_pixel_avg_pp_4x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
62
+void x265_pixel_avg_pp_4x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
63
+void x265_pixel_avg_pp_8x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
64
+void x265_pixel_avg_pp_8x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
65
+void x265_pixel_avg_pp_8x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
66
+void x265_pixel_avg_pp_8x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
67
+void x265_pixel_avg_pp_12x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
68
+void x265_pixel_avg_pp_16x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
69
+void x265_pixel_avg_pp_16x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
70
+void x265_pixel_avg_pp_16x12_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
71
+void x265_pixel_avg_pp_16x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
72
+void x265_pixel_avg_pp_16x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
73
+void x265_pixel_avg_pp_16x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
74
+void x265_pixel_avg_pp_24x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
75
+void x265_pixel_avg_pp_32x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
76
+void x265_pixel_avg_pp_32x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
77
+void x265_pixel_avg_pp_32x24_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
78
+void x265_pixel_avg_pp_32x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
79
+void x265_pixel_avg_pp_32x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
80
+void x265_pixel_avg_pp_48x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
81
+void x265_pixel_avg_pp_64x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
82
+void x265_pixel_avg_pp_64x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
83
+void x265_pixel_avg_pp_64x48_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
84
+void x265_pixel_avg_pp_64x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
85
+
86
+void x265_sad_x3_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
87
+void x265_sad_x3_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
88
+void x265_sad_x3_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
89
+void x265_sad_x3_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
90
+void x265_sad_x3_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
91
+void x265_sad_x3_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
92
+void x265_sad_x3_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
93
+void x265_sad_x3_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
94
+void x265_sad_x3_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
95
+void x265_sad_x3_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
96
+void x265_sad_x3_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
97
+void x265_sad_x3_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
98
+void x265_sad_x3_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
99
+void x265_sad_x3_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
100
+void x265_sad_x3_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
101
+void x265_sad_x3_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
102
+void x265_sad_x3_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
103
+void x265_sad_x3_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
104
+void x265_sad_x3_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
105
+void x265_sad_x3_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
106
+void x265_sad_x3_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
107
+void x265_sad_x3_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
108
+void x265_sad_x3_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
109
+void x265_sad_x3_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
110
+void x265_sad_x3_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
111
+
112
+void x265_sad_x4_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
113
+void x265_sad_x4_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
114
+void x265_sad_x4_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
115
+void x265_sad_x4_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
116
+void x265_sad_x4_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
117
+void x265_sad_x4_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
118
+void x265_sad_x4_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
119
+void x265_sad_x4_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
120
+void x265_sad_x4_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
121
+void x265_sad_x4_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
122
+void x265_sad_x4_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
123
+void x265_sad_x4_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
124
+void x265_sad_x4_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
125
+void x265_sad_x4_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
126
+void x265_sad_x4_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
127
+void x265_sad_x4_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
128
+void x265_sad_x4_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
129
+void x265_sad_x4_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
130
+void x265_sad_x4_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
131
+void x265_sad_x4_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
132
+void x265_sad_x4_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
133
+void x265_sad_x4_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
134
+void x265_sad_x4_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
135
+void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
136
+void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
137
+
138
+sse_t x265_pixel_sse_pp_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
139
+sse_t x265_pixel_sse_pp_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
140
+sse_t x265_pixel_sse_pp_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
141
+sse_t x265_pixel_sse_pp_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
142
+sse_t x265_pixel_sse_pp_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
143
+
144
+sse_t x265_pixel_sse_ss_4x4_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
145
+sse_t x265_pixel_sse_ss_8x8_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
146
+sse_t x265_pixel_sse_ss_16x16_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
147
+sse_t x265_pixel_sse_ss_32x32_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
148
+sse_t x265_pixel_sse_ss_64x64_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
149
+
150
+sse_t x265_pixel_ssd_s_4x4_neon(const int16_t* a, intptr_t dstride);
151
+sse_t x265_pixel_ssd_s_8x8_neon(const int16_t* a, intptr_t dstride);
152
+sse_t x265_pixel_ssd_s_16x16_neon(const int16_t* a, intptr_t dstride);
153
+sse_t x265_pixel_ssd_s_32x32_neon(const int16_t* a, intptr_t dstride);
154
+sse_t x265_pixel_ssd_s_64x64_neon(const int16_t* a, intptr_t dstride);
155
+
156
+void x265_pixel_sub_ps_4x4_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
157
+void x265_pixel_sub_ps_8x8_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
158
+void x265_pixel_sub_ps_16x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
159
+void x265_pixel_sub_ps_32x32_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
160
+void x265_pixel_sub_ps_64x64_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
161
+void x265_pixel_sub_ps_4x8_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
162
+void x265_pixel_sub_ps_8x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
163
+void x265_pixel_sub_ps_16x32_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
164
+void x265_pixel_sub_ps_32x64_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
165
+
166
+void x265_pixel_add_ps_4x4_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
167
+void x265_pixel_add_ps_8x8_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
168
+void x265_pixel_add_ps_16x16_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
169
+void x265_pixel_add_ps_32x32_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
170
+void x265_pixel_add_ps_64x64_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
171
+void x265_pixel_add_ps_4x8_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
172
+void x265_pixel_add_ps_8x16_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
173
+void x265_pixel_add_ps_16x32_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
174
+void x265_pixel_add_ps_32x64_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
175
+
176
+void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
177
+
178
+void x265_addAvg_4x4_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
179
+void x265_addAvg_4x8_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
180
+void x265_addAvg_4x16_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
181
+void x265_addAvg_8x4_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
182
+void x265_addAvg_8x8_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
183
+void x265_addAvg_8x16_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
184
+void x265_addAvg_8x32_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
185
+void x265_addAvg_12x16_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
186
+void x265_addAvg_16x4_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
187
+void x265_addAvg_16x8_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
188
+void x265_addAvg_16x12_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
189
+void x265_addAvg_16x16_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
190
+void x265_addAvg_16x32_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
191
+void x265_addAvg_16x64_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
192
+void x265_addAvg_24x32_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
193
+void x265_addAvg_32x8_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
194
+void x265_addAvg_32x16_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
195
+void x265_addAvg_32x24_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
196
+void x265_addAvg_32x32_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
197
+void x265_addAvg_32x64_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
198
+void x265_addAvg_48x64_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
199
+void x265_addAvg_64x16_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
200
+void x265_addAvg_64x32_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
201
+void x265_addAvg_64x48_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
202
+void x265_addAvg_64x64_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
203
+
204
+void x265_addAvg_4x2_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
205
+void x265_addAvg_4x32_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
206
+void x265_addAvg_6x8_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
207
+void x265_addAvg_6x16_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
208
+void x265_addAvg_8x2_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
209
+void x265_addAvg_8x6_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
210
+void x265_addAvg_8x12_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
211
+void x265_addAvg_8x64_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
212
+void x265_addAvg_12x32_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
213
+void x265_addAvg_16x24_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
214
+void x265_addAvg_24x64_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
215
+void x265_addAvg_32x48_neon(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
216
+#endif // ifndef X265_I386_PIXEL_ARM_H
217
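For orientation, the pixel_avg_pp family declared above computes a rounded average of two prediction blocks. A scalar sketch, taking the trailing int parameter to be an unused weight placeholder (an assumption based on the otherwise unweighted signature):

    #include <stdint.h>

    static void pixel_avg_pp_sketch(int w, int h,
                                    uint8_t* dst, intptr_t dstride,
                                    const uint8_t* src0, intptr_t sstride0,
                                    const uint8_t* src1, intptr_t sstride1)
    {
        for (int y = 0; y < h; y++)
            for (int x = 0; x < w; x++)
                dst[y * dstride + x] = (uint8_t)
                    ((src0[y * sstride0 + x] + src1[y * sstride1 + x] + 1) >> 1);
    }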
x265_2.0.tar.gz/source/common/arm/sad-a.S Added
1358
 
1
@@ -0,0 +1,1356 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: David Conrad <lessen42@gmail.com>
6
+ *          Janne Grunau <janne-x264@jannau.net>
7
+ *          Dnyaneshwar G <dnyaneshwar@multicorewareinc.com>
8
+ * 
9
+ * This program is free software; you can redistribute it and/or modify
10
+ * it under the terms of the GNU General Public License as published by
11
+ * the Free Software Foundation; either version 2 of the License, or
12
+ * (at your option) any later version.
13
+ *
14
+ * This program is distributed in the hope that it will be useful,
15
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
+ * GNU General Public License for more details.
18
+ *
19
+ * You should have received a copy of the GNU General Public License
20
+ * along with this program; if not, write to the Free Software
21
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22
+ *
23
+ * This program is also available under a commercial proprietary license.
24
+ * For more information, contact us at license @ x265.com.
25
+ *****************************************************************************/
26
+
27
+#include "asm.S"
28
+
29
+.section .rodata
30
+
31
+.align 4
32
+sad12_mask:
33
+.byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0
34
+
35
+.text
36
+
37
+/* sad4x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
38
+ *
39
+ * r0   - dst
40
+ * r1   - dstStride
41
+ * r2   - src
42
+ * r3   - srcStride */
43
+
44
+.macro SAD4_ARMV6 h
45
+function x265_pixel_sad_4x\h\()_armv6
46
+    push        {r4-r6,lr}
47
+    ldr         r4, [r2], r3
48
+    ldr         r5, [r0], r1
49
+    ldr         r6, [r2], r3
50
+    ldr         lr, [r0], r1
51
+    usad8       ip, r4, r5
52
+.rept (\h - 2)/2
53
+    ldr         r4, [r2], r3
54
+    ldr         r5, [r0], r1
55
+    usada8      ip, r6, lr, ip
56
+    ldr         r6, [r2], r3
57
+    ldr         lr, [r0], r1
58
+    usada8      ip, r4, r5, ip
59
+.endr
60
+    usada8      r0, r6, lr, ip
61
+    pop         {r4-r6,pc}
62
+endfunc
63
+.endm
64
+
65
+SAD4_ARMV6 4
66
+SAD4_ARMV6 8
67
+SAD4_ARMV6 16
68
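The ARMv6 variants lean on usad8/usada8, which operate on four packed bytes at once, so each ldr above fetches a whole 4-pixel row. What one usad8 step computes, in C (usada8 adds the same result to an accumulator):

    #include <stdint.h>

    static unsigned usad8_sketch(uint32_t a, uint32_t b)
    {
        unsigned s = 0;
        for (int i = 0; i < 32; i += 8)     /* four packed byte lanes */
        {
            int pa = (a >> i) & 0xFF;
            int pb = (b >> i) & 0xFF;
            s += (unsigned)(pa > pb ? pa - pb : pb - pa);
        }
        return s;
    }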
+
69
+.macro SAD8_NEON h
70
+function x265_pixel_sad_8x\h\()_neon
71
+    vld1.8          d0, [r0], r1        // row 0, first image
72
+    vld1.8          d1, [r2], r3        // row 1
73
+    vabdl.u8        q1, d0, d1
74
+
75
+.rept \h-1
76
+    vld1.8          d0, [r0], r1        // pix1 rows 1..h-1
77
+    vld1.8          d1, [r2], r3        // pix2 rows 1..h-1
78
+    vabal.u8        q1, d0, d1
79
+.endr
80
+
81
+    vadd.u16        d2, d2, d3
82
+    vpadd.u16       d0, d2, d2
83
+    vpaddl.u16      d0, d0
84
+    vmov.u32        r0, d0[0]
85
+    bx              lr
86
+endfunc
87
+.endm
88
+
89
+SAD8_NEON 4
90
+SAD8_NEON 8
91
+SAD8_NEON 16
92
+SAD8_NEON 32
93
+
94
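In the NEON kernels, vabdl.u8 widens the absolute byte differences into eight 16-bit lanes and vabal.u8 accumulates further rows on top; the epilogue then folds the lanes into one 32-bit total. What that reduction computes, as a scalar sketch:

    #include <cstdint>

    // Fold of the eight u16 per-column sums held in q1 at the end of SAD8_NEON
    // (the vadd.u16 + vpadd.u16 + vpaddl.u16 sequence expressed as scalar code).
    static uint32_t fold_u16x8(const uint16_t lane[8])
    {
        uint32_t sum = 0;
        for (int i = 0; i < 8; i++)
            sum += lane[i];
        return sum;
    }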
+.macro SAD16_NEON h
95
+function x265_pixel_sad_16x\h\()_neon
96
+    vld1.8          {q0}, [r0], r1      // row 0
97
+    vld1.8          {q1}, [r2], r3
98
+    vld1.8          {q2}, [r0], r1      // row 1
99
+    vld1.8          {q3}, [r2], r3
100
+
101
+    vabdl.u8        q8, d0, d2
102
+    vabdl.u8        q9, d1, d3
103
+    vabal.u8        q8, d4, d6
104
+    vabal.u8        q9, d5, d7
105
+    mov             r12, #(\h-2)/2
106
+
107
+.loop_16x\h:
108
+
109
+    subs            r12, #1
110
+    vld1.8          {q0}, [r0], r1
111
+    vld1.8          {q1}, [r2], r3
112
+    vld1.8          {q2}, [r0], r1
113
+    vld1.8          {q3}, [r2], r3
114
+
115
+    vabal.u8        q8, d0, d2
116
+    vabal.u8        q9, d1, d3
117
+    vabal.u8        q8, d4, d6
118
+    vabal.u8        q9, d5, d7
119
+    bne             .loop_16x\h
120
+
121
+    vadd.u16        q8, q8, q9
122
+.if \h == 64
123
+    vaddl.u16       q0, d16, d17
124
+    vpadd.u32       d0, d0, d1
125
+    vpadd.u32       d0, d0
126
+.else
127
+    vadd.u16        d16, d16, d17
128
+    vpadd.u16       d0, d16, d16
129
+    vpaddl.u16      d0, d0
130
+.endif
131
+    vmov.u32        r0, d0[0]
132
+    bx              lr
133
+endfunc
134
+.endm
135
+
136
+SAD16_NEON 4
137
+SAD16_NEON 8
138
+SAD16_NEON 16
139
+SAD16_NEON 12
140
+SAD16_NEON 32
141
+SAD16_NEON 64
142
+
143
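The h == 64 special case in the macro below exists to dodge 16-bit overflow in the reduction. Each u16 lane accumulates one column over h rows, so it holds at most 255*h; the two vadd.u16 folds plus the vpadd.u16 then merge up to 8 lanes into one u16 value, which is only safe while 8*255*h <= 65535, i.e. h <= 32. For 16x64, 8*255*64 = 130560 would overflow, so that path widens to 32 bits with vaddl.u16 before the pairwise folds.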
+.macro SAD32_NEON h
144
+function x265_pixel_sad_32x\h\()_neon
145
+    veor.u8         q8, q8
146
+    veor.u8         q9, q9
147
+    veor.u8         q10, q10
148
+    veor.u8         q11, q11
149
+    mov             r12, #\h/8
150
+
151
+.loop_32x\h:
152
+
153
+    subs            r12, #1
154
+.rept 4
155
+    vld1.8          {q0, q1}, [r0], r1           // row 0
156
+    vld1.8          {q2, q3}, [r2], r3           // row 0
157
+    vld1.8          {q12, q13}, [r0], r1         // row 1
158
+    vld1.8          {q14, q15}, [r2], r3         // row 1
159
+
160
+    vabal.u8        q8, d0, d4
161
+    vabal.u8        q9, d1, d5
162
+    vabal.u8        q10, d2, d6
163
+    vabal.u8        q11, d3, d7
164
+
165
+    vabal.u8        q8, d24, d28
166
+    vabal.u8        q9, d25, d29
167
+    vabal.u8        q10, d26, d30
168
+    vabal.u8        q11, d27, d31
169
+.endr
170
+    bne             .loop_32x\h
171
+
172
+    vadd.u16        q8, q8, q9
173
+    vadd.u16        q10, q10, q11
174
+.if \h == 64
175
+    vaddl.u16       q0, d16, d17
176
+    vpadd.u32       d0, d0, d1
177
+    vpaddl.u32      d0, d0
178
+
179
+    vaddl.u16       q1, d20, d21
180
+    vpadd.u32       d2, d2, d3
181
+    vpaddl.u32      d2, d2
182
+
183
+    vadd.u32        d0,d0,d2
184
+.else
185
+    vadd.u16        d16, d16, d17
186
+    vpadd.u16       d0, d16, d16
187
+    vpaddl.u16      d0, d0
188
+
189
+    vadd.u16        d20, d20, d21
190
+    vpadd.u16       d1, d20, d20
191
+    vpaddl.u16      d1, d1
192
+
193
+    vadd.u32        d0,d0,d1
194
+.endif
195
+    vmov.u32        r0,  d0[0]
196
+    bx              lr
197
+endfunc
198
+.endm
199
+
200
+SAD32_NEON 8
201
+SAD32_NEON 16
202
+SAD32_NEON 24
203
+SAD32_NEON 32
204
+SAD32_NEON 64
205
+
206
+.macro SAD64_NEON h
207
+function x265_pixel_sad_64x\h\()_neon
208
+    veor.u8         q8, q8
209
+    veor.u8         q9, q9
210
+    veor.u8         q10, q10
211
+    veor.u8         q11, q11
212
+    mov             r12, #32
213
+    sub             r1, r12
214
+    sub             r3, r12
215
+    mov             r12, #\h/8
216
+
217
+.loop_64x\h:
218
+
219
+    subs            r12, #1
220
+.rept 4
221
+    // columns 0-31
222
+    vld1.8          {q0, q1}, [r0]!
223
+    vld1.8          {q2, q3}, [r2]!
224
+    vabal.u8        q8, d0, d4
225
+    vabal.u8        q9, d1, d5
226
+    vabal.u8        q10, d2, d6
227
+    vabal.u8        q11, d3, d7
228
+    // columns 32-63
229
+    vld1.8          {q0, q1}, [r0],r1
230
+    vld1.8          {q2, q3}, [r2],r3
231
+    vabal.u8        q8, d0, d4
232
+    vabal.u8        q9, d1, d5
233
+    vabal.u8        q10, d2, d6
234
+    vabal.u8        q11, d3, d7
235
+    // columns 0-31
236
+    vld1.8          {q12, q13}, [r0]!
237
+    vld1.8          {q14, q15}, [r2]!
238
+    vabal.u8        q8, d24, d28
239
+    vabal.u8        q9, d25, d29
240
+    vabal.u8        q10, d26, d30
241
+    vabal.u8        q11, d27, d31
242
+    // columns 32-63
243
+    vld1.8          {q12, q13}, [r0],r1
244
+    vld1.8          {q14, q15}, [r2],r3
245
+    vabal.u8        q8, d24, d28
246
+    vabal.u8        q9, d25, d29
247
+    vabal.u8        q10, d26, d30
248
+    vabal.u8        q11, d27, d31
249
+.endr
250
+    bne             .loop_64x\h
251
+
252
+    vadd.u16        q8, q8, q9
253
+    vadd.u16        q10, q10, q11
254
+
255
+    vaddl.u16       q0, d16, d17
256
+    vpadd.u32       d0, d0, d1
257
+    vpaddl.u32      d0, d0
258
+
259
+    vaddl.u16       q1, d20, d21
260
+    vpadd.u32       d2, d2, d3
261
+    vpaddl.u32      d2, d2
262
+
263
+    vadd.u32        d0,d0,d2
264
+
265
+    vmov.u32        r0,  d0[0]
266
+    bx              lr
267
+endfunc
268
+.endm
269
+
270
+SAD64_NEON 16
271
+SAD64_NEON 32
272
+SAD64_NEON 48
273
+SAD64_NEON 64
274
+
275
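Rows wider than 32 bytes are walked in two loads: the post-increment load consumes the first half of the row, so the stride register is reduced up front ("sub r1, r12") and the final load's writeback lands exactly on the next row. The equivalent pointer arithmetic, as a sketch:

    #include <cstdint>

    // One 64-wide row as addressed by SAD64_NEON (a sketch).
    static const uint8_t* advance_row64(const uint8_t* p, intptr_t stride)
    {
        p += 32;            // vld1.8 {q0,q1}, [r0]!      columns 0-31
        p += stride - 32;   // vld1.8 {q0,q1}, [r0], r1   (r1 was pre-reduced by 32)
        return p;           // net advance: exactly one full stride
    }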
+function x265_pixel_sad_24x32_neon
276
+    veor.u8         q0, q0
277
+    veor.u8         q1, q1
278
+    veor.u8         q2, q2
279
+    veor.u8         q8, q8
280
+    veor.u8         q9, q9
281
+    veor.u8         q10, q10
282
+    mov             r12, #16                // unused: r12 is reloaded below
283
+    sub             r1, #16
284
+    sub             r3, #16
285
+    mov             r12, #8
286
+
287
+.loop_24x32:
288
+
289
+    subs            r12, #1
290
+.rept 4
291
+    vld1.8          {q0}, [r0]!
292
+    vld1.8          {q1}, [r2]!
293
+    vabal.u8        q8, d0, d2
294
+    vabal.u8        q9, d1, d3
295
+
296
+    vld1.8          {d0}, [r0], r1
297
+    vld1.8          {d1}, [r2], r3
298
+    vabal.u8        q10, d0, d1
299
+.endr
300
+    bne             .loop_24x32
301
+
302
+    vadd.u16        q8, q8, q9
303
+    vadd.u16        d16, d16, d17
304
+    vpadd.u16       d0, d16, d16
305
+    vpaddl.u16      d0, d0
306
+    vadd.u16        d20, d20, d21
307
+    vpadd.u16       d1, d20, d20
308
+    vpaddl.u16      d1, d1
309
+    vadd.u32        d0,d0,d1
310
+    vmov.u32        r0, d0[0]
311
+    bx              lr
312
+endfunc
313
+
314
+function x265_pixel_sad_48x64_neon
315
+    veor.u8         q3, q3
316
+    veor.u8         q11, q11
317
+    veor.u8         q12, q12
318
+    veor.u8         q13, q13
319
+    veor.u8         q14, q14
320
+    veor.u8         q15, q15
321
+    mov             r12, #32                // unused: r12 is reloaded below
322
+    sub             r1, #32
323
+    sub             r3, #32
324
+    mov             r12, #16
325
+
326
+.loop_48x64:
327
+
328
+    subs            r12, #1
329
+.rept 4
330
+    vld1.8          {q0,q1}, [r0]!
331
+    vld1.8          {q2}, [r0], r1
332
+    vld1.8          {q8,q9}, [r2]!
333
+    vld1.8          {q10}, [r2], r3
334
+    vabal.u8        q3, d0, d16
335
+    vabal.u8        q11, d1, d17
336
+    vabal.u8        q12, d2, d18
337
+    vabal.u8        q13, d3, d19
338
+    vabal.u8        q14, d4, d20
339
+    vabal.u8        q15, d5, d21
340
+.endr
341
+    bne             .loop_48x64
342
+
343
+    vadd.u16        q3, q3, q11
344
+    vadd.u16        d6, d6, d7
345
+    vpaddl.u16      d0, d6
346
+    vpadd.u32       d0, d0
347
+
348
+    vadd.u16        q12, q12, q13
349
+    vadd.u16        d24, d24, d25
350
+    vpaddl.u16      d1, d24
351
+    vpadd.u32       d1, d1
352
+
353
+    vadd.u16        q14,q14,q15
354
+    vadd.u16        d28, d28, d29
355
+    vpaddl.u16      d2, d28
356
+    vpadd.u32       d2, d2
357
+
358
+    vadd.u32        d0, d0, d1
359
+    vadd.u32        d0, d0, d2
360
+    vmov.u32        r0, d0[0]
361
+    bx              lr
362
+endfunc
363
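The odd widths follow the same split-row pattern: 24-wide rows are one 16-byte q-register load plus an 8-byte d-register load, and 48-wide rows are a 32-byte register pair plus one 16-byte load, each with the row stride pre-reduced by the bytes the earlier loads already consumed.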
+
364
+// SAD_X3 and SAD_X4 code start
365
+
366
+.macro SAD_X_START_4 x
367
+    vld1.32         {d0[]}, [r0], r12
368
+    vld1.32         {d1[]}, [r1], r4
369
+    vld1.32         {d2[]}, [r2], r4
370
+    vld1.32         {d3[]}, [r3], r4
371
+.if \x == 4
372
+    vld1.32         {d4[]}, [lr], r4
373
+.endif
374
+    vabdl.u8        q8, d0, d1
375
+    vabdl.u8        q9, d0, d2
376
+    vabdl.u8        q10, d0, d3
377
+.if \x == 4
378
+    vabdl.u8        q11, d0, d4
379
+.endif
380
+.endm
381
+
382
+.macro SAD_X_4 x
383
+    vld1.32         {d0[]}, [r0], r12
384
+    vld1.32         {d1[]}, [r1], r4
385
+    vld1.32         {d2[]}, [r2], r4
386
+    vld1.32         {d3[]}, [r3], r4
387
+.if \x == 4
388
+    vld1.32         {d4[]}, [lr], r4
389
+.endif
390
+    vabal.u8        q8, d0, d1
391
+    vabal.u8        q9, d0, d2
392
+    vabal.u8        q10, d0, d3
393
+.if \x == 4
394
+    vabal.u8        q11, d0, d4
395
+.endif
396
+.endm
397
+
398
+.macro SAD_X_4xN x, h
399
+function x265_sad_x\x\()_4x\h\()_neon
400
+    push            {r4, r5, lr}
401
+.if \x == 3
402
+    ldrd            r4, r5, [sp, #12]
403
+.else
404
+    ldr             lr, [sp, #12]
405
+    ldrd            r4, r5, [sp, #16]
406
+.endif
407
+    mov             r12, #FENC_STRIDE
408
+
409
+    SAD_X_START_4 \x
410
+.rept \h - 1
411
+    SAD_X_4 \x 
412
+.endr
413
+    vpadd.u16       d0, d16, d18
414
+    vpadd.u16       d1, d20, d22
415
+    vpaddl.u16      q0, q0
416
+.if \x == 3
417
+    vst1.32         {d0}, [r5]!
418
+    vst1.32         {d1[0]}, [r5, :32]
419
+.else
420
+    vst1.32         {d0-d1}, [r5]
421
+.endif
422
+    pop             {r4, r5, lr}
423
+    bx              lr
424
+endfunc
425
+.endm
426
+
427
+SAD_X_4xN 3 4
428
+SAD_X_4xN 3 8
429
+SAD_X_4xN 3 16
430
+
431
+SAD_X_4xN 4 4
432
+SAD_X_4xN 4 8
433
+SAD_X_4xN 4 16
434
+
435
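The sad_x3/sad_x4 primitives score one encode block against three or four reference candidates in a single call, sharing the fenc loads; the results land in an int32 array through r5. A scalar sketch of the x4 form (signature modeled on x265's C primitives; FENC_STRIDE is 64 in x265):

    #include <cstdint>
    #include <cstdlib>

    static void sad_x4_ref(const uint8_t* fenc, const uint8_t* ref0,
                           const uint8_t* ref1, const uint8_t* ref2,
                           const uint8_t* ref3, intptr_t refStride,
                           int32_t* res, int w, int h)
    {
        const uint8_t* refs[4] = { ref0, ref1, ref2, ref3 };
        for (int i = 0; i < 4; i++)
        {
            const uint8_t* f = fenc;        // fenc rows advance by FENC_STRIDE
            const uint8_t* r = refs[i];
            int32_t sum = 0;
            for (int y = 0; y < h; y++, f += 64 /* FENC_STRIDE */, r += refStride)
                for (int x = 0; x < w; x++)
                    sum += std::abs(f[x] - r[x]);
            res[i] = sum;
        }
    }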
+.macro SAD_X_START_8 x
436
+    vld1.8          {d0}, [r0], r12
437
+    vld1.8          {d1}, [r1], r4
438
+    vld1.8          {d2}, [r2], r4
439
+    vld1.8          {d3}, [r3], r4
440
+.if \x == 4
441
+    vld1.8          {d4}, [lr], r4
442
+.endif
443
+    vabdl.u8        q8, d0, d1
444
+    vabdl.u8        q9, d0, d2
445
+    vabdl.u8        q10, d0, d3
446
+.if \x == 4
447
+    vabdl.u8        q11, d0, d4
448
+.endif
449
+.endm
450
+
451
+.macro SAD_X_8 x
452
+    vld1.8          {d0}, [r0], r12
453
+    vld1.8          {d1}, [r1], r4
454
+    vld1.8          {d2}, [r2], r4
455
+    vld1.8          {d3}, [r3], r4
456
+.if \x == 4
457
+    vld1.8          {d4}, [lr], r4
458
+.endif
459
+    vabal.u8        q8, d0, d1
460
+    vabal.u8        q9, d0, d2
461
+    vabal.u8        q10, d0, d3
462
+.if \x == 4
463
+    vabal.u8        q11, d0, d4
464
+.endif
465
+.endm
466
+
467
+.macro SAD_X_8xN x, h
468
+function x265_sad_x\x\()_8x\h\()_neon
469
+    push            {r4, r5, lr}
470
+.if \x == 3
471
+    ldrd            r4, r5, [sp, #12]
472
+.else
473
+    ldr             lr, [sp, #12]
474
+    ldrd            r4, r5, [sp, #16]
475
+.endif
476
+    mov             r12, #FENC_STRIDE
477
+    SAD_X_START_8 \x
478
+.rept \h - 1
479
+    SAD_X_8 \x
480
+.endr
481
+    vadd.u16        d16, d16, d17
482
+    vadd.u16        d18, d18, d19
483
+    vadd.u16        d20, d20, d21
484
+    vadd.u16        d22, d22, d23
485
+
486
+    vpadd.u16       d0, d16, d18
487
+    vpadd.u16       d1, d20, d22
488
+    vpaddl.u16      q0, q0
489
+.if \x == 3
490
+    vst1.32         {d0}, [r5]!
491
+    vst1.32         {d1[0]}, [r5, :32]
492
+.else
493
+    vst1.32         {d0-d1}, [r5]
494
+.endif
495
+    pop             {r4, r5, lr}
496
+    bx              lr
497
+endfunc
498
+.endm
499
+
500
+SAD_X_8xN 3 4
501
+SAD_X_8xN 3 8
502
+SAD_X_8xN 3 16
503
+SAD_X_8xN 3 32
504
+
505
+SAD_X_8xN 4 4
506
+SAD_X_8xN 4 8
507
+SAD_X_8xN 4 16
508
+SAD_X_8xN 4 32
509
+
510
+.macro SAD_X_START_16 x
511
+    vld1.8          {q0}, [r0], r12
512
+    vld1.8          {q1}, [r1], r4
513
+    vld1.8          {q2}, [r2], r4
514
+    vld1.8          {q3}, [r3], r4
515
+    vabdl.u8        q8, d0, d2
516
+    vabdl.u8        q9, d1, d3
517
+    vabdl.u8        q10, d0, d4
518
+    vabdl.u8        q11, d1, d5
519
+    vabdl.u8        q12, d0, d6
520
+    vabdl.u8        q13, d1, d7
521
+.if \x == 4
522
+    vld1.8          {q3}, [lr], r4
523
+    vabdl.u8        q14, d0, d6
524
+    vabdl.u8        q15, d1, d7 
525
+.endif
526
+.endm
527
+
528
+.macro SAD_X_16 x
529
+    vld1.8          {q0}, [r0], r12
530
+    vld1.8          {q1}, [r1], r4
531
+    vld1.8          {q2}, [r2], r4
532
+    vld1.8          {q3}, [r3], r4
533
+    vabal.u8        q8, d0, d2
534
+    vabal.u8        q9, d1, d3
535
+    vabal.u8        q10, d0, d4
536
+    vabal.u8        q11, d1, d5
537
+    vabal.u8        q12, d0, d6
538
+    vabal.u8        q13, d1, d7
539
+.if \x == 4
540
+    vld1.8          {q3}, [lr], r4
541
+    vabal.u8        q14, d0, d6
542
+    vabal.u8        q15, d1, d7
543
+.endif
544
+.endm
545
+
546
+.macro SAD_X_16xN x, h
547
+function x265_sad_x\x\()_16x\h\()_neon
548
+    push            {r4, r5, lr}
549
+.if \x == 3
550
+    ldrd            r4, r5, [sp, #12]
551
+.else
552
+    ldr             lr, [sp, #12]
553
+    ldrd            r4, r5, [sp, #16]
554
+.endif
555
+    mov             r12, #FENC_STRIDE
556
+    SAD_X_START_16 \x
557
+.rept \h - 1
558
+    SAD_X_16 \x
559
+.endr
560
+    vadd.u16        q8, q8, q9
561
+    vadd.u16        q10, q10, q11
562
+    vadd.u16        q12, q12, q13
563
+.if \x == 4
564
+    vadd.u16        q14, q14, q15
565
+.endif
566
+    vadd.u16        d16, d16, d17
567
+    vadd.u16        d20, d20, d21
568
+    vadd.u16        d24, d24, d25
569
+.if \x == 4
570
+    vadd.u16        d28, d28, d29
571
+.endif
572
+
573
+.if \h <= 32
574
+    vpadd.u16       d0, d16, d20
575
+    vpadd.u16       d1, d24, d28
576
+    vpaddl.u16      q0, q0
577
+  .if \x == 3
578
+    vst1.32         {d0}, [r5]!
579
+    vst1.32         {d1[0]}, [r5, :32]
580
+  .else
581
+    vst1.32         {d0-d1}, [r5]
582
+  .endif
583
+.else
584
+    vpaddl.u16      d16, d16
585
+    vpaddl.u16      d20, d20
586
+    vpaddl.u16      d24, d24
587
+  .if \x == 4
588
+    vpaddl.u16      d28, d28
589
+  .endif
590
+    vpaddl.u32      d16, d16
591
+    vpaddl.u32      d20, d20
592
+    vpaddl.u32      d24, d24
593
+  .if \x == 4
594
+    vpaddl.u32      d28, d28
595
+  .endif
596
+    vst1.32         {d16[0]}, [r5]!
597
+    vst1.32         {d20[0]}, [r5]!
598
+  .if \x == 3
599
+    vst1.32         {d24[0]}, [r5]
600
+  .endif
601
+  .if \x == 4
602
+    vst1.32         {d24[0]}, [r5]!
603
+    vst1.32         {d28[0]}, [r5]
604
+  .endif
605
+.endif
606
+    pop             {r4, r5, lr}
607
+    bx              lr
608
+endfunc
609
+.endm
610
+
611
+SAD_X_16xN 3 4
612
+SAD_X_16xN 3 12
613
+
614
+SAD_X_16xN 4 4
615
+SAD_X_16xN 4 12
616
+
617
+.macro SAD_X_16xN_LOOP x, h
618
+function x265_sad_x\x\()_16x\h\()_neon
619
+    push            {r4-r6, lr}
620
+.if \x == 3
621
+    ldrd            r4, r5, [sp, #16]
622
+.else
623
+    ldr             lr, [sp, #16]
624
+    ldrd            r4, r5, [sp, #20]
625
+.endif
626
+    mov             r12, #FENC_STRIDE
627
+    mov             r6, #\h/8
628
+    veor.u8         q8, q8
629
+    veor.u8         q9, q9
630
+    veor.u8         q10, q10
631
+    veor.u8         q11, q11
632
+    veor.u8         q12, q12
633
+    veor.u8         q13, q13
634
+.if \x == 4
635
+    veor.u8         q14, q14
636
+    veor.u8         q15, q15
637
+.endif
638
+
639
+.loop_sad_x\x\()_16x\h:
640
+.rept 8
641
+    SAD_X_16 \x
642
+.endr
643
+    subs            r6, #1
644
+    bne             .loop_sad_x\x\()_16x\h
645
+
646
+    vadd.u16        q8, q8, q9
647
+    vadd.u16        q10, q10, q11
648
+    vadd.u16        q12, q12, q13
649
+.if \x == 4
650
+    vadd.u16        q14, q14, q15
651
+.endif
652
+    vadd.u16        d16, d16, d17
653
+    vadd.u16        d20, d20, d21
654
+    vadd.u16        d24, d24, d25
655
+.if \x == 4
656
+    vadd.u16        d28, d28, d29
657
+.endif
658
+
659
+.if \h <= 32
660
+    vpadd.u16       d0, d16, d20
661
+    vpadd.u16       d1, d24, d28
662
+    vpaddl.u16      q0, q0
663
+  .if \x == 3
664
+    vst1.32         {d0}, [r5]!
665
+    vst1.32         {d1[0]}, [r5, :32]
666
+  .else
667
+    vst1.32         {d0-d1}, [r5]
668
+  .endif
669
+.else
670
+    vpaddl.u16      d16, d16
671
+    vpaddl.u16      d20, d20
672
+    vpaddl.u16      d24, d24
673
+  .if \x == 4
674
+    vpaddl.u16      d28, d28
675
+  .endif
676
+    vpaddl.u32      d16, d16
677
+    vpaddl.u32      d20, d20
678
+    vpaddl.u32      d24, d24
679
+  .if \x == 4
680
+    vpaddl.u32      d28, d28
681
+  .endif
682
+    vst1.32         {d16[0]}, [r5]!
683
+    vst1.32         {d20[0]}, [r5]!
684
+  .if \x == 3
685
+    vst1.32         {d24[0]}, [r5]
686
+  .endif
687
+  .if \x == 4
688
+    vst1.32         {d24[0]}, [r5]!
689
+    vst1.32         {d28[0]}, [r5]
690
+  .endif
691
+.endif
692
+    pop             {r4-r6, lr}
693
+    bx              lr
694
+endfunc
695
+.endm
696
+
697
+SAD_X_16xN_LOOP 3 8
698
+SAD_X_16xN_LOOP 3 16
699
+SAD_X_16xN_LOOP 3 32
700
+SAD_X_16xN_LOOP 3 64
701
+
702
+SAD_X_16xN_LOOP 4 8
703
+SAD_X_16xN_LOOP 4 16
704
+SAD_X_16xN_LOOP 4 32
705
+SAD_X_16xN_LOOP 4 64
706
+
707
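Note the two store shapes in the epilogues: the x3 variants write exactly three int32 results (vst1.32 {d0} followed by vst1.32 {d1[0]}), while the x4 variants write four in one go (vst1.32 {d0-d1}); r5 always points at the caller's int32 result array.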
+.macro SAD_X_32 x
708
+    vld1.8          {q0}, [r0]!
709
+    vld1.8          {q1}, [r1]!
710
+    vld1.8          {q2}, [r2]!
711
+    vld1.8          {q3}, [r3]!
712
+    vabal.u8        q8, d0, d2
713
+    vabal.u8        q9, d1, d3
714
+    vabal.u8        q10, d0, d4
715
+    vabal.u8        q11, d1, d5
716
+    vabal.u8        q12, d0, d6
717
+    vabal.u8        q13, d1, d7
718
+.if \x == 4
719
+    vld1.8          {q3}, [lr]!
720
+    vabal.u8        q14, d0, d6
721
+    vabal.u8        q15, d1, d7
722
+.endif
723
+    vld1.8          {q0}, [r0], r12
724
+    vld1.8          {q1}, [r1], r4
725
+    vld1.8          {q2}, [r2], r4
726
+    vld1.8          {q3}, [r3], r4
727
+    vabal.u8        q8, d0, d2
728
+    vabal.u8        q9, d1, d3
729
+    vabal.u8        q10, d0, d4
730
+    vabal.u8        q11, d1, d5
731
+    vabal.u8        q12, d0, d6
732
+    vabal.u8        q13, d1, d7
733
+.if \x == 4
734
+    vld1.8          {q3}, [lr], r4
735
+    vabal.u8        q14, d0, d6
736
+    vabal.u8        q15, d1, d7
737
+.endif
738
+.endm
739
+
740
+.macro SAD_X_32xN x, h
741
+function x265_sad_x\x\()_32x\h\()_neon
742
+    push            {r4-r6, lr}
743
+.if \x == 3
744
+    ldrd            r4, r5, [sp, #16]
745
+.else
746
+    ldr             lr, [sp, #16]
747
+    ldrd            r4, r5, [sp, #20]
748
+.endif
749
+    mov             r12, #FENC_STRIDE
750
+    sub             r12, #16
751
+    sub             r4, #16
752
+    mov             r6, #\h/8
753
+    veor.u8         q8, q8
754
+    veor.u8         q9, q9
755
+    veor.u8         q10, q10
756
+    veor.u8         q11, q11
757
+    veor.u8         q12, q12
758
+    veor.u8         q13, q13
759
+.if \x == 4
760
+    veor.u8         q14, q14
761
+    veor.u8         q15, q15
762
+.endif
763
+
764
+loop_sad_x\x\()_32x\h:
765
+.rept 8
766
+    SAD_X_32 \x
767
+.endr
768
+    subs            r6, #1
769
+    bgt             loop_sad_x\x\()_32x\h
770
+
771
+.if \h <= 32
772
+    vadd.u16        q8, q8, q9
773
+    vadd.u16        q10, q10, q11
774
+    vadd.u16        q12, q12, q13
775
+  .if \x == 4
776
+    vadd.u16        q14, q14, q15
777
+  .endif
778
+    vadd.u16        d16, d16, d17
779
+    vadd.u16        d20, d20, d21
780
+    vadd.u16        d24, d24, d25
781
+  .if \x == 4
782
+    vadd.u16        d28, d28, d29
783
+  .endif
784
+.else
785
+    vpaddl.u16      q8, q8
786
+    vpaddl.u16      q9, q9
787
+    vpaddl.u16      q10, q10
788
+    vpaddl.u16      q11, q11
789
+    vpaddl.u16      q12, q12
790
+    vpaddl.u16      q13, q13
791
+  .if \x == 4
792
+    vpaddl.u16      q14, q14
793
+    vpaddl.u16      q15, q15
794
+  .endif
795
+    vadd.u32        q8, q8, q9
796
+    vadd.u32        q10, q10, q11
797
+    vadd.u32        q12, q12, q13
798
+  .if \x == 4
799
+    vadd.u32        q14, q14, q15
800
+  .endif
801
+    vadd.u32        d16, d16, d17
802
+    vadd.u32        d20, d20, d21
803
+    vadd.u32        d24, d24, d25
804
+  .if \x == 4
805
+    vadd.u32        d28, d28, d29
806
+  .endif
807
+.endif
808
+
809
+.if \h <= 16
810
+    vpadd.u16       d0, d16, d20
811
+    vpadd.u16       d1, d24, d28
812
+    vpaddl.u16      q0, q0
813
+  .if \x == 3
814
+    vst1.32         {d0}, [r5]!
815
+    vst1.32         {d1[0]}, [r5, :32]
816
+  .else
817
+    vst1.32         {d0-d1}, [r5]
818
+  .endif
819
+.elseif \h <= 32
820
+    vpaddl.u16      d16, d16
821
+    vpaddl.u16      d20, d20
822
+    vpaddl.u16      d24, d24
823
+  .if \x == 4
824
+    vpaddl.u16      d28, d28
825
+  .endif
826
+    vpaddl.u32      d16, d16
827
+    vpaddl.u32      d20, d20
828
+    vpaddl.u32      d24, d24
829
+  .if \x == 4
830
+    vpaddl.u32      d28, d28
831
+  .endif
832
+    vst1.32         {d16[0]}, [r5]!
833
+    vst1.32         {d20[0]}, [r5]!
834
+  .if \x == 3
835
+    vst1.32         {d24[0]}, [r5]
836
+  .endif
837
+  .if \x == 4
838
+    vst1.32         {d24[0]}, [r5]!
839
+    vst1.32         {d28[0]}, [r5]
840
+  .endif
841
+.elseif \h <= 64
842
+    vpaddl.u32      d16, d16
843
+    vpaddl.u32      d20, d20
844
+    vpaddl.u32      d24, d24
845
+  .if \x == 4
846
+    vpaddl.u32      d28, d28
847
+  .endif
848
+    vst1.32         {d16[0]}, [r5]!
849
+    vst1.32         {d20[0]}, [r5]!
850
+  .if \x == 3
851
+    vst1.32         {d24[0]}, [r5]
852
+  .endif
853
+  .if \x == 4
854
+    vst1.32         {d24[0]}, [r5]!
855
+    vst1.32         {d28[0]}, [r5]
856
+  .endif
857
+.endif
858
+    pop             {r4-r6, lr}
859
+    bx              lr
860
+endfunc
861
+.endm
862
+
863
+SAD_X_32xN 3 8
864
+SAD_X_32xN 3 16
865
+SAD_X_32xN 3 24
866
+SAD_X_32xN 3 32
867
+SAD_X_32xN 3 64
868
+
869
+SAD_X_32xN 4 8
870
+SAD_X_32xN 4 16
871
+SAD_X_32xN 4 24
872
+SAD_X_32xN 4 32
873
+SAD_X_32xN 4 64
874
+
875
+.macro SAD_X_64 x
876
+.rept 3
877
+    vld1.8          {q0}, [r0]!
878
+    vld1.8          {q1}, [r1]!
879
+    vld1.8          {q2}, [r2]!
880
+    vld1.8          {q3}, [r3]!
881
+    vabal.u8        q8, d0, d2
882
+    vabal.u8        q9, d1, d3
883
+    vabal.u8        q10, d0, d4
884
+    vabal.u8        q11, d1, d5
885
+    vabal.u8        q12, d0, d6
886
+    vabal.u8        q13, d1, d7
887
+.if \x == 4
888
+    vld1.8          {q3}, [lr]!
889
+    vabal.u8        q14, d0, d6
890
+    vabal.u8        q15, d1, d7
891
+.endif
892
+.endr
893
+    vld1.8          {q0}, [r0], r12
894
+    vld1.8          {q1}, [r1], r4
895
+    vld1.8          {q2}, [r2], r4
896
+    vld1.8          {q3}, [r3], r4
897
+    vabal.u8        q8, d0, d2
898
+    vabal.u8        q9, d1, d3
899
+    vabal.u8        q10, d0, d4
900
+    vabal.u8        q11, d1, d5
901
+    vabal.u8        q12, d0, d6
902
+    vabal.u8        q13, d1, d7
903
+.if \x == 4
904
+    vld1.8          {q3}, [lr], r4
905
+    vabal.u8        q14, d0, d6
906
+    vabal.u8        q15, d1, d7
907
+.endif
908
+.endm
909
+
910
+.macro SAD_X_64xN x, h
911
+function x265_sad_x\x\()_64x\h\()_neon
912
+    push            {r4-r6, lr}
913
+.if \x == 3
914
+    ldrd            r4, r5, [sp, #16]
915
+.else
916
+    ldr             lr, [sp, #16]
917
+    ldrd            r4, r5, [sp, #20]
918
+.endif
919
+    mov             r12, #FENC_STRIDE
920
+    sub             r12, #48
921
+    sub             r4, #48
922
+    mov             r6, #\h/8
923
+    veor.u8         q8, q8
924
+    veor.u8         q9, q9
925
+    veor.u8         q10, q10
926
+    veor.u8         q11, q11
927
+    veor.u8         q12, q12
928
+    veor.u8         q13, q13
929
+.if \x == 4
930
+    veor.u8         q14, q14
931
+    veor.u8         q15, q15
932
+.endif
933
+.loop_sad_x\x\()_64x\h:
934
+.rept 8
935
+    SAD_X_64 \x
936
+.endr
937
+    subs            r6, #1
938
+    bne             .loop_sad_x\x\()_64x\h
939
+
940
+.if \h <= 16
941
+    vadd.u16        q8, q8, q9
942
+    vadd.u16        q10, q10, q11
943
+    vadd.u16        q12, q12, q13
944
+  .if \x == 4
945
+    vadd.u16        q14, q14, q15
946
+  .endif
947
+    vadd.u16        d16, d16, d17
948
+    vadd.u16        d20, d20, d21
949
+    vadd.u16        d24, d24, d25
950
+  .if \x == 4
951
+    vadd.u16        d28, d28, d29
952
+  .endif
953
+.else
954
+    vpaddl.u16      q8, q8
955
+    vpaddl.u16      q9, q9
956
+    vpaddl.u16      q10, q10
957
+    vpaddl.u16      q11, q11
958
+    vpaddl.u16      q12, q12
959
+    vpaddl.u16      q13, q13
960
+  .if \x == 4
961
+    vpaddl.u16      q14, q14
962
+    vpaddl.u16      q15, q15
963
+  .endif
964
+    vadd.u32        q8, q8, q9
965
+    vadd.u32        q10, q10, q11
966
+    vadd.u32        q12, q12, q13
967
+  .if \x == 4
968
+    vadd.u32        q14, q14, q15
969
+  .endif
970
+    vadd.u32        d16, d16, d17
971
+    vadd.u32        d20, d20, d21
972
+    vadd.u32        d24, d24, d25
973
+  .if \x == 4
974
+    vadd.u32        d28, d28, d29
975
+  .endif
976
+.endif
977
+
978
+.if \h <= 16
979
+    vpaddl.u16      d16, d16
980
+    vpaddl.u16      d20, d20
981
+    vpaddl.u16      d24, d24
982
+  .if \x == 4
983
+    vpaddl.u16      d28, d28
984
+  .endif 
985
+.endif
986
+    vpaddl.u32      d16, d16
987
+    vpaddl.u32      d20, d20
988
+    vpaddl.u32      d24, d24
989
+.if \x == 4
990
+    vpaddl.u32      d28, d28
991
+.endif
992
+    vst1.32         {d16[0]}, [r5]!
993
+    vst1.32         {d20[0]}, [r5]!
994
+.if \x == 3
995
+    vst1.32         {d24[0]}, [r5]
996
+.endif
997
+.if \x == 4
998
+    vst1.32         {d24[0]}, [r5]!
999
+    vst1.32         {d28[0]}, [r5]
1000
+.endif
1001
+    pop             {r4-r6, lr}
1002
+    bx              lr
1003
+endfunc
1004
+.endm
1005
+
1006
+SAD_X_64xN 3 16
1007
+SAD_X_64xN 3 32
1008
+SAD_X_64xN 3 48
1009
+SAD_X_64xN 3 64
1010
+
1011
+SAD_X_64xN 4 16
1012
+SAD_X_64xN 4 32
1013
+SAD_X_64xN 4 48
1014
+SAD_X_64xN 4 64
1015
+
1016
+.macro SAD_X_48 x
1017
+.rept 2
1018
+    vld1.8          {q0}, [r0]!
1019
+    vld1.8          {q1}, [r1]!
1020
+    vld1.8          {q2}, [r2]!
1021
+    vld1.8          {q3}, [r3]!
1022
+    vabal.u8        q8, d0, d2
1023
+    vabal.u8        q9, d1, d3
1024
+    vabal.u8        q10, d0, d4
1025
+    vabal.u8        q11, d1, d5
1026
+    vabal.u8        q12, d0, d6
1027
+    vabal.u8        q13, d1, d7
1028
+.if \x == 4
1029
+    vld1.8          {q3}, [lr]!
1030
+    vabal.u8        q14, d0, d6
1031
+    vabal.u8        q15, d1, d7
1032
+.endif
1033
+.endr
1034
+    vld1.8          {q0}, [r0], r12
1035
+    vld1.8          {q1}, [r1], r4
1036
+    vld1.8          {q2}, [r2], r4
1037
+    vld1.8          {q3}, [r3], r4
1038
+    vabal.u8        q8, d0, d2
1039
+    vabal.u8        q9, d1, d3
1040
+    vabal.u8        q10, d0, d4
1041
+    vabal.u8        q11, d1, d5
1042
+    vabal.u8        q12, d0, d6
1043
+    vabal.u8        q13, d1, d7
1044
+.if \x == 4
1045
+    vld1.8          {q3}, [lr], r4
1046
+    vabal.u8        q14, d0, d6
1047
+    vabal.u8        q15, d1, d7
1048
+.endif
1049
+.endm
1050
+
1051
+.macro SAD_X_48x64 x
1052
+function x265_sad_x\x\()_48x64_neon
1053
+    push            {r4-r6, lr}
1054
+.if \x == 3
1055
+    ldrd            r4, r5, [sp, #16]
1056
+.else
1057
+    ldr             lr, [sp, #16]
1058
+    ldrd            r4, r5, [sp, #20]
1059
+.endif
1060
+    mov             r12, #FENC_STRIDE
1061
+    sub             r12, #32
1062
+    sub             r4, #32
1063
+    mov             r6, #8
1064
+    veor.u8         q8, q8
1065
+    veor.u8         q9, q9
1066
+    veor.u8         q10, q10
1067
+    veor.u8         q11, q11
1068
+    veor.u8         q12, q12
1069
+    veor.u8         q13, q13
1070
+.if \x == 4
1071
+    veor.u8         q14, q14
1072
+    veor.u8         q15, q15
1073
+.endif
1074
+
1075
+.loop_sad_x\x\()_48x64:
1076
+.rept 8
1077
+    SAD_X_48 \x
1078
+.endr
1079
+    subs            r6, #1
1080
+    bne             .loop_sad_x\x\()_48x64
1081
+
1082
+    vpaddl.u16      q8, q8
1083
+    vpaddl.u16      q9, q9
1084
+    vpaddl.u16      q10, q10
1085
+    vpaddl.u16      q11, q11
1086
+    vpaddl.u16      q12, q12
1087
+    vpaddl.u16      q13, q13
1088
+.if \x == 4
1089
+    vpaddl.u16      q14, q14
1090
+    vpaddl.u16      q15, q15
1091
+.endif
1092
+    vadd.u32        q8, q8, q9
1093
+    vadd.u32        q10, q10, q11
1094
+    vadd.u32        q12, q12, q13
1095
+.if \x == 4
1096
+    vadd.u32        q14, q14, q15
1097
+.endif
1098
+    vadd.u32        d16, d16, d17
1099
+    vadd.u32        d20, d20, d21
1100
+    vadd.u32        d24, d24, d25
1101
+.if \x == 4
1102
+    vadd.u32        d28, d28, d29
1103
+.endif
1104
+    vpaddl.u32      d16, d16
1105
+    vpaddl.u32      d20, d20
1106
+    vpaddl.u32      d24, d24
1107
+    // d28 is reduced in the x4-only block below
1108
+.if \x == 4
1109
+    vpaddl.u32      d28, d28
1110
+.endif
1111
+    vst1.32         {d16[0]}, [r5]!
1112
+    vst1.32         {d20[0]}, [r5]!
1113
+.if \x == 3
1114
+    vst1.32         {d24[0]}, [r5]
1115
+.endif
1116
+.if \x == 4
1117
+    vst1.32         {d24[0]}, [r5]!
1118
+    vst1.32         {d28[0]}, [r5]
1119
+.endif
1120
+    pop             {r4-r6, lr}
1121
+    bx              lr
1122
+endfunc
1123
+.endm
1124
+
1125
+SAD_X_48x64 3
1126
+SAD_X_48x64 4
1127
+
1128
+.macro SAD_X_24 x
1129
+    vld1.8          {q0}, [r0]!
1130
+    vld1.8          {q1}, [r1]!
1131
+    vld1.8          {q2}, [r2]!
1132
+    vld1.8          {q3}, [r3]!
1133
+    vabal.u8        q8, d0, d2
1134
+    vabal.u8        q9, d1, d3
1135
+    vabal.u8        q10, d0, d4
1136
+    vabal.u8        q11, d1, d5
1137
+    vabal.u8        q12, d0, d6
1138
+    vabal.u8        q13, d1, d7
1139
+.if \x == 4
1140
+    vld1.8          {q3}, [lr]!
1141
+    vabal.u8        q14, d0, d6
1142
+    vabal.u8        q15, d1, d7
1143
+.endif
1144
+    vld1.8          {d0}, [r0], r12
1145
+    vld1.8          {d1}, [r1], r4
1146
+    vld1.8          {d2}, [r2], r4
1147
+    vld1.8          {d3}, [r3], r4
1148
+.if \x == 4
1149
+    vld1.8          {d8}, [lr], r4
1150
+.endif
1151
+    vabal.u8        q8, d0, d1
1152
+    vabal.u8        q10, d0, d2
1153
+    vabal.u8        q12, d0, d3
1154
+.if \x == 4
1155
+    vabal.u8        q14, d0, d8
1156
+.endif
1157
+.endm
1158
+
1159
+.macro SAD_X_24x32 x
1160
+function x265_sad_x\x\()_24x32_neon
1161
+     push           {r4-r6, lr}
1162
+.if \x == 3
1163
+    ldrd            r4, r5, [sp, #16]
1164
+.else
1165
+    ldr             lr, [sp, #16]
1166
+    ldrd            r4, r5, [sp, #20]
1167
+.endif
1168
+    mov             r12, #FENC_STRIDE
1169
+    sub             r12, #16
1170
+    sub             r4, #16
1171
+    mov             r6, #4
1172
+    veor.u8         q8, q8
1173
+    veor.u8         q9, q9
1174
+    veor.u8         q10, q10
1175
+    veor.u8         q11, q11
1176
+    veor.u8         q12, q12
1177
+    veor.u8         q13, q13
1178
+.if \x == 4
1179
+    veor.u8         q14, q14
1180
+    veor.u8         q15, q15
1181
+.endif
1182
+
1183
+.loop_sad_x\x\()_24x32:
1184
+.rept 8
1185
+    SAD_X_24 \x
1186
+.endr
1187
+    subs            r6, #1
1188
+    bne             .loop_sad_x\x\()_24x32
1189
+
1190
+    vadd.u16        q8, q8, q9
1191
+    vadd.u16        q10, q10, q11
1192
+    vadd.u16        q12, q12, q13
1193
+.if \x == 4
1194
+    vadd.u16        q14, q14, q15
1195
+.endif
1196
+    vadd.u16        d16, d16, d17
1197
+    vadd.u16        d20, d20, d21
1198
+    vadd.u16        d24, d24, d25
1199
+.if \x == 4
1200
+    vadd.u16        d28, d28, d29
1201
+.endif
1202
+    vpaddl.u16      d16, d16
1203
+    vpaddl.u16      d20, d20
1204
+    vpaddl.u16      d24, d24
1205
+.if \x == 4
1206
+    vpaddl.u16      d28, d28
1207
+.endif
1208
+    vpaddl.u32      d16, d16
1209
+    vpaddl.u32      d20, d20
1210
+    vpaddl.u32      d24, d24
1211
+.if \x == 4
1212
+    vpaddl.u32      d28, d28
1213
+.endif
1214
+.if \x == 4
1215
+    // d28 already reduced once above
1216
+.endif
1217
+    vst1.32         {d16[0]}, [r5]!
1218
+    vst1.32         {d20[0]}, [r5]!
1219
+.if \x == 3
1220
+    vst1.32         {d24[0]}, [r5]
1221
+.endif
1222
+.if \x == 4
1223
+    vst1.32         {d24[0]}, [r5]!
1224
+    vst1.32         {d28[0]}, [r5]
1225
+.endif
1226
+    pop             {r4-r6, lr}
1227
+    bx              lr
1228
+endfunc
1229
+.endm
1230
+
1231
+SAD_X_24x32 3
1232
+SAD_X_24x32 4
1233
+
1234
+// SAD_X3 and SAD_X4 code end
1235
+
1236
+.macro SAD_X_START_12 x
1237
+    vld1.8          {q0}, [r0], r12
1238
+    vld1.8          {q1}, [r1], r4
1239
+    vld1.8          {q2}, [r2], r4
1240
+    vld1.8          {q3}, [r3], r4
1241
+    vand.u8         q0, q15
1242
+    vand.u8         q1, q15
1243
+    vand.u8         q2, q15
1244
+    vand.u8         q3, q15
1245
+    vabdl.u8        q5, d0, d2
1246
+    vabdl.u8        q8, d1, d3
1247
+    vabdl.u8        q9, d0, d4
1248
+    vabdl.u8        q10, d1, d5
1249
+    vabdl.u8        q11, d0, d6
1250
+    vabdl.u8        q12, d1, d7
1251
+.if \x == 4
1252
+    vld1.8          {q3}, [lr], r4
1253
+    vand.u8         q3, q15
1254
+    vabdl.u8        q13, d0, d6
1255
+    vabdl.u8        q14, d1, d7
1256
+.endif
1257
+.endm
1258
+
1259
+
1260
+.macro SAD_X_12 x
1261
+    vld1.8          {q0}, [r0], r12
1262
+    vld1.8          {q1}, [r1], r4
1263
+    vld1.8          {q2}, [r2], r4
1264
+    vld1.8          {q3}, [r3], r4
1265
+    vand.u8         q0, q15
1266
+    vand.u8         q1, q15
1267
+    vand.u8         q2, q15
1268
+    vand.u8         q3, q15
1269
+    vabal.u8        q5, d0, d2
1270
+    vabal.u8        q8, d1, d3
1271
+    vabal.u8        q9, d0, d4
1272
+    vabal.u8        q10, d1, d5
1273
+    vabal.u8        q11, d0, d6
1274
+    vabal.u8        q12, d1, d7
1275
+.if \x == 4
1276
+    vld1.8          {q3}, [lr], r4
1277
+    vand.u8         q3, q15
1278
+    vabal.u8        q13, d0, d6
1279
+    vabal.u8        q14, d1, d7
1280
+.endif
1281
+.endm
1282
+
1283
+.macro SAD_X_12x16 x
1284
+function x265_sad_x\x\()_12x16_neon
1285
+    push            {r4-r5, lr}
1286
+    vpush           {q5}
1287
+.if \x == 3
1288
+    ldrd            r4, r5, [sp, #28]
1289
+.else
1290
+    ldr             lr, [sp, #28]
1291
+    ldrd            r4, r5, [sp, #32]
1292
+.endif
1293
+    movrel          r12, sad12_mask
1294
+    vld1.8          {q15}, [r12]
1295
+    mov             r12, #FENC_STRIDE
1296
+
1297
+    SAD_X_START_12 \x    
1298
+.rept 15
1299
+    SAD_X_12 \x
1300
+.endr
1301
+    vadd.u16        q5, q5, q8
1302
+    vadd.u16        q9, q9, q10
1303
+    vadd.u16        q11, q11, q12
1304
+.if \x == 4
1305
+    vadd.u16        q13, q13, q14
1306
+.endif
1307
+    vadd.u16        d10, d10, d11
1308
+    vadd.u16        d18, d18, d19
1309
+    vadd.u16        d22, d22, d23
1310
+.if \x == 4
1311
+    vadd.u16        d26, d26, d27
1312
+.endif
1313
+    vpadd.u16       d0, d10, d18
1314
+    vpadd.u16       d1, d22, d26
1315
+    vpaddl.u16      q0, q0
1316
+.if \x == 3
1317
+    vst1.32         {d0}, [r5]!
1318
+    vst1.32         {d1[0]}, [r5, :32]
1319
+.else
1320
+    vst1.32         {d0-d1}, [r5]
1321
+.endif
1322
+    vpop            {q5}
1323
+    pop             {r4-r5, lr}
1324
+    bx              lr
1325
+endfunc
1326
+.endm
1327
+
1328
+SAD_X_12x16 3
1329
+SAD_X_12x16 4
1330
+
1331
+function x265_pixel_sad_12x16_neon
1332
+    veor.u8         q8, q8
1333
+    veor.u8         q9, q9
1334
+    movrel          r12, sad12_mask
1335
+    vld1.8          {q15}, [r12]
1336
+.rept 8
1337
+    vld1.8          {q0}, [r0], r1
1338
+    vld1.8          {q1}, [r2], r3
1339
+    vand.u8         q0, q15
1340
+    vand.u8         q1, q15
1341
+    vld1.8          {q2}, [r0], r1
1342
+    vld1.8          {q3}, [r2], r3
1343
+    vand.u8         q2, q15
1344
+    vand.u8         q3, q15
1345
+    vabal.u8        q8, d0, d2
1346
+    vabal.u8        q9, d1, d3
1347
+    vabal.u8        q8, d4, d6
1348
+    vabal.u8        q9, d5, d7
1349
+.endr
1350
+    vadd.u16        q8, q8, q9
1351
+    vadd.u16        d16, d16, d17
1352
+    vpadd.u16       d0, d16, d16
1353
+    vpaddl.u16      d0, d0
1354
+    vmov.u32        r0, d0[0]
1355
+    bx              lr
1356
+endfunc
1357
+
1358
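For the 12-wide blocks there is no 12-byte load, so these kernels read 16 bytes and AND both inputs with sad12_mask (twelve 0xFF bytes followed by four zeros, defined at the top of the file); the masked-off columns then contribute zero to the accumulated differences.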
x265_2.0.tar.gz/source/common/arm/ssd-a.S Added
471
 
1
@@ -0,0 +1,469 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2016 x265 project
4
+ *
5
+ * Authors: Dnyaneshwar G <dnyaneshwar@multicorewareinc.com>
6
+ * 
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#include "asm.S"
26
+
27
+.section .rodata
28
+
29
+.align 4
30
+
31
+
32
+.text
33
+
34
+
35
+function x265_pixel_sse_pp_4x4_neon
36
+    vld1.32     {d16[]}, [r0], r1
37
+    vld1.32     {d17[]}, [r2], r3
38
+    vsubl.u8    q2, d16, d17
39
+    vld1.32     {d16[]}, [r0], r1
40
+    vmull.s16   q0, d4, d4
41
+    vld1.32     {d17[]}, [r2], r3
42
+
43
+    vsubl.u8    q2, d16, d17
44
+    vld1.32     {d16[]}, [r0], r1
45
+    vmlal.s16   q0, d4, d4
46
+    vld1.32     {d17[]}, [r2], r3
47
+
48
+    vsubl.u8    q2, d16, d17
49
+    vld1.32     {d16[]}, [r0], r1
50
+    vmlal.s16   q0, d4, d4
51
+    vld1.32     {d17[]}, [r2], r3
52
+
53
+    vsubl.u8    q2, d16, d17
54
+    vmlal.s16   q0, d4, d4
55
+    vadd.s32    d0, d0, d1
56
+    vpadd.s32   d0, d0, d0
57
+    vmov.32     r0, d0[0]
58
+    bx          lr
59
+endfunc
60
+
61
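sse_pp is the 8-bit sum of squared differences: vsubl.u8 widens each per-pixel difference to 16 bits, and vmull/vmlal.s16 squares and accumulates it into 32-bit lanes. A scalar sketch:

    #include <cstdint>

    // Reference for the sse_pp kernels (a sketch).
    static int sse_pp_ref(const uint8_t* pix1, intptr_t stride1,
                          const uint8_t* pix2, intptr_t stride2, int size)
    {
        int sum = 0;
        for (int y = 0; y < size; y++, pix1 += stride1, pix2 += stride2)
            for (int x = 0; x < size; x++)
            {
                int d = pix1[x] - pix2[x];  // vsubl.u8: widen the difference
                sum += d * d;               // vmull/vmlal.s16: square, accumulate
            }
        return sum;
    }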
+function x265_pixel_sse_pp_8x8_neon
62
+    vld1.64     {d16}, [r0], r1
63
+    vld1.64     {d17}, [r2], r3
64
+    vsubl.u8    q2, d16, d17
65
+    vld1.64     {d16}, [r0], r1
66
+    vmull.s16   q0, d4, d4
67
+    vmlal.s16   q0, d5, d5
68
+    vld1.64     {d17}, [r2], r3
69
+
70
+.rept 6
71
+    vsubl.u8    q2, d16, d17
72
+    vld1.64     {d16}, [r0], r1
73
+    vmlal.s16   q0, d4, d4
74
+    vmlal.s16   q0, d5, d5
75
+    vld1.64     {d17}, [r2], r3
76
+.endr
77
+    vsubl.u8    q2, d16, d17
78
+    vmlal.s16   q0, d4, d4
79
+    vmlal.s16   q0, d5, d5
80
+    vadd.s32    d0, d0, d1
81
+    vpadd.s32   d0, d0, d0
82
+    vmov.32     r0, d0[0]
83
+    bx          lr
84
+endfunc
85
+
86
+function x265_pixel_sse_pp_16x16_neon
87
+    vld1.64     {d16-d17}, [r0], r1
88
+    vld1.64     {d18-d19}, [r2], r3
89
+    vsubl.u8    q2, d16, d18
90
+    vsubl.u8    q3, d17, d19
91
+    vld1.64     {d16-d17}, [r0], r1
92
+    vmull.s16   q0, d4, d4
93
+    vmlal.s16   q0, d5, d5
94
+    vld1.64     {d18-d19}, [r2], r3
95
+    vmlal.s16   q0, d6, d6
96
+    vmlal.s16   q0, d7, d7
97
+
98
+.rept 14
99
+    vsubl.u8    q2, d16, d18
100
+    vsubl.u8    q3, d17, d19
101
+    vld1.64     {d16-d17}, [r0], r1
102
+    vmlal.s16   q0, d4, d4
103
+    vmlal.s16   q0, d5, d5
104
+    vld1.64     {d18-d19}, [r2], r3
105
+    vmlal.s16   q0, d6, d6
106
+    vmlal.s16   q0, d7, d7
107
+.endr
108
+    vsubl.u8    q2, d16, d18
109
+    vsubl.u8    q3, d17, d19
110
+    vmlal.s16   q0, d4, d4
111
+    vmlal.s16   q0, d5, d5
112
+    vmlal.s16   q0, d6, d6
113
+    vmlal.s16   q0, d7, d7
114
+    vadd.s32    d0, d0, d1
115
+    vpadd.s32   d0, d0, d0
116
+    vmov.32     r0, d0[0]
117
+    bx          lr
118
+endfunc
119
+
120
+function x265_pixel_sse_pp_32x32_neon
121
+    mov         r12, #8
122
+    veor.u8     q0, q0
123
+    veor.u8     q1, q1
124
+
125
+.loop_sse_pp_32:
126
+    subs        r12, #1
127
+.rept 4
128
+    vld1.64     {q8-q9}, [r0], r1
129
+    vld1.64     {q10-q11}, [r2], r3
130
+    vsubl.u8    q2, d16, d20
131
+    vsubl.u8    q3, d17, d21
132
+    vsubl.u8    q12, d18, d22
133
+    vsubl.u8    q13, d19, d23
134
+    vmlal.s16   q0, d4, d4
135
+    vmlal.s16   q1, d5, d5
136
+    vmlal.s16   q0, d6, d6
137
+    vmlal.s16   q1, d7, d7
138
+    vmlal.s16   q0, d24, d24
139
+    vmlal.s16   q1, d25, d25
140
+    vmlal.s16   q0, d26, d26
141
+    vmlal.s16   q1, d27, d27
142
+.endr
143
+    bne         .loop_sse_pp_32
144
+    vadd.s32    q0, q1
145
+    vadd.s32    d0, d0, d1
146
+    vpadd.s32   d0, d0, d0
147
+    vmov.32     r0, d0[0]
148
+    bx          lr
149
+endfunc
150
+
151
+function x265_pixel_sse_pp_64x64_neon
152
+    sub         r1, #32
153
+    sub         r3, #32
154
+    mov         r12, #16
155
+    veor.u8     q0, q0
156
+    veor.u8     q1, q1
157
+
158
+.loop_sse_pp_64:
159
+    subs        r12, #1
160
+.rept 4
161
+    vld1.64     {q8-q9}, [r0]!
162
+    vld1.64     {q10-q11}, [r2]!
163
+    vsubl.u8    q2, d16, d20
164
+    vsubl.u8    q3, d17, d21
165
+    vsubl.u8    q12, d18, d22
166
+    vsubl.u8    q13, d19, d23
167
+    vmlal.s16   q0, d4, d4
168
+    vmlal.s16   q1, d5, d5
169
+    vmlal.s16   q0, d6, d6
170
+    vmlal.s16   q1, d7, d7
171
+    vmlal.s16   q0, d24, d24
172
+    vmlal.s16   q1, d25, d25
173
+    vmlal.s16   q0, d26, d26
174
+    vmlal.s16   q1, d27, d27
175
+
176
+    vld1.64     {q8-q9}, [r0], r1
177
+    vld1.64     {q10-q11}, [r2], r3
178
+    vsubl.u8    q2, d16, d20
179
+    vsubl.u8    q3, d17, d21
180
+    vsubl.u8    q12, d18, d22
181
+    vsubl.u8    q13, d19, d23
182
+    vmlal.s16   q0, d4, d4
183
+    vmlal.s16   q1, d5, d5
184
+    vmlal.s16   q0, d6, d6
185
+    vmlal.s16   q1, d7, d7
186
+    vmlal.s16   q0, d24, d24
187
+    vmlal.s16   q1, d25, d25
188
+    vmlal.s16   q0, d26, d26
189
+    vmlal.s16   q1, d27, d27
190
+.endr
191
+    bne         .loop_sse_pp_64
192
+    vadd.s32    q0, q1
193
+    vadd.s32    d0, d0, d1
194
+    vpadd.s32   d0, d0, d0
195
+    vmov.32     r0, d0[0]
196
+    bx          lr
197
+endfunc
198
+
199
+function x265_pixel_sse_ss_4x4_neon
200
+    add         r1, r1
201
+    add         r3, r3
202
+
203
+    vld1.s16    {d16}, [r0], r1
204
+    vld1.s16    {d18}, [r2], r3
205
+    vsub.s16    q2, q8, q9
206
+    vld1.s16    {d16}, [r0], r1
207
+    vmull.s16   q0, d4, d4
208
+    vld1.s16    {d18}, [r2], r3
209
+
210
+    vsub.s16    q2, q8, q9
211
+    vld1.s16    {d16}, [r0], r1
212
+    vmlal.s16   q0, d4, d4
213
+    vld1.s16    {d18}, [r2], r3
214
+
215
+    vsub.s16    q2, q8, q9
216
+    vld1.s16    {d16}, [r0], r1
217
+    vmlal.s16   q0, d4, d4
218
+    vld1.s16    {d18}, [r2], r3
219
+
220
+    vsub.s16    q2, q8, q9
221
+    vmlal.s16   q0, d4, d4
222
+
223
+    vadd.s32    d0, d0, d1
224
+    vpadd.s32   d0, d0, d0
225
+    vmov.32     r0, d0[0]
226
+    bx          lr
227
+endfunc
228
+
229
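The _ss variants compare two int16_t (residual) blocks, so their strides arrive in elements rather than bytes; the leading "add r1, r1" / "add r3, r3" doubles them into byte strides before they are used as load offsets.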
+function x265_pixel_sse_ss_8x8_neon
230
+    add         r1, r1
231
+    add         r3, r3
232
+
233
+    vld1.s16    {q8}, [r0], r1
234
+    vld1.s16    {q9}, [r2], r3
235
+    vsub.s16    q8, q9
236
+    vmull.s16   q0, d16, d16
237
+    vmull.s16   q1, d17, d17
238
+
239
+.rept 7
240
+    vld1.s16    {q8}, [r0], r1
241
+    vld1.s16    {q9}, [r2], r3
242
+    vsub.s16    q8, q9
243
+    vmlal.s16   q0, d16, d16
244
+    vmlal.s16   q1, d17, d17
245
+.endr
246
+    vadd.s32    q0, q1
247
+    vadd.s32    d0, d0, d1
248
+    vpadd.s32   d0, d0, d0
249
+    vmov.32     r0, d0[0]
250
+    bx          lr
251
+endfunc
252
+
253
+function x265_pixel_sse_ss_16x16_neon
254
+    add         r1, r1
255
+    add         r3, r3
256
+
257
+    mov         r12, #4
258
+    veor.u8     q0, q0
259
+    veor.u8     q1, q1
260
+
261
+.loop_sse_ss_16:
262
+    subs        r12, #1
263
+.rept 4
264
+    vld1.s16    {q8-q9}, [r0], r1
265
+    vld1.s16    {q10-q11}, [r2], r3
266
+    vsub.s16    q8, q10
267
+    vsub.s16    q9, q11
268
+    vmlal.s16   q0, d16, d16
269
+    vmlal.s16   q1, d17, d17
270
+    vmlal.s16   q0, d18, d18
271
+    vmlal.s16   q1, d19, d19
272
+.endr
273
+    bne         .loop_sse_ss_16
274
+    vadd.s32    q0, q1
275
+    vadd.s32    d0, d0, d1
276
+    vpadd.s32   d0, d0, d0
277
+    vmov.32     r0, d0[0]
278
+    bx          lr
279
+endfunc
280
+
281
+function x265_pixel_sse_ss_32x32_neon
282
+    add         r1, r1
283
+    add         r3, r3
284
+    sub         r1, #32
285
+    sub         r3, #32
286
+    mov         r12, #8
287
+    veor.u8     q0, q0
288
+    veor.u8     q1, q1
289
+
290
+.loop_sse_ss_32:
291
+    subs        r12, #1
292
+.rept 4
293
+    vld1.s16    {q8-q9}, [r0]!
294
+    vld1.s16    {q10-q11}, [r2]!
295
+    vsub.s16    q8, q10
296
+    vsub.s16    q9, q11
297
+    vmlal.s16   q0, d16, d16
298
+    vmlal.s16   q1, d17, d17
299
+    vmlal.s16   q0, d18, d18
300
+    vmlal.s16   q1, d19, d19
301
+
302
+    vld1.s16    {q8-q9}, [r0], r1
303
+    vld1.s16    {q10-q11}, [r2], r3
304
+    vsub.s16    q8, q10
305
+    vsub.s16    q9, q11
306
+    vmlal.s16   q0, d16, d16
307
+    vmlal.s16   q1, d17, d17
308
+    vmlal.s16   q0, d18, d18
309
+    vmlal.s16   q1, d19, d19
310
+.endr
311
+    bne         .loop_sse_ss_32
312
+    vadd.s32    q0, q1
313
+    vadd.s32    d0, d0, d1
314
+    vpadd.s32   d0, d0, d0
315
+    vmov.32     r0, d0[0]
316
+    bx          lr
317
+endfunc
318
+
319
+function x265_pixel_sse_ss_64x64_neon
320
+    add         r1, r1
321
+    add         r3, r3
322
+    sub         r1, #96
323
+    sub         r3, #96
324
+    mov         r12, #32
325
+    veor.u8     q0, q0
326
+    veor.u8     q1, q1
327
+
328
+.loop_sse_ss_64:
329
+    subs        r12, #1
330
+.rept 2
331
+    vld1.s16    {q8-q9}, [r0]!
332
+    vld1.s16    {q10-q11}, [r2]!
333
+    vsub.s16    q8, q10
334
+    vsub.s16    q9, q11
335
+    vmlal.s16   q0, d16, d16
336
+    vmlal.s16   q1, d17, d17
337
+    vmlal.s16   q0, d18, d18
338
+    vmlal.s16   q1, d19, d19
339
+
340
+    vld1.s16    {q8-q9}, [r0]!
341
+    vld1.s16    {q10-q11}, [r2]!
342
+    vsub.s16    q8, q10
343
+    vsub.s16    q9, q11
344
+    vmlal.s16   q0, d16, d16
345
+    vmlal.s16   q1, d17, d17
346
+    vmlal.s16   q0, d18, d18
347
+    vmlal.s16   q1, d19, d19
348
+
349
+    vld1.s16    {q8-q9}, [r0]!
350
+    vld1.s16    {q10-q11}, [r2]!
351
+    vsub.s16    q8, q10
352
+    vsub.s16    q9, q11
353
+    vmlal.s16   q0, d16, d16
354
+    vmlal.s16   q1, d17, d17
355
+    vmlal.s16   q0, d18, d18
356
+    vmlal.s16   q1, d19, d19
357
+
358
+    vld1.s16    {q8-q9}, [r0], r1
359
+    vld1.s16    {q10-q11}, [r2], r3
360
+    vsub.s16    q8, q10
361
+    vsub.s16    q9, q11
362
+    vmlal.s16   q0, d16, d16
363
+    vmlal.s16   q1, d17, d17
364
+    vmlal.s16   q0, d18, d18
365
+    vmlal.s16   q1, d19, d19
366
+.endr
367
+    bne         .loop_sse_ss_64
368
+    vadd.s32    q0, q1
369
+    vadd.s32    d0, d0, d1
370
+    vpadd.s32   d0, d0, d0
371
+    vmov.32     r0, d0[0]
372
+    bx          lr
373
+endfunc
374
+
375
+function x265_pixel_ssd_s_4x4_neon
376
+    add         r1, r1
377
+    vld1.s16    {d4}, [r0], r1
378
+    vld1.s16    {d5}, [r0], r1
379
+    vld1.s16    {d6}, [r0], r1
380
+    vld1.s16    {d7}, [r0]
381
+    vmull.s16   q0, d4, d4
382
+    vmull.s16   q1, d5, d5
383
+    vmlal.s16   q0, d6, d6
384
+    vmlal.s16   q1, d7, d7
385
+    vadd.s32    q0, q1
386
+    vadd.s32    d0, d0, d1
387
+    vpadd.s32   d0, d0, d0
388
+    vmov.32     r0, d0[0]
389
+    bx          lr
390
+endfunc
391
+
392
+function x265_pixel_ssd_s_8x8_neon
393
+    add         r1, r1
394
+    vld1.s16    {q8}, [r0], r1
395
+    vld1.s16    {q9}, [r0], r1
396
+    vmull.s16   q0, d16, d16
397
+    vmull.s16   q1, d17, d17
398
+    vmlal.s16   q0, d18, d18
399
+    vmlal.s16   q1, d19, d19
400
+.rept 3
401
+    vld1.s16    {q8}, [r0], r1
402
+    vld1.s16    {q9}, [r0], r1
403
+    vmlal.s16   q0, d16, d16
404
+    vmlal.s16   q1, d17, d17
405
+    vmlal.s16   q0, d18, d18
406
+    vmlal.s16   q1, d19, d19
407
+.endr
408
+    vadd.s32    q0, q1
409
+    vadd.s32    d0, d0, d1
410
+    vpadd.s32   d0, d0, d0
411
+    vmov.32     r0, d0[0]
412
+    bx          lr
413
+endfunc
414
+
415
+function x265_pixel_ssd_s_16x16_neon
416
+    add         r1, r1
417
+    mov         r12, #4
418
+    veor.u8     q0, q0
419
+    veor.u8     q1, q1
420
+
421
+.loop_ssd_s_16:
422
+    subs        r12, #1
423
+.rept 2
424
+    vld1.s16    {q8-q9}, [r0], r1
425
+    vld1.s16    {q10-q11}, [r0], r1
426
+    vmlal.s16   q0, d16, d16
427
+    vmlal.s16   q1, d17, d17
428
+    vmlal.s16   q0, d18, d18
429
+    vmlal.s16   q1, d19, d19
430
+    vmlal.s16   q0, d20, d20
431
+    vmlal.s16   q1, d21, d21
432
+    vmlal.s16   q0, d22, d22
433
+    vmlal.s16   q1, d23, d23
434
+.endr
435
+    bne         .loop_ssd_s_16
436
+    vadd.s32    q0, q1
437
+    vadd.s32    d0, d0, d1
438
+    vpadd.s32   d0, d0, d0
439
+    vmov.32     r0, d0[0]
440
+    bx          lr
441
+endfunc
442
+
443
+function x265_pixel_ssd_s_32x32_neon
444
+    add         r1, r1
445
+    sub         r1, #32
446
+    mov         r12, #8
447
+    veor.u8     q0, q0
448
+    veor.u8     q1, q1
449
+
450
+.loop_ssd_s_32:
451
+    subs        r12, #1
452
+.rept 4
453
+    vld1.s16    {q8-q9}, [r0]!
454
+    vld1.s16    {q10-q11}, [r0], r1
455
+    vmlal.s16   q0, d16, d16
456
+    vmlal.s16   q1, d17, d17
457
+    vmlal.s16   q0, d18, d18
458
+    vmlal.s16   q1, d19, d19
459
+    vmlal.s16   q0, d20, d20
460
+    vmlal.s16   q1, d21, d21
461
+    vmlal.s16   q0, d22, d22
462
+    vmlal.s16   q1, d23, d23
463
+.endr
464
+    bne         .loop_ssd_s_32
465
+    vadd.s32    q0, q1
466
+    vadd.s32    d0, d0, d1
467
+    vpadd.s32   d0, d0, d0
468
+    vmov.32     r0, d0[0]
469
+    bx          lr
470
+endfunc
471
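ssd_s takes a single int16_t block and returns the sum of its squared samples, i.e. the energy of a residual (SSE against an all-zero block), which is why there is only one pointer/stride pair and no subtraction before the multiply-accumulate. A scalar sketch:

    #include <cstdint>

    // Reference for the ssd_s kernels (a sketch; stride in int16_t elements,
    // as the assembly doubles r1 on entry).
    static int ssd_s_ref(const int16_t* block, intptr_t stride, int size)
    {
        int sum = 0;
        for (int y = 0; y < size; y++, block += stride)
            for (int x = 0; x < size; x++)
                sum += block[x] * block[x];
        return sum;
    }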
x265_1.9.tar.gz/source/common/common.cpp -> x265_2.0.tar.gz/source/common/common.cpp Changed
118
 
1
@@ -29,6 +29,8 @@
2
 #if _WIN32
3
 #include <sys/types.h>
4
 #include <sys/timeb.h>
5
+#include <io.h>
6
+#include <fcntl.h>
7
 #else
8
 #include <sys/time.h>
9
 #endif
10
@@ -139,6 +141,94 @@
11
     fputs(buffer, stderr);
12
 }
13
 
14
+#if _WIN32
15
+/* For Unicode filenames on Windows we convert UTF-8 strings to UTF-16 and use the _w* functions.
16
+ * On other operating systems no conversion is needed. */
17
+void general_log_file(const x265_param* param, const char* caller, int level, const char* fmt, ...)
18
+{
19
+    if (param && level > param->logLevel)
20
+        return;
21
+    const int bufferSize = 4096;
22
+    char buffer[bufferSize];
23
+    int p = 0;
24
+    const char* log_level;
25
+    switch (level)
26
+    {
27
+    case X265_LOG_ERROR:
28
+        log_level = "error";
29
+        break;
30
+    case X265_LOG_WARNING:
31
+        log_level = "warning";
32
+        break;
33
+    case X265_LOG_INFO:
34
+        log_level = "info";
35
+        break;
36
+    case X265_LOG_DEBUG:
37
+        log_level = "debug";
38
+        break;
39
+    case X265_LOG_FULL:
40
+        log_level = "full";
41
+        break;
42
+    default:
43
+        log_level = "unknown";
44
+        break;
45
+    }
46
+
47
+    if (caller)
48
+        p += sprintf(buffer, "%-4s [%s]: ", caller, log_level);
49
+    va_list arg;
50
+    va_start(arg, fmt);
51
+    vsnprintf(buffer + p, bufferSize - p, fmt, arg);
52
+    va_end(arg);
53
+
54
+    HANDLE console = GetStdHandle(STD_ERROR_HANDLE);
55
+    DWORD mode;
56
+    if (GetConsoleMode(console, &mode))
57
+    {
58
+        wchar_t buf_utf16[bufferSize];
59
+        int length_utf16 = MultiByteToWideChar(CP_UTF8, 0, buffer, -1, buf_utf16, sizeof(buf_utf16)/sizeof(wchar_t)) - 1;
60
+        if (length_utf16 > 0)
61
+            WriteConsoleW(console, buf_utf16, length_utf16, &mode, NULL);
62
+    }
63
+    else
64
+        fputs(buffer, stderr);
65
+}
66
+
67
+FILE* x265_fopen(const char* fileName, const char* mode)
68
+{
69
+    wchar_t buf_utf16[MAX_PATH * 2], mode_utf16[16];
70
+
71
+    if (MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, fileName, -1, buf_utf16, sizeof(buf_utf16)/sizeof(wchar_t)) &&
72
+        MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, mode, -1, mode_utf16, sizeof(mode_utf16)/sizeof(wchar_t)))
73
+    {
74
+        return _wfopen(buf_utf16, mode_utf16);
75
+    }
76
+    return NULL;
77
+}
78
+
79
+int x265_unlink(const char* fileName)
80
+{
81
+    wchar_t buf_utf16[MAX_PATH * 2];
82
+
83
+    if (MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, fileName, -1, buf_utf16, sizeof(buf_utf16)/sizeof(wchar_t)))
84
+        return _wunlink(buf_utf16);
85
+
86
+    return -1;
87
+}
88
+
89
+int x265_rename(const char* oldName, const char* newName)
90
+{
91
+    wchar_t old_utf16[MAX_PATH * 2], new_utf16[MAX_PATH * 2];
92
+
93
+    if (MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, oldName, -1, old_utf16, sizeof(old_utf16)/sizeof(wchar_t)) &&
94
+        MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, newName, -1, new_utf16, sizeof(new_utf16)/sizeof(wchar_t)))
95
+    {
96
+        return _wrename(old_utf16, new_utf16);
97
+    }
98
+    return -1;
99
+}
100
+#endif
101
+
102
 double x265_ssim2dB(double ssim)
103
 {
104
     double inv_ssim = 1 - ssim;
105
@@ -177,10 +267,10 @@
106
     size_t fSize;
107
     char *buf = NULL;
108
 
109
-    FILE *fh = fopen(filename, "rb");
110
+    FILE *fh = x265_fopen(filename, "rb");
111
     if (!fh)
112
     {
113
-        x265_log(NULL, X265_LOG_ERROR, "unable to open file %s\n", filename);
114
+        x265_log_file(NULL, X265_LOG_ERROR, "unable to open file %s\n", filename);
115
         return NULL;
116
     }
117
 
118
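With these wrappers, callers treat every filename as UTF-8 regardless of platform; only the Windows build converts to UTF-16 and calls the _w* CRT functions. A sketch of a call site (hypothetical filename, assuming common.h is included):

    static void open_stats_example()
    {
        FILE* f = x265_fopen("pass1-stats.log", "rb");
        if (!f)
            x265_log_file(NULL, X265_LOG_ERROR, "unable to open file %s\n", "pass1-stats.log");
        else
            fclose(f);
    }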
x265_1.9.tar.gz/source/common/common.h -> x265_2.0.tar.gz/source/common/common.h Changed
30
 
1
@@ -322,6 +322,8 @@
2
 #define MAX_NUM_TR_COEFFS           MAX_TR_SIZE * MAX_TR_SIZE // Maximum number of transform coefficients, for a 32x32 transform
3
 #define MAX_NUM_TR_CATEGORIES       16                        // 32, 16, 8, 4 transform categories each for luma and chroma
4
 
5
+#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
6
+
7
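As a quick check: with X265_DEPTH = 8 this gives PIXEL_MAX = (1 << 8) - 1 = 255, and a 10-bit build gives 1023.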
 namespace X265_NS {
8
 
9
 enum { SAO_NUM_OFFSET = 4 };
10
@@ -402,7 +404,19 @@
11
 /* located in common.cpp */
12
 int64_t  x265_mdate(void);
13
 #define  x265_log(param, ...) general_log(param, "x265", __VA_ARGS__)
14
+#define  x265_log_file(param, ...) general_log_file(param, "x265", __VA_ARGS__)
15
 void     general_log(const x265_param* param, const char* caller, int level, const char* fmt, ...);
16
+#if _WIN32
17
+void     general_log_file(const x265_param* param, const char* caller, int level, const char* fmt, ...);
18
+FILE*    x265_fopen(const char* fileName, const char* mode);
19
+int      x265_unlink(const char* fileName);
20
+int      x265_rename(const char* oldName, const char* newName);
21
+#else
22
+#define  general_log_file(param, caller, level, fmt, ...) general_log(param, caller, level, fmt, __VA_ARGS__)
23
+#define  x265_fopen(fileName, mode) fopen(fileName, mode)
24
+#define  x265_unlink(fileName) unlink(fileName)
25
+#define  x265_rename(oldName, newName) rename(oldName, newName)
26
+#endif
27
 int      x265_exp2fix8(double x);
28
 
29
 double   x265_ssim2dB(double ssim);
30
x265_1.9.tar.gz/source/common/constants.cpp -> x265_2.0.tar.gz/source/common/constants.cpp Changed
 
@@ -555,18 +555,6 @@
     0x38, 
 };
 
-/* Contains how much to increment shared depth buffer for different ctu sizes to get next best depth
- * here, depth 0 = 64x64, depth 1 = 32x32, depth 2 = 16x16 and depth 3 = 8x8
- * if ctu = 64, depth buffer size is 256 combination of depth values 0, 1, 2, 3
- * if ctu = 32, depth buffer size is 64 combination of depth values 1, 2, 3
- * if ctu = 16, depth buffer size is 16 combination of depth values 2, 3 */
-const uint32_t g_depthInc[3][4] =
-{
-    { 16,  4,  0, 0},
-    { 64, 16,  4, 1},
-    {256, 64, 16, 4}
-};
-
 /* g_depthScanIdx [y][x] */
 const uint32_t g_depthScanIdx[8][8] =
 {
@@ -580,4 +568,236 @@
     {  42,  43,  46,  47,  58,  59,  62,  63,  }
 };
 
+/* Rec.2020 YUV to RGB Non-constant luminance */
+const double g_YUVtoRGB_BT2020[3][3] = 
+{
+    {   1.00,   0.00,      1.47460,   },
+    {   1.00,  -0.16455,  -0.57135,   },
+    {   1.00,   1.88140,   0.00,      }
+};
+
+const double g_ST2084_PQTable[MAX_HDR_LEGAL_RANGE - MIN_HDR_LEGAL_RANGE + 1] = 
+{
+    0,
+    5.25912035416561E-05, 0.000170826479250824, 0.000342874260206259, 0.000565730978088069,
+    0.000838361593599196, 0.0011605708550711, 0.00153261170332205, 0.00195500928122658,
+    0.00242846920816411, 0.00295382484798614, 0.00353200479131171, 0.00416401171798929,
+    0.00485090808272845, 0.00559380610060962, 0.00639386055422149, 0.00725226351560689,
+    0.0081702404049783, 0.00914904700558975, 0.010189967177051, 0.0112943110883226,
+    0.0124634138437419, 0.0136986344106386, 0.0150013547814312, 0.0163729793201926,
+    0.0178149342559234, 0.0193286672936668, 0.0209156473211494, 0.022577364193536,
+    0.0243153285825585, 0.0261310718791221, 0.0280261461406398, 0.0300021240760516,
+    0.0320605990628007, 0.0342031851910785, 0.036431517331512, 0.0387472512230819,
+    0.0411520635786705, 0.0436476522060052, 0.046235736142162, 0.0489180558000865,
+    0.0516963731258075, 0.0545724717652363, 0.0575481572396137, 0.0606252571287911,
+    0.0638056212616694, 0.0670911219131892, 0.0704836540073949, 0.0739851353261047,
+    0.0775975067228409, 0.0813227323416811, 0.0851627998407477, 0.0891197206201265,
+    0.0931955300539647, 0.0973922877266004, 0.101712077672541, 0.106157008620188,
+    0.110729214239187, 0.115430853391267, 0.120264110384523, 0.125231195231086,
+    0.130334343908053, 0.135575818621706, 0.140957908074883, 0.146482927737596,
+    0.152153220120717, 0.157971155052834, 0.163939129960184, 0.170059570149691,
+    0.176334929095073, 0.182767688726043, 0.189360359720598, 0.196115481800328,
+    0.203035624028883, 0.210123385113499, 0.21738139370961, 0.224812308728624,
+    0.232418819648774, 0.240203646829142, 0.248169541826838, 0.256319287717358,
+    0.264655699418179, 0.273181624015456, 0.281899941094164, 0.29081356307129,
+    0.299925435532481, 0.309238537571936, 0.318755882135647, 0.32848051636804,
+    0.338415521962, 0.34856401551231, 0.358929148872555, 0.369514109515577,
+    0.380322120897342, 0.391356442824469, 0.402620371825233, 0.414117241524302,
+    0.425850423021013, 0.437823325271459, 0.450039395474131, 0.4625021194595,
+    0.475215022083238, 0.488181667623337, 0.501405660181076, 0.514890644085913,
+    0.528640304304275, 0.542658366852319, 0.556948599212766, 0.571514810755682,
+    0.58636085316357, 0.601490620860234, 0.616908051444177, 0.632617126126042,
+    0.648621870170268, 0.664926353341107, 0.681534690353104, 0.6984510413256,
+    0.715679612242097, 0.733224655413817, 0.751090469947712, 0.769281402219399,
+    0.78780184635024, 0.806656244689427, 0.82584908830055, 0.84538491745295,
+    0.865268322117971, 0.885503942469945, 0.906096469391926, 0.927050644986733,
+    0.948371263092526, 0.970063169803824, 0.99213126399724, 1.01458049786256,
+    1.03741587743901, 1.06064246315667, 1.08426537038311, 1.10828976997558,
+    1.13272088883845, 1.1575640104859, 1.18282447561067, 1.20850768265765,
+    1.23461908840365, 1.26116420854251, 1.28814861827608, 1.31557795291099,
+    1.34345790846097, 1.37179424225547, 1.40059277355414, 1.42985938416685,
+    1.45960001908056, 1.48982068709166, 1.52052746144494, 1.55172648047831,
+    1.58342394827458, 1.61562613531883, 1.6483393791628, 1.68157008509547,
+    1.71532472682031, 1.74960984713914, 1.78443205864284, 1.81979804440872,
+    1.85571455870433, 1.8921884276992, 1.92922655018235, 1.9668358982877,
+    2.0050235182263, 2.04379653102551, 2.0831621332761, 2.12312759788576,
+    2.16370027484092, 2.20488759197549, 2.2466970557472, 2.28913625202187,
+    2.33221284686502, 2.37593458734142, 2.42030930232274, 2.46534490330251,
+    2.51104938521982, 2.55743082729067, 2.60449739384781, 2.65225733518805,
+    2.70071898842928, 2.74989077837451, 2.79978121838576, 2.85039891126499,
+    2.90175255014517, 2.95385091938954, 3.00670289549934, 3.06031744803115,
+    3.11470364052283, 3.16987063142876, 3.22582767506471, 3.2825841225609,
+    3.3401494228253, 3.39853312351689, 3.45774487202715, 3.51779441647257,
+    3.57869160669604, 3.64044639527875, 3.7030688385618, 3.76656909767725,
+    3.83095743959148, 3.89624423815599, 3.96243997517042, 4.02955524145598,
+    4.09760073793895, 4.16658727674518, 4.2365257823051, 4.30742729247016,
+    4.37930295964014, 4.45216405190141, 4.52602195417663, 4.60088816938553,
+    4.67677431961831, 4.75369214731843, 4.83165351647993, 4.91067041385396,
+    4.99075495016979, 5.07191936136577, 5.15417600983301, 5.23753738567282,
+    5.32201610796449, 5.40762492604782, 5.49437672081637, 5.58228450602463,
+    5.67136142960816, 5.76162077501684, 5.85307596256082, 5.94574055077076,
+    6.03962823777015, 6.13475286266291, 6.2311284069342, 6.32876899586396,
+    6.42768889995753, 6.5279025363866, 6.62942447044656, 6.73226941703026,
+    6.83645224211186, 6.94198796425035, 7.04889175610325, 7.15717894596024,
+    7.2668650192892, 7.37796562029657, 7.49049655350635, 7.60447378535363,
+    7.71991344579293, 7.83683182992318, 7.95524539963073, 8.07517078524564,
+    8.19662478721649, 8.31962437780235, 8.44418670277909, 8.57032908316786,
+    8.69806901697162, 8.82742418094208, 8.95841243235119, 9.09105181078918,
+    9.22536053997842, 9.36135702960081, 9.4990598771529, 9.63848786980913,
+    9.77965998631185, 9.92259539887546, 10.0673134751131, 10.2138337799773,
+    10.3621760777285, 10.5123603339148, 10.6644067173761, 10.8183356022682,
+    10.9741675701064, 11.1319234118292, 11.2916241298841, 11.4532909403319,
+    11.6169452749761, 11.782608783511, 11.9503033356888, 12.120051023515,
+    12.2918741634627, 12.4657952987048, 12.6418372013776, 12.8200228748588,
+    13.0003755560757, 13.1829187178276, 13.367676071144, 13.5546715676512,
+    13.7439294019804, 13.9354740141834, 14.1293300921851, 14.3255225742508,
+    14.5240766514895, 14.7250177703705, 14.9283716352778, 15.1341642110757,
+    15.3424217257167, 15.5531706728631, 15.7664378145379, 15.9822501838117,
+    16.2006350874992, 16.4216201089027, 16.6452331105667, 16.8715022370722,
+    17.1004559178516, 17.3321228700381, 17.5665321013393, 17.8037129129401,
+    18.0436949024415, 18.2865079668192, 18.5321823054235, 18.7807484229967,
+    19.0322371327346, 19.2866795593684, 19.5441071422852, 19.8045516386728,
+    20.068045126707, 20.3346200087623, 20.6043090146575, 20.8771452049349,
+    21.1531619741772, 21.4323930543496, 21.7148725181833, 22.0006347825899,
+    22.2897146121093, 22.5821471224015, 22.8779677837589, 23.1772124246723,
+    23.4799172354157, 23.7861187716811, 24.0958539582449, 24.4091600926726,
+    24.7260748490581, 25.0466362818137, 25.3708828294739, 25.6988533185695,
+    26.0305869675189, 26.3661233905639, 26.7055026017538, 27.0487650189598,
+    27.3959514679386, 27.7471031864343, 28.1022618283194, 28.4614694677879,
+    28.8247686035749, 29.1922021632471, 29.5638135074984, 29.9396464345297,
+    30.3197451844465, 30.7041544437129, 31.0929193496474, 31.4860854949729,
+    31.8836989324014, 32.2858061792735, 32.6924542222466, 33.1036905220286,
+    33.5195630181606, 33.9401201338504, 34.3654107808513, 34.7954843644001,
+    35.2303907882032, 35.6701804594619, 36.1149042939698, 36.5646137212482,
+    37.0193606897411, 37.4791976720634, 37.944177670299, 38.4143542213633,
+    38.8897814024065, 39.3705138362898, 39.8566066971106, 40.3481157157767,
+    40.8450971856484, 41.3476079682522, 41.8557054990105, 42.369447793091,
+    42.8888934512647, 43.4141016658423, 43.9451322266965, 44.4820455273072,
+    45.0249025708978, 45.57376497661, 46.128694985791, 46.6897554682848,
+    47.257009928828, 47.8305225135037, 48.4103580162663, 48.9965818855272,
+    49.589260230802, 50.1884598294566, 50.794248133489, 51.4066932764077,
+    52.0258640801652, 52.6518300621766, 53.2846614424041, 53.9244291505136,
+    54.5712048331156, 55.2250608610794, 55.8860703369173, 56.5543071022513,
+    57.2298457453516, 57.9127616087739, 58.6031307970611, 59.3010301845114,
+    60.0065374230609, 60.7197309502355, 61.4406899971675, 62.1694945967356,
+    62.9062255917496, 63.6509646432403, 64.4037942388625, 65.1647977013236,
+    65.9340591969731, 66.7116637444152, 67.4976972232724, 68.2922463830112,
+    69.0953988518382, 69.9072431457598, 70.7278686776501, 71.5573657664994,
+    72.3958256466906, 73.2433404774142, 74.1000033521872, 74.9659083084248,
+    75.8411503371909, 76.7258253929696, 77.6200304036002, 78.5238632802992,
+    79.4374229277768, 80.3608092544678, 81.2941231828966, 82.2374666600933,
+    83.1909426682048, 84.154655235138, 85.1287094453491, 86.1132114507694,
+    87.108268481825, 88.1139888585565, 89.1304820019001, 90.1578584450571,
+    91.1962298449948, 92.2457089940652, 93.3064098317639, 94.3784474565997,
+    95.4619381380949, 96.5569993289116, 97.6637496771184, 98.7823090385655,
+    99.9127984894415, 101.055340338899, 102.210058141845, 103.377076711919,
+    104.556522134513, 105.748521780005, 106.953204317117, 108.170699726403,
+    109.401139313892, 110.644655724874, 111.901382957862, 113.171456378648,
+    114.455012734562, 115.752190168864, 117.063128235285, 118.387967912751,
+    119.726851620228, 121.079923231788, 122.447328091724, 123.829213029981,
+    125.225726377642, 126.637017982633, 128.063239225529, 129.504543035659,
+    130.961083907258, 132.43301791588, 133.920502734926, 135.423697652396,
+    136.942763587828, 138.477863109372, 140.029160451099, 141.596821530472,
+    143.181013966024, 144.781907095212, 146.399671992475, 148.034481487503,
+    149.686510183665, 151.355934476676, 153.042932573466, 154.747684511235,
+    156.470372176717, 158.211179325695, 159.970291602654, 161.747896560765,
+    163.544183681914, 165.359344397174, 167.193572107279, 169.047062203492,
+    170.920012088617, 172.812621198221, 174.725091022243, 176.657625126586,
+    178.610429175187, 180.583710952171, 182.577680384379, 184.59254956399,
+    186.628532771569, 188.685846499193, 190.764709473972, 192.865342681753,
+    194.987969391112, 197.13281517763, 199.300107948348, 201.490077966701,
+    203.702957877374, 205.938982731875, 208.198390014006, 210.481419665809,
+    212.788314113849, 215.119318295558, 217.474679686168, 219.854648325694,
+    222.259476846381, 224.689420500319, 227.144737187562, 229.625687484264,
+    232.132534671514, 234.665544764103, 237.224986539876, 239.811131569336,
+    242.424254245529, 245.064631814346, 247.73254440507, 250.428275061399,
+    253.152109772633, 255.904337505438, 258.685250235678, 261.49514298094,
+    264.334313833161, 267.203063991664, 270.101697796781, 273.03052276345,
+    275.989849615675, 278.979992320954, 282.001268125309, 285.053997588697,
+    288.138504620796, 291.255116517118, 294.404163995707, 297.585981234071,
+    300.800905906628, 304.049279222569, 307.331445964095, 310.647754525259,
+    313.998556950887, 317.384208976364, 320.805070067649, 324.26150346164,
+    327.753876207298, 331.28255920701, 334.84792725845, 338.450359096983,
+    342.090237438443, 345.767949022632, 349.483884657022, 353.238439261111,
+    357.032011911288, 360.865005886229, 364.73782871259, 368.650892211681,
+    372.604612546163, 376.59941026756, 380.635710364328, 384.713942310386,
+    388.83454011424, 392.997942368521, 397.20459230049, 401.454937822634,
+    405.749431584178, 410.088531023082, 414.47269841859, 418.902400944533,
+    423.378110722949, 427.900304878816, 432.469465594816, 437.086080167171,
+    441.750641062068, 446.463645972511, 451.225597876033, 456.037005092914,
+    460.89838134554, 465.81024581748, 470.773123214509, 475.787543825096,
+    480.854043582649, 485.973164127686, 491.14545287122, 496.371463058725,
+    501.651753834779, 506.986890308486, 512.377443619739, 517.823991006384,
+    523.32711587159, 528.887407852831, 534.505462890955, 540.181883300517,
+    545.917277840779, 551.712261787277, 557.567457004939, 563.48349202123,
+    569.461002100643, 575.500629320033, 581.603022644652, 587.76883800521,
+    593.998738375827, 600.29339385279, 606.653481734616, 613.07968660232,
+    619.572700401503, 626.133222524762, 632.761959895347, 639.459627051767,
+    646.226946233466, 653.064647467273, 659.973468655012, 666.954155662449,
+    674.007462408703, 681.134150957274, 688.334991607664, 695.610762988527,
+    702.962252151562, 710.390254666907, 717.895574719168, 725.479025205175,
+    733.141427832198, 740.883613218127, 748.706420992262, 756.610699897378,
+    764.597307893424, 772.667112261926, 780.820989711908, 789.059826487117,
+    797.384518474445, 805.79597131351, 814.295100508111, 822.882831538009,
+    831.560099973222, 840.327851588798, 849.187042481472, 858.138639187298,
+    867.183618801265, 876.322969097945, 885.557688653527, 894.88878696958,
+    904.317284598324, 913.844213269149, 923.470616016881, 933.197547311661,
+    943.02607318998, 952.957271387842, 962.99223147528, 973.13205499233,
+    983.377855587028, 993.730759155025, 1004.19190398011, 1014.7624408779,
+    1025.44353334027, 1036.23635768138, 1047.14210318612, 1058.16197226031,
+    1069.29718058216, 1080.54895725615, 1091.91854496832, 1103.40720014439,
+    1115.01619310819, 1126.74680824381, 1138.60034415848, 1150.57811384819,
+    1162.68144486462, 1174.91167948465, 1187.27017488269, 1199.75830330268,
+    1212.37745223534, 1225.12902459516, 1238.01443890053, 1251.03512945689,
+    1264.19254654015, 1277.48815658428, 1290.92344237023, 1304.49990321753,
+    1318.21905517769, 1332.0824312314, 1346.09158148618, 1360.24807337821,
+    1374.55349187613, 1389.00943968636, 1403.61753746281, 1418.37942401772,
+    1433.29675653564, 1448.37121079053, 1463.60448136459, 1478.99828187054,
+    1494.55434517686, 1510.27442363459, 1526.16028930875, 1542.21373421151,
+    1558.43657053802, 1574.8306309066, 1591.39776860023, 1608.13985781215,
+    1625.05879389502, 1642.15649361107, 1659.43489538767, 1676.89595957601,
+    1694.54166871017, 1712.37402777397, 1730.39506446684, 1748.60682947636,
+    1767.01139675239, 1785.61086378491, 1804.40735188573, 1823.40300647457,
+    1842.59999736598, 1862.00051906422, 1881.60679105712, 1901.42105811765,
+    1921.44559060702, 1941.68268478254, 1962.13466310849, 1982.80387457295,
+    2003.69269500608, 2024.80352740423, 2046.13880225813, 2067.70097788409,
+    2089.4925407609, 2111.51600586931, 2133.77391703832, 2156.2688472933,
+    2179.00339921048, 2201.98020527506, 2225.20192824396, 2248.67126151315,
+    2272.39092949114, 2296.36368797505, 2320.59232453288, 2345.07965889086,
+    2369.82854332463, 2394.84186305701, 2420.1225366596, 2445.67351646045,
+    2471.4977889564, 2497.5983752314, 2523.97833137945, 2550.64074893434,
+    2577.58875530317, 2604.8255142071, 2632.35422612708, 2660.17812875505,
+    2688.30049745283, 2716.72464571406, 2745.45392563483, 2774.49172838938,
+    2803.84148471127, 2833.50666538283, 2863.49078172885, 2893.79738611828,
+    2924.43007247227, 2955.39247677789, 2986.68827760926, 3018.32119665627,
+    3050.29499925996, 3082.61349495315, 3115.28053801072, 3148.30002800544,
+    3181.67591037289, 3215.41217698172, 3249.51286671181, 3283.98206604386,
+    3318.8239096497, 3354.04258099714, 3389.64231295962, 3425.62738843341,
+    3462.00214096588, 3498.770955389, 3535.93826846362, 3573.50856952949,
+    3611.48640116911, 3649.87635987397, 3688.68309672536, 3727.91131807909,
+    3767.56578626554, 3807.6513202933, 3848.17279656462, 3889.13514960257,
+    3930.54337278366, 3972.40251908377, 4014.71770183098, 4057.49409547529,
+    4100.73693635754, 4144.45152349895, 4188.64321939905, 4233.31745083673,
+    4278.47970969433, 4324.13555378427, 4370.2906076885, 4416.9505636112,
+    4464.12118224336, 4511.80829363585, 4560.01779808583, 4608.75566703869,
+    4658.02794399743, 4707.84074544526, 4758.20026178446, 4809.11275828399,
+    4860.58457604072, 4912.6221329584, 4965.23192473005, 5018.42052584652,
+    5072.19459060902, 5126.56085415876, 5181.52613352201, 5237.09732866887,
+    5293.28142358609, 5350.08548736398, 5407.51667529896, 5465.58223001341,
+    5524.28948258769, 5583.64585370912, 5643.65885483892, 5704.33608939131,
+    5765.68525393099, 5827.71413938938, 5890.43063229428, 5953.84271601949,
+    6017.95847204743, 6082.78608125617, 6148.33382521752, 6214.610087517,
+    6281.62335509419, 6349.38221959681, 6417.89537875378, 6487.17163777577,
+    6557.21991076552, 6628.04922215295, 6699.66870814791, 6772.08761821761,
+    6845.31531658155, 6919.36128372573, 6994.23511794429, 7069.94653689413,
+    7146.5053791833, 7223.92160596987, 7302.20530258909, 7381.36668020537,
+    7461.41607748598, 7542.36396229371, 7624.22093341411, 7706.99772229679,
+    7790.70519482415, 7875.35435311374, 7960.95633733285, 8047.52242755054,
+    8135.06404560776, 8223.5927570193, 8313.12027290238, 8403.65845193137,
+    8495.21930231871, 8587.81498382941, 8681.45780982398, 8776.16024932246,
+    8871.93492910726, 8968.79463585546, 9066.75231829962, 9165.82108941207,
+    9266.0142286397, 9367.34518415456, 9469.8275751412, 9573.47519411942,
+    9678.30200930089, 9784.32216698275, 9891.54999396144, 10000
+};
+
 }
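g_ST2084_PQTable holds MAX_HDR_LEGAL_RANGE - MIN_HDR_LEGAL_RANGE + 1 = 940 - 64 + 1 = 877 entries running from 0 up to 10000, which matches a precomputed SMPTE ST 2084 (PQ) EOTF over the 10-bit legal code range: one absolute luminance value in cd/m² per code value. For reference, the standard EOTF that such a table would tabulate (the constants are from ST 2084 itself; the normalization shown is an assumption about how the table was generated, though it reproduces the first and last entries):

    #include <algorithm>
    #include <cmath>

    /* SMPTE ST 2084 EOTF: normalized signal N in [0,1] -> luminance in cd/m^2.
     * An 877-entry table like the one above can be built by evaluating this at
     * N = (code - 64) / (940.0 - 64) for each legal 10-bit code value. */
    static double st2084_eotf(double N)
    {
        const double m1 = 2610.0 / 16384.0;        /* 0.1593017578125 */
        const double m2 = 2523.0 / 4096.0 * 128.0; /* 78.84375        */
        const double c1 = 3424.0 / 4096.0;         /* 0.8359375       */
        const double c2 = 2413.0 / 4096.0 * 32.0;  /* 18.8515625      */
        const double c3 = 2392.0 / 4096.0 * 32.0;  /* 18.6875         */
        double p = std::pow(N, 1.0 / m2);
        return 10000.0 * std::pow(std::max(p - c1, 0.0) / (c2 - c3 * p), 1.0 / m1);
    }
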
x265_1.9.tar.gz/source/common/constants.h -> x265_2.0.tar.gz/source/common/constants.h Changed
 
@@ -96,9 +96,15 @@
 // Intra tables
 extern const uint8_t g_intraFilterFlags[NUM_INTRA_MODE];
 
-extern const uint32_t g_depthInc[3][4];
 extern const uint32_t g_depthScanIdx[8][8];
 
+extern const double g_YUVtoRGB_BT2020[3][3];
+
+#define MIN_HDR_LEGAL_RANGE 64
+#define MAX_HDR_LEGAL_RANGE 940
+#define CBCR_OFFSET 512
+extern const double g_ST2084_PQTable[MAX_HDR_LEGAL_RANGE - MIN_HDR_LEGAL_RANGE + 1];
+
 }
 
 #endif
x265_1.9.tar.gz/source/common/contexts.h -> x265_2.0.tar.gz/source/common/contexts.h Changed
 
@@ -117,196 +117,8 @@
 #define sbacGetEntropyBits(S, V) (g_entropyBits[(S) ^ (V)])
 #define sbacGetEntropyBitsTrm(V) (g_entropyBits[126 ^ (V)])
 
-#define MAX_NUM_CHANNEL_TYPE     2
-
 static const uint32_t ctxCbf[3][5] = { { 1, 0, 0, 0, 0 }, { 2, 3, 4, 5, 6 }, { 2, 3, 4, 5, 6 } };
-static const uint32_t significanceMapContextSetStart[MAX_NUM_CHANNEL_TYPE][3] = { { 0,  9, 21 }, { 0,  9, 12 } };
-static const uint32_t significanceMapContextSetSize[MAX_NUM_CHANNEL_TYPE][3]  = { { 9, 12,  6 }, { 9,  3,  3 } };
-static const uint32_t nonDiagonalScan8x8ContextOffset[MAX_NUM_CHANNEL_TYPE]   = {  6, 0  };
-static const uint32_t notFirstGroupNeighbourhoodContextOffset[MAX_NUM_CHANNEL_TYPE] = { 3, 0 };
-
-// initial probability for cu_transquant_bypass flag
-static const uint8_t INIT_CU_TRANSQUANT_BYPASS_FLAG[3][NUM_TQUANT_BYPASS_FLAG_CTX] =
-{
-    { 154 },
-    { 154 },
-    { 154 },
-};
-
-// initial probability for split flag
-static const uint8_t INIT_SPLIT_FLAG[3][NUM_SPLIT_FLAG_CTX] =
-{
-    { 107,  139,  126, },
-    { 107,  139,  126, },
-    { 139,  141,  157, },
-};
-
-static const uint8_t INIT_SKIP_FLAG[3][NUM_SKIP_FLAG_CTX] =
-{
-    { 197,  185,  201, },
-    { 197,  185,  201, },
-    { CNU,  CNU,  CNU, },
-};
-
-static const uint8_t INIT_MERGE_FLAG_EXT[3][NUM_MERGE_FLAG_EXT_CTX] =
-{
-    { 154, },
-    { 110, },
-    { CNU, },
-};
-
-static const uint8_t INIT_MERGE_IDX_EXT[3][NUM_MERGE_IDX_EXT_CTX] =
-{
-    { 137, },
-    { 122, },
-    { CNU, },
-};
-
-static const uint8_t INIT_PART_SIZE[3][NUM_PART_SIZE_CTX] =
-{
-    { 154,  139,  154, 154 },
-    { 154,  139,  154, 154 },
-    { 184,  CNU,  CNU, CNU },
-};
-
-static const uint8_t INIT_PRED_MODE[3][NUM_PRED_MODE_CTX] =
-{
-    { 134, },
-    { 149, },
-    { CNU, },
-};
-
-static const uint8_t INIT_INTRA_PRED_MODE[3][NUM_ADI_CTX] =
-{
-    { 183, },
-    { 154, },
-    { 184, },
-};
-
-static const uint8_t INIT_CHROMA_PRED_MODE[3][NUM_CHROMA_PRED_CTX] =
-{
-    { 152,  139, },
-    { 152,  139, },
-    {  63,  139, },
-};
-
-static const uint8_t INIT_INTER_DIR[3][NUM_INTER_DIR_CTX] =
-{
-    {  95,   79,   63,   31,  31, },
-    {  95,   79,   63,   31,  31, },
-    { CNU,  CNU,  CNU,  CNU, CNU, },
-};
-
-static const uint8_t INIT_MVD[3][NUM_MV_RES_CTX] =
-{
-    { 169,  198, },
-    { 140,  198, },
-    { CNU,  CNU, },
-};
-
-static const uint8_t INIT_REF_PIC[3][NUM_REF_NO_CTX] =
-{
-    { 153,  153 },
-    { 153,  153 },
-    { CNU,  CNU },
-};
-
-static const uint8_t INIT_DQP[3][NUM_DELTA_QP_CTX] =
-{
-    { 154,  154,  154, },
-    { 154,  154,  154, },
-    { 154,  154,  154, },
-};
-
-static const uint8_t INIT_QT_CBF[3][NUM_QT_CBF_CTX] =
-{
-    { 153,  111,  149,   92,  167,  154,  154 },
-    { 153,  111,  149,  107,  167,  154,  154 },
-    { 111,  141,   94,  138,  182,  154,  154 },
-};
-
-static const uint8_t INIT_QT_ROOT_CBF[3][NUM_QT_ROOT_CBF_CTX] =
-{
-    {  79, },
-    {  79, },
-    { CNU, },
-};
-
-static const uint8_t INIT_LAST[3][NUM_CTX_LAST_FLAG_XY] =
-{
-    { 125,  110,  124,  110,   95,   94,  125,  111,  111,   79,  125,  126,  111,  111,   79,
-      108,  123,   93 },
-    { 125,  110,   94,  110,   95,   79,  125,  111,  110,   78,  110,  111,  111,   95,   94,
-      108,  123,  108 },
-    { 110,  110,  124,  125,  140,  153,  125,  127,  140,  109,  111,  143,  127,  111,   79,
-      108,  123,   63 },
-};
-
-static const uint8_t INIT_SIG_CG_FLAG[3][2 * NUM_SIG_CG_FLAG_CTX] =
-{
-    { 121,  140,
-      61,  154, },
-    { 121,  140,
-      61,  154, },
-    {  91,  171,
-       134,  141, },
-};
-
-static const uint8_t INIT_SIG_FLAG[3][NUM_SIG_FLAG_CTX] =
-{
-    { 170,  154,  139,  153,  139,  123,  123,   63,  124,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  170,  153,  138,  138,  122,  121,  122,  121,  167,  151,  183,  140,  151,  183,  140,  },
-    { 155,  154,  139,  153,  139,  123,  123,   63,  153,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  170,  153,  123,  123,  107,  121,  107,  121,  167,  151,  183,  140,  151,  183,  140,  },
-    { 111,  111,  125,  110,  110,   94,  124,  108,  124,  107,  125,  141,  179,  153,  125,  107,  125,  141,  179,  153,  125,  107,  125,  141,  179,  153,  125,  140,  139,  182,  182,  152,  136,  152,  136,  153,  136,  139,  111,  136,  139,  111,  },
-};
-
-static const uint8_t INIT_ONE_FLAG[3][NUM_ONE_FLAG_CTX] =
-{
-    { 154,  196,  167,  167,  154,  152,  167,  182,  182,  134,  149,  136,  153,  121,  136,  122,  169,  208,  166,  167,  154,  152,  167,  182, },
-    { 154,  196,  196,  167,  154,  152,  167,  182,  182,  134,  149,  136,  153,  121,  136,  137,  169,  194,  166,  167,  154,  167,  137,  182, },
-    { 140,   92,  137,  138,  140,  152,  138,  139,  153,   74,  149,   92,  139,  107,  122,  152,  140,  179,  166,  182,  140,  227,  122,  197, },
-};
-
-static const uint8_t INIT_ABS_FLAG[3][NUM_ABS_FLAG_CTX] =
-{
-    { 107,  167,   91,  107,  107,  167, },
-    { 107,  167,   91,  122,  107,  167, },
-    { 138,  153,  136,  167,  152,  152, },
-};
-
-static const uint8_t INIT_MVP_IDX[3][NUM_MVP_IDX_CTX] =
-{
-    { 168 },
-    { 168 },
-    { CNU },
-};
-
-static const uint8_t INIT_SAO_MERGE_FLAG[3][NUM_SAO_MERGE_FLAG_CTX] =
-{
-    { 153,  },
-    { 153,  },
-    { 153,  },
-};
-
-static const uint8_t INIT_SAO_TYPE_IDX[3][NUM_SAO_TYPE_IDX_CTX] =
-{
-    { 160, },
-    { 185, },
-    { 200, },
-};
-
-static const uint8_t INIT_TRANS_SUBDIV_FLAG[3][NUM_TRANS_SUBDIV_FLAG_CTX] =
-{
-    { 224,  167,  122, },
-    { 124,  138,   94, },
-    { 153,  138,  138, },
-};
 
-static const uint8_t INIT_TRANSFORMSKIP_FLAG[3][2 * NUM_TRANSFORMSKIP_FLAG_CTX] =
-{
-    { 139,  139 },
-    { 139,  139 },
-    { 139,  139 },
-};
 }
 
 #endif // ifndef X265_CONTEXTS_H
x265_1.9.tar.gz/source/common/cpu.cpp -> x265_2.0.tar.gz/source/common/cpu.cpp Changed
 
@@ -274,9 +274,9 @@
         if (!cache && max_basic_cap >= 2)
         {
             // Cache and TLB Information
-            static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
-            static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67,
-                                                0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
+            static const char cache32_ids[] = { '\x0a','\x0c','\x41','\x42','\x43','\x44','\x45','\x82','\x83','\x84','\x85','\0' };
+            static const char cache64_ids[] = { '\x22','\x23','\x25','\x29','\x2c','\x46','\x47','\x49','\x60','\x66','\x67',
+                                                '\x68','\x78','\x79','\x7a','\x7b','\x7c','\x7c','\x7f','\x86','\x87','\0' };
             uint32_t buf[4];
             int max, i = 0;
             do
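The cpu.cpp rewrite is a portability fix rather than a behavior change: with a signed 8-bit char, integer literals like 0x82 (130) are out of range, and C++11 treats such narrowing conversions inside a braced initializer as ill-formed, so compilers reject or warn. Character literals carry the same byte values without triggering the narrowing rule:

    static const char bad[]  = { 0x82, 0 };       /* narrowing: rejected in C++11 where char is signed */
    static const char good[] = { '\x82', '\0' };  /* char literals: same bytes, no narrowing conversion */
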
x265_1.9.tar.gz/source/common/cudata.cpp -> x265_2.0.tar.gz/source/common/cudata.cpp Changed
 
@@ -480,7 +480,7 @@
 }
 
 /* The reverse of copyToPic, called only by encodeResidue */
-void CUData::copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp)
+void CUData::copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp, bool copyQp)
 {
     m_encData       = ctu.m_encData;
     m_slice         = ctu.m_slice;
@@ -491,7 +491,8 @@
     m_numPartitions = cuGeom.numPartitions;
 
     /* copy out all prediction info for this part */
-    m_partCopy((uint8_t*)m_qp, (uint8_t*)ctu.m_qp + m_absIdxInCTU);
+    if (copyQp) m_partCopy((uint8_t*)m_qp, (uint8_t*)ctu.m_qp + m_absIdxInCTU);
+
     m_partCopy(m_log2CUSize,   ctu.m_log2CUSize + m_absIdxInCTU);
     m_partCopy(m_lumaIntraDir, ctu.m_lumaIntraDir + m_absIdxInCTU);
     m_partCopy(m_tqBypass,     ctu.m_tqBypass + m_absIdxInCTU);
@@ -526,7 +527,7 @@
 }
 
 /* Only called by encodeResidue, these fields can be modified during inter/intra coding */
-void CUData::updatePic(uint32_t depth) const
+void CUData::updatePic(uint32_t depth, int picCsp) const
 {
     CUData& ctu = *m_encData->getPicCTU(m_cuAddr);
 
@@ -540,7 +541,7 @@
     uint32_t tmpY2 = m_absIdxInCTU << (LOG2_UNIT_SIZE * 2);
     memcpy(ctu.m_trCoeff[0] + tmpY2, m_trCoeff[0], sizeof(coeff_t)* tmpY);
 
-    if (ctu.m_chromaFormat != X265_CSP_I400)
+    if (ctu.m_chromaFormat != X265_CSP_I400 && picCsp != X265_CSP_I400)
     {
         m_partCopy(ctu.m_transformSkip[1] + m_absIdxInCTU, m_transformSkip[1]);
         m_partCopy(ctu.m_transformSkip[2] + m_absIdxInCTU, m_transformSkip[2]);
@@ -2088,6 +2089,7 @@
                 cu->absPartIdx = g_depthScanIdx[yOffset][xOffset] * 4;
                 cu->numPartitions = (NUM_4x4_PARTITIONS >> ((g_maxLog2CUSize - cu->log2CUSize) * 2));
                 cu->depth = g_log2Size[maxCUSize] - log2CUSize;
+                cu->geomRecurId = cuIdx;
 
                 cu->flags = 0;
                 CU_SET_FLAG(cu->flags, CUGeom::PRESENT, presentFlag);
x265_1.9.tar.gz/source/common/cudata.h -> x265_2.0.tar.gz/source/common/cudata.h Changed
 
@@ -87,6 +87,7 @@
     uint32_t numPartitions; // Number of 4x4 blocks in the CU
     uint32_t flags;         // CU flags.
     uint32_t depth;         // depth of this CU relative from CTU
+    uint32_t geomRecurId;   // Unique geom id from 0 to MAX_GEOMS - 1 for every depth
 };
 
 struct MVField
@@ -222,8 +223,8 @@
     void     copyToPic(uint32_t depth) const;
 
     /* RD-0 methods called only from encodeResidue */
-    void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp);
-    void     updatePic(uint32_t depth) const;
+    void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp, bool copyQp = true);
+    void     updatePic(uint32_t depth, int picCsp) const;
 
     void     setPartSizeSubParts(PartSize size)    { m_partSet(m_partSize, (uint8_t)size); }
     void     setPredModeSubParts(PredMode mode)    { m_partSet(m_predMode, (uint8_t)mode); }
@@ -246,7 +247,7 @@
     void     setPURefIdx(int list, int8_t refIdx, int absPartIdx, int puIdx);
 
     uint8_t  getCbf(uint32_t absPartIdx, TextType ttype, uint32_t tuDepth) const { return (m_cbf[ttype][absPartIdx] >> tuDepth) & 0x1; }
-    uint8_t  getQtRootCbf(uint32_t absPartIdx) const                             { if (m_chromaFormat == X265_CSP_I400) return m_cbf[0][absPartIdx] || false; else { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx];} }
+    bool     getQtRootCbf(uint32_t absPartIdx) const                             { return (m_cbf[0][absPartIdx] || ((m_chromaFormat != X265_CSP_I400) && (m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx]))); }
    int8_t   getRefQP(uint32_t currAbsIdxInCTU) const;
     uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*candMvField)[2], uint8_t* candDir) const;
     void     clipMv(MV& outMV) const;
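The getQtRootCbf rewrite preserves behavior while tidying the interface: the root CBF simply asks whether any of the CU's transform trees carry coefficients, the bool return drops the old `|| false` coercion, and the short-circuited form never reads the chroma CBF arrays for monochrome input. Case for case (y, cb, cr being the three CBF bits):

    /* X265_CSP_I400: old: y || false      new: y                 -> identical      */
    /* otherwise:     old: y || cb || cr   new: y || (cb || cr)   -> identical, but */
    /* the new form also skips the cb/cr loads entirely when chroma is absent.     */
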
x265_1.9.tar.gz/source/common/deblock.cpp -> x265_2.0.tar.gz/source/common/deblock.cpp Changed
 
@@ -319,27 +319,6 @@
     }
 }
 
-/* Deblocking of one line/column for the chrominance component
- * \param src     pointer to picture data
- * \param offset  offset value for picture data
- * \param tc      tc value
- * \param maskP   indicator to disable filtering on partP
- * \param maskQ   indicator to disable filtering on partQ */
-static inline void pelFilterChroma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
-{
-    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
-    {
-        int16_t m4  = (int16_t)src[0];
-        int16_t m3  = (int16_t)src[-offset];
-        int16_t m5  = (int16_t)src[offset];
-        int16_t m2  = (int16_t)src[-offset * 2];
-
-        int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) * 4) + m2 - m5 + 4) >> 3));
-        src[-offset] = x265_clip(m3 + (delta & maskP));
-        src[0] = x265_clip(m4 - (delta & maskQ));
-    }
-}
-
 void Deblock::edgeFilterLuma(const CUData* cuQ, uint32_t absPartIdx, uint32_t depth, int32_t dir, int32_t edge, const uint8_t blockStrength[])
 {
     PicYuv* reconPic = cuQ->m_encData->m_reconPic;
@@ -517,7 +496,7 @@
             int32_t tc = s_tcTable[indexTC] << bitdepthShift;
             pixel* srcC = srcChroma[chromaIdx];
 
-            pelFilterChroma(srcC + unitOffset, srcStep, offset, tc, maskP, maskQ);
+            primitives.pelFilterChroma[dir](srcC + unitOffset, srcStep, offset, tc, maskP, maskQ);
         }
     }
 }
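Removing the local pelFilterChroma and calling primitives.pelFilterChroma[dir] instead routes the chroma deblock through x265's function-pointer primitive table, mirroring pelFilterLumaStrong: the C fallback (registered in loopfilter.cpp below) can then be overridden per edge direction by an assembly kernel at setup time. The dispatch idea in miniature (types and the asm override are illustrative, not the actual primitives.h declarations):

    #include <cstdint>

    typedef uint16_t pixel;  /* uint8_t in 8-bit builds */
    typedef void (*pelFilterChroma_t)(pixel* src, intptr_t srcStep, intptr_t offset,
                                      int32_t tc, int32_t maskP, int32_t maskQ);

    struct Primitives
    {
        pelFilterChroma_t pelFilterChroma[2];  /* indexed by edge direction: [VER, HOR] */
    };

    /* setup order: install the C fallback, then let CPU detection overwrite it
     *   p.pelFilterChroma[0] = pelFilterChroma_c;
     *   p.pelFilterChroma[1] = pelFilterChroma_c;
     *   if (cpuMask supports it) p.pelFilterChroma[dir] = <asm kernel>;  (hypothetical) */
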
x265_1.9.tar.gz/source/common/frame.cpp -> x265_2.0.tar.gz/source/common/frame.cpp Changed
 
@@ -42,12 +42,14 @@
     m_prev = NULL;
     m_param = NULL;
     memset(&m_lowres, 0, sizeof(m_lowres));
+    m_rcData = NULL;
 }
 
 bool Frame::create(x265_param *param, float* quantOffsets)
 {
     m_fencPic = new PicYuv;
     m_param = param;
+    CHECKED_MALLOC_ZERO(m_rcData, RcStats, 1);
 
     if (m_fencPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp) &&
         m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode))
@@ -64,14 +66,17 @@
         return true;
     }
     return false;
+fail:
+    return false;
 }
 
 bool Frame::allocEncodeData(x265_param *param, const SPS& sps)
 {
     m_encData = new FrameData;
     m_reconPic = new PicYuv;
+    m_param = param;
     m_encData->m_reconPic = m_reconPic;
-    bool ok = m_encData->create(*param, sps) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
+    bool ok = m_encData->create(*param, sps, m_fencPic->m_picCsp) && m_reconPic->create(param->sourceWidth, param->sourceHeight, param->internalCsp);
     if (ok)
     {
         /* initialize right border of m_reconpicYuv as SAO may read beyond the
@@ -139,4 +144,5 @@
     }
 
     m_lowres.destroy();
+    X265_FREE(m_rcData);
 }
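CHECKED_MALLOC_ZERO explains the otherwise odd-looking new `fail:` label: x265's allocation macros jump to a local fail label when the allocation cannot be satisfied, and this is the first such allocation in Frame::create, so the function now needs somewhere to land. A sketch of how this style of macro is conventionally written (the real definition lives in common.h and may differ in detail):

    #define CHECKED_MALLOC_ZERO(var, type, count) \
        { \
            var = (type*)calloc(count, sizeof(type)); /* zero-initialized */ \
            if (!var) \
            { \
                x265_log(NULL, X265_LOG_ERROR, "malloc of size %d failed\n", \
                         (int)(sizeof(type) * (count))); \
                goto fail; \
            } \
        }
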
x265_1.9.tar.gz/source/common/frame.h -> x265_2.0.tar.gz/source/common/frame.h Changed
 
@@ -37,6 +37,27 @@
 
 #define IS_REFERENCED(frame) (frame->m_lowres.sliceType != X265_TYPE_B)
 
+/* Ratecontrol statistics */
+struct RcStats
+{
+    double   qpaRc;
+    double   qpAq;
+    double   qRceq;
+    double   qpNoVbv;
+    double   newQScale;
+    double   iCuCount;
+    double   pCuCount;
+    double   skipCuCount;
+    double   qScale;
+    int      mvBits;
+    int      miscBits;
+    int      coeffBits;
+    int      poc;
+    int      encodeOrder;
+    int      sliceType;
+    int      keptAsRef;
+};
+
 class Frame
 {
 public:
@@ -49,6 +70,7 @@
     /* Data associated with x265_picture */
     PicYuv*                m_fencPic;
     int                    m_poc;
+    int                    m_encodeOrder;
     int64_t                m_pts;                // user provided presentation time stamp
     int64_t                m_reorderedPts;
     int64_t                m_dts;
@@ -71,6 +93,7 @@
     Frame*                 m_prev;
     x265_param*            m_param;              // Points to the latest param set for the frame.
     x265_analysis_data     m_analysisData;
+    RcStats*               m_rcData;
     Frame();
 
     bool create(x265_param *param, float* quantOffsets);
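RcStats gives each Frame a rate-control scratchpad, allocated in Frame::create and released in the destroy path shown earlier; together with the new m_encodeOrder field it plausibly backs the changelog's "x265_rc_stats added to x265_picture": the rate controller records its per-frame decision points here so they can be copied out when the encoded frame is returned. A sketch of the producer side (field names come from the struct above; the function and its inputs are illustrative):

    void recordRcStats(Frame* frame, double qpaRc, double qpAq, int sliceType,
                       int mvBits, int miscBits, int coeffBits)
    {
        RcStats* rc     = frame->m_rcData;
        rc->poc         = frame->m_poc;
        rc->encodeOrder = frame->m_encodeOrder;
        rc->qpaRc       = qpaRc;
        rc->qpAq        = qpAq;
        rc->sliceType   = sliceType;
        rc->mvBits      = mvBits;
        rc->miscBits    = miscBits;
        rc->coeffBits   = coeffBits;
    }
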
x265_1.9.tar.gz/source/common/framedata.cpp -> x265_2.0.tar.gz/source/common/framedata.cpp Changed
 
@@ -31,17 +31,18 @@
     memset(this, 0, sizeof(*this));
 }
 
-bool FrameData::create(const x265_param& param, const SPS& sps)
+bool FrameData::create(const x265_param& param, const SPS& sps, int csp)
 {
     m_param = &param;
     m_slice  = new Slice;
     m_picCTU = new CUData[sps.numCUsInFrame];
+    m_picCsp = csp;
 
     m_cuMemPool.create(0, param.internalCsp, sps.numCUsInFrame);
     for (uint32_t ctuAddr = 0; ctuAddr < sps.numCUsInFrame; ctuAddr++)
         m_picCTU[ctuAddr].initialize(m_cuMemPool, 0, param.internalCsp, ctuAddr);
 
-    CHECKED_MALLOC(m_cuStat, RCStatCU, sps.numCUsInFrame);
+    CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame);
     CHECKED_MALLOC(m_rowStat, RCStatRow, sps.numCuInHeight);
     reinit(sps);
     return true;
x265_1.9.tar.gz/source/common/framedata.h -> x265_2.0.tar.gz/source/common/framedata.h Changed
 
@@ -146,10 +146,11 @@
     double         m_avgQpRc;    /* avg QP as decided by rate-control */
     double         m_avgQpAq;    /* avg QP as decided by AQ in addition to rate-control */
     double         m_rateFactor; /* calculated based on the Frame QP */
+    int            m_picCsp;
 
     FrameData();
 
-    bool create(const x265_param& param, const SPS& sps);
+    bool create(const x265_param& param, const SPS& sps, int csp);
     void reinit(const SPS& sps);
     void destroy();
     inline CUData* getPicCTU(uint32_t ctuAddr) { return &m_picCTU[ctuAddr]; }
@@ -168,10 +169,12 @@
 struct analysis_inter_data
 {
     MV*         mv;
+    WeightParam* wt;
     int32_t*    ref;
     uint8_t*    depth;
     uint8_t*    modes;
-    uint32_t*   bestMergeCand;
+    uint8_t*    partSize;
+    uint8_t*    mergeFlag;
 };
 }
 #endif // ifndef X265_FRAMEDATA_H
x265_1.9.tar.gz/source/common/ipfilter.cpp -> x265_2.0.tar.gz/source/common/ipfilter.cpp Changed
 
@@ -365,10 +365,10 @@
 template<int N, int width, int height>
 void interp_hv_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
 {
-    short immedVals[(64 + 8) * (64 + 8)];
+    ALIGN_VAR_32(int16_t, immed[width * (height + N - 1)]);
 
-    interp_horiz_ps_c<N, width, height>(src, srcStride, immedVals, width, idxX, 1);
-    filterVertical_sp_c<N>(immedVals + 3 * width, width, dst, dstStride, width, height, idxY);
+    interp_horiz_ps_c<N, width, height>(src, srcStride, immed, width, idxX, 1);
+    filterVertical_sp_c<N>(immed + (N / 2 - 1) * width, width, dst, dstStride, width, height, idxY);
 }
 }
 
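Two things change in interp_hv_pp_c: the fixed worst-case 72x72 short buffer becomes an exactly sized, 32-byte-aligned intermediate, and the vertical-pass offset is derived from the tap count instead of being hard-coded. The horizontal pass (invoked with its row-extension flag) produces height + N - 1 rows, and the vertical pass must start N/2 - 1 rows in; the old `3 * width` matched only the 8-tap case. Worked out:

    /* Intermediate rows for a WxH block with an N-tap filter: height + N - 1.
     * The vertical pass starts at row N/2 - 1:
     *   N = 8 (luma):   (8/2 - 1) * width = 3 * width   -- old constant happened to be right
     *   N = 4 (chroma): (4/2 - 1) * width = 1 * width   -- old constant read the wrong rows */
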
x265_1.9.tar.gz/source/common/loopfilter.cpp -> x265_2.0.tar.gz/source/common/loopfilter.cpp Changed
 
@@ -27,7 +27,6 @@
 #include "primitives.h"
 
 #define PIXEL_MIN 0
-#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
 
 namespace {
 
@@ -158,6 +157,27 @@
         src[offset * 2]  = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
     }
 }
+
+/* Deblocking of one line/column for the chrominance component
+* \param src     pointer to picture data
+* \param offset  offset value for picture data
+* \param tc      tc value
+* \param maskP   indicator to disable filtering on partP
+* \param maskQ   indicator to disable filtering on partQ */
+static void pelFilterChroma_c(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
+{
+    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
+    {
+        int16_t m4 = (int16_t)src[0];
+        int16_t m3 = (int16_t)src[-offset];
+        int16_t m5 = (int16_t)src[offset];
+        int16_t m2 = (int16_t)src[-offset * 2];
+
+        int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) * 4) + m2 - m5 + 4) >> 3));
+        src[-offset]  = x265_clip(m3 + (delta & maskP));
+        src[0]        = x265_clip(m4 - (delta & maskQ));
+    }
+}
 }
 
 namespace X265_NS {
@@ -176,5 +196,7 @@
     // C code is same for EDGE_VER and EDGE_HOR only asm code is different
     p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c;
     p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c;
+    p.pelFilterChroma[0]     = pelFilterChroma_c;
+    p.pelFilterChroma[1]     = pelFilterChroma_c;
 }
 }
x265_1.9.tar.gz/source/common/param.cpp -> x265_2.0.tar.gz/source/common/param.cpp Changed
 
@@ -121,9 +121,9 @@
     /* Source specifications */
     param->internalBitDepth = X265_DEPTH;
     param->internalCsp = X265_CSP_I420;
-
-    param->levelIdc = 0;
-    param->bHighTier = 0;
+    param->levelIdc = 0; //Auto-detect level
+    param->uhdBluray = 0;
+    param->bHighTier = 1; //Allow high tier by default
     param->interlaceMode = 0;
     param->bAnnexB = 1;
     param->bRepeatHeaders = 0;
@@ -164,6 +164,7 @@
     param->bEnableWeightedPred = 1;
     param->bEnableWeightedBiPred = 0;
     param->bEnableEarlySkip = 0;
+    param->bEnableRecursionSkip = 1;
     param->bEnableAMP = 0;
     param->bEnableRectInter = 0;
     param->rdLevel = 3;
@@ -193,6 +194,7 @@
     param->bLossless = 0;
     param->bCULossless = 0;
     param->bEnableTemporalSubLayers = 0;
+    param->bEnableRdRefine = 0;
 
     /* Rate control options */
     param->rc.vbvMaxBitrate = 0;
@@ -219,8 +221,9 @@
     param->rc.qblur = 0.5;
     param->rc.zoneCount = 0;
     param->rc.zones = NULL;
-    param->rc.bEnableSlowFirstPass = 0;
+    param->rc.bEnableSlowFirstPass = 1;
     param->rc.bStrictCbr = 0;
+    param->rc.bEnableGrain = 0;
 
     /* Video Usability Information (VUI) */
     param->vui.aspectRatioIdc = 0;
@@ -245,7 +248,7 @@
     param->maxCLL = 0;
     param->maxFALL = 0;
     param->minLuma = 0;
-    param->maxLuma = (1 << X265_DEPTH) - 1;
+    param->maxLuma = PIXEL_MAX;
 }
 
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
@@ -408,9 +411,9 @@
             param->maxNumMergeCand = 5;
             param->searchMethod = X265_STAR_SEARCH;
             param->bEnableTransformSkip = 1;
+            param->bEnableRecursionSkip = 0;
             param->maxNumReferences = 5;
             param->limitReferences = 0;
-            param->rc.bEnableSlowFirstPass = 1;
             param->bIntraInBFrames = 1;
             param->lookaheadSlices = 0; // disabled for best quality
             // TODO: optimized esa
@@ -453,16 +456,16 @@
         }
         else if (!strcmp(tune, "grain"))
         {
-            param->deblockingFilterBetaOffset = -2;
-            param->deblockingFilterTCOffset = -2;
-            param->bIntraInBFrames = 0;
-            param->rdoqLevel = 2;
-            param->psyRdoq = 10.0;
-            param->psyRd = 0.5;
             param->rc.ipFactor = 1.1;
-            param->rc.pbFactor = 1.1;
-            param->rc.aqStrength = 0.3;
-            param->rc.qCompress = 0.8;
+            param->rc.pbFactor = 1.0;
+            param->rc.cuTree = 0;
+            param->rc.aqMode = 0;
+            param->rc.qpStep = 1;
+            param->rc.bEnableGrain = 1;
+            param->bEnableRecursionSkip = 0;
+            param->psyRd = 4.0;
+            param->psyRdoq = 10.0;
+            param->bEnableSAO = 0;
         }
         else
             return -1;
@@ -616,6 +619,7 @@
     OPT("max-merge") p->maxNumMergeCand = (uint32_t)atoi(value);
     OPT("temporal-mvp") p->bEnableTemporalMvp = atobool(value);
     OPT("early-skip") p->bEnableEarlySkip = atobool(value);
+    OPT("rskip") p->bEnableRecursionSkip = atobool(value);
     OPT("rdpenalty") p->rdPenalty = atoi(value);
     OPT("tskip") p->bEnableTransformSkip = atobool(value);
     OPT("no-tskip-fast") p->bEnableTSkipFast = atobool(value);
@@ -702,6 +706,7 @@
         else
             p->psyRdoq = 0.0;
     }
+    OPT("rd-refine") p->bEnableRdRefine = atobool(value);
     OPT("signhide") p->bEnableSignHiding = atobool(value);
     OPT("b-intra") p->bIntraInBFrames = atobool(value);
     OPT("lft") p->bEnableLoopFilter = atobool(value); /* DEPRECATED */
@@ -757,6 +762,7 @@
         p->rc.qp = atoi(value);
         p->rc.rateControlMode = X265_RC_CQP;
     }
+    OPT("rc-grain") p->rc.bEnableGrain = atobool(value);
    OPT("zones")
     {
         p->rc.zoneCount = 1;
@@ -877,6 +883,7 @@
     OPT("max-cll") bError |= sscanf(value, "%hu,%hu", &p->maxCLL, &p->maxFALL) != 2;
     OPT("min-luma") p->minLuma = (uint16_t)atoi(value);
     OPT("max-luma") p->maxLuma = (uint16_t)atoi(value);
+    OPT("uhd-bd") p->uhdBluray = atobool(value);
     else
         return X265_PARAM_BAD_NAME;
 #undef OPT
@@ -1023,7 +1030,8 @@
 {
 #define CHECK(expr, msg) check_failed |= _confirm(param, expr, msg)
     int check_failed = 0; /* abort if there is a fatal configuration problem */
-
+    CHECK(param->uhdBluray == 1 && (X265_DEPTH != 10 || param->internalCsp != 1 || param->interlaceMode != 0),
+        "uhd-bd: bit depth, chroma subsample, source picture type must be 10, 4:2:0, progressive");
     CHECK(param->maxCUSize != 64 && param->maxCUSize != 32 && param->maxCUSize != 16,
           "max cu size must be 16, 32, or 64");
     if (check_failed == 1)
@@ -1096,7 +1104,7 @@
 
     CHECK(param->rc.rateControlMode > X265_RC_CRF || param->rc.rateControlMode < X265_RC_ABR,
           "Rate control mode is out of range");
-    CHECK(param->rdLevel < 0 || param->rdLevel > 6,
+    CHECK(param->rdLevel < 1 || param->rdLevel > 6,
           "RD Level is out of range");
     CHECK(param->rdoqLevel < 0 || param->rdoqLevel > 2,
         "RDOQ Level is out of range");
@@ -1194,12 +1202,12 @@
         CHECK(0 > param->noiseReductionIntra || param->noiseReductionIntra > 2000, "Valid noise reduction range 0 - 2000");
     if (param->noiseReductionInter)
         CHECK(0 > param->noiseReductionInter || param->noiseReductionInter > 2000, "Valid noise reduction range 0 - 2000");
-    CHECK(param->rc.rateControlMode == X265_RC_CRF && param->rc.bStatRead && param->rc.vbvMaxBitrate == 0,
-          "Constant rate-factor is incompatible with 2pass");
     CHECK(param->rc.rateControlMode == X265_RC_CQP && param->rc.bStatRead,
           "Constant QP is incompatible with 2pass");
     CHECK(param->rc.bStrictCbr && (param->rc.bitrate <= 0 || param->rc.vbvBufferSize <=0),
          "Strict-cbr cannot be applied without specifying target bitrate or vbv bufsize");
+    CHECK(param->analysisMode && (param->analysisMode < X265_ANALYSIS_OFF || param->analysisMode > X265_ANALYSIS_LOAD),
+        "Invalid analysis mode. Analysis mode 0: OFF 1: SAVE : 2 LOAD");
     return check_failed;
 }
 
@@ -1225,18 +1233,21 @@
     uint32_t maxLog2CUSize = (uint32_t)g_log2Size[param->maxCUSize];
     uint32_t minLog2CUSize = (uint32_t)g_log2Size[param->minCUSize];
 
-    if (ATOMIC_INC(&g_ctuSizeConfigured) > 1)
+    Lock gLock;
+    ScopedLock sLock(gLock);
+
+    if (++g_ctuSizeConfigured > 1)
     {
         if (g_maxCUSize != param->maxCUSize)
         {
-            x265_log(param, X265_LOG_ERROR, "maxCUSize must be the same for all encoders in a single process");
-            return -1;
+            x265_log(param, X265_LOG_WARNING, "maxCUSize must be the same for all encoders in a single process");
         }
         if (g_maxCUDepth != maxLog2CUSize - minLog2CUSize)
         {
-            x265_log(param, X265_LOG_ERROR, "maxCUDepth must be the same for all encoders in a single process");
-            return -1;
+            x265_log(param, X265_LOG_WARNING, "maxCUDepth must be the same for all encoders in a single process");
         }
+        param->maxCUSize = g_maxCUSize;
+        return x265_check_params(param); /* Check again, since param may have changed */
     }
     else
     {
@@ -1302,8 +1313,9 @@
     x265_log(param, X265_LOG_INFO, "Lookahead / bframes / badapt        : %d / %d / %d\n", param->lookaheadDepth, param->bframes, param->bFrameAdaptive);
     x265_log(param, X265_LOG_INFO, "b-pyramid / weightp / weightb       : %d / %d / %d\n",
              param->bBPyramid, param->bEnableWeightedPred, param->bEnableWeightedBiPred);
-    x265_log(param, X265_LOG_INFO, "References / ref-limit  cu / depth  : %d / %d / %d\n",
-             param->maxNumReferences, !!(param->limitReferences & X265_REF_LIMIT_CU), !!(param->limitReferences & X265_REF_LIMIT_DEPTH));
+    x265_log(param, X265_LOG_INFO, "References / ref-limit  cu / depth  : %d / %s / %s\n",
+             param->maxNumReferences, (param->limitReferences & X265_REF_LIMIT_CU) ? "on" : "off",
+             (param->limitReferences & X265_REF_LIMIT_DEPTH) ? "on" : "off");
 
     if (param->rc.aqMode)
         x265_log(param, X265_LOG_INFO, "AQ: mode / str / qg-size / cu-tree  : %d / %0.1f / %d / %d\n", param->rc.aqMode,
@@ -1336,7 +1348,9 @@
     TOOLVAL(param->psyRd, "psy-rd=%.2lf");
     TOOLVAL(param->rdoqLevel, "rdoq=%d");
     TOOLVAL(param->psyRdoq, "psy-rdoq=%.2lf");
+    TOOLOPT(param->bEnableRdRefine, "rd-refine");
     TOOLOPT(param->bEnableEarlySkip, "early-skip");
+    TOOLOPT(param->bEnableRecursionSkip, "rskip");
     TOOLVAL(param->noiseReductionIntra, "nr-intra=%d");
     TOOLVAL(param->noiseReductionInter, "nr-inter=%d");
     TOOLOPT(param->bEnableTSkipFast, "tskip-fast");
@@ -1367,43 +1381,6 @@
     fflush(stderr);
 }
 
-void x265_print_reconfigured_params(x265_param* param, x265_param* reconfiguredParam)
-{
-    if (!param || !reconfiguredParam)
-        return;
-
-    x265_log(param,X265_LOG_INFO, "Reconfigured param options :\n");
-
-    char buf[80] = { 0 };
-    char tmp[40];
-#define TOOLCMP(COND1, COND2, STR, VAL)  if (COND1 != COND2) { sprintf(tmp, STR, VAL); appendtool(param, buf, sizeof(buf), tmp); }
-    TOOLCMP(param->maxNumReferences, reconfiguredParam->maxNumReferences, "ref=%d", reconfiguredParam->maxNumReferences);
-    TOOLCMP(param->maxTUSize, reconfiguredParam->maxTUSize, "max-tu-size=%d", reconfiguredParam->maxTUSize);
-    TOOLCMP(param->searchRange, reconfiguredParam->searchRange, "merange=%d", reconfiguredParam->searchRange);
-    TOOLCMP(param->subpelRefine, reconfiguredParam->subpelRefine, "subme= %d", reconfiguredParam->subpelRefine);
-    TOOLCMP(param->rdLevel, reconfiguredParam->rdLevel, "rd=%d", reconfiguredParam->rdLevel);
-    TOOLCMP(param->psyRd, reconfiguredParam->psyRd, "psy-rd=%.2lf", reconfiguredParam->psyRd);
-    TOOLCMP(param->rdoqLevel, reconfiguredParam->rdoqLevel, "rdoq=%d", reconfiguredParam->rdoqLevel);
-    TOOLCMP(param->psyRdoq, reconfiguredParam->psyRdoq, "psy-rdoq=%.2lf", reconfiguredParam->psyRdoq);
-    TOOLCMP(param->noiseReductionIntra, reconfiguredParam->noiseReductionIntra, "nr-intra=%d", reconfiguredParam->noiseReductionIntra);
-    TOOLCMP(param->noiseReductionInter, reconfiguredParam->noiseReductionInter, "nr-inter=%d", reconfiguredParam->noiseReductionInter);
-    TOOLCMP(param->bEnableTSkipFast, reconfiguredParam->bEnableTSkipFast, "tskip-fast=%d", reconfiguredParam->bEnableTSkipFast);
-    TOOLCMP(param->bEnableSignHiding, reconfiguredParam->bEnableSignHiding, "signhide=%d", reconfiguredParam->bEnableSignHiding);
-    TOOLCMP(param->bEnableFastIntra, reconfiguredParam->bEnableFastIntra, "fast-intra=%d", reconfiguredParam->bEnableFastIntra);
-    if (param->bEnableLoopFilter && (param->deblockingFilterBetaOffset != reconfiguredParam->deblockingFilterBetaOffset 
-        || param->deblockingFilterTCOffset != reconfiguredParam->deblockingFilterTCOffset))
-    {
-        sprintf(tmp, "deblock(tC=%d:B=%d)", param->deblockingFilterTCOffset, param->deblockingFilterBetaOffset);
-        appendtool(param, buf, sizeof(buf), tmp);
-    }
-    else
-        TOOLCMP(param->bEnableLoopFilter,  reconfiguredParam->bEnableLoopFilter, "deblock=%d", reconfiguredParam->bEnableLoopFilter);
-
-    TOOLCMP(param->bEnableTemporalMvp, reconfiguredParam->bEnableTemporalMvp, "tmvp=%d", reconfiguredParam->bEnableTemporalMvp);
-    TOOLCMP(param->bEnableEarlySkip, reconfiguredParam->bEnableEarlySkip, "early-skip=%d", reconfiguredParam->bEnableEarlySkip);
-    x265_log(param, X265_LOG_INFO, "tools:%s\n", buf);
-}
-
 char *x265_param2string(x265_param* p)
 {
     char *buf, *s;
@@ -1413,7 +1390,7 @@
         return NULL;
 
 #define BOOL(param, cliopt) \
-    s += sprintf(s, " %s", (param) ? cliopt : "no-"cliopt);
+    s += sprintf(s, " %s", (param) ? cliopt : "no-" cliopt);
 
     s += sprintf(s, "%dx%d", p->sourceWidth,p->sourceHeight);
     s += sprintf(s, " fps=%u/%u", p->fpsNum, p->fpsDenom);
@@ -1432,6 +1409,7 @@
     s += sprintf(s, " max-merge=%d", p->maxNumMergeCand);
     BOOL(p->bEnableTemporalMvp, "temporal-mvp");
     BOOL(p->bEnableEarlySkip, "early-skip");
+    BOOL(p->bEnableRecursionSkip, "rskip");
     s += sprintf(s, " rdpenalty=%d", p->rdPenalty);
     BOOL(p->bEnableTransformSkip, "tskip");
     BOOL(p->bEnableTSkipFast, "tskip-fast");
@@ -1465,9 +1443,10 @@
     s += sprintf(s, " psy-rd=%.2f", p->psyRd);
     s += sprintf(s, " rdoq-level=%d", p->rdoqLevel);
     s += sprintf(s, " psy-rdoq=%.2f", p->psyRdoq);
+    BOOL(p->bEnableRdRefine, "rd-refine");
     BOOL(p->bEnableSignHiding, "signhide");
     BOOL(p->bEnableLoopFilter, "deblock");
-    if (p->bEnableLoopFilter && (p->deblockingFilterBetaOffset || p->deblockingFilterTCOffset))
+    if (p->bEnableLoopFilter)
         s += sprintf(s, "=%d:%d", p->deblockingFilterTCOffset, p->deblockingFilterBetaOffset);
     BOOL(p->bEnableSAO, "sao");
     BOOL(p->bSaoNonDeblocked, "sao-non-deblock");
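Besides the new defaults (high tier allowed, slow first pass on, rskip on) and the reworked grain tune, this hunk softens the multi-encoder global check: a second encoder with mismatched maxCUSize/maxCUDepth now gets a warning and adopts the first encoder's CTU geometry instead of failing, and the check-and-set is serialized with a lock rather than a bare atomic increment. One caveat: gLock as written is constructed per call, so mutual exclusion depends on x265's Lock sharing state internally; the conventional shape keeps the mutex at file scope, as in this standard-C++ sketch (std::mutex standing in for x265's Lock):

    #include <cstdint>
    #include <mutex>

    static std::mutex g_configLock;       /* file scope: one lock shared by all callers */
    static int        g_timesConfigured;  /* guarded by g_configLock */
    static uint32_t   g_sharedMaxCUSize;

    int setGlobalsSketch(uint32_t maxCUSize)  /* illustrative, not x265's function */
    {
        std::lock_guard<std::mutex> guard(g_configLock);  /* released on every return path */
        if (++g_timesConfigured > 1)
        {
            /* warn on mismatch, then adopt the already-published value */
            if (maxCUSize != g_sharedMaxCUSize)
                maxCUSize = g_sharedMaxCUSize;
            return 0;
        }
        g_sharedMaxCUSize = maxCUSize;    /* first caller publishes the globals */
        return 0;
    }
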
x265_1.9.tar.gz/source/common/param.h -> x265_2.0.tar.gz/source/common/param.h Changed
 
@@ -30,7 +30,6 @@
 int   x265_check_params(x265_param *param);
 int   x265_set_globals(x265_param *param);
 void  x265_print_params(x265_param *param);
-void  x265_print_reconfigured_params(x265_param* param, x265_param* reconfiguredParam);
 void  x265_param_apply_fastfirstpass(x265_param *p);
 char* x265_param2string(x265_param *param);
 int   x265_atoi(const char *str, bool& bError);
x265_1.9.tar.gz/source/common/picyuv.cpp -> x265_2.0.tar.gz/source/common/picyuv.cpp Changed
90
 
1
@@ -46,6 +46,10 @@
2
 
3
     m_maxLumaLevel = 0;
4
     m_avgLumaLevel = 0;
5
+    m_stride = 0;
6
+    m_strideC = 0;
7
+    m_hChromaShift = 0;
8
+    m_vChromaShift = 0;
9
 }
10
 
11
 bool PicYuv::create(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp)
12
@@ -176,6 +180,7 @@
13
      * warnings from valgrind about using uninitialized pixels */
14
     padx++;
15
     pady++;
16
+    m_picCsp = pic.colorSpace;
17
 
18
     X265_CHECK(pic.bitDepth >= 8, "pic.bitDepth check failure");
19
 
20
@@ -190,7 +195,7 @@
21
 
22
             primitives.planecopy_cp(yChar, pic.stride[0] / sizeof(*yChar), yPixel, m_stride, width, height, shift);
23
 
24
-            if (pic.colorSpace != X265_CSP_I400)
25
+            if (param.internalCsp != X265_CSP_I400)
26
             {
27
                 pixel *uPixel = m_picOrg[1];
28
                 pixel *vPixel = m_picOrg[2];
29
@@ -216,7 +221,7 @@
30
                 yChar += pic.stride[0] / sizeof(*yChar);
31
             }
32
 
33
-            if (pic.colorSpace != X265_CSP_I400)
34
+            if (param.internalCsp != X265_CSP_I400)
35
             {
36
                 pixel *uPixel = m_picOrg[1];
37
                 pixel *vPixel = m_picOrg[2];
38
@@ -258,7 +263,7 @@
39
             primitives.planecopy_sp_shl(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
40
         }
41
 
42
-        if (pic.colorSpace != X265_CSP_I400)
43
+        if (param.internalCsp != X265_CSP_I400)
44
         {
45
             pixel *uPixel = m_picOrg[1];
46
             pixel *vPixel = m_picOrg[2];
47
@@ -279,12 +284,25 @@
48
         }
49
     }
50
 
51
-    /* extend the right edge if width was not multiple of the minimum CU size */
52
-    uint64_t sumLuma;
53
     pixel *Y = m_picOrg[0];
54
-    m_maxLumaLevel = primitives.planeClipAndMax(Y, m_stride, width, height, &sumLuma, (pixel)param.minLuma, (pixel)param.maxLuma);
55
-    m_avgLumaLevel = (double)(sumLuma) / (m_picHeight * m_picWidth);
56
+    pixel *U = m_picOrg[1];
57
+    pixel *V = m_picOrg[2];
58
 
59
+#if HIGH_BIT_DEPTH
60
+    bool calcHDRParams = !!param.minLuma || (param.maxLuma != PIXEL_MAX);
61
+    /* Apply min/max luma bounds for HDR pixel manipulations */
62
+    if (calcHDRParams)
63
+    {
64
+        X265_CHECK(pic.bitDepth == 10, "HDR stats can be applied/calculated only for 10bpp content");
65
+        uint64_t sumLuma;
66
+        m_maxLumaLevel = primitives.planeClipAndMax(Y, m_stride, width, height, &sumLuma, (pixel)param.minLuma, (pixel)param.maxLuma);
67
+        m_avgLumaLevel = (double) sumLuma / (m_picHeight * m_picWidth);
68
+    }
69
+#else
70
+    (void) param;
71
+#endif
72
+
73
+    /* extend the right edge if width was not multiple of the minimum CU size */
74
     for (int r = 0; r < height; r++)
75
     {
76
         for (int x = 0; x < padx; x++)
77
@@ -297,11 +315,8 @@
78
     for (int i = 1; i <= pady; i++)
79
         memcpy(Y + i * m_stride, Y, (width + padx) * sizeof(pixel));
80
 
81
-    if (pic.colorSpace != X265_CSP_I400)
82
+    if (param.internalCsp != X265_CSP_I400)
83
     {
84
-        pixel *U = m_picOrg[1];
85
-        pixel *V = m_picOrg[2];
86
-
87
         for (int r = 0; r < height >> m_vChromaShift; r++)
88
         {
89
             for (int x = 0; x < padx >> m_hChromaShift; x++)
90
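Note: the gist of the picyuv.cpp change above is that the clip-and-scan pass now runs only when min/max luma bounds are actually set on 10-bit input. A minimal standalone sketch of what that pass computes (illustrative values, not x265 code):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        uint16_t plane[4] = { 0, 512, 960, 1023 };      // 10-bit luma samples
        const uint16_t minPix = 64, maxPix = 940;       // e.g. limited-range bounds
        uint64_t sum = 0;
        uint16_t maxLuma = 0;
        for (auto &s : plane)
        {
            s = std::min(std::max(s, minPix), maxPix);  // x265_clip3 equivalent
            maxLuma = std::max(maxLuma, s);
            sum += s;
        }
        // clipped plane: 64, 512, 940, 940 -> max=940 avg=614.0
        printf("max=%u avg=%.1f\n", (unsigned)maxLuma, (double)sum / 4);
        return 0;
    }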
x265_1.9.tar.gz/source/common/picyuv.h -> x265_2.0.tar.gz/source/common/picyuv.h Changed
10
 
1
@@ -60,7 +60,7 @@
2
     uint32_t m_chromaMarginX;
3
     uint32_t m_chromaMarginY;
4
 
5
-    uint16_t m_maxLumaLevel;
6
+    pixel m_maxLumaLevel;
7
     double   m_avgLumaLevel;
8
 
9
     PicYuv();
10
x265_1.9.tar.gz/source/common/pixel.cpp -> x265_2.0.tar.gz/source/common/pixel.cpp Changed
72
 
1
@@ -607,7 +607,6 @@
2
  * s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784.
3
  * Maximum value for 9-bit is: ss*64 = (2^9-1)^2*16*4*64 = 1069551616, which will not overflow. */
4
 
5
-#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
6
 #if HIGH_BIT_DEPTH
7
     X265_CHECK((X265_DEPTH == 10) || (X265_DEPTH == 12), "ssim invalid depth\n");
8
 #define type float
9
@@ -873,7 +872,25 @@
10
     }
11
 }
12
 
13
-static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix)
14
+/* Conversion between double and Q8.8 fixed point (big-endian) for storage */
15
+static void cuTreeFix8Pack(uint16_t *dst, double *src, int count)
16
+{
17
+    for (int i = 0; i < count; i++)
18
+        dst[i] = (uint16_t)(src[i] * 256.0);
19
+}
20
+
21
+static void cuTreeFix8Unpack(double *dst, uint16_t *src, int count)
22
+{
23
+    for (int i = 0; i < count; i++)
24
+    {
25
+        int16_t qpFix8 = src[i];
26
+        dst[i] = (double)(qpFix8) / 256.0;
27
+    }
28
+}
29
+
30
+#if HIGH_BIT_DEPTH
31
+static pixel planeClipAndMax_c(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, 
32
+                               const pixel minPix, const pixel maxPix)
33
 {
34
     pixel maxLumaLevel = 0;
35
     uint64_t sumLuma = 0;
36
@@ -882,21 +899,18 @@
37
     {
38
         for (int c = 0; c < width; c++)
39
         {
40
-            /* Clip luma of source picture to max and min values before extending edges of picYuv */
41
+            /* Clip luma of source picture to max and min */
42
             src[c] = x265_clip3((pixel)minPix, (pixel)maxPix, src[c]);
43
-
44
-            /* Determine maximum and average luma level in a picture */
45
             maxLumaLevel = X265_MAX(src[c], maxLumaLevel);
46
             sumLuma += src[c];
47
         }
48
-
49
         src += stride;
50
     }
51
-
52
     *outsum = sumLuma;
53
     return maxLumaLevel;
54
 }
55
 
56
+#endif
57
 }  // end anonymous namespace
58
 
59
 namespace X265_NS {
60
@@ -1181,7 +1195,11 @@
61
     p.planecopy_cp = planecopy_cp_c;
62
     p.planecopy_sp = planecopy_sp_c;
63
     p.planecopy_sp_shl = planecopy_sp_shl_c;
64
+#if HIGH_BIT_DEPTH
65
     p.planeClipAndMax = planeClipAndMax_c;
66
+#endif
67
     p.propagateCost = estimateCUPropagateCost;
68
+    p.fix8Unpack = cuTreeFix8Unpack;
69
+    p.fix8Pack = cuTreeFix8Pack;
70
 }
71
 }
72
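Note: the cuTreeFix8Pack/cuTreeFix8Unpack pair added above stores cutree QP offsets as Q8.8 fixed point (multiply by 256 on the way in, signed divide on the way out), giving 1/256-QP precision in two bytes. A minimal round-trip sketch (illustrative, not x265 code):

    #include <cstdint>
    #include <cstdio>

    static uint16_t packFix8(double v)     { return (uint16_t)(int16_t)(v * 256.0); }
    static double   unpackFix8(uint16_t q) { return (double)(int16_t)q / 256.0; }

    int main()
    {
        double qp = -3.75;                 // negative offsets survive the int16_t cast
        uint16_t raw = packFix8(qp);       // 0xFC40
        printf("%.4f -> 0x%04X -> %.4f\n", qp, (unsigned)raw, unpackFix8(raw));
        return 0;
    }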
x265_1.9.tar.gz/source/common/predict.cpp -> x265_2.0.tar.gz/source/common/predict.cpp Changed
213
 
1
@@ -57,12 +57,10 @@
2
 
3
 Predict::Predict()
4
 {
5
-    m_immedVals = NULL;
6
 }
7
 
8
 Predict::~Predict()
9
 {
10
-    X265_FREE(m_immedVals);
11
     m_predShortYuv[0].destroy();
12
     m_predShortYuv[1].destroy();
13
 }
14
@@ -72,12 +70,8 @@
15
     m_csp = csp;
16
     m_hChromaShift = CHROMA_H_SHIFT(csp);
17
     m_vChromaShift = CHROMA_V_SHIFT(csp);
18
-    CHECKED_MALLOC(m_immedVals, int16_t, 64 * (64 + NTAPS_LUMA - 1));
19
 
20
     return m_predShortYuv[0].create(MAX_CU_SIZE, csp) && m_predShortYuv[1].create(MAX_CU_SIZE, csp);
21
-
22
-fail:
23
-    return false;
24
 }
25
 
26
 void Predict::motionCompensation(const CUData& cu, const PredictionUnit& pu, Yuv& predYuv, bool bLuma, bool bChroma)
27
@@ -258,8 +252,8 @@
28
     int partEnum = partitionFromSizes(pu.width, pu.height);
29
     const pixel* src = refPic.getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + srcOffset;
30
 
31
-    int xFrac = mv.x & 0x3;
32
-    int yFrac = mv.y & 0x3;
33
+    int xFrac = mv.x & 3;
34
+    int yFrac = mv.y & 3;
35
 
36
     if (!(yFrac | xFrac))
37
         primitives.pu[partEnum].copy_pp(dst, dstStride, src, srcStride);
38
@@ -280,14 +274,14 @@
39
     intptr_t srcOffset = (mv.x >> 2) + (mv.y >> 2) * srcStride;
40
     const pixel* src = refPic.getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + srcOffset;
41
 
42
-    int xFrac = mv.x & 0x3;
43
-    int yFrac = mv.y & 0x3;
44
-
45
     int partEnum = partitionFromSizes(pu.width, pu.height);
46
 
47
     X265_CHECK((pu.width % 4) + (pu.height % 4) == 0, "width or height not divisible by 4\n");
48
     X265_CHECK(dstStride == MAX_CU_SIZE, "stride expected to be max cu size\n");
49
 
50
+    int xFrac = mv.x & 3;
51
+    int yFrac = mv.y & 3;
52
+
53
     if (!(yFrac | xFrac))
54
         primitives.pu[partEnum].convert_p2s(src, srcStride, dst, dstStride);
55
     else if (!yFrac)
56
@@ -296,11 +290,12 @@
57
         primitives.pu[partEnum].luma_vps(src, srcStride, dst, dstStride, yFrac);
58
     else
59
     {
60
-        int tmpStride = pu.width;
61
-        int filterSize = NTAPS_LUMA;
62
-        int halfFilterSize = (filterSize >> 1);
63
-        primitives.pu[partEnum].luma_hps(src, srcStride, m_immedVals, tmpStride, xFrac, 1);
64
-        primitives.pu[partEnum].luma_vss(m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, yFrac);
65
+        ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);
66
+        int immedStride = pu.width;
67
+        int halfFilterSize = NTAPS_LUMA >> 1;
68
+
69
+        primitives.pu[partEnum].luma_hps(src, srcStride, immed, immedStride, xFrac, 1);
70
+        primitives.pu[partEnum].luma_vss(immed + (halfFilterSize - 1) * immedStride, immedStride, dst, dstStride, yFrac);
71
     }
72
 }
73
 
74
@@ -309,10 +304,10 @@
75
     intptr_t dstStride = dstYuv.m_csize;
76
     intptr_t refStride = refPic.m_strideC;
77
 
78
-    int shiftHor = (2 + m_hChromaShift);
79
-    int shiftVer = (2 + m_vChromaShift);
80
+    int mvx = mv.x << (1 - m_hChromaShift);
81
+    int mvy = mv.y << (1 - m_vChromaShift);
82
 
83
-    intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride;
84
+    intptr_t refOffset = (mvx >> 3) + (mvy >> 3) * refStride;
85
 
86
     const pixel* refCb = refPic.getCbAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset;
87
     const pixel* refCr = refPic.getCrAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset;
88
@@ -320,11 +315,11 @@
89
     pixel* dstCb = dstYuv.getCbAddr(pu.puAbsPartIdx);
90
     pixel* dstCr = dstYuv.getCrAddr(pu.puAbsPartIdx);
91
 
92
-    int xFrac = mv.x & ((1 << shiftHor) - 1);
93
-    int yFrac = mv.y & ((1 << shiftVer) - 1);
94
-
95
     int partEnum = partitionFromSizes(pu.width, pu.height);
96
-    
97
+
98
+    int xFrac = mvx & 7;
99
+    int yFrac = mvy & 7;
100
+
101
     if (!(yFrac | xFrac))
102
     {
103
         primitives.chroma[m_csp].pu[partEnum].copy_pp(dstCb, dstStride, refCb, refStride);
104
@@ -332,37 +327,36 @@
105
     }
106
     else if (!yFrac)
107
     {
108
-        primitives.chroma[m_csp].pu[partEnum].filter_hpp(refCb, refStride, dstCb, dstStride, xFrac << (1 - m_hChromaShift));
109
-        primitives.chroma[m_csp].pu[partEnum].filter_hpp(refCr, refStride, dstCr, dstStride, xFrac << (1 - m_hChromaShift));
110
+        primitives.chroma[m_csp].pu[partEnum].filter_hpp(refCb, refStride, dstCb, dstStride, xFrac);
111
+        primitives.chroma[m_csp].pu[partEnum].filter_hpp(refCr, refStride, dstCr, dstStride, xFrac);
112
     }
113
     else if (!xFrac)
114
     {
115
-        primitives.chroma[m_csp].pu[partEnum].filter_vpp(refCb, refStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
116
-        primitives.chroma[m_csp].pu[partEnum].filter_vpp(refCr, refStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
117
+        primitives.chroma[m_csp].pu[partEnum].filter_vpp(refCb, refStride, dstCb, dstStride, yFrac);
118
+        primitives.chroma[m_csp].pu[partEnum].filter_vpp(refCr, refStride, dstCr, dstStride, yFrac);
119
     }
120
     else
121
     {
122
-        int extStride = pu.width >> m_hChromaShift;
123
-        int filterSize = NTAPS_CHROMA;
124
-        int halfFilterSize = (filterSize >> 1);
125
-
126
-        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
127
-        primitives.chroma[m_csp].pu[partEnum].filter_vsp(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
128
-
129
-        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
130
-        primitives.chroma[m_csp].pu[partEnum].filter_vsp(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
131
+        ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_CHROMA - 1)]);
132
+        int immedStride = pu.width >> m_hChromaShift;
133
+        int halfFilterSize = NTAPS_CHROMA >> 1;
134
+
135
+        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, immed, immedStride, xFrac, 1);
136
+        primitives.chroma[m_csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * immedStride, immedStride, dstCb, dstStride, yFrac);
137
+        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, immed, immedStride, xFrac, 1);
138
+        primitives.chroma[m_csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * immedStride, immedStride, dstCr, dstStride, yFrac);
139
     }
140
 }
141
 
142
 void Predict::predInterChromaShort(const PredictionUnit& pu, ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const
143
 {
144
-    intptr_t refStride = refPic.m_strideC;
145
     intptr_t dstStride = dstSYuv.m_csize;
146
+    intptr_t refStride = refPic.m_strideC;
147
 
148
-    int shiftHor = (2 + m_hChromaShift);
149
-    int shiftVer = (2 + m_vChromaShift);
150
+    int mvx = mv.x << (1 - m_hChromaShift);
151
+    int mvy = mv.y << (1 - m_vChromaShift);
152
 
153
-    intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride;
154
+    intptr_t refOffset = (mvx >> 3) + (mvy >> 3) * refStride;
155
 
156
     const pixel* refCb = refPic.getCbAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset;
157
     const pixel* refCr = refPic.getCrAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset;
158
@@ -370,15 +364,15 @@
159
     int16_t* dstCb = dstSYuv.getCbAddr(pu.puAbsPartIdx);
160
     int16_t* dstCr = dstSYuv.getCrAddr(pu.puAbsPartIdx);
161
 
162
-    int xFrac = mv.x & ((1 << shiftHor) - 1);
163
-    int yFrac = mv.y & ((1 << shiftVer) - 1);
164
-
165
     int partEnum = partitionFromSizes(pu.width, pu.height);
166
     
167
     uint32_t cxWidth  = pu.width >> m_hChromaShift;
168
 
169
     X265_CHECK(((cxWidth | (pu.height >> m_vChromaShift)) % 2) == 0, "chroma block size expected to be multiple of 2\n");
170
 
171
+    int xFrac = mvx & 7;
172
+    int yFrac = mvy & 7;
173
+
174
     if (!(yFrac | xFrac))
175
     {
176
         primitives.chroma[m_csp].pu[partEnum].p2s(refCb, refStride, dstCb, dstStride);
177
@@ -386,23 +380,24 @@
178
     }
179
     else if (!yFrac)
180
     {
181
-        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, dstCb, dstStride, xFrac << (1 - m_hChromaShift), 0);
182
-        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, dstCr, dstStride, xFrac << (1 - m_hChromaShift), 0);
183
+        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, dstCb, dstStride, xFrac, 0);
184
+        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, dstCr, dstStride, xFrac, 0);
185
     }
186
     else if (!xFrac)
187
     {
188
-        primitives.chroma[m_csp].pu[partEnum].filter_vps(refCb, refStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
189
-        primitives.chroma[m_csp].pu[partEnum].filter_vps(refCr, refStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
190
+        primitives.chroma[m_csp].pu[partEnum].filter_vps(refCb, refStride, dstCb, dstStride, yFrac);
191
+        primitives.chroma[m_csp].pu[partEnum].filter_vps(refCr, refStride, dstCr, dstStride, yFrac);
192
     }
193
     else
194
     {
195
-        int extStride = cxWidth;
196
-        int filterSize = NTAPS_CHROMA;
197
-        int halfFilterSize = (filterSize >> 1);
198
-        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
199
-        primitives.chroma[m_csp].pu[partEnum].filter_vss(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac << (1 - m_vChromaShift));
200
-        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, m_immedVals, extStride, xFrac << (1 - m_hChromaShift), 1);
201
-        primitives.chroma[m_csp].pu[partEnum].filter_vss(m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac << (1 - m_vChromaShift));
202
+        ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_CHROMA - 1)]);
203
+        int immedStride = cxWidth;
204
+        int halfFilterSize = NTAPS_CHROMA >> 1;
205
+
206
+        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCb, refStride, immed, immedStride, xFrac, 1);
207
+        primitives.chroma[m_csp].pu[partEnum].filter_vss(immed + (halfFilterSize - 1) * immedStride, immedStride, dstCb, dstStride, yFrac);
208
+        primitives.chroma[m_csp].pu[partEnum].filter_hps(refCr, refStride, immed, immedStride, xFrac, 1);
209
+        primitives.chroma[m_csp].pu[partEnum].filter_vss(immed + (halfFilterSize - 1) * immedStride, immedStride, dstCr, dstStride, yFrac);
210
     }
211
 }
212
 
213
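Note: the rewritten chroma paths above convert the quarter-pel luma MV into eighth-pel chroma units once (mvx = mv.x << (1 - m_hChromaShift)), so the integer offset becomes a plain >> 3 and the interpolation phase a plain & 7, instead of re-shifting the fraction at every filter call. A small sketch of the arithmetic (illustrative values; hShift is 1 for 4:2:0, 0 for 4:4:4):

    #include <cstdio>

    static void chromaMv(int lumaMvX, int hShift)
    {
        int mvx   = lumaMvX << (1 - hShift); // eighth-pel units in chroma samples
        int whole = mvx >> 3;                // integer chroma-sample offset
        int frac  = mvx & 7;                 // eighth-pel interpolation phase
        printf("luma mv %d -> chroma int %d, frac %d/8\n", lumaMvX, whole, frac);
    }

    int main()
    {
        chromaMv(13, 1); // 4:2:0: mvx stays 13 -> int 1, frac 5/8
        chromaMv(13, 0); // 4:4:4: doubled to 26 -> int 3, frac 2/8
        return 0;
    }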
x265_1.9.tar.gz/source/common/predict.h -> x265_2.0.tar.gz/source/common/predict.h Changed
9
 
1
@@ -73,7 +73,6 @@
2
     };
3
 
4
     ShortYuv  m_predShortYuv[2]; /* temporary storage for weighted prediction */
5
-    int16_t*  m_immedVals;
6
 
7
     // Unfiltered/filtered neighbours of the current partition.
8
     pixel     intraNeighbourBuf[2][258];
9
x265_1.9.tar.gz/source/common/primitives.cpp -> x265_2.0.tar.gz/source/common/primitives.cpp Changed
31
 
1
@@ -238,7 +238,9 @@
2
             primitives.cu[i].intra_pred_allangs = NULL;
3
 
4
 #if ENABLE_ASSEMBLY
5
+#if X265_ARCH_X86
6
         setupInstrinsicPrimitives(primitives, param->cpuid);
7
+#endif
8
         setupAssemblyPrimitives(primitives, param->cpuid);
9
 #endif
10
 
11
@@ -249,7 +251,7 @@
12
 }
13
 }
14
 
15
-#if ENABLE_ASSEMBLY
16
+#if ENABLE_ASSEMBLY && X265_ARCH_X86
17
 /* these functions are implemented in assembly. When assembly is not being
18
  * compiled, they are unnecessary and can be NOPs */
19
 #else
20
@@ -258,7 +260,10 @@
21
 void PFX(cpu_emms)(void) {}
22
 void PFX(cpu_cpuid)(uint32_t, uint32_t *eax, uint32_t *, uint32_t *, uint32_t *) { *eax = 0; }
23
 void PFX(cpu_xgetbv)(uint32_t, uint32_t *, uint32_t *) {}
24
+
25
+#if X265_ARCH_ARM == 0
26
 void PFX(cpu_neon_test)(void) {}
27
 int PFX(cpu_fast_neon_mrc_test)(void) { return 0; }
28
+#endif // X265_ARCH_ARM
29
 }
30
 #endif
31
x265_1.9.tar.gz/source/common/primitives.h -> x265_2.0.tar.gz/source/common/primitives.h Changed
36
 
1
@@ -189,6 +189,9 @@
2
 
3
 typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
4
 
5
+typedef void (*cutree_fix8_unpack)(double *dst, uint16_t *src, int count);
6
+typedef void (*cutree_fix8_pack)(uint16_t *dst, double *src, int count);
7
+
8
 typedef int (*scanPosLast_t)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
9
 typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
10
 
11
@@ -197,6 +200,7 @@
12
 typedef uint32_t (*costC1C2Flag_t)(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset);
13
 
14
 typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
15
+typedef void (*pelFilterChroma_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ);
16
 
17
 /* Function pointers to optimized encoder primitives. Each pointer can reference
18
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
19
@@ -313,6 +317,8 @@
20
 
21
     downscale_t           frameInitLowres;
22
     cutree_propagate_cost propagateCost;
23
+    cutree_fix8_unpack    fix8Unpack;
24
+    cutree_fix8_pack      fix8Pack;
25
 
26
     extendCURowBorder_t   extendRowBorder;
27
     planecopy_cp_t        planecopy_cp;
28
@@ -332,6 +338,7 @@
29
     costC1C2Flag_t        costC1C2Flag;
30
 
31
     pelFilterLumaStrong_t pelFilterLumaStrong[2]; // EDGE_VER = 0, EDGE_HOR = 1
32
+    pelFilterChroma_t     pelFilterChroma[2];     // EDGE_VER = 0, EDGE_HOR = 1
33
 
34
     /* There is one set of chroma primitives per color space. An encoder will
35
      * have just a single color space and thus it will only ever use one entry
36
x265_1.9.tar.gz/source/common/quant.cpp -> x265_2.0.tar.gz/source/common/quant.cpp Changed
21
 
1
@@ -188,10 +188,9 @@
2
     m_nr           = NULL;
3
 }
4
 
5
-bool Quant::init(int rdoqLevel, double psyScale, const ScalingList& scalingList, Entropy& entropy)
6
+bool Quant::init(double psyScale, const ScalingList& scalingList, Entropy& entropy)
7
 {
8
     m_entropyCoder = &entropy;
9
-    m_rdoqLevel    = rdoqLevel;
10
     m_psyRdoqScale = (int32_t)(psyScale * 256.0);
11
     X265_CHECK((psyScale * 256.0) < (double)MAX_INT, "psyScale value too large\n");
12
     m_scalingList  = &scalingList;
13
@@ -223,6 +222,7 @@
14
 {
15
     m_nr = m_frameNr ? &m_frameNr[ctu.m_encData->m_frameEncoderID] : NULL;
16
     m_qpParam[TEXT_LUMA].setQpParam(qp + QP_BD_OFFSET);
17
+    m_rdoqLevel = ctu.m_encData->m_param->rdoqLevel;
18
     if (ctu.m_chromaFormat != X265_CSP_I400)
19
     {
20
         setChromaQP(qp + ctu.m_slice->m_pps->chromaQpOffset[0], TEXT_CHROMA_U, ctu.m_chromaFormat);
21
x265_1.9.tar.gz/source/common/quant.h -> x265_2.0.tar.gz/source/common/quant.h Changed
10
 
1
@@ -100,7 +100,7 @@
2
     ~Quant();
3
 
4
     /* one-time setup */
5
-    bool init(int rdoqLevel, double psyScale, const ScalingList& scalingList, Entropy& entropy);
6
+    bool init(double psyScale, const ScalingList& scalingList, Entropy& entropy);
7
     bool allocNoiseReduction(const x265_param& param);
8
 
9
     /* CU setup */
10
x265_1.9.tar.gz/source/common/scalinglist.cpp -> x265_2.0.tar.gz/source/common/scalinglist.cpp Changed
60
 
1
@@ -57,7 +57,11 @@
2
     },
3
     {
4
         "INTRA32X32_LUMA",
5
+        "",
6
+        "",
7
         "INTER32X32_LUMA",
8
+        "",
9
+        "",
10
     },
11
 };
12
 const char MatrixType_DC[4][12][22] =
13
@@ -76,7 +80,11 @@
14
     },
15
     {
16
         "INTRA32X32_LUMA_DC",
17
+        "",
18
+        "",
19
         "INTER32X32_LUMA_DC",
20
+        "",
21
+        "",
22
     },
23
 };
24
 
25
@@ -246,15 +254,15 @@
26
 
27
     char line[1024];
28
     int32_t *src = NULL;
29
+    fseek(fp, 0, 0);
30
 
31
     for (int sizeIdc = 0; sizeIdc < NUM_SIZES; sizeIdc++)
32
     {
33
         int size = X265_MIN(MAX_MATRIX_COEF_NUM, s_numCoefPerSize[sizeIdc]);
34
-        for (int listIdc = 0; listIdc < NUM_LISTS; listIdc++)
35
+        for (int listIdc = 0; listIdc < NUM_LISTS;  listIdc += (sizeIdc == 3) ? 3 : 1)
36
         {
37
             src = m_scalingListCoef[sizeIdc][listIdc];
38
 
39
-            fseek(fp, 0, 0);
40
             do
41
             {
42
                 char *ret = fgets(line, 1024, fp);
43
@@ -282,7 +290,6 @@
44
 
45
             if (sizeIdc > BLOCK_8x8)
46
             {
47
-                fseek(fp, 0, 0);
48
                 do
49
                 {
50
                     char *ret = fgets(line, 1024, fp);
51
@@ -310,7 +317,7 @@
52
     fclose(fp);
53
 
54
     m_bEnabled = true;
55
-    m_bDataPresent = !checkDefaultScalingList();
56
+    m_bDataPresent = true;
57
 
58
     return false;
59
 }
60
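Note: the new listIdc stride pairs with the padded MatrixType tables above: for 32x32 (sizeIdc == 3) only the intra- and inter-luma lists exist, at indices 0 and 3, so the parser now steps by 3 instead of scanning all six list slots. A toy loop showing which (size, list) pairs get visited (illustrative constants):

    #include <cstdio>

    int main()
    {
        const int NUM_SIZES = 4, NUM_LISTS = 6;
        for (int sizeIdc = 0; sizeIdc < NUM_SIZES; sizeIdc++)
            for (int listIdc = 0; listIdc < NUM_LISTS; listIdc += (sizeIdc == 3) ? 3 : 1)
                printf("size %d list %d\n", sizeIdc, listIdc); // sizeIdc 3 visits 0 and 3 only
        return 0;
    }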
x265_1.9.tar.gz/source/common/shortyuv.cpp -> x265_2.0.tar.gz/source/common/shortyuv.cpp Changed
15
 
1
@@ -78,11 +78,11 @@
2
     memset(m_buf[2], 0, (m_csize * m_csize) * sizeof(int16_t));
3
 }
4
 
5
-void ShortYuv::subtract(const Yuv& srcYuv0, const Yuv& srcYuv1, uint32_t log2Size)
6
+void ShortYuv::subtract(const Yuv& srcYuv0, const Yuv& srcYuv1, uint32_t log2Size, int picCsp)
7
 {
8
     const int sizeIdx = log2Size - 2;
9
     primitives.cu[sizeIdx].sub_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
10
-    if (m_csp != X265_CSP_I400)
11
+    if (m_csp != X265_CSP_I400 && picCsp != X265_CSP_I400)
12
     {
13
         primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
14
         primitives.chroma[m_csp].cu[sizeIdx].sub_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
15
x265_1.9.tar.gz/source/common/shortyuv.h -> x265_2.0.tar.gz/source/common/shortyuv.h Changed
10
 
1
@@ -64,7 +64,7 @@
2
     const int16_t* getCrAddr(uint32_t absPartIdx) const                         { return m_buf[2] + getChromaAddrOffset(absPartIdx); }
3
     const int16_t* getChromaAddr(uint32_t chromaId, uint32_t partUnitIdx) const { return m_buf[chromaId] + getChromaAddrOffset(partUnitIdx); }
4
 
5
-    void subtract(const Yuv& srcYuv0, const Yuv& srcYuv1, uint32_t log2Size);
6
+    void subtract(const Yuv& srcYuv0, const Yuv& srcYuv1, uint32_t log2Size, int picCsp);
7
 
8
     void copyPartToPartLuma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2Size) const;
9
     void copyPartToPartChroma(ShortYuv& dstYuv, uint32_t absPartIdx, uint32_t log2SizeL) const;
10
x265_1.9.tar.gz/source/common/threadpool.cpp -> x265_2.0.tar.gz/source/common/threadpool.cpp Changed
207
 
1
@@ -28,6 +28,10 @@
2
 
3
 #include <new>
4
 
5
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
6
+#include <winnt.h>
7
+#endif
8
+
9
 #if X86_64
10
 
11
 #ifdef __GNUC__
12
@@ -64,6 +68,21 @@
13
 # define strcasecmp _stricmp
14
 #endif
15
 
16
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
17
+const uint64_t m1 = 0x5555555555555555; //binary: 0101...
18
+const uint64_t m2 = 0x3333333333333333; //binary: 00110011..
19
+const uint64_t m3 = 0x0f0f0f0f0f0f0f0f; //binary:  4 zeros,  4 ones ...
20
+const uint64_t h01 = 0x0101010101010101; //the sum of 256 to the power of 0,1,2,3...
21
+
22
+static int popCount(uint64_t x)
23
+{
24
+    x -= (x >> 1) & m1;
25
+    x = (x & m2) + ((x >> 2) & m2);
26
+    x = (x + (x >> 4)) & m3;
27
+    return (x * h01) >> 56;
28
+}
29
+#endif
30
+
31
 namespace X265_NS {
32
 // x265 private namespace
33
 
34
@@ -238,7 +257,6 @@
35
     memset(nodeMaskPerPool, 0, sizeof(nodeMaskPerPool));
36
 
37
     int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
38
-    int cpuCount = getCpuCount();
39
     bool bNumaSupport = false;
40
 
41
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
42
@@ -248,26 +266,54 @@
43
 #endif
44
 
45
 
46
-    for (int i = 0; i < cpuCount; i++)
47
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
48
+    PGROUP_AFFINITY groupAffinityPointer = new GROUP_AFFINITY;
49
+    for (int i = 0; i < numNumaNodes; i++)
50
     {
51
-#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
52
-        UCHAR node;
53
-        if (GetNumaProcessorNode((UCHAR)i, &node))
54
-            cpusPerNode[X265_MIN(node, (UCHAR)MAX_NODE_NUM)]++;
55
-        else
56
+        GetNumaNodeProcessorMaskEx((UCHAR)i, groupAffinityPointer);
57
+        cpusPerNode[i] = popCount(groupAffinityPointer->Mask);
58
+    }
59
+    delete groupAffinityPointer;
60
 #elif HAVE_LIBNUMA
61
-        if (bNumaSupport >= 0)
62
-            cpusPerNode[X265_MIN(numa_node_of_cpu(i), MAX_NODE_NUM)]++;
63
-        else
64
-#endif
65
-            cpusPerNode[0]++;
66
+    if (bNumaSupport)
67
+    {
68
+        struct bitmask* bitMask = numa_allocate_cpumask();
69
+        for (int i = 0; i < numNumaNodes; i++)
70
+        {
71
+            int ret = numa_node_to_cpus(i, bitMask);
72
+            if (!ret)
73
+                cpusPerNode[i] = numa_bitmask_weight(bitMask);
74
+            else
75
+                x265_log(p, X265_LOG_ERROR, "Failed to generate CPU mask\n");
76
+        }
77
+        numa_free_cpumask(bitMask);
78
     }
79
+#else // NUMA not supported
80
+    cpusPerNode[0] = getCpuCount();
81
+#endif
82
 
83
     if (bNumaSupport && p->logLevel >= X265_LOG_DEBUG)
84
-        for (int i = 0; i < numNumaNodes; i++)
85
-            x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]);
86
-
87
-    /* limit threads based on param->numaPools */
88
+    for (int i = 0; i < numNumaNodes; i++)
89
+        x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]);
90
+    /* limit threads based on param->numaPools
91
+     * On Windows, threads cannot be allocated across sockets, so the
92
+     * default behavior changes to per-socket pools -- FIXME */
93
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
94
+     if (!p->numaPools)
95
+     {
96
+         char poolString[50] = "";
97
+         for (int i = 0; i < numNumaNodes; i++)
98
+         {
99
+             char nextCount[10] = "";
100
+             if (i)
101
+                 sprintf(nextCount, ",%d", cpusPerNode[i]);
102
+             else
103
+                 sprintf(nextCount, "%d", cpusPerNode[i]);
104
+             strcat(poolString, nextCount);
105
+         }
106
+         x265_param_parse(p, "pools", poolString);
107
+     }
108
+#endif
109
     if (p->numaPools && *p->numaPools)
110
     {
111
         const char *nodeStr = p->numaPools;
112
@@ -280,7 +326,7 @@
113
             }
114
             else if (*nodeStr == '-')
115
                 threadsPerPool[i] = 0;
116
-           else if (*nodeStr == '*' || !strcasecmp(nodeStr, "NULL"))
117
+            else if (*nodeStr == '*' || !strcasecmp(nodeStr, "NULL"))
118
             {
119
                 for (int j = i; j < numNumaNodes; j++)
120
                 {
121
@@ -297,8 +343,16 @@
122
             else
123
             {
124
                 int count = atoi(nodeStr);
125
-                threadsPerPool[i] = X265_MIN(count, cpusPerNode[i]);
126
-                nodeMaskPerPool[i] = ((uint64_t)1 << i);
127
+                if (i > 0 || strchr(nodeStr, ','))   // comma-separated list -> old per-node logic
128
+                {
129
+                    threadsPerPool[i] = X265_MIN(count, cpusPerNode[i]);
130
+                    nodeMaskPerPool[i] = ((uint64_t)1 << i);
131
+                }
132
+                else                                 // new logic: exactly 'count' threads across all NUMA nodes
133
+                {
134
+                    threadsPerPool[numNumaNodes] = X265_MIN(count, numNumaNodes * MAX_POOL_THREADS);
135
+                    nodeMaskPerPool[numNumaNodes] = ((uint64_t)-1 >> (64 - numNumaNodes));
136
+                }
137
             }
138
 
139
             /* consume current node string, comma, and white-space */
140
@@ -389,16 +443,15 @@
141
     X265_CHECK(numThreads <= MAX_POOL_THREADS, "a single thread pool cannot have more than MAX_POOL_THREADS threads\n");
142
 
143
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
144
-    m_winCpuMask = 0x0;
145
-    GROUP_AFFINITY groupAffinity;
146
+    memset(&m_groupAffinity, 0, sizeof(GROUP_AFFINITY));
147
     for (int i = 0; i < getNumaNodeCount(); i++)
148
     {
149
         int numaNode = ((nodeMask >> i) & 0x1U) ? i : -1;
150
         if (numaNode != -1)
151
-            if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
152
-                m_winCpuMask |= groupAffinity.Mask;
153
+        if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &m_groupAffinity))
154
+            break;
155
     }
156
-    m_numaMask = &m_winCpuMask;
157
+    m_numaMask = &m_groupAffinity.Mask;
158
 #elif HAVE_LIBNUMA
159
     if (numa_available() >= 0)
160
     {
161
@@ -480,11 +533,16 @@
162
     setThreadNodeAffinity(m_numaMask);
163
 }
164
 
165
-/* static */
166
 void ThreadPool::setThreadNodeAffinity(void *numaMask)
167
 {
168
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
169
-    if (SetThreadAffinityMask(GetCurrentThread(), *((DWORD_PTR*)numaMask)))
170
+    UNREFERENCED_PARAMETER(numaMask);
171
+    GROUP_AFFINITY groupAffinity;
172
+    memset(&groupAffinity, 0, sizeof(GROUP_AFFINITY));
173
+    groupAffinity.Group = m_groupAffinity.Group;
174
+    groupAffinity.Mask = m_groupAffinity.Mask;
175
+    const PGROUP_AFFINITY affinityPointer = &groupAffinity;
176
+    if (SetThreadGroupAffinity(GetCurrentThread(), affinityPointer, NULL))
177
         return;
178
     else
179
         x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity for NUMA node mask\n");
180
@@ -524,10 +582,25 @@
181
 /* static */
182
 int ThreadPool::getCpuCount()
183
 {
184
-#if _WIN32
185
+#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
186
+    enum { MAX_NODE_NUM = 127 };
187
+    int cpus = 0;
188
+    int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
189
+    GROUP_AFFINITY groupAffinity;
190
+    for (int i = 0; i < numNumaNodes; i++)
191
+    {
192
+        GetNumaNodeProcessorMaskEx((UCHAR)i, &groupAffinity);
193
+        cpus += popCount(groupAffinity.Mask);
194
+    }
195
+    return cpus;
196
+#elif _WIN32
197
     SYSTEM_INFO sysinfo;
198
     GetSystemInfo(&sysinfo);
199
     return sysinfo.dwNumberOfProcessors;
200
+#elif __unix__ && X265_ARCH_ARM
201
+    /* Return the number of processors configured by the OS; most embedded Linux
202
+     * distributions bring up only one processor because the scheduler rarely has enough work to utilize them all */
203
+    return sysconf(_SC_NPROCESSORS_CONF);
204
 #elif __unix__
205
     return sysconf(_SC_NPROCESSORS_ONLN);
206
 #elif MACOS
207
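Note: the popCount helper added for Windows above is the classic SWAR bit count over the 64-bit GROUP_AFFINITY mask. A self-checking sketch (illustrative mask value, not x265 code):

    #include <cstdint>
    #include <cstdio>

    static int popCount(uint64_t x)
    {
        x -= (x >> 1) & 0x5555555555555555ULL;                                // pairwise sums
        x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); // nibble sums
        x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;                           // byte sums
        return (int)((x * 0x0101010101010101ULL) >> 56);                      // fold into top byte
    }

    int main()
    {
        uint64_t mask = 0x00000000FFFF0000ULL; // e.g. 16 logical cores in one group
        int naive = 0;
        for (uint64_t m = mask; m; m >>= 1)
            naive += (int)(m & 1);
        printf("popCount=%d naive=%d\n", popCount(mask), naive); // both print 16
        return 0;
    }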
x265_1.9.tar.gz/source/common/threadpool.h -> x265_2.0.tar.gz/source/common/threadpool.h Changed
26
 
1
@@ -85,7 +85,7 @@
2
     int           m_numWorkers;
3
     void*         m_numaMask; // node mask in linux, cpu mask in windows
4
 #if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
5
-    DWORD_PTR     m_winCpuMask;
6
+    GROUP_AFFINITY m_groupAffinity;
7
 #endif
8
     bool          m_isActive;
9
 
10
@@ -99,6 +99,7 @@
11
     bool start();
12
     void stopWorkers();
13
     void setCurrentThreadAffinity();
14
+    void setThreadNodeAffinity(void *numaMask);
15
     int  tryAcquireSleepingThread(sleepbitmap_t firstTryBitmap, sleepbitmap_t secondTryBitmap);
16
     int  tryBondPeers(int maxPeers, sleepbitmap_t peerBitmap, BondedTaskGroup& master);
17
 
18
@@ -106,7 +107,6 @@
19
 
20
     static int  getCpuCount();
21
     static int  getNumaNodeCount();
22
-    static void setThreadNodeAffinity(void *numaMask);
23
 };
24
 
25
 /* Any worker thread may enlist the help of idle worker threads from the same
26
x265_1.9.tar.gz/source/common/x86/asm-primitives.cpp -> x265_2.0.tar.gz/source/common/x86/asm-primitives.cpp Changed
119
 
1
@@ -861,12 +861,12 @@
2
 template<int size>
3
 void interp_8tap_hv_pp_cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
4
 {
5
-    ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA)]);
6
-    const int filterSize = NTAPS_LUMA;
7
-    const int halfFilterSize = filterSize >> 1;
8
+    ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);
9
+    const int halfFilterSize = NTAPS_LUMA >> 1;
10
+    const int immedStride = MAX_CU_SIZE;
11
 
12
-    primitives.pu[size].luma_hps(src, srcStride, immed, MAX_CU_SIZE, idxX, 1);
13
-    primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) * MAX_CU_SIZE, MAX_CU_SIZE, dst, dstStride, idxY);
14
+    primitives.pu[size].luma_hps(src, srcStride, immed, immedStride, idxX, 1);
15
+    primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) * immedStride, immedStride, dst, dstStride, idxY);
16
 }
17
 
18
 #if HIGH_BIT_DEPTH
19
@@ -1098,9 +1098,16 @@
20
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s = PFX(filterPixelToShort_8x2_ssse3);
21
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s = PFX(filterPixelToShort_8x6_ssse3);
22
         p.findPosFirstLast = PFX(findPosFirstLast_ssse3);
23
+        p.fix8Unpack = PFX(cutree_fix8_unpack_ssse3);
24
+        p.fix8Pack = PFX(cutree_fix8_pack_ssse3);
25
     }
26
     if (cpuMask & X265_CPU_SSE4)
27
     {
28
+        p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
29
+        p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4);
30
+        p.pelFilterChroma[0] = PFX(pelFilterChroma_V_sse4);
31
+        p.pelFilterChroma[1] = PFX(pelFilterChroma_H_sse4);
32
+
33
         p.saoCuOrgE0 = PFX(saoCuOrgE0_sse4);
34
         p.saoCuOrgE1 = PFX(saoCuOrgE1_sse4);
35
         p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_sse4);
36
@@ -1166,6 +1173,12 @@
37
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s = PFX(filterPixelToShort_2x16_sse4);
38
         p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
39
         p.costCoeffRemain = PFX(costCoeffRemain_sse4);
40
+#if X86_64
41
+        p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
42
+        p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4);
43
+        p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4);
44
+        p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
45
+#endif
46
     }
47
     if (cpuMask & X265_CPU_AVX)
48
     {
49
@@ -2141,11 +2154,23 @@
50
 
51
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
52
         p.propagateCost = PFX(mbtree_propagate_cost_avx2);
53
+        p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
54
+        p.fix8Pack = PFX(cutree_fix8_pack_avx2);
55
+
56
+        /* TODO: This kernel needs to be modified to work with HIGH_BIT_DEPTH only 
57
+        p.planeClipAndMax = PFX(planeClipAndMax_avx2); */
58
 
59
         // TODO: depends on hps and vsp
60
         ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);                        // calling luma_hvpp for all sizes
61
         p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;             // ALL_LUMA_PU_T has declared all sizes except 4x4, hence calling luma_hvpp[4x4] 
62
 
63
+#if X265_DEPTH == 10
64
+        p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx2);
65
+        p.cu[LUMA_8x8].sa8d = PFX(pixel_sa8d_8x8_avx2);
66
+        p.cu[LUMA_16x16].sa8d = PFX(pixel_sa8d_16x16_avx2);
67
+        p.cu[LUMA_32x32].sa8d = PFX(pixel_sa8d_32x32_avx2);
68
+#endif
69
+
70
         if (cpuMask & X265_CPU_BMI2)
71
         {
72
             p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
73
@@ -2434,6 +2459,8 @@
74
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_ssse3);
75
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_ssse3);
76
         p.findPosFirstLast = PFX(findPosFirstLast_ssse3);
77
+        p.fix8Unpack = PFX(cutree_fix8_unpack_ssse3);
78
+        p.fix8Pack = PFX(cutree_fix8_pack_ssse3);
79
     }
80
     if (cpuMask & X265_CPU_SSE4)
81
     {
82
@@ -2529,8 +2556,10 @@
83
 #if X86_64
84
         p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
85
         p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4);
86
+        p.pelFilterChroma[0] = PFX(pelFilterChroma_V_sse4);
87
+        p.pelFilterChroma[1] = PFX(pelFilterChroma_H_sse4);
88
 
89
-        p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
90
+//        p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
91
         p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
92
         p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4);
93
         p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4);
94
@@ -2932,6 +2961,7 @@
95
         p.cu[BLOCK_8x8].intra_pred[14] = PFX(intra_pred_ang8_14_avx2);
96
         p.cu[BLOCK_8x8].intra_pred[15] = PFX(intra_pred_ang8_15_avx2);
97
         p.cu[BLOCK_8x8].intra_pred[16] = PFX(intra_pred_ang8_16_avx2);
98
+        p.cu[BLOCK_8x8].intra_pred[17] = PFX(intra_pred_ang8_17_avx2);
99
         p.cu[BLOCK_8x8].intra_pred[20] = PFX(intra_pred_ang8_20_avx2);
100
         p.cu[BLOCK_8x8].intra_pred[21] = PFX(intra_pred_ang8_21_avx2);
101
         p.cu[BLOCK_8x8].intra_pred[22] = PFX(intra_pred_ang8_22_avx2);
102
@@ -3651,7 +3681,6 @@
103
         p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx2);
104
         p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx2);
105
         p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx2);
106
-        p.planeClipAndMax = PFX(planeClipAndMax_avx2);
107
 
108
         p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx2);
109
         p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx2);
110
@@ -3663,6 +3692,8 @@
111
         p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx2);
112
         p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx2);
113
         p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx2);
114
+        p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
115
+        p.fix8Pack = PFX(cutree_fix8_pack_avx2);
116
 
117
     }
118
 #endif
119
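Note: the interp_8tap_hv_pp_cpu change above shrinks the intermediate buffer from MAX_CU_SIZE + NTAPS_LUMA rows to MAX_CU_SIZE + NTAPS_LUMA - 1, which is what a separable N-tap filter needs: H output rows require H + N - 1 intermediate rows, and the vertical pass enters the buffer (N/2 - 1) rows in. A quick arithmetic check (assuming MAX_CU_SIZE = 64 and NTAPS_LUMA = 8 as in x265):

    #include <cstdio>

    int main()
    {
        const int MAX_CU_SIZE = 64, NTAPS_LUMA = 8;
        int rows  = MAX_CU_SIZE + NTAPS_LUMA - 1;        // 71 int16_t rows, not 72
        int entry = (NTAPS_LUMA / 2 - 1) * MAX_CU_SIZE;  // vertical pass starts at row 3
        printf("immed rows=%d, vsp entry offset=%d\n", rows, entry);
        return 0;
    }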
x265_1.9.tar.gz/source/common/x86/blockcopy8.asm -> x265_2.0.tar.gz/source/common/x86/blockcopy8.asm Changed
10
 
1
@@ -28,8 +28,6 @@
2
 
3
 SECTION_RODATA 32
4
 
5
-tab_Vm:    db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
6
-
7
 cextern pb_4
8
 cextern pb_1
9
 cextern pb_16
10
x265_1.9.tar.gz/source/common/x86/const-a.asm -> x265_2.0.tar.gz/source/common/x86/const-a.asm Changed
50
 
1
@@ -40,12 +40,16 @@
2
 const pb_8,                 times 32 db 8
3
 const pb_15,                times 32 db 15
4
 const pb_16,                times 32 db 16
5
+const pb_31,                times 32 db 31
6
 const pb_32,                times 32 db 32
7
 const pb_64,                times 32 db 64
8
+const pb_124,               times 32 db 124
9
 const pb_128,               times 32 db 128
10
 const pb_a1,                times 16 db 0xa1
11
 
12
 const pb_01,                times  8 db   0,   1
13
+const pb_0123,              times  4 db   0,   1
14
+                            times  4 db   2,   3
15
 const hsub_mul,             times 16 db   1,  -1
16
 const pw_swap,              times  2 db   6,   7,   4,   5,   2,   3,   0,   1
17
 const pb_unpackbd1,         times  2 db   0,   0,   0,   0,   1,   1,   1,   1,   2,   2,   2,   2,   3,   3,   3,   3
18
@@ -64,6 +68,8 @@
19
                             times 12 db 0x00
20
 const pb_000000000000000F,           db 0xff
21
                             times 15 db 0x00
22
+const pb_shuf_off4,         times  2 db   0,   4,   1,   5,   2,   6,   3,   7
23
+const pw_shuf_off4,         times  1 db   0,   1,   8,   9,   2,   3,  10,  11,   4,   5,  12,  13,   6,   7,  14,  15
24
 
25
 ;; 16-bit constants
26
 
27
@@ -115,6 +121,8 @@
28
 const hmul_16p,             times 16 db   1
29
                             times  8 db   1,  -1
30
 const pw_exp2_0_15,                  dw 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768
31
+const pw_1_ffff,            times  4 dw 1
32
+                            times  4 dw 0xFFFF
33
 
34
 
35
 ;; 32-bit constants
36
@@ -146,10 +154,6 @@
37
 const pd_planar16_mul2,     times  1 dd  15,  14,  13,  12,  11,  10,   9,   8,    7,   6,   5,   4,   3,   2,   1,   0
38
 const trans8_shuf,          times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
39
 
40
-const popcnt_table
41
-%assign x 0
42
-%rep 256
43
-; population count
44
-db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1)
45
-%assign x x+1
46
-%endrep
47
+;; 64-bit constants
48
+
49
+const pq_1,                 times 1 dq 1
50
x265_1.9.tar.gz/source/common/x86/intrapred8.asm -> x265_2.0.tar.gz/source/common/x86/intrapred8.asm Changed
1115
 
1
@@ -355,55 +355,55 @@
2
                             times 8 db (32-22), 22
3
                             times 8 db (32-11), 11
4
 
5
-const ang16_shuf_mode9,    times 8 db 0, 1
6
-                           times 8 db 1, 2
7
+const ang16_shuf_mode9,     times 8 db 0, 1
8
+                            times 8 db 1, 2
9
 
10
-const angHor_tab_9,  db (32-2), 2, (32-4), 4, (32-6), 6, (32-8), 8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16
11
-                     db (32-18), 18, (32-20), 20, (32-22), 22, (32-24),  24, (32-26),  26, (32-28), 28, (32-30), 30, (32-32), 32
12
+const angHor_tab_9,         db (32-2), 2, (32-4), 4, (32-6), 6, (32-8), 8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16
13
+                            db (32-18), 18, (32-20), 20, (32-22), 22, (32-24),  24, (32-26),  26, (32-28), 28, (32-30), 30, (32-32), 32
14
 
15
-const angHor_tab_11, db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
16
-                     db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8),  8, (32- 6),  6, (32- 4),  4, (32- 2),  2, (32- 0),  0
17
+const angHor_tab_11,        db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
18
+                            db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8),  8, (32- 6),  6, (32- 4),  4, (32- 2),  2, (32- 0),  0
19
 
20
-const ang16_shuf_mode12,   db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3
21
-                           db 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2
22
+const ang16_shuf_mode12,    db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3
23
+                            db 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2
24
 
25
-const angHor_tab_12, db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32-7), 7, (32-2), 2, (32-29), 29, (32-24), 24
26
-                     db (32-19), 19, (32-14), 14, (32-9), 9, (32-4), 4, (32-31), 31, (32-26),  26, (32-21), 21, (32-16), 16
27
+const angHor_tab_12,        db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32-7), 7, (32-2), 2, (32-29), 29, (32-24), 24
28
+                            db (32-19), 19, (32-14), 14, (32-9), 9, (32-4), 4, (32-31), 31, (32-26),  26, (32-21), 21, (32-16), 16
29
 
30
-const ang16_shuf_mode13,   db 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 5, 6, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4
31
-                           db 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2
32
-                           db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0
33
+const ang16_shuf_mode13,    db 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 5, 6, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4
34
+                            db 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2
35
+                            db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0
36
 
37
-const angHor_tab_13, db (32-23), 23, (32-14), 14, (32-5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32-1), 1, (32-24), 24
38
-                     db (32-15), 15, (32-6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32-2), 2, (32-25), 25, (32-16), 16
39
+const angHor_tab_13,        db (32-23), 23, (32-14), 14, (32-5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32-1), 1, (32-24), 24
40
+                            db (32-15), 15, (32-6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32-2), 2, (32-25), 25, (32-16), 16
41
 
42
-const ang16_shuf_mode14,   db 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 3, 4, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 5, 6, 4, 5
43
-                           db 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1, 4, 5, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2
44
-                           db 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0
45
+const ang16_shuf_mode14,    db 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 3, 4, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 5, 6, 4, 5
46
+                            db 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1, 4, 5, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2
47
+                            db 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0
48
 
49
-const angHor_tab_14, db (32-19), 19, (32-6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32-5), 5, (32-24), 24
50
-                     db (32-11), 11, (32-30), 30, (32-17), 17, (32-4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16
51
+const angHor_tab_14,        db (32-19), 19, (32-6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32-5), 5, (32-24), 24
52
+                            db (32-11), 11, (32-30), 30, (32-17), 17, (32-4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16
53
 
54
-const ang16_shuf_mode15,   db 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 9, 10, 8, 9, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6
55
-                           db 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2
56
-                           db 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0
57
+const ang16_shuf_mode15,    db 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 9, 10, 8, 9, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6
58
+                            db 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2
59
+                            db 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0
60
 
61
-const angHor_tab_15, db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32-9), 9, (32-24), 24
62
-                     db (32-7), 7, (32-22), 22, (32-5), 5, (32-20), 20, (32-3), 3, (32-18), 18, (32-1), 1, (32- 16), 16
63
+const angHor_tab_15,        db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32-9), 9, (32-24), 24
64
+                            db (32-7), 7, (32-22), 22, (32-5), 5, (32-20), 20, (32-3), 3, (32-18), 18, (32-1), 1, (32- 16), 16
65
 
66
-const ang16_shuf_mode16,   db 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7
67
-                           db 5, 6, 4, 5, 3, 4, 3, 4, 2, 3, 1, 2, 1, 2, 0, 1, 6, 7, 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 2, 3, 1, 2
68
-                           db 0 ,0, 0, 0, 0, 15, 14, 12 , 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0
69
+const ang16_shuf_mode16,    db 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7
70
+                            db 5, 6, 4, 5, 3, 4, 3, 4, 2, 3, 1, 2, 1, 2, 0, 1, 6, 7, 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 2, 3, 1, 2
71
+                            db 0 ,0, 0, 0, 0, 15, 14, 12 , 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0
72
 
73
-const angHor_tab_16, db (32-11), 11, (32-22), 22, (32-1), 1, (32-12), 12, (32-23), 23, (32-2), 2, (32-13), 13, (32-24), 24
74
-                     db (32-3), 3, (32-14), 14, (32-25), 25, (32-4), 4, (32-15), 15, (32-26), 26, (32-5), 5, (32-16), 16
75
+const angHor_tab_16,        db (32-11), 11, (32-22), 22, (32-1), 1, (32-12), 12, (32-23), 23, (32-2), 2, (32-13), 13, (32-24), 24
76
+                            db (32-3), 3, (32-14), 14, (32-25), 25, (32-4), 4, (32-15), 15, (32-26), 26, (32-5), 5, (32-16), 16
77
 
78
-const ang16_shuf_mode17,   db 12, 13, 11, 12, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 13, 14, 12, 13, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8
79
-                           db 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 1, 2, 0, 1, 0, 1, 6, 7, 5, 6, 5, 6, 4, 5, 3, 4, 2, 3, 1, 2, 1, 2
80
-                           db 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0
81
+const ang16_shuf_mode17,    db 12, 13, 11, 12, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 13, 14, 12, 13, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8
82
+                            db 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 1, 2, 0, 1, 0, 1, 6, 7, 5, 6, 5, 6, 4, 5, 3, 4, 2, 3, 1, 2, 1, 2
83
+                            db 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0
84
 
85
-const angHor_tab_17, db (32- 6),  6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4),  4, (32-10), 10, (32-16), 16
86
-                     db (32-22), 22, (32-28), 28, (32- 2),  2, (32- 8),  8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0),  0
87
+const angHor_tab_17,        db (32- 6),  6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4),  4, (32-10), 10, (32-16), 16
88
+                            db (32-22), 22, (32-28), 28, (32- 2),  2, (32- 8),  8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0),  0
89
 
90
 ; Intrapred_angle32x32, modes 1 to 33 constants
91
 const ang32_shuf_mode9,         times 8 db 0, 1
92
@@ -467,6 +467,39 @@
93
                                 dd  0,  0,  2,  3,  0,  0,  7,  1
94
                                 dd  0,  0,  5,  6,  0,  0,  0,  0
95
 
96
+; Intrapred_angle8x8, modes 1 to 33 constants
97
+const ang8_shuf_mode3,          db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  4,  5,  5,  6,  6,  7,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  5,  6,  6,  7,  7,  8
98
+const ang8_shuf_mode4,          db  0,  1,  1,  2,  1,  2,  2,  3,  3,  4,  3,  4,  4,  5,  5,  6,  1,  2,  2,  3,  2,  3,  3,  4,  4,  5,  4,  5,  5,  6,  6,  7
99
+const ang8_shuf_mode5,          db  0,  1,  1,  2,  1,  2,  2,  3,  2,  3,  3,  4,  3,  4,  4,  5,  1,  2,  2,  3,  2,  3,  3,  4,  3,  4,  4,  5,  4,  5,  5,  6
100
+const ang8_shuf_mode6,          db  0,  1,  0,  1,  1,  2,  1,  2,  2,  3,  2,  3,  2,  3,  3,  4,  1,  2,  1,  2,  2,  3,  2,  3,  3,  4,  3,  4,  3,  4,  4,  5
101
+const ang8_shuf_mode7,          db  0,  1,  0,  1,  0,  1,  1,  2,  1,  2,  1,  2,  1,  2,  2,  3,  1,  2,  1,  2,  1,  2,  2,  3,  2,  3,  2,  3,  2,  3,  3,  4
102
+const ang8_shuf_mode8,          db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  2,  3,  2,  3
103
+const ang8_shuf_mode9,          db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2
104
+const ang8_shuf_mode12,         db  7,  8,  7,  8,  7,  8,  7,  8,  7,  8,  7,  8,  6,  7,  6,  7,  8,  9,  8,  9,  8,  9,  8,  9,  8,  9,  8,  9,  7,  8,  7,  8
105
+const ang8_shuf_mode13,         db  8,  9,  8,  9,  8,  9,  7,  8,  7,  8,  7,  8,  7,  8,  6,  7,  9, 10,  9, 10,  9, 10,  8,  9,  8,  9,  8,  9,  8,  9,  7,  8
106
+const ang8_shuf_mode14,         db  9, 10,  9, 10,  8,  9,  8,  9,  7,  8,  7,  8,  7,  8,  6,  7, 10, 11, 10, 11,  9, 10,  9, 10,  8,  9,  8,  9,  8,  9,  7,  8
107
+const ang8_shuf_mode15,         db 10, 11,  9, 10,  9, 10,  8,  9,  8,  9,  7,  8,  7,  8,  6,  7, 11, 12, 10, 11, 10, 11,  9, 10,  9, 10,  8,  9,  8,  9,  7,  8
108
+                                db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  6,  4,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  6,  4,  2,  0
109
+const ang8_shuf_mode16,         db 11, 12, 10, 11, 10, 11,  9, 10,  8,  9,  8,  9,  7,  8,  6,  7, 12, 13, 11, 12, 11, 12, 10, 11,  9, 10,  9, 10,  8,  9,  7,  8
110
+                                db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  6,  5,  3,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  6,  5,  3,  2,  0
111
+const ang8_shuf_mode17,         db 12, 13, 11, 12, 10, 11,  9, 10,  8,  9,  8,  9,  7,  8,  6,  7, 13, 14, 12, 13, 11, 12, 10, 11,  9, 10,  9, 10,  8,  9,  7,  8
112
+                                db  0,  0,  0,  0,  0,  0,  0,  0,  0,  7,  6,  5,  4,  2,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  7,  6,  5,  4,  2,  1,  0
113
+
114
+const ang8_fact_mode3,          db (32-26), 26, (32-20), 20, (32-14), 14, (32- 8),  8, (32- 2),  2, (32-28), 28, (32-22), 22, (32-16), 16
115
+const ang8_fact_mode4,          db (32-21), 21, (32-10), 10, (32-31), 31, (32-20), 20, (32- 9),  9, (32-30), 30, (32-19), 19, (32- 8),  8
116
+const ang8_fact_mode5,          db (32-17), 17, (32- 2),  2, (32-19), 19, (32- 4),  4, (32-21), 21, (32- 6),  6, (32-23), 23, (32- 8),  8
117
+const ang8_fact_mode6,          db (32-13), 13, (32-26), 26, (32- 7),  7, (32-20), 20, (32- 1),  1, (32-14), 14, (32-27), 27, (32- 8),  8
118
+const ang8_fact_mode7,          db (32- 9),  9, (32-18), 18, (32-27), 27, (32- 4),  4, (32-13), 13, (32-22), 22, (32-31), 31, (32- 8),  8
119
+const ang8_fact_mode8,          db (32- 5),  5, (32-10), 10, (32-15), 15, (32-20), 20, (32-25), 25, (32-30), 30, (32- 3),  3, (32- 8),  8
120
+const ang8_fact_mode9,          db (32- 2),  2, (32- 4),  4, (32- 6),  6, (32- 8),  8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16
121
+const ang8_fact_mode11,         db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
122
+const ang8_fact_mode12,         db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32- 7),  7, (32- 2),  2, (32-29), 29, (32-24), 24
123
+const ang8_fact_mode13,         db (32-23), 23, (32-14), 14, (32- 5),  5, (32-28), 28, (32-19), 19, (32-10), 10, (32- 1),  1, (32-24), 24
124
+const ang8_fact_mode14,         db (32-19), 19, (32- 6),  6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32- 5),  5, (32-24), 24
125
+const ang8_fact_mode15,         db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32- 9),  9, (32-24), 24
126
+const ang8_fact_mode16,         db (32-11), 11, (32-22), 22, (32- 1),  1, (32-12), 12, (32-23), 23, (32- 2),  2, (32-13), 13, (32-24), 24
127
+const ang8_fact_mode17,         db (32- 6),  6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4),  4, (32-10), 10, (32-16), 16
128
+
129
 const ang_table
130
 %assign x 0
131
 %rep 32
132
@@ -490,6 +523,7 @@
133
 
134
 SECTION .text
135
 cextern pb_1
136
+cextern pb_2
137
 cextern pw_2
138
 cextern pw_3
139
 cextern pw_4
140
@@ -18582,48 +18616,48 @@
141
 ; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
142
 ;-----------------------------------------------------------------------------------------
143
 INIT_YMM avx2
144
-cglobal intra_pred_ang8_3, 3,4,5
145
-    mova              m3, [pw_1024]
146
+%macro ang8_store8x8 0
147
+    lea               r3, [3 * r1]
148
+    vextracti128      xm2, m1, 1
149
+    vextracti128      xm5, m4, 1
150
+    movq              [r0], xm1
151
+    movq              [r0 + r1], xm2
152
+    movhps            [r0 + 2 * r1], xm1
153
+    movhps            [r0 + r3], xm2
154
+    lea               r0, [r0 + 4 * r1]
155
+    movq              [r0], xm4
156
+    movq              [r0 + r1], xm5
157
+    movhps            [r0 + 2 * r1], xm4
158
+    movhps            [r0 + r3], xm5
159
+%endmacro
160
+
161
+cglobal intra_pred_ang8_3, 3,4,6
162
     vbroadcasti128    m0, [r2 + 17]
163
+    mova              m5, [ang8_shuf_mode3]
164
+    mova              m3, [pb_2]
165
 
166
-    pshufb            m1, m0, [c_ang8_src1_9_2_10]
167
-    pshufb            m2, m0, [c_ang8_src3_11_4_12]
168
-    pshufb            m4, m0, [c_ang8_src5_13_5_13]
169
-    pshufb            m0,     [c_ang8_src6_14_7_15]
170
+    pshufb            m1, m0, m5
171
+    paddb             m5, m3
172
+    pshufb            m2, m0, m5
173
+    paddb             m5, m3
174
+    pshufb            m4, m0, m5
175
+    paddb             m5, m3
176
+    pshufb            m0, m5
177
 
178
-    pmaddubsw         m1, [c_ang8_26_20]
179
+    vbroadcasti128    m5, [ang8_fact_mode3]
180
+    mova              m3, [pw_1024]
181
+    pmaddubsw         m1, m5
182
+    pmaddubsw         m2, m5
183
+    pmaddubsw         m4, m5
184
+    pmaddubsw         m0, m5
185
     pmulhrsw          m1, m3
186
-    pmaddubsw         m2, [c_ang8_14_8]
187
     pmulhrsw          m2, m3
188
-    pmaddubsw         m4, [c_ang8_2_28]
189
     pmulhrsw          m4, m3
190
-    pmaddubsw         m0, [c_ang8_22_16]
191
     pmulhrsw          m0, m3
192
     packuswb          m1, m2
193
     packuswb          m4, m0
194
 
195
-    vperm2i128        m2, m1, m4, 00100000b
196
-    vperm2i128        m1, m1, m4, 00110001b
197
-    punpcklbw         m4, m2, m1
198
-    punpckhbw         m2, m1
199
-    punpcklwd         m1, m4, m2
200
-    punpckhwd         m4, m2
201
-    mova              m0, [trans8_shuf]
202
-    vpermd            m1, m0, m1
203
-    vpermd            m4, m0, m4
204
-
205
-    lea               r3, [3 * r1]
206
-    movq              [r0], xm1
207
-    movhps            [r0 + r1], xm1
208
-    vextracti128      xm2, m1, 1
209
-    movq              [r0 + 2 * r1], xm2
210
-    movhps            [r0 + r3], xm2
211
-    lea               r0, [r0 + 4 * r1]
212
-    movq              [r0], xm4
213
-    movhps            [r0 + r1], xm4
214
-    vextracti128      xm2, m4, 1
215
-    movq              [r0 + 2 * r1], xm2
216
-    movhps            [r0 + r3], xm2
217
+    ang8_store8x8
218
     RET
219
 
220
 INIT_YMM avx2
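The pattern established above for intra_pred_ang8_3 repeats in the rewritten modes below: one per-mode shuffle constant (ang8_shuf_mode*) is advanced with paddb pb_2 between the four pshufb's (the backward-scanning modes step with psubb instead), one vbroadcasti128 of ang8_fact_mode* replaces the four per-row weight constants, and the shared ang8_store8x8 macro writes the 8x8 block without the old vperm2i128/vpermd transpose. The closing pmulhrsw against pw_1024 supplies the (sum + 16) >> 5 rounding; a small self-check of that identity, assuming sums stay in the pmaddubsw range (illustrative sketch, not x265 code):

    #include <assert.h>
    #include <stdint.h>

    /* pmulhrsw(x, 1024) = (2*x*1024 + 0x8000) >> 16 = (x + 16) >> 5 */
    static int16_t pmulhrsw(int16_t a, int16_t b)
    {
        return (int16_t)(((int32_t)a * b * 2 + 0x8000) >> 16);
    }

    int main(void)
    {
        for (int x = 0; x <= 32 * 255; x++)   /* max pmaddubsw sum here */
            assert(pmulhrsw((int16_t)x, 1024) == ((x + 16) >> 5));
        return 0;
    }

One leftover worth a follow-up: the rewritten intra_pred_ang8_11 further down still loads pw_1024 into m3 and immediately overwrites it with pb_2, so that first mova is dead.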
221
@@ -18662,48 +18696,33 @@
222
     RET
223
 
224
 INIT_YMM avx2
225
-cglobal intra_pred_ang8_4, 3,4,5
226
-    mova              m3, [pw_1024]
227
+cglobal intra_pred_ang8_4, 3,4,6
228
     vbroadcasti128    m0, [r2 + 17]
229
+    mova              m5, [ang8_shuf_mode4]
230
+    mova              m3, [pb_2]
231
 
232
-    pshufb            m1, m0, [c_ang8_src1_9_2_10]
233
-    pshufb            m2, m0, [c_ang8_src2_10_3_11]
234
-    pshufb            m4, m0, [c_ang8_src4_12_4_12]
235
-    pshufb            m0,     [c_ang8_src5_13_6_14]
236
+    pshufb            m1, m0, m5
237
+    paddb             m5, m3
238
+    pshufb            m2, m0, m5
239
+    paddb             m5, m3
240
+    pshufb            m4, m0, m5
241
+    paddb             m5, m3
242
+    pshufb            m0, m5
243
 
244
-    pmaddubsw         m1, [c_ang8_21_10]
245
+    vbroadcasti128    m5, [ang8_fact_mode4]
246
+    mova              m3, [pw_1024]
247
+    pmaddubsw         m1, m5
248
+    pmaddubsw         m2, m5
249
+    pmaddubsw         m4, m5
250
+    pmaddubsw         m0, m5
251
     pmulhrsw          m1, m3
252
-    pmaddubsw         m2, [c_ang8_31_20]
253
     pmulhrsw          m2, m3
254
-    pmaddubsw         m4, [c_ang8_9_30]
255
     pmulhrsw          m4, m3
256
-    pmaddubsw         m0, [c_ang8_19_8]
257
     pmulhrsw          m0, m3
258
     packuswb          m1, m2
259
     packuswb          m4, m0
260
 
261
-    vperm2i128        m2, m1, m4, 00100000b
262
-    vperm2i128        m1, m1, m4, 00110001b
263
-    punpcklbw         m4, m2, m1
264
-    punpckhbw         m2, m1
265
-    punpcklwd         m1, m4, m2
266
-    punpckhwd         m4, m2
267
-    mova              m0, [trans8_shuf]
268
-    vpermd            m1, m0, m1
269
-    vpermd            m4, m0, m4
270
-
271
-    lea               r3, [3 * r1]
272
-    movq              [r0], xm1
273
-    movhps            [r0 + r1], xm1
274
-    vextracti128      xm2, m1, 1
275
-    movq              [r0 + 2 * r1], xm2
276
-    movhps            [r0 + r3], xm2
277
-    lea               r0, [r0 + 4 * r1]
278
-    movq              [r0], xm4
279
-    movhps            [r0 + r1], xm4
280
-    vextracti128      xm2, m4, 1
281
-    movq              [r0 + 2 * r1], xm2
282
-    movhps            [r0 + r3], xm2
283
+    ang8_store8x8
284
     RET
285
 
286
 INIT_YMM avx2
287
@@ -18743,48 +18762,33 @@
288
 
289
 
290
 INIT_YMM avx2
291
-cglobal intra_pred_ang8_5, 3, 4, 5
292
-    mova              m3, [pw_1024]
293
+cglobal intra_pred_ang8_5, 3, 4, 6
294
     vbroadcasti128    m0, [r2 + 17]
295
+    mova              m5, [ang8_shuf_mode5]
296
+    mova              m3, [pb_2]
297
 
298
-    pshufb            m1, m0, [c_ang8_src1_9_2_10]
299
-    pshufb            m2, m0, [c_ang8_src2_10_3_11]
300
-    pshufb            m4, m0, [c_ang8_src3_11_4_12]
301
-    pshufb            m0,     [c_ang8_src4_12_5_13]
302
+    pshufb            m1, m0, m5
303
+    paddb             m5, m3
304
+    pshufb            m2, m0, m5
305
+    paddb             m5, m3
306
+    pshufb            m4, m0, m5
307
+    paddb             m5, m3
308
+    pshufb            m0, m5
309
 
310
-    pmaddubsw         m1, [c_ang8_17_2]
311
+    vbroadcasti128    m5, [ang8_fact_mode5]
312
+    mova              m3, [pw_1024]
313
+    pmaddubsw         m1, m5
314
+    pmaddubsw         m2, m5
315
+    pmaddubsw         m4, m5
316
+    pmaddubsw         m0, m5
317
     pmulhrsw          m1, m3
318
-    pmaddubsw         m2, [c_ang8_19_4]
319
     pmulhrsw          m2, m3
320
-    pmaddubsw         m4, [c_ang8_21_6]
321
     pmulhrsw          m4, m3
322
-    pmaddubsw         m0, [c_ang8_23_8]
323
     pmulhrsw          m0, m3
324
     packuswb          m1, m2
325
     packuswb          m4, m0
326
 
327
-    vperm2i128        m2, m1, m4, 00100000b
328
-    vperm2i128        m1, m1, m4, 00110001b
329
-    punpcklbw         m4, m2, m1
330
-    punpckhbw         m2, m1
331
-    punpcklwd         m1, m4, m2
332
-    punpckhwd         m4, m2
333
-    mova              m0, [trans8_shuf]
334
-    vpermd            m1, m0, m1
335
-    vpermd            m4, m0, m4
336
-
337
-    lea               r3, [3 * r1]
338
-    movq              [r0], xm1
339
-    movhps            [r0 + r1], xm1
340
-    vextracti128      xm2, m1, 1
341
-    movq              [r0 + 2 * r1], xm2
342
-    movhps            [r0 + r3], xm2
343
-    lea               r0, [r0 + 4 * r1]
344
-    movq              [r0], xm4
345
-    movhps            [r0 + r1], xm4
346
-    vextracti128      xm2, m4, 1
347
-    movq              [r0 + 2 * r1], xm2
348
-    movhps            [r0 + r3], xm2
349
+    ang8_store8x8
350
     RET
351
 
352
 INIT_YMM avx2
353
@@ -18824,48 +18828,33 @@
354
 
355
 
356
 INIT_YMM avx2
357
-cglobal intra_pred_ang8_6, 3, 4, 5
358
-    mova              m3, [pw_1024]
359
+cglobal intra_pred_ang8_6, 3, 4, 6
360
     vbroadcasti128    m0, [r2 + 17]
361
+    mova              m5, [ang8_shuf_mode6]
362
+    mova              m3, [pb_2]
363
 
364
-    pshufb            m1, m0, [intra_pred_shuff_0_8]
365
-    pshufb            m2, m0, [c_ang8_src2_10_2_10]
366
-    pshufb            m4, m0, [c_ang8_src3_11_3_11]
367
-    pshufb            m0,     [c_ang8_src3_11_4_12]
368
+    pshufb            m1, m0, m5
369
+    paddb             m5, m3
370
+    pshufb            m2, m0, m5
371
+    paddb             m5, m3
372
+    pshufb            m4, m0, m5
373
+    paddb             m5, m3
374
+    pshufb            m0, m5
375
 
376
-    pmaddubsw         m1, [c_ang8_13_26]
377
+    vbroadcasti128    m5, [ang8_fact_mode6]
378
+    mova              m3, [pw_1024]
379
+    pmaddubsw         m1, m5
380
+    pmaddubsw         m2, m5
381
+    pmaddubsw         m4, m5
382
+    pmaddubsw         m0, m5
383
     pmulhrsw          m1, m3
384
-    pmaddubsw         m2, [c_ang8_7_20]
385
     pmulhrsw          m2, m3
386
-    pmaddubsw         m4, [c_ang8_1_14]
387
     pmulhrsw          m4, m3
388
-    pmaddubsw         m0, [c_ang8_27_8]
389
     pmulhrsw          m0, m3
390
     packuswb          m1, m2
391
     packuswb          m4, m0
392
 
393
-    vperm2i128        m2, m1, m4, 00100000b
394
-    vperm2i128        m1, m1, m4, 00110001b
395
-    punpcklbw         m4, m2, m1
396
-    punpckhbw         m2, m1
397
-    punpcklwd         m1, m4, m2
398
-    punpckhwd         m4, m2
399
-    mova              m0, [trans8_shuf]
400
-    vpermd            m1, m0, m1
401
-    vpermd            m4, m0, m4
402
-
403
-    lea               r3, [3 * r1]
404
-    movq              [r0], xm1
405
-    movhps            [r0 + r1], xm1
406
-    vextracti128      xm2, m1, 1
407
-    movq              [r0 + 2 * r1], xm2
408
-    movhps            [r0 + r3], xm2
409
-    lea               r0, [r0 + 4 * r1]
410
-    movq              [r0], xm4
411
-    movhps            [r0 + r1], xm4
412
-    vextracti128      xm2, m4, 1
413
-    movq              [r0 + 2 * r1], xm2
414
-    movhps            [r0 + r3], xm2
415
+    ang8_store8x8
416
     RET
417
 
418
 INIT_YMM avx2
419
@@ -18905,46 +18894,33 @@
420
 
421
 
422
 INIT_YMM avx2
423
-cglobal intra_pred_ang8_9, 3, 5, 5
424
-    mova              m3, [pw_1024]
425
+cglobal intra_pred_ang8_9, 3, 5, 6
426
     vbroadcasti128    m0, [r2 + 17]
427
+    mova              m5, [ang8_shuf_mode9]
428
+    mova              m3, [pb_2]
429
 
430
-    pshufb            m0, [intra_pred_shuff_0_8]
431
+    pshufb            m1, m0, m5
432
+    paddb             m5, m3
433
+    pshufb            m2, m0, m5
434
+    paddb             m5, m3
435
+    pshufb            m4, m0, m5
436
+    paddb             m5, m3
437
+    pshufb            m0, m5
438
 
439
-    lea               r4, [c_ang8_mode_27]
440
-    pmaddubsw         m1, m0, [r4]
441
+    vbroadcasti128    m5, [ang8_fact_mode9]
442
+    mova              m3, [pw_1024]
443
+    pmaddubsw         m1, m5
444
+    pmaddubsw         m2, m5
445
+    pmaddubsw         m4, m5
446
+    pmaddubsw         m0, m5
447
     pmulhrsw          m1, m3
448
-    pmaddubsw         m2, m0, [r4 + mmsize]
449
     pmulhrsw          m2, m3
450
-    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
451
     pmulhrsw          m4, m3
452
-    pmaddubsw         m0, [r4 + 3 * mmsize]
453
     pmulhrsw          m0, m3
454
     packuswb          m1, m2
455
     packuswb          m4, m0
456
 
457
-    vperm2i128        m2, m1, m4, 00100000b
458
-    vperm2i128        m1, m1, m4, 00110001b
459
-    punpcklbw         m4, m2, m1
460
-    punpckhbw         m2, m1
461
-    punpcklwd         m1, m4, m2
462
-    punpckhwd         m4, m2
463
-    mova              m0, [trans8_shuf]
464
-    vpermd            m1, m0, m1
465
-    vpermd            m4, m0, m4
466
-
467
-    lea               r3, [3 * r1]
468
-    movq              [r0], xm1
469
-    movhps            [r0 + r1], xm1
470
-    vextracti128      xm2, m1, 1
471
-    movq              [r0 + 2 * r1], xm2
472
-    movhps            [r0 + r3], xm2
473
-    lea               r0, [r0 + 4 * r1]
474
-    movq              [r0], xm4
475
-    movhps            [r0 + r1], xm4
476
-    vextracti128      xm2, m4, 1
477
-    movq              [r0 + 2 * r1], xm2
478
-    movhps            [r0 + r3], xm2
479
+    ang8_store8x8
480
     RET
481
 
482
 INIT_YMM avx2
483
@@ -19015,48 +18991,33 @@
484
 
485
 
486
 INIT_YMM avx2
487
-cglobal intra_pred_ang8_7, 3, 4, 5
488
-    mova              m3, [pw_1024]
489
+cglobal intra_pred_ang8_7, 3, 4, 6
490
     vbroadcasti128    m0, [r2 + 17]
491
+    mova              m5, [ang8_shuf_mode7]
492
+    mova              m3, [pb_2]
493
 
494
-    pshufb            m1, m0, [intra_pred_shuff_0_8]
495
-    pshufb            m2, m0, [c_ang8_src1_9_2_10]
496
-    pshufb            m4, m0, [c_ang8_src2_10_2_10]
497
-    pshufb            m0,     [c_ang8_src2_10_3_11]
498
+    pshufb            m1, m0, m5
499
+    paddb             m5, m3
500
+    pshufb            m2, m0, m5
501
+    paddb             m5, m3
502
+    pshufb            m4, m0, m5
503
+    paddb             m5, m3
504
+    pshufb            m0, m5
505
 
506
-    pmaddubsw         m1, [c_ang8_9_18]
507
+    vbroadcasti128    m5, [ang8_fact_mode7]
508
+    mova              m3, [pw_1024]
509
+    pmaddubsw         m1, m5
510
+    pmaddubsw         m2, m5
511
+    pmaddubsw         m4, m5
512
+    pmaddubsw         m0, m5
513
     pmulhrsw          m1, m3
514
-    pmaddubsw         m2, [c_ang8_27_4]
515
     pmulhrsw          m2, m3
516
-    pmaddubsw         m4, [c_ang8_13_22]
517
     pmulhrsw          m4, m3
518
-    pmaddubsw         m0, [c_ang8_31_8]
519
     pmulhrsw          m0, m3
520
     packuswb          m1, m2
521
     packuswb          m4, m0
522
 
523
-    vperm2i128        m2, m1, m4, 00100000b
524
-    vperm2i128        m1, m1, m4, 00110001b
525
-    punpcklbw         m4, m2, m1
526
-    punpckhbw         m2, m1
527
-    punpcklwd         m1, m4, m2
528
-    punpckhwd         m4, m2
529
-    mova              m0, [trans8_shuf]
530
-    vpermd            m1, m0, m1
531
-    vpermd            m4, m0, m4
532
-
533
-    lea               r3, [3 * r1]
534
-    movq              [r0], xm1
535
-    movhps            [r0 + r1], xm1
536
-    vextracti128      xm2, m1, 1
537
-    movq              [r0 + 2 * r1], xm2
538
-    movhps            [r0 + r3], xm2
539
-    lea               r0, [r0 + 4 * r1]
540
-    movq              [r0], xm4
541
-    movhps            [r0 + r1], xm4
542
-    vextracti128      xm2, m4, 1
543
-    movq              [r0 + 2 * r1], xm2
544
-    movhps            [r0 + r3], xm2
545
+    ang8_store8x8
546
     RET
547
 
548
 INIT_YMM avx2
549
@@ -19097,48 +19058,32 @@
550
 
551
 INIT_YMM avx2
552
 cglobal intra_pred_ang8_8, 3, 4, 6
553
-    mova              m3, [pw_1024]
554
     vbroadcasti128    m0, [r2 + 17]
555
-    mova              m5, [intra_pred_shuff_0_8]
556
+    mova              m5, [ang8_shuf_mode8]
557
+    mova              m3, [pb_2]
558
 
559
     pshufb            m1, m0, m5
560
+    paddb             m5, m3
561
     pshufb            m2, m0, m5
562
+    paddb             m5, m3
563
     pshufb            m4, m0, m5
564
-    pshufb            m0,     [c_ang8_src2_10_2_10]
565
+    paddb             m5, m3
566
+    pshufb            m0, m5
567
 
568
-    pmaddubsw         m1, [c_ang8_5_10]
569
+    vbroadcasti128    m5, [ang8_fact_mode8]
570
+    mova              m3, [pw_1024]
571
+    pmaddubsw         m1, m5
572
+    pmaddubsw         m2, m5
573
+    pmaddubsw         m4, m5
574
+    pmaddubsw         m0, m5
575
     pmulhrsw          m1, m3
576
-    pmaddubsw         m2, [c_ang8_15_20]
577
     pmulhrsw          m2, m3
578
-    pmaddubsw         m4, [c_ang8_25_30]
579
     pmulhrsw          m4, m3
580
-    pmaddubsw         m0, [c_ang8_3_8]
581
     pmulhrsw          m0, m3
582
     packuswb          m1, m2
583
     packuswb          m4, m0
584
 
585
-    vperm2i128        m2, m1, m4, 00100000b
586
-    vperm2i128        m1, m1, m4, 00110001b
587
-    punpcklbw         m4, m2, m1
588
-    punpckhbw         m2, m1
589
-    punpcklwd         m1, m4, m2
590
-    punpckhwd         m4, m2
591
-    mova              m0, [trans8_shuf]
592
-    vpermd            m1, m0, m1
593
-    vpermd            m4, m0, m4
594
-
595
-    lea               r3, [3 * r1]
596
-    movq              [r0], xm1
597
-    movhps            [r0 + r1], xm1
598
-    vextracti128      xm2, m1, 1
599
-    movq              [r0 + 2 * r1], xm2
600
-    movhps            [r0 + r3], xm2
601
-    lea               r0, [r0 + 4 * r1]
602
-    movq              [r0], xm4
603
-    movhps            [r0 + r1], xm4
604
-    vextracti128      xm2, m4, 1
605
-    movq              [r0 + 2 * r1], xm2
606
-    movhps            [r0 + r3], xm2
607
+    ang8_store8x8
608
     RET
609
 
610
 INIT_YMM avx2
611
@@ -19179,163 +19124,139 @@
612
 
613
 
614
 INIT_YMM avx2
615
-cglobal intra_pred_ang8_11, 3, 5, 5
616
-    mova              m3, [pw_1024]
617
+cglobal intra_pred_ang8_11, 3, 5, 6
618
+    mova              m3, [pw_1024]
619
     movu              xm1, [r2 + 16]
620
     pinsrb            xm1, [r2], 0
621
-    pshufb            xm1, [intra_pred_shuff_0_8]
622
-    vinserti128       m0, m1, xm1, 1
623
+    vinserti128       m0, m1, xm1, 1
624
 
625
-    lea               r4, [c_ang8_mode_25]
626
-    pmaddubsw         m1, m0, [r4]
627
+    mova              m5, [ang8_shuf_mode9]
628
+    mova              m3, [pb_2]
629
+
630
+    pshufb            m1, m0, m5
631
+    paddb             m5, m3
632
+    pshufb            m2, m0, m5
633
+    paddb             m5, m3
634
+    pshufb            m4, m0, m5
635
+    paddb             m5, m3
636
+    pshufb            m0, m5
637
+
638
+    vbroadcasti128    m5, [ang8_fact_mode11]
639
+    mova              m3, [pw_1024]
640
+    pmaddubsw         m1, m5
641
+    pmaddubsw         m2, m5
642
+    pmaddubsw         m4, m5
643
+    pmaddubsw         m0, m5
644
     pmulhrsw          m1, m3
645
-    pmaddubsw         m2, m0, [r4 + mmsize]
646
     pmulhrsw          m2, m3
647
-    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
648
     pmulhrsw          m4, m3
649
-    pmaddubsw         m0, [r4 + 3 * mmsize]
650
     pmulhrsw          m0, m3
651
     packuswb          m1, m2
652
     packuswb          m4, m0
653
 
654
-    vperm2i128        m2, m1, m4, 00100000b
655
-    vperm2i128        m1, m1, m4, 00110001b
656
-    punpcklbw         m4, m2, m1
657
-    punpckhbw         m2, m1
658
-    punpcklwd         m1, m4, m2
659
-    punpckhwd         m4, m2
660
-    mova              m0, [trans8_shuf]
661
-    vpermd            m1, m0, m1
662
-    vpermd            m4, m0, m4
663
-
664
-    lea               r3, [3 * r1]
665
-    movq              [r0], xm1
666
-    movhps            [r0 + r1], xm1
667
-    vextracti128      xm2, m1, 1
668
-    movq              [r0 + 2 * r1], xm2
669
-    movhps            [r0 + r3], xm2
670
-    lea               r0, [r0 + 4 * r1]
671
-    movq              [r0], xm4
672
-    movhps            [r0 + r1], xm4
673
-    vextracti128      xm2, m4, 1
674
-    movq              [r0 + 2 * r1], xm2
675
-    movhps            [r0 + r3], xm2
676
+    ang8_store8x8
677
     RET
678
 
679
 INIT_YMM avx2
680
 cglobal intra_pred_ang8_15, 3, 6, 6
681
-    mova              m3, [pw_1024]
682
-    movu              xm5, [r2 + 16]
683
-    pinsrb            xm5, [r2], 0
684
-    lea               r5, [intra_pred_shuff_0_8]
685
-    mova              xm0, xm5
686
-    pslldq            xm5, 1
687
-    pinsrb            xm5, [r2 + 2], 0
688
-    vinserti128       m0, m0, xm5, 1
689
-    pshufb            m0, [r5]
690
+    vbroadcasti128    m1, [r2 + 17]
691
+    vbroadcasti128    m2, [r2]
692
+    mova              m3, [ang8_shuf_mode15 + mmsize]
693
+    pshufb            m2, m3
694
+    palignr           m1, m2, 11
695
+
696
+    mova              m5, [ang8_shuf_mode15]
697
+    mova              m3, [pb_2]
698
+    pshufb            m0, m1, m5
699
+    psubb             m5, m3
700
+    pshufb            m4, m1, m5
701
+    psubb             m5, m3
702
+    pshufb            m2, m1, m5
703
+    psubb             m5, m3
704
+    pshufb            m1, m5
705
 
706
-    lea               r4, [c_ang8_mode_15]
707
-    pmaddubsw         m1, m0, [r4]
708
+    vbroadcasti128    m5, [ang8_fact_mode15]
709
+    mova              m3, [pw_1024]
710
+    pmaddubsw         m1, m5
711
+    pmaddubsw         m2, m5
712
+    pmaddubsw         m4, m5
713
+    pmaddubsw         m0, m5
714
     pmulhrsw          m1, m3
715
-    mova              xm0, xm5
716
-    pslldq            xm5, 1
717
-    pinsrb            xm5, [r2 + 4], 0
718
-    vinserti128       m0, m0, xm5, 1
719
-    pshufb            m0, [r5]
720
-    pmaddubsw         m2, m0, [r4 + mmsize]
721
     pmulhrsw          m2, m3
722
-    mova              xm0, xm5
723
-    pslldq            xm5, 1
724
-    pinsrb            xm5, [r2 + 6], 0
725
-    vinserti128       m0, m0, xm5, 1
726
-    pshufb            m0, [r5]
727
-    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
728
     pmulhrsw          m4, m3
729
-    mova              xm0, xm5
730
-    pslldq            xm5, 1
731
-    pinsrb            xm5, [r2 + 8], 0
732
-    vinserti128       m0, m0, xm5, 1
733
-    pshufb            m0, [r5]
734
-    pmaddubsw         m0, [r4 + 3 * mmsize]
735
     pmulhrsw          m0, m3
736
     packuswb          m1, m2
737
     packuswb          m4, m0
738
 
739
-    vperm2i128        m2, m1, m4, 00100000b
740
-    vperm2i128        m1, m1, m4, 00110001b
741
-    punpcklbw         m4, m2, m1
742
-    punpckhbw         m2, m1
743
-    punpcklwd         m1, m4, m2
744
-    punpckhwd         m4, m2
745
-    mova              m0, [trans8_shuf]
746
-    vpermd            m1, m0, m1
747
-    vpermd            m4, m0, m4
748
-
749
-    lea               r3, [3 * r1]
750
-    movq              [r0], xm1
751
-    movhps            [r0 + r1], xm1
752
-    vextracti128      xm2, m1, 1
753
-    movq              [r0 + 2 * r1], xm2
754
-    movhps            [r0 + r3], xm2
755
-    lea               r0, [r0 + 4 * r1]
756
-    movq              [r0], xm4
757
-    movhps            [r0 + r1], xm4
758
-    vextracti128      xm2, m4, 1
759
-    movq              [r0 + 2 * r1], xm2
760
-    movhps            [r0 + r3], xm2
761
+    ang8_store8x8
762
     RET
763
 
764
 INIT_YMM avx2
765
-cglobal intra_pred_ang8_16, 3,4,7
766
-    lea                 r0, [r0 + r1 * 8]
767
-    sub                 r0, r1
768
-    neg                 r1
769
-    lea                 r3, [r1 * 3]
770
-    vbroadcasti128      m0, [angHor8_tab_16]            ; m0 = factor
771
-    mova                m1, [intra_pred8_shuff16]       ; m1 = 4 of Row shuffle
772
-    movu                m2, [intra_pred8_shuff16 + 8]   ; m2 = 4 of Row shuffle
773
+cglobal intra_pred_ang8_16, 3,4,6
774
+    vbroadcasti128    m1, [r2 + 17]
775
+    vbroadcasti128    m2, [r2]
776
+    mova              m3, [ang8_shuf_mode16 + mmsize]
777
+    pshufb            m2, m3
778
+    palignr           m1, m2, 10
779
+
780
+    mova              m5, [ang8_shuf_mode16]
781
+    mova              m3, [pb_2]
782
+    pshufb            m0, m1, m5
783
+    psubb             m5, m3
784
+    pshufb            m4, m1, m5
785
+    psubb             m5, m3
786
+    pshufb            m2, m1, m5
787
+    psubb             m5, m3
788
+    pshufb            m1, m5
789
 
790
-    ; prepare reference pixel
791
-    movq                xm3, [r2 + 16 + 1]              ; m3 = [-1 -2 -3 -4 -5 -6 -7 -8 x x x x x x x x]
792
-    movhps              xm3, [r2 + 2]                   ; m3 = [-1 -2 -3 -4 -5 -6 -7 -8 2 3 x 5 6 x 8 x]
793
-    pslldq              xm3, 1
794
-    pinsrb              xm3, [r2], 0                    ; m3 = [ 0 -1 -2 -3 -4 -5 -6 -7 -8 2 3 x 5 6 x 8]
795
-    pshufb              xm3, [c_ang8_mode_16]
796
-    vinserti128         m3, m3, xm3, 1                  ; m3 = [-8 -7 -6 -5 -4 -3 -2 -1  0 2 3 5 6 8]
797
+    vbroadcasti128    m5, [ang8_fact_mode16]
798
+    mova              m3, [pw_1024]
799
+    pmaddubsw         m1, m5
800
+    pmaddubsw         m2, m5
801
+    pmaddubsw         m4, m5
802
+    pmaddubsw         m0, m5
803
+    pmulhrsw          m1, m3
804
+    pmulhrsw          m2, m3
805
+    pmulhrsw          m4, m3
806
+    pmulhrsw          m0, m3
807
+    packuswb          m1, m2
808
+    packuswb          m4, m0
809
 
810
-    ; process 4 rows
811
-    pshufb              m4, m3, m1
812
-    pshufb              m5, m3, m2
813
-    psrldq              m3, 4
814
-    punpcklbw           m6, m5, m4
815
-    punpckhbw           m5, m4
816
-    pmaddubsw           m6, m0
817
-    pmulhrsw            m6, [pw_1024]
818
-    pmaddubsw           m5, m0
819
-    pmulhrsw            m5, [pw_1024]
820
-    packuswb            m6, m5
821
-    vextracti128        xm5, m6, 1
822
-    movq                [r0], xm6
823
-    movhps              [r0 + r1], xm6
824
-    movq                [r0 + r1 * 2], xm5
825
-    movhps              [r0 + r3], xm5
826
+    ang8_store8x8
827
+    RET
828
 
829
-    ; process 4 rows
830
-    lea                 r0, [r0 + r1 * 4]
831
-    pshufb              m4, m3, m1
832
-    pshufb              m5, m3, m2
833
-    punpcklbw           m6, m5, m4
834
-    punpckhbw           m5, m4
835
-    pmaddubsw           m6, m0
836
-    pmulhrsw            m6, [pw_1024]
837
-    pmaddubsw           m5, m0
838
-    pmulhrsw            m5, [pw_1024]
839
-    packuswb            m6, m5
840
-    vextracti128        xm5, m6, 1
841
-    movq                [r0], xm6
842
-    movhps              [r0 + r1], xm6
843
-    movq                [r0 + r1 * 2], xm5
844
-    movhps              [r0 + r3], xm5
845
+INIT_YMM avx2
846
+cglobal intra_pred_ang8_17, 3,4,6
847
+    vbroadcasti128    m1, [r2 + 17]
848
+    vbroadcasti128    m2, [r2]
849
+    mova              m3, [ang8_shuf_mode17 + mmsize]
850
+    pshufb            m2, m3
851
+    palignr           m1, m2, 9
852
+
853
+    mova              m5, [ang8_shuf_mode17]
854
+    mova              m3, [pb_2]
855
+    pshufb            m0, m1, m5
856
+    psubb             m5, m3
857
+    pshufb            m4, m1, m5
858
+    psubb             m5, m3
859
+    pshufb            m2, m1, m5
860
+    psubb             m5, m3
861
+    pshufb            m1, m5
862
+
863
+    vbroadcasti128    m5, [ang8_fact_mode17]
864
+    mova              m3, [pw_1024]
865
+    pmaddubsw         m1, m5
866
+    pmaddubsw         m2, m5
867
+    pmaddubsw         m4, m5
868
+    pmaddubsw         m0, m5
869
+    pmulhrsw          m1, m3
870
+    pmulhrsw          m2, m3
871
+    pmulhrsw          m4, m3
872
+    pmulhrsw          m0, m3
873
+    packuswb          m1, m2
874
+    packuswb          m4, m0
875
+
876
+    ang8_store8x8
877
     RET
878
 
879
 %if 1
880
@@ -19548,113 +19469,73 @@
881
 
882
 INIT_YMM avx2
883
 cglobal intra_pred_ang8_14, 3, 6, 6
884
-    mova              m3, [pw_1024]
885
-    movu              xm5, [r2 + 16]
886
-    pinsrb            xm5, [r2], 0
887
-    lea               r5, [intra_pred_shuff_0_8]
888
-    vinserti128       m0, m5, xm5, 1
889
-    pshufb            m0, [r5]
890
+    movu              xm1, [r2 + 13]
891
+    vinserti128       m1, m1, xm1, 1
892
 
893
-    lea               r4, [c_ang8_mode_14]
894
-    pmaddubsw         m1, m0, [r4]
895
+    pinsrb            xm1, [r2 + 0], 3
896
+    pinsrb            xm1, [r2 + 2], 2
897
+    pinsrb            xm1, [r2 + 5], 1
898
+    pinsrb            xm1, [r2 + 7], 0
899
+    vinserti128       m1, m1, xm1, 1
900
+
901
+    mova              m5, [ang8_shuf_mode14]
902
+    mova              m3, [pb_2]
903
+    pshufb            m0, m1, m5
904
+    psubb             m5, m3
905
+    pshufb            m4, m1, m5
906
+    psubb             m5, m3
907
+    pshufb            m2, m1, m5
908
+    psubb             m5, m3
909
+    pshufb            m1, m5
910
+
911
+    vbroadcasti128    m5, [ang8_fact_mode14]
912
+    mova              m3, [pw_1024]
913
+    pmaddubsw         m1, m5
914
+    pmaddubsw         m2, m5
915
+    pmaddubsw         m4, m5
916
+    pmaddubsw         m0, m5
917
     pmulhrsw          m1, m3
918
-    pslldq            xm5, 1
919
-    pinsrb            xm5, [r2 + 2], 0
920
-    vinserti128       m0, m5, xm5, 1
921
-    pshufb            m0, [r5]
922
-    pmaddubsw         m2, m0, [r4 + mmsize]
923
     pmulhrsw          m2, m3
924
-    pslldq            xm5, 1
925
-    pinsrb            xm5, [r2 + 5], 0
926
-    vinserti128       m0, m5, xm5, 1
927
-    pshufb            m0, [r5]
928
-    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
929
     pmulhrsw          m4, m3
930
-    pslldq            xm5, 1
931
-    pinsrb            xm5, [r2 + 7], 0
932
-    pshufb            xm5, [r5]
933
-    vinserti128       m0, m0, xm5, 1
934
-    pmaddubsw         m0, [r4 + 3 * mmsize]
935
     pmulhrsw          m0, m3
936
     packuswb          m1, m2
937
     packuswb          m4, m0
938
 
939
-    vperm2i128        m2, m1, m4, 00100000b
940
-    vperm2i128        m1, m1, m4, 00110001b
941
-    punpcklbw         m4, m2, m1
942
-    punpckhbw         m2, m1
943
-    punpcklwd         m1, m4, m2
944
-    punpckhwd         m4, m2
945
-    mova              m0, [trans8_shuf]
946
-    vpermd            m1, m0, m1
947
-    vpermd            m4, m0, m4
948
-
949
-    lea               r3, [3 * r1]
950
-    movq              [r0], xm1
951
-    movhps            [r0 + r1], xm1
952
-    vextracti128      xm2, m1, 1
953
-    movq              [r0 + 2 * r1], xm2
954
-    movhps            [r0 + r3], xm2
955
-    lea               r0, [r0 + 4 * r1]
956
-    movq              [r0], xm4
957
-    movhps            [r0 + r1], xm4
958
-    vextracti128      xm2, m4, 1
959
-    movq              [r0 + 2 * r1], xm2
960
-    movhps            [r0 + r3], xm2
961
+    ang8_store8x8
962
     RET
963
 
964
 INIT_YMM avx2
965
 cglobal intra_pred_ang8_13, 3, 6, 6
966
-    mova              m3, [pw_1024]
967
-    movu              xm5, [r2 + 16]
968
-    pinsrb            xm5, [r2], 0
969
-    lea               r5, [intra_pred_shuff_0_8]
970
-    vinserti128       m0, m5, xm5, 1
971
-    pshufb            m0, [r5]
972
+    movu              xm1, [r2 + 14]
973
+    pinsrb            xm1, [r2 + 0], 2
974
+    pinsrb            xm1, [r2 + 4], 1
975
+    pinsrb            xm1, [r2 + 7], 0
976
+    vinserti128       m1, m1, xm1, 1
977
+
978
+    mova              m5, [ang8_shuf_mode13]
979
+    mova              m3, [pb_2]
980
+    pshufb            m0, m1, m5
981
+    psubb             m5, m3
982
+    pshufb            m4, m1, m5
983
+    psubb             m5, m3
984
+    pshufb            m2, m1, m5
985
+    psubb             m5, m3
986
+    pshufb            m1, m5
987
 
988
-    lea               r4, [c_ang8_mode_13]
989
-    pmaddubsw         m1, m0, [r4]
990
+    vbroadcasti128    m5, [ang8_fact_mode13]
991
+    mova              m3, [pw_1024]
992
+    pmaddubsw         m1, m5
993
+    pmaddubsw         m2, m5
994
+    pmaddubsw         m4, m5
995
+    pmaddubsw         m0, m5
996
     pmulhrsw          m1, m3
997
-    pslldq            xm5, 1
998
-    pinsrb            xm5, [r2 + 4], 0
999
-    pshufb            xm4, xm5, [r5]
1000
-    vinserti128       m0, m0, xm4, 1
1001
-    pmaddubsw         m2, m0, [r4 + mmsize]
1002
     pmulhrsw          m2, m3
1003
-    vinserti128       m0, m0, xm4, 0
1004
-    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
1005
     pmulhrsw          m4, m3
1006
-    pslldq            xm5, 1
1007
-    pinsrb            xm5, [r2 + 7], 0
1008
-    pshufb            xm5, [r5]
1009
-    vinserti128       m0, m0, xm5, 1
1010
-    pmaddubsw         m0, [r4 + 3 * mmsize]
1011
     pmulhrsw          m0, m3
1012
     packuswb          m1, m2
1013
     packuswb          m4, m0
1014
 
1015
-    vperm2i128        m2, m1, m4, 00100000b
1016
-    vperm2i128        m1, m1, m4, 00110001b
1017
-    punpcklbw         m4, m2, m1
1018
-    punpckhbw         m2, m1
1019
-    punpcklwd         m1, m4, m2
1020
-    punpckhwd         m4, m2
1021
-    mova              m0, [trans8_shuf]
1022
-    vpermd            m1, m0, m1
1023
-    vpermd            m4, m0, m4
1024
-
1025
-    lea               r3, [3 * r1]
1026
-    movq              [r0], xm1
1027
-    movhps            [r0 + r1], xm1
1028
-    vextracti128      xm2, m1, 1
1029
-    movq              [r0 + 2 * r1], xm2
1030
-    movhps            [r0 + r3], xm2
1031
-    lea               r0, [r0 + 4 * r1]
1032
-    movq              [r0], xm4
1033
-    movhps            [r0 + r1], xm4
1034
-    vextracti128      xm2, m4, 1
1035
-    movq              [r0 + 2 * r1], xm2
1036
-    movhps            [r0 + r3], xm2
1037
+    ang8_store8x8
1038
     RET
1039
 
1040
 
1041
@@ -19703,51 +19584,36 @@
1042
     RET
1043
 
1044
 INIT_YMM avx2
1045
-cglobal intra_pred_ang8_12, 3, 5, 5
1046
-    mova              m3, [pw_1024]
1047
-    movu              xm1, [r2 + 16]
1048
-    pinsrb            xm1, [r2], 0
1049
-    pshufb            xm1, [intra_pred_shuff_0_8]
1050
-    vinserti128       m0, m1, xm1, 1
1051
+cglobal intra_pred_ang8_12, 3, 5, 6
1052
+    movu              xm1, [r2 + 15]
1053
+    pinsrb            xm1, [r2 + 0], 1
1054
+    pinsrb            xm1, [r2 + 6], 0
1055
+    vinserti128       m1, m1, xm1, 1
1056
+
1057
+    mova              m5, [ang8_shuf_mode12]
1058
+    mova              m3, [pb_2]
1059
+    pshufb            m0, m1, m5
1060
+    psubb             m5, m3
1061
+    pshufb            m4, m1, m5
1062
+    psubb             m5, m3
1063
+    pshufb            m2, m1, m5
1064
+    psubb             m5, m3
1065
+    pshufb            m1, m5
1066
 
1067
-    lea               r4, [c_ang8_mode_24]
1068
-    pmaddubsw         m1, m0, [r4]
1069
+    vbroadcasti128    m5, [ang8_fact_mode12]
1070
+    mova              m3, [pw_1024]
1071
+    pmaddubsw         m1, m5
1072
+    pmaddubsw         m2, m5
1073
+    pmaddubsw         m4, m5
1074
+    pmaddubsw         m0, m5
1075
     pmulhrsw          m1, m3
1076
-    pmaddubsw         m2, m0, [r4 + mmsize]
1077
     pmulhrsw          m2, m3
1078
-    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
1079
     pmulhrsw          m4, m3
1080
-    pslldq            xm0, 2
1081
-    pinsrb            xm0, [r2 + 6], 0
1082
-    pinsrb            xm0, [r2 + 0], 1
1083
-    vinserti128       m0, m0, xm0, 1
1084
-    pmaddubsw         m0, [r4 + 3 * mmsize]
1085
     pmulhrsw          m0, m3
1086
     packuswb          m1, m2
1087
     packuswb          m4, m0
1088
 
1089
-    vperm2i128        m2, m1, m4, 00100000b
1090
-    vperm2i128        m1, m1, m4, 00110001b
1091
-    punpcklbw         m4, m2, m1
1092
-    punpckhbw         m2, m1
1093
-    punpcklwd         m1, m4, m2
1094
-    punpckhwd         m4, m2
1095
-    mova              m0, [trans8_shuf]
1096
-    vpermd            m1, m0, m1
1097
-    vpermd            m4, m0, m4
1098
-
1099
-    lea               r3, [3 * r1]
1100
-    movq              [r0], xm1
1101
-    movhps            [r0 + r1], xm1
1102
-    vextracti128      xm2, m1, 1
1103
-    movq              [r0 + 2 * r1], xm2
1104
-    movhps            [r0 + r3], xm2
1105
-    lea               r0, [r0 + 4 * r1]
1106
-    movq              [r0], xm4
1107
-    movhps            [r0 + r1], xm4
1108
-    vextracti128      xm2, m4, 1
1109
-    movq              [r0 + 2 * r1], xm2
1110
-    movhps            [r0 + r3], xm2
1111
+    ang8_store8x8
1112
     RET
1113
 
1114
 INIT_YMM avx2
1115
x265_1.9.tar.gz/source/common/x86/ipfilter16.asm -> x265_2.0.tar.gz/source/common/x86/ipfilter16.asm Changed
254
 
1
@@ -116,6 +116,7 @@
2
                   dw  -1, 4, -11, 40,  40, -11, 4, -1
3
                   dw   0, 1, -5,  17,  58, -10, 4, -1
4
 
5
+ALIGN 32
6
 tab_LumaCoeffV:   times 4 dw 0, 0
7
                   times 4 dw 0, 64
8
                   times 4 dw 0, 0
9
@@ -161,9 +162,8 @@
10
 const interp8_hpp_shuf,     db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
11
                             db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
12
 
13
-const pb_shuf,  db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
14
-                db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
15
-
16
+const interp8_hpp_shuf_new, db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
17
+                            db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
18
 
19
 SECTION .text
20
 cextern pd_8
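In the hunks below, the duplicate pb_shuf constant is dropped in favour of the identical interp8_hpp_shuf, and the 16-bit horizontal luma kernel is reworked to load word-pair coefficients from tab_LumaCoeffV (hence the ALIGN 32 above and the shl r4d, 6: each filter now occupies 64 bytes), with interp8_hpp_shuf_new pairing pixels (p[i], p[i+1]) so each pmaddwd lane forms a c[2i]*p[2i] + c[2i+1]*p[2i+1] partial sum; the old phaddd/vpermq horizontal reduction becomes a cheaper vextracti128 + paddd. Per output sample the arithmetic stays the standard 8-tap filter, roughly (illustrative C, not the x265 API):

    #include <stdint.h>

    /* sum = c[0]*p[0] + ... + c[7]*p[7], then scale to the PS range;
     * offset/shift stand in for INTERP_OFFSET_PS / INTERP_SHIFT_PS. */
    static int32_t interp8_h(const int16_t *p, const int16_t c[8],
                             int32_t offset, int shift)
    {
        int32_t sum = 0;
        for (int i = 0; i < 8; i++)
            sum += (int32_t)c[i] * p[i];
        return (sum + offset) >> shift;   /* caller packs with saturation */
    }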
21
@@ -10407,7 +10407,7 @@
22
     vpbroadcastq        m0, [tab_LumaCoeff + r4]
23
     vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
24
 %endif
25
-    mova                m3, [pb_shuf]
26
+    mova                m3, [interp8_hpp_shuf]
27
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
28
 
29
     ; register map
30
@@ -10475,7 +10475,7 @@
31
     vpbroadcastq        m0, [tab_LumaCoeff + r4]
32
     vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
33
 %endif
34
-    mova                m3, [pb_shuf]
35
+    mova                m3, [interp8_hpp_shuf]
36
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
37
 
38
     ; register map
39
@@ -10536,16 +10536,16 @@
40
     add                 r3d, r3d
41
     mov                 r4d, r4m
42
     mov                 r5d, r5m
43
-    shl                 r4d, 4
44
+    shl                 r4d, 6
45
 %ifdef PIC
46
-    lea                 r6, [tab_LumaCoeff]
47
-    vpbroadcastq        m0, [r6 + r4]
48
-    vpbroadcastq        m1, [r6 + r4 + 8]
49
+    lea                 r6, [tab_LumaCoeffV]
50
+    movu                m0, [r6 + r4]
51
+    movu                m1, [r6 + r4 + mmsize]
52
 %else
53
-    vpbroadcastq        m0, [tab_LumaCoeff + r4]
54
-    vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
55
+    movu                m0, [tab_LumaCoeffV + r4]
56
+    movu                m1, [tab_LumaCoeffV + r4 + mmsize]
57
 %endif
58
-    mova                m3, [pb_shuf]
59
+    mova                m3, [interp8_hpp_shuf_new]
60
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
61
 
62
     ; register map
63
@@ -10554,7 +10554,7 @@
64
     sub                 r0, 6
65
     test                r5d, r5d
66
     mov                 r4d, %2
67
-    jz                  .loop0
68
+    jz                 .loop0
69
     lea                 r6, [r1*3]
70
     sub                 r0, r6
71
     add                 r4d, 7
72
@@ -10563,64 +10563,64 @@
73
 %assign x 0
74
 %rep %1/16
75
     vbroadcasti128      m4, [r0 + x]
76
-    vbroadcasti128      m5, [r0 + 8 + x]
77
+    vbroadcasti128      m5, [r0 + 4 * SIZEOF_PIXEL + x]
78
     pshufb              m4, m3
79
-    pshufb              m7, m5, m3
80
+    pshufb              m5, m3
81
 
82
     pmaddwd             m4, m0
83
-    pmaddwd             m7, m1
84
+    pmaddwd             m7, m5, m1
85
     paddd               m4, m7
86
+    vextracti128        xm7, m4, 1
87
+    paddd               xm4, xm7
88
+    paddd               xm4, xm2
89
+    psrad               xm4, INTERP_SHIFT_PS
90
 
91
     vbroadcasti128      m6, [r0 + 16 + x]
92
-    pshufb              m5, m3
93
-    pshufb              m7, m6, m3
94
+    pshufb              m6, m3
95
 
96
     pmaddwd             m5, m0
97
-    pmaddwd             m7, m1
98
+    pmaddwd             m7, m6, m1
99
     paddd               m5, m7
100
-
101
-    phaddd              m4, m5
102
-    vpermq              m4, m4, q3120
103
-    paddd               m4, m2
104
-    vextracti128        xm5,m4, 1
105
-    psrad               xm4, INTERP_SHIFT_PS
106
+    vextracti128        xm7, m5, 1
107
+    paddd               xm5, xm7
108
+    paddd               xm5, xm2
109
     psrad               xm5, INTERP_SHIFT_PS
110
-    packssdw            xm4, xm5
111
 
112
+    packssdw            xm4, xm5
113
     movu                [r2 + x], xm4
114
 
115
     vbroadcasti128      m5, [r0 + 24 + x]
116
-    pshufb              m6, m3
117
-    pshufb              m7, m5, m3
118
+    pshufb              m5, m3
119
 
120
     pmaddwd             m6, m0
121
-    pmaddwd             m7, m1
122
+    pmaddwd             m7, m5, m1
123
     paddd               m6, m7
124
+    vextracti128        xm7, m6, 1
125
+    paddd               xm6, xm7
126
+    paddd               xm6, xm2
127
+    psrad               xm6, INTERP_SHIFT_PS
128
 
129
     vbroadcasti128      m7, [r0 + 32 + x]
130
-    pshufb              m5, m3
131
     pshufb              m7, m3
132
 
133
     pmaddwd             m5, m0
134
     pmaddwd             m7, m1
135
     paddd               m5, m7
136
-
137
-    phaddd              m6, m5
138
-    vpermq              m6, m6, q3120
139
-    paddd               m6, m2
140
-    vextracti128        xm5,m6, 1
141
-    psrad               xm6, INTERP_SHIFT_PS
142
+    vextracti128        xm7, m5, 1
143
+    paddd               xm5, xm7
144
+    paddd               xm5, xm2
145
     psrad               xm5, INTERP_SHIFT_PS
146
-    packssdw            xm6, xm5
147
 
148
+    packssdw            xm6, xm5
149
     movu                [r2 + 16 + x], xm6
150
-    %assign x x+32
151
-    %endrep
152
+
153
+%assign x x+32
154
+%endrep
155
 
156
     add                 r2, r3
157
     add                 r0, r1
158
     dec                 r4d
159
-    jnz                 .loop0
160
+    jnz                .loop0
161
     RET
162
 %endif
163
 %endmacro
164
@@ -10656,7 +10656,7 @@
165
     vpbroadcastq        m0, [tab_LumaCoeff + r4]
166
     vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
167
 %endif
168
-    mova                m3, [pb_shuf]
169
+    mova                m3, [interp8_hpp_shuf]
170
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
171
 
172
     ; register map
173
@@ -10749,7 +10749,7 @@
174
     vpbroadcastq        m0, [tab_LumaCoeff + r4]
175
     vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
176
 %endif
177
-    mova                m3, [pb_shuf]
178
+    mova                m3, [interp8_hpp_shuf]
179
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
180
 
181
     ; register map
182
@@ -10824,7 +10824,7 @@
183
 %else
184
     vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
185
 %endif
186
-    mova                m3, [pb_shuf]
187
+    mova                m3, [interp8_hpp_shuf]
188
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
189
 
190
     ; register map
191
@@ -10883,7 +10883,7 @@
192
 %else
193
     vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
194
 %endif
195
-    mova                m3, [pb_shuf]
196
+    mova                m3, [interp8_hpp_shuf]
197
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
198
 
199
     ; register map
200
@@ -10956,7 +10956,7 @@
201
 %else
202
     vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
203
 %endif
204
-    mova                m3, [pb_shuf]
205
+    mova                m3, [interp8_hpp_shuf]
206
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
207
 
208
     ; register map
209
@@ -11038,7 +11038,7 @@
210
 %else
211
     vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
212
 %endif
213
-    mova                m3, [pb_shuf]
214
+    mova                m3, [interp8_hpp_shuf]
215
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
216
 
217
     ; register map
218
@@ -11103,7 +11103,7 @@
219
 %else
220
     vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
221
 %endif
222
-    mova                m3, [pb_shuf]
223
+    mova                m3, [interp8_hpp_shuf]
224
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
225
 
226
     ; register map
227
@@ -11204,7 +11204,7 @@
228
 %else
229
     vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
230
 %endif
231
-    mova                m3, [pb_shuf]
232
+    mova                m3, [interp8_hpp_shuf]
233
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
234
 
235
     ; register map
236
@@ -11357,7 +11357,7 @@
237
 %else
238
     vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
239
 %endif
240
-    mova                m3, [pb_shuf]
241
+    mova                m3, [interp8_hpp_shuf]
242
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
243
 
244
     ; register map
245
@@ -11477,7 +11477,7 @@
246
 %else
247
     vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
248
 %endif
249
-    mova                m3, [pb_shuf]
250
+    mova                m3, [interp8_hpp_shuf]
251
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
252
 
253
     ; register map
254
x265_1.9.tar.gz/source/common/x86/loopfilter.asm -> x265_2.0.tar.gz/source/common/x86/loopfilter.asm Changed
881
 
1
@@ -29,9 +29,6 @@
2
 %include "x86util.asm"
3
 
4
 SECTION_RODATA 32
5
-pb_31:      times 32 db 31
6
-pb_124:     times 32 db 124
7
-pb_15:      times 32 db 15
8
 
9
 SECTION .text
10
 cextern pb_1
11
@@ -39,6 +36,10 @@
12
 cextern pb_3
13
 cextern pb_4
14
 cextern pb_01
15
+cextern pb_0123
16
+cextern pb_15
17
+cextern pb_31
18
+cextern pb_124
19
 cextern pb_128
20
 cextern pw_1
21
 cextern pw_n1
22
@@ -48,7 +49,9 @@
23
 cextern pb_movemask
24
 cextern pb_movemask_32
25
 cextern hmul_16p
26
-
27
+cextern pw_1_ffff
28
+cextern pb_shuf_off4
29
+cextern pw_shuf_off4
30
 
31
 ;============================================================================================================
32
 ; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t* signLeft, intptr_t stride)
33
@@ -154,7 +157,9 @@
34
     sub         r4d, 16
35
     jnz        .loopH
36
     RET
37
-%else ; HIGH_BIT_DEPTH
38
+
39
+%else ; HIGH_BIT_DEPTH == 1
40
+
41
 cglobal saoCuOrgE0, 5, 5, 8, rec, offsetEo, lcuWidth, signLeft, stride
42
 
43
     mov         r4d, r4m
44
@@ -240,7 +245,7 @@
45
     sub         r4d, 16
46
     jnz        .loopH
47
     RET
48
-%endif
49
+%endif ; HIGH_BIT_DEPTH == 0
50
 
51
 INIT_YMM avx2
52
 %if HIGH_BIT_DEPTH
53
@@ -2061,6 +2066,117 @@
54
 ; saoCuStatsE0(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
55
 ;-----------------------------------------------------------------------------------------------------------------------
56
 %if ARCH_X86_64
57
+
58
+%if HIGH_BIT_DEPTH == 1
59
+INIT_XMM sse4
60
+cglobal saoCuStatsE0, 3,10,8, 0-32
61
+    mov         r3d, r3m
62
+    mov         r4d, r4m
63
+    mov         r9, r5mp
64
+
65
+    ; clear internal temporary buffer
66
+    pxor        m0, m0
67
+    mova        [rsp], m0
68
+    mova        [rsp + mmsize], m0
69
+    mova        m4, [pw_1]
70
+    mova        m5, [pb_2]
71
+    xor         r7d, r7d
72
+
73
+    ; correct stride for diff[] and rec
74
+    mov         r6d, r3d
75
+    and         r6d, ~15
76
+    sub         r2, r6
77
+    lea         r8, [(r6 - 64) * 2]             ; 64 = MAX_CU_SIZE
78
+
79
+    FIX_STRIDES r2
80
+
81
+.loopH:
82
+    mov         r5d, r3d
83
+
84
+    ; calculate signLeft
85
+    mov         r7w, [r1]
86
+    sub         r7w, [r1 - SIZEOF_PIXEL]
87
+    seta        r7b
88
+    setb        r6b
89
+    sub         r7b, r6b
90
+    neg         r7b
91
+    pinsrb      m0, r7d, 15
92
+
93
+.loopL:
94
+
95
+    movu        m3, [r1]
96
+    movu        m2, [r1 + SIZEOF_PIXEL]
97
+    pcmpgtw     m6, m3, m2
98
+    pcmpgtw     m2, m3
99
+    pand        m6, m4
100
+    por         m2, m6
101
+
102
+    movu        m3, [r1 + mmsize]
103
+    movu        m6, [r1 + mmsize + SIZEOF_PIXEL]
104
+    pcmpgtw     m7, m3, m6
105
+    pcmpgtw     m6, m3
106
+    pand        m7, m4
107
+    por         m7, m6
108
+
109
+    packsswb    m2, m7                          ; signRight
110
+
111
+    palignr     m3, m2, m0, 15
112
+
113
+    pxor        m6, m6
114
+    psubb       m6, m3                          ; signLeft
115
+
116
+    mova        m0, m2
117
+    paddb       m2, m6
118
+    paddb       m2, m5                          ; edgeType
119
+
120
+    ; stats[edgeType]
121
+%assign x 0
122
+%rep 16
123
+    pextrb      r7d, m2, x
124
+
125
+    movsx       r6d, word [r0 + x * 2]
126
+    inc         word [rsp + r7 * 2]             ; tmp_count[edgeType]++
127
+    add         [rsp + 5 * 2 + r7 * 4], r6d     ; tmp_stats[edgeType] += (fenc[x] - rec[x])
128
+    dec         r5d
129
+    jz         .next
130
+%assign x x+1
131
+%endrep
132
+
133
+    add         r0, 16*2
134
+    add         r1, 16 * SIZEOF_PIXEL
135
+    jmp        .loopL
136
+
137
+.next:
138
+    sub         r0, r8
139
+    add         r1, r2
140
+
141
+    dec         r4d
142
+    jnz        .loopH
143
+
144
+    ; sum to global buffer
145
+    mov         r0, r6mp
146
+
147
+    ; s_eoTable = {1, 2, 0, 3, 4}
148
+    pmovzxwd    m0, [rsp + 0 * 2]
149
+    pshufd      m0, m0, q3102
150
+    movu        m1, [r0]
151
+    paddd       m0, m1
152
+    movu        [r0], m0
153
+    movzx       r5d, word [rsp + 4 * 2]
154
+    add         [r0 + 4 * 4], r5d
155
+
156
+    movu        m0, [rsp + 5 * 2 + 0 * 4]
157
+    pshufd      m0, m0, q3102
158
+    movu        m1, [r9]
159
+    paddd       m0, m1
160
+    movu        [r9], m0
161
+    mov         r6d, [rsp + 5 * 2 + 4 * 4]
162
+    add         [r9 + 4 * 4], r6d
163
+    RET
164
+%endif ; HIGH_BIT_DEPTH == 1
165
+
166
+
167
+%if HIGH_BIT_DEPTH == 0
168
 INIT_XMM sse4
169
 cglobal saoCuStatsE0, 3,10,6, 0-32
170
     mov         r3d, r3m
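The HIGH_BIT_DEPTH saoCuStatsE0 added above mirrors the existing 8-bit kernel that continues below, but compares 16-bit samples (two mmsize loads per 16 pixels, packsswb to collapse the word signs to bytes). The per-pixel logic both versions implement is roughly this (illustrative C; the tmp_count/tmp_stats accumulators are remapped through s_eoTable = {1, 2, 0, 3, 4} when summed into the global buffers):

    #include <stdint.h>

    static int sign3(int x) { return (x > 0) - (x < 0); }

    /* diff[] holds fenc - rec; rec has a valid left neighbour at rec[-1]. */
    static void sao_stats_e0_row(const int16_t *diff, const uint16_t *rec,
                                 int endX, int32_t count[5], int32_t stats[5])
    {
        int signLeft = sign3(rec[0] - rec[-1]);
        for (int x = 0; x < endX; x++)
        {
            int signRight = sign3(rec[x] - rec[x + 1]);
            int edgeType  = signRight + signLeft + 2;
            signLeft      = -signRight;
            count[edgeType]++;
            stats[edgeType] += diff[x];
        }
    }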
171
@@ -2086,7 +2202,7 @@
172
 
173
     ; calculate signLeft
174
     mov         r7b, [r1]
175
-    sub         r7b, [r1 - 1]
176
+    sub         r7b, [r1 - SIZEOF_PIXEL]
177
     seta        r7b
178
     setb        r6b
179
     sub         r7b, r6b
180
@@ -2095,13 +2211,14 @@
181
 
182
 .loopL:
183
     movu        m3, [r1]
184
-    movu        m2, [r1 + 1]
185
+    movu        m2, [r1 + SIZEOF_PIXEL]
186
 
187
     pxor        m1, m3, m4
188
     pxor        m2, m4
189
     pcmpgtb     m3, m1, m2
190
     pcmpgtb     m2, m1
191
     pand        m3, [pb_1]
192
+
193
     por         m2, m3                          ; signRight
194
 
195
     palignr     m3, m2, m0, 15
196
@@ -2125,7 +2242,7 @@
197
 %endrep
198
 
199
     add         r0, 16*2
200
-    add         r1, 16
201
+    add         r1, 16 * SIZEOF_PIXEL
202
     jmp        .loopL
203
 
204
 .next:
205
@@ -2155,6 +2272,7 @@
206
     mov         r6d, [rsp + 5 * 2 + 4 * 4]
207
     add         [r9 + 4 * 4], r6d
208
     RET
209
+%endif ; HIGH_BIT_DEPTH == 0
210
 
211
 
212
 ;-----------------------------------------------------------------------------------------------------------------------
213
@@ -2341,6 +2459,112 @@
214
 ; saoCuStatsE1_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
215
 ;-------------------------------------------------------------------------------------------------------------------------------------------
216
 %if ARCH_X86_64
217
+
218
+%if HIGH_BIT_DEPTH
219
+INIT_XMM sse4
220
+cglobal saoCuStatsE1, 4,12,8,0-32    ; Stack: 5 of stats and 5 of count
221
+    mov         r5d, r5m
222
+    mov         r4d, r4m
223
+
224
+    ; clear internal temporary buffer
225
+    pxor        m0, m0
226
+    mova        [rsp], m0
227
+    mova        [rsp + mmsize], m0
228
+    mova        m5, [pw_1]
229
+    mova        m6, [pb_2]
230
+    movh        m7, [r3 + r4]
231
+
232
+    FIX_STRIDES r2d
233
+
234
+.loopH:
235
+    mov         r6d, r4d
236
+    mov         r9, r0
237
+    mov         r10, r1
238
+    mov         r11, r3
239
+
240
+.loopW:
241
+    ; signDown
242
+    movu        m1, [r10]
243
+    movu        m2, [r10 + r2]
244
+    pcmpgtw     m3, m1, m2
245
+    pcmpgtw     m2, m1
246
+    pand        m3, m5
247
+    por         m2, m3
248
+
249
+    movu        m3, [r10 + mmsize]
250
+    movu        m4, [r10 + mmsize + r2]
251
+    pcmpgtw     m0, m3, m4
252
+    pcmpgtw     m4, m3
253
+    pand        m0, m5
254
+    por         m4, m0
255
+    packsswb    m2, m4
256
+
257
+    pxor        m3, m3
258
+    psubb       m3, m2                          ; -signDown
259
+
260
+    ; edgeType
261
+    movu        m4, [r11]
262
+    paddb       m4, m6
263
+    paddb       m2, m4
264
+
265
+    ; update upBuff1
266
+    movu        [r11], m3
267
+
268
+    ; 16 pixels
269
+%assign x 0
270
+%rep 16
271
+    pextrb      r7d, m2, x
272
+    inc         word [rsp + r7 * 2]
273
+
274
+    ; stats[edgeType]
275
+    movsx       r8d, word [r9 + x * 2]
276
+    add         [rsp + 5 * 2 + r7 * 4], r8d
277
+
278
+    dec         r6d
279
+    jz         .next
280
+%assign x x+1
281
+%endrep
282
+
283
+    add         r9, mmsize * 2
284
+    add         r10, mmsize * SIZEOF_PIXEL
285
+    add         r11, mmsize
286
+    jmp        .loopW
287
+
288
+.next:
289
+    ; restore pointer upBuff1
290
+    add         r0, 64*2                        ; MAX_CU_SIZE
291
+    add         r1, r2
292
+
293
+    dec         r5d
294
+    jg         .loopH
295
+
296
+    ; restore unavailable pixels
297
+    movh        [r3 + r4], m7
298
+
299
+    ; sum to global buffer
300
+    mov         r1, r6m
301
+    mov         r0, r7m
302
+
303
+    ; s_eoTable = {1,2,0,3,4}
304
+    pmovzxwd    m0, [rsp + 0 * 2]
305
+    pshufd      m0, m0, q3102
306
+    movu        m1, [r0]
307
+    paddd       m0, m1
308
+    movu        [r0], m0
309
+    movzx       r5d, word [rsp + 4 * 2]
310
+    add         [r0 + 4 * 4], r5d
311
+
312
+    movu        m0, [rsp + 5 * 2 + 0 * 4]
313
+    pshufd      m0, m0, q3102
314
+    movu        m1, [r1]
315
+    paddd       m0, m1
316
+    movu        [r1], m0
317
+    mov         r6d, [rsp + 5 * 2 + 4 * 4]
318
+    add         [r1 + 4 * 4], r6d
319
+    RET
320
+
321
+%else ; HIGH_BIT_DEPTH == 1
322
+
323
 INIT_XMM sse4
324
 cglobal saoCuStatsE1, 4,12,8,0-32    ; Stack: 5 of stats and 5 of count
325
     mov         r5d, r5m
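The saoCuStatsE1 port in the hunk above is the vertical counterpart: the sign is taken against the row below, and upBuff1 carries the opposite sign up to the next row. Roughly, in the same illustrative C style as the E0 sketch:

    #include <stdint.h>

    static void sao_stats_e1_row(const int16_t *diff, const uint16_t *rec,
                                 intptr_t stride, int8_t *upBuff1, int endX,
                                 int32_t count[5], int32_t stats[5])
    {
        for (int x = 0; x < endX; x++)
        {
            int signDown = (rec[x] > rec[x + stride])
                         - (rec[x] < rec[x + stride]);
            int edgeType = signDown + upBuff1[x] + 2;
            upBuff1[x]   = (int8_t)-signDown;   /* becomes signUp next row */
            count[edgeType]++;
            stats[edgeType] += diff[x];
        }
    }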
326
@@ -2435,6 +2659,7 @@
327
     mov         r6d, [rsp + 5 * 2 + 4 * 4]
328
     add         [r1 + 4 * 4], r6d
329
     RET
330
+%endif ; HIGH_BIT_DEPTH == 0
331
 
332
 
333
 INIT_YMM avx2
334
@@ -2650,6 +2875,129 @@
335
 ;}
336
 
337
 %if ARCH_X86_64
338
+
339
+%if HIGH_BIT_DEPTH == 1
340
+INIT_XMM sse4
341
+cglobal saoCuStatsE2, 5,9,7,0-32    ; Stack: 5 of stats and 5 of count
342
+    mov         r5d, r5m
343
+    FIX_STRIDES r2d
344
+
345
+    ; clear internal temporary buffer
346
+    pxor        m0, m0
347
+    mova        [rsp], m0
348
+    mova        [rsp + mmsize], m0
349
+    mova        m5, [pw_1]
350
+    mova        m6, [pb_2]
351
+
352
+.loopH:
353
+    ; TODO: merge into SIMD in below
354
+    ; get upBuffX[0]
355
+    mov         r6w, [r1 + r2]
356
+    sub         r6w, [r1 - 1 * SIZEOF_PIXEL]
357
+    seta        r6b
358
+    setb        r7b
359
+    sub         r6b, r7b
360
+    mov         [r4], r6b
361
+
362
+    ; backup unavailable pixels
363
+    movh        m0, [r4 + r5 + 1]
364
+
365
+    mov         r6d, r5d
366
+.loopW:
367
+    ; signDown
368
+    ; stats[edgeType]
369
+    ; edgeType
370
+    movu        m1, [r1]
371
+    movu        m2, [r1 + r2 + 1 * SIZEOF_PIXEL]
372
+    pcmpgtw     m3, m1, m2
373
+    pcmpgtw     m2, m1
374
+    pand        m2, m5
375
+    por         m3, m2
376
+
377
+    movu        m1, [r1 + mmsize]
378
+    movu        m2, [r1 + r2 + 1 * SIZEOF_PIXEL + mmsize]
379
+    pcmpgtw     m4, m1, m2
380
+    pcmpgtw     m2, m1
381
+    pand        m2, m5
382
+    por         m4, m2
383
+    packsswb    m3, m4
384
+
385
+    movu        m4, [r3]
386
+    paddb       m4, m6
387
+    psubb       m4, m3
388
+
389
+    ; update upBuff1
390
+    movu        [r4 + 1], m3
391
+
392
+    ; 16 pixels
393
+%assign x 0
394
+%rep 16
395
+    pextrb      r7d, m4, x
396
+    inc    word [rsp + r7 * 2]
397
+
398
+    movsx       r8d, word [r0 + x * 2]
399
+    add         [rsp + 5 * 2 + r7 * 4], r8d
400
+
401
+    dec         r6d
402
+    jz         .next
403
+%assign x x+1
404
+%endrep
405
+
406
+    add         r0, mmsize * 2
407
+    add         r1, mmsize * SIZEOF_PIXEL
408
+    add         r3, mmsize
409
+    add         r4, mmsize
410
+    jmp        .loopW
411
+
412
+.next:
413
+    xchg        r3, r4
414
+
415
+    ; restore pointer upBuff1
416
+    mov         r6d, r5d
417
+    and         r6d, ~15
418
+    neg         r6                              ; MUST BE 64-bit, it is negative
419
+
420
+    ; move to next row
421
+
422
+    ; move back to start point
423
+    add         r3, r6
424
+    add         r4, r6
425
+
426
+    ; adjust with stride
427
+    lea         r0, [r0 + (r6 + 64) * 2]        ; 64 = MAX_CU_SIZE
428
+    add         r1, r2
429
+    lea         r1, [r1 + r6 * SIZEOF_PIXEL]
430
+
431
+    ; restore unavailable pixels
432
+    movh        [r3 + r5 + 1], m0
433
+
434
+    dec    byte r6m
435
+    jg         .loopH
436
+
437
+    ; sum to global buffer
438
+    mov         r1, r7m
439
+    mov         r0, r8m
440
+
441
+    ; s_eoTable = {1,2,0,3,4}
442
+    pmovzxwd    m0, [rsp + 0 * 2]
443
+    pshufd      m0, m0, q3102
444
+    movu        m1, [r0]
445
+    paddd       m0, m1
446
+    movu        [r0], m0
447
+    movzx       r5d, word [rsp + 4 * 2]
448
+    add         [r0 + 4 * 4], r5d
449
+
450
+    movu        m0, [rsp + 5 * 2 + 0 * 4]
451
+    pshufd      m0, m0, q3102
452
+    movu        m1, [r1]
453
+    paddd       m0, m1
454
+    movu        [r1], m0
455
+    mov         r6d, [rsp + 5 * 2 + 4 * 4]
456
+    add         [r1 + 4 * 4], r6d
457
+    RET
458
+
459
+%else ; HIGH_BIT_DEPTH == 1
460
+
461
 ; TODO: x64 only because I need temporary register r7,r8, easy portab to x86
462
 INIT_XMM sse4
463
 cglobal saoCuStatsE2, 5,9,8,0-32    ; Stack: 5 of stats and 5 of count
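The saoCuStatsE2 port above handles the 45-degree diagonal: the comparison runs against rec[x + stride + 1], the sign row for the next line is built in a second buffer (upBufft, seeded from rec[stride] vs rec[-1]), and the xchg r3, r4 in .next swaps the two buffers per row. Sketched in C (illustrative; E3 further down is the mirror image, comparing against rec[x + stride - 1]):

    #include <stdint.h>

    static int sign3(int x) { return (x > 0) - (x < 0); }

    static void sao_stats_e2_row(const int16_t *diff, const uint16_t *rec,
                                 intptr_t stride, const int8_t *upBuff1,
                                 int8_t *upBufft, int endX,
                                 int32_t count[5], int32_t stats[5])
    {
        upBufft[0] = (int8_t)sign3(rec[stride] - rec[-1]);  /* seed next row */
        for (int x = 0; x < endX; x++)
        {
            int signDown = sign3(rec[x] - rec[x + stride + 1]);
            int edgeType = signDown + upBuff1[x] + 2;
            upBufft[x + 1] = (int8_t)-signDown;
            count[edgeType]++;
            stats[edgeType] += diff[x];
        }
        /* caller then swaps upBuff1/upBufft (the xchg r3, r4 above) */
    }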
464
@@ -2767,6 +3115,7 @@
465
     add         [r1 + 4 * 4], r6d
466
     RET
467
 
468
+%endif ; HIGH_BIT_DEPTH == 0
469
 
470
 INIT_YMM avx2
471
 cglobal saoCuStatsE2, 5,10,16                        ; Stack: 5 of stats and 5 of count
472
@@ -2994,6 +3343,119 @@
473
 ;}
474
 
475
 %if ARCH_X86_64
476
+
477
+%if HIGH_BIT_DEPTH == 1
478
+INIT_XMM sse4
479
+cglobal saoCuStatsE3, 4,9,8,0-32    ; Stack: 5 of stats and 5 of count
480
+    mov         r4d, r4m
481
+    mov         r5d, r5m
482
+    FIX_STRIDES r2d
483
+
484
+    ; clear internal temporary buffer
485
+    pxor        m0, m0
486
+    mova        [rsp], m0
487
+    mova        [rsp + mmsize], m0
488
+    ;mova        m0, [pb_128]
489
+    mova        m5, [pw_1]
490
+    mova        m6, [pb_2]
491
+    movh        m7, [r3 + r4]
492
+
493
+.loopH:
494
+    mov         r6d, r4d
495
+
496
+.loopW:
497
+    ; signDown
498
+    movu        m1, [r1]
499
+    movu        m2, [r1 + r2 - 1 * SIZEOF_PIXEL]
500
+    pcmpgtw     m3, m1, m2
501
+    pcmpgtw     m2, m1
502
+    pand        m2, m5
503
+    por         m3, m2
504
+
505
+    movu        m1, [r1 + mmsize]
506
+    movu        m2, [r1 + r2 - 1 * SIZEOF_PIXEL + mmsize]
507
+    pcmpgtw     m4, m1, m2
508
+    pcmpgtw     m2, m1
509
+    pand        m2, m5
510
+    por         m4, m2
511
+    packsswb    m3, m4
512
+
513
+    ; edgeType
514
+    movu        m4, [r3]
515
+    paddb       m4, m6
516
+    psubb       m4, m3
517
+
518
+    ; update upBuff1
519
+    movu        [r3 - 1], m3
520
+
521
+    ; stats[edgeType]
522
+    pxor        m1, m0
523
+
524
+    ; 16 pixels
525
+%assign x 0
526
+%rep 16
527
+    pextrb      r7d, m4, x
528
+    inc    word [rsp + r7 * 2]
529
+
530
+    movsx       r8d, word [r0 + x * 2]
531
+    add         [rsp + 5 * 2 + r7 * 4], r8d
532
+
533
+    dec         r6d
534
+    jz         .next
535
+%assign x x+1
536
+%endrep
537
+
538
+    add         r0, 16 * 2
539
+    add         r1, 16 * SIZEOF_PIXEL
540
+    add         r3, 16
541
+    jmp         .loopW
542
+
543
+.next:
544
+    ; restore pointer upBuff1
545
+    mov         r6d, r4d
546
+    and         r6d, ~15
547
+    neg         r6                              ; MUST be 64-bit, it is negative
548
+
549
+    ; move to next row
550
+
551
+    ; move back to start point
552
+    add         r3, r6
553
+
554
+    ; adjust with stride
555
+    lea         r0, [r0 + (r6 + 64) * 2]        ; 64 = MAX_CU_SIZE
556
+    add         r1, r2
557
+    lea         r1, [r1 + r6 * SIZEOF_PIXEL]
558
+
559
+    dec         r5d
560
+    jg         .loopH
561
+
562
+    ; restore unavailable pixels
563
+    movh        [r3 + r4], m7
564
+
565
+    ; sum to global buffer
566
+    mov         r1, r6m
567
+    mov         r0, r7m
568
+
569
+    ; s_eoTable = {1,2,0,3,4}
570
+    pmovzxwd    m0, [rsp + 0 * 2]
571
+    pshufd      m0, m0, q3102
572
+    movu        m1, [r0]
573
+    paddd       m0, m1
574
+    movu        [r0], m0
575
+    movzx       r5d, word [rsp + 4 * 2]
576
+    add         [r0 + 4 * 4], r5d
577
+
578
+    movu        m0, [rsp + 5 * 2 + 0 * 4]
579
+    pshufd      m0, m0, q3102
580
+    movu        m1, [r1]
581
+    paddd       m0, m1
582
+    movu        [r1], m0
583
+    mov         r6d, [rsp + 5 * 2 + 4 * 4]
584
+    add         [r1 + 4 * 4], r6d
585
+    RET
586
+
587
+%else ; HIGH_BIT_DEPTH == 1
588
+
589
 INIT_XMM sse4
590
 cglobal saoCuStatsE3, 4,9,8,0-32    ; Stack: 5 of stats and 5 of count
591
     mov         r4d, r4m
592
@@ -3099,6 +3561,7 @@
593
     add         [r1 + 4 * 4], r6d
594
     RET
595
 
596
+%endif ; HIGH_BIT_DEPTH == 0
597
 
598
 INIT_YMM avx2
599
 cglobal saoCuStatsE3, 4,10,16           ; Stack: 5 of stats and 5 of count
600
@@ -3297,6 +3760,9 @@
601
 
602
 INIT_XMM sse4
603
 cglobal pelFilterLumaStrong_H, 5,7,10
604
+%if HIGH_BIT_DEPTH
605
+    add             r2d, r2d
606
+%endif
607
     mov             r1, r2
608
     neg             r3d
609
     neg             r4d
610
@@ -3305,6 +3771,16 @@
611
     lea             r5, [r2 * 3]
612
     lea             r6, [r1 * 3]
613
 
614
+%if HIGH_BIT_DEPTH
615
+    movu            m4, [r0]                ; src[0]
616
+    movu            m3, [r0 + r1]           ; src[-offset]
617
+    movu            m2, [r0 + r1 * 2]       ; src[-offset * 2]
618
+    movu            m1, [r0 + r6]           ; src[-offset * 3]
619
+    movu            m0, [r0 + r1 * 4]       ; src[-offset * 4]
620
+    movu            m5, [r0 + r2]           ; src[offset]
621
+    movu            m6, [r0 + r2 * 2]       ; src[offset * 2]
622
+    movu            m7, [r0 + r5]           ; src[offset * 3]
623
+%else
624
     pmovzxbw        m4, [r0]                ; src[0]
625
     pmovzxbw        m3, [r0 + r1]           ; src[-offset]
626
     pmovzxbw        m2, [r0 + r1 * 2]       ; src[-offset * 2]
627
@@ -3313,6 +3789,7 @@
628
     pmovzxbw        m5, [r0 + r2]           ; src[offset]
629
     pmovzxbw        m6, [r0 + r2 * 2]       ; src[offset * 2]
630
     pmovzxbw        m7, [r0 + r5]           ; src[offset * 3]
631
+%endif
632
 
633
     paddw           m0, m0                  ; m0*2
634
     mova            m8, m2
635
@@ -3380,6 +3857,15 @@
636
     paddw           m0, m1
637
     paddw           m3, m4
638
     paddw           m9, m5
639
+
640
+%if HIGH_BIT_DEPTH
641
+    movh            [r0 + r6], m0
642
+    movhps          [r0 + r1], m0
643
+    movh            [r0], m3
644
+    movhps          [r0 + r2 * 2], m3
645
+    movh            [r0 + r2 * 1], m9
646
+    movhps          [r0 + r1 * 2], m9
647
+%else
648
     packuswb        m0, m0
649
     packuswb        m3, m9
650
 
651
@@ -3389,14 +3875,41 @@
652
     pextrd          [r0 + r2 * 2], m3, 1
653
     pextrd          [r0 + r2 * 1], m3, 2
654
     pextrd          [r0 + r1 * 2], m3, 3
655
+%endif
656
     RET
657
 
658
 INIT_XMM sse4
659
 cglobal pelFilterLumaStrong_V, 5,5,10
660
+%if HIGH_BIT_DEPTH
661
+    add             r1d, r1d
662
+%endif
663
     neg             r3d
664
     neg             r4d
665
     lea             r2, [r1 * 3]
666
 
667
+%if HIGH_BIT_DEPTH
668
+    movu            m0, [r0 - 8]            ; src[-offset * 4] row 0
669
+    movu            m1, [r0 + r1 * 1 - 8]   ; src[-offset * 4] row 1
670
+    movu            m2, [r0 + r1 * 2 - 8]   ; src[-offset * 4] row 2
671
+    movu            m3, [r0 + r2 * 1 - 8]   ; src[-offset * 4] row 3
672
+
673
+    punpckhwd       m4, m0, m1              ; [m4 m4 m5 m5 m6 m6 m7 m7]
674
+    punpcklwd       m0, m1                  ; [m0 m0 m1 m1 m2 m2 m3 m3]
675
+
676
+    punpckhwd       m5, m2, m3              ; [m4 m4 m5 m5 m6 m6 m7 m7]
677
+    punpcklwd       m2, m3                  ; [m0 m0 m1 m1 m2 m2 m3 m3]
678
+
679
+    punpckhdq       m3, m0, m2              ; [m2 m2 m2 m2 m3 m3 m3 m3]
680
+    punpckldq       m0, m2                  ; [m0 m0 m0 m0 m1 m1 m1 m1]
681
+    psrldq          m1, m0, 8               ; [m1 m1 m1 m1 x x x x]
682
+    mova            m2, m3                  ; [m2 m2 m2 m2 x x x x]
683
+    punpckhqdq      m3, m3                  ; [m3 m3 m3 m3 x x x x]
684
+
685
+    punpckhdq       m6, m4, m5              ; [m6 m6 m6 m6 m7 m7 m7 m7]
686
+    punpckldq       m4, m5                  ; [m4 m4 m4 m4 m5 m5 m5 m5]
687
+    psrldq          m7, m6, 8
688
+    psrldq          m5, m4, 8
689
+%else
690
     movh            m0, [r0 - 4]            ; src[-offset * 4] row 0
691
     movh            m1, [r0 + r1 * 1 - 4]   ; src[-offset * 4] row 1
692
     movh            m2, [r0 + r1 * 2 - 4]   ; src[-offset * 4] row 2
693
@@ -3429,6 +3942,7 @@
694
     pmovzxbw        m5, m5
695
     pmovzxbw        m6, m6
696
     pmovzxbw        m7, m7
697
+%endif
698
 
699
     paddw           m0, m0                  ; m0*2
700
     mova            m8, m2
701
@@ -3496,6 +4010,35 @@
702
     paddw           m0, m1
703
     paddw           m3, m4
704
     paddw           m9, m5
705
+
706
+%if HIGH_BIT_DEPTH
707
+    ; 4x6 output rows -
708
+    ; m0 - col 0
709
+    ; m3 - col 3
710
+
711
+    psrldq           m1, m0, 8
712
+    psrldq           m2, m3, 8
713
+
714
+    mova            m4, m9
715
+    psrldq          m5, m9, 8
716
+
717
+    ; transpose 4x6 to 6x4
718
+    punpcklwd       m0, m5
719
+    punpcklwd       m1, m3
720
+    punpcklwd       m4, m2
721
+
722
+    punpckldq       m9, m0, m1
723
+    punpckhdq       m0, m1
724
+
725
+    movh            [r0 + r1 * 0 - 6], m9
726
+    movhps          [r0 + r1 * 1 - 6], m9
727
+    movh            [r0 + r1 * 2 - 6], m0
728
+    movhps          [r0 + r2 * 1 - 6], m0
729
+    pextrd          [r0 + r1 * 0 + 2], m4, 0
730
+    pextrd          [r0 + r1 * 1 + 2], m4, 1
731
+    pextrd          [r0 + r1 * 2 + 2], m4, 2
732
+    pextrd          [r0 + r2 * 1 + 2], m4, 3
733
+%else
734
     packuswb        m0, m0
735
     packuswb        m3, m9
736
 
737
@@ -3525,5 +4068,143 @@
738
     pextrw          [r0 + r1 * 1 + 1], m4, 1
739
     pextrw          [r0 + r1 * 2 + 1], m4, 2
740
     pextrw          [r0 + r2 * 1 + 1], m4, 3
741
+%endif
742
+    RET
743
+%endif ; ARCH_X86_64
744
+
745
+%if ARCH_X86_64
746
+INIT_XMM sse4
747
+cglobal pelFilterChroma_H, 6,6,5
748
+%if HIGH_BIT_DEPTH
749
+    add             r2d, r2d
750
+%endif
751
+    mov             r1, r2
752
+    neg             r3d
753
+    neg             r1
754
+
755
+%if HIGH_BIT_DEPTH
756
+    movu            m4, [r0]                ; src[0]
757
+    movu            m3, [r0 + r1]           ; src[-offset]
758
+    movu            m0, [r0 + r2]           ; src[offset]
759
+    movu            m2, [r0 + r1 * 2]       ; src[-offset * 2]
760
+%else
761
+    pmovzxbw        m4, [r0]                ; src[0]
762
+    pmovzxbw        m3, [r0 + r1]           ; src[-offset]
763
+    pmovzxbw        m0, [r0 + r2]           ; src[offset]
764
+    pmovzxbw        m2, [r0 + r1 * 2]       ; src[-offset * 2]
765
+%endif
766
+
767
+    psubw           m1, m4, m3              ; m4 - m3
768
+    psubw           m2, m0                  ; m2 - m5
769
+    paddw           m2, [pw_4]
770
+    psllw           m1, 2                   ; (m4 - m3) * 4
771
+    paddw           m1, m2
772
+    psraw           m1, 3
773
+
774
+    movd            m0, r3d
775
+    pshufb          m0, [pb_01]             ; -tc
776
+
777
+    pmaxsw          m1, m0
778
+    psignw          m0, [pw_n1]
779
+    pminsw          m1, m0                  ; delta
780
+    punpcklqdq      m1, m1
781
+
782
+    shl             r5d, 16
783
+    or              r5w, r4w
784
+    punpcklqdq      m3, m4
785
+    mova            m2, [pw_1_ffff]
786
+
787
+    movd            m0, r5d
788
+    pshufb          m0, [pb_0123]
789
+
790
+    pand            m0, m1                  ; (delta & maskP) (delta & maskQ)
791
+    psignw          m0, m2
792
+    paddw           m3, m0
793
+
794
+    pxor            m0, m0
795
+    pmaxsw          m3, m0
796
+    pminsw          m3, [pw_pixel_max]
797
+
798
+%if HIGH_BIT_DEPTH
799
+    movh            [r0 + r1], m3
800
+    movhps          [r0], m3
801
+%else
802
+    packuswb        m3, m3
803
+    movd            [r0 + r1], m3
804
+    pextrd          [r0], m3, 1
805
+%endif
806
+    RET
807
+
808
+INIT_XMM sse4
809
+cglobal pelFilterChroma_V, 6,6,5
810
+%if HIGH_BIT_DEPTH
811
+    add             r1d, r1d
812
+%endif
813
+    neg             r3d
814
+    lea             r2, [r1 * 3]
815
+
816
+%if HIGH_BIT_DEPTH
817
+    movu            m4, [r0 + r1 * 0 - 4]   ; src[-offset*2, -offset, 0, offset] [m2 m3 m4 m5]
818
+    movu            m3, [r0 + r1 * 1 - 4]
819
+    movu            m0, [r0 + r1 * 2 - 4]
820
+    movu            m2, [r0 + r2 * 1 - 4]
821
+%else
822
+    pmovzxbw        m4, [r0 + r1 * 0 - 2]   ; src[-offset*2, -offset, 0, offset] [m2 m3 m4 m5]
823
+    pmovzxbw        m3, [r0 + r1 * 1 - 2]
824
+    pmovzxbw        m0, [r0 + r1 * 2 - 2]
825
+    pmovzxbw        m2, [r0 + r2 * 1 - 2]
826
+%endif
827
+    punpcklwd       m4, m3
828
+    punpcklwd       m0, m2
829
+    punpckldq       m2, m4, m0              ; [m2 m2 m2 m2 m3 m3 m3 m3]
830
+    punpckhdq       m4, m0                  ; [m4 m4 m4 m4 m5 m5 m5 m5]
831
+    psrldq          m3, m2, 8
832
+    psrldq          m0, m4, 8
833
+
834
+    psubw           m1, m4, m3              ; m4 - m3
835
+    psubw           m2, m0                  ; m2 - m5
836
+    paddw           m2, [pw_4]
837
+    psllw           m1, 2                   ; (m4 - m3) * 4
838
+    paddw           m1, m2
839
+    psraw           m1, 3
840
+
841
+    movd            m0, r3d
842
+    pshufb          m0, [pb_01]             ; -tc
843
+
844
+    pmaxsw          m1, m0
845
+    psignw          m0, [pw_n1]
846
+    pminsw          m1, m0                  ; delta
847
+    punpcklqdq      m1, m1
848
+
849
+    shl             r5d, 16
850
+    or              r5w, r4w
851
+    punpcklqdq      m3, m4
852
+    mova            m2, [pw_1_ffff]
853
+
854
+    movd            m0, r5d
855
+    pshufb          m0, [pb_0123]
856
+
857
+    pand            m0, m1                  ; (delta & maskP) (delta & maskQ)
858
+    psignw          m0, m2
859
+    paddw           m3, m0
860
+
861
+    pxor            m0, m0
862
+    pmaxsw          m3, m0
863
+    pminsw          m3, [pw_pixel_max]
864
+
865
+%if HIGH_BIT_DEPTH
866
+    pshufb          m3, [pw_shuf_off4]
867
+    pextrd          [r0 + r1 * 0 - 2], m3, 0
868
+    pextrd          [r0 + r1 * 1 - 2], m3, 1
869
+    pextrd          [r0 + r1 * 2 - 2], m3, 2
870
+    pextrd          [r0 + r2 * 1 - 2], m3, 3
871
+%else
872
+    packuswb        m3, m3
873
+    pshufb          m3, [pb_shuf_off4]
874
+    pextrw          [r0 + r1 * 0 - 1], m3, 0
875
+    pextrw          [r0 + r1 * 1 - 1], m3, 1
876
+    pextrw          [r0 + r1 * 2 - 1], m3, 2
877
+    pextrw          [r0 + r2 * 1 - 1], m3, 3
878
+%endif
879
     RET
880
 %endif ; ARCH_X86_64
881
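
For reference, the pelFilterChroma_H/V kernels added above vectorize the HEVC
chroma deblocking step (four positions per call); their prototypes are declared
in loopfilter.h below. A minimal scalar sketch of the same computation, with
pixel and PIXEL_MAX as stand-ins for the build's pixel type and range:

    #include <cstdint>
    typedef uint16_t pixel;                  // uint8_t in 8-bit builds
    #define PIXEL_MAX ((1 << 10) - 1)        // pw_pixel_max for a 10-bit build

    static inline int clip3(int lo, int hi, int v) { return v < lo ? lo : v > hi ? hi : v; }

    void pelFilterChromaRef(pixel* src, intptr_t srcStep, intptr_t offset,
                            int tc, int maskP, int maskQ)
    {
        for (int i = 0; i < 4; i++, src += srcStep)
        {
            int p1 = src[-offset * 2], p0 = src[-offset];
            int q0 = src[0],           q1 = src[offset];
            // ((q0 - p0) * 4 + p1 - q1 + 4) >> 3, clamped to [-tc, tc]
            int delta = clip3(-tc, tc, ((q0 - p0) * 4 + p1 - q1 + 4) >> 3);
            if (maskP) src[-offset] = (pixel)clip3(0, PIXEL_MAX, p0 + delta);
            if (maskQ) src[0]       = (pixel)clip3(0, PIXEL_MAX, q0 - delta);
        }
    }
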
x265_1.9.tar.gz/source/common/x86/loopfilter.h -> x265_2.0.tar.gz/source/common/x86/loopfilter.h Changed
9
 
1
@@ -48,5 +48,7 @@
2
 
3
 void PFX(pelFilterLumaStrong_V_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
4
 void PFX(pelFilterLumaStrong_H_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
5
+void PFX(pelFilterChroma_V_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ);
6
+void PFX(pelFilterChroma_H_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ);
7
 
8
 #endif // ifndef X265_LOOPFILTER_H
9
x265_1.9.tar.gz/source/common/x86/mc-a.asm -> x265_2.0.tar.gz/source/common/x86/mc-a.asm Changed
26
 
1
@@ -53,7 +53,6 @@
2
              times 8 db 2
3
              times 8 db 4
4
              times 8 db 6
5
-sq_1: times 1 dq 1
6
 
7
 SECTION .text
8
 
9
@@ -74,6 +73,7 @@
10
 cextern pw_pixel_max
11
 cextern pd_32
12
 cextern pd_64
13
+cextern pq_1
14
 
15
 ;====================================================================================================================
16
 ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
17
@@ -3638,7 +3638,7 @@
18
     mova        m3, [r4+16]
19
     movd        m2, [r4+32]         ; denom
20
     mova        m4, [pw_pixel_max]
21
-    paddw       m2, [sq_1]          ; denom+1
22
+    paddw       m2, [pq_1]          ; denom+1
23
 %endmacro
24
 
25
 ; src1, src2
26
x265_1.9.tar.gz/source/common/x86/mc-a2.asm -> x265_2.0.tar.gz/source/common/x86/mc-a2.asm Changed
151
 
1
@@ -43,11 +43,11 @@
2
 deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
3
 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
4
 %endif
5
-pw_1024: times 16 dw 1024
6
 
7
-pd_16: times 4 dd 16
8
-pd_0f: times 4 dd 0xffff
9
-pf_inv256: times 8 dd 0.00390625
10
+cutree_fix8_unpack_shuf: db -1,-1, 0, 1,-1,-1, 2, 3,-1,-1, 4, 5,-1,-1, 6, 7
11
+                         db -1,-1, 8, 9,-1,-1,10,11,-1,-1,12,13,-1,-1,14,15
12
+
13
+const pq_256,       times 4 dq 256.0
14
 const pd_inv256,    times 4 dq 0.00390625
15
 const pd_0_5,       times 4 dq 0.5
16
 
17
@@ -59,9 +59,11 @@
18
 cextern pw_32
19
 cextern pw_512
20
 cextern pw_00ff
21
+cextern pw_1024
22
 cextern pw_3fff
23
 cextern pw_pixel_max
24
 cextern pd_ffff
25
+cextern pd_16
26
 
27
 ;The hpel_filter routines use non-temporal writes for output.
28
 ;The following defines may be uncommented for testing.
29
@@ -1215,3 +1217,121 @@
30
 
31
 INIT_YMM avx2
32
 MBTREE_AVX
33
+
34
+
35
+%macro CUTREE_FIX8 0
36
+;-----------------------------------------------------------------------------
37
+; void cutree_fix8_pack( uint16_t *dst, double *src, int count )
38
+;-----------------------------------------------------------------------------
39
+cglobal cutree_fix8_pack, 3, 4, 5
40
+    movapd       m2, [pq_256]
41
+    sub          r2d, mmsize / 2
42
+    movsxdifnidn r2, r2d
43
+    lea          r1, [r1 + 8 * r2]
44
+    lea          r0, [r0 + 2 * r2]
45
+    neg          r2
46
+    jg .skip_loop
47
+.loop:
48
+    mulpd        m0, m2, [r1 + 8 * r2]
49
+    mulpd        m1, m2, [r1 + 8 * r2 + mmsize]
50
+    mulpd        m3, m2, [r1 + 8 * r2 + 2 * mmsize]
51
+    mulpd        m4, m2, [r1 + 8 * r2 + 3 * mmsize]
52
+    cvttpd2dq    xm0, m0
53
+    cvttpd2dq    xm1, m1
54
+    cvttpd2dq    xm3, m3
55
+    cvttpd2dq    xm4, m4
56
+%if mmsize == 32
57
+    vinserti128  m0, m0, xm3, 1
58
+    vinserti128  m1, m1, xm4, 1
59
+    packssdw     m0, m1
60
+%else
61
+    punpcklqdq   m0, m1
62
+    punpcklqdq   m3, m4
63
+    packssdw     m0, m3
64
+%endif
65
+    mova         [r0 + 2 * r2], m0
66
+    add          r2, mmsize / 2
67
+    jle .loop
68
+.skip_loop:
69
+    sub          r2, mmsize / 2
70
+    jz .end
71
+    ; Do the remaining values in scalar in order to avoid overreading src.
72
+.scalar:
73
+    movq         xm0, [r1 + 8 * r2 + 4 * mmsize] 
74
+    mulsd        xm0, xm2
75
+    cvttsd2si    r3d, xm0
76
+    mov          [r0 + 2 * r2 + mmsize], r3w
77
+    inc          r2
78
+    jl .scalar
79
+.end:
80
+    RET
81
+
82
+;-----------------------------------------------------------------------------
83
+; void cutree_fix8_unpack( double *dst, uint16_t *src, int count )
84
+;-----------------------------------------------------------------------------
85
+cglobal cutree_fix8_unpack, 3, 4, 7
86
+%if mmsize != 32
87
+    mova           m4, [cutree_fix8_unpack_shuf+16]
88
+%endif
89
+    movapd         m2, [pd_inv256]
90
+    mova           m3, [cutree_fix8_unpack_shuf]
91
+    sub            r2d, mmsize / 2
92
+    movsxdifnidn   r2, r2d
93
+    lea            r1, [r1 + 2 * r2]
94
+    lea            r0, [r0 + 8 * r2]
95
+    neg            r2
96
+    jg .skip_loop
97
+.loop:
98
+%if mmsize == 32
99
+    vbroadcasti128 m0, [r1 + 2 * r2]
100
+    vbroadcasti128 m1, [r1 + 2 * r2 + 16]
101
+    pshufb         m0, m3
102
+    pshufb         m1, m3
103
+%else
104
+    mova           m1, [r1 + 2 * r2]
105
+    pshufb         m0, m1, m3
106
+    pshufb         m1, m4
107
+%endif
108
+    psrad          m0, 16 ; sign-extend
109
+    psrad          m1, 16
110
+    cvtdq2pd       m5, xm0
111
+    cvtdq2pd       m6, xm1
112
+%if mmsize == 32
113
+    vpermq         m0, m0, q1032
114
+    vpermq         m1, m1, q1032
115
+%else
116
+    psrldq         m0, 8
117
+    psrldq         m1, 8
118
+%endif
119
+    cvtdq2pd       m0, xm0
120
+    cvtdq2pd       m1, xm1
121
+    mulpd          m0, m2
122
+    mulpd          m1, m2
123
+    mulpd          m5, m2
124
+    mulpd          m6, m2
125
+    movapd         [r0 + 8 * r2], m5
126
+    movapd         [r0 + 8 * r2 + mmsize], m0
127
+    movapd         [r0 + 8 * r2 + mmsize * 2], m6
128
+    movapd         [r0 + 8 * r2 + mmsize * 3], m1
129
+    add            r2, mmsize / 2
130
+    jle .loop
131
+.skip_loop:
132
+    sub            r2, mmsize / 2
133
+    jz .end
134
+.scalar:
135
+    movzx          r3d, word [r1 + 2 * r2 + mmsize]
136
+    movsx          r3d, r3w
137
+    cvtsi2sd       xm0, r3d
138
+    mulsd          xm0, xm2
139
+    movsd          [r0 + 8 * r2 + 4 * mmsize], xm0
140
+    inc            r2
141
+    jl .scalar
142
+.end:
143
+    RET
144
+%endmacro
145
+
146
+INIT_XMM ssse3
147
+CUTREE_FIX8
148
+
149
+INIT_YMM avx2
150
+CUTREE_FIX8
151
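
The new cutree_fix8 kernels convert CU-tree propagation data between double and
a signed 8.8 fixed-point format, presumably to shrink the multi-pass stats that
carry it. A scalar reference of what the SIMD loops compute:

    #include <stdint.h>

    // pack: scale by 256 (pq_256), truncate toward zero (cvttpd2dq),
    // saturate to int16 (packssdw), store the raw 16 bits
    static void cutreeFix8PackRef(uint16_t* dst, const double* src, int count)
    {
        for (int i = 0; i < count; i++)
        {
            int v = (int)(src[i] * 256.0);
            if (v >  32767) v =  32767;
            if (v < -32768) v = -32768;
            dst[i] = (uint16_t)v;
        }
    }

    // unpack: shuffle into the high word, psrad 16 to sign-extend,
    // then scale by pd_inv256 = 1/256
    static void cutreeFix8UnpackRef(double* dst, const uint16_t* src, int count)
    {
        for (int i = 0; i < count; i++)
            dst[i] = (int16_t)src[i] * (1.0 / 256.0);
    }
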
x265_1.9.tar.gz/source/common/x86/mc.h -> x265_2.0.tar.gz/source/common/x86/mc.h Changed
22
 
1
@@ -46,4 +46,20 @@
2
 
3
 #undef PROPAGATE_COST
4
 
5
+#define FIX8UNPACK(cpu) \
6
+    void PFX(cutree_fix8_unpack_ ## cpu)(double *dst, uint16_t *src, int count);
7
+
8
+FIX8UNPACK(ssse3)
9
+FIX8UNPACK(avx2)
10
+
11
+#undef FIX8UNPACK
12
+
13
+#define FIX8PACK(cpu) \
14
+    void PFX(cutree_fix8_pack_## cpu)(uint16_t *dst, double *src, int count);
15
+
16
+FIX8PACK(ssse3)
17
+FIX8PACK(avx2)
18
+
19
+#undef FIX8PACK
20
+
21
 #endif // ifndef X265_MC_H
22
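
Each FIX8PACK/FIX8UNPACK use above stamps out one prototype per CPU flavor;
through the PFX name-prefix macro, FIX8PACK(ssse3) expands to roughly:

    void x265_cutree_fix8_pack_ssse3(uint16_t* dst, double* src, int count);
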
x265_1.9.tar.gz/source/common/x86/pixel-a.asm -> x265_2.0.tar.gz/source/common/x86/pixel-a.asm Changed
609
 
1
@@ -50,9 +50,6 @@
2
 transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
3
 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
4
 
5
-sw_f0:     dq 0xfff0, 0
6
-pd_f0:     times 4 dd 0xffff0000
7
-
8
 SECTION .text
9
 
10
 cextern pb_0
11
@@ -67,7 +64,6 @@
12
 cextern pw_pmpmpmpm
13
 cextern pw_pmmpzzzz
14
 cextern pd_1
15
-cextern popcnt_table
16
 cextern pd_2
17
 cextern hmul_16p
18
 cextern pb_movemask
19
@@ -13803,3 +13799,589 @@
20
     movzx           eax, al
21
     RET
22
 %endif ; ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
23
+
24
+
25
+%if HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10
26
+%macro LOAD_DIFF_AVX2 4
27
+    movu       %1, %3
28
+    movu       %2, %4
29
+    psubw      %1, %2
30
+%endmacro
31
+
32
+%macro LOAD_DIFF_8x4P_AVX2 6-8 r0,r2 ; 4x dest, 2x temp, 2x pointer
33
+    LOAD_DIFF_AVX2 xm%1, xm%5, [%7],      [%8]
34
+    LOAD_DIFF_AVX2 xm%2, xm%6, [%7+r1],   [%8+r3]
35
+    LOAD_DIFF_AVX2 xm%3, xm%5, [%7+2*r1], [%8+2*r3]
36
+    LOAD_DIFF_AVX2 xm%4, xm%6, [%7+r4],   [%8+r5]
37
+
38
+    ;lea %7, [%7+4*r1]
39
+    ;lea %8, [%8+4*r3]
40
+%endmacro
41
+
42
+INIT_YMM avx2
43
+cglobal pixel_satd_8x8, 4,4,7
44
+
45
+    FIX_STRIDES r1, r3
46
+    pxor    xm6, xm6
47
+
48
+    ; load_diff 0 & 4
49
+    movu    xm0, [r0]
50
+    movu    xm1, [r2]
51
+    vinserti128 m0, m0, [r0 + r1 * 4], 1
52
+    vinserti128 m1, m1, [r2 + r3 * 4], 1
53
+    psubw   m0, m1
54
+    add     r0, r1
55
+    add     r2, r3
56
+
57
+    ; load_diff 1 & 5
58
+    movu    xm1, [r0]
59
+    movu    xm2, [r2]
60
+    vinserti128 m1, m1, [r0 + r1 * 4], 1
61
+    vinserti128 m2, m2, [r2 + r3 * 4], 1
62
+    psubw   m1, m2
63
+    add     r0, r1
64
+    add     r2, r3
65
+
66
+    ; load_diff 2 & 6
67
+    movu    xm2, [r0]
68
+    movu    xm3, [r2]
69
+    vinserti128 m2, m2, [r0 + r1 * 4], 1
70
+    vinserti128 m3, m3, [r2 + r3 * 4], 1
71
+    psubw   m2, m3
72
+    add     r0, r1
73
+    add     r2, r3
74
+
75
+    ; load_diff 3 & 7
76
+    movu    xm3, [r0]
77
+    movu    xm4, [r2]
78
+    vinserti128 m3, m3, [r0 + r1 * 4], 1
79
+    vinserti128 m4, m4, [r2 + r3 * 4], 1
80
+    psubw   m3, m4
81
+
82
+    SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
83
+
84
+    vextracti128 xm0, m6, 1
85
+    paddw xm6, xm0
86
+    HADDUW xm6, xm0
87
+    movd   eax, xm6
88
+    RET
89
+
90
+INIT_XMM avx2
91
+cglobal pixel_sa8d_8x8_internal
92
+    lea  r6, [r0+4*r1]
93
+    lea  r7, [r2+4*r3]
94
+    LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
95
+    LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
96
+
97
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
98
+    ;HADAMARD2_2D 0, 1, 2, 8, 6, wd
99
+    ;HADAMARD2_2D 4, 5, 3, 9, 6, wd
100
+    ;HADAMARD2_2D 0, 2, 1, 8, 6, dq
101
+    ;HADAMARD2_2D 4, 3, 5, 9, 6, dq
102
+    ;HADAMARD2_2D 0, 4, 2, 3, 6, qdq, amax
103
+    ;HADAMARD2_2D 1, 5, 8, 9, 6, qdq, amax
104
+
105
+    paddw m0, m1
106
+    paddw m0, m2
107
+    paddw m0, m8
108
+    SAVE_MM_PERMUTATION
109
+    ret
110
+
111
+
112
+INIT_XMM avx2
113
+cglobal pixel_sa8d_8x8, 4,8,12
114
+    FIX_STRIDES r1, r3
115
+    lea  r4, [3*r1]
116
+    lea  r5, [3*r3]
117
+    call pixel_sa8d_8x8_internal
118
+    HADDUW m0, m1
119
+    movd eax, m0
120
+    add eax, 1
121
+    shr eax, 1
122
+    RET
123
+
124
+
125
+INIT_YMM avx2
126
+cglobal pixel_sa8d_16x16, 4,8,12
127
+    FIX_STRIDES r1, r3
128
+    lea  r4, [3*r1]
129
+    lea  r5, [3*r3]
130
+    lea  r6, [r0+4*r1]
131
+    lea  r7, [r2+4*r3]
132
+    vbroadcasti128 m7, [pw_1]
133
+
134
+    ; Top 16x8
135
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
136
+    movu m0, [r0]                                   ; 10 bits
137
+    movu m5, [r2]
138
+    psubw m0, m5                                    ; 11 bits
139
+    movu m1, [r0 + r1]
140
+    movu m6, [r2 + r3]
141
+    psubw m1, m6
142
+    movu m2, [r0 + r1 * 2]
143
+    movu m5, [r2 + r3 * 2]
144
+    psubw m2, m5
145
+    movu m8, [r0 + r4]
146
+    movu m6, [r2 + r5]
147
+    psubw m8, m6
148
+
149
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
150
+    movu m4, [r6]
151
+    movu m11, [r7]
152
+    psubw m4, m11
153
+    movu m5, [r6 + r1]
154
+    movu m6, [r7 + r3]
155
+    psubw m5, m6
156
+    movu m3, [r6 + r1 * 2]
157
+    movu m11, [r7 + r3 * 2]
158
+    psubw m3, m11
159
+    movu m9, [r6 + r4]
160
+    movu m6, [r7 + r5]
161
+    psubw m9, m6
162
+
163
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax    ; 16 bits
164
+    pmaddwd m0, m7
165
+    pmaddwd m1, m7
166
+    pmaddwd m2, m7
167
+    pmaddwd m8, m7
168
+    paddd m0, m1
169
+    paddd m2, m8
170
+    paddd m10, m0, m2
171
+
172
+    lea  r0, [r0+8*r1]
173
+    lea  r2, [r2+8*r3]
174
+    lea  r6, [r6+8*r1]
175
+    lea  r7, [r7+8*r3]
176
+
177
+    ; Bottom 16x8
178
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
179
+    movu m0, [r0]
180
+    movu m5, [r2]
181
+    psubw m0, m5
182
+    movu m1, [r0 + r1]
183
+    movu m6, [r2 + r3]
184
+    psubw m1, m6
185
+    movu m2, [r0 + r1 * 2]
186
+    movu m5, [r2 + r3 * 2]
187
+    psubw m2, m5
188
+    movu m8, [r0 + r4]
189
+    movu m6, [r2 + r5]
190
+    psubw m8, m6
191
+
192
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
193
+    movu m4, [r6]
194
+    movu m11, [r7]
195
+    psubw m4, m11
196
+    movu m5, [r6 + r1]
197
+    movu m6, [r7 + r3]
198
+    psubw m5, m6
199
+    movu m3, [r6 + r1 * 2]
200
+    movu m11, [r7 + r3 * 2]
201
+    psubw m3, m11
202
+    movu m9, [r6 + r4]
203
+    movu m6, [r7 + r5]
204
+    psubw m9, m6
205
+
206
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
207
+    pmaddwd m0, m7
208
+    pmaddwd m1, m7
209
+    pmaddwd m2, m7
210
+    pmaddwd m8, m7
211
+    paddd m0, m1
212
+    paddd m2, m8
213
+    paddd m10, m0
214
+    paddd m10, m2
215
+
216
+    HADDD m10, m0
217
+
218
+    movd eax, xm10
219
+    add  eax, 1
220
+    shr  eax, 1
221
+    RET
222
+
223
+
224
+; TODO: optimize me, needs 2 more YMM registers because the C model produces a partial result for every 16x16 block
225
+INIT_YMM avx2
226
+cglobal pixel_sa8d_32x32, 4,8,14
227
+    FIX_STRIDES r1, r3
228
+    lea  r4, [3*r1]
229
+    lea  r5, [3*r3]
230
+    lea  r6, [r0+4*r1]
231
+    lea  r7, [r2+4*r3]
232
+    vbroadcasti128 m7, [pw_1]
233
+
234
+
235
+    ;SA8D[16x8] ; pix[0]
236
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
237
+    movu m0, [r0]
238
+    movu m5, [r2]
239
+    psubw m0, m5
240
+    movu m1, [r0 + r1]
241
+    movu m6, [r2 + r3]
242
+    psubw m1, m6
243
+    movu m2, [r0 + r1 * 2]
244
+    movu m5, [r2 + r3 * 2]
245
+    psubw m2, m5
246
+    movu m8, [r0 + r4]
247
+    movu m6, [r2 + r5]
248
+    psubw m8, m6
249
+
250
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
251
+    movu m4, [r6]
252
+    movu m11, [r7]
253
+    psubw m4, m11
254
+    movu m5, [r6 + r1]
255
+    movu m6, [r7 + r3]
256
+    psubw m5, m6
257
+    movu m3, [r6 + r1 * 2]
258
+    movu m11, [r7 + r3 * 2]
259
+    psubw m3, m11
260
+    movu m9, [r6 + r4]
261
+    movu m6, [r7 + r5]
262
+    psubw m9, m6
263
+
264
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
265
+    pmaddwd m0, m7
266
+    pmaddwd m1, m7
267
+    pmaddwd m2, m7
268
+    pmaddwd m8, m7
269
+    paddd m0, m1
270
+    paddd m2, m8
271
+    paddd m10, m0, m2
272
+
273
+
274
+    ; SA8D[16x8] ; pix[16]
275
+    add  r0, mmsize
276
+    add  r2, mmsize
277
+    add  r6, mmsize
278
+    add  r7, mmsize
279
+
280
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
281
+    movu m0, [r0]
282
+    movu m5, [r2]
283
+    psubw m0, m5
284
+    movu m1, [r0 + r1]
285
+    movu m6, [r2 + r3]
286
+    psubw m1, m6
287
+    movu m2, [r0 + r1 * 2]
288
+    movu m5, [r2 + r3 * 2]
289
+    psubw m2, m5
290
+    movu m8, [r0 + r4]
291
+    movu m6, [r2 + r5]
292
+    psubw m8, m6
293
+
294
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
295
+    movu m4, [r6]
296
+    movu m11, [r7]
297
+    psubw m4, m11
298
+    movu m5, [r6 + r1]
299
+    movu m6, [r7 + r3]
300
+    psubw m5, m6
301
+    movu m3, [r6 + r1 * 2]
302
+    movu m11, [r7 + r3 * 2]
303
+    psubw m3, m11
304
+    movu m9, [r6 + r4]
305
+    movu m6, [r7 + r5]
306
+    psubw m9, m6
307
+
308
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
309
+    pmaddwd m0, m7
310
+    pmaddwd m1, m7
311
+    pmaddwd m2, m7
312
+    pmaddwd m8, m7
313
+    paddd m0, m1
314
+    paddd m2, m8
315
+    paddd m12, m0, m2
316
+
317
+
318
+    ; SA8D[16x8] ; pix[8*stride+16]
319
+    lea  r0, [r0+8*r1]
320
+    lea  r2, [r2+8*r3]
321
+    lea  r6, [r6+8*r1]
322
+    lea  r7, [r7+8*r3]
323
+
324
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
325
+    movu m0, [r0]
326
+    movu m5, [r2]
327
+    psubw m0, m5
328
+    movu m1, [r0 + r1]
329
+    movu m6, [r2 + r3]
330
+    psubw m1, m6
331
+    movu m2, [r0 + r1 * 2]
332
+    movu m5, [r2 + r3 * 2]
333
+    psubw m2, m5
334
+    movu m8, [r0 + r4]
335
+    movu m6, [r2 + r5]
336
+    psubw m8, m6
337
+
338
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
339
+    movu m4, [r6]
340
+    movu m11, [r7]
341
+    psubw m4, m11
342
+    movu m5, [r6 + r1]
343
+    movu m6, [r7 + r3]
344
+    psubw m5, m6
345
+    movu m3, [r6 + r1 * 2]
346
+    movu m11, [r7 + r3 * 2]
347
+    psubw m3, m11
348
+    movu m9, [r6 + r4]
349
+    movu m6, [r7 + r5]
350
+    psubw m9, m6
351
+
352
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
353
+    pmaddwd m0, m7
354
+    pmaddwd m1, m7
355
+    pmaddwd m2, m7
356
+    pmaddwd m8, m7
357
+    paddd m0, m1
358
+    paddd m2, m8
359
+    paddd m12, m0
360
+    paddd m12, m2
361
+
362
+    ; sum[1]
363
+    HADDD m12, m0
364
+
365
+
366
+    ; SA8D[16x8] ; pix[8*stride]
367
+    sub  r0, mmsize
368
+    sub  r2, mmsize
369
+    sub  r6, mmsize
370
+    sub  r7, mmsize
371
+
372
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
373
+    movu m0, [r0]
374
+    movu m5, [r2]
375
+    psubw m0, m5
376
+    movu m1, [r0 + r1]
377
+    movu m6, [r2 + r3]
378
+    psubw m1, m6
379
+    movu m2, [r0 + r1 * 2]
380
+    movu m5, [r2 + r3 * 2]
381
+    psubw m2, m5
382
+    movu m8, [r0 + r4]
383
+    movu m6, [r2 + r5]
384
+    psubw m8, m6
385
+
386
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
387
+    movu m4, [r6]
388
+    movu m11, [r7]
389
+    psubw m4, m11
390
+    movu m5, [r6 + r1]
391
+    movu m6, [r7 + r3]
392
+    psubw m5, m6
393
+    movu m3, [r6 + r1 * 2]
394
+    movu m11, [r7 + r3 * 2]
395
+    psubw m3, m11
396
+    movu m9, [r6 + r4]
397
+    movu m6, [r7 + r5]
398
+    psubw m9, m6
399
+
400
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
401
+    pmaddwd m0, m7
402
+    pmaddwd m1, m7
403
+    pmaddwd m2, m7
404
+    pmaddwd m8, m7
405
+    paddd m0, m1
406
+    paddd m2, m8
407
+    paddd m10, m0
408
+    paddd m10, m2
409
+
410
+    ; sum[0]
411
+    HADDD m10, m0
412
+    punpckldq xm10, xm12
413
+
414
+
415
+    ;SA8D[16x8] ; pix[16*stride]
416
+    lea  r0, [r0+8*r1]
417
+    lea  r2, [r2+8*r3]
418
+    lea  r6, [r6+8*r1]
419
+    lea  r7, [r7+8*r3]
420
+
421
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
422
+    movu m0, [r0]
423
+    movu m5, [r2]
424
+    psubw m0, m5
425
+    movu m1, [r0 + r1]
426
+    movu m6, [r2 + r3]
427
+    psubw m1, m6
428
+    movu m2, [r0 + r1 * 2]
429
+    movu m5, [r2 + r3 * 2]
430
+    psubw m2, m5
431
+    movu m8, [r0 + r4]
432
+    movu m6, [r2 + r5]
433
+    psubw m8, m6
434
+
435
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
436
+    movu m4, [r6]
437
+    movu m11, [r7]
438
+    psubw m4, m11
439
+    movu m5, [r6 + r1]
440
+    movu m6, [r7 + r3]
441
+    psubw m5, m6
442
+    movu m3, [r6 + r1 * 2]
443
+    movu m11, [r7 + r3 * 2]
444
+    psubw m3, m11
445
+    movu m9, [r6 + r4]
446
+    movu m6, [r7 + r5]
447
+    psubw m9, m6
448
+
449
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
450
+    pmaddwd m0, m7
451
+    pmaddwd m1, m7
452
+    pmaddwd m2, m7
453
+    pmaddwd m8, m7
454
+    paddd m0, m1
455
+    paddd m2, m8
456
+    paddd m12, m0, m2
457
+
458
+
459
+    ; SA8D[16x8] ; pix[16*stride+16]
460
+    add  r0, mmsize
461
+    add  r2, mmsize
462
+    add  r6, mmsize
463
+    add  r7, mmsize
464
+
465
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
466
+    movu m0, [r0]
467
+    movu m5, [r2]
468
+    psubw m0, m5
469
+    movu m1, [r0 + r1]
470
+    movu m6, [r2 + r3]
471
+    psubw m1, m6
472
+    movu m2, [r0 + r1 * 2]
473
+    movu m5, [r2 + r3 * 2]
474
+    psubw m2, m5
475
+    movu m8, [r0 + r4]
476
+    movu m6, [r2 + r5]
477
+    psubw m8, m6
478
+
479
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
480
+    movu m4, [r6]
481
+    movu m11, [r7]
482
+    psubw m4, m11
483
+    movu m5, [r6 + r1]
484
+    movu m6, [r7 + r3]
485
+    psubw m5, m6
486
+    movu m3, [r6 + r1 * 2]
487
+    movu m11, [r7 + r3 * 2]
488
+    psubw m3, m11
489
+    movu m9, [r6 + r4]
490
+    movu m6, [r7 + r5]
491
+    psubw m9, m6
492
+
493
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
494
+    pmaddwd m0, m7
495
+    pmaddwd m1, m7
496
+    pmaddwd m2, m7
497
+    pmaddwd m8, m7
498
+    paddd m0, m1
499
+    paddd m2, m8
500
+    paddd m13, m0, m2
501
+
502
+
503
+    ; SA8D[16x8] ; pix[24*stride+16]
504
+    lea  r0, [r0+8*r1]
505
+    lea  r2, [r2+8*r3]
506
+    lea  r6, [r6+8*r1]
507
+    lea  r7, [r7+8*r3]
508
+
509
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
510
+    movu m0, [r0]
511
+    movu m5, [r2]
512
+    psubw m0, m5
513
+    movu m1, [r0 + r1]
514
+    movu m6, [r2 + r3]
515
+    psubw m1, m6
516
+    movu m2, [r0 + r1 * 2]
517
+    movu m5, [r2 + r3 * 2]
518
+    psubw m2, m5
519
+    movu m8, [r0 + r4]
520
+    movu m6, [r2 + r5]
521
+    psubw m8, m6
522
+
523
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
524
+    movu m4, [r6]
525
+    movu m11, [r7]
526
+    psubw m4, m11
527
+    movu m5, [r6 + r1]
528
+    movu m6, [r7 + r3]
529
+    psubw m5, m6
530
+    movu m3, [r6 + r1 * 2]
531
+    movu m11, [r7 + r3 * 2]
532
+    psubw m3, m11
533
+    movu m9, [r6 + r4]
534
+    movu m6, [r7 + r5]
535
+    psubw m9, m6
536
+
537
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
538
+    pmaddwd m0, m7
539
+    pmaddwd m1, m7
540
+    pmaddwd m2, m7
541
+    pmaddwd m8, m7
542
+    paddd m0, m1
543
+    paddd m2, m8
544
+    paddd m13, m0
545
+    paddd m13, m2
546
+
547
+    ; sum[3]
548
+    HADDD m13, m0
549
+
550
+
551
+    ; SA8D[16x8] ; pix[24*stride]
552
+    sub  r0, mmsize
553
+    sub  r2, mmsize
554
+    sub  r6, mmsize
555
+    sub  r7, mmsize
556
+
557
+    ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
558
+    movu m0, [r0]
559
+    movu m5, [r2]
560
+    psubw m0, m5
561
+    movu m1, [r0 + r1]
562
+    movu m6, [r2 + r3]
563
+    psubw m1, m6
564
+    movu m2, [r0 + r1 * 2]
565
+    movu m5, [r2 + r3 * 2]
566
+    psubw m2, m5
567
+    movu m8, [r0 + r4]
568
+    movu m6, [r2 + r5]
569
+    psubw m8, m6
570
+
571
+    ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
572
+    movu m4, [r6]
573
+    movu m11, [r7]
574
+    psubw m4, m11
575
+    movu m5, [r6 + r1]
576
+    movu m6, [r7 + r3]
577
+    psubw m5, m6
578
+    movu m3, [r6 + r1 * 2]
579
+    movu m11, [r7 + r3 * 2]
580
+    psubw m3, m11
581
+    movu m9, [r6 + r4]
582
+    movu m6, [r7 + r5]
583
+    psubw m9, m6
584
+
585
+    HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
586
+    pmaddwd m0, m7
587
+    pmaddwd m1, m7
588
+    pmaddwd m2, m7
589
+    pmaddwd m8, m7
590
+    paddd m0, m1
591
+    paddd m2, m8
592
+    paddd m12, m0
593
+    paddd m12, m2
594
+
595
+    ; sum[2]
596
+    HADDD m12, m0
597
+    punpckldq xm12, xm13
598
+
599
+    ; SA8D
600
+    punpcklqdq xm0, xm10, xm12
601
+    paddd xm0, [pd_1]
602
+    psrld xm0, 1
603
+    HADDD xm0, xm1
604
+
605
+    movd eax, xm0
606
+    RET
607
+
608
+%endif ; HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10
609
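
The BIT_DEPTH == 10 guard on the block above exists because the 8x8 Hadamard of
11-bit residuals (10-bit pixels after psubw) produces sums that no longer fit
the 16-bit word lanes the 8-bit kernels accumulate in. pmaddwd against pw_1 is
the widening step: it adds adjacent word pairs into 32-bit lanes,

    // effect of "pmaddwd m, [pw_1]" on one group of 8 word lanes
    static inline void widenPairwise(const short w[8], int d[4])
    {
        for (int i = 0; i < 4; i++)
            d[i] = (int)w[2 * i] + w[2 * i + 1];
    }

and the trailing "add eax, 1 / shr eax, 1" is the usual sa8d normalization,
(sum + 1) >> 1.
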
x265_1.9.tar.gz/source/common/yuv.cpp -> x265_2.0.tar.gz/source/common/yuv.cpp Changed
23
 
1
@@ -163,14 +163,19 @@
2
     }
3
 }
4
 
5
-void Yuv::addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL)
6
+void Yuv::addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL, int picCsp)
7
 {
8
     primitives.cu[log2SizeL - 2].add_ps(m_buf[0], m_size, srcYuv0.m_buf[0], srcYuv1.m_buf[0], srcYuv0.m_size, srcYuv1.m_size);
9
-    if (m_csp != X265_CSP_I400)
10
+    if (m_csp != X265_CSP_I400 && picCsp != X265_CSP_I400)
11
     {
12
         primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv1.m_buf[1], srcYuv0.m_csize, srcYuv1.m_csize);
13
         primitives.chroma[m_csp].cu[log2SizeL - 2].add_ps(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv1.m_buf[2], srcYuv0.m_csize, srcYuv1.m_csize);
14
     }
15
+    if (picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
16
+    {
17
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[1], m_csize, srcYuv0.m_buf[1], srcYuv0.m_csize);
18
+        primitives.chroma[m_csp].cu[m_part].copy_pp(m_buf[2], m_csize, srcYuv0.m_buf[2], srcYuv0.m_csize);
19
+    }
20
 }
21
 
22
 void Yuv::addAvg(const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t absPartIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma)
23
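
With the extra parameter, callers of addClip() now pass the colorspace of the
source picture, so a monochrome (I400) frame encoded inside a chroma-capable
stream gets its chroma planes copied from the prediction instead of being
reconstructed from a residual that was never coded. A call site would now look
roughly like this (variable names illustrative):

    recon.addClip(predYuv, resiYuv, log2CUSize, picCsp);
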
x265_1.9.tar.gz/source/common/yuv.h -> x265_2.0.tar.gz/source/common/yuv.h Changed
10
 
1
@@ -73,7 +73,7 @@
2
     void   copyPartToYuv(Yuv& dstYuv, uint32_t absPartIdx) const;
3
 
4
     // Clip(srcYuv0 + srcYuv1) -> m_buf .. aka recon = clip(pred + residual)
5
-    void   addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL);
6
+    void   addClip(const Yuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t log2SizeL, int picCsp);
7
 
8
     // (srcYuv0 + srcYuv1)/2 for YUV partition (bidir averaging)
9
     void   addAvg(const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, uint32_t absPartIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma);
10
x265_1.9.tar.gz/source/compat/msvc/stdint.h -> x265_2.0.tar.gz/source/compat/msvc/stdint.h Changed
9
 
1
@@ -8,6 +8,7 @@
2
 #if !defined(UINT64_MAX)
3
 #include <limits.h>
4
 #define UINT64_MAX _UI64_MAX
5
+#define INT64_MAX _I64_MAX
6
 #define INT16_MAX  _I16_MAX
7
 #endif
8
 
9
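
After this change the MSVC compatibility block provides INT64_MAX alongside the
existing limits, mapped to _I64_MAX from <limits.h>; in effect:

    #if !defined(UINT64_MAX)
    #include <limits.h>
    #define UINT64_MAX _UI64_MAX
    #define INT64_MAX  _I64_MAX
    #define INT16_MAX  _I16_MAX
    #endif
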
x265_1.9.tar.gz/source/encoder/analysis.cpp -> x265_2.0.tar.gz/source/encoder/analysis.cpp Changed
1507
 
1
@@ -74,14 +74,18 @@
2
 {
3
     m_reuseInterDataCTU = NULL;
4
     m_reuseRef = NULL;
5
-    m_reuseBestMergeCand = NULL;
6
-    m_reuseMv = NULL;
7
+    m_bHD = false;
8
 }
9
 bool Analysis::create(ThreadLocalData *tld)
10
 {
11
     m_tld = tld;
12
     m_bTryLossless = m_param->bCULossless && !m_param->bLossless && m_param->rdLevel >= 2;
13
-    m_bChromaSa8d = m_param->rdLevel >= 3;
14
+
15
+    int costArrSize = 1;
16
+    uint32_t maxDQPDepth = g_log2Size[m_param->maxCUSize] - g_log2Size[m_param->rc.qgSize];
17
+    for (uint32_t i = 1; i <= maxDQPDepth; i++)
18
+        costArrSize += (1 << (i * 2));
19
+    cacheCost = X265_MALLOC(uint64_t, costArrSize);
20
 
21
     int csp = m_param->internalCsp;
22
     uint32_t cuSize = g_maxCUSize;
23
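
The cacheCost allocation above reserves one slot per CU in the quad-tree from
the CTU down to the quantization-group size, i.e. the sum of 4^i for
i = 0..maxDQPDepth. For example, maxCUSize = 64 with rc.qgSize = 16 gives
maxDQPDepth = 6 - 4 = 2 and costArrSize = 1 + 4 + 16 = 21 slots.
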
@@ -102,6 +106,8 @@
24
             md.pred[j].fencYuv = &md.fencYuv;
25
         }
26
     }
27
+    if (m_param->sourceHeight >= 1080)
28
+        m_bHD = true;
29
 
30
     return ok;
31
 }
32
@@ -119,12 +125,14 @@
33
             m_modeDepth[i].pred[j].reconYuv.destroy();
34
         }
35
     }
36
+    X265_FREE(cacheCost);
37
 }
38
 
39
 Mode& Analysis::compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext)
40
 {
41
     m_slice = ctu.m_slice;
42
     m_frame = &frame;
43
+    m_bChromaSa8d = m_param->rdLevel >= 3;
44
 
45
 #if _DEBUG || CHECKED_BUILD
46
     invalidateContexts(0);
47
@@ -142,8 +150,13 @@
48
         int numPredDir = m_slice->isInterP() ? 1 : 2;
49
         m_reuseInterDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
50
         m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
51
-        m_reuseBestMergeCand = &m_reuseInterDataCTU->bestMergeCand[ctu.m_cuAddr * CUGeom::MAX_GEOMS];
52
-        m_reuseMv = &m_reuseInterDataCTU->mv[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
53
+        m_reuseDepth = &m_reuseInterDataCTU->depth[ctu.m_cuAddr * ctu.m_numPartitions];
54
+        m_reuseModes = &m_reuseInterDataCTU->modes[ctu.m_cuAddr * ctu.m_numPartitions];
55
+        m_reusePartSize = &m_reuseInterDataCTU->partSize[ctu.m_cuAddr * ctu.m_numPartitions];
56
+        m_reuseMergeFlag = &m_reuseInterDataCTU->mergeFlag[ctu.m_cuAddr * ctu.m_numPartitions];
57
+        if (m_param->analysisMode == X265_ANALYSIS_SAVE)
58
+            for (int i = 0; i < X265_MAX_PRED_MODE_PER_CTU * numPredDir; i++)
59
+                m_reuseRef[i] = -1;
60
     }
61
     ProfileCUScope(ctu, totalCTUTime, totalCTUs);
62
 
63
@@ -158,14 +171,6 @@
64
             memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], sizeof(uint8_t) * numPartition);
65
         }
66
         compressIntraCU(ctu, cuGeom, qp);
67
-        if (m_param->analysisMode == X265_ANALYSIS_SAVE && intraDataCTU)
68
-        {
69
-            CUData* bestCU = &m_modeDepth[0].bestMode->cu;
70
-            memcpy(&intraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
71
-            memcpy(&intraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition);
72
-            memcpy(&intraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition);
73
-            memcpy(&intraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], bestCU->m_chromaIntraDir, sizeof(uint8_t) * numPartition);
74
-        }
75
     }
76
     else
77
     {
78
@@ -189,18 +194,12 @@
79
         else if (m_param->rdLevel <= 4)
80
             compressInterCU_rd0_4(ctu, cuGeom, qp);
81
         else
82
-        {
83
-            uint32_t zOrder = 0;
84
-            compressInterCU_rd5_6(ctu, cuGeom, zOrder, qp);
85
-            if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_frame->m_analysisData.interData)
86
-            {
87
-                CUData* bestCU = &m_modeDepth[0].bestMode->cu;
88
-                memcpy(&m_reuseInterDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition);
89
-                memcpy(&m_reuseInterDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_predMode, sizeof(uint8_t) * numPartition);
90
-            }
91
-        }
92
+            compressInterCU_rd5_6(ctu, cuGeom, qp);
93
     }
94
 
95
+    if (m_param->bEnableRdRefine)
96
+        qprdRefine(ctu, cuGeom, qp, qp);
97
+
98
     return *m_modeDepth[0].bestMode;
99
 }
100
 
101
@@ -229,6 +228,61 @@
102
     }
103
 }
104
 
105
+void Analysis::qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp)
106
+{
107
+    uint32_t depth = cuGeom.depth;
108
+    ModeDepth& md = m_modeDepth[depth];
109
+    md.bestMode = NULL;
110
+
111
+    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
112
+
113
+    int bestCUQP = qp;
114
+    int lambdaQP = lqp;
115
+
116
+    bool doQPRefine = (bDecidedDepth && depth <= m_slice->m_pps->maxCuDQPDepth) || (!bDecidedDepth && depth == m_slice->m_pps->maxCuDQPDepth);
117
+
118
+    if (doQPRefine)
119
+    {
120
+        uint64_t bestCUCost, origCUCost, cuCost, cuPrevCost;
121
+
122
+        int cuIdx = (cuGeom.childOffset - 1) / 3;
123
+        bestCUCost = origCUCost = cacheCost[cuIdx];
124
+
125
+        for (int dir = 2; dir >= -2; dir -= 4)
126
+        {
127
+            int threshold = 1;
128
+            int failure = 0;
129
+            cuPrevCost = origCUCost;
130
+
131
+            int modCUQP = qp + dir;
132
+            while (modCUQP >= QP_MIN && modCUQP <= QP_MAX_SPEC)
133
+            {
134
+                recodeCU(parentCTU, cuGeom, modCUQP, qp);
135
+                cuCost = md.bestMode->rdCost;
136
+
137
+                COPY2_IF_LT(bestCUCost, cuCost, bestCUQP, modCUQP);
138
+                if (cuCost < cuPrevCost)
139
+                    failure = 0;
140
+                else
141
+                    failure++;
142
+
143
+                if (failure > threshold)
144
+                    break;
145
+
146
+                cuPrevCost = cuCost;
147
+                modCUQP += dir;
148
+            }
149
+        }
150
+        lambdaQP = bestCUQP;
151
+    }
152
+
153
+    recodeCU(parentCTU, cuGeom, bestCUQP, lambdaQP);
154
+
155
+    /* Copy best data to encData CTU and recon */
156
+    md.bestMode->cu.copyToPic(depth);
157
+    md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
158
+}
159
+
160
 void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
161
 {
162
     uint32_t depth = cuGeom.depth;
163
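
qprdRefine() above is a two-direction hill-climb around the inherited QP: the
CU is re-encoded at qp+2, qp+4, ... and then qp-2, qp-4, ..., and a direction
is abandoned after more than one consecutive cost increase. The search
skeleton, with recodeAt standing in for the recodeCU()/rdCost machinery:

    #include <cstdint>
    enum { QP_MIN = 0, QP_MAX_SPEC = 51 };

    int refineQP(int qp, uint64_t origCost, uint64_t (*recodeAt)(int))
    {
        int bestQP = qp;
        uint64_t bestCost = origCost;
        for (int dir = 2; dir >= -2; dir -= 4)       // +2 steps, then -2 steps
        {
            uint64_t prevCost = origCost;
            int failures = 0;
            const int threshold = 1;
            for (int q = qp + dir; q >= QP_MIN && q <= QP_MAX_SPEC; q += dir)
            {
                uint64_t cost = recodeAt(q);
                if (cost < bestCost) { bestCost = cost; bestQP = q; }
                failures = (cost < prevCost) ? 0 : failures + 1;
                if (failures > threshold)
                    break;                           // two non-improvements: stop
                prevCost = cost;
            }
        }
        return bestQP;
    }
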
@@ -334,6 +388,12 @@
164
         checkBestMode(*splitPred, depth);
165
     }
166
 
167
+    if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
168
+    {
169
+        int cuIdx = (cuGeom.childOffset - 1) / 3;
170
+        cacheCost[cuIdx] = md.bestMode->rdCost;
171
+    }
172
+
173
     /* Copy best data to encData CTU and recon */
174
     md.bestMode->cu.copyToPic(depth);
175
     if (md.bestMode != &md.pred[PRED_SPLIT])
176
@@ -377,6 +437,7 @@
177
         slave.m_slice = m_slice;
178
         slave.m_frame = m_frame;
179
         slave.m_param = m_param;
180
+        slave.m_bChromaSa8d = m_param->rdLevel >= 3;
181
         slave.setLambdaFromQP(md.pred[PRED_2Nx2N].cu, m_rdCost.m_qp);
182
         slave.invalidateContexts(0);
183
         slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur);
184
@@ -555,7 +616,7 @@
185
         if (m_param->rdLevel <= 4)
186
             checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
187
         else
188
-            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom, false);
189
+            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
190
     }
191
 
192
     bool bNoSplit = false;
193
@@ -827,8 +888,11 @@
194
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
195
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
196
     uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);
197
-    bool earlyskip = false;
198
+    bool skipModes = false; /* Skip any remaining mode analyses at current depth */
199
+    bool skipRecursion = false; /* Skip recursion */
200
     bool splitIntra = true;
201
+    bool skipRectAmp = false;
202
+    bool chooseMerge = false;
203
 
204
     SplitData splitData[4];
205
     splitData[0].initSplitCUData();
206
@@ -844,27 +908,56 @@
207
         md.pred[PRED_2Nx2N].sa8dCost = 0;
208
     }
209
 
210
-    /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */
211
-    if (mightNotSplit && depth >= minDepth)
212
+    if (m_param->analysisMode == X265_ANALYSIS_LOAD)
213
+    {
214
+        if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
215
+        {
216
+            if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
217
+            {
218
+                md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
219
+                md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
220
+                checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
221
+
222
+                skipRecursion = !!m_param->bEnableRecursionSkip && md.bestMode;
223
+                if (m_param->rdLevel)
224
+                    skipModes = m_param->bEnableEarlySkip && md.bestMode;
225
+            }
226
+            if (m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
227
+            {
228
+                if (m_reuseModes[cuGeom.absPartIdx] != MODE_INTRA  && m_reuseModes[cuGeom.absPartIdx] != 4)
229
+                {
230
+                    skipRectAmp = true && !!md.bestMode;
231
+                    chooseMerge = !!m_reuseMergeFlag[cuGeom.absPartIdx] && !!md.bestMode;
232
+                }
233
+            }
234
+        }
235
+    }
236
+
237
+    /* Step 1. Evaluate Merge/Skip candidates for likely early-outs, if skip mode was not set above */
238
+    if (mightNotSplit && depth >= minDepth && !md.bestMode) /* TODO: Re-evaluate if analysis load/save still works */
239
     {
240
         /* Compute Merge Cost */
241
         md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
242
         md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
243
         checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
244
         if (m_param->rdLevel)
245
-            earlyskip = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
246
+            skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
247
     }
248
 
249
-    bool bNoSplit = false;
250
-    if (md.bestMode)
251
+    if (md.bestMode && m_param->bEnableRecursionSkip)
252
     {
253
-        bNoSplit = md.bestMode->cu.isSkipped(0);
254
-        if (mightSplit && depth && depth >= minDepth && !bNoSplit)
255
-            bNoSplit = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
256
+        skipRecursion = md.bestMode->cu.isSkipped(0);
257
+        if (mightSplit && depth >= minDepth && !skipRecursion)
258
+        {
259
+            if (depth)
260
+                skipRecursion = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
261
+            if (m_bHD && !skipRecursion && m_param->rdLevel == 2 && md.fencYuv.m_size != MAX_CU_SIZE)
262
+                skipRecursion = complexityCheckCU(*md.bestMode);
263
+        }
264
     }
265
 
266
     /* Step 2. Evaluate each of the 4 split sub-blocks in series */
267
-    if (mightSplit && !bNoSplit)
268
+    if (mightSplit && !skipRecursion)
269
     {
270
         Mode* splitPred = &md.pred[PRED_SPLIT];
271
         splitPred->initCosts();
272
@@ -926,7 +1019,7 @@
273
         if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
274
             setLambdaFromQP(parentCTU, qp);
275
 
276
-        if (!earlyskip)
277
+        if (!skipModes)
278
         {
279
             uint32_t refMasks[2];
280
             refMasks[0] = allSplitRefs;
281
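
The analysis-load early-outs introduced above key off the per-CU arrays saved
in the first pass (depth, prediction mode, partition size, merge flag) instead
of the old bestMergeCand/mv data. Condensed into one decision function (the
boolean inputs are stand-ins for the real MODE_*/SIZE_* comparisons):

    #include <cstdint>

    struct ReuseFlags { bool skipModes, skipRecursion, skipRectAmp, chooseMerge; };

    ReuseFlags loadPathFlags(uint8_t savedDepth, bool savedSkip, bool savedIntra,
                             bool saved2Nx2N, bool savedMerge, uint32_t depth,
                             bool haveBestMode, bool earlySkip, bool recursionSkip)
    {
        ReuseFlags f = { false, false, false, false };
        if (depth != savedDepth)
            return f;                          // saved decision is for another depth
        if (savedSkip)                         // merge/skip was best: stop early
        {
            f.skipRecursion = recursionSkip && haveBestMode;
            f.skipModes     = earlySkip && haveBestMode;
        }
        if (saved2Nx2N && !savedIntra)         // inter 2Nx2N: drop rect/AMP searches
        {
            f.skipRectAmp = haveBestMode;
            f.chooseMerge = savedMerge && haveBestMode;
        }
        return f;
    }
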
@@ -947,158 +1040,161 @@
282
             }
283
 
284
             Mode *bestInter = &md.pred[PRED_2Nx2N];
285
-            if (m_param->bEnableRectInter)
286
+            if (!skipRectAmp)
287
             {
288
-                uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
289
-                uint32_t threshold_2NxN, threshold_Nx2N;
290
-
291
-                if (m_slice->m_sliceType == P_SLICE)
292
-                {
293
-                    threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
294
-                    threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
295
-                }
296
-                else
297
-                {
298
-                    threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0] 
299
-                                    + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
300
-                    threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0] 
301
-                                    + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
302
-                }
303
-
304
-                int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
305
-                if (try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
306
-                {
307
-                    refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
308
-                    refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
309
-                    md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
310
-                    checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
311
-                    if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
312
-                        bestInter = &md.pred[PRED_2NxN];
313
-                }
314
-
315
-                if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_Nx2N)
316
-                {
317
-                    refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
318
-                    refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
319
-                    md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
320
-                    checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
321
-                    if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
322
-                        bestInter = &md.pred[PRED_Nx2N];
323
-                }
324
-
325
-                if (!try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
326
-                {
327
-                    refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
328
-                    refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
329
-                    md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
330
-                    checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
331
-                    if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
332
-                        bestInter = &md.pred[PRED_2NxN];
333
-                }
334
-            }
335
-
336
-            if (m_slice->m_sps->maxAMPDepth > depth)
337
-            {
338
-                uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
339
-                uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;
340
-
341
-                if (m_slice->m_sliceType == P_SLICE)
342
+                if (m_param->bEnableRectInter)
343
                 {
344
-                    threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
345
-                    threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
346
+                    uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
347
+                    uint32_t threshold_2NxN, threshold_Nx2N;
 
-                    threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
-                    threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
-                }
-                else
-                {
-                    threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0] 
+                    if (m_slice->m_sliceType == P_SLICE)
+                    {
+                        threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
+                        threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
+                    }
+                    else
+                    {
+                        threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
                                        + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
-                    threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0] 
-                                       + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
-
-                    threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0] 
+                        threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
                                        + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
-                    threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0] 
-                                       + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
-                }
-
-                bool bHor = false, bVer = false;
-                if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
-                    bHor = true;
-                else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N)
-                    bVer = true;
-                else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N &&
-                         md.bestMode && md.bestMode->cu.getQtRootCbf(0))
-                {
-                    bHor = true;
-                    bVer = true;
-                }
+                    }
 
-                if (bHor)
-                {
-                    int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
-                    if (try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
+                    int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
+                    if (try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
                     {
-                        refMasks[0] = allSplitRefs;                                    /* 75% top */
-                        refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
-                        md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
-                        if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
-                            bestInter = &md.pred[PRED_2NxnD];
+                        refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
+                        refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
+                        md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
+                        checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
+                        if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
+                            bestInter = &md.pred[PRED_2NxN];
                     }
 
-                    if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnU)
+                    if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_Nx2N)
                     {
-                        refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
-                        refMasks[1] = allSplitRefs;                                    /* 75% bot */
-                        md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
-                        if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
-                            bestInter = &md.pred[PRED_2NxnU];
+                        refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
+                        refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
+                        md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                        checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
+                        if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
+                            bestInter = &md.pred[PRED_Nx2N];
                     }
 
-                    if (!try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
+                    if (!try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
                     {
-                        refMasks[0] = allSplitRefs;                                    /* 75% top */
-                        refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
-                        md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
-                        if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
-                            bestInter = &md.pred[PRED_2NxnD];
+                        refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
+                        refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
+                        md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
+                        checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
+                        if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
+                            bestInter = &md.pred[PRED_2NxN];
                     }
                 }
-                if (bVer)
+
+                if (m_slice->m_sps->maxAMPDepth > depth)
                 {
-                    int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
-                    if (try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
+                    uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
+                    uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;
+
+                    if (m_slice->m_sliceType == P_SLICE)
                     {
-                        refMasks[0] = allSplitRefs;                                    /* 75% left  */
-                        refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
-                        md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
-                        if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
-                            bestInter = &md.pred[PRED_nRx2N];
+                        threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
+                        threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
+
+                        threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
+                        threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
+                    }
+                    else
+                    {
+                        threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
+                                         + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
+                        threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0]
+                                         + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
+
+                        threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
+                                        + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
+                        threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0]
+                                        + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
                     }
 
-                    if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nLx2N)
+                    bool bHor = false, bVer = false;
+                    if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
+                        bHor = true;
+                    else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N)
+                        bVer = true;
+                    else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N &&
+                        md.bestMode && md.bestMode->cu.getQtRootCbf(0))
                     {
-                        refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left  */
-                        refMasks[1] = allSplitRefs;                                    /* 75% right */
-                        md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
-                        if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
-                            bestInter = &md.pred[PRED_nLx2N];
+                        bHor = true;
+                        bVer = true;
                     }
 
-                    if (!try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
+                    if (bHor)
                     {
-                        refMasks[0] = allSplitRefs;                                    /* 75% left  */
-                        refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
-                        md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
-                        if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
-                            bestInter = &md.pred[PRED_nRx2N];
+                        int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
+                        if (try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
+                        {
+                            refMasks[0] = allSplitRefs;                                    /* 75% top */
+                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
+                            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
+                            if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
+                                bestInter = &md.pred[PRED_2NxnD];
+                        }
+
+                        if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnU)
+                        {
+                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
+                            refMasks[1] = allSplitRefs;                                    /* 75% bot */
+                            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
+                            if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
+                                bestInter = &md.pred[PRED_2NxnU];
+                        }
+
+                        if (!try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
+                        {
+                            refMasks[0] = allSplitRefs;                                    /* 75% top */
+                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
+                            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
+                            if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
+                                bestInter = &md.pred[PRED_2NxnD];
+                        }
+                    }
+                    if (bVer)
+                    {
+                        int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
+                        if (try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
+                        {
+                            refMasks[0] = allSplitRefs;                                    /* 75% left  */
+                            refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
+                            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
+                            if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
+                                bestInter = &md.pred[PRED_nRx2N];
+                        }
+
+                        if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nLx2N)
+                        {
+                            refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left  */
+                            refMasks[1] = allSplitRefs;                                    /* 75% right */
+                            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
+                            if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
+                                bestInter = &md.pred[PRED_nLx2N];
+                        }
+
+                        if (!try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
+                        {
+                            refMasks[0] = allSplitRefs;                                    /* 75% left  */
+                            refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
+                            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
+                            if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
+                                bestInter = &md.pred[PRED_nRx2N];
+                        }
                     }
                 }
             }
@@ -1106,7 +1202,7 @@
             if (m_param->rdLevel >= 3)
             {
                 /* Calculate RD cost of best inter option */
-                if (!m_bChromaSa8d && (m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
+                if ((!m_bChromaSa8d && (m_csp != X265_CSP_I400)) || (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                 {
                     uint32_t numPU = bestInter->cu.getNumPartInter(0);
                     for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
@@ -1115,15 +1211,26 @@
                         motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
                     }
                 }
-                encodeResAndCalcRdInterCU(*bestInter, cuGeom);
-                checkBestMode(*bestInter, depth);
 
-                /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
-                if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
-                    md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
+                if (!chooseMerge)
                 {
-                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
-                    checkBestMode(md.pred[PRED_BIDIR], depth);
+                    encodeResAndCalcRdInterCU(*bestInter, cuGeom);
+                    checkBestMode(*bestInter, depth);
+
+                    /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
+                    if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
+                        md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
+                    {
+                        uint32_t numPU = md.pred[PRED_BIDIR].cu.getNumPartInter(0);
+                        if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
+                            for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
+                            {
+                                PredictionUnit pu(md.pred[PRED_BIDIR].cu, cuGeom, puIdx);
+                                motionCompensation(md.pred[PRED_BIDIR].cu, pu, md.pred[PRED_BIDIR].predYuv, true, true);
+                            }
+                        encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
+                        checkBestMode(md.pred[PRED_BIDIR], depth);
+                    }
                 }
 
                 if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
@@ -1198,10 +1305,10 @@
 
                         uint32_t tuDepthRange[2];
                         cu.getInterTUQtDepthRange(tuDepthRange, 0);
-                        m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
+                        m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize, m_frame->m_fencPic->m_picCsp);
                         residualTransformQuantInter(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
                         if (cu.getQtRootCbf(0))
-                            md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0]);
+                            md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0], m_frame->m_fencPic->m_picCsp);
                         else
                         {
                             md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv);
@@ -1241,7 +1348,7 @@
             addSplitFlagCost(*md.bestMode, cuGeom.depth);
     }
 
-    if (mightSplit && !bNoSplit)
+    if (mightSplit && !skipRecursion)
     {
         Mode* splitPred = &md.pred[PRED_SPLIT];
         if (!md.bestMode)
@@ -1279,9 +1386,8 @@
         splitCUData.sa8dCost    = md.pred[PRED_2Nx2N].sa8dCost;
     }
     
-    if (mightNotSplit)
+    if (mightNotSplit && md.bestMode->cu.isSkipped(0))
     {
-        /* early-out statistics */
        FrameData& curEncData = *m_frame->m_encData;
         FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
         uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
@@ -1297,7 +1403,7 @@
     return splitCUData;
 }
 
-SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp)
+SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
 {
     uint32_t depth = cuGeom.depth;
     ModeDepth& md = m_modeDepth[depth];
@@ -1305,8 +1411,10 @@
 
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
-    bool foundSkip = false;
+    bool skipRecursion = false;
+    bool skipModes = false;
     bool splitIntra = true;
+    bool skipRectAmp = false;
 
     // avoid uninitialize value in below reference
     if (m_param->limitModes)
@@ -1316,41 +1424,55 @@
         md.pred[PRED_2Nx2N].rdCost = 0;
     }
 
-    if (m_param->analysisMode == X265_ANALYSIS_LOAD)
-    {
-        uint8_t* reuseDepth  = &m_reuseInterDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
-        uint8_t* reuseModes  = &m_reuseInterDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
-        if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.absPartIdx && reuseModes[zOrder] == MODE_SKIP)
-        {
-            md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
-            md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
-            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom, true);
-
-            // increment zOrder offset to point to next best depth in sharedDepth buffer
-            zOrder += g_depthInc[g_maxCUDepth - 1][reuseDepth[zOrder]];
-
-            foundSkip = true;
-        }
-    }  
-
    SplitData splitData[4];
     splitData[0].initSplitCUData();
     splitData[1].initSplitCUData();
     splitData[2].initSplitCUData();
     splitData[3].initSplitCUData();
+    uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
+    uint32_t refMasks[2];
+    if (m_param->analysisMode == X265_ANALYSIS_LOAD)
+    {
+        if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
+        {
+            if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
+            {
+                md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
+                md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
+                checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
+                skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
+                refMasks[0] = allSplitRefs;
+                md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
+                checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
+
+                if (m_param->bEnableRecursionSkip && depth && m_modeDepth[depth - 1].bestMode)
+                    skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
+            }
+            if (m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
+                skipRectAmp = true && !!md.bestMode;
+        }
+    }
 
     /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */
-    if (mightNotSplit && !foundSkip)
+    if (mightNotSplit && !md.bestMode)
     {
         md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
         md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
-        checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom, false);
-        foundSkip = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
+        checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
+        skipModes = m_param->bEnableEarlySkip && md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
+        refMasks[0] = allSplitRefs;
+        md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+        checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
+        checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
+
+        if (m_param->bEnableRecursionSkip && depth && m_modeDepth[depth - 1].bestMode)
+            skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
     }
 
     // estimate split cost
     /* Step 2. Evaluate each of the 4 split sub-blocks in series */
-    if (mightSplit && !foundSkip)
+    if (mightSplit && !skipRecursion)
     {
         Mode* splitPred = &md.pred[PRED_SPLIT];
         splitPred->initCosts();
@@ -1375,7 +1497,7 @@
                 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                     nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
 
-                splitData[subPartIdx] = compressInterCU_rd5_6(parentCTU, childGeom, zOrder, nextQP);
+                splitData[subPartIdx] = compressInterCU_rd5_6(parentCTU, childGeom, nextQP);
 
                 // Save best CU and pred data for this sub CU
                 splitIntra |= nd.bestMode->cu.isIntra(0);
@@ -1387,7 +1509,6 @@
             else
             {
                 splitCU->setEmptyPart(childGeom, subPartIdx);
-                zOrder += g_depthInc[g_maxCUDepth - 1][nextDepth];
             }
         }
         nextContext->store(splitPred->contexts);
@@ -1402,20 +1523,16 @@
     /* Split CUs
      *   0  1
      *   2  3 */
-    uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
+    allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
     /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
     if (mightNotSplit)
     {
         if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
             setLambdaFromQP(parentCTU, qp);
 
-        if (!(foundSkip && m_param->bEnableEarlySkip))
+        if (!skipModes)
        {
-            uint32_t refMasks[2];
             refMasks[0] = allSplitRefs;
-            md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-            checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
-            checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
 
             if (m_param->limitReferences & X265_REF_LIMIT_CU)
             {
@@ -1430,155 +1547,165 @@
                 checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
                 if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
                 {
+                    uint32_t numPU = md.pred[PRED_BIDIR].cu.getNumPartInter(0);
+                    if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
+                        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
+                        {
+                            PredictionUnit pu(md.pred[PRED_BIDIR].cu, cuGeom, puIdx);
+                            motionCompensation(md.pred[PRED_BIDIR].cu, pu, md.pred[PRED_BIDIR].predYuv, true, true);
+                        }
                     encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
                     checkBestMode(md.pred[PRED_BIDIR], cuGeom.depth);
                 }
             }
 
-            if (m_param->bEnableRectInter)
+            if (!skipRectAmp)
             {
-                uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
-                uint32_t threshold_2NxN, threshold_Nx2N;
-
-                if (m_slice->m_sliceType == P_SLICE)
-                {
-                    threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
-                    threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
-                }
-                else
+                if (m_param->bEnableRectInter)
                 {
-                    threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0] 
-                                    + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
-                    threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0] 
-                                    + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
-                }
-
-                int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
-                if (try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)
-                {
-                    refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
-                    refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
-                    md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
-                    checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
-                    checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
-                }
+                    uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
+                    uint32_t threshold_2NxN, threshold_Nx2N;
 
-                if (splitCost < md.bestMode->rdCost + threshold_Nx2N)
-                {
-                    refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
-                    refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
-                    md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-                    checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
-                    checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
-                }
-
-                if (!try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)
-                {
-                    refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
-                    refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
-                    md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
-                    checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
-                    checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
-                }
-            }
-
-            // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N)
-            if (m_slice->m_sps->maxAMPDepth > depth)
-            {
-                uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
-                uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;
-
-                if (m_slice->m_sliceType == P_SLICE)
-                {
-                    threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
-                    threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
-
-                    threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
-                    threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
-                }
-                else
-                {
-                    threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0] 
+                    if (m_slice->m_sliceType == P_SLICE)
+                    {
+                        threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
+                        threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
+                    }
+                    else
+                    {
+                        threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
                                        + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
-                    threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0] 
-                                       + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
-
-                    threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0] 
+                        threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
                                        + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
-                    threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0] 
-                                       + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
-                }
-
-                bool bHor = false, bVer = false;
-                if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN)
-                    bHor = true;
-                else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N)
-                    bVer = true;
-                else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0])
-                {
-                    bHor = true;
-                    bVer = true;
-                }
+                    }
 
-                if (bHor)
-                {
-                    int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
-                    if (try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)
+                    int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
+                    if (try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)
                     {
-                        refMasks[0] = allSplitRefs;                                    /* 75% top */
-                        refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
-                        md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
-                        checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
+                        refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
+                        refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
+                        md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
+                        checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
+                        checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
                     }
 
-                    if (splitCost < md.bestMode->rdCost + threshold_2NxnU)
+                    if (splitCost < md.bestMode->rdCost + threshold_Nx2N)
                     {
-                        refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
-                        refMasks[1] = allSplitRefs;                                    /* 75% bot */
-                        md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
-                        checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
+                        refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
+                        refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
+                        md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                        checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
+                        checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
                     }
 
-                    if (!try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)
+                    if (!try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)
                     {
-                        refMasks[0] = allSplitRefs;                                    /* 75% top */
-                        refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
-                        md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
-                        checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
+                        refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
+                        refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
+                        md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
+                        checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
+                        checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
                     }
                 }
 
-                if (bVer)
+                // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N)
+                if (m_slice->m_sps->maxAMPDepth > depth)
                 {
-                    int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
-                    if (try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)
+                    uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
+                    uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;
+
+                    if (m_slice->m_sliceType == P_SLICE)
                     {
-                        refMasks[0] = allSplitRefs;                                    /* 75% left  */
-                        refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
-                        md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
-                        checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
+                        threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
+                        threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
+
+                        threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
+                        threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
                     }
+                    else
+                    {
+                        threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
+                                        + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
+                        threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0]
+                                        + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
 
-                    if (splitCost < md.bestMode->rdCost + threshold_nLx2N)
+                        threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
+                                        + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
+                        threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0]
+                                        + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
+                    }
+
+                    bool bHor = false, bVer = false;
+                    if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN)
+                        bHor = true;
+                    else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N)
+                        bVer = true;
+                    else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0])
                     {
-                        refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left  */
-                        refMasks[1] = allSplitRefs;                                    /* 75% right */
-                        md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
-                        checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
+                        bHor = true;
+                        bVer = true;
+                    }
+
+                    if (bHor)
+                    {
+                        int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
+                        if (try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)
+                        {
+                            refMasks[0] = allSplitRefs;                                    /* 75% top */
+                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
+                            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
+                            checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
+                        }
+
+                        if (splitCost < md.bestMode->rdCost + threshold_2NxnU)
+                        {
+                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
+                            refMasks[1] = allSplitRefs;                                    /* 75% bot */
+                            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
+                            checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
+                        }
+
+                        if (!try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)
+                        {
+                            refMasks[0] = allSplitRefs;                                    /* 75% top */
+                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
+                            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
+                            checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
+                        }
                     }
 
-                    if (!try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)
+                    if (bVer)
                     {
-                        refMasks[0] = allSplitRefs;                                    /* 75% left  */
-                        refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
-                        md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
-                        checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
+                        int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
+                        if (try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)
+                        {
+                            refMasks[0] = allSplitRefs;                                    /* 75% left  */
+                            refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
+                            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
+                            checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
+                        }
+
+                        if (splitCost < md.bestMode->rdCost + threshold_nLx2N)
+                        {
+                            refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left  */
+                            refMasks[1] = allSplitRefs;                                    /* 75% right */
+                            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
+                            checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
+                        }
+
+                        if (!try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)
+                        {
+                            refMasks[0] = allSplitRefs;                                    /* 75% left  */
+                            refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
+                            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
+                            checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
+                        }
                     }
                 }
             }
@@ -1604,6 +1731,17 @@
                     ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
                 }
             }
+            if ((md.bestMode->cu.isInter(0) && !(md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)) && (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400))
+            {
+                uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
+
+                for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
+                {
+                    PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
+                    motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, m_csp != X265_CSP_I400);
+                }
+                encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
+            }
         }
 
         if (m_bTryLossless)
@@ -1614,9 +1752,15 @@
     }
 
     /* compare split RD cost against best cost */
-    if (mightSplit && !foundSkip)
+    if (mightSplit && !skipRecursion)
         checkBestMode(md.pred[PRED_SPLIT], depth);
 
+    if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
+    {
+        int cuIdx = (cuGeom.childOffset - 1) / 3;
+        cacheCost[cuIdx] = md.bestMode->rdCost;
+    }
+
       /* determine which motion references the parent CU should search */
     SplitData splitCUData;
     splitCUData.initSplitCUData();
@@ -1648,6 +1792,110 @@
     return splitCUData;
 }
 
+void Analysis::recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp)
+{
+    uint32_t depth = cuGeom.depth;
+    ModeDepth& md = m_modeDepth[depth];
+    md.bestMode = NULL;
+
+    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
+    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
+    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
+
+    if (bDecidedDepth)
+    {
+        setLambdaFromQP(parentCTU, qp, lqp);
+
+        Mode& mode = md.pred[0];
+        md.bestMode = &mode;
+        mode.cu.initSubCU(parentCTU, cuGeom, qp);
+        PartSize size = (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx];
+        if (parentCTU.isIntra(cuGeom.absPartIdx))
+        {
+            memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
+            memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
+            checkIntra(mode, cuGeom, size);
+        }
+        else
+        {
+            mode.cu.copyFromPic(parentCTU, cuGeom, m_csp, false);
+            for (int part = 0; part < (int)parentCTU.getNumPartInter(cuGeom.absPartIdx); part++)
+            {
+                PredictionUnit pu(mode.cu, cuGeom, part);
+                motionCompensation(mode.cu, pu, mode.predYuv, true, true);
+            }
+
+            if (parentCTU.isSkipped(cuGeom.absPartIdx))
+                encodeResAndCalcRdSkipCU(mode);
+            else
+                encodeResAndCalcRdInterCU(mode, cuGeom);
+
+            /* checkMerge2Nx2N function performs checkDQP after encoding residual, do the same */
+            bool mergeInter2Nx2N = size == SIZE_2Nx2N && parentCTU.m_mergeFlag[cuGeom.absPartIdx];
+            if (parentCTU.isSkipped(cuGeom.absPartIdx) || mergeInter2Nx2N)
+                checkDQP(mode, cuGeom);
+        }
+
+        if (m_bTryLossless)
+            tryLossless(cuGeom);
+
+        if (mightSplit)
+            addSplitFlagCost(*md.bestMode, cuGeom.depth);
+    }
+    else
+    {
+        Mode* splitPred = &md.pred[PRED_SPLIT];
+        md.bestMode = splitPred;
+        splitPred->initCosts();
+        CUData* splitCU = &splitPred->cu;
+        splitCU->initSubCU(parentCTU, cuGeom, qp);
+
+        uint32_t nextDepth = depth + 1;
+        ModeDepth& nd = m_modeDepth[nextDepth];
+        invalidateContexts(nextDepth);
+        Entropy* nextContext = &m_rqt[depth].cur;
+        int nextQP = qp;
+
+        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
+        {
+            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+            if (childGeom.flags & CUGeom::PRESENT)
+            {
+                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
+                m_rqt[nextDepth].cur.load(*nextContext);
+
+                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
+                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
+
+                qprdRefine(parentCTU, childGeom, nextQP, lqp);
+
+                // Save best CU and pred data for this sub CU
+                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
+                splitPred->addSubCosts(*nd.bestMode);
+                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
+                nextContext = &nd.bestMode->contexts;
+            }
+            else
+            {
+                splitCU->setEmptyPart(childGeom, subPartIdx);
+                // Set depth of non-present CU to 0 to ensure that correct CU is fetched as reference to code deltaQP
+                memset(parentCTU.m_cuDepth + childGeom.absPartIdx, 0, childGeom.numPartitions);
+            }
+        }
+        nextContext->store(splitPred->contexts);
+        if (mightNotSplit)
+            addSplitFlagCost(*splitPred, cuGeom.depth);
+        else
+            updateModeCost(*splitPred);
+
+        checkDQPForSplitPred(*splitPred, cuGeom);
+
+        /* Copy best data to encData CTU and recon */
+        md.bestMode->cu.copyToPic(depth);
+        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
+    }
+}
+
 /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
 void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom)
 {
@@ -1705,11 +1953,11 @@
         tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
         tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
         tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
-        motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400));
+        motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));
 
         tempPred->sa8dBits = getTUBits(i, numMergeCand);
         tempPred->distortion = primitives.cu[sizeIdx].sa8d(fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
-        if (m_bChromaSa8d && (m_csp != X265_CSP_I400))
+        if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
         {
             tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
             tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
@@ -1728,7 +1976,7 @@
         return;
 
     /* calculate the motion compensation for chroma for the best mode selected */
-    if (!m_bChromaSa8d && (m_csp != X265_CSP_I400)) /* Chroma MC was done above */
+    if ((!m_bChromaSa8d && (m_csp != X265_CSP_I400)) || (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) /* Chroma MC was done above */
         motionCompensation(bestPred->cu, pu, bestPred->predYuv, false, true);
 
     if (m_param->rdLevel)
@@ -1766,7 +2014,7 @@
 }
 
 /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
-void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom, bool isShareMergeCand)
+void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom)
 {
     uint32_t depth = cuGeom.depth;
 
@@ -1794,19 +2042,13 @@
     bool triedPZero = false, triedBZero = false;
     bestPred->rdCost = MAX_INT64;
 
-    uint32_t first = 0, last = numMergeCand;
-    if (isShareMergeCand)
-    {
-        first = *m_reuseBestMergeCand;
-        last = first + 1;
-    }
    int safeX, maxSafeMv;
     if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE)
     {
         safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * g_maxCUSize - 3;
         maxSafeMv = (safeX - tempPred->cu.m_cuPelX) * 4;
     }
-    for (uint32_t i = first; i < last; i++)
+    for (uint32_t i = 0; i < numMergeCand; i++)
     {
         if (m_bFrameParallel &&
             (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
@@ -1845,8 +2087,7 @@
 
         uint8_t hasCbf = true;
         bool swapped = false;
-        /* bypass encoding merge with residual if analysis-mode = load as only SKIP CUs enter this function */
-        if (!foundCbf0Merge && !isShareMergeCand)
+        if (!foundCbf0Merge)
         {
             /* if the best prediction has CBF (not a skip) then try merge with residual */
 
@@ -1896,13 +2137,6 @@
         bestPred->cu.setPURefIdx(1, (int8_t)candMvField[bestCand][1].refIdx, 0, 0);
         checkDQP(*bestPred, cuGeom);
     }
-
-    if (m_param->analysisMode)
-    {
-        if (m_param->analysisMode == X265_ANALYSIS_SAVE)
-            *m_reuseBestMergeCand = bestPred->cu.m_mvpIdx[0][0];
-        m_reuseBestMergeCand++;
-    }
 }
 
 void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2])
@@ -1914,28 +2148,25 @@
 
     if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
     {
+        int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
+        int index = 0;
+
        uint32_t numPU = interMode.cu.getNumPartInter(0);
         for (uint32_t part = 0; part < numPU; part++)
         {
             MotionData* bestME = interMode.bestME[part];
             for (int32_t i = 0; i < numPredDir; i++)
-            {
-                bestME[i].ref = *m_reuseRef;
-                m_reuseRef++;
-
-                bestME[i].mv = *m_reuseMv;
-                m_reuseMv++;
-            }
+                bestME[i].ref = m_reuseRef[refOffset + index++];
         }
     }
-    predInterSearch(interMode, cuGeom, m_bChromaSa8d && (m_csp != X265_CSP_I400), refMask);
+    predInterSearch(interMode, cuGeom, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400), refMask);
 
     /* predInterSearch sets interMode.sa8dBits */
     const Yuv& fencYuv = *interMode.fencYuv;
     Yuv& predYuv = interMode.predYuv;
     int part = partitionFromLog2Size(cuGeom.log2CUSize);
     interMode.distortion = primitives.cu[part].sa8d(fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
-    if (m_bChromaSa8d && (m_csp != X265_CSP_I400))
+    if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
    {
         interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
         interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
@@ -1944,20 +2175,15 @@
 
     if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
     {
+        int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
+        int index = 0;
+
        uint32_t numPU = interMode.cu.getNumPartInter(0);
         for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
         {
-            PredictionUnit pu(interMode.cu, cuGeom, puIdx);
            MotionData* bestME = interMode.bestME[puIdx];
             for (int32_t i = 0; i < numPredDir; i++)
-            {
-                if (bestME[i].ref >= 0)
-                    *m_reuseMv = getLowresMV(interMode.cu, pu, i, bestME[i].ref);
-
-                *m_reuseRef = bestME[i].ref;
-                m_reuseRef++;
-                m_reuseMv++;
-            }
+                m_reuseRef[refOffset + index++] = bestME[i].ref;
         }
     }
 }
@@ -1971,41 +2197,33 @@
 
     if (m_param->analysisMode == X265_ANALYSIS_LOAD && m_reuseInterDataCTU)
     {
+        int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
+        int index = 0;
+
        uint32_t numPU = interMode.cu.getNumPartInter(0);
         for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
         {
             MotionData* bestME = interMode.bestME[puIdx];
             for (int32_t i = 0; i < numPredDir; i++)
-            {
-                bestME[i].ref = *m_reuseRef;
-                m_reuseRef++;
-
-                bestME[i].mv = *m_reuseMv;
-                m_reuseMv++;
-            }
+                bestME[i].ref = m_reuseRef[refOffset + index++];
         }
     }
-    predInterSearch(interMode, cuGeom, m_csp != X265_CSP_I400, refMask);
+    predInterSearch(interMode, cuGeom, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400, refMask);
 
     /* predInterSearch sets interMode.sa8dBits, but this is ignored */
     encodeResAndCalcRdInterCU(interMode, cuGeom);
 
     if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
     {
+        int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;
+        int index = 0;
+
        uint32_t numPU = interMode.cu.getNumPartInter(0);
         for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
         {
-            PredictionUnit pu(interMode.cu, cuGeom, puIdx);
            MotionData* bestME = interMode.bestME[puIdx];
             for (int32_t i = 0; i < numPredDir; i++)
-            {
-                if (bestME[i].ref >= 0)
-                    *m_reuseMv = getLowresMV(interMode.cu, pu, i, bestME[i].ref);
-
-                *m_reuseRef = bestME[i].ref;
-                m_reuseRef++;
-                m_reuseMv++;
-            }
+                m_reuseRef[refOffset + index++] = bestME[i].ref;
         }
     }
 }
@@ -2053,10 +2271,10 @@
     cu.m_mvd[1][0] = bestME[1].mv - mvp1;
 
     PredictionUnit pu(cu, cuGeom, 0);
-    motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400));
+    motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));
 
     int sa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
-    if (m_bChromaSa8d && (m_csp != X265_CSP_I400))
+    if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
     {
         /* Add in chroma distortion */
         sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
@@ -2087,7 +2305,7 @@
 
         int zsa8d;
 
-        if (m_bChromaSa8d && (m_csp != X265_CSP_I400))
+        if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
        {
             cu.m_mv[0][0] = mvzero;
             cu.m_mv[1][0] = mvzero;
@@ -2135,9 +2353,9 @@
             if (m_bChromaSa8d) /* real MC was already performed */
                 bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
             else
-                motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_csp != X265_CSP_I400);
1433
+                motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400);
1434
         }
1435
-        else if (m_bChromaSa8d && (m_csp != X265_CSP_I400))
1436
+        else if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400))
1437
         {
1438
             /* recover overwritten motion vectors */
1439
             cu.m_mv[0][0] = bestME[0].mv;
1440
@@ -2183,7 +2401,7 @@
1441
         cu.getIntraTUQtDepthRange(tuDepthRange, 0);
1442
 
1443
         residualTransformQuantIntra(*bestMode, cuGeom, 0, 0, tuDepthRange);
1444
-        if (m_csp != X265_CSP_I400)
1445
+        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
1446
         {
1447
             getBestIntraModeChroma(*bestMode, cuGeom);
1448
             residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
1449
@@ -2207,7 +2425,7 @@
1450
                                       fencYuv.m_buf[0], predY,
1451
                                       fencYuv.m_size, predYuv.m_size);
1452
 
1453
-        if (m_csp != X265_CSP_I400)
1454
+        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
1455
         {
1456
             pixel* predU = predYuv.getCbAddr(absPartIdx);
1457
             pixel* predV = predYuv.getCrAddr(absPartIdx);
1458
@@ -2237,7 +2455,7 @@
1459
         else
1460
             primitives.cu[sizeIdx].copy_pp(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
1461
                                            predY, predYuv.m_size);
1462
-        if (m_csp != X265_CSP_I400)
1463
+        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
1464
         {
1465
              pixel* predU = predYuv.getCbAddr(absPartIdx);
1466
              pixel* predV = predYuv.getCrAddr(absPartIdx);
1467
@@ -2257,7 +2475,7 @@
1468
         }
1469
     }
1470
 
1471
-    cu.updatePic(cuGeom.depth);
1472
+    cu.updatePic(cuGeom.depth, m_frame->m_fencPic->m_picCsp);
1473
 }
1474
 
1475
 void Analysis::addSplitFlagCost(Mode& mode, uint32_t depth)
1476
@@ -2390,6 +2608,30 @@
1477
 
1478
     return false;
1479
 }
1480
+
1481
+bool Analysis::complexityCheckCU(const Mode& bestMode)
1482
+{
1483
+    uint32_t mean = 0;
1484
+    uint32_t homo = 0;
1485
+    uint32_t cuSize = bestMode.fencYuv->m_size;
1486
+    for (uint32_t y = 0; y < cuSize; y++) {
1487
+        for (uint32_t x = 0; x < cuSize; x++) {
1488
+            mean += (bestMode.fencYuv->m_buf[0][y * cuSize + x]);
1489
+        }
1490
+    }
1491
+    mean = mean / (cuSize * cuSize);
1492
+    for (uint32_t y = 0 ; y < cuSize; y++){
1493
+        for (uint32_t x = 0 ; x < cuSize; x++){
1494
+            homo += abs(int(bestMode.fencYuv->m_buf[0][y * cuSize + x] - mean));
1495
+        }
1496
+    }
1497
+    homo = homo / (cuSize * cuSize);
1498
+
1499
+    if (homo < (.1 * mean))
1500
+        return true;
1501
+
1502
+    return false;
1503
+}
1504
 
1505
 int Analysis::calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, double baseQp)
1506
 {
1507
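The new Analysis::complexityCheckCU() added above backs the rskip heuristics from the changelog: a CU whose mean absolute deviation falls below 10% of its mean luma is treated as homogeneous enough to skip recursion into smaller CUs. A standalone sketch of the same test, assuming 8-bit samples (helper name and the separate stride parameter are illustrative; the integer mean / 10 stands in for the patch's .1 * mean):

    #include <cstdint>
    #include <cstdlib>

    // Two passes over the block: mean first, then mean absolute deviation.
    static bool isHomogeneous(const uint8_t* pix, int stride, int size)
    {
        uint32_t sum = 0;
        for (int y = 0; y < size; y++)
            for (int x = 0; x < size; x++)
                sum += pix[y * stride + x];
        uint32_t mean = sum / (uint32_t)(size * size);

        uint32_t dev = 0;
        for (int y = 0; y < size; y++)
            for (int x = 0; x < size; x++)
                dev += (uint32_t)abs((int)pix[y * stride + x] - (int)mean);
        uint32_t mad = dev / (uint32_t)(size * size);

        return mad < mean / 10;   // "homo < .1 * mean"
    }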
x265_1.9.tar.gz/source/encoder/analysis.h -> x265_2.0.tar.gz/source/encoder/analysis.h Changed
56
 
1
@@ -108,6 +108,7 @@
2
     ModeDepth m_modeDepth[NUM_CU_DEPTH];
3
     bool      m_bTryLossless;
4
     bool      m_bChromaSa8d;
5
+    bool      m_bHD;
6
 
7
     Analysis();
8
 
9
@@ -117,12 +118,19 @@
10
     Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
11
 
12
 protected:
13
-    /* Analysis data for load/save modes, keeps getting incremented as CTU analysis proceeds and data is consumed or read */
14
+    /* Analysis data for save/load mode, writes/reads data based on absPartIdx */
15
     analysis_inter_data* m_reuseInterDataCTU;
16
-    MV*                  m_reuseMv;
17
     int32_t*             m_reuseRef;
18
-    uint32_t*            m_reuseBestMergeCand;
19
+    uint8_t*             m_reuseDepth;
20
+    uint8_t*             m_reuseModes;
21
+    uint8_t*             m_reusePartSize;
22
+    uint8_t*             m_reuseMergeFlag;
23
+
24
     uint32_t m_splitRefIdx[4];
25
+    uint64_t* cacheCost;
26
+
27
+    /* refine RD based on QP for rd-levels 5 and 6 */
28
+    void qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp);
29
 
30
     /* full analysis for an I-slice CU */
31
     void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
32
@@ -130,11 +138,13 @@
33
     /* full analysis for a P or B slice CU */
34
     uint32_t compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
35
     SplitData compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
36
-    SplitData compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
37
+    SplitData compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
38
+
39
+    void recodeCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t origqp = -1);
40
 
41
     /* measure merge and skip */
42
     void checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom);
43
-    void checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom, bool isShareMergeCand);
44
+    void checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom);
45
 
46
     /* measure inter options */
47
     void checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2]);
48
@@ -151,6 +161,7 @@
49
     /* work-avoidance heuristics for RD levels < 5 */
50
     uint32_t topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom);
51
     bool recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode);
52
+    bool complexityCheckCU(const Mode& bestMode);
53
 
54
     /* generate residual and recon pixels for an entire CTU recursively (RD0) */
55
     void encodeResidue(const CUData& parentCTU, const CUGeom& cuGeom);
56
x265_1.9.tar.gz/source/encoder/api.cpp -> x265_2.0.tar.gz/source/encoder/api.cpp Changed
24
 
1
@@ -166,15 +166,20 @@
2
 
3
     x265_param save;
4
     Encoder* encoder = static_cast<Encoder*>(enc);
5
+    if (encoder->m_reconfigure) /* Reconfigure in progress */
6
+        return 1;
7
     memcpy(&save, encoder->m_latestParam, sizeof(x265_param));
8
     int ret = encoder->reconfigureParam(encoder->m_latestParam, param_in);
9
     if (ret)
10
+    {
11
         /* reconfigure failed, recover saved param set */
12
         memcpy(encoder->m_latestParam, &save, sizeof(x265_param));
13
+        ret = -1;
14
+    }
15
     else
16
     {
17
-        encoder->m_reconfigured = true;
18
-        x265_print_reconfigured_params(&save, encoder->m_latestParam);
19
+        encoder->m_reconfigure = true;
20
+        encoder->printReconfigureParams();
21
     }
22
     return ret;
23
 }
24
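With the in-progress guard added above, x265_encoder_reconfig() now has three outcomes: 0 on success, 1 while an earlier reconfigure is still propagating to the frame encoders, and a negative value when the new set is rejected (the saved params are restored). A hedged usage sketch; only fields handled by Encoder::reconfigureParam() actually change mid-stream, and the helper name is illustrative:

    #include <x265.h>

    // Drop analysis effort mid-encode; non-whitelisted fields keep
    // their old values.
    static int lowerComplexity(x265_encoder* enc, const x265_param* current)
    {
        x265_param p = *current;   // start from the settings in force
        p.rdLevel = 2;             // whitelisted in reconfigureParam()
        p.searchRange = 16;        // merange may only shrink (esa/tesa buffers)

        int ret = x265_encoder_reconfig(enc, &p);
        // ret == 1: previous reconfigure still in flight, retry later
        // ret <  0: rejected by x265_check_params(), old params restored
        return ret;
    }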
x265_1.9.tar.gz/source/encoder/dpb.cpp -> x265_2.0.tar.gz/source/encoder/dpb.cpp Changed
12
 
1
@@ -146,8 +146,8 @@
2
     // Mark pictures in m_piclist as unreferenced if they are not included in RPS
3
     applyReferencePictureSet(&slice->m_rps, pocCurr);
4
 
5
-    slice->m_numRefIdx[0] = X265_MIN(m_maxRefL0, slice->m_rps.numberOfNegativePictures); // Ensuring L0 contains just the -ve POC
6
-    slice->m_numRefIdx[1] = X265_MIN(m_maxRefL1, slice->m_rps.numberOfPositivePictures);
7
+    slice->m_numRefIdx[0] = X265_MIN(newFrame->m_param->maxNumReferences, slice->m_rps.numberOfNegativePictures); // Ensuring L0 contains just the -ve POC
8
+    slice->m_numRefIdx[1] = X265_MIN(newFrame->m_param->bBPyramid ? 2 : 1, slice->m_rps.numberOfPositivePictures);
9
     slice->setRefPicList(m_picList);
10
 
11
     X265_CHECK(slice->m_sliceType != B_SLICE || slice->m_numRefIdx[1], "B slice without L1 references (non-fatal)\n");
12
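The list sizes above are now derived from the frame's own param set on every call, so a reconfigured maxNumReferences takes effect without re-initializing the DPB (hence the removal of m_maxRefL0/m_maxRefL1 in dpb.h below). The rule, extracted into a self-contained form (helper name illustrative):

    #include <algorithm>

    // Active refs per list: L0 (past pictures, negative POC deltas) capped by
    // the configured reference count, L1 (future pictures) by the B-pyramid.
    static void activeRefCounts(int maxNumReferences, bool bBPyramid,
                                int negativePics, int positivePics,
                                int& numRefIdxL0, int& numRefIdxL1)
    {
        numRefIdxL0 = std::min(maxNumReferences, negativePics);
        numRefIdxL1 = std::min(bBPyramid ? 2 : 1, positivePics);
    }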
x265_1.9.tar.gz/source/encoder/dpb.h -> x265_2.0.tar.gz/source/encoder/dpb.h Changed
19
 
1
@@ -39,8 +39,6 @@
2
 
3
     int                m_lastIDR;
4
     int                m_pocCRA;
5
-    int                m_maxRefL0;
6
-    int                m_maxRefL1;
7
     int                m_bOpenGOP;
8
     bool               m_bRefreshPending;
9
     bool               m_bTemporalSublayer;
10
@@ -54,8 +52,6 @@
11
         m_pocCRA = 0;
12
         m_bRefreshPending = false;
13
         m_frameDataFreeList = NULL;
14
-        m_maxRefL0 = param->maxNumReferences;
15
-        m_maxRefL1 = param->bBPyramid ? 2 : 1;
16
         m_bOpenGOP = param->bOpenGOP;
17
         m_bTemporalSublayer = !!param->bEnableTemporalSubLayers;
18
     }
19
x265_1.9.tar.gz/source/encoder/encoder.cpp -> x265_2.0.tar.gz/source/encoder/encoder.cpp Changed
669
 
1
@@ -55,7 +55,7 @@
2
 Encoder::Encoder()
3
 {
4
     m_aborted = false;
5
-    m_reconfigured = false;
6
+    m_reconfigure = false;
7
     m_encodedFrameNum = 0;
8
     m_pocLast = -1;
9
     m_curEncoder = 0;
10
@@ -361,7 +361,10 @@
11
     }
12
 
13
     if (m_threadPool)
14
-        m_threadPool->stopWorkers();
15
+    {
16
+        for (int i = 0; i < m_numPools; i++)
17
+            m_threadPool[i].stopWorkers();
18
+    }
19
 }
20
 
21
 void Encoder::destroy()
22
@@ -508,12 +511,6 @@
23
 
24
     if (pic_in)
25
     {
26
-        if (pic_in->colorSpace != m_param->internalCsp)
27
-        {
28
-            x265_log(m_param, X265_LOG_ERROR, "Unsupported chroma subsampling (%d) on input\n",
29
-                     pic_in->colorSpace);
30
-            return -1;
31
-        }
32
         if (pic_in->bitDepth < 8 || pic_in->bitDepth > 16)
33
         {
34
             x265_log(m_param, X265_LOG_ERROR, "Input bit depth (%d) must be between 8 and 16\n",
35
@@ -525,7 +522,7 @@
36
         if (m_dpb->m_freeList.empty())
37
         {
38
             inFrame = new Frame;
39
-            x265_param* p = m_reconfigured? m_latestParam : m_param;
40
+            x265_param* p = m_reconfigure ? m_latestParam : m_param;
41
             if (inFrame->create(p, pic_in->quantOffsets))
42
             {
43
                 /* the first PicYuv created is asked to generate the CU and block unit offset
44
@@ -535,7 +532,7 @@
45
                 {
46
                     inFrame->m_fencPic->m_cuOffsetY = m_sps.cuOffsetY;
47
                     inFrame->m_fencPic->m_buOffsetY = m_sps.buOffsetY;
48
-                    if (pic_in->colorSpace != X265_CSP_I400)
49
+                    if (m_param->internalCsp != X265_CSP_I400)
50
                     {
51
                         inFrame->m_fencPic->m_cuOffsetC = m_sps.cuOffsetC;
52
                         inFrame->m_fencPic->m_buOffsetC = m_sps.buOffsetC;
53
@@ -555,7 +552,7 @@
54
                     {
55
                         m_sps.cuOffsetY = inFrame->m_fencPic->m_cuOffsetY;
56
                         m_sps.buOffsetY = inFrame->m_fencPic->m_buOffsetY;
57
-                        if (pic_in->colorSpace != X265_CSP_I400)
58
+                        if (m_param->internalCsp != X265_CSP_I400)
59
                         {
60
                             m_sps.cuOffsetC = inFrame->m_fencPic->m_cuOffsetC;
61
                             m_sps.cuOffsetY = inFrame->m_fencPic->m_cuOffsetY;
62
@@ -591,7 +588,7 @@
63
         inFrame->m_userData  = pic_in->userData;
64
         inFrame->m_pts       = pic_in->pts;
65
         inFrame->m_forceqp   = pic_in->forceqp;
66
-        inFrame->m_param     = m_reconfigured ? m_latestParam : m_param;
67
+        inFrame->m_param     = m_reconfigure ? m_latestParam : m_param;
68
         
69
         if (pic_in->quantOffsets != NULL)
70
         {
71
@@ -719,7 +716,7 @@
72
                     pic_out->analysisData.numPartitions = outFrame->m_analysisData.numPartitions;
73
                     pic_out->analysisData.interData = outFrame->m_analysisData.interData;
74
                     pic_out->analysisData.intraData = outFrame->m_analysisData.intraData;
75
-                    writeAnalysisFile(&pic_out->analysisData);
76
+                    writeAnalysisFile(&pic_out->analysisData, *outFrame->m_encData);
77
                     freeAnalysis(&pic_out->analysisData);
78
                 }
79
             }
80
@@ -780,6 +777,27 @@
81
                 if (m_rateControl->writeRateControlFrameStats(outFrame, &curEncoder->m_rce))
82
                     m_aborted = true;
83
 
84
+            if (pic_out && m_param->rc.bStatWrite)
85
+            {
86
+                /* m_rcData is allocated for every frame */
87
+                pic_out->rcData = outFrame->m_rcData;
88
+                outFrame->m_rcData->qpaRc = outFrame->m_encData->m_avgQpRc;
89
+                outFrame->m_rcData->qRceq = curEncoder->m_rce.qRceq;
90
+                outFrame->m_rcData->qpNoVbv = curEncoder->m_rce.qpNoVbv;
91
+                outFrame->m_rcData->coeffBits = outFrame->m_encData->m_frameStats.coeffBits;
92
+                outFrame->m_rcData->miscBits = outFrame->m_encData->m_frameStats.miscBits;
93
+                outFrame->m_rcData->mvBits = outFrame->m_encData->m_frameStats.mvBits;
94
+                outFrame->m_rcData->qScale = outFrame->m_rcData->newQScale = x265_qp2qScale(outFrame->m_encData->m_avgQpRc);
95
+                outFrame->m_rcData->poc = curEncoder->m_rce.poc;
96
+                outFrame->m_rcData->encodeOrder = curEncoder->m_rce.encodeOrder;
97
+                outFrame->m_rcData->sliceType = curEncoder->m_rce.sliceType;
98
+                outFrame->m_rcData->keptAsRef = curEncoder->m_rce.sliceType == B_SLICE && !IS_REFERENCED(outFrame) ? 0 : 1;
99
+                outFrame->m_rcData->qpAq = outFrame->m_encData->m_avgQpAq;
100
+                outFrame->m_rcData->iCuCount = outFrame->m_encData->m_frameStats.percent8x8Intra * m_rateControl->m_ncu;
101
+                outFrame->m_rcData->pCuCount = outFrame->m_encData->m_frameStats.percent8x8Inter * m_rateControl->m_ncu;
102
+                outFrame->m_rcData->skipCuCount = outFrame->m_encData->m_frameStats.percent8x8Skip  * m_rateControl->m_ncu;
103
+            }
104
+
105
             /* Allow this frame to be recycled if no frame encoders are using it for reference */
106
             if (!pic_out)
107
             {
108
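This block wires up the x265_rc_stats structure the changelog adds to x265_picture: with stat writing enabled (param->rc.bStatWrite), every returned picture carries that frame's rate-control decision points. A consumer-side sketch; field names follow the assignments above, but the exact public layout should be checked against x265.h:

    #include <cstdio>
    #include <x265.h>

    // Drain the encoder at end of stream and log per-frame RC stats.
    static void flushAndLog(x265_encoder* enc, x265_param* param)
    {
        x265_picture pic_out;
        x265_picture_init(param, &pic_out);

        x265_nal* nals; uint32_t nalCount;
        while (x265_encoder_encode(enc, &nals, &nalCount, NULL, &pic_out) > 0)
        {
            if (pic_out.rcData)   // set per frame when rc.bStatWrite is on
                printf("poc %d qpaRc %.2f qScale %.4f\n",
                       pic_out.rcData->poc, pic_out.rcData->qpaRc,
                       pic_out.rcData->qScale);
        }
    }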
@@ -800,16 +818,32 @@
109
             frameEnc = m_lookahead->getDecidedPicture();
110
         if (frameEnc && !pass)
111
         {
112
+            if (curEncoder->m_reconfigure)
113
+            {
114
+                /* One round robin cycle of FE reconfigure is complete */
115
+                /* Safe to copy m_latestParam to Encoder::m_param, encoder reconfigure complete */
116
+                for (int frameEncId = 0; frameEncId < m_param->frameNumThreads; frameEncId++)
117
+                    m_frameEncoder[frameEncId]->m_reconfigure = false;
118
+                memcpy (m_param, m_latestParam, sizeof(x265_param));
119
+                m_reconfigure = false;
120
+            }
121
+
122
+            /* Initiate reconfigure for this FE if necessary */
123
+            curEncoder->m_param = m_reconfigure ? m_latestParam : m_param;
124
+            curEncoder->m_reconfigure = m_reconfigure;
125
+
126
             /* give this frame a FrameData instance before encoding */
127
             if (m_dpb->m_frameDataFreeList)
128
             {
129
                 frameEnc->m_encData = m_dpb->m_frameDataFreeList;
130
                 m_dpb->m_frameDataFreeList = m_dpb->m_frameDataFreeList->m_freeListNext;
131
                 frameEnc->reinit(m_sps);
132
+                frameEnc->m_param = m_reconfigure ? m_latestParam : m_param;
133
+                frameEnc->m_encData->m_param = m_reconfigure ? m_latestParam : m_param;
134
             }
135
             else
136
             {
137
-                frameEnc->allocEncodeData(m_param, m_sps);
138
+                frameEnc->allocEncodeData(m_reconfigure ? m_latestParam : m_param, m_sps);
139
                 Slice* slice = frameEnc->m_encData->m_slice;
140
                 slice->m_sps = &m_sps;
141
                 slice->m_pps = &m_pps;
142
@@ -817,7 +851,7 @@
143
                 slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * NUM_4x4_PARTITIONS);
144
             }
145
 
146
-            curEncoder->m_rce.encodeOrder = m_encodedFrameNum++;
147
+            curEncoder->m_rce.encodeOrder = frameEnc->m_encodeOrder = m_encodedFrameNum++;
148
             if (m_bframeDelay)
149
             {
150
                 int64_t *prevReorderedPts = m_prevReorderedPts;
151
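The hand-off above is what keeps reconfigure non-blocking: the new set lives in m_latestParam, each frame encoder is pointed at it as it comes up in the rotation, and once the flagged encoder cycles back the new set is copied over m_param and all flags clear. A simplified model of the protocol (stand-in types and names, not the real classes):

    struct FE { void* param; bool reconfigure; };   // stand-in frame encoder

    // Called once per dispatched frame with the next encoder in the rotation.
    static void handOff(FE* fes, int n, int cur,
                        void* current, void* latest, bool& pending)
    {
        if (fes[cur].reconfigure)        // this FE completed a full cycle
        {
            for (int i = 0; i < n; i++)  // every FE has seen the new set
                fes[i].reconfigure = false;
            // the patch memcpy()s m_latestParam over m_param at this point
            pending = false;
        }
        fes[cur].param = pending ? latest : current;
        fes[cur].reconfigure = pending;
    }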
@@ -867,28 +901,23 @@
152
 int Encoder::reconfigureParam(x265_param* encParam, x265_param* param)
153
 {
154
     encParam->maxNumReferences = param->maxNumReferences; // never uses more refs than specified in stream headers
155
-    encParam->bEnableLoopFilter = param->bEnableLoopFilter;
156
-    encParam->deblockingFilterTCOffset = param->deblockingFilterTCOffset;
157
-    encParam->deblockingFilterBetaOffset = param->deblockingFilterBetaOffset;
158
     encParam->bEnableFastIntra = param->bEnableFastIntra;
159
     encParam->bEnableEarlySkip = param->bEnableEarlySkip;
160
-    encParam->bEnableTemporalMvp = param->bEnableTemporalMvp;
161
-    /* Scratch buffer prevents me_range from being increased for esa/tesa
162
-    if (param->searchMethod < X265_FULL_SEARCH || param->searchMethod < encParam->searchRange)
163
-        encParam->searchRange = param->searchRange; */
164
-    encParam->noiseReductionInter = param->noiseReductionInter;
165
-    encParam->noiseReductionIntra = param->noiseReductionIntra;
166
+    encParam->bEnableRecursionSkip = param->bEnableRecursionSkip;
167
+    encParam->searchMethod = param->searchMethod;
168
+    /* Scratch buffer prevents me_range from being increased for esa/tesa */
169
+    if (param->searchRange < encParam->searchRange)
170
+        encParam->searchRange = param->searchRange;
171
     /* We can't switch out of subme=0 during encoding. */
172
     if (encParam->subpelRefine)
173
         encParam->subpelRefine = param->subpelRefine;
174
     encParam->rdoqLevel = param->rdoqLevel;
175
     encParam->rdLevel = param->rdLevel;
176
-    encParam->bEnableTSkipFast = param->bEnableTSkipFast;
177
-    encParam->psyRd = param->psyRd;
178
-    encParam->psyRdoq = param->psyRdoq;
179
-    encParam->bEnableSignHiding = param->bEnableSignHiding;
180
-    encParam->bEnableFastIntra = param->bEnableFastIntra;
181
-    encParam->maxTUSize = param->maxTUSize;
182
+    encParam->bEnableRectInter = param->bEnableRectInter;
183
+    encParam->maxNumMergeCand = param->maxNumMergeCand;
184
+    encParam->bIntraInBFrames = param->bIntraInBFrames;
185
+    /* To add: Loop Filter/deblocking controls, transform skip, signhide require PPS to be resent */
186
+    /* To add: SAO, temporal MVP, AMP, TU depths require SPS to be resent, at every CVS boundary */
187
     return x265_check_params(encParam);
188
 }
189
 
190
@@ -1214,12 +1243,6 @@
191
 
192
         stats->maxCLL         = m_analyzeAll.m_maxCLL;
193
         stats->maxFALL        = (uint16_t)(m_analyzeAll.m_maxFALL / m_analyzeAll.m_numPics);
194
-
195
-        if (m_emitCLLSEI)
196
-        {
197
-            m_param->maxCLL = stats->maxCLL;
198
-            m_param->maxFALL = stats->maxFALL;
199
-        }
200
     }
201
 
202
     /* If new statistics are added to x265_stats, we must check here whether the
203
@@ -1304,7 +1327,7 @@
204
 
205
     if (frameStats)
206
     {
207
-        const int picOrderCntLSB = (slice->m_poc - slice->m_lastIDR + (1 << BITS_FOR_POC)) % (1 << BITS_FOR_POC);
208
+        const int picOrderCntLSB = slice->m_poc - slice->m_lastIDR;
209
 
210
         frameStats->encoderOrder = m_outputCount++;
211
         frameStats->sliceType = c;
212
@@ -1576,7 +1599,6 @@
213
 void Encoder::configure(x265_param *p)
214
 {
215
     this->m_param = p;
216
-
217
     if (p->keyframeMax < 0)
218
     {
219
         /* A negative max GOP size indicates the user wants only one I frame at
220
@@ -1741,12 +1763,20 @@
221
         x265_log(p, X265_LOG_WARNING, "Analysis load/save options incompatible with pmode/pme, Disabling pmode/pme\n");
222
         p->bDistributeMotionEstimation = p->bDistributeModeAnalysis = 0;
223
     }
224
+
225
     if (p->analysisMode && p->rc.cuTree)
226
     {
227
         x265_log(p, X265_LOG_WARNING, "Analysis load/save options works only with cu-tree off, Disabling cu-tree\n");
228
         p->rc.cuTree = 0;
229
     }
230
 
231
+    if (p->rc.bEnableGrain)
232
+    {
233
+        x265_log(p, X265_LOG_WARNING, "Rc Grain removes qp fluctuations caused by aq/cutree, Disabling aq,cu-tree\n");
234
+        p->rc.cuTree = 0;
235
+        p->rc.aqMode = 0;
236
+    }
237
+
238
     if (p->bDistributeModeAnalysis && (p->limitReferences >> 1) && 1)
239
     {
240
         x265_log(p, X265_LOG_WARNING, "Limit reference options 2 and 3 are not supported with pmode. Disabling limit reference\n");
241
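rc-grain trades adaptive quantization for strict QP stability, which is why the block above forces aq-mode and cu-tree off. Minimal setup, assuming the refactored --tune grain routes through the same rc.bEnableGrain flag this changelog describes (helper name illustrative):

    #include <x265.h>

    static x265_param* grainParam()
    {
        x265_param* p = x265_param_alloc();
        x265_param_default_preset(p, "slow", "grain"); // --preset slow --tune grain
        p->rc.bEnableGrain = 1;  // explicit; configure() then zeroes aqMode/cuTree
        return p;
    }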
@@ -1815,20 +1845,10 @@
242
         m_conformanceWindow.rightOffset = padsize;
243
     }
244
 
245
-    /* set pad size if height is not multiple of the minimum CU size */
246
-    if (p->sourceHeight & (p->minCUSize - 1))
247
-    {
248
-        uint32_t rem = p->sourceHeight & (p->minCUSize - 1);
249
-        uint32_t padsize = p->minCUSize - rem;
250
-        p->sourceHeight += padsize;
251
-
252
-        m_conformanceWindow.bEnabled = true;
253
-        m_conformanceWindow.bottomOffset = padsize;
254
-    }
255
-    if (p->bDistributeModeAnalysis && p->analysisMode)
256
+    if (p->bEnableRdRefine && (p->rdLevel < 5 || !p->rc.aqMode))
257
     {
258
-        p->analysisMode = X265_ANALYSIS_OFF;
259
-        x265_log(p, X265_LOG_WARNING, "Analysis save and load mode not supported for distributed mode analysis\n");
260
+        p->bEnableRdRefine = false;
261
+        x265_log(p, X265_LOG_WARNING, "--rd-refine disabled, requires RD level > 4 and adaptive quant\n");
262
     }
263
 
264
     bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
265
@@ -1848,6 +1868,112 @@
266
     else
267
         m_param->rc.qgSize = p->maxCUSize;
268
 
269
+    if (p->uhdBluray)
270
+    {
271
+        p->bEnableAccessUnitDelimiters = 1;
272
+        p->vui.aspectRatioIdc = 1;
273
+        p->bEmitHRDSEI = 1;
274
+        int disableUhdBd = 0;
275
+
276
+        if (p->levelIdc && p->levelIdc != 51)
277
+        {
278
+            x265_log(p, X265_LOG_WARNING, "uhd-bd: Wrong level specified, UHD Bluray mandates Level 5.1\n");
279
+        }
280
+        p->levelIdc = 51;
281
+
282
+        if (!p->bHighTier)
283
+        {
284
+            x265_log(p, X265_LOG_WARNING, "uhd-bd: Turning on high tier\n");
285
+            p->bHighTier = 1;
286
+        }
287
+
288
+        if (!p->bRepeatHeaders)
289
+        {
290
+            x265_log(p, X265_LOG_WARNING, "uhd-bd: Turning on repeat-headers\n");
291
+            p->bRepeatHeaders = 1;
292
+        }
293
+
294
+        if (p->bOpenGOP)
295
+        {
296
+            x265_log(p, X265_LOG_WARNING, "uhd-bd: Turning off open GOP\n");
297
+            p->bOpenGOP = false;
298
+        }
299
+
300
+        if (p->bIntraRefresh)
301
+        {
302
+            x265_log(p, X265_LOG_WARNING, "uhd-bd: turning off intra-refresh\n");
303
+            p->bIntraRefresh = 0;
304
+        }
305
+
306
+        if (p->keyframeMin != 1)
307
+        {
308
+            x265_log(p, X265_LOG_WARNING, "uhd-bd: keyframeMin is always 1\n");
309
+            p->keyframeMin = 1;
310
+        }
311
+
312
+        int fps = (p->fpsNum + p->fpsDenom - 1) / p->fpsDenom;
313
+        if (p->keyframeMax > fps)
314
+        {
315
+            x265_log(p, X265_LOG_WARNING, "uhd-bd: reducing keyframeMax to %d\n", fps);
316
+            p->keyframeMax = fps;
317
+        }
318
+
319
+        if (p->maxNumReferences > 6)
320
+        {
321
+            x265_log(p, X265_LOG_WARNING, "uhd-bd: reducing references to 6\n");
322
+            p->maxNumReferences = 6;
323
+        }
324
+
325
+        if (p->bEnableTemporalSubLayers)
326
+        {
327
+            x265_log(p, X265_LOG_WARNING, "uhd-bd: Turning off temporal layering\n");
328
+            p->bEnableTemporalSubLayers = 0;
329
+        }
330
+
331
+        if (p->vui.colorPrimaries != 1 && p->vui.colorPrimaries != 9)
332
+        {
333
+            x265_log(p, X265_LOG_ERROR, "uhd-bd: colour primaries should be either BT.709 or BT.2020\n");
334
+            disableUhdBd = 1;
335
+        }
336
+        else if (p->vui.colorPrimaries == 9)
337
+        {
338
+            p->vui.bEnableChromaLocInfoPresentFlag = 1;
339
+            p->vui.chromaSampleLocTypeTopField = 2;
340
+            p->vui.chromaSampleLocTypeBottomField = 2;
341
+        }
342
+
343
+        if (p->vui.transferCharacteristics != 1 && p->vui.transferCharacteristics != 14 && p->vui.transferCharacteristics != 16)
344
+        {
345
+            x265_log(p, X265_LOG_ERROR, "uhd-bd: transfer characteristics supported are BT.709, BT.2020-10 or SMPTE ST.2084\n");
346
+            disableUhdBd = 1;
347
+        }
348
+        if (p->vui.matrixCoeffs != 1 && p->vui.matrixCoeffs != 9)
349
+        {
350
+            x265_log(p, X265_LOG_ERROR, "uhd-bd: matrix coeffs supported are either BT.709 or BT.2020\n");
351
+            disableUhdBd = 1;
352
+        }
353
+        if ((p->sourceWidth != 1920 && p->sourceWidth != 3840) || (p->sourceHeight != 1080 && p->sourceHeight != 2160))
354
+        {
355
+            x265_log(p, X265_LOG_ERROR, "uhd-bd: Supported resolutions are 1920x1080 and 3840x2160\n");
356
+            disableUhdBd = 1;
357
+        }
358
+        if (disableUhdBd)
359
+        {
360
+            p->uhdBluray = 0;
361
+            x265_log(p, X265_LOG_ERROR, "uhd-bd: Disabled\n");
362
+        }
363
+    }
364
+
365
+    /* set pad size if height is not multiple of the minimum CU size */
366
+    if (p->sourceHeight & (p->minCUSize - 1))
367
+    {
368
+        uint32_t rem = p->sourceHeight & (p->minCUSize - 1);
369
+        uint32_t padsize = p->minCUSize - rem;
370
+        p->sourceHeight += padsize;
371
+        m_conformanceWindow.bEnabled = true;
372
+        m_conformanceWindow.bottomOffset = padsize;
373
+    }
374
+
375
     if (p->bLogCuStats)
376
         x265_log(p, X265_LOG_WARNING, "--cu-stats option is now deprecated\n");
377
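The uhd-bd block distinguishes soft violations (level, tier, GOP structure, keyint, refs, temporal layers), which are corrected with a warning, from hard ones (resolution, primaries, transfer, matrix), which disable the mode outright. A param setup that passes validation for a 4K HDR10-style encode (helper name illustrative):

    #include <x265.h>

    static void setupUhdBd(x265_param* p)
    {
        p->uhdBluray = 1;                    // --uhd-bd
        p->sourceWidth = 3840;               // only 1920x1080 / 3840x2160 pass
        p->sourceHeight = 2160;
        p->vui.colorPrimaries = 9;           // BT.2020
        p->vui.transferCharacteristics = 16; // SMPTE ST.2084 (PQ)
        p->vui.matrixCoeffs = 9;             // BT.2020 non-constant luminance
        // Level 5.1 high tier, closed GOP, <= 6 refs etc. are then enforced.
    }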
 
378
@@ -1877,8 +2003,9 @@
379
         CHECKED_MALLOC_ZERO(interData->ref, int32_t, analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir);
380
         CHECKED_MALLOC(interData->depth, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
381
         CHECKED_MALLOC(interData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
382
-        CHECKED_MALLOC_ZERO(interData->bestMergeCand, uint32_t, analysis->numCUsInFrame * CUGeom::MAX_GEOMS);
383
-        CHECKED_MALLOC_ZERO(interData->mv, MV, analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir);
384
+        CHECKED_MALLOC(interData->partSize, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
385
+        CHECKED_MALLOC(interData->mergeFlag, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
386
+        CHECKED_MALLOC_ZERO(interData->wt, WeightParam, 3 * numDir);
387
         analysis->interData = interData;
388
     }
389
     return;
390
@@ -1903,8 +2030,9 @@
391
         X265_FREE(((analysis_inter_data*)analysis->interData)->ref);
392
         X265_FREE(((analysis_inter_data*)analysis->interData)->depth);
393
         X265_FREE(((analysis_inter_data*)analysis->interData)->modes);
394
-        X265_FREE(((analysis_inter_data*)analysis->interData)->bestMergeCand);
395
-        X265_FREE(((analysis_inter_data*)analysis->interData)->mv);
396
+        X265_FREE(((analysis_inter_data*)analysis->interData)->mergeFlag);
397
+        X265_FREE(((analysis_inter_data*)analysis->interData)->partSize);
398
+        X265_FREE(((analysis_inter_data*)analysis->interData)->wt);
399
         X265_FREE(analysis->interData);
400
     }
401
 }
402
@@ -1923,10 +2051,12 @@
403
 
404
     static uint64_t consumedBytes = 0;
405
     static uint64_t totalConsumedBytes = 0;
406
+    uint32_t depthBytes = 0;
407
     fseeko(m_analysisFile, totalConsumedBytes, SEEK_SET);
408
 
409
     int poc; uint32_t frameRecordSize;
410
     X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFile);
411
+    X265_FREAD(&depthBytes, sizeof(uint32_t), 1, m_analysisFile);
412
     X265_FREAD(&poc, sizeof(int), 1, m_analysisFile);
413
 
414
     uint64_t currentOffset = totalConsumedBytes;
415
@@ -1937,6 +2067,7 @@
416
         currentOffset += frameRecordSize;
417
         fseeko(m_analysisFile, currentOffset, SEEK_SET);
418
         X265_FREAD(&frameRecordSize, sizeof(uint32_t), 1, m_analysisFile);
419
+        X265_FREAD(&depthBytes, sizeof(uint32_t), 1, m_analysisFile);
420
         X265_FREAD(&poc, sizeof(int), 1, m_analysisFile);
421
     }
422
 
423
@@ -1961,36 +2092,67 @@
424
 
425
     if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
426
     {
427
-        X265_FREAD(((analysis_intra_data *)analysis->intraData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
428
+        uint8_t *tempBuf = NULL, *depthBuf = NULL, *modeBuf = NULL, *partSizes = NULL;
429
+
430
+        tempBuf = X265_MALLOC(uint8_t, depthBytes * 3);
431
+        X265_FREAD(tempBuf, sizeof(uint8_t), depthBytes * 3, m_analysisFile);
432
+
433
+        depthBuf = tempBuf;
434
+        modeBuf = tempBuf + depthBytes;
435
+        partSizes = tempBuf + 2 * depthBytes;
436
+
437
+        size_t count = 0;
438
+        for (uint32_t d = 0; d < depthBytes; d++)
439
+        {
440
+            int bytes = analysis->numPartitions >> (depthBuf[d] * 2);
441
+            memset(&((analysis_intra_data *)analysis->intraData)->depth[count], depthBuf[d], bytes);
442
+            memset(&((analysis_intra_data *)analysis->intraData)->chromaModes[count], modeBuf[d], bytes);
443
+            memset(&((analysis_intra_data *)analysis->intraData)->partSizes[count], partSizes[d], bytes);
444
+            count += bytes;
445
+        }
446
         X265_FREAD(((analysis_intra_data *)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
447
-        X265_FREAD(((analysis_intra_data *)analysis->intraData)->partSizes, sizeof(char), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
448
-        X265_FREAD(((analysis_intra_data *)analysis->intraData)->chromaModes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
449
+        X265_FREE(tempBuf);
450
         analysis->sliceType = X265_TYPE_I;
451
         consumedBytes += frameRecordSize;
452
     }
453
-    else if (analysis->sliceType == X265_TYPE_P)
454
-    {
455
-        X265_FREAD(((analysis_inter_data *)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU, m_analysisFile);
456
-        X265_FREAD(((analysis_inter_data *)analysis->interData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
457
-        X265_FREAD(((analysis_inter_data *)analysis->interData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
458
-        X265_FREAD(((analysis_inter_data *)analysis->interData)->bestMergeCand, sizeof(uint32_t), analysis->numCUsInFrame * CUGeom::MAX_GEOMS, m_analysisFile);
459
-        X265_FREAD(((analysis_inter_data *)analysis->interData)->mv, sizeof(MV), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU, m_analysisFile);
460
-        consumedBytes += frameRecordSize;
461
-        totalConsumedBytes = consumedBytes;
462
-    }
463
+
464
     else
465
     {
466
-        X265_FREAD(((analysis_inter_data *)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2, m_analysisFile);
467
-        X265_FREAD(((analysis_inter_data *)analysis->interData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
468
-        X265_FREAD(((analysis_inter_data *)analysis->interData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
469
-        X265_FREAD(((analysis_inter_data *)analysis->interData)->bestMergeCand, sizeof(uint32_t), analysis->numCUsInFrame * CUGeom::MAX_GEOMS, m_analysisFile);
470
-        X265_FREAD(((analysis_inter_data *)analysis->interData)->mv, sizeof(MV), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2, m_analysisFile);
471
+        uint8_t *tempBuf = NULL, *depthBuf = NULL, *modeBuf = NULL, *partSize = NULL, *mergeFlag = NULL;
472
+
473
+        tempBuf = X265_MALLOC(uint8_t, depthBytes * 4);
474
+        X265_FREAD(tempBuf, sizeof(uint8_t), depthBytes * 4, m_analysisFile);
475
+
476
+        depthBuf = tempBuf;
477
+        modeBuf = tempBuf + depthBytes;
478
+        partSize = modeBuf + depthBytes;
479
+        mergeFlag = partSize + depthBytes;
480
+
481
+        size_t count = 0;
482
+        for (uint32_t d = 0; d < depthBytes; d++)
483
+        {
484
+            int bytes = analysis->numPartitions >> (depthBuf[d] * 2);
485
+            memset(&((analysis_inter_data *)analysis->interData)->depth[count], depthBuf[d], bytes);
486
+            memset(&((analysis_inter_data *)analysis->interData)->modes[count], modeBuf[d], bytes);
487
+            memset(&((analysis_inter_data *)analysis->interData)->partSize[count], partSize[d], bytes);
488
+            memset(&((analysis_inter_data *)analysis->interData)->mergeFlag[count], mergeFlag[d], bytes);
489
+            count += bytes;
490
+        }
491
+
492
+        X265_FREE(tempBuf);
493
+
494
+        int numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2;
495
+        X265_FREAD(((analysis_inter_data *)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFile);
496
+        uint32_t numPlanes = m_param->internalCsp == X265_CSP_I400 ? 1 : 3;
497
+        X265_FREAD(((analysis_inter_data *)analysis->interData)->wt, sizeof(WeightParam), numPlanes * numDir, m_analysisFile);
498
         consumedBytes += frameRecordSize;
499
+        if (numDir == 1)
500
+            totalConsumedBytes = consumedBytes;
501
     }
502
 #undef X265_FREAD
503
 }
504
 
505
-void Encoder::writeAnalysisFile(x265_analysis_data* analysis)
506
+void Encoder::writeAnalysisFile(x265_analysis_data* analysis, FrameData &curEncData)
507
 {
508
 
509
 #define X265_FWRITE(val, size, writeSize, fileOffset)\
510
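The reader above and the writer in the next hunk share one run-length trick: per-CU values are stored once (depthBytes entries) and fanned out to the per-4x4-partition arrays, since a CU at depth d covers numPartitions >> (2 * d) partitions. Extracted (helper name illustrative):

    #include <cstdint>
    #include <cstring>

    // Expand depth-keyed per-CU values into a flat per-partition array,
    // mirroring the memset() loops in readAnalysisFile().
    static size_t expandByDepth(const uint8_t* depths, const uint8_t* values,
                                uint32_t depthBytes, uint32_t numPartitions,
                                uint8_t* out)
    {
        size_t count = 0;
        for (uint32_t d = 0; d < depthBytes; d++)
        {
            size_t span = numPartitions >> (depths[d] * 2);
            memset(out + count, values[d], span);
            count += span;
        }
        return count;
    }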
@@ -2002,26 +2164,82 @@
511
         return;\
512
     }\
513
 
514
-    /* calculate frameRecordSize */
515
-    analysis->frameRecordSize = sizeof(analysis->frameRecordSize) + sizeof(analysis->poc) + sizeof(analysis->sliceType) +
516
-                      sizeof(analysis->numCUsInFrame) + sizeof(analysis->numPartitions) + sizeof(analysis->bScenecut) + sizeof(analysis->satdCost);
517
+    uint32_t depthBytes = 0;
518
     if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
519
-        analysis->frameRecordSize += sizeof(uint8_t) * analysis->numCUsInFrame * analysis->numPartitions * 4;
520
-    else if (analysis->sliceType == X265_TYPE_P)
521
     {
522
-        analysis->frameRecordSize += sizeof(int32_t) * analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU;
523
-        analysis->frameRecordSize += sizeof(uint8_t) * analysis->numCUsInFrame * analysis->numPartitions * 2;
524
-        analysis->frameRecordSize += sizeof(uint32_t) * analysis->numCUsInFrame * CUGeom::MAX_GEOMS;
525
-        analysis->frameRecordSize += sizeof(MV) * analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU;
526
+        for (uint32_t cuAddr = 0; cuAddr < analysis->numCUsInFrame; cuAddr++)
527
+        {
528
+            uint8_t depth = 0;
529
+            uint8_t mode = 0;
530
+            uint8_t partSize = 0;
531
+
532
+            CUData* ctu = curEncData.getPicCTU(cuAddr);
533
+            analysis_intra_data* intraDataCTU = (analysis_intra_data*)analysis->intraData;
534
+
535
+            for (uint32_t absPartIdx = 0; absPartIdx < ctu->m_numPartitions; depthBytes++)
536
+            {
537
+                depth = ctu->m_cuDepth[absPartIdx];
538
+                intraDataCTU->depth[depthBytes] = depth;
539
+
540
+                mode = ctu->m_chromaIntraDir[absPartIdx];
541
+                intraDataCTU->chromaModes[depthBytes] = mode;
542
+
543
+                partSize = ctu->m_partSize[absPartIdx];
544
+                intraDataCTU->partSizes[depthBytes] = partSize;
545
+
546
+                absPartIdx += ctu->m_numPartitions >> (depth * 2);
547
+            }
548
+            memcpy(&intraDataCTU->modes[ctu->m_cuAddr * ctu->m_numPartitions], ctu->m_lumaIntraDir, sizeof(uint8_t)* ctu->m_numPartitions);
549
+        }
550
     }
551
     else
552
     {
553
-        analysis->frameRecordSize += sizeof(int32_t) * analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2;
554
-        analysis->frameRecordSize += sizeof(uint8_t) * analysis->numCUsInFrame * analysis->numPartitions * 2;
555
-        analysis->frameRecordSize += sizeof(uint32_t) * analysis->numCUsInFrame * CUGeom::MAX_GEOMS;
556
-        analysis->frameRecordSize += sizeof(MV) * analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2;
557
+        for (uint32_t cuAddr = 0; cuAddr < analysis->numCUsInFrame; cuAddr++)
558
+        {
559
+            uint8_t depth = 0;
560
+            uint8_t predMode = 0;
561
+            uint8_t partSize = 0;
562
+            uint8_t mergeFlag = 0;
563
+
564
+            CUData* ctu = curEncData.getPicCTU(cuAddr);
565
+            analysis_inter_data* interDataCTU = (analysis_inter_data*)analysis->interData;
566
+
567
+            for (uint32_t absPartIdx = 0; absPartIdx < ctu->m_numPartitions; depthBytes++)
568
+            {
569
+                depth = ctu->m_cuDepth[absPartIdx];
570
+                interDataCTU->depth[depthBytes] = depth;
571
+
572
+                predMode = ctu->m_predMode[absPartIdx];
573
+                if (ctu->m_refIdx[1][absPartIdx] != -1)
574
+                    predMode = 4; // used as indicator that the block is coded as bidir
575
+
576
+                interDataCTU->modes[depthBytes] = predMode;
577
+
578
+                partSize = ctu->m_partSize[absPartIdx];
579
+                interDataCTU->partSize[depthBytes] = partSize;
580
+
581
+                mergeFlag = ctu->m_mergeFlag[absPartIdx];
582
+                interDataCTU->mergeFlag[depthBytes] = mergeFlag;
583
+
584
+                absPartIdx += ctu->m_numPartitions >> (depth * 2);
585
+            }
586
+        }
587
+    }
588
+
589
+    /* calculate frameRecordSize */
590
+    analysis->frameRecordSize = sizeof(analysis->frameRecordSize) + sizeof(depthBytes) + sizeof(analysis->poc) + sizeof(analysis->sliceType) +
591
+                      sizeof(analysis->numCUsInFrame) + sizeof(analysis->numPartitions) + sizeof(analysis->bScenecut) + sizeof(analysis->satdCost);
592
+    if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
593
+        analysis->frameRecordSize += sizeof(uint8_t)* analysis->numCUsInFrame * analysis->numPartitions + depthBytes * 3;
594
+    else
595
+    {
596
+        int numDir = (analysis->sliceType == X265_TYPE_P) ? 1 : 2;
597
+        analysis->frameRecordSize += depthBytes * 4;
598
+        analysis->frameRecordSize += sizeof(int32_t)* analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir;
599
+        analysis->frameRecordSize += sizeof(WeightParam)* 3 * numDir;
600
     }
601
     X265_FWRITE(&analysis->frameRecordSize, sizeof(uint32_t), 1, m_analysisFile);
602
+    X265_FWRITE(&depthBytes, sizeof(uint32_t), 1, m_analysisFile);
603
     X265_FWRITE(&analysis->poc, sizeof(int), 1, m_analysisFile);
604
     X265_FWRITE(&analysis->sliceType, sizeof(int), 1, m_analysisFile);
605
     X265_FWRITE(&analysis->bScenecut, sizeof(int), 1, m_analysisFile);
606
@@ -2031,26 +2249,46 @@
607
 
608
     if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
609
     {
610
-        X265_FWRITE(((analysis_intra_data*)analysis->intraData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
611
+        X265_FWRITE(((analysis_intra_data*)analysis->intraData)->depth, sizeof(uint8_t), depthBytes, m_analysisFile);
612
+        X265_FWRITE(((analysis_intra_data*)analysis->intraData)->chromaModes, sizeof(uint8_t), depthBytes, m_analysisFile);
613
+        X265_FWRITE(((analysis_intra_data*)analysis->intraData)->partSizes, sizeof(char), depthBytes, m_analysisFile);
614
         X265_FWRITE(((analysis_intra_data*)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
615
-        X265_FWRITE(((analysis_intra_data*)analysis->intraData)->partSizes, sizeof(char), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
616
-        X265_FWRITE(((analysis_intra_data*)analysis->intraData)->chromaModes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
617
-    }
618
-    else if (analysis->sliceType == X265_TYPE_P)
619
-    {
620
-        X265_FWRITE(((analysis_inter_data*)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU, m_analysisFile);
621
-        X265_FWRITE(((analysis_inter_data*)analysis->interData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
622
-        X265_FWRITE(((analysis_inter_data*)analysis->interData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
623
-        X265_FWRITE(((analysis_inter_data*)analysis->interData)->bestMergeCand, sizeof(uint32_t), analysis->numCUsInFrame * CUGeom::MAX_GEOMS, m_analysisFile);
624
-        X265_FWRITE(((analysis_inter_data*)analysis->interData)->mv, sizeof(MV), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU, m_analysisFile);
625
     }
626
     else
627
     {
628
-        X265_FWRITE(((analysis_inter_data*)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2, m_analysisFile);
629
-        X265_FWRITE(((analysis_inter_data*)analysis->interData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
630
-        X265_FWRITE(((analysis_inter_data*)analysis->interData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile);
631
-        X265_FWRITE(((analysis_inter_data*)analysis->interData)->bestMergeCand, sizeof(uint32_t), analysis->numCUsInFrame * CUGeom::MAX_GEOMS, m_analysisFile);
632
-        X265_FWRITE(((analysis_inter_data*)analysis->interData)->mv, sizeof(MV), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2, m_analysisFile);
633
+        int numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2;
634
+        X265_FWRITE(((analysis_inter_data*)analysis->interData)->depth, sizeof(uint8_t), depthBytes, m_analysisFile);
635
+        X265_FWRITE(((analysis_inter_data*)analysis->interData)->modes, sizeof(uint8_t), depthBytes, m_analysisFile);
636
+        X265_FWRITE(((analysis_inter_data*)analysis->interData)->partSize, sizeof(uint8_t), depthBytes, m_analysisFile);
637
+        X265_FWRITE(((analysis_inter_data*)analysis->interData)->mergeFlag, sizeof(uint8_t), depthBytes, m_analysisFile);
638
+        X265_FWRITE(((analysis_inter_data*)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFile);
639
+        uint32_t numPlanes = m_param->internalCsp == X265_CSP_I400 ? 1 : 3;
640
+        X265_FWRITE(((analysis_inter_data*)analysis->interData)->wt, sizeof(WeightParam), numPlanes * numDir, m_analysisFile);
641
     }
642
 #undef X265_FWRITE
643
 }
644
+
645
+void Encoder::printReconfigureParams()
646
+{
647
+    if (!m_reconfigure)
648
+        return;
649
+    x265_param* oldParam = m_param;
650
+    x265_param* newParam = m_latestParam;
651
+    
652
+    x265_log(newParam, X265_LOG_INFO, "Reconfigured param options, input Frame: %d\n", m_pocLast + 1);
653
+
654
+    char tmp[40];
655
+#define TOOLCMP(COND1, COND2, STR)  if (COND1 != COND2) { sprintf(tmp, STR, COND1, COND2); x265_log(newParam, X265_LOG_INFO, tmp); }
656
+    TOOLCMP(oldParam->maxNumReferences, newParam->maxNumReferences, "ref=%d to %d\n");
657
+    TOOLCMP(oldParam->bEnableFastIntra, newParam->bEnableFastIntra, "fast-intra=%d to %d\n");
658
+    TOOLCMP(oldParam->bEnableEarlySkip, newParam->bEnableEarlySkip, "early-skip=%d to %d\n");
659
+    TOOLCMP(oldParam->bEnableRecursionSkip, newParam->bEnableRecursionSkip, "rskip=%d to %d\n");
660
+    TOOLCMP(oldParam->searchMethod, newParam->searchMethod, "me=%d to %d\n");
661
+    TOOLCMP(oldParam->searchRange, newParam->searchRange, "merange=%d to %d\n");
662
+    TOOLCMP(oldParam->subpelRefine, newParam->subpelRefine, "subme= %d to %d\n");
663
+    TOOLCMP(oldParam->rdLevel, newParam->rdLevel, "rd=%d to %d\n");
664
+    TOOLCMP(oldParam->rdoqLevel, newParam->rdoqLevel, "rdoq=%d to %d\n" );
665
+    TOOLCMP(oldParam->bEnableRectInter, newParam->bEnableRectInter, "rect=%d to %d\n");
666
+    TOOLCMP(oldParam->maxNumMergeCand, newParam->maxNumMergeCand, "max-merge=%d to %d\n");
667
+    TOOLCMP(oldParam->bIntraInBFrames, newParam->bIntraInBFrames, "b-intra=%d to %d\n");
668
+}
669
x265_1.9.tar.gz/source/encoder/encoder.h -> x265_2.0.tar.gz/source/encoder/encoder.h Changed
45
 
1
@@ -74,6 +74,7 @@
2
 class Lookahead;
3
 class RateControl;
4
 class ThreadPool;
5
+class FrameData;
6
 
7
 class Encoder : public x265_encoder
8
 {
9
@@ -110,7 +111,7 @@
10
     Frame*             m_exportedPic;
11
     FILE*              m_analysisFile;
12
     x265_param*        m_param;
13
-    x265_param*        m_latestParam;
14
+    x265_param*        m_latestParam;     // Holds latest param during a reconfigure
15
     RateControl*       m_rateControl;
16
     Lookahead*         m_lookahead;
17
 
18
@@ -129,7 +130,7 @@
19
     bool               m_emitCLLSEI;
20
     bool               m_bZeroLatency;     // x265_encoder_encode() returns NALs for the input picture, zero lag
21
     bool               m_aborted;          // fatal error detected
22
-    bool               m_reconfigured;      // reconfigure of encoder detected
23
+    bool               m_reconfigure;      // Encoder reconfigure in progress
24
 
25
     /* Begin intra refresh when one not in progress or else begin one as soon as the current 
26
      * one is done. Requires bIntraRefresh to be set.*/
27
@@ -152,6 +153,8 @@
28
 
29
     void printSummary();
30
 
31
+    void printReconfigureParams();
32
+
33
     char* statsString(EncStats&, char*);
34
 
35
     void configure(x265_param *param);
36
@@ -164,7 +167,7 @@
37
 
38
     void readAnalysisFile(x265_analysis_data* analysis, int poc);
39
 
40
-    void writeAnalysisFile(x265_analysis_data* pic);
41
+    void writeAnalysisFile(x265_analysis_data* pic, FrameData &curEncData);
42
 
43
     void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, x265_frame_stats* frameStats, int inPoc);
44
 
45
x265_1.9.tar.gz/source/encoder/entropy.cpp -> x265_2.0.tar.gz/source/encoder/entropy.cpp Changed
310
 
1
@@ -38,6 +38,189 @@
2
 
3
 namespace X265_NS {
4
 
5
+// initial probability for cu_transquant_bypass flag
6
+static const uint8_t INIT_CU_TRANSQUANT_BYPASS_FLAG[3][NUM_TQUANT_BYPASS_FLAG_CTX] =
7
+{
8
+    { 154 },
9
+    { 154 },
10
+    { 154 },
11
+};
12
+
13
+// initial probability for split flag
14
+static const uint8_t INIT_SPLIT_FLAG[3][NUM_SPLIT_FLAG_CTX] =
15
+{
16
+    { 107,  139,  126, },
17
+    { 107,  139,  126, },
18
+    { 139,  141,  157, },
19
+};
20
+
21
+static const uint8_t INIT_SKIP_FLAG[3][NUM_SKIP_FLAG_CTX] =
22
+{
23
+    { 197,  185,  201, },
24
+    { 197,  185,  201, },
25
+    { CNU,  CNU,  CNU, },
26
+};
27
+
28
+static const uint8_t INIT_MERGE_FLAG_EXT[3][NUM_MERGE_FLAG_EXT_CTX] =
29
+{
30
+    { 154, },
31
+    { 110, },
32
+    { CNU, },
33
+};
34
+
35
+static const uint8_t INIT_MERGE_IDX_EXT[3][NUM_MERGE_IDX_EXT_CTX] =
36
+{
37
+    { 137, },
38
+    { 122, },
39
+    { CNU, },
40
+};
41
+
42
+static const uint8_t INIT_PART_SIZE[3][NUM_PART_SIZE_CTX] =
43
+{
44
+    { 154,  139,  154, 154 },
45
+    { 154,  139,  154, 154 },
46
+    { 184,  CNU,  CNU, CNU },
47
+};
48
+
49
+static const uint8_t INIT_PRED_MODE[3][NUM_PRED_MODE_CTX] =
50
+{
51
+    { 134, },
52
+    { 149, },
53
+    { CNU, },
54
+};
55
+
56
+static const uint8_t INIT_INTRA_PRED_MODE[3][NUM_ADI_CTX] =
57
+{
58
+    { 183, },
59
+    { 154, },
60
+    { 184, },
61
+};
62
+
63
+static const uint8_t INIT_CHROMA_PRED_MODE[3][NUM_CHROMA_PRED_CTX] =
64
+{
65
+    { 152,  139, },
66
+    { 152,  139, },
67
+    {  63,  139, },
68
+};
69
+
70
+static const uint8_t INIT_INTER_DIR[3][NUM_INTER_DIR_CTX] =
71
+{
72
+    {  95,   79,   63,   31,  31, },
73
+    {  95,   79,   63,   31,  31, },
74
+    { CNU,  CNU,  CNU,  CNU, CNU, },
75
+};
76
+
77
+static const uint8_t INIT_MVD[3][NUM_MV_RES_CTX] =
78
+{
79
+    { 169,  198, },
80
+    { 140,  198, },
81
+    { CNU,  CNU, },
82
+};
83
+
84
+static const uint8_t INIT_REF_PIC[3][NUM_REF_NO_CTX] =
85
+{
86
+    { 153,  153 },
87
+    { 153,  153 },
88
+    { CNU,  CNU },
89
+};
90
+
91
+static const uint8_t INIT_DQP[3][NUM_DELTA_QP_CTX] =
92
+{
93
+    { 154,  154,  154, },
94
+    { 154,  154,  154, },
95
+    { 154,  154,  154, },
96
+};
97
+
98
+static const uint8_t INIT_QT_CBF[3][NUM_QT_CBF_CTX] =
99
+{
100
+    { 153,  111,  149,   92,  167,  154,  154 },
101
+    { 153,  111,  149,  107,  167,  154,  154 },
102
+    { 111,  141,   94,  138,  182,  154,  154 },
103
+};
104
+
105
+static const uint8_t INIT_QT_ROOT_CBF[3][NUM_QT_ROOT_CBF_CTX] =
106
+{
107
+    {  79, },
108
+    {  79, },
109
+    { CNU, },
110
+};
111
+
112
+static const uint8_t INIT_LAST[3][NUM_CTX_LAST_FLAG_XY] =
113
+{
114
+    { 125,  110,  124,  110,   95,   94,  125,  111,  111,   79,  125,  126,  111,  111,   79,
115
+      108,  123,   93 },
116
+    { 125,  110,   94,  110,   95,   79,  125,  111,  110,   78,  110,  111,  111,   95,   94,
117
+      108,  123,  108 },
118
+    { 110,  110,  124,  125,  140,  153,  125,  127,  140,  109,  111,  143,  127,  111,   79,
119
+      108,  123,   63 },
120
+};
121
+
122
+static const uint8_t INIT_SIG_CG_FLAG[3][2 * NUM_SIG_CG_FLAG_CTX] =
123
+{
124
+    { 121,  140,
125
+      61,  154, },
126
+    { 121,  140,
127
+      61,  154, },
128
+    {  91,  171,
129
+       134,  141, },
130
+};
131
+
132
+static const uint8_t INIT_SIG_FLAG[3][NUM_SIG_FLAG_CTX] =
133
+{
134
+    { 170,  154,  139,  153,  139,  123,  123,   63,  124,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  170,  153,  138,  138,  122,  121,  122,  121,  167,  151,  183,  140,  151,  183,  140,  },
135
+    { 155,  154,  139,  153,  139,  123,  123,   63,  153,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  166,  183,  140,  136,  153,  154,  170,  153,  123,  123,  107,  121,  107,  121,  167,  151,  183,  140,  151,  183,  140,  },
136
+    { 111,  111,  125,  110,  110,   94,  124,  108,  124,  107,  125,  141,  179,  153,  125,  107,  125,  141,  179,  153,  125,  107,  125,  141,  179,  153,  125,  140,  139,  182,  182,  152,  136,  152,  136,  153,  136,  139,  111,  136,  139,  111,  },
137
+};
138
+
139
+static const uint8_t INIT_ONE_FLAG[3][NUM_ONE_FLAG_CTX] =
140
+{
141
+    { 154,  196,  167,  167,  154,  152,  167,  182,  182,  134,  149,  136,  153,  121,  136,  122,  169,  208,  166,  167,  154,  152,  167,  182, },
142
+    { 154,  196,  196,  167,  154,  152,  167,  182,  182,  134,  149,  136,  153,  121,  136,  137,  169,  194,  166,  167,  154,  167,  137,  182, },
143
+    { 140,   92,  137,  138,  140,  152,  138,  139,  153,   74,  149,   92,  139,  107,  122,  152,  140,  179,  166,  182,  140,  227,  122,  197, },
144
+};
145
+
146
+static const uint8_t INIT_ABS_FLAG[3][NUM_ABS_FLAG_CTX] =
147
+{
148
+    { 107,  167,   91,  107,  107,  167, },
149
+    { 107,  167,   91,  122,  107,  167, },
150
+    { 138,  153,  136,  167,  152,  152, },
151
+};
152
+
153
+static const uint8_t INIT_MVP_IDX[3][NUM_MVP_IDX_CTX] =
154
+{
155
+    { 168 },
156
+    { 168 },
157
+    { CNU },
158
+};
159
+
160
+static const uint8_t INIT_SAO_MERGE_FLAG[3][NUM_SAO_MERGE_FLAG_CTX] =
161
+{
162
+    { 153,  },
163
+    { 153,  },
164
+    { 153,  },
165
+};
166
+
167
+static const uint8_t INIT_SAO_TYPE_IDX[3][NUM_SAO_TYPE_IDX_CTX] =
168
+{
169
+    { 160, },
170
+    { 185, },
171
+    { 200, },
172
+};
173
+
174
+static const uint8_t INIT_TRANS_SUBDIV_FLAG[3][NUM_TRANS_SUBDIV_FLAG_CTX] =
175
+{
176
+    { 224,  167,  122, },
177
+    { 124,  138,   94, },
178
+    { 153,  138,  138, },
179
+};
180
+
181
+static const uint8_t INIT_TRANSFORMSKIP_FLAG[3][2 * NUM_TRANSFORMSKIP_FLAG_CTX] =
182
+{
183
+    { 139,  139 },
184
+    { 139,  139 },
185
+    { 139,  139 },
186
+};
187
+
188
 Entropy::Entropy()
189
 {
190
     markValid();
191
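These tables are the 8-bit initializers (one row per slice type) the entropy coder expands into CABAC context states at slice start. The mapping, per HEVC 9.3.2.2, uses the high nibble as a QP slope and the low nibble as an offset; CNU (154) yields slope 0 and the mid state for any QP. A sketch (helper name illustrative; the packing follows how x265/HM store state and MPS together):

    #include <algorithm>
    #include <cstdint>

    // One byte from the tables above -> packed context state for sliceQp.
    static uint8_t initContextState(uint8_t initValue, int sliceQp)
    {
        int slope  = (initValue >> 4) * 5 - 45;
        int offset = ((initValue & 15) << 3) - 16;
        int pre    = std::min(std::max(1, ((slope * sliceQp) >> 4) + offset), 126);
        int mps    = pre > 63;                   // most probable symbol
        int state  = mps ? pre - 64 : 63 - pre;  // 6-bit probability index
        return (uint8_t)((state << 1) | mps);
    }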
@@ -306,7 +489,7 @@
192
 {
193
     for (int sizeId = 0; sizeId < ScalingList::NUM_SIZES; sizeId++)
194
     {
195
-        for (int listId = 0; listId < ScalingList::NUM_LISTS; listId++)
196
+        for (int listId = 0; listId < ScalingList::NUM_LISTS; listId += (sizeId == 3) ? 3 : 1)
197
         {
198
             int predList = scalingList.checkPredMode(sizeId, listId);
199
             WRITE_FLAG(predList < 0, "scaling_list_pred_mode_flag");
200
@@ -334,12 +517,7 @@
201
     for (int i = 0; i < coefNum; i++)
202
     {
203
         data = src[scan[i]] - nextCoef;
204
-        nextCoef = src[scan[i]];
205
-        if (data > 127)
206
-            data = data - 256;
207
-        if (data < -128)
208
-            data = data + 256;
209
-
210
+        nextCoef = (nextCoef + data + 256) % 256;
211
         WRITE_SVLC(data,  "scaling_list_delta_coef");
212
     }
213
 }
214
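
[Editor's note] The scaling-list writer above folds the old explicit wrap of the running predictor into one modular update: since data = src[scan[i]] - nextCoef, the new nextCoef = (nextCoef + data + 256) % 256 is simply src[scan[i]] modulo 256, which is what a conforming decoder reconstructs either way. A minimal standalone check of that identity (plain C++; WRITE_SVLC itself is not reproduced here):

        #include <cassert>

        int main()
        {
            for (int prev = 0; prev < 256; prev++)
                for (int cur = 0; cur < 256; cur++)
                {
                    int data = cur - prev;                // signed delta, SVLC-coded
                    int next = (prev + data + 256) % 256; // wrapped predictor update
                    assert(next == cur);                  // recovers the coefficient
                }
            return 0;
        }
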
@@ -726,16 +904,12 @@
215
     bool bSmallChroma = (log2CurSize - hChromaShift) < 2;
216
     if (!curDepth || !bSmallChroma)
217
     {
218
-        if (!curDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, curDepth - 1))
219
+        uint32_t parentIdx = absPartIdx & (0xFF << (log2CurSize + 1 - LOG2_UNIT_SIZE) * 2);
220
+        if (!curDepth || cu.getCbf(parentIdx, TEXT_CHROMA_U, curDepth - 1))
221
             codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, curDepth, !subdiv);
222
-        if (!curDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, curDepth - 1))
223
+        if (!curDepth || cu.getCbf(parentIdx, TEXT_CHROMA_V, curDepth - 1))
224
             codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, curDepth, !subdiv);
225
     }
226
-    else
227
-    {
228
-        X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, curDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, curDepth - 1), "chroma xform size match failure\n");
229
-        X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, curDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, curDepth - 1), "chroma xform size match failure\n");
230
-    }
231
 
232
     if (subdiv)
233
     {
234
@@ -758,7 +932,7 @@
235
         X265_CHECK(cu.getCbf(absPartIdxC, TEXT_LUMA, 0), "CBF should have been set\n");
236
     }
237
     else
238
-        codeQtCbfLuma(cu, absPartIdx, curDepth);
239
+        codeQtCbfLuma(cu.getCbf(absPartIdx, TEXT_LUMA, curDepth), curDepth);
240
 
241
     uint32_t cbfY = cu.getCbf(absPartIdx, TEXT_LUMA, curDepth);
242
     uint32_t cbfU = cu.getCbf(absPartIdxC, TEXT_CHROMA_U, curDepth);
243
@@ -879,7 +1053,7 @@
244
         X265_CHECK(cu.getCbf(absPartIdx, TEXT_LUMA, 0), "CBF should have been set\n");
245
     }
246
     else
247
-        codeQtCbfLuma(cu, absPartIdx, curDepth);
248
+        codeQtCbfLuma(cu.getCbf(absPartIdx, TEXT_LUMA, curDepth), curDepth);
249
 
250
     uint32_t cbfY = cu.getCbf(absPartIdx, TEXT_LUMA, curDepth);
251
 
252
@@ -1005,10 +1179,10 @@
253
         enum { OFFSET_THRESH = 1 << X265_MIN(X265_DEPTH - 5, 5) };
254
         if (typeIdx == SAO_BO)
255
         {
256
-            for (int i = 0; i < SAO_BO_LEN; i++)
257
+            for (int i = 0; i < SAO_NUM_OFFSET; i++)
258
                 codeSaoMaxUvlc(abs(ctuParam.offset[i]), OFFSET_THRESH - 1);
259
 
260
-            for (int i = 0; i < SAO_BO_LEN; i++)
261
+            for (int i = 0; i < SAO_NUM_OFFSET; i++)
262
                 if (ctuParam.offset[i] != 0)
263
                     encodeBinEP(ctuParam.offset[i] < 0);
264
 
265
@@ -1026,6 +1200,44 @@
266
     }
267
 }
268
 
269
+void Entropy::codeSaoOffsetEO(int *offset, int typeIdx, int plane)
270
+{
271
+    if (plane != 2)
272
+    {
273
+        encodeBin(1, m_contextState[OFF_SAO_TYPE_IDX_CTX]);
274
+        encodeBinEP(1);
275
+    }
276
+
277
+    enum { OFFSET_THRESH = 1 << X265_MIN(X265_DEPTH - 5, 5) };
278
+
279
+    codeSaoMaxUvlc(offset[0], OFFSET_THRESH - 1);
280
+    codeSaoMaxUvlc(offset[1], OFFSET_THRESH - 1);
281
+    codeSaoMaxUvlc(-offset[2], OFFSET_THRESH - 1);
282
+    codeSaoMaxUvlc(-offset[3], OFFSET_THRESH - 1);
283
+    if (plane != 2)
284
+        encodeBinsEP((uint32_t)(typeIdx), 2);
285
+}
286
+
287
+void Entropy::codeSaoOffsetBO(int *offset, int bandPos, int plane)
288
+{
289
+    if (plane != 2)
290
+    {
291
+        encodeBin(1, m_contextState[OFF_SAO_TYPE_IDX_CTX]);
292
+        encodeBinEP(0);
293
+    }
294
+
295
+    enum { OFFSET_THRESH = 1 << X265_MIN(X265_DEPTH - 5, 5) };
296
+
297
+    for (int i = 0; i < SAO_NUM_OFFSET; i++)
298
+        codeSaoMaxUvlc(abs(offset[i]), OFFSET_THRESH - 1);
299
+
300
+    for (int i = 0; i < SAO_NUM_OFFSET; i++)
301
+        if (offset[i] != 0)
302
+            encodeBinEP(offset[i] < 0);
303
+
304
+    encodeBinsEP(bandPos, 5);
305
+}
306
+
307
 /** initialize context model with respect to QP and initialization value */
308
 uint8_t sbacInit(int qp, int initValue)
309
 {
310
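
[Editor's note] The INIT_* tables added above are 8-bit CABAC initializers consumed by sbacInit(), which the viewer truncates at this point. Also worth noting in this section: the new codeSaoOffsetEO()/codeSaoOffsetBO() entry points code -offset[2] and -offset[3] because HEVC edge-offset categories 3 and 4 carry non-positive offsets whose sign is implicit, so only magnitudes are transmitted. For context, a sketch of the standard HEVC derivation those init values go through — this mirrors the HM reference model and is not copied from the patch:

        #include <algorithm>
        #include <cstdint>

        static uint8_t contextInit(int qp, int initValue) // illustrative name
        {
            qp = std::min(std::max(qp, 0), 51);
            int slope  = (initValue >> 4) * 5 - 45;       // high nibble -> slope
            int offset = ((initValue & 15) << 3) - 16;    // low nibble -> offset
            int state  = std::min(std::max(((slope * qp) >> 4) + offset, 1), 126);
            int mps    = state >= 64;                     // most probable symbol
            return (uint8_t)(((mps ? state - 64 : 63 - state) << 1) | mps);
        }
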
x265_1.9.tar.gz/source/encoder/entropy.h -> x265_2.0.tar.gz/source/encoder/entropy.h Changed
25
 
1
@@ -162,13 +162,13 @@
2
 
3
     void codePartSize(const CUData& cu, uint32_t absPartIdx, uint32_t depth);
4
     void codePredInfo(const CUData& cu, uint32_t absPartIdx);
5
-    inline void codeQtCbfLuma(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth) { codeQtCbfLuma(cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth), tuDepth); }
6
 
7
     void codeQtCbfChroma(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t tuDepth, bool lowestLevel);
8
     void codeCoeff(const CUData& cu, uint32_t absPartIdx, bool& bCodeDQP, const uint32_t depthRange[2]);
9
     void codeCoeffNxN(const CUData& cu, const coeff_t* coef, uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype);
10
 
11
     inline void codeSaoMerge(uint32_t code)                          { encodeBin(code, m_contextState[OFF_SAO_MERGE_FLAG_CTX]); }
12
+    inline void codeSaoType(uint32_t code)                           { encodeBin(code, m_contextState[OFF_SAO_TYPE_IDX_CTX]); }
13
     inline void codeMVPIdx(uint32_t symbol)                          { encodeBin(symbol, m_contextState[OFF_MVP_IDX_CTX]); }
14
     inline void codeMergeFlag(const CUData& cu, uint32_t absPartIdx) { encodeBin(cu.m_mergeFlag[absPartIdx], m_contextState[OFF_MERGE_FLAG_EXT_CTX]); }
15
     inline void codeSkipFlag(const CUData& cu, uint32_t absPartIdx)  { encodeBin(cu.isSkipped(absPartIdx), m_contextState[OFF_SKIP_FLAG_CTX + cu.getCtxSkipFlag(absPartIdx)]); }
16
@@ -182,6 +182,8 @@
17
     inline void codeTransformSkipFlags(uint32_t transformSkip, TextType ttype) { encodeBin(transformSkip, m_contextState[OFF_TRANSFORMSKIP_FLAG_CTX + (ttype ? NUM_TRANSFORMSKIP_FLAG_CTX : 0)]); }
18
     void codeDeltaQP(const CUData& cu, uint32_t absPartIdx);
19
     void codeSaoOffset(const SaoCtuParam& ctuParam, int plane);
20
+    void codeSaoOffsetEO(int *offset, int typeIdx, int plane);
21
+    void codeSaoOffsetBO(int *offset, int bandPos, int plane);
22
 
23
     /* RDO functions */
24
     void estBit(EstBitsSbac& estBitsSbac, uint32_t log2TrSize, bool bIsLuma) const;
25
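
[Editor's note] With the inline codeQtCbfLuma(cu, absPartIdx, tuDepth) wrapper removed from this header, call sites (see the entropy.cpp hunks above) now look the CBF bit up themselves. A minimal stub of the call-site pattern — StubCU is a hypothetical stand-in for CUData, for illustration only:

        #include <cstdint>

        enum TextType { TEXT_LUMA = 0 };
        struct StubCU { uint8_t getCbf(uint32_t, TextType, uint32_t) const { return 1; } };

        static void codeQtCbfLuma(uint32_t cbf, uint32_t tuDepth) { (void)cbf; (void)tuDepth; }

        static void encodeLumaCbf(const StubCU& cu, uint32_t absPartIdx, uint32_t depth)
        {
            // old: codeQtCbfLuma(cu, absPartIdx, depth);   (removed overload)
            codeQtCbfLuma(cu.getCbf(absPartIdx, TEXT_LUMA, depth), depth);
        }

        int main() { StubCU cu; encodeLumaCbf(cu, 0, 0); return 0; }
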
x265_1.9.tar.gz/source/encoder/frameencoder.cpp -> x265_2.0.tar.gz/source/encoder/frameencoder.cpp Changed
191
 
1
@@ -41,6 +41,7 @@
2
 FrameEncoder::FrameEncoder()
3
 {
4
     m_prevOutputTime = x265_mdate();
5
+    m_reconfigure = false;
6
     m_isFrameEncoder = true;
7
     m_threadActive = true;
8
     m_slicetypeWaitTime = 0;
9
@@ -104,6 +105,7 @@
10
     m_param = top->m_param;
11
     m_numRows = numRows;
12
     m_numCols = numCols;
13
+    m_reconfigure = false;
14
     m_filterRowDelay = ((m_param->bEnableSAO && m_param->bSaoNonDeblocked)
15
                         || (!m_param->bEnableLoopFilter && m_param->bEnableSAO)) ?
16
                         2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 1 : 0);
17
@@ -213,7 +215,6 @@
18
 {
19
     m_slicetypeWaitTime = x265_mdate() - m_prevOutputTime;
20
     m_frame = curFrame;
21
-    m_param = curFrame->m_param;
22
     m_sliceType = curFrame->m_lowres.sliceType;
23
     curFrame->m_encData->m_frameEncoderID = m_jpId;
24
     curFrame->m_encData->m_jobProvider = this;
25
@@ -333,18 +334,40 @@
26
     // Weighted Prediction parameters estimation.
27
     bool bUseWeightP = slice->m_sliceType == P_SLICE && slice->m_pps->bUseWeightPred;
28
     bool bUseWeightB = slice->m_sliceType == B_SLICE && slice->m_pps->bUseWeightedBiPred;
29
+
30
+    WeightParam* reuseWP = NULL;
31
+    if (m_param->analysisMode && (bUseWeightP || bUseWeightB))
32
+        reuseWP = ((analysis_inter_data*)m_frame->m_analysisData.interData)->wt;
33
+
34
     if (bUseWeightP || bUseWeightB)
35
     {
36
 #if DETAILED_CU_STATS
37
         m_cuStats.countWeightAnalyze++;
38
         ScopedElapsedTime time(m_cuStats.weightAnalyzeTime);
39
 #endif
40
-        WeightAnalysis wa(*this);
41
-        if (m_pool && wa.tryBondPeers(*this, 1))
42
-            /* use an idle worker for weight analysis */
43
-            wa.waitForExit();
44
+        if (m_param->analysisMode == X265_ANALYSIS_LOAD)
45
+        {
46
+            for (int list = 0; list < slice->isInterB() + 1; list++) 
47
+            {
48
+                for (int plane = 0; plane < (m_param->internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
49
+                {
50
+                    for (int ref = 1; ref < slice->m_numRefIdx[list]; ref++)
51
+                        SET_WEIGHT(slice->m_weightPredTable[list][ref][plane], false, 1 << reuseWP->log2WeightDenom, reuseWP->log2WeightDenom, 0);
52
+                    slice->m_weightPredTable[list][0][plane] = *(reuseWP++);
53
+                }
54
+            }
55
+        }
56
         else
57
-            weightAnalyse(*slice, *m_frame, *m_param);
58
+        {
59
+            WeightAnalysis wa(*this);
60
+            if (m_pool && wa.tryBondPeers(*this, 1))
61
+                /* use an idle worker for weight analysis */
62
+                wa.waitForExit();
63
+            else
64
+                weightAnalyse(*slice, *m_frame, *m_param);
65
+
66
+        }
67
+
68
     }
69
     else
70
         slice->disableWeights();
71
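
[Editor's note] The analysis-reuse path above depends on the save and load sides walking the weight table in the same (list, plane) order, storing only ref 0 and synthesizing unit weights for the remaining refs. A trimmed sketch of that ordering invariant with stand-in types (Wp is a stub, not x265's WeightParam):

        #include <vector>

        struct Wp { int log2Denom = 0; bool bPresent = false; };

        // X265_ANALYSIS_SAVE side: append ref-0 weights in (list, plane) order.
        static void save(const Wp wt[2][3], int lists, int planes, std::vector<Wp>& out)
        {
            for (int l = 0; l < lists; l++)
                for (int p = 0; p < planes; p++)
                    out.push_back(wt[l][p]);
        }

        // X265_ANALYSIS_LOAD side: consume in the identical order;
        // refs > 0 would then get unit weights, as in the hunk above.
        static void load(Wp wt[2][3], int lists, int planes, const std::vector<Wp>& in)
        {
            size_t i = 0;
            for (int l = 0; l < lists; l++)
                for (int p = 0; p < planes; p++)
                    wt[l][p] = in[i++];
        }

        int main()
        {
            Wp table[2][3] = {}; std::vector<Wp> buf;
            save(table, 2, 3, buf); load(table, 2, 3, buf);
            return 0;
        }
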
@@ -361,6 +384,12 @@
72
             slice->m_refReconPicList[l][ref] = slice->m_refFrameList[l][ref]->m_reconPic;
73
             m_mref[l][ref].init(slice->m_refReconPicList[l][ref], w, *m_param);
74
         }
75
+        if (m_param->analysisMode == X265_ANALYSIS_SAVE && (bUseWeightP || bUseWeightB))
76
+        {
77
+            for (int i = 0; i < (m_param->internalCsp != X265_CSP_I400 ? 3 : 1); i++)
78
+                *(reuseWP++) = slice->m_weightPredTable[l][0][i];
79
+        }
80
+
81
     }
82
 
83
     int numTLD;
84
@@ -371,6 +400,7 @@
85
 
86
     /* Get the QP for this frame from rate control. This call may block until
87
      * frames ahead of it in encode order have called rateControlEnd() */
88
+    m_rce.encodeOrder = m_frame->m_encodeOrder;
89
     int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce, m_top);
90
     m_rce.newQp = qp;
91
 
92
@@ -409,7 +439,7 @@
93
 
94
     m_initSliceContext.resetEntropy(*slice);
95
 
96
-    m_frameFilter.start(m_frame, m_initSliceContext, qp);
97
+    m_frameFilter.start(m_frame, m_initSliceContext);
98
 
99
     /* ensure all rows are blocked prior to initializing row CTU counters */
100
     WaveFront::clearEnabledRowMask();
101
@@ -969,44 +999,48 @@
102
         /* Deblock with idle threading */
103
         if (m_param->bEnableLoopFilter | m_param->bEnableSAO)
104
         {
105
-            // TODO: Multiple Threading
106
-            // Delay ONE row to avoid Intra Prediction Conflict
107
-            if (m_pool && (row >= 1))
108
+            // NOTE: in VBV mode, we may reencode anytime, so we can't do Deblock stage-Horizon and SAO
109
+            if (!bIsVbv)
110
             {
111
-                // Waitting last threading finish
112
-                m_frameFilter.m_parallelFilter[row - 1].waitForExit();
113
+                // TODO: Multiple Threading
114
+                // Delay ONE row to avoid Intra Prediction Conflict
115
+                if (m_pool && (row >= 1))
116
+                {
117
+                    // Waiting for the last thread to finish

118
+                    m_frameFilter.m_parallelFilter[row - 1].waitForExit();
119
 
120
-                // Processing new group
121
-                int allowCol = col;
122
+                    // Processing new group
123
+                    int allowCol = col;
124
 
125
-                // avoid race condition on last column
126
-                if (row >= 2)
127
-                {
128
-                    allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get()
129
-                                                              : m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get()), (int)col);
130
+                    // avoid race condition on last column
131
+                    if (row >= 2)
132
+                    {
133
+                        allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get()
134
+                                                                  : m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get()), (int)col);
135
+                    }
136
+                    m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol);
137
+                    m_frameFilter.m_parallelFilter[row - 1].tryBondPeers(*this, 1);
138
                 }
139
-                m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol);
140
-                m_frameFilter.m_parallelFilter[row - 1].tryBondPeers(*this, 1);
141
-            }
142
 
143
-            // Last Row may start early
144
-            if (m_pool && (row == m_numRows - 1))
145
-            {
146
-                // Waiting for the last thread to finish
147
-                m_frameFilter.m_parallelFilter[row].waitForExit();
148
+                // Last Row may start early
149
+                if (m_pool && (row == m_numRows - 1))
150
+                {
151
+                    // Waiting for the last thread to finish
152
+                    m_frameFilter.m_parallelFilter[row].waitForExit();
153
 
154
-                // Deblocking last row
155
-                int allowCol = col;
156
+                    // Deblocking last row
157
+                    int allowCol = col;
158
 
159
-                // avoid race condition on last column
160
-                if (row >= 2)
161
-                {
162
-                    allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get()
163
-                                                              : m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get()), (int)col);
164
+                    // avoid race condition on last column
165
+                    if (row >= 2)
166
+                    {
167
+                        allowCol = X265_MIN(((col == numCols - 1) ? m_frameFilter.m_parallelFilter[row - 1].m_lastDeblocked.get()
168
+                                                                  : m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get()), (int)col);
169
+                    }
170
+                    m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol);
171
+                    m_frameFilter.m_parallelFilter[row].tryBondPeers(*this, 1);
172
                 }
173
-                m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol);
174
-                m_frameFilter.m_parallelFilter[row].tryBondPeers(*this, 1);
175
-            }
176
+            } // end of !bIsVbv
177
         }
178
         // Both Loopfilter and SAO Disabled
179
         else
180
@@ -1179,7 +1213,9 @@
181
     uint32_t rowCount = 0;
182
     if (m_param->rc.rateControlMode == X265_RC_ABR || bIsVbv)
183
     {
184
-        if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom))
185
+        if (!m_rce.encodeOrder)
186
+            rowCount = m_numRows - 1;
187
+        else if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom))
188
             rowCount = X265_MIN((m_numRows + 1) / 2, m_numRows - 1);
189
         else
190
             rowCount = X265_MIN(m_refLagRows, m_numRows - 1);
191
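
[Editor's note] Condensed restatement of the mid-frame rate-control trigger row chosen above (constants taken straight from the hunk; fps arithmetic is integer, as in the original):

        #include <algorithm>
        #include <cstdint>

        static uint32_t rcTriggerRow(uint32_t encodeOrder, uint32_t fpsNum, uint32_t fpsDenom,
                                     uint32_t numRows, uint32_t refLagRows)
        {
            if (!encodeOrder)                               // very first frame: wait for
                return numRows - 1;                         // nearly the whole frame
            if (encodeOrder <= 2 * (fpsNum / fpsDenom))     // first ~2 seconds of output
                return std::min((numRows + 1) / 2, numRows - 1);
            return std::min(refLagRows, numRows - 1);       // steady state: reference lag
        }
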
x265_1.9.tar.gz/source/encoder/frameencoder.h -> x265_2.0.tar.gz/source/encoder/frameencoder.h Changed
10
 
1
@@ -129,7 +129,7 @@
2
     Event                    m_done;
3
     Event                    m_completionEvent;
4
     int                      m_localTldIdx;
5
-
6
+    bool                     m_reconfigure; /* reconfigure in progress */
7
     volatile bool            m_threadActive;
8
     volatile bool            m_bAllRowsStop;
9
     volatile int             m_completionCount;
10
x265_1.9.tar.gz/source/encoder/framefilter.cpp -> x265_2.0.tar.gz/source/encoder/framefilter.cpp Changed
126
 
1
@@ -54,7 +54,7 @@
2
 
3
 void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols)
4
 {
5
-    m_param = top->m_param;
6
+    m_param = frame->m_param;
7
     m_frameEncoder = frame;
8
     m_numRows = numRows;
9
     m_numCols = numCols;
10
@@ -103,7 +103,7 @@
11
 
12
 }
13
 
14
-void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
15
+void FrameFilter::start(Frame *frame, Entropy& initState)
16
 {
17
     m_frame = frame;
18
 
19
@@ -113,7 +113,7 @@
20
         for(int row = 0; row < m_numRows; row++)
21
         {
22
             if (m_param->bEnableSAO)
23
-                m_parallelFilter[row].m_sao.startSlice(frame, initState, qp);
24
+                m_parallelFilter[row].m_sao.startSlice(frame, initState);
25
 
26
             m_parallelFilter[row].m_lastCol.set(0);
27
             m_parallelFilter[row].m_allowedCol.set(0);
28
@@ -198,14 +198,14 @@
29
     }
30
 }
31
 
32
-void FrameFilter::ParallelFilter::processSaoUnitCu(SAOParam *saoParam, int col)
33
+void FrameFilter::ParallelFilter::processSaoCTU(SAOParam *saoParam, int col)
34
 {
35
     // TODO: apply SAO on CU and copy back soon, is it necessary?
36
     if (saoParam->bSaoFlag[0])
37
-        m_sao.processSaoUnitCuLuma(saoParam->ctuParam[0], m_row, col);
38
+        m_sao.generateLumaOffsets(saoParam->ctuParam[0], m_row, col);
39
 
40
     if (saoParam->bSaoFlag[1])
41
-        m_sao.processSaoUnitCuChroma(saoParam->ctuParam, m_row, col);
42
+        m_sao.generateChromaOffsets(saoParam->ctuParam, m_row, col);
43
 
44
     if (m_encData->m_slice->m_pps->bTransquantBypassEnabled)
45
     {
46
@@ -320,11 +320,14 @@
47
     const uint32_t* ctuGeomMap = m_frameFilter->m_frameEncoder->m_ctuGeomMap;
48
     PicYuv* reconPic = m_encData->m_reconPic;
49
     const int colStart = m_lastCol.get();
50
-    // TODO: Waiting previous row finish or simple clip on it?
51
-    const int colEnd = m_allowedCol.get();
52
     const int numCols = m_frameFilter->m_numCols;
53
+    // TODO: Waiting previous row finish or simple clip on it?
54
+    int colEnd = m_allowedCol.get();
55
 
56
     // Avoid threading conflict
57
+    if (m_prevRow && colEnd > m_prevRow->m_lastDeblocked.get())
58
+        colEnd = m_prevRow->m_lastDeblocked.get();
59
+
60
     if (colStart >= colEnd)
61
         return;
62
 
63
@@ -368,7 +371,7 @@
64
                 if (m_row >= 1 && col >= 3)
65
                 {
66
                     // Must delay 1 row to avoid thread data race conflict
67
-                    m_prevRow->processSaoUnitCu(saoParam, col - 3);
68
+                    m_prevRow->processSaoCTU(saoParam, col - 3);
69
                     m_prevRow->processPostCu(col - 3);
70
                 }
71
             }
72
@@ -409,19 +412,19 @@
73
             // Process Previous Rows SAO CU
74
             if (m_row >= 1 && numCols >= 3)
75
             {
76
-                m_prevRow->processSaoUnitCu(saoParam, numCols - 3);
77
+                m_prevRow->processSaoCTU(saoParam, numCols - 3);
78
                 m_prevRow->processPostCu(numCols - 3);
79
             }
80
 
81
             if (m_row >= 1 && numCols >= 2)
82
             {
83
-                m_prevRow->processSaoUnitCu(saoParam, numCols - 2);
84
+                m_prevRow->processSaoCTU(saoParam, numCols - 2);
85
                 m_prevRow->processPostCu(numCols - 2);
86
             }
87
 
88
             if (m_row >= 1 && numCols >= 1)
89
             {
90
-                m_prevRow->processSaoUnitCu(saoParam, numCols - 1);
91
+                m_prevRow->processSaoCTU(saoParam, numCols - 1);
92
                 m_prevRow->processPostCu(numCols - 1);
93
             }
94
 
95
@@ -475,7 +478,7 @@
96
                 for(int col = 0; col < m_numCols; col++)
97
                 {
98
                     // NOTE: must use processSaoUnitCu(), it include TQBypass logic
99
-                    m_parallelFilter[row].processSaoUnitCu(saoParam, col);
100
+                    m_parallelFilter[row].processSaoCTU(saoParam, col);
101
                 }
102
             }
103
 
104
@@ -550,10 +553,10 @@
105
         pixel *fenc = m_frame->m_fencPic->m_picOrg[0];
106
         intptr_t stride1 = reconPic->m_stride;
107
         intptr_t stride2 = m_frame->m_fencPic->m_stride;
108
-        uint32_t bEnd = ((row + 1) == (this->m_numRows - 1));
109
+        uint32_t bEnd = ((row) == (this->m_numRows - 1));
110
         uint32_t bStart = (row == 0);
111
         uint32_t minPixY = row * g_maxCUSize - 4 * !bStart;
112
-        uint32_t maxPixY = (row + 1) * g_maxCUSize - 4 * !bEnd;
113
+        uint32_t maxPixY = X265_MIN((row + 1) * g_maxCUSize - 4 * !bEnd, (uint32_t)m_param->sourceHeight);
114
         uint32_t ssim_cnt;
115
         x265_emms();
116
 
117
@@ -723,7 +726,7 @@
118
         {
119
             std::swap(sum0, sum1);
120
             for (uint32_t x = 0; x < width; x += 2)
121
-                primitives.ssim_4x4x2_core(&pix1[(4 * x + (z * stride1))], stride1, &pix2[(4 * x + (z * stride2))], stride2, &sum0[x]);
122
+                primitives.ssim_4x4x2_core(&pix1[4 * (x + (z * stride1))], stride1, &pix2[4 * (x + (z * stride2))], stride2, &sum0[x]);
123
         }
124
 
125
         for (uint32_t x = 0; x < width - 1; x += 4)
126
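
[Editor's note] This last hunk is the "SSIM calculation bug" mentioned in the changelog. For 4x4 blocks, block-row z and block-column x must address pixel (4*z, 4*x), i.e. offset 4*(x + z*stride); the old expression 4*x + z*stride advanced only one pixel row per block row, sampling overlapping rows. A standalone check of the two index forms:

        #include <cassert>
        #include <cstddef>

        int main()
        {
            const ptrdiff_t stride = 64;
            for (ptrdiff_t z = 1; z < 4; z++)
                for (ptrdiff_t x = 0; x < 8; x += 2)
                {
                    ptrdiff_t fixed = 4 * (x + z * stride); // == 4*x + 4*z*stride
                    ptrdiff_t old   = 4 * x + z * stride;   // wrong: row z, not 4*z
                    assert(fixed == 4 * x + 4 * z * stride);
                    assert(old != fixed);                   // differ for every z > 0
                }
            return 0;
        }
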
x265_1.9.tar.gz/source/encoder/framefilter.h -> x265_2.0.tar.gz/source/encoder/framefilter.h Changed
19
 
1
@@ -90,7 +90,7 @@
2
         void processTasks(int workerThreadId);
3
 
4
         // Apply SAO on a CU in current row
5
-        void processSaoUnitCu(SAOParam *saoParam, int col);
6
+        void processSaoCTU(SAOParam *saoParam, int col);
7
 
8
         // Copy and Save SAO reference pixels for SAO Rdo decide
9
         void copySaoAboveRef(PicYuv* reconPic, uint32_t cuAddr, int col);
10
@@ -127,7 +127,7 @@
11
     void init(Encoder *top, FrameEncoder *frame, int numRows, uint32_t numCols);
12
     void destroy();
13
 
14
-    void start(Frame *pic, Entropy& initState, int qp);
15
+    void start(Frame *pic, Entropy& initState);
16
 
17
     void processRow(int row);
18
     void processPostRow(int row);
19
x265_1.9.tar.gz/source/encoder/level.cpp -> x265_2.0.tar.gz/source/encoder/level.cpp Changed
183
 
1
@@ -131,6 +131,14 @@
2
         vps.ptl.levelIdc = Level::LEVEL8_5;
3
         vps.ptl.tierFlag = Level::MAIN;
4
     }
5
+    else if (param.uhdBluray)
6
+    {
7
+        i = 8;
8
+        vps.ptl.levelIdc = levels[i].levelEnum;
9
+        vps.ptl.tierFlag = Level::HIGH;
10
+        vps.ptl.minCrForLevel = levels[i].minCompressionRatio;
11
+        vps.ptl.maxLumaSrForLevel = levels[i].maxLumaSamplesPerSecond;
12
+    }
13
     else for (i = 0; i < NumLevels; i++)
14
     {
15
         if (lumaSamples > levels[i].maxLumaSamples)
16
@@ -145,8 +153,10 @@
17
             continue;
18
         else if (param.sourceHeight > sqrt(levels[i].maxLumaSamples * 8.0f))
19
             continue;
20
-
21
+        else if (param.levelIdc && param.levelIdc != levels[i].levelIdc)
22
+            continue;
23
         uint32_t maxDpbSize = MaxDpbPicBuf;
24
+
25
         if (lumaSamples <= (levels[i].maxLumaSamples >> 2))
26
             maxDpbSize = X265_MIN(4 * MaxDpbPicBuf, 16);
27
         else if (lumaSamples <= (levels[i].maxLumaSamples >> 1))
28
@@ -188,7 +198,7 @@
29
             CHECK_RANGE((uint32_t)param.rc.vbvBufferSize, levels[i].maxCpbSizeMain, levels[i].maxCpbSizeHigh))
30
         {
31
             /* The bitrate or buffer size are out of range for Main tier, but in
32
-             * range for High tier. If the user requested High tier then give
33
+             * range for High tier. If the user allowed High tier then give
34
              * them High tier at this level.  Otherwise allow the loop to
35
              * progress to the Main tier of the next level */
36
             if (param.bHighTier)
37
@@ -279,7 +289,7 @@
38
 bool enforceLevel(x265_param& param, VPS& vps)
39
 {
40
     vps.numReorderPics = (param.bBPyramid && param.bframes > 1) ? 2 : !!param.bframes;
41
-    vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 2, (uint32_t)param.maxNumReferences) + vps.numReorderPics);
42
+    vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 2, (uint32_t)param.maxNumReferences) + 1);
43
 
44
     /* no level specified by user, just auto-detect from the configuration */
45
     if (param.levelIdc <= 0)
46
@@ -290,17 +300,14 @@
47
         level++;
48
     if (levels[level].levelIdc != param.levelIdc)
49
     {
50
-        x265_log(&param, X265_LOG_WARNING, "specified level %d does not exist\n", param.levelIdc);
51
+        x265_log(&param, X265_LOG_ERROR, "specified level %d does not exist\n", param.levelIdc);
52
         return false;
53
     }
54
 
55
     LevelSpec& l = levels[level];
56
-    bool highTier = !!param.bHighTier;
57
-    if (highTier && l.maxBitrateHigh == MAX_UINT)
58
-    {
59
-        highTier = false;
60
-        x265_log(&param, X265_LOG_WARNING, "Level %s has no High tier, using Main tier\n", l.name);
61
-    }
62
+
63
+    //highTier is allowed for this level and has not been explicitly disabled. This does not mean it is the final chosen tier
64
+    bool allowHighTier = l.maxBitrateHigh < MAX_UINT && param.bHighTier;
65
 
66
     uint32_t lumaSamples = param.sourceWidth * param.sourceHeight;
67
     uint32_t samplesPerSec = (uint32_t)(lumaSamples * ((double)param.fpsNum / param.fpsDenom));
68
@@ -313,47 +320,51 @@
69
         ok = false;
70
     if (!ok)
71
     {
72
-        x265_log(&param, X265_LOG_WARNING, "picture dimensions are out of range for specified level\n");
73
+        x265_log(&param, X265_LOG_ERROR, "picture dimensions are out of range for specified level\n");
74
         return false;
75
     }
76
     else if (samplesPerSec > l.maxLumaSamplesPerSecond)
77
     {
78
-        x265_log(&param, X265_LOG_WARNING, "frame rate is out of range for specified level\n");
79
+        x265_log(&param, X265_LOG_ERROR, "frame rate is out of range for specified level\n");
80
         return false;
81
     }
82
 
83
-    if ((uint32_t)param.rc.vbvMaxBitrate > (highTier ? l.maxBitrateHigh : l.maxBitrateMain))
84
+    /* Adjustments of Bitrate, VBV buffer size, refs will be triggered only if specified params do not fit 
85
+     * within the max limits of that level (high tier if allowed, main otherwise)
86
+     */
87
+
88
+    if ((uint32_t)param.rc.vbvMaxBitrate > (allowHighTier ? l.maxBitrateHigh : l.maxBitrateMain))
89
     {
90
-        param.rc.vbvMaxBitrate = highTier ? l.maxBitrateHigh : l.maxBitrateMain;
91
-        x265_log(&param, X265_LOG_INFO, "lowering VBV max bitrate to %dKbps\n", param.rc.vbvMaxBitrate);
92
+        param.rc.vbvMaxBitrate = allowHighTier ? l.maxBitrateHigh : l.maxBitrateMain;
93
+        x265_log(&param, X265_LOG_WARNING, "lowering VBV max bitrate to %dKbps\n", param.rc.vbvMaxBitrate);
94
     }
95
-    if ((uint32_t)param.rc.vbvBufferSize > (highTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain))
96
+    if ((uint32_t)param.rc.vbvBufferSize > (allowHighTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain))
97
     {
98
-        param.rc.vbvBufferSize = highTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain;
99
-        x265_log(&param, X265_LOG_INFO, "lowering VBV buffer size to %dKb\n", param.rc.vbvBufferSize);
100
+        param.rc.vbvBufferSize = allowHighTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain;
101
+        x265_log(&param, X265_LOG_WARNING, "lowering VBV buffer size to %dKb\n", param.rc.vbvBufferSize);
102
     }
103
 
104
     switch (param.rc.rateControlMode)
105
     {
106
     case X265_RC_ABR:
107
-        if ((uint32_t)param.rc.bitrate > (highTier ? l.maxBitrateHigh : l.maxBitrateMain))
108
+        if ((uint32_t)param.rc.bitrate > (allowHighTier ? l.maxBitrateHigh : l.maxBitrateMain))
109
         {
110
-            param.rc.bitrate = l.maxBitrateHigh;
111
-            x265_log(&param, X265_LOG_INFO, "lowering target bitrate to High tier limit of %dKbps\n", param.rc.bitrate);
112
+            param.rc.bitrate =  allowHighTier ? l.maxBitrateHigh : l.maxBitrateMain;
113
+            x265_log(&param, X265_LOG_WARNING, "lowering target bitrate to High tier limit of %dKbps\n", param.rc.bitrate);
114
         }
115
         break;
116
 
117
     case X265_RC_CQP:
118
-        x265_log(&param, X265_LOG_WARNING, "Constant QP is inconsistent with specifying a decoder level, no bitrate guarantee is possible.\n");
119
+        x265_log(&param, X265_LOG_ERROR, "Constant QP is inconsistent with specifying a decoder level, no bitrate guarantee is possible.\n");
120
         return false;
121
 
122
     case X265_RC_CRF:
123
         if (!param.rc.vbvBufferSize || !param.rc.vbvMaxBitrate)
124
         {
125
             if (!param.rc.vbvMaxBitrate)
126
-                param.rc.vbvMaxBitrate = highTier ? l.maxBitrateHigh : l.maxBitrateMain;
127
+                param.rc.vbvMaxBitrate = allowHighTier ? l.maxBitrateHigh : l.maxBitrateMain;
128
             if (!param.rc.vbvBufferSize)
129
-                param.rc.vbvBufferSize = highTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain;
130
+                param.rc.vbvBufferSize = allowHighTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain;
131
             x265_log(&param, X265_LOG_WARNING, "Specifying a decoder level with constant rate factor rate-control requires\n");
132
             x265_log(&param, X265_LOG_WARNING, "enabling VBV with vbv-bufsize=%dkb vbv-maxrate=%dkbps. VBV outputs are non-deterministic!\n",
133
                      param.rc.vbvBufferSize, param.rc.vbvMaxBitrate);
134
@@ -368,27 +379,30 @@
135
     /* The value of sps_max_dec_pic_buffering_minus1[ HighestTid ] + 1 shall be less than or equal to MaxDpbSize */
136
     const uint32_t MaxDpbPicBuf = 6;
137
     uint32_t maxDpbSize = MaxDpbPicBuf;
138
-    if (lumaSamples <= (l.maxLumaSamples >> 2))
139
-        maxDpbSize = X265_MIN(4 * MaxDpbPicBuf, 16);
140
-    else if (lumaSamples <= (l.maxLumaSamples >> 1))
141
-        maxDpbSize = X265_MIN(2 * MaxDpbPicBuf, 16);
142
-    else if (lumaSamples <= ((3 * l.maxLumaSamples) >> 2))
143
-        maxDpbSize = X265_MIN((4 * MaxDpbPicBuf) / 3, 16);
144
+    if (!param.uhdBluray) /* Do not change MaxDpbPicBuf for UHD-Bluray */
145
+    {
146
+        if (lumaSamples <= (l.maxLumaSamples >> 2))
147
+            maxDpbSize = X265_MIN(4 * MaxDpbPicBuf, 16);
148
+        else if (lumaSamples <= (l.maxLumaSamples >> 1))
149
+            maxDpbSize = X265_MIN(2 * MaxDpbPicBuf, 16);
150
+        else if (lumaSamples <= ((3 * l.maxLumaSamples) >> 2))
151
+            maxDpbSize = X265_MIN((4 * MaxDpbPicBuf) / 3, 16);
152
+    }
153
 
154
     int savedRefCount = param.maxNumReferences;
155
     while (vps.maxDecPicBuffering > maxDpbSize && param.maxNumReferences > 1)
156
     {
157
         param.maxNumReferences--;
158
-        vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 1, (uint32_t)param.maxNumReferences) + vps.numReorderPics);
159
+        vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 1, (uint32_t)param.maxNumReferences) + 1);
160
     }
161
     if (param.maxNumReferences != savedRefCount)
162
-        x265_log(&param, X265_LOG_INFO, "Lowering max references to %d to meet level requirement\n", param.maxNumReferences);
163
+        x265_log(&param, X265_LOG_WARNING, "Lowering max references to %d to meet level requirement\n", param.maxNumReferences);
164
 
165
     /* For level 5 and higher levels, the value of CtbSizeY shall be equal to 32 or 64 */
166
     if (param.levelIdc >= 50 && param.maxCUSize < 32)
167
     {
168
         param.maxCUSize = 32;
169
-        x265_log(&param, X265_LOG_INFO, "Levels 5.0 and above require a maximum CTU size of at least 32, using --ctu 32\n");
170
+        x265_log(&param, X265_LOG_WARNING, "Levels 5.0 and above require a maximum CTU size of at least 32, using --ctu 32\n");
171
     }
172
 
173
     /* The value of NumPocTotalCurr shall be less than or equal to 8 */
174
@@ -396,7 +410,7 @@
175
     if (numPocTotalCurr > 8)
176
     {
177
         param.maxNumReferences = 8 - !!param.bframes;
178
-        x265_log(&param, X265_LOG_INFO, "Lowering max references to %d to meet numPocTotalCurr requirement\n", param.maxNumReferences);
179
+        x265_log(&param, X265_LOG_WARNING, "Lowering max references to %d to meet numPocTotalCurr requirement\n", param.maxNumReferences);
180
     }
181
 
182
     return true;
183
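
[Editor's note] Two threads run through this file: a user-specified levelIdc now pins an exact level during auto-detection (the new `param.levelIdc != levels[i].levelIdc` continue), and uhd-bd forces a fixed level entry (levels[8]) at High tier while freezing the DPB derivation. A standalone restatement of the MaxDpbSize rule the patch keeps — and now bypasses for UHD-BD — with constants as they appear in the diff:

        #include <algorithm>
        #include <cstdint>

        static uint32_t maxDpbSize(uint64_t lumaSamples, uint64_t levelMaxLumaSamples,
                                   bool uhdBluray)
        {
            const uint32_t MaxDpbPicBuf = 6;
            if (uhdBluray)                                  // UHD-BD: keep the default
                return MaxDpbPicBuf;
            if (lumaSamples <= (levelMaxLumaSamples >> 2))
                return std::min(4 * MaxDpbPicBuf, 16u);
            if (lumaSamples <= (levelMaxLumaSamples >> 1))
                return std::min(2 * MaxDpbPicBuf, 16u);
            if (lumaSamples <= (3 * levelMaxLumaSamples) >> 2)
                return std::min(4 * MaxDpbPicBuf / 3, 16u);
            return MaxDpbPicBuf;
        }
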
x265_1.9.tar.gz/source/encoder/motion.cpp -> x265_2.0.tar.gz/source/encoder/motion.cpp Changed
186
 
1
@@ -111,10 +111,8 @@
2
     chromaSatd = NULL;
3
 }
4
 
5
-void MotionEstimate::init(int method, int refine, int csp)
6
+void MotionEstimate::init(int csp)
7
 {
8
-    searchMethod = method;
9
-    subpelRefine = refine;
10
     fencPUYuv.create(FENC_STRIDE, csp);
11
 }
12
 
13
@@ -162,7 +160,7 @@
14
 }
15
 
16
 /* Called by lookahead, luma only, no use of PicYuv */
17
-void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight)
18
+void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int refine)
19
 {
20
     partEnum = partitionFromSizes(pwidth, pheight);
21
     X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
22
@@ -175,13 +173,17 @@
23
     blockOffset = offset;
24
     absPartIdx = ctuAddr = -1;
25
 
26
+    /* Search params */
27
+    searchMethod = method;
28
+    subpelRefine = refine;
29
+
30
     /* copy PU block into cache */
31
     primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
32
     X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
33
 }
34
 
35
 /* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
36
-void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight)
37
+void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int method, const int refine, bool bChroma)
38
 {
39
     partEnum = partitionFromSizes(pwidth, pheight);
40
     X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
41
@@ -192,9 +194,13 @@
42
 
43
     chromaSatd = primitives.chroma[fencPUYuv.m_csp].pu[partEnum].satd;
44
 
45
+    /* Set search characteristics */
46
+    searchMethod = method;
47
+    subpelRefine = refine;
48
+
49
     /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size
50
      * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */
51
-    bChromaSATD = subpelRefine > 2 && chromaSatd && (srcFencYuv.m_csp != X265_CSP_I400);
52
+    bChromaSATD = subpelRefine > 2 && chromaSatd && (srcFencYuv.m_csp != X265_CSP_I400 && bChroma);
53
     X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n");
54
 
55
     ctuAddr = _ctuAddr;
56
@@ -1174,15 +1180,17 @@
57
 int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp)
58
 {
59
     intptr_t refStride = ref->lumaStride;
60
-    pixel *fref = ref->fpelPlane[0] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * refStride;
61
+    const pixel* fref = ref->fpelPlane[0] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * refStride;
62
     int xFrac = qmv.x & 0x3;
63
     int yFrac = qmv.y & 0x3;
64
     int cost;
65
-    intptr_t lclStride = fencPUYuv.m_size;
66
-    X265_CHECK(lclStride == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by sad_x3 and sad_x4\n");
67
+    const intptr_t fencStride = FENC_STRIDE;
68
+    X265_CHECK(fencPUYuv.m_size == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by sad_x3 and sad_x4\n");
69
 
70
+    ALIGN_VAR_32(pixel, subpelbuf[MAX_CU_SIZE * MAX_CU_SIZE]);
71
+    
72
     if (!(yFrac | xFrac))
73
-        cost = cmp(fencPUYuv.m_buf[0], lclStride, fref, refStride);
74
+        cost = cmp(fencPUYuv.m_buf[0], fencStride, fref, refStride);
75
     else
76
     {
77
         /* we are taking a short-cut here if the reference is weighted. To be
78
@@ -1190,15 +1198,13 @@
79
          * the final 16bit values prior to rounding and down shifting. Instead we
80
          * are simply interpolating the weighted full-pel pixels. Not 100%
81
          * accurate but good enough for fast qpel ME */
82
-        ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
83
         if (!yFrac)
84
-            primitives.pu[partEnum].luma_hpp(fref, refStride, subpelbuf, lclStride, xFrac);
85
+            primitives.pu[partEnum].luma_hpp(fref, refStride, subpelbuf, blockwidth, xFrac);
86
         else if (!xFrac)
87
-            primitives.pu[partEnum].luma_vpp(fref, refStride, subpelbuf, lclStride, yFrac);
88
+            primitives.pu[partEnum].luma_vpp(fref, refStride, subpelbuf, blockwidth, yFrac);
89
         else
90
-            primitives.pu[partEnum].luma_hvpp(fref, refStride, subpelbuf, lclStride, xFrac, yFrac);
91
-
92
-        cost = cmp(fencPUYuv.m_buf[0], lclStride, subpelbuf, lclStride);
93
+            primitives.pu[partEnum].luma_hvpp(fref, refStride, subpelbuf, blockwidth, xFrac, yFrac);
94
+        cost = cmp(fencPUYuv.m_buf[0], fencStride, subpelbuf, blockwidth);
95
     }
96
 
97
     if (bChromaSATD)
98
@@ -1206,12 +1212,12 @@
99
         int csp    = fencPUYuv.m_csp;
100
         int hshift = fencPUYuv.m_hChromaShift;
101
         int vshift = fencPUYuv.m_vChromaShift;
102
-        int shiftHor = (2 + hshift);
103
-        int shiftVer = (2 + vshift);
104
-        lclStride = fencPUYuv.m_csize;
105
+        int mvx = qmv.x << (1 - hshift);
106
+        int mvy = qmv.y << (1 - vshift);
107
+        intptr_t fencStrideC = fencPUYuv.m_csize;
108
 
109
         intptr_t refStrideC = ref->reconPic->m_strideC;
110
-        intptr_t refOffset = (qmv.x >> shiftHor) + (qmv.y >> shiftVer) * refStrideC;
111
+        intptr_t refOffset = (mvx >> 3) + (mvy >> 3) * refStrideC;
112
 
113
         const pixel* refCb = ref->getCbAddr(ctuAddr, absPartIdx) + refOffset;
114
         const pixel* refCr = ref->getCrAddr(ctuAddr, absPartIdx) + refOffset;
115
@@ -1219,48 +1225,46 @@
116
         X265_CHECK((hshift == 0) || (hshift == 1), "hshift must be 0 or 1\n");
117
         X265_CHECK((vshift == 0) || (vshift == 1), "vshift must be 0 or 1\n");
118
 
119
-        xFrac = qmv.x & (hshift ? 7 : 3);
120
-        yFrac = qmv.y & (vshift ? 7 : 3);
121
+        xFrac = mvx & 7;
122
+        yFrac = mvy & 7;
123
 
124
         if (!(yFrac | xFrac))
125
         {
126
-            cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, refCb, refStrideC);
127
-            cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, refCr, refStrideC);
128
+            cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, refCb, refStrideC);
129
+            cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, refCr, refStrideC);
130
         }
131
         else
132
         {
133
-            ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
134
+            int blockwidthC = blockwidth >> hshift;
135
+
136
             if (!yFrac)
137
             {
138
-                primitives.chroma[csp].pu[partEnum].filter_hpp(refCb, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
139
-                cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
140
+                primitives.chroma[csp].pu[partEnum].filter_hpp(refCb, refStrideC, subpelbuf, blockwidthC, xFrac);
141
+                cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, subpelbuf, blockwidthC);
142
 
143
-                primitives.chroma[csp].pu[partEnum].filter_hpp(refCr, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
144
-                cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
145
+                primitives.chroma[csp].pu[partEnum].filter_hpp(refCr, refStrideC, subpelbuf, blockwidthC, xFrac);
146
+                cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, subpelbuf, blockwidthC);
147
             }
148
             else if (!xFrac)
149
             {
150
-                primitives.chroma[csp].pu[partEnum].filter_vpp(refCb, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
151
-                cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
152
+                primitives.chroma[csp].pu[partEnum].filter_vpp(refCb, refStrideC, subpelbuf, blockwidthC, yFrac);
153
+                cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, subpelbuf, blockwidthC);
154
 
155
-                primitives.chroma[csp].pu[partEnum].filter_vpp(refCr, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
156
-                cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
157
+                primitives.chroma[csp].pu[partEnum].filter_vpp(refCr, refStrideC, subpelbuf, blockwidthC, yFrac);
158
+                cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, subpelbuf, blockwidthC);
159
             }
160
             else
161
             {
162
-                ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_CHROMA)]);
163
-
164
-                int extStride = blockwidth >> hshift;
165
-                int filterSize = NTAPS_CHROMA;
166
-                int halfFilterSize = (filterSize >> 1);
167
+                ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);
168
+                const int halfFilterSize = (NTAPS_CHROMA >> 1);
169
 
170
-                primitives.chroma[csp].pu[partEnum].filter_hps(refCb, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
171
-                primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
172
-                cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
173
+                primitives.chroma[csp].pu[partEnum].filter_hps(refCb, refStrideC, immed, blockwidthC, xFrac, 1);
174
+                primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * blockwidthC, blockwidthC, subpelbuf, blockwidthC, yFrac);
175
+                cost += chromaSatd(fencPUYuv.m_buf[1], fencStrideC, subpelbuf, blockwidthC);
176
 
177
-                primitives.chroma[csp].pu[partEnum].filter_hps(refCr, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
178
-                primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
179
-                cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
180
+                primitives.chroma[csp].pu[partEnum].filter_hps(refCr, refStrideC, immed, blockwidthC, xFrac, 1);
181
+                primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * blockwidthC, blockwidthC, subpelbuf, blockwidthC, yFrac);
182
+                cost += chromaSatd(fencPUYuv.m_buf[2], fencStrideC, subpelbuf, blockwidthC);
183
             }
184
         }
185
     }
186
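
[Editor's note] The subpelCompare() rework above normalizes the luma quarter-pel MV into eighth-pel chroma units once, so the integer offset and the 3-bit fraction fall out of a single shift/mask instead of csp-dependent expressions. A worked example with arbitrary values (this sketch assumes the usual arithmetic right shift, which gives floor division for negative components):

        #include <cstdio>

        int main()
        {
            const int qmvX = 13, qmvY = -5;    // luma MV in quarter-pel units
            const int hshift = 1, vshift = 1;  // 4:2:0 chroma subsampling
            const long strideC = 128;          // arbitrary chroma stride

            // multiply instead of << so the sketch stays well-defined for negative MVs
            int mvx = qmvX * (1 << (1 - hshift));   // now eighth-pel chroma units
            int mvy = qmvY * (1 << (1 - vshift));
            long refOffset = (mvx >> 3) + (mvy >> 3) * strideC; // integer chroma offset
            int xFrac = mvx & 7, yFrac = mvy & 7;               // 3-bit subpel phase

            printf("offset=%ld frac=(%d,%d)\n", refOffset, xFrac, yFrac); // -127, (5,3)
            return 0;
        }
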
x265_1.9.tar.gz/source/encoder/motion.h -> x265_2.0.tar.gz/source/encoder/motion.h Changed
17
 
1
@@ -70,12 +70,12 @@
2
 
3
     static void initScales();
4
     static int hpelIterationCount(int subme);
5
-    void init(int method, int refine, int csp);
6
+    void init(int csp);
7
 
8
     /* Methods called at slice setup */
9
 
10
-    void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight);
11
-    void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight);
12
+    void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int subpelRefine);
13
+    void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int searchMethod, const int subpelRefine, bool bChroma);
14
 
15
     /* buf*() and motionEstimate() methods all use cached fenc pixels and thus
16
      * require setSourcePU() to be called prior. */
17
x265_1.9.tar.gz/source/encoder/ratecontrol.cpp -> x265_2.0.tar.gz/source/encoder/ratecontrol.cpp Changed
890
 
1
@@ -53,7 +53,7 @@
2
 {\
3
     bErr = 0;\
4
     p = strstr(opts, opt "=");\
5
-    char* q = strstr(opts, "no-"opt);\
6
+    char* q = strstr(opts, "no-" opt);\
7
     if (p && sscanf(p, opt "=%d" , &i) && param_val != i)\
8
         bErr = 1;\
9
     else if (!param_val && !q && !p)\
10
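
[Editor's note] The one-character change above is a C++11 compatibility fix, not a cleanup: without the space, "no-"opt is lexed as a user-defined string literal with suffix `opt` rather than a literal followed by the macro argument, so adjacent-literal concatenation never happens and compilation fails. Minimal illustration:

        #include <cstdio>

        #define OPT "bframes"

        int main()
        {
            // "no-" OPT relies on adjacent-literal concatenation after macro
            // expansion. In C++11, "no-"OPT (no space) would instead be parsed
            // as a user-defined literal with ud-suffix OPT and fail to compile.
            const char* needle = "no-" OPT;
            printf("%s\n", needle);   // prints: no-bframes
            return 0;
        }
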
@@ -91,24 +91,6 @@
11
     return z + lut[x];
12
 }
13
 
14
-inline void reduceFraction(int* n, int* d)
15
-{
16
-    int a = *n;
17
-    int b = *d;
18
-    int c;
19
-    if (!a || !b)
20
-        return;
21
-    c = a % b;
22
-    while (c)
23
-    {
24
-        a = b;
25
-        b = c;
26
-        c = a % b;
27
-    }
28
-    *n /= b;
29
-    *d /= b;
30
-}
31
-
32
 inline char *strcatFilename(const char *input, const char *suffix)
33
 {
34
     char *output = X265_MALLOC(char, strlen(input) + strlen(suffix) + 1);
35
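
[Editor's note] reduceFraction() — plain iterative Euclid — disappears from this file; it appears to have been hoisted into a shared header rather than dropped outright, though this diff view does not show the destination. For the record, a behavior-equivalent sketch of the removed helper:

        #include <cassert>

        static void reduceFraction(int* n, int* d)
        {
            int a = *n, b = *d;
            if (!a || !b)
                return;
            while (int c = a % b)   // gcd loop: b ends as gcd(n, d)
            {
                a = b;
                b = c;
            }
            *n /= b;
            *d /= b;
        }

        int main()
        {
            int n = 30000, d = 1001;         // e.g. 29.97 fps as 30000/1001
            reduceFraction(&n, &d);
            assert(n == 30000 && d == 1001); // already coprime
            n = 50; d = 100;
            reduceFraction(&n, &d);
            assert(n == 1 && d == 2);
            return 0;
        }
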
@@ -190,6 +172,8 @@
36
     m_numEntries = 0;
37
     m_isSceneTransition = false;
38
     m_lastPredictorReset = 0;
39
+    m_avgPFrameQp = 0;
40
+    m_isFirstMiniGop = false;
41
     if (m_param->rc.rateControlMode == X265_RC_CRF)
42
     {
43
         m_param->rc.qp = (int)m_param->rc.rfConstant;
44
@@ -212,7 +196,7 @@
45
             m_rateFactorMaxDecrement = m_param->rc.rfConstant - m_param->rc.rfConstantMin;
46
     }
47
     m_isAbr = m_param->rc.rateControlMode != X265_RC_CQP && !m_param->rc.bStatRead;
48
-    m_2pass = (m_param->rc.rateControlMode == X265_RC_ABR || m_param->rc.vbvMaxBitrate > 0) && m_param->rc.bStatRead;
49
+    m_2pass = m_param->rc.rateControlMode != X265_RC_CQP && m_param->rc.bStatRead;
50
     m_bitrate = m_param->rc.bitrate * 1000;
51
     m_frameDuration = (double)m_param->fpsDenom / m_param->fpsNum;
52
     m_qp = m_param->rc.qp;
53
@@ -225,8 +209,10 @@
54
     m_statFileOut = NULL;
55
     m_cutreeStatFileOut = m_cutreeStatFileIn = NULL;
56
     m_rce2Pass = NULL;
57
+    m_encOrder = NULL;
58
     m_lastBsliceSatdCost = 0;
59
     m_movingAvgSum = 0.0;
60
+    m_isNextGop = false;
61
 
62
     // vbv initialization
63
     m_param->rc.vbvBufferSize = x265_clip3(0, 2000000, m_param->rc.vbvBufferSize);
64
@@ -288,9 +274,13 @@
65
     m_ipOffset = 6.0 * X265_LOG2(m_param->rc.ipFactor);
66
     m_pbOffset = 6.0 * X265_LOG2(m_param->rc.pbFactor);
67
 
68
+    for (int i = 0; i < QP_MAX_MAX; i++)
69
+        m_qpToEncodedBits[i] = 0;
70
+
71
     /* Adjust the first frame in order to stabilize the quality level compared to the rest */
72
 #define ABR_INIT_QP_MIN (24)
73
-#define ABR_INIT_QP_MAX (40)
74
+#define ABR_INIT_QP_MAX (37)
75
+#define ABR_INIT_QP_GRAIN_MAX (33)
76
 #define ABR_SCENECUT_INIT_QP_MIN (12)
77
 #define CRF_INIT_QP (int)m_param->rc.rfConstant
78
     for (int i = 0; i < 3; i++)
79
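
[Editor's note] ABR_INIT_QP_MAX drops from 40 to 37 and grainy content gets a tighter 33 ceiling (ABR_INIT_QP_GRAIN_MAX), matching the rc-grain goal of suppressing QP swings from the very first GOP. The defines alone don't show the clamp site; a hedged sketch of how such a first-frame seed bound is typically applied — illustrative only, not lifted from the patch:

        #include <algorithm>

        static int initAbrQp(int proposedQp, bool grainMode)
        {
            const int qpMin = 24;                  // ABR_INIT_QP_MIN
            const int qpMax = grainMode ? 33 : 37; // ABR_INIT_QP_GRAIN_MAX / _MAX
            return std::min(std::max(proposedQp, qpMin), qpMax);
        }
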
@@ -361,6 +351,7 @@
80
         m_amortizeFraction = 0.85;
81
         m_amortizeFrames = m_param->totalFrames / 2;
82
     }
83
+
84
     for (int i = 0; i < s_slidingWindowFrames; i++)
85
     {
86
         m_satdCostWindow[i] = 0;
87
@@ -370,15 +361,22 @@
88
     m_isPatternPresent = false;
89
     m_numBframesInPattern = 0;
90
 
91
-    /* 720p videos seem to be a good cutoff for cplxrSum */
92
-    double tuneCplxFactor = (m_param->rc.cuTree && m_ncu > 3600) ? 2.5 : 1;
93
+    m_isGrainEnabled = false;
94
+    if(m_param->rc.bEnableGrain) // tune for grainy content OR equal p-b frame sizes
95
+    m_isGrainEnabled = true;
96
+    for (int i = 0; i < 3; i++)
97
+    m_lastQScaleFor[i] = x265_qp2qScale(m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN);
98
+    m_avgPFrameQp = 0 ;
99
 
100
+    /* 720p videos seem to be a good cutoff for cplxrSum */
101
+    double tuneCplxFactor = (m_ncu > 3600 && m_param->rc.cuTree) ? 2.5 : m_isGrainEnabled ? 1.9 : 1;
102
     /* estimated ratio that produces a reasonable QP for the first I-frame */
103
     m_cplxrSum = .01 * pow(7.0e5, m_qCompress) * pow(m_ncu, 0.5) * tuneCplxFactor;
104
     m_wantedBitsWindow = m_bitrate * m_frameDuration;
105
     m_accumPNorm = .01;
106
     m_accumPQp = (m_param->rc.rateControlMode == X265_RC_CRF ? CRF_INIT_QP : ABR_INIT_QP_MIN) * m_accumPNorm;
107
 
108
+
109
     /* Frame Predictors used in vbv */
110
     initFramePredictors();
111
     if (!m_statFileOut && (m_param->rc.bStatWrite || m_param->rc.bStatRead))
112
@@ -401,11 +399,11 @@
113
                 char *tmpFile = strcatFilename(fileName, ".cutree");
114
                 if (!tmpFile)
115
                     return false;
116
-                m_cutreeStatFileIn = fopen(tmpFile, "rb");
117
+                m_cutreeStatFileIn = x265_fopen(tmpFile, "rb");
118
                 X265_FREE(tmpFile);
119
                 if (!m_cutreeStatFileIn)
120
                 {
121
-                    x265_log(m_param, X265_LOG_ERROR, "can't open stats file %s\n", tmpFile);
122
+                    x265_log_file(m_param, X265_LOG_ERROR, "can't open stats file %s.cutree\n", fileName);
123
                     return false;
124
                 }
125
             }
126
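
[Editor's note] Besides switching to x265_fopen()/x265_log_file() — wrappers that, judging by their names, handle non-ASCII paths and filename logging portably (an assumption; their implementations aren't in this view) — the error path fixes a latent use-after-free: the old code freed tmpFile and then passed it to x265_log(). The same pattern recurs in the two stat-file hunks below. Reduced reproduction of the corrected ordering, with plain stdio standing in:

        #include <cstdio>
        #include <cstdlib>
        #include <cstring>

        int main()
        {
            const char* fileName = "x265_2pass.log";  // hypothetical stats name
            char* tmpFile = (char*)malloc(strlen(fileName) + sizeof(".cutree"));
            if (!tmpFile)
                return 1;
            sprintf(tmpFile, "%s.cutree", fileName);
            FILE* f = fopen(tmpFile, "rb");
            free(tmpFile);                            // freed here, so...
            if (!f)
            {
                // ...report using fileName plus the known suffix, not tmpFile
                fprintf(stderr, "can't open stats file %s.cutree\n", fileName);
                return 1;
            }
            fclose(f);
            return 0;
        }
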
@@ -417,7 +415,7 @@
127
                 return false;
128
             }
129
             {
130
-                int i, j;
131
+                int i, j, m;
132
                 uint32_t k , l;
133
                 bool bErr = false;
134
                 char *opts = statsBuf;
135
@@ -439,6 +437,11 @@
136
                     x265_log(m_param, X265_LOG_ERROR, "fps specified in stats file not valid\n");
137
                     return false;
138
                 }
139
+                if (((p = strstr(opts, " vbv-maxrate=")) == 0 || sscanf(p, " vbv-maxrate=%d", &m) != 1) && m_param->rc.rateControlMode == X265_RC_CRF)
140
+                {
141
+                    x265_log(m_param, X265_LOG_ERROR, "Constant rate-factor is incompatible with 2pass without vbv-maxrate in the previous pass\n");
142
+                    return false;
143
+                }
144
                 if (k != m_param->fpsNum || l != m_param->fpsDenom)
145
                 {
146
                     x265_log(m_param, X265_LOG_ERROR, "fps mismatch with 1st pass (%u/%u vs %u/%u)\n",
147
@@ -564,8 +567,10 @@
148
                 p = next;
149
             }
150
             X265_FREE(statsBuf);
151
-            if (m_param->rc.rateControlMode == X265_RC_ABR || m_param->rc.vbvMaxBitrate > 0)
152
+            if (m_param->rc.rateControlMode != X265_RC_CQP)
153
             {
154
+                m_start = 0;
155
+                m_isQpModified = true;
156
                 if (!initPass2())
157
                     return false;
158
             } /* else we're using constant quant, so no need to run the bitrate allocation */
159
@@ -579,11 +584,11 @@
160
             statFileTmpname = strcatFilename(fileName, ".temp");
161
             if (!statFileTmpname)
162
                 return false;
163
-            m_statFileOut = fopen(statFileTmpname, "wb");
164
+            m_statFileOut = x265_fopen(statFileTmpname, "wb");
165
             X265_FREE(statFileTmpname);
166
             if (!m_statFileOut)
167
             {
168
-                x265_log(m_param, X265_LOG_ERROR, "can't open stats file %s\n", statFileTmpname);
169
+                x265_log_file(m_param, X265_LOG_ERROR, "can't open stats file %s.temp\n", fileName);
170
                 return false;
171
             }
172
             p = x265_param2string(m_param);
173
@@ -595,11 +600,11 @@
174
                 statFileTmpname = strcatFilename(fileName, ".cutree.temp");
175
                 if (!statFileTmpname)
176
                     return false;
177
-                m_cutreeStatFileOut = fopen(statFileTmpname, "wb");
178
+                m_cutreeStatFileOut = x265_fopen(statFileTmpname, "wb");
179
                 X265_FREE(statFileTmpname);
180
                 if (!m_cutreeStatFileOut)
181
                 {
182
-                    x265_log(m_param, X265_LOG_ERROR, "can't open mbtree stats file %s\n", statFileTmpname);
183
+                    x265_log_file(m_param, X265_LOG_ERROR, "can't open mbtree stats file %s.cutree.temp\n", fileName);
184
                     return false;
185
                 }
186
             }
187
@@ -647,7 +652,7 @@
188
 
189
     #undef MAX_DURATION
190
 }
191
-bool RateControl::analyseABR2Pass(int startIndex, int endIndex, uint64_t allAvailableBits)
192
+bool RateControl::analyseABR2Pass(uint64_t allAvailableBits)
193
 {
194
     double rateFactor, stepMult;
195
     double qBlur = m_param->rc.qblur;
196
@@ -657,21 +662,21 @@
197
     double *qScale, *blurredQscale;
198
     double baseCplx = m_ncu * (m_param->bframes ? 120 : 80);
199
     double clippedDuration = CLIP_DURATION(m_frameDuration) / BASE_FRAME_DURATION;
200
-    int framesCount = endIndex - startIndex + 1;
201
     /* Blur complexities, to reduce local fluctuation of QP.
202
      * We don't blur the QPs directly, because then one very simple frame
203
      * could drag down the QP of a nearby complex frame and give it more
204
      * bits than intended. */
205
-    for (int i = startIndex; i <= endIndex; i++)
206
+    for (int i = 0; i < m_numEntries; i++)
207
     {
208
         double weightSum = 0;
209
         double cplxSum = 0;
210
         double weight = 1.0;
211
         double gaussianWeight;
212
         /* weighted average of cplx of future frames */
213
-        for (int j = 1; j < cplxBlur * 2 && j <= endIndex - i; j++)
214
+        for (int j = 1; j < cplxBlur * 2 && j < m_numEntries - i; j++)
215
         {
216
-            RateControlEntry *rcj = &m_rce2Pass[i + j];
217
+            int index = m_encOrder[i + j];
218
+            RateControlEntry *rcj = &m_rce2Pass[index];
219
             weight *= 1 - pow(rcj->iCuCount / m_ncu, 2);
220
             if (weight < 0.0001)
221
                 break;
222
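
[Editor's note] Throughout analyseABR2Pass(), every m_rce2Pass[i] becomes m_rce2Pass[m_encOrder[i]]: the loops still walk 0..m_numEntries-1, but entries are now fetched through an order-mapping table instead of being assumed contiguous in loop order. Toy illustration of the indirection (which ordering maps to which is the table's contract; the mapping below is made up):

        #include <cstdio>

        int main()
        {
            // hypothetical 5-frame mini-GOP, I B B P B
            const int encOrder[5] = { 0, 3, 1, 2, 4 };  // loop index -> entry index
            double blurred[5] = { 0 };

            for (int i = 0; i < 5; i++)          // iterate in one ordering...
                blurred[encOrder[i]] = 100 + i;  // ...store via the mapped index

            for (int i = 0; i < 5; i++)
                printf("entry %d -> %.0f\n", i, blurred[i]);
            return 0;
        }
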
@@ -683,7 +688,8 @@
223
         weight = 1.0;
224
         for (int j = 0; j <= cplxBlur * 2 && j <= i; j++)
225
         {
226
-            RateControlEntry *rcj = &m_rce2Pass[i - j];
227
+            int index = m_encOrder[i - j];
228
+            RateControlEntry *rcj = &m_rce2Pass[index];
229
             gaussianWeight = weight * exp(-j * j / 200.0);
230
             weightSum += gaussianWeight;
231
             cplxSum += gaussianWeight * (qScale2bits(rcj, 1) - rcj->miscBits) / clippedDuration;
232
@@ -691,12 +697,12 @@
233
             if (weight < .0001)
234
                 break;
235
         }
236
-        m_rce2Pass[i].blurredComplexity = cplxSum / weightSum;
237
+        m_rce2Pass[m_encOrder[i]].blurredComplexity = cplxSum / weightSum;
238
     }
239
-    CHECKED_MALLOC(qScale, double, framesCount);
240
+    CHECKED_MALLOC(qScale, double, m_numEntries);
241
     if (filterSize > 1)
242
     {
243
-        CHECKED_MALLOC(blurredQscale, double, framesCount);
244
+        CHECKED_MALLOC(blurredQscale, double, m_numEntries);
245
     }
246
     else
247
         blurredQscale = qScale;
248
@@ -708,9 +714,9 @@
249
      * approximation of scaling the 1st pass by the ratio of bitrates.
250
      * The search range is probably overkill, but speed doesn't matter here. */
251
     expectedBits = 1;
252
-    for (int i = startIndex; i <= endIndex; i++)
253
+    for (int i = 0; i < m_numEntries; i++)
254
     {
255
-        RateControlEntry* rce = &m_rce2Pass[i];
256
+        RateControlEntry* rce = &m_rce2Pass[m_encOrder[i]];
257
         double q = getQScale(rce, 1.0);
258
         expectedBits += qScale2bits(rce, q);
259
         m_lastQScaleFor[rce->sliceType] = q;
260
@@ -733,7 +739,7 @@
261
         /* find qscale */
262
         for (int i = 0; i < m_numEntries; i++)
263
         {
264
-            RateControlEntry *rce = &m_rce2Pass[i];
265
+            RateControlEntry *rce = &m_rce2Pass[m_encOrder[i]];
266
             qScale[i] = getQScale(rce, rateFactor);
267
             m_lastQScaleFor[rce->sliceType] = qScale[i];
268
         }
269
@@ -741,7 +747,7 @@
270
         /* fixed I/B qscale relative to P */
271
         for (int i = m_numEntries - 1; i >= 0; i--)
272
         {
273
-            qScale[i] = getDiffLimitedQScale(&m_rce2Pass[i], qScale[i]);
274
+            qScale[i] = getDiffLimitedQScale(&m_rce2Pass[m_encOrder[i]], qScale[i]);
275
             X265_CHECK(qScale[i] >= 0, "qScale became negative\n");
276
         }
277
 
278
@@ -760,7 +766,7 @@
279
                     double coeff = qBlur == 0 ? 1.0 : exp(-d * d / (qBlur * qBlur));
280
                     if (idx < 0 || idx >= m_numEntries)
281
                         continue;
282
-                    if (m_rce2Pass[i].sliceType != m_rce2Pass[idx].sliceType)
283
+                    if (m_rce2Pass[m_encOrder[i]].sliceType != m_rce2Pass[m_encOrder[idx]].sliceType)
284
                         continue;
285
                     q += qScale[idx] * coeff;
286
                     sum += coeff;
287
@@ -772,7 +778,7 @@
288
         /* find expected bits */
289
         for (int i = 0; i < m_numEntries; i++)
290
         {
291
-            RateControlEntry *rce = &m_rce2Pass[i];
292
+            RateControlEntry *rce = &m_rce2Pass[m_encOrder[i]];
293
             rce->newQScale = clipQscale(NULL, rce, blurredQscale[i]); // check if needed
294
             X265_CHECK(rce->newQScale >= 0, "new Qscale is negative\n");
295
             expectedBits += qScale2bits(rce, rce->newQScale);
296
@@ -786,9 +792,9 @@
297
     if (filterSize > 1)
298
         X265_FREE(blurredQscale);
299
     if (m_isVbv)
300
-    if (!vbv2Pass(allAvailableBits, endIndex, startIndex))
301
+    if (!vbv2Pass(allAvailableBits, m_numEntries - 1, 0))
302
             return false;
303
-    expectedBits = countExpectedBits(startIndex, endIndex);
304
+    expectedBits = countExpectedBits(0, m_numEntries - 1);
305
     if (fabs(expectedBits / allAvailableBits - 1.0) > 0.01)
306
     {
307
         double avgq = 0;
308
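For orientation around these hunks: countExpectedBits() totals the bits predicted at the current qscales, and the caller hunts for a rateFactor whose predicted total matches allAvailableBits. The x264-lineage search is successive step halving; a schematic only (expectedBitsAt stands in for the qscale-to-bits model and is an assumption, not a real x265 signature):

    #include <functional>

    // Probe ever-finer steps; keep the largest rate factor that stays in budget.
    double solveRateFactor(double allAvailableBits,
                           const std::function<double(double)>& expectedBitsAt)
    {
        double rateFactor = 0;
        for (double step = 1e4; step > 1e-7; step *= 0.5)
        {
            rateFactor += step;
            if (expectedBitsAt(rateFactor) > allAvailableBits)
                rateFactor -= step;   // overshot the budget: back off
        }
        return rateFactor;
    }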
@@ -826,13 +832,12 @@
309
     uint64_t allConstBits = 0, allCodedBits = 0;
310
     uint64_t allAvailableBits = uint64_t(m_param->rc.bitrate * 1000. * m_numEntries * m_frameDuration);
311
     int startIndex, framesCount, endIndex;
312
-    int fps = (int)(m_fps + 0.5);
313
+    int fps = X265_MIN(m_param->keyframeMax, (int)(m_fps + 0.5));
314
     startIndex = endIndex = framesCount = 0;
315
-    bool isQpModified = true;
316
     int diffQp = 0;
317
     double targetBits = 0;
318
     double expectedBits = 0;
319
-    for (startIndex = 0, endIndex = 0; endIndex < m_numEntries; endIndex++)
320
+    for (startIndex = m_start, endIndex = m_start; endIndex < m_numEntries; endIndex++)
321
     {
322
         allConstBits += m_rce2Pass[endIndex].miscBits;
323
         allCodedBits += m_rce2Pass[endIndex].coeffBits + m_rce2Pass[endIndex].mvBits;
324
@@ -846,11 +851,16 @@
325
             {
326
                 if (diffQp >= 1)
327
                 {
328
-                    if (!isQpModified && endIndex > fps)
329
+                    if (!m_isQpModified && endIndex > fps)
330
                     {
331
                         double factor = 2;
332
                         double step = 0;
333
-                        for (int start = endIndex; start <= endIndex + fps - 1 && start < m_numEntries; start++)
334
+                        if (endIndex + fps >= m_numEntries)
335
+                        {
336
+                            m_start = endIndex - (endIndex % fps);
337
+                            return true;
338
+                        }
339
+                        for (int start = endIndex + 1; start <= endIndex + fps && start < m_numEntries; start++)
340
                         {
341
                             RateControlEntry *rce = &m_rce2Pass[start];
342
                             targetBits += qScale2bits(rce, x265_qp2qScale(rce->qpNoVbv));
343
@@ -858,12 +868,13 @@
344
                         }
345
                         if (expectedBits < 0.95 * targetBits)
346
                         {
347
-                            isQpModified = true;
348
+                            m_isQpModified = true;
349
+                            m_isGopReEncoded = true;
350
                             while (endIndex + fps < m_numEntries)
351
                             {
352
                                 step = pow(2, factor / 6.0);
353
                                 expectedBits = 0;
354
-                                for (int start = endIndex; start <= endIndex + fps - 1; start++)
355
+                                for (int start = endIndex + 1; start <= endIndex + fps; start++)
356
                                 {
357
                                     RateControlEntry *rce = &m_rce2Pass[start];
358
                                     rce->newQScale = rce->qScale / step;
359
@@ -878,13 +889,13 @@
360
                             }
361
 
362
                             if (m_isVbv && endIndex + fps < m_numEntries)
363
-                                if (!vbv2Pass((uint64_t)targetBits, endIndex + fps - 1, endIndex))
364
+                                if (!vbv2Pass((uint64_t)targetBits, endIndex + fps, endIndex + 1))
365
                                     return false;
366
 
367
                             targetBits = 0;
368
                             expectedBits = 0;
369
 
370
-                            for (int start = endIndex - fps; start <= endIndex - 1; start++)
371
+                            for (int start = endIndex - fps + 1; start <= endIndex; start++)
372
                             {
373
                                 RateControlEntry *rce = &m_rce2Pass[start];
374
                                 targetBits += qScale2bits(rce, x265_qp2qScale(rce->qpNoVbv));
375
@@ -893,7 +904,7 @@
376
                             {
377
                                 step = pow(2, factor / 6.0);
378
                                 expectedBits = 0;
379
-                                for (int start = endIndex - fps; start <= endIndex - 1; start++)
380
+                                for (int start = endIndex - fps + 1; start <= endIndex; start++)
381
                                 {
382
                                     RateControlEntry *rce = &m_rce2Pass[start];
383
                                     rce->newQScale = rce->qScale * step;
384
@@ -907,10 +918,13 @@
385
                                      break;
386
                             }
387
                             if (m_isVbv)
388
-                                if (!vbv2Pass((uint64_t)targetBits, endIndex - 1, endIndex - fps))
389
+                                if (!vbv2Pass((uint64_t)targetBits, endIndex, endIndex - fps + 1))
390
                                     return false;
391
                             diffQp = 0;
392
+                            m_reencode = endIndex - fps + 1;
393
+                            endIndex = endIndex + fps;
394
                             startIndex = endIndex + 1;
395
+                            m_start = startIndex;
396
                             targetBits = expectedBits = 0;
397
                         }
398
                         else
399
@@ -918,7 +932,7 @@
400
                     }
401
                 }
402
                 else
403
-                    isQpModified = false;
404
+                    m_isQpModified = false;
405
             }
406
         }
407
     }
408
@@ -931,9 +945,12 @@
409
                      (int)(allConstBits * m_fps / framesCount * 1000.));
410
             return false;
411
         }
412
-        if (!analyseABR2Pass(0, m_numEntries - 1, allAvailableBits))
413
+        if (!analyseABR2Pass(allAvailableBits))
414
             return false;
415
     }
416
+
417
+    m_start = X265_MAX(m_start, endIndex - fps);
418
+
419
     return true;
420
 }
421
 
422
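The step = pow(2, factor / 6.0) scaling in the re-encode loops above leans on the standard QP/qscale mapping, under which six QP steps double qscale; dividing a frame's qscale by pow(2, f / 6.0) therefore lowers its effective QP by f. The mapping, as used throughout the x264/x265 lineage:

    #include <cmath>

    // qscale doubles every 6 QP (this matches x265_qp2qScale / x265_qScale2qp).
    double qp2qScale(double qp) { return 0.85 * pow(2.0, (qp - 12.0) / 6.0); }
    double qScale2qp(double qs) { return 12.0 + 6.0 * log2(qs / 0.85); }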
@@ -1049,12 +1066,12 @@
423
     }
424
     m_pred[0].coeff = m_pred[3].coeff = 0.75;
425
     m_pred[0].coeffMin = m_pred[3].coeffMin = 0.75 / 4;
426
-    if (m_param->rc.qCompress >= 0.8) // when tuned for grain 
427
+    if (m_isGrainEnabled) // when tuned for grain 
428
     {
429
         m_pred[1].coeffMin = 0.75 / 4;
430
         m_pred[1].coeff = 0.75;
431
-        m_pred[0].coeff = m_pred[3].coeff = 0.5;
432
-        m_pred[0].coeffMin = m_pred[3].coeffMin = 0.5 / 4;
433
+        m_pred[0].coeff = m_pred[3].coeff = 0.75;
434
+        m_pred[0].coeffMin = m_pred[3].coeffMin = 0.75 / 4;
435
     }
436
 }
437
 
438
@@ -1088,11 +1105,15 @@
439
         copyRceData(rce, &m_rce2Pass[index]);
440
     }
441
     rce->isActive = true;
442
+    rce->scenecut = false;
443
     bool isRefFrameScenecut = m_sliceType!= I_SLICE && m_curSlice->m_refFrameList[0][0]->m_lowres.bScenecut;
444
+    m_isFirstMiniGop = m_sliceType == I_SLICE ? true : m_isFirstMiniGop;
445
     if (curFrame->m_lowres.bScenecut)
446
     {
447
         m_isSceneTransition = true;
448
+        rce->scenecut = true;
449
         m_lastPredictorReset = rce->encodeOrder;
450
+
451
         initFramePredictors();
452
     }
453
     else if (m_sliceType != B_SLICE && !isRefFrameScenecut)
454
@@ -1197,6 +1218,7 @@
455
         double q = x265_qScale2qp(rateEstimateQscale(curFrame, rce));
456
         q = x265_clip3((double)QP_MIN, (double)QP_MAX_MAX, q);
457
         m_qp = int(q + 0.5);
458
+        q = m_isGrainEnabled ? m_qp : q;
459
         rce->qpaRc = curEncData.m_avgQpRc = curEncData.m_avgQpAq = q;
460
         /* copy value of lastRceq into thread local rce struct *to be used in RateControlEnd() */
461
         rce->qRceq = m_lastRceq;
462
@@ -1322,14 +1344,6 @@
463
         m_accumPNorm = mask * (1 + m_accumPNorm);
464
     }
465
 
466
-    x265_zone* zone = getZone();
467
-    if (zone)
468
-    {
469
-        if (zone->bForceQp)
470
-            q = x265_qp2qScale(zone->qp);
471
-        else
472
-            q /= zone->bitrateFactor;
473
-    }
474
     return q;
475
 }
476
 double RateControl::countExpectedBits(int startPos, int endPos)
477
@@ -1418,12 +1432,9 @@
478
             }
479
             while(type != sliceTypeActual);
480
         }
481
+        primitives.fix8Unpack(frame->m_lowres.qpCuTreeOffset, m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos], m_ncu);
482
         for (int i = 0; i < m_ncu; i++)
483
-        {
484
-            int16_t qpFix8 = m_cuTreeStats.qpBuffer[m_cuTreeStats.qpBufPos][i];
485
-            frame->m_lowres.qpCuTreeOffset[i] = (double)(qpFix8) / 256.0;
486
             frame->m_lowres.invQscaleFactor[i] = x265_exp2fix8(frame->m_lowres.qpCuTreeOffset[i]);
487
-        }
488
         m_cuTreeStats.qpBufPos--;
489
     }
490
     return true;
491
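Here and in rateControlEnd() further down, the scalar *256 / /256 loops around the cutree stats file are replaced with fix8Pack/fix8Unpack primitives, which can be vectorized. A scalar reference for what the primitives compute (the exact primitive signatures in x265 may differ slightly):

    #include <cstddef>
    #include <cstdint>

    // Cutree QP offsets travel through the stats file as Q8 fixed point (int16).
    void fix8Pack(uint16_t* dst, const double* src, size_t n)
    {
        for (size_t i = 0; i < n; i++)
            dst[i] = (uint16_t)(int16_t)(src[i] * 256.0);
    }

    void fix8Unpack(double* dst, const uint16_t* src, size_t n)
    {
        for (size_t i = 0; i < n; i++)
            dst[i] = (double)(int16_t)src[i] / 256.0;
    }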
@@ -1436,8 +1447,6 @@
492
 double RateControl::tuneAbrQScaleFromFeedback(double qScale)
493
 {
494
     double abrBuffer = 2 * m_rateTolerance * m_bitrate;
495
-    if (m_currentSatd)
496
-    {
497
         /* use framesDone instead of POC as poc count is not serial with bframes enabled */
498
         double overflow = 1.0;
499
         double timeDone = (double)(m_framesDone - m_param->frameNumThreads + 1) * m_frameDuration;
500
@@ -1450,16 +1459,31 @@
501
         }
502
 
503
         if (wantedBits > 0 && encodedBits > 0 && (!m_partialResidualFrames || 
504
-            m_param->rc.bStrictCbr))
505
+            m_param->rc.bStrictCbr || m_isGrainEnabled))
506
         {
507
             abrBuffer *= X265_MAX(1, sqrt(timeDone));
508
             overflow = x265_clip3(.5, 2.0, 1.0 + (encodedBits - wantedBits) / abrBuffer);
509
             qScale *= overflow;
510
         }
511
-    }
512
     return qScale;
513
 }
514
 
515
+double RateControl::tuneQScaleForGrain(double rcOverflow)
516
+{
517
+    double qpstep = rcOverflow > 1.1 ? rcOverflow : m_lstep;
518
+    double qScaleAvg = x265_qp2qScale(m_avgPFrameQp);
519
+    double  q = m_lastQScaleFor[P_SLICE];
520
+    int curQp = int (x265_qScale2qp(m_lastQScaleFor[P_SLICE]) + 0.5);
521
+    double curBitrate = m_qpToEncodedBits[curQp] * int(m_fps + 0.5);
522
+    int newQp = rcOverflow > 1.1 ? curQp + 2 : rcOverflow > 1 ? curQp + 1 : curQp - 1 ;
523
+    double projectedBitrate =  int(m_fps + 0.5) * m_qpToEncodedBits[newQp];
524
+    if (curBitrate > 0 && projectedBitrate > 0)
525
+        q =  abs(projectedBitrate - m_bitrate) < abs (curBitrate - m_bitrate) ? x265_qp2qScale(newQp) : m_lastQScaleFor[P_SLICE];
526
+    else
527
+        q = rcOverflow > 1 ? qScaleAvg * qpstep : rcOverflow < 1 ?  qScaleAvg / qpstep : m_lastQScaleFor[P_SLICE];
528
+    return q;
529
+}
530
+
531
 double RateControl::rateEstimateQscale(Frame* curFrame, RateControlEntry *rce)
532
 {
533
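tuneQScaleForGrain(), added above, is the heart of rc-grain: instead of scaling qscale continuously (which shows up as grain pumping), it moves in whole QP steps and prefers the neighbouring QP whose historically observed bitrate (m_qpToEncodedBits, maintained in rateControlEnd()) lands closer to the target. A reduced sketch of that decision with simplified names:

    #include <cmath>

    // Pick the whole-QP move whose observed bits best match the target.
    int chooseGrainQp(int curQp, double overflow, double targetBits,
                      const double bitsAtQp[])
    {
        int cand = overflow > 1.1 ? curQp + 2
                 : overflow > 1.0 ? curQp + 1
                 : curQp - 1;
        if (bitsAtQp[curQp] <= 0 || bitsAtQp[cand] <= 0)
            return curQp;   // no history yet: the real code steps by m_lstep instead
        return fabs(bitsAtQp[cand] - targetBits) < fabs(bitsAtQp[curQp] - targetBits)
               ? cand : curQp;
    }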
     double q;
534
@@ -1525,6 +1549,7 @@
535
                 q0 = q1;
536
             }
537
         }
538
+
539
         if (prevRefSlice->m_sliceType == B_SLICE && IS_REFERENCED(m_curSlice->m_refFrameList[0][0]))
540
             q0 -= m_pbOffset / 2;
541
         if (nextRefSlice->m_sliceType == B_SLICE && IS_REFERENCED(m_curSlice->m_refFrameList[1][0]))
542
@@ -1535,7 +1560,9 @@
543
             q = q1;
544
         else if (i1)
545
             q = q0;
546
-        else
547
+        else if(m_isGrainEnabled && !m_2pass)
548
+                q = q1;
549
+            else
550
             q = (q0 * dt1 + q1 * dt0) / (dt0 + dt1);
551
 
552
         if (IS_REFERENCED(curFrame))
553
@@ -1543,7 +1570,7 @@
554
         else
555
             q += m_pbOffset;
556
 
557
-        /* Set a min qp at scenechanges and transitions */
558
+                /* Set a min qp at scenechanges and transitions */
559
         if (m_isSceneTransition)
560
         {
561
             q = X265_MAX(ABR_SCENECUT_INIT_QP_MIN, q);
562
@@ -1553,11 +1580,28 @@
563
         double qScale = x265_qp2qScale(q);
564
         rce->qpNoVbv = q;
565
         double lmin = 0, lmax = 0;
566
+        if (m_isGrainEnabled && m_isFirstMiniGop)
567
+        {
568
+            lmin = m_lastQScaleFor[P_SLICE] / m_lstep;
569
+            lmax = m_lastQScaleFor[P_SLICE] * m_lstep;
570
+            double tunedQscale = tuneAbrQScaleFromFeedback(qScale);
571
+            double overflow = tunedQscale / qScale;
572
+            if (!m_isAbrReset)
573
+                qScale = x265_clip3(lmin, lmax, qScale);
574
+            m_avgPFrameQp = m_avgPFrameQp == 0 ? rce->qpNoVbv : m_avgPFrameQp;
575
+            if (overflow != 1)
576
+            {
577
+                qScale = tuneQScaleForGrain(overflow);
578
+                q = x265_qScale2qp(qScale);
579
+            }
580
+            rce->qpNoVbv = q;
581
+        }
582
         if (m_isVbv)
583
         {
584
             lmin = m_lastQScaleFor[P_SLICE] / m_lstep;
585
             lmax = m_lastQScaleFor[P_SLICE] * m_lstep;
586
-            if (m_isCbr)
587
+
588
+            if (m_isCbr && !m_isGrainEnabled)
589
             {
590
                 qScale = tuneAbrQScaleFromFeedback(qScale);
591
                 if (!m_isAbrReset)
592
@@ -1581,7 +1625,17 @@
593
             rce->frameSizePlanned = X265_MIN(rce->frameSizePlanned, rce->frameSizeMaximum);
594
             rce->frameSizeEstimated = rce->frameSizePlanned;
595
         }
596
+
597
         rce->newQScale = qScale;
598
+        if(rce->bLastMiniGopBFrame)
599
+        {
600
+            if (m_isFirstMiniGop && m_isGrainEnabled)
601
+            {
602
+                m_avgPFrameQp = (m_avgPFrameQp + rce->qpNoVbv) / 2;
603
+                m_lastQScaleFor[P_SLICE] = x265_qp2qScale(m_avgPFrameQp);
604
+            }
605
+            m_isFirstMiniGop = false;
606
+        }
607
         return qScale;
608
     }
609
     else
610
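In the grain-mode block above, the first mini-GOP after each I frame clamps qscale to within one m_lstep of the last P-frame qscale before the overflow retune kicks in; m_lstep is derived from the qpStep parameter as 2^(qpStep/6), i.e. qpStep whole QPs. The clamp itself:

    // Keep qScale within +/- qpStep QP of the last P frame's qScale.
    double clampToLastP(double qScale, double lastPQScale, double lstep)
    {
        double lmin = lastPQScale / lstep;
        double lmax = lastPQScale * lstep;
        return qScale < lmin ? lmin : qScale > lmax ? lmax : qScale;
    }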
@@ -1608,6 +1662,14 @@
611
             }
612
             diff = m_predictedBits - (int64_t)rce->expectedBits;
613
             q = rce->newQScale;
614
+            x265_zone* zone = getZone();
615
+            if (zone)
616
+            {
617
+                if (zone->bForceQp)
618
+                    q = x265_qp2qScale(zone->qp);
619
+                else
620
+                    q /= zone->bitrateFactor;
621
+            }
622
             q /= x265_clip3(0.5, 2.0, (double)(abrBuffer - diff) / abrBuffer);
623
             if (m_expectedBitsSum > 0)
624
             {
625
@@ -1617,6 +1679,9 @@
626
                 double w = x265_clip3(0.0, 1.0, curTime * 100);
627
                 q *= pow((double)m_totalBits / m_expectedBitsSum, w);
628
             }
629
+            if (m_framesDone == 0 && m_param->rc.rateControlMode == X265_RC_ABR && m_isGrainEnabled)
630
+                q = X265_MIN(x265_qp2qScale(ABR_INIT_QP_GRAIN_MAX), q);
631
+
632
             rce->qpNoVbv = x265_qScale2qp(q);
633
             if (m_isVbv)
634
             {
635
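The zone handling deleted from getQScale() (see the @@ -2260 hunk further down) reappears inline in each rate-control path here and below, so a zone override is applied after the complexity-based estimate rather than inside it. The override logic is unchanged; a standalone restatement with x265_qp2qScale inlined:

    #include <cmath>

    // Zone override: a forced QP wins outright, otherwise bits are rescaled.
    double applyZone(double q, bool bForceQp, int zoneQp, double bitrateFactor)
    {
        if (bForceQp)
            return 0.85 * pow(2.0, (zoneQp - 12.0) / 6.0); // qp -> qscale
        return q / bitrateFactor;
    }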
@@ -1669,21 +1734,50 @@
636
             if (m_param->rc.rateControlMode == X265_RC_CRF)
637
             {
638
                 q = getQScale(rce, m_rateFactorConstant);
639
+                x265_zone* zone = getZone();
640
+                if (zone)
641
+                {
642
+                    if (zone->bForceQp)
643
+                        q = x265_qp2qScale(zone->qp);
644
+                    else
645
+                        q /= zone->bitrateFactor;
646
+                }
647
             }
648
             else
649
             {
650
                 if (!m_param->rc.bStatRead)
651
                     checkAndResetABR(rce, false);
652
                 double initialQScale = getQScale(rce, m_wantedBitsWindow / m_cplxrSum);
653
-                q = tuneAbrQScaleFromFeedback(initialQScale);
654
-                overflow = q / initialQScale;
655
+                x265_zone* zone = getZone();
656
+                if (zone)
657
+                {
658
+                    if (zone->bForceQp)
659
+                        initialQScale = x265_qp2qScale(zone->qp);
660
+                    else
661
+                        initialQScale /= zone->bitrateFactor;
662
+                }
663
+                double tunedQScale = tuneAbrQScaleFromFeedback(initialQScale);
664
+                overflow = tunedQScale / initialQScale;
665
+                q = !m_partialResidualFrames? tunedQScale : initialQScale;
666
+                bool isEncodeEnd = (m_param->totalFrames && 
667
+                    m_framesDone > 0.75 * m_param->totalFrames) ? 1 : 0;
668
+                bool isEncodeBeg = m_framesDone < (int)(m_fps + 0.5);
669
+                if (m_isGrainEnabled)
670
+                {
671
+                    if(m_sliceType!= I_SLICE && m_framesDone && !isEncodeEnd &&
672
+                        ((overflow < 1.05 && overflow > 0.95) || isEncodeBeg))
673
+                    {
674
+                        q = tuneQScaleForGrain(overflow);
675
+                    }
676
+                }
677
             }
678
-            if (m_sliceType == I_SLICE && m_param->keyframeMax > 1
679
-                && m_lastNonBPictType != I_SLICE && !m_isAbrReset)
680
+            if ((m_sliceType == I_SLICE && m_param->keyframeMax > 1
681
+                && m_lastNonBPictType != I_SLICE && !m_isAbrReset) || (m_isNextGop && !m_framesDone))
682
             {
683
                 if (!m_param->rc.bStrictCbr)
684
                     q = x265_qp2qScale(m_accumPQp / m_accumPNorm);
685
                 q /= fabs(m_param->rc.ipFactor);
686
+                m_avgPFrameQp = 0;
687
             }
688
             else if (m_framesDone > 0)
689
             {
690
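The CRF/ABR hunk above gates the grain retune: it runs only for non-I slices, only while the ABR feedback overflow stays within +/-5% (or unconditionally during roughly the first second), and never in the last quarter of a known-length encode. The predicate, pulled out with the thresholds copied from the hunk:

    // Grain retune gate as encoded in the hunk above.
    bool shouldRetuneForGrain(bool isISlice, int framesDone, int totalFrames,
                              double fps, double overflow)
    {
        bool nearEnd   = totalFrames && framesDone > 0.75 * totalFrames;
        bool nearStart = framesDone < (int)(fps + 0.5);
        return !isISlice && framesDone && !nearEnd
            && ((overflow < 1.05 && overflow > 0.95) || nearStart);
    }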
@@ -1691,7 +1785,7 @@
691
                 {
692
                     lqmin = m_lastQScaleFor[m_sliceType] / m_lstep;
693
                     lqmax = m_lastQScaleFor[m_sliceType] * m_lstep;
694
-                    if (!m_partialResidualFrames)
695
+                    if (!m_partialResidualFrames || m_isGrainEnabled)
696
                     {
697
                         if (overflow > 1.1 && m_framesDone > 3)
698
                             lqmax *= m_lstep;
699
@@ -1708,8 +1802,9 @@
700
             else if (m_framesDone == 0 && !m_isVbv && m_param->rc.rateControlMode == X265_RC_ABR)
701
             {
702
                 /* for ABR alone, clip the first I frame qp */
703
-                lqmax = x265_qp2qScale(ABR_INIT_QP_MAX) * m_lstep;
704
-                q = X265_MIN(lqmax, q);
705
+                    lqmax =  (m_lstep * m_isGrainEnabled) ? x265_qp2qScale(ABR_INIT_QP_GRAIN_MAX) :
706
+                        x265_qp2qScale(ABR_INIT_QP_MAX);
707
+                    q = X265_MIN(lqmax, q);
708
             }
709
             q = x265_clip3(MIN_QPSCALE, MAX_MAX_QPSCALE, q);
710
             /* Set a min qp at scenechanges and transitions */
711
@@ -1720,6 +1815,11 @@
712
                m_lastQScaleFor[P_SLICE] = X265_MAX(minScenecutQscale, m_lastQScaleFor[P_SLICE]);
713
             }
714
             rce->qpNoVbv = x265_qScale2qp(q);
715
+            if(m_sliceType == P_SLICE)
716
+            {
717
+                m_avgPFrameQp = m_avgPFrameQp == 0 ? rce->qpNoVbv : m_avgPFrameQp;
718
+                m_avgPFrameQp = (m_avgPFrameQp + rce->qpNoVbv) / 2;
719
+            }
720
             q = clipQscale(curFrame, rce, q);
721
             /*  clip qp to permissible range after vbv-lookahead estimation to avoid possible
722
              * mispredictions by initial frame size predictors, after each scenecut */
723
@@ -1806,7 +1906,7 @@
724
     double abrBuffer = 2 * m_rateTolerance * m_bitrate;
725
 
726
     // Check if current Slice is a scene cut that follows low detailed/blank frames
727
-    if (rce->lastSatd > 4 * rce->movingAvgSum)
728
+    if (rce->lastSatd > 4 * rce->movingAvgSum || rce->scenecut)
729
     {
730
         if (!m_isAbrReset && rce->movingAvgSum > 0
731
             && (m_isPatternPresent || !m_param->bframes))
732
@@ -1842,18 +1942,17 @@
733
     const HRDInfo* hrd = &vui->hrdParameters;
734
     int num = 90000;
735
     int denom = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT);
736
-    reduceFraction(&num, &denom);
737
     int64_t cpbState = (int64_t)m_bufferFillFinal;
738
     int64_t cpbSize = (int64_t)hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
739
 
740
     if (cpbState < 0 || cpbState > cpbSize)
741
     {
742
         x265_log(m_param, X265_LOG_WARNING, "CPB %s: %.0lf bits in a %.0lf-bit buffer\n",
743
-                 cpbState < 0 ? "underflow" : "overflow", (float)cpbState/denom, (float)cpbSize/denom);
744
+                 cpbState < 0 ? "underflow" : "overflow", (float)cpbState, (float)cpbSize);
745
     }
746
 
747
-    seiBP->m_initialCpbRemovalDelay = (uint32_t)(num * cpbState + denom) / denom;
748
-    seiBP->m_initialCpbRemovalDelayOffset = (uint32_t)((num * cpbSize + denom) / denom - seiBP->m_initialCpbRemovalDelay);
749
+    seiBP->m_initialCpbRemovalDelay = (uint32_t)(num * cpbState / denom);
750
+    seiBP->m_initialCpbRemovalDelayOffset = (uint32_t)(num * cpbSize / denom - seiBP->m_initialCpbRemovalDelay);
751
 }
752
 
753
 void RateControl::updateVbvPlan(Encoder* enc)
754
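In hrdFullness() above, reduceFraction() and the +denom rounding bias are gone: the buffering-period SEI fields become a plain truncating conversion of CPB fullness to 90 kHz ticks, with the bitrate reconstructed from the HRD syntax as bitRateValue << (bitRateScale + BR_SHIFT). In compact form:

    #include <cstdint>

    // initial_cpb_removal_delay (90 kHz ticks) = 90000 * fullnessBits / rateBitsPerSec
    uint32_t cpbDelayTicks(int64_t cpbFillBits, int64_t bitRateBits)
    {
        return (uint32_t)(90000 * cpbFillBits / bitRateBits);
    }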
@@ -2084,8 +2183,6 @@
755
 
756
 int RateControl::rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv)
757
 {
758
-    if (m_param->rc.bStatRead && m_param->rc.rateControlMode == X265_RC_CRF)
759
-        return 0;
760
     FrameData& curEncData = *curFrame->m_encData;
761
     double qScaleVbv = x265_qp2qScale(qpVbv);
762
     uint64_t rowSatdCost = curEncData.m_rowStat[row].diagSatd;
763
@@ -2260,15 +2357,7 @@
764
         m_lastRceq = q;
765
         q /= rateFactor;
766
     }
767
-    
768
-    x265_zone* zone = getZone();
769
-    if (zone)
770
-    {
771
-        if (zone->bForceQp)
772
-            q = x265_qp2qScale(zone->qp);
773
-        else
774
-            q /= zone->bitrateFactor;
775
-    }
776
+
777
     return q;
778
 }
779
 
780
@@ -2336,21 +2425,25 @@
781
     {
782
         if (m_isVbv && !(m_2pass && m_param->rc.rateControlMode == X265_RC_CRF))
783
         {
784
+            double avgQpRc = 0;
785
             /* determine avg QP decided by VBV rate control */
786
             for (uint32_t i = 0; i < slice->m_sps->numCuInHeight; i++)
787
-                curEncData.m_avgQpRc += curEncData.m_rowStat[i].sumQpRc;
788
+                avgQpRc += curEncData.m_rowStat[i].sumQpRc;
789
 
790
-            curEncData.m_avgQpRc /= slice->m_sps->numCUsInFrame;
791
+            avgQpRc /= slice->m_sps->numCUsInFrame;
792
+            curEncData.m_avgQpRc = x265_clip3((double)QP_MIN, (double)QP_MAX_MAX, avgQpRc);
793
             rce->qpaRc = curEncData.m_avgQpRc;
794
         }
795
 
796
         if (m_param->rc.aqMode)
797
         {
798
+            double avgQpAq = 0;
799
             /* determine actual avg encoded QP, after AQ/cutree adjustments */
800
             for (uint32_t i = 0; i < slice->m_sps->numCuInHeight; i++)
801
-                curEncData.m_avgQpAq += curEncData.m_rowStat[i].sumQpAq;
802
+                avgQpAq += curEncData.m_rowStat[i].sumQpAq;
803
 
804
-            curEncData.m_avgQpAq /= (slice->m_sps->numCUsInFrame * NUM_4x4_PARTITIONS);
805
+            avgQpAq /= (slice->m_sps->numCUsInFrame * NUM_4x4_PARTITIONS);
806
+            curEncData.m_avgQpAq = avgQpAq;
807
         }
808
         else
809
             curEncData.m_avgQpAq = curEncData.m_avgQpRc;
810
@@ -2367,13 +2460,13 @@
811
         bool is2passCrfChange = false;
812
         if (m_2pass)
813
         {
814
-            if (abs(curEncData.m_avgQpRc - rce->qpPrev) > 0.1)
815
+            if (fabs(curEncData.m_avgQpRc - rce->qpPrev) > 0.1)
816
             {
817
                 qpRef = rce->qpPrev;
818
                 is2passCrfChange = true;
819
             }
820
         }
821
-        if (is2passCrfChange || abs(qpRef - rce->qpNoVbv) > 0.5)
822
+        if (is2passCrfChange || fabs(qpRef - rce->qpNoVbv) > 0.5)
823
         {
824
             double crfFactor = rce->qRceq /x265_qp2qScale(qpRef);
825
             double baseCplx = m_ncu * (m_param->bframes ? 120 : 80);
826
@@ -2426,6 +2519,11 @@
827
         int pos = m_sliderPos - m_param->frameNumThreads;
828
         if (pos >= 0)
829
             m_encodedBitsWindow[pos % s_slidingWindowFrames] = actualBits;
830
+        if(rce->sliceType != I_SLICE)
831
+        {
832
+        int qp = int (rce->qpaRc + 0.5);
833
+        m_qpToEncodedBits[qp] =  m_qpToEncodedBits[qp] == 0 ? actualBits : (m_qpToEncodedBits[qp] + actualBits) * 0.5;
834
+        }
835
     }
836
 
837
     if (m_2pass)
838
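This is where m_qpToEncodedBits, consumed by tuneQScaleForGrain() above, gets filled: one cell per integer QP, updated as a half-weight exponential moving average and seeded by the first non-I frame observed at that QP.

    #include <cstdint>

    // alpha = 0.5 EMA per integer QP, seeded with the first observation.
    void updateQpBits(double table[], int qp, uint64_t actualBits)
    {
        table[qp] = table[qp] == 0 ? (double)actualBits
                                   : (table[qp] + actualBits) * 0.5;
    }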
@@ -2493,8 +2591,7 @@
839
     if (m_param->rc.cuTree && IS_REFERENCED(curFrame) && !m_param->rc.bStatRead)
840
     {
841
         uint8_t sliceType = (uint8_t)rce->sliceType;
842
-        for (int i = 0; i < m_ncu; i++)
843
-                m_cuTreeStats.qpBuffer[0][i] = (uint16_t)(curFrame->m_lowres.qpCuTreeOffset[i] * 256.0);
844
+        primitives.fix8Pack(m_cuTreeStats.qpBuffer[0], curFrame->m_lowres.qpCuTreeOffset, m_ncu);
845
         if (fwrite(&sliceType, 1, 1, m_cutreeStatFileOut) < 1)
846
             goto writeFailure;
847
         if (fwrite(m_cuTreeStats.qpBuffer[0], sizeof(uint16_t), m_ncu, m_cutreeStatFileOut) < (size_t)m_ncu)
848
@@ -2542,13 +2639,12 @@
849
         int bError = 1;
850
         if (tmpFileName)
851
         {
852
-           unlink(fileName);
853
-           bError = rename(tmpFileName, fileName);
854
+            x265_unlink(fileName);
855
+            bError = x265_rename(tmpFileName, fileName);
856
         }
857
         if (bError)
858
         {
859
-            x265_log(m_param, X265_LOG_ERROR, "failed to rename output stats file to \"%s\"\n",
860
-                     fileName);
861
+            x265_log_file(m_param, X265_LOG_ERROR, "failed to rename output stats file to \"%s\"\n", fileName);
862
         }
863
         X265_FREE(tmpFileName);
864
     }
865
@@ -2561,13 +2657,12 @@
866
         int bError = 1;
867
         if (tmpFileName && newFileName)
868
         {
869
-           unlink(newFileName);
870
-           bError = rename(tmpFileName, newFileName);
871
+            x265_unlink(newFileName);
872
+            bError = x265_rename(tmpFileName, newFileName);
873
         }
874
         if (bError)
875
         {
876
-            x265_log(m_param, X265_LOG_ERROR, "failed to rename cutree output stats file to \"%s\"\n",
877
-                     newFileName);
878
+            x265_log_file(m_param, X265_LOG_ERROR, "failed to rename cutree output stats file to \"%s\"\n", newFileName);
879
         }
880
         X265_FREE(tmpFileName);
881
         X265_FREE(newFileName);
882
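unlink()/rename() become x265_unlink()/x265_rename() so stats-file replacement also behaves on Windows, where plain rename() refuses to overwrite an existing destination; x265_log_file is the companion logger for printing file names. A hedged sketch of the wrapper's general shape (the real implementations live elsewhere in the x265 tree and also deal with UTF-8 paths):

    #include <cstdio>
    #ifdef _WIN32
    #include <windows.h>
    #endif

    int portableRename(const char* oldName, const char* newName)
    {
    #ifdef _WIN32
        // rename() on Windows fails if newName exists; replace explicitly.
        return MoveFileExA(oldName, newName, MOVEFILE_REPLACE_EXISTING) ? 0 : -1;
    #else
        return rename(oldName, newName);
    #endif
    }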
@@ -2577,6 +2672,7 @@
883
         fclose(m_cutreeStatFileIn);
884
 
885
     X265_FREE(m_rce2Pass);
886
+    X265_FREE(m_encOrder);
887
     for (int i = 0; i < 2; i++)
888
         X265_FREE(m_cuTreeStats.qpBuffer[i]);
889
     
890
x265_1.9.tar.gz/source/encoder/ratecontrol.h -> x265_2.0.tar.gz/source/encoder/ratecontrol.h Changed
77
 
1
@@ -107,6 +107,7 @@
2
     int      miscBits;
3
     int      coeffBits;
4
     bool     keptAsRef;
5
+    bool     scenecut;
6
 
7
     SEIPictureTiming *picTimingSEI;
8
     HRDTiming        *hrdTiming;
9
@@ -126,8 +127,9 @@
10
     bool   m_isVbv;
11
     bool   m_isCbr;
12
     bool   m_singleFrameVbv;
13
-
14
+    bool   m_isGrainEnabled;
15
     bool   m_isAbrReset;
16
+    bool   m_isNextGop;
17
     int    m_lastAbrResetPoc;
18
 
19
     double m_rateTolerance;
20
@@ -141,7 +143,8 @@
21
     double m_vbvMaxRate;       /* in kbps */
22
     double m_rateFactorMaxIncrement; /* Don't allow RF above (CRF + this value). */
23
     double m_rateFactorMaxDecrement; /* don't allow RF below (this value). */
24
-
25
+    double m_avgPFrameQp;
26
+    bool   m_isFirstMiniGop;
27
     Predictor m_pred[4];       /* Slice predictors to preidct bits for each Slice type - I,P,Bref and B */
28
     int64_t m_leadingNoBSatd;
29
     int     m_predType;       /* Type of slice predictors to be used - depends on the slice type */
30
@@ -178,7 +181,7 @@
31
     bool    m_isPatternPresent;
32
     bool    m_isSceneTransition;
33
     int     m_lastPredictorReset;
34
-
35
+    double  m_qpToEncodedBits[QP_MAX_MAX + 1];
36
     /* a common variable on which rateControlStart, rateControlEnd and rateControUpdateStats waits to
37
      * sync the calls to these functions. For example
38
      * -F2:
39
@@ -202,7 +205,11 @@
40
 
41
     /* 2 pass */
42
     bool    m_2pass;
43
+    bool    m_isGopReEncoded;
44
+    bool    m_isQpModified;
45
     int     m_numEntries;
46
+    int     m_start;
47
+    int     m_reencode;
48
     FILE*   m_statFileOut;
49
     FILE*   m_cutreeStatFileOut;
50
     FILE*   m_cutreeStatFileIn;
51
@@ -235,6 +242,8 @@
52
     bool cuTreeReadFor2Pass(Frame* curFrame);
53
     void hrdFullness(SEIBufferingPeriod* sei);
54
     int writeRateControlFrameStats(Frame* curFrame, RateControlEntry* rce);
55
+    bool   initPass2();
56
+
57
 protected:
58
 
59
     static const int   s_slidingWindowFrames;
60
@@ -261,14 +270,14 @@
61
     double predictSize(Predictor *p, double q, double var);
62
     void   checkAndResetABR(RateControlEntry* rce, bool isFrameDone);
63
     double predictRowsSizeSum(Frame* pic, RateControlEntry* rce, double qpm, int32_t& encodedBits);
64
-    bool   initPass2();
65
-    bool   analyseABR2Pass(int startPoc, int endPoc, uint64_t allAvailableBits);
66
+    bool   analyseABR2Pass(uint64_t allAvailableBits);
67
     void   initFramePredictors();
68
     double getDiffLimitedQScale(RateControlEntry *rce, double q);
69
     double countExpectedBits(int startPos, int framesCount);
70
     bool   vbv2Pass(uint64_t allAvailableBits, int frameCount, int startPos);
71
     bool   findUnderflow(double *fills, int *t0, int *t1, int over, int framesCount);
72
     bool   fixUnderflow(int t0, int t1, double adjustment, double qscaleMin, double qscaleMax);
73
+    double tuneQScaleForGrain(double rcOverflow);
74
 };
75
 }
76
 #endif // ifndef X265_RATECONTROL_H
77
x265_1.9.tar.gz/source/encoder/reference.cpp -> x265_2.0.tar.gz/source/encoder/reference.cpp Changed
10
 
1
@@ -68,7 +68,7 @@
2
         intptr_t stride = reconPic->m_stride;
3
         int cuHeight = g_maxCUSize;
4
 
5
-        for (int c = 0; c < (p.internalCsp != X265_CSP_I400 ? numInterpPlanes : 1); c++)
6
+        for (int c = 0; c < (p.internalCsp != X265_CSP_I400 && recPic->m_picCsp != X265_CSP_I400 ? numInterpPlanes : 1); c++)
7
         {
8
             if (c == 1)
9
             {
10
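This reference.cpp change, like the sao.cpp changes below, extends monochrome support: chroma planes are skipped when either the configured internal csp or the incoming picture itself (m_picCsp) is X265_CSP_I400, which matters when gray frames are fed into a chroma-capable encode. The plane-count selection as a tiny helper (X265_CSP_I400 is 0 in the public headers):

    // Touch chroma only when both the encoder csp and this picture carry it.
    int planesToProcess(int internalCsp, int picCsp, int numPlanes)
    {
        const int kI400 = 0; // X265_CSP_I400
        return (internalCsp != kI400 && picCsp != kI400) ? numPlanes : 1;
    }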
x265_1.9.tar.gz/source/encoder/sao.cpp -> x265_2.0.tar.gz/source/encoder/sao.cpp Changed
1432
 
1
@@ -53,7 +53,7 @@
2
     return r;
3
 }
4
 
5
-inline int64_t estSaoDist(int32_t count, int offset, int32_t offsetOrg)
6
+inline int64_t estSaoDist(int32_t count, int32_t offset, int32_t offsetOrg)
7
 {
8
     return (count * offset - offsetOrg * 2) * offset;
9
 }
10
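The estSaoDist() change only widens offset to int32_t, but the formula deserves a note: with d = org - rec per pixel, count = N pixels in a class and offsetOrg = sum(d), adding offset o to the reconstruction changes the SSE by exactly the value returned:

    // SSE delta from applying offset o to every pixel of one SAO class:
    //   sum((d - o)^2 - d^2) = N*o^2 - 2*o*sum(d)
    //                        = (count*offset - offsetOrg*2) * offset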
@@ -76,8 +76,6 @@
11
     m_countPreDblk = NULL;
12
     m_offsetOrgPreDblk = NULL;
13
     m_refDepth = 0;
14
-    m_lumaLambda = 0;
15
-    m_chromaLambda = 0;
16
     m_param = NULL;
17
     m_clipTable = NULL;
18
     m_clipTableBase = NULL;
19
@@ -120,8 +118,11 @@
20
 
21
     if (initCommon)
22
     {
23
-        CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
24
-        CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
25
+        if (m_param->bSaoNonDeblocked)
26
+        {
27
+            CHECKED_MALLOC(m_countPreDblk, PerPlane, numCtu);
28
+            CHECKED_MALLOC(m_offsetOrgPreDblk, PerPlane, numCtu);
29
+        }
30
         CHECKED_MALLOC(m_depthSaoRate, double, 2 * SAO_DEPTHRATE_SIZE);
31
 
32
         m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + 0] = 0;
33
@@ -137,17 +138,16 @@
34
         m_clipTable = &(m_clipTableBase[rangeExt]);
35
 
36
         // Share with fast clip lookup table
37
-        if (initCommon)
38
-        {
39
-            for (int i = 0; i < rangeExt; i++)
40
-                m_clipTableBase[i] = 0;
41
 
42
-            for (int i = 0; i < maxY; i++)
43
-                m_clipTable[i] = (pixel)i;
44
+        for (int i = 0; i < rangeExt; i++)
45
+            m_clipTableBase[i] = 0;
46
+
47
+        for (int i = 0; i < maxY; i++)
48
+            m_clipTable[i] = (pixel)i;
49
+
50
+        for (int i = maxY; i < maxY + rangeExt; i++)
51
+            m_clipTable[i] = maxY;
52
 
53
-            for (int i = maxY; i < maxY + rangeExt; i++)
54
-                m_clipTable[i] = maxY;
55
-        }
56
     }
57
     else
58
     {
59
@@ -204,8 +204,11 @@
60
 
61
     if (destoryCommon)
62
     {
63
-        X265_FREE_ZERO(m_countPreDblk);
64
-        X265_FREE_ZERO(m_offsetOrgPreDblk);
65
+        if (m_param->bSaoNonDeblocked)
66
+        {
67
+            X265_FREE_ZERO(m_countPreDblk);
68
+            X265_FREE_ZERO(m_offsetOrgPreDblk);
69
+        }
70
         X265_FREE_ZERO(m_depthSaoRate);
71
         X265_FREE_ZERO(m_clipTableBase);
72
     }
73
@@ -221,17 +224,10 @@
74
         saoParam->ctuParam[i] = new SaoCtuParam[m_numCuInHeight * m_numCuInWidth];
75
 }
76
 
77
-void SAO::startSlice(Frame* frame, Entropy& initState, int qp)
78
+void SAO::startSlice(Frame* frame, Entropy& initState)
79
 {
80
-    Slice* slice = frame->m_encData->m_slice;
81
-    int qpCb = qp;
82
-    if (m_param->internalCsp == X265_CSP_I420)
83
-        qpCb = x265_clip3(QP_MIN, QP_MAX_MAX, (int)g_chromaScale[qp + slice->m_pps->chromaQpOffset[0]]);
84
-    else
85
-        qpCb = X265_MIN(qp + slice->m_pps->chromaQpOffset[0], QP_MAX_SPEC);
86
-    m_lumaLambda = x265_lambda2_tab[qp];
87
-    m_chromaLambda = x265_lambda2_tab[qpCb]; // Use Cb QP for SAO chroma
88
     m_frame = frame;
89
+    Slice* slice = m_frame->m_encData->m_slice;
90
 
91
     switch (slice->m_sliceType)
92
     {
93
@@ -259,7 +255,7 @@
94
     }
95
 
96
     saoParam->bSaoFlag[0] = true;
97
-    saoParam->bSaoFlag[1] = m_param->internalCsp != X265_CSP_I400;
98
+    saoParam->bSaoFlag[1] = m_param->internalCsp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
99
 
100
     m_numNoSao[0] = 0; // Luma
101
     m_numNoSao[1] = 0; // Chroma
102
@@ -275,9 +271,8 @@
103
 }
104
 
105
 // CTU-based SAO process without slice granularity
106
-void SAO::processSaoCu(int addr, int typeIdx, int plane)
107
+void SAO::applyPixelOffsets(int addr, int typeIdx, int plane)
108
 {
109
-    int x, y;
110
     PicYuv* reconPic = m_frame->m_reconPic;
111
     pixel* rec = reconPic->getPlaneAddr(plane, addr);
112
     intptr_t stride = plane ? reconPic->m_strideC : reconPic->m_stride;
113
@@ -302,20 +297,13 @@
114
     ctuWidth  = rpelx - lpelx;
115
     ctuHeight = bpely - tpely;
116
 
117
-    int startX;
118
-    int startY;
119
-    int endX;
120
-    int endY;
121
-    pixel* tmpL;
122
-    pixel* tmpU;
123
-
124
     int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1, signLeft1[2];
125
     int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
126
 
127
     memset(_upBuff1 + MAX_CU_SIZE, 0, 2 * sizeof(int8_t)); /* avoid valgrind uninit warnings */
128
 
129
-    tmpL = m_tmpL1[plane];
130
-    tmpU = &(m_tmpU[plane][lpelx]);
131
+    pixel* tmpL = m_tmpL1[plane];
132
+    pixel* tmpU = &(m_tmpU[plane][lpelx]);
133
 
134
     int8_t* offsetEo = m_offsetEo[plane];
135
 
136
@@ -324,14 +312,14 @@
137
     case SAO_EO_0: // dir: -
138
     {
139
         pixel firstPxl = 0, lastPxl = 0, row1FirstPxl = 0, row1LastPxl = 0;
140
-        startX = !lpelx;
141
-        endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
142
+        int startX = !lpelx;
143
+        int endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
144
         if (ctuWidth & 15)
145
         {
146
-            for (y = 0; y < ctuHeight; y++)
147
+            for (int y = 0; y < ctuHeight; y++, rec += stride)
148
             {
149
                 int signLeft = signOf(rec[startX] - tmpL[y]);
150
-                for (x = startX; x < endX; x++)
151
+                for (int x = startX; x < endX; x++)
152
                 {
153
                     int signRight = signOf(rec[x] - rec[x + 1]);
154
                     int edgeType = signRight + signLeft + 2;
155
@@ -339,13 +327,11 @@
156
 
157
                     rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
158
                 }
159
-
160
-                rec += stride;
161
             }
162
         }
163
         else
164
         {
165
-            for (y = 0; y < ctuHeight; y += 2)
166
+            for (int y = 0; y < ctuHeight; y += 2, rec += 2 * stride)
167
             {
168
                 signLeft1[0] = signOf(rec[startX] - tmpL[y]);
169
                 signLeft1[1] = signOf(rec[stride + startX] - tmpL[y + 1]);
170
@@ -375,27 +361,25 @@
171
                     rec[ctuWidth - 1] = lastPxl;
172
                     rec[stride + ctuWidth - 1] = row1LastPxl;
173
                 }
174
-
175
-                rec += 2 * stride;
176
             }
177
         }
178
         break;
179
     }
180
     case SAO_EO_1: // dir: |
181
     {
182
-        startY = !tpely;
183
-        endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
184
+        int startY = !tpely;
185
+        int endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
186
         if (!tpely)
187
             rec += stride;
188
 
189
         if (ctuWidth & 15)
190
         {
191
-            for (x = 0; x < ctuWidth; x++)
192
+            for (int x = 0; x < ctuWidth; x++)
193
                 upBuff1[x] = signOf(rec[x] - tmpU[x]);
194
 
195
-            for (y = startY; y < endY; y++)
196
+            for (int y = startY; y < endY; y++, rec += stride)
197
             {
198
-                for (x = 0; x < ctuWidth; x++)
199
+                for (int x = 0; x < ctuWidth; x++)
200
                 {
201
                     int8_t signDown = signOf(rec[x] - rec[x + stride]);
202
                     int edgeType = signDown + upBuff1[x] + 2;
203
@@ -403,8 +387,6 @@
204
 
205
                     rec[x] = m_clipTable[rec[x] + offsetEo[edgeType]];
206
                 }
207
-
208
-                rec += stride;
209
             }
210
         }
211
         else
212
@@ -412,11 +394,9 @@
213
             primitives.sign(upBuff1, rec, tmpU, ctuWidth);
214
 
215
             int diff = (endY - startY) % 2;
216
-            for (y = startY; y < endY - diff; y += 2)
217
-            {
218
+            for (int y = startY; y < endY - diff; y += 2, rec += 2 * stride)
219
                 primitives.saoCuOrgE1_2Rows(rec, upBuff1, offsetEo, stride, ctuWidth);
220
-                rec += 2 * stride;
221
-            }
222
+
223
             if (diff & 1)
224
                 primitives.saoCuOrgE1(rec, upBuff1, offsetEo, stride, ctuWidth);
225
         }
226
@@ -425,11 +405,11 @@
227
     }
228
     case SAO_EO_2: // dir: 135
229
     {
230
-        startX = !lpelx;
231
-        endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
232
+        int startX = !lpelx;
233
+        int endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
234
 
235
-        startY = !tpely;
236
-        endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
237
+        int startY = !tpely;
238
+        int endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
239
 
240
         if (!tpely)
241
             rec += stride;
242
@@ -454,16 +434,16 @@
243
         }
244
         else
245
         {
246
-            for (x = startX; x < endX; x++)
247
+            for (int x = startX; x < endX; x++)
248
                 upBuff1[x] = signOf(rec[x] - tmpU[x - 1]);
249
         }
250
 
251
         if (ctuWidth & 15)
252
         {
253
-             for (y = startY; y < endY; y++)
254
+             for (int y = startY; y < endY; y++, rec += stride)
255
              {
256
                  upBufft[startX] = signOf(rec[stride + startX] - tmpL[y]);
257
-                 for (x = startX; x < endX; x++)
258
+                 for (int x = startX; x < endX; x++)
259
                  {
260
                      int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
261
                      int edgeType = signDown + upBuff1[x] + 2;
262
@@ -472,13 +452,11 @@
263
                  }
264
 
265
                  std::swap(upBuff1, upBufft);
266
-
267
-                 rec += stride;
268
              }
269
         }
270
         else
271
         {
272
-            for (y = startY; y < endY; y++)
273
+            for (int y = startY; y < endY; y++, rec += stride)
274
             {
275
                 int8_t iSignDown2 = signOf(rec[stride + startX] - tmpL[y]);
276
 
277
@@ -487,30 +465,29 @@
278
                 upBufft[startX] = iSignDown2;
279
 
280
                 std::swap(upBuff1, upBufft);
281
-                rec += stride;
282
             }
283
         }
284
         break;
285
     }
286
     case SAO_EO_3: // dir: 45
287
     {
288
-        startX = !lpelx;
289
-        endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
290
+        int startX = !lpelx;
291
+        int endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
292
 
293
-        startY = !tpely;
294
-        endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
295
+        int startY = !tpely;
296
+        int endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight;
297
 
298
         if (!tpely)
299
             rec += stride;
300
 
301
         if (ctuWidth & 15)
302
         {
303
-            for (x = startX - 1; x < endX; x++)
304
+            for (int x = startX - 1; x < endX; x++)
305
                 upBuff1[x] = signOf(rec[x] - tmpU[x + 1]);
306
 
307
-            for (y = startY; y < endY; y++)
308
+            for (int y = startY; y < endY; y++, rec += stride)
309
             {
310
-                x = startX;
311
+                int x = startX;
312
                 int8_t signDown = signOf(rec[x] - tmpL[y + 1]);
313
                 int edgeType = signDown + upBuff1[x] + 2;
314
                 upBuff1[x - 1] = -signDown;
315
@@ -525,8 +502,6 @@
316
                 }
317
 
318
                 upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
319
-
320
-                rec += stride;
321
             }
322
         }
323
         else
324
@@ -545,9 +520,9 @@
325
             if (rpelx == picWidth)
326
                 upBuff1[ctuWidth - 1] = lastSign;
327
 
328
-            for (y = startY; y < endY; y++)
329
+            for (int y = startY; y < endY; y++, rec += stride)
330
             {
331
-                x = startX;
332
+                int x = startX;
333
                 int8_t signDown = signOf(rec[x] - tmpL[y + 1]);
334
                 int edgeType = signDown + upBuff1[x] + 2;
335
                 upBuff1[x - 1] = -signDown;
336
@@ -556,8 +531,6 @@
337
                 primitives.saoCuOrgE3[endX > 16](rec, upBuff1, offsetEo, stride - 1, startX, endX);
338
 
339
                 upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
340
-
341
-                rec += stride;
342
             }
343
         }
344
 
345
@@ -571,24 +544,14 @@
346
         {
347
             #define SAO_BO_BITS 5
348
             const int boShift = X265_DEPTH - SAO_BO_BITS;
349
-            for (y = 0; y < ctuHeight; y++)
350
-            {
351
-                for (x = 0; x < ctuWidth; x++)
352
-                {
353
-                     int val = rec[x] + offsetBo[rec[x] >> boShift];
354
-                     if (val < 0)
355
-                         val = 0;
356
-                     else if (val > ((1 << X265_DEPTH) - 1))
357
-                         val = ((1 << X265_DEPTH) - 1);
358
-                     rec[x] = (pixel)val;
359
-                }
360
-                rec += stride;
361
-            }
362
+
363
+            for (int y = 0; y < ctuHeight; y++, rec += stride)
364
+                for (int x = 0; x < ctuWidth; x++)
365
+                    rec[x] = x265_clip(rec[x] + offsetBo[rec[x] >> boShift]);
366
         }
367
         else
368
-        {
369
             primitives.saoCuOrgB0(rec, offsetBo, ctuWidth, ctuHeight, stride);
370
-        }
371
+
372
         break;
373
     }
374
     default: break;
375
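The rewritten band-offset fallback above relies on x265_clip(), which clamps to the pixel range of the build's bit depth, equivalent to the open-coded tests it replaces:

    // Equivalent of the x265_clip() call above for a given bit depth.
    inline int clipPixel(int v, int bitDepth)
    {
        const int maxVal = (1 << bitDepth) - 1; // X265_DEPTH at build time
        return v < 0 ? 0 : v > maxVal ? maxVal : v;
    }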
@@ -596,7 +559,7 @@
376
 }
377
 
378
 /* Process SAO unit */
379
-void SAO::processSaoUnitCuLuma(SaoCtuParam* ctuParam, int idxY, int idxX)
380
+void SAO::generateLumaOffsets(SaoCtuParam* ctuParam, int idxY, int idxX)
381
 {
382
     PicYuv* reconPic = m_frame->m_reconPic;
383
     intptr_t stride = reconPic->m_stride;
384
@@ -637,7 +600,7 @@
385
                 memset(m_offsetBo[0], 0, sizeof(m_offsetBo[0]));
386
 
387
                 for (int i = 0; i < SAO_NUM_OFFSET; i++)
388
-                    m_offsetBo[0][((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC);
389
+                    m_offsetBo[0][((ctuParam[addr].bandPos + i) & (MAX_NUM_SAO_CLASS - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC);
390
             }
391
             else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
392
             {
393
@@ -650,13 +613,13 @@
394
                     m_offsetEo[0][edgeType] = (int8_t)offset[s_eoTable[edgeType]];
395
             }
396
         }
397
-        processSaoCu(addr, typeIdx, 0);
398
+        applyPixelOffsets(addr, typeIdx, 0);
399
     }
400
     std::swap(m_tmpL1[0], m_tmpL2[0]);
401
 }
402
 
403
 /* Process SAO unit (Chroma only) */
404
-void SAO::processSaoUnitCuChroma(SaoCtuParam* ctuParam[3], int idxY, int idxX)
405
+void SAO::generateChromaOffsets(SaoCtuParam* ctuParam[3], int idxY, int idxX)
406
 {
407
     PicYuv* reconPic = m_frame->m_reconPic;
408
     intptr_t stride = reconPic->m_strideC;
409
@@ -712,7 +675,7 @@
410
                 memset(m_offsetBo[1], 0, sizeof(m_offsetBo[0]));
411
 
412
                 for (int i = 0; i < SAO_NUM_OFFSET; i++)
413
-                    m_offsetBo[1][((ctuParam[1][addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[1][addr].offset[i] << SAO_BIT_INC);
414
+                    m_offsetBo[1][((ctuParam[1][addr].bandPos + i) & (MAX_NUM_SAO_CLASS - 1))] = (int8_t)(ctuParam[1][addr].offset[i] << SAO_BIT_INC);
415
             }
416
             else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
417
             {
418
@@ -725,7 +688,7 @@
419
                     m_offsetEo[1][edgeType] = (int8_t)offset[s_eoTable[edgeType]];
420
             }
421
         }
422
-        processSaoCu(addr, typeIdxCb, 1);
423
+        applyPixelOffsets(addr, typeIdxCb, 1);
424
     }
425
 
426
     // Process V
427
@@ -738,7 +701,7 @@
428
                 memset(m_offsetBo[2], 0, sizeof(m_offsetBo[0]));
429
 
430
                 for (int i = 0; i < SAO_NUM_OFFSET; i++)
431
-                    m_offsetBo[2][((ctuParam[2][addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[2][addr].offset[i] << SAO_BIT_INC);
432
+                    m_offsetBo[2][((ctuParam[2][addr].bandPos + i) & (MAX_NUM_SAO_CLASS - 1))] = (int8_t)(ctuParam[2][addr].offset[i] << SAO_BIT_INC);
433
             }
434
             else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
435
             {
436
@@ -751,25 +714,15 @@
437
                     m_offsetEo[2][edgeType] = (int8_t)offset[s_eoTable[edgeType]];
438
             }
439
         }
440
-        processSaoCu(addr, typeIdxCb, 2);
441
+        applyPixelOffsets(addr, typeIdxCb, 2);
442
     }
443
 
444
     std::swap(m_tmpL1[1], m_tmpL2[1]);
445
     std::swap(m_tmpL1[2], m_tmpL2[2]);
446
 }
447
 
448
-void SAO::copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc)
449
-{
450
-    saoUnitDst->mergeMode   = saoUnitSrc->mergeMode;
451
-    saoUnitDst->typeIdx     = saoUnitSrc->typeIdx;
452
-    saoUnitDst->bandPos     = saoUnitSrc->bandPos;
453
-
454
-    for (int i = 0; i < SAO_NUM_OFFSET; i++)
455
-        saoUnitDst->offset[i] = saoUnitSrc->offset[i];
456
-}
457
-
458
 /* Calculate SAO statistics for current CTU without non-crossing slice */
459
-void SAO::calcSaoStatsCu(int addr, int plane)
460
+void SAO::calcSaoStatsCTU(int addr, int plane)
461
 {
462
     const PicYuv* reconPic = m_frame->m_reconPic;
463
     const CUData* cu = m_frame->m_encData->getPicCTU(addr);
464
@@ -982,7 +935,7 @@
465
     memset(m_offsetOrgPreDblk[addr], 0, sizeof(PerPlane));
466
 
467
     int plane_offset = 0;
468
-    for (int plane = 0; plane < (frame->m_param->internalCsp != X265_CSP_I400 ? NUM_PLANE : 1); plane++)
469
+    for (int plane = 0; plane < (frame->m_param->internalCsp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400? NUM_PLANE : 1); plane++)
470
     {
471
         if (plane == 1)
472
         {
473
@@ -1017,7 +970,7 @@
474
         {
475
             for (x = (y < startY ? startX : 0); x < ctuWidth; x++)
476
             {
477
-                int classIdx = 1 + (rec[x] >> boShift);
478
+                int classIdx = rec[x] >> boShift;
479
                 stats[classIdx] += (fenc[x] - rec[x]);
480
                 count[classIdx]++;
481
             }
482
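Note the classIdx change above: band-offset statistics are now indexed directly by the 5-bit band (0..31) instead of being shifted up one slot. The classification is just the top SAO_BO_BITS of the pixel value:

    // Band index for SAO band offset: the top 5 bits of the pixel value.
    inline int bandIndex(int pixelValue, int bitDepth)
    {
        const int boShift = bitDepth - 5; // X265_DEPTH - SAO_BO_BITS
        return pixelValue >> boShift;     // 0..31
    }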
@@ -1233,137 +1186,31 @@
483
         m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth] = m_numNoSao[1] / ((double)numctus);
484
 }
485
 
486
-void SAO::rdoSaoUnitRow(SAOParam* saoParam, int idxY)
487
+void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr)
488
 {
489
-    SaoCtuParam mergeSaoParam[NUM_MERGE_MODE][2];
490
-    double mergeDist[NUM_MERGE_MODE];
491
-    bool allowMerge[2]; // left, up
492
-    allowMerge[1] = (idxY > 0);
493
-
494
-    for (int idxX = 0; idxX < m_numCuInWidth; idxX++)
495
-    {
496
-        int addr     = idxX + idxY * m_numCuInWidth;
497
-        int addrUp   = idxY ? addr - m_numCuInWidth : -1;
498
-        int addrLeft = idxX ? addr - 1 : -1;
499
-        allowMerge[0] = (idxX > 0);
500
-
501
-        m_entropyCoder.load(m_rdContexts.cur);
502
-        if (allowMerge[0])
503
-            m_entropyCoder.codeSaoMerge(0);
504
-        if (allowMerge[1])
505
-            m_entropyCoder.codeSaoMerge(0);
506
-        m_entropyCoder.store(m_rdContexts.temp);
507
-
508
-        // reset stats Y, Cb, Cr
509
-        X265_CHECK(sizeof(PerPlane) == (sizeof(int32_t) * (NUM_PLANE * MAX_NUM_SAO_TYPE * MAX_NUM_SAO_CLASS)), "Found Padding space in struct PerPlane");
510
-
511
-        // TODO: Confirm the address space is continuous
512
-        if (m_param->bSaoNonDeblocked)
513
-        {
514
-            memcpy(m_count, m_countPreDblk[addr], sizeof(m_count));
515
-            memcpy(m_offsetOrg, m_offsetOrgPreDblk[addr], sizeof(m_offsetOrg));
516
-        }
517
-        else
518
-        {
519
-            memset(m_count, 0, sizeof(m_count));
520
-            memset(m_offsetOrg, 0, sizeof(m_offsetOrg));
521
-        }
522
-
523
-        saoParam->ctuParam[0][addr].reset();
524
-        saoParam->ctuParam[1][addr].reset();
525
-        saoParam->ctuParam[2][addr].reset();
526
-
527
-        if (saoParam->bSaoFlag[0])
528
-            calcSaoStatsCu(addr, 0);
529
-
530
-        if (saoParam->bSaoFlag[1])
531
-        {
532
-            calcSaoStatsCu(addr, 1);
533
-            calcSaoStatsCu(addr, 2);
534
-        }
535
-
536
-        saoComponentParamDist(saoParam, addr, addrUp, addrLeft, &mergeSaoParam[0][0], mergeDist);
537
-        if (m_chromaFormat != X265_CSP_I400)
538
-            sao2ChromaParamDist(saoParam, addr, addrUp, addrLeft, mergeSaoParam, mergeDist);
539
-
540
-        if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
541
-        {
542
-            // Cost of new SAO_params
543
-            m_entropyCoder.load(m_rdContexts.cur);
544
-            m_entropyCoder.resetBits();
545
-            if (allowMerge[0])
546
-                m_entropyCoder.codeSaoMerge(0);
547
-            if (allowMerge[1])
548
-                m_entropyCoder.codeSaoMerge(0);
549
-            for (int plane = 0; plane < 3; plane++)
550
-            {
551
-                if (saoParam->bSaoFlag[plane > 0])
552
-                    m_entropyCoder.codeSaoOffset(saoParam->ctuParam[plane][addr], plane);
553
-            }
554
-
555
-            uint32_t rate = m_entropyCoder.getNumberOfWrittenBits();
556
-            double bestCost = mergeDist[0] + (double)rate;
-            m_entropyCoder.store(m_rdContexts.temp);
+    Slice* slice = m_frame->m_encData->m_slice;
+//    int qp = slice->m_sliceQp;
+    const CUData* cu = m_frame->m_encData->getPicCTU(addr);
+    int qp = cu->m_qp[0];
 
-            // Cost of Merge
-            for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx)
-            {
-                if (!allowMerge[mergeIdx])
-                    continue;
-
-                m_entropyCoder.load(m_rdContexts.cur);
-                m_entropyCoder.resetBits();
-                if (allowMerge[0])
-                    m_entropyCoder.codeSaoMerge(1 - mergeIdx);
-                if (allowMerge[1] && (mergeIdx == 1))
-                    m_entropyCoder.codeSaoMerge(1);
-
-                rate = m_entropyCoder.getNumberOfWrittenBits();
-                double mergeCost = mergeDist[mergeIdx + 1] + (double)rate;
-                if (mergeCost < bestCost)
-                {
-                    SaoMergeMode mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;
-                    bestCost = mergeCost;
-                    m_entropyCoder.store(m_rdContexts.temp);
-                    for (int plane = 0; plane < 3; plane++)
-                    {
-                        mergeSaoParam[plane][mergeIdx].mergeMode = mergeMode;
-                        if (saoParam->bSaoFlag[plane > 0])
-                            copySaoUnit(&saoParam->ctuParam[plane][addr], &mergeSaoParam[plane][mergeIdx]);
-                    }
-                }
-            }
+    int64_t lambda[2] = { 0 };
 
-            if (saoParam->ctuParam[0][addr].typeIdx < 0)
-                m_numNoSao[0]++;
-            if (m_chromaFormat != X265_CSP_I400 && saoParam->ctuParam[1][addr].typeIdx < 0)
-                m_numNoSao[1]++;
+    int qpCb = qp;
+    if (m_param->internalCsp == X265_CSP_I420)
+        qpCb = x265_clip3(QP_MIN, QP_MAX_MAX, (int)g_chromaScale[qp + slice->m_pps->chromaQpOffset[0]]);
+    else
+        qpCb = X265_MIN(qp + slice->m_pps->chromaQpOffset[0], QP_MAX_SPEC);
 
-            m_entropyCoder.load(m_rdContexts.temp);
-            m_entropyCoder.store(m_rdContexts.cur);
-        }
-    }
-}
+    lambda[0] = (int64_t)floor(256.0 * x265_lambda2_tab[qp]);
+    lambda[1] = (int64_t)floor(256.0 * x265_lambda2_tab[qpCb]); // Use Cb QP for SAO chroma
 
-void SAO::rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr)
-{
-    SaoCtuParam mergeSaoParam[NUM_MERGE_MODE][2];
-    double mergeDist[NUM_MERGE_MODE];
     const bool allowMerge[2] = {(idxX != 0), (rowBaseAddr != 0)}; // left, up
 
-    const int addrUp   = rowBaseAddr ? addr - m_numCuInWidth : -1;
-    const int addrLeft = idxX ? addr - 1 : -1;
+    const int addrMerge[2] = {(idxX ? addr - 1 : -1), (rowBaseAddr ? addr - m_numCuInWidth : -1)};// left, up
 
-    bool chroma = m_param->internalCsp != X265_CSP_I400;
+    bool chroma = m_param->internalCsp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
     int planes = chroma ? 3 : 1;
 
-    m_entropyCoder.load(m_rdContexts.cur);
-    if (allowMerge[0])
-        m_entropyCoder.codeSaoMerge(0);
-    if (allowMerge[1])
-        m_entropyCoder.codeSaoMerge(0);
-    m_entropyCoder.store(m_rdContexts.temp);
-
     // reset stats Y, Cb, Cr
     X265_CHECK(sizeof(PerPlane) == (sizeof(int32_t) * (NUM_PLANE * MAX_NUM_SAO_TYPE * MAX_NUM_SAO_CLASS)), "Found Padding space in struct PerPlane");
 
@@ -1383,43 +1230,59 @@
         saoParam->ctuParam[i][addr].reset();
 
     if (saoParam->bSaoFlag[0])
-        calcSaoStatsCu(addr, 0);
+        calcSaoStatsCTU(addr, 0);
 
     if (saoParam->bSaoFlag[1])
     {
-        calcSaoStatsCu(addr, 1);
-        calcSaoStatsCu(addr, 2);
+        calcSaoStatsCTU(addr, 1);
+        calcSaoStatsCTU(addr, 2);
     }
 
-    saoComponentParamDist(saoParam, addr, addrUp, addrLeft, &mergeSaoParam[0][0], mergeDist);
+    saoStatsInitialOffset(planes);
+
+    // SAO distortion calculation
+    m_entropyCoder.load(m_rdContexts.cur);
+    m_entropyCoder.resetBits();
+    if (allowMerge[0])
+        m_entropyCoder.codeSaoMerge(0);
+    if (allowMerge[1])
+        m_entropyCoder.codeSaoMerge(0);
+    m_entropyCoder.store(m_rdContexts.temp);
+
+    // Estimate distortion and cost of new SAO params
+    int64_t bestCost = 0;
+    int64_t rateDist = 0;
+    // Estimate distortion and cost of new SAO params
+    saoLumaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost);
     if (chroma)
-        sao2ChromaParamDist(saoParam, addr, addrUp, addrLeft, mergeSaoParam, mergeDist);
+        saoChromaComponentParamDist(saoParam, addr, rateDist, lambda, bestCost);
 
     if (saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1])
     {
-        // Cost of new SAO_params
-        m_entropyCoder.load(m_rdContexts.cur);
-        m_entropyCoder.resetBits();
-        if (allowMerge[0])
-            m_entropyCoder.codeSaoMerge(0);
-        if (allowMerge[1])
-            m_entropyCoder.codeSaoMerge(0);
-        for (int plane = 0; plane < planes; plane++)
-        {
-            if (saoParam->bSaoFlag[plane > 0])
-                m_entropyCoder.codeSaoOffset(saoParam->ctuParam[plane][addr], plane);
-        }
-
-        uint32_t rate = m_entropyCoder.getNumberOfWrittenBits();
-        double bestCost = mergeDist[0] + (double)rate;
-        m_entropyCoder.store(m_rdContexts.temp);
-
-        // Cost of Merge
+        // Cost of merge left or Up
        for (int mergeIdx = 0; mergeIdx < 2; ++mergeIdx)
        {
             if (!allowMerge[mergeIdx])
                 continue;
 
+            int64_t mergeDist = 0;
+            for (int plane = 0; plane < planes; plane++)
+            {
+                int64_t estDist = 0;
+                SaoCtuParam* mergeSrcParam = &(saoParam->ctuParam[plane][addrMerge[mergeIdx]]);
+                int typeIdx = mergeSrcParam->typeIdx;
+                if (typeIdx >= 0)
+                {
+                    int bandPos = (typeIdx == SAO_BO) ? mergeSrcParam->bandPos : 1;
+                    for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
+                    {
+                        int mergeOffset = mergeSrcParam->offset[classIdx];
+                        estDist += estSaoDist(m_count[plane][typeIdx][classIdx + bandPos], mergeOffset, m_offsetOrg[plane][typeIdx][classIdx + bandPos]);
+                    }
+                }
+                mergeDist += (estDist << 8) / lambda[!!plane];
+            }
+
             m_entropyCoder.load(m_rdContexts.cur);
             m_entropyCoder.resetBits();
             if (allowMerge[0])
@@ -1427,8 +1290,8 @@
             if (allowMerge[1] && (mergeIdx == 1))
                 m_entropyCoder.codeSaoMerge(1);
 
-            rate = m_entropyCoder.getNumberOfWrittenBits();
-            double mergeCost = mergeDist[mergeIdx + 1] + (double)rate;
+            uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
+            int64_t mergeCost = mergeDist + estRate;
             if (mergeCost < bestCost)
             {
                 SaoMergeMode mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;
@@ -1436,9 +1299,17 @@
                 m_entropyCoder.store(m_rdContexts.temp);
                 for (int plane = 0; plane < planes; plane++)
                 {
-                    mergeSaoParam[plane][mergeIdx].mergeMode = mergeMode;
                     if (saoParam->bSaoFlag[plane > 0])
-                        copySaoUnit(&saoParam->ctuParam[plane][addr], &mergeSaoParam[plane][mergeIdx]);
+                    {
+                        SaoCtuParam* dstCtuParam   = &saoParam->ctuParam[plane][addr];
+                        SaoCtuParam* mergeSrcParam = &(saoParam->ctuParam[plane][addrMerge[mergeIdx]]);
+                        dstCtuParam->mergeMode = mergeMode;
+                        dstCtuParam->typeIdx   = mergeSrcParam->typeIdx;
+                        dstCtuParam->bandPos   = mergeSrcParam->bandPos;
+
+                        for (int i = 0; i < SAO_NUM_OFFSET; i++)
+                            dstCtuParam->offset[i] = mergeSrcParam->offset[i];
+                    }
                 }
             }
         }
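The merge-candidate loop above compares costs in rate units rather than the usual D + lambda*R form: each plane's estimated distortion is divided by lambda (kept in Q8 fixed point, since lambda was stored as floor(256.0 * x265_lambda2_tab[qp])), so a candidate's total cost is its normalized distortion plus its signalled bits. A minimal sketch of that convention, with a hypothetical helper name and assuming a non-zero Q8 lambda:

    #include <cstdint>

    // Sketch only: mirrors mergeDist += (estDist << 8) / lambda[!!plane]
    // followed by mergeCost = mergeDist + estRate in the hunks above.
    static inline int64_t costInRateUnits(int64_t dist, uint32_t bits, int64_t lambdaQ8)
    {
        return (dist << 8) / lambdaQ8 + bits; // D/lambda + R; the Q8 scale cancels
    }

Working in rate units keeps the per-candidate comparison in 64-bit integers, which is the point of dropping the old double-based mergeDist[] tables.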
@@ -1452,309 +1323,371 @@
     }
 }
 
-/** rate distortion optimization of SAO unit */
-inline int64_t SAO::estSaoTypeDist(int plane, int typeIdx, double lambda, int32_t* currentDistortionTableBo, double* currentRdCostTableBo)
+
+// Rounds the division of initial offsets by the number of samples in
+// each of the statistics table entries.
+void SAO::saoStatsInitialOffset(int planes)
 {
-    int64_t estDist = 0;
+    memset(m_offset, 0, sizeof(m_offset));
 
-    for (int classIdx = 1; classIdx < ((typeIdx < SAO_BO) ?  SAO_EO_LEN + 1 : SAO_NUM_BO_CLASSES + 1); classIdx++)
+    // EO
+    for (int plane = 0; plane < planes; plane++)
     {
-        int32_t  count = m_count[plane][typeIdx][classIdx];
-        int32_t& offsetOrg = m_offsetOrg[plane][typeIdx][classIdx];
-        int32_t& offsetOut = m_offset[plane][typeIdx][classIdx];
-
-        if (typeIdx == SAO_BO)
+        for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE - 1; typeIdx++)
         {
-            currentDistortionTableBo[classIdx - 1] = 0;
-            currentRdCostTableBo[classIdx - 1] = lambda;
-        }
-        if (count)
-        {
-            int offset = roundIBDI(offsetOrg << (X265_DEPTH - 8), count);
-            offset = x265_clip3(-OFFSET_THRESH + 1, OFFSET_THRESH - 1, offset);
-            if (typeIdx < SAO_BO)
+            for (int classIdx = 1; classIdx < SAO_NUM_OFFSET + 1; classIdx++)
             {
-                if (classIdx < 3)
-                    offset = X265_MAX(offset, 0);
-                else
-                    offset = X265_MIN(offset, 0);
+                int32_t&  count     = m_count[plane][typeIdx][classIdx];
+                int32_t& offsetOrg = m_offsetOrg[plane][typeIdx][classIdx];
+                int32_t& offsetOut = m_offset[plane][typeIdx][classIdx];
+
+                if (count)
+                {
+                    offsetOut = roundIBDI(offsetOrg, count << SAO_BIT_INC);
+                    offsetOut = x265_clip3(-OFFSET_THRESH + 1, OFFSET_THRESH - 1, offsetOut);
+
+                    if (classIdx < 3)
+                        offsetOut = X265_MAX(offsetOut, 0);
+                    else
+                        offsetOut = X265_MIN(offsetOut, 0);
+                }
             }
-            offsetOut = estIterOffset(typeIdx, classIdx, lambda, offset, count, offsetOrg, currentDistortionTableBo, currentRdCostTableBo);
         }
-        else
+    }
+
+    // BO
+    for (int plane = 0; plane < planes; plane++)
+    {
+        for (int classIdx = 0; classIdx < MAX_NUM_SAO_CLASS; classIdx++)
         {
-            offsetOrg = 0;
-            offsetOut = 0;
+            int32_t&  count     = m_count[plane][SAO_BO][classIdx];
+            int32_t& offsetOrg = m_offsetOrg[plane][SAO_BO][classIdx];
+            int32_t& offsetOut = m_offset[plane][SAO_BO][classIdx];
+
+            if (count)
+            {
+                offsetOut = roundIBDI(offsetOrg, count << SAO_BIT_INC);
+                offsetOut = x265_clip3(-OFFSET_THRESH + 1, OFFSET_THRESH - 1, offsetOut);
+            }
         }
-        if (typeIdx != SAO_BO)
-            estDist += estSaoDist(count, (int)offsetOut << SAO_BIT_INC, offsetOrg);
     }
+}
 
-    return estDist;
+inline int64_t SAO::calcSaoRdoCost(int64_t distortion, uint32_t bits, int64_t lambda)
+{
+#if X265_DEPTH < 10
+        X265_CHECK(bits <= (INT64_MAX - 128) / lambda,
+                   "calcRdCost wrap detected dist: " X265_LL ", bits %u, lambda: " X265_LL "\n",
+                   distortion, bits, lambda);
+#else
+        X265_CHECK(bits <= (INT64_MAX - 128) / lambda,
+                   "calcRdCost wrap detected dist: " X265_LL ", bits %u, lambda: " X265_LL "\n",
+                   distortion, bits, lambda);
+#endif
+        return distortion + ((bits * lambda + 128) >> 8);
 }
 
-inline int SAO::estIterOffset(int typeIdx, int classIdx, double lambda, int offset, int32_t count, int32_t offsetOrg, int32_t* currentDistortionTableBo, double* currentRdCostTableBo)
+void SAO::estIterOffset(int typeIdx, int64_t lambda, int32_t count, int32_t offsetOrg, int32_t& offset, int32_t& distClasses, int64_t& costClasses)
 {
-    int offsetOut = 0;
+    int bestOffset = 0;
+    distClasses    = 0;
 
-    // Assuming sending quantized value 0 results in zero offset and sending the value zero needs 1 bit. entropy coder can be used to measure the exact rate here.
-    double tempMinCost = lambda;
+    // Assuming sending quantized value 0 results in zero offset and sending the value zero needs 1 bit.
+    // entropy coder can be used to measure the exact rate here.
+    int64_t bestCost = calcSaoRdoCost(0, 1, lambda);
     while (offset != 0)
     {
         // Calculate the bits required for signalling the offset
-        int tempRate = (typeIdx == SAO_BO) ? (abs(offset) + 2) : (abs(offset) + 1);
+        uint32_t rate = (typeIdx == SAO_BO) ? (abs(offset) + 2) : (abs(offset) + 1);
         if (abs(offset) == OFFSET_THRESH - 1)
-            tempRate--;
+            rate--;
 
         // Do the dequntization before distorion calculation
-        int tempOffset = offset << SAO_BIT_INC;
-        int64_t tempDist  = estSaoDist(count, tempOffset, offsetOrg);
-        double tempCost   = ((double)tempDist + lambda * (double)tempRate);
-        if (tempCost < tempMinCost)
+        int64_t dist = estSaoDist(count, offset << SAO_BIT_INC, offsetOrg);
+        int64_t cost  = calcSaoRdoCost(dist, rate, lambda);
+        if (cost < bestCost)
         {
-            tempMinCost = tempCost;
-            offsetOut = offset;
-            if (typeIdx == SAO_BO)
-            {
-                currentDistortionTableBo[classIdx - 1] = (int)tempDist;
-                currentRdCostTableBo[classIdx - 1] = tempCost;
-            }
+            bestCost = cost;
+            bestOffset = offset;
+            distClasses = (int)dist;
         }
         offset = (offset > 0) ? (offset - 1) : (offset + 1);
     }
 
-    return offsetOut;
+    costClasses = bestCost;
+    offset = bestOffset;
 }
 
-void SAO::saoComponentParamDist(SAOParam* saoParam, int addr, int addrUp, int addrLeft, SaoCtuParam* mergeSaoParam, double* mergeDist)
+void SAO::saoLumaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& rateDist, int64_t* lambda, int64_t &bestCost)
 {
     int64_t bestDist = 0;
+    int bestTypeIdx = -1;
 
     SaoCtuParam* lclCtuParam = &saoParam->ctuParam[0][addr];
 
-    double bestRDCostTableBo = MAX_DOUBLE;
-    int    bestClassTableBo  = 0;
-    int    currentDistortionTableBo[MAX_NUM_SAO_CLASS];
-    double currentRdCostTableBo[MAX_NUM_SAO_CLASS];
+    int32_t distClasses[MAX_NUM_SAO_CLASS];
+    int64_t costClasses[MAX_NUM_SAO_CLASS];
 
+    // RDO SAO_NA
     m_entropyCoder.load(m_rdContexts.temp);
     m_entropyCoder.resetBits();
-    m_entropyCoder.codeSaoOffset(*lclCtuParam, 0);
-    double dCostPartBest = m_entropyCoder.getNumberOfWrittenBits() * m_lumaLambda;
+    m_entropyCoder.codeSaoType(0);
 
-    for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE; typeIdx++)
-    {
-        int64_t estDist = estSaoTypeDist(0, typeIdx, m_lumaLambda, currentDistortionTableBo, currentRdCostTableBo);
+    int64_t costPartBest = calcSaoRdoCost(0, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]);
 
-        if (typeIdx == SAO_BO)
+    //EO distortion calculation
+    for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE - 1; typeIdx++)
+    {
+        int64_t estDist = 0;
+        for (int classIdx = 1; classIdx < SAO_NUM_OFFSET + 1; classIdx++)
         {
-            // Estimate Best Position
-            for (int i = 0; i < SAO_NUM_BO_CLASSES - SAO_BO_LEN + 1; i++)
-            {
-                double currentRDCost = 0.0;
-                for (int j = i; j < i + SAO_BO_LEN; j++)
-                    currentRDCost += currentRdCostTableBo[j];
+            int32_t&  count     = m_count[0][typeIdx][classIdx];
+            int32_t& offsetOrg = m_offsetOrg[0][typeIdx][classIdx];
+            int32_t& offsetOut = m_offset[0][typeIdx][classIdx];
 
-                if (currentRDCost < bestRDCostTableBo)
-                {
-                    bestRDCostTableBo = currentRDCost;
-                    bestClassTableBo  = i;
-                }
-            }
+            estIterOffset(typeIdx, lambda[0], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx]);
 
-            // Re code all Offsets
-            // Code Center
-            estDist = 0;
-            for (int classIdx = bestClassTableBo; classIdx < bestClassTableBo + SAO_BO_LEN; classIdx++)
-                estDist += currentDistortionTableBo[classIdx];
+            //Calculate distortion
+            estDist += distClasses[classIdx];
         }
-        SaoCtuParam  ctuParamRdo;
-        ctuParamRdo.mergeMode = SAO_MERGE_NONE;
-        ctuParamRdo.typeIdx = typeIdx;
-        ctuParamRdo.bandPos = (typeIdx == SAO_BO) ? bestClassTableBo : 0;
-        for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
-            ctuParamRdo.offset[classIdx] = (int)m_offset[0][typeIdx][classIdx + ctuParamRdo.bandPos + 1];
 
         m_entropyCoder.load(m_rdContexts.temp);
         m_entropyCoder.resetBits();
-        m_entropyCoder.codeSaoOffset(ctuParamRdo, 0);
+        m_entropyCoder.codeSaoOffsetEO(m_offset[0][typeIdx] + 1, typeIdx, 0);
 
-        uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
-        double cost = (double)estDist + m_lumaLambda * (double)estRate;
+        int64_t cost = calcSaoRdoCost(estDist, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]);
 
-        if (cost < dCostPartBest)
+        if (cost < costPartBest)
         {
-            dCostPartBest = cost;
-            copySaoUnit(lclCtuParam, &ctuParamRdo);
+            costPartBest = cost;
             bestDist = estDist;
+            bestTypeIdx = typeIdx;
         }
     }
 
-    mergeDist[0] = ((double)bestDist / m_lumaLambda);
-    m_entropyCoder.load(m_rdContexts.temp);
-    m_entropyCoder.codeSaoOffset(*lclCtuParam, 0);
-    m_entropyCoder.store(m_rdContexts.temp);
+    if (bestTypeIdx != -1)
+    {
+        lclCtuParam->mergeMode = SAO_MERGE_NONE;
+        lclCtuParam->typeIdx = bestTypeIdx;
+        lclCtuParam->bandPos = 0;
+        for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
+            lclCtuParam->offset[classIdx] = m_offset[0][bestTypeIdx][classIdx + 1];
+    }
 
-    // merge left or merge up
-    for (int mergeIdx = 0; mergeIdx < 2; mergeIdx++)
+    //BO RDO
+    int64_t estDist = 0;
+    for (int classIdx = 0; classIdx < MAX_NUM_SAO_CLASS; classIdx++)
     {
-        SaoCtuParam* mergeSrcParam = NULL;
-        if (addrLeft >= 0 && mergeIdx == 0)
-            mergeSrcParam = &(saoParam->ctuParam[0][addrLeft]);
-        else if (addrUp >= 0 && mergeIdx == 1)
-            mergeSrcParam = &(saoParam->ctuParam[0][addrUp]);
-        if (mergeSrcParam)
-        {
-            int64_t estDist = 0;
-            int typeIdx = mergeSrcParam->typeIdx;
-            if (typeIdx >= 0)
-            {
-                int bandPos = (typeIdx == SAO_BO) ? mergeSrcParam->bandPos : 0;
-                for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
-                {
-                    int mergeOffset = mergeSrcParam->offset[classIdx];
-                    estDist += estSaoDist(m_count[0][typeIdx][classIdx + bandPos + 1], mergeOffset, m_offsetOrg[0][typeIdx][classIdx + bandPos + 1]);
-                }
-            }
+        int32_t&  count    = m_count[0][SAO_BO][classIdx];
+        int32_t& offsetOrg = m_offsetOrg[0][SAO_BO][classIdx];
+        int32_t& offsetOut = m_offset[0][SAO_BO][classIdx];
 
-            copySaoUnit(&mergeSaoParam[mergeIdx], mergeSrcParam);
-            mergeSaoParam[mergeIdx].mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;
+        estIterOffset(SAO_BO, lambda[0], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx]);
+    }
+
+    // Estimate Best Position
+    int64_t bestRDCostBO = MAX_INT64;
+    int32_t bestClassBO  = 0;
+
+    for (int i = 0; i < MAX_NUM_SAO_CLASS - SAO_NUM_OFFSET + 1; i++)
+    {
+        int64_t currentRDCost = 0;
+        for (int j = i; j < i + SAO_NUM_OFFSET; j++)
+            currentRDCost += costClasses[j];
 
-            mergeDist[mergeIdx + 1] = ((double)estDist / m_lumaLambda);
+        if (currentRDCost < bestRDCostBO)
+        {
+            bestRDCostBO = currentRDCost;
+            bestClassBO  = i;
         }
     }
+
+    estDist = 0;
+    for (int classIdx = bestClassBO; classIdx < bestClassBO + SAO_NUM_OFFSET; classIdx++)
+        estDist += distClasses[classIdx];
+
+    m_entropyCoder.load(m_rdContexts.temp);
+    m_entropyCoder.resetBits();
+    m_entropyCoder.codeSaoOffsetBO(m_offset[0][SAO_BO] + bestClassBO, bestClassBO, 0);
+
+    int64_t cost = calcSaoRdoCost(estDist, m_entropyCoder.getNumberOfWrittenBits(), lambda[0]);
+
+    if (cost < costPartBest)
+    {
+        costPartBest = cost;
+        bestDist = estDist;
+
+        lclCtuParam->mergeMode = SAO_MERGE_NONE;
+        lclCtuParam->typeIdx = SAO_BO;
+        lclCtuParam->bandPos = bestClassBO;
+        for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
+            lclCtuParam->offset[classIdx] = m_offset[0][SAO_BO][classIdx + bestClassBO];
+    }
+
+    rateDist = (bestDist << 8) / lambda[0];
+    m_entropyCoder.load(m_rdContexts.temp);
+    m_entropyCoder.codeSaoOffset(*lclCtuParam, 0);
+    m_entropyCoder.store(m_rdContexts.temp);
+
+    if (m_param->internalCsp == X265_CSP_I400)
+    {
+        bestCost = rateDist + m_entropyCoder.getNumberOfWrittenBits();
+    }
 }
 
-void SAO::sao2ChromaParamDist(SAOParam* saoParam, int addr, int addrUp, int addrLeft, SaoCtuParam mergeSaoParam[][2], double* mergeDist)
+void SAO::saoChromaComponentParamDist(SAOParam* saoParam, int32_t addr, int64_t& rateDist, int64_t* lambda, int64_t &bestCost)
 {
     int64_t bestDist = 0;
+    int bestTypeIdx = -1;
 
     SaoCtuParam* lclCtuParam[2] = { &saoParam->ctuParam[1][addr], &saoParam->ctuParam[2][addr] };
 
-    double currentRdCostTableBo[MAX_NUM_SAO_CLASS];
-    int    bestClassTableBo[2] = { 0, 0 };
-    int    currentDistortionTableBo[MAX_NUM_SAO_CLASS];
+    int64_t costClasses[MAX_NUM_SAO_CLASS];
+    int32_t distClasses[MAX_NUM_SAO_CLASS];
+    int32_t bestClassBO[2] = { 0, 0 };
 
     m_entropyCoder.load(m_rdContexts.temp);
     m_entropyCoder.resetBits();
-    m_entropyCoder.codeSaoOffset(*lclCtuParam[0], 1);
-    m_entropyCoder.codeSaoOffset(*lclCtuParam[1], 2);
+    m_entropyCoder.codeSaoType(0);
 
-    double costPartBest = m_entropyCoder.getNumberOfWrittenBits() * m_chromaLambda;
+    uint32_t bits = m_entropyCoder.getNumberOfWrittenBits();
+    int64_t costPartBest = calcSaoRdoCost(0, bits, lambda[1]);
 
-    for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE; typeIdx++)
+    //EO RDO
+    for (int typeIdx = 0; typeIdx < MAX_NUM_SAO_TYPE - 1; typeIdx++)
     {
-        int64_t estDist[2];
-        if (typeIdx == SAO_BO)
+        int64_t estDist[2] = {0, 0};
+        for (int compIdx = 1; compIdx < 3; compIdx++)
         {
-            // Estimate Best Position
-            for (int compIdx = 0; compIdx < 2; compIdx++)
+            for (int classIdx = 1; classIdx < SAO_NUM_OFFSET + 1; classIdx++)
             {
-                double bestRDCostTableBo = MAX_DOUBLE;
-                estDist[compIdx] = estSaoTypeDist(compIdx + 1, typeIdx, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
-                for (int i = 0; i < SAO_NUM_BO_CLASSES - SAO_BO_LEN + 1; i++)
-                {
-                    double currentRDCost = 0.0;
-                    for (int j = i; j < i + SAO_BO_LEN; j++)
-                        currentRDCost += currentRdCostTableBo[j];
+                int32_t& count = m_count[compIdx][typeIdx][classIdx];
+                int32_t& offsetOrg = m_offsetOrg[compIdx][typeIdx][classIdx];
+                int32_t& offsetOut = m_offset[compIdx][typeIdx][classIdx];
 
-                    if (currentRDCost < bestRDCostTableBo)
-                    {
-                        bestRDCostTableBo = currentRDCost;
-                        bestClassTableBo[compIdx]  = i;
-                    }
-                }
+                estIterOffset(typeIdx, lambda[1], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx]);
 
-                // Re code all Offsets
-                // Code Center
-                estDist[compIdx] = 0;
-                for (int classIdx = bestClassTableBo[compIdx]; classIdx < bestClassTableBo[compIdx] + SAO_BO_LEN; classIdx++)
-                    estDist[compIdx] += currentDistortionTableBo[classIdx];
+                estDist[compIdx - 1] += distClasses[classIdx];
             }
         }
-        else
-        {
-            estDist[0] = estSaoTypeDist(1, typeIdx, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
-            estDist[1] = estSaoTypeDist(2, typeIdx, m_chromaLambda, currentDistortionTableBo, currentRdCostTableBo);
-        }
 
         m_entropyCoder.load(m_rdContexts.temp);
         m_entropyCoder.resetBits();
 
-        SaoCtuParam  ctuParamRdo[2];
         for (int compIdx = 0; compIdx < 2; compIdx++)
-        {
-            ctuParamRdo[compIdx].mergeMode = SAO_MERGE_NONE;
-            ctuParamRdo[compIdx].typeIdx = typeIdx;
-            ctuParamRdo[compIdx].bandPos = (typeIdx == SAO_BO) ? bestClassTableBo[compIdx] : 0;
-            for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
-                ctuParamRdo[compIdx].offset[classIdx] = (int)m_offset[compIdx + 1][typeIdx][classIdx + ctuParamRdo[compIdx].bandPos + 1];
-
-            m_entropyCoder.codeSaoOffset(ctuParamRdo[compIdx], compIdx + 1);
-        }
+            m_entropyCoder.codeSaoOffsetEO(m_offset[compIdx + 1][typeIdx] + 1, typeIdx, compIdx + 1);
 
         uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
-        double cost = (double)(estDist[0] + estDist[1]) + m_chromaLambda * (double)estRate;
+        int64_t cost = calcSaoRdoCost((estDist[0] + estDist[1]), estRate, lambda[1]);
 
         if (cost < costPartBest)
         {
             costPartBest = cost;
-            copySaoUnit(lclCtuParam[0], &ctuParamRdo[0]);
-            copySaoUnit(lclCtuParam[1], &ctuParamRdo[1]);
             bestDist = (estDist[0] + estDist[1]);
+            bestTypeIdx = typeIdx;
         }
     }
 
-    mergeDist[0] += ((double)bestDist / m_chromaLambda);
-    m_entropyCoder.load(m_rdContexts.temp);
-    m_entropyCoder.codeSaoOffset(*lclCtuParam[0], 1);
-    m_entropyCoder.codeSaoOffset(*lclCtuParam[1], 2);
-    m_entropyCoder.store(m_rdContexts.temp);
-
-    // merge left or merge up
-    for (int mergeIdx = 0; mergeIdx < 2; mergeIdx++)
+    if (bestTypeIdx != -1)
     {
         for (int compIdx = 0; compIdx < 2; compIdx++)
         {
-            int plane = compIdx + 1;
-            SaoCtuParam* mergeSrcParam = NULL;
-            if (addrLeft >= 0 && mergeIdx == 0)
-                mergeSrcParam = &(saoParam->ctuParam[plane][addrLeft]);
-            else if (addrUp >= 0 && mergeIdx == 1)
-                mergeSrcParam = &(saoParam->ctuParam[plane][addrUp]);
-            if (mergeSrcParam)
-            {
-                int64_t estDist = 0;
-                int typeIdx = mergeSrcParam->typeIdx;
-                if (typeIdx >= 0)
-                {
-                    int bandPos = (typeIdx == SAO_BO) ? mergeSrcParam->bandPos : 0;
-                    for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
-                    {
-                        int mergeOffset = mergeSrcParam->offset[classIdx];
-                        estDist += estSaoDist(m_count[plane][typeIdx][classIdx + bandPos + 1], mergeOffset, m_offsetOrg[plane][typeIdx][classIdx + bandPos + 1]);
-                    }
-                }
+            lclCtuParam[compIdx]->mergeMode = SAO_MERGE_NONE;
+            lclCtuParam[compIdx]->typeIdx = bestTypeIdx;
+            lclCtuParam[compIdx]->bandPos = 0;
+            for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
+                lclCtuParam[compIdx]->offset[classIdx] = m_offset[compIdx + 1][bestTypeIdx][classIdx + 1];
+        }
+    }
+
+    // BO RDO
+    int64_t estDist[2];
+
+    // Estimate Best Position
+    for (int compIdx = 1; compIdx < 3; compIdx++)
+    {
+        int64_t bestRDCostBO = MAX_INT64;
+
+        for (int classIdx = 0; classIdx < MAX_NUM_SAO_CLASS; classIdx++)
+        {
+            int32_t&  count = m_count[compIdx][SAO_BO][classIdx];
+            int32_t& offsetOrg = m_offsetOrg[compIdx][SAO_BO][classIdx];
+            int32_t& offsetOut = m_offset[compIdx][SAO_BO][classIdx];
+
+            estIterOffset(SAO_BO, lambda[1], count, offsetOrg, offsetOut, distClasses[classIdx], costClasses[classIdx]);
+        }
+
+        for (int i = 0; i < MAX_NUM_SAO_CLASS - SAO_NUM_OFFSET + 1; i++)
+        {
+            int64_t currentRDCost = 0;
+            for (int j = i; j < i + SAO_NUM_OFFSET; j++)
+                currentRDCost += costClasses[j];
 
-                copySaoUnit(&mergeSaoParam[plane][mergeIdx], mergeSrcParam);
-                mergeSaoParam[plane][mergeIdx].mergeMode = mergeIdx ? SAO_MERGE_UP : SAO_MERGE_LEFT;
-                mergeDist[mergeIdx + 1] += ((double)estDist / m_chromaLambda);
+            if (currentRDCost < bestRDCostBO)
+            {
+                bestRDCostBO = currentRDCost;
+                bestClassBO[compIdx - 1]  = i;
             }
         }
+
+        estDist[compIdx - 1] = 0;
+        for (int classIdx = bestClassBO[compIdx - 1]; classIdx < bestClassBO[compIdx - 1] + SAO_NUM_OFFSET; classIdx++)
+            estDist[compIdx - 1] += distClasses[classIdx];
+    }
+
+    m_entropyCoder.load(m_rdContexts.temp);
+    m_entropyCoder.resetBits();
+
+    for (int compIdx = 0; compIdx < 2; compIdx++)
+        m_entropyCoder.codeSaoOffsetBO(m_offset[compIdx + 1][SAO_BO] + bestClassBO[compIdx], bestClassBO[compIdx], compIdx + 1);
+
+    uint32_t estRate = m_entropyCoder.getNumberOfWrittenBits();
+    int64_t cost = calcSaoRdoCost((estDist[0] + estDist[1]), estRate, lambda[1]);
+
+    if (cost < costPartBest)
+    {
+        costPartBest = cost;
+        bestDist = (estDist[0] + estDist[1]);
+
+        for (int compIdx = 0; compIdx < 2; compIdx++)
+        {
+            lclCtuParam[compIdx]->mergeMode = SAO_MERGE_NONE;
+            lclCtuParam[compIdx]->typeIdx = SAO_BO;
+            lclCtuParam[compIdx]->bandPos = bestClassBO[compIdx];
+            for (int classIdx = 0; classIdx < SAO_NUM_OFFSET; classIdx++)
+                lclCtuParam[compIdx]->offset[classIdx] = m_offset[compIdx + 1][SAO_BO][classIdx + bestClassBO[compIdx]];
+        }
+    }
+
+    rateDist += (bestDist << 8) / lambda[1];
+    m_entropyCoder.load(m_rdContexts.temp);
+
+    if (saoParam->bSaoFlag[1])
+    {
+        m_entropyCoder.codeSaoOffset(*lclCtuParam[0], 1);
+        m_entropyCoder.codeSaoOffset(*lclCtuParam[1], 2);
+        m_entropyCoder.store(m_rdContexts.temp);
+
+        uint32_t rate = m_entropyCoder.getNumberOfWrittenBits();
+        bestCost = rateDist + rate;
+    }
+    else
+    {
+        uint32_t rate = m_entropyCoder.getNumberOfWrittenBits();
+        bestCost = rateDist + rate;
     }
 }
 
 // NOTE: must put in namespace X265_NS since we need class SAO
 void saoCuStatsBO_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
 {
-    int x, y;
     const int boShift = X265_DEPTH - SAO_BO_BITS;
 
-    for (y = 0; y < endY; y++)
+    for (int y = 0; y < endY; y++)
     {
-        for (x = 0; x < endX; x++)
+        for (int x = 0; x < endX; x++)
         {
-            int classIdx = 1 + (rec[x] >> boShift);
+            int classIdx = rec[x] >> boShift;
             stats[classIdx] += diff[x];
             count[classIdx]++;
         }
@@ -1766,7 +1699,6 @@
 
 void saoCuStatsE0_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
 {
-    int x, y;
     int32_t tmp_stats[SAO::NUM_EDGETYPE];
     int32_t tmp_count[SAO::NUM_EDGETYPE];
 
@@ -1775,10 +1707,10 @@
     memset(tmp_stats, 0, sizeof(tmp_stats));
     memset(tmp_count, 0, sizeof(tmp_count));
 
-    for (y = 0; y < endY; y++)
+    for (int y = 0; y < endY; y++)
     {
         int signLeft = signOf(rec[0] - rec[-1]);
-        for (x = 0; x < endX; x++)
+        for (int x = 0; x < endX; x++)
         {
             int signRight = signOf2(rec[x], rec[x + 1]);
             X265_CHECK(signRight == signOf(rec[x] - rec[x + 1]), "signDown check failure\n");
@@ -1794,7 +1726,7 @@
         rec += stride;
     }
 
-    for (x = 0; x < SAO::NUM_EDGETYPE; x++)
+    for (int x = 0; x < SAO::NUM_EDGETYPE; x++)
    {
         stats[SAO::s_eoTable[x]] += tmp_stats[x];
         count[SAO::s_eoTable[x]] += tmp_count[x];
@@ -1806,7 +1738,6 @@
     X265_CHECK(endX <= MAX_CU_SIZE, "endX check failure\n");
     X265_CHECK(endY <= MAX_CU_SIZE, "endY check failure\n");
 
-    int x, y;
     int32_t tmp_stats[SAO::NUM_EDGETYPE];
     int32_t tmp_count[SAO::NUM_EDGETYPE];
 
@@ -1814,9 +1745,9 @@
     memset(tmp_count, 0, sizeof(tmp_count));
 
     X265_CHECK(endX * endY <= (4096 - 16), "Assembly of saoE1 may overflow with this block size\n");
-    for (y = 0; y < endY; y++)
+    for (int y = 0; y < endY; y++)
     {
-        for (x = 0; x < endX; x++)
+        for (int x = 0; x < endX; x++)
         {
             int signDown = signOf2(rec[x], rec[x + stride]);
             X265_CHECK(signDown == signOf(rec[x] - rec[x + stride]), "signDown check failure\n");
@@ -1831,7 +1762,7 @@
         rec += stride;
     }
 
-    for (x = 0; x < SAO::NUM_EDGETYPE; x++)
+    for (int x = 0; x < SAO::NUM_EDGETYPE; x++)
    {
         stats[SAO::s_eoTable[x]] += tmp_stats[x];
         count[SAO::s_eoTable[x]] += tmp_count[x];
@@ -1843,17 +1774,16 @@
     X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
     X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
 
-    int x, y;
     int32_t tmp_stats[SAO::NUM_EDGETYPE];
     int32_t tmp_count[SAO::NUM_EDGETYPE];
 
     memset(tmp_stats, 0, sizeof(tmp_stats));
     memset(tmp_count, 0, sizeof(tmp_count));
 
-    for (y = 0; y < endY; y++)
+    for (int y = 0; y < endY; y++)
     {
         upBufft[0] = signOf(rec[stride] - rec[-1]);
-        for (x = 0; x < endX; x++)
+        for (int x = 0; x < endX; x++)
         {
             int signDown = signOf2(rec[x], rec[x + stride + 1]);
             X265_CHECK(signDown == signOf(rec[x] - rec[x + stride + 1]), "signDown check failure\n");
@@ -1869,7 +1799,7 @@
         diff += MAX_CU_SIZE;
     }
 
-    for (x = 0; x < SAO::NUM_EDGETYPE; x++)
+    for (int x = 0; x < SAO::NUM_EDGETYPE; x++)
    {
         stats[SAO::s_eoTable[x]] += tmp_stats[x];
         count[SAO::s_eoTable[x]] += tmp_count[x];
@@ -1881,16 +1811,15 @@
     X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
     X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
 
-    int x, y;
     int32_t tmp_stats[SAO::NUM_EDGETYPE];
     int32_t tmp_count[SAO::NUM_EDGETYPE];
 
     memset(tmp_stats, 0, sizeof(tmp_stats));
     memset(tmp_count, 0, sizeof(tmp_count));
 
-    for (y = 0; y < endY; y++)
+    for (int y = 0; y < endY; y++)
     {
-        for (x = 0; x < endX; x++)
+        for (int x = 0; x < endX; x++)
         {
             int signDown = signOf2(rec[x], rec[x + stride - 1]);
             X265_CHECK(signDown == signOf(rec[x] - rec[x + stride - 1]), "signDown check failure\n");
@@ -1908,7 +1837,7 @@
         diff += MAX_CU_SIZE;
     }
 
-    for (x = 0; x < SAO::NUM_EDGETYPE; x++)
+    for (int x = 0; x < SAO::NUM_EDGETYPE; x++)
    {
         stats[SAO::s_eoTable[x]] += tmp_stats[x];
         count[SAO::s_eoTable[x]] += tmp_count[x];
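The new estIterOffset above refines each class's offset by stepping it toward zero and re-costing every candidate with the integer calcSaoRdoCost (distortion + ((bits * lambda + 128) >> 8), with lambda pre-scaled by 256). A self-contained sketch of that search under the patch's simple rate model (abs(offset) + 1 bits for an edge offset, + 2 for a band offset); the helper names are illustrative, not the encoder's API:

    #include <cstdint>
    #include <cstdlib>

    // Closed-form distortion delta for applying offset o to a class whose
    // stats are (sample count, sum of original-minus-recon differences);
    // this matches the shape of x265's estSaoDist: count*o^2 - 2*o*orgSum.
    static int64_t estDist(int32_t count, int o, int32_t orgSum)
    {
        return (int64_t)count * o * o - 2LL * o * orgSum;
    }

    static int64_t rdCost(int64_t dist, uint32_t bits, int64_t lambdaQ8)
    {
        return dist + (((int64_t)bits * lambdaQ8 + 128) >> 8); // D + lambda*R
    }

    // Walk |offset| down to zero, keeping the cheapest candidate seen.
    static int bestOffset(bool isBO, int offset, int32_t count, int32_t orgSum, int64_t lambdaQ8)
    {
        int best = 0;
        int64_t bestCost = rdCost(0, 1, lambdaQ8); // signalling zero still costs ~1 bit
        while (offset != 0)
        {
            uint32_t bits = std::abs(offset) + (isBO ? 2 : 1);
            int64_t cost = rdCost(estDist(count, offset, orgSum), bits, lambdaQ8);
            if (cost < bestCost) { bestCost = cost; best = offset; }
            offset += (offset > 0) ? -1 : 1;
        }
        return best;
    }

Because the distortion delta is quadratic in the offset, shrinking toward zero from the rounded initial estimate visits every plausible candidate without a full exhaustive scan.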
x265_1.9.tar.gz/source/encoder/sao.h -> x265_2.0.tar.gz/source/encoder/sao.h Changed
 
@@ -33,13 +33,6 @@
 namespace X265_NS {
 // private namespace
 
-enum SAOTypeLen
-{
-    SAO_EO_LEN = 4,
-    SAO_BO_LEN = 4,
-    SAO_NUM_BO_CLASSES = 32
-};
-
 enum SAOType
 {
     SAO_EO_0 = 0,
@@ -56,12 +49,11 @@
 
     enum { SAO_MAX_DEPTH = 4 };
     enum { SAO_BO_BITS  = 5 };
-    enum { MAX_NUM_SAO_CLASS = 33 };
+    enum { MAX_NUM_SAO_CLASS = 32 };
     enum { SAO_BIT_INC = 0 }; /* in HM12.0, it wrote as X265_MAX(X265_DEPTH - 10, 0) */
     enum { OFFSET_THRESH = 1 << X265_MIN(X265_DEPTH - 5, 5) };
     enum { NUM_EDGETYPE = 5 };
     enum { NUM_PLANE = 3 };
-    enum { NUM_MERGE_MODE = 3 };
     enum { SAO_DEPTHRATE_SIZE = 4 };
 
     static const uint32_t s_eoTable[NUM_EDGETYPE];
@@ -81,7 +73,7 @@
     PerPlane*   m_offsetOrgPreDblk;
 
     double*     m_depthSaoRate;
-    int8_t      m_offsetBo[NUM_PLANE][SAO_NUM_BO_CLASSES];
+    int8_t      m_offsetBo[NUM_PLANE][MAX_NUM_SAO_CLASS];
     int8_t      m_offsetEo[NUM_PLANE][NUM_EDGETYPE];
 
     int         m_chromaFormat;
@@ -114,10 +106,6 @@
     int         m_refDepth;
     int         m_numNoSao[2];
 
-    double      m_lumaLambda;
-    double      m_chromaLambda;
-    /* TODO: No doubles for distortion */
-
     SAO();
 
     bool create(x265_param* param, int initCommon);
@@ -126,31 +114,27 @@
 
     void allocSaoParam(SAOParam* saoParam) const;
 
-    void startSlice(Frame* pic, Entropy& initState, int qp);
+    void startSlice(Frame* pic, Entropy& initState);
     void resetStats();
-    void resetSaoUnit(SaoCtuParam* saoUnit);
 
     // CTU-based SAO process without slice granularity
-    void processSaoCu(int addr, int typeIdx, int plane);
+    void applyPixelOffsets(int addr, int typeIdx, int plane);
     void processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane);
-    void processSaoUnitCuLuma(SaoCtuParam* ctuParam, int idxY, int idxX);
-    void processSaoUnitCuChroma(SaoCtuParam* ctuParam[3], int idxY, int idxX);
+    void generateLumaOffsets(SaoCtuParam* ctuParam, int idxY, int idxX);
+    void generateChromaOffsets(SaoCtuParam* ctuParam[3], int idxY, int idxX);
 
-    void copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc);
-
-    void calcSaoStatsCu(int addr, int plane);
+    void calcSaoStatsCTU(int addr, int plane);
     void calcSaoStatsCu_BeforeDblk(Frame* pic, int idxX, int idxY);
 
-    void saoComponentParamDist(SAOParam* saoParam, int addr, int addrUp, int addrLeft, SaoCtuParam mergeSaoParam[2], double* mergeDist);
-    void sao2ChromaParamDist(SAOParam* saoParam, int addr, int addrUp, int addrLeft, SaoCtuParam mergeSaoParam[][2], double* mergeDist);
-
-    inline int estIterOffset(int typeIdx, int classIdx, double lambda, int offset, int32_t count, int32_t offsetOrg,
-                             int32_t* currentDistortionTableBo, double* currentRdCostTableBo);
-    inline int64_t estSaoTypeDist(int plane, int typeIdx, double lambda, int32_t* currentDistortionTableBo, double* currentRdCostTableBo);
+    void saoLumaComponentParamDist(SAOParam* saoParam, int addr, int64_t& rateDist, int64_t* lambda, int64_t& bestCost);
+    void saoChromaComponentParamDist(SAOParam* saoParam, int addr, int64_t& rateDist, int64_t* lambda, int64_t& bestCost);
 
+    void estIterOffset(int typeIdx, int64_t lambda, int32_t count, int32_t offsetOrg, int32_t& offset, int32_t& distClasses, int64_t& costClasses);
     void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);
-    void rdoSaoUnitRow(SAOParam* saoParam, int idxY);
     void rdoSaoUnitCu(SAOParam* saoParam, int rowBaseAddr, int idxX, int addr);
+    int64_t calcSaoRdoCost(int64_t distortion, uint32_t bits, int64_t lambda);
+
+    void saoStatsInitialOffset(int planes);
 
     friend class FrameFilter;
 };
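The header change above (MAX_NUM_SAO_CLASS drops from 33 to 32 and SAO_NUM_BO_CLASSES disappears) pairs with the saoCuStatsBO_c change earlier in this diff: band-offset statistics are now indexed directly by the 5-bit band value, without the old 1 + (rec[x] >> shift) bias that left entry 0 unused. A tiny sketch of the classification, assuming 8-bit samples:

    #include <cstdint>

    // 32 equal-width bands across the sample range; a reconstructed sample
    // falls into band rec >> (bitDepth - 5). Sketch for bitDepth == 8.
    static inline int bandOf(uint8_t rec)
    {
        const int boShift = 8 - 5; // X265_DEPTH - SAO_BO_BITS
        return rec >> boShift;     // 0..31, no +1 bias required
    }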
x265_1.9.tar.gz/source/encoder/search.cpp -> x265_2.0.tar.gz/source/encoder/search.cpp Changed
 
@@ -73,14 +73,13 @@
 {
     uint32_t maxLog2CUSize = g_log2Size[param.maxCUSize];
     m_param = &param;
-    m_bEnableRDOQ = !!param.rdoqLevel;
     m_bFrameParallel = param.frameNumThreads > 1;
     m_numLayers = g_log2Size[param.maxCUSize] - 2;
 
     m_rdCost.setPsyRdScale(param.psyRd);
-    m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp);
+    m_me.init(param.internalCsp);
 
-    bool ok = m_quant.init(param.rdoqLevel, param.psyRdoq, scalingList, m_entropyCoder);
+    bool ok = m_quant.init(param.psyRdoq, scalingList, m_entropyCoder);
     if (m_param->noiseReductionIntra || m_param->noiseReductionInter || m_param->rc.vbvBufferSize)
         ok &= m_quant.allocNoiseReduction(param);
 
@@ -223,9 +222,10 @@
 
     if (!(log2TrSize - m_hChromaShift < 2))
     {
-        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
+        uint32_t parentIdx = absPartIdx & (0xFF << (log2TrSize + 1 - LOG2_UNIT_SIZE) * 2);
+        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_U, tuDepth - 1))
             m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv);
-        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
+        if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_V, tuDepth - 1))
             m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv);
     }
 
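The hunk above now tests the chroma CBF of the enclosing parent TU instead of relying on every child 4x4 part having been updated (the per-part fan-out loops are removed later in this diff). The mask clears the low z-order address bits so absPartIdx snaps to the parent's first part. A sketch of that index math, with LOG2_UNIT_SIZE written out as the constant 2 that x265 uses for 4x4 parts; illustrative only:

    #include <cstdint>

    // Snap a z-order part index to the first part of the parent TU, which is
    // twice the size (log2TrSize + 1). Each size step spans 4x as many 4x4
    // parts, hence two address bits are cleared per step.
    static inline uint32_t parentPartIdx(uint32_t absPartIdx, uint32_t log2TrSize)
    {
        const uint32_t LOG2_UNIT_SIZE = 2;
        return absPartIdx & (0xFF << ((log2TrSize + 1 - LOG2_UNIT_SIZE) * 2));
    }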
@@ -296,6 +296,7 @@
     uint32_t sizeIdx    = log2TrSize - 2;
     bool mightNotSplit  = log2TrSize <= depthRange[1];
     bool mightSplit     = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit);
+    bool bEnableRDOQ  = !!m_param->rdoqLevel;
 
     /* If maximum RD penalty, force spits at TU size 32x32 if SPS allows TUs of 16x16 */
     if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4)
@@ -336,7 +337,7 @@
         coeff_t* coeffY       = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
 
         // store original entropy coding status
-        if (m_bEnableRDOQ)
+        if (bEnableRDOQ)
             m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
 
         primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
@@ -434,8 +435,7 @@
 
             cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
         }
-        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
-            cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth);
+        cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth);
 
         if (mightNotSplit && log2TrSize != depthRange[0])
         {
@@ -487,6 +487,7 @@
     uint32_t fullDepth = cuGeom.depth + tuDepth;
     uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
     uint32_t tuSize = 1 << log2TrSize;
+    bool bEnableRDOQ = !!m_param->rdoqLevel;
 
     X265_CHECK(tuSize <= MAX_TS_SIZE, "transform skip is only possible at 4x4 TUs\n");
 
@@ -525,7 +526,7 @@
     // store original entropy coding status
     m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
 
-    if (m_bEnableRDOQ)
+    if (bEnableRDOQ)
         m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
 
     int checkTransformSkip = 1;
@@ -717,8 +718,7 @@
             residualTransformQuantIntra(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
             cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
         }
-        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
-            cu.m_cbf[0][absPartIdx + offs] |= (cbf << tuDepth);
+        cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth);
     }
 }
 
@@ -782,6 +782,7 @@
 {
     CUData& cu = mode.cu;
     uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
+    bool bEnableRDOQ = !!m_param->rdoqLevel;
 
     if (tuDepth < cu.m_tuDepth[absPartIdx])
     {
@@ -793,11 +794,9 @@
             splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
             splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
         }
-        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
-        {
-            cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
-            cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
-        }
+        cu.m_cbf[1][absPartIdx] |= (splitCbfU << tuDepth);
+        cu.m_cbf[2][absPartIdx] |= (splitCbfV << tuDepth);
+
         return;
     }
 
@@ -812,7 +811,7 @@
         tuDepthC--;
     }
 
-    if (m_bEnableRDOQ)
+    if (bEnableRDOQ)
         m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
 
     bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && log2TrSizeC <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
@@ -1091,11 +1090,8 @@
             splitCbfU |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
             splitCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
         }
-        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
-        {
-            cu.m_cbf[1][absPartIdx + offs] |= (splitCbfU << tuDepth);
-            cu.m_cbf[2][absPartIdx + offs] |= (splitCbfV << tuDepth);
-        }
+        cu.m_cbf[1][absPartIdx] |= (splitCbfU << tuDepth);
+        cu.m_cbf[2][absPartIdx] |= (splitCbfV << tuDepth);
 
         return;
     }
@@ -1629,8 +1625,7 @@
         for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
             combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);
 
-        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
-            cu.m_cbf[0][offs] |= combCbfY;
+        cu.m_cbf[0][0] |= combCbfY;
     }
 
     // TODO: remove this
@@ -1732,6 +1727,12 @@
         else
             cu.getAllowedChromaDir(absPartIdxC, modeList);
 
+        if (m_frame->m_fencPic->m_picCsp  == X265_CSP_I400 && m_csp != X265_CSP_I400)
+        {
+            for (uint32_t l = 1; l < NUM_CHROMA_MODE; l++)
+                modeList[l] = modeList[0];
+            maxMode = 1;
+        }
         // check chroma modes
         for (uint32_t mode = minMode; mode < maxMode; mode++)
         {
@@ -1816,11 +1817,8 @@
             combCbfV |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, 1);
         }
 
-        for (uint32_t offs = 0; offs < 4 * qNumParts; offs++)
-        {
-            cu.m_cbf[1][offs] |= combCbfU;
-            cu.m_cbf[2][offs] |= combCbfV;
-        }
+        cu.m_cbf[1][0] |= combCbfU;
+        cu.m_cbf[2][0] |= combCbfV;
     }
 
     /* TODO: remove this */
@@ -1974,7 +1972,8 @@
         slave.m_frame = m_frame;
         slave.m_param = m_param;
         slave.setLambdaFromQP(pme.mode.cu, m_rdCost.m_qp);
-        slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.pu.ctuAddr, pme.pu.cuAbsPartIdx, pme.pu.puAbsPartIdx, pme.pu.width, pme.pu.height);
+        bool bChroma = slave.m_frame->m_fencPic->m_picCsp != X265_CSP_I400;
+        slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.pu.ctuAddr, pme.pu.cuAbsPartIdx, pme.pu.puAbsPartIdx, pme.pu.width, pme.pu.height, m_param->searchMethod, m_param->subpelRefine, bChroma);
     }
 
     /* Perform ME, repeat until no more work is available */
@@ -2015,9 +2014,12 @@
     int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
     MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
 
-    MV lmv = getLowresMV(interMode.cu, pu, list, ref);
-    if (lmv.notZero())
-        mvc[numMvc++] = lmv;
+    if (!m_param->analysisMode) /* Prevents load/save outputs from diverging if lowresMV is not available */
+    {
+        MV lmv = getLowresMV(interMode.cu, pu, list, ref);
+        if (lmv.notZero())
+            mvc[numMvc++] = lmv;
+    }
 
     setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);
 
@@ -2074,7 +2076,7 @@
         MotionData* bestME = interMode.bestME[puIdx];
         PredictionUnit pu(cu, cuGeom, puIdx);
 
-        m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height);
+        m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);
 
         /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
         uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge);
@@ -2104,10 +2106,7 @@
                 const MV* amvp = interMode.amvpCand[list][ref];
                 int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
                 MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
-                MV lmv = bestME[list].mv;
-                if (lmv.notZero())
-                    mvc[numMvc++] = lmv;
-
+                
                 setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
                 int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv);
 
@@ -2128,8 +2127,8 @@
                     bestME[list].bits = bits;
                     bestME[list].mvCost  = mvCost;
                 }
-            }
-            bDoUnidir = false;
+                bDoUnidir = false;
+            }
         }
         else if (m_param->bDistributeMotionEstimation)
         {
@@ -2199,9 +2198,12 @@
                     int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
                     MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
 
-                    MV lmv = getLowresMV(cu, pu, list, ref);
-                    if (lmv.notZero())
-                        mvc[numMvc++] = lmv;
+                    if (!m_param->analysisMode) /* Prevents load/save outputs from diverging when lowresMV is not available */
+                    {
+                        MV lmv = getLowresMV(cu, pu, list, ref);
+                        if (lmv.notZero())
+                            mvc[numMvc++] = lmv;
+                    }
 
                     setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
                     int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv);
@@ -2534,7 +2536,7 @@
     interMode.lumaDistortion = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
     interMode.distortion = interMode.lumaDistortion;
     // Chroma
-    if (m_csp != X265_CSP_I400)
+    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
     {
         interMode.chromaDistortion = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
         interMode.chromaDistortion += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
@@ -2575,7 +2577,7 @@
     uint32_t log2CUSize = cuGeom.log2CUSize;
     int sizeIdx = log2CUSize - 2;
 
-    resiYuv->subtract(*fencYuv, *predYuv, log2CUSize);
+    resiYuv->subtract(*fencYuv, *predYuv, log2CUSize, m_frame->m_fencPic->m_picCsp);
 
     uint32_t tuDepthRange[2];
     cu.getInterTUQtDepthRange(tuDepthRange, 0);
@@ -2589,7 +2591,7 @@
     if (!tqBypass)
     {
         sse_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
-        if (m_csp != X265_CSP_I400)
+        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
        {
             cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
             cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
@@ -2660,14 +2662,14 @@
     m_entropyCoder.store(interMode.contexts);
 
     if (cu.getQtRootCbf(0))
-        reconYuv->addClip(*predYuv, *resiYuv, log2CUSize);
+        reconYuv->addClip(*predYuv, *resiYuv, log2CUSize, m_frame->m_fencPic->m_picCsp);
     else
         reconYuv->copyFromYuv(*predYuv);
 
     // update with clipped distortion and cost (qp estimation loop uses unclipped values)
     sse_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
     interMode.distortion = bestLumaDist;
-    if (m_csp != X265_CSP_I400)
+    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
     {
         sse_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
         bestChromaDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
@@ -2699,7 +2701,7 @@
     {
         // code full block
         uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-        uint32_t codeChroma = (m_csp != X265_CSP_I400) ? 1 : 0;
+        uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0;
 
         uint32_t tuDepthC = tuDepth;
         if (log2TrSizeC < 2)
@@ -2807,20 +2809,17 @@
         {
             residualTransformQuantInter(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
             ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
-            if (m_csp != X265_CSP_I400)
+            if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
             {
                 ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
                 vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
             }
         }
-        for (uint32_t i = 0; i < 4 * qNumParts; ++i)
+        cu.m_cbf[0][absPartIdx] |= ycbf << tuDepth;
+        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
        {
-            cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
-            if (m_csp != X265_CSP_I400)
-            {
-                cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
-                cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
-            }
+            cu.m_cbf[1][absPartIdx] |= ucbf << tuDepth;
+            cu.m_cbf[2][absPartIdx] |= vcbf << tuDepth;
         }
     }
 }
@@ -2840,6 +2839,7 @@
     CUData& cu = mode.cu;
     uint32_t depth = cuGeom.depth + tuDepth;
     uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
+    bool bEnableRDOQ = !!m_param->rdoqLevel;
 
     bool bCheckSplit = log2TrSize > depthRange[0];
     bool bCheckFull = log2TrSize <= depthRange[1];
@@ -2851,7 +2851,7 @@
     X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
 
     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-    uint32_t codeChroma = (m_csp != X265_CSP_I400) ? 1 : 0;
+    uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0;
     uint32_t tuDepthC = tuDepth;
     if (log2TrSizeC < 2)
     {
@@ -2897,7 +2897,7 @@
         cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
         cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);
 
-        if (m_bEnableRDOQ)
+        if (bEnableRDOQ)
             m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
 
         const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
@@ -3011,7 +3011,7 @@
 
                     cu.setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
 
-                    if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
+                    if (bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
                         m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
 
                     fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
@@ -3102,6 +3102,19 @@
             }
         }
 
+        if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
+        {
+            for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)
+            {
+                TURecurse tuIterator(splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);
+                do
+                {
+                    uint32_t absPartIdxC = tuIterator.absPartIdxTURelCU;
+                    cu.setCbfPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
+                }
+                while(tuIterator.isNextSection());
+            }
+        }
         if (checkTransformSkipY)
        {
             sse_t nonZeroDistY = 0;
@@ -3112,7 +3125,7 @@
 
             cu.setTransformSkipSubParts(1, TEXT_LUMA, absPartIdx, depth);
 
-            if (m_bEnableRDOQ)
+            if (bEnableRDOQ)
                 m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
 
             fenc = fencYuv->getLumaAddr(absPartIdx);
@@ -3180,7 +3193,7 @@
 
                     cu.setTransformSkipPartRange(1, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
 
-                    if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
+                    if (bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
                         m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSizeC, false);
 
                     fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
@@ -3311,20 +3324,17 @@
        {
             estimateResidualQT(mode, cuGeom, qPartIdx, tuDepth + 1, resiYuv, splitCost, depthRange);
             ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
-            if (m_csp != X265_CSP_I400)
+            if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
             {
                 ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
                 vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
             }
         }
-        for (uint32_t i = 0; i < 4 * qNumParts; ++i)
+        cu.m_cbf[0][absPartIdx] |= ycbf << tuDepth;
+        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
        {
-            cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
-            if (m_csp != X265_CSP_I400)
-            {
-                cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
-                cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
-            }
+            cu.m_cbf[1][absPartIdx] |= ucbf << tuDepth;
+            cu.m_cbf[2][absPartIdx] |= vcbf << tuDepth;
         }
 
         // Here we were encoding cbfs and coefficients for splitted blocks. Since I have collected coefficient bits
420
@@ -3413,25 +3423,21 @@
421
 
422
     const bool bSubdiv  = tuDepth < cu.m_tuDepth[absPartIdx];
423
     uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
424
-    if (m_csp != X265_CSP_I400)
425
+    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
426
     {
427
         if (!(log2TrSize - m_hChromaShift < 2))
428
         {
429
-            if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
430
+            uint32_t parentIdx = absPartIdx & (0xFF << (log2TrSize + 1 - LOG2_UNIT_SIZE) * 2);
431
+            if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_U, tuDepth - 1))
432
                 m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !bSubdiv);
433
-            if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
434
+            if (!tuDepth || cu.getCbf(parentIdx, TEXT_CHROMA_V, tuDepth - 1))
435
                 m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !bSubdiv);
436
         }
437
-        else
438
-        {
439
-            X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma CBF not matching\n");
440
-            X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma CBF not matching\n");
441
-        }
442
     }
443
 
444
     if (!bSubdiv)
445
     {
446
-        m_entropyCoder.codeQtCbfLuma(cu, absPartIdx, tuDepth);
447
+        m_entropyCoder.codeQtCbfLuma(cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth), tuDepth);
448
     }
449
     else
450
     {
451
@@ -3456,7 +3462,7 @@
452
     const uint32_t qtLayer = log2TrSize - 2;
453
 
454
     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
455
-    uint32_t codeChroma = (m_csp != X265_CSP_I400) ? 1 : 0;
456
+    uint32_t codeChroma = (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400) ? 1 : 0;
457
     uint32_t tuDepthC = tuDepth;
458
     if (log2TrSizeC < 2)
459
     {
460
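
Note: the CBF hunks above change how aggregated coded-block flags propagate up the transform tree. Instead of OR-ing ycbf/ucbf/vcbf into every one of the 4 * qNumParts sub-partition entries, the flag is stored once at the first partition index, and readers recover it through the parentIdx mask seen in the codeQtCbfChroma hunk. A minimal sketch of that mask arithmetic (hypothetical helper; LOG2_UNIT_SIZE is 2 in x265, i.e. partition indexes address 4x4 units):

    #include <cstdint>

    static const uint32_t LOG2_UNIT_SIZE = 2;   // partition indexes address 4x4 units

    // First partition index of the parent TU (one quad-tree level above log2TrSize).
    // For part indexes below 256 (a 64x64 CTU) this matches the patch's
    // absPartIdx & (0xFF << (log2TrSize + 1 - LOG2_UNIT_SIZE) * 2) expression.
    inline uint32_t parentPartIdx(uint32_t absPartIdx, uint32_t log2TrSize)
    {
        uint32_t unitsInParent = 1u << ((log2TrSize + 1 - LOG2_UNIT_SIZE) * 2);
        return absPartIdx & ~(unitsInParent - 1);
    }
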
x265_1.9.tar.gz/source/encoder/search.h -> x265_2.0.tar.gz/source/encoder/search.h Changed
9
 
1
@@ -272,7 +272,6 @@
2
     pixel*          m_tsRecon;        /* transform skip reconstructed pixels 32x32 */
3
 
4
     bool            m_bFrameParallel;
5
-    bool            m_bEnableRDOQ;
6
     uint32_t        m_numLayers;
7
     uint32_t        m_refLagPixels;
8
 
9
x265_1.9.tar.gz/source/encoder/slicetype.cpp -> x265_2.0.tar.gz/source/encoder/slicetype.cpp Changed
36
 
1
@@ -83,7 +83,7 @@
2
     uint32_t var;
3
 
4
     var  = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, csp);
5
-    if (csp != X265_CSP_I400)
6
+    if (csp != X265_CSP_I400 && curFrame->m_fencPic->m_picCsp != X265_CSP_I400)
7
     {
8
         var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp);
9
         var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp);
10
@@ -456,10 +456,13 @@
11
     COPY4_IF_LT(minscore, s, minscale, curScale, minoff, curOffset, found, 1);
12
 
13
     /* Use a smaller denominator if possible */
14
-    while (mindenom > 0 && !(minscale & 1))
15
+    if (mindenom > 0 && !(minscale & 1))
16
     {
17
-        mindenom--;
18
-        minscale >>= 1;
19
+        unsigned long idx;
20
+        CTZ(idx, minscale);
21
+        int shift = X265_MIN((int)idx, mindenom);
22
+        mindenom -= shift;
23
+        minscale >>= shift;
24
     }
25
 
26
     if (!found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f)
27
@@ -2081,7 +2084,7 @@
28
     const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * fenc->lumaStride;
29
 
30
     if (bBidir || bDoSearch[0] || bDoSearch[1])
31
-        tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize);
32
+        tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, 1);
33
 
34
     /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
35
     int lowresPenalty = 4;
36
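
Note: the weight-denominator hunk above (and its twin in weightPrediction.cpp further down) replaces a shift-by-one loop with a single count-trailing-zeros step; the shift is capped by mindenom so the denominator can never go negative. A behaviour-equivalent sketch, assuming a GCC/Clang-style builtin behind x265's CTZ macro:

    #include <algorithm>

    // Reduce the weight denominator in one step instead of one bit per iteration.
    inline void reduceDenom(int& mindenom, int& minscale)
    {
        if (mindenom > 0 && minscale && !(minscale & 1))
        {
            int tz = __builtin_ctz((unsigned)minscale); // trailing zeros of minscale
            int shift = std::min(tz, mindenom);         // never shift below denom 0
            mindenom -= shift;
            minscale >>= shift;
        }
    }
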
x265_1.9.tar.gz/source/encoder/slicetype.h -> x265_2.0.tar.gz/source/encoder/slicetype.h Changed
11
 
1
@@ -60,8 +60,8 @@
2
 
3
     LookaheadTLD()
4
     {
5
+        me.init(X265_CSP_I400);
6
         me.setQP(X265_LOOKAHEAD_QP);
7
-        me.init(X265_HEX_SEARCH, 1, X265_CSP_I400);
8
         for (int i = 0; i < 4; i++)
9
             wbuffer[i] = NULL;
10
         widthInCU = heightInCU = ncu = paddedLines = 0;
11
x265_1.9.tar.gz/source/encoder/weightPrediction.cpp -> x265_2.0.tar.gz/source/encoder/weightPrediction.cpp Changed
78
 
1
@@ -31,6 +31,7 @@
2
 #include "slice.h"
3
 #include "mv.h"
4
 #include "bitstream.h"
5
+#include "threading.h"
6
 
7
 using namespace X265_NS;
8
 namespace {
9
@@ -132,25 +133,25 @@
10
                 intptr_t fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2);
11
                 pixel *temp = src + pixoff + fpeloffset;
12
 
13
-                int xFrac = mv.x & 0x7;
14
-                int yFrac = mv.y & 0x7;
15
-                if ((yFrac | xFrac) == 0)
16
+                int xFrac = mv.x & 7;
17
+                int yFrac = mv.y & 7;
18
+                if (!(yFrac | xFrac))
19
                 {
20
                     primitives.chroma[csp].pu[LUMA_16x16].copy_pp(mcout + pixoff, stride, temp, stride);
21
                 }
22
-                else if (yFrac == 0)
23
+                else if (!yFrac)
24
                 {
25
                     primitives.chroma[csp].pu[LUMA_16x16].filter_hpp(temp, stride, mcout + pixoff, stride, xFrac);
26
                 }
27
-                else if (xFrac == 0)
28
+                else if (!xFrac)
29
                 {
30
                     primitives.chroma[csp].pu[LUMA_16x16].filter_vpp(temp, stride, mcout + pixoff, stride, yFrac);
31
                 }
32
                 else
33
                 {
34
-                    ALIGN_VAR_16(int16_t, imm[16 * (16 + NTAPS_CHROMA)]);
35
-                    primitives.chroma[csp].pu[LUMA_16x16].filter_hps(temp, stride, imm, bw, xFrac, 1);
36
-                    primitives.chroma[csp].pu[LUMA_16x16].filter_vsp(imm + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
37
+                    ALIGN_VAR_16(int16_t, immed[16 * (16 + NTAPS_CHROMA - 1)]);
38
+                    primitives.chroma[csp].pu[LUMA_16x16].filter_hps(temp, stride, immed, bw, xFrac, 1);
39
+                    primitives.chroma[csp].pu[LUMA_16x16].filter_vsp(immed + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
40
                 }
41
             }
42
             else
43
@@ -232,7 +233,7 @@
44
     cache.numPredDir = slice.isInterP() ? 1 : 2;
45
     cache.lowresWidthInCU = fenc.width >> 3;
46
     cache.lowresHeightInCU = fenc.lines >> 3;
47
-    cache.csp = fencPic->m_picCsp;
48
+    cache.csp = param.internalCsp;
49
     cache.hshift = CHROMA_H_SHIFT(cache.csp);
50
     cache.vshift = CHROMA_V_SHIFT(cache.csp);
51
 
52
@@ -329,7 +330,7 @@
53
                 {
54
                     /* reference chroma planes must be extended prior to being
55
                      * used as motion compensation sources */
56
-                    if (!refFrame->m_bChromaExtended && param.internalCsp != X265_CSP_I400)
57
+                    if (!refFrame->m_bChromaExtended && param.internalCsp != X265_CSP_I400 && frame.m_fencPic->m_picCsp != X265_CSP_I400)
58
                     {
59
                         refFrame->m_bChromaExtended = true;
60
                         PicYuv *refPic = refFrame->m_fencPic;
61
@@ -456,10 +457,13 @@
62
             /* Use a smaller luma denominator if possible */
63
             if (!(plane || list))
64
             {
65
-                while (mindenom > 0 && !(minscale & 1))
66
+                if (mindenom > 0 && !(minscale & 1))
67
                 {
68
-                    mindenom--;
69
-                    minscale >>= 1;
70
+                    unsigned long idx;
71
+                    CTZ(idx, minscale);
72
+                    int shift = X265_MIN((int)idx, mindenom);
73
+                    mindenom -= shift;
74
+                    minscale >>= shift;
75
                 }
76
             }
77
 
78
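
Note: the motion-compensation hunk above also right-sizes the intermediate buffer for the separable chroma filter. A horizontal pass feeding an N-tap vertical pass must produce height + N - 1 rows, and the vertical pass starts (N/2 - 1) rows into that buffer. A sketch of the sizing, using x265's NTAPS_CHROMA = 4:

    #include <cstdint>

    static const int NTAPS_CHROMA = 4;     // 4-tap chroma interpolation filter
    static const int bw = 16, bh = 16;     // LUMA_16x16 partition used above

    // 16 x (16 + 3) rows of intermediate samples; the vertical filter begins
    // (NTAPS_CHROMA / 2 - 1) = 1 row in, matching the filter_vsp offset above.
    int16_t immed[bw * (bh + NTAPS_CHROMA - 1)];
    int16_t* vertStart = immed + ((NTAPS_CHROMA >> 1) - 1) * bw;
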
x265_1.9.tar.gz/source/input/y4m.cpp -> x265_2.0.tar.gz/source/input/y4m.cpp Changed
10
 
1
@@ -417,6 +417,8 @@
2
     {
3
         int pixelbytes = depth > 8 ? 2 : 1;
4
         pic.bitDepth = depth;
5
+        pic.framesize = framesize;
6
+        pic.height = height;
7
         pic.colorSpace = colorSpace;
8
         pic.stride[0] = width * pixelbytes;
9
         pic.stride[1] = pic.stride[0] >> x265_cli_csps[colorSpace].width[1];
10
x265_1.9.tar.gz/source/input/yuv.cpp -> x265_2.0.tar.gz/source/input/yuv.cpp Changed
10
 
1
@@ -225,6 +225,8 @@
2
         uint32_t pixelbytes = depth > 8 ? 2 : 1;
3
         pic.colorSpace = colorSpace;
4
         pic.bitDepth = depth;
5
+        pic.framesize = framesize;
6
+        pic.height = height;
7
         pic.stride[0] = width * pixelbytes;
8
         pic.stride[1] = pic.stride[0] >> x265_cli_csps[colorSpace].width[1];
9
         pic.stride[2] = pic.stride[0] >> x265_cli_csps[colorSpace].width[2];
10
x265_1.9.tar.gz/source/output/raw.cpp -> x265_2.0.tar.gz/source/output/raw.cpp Changed
43
 
1
@@ -32,11 +32,11 @@
2
     b_fail = false;
3
     if (!strcmp(fname, "-"))
4
     {
5
-        ofs = &cout;
6
+        ofs = stdout;
7
         return;
8
     }
9
-    ofs = new ofstream(fname, ios::binary | ios::out);
10
-    if (ofs->fail())
11
+    ofs = x265_fopen(fname, "wb");
12
+    if (!ofs || ferror(ofs))
13
         b_fail = true;
14
 }
15
 
16
@@ -51,7 +51,7 @@
17
 
18
     for (uint32_t i = 0; i < nalcount; i++)
19
     {
20
-        ofs->write((const char*)nal->payload, nal->sizeBytes);
21
+        fwrite((const void*)nal->payload, 1, nal->sizeBytes, ofs);
22
         bytes += nal->sizeBytes;
23
         nal++;
24
     }
25
@@ -65,7 +65,7 @@
26
 
27
     for (uint32_t i = 0; i < nalcount; i++)
28
     {
29
-        ofs->write((const char*)nal->payload, nal->sizeBytes);
30
+        fwrite((const void*)nal->payload, 1, nal->sizeBytes, ofs);
31
         bytes += nal->sizeBytes;
32
         nal++;
33
     }
34
@@ -75,6 +75,6 @@
35
 
36
 void RAWOutput::closeFile(int64_t, int64_t)
37
 {
38
-    if (ofs != &cout)
39
-        delete ofs;
40
+    if (ofs != stdout)
41
+        fclose(ofs);
42
 }
43
x265_1.9.tar.gz/source/output/raw.h -> x265_2.0.tar.gz/source/output/raw.h Changed
10
 
1
@@ -35,7 +35,7 @@
2
 {
3
 protected:
4
 
5
-    std::ostream* ofs;
6
+    FILE* ofs;
7
 
8
     bool b_fail;
9
 
10
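
Note: raw.cpp/raw.h switch the bitstream writer from std::ofstream to C stdio opened through x265_fopen, which pairs with the get_argv_utf8 change in x265.cpp below to make UTF-8 output file names work on Windows. A hedged sketch of what such a wrapper typically looks like (not the literal x265_fopen implementation):

    #include <cstdio>
    #ifdef _WIN32
    #include <windows.h>
    // Convert the UTF-8 name to UTF-16 and open with _wfopen.
    static FILE* utf8_fopen(const char* name, const char* mode)
    {
        wchar_t wname[MAX_PATH], wmode[16];
        if (!MultiByteToWideChar(CP_UTF8, 0, name, -1, wname, MAX_PATH) ||
            !MultiByteToWideChar(CP_UTF8, 0, mode, -1, wmode, 16))
            return NULL;
        return _wfopen(wname, wmode);
    }
    #else
    // POSIX fopen already accepts UTF-8 byte strings.
    static FILE* utf8_fopen(const char* name, const char* mode) { return fopen(name, mode); }
    #endif
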
x265_1.9.tar.gz/source/test/CMakeLists.txt -> x265_2.0.tar.gz/source/test/CMakeLists.txt Changed
50
 
1
@@ -1,4 +1,12 @@
2
 # vim: syntax=cmake
3
+
4
+check_symbol_exists(__rdtsc "intrin.h" HAVE_RDTSC)
5
+if(HAVE_RDTSC)
6
+    add_definitions(-DHAVE_RDTSC=1)
7
+endif()
8
+
9
+# add X86 assembly files
10
+if(X86)
11
 enable_language(ASM_YASM)
12
 
13
 if(MSVC_IDE)
14
@@ -11,11 +19,23 @@
15
 else()
16
     set(YASM_SRC checkasm-a.asm)
17
 endif()
18
+endif(X86)
19
 
20
-check_symbol_exists(__rdtsc "intrin.h" HAVE_RDTSC)
21
-if(HAVE_RDTSC)
22
-    add_definitions(-DHAVE_RDTSC=1)
23
-endif()
24
+# add ARM assembly files
25
+if(ARM OR CROSS_COMPILE_ARM)
26
+    enable_language(ASM)
27
+    set(YASM_SRC checkasm-arm.S)
28
+    add_custom_command(
29
+        OUTPUT checkasm-arm.obj
30
+        COMMAND ${CMAKE_CXX_COMPILER}
31
+        ARGS ${YASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
32
+        DEPENDS checkasm-arm.S)
33
+endif(ARM OR CROSS_COMPILE_ARM)
34
+
35
+# add PowerPC assembly files
36
+if(POWER)
37
+    set(YASM_SRC)
38
+endif(POWER)
39
 
40
 add_executable(TestBench ${YASM_SRC}
41
     testbench.cpp testharness.h
42
@@ -23,6 +43,7 @@
43
     mbdstharness.cpp mbdstharness.h
44
     ipfilterharness.cpp ipfilterharness.h
45
     intrapredharness.cpp intrapredharness.h)
46
+
47
 target_link_libraries(TestBench x265-static ${PLATFORM_LIBS})
48
 if(LINKER_OPTIONS)
49
     if(EXTRA_LIB)
50
x265_2.0.tar.gz/source/test/checkasm-arm.S Added
135
 
1
@@ -0,0 +1,133 @@
2
+/****************************************************************************
3
+ * checkasm-arm.S: assembly check tool
4
+ *****************************************************************************
5
+ * Copyright (C) 2016 x265 project
6
+ *
7
+ * Authors: Martin Storsjo <martin@martin.st>
8
+ *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
9
+ *
10
+ * This program is free software; you can redistribute it and/or modify
11
+ * it under the terms of the GNU General Public License as published by
12
+ * the Free Software Foundation; either version 2 of the License, or
13
+ * (at your option) any later version.
14
+ *
15
+ * This program is distributed in the hope that it will be useful,
16
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
+ * GNU General Public License for more details.
19
+ *
20
+ * You should have received a copy of the GNU General Public License
21
+ * along with this program; if not, write to the Free Software
22
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
23
+ *
24
+ * This program is also available under a commercial proprietary license.
25
+ * For more information, contact us at license @ x265.com.
26
+ *****************************************************************************/
27
+
28
+#include "../common/arm/asm.S"
29
+
30
+.section .rodata
31
+.align 4
32
+register_init:
33
+.quad 0x21f86d66c8ca00ce
34
+.quad 0x75b6ba21077c48ad
35
+.quad 0xed56bb2dcb3c7736
36
+.quad 0x8bda43d3fd1a7e06
37
+.quad 0xb64a9c9e5d318408
38
+.quad 0xdf9a54b303f1d3a3
39
+.quad 0x4a75479abd64e097
40
+.quad 0x249214109d5d1c88
41
+
42
+error_message:
43
+.asciz "failed to preserve register"
44
+
45
+.text
46
+
47
+@ max number of args used by any x265 asm function.
48
+#define MAX_ARGS 15
49
+
50
+#define ARG_STACK 4*(MAX_ARGS - 2)
51
+
52
+.macro clobbercheck variant
53
+.equ pushed, 4*10
54
+function x265_checkasm_call_\variant
55
+    push        {r4-r11, lr}
56
+.ifc \variant, neon
57
+    vpush       {q4-q7}
58
+.equ pushed, pushed + 16*4
59
+.endif
60
+
61
+    movrel      r12, register_init
62
+.ifc \variant, neon
63
+    vldm        r12, {q4-q7}
64
+.endif
65
+    ldm         r12, {r4-r11}
66
+
67
+    push        {r1}
68
+
69
+    sub         sp,  sp,  #ARG_STACK
70
+.equ pos, 0
71
+.rept MAX_ARGS-2
72
+    ldr         r12, [sp, #ARG_STACK + pushed + 8 + pos]
73
+    str         r12, [sp, #pos]
74
+.equ pos, pos + 4
75
+.endr
76
+
77
+    mov         r12, r0
78
+    mov         r0,  r2
79
+    mov         r1,  r3
80
+    ldrd        r2,  r3,  [sp, #ARG_STACK + pushed]
81
+    blx         r12
82
+    add         sp,  sp,  #ARG_STACK
83
+    pop         {r2}
84
+
85
+    push        {r0, r1}
86
+    movrel      r12, register_init
87
+.ifc \variant, neon
88
+    vldm        r12, {q0-q3}
89
+    veor        q0,  q0,  q4
90
+    veor        q1,  q1,  q5
91
+    veor        q2,  q2,  q6
92
+    veor        q3,  q3,  q7
93
+    vorr        q0,  q0,  q1
94
+    vorr        q0,  q0,  q2
95
+    vorr        q0,  q0,  q3
96
+    vorr        d0,  d0,  d1
97
+    vrev64.32   d1,  d0
98
+    vorr        d0,  d0,  d1
99
+    vmov.32     r3,  d0[0]
100
+.else
101
+    mov         r3,  #0
102
+.endif
103
+
104
+.macro check_reg reg1, reg2
105
+    ldrd        r0,  r1,  [r12], #8
106
+    eor         r0,  r0, \reg1
107
+    eor         r1,  r1, \reg2
108
+    orr         r3,  r3, r0
109
+    orr         r3,  r3, r1
110
+.endm
111
+    check_reg   r4,  r5
112
+    check_reg   r6,  r7
113
+    check_reg   r8,  r9
114
+    check_reg   r10, r11
115
+.purgem check_reg
116
+
117
+    cmp         r3,  #0
118
+    beq         0f
119
+
120
+    mov         r12, #0
121
+    str         r12, [r2]
122
+    movrel      r0, error_message
123
+    bl          puts
124
+0:
125
+    pop         {r0, r1}
126
+.ifc \variant, neon
127
+    vpop        {q4-q7}
128
+.endif
129
+    pop         {r4-r11, pc}
130
+endfunc
131
+.endm
132
+
133
+clobbercheck neon
134
+clobbercheck noneon
135
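
Note: the added checkasm-arm.S mirrors the x86 checkasm shim. Callee-saved registers r4-r11 (plus q4-q7 in the neon variant) are seeded from register_init before the call and XOR-compared against the seed afterwards, so any bit the routine failed to restore trips the "failed to preserve register" path. A conceptual C++ model of the comparison (the real check must live in assembly, since compiled code cannot pin the registers itself):

    #include <cstdint>
    #include <cstdio>

    // Seed values loaded into r4-r11 before the call (first two shown; the
    // .S file above lists all eight).
    static const uint64_t seed[8] = { 0x21f86d66c8ca00ceULL, 0x75b6ba21077c48adULL };

    static bool registersPreserved(const uint64_t after[8])
    {
        uint64_t diff = 0;
        for (int i = 0; i < 8; i++)
            diff |= seed[i] ^ after[i];     // accumulate any changed bit
        if (diff)
            puts("failed to preserve register");
        return diff == 0;
    }
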
x265_1.9.tar.gz/source/test/pixelharness.cpp -> x265_2.0.tar.gz/source/test/pixelharness.cpp Changed
323
 
1
@@ -43,6 +43,7 @@
2
         ushort_test_buff[0][i]  = rand() % ((1 << 16) - 1);
3
         uchar_test_buff[0][i]   = rand() % ((1 << 8) - 1);
4
         residual_test_buff[0][i] = (rand() % (2 * RMAX + 1)) - RMAX - 1;// For sse_ss only
5
+        double_test_buff[0][i]  = (double)(short_test_buff[0][i]) / 256.0;
6
 
7
         pixel_test_buff[1][i]   = PIXEL_MIN;
8
         short_test_buff[1][i]   = SMIN;
9
@@ -52,6 +53,7 @@
10
         ushort_test_buff[1][i]  = PIXEL_MIN;
11
         uchar_test_buff[1][i]   = PIXEL_MIN;
12
         residual_test_buff[1][i] = RMIN;
13
+        double_test_buff[1][i]  = (double)(short_test_buff[1][i]) / 256.0;
14
 
15
         pixel_test_buff[2][i]   = PIXEL_MAX;
16
         short_test_buff[2][i]   = SMAX;
17
@@ -61,6 +63,7 @@
18
         ushort_test_buff[2][i]  = ((1 << 16) - 1);
19
         uchar_test_buff[2][i]   = 255;
20
         residual_test_buff[2][i] = RMAX;
21
+        double_test_buff[2][i] = (double)(short_test_buff[2][i]) / 256.0;
22
 
23
         pbuf1[i] = rand() & PIXEL_MAX;
24
         pbuf2[i] = rand() & PIXEL_MAX;
25
@@ -858,9 +861,8 @@
26
         int width = (rand() % 4) + 1; // range[1-4]
27
         float cres = ref(sum0, sum1, width);
28
         float vres = checked_float(opt, sum0, sum1, width);
29
-        if (fabs(vres - cres) > 0.0001)
30
+        if (fabs(vres - cres) > 0.001)
31
             return false;
32
-
33
         reportfail();
34
     }
35
 
36
@@ -1398,6 +1400,60 @@
37
     return true;
38
 }
39
 
40
+bool PixelHarness::check_cutree_fix8_pack(cutree_fix8_pack ref, cutree_fix8_pack opt)
41
+{
42
+    ALIGN_VAR_32(uint16_t, ref_dest[64 * 64]);
43
+    ALIGN_VAR_32(uint16_t, opt_dest[64 * 64]);
44
+
45
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
46
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
47
+
48
+    int j = 0;
49
+
50
+    for (int i = 0; i < ITERS; i++)
51
+    {
52
+        int count = 256 + i;
53
+        int index = i % TEST_CASES;
54
+        checked(opt, opt_dest, double_test_buff[index] + j, count);
55
+        ref(ref_dest, double_test_buff[index] + j, count);
56
+
57
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(uint16_t)))
58
+            return false;
59
+
60
+        reportfail();
61
+        j += INCR;
62
+    }
63
+
64
+    return true;
65
+}
66
+
67
+bool PixelHarness::check_cutree_fix8_unpack(cutree_fix8_unpack ref, cutree_fix8_unpack opt)
68
+{
69
+    ALIGN_VAR_32(double, ref_dest[64 * 64]);
70
+    ALIGN_VAR_32(double, opt_dest[64 * 64]);
71
+
72
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
73
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
74
+
75
+    int j = 0;
76
+
77
+    for (int i = 0; i < ITERS; i++)
78
+    {
79
+        int count = 256 + i;
80
+        int index = i % TEST_CASES;
81
+        checked(opt, opt_dest, ushort_test_buff[index] + j, count);
82
+        ref(ref_dest, ushort_test_buff[index] + j, count);
83
+
84
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(double)))
85
+            return false;
86
+
87
+        reportfail();
88
+        j += INCR;
89
+    }
90
+
91
+    return true;
92
+}
93
+
94
 bool PixelHarness::check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt)
95
 {
96
     int j = 0, index1, index2, optres, refres;
97
@@ -1819,34 +1875,6 @@
98
     return true;
99
 }
100
 
101
-bool PixelHarness::check_planeClipAndMax(planeClipAndMax_t ref, planeClipAndMax_t opt)
102
-{
103
-    for (int i = 0; i < ITERS; i++)
104
-    {
105
-        intptr_t rand_stride = rand() % STRIDE;
106
-        int rand_width = (rand() % (STRIDE * 2)) + 1;
107
-        const int rand_height = (rand() % MAX_HEIGHT) + 1;
108
-        const pixel rand_min = rand() % 32;
109
-        const pixel rand_max = PIXEL_MAX - (rand() % 32);
110
-        uint64_t ref_sum, opt_sum;
111
-
112
-        // video width must be more than or equal to 32
113
-        if (rand_width < 32)
114
-            rand_width = 32;
115
-
116
-        // stride must be more than or equal to width
117
-        if (rand_stride < rand_width)
118
-            rand_stride = rand_width;
119
-
120
-        pixel ref_max = ref(pbuf1, rand_stride, rand_width, rand_height, &ref_sum, rand_min, rand_max);
121
-        pixel opt_max = (pixel)checked(opt, pbuf1, rand_stride, rand_width, rand_height, &opt_sum, rand_min, rand_max);
122
-
123
-        if (ref_max != opt_max)
124
-            return false;
125
-    }
126
-    return true;
127
-}
128
-
129
 bool PixelHarness::check_pelFilterLumaStrong_H(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt)
130
 {
131
     intptr_t srcStep = 1, offset = 64;
132
@@ -1913,6 +1941,68 @@
133
     return true;
134
 }
135
 
136
+bool PixelHarness::check_pelFilterChroma_H(pelFilterChroma_t ref, pelFilterChroma_t opt)
137
+{
138
+    intptr_t srcStep = 1, offset = 64;
139
+    int32_t maskP, maskQ, tc;
140
+    int j = 0;
141
+
142
+    pixel pixel_test_buff1[TEST_CASES][BUFFSIZE];
143
+    for (int i = 0; i < TEST_CASES; i++)
144
+        memcpy(pixel_test_buff1[i], pixel_test_buff[i], sizeof(pixel)* BUFFSIZE);
145
+
146
+    for (int i = 0; i < ITERS; i++)
147
+    {
148
+        tc = rand() % PIXEL_MAX;
149
+        maskP = (rand() % PIXEL_MAX) - 1;
150
+        maskQ = (rand() % PIXEL_MAX) - 1;
151
+
152
+        int index = rand() % 3;
153
+
154
+        ref(pixel_test_buff[index] + 4 * offset + j, srcStep, offset, tc, maskP, maskQ);
155
+        checked(opt, pixel_test_buff1[index] + 4 * offset + j, srcStep, offset, tc, maskP, maskQ);
156
+
157
+        if (memcmp(pixel_test_buff[index], pixel_test_buff1[index], sizeof(pixel)* BUFFSIZE))
158
+            return false;
159
+
160
+        reportfail()
161
+        j += INCR;
162
+    }
163
+
164
+    return true;
165
+}
166
+
167
+bool PixelHarness::check_pelFilterChroma_V(pelFilterChroma_t ref, pelFilterChroma_t opt)
168
+{
169
+    intptr_t srcStep = 64, offset = 1;
170
+    int32_t maskP, maskQ, tc;
171
+    int j = 0;
172
+
173
+    pixel pixel_test_buff1[TEST_CASES][BUFFSIZE];
174
+    for (int i = 0; i < TEST_CASES; i++)
175
+        memcpy(pixel_test_buff1[i], pixel_test_buff[i], sizeof(pixel)* BUFFSIZE);
176
+
177
+    for (int i = 0; i < ITERS; i++)
178
+    {
179
+        tc = rand() % PIXEL_MAX;
180
+        maskP = (rand() % PIXEL_MAX) - 1;
181
+        maskQ = (rand() % PIXEL_MAX) - 1;
182
+
183
+        int index = rand() % 3;
184
+
185
+        ref(pixel_test_buff[index] + 4 + j, srcStep, offset, tc, maskP, maskQ);
186
+        checked(opt, pixel_test_buff1[index] + 4 + j, srcStep, offset, tc, maskP, maskQ);
187
+
188
+        if (memcmp(pixel_test_buff[index], pixel_test_buff1[index], sizeof(pixel)* BUFFSIZE))
189
+            return false;
190
+
191
+        reportfail()
192
+        j += INCR;
193
+    }
194
+
195
+    return true;
196
+}
197
+
198
 bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
199
 {
200
     if (opt.pu[part].satd)
201
@@ -2498,6 +2588,24 @@
202
         }
203
     }
204
 
205
+    if (opt.fix8Pack)
206
+    {
207
+        if (!check_cutree_fix8_pack(ref.fix8Pack, opt.fix8Pack))
208
+        {
209
+            printf("cuTreeFix8Pack failed\n");
210
+            return false;
211
+        }
212
+    }
213
+
214
+    if (opt.fix8Unpack)
215
+    {
216
+        if (!check_cutree_fix8_unpack(ref.fix8Unpack, opt.fix8Unpack))
217
+        {
218
+            printf("cuTreeFix8Unpack failed\n");
219
+            return false;
220
+        }
221
+    }
222
+
223
     if (opt.scanPosLast)
224
     {
225
         if (!check_scanPosLast(ref.scanPosLast, opt.scanPosLast))
226
@@ -2544,15 +2652,6 @@
227
     }
228
     
229
 
230
-    if (opt.planeClipAndMax)
231
-    {
232
-        if (!check_planeClipAndMax(ref.planeClipAndMax, opt.planeClipAndMax))
233
-        {
234
-            printf("planeClipAndMax failed!\n");
235
-            return false;
236
-        }
237
-    }
238
-
239
     if (opt.pelFilterLumaStrong[0])
240
     {
241
         if (!check_pelFilterLumaStrong_V(ref.pelFilterLumaStrong[0], opt.pelFilterLumaStrong[0]))
242
@@ -2571,6 +2670,24 @@
243
         }
244
     }
245
 
246
+    if (opt.pelFilterChroma[0])
247
+    {
248
+        if (!check_pelFilterChroma_V(ref.pelFilterChroma[0], opt.pelFilterChroma[0]))
249
+        {
250
+            printf("pelFilterChroma Vertical failed!\n");
251
+            return false;
252
+        }
253
+    }
254
+
255
+    if (opt.pelFilterChroma[1])
256
+    {
257
+        if (!check_pelFilterChroma_H(ref.pelFilterChroma[1], opt.pelFilterChroma[1]))
258
+        {
259
+            printf("pelFilterChroma Horizontal failed!\n");
260
+            return false;
261
+        }
262
+    }
263
+
264
     return true;
265
 }
266
 
267
@@ -2988,6 +3105,18 @@
268
         REPORT_SPEEDUP(opt.propagateCost, ref.propagateCost, ibuf1, ushort_test_buff[0], int_test_buff[0], ushort_test_buff[0], int_test_buff[0], double_test_buff[0], 80);
269
     }
270
 
271
+    if (opt.fix8Pack)
272
+    {
273
+        HEADER0("cuTreeFix8Pack");
274
+        REPORT_SPEEDUP(opt.fix8Pack, ref.fix8Pack, ushort_test_buff[0], double_test_buff[0], 390);
275
+    }
276
+
277
+    if (opt.fix8Unpack)
278
+    {
279
+        HEADER0("cuTreeFix8Unpack");
280
+        REPORT_SPEEDUP(opt.fix8Unpack, ref.fix8Unpack, double_test_buff[0], ushort_test_buff[0], 390);
281
+    }
282
+
283
     if (opt.scanPosLast)
284
     {
285
         HEADER0("scanPosLast");
286
@@ -3048,13 +3177,6 @@
287
         REPORT_SPEEDUP(opt.costC1C2Flag, ref.costC1C2Flag, abscoefBuf, C1FLAG_NUMBER, (uint8_t*)psbuf1, 1);
288
     }
289
 
290
-    if (opt.planeClipAndMax)
291
-    {
292
-        HEADER0("planeClipAndMax");
293
-        uint64_t dummy;
294
-        REPORT_SPEEDUP(opt.planeClipAndMax, ref.planeClipAndMax, pbuf1, 128, 63, 62, &dummy, 1, PIXEL_MAX - 1);
295
-    }
296
-
297
     if (opt.pelFilterLumaStrong[0])
298
     {
299
         int32_t tcP = (rand() % PIXEL_MAX) - 1;
300
@@ -3070,4 +3192,22 @@
301
         HEADER0("pelFilterLumaStrong_Horizontal");
302
         REPORT_SPEEDUP(opt.pelFilterLumaStrong[1], ref.pelFilterLumaStrong[1], pbuf1, 1, STRIDE, tcP, tcQ);
303
     }
304
+
305
+    if (opt.pelFilterChroma[0])
306
+    {
307
+        int32_t tc = (rand() % PIXEL_MAX);
308
+        int32_t maskP = (rand() % PIXEL_MAX) - 1;
309
+        int32_t maskQ = (rand() % PIXEL_MAX) - 1;
310
+        HEADER0("pelFilterChroma_Vertical");
311
+        REPORT_SPEEDUP(opt.pelFilterChroma[0], ref.pelFilterChroma[0], pbuf1, STRIDE, 1, tc, maskP, maskQ);
312
+    }
313
+
314
+    if (opt.pelFilterChroma[1])
315
+    {
316
+        int32_t tc = (rand() % PIXEL_MAX);
317
+        int32_t maskP = (rand() % PIXEL_MAX) - 1;
318
+        int32_t maskQ = (rand() % PIXEL_MAX) - 1;
319
+        HEADER0("pelFilterChroma_Horizontal");
320
+        REPORT_SPEEDUP(opt.pelFilterChroma[1], ref.pelFilterChroma[1], pbuf1, 1, STRIDE, tc, maskP, maskQ);
321
+    }
322
 }
323
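
Note: the new fix8Pack/fix8Unpack checks exercise primitives that shuttle cuTree statistics between double and a 16-bit fixed-point form; the way double_test_buff is filled above (an int16 divided by 256.0) implies a signed Q8.8 layout. A hedged reference model under that assumption:

    #include <cstdint>

    // Assumed Q8.8 layout: 8 integer bits, 8 fractional bits, stored as uint16_t.
    static void fix8Pack(uint16_t* dst, const double* src, int count)
    {
        for (int i = 0; i < count; i++)
            dst[i] = (uint16_t)(int16_t)(src[i] * 256.0);   // scale and truncate
    }

    static void fix8Unpack(double* dst, const uint16_t* src, int count)
    {
        for (int i = 0; i < count; i++)
            dst[i] = (int16_t)src[i] / 256.0;               // sign-extend, rescale
    }
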
x265_1.9.tar.gz/source/test/pixelharness.h -> x265_2.0.tar.gz/source/test/pixelharness.h Changed
22
 
1
@@ -113,6 +113,8 @@
2
     bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
3
     bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
4
     bool check_cutree_propagate_cost(cutree_propagate_cost ref, cutree_propagate_cost opt);
5
+    bool check_cutree_fix8_pack(cutree_fix8_pack ref, cutree_fix8_pack opt);
6
+    bool check_cutree_fix8_unpack(cutree_fix8_unpack ref, cutree_fix8_unpack opt);
7
     bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
8
     bool check_calSign(sign_t ref, sign_t opt);
9
     bool check_scanPosLast(scanPosLast_t ref, scanPosLast_t opt);
10
@@ -120,9 +122,10 @@
11
     bool check_costCoeffNxN(costCoeffNxN_t ref, costCoeffNxN_t opt);
12
     bool check_costCoeffRemain(costCoeffRemain_t ref, costCoeffRemain_t opt);
13
     bool check_costC1C2Flag(costC1C2Flag_t ref, costC1C2Flag_t opt);
14
-    bool check_planeClipAndMax(planeClipAndMax_t ref, planeClipAndMax_t opt);
15
     bool check_pelFilterLumaStrong_V(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
16
     bool check_pelFilterLumaStrong_H(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
17
+    bool check_pelFilterChroma_V(pelFilterChroma_t ref, pelFilterChroma_t opt);
18
+    bool check_pelFilterChroma_H(pelFilterChroma_t ref, pelFilterChroma_t opt);
19
 
20
 public:
21
 
22
x265_1.9.tar.gz/source/test/rate-control-tests.txt -> x265_2.0.tar.gz/source/test/rate-control-tests.txt Changed
13
 
1
@@ -25,6 +25,11 @@
2
 
3
 
4
 # multi-pass rate control tests
5
+sita_1920x1080_30.yuv, --preset ultrafast --crf 20 --no-cutree --no-scenecut --keyint 50 --no-open-gop --pass 1 --vbv-bufsize 7000 --vbv-maxrate 5000, --preset ultrafast --crf 20 --no-cutree --no-scenecut --keyint 50 --no-open-gop --pass 2 --vbv-bufsize 7000 --vbv-maxrate 5000
6
+sita_1920x1080_30.yuv, --preset medium --crf 20 --no-cutree --no-scenecut --keyint 50 --no-open-gop --pass 1 --vbv-bufsize 7000 --vbv-maxrate 5000, --preset medium --crf 20 --no-cutree --no-scenecut --keyint 50 --no-open-gop --pass 2 --vbv-bufsize 7000 --vbv-maxrate 5000
7
+sintel_trailer_2k_480p24.y4m, --preset medium --crf 18 --no-cutree --no-scenecut --no-open-gop --keyint 50 --vbv-bufsize 1200 --vbv-maxrate 1000 --pass 1, --preset medium --crf 18 --no-cutree --no-scenecut --no-open-gop --keyint 50 --vbv-bufsize 1200 --vbv-maxrate 1000 --pass 2
8
+sintel_trailer_2k_480p24.y4m, --preset veryslow --crf 18 --no-cutree --no-scenecut --no-open-gop --keyint 50 --vbv-bufsize 1200 --vbv-maxrate 1000 --pass 1, --preset veryslow --crf 18 --no-cutree --no-scenecut --no-open-gop --keyint 50 --vbv-bufsize 1200 --vbv-maxrate 1000 --pass 2
9
+ten_teaser_3840x2160_50_10bit.yuv, --preset medium --crf 25 --no-cutree --no-open-gop --no-scenecut --keyint 50 --vbv-maxrate 10000 --vbv-bufsize 12000 --pass 1, --preset medium --crf 25 --no-cutree --no-open-gop --no-scenecut --keyint 50 --vbv-maxrate 10000 --vbv-bufsize 12000 --pass 2
10
 big_buck_bunny_360p24.y4m,--preset slow --crf 40 --pass 1 -f 5000,--preset slow --bitrate 200 --pass 2 -f 5000
11
 big_buck_bunny_360p24.y4m,--preset medium --bitrate 700 --pass 1 -F4 --slow-firstpass -f 5000 ,--preset medium --bitrate 700 --vbv-bufsize 900 --vbv-maxrate 700 --pass 2 -F4 -f 5000
12
 112_1920x1080_25.yuv,--preset fast --bitrate 1000 --vbv-maxrate 1000 --vbv-bufsize 1000 --strict-cbr --pass 1 -F4,--preset fast --bitrate 1000 --vbv-maxrate 3000 --vbv-bufsize 3000 --pass 2 -F4
13
x265_1.9.tar.gz/source/test/regression-tests.txt -> x265_2.0.tar.gz/source/test/regression-tests.txt Changed
9
 
1
@@ -67,6 +67,7 @@
2
 News-4k.y4m,--preset ultrafast --no-cutree --analysis-mode=save --bitrate 15000,--preset ultrafast --no-cutree --analysis-mode=load --bitrate 15000
3
 News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0
4
 News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
5
+News-4k.y4m,--preset veryslow --no-rskip
6
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
7
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset medium --no-weightp
8
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune fastdecode
9
x265_1.9.tar.gz/source/test/testbench.cpp -> x265_2.0.tar.gz/source/test/testbench.cpp Changed
37
 
1
@@ -169,6 +169,9 @@
2
         { "XOP", X265_CPU_XOP },
3
         { "AVX2", X265_CPU_AVX2 },
4
         { "BMI2", X265_CPU_AVX2 | X265_CPU_BMI1 | X265_CPU_BMI2 },
5
+        { "ARMv6", X265_CPU_ARMV6 },
6
+        { "NEON", X265_CPU_NEON },
7
+        { "FastNeonMRC", X265_CPU_FAST_NEON_MRC },
8
         { "", 0 },
9
     };
10
 
11
@@ -182,6 +185,7 @@
12
         else
13
             continue;
14
 
15
+#if X265_ARCH_X86
16
         EncoderPrimitives vecprim;
17
         memset(&vecprim, 0, sizeof(vecprim));
18
         setupInstrinsicPrimitives(vecprim, test_arch[i].flag);
19
@@ -197,6 +201,7 @@
20
                 return -1;
21
             }
22
         }
23
+#endif
24
 
25
         EncoderPrimitives asmprim;
26
         memset(&asmprim, 0, sizeof(asmprim));
27
@@ -220,7 +225,9 @@
28
 
29
     EncoderPrimitives optprim;
30
     memset(&optprim, 0, sizeof(optprim));
31
+#if X265_ARCH_X86
32
     setupInstrinsicPrimitives(optprim, cpuid);
33
+#endif
34
     setupAssemblyPrimitives(optprim, cpuid);
35
 
36
     /* Note that we do not setup aliases for performance tests, that would be
37
x265_1.9.tar.gz/source/test/testharness.h -> x265_2.0.tar.gz/source/test/testharness.h Changed
37
 
1
@@ -32,7 +32,6 @@
2
 #pragma warning(disable: 4324) // structure was padded due to __declspec(align())
3
 #endif
4
 
5
-#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
6
 #define PIXEL_MIN 0
7
 #define SHORT_MAX  32767
8
 #define SHORT_MIN -32767
9
@@ -75,10 +74,17 @@
10
 {
11
     uint32_t a = 0;
12
 
13
+#if X265_ARCH_X86
14
     asm volatile("rdtsc" : "=a" (a) ::"edx");
15
+#elif X265_ARCH_ARM
16
+    // TO-DO: verify following inline asm to get cpu Timestamp Counter for ARM arch
17
+    // asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(a));
18
+
19
+    // TO-DO: replace clock() function with appropriate ARM cpu instructions
20
+    a = clock();
21
+#endif
22
     return a;
23
 }
24
-
25
 #endif // ifdef _MSC_VER
26
 
27
 #define BENCH_RUNS 1000
28
@@ -125,7 +131,7 @@
29
  * needs an explicit asm check because it only sometimes crashes in normal use. */
30
 intptr_t PFX(checkasm_call)(intptr_t (*func)(), int *ok, ...);
31
 float PFX(checkasm_call_float)(float (*func)(), int *ok, ...);
32
-#else
33
+#elif X265_ARCH_ARM == 0
34
 #define PFX(stack_pagealign)(func, align) func()
35
 #endif
36
 
37
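
Note: the timer hunk above keeps rdtsc on x86 but falls back to clock() on ARM, since the PMU cycle-counter read left commented out (mrc p15, 0, ..., c9, c13, 0) is privileged unless the kernel enables user-mode access. A sketch of the same fallback shape:

    #include <cstdint>
    #include <ctime>

    static inline uint32_t readCycleCounter()
    {
    #if defined(__i386__) || defined(__x86_64__)
        uint32_t lo;
        asm volatile("rdtsc" : "=a"(lo) :: "edx");   // low 32 bits of the TSC
        return lo;
    #else
        // Coarse fallback: clock() ticks far slower than a cycle counter, so
        // ARM benchmark numbers are only comparable relative to each other.
        return (uint32_t)clock();
    #endif
    }
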
x265_1.9.tar.gz/source/x265-extras.cpp -> x265_2.0.tar.gz/source/x265-extras.cpp Changed
111
 
1
@@ -46,17 +46,17 @@
2
         return NULL;
3
     }
4
 
5
-    FILE *csvfp = fopen(fname, "r");
6
+    FILE *csvfp = x265_fopen(fname, "r");
7
     if (csvfp)
8
     {
9
         /* file already exists, re-open for append */
10
         fclose(csvfp);
11
-        return fopen(fname, "ab");
12
+        return x265_fopen(fname, "ab");
13
     }
14
     else
15
     {
16
         /* new CSV file, write header */
17
-        csvfp = fopen(fname, "wb");
18
+        csvfp = x265_fopen(fname, "wb");
19
         if (csvfp)
20
         {
21
             if (level)
22
@@ -280,9 +280,9 @@
23
     fprintf(csvfp, " %-6u, %-6u, %s\n", stats.maxCLL, stats.maxFALL, api.version_str);
24
 }
25
 
26
-/* The dithering algorithm is based on Sierra-2-4A error diffusion. */
27
-static void ditherPlane(pixel *dst, int dstStride, uint16_t *src, int srcStride,
28
-                        int width, int height, int16_t *errors, int bitDepth)
29
+/* The dithering algorithm is based on Sierra-2-4A error diffusion.
30
+ * We convert planes in place (without allocating a new buffer). */
31
+static void ditherPlane(uint16_t *src, int srcStride, int width, int height, int16_t *errors, int bitDepth)
32
 {
33
     const int lShift = 16 - bitDepth;
34
     const int rShift = 16 - bitDepth + 2;
35
@@ -290,15 +290,34 @@
36
     const int pixelMax = (1 << bitDepth) - 1;
37
 
38
     memset(errors, 0, (width + 1) * sizeof(int16_t));
39
-    int pitch = 1;
40
-    for (int y = 0; y < height; y++, src += srcStride, dst += dstStride)
41
+
42
+    if (bitDepth == 8)
43
     {
44
-        int16_t err = 0;
45
-        for (int x = 0; x < width; x++)
46
+        for (int y = 0; y < height; y++, src += srcStride)
47
         {
48
-            err = err * 2 + errors[x] + errors[x + 1];
49
-            dst[x * pitch] = (pixel)x265_clip3(0, pixelMax, ((src[x * 1] << 2) + err + half) >> rShift);
50
-            errors[x] = err = src[x * pitch] - (dst[x * pitch] << lShift);
51
+            uint8_t* dst = (uint8_t *)src;
52
+            int16_t err = 0;
53
+            for (int x = 0; x < width; x++)
54
+            {
55
+                err = err * 2 + errors[x] + errors[x + 1];
56
+                int tmpDst = x265_clip3(0, pixelMax, ((src[x] << 2) + err + half) >> rShift);
57
+                errors[x] = err = (int16_t)(src[x] - (tmpDst << lShift));
58
+                dst[x] = (uint8_t)tmpDst;
59
+            }
60
+        }
61
+    }
62
+    else
63
+    {
64
+        for (int y = 0; y < height; y++, src += srcStride)
65
+        {
66
+            int16_t err = 0;
67
+            for (int x = 0; x < width; x++)
68
+            {
69
+                err = err * 2 + errors[x] + errors[x + 1];
70
+                int tmpDst = x265_clip3(0, pixelMax, ((src[x] << 2) + err + half) >> rShift);
71
+                errors[x] = err = (int16_t)(src[x] - (tmpDst << lShift));
72
+                src[x] = (uint16_t)tmpDst;
73
+            }
74
         }
75
     }
76
 }
77
@@ -317,10 +336,16 @@
78
         return;
79
     }
80
 
81
+    if (picIn.bitDepth == bitDepth)
82
+    {
83
+        fprintf(stderr, "extras[error]: dither support enabled only if encoder depth is different from picture depth\n");
84
+        return;
85
+    }
86
+
87
     /* This portion of code is from readFrame in x264. */
88
     for (int i = 0; i < x265_cli_csps[picIn.colorSpace].planes; i++)
89
     {
90
-        if ((picIn.bitDepth & 7) && (picIn.bitDepth != 16))
91
+        if (picIn.bitDepth < 16)
92
         {
93
             /* upconvert non 16bit high depth planes to 16bit */
94
             uint16_t *plane = (uint16_t*)picIn.planes[i];
95
@@ -332,14 +357,10 @@
96
             for (uint32_t j = 0; j < pixelCount; j++)
97
                 plane[j] = plane[j] << lShift;
98
         }
99
-    }
100
 
101
-    for (int i = 0; i < x265_cli_csps[picIn.colorSpace].planes; i++)
102
-    {
103
         int height = (int)(picHeight >> x265_cli_csps[picIn.colorSpace].height[i]);
104
         int width = (int)(picWidth >> x265_cli_csps[picIn.colorSpace].width[i]);
105
 
106
-        ditherPlane(((pixel*)picIn.planes[i]), picIn.stride[i] / sizeof(pixel), ((uint16_t*)picIn.planes[i]),
107
-                    picIn.stride[i] / 2, width, height, errorBuf, bitDepth);
108
+        ditherPlane(((uint16_t*)picIn.planes[i]), picIn.stride[i] / 2, width, height, errorBuf, bitDepth);
109
     }
110
 }
111
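
Note: ditherPlane now converts in place, rounding a 16-bit working plane down to the target depth and writing the result back over itself (through a uint8_t alias when bitDepth is 8), with a one-row int16_t buffer carrying the diffused error. A minimal sketch of the per-pixel Sierra-2-4A-style step, assuming the usual rounding constant half = 1 << (rShift - 1):

    #include <cstdint>
    #include <algorithm>

    // One pixel of the in-place step above: src16 is the 16-bit working sample,
    // errors[] is the one-row error buffer, err carries error along the row.
    inline uint16_t ditherPixel(uint16_t src16, int bitDepth, int16_t& err,
                                int16_t* errors, int x)
    {
        const int lShift = 16 - bitDepth;
        const int rShift = 16 - bitDepth + 2;
        const int half = 1 << (rShift - 1);        // rounding constant (assumed)
        const int pixelMax = (1 << bitDepth) - 1;

        err = (int16_t)(err * 2 + errors[x] + errors[x + 1]);
        int out = std::min(std::max(((src16 << 2) + err + half) >> rShift, 0), pixelMax);
        errors[x] = err = (int16_t)(src16 - (out << lShift));  // residual for next row
        return (uint16_t)out;
    }
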
x265_1.9.tar.gz/source/x265.cpp -> x265_2.0.tar.gz/source/x265.cpp Changed
128
 
1
@@ -29,14 +29,10 @@
2
 #include "x265-extras.h"
3
 #include "x265cli.h"
4
 
5
-#include "common.h"
6
 #include "input/input.h"
7
 #include "output/output.h"
8
 #include "output/reconplay.h"
9
 
10
-#include "param.h"
11
-#include "cpu.h"
12
-
13
 #if HAVE_VLD
14
 /* Visual Leak Detector */
15
 #include <vld.h>
16
@@ -312,12 +308,9 @@
17
             OPT("recon-y4m-exec") reconPlayCmd = optarg;
18
             OPT("qpfile")
19
             {
20
-                this->qpfile = fopen(optarg, "rb");
21
+                this->qpfile = x265_fopen(optarg, "rb");
22
                 if (!this->qpfile)
23
-                {
24
-                    x265_log(param, X265_LOG_ERROR, "%s qpfile not found or error in opening qp file\n", optarg);
25
-                    return false;
26
-                }
27
+                    x265_log_file(param, X265_LOG_ERROR, "%s qpfile not found or error in opening qp file\n", optarg);
28
             }
29
             else
30
                 bError |= !!api->param_parse(param, long_options[long_options_index].name, optarg);
31
@@ -378,7 +371,7 @@
32
     this->input = InputFile::open(info, this->bForceY4m);
33
     if (!this->input || this->input->isFail())
34
     {
35
-        x265_log(param, X265_LOG_ERROR, "unable to open input file <%s>\n", inputfn);
36
+        x265_log_file(param, X265_LOG_ERROR, "unable to open input file <%s>\n", inputfn);
37
         return true;
38
     }
39
 
40
@@ -455,10 +448,10 @@
41
     this->output = OutputFile::open(outputfn, info);
42
     if (this->output->isFail())
43
     {
44
-        x265_log(param, X265_LOG_ERROR, "failed to open output file <%s> for writing\n", outputfn);
45
+        x265_log_file(param, X265_LOG_ERROR, "failed to open output file <%s> for writing\n", outputfn);
46
         return true;
47
     }
48
-    general_log(param, this->output->getName(), X265_LOG_INFO, "output file: %s\n", outputfn);
49
+    general_log_file(param, this->output->getName(), X265_LOG_INFO, "output file: %s\n", outputfn);
50
     return false;
51
 }
52
 
53
@@ -497,6 +490,39 @@
54
     return 1;
55
 }
56
 
57
+#ifdef _WIN32
58
+/* Copy of x264 code, which allows for Unicode characters in the command line.
59
+ * Retrieve command line arguments as UTF-8. */
60
+static int get_argv_utf8(int *argc_ptr, char ***argv_ptr)
61
+{
62
+    int ret = 0;
63
+    wchar_t **argv_utf16 = CommandLineToArgvW(GetCommandLineW(), argc_ptr);
64
+    if (argv_utf16)
65
+    {
66
+        int argc = *argc_ptr;
67
+        int offset = (argc + 1) * sizeof(char*);
68
+        int size = offset;
69
+
70
+        for (int i = 0; i < argc; i++)
71
+            size += WideCharToMultiByte(CP_UTF8, 0, argv_utf16[i], -1, NULL, 0, NULL, NULL);
72
+
73
+        char **argv = *argv_ptr = (char**)malloc(size);
74
+        if (argv)
75
+        {
76
+            for (int i = 0; i < argc; i++)
77
+            {
78
+                argv[i] = (char*)argv + offset;
79
+                offset += WideCharToMultiByte(CP_UTF8, 0, argv_utf16[i], -1, argv[i], size - offset, NULL, NULL);
80
+            }
81
+            argv[argc] = NULL;
82
+            ret = 1;
83
+        }
84
+        LocalFree(argv_utf16);
85
+    }
86
+    return ret;
87
+}
88
+#endif
89
+
90
 /* CLI return codes:
91
  *
92
  * 0 - encode successful
93
@@ -517,6 +543,10 @@
94
 
95
     GetConsoleTitle(orgConsoleTitle, CONSOLE_TITLE_SIZE);
96
     SetThreadExecutionState(ES_CONTINUOUS | ES_SYSTEM_REQUIRED | ES_AWAYMODE_REQUIRED);
97
+#if _WIN32
98
+    char** orgArgv = argv;
99
+    get_argv_utf8(&argc, &argv);
100
+#endif
101
 
102
     ReconPlay* reconPlay = NULL;
103
     CLIOptions cliopt;
104
@@ -560,7 +590,7 @@
105
         cliopt.csvfpt = x265_csvlog_open(*api, *param, cliopt.csvfn, cliopt.csvLogLevel);
106
         if (!cliopt.csvfpt)
107
         {
108
-            x265_log(param, X265_LOG_ERROR, "Unable to open CSV log file <%s>, aborting\n", cliopt.csvfn);
109
+            x265_log_file(param, X265_LOG_ERROR, "Unable to open CSV log file <%s>, aborting\n", cliopt.csvfn);
110
             cliopt.destroy();
111
             if (cliopt.api)
112
                 cliopt.api->param_free(cliopt.param);
113
@@ -747,6 +777,14 @@
114
     SetConsoleTitle(orgConsoleTitle);
115
     SetThreadExecutionState(ES_CONTINUOUS);
116
 
117
+#if _WIN32
118
+    if (argv != orgArgv)
119
+    {
120
+        free(argv);
121
+        argv = orgArgv;
122
+    }
123
+#endif
124
+
125
 #if HAVE_VLD
126
     assert(VLDReportLeaks() == 0);
127
 #endif
128
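
Note: get_argv_utf8 above packs the converted argv into a single malloc block: the first (argc + 1) pointer slots are followed directly by the UTF-8 string bytes, so one free() before exit releases everything. A small sketch of that single-allocation idiom (hypothetical helper, without the Windows conversion):

    #include <cstdlib>
    #include <cstring>

    // Copy an argv-style array into one contiguous block: pointer table first,
    // string bytes appended after it. Free with a single free(argv).
    static char** packArgs(const char* const* src, int argc)
    {
        size_t size = (argc + 1) * sizeof(char*);
        for (int i = 0; i < argc; i++)
            size += strlen(src[i]) + 1;
        char** argv = (char**)malloc(size);
        if (!argv) return NULL;
        char* p = (char*)argv + (argc + 1) * sizeof(char*);
        for (int i = 0; i < argc; i++)
        {
            argv[i] = p;
            size_t n = strlen(src[i]) + 1;
            memcpy(p, src[i], n);
            p += n;
        }
        argv[argc] = NULL;
        return argv;
    }
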
x265_1.9.tar.gz/source/x265.h -> x265_2.0.tar.gz/source/x265.h Changed
111
 
1
@@ -98,9 +98,9 @@
2
     uint32_t         sliceType;
3
     uint32_t         numCUsInFrame;
4
     uint32_t         numPartitions;
5
+    int              bScenecut;
6
     void*            interData;
7
     void*            intraData;
8
-    int              bScenecut;
9
 } x265_analysis_data;
10
 
11
 /* cu statistics */
12
@@ -221,6 +221,14 @@
13
     /* Frame level statistics */
14
     x265_frame_stats frameData;
15
 
16
+    /* Ratecontrol statistics for collecting the ratecontrol information.
17
+     * It is not used for collecting the last pass ratecontrol data in 
18
+     * multi pass ratecontrol mode. */
19
+    void*  rcData;
20
+
21
+    uint64_t framesize;
22
+
23
+    int    height;
24
 } x265_picture;
25
 
26
 typedef enum
27
@@ -587,6 +595,11 @@
28
      * Main (0) and High (1) tier. Default is Main tier (0) */
29
     int       bHighTier;
30
 
31
+    /* Enable UHD Blu-ray compatibility support. If specified, the encoder will
32
+     * attempt to modify/set the encode specifications. If the encoder is unable 
33
+     * to do so, this option will be turned OFF. */
34
+    int       uhdBluray;
35
+
36
     /* The maximum number of L0 references a P or B slice may use. This
37
      * influences the size of the decoded picture buffer. The higher this
38
      * number, the more reference frames there will be available for motion
39
@@ -764,7 +777,7 @@
40
      * enabled). At level 2 rate-distortion cost is used to make decimate decisions
41
      * on each 4x4 coding group (including the cost of signaling the group within
42
      * the group bitmap).  Psy-rdoq is less effective at preserving energy when
43
-     * RDOQ is at level 2 */
44
+     * RDOQ is at level 2. Default: 0 */
45
     int       rdoqLevel;
46
 
47
     /* Enable the implicit signaling of the sign bit of the last coefficient of
48
@@ -896,23 +909,27 @@
49
     /* Note: when deblocking and SAO are both enabled, the loop filter CU lag is
50
      * only one row, as they operate in series on the same row. */
51
 
52
-    /* Select the method in which SAO deals with deblocking boundary pixels.  If
53
+    /* Select the method in which SAO deals with deblocking boundary pixels. If
54
      * disabled the right and bottom boundary areas are skipped. If enabled,
55
      * non-deblocked pixels are used entirely. Default is disabled */
56
     int       bSaoNonDeblocked;
57
 
58
     /*== Analysis tools ==*/
59
 
60
-    /* A value between X265_NO_RDO_NO_RDOQ and X265_RDO_LEVEL which determines
61
-     * the level of rate distortion optimizations to perform during mode
62
-     * decisions and quantization. The more RDO the better the compression
63
-     * efficiency at a major cost of performance. Default is no RDO (0) */
64
+    /* A value between 1 and 6 (both inclusive) which determines the level of 
65
+     * rate distortion optimizations to perform during mode and depth decisions.
66
+     * The more RDO the better the compression efficiency at a major cost of 
67
+     * performance. Default is 3 */
68
     int       rdLevel;
69
 
70
-    /* Enable early skip decisions to avoid intra and inter analysis in likely
71
+    /* Enable early skip decisions to avoid analysing additional modes in likely
72
      * skip blocks. Default is disabled */
73
     int       bEnableEarlySkip;
74
 
75
+    /* Enable early CU size decisions to avoid recursing to higher depths. 
76
+     * Default is enabled */
77
+    int bEnableRecursionSkip;
78
+
79
     /* Use a faster search method to find the best intra mode. Default is 0 */
80
     int       bEnableFastIntra;
81
 
82
@@ -947,10 +964,16 @@
83
     double    psyRd;
84
 
85
     /* Strength of psycho-visual optimizations in quantization. Only has an
86
-     * effect in presets which use RDOQ (rd-levels 4 and 5).  The value must be
87
-     * between 0 and 50, 1.0 is typical. Default 1.0 */
88
+     * effect when RDOQ is enabled (presets slow, slower and veryslow). The 
89
+     * value must be between 0 and 50, 1.0 is typical. Default 0 */
90
     double    psyRdoq;
91
 
92
+    /* Perform quantisation parameter based RD refinement. RD cost is calculated
93
+     * on the best CU partitions, chosen after the CU analysis, for a range of QPs
94
+     * to find the optimal rounding effect. Only effective at rd-levels 5 and 6.
95
+     * Default disabled */
96
+    int       bEnableRdRefine;
97
+
98
     /* If X265_ANALYSIS_SAVE, write per-frame analysis information into analysis
99
      * buffers.  if X265_ANALYSIS_LOAD, read analysis information into analysis
100
      * buffer and use this analysis information to reduce the amount of work
101
@@ -1083,6 +1106,9 @@
102
          * (QG) size. Allowed values are 64, 32, 16 provided it falls within the
103
         * inclusive range [maxCUSize, minCUSize]. Experimental, default: maxCUSize */
104
         uint32_t qgSize;
105
+
106
+        /* internally enable if tune grain is set */
107
+        int      bEnableGrain;
108
     } rc;
109
 
110
     /*== Video Usability Information ==*/
111
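
Note: the public API grows several new x265_param fields that back the CLI flags added below in x265cli.h. A hedged sketch of enabling them directly on the param struct (field names are verbatim from this header; whether a given build honors them depends on profile/level checks, e.g. uhdBluray is turned off again if the encode cannot comply):

    #include "x265.h"

    void enableNew20Features(x265_param* param)
    {
        param->uhdBluray = 1;             /* --uhd-bd: enforce UHD Blu-ray constraints */
        param->bEnableRecursionSkip = 1;  /* --rskip: early exit from CU recursion */
        param->bEnableRdRefine = 1;       /* --rd-refine: QP-based RD refinement (rd 5/6) */
        param->rc.bEnableGrain = 1;       /* --rc-grain: grain-oriented rate control */
    }
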
x265_1.9.tar.gz/source/x265cli.h -> x265_2.0.tar.gz/source/x265cli.h Changed
67
 
1
@@ -53,6 +53,7 @@
2
     { "profile",        required_argument, NULL, 'P' },
3
     { "level-idc",      required_argument, NULL, 0 },
4
     { "high-tier",            no_argument, NULL, 0 },
5
+    { "uhd-bd",               no_argument, NULL, 0 },
6
     { "no-high-tier",         no_argument, NULL, 0 },
7
     { "allow-non-conformance",no_argument, NULL, 0 },
8
     { "no-allow-non-conformance",no_argument, NULL, 0 },
9
@@ -96,6 +97,8 @@
10
     { "amp",                  no_argument, NULL, 0 },
11
     { "no-early-skip",        no_argument, NULL, 0 },
12
     { "early-skip",           no_argument, NULL, 0 },
13
+    { "no-rskip",             no_argument, NULL, 0 },
14
+    { "rskip",                no_argument, NULL, 0 },
15
     { "no-fast-cbf",          no_argument, NULL, 0 },
16
     { "fast-cbf",             no_argument, NULL, 0 },
17
     { "no-tskip",             no_argument, NULL, 0 },
18
@@ -143,6 +146,8 @@
19
     { "qp",             required_argument, NULL, 'q' },
20
     { "aq-mode",        required_argument, NULL, 0 },
21
     { "aq-strength",    required_argument, NULL, 0 },
22
+    { "rc-grain",             no_argument, NULL, 0 },
23
+    { "no-rc-grain",          no_argument, NULL, 0 },
24
     { "ipratio",        required_argument, NULL, 0 },
25
     { "pbratio",        required_argument, NULL, 0 },
26
     { "qcomp",          required_argument, NULL, 0 },
27
@@ -159,6 +164,8 @@
28
     { "psy-rdoq",       required_argument, NULL, 0 },
29
     { "no-psy-rd",            no_argument, NULL, 0 },
30
     { "no-psy-rdoq",          no_argument, NULL, 0 },
31
+    { "rd-refine",            no_argument, NULL, 0 },
32
+    { "no-rd-refine",         no_argument, NULL, 0 },
33
     { "scaling-list",   required_argument, NULL, 0 },
34
     { "lossless",             no_argument, NULL, 0 },
35
     { "no-lossless",          no_argument, NULL, 0 },
36
@@ -279,6 +286,7 @@
37
     H0("-P/--profile <string>            Enforce an encode profile: main, main10, mainstillpicture\n");
38
     H0("   --level-idc <integer|float>   Force a minimum required decoder level (as '5.0' or '50')\n");
39
     H0("   --[no-]high-tier              If a decoder level is specified, this modifier selects High tier of that level\n");
40
+    H0("   --uhd-bd                      Enable UHD Bluray compatibility support\n");
41
     H0("   --[no-]allow-non-conformance  Allow the encoder to generate profile NONE bitstreams. Default %s\n", OPT(param->bAllowNonConformance));
42
     H0("\nThreading, performance:\n");
43
     H0("   --pools <integer,...>         Comma separated thread count per thread pool (pool per NUMA node)\n");
44
@@ -300,11 +308,13 @@
45
     H0("   --tu-intra-depth <integer>    Max TU recursive depth for intra CUs. Default %d\n", param->tuQTMaxIntraDepth);
46
     H0("   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
47
     H0("\nAnalysis:\n");
48
-    H0("   --rd <0..6>                   Level of RDO in mode decision 0:least....6:full RDO. Default %d\n", param->rdLevel);
49
+    H0("   --rd <1..6>                   Level of RDO in mode decision 1:least....6:full RDO. Default %d\n", param->rdLevel);
50
     H0("   --[no-]psy-rd <0..5.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
51
     H0("   --[no-]rdoq-level <0|1|2>     Level of RDO in quantization 0:none, 1:levels, 2:levels & coding groups. Default %d\n", param->rdoqLevel);
52
     H0("   --[no-]psy-rdoq <0..50.0>     Strength of psycho-visual optimization in RDO quantization, 0 to disable. Default %.1f\n", param->psyRdoq);
53
+    H0("   --[no-]rd-refine              Enable QP based RD refinement for rd levels 5 and 6. Default %s\n", OPT(param->bEnableRdRefine));
54
     H0("   --[no-]early-skip             Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
55
+    H0("   --[no-]rskip                  Enable early exit from recursion. Default %s\n", OPT(param->bEnableRecursionSkip));
56
     H1("   --[no-]tskip-fast             Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
57
     H1("   --nr-intra <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n");
58
     H1("   --nr-inter <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in inter CUs. Default 0\n");
59
@@ -373,6 +383,7 @@
60
     H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
61
     H0("   --qg-size <int>               Specifies the size of the quantization group (64, 32, 16). Default %d\n", param->rc.qgSize);
62
     H0("   --[no-]cutree                 Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
63
+    H0("   --[no-]rc-grain               Enable ratecontrol mode to handle grains specifically. turned on with tune grain. Default %s\n", OPT(param->rc.bEnableGrain));
64
     H1("   --ipratio <float>             QP factor between I and P. Default %.2f\n", param->rc.ipFactor);
65
     H1("   --pbratio <float>             QP factor between P and B. Default %.2f\n", param->rc.pbFactor);
66
     H1("   --qcomp <float>               Weight given to predicted complexity. Default %.2f\n", param->rc.qCompress);
67
Olaf Hering

olh wrote over 8 years ago

Why the new condition for 13.1, what's different in that libnuma variant?

Request History

enzokiel created request over 8 years ago

Update to version 2.0



olh accepted request over 8 years ago