Changes of Revision 19

x265.changes Changed
x
 
1
@@ -1,4 +1,34 @@
2
 -------------------------------------------------------------------
3
+Sun Jan  1 20:32:07 UTC 2017 - idonmez@suse.com
4
+
5
+-  Update to version 2.2
6
+   Encode enhancements
7
+   * Enhancements to TU selection algorithm with early-outs for
8
+     improved speed; use --limit-tu to exercise.
9
+   * New motion search method SEA (Successive Elimination Algorithm)
10
+     supported now as --me 4
11
+   * Bit-stream optimizations to improve fields in PPS and SPS for
12
+     bit-rate savings through --[no-]opt-qp-pps, 
13
+     --[no-]opt-ref-list-length-pps, and --[no-]multi-pass-opt-rps.
14
+   * Enabled using VBV constraints when encoding without WPP.
15
+   * All param options dumped in SEI packet in bitstream when info
16
+     selected.
17
+   API changes
18
+   * Options to disable SEI and optional-VUI messages from bitstream
19
+     made more descriptive.
20
+   * New option --scenecut-bias to enable controlling bias to mark
21
+     scene-cuts via cli.
22
+   * Support mono and mono16 color spaces for y4m input.
23
+   * --min-cu-size of 64 no-longer supported for reasons of
24
+     visual quality.
25
+   * API for CSV now expects version string for better integration
26
+     of x265 into other applications.
27
+   Bug fixes
28
+   * Several fixes to slice-based encoding.
29
+   * --log2-max-poc-lsb's range limited according to HEVC spec.
30
+   * Restrict MVs to within legal boundaries when encoding.
31
+
32
+-------------------------------------------------------------------
33
 Thu Dec 22 12:59:47 UTC 2016 - scarabeus@opensuse.org
34
 
35
 - Add conditional for the numa-devel again; it was not meant to be dropped
36
x265.spec Changed
14
 
1
@@ -1,10 +1,10 @@
2
 # based on the spec file from https://build.opensuse.org/package/view_file/home:Simmphonie/libx265/
3
 
4
 Name:           x265
5
-%define soname  95
6
+%define soname  102
7
 %define libname lib%{name}
8
 %define libsoname %{libname}-%{soname}
9
-Version:        2.1
10
+Version:        2.2
11
 Release:        0
12
 License:        GPL-2.0+
13
 Summary:        A free h265/HEVC encoder - encoder binary
14
arm.patch Changed
61
 
1
@@ -1,11 +1,11 @@
2
-Index: x265_2.1/source/CMakeLists.txt
3
+Index: x265_2.2/source/CMakeLists.txt
4
 ===================================================================
5
---- x265_2.1.orig/source/CMakeLists.txt
6
-+++ x265_2.1/source/CMakeLists.txt
7
-@@ -60,15 +60,22 @@ elseif(POWERMATCH GREATER "-1")
8
-     message(STATUS "Detected POWER target processor")
9
-     set(POWER 1)
10
-     add_definitions(-DX265_ARCH_POWER=1)
11
+--- x265_2.2.orig/source/CMakeLists.txt
12
++++ x265_2.2/source/CMakeLists.txt
13
+@@ -65,15 +65,22 @@ elseif(POWERMATCH GREATER "-1")
14
+         add_definitions(-DPPC64=1)
15
+         message(STATUS "Detected POWER PPC64 target processor")
16
+     endif()
17
 -elseif(ARMMATCH GREATER "-1")
18
 -    if(CROSS_COMPILE_ARM)
19
 -        message(STATUS "Cross compiling for ARM arch")
20
@@ -34,7 +34,7 @@
21
  else()
22
      message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
23
      message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
24
-@@ -190,18 +197,9 @@ if(GCC)
25
+@@ -208,18 +215,9 @@ if(GCC)
26
              endif()
27
          endif()
28
      endif()
29
@@ -55,10 +55,10 @@
30
      if(FPROFILE_GENERATE)
31
          if(INTEL_CXX)
32
              add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}")
33
-Index: x265_2.1/source/common/cpu.cpp
34
+Index: x265_2.2/source/common/cpu.cpp
35
 ===================================================================
36
---- x265_2.1.orig/source/common/cpu.cpp
37
-+++ x265_2.1/source/common/cpu.cpp
38
+--- x265_2.2.orig/source/common/cpu.cpp
39
++++ x265_2.2/source/common/cpu.cpp
40
 @@ -37,7 +37,7 @@
41
  #include <machine/cpu.h>
42
  #endif
43
@@ -68,7 +68,7 @@
44
  #include <signal.h>
45
  #include <setjmp.h>
46
  static sigjmp_buf jmpbuf;
47
-@@ -340,7 +340,6 @@ uint32_t cpu_detect(void)
48
+@@ -344,7 +344,6 @@ uint32_t cpu_detect(void)
49
      }
50
  
51
      canjump = 1;
52
@@ -76,7 +76,7 @@
53
      canjump = 0;
54
      signal(SIGILL, oldsig);
55
  #endif // if !HAVE_NEON
56
-@@ -356,7 +355,7 @@ uint32_t cpu_detect(void)
57
+@@ -360,7 +359,7 @@ uint32_t cpu_detect(void)
58
      // which may result in incorrect detection and the counters stuck enabled.
59
      // right now Apple does not seem to support performance counters for this test
60
  #ifndef __MACH__
61
baselibs.conf Changed
4
 
1
@@ -1,1 +1,1 @@
2
-libx265-95
3
+libx265-102
4
x265_2.1.tar.gz/.hg_archival.txt -> x265_2.2.tar.gz/.hg_archival.txt Changed
10
 
1
@@ -1,6 +1,4 @@
2
 repo: 09fe40627f03a0f9c3e6ac78b22ac93da23f9fdf
3
-node: 3e8ce3b26319dbd53ab6369e4c4e986bf30f1315
4
+node: be14a7e9755e54f0fd34911c72bdfa66981220bc
5
 branch: stable
6
-latesttag: 2.1
7
-latesttagdistance: 1
8
-changessincelatesttag: 1
9
+tag: 2.2
10
x265_2.1.tar.gz/doc/reST/cli.rst -> x265_2.2.tar.gz/doc/reST/cli.rst Changed
124
 
1
@@ -662,7 +662,7 @@
2
    and less frame parallelism as well. Because of this the faster
3
    presets use a CU size of 32. Default: 64
4
 
5
-.. option:: --min-cu-size <64|32|16|8>
6
+.. option:: --min-cu-size <32|16|8>
7
 
8
    Minimum CU size (width and height). By using 16 or 32 the encoder
9
    will not analyze the cost of CUs below that minimum threshold,
10
@@ -869,6 +869,24 @@
11
    partitions, in which case a TU split is implied and thus the
12
    residual quad-tree begins one layer below the CU quad-tree.
13
 
14
+.. option:: --limit-tu <0..4>
15
+
16
+   Enables early exit from TU depth recursion, for inter coded blocks.
17
+   Level 1 - decides to recurse to next higher depth based on cost 
18
+   comparison of full size TU and split TU.
19
+   
20
+   Level 2 - based on first split subTU's depth, limits recursion of
21
+   other split subTUs.
22
+   
23
+   Level 3 - based on the average depth of the co-located and the neighbor
24
+   CUs' TU depth, limits recursion of the current CU.
25
+   
26
+   Level 4 - uses the TU depth of the neighbouring/co-located CUs
27
+   to limit the 1st subTU depth. The 1st subTU depth is taken as the 
28
+   limiting depth for the other subTUs.
29
+
30
+   Default: 0
31
+
32
 .. option:: --nr-intra <integer>, --nr-inter <integer>
33
 
34
    Noise reduction - an adaptive deadzone applied after DCT
35
@@ -949,13 +967,17 @@
36
    encoder: a star-pattern search followed by an optional radix scan
37
    followed by an optional star-search refinement. Full is an
38
    exhaustive search; an order of magnitude slower than all other
39
-   searches but not much better than umh or star.
40
+   searches but not much better than umh or star. SEA is similar to
41
+   FULL search; a three step motion search adopted from x264: DC 
42
+   calculation followed by ADS calculation followed by SAD of the
43
+   passed motion vector candidates, hence faster than Full search. 
44
 
45
    0. dia
46
    1. hex **(default)**
47
    2. umh
48
    3. star
49
-   4. full
50
+   4. sea
51
+   5. full
52
 
53
 .. option:: --subme, -m <0..7>
54
 
55
@@ -1153,6 +1175,13 @@
56
    :option:`--scenecut` 0 or :option:`--no-scenecut` disables adaptive
57
    I frame placement. Default 40
58
 
59
+.. option:: --scenecut-bias <0..100.0>
60
+
61
+   This value represents the percentage difference between the inter cost and
62
+   intra cost of a frame used in scenecut detection. For example, a value of 5 indicates,
63
+   if the inter cost of a frame is greater than or equal to 95 percent of the intra cost of the frame,
64
+   then detect this frame as scenecut. Values between 5 and 15 are recommended. Default 5. 
65
+   
66
 .. option:: --intra-refresh
67
 
68
    Enables Periodic Intra Refresh(PIR) instead of keyframe insertion.
69
@@ -1304,7 +1333,7 @@
70
    slices using param->rc.ipFactor and param->rc.pbFactor unless QP 0
71
    is specified, in which case QP 0 is used for all slice types.  Note
72
    that QP 0 does not cause lossless encoding, it only disables
73
-   quantization. Default disabled (CRF)
74
+   quantization. Default disabled.
75
 
76
    **Range of values:** an integer from 0 to 51
77
 
78
@@ -1824,7 +1853,7 @@
79
    enhancement layer. A decoder may choose to drop the enhancement layer 
80
    and only decode and display the base layer slices.
81
    
82
-   If used with a fixed GOP (:option:`b-adapt` 0) and :option:`bframes`
83
+   If used with a fixed GOP (:option:`--b-adapt` 0) and :option:`--bframes`
84
    3 then the two layers evenly split the frame rate, with a cadence of
85
    PbBbP. You probably also want :option:`--no-scenecut` and a keyframe
86
    interval that is a multiple of 4.
87
@@ -1833,15 +1862,29 @@
88
 
89
   Maximum of the picture order count. Default 8
90
 
91
-.. option:: --discard-sei
92
+.. option:: --[no-]vui-timing-info
93
 
94
-  Discard SEI messages generated from the final bitstream. HDR-related SEI
95
-  messages are always dumped, immaterial of this option. Default disabled.
96
-   
97
-.. option:: --discard-vui
98
+   Emit VUI timing info in bitstream. Default enabled.
99
+
100
+.. option:: --[no-]vui-hrd-info
101
+
102
+   Emit VUI HRD info in the bitstream. Default enabled when
103
+   :option:`--hrd` is enabled.
104
+
105
+.. option:: --[no-]opt-qp-pps
106
+
107
+   Optimize QP in PPS (instead of default value of 26) based on the QP values
108
+   observed in last GOP. Default enabled.
109
+
110
+.. option:: --[no-]opt-ref-list-length-pps
111
+
112
+   Optimize L0 and L1 ref list length in PPS (instead of default value of 0)
113
+   based on the lengths observed in the last GOP. Default enabled.
114
+
115
+.. option:: --[no-]multi-pass-opt-rps
116
+
117
+   Enable storing commonly used RPS in SPS in multi pass mode. Default disabled.
118
 
119
-   Discard optional VUI information (timing, HRD info) from the
120
-   bitstream. Default disabled.
121
 
122
 Debugging options
123
 =================
124
x265_2.1.tar.gz/doc/reST/index.rst -> x265_2.2.tar.gz/doc/reST/index.rst Changed
6
 
1
@@ -9,3 +9,4 @@
2
    threading
3
    presets
4
    lossless
5
+   releasenotes
6
x265_2.2.tar.gz/doc/reST/releasenotes.rst Added
143
 
1
@@ -0,0 +1,141 @@
2
+*************
3
+Release Notes
4
+*************
5
+
6
+Version 2.2
7
+===========
8
+
9
+Release date - 26th December, 2016.
10
+
11
+Encoder enhancements
12
+--------------------
13
+1. Enhancements to TU selection algorithm with early-outs for improved speed; use :option:`--limit-tu` to exercise.
14
+2. New motion search method SEA (Successive Elimination Algorithm) supported now as :option:`--me` 4
15
+3. Bit-stream optimizations to improve fields in PPS and SPS for bit-rate savings through :option:`--[no-]opt-qp-pps`, :option:`--[no-]opt-ref-list-length-pps`, and :option:`--[no-]multi-pass-opt-rps`.
16
+4. Enabled using VBV constraints when encoding without WPP.
17
+5. All param options dumped in SEI packet in bitstream when info selected.
18
+6. x265 now supports POWERPC-based systems. Several key functions also have optimized ALTIVEC kernels.
19
+
20
+API changes
21
+-----------
22
+1. Options to disable SEI and optional-VUI messages from bitstream made more descriptive.
23
+2. New option :option:`--scenecut-bias` to enable controlling bias to mark scene-cuts via cli.
24
+3. Support mono and mono16 color spaces for y4m input.
25
+4. :option:`--min-cu-size` of 64 no-longer supported for reasons of visual quality (was crashing earlier anyways.)
26
+5. API for CSV now expects version string for better integration of x265 into other applications.
27
+
28
+Bug fixes
29
+---------
30
+1. Several fixes to slice-based encoding.
31
+2. :option:`--log2-max-poc-lsb`'s range limited according to HEVC spec.
32
+3. Restrict MVs to within legal boundaries when encoding.
33
+
34
+Version 2.1
35
+===========
36
+
37
+Release date - 27th September, 2016
38
+
39
+Encoder enhancements
40
+--------------------
41
+1. Support for qg-size of 8
42
+2. Support for inserting non-IDR I-frames at scenecuts and when running with settings for fixed-GOP (min-keyint = max-keyint)
43
+3. Experimental support for slice-parallelism.
44
+
45
+API changes
46
+-----------
47
+1. Encode user-defined SEI messages passed in through x265_picture object.
48
+2. Disable SEI and VUI messages from the bitstream
49
+3. Specify qpmin and qpmax
50
+4. Control number of bits to encode POC.
51
+
52
+Bug fixes
53
+---------
54
+1. QP fluctuation fix for first B-frame in mini-GOP for 2-pass encoding with tune-grain.
55
+2. Assembly fix for crashes in 32-bit from dct_sse4.
56
+3. Threadpool creation fix in windows platform.
57
+
58
+Version 2.0
59
+===========
60
+
61
+Release date - 13th July, 2016
62
+
63
+New Features
64
+------------
65
+
66
+1. uhd-bd: Enable Ultra-HD Bluray support
67
+2. rskip: Enables skipping recursion to analyze lower CU sizes using heuristics at different rd-levels. Provides good visual quality gains at the highest quality presets. 
68
+3. rc-grain: Enables a new ratecontrol mode specifically for grainy content. Strictly prevents QP oscillations within and between frames to avoid grain fluctuations.
69
+4. tune grain: A fully refactored and improved option to encode film grain content including QP control as well as analysis options.
70
+5. asm: ARM assembly is now enabled by default, native or cross compiled builds supported on armv6 and later systems.
71
+
72
+API and Key Behaviour Changes
73
+-----------------------------
74
+
75
+1. x265_rc_stats added to x265_picture, containing all RC decision points for that frame
76
+2. PTL: high tier is now allowed by default, chosen only if necessary
77
+3. multi-pass: First pass now uses slow-firstpass by default, enabling better RC decisions in future passes 
78
+4. pools: fix behaviour on multi-socketed Windows systems, provide more flexibility in determining thread and pool counts
79
+5. ABR: improve bits allocation in the first few frames, abr reset, vbv and cutree improved
80
+
81
+Misc
82
+----
83
+1. An SSIM calculation bug was corrected
84
+
85
+Version 1.9
86
+===========
87
+
88
+Release date - 29th January, 2016
89
+
90
+New Features
91
+------------
92
+
93
+1. Quant offsets: This feature allows block level quantization offsets to be specified for every frame. An API-only feature.
94
+2. --intra-refresh: Keyframes can be replaced by a moving column of intra blocks in non-keyframes.
95
+3. --limit-modes: Intelligently restricts mode analysis. 
96
+4. --max-luma and --min-luma for luma clipping, optional for HDR use-cases
97
+5. Emergency denoising is now enabled by default in very low bitrate, VBV encodes
98
+
99
+API Changes
100
+-----------
101
+
102
+1. x265_frame_stats returns many additional fields: maxCLL, maxFALL, residual energy, scenecut  and latency logging
103
+2. --qpfile now supports frametype 'K'
104
+3. x265 now allows CRF ratecontrol in pass N (N greater than or equal to 2)
105
+4. Chroma subsampling format YUV 4:0:0 is now fully supported and tested
106
+
107
+Presets and Performance
108
+-----------------------
109
+
110
+1. Recently added features lookahead-slices, limit-modes, limit-refs have been enabled by default for applicable presets.
111
+2. The default psy-rd strength has been increased to 2.0
112
+3. Multi-socket machines now use a single pool of threads that can work cross-socket.
113
+
114
+Version 1.8
115
+===========
116
+
117
+Release date - 10th August, 2015
118
+
119
+API Changes
120
+-----------
121
+1. Experimental support for Main12 is now enabled. Partial assembly support exists. 
122
+2. Main12 and Intra/Still picture profiles are now supported. Still picture profile is detected based on x265_param::totalFrames.
123
+3. Three classes of encoding statistics are now available through the API. 
124
+a) x265_stats - contains encoding statistics, available through x265_encoder_get_stats()
125
+b) x265_frame_stats and x265_cu_stats - contains frame encoding statistics, available through recon x265_picture
126
+4. --csv
127
+a) x265_encoder_log() is now deprecated
128
+b) x265_param::csvfn is also deprecated
129
+5. --log-level now controls only console logging, frame level console logging has been removed.
130
+6. Support added for new color transfer characteristic ARIB STD-B67
131
+
132
+New Features
133
+------------
134
+1. limit-refs: This feature limits the references analysed for individual CUs. Provides a nice tradeoff between efficiency and performance.
135
+2. aq-mode 3: A new aq-mode that provides additional biasing for low-light conditions.
136
+3. An improved scene cut detection logic that allows ratecontrol to manage visual quality at fade-ins and fade-outs better.
137
+
138
+Preset and Tune Options
139
+-----------------------
140
+
141
+1. tune grain: Increases psyRdoq strength to 10.0, and rdoq-level to 2.
142
+2. qg-size: Default value changed to 32.
143
x265_2.1.tar.gz/source/CMakeLists.txt -> x265_2.2.tar.gz/source/CMakeLists.txt Changed
65
 
1
@@ -30,7 +30,7 @@
2
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
3
 
4
 # X265_BUILD must be incremented each time the public API is changed
5
-set(X265_BUILD 95)
6
+set(X265_BUILD 102)
7
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
8
                "${PROJECT_BINARY_DIR}/x265.def")
9
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
10
@@ -60,6 +60,11 @@
11
     message(STATUS "Detected POWER target processor")
12
     set(POWER 1)
13
     add_definitions(-DX265_ARCH_POWER=1)
14
+    if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
15
+        set(PPC64 1)
16
+        add_definitions(-DPPC64=1)
17
+        message(STATUS "Detected POWER PPC64 target processor")
18
+    endif()
19
 elseif(ARMMATCH GREATER "-1")
20
     if(CROSS_COMPILE_ARM)
21
         message(STATUS "Cross compiling for ARM arch")
22
@@ -167,6 +172,19 @@
23
 elseif(CMAKE_COMPILER_IS_GNUCXX)
24
     set(GCC 1)
25
 endif()
26
+
27
+if(CC STREQUAL "xlc")
28
+    message(STATUS "Use XLC compiler")
29
+    set(XLC 1)
30
+    set(GCC 0)
31
+    #set(CMAKE_C_COMPILER "/usr/bin/xlc")
32
+    #set(CMAKE_CXX_COMPILER "/usr/bin/xlc++")
33
+    add_definitions(-D__XLC__=1)
34
+    add_definitions(-O3 -qstrict -qhot -qaltivec)
35
+    add_definitions(-qinline=level=10 -qpath=IL:/data/video_files/latest.tpo/)
36
+endif()
37
+
38
+
39
 if(GCC)
40
     add_definitions(-Wall -Wextra -Wshadow)
41
     add_definitions(-D__STDC_LIMIT_MACROS=1)
42
@@ -396,6 +414,22 @@
43
     endif(WINXP_SUPPORT)
44
 endif()
45
 
46
+if(POWER)
47
+    # IBM Power8
48
+    option(ENABLE_ALTIVEC "Enable ALTIVEC profiling instrumentation" ON)
49
+    if(ENABLE_ALTIVEC)
50
+        add_definitions(-DHAVE_ALTIVEC=1 -maltivec -mabi=altivec)
51
+        add_definitions(-flax-vector-conversions -fpermissive)
52
+    else()
53
+        add_definitions(-DHAVE_ALTIVEC=0)
54
+    endif()
55
+
56
+    option(CPU_POWER8 "Enable CPU POWER8 profiling instrumentation" ON)
57
+    if(CPU_POWER8)
58
+        add_definitions(-mcpu=power8 -DX265_ARCH_POWER8=1)
59
+    endif()
60
+endif()
61
+
62
 include(version) # determine X265_VERSION and X265_LATEST_TAG
63
 include_directories(. common encoder "${PROJECT_BINARY_DIR}")
64
 
65
x265_2.1.tar.gz/source/common/CMakeLists.txt -> x265_2.2.tar.gz/source/common/CMakeLists.txt Changed
30
 
1
@@ -99,6 +99,19 @@
2
     source_group(Assembly FILES ${ASM_PRIMITIVES})
3
 endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
4
 
5
+if(POWER)
6
+    set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS -DX265_VERSION=${X265_VERSION})
7
+    if(ENABLE_ALTIVEC)
8
+        set(ALTIVEC_SRCS pixel_altivec.cpp dct_altivec.cpp ipfilter_altivec.cpp intrapred_altivec.cpp)
9
+        foreach(SRC ${ALTIVEC_SRCS})
10
+            set(ALTIVEC_PRIMITIVES ${ALTIVEC_PRIMITIVES} ppc/${SRC})
11
+        endforeach()
12
+        source_group(Intrinsics_altivec FILES ${ALTIVEC_PRIMITIVES})
13
+        set_source_files_properties(${ALTIVEC_PRIMITIVES} PROPERTIES COMPILE_FLAGS "-Wno-unused  -Wno-unknown-pragmas -Wno-maybe-uninitialized")
14
+    endif()
15
+endif()
16
+
17
+
18
 # set_target_properties can't do list expansion
19
 string(REPLACE ";" " " VERSION_FLAGS "${VFLAGS}")
20
 set_source_files_properties(version.cpp PROPERTIES COMPILE_FLAGS ${VERSION_FLAGS})
21
@@ -116,7 +129,7 @@
22
 endif(WIN32)
23
 
24
 add_library(common OBJECT
25
-    ${ASM_PRIMITIVES} ${VEC_PRIMITIVES} ${WINXP}
26
+    ${ASM_PRIMITIVES} ${VEC_PRIMITIVES} ${ALTIVEC_PRIMITIVES} ${WINXP}
27
     primitives.cpp primitives.h
28
     pixel.cpp dct.cpp ipfilter.cpp intrapred.cpp loopfilter.cpp
29
     constants.cpp constants.h
30
x265_2.1.tar.gz/source/common/bitstream.h -> x265_2.2.tar.gz/source/common/bitstream.h Changed
9
 
1
@@ -71,6 +71,7 @@
2
     uint32_t getNumberOfWrittenBytes() const { return m_byteOccupancy; }
3
     uint32_t getNumberOfWrittenBits()  const { return m_byteOccupancy * 8 + m_partialByteBits; }
4
     const uint8_t* getFIFO() const           { return m_fifo; }
5
+    void     copyBits(Bitstream* stream)     { m_partialByteBits = stream->m_partialByteBits; m_byteOccupancy = stream->m_byteOccupancy; m_partialByte = stream->m_partialByte; }
6
 
7
     void     write(uint32_t val, uint32_t numBits);
8
     void     writeByte(uint32_t val);
9
x265_2.1.tar.gz/source/common/common.h -> x265_2.2.tar.gz/source/common/common.h Changed
27
 
1
@@ -176,7 +176,7 @@
2
 
3
 #define X265_MIN(a, b) ((a) < (b) ? (a) : (b))
4
 #define X265_MAX(a, b) ((a) > (b) ? (a) : (b))
5
-#define COPY1_IF_LT(x, y) if ((y) < (x)) (x) = (y);
6
+#define COPY1_IF_LT(x, y) {if ((y) < (x)) (x) = (y);}
7
 #define COPY2_IF_LT(x, y, a, b) \
8
     if ((y) < (x)) \
9
     { \
10
@@ -312,6 +312,7 @@
11
 
12
 #define MAX_NUM_REF_PICS            16 // max. number of pictures used for reference
13
 #define MAX_NUM_REF                 16 // max. number of entries in picture reference list
14
+#define MAX_NUM_SHORT_TERM_RPS      64 // max. number of short term reference picture set in SPS
15
 
16
 #define REF_NOT_VALID               -1
17
 
18
@@ -327,6 +328,8 @@
19
 
20
 #define PIXEL_MAX ((1 << X265_DEPTH) - 1)
21
 
22
+#define INTEGRAL_PLANE_NUM          12 // 12 integral planes for 32x32, 32x24, 32x8, 24x32, 16x16, 16x12, 16x4, 12x16, 8x32, 8x8, 4x16 and 4x4.
23
+
24
 namespace X265_NS {
25
 
26
 enum { SAO_NUM_OFFSET = 4 };
27
x265_2.1.tar.gz/source/common/cpu.cpp -> x265_2.2.tar.gz/source/common/cpu.cpp Changed
32
 
1
@@ -99,6 +99,10 @@
2
     { "ARMv6",           X265_CPU_ARMV6 },
3
     { "NEON",            X265_CPU_NEON },
4
     { "FastNeonMRC",     X265_CPU_FAST_NEON_MRC },
5
+
6
+#elif X265_ARCH_POWER8
7
+    { "Altivec",         X265_CPU_ALTIVEC },
8
+
9
 #endif // if X265_ARCH_X86
10
     { "", 0 },
11
 };
12
@@ -363,7 +367,18 @@
13
     return flags;
14
 }
15
 
16
-#else // if X265_ARCH_X86
17
+#elif X265_ARCH_POWER8
18
+
19
+uint32_t cpu_detect(void)
20
+{
21
+#if HAVE_ALTIVEC
22
+    return X265_CPU_ALTIVEC;
23
+#else
24
+    return 0;
25
+#endif
26
+}
27
+
28
+#else // if X265_ARCH_POWER8
29
 
30
 uint32_t cpu_detect(void)
31
 {
32
x265_2.1.tar.gz/source/common/cudata.cpp -> x265_2.2.tar.gz/source/common/cudata.cpp Changed
11
 
1
@@ -296,6 +296,9 @@
2
     /* initialize the remaining CU data in one memset */
3
     memset(m_cuDepth, 0, (frame.m_param->internalCsp == X265_CSP_I400 ? BytesPerPartition - 11 : BytesPerPartition - 7) * m_numPartitions);
4
 
5
+    for (int8_t i = 0; i < NUM_TU_DEPTH; i++)
6
+        m_refTuDepth[i] = -1;
7
+
8
     uint32_t widthInCU = m_slice->m_sps->numCuInWidth;
9
     m_cuLeft = (m_cuAddr % widthInCU) ? m_encData->getPicCTU(m_cuAddr - 1) : NULL;
10
     m_cuAbove = (m_cuAddr >= widthInCU) && !m_bFirstRowInSlice ? m_encData->getPicCTU(m_cuAddr - widthInCU) : NULL;
11
x265_2.1.tar.gz/source/common/cudata.h -> x265_2.2.tar.gz/source/common/cudata.h Changed
29
 
1
@@ -28,6 +28,8 @@
2
 #include "slice.h"
3
 #include "mv.h"
4
 
5
+#define NUM_TU_DEPTH 21
6
+
7
 namespace X265_NS {
8
 // private namespace
9
 
10
@@ -204,6 +206,7 @@
11
     enum { BytesPerPartition = 21 };  // combined sizeof() of all per-part data
12
 
13
     coeff_t*      m_trCoeff[3];       // transformed coefficient buffer per plane
14
+    int8_t        m_refTuDepth[NUM_TU_DEPTH];   // TU depth of CU at depths 0, 1 and 2
15
 
16
     MV*           m_mv[2];            // array of motion vectors per list
17
     MV*           m_mvd[2];           // array of coded motion vector deltas per list
18
@@ -355,9 +358,8 @@
19
             CHECKED_MALLOC(trCoeffMemBlock, coeff_t, (sizeL + sizeC * 2) * numInstances);
20
         }
21
         CHECKED_MALLOC(charMemBlock, uint8_t, numPartition * numInstances * CUData::BytesPerPartition);
22
-        CHECKED_MALLOC(mvMemBlock, MV, numPartition * 4 * numInstances);
23
+        CHECKED_MALLOC_ZERO(mvMemBlock, MV, numPartition * 4 * numInstances);
24
         return true;
25
-
26
     fail:
27
         return false;
28
     }
29
x265_2.1.tar.gz/source/common/framedata.cpp -> x265_2.2.tar.gz/source/common/framedata.cpp Changed
41
 
1
@@ -37,6 +37,9 @@
2
     m_slice  = new Slice;
3
     m_picCTU = new CUData[sps.numCUsInFrame];
4
     m_picCsp = csp;
5
+    m_spsrpsIdx = -1;
6
+    if (param.rc.bStatWrite)
7
+        m_spsrps = const_cast<RPS*>(sps.spsrps);
8
 
9
     m_cuMemPool.create(0, param.internalCsp, sps.numCUsInFrame);
10
     for (uint32_t ctuAddr = 0; ctuAddr < sps.numCUsInFrame; ctuAddr++)
11
@@ -45,6 +48,12 @@
12
     CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame);
13
     CHECKED_MALLOC(m_rowStat, RCStatRow, sps.numCuInHeight);
14
     reinit(sps);
15
+    
16
+    for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
17
+    {
18
+        m_meBuffer[i] = NULL;
19
+        m_meIntegral[i] = NULL;
20
+    }
21
     return true;
22
 
23
 fail:
24
@@ -67,4 +76,16 @@
25
 
26
     X265_FREE(m_cuStat);
27
     X265_FREE(m_rowStat);
28
+
29
+    if (m_meBuffer)
30
+    {
31
+        for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
32
+        {
33
+            if (m_meBuffer[i] != NULL)
34
+            {
35
+                X265_FREE(m_meBuffer[i]);
36
+                m_meBuffer[i] = NULL;
37
+            }
38
+        }
39
+    }
40
 }
41
x265_2.1.tar.gz/source/common/framedata.h -> x265_2.2.tar.gz/source/common/framedata.h Changed
44
 
1
@@ -106,6 +106,9 @@
2
     CUDataMemPool  m_cuMemPool;
3
     CUData*        m_picCTU;
4
 
5
+    RPS*           m_spsrps;
6
+    int            m_spsrpsIdx;
7
+
8
     /* Rate control data used during encode and by references */
9
     struct RCStatCU
10
     {
11
@@ -123,10 +126,10 @@
12
         uint32_t encodedBits;   /* sum of 'totalBits' of encoded CTUs */
13
         uint32_t satdForVbv;    /* sum of lowres (estimated) costs for entire row */
14
         uint32_t intraSatdForVbv; /* sum of lowres (estimated) intra costs for entire row */
15
-        uint32_t diagSatd;
16
-        uint32_t diagIntraSatd;
17
-        double   diagQp;
18
-        double   diagQpScale;
19
+        uint32_t rowSatd;
20
+        uint32_t rowIntraSatd;
21
+        double   rowQp;
22
+        double   rowQpScale;
23
         double   sumQpRc;
24
         double   sumQpAq;
25
     };
26
@@ -148,6 +151,9 @@
27
     double         m_rateFactor; /* calculated based on the Frame QP */
28
     int            m_picCsp;
29
 
30
+    uint32_t*              m_meIntegral[INTEGRAL_PLANE_NUM];       // 12 integral planes for 32x32, 32x24, 32x8, 24x32, 16x16, 16x12, 16x4, 12x16, 8x32, 8x8, 4x16 and 4x4.
31
+    uint32_t*              m_meBuffer[INTEGRAL_PLANE_NUM];
32
+
33
     FrameData();
34
 
35
     bool create(const x265_param& param, const SPS& sps, int csp);
36
@@ -168,7 +174,6 @@
37
 /* Stores inter analysis data for a single frame */
38
 struct analysis_inter_data
39
 {
40
-    MV*         mv;
41
     WeightParam* wt;
42
     int32_t*    ref;
43
     uint8_t*    depth;
44
x265_2.1.tar.gz/source/common/param.cpp -> x265_2.2.tar.gz/source/common/param.cpp Changed
201
 
1
@@ -149,6 +149,7 @@
2
     param->bBPyramid = 1;
3
     param->scenecutThreshold = 40; /* Magic number pulled in from x264 */
4
     param->lookaheadSlices = 8;
5
+    param->scenecutBias = 5.0;
6
 
7
     /* Intra Coding Tools */
8
     param->bEnableConstrainedIntra = 0;
9
@@ -176,6 +177,7 @@
10
     param->maxNumReferences = 3;
11
     param->bEnableTemporalMvp = 1;
12
     param->bSourceReferenceEstimation = 0;
13
+    param->limitTU = 0;
14
 
15
     /* Loop Filter */
16
     param->bEnableLoopFilter = 1;
17
@@ -197,6 +199,7 @@
18
     param->bCULossless = 0;
19
     param->bEnableTemporalSubLayers = 0;
20
     param->bEnableRdRefine = 0;
21
+    param->bMultiPassOptRPS = 0;
22
 
23
     /* Rate control options */
24
     param->rc.vbvMaxBitrate = 0;
25
@@ -229,8 +232,6 @@
26
     param->rc.qpMin = 0;
27
     param->rc.qpMax = QP_MAX_MAX;
28
 
29
-    param->bDiscardOptionalVUI = 0;
30
-
31
     /* Video Usability Information (VUI) */
32
     param->vui.aspectRatioIdc = 0;
33
     param->vui.sarWidth = 0;
34
@@ -256,8 +257,13 @@
35
     param->minLuma = 0;
36
     param->maxLuma = PIXEL_MAX;
37
     param->log2MaxPocLsb = 8;
38
-    param->bDiscardSEI = false;
39
     param->maxSlices = 1;
40
+
41
+    param->bEmitVUITimingInfo   = 1;
42
+    param->bEmitVUIHRDInfo      = 1;
43
+    param->bOptQpPPS            = 1;
44
+    param->bOptRefListLengthPPS = 1;
45
+
46
 }
47
 
48
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
49
@@ -901,21 +907,19 @@
50
     // solve "fatal error C1061: compiler limit : blocks nested too deeply"
51
     if (bExtraParams)
52
     {
53
-        bExtraParams = false;
54
-        if (0) ;
55
-        OPT("slices") p->maxSlices = atoi(value);
56
-        else
57
-            bExtraParams = true;
58
-    }
59
-
60
-    if (bExtraParams)
61
-    {
62
         if (0) ;
63
         OPT("qpmin") p->rc.qpMin = atoi(value);
64
         OPT("analyze-src-pics") p->bSourceReferenceEstimation = atobool(value);
65
         OPT("log2-max-poc-lsb") p->log2MaxPocLsb = atoi(value);
66
-        OPT("discard-sei") p->bDiscardSEI = atobool(value);
67
-        OPT("discard-vui") p->bDiscardOptionalVUI = atobool(value);
68
+        OPT("vui-timing-info") p->bEmitVUITimingInfo = atobool(value);
69
+        OPT("vui-hrd-info") p->bEmitVUIHRDInfo = atobool(value);
70
+        OPT("slices") p->maxSlices = atoi(value);
71
+        OPT("limit-tu") p->limitTU = atoi(value);
72
+        OPT("opt-qp-pps") p->bOptQpPPS = atobool(value);
73
+        OPT("opt-ref-list-length-pps") p->bOptRefListLengthPPS = atobool(value);
74
+        OPT("multi-pass-opt-rps") p->bMultiPassOptRPS = atobool(value);
75
+        OPT("scenecut-bias") p->scenecutBias = atof(value);
76
+
77
         else
78
             return X265_PARAM_BAD_NAME;
79
     }
80
@@ -1078,8 +1082,8 @@
81
         "Multiple-Slices mode must be enable Wavefront Parallel Processing (--wpp)");
82
     CHECK(param->internalBitDepth != X265_DEPTH,
83
           "internalBitDepth must match compiled bit depth");
84
-    CHECK(param->minCUSize != 64 && param->minCUSize != 32 && param->minCUSize != 16 && param->minCUSize != 8,
85
-          "minimim CU size must be 8, 16, 32, or 64");
86
+    CHECK(param->minCUSize != 32 && param->minCUSize != 16 && param->minCUSize != 8,
87
+          "minimim CU size must be 8, 16 or 32");
88
     CHECK(param->minCUSize > param->maxCUSize,
89
           "min CU size must be less than or equal to max CU size");
90
     CHECK(param->rc.qp < -6 * (param->internalBitDepth - 8) || param->rc.qp > QP_MAX_SPEC,
91
@@ -1088,8 +1092,8 @@
92
           "Frame rate numerator and denominator must be specified");
93
     CHECK(param->interlaceMode < 0 || param->interlaceMode > 2,
94
           "Interlace mode must be 0 (progressive) 1 (top-field first) or 2 (bottom field first)");
95
-    CHECK(param->searchMethod<0 || param->searchMethod> X265_FULL_SEARCH,
96
-          "Search method is not supported value (0:DIA 1:HEX 2:UMH 3:HM 5:FULL)");
97
+    CHECK(param->searchMethod < 0 || param->searchMethod > X265_FULL_SEARCH,
98
+          "Search method is not supported value (0:DIA 1:HEX 2:UMH 3:HM 4:SEA 5:FULL)");
99
     CHECK(param->searchRange < 0,
100
           "Search Range must be more than 0");
101
     CHECK(param->searchRange >= 32768,
102
@@ -1122,6 +1126,7 @@
103
           "QuadtreeTUMaxDepthInter must be less than or equal to the difference between log2(maxCUSize) and QuadtreeTULog2MinSize plus 1");
104
     CHECK((param->maxTUSize != 32 && param->maxTUSize != 16 && param->maxTUSize != 8 && param->maxTUSize != 4),
105
           "max TU size must be 4, 8, 16, or 32");
106
+    CHECK(param->limitTU > 4, "Invalid limit-tu option, limit-TU must be between 0 and 4");
107
     CHECK(param->maxNumMergeCand < 1, "MaxNumMergeCand must be 1 or greater.");
108
     CHECK(param->maxNumMergeCand > 5, "MaxNumMergeCand must be 5 or smaller.");
109
 
110
@@ -1217,6 +1222,8 @@
111
           "Valid Logging level -1:none 0:error 1:warning 2:info 3:debug 4:full");
112
     CHECK(param->scenecutThreshold < 0,
113
           "scenecutThreshold must be greater than 0");
114
+    CHECK(param->scenecutBias < 0 || 100 < param->scenecutBias,
115
+           "scenecut-bias must be between 0 and 100");
116
     CHECK(param->rdPenalty < 0 || param->rdPenalty > 2,
117
           "Valid penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum");
118
     CHECK(param->keyframeMax < -1,
119
@@ -1247,10 +1254,12 @@
120
         "qpmax exceeds supported range (0 to 69)");
121
     CHECK(param->rc.qpMin < QP_MIN || param->rc.qpMin > QP_MAX_MAX,
122
         "qpmin exceeds supported range (0 to 69)");
123
-    CHECK(param->log2MaxPocLsb < 4,
124
-        "maximum of the picture order count can not be less than 4");
125
-    CHECK(1 > param->maxSlices || param->maxSlices > ((param->sourceHeight + param->maxCUSize - 1) / param->maxCUSize),
126
-        "The slices can not be more than number of rows");
127
+    CHECK(param->log2MaxPocLsb < 4 || param->log2MaxPocLsb > 16,
128
+        "Supported range for log2MaxPocLsb is 4 to 16");
129
+#if !X86_64
130
+    CHECK(param->searchMethod == X265_SEA && (param->sourceWidth > 840 || param->sourceHeight > 480),
131
+        "SEA motion search does not support resolutions greater than 480p in 32 bit build");
132
+#endif
133
     return check_failed;
134
 }
135
 
136
@@ -1338,9 +1347,8 @@
137
 
138
     x265_log(param, X265_LOG_INFO, "ME / range / subpel / merge         : %s / %d / %d / %d\n",
139
              x265_motion_est_names[param->searchMethod], param->searchRange, param->subpelRefine, param->maxNumMergeCand);
140
-
141
     if (param->keyframeMax != INT_MAX || param->scenecutThreshold)
142
-        x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut       : %d / %d / %d\n", param->keyframeMin, param->keyframeMax, param->scenecutThreshold);
143
+        x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut / bias: %d / %d / %d / %.2lf\n", param->keyframeMin, param->keyframeMax, param->scenecutThreshold, param->scenecutBias * 100);
144
     else
145
         x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut       : disabled\n");
146
 
147
@@ -1395,6 +1403,7 @@
148
     TOOLVAL(param->noiseReductionInter, "nr-inter=%d");
149
     TOOLOPT(param->bEnableTSkipFast, "tskip-fast");
150
     TOOLOPT(!param->bEnableTSkipFast && param->bEnableTransformSkip, "tskip");
151
+    TOOLVAL(param->limitTU , "limit-tu=%d");
152
     TOOLOPT(param->bCULossless, "cu-lossless");
153
     TOOLOPT(param->bEnableSignHiding, "signhide");
154
     TOOLOPT(param->bEnableTemporalMvp, "tmvp");
155
@@ -1423,7 +1432,7 @@
156
     fflush(stderr);
157
 }
158
 
159
-char *x265_param2string(x265_param* p)
160
+char *x265_param2string(x265_param* p, int padx, int pady)
161
 {
162
     char *buf, *s;
163
 
164
@@ -1434,70 +1443,92 @@
165
 #define BOOL(param, cliopt) \
166
     s += sprintf(s, " %s", (param) ? cliopt : "no-" cliopt);
167
 
168
-    s += sprintf(s, "%dx%d", p->sourceWidth,p->sourceHeight);
169
-    s += sprintf(s, " fps=%u/%u", p->fpsNum, p->fpsDenom);
170
-    s += sprintf(s, " bitdepth=%d", p->internalBitDepth);
171
+    s += sprintf(s, "cpuid=%d", p->cpuid);
172
+    s += sprintf(s, " frame-threads=%d", p->frameNumThreads);
173
+    if (p->numaPools)
174
+        s += sprintf(s, " numa-pools=%s", p->numaPools);
175
     BOOL(p->bEnableWavefront, "wpp");
176
+    BOOL(p->bDistributeModeAnalysis, "pmode");
177
+    BOOL(p->bDistributeMotionEstimation, "pme");
178
+    BOOL(p->bEnablePsnr, "psnr");
179
+    BOOL(p->bEnableSsim, "ssim");
180
+    s += sprintf(s, " log-level=%d", p->logLevel);
181
+    s += sprintf(s, " bitdepth=%d", p->internalBitDepth);
182
+    s += sprintf(s, " input-csp=%d", p->internalCsp);
183
+    s += sprintf(s, " fps=%u/%u", p->fpsNum, p->fpsDenom);
184
+    s += sprintf(s, " input-res=%dx%d", p->sourceWidth - padx, p->sourceHeight - pady);
185
+    s += sprintf(s, " interlace=%d", p->interlaceMode);
186
+    s += sprintf(s, " total-frames=%d", p->totalFrames);
187
+    s += sprintf(s, " level-idc=%d", p->levelIdc);
188
+    s += sprintf(s, " high-tier=%d", p->bHighTier);
189
+    s += sprintf(s, " uhd-bd=%d", p->uhdBluray);
190
+    s += sprintf(s, " ref=%d", p->maxNumReferences);
191
+    BOOL(p->bAllowNonConformance, "allow-non-conformance");
192
+    BOOL(p->bRepeatHeaders, "repeat-headers");
193
+    BOOL(p->bAnnexB, "annexb");
194
+    BOOL(p->bEnableAccessUnitDelimiters, "aud");
195
+    BOOL(p->bEmitHRDSEI, "hrd");
196
+    BOOL(p->bEmitInfoSEI, "info");
197
+    s += sprintf(s, " hash=%d", p->decodedPictureHashSEI);
198
+    BOOL(p->bEnableTemporalSubLayers, "temporal-layers");
199
+    BOOL(p->bOpenGOP, "open-gop");
200
+    s += sprintf(s, " min-keyint=%d", p->keyframeMin);
201
x265_2.1.tar.gz/source/common/param.h -> x265_2.2.tar.gz/source/common/param.h Changed
10
 
1
@@ -31,7 +31,7 @@
2
 int   x265_set_globals(x265_param *param);
3
 void  x265_print_params(x265_param *param);
4
 void  x265_param_apply_fastfirstpass(x265_param *p);
5
-char* x265_param2string(x265_param *param);
6
+char* x265_param2string(x265_param *param, int padx, int pady);
7
 int   x265_atoi(const char *str, bool& bError);
8
 double x265_atof(const char *str, bool& bError);
9
 int   parseCpuName(const char *value, bool& bError);
10
x265_2.1.tar.gz/source/common/pixel.cpp -> x265_2.2.tar.gz/source/common/pixel.cpp Changed
87
 
1
@@ -117,6 +117,52 @@
2
     }
3
 }
4
 
5
+template<int lx, int ly> // lx >> 1 is used as the offset of the 2nd and 4th sums; ly is not referenced in the body
6
+int ads_x4(int encDC[4], uint32_t *sums, int delta, uint16_t *costMvX, int16_t *mvs, int width, int thresh) // returns how many indices were written to mvs
7
+{ // for every i in [0, width): add four absolute differences between encDC[] and sums[] to costMvX[i]; keep i when the total is below thresh
8
+    int nmv = 0; // count of indices stored in mvs so far
9
+    for (int16_t i = 0; i < width; i++, sums++) // sums advances with i, so sums[0] is the entry for index i. NOTE(review): i is int16_t while width is int - presumably width stays below 32768; confirm
10
+    {
11
+        int ads = abs(encDC[0] - long(sums[0])) // sums values are converted to long before subtracting. NOTE(review): which abs overload is chosen depends on the included headers - confirm no truncation
12
+            + abs(encDC[1] - long(sums[lx >> 1])) // entry lx/2 positions further along sums
13
+            + abs(encDC[2] - long(sums[delta])) // entry delta positions further along sums
14
+            + abs(encDC[3] - long(sums[delta + (lx >> 1)])) // both offsets combined
15
+            + costMvX[i]; // per-index cost supplied by the caller
16
+        if (ads < thresh) // strictly below the threshold
17
+            mvs[nmv++] = i; // record the index itself, not the cost
18
+    }
19
+    return nmv;
20
+}
21
+
22
+template<int lx, int ly> // neither lx nor ly is referenced in the body
23
+int ads_x2(int encDC[2], uint32_t *sums, int delta, uint16_t *costMvX, int16_t *mvs, int width, int thresh) // returns how many indices were written to mvs
24
+{ // for every i in [0, width): add two absolute differences between encDC[] and sums[] to costMvX[i]; keep i when the total is below thresh
25
+    int nmv = 0; // count of indices stored in mvs so far
26
+    for (int16_t i = 0; i < width; i++, sums++) // sums advances with i, so sums[0] is the entry for index i. NOTE(review): i is int16_t while width is int - presumably width stays below 32768; confirm
27
+    {
28
+        int ads = abs(encDC[0] - long(sums[0])) // sums values are converted to long before subtracting. NOTE(review): which abs overload is chosen depends on the included headers - confirm no truncation
29
+            + abs(encDC[1] - long(sums[delta])) // entry delta positions further along sums
30
+            + costMvX[i]; // per-index cost supplied by the caller
31
+        if (ads < thresh) // strictly below the threshold
32
+            mvs[nmv++] = i; // record the index itself, not the cost
33
+    }
34
+    return nmv;
35
+}
36
+
37
+template<int lx, int ly> // neither lx nor ly is referenced in the body
38
+int ads_x1(int encDC[1], uint32_t *sums, int, uint16_t *costMvX, int16_t *mvs, int width, int thresh) // third parameter is unnamed and unused; returns how many indices were written to mvs
39
+{ // for every i in [0, width): add one absolute difference between encDC[0] and sums[] to costMvX[i]; keep i when the total is below thresh
40
+    int nmv = 0; // count of indices stored in mvs so far
41
+    for (int16_t i = 0; i < width; i++, sums++) // sums advances with i, so sums[0] is the entry for index i. NOTE(review): i is int16_t while width is int - presumably width stays below 32768; confirm
42
+    {
43
+        int ads = abs(encDC[0] - long(sums[0])) // sums value is converted to long before subtracting. NOTE(review): which abs overload is chosen depends on the included headers - confirm no truncation
44
+            + costMvX[i]; // per-index cost supplied by the caller
45
+        if (ads < thresh) // strictly below the threshold
46
+            mvs[nmv++] = i; // record the index itself, not the cost
47
+    }
48
+    return nmv;
49
+}
50
+
51
 template<int lx, int ly, class T1, class T2>
52
 sse_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t stride_pix2)
53
 {
54
@@ -991,6 +1037,32 @@
55
     LUMA_PU(64, 16);
56
     LUMA_PU(16, 64);
57
 
58
+    p.pu[LUMA_4x4].ads = ads_x1<4, 4>;
59
+    p.pu[LUMA_8x8].ads = ads_x1<8, 8>;
60
+    p.pu[LUMA_8x4].ads = ads_x2<8, 4>;
61
+    p.pu[LUMA_4x8].ads = ads_x2<4, 8>;
62
+    p.pu[LUMA_16x16].ads = ads_x4<16, 16>;
63
+    p.pu[LUMA_16x8].ads = ads_x2<16, 8>;
64
+    p.pu[LUMA_8x16].ads = ads_x2<8, 16>;
65
+    p.pu[LUMA_16x12].ads = ads_x1<16, 12>;
66
+    p.pu[LUMA_12x16].ads = ads_x1<12, 16>;
67
+    p.pu[LUMA_16x4].ads = ads_x1<16, 4>;
68
+    p.pu[LUMA_4x16].ads = ads_x1<4, 16>;
69
+    p.pu[LUMA_32x32].ads = ads_x4<32, 32>;
70
+    p.pu[LUMA_32x16].ads = ads_x2<32, 16>;
71
+    p.pu[LUMA_16x32].ads = ads_x2<16, 32>;
72
+    p.pu[LUMA_32x24].ads = ads_x4<32, 24>;
73
+    p.pu[LUMA_24x32].ads = ads_x4<24, 32>;
74
+    p.pu[LUMA_32x8].ads = ads_x4<32, 8>;
75
+    p.pu[LUMA_8x32].ads = ads_x4<8, 32>;
76
+    p.pu[LUMA_64x64].ads = ads_x4<64, 64>;
77
+    p.pu[LUMA_64x32].ads = ads_x2<64, 32>;
78
+    p.pu[LUMA_32x64].ads = ads_x2<32, 64>;
79
+    p.pu[LUMA_64x48].ads = ads_x4<64, 48>;
80
+    p.pu[LUMA_48x64].ads = ads_x4<48, 64>;
81
+    p.pu[LUMA_64x16].ads = ads_x4<64, 16>;
82
+    p.pu[LUMA_16x64].ads = ads_x4<16, 64>;
83
+
84
     p.pu[LUMA_4x4].satd   = satd_4x4;
85
     p.pu[LUMA_8x8].satd   = satd8<8, 8>;
86
     p.pu[LUMA_8x4].satd   = satd_8x4;
87
x265_2.2.tar.gz/source/common/ppc/dct_altivec.cpp Added
201
 
1
@@ -0,0 +1,819 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2013 x265 project
4
+ *
5
+ * Authors: Roger Moussalli <rmoussal@us.ibm.com>
6
+ *          Min Chen <min.chen@multicorewareinc.com>
7
+ *
8
+ * This program is free software; you can redistribute it and/or modify
9
+ * it under the terms of the GNU General Public License as published by
10
+ * the Free Software Foundation; either version 2 of the License, or
11
+ * (at your option) any later version.
12
+ *
13
+ * This program is distributed in the hope that it will be useful,
14
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
+ * GNU General Public License for more details.
17
+ *
18
+ * You should have received a copy of the GNU General Public License
19
+ * along with this program; if not, write to the Free Software
20
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21
+ *
22
+ * This program is also available under a commercial proprietary license.
23
+ * For more information, contact us at license @ x265.com.
24
+ *****************************************************************************/
25
+
26
+#include "common.h"
27
+#include "primitives.h"
28
+#include "contexts.h"   // costCoeffNxN_c
29
+#include "threading.h"  // CLZ
30
+#include "ppccommon.h"
31
+
32
+using namespace X265_NS;
33
+
34
+static uint32_t quant_altivec(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
35
+{
36
+
37
+    X265_CHECK(qBits >= 8, "qBits less than 8\n");
38
+
39
+    X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
40
+
41
+    int qBits8 = qBits - 8;
42
+    uint32_t numSig = 0;
43
+
44
+
45
+    int level[8] ;
46
+    int sign[8] ;
47
+    int tmplevel[8] ;
48
+
49
+    const vector signed short v_zeros = {0, 0, 0, 0, 0, 0, 0, 0} ;
50
+    const vector signed short v_neg1 = {-1, -1, -1, -1, -1, -1, -1, -1} ;
51
+    const vector signed short v_pos1_ss = {1, 1, 1, 1, 1, 1, 1, 1} ;
52
+    const vector signed int v_pos1_sw = {1, 1, 1, 1} ;
53
+
54
+    const vector signed int v_clip_high = {32767, 32767, 32767, 32767} ;
55
+    const vector signed int v_clip_low = {-32768, -32768, -32768, -32768} ;
56
+
57
+
58
+    vector signed short v_level_ss ;
59
+    vector signed int v_level_0, v_level_1 ;
60
+    vector signed int v_tmplevel_0, v_tmplevel_1 ;
61
+    vector signed short v_sign_ss ;
62
+    vector signed int v_sign_0, v_sign_1 ;
63
+    vector signed int v_quantCoeff_0, v_quantCoeff_1 ;
64
+
65
+    vector signed int v_numSig = {0, 0, 0, 0} ;
66
+
67
+    vector signed int v_add ;
68
+    v_add[0] = add ;
69
+    v_add = vec_splat(v_add, 0) ;
70
+
71
+    vector unsigned int v_qBits ;
72
+    v_qBits[0] = qBits ;
73
+    v_qBits = vec_splat(v_qBits, 0) ;
74
+
75
+    vector unsigned int v_qBits8 ;
76
+    v_qBits8[0] = qBits8 ;
77
+    v_qBits8 = vec_splat(v_qBits8, 0) ;
78
+
79
+
80
+    for (int blockpos_outer = 0; blockpos_outer < numCoeff; blockpos_outer+=16)
81
+    {
82
+        int blockpos = blockpos_outer ;
83
+
84
+        // for(int ii=0; ii<8; ii++) { level[ii] = coef[blockpos+ii] ;}
85
+        v_level_ss = vec_xl(0, &coef[blockpos]) ;
86
+        v_level_0 = vec_unpackh(v_level_ss) ;
87
+        v_level_1 = vec_unpackl(v_level_ss) ;
88
+
89
+
90
+        // for(int ii=0; ii<8; ii++) { sign[ii] = (level[ii] < 0 ? -1 : 1) ;}
91
+        vector bool short v_level_cmplt0 ;
92
+        v_level_cmplt0 = vec_cmplt(v_level_ss, v_zeros) ;
93
+        v_sign_ss = vec_sel(v_pos1_ss, v_neg1, v_level_cmplt0) ;
94
+        v_sign_0 = vec_unpackh(v_sign_ss) ;
95
+        v_sign_1 = vec_unpackl(v_sign_ss) ;
96
+        
97
+        
98
+
99
+        // for(int ii=0; ii<8; ii++) { tmplevel[ii] = abs(level[ii]) * quantCoeff[blockpos+ii] ;}
100
+        v_level_0 = vec_abs(v_level_0) ;
101
+        v_level_1 = vec_abs(v_level_1) ;
102
+        v_quantCoeff_0 = vec_xl(0, &quantCoeff[blockpos]) ;
103
+        v_quantCoeff_1 = vec_xl(16, &quantCoeff[blockpos]) ;
104
+        
105
+        asm ("vmuluwm %0,%1,%2"
106
+              : "=v" (v_tmplevel_0)
107
+              : "v"  (v_level_0) , "v" (v_quantCoeff_0)
108
+            ) ;
109
+
110
+        asm ("vmuluwm %0,%1,%2"
111
+              : "=v" (v_tmplevel_1)
112
+              : "v"  (v_level_1) , "v" (v_quantCoeff_1)
113
+            ) ;
114
+
115
+
116
+
117
+        // for(int ii=0; ii<8; ii++) { level[ii] = ((tmplevel[ii] + add) >> qBits) ;}
118
+        v_level_0 = vec_sra(vec_add(v_tmplevel_0, v_add), v_qBits) ;
119
+        v_level_1 = vec_sra(vec_add(v_tmplevel_1, v_add), v_qBits) ;
120
+
121
+        // for(int ii=0; ii<8; ii++) { deltaU[blockpos+ii] = ((tmplevel[ii] - (level[ii] << qBits)) >> qBits8) ;} 
122
+        vector signed int v_temp_0_sw, v_temp_1_sw ;
123
+        v_temp_0_sw = vec_sl(v_level_0, v_qBits) ;
124
+        v_temp_1_sw = vec_sl(v_level_1, v_qBits) ;
125
+
126
+        v_temp_0_sw = vec_sub(v_tmplevel_0, v_temp_0_sw) ;
127
+        v_temp_1_sw = vec_sub(v_tmplevel_1, v_temp_1_sw) ;
128
+
129
+        v_temp_0_sw = vec_sra(v_temp_0_sw, v_qBits8) ;
130
+        v_temp_1_sw = vec_sra(v_temp_1_sw, v_qBits8) ;
131
+
132
+        vec_xst(v_temp_0_sw, 0, &deltaU[blockpos]) ;
133
+        vec_xst(v_temp_1_sw, 16, &deltaU[blockpos]) ;
134
+
135
+
136
+        // for(int ii=0; ii<8; ii++) { if(level[ii]) ++numSig ; }
137
+        vector bool int v_level_cmpeq0 ;
138
+        vector signed int v_level_inc ;
139
+        v_level_cmpeq0 = vec_cmpeq(v_level_0, (vector signed int)v_zeros) ;
140
+        v_level_inc = vec_sel(v_pos1_sw, (vector signed int)v_zeros, v_level_cmpeq0) ;
141
+        v_numSig = vec_add(v_numSig, v_level_inc) ;
142
+
143
+        v_level_cmpeq0 = vec_cmpeq(v_level_1, (vector signed int)v_zeros) ;
144
+        v_level_inc = vec_sel(v_pos1_sw, (vector signed int)v_zeros, v_level_cmpeq0) ;
145
+        v_numSig = vec_add(v_numSig, v_level_inc) ;
146
+
147
+
148
+        // for(int ii=0; ii<8; ii++) { level[ii] *= sign[ii]; }
149
+        asm ("vmuluwm %0,%1,%2"
150
+              : "=v" (v_level_0)
151
+              : "v"  (v_level_0) , "v" (v_sign_0)
152
+            ) ;
153
+
154
+        asm ("vmuluwm %0,%1,%2"
155
+              : "=v" (v_level_1)
156
+              : "v"  (v_level_1) , "v" (v_sign_1)
157
+            ) ;
158
+
159
+
160
+
161
+        // for(int ii=0; ii<8; ii++) {qCoef[blockpos+ii] = (int16_t)x265_clip3(-32768, 32767, level[ii]);}
162
+        vector bool int v_level_cmp_clip_high, v_level_cmp_clip_low ;
163
+
164
+        v_level_cmp_clip_high = vec_cmpgt(v_level_0, v_clip_high) ;
165
+        v_level_0 = vec_sel(v_level_0, v_clip_high, v_level_cmp_clip_high) ;
166
+        v_level_cmp_clip_low = vec_cmplt(v_level_0, v_clip_low) ;
167
+        v_level_0 = vec_sel(v_level_0, v_clip_low, v_level_cmp_clip_low) ;
168
+
169
+
170
+        v_level_cmp_clip_high = vec_cmpgt(v_level_1, v_clip_high) ;
171
+        v_level_1 = vec_sel(v_level_1, v_clip_high, v_level_cmp_clip_high) ;
172
+        v_level_cmp_clip_low = vec_cmplt(v_level_1, v_clip_low) ;
173
+        v_level_1 = vec_sel(v_level_1, v_clip_low, v_level_cmp_clip_low) ;
174
+
175
+        v_level_ss = vec_pack(v_level_0, v_level_1) ;
176
+
177
+        vec_xst(v_level_ss, 0, &qCoef[blockpos]) ;
178
+
179
+
180
+
181
+
182
+        // UNROLL ONCE MORE (which is ok since loops for multiple of 16 times, though that is NOT obvious to the compiler)
183
+        blockpos += 8 ;
184
+
185
+        // for(int ii=0; ii<8; ii++) { level[ii] = coef[blockpos+ii] ;}
186
+        v_level_ss = vec_xl(0, &coef[blockpos]) ;
187
+        v_level_0 = vec_unpackh(v_level_ss) ;
188
+        v_level_1 = vec_unpackl(v_level_ss) ;
189
+
190
+
191
+        // for(int ii=0; ii<8; ii++) { sign[ii] = (level[ii] < 0 ? -1 : 1) ;}
192
+        v_level_cmplt0 = vec_cmplt(v_level_ss, v_zeros) ;
193
+        v_sign_ss = vec_sel(v_pos1_ss, v_neg1, v_level_cmplt0) ;
194
+        v_sign_0 = vec_unpackh(v_sign_ss) ;
195
+        v_sign_1 = vec_unpackl(v_sign_ss) ;
196
+        
197
+        
198
+
199
+        // for(int ii=0; ii<8; ii++) { tmplevel[ii] = abs(level[ii]) * quantCoeff[blockpos+ii] ;}
200
+        v_level_0 = vec_abs(v_level_0) ;
201
x265_2.2.tar.gz/source/common/ppc/intrapred_altivec.cpp Added
201
 
1
@@ -0,0 +1,30809 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2013 x265 project
4
+ *
5
+ * Authors: Roger Moussalli <rmoussal@us.ibm.com>
6
+ *          Min Chen <min.chen@multicorewareinc.com>
7
+ *
8
+ * This program is free software; you can redistribute it and/or modify
9
+ * it under the terms of the GNU General Public License as published by
10
+ * the Free Software Foundation; either version 2 of the License, or
11
+ * (at your option) any later version.
12
+ *
13
+ * This program is distributed in the hope that it will be useful,
14
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
+ * GNU General Public License for more details.
17
+ *
18
+ * You should have received a copy of the GNU General Public License
19
+ * along with this program; if not, write to the Free Software
20
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21
+ *
22
+ * This program is also available under a commercial proprietary license.
23
+ * For more information, contact us at license @ x265.com.
24
+ *****************************************************************************/
25
+
26
+#include <iostream>
27
+#include <vector>
28
+#include <assert.h>
29
+#include <math.h>
30
+#include <cmath>
31
+#include <linux/types.h>
32
+#include <stdlib.h>
33
+#include <stdio.h>
34
+#include <stdint.h>
35
+#include <sys/time.h>
36
+#include <string.h>
37
+
38
+#include "common.h"
39
+#include "primitives.h"
40
+#include "x265.h"
41
+#include "ppccommon.h"
42
+
43
+//using namespace std ;
44
+namespace X265_NS {
45
+
46
+/* INTRA Prediction - altivec implementation */
47
+template<int width, int dirMode> // primary template, selected for any (width, dirMode) pair without an explicit specialization
48
+void intra_pred(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter){}; // empty body: writes nothing to dst. NOTE(review): the ';' after '}' is a redundant empty declaration
49
+
50
+template<>
50
+void intra_pred<4, 2>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter) // bFilter is not referenced in the body
51
+{ // writes a 4x4 block to dst: row r receives the 4 source bytes starting at byte offset 10 + r of srcPix0 (offsets are in bytes - assumes 8-bit pixel; TODO confirm)
52
+    if(dstStride == 4) {   // rows are contiguous, so the whole block is 16 bytes and is written with a single store
53
+        const vec_u8_t srcV = vec_xl(10, srcPix0); /* offset = width2+2 = width<<1 + 2*/
54
+        const vec_u8_t mask = {0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03,0x04, 0x02, 0x03,0x04,0x05, 0x03,0x04,0x05, 0x06}; // four groups of 4 indices; group r selects source bytes r..r+3
55
+        vec_u8_t vout = vec_perm(srcV, srcV, mask);
56
+        vec_xst(vout, 0, dst); 
57
+    }
58
+    else if(dstStride%16 == 0){ // one 4-byte element store per row
59
+        vec_u8_t v0 = vec_xl(10, srcPix0); // row 0 source starts at byte 10
60
+        vec_ste((vec_u32_t)v0, 0, (unsigned int*)dst); // NOTE(review): vec_ste picks the stored element from the address offset within its 16-byte line; this appears to rely on dst being 16-byte aligned, which only dstStride is tested for - confirm
61
+        vec_u8_t v1 = vec_xl(11, srcPix0); // each following row starts one byte later in the source
62
+        vec_ste((vec_u32_t)v1, 0, (unsigned int*)(dst+dstStride));
63
+        vec_u8_t v2 = vec_xl(12, srcPix0);
64
+        vec_ste((vec_u32_t)v2, 0, (unsigned int*)(dst+dstStride*2));
65
+        vec_u8_t v3 = vec_xl(13, srcPix0);
66
+        vec_ste((vec_u32_t)v3, 0, (unsigned int*)(dst+dstStride*3));
67
+    }
68
+    else{ // any other stride: read-modify-write of 16 bytes per row
69
+        const vec_u8_t srcV = vec_xl(10, srcPix0); /* offset = width2+2 = width<<1 + 2*/
70
+        const vec_u8_t mask_0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; // first 4 indices pick source bytes for the row; 0x14..0x1f keep bytes 4..15 of the second vec_perm operand (the data already in dst)
71
+        const vec_u8_t mask_1 = {0x01, 0x02, 0x03, 0x04, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; 
72
+        const vec_u8_t mask_2 = {0x02, 0x03, 0x04, 0x05, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; 
73
+        const vec_u8_t mask_3 = {0x03, 0x04, 0x05, 0x06, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; 
74
+        vec_u8_t v0 = vec_perm(srcV, vec_xl(0, dst), mask_0); // NOTE(review): 16 bytes are loaded from and stored back to each row although only 4 change - confirm the buffer extends 12 bytes past the last row's pixels
75
+        vec_xst(v0, 0, dst);
76
+        vec_u8_t v1 = vec_perm(srcV, vec_xl(dstStride, dst), mask_1);
77
+        vec_xst(v1, dstStride, dst);
78
+        vec_u8_t v2 = vec_perm(srcV, vec_xl(dstStride*2, dst), mask_2);
79
+        vec_xst(v2, dstStride*2, dst);
80
+        vec_u8_t v3 = vec_perm(srcV,  vec_xl(dstStride*3, dst), mask_3);
81
+        vec_xst(v3, dstStride*3, dst);
82
+    }
83
+#ifdef DEBUG
84
+        for (int y = 0; y < 4; y++) // debug-only: print the 4x4 block that was just written
85
+        {
86
+            for (int x = 0; x < 4; x++)
87
+            {
88
+                printf("%d ",dst[y * dstStride + x] );         
89
+            }
90
+            printf("\n");          
91
+        }
92
+        printf("\n\n");            
93
+#endif     
94
+}
96
+
97
+template<>
98
+void intra_pred<8, 2>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter) // bFilter is not referenced in the body
99
+{ // writes an 8x8 block to dst: row r receives the 8 source bytes starting at byte offset 18 + r of srcPix0 (offsets are in bytes - assumes 8-bit pixel; TODO confirm)
100
+    if(dstStride == 8) {   // rows are contiguous, so each 16-byte store covers two consecutive rows
101
+        const vec_u8_t srcV1 = vec_xl(18, srcPix0); /* offset = width2+2 = width<<1 + 2*/
102
+        const vec_u8_t mask_0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x01, 0x02, 0x03,0x04, 0x05, 0x06, 0x07, 0x08}; // rows 0 and 1: source bytes 0..7 then 1..8
103
+        const vec_u8_t mask_1 = {0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a}; // rows 2 and 3
104
+        const vec_u8_t mask_2 = {0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c}; // rows 4 and 5
105
+        const vec_u8_t mask_3 = {0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e}; // rows 6 and 7
106
+        vec_u8_t v0 = vec_perm(srcV1, srcV1, mask_0);
107
+        vec_u8_t v1 = vec_perm(srcV1, srcV1, mask_1);
108
+        vec_u8_t v2 = vec_perm(srcV1, srcV1, mask_2);
109
+        vec_u8_t v3 = vec_perm(srcV1, srcV1, mask_3);
110
+        vec_xst(v0, 0, dst);
111
+        vec_xst(v1, 16, dst); 
112
+        vec_xst(v2, 32, dst); 
113
+        vec_xst(v3, 48, dst); 
114
+    }
115
+    else{ // any other stride: read-modify-write of 16 bytes per row
116
+        //pixel *out = dst;    
117
+        const vec_u8_t srcV1 = vec_xl(18, srcPix0); /* offset = width2+2 = width<<1 + 2*/
118
+        const vec_u8_t mask_0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; // first 8 indices pick source bytes for the row; 0x18..0x1f keep bytes 8..15 of the second vec_perm operand (the data already in dst)
119
+        const vec_u8_t mask_1 = {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
120
+        const vec_u8_t mask_2 = {0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
121
+        const vec_u8_t mask_3 = {0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
122
+        const vec_u8_t mask_4 = {0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
123
+        const vec_u8_t mask_5 = {0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
124
+        const vec_u8_t mask_6 = {0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
125
+        const vec_u8_t mask_7 = {0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
126
+        vec_u8_t v0 = vec_perm(srcV1, vec_xl(0, dst), mask_0); // NOTE(review): 16 bytes are loaded from and stored back to each row although only 8 change - confirm the buffer extends 8 bytes past the last row's pixels
127
+        vec_xst(v0, 0, dst);
128
+        vec_u8_t v1 = vec_perm(srcV1, vec_xl(dstStride, dst), mask_1);
129
+        vec_xst(v1, dstStride, dst);
130
+        vec_u8_t v2 = vec_perm(srcV1, vec_xl(dstStride*2, dst), mask_2);
131
+        vec_xst(v2, dstStride*2, dst);
132
+        vec_u8_t v3 = vec_perm(srcV1,  vec_xl(dstStride*3, dst), mask_3);
133
+        vec_xst(v3, dstStride*3, dst);
134
+        vec_u8_t v4 = vec_perm(srcV1,  vec_xl(dstStride*4, dst), mask_4);
135
+        vec_xst(v4, dstStride*4, dst);
136
+        vec_u8_t v5 = vec_perm(srcV1,  vec_xl(dstStride*5, dst), mask_5);
137
+        vec_xst(v5, dstStride*5, dst);
138
+        vec_u8_t v6 = vec_perm(srcV1,  vec_xl(dstStride*6, dst), mask_6);
139
+        vec_xst(v6, dstStride*6, dst);
140
+        vec_u8_t v7 = vec_perm(srcV1,  vec_xl(dstStride*7, dst), mask_7);
141
+        vec_xst(v7, dstStride*7, dst);
142
+    }
143
+   
144
+#ifdef DEBUG
145
+        for (int y = 0; y < 8; y++) // debug-only: print the 8x8 block that was just written
146
+        {
147
+            for (int x = 0; x < 8; x++)
148
+            {
149
+                printf("%d ",dst[y * dstStride + x] );         
150
+            }
151
+            printf("\n");          
152
+        }
153
+        printf("\n\n");            
154
+#endif     
155
+}
156
+
157
+template<>
158
+void intra_pred<16, 2>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter) // bFilter is not referenced in the body
159
+{ // writes a 16x16 block to dst: row i receives the 16 source bytes starting at byte offset 34 + i of srcPix0 (offsets are in bytes - assumes 8-bit pixel; TODO confirm)
160
+    int i;
161
+    //int off = dstStride; 
162
+    //const pixel *srcPix = srcPix0;
163
+    for(i=0; i<16; i++){ // one 16-byte load and one 16-byte store per row; no stride-specific paths are needed because a row is exactly one vector wide
164
+        vec_xst(   vec_xl(34+i, srcPix0), i*dstStride, dst); /* first offset = width2+2 = width<<1 + 2*/
165
+    }
166
+#ifdef DEBUG
167
+        for (int y = 0; y < 16; y++) // debug-only: print the 16x16 block that was just written
168
+        {
169
+            for (int x = 0; x <16; x++)
170
+            {
171
+                printf("%d ",dst[y * dstStride + x] );         
172
+            }
173
+            printf("\n");          
174
+        }
175
+        printf("\n\n");            
176
+#endif     
177
+}
178
+
179
+template<>
180
+void intra_pred<32, 2>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
181
+{
182
+    int i;
183
+    int off = dstStride;   
184
+    //const pixel *srcPix = srcPix0;
185
+    for(i=0; i<32; i++){
186
+        off = i*dstStride;     
187
+        vec_xst(   vec_xl(66+i, srcPix0), off, dst); /* first offset = width2+2 = width<<1 + 2*/
188
+        vec_xst(   vec_xl(82+i, srcPix0), off+16, dst); /* first offset = width2+2 = width<<1 + 2*/
189
+    }
190
+#ifdef DEBUG
191
+        for (int y = 0; y < 32; y++)
192
+        {
193
+            for (int x = 0; x <32; x++)
194
+            {
195
+                printf("%d ",dst[y * dstStride + x] );         
196
+            }
197
+            printf("\n");          
198
+        }
199
+        printf("\n\n");            
200
+#endif     
201
x265_2.2.tar.gz/source/common/ppc/ipfilter_altivec.cpp Added
201
 
1
@@ -0,0 +1,1522 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2013 x265 project
4
+ *
5
+ * Authors: Roger Moussalli <rmoussal@us.ibm.com>
6
+ *          Min Chen <min.chen@multicorewareinc.com>
7
+ *
8
+ * This program is free software; you can redistribute it and/or modify
9
+ * it under the terms of the GNU General Public License as published by
10
+ * the Free Software Foundation; either version 2 of the License, or
11
+ * (at your option) any later version.
12
+ *
13
+ * This program is distributed in the hope that it will be useful,
14
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
+ * GNU General Public License for more details.
17
+ *
18
+ * You should have received a copy of the GNU General Public License
19
+ * along with this program; if not, write to the Free Software
20
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21
+ *
22
+ * This program is also available under a commercial proprietary license.
23
+ * For more information, contact us at license @ x265.com.
24
+ *****************************************************************************/
25
+
26
+#include <iostream>
27
+#include "common.h"
28
+#include "primitives.h"
29
+#include "ppccommon.h"
30
+
31
+using namespace X265_NS;
32
+
33
+// Vector form of the scalar loop: for(col=0; col<16; col++) {sum[col]  = src[ocol+col + 0 * srcStride] * c[0];}  The four v_sum_* arguments are OVERWRITTEN (first filter tap); use multiply_accumulate_pixel_coeff for the remaining taps.
34
+#define multiply_pixel_coeff(/*vector int*/ v_sum_0, /*vector int*/ v_sum_1, /*vector int*/ v_sum_2, /*vector int*/ v_sum_3, /*const pixel * */ src, /*int*/ src_offset, /*vector signed short*/ v_coeff) \
35
+{ \
36
+    vector unsigned char v_pixel ; \
37
+    vector signed short v_pixel_16_h, v_pixel_16_l ; \
38
+    const vector signed short v_mask_unisgned_8_to_16 = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ; \
39
+\
40
+    /* load 16 pixels starting at byte offset src_offset from src */ \
41
+    v_pixel = vec_xl(src_offset, src) ; \
42
+\
43
+    /* widen the 8-bit pixels to 16 bits: vec_unpackh/l sign-extend, so the 0x00FF mask clears the upper byte again to get the unsigned value */ \
44
+    v_pixel_16_h = vec_unpackh((vector signed char)v_pixel) ; \
45
+    v_pixel_16_l = vec_unpackl((vector signed char)v_pixel) ; \
46
+    v_pixel_16_h = vec_and(v_pixel_16_h, v_mask_unisgned_8_to_16) ; \
47
+    v_pixel_16_l = vec_and(v_pixel_16_l, v_mask_unisgned_8_to_16) ; \
48
+\
49
+    /* multiply by the coefficient: vec_mule / vec_mulo give the 32-bit products of the even / odd 16-bit elements, so sums 0,1 cover the "h" half and sums 2,3 the "l" half */ \
50
+    v_sum_0 = vec_mule(v_pixel_16_h, v_coeff) ; \
51
+    v_sum_1 = vec_mulo(v_pixel_16_h, v_coeff) ; \
52
+    v_sum_2 = vec_mule(v_pixel_16_l, v_coeff) ; \
53
+    v_sum_3 = vec_mulo(v_pixel_16_l, v_coeff) ; \
54
+} // end multiply_pixel_coeff()
55
+
56
+
57
+// Vector form of the scalar loop: for(col=0; col<16; col++) {sum[col] += src[ocol+col + 1 * srcStride] * c[1];}  Same as multiply_pixel_coeff, but the products are ADDED to the existing v_sum_* values.
58
+#define multiply_accumulate_pixel_coeff(/*vector int*/ v_sum_0, /*vector int*/ v_sum_1, /*vector int*/ v_sum_2, /*vector int*/ v_sum_3, /*const pixel * */ src, /*int*/ src_offset, /*vector signed short*/ v_coeff) \
59
+{ \
60
+    vector unsigned char v_pixel ; \
61
+    vector signed short v_pixel_16_h, v_pixel_16_l ; \
62
+    vector int v_product_int_0, v_product_int_1, v_product_int_2, v_product_int_3 ; \
63
+    const vector signed short v_mask_unisgned_8_to_16 = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ; \
64
+\
65
+    /* scalar equivalent of this macro: for(col=0; col<16; col++) {sum[col] += src[src_offset + col] * coeff;} */ \
66
+    /* load 16 pixels starting at byte offset src_offset from src */ \
67
+    v_pixel = vec_xl(src_offset, src) ; \
68
+\
69
+    /* widen the 8-bit pixels to 16 bits: vec_unpackh/l sign-extend, so the 0x00FF mask clears the upper byte again to get the unsigned value */ \
70
+    v_pixel_16_h = vec_unpackh((vector signed char)v_pixel) ; \
71
+    v_pixel_16_l = vec_unpackl((vector signed char)v_pixel) ; \
72
+    v_pixel_16_h = vec_and(v_pixel_16_h, v_mask_unisgned_8_to_16) ; \
73
+    v_pixel_16_l = vec_and(v_pixel_16_l, v_mask_unisgned_8_to_16) ; \
74
+\
75
+    /* multiply by the coefficient: vec_mule / vec_mulo give the 32-bit products of the even / odd 16-bit elements */ \
76
+    v_product_int_0 = vec_mule(v_pixel_16_h, v_coeff) ; \
77
+    v_product_int_1 = vec_mulo(v_pixel_16_h, v_coeff) ; \
78
+    v_product_int_2 = vec_mule(v_pixel_16_l, v_coeff) ; \
79
+    v_product_int_3 = vec_mulo(v_pixel_16_l, v_coeff) ; \
80
+\
81
+    /* accumulate the products into the caller's running sums (same lane layout as multiply_pixel_coeff) */ \
82
+    v_sum_0 = vec_add(v_sum_0, v_product_int_0) ; \
83
+    v_sum_1 = vec_add(v_sum_1, v_product_int_1) ; \
84
+    v_sum_2 = vec_add(v_sum_2, v_product_int_2) ; \
85
+    v_sum_3 = vec_add(v_sum_3, v_product_int_3) ; \
86
+} // end multiply_accumulate_pixel_coeff()
87
+
88
+
89
+
90
+#if 0
91
+//ORIGINAL
92
+// Works with the following values:
93
+// N = 8
94
+// width >= 16 (multiple of 16)
95
+// any height
96
+template<int N, int width, int height>
97
+void interp_vert_pp_altivec(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
98
+{
99
+
100
+
101
+    const int16_t* c = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
102
+    const int shift = IF_FILTER_PREC;
103
+    const int offset = 1 << (shift - 1);
104
+    const uint16_t maxVal = (1 << X265_DEPTH) - 1;
105
+
106
+    src -= (N / 2 - 1) * srcStride;
107
+
108
+
109
+    // Vector to hold replicated shift amount
110
+    const vector unsigned int v_shift = {shift, shift, shift, shift} ;
111
+
112
+    // Vector to hold replicated offset
113
+    const vector int v_offset = {offset, offset, offset, offset} ;
114
+
115
+    // Vector to hold replicated maxVal
116
+    const vector signed short v_maxVal = {maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal, maxVal} ;
117
+
118
+
119
+    // Vector to hold replicated coefficients (one coefficient replicated per vector)
120
+    vector signed short v_coeff_0, v_coeff_1, v_coeff_2, v_coeff_3, v_coeff_4, v_coeff_5, v_coeff_6, v_coeff_7 ;
121
+    vector signed short v_coefficients = vec_xl(0, c) ; // load all coefficients into one vector
122
+    
123
+    // Replicate the coefficients into respective vectors
124
+    v_coeff_0 = vec_splat(v_coefficients, 0) ;
125
+    v_coeff_1 = vec_splat(v_coefficients, 1) ;
126
+    v_coeff_2 = vec_splat(v_coefficients, 2) ;
127
+    v_coeff_3 = vec_splat(v_coefficients, 3) ;
128
+    v_coeff_4 = vec_splat(v_coefficients, 4) ;
129
+    v_coeff_5 = vec_splat(v_coefficients, 5) ;
130
+    v_coeff_6 = vec_splat(v_coefficients, 6) ;
131
+    v_coeff_7 = vec_splat(v_coefficients, 7) ;
132
+
133
+    
134
+
135
+    int row, ocol, col;
136
+    for (row = 0; row < height; row++)
137
+    {
138
+        for (ocol = 0; ocol < width; ocol+=16)
139
+        {
140
+
141
+
142
+           // int sum[16] ;
143
+           // int16_t val[16] ;
144
+
145
+           // --> for(col=0; col<16; col++) {sum[col]  = src[ocol+col + 0 * srcStride] * c[0];}
146
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 1 * srcStride] * c[1];}
147
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 2 * srcStride] * c[2];}
148
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 3 * srcStride] * c[3];}
149
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 4 * srcStride] * c[4];}
150
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 5 * srcStride] * c[5];}
151
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 6 * srcStride] * c[6];}
152
+           // --> for(col=0; col<16; col++) {sum[col] += src[ocol+col + 7 * srcStride] * c[7];}
153
+
154
+
155
+           vector signed int v_sum_0, v_sum_1, v_sum_2, v_sum_3 ;
156
+            vector signed short v_val_0, v_val_1 ;
157
+
158
+
159
+
160
+            multiply_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol, v_coeff_0) ;
161
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 1 * srcStride, v_coeff_1) ;
162
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 2 * srcStride, v_coeff_2) ;
163
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 3 * srcStride, v_coeff_3) ;
164
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 4 * srcStride, v_coeff_4) ;
165
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 5 * srcStride, v_coeff_5) ;
166
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 6 * srcStride, v_coeff_6) ;
167
+            multiply_accumulate_pixel_coeff(v_sum_0, v_sum_1, v_sum_2, v_sum_3, src, ocol + 7 * srcStride, v_coeff_7) ;
168
+
169
+
170
+
171
+
172
+
173
+            // --> for(col=0; col<16; col++) {val[col] = (int16_t)((sum[col] + offset) >> shift);}
174
+            // Add offset
175
+            v_sum_0 = vec_add(v_sum_0, v_offset) ;
176
+            v_sum_1 = vec_add(v_sum_1, v_offset) ;
177
+            v_sum_2 = vec_add(v_sum_2, v_offset) ;
178
+            v_sum_3 = vec_add(v_sum_3, v_offset) ;
179
+            // Shift right by "shift"
180
+            v_sum_0 = vec_sra(v_sum_0, v_shift) ;
181
+            v_sum_1 = vec_sra(v_sum_1, v_shift) ;
182
+            v_sum_2 = vec_sra(v_sum_2, v_shift) ;
183
+            v_sum_3 = vec_sra(v_sum_3, v_shift) ;
184
+
185
+            // Pack into 16-bit numbers
186
+            v_val_0 = vec_pack(v_sum_0, v_sum_2) ;
187
+            v_val_1 = vec_pack(v_sum_1, v_sum_3) ;
188
+
189
+
190
+            
191
+            // --> for(col=0; col<16; col++) {val[col] = (val[col] < 0) ? 0 : val[col];}
192
+            vector bool short v_comp_zero_0, v_comp_zero_1 ;
193
+            vector signed short v_max_masked_0, v_max_masked_1 ;
194
+            vector signed short zeros16 = {0,0,0,0,0,0,0,0} ;
195
+            // Compute less than 0
196
+            v_comp_zero_0 = vec_cmplt(v_val_0, zeros16) ;
197
+            v_comp_zero_1 = vec_cmplt(v_val_1, zeros16) ;
198
+            // Keep values that are greater or equal to 0
199
+            v_val_0 = vec_andc(v_val_0, v_comp_zero_0) ;
200
+            v_val_1 = vec_andc(v_val_1, v_comp_zero_1) ;
201
x265_2.2.tar.gz/source/common/ppc/pixel_altivec.cpp Added
201
 
1
@@ -0,0 +1,4321 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2013 x265 project
4
+ *
5
+ * Authors: Steve Borho <steve@borho.org>
6
+ *          Mandar Gurav <mandar@multicorewareinc.com>
7
+ *          Mahesh Pittala <mahesh@multicorewareinc.com>
8
+ *          Min Chen <min.chen@multicorewareinc.com>
9
+ *
10
+ * This program is free software; you can redistribute it and/or modify
11
+ * it under the terms of the GNU General Public License as published by
12
+ * the Free Software Foundation; either version 2 of the License, or
13
+ * (at your option) any later version.
14
+ *
15
+ * This program is distributed in the hope that it will be useful,
16
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
+ * GNU General Public License for more details.
19
+ *
20
+ * You should have received a copy of the GNU General Public License
21
+ * along with this program; if not, write to the Free Software
22
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
23
+ *
24
+ * This program is also available under a commercial proprietary license.
25
+ * For more information, contact us at license @ x265.com.
26
+ *****************************************************************************/
27
+
28
+#include "common.h"
29
+#include "primitives.h"
30
+#include "x265.h"
31
+#include "ppccommon.h"
32
+
33
+#include <cstdlib> // abs()
34
+
35
+//using namespace X265_NS;
36
+
37
+namespace X265_NS {
38
+// place functions in anonymous namespace (file static)
39
+
40
+ /* Null vector: LOAD_ZERO declares a local all-zero byte vector named zerov; the zero_* macros below only cast that variable, so LOAD_ZERO must appear in any scope that uses them. (The same macros are also defined in ppccommon.h, which this file includes.) */
41
+#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 )
42
+    // typed views of zerov
43
+#define zero_u8v  (vec_u8_t)  zerov
44
+#define zero_s8v  (vec_s8_t)  zerov
45
+#define zero_u16v (vec_u16_t) zerov
46
+#define zero_s16v (vec_s16_t) zerov
47
+#define zero_u32v (vec_u32_t) zerov
48
+#define zero_s32v (vec_s32_t) zerov
49
+
50
+ /* 8 -> 16 bit widening: each byte is interleaved with a zero byte via vec_mergeh (_h) or vec_mergel (_l); the operand order is swapped between the big- and little-endian branches so the zero byte lands in the high half of each 16-bit element. */
51
+#ifdef WORDS_BIGENDIAN
52
+#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
53
+#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
54
+#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
55
+#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
56
+#else
57
+#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( (vec_u8_t) v, zero_u8v )
58
+#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( (vec_u8_t) v, zero_u8v )
59
+#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( (vec_u8_t) v, zero_u8v )
60
+#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( (vec_u8_t) v, zero_u8v )
61
+#endif
62
+    // the un-suffixed forms are aliases for the _h variants
63
+#define vec_u8_to_u16(v) vec_u8_to_u16_h(v)
64
+#define vec_u8_to_s16(v) vec_u8_to_s16_h(v)
65
+
66
+#if defined(__GNUC__) // ALIGN_VAR_n(T, var): declare variable `var` of type T with n-byte alignment (GCC attribute syntax)
67
+#define ALIGN_VAR_8(T, var)  T var __attribute__((aligned(8)))
68
+#define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16)))
69
+#define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32)))
70
+#elif defined(_MSC_VER) // same macros using the MSVC __declspec syntax
71
+#define ALIGN_VAR_8(T, var)  __declspec(align(8)) T var
72
+#define ALIGN_VAR_16(T, var) __declspec(align(16)) T var
73
+#define ALIGN_VAR_32(T, var) __declspec(align(32)) T var
74
+#endif // if defined(__GNUC__) -- note: there is no fallback branch, so the macros stay undefined on any other compiler
75
+
76
+typedef uint8_t  pixel; // this file fixes pixel to 8 bits
77
+typedef uint32_t sum2_t ; // 32-bit accumulator type (twice the width of sum_t)
78
+typedef uint16_t sum_t ; // 16-bit accumulator type
79
+#define BITS_PER_SUM (8 * sizeof(sum_t)) // number of bits in sum_t, i.e. 16 with the typedef above
80
+
81
+/***********************************************************************
82
+ * SAD routines - altivec implementation
83
+ **********************************************************************/
84
+template<int lx, int ly> // primary template: reduces the partial-sum vector sumv into *sum for an lx x ly block
85
+void inline sum_columns_altivec(vec_s32_t sumv, int* sum){} // generic version is intentionally empty and does NOT write *sum; only the explicit specializations store a result
86
+
87
+template<int lx, int ly> // lx = block width (at most 16), ly = block height in rows
88
+int inline sad16_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2) // sum of absolute differences between two pixel blocks, one 16-byte vector per row
89
+{
90
+    assert(lx <=16); // each row is handled with a single 16-byte vector load
91
+    LOAD_ZERO; // declares zerov, needed by the zero_* macros below
92
+    vec_u8_t  pix1v, pix2v;
93
+    vec_u8_t  absv = zero_u8v;
94
+    vec_s32_t sumv = zero_s32v; // four running 32-bit partial sums
95
+    ALIGN_VAR_16(int, sum ); // NOTE(review): sum is not initialized here; it only gets a value if sum_columns_altivec<lx, ly> has a specialization that stores to it (the generic template is empty)
96
+
97
+    for( int y = 0; y < ly; y++ )
98
+    {
99
+        pix1v = /*vec_vsx_ld*/vec_xl( 0, pix1);
100
+        pix2v = /*vec_vsx_ld*/vec_xl( 0, pix2);
101
+        //print_vec_u8("pix1v", &pix1v);
102
+        //print_vec_u8("pix2v", &pix2v);
103
+        // |a - b| for unsigned bytes, computed as max(a, b) - min(a, b)
104
+        absv = (vector unsigned char)vec_sub(vec_max(pix1v, pix2v), vec_min(pix1v, pix2v));
105
+        //print_vec_u8("abs sub", &absv);
106
+        // vec_sum4s adds each group of four bytes of absv into the corresponding 32-bit lane of sumv
107
+        sumv = (vec_s32_t) vec_sum4s( absv, (vec_u32_t) sumv);
108
+        //print_vec_i("vec_sum4s 0", &sumv);
109
+        // advance both blocks to the next row
110
+        pix1 += stride_pix1;
111
+        pix2 += stride_pix2;
112
+    }
113
+    // NOTE(review): all 16 loaded bytes were accumulated regardless of lx; presumably the lx < 16 specializations of sum_columns_altivec keep only the lanes that belong to the block -- confirm
114
+    sum_columns_altivec<lx, ly>(sumv, &sum);
115
+    //printf("<%d %d>%d\n", lx, ly, sum);
116
+    return sum;
117
+}
118
+
119
+template<int lx, int ly> // placeholder overload for int16_t samples: not implemented yet
120
+int sad16_altivec(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
121
+{
122
+    int sum = 0; // no work is done: the inputs are ignored
123
+    return sum; // always 0
124
+}
125
+
126
+template<int lx, int ly>// placeholder for int16_t samples: not implemented yet
127
+int sad_altivec(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
128
+{
129
+    int sum = 0; // no work is done: the inputs are ignored
130
+    return sum; // always 0
131
+}
132
+
133
+template<> // 16x4 block: all four partial sums belong to the block, so reduce them all
134
+void inline sum_columns_altivec<16, 4>(vec_s32_t sumv, int* sum)
135
+{
136
+    LOAD_ZERO;
137
+    sumv = vec_sums( sumv, zero_s32v ); // vec_sums: add the four 32-bit elements of sumv (plus one element of the zero vector) into a single element
138
+    //print_vec_i("vec_sums", &sumv);
139
+    sumv = vec_splat( sumv, 3 ); // copy element 3 (the total) into every element
140
+    //print_vec_i("vec_splat 3", &sumv);
141
+    vec_ste( sumv, 0, sum ); // store one element to *sum; after the splat every element holds the total
142
+}
143
+
144
+template<> // 16x8 block: all four partial sums belong to the block, so reduce them all
145
+void inline sum_columns_altivec<16, 8>(vec_s32_t sumv, int* sum)
146
+{
147
+    LOAD_ZERO;
148
+    sumv = vec_sums( sumv, zero_s32v ); // vec_sums: add the four 32-bit elements of sumv (plus one element of the zero vector) into a single element
149
+    //print_vec_i("vec_sums", &sumv);
150
+    sumv = vec_splat( sumv, 3 ); // copy element 3 (the total) into every element
151
+    //print_vec_i("vec_splat 3", &sumv);
152
+    vec_ste( sumv, 0, sum ); // store one element to *sum; after the splat every element holds the total
153
+}
154
+
155
+template<> // 16x12 block: all four partial sums belong to the block, so reduce them all
156
+void inline sum_columns_altivec<16, 12>(vec_s32_t sumv, int* sum)
157
+{
158
+    LOAD_ZERO;
159
+    sumv = vec_sums( sumv, zero_s32v ); // vec_sums: add the four 32-bit elements of sumv (plus one element of the zero vector) into a single element
160
+    //print_vec_i("vec_sums", &sumv);
161
+    sumv = vec_splat( sumv, 3 ); // copy element 3 (the total) into every element
162
+    //print_vec_i("vec_splat 3", &sumv);
163
+    vec_ste( sumv, 0, sum ); // store one element to *sum; after the splat every element holds the total
164
+}
165
+
166
+template<> // 16x16 block: all four partial sums belong to the block, so reduce them all
167
+void inline sum_columns_altivec<16, 16>(vec_s32_t sumv, int* sum)
168
+{
169
+    LOAD_ZERO;
170
+    sumv = vec_sums( sumv, zero_s32v ); // vec_sums: add the four 32-bit elements of sumv (plus one element of the zero vector) into a single element
171
+    //print_vec_i("vec_sums", &sumv);
172
+    sumv = vec_splat( sumv, 3 ); // copy element 3 (the total) into every element
173
+    //print_vec_i("vec_splat 3", &sumv);
174
+    vec_ste( sumv, 0, sum ); // store one element to *sum; after the splat every element holds the total
175
+}
176
+
177
+template<> // 16x24 block: all four partial sums belong to the block, so reduce them all
178
+void inline sum_columns_altivec<16, 24>(vec_s32_t sumv, int* sum)
179
+{
180
+    LOAD_ZERO;
181
+    sumv = vec_sums( sumv, zero_s32v ); // vec_sums: add the four 32-bit elements of sumv (plus one element of the zero vector) into a single element
182
+    //print_vec_i("vec_sums", &sumv);
183
+    sumv = vec_splat( sumv, 3 ); // copy element 3 (the total) into every element
184
+    //print_vec_i("vec_splat 3", &sumv);
185
+    vec_ste( sumv, 0, sum ); // store one element to *sum; after the splat every element holds the total
186
+}
187
+
188
+template<> // 16x32 block: all four partial sums belong to the block, so reduce them all
189
+void inline sum_columns_altivec<16, 32>(vec_s32_t sumv, int* sum)
190
+{
191
+    LOAD_ZERO;
192
+    sumv = vec_sums( sumv, zero_s32v ); // vec_sums: add the four 32-bit elements of sumv (plus one element of the zero vector) into a single element
193
+    //print_vec_i("vec_sums", &sumv);
194
+    sumv = vec_splat( sumv, 3 ); // copy element 3 (the total) into every element
195
+    //print_vec_i("vec_splat 3", &sumv);
196
+    vec_ste( sumv, 0, sum ); // store one element to *sum; after the splat every element holds the total
197
+}
198
+
199
+template<>
200
+void inline sum_columns_altivec<16, 48>(vec_s32_t sumv, int* sum)
201
x265_2.2.tar.gz/source/common/ppc/ppccommon.h Added
93
 
1
@@ -0,0 +1,91 @@
2
+/*****************************************************************************
3
+ * Copyright (C) 2013 x265 project
4
+ *
5
+ * Authors: Min Chen <min.chen@multicorewareinc.com>
6
+ *
7
+ * This program is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU General Public License as published by
9
+ * the Free Software Foundation; either version 2 of the License, or
10
+ * (at your option) any later version.
11
+ *
12
+ * This program is distributed in the hope that it will be useful,
13
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
+ * GNU General Public License for more details.
16
+ *
17
+ * You should have received a copy of the GNU General Public License
18
+ * along with this program; if not, write to the Free Software
19
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
20
+ *
21
+ * This program is also available under a commercial proprietary license.
22
+ * For more information, contact us at license @ x265.com.
23
+ *****************************************************************************/
24
+
25
+#ifndef X265_PPCCOMMON_H
26
+#define X265_PPCCOMMON_H
27
+
28
+
29
+#if HAVE_ALTIVEC
30
+#include <altivec.h>
31
+
32
+#define vec_u8_t  vector unsigned char
33
+#define vec_s8_t  vector signed char
34
+#define vec_u16_t vector unsigned short
35
+#define vec_s16_t vector signed short
36
+#define vec_u32_t vector unsigned int
37
+#define vec_s32_t vector signed int
38
+
39
+// Copied from x264. LOAD_ZERO declares a local all-zero byte vector named zerov; the zero_* macros only cast that variable, so LOAD_ZERO must appear in any scope that uses them.
40
+#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 )
41
+    // typed views of zerov
42
+#define zero_u8v  (vec_u8_t)  zerov
43
+#define zero_s8v  (vec_s8_t)  zerov
44
+#define zero_u16v (vec_u16_t) zerov
45
+#define zero_s16v (vec_s16_t) zerov
46
+#define zero_u32v (vec_u32_t) zerov
47
+#define zero_s32v (vec_s32_t) zerov
48
+
49
+/***********************************************************************
50
+ * 8 <-> 16 bits conversions
51
+ **********************************************************************/
52
+#ifdef WORDS_BIGENDIAN
53
+#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
54
+#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
55
+#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
56
+#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
57
+#else
58
+#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( (vec_u8_t) v, zero_u8v )
59
+#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( (vec_u8_t) v, zero_u8v )
60
+#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( (vec_u8_t) v, zero_u8v )
61
+#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( (vec_u8_t) v, zero_u8v )
62
+#endif
63
+
64
+#define vec_u8_to_u16(v) vec_u8_to_u16_h(v)
65
+#define vec_u8_to_s16(v) vec_u8_to_s16_h(v)
66
+
67
+#ifdef WORDS_BIGENDIAN
68
+#define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( zero_u16v, (vec_u16_t) v )
69
+#define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( zero_u16v, (vec_u16_t) v )
70
+#define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( zero_u16v, (vec_u16_t) v )
71
+#define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( zero_u16v, (vec_u16_t) v )
72
+#else
73
+#define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( (vec_u16_t) v, zero_u16v )
74
+#define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( (vec_u16_t) v, zero_u16v )
75
+#define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( (vec_u16_t) v, zero_u16v )
76
+#define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( (vec_u16_t) v, zero_u16v )
77
+#endif
78
+
79
+#define vec_u16_to_u32(v) vec_u16_to_u32_h(v)
80
+#define vec_u16_to_s32(v) vec_u16_to_s32_h(v)
81
+
82
+#define vec_u32_to_u16(v) vec_pack( v, zero_u32v ) // narrow 32 -> 16 bits with vec_pack; the second half of the result comes from the zero vector
83
+#define vec_s32_to_u16(v) vec_packsu( v, zero_s32v ) // narrow signed 32 -> unsigned 16 bits with vec_packsu (saturating pack)
84
+    // NOTE: sum_t is not declared in this header; the including file must define it before using BITS_PER_SUM
85
+#define BITS_PER_SUM (8 * sizeof(sum_t))
86
+
87
+#endif /* HAVE_ALTIVEC */
88
+
89
+#endif /* X265_PPCCOMMON_H */
90
+
91
+
92
+
93
x265_2.1.tar.gz/source/common/primitives.cpp -> x265_2.2.tar.gz/source/common/primitives.cpp Changed
17
 
1
@@ -243,6 +243,15 @@
2
 #endif
3
         setupAssemblyPrimitives(primitives, param->cpuid);
4
 #endif
5
+#if HAVE_ALTIVEC // AltiVec build: these setup calls run after the assembly setup above and before setupAliasPrimitives
6
+        if (param->cpuid & X265_CPU_ALTIVEC) // only when the AltiVec bit is set in the cpuid mask
7
+        {
8
+            setupPixelPrimitives_altivec(primitives);       // pixel_altivec.cpp, overwrite the initialization for altivec optimized functions
9
+            setupDCTPrimitives_altivec(primitives);         // dct_altivec.cpp, overwrite the initialization for altivec optimized functions
10
+            setupFilterPrimitives_altivec(primitives);      // ipfilter.cpp (presumably ipfilter_altivec.cpp -- confirm), overwrite the initialization for altivec optimized functions
11
+            setupIntraPrimitives_altivec(primitives);       // intrapred_altivec.cpp, overwrite the initialization for altivec optimized functions
12
+        }
13
+#endif
14
 
15
         setupAliasPrimitives(primitives);
16
     }
17
x265_2.1.tar.gz/source/common/primitives.h -> x265_2.2.tar.gz/source/common/primitives.h Changed
53
 
1
@@ -115,6 +115,7 @@
2
 typedef sse_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride, const pixel* fref, intptr_t frefstride); // fenc is aligned
3
 typedef sse_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride, const int16_t* fref, intptr_t frefstride);
4
 typedef sse_t (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
5
+typedef int(*pixelcmp_ads_t)(int encDC[], uint32_t *sums, int delta, uint16_t *costMvX, int16_t *mvs, int width, int thresh);
6
 typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
7
 typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
8
 typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val);
9
@@ -217,6 +218,7 @@
10
         pixelcmp_t     sad;         // Sum of Absolute Differences
11
         pixelcmp_x3_t  sad_x3;      // Sum of Absolute Differences, 3 mv offsets at once
12
         pixelcmp_x4_t  sad_x4;      // Sum of Absolute Differences, 4 mv offsets at once
13
+        pixelcmp_ads_t ads;         // Absolute Differences sum
14
         pixelcmp_t     satd;        // Sum of Absolute Transformed Differences (4x4 Hadamard)
15
 
16
         filter_pp_t    luma_hpp;    // 8-tap luma motion compensation interpolation filters
17
@@ -402,6 +404,22 @@
18
     return part;
19
 }
20
 
21
+/* Computes the size of the LumaPU for a given LumaPU enum: reverse lookup of `part` in lumaPartitionMapTable, writing the block dimensions in pixels to *width and *height. */
22
+inline void sizesFromPartition(int part, int *width, int *height)
23
+{
24
+    X265_CHECK(part >= 0 && part <= 24, "Invalid part %d \n", part);
25
+    extern const uint8_t lumaPartitionMapTable[];
26
+    int index = 0; // stays 0 if no table entry matches, which yields 4x4 below
27
+    for (int i = 0; i < 256;i++) // linear scan of the first 256 entries; the first match wins
28
+        if (part == lumaPartitionMapTable[i])
29
+        {
30
+            index = i;
31
+            break;
32
+        }
33
+    *width = 4 * ((index >> 4) + 1); // index / 16 selects the width in units of 4 pixels (0 -> 4, 15 -> 64)
34
+    *height = 4 * ((index % 16) + 1); // index % 16 selects the height in units of 4 pixels
35
+}
36
+
37
 inline int partitionFromLog2Size(int log2Size)
38
 {
39
     X265_CHECK(2 <= log2Size && log2Size <= 6, "Invalid block size\n");
40
@@ -412,6 +430,12 @@
41
 void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask);
42
 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask);
43
 void setupAliasPrimitives(EncoderPrimitives &p);
44
+#if HAVE_ALTIVEC
45
+void setupPixelPrimitives_altivec(EncoderPrimitives &p);
46
+void setupDCTPrimitives_altivec(EncoderPrimitives &p);
47
+void setupFilterPrimitives_altivec(EncoderPrimitives &p);
48
+void setupIntraPrimitives_altivec(EncoderPrimitives &p);
49
+#endif
50
 }
51
 
52
 #if !EXPORT_C_API
53
x265_2.1.tar.gz/source/common/scalinglist.cpp -> x265_2.2.tar.gz/source/common/scalinglist.cpp Changed
154
 
1
@@ -29,64 +29,6 @@
2
 // file-anonymous namespace
3
 
4
 /* Strings for scaling list file parsing */
5
-const char MatrixType[4][6][20] =
6
-{
7
-    {
8
-        "INTRA4X4_LUMA",
9
-        "INTRA4X4_CHROMAU",
10
-        "INTRA4X4_CHROMAV",
11
-        "INTER4X4_LUMA",
12
-        "INTER4X4_CHROMAU",
13
-        "INTER4X4_CHROMAV"
14
-    },
15
-    {
16
-        "INTRA8X8_LUMA",
17
-        "INTRA8X8_CHROMAU",
18
-        "INTRA8X8_CHROMAV",
19
-        "INTER8X8_LUMA",
20
-        "INTER8X8_CHROMAU",
21
-        "INTER8X8_CHROMAV"
22
-    },
23
-    {
24
-        "INTRA16X16_LUMA",
25
-        "INTRA16X16_CHROMAU",
26
-        "INTRA16X16_CHROMAV",
27
-        "INTER16X16_LUMA",
28
-        "INTER16X16_CHROMAU",
29
-        "INTER16X16_CHROMAV"
30
-    },
31
-    {
32
-        "INTRA32X32_LUMA",
33
-        "",
34
-        "",
35
-        "INTER32X32_LUMA",
36
-        "",
37
-        "",
38
-    },
39
-};
40
-const char MatrixType_DC[4][12][22] =
41
-{
42
-    {
43
-    },
44
-    {
45
-    },
46
-    {
47
-        "INTRA16X16_LUMA_DC",
48
-        "INTRA16X16_CHROMAU_DC",
49
-        "INTRA16X16_CHROMAV_DC",
50
-        "INTER16X16_LUMA_DC",
51
-        "INTER16X16_CHROMAU_DC",
52
-        "INTER16X16_CHROMAV_DC"
53
-    },
54
-    {
55
-        "INTRA32X32_LUMA_DC",
56
-        "",
57
-        "",
58
-        "INTER32X32_LUMA_DC",
59
-        "",
60
-        "",
61
-    },
62
-};
63
 
64
 static int quantTSDefault4x4[16] =
65
 {
66
@@ -124,6 +66,64 @@
67
 
68
 namespace X265_NS {
69
 // private namespace
70
+    const char ScalingList::MatrixType[4][6][20] =
71
+    {
72
+        {
73
+            "INTRA4X4_LUMA",
74
+            "INTRA4X4_CHROMAU",
75
+            "INTRA4X4_CHROMAV",
76
+            "INTER4X4_LUMA",
77
+            "INTER4X4_CHROMAU",
78
+            "INTER4X4_CHROMAV"
79
+        },
80
+        {
81
+            "INTRA8X8_LUMA",
82
+            "INTRA8X8_CHROMAU",
83
+            "INTRA8X8_CHROMAV",
84
+            "INTER8X8_LUMA",
85
+            "INTER8X8_CHROMAU",
86
+            "INTER8X8_CHROMAV"
87
+        },
88
+        {
89
+            "INTRA16X16_LUMA",
90
+            "INTRA16X16_CHROMAU",
91
+            "INTRA16X16_CHROMAV",
92
+            "INTER16X16_LUMA",
93
+            "INTER16X16_CHROMAU",
94
+            "INTER16X16_CHROMAV"
95
+        },
96
+        {
97
+            "INTRA32X32_LUMA",
98
+            "",
99
+            "",
100
+            "INTER32X32_LUMA",
101
+            "",
102
+            "",
103
+        },
104
+    };
105
+    const char ScalingList::MatrixType_DC[4][12][22] =
106
+    {
107
+        {
108
+        },
109
+        {
110
+        },
111
+        {
112
+            "INTRA16X16_LUMA_DC",
113
+            "INTRA16X16_CHROMAU_DC",
114
+            "INTRA16X16_CHROMAV_DC",
115
+            "INTER16X16_LUMA_DC",
116
+            "INTER16X16_CHROMAU_DC",
117
+            "INTER16X16_CHROMAV_DC"
118
+        },
119
+        {
120
+            "INTRA32X32_LUMA_DC",
121
+            "",
122
+            "",
123
+            "INTER32X32_LUMA_DC",
124
+            "",
125
+            "",
126
+        },
127
+    };
128
 
129
 const int     ScalingList::s_numCoefPerSize[NUM_SIZES] = { 16, 64, 256, 1024 };
130
 const int32_t ScalingList::s_quantScales[NUM_REM] = { 26214, 23302, 20560, 18396, 16384, 14564 };
131
@@ -312,6 +312,22 @@
132
                 m_scalingListDC[sizeIdc][listIdc] = data;
133
             }
134
         }
135
+        if (sizeIdc == 3)
136
+        {
137
+            for (int listIdc = 1; listIdc < NUM_LISTS; listIdc++)
138
+            {
139
+                if (listIdc % 3 != 0)
140
+                {
141
+                    src = m_scalingListCoef[sizeIdc][listIdc];
142
+                    const int *srcNextSmallerSize = m_scalingListCoef[sizeIdc - 1][listIdc];
143
+                    for (int i = 0; i < size; i++)
144
+                    {
145
+                        src[i] = srcNextSmallerSize[i];
146
+                    }
147
+                    m_scalingListDC[sizeIdc][listIdc] = m_scalingListDC[sizeIdc - 1][listIdc];
148
+                }
149
+            }
150
+        }
151
     }
152
 
153
     fclose(fp);
154
x265_2.1.tar.gz/source/common/scalinglist.h -> x265_2.2.tar.gz/source/common/scalinglist.h Changed
10
 
1
@@ -42,6 +42,8 @@
2
     static const int     s_numCoefPerSize[NUM_SIZES];
3
     static const int32_t s_invQuantScales[NUM_REM];
4
     static const int32_t s_quantScales[NUM_REM];
5
+    static const char MatrixType[4][6][20];
6
+    static const char MatrixType_DC[4][12][22];
7
 
8
     int32_t  m_scalingListDC[NUM_SIZES][NUM_LISTS];   // the DC value of the matrix coefficient for 16x16
9
     int32_t* m_scalingListCoef[NUM_SIZES][NUM_LISTS]; // quantization matrix
10
x265_2.1.tar.gz/source/common/slice.h -> x265_2.2.tar.gz/source/common/slice.h Changed
58
 
1
@@ -239,11 +239,16 @@
2
     uint32_t maxLatencyIncrease;
3
     int      numReorderPics;
4
 
5
+    RPS      spsrps[MAX_NUM_SHORT_TERM_RPS];
6
+    int      spsrpsNum;
7
+    int      numGOPBegin;
8
+
9
     bool     bUseSAO; // use param
10
     bool     bUseAMP; // use param
11
     bool     bUseStrongIntraSmoothing; // use param
12
     bool     bTemporalMVPEnabled;
13
-    bool     bDiscardOptionalVUI;
14
+    bool     bEmitVUITimingInfo;
15
+    bool     bEmitVUIHRDInfo;
16
 
17
     Window   conformanceWindow;
18
     VUI      vuiParameters;
19
@@ -282,6 +287,8 @@
20
 
21
     bool     bDeblockingFilterControlPresent;
22
     bool     bPicDisableDeblockingFilter;
23
+
24
+    int      numRefIdxDefault[2];
25
 };
26
 
27
 struct WeightParam
28
@@ -334,6 +341,7 @@
29
     int         m_sliceQp;
30
     int         m_poc;
31
     int         m_lastIDR;
32
+    int         m_rpsIdx;
33
 
34
     uint32_t    m_colRefIdx;       // never modified
35
 
36
@@ -347,6 +355,10 @@
37
     bool        m_sLFaseFlag;      // loop filter boundary flag
38
     bool        m_colFromL0Flag;   // collocated picture from List0 or List1 flag
39
 
40
+    int         m_iPPSQpMinus26;
41
+    int         numRefIdxDefault[2];
42
+    int         m_iNumRPSInSPS;
43
+
44
     Slice()
45
     {
46
         m_lastIDR = 0;
47
@@ -356,6 +368,10 @@
48
         memset(m_refReconPicList, 0, sizeof(m_refReconPicList));
49
         memset(m_refPOCList, 0, sizeof(m_refPOCList));
50
         disableWeights();
51
+        m_iPPSQpMinus26 = 0;
52
+        numRefIdxDefault[0] = 1;
53
+        numRefIdxDefault[1] = 1;
54
+        m_rpsIdx = -1;
55
     }
56
 
57
     void disableWeights();
58
x265_2.1.tar.gz/source/common/version.cpp -> x265_2.2.tar.gz/source/common/version.cpp Changed
10
 
1
@@ -77,7 +77,7 @@
2
 #define BITS    "[32 bit]"
3
 #endif
4
 
5
-#if defined(ENABLE_ASSEMBLY)
6
+#if defined(ENABLE_ASSEMBLY) || HAVE_ALTIVEC
7
 #define ASM     ""
8
 #else
9
 #define ASM     "[noasm]"
10
x265_2.1.tar.gz/source/common/yuv.cpp -> x265_2.2.tar.gz/source/common/yuv.cpp Changed
13
 
1
@@ -47,6 +47,11 @@
2
     m_size  = size;
3
     m_part = partitionFromSizes(size, size);
4
 
5
+    for (int i = 0; i < 2; i++)
6
+        for (int j = 0; j < MAX_NUM_REF; j++)
7
+            for (int k = 0; k < INTEGRAL_PLANE_NUM; k++)
8
+                m_integral[i][j][k] = NULL;
9
+
10
     if (csp == X265_CSP_I400)
11
     {
12
         CHECKED_MALLOC(m_buf[0], pixel, size * size + 8);
13
x265_2.1.tar.gz/source/common/yuv.h -> x265_2.2.tar.gz/source/common/yuv.h Changed
9
 
1
@@ -48,6 +48,7 @@
2
     int      m_csp;
3
     int      m_hChromaShift;
4
     int      m_vChromaShift;
5
+    uint32_t *m_integral[2][MAX_NUM_REF][INTEGRAL_PLANE_NUM];
6
 
7
     Yuv();
8
 
9
x265_2.1.tar.gz/source/encoder/analysis.cpp -> x265_2.2.tar.gz/source/encoder/analysis.cpp Changed
198
 
1
@@ -203,6 +203,57 @@
2
     return *m_modeDepth[0].bestMode;
3
 }
4
 
5
+int32_t Analysis::loadTUDepth(CUGeom cuGeom, CUData parentCTU)
6
+{
7
+    float predDepth = 0;
8
+    CUData* neighbourCU;
9
+    uint8_t count = 0;
10
+    int32_t maxTUDepth = -1;
11
+    neighbourCU = m_slice->m_refFrameList[0][0]->m_encData->m_picCTU;
12
+    predDepth += neighbourCU->m_refTuDepth[cuGeom.geomRecurId];
13
+    count++;
14
+    if (m_slice->isInterB())
15
+    {
16
+        neighbourCU = m_slice->m_refFrameList[1][0]->m_encData->m_picCTU;
17
+        predDepth += neighbourCU->m_refTuDepth[cuGeom.geomRecurId];
18
+        count++;
19
+    }
20
+    if (parentCTU.m_cuAbove)
21
+    {
22
+        predDepth += parentCTU.m_cuAbove->m_refTuDepth[cuGeom.geomRecurId];
23
+        count++;
24
+        if (parentCTU.m_cuAboveLeft)
25
+        {
26
+            predDepth += parentCTU.m_cuAboveLeft->m_refTuDepth[cuGeom.geomRecurId];
27
+            count++;
28
+        }
29
+        if (parentCTU.m_cuAboveRight)
30
+        {
31
+            predDepth += parentCTU.m_cuAboveRight->m_refTuDepth[cuGeom.geomRecurId];
32
+            count++;
33
+        }
34
+    }
35
+    if (parentCTU.m_cuLeft)
36
+    {
37
+        predDepth += parentCTU.m_cuLeft->m_refTuDepth[cuGeom.geomRecurId];
38
+        count++;
39
+    }
40
+    predDepth /= count;
41
+
42
+    if (predDepth == 0)
43
+        maxTUDepth = 0;
44
+    else if (predDepth < 1)
45
+        maxTUDepth = 1;
46
+    else if (predDepth >= 1 && predDepth <= 1.5)
47
+        maxTUDepth = 2;
48
+    else if (predDepth > 1.5 && predDepth <= 2.5)
49
+        maxTUDepth = 3;
50
+    else
51
+        maxTUDepth = -1;
52
+
53
+    return maxTUDepth;
54
+}
55
+
56
 void Analysis::tryLossless(const CUGeom& cuGeom)
57
 {
58
     ModeDepth& md = m_modeDepth[cuGeom.depth];
59
@@ -394,6 +445,16 @@
60
         cacheCost[cuIdx] = md.bestMode->rdCost;
61
     }
62
 
63
+    /* Save Intra CUs TU depth only when analysis mode is OFF */
64
+    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4 && !m_param->analysisMode)
65
+    {
66
+        CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
67
+        int8_t maxTUDepth = -1;
68
+        for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
69
+            maxTUDepth = X265_MAX(maxTUDepth, md.pred[PRED_INTRA].cu.m_tuDepth[i]);
70
+        ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
71
+    }
72
+
73
     /* Copy best data to encData CTU and recon */
74
     md.bestMode->cu.copyToPic(depth);
75
     if (md.bestMode != &md.pred[PRED_SPLIT])
76
@@ -883,6 +944,16 @@
77
     ModeDepth& md = m_modeDepth[depth];
78
     md.bestMode = NULL;
79
 
80
+    if (m_param->searchMethod == X265_SEA)
81
+    {
82
+        int numPredDir = m_slice->isInterP() ? 1 : 2;
83
+        int offset = (int)(m_frame->m_reconPic->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic->m_buOffsetY[cuGeom.absPartIdx]);
84
+        for (int list = 0; list < numPredDir; list++)
85
+            for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
86
+                for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
87
+                    m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;
88
+    }
89
+
90
     PicYuv& reconPic = *m_frame->m_reconPic;
91
 
92
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
93
@@ -894,6 +965,9 @@
94
     bool skipRectAmp = false;
95
     bool chooseMerge = false;
96
 
97
+    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
98
+        m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
99
+
100
     SplitData splitData[4];
101
     splitData[0].initSplitCUData();
102
     splitData[1].initSplitCUData();
103
@@ -1400,6 +1474,18 @@
104
     if (m_param->rdLevel)
105
         md.bestMode->reconYuv.copyToPicYuv(reconPic, cuAddr, cuGeom.absPartIdx);
106
 
107
+    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
108
+    {
109
+        if (mightNotSplit)
110
+        {
111
+            CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
112
+            int8_t maxTUDepth = -1;
113
+            for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
114
+                maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
115
+            ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
116
+        }
117
+    }
118
+
119
     return splitCUData;
120
 }
121
 
122
@@ -1409,6 +1495,16 @@
123
     ModeDepth& md = m_modeDepth[depth];
124
     md.bestMode = NULL;
125
 
126
+    if (m_param->searchMethod == X265_SEA)
127
+    {
128
+        int numPredDir = m_slice->isInterP() ? 1 : 2;
129
+        int offset = (int)(m_frame->m_reconPic->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic->m_buOffsetY[cuGeom.absPartIdx]);
130
+        for (int list = 0; list < numPredDir; list++)
131
+            for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
132
+                for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
133
+                    m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;
134
+    }
135
+
136
     bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
137
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
138
     bool skipRecursion = false;
139
@@ -1424,6 +1520,9 @@
140
         md.pred[PRED_2Nx2N].rdCost = 0;
141
     }
142
 
143
+    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
144
+        m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
145
+
146
     SplitData splitData[4];
147
     splitData[0].initSplitCUData();
148
     splitData[1].initSplitCUData();
149
@@ -1751,6 +1850,18 @@
150
             addSplitFlagCost(*md.bestMode, cuGeom.depth);
151
     }
152
 
153
+    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
154
+    {
155
+        if (mightNotSplit)
156
+        {
157
+            CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
158
+            int8_t maxTUDepth = -1;
159
+            for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
160
+                maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
161
+            ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
162
+        }
163
+    }
164
+
165
     /* compare split RD cost against best cost */
166
     if (mightSplit && !skipRecursion)
167
         checkBestMode(md.pred[PRED_SPLIT], depth);
168
@@ -1942,12 +2053,12 @@
169
             if (m_param->maxSlices > 1)
170
             {
171
                 // NOTE: First row in slice can't negative
172
-                if ((candMvField[i][0].mv.y < m_sliceMinY) | (candMvField[i][1].mv.y < m_sliceMinY))
173
+                if (X265_MIN(candMvField[i][0].mv.y, candMvField[i][1].mv.y) < m_sliceMinY)
174
                     continue;
175
 
176
                 // Last row in slice can't reference beyond bound since it is another slice area
177
                 // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
178
-                if ((candMvField[i][0].mv.y > m_sliceMaxY) | (candMvField[i][1].mv.y > m_sliceMaxY))
179
+                if (X265_MAX(candMvField[i][0].mv.y, candMvField[i][1].mv.y) > m_sliceMaxY)
180
                     continue;
181
             }
182
 
183
@@ -2072,12 +2183,12 @@
184
             if (m_param->maxSlices > 1)
185
             {
186
                 // NOTE: First row in slice can't negative
187
-                if ((candMvField[i][0].mv.y < m_sliceMinY) | (candMvField[i][1].mv.y < m_sliceMinY))
188
+                if (X265_MIN(candMvField[i][0].mv.y, candMvField[i][1].mv.y) < m_sliceMinY)
189
                     continue;
190
 
191
                 // Last row in slice can't reference beyond bound since it is another slice area
192
                 // TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. Necessary prepare research on load balance
193
-                if ((candMvField[i][0].mv.y > m_sliceMaxY) | (candMvField[i][1].mv.y > m_sliceMaxY))
194
+                if (X265_MAX(candMvField[i][0].mv.y, candMvField[i][1].mv.y) > m_sliceMaxY)
195
                     continue;
196
             }
197
 
198
x265_2.1.tar.gz/source/encoder/analysis.h -> x265_2.2.tar.gz/source/encoder/analysis.h Changed
9
 
1
@@ -116,6 +116,7 @@
2
     void destroy();
3
 
4
     Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
5
+    int32_t loadTUDepth(CUGeom cuGeom, CUData parentCTU);
6
 
7
 protected:
8
     /* Analysis data for save/load mode, writes/reads data based on absPartIdx */
9
x265_2.1.tar.gz/source/encoder/api.cpp -> x265_2.2.tar.gz/source/encoder/api.cpp Changed
13
 
1
@@ -141,6 +141,11 @@
2
         Encoder *encoder = static_cast<Encoder*>(enc);
3
         Entropy sbacCoder;
4
         Bitstream bs;
5
+        if (encoder->m_param->rc.bStatRead && encoder->m_param->bMultiPassOptRPS)
6
+        {
7
+            if (!encoder->computeSPSRPSIndex())
8
+                return -1;
9
+        }
10
         encoder->getStreamHeaders(encoder->m_nalList, sbacCoder, bs);
11
         *pp_nal = &encoder->m_nalList.m_nal[0];
12
         if (pi_nal) *pi_nal = encoder->m_nalList.m_numNal;
13
x265_2.1.tar.gz/source/encoder/bitcost.cpp -> x265_2.2.tar.gz/source/encoder/bitcost.cpp Changed
62
 
1
@@ -54,16 +54,40 @@
2
                 s_costs[qp][i] = s_costs[qp][-i] = (uint16_t)X265_MIN(s_bitsizes[i] * lambda + 0.5f, (1 << 15) - 1);
3
         }
4
     }
5
-
6
+    for (int j = 0; j < 4; j++)
7
+    {
8
+        if (!s_fpelMvCosts[qp][j])
9
+        {
10
+            ScopedLock s(s_costCalcLock);
11
+            if (!s_fpelMvCosts[qp][j])
12
+            {
13
+                s_fpelMvCosts[qp][j] = X265_MALLOC(uint16_t, BC_MAX_MV + 1) + (BC_MAX_MV >> 1);
14
+                if (!s_fpelMvCosts[qp][j])
15
+                {
16
+                    x265_log(NULL, X265_LOG_ERROR, "BitCost s_fpelMvCosts buffer allocation failure\n");
17
+                    return;
18
+                }
19
+                for (int i = -(BC_MAX_MV >> 1); i < (BC_MAX_MV >> 1); i++)
20
+                {
21
+                    s_fpelMvCosts[qp][j][i] = s_costs[qp][i * 4 + j];
22
+                }
23
+            }
24
+        }
25
+    }
26
     m_cost = s_costs[qp];
27
+    for (int j = 0; j < 4; j++)
28
+    {
29
+        m_fpelMvCosts[j] = s_fpelMvCosts[qp][j];
30
+    }
31
 }
32
-
33
 /***
34
  * Class static data and methods
35
  */
36
 
37
 uint16_t *BitCost::s_costs[BC_MAX_QP];
38
 
39
+uint16_t* BitCost::s_fpelMvCosts[BC_MAX_QP][4];
40
+
41
 float *BitCost::s_bitsizes;
42
 
43
 Lock BitCost::s_costCalcLock;
44
@@ -96,6 +120,17 @@
45
             s_costs[i] = NULL;
46
         }
47
     }
48
+    for (int i = 0; i < BC_MAX_QP; i++)
49
+    {
50
+        for (int j = 0; j < 4; j++)
51
+        {
52
+            if (s_fpelMvCosts[i][j])
53
+            {
54
+                X265_FREE(s_fpelMvCosts[i][j] - (BC_MAX_MV >> 1));
55
+                s_fpelMvCosts[i][j] = NULL;
56
+            }
57
+        }
58
+    }
59
 
60
     if (s_bitsizes)
61
     {
62
x265_2.1.tar.gz/source/encoder/bitcost.h -> x265_2.2.tar.gz/source/encoder/bitcost.h Changed
19
 
1
@@ -67,6 +67,8 @@
2
 
3
     uint16_t *m_cost;
4
 
5
+    uint16_t *m_fpelMvCosts[4];
6
+
7
     MV        m_mvp;
8
 
9
     BitCost& operator =(const BitCost&);
10
@@ -84,6 +86,8 @@
11
 
12
     static uint16_t *s_costs[BC_MAX_QP];
13
 
14
+    static uint16_t *s_fpelMvCosts[BC_MAX_QP][4];
15
+
16
     static Lock s_costCalcLock;
17
 
18
     static void CalculateLogs();
19
x265_2.1.tar.gz/source/encoder/dpb.cpp -> x265_2.2.tar.gz/source/encoder/dpb.cpp Changed
21
 
1
@@ -92,6 +92,19 @@
2
             m_freeList.pushBack(*curFrame);
3
             curFrame->m_encData->m_freeListNext = m_frameDataFreeList;
4
             m_frameDataFreeList = curFrame->m_encData;
5
+
6
+            if (curFrame->m_encData->m_meBuffer)
7
+            {
8
+                for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
9
+                {
10
+                    if (curFrame->m_encData->m_meBuffer[i] != NULL)
11
+                    {
12
+                        X265_FREE(curFrame->m_encData->m_meBuffer[i]);
13
+                        curFrame->m_encData->m_meBuffer[i] = NULL;
14
+                    }
15
+                }
16
+            }
17
+
18
             curFrame->m_encData = NULL;
19
             curFrame->m_reconPic = NULL;
20
         }
21
x265_2.1.tar.gz/source/encoder/encoder.cpp -> x265_2.2.tar.gz/source/encoder/encoder.cpp Changed
201
 
1
@@ -74,6 +74,10 @@
2
     m_threadPool = NULL;
3
     m_analysisFile = NULL;
4
     m_offsetEmergency = NULL;
5
+    m_iFrameNum = 0;
6
+    m_iPPSQpMinus26 = 0;
7
+    m_iLastSliceQp = 0;
8
+    m_rpsInSpsCount = 0;
9
     for (int i = 0; i < X265_MAX_FRAME_THREADS; i++)
10
         m_frameEncoder[i] = NULL;
11
 
12
@@ -145,12 +149,6 @@
13
         p->bEnableWavefront = p->bDistributeModeAnalysis = p->bDistributeMotionEstimation = p->lookaheadSlices = 0;
14
     }
15
 
16
-    if (!p->bEnableWavefront && p->rc.vbvBufferSize)
17
-    {
18
-        x265_log(p, X265_LOG_ERROR, "VBV requires wavefront parallelism\n");
19
-        m_aborted = true;
20
-    }
21
-
22
     x265_log(p, X265_LOG_INFO, "Slices                              : %d\n", p->maxSlices);
23
 
24
     char buf[128];
25
@@ -318,6 +316,8 @@
26
     if (!m_lookahead->create())
27
         m_aborted = true;
28
 
29
+    initRefIdx();
30
+
31
     if (m_param->analysisMode)
32
     {
33
         const char* name = m_param->analysisFileName;
34
@@ -869,6 +869,58 @@
35
                 slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * NUM_4x4_PARTITIONS);
36
             }
37
 
38
+            if (m_param->searchMethod == X265_SEA && frameEnc->m_lowres.sliceType != X265_TYPE_B)
39
+            {
40
+                int padX = g_maxCUSize + 32;
41
+                int padY = g_maxCUSize + 16;
42
+                uint32_t numCuInHeight = (frameEnc->m_encData->m_reconPic->m_picHeight + g_maxCUSize - 1) / g_maxCUSize;
43
+                int maxHeight = numCuInHeight * g_maxCUSize;
44
+                for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
45
+                {
46
+                    frameEnc->m_encData->m_meBuffer[i] = X265_MALLOC(uint32_t, frameEnc->m_reconPic->m_stride * (maxHeight + (2 * padY)));
47
+                    if (frameEnc->m_encData->m_meBuffer[i])
48
+                    {
49
+                        memset(frameEnc->m_encData->m_meBuffer[i], 0, sizeof(uint32_t)* frameEnc->m_reconPic->m_stride * (maxHeight + (2 * padY)));
50
+                        frameEnc->m_encData->m_meIntegral[i] = frameEnc->m_encData->m_meBuffer[i] + frameEnc->m_encData->m_reconPic->m_stride * padY + padX;
51
+                    }
52
+                    else
53
+                        x265_log(m_param, X265_LOG_ERROR, "SEA motion search: POC %d Integral buffer[%d] unallocated\n", frameEnc->m_poc, i);
54
+                }
55
+            }
56
+
57
+            if (m_param->bOptQpPPS && frameEnc->m_lowres.bKeyframe && m_param->bRepeatHeaders)
58
+            {
59
+                ScopedLock qpLock(m_sliceQpLock);
60
+                if (m_iFrameNum > 0)
61
+                {
62
+                    //Search the least cost
63
+                    int64_t iLeastCost = m_iBitsCostSum[0];
64
+                    int iLeastId = 0;
65
+                    for (int i = 1; i < QP_MAX_MAX + 1; i++)
66
+                    {
67
+                        if (iLeastCost > m_iBitsCostSum[i])
68
+                        {
69
+                            iLeastId = i;
70
+                            iLeastCost = m_iBitsCostSum[i];
71
+                        }
72
+                    }
73
+
74
+                    /* If last slice Qp is close to (26 + m_iPPSQpMinus26) or outputs is all I-frame video,
75
+                       we don't need to change m_iPPSQpMinus26. */
76
+                    if ((abs(m_iLastSliceQp - (26 + m_iPPSQpMinus26)) > 1) && (m_iFrameNum > 1))
77
+                        m_iPPSQpMinus26 = (iLeastId + 1) - 26;
78
+                    m_iFrameNum = 0;
79
+                }
80
+
81
+                for (int i = 0; i < QP_MAX_MAX + 1; i++)
82
+                    m_iBitsCostSum[i] = 0;
83
+            }
84
+
85
+            frameEnc->m_encData->m_slice->m_iPPSQpMinus26 = m_iPPSQpMinus26;
86
+            frameEnc->m_encData->m_slice->numRefIdxDefault[0] = m_pps.numRefIdxDefault[0];
87
+            frameEnc->m_encData->m_slice->numRefIdxDefault[1] = m_pps.numRefIdxDefault[1];
88
+            frameEnc->m_encData->m_slice->m_iNumRPSInSPS = m_sps.spsrpsNum;
89
+
90
             curEncoder->m_rce.encodeOrder = frameEnc->m_encodeOrder = m_encodedFrameNum++;
91
             if (m_bframeDelay)
92
             {
93
@@ -1031,6 +1083,13 @@
94
 
95
         x265_log(m_param, X265_LOG_INFO, "lossless compression ratio %.2f::1\n", uncompressed / m_analyzeAll.m_accBits);
96
     }
97
+    if (m_param->bMultiPassOptRPS && m_param->rc.bStatRead)
98
+    {
99
+        x265_log(m_param, X265_LOG_INFO, "RPS in SPS: %d frames (%.2f%%), RPS not in SPS: %d frames (%.2f%%)\n", 
100
+            m_rpsInSpsCount, (float)100.0 * m_rpsInSpsCount / m_rateControl->m_numEntries, 
101
+            m_rateControl->m_numEntries - m_rpsInSpsCount, 
102
+            (float)100.0 * (m_rateControl->m_numEntries - m_rpsInSpsCount) / m_rateControl->m_numEntries);
103
+    }
104
 
105
     if (m_analyzeAll.m_numPics)
106
     {
107
@@ -1353,6 +1412,7 @@
108
         frameStats->qp = curEncData.m_avgQpAq;
109
         frameStats->bits = bits;
110
         frameStats->bScenecut = curFrame->m_lowres.bScenecut;
111
+        frameStats->bufferFill = m_rateControl->m_bufferFillActual;
112
         frameStats->frameLatency = inPoc - poc;
113
         if (m_param->rc.rateControlMode == X265_RC_CRF)
114
             frameStats->rateFactor = curEncData.m_rateFactor;
115
@@ -1413,6 +1473,66 @@
116
 #pragma warning(disable: 4127) // conditional expression is constant
117
 #endif
118
 
119
+void Encoder::initRefIdx()
120
+{
121
+    int j = 0;
122
+
123
+    for (j = 0; j < MAX_NUM_REF_IDX; j++)
124
+    {
125
+        m_refIdxLastGOP.numRefIdxl0[j] = 0;
126
+        m_refIdxLastGOP.numRefIdxl1[j] = 0;
127
+    }
128
+
129
+    return;
130
+}
131
+
132
+void Encoder::analyseRefIdx(int *numRefIdx)
133
+{
134
+    int i_l0 = 0;
135
+    int i_l1 = 0;
136
+
137
+    i_l0 = numRefIdx[0];
138
+    i_l1 = numRefIdx[1];
139
+
140
+    if ((0 < i_l0) && (MAX_NUM_REF_IDX > i_l0))
141
+        m_refIdxLastGOP.numRefIdxl0[i_l0]++;
142
+    if ((0 < i_l1) && (MAX_NUM_REF_IDX > i_l1))
143
+        m_refIdxLastGOP.numRefIdxl1[i_l1]++;
144
+
145
+    return;
146
+}
147
+
148
+void Encoder::updateRefIdx()
149
+{
150
+    int i_max_l0 = 0;
151
+    int i_max_l1 = 0;
152
+    int j = 0;
153
+
154
+    i_max_l0 = 0;
155
+    i_max_l1 = 0;
156
+    m_refIdxLastGOP.numRefIdxDefault[0] = 1;
157
+    m_refIdxLastGOP.numRefIdxDefault[1] = 1;
158
+    for (j = 0; j < MAX_NUM_REF_IDX; j++)
159
+    {
160
+        if (i_max_l0 < m_refIdxLastGOP.numRefIdxl0[j])
161
+        {
162
+            i_max_l0 = m_refIdxLastGOP.numRefIdxl0[j];
163
+            m_refIdxLastGOP.numRefIdxDefault[0] = j;
164
+        }
165
+        if (i_max_l1 < m_refIdxLastGOP.numRefIdxl1[j])
166
+        {
167
+            i_max_l1 = m_refIdxLastGOP.numRefIdxl1[j];
168
+            m_refIdxLastGOP.numRefIdxDefault[1] = j;
169
+        }
170
+    }
171
+
172
+    m_pps.numRefIdxDefault[0] = m_refIdxLastGOP.numRefIdxDefault[0];
173
+    m_pps.numRefIdxDefault[1] = m_refIdxLastGOP.numRefIdxDefault[1];
174
+    initRefIdx();
175
+
176
+    return;
177
+}
178
+
179
 void Encoder::getStreamHeaders(NALList& list, Entropy& sbacCoder, Bitstream& bs)
180
 {
181
     sbacCoder.setBitstream(&bs);
182
@@ -1429,7 +1549,7 @@
183
     list.serialize(NAL_UNIT_SPS, bs);
184
 
185
     bs.resetBits();
186
-    sbacCoder.codePPS(m_pps, (m_param->maxSlices <= 1));
187
+    sbacCoder.codePPS( m_pps, (m_param->maxSlices <= 1), m_iPPSQpMinus26);
188
     bs.writeByteAlignment();
189
     list.serialize(NAL_UNIT_PPS, bs);
190
 
191
@@ -1458,9 +1578,9 @@
192
         list.serialize(NAL_UNIT_PREFIX_SEI, bs);
193
     }
194
 
195
-    if (!m_param->bDiscardSEI && m_param->bEmitInfoSEI)
196
+    if (m_param->bEmitInfoSEI)
197
     {
198
-        char *opts = x265_param2string(m_param);
199
+        char *opts = x265_param2string(m_param, m_sps.conformanceWindow.rightOffset, m_sps.conformanceWindow.bottomOffset);
200
         if (opts)
201
x265_2.1.tar.gz/source/encoder/encoder.h -> x265_2.2.tar.gz/source/encoder/encoder.h Changed
66
 
1
@@ -26,6 +26,7 @@
2
 
3
 #include "common.h"
4
 #include "slice.h"
5
+#include "threading.h"
6
 #include "scalinglist.h"
7
 #include "x265.h"
8
 #include "nal.h"
9
@@ -69,6 +70,24 @@
10
     void addSsim(double ssim);
11
 };
12
 
13
+#define MAX_NUM_REF_IDX 64
14
+
15
+struct RefIdxLastGOP
16
+{
17
+    int numRefIdxDefault[2];
18
+    int numRefIdxl0[MAX_NUM_REF_IDX];
19
+    int numRefIdxl1[MAX_NUM_REF_IDX];
20
+};
21
+
22
+struct RPSListNode
23
+{
24
+    int idx;
25
+    int count;
26
+    RPS* rps;
27
+    RPSListNode* next;
28
+    RPSListNode* prior;
29
+};
30
+
31
 class FrameEncoder;
32
 class DPB;
33
 class Lookahead;
34
@@ -136,6 +155,19 @@
35
      * one is done. Requires bIntraRefresh to be set.*/
36
     int                m_bQueuedIntraRefresh;
37
 
38
+    /* For optimising slice QP */
39
+    Lock               m_sliceQpLock;
40
+    int                m_iFrameNum;   
41
+    int                m_iPPSQpMinus26;
42
+    int                m_iLastSliceQp;
43
+    int64_t            m_iBitsCostSum[QP_MAX_MAX + 1];
44
+
45
+    Lock               m_sliceRefIdxLock;
46
+    RefIdxLastGOP      m_refIdxLastGOP;
47
+
48
+    Lock               m_rpsInSpsLock;
49
+    int                m_rpsInSpsCount;
50
+
51
     Encoder();
52
     ~Encoder() {}
53
 
54
@@ -173,6 +205,11 @@
55
 
56
     void calcRefreshInterval(Frame* frameEnc);
57
 
58
+    void initRefIdx();
59
+    void analyseRefIdx(int *numRefIdx);
60
+    void updateRefIdx();
61
+    bool computeSPSRPSIndex();
62
+
63
 protected:
64
 
65
     void initVPS(VPS *vps);
66
x265_2.1.tar.gz/source/encoder/entropy.cpp -> x265_2.2.tar.gz/source/encoder/entropy.cpp Changed
122
 
1
@@ -312,19 +312,21 @@
2
     WRITE_FLAG(sps.bUseSAO, "sample_adaptive_offset_enabled_flag");
3
 
4
     WRITE_FLAG(0, "pcm_enabled_flag");
5
-    WRITE_UVLC(0, "num_short_term_ref_pic_sets");
6
+    WRITE_UVLC(sps.spsrpsNum, "num_short_term_ref_pic_sets");
7
+    for (int i = 0; i < sps.spsrpsNum; i++)
8
+        codeShortTermRefPicSet(sps.spsrps[i], i);
9
     WRITE_FLAG(0, "long_term_ref_pics_present_flag");
10
 
11
     WRITE_FLAG(sps.bTemporalMVPEnabled, "sps_temporal_mvp_enable_flag");
12
     WRITE_FLAG(sps.bUseStrongIntraSmoothing, "sps_strong_intra_smoothing_enable_flag");
13
 
14
     WRITE_FLAG(1, "vui_parameters_present_flag");
15
-    codeVUI(sps.vuiParameters, sps.maxTempSubLayers, sps.bDiscardOptionalVUI);
16
+    codeVUI(sps.vuiParameters, sps.maxTempSubLayers, sps.bEmitVUITimingInfo, sps.bEmitVUIHRDInfo);
17
 
18
     WRITE_FLAG(0, "sps_extension_flag");
19
 }
20
 
21
-void Entropy::codePPS(const PPS& pps, bool filerAcross)
22
+void Entropy::codePPS( const PPS& pps, bool filerAcross, int iPPSInitQpMinus26 )
23
 {
24
     WRITE_UVLC(0,                          "pps_pic_parameter_set_id");
25
     WRITE_UVLC(0,                          "pps_seq_parameter_set_id");
26
@@ -333,10 +335,10 @@
27
     WRITE_CODE(0, 3,                       "num_extra_slice_header_bits");
28
     WRITE_FLAG(pps.bSignHideEnabled,       "sign_data_hiding_flag");
29
     WRITE_FLAG(0,                          "cabac_init_present_flag");
30
-    WRITE_UVLC(0,                          "num_ref_idx_l0_default_active_minus1");
31
-    WRITE_UVLC(0,                          "num_ref_idx_l1_default_active_minus1");
32
+    WRITE_UVLC(pps.numRefIdxDefault[0] - 1, "num_ref_idx_l0_default_active_minus1");
33
+    WRITE_UVLC(pps.numRefIdxDefault[1] - 1, "num_ref_idx_l1_default_active_minus1");
34
 
35
-    WRITE_SVLC(0, "init_qp_minus26");
36
+    WRITE_SVLC(iPPSInitQpMinus26,         "init_qp_minus26");
37
     WRITE_FLAG(pps.bConstrainedIntraPred, "constrained_intra_pred_flag");
38
     WRITE_FLAG(pps.bTransformSkipEnabled, "transform_skip_enabled_flag");
39
 
40
@@ -422,7 +424,7 @@
41
     }
42
 }
43
 
44
-void Entropy::codeVUI(const VUI& vui, int maxSubTLayers, bool bDiscardOptionalVUI)
45
+void Entropy::codeVUI(const VUI& vui, int maxSubTLayers, bool bEmitVUITimingInfo, bool bEmitVUIHRDInfo)
46
 {
47
     WRITE_FLAG(vui.aspectRatioInfoPresentFlag, "aspect_ratio_info_present_flag");
48
     if (vui.aspectRatioInfoPresentFlag)
49
@@ -473,7 +475,7 @@
50
         WRITE_UVLC(vui.defaultDisplayWindow.bottomOffset, "def_disp_win_bottom_offset");
51
     }
52
 
53
-    if (bDiscardOptionalVUI)
54
+    if (!bEmitVUITimingInfo)
55
         WRITE_FLAG(0, "vui_timing_info_present_flag");
56
     else
57
     {
58
@@ -483,7 +485,7 @@
59
         WRITE_FLAG(0, "vui_poc_proportional_to_timing_flag");
60
     }
61
 
62
-    if (bDiscardOptionalVUI)
63
+    if (!bEmitVUIHRDInfo)
64
         WRITE_FLAG(0, "vui_hrd_parameters_present_flag");
65
     else
66
     {
67
@@ -614,8 +616,21 @@
68
             }
69
 #endif
70
 
71
-        WRITE_FLAG(0, "short_term_ref_pic_set_sps_flag");
72
-        codeShortTermRefPicSet(slice.m_rps);
73
+        if (slice.m_rpsIdx < 0)
74
+        {
75
+            WRITE_FLAG(0, "short_term_ref_pic_set_sps_flag");
76
+            codeShortTermRefPicSet(slice.m_rps, slice.m_sps->spsrpsNum);
77
+        }
78
+        else
79
+        {
80
+            WRITE_FLAG(1, "short_term_ref_pic_set_sps_flag");
81
+            int numBits = 0;
82
+            while ((1 << numBits) < slice.m_iNumRPSInSPS)
83
+                numBits++;
84
+
85
+            if (numBits > 0)
86
+                WRITE_CODE(slice.m_rpsIdx, numBits, "short_term_ref_pic_set_idx");
87
+        }
88
 
89
         if (slice.m_sps->bTemporalMVPEnabled)
90
             WRITE_FLAG(1, "slice_temporal_mvp_enable_flag");
91
@@ -633,7 +648,7 @@
92
 
93
     if (!slice.isIntra())
94
     {
95
-        bool overrideFlag = (slice.m_numRefIdx[0] != 1 || (slice.isInterB() && slice.m_numRefIdx[1] != 1));
96
+        bool overrideFlag = (slice.m_numRefIdx[0] != slice.numRefIdxDefault[0] || (slice.isInterB() && slice.m_numRefIdx[1] != slice.numRefIdxDefault[1]));
97
         WRITE_FLAG(overrideFlag, "num_ref_idx_active_override_flag");
98
         if (overrideFlag)
99
         {
100
@@ -673,7 +688,7 @@
101
     if (!slice.isIntra())
102
         WRITE_UVLC(MRG_MAX_NUM_CANDS - slice.m_maxNumMergeCand, "five_minus_max_num_merge_cand");
103
 
104
-    int code = sliceQp - 26;
105
+    int code = sliceQp - (slice.m_iPPSQpMinus26 + 26);
106
     WRITE_SVLC(code, "slice_qp_delta");
107
 
108
     // TODO: Enable when pps_loop_filter_across_slices_enabled_flag==1
109
@@ -707,8 +722,11 @@
110
         WRITE_CODE(substreamSizes[i] - 1, offsetLen, "entry_point_offset_minus1");
111
 }
112
 
113
-void Entropy::codeShortTermRefPicSet(const RPS& rps)
114
+void Entropy::codeShortTermRefPicSet(const RPS& rps, int idx)
115
 {
116
+    if (idx > 0)
117
+        WRITE_FLAG(0, "inter_ref_pic_set_prediction_flag");
118
+
119
     WRITE_UVLC(rps.numberOfNegativePictures, "num_negative_pics");
120
     WRITE_UVLC(rps.numberOfPositivePictures, "num_positive_pics");
121
     int prev = 0;
122
x265_2.1.tar.gz/source/encoder/entropy.h -> x265_2.2.tar.gz/source/encoder/entropy.h Changed
19
 
1
@@ -142,14 +142,14 @@
2
 
3
     void codeVPS(const VPS& vps);
4
     void codeSPS(const SPS& sps, const ScalingList& scalingList, const ProfileTierLevel& ptl);
5
-    void codePPS(const PPS& pps, bool filerAcross);
6
-    void codeVUI(const VUI& vui, int maxSubTLayers, bool discardOptionalVUI);
7
+    void codePPS( const PPS& pps, bool filerAcross, int iPPSInitQpMinus26 );
8
+    void codeVUI(const VUI& vui, int maxSubTLayers, bool bEmitVUITimingInfo, bool bEmitVUIHRDInfo);
9
     void codeAUD(const Slice& slice);
10
     void codeHrdParameters(const HRDInfo& hrd, int maxSubTLayers);
11
 
12
     void codeSliceHeader(const Slice& slice, FrameData& encData, uint32_t slice_addr, uint32_t slice_addr_bits, int sliceQp);
13
     void codeSliceHeaderWPPEntryPoints(const uint32_t *substreamSizes, uint32_t numSubStreams, uint32_t maxOffset);
14
-    void codeShortTermRefPicSet(const RPS& rps);
15
+    void codeShortTermRefPicSet(const RPS& rps, int idx);
16
     void finishSlice()                 { encodeBinTrm(1); finish(); dynamic_cast<Bitstream*>(m_bitIf)->writeByteAlignment(); }
17
 
18
     void encodeCTU(const CUData& cu, const CUGeom& cuGeom);
19
x265_2.1.tar.gz/source/encoder/frameencoder.cpp -> x265_2.2.tar.gz/source/encoder/frameencoder.cpp Changed
201
 
1
@@ -50,6 +50,7 @@
2
     m_bAllRowsStop = false;
3
     m_vbvResetTriggerRow = -1;
4
     m_outStreams = NULL;
5
+    m_backupStreams = NULL;
6
     m_substreamSizes = NULL;
7
     m_nr = NULL;
8
     m_tld = NULL;
9
@@ -85,6 +86,7 @@
10
 
11
     delete[] m_rows;
12
     delete[] m_outStreams;
13
+    delete[] m_backupStreams;
14
     X265_FREE(m_sliceBaseRow);
15
     X265_FREE(m_cuGeoms);
16
     X265_FREE(m_ctuGeomMap);
17
@@ -121,7 +123,7 @@
18
     int range  = m_param->searchRange;       /* fpel search */
19
     range += !!(m_param->searchMethod < 2);  /* diamond/hex range check lag */
20
     range += NTAPS_LUMA / 2;                 /* subpel filter half-length */
21
-    range += 2 + MotionEstimate::hpelIterationCount(m_param->subpelRefine) / 2; /* subpel refine steps */
22
+    range += 2 + (MotionEstimate::hpelIterationCount(m_param->subpelRefine) + 1) / 2; /* subpel refine steps */
23
     m_refLagRows = /*(m_param->maxSlices > 1 ? 1 : 0) +*/ 1 + ((range + g_maxCUSize - 1) / g_maxCUSize);
24
 
25
     // NOTE: 2 times of numRows because both Encoder and Filter in same queue
26
@@ -152,7 +154,7 @@
27
     // 7.4.7.1 - Ceil( Log2( PicSizeInCtbsY ) ) bits
28
     {
29
         unsigned long tmp;
30
-        CLZ(tmp, (numRows * numCols));
31
+        CLZ(tmp, (numRows * numCols - 1));
32
         m_sliceAddrBits = (uint16_t)(tmp + 1);
33
     }
34
 
35
@@ -305,6 +307,19 @@
36
     weightAnalyse(*frame->m_encData->m_slice, *frame, *master.m_param);
37
 }
38
 
39
+
40
+uint32_t getBsLength( int32_t code )
41
+{
42
+    uint32_t ucode = (code <= 0) ? -code << 1 : (code << 1) - 1;
43
+
44
+    ++ucode;
45
+    unsigned long idx;
46
+    CLZ( idx, ucode );
47
+    uint32_t length = (uint32_t)idx * 2 + 1;
48
+
49
+    return length;
50
+}
51
+
52
 void FrameEncoder::compressFrame()
53
 {
54
     ProfileScopeEvent(frameThread);
55
@@ -340,7 +355,28 @@
56
         m_nalList.serialize(NAL_UNIT_ACCESS_UNIT_DELIMITER, m_bs);
57
     }
58
     if (m_frame->m_lowres.bKeyframe && m_param->bRepeatHeaders)
59
-        m_top->getStreamHeaders(m_nalList, m_entropyCoder, m_bs);
60
+    {
61
+        if (m_param->bOptRefListLengthPPS)
62
+        {
63
+            ScopedLock refIdxLock(m_top->m_sliceRefIdxLock);
64
+            m_top->updateRefIdx();
65
+        }
66
+        if (m_top->m_param->rc.bStatRead  && m_top->m_param->bMultiPassOptRPS)
67
+        {
68
+            ScopedLock refIdxLock(m_top->m_rpsInSpsLock);
69
+            if (!m_top->computeSPSRPSIndex())
70
+            {
71
+                x265_log(m_param, X265_LOG_ERROR, "compute commonly RPS failed!\n");
72
+                m_top->m_aborted = true;
73
+            }
74
+            m_top->getStreamHeaders(m_nalList, m_entropyCoder, m_bs);
75
+        }
76
+        else
77
+            m_top->getStreamHeaders(m_nalList, m_entropyCoder, m_bs);
78
+    }
79
+
80
+    if (m_top->m_param->rc.bStatRead && m_top->m_param->bMultiPassOptRPS)
81
+        m_frame->m_encData->m_slice->m_rpsIdx = (m_top->m_rateControl->m_rce2Pass + m_frame->m_encodeOrder)->rpsIdx;
82
 
83
     // Weighted Prediction parameters estimation.
84
     bool bUseWeightP = slice->m_sliceType == P_SLICE && slice->m_pps->bUseWeightPred;
85
@@ -448,6 +484,19 @@
86
     /* Clip slice QP to 0-51 spec range before encoding */
87
     slice->m_sliceQp = x265_clip3(-QP_BD_OFFSET, QP_MAX_SPEC, qp);
88
 
89
+    if (m_param->bOptQpPPS && m_param->bRepeatHeaders)
90
+    {
91
+        ScopedLock qpLock(m_top->m_sliceQpLock);
92
+        for (int i = 0; i < (QP_MAX_MAX + 1); i++)
93
+        {
94
+            int delta = slice->m_sliceQp - (i + 1);
95
+            int codeLength = getBsLength( delta );
96
+            m_top->m_iBitsCostSum[i] += codeLength;
97
+        }
98
+        m_top->m_iFrameNum++;
99
+        m_top->m_iLastSliceQp = slice->m_sliceQp;
100
+    }
101
+
102
     m_initSliceContext.resetEntropy(*slice);
103
 
104
     m_frameFilter.start(m_frame, m_initSliceContext);
105
@@ -485,6 +534,8 @@
106
     if (!m_outStreams)
107
     {
108
         m_outStreams = new Bitstream[numSubstreams];
109
+        if (!m_param->bEnableWavefront)
110
+            m_backupStreams = new Bitstream[numSubstreams];
111
         m_substreamSizes = X265_MALLOC(uint32_t, numSubstreams);
112
         if (!m_param->bEnableSAO)
113
             for (uint32_t i = 0; i < numSubstreams; i++)
114
@@ -498,7 +549,7 @@
115
 
116
     if (m_frame->m_lowres.bKeyframe)
117
     {
118
-        if (!m_param->bDiscardSEI && m_param->bEmitHRDSEI)
119
+        if (m_param->bEmitHRDSEI)
120
         {
121
             SEIBufferingPeriod* bpSei = &m_top->m_rateControl->m_bufPeriodSEI;
122
 
123
@@ -520,7 +571,7 @@
124
         }
125
     }
126
 
127
-    if (!m_param->bDiscardSEI && (m_param->bEmitHRDSEI || !!m_param->interlaceMode))
128
+    if ((m_param->bEmitHRDSEI || !!m_param->interlaceMode))
129
     {
130
         SEIPictureTiming *sei = m_rce.picTimingSEI;
131
         const VUI *vui = &slice->m_sps->vuiParameters;
132
@@ -556,22 +607,19 @@
133
     }
134
 
135
     /* Write user SEI */
136
-    if (!m_param->bDiscardSEI)
137
+    for (int i = 0; i < m_frame->m_userSEI.numPayloads; i++)
138
     {
139
-        for (int i = 0; i < m_frame->m_userSEI.numPayloads; i++)
140
-        {
141
-            x265_sei_payload *payload = &m_frame->m_userSEI.payloads[i];
142
-            SEIuserDataUnregistered sei;
143
+        x265_sei_payload *payload = &m_frame->m_userSEI.payloads[i];
144
+        SEIuserDataUnregistered sei;
145
 
146
-            sei.m_payloadType = payload->payloadType;
147
-            sei.m_userDataLength = payload->payloadSize;
148
-            sei.m_userData = payload->payload;
149
+        sei.m_payloadType = payload->payloadType;
150
+        sei.m_userDataLength = payload->payloadSize;
151
+        sei.m_userData = payload->payload;
152
 
153
-            m_bs.resetBits();
154
-            sei.write(m_bs, *slice->m_sps);
155
-            m_bs.writeByteAlignment();
156
-            m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
157
-        }
158
+        m_bs.resetBits();
159
+        sei.write(m_bs, *slice->m_sps);
160
+        m_bs.writeByteAlignment();
161
+        m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs);
162
     }
163
 
164
     /* CQP and CRF (without capped VBV) doesn't use mid-frame statistics to 
165
@@ -606,8 +654,7 @@
166
                 const uint32_t sliceEndRow = m_sliceBaseRow[sliceId + 1] - 1;
167
                 const uint32_t row = sliceStartRow + rowInSlice;
168
 
169
-                if (row >= m_numRows)
170
-                    break;
171
+                X265_CHECK(row < m_numRows, "slices row fault was detected");
172
 
173
                 if (row > sliceEndRow)
174
                     continue;
175
@@ -626,7 +673,7 @@
176
                             refpic->m_reconRowFlag[rowIdx].waitForChange(0);
177
 
178
                         if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted)
179
-                            m_mref[l][ref].applyWeight(row + m_refLagRows, m_numRows, sliceEndRow + 1, sliceId);
180
+                            m_mref[l][ref].applyWeight(rowIdx, m_numRows, sliceEndRow, sliceId);
181
                     }
182
                 }
183
 
184
@@ -666,7 +713,7 @@
185
                             refpic->m_reconRowFlag[rowIdx].waitForChange(0);
186
 
187
                         if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted)
188
-                            m_mref[list][ref].applyWeight(i + m_refLagRows, m_numRows, m_numRows, 0);
189
+                            m_mref[list][ref].applyWeight(rowIdx, m_numRows, m_numRows, 0);
190
                     }
191
                 }
192
 
193
@@ -830,6 +877,11 @@
194
             const uint32_t sliceAddr = nextSliceRow * m_numCols;
195
             //CUData* ctu = m_frame->m_encData->getPicCTU(sliceAddr);
196
             //const int sliceQp = ctu->m_qp[0];
197
+            if (m_param->bOptRefListLengthPPS)
198
+            {
199
+                ScopedLock refIdxLock(m_top->m_sliceRefIdxLock);
200
+                m_top->analyseRefIdx(slice->m_numRefIdx);
201
x265_2.1.tar.gz/source/encoder/frameencoder.h -> x265_2.2.tar.gz/source/encoder/frameencoder.h Changed
9
 
1
@@ -184,6 +184,7 @@
2
     NoiseReduction*          m_nr;
3
     ThreadLocalData*         m_tld; /* for --no-wpp */
4
     Bitstream*               m_outStreams;
5
+    Bitstream*               m_backupStreams;
6
     uint32_t*                m_substreamSizes;
7
 
8
     CUGeom*                  m_cuGeoms;
9
x265_2.1.tar.gz/source/encoder/framefilter.cpp -> x265_2.2.tar.gz/source/encoder/framefilter.cpp Changed
201
 
1
@@ -35,6 +35,109 @@
2
 static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
3
 static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
4
 
5
+static void integral_init4h(uint32_t *sum, pixel *pix, intptr_t stride)
6
+{
7
+    int32_t v = pix[0] + pix[1] + pix[2] + pix[3];
8
+    for (int16_t x = 0; x < stride - 4; x++)
9
+    {
10
+        sum[x] = v + sum[x - stride];
11
+        v += pix[x + 4] - pix[x];
12
+    }
13
+}
14
+
15
+static void integral_init8h(uint32_t *sum, pixel *pix, intptr_t stride)
16
+{
17
+    int32_t v = pix[0] + pix[1] + pix[2] + pix[3] + pix[4] + pix[5] + pix[6] + pix[7];
18
+    for (int16_t x = 0; x < stride - 8; x++)
19
+    {
20
+        sum[x] = v + sum[x - stride];
21
+        v += pix[x + 8] - pix[x];
22
+    }
23
+}
24
+
25
+static void integral_init12h(uint32_t *sum, pixel *pix, intptr_t stride)
26
+{
27
+    int32_t v = pix[0] + pix[1] + pix[2] + pix[3] + pix[4] + pix[5] + pix[6] + pix[7] +
28
+        pix[8] + pix[9] + pix[10] + pix[11];
29
+    for (int16_t x = 0; x < stride - 12; x++)
30
+    {
31
+        sum[x] = v + sum[x - stride];
32
+        v += pix[x + 12] - pix[x];
33
+    }
34
+}
35
+
36
+static void integral_init16h(uint32_t *sum, pixel *pix, intptr_t stride)
37
+{
38
+    int32_t v = pix[0] + pix[1] + pix[2] + pix[3] + pix[4] + pix[5] + pix[6] + pix[7] +
39
+        pix[8] + pix[9] + pix[10] + pix[11] + pix[12] + pix[13] + pix[14] + pix[15];
40
+    for (int16_t x = 0; x < stride - 16; x++)
41
+    {
42
+        sum[x] = v + sum[x - stride];
43
+        v += pix[x + 16] - pix[x];
44
+    }
45
+}
46
+
47
+static void integral_init24h(uint32_t *sum, pixel *pix, intptr_t stride)
48
+{
49
+    int32_t v = pix[0] + pix[1] + pix[2] + pix[3] + pix[4] + pix[5] + pix[6] + pix[7] +
50
+        pix[8] + pix[9] + pix[10] + pix[11] + pix[12] + pix[13] + pix[14] + pix[15] +
51
+        pix[16] + pix[17] + pix[18] + pix[19] + pix[20] + pix[21] + pix[22] + pix[23];
52
+    for (int16_t x = 0; x < stride - 24; x++)
53
+    {
54
+        sum[x] = v + sum[x - stride];
55
+        v += pix[x + 24] - pix[x];
56
+    }
57
+}
58
+
59
+static void integral_init32h(uint32_t *sum, pixel *pix, intptr_t stride)
60
+{
61
+    int32_t v = pix[0] + pix[1] + pix[2] + pix[3] + pix[4] + pix[5] + pix[6] + pix[7] +
62
+        pix[8] + pix[9] + pix[10] + pix[11] + pix[12] + pix[13] + pix[14] + pix[15] +
63
+        pix[16] + pix[17] + pix[18] + pix[19] + pix[20] + pix[21] + pix[22] + pix[23] +
64
+        pix[24] + pix[25] + pix[26] + pix[27] + pix[28] + pix[29] + pix[30] + pix[31];
65
+    for (int16_t x = 0; x < stride - 32; x++)
66
+    {
67
+        sum[x] = v + sum[x - stride];
68
+        v += pix[x + 32] - pix[x];
69
+    }
70
+}
71
+
72
+static void integral_init4v(uint32_t *sum4, intptr_t stride)
73
+{
74
+    for (int x = 0; x < stride; x++)
75
+        sum4[x] = sum4[x + 4 * stride] - sum4[x];
76
+}
77
+
78
+static void integral_init8v(uint32_t *sum8, intptr_t stride)
79
+{
80
+    for (int x = 0; x < stride; x++)
81
+        sum8[x] = sum8[x + 8 * stride] - sum8[x];
82
+}
83
+
84
+static void integral_init12v(uint32_t *sum12, intptr_t stride)
85
+{
86
+    for (int x = 0; x < stride; x++)
87
+        sum12[x] = sum12[x + 12 * stride] - sum12[x];
88
+}
89
+
90
+static void integral_init16v(uint32_t *sum16, intptr_t stride)
91
+{
92
+    for (int x = 0; x < stride; x++)
93
+        sum16[x] = sum16[x + 16 * stride] - sum16[x];
94
+}
95
+
96
+static void integral_init24v(uint32_t *sum24, intptr_t stride)
97
+{
98
+    for (int x = 0; x < stride; x++)
99
+        sum24[x] = sum24[x + 24 * stride] - sum24[x];
100
+}
101
+
102
+static void integral_init32v(uint32_t *sum32, intptr_t stride)
103
+{
104
+    for (int x = 0; x < stride; x++)
105
+        sum32[x] = sum32[x + 32 * stride] - sum32[x];
106
+}
107
+
108
 void FrameFilter::destroy()
109
 {
110
     X265_FREE(m_ssimBuf);
111
@@ -65,6 +168,7 @@
112
     m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0;
113
     m_lastHeight = (m_param->sourceHeight % g_maxCUSize) ? (m_param->sourceHeight % g_maxCUSize) : g_maxCUSize;
114
     m_lastWidth = (m_param->sourceWidth % g_maxCUSize) ? (m_param->sourceWidth % g_maxCUSize) : g_maxCUSize;
115
+    integralCompleted.set(0);
116
 
117
     if (m_param->bEnableSsim)
118
         m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
119
@@ -499,14 +603,19 @@
120
     if (!ctu->m_bFirstRowInSlice)
121
         processPostRow(row - 1);
122
 
123
-    if (ctu->m_bLastRowInSlice)
124
-        processPostRow(row);
125
-
126
     // NOTE: slices parallelism will be execute out-of-order
127
-    int numRowFinished;
128
-    for(numRowFinished = 0; numRowFinished < m_numRows; numRowFinished++)
129
-        if (!m_frame->m_reconRowFlag[numRowFinished].get())
130
-            break;
131
+    int numRowFinished = 0;
132
+    if (m_frame->m_reconRowFlag)
133
+    {
134
+        for (numRowFinished = 0; numRowFinished < m_numRows; numRowFinished++)
135
+        {
136
+            if (!m_frame->m_reconRowFlag[numRowFinished].get())
137
+                break;
138
+
139
+            if (numRowFinished == row)
140
+                continue;
141
+        }
142
+    }
143
 
144
     if (numRowFinished == m_numRows)
145
     {
146
@@ -522,6 +631,9 @@
147
             m_parallelFilter[0].m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame);
148
         }
149
     }
150
+
151
+    if (ctu->m_bLastRowInSlice)
152
+        processPostRow(row);
153
 }
154
 
155
 void FrameFilter::processPostRow(int row)
156
@@ -656,6 +768,107 @@
157
         }
158
     } // end of (m_param->maxSlices == 1)
159
 
160
+    int lastRow = row == (int)m_frame->m_encData->m_slice->m_sps->numCuInHeight - 1;
161
+
162
+    /* generate integral planes for SEA motion search */
163
+    if (m_param->searchMethod == X265_SEA && m_frame->m_encData->m_meIntegral && m_frame->m_lowres.sliceType != X265_TYPE_B)
164
+    {
165
+        /* If WPP, other than first row, integral calculation for current row needs to wait till the
166
+        * integral for the previous row is computed */
167
+        if (m_param->bEnableWavefront && row)
168
+        {
169
+            while (m_parallelFilter[row - 1].m_frameFilter->integralCompleted.get() == 0)
170
+            {
171
+                m_parallelFilter[row - 1].m_frameFilter->integralCompleted.waitForChange(0);
172
+            }
173
+        }
174
+
175
+        int stride = (int)m_frame->m_reconPic->m_stride;
176
+        int padX = g_maxCUSize + 32;
177
+        int padY = g_maxCUSize + 16;
178
+        int numCuInHeight = m_frame->m_encData->m_slice->m_sps->numCuInHeight;
179
+        int maxHeight = numCuInHeight * g_maxCUSize;
180
+        int startRow = 0;
181
+
182
+        if (m_param->interlaceMode)
183
+            startRow = (row * g_maxCUSize >> 1);
184
+        else
185
+            startRow = row * g_maxCUSize;
186
+
187
+        int height = lastRow ? (maxHeight + g_maxCUSize * m_param->interlaceMode) : (((row + m_param->interlaceMode) * g_maxCUSize) + g_maxCUSize);
188
+
189
+        if (!row)
190
+        {
191
+            for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
192
+                memset(m_frame->m_encData->m_meIntegral[i] - padY * stride - padX, 0, stride * sizeof(uint32_t));
193
+            startRow = -padY;
194
+        }
195
+
196
+        if (lastRow)
197
+            height += padY - 1;
198
+
199
+        for (int y = startRow; y < height; y++)
200
+        {
201
x265_2.1.tar.gz/source/encoder/framefilter.h -> x265_2.2.tar.gz/source/encoder/framefilter.h Changed
10
 
1
@@ -57,6 +57,8 @@
2
     int           m_lastHeight;
3
     int           m_lastWidth;
4
     
5
+    ThreadSafeInteger integralCompleted;     /* check if integral calculation is completed in this row */
6
+
7
     void*         m_ssimBuf;        /* Temp storage for ssim computation */
8
 
9
 #define MAX_PFILTER_CUS     (4) /* maximum CUs for every thread */
10
x265_2.1.tar.gz/source/encoder/motion.cpp -> x265_2.2.tar.gz/source/encoder/motion.cpp Changed
201
 
1
@@ -109,6 +109,8 @@
2
     blockOffset = 0;
3
     bChromaSATD = false;
4
     chromaSatd = NULL;
5
+    for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
6
+        integral[i] = NULL;
7
 }
8
 
9
 void MotionEstimate::init(int csp)
10
@@ -165,10 +167,12 @@
11
     partEnum = partitionFromSizes(pwidth, pheight);
12
     X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
13
     sad = primitives.pu[partEnum].sad;
14
+    ads = primitives.pu[partEnum].ads;
15
     satd = primitives.pu[partEnum].satd;
16
     sad_x3 = primitives.pu[partEnum].sad_x3;
17
     sad_x4 = primitives.pu[partEnum].sad_x4;
18
 
19
+
20
     blockwidth = pwidth;
21
     blockOffset = offset;
22
     absPartIdx = ctuAddr = -1;
23
@@ -188,6 +192,7 @@
24
     partEnum = partitionFromSizes(pwidth, pheight);
25
     X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
26
     sad = primitives.pu[partEnum].sad;
27
+    ads = primitives.pu[partEnum].ads;
28
     satd = primitives.pu[partEnum].satd;
29
     sad_x3 = primitives.pu[partEnum].sad_x3;
30
     sad_x4 = primitives.pu[partEnum].sad_x4;
31
@@ -278,12 +283,31 @@
32
         costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \
33
         costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
34
         costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
35
-        COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
36
-        COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
37
-        COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
38
-        COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \
39
+        if ((omv.y + m0y >= mvmin.y) & (omv.y + m0y <= mvmax.y)) \
40
+            COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
41
+        if ((omv.y + m1y >= mvmin.y) & (omv.y + m1y <= mvmax.y)) \
42
+            COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
43
+        if ((omv.y + m2y >= mvmin.y) & (omv.y + m2y <= mvmax.y)) \
44
+            COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
45
+        if ((omv.y + m3y >= mvmin.y) & (omv.y + m3y <= mvmax.y)) \
46
+            COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \
47
     }
48
 
49
+#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
50
+{\
51
+    sad_x3(fenc, \
52
+    fref + (m0x) + (m0y) * stride, \
53
+    fref + (m1x) + (m1y) * stride, \
54
+    fref + (m2x) + (m2y) * stride, \
55
+    stride, costs); \
56
+    costs[0] += p_cost_mvx[(m0x) << 2]; /* no cost_mvy */\
57
+    costs[1] += p_cost_mvx[(m1x) << 2]; \
58
+    costs[2] += p_cost_mvx[(m2x) << 2]; \
59
+    COPY3_IF_LT(bcost, costs[0], bmv.x, m0x, bmv.y, m0y); \
60
+    COPY3_IF_LT(bcost, costs[1], bmv.x, m1x, bmv.y, m1y); \
61
+    COPY3_IF_LT(bcost, costs[2], bmv.x, m2x, bmv.y, m2y); \
62
+}
63
+
64
 #define COST_MV_X4_DIR(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs) \
65
     { \
66
         pixel *pix_base = fref + bmv.x + bmv.y * stride; \
67
@@ -627,6 +651,7 @@
68
         {
69
             bcost = cost;
70
             bmv = 0;
71
+            bmv.y = X265_MAX(X265_MIN(0, mvmax.y), mvmin.y);
72
         }
73
     }
74
 
75
@@ -659,8 +684,10 @@
76
         do
77
         {
78
             COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
79
-            COPY1_IF_LT(bcost, (costs[0] << 4) + 1);
80
-            COPY1_IF_LT(bcost, (costs[1] << 4) + 3);
81
+            if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
82
+                COPY1_IF_LT(bcost, (costs[0] << 4) + 1);
83
+            if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
84
+                COPY1_IF_LT(bcost, (costs[1] << 4) + 3);
85
             COPY1_IF_LT(bcost, (costs[2] << 4) + 4);
86
             COPY1_IF_LT(bcost, (costs[3] << 4) + 12);
87
             if (!(bcost & 15))
88
@@ -698,36 +725,57 @@
89
       /* equivalent to the above, but eliminates duplicate candidates */
90
         COST_MV_X3_DIR(-2, 0, -1, 2,  1, 2, costs);
91
         bcost <<= 3;
92
-        COPY1_IF_LT(bcost, (costs[0] << 3) + 2);
93
-        COPY1_IF_LT(bcost, (costs[1] << 3) + 3);
94
-        COPY1_IF_LT(bcost, (costs[2] << 3) + 4);
95
+        if ((bmv.y >= mvmin.y) & (bmv.y <= mvmax.y))
96
+            COPY1_IF_LT(bcost, (costs[0] << 3) + 2);
97
+        if ((bmv.y + 2 >= mvmin.y) & (bmv.y + 2 <= mvmax.y))
98
+        {
99
+            COPY1_IF_LT(bcost, (costs[1] << 3) + 3);
100
+            COPY1_IF_LT(bcost, (costs[2] << 3) + 4);
101
+        }
102
+
103
         COST_MV_X3_DIR(2, 0,  1, -2, -1, -2, costs);
104
-        COPY1_IF_LT(bcost, (costs[0] << 3) + 5);
105
-        COPY1_IF_LT(bcost, (costs[1] << 3) + 6);
106
-        COPY1_IF_LT(bcost, (costs[2] << 3) + 7);
107
+        if ((bmv.y >= mvmin.y) & (bmv.y <= mvmax.y))
108
+            COPY1_IF_LT(bcost, (costs[0] << 3) + 5);
109
+        if ((bmv.y - 2 >= mvmin.y) & (bmv.y - 2 <= mvmax.y))
110
+        {
111
+            COPY1_IF_LT(bcost, (costs[1] << 3) + 6);
112
+            COPY1_IF_LT(bcost, (costs[2] << 3) + 7);
113
+        }
114
 
115
         if (bcost & 7)
116
         {
117
             int dir = (bcost & 7) - 2;
118
-            bmv += hex2[dir + 1];
119
 
120
-            /* half hexagon, not overlapping the previous iteration */
121
-            for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--)
122
+            if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
123
             {
124
-                COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y,
125
-                               hex2[dir + 1].x, hex2[dir + 1].y,
126
-                               hex2[dir + 2].x, hex2[dir + 2].y,
127
-                               costs);
128
-                bcost &= ~7;
129
-                COPY1_IF_LT(bcost, (costs[0] << 3) + 1);
130
-                COPY1_IF_LT(bcost, (costs[1] << 3) + 2);
131
-                COPY1_IF_LT(bcost, (costs[2] << 3) + 3);
132
-                if (!(bcost & 7))
133
-                    break;
134
-                dir += (bcost & 7) - 2;
135
-                dir = mod6m1[dir + 1];
136
                 bmv += hex2[dir + 1];
137
-            }
138
+
139
+                /* half hexagon, not overlapping the previous iteration */
140
+                for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--)
141
+                {
142
+                    COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y,
143
+                        hex2[dir + 1].x, hex2[dir + 1].y,
144
+                        hex2[dir + 2].x, hex2[dir + 2].y,
145
+                        costs);
146
+                    bcost &= ~7;
147
+
148
+                    if ((bmv.y + hex2[dir + 0].y >= mvmin.y) & (bmv.y + hex2[dir + 0].y <= mvmax.y))
149
+                        COPY1_IF_LT(bcost, (costs[0] << 3) + 1);
150
+
151
+                    if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
152
+                        COPY1_IF_LT(bcost, (costs[1] << 3) + 2);
153
+
154
+                    if ((bmv.y + hex2[dir + 2].y >= mvmin.y) & (bmv.y + hex2[dir + 2].y <= mvmax.y))
155
+                        COPY1_IF_LT(bcost, (costs[2] << 3) + 3);
156
+
157
+                    if (!(bcost & 7))
158
+                        break;
159
+
160
+                    dir += (bcost & 7) - 2;
161
+                    dir = mod6m1[dir + 1];
162
+                    bmv += hex2[dir + 1];
163
+                }
164
+            } // if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))
165
         }
166
         bcost >>= 3;
167
 #endif // if 0
168
@@ -735,15 +783,21 @@
169
         /* square refine */
170
         int dir = 0;
171
         COST_MV_X4_DIR(0, -1,  0, 1, -1, 0, 1, 0, costs);
172
-        COPY2_IF_LT(bcost, costs[0], dir, 1);
173
-        COPY2_IF_LT(bcost, costs[1], dir, 2);
174
+        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
175
+            COPY2_IF_LT(bcost, costs[0], dir, 1);
176
+        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
177
+            COPY2_IF_LT(bcost, costs[1], dir, 2);
178
         COPY2_IF_LT(bcost, costs[2], dir, 3);
179
         COPY2_IF_LT(bcost, costs[3], dir, 4);
180
         COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs);
181
-        COPY2_IF_LT(bcost, costs[0], dir, 5);
182
-        COPY2_IF_LT(bcost, costs[1], dir, 6);
183
-        COPY2_IF_LT(bcost, costs[2], dir, 7);
184
-        COPY2_IF_LT(bcost, costs[3], dir, 8);
185
+        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
186
+            COPY2_IF_LT(bcost, costs[0], dir, 5);
187
+        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
188
+            COPY2_IF_LT(bcost, costs[1], dir, 6);
189
+        if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))
190
+            COPY2_IF_LT(bcost, costs[2], dir, 7);
191
+        if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))
192
+            COPY2_IF_LT(bcost, costs[3], dir, 8);
193
         bmv += square1[dir];
194
         break;
195
     }
196
@@ -756,6 +810,7 @@
197
         /* refine predictors */
198
         omv = bmv;
199
         ucost1 = bcost;
200
+        X265_CHECK(((pmv.y >= mvmin.y) & (pmv.y <= mvmax.y)), "pmv outside of search range!");
201
x265_2.1.tar.gz/source/encoder/motion.h -> x265_2.2.tar.gz/source/encoder/motion.h Changed
17
 
1
@@ -52,6 +52,7 @@
2
     pixelcmp_t sad;
3
     pixelcmp_x3_t sad_x3;
4
     pixelcmp_x4_t sad_x4;
5
+    pixelcmp_ads_t ads;
6
     pixelcmp_t satd;
7
     pixelcmp_t chromaSatd;
8
 
9
@@ -61,6 +62,7 @@
10
 
11
     static const int COST_MAX = 1 << 28;
12
 
13
+    uint32_t* integral[INTEGRAL_PLANE_NUM];
14
     Yuv fencPUYuv;
15
     int partEnum;
16
     bool bChromaSATD;
17
x265_2.1.tar.gz/source/encoder/nal.h -> x265_2.2.tar.gz/source/encoder/nal.h Changed
9
 
1
@@ -34,6 +34,7 @@
2
 
3
 class NALList
4
 {
5
+public:
6
     static const int MAX_NAL_UNITS = 16;
7
 
8
 public:
9
x265_2.1.tar.gz/source/encoder/ratecontrol.cpp -> x265_2.2.tar.gz/source/encoder/ratecontrol.cpp Changed
201
 
1
@@ -341,6 +341,8 @@
2
             m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize);
3
         m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize));
4
         m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit;
5
+        m_bufferFillActual = m_bufferFillFinal;
6
+        m_bufferExcess = 0;
7
     }
8
 
9
     m_totalBits = 0;
10
@@ -431,7 +433,7 @@
11
                 }
12
                 *statsIn = '\0';
13
                 statsIn++;
14
-                if (sscanf(opts, "#options: %dx%d", &i, &j) != 2)
15
+                if ((p = strstr(opts, " input-res=")) == 0 || sscanf(p, " input-res=%dx%d", &i, &j) != 2)
16
                 {
17
                     x265_log(m_param, X265_LOG_ERROR, "Resolution specified in stats file not valid\n");
18
                     return false;
19
@@ -457,9 +459,15 @@
20
                 CMP_OPT_FIRST_PASS("bframes", m_param->bframes);
21
                 CMP_OPT_FIRST_PASS("b-pyramid", m_param->bBPyramid);
22
                 CMP_OPT_FIRST_PASS("open-gop", m_param->bOpenGOP);
23
-                CMP_OPT_FIRST_PASS("keyint", m_param->keyframeMax);
24
+                CMP_OPT_FIRST_PASS(" keyint", m_param->keyframeMax);
25
                 CMP_OPT_FIRST_PASS("scenecut", m_param->scenecutThreshold);
26
                 CMP_OPT_FIRST_PASS("intra-refresh", m_param->bIntraRefresh);
27
+                if (m_param->bMultiPassOptRPS)
28
+                {
29
+                    CMP_OPT_FIRST_PASS("multi-pass-opt-rps", m_param->bMultiPassOptRPS);
30
+                    CMP_OPT_FIRST_PASS("repeat-headers", m_param->bRepeatHeaders);
31
+                    CMP_OPT_FIRST_PASS("min-keyint", m_param->keyframeMin);
32
+                }
33
 
34
                 if ((p = strstr(opts, "b-adapt=")) != 0 && sscanf(p, "b-adapt=%d", &i) && i >= X265_B_ADAPT_NONE && i <= X265_B_ADAPT_TRELLIS)
35
                 {
36
@@ -542,10 +550,27 @@
37
                 }
38
                 rce = &m_rce2Pass[encodeOrder];
39
                 m_encOrder[frameNumber] = encodeOrder;
40
-                e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf",
41
-                       &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
42
-                       &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
43
-                       &rce->skipCuCount);
44
+                if (!m_param->bMultiPassOptRPS)
45
+                {
46
+                    e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf",
47
+                        &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
48
+                        &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
49
+                        &rce->skipCuCount);
50
+                }
51
+                else
52
+                {
53
+                    char deltaPOC[128];
54
+                    char bUsed[40];
55
+                    memset(deltaPOC, 0, sizeof(deltaPOC));
56
+                    memset(bUsed, 0, sizeof(bUsed));
57
+                    e += sscanf(p, " in:%*d out:%*d type:%c q:%lf q-aq:%lf q-noVbv:%lf q-Rceq:%lf tex:%d mv:%d misc:%d icu:%lf pcu:%lf scu:%lf nump:%d numnegp:%d numposp:%d deltapoc:%s bused:%s",
58
+                        &picType, &qpRc, &qpAq, &qNoVbv, &qRceq, &rce->coeffBits,
59
+                        &rce->mvBits, &rce->miscBits, &rce->iCuCount, &rce->pCuCount,
60
+                        &rce->skipCuCount, &rce->rpsData.numberOfPictures, &rce->rpsData.numberOfNegativePictures, &rce->rpsData.numberOfPositivePictures, deltaPOC, bUsed);
61
+                    splitdeltaPOC(deltaPOC, rce);
62
+                    splitbUsed(bUsed, rce);
63
+                    rce->rpsIdx = -1;
64
+                }
65
                 rce->keptAsRef = true;
66
                 rce->isIdr = false;
67
                 if (picType == 'b' || picType == 'p')
68
@@ -598,7 +623,7 @@
69
                 x265_log_file(m_param, X265_LOG_ERROR, "can't open stats file %s.temp\n", fileName);
70
                 return false;
71
             }
72
-            p = x265_param2string(m_param);
73
+            p = x265_param2string(m_param, sps.conformanceWindow.rightOffset, sps.conformanceWindow.bottomOffset);
74
             if (p)
75
                 fprintf(m_statFileOut, "#options: %s\n", p);
76
             X265_FREE(p);
77
@@ -1649,15 +1674,18 @@
78
                 if (m_pred[m_predType].count == 1)
79
                     qScale = x265_clip3(lmin, lmax, qScale);
80
                 m_lastQScaleFor[m_sliceType] = qScale;
81
-                rce->frameSizePlanned = predictSize(&m_pred[m_predType], qScale, (double)m_currentSatd);
82
             }
83
-            else
84
-                rce->frameSizePlanned = qScale2bits(rce, qScale);
85
+        }
86
 
87
-            /* Limit planned size by MinCR */
88
+        if (m_2pass)
89
+            rce->frameSizePlanned = qScale2bits(rce, qScale);
90
+        else
91
+            rce->frameSizePlanned = predictSize(&m_pred[m_predType], qScale, (double)m_currentSatd);
92
+
93
+        /* Limit planned size by MinCR */
94
+        if (m_isVbv)
95
             rce->frameSizePlanned = X265_MIN(rce->frameSizePlanned, rce->frameSizeMaximum);
96
-            rce->frameSizeEstimated = rce->frameSizePlanned;
97
-        }
98
+        rce->frameSizeEstimated = rce->frameSizePlanned;
99
 
100
         rce->newQScale = qScale;
101
         if(rce->bLastMiniGopBFrame)
102
@@ -1875,7 +1903,7 @@
103
         if ((m_curSlice->m_poc == 0 || m_lastQScaleFor[P_SLICE] < q) && !(m_2pass && !m_isVbv))
104
             m_lastQScaleFor[P_SLICE] = q * fabs(m_param->rc.ipFactor);
105
 
106
-        if (m_2pass && m_isVbv)
107
+        if (m_2pass)
108
             rce->frameSizePlanned = qScale2bits(rce, q);
109
         else
110
             rce->frameSizePlanned = predictSize(&m_pred[m_predType], q, (double)m_currentSatd);
111
@@ -2161,7 +2189,7 @@
112
     for (uint32_t row = 0; row < maxRows; row++)
113
     {
114
         encodedBitsSoFar += curEncData.m_rowStat[row].encodedBits;
115
-        rowSatdCostSoFar = curEncData.m_rowStat[row].diagSatd;
116
+        rowSatdCostSoFar = curEncData.m_rowStat[row].rowSatd;
117
         uint32_t satdCostForPendingCus = curEncData.m_rowStat[row].satdForVbv - rowSatdCostSoFar;
118
         satdCostForPendingCus >>= X265_DEPTH - 8;
119
         if (satdCostForPendingCus  > 0)
120
@@ -2190,7 +2218,7 @@
121
                 }
122
 
123
                 refRowSatdCost >>= X265_DEPTH - 8;
124
-                refQScale = refEncData.m_rowStat[row].diagQpScale;
125
+                refQScale = refEncData.m_rowStat[row].rowQpScale;
126
             }
127
 
128
             if (picType == I_SLICE || qScale >= refQScale)
129
@@ -2212,7 +2240,7 @@
130
             }
131
             else if (picType == P_SLICE)
132
             {
133
-                intraCostForPendingCus = curEncData.m_rowStat[row].intraSatdForVbv - curEncData.m_rowStat[row].diagIntraSatd;
134
+                intraCostForPendingCus = curEncData.m_rowStat[row].intraSatdForVbv - curEncData.m_rowStat[row].rowIntraSatd;
135
                 intraCostForPendingCus >>= X265_DEPTH - 8;
136
                 /* Our QP is lower than the reference! */
137
                 double pred_intra = predictSize(rce->rowPred[1], qScale, intraCostForPendingCus);
138
@@ -2227,16 +2255,16 @@
139
     return totalSatdBits + encodedBitsSoFar;
140
 }
141
 
142
-int RateControl::rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv)
143
+int RateControl::rowVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv)
144
 {
145
     FrameData& curEncData = *curFrame->m_encData;
146
     double qScaleVbv = x265_qp2qScale(qpVbv);
147
-    uint64_t rowSatdCost = curEncData.m_rowStat[row].diagSatd;
148
+    uint64_t rowSatdCost = curEncData.m_rowStat[row].rowSatd;
149
     double encodedBits = curEncData.m_rowStat[row].encodedBits;
150
 
151
-    if (row == 1)
152
+    if (m_param->bEnableWavefront && row == 1)
153
     {
154
-        rowSatdCost += curEncData.m_rowStat[0].diagSatd;
155
+        rowSatdCost += curEncData.m_rowStat[0].rowSatd;
156
         encodedBits += curEncData.m_rowStat[0].encodedBits;
157
     }
158
     rowSatdCost >>= X265_DEPTH - 8;
159
@@ -2244,11 +2272,11 @@
160
     if (curEncData.m_slice->m_sliceType != I_SLICE)
161
     {
162
         Frame* refFrame = curEncData.m_slice->m_refFrameList[0][0];
163
-        if (qpVbv < refFrame->m_encData->m_rowStat[row].diagQp)
164
+        if (qpVbv < refFrame->m_encData->m_rowStat[row].rowQp)
165
         {
166
-            uint64_t intraRowSatdCost = curEncData.m_rowStat[row].diagIntraSatd;
167
-            if (row == 1)
168
-                intraRowSatdCost += curEncData.m_rowStat[0].diagIntraSatd;
169
+            uint64_t intraRowSatdCost = curEncData.m_rowStat[row].rowIntraSatd;
170
+            if (m_param->bEnableWavefront && row == 1)
171
+                intraRowSatdCost += curEncData.m_rowStat[0].rowIntraSatd;
172
             intraRowSatdCost >>= X265_DEPTH - 8;
173
             updatePredictor(rce->rowPred[1], qScaleVbv, (double)intraRowSatdCost, encodedBits);
174
         }
175
@@ -2309,7 +2337,7 @@
176
         }
177
 
178
         while (qpVbv > qpMin
179
-               && (qpVbv > curEncData.m_rowStat[0].diagQp || m_singleFrameVbv)
180
+               && (qpVbv > curEncData.m_rowStat[0].rowQp || m_singleFrameVbv)
181
                && (((accFrameBits < rce->frameSizePlanned * 0.8f && qpVbv <= prevRowQp)
182
                    || accFrameBits < (rce->bufferFill - m_bufferSize + m_bufferRate) * 1.1)
183
                    && (!m_param->rc.bStrictCbr ? 1 : abrOvershoot < 0)))
184
@@ -2329,7 +2357,7 @@
185
                 accFrameBits = predictRowsSizeSum(curFrame, rce, qpVbv, encodedBitsSoFar);
186
                 abrOvershoot = (accFrameBits + m_totalBits - m_wantedBitsWindow) / totalBitsNeeded;
187
             }
188
-            if (qpVbv > curEncData.m_rowStat[0].diagQp &&
189
+            if (qpVbv > curEncData.m_rowStat[0].rowQp &&
190
                 abrOvershoot < -0.1 && timeDone > 0.5 && accFrameBits < rce->frameSizePlanned - rcTol)
191
             {
192
                 qpVbv -= stepSize;
193
@@ -2446,6 +2474,10 @@
194
     m_bufferFillFinal = X265_MAX(m_bufferFillFinal, 0);
195
     m_bufferFillFinal += m_bufferRate;
196
     m_bufferFillFinal = X265_MIN(m_bufferFillFinal, m_bufferSize);
197
+    double bufferBits = X265_MIN(bits + m_bufferExcess, m_bufferRate);
198
+    m_bufferExcess = X265_MAX(m_bufferExcess - bufferBits + bits, 0);
199
+    m_bufferFillActual += bufferBits - bits;
200
+    m_bufferFillActual = X265_MIN(m_bufferFillActual, m_bufferSize);
201
x265_2.1.tar.gz/source/encoder/ratecontrol.h -> x265_2.2.tar.gz/source/encoder/ratecontrol.h Changed
37
 
1
@@ -111,6 +111,8 @@
2
     bool     isIdr;
3
     SEIPictureTiming *picTimingSEI;
4
     HRDTiming        *hrdTiming;
5
+    int      rpsIdx;
6
+    RPS      rpsData;
7
 };
8
 
9
 class RateControl
10
@@ -144,6 +146,8 @@
11
     double m_rateFactorMaxIncrement; /* Don't allow RF above (CRF + this value). */
12
     double m_rateFactorMaxDecrement; /* don't allow RF below (this value). */
13
     double m_avgPFrameQp;
14
+    double m_bufferFillActual;
15
+    double m_bufferExcess;
16
     bool   m_isFirstMiniGop;
17
     Predictor m_pred[4];       /* Slice predictors to preidct bits for each Slice type - I,P,Bref and B */
18
     int64_t m_leadingNoBSatd;
19
@@ -239,7 +243,7 @@
20
     int  rateControlStart(Frame* curFrame, RateControlEntry* rce, Encoder* enc);
21
     void rateControlUpdateStats(RateControlEntry* rce);
22
     int  rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry* rce);
23
-    int  rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv);
24
+    int  rowVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv);
25
     int  rateControlSliceType(int frameNum);
26
     bool cuTreeReadFor2Pass(Frame* curFrame);
27
     void hrdFullness(SEIBufferingPeriod* sei);
28
@@ -280,6 +284,8 @@
29
     bool   findUnderflow(double *fills, int *t0, int *t1, int over, int framesCount);
30
     bool   fixUnderflow(int t0, int t1, double adjustment, double qscaleMin, double qscaleMax);
31
     double tuneQScaleForGrain(double rcOverflow);
32
+    void   splitdeltaPOC(char deltapoc[], RateControlEntry *rce);
33
+    void   splitbUsed(char deltapoc[], RateControlEntry *rce);
34
 };
35
 }
36
 #endif // ifndef X265_RATECONTROL_H
37
x265_2.1.tar.gz/source/encoder/reference.cpp -> x265_2.2.tar.gz/source/encoder/reference.cpp Changed
27
 
1
@@ -128,11 +128,12 @@
2
     intptr_t stride = reconPic->m_stride;
3
     int width   = reconPic->m_picWidth;
4
     int height  = (finishedRows - numWeightedRows) * g_maxCUSize;
5
-    if ((finishedRows == maxNumRows) && (reconPic->m_picHeight % g_maxCUSize))
6
+    /* the last row may be partial height */
7
+    if (finishedRows == maxNumRows - 1)
8
     {
9
-        /* the last row may be partial height */
10
-        height -= g_maxCUSize;
11
-        height += reconPic->m_picHeight % g_maxCUSize;
12
+        const int leftRows = (reconPic->m_picHeight & (g_maxCUSize - 1));
13
+
14
+        height += leftRows ? leftRows : g_maxCUSize;
15
     }
16
     int cuHeight = g_maxCUSize;
17
 
18
@@ -172,7 +173,7 @@
19
         }
20
 
21
         // Extending Bottom
22
-        if (finishedRows == maxNumRows)
23
+        if (finishedRows == maxNumRows - 1)
24
         {
25
             int picHeight = reconPic->m_picHeight;
26
             if (c) picHeight >>= reconPic->m_vChromaShift;
27
x265_2.1.tar.gz/source/encoder/sao.cpp -> x265_2.2.tar.gz/source/encoder/sao.cpp Changed
17
 
1
@@ -1208,10 +1208,15 @@
2
     if (!saoParam->bSaoFlag[0])
3
         m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth] = 1.0;
4
     else
5
+    {
6
+        X265_CHECK(m_numNoSao[0] <= numctus, "m_numNoSao check failure!");
7
         m_depthSaoRate[0 * SAO_DEPTHRATE_SIZE + m_refDepth] = m_numNoSao[0] / ((double)numctus);
8
+    }
9
 
10
     if (!saoParam->bSaoFlag[1])
11
+    {
12
         m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth] = 1.0;
13
+    }
14
     else
15
         m_depthSaoRate[1 * SAO_DEPTHRATE_SIZE + m_refDepth] = m_numNoSao[1] / ((double)numctus);
16
 }
17
x265_2.1.tar.gz/source/encoder/search.cpp -> x265_2.2.tar.gz/source/encoder/search.cpp Changed
201
 
1
@@ -67,6 +67,7 @@
2
     m_param = NULL;
3
     m_slice = NULL;
4
     m_frame = NULL;
5
+    m_maxTUDepth = -1;
6
 }
7
 
8
 bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
9
@@ -93,6 +94,19 @@
10
     uint32_t sizeC = sizeL >> (m_hChromaShift + m_vChromaShift);
11
     uint32_t numPartitions = 1 << (maxLog2CUSize - LOG2_UNIT_SIZE) * 2;
12
 
13
+    m_limitTU = 0;
14
+    if (m_param->limitTU)
15
+    {
16
+        if (m_param->limitTU == 1)
17
+            m_limitTU = X265_TU_LIMIT_BFS;
18
+        else if (m_param->limitTU == 2)
19
+            m_limitTU = X265_TU_LIMIT_DFS;
20
+        else if (m_param->limitTU == 3)
21
+            m_limitTU = X265_TU_LIMIT_NEIGH;
22
+        else if (m_param->limitTU == 4)
23
+            m_limitTU = X265_TU_LIMIT_DFS + X265_TU_LIMIT_NEIGH;
24
+    }
25
+
26
     /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
27
      * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
28
      * which are reconstructed at each depth are valid. At the end, the transform depth table
29
@@ -2131,6 +2145,13 @@
30
                 int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
31
                 MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
32
 
33
+                if (m_param->searchMethod == X265_SEA)
34
+                {
35
+                    int puX = puIdx & 1;
36
+                    int puY = puIdx >> 1;
37
+                    for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
38
+                        m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;
39
+                }
40
                 setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
41
                 int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv,
42
                   m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
43
@@ -2229,7 +2250,13 @@
44
                         if (lmv.notZero())
45
                             mvc[numMvc++] = lmv;
46
                     }
47
-
48
+                    if (m_param->searchMethod == X265_SEA)
49
+                    {
50
+                        int puX = puIdx & 1;
51
+                        int puY = puIdx >> 1;
52
+                        for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
53
+                            m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;
54
+                    }
55
                     setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
56
                     int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, 
57
                       m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
58
@@ -2544,6 +2571,9 @@
59
     /* conditional clipping for frame parallelism */
60
     mvmin.y = X265_MIN(mvmin.y, (int16_t)m_refLagPixels);
61
     mvmax.y = X265_MIN(mvmax.y, (int16_t)m_refLagPixels);
62
+
63
+    /* conditional clipping for negative mv range */
64
+    mvmax.y = X265_MAX(mvmax.y, mvmin.y);
65
 }
66
 
67
 /* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
68
@@ -2617,8 +2647,29 @@
69
 
70
     m_entropyCoder.load(m_rqt[depth].cur);
71
 
72
+    if ((m_limitTU & X265_TU_LIMIT_DFS) && !(m_limitTU & X265_TU_LIMIT_NEIGH))
73
+        m_maxTUDepth = -1;
74
+    else if (m_limitTU & X265_TU_LIMIT_BFS)
75
+        memset(&m_cacheTU, 0, sizeof(TUInfoCache));
76
+
77
     Cost costs;
78
-    estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
79
+    if (m_limitTU & X265_TU_LIMIT_NEIGH)
80
+    {
81
+        /* Save and reload maxTUDepth to avoid changing of maxTUDepth between modes */
82
+        int32_t tempDepth = m_maxTUDepth;
83
+        if (m_maxTUDepth != -1)
84
+        {
85
+            uint32_t splitFlag = interMode.cu.m_partSize[0] != SIZE_2Nx2N;
86
+            uint32_t minSize = tuDepthRange[0];
87
+            uint32_t maxSize = tuDepthRange[1];
88
+            maxSize = X265_MIN(maxSize, cuGeom.log2CUSize - splitFlag);
89
+            m_maxTUDepth = x265_clip3(cuGeom.log2CUSize - maxSize, cuGeom.log2CUSize - minSize, (uint32_t)m_maxTUDepth);
90
+        }
91
+        estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
92
+        m_maxTUDepth = tempDepth;
93
+    }
94
+    else
95
+        estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);
96
 
97
     uint32_t tqBypass = cu.m_tqBypass[0];
98
     if (!tqBypass)
99
@@ -2867,7 +2918,57 @@
100
         return m_rdCost.calcRdCost(dist, nullBits);
101
 }
102
 
103
-void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2])
104
+bool Search::splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& splitCost, const uint32_t depthRange[2], int32_t splitMore)
105
+{
106
+    CUData& cu = mode.cu;
107
+    uint32_t depth = cuGeom.depth + tuDepth;
108
+    uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
109
+
110
+    uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
111
+    uint32_t ycbf = 0, ucbf = 0, vcbf = 0;
112
+    for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
113
+    {
114
+        if ((m_limitTU & X265_TU_LIMIT_DFS) && tuDepth == 0 && qIdx == 1)
115
+        {
116
+            m_maxTUDepth = cu.m_tuDepth[0];
117
+            // Fetch maximum TU depth of first sub partition to limit recursion of others
118
+            for (uint32_t i = 1; i < cuGeom.numPartitions / 4; i++)
119
+                m_maxTUDepth = X265_MAX(m_maxTUDepth, cu.m_tuDepth[i]);
120
+        }
121
+        estimateResidualQT(mode, cuGeom, qPartIdx, tuDepth + 1, resiYuv, splitCost, depthRange, splitMore);
122
+        ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
123
+        if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
124
+        {
125
+            ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
126
+            vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
127
+        }
128
+    }
129
+    cu.m_cbf[0][absPartIdx] |= ycbf << tuDepth;
130
+    if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)
131
+    {
132
+        cu.m_cbf[1][absPartIdx] |= ucbf << tuDepth;
133
+        cu.m_cbf[2][absPartIdx] |= vcbf << tuDepth;
134
+    }
135
+
136
+    // Here we were encoding cbfs and coefficients for splitted blocks. Since I have collected coefficient bits
137
+    // for each individual blocks, only encoding cbf values. As I mentioned encoding chroma cbfs is different then luma.
138
+    // But have one doubt that if coefficients are encoded in context at depth 2 (for example) and cbfs are encoded in context
139
+    // at depth 0 (for example).
140
+    m_entropyCoder.load(m_rqt[depth].rqtRoot);
141
+    m_entropyCoder.resetBits();
142
+    codeInterSubdivCbfQT(cu, absPartIdx, tuDepth, depthRange);
143
+    uint32_t splitCbfBits = m_entropyCoder.getNumberOfWrittenBits();
144
+    splitCost.bits += splitCbfBits;
145
+
146
+    if (m_rdCost.m_psyRd)
147
+        splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
148
+    else
149
+        splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
150
+        
151
+    return ycbf || ucbf || vcbf;
152
+}
153
+
154
+void Search::estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& outCosts, const uint32_t depthRange[2], int32_t splitMore)
155
 {
156
     CUData& cu = mode.cu;
157
     uint32_t depth = cuGeom.depth + tuDepth;
158
@@ -2876,6 +2977,37 @@
159
 
160
     bool bCheckSplit = log2TrSize > depthRange[0];
161
     bool bCheckFull = log2TrSize <= depthRange[1];
162
+    bool bSaveTUData = false, bLoadTUData = false;
163
+    uint32_t idx = 0;
164
+
165
+    if ((m_limitTU & X265_TU_LIMIT_BFS) && splitMore >= 0)
166
+    {
167
+        if (bCheckSplit && bCheckFull && tuDepth)
168
+        {
169
+            uint32_t qNumParts = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2;
170
+            uint32_t qIdx = (absPartIdx / qNumParts) % 4;
171
+            idx = (depth - 1) * 4 + qIdx;
172
+            if (splitMore)
173
+            {
174
+                bLoadTUData = true;
175
+                bCheckFull = false;
176
+            }
177
+            else
178
+            {
179
+                bSaveTUData = true;
180
+                bCheckSplit = false;
181
+            }
182
+        }
183
+    }
184
+    else if (m_limitTU & X265_TU_LIMIT_DFS || m_limitTU & X265_TU_LIMIT_NEIGH)
185
+    {
186
+        if (bCheckSplit && m_maxTUDepth >= 0)
187
+        {
188
+            uint32_t log2MaxTrSize = cuGeom.log2CUSize - m_maxTUDepth;
189
+            bCheckSplit = log2TrSize > log2MaxTrSize;
190
+        }
191
+    }
192
+
193
     bool bSplitPresentFlag = bCheckSplit && bCheckFull;
194
 
195
     if (cu.m_partSize[0] != SIZE_2Nx2N && !tuDepth && bCheckSplit)
196
@@ -3194,6 +3326,8 @@
197
                 singlePsyEnergy[TEXT_LUMA][0] = nonZeroPsyEnergyY;
198
                 cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
199
                 bestTransformMode[TEXT_LUMA][0] = 1;
200
+                if (m_param->limitTU)
201
x265_2.1.tar.gz/source/encoder/search.h -> x265_2.2.tar.gz/source/encoder/search.h Changed
39
 
1
@@ -49,6 +49,8 @@
2
 #define ProfileCounter(cu, count)
3
 #endif
4
 
5
+#define NUM_SUBPART MAX_TS_SIZE * 4 // 4 sub partitions * 4 depth
6
+
7
 namespace X265_NS {
8
 // private namespace
9
 
10
@@ -275,6 +277,9 @@
11
     uint32_t        m_numLayers;
12
     uint32_t        m_refLagPixels;
13
 
14
+    int32_t         m_maxTUDepth;
15
+    uint16_t        m_limitTU;
16
+
17
     int16_t         m_sliceMaxY;
18
     int16_t         m_sliceMinY;
19
 
20
@@ -377,8 +382,17 @@
21
         Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
22
     };
23
 
24
+    struct TUInfoCache
25
+    {
26
+        Cost cost[NUM_SUBPART];
27
+        uint32_t bestTransformMode[NUM_SUBPART][MAX_NUM_COMPONENT][2];
28
+        uint8_t cbfFlag[NUM_SUBPART][MAX_NUM_COMPONENT][2];
29
+        Entropy rqtStore[NUM_SUBPART];
30
+    } m_cacheTU;
31
+
32
     uint64_t estimateNullCbfCost(sse_t dist, uint32_t psyEnergy, uint32_t tuDepth, TextType compId);
33
-    void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2]);
34
+    bool     splitTU(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, ShortYuv& resiYuv, Cost& splitCost, const uint32_t depthRange[2], int32_t splitMore);
35
+    void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2], int32_t splitMore = -1);
36
 
37
     // generate prediction, generate residual and recon. if bAllowSplit, find optimal RQT splits
38
     void     codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& costs, const uint32_t depthRange[2]);
39
x265_2.1.tar.gz/source/encoder/slicetype.cpp -> x265_2.2.tar.gz/source/encoder/slicetype.cpp Changed
10
 
1
@@ -1617,7 +1617,7 @@
2
 
3
     /* magic numbers pulled out of thin air */
4
     float threshMin = (float)(threshMax * 0.25);
5
-    double bias = 0.05;
6
+    double bias = m_param->scenecutBias;
7
     if (bRealScenecut)
8
     {
9
         if (m_param->keyframeMin == m_param->keyframeMax)
10
x265_2.1.tar.gz/source/input/y4m.cpp -> x265_2.2.tar.gz/source/input/y4m.cpp Changed
46
 
1
@@ -280,7 +280,7 @@
2
                 {
3
                     c = ifs->get();
4
 
5
-                    if (c <= '9' && c >= '0')
6
+                    if (c <= 'o' && c >= '0')
7
                         csp = csp * 10 + (c - '0');
8
                     else if (c == 'p')
9
                     {
10
@@ -300,9 +300,23 @@
11
                         break;
12
                 }
13
 
14
-                if (d >= 8 && d <= 16)
15
-                    depth = d;
16
-                colorSpace = (csp == 444) ? X265_CSP_I444 : (csp == 422) ? X265_CSP_I422 : X265_CSP_I420;
17
+                switch (csp)
18
+                {
19
+                case ('m'-'0')*100000 + ('o'-'0')*10000 + ('n'-'0')*1000 + ('o'-'0')*100 + 16:
20
+                    colorSpace = X265_CSP_I400;
21
+                    depth = 16;
22
+                    break;
23
+
24
+                case ('m'-'0')*1000 + ('o'-'0')*100 + ('n'-'0')*10 + ('o'-'0'):
25
+                    colorSpace = X265_CSP_I400;
26
+                    depth = 8;
27
+                    break;
28
+                   
29
+                default:
30
+                    if (d >= 8 && d <= 16)
31
+                        depth = d;
32
+                    colorSpace = (csp == 444) ? X265_CSP_I444 : (csp == 422) ? X265_CSP_I422 : X265_CSP_I420;
33
+                }
34
                 break;
35
 
36
             default:
37
@@ -324,7 +338,7 @@
38
     if (width < MIN_FRAME_WIDTH || width > MAX_FRAME_WIDTH ||
39
         height < MIN_FRAME_HEIGHT || height > MAX_FRAME_HEIGHT ||
40
         (rateNum / rateDenom) < 1 || (rateNum / rateDenom) > MAX_FRAME_RATE ||
41
-        colorSpace <= X265_CSP_I400 || colorSpace >= X265_CSP_COUNT)
42
+        colorSpace < X265_CSP_I400 || colorSpace >= X265_CSP_COUNT)
43
         return false;
44
 
45
     return true;
46
x265_2.1.tar.gz/source/test/rate-control-tests.txt -> x265_2.2.tar.gz/source/test/rate-control-tests.txt Changed
18
 
1
@@ -21,6 +21,9 @@
2
 big_buck_bunny_360p24.y4m,--preset medium --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 600 --aud --hrd --tune fast-decode
3
 sita_1920x1080_30.yuv,--preset superfast --crf 25 --vbv-bufsize 3000 --vbv-maxrate 4000 --vbv-bufsize 5000 --hrd  --crf-max 30
4
 sita_1920x1080_30.yuv,--preset superfast --bitrate 3000 --vbv-bufsize 3000 --vbv-maxrate 3000 --aud --strict-cbr
5
+BasketballDrive_1920x1080_50.y4m,--preset ultrafast --bitrate 3000 --vbv-bufsize 3000 --vbv-maxrate 3000 --no-wpp
6
+big_buck_bunny_360p24.y4m,--preset medium --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 600 --no-wpp --aud --hrd --tune fast-decode
7
+sita_1920x1080_30.yuv,--preset superfast --bitrate 3000 --vbv-bufsize 3000 --vbv-maxrate 3000 --aud --strict-cbr --no-wpp
8
 
9
 
10
 
11
@@ -38,4 +41,5 @@
12
 RaceHorses_416x240_30_10bit.yuv,--preset medium --crf 40 --pass 1, --preset faster --bitrate 200 --pass 2 -F4
13
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --bitrate 2500 --pass 1 -F4 --slow-firstpass,--preset superfast --bitrate 2500 --pass 2 -F4
14
 RaceHorses_416x240_30_10bit.yuv,--preset medium --crf 26 --vbv-maxrate 1000 --vbv-bufsize 1000 --pass 1,--preset fast --bitrate 1000  --vbv-maxrate 1000 --vbv-bufsize 700 --pass 3 -F4,--preset slow --bitrate 500 --vbv-maxrate 500  --vbv-bufsize 700 --pass 2 -F4
15
-
16
+sita_1920x1080_30.yuv, --preset ultrafast --crf 20 --no-cutree --keyint 50 --min-keyint 50 --no-open-gop --pass 1 --vbv-bufsize 7000 --vbv-maxrate 5000, --preset ultrafast --crf 20 --no-cutree --keyint 50 --min-keyint 50 --no-open-gop --pass 2 --vbv-bufsize 7000 --vbv-maxrate 5000 --repeat-headers
17
+sita_1920x1080_30.yuv, --preset medium --crf 20 --no-cutree --keyint 50 --min-keyint 50 --no-open-gop --pass 1 --vbv-bufsize 7000 --vbv-maxrate 5000 --repeat-headers --multi-pass-opt-rps, --preset medium --crf 20 --no-cutree --keyint 50 --min-keyint 50 --no-open-gop --pass 2 --vbv-bufsize 7000 --vbv-maxrate 5000 --repeat-headers --multi-pass-opt-rps
18
x265_2.1.tar.gz/source/test/regression-tests.txt -> x265_2.2.tar.gz/source/test/regression-tests.txt Changed
114
 
1
@@ -14,20 +14,21 @@
2
 BasketballDrive_1920x1080_50.y4m,--preset ultrafast --signhide --colormatrix bt709
3
 BasketballDrive_1920x1080_50.y4m,--preset superfast --psy-rd 1 --ctu 16 --no-wpp --limit-modes
4
 BasketballDrive_1920x1080_50.y4m,--preset veryfast --tune zerolatency --no-temporal-mvp
5
-BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190
6
-BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless
7
+BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190 --slices 3
8
+BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless --tu-inter-depth 3 --limit-tu 1
9
 BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao
10
 BasketballDrive_1920x1080_50.y4m,--preset medium --no-cutree --analysis-mode=save --bitrate 7000 --limit-modes,--preset medium --no-cutree --analysis-mode=load --bitrate 7000 --limit-modes
11
 BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 --qg-size 16 --limit-refs 1
12
-BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0
13
+BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0 --limit-tu 4
14
 BasketballDrive_1920x1080_50.y4m,--preset slower --no-cutree --analysis-mode=save --bitrate 7000,--preset slower --no-cutree --analysis-mode=load --bitrate 7000
15
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3
16
-BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-mode=save --bitrate 7000 --tskip-fast,--preset veryslow --no-cutree --analysis-mode=load --bitrate 7000  --tskip-fast
17
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode --limit-refs 1 --aq-mode 3 --limit-tu 3
18
+BasketballDrive_1920x1080_50.y4m,--preset veryslow --no-cutree --analysis-mode=save --bitrate 7000 --tskip-fast --limit-tu 4,--preset veryslow --no-cutree --analysis-mode=load --bitrate 7000  --tskip-fast --limit-tu 4
19
 BasketballDrive_1920x1080_50.y4m,--preset veryslow --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
20
 Coastguard-4k.y4m,--preset ultrafast --recon-y4m-exec "ffplay -i pipe:0 -autoexit"
21
 Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop
22
+Coastguard-4k.y4m,--preset superfast --tune grain --pme --aq-strength 2 --merange 190
23
 Coastguard-4k.y4m,--preset veryfast --no-cutree --analysis-mode=save --bitrate 15000,--preset veryfast --no-cutree --analysis-mode=load --bitrate 15000
24
-Coastguard-4k.y4m,--preset medium --rdoq-level 1 --tune ssim --no-signhide --me umh
25
+Coastguard-4k.y4m,--preset medium --rdoq-level 1 --tune ssim --no-signhide --me umh --slices 2
26
 Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 --limit-refs 1
27
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency --qg-size 16
28
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao
29
@@ -41,13 +42,14 @@
30
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd
31
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers --limit-refs 2
32
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1 --limit-modes
33
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut
34
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut --limit-tu 1
35
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp --qg-size 16
36
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16 --limit-modes
37
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd --qg-size 32 --limit-refs 0 --cu-lossless
38
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4
39
 DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq
40
-DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3
41
+DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 --limit-refs 3 --tu-inter-depth 4 --limit-tu 3
42
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset fast --no-cutree --analysis-mode=save --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1,--preset fast --no-cutree --analysis-mode=load --bitrate 3000 --early-skip --tu-inter-depth 3 --limit-tu 1
43
 FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2
44
 FourPeople_1280x720_60.y4m,--preset veryfast --aq-mode 2 --aq-strength 1.5 --qg-size 8
45
 FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd
46
@@ -61,24 +63,27 @@
47
 KristenAndSara_1280x720_60.y4m,--preset ultrafast --strong-intra-smoothing
48
 KristenAndSara_1280x720_60.y4m,--preset superfast --min-cu-size 16 --qg-size 16 --limit-refs 1
49
 KristenAndSara_1280x720_60.y4m,--preset medium --no-cutree --max-tu-size 16
50
-KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0 --limit-modes
51
+KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 --limit-refs 0 --limit-modes --limit-tu 1
52
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset superfast --tune psnr
53
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain --limit-refs 2
54
 NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset slow --no-cutree --analysis-mode=save --bitrate 9000,--preset slow --no-cutree --analysis-mode=load --bitrate 9000
55
 News-4k.y4m,--preset ultrafast --no-cutree --analysis-mode=save --bitrate 15000,--preset ultrafast --no-cutree --analysis-mode=load --bitrate 15000
56
 News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0
57
+News-4k.y4m,--preset superfast --slices 4 --aq-mode 0 
58
 News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
59
 News-4k.y4m,--preset veryslow --no-rskip
60
+News-4k.y4m,--preset veryslow --pme --crf 40
61
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
62
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset medium --no-weightp
63
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune fastdecode
64
 ParkScene_1920x1080_24_10bit_444.yuv,--preset superfast --weightp --lookahead-slices 4
65
 ParkScene_1920x1080_24.y4m,--preset medium --qp 40 --rdpenalty 2 --tu-intra-depth 3
66
+ParkScene_1920x1080_24.y4m,--preset medium --pme --tskip-fast --tskip --min-keyint 48 --weightb --limit-refs 3
67
 ParkScene_1920x1080_24.y4m,--preset slower --no-weightp
68
 RaceHorses_416x240_30.y4m,--preset superfast --no-cutree
69
 RaceHorses_416x240_30.y4m,--preset medium --tskip-fast --tskip
70
-RaceHorses_416x240_30.y4m,--preset slower --keyint -1 --rdoq-level 0
71
-RaceHorses_416x240_30.y4m,--preset veryslow --tskip-fast --tskip --limit-refs 3
72
+RaceHorses_416x240_30.y4m,--preset slower --keyint -1 --rdoq-level 0 --limit-tu 2
73
+RaceHorses_416x240_30.y4m,--preset veryslow --tskip-fast --tskip --limit-refs 3 --limit-tu 3
74
 RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --tune psnr --limit-refs 1
75
 RaceHorses_416x240_30_10bit.yuv,--preset veryfast --weightb
76
 RaceHorses_416x240_30_10bit.yuv,--preset faster --rdoq-level 0 --dither
77
@@ -108,7 +113,7 @@
78
 ducks_take_off_420_720p50.y4m,--preset veryslow --constrained-intra --bframes 2
79
 mobile_calendar_422_ntsc.y4m,--preset superfast --weightp
80
 mobile_calendar_422_ntsc.y4m,--preset medium --bitrate 500 -F4
81
-mobile_calendar_422_ntsc.y4m,--preset slower --tskip --tskip-fast
82
+mobile_calendar_422_ntsc.y4m,--preset slower --tskip --tskip-fast --limit-tu 4
83
 mobile_calendar_422_ntsc.y4m,--preset veryslow --tskip --limit-refs 2
84
 old_town_cross_444_720p50.y4m,--preset ultrafast --weightp --min-cu 32
85
 old_town_cross_444_720p50.y4m,--preset superfast --weightp --min-cu 16 --limit-modes
86
@@ -118,6 +123,7 @@
87
 old_town_cross_444_720p50.y4m,--preset medium --keyint -1 --no-weightp --ref 6
88
 old_town_cross_444_720p50.y4m,--preset slow --rdoq-level 1 --early-skip --ref 7 --no-b-pyramid
89
 old_town_cross_444_720p50.y4m,--preset slower --crf 4 --cu-lossless
90
+old_town_cross_444_720p50.y4m,--preset veryslow --max-tu-size 4 --min-cu-size 32 --limit-tu 4
91
 parkrun_ter_720p50.y4m,--preset medium --no-open-gop --sao-non-deblock --crf 4 --cu-lossless
92
 parkrun_ter_720p50.y4m,--preset slower --fast-intra --no-rect --tune grain
93
 silent_cif_420.y4m,--preset superfast --weightp --rect
94
@@ -133,6 +139,11 @@
95
 vtc1nw_422_ntsc.y4m,--preset slower --nr-inter 1000 -F4 --tune fast-decode --qg-size 16
96
 washdc_422_ntsc.y4m,--preset slower --psy-rdoq 2.0 --rdoq-level 2 --qg-size 32 --limit-refs 1
97
 washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless --limit-refs 3 --limit-modes
98
+washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless --limit-refs 3 --limit-modes --slices 2
99
+Kimono1_1920x1080_24_400.yuv,--preset ultrafast --slices 1 --weightp --tu-intra-depth 4
100
+Kimono1_1920x1080_24_400.yuv,--preset medium --rdoq-level 0 --limit-refs 3 --slices 2
101
+Kimono1_1920x1080_24_400.yuv,--preset veryslow --crf 4 --cu-lossless --slices 2 --limit-refs 3 --limit-modes
102
+Kimono1_1920x1080_24_400.yuv,--preset placebo --ctu 32 --max-tu-size 8 --limit-tu 2
103
 
104
 # Main12 intraCost overflow bug test
105
 720p50_parkrun_ter.y4m,--preset medium
106
@@ -141,4 +152,7 @@
107
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --interlace tff
108
 CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --interlace bff
109
 
110
+#SEA Implementation Test
111
+silent_cif_420.y4m,--preset veryslow --me 4
112
+big_buck_bunny_360p24.y4m,--preset superfast --me 4
113
 # vim: tw=200
114
x265_2.1.tar.gz/source/test/smoke-tests.txt -> x265_2.2.tar.gz/source/test/smoke-tests.txt Changed
25
 
1
@@ -3,10 +3,9 @@
2
 # consider VBV tests a failure if new bitrate is more than 5% different
3
 # from the old bitrate
4
 # vbv-tolerance = 0.05
5
-
6
 big_buck_bunny_360p24.y4m,--preset=superfast --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 400 --hrd --aud --repeat-headers
7
 big_buck_bunny_360p24.y4m,--preset=medium --bitrate 1000 -F4 --cu-lossless --scaling-list default
8
-big_buck_bunny_360p24.y4m,--preset=slower --no-weightp --pme --qg-size 16
9
+big_buck_bunny_360p24.y4m,--preset=slower --no-weightp --qg-size 16
10
 washdc_422_ntsc.y4m,--preset=faster --no-strong-intra-smoothing --keyint 1 --qg-size 16
11
 washdc_422_ntsc.y4m,--preset=medium --qp 40 --nr-inter 400 -F4
12
 washdc_422_ntsc.y4m,--preset=veryslow --pmode --tskip --rdoq-level 0
13
@@ -16,9 +15,10 @@
14
 RaceHorses_416x240_30_10bit.yuv,--preset=veryfast --max-tu-size 8
15
 RaceHorses_416x240_30_10bit.yuv,--preset=slower --bitrate 500 -F4 --rdoq-level 1
16
 CrowdRun_1920x1080_50_10bit_444.yuv,--preset=ultrafast --constrained-intra --min-keyint 5 --keyint 10
17
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset=medium --max-tu-size 16
18
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset=medium --max-tu-size 16 --tu-inter-depth 2 --limit-tu 3
19
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryfast --min-cu 16
20
 DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=fast --weightb --interlace bff
21
+DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryslow --limit-ref 1 --limit-mode --tskip --limit-tu 1
22
 
23
 # Main12 intraCost overflow bug test
24
 720p50_parkrun_ter.y4m,--preset medium
25
x265_2.1.tar.gz/source/x265-extras.cpp -> x265_2.2.tar.gz/source/x265-extras.cpp Changed
37
 
1
@@ -64,6 +64,8 @@
2
                 fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, ");
3
                 if (param.rc.rateControlMode == X265_RC_CRF)
4
                     fprintf(csvfp, "RateFactor, ");
5
+                if (param.rc.vbvBufferSize)
6
+                    fprintf(csvfp, "BufferFill, ");
7
                 if (param.bEnablePsnr)
8
                     fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, ");
9
                 if (param.bEnableSsim)
10
@@ -132,6 +134,8 @@
11
     fprintf(csvfp, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc, frameStats->qp, (int)frameStats->bits, frameStats->bScenecut);
12
     if (param.rc.rateControlMode == X265_RC_CRF)
13
         fprintf(csvfp, "%.3lf,", frameStats->rateFactor);
14
+    if (param.rc.vbvBufferSize)
15
+        fprintf(csvfp, "%.3lf,", frameStats->bufferFill);
16
     if (param.bEnablePsnr)
17
         fprintf(csvfp, "%.3lf, %.3lf, %.3lf, %.3lf,", frameStats->psnrY, frameStats->psnrU, frameStats->psnrV, frameStats->psnr);
18
     if (param.bEnableSsim)
19
@@ -187,7 +191,7 @@
20
     fflush(stderr);
21
 }
22
 
23
-void x265_csvlog_encode(FILE* csvfp, const x265_api& api, const x265_param& param, const x265_stats& stats, int level, int argc, char** argv)
24
+void x265_csvlog_encode(FILE* csvfp, const char* version, const x265_param& param, const x265_stats& stats, int level, int argc, char** argv)
25
 {
26
     if (!csvfp)
27
         return;
28
@@ -277,7 +281,7 @@
29
     else
30
         fprintf(csvfp, " -, -, -, -, -, -, -,");
31
 
32
-    fprintf(csvfp, " %-6u, %-6u, %s\n", stats.maxCLL, stats.maxFALL, api.version_str);
33
+    fprintf(csvfp, " %-6u, %-6u, %s\n", stats.maxCLL, stats.maxFALL, version);
34
 }
35
 
36
 /* The dithering algorithm is based on Sierra-2-4A error diffusion.
37
x265_2.1.tar.gz/source/x265-extras.h -> x265_2.2.tar.gz/source/x265-extras.h Changed
10
 
1
@@ -53,7 +53,7 @@
2
 /* Log final encode statistics to the CSV file handle. 'argc' and 'argv' are
3
  * intended to be command line arguments passed to the encoder. Encode
4
  * statistics should be queried from the encoder just prior to closing it. */
5
-LIBAPI void x265_csvlog_encode(FILE* csvfp, const x265_api& api, const x265_param& param, const x265_stats& stats, int level, int argc, char** argv);
6
+LIBAPI void x265_csvlog_encode(FILE* csvfp, const char* version, const x265_param& param, const x265_stats& stats, int level, int argc, char** argv);
7
 
8
 /* In-place downshift from a bit-depth greater than 8 to a bit-depth of 8, using
9
  * the residual bits to dither each row. */
10
x265_2.1.tar.gz/source/x265.cpp -> x265_2.2.tar.gz/source/x265.cpp Changed
10
 
1
@@ -746,7 +746,7 @@
2
 
3
     api->encoder_get_stats(encoder, &stats, sizeof(stats));
4
     if (cliopt.csvfpt && !b_ctrl_c)
5
-        x265_csvlog_encode(cliopt.csvfpt, *api, *param, stats, cliopt.csvLogLevel, argc, argv);
6
+        x265_csvlog_encode(cliopt.csvfpt, api->version_str, *param, stats, cliopt.csvLogLevel, argc, argv);
7
     api->encoder_close(encoder);
8
 
9
     int64_t second_largest_pts = 0;
10
x265_2.1.tar.gz/source/x265.h -> x265_2.2.tar.gz/source/x265.h Changed
104
 
1
@@ -137,6 +137,7 @@
2
     double           avgPsyEnergy;
3
     double           avgResEnergy;
4
     double           avgLumaLevel;
5
+    double           bufferFill;
6
     uint64_t         bits;
7
     int              encoderOrder;
8
     int              poc;
9
@@ -289,6 +290,7 @@
10
     X265_HEX_SEARCH,
11
     X265_UMH_SEARCH,
12
     X265_STAR_SEARCH,
13
+    X265_SEA,
14
     X265_FULL_SEARCH
15
 } X265_ME_METHODS;
16
 
17
@@ -334,6 +336,9 @@
18
 #define X265_CPU_NEON            0x0000002  /* ARM NEON */
19
 #define X265_CPU_FAST_NEON_MRC   0x0000004  /* Transfer from NEON to ARM register is fast (Cortex-A9) */
20
 
21
+/* IBM Power8 */
22
+#define X265_CPU_ALTIVEC         0x0000001
23
+
24
 #define X265_MAX_SUBPEL_LEVEL   7
25
 
26
 /* Log level */
27
@@ -351,6 +356,10 @@
28
 #define X265_REF_LIMIT_DEPTH    1
29
 #define X265_REF_LIMIT_CU       2
30
 
31
+#define X265_TU_LIMIT_BFS       1
32
+#define X265_TU_LIMIT_DFS       2
33
+#define X265_TU_LIMIT_NEIGH     4
34
+
35
 #define X265_BFRAME_MAX         16
36
 #define X265_MAX_FRAME_THREADS  16
37
 
38
@@ -456,7 +465,7 @@
39
 } x265_stats;
40
 
41
 /* String values accepted by x265_param_parse() (and CLI) for various parameters */
42
-static const char * const x265_motion_est_names[] = { "dia", "hex", "umh", "star", "full", 0 };
43
+static const char * const x265_motion_est_names[] = { "dia", "hex", "umh", "star", "sea", "full", 0 };
44
 static const char * const x265_source_csp_names[] = { "i400", "i420", "i422", "i444", "nv12", "nv16", 0 };
45
 static const char * const x265_video_format_names[] = { "component", "pal", "ntsc", "secam", "mac", "undef", 0 };
46
 static const char * const x265_fullrange_names[] = { "limited", "full", 0 };
47
@@ -823,6 +832,10 @@
48
      * compressed by the DCT transforms, at the expense of much more compute */
49
     uint32_t  tuQTMaxIntraDepth;
50
 
51
+    /* Enable early exit decisions for inter coded blocks to avoid recursing to
52
+     * higher TU depths. Default: 0 */
53
+    uint32_t  limitTU;
54
+
55
     /* Set the amount of rate-distortion analysis to use within quant. 0 implies
56
      * no rate-distortion optimization. At level 1 rate-distortion cost is used to
57
      * find optimal rounding values for each level (and allows psy-rdoq to be
58
@@ -898,9 +911,9 @@
59
     /* Limit modes analyzed for each CU using cost metrics from the 4 sub-CUs */
60
     uint32_t limitModes;
61
 
62
-    /* ME search method (DIA, HEX, UMH, STAR, FULL). The search patterns
63
+    /* ME search method (DIA, HEX, UMH, STAR, SEA, FULL). The search patterns
64
      * (methods) are sorted in increasing complexity, with diamond being the
65
-     * simplest and fastest and full being the slowest.  DIA, HEX, and UMH were
66
+     * simplest and fastest and full being the slowest.  DIA, HEX, UMH and SEA were
67
      * adapted from x264 directly. STAR is an adaption of the HEVC reference
68
      * encoder's three step search, while full is a naive exhaustive search. The
69
      * default is the star search, it has a good balance of performance and
70
@@ -1300,15 +1313,28 @@
71
     /* Maximum of the picture order count */
72
     int log2MaxPocLsb;
73
 
74
-    /* Dicard SEI messages when printing */
75
-    int bDiscardSEI;
76
-    
77
-    /* Control removing optional vui information (timing, HRD info) to get low bitrate */
78
-    int       bDiscardOptionalVUI;
79
+    /* Emit VUI Timing info, an optional VUI field */
80
+    int bEmitVUITimingInfo;
81
+
82
+    /* Emit HRD Timing info */
83
+    int bEmitVUIHRDInfo;
84
 
85
     /* Maximum count of Slices of picture, the value range is [1, maximum rows] */
86
     unsigned int maxSlices;
87
 
88
+    /* Optimize QP in PPS based on statistics from previous GOP */
89
+    int bOptQpPPS;
90
+
91
+    /* Optimize ref list length in PPS based on stats from previous GOP */
92
+    int bOptRefListLengthPPS;
93
+
94
+    /* Enable storing commonly used RPS in SPS in multi pass mode */
95
+    int       bMultiPassOptRPS;
96
+
97
+    /* This value represents the percentage difference between the inter cost and
98
+    * intra cost of a frame used in scenecut detection. Default 5. */
99
+    double     scenecutBias;
100
+
101
 } x265_param;
102
 
103
 /* x265_param_alloc:
104
x265_2.1.tar.gz/source/x265cli.h -> x265_2.2.tar.gz/source/x265cli.h Changed
73
 
1
@@ -85,6 +85,7 @@
2
     { "max-tu-size",    required_argument, NULL, 0 },
3
     { "tu-intra-depth", required_argument, NULL, 0 },
4
     { "tu-inter-depth", required_argument, NULL, 0 },
5
+    { "limit-tu",       required_argument, NULL, 0 },
6
     { "me",             required_argument, NULL, 0 },
7
     { "subme",          required_argument, NULL, 'm' },
8
     { "merange",        required_argument, NULL, 0 },
9
@@ -120,6 +121,7 @@
10
     { "min-keyint",     required_argument, NULL, 'i' },
11
     { "scenecut",       required_argument, NULL, 0 },
12
     { "no-scenecut",          no_argument, NULL, 0 },
13
+    { "scenecut-bias",  required_argument, NULL, 0 },
14
     { "intra-refresh",        no_argument, NULL, 0 },
15
     { "rc-lookahead",   required_argument, NULL, 0 },
16
     { "lookahead-slices", required_argument, NULL, 0 },
17
@@ -208,8 +210,14 @@
18
     { "min-luma",       required_argument, NULL, 0 },
19
     { "max-luma",       required_argument, NULL, 0 },
20
     { "log2-max-poc-lsb", required_argument, NULL, 8 },
21
-    { "discard-sei",          no_argument, NULL, 0 },
22
-    { "discard-vui",          no_argument, NULL, 0 },
23
+    { "vui-timing-info",      no_argument, NULL, 0 },
24
+    { "no-vui-timing-info",   no_argument, NULL, 0 },
25
+    { "vui-hrd-info",         no_argument, NULL, 0 },
26
+    { "no-vui-hrd-info",      no_argument, NULL, 0 },
27
+    { "opt-qp-pps",           no_argument, NULL, 0 },
28
+    { "no-opt-qp-pps",        no_argument, NULL, 0 },
29
+    { "opt-ref-list-length-pps",         no_argument, NULL, 0 },
30
+    { "no-opt-ref-list-length-pps",      no_argument, NULL, 0 },
31
     { "no-dither",            no_argument, NULL, 0 },
32
     { "dither",               no_argument, NULL, 0 },
33
     { "no-repeat-headers",    no_argument, NULL, 0 },
34
@@ -229,6 +237,8 @@
35
     { "pass",           required_argument, NULL, 0 },
36
     { "slow-firstpass",       no_argument, NULL, 0 },
37
     { "no-slow-firstpass",    no_argument, NULL, 0 },
38
+    { "multi-pass-opt-rps",   no_argument, NULL, 0 },
39
+    { "no-multi-pass-opt-rps", no_argument, NULL, 0 },
40
     { "analysis-mode",  required_argument, NULL, 0 },
41
     { "analysis-file",  required_argument, NULL, 0 },
42
     { "strict-cbr",           no_argument, NULL, 0 },
43
@@ -317,6 +327,7 @@
44
     H0("   --max-tu-size <32|16|8|4>     Maximum TU size (WxH). Default %d\n", param->maxTUSize);
45
     H0("   --tu-intra-depth <integer>    Max TU recursive depth for intra CUs. Default %d\n", param->tuQTMaxIntraDepth);
46
     H0("   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
47
+    H0("   --limit-tu <0..4>             Enable early exit from TU recursion for inter coded blocks. Default %d\n", param->limitTU);
48
     H0("\nAnalysis:\n");
49
     H0("   --rd <1..6>                   Level of RDO in mode decision 1:least....6:full RDO. Default %d\n", param->rdLevel);
50
     H0("   --[no-]psy-rd <0..5.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
51
@@ -357,6 +368,7 @@
52
     H0("-i/--min-keyint <integer>        Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
53
     H0("   --no-scenecut                 Disable adaptive I-frame decision\n");
54
     H0("   --scenecut <integer>          How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold);
55
+    H1("   --scenecut-bias <0..100.0>    Bias for scenecut detection. Default %.2f\n", param->scenecutBias);
56
     H0("   --intra-refresh               Use Periodic Intra Refresh instead of IDR frames\n");
57
     H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
58
     H1("   --lookahead-slices <0..16>    Number of slices to use per lookahead cost estimate. Default %d\n", param->lookaheadSlices);
59
@@ -448,8 +460,11 @@
60
     H0("   --[no-]aud                    Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
61
     H1("   --hash <integer>              Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
62
     H0("   --log2-max-poc-lsb <integer>  Maximum of the picture order count\n");
63
-    H0("   --discard-sei                 Discard SEI packets in bitstream. Default %s\n", OPT(param->bDiscardSEI));
64
-    H0("   --discard-vui                 Discard optional VUI information from the bistream. Default %s\n", OPT(param->bDiscardOptionalVUI));
65
+    H0("   --[no-]vui-timing-info        Emit VUI timing information in the bistream. Default %s\n", OPT(param->bEmitVUITimingInfo));
66
+    H0("   --[no-]vui-hrd-info           Emit VUI HRD information in the bistream. Default %s\n", OPT(param->bEmitVUIHRDInfo));
67
+    H0("   --[no-]opt-qp-pps             Dynamically optimize QP in PPS (instead of default 26) based on QPs in previous GOP. Default %s\n", OPT(param->bOptQpPPS));
68
+    H0("   --[no-]opt-ref-list-length-pps  Dynamically set L0 and L1 ref list length in PPS (instead of default 0) based on values in last GOP. Default %s\n", OPT(param->bOptRefListLengthPPS));
69
+    H0("   --[no-]multi-pass-opt-rps     Enable storing commonly used RPS in SPS in multi pass mode. Default %s\n", OPT(param->bMultiPassOptRPS));
70
     H1("\nReconstructed video options (debugging):\n");
71
     H1("-r/--recon <filename>            Reconstructed raw image YUV or Y4M output file name\n");
72
     H1("   --recon-depth <integer>       Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n");
73